diff --git a/.github/workflows/test_and_deploy.yml b/.github/workflows/test_and_deploy.yml
index 7c2b97591..6ce6f4990 100644
--- a/.github/workflows/test_and_deploy.yml
+++ b/.github/workflows/test_and_deploy.yml
@@ -31,6 +31,22 @@ env:
jobs:
+ lint:
+ name: Check code style
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+ - uses: actions/setup-python@v5
+ with:
+ python-version: "3.12"
+ - name: Install ruff
+ run: pip install ruff
+ - name: Check code formatting
+ run: ruff format --diff
+ - name: Lint code base
+ run: ruff check
+
+
generate-wheels-matrix:
name: "Generate wheels matrix"
runs-on: "ubuntu-latest"
@@ -245,6 +261,7 @@ jobs:
permissions:
contents: write
needs:
+ - lint
- test-and-build
- make-sdist
- test-interfaces
diff --git a/.gitignore b/.gitignore
index b76f59e63..037c3f92e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -39,7 +39,6 @@ htmlcov
# Ignore all compiled python files (e.g. from running the unit tests)
*.pyc
*.pyo
-*.py{}
*.py-e
# Ignore potential directory created during install
diff --git a/doc/apidoc.py b/doc/apidoc.py
index 5bc412333..8f10f0923 100644
--- a/doc/apidoc.py
+++ b/doc/apidoc.py
@@ -3,17 +3,16 @@
# information.
__author__ = "Patrick Kunzmann"
-__all__ = ["create_api_doc", "skip_non_methods"]
+__all__ = ["create_api_doc", "skip_nonrelevant"]
-from os.path import join, isdir
-from os import listdir, makedirs
-from importlib import import_module
-import types
-import json
import enum
-from textwrap import dedent
+import json
+import types
from collections import OrderedDict
-
+from importlib import import_module
+from os import listdir, makedirs
+from os.path import isdir, join
+from textwrap import dedent
_INDENT = " " * 4
@@ -24,7 +23,6 @@
_pck_categories = json.load(file, object_pairs_hook=OrderedDict)
-
def create_api_doc(src_path, doc_path):
"""
Create *.rst files for API documentation.
@@ -40,11 +38,7 @@ def create_api_doc(src_path, doc_path):
# Create directory to store apidoc
if not isdir(doc_path):
makedirs(doc_path)
- package_list = _create_package_doc(
- "biotite",
- join(src_path, "biotite"),
- doc_path
- )
+ package_list = _create_package_doc("biotite", join(src_path, "biotite"), doc_path)
_create_package_index(doc_path, package_list)
@@ -67,19 +61,24 @@ def _create_package_doc(pck, src_path, doc_path):
module = import_module(pck)
attr_list = dir(module)
# Classify attribute names into classes and functions
- class_list = [attr for attr in attr_list
- # Do not document private classes
- if attr[0] != "_"
- # Check if object is a class
- and isinstance(getattr(module, attr), type)]
- func_list = [attr for attr in attr_list
- # Do not document private classes
- if attr[0] != "_"
- # All functions are callable...
- and callable(getattr(module, attr))
- # ...but classes are also callable
- and attr not in class_list
- ]
+ class_list = [
+ attr
+ for attr in attr_list
+ # Do not document private classes
+ if attr[0] != "_"
+ # Check if object is a class
+ and isinstance(getattr(module, attr), type)
+ ]
+ func_list = [
+ attr
+ for attr in attr_list
+ # Do not document private classes
+ if attr[0] != "_"
+ # All functions are callable...
+ and callable(getattr(module, attr))
+ # ...but classes are also callable
+ and attr not in class_list
+ ]
# Create *.rst files
_create_package_page(doc_path, pck, class_list, func_list, sub_pck)
for class_name in class_list:
@@ -87,11 +86,10 @@ def _create_package_doc(pck, src_path, doc_path):
for function_name in func_list:
_create_function_page(doc_path, pck, function_name)
- return([pck] + sub_pck)
+ return [pck] + sub_pck
-def _create_package_page(doc_path, package_name,
- classes, functions, subpackages):
+def _create_package_page(doc_path, package_name, classes, functions, subpackages):
attributes = classes + functions
# Get categories for this package
@@ -114,7 +112,6 @@ def _create_package_page(doc_path, package_name,
misc_category_name = "Miscellaneous" if categories else "Content"
categories[misc_category_name] = misc_attributes
-
# String for categorized class and function enumeration
category_strings = []
for category, attrs in categories.items():
@@ -135,12 +132,11 @@ def _create_package_page(doc_path, package_name,
attributes_string = "\n".join(category_strings)
# String for subpackage enumeration
- subpackages_string = "\n".join(
- [_INDENT + pck for pck in subpackages]
- )
+ subpackages_string = "\n".join([_INDENT + pck for pck in subpackages])
# Assemble page
- file_content = dedent(f"""
+ file_content = (
+ dedent(f"""
``{package_name}``
{"=" * (len(package_name) + 4)}
@@ -150,16 +146,21 @@ def _create_package_page(doc_path, package_name,
.. currentmodule:: {package_name}
- """) + attributes_string
+ """)
+ + attributes_string
+ )
if len(subpackages) > 0:
- file_content += dedent(f"""
+ file_content += (
+ dedent("""
Subpackages
-----------
.. autosummary::
- """) + subpackages_string
+ """)
+ + subpackages_string
+ )
with open(join(doc_path, f"{package_name}.rst"), "w") as f:
f.write(file_content)
@@ -201,18 +202,19 @@ def _create_function_page(doc_path, package_name, function_name):
def _create_package_index(doc_path, package_list):
# String for package enumeration
- packages_string = "\n".join(
- [_INDENT + pck for pck in sorted(package_list)]
- )
+ packages_string = "\n".join([_INDENT + pck for pck in sorted(package_list)])
- file_content = dedent(f"""
+ file_content = (
+ dedent("""
API Reference
=============
.. autosummary::
:toctree:
- """) + packages_string
+ """)
+ + packages_string
+ )
with open(join(doc_path, "index.rst"), "w") as f:
f.write(file_content)
@@ -249,20 +251,21 @@ def _is_relevant_type(obj):
# These are some special built-in Python methods
return False
return (
- # Functions
- type(obj) in [
- types.FunctionType, types.BuiltinFunctionType, types.MethodType
- ]
- ) | (
- # Functions from C-extensions
- type(obj).__name__ in [
- "cython_function_or_method",
- "fused_cython_function"
- ]
- ) | (
- # Enum instance
- isinstance(obj, enum.Enum)
- ) | (
- # Inner class
- isinstance(obj, type)
- )
\ No newline at end of file
+ (
+ # Functions
+ type(obj)
+ in [types.FunctionType, types.BuiltinFunctionType, types.MethodType]
+ )
+ | (
+ # Functions from C-extensions
+ type(obj).__name__ in ["cython_function_or_method", "fused_cython_function"]
+ )
+ | (
+ # Enum instance
+ isinstance(obj, enum.Enum)
+ )
+ | (
+ # Inner class
+ isinstance(obj, type)
+ )
+ )
diff --git a/doc/bibliography.py b/doc/bibliography.py
index 9c0bc4831..cf44587cc 100644
--- a/doc/bibliography.py
+++ b/doc/bibliography.py
@@ -5,14 +5,14 @@
__author__ = "Patrick Kunzmann"
import warnings
-from pybtex.richtext import Text, Tag, HRef
+from pybtex.richtext import HRef, Tag, Text
from pybtex.style.formatting import BaseStyle
class IEEEStyle(BaseStyle):
def format_article(self, param):
entry = param["entry"]
-
+
try:
authors = []
for author in entry.persons["author"]:
@@ -28,7 +28,7 @@ def format_article(self, param):
text += " "
text += " ".join([s for s in author.last_names])
authors.append(Text(text + ", "))
-
+
title = ""
in_protected = False
for char in entry.fields["title"]:
@@ -46,34 +46,34 @@ def format_article(self, param):
else:
title += char.lower()
title = Text('"', title, '," ')
-
+
journal = Text(Tag("em", entry.fields["journal"]), ", ")
-
+
if "volume" in entry.fields:
volume = Text("vol. ", entry.fields["volume"], ", ")
else:
volume = Text()
-
+
if "pages" in entry.fields:
pages = Text("pp. ", entry.fields["pages"], ", ")
else:
pages = Text()
-
+
date = entry.fields["year"]
if "month" in entry.fields:
date = entry.fields["month"] + " " + date
date = Text(date, ". ")
-
- if "doi" in entry.fields:
- doi = Text("doi: ", HRef(
- "https://doi.org/" + entry.fields["doi"],
- entry.fields["doi"]
- ))
+
+ if "doi" in entry.fields:
+ doi = Text(
+ "doi: ",
+ HRef("https://doi.org/" + entry.fields["doi"], entry.fields["doi"]),
+ )
else:
doi = Text()
-
+
return Text(*authors, title, journal, volume, pages, date, doi)
-
- except:
+
+ except Exception:
warnings.warn(f"Invalid BibTeX entry '{entry.key}'")
- return Text(entry.key)
\ No newline at end of file
+ return Text(entry.key)
diff --git a/doc/conf.py b/doc/conf.py
index d3f6e53c0..7f19bc67c 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -5,24 +5,21 @@
__author__ = "Patrick Kunzmann"
# Setup Cython for import of uncompiled *.pyx files
-import pyximport
import numpy as np
+import pyximport
+
pyximport.install(
- setup_args={'include_dirs': np.get_include()},
- build_in_temp=False,
- language_level=3
+ setup_args={"include_dirs": np.get_include()}, build_in_temp=False, language_level=3
)
-from os.path import realpath, dirname, join
import sys
import warnings
-import pybtex
-from sphinx_gallery.sorting import FileNameSortKey, ExplicitOrder
+from os.path import dirname, join, realpath
import matplotlib
-
+import pybtex
+from sphinx_gallery.sorting import ExplicitOrder, FileNameSortKey
import biotite
-
BIOTITE_DOMAIN = "www.biotite-python.org"
DOC_PATH = dirname(realpath(__file__))
PACKAGE_PATH = join(dirname(DOC_PATH), "src")
@@ -32,28 +29,21 @@
# in order to import modules for API doc generation etc.
sys.path.insert(0, DOC_PATH)
import apidoc
-import viewcode
-import scraper
import bibliography
import key
+import scraper
import switcher
-
+import viewcode
# Reset matplotlib params
matplotlib.rcdefaults()
# Pregeneration of files
apidoc.create_api_doc(PACKAGE_PATH, join(DOC_PATH, "apidoc"))
-switcher.create_switcher_json(
- join("static", "switcher.json"),
- "v0.41.0",
- n_versions=5
-)
+switcher.create_switcher_json(join("static", "switcher.json"), "v0.41.0", n_versions=5)
# Use custom citation style
-pybtex.plugin.register_plugin(
- "pybtex.style.formatting", "ieee", bibliography.IEEEStyle
-)
+pybtex.plugin.register_plugin("pybtex.style.formatting", "ieee", bibliography.IEEEStyle)
#### Source code link ###
@@ -61,14 +51,13 @@
#### General ####
-import warnings
# Removed standard matplotlib warning when generating gallery
warnings.filterwarnings(
"ignore",
category=UserWarning,
message="Matplotlib is currently using agg, which is a non-GUI backend, "
- "so cannot show the figure."
+ "so cannot show the figure.",
)
extensions = [
@@ -127,10 +116,7 @@
html_theme = "pydata_sphinx_theme"
html_static_path = ["static"]
-html_css_files = [
- "biotite.css",
- "fonts.css"
-]
+html_css_files = ["biotite.css", "fonts.css"]
html_title = "Biotite"
html_logo = "static/assets/general/biotite_logo.svg"
html_favicon = "static/assets/general/biotite_icon_32p.png"
@@ -162,11 +148,11 @@
"url": "https://biotite.bsky.social",
"icon": "fa-brands fa-bluesky",
"type": "fontawesome",
- }
- ],
- "use_edit_page_button": True,
- "show_prev_next": False,
- "show_toc_level": 2,
+ },
+ ],
+ "use_edit_page_button": True,
+ "show_prev_next": False,
+ "show_toc_level": 2,
}
html_sidebars = {
# No primary sidebar for these pages
@@ -183,53 +169,49 @@
}
sphinx_gallery_conf = {
- "examples_dirs" : [
- "examples/scripts/sequence",
- "examples/scripts/structure"
- ],
- "gallery_dirs" : [
- "examples/gallery/sequence",
- "examples/gallery/structure"
- ],
- "subsection_order": ExplicitOrder([
- "examples/scripts/sequence/homology",
- "examples/scripts/sequence/sequencing",
- "examples/scripts/sequence/profile",
- "examples/scripts/sequence/annotation",
- "examples/scripts/sequence/misc",
- "examples/scripts/structure/protein",
- "examples/scripts/structure/nucleotide",
- "examples/scripts/structure/molecule",
- "examples/scripts/structure/contacts",
- "examples/scripts/structure/modeling",
- "examples/scripts/structure/misc",
- ]),
- "within_subsection_order" : FileNameSortKey,
+ "examples_dirs": ["examples/scripts/sequence", "examples/scripts/structure"],
+ "gallery_dirs": ["examples/gallery/sequence", "examples/gallery/structure"],
+ "subsection_order": ExplicitOrder(
+ [
+ "examples/scripts/sequence/homology",
+ "examples/scripts/sequence/sequencing",
+ "examples/scripts/sequence/profile",
+ "examples/scripts/sequence/annotation",
+ "examples/scripts/sequence/misc",
+ "examples/scripts/structure/protein",
+ "examples/scripts/structure/nucleotide",
+ "examples/scripts/structure/molecule",
+ "examples/scripts/structure/contacts",
+ "examples/scripts/structure/modeling",
+ "examples/scripts/structure/misc",
+ ]
+ ),
+ "within_subsection_order": FileNameSortKey,
# Do not run example scripts with a trailing '_noexec'
- "filename_pattern" : "^((?!_noexec).)*$",
- "ignore_pattern" : "(.*ignore\.py)|(.*pymol\.py)",
- "backreferences_dir" : None,
- "download_all_examples" : False,
+ "filename_pattern": "^((?!_noexec).)*$",
+ "ignore_pattern": r"(.*ignore\.py)|(.*pymol\.py)",
+ "download_all_examples": False,
# Never report run time
- "min_reported_time" : sys.maxsize,
- "default_thumb_file" : join(
+ "min_reported_time": sys.maxsize,
+ "default_thumb_file": join(
DOC_PATH, "static/assets/general/biotite_icon_thumb.png"
),
- "image_scrapers" : (
+ "image_scrapers": (
"matplotlib",
scraper.static_image_scraper,
- scraper.pymol_scraper
+ scraper.pymol_scraper,
),
- "matplotlib_animations" : True,
- "backreferences_dir" : "examples/backreferences",
- "doc_module" : ("biotite",),
+ "matplotlib_animations": True,
+ "backreferences_dir": "examples/backreferences",
+ "doc_module": ("biotite",),
# Set the NCBI API key
- "reset_modules" : (key.set_ncbi_api_key_from_env,),
- "remove_config_comments" : True,
+ "reset_modules": (key.set_ncbi_api_key_from_env,),
+ "remove_config_comments": True,
}
#### App setup ####
+
def setup(app):
- app.connect("autodoc-skip-member", apidoc.skip_nonrelevant)
\ No newline at end of file
+ app.connect("autodoc-skip-member", apidoc.skip_nonrelevant)
diff --git a/doc/contribution/development.rst b/doc/contribution/development.rst
index 9cee6fefa..9def9fb3a 100644
--- a/doc/contribution/development.rst
+++ b/doc/contribution/development.rst
@@ -53,13 +53,19 @@ Official support for PyPy might be added someday.
Code style
----------
-*Biotite* is in compliance with PEP 8.
-The maximum line length is 79 for code lines and 72 for docstring and
-comment lines.
+*Biotite* is compliant with :pep:`8` and uses `Ruff <https://docs.astral.sh/ruff/>`_ for
+code formatting and linting.
+The maximum line length is 88 characters.
An exception is made for docstring lines, if it is not possible to use a
-maximum of 72 characters (e.g. tables), and for
-`doctest <https://docs.python.org/3/library/doctest.html>`_ lines,
-where the actual code may take up to 79 characters.
+maximum of 88 characters (e.g. tables and parameter type descriptions).
+To make code changes ready for a pull request, simply run
+
+.. code-block:: console
+
+ $ ruff format
+ $ ruff check --fix
+
+and fix the remaining linter complaints.
Dependencies
------------
@@ -124,14 +130,14 @@ accessible, in a relative manner.
Import statements should be the only statements in a ``__init__.py`` file.
In case a module needs functionality from another subpackage of *Biotite*,
-use a relative import.
+use an absolute import as suggested by PEP 8.
This import should target the module directly and not the package to avoid
circular imports and thus an ``ImportError``.
So import statements like the following are totally OK:
.. code-block:: python
- from ...package.subpackage.module import foo
+ from biotite.subpackage.module import foo
In order to prevent namespace pollution, all modules must define the `__all__`
variable with all publicly accessible attributes of the module.
diff --git a/doc/examples/scripts/sequence/annotation/operon_map.py b/doc/examples/scripts/sequence/annotation/operon_map.py
index dcd730ee7..37be67652 100644
--- a/doc/examples/scripts/sequence/annotation/operon_map.py
+++ b/doc/examples/scripts/sequence/annotation/operon_map.py
@@ -10,31 +10,39 @@
# License: BSD 3 clause
import matplotlib.pyplot as plt
-from biotite.sequence import Annotation, Feature, Location
import biotite.sequence.graphics as graphics
+from biotite.sequence import Annotation, Feature, Location
strand = Location.Strand.FORWARD
-prom = Feature("regulatory", [Location(10, 50, strand)],
- {"regulatory_class" : "promoter",
- "note" : "T7"})
-rbs1 = Feature("regulatory", [Location(60, 75, strand)],
- {"regulatory_class" : "ribosome_binding_site",
- "note" : "RBS1"})
-gene1 = Feature("gene", [Location(81, 380, strand)],
- {"gene" : "gene1"})
-rbs2 = Feature("regulatory", [Location(400, 415, strand)],
- {"regulatory_class" : "ribosome_binding_site",
- "note" : "RBS2"})
-gene2 = Feature("gene", [Location(421, 1020, strand)],
- {"gene" : "gene2"})
-term = Feature("regulatory", [Location(1050, 1080, strand)],
- {"regulatory_class" : "terminator"})
+prom = Feature(
+ "regulatory",
+ [Location(10, 50, strand)],
+ {"regulatory_class": "promoter", "note": "T7"},
+)
+rbs1 = Feature(
+ "regulatory",
+ [Location(60, 75, strand)],
+ {"regulatory_class": "ribosome_binding_site", "note": "RBS1"},
+)
+gene1 = Feature("gene", [Location(81, 380, strand)], {"gene": "gene1"})
+rbs2 = Feature(
+ "regulatory",
+ [Location(400, 415, strand)],
+ {"regulatory_class": "ribosome_binding_site", "note": "RBS2"},
+)
+gene2 = Feature("gene", [Location(421, 1020, strand)], {"gene": "gene2"})
+term = Feature(
+ "regulatory", [Location(1050, 1080, strand)], {"regulatory_class": "terminator"}
+)
annotation = Annotation([prom, rbs1, gene1, rbs2, gene2, term])
fig = plt.figure(figsize=(8.0, 0.8))
ax = fig.add_subplot(111)
graphics.plot_feature_map(
- ax, annotation, multi_line=False, loc_range=(1, 1101),
+ ax,
+ annotation,
+ multi_line=False,
+ loc_range=(1, 1101),
)
fig.tight_layout()
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/sequence/annotation/plasmid_map.py b/doc/examples/scripts/sequence/annotation/plasmid_map.py
index b25623bf4..bf4118352 100644
--- a/doc/examples/scripts/sequence/annotation/plasmid_map.py
+++ b/doc/examples/scripts/sequence/annotation/plasmid_map.py
@@ -18,26 +18,33 @@
# License: BSD 3 clause
import io
-import requests
import matplotlib.pyplot as plt
-import numpy as np
+import requests
import biotite
-import biotite.sequence.io.genbank as gb
import biotite.sequence.graphics as graphics
-import biotite.database.entrez as entrez
-
+import biotite.sequence.io.genbank as gb
-PLASMID_URL = "https://media.addgene.org/snapgene-media/v1.7.9-0-g88a3305/"\
- "sequences/12250/9998fdbe-051f-4dc6-ba0f-24e65127a0c5/" \
- "addgene-plasmid-26092-sequence-12250.gbk"
+PLASMID_URL = (
+ "https://media.addgene.org/snapgene-media/v1.7.9-0-g88a3305/"
+ "sequences/12250/9998fdbe-051f-4dc6-ba0f-24e65127a0c5/"
+ "addgene-plasmid-26092-sequence-12250.gbk"
+)
response = requests.get(PLASMID_URL)
gb_file = gb.GenBankFile.read(io.StringIO(response.text))
-annotation = gb.get_annotation(gb_file, include_only=[
- "promoter", "terminator", "protein_bind",
- "RBS", "CDS", "rep_origin", "primer_bind"
-])
+annotation = gb.get_annotation(
+ gb_file,
+ include_only=[
+ "promoter",
+ "terminator",
+ "protein_bind",
+ "RBS",
+ "CDS",
+ "rep_origin",
+ "primer_bind",
+ ],
+)
_, seq_length, _, _, _, _ = gb.get_locus(gb_file)
# AddGene stores the plasmid name in the 'KEYWORDS' field
# [0][0][0] ->
@@ -69,8 +76,11 @@ def custom_feature_formatter(feature):
fig = plt.figure(figsize=(8.0, 8.0))
ax = fig.add_subplot(111, projection="polar")
graphics.plot_plasmid_map(
- ax, annotation, plasmid_size=seq_length,
- label=plasmid_name, feature_formatter=custom_feature_formatter
+ ax,
+ annotation,
+ plasmid_size=seq_length,
+ label=plasmid_name,
+ feature_formatter=custom_feature_formatter,
)
fig.tight_layout()
plt.show()
diff --git a/doc/examples/scripts/sequence/annotation/plasmid_map_custom.py b/doc/examples/scripts/sequence/annotation/plasmid_map_custom.py
index 67fb87834..494b2d17e 100644
--- a/doc/examples/scripts/sequence/annotation/plasmid_map_custom.py
+++ b/doc/examples/scripts/sequence/annotation/plasmid_map_custom.py
@@ -5,129 +5,100 @@
.. currentmodule:: biotite.sequence
This script shows how :class:`Feature` objects are displayed in a
-plasmid map by using a custom 'toy' :class:`Annotation`.
+plasmid map by using a custom 'toy' :class:`Annotation`.
"""
# Code source: Patrick Kunzmann
# License: BSD 3 clause
import matplotlib.pyplot as plt
-import numpy as np
import biotite.sequence as seq
-import biotite.sequence.io.genbank as gb
import biotite.sequence.graphics as graphics
-import biotite.database.entrez as entrez
-
-annotation = seq.Annotation([
- seq.Feature(
- "source",
- [seq.Location(0, 1500)],
- {"organism": "Escherichia coli"}
- ),
-
- # Ori
- seq.Feature(
- "rep_origin",
- [seq.Location(600, 700, seq.Location.Strand.REVERSE)],
- {"regulatory_class": "promoter", "note": "MyProm"}
- ),
-
- # Promoter
- seq.Feature(
- "regulatory",
- [seq.Location(1000, 1060)],
- {"regulatory_class": "promoter", "note": "MyProm"}
- ),
- seq.Feature(
- "protein_bind",
- [seq.Location(1025, 1045)],
- {"note": "repr"}
- ),
-
- # Gene A
- seq.Feature(
- "regulatory",
- [seq.Location(1070, 1080)],
- {"regulatory_class": "ribosome_binding_site"}
- ),
- seq.Feature(
- "CDS",
- [seq.Location(1091, 1150)],
- {"product": "geneA"}
- ),
-
- # Gene B
- seq.Feature(
- "regulatory",
- [seq.Location(1180, 1190)],
- {"regulatory_class": "ribosome_binding_site"}
- ),
- seq.Feature(
- "CDS",
- [seq.Location(1201, 1350)],
- {"product": "geneB"}
- ),
- seq.Feature(
- "regulatory",
- [seq.Location(1220, 1230)],
- {"regulatory_class": "ribosome_binding_site"}
- ),
- seq.Feature(
- "CDS",
- [seq.Location(1240, 1350)],
- {"product": "geneB2"}
- ),
-
- # Gene C
- seq.Feature(
- "regulatory",
- [seq.Location(1380, 1390)],
- {"regulatory_class": "ribosome_binding_site"}
- ),
- seq.Feature(
- "CDS",
- # CDS extends over periodic boundary -> two locations
- [seq.Location(1, 300), seq.Location(1402, 1500)],
- {"product": "geneC"}
- ),
-
- # Terminator
- seq.Feature(
- "regulatory",
- [seq.Location(310, 350)],
- {"regulatory_class": "terminator", "note": "MyTerm"}
- ),
-
- # Primers
- # The labels will be too long to be displayed on the map
- # If you want to display them nevertheless, set the
- # 'omit_oversized_labels' to False
- seq.Feature(
- "primer_bind",
- [seq.Location(1385, 1405)],
- {"note": "geneC"}
- ),
- seq.Feature(
- "primer_bind",
- [seq.Location(345, 365, seq.Location.Strand.REVERSE)],
- {"note": "geneC_R"}
- ),
-
- # Terminator
- seq.Feature(
- "regulatory",
- [seq.Location(310, 350)],
- {"regulatory_class": "terminator", "note": "MyTerm"}
- ),
-])
+annotation = seq.Annotation(
+ [
+ seq.Feature(
+ "source", [seq.Location(0, 1500)], {"organism": "Escherichia coli"}
+ ),
+ # Ori
+ seq.Feature(
+ "rep_origin",
+ [seq.Location(600, 700, seq.Location.Strand.REVERSE)],
+ {"regulatory_class": "promoter", "note": "MyProm"},
+ ),
+ # Promoter
+ seq.Feature(
+ "regulatory",
+ [seq.Location(1000, 1060)],
+ {"regulatory_class": "promoter", "note": "MyProm"},
+ ),
+ seq.Feature("protein_bind", [seq.Location(1025, 1045)], {"note": "repr"}),
+ # Gene A
+ seq.Feature(
+ "regulatory",
+ [seq.Location(1070, 1080)],
+ {"regulatory_class": "ribosome_binding_site"},
+ ),
+ seq.Feature("CDS", [seq.Location(1091, 1150)], {"product": "geneA"}),
+ # Gene B
+ seq.Feature(
+ "regulatory",
+ [seq.Location(1180, 1190)],
+ {"regulatory_class": "ribosome_binding_site"},
+ ),
+ seq.Feature("CDS", [seq.Location(1201, 1350)], {"product": "geneB"}),
+ seq.Feature(
+ "regulatory",
+ [seq.Location(1220, 1230)],
+ {"regulatory_class": "ribosome_binding_site"},
+ ),
+ seq.Feature("CDS", [seq.Location(1240, 1350)], {"product": "geneB2"}),
+ # Gene C
+ seq.Feature(
+ "regulatory",
+ [seq.Location(1380, 1390)],
+ {"regulatory_class": "ribosome_binding_site"},
+ ),
+ seq.Feature(
+ "CDS",
+ # CDS extends over periodic boundary -> two locations
+ [seq.Location(1, 300), seq.Location(1402, 1500)],
+ {"product": "geneC"},
+ ),
+ # Terminator
+ seq.Feature(
+ "regulatory",
+ [seq.Location(310, 350)],
+ {"regulatory_class": "terminator", "note": "MyTerm"},
+ ),
+ # Primers
+ # The labels will be too long to be displayed on the map
+ # If you want to display them nevertheless, set the
+ # 'omit_oversized_labels' to False
+ seq.Feature("primer_bind", [seq.Location(1385, 1405)], {"note": "geneC"}),
+ seq.Feature(
+ "primer_bind",
+ [seq.Location(345, 365, seq.Location.Strand.REVERSE)],
+ {"note": "geneC_R"},
+ ),
+ # Terminator
+ seq.Feature(
+ "regulatory",
+ [seq.Location(310, 350)],
+ {"regulatory_class": "terminator", "note": "MyTerm"},
+ ),
+ ]
+)
fig = plt.figure(figsize=(8.0, 8.0))
ax = fig.add_subplot(111, projection="polar")
graphics.plot_plasmid_map(
- ax, annotation, plasmid_size=1500, label="My plasmid",
- label_properties={"fontsize": 8}
+ ax,
+ annotation,
+ plasmid_size=1500,
+ label="My plasmid",
+ label_properties={"fontsize": 8},
)
ticks = ax.get_xticks()
diff --git a/doc/examples/scripts/sequence/annotation/region_visualization.py b/doc/examples/scripts/sequence/annotation/region_visualization.py
index 6bdd55455..09aa3b43c 100644
--- a/doc/examples/scripts/sequence/annotation/region_visualization.py
+++ b/doc/examples/scripts/sequence/annotation/region_visualization.py
@@ -9,16 +9,13 @@
# Code source: Patrick Kunzmann
# License: BSD 3 clause
-import biotite.sequence as seq
+import matplotlib.pyplot as plt
+import biotite.database.entrez as entrez
import biotite.sequence.graphics as graphics
import biotite.sequence.io.genbank as gb
-import biotite.database.entrez as entrez
-import numpy as np
-import matplotlib.pyplot as plt
# Download E. coli BL21 genome
-file = entrez.fetch("CP001509", None, suffix="gb",
- db_name="nuccore", ret_type="gb")
+file = entrez.fetch("CP001509", None, suffix="gb", db_name="nuccore", ret_type="gb")
gb_file = gb.GenBankFile.read(file)
_, seq_length, _, _, _, _ = gb.get_locus(gb_file)
annotation = gb.get_annotation(gb_file, include_only=["gene"])
@@ -29,13 +26,15 @@
for loc in feature.locs:
# Ignore if feature is only a pseudo-gene (e.g. gene fragment)
# and check if feature is lacA gene (begin of lac operon)
- if "gene" in feature.qual \
- and "pseudo" not in feature.qual \
- and feature.qual["gene"] == "lacA":
- if min_loc > loc.first:
- min_loc = loc.first
- if max_loc < loc.last:
- max_loc = loc.last
+ if (
+ "gene" in feature.qual
+ and "pseudo" not in feature.qual
+ and feature.qual["gene"] == "lacA"
+ ):
+ if min_loc > loc.first:
+ min_loc = loc.first
+ if max_loc < loc.last:
+ max_loc = loc.last
# Extend the location range by 1000 (arbitrary) in each direction
min_loc -= 10000
max_loc += 10000
@@ -44,9 +43,13 @@
fig = plt.figure(figsize=(8.0, 8.0))
ax = fig.add_subplot(111)
graphics.plot_feature_map(
- ax, annotation, loc_range=(min_loc, max_loc), symbols_per_line=2000,
- show_numbers=True, show_line_position=True
+ ax,
+ annotation,
+ loc_range=(min_loc, max_loc),
+ symbols_per_line=2000,
+ show_numbers=True,
+ show_line_position=True,
)
fig.tight_layout()
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/sequence/annotation/sigma_domains.py b/doc/examples/scripts/sequence/annotation/sigma_domains.py
index 5d5a5ea36..d3c6b7365 100644
--- a/doc/examples/scripts/sequence/annotation/sigma_domains.py
+++ b/doc/examples/scripts/sequence/annotation/sigma_domains.py
@@ -11,37 +11,37 @@
import re
from collections import OrderedDict
-import numpy as np
import matplotlib.pyplot as plt
-from matplotlib.patches import Rectangle, FancyBboxPatch
-import biotite.sequence as seq
-import biotite.sequence.io.genbank as gb
+import numpy as np
+from matplotlib.patches import FancyBboxPatch, Rectangle
import biotite.database.entrez as entrez
-
+import biotite.sequence.io.genbank as gb
# The names of the sigma factors and the corresponding genes
-genes = OrderedDict({
- r"$\sigma^{70}$": "rpoD",
- r"$\sigma^{24}$": "rpoE",
- r"$\sigma^{28}$": "rpoF",
- r"$\sigma^{32}$": "rpoH",
- r"$\sigma^{38}$": "rpoS",
-})
+genes = OrderedDict(
+ {
+ r"$\sigma^{70}$": "rpoD",
+ r"$\sigma^{24}$": "rpoE",
+ r"$\sigma^{28}$": "rpoF",
+ r"$\sigma^{32}$": "rpoH",
+ r"$\sigma^{38}$": "rpoS",
+ }
+)
# Find SwissProt entries for these genes in NCBI Entrez protein database
uids = []
for name, gene in genes.items():
- query = entrez.SimpleQuery(gene, "Gene Name") \
- & entrez.SimpleQuery("srcdb_swiss-prot", "Properties") \
- & entrez.SimpleQuery("Escherichia coli K-12", "Organism")
+ query = (
+ entrez.SimpleQuery(gene, "Gene Name")
+ & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
+ & entrez.SimpleQuery("Escherichia coli K-12", "Organism")
+ )
ids = entrez.search(query, "protein")
# Only one entry per gene in E. coli K-12 is expected
assert len(ids) == 1
uids += ids
# Download corresponding GenBank files as single, merged file
-file = entrez.fetch_single_file(
- uids, None, "protein", ret_type="gb"
-)
+file = entrez.fetch_single_file(uids, None, "protein", ret_type="gb")
# Array that will hold for each of the genes and each of the 4 domains
# the first and last position
@@ -55,53 +55,66 @@
# Iterate over each GenBank entry
for i, gb_file in enumerate(multi_file):
_, length, _, _, _, _ = gb.get_locus(gb_file)
- seq_lengths[i] = length
+ seq_lengths[i] = length
annotation = gb.get_annotation(gb_file)
# Find features, that represent a sigma factor domain
for feature in annotation:
- if feature.key == "Region" and "note" in feature.qual \
- and "Sigma-70 factor domain" in feature.qual["note"]:
- # Extract the domain number
- # and decrement for 0-based indexing
- #
- # e.g. 'Sigma-70 factor domain-2.' => 1
- # ^
- domain_index = int(re.findall(
- "(?<=Sigma-70 factor domain-)\d+",
- feature.qual["note"]
- )[0]) -1
- # Expect a single contiguous location of the domain
- assert len(feature.locs) == 1
- loc = list(feature.locs)[0]
- # Store first and last position of the domain
- domain_pos[i, domain_index, :] = [loc.first, loc.last]
+ if (
+ feature.key == "Region"
+ and "note" in feature.qual
+ and "Sigma-70 factor domain" in feature.qual["note"]
+ ):
+ # Extract the domain number
+ # and decrement for 0-based indexing
+ #
+ # e.g. 'Sigma-70 factor domain-2.' => 1
+ # ^
+ domain_index = (
+ int(
+ re.findall(
+ r"(?<=Sigma-70 factor domain-)\d+", feature.qual["note"]
+ )[0]
+ )
+ - 1
+ )
+ # Expect a single contiguous location of the domain
+ assert len(feature.locs) == 1
+ loc = list(feature.locs)[0]
+ # Store first and last position of the domain
+ domain_pos[i, domain_index, :] = [loc.first, loc.last]
fig = plt.figure(figsize=(8.0, 4.0))
ax = fig.gca()
# The color for each one of the four domains
colors = ["firebrick", "forestgreen", "dodgerblue", "goldenrod"]
# Draw each sequence
-for i, (gene_name, domain_pos_for_gene, length) \
- in enumerate(zip(genes.keys(), domain_pos, seq_lengths)):
- # Add base line representing the sequence itself
- ax.add_patch(Rectangle(
- (1, i-0.05), length, 0.1, color="gray"
- ))
- # Draw each domain
- for j, ((first, last), color) \
- in enumerate(zip(domain_pos_for_gene, colors)):
- if first != -1 and last != -1:
- # FancyBboxPatch to get rounded corners in rectangle
- ax.add_patch(FancyBboxPatch(
- (first, i-0.4), last-first, 0.8, #color=color,
- boxstyle="round,pad=0,rounding_size=10",
- ec="black", fc=color,
- mutation_aspect=0.02
- ))
- ax.text(
- x=(last+first)/2, y=i, s=fr"$\sigma_{j+1}$",
- ha="center", va="center"
- )
+for i, (gene_name, domain_pos_for_gene, length) in enumerate(
+ zip(genes.keys(), domain_pos, seq_lengths)
+):
+ # Add base line representing the sequence itself
+ ax.add_patch(Rectangle((1, i - 0.05), length, 0.1, color="gray"))
+ # Draw each domain
+ for j, ((first, last), color) in enumerate(zip(domain_pos_for_gene, colors)):
+ if first != -1 and last != -1:
+ # FancyBboxPatch to get rounded corners in rectangle
+ ax.add_patch(
+ FancyBboxPatch(
+ (first, i - 0.4),
+ last - first,
+ 0.8, # color=color,
+ boxstyle="round,pad=0,rounding_size=10",
+ ec="black",
+ fc=color,
+ mutation_aspect=0.02,
+ )
+ )
+ ax.text(
+ x=(last + first) / 2,
+ y=i,
+ s=rf"$\sigma_{j+1}$",
+ ha="center",
+ va="center",
+ )
ax.set_xlim(0, max(seq_lengths))
ax.set_xlabel("Sequence position")
# Inverted y-axis
diff --git a/doc/examples/scripts/sequence/homology/avidin_alignment.py b/doc/examples/scripts/sequence/homology/avidin_alignment.py
index 40b50083f..da67ff617 100644
--- a/doc/examples/scripts/sequence/homology/avidin_alignment.py
+++ b/doc/examples/scripts/sequence/homology/avidin_alignment.py
@@ -11,16 +11,16 @@
# License: BSD 3 clause
import matplotlib.pyplot as plt
+import biotite.database.entrez as entrez
import biotite.sequence as seq
import biotite.sequence.align as align
-import biotite.sequence.io.fasta as fasta
-import biotite.database.entrez as entrez
import biotite.sequence.graphics as graphics
+import biotite.sequence.io.fasta as fasta
# Download and parse protein sequences of avidin and streptavidin
-fasta_file = fasta.FastaFile.read(entrez.fetch_single_file(
- ["CAC34569", "ACL82594"], None, "protein", "fasta"
-))
+fasta_file = fasta.FastaFile.read(
+ entrez.fetch_single_file(["CAC34569", "ACL82594"], None, "protein", "fasta")
+)
for name, sequence in fasta_file.items():
if "CAC34569" in name:
avidin_seq = seq.ProteinSequence(sequence)
@@ -31,16 +31,21 @@
matrix = align.SubstitutionMatrix.std_protein_matrix()
# Perform pairwise sequence alignment with affine gap penalty
# Terminal gaps are not penalized
-alignments = align.align_optimal(avidin_seq, streptavidin_seq, matrix,
- gap_penalty=(-10, -1), terminal_penalty=False)
+alignments = align.align_optimal(
+ avidin_seq, streptavidin_seq, matrix, gap_penalty=(-10, -1), terminal_penalty=False
+)
# Draw first and only alignment
-# The color intensity indicates the similiarity
+# The color intensity indicates the similarity
fig = plt.figure(figsize=(8.0, 2.5))
ax = fig.add_subplot(111)
graphics.plot_alignment_similarity_based(
- ax, alignments[0], matrix=matrix, labels=["Avidin", "Streptavidin"],
- show_numbers=True, show_line_position=True
+ ax,
+ alignments[0],
+ matrix=matrix,
+ labels=["Avidin", "Streptavidin"],
+ show_numbers=True,
+ show_line_position=True,
)
fig.tight_layout()
diff --git a/doc/examples/scripts/sequence/homology/bionigma_alignment.py b/doc/examples/scripts/sequence/homology/bionigma_alignment.py
index 9a3b8d1b5..c2275b2fe 100644
--- a/doc/examples/scripts/sequence/homology/bionigma_alignment.py
+++ b/doc/examples/scripts/sequence/homology/bionigma_alignment.py
@@ -12,121 +12,132 @@
# Code source: Patrick Kunzmann
# License: BSD 3 clause
-import numpy as np
import matplotlib.pyplot as plt
+import numpy as np
from matplotlib.patches import Rectangle
from matplotlib.transforms import Bbox
+import biotite.database.entrez as entrez
import biotite.sequence as seq
import biotite.sequence.align as align
-import biotite.sequence.io.fasta as fasta
import biotite.sequence.graphics as graphics
-import biotite.database.entrez as entrez
-
+import biotite.sequence.io.fasta as fasta
# The polygon coordinates for the different shapes
-_hexagon_coord = np.array([
- (0.500, 0.000),
- (0.937, 0.250),
- (0.937, 0.750),
- (0.500, 1.000),
- (0.063, 0.750),
- (0.063, 0.250)
-])
-
-_spiked_coord = np.array([
- (0.000, 0.000),
- (0.500, 0.150),
- (1.000, 0.000),
- (0.850, 0.500),
- (1.000, 1.000),
- (0.500, 0.850),
- (0.000, 1.000),
- (0.150, 0.500),
-])
-
-_spiked_coord = np.array([
- (0.000, 0.000),
- (0.500, 0.150),
- (1.000, 0.000),
- (0.850, 0.500),
- (1.000, 1.000),
- (0.500, 0.850),
- (0.000, 1.000),
- (0.150, 0.500),
-])
-
-_cross_coord = np.array([
- (0.220, 0.000),
- (0.780, 0.000),
- (0.780, 0.220),
- (1.000, 0.220),
- (1.000, 0.780),
- (0.780, 0.780),
- (0.780, 1.000),
- (0.220, 1.000),
- (0.220, 0.780),
- (0.000, 0.780),
- (0.000, 0.220),
- (0.220, 0.220),
-])
-
-_star_coord = np.array([
- (0.500, 0.000),
- (0.648, 0.150),
- (0.852, 0.150),
- (0.852, 0.352),
- (1.000, 0.500),
- (0.852, 0.648),
- (0.852, 0.852),
- (0.648, 0.852),
- (0.500, 1.000),
- (0.352, 0.852),
- (0.148, 0.852),
- (0.148, 0.648),
- (0.000, 0.500),
- (0.148, 0.352),
- (0.148, 0.148),
- (0.352, 0.148),
-])
-
-_hourglass_coord = np.array([
- (0.000, 0.000),
- (1.000, 0.000),
- (1.000, 0.220),
- (0.740, 0.420),
- (0.740, 0.580),
- (1.000, 0.780),
- (1.000, 1.000),
- (0.000, 1.000),
- (0.000, 0.780),
- (0.260, 0.580),
- (0.260, 0.420),
- (0.000, 0.220),
-])
+_hexagon_coord = np.array(
+ [
+ (0.500, 0.000),
+ (0.937, 0.250),
+ (0.937, 0.750),
+ (0.500, 1.000),
+ (0.063, 0.750),
+ (0.063, 0.250),
+ ]
+)
+
+_spiked_coord = np.array(
+ [
+ (0.000, 0.000),
+ (0.500, 0.150),
+ (1.000, 0.000),
+ (0.850, 0.500),
+ (1.000, 1.000),
+ (0.500, 0.850),
+ (0.000, 1.000),
+ (0.150, 0.500),
+ ]
+)
+
+_spiked_coord = np.array(
+ [
+ (0.000, 0.000),
+ (0.500, 0.150),
+ (1.000, 0.000),
+ (0.850, 0.500),
+ (1.000, 1.000),
+ (0.500, 0.850),
+ (0.000, 1.000),
+ (0.150, 0.500),
+ ]
+)
+
+_cross_coord = np.array(
+ [
+ (0.220, 0.000),
+ (0.780, 0.000),
+ (0.780, 0.220),
+ (1.000, 0.220),
+ (1.000, 0.780),
+ (0.780, 0.780),
+ (0.780, 1.000),
+ (0.220, 1.000),
+ (0.220, 0.780),
+ (0.000, 0.780),
+ (0.000, 0.220),
+ (0.220, 0.220),
+ ]
+)
+
+_star_coord = np.array(
+ [
+ (0.500, 0.000),
+ (0.648, 0.150),
+ (0.852, 0.150),
+ (0.852, 0.352),
+ (1.000, 0.500),
+ (0.852, 0.648),
+ (0.852, 0.852),
+ (0.648, 0.852),
+ (0.500, 1.000),
+ (0.352, 0.852),
+ (0.148, 0.852),
+ (0.148, 0.648),
+ (0.000, 0.500),
+ (0.148, 0.352),
+ (0.148, 0.148),
+ (0.352, 0.148),
+ ]
+)
+
+_hourglass_coord = np.array(
+ [
+ (0.000, 0.000),
+ (1.000, 0.000),
+ (1.000, 0.220),
+ (0.740, 0.420),
+ (0.740, 0.580),
+ (1.000, 0.780),
+ (1.000, 1.000),
+ (0.000, 1.000),
+ (0.000, 0.780),
+ (0.260, 0.580),
+ (0.260, 0.420),
+ (0.000, 0.220),
+ ]
+)
-# The shape color for each symbols
+# The shape color for each symbol
_colors = {
- "A" : "#1e67b6",
- "C" : "#00a391",
- "D" : "#ea42fc",
- "E" : "#109c4b",
- "F" : "#fed700",
- "G" : "#8d4712",
- "H" : "#ff8e00",
- "I" : "#d82626",
- "K" : "#109c4b",
- "L" : "#d82626",
- "M" : "#d82626",
- "N" : "#ea42fc",
- "P" : "#ffa9e3",
- "Q" : "#109c4b",
- "R" : "#109c4b",
- "S" : "#1e67b6",
- "T" : "#1e67b6",
- "V" : "#d82626",
- "W" : "#fed700",
- "Y" : "#fed700"
+ "A": "#1e67b6",
+ "C": "#00a391",
+ "D": "#ea42fc",
+ "E": "#109c4b",
+ "F": "#fed700",
+ "G": "#8d4712",
+ "H": "#ff8e00",
+ "I": "#d82626",
+ "K": "#109c4b",
+ "L": "#d82626",
+ "M": "#d82626",
+ "N": "#ea42fc",
+ "P": "#ffa9e3",
+ "Q": "#109c4b",
+ "R": "#109c4b",
+ "S": "#1e67b6",
+ "T": "#1e67b6",
+ "V": "#d82626",
+ "W": "#fed700",
+ "Y": "#fed700",
}
@@ -134,31 +145,32 @@ class ShapePlotter(graphics.SymbolPlotter):
"""
A symbol plotter that depicts each symbol by color and shape.
"""
+
def __init__(self, axes, font_size=None, font_param=None):
super().__init__(axes)
# The symbol to shape mapping
self._draw_funcs = {
- "A" : ShapePlotter._draw_circle,
- "T" : ShapePlotter._draw_circle,
- "S" : ShapePlotter._draw_circle,
- "N" : ShapePlotter._draw_circle,
- "D" : ShapePlotter._draw_rectangle,
- "E" : ShapePlotter._draw_rectangle,
- "Q" : ShapePlotter._draw_rectangle,
- "K" : ShapePlotter._draw_rectangle,
- "R" : ShapePlotter._draw_rectangle,
- "I" : ShapePlotter._draw_hexagon,
- "L" : ShapePlotter._draw_hexagon,
- "V" : ShapePlotter._draw_hexagon,
- "M" : ShapePlotter._draw_hexagon,
- "F" : ShapePlotter._draw_spiked,
- "W" : ShapePlotter._draw_spiked,
- "Y" : ShapePlotter._draw_spiked,
- "H" : ShapePlotter._draw_spiked,
- "G" : ShapePlotter._draw_cross,
- "P" : ShapePlotter._draw_star,
- "C" : ShapePlotter._draw_hourglass
+ "A": ShapePlotter._draw_circle,
+ "T": ShapePlotter._draw_circle,
+ "S": ShapePlotter._draw_circle,
+ "N": ShapePlotter._draw_circle,
+ "D": ShapePlotter._draw_rectangle,
+ "E": ShapePlotter._draw_rectangle,
+ "Q": ShapePlotter._draw_rectangle,
+ "K": ShapePlotter._draw_rectangle,
+ "R": ShapePlotter._draw_rectangle,
+ "I": ShapePlotter._draw_hexagon,
+ "L": ShapePlotter._draw_hexagon,
+ "V": ShapePlotter._draw_hexagon,
+ "M": ShapePlotter._draw_hexagon,
+ "F": ShapePlotter._draw_spiked,
+ "W": ShapePlotter._draw_spiked,
+ "Y": ShapePlotter._draw_spiked,
+ "H": ShapePlotter._draw_spiked,
+ "G": ShapePlotter._draw_cross,
+ "P": ShapePlotter._draw_star,
+ "C": ShapePlotter._draw_hourglass,
}
self._font_size = font_size
@@ -166,8 +178,8 @@ def __init__(self, axes, font_size=None, font_param=None):
def plot_symbol(self, bbox, alignment, column_i, seq_i):
trace = alignment.trace
- if trace[column_i,seq_i] != -1:
- symbol = alignment.sequences[seq_i][trace[column_i,seq_i]]
+ if trace[column_i, seq_i] != -1:
+ symbol = alignment.sequences[seq_i][trace[column_i, seq_i]]
else:
symbol = ""
color = self._get_color(alignment, column_i, seq_i)
@@ -178,16 +190,21 @@ def plot_symbol(self, bbox, alignment, column_i, seq_i):
# Shrink Bbox slightly to get a small margin between shapes
f = 0.04
shape_bbox = Bbox(
- ((bbox.x0 + f*bbox.width,
- bbox.y0 + f*bbox.height),
- (bbox.x1 - f*bbox.width,
- bbox.y1 - f*bbox.height)),
+ (
+ (bbox.x0 + f * bbox.width, bbox.y0 + f * bbox.height),
+ (bbox.x1 - f * bbox.width, bbox.y1 - f * bbox.height),
+ ),
)
draw_func(self, shape_bbox, color)
text = self.axes.text(
- bbox.x0 + bbox.width/2, bbox.y0 + bbox.height/2,
- symbol, color="black", ha="center", va="center",
- size=self._font_size, **self._font_param
+ bbox.x0 + bbox.width / 2,
+ bbox.y0 + bbox.height / 2,
+ symbol,
+ color="black",
+ ha="center",
+ va="center",
+ size=self._font_size,
+ **self._font_param,
)
text.set_clip_on(True)
@@ -203,15 +220,17 @@ def _draw_circle(self, bbox, color):
from matplotlib.patches import Circle
circle = Circle(
- (bbox.x0 + bbox.width/2, bbox.y0 + bbox.height/2), bbox.width/2,
- facecolor=color, edgecolor="None", fill=True
+ (bbox.x0 + bbox.width / 2, bbox.y0 + bbox.height / 2),
+ bbox.width / 2,
+ facecolor=color,
+ edgecolor="None",
+ fill=True,
)
self.axes.add_patch(circle)
def _draw_rectangle(self, bbox, color):
rectangle = Rectangle(
- bbox.p0, bbox.width, bbox.height,
- facecolor=color, edgecolor="None"
+ bbox.p0, bbox.width, bbox.height, facecolor=color, edgecolor="None"
)
self.axes.add_patch(rectangle)
@@ -241,45 +260,50 @@ def _draw_polygon(self, bbox, color, coord):
self.axes.add_patch(polygon)
-def plot_alignment_shapes(axes, alignment, symbols_per_line=30,
- show_numbers=False, number_size=None,
- number_functions=None,
- labels=None, label_size=None,
- show_line_position=False,
- spacing=1, color_symbols=False,
- symbol_size=None, symbol_param=None):
+def plot_alignment_shapes(
+ axes,
+ alignment,
+ symbols_per_line=30,
+ show_numbers=False,
+ number_size=None,
+ number_functions=None,
+ labels=None,
+ label_size=None,
+ show_line_position=False,
+ spacing=1,
+ symbol_size=None,
+ symbol_param=None,
+):
"""
A thin wrapper around the 'ShapePlotter' and 'plot_alignment()'
function.
"""
- alphabet = alignment.sequences[0].get_alphabet()
- symbol_plotter = ShapePlotter(
- axes, font_size=symbol_size, font_param=symbol_param
- )
+ symbol_plotter = ShapePlotter(axes, font_size=symbol_size, font_param=symbol_param)
graphics.plot_alignment(
- axes=axes, alignment=alignment, symbol_plotter=symbol_plotter,
+ axes=axes,
+ alignment=alignment,
+ symbol_plotter=symbol_plotter,
symbols_per_line=symbols_per_line,
- show_numbers=show_numbers, number_size=number_size,
+ show_numbers=show_numbers,
+ number_size=number_size,
number_functions=number_functions,
- labels=labels, label_size=label_size,
+ labels=labels,
+ label_size=label_size,
show_line_position=show_line_position,
- spacing=spacing
+ spacing=spacing,
)
twin = axes.get_shared_x_axes().get_siblings(axes)[0]
for ax in (axes, twin):
- ax.set_yticklabels(ax.get_yticklabels(), fontdict={"color":"white"})
+ ax.set_yticklabels(ax.get_yticklabels(), fontdict={"color": "white"})
axes.get_figure().patch.set_facecolor("#181818")
-
-
# Using cyclotide sequences as example
-query = (
- entrez.SimpleQuery("Cyclotide") &
- entrez.SimpleQuery("cter") &
- entrez.SimpleQuery("srcdb_swiss-prot", field="Properties") ^
- entrez.SimpleQuery("Precursor")
+query = entrez.SimpleQuery("Cyclotide") & entrez.SimpleQuery(
+ "cter"
+) & entrez.SimpleQuery("srcdb_swiss-prot", field="Properties") ^ entrez.SimpleQuery(
+ "Precursor"
)
uids = entrez.search(query, "protein")
fasta_file = fasta.FastaFile.read(
@@ -289,8 +313,7 @@ def plot_alignment_shapes(axes, alignment, symbols_per_line=30,
-# Currently there seems to b a bug in the NCBI search,
+# Currently there seems to be a bug in the NCBI search,
# so that 'Precursor' results are still included
# Solve this by filtering the sequence length
-sequence_dict = {header: seq for header, seq in sequence_dict.items()
- if len(seq) < 100}
+sequence_dict = {header: seq for header, seq in sequence_dict.items() if len(seq) < 100}
headers = list(sequence_dict.keys())
sequences = list(sequence_dict.values())
labels = [header[-1] for header in headers]
@@ -306,8 +329,7 @@ def plot_alignment_shapes(axes, alignment, symbols_per_line=30,
fig = plt.figure(figsize=(8.0, 4.0))
ax = fig.add_subplot(111)
plot_alignment_shapes(
- ax, alignment, labels=labels, symbols_per_line=len(alignment),
- symbol_size=8
+ ax, alignment, labels=labels, symbols_per_line=len(alignment), symbol_size=8
)
# The aspect ratio of the shapes should be preserved:
# Squares should look like squares, circles should look like circles
@@ -316,4 +338,4 @@ def plot_alignment_shapes(axes, alignment, symbols_per_line=30,
ax.set_ylabel("Type", color="white")
ax.set_title("Comparison of cyclotide sequences", color="white")
fig.tight_layout()
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/sequence/homology/genome_comparison.py b/doc/examples/scripts/sequence/homology/genome_comparison.py
index fc360804d..066388ce5 100644
--- a/doc/examples/scripts/sequence/homology/genome_comparison.py
+++ b/doc/examples/scripts/sequence/homology/genome_comparison.py
@@ -31,28 +31,25 @@
# License: BSD 3 clause
import tempfile
-import numpy as np
import matplotlib.pyplot as plt
+import numpy as np
from matplotlib.patches import Rectangle
from matplotlib.ticker import MultipleLocator
import biotite
+import biotite.application.tantan as tantan
+import biotite.database.entrez as entrez
import biotite.sequence as seq
+import biotite.sequence.align as align
import biotite.sequence.io as seqio
import biotite.sequence.io.genbank as gb
-import biotite.sequence.align as align
-import biotite.database.entrez as entrez
-import biotite.application.tantan as tantan
-
fasta_file = entrez.fetch(
- "NC_000932", tempfile.gettempdir(), "fasta",
- db_name="Nucleotide", ret_type="fasta"
+ "NC_000932", tempfile.gettempdir(), "fasta", db_name="Nucleotide", ret_type="fasta"
)
chloroplast_seq = seqio.load_sequence(fasta_file)
fasta_file = entrez.fetch(
- "NC_000911", tempfile.gettempdir(), "fasta",
- db_name="Nucleotide", ret_type="fasta"
+ "NC_000911", tempfile.gettempdir(), "fasta", db_name="Nucleotide", ret_type="fasta"
)
bacterium_seq = seqio.load_sequence(fasta_file)
@@ -73,15 +70,13 @@
# one ``111∗1∗11∗1∗∗11∗111`` :footcite:`Choi2004` is used here.
repeat_mask = tantan.TantanApp.mask_repeats(bacterium_seq)
-bacterium_seqs = [
- bacterium_seq, bacterium_seq.reverse(copy=False).complement()
-]
+bacterium_seqs = [bacterium_seq, bacterium_seq.reverse(copy=False).complement()]
table = align.KmerTable.from_sequences(
- k = 12,
- sequences = bacterium_seqs,
- spacing = "111∗1∗11∗1∗∗11∗111",
- ignore_masks = [repeat_mask, repeat_mask[::-1].copy()]
+ k=12,
+ sequences=bacterium_seqs,
+ spacing="111∗1∗11∗1∗∗11∗111",
+ ignore_masks=[repeat_mask, repeat_mask[::-1].copy()],
)
########################################################################
@@ -117,7 +112,7 @@
# Store the indices to the match array
# for each combination of diagonal and strand on the bacterial genome
matches_for_diagonals = {}
-for i, (diag, strand) in enumerate(zip(diagonals, matches[:,1])):
+for i, (diag, strand) in enumerate(zip(diagonals, matches[:, 1])):
if (diag, strand) not in matches_for_diagonals:
matches_for_diagonals[(diag, strand)] = [i]
else:
@@ -125,8 +120,9 @@
# If a diagonal has more than one match,
# the first match on this diagonal is a double hit
-double_hit_indices = [indices[0] for indices
- in matches_for_diagonals.values() if len(indices) > 1]
+double_hit_indices = [
+ indices[0] for indices in matches_for_diagonals.values() if len(indices) > 1
+]
double_hits = matches[double_hit_indices]
print("Number of double hits:", len(double_hits))
@@ -148,13 +144,19 @@
ACCEPT_THRESHOLD = 100
matrix = align.SubstitutionMatrix.std_nucleotide_matrix()
-ungapped_scores = np.array([
- align.align_local_ungapped(
- chloroplast_seq, bacterium_seqs[strand], matrix,
- seed=(i,j), threshold=X_DROP, score_only=True
- )
- for i, strand, j in double_hits
-])
+ungapped_scores = np.array(
+ [
+ align.align_local_ungapped(
+ chloroplast_seq,
+ bacterium_seqs[strand],
+ matrix,
+ seed=(i, j),
+ threshold=X_DROP,
+ score_only=True,
+ )
+ for i, strand, j in double_hits
+ ]
+)
accepted_hits = double_hits[ungapped_scores > ACCEPT_THRESHOLD]
print("Number of accepted ungapped alignments:", len(accepted_hits))
@@ -190,19 +192,27 @@
estimator = align.EValueEstimator.from_samples(
chloroplast_seq.alphabet,
# The scoring scheme must be the same as used for the alignment
- matrix, GAP_PENALTY,
- background
+ matrix,
+ GAP_PENALTY,
+ background,
)
# Compute similarity scores for each hit
-gapped_scores = np.array([
- align.align_local_gapped(
- chloroplast_seq, bacterium_seqs[strand], matrix,
- seed=(i,j), gap_penalty=GAP_PENALTY, threshold=X_DROP, score_only=True,
- max_table_size=100_000_000
- )
- for i, strand, j in accepted_hits
-])
+gapped_scores = np.array(
+ [
+ align.align_local_gapped(
+ chloroplast_seq,
+ bacterium_seqs[strand],
+ matrix,
+ seed=(i, j),
+ gap_penalty=GAP_PENALTY,
+ threshold=X_DROP,
+ score_only=True,
+ max_table_size=100_000_000,
+ )
+ for i, strand, j in accepted_hits
+ ]
+)
# Calculate the E-values
# For numeric stability reasons the method returns the common logarithm
@@ -215,10 +225,14 @@
accepted_alignments = [
(
align.align_local_gapped(
- chloroplast_seq, bacterium_seqs[strand], matrix,
- seed=(i,j), gap_penalty=GAP_PENALTY, threshold=X_DROP,
+ chloroplast_seq,
+ bacterium_seqs[strand],
+ matrix,
+ seed=(i, j),
+ gap_penalty=GAP_PENALTY,
+ threshold=X_DROP,
)[0],
- log_evalue
+ log_evalue,
)
for (i, strand, j), log_evalue in zip(accepted_hits, log_evalues)
if log_evalue <= np.log10(EVALUE_THRESHOLD)
@@ -248,11 +262,11 @@
stop = alignment.trace[-1, 0]
# If this region was not covered by any other alignment before,
# accept it and mark the region as covered
- if not covered_range[start : stop].any():
+ if not covered_range[start:stop].any():
unique_alignments.append((alignment, log_evalue))
- covered_range[start : stop] = True
+ covered_range[start:stop] = True
-print("Number of unique alignments:", len(unique_alignments))
+print("Number of unique alignments:", len(unique_alignments))
########################################################################
# To take a closer look on the found homologous regions, they are viewed
@@ -269,9 +283,9 @@
MARGIN_SIZE = 250
COLORS = {
- "CDS" : biotite.colors["dimgreen"],
+ "CDS": biotite.colors["dimgreen"],
"tRNA": biotite.colors["orange"],
- "rRNA": biotite.colors["orange"]
+ "rRNA": biotite.colors["orange"],
}
@@ -282,7 +296,6 @@
annotation = gb.get_annotation(gb_file, include_only=["CDS", "rRNA", "tRNA"])
-
def draw_arrow(ax, feature, loc):
x = loc.first
dx = loc.last - loc.first + 1
@@ -294,18 +307,25 @@ def draw_arrow(ax, feature, loc):
dx = loc.first - loc.last + 1
# Create head with 90 degrees tip -> head width/length ratio = 1/2
- ax.add_patch(biotite.AdaptiveFancyArrow(
- x, 0.5, dx, 0, tail_width=0.4, head_width=0.7, head_ratio=0.5,
- draw_head=True, color=COLORS[feature.key], linewidth=0
- ))
+ ax.add_patch(
+ biotite.AdaptiveFancyArrow(
+ x,
+ 0.5,
+ dx,
+ 0,
+ tail_width=0.4,
+ head_width=0.7,
+ head_ratio=0.5,
+ draw_head=True,
+ color=COLORS[feature.key],
+ linewidth=0,
+ )
+ )
label = feature.qual.get("gene")
if label is not None:
- ax.text(
- x + dx/2, 0.5, label, color="black",
- ha="center", va="center", size=8
- )
+ ax.text(x + dx / 2, 0.5, label, color="black", ha="center", va="center", size=8)
# Fetch features of the chloroplast genome
@@ -315,21 +335,15 @@ def draw_arrow(ax, feature, loc):
annotation = gb.get_annotation(gb_file, include_only=["CDS", "rRNA", "tRNA"])
n_rows = int(np.ceil(len(unique_alignments) / N_COL))
-fig, axes = plt.subplots(
- n_rows, N_COL,
- figsize=(8.0, 24.0),
- constrained_layout=True
-)
+fig, axes = plt.subplots(n_rows, N_COL, figsize=(8.0, 24.0), constrained_layout=True)
-for (alignment, log_evalue), ax in zip(
- unique_alignments, axes.flatten()
-):
+for (alignment, log_evalue), ax in zip(unique_alignments, axes.flatten()):
# Transform 0-based sequence index to 1-based sequence position
first = alignment.trace[0, 0] + 1
last = alignment.trace[-1, 0] + 1
center = (first + last) // 2
if last - first < EXCERPT_SIZE - MARGIN_SIZE * 2:
- excerpt_loc = (center - EXCERPT_SIZE//2, center + EXCERPT_SIZE//2)
+ excerpt_loc = (center - EXCERPT_SIZE // 2, center + EXCERPT_SIZE // 2)
else:
# Exceed excerpt size to show entire alignment range
excerpt_loc = (first - MARGIN_SIZE, last + MARGIN_SIZE)
@@ -345,11 +359,18 @@ def draw_arrow(ax, feature, loc):
for loc in feature.locs:
draw_arrow(ax, feature, loc)
-    # Draw rectangle representing homologuous region
+    # Draw rectangle representing homologous region
- ax.add_patch(Rectangle(
- (first, 0.1), last - first + 1, 1 - 2*0.1,
- facecolor="None", edgecolor="black", alpha=0.2, linewidth=1,
- clip_on=False
- ))
+ ax.add_patch(
+ Rectangle(
+ (first, 0.1),
+ last - first + 1,
+ 1 - 2 * 0.1,
+ facecolor="None",
+ edgecolor="black",
+ alpha=0.2,
+ linewidth=1,
+ clip_on=False,
+ )
+ )
ax.xaxis.set_major_locator(MultipleLocator(1000))
ax.tick_params(labelsize=6)
@@ -359,13 +380,13 @@ def draw_arrow(ax, feature, loc):
ax.get_yaxis().set_tick_params(left=False, right=False, labelleft=False)
exponent = int(np.floor(log_evalue))
- mantissa = 10**(log_evalue-exponent)
+ mantissa = 10 ** (log_evalue - exponent)
homolog_excerpt = annotation[first : last + 1]
if len(homolog_excerpt) > 0:
# Select the longest feature in range for name display in title
representative_feature = max(
homolog_excerpt,
- key=lambda feature: -np.subtract(*feature.get_location_range())
+ key=lambda feature: -np.subtract(*feature.get_location_range()),
)
feature_name = representative_feature.qual["product"]
else:
@@ -377,14 +398,15 @@ def draw_arrow(ax, feature, loc):
ax.set_title(
f"{feature_name}\n"
- fr"E-Value: ${mantissa:.2f} \times 10^{{{exponent}}}$"
+ rf"E-Value: ${mantissa:.2f} \times 10^{{{exponent}}}$"
f"\nIdentity: {align.get_sequence_identity(alignment) * 100:3.1f} %",
- loc="left", size=8
+ loc="left",
+ size=8,
)
# Hide empty axes
-for ax in axes.flatten()[len(unique_alignments):]:
- ax.axis('off')
+for ax in axes.flatten()[len(unique_alignments) :]:
+ ax.axis("off")
fig.tight_layout(h_pad=3.0, w_pad=0.5)
@@ -399,4 +421,4 @@ def draw_arrow(ax, feature, loc):
# ----------
#
# .. footbibliography::
-#
\ No newline at end of file
+#
diff --git a/doc/examples/scripts/sequence/homology/genome_search.py b/doc/examples/scripts/sequence/homology/genome_search.py
index 8e98bd446..6649704b8 100644
--- a/doc/examples/scripts/sequence/homology/genome_search.py
+++ b/doc/examples/scripts/sequence/homology/genome_search.py
@@ -15,30 +15,26 @@
and is similar to the method used by software like *BLAST*.
At first the sequences for the *M1* coding gene and the *S. enterica*
-genome are downloaded from *NCBI Entrez*.
+genome are downloaded from *NCBI Entrez*.
"""
# Code source: Patrick Kunzmann
# License: BSD 3 clause
-import numpy as np
import matplotlib.pyplot as plt
+import numpy as np
from matplotlib.collections import LineCollection
import biotite
-import biotite.sequence as seq
-import biotite.sequence.io.fasta as fasta
-import biotite.sequence.io.genbank as gb
-import biotite.sequence.graphics as seqgraphics
-import biotite.sequence.align as align
-import biotite.database.entrez as entrez
-import biotite.structure.graphics as strucgraphics
import biotite.application.viennarna as viennarna
-
+import biotite.database.entrez as entrez
+import biotite.sequence.align as align
+import biotite.sequence.graphics as seqgraphics
+import biotite.sequence.io.genbank as gb
# Download Escherichia coli BL21 and Salmonella enterica genome
-gb_file = gb.MultiFile.read(entrez.fetch_single_file(
- ["CP001509", "CP019649"], None, "nuccore", "gb"
-))
+gb_file = gb.MultiFile.read(
+ entrez.fetch_single_file(["CP001509", "CP019649"], None, "nuccore", "gb")
+)
ec_file, se_file = tuple(gb_file)
annot_seq = gb.get_annotated_sequence(ec_file, include_only=["ncRNA"])
@@ -83,24 +79,27 @@
trigger_matches = []
# 0 represents the original genome sequence, 1 the reverse complement
for strand in (0, 1):
- matches_for_strand = matches[matches[:,1] == strand]
+ matches_for_strand = matches[matches[:, 1] == strand]
# Plot match positions
- ax = fig.add_subplot(1, 2, strand+1)
+ ax = fig.add_subplot(1, 2, strand + 1)
ax.scatter(
- matches_for_strand[:,0], matches_for_strand[:,2] / 1e6,
- s=4, marker="o", color=biotite.colors["dimorange"]
+ matches_for_strand[:, 0],
+ matches_for_strand[:, 2] / 1e6,
+ s=4,
+ marker="o",
+ color=biotite.colors["dimorange"],
)
ax.set_xlim(0, len(m1_sequence))
ax.set_ylim(0, len(se_genome) / 1e6)
ax.set_xlabel("E. coli M1 position (b)")
if strand == 0:
ax.set_ylabel("S. enterica genome position (Mb)")
- else: # strand == 1
+ else: # strand == 1
ax.set_ylabel("S. enterica genome position (Mb) (reverse complement)")
-
+
# Check if there are two adjacent matches on the same diagonal
- diagonals = matches_for_strand[:,2] - matches_for_strand[:,0]
+ diagonals = matches_for_strand[:, 2] - matches_for_strand[:, 0]
unique_diag = np.unique(diagonals)
trigger_diagonals = np.array([], dtype=int)
for diag in unique_diag:
@@ -116,7 +115,7 @@
# The other match on the same diagonal should not overlap
# with this match and should be within a cutoff range
if np.any((distances > K) & (distances < DISCARD_RANGE)):
- trigger_matches.append((strand, pos, pos+diag))
+ trigger_matches.append((strand, pos, pos + diag))
trigger_diagonals = np.append(trigger_diagonals, diag)
# Only add one match per diagonal at maximum
break
@@ -142,11 +141,14 @@
genome = genomic_seqs[strand]
diagonal = genome_pos - m1_pos
alignment = align.align_banded(
- m1_sequence, genome, matrix,
- band=(diagonal - BAND_WIDTH, diagonal + BAND_WIDTH), max_number=1
+ m1_sequence,
+ genome,
+ matrix,
+ band=(diagonal - BAND_WIDTH, diagonal + BAND_WIDTH),
+ max_number=1,
)[0]
alignments.append((strand, alignment))
-
+
strand, best_alignment = max(
-    alignments, key=lambda strand_alignment: alignment[1].score
+    alignments, key=lambda strand_alignment: strand_alignment[1].score
)
@@ -159,15 +161,19 @@
# genomic sequence.
# Reverse sequence numbering for second sequence (genome) in alignment
-number_funcs = [None, lambda x: len(best_alignment.sequences[1]) - x]
+number_funcs = [None, lambda x: len(best_alignment.sequences[1]) - x]
# Visualize alignment, use custom color
fig = plt.figure(figsize=(8.0, 4.0))
ax = fig.add_subplot(111)
seqgraphics.plot_alignment_similarity_based(
- ax, best_alignment, matrix=matrix,
- labels=["E. coli M1 coding gene", "S. enterica genome"], show_numbers=True,
- number_functions=number_funcs, show_line_position=True,
- color=biotite.colors["brightorange"]
+ ax,
+ best_alignment,
+ matrix=matrix,
+ labels=["E. coli M1 coding gene", "S. enterica genome"],
+ show_numbers=True,
+ number_functions=number_funcs,
+ show_line_position=True,
+ color=biotite.colors["brightorange"],
)
fig.tight_layout()
# sphinx_gallery_thumbnail_number = 2
@@ -199,22 +205,25 @@
# Plot base connections
ax.plot(*plot_coord.T, color="black", linewidth=1, zorder=1)
# Plot base pairings
-ax.add_collection(LineCollection(
- [(plot_coord[i], plot_coord[j]) for i, j in base_pairs],
- color="silver", linewidth=1, zorder=1
-))
+ax.add_collection(
+ LineCollection(
+ [(plot_coord[i], plot_coord[j]) for i, j in base_pairs],
+ color="silver",
+ linewidth=1,
+ zorder=1,
+ )
+)
# Plot base markers
ax.scatter(
*plot_coord.T,
- s = 12,
+ s=12,
# Render markers over lines
- zorder = 2,
- # Display base marker color based on the identity in the alignment
- color = ["forestgreen" if identity else "firebrick"
- for identity in identities]
+ zorder=2,
+ # Display base marker color based on the identity in the alignment
+ color=["forestgreen" if identity else "firebrick" for identity in identities],
)
ax.set_aspect("equal")
ax.axis("off")
fig.tight_layout()
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/sequence/homology/gpcr_evolution.py b/doc/examples/scripts/sequence/homology/gpcr_evolution.py
index 5ac3d7ea1..7072601c2 100644
--- a/doc/examples/scripts/sequence/homology/gpcr_evolution.py
+++ b/doc/examples/scripts/sequence/homology/gpcr_evolution.py
@@ -20,22 +20,23 @@
import re
import matplotlib.pyplot as plt
import networkx as nx
+import biotite.application.clustalo as clustalo
+import biotite.database.uniprot as uniprot
import biotite.sequence as seq
import biotite.sequence.align as align
-import biotite.sequence.phylo as phylo
import biotite.sequence.io.fasta as fasta
-import biotite.database.uniprot as uniprot
-import biotite.application.clustalo as clustalo
-
+import biotite.sequence.phylo as phylo
# The bovine GPCRs are investigated
SPECIES = "Bovine"
query = (
- uniprot.SimpleQuery("reviewed", "true") &
+ uniprot.SimpleQuery("reviewed", "true")
+ &
# Bovine proteins
- uniprot.SimpleQuery("organism_name", "Bos taurus") &
+ uniprot.SimpleQuery("organism_name", "Bos taurus")
+ &
# Keyword ID for GPCRs
uniprot.SimpleQuery("keyword", "KW-0297")
)
@@ -62,13 +63,11 @@
# The distance measure required for the tree calculation is the
# percentage of non-identical amino acids in the respective two
# sequences
-distances = 1 - align.get_pairwise_sequence_identity(
- alignment, mode="shortest"
-)
+distances = 1 - align.get_pairwise_sequence_identity(alignment, mode="shortest")
# Create tree via neighbor joining
tree = phylo.neighbor_joining(distances)
# Convert to NetworkX graph
-#For the graph visualization, the edge directions are unnecessary
+# For the graph visualization, the edge directions are unnecessary
graph = tree.as_graph().to_undirected()
fig = plt.figure(figsize=(8.0, 8.0))
@@ -78,15 +77,17 @@
pos = nx.kamada_kawai_layout(graph)
# Assign the gene names to the nodes that represent a reference index
node_labels = {i: name for i, name in enumerate(genes)}
-nx.draw_networkx_edges(
- graph, pos, ax=ax
-)
+nx.draw_networkx_edges(graph, pos, ax=ax)
nx.draw_networkx_labels(
- graph, pos, ax=ax, labels=node_labels, font_size=7,
+ graph,
+ pos,
+ ax=ax,
+ labels=node_labels,
+ font_size=7,
# Draw a white background behind the labeled nodes
# for better readability
- bbox=dict(pad=0, color="white")
+ bbox=dict(pad=0, color="white"),
)
fig.tight_layout()
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/sequence/homology/hcn_hydropathy.py b/doc/examples/scripts/sequence/homology/hcn_hydropathy.py
index 637959a52..ff879afca 100644
--- a/doc/examples/scripts/sequence/homology/hcn_hydropathy.py
+++ b/doc/examples/scripts/sequence/homology/hcn_hydropathy.py
@@ -16,17 +16,17 @@
# Code source: Patrick Kunzmann
# License: BSD 3 clause
-import numpy as np
import matplotlib.pyplot as plt
+import numpy as np
from matplotlib.patches import Patch
import biotite
+import biotite.application.mafft as mafft
import biotite.database.entrez as entrez
import biotite.sequence as seq
import biotite.sequence.align as align
import biotite.sequence.graphics as graphics
import biotite.sequence.io.fasta as fasta
import biotite.sequence.io.genbank as gb
-import biotite.application.mafft as mafft
# Taken from
# Kyte, J and Doolittle, RF.
@@ -35,37 +35,39 @@
# Journal of Molecular Biology (2015). 157(1):105–32.
# doi:10.1016/0022-2836(82)90515-0
hydropathy_dict = {
- "I" : 4.5,
- "V" : 4.2,
- "L" : 3.8,
- "F" : 2.8,
- "C" : 2.5,
- "M" : 1.9,
- "A" : 1.8,
- "G" : -0.4,
- "T" : -0.7,
- "S" : -0.8,
- "W" : -0.9,
- "Y" : -1.3,
- "P" : -1.6,
- "H" : -3.2,
- "E" : -3.5,
- "Q" : -3.5,
- "D" : -3.5,
- "N" : -3.5,
- "K" : -3.9,
- "R" : -4.5
+ "I": 4.5,
+ "V": 4.2,
+ "L": 3.8,
+ "F": 2.8,
+ "C": 2.5,
+ "M": 1.9,
+ "A": 1.8,
+ "G": -0.4,
+ "T": -0.7,
+ "S": -0.8,
+ "W": -0.9,
+ "Y": -1.3,
+ "P": -1.6,
+ "H": -3.2,
+ "E": -3.5,
+ "Q": -3.5,
+ "D": -3.5,
+ "N": -3.5,
+ "K": -3.9,
+ "R": -4.5,
}
-# Look for the Swiss-Prot entry contaning the human HCN1 channel
+# Look for the Swiss-Prot entry containing the human HCN1 channel
-query = entrez.SimpleQuery("HCN1", "Gene Name") \
- & entrez.SimpleQuery("homo sapiens", "Organism") \
- & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
+query = (
+ entrez.SimpleQuery("HCN1", "Gene Name")
+ & entrez.SimpleQuery("homo sapiens", "Organism")
+ & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
+)
uids = entrez.search(query, db_name="protein")
-gp_file = gb.GenBankFile.read(entrez.fetch(
- uids[0], None, "gp", db_name="protein", ret_type="gp"
-))
+gp_file = gb.GenBankFile.read(
+ entrez.fetch(uids[0], None, "gp", db_name="protein", ret_type="gp")
+)
hcn1 = seq.ProteinSequence(gb.get_sequence(gp_file, format="gp"))
print(hcn1)
@@ -75,13 +77,15 @@
hydropathies = np.array([hydropathy_dict[symbol] for symbol in hcn1])
+
def moving_average(data_set, window_size):
- weights = np.full(window_size, 1/window_size)
- return np.convolve(data_set, weights, mode='valid')
+ weights = np.full(window_size, 1 / window_size)
+ return np.convolve(data_set, weights, mode="valid")
+
# Apply moving average over 15 amino acids for clearer visualization
ma_radius = 7
-hydropathies = moving_average(hydropathies, 2*ma_radius+1)
+hydropathies = moving_average(hydropathies, 2 * ma_radius + 1)
########################################################################
# In order to assess the positional conservation, the sequences
@@ -91,14 +95,16 @@ def moving_average(data_set, window_size):
uids = []
for name in names:
- query = entrez.SimpleQuery(name, "Gene Name") \
- & entrez.SimpleQuery("homo sapiens", "Organism") \
- & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
+ query = (
+ entrez.SimpleQuery(name, "Gene Name")
+ & entrez.SimpleQuery("homo sapiens", "Organism")
+ & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
+ )
uids += entrez.search(query, db_name="protein")
-fasta_file = fasta.FastaFile.read(entrez.fetch_single_file(
- uids, None, db_name="protein", ret_type="fasta"
-))
+fasta_file = fasta.FastaFile.read(
+ entrez.fetch_single_file(uids, None, db_name="protein", ret_type="fasta")
+)
for header in fasta_file:
print(header)
@@ -121,8 +127,8 @@ def moving_average(data_set, window_size):
scores = np.zeros(len(hcn1))
for i in range(len(alignment)):
# The column is also an alignment with length 1
- column = alignment[i:i+1]
- hcn1_index = column.trace[0,0]
+ column = alignment[i : i + 1]
+ hcn1_index = column.trace[0, 0]
if hcn1_index == -1:
# Gap in HCN1 row
# As similarity score should be analyzed in dependence of the
@@ -131,7 +137,7 @@ def moving_average(data_set, window_size):
continue
scores[hcn1_index] = align.score(column, matrix, gap_penalty=-5)
-scores = moving_average(scores, 2*ma_radius+1)
+scores = moving_average(scores, 2 * ma_radius + 1)
########################################################################
# Now the hydropathy and the similarity score can be plotted.
@@ -141,11 +147,12 @@ def moving_average(data_set, window_size):
# Plot hydropathy
ax.plot(
- np.arange(1+ma_radius, len(hcn1)-ma_radius+1), hydropathies,
- color=biotite.colors["dimorange"]
+ np.arange(1 + ma_radius, len(hcn1) - ma_radius + 1),
+ hydropathies,
+ color=biotite.colors["dimorange"],
)
ax.axhline(0, color="gray", linewidth=0.5)
-ax.set_xlim(1, len(hcn1)+1)
+ax.set_xlim(1, len(hcn1) + 1)
ax.set_xlabel("HCN1 sequence position")
ax.set_ylabel("Hydropathy (15 residues moving average)")
@@ -153,8 +160,11 @@ def moving_average(data_set, window_size):
# with hydropathy plot
annotation = gb.get_annotation(gp_file, include_only=["Region"])
transmembrane_annotation = seq.Annotation(
- [feature for feature in annotation
- if feature.qual["region_name"] == "Transmembrane region"]
+ [
+ feature
+ for feature in annotation
+ if feature.qual["region_name"] == "Transmembrane region"
+ ]
)
for feature in transmembrane_annotation:
first, last = feature.get_location_range()
@@ -163,17 +173,18 @@ def moving_average(data_set, window_size):
# Plot similarity score as measure for conservation
ax2 = ax.twinx()
ax2.plot(
- np.arange(1+ma_radius, len(hcn1)-ma_radius+1), scores,
- color=biotite.colors["brightorange"]
+ np.arange(1 + ma_radius, len(hcn1) - ma_radius + 1),
+ scores,
+ color=biotite.colors["brightorange"],
)
ax2.set_ylabel("Similarity score (15 residues moving average)")
ax.legend(
handles=[
- Patch(color=biotite.colors["dimorange"], label="Hydropathy"),
- Patch(color=biotite.colors["brightorange"], label="Score" )
+ Patch(color=biotite.colors["dimorange"], label="Hydropathy"),
+ Patch(color=biotite.colors["brightorange"], label="Score"),
],
- fontsize=9
+ fontsize=9,
)
########################################################################
@@ -190,17 +201,20 @@ def moving_average(data_set, window_size):
# values as input.
# Hydrophilic amino acids are depicted in blue, hydrophobic ones in red.
+
def hydropathy_to_color(hydropathy, colormap):
# Normalize hydropathy to range between 0 and 1
# (orginally between -4.5 and 4.5)
norm_hydropathy = (hydropathy - (-4.5)) / (4.5 - (-4.5))
return colormap(norm_hydropathy)
+
# Create a color scheme highlighting the hydropathy
colormap = plt.get_cmap("coolwarm")
colorscheme = [
hydropathy_to_color(hydropathy_dict[symbol], colormap)
- if symbol in hydropathy_dict else None
+ if symbol in hydropathy_dict
+ else None
for symbol in sequences[0].get_alphabet()
]
@@ -210,8 +224,7 @@ def hydropathy_to_color(hydropathy, colormap):
ax = fig.add_subplot(111)
# Color the symbols instead of the background
graphics.plot_alignment_type_based(
- ax, alignment[:600], labels=names, show_numbers=True,
- color_scheme=colorscheme
+ ax, alignment[:600], labels=names, show_numbers=True, color_scheme=colorscheme
)
plt.show()
diff --git a/doc/examples/scripts/sequence/homology/hcn_similarity.py b/doc/examples/scripts/sequence/homology/hcn_similarity.py
index f81c55ee5..961abcd07 100644
--- a/doc/examples/scripts/sequence/homology/hcn_similarity.py
+++ b/doc/examples/scripts/sequence/homology/hcn_similarity.py
@@ -15,32 +15,31 @@
# Code source: Daniel Bauer
# License: BSD 3 clause
-import biotite.sequence.io.fasta as fasta
+import matplotlib.pyplot as plt
+import biotite.application.clustalo as clustalo
import biotite.database.entrez as entrez
import biotite.sequence as seq
-import biotite.application.clustalo as clustalo
import biotite.sequence.align as align
-import biotite.sequence.phylo as phylo
-import matplotlib.pyplot as plt
import biotite.sequence.graphics as graphics
-
+import biotite.sequence.io.fasta as fasta
+import biotite.sequence.phylo as phylo
UNIPROT_IDS = dict(
- hHCN1 = "O60741",
- hHCN2 = "Q9UL51",
- hHCN3 = "Q9P1Z3",
- hHCN4 = "Q9Y3Q4",
- spHCN = "O76977",
- hEAG1 = "O95259",
- hERG1 = "Q12809",
- KAT1 = "Q39128",
+ hHCN1="O60741",
+ hHCN2="Q9UL51",
+ hHCN3="Q9P1Z3",
+ hHCN4="Q9Y3Q4",
+ spHCN="O76977",
+ hEAG1="O95259",
+ hERG1="Q12809",
+ KAT1="Q39128",
)
### fetch sequences for UniProt IDs from NCBI Entrez
-fasta_file = fasta.FastaFile.read(entrez.fetch_single_file(
- list(UNIPROT_IDS.values()), None, "protein", "fasta"
-))
+fasta_file = fasta.FastaFile.read(
+ entrez.fetch_single_file(list(UNIPROT_IDS.values()), None, "protein", "fasta")
+)
sequences = {
name: seq.ProteinSequence(seq_str)
for name, seq_str in zip(UNIPROT_IDS.keys(), fasta_file.values())
@@ -50,42 +49,44 @@
# create MSA
alignment = clustalo.ClustalOmegaApp.align(list(sequences.values()))
# build simple tree based on deviation from sequence identity
-distances = 1 - align.get_pairwise_sequence_identity(
- alignment, mode="shortest"
-)
+distances = 1 - align.get_pairwise_sequence_identity(alignment, mode="shortest")
tree = phylo.upgma(distances)
### plot the tree
fig, ax = plt.subplots(1, 1, figsize=(8, 5))
graphics.plot_dendrogram(
- ax, tree, orientation="left", labels=list(UNIPROT_IDS.keys()),
- show_distance=False, linewidth=2
- )
+ ax,
+ tree,
+ orientation="left",
+ labels=list(UNIPROT_IDS.keys()),
+ show_distance=False,
+ linewidth=2,
+)
ax.grid(False)
ax.set_xticks([])
# distance indicator
indicator_len = 0.1
indicator_start = (
- ax.get_xlim()[0] + ax.get_xlim()[1]*0.02,
- ax.get_ylim()[1] - ax.get_ylim()[1]*0.15
-)
-indicator_stop = (
- indicator_start[0] + indicator_len,
- indicator_start[1]
+ ax.get_xlim()[0] + ax.get_xlim()[1] * 0.02,
+ ax.get_ylim()[1] - ax.get_ylim()[1] * 0.15,
)
+indicator_stop = (indicator_start[0] + indicator_len, indicator_start[1])
indicator_center = (
- (indicator_start[0] + indicator_stop[0])/2,
- (indicator_start[1] + 0.25)
+ (indicator_start[0] + indicator_stop[0]) / 2,
+ (indicator_start[1] + 0.25),
)
ax.annotate(
- "", xy=indicator_start, xytext=indicator_stop, xycoords="data",
- textcoords="data", arrowprops={"arrowstyle": "|-|", "linewidth": 2}
+ "",
+ xy=indicator_start,
+ xytext=indicator_stop,
+ xycoords="data",
+ textcoords="data",
+ arrowprops={"arrowstyle": "|-|", "linewidth": 2},
)
ax.annotate(
- f"{int(indicator_len * 100)} %", xy=indicator_center,
- ha="center", va="center"
+ f"{int(indicator_len * 100)} %", xy=indicator_center, ha="center", va="center"
)
ax.set_title("Sequence deviation of HCN to other CNG superfamily channels")
diff --git a/doc/examples/scripts/sequence/homology/homolog_msa.py b/doc/examples/scripts/sequence/homology/homolog_msa.py
index c186bd222..f3b91dd65 100644
--- a/doc/examples/scripts/sequence/homology/homolog_msa.py
+++ b/doc/examples/scripts/sequence/homology/homolog_msa.py
@@ -10,13 +10,12 @@
# Code source: Patrick Kunzmann
# License: BSD 3 cl
from tempfile import gettempdir
-import biotite.sequence as seq
-import biotite.sequence.io.fasta as fasta
-import biotite.sequence.graphics as graphics
-import biotite.application.muscle as muscle
+import matplotlib.pyplot as plt
import biotite.application.blast as blast
+import biotite.application.muscle as muscle
import biotite.database.entrez as entrez
-import matplotlib.pyplot as plt
+import biotite.sequence.graphics as graphics
+import biotite.sequence.io.fasta as fasta
# Download sequence of Streptococcus pyogenes Cas9
file_name = entrez.fetch("Q99ZW2", gettempdir(), "fa", "protein", "fasta")
@@ -49,7 +48,7 @@
print("MSA results:")
gapped_seqs = alignment.get_gapped_sequences()
for i in range(len(gapped_seqs)):
- print(hits[i], " "*3, gapped_seqs[i])
+ print(hits[i], " " * 3, gapped_seqs[i])
# Visualize the first 200 columns of the alignment
# Reorder alignments to reflect sequence distance
@@ -58,9 +57,11 @@
ax = fig.add_subplot(111)
order = app.get_alignment_order()
graphics.plot_alignment_type_based(
- ax, alignment[:200, order.tolist()], labels=[hits[i] for i in order],
- show_numbers=True
+ ax,
+ alignment[:200, order.tolist()],
+ labels=[hits[i] for i in order],
+ show_numbers=True,
)
fig.tight_layout()
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/sequence/homology/lexa_conservation.py b/doc/examples/scripts/sequence/homology/lexa_conservation.py
index 104fe9fd4..957db2f6a 100644
--- a/doc/examples/scripts/sequence/homology/lexa_conservation.py
+++ b/doc/examples/scripts/sequence/homology/lexa_conservation.py
@@ -21,24 +21,22 @@
# Code source: Patrick Kunzmann
# License: BSD 3 clause
-import numpy as np
import matplotlib.pyplot as plt
-import biotite.sequence as seq
-import biotite.sequence.io.fasta as fasta
-import biotite.sequence.io.genbank as gb
-import biotite.sequence.graphics as graphics
import biotite.application.clustalo as clustalo
import biotite.database.entrez as entrez
+import biotite.sequence as seq
+import biotite.sequence.graphics as graphics
+import biotite.sequence.io.genbank as gb
+
# Search for protein products of LexA gene in UniProtKB/Swiss-Prot database
-query = entrez.SimpleQuery("lexA", "Gene Name") \
- & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
+query = entrez.SimpleQuery("lexA", "Gene Name") & entrez.SimpleQuery(
+ "srcdb_swiss-prot", "Properties"
+)
# Search for the first 200 hits
# More than 200 UIDs are not recommended for the EFetch service
# for a single fetch
uids = entrez.search(query, db_name="protein", number=200)
-file = entrez.fetch_single_file(
- uids, None, db_name="protein", ret_type="gp"
-)
+file = entrez.fetch_single_file(uids, None, db_name="protein", ret_type="gp")
# The file contains multiple concatenated GenPept files
# -> Usage of MultiFile
multi_file = gb.MultiFile.read(file)
@@ -57,12 +55,14 @@
# on. Therefore, we write a function that creates a proper abbreviation
# for a species name.
+
def abbreviate(species):
# Remove possible brackets
- species = species.replace("[","").replace("]","")
- splitted_species= species.split()
+ species = species.replace("[", "").replace("]", "")
+ splitted_species = species.split()
return "{:}. {:}".format(splitted_species[0][0], splitted_species[1])
+
print("Sources:")
all_sources = [abbreviate(gb.get_source(file)) for file in files]
for source in all_sources[:20]:
@@ -97,16 +97,16 @@ def abbreviate(species):
# Ignore already listed species
continue
bind_feature = None
- annot_seq = gb.get_annotated_sequence(
- file, include_only=["Site"], format="gp"
- )
+ annot_seq = gb.get_annotated_sequence(file, include_only=["Site"], format="gp")
# Find the feature for DNA-binding site
for feature in annot_seq.annotation:
# DNA binding site is a helix-turn-helix motif
- if "site_type" in feature.qual \
- and feature.qual["site_type"] == "DNA binding" \
- and "H-T-H motif" in feature.qual["note"]:
- bind_feature = feature
+ if (
+ "site_type" in feature.qual
+ and feature.qual["site_type"] == "DNA binding"
+ and "H-T-H motif" in feature.qual["note"]
+ ):
+ bind_feature = feature
if bind_feature is not None:
# If the feature is found,
# get the sequence slice that is defined by the feature...
@@ -128,10 +128,10 @@ def abbreviate(species):
fig = plt.figure(figsize=(4.5, 4.0))
ax = fig.add_subplot(111)
graphics.plot_alignment_similarity_based(
- ax, alignment[:,:20], labels=sources[:20], symbols_per_line=len(alignment)
+ ax, alignment[:, :20], labels=sources[:20], symbols_per_line=len(alignment)
)
# Source names in italic
-ax.set_yticklabels(ax.get_yticklabels(), fontdict={"fontstyle":"italic"})
+ax.set_yticklabels(ax.get_yticklabels(), fontdict={"fontstyle": "italic"})
fig.tight_layout()
########################################################################
@@ -145,7 +145,7 @@ def abbreviate(species):
fig = plt.figure(figsize=(8.0, 3.0))
ax = fig.add_subplot(111)
graphics.plot_sequence_logo(ax, profile, scheme="flower")
-ax.set_xticks([5,10,15,20])
+ax.set_xticks([5, 10, 15, 20])
ax.set_xlabel("Residue position")
ax.set_ylabel("Bits")
# Only show left and bottom spine
@@ -154,4 +154,4 @@ def abbreviate(species):
fig.tight_layout()
# sphinx_gallery_thumbnail_number = 2
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/sequence/homology/luxa_comparison.py b/doc/examples/scripts/sequence/homology/luxa_comparison.py
index 080531860..eda03243a 100644
--- a/doc/examples/scripts/sequence/homology/luxa_comparison.py
+++ b/doc/examples/scripts/sequence/homology/luxa_comparison.py
@@ -12,22 +12,21 @@
# Code source: Patrick Kunzmann
# License: BSD 3 clause
-import numpy as np
import matplotlib.pyplot as plt
+import biotite.database.entrez as entrez
import biotite.sequence as seq
-import biotite.sequence.io.fasta as fasta
import biotite.sequence.align as align
import biotite.sequence.graphics as graphics
-import biotite.database.entrez as entrez
-
+import biotite.sequence.io.fasta as fasta
# Search for protein products of LexA gene in UniProtKB/Swiss-Prot database
-query = entrez.SimpleQuery("luxA", "Gene Name") \
- & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
+query = entrez.SimpleQuery("luxA", "Gene Name") & entrez.SimpleQuery(
+ "srcdb_swiss-prot", "Properties"
+)
uids = entrez.search(query, db_name="protein")
-fasta_file = fasta.FastaFile.read(entrez.fetch_single_file(
- uids, None, db_name="protein", ret_type="fasta"
-))
+fasta_file = fasta.FastaFile.read(
+ entrez.fetch_single_file(uids, None, db_name="protein", ret_type="fasta")
+)
ids = []
sequences = []
@@ -39,7 +38,7 @@
matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment, order, tree, distances = align.align_multiple(
- sequences, matrix, gap_penalty=(-10,-1), terminal_penalty=False
+ sequences, matrix, gap_penalty=(-10, -1), terminal_penalty=False
)
# Order alignment according to the guide tree
alignment = alignment[:, order]
@@ -48,9 +47,8 @@
fig = plt.figure(figsize=(8.0, 20.0))
ax = fig.add_subplot(111)
graphics.plot_alignment_type_based(
- ax, alignment, labels=ids, show_numbers=True, spacing=2.0,
- color_scheme="blossom"
+ ax, alignment, labels=ids, show_numbers=True, spacing=2.0, color_scheme="blossom"
)
fig.tight_layout()
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/sequence/homology/pi3k_alignment.py b/doc/examples/scripts/sequence/homology/pi3k_alignment.py
index e705566eb..4f745876b 100644
--- a/doc/examples/scripts/sequence/homology/pi3k_alignment.py
+++ b/doc/examples/scripts/sequence/homology/pi3k_alignment.py
@@ -16,23 +16,23 @@
# License: BSD 3 clause
import warnings
-import numpy as np
import matplotlib.pyplot as plt
+import numpy as np
import biotite
+import biotite.application.clustalo as clustalo
import biotite.database.entrez as entrez
import biotite.sequence as seq
import biotite.sequence.align as align
import biotite.sequence.graphics as graphics
import biotite.sequence.io.fasta as fasta
-import biotite.application.clustalo as clustalo
-uids = ["5JHB_A", "5LUQ_A", "5FLC_B", "5YZ0_A", "5NP0_A", "4FUL_A"]
-names = ["PI3K", "DNA-PKcs", "mTOR", "ATR", "ATM", "hSMG-1"]
+uids = ["5JHB_A", "5LUQ_A", "5FLC_B", "5YZ0_A", "5NP0_A", "4FUL_A"]
+names = ["PI3K", "DNA-PKcs", "mTOR", "ATR", "ATM", "hSMG-1"]
sequences = []
-file = fasta.FastaFile.read(entrez.fetch_single_file(
- uids, None, db_name="protein", ret_type="fasta"
-))
+file = fasta.FastaFile.read(
+ entrez.fetch_single_file(uids, None, db_name="protein", ret_type="fasta")
+)
for header, seq_str in file.items():
sequences.append(seq.ProteinSequence(seq_str))
@@ -47,25 +47,27 @@
# Like the :class:`LetterSimilarityPlotter` we will use the
# *average normalized similarity* as measure.
+
def get_average_normalized_similarity(trace_code, matrix, seq_i, pos_i):
- code1 = trace_code[seq_i, pos_i]
- if code1 == -1:
- return np.nan
- similarities = np.zeros(trace_code.shape[0])
- for i in range(trace_code.shape[0]):
- code2 = trace_code[i, pos_i]
- if code2 == -1:
- similarities[i] = 0
- else:
- sim = matrix[code1, code2]
- # Normalize (range 0.0 - 1.0)
- min_sim = np.min(matrix[code1])
- max_sim = np.max(matrix[code1])
- sim = (sim - min_sim) / (max_sim - min_sim)
- similarities[i] = sim
- # Delete self-similarity
- similarities = np.delete(similarities, seq_i)
- return np.average(similarities)
+ code1 = trace_code[seq_i, pos_i]
+ if code1 == -1:
+ return np.nan
+ similarities = np.zeros(trace_code.shape[0])
+ for i in range(trace_code.shape[0]):
+ code2 = trace_code[i, pos_i]
+ if code2 == -1:
+ similarities[i] = 0
+ else:
+ sim = matrix[code1, code2]
+ # Normalize (range 0.0 - 1.0)
+ min_sim = np.min(matrix[code1])
+ max_sim = np.max(matrix[code1])
+ sim = (sim - min_sim) / (max_sim - min_sim)
+ similarities[i] = sim
+ # Delete self-similarity
+ similarities = np.delete(similarities, seq_i)
+ return np.average(similarities)
+
matrix = align.SubstitutionMatrix.std_protein_matrix()
# Get the alignment columns as symbols codes (-1 for gaps)
@@ -73,15 +75,13 @@ def get_average_normalized_similarity(trace_code, matrix, seq_i, pos_i):
similarities = np.zeros(trace_code.shape)
for i in range(similarities.shape[0]):
for j in range(similarities.shape[1]):
- similarities[i,j] = get_average_normalized_similarity(
+ similarities[i, j] = get_average_normalized_similarity(
trace_code, matrix.score_matrix(), i, j
)
figure = plt.figure(figsize=(8.0, 3.0))
ax = figure.add_subplot(111)
-heatmap = ax.pcolor(
- similarities, cmap="RdYlGn", vmin=0.0, vmax=1.0
-)
+heatmap = ax.pcolor(similarities, cmap="RdYlGn", vmin=0.0, vmax=1.0)
cbar = figure.colorbar(heatmap)
figure.tight_layout()
@@ -93,16 +93,19 @@ def get_average_normalized_similarity(trace_code, matrix, seq_i, pos_i):
# Hence, we create bins, that contain the mean similarity over a range of
# columns.
+
def calculate_bins(similarities, bin_count):
edges = np.linspace(0, similarities.shape[1], bin_count, dtype=int)
edges = np.append(edges, similarities.shape[1])
binned_similarities = np.zeros(similarities.shape)
for i in range(similarities.shape[0]):
for j in range(len(edges) - 1):
- binned_similarities[i, edges[j]:edges[j+1]] = \
- np.nanmean(similarities[i, edges[j]:edges[j+1]])
+ binned_similarities[i, edges[j] : edges[j + 1]] = np.nanmean(
+ similarities[i, edges[j] : edges[j + 1]]
+ )
return binned_similarities
+
with warnings.catch_warnings():
# Catch warnings about empty slice for gap-only parts
warnings.simplefilter("ignore")
@@ -110,9 +113,7 @@ def calculate_bins(similarities, bin_count):
figure = plt.figure(figsize=(8.0, 3.0))
ax = figure.add_subplot(111)
-heatmap = ax.pcolor(
- similarities, cmap="RdYlGn", vmin=0.0, vmax=1.0
-)
+heatmap = ax.pcolor(similarities, cmap="RdYlGn", vmin=0.0, vmax=1.0)
cbar = figure.colorbar(heatmap)
# Furthermore, add some labels to the figure
cbar.set_label("Average normalized similarity")
@@ -136,14 +137,14 @@ def calculate_bins(similarities, bin_count):
# From beginning of the sequence...
for i in range(len(trace)):
# Check if all sequences have no gap at the given position
- if trace[i,0] != -1:
+ if trace[i, 0] != -1:
start_index = i
break
# ...and the end of the sequence
-for i in range(len(trace)-1, -1, -1):
+for i in range(len(trace) - 1, -1, -1):
# Check if all sequences have no gap at the given position
- if trace[i,0] != -1:
- stop_index = i+1
+ if trace[i, 0] != -1:
+ stop_index = i + 1
break
# Truncate alignment to region where the 'PI3K' sequence exists
@@ -155,11 +156,17 @@ def calculate_bins(similarities, bin_count):
# The alignment is quite long
# -> Reduce font size to reduce figure size
graphics.plot_alignment_similarity_based(
- ax, alignment, matrix=matrix, symbols_per_line=80, labels=names,
+ ax,
+ alignment,
+ matrix=matrix,
+ symbols_per_line=80,
+ labels=names,
show_numbers=True,
- label_size=10, number_size=10, symbol_size=6,
- color=biotite.colors["orange"]
+ label_size=10,
+ number_size=10,
+ symbol_size=6,
+ color=biotite.colors["orange"],
)
fig.tight_layout()
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/sequence/homology/plotepiscan.py b/doc/examples/scripts/sequence/homology/plotepiscan.py
index 87a6e1c6d..140f078ca 100644
--- a/doc/examples/scripts/sequence/homology/plotepiscan.py
+++ b/doc/examples/scripts/sequence/homology/plotepiscan.py
@@ -3,25 +3,25 @@
==========================================================
Peptide arrays can be used as a high-throughput platform for screening
-biological interactions. Typical screenings involve the immobilization
-of diverse peptides on a solid surface to study their interactions with
-various target molecules. Specifically, arrays of peptides with
-overlapping sequences can be used to identify the epitope of antibodies
+biological interactions. Typical screenings involve the immobilization
+of diverse peptides on a solid surface to study their interactions with
+various target molecules. Specifically, arrays of peptides with
+overlapping sequences can be used to identify the epitope of antibodies
on a protein antigen at amino acid level.
General scannings for molecular recognition using peptide arrays
-are particlularly useful for epitope identification on monoclonal
-antibodies. This example visualizes the data from two epitope mapping
+are particularly useful for epitope identification on monoclonal
+antibodies. This example visualizes the data from two epitope mapping
studies, using a color coded sequence alignment representation
-of the antigens screened. The scannings interrogated a monoclonal
+of the antigens screened. The scannings interrogated a monoclonal
antibody (MAb) against two arrays of overlaping peptides :footcite:`Iyamu2023`.
The files containing peptide array data can be downloaded
:download:`here `
-and
+and
:download:`here `.
The antigens screened span the extracellular domain of VAR2CSA, a
virulence factor of *Plasmodiun falciparum* for the strains FCR3
-(residues 1-2659) and NF54 (residues 1-2652). The sequence of
+(residues 1-2659) and NF54 (residues 1-2652). The sequence of
the two domains can be downloaded
:download:`here `.
@@ -54,53 +54,55 @@
# Get BLOSUM62 matrix
matrix = align.SubstitutionMatrix.std_protein_matrix()
# Perform pairwise sequence alignment
-alignments = align.align_optimal(fcr3_seq, nf54_seq, matrix,
- gap_penalty = (-10, -1),
- terminal_penalty = False)
+alignments = align.align_optimal(
+ fcr3_seq, nf54_seq, matrix, gap_penalty=(-10, -1), terminal_penalty=False
+)
alignment = alignments[0]
print(alignment)
########################################################################
-# Epitope mapping data
+# Epitope mapping data
# --------------------
#
# This study used arrays of overlaping peptides to achive high acurracy
-# in mapping the epitope. Both FCR3 and NF54 arrays, consisted of
+# in mapping the epitope. Both FCR3 and NF54 arrays, consisted of
# 20-mer peptides with an overlap of 19 and 18 amino acids respectively.
# Arbitrary units (AU) of fluorescence intensity quantified the antibody
-# recognition for each peptide.
-# Our goal is to decorate the aligment, with the fluorescence intensity
-# scores of each peptide in the arrays. We used a
-# color code from red to white for high to low intensity, respectively.
-# The background color of the symbols on the aligment corresponds to the
+# recognition for each peptide.
+# Our goal is to decorate the alignment with the fluorescence intensity
+# scores of each peptide in the arrays. We used a
+# color code from red to white for high to low intensity, respectively.
+# The background color of the symbols on the alignment corresponds to the
# score for the 20th amino acid at the end of the peptide.
#
-# Lets create a function that maps the peptide score to the 20th residue
+# Let's create a function that maps the peptide score to the 20th residue
# of the peptide:
+
def read_scan(filename, pep_len=20, score_res=20):
- if not type(pep_len) is int:
+ if type(pep_len) is not int:
raise TypeError("pep_len : only integers are allowed")
- elif not type(score_res) is int:
- raise TypeError("score_res : only integers are allowed")
+ elif type(score_res) is not int:
+ raise TypeError("score_res : only integers are allowed")
elif pep_len < score_res:
raise Exception("score_res can't be higher than pep_len")
-
- elif pep_len != 20 or score_res != 20:
- s = (score_res) - pep_len -1
+
+ elif pep_len != 20 or score_res != 20:
+ s = (score_res) - pep_len - 1
else:
- s =-1
+ s = -1
- df= pd.read_csv(filename)
- scor_res = df['Seq'].str[s]
- df['s_res'] = scor_res
+ df = pd.read_csv(filename)
+ scor_res = df["Seq"].str[s]
+ df["s_res"] = scor_res
return df
+
# Load epitope scan data
-fcr3_file_path = "../../../download/FCR3_10ug.csv"
-nf54_file_path = "../../../download/NF54_10ug.csv"
+fcr3_file_path = "../../../download/FCR3_10ug.csv"
+nf54_file_path = "../../../download/NF54_10ug.csv"
# Define the score residues on the arrays
files = [fcr3_file_path, nf54_file_path]
@@ -114,66 +116,70 @@ def read_scan(filename, pep_len=20, score_res=20):
ag1_scan.head(5)
########################################################################
-# The microarrays contained each peptide printed in duplicated spots. We
-# need to combine the values of those experimental replicates into a
+# The microarrays contained each peptide printed in duplicated spots. We
+# need to combine the values of those experimental replicates into a
# unique score for each peptide. Typically, this unique value could come
# from the geometric mean between replicates that do not deviate wildly.
-# If the average deviation between replicates is high, one can assumme
+# If the average deviation between replicates is high, one can assume
# that experimental errors should result in a lower score at a given spot.
-# It is easy to imagine that imperfections on the printing of the spot,
-# will rather decrease and not increase, the antibody recognition, in
-# which case the the peptide signal is better represented
+# It is easy to imagine that imperfections on the printing of the spot,
+# will rather decrease and not increase, the antibody recognition, in
+# which case the peptide signal is better represented
# by the higher score replicate.
#
-# Now lets write a function to combine the scores adding the flexibility
-# to choose cases for those criterias exposed above. We will flag with
-# 0 or 1 every peptide entry on the arrays: 1 if the deviation between
+# Now let's write a function to combine the scores adding the flexibility
+# to choose cases for those criteria described above. We will flag with
+# 0 or 1 every peptide entry on the arrays: 1 if the deviation between
# replicates is higher than 40%, otherwise 0.
-def combine_scores(dataframe, combine='max', flag_noisy=True):
- df= dataframe
+
+def combine_scores(dataframe, combine="max", flag_noisy=True):
+ df = dataframe
# mean
- df['ave'] = df.iloc[:,[1,2]].mean(axis = 1)
+ df["ave"] = df.iloc[:, [1, 2]].mean(axis=1)
# mean deviation
- df['avedev'] = ((df.r1 - df.ave).abs() + (df.r2 - df.ave).abs()) / 2
+ df["avedev"] = ((df.r1 - df.ave).abs() + (df.r2 - df.ave).abs()) / 2
# percent deviation between replicates
- df['dev_ratio'] = df.apply(lambda x:0
- if x.avedev==0 else x.avedev/x.ave, axis=1)
-
+ df["dev_ratio"] = df.apply(
+ lambda x: 0 if x.avedev == 0 else x.avedev / x.ave, axis=1
+ )
+
# signal value:
- if combine == 'max':
- df['comb_signal'] = df.apply(lambda x:max(x.r1, x.r2)
- if x.dev_ratio >=0.4 else x.ave, axis=1)
- elif combine == 'mean':
- df['comb_signal'] = df.apply(lambda x:x.ave
- if x.dev_ratio <= 0.4 else 0, axis=1)
-
+ if combine == "max":
+ df["comb_signal"] = df.apply(
+ lambda x: max(x.r1, x.r2) if x.dev_ratio >= 0.4 else x.ave, axis=1
+ )
+ elif combine == "mean":
+ df["comb_signal"] = df.apply(
+ lambda x: x.ave if x.dev_ratio <= 0.4 else 0, axis=1
+ )
+
if flag_noisy:
- df['flag'] = df.apply(lambda x:0
- if x.dev_ratio <= 0.4 else 1, axis=1)
+ df["flag"] = df.apply(lambda x: 0 if x.dev_ratio <= 0.4 else 1, axis=1)
return df
-# Make the corresponding signal equal the replicate with the higest
+
+# Make the corresponding signal equal the replicate with the highest
# score value.
-dfa = combine_scores(ag1_scan, combine = 'max', flag_noisy = True)
-dfb = combine_scores(ag2_scan, combine = 'max', flag_noisy = True)
+dfa = combine_scores(ag1_scan, combine="max", flag_noisy=True)
+dfb = combine_scores(ag2_scan, combine="max", flag_noisy=True)
dfa.head(5)
########################################################################
-# Many molecular recognition screening campaings e.g. epitope mapping
-# screenings follow a long-tailed data distribution. To properly
+# Many molecular recognition screening campaigns, e.g., epitope mapping
+# screenings follow a long-tailed data distribution. To properly
# represent such distribution one can normalize the date using linear or
# non-linear transformations on the combined score data.
+
def data_transform(dataframe, threshold=0):
df = dataframe
- # Option to set a "threshold" for the signal scores.
+ # Option to set a "threshold" for the signal scores.
t = threshold
- df['cubic'] = df.apply(lambda x: np.cbrt(max(0, x.comb_signal-t)),
- axis=1)
- df['signal_plot'] = df.apply(lambda x: x.cubic/df['cubic'].max(),
- axis=1)
+ df["cubic"] = df.apply(lambda x: np.cbrt(max(0, x.comb_signal - t)), axis=1)
+ df["signal_plot"] = df.apply(lambda x: x.cubic / df["cubic"].max(), axis=1)
+
# Normalize, using the power law with cubic exponent. No threshold
data_transform(dfa, threshold=0)
@@ -184,134 +190,136 @@ def data_transform(dataframe, threshold=0):
# -------------------------------------------------------------------------------
#
# So far, we have the peptide score data combined, normalized, and mapped
-# to a residue for each peptide.
+# to a residue for each peptide.
# Next, using the alignment trace as a template, we will match the signal
-# intensities associated to the score residues, to the position of each
+# intensities associated to the score residues, to the position of each
# symbol on the alignment, considering the gaps.
-# Get the trace for each sequence on the alignment:
+# Get the trace for each sequence on the alignment:
trace_a = align.get_symbols(alignment)[0]
trace_b = align.get_symbols(alignment)[1]
+
def gapped_seq(dataframe, seq_trace, p_len, overlap_step=1):
"""
- Generate a gapped sequence that relates peptide score data signal with a
- template alignment trace. The function returns a list of tuples representing
- the gapped sequence, where each tuple consists of a residue and its associated
- signal value.
+ Generate a gapped sequence that relates peptide score data signal with a
+ template alignment trace. The function returns a list of tuples representing
+ the gapped sequence, where each tuple consists of a residue and its associated
+ signal value.
Parameters
----------
- dataframe : DataFrame
- A *Pandas* dataframe containing columns for each peptide score data,
+ dataframe : DataFrame
+ A *Pandas* dataframe containing columns for each peptide score data,
and its designated score residue.
- seq_trace : list
+ seq_trace : list
The sequence trace obtained from the alignment.
- p_len : int
+ p_len : int
The length of each overlapping peptide.
overlap_step : int, optional
The step size for overlapping peptides.Default is 1.
Note:
-----
- The 'gapped' sequence may be shorter than the aligment trace if the alignment results
- in gaps at either end. Any remaining elements in the trace with 'None' values are
+ The 'gapped' sequence may be shorter than the alignment trace if the alignment results
+ in gaps at either end. Any remaining elements in the trace with 'None' values are
filled with tuples: ('None', 0).
"""
template = seq_trace
df = dataframe
- step = overlap_step
- gapped = list(zip(df.s_res , df.signal_plot))
- lk1 = df["s_res"].values.tolist()
- plen = p_len # peptide length
-
+ step = overlap_step
+ gapped = list(zip(df.s_res, df.signal_plot))
+ lk1 = df["s_res"].values.tolist()
+ plen = p_len # peptide length
+
if step == 1:
x, b = 0, 0
- c = 0 # cyclic counter up to the peptide length :20
- p = 0 # peptide counter
+ c = 0 # cyclic counter up to the peptide length :20
+ p = 0 # peptide counter
for b in range(len(lk1)):
for a in template[x:]:
- if c < plen-1 :
- if a==None:
- gapped.insert(x,(template[x],0))
- x=x+1
+ if c < plen - 1:
+ if a is None:
+ gapped.insert(x, (template[x], 0))
+ x = x + 1
elif a != lk1[b]:
- gapped.insert(x,(template[x],0))
- x=x+1
- c=c+1
- elif p==0:
- gapped.insert(x,(template[x],0))
- x=x+1
- c=c+1
+ gapped.insert(x, (template[x], 0))
+ x = x + 1
+ c = c + 1
+ elif p == 0:
+ gapped.insert(x, (template[x], 0))
+ x = x + 1
+ c = c + 1
else:
- x=x+1
- c=c+1
+ x = x + 1
+ c = c + 1
break
else:
- c = 0 # reset the counter
- p=p+1
- x=x+1
+ c = 0 # reset the counter
+ p = p + 1
+ x = x + 1
break
elif step == 2:
x, b = 0, 0
- c=0
- p=0
+ c = 0
+ p = 0
for b in range(len(lk1)):
for a in template[x:]:
- if c < plen-1 and p==0:
- if a==None:
- gapped.insert(x,(template[x],0))
- x=x+1
+ if c < plen - 1 and p == 0:
+ if a is None:
+ gapped.insert(x, (template[x], 0))
+ x = x + 1
else:
- gapped.insert(x,(template[x],0))
- x=x+1
- c=c+1
- elif p==0 :
- c = 0 # reset the counter
- p=p+1
- x=x+1
+ gapped.insert(x, (template[x], 0))
+ x = x + 1
+ c = c + 1
+ elif p == 0:
+ c = 0 # reset the counter
+ p = p + 1
+ x = x + 1
break
- if p!=0:
- if a==None and c == 0:
- gapped.insert(x,(template[x],0))
- x=x+1
- elif c % 2 == 0:
- if a==None:
- gapped.insert(x,(template[x],0))
- x=x+1
+ if p != 0:
+ if a is None and c == 0:
+ gapped.insert(x, (template[x], 0))
+ x = x + 1
+ elif c % 2 == 0:
+ if a is None:
+ gapped.insert(x, (template[x], 0))
+ x = x + 1
else:
- gapped.insert(x,(template[x],0))
- x=x+1
- c=c+1
- elif c % 2 != 0:
- if a==None:
- gapped.insert(x,(template[x],0))
- x=x+1
+ gapped.insert(x, (template[x], 0))
+ x = x + 1
+ c = c + 1
+ elif c % 2 != 0:
+ if a is None:
+ gapped.insert(x, (template[x], 0))
+ x = x + 1
elif a != lk1[b]:
- gapped.insert(x,(template[x],0))
- x=x+1
- c=c+1
- else:
- x=x+1
- c=c+1
+ gapped.insert(x, (template[x], 0))
+ x = x + 1
+ c = c + 1
+ else:
+ x = x + 1
+ c = c + 1
break
# For terminal gaps
- if len(gapped) < len(template) and template[len(gapped)+1]== None:
- gapped_tail=[]
- for n in range(len(template)-len(gapped)):
- gapped_tail.append(('None', 0))
+ if len(gapped) < len(template) and template[len(gapped) + 1] is None:
+ gapped_tail = []
+ for n in range(len(template) - len(gapped)):
+ gapped_tail.append(("None", 0))
gapped = gapped + gapped_tail
-
+
return gapped
+
# Let's use gapped_seq() to build the gapped sequences
# FCR3 array, overlap_step: 1 (pep = 20-mer with 19 overlap)
gapd_s1 = gapped_seq(dfa, trace_a, 20, 1)
# NF54 array, overlap_step: 2 (pep = 20-mer with 18 overlap)
-gapd_s2 = gapped_seq(dfb, trace_b, 20, 2)
+gapd_s2 = gapped_seq(dfb, trace_b, 20, 2)
# Checkpoint. Both gapped sequences must have the same length.
len(gapd_s1) == len(gapd_s2)
@@ -320,116 +328,133 @@ def gapped_seq(dataframe, seq_trace, p_len, overlap_step=1):
# Create a signal map
# -------------------
#
-# Now we will generate an object mapping the signal scores from two gapped
+# Now we will generate an object mapping the signal scores from two gapped
# sequences.
-def signal_map(gapped_seq1, gapped_seq2,):
+
+def signal_map(
+ gapped_seq1,
+ gapped_seq2,
+):
"""
Generate a mapping of signal scores from two gapped sequences.
This function takes two gapped sequences, `gapped_seq1` and
- `gapped_seq2`. Each sequence is represented as a list of tuples,
- with the first element being an amino acid symbol and the second
- element being a signal score. It extracts the signal scores from
- each sequence and creates a 2D array with two columns, where the
- first column contains signal scores from `gapped_seq1` and the
+ `gapped_seq2`. Each sequence is represented as a list of tuples,
+ with the first element being an amino acid symbol and the second
+ element being a signal score. It extracts the signal scores from
+ each sequence and creates a 2D array with two columns, where the
+ first column contains signal scores from `gapped_seq1` and the
second column contains signal scores from `gapped_seq2`.
Parameters:
-----------
gapped_seq1: list
The first gapped sequence.
- gapped_seq2: list
+ gapped_seq2: list
The second gapped sequence.
Returns:
--------
- numpy.ndarray: A 2D numpy array with two columns containing signal
+ numpy.ndarray: A 2D numpy array with two columns containing signal
scores extracted from `gapped_seq1` and `gapped_seq2`
respectively.
"""
gapd_s1 = gapped_seq1
gapd_s2 = gapped_seq2
- fl_score = np.zeros((len(gapd_s1),2))
-
+ fl_score = np.zeros((len(gapd_s1), 2))
+
for v1 in range(len(gapd_s1)):
- fl_score[v1,0] = gapd_s1[v1][1]
- fl_score[v1,1] = gapd_s2[v1][1]
-
+ fl_score[v1, 0] = gapd_s1[v1][1]
+ fl_score[v1, 1] = gapd_s2[v1][1]
+
return fl_score
+
score = signal_map(gapd_s1, gapd_s2)
########################################################################
-# Sequence alignment decorated with MAb recognition regions
+# Sequence alignment decorated with MAb recognition regions
# ---------------------------------------------------------
#
-# Now we can plot the sequence alignment using an :class:`ArrayPlotter`
-# instance that higlights sequence recognition regions at the positions
+# Now we can plot the sequence alignment using an :class:`ArrayPlotter`
+# instance that highlights sequence recognition regions at the positions
# of the respective score residue per alignment column.
-# To easily interpret the intensity-decorated alignment we will add a
-# colorbar scaled accordingly. The scale matches the transformation
+# To easily interpret the intensity-decorated alignment we will add a
+# colorbar scaled accordingly. The scale matches the transformation
# applied to the recognition signal recorded on the score ndarray.
#
-# Let's build a function to create a custom colorbar object. We will
-# specify the dataframes corresponding to the two antigens screened in
-# this example, the colormap, and the transformation to be
+# Let's build a function to create a custom colorbar object. We will
+# specify the dataframes corresponding to the two antigens screened in
+# this example, the colormap, and the transformation to be
# represented with the colorbar.
fig = plt.figure(figsize=(8.0, 15))
ax = fig.add_subplot(111)
graphics.plot_alignment_array(
- ax, alignments[0], fl_score=score, labels=["FCR3", "NF54"],
- show_numbers=True, symbols_per_line=80,
- show_line_position=True, label_size=10,
- number_size=10, symbol_size=6)
+ ax,
+ alignments[0],
+ fl_score=score,
+ labels=["FCR3", "NF54"],
+ show_numbers=True,
+ symbols_per_line=80,
+ show_line_position=True,
+ label_size=10,
+ number_size=10,
+ symbol_size=6,
+)
# Add the axes where the colorbar will reside:
-ax2 = fig.add_axes([0.13, 0.07, 0.8, 0.01])
+ax2 = fig.add_axes([0.13, 0.07, 0.8, 0.01])
ax2.set_frame_on(False)
-# Access the colormap of the relevant instace of ArrayPlotter:
+# Access the colormap of the relevant instance of ArrayPlotter:
colormap = graphics.ArrayPlotter(ax2, score).get_cmap()
-def draw_colorbar(axes, array1, array2, colormap,
- orient=None, title=None):
+
+def draw_colorbar(axes, array1, array2, colormap, orient=None, title=None):
df1 = array1
df2 = array2
cmp = colormap
ax = axes
orientation = orient
label = title
-
+
# custom Formtatter for tick labels on the colorbar
def fmt(x, pos):
- a, b = '{:.1e}'.format(x).split('e')
+ a, b = "{:.1e}".format(x).split("e")
b = int(b)
- return r'${}\cdot10^{{{}}}$'.format(a, b)
-
- vmiA = df1['comb_signal'].min()
- vmiB = df2['comb_signal'].min()
- vmxA = df1['comb_signal'].max()
- vmxB = df2['comb_signal'].max()
-
- # The normalization of this colormap needs to be consistent with the
- # data trasnformtion used earlier on this example. The "cubic" law:
- norm = mpl.colors.PowerNorm(gamma=0.33, vmin=min(vmiA,vmiB),
- vmax=max(vmxA,vmxB))
-
- fig = mpl.pyplot.figure()
- return fig.colorbar(mpl.cm.ScalarMappable(norm=norm, cmap=cmp),
- cax=ax, orientation=orientation, label=label,
- format=mpl.ticker.FuncFormatter(fmt))
-
-# Draw the colorbar
-cbar = draw_colorbar(ax2, dfa, dfb, colormap, orient='horizontal',
- title='Fluorescence Intensity [AU]')
+ return r"${}\cdot10^{{{}}}$".format(a, b)
+
+ vmiA = df1["comb_signal"].min()
+ vmiB = df2["comb_signal"].min()
+ vmxA = df1["comb_signal"].max()
+ vmxB = df2["comb_signal"].max()
+
+ # The normalization of this colormap needs to be consistent with the
+ # data transformation used earlier on this example. The "cubic" law:
+ norm = mpl.colors.PowerNorm(gamma=0.33, vmin=min(vmiA, vmiB), vmax=max(vmxA, vmxB))
+
+ fig = mpl.pyplot.figure()
+ return fig.colorbar(
+ mpl.cm.ScalarMappable(norm=norm, cmap=cmp),
+ cax=ax,
+ orientation=orientation,
+ label=label,
+ format=mpl.ticker.FuncFormatter(fmt),
+ )
+
+
+# Draw the colorbar
+cbar = draw_colorbar(
+ ax2, dfa, dfb, colormap, orient="horizontal", title="Fluorescence Intensity [AU]"
+)
# To improve readability we tilt the ticklabels on the colorbar
labels = cbar.ax.get_xticklabels()
-plt.setp(labels, rotation=45, horizontalalignment='center')
+plt.setp(labels, rotation=45, horizontalalignment="center")
plt.show()
########################################################################
# References
# ----------
#
-# .. footbibliography::
\ No newline at end of file
+# .. footbibliography::
diff --git a/doc/examples/scripts/sequence/homology/residue_coevolution.py b/doc/examples/scripts/sequence/homology/residue_coevolution.py
index e1f2f7329..e84b59f2f 100644
--- a/doc/examples/scripts/sequence/homology/residue_coevolution.py
+++ b/doc/examples/scripts/sequence/homology/residue_coevolution.py
@@ -43,22 +43,21 @@
# License: BSD 3 clause
import warnings
-import numpy as np
-import matplotlib.pyplot as plt
import matplotlib.colors as colors
+import matplotlib.pyplot as plt
+import numpy as np
import biotite
-import biotite.structure as struc
-import biotite.structure.io.pdbx as pdbx
-import biotite.sequence.align as align
-import biotite.sequence.graphics as graphics
import biotite.application.blast as blast
import biotite.application.clustalo as clustalo
import biotite.database.rcsb as rcsb
-
+import biotite.sequence.align as align
+import biotite.sequence.graphics as graphics
+import biotite.structure as struc
+import biotite.structure.io.pdbx as pdbx
# Get structure and sequence
pdbx_file = pdbx.CIFFile.read(rcsb.fetch("1GUU", "mmcif"))
-sequence = pdbx.get_sequence(pdbx_file)['A']
+sequence = pdbx.get_sequence(pdbx_file)["A"]
# 'use_author_fields' is set to false,
# to ensure that values in the 'res_id' annotation point to the sequence
structure = pdbx.get_structure(pdbx_file, model=1, use_author_fields=False)
@@ -88,16 +87,24 @@
# Plot MSA
number_functions = []
for start in hit_starts:
+
def some_func(x, start=start):
return x + start
+
number_functions.append(some_func)
fig = plt.figure(figsize=(8.0, 8.0))
ax = fig.gca()
graphics.plot_alignment_type_based(
- ax, alignment, symbols_per_line=len(alignment), labels=hit_ids,
- symbol_size=8, number_size=8, label_size=8,
- show_numbers=True, number_functions=number_functions,
- color_scheme="flower"
+ ax,
+ alignment,
+ symbols_per_line=len(alignment),
+ labels=hit_ids,
+ symbol_size=8,
+ number_size=8,
+ label_size=8,
+ show_numbers=True,
+ number_functions=number_functions,
+ color_scheme="flower",
)
ax.set_title("C-Myb R1-like sequences")
fig.tight_layout()
@@ -111,6 +118,7 @@ def some_func(x, start=start):
# High values indicate that the residues at the respective two
# positions have coevolved.
+
def mutual_information_zscore(alignment, n_shuffle=100):
codes = align.get_codes(alignment).T
alph = alignment.sequences[0].alphabet
@@ -127,6 +135,7 @@ def mutual_information_zscore(alignment, n_shuffle=100):
z_score = (mi - mean) / std
return z_score
+
def _shuffle(codes):
shuffled_codes = codes.copy()
# Shuffle each alignment column
@@ -134,6 +143,7 @@ def _shuffle(codes):
np.random.shuffle(shuffled_codes[i])
return shuffled_codes
+
def _mutual_information(codes, alph):
mi = np.zeros((len(alignment), len(alignment)))
# Iterate over all columns to choose first column
@@ -147,10 +157,10 @@ def _mutual_information(codes, alph):
# Iterate over all symbols in both columns
for k in range(codes.shape[1]):
# Skip rows where either column has a gap
- if codes[i,k] != -1 and codes[j,k] != -1:
- marginal_counts_i[codes[i,k]] += 1
- marginal_counts_j[codes[j,k]] += 1
- combined_counts[codes[i,k], codes[j,k]] += 1
+ if codes[i, k] != -1 and codes[j, k] != -1:
+ marginal_counts_i[codes[i, k]] += 1
+ marginal_counts_j[codes[j, k]] += 1
+ combined_counts[codes[i, k], codes[j, k]] += 1
nrows += 1
marginal_probs_i = marginal_counts_i / nrows
marginal_probs_j = marginal_counts_j / nrows
@@ -159,27 +169,31 @@ def _mutual_information(codes, alph):
with warnings.catch_warnings():
warnings.simplefilter("ignore")
mi_before_sum = (
- combined_probs * np.log2(
- combined_probs / (
- marginal_probs_i[:, np.newaxis] *
- marginal_probs_j[np.newaxis, :]
+ combined_probs
+ * np.log2(
+ combined_probs
+ / (
+ marginal_probs_i[:, np.newaxis]
+ * marginal_probs_j[np.newaxis, :]
)
)
).flatten()
- mi[i,j] = np.sum(mi_before_sum[~np.isnan(mi_before_sum)])
+ mi[i, j] = np.sum(mi_before_sum[~np.isnan(mi_before_sum)])
return mi
# Remove alignment columns that have a gap in the C-Myb sequence
-alignment = alignment[alignment.trace[:,0] != -1]
+alignment = alignment[alignment.trace[:, 0] != -1]
mi = mutual_information_zscore(alignment)
# Create the color map for the plot
color = colors.to_rgb(biotite.colors["dimorange"])
cmap_val = np.stack(
- [np.interp(np.linspace(0, 1, 100), [0, 1], [1, color[i]])
- for i in range(len(color))]
+ [
+ np.interp(np.linspace(0, 1, 100), [0, 1], [1, color[i]])
+ for i in range(len(color))
+ ]
).transpose()
cmap = colors.ListedColormap(cmap_val)
@@ -196,4 +210,4 @@ def _mutual_information(codes, alph):
fig.tight_layout()
# sphinx_gallery_thumbnail_number = 2
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/sequence/homology/thca_synthase_polymorphism.py b/doc/examples/scripts/sequence/homology/thca_synthase_polymorphism.py
index a88d24284..b8c8ad276 100644
--- a/doc/examples/scripts/sequence/homology/thca_synthase_polymorphism.py
+++ b/doc/examples/scripts/sequence/homology/thca_synthase_polymorphism.py
@@ -23,29 +23,28 @@
# Code source: Patrick Kunzmann
# License: BSD 3 clause
-import numpy as np
import matplotlib.pyplot as plt
+import numpy as np
from matplotlib.colors import LinearSegmentedColormap
+import biotite.database.entrez as entrez
import biotite.sequence as seq
import biotite.sequence.align as align
-import biotite.sequence.io.genbank as gb
-import biotite.sequence.align as align
import biotite.sequence.graphics as graphics
-import biotite.database.entrez as entrez
-import biotite.application.clustalo as clustalo
-
+import biotite.sequence.io.genbank as gb
# Search for DNA sequences that belong to the cited article
-query = entrez.SimpleQuery("Forensic Sci. Int.", "Journal") \
- & entrez.SimpleQuery("159", "Volume") \
- & entrez.SimpleQuery("132-140", "Page Number")
+query = (
+ entrez.SimpleQuery("Forensic Sci. Int.", "Journal")
+ & entrez.SimpleQuery("159", "Volume")
+ & entrez.SimpleQuery("132-140", "Page Number")
+)
uids = entrez.search(query, db_name="nuccore")
# Download and read file containing the Genbank records for the THCA
# synthase genes
-multi_file = gb.MultiFile.read(entrez.fetch_single_file(
- uids, file_name=None, db_name="nuccore", ret_type="gb"
-))
+multi_file = gb.MultiFile.read(
+ entrez.fetch_single_file(uids, file_name=None, db_name="nuccore", ret_type="gb")
+)
# This dictionary maps the strain ID to the protein sequence
@@ -81,6 +80,7 @@
for sequence in sequences.values():
assert len(sequence) == seq_len
+
# Create consensus sequences for the drug-type and fiber-type cannabis
# strains
def create_consensus(sequences):
@@ -89,9 +89,7 @@ def create_consensus(sequences):
for seq_pos in range(seq_len):
# Count the number of occurrences of each amino acid
# at the given sequence position
- counts = np.bincount(
- [sequence.code[seq_pos] for sequence in sequences]
- )
+ counts = np.bincount([sequence.code[seq_pos] for sequence in sequences])
# The consensus amino acid is the most frequent amino acid
consensus_code[seq_pos] = np.argmax(counts)
# Create empty ProteinSequence object...
@@ -101,6 +99,7 @@ def create_consensus(sequences):
consensus_sequence.code = consensus_code
return consensus_sequence
+
drug_type_consensus = create_consensus(
[sequences[strain] for strain in (1, 10, 13, 20, 53, 54)]
)
@@ -120,7 +119,8 @@ def create_consensus(sequences):
# At low similarity the symbols are colored red,
# at high similarity the symbols are colored white
cmap = LinearSegmentedColormap.from_list(
- "custom", colors=[(1.0, 0.3, 0.3), (1.0, 1.0, 1.0)]
+ "custom",
+ colors=[(1.0, 0.3, 0.3), (1.0, 1.0, 1.0)],
# ^ reddish ^ white
)
@@ -128,11 +128,16 @@ def create_consensus(sequences):
ax = fig.add_subplot(111)
graphics.plot_alignment_similarity_based(
- ax, alignment, matrix=matrix, symbols_per_line=50,
+ ax,
+ alignment,
+ matrix=matrix,
+ symbols_per_line=50,
labels=["Drug-type", "Fiber-type"],
- show_numbers=True, cmap=cmap, symbol_size=8
+ show_numbers=True,
+ cmap=cmap,
+ symbol_size=8,
)
fig.tight_layout()
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/sequence/misc/blosum_dendrogram.py b/doc/examples/scripts/sequence/misc/blosum_dendrogram.py
index 64d67f2f7..400497ef4 100644
--- a/doc/examples/scripts/sequence/misc/blosum_dendrogram.py
+++ b/doc/examples/scripts/sequence/misc/blosum_dendrogram.py
@@ -10,12 +10,12 @@
# Code source: Patrick Kunzmann
# License: BSD 3 clause
-import numpy as np
import matplotlib.pyplot as plt
+import numpy as np
import biotite.sequence as seq
import biotite.sequence.align as align
-import biotite.sequence.phylo as phylo
import biotite.sequence.graphics as graphics
+import biotite.sequence.phylo as phylo
# Obtain BLOSUM62
matrix = align.SubstitutionMatrix.std_protein_matrix()
@@ -31,11 +31,12 @@
matrix = align.SubstitutionMatrix(
seq.Alphabet(matrix.get_alphabet1().get_symbols()[:-4]),
seq.Alphabet(matrix.get_alphabet2().get_symbols()[:-4]),
- matrix.score_matrix()[:-4, :-4]
+ matrix.score_matrix()[:-4, :-4],
)
similarities = matrix.score_matrix()
print(matrix)
+
########################################################################
# Now a function must be defined, that converts the similarity depicted
# by a substitution matrix into a distance required by the UPGMA method.
@@ -45,25 +46,26 @@
#
# Finally the obtained (phylogenetic) tree is plotted as dendrogram.
def get_distance(similarities, i, j):
- s_max = (similarities[i,i] + similarities[j,j]) / 2
- return s_max - similarities[i,j]
+ s_max = (similarities[i, i] + similarities[j, j]) / 2
+ return s_max - similarities[i, j]
+
distances = np.zeros(similarities.shape)
for i in range(distances.shape[0]):
for j in range(distances.shape[1]):
- distances[i,j] = get_distance(similarities, i, j)
+ distances[i, j] = get_distance(similarities, i, j)
tree = phylo.upgma(distances)
fig = plt.figure(figsize=(8.0, 5.0))
ax = fig.add_subplot(111)
# Use the 3-letter amino acid code aa label
-labels = [seq.ProteinSequence.convert_letter_1to3(letter).capitalize()
- for letter in matrix.get_alphabet1()]
-graphics.plot_dendrogram(
- ax, tree, orientation="top", labels=labels
-)
+labels = [
+ seq.ProteinSequence.convert_letter_1to3(letter).capitalize()
+ for letter in matrix.get_alphabet1()
+]
+graphics.plot_dendrogram(ax, tree, orientation="top", labels=labels)
ax.set_ylabel("Distance")
# Add grid for clearer distance perception
ax.yaxis.grid(color="lightgray")
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/sequence/misc/codon_usage.py b/doc/examples/scripts/sequence/misc/codon_usage.py
index dd6963c24..e6d7b888d 100644
--- a/doc/examples/scripts/sequence/misc/codon_usage.py
+++ b/doc/examples/scripts/sequence/misc/codon_usage.py
@@ -35,14 +35,13 @@
# Code source: Patrick Kunzmann
# License: BSD 3 clause
-import tempfile
import itertools
+import tempfile
import numpy as np
+import biotite.database.entrez as entrez
import biotite.sequence as seq
-import biotite.sequence.io.genbank as gb
import biotite.sequence.io.fasta as fasta
-import biotite.database.entrez as entrez
-
+import biotite.sequence.io.genbank as gb
# Get the E. coli K-12 genome as annotated sequence
gb_file = gb.GenBankFile.read(
@@ -56,8 +55,8 @@
# For increased performance the dictionary uses symbol codes ([0 3 2])
# instead of symbols (['A' 'T' 'G']) as keys
codon_counter = {
- codon: 0 for codon
- in itertools.product( *([range(len(k12_genome.sequence.alphabet))] * 3) )
+ codon: 0
+ for codon in itertools.product(*([range(len(k12_genome.sequence.alphabet))] * 3))
}
# For demonstration purposes print the 64 codons in symbol code form
print(list(codon_counter.keys()))
@@ -82,7 +81,7 @@
# Iterate over the sequence in non-overlapping frames of 3
# and count the occurence of each codon
for i in range(0, len(cds_seq), 3):
- codon_code = tuple(cds_seq.code[i:i+3])
+ codon_code = tuple(cds_seq.code[i : i + 3])
codon_counter[codon_code] += 1
# Convert the total frequencies into relative frequencies
@@ -165,4 +164,4 @@
# Print the contents of the created FASTA file
print(fasta_file)
# In a real application it would be written onto the hard drive via
-# fasta_file.write("some_file.fasta")
\ No newline at end of file
+# fasta_file.write("some_file.fasta")
diff --git a/doc/examples/scripts/sequence/misc/color_schemes.py b/doc/examples/scripts/sequence/misc/color_schemes.py
index de2dd80ad..b84542932 100644
--- a/doc/examples/scripts/sequence/misc/color_schemes.py
+++ b/doc/examples/scripts/sequence/misc/color_schemes.py
@@ -8,57 +8,65 @@
# Code source: Patrick Kunzmann
# License: BSD 3 clause
-import biotite.sequence as seq
-import biotite.sequence.graphics as graphics
-import numpy as np
import matplotlib.pyplot as plt
+import numpy as np
from matplotlib.gridspec import GridSpec
from matplotlib.patches import Rectangle
+import biotite.sequence as seq
+import biotite.sequence.graphics as graphics
+
def plot_colors(ax, alphabet):
- x_space=0.1
- y_space=0.3
+ x_space = 0.1
+ y_space = 0.3
scheme_names = sorted(graphics.list_color_scheme_names(alphabet))
scheme_names.reverse()
- schemes = [graphics.get_color_scheme(name, alphabet)
- for name in scheme_names]
+ schemes = [graphics.get_color_scheme(name, alphabet) for name in scheme_names]
for i, scheme in enumerate(schemes):
for j, color in enumerate(scheme):
- box = Rectangle((j - 0.5 + x_space/2, i - 0.5 + y_space/2),
- 1 - x_space, 1 - y_space, color=color,
- linewidth=0)
+ box = Rectangle(
+ (j - 0.5 + x_space / 2, i - 0.5 + y_space / 2),
+ 1 - x_space,
+ 1 - y_space,
+ color=color,
+ linewidth=0,
+ )
ax.add_patch(box)
ax.set_xticks(np.arange(len(alphabet)))
ax.set_yticks(np.arange(len(schemes)))
ax.set_xticklabels([symbol for symbol in alphabet])
ax.set_yticklabels(scheme_names)
- ax.set_xlim(-0.5, len(alphabet)-0.5)
- ax.set_ylim(-0.5, len(schemes)-0.5)
+ ax.set_xlim(-0.5, len(alphabet) - 0.5)
+ ax.set_ylim(-0.5, len(schemes) - 0.5)
ax.spines["left"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.spines["bottom"].set_visible(False)
ax.spines["top"].set_visible(False)
- ax.xaxis.set_ticks_position("none")
+ ax.xaxis.set_ticks_position("none")
ax.yaxis.set_ticks_position("none")
+
nuc_alphabet = seq.NucleotideSequence.alphabet_amb
prot_alphabet = seq.ProteinSequence.alphabet
pb_alphabet = seq.LetterAlphabet("abcdefghijklmnop")
figure = plt.figure(figsize=(8.0, 5.0))
gs = GridSpec(
- 3, 1,
- height_ratios=[len(graphics.list_color_scheme_names(alphabet))
- for alphabet in (nuc_alphabet, prot_alphabet, pb_alphabet)],
+ 3,
+ 1,
+ height_ratios=[
+ len(graphics.list_color_scheme_names(alphabet))
+ for alphabet in (nuc_alphabet, prot_alphabet, pb_alphabet)
+ ],
)
-ax = figure.add_subplot(gs[0,0])
+ax = figure.add_subplot(gs[0, 0])
ax.set_title("Nucleotide color schemes")
plot_colors(ax, nuc_alphabet)
-ax = figure.add_subplot(gs[1,0])
+ax = figure.add_subplot(gs[1, 0])
ax.set_title("Protein color schemes")
plot_colors(ax, prot_alphabet)
-ax = figure.add_subplot(gs[2,0])
+ax = figure.add_subplot(gs[2, 0])
ax.set_title("Protein block color schemes")
plot_colors(ax, pb_alphabet)
plt.tight_layout()
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/sequence/misc/color_schemes_protein.py b/doc/examples/scripts/sequence/misc/color_schemes_protein.py
index d68b6cddc..a747c2c74 100644
--- a/doc/examples/scripts/sequence/misc/color_schemes_protein.py
+++ b/doc/examples/scripts/sequence/misc/color_schemes_protein.py
@@ -42,24 +42,23 @@
# Code source: Patrick Kunzmann
# License: BSD 3 clause
-import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
+import biotite.database.entrez as entrez
import biotite.sequence as seq
-import biotite.sequence.io.fasta as fasta
import biotite.sequence.align as align
import biotite.sequence.graphics as graphics
-import biotite.database.entrez as entrez
-
+import biotite.sequence.io.fasta as fasta
# Generate example alignment
# (the same as in the bacterial luciferase example)
-query = entrez.SimpleQuery("luxA", "Gene Name") \
- & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
+query = entrez.SimpleQuery("luxA", "Gene Name") & entrez.SimpleQuery(
+ "srcdb_swiss-prot", "Properties"
+)
uids = entrez.search(query, db_name="protein")
-fasta_file = fasta.FastaFile.read(entrez.fetch_single_file(
- uids, None, db_name="protein", ret_type="fasta"
-))
+fasta_file = fasta.FastaFile.read(
+ entrez.fetch_single_file(uids, None, db_name="protein", ret_type="fasta")
+)
sequences = [seq.ProteinSequence(seq_str) for seq_str in fasta_file.values()]
matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment, order, _, _ = align.align_multiple(sequences, matrix)
@@ -70,10 +69,22 @@
# Get color scheme names
alphabet = seq.ProteinSequence.alphabet
schemes = [
- "flower", "blossom", "spring", "wither", "autumn", "sunset", "ocean",
- "rainbow", "clustalx",
- "zappo", "taylor", "buried", "hydrophobicity",
- "prophelix", "propstrand", "propturn"
+ "flower",
+ "blossom",
+ "spring",
+ "wither",
+ "autumn",
+ "sunset",
+ "ocean",
+ "rainbow",
+ "clustalx",
+ "zappo",
+ "taylor",
+ "buried",
+ "hydrophobicity",
+ "prophelix",
+ "propstrand",
+ "propturn",
]
count = len(schemes)
# Assert that this example displays all available amino acid color schemes
@@ -82,20 +93,24 @@
# Visualize each scheme using the example alignment
-fig = plt.figure(figsize=(8.0, count*2.0))
+fig = plt.figure(figsize=(8.0, count * 2.0))
gridspec = GridSpec(2, count)
for i, name in enumerate(schemes):
for j, color_symbols in enumerate([False, True]):
- ax = fig.add_subplot(count, 2, 2*i + j + 1)
+ ax = fig.add_subplot(count, 2, 2 * i + j + 1)
if j == 0:
ax.set_ylabel(name)
alignment_part = alignment[:40]
else:
alignment_part = alignment[40:]
graphics.plot_alignment_type_based(
- ax, alignment_part, symbols_per_line=len(alignment_part),
- color_scheme=name, color_symbols=color_symbols, symbol_size=8
+ ax,
+ alignment_part,
+ symbols_per_line=len(alignment_part),
+ color_scheme=name,
+ color_symbols=color_symbols,
+ symbol_size=8,
)
fig.tight_layout()
fig.subplots_adjust(wspace=0)
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/sequence/misc/local_alignment_statistics.py b/doc/examples/scripts/sequence/misc/local_alignment_statistics.py
index 88d0eb4ad..aa3fd533e 100644
--- a/doc/examples/scripts/sequence/misc/local_alignment_statistics.py
+++ b/doc/examples/scripts/sequence/misc/local_alignment_statistics.py
@@ -22,25 +22,23 @@
# License: BSD 3 clause
import matplotlib.pyplot as plt
-from matplotlib.lines import Line2D
import numpy as np
+from matplotlib.lines import Line2D
from scipy.stats import linregress
import biotite
+import biotite.database.entrez as entrez
import biotite.sequence as seq
import biotite.sequence.align as align
-from biotite.sequence.align.alignment import score
-import biotite.sequence.io.fasta as fasta
-import biotite.database.entrez as entrez
import biotite.sequence.graphics as graphics
-
+import biotite.sequence.io.fasta as fasta
GAP_PENALTY = (-12, -1)
# Download and parse protein sequences of avidin and streptavidin
-fasta_file = fasta.FastaFile.read(entrez.fetch_single_file(
- ["CAC34569", "ACL82594"], None, "protein", "fasta"
-))
+fasta_file = fasta.FastaFile.read(
+ entrez.fetch_single_file(["CAC34569", "ACL82594"], None, "protein", "fasta")
+)
for name, sequence in fasta_file.items():
if "CAC34569" in name:
query_seq = seq.ProteinSequence(sequence)
@@ -54,8 +52,7 @@
# Perform pairwise sequence alignment with affine gap penalty
# Terminal gaps are not penalized
alignment = align.align_optimal(
- query_seq, hit_seq, matrix,
- local=True, gap_penalty=GAP_PENALTY, max_number=1
+ query_seq, hit_seq, matrix, local=True, gap_penalty=GAP_PENALTY, max_number=1
)[0]
@@ -64,8 +61,12 @@
fig = plt.figure(figsize=(8.0, 3.0))
ax = fig.add_subplot(111)
graphics.plot_alignment_similarity_based(
- ax, alignment, matrix=matrix, labels=["Avidin (query)", "Database hit"],
- show_numbers=True, show_line_position=True
+ ax,
+ alignment,
+ matrix=matrix,
+ labels=["Avidin (query)", "Database hit"],
+ show_numbers=True,
+ show_line_position=True,
)
fig.tight_layout()
@@ -103,10 +104,12 @@
#
# f(x) = \lambda t(x) e^{-t(x)}
+
# The probability density function of the extreme value distribution
-def pdf(x, l, u):
- t = np.exp(-l * (x - u))
- return l * t * np.exp(-t)
+def pdf(x, lam, u):
+ t = np.exp(-lam * (x - u))
+ return lam * t * np.exp(-t)
+
x = np.linspace(-5, 10, 1000)
y = pdf(x, 1, 0)
@@ -124,7 +127,7 @@ def pdf(x, l, u):
# .. math::
#
# u = \frac{\ln Kmn}{\lambda},
-#
+#
# where :math:`m` and :math:`n` are the lengths of the aligned
# sequences.
# :math:`K` and :math:`\lambda` can be calculated from the substitution
@@ -166,32 +169,39 @@ def pdf(x, l, u):
SAMPLE_SIZE = 10000
SEQ_LENGTH = 300
-BACKGROUND = np.array(list({
- "A": 35155,
- "C": 8669,
- "D": 24161,
- "E": 28354,
- "F": 17367,
- "G": 33229,
- "H": 9906,
- "I": 23161,
- "K": 25872,
- "L": 40625,
- "M": 10101,
- "N": 20212,
- "P": 23435,
- "Q": 19208,
- "R": 23105,
- "S": 32070,
- "T": 26311,
- "V": 29012,
- "W": 5990,
- "Y": 14488,
- "B": 0,
- "Z": 0,
- "X": 0,
- "*": 0,
-}.values())) / 450431
+BACKGROUND = (
+ np.array(
+ list(
+ {
+ "A": 35155,
+ "C": 8669,
+ "D": 24161,
+ "E": 28354,
+ "F": 17367,
+ "G": 33229,
+ "H": 9906,
+ "I": 23161,
+ "K": 25872,
+ "L": 40625,
+ "M": 10101,
+ "N": 20212,
+ "P": 23435,
+ "Q": 19208,
+ "R": 23105,
+ "S": 32070,
+ "T": 26311,
+ "V": 29012,
+ "W": 5990,
+ "Y": 14488,
+ "B": 0,
+ "Z": 0,
+ "X": 0,
+ "*": 0,
+ }.values()
+ )
+ )
+ / 450431
+)
# Generate the sequence code for random sequences
@@ -199,7 +209,7 @@ def pdf(x, l, u):
random_sequence_code = np.random.choice(
np.arange(len(seq.ProteinSequence.alphabet)),
size=(SAMPLE_SIZE, 2, SEQ_LENGTH),
- p=BACKGROUND
+ p=BACKGROUND,
)
# Sample alignment scores
@@ -207,11 +217,10 @@ def pdf(x, l, u):
for i in range(SAMPLE_SIZE):
seq1 = seq.ProteinSequence()
seq2 = seq.ProteinSequence()
- seq1.code = random_sequence_code[i,0]
- seq2.code = random_sequence_code[i,1]
+ seq1.code = random_sequence_code[i, 0]
+ seq2.code = random_sequence_code[i, 1]
sample_alignment = align.align_optimal(
- seq1, seq2, matrix,
- local=True, gap_penalty=GAP_PENALTY, max_number=1
+ seq1, seq2, matrix, local=True, gap_penalty=GAP_PENALTY, max_number=1
)[0]
sample_scores[i] = sample_alignment.score
@@ -231,25 +240,24 @@ def pdf(x, l, u):
# respectively.
# Use method of moments to estimate distribution parameters
-l = np.pi / np.sqrt(6 * np.var(sample_scores))
-u = np.mean(sample_scores) - np.euler_gamma / l
+lam = np.pi / np.sqrt(6 * np.var(sample_scores))
+u = np.mean(sample_scores) - np.euler_gamma / lam
# Score frequencies for the histogram
freqs = np.bincount(sample_scores) / SAMPLE_SIZE
# Coordinates for the fit
-x = np.linspace(0, len(freqs)-1, 1000)
-y = pdf(x, l, u)
+x = np.linspace(0, len(freqs) - 1, 1000)
+y = pdf(x, lam, u)
fig, ax = plt.subplots(figsize=(8.0, 4.0))
ax.scatter(
- np.arange(len(freqs)), freqs, color=biotite.colors["dimorange"],
- label="Sample", s=8
+ np.arange(len(freqs)), freqs, color=biotite.colors["dimorange"], label="Sample", s=8
)
ax.plot(x, y, color="gray", linestyle="--", label="Fit")
ax.set_xlabel("Similarity score")
ax.set_ylabel("Probability")
-ax.set_xlim(0, len(freqs)-1)
+ax.set_xlim(0, len(freqs) - 1)
ax.legend(loc="upper left")
fig.tight_layout()
@@ -281,35 +289,33 @@ def pdf(x, l, u):
SAMPLE_SIZE_PER_LENGTH = 1000
# The sequence lengths to be sampled
-length_samples = np.logspace(*np.log10(LENGTH_RANGE), LENGTH_SAMPLE_SIZE) \
- .astype(int)
+length_samples = np.logspace(*np.log10(LENGTH_RANGE), LENGTH_SAMPLE_SIZE).astype(int)
u_series = np.zeros(LENGTH_SAMPLE_SIZE)
-l_series = np.zeros(LENGTH_SAMPLE_SIZE)
+lam_series = np.zeros(LENGTH_SAMPLE_SIZE)
for i, length in enumerate(length_samples):
# The same procedure from above
random_sequence_code = np.random.choice(
np.arange(len(seq.ProteinSequence.alphabet)),
size=(SAMPLE_SIZE_PER_LENGTH, 2, length),
- p=BACKGROUND
+ p=BACKGROUND,
)
scores = np.zeros(SAMPLE_SIZE_PER_LENGTH, dtype=int)
for j in range(SAMPLE_SIZE_PER_LENGTH):
seq1 = seq.ProteinSequence()
seq2 = seq.ProteinSequence()
- seq1.code = random_sequence_code[j,0]
- seq2.code = random_sequence_code[j,1]
+ seq1.code = random_sequence_code[j, 0]
+ seq2.code = random_sequence_code[j, 1]
sample_alignment = align.align_optimal(
- seq1, seq2, matrix,
- local=True, gap_penalty=GAP_PENALTY, max_number=1
+ seq1, seq2, matrix, local=True, gap_penalty=GAP_PENALTY, max_number=1
)[0]
scores[j] = sample_alignment.score
- l_series[i] = np.pi / np.sqrt(6 * np.var(scores))
- u_series[i] = np.mean(scores) - np.euler_gamma / l_series[i]
+ lam_series[i] = np.pi / np.sqrt(6 * np.var(scores))
+ u_series[i] = np.mean(scores) - np.euler_gamma / lam_series[i]
########################################################################
-# Now we use a linear fit of :math:`u` to check if there is a linear
+# Now we use a linear fit of :math:`u` to check if there is a linear
# relation.
# Furthermore, if this is true, the slope and intercept of
# the fit should give us a more precise estimation of :math:`\lambda`
@@ -319,39 +325,37 @@ def pdf(x, l, u):
slope, intercept, r, _, _ = linregress(ln_mn, u_series)
# More precise parameter estimation from fit
-l = 1/slope
-k = np.exp(intercept * l)
+lam = 1 / slope
+k = np.exp(intercept * lam)
# Coordinates for fit
x_fit = np.linspace(0, 16, 100)
y_fit = slope * x_fit + intercept
fig, ax = plt.subplots(figsize=(8.0, 4.0))
-arrowprops = dict(
- facecolor='black', shrink=0.1, width=3, headwidth=10, headlength=10
-)
+arrowprops = dict(facecolor="black", shrink=0.1, width=3, headwidth=10, headlength=10)
ax.scatter(ln_mn, u_series, color=biotite.colors["dimorange"], s=8)
ax.plot(x_fit, y_fit, color=biotite.colors["darkorange"], linestyle="--")
x_annot = 12
ax.annotate(
f"R² = {r**2:.3f}\nK = {k:.3f}",
- xy = (x_annot, slope * x_annot + intercept),
- xytext = (-100, 50),
- textcoords = "offset pixels",
- arrowprops = arrowprops,
+ xy=(x_annot, slope * x_annot + intercept),
+ xytext=(-100, 50),
+ textcoords="offset pixels",
+ arrowprops=arrowprops,
)
ax2 = ax.twinx()
-ax2.scatter(ln_mn, l_series, color=biotite.colors["lightgreen"], s=8)
-ax2.axhline(l, color=biotite.colors["darkgreen"], linestyle=":")
+ax2.scatter(ln_mn, lam_series, color=biotite.colors["lightgreen"], s=8)
+ax2.axhline(lam, color=biotite.colors["darkgreen"], linestyle=":")
x_annot = 2
ax2.annotate(
- f"λ = {l:.3f}",
- xy = (x_annot, l),
- xytext = (0, -50),
- textcoords = "offset pixels",
- arrowprops = arrowprops,
+ f"λ = {lam:.3f}",
+ xy=(x_annot, lam),
+ xytext=(0, -50),
+ textcoords="offset pixels",
+ arrowprops=arrowprops,
)
ax.set_xlabel("ln(mn)")
@@ -361,17 +365,25 @@ def pdf(x, l, u):
ax.set_ylim(0, 50)
ax2.set_ylim(0, 0.6)
ax.legend(
- handles = [
+ handles=[
Line2D(
- [0], [0], color=biotite.colors["dimorange"], label='u',
- marker='o', linestyle="None"
+ [0],
+ [0],
+ color=biotite.colors["dimorange"],
+ label="u",
+ marker="o",
+ linestyle="None",
),
Line2D(
- [0], [0], color=biotite.colors["lightgreen"], label='λ',
- marker='o', linestyle="None"
- )
+ [0],
+ [0],
+ color=biotite.colors["lightgreen"],
+ label="λ",
+ marker="o",
+ linestyle="None",
+ ),
],
- loc = "upper left"
+ loc="upper left",
)
fig.tight_layout()
@@ -398,17 +410,17 @@ def pdf(x, l, u):
# E-value calculation
# -------------------
#
-# Finally, we can use the estimated parameters to calculate the E-value
+# Finally, we can use the estimated parameters to calculate the E-value
# of the alignment of interest.
# In this case we use :math:`K` and :math:`\lambda` from the linear fit,
# but as already indicated we could alternatively use the parameters
# from sampling alignments of sequences at a single length :math:`n`.
# While :math:`\lambda` is a direct result of the method of moments as
-# shown above, :math:`K` is calculated as
+# shown above, :math:`K` is calculated as
#
# .. math::
#
-# K = \frac{e^{\lambda u}}{n^2}
+# K = \frac{e^{\lambda u}}{n^2}
#
# where :math:`n` is the length of both sequences in each sample.
#
@@ -425,12 +437,12 @@ def pdf(x, l, u):
DATABASE_SIZE = 1_000_000
-def e_value(score, length1, length2, k, l):
- return k * length1 * length2 * np.exp(-l * score)
-e = e_value(
- alignment.score, len(query_seq), len(hit_seq) * DATABASE_SIZE, k, l
-)
+def e_value(score, length1, length2, k, lam):
+ return k * length1 * length2 * np.exp(-lam * score)
+
+
+e = e_value(alignment.score, len(query_seq), len(hit_seq) * DATABASE_SIZE, k, lam)
print(f"E-value = {e:.2e}")
########################################################################
diff --git a/doc/examples/scripts/sequence/misc/orf_identification.py b/doc/examples/scripts/sequence/misc/orf_identification.py
index 6c7d87abd..695b30af8 100644
--- a/doc/examples/scripts/sequence/misc/orf_identification.py
+++ b/doc/examples/scripts/sequence/misc/orf_identification.py
@@ -16,10 +16,8 @@
# Code source: Patrick Kunzmann
# License: BSD 3 clause
-import biotite.sequence as seq
-import biotite.sequence.io.fasta as fasta
import biotite.database.entrez as entrez
-import matplotlib.pyplot as plt
+import biotite.sequence.io.fasta as fasta
# Download Porcine circovirus genome
file = entrez.fetch("KP282147", None, "fa", "nuccore", "fasta")
@@ -29,13 +27,19 @@
proteins, positions = genome.translate()
print("Forward strand:")
for i in range(len(proteins)):
- print("{:4d} - {:4d}: {:}"
- .format(positions[i][0], positions[i][1], str(proteins[i])))
+ print(
+ "{:4d} - {:4d}: {:}".format(
+ positions[i][0], positions[i][1], str(proteins[i])
+ )
+ )
print("\n")
# Perform translation for complementary strand
genome_rev = genome.reverse().complement()
proteins, positions = genome_rev.translate()
print("Reverse strand:")
for i in range(len(proteins)):
- print("{:5d} - {:5d}: {:}"
- .format(positions[i][0], positions[i][1], str(proteins[i])))
\ No newline at end of file
+ print(
+ "{:5d} - {:5d}: {:}".format(
+ positions[i][0], positions[i][1], str(proteins[i])
+ )
+ )
diff --git a/doc/examples/scripts/sequence/profile/anderson_logo.py b/doc/examples/scripts/sequence/profile/anderson_logo.py
index 50b195f56..20cee13b2 100644
--- a/doc/examples/scripts/sequence/profile/anderson_logo.py
+++ b/doc/examples/scripts/sequence/profile/anderson_logo.py
@@ -9,33 +9,35 @@
# Code source: Patrick Kunzmann
# License: BSD 3 clause
-import numpy as np
import matplotlib.pyplot as plt
+import numpy as np
import biotite.sequence as seq
import biotite.sequence.align as align
import biotite.sequence.graphics as graphics
# The list of Anderson promoters
-seqs = [seq.NucleotideSequence("ttgacagctagctcagtcctaggtataatgctagc"),
- seq.NucleotideSequence("ttgacagctagctcagtcctaggtataatgctagc"),
- seq.NucleotideSequence("tttacagctagctcagtcctaggtattatgctagc"),
- seq.NucleotideSequence("ttgacagctagctcagtcctaggtactgtgctagc"),
- seq.NucleotideSequence("ctgatagctagctcagtcctagggattatgctagc"),
- seq.NucleotideSequence("ttgacagctagctcagtcctaggtattgtgctagc"),
- seq.NucleotideSequence("tttacggctagctcagtcctaggtactatgctagc"),
- seq.NucleotideSequence("tttacggctagctcagtcctaggtatagtgctagc"),
- seq.NucleotideSequence("tttacggctagctcagccctaggtattatgctagc"),
- seq.NucleotideSequence("ctgacagctagctcagtcctaggtataatgctagc"),
- seq.NucleotideSequence("tttacagctagctcagtcctagggactgtgctagc"),
- seq.NucleotideSequence("tttacggctagctcagtcctaggtacaatgctagc"),
- seq.NucleotideSequence("ttgacggctagctcagtcctaggtatagtgctagc"),
- seq.NucleotideSequence("ctgatagctagctcagtcctagggattatgctagc"),
- seq.NucleotideSequence("ctgatggctagctcagtcctagggattatgctagc"),
- seq.NucleotideSequence("tttatggctagctcagtcctaggtacaatgctagc"),
- seq.NucleotideSequence("tttatagctagctcagcccttggtacaatgctagc"),
- seq.NucleotideSequence("ttgacagctagctcagtcctagggactatgctagc"),
- seq.NucleotideSequence("ttgacagctagctcagtcctagggattgtgctagc"),
- seq.NucleotideSequence("ttgacggctagctcagtcctaggtattgtgctagc")]
+seqs = [
+ seq.NucleotideSequence("ttgacagctagctcagtcctaggtataatgctagc"),
+ seq.NucleotideSequence("ttgacagctagctcagtcctaggtataatgctagc"),
+ seq.NucleotideSequence("tttacagctagctcagtcctaggtattatgctagc"),
+ seq.NucleotideSequence("ttgacagctagctcagtcctaggtactgtgctagc"),
+ seq.NucleotideSequence("ctgatagctagctcagtcctagggattatgctagc"),
+ seq.NucleotideSequence("ttgacagctagctcagtcctaggtattgtgctagc"),
+ seq.NucleotideSequence("tttacggctagctcagtcctaggtactatgctagc"),
+ seq.NucleotideSequence("tttacggctagctcagtcctaggtatagtgctagc"),
+ seq.NucleotideSequence("tttacggctagctcagccctaggtattatgctagc"),
+ seq.NucleotideSequence("ctgacagctagctcagtcctaggtataatgctagc"),
+ seq.NucleotideSequence("tttacagctagctcagtcctagggactgtgctagc"),
+ seq.NucleotideSequence("tttacggctagctcagtcctaggtacaatgctagc"),
+ seq.NucleotideSequence("ttgacggctagctcagtcctaggtatagtgctagc"),
+ seq.NucleotideSequence("ctgatagctagctcagtcctagggattatgctagc"),
+ seq.NucleotideSequence("ctgatggctagctcagtcctagggattatgctagc"),
+ seq.NucleotideSequence("tttatggctagctcagtcctaggtacaatgctagc"),
+ seq.NucleotideSequence("tttatagctagctcagcccttggtacaatgctagc"),
+ seq.NucleotideSequence("ttgacagctagctcagtcctagggactatgctagc"),
+ seq.NucleotideSequence("ttgacagctagctcagtcctagggattgtgctagc"),
+ seq.NucleotideSequence("ttgacggctagctcagtcctaggtattgtgctagc"),
+]
# Sequences do not need to be aligned
# -> Create alignment with trivial trace
# [[0 0 0 ...]
@@ -43,11 +45,11 @@
# [2 2 2 ...]
# ... ]
alignment = align.Alignment(
- sequences = seqs,
- trace = np.tile(np.arange(len(seqs[0])), len(seqs)) \
- .reshape(len(seqs), len(seqs[0])) \
- .transpose(),
- score = 0
+ sequences=seqs,
+ trace=np.tile(np.arange(len(seqs[0])), len(seqs))
+ .reshape(len(seqs), len(seqs[0]))
+ .transpose(),
+ score=0,
)
# Create sequence logo from alignment
fig = plt.figure(figsize=(8.0, 1.5))
@@ -57,4 +59,4 @@
# Remove the entire frame
ax.axis("off")
fig.tight_layout()
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/sequence/profile/rbs_identification.py b/doc/examples/scripts/sequence/profile/rbs_identification.py
index acb30ecc1..fd58280bb 100644
--- a/doc/examples/scripts/sequence/profile/rbs_identification.py
+++ b/doc/examples/scripts/sequence/profile/rbs_identification.py
@@ -16,17 +16,15 @@
# License: BSD 3 clause
import tempfile
-import numpy as np
import matplotlib.pyplot as plt
-from matplotlib.patches import Patch
import matplotlib.ticker as ticker
+import numpy as np
+from matplotlib.patches import Patch
import biotite
+import biotite.database.entrez as entrez
import biotite.sequence as seq
-import biotite.sequence.io.genbank as gb
import biotite.sequence.graphics as graphics
-import biotite.database.entrez as entrez
-import biotite.application.muscle as muscle
-
+import biotite.sequence.io.genbank as gb
UTR_LENGTH = 20
@@ -58,18 +56,15 @@
# CDS is on
if loc.strand == seq.Location.Strand.FORWARD:
utr_start = loc.first - UTR_LENGTH
- utr_stop = loc.first
+ utr_stop = loc.first
# Include the start codon (3 bases) in the UTRs for later
# visualization
- utrs.append(
- bl21_genome[utr_start : utr_stop + 3].sequence
- )
+ utrs.append(bl21_genome[utr_start : utr_stop + 3].sequence)
else:
utr_start = loc.last + 1
- utr_stop = loc.last + 1 + UTR_LENGTH
+ utr_stop = loc.last + 1 + UTR_LENGTH
utrs.append(
- bl21_genome[utr_start - 3 : utr_stop].sequence \
- .reverse().complement()
+ bl21_genome[utr_start - 3 : utr_stop].sequence.reverse().complement()
)
@@ -82,14 +77,15 @@
frequencies[np.arange(len(utr)), utr.code] += 1
profile = seq.SequenceProfile(
- symbols = frequencies,
- gaps = np.zeros(len(frequencies)),
- alphabet = bl21_genome.sequence.alphabet
+ symbols=frequencies,
+ gaps=np.zeros(len(frequencies)),
+ alphabet=bl21_genome.sequence.alphabet,
)
### Visualize the profile
+
# Spend extra effort for correct sequence postion labels
def normalize_seq_pos(x):
"""
@@ -103,15 +99,17 @@ def normalize_seq_pos(x):
x -= 1
return x
+
@ticker.FuncFormatter
def sequence_loc_formatter(x, pos):
x = normalize_seq_pos(x)
return f"{x:+}"
+
COLOR_SCHEME = [
- biotite.colors["lightgreen"], # A
- biotite.colors["orange"], # C
- biotite.colors["dimgreen"], # G
+ biotite.colors["lightgreen"], # A
+ biotite.colors["orange"], # C
+ biotite.colors["dimgreen"], # G
biotite.colors["brightorange"], # T
]
@@ -127,11 +125,14 @@ def sequence_loc_formatter(x, pos):
ax.set_ylabel("Conservation (Bits)")
ax.spines["right"].set_visible(False)
ax.spines["top"].set_visible(False)
-ax.legend(loc="upper left", handles=[
- Patch(color=biotite.colors["green"], label="Purine"),
- Patch(color=biotite.colors["lightorange"], label="Pyrimidine"),
-])
+ax.legend(
+ loc="upper left",
+ handles=[
+ Patch(color=biotite.colors["green"], label="Purine"),
+ Patch(color=biotite.colors["lightorange"], label="Pyrimidine"),
+ ],
+)
fig.tight_layout()
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/sequence/sequencing/gene_counts.py b/doc/examples/scripts/sequence/sequencing/gene_counts.py
index 6dd4bed65..6fa73b0c1 100644
--- a/doc/examples/scripts/sequence/sequencing/gene_counts.py
+++ b/doc/examples/scripts/sequence/sequencing/gene_counts.py
@@ -19,21 +19,20 @@
# Code source: Patrick Kunzmann
# License: BSD 3 clause
-from io import StringIO
import functools
-import multiprocessing
import gzip
-import numpy as np
+import multiprocessing
+from io import StringIO
import matplotlib.pyplot as plt
+import numpy as np
import pandas as pd
import requests
import biotite
+import biotite.application.sra as sra
import biotite.sequence as seq
+import biotite.sequence.align as align
import biotite.sequence.io.fasta as fasta
import biotite.sequence.io.fastq as fastq
-import biotite.sequence.align as align
-import biotite.application.sra as sra
-
# The number of processes for read mapping
N_PROCESS = 2
@@ -93,6 +92,7 @@
# extracts the gene symbols, i.e. the 'names' of the genes, and the
# corresponding cDNA sequences.
+
def get_gene_symbol(header):
fields = header.split()
for field in fields:
@@ -103,6 +103,7 @@ def get_gene_symbol(header):
# No gene symbol for this cDNA (e.g. non-coding)
return None
+
response = requests.get(CDNA_URL)
fasta_content = gzip.decompress(response.content).decode("UTF-8")
@@ -123,9 +124,7 @@ def get_gene_symbol(header):
# The k-mer code in restricted to int64, so a larger number
# of base alphabet codes decreases the *k* that fits into
# the integer type
- sequences.append(
- seq.NucleotideSequence(seq_string, ambiguous=False)
- )
+ sequences.append(seq.NucleotideSequence(seq_string, ambiguous=False))
except seq.AlphabetError:
# For the simplicity of this example just ignore sequences
# with unambiguous symbols
@@ -172,13 +171,10 @@ def get_gene_symbol(header):
base_alph = seq.NucleotideSequence.alphabet_unamb
kmer_alph = align.KmerAlphabet(base_alph, K)
-min_selector = align.MinimizerSelector(
- kmer_alph, WINDOW, align.RandomPermutation()
-)
+min_selector = align.MinimizerSelector(kmer_alph, WINDOW, align.RandomPermutation())
kmer_table = align.BucketKmerTable.from_kmer_selection(
- kmer_alph,
- *zip(*[min_selector.select(sequence) for sequence in sequences])
+ kmer_alph, *zip(*[min_selector.select(sequence) for sequence in sequences])
)
########################################################################
@@ -202,6 +198,7 @@ def get_gene_symbol(header):
# After all alignments have been collected, simply the highest-scoring
# one is chosen as the *correct* one.
+
def map_read(read_string, kmer_table, gene_sequences, substitution_matrix):
try:
read = seq.NucleotideSequence(read_string, ambiguous=False)
@@ -226,10 +223,13 @@ def map_read(read_string, kmer_table, gene_sequences, substitution_matrix):
(
gene_i,
align.align_banded(
- read, gene_sequences[gene_i], substitution_matrix,
+ read,
+ gene_sequences[gene_i],
+ substitution_matrix,
band=(diagonal - BAND_WIDTH, diagonal + BAND_WIDTH),
- gap_penalty= -10, max_number=1
- )[0]
+ gap_penalty=-10,
+ max_number=1,
+ )[0],
)
for gene_i, diagonal in zip(matched_gene_indices, matched_diagonals)
]
@@ -243,9 +243,9 @@ def map_read(read_string, kmer_table, gene_sequences, substitution_matrix):
substitution_matrix = align.SubstitutionMatrix.std_nucleotide_matrix()
-for i, (_, (seq_string, q)) in enumerate(fastq.FastqFile.read_iter(
- fastq_path, offset="Sanger"
-)):
+for i, (_, (seq_string, q)) in enumerate(
+ fastq.FastqFile.read_iter(fastq_path, offset="Sanger")
+):
# For demonstration only a single clean read is mapped
if i == 3:
read_string = seq_string
@@ -266,10 +266,11 @@ def map_read(read_string, kmer_table, gene_sequences, substitution_matrix):
# However, for the large number of reads which can be then processed in
# parallel, it is still worth it.
+
def read_iter(fastq_path):
- for i, (_, (read_string, quality)) in enumerate(fastq.FastqFile.read_iter(
- fastq_path, offset="Sanger"
- )):
+ for i, (_, (read_string, quality)) in enumerate(
+ fastq.FastqFile.read_iter(fastq_path, offset="Sanger")
+ ):
# For the purpose of this example only a faction of the reads
# are processed to save computation time
if i >= EXCERPT:
@@ -279,21 +280,24 @@ def read_iter(fastq_path):
continue
yield read_string
+
with multiprocessing.Pool(processes=N_PROCESS) as p:
# Use multiprocessing to map reads to genes
# and remove non-mappable reads (None values) afterwards
- mapping_results = list(filter(
- lambda mapping: mapping is not None,
- p.map(
- functools.partial(
- map_read,
- kmer_table=kmer_table,
- gene_sequences=sequences,
- substitution_matrix=substitution_matrix
+ mapping_results = list(
+ filter(
+ lambda mapping: mapping is not None,
+ p.map(
+ functools.partial(
+ map_read,
+ kmer_table=kmer_table,
+ gene_sequences=sequences,
+ substitution_matrix=substitution_matrix,
+ ),
+ read_iter(fastq_path),
),
- read_iter(fastq_path)
)
- ))
+ )
########################################################################
# Now the genes are counted:
@@ -324,7 +328,7 @@ def read_iter(fastq_path):
# Put into dataframe for prettier printing
counts = pd.DataFrame(
{"gene_symbol": ranked_gene_symbols, "count": ranked_counts},
- index = np.arange(1, len(ranked_counts) + 1)
+ index=np.arange(1, len(ranked_counts) + 1),
)
# Show Top N
@@ -335,10 +339,7 @@ def read_iter(fastq_path):
# Finally the top expressed genes are plotted.
figure, ax = plt.subplots(figsize=(8.0, 6.0), constrained_layout=True)
-ax.barh(
- top_counts["gene_symbol"], top_counts["count"],
- color=biotite.colors["orange"]
-)
+ax.barh(top_counts["gene_symbol"], top_counts["count"], color=biotite.colors["orange"])
ax.invert_yaxis()
ax.set_title(f"Top {N_TOP_LIST} expressed genes", weight="semibold")
ax.set_xlabel("Counts")
@@ -348,4 +349,4 @@ def read_iter(fastq_path):
# References
# ----------
#
-# .. footbibliography::
\ No newline at end of file
+# .. footbibliography::
diff --git a/doc/examples/scripts/sequence/sequencing/genome_assembly.py b/doc/examples/scripts/sequence/sequencing/genome_assembly.py
index bf5474fd9..29ab83656 100644
--- a/doc/examples/scripts/sequence/sequencing/genome_assembly.py
+++ b/doc/examples/scripts/sequence/sequencing/genome_assembly.py
@@ -1,4 +1,4 @@
-"""
+r"""
Comparative genome assembly
===========================
@@ -48,21 +48,20 @@
import itertools
import tempfile
from concurrent.futures import ProcessPoolExecutor
-import numpy as np
import matplotlib.pyplot as plt
-from matplotlib.lines import Line2D
+import numpy as np
from matplotlib.colors import LinearSegmentedColormap
+from matplotlib.lines import Line2D
import biotite
+import biotite.application.sra as sra
+import biotite.database.entrez as entrez
import biotite.sequence as seq
import biotite.sequence.align as align
+import biotite.sequence.graphics as graphics
import biotite.sequence.io as seqio
import biotite.sequence.io.fasta as fasta
import biotite.sequence.io.fastq as fastq
import biotite.sequence.io.genbank as gb
-import biotite.sequence.graphics as graphics
-import biotite.database.entrez as entrez
-import biotite.application.sra as sra
-
# Download the sequencing data
app = sra.FastqDumpApp("SRR13453793")
@@ -73,8 +72,9 @@
# There is only one read per spot
file_path = app.get_file_paths()[0]
fastq_file = fastq.FastqFile.read(file_path, offset="Sanger")
-reads = [seq.NucleotideSequence(seq_str)
- for seq_str, score_array in fastq_file.values()]
+reads = [
+ seq.NucleotideSequence(seq_str) for seq_str, score_array in fastq_file.values()
+]
score_arrays = [score_array for seq_str, score_array in fastq_file.values()]
print(f"Number of reads: {len(reads)}")
@@ -93,7 +93,8 @@
length_ax.hist(
[len(score_array) for score_array in score_arrays],
- bins=np.logspace(1, 5, N_BINS), color="gray"
+ bins=np.logspace(1, 5, N_BINS),
+ color="gray",
)
length_ax.set_xlabel("Read length")
length_ax.set_ylabel("Number of reads")
@@ -102,7 +103,8 @@
score_ax.hist(
[np.mean(score_array) for score_array in score_arrays],
- bins=N_BINS, color="gray",
+ bins=N_BINS,
+ color="gray",
)
score_ax.set_xlim(0, 30)
score_ax.set_xlabel("Phred score")
@@ -134,8 +136,10 @@
fig, ax = plt.subplots(figsize=(8.0, 4.0))
ax.fill_between(
# Value in megabases -> 1e-6
- np.arange(len(score_histogram)), score_histogram * 1e-6,
- linewidth=0, color="gray"
+ np.arange(len(score_histogram)),
+ score_histogram * 1e-6,
+ linewidth=0,
+ color="gray",
)
ax.set_xlim(
np.min(np.where(score_histogram > 0)[0]),
@@ -166,15 +170,14 @@
# Download and read the reference SARS-CoV-2 genome
orig_genome_file = entrez.fetch(
- "NC_045512", tempfile.gettempdir(), "gb",
- db_name="Nucleotide", ret_type="gb"
+ "NC_045512", tempfile.gettempdir(), "gb", db_name="Nucleotide", ret_type="gb"
)
orig_genome = seqio.load_sequence(orig_genome_file)
# Create complementary reads
-compl_reads = list(itertools.chain(
- *[(read, read.reverse(False).complement()) for read in reads]
-))
+compl_reads = list(
+ itertools.chain(*[(read, read.reverse(False).complement()) for read in reads])
+)
########################################################################
# To map the reads to their corresponding positions in the reference
@@ -239,19 +242,27 @@
read_length = len(compl_reads[INDEX])
# Find the correct diagonal for the example read
-diagonals = matches[:,2] - matches[:,0]
+diagonals = matches[:, 2] - matches[:, 0]
diag, counts = np.unique(diagonals, return_counts=True)
correct_diagonal = diag[np.argmax(counts)]
# Visualize the matches and the correct diagonal
fig, ax = plt.subplots(figsize=(8.0, 8.0))
ax.scatter(
- matches[:,0], matches[:,2],
- s=4, marker="o", color=biotite.colors["dimorange"], label="Match"
+ matches[:, 0],
+ matches[:, 2],
+ s=4,
+ marker="o",
+ color=biotite.colors["dimorange"],
+ label="Match",
)
ax.plot(
- [0, read_length], [correct_diagonal, read_length+correct_diagonal],
- linestyle=":", linewidth=1.0, color="black", label="Correct diagonal"
+ [0, read_length],
+ [correct_diagonal, read_length + correct_diagonal],
+ linestyle=":",
+ linewidth=1.0,
+ color="black",
+ label="Correct diagonal",
)
ax.set_xlim(0, read_length)
ax.set_xlabel("Read position")
@@ -263,7 +274,7 @@
# Find the correct diagonal for all reads
correct_diagonals = [None] * len(all_matches)
for i, matches in enumerate(all_matches):
- diagonals = matches[:,2] - matches[:,0]
+ diagonals = matches[:, 2] - matches[:, 0]
unqiue_diag, counts = np.unique(diagonals, return_counts=True)
if len(unqiue_diag) == 0:
# If no match is found for this sequence, ignore this sequence
@@ -325,23 +336,28 @@
matrix = align.SubstitutionMatrix.std_nucleotide_matrix()
+
def map_sequence(read, diag):
deviation = int(3 * np.sqrt(len(read) * P_INDEL))
if diag is None:
return None
else:
return align.align_banded(
- read, orig_genome, matrix, gap_penalty=-10,
- band = (diag - deviation, diag + deviation),
- max_number = 1
+ read,
+ orig_genome,
+ matrix,
+ gap_penalty=-10,
+ band=(diag - deviation, diag + deviation),
+ max_number=1,
)[0]
+
# Each process can be quite memory consuming
# -> Cap to two processes to make it work on low-RAM commodity hardware
with ProcessPoolExecutor(max_workers=2) as executor:
- alignments = list(executor.map(
- map_sequence, compl_reads, correct_diagonals, chunksize=1000
- ))
+ alignments = list(
+ executor.map(map_sequence, compl_reads, correct_diagonals, chunksize=1000)
+ )
########################################################################
# Now we have to select for each read, whether the original or
@@ -351,18 +367,25 @@ def map_sequence(read, diag):
for_alignments = [alignments[i] for i in range(0, len(alignments), 2)]
rev_alignments = [alignments[i] for i in range(1, len(alignments), 2)]
-scores = np.stack((
- [ali.score if ali is not None else 0 for ali in for_alignments],
- [ali.score if ali is not None else 0 for ali in rev_alignments]
-),axis=-1)
+scores = np.stack(
+ (
+ [ali.score if ali is not None else 0 for ali in for_alignments],
+ [ali.score if ali is not None else 0 for ali in rev_alignments],
+ ),
+ axis=-1,
+)
correct_sense = np.argmax(scores, axis=-1)
-correct_alignments = [for_a if sense == 0 else rev_a for for_a, rev_a, sense
- in zip(for_alignments, rev_alignments, correct_sense)]
+correct_alignments = [
+ for_a if sense == 0 else rev_a
+ for for_a, rev_a, sense in zip(for_alignments, rev_alignments, correct_sense)
+]
# If we use a reverse complementary read,
# we also need to reverse the Phred score arrays
-correct_score_arrays = [score if sense == 0 else score[::-1] for score, sense
- in zip(score_arrays, correct_sense)]
+correct_score_arrays = [
+ score if sense == 0 else score[::-1]
+ for score, sense in zip(score_arrays, correct_sense)
+]
########################################################################
# Now we know for each read where its corresponding position on the
@@ -371,12 +394,8 @@ def map_sequence(read, diag):
# Eventually, we visualize the mapping.
# Find genome positions for the starts and ends of all reads
-starts = np.array(
- [ali.trace[ 0, 1] for ali in correct_alignments if ali is not None]
-)
-stops = np.array(
- [ali.trace[-1, 1] for ali in correct_alignments if ali is not None]
-)
+starts = np.array([ali.trace[0, 1] for ali in correct_alignments if ali is not None])
+stops = np.array([ali.trace[-1, 1] for ali in correct_alignments if ali is not None])
# For a nicer plot sort these by their start position
order = np.argsort(starts)
starts = starts[order]
@@ -384,13 +403,17 @@ def map_sequence(read, diag):
fig, ax = plt.subplots(figsize=(8.0, 12.0))
ax.barh(
- np.arange(len(starts)), left=starts, width=stops-starts, height=1,
- color=biotite.colors["dimgreen"], linewidth=0
+ np.arange(len(starts)),
+ left=starts,
+ width=stops - starts,
+ height=1,
+ color=biotite.colors["dimgreen"],
+ linewidth=0,
)
-ax.set_ylim(0, len(starts)+1)
-ax.spines['top'].set_visible(False)
-ax.spines['right'].set_visible(False)
-ax.spines['left'].set_visible(False)
+ax.set_ylim(0, len(starts) + 1)
+ax.spines["top"].set_visible(False)
+ax.spines["right"].set_visible(False)
+ax.spines["left"].set_visible(False)
ax.tick_params(left=False, labelleft=False)
ax.set_xlabel("Sequence position")
ax.set_title("Read mappings to reference genome")
@@ -479,24 +502,21 @@ def map_sequence(read, diag):
if alignment is not None:
trace = alignment.trace
- no_gap_trace = trace[(trace[:,0] != -1) & (trace[:,1] != -1)]
+ no_gap_trace = trace[(trace[:, 0] != -1) & (trace[:, 1] != -1)]
# Get the sequence code for the aligned read symbols
- seq_code = alignment.sequences[0].code[no_gap_trace[:,0]]
+ seq_code = alignment.sequences[0].code[no_gap_trace[:, 0]]
# The sequence code contains the integers 0 - 3;
# one for each possible base
# Hence, we can use these integers directly to index the second
# dimension of the Pred score sum
# The index for the first dimension contains simply the genome
# positions taken from the alignment trace
- phred_sum[no_gap_trace[:,1], seq_code] \
- += score_array[no_gap_trace[:,0]]
+ phred_sum[no_gap_trace[:, 1], seq_code] += score_array[no_gap_trace[:, 0]]
- sequencing_depth[
- trace[0,1] : trace[-1,1]
- ] += 1
+ sequencing_depth[trace[0, 1] : trace[-1, 1]] += 1
- read_gap_trace = trace[trace[:,0] == -1]
- deletion_number[read_gap_trace[:,1]] += 1
+ read_gap_trace = trace[trace[:, 0] == -1]
+ deletion_number[read_gap_trace[:, 1]] += 1
# Call the most probable base for each genome position according to the
# formula above
@@ -504,23 +524,21 @@ def map_sequence(read, diag):
# Visualize the sequencing depth and score sum over the genome
-max_phred_sum = phred_sum[
- np.arange(len(phred_sum)), most_probable_symbol_codes
-]
+max_phred_sum = phred_sum[np.arange(len(phred_sum)), most_probable_symbol_codes]
+
def moving_average(data_set, window_size):
- weights = np.full(window_size, 1/window_size)
- return np.convolve(data_set, weights, mode='valid')
+ weights = np.full(window_size, 1 / window_size)
+ return np.convolve(data_set, weights, mode="valid")
+
fig, ax = plt.subplots(figsize=(8.0, 4.0))
-ax.plot(
- moving_average(max_phred_sum, 100),
- color="lightgray", linewidth=1.0
-)
+ax.plot(moving_average(max_phred_sum, 100), color="lightgray", linewidth=1.0)
ax2 = ax.twinx()
ax2.plot(
moving_average(sequencing_depth, 100),
- color=biotite.colors["dimorange"], linewidth=1.0
+ color=biotite.colors["dimorange"],
+ linewidth=1.0,
)
ax.axhline(0, color="silver", linewidth=0.5)
ax.set_xlim(0, len(orig_genome))
@@ -528,10 +546,9 @@ def moving_average(data_set, window_size):
ax.set_ylabel("Phred score sum")
ax2.set_ylabel("Sequencing depth")
ax.legend(
- [Line2D([0], [0], color=c)
- for c in ("lightgray", biotite.colors["dimorange"])],
+ [Line2D([0], [0], color=c) for c in ("lightgray", biotite.colors["dimorange"])],
["Phred score sum", "Sequencing depth"],
- loc="upper left"
+ loc="upper left",
)
fig.tight_layout()
@@ -551,14 +568,13 @@ def moving_average(data_set, window_size):
var_genome.code = most_probable_symbol_codes
# A deletion is called, if either enough reads include this deletion
# or the sequence position is not covered by any read at all
-deletion_mask = (deletion_number > sequencing_depth * DELETION_THRESHOLD) \
- | (sequencing_depth == 0)
+deletion_mask = (deletion_number > sequencing_depth * DELETION_THRESHOLD) | (
+ sequencing_depth == 0
+)
var_genome = var_genome[~deletion_mask]
# Write the assembled genome into a FASTA file
out_file = fasta.FastaFile()
-fasta.set_sequence(
- out_file, var_genome, header="SARS-CoV-2 B.1.1.7", as_rna=True
-)
+fasta.set_sequence(out_file, var_genome, header="SARS-CoV-2 B.1.1.7", as_rna=True)
out_file.write(tempfile.NamedTemporaryFile("w"))
########################################################################
@@ -578,10 +594,13 @@ def moving_average(data_set, window_size):
BAND_WIDTH = 1000
genome_alignment = align.align_banded(
- var_genome, orig_genome, matrix,
- band=(-BAND_WIDTH//2, BAND_WIDTH//2), max_number=1
+ var_genome,
+ orig_genome,
+ matrix,
+ band=(-BAND_WIDTH // 2, BAND_WIDTH // 2),
+ max_number=1,
)[0]
-identity = align.get_sequence_identity(genome_alignment, 'all')
+identity = align.get_sequence_identity(genome_alignment, "all")
print(f"Sequence identity: {identity * 100:.2f} %")
########################################################################
@@ -599,9 +618,9 @@ def moving_average(data_set, window_size):
# Calculate the sequence identity within each bin
bin_identities = np.zeros(N_BINS)
-edges = np.linspace(0, len(orig_genome), N_BINS+1)
+edges = np.linspace(0, len(orig_genome), N_BINS + 1)
for i, (bin_start, bin_stop) in enumerate(zip(edges[:-1], edges[1:])):
- orig_genome_trace = genome_alignment.trace[:,1]
+ orig_genome_trace = genome_alignment.trace[:, 1]
excerpt = genome_alignment[
(orig_genome_trace >= bin_start) & (orig_genome_trace < bin_stop)
]
@@ -612,9 +631,11 @@ def moving_average(data_set, window_size):
# Plot the deviation = 1 - sequence identity
deviation_ax.bar(
- edges[:-1], width=(edges[1:]-edges[:-1]),
+ edges[:-1],
+ width=(edges[1:] - edges[:-1]),
height=(1 - bin_identities),
- color=biotite.colors["dimorange"], align="edge"
+ color=biotite.colors["dimorange"],
+ align="edge",
)
deviation_ax.set_xlim(0, len(orig_genome))
deviation_ax.set_ylabel("1 - Sequence identity")
@@ -623,20 +644,24 @@ def moving_average(data_set, window_size):
deviation_ax.set_ylim(1e-3, 1e-1)
# Plot genmic coordinates of the genes
-for i, feature in enumerate(sorted(
- annot_seq.annotation,
- key=lambda feature: min([loc.first for loc in feature.locs])
-)):
+for i, feature in enumerate(
+ sorted(
+ annot_seq.annotation,
+ key=lambda feature: min([loc.first for loc in feature.locs]),
+ )
+):
for loc in feature.locs:
feature_ax.barh(
- left=loc.first, width=loc.last-loc.first, y=i, height=1,
- color=biotite.colors["dimgreen"]
+ left=loc.first,
+ width=loc.last - loc.first,
+ y=i,
+ height=1,
+ color=biotite.colors["dimgreen"],
)
feature_ax.text(
- loc.last + 100, i, feature.qual["gene"],
- fontsize=8, ha="left", va="center"
+ loc.last + 100, i, feature.qual["gene"], fontsize=8, ha="left", va="center"
)
-feature_ax.set_ylim(i+0.5, -0.5)
+feature_ax.set_ylim(i + 0.5, -0.5)
feature_ax.set_xlim(0, len(orig_genome))
feature_ax.xaxis.set_visible(False)
feature_ax.yaxis.set_visible(False)
@@ -671,17 +696,17 @@ def moving_average(data_set, window_size):
# The locations of some notable spike protein regions
FEATURES = {
# Signal peptide
- "SP": ( 1, 12),
+ "SP": (1, 12),
# N-terminal domain
- "NTD": ( 14, 303),
+ "NTD": (14, 303),
# Receptor binding domain
- "RBD": ( 319, 541),
+ "RBD": (319, 541),
# Fusion peptide
- "FP": ( 788, 806),
+ "FP": (788, 806),
# Transmembrane domain
- "TM": (1214, 1234),
+ "TM": (1214, 1234),
# Cytoplasmatic tail
- "CT": (1269, 1273),
+ "CT": (1269, 1273),
}
# Get RNA sequence coding for spike protein from the reference genome
@@ -694,11 +719,11 @@ def moving_average(data_set, window_size):
alignment = align.align_optimal(
var_genome, orig_spike_seq, matrix, local=True, max_number=1
)[0]
-var_spike_seq = var_genome[alignment.trace[alignment.trace[:,0] != -1, 0]]
+var_spike_seq = var_genome[alignment.trace[alignment.trace[:, 0] != -1, 0]]
# Obtain protein sequences from RNA sequences
orig_spike_prot_seq = orig_spike_seq.translate(complete=True).remove_stops()
-var_spike_prot_seq = var_spike_seq.translate(complete=True).remove_stops()
+var_spike_prot_seq = var_spike_seq.translate(complete=True).remove_stops()
# Align both protein sequences with each other for later comparison
blosum_matrix = align.SubstitutionMatrix.std_protein_matrix()
@@ -712,47 +737,50 @@ def moving_average(data_set, window_size):
# Plot alignment
cmap = LinearSegmentedColormap.from_list(
- "custom", colors=[(1.0, 0.3, 0.3), (1.0, 1.0, 1.0)]
+ "custom",
+ colors=[(1.0, 0.3, 0.3), (1.0, 1.0, 1.0)],
# ^ reddish ^ white
)
graphics.plot_alignment_similarity_based(
- ax, alignment, matrix=blosum_matrix, symbols_per_line=SYMBOLS_PER_LINE,
- labels=["B.1.1.7", "Reference"], show_numbers=True, label_size=9,
- number_size=9, symbol_size=7, spacing=SPACING, cmap=cmap
+ ax,
+ alignment,
+ matrix=blosum_matrix,
+ symbols_per_line=SYMBOLS_PER_LINE,
+ labels=["B.1.1.7", "Reference"],
+ show_numbers=True,
+ label_size=9,
+ number_size=9,
+ symbol_size=7,
+ spacing=SPACING,
+ cmap=cmap,
)
## Add indicator for features to the alignment
for row in range(1 + len(alignment) // SYMBOLS_PER_LINE):
col_start = SYMBOLS_PER_LINE * row
- col_stop = SYMBOLS_PER_LINE * (row + 1)
+ col_stop = SYMBOLS_PER_LINE * (row + 1)
if col_stop > len(alignment):
# This happens in the last line
col_stop = len(alignment)
seq_start = alignment.trace[col_start, 1]
- seq_stop = alignment.trace[col_stop-1, 1] + 1
+ seq_stop = alignment.trace[col_stop - 1, 1] + 1
n_sequences = len(alignment.sequences)
y_base = (n_sequences + SPACING) * row + n_sequences
for feature_name, (first, last) in FEATURES.items():
# Zero based sequence indexing
- start = first-1
+ start = first - 1
# Exclusive stop
stop = last
if start < seq_stop and stop > seq_start:
# The feature is found in this line
x_begin = np.clip(start - seq_start, 0, SYMBOLS_PER_LINE)
- x_end = np.clip(stop - seq_start, 0, SYMBOLS_PER_LINE)
+ x_end = np.clip(stop - seq_start, 0, SYMBOLS_PER_LINE)
x_mean = (x_begin + x_end) / 2
y_line = y_base + 0.3
y_text = y_base + 0.6
- ax.plot(
- [x_begin, x_end], [y_line, y_line],
- color="black", linewidth=2
- )
- ax.text(
- x_mean, y_text, feature_name,
- fontsize=8, va="top", ha="center"
- )
+ ax.plot([x_begin, x_end], [y_line, y_line], color="black", linewidth=2)
+ ax.text(x_mean, y_text, feature_name, fontsize=8, va="top", ha="center")
# Increase y-limit to include the feature indicators in the last line
ax.set_ylim(y_text, 0)
fig.tight_layout()
diff --git a/doc/examples/scripts/sequence/sequencing/quality_control.py b/doc/examples/scripts/sequence/sequencing/quality_control.py
index b488f1aeb..f7b769071 100644
--- a/doc/examples/scripts/sequence/sequencing/quality_control.py
+++ b/doc/examples/scripts/sequence/sequencing/quality_control.py
@@ -20,14 +20,13 @@
# License: BSD 3 clause
# sphinx_gallery_thumbnail_number = 2
-import numpy as np
-from scipy.stats import binom
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
+import numpy as np
+from scipy.stats import binom
import biotite
-import biotite.sequence as seq
import biotite.application.sra as sra
-
+import biotite.sequence as seq
FIG_SIZE = (8.0, 6.0)
@@ -38,12 +37,10 @@
# Each run can have multiple reads per spot
# by selecting index 0 we take only the first read for every spot
sequences_and_scores = app.get_sequences_and_scores()[0]
-sequence_codes = np.stack([
- sequence.code for sequence, _ in sequences_and_scores.values()
-])
-scores = np.stack([
- scores for _, scores in sequences_and_scores.values()
-])
+sequence_codes = np.stack(
+ [sequence.code for sequence, _ in sequences_and_scores.values()]
+)
+scores = np.stack([scores for _, scores in sequences_and_scores.values()])
seq_count = scores.shape[0]
seq_length = scores.shape[1]
positions = np.arange(1, seq_length + 1)
@@ -56,20 +53,18 @@
# For the plot we need the first, second (the median) and third
# quartile for each position.
-first_quartile, median, third_quartile = np.quantile(
- scores, (0.25, 0.5, 0.75), axis=0
-)
+first_quartile, median, third_quartile = np.quantile(scores, (0.25, 0.5, 0.75), axis=0)
fig, ax = plt.subplots(figsize=FIG_SIZE)
ax.bar(
positions,
- bottom=first_quartile, height=third_quartile-first_quartile, width=1.0,
- facecolor=biotite.colors["brightorange"], label="Lower/upper quartile"
-)
-ax.plot(
- positions, median,
- color=biotite.colors["dimorange"], label="Median"
+ bottom=first_quartile,
+ height=third_quartile - first_quartile,
+ width=1.0,
+ facecolor=biotite.colors["brightorange"],
+ label="Lower/upper quartile",
)
+ax.plot(positions, median, color=biotite.colors["dimorange"], label="Median")
ax.set_xlim(positions[0], positions[-1])
ax.set_xlabel("Position in read")
ax.set_ylabel("Phred score")
@@ -92,15 +87,13 @@
fig, ax = plt.subplots(figsize=FIG_SIZE)
ax.hist(
# Definition range of Sanger Phred scores is 0 to 40
- mean_scores, bins=np.linspace(0, 40, BIN_NUMBER),
- color=biotite.colors["lightorange"]
+ mean_scores,
+ bins=np.linspace(0, 40, BIN_NUMBER),
+ color=biotite.colors["lightorange"],
)
ax.set_xlabel("Mean Phred score of sequence")
ax.set_ylabel("Sequence count")
-ax.set_xlim(
- np.floor(np.min(mean_scores)),
- np.ceil( np.max(mean_scores))
-)
+ax.set_xlim(np.floor(np.min(mean_scores)), np.ceil(np.max(mean_scores)))
fig.tight_layout()
########################################################################
@@ -115,10 +108,9 @@
# as ambiguous bases might occur in some sequencing datasets
alphabet = seq.NucleotideSequence.alphabet_amb
-counts = np.stack([
- np.bincount(codes, minlength=len(alphabet))
- for codes in sequence_codes.T
-], axis=-1)
+counts = np.stack(
+ [np.bincount(codes, minlength=len(alphabet)) for codes in sequence_codes.T], axis=-1
+)
frequencies = counts / seq_count * 100
fig, ax = plt.subplots(figsize=FIG_SIZE)
@@ -141,38 +133,30 @@
# distribution.
gc_count = np.count_nonzero(
- (sequence_codes == alphabet.encode("G")) |
- (sequence_codes == alphabet.encode("C")),
- axis=1
+ (sequence_codes == alphabet.encode("G")) | (sequence_codes == alphabet.encode("C")),
+ axis=1,
)
at_count = np.count_nonzero(
- (sequence_codes == alphabet.encode("A")) |
- (sequence_codes == alphabet.encode("T")),
- axis=1
+ (sequence_codes == alphabet.encode("A")) | (sequence_codes == alphabet.encode("T")),
+ axis=1,
)
gc_content = gc_count / (gc_count + at_count)
# Exclusive range -> 0 to seq_length inclusive
-number_of_gc = np.arange(seq_length+1)
-exp_gc_content = binom.pmf(
- k=number_of_gc,
- n=seq_length,
- p=np.mean(gc_content)
-)
+number_of_gc = np.arange(seq_length + 1)
+exp_gc_content = binom.pmf(k=number_of_gc, n=seq_length, p=np.mean(gc_content))
fig, ax = plt.subplots(figsize=FIG_SIZE)
# Due to finite sequence length, the distribution is discrete
# -> use bar() instead of hist()
values, counts = np.unique(gc_content, return_counts=True)
bin_width = 100 / seq_length
-ax.bar(
- values * 100, counts, width=bin_width,
- color=biotite.colors["brightorange"]
-)
+ax.bar(values * 100, counts, width=bin_width, color=biotite.colors["brightorange"])
ax.plot(
number_of_gc / seq_length * 100,
exp_gc_content * seq_count,
- color=biotite.colors["dimorange"], linestyle=":"
+ color=biotite.colors["dimorange"],
+ linestyle=":",
)
ax.set_xlim(0, 100)
ax.set_xlabel("Sequence GC content (%)")
@@ -201,11 +185,9 @@
duplications[code] = 1
duplication_level_count = np.bincount(list(duplications.values()))
duplication_level_freq = (
- duplication_level_count
- * np.arange(len(duplication_level_count))
- / seq_count * 100
+ duplication_level_count * np.arange(len(duplication_level_count)) / seq_count * 100
)
-max_duplication = len(duplication_level_count)-1
+max_duplication = len(duplication_level_count) - 1
print("Maximum duplication number:", max_duplication)
fig, ax = plt.subplots(figsize=FIG_SIZE)
@@ -213,7 +195,7 @@
np.arange(0, len(duplication_level_freq)),
duplication_level_freq,
width=0.6,
- color=biotite.colors["dimorange"]
+ color=biotite.colors["dimorange"],
)
ax.set_xlim(0.5, len(duplication_level_freq) + 0.5)
ax.xaxis.set_major_locator(ticker.MaxNLocator(10))
@@ -228,4 +210,4 @@
# Usually one would expect, that most sequences occur only once and the
# following duplication numbers become decreasingly likely.
# However, in this case we have another peak at around 60 duplications.
-# And one read is even repeated astonishing 161 times!
\ No newline at end of file
+# And one read is even repeated an astonishing 161 times!
diff --git a/doc/examples/scripts/sequence/sequencing/read_quality.py b/doc/examples/scripts/sequence/sequencing/read_quality.py
index 2cc12d492..47b89ddc4 100644
--- a/doc/examples/scripts/sequence/sequencing/read_quality.py
+++ b/doc/examples/scripts/sequence/sequencing/read_quality.py
@@ -10,13 +10,11 @@
# License: BSD 3 clause
from io import StringIO
-import numpy as np
import matplotlib.pyplot as plt
+import numpy as np
import biotite
-import biotite.sequence as seq
import biotite.sequence.io.fastq as fastq
-
# Sample FASTQ file from https://en.wikipedia.org/wiki/FASTQ_format
fastq_content = StringIO("""
@SEQ_ID
@@ -30,8 +28,12 @@
sequence, scores = fastq.get_sequence(fastq_file, "SEQ_ID")
figure, ax = plt.subplots(figsize=(8.0, 2.0))
ax.bar(
- x=np.arange(len(sequence)), height=scores, color=biotite.colors["orange"],
- width=1.0, linewidth=1, edgecolor="white"
+ x=np.arange(len(sequence)),
+ height=scores,
+ color=biotite.colors["orange"],
+ width=1.0,
+ linewidth=1,
+ edgecolor="white",
)
# -1 to put space between Y-axis and sequence
ax.set_xlim(-1, len(sequence))
@@ -44,6 +46,6 @@
# Show sequence as X-axis ticks
ax.set_xticks(np.arange(len(sequence)))
ax.set_xticklabels(sequence.symbols)
-ax.xaxis.set_ticks_position("none")
+ax.xaxis.set_ticks_position("none")
figure.tight_layout()
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/structure/contacts/adjacency_matrix.py b/doc/examples/scripts/structure/contacts/adjacency_matrix.py
index 2f5b9594a..d51a423bb 100644
--- a/doc/examples/scripts/structure/contacts/adjacency_matrix.py
+++ b/doc/examples/scripts/structure/contacts/adjacency_matrix.py
@@ -12,13 +12,12 @@
# License: BSD 3 clause
from tempfile import gettempdir
+import matplotlib.pyplot as plt
+from matplotlib.colors import ListedColormap
import biotite
+import biotite.database.rcsb as rcsb
import biotite.structure as struc
import biotite.structure.io as strucio
-import biotite.database.rcsb as rcsb
-import matplotlib.pyplot as plt
-from matplotlib.colors import ListedColormap
-
file_name = rcsb.fetch("1aki", "bcif", gettempdir())
array = strucio.load_structure(file_name)
@@ -41,4 +40,4 @@
ax.set_ylabel("Residue number")
ax.set_title("Adjacency matrix of the lysozyme crystal structure")
figure.tight_layout()
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/structure/contacts/contact_sites.py b/doc/examples/scripts/structure/contacts/contact_sites.py
index e7673983c..94fb7d975 100644
--- a/doc/examples/scripts/structure/contacts/contact_sites.py
+++ b/doc/examples/scripts/structure/contacts/contact_sites.py
@@ -14,10 +14,9 @@
# License: BSD 3 clause
import numpy as np
+import biotite.database.rcsb as rcsb
import biotite.structure as struc
import biotite.structure.io.pdbx as pdbx
-import biotite.database.rcsb as rcsb
-
# The maximum distance between an atom in the repressor and an atom in
# the DNA for them to be considered 'in contact'
@@ -30,15 +29,9 @@
# Separate structure into the DNA and the two identical protein chains
-dna = structure[
- np.isin(structure.chain_id, ["A", "B"]) & (structure.hetero == False)
-]
-protein_l = structure[
- (structure.chain_id == "L") & (structure.hetero == False)
-]
-protein_r = structure[
- (structure.chain_id == "R") & (structure.hetero == False)
-]
+# `hetero` is a boolean annotation array, so it is negated element-wise
+# with `~`; an identity test (`is False`) would give the scalar `False`
+# and thereby empty every selection below
+dna = structure[np.isin(structure.chain_id, ["A", "B"]) & ~structure.hetero]
+protein_l = structure[(structure.chain_id == "L") & ~structure.hetero]
+protein_r = structure[(structure.chain_id == "R") & ~structure.hetero]
# Quick check if the two protein chains are really identical
assert len(struc.get_residues(protein_l)) == len(struc.get_residues(protein_r))
diff --git a/doc/examples/scripts/structure/contacts/contact_sites_pymol.py b/doc/examples/scripts/structure/contacts/contact_sites_pymol.py
index 5e51c9a80..a4eb3622e 100644
--- a/doc/examples/scripts/structure/contacts/contact_sites_pymol.py
+++ b/doc/examples/scripts/structure/contacts/contact_sites_pymol.py
@@ -1,9 +1,8 @@
+import ammolite
import numpy as np
from matplotlib.colors import to_rgb
import biotite
import biotite.structure as struc
-import ammolite
-
PNG_SIZE = (1000, 550)
@@ -15,10 +14,7 @@
# Define colors
for color_name, color_value in biotite.colors.items():
- ammolite.cmd.set_color(
- "biotite_" + color_name,
- to_rgb(color_value)
- )
+ ammolite.cmd.set_color("biotite_" + color_name, to_rgb(color_value))
# Add bonds to structure and convert to PyMOL
structure = structure[~struc.filter_solvent(structure)]
@@ -31,32 +27,38 @@
pymol_obj.color("biotite_lightgreen", structure.chain_id == "R")
# Set view
-ammolite.cmd.set_view((
- -0.044524662, 0.767611504, 0.639355302,
- 0.998693943, 0.018437184, 0.047413416,
- 0.024606399, 0.640637815, -0.767439663,
- 0.000000000, 0.000000000, -115.614288330,
- 56.031833649, 23.317802429, 3.761308193,
- 73.517341614, 157.711288452, -20.000000000
-))
+ammolite.cmd.set_view(
+ (
+ -0.044524662,
+ 0.767611504,
+ 0.639355302,
+ 0.998693943,
+ 0.018437184,
+ 0.047413416,
+ 0.024606399,
+ 0.640637815,
+ -0.767439663,
+ 0.000000000,
+ 0.000000000,
+ -115.614288330,
+ 56.031833649,
+ 23.317802429,
+ 3.761308193,
+ 73.517341614,
+ 157.711288452,
+ -20.000000000,
+ )
+)
# Highlight contacts
residue_mask = np.isin(structure.res_id, common_ids)
-pymol_obj.show(
- "sticks",
- np.isin(structure.chain_id, ["L", "R"]) & residue_mask
-)
-for chain, color in zip(
- ("L", "R"),
- ("biotite_dimorange","biotite_darkgreen")
-):
+pymol_obj.show("sticks", np.isin(structure.chain_id, ["L", "R"]) & residue_mask)
+for chain, color in zip(("L", "R"), ("biotite_dimorange", "biotite_darkgreen")):
pymol_obj.color(
color,
- (structure.chain_id == chain) &
- (structure.atom_name != "CA") &
- residue_mask
+ (structure.chain_id == chain) & (structure.atom_name != "CA") & residue_mask,
)
# Save image
ammolite.cmd.ray(*PNG_SIZE)
-ammolite.cmd.png(__image_destination__)
\ No newline at end of file
+ammolite.cmd.png(__image_destination__)
diff --git a/doc/examples/scripts/structure/contacts/disulfide_bonds.py b/doc/examples/scripts/structure/contacts/disulfide_bonds.py
index 8d99a675d..e87e33647 100644
--- a/doc/examples/scripts/structure/contacts/disulfide_bonds.py
+++ b/doc/examples/scripts/structure/contacts/disulfide_bonds.py
@@ -19,28 +19,26 @@
import io
from tempfile import gettempdir
-import numpy as np
-import matplotlib.pyplot as plt
import matplotlib.patches as patches
+import matplotlib.pyplot as plt
+import numpy as np
+import biotite.database.rcsb as rcsb
import biotite.sequence as seq
import biotite.structure as struc
import biotite.structure.io.pdbx as pdbx
-import biotite.database.rcsb as rcsb
-def detect_disulfide_bonds(structure, distance=2.05, distance_tol=0.05,
- dihedral=90, dihedral_tol=10):
+def detect_disulfide_bonds(
+ structure, distance=2.05, distance_tol=0.05, dihedral=90, dihedral_tol=10
+):
# Array where detected disulfide bonds are stored
disulfide_bonds = []
# A mask that selects only S-gamma atoms of cysteins
- sulfide_mask = (structure.res_name == "CYS") & \
- (structure.atom_name == "SG")
+ sulfide_mask = (structure.res_name == "CYS") & (structure.atom_name == "SG")
# sulfides in adjacency to other sulfides are detected in an
# efficient manner via a cell list
cell_list = struc.CellList(
- structure,
- cell_size=distance+distance_tol,
- selection=sulfide_mask
+ structure, cell_size=distance + distance_tol, selection=sulfide_mask
)
# Iterate over every index corresponding to an S-gamma atom
for sulfide_i in np.where(sulfide_mask)[0]:
@@ -65,31 +63,34 @@ def detect_disulfide_bonds(structure, distance=2.05, distance_tol=0.05,
# For dihedral angle measurement the corresponding
# C-beta atoms are required, too
cb1 = structure[
- (structure.chain_id == sg1.chain_id) &
- (structure.res_id == sg1.res_id) &
- (structure.atom_name == "CB")
+ (structure.chain_id == sg1.chain_id)
+ & (structure.res_id == sg1.res_id)
+ & (structure.atom_name == "CB")
]
cb2 = structure[
- (structure.chain_id == sg2.chain_id) &
- (structure.res_id == sg2.res_id) &
- (structure.atom_name == "CB")
+ (structure.chain_id == sg2.chain_id)
+ & (structure.res_id == sg2.res_id)
+ & (structure.atom_name == "CB")
]
# Measure distance and dihedral angle and check criteria
bond_dist = struc.distance(sg1, sg2)
bond_dihed = np.abs(np.rad2deg(struc.dihedral(cb1, sg1, sg2, cb2)))
- if bond_dist > distance - distance_tol and \
- bond_dist < distance + distance_tol and \
- bond_dihed > dihedral - dihedral_tol and \
- bond_dihed < dihedral + dihedral_tol:
- # Atom meet criteria -> we found a disulfide bond
- # -> the indices of the bond S-gamma atoms
- # are put into a tuple with the lower index first
- bond_tuple = sorted((sulfide_i, sulfide_j))
- # Add bond to list of bonds, but each bond only once
- if bond_tuple not in disulfide_bonds:
- disulfide_bonds.append(bond_tuple)
+ if (
+ bond_dist > distance - distance_tol
+ and bond_dist < distance + distance_tol
+ and bond_dihed > dihedral - dihedral_tol
+ and bond_dihed < dihedral + dihedral_tol
+ ):
+ # Atoms meet the criteria -> we found a disulfide bond
+ # -> the indices of the bonded S-gamma atoms
+ # are put into a sorted pair with the lower index first
+ bond_tuple = sorted((sulfide_i, sulfide_j))
+ # Add bond to list of bonds, but each bond only once
+ if bond_tuple not in disulfide_bonds:
+ disulfide_bonds.append(bond_tuple)
return np.array(disulfide_bonds, dtype=int)
+
########################################################################
# As test case a structure of a *cysteine knot* protein is used,
# specifically the squash trypsin inhibitor *EETI-II*
@@ -104,19 +105,15 @@ def detect_disulfide_bonds(structure, distance=2.05, distance_tol=0.05,
# For later verification that the implemented function works correctly,
# the disulfide bonds, that are removed, are printed out.
-pdbx_file = pdbx.BinaryCIFFile.read(
- rcsb.fetch("2IT7", "bcif", gettempdir())
-)
+pdbx_file = pdbx.BinaryCIFFile.read(rcsb.fetch("2IT7", "bcif", gettempdir()))
knottin = pdbx.get_structure(pdbx_file, include_bonds=True, model=1)
-sulfide_indices = np.where(
- (knottin.res_name == "CYS") & (knottin.atom_name == "SG")
-)[0]
+sulfide_indices = np.where((knottin.res_name == "CYS") & (knottin.atom_name == "SG"))[0]
for i, j, _ in knottin.bonds.as_array():
if i in sulfide_indices and j in sulfide_indices:
print(knottin[i])
print(knottin[j])
print()
- knottin.bonds.remove_bond(i,j)
+ knottin.bonds.remove_bond(i, j)
########################################################################
# Now the sanitized structure is put into the disulfide detection
@@ -143,13 +140,11 @@ def detect_disulfide_bonds(structure, distance=2.05, distance_tol=0.05,
figure = plt.figure(figsize=(4.0, 1.0))
ax = figure.gca()
MARGIN = 0.2
-ax.set_xlim(1-MARGIN, len(sequence)+MARGIN)
-ax.set_ylim(0, 1+MARGIN)
-ax.set_xticks(np.arange(1, len(sequence)+1))
+ax.set_xlim(1 - MARGIN, len(sequence) + MARGIN)
+ax.set_ylim(0, 1 + MARGIN)
+ax.set_xticks(np.arange(1, len(sequence) + 1))
ax.set_xticklabels(str(sequence))
-ax.yaxis.set_tick_params(
- left=False, right=False, labelleft=False, labelright=False
-)
+ax.yaxis.set_tick_params(left=False, right=False, labelleft=False, labelright=False)
ax.xaxis.set_tick_params(
bottom=True, top=False, labelbottom=True, labeltop=False, width=0
)
@@ -161,10 +156,16 @@ def detect_disulfide_bonds(structure, distance=2.05, distance_tol=0.05,
ellipse_width = sg2_res_id - sg1_res_id
# Height is 2 instead of 1,
# because only the upper half of the ellipse is visible
- ax.add_patch(patches.Ellipse(
- xy=(ellipse_center, 0), width=ellipse_width, height=2,
- facecolor="None", edgecolor="gold", linewidth=2
- ))
+ ax.add_patch(
+ patches.Ellipse(
+ xy=(ellipse_center, 0),
+ width=ellipse_width,
+ height=2,
+ facecolor="None",
+ edgecolor="gold",
+ linewidth=2,
+ )
+ )
figure.tight_layout()
########################################################################
@@ -180,4 +181,4 @@ def detect_disulfide_bonds(structure, distance=2.05, distance_tol=0.05,
pdbx.set_structure(out_file, knottin)
out_file.write(io.BytesIO())
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/structure/contacts/domain_hbonds.py b/doc/examples/scripts/structure/contacts/domain_hbonds.py
index 03bbb75d7..93550583a 100644
--- a/doc/examples/scripts/structure/contacts/domain_hbonds.py
+++ b/doc/examples/scripts/structure/contacts/domain_hbonds.py
@@ -15,10 +15,9 @@
from tempfile import gettempdir
import matplotlib.pyplot as plt
import biotite
+import biotite.database.rcsb as rcsb
import biotite.structure as struc
import biotite.structure.io as strucio
-import biotite.database.rcsb as rcsb
-
file_name = rcsb.fetch("2KB1", "bcif", gettempdir())
stack = strucio.load_structure(file_name)
@@ -35,19 +34,22 @@
# Create names of bonds
label = "{d_resid}{d_resnm}-{d_a} -- {a_resid}{a_resnm}-{a_a}"
-names = [label.format(
- d_resid=chain_a.res_id[donor],
- d_resnm=chain_a.res_name[donor],
- d_a=chain_a.atom_name[donor],
- a_resid=chain_a.res_id[acceptor],
- a_resnm=chain_a.res_name[acceptor],
- a_a=chain_a.atom_name[acceptor]
- ) for donor, _, acceptor in triplets]
-
-plt.subplots(figsize=(11,4.5))
+names = [
+ label.format(
+ d_resid=chain_a.res_id[donor],
+ d_resnm=chain_a.res_name[donor],
+ d_a=chain_a.atom_name[donor],
+ a_resid=chain_a.res_id[acceptor],
+ a_resnm=chain_a.res_name[acceptor],
+ a_a=chain_a.atom_name[acceptor],
+ )
+ for donor, _, acceptor in triplets
+]
+
+plt.subplots(figsize=(11, 4.5))
plt.bar(names, freq, color=biotite.colors["orange"])
plt.xlabel("Hydrogen bond")
plt.ylabel("Hydrogen bond frequency")
plt.xticks(rotation=45)
plt.tight_layout()
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/structure/contacts/leaflet.py b/doc/examples/scripts/structure/contacts/leaflet.py
index 41c184e69..1a8655ecd 100644
--- a/doc/examples/scripts/structure/contacts/leaflet.py
+++ b/doc/examples/scripts/structure/contacts/leaflet.py
@@ -21,10 +21,10 @@
# Code source: Patrick Kunzmann
# License: BSD 3 clause
-from tempfile import NamedTemporaryFile
import warnings
-import numpy as np
+from tempfile import NamedTemporaryFile
import networkx as nx
+import numpy as np
import biotite.structure as struc
import biotite.structure.io as strucio
@@ -33,8 +33,7 @@
PDB_FILE_PATH = "../../../download/dppc_n128.pdb"
-def find_leaflets(structure, head_atom_mask,
- cutoff_distance=15.0, periodic=False):
+def find_leaflets(structure, head_atom_mask, cutoff_distance=15.0, periodic=False):
"""
Identify which lipids molecules belong to the same lipid bilayer
leaflet.
@@ -64,28 +63,29 @@ def find_leaflets(structure, head_atom_mask,
"""
cell_list = struc.CellList(
- structure, cell_size=cutoff_distance, selection=head_atom_mask,
- periodic=periodic
+ structure,
+ cell_size=cutoff_distance,
+ selection=head_atom_mask,
+ periodic=periodic,
)
adjacency_matrix = cell_list.create_adjacency_matrix(cutoff_distance)
graph = nx.Graph(adjacency_matrix)
- head_leaflets = [sorted(c) for c in nx.connected_components(graph)
- # A leaflet cannot consist of a single lipid
- # This also removes all entries
- # for atoms not in 'head_atom_mask'
- if len(c) > 1]
+ head_leaflets = [
+ sorted(c)
+ for c in nx.connected_components(graph)
+ # A leaflet cannot consist of a single lipid
+ # This also removes all entries
+ # for atoms not in 'head_atom_mask'
+ if len(c) > 1
+ ]
# 'leaflets' contains indices to head atoms
# Broadcast each head atom index to all atoms in its corresponding
# residue
- leaflet_masks = np.empty(
- (len(head_leaflets), structure.array_length()),
- dtype=bool
- )
+ leaflet_masks = np.empty((len(head_leaflets), structure.array_length()), dtype=bool)
for i, head_leaflet in enumerate(head_leaflets):
- leaflet_masks[i] = struc.get_residue_masks(structure, head_leaflet) \
- .any(axis=0)
+ leaflet_masks[i] = struc.get_residue_masks(structure, head_leaflet).any(axis=0)
return leaflet_masks
@@ -100,7 +100,7 @@ def find_leaflets(structure, head_atom_mask,
# periodicity should not matter
leaflets = find_leaflets(
structure,
- head_atom_mask=(structure.res_name == "DPP") & (structure.atom_name == "P")
+ head_atom_mask=(structure.res_name == "DPP") & (structure.atom_name == "P"),
)
# Bilayer -> Expect two leaflets
assert len(leaflets) == 2
diff --git a/doc/examples/scripts/structure/contacts/leaflet_pymol.py b/doc/examples/scripts/structure/contacts/leaflet_pymol.py
index 7678e53d5..59b2e98a0 100644
--- a/doc/examples/scripts/structure/contacts/leaflet_pymol.py
+++ b/doc/examples/scripts/structure/contacts/leaflet_pymol.py
@@ -1,9 +1,8 @@
+import ammolite
import numpy as np
from matplotlib.colors import to_rgb
import biotite
import biotite.structure as struc
-import ammolite
-
PNG_SIZE = (1000, 700)
@@ -14,15 +13,10 @@
# Define colors
for color_name, color_value in biotite.colors.items():
- ammolite.cmd.set_color(
- "biotite_" + color_name,
- to_rgb(color_value)
- )
+ ammolite.cmd.set_color("biotite_" + color_name, to_rgb(color_value))
# Remove hydrogen and water and convert to PyMOL
-structure = structure[
- (structure.element != "H") & (structure.res_name != "TIP")
-]
+structure = structure[(structure.element != "H") & (structure.res_name != "TIP")]
structure.bonds = struc.connect_via_distances(structure)
pymol_obj = ammolite.PyMOLObject.from_structure(structure)
@@ -33,16 +27,13 @@
# Configure lipid heads
pymol_obj.color(
- "biotite_darkgreen",
- (structure.chain_id == "A") & (structure.atom_name == "P")
+ "biotite_darkgreen", (structure.chain_id == "A") & (structure.atom_name == "P")
)
pymol_obj.color(
- "biotite_dimorange",
- (structure.chain_id == "B") & (structure.atom_name == "P")
+ "biotite_dimorange", (structure.chain_id == "B") & (structure.atom_name == "P")
)
pymol_obj.show(
- "spheres",
- np.isin(structure.chain_id, ("A", "B")) & (structure.atom_name == "P")
+ "spheres", np.isin(structure.chain_id, ("A", "B")) & (structure.atom_name == "P")
)
# Adjust camera
@@ -52,4 +43,4 @@
# Save image
ammolite.cmd.ray(*PNG_SIZE)
-ammolite.cmd.png(__image_destination__)
\ No newline at end of file
+ammolite.cmd.png(__image_destination__)
diff --git a/doc/examples/scripts/structure/misc/biological_assembly.py b/doc/examples/scripts/structure/misc/biological_assembly.py
index ee9fe79ab..83586dcca 100644
--- a/doc/examples/scripts/structure/misc/biological_assembly.py
+++ b/doc/examples/scripts/structure/misc/biological_assembly.py
@@ -38,11 +38,10 @@
# License: BSD 3 clause
from tempfile import NamedTemporaryFile
+import biotite.database.rcsb as rcsb
import biotite.structure as struc
-import biotite.structure.io.pdbx as pdbx
import biotite.structure.io as strucio
-import biotite.database.rcsb as rcsb
-
+import biotite.structure.io.pdbx as pdbx
pdbx_file = pdbx.BinaryCIFFile.read(rcsb.fetch("3J31", "bcif"))
@@ -77,4 +76,4 @@
# Visualization with PyMOL...
# sphinx_gallery_ammolite_script = "biological_assembly_pymol.py"
-temp.close()
\ No newline at end of file
+temp.close()
diff --git a/doc/examples/scripts/structure/misc/biological_assembly_pymol.py b/doc/examples/scripts/structure/misc/biological_assembly_pymol.py
index 377143fbd..3175b4eb9 100644
--- a/doc/examples/scripts/structure/misc/biological_assembly_pymol.py
+++ b/doc/examples/scripts/structure/misc/biological_assembly_pymol.py
@@ -1,8 +1,6 @@
-import numpy as np
-import matplotlib.pyplot as plt
-import biotite.structure as struc
import ammolite
-
+import matplotlib.pyplot as plt
+import numpy as np
PNG_SIZE = (1000, 1000)
@@ -21,4 +19,4 @@
# Save image
ammolite.cmd.ray(*PNG_SIZE)
-ammolite.cmd.png(__image_destination__)
\ No newline at end of file
+ammolite.cmd.png(__image_destination__)
diff --git a/doc/examples/scripts/structure/misc/diameter.py b/doc/examples/scripts/structure/misc/diameter.py
index 2f7154cd8..e428ccbd3 100644
--- a/doc/examples/scripts/structure/misc/diameter.py
+++ b/doc/examples/scripts/structure/misc/diameter.py
@@ -11,9 +11,10 @@
from tempfile import gettempdir
import numpy as np
+import biotite.database.rcsb as rcsb
import biotite.structure as struc
import biotite.structure.io as strucio
-import biotite.database.rcsb as rcsb
+
def get_diameter(pdb_id):
file_name = rcsb.fetch(pdb_id, "bcif", gettempdir())
@@ -24,10 +25,11 @@ def get_diameter(pdb_id):
# Calculate all pairwise difference vectors
diff = coord[:, np.newaxis, :] - coord[np.newaxis, :, :]
# Calculate absolute of difference vectors -> square distances
- sq_dist = np.sum(diff*diff, axis=-1)
+ sq_dist = np.sum(diff * diff, axis=-1)
# Maximum distance is diameter
diameter = np.sqrt(np.max(sq_dist))
return diameter
+
# Example application
-print("Diameter of 1QAW:", get_diameter("1QAW"), "Angstrom")
\ No newline at end of file
+print("Diameter of 1QAW:", get_diameter("1QAW"), "Angstrom")
diff --git a/doc/examples/scripts/structure/misc/gap_bars.py b/doc/examples/scripts/structure/misc/gap_bars.py
index 1ec9eb343..55024fd4e 100644
--- a/doc/examples/scripts/structure/misc/gap_bars.py
+++ b/doc/examples/scripts/structure/misc/gap_bars.py
@@ -16,11 +16,12 @@
# License: BSD 3 clause
from tempfile import gettempdir
-import biotite.structure.io as strucio
-import biotite.database.rcsb as rcsb
import matplotlib.pyplot as plt
-from matplotlib.patches import Rectangle
import numpy as np
+from matplotlib.patches import Rectangle
+import biotite.database.rcsb as rcsb
+import biotite.structure.io as strucio
+
def plot_gaps(pdb_id, chain_id, ax):
# Download and parse structure file
@@ -32,7 +33,7 @@ def plot_gaps(pdb_id, chain_id, ax):
states = np.zeros(atom_array.res_id[-1], dtype=int)
for i in range(len(states)):
# Get array for only one residue ID
- residue = atom_array[atom_array.res_id == i+1]
+ residue = atom_array[atom_array.res_id == i + 1]
if len(residue) == 0:
# not existing
states[i] = 0
@@ -52,7 +53,7 @@ def plot_gaps(pdb_id, chain_id, ax):
curr_start = i
curr_state = states[i]
else:
- if states[i] != states[i-1]:
+ if states[i] != states[i - 1]:
state_intervals.append((curr_start, i, curr_state))
curr_start = i
curr_state = states[i]
@@ -69,8 +70,11 @@ def plot_gaps(pdb_id, chain_id, ax):
color = "gold"
elif state == 2:
color = "forestgreen"
- ax.add_patch(Rectangle((start+1-0.5, 0), stop-start, 1,
- edgecolor="None", facecolor=color))
+ ax.add_patch(
+ Rectangle(
+ (start + 1 - 0.5, 0), stop - start, 1, edgecolor="None", facecolor=color
+ )
+ )
# Some other visual stuff
ax.spines["left"].set_visible(False)
ax.spines["bottom"].set_visible(False)
@@ -88,6 +92,6 @@ def plot_gaps(pdb_id, chain_id, ax):
ax = fig.add_subplot(212)
ax.set_title("5w1r", loc="left")
plot_gaps("5w1r", "A", ax)
-ax.set_xlabel("$Residue \ number$")
+ax.set_xlabel(r"$Residue \ number$")
fig.tight_layout()
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/structure/misc/glycan_visualization.py b/doc/examples/scripts/structure/misc/glycan_visualization.py
index 43bcccaf0..bd55ca1f7 100644
--- a/doc/examples/scripts/structure/misc/glycan_visualization.py
+++ b/doc/examples/scripts/structure/misc/glycan_visualization.py
@@ -18,21 +18,21 @@
# Code source: Patrick Kunzmann
# License: BSD 3 clause
-import numpy as np
import matplotlib.pyplot as plt
-from matplotlib.lines import Line2D
import networkx as nx
+import numpy as np
+from matplotlib.lines import Line2D
from networkx.drawing.nx_pydot import graphviz_layout
+import biotite.database.rcsb as rcsb
import biotite.sequence as seq
import biotite.structure as struc
import biotite.structure.io.pdbx as pdbx
-import biotite.database.rcsb as rcsb
-
# Adapted from "Mol*" Software
# The dictionary maps residue names of saccharides to their common names
SACCHARIDE_NAMES = {
- res_name : common_name for common_name, res_names in [
+ res_name: common_name
+ for common_name, res_names in [
("Glc", ["GLC", "BGC", "Z8T", "TRE", "MLR"]),
("Man", ["MAN", "BMA"]),
("Gal", ["GLA", "GAL", "GZL", "GXL", "GIV"]),
@@ -112,62 +112,51 @@
"All": ("o", "purple"),
"Tal": ("o", "lightsteelblue"),
"Ido": ("o", "chocolate"),
-
"GlcNAc": ("s", "royalblue"),
"ManNAc": ("s", "forestgreen"),
"GalNAc": ("s", "gold"),
"GulNAc": ("s", "darkorange"),
"AllNAc": ("s", "purple"),
"IdoNAc": ("s", "chocolate"),
-
"GlcN": ("1", "royalblue"),
"ManN": ("1", "forestgreen"),
"GalN": ("1", "gold"),
-
"GlcA": ("v", "royalblue"),
"ManA": ("v", "forestgreen"),
"GalA": ("v", "gold"),
"GulA": ("v", "darkorange"),
"TalA": ("v", "lightsteelblue"),
"IdoA": ("v", "chocolate"),
-
"Qui": ("^", "royalblue"),
"Rha": ("^", "forestgreen"),
"6dGul": ("^", "darkorange"),
"Fuc": ("^", "crimson"),
-
"QuiNAc": ("P", "royalblue"),
"FucNAc": ("P", "crimson"),
-
"Oli": ("X", "royalblue"),
"Tyv": ("X", "forestgreen"),
"Abe": ("X", "darkorange"),
"Par": ("X", "pink"),
"Dig": ("X", "purple"),
-
"Ara": ("*", "forestgreen"),
"Lyx": ("*", "gold"),
"Xyl": ("*", "darkorange"),
"Rib": ("*", "pink"),
-
"Kdn": ("D", "forestgreen"),
"Neu5Ac": ("D", "mediumvioletred"),
"Neu5Gc": ("D", "turquoise"),
-
"LDManHep": ("H", "forestgreen"),
"Kdo": ("H", "gold"),
"DDManHep": ("H", "pink"),
"MurNAc": ("H", "purple"),
"Mur": ("H", "chocolate"),
-
"Api": ("p", "royalblue"),
"Fru": ("p", "forestgreen"),
"Tag": ("p", "gold"),
"Sor": ("p", "darkorange"),
"Psi": ("p", "pink"),
-
# Default representation
- None: ("h", "black")
+ None: ("h", "black"),
}
#########################################################################
@@ -222,19 +211,22 @@
bonds = structure.bonds.as_array()[:, :2]
# Convert indices pointing to connected atoms to indices pointing to the
# starting atom of the respective residue
-connected = struc.get_residue_starts_for(
- structure, bonds.flatten()
-).reshape(bonds.shape)
+connected = struc.get_residue_starts_for(structure, bonds.flatten()).reshape(
+ bonds.shape
+)
# Omit bonds within the same residue
-connected = connected[connected[:,0] != connected[:,1]]
+connected = connected[connected[:, 0] != connected[:, 1]]
# Add the residue connections to the graph
graph.add_edges_from(connected)
fig, ax = plt.subplots(figsize=(8.0, 8.0))
nx.draw(
- graph, ax=ax, node_size=10,
- node_color=["crimson" if is_glycan[atom_i] else "royalblue"
- for atom_i in graph.nodes()]
+ graph,
+ ax=ax,
+ node_size=10,
+ node_color=[
+ "crimson" if is_glycan[atom_i] else "royalblue" for atom_i in graph.nodes()
+ ],
)
########################################################################
@@ -260,7 +252,8 @@
# Get connected subgraphs containing glycans
# -> any subgraph with more than one node
glycan_graphs = [
- graph.subgraph(nodes).copy() for nodes in nx.connected_components(graph)
+ graph.subgraph(nodes).copy()
+ for nodes in nx.connected_components(graph)
if len(nodes) > 1
]
@@ -297,14 +290,14 @@
# almost always an atom index that is lower than the saccharides
# attached to it
glycan_graph = nx.DiGraph(
- [(min(atom_i, atom_j), max(atom_i, atom_j))
- for atom_i, atom_j in glycan_graph.edges()]
+ [
+ (min(atom_i, atom_j), max(atom_i, atom_j))
+ for atom_i, atom_j in glycan_graph.edges()
+ ]
)
# The 'root' is the amino acid
- root = [
- atom_i for atom_i in glycan_graph.nodes() if is_amino_acid[atom_i]
- ]
+ root = [atom_i for atom_i in glycan_graph.nodes() if is_amino_acid[atom_i]]
if len(root) == 0:
# Saccharide is not attached to an amino acid -> Ignore glycan
continue
@@ -331,22 +324,20 @@
# Position the root at coordinate origin
pos_array -= pos_array[nodes.index(root)]
# Set vertical distances between nodes to 1
- pos_array[:,1] /= (
- pos_array[nodes.index(root_neighbor), 1] -
- pos_array[nodes.index(root), 1]
+ pos_array[:, 1] /= (
+ pos_array[nodes.index(root_neighbor), 1] - pos_array[nodes.index(root), 1]
)
# Set minimum horizontal distances between nodes to 1
- non_zero_dist = np.abs(pos_array[(pos_array[:,0] != 0), 0])
+ non_zero_dist = np.abs(pos_array[(pos_array[:, 0] != 0), 0])
if len(non_zero_dist) != 0:
- pos_array[:,0] *= HORIZONTAL_NODE_DISTANCE / np.min(non_zero_dist)
+ pos_array[:, 0] *= HORIZONTAL_NODE_DISTANCE / np.min(non_zero_dist)
# Move graph to residue ID position on x-axis
- pos_array[:,0] += structure.res_id[root]
+ pos_array[:, 0] += structure.res_id[root]
# Convert array back to dictionary
pos = {node: tuple(coord) for node, coord in zip(nodes, pos_array)}
nx.draw_networkx_edges(
- glycan_graph, pos, ax=ax,
- arrows=False, node_size=0, width=LINE_WIDTH
+ glycan_graph, pos, ax=ax, arrows=False, node_size=0, width=LINE_WIDTH
)
# Draw each node individually
@@ -359,14 +350,23 @@
common_name = SACCHARIDE_NAMES.get(structure.res_name[atom_i])
shape, color = SACCHARIDE_REPRESENTATION[common_name]
ax.scatter(
- pos[atom_i][0], pos[atom_i][1],
- s=NODE_SIZE, marker=shape, facecolor=color,
- edgecolor="black", linewidths=LINE_WIDTH
+ pos[atom_i][0],
+ pos[atom_i][1],
+ s=NODE_SIZE,
+ marker=shape,
+ facecolor=color,
+ edgecolor="black",
+ linewidths=LINE_WIDTH,
)
legend_elements[common_name] = Line2D(
- [0], [0], label=common_name, linestyle="None",
- marker=shape, markerfacecolor=color,
- markeredgecolor="black", markeredgewidth=LINE_WIDTH
+ [0],
+ [0],
+ label=common_name,
+ linestyle="None",
+ marker=shape,
+ markerfacecolor=color,
+ markeredgecolor="black",
+ markeredgewidth=LINE_WIDTH,
)
@@ -381,9 +381,13 @@
ax.tick_params(axis="y", left=False, labelleft=False)
ax.set_xticks(glycosylated_residue_ids)
ax.set_xticklabels(
- [symbol + str(res_id) for symbol, res_id
- in zip(glycosylated_residue_symbols, glycosylated_residue_ids)],
- rotation=45
+ [
+ symbol + str(res_id)
+ for symbol, res_id in zip(
+ glycosylated_residue_symbols, glycosylated_residue_ids
+ )
+ ],
+ rotation=45,
)
# Set the end of the axis to the last amino acid
@@ -393,4 +397,4 @@
fig.tight_layout()
# sphinx_gallery_thumbnail_number = 2
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/structure/misc/homolog_superimposition.py b/doc/examples/scripts/structure/misc/homolog_superimposition.py
index 2e0e03558..4db689581 100644
--- a/doc/examples/scripts/structure/misc/homolog_superimposition.py
+++ b/doc/examples/scripts/structure/misc/homolog_superimposition.py
@@ -13,20 +13,19 @@
# Code source: Patrick Kunzmann
# License: BSD 3 clause
-
+import biotite.database.rcsb as rcsb
import biotite.structure as struc
import biotite.structure.io.pdbx as pdbx
-import biotite.database.rcsb as rcsb
+
def _extract_monomer(complex):
complex = complex[struc.filter_amino_acids(complex)]
# Get the monomer that belongs to the first atom in the structure
return complex[struc.get_chain_masks(complex, [0])[0]]
+
avidin_file = pdbx.BinaryCIFFile.read(rcsb.fetch("1vyo", "bcif"))
-avidin = _extract_monomer(
- pdbx.get_structure(avidin_file, model=1, include_bonds=True)
-)
+avidin = _extract_monomer(pdbx.get_structure(avidin_file, model=1, include_bonds=True))
streptavidin_file = pdbx.BinaryCIFFile.read(rcsb.fetch("6j6j", "bcif"))
streptavidin = _extract_monomer(
pdbx.get_structure(streptavidin_file, model=1, include_bonds=True)
@@ -34,4 +33,4 @@ def _extract_monomer(complex):
streptavidin, _, _, _ = struc.superimpose_homologs(avidin, streptavidin)
# Visualization with PyMOL...
-# sphinx_gallery_ammolite_script = "homolog_superimposition_pymol.py"
\ No newline at end of file
+# sphinx_gallery_ammolite_script = "homolog_superimposition_pymol.py"
diff --git a/doc/examples/scripts/structure/misc/homolog_superimposition_pymol.py b/doc/examples/scripts/structure/misc/homolog_superimposition_pymol.py
index f9c204788..1760d527e 100644
--- a/doc/examples/scripts/structure/misc/homolog_superimposition_pymol.py
+++ b/doc/examples/scripts/structure/misc/homolog_superimposition_pymol.py
@@ -1,9 +1,6 @@
-import numpy as np
+import ammolite
from matplotlib.colors import to_rgb
import biotite
-import biotite.structure as struc
-import ammolite
-
PNG_SIZE = (1000, 750)
@@ -13,10 +10,7 @@
# Define colors
for color_name, color_value in biotite.colors.items():
- ammolite.cmd.set_color(
- "biotite_" + color_name,
- to_rgb(color_value)
- )
+ ammolite.cmd.set_color("biotite_" + color_name, to_rgb(color_value))
# Convert to PyMOL
pymol_avidin = ammolite.PyMOLObject.from_structure(avidin)
@@ -33,4 +27,4 @@
# Save image
ammolite.cmd.ray(*PNG_SIZE)
-ammolite.cmd.png(__image_destination__)
\ No newline at end of file
+ammolite.cmd.png(__image_destination__)
diff --git a/doc/examples/scripts/structure/misc/pdb_statistics.py b/doc/examples/scripts/structure/misc/pdb_statistics.py
index eaf7a3e05..ed8680eb8 100644
--- a/doc/examples/scripts/structure/misc/pdb_statistics.py
+++ b/doc/examples/scripts/structure/misc/pdb_statistics.py
@@ -10,12 +10,11 @@
# Code source: Patrick Kunzmann
# License: BSD 3 clause
-import numpy as np
+from datetime import datetime, time
import matplotlib.pyplot as plt
+import numpy as np
import biotite
-import biotite.database.rcsb as rcsb
-from datetime import datetime, time
-
+import biotite.database.rcsb as rcsb
years = np.arange(1990, datetime.today().year + 1)
xray_count = np.zeros(len(years), dtype=int)
@@ -28,20 +27,14 @@
# A query that comprises one year
date_query = rcsb.FieldQuery(
"rcsb_accession_info.initial_release_date",
- range_closed = (
- datetime.combine(datetime(year, 1, 1), time.min),
- datetime.combine(datetime(year, 12, 31), time.max)
- )
- )
- xray_query = rcsb.FieldQuery(
- "exptl.method", exact_match="X-RAY DIFFRACTION"
- )
- nmr_query = rcsb.FieldQuery(
- "exptl.method", exact_match="SOLUTION NMR"
- )
- em_query = rcsb.FieldQuery(
- "exptl.method", exact_match="ELECTRON MICROSCOPY"
+ range_closed=(
+ datetime.combine(datetime(year, 1, 1), time.min),
+ datetime.combine(datetime(year, 12, 31), time.max),
+ ),
)
+ xray_query = rcsb.FieldQuery("exptl.method", exact_match="X-RAY DIFFRACTION")
+ nmr_query = rcsb.FieldQuery("exptl.method", exact_match="SOLUTION NMR")
+ em_query = rcsb.FieldQuery("exptl.method", exact_match="ELECTRON MICROSCOPY")
# Get the amount of structures, that were released in that year
# AND were elucidated with the respective method
xray_count[i], nmr_count[i], em_count[i] = [
@@ -53,27 +46,32 @@
fig, ax = plt.subplots(figsize=(8.0, 5.0))
ax.set_title("PDB release statistics")
-ax.set_xlim(years[0]-1, years[-1]+1)
+ax.set_xlim(years[0] - 1, years[-1] + 1)
ax.set_xticks(years)
ax.set_xticklabels([str(y) for y in years], rotation=45)
ax.set_xlabel("Year")
ax.set_ylabel("Released structures per year")
+ax.bar(years, xray_count, color=biotite.colors["darkorange"], label="X-Ray")
ax.bar(
- years, xray_count,
- color=biotite.colors["darkorange"], label="X-Ray"
-)
-ax.bar(
- years, nmr_count, bottom=xray_count,
- color=biotite.colors["orange"], label="Solution NMR"
+ years,
+ nmr_count,
+ bottom=xray_count,
+ color=biotite.colors["orange"],
+ label="Solution NMR",
)
ax.bar(
- years, em_count, bottom=xray_count + nmr_count,
- color=biotite.colors["brightorange"], label="Electron Microscopy"
+ years,
+ em_count,
+ bottom=xray_count + nmr_count,
+ color=biotite.colors["brightorange"],
+ label="Electron Microscopy",
)
ax.bar(
- years, tot_count - xray_count - nmr_count - em_count,
+ years,
+ tot_count - xray_count - nmr_count - em_count,
bottom=xray_count + nmr_count + em_count,
- color="gray", label="Miscellaneous"
+ color="gray",
+ label="Miscellaneous",
)
ax.legend(loc="upper left")
fig.tight_layout()
diff --git a/doc/examples/scripts/structure/modeling/docking.py b/doc/examples/scripts/structure/modeling/docking.py
index 06492c242..eb9a3fcfa 100644
--- a/doc/examples/scripts/structure/modeling/docking.py
+++ b/doc/examples/scripts/structure/modeling/docking.py
@@ -28,22 +28,24 @@
# Code source: Patrick Kunzmann
# License: BSD 3 clause
-import numpy as np
import matplotlib.pyplot as plt
+import numpy as np
from scipy.stats import spearmanr
+import biotite.application.autodock as autodock
+import biotite.database.rcsb as rcsb
import biotite.structure as struc
import biotite.structure.info as info
import biotite.structure.io.pdbx as pdbx
-import biotite.database.rcsb as rcsb
-import biotite.application.autodock as autodock
-
# Get the receptor structure
# and the original 'correct' conformation of the ligand
pdbx_file = pdbx.BinaryCIFFile.read(rcsb.fetch("2RTG", "bcif"))
structure = pdbx.get_structure(
# Include formal charge for accurate partial charge calculation
- pdbx_file, model=1, include_bonds=True, extra_fields=["charge"]
+ pdbx_file,
+ model=1,
+ include_bonds=True,
+ extra_fields=["charge"],
)
# The asymmetric unit describes a streptavidin homodimer
# However, we are only interested in a single monomer
@@ -79,9 +81,7 @@
docked_ligand = struc.from_template(ligand, docked_coord)
# As Vina discards all nonpolar hydrogen atoms, their respective
# coordinates are NaN -> remove these atoms
-docked_ligand = docked_ligand[
- ..., ~np.isnan(docked_ligand.coord[0]).any(axis=-1)
-]
+docked_ligand = docked_ligand[..., ~np.isnan(docked_ligand.coord[0]).any(axis=-1)]
# For comparison of the docked pose with the experimentally determined
@@ -142,9 +142,9 @@
# Vina only keeps polar hydrogens in the modeled structure
# For consistency, remove all hydrogen atoms in the reference and
# modelled structure
-ref_ligand = ref_ligand[ref_ligand.element!= "H"]
-docked_ligand = docked_ligand[docked_ligand.element!= "H"]
+ref_ligand = ref_ligand[ref_ligand.element != "H"]
+docked_ligand = docked_ligand[docked_ligand.element != "H"]
# Visualization with PyMOL...
# sphinx_gallery_thumbnail_number = 2
-# sphinx_gallery_ammolite_script = "docking_pymol.py"
\ No newline at end of file
+# sphinx_gallery_ammolite_script = "docking_pymol.py"
diff --git a/doc/examples/scripts/structure/modeling/docking_pymol.py b/doc/examples/scripts/structure/modeling/docking_pymol.py
index 349f93b39..8f9adc263 100644
--- a/doc/examples/scripts/structure/modeling/docking_pymol.py
+++ b/doc/examples/scripts/structure/modeling/docking_pymol.py
@@ -1,23 +1,17 @@
-import numpy as np
+import ammolite
from matplotlib.colors import to_rgb
import biotite
-import biotite.structure as struc
-import ammolite
-
PNG_SIZE = (1000, 400)
# Define colors
for color_name, color_value in biotite.colors.items():
- ammolite.cmd.set_color(
- "biotite_" + color_name,
- to_rgb(color_value)
- )
+ ammolite.cmd.set_color("biotite_" + color_name, to_rgb(color_value))
# Convert to PyMOL
-pymol_receptor = ammolite.PyMOLObject.from_structure(receptor)
-pymol_ref_ligand = ammolite.PyMOLObject.from_structure(ref_ligand)
+pymol_receptor = ammolite.PyMOLObject.from_structure(receptor)
+pymol_ref_ligand = ammolite.PyMOLObject.from_structure(ref_ligand)
pymol_docked_ligand = ammolite.PyMOLObject.from_structure(docked_ligand)
# Visualize receptor as surface
@@ -53,4 +47,4 @@
# Save image
ammolite.cmd.ray(*PNG_SIZE)
-ammolite.cmd.png(__image_destination__)
\ No newline at end of file
+ammolite.cmd.png(__image_destination__)
diff --git a/doc/examples/scripts/structure/modeling/md_analysis.py b/doc/examples/scripts/structure/modeling/md_analysis.py
index 3e36779b3..dfdbf573b 100644
--- a/doc/examples/scripts/structure/modeling/md_analysis.py
+++ b/doc/examples/scripts/structure/modeling/md_analysis.py
@@ -22,16 +22,16 @@
# Code source: Patrick Kunzmann
# License: BSD 3 clause
+import matplotlib.pyplot as plt
+import numpy as np
import biotite
import biotite.structure as struc
import biotite.structure.io as strucio
import biotite.structure.io.xtc as xtc
-import numpy as np
-import matplotlib.pyplot as plt
# Put here the path of the downloaded files
templ_file_path = "../../../download/lysozyme_md.pdb"
-traj_file_path = "../../../download/lysozyme_md.xtc"
+traj_file_path = "../../../download/lysozyme_md.xtc"
# Gromacs does not set the element symbol in its PDB files,
# but Biotite guesses the element names from the atom names,
@@ -76,7 +76,7 @@
trajectory, _ = struc.superimpose(trajectory[0], trajectory)
rmsd = struc.rmsd(trajectory[0], trajectory)
-figure = plt.figure(figsize=(6,3))
+figure = plt.figure(figsize=(6, 3))
ax = figure.add_subplot(111)
ax.plot(time, rmsd, color=biotite.colors["dimorange"])
ax.set_xlim(time[0], time[-1])
@@ -97,7 +97,7 @@
radius = struc.gyration_radius(trajectory)
-figure = plt.figure(figsize=(6,3))
+figure = plt.figure(figsize=(6, 3))
ax = figure.add_subplot(111)
ax.plot(time, radius, color=biotite.colors["dimorange"])
ax.set_xlim(time[0], time[-1])
@@ -129,10 +129,10 @@
ca_trajectory = trajectory[:, trajectory.atom_name == "CA"]
rmsf = struc.rmsf(struc.average(ca_trajectory), ca_trajectory)
-figure = plt.figure(figsize=(6,3))
+figure = plt.figure(figsize=(6, 3))
ax = figure.add_subplot(111)
res_count = struc.get_residue_count(trajectory)
-ax.plot(np.arange(1, res_count+1), rmsf, color=biotite.colors["dimorange"])
+ax.plot(np.arange(1, res_count + 1), rmsf, color=biotite.colors["dimorange"])
ax.set_xlim(1, res_count)
ax.set_ylim(0, 1.5)
ax.set_xlabel("Residue")
@@ -140,4 +140,4 @@
figure.tight_layout()
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/structure/modeling/mmtf_trajectory.py b/doc/examples/scripts/structure/modeling/mmtf_trajectory.py
index 4bc706de8..cf4d8612c 100644
--- a/doc/examples/scripts/structure/modeling/mmtf_trajectory.py
+++ b/doc/examples/scripts/structure/modeling/mmtf_trajectory.py
@@ -25,13 +25,13 @@
# Code source: Patrick Kunzmann
# License: BSD 3 clause
+import os.path
from tempfile import NamedTemporaryFile
+import matplotlib.pyplot as plt
+import numpy as np
import biotite
-import biotite.structure.io.xtc as xtc
import biotite.structure.io.pdbx as pdbx
-import numpy as np
-import matplotlib.pyplot as plt
-import os.path
+import biotite.structure.io.xtc as xtc
# Put here the path of the downloaded trajectory file
xtc_file_path = "../../../download/lysozyme_md.xtc"
@@ -53,14 +53,14 @@
)
for i, dim in enumerate(("x", "y", "z")):
columns[f"coord_{dim}"] = pdbx.BinaryCIFData(
- coord[:,:,i].flatten(),
+ coord[:, :, i].flatten(),
encoding=[
pdbx.FixedPointEncoding(factor=100, src_type=np.float32),
pdbx.DeltaEncoding(),
# Encode the difference into two bytes
pdbx.IntegerPackingEncoding(byte_count=2, is_unsigned=False),
pdbx.ByteArrayEncoding(),
- ]
+ ],
)
category = pdbx.BinaryCIFCategory(columns)
bcif_file = pdbx.BinaryCIFFile(
@@ -77,15 +77,17 @@
figure = plt.figure()
ax = figure.add_subplot(111)
ax.bar(
- [1,2], [xtc_size/1e+6, bcif_size/1e+6], width=0.3,
+ [1, 2],
+ [xtc_size / 1e6, bcif_size / 1e6],
+ width=0.3,
color=[biotite.colors["dimgreen"], biotite.colors["dimorange"]],
- linewidth=0
+ linewidth=0,
)
-ax.set_xticks([1,2])
+ax.set_xticks([1, 2])
ax.set_xticklabels(["XTC", "BinaryCIF"])
ax.set_xlim(0.5, 2.5)
ax.set_ylim(0, 40)
ax.yaxis.grid(True)
ax.set_ylabel("File size (MB)")
figure.tight_layout()
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/structure/modeling/normal_modes.py b/doc/examples/scripts/structure/modeling/normal_modes.py
index 13c7eca3a..ac760c459 100644
--- a/doc/examples/scripts/structure/modeling/normal_modes.py
+++ b/doc/examples/scripts/structure/modeling/normal_modes.py
@@ -36,11 +36,10 @@
from tempfile import NamedTemporaryFile
import numpy as np
from numpy import newaxis
+import biotite.database.rcsb as rcsb
import biotite.structure as struc
import biotite.structure.io as strucio
import biotite.structure.io.pdbx as pdbx
-import biotite.database.rcsb as rcsb
-
# A CSV file containing the eigenvectors for the CA atoms
VECTOR_FILE = "../../../download/glycosylase_anm_vectors.csv"
@@ -64,8 +63,7 @@
# Filter first peptide chain
protein_chain = structure[
- struc.filter_amino_acids(structure)
- & (structure.chain_id == structure.chain_id[0])
+ struc.filter_amino_acids(structure) & (structure.chain_id == structure.chain_id[0])
]
# Filter CA atoms
ca = protein_chain[protein_chain.atom_name == "CA"]
@@ -88,7 +86,7 @@
# Stepwise application of eigenvectors as smooth sine oscillation
-time = np.linspace(0, 2*np.pi, FRAMES, endpoint=False)
+time = np.linspace(0, 2 * np.pi, FRAMES, endpoint=False)
deviation = np.sin(time)[:, newaxis, newaxis] * mode_vectors
# Apply oscillation of CA atom to all atoms in the corresponding residue
@@ -97,13 +95,14 @@
protein_chain,
# The last array element will be the length of the atom array,
# i.e. no valid index
- add_exclusive_stop=True
+ add_exclusive_stop=True,
)
-for i in range(len(residue_starts) -1):
+for i in range(len(residue_starts) - 1):
res_start = residue_starts[i]
- res_stop = residue_starts[i+1]
- oscillation[:, res_start:res_stop, :] \
- = protein_chain.coord[res_start:res_stop, :] + deviation[:, i:i+1, :]
+ res_stop = residue_starts[i + 1]
+ oscillation[:, res_start:res_stop, :] = (
+ protein_chain.coord[res_start:res_stop, :] + deviation[:, i : i + 1, :]
+ )
# An atom array stack containing all frames
oscillating_structure = struc.from_template(protein_chain, oscillation)
@@ -112,4 +111,4 @@
strucio.save_structure(temp.name, oscillating_structure)
# sphinx_gallery_static_image = "normal_modes.gif"
-temp.close()
\ No newline at end of file
+temp.close()
diff --git a/doc/examples/scripts/structure/modeling/normal_modes_pymol.py b/doc/examples/scripts/structure/modeling/normal_modes_pymol.py
index 5165510e9..1c0ad0e2c 100644
--- a/doc/examples/scripts/structure/modeling/normal_modes_pymol.py
+++ b/doc/examples/scripts/structure/modeling/normal_modes_pymol.py
@@ -1,6 +1,5 @@
+from os.path import isdir, join
from pymol import cmd
-from os.path import join, isdir
-
INPUT_STRUCTURE = "normal_modes.pdb"
OUTPUT_DIR = "normal_modes"
@@ -13,20 +12,34 @@
cmd.dss()
# Define colors
-cmd.set_color("biotite_lightgreen", [111/255, 222/255, 76/255])
+cmd.set_color("biotite_lightgreen", [111 / 255, 222 / 255, 76 / 255])
# Set overall colors
cmd.color("biotite_lightgreen", "chain A")
# Set view
-cmd.set_view((
- 0.605540633, 0.363677770, -0.707855821,
- -0.416691631, 0.902691007, 0.107316799,
- 0.678002179, 0.229972601, 0.698157668,
- 0.000000000, 0.000000000, -115.912551880,
- 32.098876953, 31.005725861, 78.377349854,
- 89.280677795, 142.544403076, -20.000000000
-))
+cmd.set_view(
+ (
+ 0.605540633,
+ 0.363677770,
+ -0.707855821,
+ -0.416691631,
+ 0.902691007,
+ 0.107316799,
+ 0.678002179,
+ 0.229972601,
+ 0.698157668,
+ 0.000000000,
+ 0.000000000,
+ -115.912551880,
+ 32.098876953,
+ 31.005725861,
+ 78.377349854,
+ 89.280677795,
+ 142.544403076,
+ -20.000000000,
+ )
+)
# Prepare output video frames
cmd.mset()
diff --git a/doc/examples/scripts/structure/modeling/rotamer_library.py b/doc/examples/scripts/structure/modeling/rotamer_library.py
index 2087a08de..fa828eb1d 100644
--- a/doc/examples/scripts/structure/modeling/rotamer_library.py
+++ b/doc/examples/scripts/structure/modeling/rotamer_library.py
@@ -13,14 +13,11 @@
# Code source: Patrick Kunzmann
# License: BSD 3 clause
-import numpy as np
-import networkx as nx
import matplotlib.pyplot as plt
+import numpy as np
import biotite.structure as struc
-import biotite.structure.io as strucio
-import biotite.structure.info as info
import biotite.structure.graphics as graphics
-
+import biotite.structure.info as info
# 'CA' is not in backbone,
# as we want to include the rotation between 'CA' and 'CB'
@@ -73,14 +70,12 @@
bond_list_without_axis.remove_bond(atom_i, atom_j)
# ...and these atoms are found by identifying the atoms that
# are still connected to one of the two atoms involved
- rotated_atom_indices = struc.find_connected(
- bond_list_without_axis, root=atom_i
- )
+ rotated_atom_indices = struc.find_connected(bond_list_without_axis, root=atom_i)
accepted = False
while not accepted:
# A random angle between 0 and 360 degrees
- angle = np.random.rand() * 2*np.pi
+ angle = np.random.rand() * 2 * np.pi
# Rotate
coord[rotated_atom_indices] = struc.rotate_about_axis(
coord[rotated_atom_indices], axis, angle, support
@@ -91,9 +86,7 @@
# than the sum of their VdW radii, if they are not bonded to
# each other
accepted = True
- distances = struc.distance(
- coord[:, np.newaxis], coord[np.newaxis, :]
- )
+ distances = struc.distance(coord[:, np.newaxis], coord[np.newaxis, :])
clashed = distances < vdw_radii_mean
for clash_atom1, clash_atom2 in zip(*np.where(clashed)):
if clash_atom1 == clash_atom2:
@@ -115,23 +108,28 @@
### Visualize rotamers ###
colors = np.zeros((residue.array_length(), 3))
-colors[residue.element == "H"] = (0.8, 0.8, 0.8) # gray
-colors[residue.element == "C"] = (0.0, 0.8, 0.0) # green
-colors[residue.element == "N"] = (0.0, 0.0, 0.8) # blue
-colors[residue.element == "O"] = (0.8, 0.0, 0.0) # red
+colors[residue.element == "H"] = (0.8, 0.8, 0.8) # gray
+colors[residue.element == "C"] = (0.0, 0.8, 0.0) # green
+colors[residue.element == "N"] = (0.0, 0.0, 0.8) # blue
+colors[residue.element == "O"] = (0.8, 0.0, 0.0) # red
# For consistency, each subplot has the same box size
coord = rotamers.coord
-size = np.array(
- [coord[:, :, 0].max() - coord[:, :, 0].min(),
- coord[:, :, 1].max() - coord[:, :, 1].min(),
- coord[:, :, 2].max() - coord[:, :, 2].min()]
-).max() * 0.5
+size = (
+ np.array(
+ [
+ coord[:, :, 0].max() - coord[:, :, 0].min(),
+ coord[:, :, 1].max() - coord[:, :, 1].min(),
+ coord[:, :, 2].max() - coord[:, :, 2].min(),
+ ]
+ ).max()
+ * 0.5
+)
fig = plt.figure(figsize=(8.0, 8.0))
fig.suptitle("Rotamers of tyrosine", fontsize=20, weight="bold")
for i, rotamer in enumerate(rotamers):
- ax = fig.add_subplot(3, 3, i+1, projection="3d")
+ ax = fig.add_subplot(3, 3, i + 1, projection="3d")
graphics.plot_atoms(ax, rotamer, colors, line_width=3, size=size, zoom=0.9)
fig.tight_layout()
@@ -139,4 +137,4 @@
### Write rotamers to structure file ###
-#strucio.save_structure("rotamers.pdb", rotamers)
\ No newline at end of file
+# strucio.save_structure("rotamers.pdb", rotamers)
diff --git a/doc/examples/scripts/structure/modeling/solvation_shells.py b/doc/examples/scripts/structure/modeling/solvation_shells.py
index cdd00be28..dcba8894d 100644
--- a/doc/examples/scripts/structure/modeling/solvation_shells.py
+++ b/doc/examples/scripts/structure/modeling/solvation_shells.py
@@ -25,16 +25,16 @@
# Code source: Patrick Kunzmann
# License: BSD 3 clause
+import matplotlib.pyplot as plt
import numpy as np
import scipy.signal as signal
-import matplotlib.pyplot as plt
import biotite
import biotite.structure as struc
import biotite.structure.io as strucio
# Put here the path of the downloaded files
templ_file_path = "../../../download/waterbox_md.pdb"
-traj_file_path = "../../../download/waterbox_md.xtc"
+traj_file_path = "../../../download/waterbox_md.xtc"
# Load the trajectory
traj = strucio.load_structure(traj_file_path, template=templ_file_path)
@@ -53,27 +53,19 @@
# Calculate the RDF of water molecules
# centered on sodium or chloride ions, respectively
N_BINS = 200
-bins, rdf_na = struc.rdf(
- center=na, atoms=solvent, periodic=True, bins=N_BINS
-)
-bins, rdf_cl = struc.rdf(
- center=cl, atoms=solvent, periodic=True, bins=N_BINS
-)
+bins, rdf_na = struc.rdf(center=na, atoms=solvent, periodic=True, bins=N_BINS)
+bins, rdf_cl = struc.rdf(center=cl, atoms=solvent, periodic=True, bins=N_BINS)
# Find peaks
# This requires a bit trial and error on the parameters
# The 'x' in '[x * N_BINS/10]' is the expected peak width in Å,
# that is transformed into a peak width in amount of values
-peak_indices_na = signal.find_peaks_cwt(
- rdf_na, widths=[0.2 * N_BINS/10]
-)
-peak_indices_cl = signal.find_peaks_cwt(
- rdf_cl, widths=[0.3 * N_BINS/10]
-)
+peak_indices_na = signal.find_peaks_cwt(rdf_na, widths=[0.2 * N_BINS / 10])
+peak_indices_cl = signal.find_peaks_cwt(rdf_cl, widths=[0.3 * N_BINS / 10])
peak_indices_na, peak_indices_cl = peak_indices_na[:3], peak_indices_cl[:3]
# Create plots
-fig, ax = plt.subplots(figsize=(8.0,3.0))
+fig, ax = plt.subplots(figsize=(8.0, 3.0))
# Plot average density in box
ax.axhline(1, color="lightgray", linestyle="--")
# Plot both RDFs
@@ -81,19 +73,25 @@
ax.plot(bins, rdf_cl, color=biotite.colors["dimorange"], label="Cl")
# The peak positions are shown as vertical lines
ax.vlines(
- bins[peak_indices_na], ymin=0, ymax=3,
- color=biotite.colors["darkgreen"], linestyle=":"
+ bins[peak_indices_na],
+ ymin=0,
+ ymax=3,
+ color=biotite.colors["darkgreen"],
+ linestyle=":",
)
ax.vlines(
- bins[peak_indices_cl], ymin=0, ymax=3,
- color=biotite.colors["dimorange"], linestyle=":"
+ bins[peak_indices_cl],
+ ymin=0,
+ ymax=3,
+ color=biotite.colors["dimorange"],
+ linestyle=":",
)
ax.set_xticks(np.arange(0, 10.5, 0.5))
-ax.set_xlim(0,10)
-ax.set_ylim(0,2.7)
+ax.set_xlim(0, 10)
+ax.set_ylim(0, 2.7)
ax.set_xlabel("Radius (Å)")
ax.set_ylabel("Relative density")
ax.legend()
fig.tight_layout()
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/structure/modeling/trajectory_sse.py b/doc/examples/scripts/structure/modeling/trajectory_sse.py
index 5b33d2156..a0acf219c 100644
--- a/doc/examples/scripts/structure/modeling/trajectory_sse.py
+++ b/doc/examples/scripts/structure/modeling/trajectory_sse.py
@@ -14,20 +14,18 @@
# Code source: Daniel Bauer, Patrick Kunzmann
# License: BSD 3 clause
-import numpy as np
import matplotlib.pyplot as plt
-from matplotlib.lines import Line2D
+import numpy as np
from matplotlib import colors
-import matplotlib as mpl
+from matplotlib.lines import Line2D
import biotite.structure as struc
import biotite.structure.io as strucio
import biotite.structure.io.xtc as xtc
from biotite.application.dssp import DsspApp
-
# Put here the path of the downloaded files
templ_file_path = "../../../download/lysozyme_md.pdb"
-traj_file_path = "../../../download/lysozyme_md.xtc"
+traj_file_path = "../../../download/lysozyme_md.xtc"
xtc_file = xtc.XTCFile.read(traj_file_path)
@@ -36,25 +34,28 @@
traj = traj[:, struc.filter_amino_acids(traj)]
# DSSP does not assign an SSE to the last residue -> -1
-sse = np.empty((traj.shape[0], struc.get_residue_count(traj)-1), dtype='U1')
+sse = np.empty((traj.shape[0], struc.get_residue_count(traj) - 1), dtype="U1")
for idx, frame in enumerate(traj):
app = DsspApp(traj[idx])
app.start()
app.join()
sse[idx] = app.get_sse()
+
# Matplotlib needs numbers to assign colors correctly
def sse_to_num(sse):
num = np.empty(sse.shape, dtype=int)
- num[sse == 'C'] = 0
- num[sse == 'E'] = 1
- num[sse == 'B'] = 2
- num[sse == 'S'] = 3
- num[sse == 'T'] = 4
- num[sse == 'H'] = 5
- num[sse == 'G'] = 6
- num[sse == 'I'] = 7
+ num[sse == "C"] = 0
+ num[sse == "E"] = 1
+ num[sse == "B"] = 2
+ num[sse == "S"] = 3
+ num[sse == "T"] = 4
+ num[sse == "H"] = 5
+ num[sse == "G"] = 6
+ num[sse == "I"] = 7
return num
+
+
sse = sse_to_num(sse)
@@ -68,24 +69,26 @@ def sse_to_num(sse):
r"turn": "yellow",
r"$\alpha$-helix": "blue",
r"$3_{10}$-helix": "gray",
- r"$\pi$-helix": "purple",
+ r"$\pi$-helix": "purple",
}
cmap = colors.ListedColormap(color_assign.values())
plt.figure(figsize=(8.0, 6.0))
-plt.imshow(sse.T, cmap=cmap, origin='lower')
+plt.imshow(sse.T, cmap=cmap, origin="lower")
plt.xlabel("Time / ps")
plt.ylabel("Residue")
ticks = np.arange(0, len(traj), 10)
plt.xticks(ticks, time[ticks].astype(int))
# Custom legend below the DSSP plot
-custom_lines = [
- Line2D([0], [0], color=cmap(i), lw=4) for i in range(len(color_assign))
-]
+custom_lines = [Line2D([0], [0], color=cmap(i), lw=4) for i in range(len(color_assign))]
plt.legend(
- custom_lines, color_assign.keys(), loc="upper center",
- bbox_to_anchor=(0.5, -0.15), ncol=len(color_assign), fontsize=8
+ custom_lines,
+ color_assign.keys(),
+ loc="upper center",
+ bbox_to_anchor=(0.5, -0.15),
+ ncol=len(color_assign),
+ fontsize=8,
)
plt.tight_layout()
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/structure/modeling/water_exchange_noexec.py b/doc/examples/scripts/structure/modeling/water_exchange_noexec.py
index 31a92644a..f62ca18ee 100644
--- a/doc/examples/scripts/structure/modeling/water_exchange_noexec.py
+++ b/doc/examples/scripts/structure/modeling/water_exchange_noexec.py
@@ -27,10 +27,11 @@
import matplotlib.pyplot as plt
import numpy as np
+from pylab import polyfit
import biotite
+import biotite.structure as struct
import biotite.structure.io.gro as gro
import biotite.structure.io.xtc as xtc
-import biotite.structure as struct
def water_in_prox(atoms, sele, cutoff):
@@ -38,27 +39,28 @@ def water_in_prox(atoms, sele, cutoff):
Get the atom indices of water oxygen atoms that are in vicinity of
the selected atoms.
"""
- cell_list = struct.CellList(atoms, cell_size=5,
- selection=atoms.atom_name == "OW")
+ cell_list = struct.CellList(atoms, cell_size=5, selection=atoms.atom_name == "OW")
adjacent_atoms = cell_list.get_atoms(atoms[sele].coord, cutoff)
adjacent_atoms = np.unique(adjacent_atoms.flatten())
adjacent_atoms = adjacent_atoms[adjacent_atoms > 0]
return adjacent_atoms
+
def cum_water_in_pore(traj, cutoff=6, key_residues=(507, 511)):
"""
Calculate the cumulative number of water molecules visiting the
pore.
"""
- protein_sele = np.isin(traj.res_id, key_residues) \
- & ~np.isin(traj.atom_name, ["N", "O", "CA", "C"])
+ protein_sele = np.isin(traj.res_id, key_residues) & ~np.isin(
+ traj.atom_name, ["N", "O", "CA", "C"]
+ )
water_count = np.zeros(traj.shape[0])
prev_counted_indices = []
for idx, frame in enumerate(traj):
indices = water_in_prox(frame, protein_sele, cutoff)
count = (~np.isin(indices, prev_counted_indices)).sum()
if idx != 0:
- count += water_count[idx-1]
+ count += water_count[idx - 1]
water_count[idx] = count
prev_counted_indices = indices
return water_count
@@ -82,36 +84,38 @@ def cum_water_in_pore(traj, cutoff=6, key_residues=(507, 511)):
# Linear fitting
-from pylab import polyfit
open_fit = polyfit(time, counts[0], 1)
closed_fit = polyfit(time, counts[1], 1)
-
fig, ax = plt.subplots(figsize=(8.0, 4.0))
-ax.plot(time, counts[0],
- label="open pore", color=biotite.colors["dimgreen"])
-ax.plot(time, open_fit[0]*time+open_fit[1],
- linestyle="--", color="black", zorder=-1)
-ax.plot(time, counts[1],
- label="closed pore", color=biotite.colors["lightorange"])
-ax.plot(time, closed_fit[0]*time+closed_fit[1],
- linestyle="--", color="black", zorder=-1)
+ax.plot(time, counts[0], label="open pore", color=biotite.colors["dimgreen"])
+ax.plot(
+ time, open_fit[0] * time + open_fit[1], linestyle="--", color="black", zorder=-1
+)
+ax.plot(time, counts[1], label="closed pore", color=biotite.colors["lightorange"])
+ax.plot(
+ time, closed_fit[0] * time + closed_fit[1], linestyle="--", color="black", zorder=-1
+)
ax.set(
- xlabel = "Time / ns",
- ylabel = "Count",
- title = "Cumulative count\nof individual water molecules visiting the pore"
+ xlabel="Time / ns",
+ ylabel="Count",
+ title="Cumulative count\nof individual water molecules visiting the pore",
)
ax.legend()
-ax.annotate(f"{open_fit[0]:.1f} per ns",
- xy=(20, 20*open_fit[0]+open_fit[1]+100),
- xytext=(20-5, 20*open_fit[0]+open_fit[1]+1300),
- arrowprops=dict(facecolor=biotite.colors["darkgreen"]),
- va="center")
-ax.annotate(f"{closed_fit[0]:.1f} per ns",
- xy=(30, 20*closed_fit[0]+closed_fit[1]+100),
- xytext=(30+2, 20*closed_fit[0]+closed_fit[1]+1300),
- arrowprops=dict(facecolor=biotite.colors["orange"]),
- va="center")
+ax.annotate(
+ f"{open_fit[0]:.1f} per ns",
+ xy=(20, 20 * open_fit[0] + open_fit[1] + 100),
+ xytext=(20 - 5, 20 * open_fit[0] + open_fit[1] + 1300),
+ arrowprops=dict(facecolor=biotite.colors["darkgreen"]),
+ va="center",
+)
+ax.annotate(
+ f"{closed_fit[0]:.1f} per ns",
+ xy=(30, 20 * closed_fit[0] + closed_fit[1] + 100),
+ xytext=(30 + 2, 20 * closed_fit[0] + closed_fit[1] + 1300),
+ arrowprops=dict(facecolor=biotite.colors["orange"]),
+ va="center",
+)
fig.savefig("water_exchange.png", bbox_inches="tight")
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/structure/molecule/alkane_isomers.py b/doc/examples/scripts/structure/molecule/alkane_isomers.py
index ed479a52f..c9ea8a265 100644
--- a/doc/examples/scripts/structure/molecule/alkane_isomers.py
+++ b/doc/examples/scripts/structure/molecule/alkane_isomers.py
@@ -24,12 +24,11 @@
opposed to one request per carbon number.
"""
-import numpy as np
import matplotlib.pyplot as plt
+import numpy as np
import biotite.database.pubchem as pubchem
-import biotite.structure.io.mol as mol
import biotite.structure as struc
-
+import biotite.structure.io.mol as mol
MAX_CARBON_COUNT = 12
PLOT_MAX_CARBON_COUNT = 6
@@ -37,13 +36,13 @@
carbon_numbers = []
alkane_cids = []
-for n_carbon in range(1, MAX_CARBON_COUNT+1):
+for n_carbon in range(1, MAX_CARBON_COUNT + 1):
formula = f"C{n_carbon}H{2 * n_carbon + 2}"
print(formula)
cids = np.array(pubchem.search(pubchem.FormulaQuery(formula)))
carbon_numbers.extend([n_carbon] * len(cids))
alkane_cids.extend(cids)
-carbon_numbers = np.array(carbon_numbers)
+carbon_numbers = np.array(carbon_numbers)
alkane_cids = np.array(alkane_cids)
########################################################################
@@ -58,15 +57,13 @@
# appropriate data type and used for filtering.
# Finally, also the IUPAC name for each remaining compound is retrieved
# to review the results.
-
+
# Filter natural isotopes...
n_isotopes = np.array(
pubchem.fetch_property(alkane_cids, "IsotopeAtomCount"), dtype=int
)
# ...and neutral compounds
-charge = np.array(
- pubchem.fetch_property(alkane_cids, "Charge"), dtype=int
-)
+charge = np.array(pubchem.fetch_property(alkane_cids, "Charge"), dtype=int)
# Apply filter
mask = (n_isotopes == 0) & (charge == 0)
carbon_numbers = carbon_numbers[mask]
@@ -85,7 +82,7 @@
# Remove compounds containing multiple molecules
# (indicated by the ';' as separator between molecule names)
-single_molecule_mask = np.array([not ";" in name for name in iupac_names])
+single_molecule_mask = np.array([";" not in name for name in iupac_names])
# Some compounds containing multiple molecules have no name at all
single_molecule_mask &= np.array([len(name) != 0 for name in iupac_names])
carbon_numbers = carbon_numbers[single_molecule_mask]
@@ -109,10 +106,7 @@
# for alkanes with zero carbon atoms, which does not make sense
isomer_numbers = np.bincount(carbon_numbers)[1:]
fig, ax = plt.subplots(figsize=(8.0, 4.0))
-ax.plot(
- np.arange(1, MAX_CARBON_COUNT+1), isomer_numbers,
- marker="o", color="gray"
-)
+ax.plot(np.arange(1, MAX_CARBON_COUNT + 1), isomer_numbers, marker="o", color="gray")
ax.set_xlim(left=0)
ax.set_ylim(bottom=0)
ax.set_xlabel("Number of carbon atoms")
@@ -127,18 +121,18 @@
# xy-coordinates are plotted as skeletal formula.
files = pubchem.fetch(
- alkane_cids[carbon_numbers <= PLOT_MAX_CARBON_COUNT],
- as_structural_formula=True
+ alkane_cids[carbon_numbers <= PLOT_MAX_CARBON_COUNT], as_structural_formula=True
)
fig, axes = plt.subplots(
nrows=np.max(isomer_numbers[:PLOT_MAX_CARBON_COUNT]),
ncols=PLOT_MAX_CARBON_COUNT,
figsize=(8.0, 6.0),
- sharex=True, sharey=True
+ sharex=True,
+ sharey=True,
)
fig.suptitle("Number of carbon atoms", fontsize=16)
-for i, n_carbon in enumerate(range(1, PLOT_MAX_CARBON_COUNT+1)):
+for i, n_carbon in enumerate(range(1, PLOT_MAX_CARBON_COUNT + 1)):
axes[0, i].set_title(n_carbon, fontsize=12)
indices_for_n_carbon = np.where(carbon_numbers == n_carbon)[0]
for j, file_index in enumerate(indices_for_n_carbon):
@@ -149,17 +143,13 @@
# Center atoms in origin
atoms.coord -= struc.centroid(atoms)
# Structural formula is 0 in z-dimension
- coord = atoms.coord[:,:2]
+ coord = atoms.coord[:, :2]
ax = axes[j, i]
- ax.plot(
- coord[:, 0], coord[:, 1],
- color="black", linestyle="None", marker="o"
- )
+ ax.plot(coord[:, 0], coord[:, 1], color="black", linestyle="None", marker="o")
for bond_i, bond_j, _ in atoms.bonds.as_array():
ax.plot(
- coord[[bond_i, bond_j], 0], coord[[bond_i, bond_j], 1],
- color="black"
+ coord[[bond_i, bond_j], 0], coord[[bond_i, bond_j], 1], color="black"
)
for ax in axes.flatten():
@@ -171,4 +161,4 @@
plt.show()
-# sphinx_gallery_thumbnail_number = 2
\ No newline at end of file
+# sphinx_gallery_thumbnail_number = 2
diff --git a/doc/examples/scripts/structure/molecule/molecular_visualization.py b/doc/examples/scripts/structure/molecule/molecular_visualization.py
index 70d77d837..883785167 100644
--- a/doc/examples/scripts/structure/molecule/molecular_visualization.py
+++ b/doc/examples/scripts/structure/molecule/molecular_visualization.py
@@ -16,13 +16,12 @@
# Code source: Patrick Kunzmann
# License: BSD 3 clause
-import numpy as np
import matplotlib.pyplot as plt
+import numpy as np
from matplotlib.animation import FuncAnimation
import biotite.structure as struc
-import biotite.structure.info as info
import biotite.structure.graphics as graphics
-
+import biotite.structure.info as info
# Get an atom array for caffeine
# Caffeine has the PDB reside name 'CFF'
@@ -35,34 +34,37 @@
# Normal vector of ring plane
normal = np.cross(n1.coord - n3.coord, n1.coord - n7.coord)
# Align ring plane normal to z-axis
-caffeine = struc.align_vectors(caffeine, normal, np.array([0,0,1]))
+caffeine = struc.align_vectors(caffeine, normal, np.array([0, 0, 1]))
# Caffeine should be colored by element
colors = np.zeros((caffeine.array_length(), 3))
-colors[caffeine.element == "H"] = (0.8, 0.8, 0.8) # gray
-colors[caffeine.element == "C"] = (0.0, 0.8, 0.0) # green
-colors[caffeine.element == "N"] = (0.0, 0.0, 0.8) # blue
-colors[caffeine.element == "O"] = (0.8, 0.0, 0.0) # red
+colors[caffeine.element == "H"] = (0.8, 0.8, 0.8) # gray
+colors[caffeine.element == "C"] = (0.0, 0.8, 0.0) # green
+colors[caffeine.element == "N"] = (0.0, 0.0, 0.8) # blue
+colors[caffeine.element == "O"] = (0.8, 0.0, 0.0) # red
fig = plt.figure(figsize=(8.0, 8.0))
ax = fig.add_subplot(111, projection="3d")
graphics.plot_atoms(
- ax, caffeine, colors, line_width=5, background_color="white",
- zoom=1.5
+ ax, caffeine, colors, line_width=5, background_color="white", zoom=1.5
)
fig.tight_layout()
+
# Create an animation that rotates the molecule about the x-axis
def update(angle):
ax.elev = angle
+
FPS = 50
DURATION = 4
angles = np.linspace(-180, 180, DURATION * FPS)
# Start at 90 degrees
-angles = np.concatenate([
- np.linspace( 90, 180, int(DURATION * FPS * 1/4)),
- np.linspace(-180, 90, int(DURATION * FPS * 3/4))
-])
-animation = FuncAnimation(fig, update, angles, interval=int(1000/FPS))
-plt.show()
\ No newline at end of file
+angles = np.concatenate(
+ [
+ np.linspace(90, 180, int(DURATION * FPS * 1 / 4)),
+ np.linspace(-180, 90, int(DURATION * FPS * 3 / 4)),
+ ]
+)
+animation = FuncAnimation(fig, update, angles, interval=int(1000 / FPS))
+plt.show()
diff --git a/doc/examples/scripts/structure/molecule/peoe_visualization.py b/doc/examples/scripts/structure/molecule/peoe_visualization.py
index d2dbaf66e..c38e51d98 100644
--- a/doc/examples/scripts/structure/molecule/peoe_visualization.py
+++ b/doc/examples/scripts/structure/molecule/peoe_visualization.py
@@ -13,15 +13,14 @@
# Code source: Patrick Kunzmann
# License: BSD 3 clause
-import numpy as np
-from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
-from matplotlib.colors import Normalize
+import numpy as np
from matplotlib.cm import ScalarMappable
+from matplotlib.colors import Normalize
+from sklearn.decomposition import PCA
import biotite.structure as struc
-import biotite.structure.info as info
import biotite.structure.graphics as graphics
-
+import biotite.structure.info as info
# Acetylsalicylic acid
MOLECULE_NAME = "AIN"
@@ -42,7 +41,6 @@
CMAP_NAME = "bwr_r"
-
# Get an atom array for the selected molecule
molecule = info.residue(MOLECULE_NAME)
@@ -71,17 +69,19 @@
colors = color_map(normalized_charges)
# Ball size should be proportional to VdW radius of the respective atom
-ball_sizes = np.array(
- [info.vdw_radius_single(e) for e in molecule.element]
-) * BALL_SCALE
+ball_sizes = (
+ np.array([info.vdw_radius_single(e) for e in molecule.element]) * BALL_SCALE
+)
# Gradient of ray strength
# The ray size is proportional to the absolute charge value
ray_full_sizes = ball_sizes + np.abs(charges) * RAY_SCALE
-ray_sizes = np.array([
- np.linspace(ray_full_sizes[i], ball_sizes[i], N_RAY_STEPS, endpoint=False)
- for i in range(molecule.array_length())
-]).T
+ray_sizes = np.array(
+ [
+ np.linspace(ray_full_sizes[i], ball_sizes[i], N_RAY_STEPS, endpoint=False)
+ for i in range(molecule.array_length())
+ ]
+).T
# The plotting begins here
@@ -92,32 +92,38 @@
# As 'axes.scatter()' uses sizes in points**2,
# the VdW-radii as also squared
graphics.plot_ball_and_stick_model(
- ax, molecule, colors, ball_size=ball_sizes**2, line_width=3,
- line_color=color_map(0.5), background_color=(.05, .05, .05), zoom=1.5
+ ax,
+ molecule,
+ colors,
+ ball_size=ball_sizes**2,
+ line_width=3,
+ line_color=color_map(0.5),
+ background_color=(0.05, 0.05, 0.05),
+ zoom=1.5,
)
# Plot the element labels
for atom in molecule:
ax.text(
- *atom.coord, atom.element,
- fontsize=ELEMENT_FONT_SIZE, color="black",
- ha="center", va="center", zorder=100
+ *atom.coord,
+ atom.element,
+ fontsize=ELEMENT_FONT_SIZE,
+ color="black",
+ ha="center",
+ va="center",
+ zorder=100,
)
# Plot the rays
for i in range(N_RAY_STEPS):
ax.scatter(
- *molecule.coord.T, s=ray_sizes[i]**2, c=colors,
- linewidth=0, alpha=RAY_ALPHA
+ *molecule.coord.T, s=ray_sizes[i] ** 2, c=colors, linewidth=0, alpha=RAY_ALPHA
)
# Plot the colorbar
color_bar = fig.colorbar(
- ScalarMappable(
- norm=Normalize(vmin=-max_charge, vmax=max_charge),
- cmap=color_map
- ),
- ax=ax
+ ScalarMappable(norm=Normalize(vmin=-max_charge, vmax=max_charge), cmap=color_map),
+ ax=ax,
)
color_bar.set_label("Partial charge (e)", color="white")
color_bar.ax.yaxis.set_tick_params(color="white")
@@ -126,4 +132,4 @@
label.set_color("white")
fig.tight_layout()
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/structure/nucleotide/base_pairs.py b/doc/examples/scripts/structure/nucleotide/base_pairs.py
index cde0b9b21..28681901e 100644
--- a/doc/examples/scripts/structure/nucleotide/base_pairs.py
+++ b/doc/examples/scripts/structure/nucleotide/base_pairs.py
@@ -10,15 +10,14 @@
# License: BSD 3 clause
from tempfile import gettempdir
-import biotite
-import biotite.structure.io.pdb as pdb
-import biotite.database.rcsb as rcsb
-import biotite.structure as struc
-import biotite.sequence.graphics as graphics
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
-from matplotlib.patches import Arc
import numpy as np
+from matplotlib.patches import Arc
+import biotite
+import biotite.database.rcsb as rcsb
+import biotite.structure as struc
+import biotite.structure.io.pdb as pdb
# Download the PDB file and read the structure
pdb_file_path = rcsb.fetch("4p5j", "pdb", gettempdir())
@@ -44,10 +43,10 @@
# Setup the axis
ax.set_xlim(0.5, len(residue_ids) + 0.5)
-ax.set_ylim(0, len(residue_ids)/2 + 0.5)
+ax.set_ylim(0, len(residue_ids) / 2 + 0.5)
ax.set_aspect("equal")
ax.xaxis.set_major_locator(ticker.MultipleLocator(3))
-ax.tick_params(axis='both', which='major', labelsize=8)
+ax.tick_params(axis="both", which="major", labelsize=8)
ax.set_yticks([])
# Remove the frame
@@ -55,7 +54,7 @@
# Plot the residue names in order
for residue_name, residue_id in zip(residue_names, residue_ids):
- ax.text(residue_id, 0, residue_name, ha='center', fontsize=8)
+ ax.text(residue_id, 0, residue_name, ha="center", fontsize=8)
# Compute the basepairs and pseudknot order (first result)
base_pairs = struc.base_pairs(nucleotides)
@@ -63,9 +62,7 @@
# Draw the arcs between base pairs
for (base1, base2), order in zip(base_pairs, pseudoknot_order):
- arc_center = (
- np.mean((nucleotides.res_id[base1],nucleotides.res_id[base2])), 1.5
- )
+ arc_center = (np.mean((nucleotides.res_id[base1], nucleotides.res_id[base2])), 1.5)
arc_diameter = abs(nucleotides.res_id[base2] - nucleotides.res_id[base1])
name1 = nucleotides.res_name[base1]
name2 = nucleotides.res_name[base2]
@@ -80,10 +77,16 @@
else:
linestyle = ":"
arc = Arc(
- arc_center, arc_diameter, arc_diameter, theta1=0, theta2=180,
- color=color, linewidth=1.5, linestyle=linestyle
+ arc_center,
+ arc_diameter,
+ arc_diameter,
+ theta1=0,
+ theta2=180,
+ color=color,
+ linewidth=1.5,
+ linestyle=linestyle,
)
ax.add_patch(arc)
# Display the plot
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/structure/nucleotide/leontis_westhof.py b/doc/examples/scripts/structure/nucleotide/leontis_westhof.py
index a8a436f97..460dd573e 100644
--- a/doc/examples/scripts/structure/nucleotide/leontis_westhof.py
+++ b/doc/examples/scripts/structure/nucleotide/leontis_westhof.py
@@ -2,7 +2,7 @@
Leontis-Westhof Nomenclature
============================
-In this example we plot a secondary structure diagram annotated with
+In this example we plot a secondary structure diagram annotated with
Leontis-Westhof nomenclature :footcite:`Leontis2001` of the sarcin-ricin
loop from E. coli (PDB ID: 6ZYB).
"""
@@ -11,14 +11,13 @@
# License: BSD 3 clause
from tempfile import gettempdir
+import matplotlib.pyplot as plt
+import numpy as np
import biotite
-import biotite.structure.io.pdb as pdb
import biotite.database.rcsb as rcsb
import biotite.structure as struc
import biotite.structure.graphics as graphics
-import matplotlib.pyplot as plt
-import numpy as np
-
+import biotite.structure.io.pdb as pdb
# Download the PDB file and read the structure
pdb_file_path = rcsb.fetch("6ZYB", "pdb", gettempdir())
@@ -30,9 +29,9 @@
base_pairs = struc.base_pairs(nucleotides)
glycosidic_bonds = struc.base_pairs_glycosidic_bond(nucleotides, base_pairs)
edges = struc.base_pairs_edge(nucleotides, base_pairs)
-base_pairs = struc.get_residue_positions(
- nucleotides, base_pairs.flatten()
-).reshape(base_pairs.shape)
+base_pairs = struc.get_residue_positions(nucleotides, base_pairs.flatten()).reshape(
+ base_pairs.shape
+)
# Get the one-letter-codes of the bases
base_labels = []
@@ -41,7 +40,7 @@
# Color canonical Watson-Crick base pairs with a darker orange and
# non-canonical base pairs with a lighter orange
-colors = np.full(base_pairs.shape[0], biotite.colors['brightorange'])
+colors = np.full(base_pairs.shape[0], biotite.colors["brightorange"])
for i, (base1, base2) in enumerate(base_pairs):
name1 = base_labels[base1]
name2 = base_labels[base2]
@@ -68,34 +67,33 @@
# Plot the secondary structure
graphics.plot_nucleotide_secondary_structure(
- ax, base_labels, base_pairs, struc.get_residue_count(nucleotides),
- bond_color=colors
+ ax, base_labels, base_pairs, struc.get_residue_count(nucleotides), bond_color=colors
)
# Display the plot
plt.show()
########################################################################
-# The sarcin-ricin loop is part of the 23s rRNA and is considered
+# The sarcin-ricin loop is part of the 23s rRNA and is considered
# crucial to the ribosome‘s activity. The incorporation of the
-# Leontis-Westhof nomenclature into the 2D-plot shows how the individual
-# base pairs are oriented and how their glycosidic bonds are oriented
+# Leontis-Westhof nomenclature into the 2D-plot shows how the individual
+# base pairs are oriented and how their glycosidic bonds are oriented
# relative to each other.
#
-# This visualization enables one to see a pattern that cannot be
-# communicated through the 2D structure alone. The upper part of the
-# sarcin-ricin loop consists of only cis (c) oriented glycosidic bonds.
-# All bases interact through their Watson-Crick edge (W). On the other
-# hand, the lower part of the sarcin ricin loop looks strikingly
-# different. The glycosidic bonds are oriented in cis (c) and trans (t)
-# orientation. The bases interact through all three edges: Watson-Crick
+# This visualization enables one to see a pattern that cannot be
+# communicated through the 2D structure alone. The upper part of the
+# sarcin-ricin loop consists of only cis (c) oriented glycosidic bonds.
+# All bases interact through their Watson-Crick edge (W). On the other
+# hand, the lower part of the sarcin ricin loop looks strikingly
+# different. The glycosidic bonds are oriented in cis (c) and trans (t)
+# orientation. The bases interact through all three edges: Watson-Crick
# (W), Hoogsteen (H), and Sugar (S).
-#
-# Thus, it can be concluded that the upper part of the sarcin ricin loop
-# represents a highly organized helix, while the lower part of the loop
+#
+# Thus, it can be concluded that the upper part of the sarcin ricin loop
+# represents a highly organized helix, while the lower part of the loop
# is comparatively unorganized.
#
# References
# ----------
-#
-# .. footbibliography::
\ No newline at end of file
+#
+# .. footbibliography::
diff --git a/doc/examples/scripts/structure/nucleotide/transfer_rnas.py b/doc/examples/scripts/structure/nucleotide/transfer_rnas.py
index 23823b3d9..5d238b346 100644
--- a/doc/examples/scripts/structure/nucleotide/transfer_rnas.py
+++ b/doc/examples/scripts/structure/nucleotide/transfer_rnas.py
@@ -2,7 +2,7 @@
Comparison of a tRNA-like-structure with a tRNA
===============================================
-In this example we plot a secondary-structure diagram of a tRNA mimic
+In this example we plot a secondary-structure diagram of a tRNA mimic
(PDB ID: 4P5J) from the *turnip yellow mosaic virus* (TYMV) and compare
it to a PHE-tRNA (PDB ID: 1EHZ).
"""
@@ -11,15 +11,16 @@
# License: BSD 3 clause
from tempfile import gettempdir
+import matplotlib.pyplot as plt
+import numpy as np
import biotite
-import biotite.structure.io.pdb as pdb
import biotite.database.rcsb as rcsb
import biotite.structure as struc
import biotite.structure.graphics as graphics
-import matplotlib.pyplot as plt
-import numpy as np
+import biotite.structure.io.pdb as pdb
+
-# Create a function to get the structures and compute information for
+# Create a function to get the structures and compute information for
# the plots.
def plot_rna(pdb_id, axes):
# Download the PDB file and read the structure
@@ -30,15 +31,15 @@ def plot_rna(pdb_id, axes):
# Compute the base pairs and their pseudoknot order
base_pairs = struc.base_pairs(nucleotides)
- base_pairs = struc.get_residue_positions(
- nucleotides, base_pairs.flatten()
- ).reshape(base_pairs.shape)
+ base_pairs = struc.get_residue_positions(nucleotides, base_pairs.flatten()).reshape(
+ base_pairs.shape
+ )
pseudoknot_order = struc.pseudoknots(base_pairs)[0]
# Set the linestyle according to the pseudoknot order
- linestyles = np.full(base_pairs.shape[0], '-', dtype=object)
- linestyles[pseudoknot_order == 1] = '--'
- linestyles[pseudoknot_order == 2] = ':'
+ linestyles = np.full(base_pairs.shape[0], "-", dtype=object)
+ linestyles[pseudoknot_order == 1] = "--"
+ linestyles[pseudoknot_order == 2] = ":"
# Indicate canonical nucleotides with an upper case one-letter-code
# and non-canonical nucleotides with a lower case one-letter-code
@@ -52,7 +53,7 @@ def plot_rna(pdb_id, axes):
# Color canonical Watson-Crick base pairs with a darker orange and
# non-canonical base pairs with a lighter orange
- colors = np.full(base_pairs.shape[0], biotite.colors['brightorange'])
+ colors = np.full(base_pairs.shape[0], biotite.colors["brightorange"])
for i, (base1, base2) in enumerate(base_pairs):
name1 = base_labels[base1]
name2 = base_labels[base2]
@@ -61,37 +62,45 @@ def plot_rna(pdb_id, axes):
# Plot the secondary structure
graphics.plot_nucleotide_secondary_structure(
- axes, base_labels, base_pairs, struc.get_residue_count(nucleotides),
- pseudoknot_order=pseudoknot_order, bond_linestyle=linestyles,
+ axes,
+ base_labels,
+ base_pairs,
+ struc.get_residue_count(nucleotides),
+ pseudoknot_order=pseudoknot_order,
+ bond_linestyle=linestyles,
bond_color=colors,
# Margin to compensate for reduced axis limits in shared axis
- border=0.13
+ border=0.13,
)
# Use the PDB ID to label each plot
axes.set_title(pdb_id, loc="left")
+
# Create a matplotlib pyplot
fig, (ax1, ax2) = plt.subplots(
- 2, 1, figsize=(8.0, 16.0),
+ 2,
+ 1,
+ figsize=(8.0, 16.0),
# Share both axes to ensure eqaul scaling of bath secondary structures
- sharex=True, sharey=True
+ sharex=True,
+ sharey=True,
)
# Plot the secondary structures
-plot_rna('1EHZ', ax1)
-plot_rna('4P5J', ax2)
+plot_rna("1EHZ", ax1)
+plot_rna("4P5J", ax2)
fig.tight_layout()
plt.show()
########################################################################
-# The generated plots show that both structures consist of four hairpin
-# loops. Two of those loops, which are opposite to each other, interact
-# through two pseudoknotted base pairs in the otherwise unpaired loop of
-# the respective hairpin structures. The fact that this interaction was
-# mimicked indicates functional importance.
-#
-# A third hairpin loop is folded towards the centre of the tRNA mimic.
-# This is not the case for the phenylalanine tRNA and thus signifies a
-# major difference between the structures.
\ No newline at end of file
+# The generated plots show that both structures consist of four hairpin
+# loops. Two of those loops, which are opposite to each other, interact
+# through two pseudoknotted base pairs in the otherwise unpaired loop of
+# the respective hairpin structures. The fact that this interaction was
+# mimicked indicates functional importance.
+#
+# A third hairpin loop is folded towards the centre of the tRNA mimic.
+# This is not the case for the phenylalanine tRNA and thus signifies a
+# major difference between the structures.
diff --git a/doc/examples/scripts/structure/nucleotide/watson_crick.py b/doc/examples/scripts/structure/nucleotide/watson_crick.py
index 5ac45ae82..00fbfd33c 100644
--- a/doc/examples/scripts/structure/nucleotide/watson_crick.py
+++ b/doc/examples/scripts/structure/nucleotide/watson_crick.py
@@ -9,13 +9,12 @@
# Code source: Patrick Kunzmann
# License: BSD 3 clause
-import numpy as np
import matplotlib.pyplot as plt
+import numpy as np
+import biotite.database.rcsb as rcsb
import biotite.structure as struc
-import biotite.structure.io.pdbx as pdbx
import biotite.structure.graphics as graphics
-import biotite.database.rcsb as rcsb
-
+import biotite.structure.io.pdbx as pdbx
# Structure of a DNA double helix
pdbx_file = pdbx.BinaryCIFFile.read(rcsb.fetch("1qxb", "bcif"))
@@ -26,13 +25,15 @@
base_pairs = struc.base_pairs(nucleotides)
for i, j in base_pairs:
if (nucleotides.res_name[i], nucleotides.res_name[j]) == ("DG", "DC"):
- guanine, cytosine = [nucleotides[mask] for mask
- in struc.get_residue_masks(nucleotides, [i, j])]
+ guanine, cytosine = [
+ nucleotides[mask] for mask in struc.get_residue_masks(nucleotides, [i, j])
+ ]
break
for i, j in base_pairs:
if (nucleotides.res_name[i], nucleotides.res_name[j]) == ("DA", "DT"):
- adenine, thymine = [nucleotides[mask] for mask
- in struc.get_residue_masks(nucleotides, [i, j])]
+ adenine, thymine = [
+ nucleotides[mask] for mask in struc.get_residue_masks(nucleotides, [i, j])
+ ]
break
pairs = [(guanine, cytosine), (adenine, thymine)]
@@ -41,19 +42,18 @@
# Arrange bases
for i, (purine, pyrimidine) in enumerate(pairs):
- n1, n3, c5, c6 = [pyrimidine[pyrimidine.atom_name == name][0]
- for name in ("N1", "N3", "C5", "C6")]
+ n1, n3, c5, c6 = [
+ pyrimidine[pyrimidine.atom_name == name][0] for name in ("N1", "N3", "C5", "C6")
+ ]
# Pyrimidine N3-C6 axis is aligned to x-axis
purine, pyrimidine = [
- struc.align_vectors(
- base,
- n3.coord - c6.coord,
- np.array([1, 0, 0])
- ) for base in (purine, pyrimidine)
+ struc.align_vectors(base, n3.coord - c6.coord, np.array([1, 0, 0]))
+ for base in (purine, pyrimidine)
]
# Coords are changed -> update 'Atom' objects
- n1, n3, c4, c5 = [pyrimidine[pyrimidine.atom_name == name][0]
- for name in ("N1", "N3", "C4", "C5")]
+ n1, n3, c4, c5 = [
+ pyrimidine[pyrimidine.atom_name == name][0] for name in ("N1", "N3", "C4", "C5")
+ ]
# Pyrimidine base plane normal vector is aligned to z-axis
# Furthermore, distance between bases is set
purine, pyrimidine = [
@@ -61,10 +61,11 @@
base,
np.cross(n3.coord - n1.coord, c5.coord - n1.coord),
np.array([0, 0, 1]),
- origin_position = struc.centroid(purine + pyrimidine),
+ origin_position=struc.centroid(purine + pyrimidine),
# 10 Å separation between pairs
- target_position = np.array([0, 10*i, 0])
- ) for base in (purine, pyrimidine)
+ target_position=np.array([0, 10 * i, 0]),
+ )
+ for base in (purine, pyrimidine)
]
pairs[i] = (purine, pyrimidine)
@@ -73,14 +74,12 @@
atoms = pairs[0][0] + pairs[0][1] + pairs[1][0] + pairs[1][1]
# Color by element
colors = np.zeros((atoms.array_length(), 3))
-colors[atoms.element == "H"] = (0.8, 0.8, 0.8) # gray
-colors[atoms.element == "C"] = (0.2, 0.2, 0.2) # darkgray
-colors[atoms.element == "N"] = (0.0, 0.0, 0.8) # blue
-colors[atoms.element == "O"] = (0.8, 0.0, 0.0) # red
-colors[atoms.element == "P"] = (0.0, 0.6, 0.0) # green
-graphics.plot_atoms(
- ax, atoms, colors, line_width=3, background_color="white", zoom=1.5
-)
+colors[atoms.element == "H"] = (0.8, 0.8, 0.8) # gray
+colors[atoms.element == "C"] = (0.2, 0.2, 0.2) # darkgray
+colors[atoms.element == "N"] = (0.0, 0.0, 0.8) # blue
+colors[atoms.element == "O"] = (0.8, 0.0, 0.0) # red
+colors[atoms.element == "P"] = (0.0, 0.6, 0.0) # green
+graphics.plot_atoms(ax, atoms, colors, line_width=3, background_color="white", zoom=1.5)
# Plot hydrogen bonds
for purine, pyrimidine in pairs:
@@ -102,14 +101,13 @@
for pair in pairs:
for base in pair:
label = base.res_name[0][1]
- ring_center = struc.centroid(base[
- np.isin(base.atom_name, ["N1", "C2", "N3", "C4", "C5", "C6"])
- ])
+ ring_center = struc.centroid(
+ base[np.isin(base.atom_name, ["N1", "C2", "N3", "C4", "C5", "C6"])]
+ )
x, y, z = ring_center
ax.text(
- x, y, z, label,
- fontsize=20, fontweight="bold", va="center", ha="center"
+ x, y, z, label, fontsize=20, fontweight="bold", va="center", ha="center"
)
fig.tight_layout()
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/structure/protein/pb_alignment.py b/doc/examples/scripts/structure/protein/pb_alignment.py
index 5b6dc3818..9a3396ecf 100644
--- a/doc/examples/scripts/structure/protein/pb_alignment.py
+++ b/doc/examples/scripts/structure/protein/pb_alignment.py
@@ -27,15 +27,14 @@
# License: BSD 3 clause
from tempfile import gettempdir
-import numpy as np
import matplotlib.pyplot as plt
+import numpy as np
+import biotite.database.rcsb as rcsb
import biotite.sequence as seq
import biotite.sequence.align as align
import biotite.sequence.graphics as graphics
import biotite.structure as struc
import biotite.structure.io.pdbx as pdbx
-import biotite.database.rcsb as rcsb
-
# PB alphabet
pb_alphabet = seq.LetterAlphabet("abcdefghijklmnop")
@@ -79,13 +78,12 @@
[-35.34, -65.03, -38.12, -66.34, -29.51, -89.10, -2.91, 77.90],
[-45.29, -67.44, -27.72, -87.27, 5.13, 77.49, 30.71, -93.23],
[-27.09, -86.14, 0.30, 59.85, 21.51, -96.30, 132.67, -92.91],
-])
+]) # fmt: skip
# Fetch animal lysoyzme structures
lyso_files = rcsb.fetch(
- ["1REX", "1AKI", "1DKJ", "1GD6"],
- format="bcif", target_path=gettempdir()
+ ["1REX", "1AKI", "1DKJ", "1GD6"], format="bcif", target_path=gettempdir()
)
organisms = ["H. sapiens", "G. gallus", "C. viginianus", "B. mori"]
@@ -106,25 +104,21 @@
# centered on the amino acid to calculate the PB for
# Hence, the PBs are not defined for the two amino acids
# at each terminus
- pb_angles = np.full((len(phi)-4, 8), np.nan)
- pb_angles[:, 0] = psi[ : -4]
- pb_angles[:, 1] = phi[1 : -3]
- pb_angles[:, 2] = psi[1 : -3]
- pb_angles[:, 3] = phi[2 : -2]
- pb_angles[:, 4] = psi[2 : -2]
- pb_angles[:, 5] = phi[3 : -1]
- pb_angles[:, 6] = psi[3 : -1]
- pb_angles[:, 7] = phi[4 : ]
+ pb_angles = np.full((len(phi) - 4, 8), np.nan)
+ pb_angles[:, 0] = psi[:-4]
+ pb_angles[:, 1] = phi[1:-3]
+ pb_angles[:, 2] = psi[1:-3]
+ pb_angles[:, 3] = phi[2:-2]
+ pb_angles[:, 4] = psi[2:-2]
+ pb_angles[:, 5] = phi[3:-1]
+ pb_angles[:, 6] = psi[3:-1]
+ pb_angles[:, 7] = phi[4:]
pb_angles = np.rad2deg(pb_angles)
# Angle RMSD of all reference angles with all actual angles
rmsda = np.sum(
- (
- (
- ref_angles[:, np.newaxis] - pb_angles[np.newaxis, :] + 180
- ) % 360 - 180
- )**2,
- axis=-1
+ ((ref_angles[:, np.newaxis] - pb_angles[np.newaxis, :] + 180) % 360 - 180) ** 2,
+ axis=-1,
)
# Chose PB, where the RMSDA to the reference angle is lowest
# Due to the definition of Biotite symbol codes
@@ -139,7 +133,7 @@
matrix_dict = align.SubstitutionMatrix.dict_from_str(matrix_str)
matrix = align.SubstitutionMatrix(pb_alphabet, pb_alphabet, matrix_dict)
alignment, order, _, _ = align.align_multiple(
- pb_seqs, matrix, gap_penalty=(-500,-100), terminal_penalty=False
+ pb_seqs, matrix, gap_penalty=(-500, -100), terminal_penalty=False
)
# Visualize the alignment
@@ -150,10 +144,15 @@
ax = fig.add_subplot(111)
# The color scheme was generated with the 'Gecos' software
graphics.plot_alignment_type_based(
- ax, alignment, labels=labels, symbols_per_line=45, spacing=2,
- show_numbers=True, color_scheme="flower"
+ ax,
+ alignment,
+ labels=labels,
+ symbols_per_line=45,
+ spacing=2,
+ show_numbers=True,
+ color_scheme="flower",
)
# Organism names in italic
-ax.set_yticklabels(ax.get_yticklabels(), fontdict={"fontstyle":"italic"})
+ax.set_yticklabels(ax.get_yticklabels(), fontdict={"fontstyle": "italic"})
fig.tight_layout()
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/structure/protein/peptide_assembly.py b/doc/examples/scripts/structure/protein/peptide_assembly.py
index 4c07451ad..de9f24704 100644
--- a/doc/examples/scripts/structure/protein/peptide_assembly.py
+++ b/doc/examples/scripts/structure/protein/peptide_assembly.py
@@ -21,19 +21,18 @@
# Code source: Patrick Kunzmann
# License: BSD 3 clause
-from tempfile import NamedTemporaryFile
import itertools
+from tempfile import NamedTemporaryFile
import numpy as np
from numpy.linalg import norm
import biotite.sequence as seq
import biotite.structure as struc
-import biotite.structure.io as strucio
import biotite.structure.info as info
+import biotite.structure.io as strucio
-
-C_N_LENGTH = 1.34
-N_CA_LENGTH = 1.46
-CA_C_LENGTH = 1.54
+C_N_LENGTH = 1.34
+N_CA_LENGTH = 1.46
+CA_C_LENGTH = 1.54
CA_C_N_ANGLE = 114
C_N_CA_ANGLE = 123
@@ -41,96 +40,15 @@
# Reference peptide bond atom coordinates taken from 1l2y:
# CA, C, N, O, H
-peptide_coord = np.array([
- [-8.608, 3.135, -1.618],
- [-7.117, 2.964, -1.897],
- [-6.379, 4.031, -2.228],
- [-6.634, 1.849, -1.758],
- [-6.821, 4.923, -2.394]
-])
-
-
-def create_raw_backbone_coord(number_of_res):
- """
- Create coordinates for straight peptide chain in z-plane.
- The peptide bonds are in trans configuration.
- """
- coord = np.zeros((number_of_res * 3, 3))
- for i, angle, angle_direction, length in zip(
- range(len(coord)),
- itertools.cycle([CA_C_N_ANGLE, C_N_CA_ANGLE, N_CA_C_ANGLE]),
- itertools.cycle([1, -1]),
- itertools.cycle([C_N_LENGTH, N_CA_LENGTH, CA_C_LENGTH])
- ):
- if i == 0:
- coord[i] = [0, 0, 0]
- elif i == 1:
- coord[i] = [0, length, 0]
- else:
- # Rotate about z-axis -> backbone lies in z-plane
- rot_axis = [0, 0, angle_direction]
- # Calculate the coordinates of a new atoms by rotating the previous
- # bond by the given angle
- new_coord = struc.rotate_about_axis(
- coord[i-2],
- axis = rot_axis,
- angle = np.deg2rad(angle),
- support = coord[i-1]
- )
- # Scale bond to correct bond length
- bond_vector = new_coord - coord[i-1]
- coord[i] = coord[i-1] + bond_vector * length / norm(bond_vector)
- return coord
-
-
-def append_residue(chain, residue):
- """
- Append a residue to an existing chain.
- Modify annotation arrays and remove atoms as necessary.
- The atom coordinates are not altered.
- """
- if chain.array_length() == 0:
- # Chain is empty
- residue.res_id[:] = 1
- return residue
-
- last_res_id = chain.res_id[-1]
-
- # Remove atoms removed by peptide bond
- chain = chain[
- (chain.res_id != last_res_id) |
- ~np.isin(
- chain.atom_name,
- ["OXT", "HXT"]
- )
- ]
- residue = residue[
- ~np.isin(
- residue.atom_name,
- ["H2", "H3"]
- )
+PEPTIDE_COORD = np.array(
+ [
+ [-8.608, 3.135, -1.618],
+ [-7.117, 2.964, -1.897],
+ [-6.379, 4.031, -2.228],
+ [-6.634, 1.849, -1.758],
+ [-6.821, 4.923, -2.394],
]
-
- # Increment residue ID for attached residue
- residue.res_id[:] = last_res_id + 1
-
-C_N_LENGTH = 1.34
-N_CA_LENGTH = 1.46
-CA_C_LENGTH = 1.54
-
-CA_C_N_ANGLE = 114
-C_N_CA_ANGLE = 123
-N_CA_C_ANGLE = 110
-
-# Reference peptide bond atom coordinates taken from 1l2y:
-# CA, C, N, O, H
-peptide_coord = np.array([
- [-8.608, 3.135, -1.618],
- [-7.117, 2.964, -1.897],
- [-6.379, 4.031, -2.228],
- [-6.634, 1.849, -1.758],
- [-6.821, 4.923, -2.394]
-])
+)
def create_raw_backbone_coord(number_of_res):
@@ -143,7 +61,7 @@ def create_raw_backbone_coord(number_of_res):
range(len(coord)),
itertools.cycle([CA_C_N_ANGLE, C_N_CA_ANGLE, N_CA_C_ANGLE]),
itertools.cycle([1, -1]),
- itertools.cycle([C_N_LENGTH, N_CA_LENGTH, CA_C_LENGTH])
+ itertools.cycle([C_N_LENGTH, N_CA_LENGTH, CA_C_LENGTH]),
):
if i == 0:
coord[i] = [0, 0, 0]
@@ -155,14 +73,14 @@ def create_raw_backbone_coord(number_of_res):
# Calculate the coordinates of a new atoms by rotating the
# previous bond by the given angle
new_coord = struc.rotate_about_axis(
- coord[i-2],
- axis = rot_axis,
- angle = np.deg2rad(angle),
- support = coord[i-1]
+ coord[i - 2],
+ axis=rot_axis,
+ angle=np.deg2rad(angle),
+ support=coord[i - 1],
)
# Scale bond to correct bond length
- bond_vector = new_coord - coord[i-1]
- coord[i] = coord[i-1] + bond_vector * length / norm(bond_vector)
+ bond_vector = new_coord - coord[i - 1]
+ coord[i] = coord[i - 1] + bond_vector * length / norm(bond_vector)
return coord
@@ -181,18 +99,9 @@ def append_residue(chain, residue):
# Remove atoms removed by peptide bond
chain = chain[
- (chain.res_id != last_res_id) |
- ~np.isin(
- chain.atom_name,
- ["OXT", "HXT"]
- )
- ]
- residue = residue[
- ~np.isin(
- residue.atom_name,
- ["H2", "H3"]
- )
+ (chain.res_id != last_res_id) | ~np.isin(chain.atom_name, ["OXT", "HXT"])
]
+ residue = residue[~np.isin(residue.atom_name, ["H2", "H3"])]
# Increment residue ID for attached residue
residue.res_id[:] = last_res_id + 1
@@ -203,9 +112,7 @@ def append_residue(chain, residue):
# Add peptide bond
index_prev_c = np.where(chain.atom_name == "C")[0][-2]
index_curr_n = np.where(chain.atom_name == "N")[0][-1]
- chain.bonds.add_bond(
- index_prev_c, index_curr_n, struc.BondType.SINGLE
- )
+ chain.bonds.add_bond(index_prev_c, index_curr_n, struc.BondType.SINGLE)
return chain
@@ -213,15 +120,14 @@ def assemble_peptide(sequence):
res_names = [seq.ProteinSequence.convert_letter_1to3(r) for r in sequence]
backbone_coord = create_raw_backbone_coord(len(sequence))
-
chain = struc.AtomArray(0)
for i, res_name in enumerate(res_names):
residue = info.residue(res_name)
# Superimpose residue to corresponding backbone coordinates
_, transformation = struc.superimpose(
- backbone_coord[3*i : 3*i + 3],
- residue.coord[np.isin(residue.atom_name, ["N", "CA", "C"])]
+ backbone_coord[3 * i : 3 * i + 3],
+ residue.coord[np.isin(residue.atom_name, ["N", "CA", "C"])],
)
residue = transformation.apply(residue)
@@ -238,10 +144,9 @@ def assemble_peptide(sequence):
for atom_name in ["N", "H"]
]
_, transformation = struc.superimpose(
- chain.coord[[ca_i, c_i, n_i]],
- peptide_coord[:3]
+ chain.coord[[ca_i, c_i, n_i]], PEPTIDE_COORD[:3]
)
- chain.coord[[o_i, h_i]] = transformation.apply(peptide_coord[3:])
+ chain.coord[[o_i, h_i]] = transformation.apply(PEPTIDE_COORD[3:])
return chain
diff --git a/doc/examples/scripts/structure/protein/peptide_assembly_pymol.py b/doc/examples/scripts/structure/protein/peptide_assembly_pymol.py
index 9ef7d7b2f..7afdc6a06 100644
--- a/doc/examples/scripts/structure/protein/peptide_assembly_pymol.py
+++ b/doc/examples/scripts/structure/protein/peptide_assembly_pymol.py
@@ -1,19 +1,14 @@
-import numpy as np
+import ammolite
from matplotlib.colors import to_rgb
import biotite
import biotite.structure as struc
-import ammolite
-
PNG_SIZE = (1000, 400)
# Define colors
for color_name, color_value in biotite.colors.items():
- ammolite.cmd.set_color(
- "biotite_" + color_name,
- to_rgb(color_value)
- )
+ ammolite.cmd.set_color("biotite_" + color_name, to_rgb(color_value))
# Convert to PyMOL
chain.bonds = struc.connect_via_distances(chain)
@@ -21,14 +16,8 @@
# Visualize as stick model
pymol_obj.show_as("sticks")
-pymol_obj.color(
- "biotite_lightgreen",
- (chain.res_id % 2 == 0) & (chain.element == "C")
-)
-pymol_obj.color(
- "biotite_dimgreen",
- (chain.res_id % 2 != 0) & (chain.element == "C")
-)
+pymol_obj.color("biotite_lightgreen", (chain.res_id % 2 == 0) & (chain.element == "C"))
+pymol_obj.color("biotite_dimgreen", (chain.res_id % 2 != 0) & (chain.element == "C"))
ammolite.cmd.set("depth_cue", 0)
# Adjust camera
@@ -37,4 +26,4 @@
# Save image
ammolite.cmd.ray(*PNG_SIZE)
-ammolite.cmd.png(__image_destination__)
\ No newline at end of file
+ammolite.cmd.png(__image_destination__)
diff --git a/doc/examples/scripts/structure/protein/ramachandran.py b/doc/examples/scripts/structure/protein/ramachandran.py
index 021349d36..806ac283f 100644
--- a/doc/examples/scripts/structure/protein/ramachandran.py
+++ b/doc/examples/scripts/structure/protein/ramachandran.py
@@ -12,34 +12,29 @@
# License: BSD 3 clause
from tempfile import gettempdir
-import biotite.structure as struc
-import biotite.structure.io as strucio
-import biotite.database.rcsb as rcsb
import matplotlib.pyplot as plt
import numpy as np
-from matplotlib import colors
-import scipy.stats as sts
+import biotite.database.rcsb as rcsb
+import biotite.structure as struc
+import biotite.structure.io as strucio
# Download and parse file
file = rcsb.fetch("3vkh", "cif", gettempdir())
atom_array = strucio.load_structure(file)
# Calculate backbone dihedral angles
# from one of the two identical chains in the asymmetric unit
-phi, psi, omega = struc.dihedral_backbone(
- atom_array[atom_array.chain_id == "A"]
-)
+phi, psi, omega = struc.dihedral_backbone(atom_array[atom_array.chain_id == "A"])
# Conversion from radians into degree
-phi *= 180/np.pi
-psi *= 180/np.pi
+phi *= 180 / np.pi
+psi *= 180 / np.pi
# Remove invalid values (NaN) at first and last position
-phi= phi[1:-1]
-psi= psi[1:-1]
+phi = phi[1:-1]
+psi = psi[1:-1]
# Plot density
figure = plt.figure()
ax = figure.add_subplot(111)
-h, xed, yed, image = ax.hist2d(phi, psi, bins=(200, 200),
- cmap="RdYlGn_r", cmin=1)
+h, xed, yed, image = ax.hist2d(phi, psi, bins=(200, 200), cmap="RdYlGn_r", cmin=1)
cbar = figure.colorbar(image, orientation="vertical")
cbar.set_label("Count")
ax.set_aspect("equal")
@@ -49,4 +44,4 @@
ax.set_ylabel(r"$\psi$")
ax.set_title("Ramachandran plot of dynein motor domain")
figure.tight_layout()
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/structure/protein/residue_chirality.py b/doc/examples/scripts/structure/protein/residue_chirality.py
index 9d6d94061..92dd15b87 100644
--- a/doc/examples/scripts/structure/protein/residue_chirality.py
+++ b/doc/examples/scripts/structure/protein/residue_chirality.py
@@ -18,9 +18,9 @@
from tempfile import gettempdir
import numpy as np
+import biotite.database.rcsb as rcsb
import biotite.structure as struc
import biotite.structure.io as strucio
-import biotite.database.rcsb as rcsb
def get_enantiomer(n, ca, c, cb):
@@ -29,16 +29,15 @@ def get_enantiomer(n, ca, c, cb):
# the enantiomer:
# L = 1
# D = -1
- n = np.cross(ca-n, ca-c)
+ n = np.cross(ca - n, ca - c)
sign = np.sign(np.dot(cb - ca, n))
return sign
+
def analyze_chirality(array):
# Filter backbone + CB
array = array[struc.filter_amino_acids(array)]
- array = array[
- (array.atom_name == "CB") | (struc.filter_peptide_backbone(array))
- ]
+ array = array[(array.atom_name == "CB") | (struc.filter_peptide_backbone(array))]
# Iterate over each residue
ids, names = struc.get_residues(array)
enantiomers = np.zeros(len(ids), dtype=int)
@@ -48,10 +47,10 @@ def analyze_chirality(array):
# Glyine -> no chirality
enantiomers[i] = 0
else:
- enantiomers[i] = get_enantiomer(coord[0], coord[1],
- coord[2], coord[3])
+ enantiomers[i] = get_enantiomer(coord[0], coord[1], coord[2], coord[3])
return enantiomers
+
# Fetch and parse structure file
file = rcsb.fetch("1l2y", "bcif", gettempdir())
stack = strucio.load_structure(file)
@@ -62,5 +61,5 @@ def analyze_chirality(array):
# Reflected structures have opposite enantiomers
# Test via reflection at x-y-plane, z -> -z
array_reflect = array.copy()
-array_reflect.coord[:,2] *= -1
-print("1l2y (reflected)", analyze_chirality(array_reflect))
\ No newline at end of file
+array_reflect.coord[:, 2] *= -1
+print("1l2y (reflected)", analyze_chirality(array_reflect))
diff --git a/doc/examples/scripts/structure/protein/sheet_arrangement.py b/doc/examples/scripts/structure/protein/sheet_arrangement.py
index aea9f4e60..930c6a0e3 100644
--- a/doc/examples/scripts/structure/protein/sheet_arrangement.py
+++ b/doc/examples/scripts/structure/protein/sheet_arrangement.py
@@ -17,42 +17,45 @@
# Code source: Patrick Kunzmann
# License: BSD 3 clause
-import numpy as np
-import networkx as nx
import matplotlib.pyplot as plt
+import networkx as nx
+import numpy as np
from matplotlib.patches import FancyArrow
import biotite
-import biotite.structure.io.pdbx as pdbx
import biotite.database.rcsb as rcsb
-
+import biotite.structure.io.pdbx as pdbx
##### OPTIONS #####
PDB_ID = "3AKO"
SHEETS = ["A"]
-FIG_SIZE = (8.0, 4.0) # Figure size in inches
-Y_LIMIT = 2.0 # Vertical plot limits
-SHEET_DISTANCE = 3.0 # Separation of strands in different sheets
-ARROW_TAIL_WITH = 0.4 # Width of the arrow tails
-ARROW_HEAD_WITH = 0.7 # Width of the arrow heads
-ARROW_HEAD_LENGTH = 0.25 # Length of the arrow heads
-ARROW_LINE_WIDTH = 1 # Width of the arrow edges
-ARROW_COLORS = [ # Each chain is colored differently
+FIG_SIZE = (8.0, 4.0) # Figure size in inches
+Y_LIMIT = 2.0 # Vertical plot limits
+SHEET_DISTANCE = 3.0 # Separation of strands in different sheets
+ARROW_TAIL_WITH = 0.4 # Width of the arrow tails
+ARROW_HEAD_WITH = 0.7 # Width of the arrow heads
+ARROW_HEAD_LENGTH = 0.25 # Length of the arrow heads
+ARROW_LINE_WIDTH = 1 # Width of the arrow edges
+ARROW_COLORS = [ # Each chain is colored differently
biotite.colors["darkgreen"],
biotite.colors["dimorange"],
biotite.colors["lightgreen"],
biotite.colors["brightorange"],
]
-CONNECTION_COLOR = "black" # Color of the connection lines
-CONNECTION_LINE_WIDTH = 1.5 # Width of the connection lines
-CONNECTION_HEIGHT = 0.1 # Minimum height of the connection lines
-CONNECTION_SEPARATION = 0.1 # Minimum vertical distance between the connection lines
-RES_ID_HEIGHT = -0.2 # The vertical distance of the residue ID labels from the arrow ends
-RES_ID_FONT_SIZE = 8 # The font size of the residue ID labels
-RES_ID_FONT_WEIGHT = "bold" # The font weight of the residue ID labels
-ADAPTIVE_ARROW_LENGTHS = True # If true, the arrow length is proportional to the number of its residues
-SHOW_SHEET_NAMES = False # If true, the sheets are labeled below the plot
-SHEET_NAME_FONT_SIZE = 14 # The font size of the sheet labels
+CONNECTION_COLOR = "black" # Color of the connection lines
+CONNECTION_LINE_WIDTH = 1.5 # Width of the connection lines
+CONNECTION_HEIGHT = 0.1 # Minimum height of the connection lines
+CONNECTION_SEPARATION = 0.1 # Minimum vertical distance between the connection lines
+RES_ID_HEIGHT = (
+ -0.2
+) # The vertical distance of the residue ID labels from the arrow ends
+RES_ID_FONT_SIZE = 8 # The font size of the residue ID labels
+RES_ID_FONT_WEIGHT = "bold" # The font weight of the residue ID labels
+ADAPTIVE_ARROW_LENGTHS = (
+ True # If true, the arrow length is proportional to the number of its residues
+)
+SHOW_SHEET_NAMES = False # If true, the sheets are labeled below the plot
+SHEET_NAME_FONT_SIZE = 14 # The font size of the sheet labels
##### SNOITPO #####
########################################################################
@@ -73,19 +76,20 @@
if SHEETS is None:
sele = np.full(sheet_order.row_count, True)
else:
- sele = np.array([
- sheet in SHEETS for sheet in sheet_order["sheet_id"].as_array()
- ])
+ sele = np.array([sheet in SHEETS for sheet in sheet_order["sheet_id"].as_array()])
sheet_ids = sheet_order["sheet_id"].as_array()[sele]
is_parallel_list = sheet_order["sense"].as_array()[sele] == "parallel"
-adjacent_strands = np.array([
- (strand_i, strand_j) for strand_i, strand_j in zip(
- sheet_order["range_id_1"].as_array()[sele],
- sheet_order["range_id_2"].as_array()[sele]
- )
-])
+adjacent_strands = np.array(
+ [
+ (strand_i, strand_j)
+ for strand_i, strand_j in zip(
+ sheet_order["range_id_1"].as_array()[sele],
+ sheet_order["range_id_2"].as_array()[sele],
+ )
+ ]
+)
print("Adjacent strands (sheet ID, strand ID):")
for sheet_id, (strand_i, strand_j) in zip(sheet_ids, adjacent_strands):
@@ -105,9 +109,7 @@
sheet_range = bcif_file.block["struct_sheet_range"]
# Again, create a boolean mask that covers the selected sheets
-sele = np.array([
- sheet in sheet_ids for sheet in sheet_range["sheet_id"].as_array()
-])
+sele = np.array([sheet in sheet_ids for sheet in sheet_range["sheet_id"].as_array()])
strand_chain_ids = sheet_range["beg_auth_asym_id"].as_array()[sele]
strand_res_id_begs = sheet_range["beg_auth_seq_id"].as_array(int)[sele]
strand_res_id_ends = sheet_range["end_auth_seq_id"].as_array(int)[sele]
@@ -127,19 +129,21 @@
# i.e. entries with the same chain ID and residue ID
# Duplicate entries appear e.g. in beta-barrel structure files
# Draw one of each duplicate as orphan -> no connections
-non_duplicate_mask = (np.diff(strand_res_id_begs[order], prepend=[-1]) != 0)
+non_duplicate_mask = np.diff(strand_res_id_begs[order], prepend=[-1]) != 0
connections = []
-non_duplicate_indices = np.arange(len(sorted_strand_ids))[non_duplicate_mask]
+non_duplicate_indices = np.arange(len(sorted_strand_ids))[non_duplicate_mask]
for i in range(len(non_duplicate_indices) - 1):
current_i = non_duplicate_indices[i]
- next_i = non_duplicate_indices[i+1]
+ next_i = non_duplicate_indices[i + 1]
if sorted_chain_ids[current_i] != sorted_chain_ids[next_i]:
# No connection between separate chains
continue
- connections.append((
- (sorted_sheet_ids[current_i], sorted_strand_ids[current_i]),
- (sorted_sheet_ids[next_i], sorted_strand_ids[next_i] )
- ))
+ connections.append(
+ (
+ (sorted_sheet_ids[current_i], sorted_strand_ids[current_i]),
+ (sorted_sheet_ids[next_i], sorted_strand_ids[next_i]),
+ )
+ )
print("Connected strands (sheet ID, strand ID):")
for strand_i, strand_j in connections:
@@ -148,18 +152,17 @@
# Save the start and end residue IDs for each strand for labeling
ranges = {
(sheet_id, strand_id): (begin, end)
- for sheet_id, strand_id, begin, end
- in zip(
- sorted_sheet_ids, sorted_strand_ids,
- sorted_res_id_begs, sorted_res_id_ends
+ for sheet_id, strand_id, begin, end in zip(
+ sorted_sheet_ids, sorted_strand_ids, sorted_res_id_begs, sorted_res_id_ends
)
}
# Save the chains ID for each strand for coloring
chain_ids = {
(sheet_id, strand_id): chain_id
- for sheet_id, strand_id, chain_id
- in zip(sorted_sheet_ids, sorted_strand_ids, sorted_chain_ids)
+ for sheet_id, strand_id, chain_id in zip(
+ sorted_sheet_ids, sorted_strand_ids, sorted_chain_ids
+ )
}
unique_chain_ids = np.unique(sorted_chain_ids)
@@ -176,14 +179,15 @@
sheet_graphs = {}
for sheet_id in np.unique(sheet_ids):
# Select only strands from the current sheet
- sheet_mask = (sheet_ids == sheet_id)
- sheet_graphs[sheet_id] = nx.Graph([
- (strand_i, strand_j, {"is_parallel": is_parallel})
- for (strand_i, strand_j), is_parallel in zip(
- adjacent_strands[sheet_mask],
- is_parallel_list[sheet_mask]
- )
- ])
+ sheet_mask = sheet_ids == sheet_id
+ sheet_graphs[sheet_id] = nx.Graph(
+ [
+ (strand_i, strand_j, {"is_parallel": is_parallel})
+ for (strand_i, strand_j), is_parallel in zip(
+ adjacent_strands[sheet_mask], is_parallel_list[sheet_mask]
+ )
+ ]
+ )
########################################################################
# Another missing information is the direction of the plotted arrows,
@@ -199,7 +203,7 @@
# The calculated arrow direction is stored as node attribute.
for graph in sheet_graphs.values():
- initial_strand = adjacent_strands[0,0]
+ initial_strand = adjacent_strands[0, 0]
graph.nodes[initial_strand]["is_upwards"] = True
for strand in graph.nodes:
if strand == initial_strand:
@@ -212,21 +216,15 @@
# yet determined
continue
is_parallel = graph.edges[(strand, adj_strand)]["is_parallel"]
- this_strand_is_upwards.append(
- is_upwards ^ ~is_parallel
- )
+ this_strand_is_upwards.append(is_upwards ^ ~is_parallel)
if len(this_strand_is_upwards) == 0:
- raise ValueError(
- "Cannot determine arrow direction from adjacent strands"
- )
+ raise ValueError("Cannot determine arrow direction from adjacent strands")
elif all(this_strand_is_upwards):
graph.nodes[strand]["is_upwards"] = True
elif not any(this_strand_is_upwards):
graph.nodes[strand]["is_upwards"] = False
else:
- raise ValueError(
- "Conflicting arrow directions from adjacent strands"
- )
+ raise ValueError("Conflicting arrow directions from adjacent strands")
########################################################################
# No we have got all positioning information we need to start plotting.
@@ -234,7 +232,7 @@
fig, ax = plt.subplots(figsize=FIG_SIZE)
### Plot arrows
-MAX_ARROW_LENGTH = 2 # from y=-1 to y=1
+MAX_ARROW_LENGTH = 2 # from y=-1 to y=1
arrow_length_per_seq_length = MAX_ARROW_LENGTH / np.max(
[end - beg + 1 for beg, end in ranges.values()]
)
@@ -280,14 +278,17 @@
dy = -arrow_length
ax.add_patch(
FancyArrow(
- x=pos, y=y, dx=0, dy=dy,
+ x=pos,
+ y=y,
+ dx=0,
+ dy=dy,
length_includes_head=True,
- width = ARROW_TAIL_WITH,
- head_width = ARROW_HEAD_WITH,
- head_length = ARROW_HEAD_LENGTH,
- facecolor = ARROW_COLORS[color_index % len(ARROW_COLORS)],
- edgecolor = CONNECTION_COLOR,
- linewidth = ARROW_LINE_WIDTH,
+ width=ARROW_TAIL_WITH,
+ head_width=ARROW_HEAD_WITH,
+ head_length=ARROW_HEAD_LENGTH,
+ facecolor=ARROW_COLORS[color_index % len(ARROW_COLORS)],
+ edgecolor=CONNECTION_COLOR,
+ linewidth=ARROW_LINE_WIDTH,
)
)
# Start and end coordinates of the respective arrow
@@ -299,10 +300,12 @@
# Plot the short connections at low height
# to decrease line intersections
# -> sort connections by length of connection
-order = np.argsort([
- np.abs(coord_dict[strand_i][0][0] - coord_dict[strand_j][0][0])
- for strand_i, strand_j in connections
-])
+order = np.argsort(
+ [
+ np.abs(coord_dict[strand_i][0][0] - coord_dict[strand_j][0][0])
+ for strand_i, strand_j in connections
+ ]
+)
connections = [connections[i] for i in order]
for i, (strand_i, strand_j) in enumerate(connections):
horizontal_line_height = 1 + CONNECTION_HEIGHT + i * CONNECTION_SEPARATION
@@ -311,17 +314,12 @@
if np.sign(coord_i_end[1]) == np.sign(coord_j_beg[1]):
# Start and end are on the same side of the arrows
- x = (
- coord_i_end[0],
- coord_i_end[0],
- coord_j_beg[0],
- coord_j_beg[0]
- )
+ x = (coord_i_end[0], coord_i_end[0], coord_j_beg[0], coord_j_beg[0])
y = (
coord_i_end[1],
np.sign(coord_i_end[1]) * horizontal_line_height,
np.sign(coord_j_beg[1]) * horizontal_line_height,
- coord_j_beg[1]
+ coord_j_beg[1],
)
else:
# Start and end are on different sides
@@ -332,7 +330,7 @@
coord_i_end[0] + offset,
coord_i_end[0] + offset,
coord_j_beg[0],
- coord_j_beg[0]
+ coord_j_beg[0],
)
y = (
coord_i_end[1],
@@ -340,14 +338,15 @@
np.sign(coord_i_end[1]) * horizontal_line_height,
np.sign(coord_j_beg[1]) * horizontal_line_height,
np.sign(coord_j_beg[1]) * horizontal_line_height,
- coord_j_beg[1]
+ coord_j_beg[1],
)
ax.plot(
- x, y,
- color = CONNECTION_COLOR,
- linewidth = CONNECTION_LINE_WIDTH,
+ x,
+ y,
+ color=CONNECTION_COLOR,
+ linewidth=CONNECTION_LINE_WIDTH,
# Avoid intersection of the line's end with the arrow
- solid_capstyle = "butt"
+ solid_capstyle="butt",
)
### Plot residue ID labels
@@ -358,16 +357,16 @@
coord[0],
np.sign(coord[1]) * (np.abs(coord[1]) + RES_ID_HEIGHT),
str(res_id),
- ha="center", va="center",
- fontsize=RES_ID_FONT_SIZE, weight=RES_ID_FONT_WEIGHT
+ ha="center",
+ va="center",
+ fontsize=RES_ID_FONT_SIZE,
+ weight=RES_ID_FONT_WEIGHT,
)
### Plot sheet names as x-axis ticks
if SHOW_SHEET_NAMES:
tick_pos = [
- np.mean([
- coord_dict[key][0][0] for key in coord_dict if key[0] == sheet_id
- ])
+ np.mean([coord_dict[key][0][0] for key in coord_dict if key[0] == sheet_id])
for sheet_id in sheet_ids
]
ax.set_xticks(tick_pos)
@@ -375,8 +374,11 @@
ax.set_frame_on(False)
ax.yaxis.set_visible(False)
ax.xaxis.set_tick_params(
- bottom=False, top=False, labelbottom=True, labeltop=False,
- labelsize=SHEET_NAME_FONT_SIZE
+ bottom=False,
+ top=False,
+ labelbottom=True,
+ labeltop=False,
+ labelsize=SHEET_NAME_FONT_SIZE,
)
else:
ax.axis("off")
@@ -385,4 +387,4 @@
ax.set_xlim(-1, current_position - SHEET_DISTANCE + 1)
ax.set_ylim(-Y_LIMIT, Y_LIMIT)
fig.tight_layout()
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/examples/scripts/structure/protein/transketolase_sse.py b/doc/examples/scripts/structure/protein/transketolase_sse.py
index 78f3ba546..5c0896e2b 100644
--- a/doc/examples/scripts/structure/protein/transketolase_sse.py
+++ b/doc/examples/scripts/structure/protein/transketolase_sse.py
@@ -14,25 +14,24 @@
# License: BSD 3 clause
from tempfile import gettempdir
-import numpy as np
import matplotlib.pyplot as plt
+import numpy as np
from matplotlib.patches import Rectangle
import biotite
-import biotite.structure as struc
-import biotite.structure.io.pdbx as pdbx
+import biotite.application.dssp as dssp
+import biotite.database.entrez as entrez
+import biotite.database.rcsb as rcsb
import biotite.sequence as seq
import biotite.sequence.graphics as graphics
import biotite.sequence.io.genbank as gb
-import biotite.database.rcsb as rcsb
-import biotite.database.entrez as entrez
-import biotite.application.dssp as dssp
-
+import biotite.structure as struc
+import biotite.structure.io.pdbx as pdbx
# Create 'FeaturePlotter' subclasses
# for drawing the scondary structure features
-class HelixPlotter(graphics.FeaturePlotter):
+class HelixPlotter(graphics.FeaturePlotter):
def __init__(self):
pass
@@ -48,12 +47,12 @@ def matches(self, feature):
def draw(self, axes, feature, bbox, loc, style_param):
# Approx. 1 turn per 3.6 residues to resemble natural helix
n_turns = np.ceil((loc.last - loc.first + 1) / 3.6)
- x_val = np.linspace(0, n_turns * 2*np.pi, 100)
+ x_val = np.linspace(0, n_turns * 2 * np.pi, 100)
# Curve ranges from 0.3 to 0.7
- y_val = (-0.4*np.sin(x_val) + 1) / 2
+ y_val = (-0.4 * np.sin(x_val) + 1) / 2
# Transform values for correct location in feature map
- x_val *= bbox.width / (n_turns * 2*np.pi)
+ x_val *= bbox.width / (n_turns * 2 * np.pi)
x_val += bbox.x0
y_val *= bbox.height
y_val += bbox.y0
@@ -63,18 +62,14 @@ def draw(self, axes, feature, bbox, loc, style_param):
bbox.p0, bbox.width, bbox.height, color="white", linewidth=0
)
axes.add_patch(background)
- axes.plot(
- x_val, y_val, linewidth=2, color=biotite.colors["dimgreen"]
- )
+ axes.plot(x_val, y_val, linewidth=2, color=biotite.colors["dimgreen"])
class SheetPlotter(graphics.FeaturePlotter):
-
def __init__(self, head_width=0.8, tail_width=0.5):
self._head_width = head_width
self._tail_width = tail_width
-
def matches(self, feature):
if feature.key == "SecStr":
if "sec_str_type" in feature.qual:
@@ -84,39 +79,52 @@ def matches(self, feature):
def draw(self, axes, feature, bbox, loc, style_param):
x = bbox.x0
- y = bbox.y0 + bbox.height/2
+ y = bbox.y0 + bbox.height / 2
dx = bbox.width
dy = 0
- if loc.defect & seq.Location.Defect.MISS_RIGHT:
+ if loc.defect & seq.Location.Defect.MISS_RIGHT:
# If the feature extends into the prevoius or next line
# do not draw an arrow head
draw_head = False
else:
draw_head = True
- axes.add_patch(biotite.AdaptiveFancyArrow(
- x, y, dx, dy,
- self._tail_width*bbox.height, self._head_width*bbox.height,
- # Create head with 90 degrees tip
- # -> head width/length ratio = 1/2
- head_ratio=0.5, draw_head=draw_head,
- color=biotite.colors["orange"], linewidth=0
- ))
+ axes.add_patch(
+ biotite.AdaptiveFancyArrow(
+ x,
+ y,
+ dx,
+ dy,
+ self._tail_width * bbox.height,
+ self._head_width * bbox.height,
+ # Create head with 90 degrees tip
+ # -> head width/length ratio = 1/2
+ head_ratio=0.5,
+ draw_head=draw_head,
+ color=biotite.colors["orange"],
+ linewidth=0,
+ )
+ )
# Test our drawing functions with example annotation
-annotation = seq.Annotation([
- seq.Feature("SecStr", [seq.Location(10, 40)], {"sec_str_type" : "helix"}),
- seq.Feature("SecStr", [seq.Location(60, 90)], {"sec_str_type" : "sheet"}),
-])
+annotation = seq.Annotation(
+ [
+ seq.Feature("SecStr", [seq.Location(10, 40)], {"sec_str_type": "helix"}),
+ seq.Feature("SecStr", [seq.Location(60, 90)], {"sec_str_type": "sheet"}),
+ ]
+)
fig = plt.figure(figsize=(8.0, 0.8))
ax = fig.add_subplot(111)
graphics.plot_feature_map(
- ax, annotation, multi_line=False, loc_range=(1,100),
+ ax,
+ annotation,
+ multi_line=False,
+ loc_range=(1, 100),
# Register our drawing functions
- feature_plotters=[HelixPlotter(), SheetPlotter()]
+ feature_plotters=[HelixPlotter(), SheetPlotter()],
)
fig.tight_layout()
@@ -138,11 +146,14 @@ def draw(self, axes, feature, bbox, loc, style_param):
fig = plt.figure(figsize=(8.0, 3.0))
ax = fig.add_subplot(111)
graphics.plot_feature_map(
- ax, annotation, symbols_per_line=150,
- show_numbers=True, show_line_position=True,
+ ax,
+ annotation,
+ symbols_per_line=150,
+ show_numbers=True,
+ show_line_position=True,
# 'loc_range' takes exclusive stop -> length+1 is required
- loc_range=(1,length+1),
- feature_plotters=[HelixPlotter(), SheetPlotter()]
+ loc_range=(1, length + 1),
+ feature_plotters=[HelixPlotter(), SheetPlotter()],
)
fig.tight_layout()
@@ -152,14 +163,17 @@ def draw(self, axes, feature, bbox, loc, style_param):
# Converter for the DSSP secondary structure elements
# to the classical ones
-dssp_to_abc = {"I" : "c",
- "S" : "c",
- "H" : "a",
- "E" : "b",
- "G" : "c",
- "B" : "b",
- "T" : "c",
- "C" : "c"}
+dssp_to_abc = {
+ "I": "c",
+ "S": "c",
+ "H": "a",
+ "E": "b",
+ "G": "c",
+ "B": "b",
+ "T": "c",
+ "C": "c",
+}
+
def visualize_secondary_structure(sse, first_id):
"""
@@ -176,7 +190,7 @@ def _add_sec_str(annotation, first, last, str_type):
# coil
return
feature = seq.Feature(
- "SecStr", [seq.Location(first, last)], {"sec_str_type" : str_type}
+ "SecStr", [seq.Location(first, last)], {"sec_str_type": str_type}
)
annotation.add_feature(feature)
@@ -190,25 +204,29 @@ def _add_sec_str(annotation, first, last, str_type):
curr_start = i
curr_sse = sse[i]
else:
- if sse[i] != sse[i-1]:
+ if sse[i] != sse[i - 1]:
_add_sec_str(
- annotation, curr_start+first_id, i-1+first_id, curr_sse
+ annotation, curr_start + first_id, i - 1 + first_id, curr_sse
)
curr_start = i
curr_sse = sse[i]
# Add last secondary structure element to annotation
- _add_sec_str(annotation, curr_start+first_id, i-1+first_id, curr_sse)
+ _add_sec_str(annotation, curr_start + first_id, i - 1 + first_id, curr_sse)
fig = plt.figure(figsize=(8.0, 3.0))
ax = fig.add_subplot(111)
graphics.plot_feature_map(
- ax, annotation, symbols_per_line=150,
- loc_range=(first_id, first_id+len(sse)),
- show_numbers=True, show_line_position=True,
- feature_plotters=[HelixPlotter(), SheetPlotter()]
+ ax,
+ annotation,
+ symbols_per_line=150,
+ loc_range=(first_id, first_id + len(sse)),
+ show_numbers=True,
+ show_line_position=True,
+ feature_plotters=[HelixPlotter(), SheetPlotter()],
)
fig.tight_layout()
+
# Fetch and load structure
file_name = rcsb.fetch("1QGD", "bcif", gettempdir())
pdbx_file = pdbx.BinaryCIFFile.read(file_name)
@@ -230,4 +248,4 @@ def _add_sec_str(annotation, first, last, str_type):
sse = struc.annotate_sse(array, chain_id="A")
visualize_secondary_structure(sse, tk_mono.res_id[0])
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/doc/key.py b/doc/key.py
index b517f37fb..b7a2b4334 100644
--- a/doc/key.py
+++ b/doc/key.py
@@ -8,4 +8,4 @@ def set_ncbi_api_key_from_env(*args, **kwargs):
ncbi_api_key = os.environ.get("NCBI_API_KEY")
if ncbi_api_key is not None and ncbi_api_key != "":
- entrez.set_api_key(ncbi_api_key)
\ No newline at end of file
+ entrez.set_api_key(ncbi_api_key)
diff --git a/doc/scraper.py b/doc/scraper.py
index ac60188e6..c9fd629ce 100644
--- a/doc/scraper.py
+++ b/doc/scraper.py
@@ -1,12 +1,11 @@
-import shutil
import copy
-import sys
import os
-from os.path import splitext, join, dirname, isfile
-from sphinx_gallery.scrapers import figure_rst
-from sphinx_gallery.py_source_parser import extract_file_config
+import shutil
+import sys
+from os.path import dirname, isfile, join, splitext
from sphinx.errors import ExtensionError
-
+from sphinx_gallery.py_source_parser import extract_file_config
+from sphinx_gallery.scrapers import figure_rst
STATIC_IMAGE_COMMAND = "static_image"
PYMOL_IMAGE_COMMAND = "ammolite_script"
@@ -19,7 +18,7 @@ def static_image_scraper(block, block_vars, gallery_conf):
# Search for `sphinx_gallery_static_image` commands
block_conf = extract_file_config(code)
if STATIC_IMAGE_COMMAND not in block_conf:
- return figure_rst([], gallery_conf['src_dir'])
+ return figure_rst([], gallery_conf["src_dir"])
image_sources = [
join(script_dir, image_name.strip())
@@ -29,7 +28,7 @@ def static_image_scraper(block, block_vars, gallery_conf):
# Copy the images into the 'gallery' directory under a canonical
# sphinx-gallery name
image_destinations = []
- image_path_iterator = block_vars['image_path_iterator']
+ image_path_iterator = block_vars["image_path_iterator"]
for image in image_sources:
suffix = splitext(image)[1]
image_destination = image_path_iterator.next()
@@ -40,7 +39,7 @@ def static_image_scraper(block, block_vars, gallery_conf):
shutil.copy(image, image_destination)
# Generate rST for detected image files
- return figure_rst(image_destinations, gallery_conf['src_dir'])
+ return figure_rst(image_destinations, gallery_conf["src_dir"])
def pymol_scraper(block, block_vars, gallery_conf):
@@ -48,7 +47,7 @@ def pymol_scraper(block, block_vars, gallery_conf):
block_conf = extract_file_config(code)
# Search for a `sphinx_gallery_ammolite_script` command
if PYMOL_IMAGE_COMMAND not in block_conf:
- return figure_rst([], gallery_conf['src_dir'])
+ return figure_rst([], gallery_conf["src_dir"])
script_dir = dirname(block_vars["src_file"])
pymol_script_path = join(script_dir, block_conf[PYMOL_IMAGE_COMMAND])
@@ -56,7 +55,7 @@ def pymol_scraper(block, block_vars, gallery_conf):
# the example script
# -> the image will be included in version control
# -> Rendering with PyMOL is not necessary for building the docs
- pymol_image_path = splitext(block_vars["src_file"])[0] + ".png"
+ pymol_image_path = splitext(block_vars["src_file"])[0] + ".png"
if not isfile(pymol_script_path):
raise ExtensionError(
f"'{block_vars['src_file']}' has no corresponding "
@@ -64,8 +63,8 @@ def pymol_scraper(block, block_vars, gallery_conf):
)
try:
- import pymol
- import ammolite
+ import ammolite # noqa: F401
+ import pymol # noqa: F401
except ImportError:
# If Ammolite is not installed, fall back to the image file,
# if already existing
@@ -82,7 +81,7 @@ def pymol_scraper(block, block_vars, gallery_conf):
# to STDOUT or STDERR
# -> Save original STDOUT/STDERR and point them
# temporarily to DEVNULL
- dev_null = open(os.devnull, 'w')
+ dev_null = open(os.devnull, "w")
orig_stdout = sys.stdout
orig_stderr = sys.stderr
sys.stdout = dev_null
@@ -100,13 +99,12 @@ def pymol_scraper(block, block_vars, gallery_conf):
dev_null.close()
if not isfile(pymol_image_path):
raise ExtensionError(
- "PyMOL script did not create an image "
- "(at expected location)"
+ "PyMOL script did not create an image " "(at expected location)"
)
# Copy the images into the 'gallery' directory under a canonical
# sphinx-gallery name
- image_path_iterator = block_vars['image_path_iterator']
+ image_path_iterator = block_vars["image_path_iterator"]
image_destination = image_path_iterator.next()
shutil.copy(pymol_image_path, image_destination)
- return figure_rst([image_destination], gallery_conf['src_dir'])
+ return figure_rst([image_destination], gallery_conf["src_dir"])
diff --git a/doc/switcher.py b/doc/switcher.py
index 974715613..095d30a85 100644
--- a/doc/switcher.py
+++ b/doc/switcher.py
@@ -3,16 +3,15 @@
# information.
__author__ = "Patrick Kunzmann"
-__all__ = ["create_api_doc", "skip_non_methods"]
+__all__ = ["create_switcher_json"]
-from dataclasses import dataclass
-from pathlib import Path
import json
import re
+from dataclasses import dataclass
import requests
import biotite
-RELEASE_REQUEST = f"https://api.github.com/repos/biotite-dev/biotite/releases"
+RELEASE_REQUEST = "https://api.github.com/repos/biotite-dev/biotite/releases"
BIOTITE_URL = "https://www.biotite-python.org"
SEMVER_TAG_REGEX = r"^v(\d+)\.(\d+)\.(\d+)"
@@ -35,18 +34,17 @@ def __str__(self):
return f"{self.major}.{self.minor}.{self.patch}"
def __ge__(self, other):
- return (
- (self.major, self.minor, self.patch)
- >= (other.major, other.minor, other.patch)
+ return (self.major, self.minor, self.patch) >= (
+ other.major,
+ other.minor,
+ other.patch,
)
def _get_previous_versions(min_tag, n_versions):
response = requests.get(RELEASE_REQUEST, params={"per_page": n_versions})
release_data = json.loads(response.text)
- versions = [
- Version.from_tag(release["tag_name"]) for release in release_data
- ]
+ versions = [Version.from_tag(release["tag_name"]) for release in release_data]
return [version for version in versions if version >= Version.from_tag(min_tag)]
@@ -69,17 +67,21 @@ def create_switcher_json(file_path, min_tag, n_versions):
"""
version_config = []
for version in _get_previous_versions(min_tag, n_versions)[::-1]:
- version_config.append({
- "name": f"{version.major}.{version.minor}",
- "version": str(version),
- "url": f"{BIOTITE_URL}/{version}/",
- })
+ version_config.append(
+ {
+ "name": f"{version.major}.{version.minor}",
+ "version": str(version),
+ "url": f"{BIOTITE_URL}/{version}/",
+ }
+ )
current_version = _get_current_version()
- version_config.append({
- "name": f"{current_version.major}.{current_version.minor}",
- "version": str(current_version),
- "url": f"{BIOTITE_URL}/{current_version}/",
- "preferred": True
- })
+ version_config.append(
+ {
+ "name": f"{current_version.major}.{current_version.minor}",
+ "version": str(current_version),
+ "url": f"{BIOTITE_URL}/{current_version}/",
+ "preferred": True,
+ }
+ )
with open(file_path, "w") as file:
json.dump(version_config, file, indent=4)
diff --git a/doc/viewcode.py b/doc/viewcode.py
index d828f960f..ec0b28974 100644
--- a/doc/viewcode.py
+++ b/doc/viewcode.py
@@ -10,10 +10,10 @@
__author__ = "Patrick Kunzmann"
__all__ = ["linkcode_resolve"]
+import inspect
from importlib import import_module
-from os.path import dirname, join, isdir, splitext
from os import listdir
-import inspect
+from os.path import dirname, isdir, join, splitext
import biotite
@@ -66,10 +66,13 @@ def _index_attributes(package_name, src_path):
# Import all modules in directory and index attributes
source_files = [
- file_name for file_name in directory_content
- if file_name != "__init__.py" and (
+ file_name
+ for file_name in directory_content
+ if file_name != "__init__.py"
+ and (
# Standard Python modules
- file_name.endswith(".py") or
+ file_name.endswith(".py")
+ or
# Extension modules
file_name.endswith(".pyx")
)
@@ -83,9 +86,7 @@ def _index_attributes(package_name, src_path):
module = import_module(module_name)
if not hasattr(module, "__all__"):
- raise AttributeError(
- f"Module {module_name} has not attribute '__all__'"
- )
+ raise AttributeError(f"Module {module_name} has not attribute '__all__'")
# Only index attributes from modules that are available
# via respective Biotite (sub-)package
# If a the attribute is available, the module was imported in
@@ -98,8 +99,7 @@ def _index_attributes(package_name, src_path):
is_cython = source_file.endswith(".pyx")
for attribute in module.__all__:
- attribute_index[(package_name, attribute)] \
- = (module_name, is_cython)
+ attribute_index[(package_name, attribute)] = (module_name, is_cython)
if is_cython:
with open(join(src_path, source_file), "r") as cython_file:
lines = cython_file.read().splitlines()
@@ -144,16 +144,14 @@ def _index_cython_code(code_lines):
continue
if line.startswith(("def")):
- attr_type = "def"
# Get name of the function:
# Remove 'def' from line...
cropped_line = stripped_line[3:].strip()
# ...and determine the end of the name by finding the
# subsequent '('
- cropped_line = cropped_line[:cropped_line.index("(")].strip()
+ cropped_line = cropped_line[: cropped_line.index("(")].strip()
attr_name = cropped_line
elif line.startswith(("class", "cdef class")):
- attr_type = "class"
cropped_line = stripped_line
# Get name of the class:
# Remove potential 'cdef' from line...
@@ -163,8 +161,11 @@ def _index_cython_code(code_lines):
cropped_line = cropped_line[5:].strip()
# ...and determine the end of the name by finding the
# subsequent '(' or ':'
- index = cropped_line.index("(") if "(" in cropped_line \
- else cropped_line.index(":")
+ index = (
+ cropped_line.index("(")
+ if "(" in cropped_line
+ else cropped_line.index(":")
+ )
cropped_line = cropped_line[:index].strip()
attr_name = cropped_line
else:
@@ -172,8 +173,8 @@ def _index_cython_code(code_lines):
continue
attr_line_start = i
- attr_line_stop = i+1
- for j in range(i+1, len(code_lines)):
+ attr_line_stop = i + 1
+ for j in range(i + 1, len(code_lines)):
attr_line = code_lines[j]
if len(attr_line.strip()) == 0 or attr_line.strip()[0] == "#":
continue
@@ -189,7 +190,7 @@ def _index_cython_code(code_lines):
# 'One' based indexing
attr_line_start + 1,
# 'One' based indexing and inclusive stop
- attr_line_stop
+ attr_line_stop,
)
return line_index
@@ -203,7 +204,7 @@ def _is_package(path):
_attribute_index, _cython_line_index = _index_attributes(
"biotite",
# Directory to src/biotite
- join(dirname(dirname(__file__)), "src", "biotite")
+ join(dirname(dirname(__file__)), "src", "biotite"),
)
@@ -226,17 +227,11 @@ def linkcode_resolve(domain, info):
if is_cython:
if (package_name, attr_name) in _cython_line_index:
first, last = _cython_line_index[(package_name, attr_name)]
- return (
- base_url +
- f"{module_name.replace('.', '/')}.pyx#L{first}-L{last}"
- )
+ return base_url + f"{module_name.replace('.', '/')}.pyx#L{first}-L{last}"
else:
# In case the attribute is not found
# by the Cython code analyzer
- return (
- base_url +
- f"{module_name.replace('.', '/')}.pyx"
- )
+ return base_url + f"{module_name.replace('.', '/')}.pyx"
else:
module = import_module(module_name)
@@ -255,7 +250,4 @@ def linkcode_resolve(domain, info):
source_lines, first = inspect.getsourcelines(obj)
last = first + len(source_lines) - 1
- return (
- base_url +
- f"{module_name.replace('.', '/')}.py#L{first}-L{last}"
- )
\ No newline at end of file
+ return base_url + f"{module_name.replace('.', '/')}.py#L{first}-L{last}"
diff --git a/environment.yml b/environment.yml
index 8e484d75c..9e2017a35 100644
--- a/environment.yml
+++ b/environment.yml
@@ -24,6 +24,8 @@ dependencies:
# Testing
# - mdtraj >=1.9.3, <1.10 # tempoarily disabled due to incompatibility with numpy 2.0
- pytest >=7.0
+ # Code style
+ - ruff =0.5.0
# Interfaced software in biotite.application (can also be installed separately)
- autodock-vina
- clustalo
diff --git a/pyproject.toml b/pyproject.toml
index fe580be37..d4016888c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,6 +37,42 @@ homepage = "https://www.biotite-python.org"
repository = "https://github.com/biotite-dev/biotite"
documentation = "https://www.biotite-python.org"
+[tool.ruff.lint]
+# pyflakes, pycodestyle, isort, flake8-tidy-imports and variable naming
+select = ["F", "E", "W", "I", "TID", "N"]
+ignore = [
+ # In docstrings long lines are often intentional
+# Most other occasions are caught by the ruff formatter
+ "E501",
+ # Due to constants and class placeholders defined in functions
+ "N806",
+]
+
+[tool.ruff.lint.per-file-ignores]
+# Due to `* import` of BCIF encoding
+"setup_ccd.py" = ["F405", "F403"]
+# Due to imports after the PATH has been adjusted
+"doc/conf.py" = ["E402"]
+# Due to `from .module import *` imports in `__init__.py` modules
+"__init__.py" = ["F403", "TID252"]
+# Due to pymol scripts that are evaluated in other example scripts
+"doc/examples/**/*_pymol.py" = ["F821"]
+
+[tool.ruff.lint.flake8-tidy-imports]
+ban-relative-imports = "all"
+
+[tool.ruff.lint.isort]
+# No separator lines between import sections
+no-lines-before = [
+ "future",
+ "standard-library",
+ "third-party",
+ "first-party",
+ "local-folder",
+]
+order-by-type = true
+known-first-party = ["biotite"]
+
[tool.hatch.build.targets.sdist]
exclude = [
"tests",
diff --git a/setup_ccd.py b/setup_ccd.py
index 07218964d..a3351c205 100644
--- a/setup_ccd.py
+++ b/setup_ccd.py
@@ -1,13 +1,14 @@
import gzip
import logging
-from io import StringIO
from dataclasses import dataclass
+from io import StringIO
+from pathlib import Path
import numpy as np
import requests
from biotite.structure.io.pdbx import *
-class ComponentException(Exception):
+class ComponentError(Exception):
pass
@@ -28,6 +29,7 @@ class ColumnInfo:
The name of an alternative column to use, if the original column
contains masked values and no `fill_value` is given.
"""
+
dtype: ...
encoding: ...
fill_value: ... = None
@@ -37,67 +39,75 @@ class ColumnInfo:
MAIN_COLUMNS = {
"id": ColumnInfo(
"U5",
- [StringArrayEncoding(
- data_encoding=[
- DeltaEncoding(src_type=TypeCode.INT32),
- RunLengthEncoding(),
- IntegerPackingEncoding(byte_count=2, is_unsigned=True),
- ByteArrayEncoding()
- ],
- offset_encoding=[
- DeltaEncoding(src_type=TypeCode.INT32),
- IntegerPackingEncoding(byte_count=1, is_unsigned=True),
- ByteArrayEncoding()
- ]
- )]
+ [
+ StringArrayEncoding(
+ data_encoding=[
+ DeltaEncoding(src_type=TypeCode.INT32),
+ RunLengthEncoding(),
+ IntegerPackingEncoding(byte_count=2, is_unsigned=True),
+ ByteArrayEncoding(),
+ ],
+ offset_encoding=[
+ DeltaEncoding(src_type=TypeCode.INT32),
+ IntegerPackingEncoding(byte_count=1, is_unsigned=True),
+ ByteArrayEncoding(),
+ ],
+ )
+ ],
),
"name": ColumnInfo(
str,
- [StringArrayEncoding(
- # The unique strings in the column are sorted
- # -> Indices do not follow distinct pattern
- data_encoding=[ByteArrayEncoding(type=TypeCode.INT32)],
- offset_encoding=[
- DeltaEncoding(src_type=TypeCode.INT32),
- IntegerPackingEncoding(byte_count=1, is_unsigned=True),
- ByteArrayEncoding()
- ]
- )]
+ [
+ StringArrayEncoding(
+ # The unique strings in the column are sorted
+ # -> Indices do not follow distinct pattern
+ data_encoding=[ByteArrayEncoding(type=TypeCode.INT32)],
+ offset_encoding=[
+ DeltaEncoding(src_type=TypeCode.INT32),
+ IntegerPackingEncoding(byte_count=1, is_unsigned=True),
+ ByteArrayEncoding(),
+ ],
+ )
+ ],
),
"type": ColumnInfo(
str,
- [StringArrayEncoding(
- # The unique strings in the column are sorted
- # -> Indices do not follow distinct pattern
- data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)],
- offset_encoding=[
- DeltaEncoding(src_type=TypeCode.INT32),
- IntegerPackingEncoding(byte_count=1, is_unsigned=True),
- ByteArrayEncoding()
- ]
- )]
+ [
+ StringArrayEncoding(
+ # The unique strings in the column are sorted
+ # -> Indices do not follow distinct pattern
+ data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)],
+ offset_encoding=[
+ DeltaEncoding(src_type=TypeCode.INT32),
+ IntegerPackingEncoding(byte_count=1, is_unsigned=True),
+ ByteArrayEncoding(),
+ ],
+ )
+ ],
),
"formula_weight": ColumnInfo(
"f8",
[
FixedPointEncoding(factor=1000, src_type=TypeCode.FLOAT64),
- ByteArrayEncoding()
+ ByteArrayEncoding(),
],
- fill_value=0
+ fill_value=0,
),
"one_letter_code": ColumnInfo(
"U1",
- [StringArrayEncoding(
- # The unique strings in the column are sorted
- # -> Indices do not follow distinct pattern
- data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)],
- offset_encoding=[
- DeltaEncoding(src_type=TypeCode.INT32),
- IntegerPackingEncoding(byte_count=1, is_unsigned=True),
- ByteArrayEncoding()
- ]
- )],
- fill_value=""
+ [
+ StringArrayEncoding(
+ # The unique strings in the column are sorted
+ # -> Indices do not follow distinct pattern
+ data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)],
+ offset_encoding=[
+ DeltaEncoding(src_type=TypeCode.INT32),
+ IntegerPackingEncoding(byte_count=1, is_unsigned=True),
+ ByteArrayEncoding(),
+ ],
+ )
+ ],
+ fill_value="",
),
}
@@ -105,148 +115,160 @@ class ColumnInfo:
ATOM_COLUMNS = {
"comp_id": ColumnInfo(
"U5",
- [StringArrayEncoding(
- data_encoding=[
- RunLengthEncoding(src_type=TypeCode.INT32),
- IntegerPackingEncoding(byte_count=2, is_unsigned=True),
- ByteArrayEncoding()
- ],
- offset_encoding=[
- DeltaEncoding(src_type=TypeCode.INT32),
- RunLengthEncoding(),
- IntegerPackingEncoding(byte_count=1, is_unsigned=True),
- ByteArrayEncoding()
- ]
- )]
+ [
+ StringArrayEncoding(
+ data_encoding=[
+ RunLengthEncoding(src_type=TypeCode.INT32),
+ IntegerPackingEncoding(byte_count=2, is_unsigned=True),
+ ByteArrayEncoding(),
+ ],
+ offset_encoding=[
+ DeltaEncoding(src_type=TypeCode.INT32),
+ RunLengthEncoding(),
+ IntegerPackingEncoding(byte_count=1, is_unsigned=True),
+ ByteArrayEncoding(),
+ ],
+ )
+ ],
),
"atom_id": ColumnInfo(
"U6",
- [StringArrayEncoding(
- # The unique strings in the column are sorted
- # -> Indices do not follow distinct pattern
- data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)],
- offset_encoding=[
- DeltaEncoding(src_type=TypeCode.INT32),
- RunLengthEncoding(),
- IntegerPackingEncoding(byte_count=1, is_unsigned=True),
- ByteArrayEncoding()
- ]
- )]
+ [
+ StringArrayEncoding(
+ # The unique strings in the column are sorted
+ # -> Indices do not follow distinct pattern
+ data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)],
+ offset_encoding=[
+ DeltaEncoding(src_type=TypeCode.INT32),
+ RunLengthEncoding(),
+ IntegerPackingEncoding(byte_count=1, is_unsigned=True),
+ ByteArrayEncoding(),
+ ],
+ )
+ ],
),
"type_symbol": ColumnInfo(
"U2",
- [StringArrayEncoding(
- # The unique strings in the column are sorted
- # -> Indices do not follow distinct pattern
- data_encoding=[ByteArrayEncoding(type=TypeCode.INT8)],
- offset_encoding=[
- DeltaEncoding(src_type=TypeCode.INT32),
- RunLengthEncoding(),
- IntegerPackingEncoding(byte_count=1, is_unsigned=True),
- ByteArrayEncoding()
- ]
- )]
- ),
- "charge": ColumnInfo(
- "i1",
- [ByteArrayEncoding(type=TypeCode.INT8)],
- fill_value=0
+ [
+ StringArrayEncoding(
+ # The unique strings in the column are sorted
+ # -> Indices do not follow distinct pattern
+ data_encoding=[ByteArrayEncoding(type=TypeCode.INT8)],
+ offset_encoding=[
+ DeltaEncoding(src_type=TypeCode.INT32),
+ RunLengthEncoding(),
+ IntegerPackingEncoding(byte_count=1, is_unsigned=True),
+ ByteArrayEncoding(),
+ ],
+ )
+ ],
),
+ "charge": ColumnInfo("i1", [ByteArrayEncoding(type=TypeCode.INT8)], fill_value=0),
"pdbx_model_Cartn_x_ideal": ColumnInfo(
"f4",
[
FixedPointEncoding(factor=100),
IntegerPackingEncoding(byte_count=2, is_unsigned=False),
- ByteArrayEncoding()
+ ByteArrayEncoding(),
],
- alternative="model_Cartn_x"
+ alternative="model_Cartn_x",
),
"pdbx_model_Cartn_y_ideal": ColumnInfo(
"f4",
[
FixedPointEncoding(factor=100),
IntegerPackingEncoding(byte_count=2, is_unsigned=False),
- ByteArrayEncoding()
+ ByteArrayEncoding(),
],
- alternative="model_Cartn_y"
+ alternative="model_Cartn_y",
),
"pdbx_model_Cartn_z_ideal": ColumnInfo(
"f4",
[
FixedPointEncoding(factor=100),
IntegerPackingEncoding(byte_count=2, is_unsigned=False),
- ByteArrayEncoding()
+ ByteArrayEncoding(),
],
- alternative="model_Cartn_z"
+ alternative="model_Cartn_z",
),
}
BOND_COLUMNS = {
"comp_id": ColumnInfo(
"U5",
- [StringArrayEncoding(
- data_encoding=[
- RunLengthEncoding(src_type=TypeCode.INT32),
- IntegerPackingEncoding(byte_count=2, is_unsigned=True),
- ByteArrayEncoding()
- ],
- offset_encoding=[
- DeltaEncoding(src_type=TypeCode.INT32),
- RunLengthEncoding(),
- IntegerPackingEncoding(byte_count=1, is_unsigned=True),
- ByteArrayEncoding()
- ]
- )]
+ [
+ StringArrayEncoding(
+ data_encoding=[
+ RunLengthEncoding(src_type=TypeCode.INT32),
+ IntegerPackingEncoding(byte_count=2, is_unsigned=True),
+ ByteArrayEncoding(),
+ ],
+ offset_encoding=[
+ DeltaEncoding(src_type=TypeCode.INT32),
+ RunLengthEncoding(),
+ IntegerPackingEncoding(byte_count=1, is_unsigned=True),
+ ByteArrayEncoding(),
+ ],
+ )
+ ],
),
"atom_id_1": ColumnInfo(
"U6",
- [StringArrayEncoding(
- # The unique strings in the column are sorted
- # -> Indices do not follow distinct pattern
- data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)],
- offset_encoding=[
- DeltaEncoding(src_type=TypeCode.INT32),
- RunLengthEncoding(),
- IntegerPackingEncoding(byte_count=1, is_unsigned=True),
- ByteArrayEncoding()
- ]
- )]
+ [
+ StringArrayEncoding(
+ # The unique strings in the column are sorted
+ # -> Indices do not follow distinct pattern
+ data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)],
+ offset_encoding=[
+ DeltaEncoding(src_type=TypeCode.INT32),
+ RunLengthEncoding(),
+ IntegerPackingEncoding(byte_count=1, is_unsigned=True),
+ ByteArrayEncoding(),
+ ],
+ )
+ ],
),
"atom_id_2": ColumnInfo(
"U6",
- [StringArrayEncoding(
- # The unique strings in the column are sorted
- # -> Indices do not follow distinct pattern
- data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)],
- offset_encoding=[
- DeltaEncoding(src_type=TypeCode.INT32),
- RunLengthEncoding(),
- IntegerPackingEncoding(byte_count=1, is_unsigned=True),
- ByteArrayEncoding()
- ]
- )]
+ [
+ StringArrayEncoding(
+ # The unique strings in the column are sorted
+ # -> Indices do not follow distinct pattern
+ data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)],
+ offset_encoding=[
+ DeltaEncoding(src_type=TypeCode.INT32),
+ RunLengthEncoding(),
+ IntegerPackingEncoding(byte_count=1, is_unsigned=True),
+ ByteArrayEncoding(),
+ ],
+ )
+ ],
),
"value_order": ColumnInfo(
"U4",
- [StringArrayEncoding(
- data_encoding=[
- RunLengthEncoding(src_type=TypeCode.INT32),
- IntegerPackingEncoding(byte_count=1, is_unsigned=True),
- ByteArrayEncoding()
- ],
- offset_encoding=[ByteArrayEncoding(type=TypeCode.UINT8)]
- )]
+ [
+ StringArrayEncoding(
+ data_encoding=[
+ RunLengthEncoding(src_type=TypeCode.INT32),
+ IntegerPackingEncoding(byte_count=1, is_unsigned=True),
+ ByteArrayEncoding(),
+ ],
+ offset_encoding=[ByteArrayEncoding(type=TypeCode.UINT8)],
+ )
+ ],
),
"pdbx_aromatic_flag": ColumnInfo(
"U1",
- [StringArrayEncoding(
- data_encoding=[
- RunLengthEncoding(src_type=TypeCode.INT32),
- IntegerPackingEncoding(byte_count=1, is_unsigned=True),
- ByteArrayEncoding()
- ],
- offset_encoding=[ByteArrayEncoding(type=TypeCode.UINT8)]
- )]
+ [
+ StringArrayEncoding(
+ data_encoding=[
+ RunLengthEncoding(src_type=TypeCode.INT32),
+ IntegerPackingEncoding(byte_count=1, is_unsigned=True),
+ ByteArrayEncoding(),
+ ],
+ offset_encoding=[ByteArrayEncoding(type=TypeCode.UINT8)],
+ )
+ ],
),
}
@@ -282,18 +304,14 @@ def check_presence(pdbx_file, category_name, column_names):
is_present = column_names[0] in category
for name in column_names:
if (name in category) != is_present:
- raise ComponentException(
- "Only some column names are missing"
- )
+ raise ComponentError("Only some column names are missing")
if not is_present:
return
is_unmasked = category[column_names[0]].mask is None
for name in column_names:
if (category[name].mask is None) != is_unmasked:
- raise ComponentException(
- "Only some column names are masked"
- )
+ raise ComponentError("Only some column names are masked")
def concatenate_blocks_into_category(pdbx_file, category_name, column_infos):
@@ -320,46 +338,40 @@ def concatenate_blocks_into_category(pdbx_file, category_name, column_infos):
for comp_id, block in pdbx_file.items():
try:
if category_name not in block:
- raise ComponentException(
- f"Block has no category '{category_name}'"
- )
+ raise ComponentError(f"Block has no category '{category_name}'")
chunk = {}
category = block[category_name]
for col_name, info in column_infos.items():
col = category.get(col_name)
- if (
- col is None
- or (col.mask is not None and info.fill_value is None)
- ):
+ if col is None or (col.mask is not None and info.fill_value is None):
# Some/all values are missing and there is no default
# -> Try alternative
if info.alternative is not None:
col = category[info.alternative]
if col.mask is not None:
- raise ComponentException(
+ raise ComponentError(
f"Missing values in alternative "
f"'{info.alternative}'"
)
else:
- raise ComponentException(
- f"Missing values in column '{col_name}'"
- )
+ raise ComponentError(f"Missing values in column '{col_name}'")
data_array = col.as_array(info.dtype, info.fill_value)
chunk[col_name] = data_array
- except ComponentException as e:
+ except ComponentError as e:
logging.warning(f"Skipping '{comp_id}': {e}")
# Append all columns in the chunk after the try-except block
# to avoid appending incomplete chunks
else:
for col_name, data_array in chunk.items():
column_chunks[col_name].append(data_array)
- return BinaryCIFCategory({
- col_name: BinaryCIFData(
- array=np.concatenate(col_data),
- encoding=column_infos[col_name].encoding
- )
- for col_name, col_data in column_chunks.items()
- })
+ return BinaryCIFCategory(
+ {
+ col_name: BinaryCIFData(
+ array=np.concatenate(col_data), encoding=column_infos[col_name].encoding
+ )
+ for col_name, col_data in column_chunks.items()
+ }
+ )
def extract_component_groups(type_dict, include, exclude, file_name):
@@ -393,8 +405,8 @@ def extract_component_groups(type_dict, include, exclude, file_name):
del type_dict[comp_id]
# Write extracted components into output file
logging.info(
- f"Using the following types for '{file_name.name}':\n" +
- ", ".join(types_for_group)
+ f"Using the following types for '{file_name.name}':\n"
+ + ", ".join(types_for_group)
)
with open(file_name, "w") as file:
for comp_id in comp_ids_for_group:
@@ -412,12 +424,12 @@ def setup_ccd(target_diriectory):
logging.info("Checking for consistent coordinates...")
check_presence(
- ccd_file, "chem_comp_atom",
- ["model_Cartn_x", "model_Cartn_y", "model_Cartn_z"]
+ ccd_file, "chem_comp_atom", ["model_Cartn_x", "model_Cartn_y", "model_Cartn_z"]
)
check_presence(
- ccd_file, "chem_comp_atom",
- ["model_Cartn_x_ideal", "model_Cartn_y_ideal", "model_Cartn_z_ideal"]
+ ccd_file,
+ "chem_comp_atom",
+ ["model_Cartn_x_ideal", "model_Cartn_y_ideal", "model_Cartn_z_ideal"],
)
logging.info("Extracting component groups...")
@@ -426,26 +438,25 @@ def setup_ccd(target_diriectory):
for comp_id, block in ccd_file.items()
}
extract_component_groups(
- type_dict, ["peptide", "amino"], ["peptide-like"],
- target_diriectory / "amino_acids.txt"
+ type_dict,
+ ["peptide", "amino"],
+ ["peptide-like"],
+ target_diriectory / "amino_acids.txt",
)
extract_component_groups(
- type_dict, ["rna", "dna"], [],
- target_diriectory / "nucleotides.txt"
+ type_dict, ["rna", "dna"], [], target_diriectory / "nucleotides.txt"
)
extract_component_groups(
- type_dict, ["saccharide"], [],
- target_diriectory / "carbohydrates.txt"
+ type_dict, ["saccharide"], [], target_diriectory / "carbohydrates.txt"
)
remaining_types = set(type_dict.values())
logging.info(
- "The following types are not used in any group:\n" +
- ", ".join(remaining_types)
+ "The following types are not used in any group:\n" + ", ".join(remaining_types)
)
compressed_block = BinaryCIFBlock()
for category_name, column_infos in [
- ("chem_comp", MAIN_COLUMNS),
+ ("chem_comp", MAIN_COLUMNS),
("chem_comp_atom", ATOM_COLUMNS),
("chem_comp_bond", BOND_COLUMNS),
]:
@@ -459,5 +470,5 @@ def setup_ccd(target_diriectory):
compressed_file["components"] = compressed_block
compressed_file.write(target_diriectory / "components.bcif")
-from pathlib import Path
-setup_ccd(Path(__file__).parent / "src" / "biotite" / "structure" / "info" / "ccd")
\ No newline at end of file
+
+setup_ccd(Path(__file__).parent / "src" / "biotite" / "structure" / "info" / "ccd")
diff --git a/src/biotite/__init__.py b/src/biotite/__init__.py
index f90e3b5ff..653caf8f5 100644
--- a/src/biotite/__init__.py
+++ b/src/biotite/__init__.py
@@ -12,7 +12,7 @@
__name__ = "biotite"
__author__ = "Patrick Kunzmann"
-from .file import *
from .copyable import *
+from .file import *
+from .version import __version__, __version_tuple__ # noqa: F401
from .visualize import *
-from .version import __version__, __version_tuple__
diff --git a/src/biotite/application/__init__.py b/src/biotite/application/__init__.py
index 72ca3f96c..de09a3dbf 100644
--- a/src/biotite/application/__init__.py
+++ b/src/biotite/application/__init__.py
@@ -65,5 +65,5 @@
from .application import *
from .localapp import *
+from .msaapp import *
from .webapp import *
-from .msaapp import *
\ No newline at end of file
diff --git a/src/biotite/application/application.py b/src/biotite/application/application.py
index 858658175..fb5d2c037 100644
--- a/src/biotite/application/application.py
+++ b/src/biotite/application/application.py
@@ -4,19 +4,26 @@
__name__ = "biotite.application"
__author__ = "Patrick Kunzmann"
-__all__ = ["Application", "AppStateError", "TimeoutError", "VersionError",
- "AppState", "requires_state"]
+__all__ = [
+ "Application",
+ "AppStateError",
+ "TimeoutError",
+ "VersionError",
+ "AppState",
+ "requires_state",
+]
import abc
import time
-from functools import wraps
from enum import Flag, auto
+from functools import wraps
class AppState(Flag):
"""
This enum type represents the app states of an application.
"""
+
CREATED = auto()
RUNNING = auto()
FINISHED = auto()
@@ -45,6 +52,7 @@ def requires_state(app_state):
... def function(self):
... pass
"""
+
def decorator(func):
@wraps(func)
def wrapper(*args, **kwargs):
@@ -52,16 +60,16 @@ def wrapper(*args, **kwargs):
try:
instance = args[0]
except IndexError:
- raise TypeError(
- "This method must be called from a class instance"
- )
+ raise TypeError("This method must be called from a class instance")
if not instance._state & app_state:
raise AppStateError(
f"The application is in {instance.get_app_state()} state, "
f"but {app_state} state is required"
)
return func(*args, **kwargs)
+
return wrapper
+
return decorator
@@ -146,11 +154,10 @@ def join(self, timeout=None):
"""
time.sleep(self.wait_interval())
while self.get_app_state() != AppState.FINISHED:
- if timeout is not None and time.time()-self._start_time > timeout:
+ if timeout is not None and time.time() - self._start_time > timeout:
self.cancel()
raise TimeoutError(
- f"The application expired its timeout "
- f"({timeout:.1f} s)"
+ f"The application expired its timeout " f"({timeout:.1f} s)"
)
else:
time.sleep(self.wait_interval())
@@ -249,6 +256,7 @@ class AppStateError(Exception):
"""
Indicate that the application lifecycle was violated.
"""
+
pass
@@ -256,6 +264,7 @@ class TimeoutError(Exception):
"""
Indicate that the application's timeout expired.
"""
+
pass
@@ -263,4 +272,5 @@ class VersionError(Exception):
"""
Indicate that the application's version is invalid.
"""
- pass
\ No newline at end of file
+
+ pass
diff --git a/src/biotite/application/autodock/__init__.py b/src/biotite/application/autodock/__init__.py
index 9d8aabe1e..756b6648c 100644
--- a/src/biotite/application/autodock/__init__.py
+++ b/src/biotite/application/autodock/__init__.py
@@ -9,4 +9,4 @@
__name__ = "biotite.application.autodock"
__author__ = "Patrick Kunzmann"
-from .app import *
\ No newline at end of file
+from .app import *
diff --git a/src/biotite/application/autodock/app.py b/src/biotite/application/autodock/app.py
index c93cd3cc8..babd860ae 100644
--- a/src/biotite/application/autodock/app.py
+++ b/src/biotite/application/autodock/app.py
@@ -9,12 +9,12 @@
import copy
from tempfile import NamedTemporaryFile
import numpy as np
-from ..localapp import LocalApp, cleanup_tempfile
-from ..application import AppState, requires_state
-from ...structure.io.pdbqt import PDBQTFile
-from ...structure.residues import get_residue_starts_for, get_residue_masks
-from ...structure.bonds import find_connected
-from ...structure.error import BadStructureError
+from biotite.application.application import AppState, requires_state
+from biotite.application.localapp import LocalApp, cleanup_tempfile
+from biotite.structure.bonds import find_connected
+from biotite.structure.error import BadStructureError
+from biotite.structure.io.pdbqt import PDBQTFile
+from biotite.structure.residues import get_residue_masks, get_residue_starts_for
class VinaApp(LocalApp):
@@ -62,8 +62,8 @@ class VinaApp(LocalApp):
... flexible=(receptor.res_id == 2) | (receptor.res_id == 5)
... )
"""
- def __init__(self, ligand, receptor, center, size, flexible=None,
- bin_path="vina"):
+
+ def __init__(self, ligand, receptor, center, size, flexible=None, bin_path="vina"):
super().__init__(bin_path)
if ligand.bonds is None:
@@ -83,23 +83,17 @@ def __init__(self, ligand, receptor, center, size, flexible=None,
if self._is_flexible:
flexible_indices = np.where(flexible)[0]
- self._flex_res_starts = np.unique(get_residue_starts_for(
- receptor, flexible_indices
- ))
-
- self._ligand_file = NamedTemporaryFile(
- "w", suffix=".pdbqt", delete=False
- )
- self._receptor_file = NamedTemporaryFile(
- "w", suffix=".pdbqt", delete=False
- )
- self._receptor_flex_file = NamedTemporaryFile(
+ self._flex_res_starts = np.unique(
+ get_residue_starts_for(receptor, flexible_indices)
+ )
+
+ self._ligand_file = NamedTemporaryFile("w", suffix=".pdbqt", delete=False)
+ self._receptor_file = NamedTemporaryFile("w", suffix=".pdbqt", delete=False)
+ self._receptor_flex_file = NamedTemporaryFile(
"w", suffix=".pdbqt", delete=False
)
- self._out_file = NamedTemporaryFile(
- "r", suffix=".pdbqt", delete=False
- )
-
+ self._out_file = NamedTemporaryFile("r", suffix=".pdbqt", delete=False)
+
@requires_state(AppState.CREATED)
def set_seed(self, seed):
"""
@@ -114,7 +108,7 @@ def set_seed(self, seed):
The seed for the random number generator.
"""
self._seed = seed
-
+
@requires_state(AppState.CREATED)
def set_exhaustiveness(self, exhaustiveness):
"""
@@ -131,7 +125,7 @@ def set_exhaustiveness(self, exhaustiveness):
Must be greater than 0.
"""
self._exhaustiveness = exhaustiveness
-
+
@requires_state(AppState.CREATED)
def set_max_number_of_models(self, number):
"""
@@ -147,7 +141,7 @@ def set_max_number_of_models(self, number):
The maximum number of generated modes/models.
"""
self._number = number
-
+
@requires_state(AppState.CREATED)
def set_energy_range(self, energy_range):
"""
@@ -168,34 +162,31 @@ def run(self):
# Use different atom ID ranges for atoms in ligand and receptor
# for unambiguous assignment, if the receptor contains flexible
# residues
- self._ligand.set_annotation("atom_id", np.arange(
- 1,
- self._ligand.array_length() + 1
- ))
- self._receptor.set_annotation("atom_id", np.arange(
- self._ligand.array_length() + 1,
- self._ligand.array_length() + self._receptor.array_length() + 1
- ))
+ self._ligand.set_annotation(
+ "atom_id", np.arange(1, self._ligand.array_length() + 1)
+ )
+ self._receptor.set_annotation(
+ "atom_id",
+ np.arange(
+ self._ligand.array_length() + 1,
+ self._ligand.array_length() + self._receptor.array_length() + 1,
+ ),
+ )
ligand_file = PDBQTFile()
- # Contains 'true' entries for all atoms that have not been
+ # Contains 'true' entries for all atoms that have not been
# removed from ligand
self._ligand_mask = ligand_file.set_structure(
- self._ligand,
- rotatable_bonds="all"
+ self._ligand, rotatable_bonds="all"
)
ligand_file.write(self._ligand_file)
self._ligand_file.flush()
-
+
if self._is_flexible:
- self._rigid_mask = np.ones(
- self._receptor.array_length(), dtype=bool
- )
- # Contains 'true' entries for all atoms that have not been
+ self._rigid_mask = np.ones(self._receptor.array_length(), dtype=bool)
+ # Contains 'true' entries for all atoms that have not been
# removed from receptor in flexible side chains
- self._receptor_mask = np.zeros(
- self._receptor.array_length(), dtype=bool
- )
+ self._receptor_mask = np.zeros(self._receptor.array_length(), dtype=bool)
for i, start in enumerate(self._flex_res_starts):
flex_mask, rigid_mask, root = self._get_flexible_residue(start)
self._rigid_mask &= rigid_mask
@@ -207,7 +198,7 @@ def run(self):
self._receptor[flex_mask],
rotatable_bonds="all",
root=root_in_flex_residue,
- include_torsdof=False
+ include_torsdof=False,
)
# Enclose each flexible residue
# with BEGIN_RES and END_RES
@@ -220,7 +211,7 @@ def run(self):
receptor_file.set_structure(
self._receptor[self._rigid_mask],
rotatable_bonds=None,
- include_torsdof=False
+ include_torsdof=False,
)
receptor_file.write(self._receptor_file)
self._receptor_file.flush()
@@ -228,23 +219,30 @@ def run(self):
else:
receptor_file = PDBQTFile()
receptor_file.set_structure(
- self._receptor,
- rotatable_bonds=None,
- include_torsdof=False
+ self._receptor, rotatable_bonds=None, include_torsdof=False
)
receptor_file.write(self._receptor_file)
self._receptor_file.flush()
arguments = [
- "--ligand", self._ligand_file.name,
- "--receptor", self._receptor_file.name,
- "--out", self._out_file.name,
- "--center_x", f"{self._center[0]:.3f}",
- "--center_y", f"{self._center[1]:.3f}",
- "--center_z", f"{self._center[2]:.3f}",
- "--size_x", f"{self._size[0]:.3f}",
- "--size_y", f"{self._size[1]:.3f}",
- "--size_z", f"{self._size[2]:.3f}",
+ "--ligand",
+ self._ligand_file.name,
+ "--receptor",
+ self._receptor_file.name,
+ "--out",
+ self._out_file.name,
+ "--center_x",
+ f"{self._center[0]:.3f}",
+ "--center_y",
+ f"{self._center[1]:.3f}",
+ "--center_z",
+ f"{self._center[2]:.3f}",
+ "--size_x",
+ f"{self._size[0]:.3f}",
+ "--size_y",
+ f"{self._size[1]:.3f}",
+ "--size_z",
+ f"{self._size[2]:.3f}",
]
if self._seed is not None:
arguments.extend(["--seed", str(self._seed)])
@@ -259,32 +257,32 @@ def run(self):
self.set_arguments(arguments)
super().run()
-
+
def evaluate(self):
super().evaluate()
out_file = PDBQTFile.read(self._out_file)
-
+
models = out_file.get_structure()
n_ligand_atoms = np.count_nonzero(self._ligand_mask)
self._ligand_models = models[..., :n_ligand_atoms]
self._flex_models = models[..., n_ligand_atoms:]
self._n_models = models.stack_depth()
-
+
remarks = out_file.get_remarks()
self._energies = np.array(
# VINA RESULT: -5.8 0.000 0.000
# ^
[float(remark[12:].split()[0]) for remark in remarks]
)
-
+
def clean_up(self):
super().clean_up()
cleanup_tempfile(self._ligand_file)
cleanup_tempfile(self._receptor_file)
cleanup_tempfile(self._receptor_flex_file)
cleanup_tempfile(self._out_file)
-
+
@requires_state(AppState.JOINED)
def get_energies(self):
"""
@@ -302,7 +300,7 @@ def get_energies(self):
@requires_state(AppState.JOINED)
def get_ligand_models(self):
"""
- Get the ligand structure with the conformations for each
+ Get the ligand structure with the conformations for each
generated binding mode.
Returns
@@ -312,7 +310,7 @@ def get_ligand_models(self):
Each model corresponds to one binding mode.
The models are sorted from best to worst predicted binding
affinity.
-
+
Notes
-----
The returned structure may contain less atoms than the input
@@ -338,12 +336,11 @@ def get_ligand_coord(self):
atoms are set to *NaN*.
"""
coord = np.full(
- (self._n_models, self._ligand.array_length(), 3),
- np.nan, dtype=np.float32
+ (self._n_models, self._ligand.array_length(), 3), np.nan, dtype=np.float32
)
coord[:, self._ligand_mask] = self._ligand_models.coord
return coord
-
+
@requires_state(AppState.JOINED)
def get_flexible_residue_models(self):
"""
@@ -360,7 +357,7 @@ def get_flexible_residue_models(self):
Each model corresponds to one binding mode.
The models are sorted from best to worst predicted binding
affinity.
-
+
Notes
-----
The returned structure may contain less atoms than the input
@@ -385,7 +382,7 @@ def get_receptor_coord(self):
affinity.
Missing coordinates due to the removed nonpolar hydrogen
atoms from flexible side chains are set to *NaN*.
-
+
Notes
-----
The output is only meaningful, if flexible side chains were
@@ -394,8 +391,7 @@ def get_receptor_coord(self):
of the input receptor coordinates.
"""
coord = np.repeat(
- self._receptor.coord[np.newaxis, ...],
- repeats=self._n_models, axis=0
+ self._receptor.coord[np.newaxis, ...], repeats=self._n_models, axis=0
)
if self._is_flexible:
# Replace original coordinates with modeled coordinates
@@ -424,16 +420,16 @@ def _get_flexible_residue(self, residue_start):
root_connect_indices, _ = self._receptor.bonds.get_bonds(root_index)
connected_index = None
try:
- connected_index = root_connect_indices[np.isin(
- self._receptor.atom_name[root_connect_indices], ("CB",)
- )][0]
+ connected_index = root_connect_indices[
+ np.isin(self._receptor.atom_name[root_connect_indices], ("CB",))
+ ][0]
except IndexError:
# Residue has no appropriate connection (e.g. in glycine)
# -> There is no atom in the flexible side chain
flex_mask = np.zeros(self._receptor.array_length(), dtype=bool)
rigid_mask = np.ones(self._receptor.array_length(), dtype=bool)
return flex_mask, rigid_mask, root_index
-
+
# Remove the root bond from the bond list
# to find the atoms involved in the flexible part
bonds = self._receptor.bonds.copy()
@@ -442,7 +438,7 @@ def _get_flexible_residue(self, residue_start):
if root_index in flexible_indices:
raise BadStructureError(
"There are multiple connections between the flexible and "
- "rigid part, maybe a cyclic residue like proline was selected"
+ "rigid part, maybe a cyclic residue like proline was selected"
)
flex_mask = np.zeros(self._receptor.array_length(), dtype=bool)
@@ -452,7 +448,6 @@ def _get_flexible_residue(self, residue_start):
flex_mask[root_index] = True
return flex_mask, rigid_mask, root_index
-
@staticmethod
def dock(ligand, receptor, center, size, flexible=None, bin_path="vina"):
diff --git a/src/biotite/application/blast/__init__.py b/src/biotite/application/blast/__init__.py
index 77caf3e64..65857b2b4 100644
--- a/src/biotite/application/blast/__init__.py
+++ b/src/biotite/application/blast/__init__.py
@@ -10,5 +10,5 @@
__name__ = "biotite.application.blast"
__author__ = "Patrick Kunzmann"
+from .alignment import *
from .webapp import *
-from .alignment import *
\ No newline at end of file
diff --git a/src/biotite/application/blast/alignment.py b/src/biotite/application/blast/alignment.py
index dc5b31784..85890df66 100644
--- a/src/biotite/application/blast/alignment.py
+++ b/src/biotite/application/blast/alignment.py
@@ -6,7 +6,7 @@
__author__ = "Patrick Kunzmann"
__all__ = ["BlastAlignment"]
-from ...sequence.align.alignment import Alignment
+from biotite.sequence.align.alignment import Alignment
class BlastAlignment(Alignment):
@@ -14,10 +14,10 @@ class BlastAlignment(Alignment):
A specialized :class:`Alignment` class for alignments using the
BLAST application. It stores additional data, like the E-value,
the HSP position and a description of the hit sequence.
-
+
Like its superclass, all attributes of a :class:`BlastAlignment` are
public. The attributes are the same as the constructor parameters.
-
+
Parameters
----------
sequences : list
@@ -44,16 +44,25 @@ class BlastAlignment(Alignment):
hit_definition : str
The name of the hit sequence.
"""
-
- def __init__(self, sequences, trace, score, e_value,
- query_interval, hit_interval, hit_id, hit_definition):
+
+ def __init__(
+ self,
+ sequences,
+ trace,
+ score,
+ e_value,
+ query_interval,
+ hit_interval,
+ hit_id,
+ hit_definition,
+ ):
super().__init__(sequences, trace, score)
self.e_value = e_value
self.query_interval = query_interval
self.hit_interval = hit_interval
self.hit_id = hit_id
self.hit_definition = hit_definition
-
+
def __eq__(self, item):
if not isinstance(item, BlastAlignment):
return False
@@ -68,7 +77,7 @@ def __eq__(self, item):
if self.hit_definition != item.hit_definition:
return False
return super().__eq__(item)
-
+
def __getitem__(self, index):
super_alignment = super().__getitem__(index)
return BlastAlignment(
@@ -79,5 +88,5 @@ def __getitem__(self, index):
self.query_interval,
self.hit_interval,
self.hit_id,
- self.hit_definition
- )
\ No newline at end of file
+ self.hit_definition,
+ )
diff --git a/src/biotite/application/blast/webapp.py b/src/biotite/application/blast/webapp.py
index cf358ac23..f8d6b09d1 100644
--- a/src/biotite/application/blast/webapp.py
+++ b/src/biotite/application/blast/webapp.py
@@ -6,26 +6,26 @@
__author__ = "Patrick Kunzmann"
__all__ = ["BlastWebApp"]
-from .alignment import BlastAlignment
-from ..application import Application, requires_state, AppState
-from ..webapp import WebApp, RuleViolationError
-from ...sequence.sequence import Sequence
-from ...sequence.seqtypes import NucleotideSequence, ProteinSequence
-from ...sequence.io.fasta.file import FastaFile
-from ...sequence.io.fasta.convert import get_sequence
-from ...sequence.align.alignment import Alignment
import time
-import requests
from xml.etree import ElementTree
-
+import requests
+from biotite.application.application import AppState, requires_state
+from biotite.application.blast.alignment import BlastAlignment
+from biotite.application.webapp import WebApp
+from biotite.sequence.align.alignment import Alignment
+from biotite.sequence.io.fasta.convert import get_sequence
+from biotite.sequence.io.fasta.file import FastaFile
+from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence
+from biotite.sequence.sequence import Sequence
_ncbi_url = "https://blast.ncbi.nlm.nih.gov/Blast.cgi"
+
class BlastWebApp(WebApp):
"""
Perform a local alignment against a large sequence database using
using the web-based BLAST application (by default NCBI BLAST).
-
+
Parameters
----------
program : str
@@ -35,7 +35,7 @@ class BlastWebApp(WebApp):
The query sequence. If a string is provided, it is interpreted
as path to a FASTA file, if the string contains a valid FASTA
file extension, otherwise it is interpreted as a single letter
- string representation of a sequence.
+ string representation of a sequence.
database : str, optional
The NCBI sequence database to blast against. By default it
contains all sequences (`database`='nr'`).
@@ -52,68 +52,71 @@ class BlastWebApp(WebApp):
HTTP request. This allows the NCBI to contact you in case
your application sends too many requests.
"""
-
+
_last_contact = 0
_last_request = 0
_contact_delay = 3
_request_delay = 60
-
- def __init__(self, program, query, database="nr",
- app_url=_ncbi_url, obey_rules=True,
- mail="padix.key@gmail.com"):
+
+ def __init__(
+ self,
+ program,
+ query,
+ database="nr",
+ app_url=_ncbi_url,
+ obey_rules=True,
+ mail="padix.key@gmail.com",
+ ):
super().__init__(app_url, obey_rules)
-
+
# 'megablast' is somehow not working
# When entering the corresponding HTTPS request into a browser
# you are redirected onto the blast mainpage
- if program not in ["blastn", "blastp",
- "blastx", "tblastn", "tblastx"]:
+ if program not in ["blastn", "blastp", "blastx", "tblastn", "tblastx"]:
raise ValueError(f"'{program}' is not a valid BLAST program")
self._program = program
-
- requires_protein = (program in ["blastp", "tblastn"])
- if isinstance(query, str) and query.endswith((".fa",".fst",".fasta")):
+
+ requires_protein = program in ["blastp", "tblastn"]
+ if isinstance(query, str) and query.endswith((".fa", ".fst", ".fasta")):
# If string has a file extension, it is interpreted as
# FASTA file from which the sequence is taken
file = FastaFile.read(query)
# Get first entry in file and take the sequence
- # (rather than header)
+ # (rather than header)
self._query = str(get_sequence(file))
elif isinstance(query, Sequence):
self._query = str(query)
else:
self._query = query
-
+
# Check for unsuitable symbols in query string
if requires_protein:
ref_alphabet = ProteinSequence.alphabet
else:
ref_alphabet = NucleotideSequence.alphabet_amb
for symbol in self._query:
- if not symbol.upper() in ref_alphabet:
- raise ValueError(
- f"Query sequence contains unsuitable symbol {symbol}"
- )
-
+ if symbol.upper() not in ref_alphabet:
+ raise ValueError(f"Query sequence contains unsuitable symbol {symbol}")
+
self._database = database
-
+
self._gap_openining = None
self._gap_extension = None
self._word_size = None
-
+
self._expect_value = None
self._max_results = None
self._entrez_query = None
-
+
self._reward = None
self._penalty = None
-
+
self._matrix = None
self._threshold = None
-
- self._mail=mail
+
+ self._mail = mail
self._rid = None
-
+
@requires_state(AppState.CREATED)
def set_entrez_query(self, query):
"""
@@ -126,7 +129,7 @@ def set_entrez_query(self, query):
An NCBI Entrez query.
"""
self._entrez_query = str(query)
-
+
@requires_state(AppState.CREATED)
def set_max_results(self, number):
"""
@@ -138,30 +141,30 @@ def set_max_results(self, number):
The maximum number of results.
"""
self._max_results = number
-
+
@requires_state(AppState.CREATED)
def set_max_expect_value(self, value):
"""
Set the threshold expectation value (E-value).
No alignments with an E-value above this threshold will be
considered.
-
+
The E-Value is the expectation value for the number of random
sequences of a similar sized database getting an equal or higher
score by change when aligned with the query sequence.
-
+
Parameters
----------
value : float
The threshold E-value.
"""
self._expect_value = value
-
+
@requires_state(AppState.CREATED)
def set_gap_penalty(self, opening, extension):
"""
Set the affine gap penalty for the alignment.
-
+
Parameters
----------
opening : float
@@ -171,75 +174,75 @@ def set_gap_penalty(self, opening, extension):
"""
self._gap_openining = opening
self._gap_extension = extension
-
+
@requires_state(AppState.CREATED)
def set_word_size(self, size):
"""
Set the word size for alignment seeds.
-
+
Parameters
----------
size : int
Word size.
"""
self._word_size = size
-
+
@requires_state(AppState.CREATED)
def set_match_reward(self, reward):
"""
Set the score of a symbol match in the alignment.
-
+
Used only in 'blastn' and 'megablast'.
-
+
Parameters
----------
reward : int
Match reward. Must be positive.
"""
self._reward = reward
-
+
@requires_state(AppState.CREATED)
def set_mismatch_penalty(self, penalty):
"""
Set the penalty of a symbol mismatch in the alignment.
-
+
Used only in 'blastn' and 'megablast'.
-
+
Parameters
----------
penalty : int
Mismatch penalty. Must be negative.
"""
self._penalty = penalty
-
+
@requires_state(AppState.CREATED)
def set_substitution_matrix(self, matrix_name):
"""
Set the penalty of a symbol mismatch in the alignment.
-
+
Used only in 'blastp', "blastx', 'tblastn' and 'tblastx'.
-
+
Parameters
----------
matrix_name : str
Name of the substitution matrix. Default is 'BLOSUM62'.
"""
self._matrix = matrix_name.upper()
-
+
@requires_state(AppState.CREATED)
def set_threshold(self, threshold):
"""
Set the threshold neighboring score for initial words.
-
+
Used only in 'blastp', "blastx', 'tblastn' and 'tblastx'.
-
+
Parameters
----------
threshold : int
Threshold value. Must be positve.
"""
self._threshold = threshold
-
+
def run(self):
param_dict = {}
param_dict["tool"] = "Biotite"
@@ -255,23 +258,24 @@ def run(self):
if self._expect_value is not None:
param_dict["EXPECT"] = self._expect_value
if self._gap_openining is not None and self._gap_extension is not None:
- param_dict["GAPCOSTS"] = "{:d} {:d}".format(self._gap_openining,
- self._gap_extension)
+ param_dict["GAPCOSTS"] = "{:d} {:d}".format(
+ self._gap_openining, self._gap_extension
+ )
if self._word_size is not None:
param_dict["WORD_SIZE"] = self._word_size
-
+
if self._program in ["blastn", "megablast"]:
if self._reward is not None:
param_dict["NUCL_REWARD"] = self._reward
if self._penalty is not None:
param_dict["NUCL_PENALTY"] = self._penalty
-
+
if self._program in ["blastp", "blastx", "tblastn", "tblastx"]:
if self._matrix is not None:
param_dict["MATRIX"] = self._matrix
if self._threshold is not None:
param_dict["THRESHOLD"] = self._threshold
-
+
request = requests.get(self.app_url(), params=param_dict)
if "Submitted URI too large" in request.text:
raise ValueError("The URI is too large, try a shorter sequence")
@@ -279,11 +283,9 @@ def run(self):
self._request()
info_dict = BlastWebApp._get_info(request.text)
self._rid = info_dict["RID"]
-
+
def is_finished(self):
- data_dict = {"FORMAT_OBJECT" : "SearchInfo",
- "RID" : self._rid,
- "CMD" : "Get"}
+ data_dict = {"FORMAT_OBJECT": "SearchInfo", "RID": self._rid, "CMD": "Get"}
request = requests.get(self.app_url(), params=data_dict)
self._contact()
info_dict = BlastWebApp._get_info(request.text)
@@ -294,17 +296,17 @@ def is_finished(self):
"(Server responsed status 'UNKNOWN')"
)
return info_dict["Status"] == "READY"
-
+
def wait_interval(self):
# NCBI requires a 3 second delay between server contacts
return BlastWebApp._contact_delay
-
+
def clean_up(self):
param_dict = {}
param_dict["CMD"] = "Delete"
param_dict["RID"] = self._rid
- request = requests.get(self.app_url(), params=param_dict)
-
+ requests.get(self.app_url(), params=param_dict)
+
def evaluate(self):
param_dict = {}
param_dict["tool"] = "BiotiteClient"
@@ -316,7 +318,7 @@ def evaluate(self):
param_dict["NCBI_GI"] = "T"
request = requests.get(self.app_url(), params=param_dict)
self._contact()
-
+
self._alignments = []
self._xml_response = request.text
root = ElementTree.fromstring(self._xml_response)
@@ -333,15 +335,14 @@ def evaluate(self):
query_end = int(hsp.find("Hsp_query-to").text)
hit_begin = int(hsp.find("Hsp_hit-from").text)
hit_end = int(hsp.find("Hsp_hit-to").text)
-
+
seq1_str = hsp.find("Hsp_qseq").text
seq2_str = hsp.find("Hsp_hseq").text
if self._program in ["blastn", "megablast"]:
# NucleotideSequence/ProteinSequence do ignore gaps
# Gaps are represented by the trace
seq1, seq2 = [
- NucleotideSequence(s.replace("-", ""))
- for s in (seq1_str, seq2_str)
+ NucleotideSequence(s.replace("-", "")) for s in (seq1_str, seq2_str)
]
else:
seq1, seq2 = [
@@ -349,18 +350,24 @@ def evaluate(self):
for s in (seq1_str, seq2_str)
]
trace = Alignment.trace_from_strings([seq1_str, seq2_str])
-
- alignment = BlastAlignment( [seq1 ,seq2], trace, score, e_value,
- (query_begin, query_end),
- (hit_begin, hit_end),
- hit_id, hit_definition )
+
+ alignment = BlastAlignment(
+ [seq1, seq2],
+ trace,
+ score,
+ e_value,
+ (query_begin, query_end),
+ (hit_begin, hit_end),
+ hit_id,
+ hit_definition,
+ )
self._alignments.append(alignment)
@requires_state(AppState.JOINED)
def get_xml_response(self):
"""
Get the raw XML response.
-
+
Returns
-------
response : str
@@ -372,14 +379,14 @@ def get_xml_response(self):
def get_alignments(self):
"""
Get the resulting local sequence alignments.
-
+
Returns
-------
alignment : list of BlastAlignment
The local sequence alignments.
"""
return self._alignments
-
+
@staticmethod
def _get_info(text):
"""
@@ -399,7 +406,7 @@ def _get_info(text):
pair = line.split("=")
info_dict[pair[0].strip()] = pair[1].strip()
return info_dict
-
+
def _contact(self):
"""
Resets the time since the last server contact. Used for
@@ -409,7 +416,7 @@ def _contact(self):
if (contact - BlastWebApp._last_contact) < BlastWebApp._contact_delay:
self.violate_rule("The server was contacted too often")
BlastWebApp._last_contact = contact
-
+
def _request(self):
"""
Resets the time since the last new alignment request. Used for
diff --git a/src/biotite/application/clustalo/__init__.py b/src/biotite/application/clustalo/__init__.py
index 1f3afebac..ba0f44704 100644
--- a/src/biotite/application/clustalo/__init__.py
+++ b/src/biotite/application/clustalo/__init__.py
@@ -9,4 +9,4 @@
__name__ = "biotite.application.clustalo"
__author__ = "Patrick Kunzmann"
-from .app import *
\ No newline at end of file
+from .app import *
diff --git a/src/biotite/application/clustalo/app.py b/src/biotite/application/clustalo/app.py
index 778c613d8..228300984 100644
--- a/src/biotite/application/clustalo/app.py
+++ b/src/biotite/application/clustalo/app.py
@@ -8,20 +8,16 @@
from tempfile import NamedTemporaryFile
import numpy as np
-from ...sequence.sequence import Sequence
-from ...sequence.seqtypes import NucleotideSequence, ProteinSequence
-from ...sequence.io.fasta.file import FastaFile
-from ...sequence.align.alignment import Alignment
-from ...sequence.phylo.tree import Tree
-from ..localapp import cleanup_tempfile
-from ..msaapp import MSAApp
-from ..application import AppState, requires_state
+from biotite.application.application import AppState, requires_state
+from biotite.application.localapp import cleanup_tempfile
+from biotite.application.msaapp import MSAApp
+from biotite.sequence.phylo.tree import Tree
class ClustalOmegaApp(MSAApp):
"""
Perform a multiple sequence alignment using Clustal-Omega.
-
+
Parameters
----------
sequences : list of ProteinSequence or NucleotideSequence
@@ -30,7 +26,7 @@ class ClustalOmegaApp(MSAApp):
Path of the Custal-Omega binary.
matrix : None
This parameter is used for compatibility reasons and is ignored.
-
+
Examples
--------
@@ -48,34 +44,30 @@ class ClustalOmegaApp(MSAApp):
-BISMITE
--IQLITE
"""
-
+
def __init__(self, sequences, bin_path="clustalo", matrix=None):
super().__init__(sequences, bin_path, None)
self._seq_count = len(sequences)
self._mbed = True
self._dist_matrix = None
self._tree = None
- self._in_dist_matrix_file = NamedTemporaryFile(
- "w", suffix=".mat", delete=False
- )
+ self._in_dist_matrix_file = NamedTemporaryFile("w", suffix=".mat", delete=False)
self._out_dist_matrix_file = NamedTemporaryFile(
"r", suffix=".mat", delete=False
)
- self._in_tree_file = NamedTemporaryFile(
- "w", suffix=".tree", delete=False
- )
- self._out_tree_file = NamedTemporaryFile(
- "r", suffix=".tree", delete=False
- )
-
+ self._in_tree_file = NamedTemporaryFile("w", suffix=".tree", delete=False)
+ self._out_tree_file = NamedTemporaryFile("r", suffix=".tree", delete=False)
+
def run(self):
args = [
- "--in", self.get_input_file_path(),
- "--out", self.get_output_file_path(),
+ "--in",
+ self.get_input_file_path(),
+ "--out",
+ self.get_output_file_path(),
# The temporary files are already created
# -> tell Clustal to overwrite these empty files
"--force",
- # Tree order for get_alignment_order() to work properly
+ # Tree order for get_alignment_order() to work properly
"--output-order=tree-order",
]
if self.get_seqtype() == "protein":
@@ -87,28 +79,24 @@ def run(self):
# as input and output
# -> Only request tree output when no tree is input
args += [
- "--guidetree-out", self._out_tree_file.name,
+ "--guidetree-out",
+ self._out_tree_file.name,
]
if not self._mbed:
- args += [
- "--full",
- "--distmat-out", self._out_dist_matrix_file.name
- ]
+ args += ["--full", "--distmat-out", self._out_dist_matrix_file.name]
if self._dist_matrix is not None:
# Add the sequence names (0, 1, 2, 3 ...) as first column
dist_matrix_with_index = np.concatenate(
- (
- np.arange(self._seq_count)[:, np.newaxis],
- self._dist_matrix
- ), axis=1
+ (np.arange(self._seq_count)[:, np.newaxis], self._dist_matrix), axis=1
)
np.savetxt(
- self._in_dist_matrix_file.name, dist_matrix_with_index,
+ self._in_dist_matrix_file.name,
+ dist_matrix_with_index,
# The first line contains the number of sequences
- comments = "",
- header = str(self._seq_count),
+ comments="",
+ header=str(self._seq_count),
# The sequence indices are integers, the rest are floats
- fmt = ["%d"] + ["%.5f"] * self._seq_count
+ fmt=["%d"] + ["%.5f"] * self._seq_count,
)
args += ["--distmat-in", self._in_dist_matrix_file.name]
if self._tree is not None:
@@ -117,15 +105,15 @@ def run(self):
args += ["--guidetree-in", self._in_tree_file.name]
self.set_arguments(args)
super().run()
-
+
def evaluate(self):
super().evaluate()
if not self._mbed:
self._dist_matrix = np.loadtxt(
self._out_dist_matrix_file.name,
# The first row only contains the number of sequences
- skiprows = 1,
- dtype = float
+ skiprows=1,
+ dtype=float,
)
# The first column contains only the name of the
# sequences, in this case 0, 1, 2, 3 ...
@@ -133,17 +121,15 @@ def evaluate(self):
self._dist_matrix = self._dist_matrix[:, 1:]
# Only read output tree if no tree was input
if self._tree is None:
- self._tree = Tree.from_newick(
- self._out_tree_file.read().replace("\n", "")
- )
-
+ self._tree = Tree.from_newick(self._out_tree_file.read().replace("\n", ""))
+
def clean_up(self):
super().clean_up()
cleanup_tempfile(self._in_dist_matrix_file)
cleanup_tempfile(self._out_dist_matrix_file)
cleanup_tempfile(self._in_tree_file)
cleanup_tempfile(self._out_tree_file)
-
+
@requires_state(AppState.CREATED)
def full_matrix_calculation(self):
"""
@@ -154,13 +140,13 @@ def full_matrix_calculation(self):
default *mBed* heuristic.
"""
self._mbed = False
-
+
@requires_state(AppState.CREATED)
def set_distance_matrix(self, matrix):
"""
Set the pairwise sequence distances, the program should use to
- calculate the guide tree.
-
+ calculate the guide tree.
+
Parameters
----------
matrix : ndarray, shape=(n,n), dtype=float
@@ -172,13 +158,13 @@ def set_distance_matrix(self, matrix):
f"{self._seq_count} sequences"
)
self._dist_matrix = matrix.astype(float, copy=False)
-
+
@requires_state(AppState.JOINED)
def get_distance_matrix(self):
"""
Get the pairwise sequence distances the program used to
- calculate the guide tree.
-
+ calculate the guide tree.
+
Returns
-------
matrix : ndarray, shape=(n,n), dtype=float
@@ -186,17 +172,16 @@ def get_distance_matrix(self):
"""
if self._mbed:
raise ValueError(
- "Getting the distance matrix requires "
- "'full_matrix_calculation()'"
+ "Getting the distance matrix requires " "'full_matrix_calculation()'"
)
return self._dist_matrix
-
+
@requires_state(AppState.CREATED)
def set_guide_tree(self, tree):
"""
Set the guide tree, the program should use for the
progressive alignment.
-
+
Parameters
----------
tree : Tree
@@ -208,31 +193,31 @@ def set_guide_tree(self, tree):
"{self._seq_count} sequences, must be equal"
)
self._tree = tree
-
+
@requires_state(AppState.JOINED)
def get_guide_tree(self):
"""
Get the guide tree created for the progressive alignment.
-
+
Returns
-------
tree : Tree
The guide tree.
"""
return self._tree
-
+
@staticmethod
def supports_nucleotide():
return True
-
+
@staticmethod
def supports_protein():
return True
-
+
@staticmethod
def supports_custom_nucleotide_matrix():
return False
-
+
@staticmethod
def supports_custom_protein_matrix():
return False
diff --git a/src/biotite/application/dssp/__init__.py b/src/biotite/application/dssp/__init__.py
index 93f8f17e2..b1d43758c 100644
--- a/src/biotite/application/dssp/__init__.py
+++ b/src/biotite/application/dssp/__init__.py
@@ -9,4 +9,4 @@
__name__ = "biotite.application.dssp"
__author__ = "Patrick Kunzmann"
-from .app import *
\ No newline at end of file
+from .app import *
diff --git a/src/biotite/application/dssp/app.py b/src/biotite/application/dssp/app.py
index eb0974460..57e4ac0f3 100644
--- a/src/biotite/application/dssp/app.py
+++ b/src/biotite/application/dssp/app.py
@@ -7,11 +7,11 @@
__all__ = ["DsspApp"]
from tempfile import NamedTemporaryFile
-from ..localapp import LocalApp, cleanup_tempfile
-from ..application import AppState, requires_state
-from ...structure.io.pdbx.cif import CIFFile
-from ...structure.io.pdbx.convert import set_structure
import numpy as np
+from biotite.application.application import AppState, requires_state
+from biotite.application.localapp import LocalApp, cleanup_tempfile
+from biotite.structure.io.pdbx.cif import CIFFile
+from biotite.structure.io.pdbx.convert import set_structure
class DsspApp(LocalApp):
@@ -73,7 +73,7 @@ def __init__(self, atom_array, bin_path="mkdssp"):
"occupancy", np.ones(self._array.array_length(), dtype=float)
)
- self._in_file = NamedTemporaryFile("w", suffix=".cif", delete=False)
+ self._in_file = NamedTemporaryFile("w", suffix=".cif", delete=False)
self._out_file = NamedTemporaryFile("r", suffix=".dssp", delete=False)
def run(self):
@@ -81,9 +81,7 @@ def run(self):
set_structure(in_file, self._array)
in_file.write(self._in_file)
self._in_file.flush()
- self.set_arguments(
- ["-i", self._in_file.name, "-o", self._out_file.name]
- )
+ self.set_arguments(["-i", self._in_file.name, "-o", self._out_file.name])
super().run()
def evaluate(self):
@@ -93,13 +91,12 @@ def evaluate(self):
sse_start = None
for i, line in enumerate(lines):
if line.startswith(" # RESIDUE AA STRUCTURE"):
- sse_start = i+1
+ sse_start = i + 1
if sse_start is None:
raise ValueError("DSSP file does not contain SSE records")
# Remove "!" for missing residues
lines = [
- line for line in lines[sse_start:]
- if len(line) != 0 and line[13] != "!"
+ line for line in lines[sse_start:] if len(line) != 0 and line[13] != "!"
]
self._sse = np.zeros(len(lines), dtype="U1")
# Parse file for SSE letters
diff --git a/src/biotite/application/localapp.py b/src/biotite/application/localapp.py
index acfd1bd8b..990f7ce0a 100644
--- a/src/biotite/application/localapp.py
+++ b/src/biotite/application/localapp.py
@@ -9,23 +9,29 @@
import abc
import copy
from os import chdir, getcwd, remove
-from .application import Application, AppState, AppStateError, requires_state
-from subprocess import Popen, PIPE, SubprocessError, TimeoutExpired
+from subprocess import PIPE, Popen, SubprocessError, TimeoutExpired
+from biotite.application.application import (
+ Application,
+ AppState,
+ AppStateError,
+ requires_state,
+)
+
class LocalApp(Application, metaclass=abc.ABCMeta):
"""
The base class for all locally installed applications, that are used
via the command line.
-
+
Internally this creates a :class:`Popen` instance, which handles
the execution.
-
+
Parameters
----------
bin_path : str
Path of the application represented by this class.
"""
-
+
def __init__(self, bin_path):
super().__init__()
self._bin_path = bin_path
@@ -35,28 +41,28 @@ def __init__(self, bin_path):
self._process = None
self._command = None
self._stdin_file = None
-
+
@requires_state(AppState.CREATED)
def set_arguments(self, arguments):
"""
Set command line arguments for the application run.
-
+
PROTECTED: Do not call from outside.
-
+
Parameters
----------
arguments : list of str
A list of strings representing the command line options.
"""
self._arguments = copy.copy(arguments)
-
+
@requires_state(AppState.CREATED)
def set_stdin(self, file):
"""
Set a file as standard input for the application run.
-
+
PROTECTED: Do not call from outside.
-
+
Parameters
----------
file : file object
@@ -65,7 +71,7 @@ def set_stdin(self, file):
such as `StringIO` are invalid.
"""
self._stdin_file = file
-
+
@requires_state(AppState.CREATED)
def add_additional_options(self, options):
"""
@@ -81,12 +87,12 @@ def add_additional_options(self, options):
It is recommended to use this method only, when the respective
:class:`LocalApp` subclass does not provide a method to set the
desired option.
-
+
Parameters
----------
options : list of str
A list of strings representing the command line options.
-
+
Notes
-----
In order to see which options the command line execution used,
@@ -114,27 +120,24 @@ def add_additional_options(self, options):
clustalo --full --in ...fa --out ...fa --force --output-order=tree-order --seqtype Protein --guidetree-out ...tree
"""
self._options += options
-
+
@requires_state(
- AppState.RUNNING | \
- AppState.CANCELLED | \
- AppState.FINISHED | \
- AppState.JOINED
+ AppState.RUNNING | AppState.CANCELLED | AppState.FINISHED | AppState.JOINED
)
def get_command(self):
"""
Get the executed command.
Cannot be called until the application has been started.
-
+
Returns
-------
command : str
The executed command.
-
+
Examples
--------
-
+
>>> seq1 = ProteinSequence("BIQTITE")
>>> seq2 = ProteinSequence("TITANITE")
>>> seq3 = ProteinSequence("BISMITE")
@@ -146,72 +149,71 @@ def get_command(self):
"""
return " ".join(self._command)
-
@requires_state(AppState.CREATED)
def set_exec_dir(self, exec_dir):
"""
Set the directory where the application should be executed.
If not set, it will be executed in the working directory at the
- time the application was created.
-
+ time the application was created.
+
PROTECTED: Do not call from outside.
-
+
Parameters
----------
exec_dir : str
The execution directory.
"""
self._exec_dir = exec_dir
-
+
@requires_state(AppState.RUNNING | AppState.FINISHED)
def get_process(self):
"""
Get the `Popen` instance.
-
+
PROTECTED: Do not call from outside.
-
+
Returns
-------
process : Popen
The `Popen` instance
"""
return self._process
-
+
@requires_state(AppState.FINISHED | AppState.JOINED)
def get_exit_code(self):
"""
Get the exit code of the process.
-
+
PROTECTED: Do not call from outside.
-
+
Returns
-------
code : int
The exit code.
"""
return self._process.returncode
-
+
@requires_state(AppState.FINISHED | AppState.JOINED)
def get_stdout(self):
"""
Get the STDOUT pipe content of the process.
-
+
PROTECTED: Do not call from outside.
-
+
Returns
-------
stdout : str
The standard output.
"""
return self._stdout
-
+
@requires_state(AppState.FINISHED | AppState.JOINED)
def get_stderr(self):
"""
Get the STDERR pipe content of the process.
-
+
PROTECTED: Do not call from outside.
-
+
Returns
-------
stdout : str
@@ -221,38 +223,37 @@ def get_stderr(self):
def run(self):
cwd = getcwd()
- chdir(self._exec_dir)
+ chdir(self._exec_dir)
self._command = [self._bin_path] + self._options + self._arguments
self._process = Popen(
- self._command, stdin=self._stdin_file, stdout=PIPE, stderr=PIPE,
- encoding="UTF-8"
+ self._command,
+ stdin=self._stdin_file,
+ stdout=PIPE,
+ stderr=PIPE,
+ encoding="UTF-8",
)
chdir(cwd)
-
+
def is_finished(self):
code = self._process.poll()
- if code == None:
+ if code is None:
return False
else:
self._stdout, self._stderr = self._process.communicate()
return True
-
+
@requires_state(AppState.RUNNING | AppState.FINISHED)
def join(self, timeout=None):
# Override method as repetitive calls of 'is_finished()'
# are not necessary as 'communicate()' already waits for the
# finished application
try:
- self._stdout, self._stderr = self._process.communicate(
- timeout=timeout
- )
+ self._stdout, self._stderr = self._process.communicate(timeout=timeout)
except TimeoutExpired:
self.cancel()
- raise TimeoutError(
- f"The application expired its timeout ({timeout:.1f} s)"
- )
+ raise TimeoutError(f"The application expired its timeout ({timeout:.1f} s)")
self._state = AppState.FINISHED
-
+
try:
self.evaluate()
except AppStateError:
@@ -263,12 +264,11 @@ def join(self, timeout=None):
else:
self._state = AppState.JOINED
self.clean_up()
-
-
+
def wait_interval(self):
# Not used in this implementation of 'join()'
raise NotImplementedError()
-
+
def evaluate(self):
super().evaluate()
# Check if application terminated correctly
@@ -276,10 +276,9 @@ def evaluate(self):
if exit_code != 0:
err_msg = self.get_stderr().replace("\n", " ")
raise SubprocessError(
- f"'{self._bin_path}' returned with exit code {exit_code}: "
- f"{err_msg}"
+ f"'{self._bin_path}' returned with exit code {exit_code}: " f"{err_msg}"
)
-
+
def clean_up(self):
if self.get_app_state() == AppState.CANCELLED:
self._process.kill()
@@ -290,7 +289,7 @@ def cleanup_tempfile(temp_file):
Close a :class:`NamedTemporaryFile` and delete it manually,
if `delete` is set to ``False``.
This function is a small helper function intended for usage in
- `LocalApp` subclasses.
+ `LocalApp` subclasses.
The manual deletion is necessary, as Windows does not allow opening
a :class:`NamedTemporaryFile` a second time
@@ -303,4 +302,4 @@ def cleanup_tempfile(temp_file):
"""
temp_file.close()
if not temp_file.delete:
- remove(temp_file.name)
\ No newline at end of file
+ remove(temp_file.name)
diff --git a/src/biotite/application/mafft/__init__.py b/src/biotite/application/mafft/__init__.py
index 52f86e0ac..19def8bad 100644
--- a/src/biotite/application/mafft/__init__.py
+++ b/src/biotite/application/mafft/__init__.py
@@ -9,4 +9,4 @@
__name__ = "biotite.application.mafft"
__author__ = "Patrick Kunzmann"
-from .app import *
\ No newline at end of file
+from .app import *
diff --git a/src/biotite/application/mafft/app.py b/src/biotite/application/mafft/app.py
index 2d4a22530..84f3f6b9b 100644
--- a/src/biotite/application/mafft/app.py
+++ b/src/biotite/application/mafft/app.py
@@ -6,25 +6,19 @@
__author__ = "Patrick Kunzmann"
__all__ = ["MafftApp"]
-import re
import os
-from ..msaapp import MSAApp
-from ..application import AppState, requires_state
-from ...sequence.sequence import Sequence
-from ...sequence.seqtypes import NucleotideSequence, ProteinSequence
-from ...sequence.io.fasta.file import FastaFile
-from ...sequence.align.alignment import Alignment
-from ...sequence.phylo.tree import Tree
-
+import re
+from biotite.application.application import AppState, requires_state
+from biotite.application.msaapp import MSAApp
+from biotite.sequence.phylo.tree import Tree
_prefix_pattern = re.compile(r"\d*_")
-
class MafftApp(MSAApp):
"""
Perform a multiple sequence alignment using MAFFT.
-
+
Parameters
----------
sequences : list of Sequence
@@ -33,7 +27,7 @@ class MafftApp(MSAApp):
Path of the MAFFT binary.
matrix : SubstitutionMatrix, optional
A custom substitution matrix.
-
+
Examples
--------
@@ -51,19 +45,19 @@ class MafftApp(MSAApp):
-BISMITE
--IQLITE
"""
-
+
def __init__(self, sequences, bin_path="mafft", matrix=None):
super().__init__(sequences, bin_path, matrix)
self._tree = None
self._out_tree_file_name = self.get_input_file_path() + ".tree"
-
+
def run(self):
args = [
"--quiet",
"--auto",
"--treeout",
# Get the reordered alignment in order for
- # get_alignment_order() to work properly
+ # get_alignment_order() to work properly
"--reorder",
]
if self.get_seqtype() == "protein":
@@ -75,7 +69,7 @@ def run(self):
args += [self.get_input_file_path()]
self.set_arguments(args)
super().run()
-
+
def evaluate(self):
with open(self.get_output_file_path(), "w") as f:
# MAFFT outputs alignment to stdout
@@ -89,7 +83,7 @@ def evaluate(self):
# -> remove the '_' prefix
newick = re.sub(_prefix_pattern, "", raw_newick)
self._tree = Tree.from_newick(newick)
-
+
def clean_up(self):
os.remove(self._out_tree_file_name)
@@ -97,26 +91,26 @@ def clean_up(self):
def get_guide_tree(self):
"""
Get the guide tree created for the progressive alignment.
-
+
Returns
-------
tree : Tree
The guide tree.
"""
return self._tree
-
+
@staticmethod
def supports_nucleotide():
return True
-
+
@staticmethod
def supports_protein():
return True
-
+
@staticmethod
def supports_custom_nucleotide_matrix():
return True
-
+
@staticmethod
def supports_custom_protein_matrix():
return True
diff --git a/src/biotite/application/msaapp.py b/src/biotite/application/msaapp.py
index bf490872e..31eb0064c 100644
--- a/src/biotite/application/msaapp.py
+++ b/src/biotite/application/msaapp.py
@@ -7,22 +7,22 @@
__all__ = ["MSAApp"]
import abc
-from tempfile import NamedTemporaryFile
from collections import OrderedDict
+from tempfile import NamedTemporaryFile
import numpy as np
-from .localapp import LocalApp, cleanup_tempfile
-from .application import AppState, requires_state
-from ..sequence.seqtypes import NucleotideSequence, ProteinSequence
-from ..sequence.io.fasta.file import FastaFile
-from ..sequence.align.alignment import Alignment
-from .util import map_sequence, map_matrix
+from biotite.application.application import AppState, requires_state
+from biotite.application.localapp import LocalApp, cleanup_tempfile
+from biotite.application.util import map_matrix, map_sequence
+from biotite.sequence.align.alignment import Alignment
+from biotite.sequence.io.fasta.file import FastaFile
+from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence
class MSAApp(LocalApp, metaclass=abc.ABCMeta):
"""
This is an abstract base class for multiple sequence alignment
software.
-
+
It handles conversion of :class:`Sequence` objects to FASTA input
and FASTA output to an :class:`Alignment` object.
Inheriting subclasses only need to incorporate the file path
@@ -41,10 +41,10 @@ class MSAApp(LocalApp, metaclass=abc.ABCMeta):
sequences are mapped back into the original sequence types.
The mapping does not work, when the alphabet of the exotic
sequences is larger than the amino acid alphabet.
-
+
Internally this creates a :class:`Popen` instance, which handles
the execution.
-
+
Parameters
----------
sequences : iterable object of Sequence
@@ -54,10 +54,10 @@ class MSAApp(LocalApp, metaclass=abc.ABCMeta):
matrix : SubstitutionMatrix, optional
A custom substitution matrix.
"""
-
+
def __init__(self, sequences, bin_path, matrix=None):
super().__init__(bin_path)
-
+
if len(sequences) < 2:
raise ValueError("At least two sequences are required")
# Check if all sequences share the same alphabet
@@ -68,40 +68,39 @@ def __init__(self, sequences, bin_path, matrix=None):
# Check matrix symmetry
if matrix is not None and not matrix.is_symmetric():
raise ValueError(
- "A symmetric matrix is required for "
- "multiple sequence alignments"
+ "A symmetric matrix is required for " "multiple sequence alignments"
)
-
# Check whether the program supports the alignment for the given
# sequence type
- if ProteinSequence.alphabet.extends(alphabet) \
- and self.supports_protein():
- self._is_mapped = False
- self._seqtype = "protein"
- if matrix is not None:
- if not self.supports_custom_protein_matrix():
- raise TypeError(
- "The software does not support custom "
- "substitution matrices for protein sequences"
- )
- self._matrix = matrix
- else:
- self._matrix = None
-
- elif NucleotideSequence.alphabet_amb.extends(alphabet) \
- and self.supports_nucleotide():
- self._is_mapped = False
- self._seqtype = "nucleotide"
- if matrix is not None:
- if not self.supports_custom_nucleotide_matrix():
- raise TypeError(
- "The software does not support custom "
- "substitution matrices for nucleotide sequences"
- )
- self._matrix = matrix
- else:
- self._matrix = None
+ if ProteinSequence.alphabet.extends(alphabet) and self.supports_protein():
+ self._is_mapped = False
+ self._seqtype = "protein"
+ if matrix is not None:
+ if not self.supports_custom_protein_matrix():
+ raise TypeError(
+ "The software does not support custom "
+ "substitution matrices for protein sequences"
+ )
+ self._matrix = matrix
+ else:
+ self._matrix = None
+
+ elif (
+ NucleotideSequence.alphabet_amb.extends(alphabet)
+ and self.supports_nucleotide()
+ ):
+ self._is_mapped = False
+ self._seqtype = "nucleotide"
+ if matrix is not None:
+ if not self.supports_custom_nucleotide_matrix():
+ raise TypeError(
+ "The software does not support custom "
+ "substitution matrices for nucleotide sequences"
+ )
+ self._matrix = matrix
+ else:
+ self._matrix = None
else:
# For all other sequence types, try to map the sequence into
@@ -126,26 +125,16 @@ def __init__(self, sequences, bin_path, matrix=None):
self._sequences = sequences
# Sequence masquerades as protein
self._seqtype = "protein"
- self._mapped_sequences = [
- map_sequence(sequence) for sequence in sequences
- ]
+ self._mapped_sequences = [map_sequence(sequence) for sequence in sequences]
self._matrix = map_matrix(matrix)
-
self._sequences = sequences
- self._in_file = NamedTemporaryFile(
- "w", suffix=".fa", delete=False
- )
- self._out_file = NamedTemporaryFile(
- "r", suffix=".fa", delete=False
- )
- self._matrix_file = NamedTemporaryFile(
- "w", suffix=".mat", delete=False
- )
+ self._in_file = NamedTemporaryFile("w", suffix=".fa", delete=False)
+ self._out_file = NamedTemporaryFile("r", suffix=".fa", delete=False)
+ self._matrix_file = NamedTemporaryFile("w", suffix=".mat", delete=False)
def run(self):
- sequences = self._sequences if not self._is_mapped \
- else self._mapped_sequences
+ sequences = self._sequences if not self._is_mapped else self._mapped_sequences
sequences_file = FastaFile()
for i, seq in enumerate(sequences):
sequences_file[str(i)] = str(seq)
@@ -155,7 +144,7 @@ def run(self):
self._matrix_file.write(str(self._matrix))
self._matrix_file.flush()
super().run()
-
+
def evaluate(self):
super().evaluate()
alignment_file = FastaFile.read(self._out_file)
@@ -169,26 +158,26 @@ def evaluate(self):
# Also obtain original order
self._order = np.zeros(len(seq_dict), dtype=int)
for i, seq_index in enumerate(seq_dict):
- self._order[i] = int(seq_index)
-
+ self._order[i] = int(seq_index)
+
def clean_up(self):
super().clean_up()
cleanup_tempfile(self._in_file)
cleanup_tempfile(self._out_file)
cleanup_tempfile(self._matrix_file)
-
+
@requires_state(AppState.JOINED)
def get_alignment(self):
"""
Get the resulting multiple sequence alignment.
-
+
Returns
-------
alignment : Alignment
The global multiple sequence alignment.
"""
return self._alignment
-
+
@requires_state(AppState.JOINED)
def get_alignment_order(self):
"""
@@ -202,12 +191,12 @@ def get_alignment_order(self):
order.
This method returns the order of the sequences intended by the
MSA software.
-
+
Returns
-------
order : ndarray, dtype=int
The sequence order intended by the MSA software.
-
+
Examples
--------
Align sequences and restore the original order:
@@ -220,39 +209,39 @@ def get_alignment_order(self):
alignment = alignment[:, order]
"""
return self._order
-
+
def get_input_file_path(self):
"""
Get input file path (FASTA format).
-
+
PROTECTED: Do not call from outside.
-
+
Returns
-------
path : str
Path of input file.
"""
return self._in_file.name
-
+
def get_output_file_path(self):
"""
Get output file path (FASTA format).
-
+
PROTECTED: Do not call from outside.
-
+
Returns
-------
path : str
Path of output file.
"""
return self._out_file.name
-
+
def get_matrix_file_path(self):
"""
Get file path for custom substitution matrix.
-
+
PROTECTED: Do not call from outside.
-
+
Returns
-------
path : str or None
@@ -260,7 +249,7 @@ def get_matrix_file_path(self):
None if no matrix was given.
"""
return self._matrix_file.name if self._matrix is not None else None
-
+
def get_seqtype(self):
"""
Get the type of aligned sequences.
@@ -268,16 +257,16 @@ def get_seqtype(self):
When a custom sequence type (neither nucleotide nor protein)
is mapped onto a protein sequence, the return value is also
``'protein'``.
-
+
PROTECTED: Do not call from outside.
-
+
Returns
-------
seqtype : {'nucleotide', 'protein'}
Type of sequences to be aligned.
"""
return self._seqtype
-
+
@staticmethod
@abc.abstractmethod
def supports_nucleotide():
@@ -289,11 +278,11 @@ def supports_nucleotide():
-------
support : bool
True, if the class has support, false otherwise.
-
+
PROTECTED: Override when inheriting.
"""
pass
-
+
@staticmethod
@abc.abstractmethod
def supports_protein():
@@ -305,11 +294,11 @@ def supports_protein():
-------
support : bool
True, if the class has support, false otherwise.
-
+
PROTECTED: Override when inheriting.
"""
pass
-
+
@staticmethod
@abc.abstractmethod
def supports_custom_nucleotide_matrix():
@@ -321,11 +310,11 @@ def supports_custom_nucleotide_matrix():
-------
support : bool
True, if the class has support, false otherwise.
-
+
PROTECTED: Override when inheriting.
"""
pass
-
+
@staticmethod
@abc.abstractmethod
def supports_custom_protein_matrix():
@@ -337,19 +326,19 @@ def supports_custom_protein_matrix():
-------
support : bool
True, if the class has support, false otherwise.
-
+
PROTECTED: Override when inheriting.
"""
pass
-
+
@classmethod
def align(cls, sequences, bin_path=None, matrix=None):
"""
Perform a multiple sequence alignment.
-
+
This is a convenience function, that wraps the :class:`MSAApp`
execution.
-
+
Parameters
----------
sequences : iterable object of Sequence
@@ -359,7 +348,7 @@ def align(cls, sequences, bin_path=None, matrix=None):
path will be used.
matrix : SubstitutionMatrix, optional
A custom substitution matrix.
-
+
Returns
-------
alignment : Alignment
diff --git a/src/biotite/application/muscle/__init__.py b/src/biotite/application/muscle/__init__.py
index 644e7a118..c75f0f8be 100644
--- a/src/biotite/application/muscle/__init__.py
+++ b/src/biotite/application/muscle/__init__.py
@@ -10,4 +10,4 @@
__author__ = "Patrick Kunzmann"
from .app3 import *
-from .app5 import *
\ No newline at end of file
+from .app5 import *
diff --git a/src/biotite/application/muscle/app3.py b/src/biotite/application/muscle/app3.py
index 8df72ce65..60118966a 100644
--- a/src/biotite/application/muscle/app3.py
+++ b/src/biotite/application/muscle/app3.py
@@ -6,25 +6,22 @@
__author__ = "Patrick Kunzmann"
__all__ = ["MuscleApp"]
-import re
import numbers
-import warnings
+import re
import subprocess
+import warnings
+from collections.abc import Sequence
from tempfile import NamedTemporaryFile
-from ..localapp import cleanup_tempfile
-from ..msaapp import MSAApp
-from ..application import AppState, VersionError, requires_state
-from ...sequence.sequence import Sequence
-from ...sequence.seqtypes import NucleotideSequence, ProteinSequence
-from ...sequence.align.matrix import SubstitutionMatrix
-from ...sequence.align.alignment import Alignment
-from ...sequence.phylo.tree import Tree
+from biotite.application.application import AppState, VersionError, requires_state
+from biotite.application.localapp import cleanup_tempfile
+from biotite.application.msaapp import MSAApp
+from biotite.sequence.phylo.tree import Tree
class MuscleApp(MSAApp):
"""
Perform a multiple sequence alignment using MUSCLE version 3.
-
+
Parameters
----------
sequences : list of Sequence
@@ -33,11 +30,11 @@ class MuscleApp(MSAApp):
Path of the MUSCLE binary.
matrix : SubstitutionMatrix, optional
A custom substitution matrix.
-
+
See also
--------
Muscle5App
-
+
Examples
--------
@@ -55,34 +52,32 @@ class MuscleApp(MSAApp):
BISM-ITE
-IQL-ITE
"""
-
+
def __init__(self, sequences, bin_path="muscle", matrix=None):
major_version = get_version(bin_path)[0]
if major_version != 3:
- raise VersionError(
- f"Muscle 3 is required, got version {major_version}"
- )
-
+ raise VersionError(f"Muscle 3 is required, got version {major_version}")
+
super().__init__(sequences, bin_path, matrix)
self._gap_open = None
self._gap_ext = None
self._terminal_penalty = None
self._tree1 = None
self._tree2 = None
- self._out_tree1_file = NamedTemporaryFile(
- "r", suffix=".tree", delete=False
- )
- self._out_tree2_file = NamedTemporaryFile(
- "r", suffix=".tree", delete=False
- )
-
+ self._out_tree1_file = NamedTemporaryFile("r", suffix=".tree", delete=False)
+ self._out_tree2_file = NamedTemporaryFile("r", suffix=".tree", delete=False)
+
def run(self):
args = [
"-quiet",
- "-in", self.get_input_file_path(),
- "-out", self.get_output_file_path(),
- "-tree1", self._out_tree1_file.name,
- "-tree2", self._out_tree2_file.name,
+ "-in",
+ self.get_input_file_path(),
+ "-out",
+ self.get_output_file_path(),
+ "-tree1",
+ self._out_tree1_file.name,
+ "-tree2",
+ self._out_tree2_file.name,
]
if self.get_seqtype() == "protein":
args += ["-seqtype", "protein"]
@@ -91,7 +86,7 @@ def run(self):
if self.get_matrix_file_path() is not None:
args += ["-matrix", self.get_matrix_file_path()]
if self._gap_open is not None and self._gap_ext is not None:
- args += ["-gapopen", f"{self._gap_open:.1f}"]
+ args += ["-gapopen", f"{self._gap_open:.1f}"]
args += ["-gapextend", f"{self._gap_ext:.1f}"]
# When the gap penalty is set,
# use the penalty also for hydrophobic regions
@@ -100,7 +95,7 @@ def run(self):
args += ["-center", "0.0"]
self.set_arguments(args)
super().run()
-
+
def evaluate(self):
super().evaluate()
@@ -108,23 +103,19 @@ def evaluate(self):
if len(newick) > 0:
self._tree1 = Tree.from_newick(newick)
else:
- warnings.warn(
- "MUSCLE did not write a tree file from the first iteration"
- )
-
+ warnings.warn("MUSCLE did not write a tree file from the first iteration")
+
newick = self._out_tree2_file.read().replace("\n", "")
if len(newick) > 0:
self._tree2 = Tree.from_newick(newick)
else:
- warnings.warn(
- "MUSCLE did not write a tree file from the second iteration"
- )
-
+ warnings.warn("MUSCLE did not write a tree file from the second iteration")
+
def clean_up(self):
super().clean_up()
cleanup_tempfile(self._out_tree1_file)
cleanup_tempfile(self._out_tree2_file)
-
+
@requires_state(AppState.CREATED)
def set_gap_penalty(self, gap_penalty):
"""
@@ -145,20 +136,20 @@ def set_gap_penalty(self, gap_penalty):
if gap_penalty > 0:
raise ValueError("Gap penalty must be negative")
self._gap_open = gap_penalty
- self._gap_ext= gap_penalty
- elif type(gap_penalty) == tuple:
+ self._gap_ext = gap_penalty
+ elif isinstance(gap_penalty, Sequence):
if gap_penalty[0] > 0 or gap_penalty[1] > 0:
- raise ValueError("Gap penalty must be negative")
+ raise ValueError("Gap penalty must be negative")
self._gap_open = gap_penalty[0]
self._gap_ext = gap_penalty[1]
else:
raise TypeError("Gap penalty must be either float or tuple")
-
+
@requires_state(AppState.JOINED)
def get_guide_tree(self, iteration="identity"):
"""
Get the guide tree created for the progressive alignment.
-
+
Parameters
----------
iteration : {'kmer', 'identity'}
@@ -168,7 +159,7 @@ def get_guide_tree(self, iteration="identity"):
If 'identity' the second iteration tree is returned.
This tree uses distances based on the pairwise sequence
identity after the first progressive alignment iteration.
-
+
Returns
-------
tree : Tree
@@ -180,32 +171,31 @@ def get_guide_tree(self, iteration="identity"):
return self._tree2
else:
raise ValueError("Iteration must be 'kmer' or 'identity'")
-
+
@staticmethod
def supports_nucleotide():
return True
-
+
@staticmethod
def supports_protein():
return True
-
+
@staticmethod
def supports_custom_nucleotide_matrix():
return False
-
+
@staticmethod
def supports_custom_protein_matrix():
return True
-
+
@classmethod
- def align(cls, sequences, bin_path=None, matrix=None,
- gap_penalty=None):
+ def align(cls, sequences, bin_path=None, matrix=None, gap_penalty=None):
"""
Perform a multiple sequence alignment.
-
+
This is a convenience function, that wraps the :class:`MuscleApp`
execution.
-
+
Parameters
----------
sequences : iterable object of Sequence
@@ -222,7 +212,7 @@ def align(cls, sequences, bin_path=None, matrix=None,
The first value in the tuple is the gap opening penalty,
the second value is the gap extension penalty.
The values need to be negative.
-
+
Returns
-------
alignment : Alignment
@@ -240,15 +230,11 @@ def align(cls, sequences, bin_path=None, matrix=None,
def get_version(bin_path="muscle"):
- output = subprocess.run(
- [bin_path, "-version"], capture_output=True, text=True
- )
+ output = subprocess.run([bin_path, "-version"], capture_output=True, text=True)
# Find matches for version string containing major and minor version
- match = re.search("\d+\.\d+", output.stdout)
+ match = re.search(r"\d+\.\d+", output.stdout)
if match is None:
- raise subprocess.SubprocessError(
- "Could not determine Muscle version"
- )
+ raise subprocess.SubprocessError("Could not determine Muscle version")
version_string = match.group(0)
splitted = version_string.split(".")
- return int(splitted[0]), int(splitted[1])
\ No newline at end of file
+ return int(splitted[0]), int(splitted[1])
diff --git a/src/biotite/application/muscle/app5.py b/src/biotite/application/muscle/app5.py
index 326c92227..cc1ef5e2a 100644
--- a/src/biotite/application/muscle/app5.py
+++ b/src/biotite/application/muscle/app5.py
@@ -6,31 +6,22 @@
__author__ = "Patrick Kunzmann"
__all__ = ["Muscle5App"]
-import numbers
-import warnings
-from tempfile import NamedTemporaryFile
-from ..localapp import cleanup_tempfile
-from ..msaapp import MSAApp
-from ..application import AppState, VersionError, requires_state
-from ...sequence.sequence import Sequence
-from ...sequence.seqtypes import NucleotideSequence, ProteinSequence
-from ...sequence.align.matrix import SubstitutionMatrix
-from ...sequence.align.alignment import Alignment
-from ...sequence.phylo.tree import Tree
-from .app3 import get_version
+from biotite.application.application import AppState, VersionError, requires_state
+from biotite.application.msaapp import MSAApp
+from biotite.application.muscle.app3 import get_version
class Muscle5App(MSAApp):
"""
Perform a multiple sequence alignment using MUSCLE version 5.
-
+
Parameters
----------
sequences : list of Sequence
The sequences to be aligned.
bin_path : str, optional
Path of the MUSCLE binary.
-
+
See also
--------
MuscleApp
@@ -38,7 +29,7 @@ class Muscle5App(MSAApp):
Notes
-----
Alignment ensemble generation is not supported, yet.
-
+
Examples
--------
@@ -56,14 +47,14 @@ class Muscle5App(MSAApp):
BI-SMITE
-I-QLITE
"""
-
+
def __init__(self, sequences, bin_path="muscle"):
major_version = get_version(bin_path)[0]
if major_version < 5:
raise VersionError(
f"At least Muscle 5 is required, got version {major_version}"
)
-
+
super().__init__(sequences, bin_path)
self._mode = "align"
self._consiters = None
@@ -86,7 +77,7 @@ def set_iterations(self, consistency=None, refinement=None):
self._consiters = consistency
if refinement is not None:
self._refineiters = refinement
-
+
@requires_state(AppState.CREATED)
def set_thread_number(self, number):
"""
@@ -110,48 +101,49 @@ def run(self):
args = [
f"-{self._mode}",
self.get_input_file_path(),
- "-output", self.get_output_file_path(),
+ "-output",
+ self.get_output_file_path(),
]
if self.get_seqtype() == "protein":
args += ["-amino"]
else:
args += ["-nt"]
if self._n_threads is not None:
- args += ["-threads", str(self._n_threads)]
+ args += ["-threads", str(self._n_threads)]
if self._consiters is not None:
- args += ["-consiters", str(self._consiters)]
+ args += ["-consiters", str(self._consiters)]
if self._refineiters is not None:
- args += ["-refineiters", str(self._refineiters)]
+ args += ["-refineiters", str(self._refineiters)]
self.set_arguments(args)
super().run()
-
+
def clean_up(self):
super().clean_up()
-
+
@staticmethod
def supports_nucleotide():
return True
-
+
@staticmethod
def supports_protein():
return True
-
+
@staticmethod
def supports_custom_nucleotide_matrix():
return False
-
+
@staticmethod
def supports_custom_protein_matrix():
return False
-
+
@classmethod
def align(cls, sequences, bin_path="muscle"):
"""
Perform a multiple sequence alignment.
-
+
This is a convenience function, that wraps the :class:`Muscle5App`
execution.
-
+
Parameters
----------
sequences : iterable object of Sequence
@@ -159,7 +151,7 @@ def align(cls, sequences, bin_path="muscle"):
bin_path : str, optional
Path of the MSA software binary. By default, the default path
will be used.
-
+
Returns
-------
alignment : Alignment
diff --git a/src/biotite/application/sra/__init__.py b/src/biotite/application/sra/__init__.py
index d68a49d3e..f69fccde6 100644
--- a/src/biotite/application/sra/__init__.py
+++ b/src/biotite/application/sra/__init__.py
@@ -15,4 +15,4 @@
__name__ = "biotite.application.sra"
__author__ = "Patrick Kunzmann"
-from .app import *
\ No newline at end of file
+from .app import *
diff --git a/src/biotite/application/sra/app.py b/src/biotite/application/sra/app.py
index 6f5a20955..7fc39ab4c 100644
--- a/src/biotite/application/sra/app.py
+++ b/src/biotite/application/sra/app.py
@@ -7,17 +7,21 @@
__all__ = ["FastaDumpApp", "FastqDumpApp"]
import abc
-from os.path import join
-from subprocess import Popen, SubprocessError, PIPE, TimeoutExpired
import glob
+from os.path import join
+from subprocess import PIPE, Popen, SubprocessError, TimeoutExpired
from tempfile import TemporaryDirectory
-from ..application import Application, AppState, AppStateError, \
- requires_state
-from ...sequence.seqtypes import NucleotideSequence
-from ...sequence.io.fastq.file import FastqFile
-from ...sequence.io.fasta.file import FastaFile
-from ...sequence.io.fastq.convert import get_sequences as get_sequences_and_scores
-from ...sequence.io.fasta.convert import get_sequences
+from biotite.application.application import (
+ Application,
+ AppState,
+ AppStateError,
+ requires_state,
+)
+from biotite.sequence.io.fasta.convert import get_sequences
+from biotite.sequence.io.fasta.file import FastaFile
+from biotite.sequence.io.fastq.convert import get_sequences as get_sequences_and_scores
+from biotite.sequence.io.fastq.file import FastqFile
+from biotite.sequence.seqtypes import NucleotideSequence
# Do not use LocalApp, as two programs are executed
@@ -48,8 +52,13 @@ class _DumpApp(Application, metaclass=abc.ABCMeta):
the score format.
"""
- def __init__(self, uid, output_path_prefix=None,
- prefetch_path="prefetch", fasterq_dump_path="fasterq-dump"):
+ def __init__(
+ self,
+ uid,
+ output_path_prefix=None,
+ prefetch_path="prefetch",
+ fasterq_dump_path="fasterq-dump",
+ ):
super().__init__()
self._prefetch_path = prefetch_path
self._fasterq_dump_path = fasterq_dump_path
@@ -62,21 +71,16 @@ def __init__(self, uid, output_path_prefix=None,
self._prefetch_process = None
self._fasterq_dump_process = None
-
@requires_state(AppState.RUNNING | AppState.FINISHED)
def join(self, timeout=None):
# Override method as repetitive calls of 'is_finished()'
# are not necessary as 'communicate()' already waits for the
# finished application
try:
- _, self._stderr = self._process.communicate(
- timeout=timeout
- )
+ _, self._stderr = self._process.communicate(timeout=timeout)
except TimeoutExpired:
self.cancel()
- raise TimeoutError(
- f"The application expired its timeout ({timeout:.1f} s)"
- )
+ raise TimeoutError(f"The application expired its timeout ({timeout:.1f} s)")
self._state = AppState.FINISHED
try:
@@ -90,7 +94,6 @@ def join(self, timeout=None):
self._state = AppState.JOINED
self.clean_up()
-
def run(self):
# Prefetch into a temp directory with file name equaling UID
# This ensures that the ID in the header is not the temp prefix
@@ -105,16 +108,14 @@ def run(self):
command, stdout=PIPE, stderr=PIPE, shell=True, encoding="UTF-8"
)
-
def is_finished(self):
code = self._process.poll()
- if code == None:
+ if code is None:
return False
else:
- _, self._stderr = self._process.communicate()
+ _, self._stderr = self._process.communicate()
return True
-
def evaluate(self):
super().evaluate()
# Check if application terminated correctly
@@ -128,26 +129,24 @@ def evaluate(self):
self._file_names = (
# For entries with one read per spot
- glob.glob(self._prefix + ".fastq") +
+ glob.glob(self._prefix + ".fastq")
+ +
# For entries with multiple reads per spot
glob.glob(self._prefix + "_*.fastq")
)
# Only load FASTQ files into memory when needed
self._fastq_files = None
-
def wait_interval(self):
# Not used in this implementation of 'join()'
raise NotImplementedError()
-
def clean_up(self):
if self.get_app_state() == AppState.CANCELLED:
self._process.kill()
# Directory with temp files does not need to be deleted,
# as temp dir is automatically deleted upon object destruction
-
@requires_state(AppState.CREATED)
def get_prefetch_options(self):
"""
@@ -176,7 +175,6 @@ def get_fastq_dump_options(self):
"""
return ""
-
@requires_state(AppState.JOINED)
def get_file_paths(self):
"""
@@ -189,7 +187,6 @@ def get_file_paths(self):
"""
return self._file_names
-
@requires_state(AppState.JOINED)
@abc.abstractmethod
def get_sequences(self):
@@ -236,15 +233,18 @@ class FastqDumpApp(_DumpApp):
the score format.
"""
- def __init__(self, uid, output_path_prefix=None, prefetch_path="prefetch",
- fasterq_dump_path="fasterq-dump", offset="Sanger"):
- super().__init__(
- uid, output_path_prefix, prefetch_path, fasterq_dump_path
- )
+ def __init__(
+ self,
+ uid,
+ output_path_prefix=None,
+ prefetch_path="prefetch",
+ fasterq_dump_path="fasterq-dump",
+ offset="Sanger",
+ ):
+ super().__init__(uid, output_path_prefix, prefetch_path, fasterq_dump_path)
self._offset = offset
self._fastq_files = None
-
@requires_state(AppState.JOINED)
def get_fastq(self):
"""
@@ -265,20 +265,16 @@ def get_fastq(self):
]
return self._fastq_files
-
@requires_state(AppState.JOINED)
def get_sequences(self):
return [
{
- header: NucleotideSequence(
- seq_str.replace("U","T").replace("X","N")
- )
+ header: NucleotideSequence(seq_str.replace("U", "T").replace("X", "N"))
for header, (seq_str, _) in fastq_file.items()
}
for fastq_file in self.get_fastq()
]
-
@requires_state(AppState.JOINED)
def get_sequences_and_scores(self):
"""
@@ -294,15 +290,17 @@ def get_sequences_and_scores(self):
Each item in the list is a dictionary mapping identifiers to its
corresponding sequence and score values.
"""
- return [
- get_sequences_and_scores(fastq_file)
- for fastq_file in self.get_fastq()
- ]
-
+ return [get_sequences_and_scores(fastq_file) for fastq_file in self.get_fastq()]
@classmethod
- def fetch(cls, uid, output_path_prefix=None, prefetch_path="prefetch",
- fasterq_dump_path="fasterq-dump", offset="Sanger"):
+ def fetch(
+ cls,
+ uid,
+ output_path_prefix=None,
+ prefetch_path="prefetch",
+ fasterq_dump_path="fasterq-dump",
+ offset="Sanger",
+ ):
"""
Get the sequences belonging to the UID from the
*NCBI sequence read archive* (SRA).
@@ -338,9 +336,7 @@ def fetch(cls, uid, output_path_prefix=None, prefetch_path="prefetch",
Each item in the list is a dictionary mapping identifiers to its
corresponding sequence.
"""
- app = cls(
- uid, output_path_prefix, prefetch_path, fasterq_dump_path, offset
- )
+ app = cls(uid, output_path_prefix, prefetch_path, fasterq_dump_path, offset)
app.start()
app.join()
return app.get_sequences()
@@ -368,14 +364,16 @@ class FastaDumpApp(_DumpApp):
respectively.
"""
- def __init__(self, uid, output_path_prefix=None, prefetch_path="prefetch",
- fasterq_dump_path="fasterq-dump"):
- super().__init__(
- uid, output_path_prefix, prefetch_path, fasterq_dump_path
- )
+ def __init__(
+ self,
+ uid,
+ output_path_prefix=None,
+ prefetch_path="prefetch",
+ fasterq_dump_path="fasterq-dump",
+ ):
+ super().__init__(uid, output_path_prefix, prefetch_path, fasterq_dump_path)
self._fasta_files = None
-
@requires_state(AppState.CREATED)
def get_prefetch_options(self):
return
@@ -383,12 +381,10 @@ def get_prefetch_options(self):
# when https://github.com/ncbi/sra-tools/issues/883 is resolved
# return "--eliminate-quals"
-
@requires_state(AppState.CREATED)
def get_fastq_dump_options(self):
return "--fasta"
-
@requires_state(AppState.JOINED)
def get_fasta(self):
"""
@@ -404,20 +400,22 @@ def get_fasta(self):
"""
if self._fasta_files is None:
self._fasta_files = [
- FastaFile.read(file_name)
- for file_name in self.get_file_paths()
+ FastaFile.read(file_name) for file_name in self.get_file_paths()
]
return self._fasta_files
-
@requires_state(AppState.JOINED)
def get_sequences(self):
return [get_sequences(fasta_file) for fasta_file in self.get_fasta()]
-
@classmethod
- def fetch(cls, uid, output_path_prefix=None, prefetch_path="prefetch",
- fasterq_dump_path="fasterq-dump"):
+ def fetch(
+ cls,
+ uid,
+ output_path_prefix=None,
+ prefetch_path="prefetch",
+ fasterq_dump_path="fasterq-dump",
+ ):
"""
Get the sequences belonging to the UID from the
*NCBI sequence read archive* (SRA).
@@ -448,9 +446,7 @@ def fetch(cls, uid, output_path_prefix=None, prefetch_path="prefetch",
Each item in the list is a dictionary mapping identifiers to its
corresponding sequence.
"""
- app = cls(
- uid, output_path_prefix, prefetch_path, fasterq_dump_path
- )
+ app = cls(uid, output_path_prefix, prefetch_path, fasterq_dump_path)
app.start()
app.join()
- return app.get_sequences()
\ No newline at end of file
+ return app.get_sequences()
diff --git a/src/biotite/application/tantan/__init__.py b/src/biotite/application/tantan/__init__.py
index 6efc86610..7a829420a 100644
--- a/src/biotite/application/tantan/__init__.py
+++ b/src/biotite/application/tantan/__init__.py
@@ -9,4 +9,4 @@
__name__ = "biotite.application.tantan"
__author__ = "Patrick Kunzmann"
-from .app import *
\ No newline at end of file
+from .app import *
diff --git a/src/biotite/application/tantan/app.py b/src/biotite/application/tantan/app.py
index 077a5cbdd..6d7020569 100644
--- a/src/biotite/application/tantan/app.py
+++ b/src/biotite/application/tantan/app.py
@@ -6,17 +6,15 @@
__author__ = "Patrick Kunzmann"
__all__ = ["TantanApp"]
-from collections.abc import Sequence as SequenceABC
import io
+from collections.abc import Sequence as SequenceABC
from tempfile import NamedTemporaryFile
import numpy as np
-from ..localapp import LocalApp, cleanup_tempfile
-from ..application import AppState, requires_state
-from ...sequence.seqtypes import NucleotideSequence, ProteinSequence
-from ...sequence.alphabet import common_alphabet
-from ...sequence.io.fasta.file import FastaFile
-from ..util import map_sequence, map_matrix
-
+from biotite.application.application import AppState, requires_state
+from biotite.application.localapp import LocalApp, cleanup_tempfile
+from biotite.sequence.alphabet import common_alphabet
+from biotite.sequence.io.fasta.file import FastaFile
+from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence
MASKING_LETTER = "!"
@@ -43,7 +41,7 @@ class TantanApp(LocalApp):
References
----------
-
+
.. footbibliography::
Examples
@@ -59,10 +57,10 @@ class TantanApp(LocalApp):
True True True True True True True True False False False False
False]
>>> print(sequence, "\n" + "".join(["^" if e else " " for e in repeat_mask]))
- GGCATCGATATATATATATAGTCAA
- ^^^^^^^^^^^
+ GGCATCGATATATATATATAGTCAA
+ ^^^^^^^^^^^
"""
-
+
def __init__(self, sequence, matrix=None, bin_path="tantan"):
super().__init__(bin_path)
@@ -93,59 +91,43 @@ def __init__(self, sequence, matrix=None, bin_path="tantan"):
)
self._is_protein = True
else:
- raise TypeError(
- "A NucleotideSequence or ProteinSequence is required"
- )
-
+ raise TypeError("A NucleotideSequence or ProteinSequence is required")
+
if matrix is None:
self._matrix_file = None
else:
- common_alph = common_alphabet(
- (seq.alphabet for seq in self._sequences)
- )
+ common_alph = common_alphabet((seq.alphabet for seq in self._sequences))
if common_alph is None:
- raise ValueError(
- "There is no common alphabet within the sequences"
- )
+ raise ValueError("There is no common alphabet within the sequences")
if not matrix.get_alphabet1().extends(common_alph):
raise ValueError(
"The alphabet of the sequence(s) do not fit the matrix"
)
if not matrix.is_symmetric():
raise ValueError("A symmetric matrix is required")
- self._matrix_file = NamedTemporaryFile(
- "w", suffix=".mat", delete=False
- )
+ self._matrix_file = NamedTemporaryFile("w", suffix=".mat", delete=False)
self._matrix = matrix
-
- self._in_file = NamedTemporaryFile("w", suffix=".fa", delete=False)
+ self._in_file = NamedTemporaryFile("w", suffix=".fa", delete=False)
def run(self):
FastaFile.write_iter(
self._in_file,
- (
- (f"sequence_{i:d}", str(seq))
- for i, seq in enumerate(self._sequences)
- )
+ ((f"sequence_{i:d}", str(seq)) for i, seq in enumerate(self._sequences)),
)
self._in_file.flush()
if self._matrix is not None:
self._matrix_file.write(str(self._matrix))
self._matrix_file.flush()
-
+
args = []
if self._matrix is not None:
args += ["-m", self._matrix_file.name]
if self._is_protein:
- args += ["-p"]
- args += [
- "-x", MASKING_LETTER,
- self._in_file.name
- ]
+ args += ["-p"]
+ args += ["-x", MASKING_LETTER, self._in_file.name]
self.set_arguments(args)
super().run()
-
def evaluate(self):
super().evaluate()
@@ -154,18 +136,14 @@ def evaluate(self):
self._masks = []
encoded_masking_letter = MASKING_LETTER.encode("ASCII")[0]
for _, masked_seq_string in FastaFile.read_iter(out_file):
- array = np.frombuffer(
- masked_seq_string.encode("ASCII"), dtype=np.ubyte
- )
+ array = np.frombuffer(masked_seq_string.encode("ASCII"), dtype=np.ubyte)
self._masks.append(array == encoded_masking_letter)
-
def clean_up(self):
super().clean_up()
cleanup_tempfile(self._in_file)
if self._matrix_file is not None:
cleanup_tempfile(self._matrix_file)
-
@requires_state(AppState.JOINED)
def get_mask(self):
@@ -186,7 +164,6 @@ def get_mask(self):
else:
return self._masks[0]
-
@staticmethod
def mask_repeats(sequence, matrix=None, bin_path="tantan"):
"""
@@ -219,4 +196,4 @@ def mask_repeats(sequence, matrix=None, bin_path="tantan"):
app = TantanApp(sequence, matrix, bin_path)
app.start()
app.join()
- return app.get_mask()
\ No newline at end of file
+ return app.get_mask()
diff --git a/src/biotite/application/util.py b/src/biotite/application/util.py
index ce544c417..4da2a342f 100644
--- a/src/biotite/application/util.py
+++ b/src/biotite/application/util.py
@@ -8,15 +8,15 @@
import numpy as np
-from ..sequence.seqtypes import ProteinSequence
-from ..sequence.align.matrix import SubstitutionMatrix
+from biotite.sequence.align.matrix import SubstitutionMatrix
+from biotite.sequence.seqtypes import ProteinSequence
def map_sequence(sequence):
"""
Map a sequence with an arbitrary alphabet into a
:class:`ProteinSequence`, in order to support arbitrary sequence
- types in software that can handle protein sequences.
+ types in software that can handle protein sequences.
"""
if len(sequence.alphabet) > len(ProteinSequence.alphabet):
# Cannot map into a protein sequence if the alphabet
@@ -39,12 +39,11 @@ def map_matrix(matrix):
Map a :class:`SubstitutionMatrix` with an arbitrary alphabet into a
:class:`SubstitutionMatrix` for protein sequences, in order to support
arbitrary sequence types in software that can handle protein
- sequences.
+ sequences.
"""
if matrix is None:
raise TypeError(
- "A substitution matrix must be provided for custom "
- "sequence types"
+ "A substitution matrix must be provided for custom " "sequence types"
)
# Create a protein substitution matrix with the values taken
# from the original matrix
@@ -54,6 +53,5 @@ def map_matrix(matrix):
new_score_matrix = np.zeros((new_length, new_length))
new_score_matrix[:old_length, :old_length] = matrix.score_matrix()
return SubstitutionMatrix(
- ProteinSequence.alphabet, ProteinSequence.alphabet,
- new_score_matrix
- )
\ No newline at end of file
+ ProteinSequence.alphabet, ProteinSequence.alphabet, new_score_matrix
+ )
diff --git a/src/biotite/application/viennarna/rnaalifold.py b/src/biotite/application/viennarna/rnaalifold.py
index aadc61b97..4604780aa 100644
--- a/src/biotite/application/viennarna/rnaalifold.py
+++ b/src/biotite/application/viennarna/rnaalifold.py
@@ -9,12 +9,12 @@
import copy
from tempfile import NamedTemporaryFile
import numpy as np
-from ..application import AppState, requires_state
-from ..localapp import LocalApp, cleanup_tempfile
-from ...sequence.io.fasta import FastaFile, set_alignment
-from ...structure.dotbracket import base_pairs_from_dot_bracket
-from ...structure.bonds import BondList
-from .util import build_constraint_string
+from biotite.application.application import AppState, requires_state
+from biotite.application.localapp import LocalApp, cleanup_tempfile
+from biotite.application.viennarna.util import build_constraint_string
+from biotite.sequence.io.fasta import FastaFile, set_alignment
+from biotite.structure.bonds import BondList
+from biotite.structure.dotbracket import base_pairs_from_dot_bracket
class RNAalifoldApp(LocalApp):
@@ -45,9 +45,7 @@ def __init__(self, alignment, temperature=37, bin_path="RNAalifold"):
self._temperature = str(temperature)
self._constraints = None
self._enforce = None
- self._in_file = NamedTemporaryFile(
- "w", suffix=".fa", delete=False
- )
+ self._in_file = NamedTemporaryFile("w", suffix=".fa", delete=False)
self._constraints_file = NamedTemporaryFile(
"w+", suffix=".constraints", delete=False
)
@@ -57,15 +55,17 @@ def run(self):
# -> Extremely high value for characters per line
fasta_file = FastaFile(chars_per_line=np.iinfo(np.int32).max)
set_alignment(
- fasta_file, self._alignment,
- seq_names=[str(i) for i in range(len(self._alignment.sequences))]
+ fasta_file,
+ self._alignment,
+ seq_names=[str(i) for i in range(len(self._alignment.sequences))],
)
fasta_file.write(self._in_file)
self._in_file.flush()
options = [
"--noPS",
- "-T", self._temperature,
+ "-T",
+ self._temperature,
]
if self._enforce is True:
options.append("--enforceConstraint")
@@ -78,7 +78,7 @@ def run(self):
self.set_arguments(options + [self._in_file.name])
super().run()
-
+
def clean_up(self):
super().clean_up()
cleanup_tempfile(self._in_file)
@@ -97,7 +97,7 @@ def evaluate(self):
self._free_energy = float(energy_contributions[0])
self._covariance_energy = float(energy_contributions[1])
self._dotbracket = dotbracket
-
+
@requires_state(AppState.CREATED)
def set_temperature(self, temperature):
"""
@@ -110,10 +110,17 @@ def set_temperature(self, temperature):
The temperature.
"""
self._temperature = str(temperature)
-
+
@requires_state(AppState.CREATED)
- def set_constraints(self, pairs=None, paired=None, unpaired=None,
- downstream=None, upstream=None, enforce=False):
+ def set_constraints(
+ self,
+ pairs=None,
+ paired=None,
+ unpaired=None,
+ downstream=None,
+ upstream=None,
+ enforce=False,
+ ):
"""
Add constraints of known paired or unpaired bases to the folding
algorithm.
@@ -138,15 +145,14 @@ def set_constraints(self, pairs=None, paired=None, unpaired=None,
the respective base pairs must form.
By default (false), a constraint does only forbid formation
of a pair that would conflict with this constraint.
-
+
Warnings
--------
If a constraint is given for a gap position in the consensus sequence,
the software may find no base pairs at all.
"""
self._constraints = build_constraint_string(
- len(self._alignment),
- pairs, paired, unpaired, downstream, upstream
+ len(self._alignment), pairs, paired, unpaired, downstream, upstream
)
self._enforce = enforce
@@ -160,19 +166,19 @@ def get_free_energy(self):
-------
free_energy : float
The free energy.
-
+
Notes
-----
The total energy of the secondary structure regarding the
minimization objective is the sum of the free energy and the
covariance term.
-
+
See also
--------
get_covariance_energy
"""
return self._free_energy
-
+
@requires_state(AppState.JOINED)
def get_covariance_energy(self):
"""
@@ -183,19 +189,19 @@ def get_covariance_energy(self):
-------
covariance_energy : float
The energy of the covariance term.
-
+
Notes
-----
The total energy of the secondary structure regarding the
minimization objective is the sum of the free energy and the
covariance term.
-
+
See also
--------
get_free_energy
"""
return self._covariance_energy
-
+
@requires_state(AppState.JOINED)
def get_consensus_sequence_string(self):
"""
@@ -265,7 +271,7 @@ def get_base_pairs(self, sequence_index=None):
pair_list = pair_list[trace != -1]
# Convert back to array of base pairs,
# remove unused BondType column
- base_pairs = pair_list.as_array()[:,:2]
+ base_pairs = pair_list.as_array()[:, :2]
return base_pairs
@staticmethod
@@ -300,5 +306,5 @@ def compute_secondary_structure(alignment, bin_path="RNAalifold"):
return (
app.get_dot_bracket(),
app.get_free_energy(),
- app.get_covariance_energy()
+ app.get_covariance_energy(),
)
diff --git a/src/biotite/application/viennarna/rnafold.py b/src/biotite/application/viennarna/rnafold.py
index 38877f963..37fb0e3d7 100644
--- a/src/biotite/application/viennarna/rnafold.py
+++ b/src/biotite/application/viennarna/rnafold.py
@@ -6,14 +6,13 @@
__author__ = "Tom David Müller, Patrick Kunzmann"
__all__ = ["RNAfoldApp"]
-import warnings
from tempfile import NamedTemporaryFile
import numpy as np
-from ..application import AppState, requires_state
-from ..localapp import LocalApp, cleanup_tempfile
-from ...sequence.io.fasta import FastaFile, set_sequence
-from ...structure.dotbracket import base_pairs_from_dot_bracket
-from .util import build_constraint_string
+from biotite.application.application import AppState, requires_state
+from biotite.application.localapp import LocalApp, cleanup_tempfile
+from biotite.application.viennarna.util import build_constraint_string
+from biotite.sequence.io.fasta import FastaFile, set_sequence
+from biotite.structure.dotbracket import base_pairs_from_dot_bracket
class RNAfoldApp(LocalApp):
@@ -51,9 +50,7 @@ def __init__(self, sequence, temperature=37, bin_path="RNAfold"):
self._temperature = str(temperature)
self._constraints = None
self._enforce = None
- self._in_file = NamedTemporaryFile(
- "w", suffix=".fa", delete=False
- )
+ self._in_file = NamedTemporaryFile("w", suffix=".fa", delete=False)
super().__init__(bin_path)
def run(self):
@@ -68,7 +65,8 @@ def run(self):
options = [
"--noPS",
- "-T", self._temperature,
+ "-T",
+ self._temperature,
]
if self._enforce is True:
options.append("--enforceConstraint")
@@ -106,8 +104,15 @@ def set_temperature(self, temperature):
self._temperature = str(temperature)
@requires_state(AppState.CREATED)
- def set_constraints(self, pairs=None, paired=None, unpaired=None,
- downstream=None, upstream=None, enforce=False):
+ def set_constraints(
+ self,
+ pairs=None,
+ paired=None,
+ unpaired=None,
+ downstream=None,
+ upstream=None,
+ enforce=False,
+ ):
"""
Add constraints of known paired or unpaired bases to the folding
algorithm.
@@ -134,8 +139,7 @@ def set_constraints(self, pairs=None, paired=None, unpaired=None,
of a pair that would conflict with this constraint.
"""
self._constraints = build_constraint_string(
- len(self._sequence),
- pairs, paired, unpaired, downstream, upstream
+ len(self._sequence), pairs, paired, unpaired, downstream, upstream
)
self._enforce = enforce
diff --git a/src/biotite/application/viennarna/rnaplot.py b/src/biotite/application/viennarna/rnaplot.py
index 7eedea7ee..1f36f9142 100644
--- a/src/biotite/application/viennarna/rnaplot.py
+++ b/src/biotite/application/viennarna/rnaplot.py
@@ -6,13 +6,14 @@
__author__ = "Tom David Müller"
__all__ = ["RNAplotApp"]
-import numpy as np
-from tempfile import NamedTemporaryFile
-from os import remove
from enum import IntEnum
-from ..localapp import LocalApp, cleanup_tempfile
-from ..application import AppState, requires_state
-from ...structure.dotbracket import dot_bracket as dot_bracket_
+from os import remove
+from tempfile import NamedTemporaryFile
+import numpy as np
+from biotite.application.application import AppState, requires_state
+from biotite.application.localapp import LocalApp, cleanup_tempfile
+from biotite.structure.dotbracket import dot_bracket as dot_bracket_
+
class RNAplotApp(LocalApp):
"""
@@ -60,21 +61,28 @@ class Layout(IntEnum):
This enum type represents the layout type of the plot according
to the official *RNAplot* orientation.
"""
- RADIAL = 0,
- NAVIEW = 1,
- CIRCULAR = 2,
- RNATURTLE = 3,
+
+ RADIAL = (0,)
+ NAVIEW = (1,)
+ CIRCULAR = (2,)
+ RNATURTLE = (3,)
RNAPUZZLER = 4
- def __init__(self, dot_bracket=None, base_pairs=None, length=None,
- layout_type=Layout.NAVIEW, bin_path="RNAplot"):
+ def __init__(
+ self,
+ dot_bracket=None,
+ base_pairs=None,
+ length=None,
+ layout_type=Layout.NAVIEW,
+ bin_path="RNAplot",
+ ):
super().__init__(bin_path)
if dot_bracket is not None:
self._dot_bracket = dot_bracket
elif (base_pairs is not None) and (length is not None):
self._dot_bracket = dot_bracket_(
- base_pairs, length, max_pseudoknot_order = 0
+ base_pairs, length, max_pseudoknot_order=0
)[0]
else:
raise ValueError(
@@ -84,10 +92,10 @@ def __init__(self, dot_bracket=None, base_pairs=None, length=None,
# Get the value of the enum type
self._layout_type = str(int(layout_type))
- self._in_file = NamedTemporaryFile("w", suffix=".fold", delete=False)
+ self._in_file = NamedTemporaryFile("w", suffix=".fold", delete=False)
def run(self):
- self._in_file.write("N"*len(self._dot_bracket) + "\n")
+ self._in_file.write("N" * len(self._dot_bracket) + "\n")
self._in_file.write(self._dot_bracket)
self._in_file.flush()
self.set_arguments(
@@ -146,8 +154,11 @@ def get_coordinates(self):
@staticmethod
def compute_coordinates(
- dot_bracket=None, base_pairs=None, length=None,
- layout_type=Layout.NAVIEW, bin_path="RNAplot"
+ dot_bracket=None,
+ base_pairs=None,
+ length=None,
+ layout_type=Layout.NAVIEW,
+ bin_path="RNAplot",
):
"""
Get coordinates for a 2D representation of any unknotted RNA
@@ -179,9 +190,13 @@ def compute_coordinates(
The 2D coordinates. Each row represents the *x* and *y*
coordinates for a total sequence length of *n*.
"""
- app = RNAplotApp(dot_bracket=dot_bracket, base_pairs=base_pairs,
- length=length, layout_type=layout_type,
- bin_path=bin_path)
+ app = RNAplotApp(
+ dot_bracket=dot_bracket,
+ base_pairs=base_pairs,
+ length=length,
+ layout_type=layout_type,
+ bin_path=bin_path,
+ )
app.start()
app.join()
- return app.get_coordinates()
\ No newline at end of file
+ return app.get_coordinates()
diff --git a/src/biotite/application/viennarna/util.py b/src/biotite/application/viennarna/util.py
index df6149a2b..90bcd6c4e 100644
--- a/src/biotite/application/viennarna/util.py
+++ b/src/biotite/application/viennarna/util.py
@@ -7,12 +7,17 @@
__all__ = ["build_constraint_string"]
import numpy as np
-from ...structure.pseudoknots import pseudoknots
+from biotite.structure.pseudoknots import pseudoknots
-def build_constraint_string(sequence_length,
- pairs=None, paired=None, unpaired=None,
- downstream=None, upstream=None):
+def build_constraint_string(
+ sequence_length,
+ pairs=None,
+ paired=None,
+ unpaired=None,
+ downstream=None,
+ upstream=None,
+):
"""
Build a ViennaRNA constraint string.
@@ -30,7 +35,7 @@ def build_constraint_string(sequence_length,
Positions of bases that are paired with any downstream base.
upstream : ndarray, shape=(n,), dtype=int or dtype=bool, optional
Positions of bases that are paired with any upstream base.
-
+
Returns
-------
constraints : str
@@ -45,21 +50,21 @@ def build_constraint_string(sequence_length,
raise ValueError("Given pairs include pseudoknots")
# Ensure the lower base comes first for each pair
pairs = np.sort(pairs, axis=-1)
- _set_constraints(constraints, pairs[:,0], "(")
- _set_constraints(constraints, pairs[:,1], ")")
+ _set_constraints(constraints, pairs[:, 0], "(")
+ _set_constraints(constraints, pairs[:, 1], ")")
_set_constraints(constraints, paired, "|")
_set_constraints(constraints, unpaired, "x")
_set_constraints(constraints, downstream, "<")
_set_constraints(constraints, upstream, ">")
-
+
return "".join(constraints)
-
+
def _set_constraints(constraints, index, character):
if index is None:
return
-
+
# Search for conflicts with other constraints
potential_conflict_indices = np.where(constraints[index] != ".")[0]
if len(potential_conflict_indices) > 0:
@@ -68,5 +73,5 @@ def _set_constraints(constraints, index, character):
f"Constraint '{character}' at position {conflict_i} "
f"conflicts with existing constraint '{constraints[conflict_i]}'"
)
-
- constraints[index] = character
\ No newline at end of file
+
+ constraints[index] = character
diff --git a/src/biotite/application/webapp.py b/src/biotite/application/webapp.py
index afeaaddaf..6e76eb1cd 100644
--- a/src/biotite/application/webapp.py
+++ b/src/biotite/application/webapp.py
@@ -7,22 +7,22 @@
__all__ = ["WebApp", "RuleViolationError"]
import abc
-from .application import Application
+from biotite.application.application import Application
class WebApp(Application, metaclass=abc.ABCMeta):
"""
The base class for all web based applications.
-
+
It allows for getting and setting the URL of the app and raises
an :class:`RuleViolationError` when a subclass calls
:func:`violate_rule()`
(e.g. when the server was contacted too often.)
-
+
Be careful, when calling func:`get_app_state()`. This may involve a
server contact and therefore frequent calls may raise a
:class:`RuleViolationError`.
-
+
Parameters
----------
app_url : str
@@ -31,19 +31,19 @@ class WebApp(Application, metaclass=abc.ABCMeta):
If true, the application raises an :class:`RuleViolationError`, if
the server rules are violated. (Default: True)
"""
-
+
def __init__(self, app_url, obey_rules=True):
super().__init__()
self._obey_rules = obey_rules
self._app_url = app_url
-
+
def violate_rule(self, msg=None):
"""
Indicate that a server rule was violated, i.e. this raises a
:class:`RuleViolationError` unless `obey_rules` is false.
-
+
PROTECTED: Do not call from outside.
-
+
Parameters
----------
msg : str, optional
@@ -51,16 +51,14 @@ def violate_rule(self, msg=None):
"""
if self._obey_rules:
if msg is None:
- raise RuleViolationError(
- "The user guidelines would be violated"
- )
+ raise RuleViolationError("The user guidelines would be violated")
else:
raise RuleViolationError(msg)
-
+
def app_url(self):
"""
Get the URL of the web app.
-
+
Returns
-------
url : str
@@ -74,4 +72,5 @@ class RuleViolationError(Exception):
Indicates that the user guidelines of the web application would be
violated, if the program continued.
"""
- pass
\ No newline at end of file
+
+ pass
diff --git a/src/biotite/copyable.py b/src/biotite/copyable.py
index d9c389b63..30d8a85d5 100644
--- a/src/biotite/copyable.py
+++ b/src/biotite/copyable.py
@@ -12,22 +12,22 @@
class Copyable(metaclass=abc.ABCMeta):
"""
Base class for all objects, that should be copyable.
-
+
The public method `copy()` first creates a fresh instance of the
class of the instance, that is copied via the `__copy_create__()`
method. All variables, that could not be set via the constructor,
are then copied via `__copy_fill__()`, starting with the method in
the uppermost base class and ending with the class of the instance
to be copied.
-
+
This approach solves the problem of encapsulated variables in
superclasses.
"""
-
+
def copy(self):
"""
Create a deep copy of this object.
-
+
Returns
-------
copy
@@ -36,36 +36,36 @@ def copy(self):
clone = self.__copy_create__()
self.__copy_fill__(clone)
return clone
-
+
def __copy_create__(self):
"""
Instantiate a new object of this class.
-
+
Only the constructor should be called in this method.
All further attributes, that need to be copied are handled
in `__copy_fill__()`
-
+
Do not call the `super()` method here.
-
+
This method must be overridden, if the constructor takes
parameters.
-
+
Returns
-------
copy
A freshly instantiated copy of *self*.
"""
return type(self)()
-
+
def __copy_fill__(self, clone):
"""
Copy all necessary attributes to the new object.
-
+
Always call the `super()` method as first statement.
-
+
Parameters
----------
clone
The freshly instantiated copy of *self*.
"""
- pass
\ No newline at end of file
+ pass
diff --git a/src/biotite/database/__init__.py b/src/biotite/database/__init__.py
index 36c544065..d4b733cb8 100644
--- a/src/biotite/database/__init__.py
+++ b/src/biotite/database/__init__.py
@@ -20,4 +20,4 @@
__name__ = "biotite.database"
__author__ = "Patrick Kunzmann"
-from .error import *
\ No newline at end of file
+from .error import *
diff --git a/src/biotite/database/entrez/__init__.py b/src/biotite/database/entrez/__init__.py
index 2b5488ce4..a27d11338 100644
--- a/src/biotite/database/entrez/__init__.py
+++ b/src/biotite/database/entrez/__init__.py
@@ -11,5 +11,5 @@
from .dbnames import *
from .download import *
+from .key import *
from .query import *
-from .key import *
\ No newline at end of file
diff --git a/src/biotite/database/entrez/check.py b/src/biotite/database/entrez/check.py
index 52bcd3fdc..a9e2db5e9 100644
--- a/src/biotite/database/entrez/check.py
+++ b/src/biotite/database/entrez/check.py
@@ -7,8 +7,7 @@
__all__ = ["check_for_errors"]
import json
-from ..error import RequestError
-
+from biotite.database.error import RequestError
# Taken from https://github.com/kblin/ncbi-entrez-error-messages
_error_messages = [
@@ -58,4 +57,4 @@ def check_for_errors(message):
for error_msg in _error_messages:
# Often whitespace is also replaced by '+' in error message
if error_msg.replace(" ", "") in message_end:
- raise RequestError(error_msg)
\ No newline at end of file
+ raise RequestError(error_msg)
diff --git a/src/biotite/database/entrez/dbnames.py b/src/biotite/database/entrez/dbnames.py
index dfa0a8e0a..e17796648 100644
--- a/src/biotite/database/entrez/dbnames.py
+++ b/src/biotite/database/entrez/dbnames.py
@@ -7,6 +7,7 @@
__all__ = ["get_database_name"]
+# fmt: off
_db_names = {
"BioProject" : "bioproject",
"BioSample" : "biosample",
@@ -45,26 +46,27 @@
"UniGene" : "unigene",
"UniSTS" : "unists"
}
+# fmt: on
def get_database_name(database):
"""
Map a common NCBI Entrez database name to an E-utility database
name.
-
+
Parameters
----------
database : str
Entrez database name.
-
+
Returns
-------
name : str
E-utility database name.
-
+
Examples
--------
-
+
>>> print(get_database_name("Nucleotide"))
nuccore
"""
@@ -86,4 +88,4 @@ def sanitize_database_name(db_name):
# Is already E-utility database name
return db_name
else:
- raise ValueError("Database '{db_name}' is not existing")
\ No newline at end of file
+ raise ValueError(f"Database '{db_name}' is not existing")
diff --git a/src/biotite/database/entrez/download.py b/src/biotite/database/entrez/download.py
index d30ac41ea..2c2438d8e 100644
--- a/src/biotite/database/entrez/download.py
+++ b/src/biotite/database/entrez/download.py
@@ -6,22 +6,28 @@
__author__ = "Patrick Kunzmann"
__all__ = ["fetch", "fetch_single_file"]
-from os.path import isdir, isfile, join, getsize
-import os
-import glob
import io
+import os
+from os.path import getsize, isdir, isfile, join
import requests
-from .check import check_for_errors
-from .dbnames import sanitize_database_name
-from .key import get_api_key
-from ..error import RequestError
-
+from biotite.database.entrez.check import check_for_errors
+from biotite.database.entrez.dbnames import sanitize_database_name
+from biotite.database.entrez.key import get_api_key
+from biotite.database.error import RequestError
_fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
-def fetch(uids, target_path, suffix, db_name, ret_type,
- ret_mode="text", overwrite=False, verbose=False):
+def fetch(
+ uids,
+ target_path,
+ suffix,
+ db_name,
+ ret_type,
+ ret_mode="text",
+ overwrite=False,
+ verbose=False,
+):
"""
Download files from the NCBI Entrez database in various formats.
@@ -111,31 +117,28 @@ def fetch(uids, target_path, suffix, db_name, ret_type,
file = join(target_path, id + "." + suffix)
else:
file = None
- if file is None \
- or not isfile(file) \
- or getsize(file) == 0 \
- or overwrite:
- param_dict = {
- "db" : sanitize_database_name(db_name),
- "id" : id,
- "rettype" : ret_type,
- "retmode" : ret_mode,
- "tool" : "Biotite",
- "mail" : "padix.key@gmail.com"
- }
- api_key = get_api_key()
- if api_key is not None:
- param_dict["api_key"] = api_key
- r = requests.get(_fetch_url, params=param_dict)
- content = r.text
- check_for_errors(content)
- if content.startswith(" Error"):
- raise RequestError(content[8:])
- if file is None:
- file = io.StringIO(content)
- else:
- with open(file, "w+") as f:
- f.write(content)
+ if file is None or not isfile(file) or getsize(file) == 0 or overwrite:
+ param_dict = {
+ "db": sanitize_database_name(db_name),
+ "id": id,
+ "rettype": ret_type,
+ "retmode": ret_mode,
+ "tool": "Biotite",
+ "mail": "padix.key@gmail.com",
+ }
+ api_key = get_api_key()
+ if api_key is not None:
+ param_dict["api_key"] = api_key
+ r = requests.get(_fetch_url, params=param_dict)
+ content = r.text
+ check_for_errors(content)
+ if content.startswith(" Error"):
+ raise RequestError(content[8:])
+ if file is None:
+ file = io.StringIO(content)
+ else:
+ with open(file, "w+") as f:
+ f.write(content)
files.append(file)
if verbose:
print("\nDone")
@@ -146,8 +149,9 @@ def fetch(uids, target_path, suffix, db_name, ret_type,
return files
-def fetch_single_file(uids, file_name, db_name, ret_type, ret_mode="text",
- overwrite=False):
+def fetch_single_file(
+ uids, file_name, db_name, ret_type, ret_mode="text", overwrite=False
+):
"""
Almost the same as :func:`fetch()`, but the data for the given UIDs
will be stored in a single file.
@@ -188,24 +192,26 @@ def fetch_single_file(uids, file_name, db_name, ret_type, ret_mode="text",
--------
fetch
"""
- if file_name is not None \
- and os.path.isfile(file_name) \
- and getsize(file_name) > 0 \
- and not overwrite:
- # Do no redownload the already existing file
- return file_name
+ if (
+ file_name is not None
+ and os.path.isfile(file_name)
+ and getsize(file_name) > 0
+ and not overwrite
+ ):
+ # Do not redownload the already existing file
+ return file_name
uid_list_str = ""
for id in uids:
uid_list_str += id + ","
# Remove terminal comma
uid_list_str = uid_list_str[:-1]
param_dict = {
- "db" : sanitize_database_name(db_name),
- "id" : uid_list_str,
- "rettype" : ret_type,
- "retmode" : ret_mode,
- "tool" : "Biotite",
- "mail" : "padix.key@gmail.com"
+ "db": sanitize_database_name(db_name),
+ "id": uid_list_str,
+ "rettype": ret_type,
+ "retmode": ret_mode,
+ "tool": "Biotite",
+ "mail": "padix.key@gmail.com",
}
api_key = get_api_key()
if api_key is not None:
diff --git a/src/biotite/database/entrez/key.py b/src/biotite/database/entrez/key.py
index 2427fd13a..83e56869c 100644
--- a/src/biotite/database/entrez/key.py
+++ b/src/biotite/database/entrez/key.py
@@ -41,4 +41,4 @@ def set_api_key(key):
The API key.
"""
global _API_KEY
- _API_KEY = key
\ No newline at end of file
+ _API_KEY = key
diff --git a/src/biotite/database/entrez/query.py b/src/biotite/database/entrez/query.py
index 1626735f6..f9b4867ea 100644
--- a/src/biotite/database/entrez/query.py
+++ b/src/biotite/database/entrez/query.py
@@ -6,22 +6,23 @@
__author__ = "Patrick Kunzmann"
__all__ = ["Query", "SimpleQuery", "CompositeQuery", "search"]
-import requests
import abc
from xml.etree import ElementTree
-from .check import check_for_errors
-from .dbnames import sanitize_database_name
-from ..error import RequestError
-from .key import get_api_key
-
+import requests
+from biotite.database.entrez.check import check_for_errors
+from biotite.database.entrez.dbnames import sanitize_database_name
+from biotite.database.entrez.key import get_api_key
+from biotite.database.error import RequestError
_search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
+
class Query(metaclass=abc.ABCMeta):
"""
Base class for a wrapper around a search term
for the NCBI Entrez search service.
"""
+
def __init__(self):
pass
@@ -85,7 +86,6 @@ def __str__(self):
return "({:}) {:} ({:})".format(str(self._q1), self._op, self._q2)
-
class SimpleQuery(Query):
"""
A simple query for the NCBI Entrez search service without
@@ -121,17 +121,59 @@ class SimpleQuery(Query):
# Field identifiers are taken from
# https://www.ncbi.nlm.nih.gov/books/NBK49540/
_fields = [
- "Accession", "All Fields", "Author", "EC/RN Number", "Feature Key",
- "Filter", "Gene Name", "Genome Project", "Issue", "Journal", "Keyword",
- "Modification Date", "Molecular Weight", "Organism", "Page Number",
- "Primary Accession", "Properties", "Protein Name", "Publication Date",
- "SeqID String", "Sequence Length", "Substance Name", "Text Word",
- "Title", "Volume",
+ "Accession",
+ "All Fields",
+ "Author",
+ "EC/RN Number",
+ "Feature Key",
+ "Filter",
+ "Gene Name",
+ "Genome Project",
+ "Issue",
+ "Journal",
+ "Keyword",
+ "Modification Date",
+ "Molecular Weight",
+ "Organism",
+ "Page Number",
+ "Primary Accession",
+ "Properties",
+ "Protein Name",
+ "Publication Date",
+ "SeqID String",
+ "Sequence Length",
+ "Substance Name",
+ "Text Word",
+ "Title",
+ "Volume",
# Abbreviations
- "ACCN", "ALL", "AU", "AUTH", "ECNO", "FKEY", "FILT", "SB", "GENE",
- "ISS", "JOUR", "KYWD", "MDAT", "MOLWT", "ORGN", "PAGE", "PACC",
- "PORGN", "PROP", "PROT", "PDAT", "SQID", "SLEN", "SUBS", "WORD", "TI",
- "TITL" "VOL"
+ "ACCN",
+ "ALL",
+ "AU",
+ "AUTH",
+ "ECNO",
+ "FKEY",
+ "FILT",
+ "SB",
+ "GENE",
+ "ISS",
+ "JOUR",
+ "KYWD",
+ "MDAT",
+ "MOLWT",
+ "ORGN",
+ "PAGE",
+ "PACC",
+ "PORGN",
+ "PROP",
+ "PROT",
+ "PDAT",
+ "SQID",
+ "SLEN",
+ "SUBS",
+ "WORD",
+ "TI",
+ "TITL", "VOL",
]
def __init__(self, term, field=None):
@@ -139,12 +181,9 @@ def __init__(self, term, field=None):
if field is not None:
if field not in SimpleQuery._fields:
raise ValueError(f"Unknown field identifier '{field}'")
- for invalid_string in \
- ['"', "AND", "OR", "NOT", "[", "]", "(", ")", "\t", "\n"]:
- if invalid_string in term:
- raise ValueError(
- f"Query contains illegal term {invalid_string}"
- )
+ for invalid_string in ['"', "AND", "OR", "NOT", "[", "]", "(", ")", "\t", "\n"]:
+ if invalid_string in term:
+ raise ValueError(f"Query contains illegal term {invalid_string}")
if " " in term:
# Encapsulate in quotes if spaces are in search term
term = f'"{term}"'
diff --git a/src/biotite/database/error.py b/src/biotite/database/error.py
index 577e6ce73..271aa37e0 100644
--- a/src/biotite/database/error.py
+++ b/src/biotite/database/error.py
@@ -12,4 +12,5 @@ class RequestError(Exception):
Indicates that the database returned a response with an error
message or other malformed content.
"""
- pass
\ No newline at end of file
+
+ pass
diff --git a/src/biotite/database/pubchem/__init__.py b/src/biotite/database/pubchem/__init__.py
index 73c3a296d..30c4813bb 100644
--- a/src/biotite/database/pubchem/__init__.py
+++ b/src/biotite/database/pubchem/__init__.py
@@ -18,4 +18,4 @@
from .download import *
from .query import *
-from .throttle import *
\ No newline at end of file
+from .throttle import *
diff --git a/src/biotite/database/pubchem/download.py b/src/biotite/database/pubchem/download.py
index e7f1c22ed..85fa09e9e 100644
--- a/src/biotite/database/pubchem/download.py
+++ b/src/biotite/database/pubchem/download.py
@@ -6,24 +6,29 @@
__author__ = "Patrick Kunzmann"
__all__ = ["fetch", "fetch_property"]
+import io
import numbers
-import requests
-from os.path import isdir, isfile, join, getsize
import os
-import io
-import numpy as np
-from .throttle import ThrottleStatus
-from .error import parse_error_details
-from ..error import RequestError
-
+from os.path import getsize, isdir, isfile, join
+import requests
+from biotite.database.error import RequestError
+from biotite.database.pubchem.error import parse_error_details
+from biotite.database.pubchem.throttle import ThrottleStatus
_base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/"
_binary_formats = ["png", "asnb"]
-def fetch(cids, format="sdf", target_path=None, as_structural_formula=False,
- overwrite=False, verbose=False,
- throttle_threshold=0.5, return_throttle_status=False):
+def fetch(
+ cids,
+ format="sdf",
+ target_path=None,
+ as_structural_formula=False,
+ overwrite=False,
+ verbose=False,
+ throttle_threshold=0.5,
+ return_throttle_status=False,
+):
"""
Download structure files from *PubChem* in various formats.
@@ -109,8 +114,7 @@ def fetch(cids, format="sdf", target_path=None, as_structural_formula=False,
raise TypeError("CIDs must be given as integers, not as string")
# Verbose output
if verbose:
- print(f"Fetching file {i+1:d} / {len(cids):d} ({cid})...",
- end="\r")
+ print(f"Fetching file {i+1:d} / {len(cids):d} ({cid})...", end="\r")
# Fetch file from database
if target_path is not None:
@@ -119,36 +123,33 @@ def fetch(cids, format="sdf", target_path=None, as_structural_formula=False,
# 'file = None' -> store content in a file-like object
file = None
- if file is None \
- or not isfile(file) \
- or getsize(file) == 0 \
- or overwrite:
- record_type = "2d" if as_structural_formula else "3d"
- r = requests.get(
- _base_url + f"compound/cid/{cid}/{format.upper()}",
- params={"record_type": record_type}
- )
- if not r.ok:
- raise RequestError(parse_error_details(r.text))
+ if file is None or not isfile(file) or getsize(file) == 0 or overwrite:
+ record_type = "2d" if as_structural_formula else "3d"
+ r = requests.get(
+ _base_url + f"compound/cid/{cid}/{format.upper()}",
+ params={"record_type": record_type},
+ )
+ if not r.ok:
+ raise RequestError(parse_error_details(r.text))
- if format.lower() in _binary_formats:
- content = r.content
- else:
- content = r.text
+ if format.lower() in _binary_formats:
+ content = r.content
+ else:
+ content = r.text
- if file is None:
- if format in _binary_formats:
- file = io.BytesIO(content)
- else:
- file = io.StringIO(content)
+ if file is None:
+ if format in _binary_formats:
+ file = io.BytesIO(content)
else:
- mode = "wb+" if format in _binary_formats else "w+"
- with open(file, mode) as f:
- f.write(content)
+ file = io.StringIO(content)
+ else:
+ mode = "wb+" if format in _binary_formats else "w+"
+ with open(file, mode) as f:
+ f.write(content)
- throttle_status = ThrottleStatus.from_response(r)
- if throttle_threshold is not None:
- throttle_status.wait_if_busy(throttle_threshold)
+ throttle_status = ThrottleStatus.from_response(r)
+ if throttle_threshold is not None:
+ throttle_status.wait_if_busy(throttle_threshold)
files.append(file)
if verbose:
@@ -164,8 +165,7 @@ def fetch(cids, format="sdf", target_path=None, as_structural_formula=False,
return return_value
-def fetch_property(cids, name,
- throttle_threshold=0.5, return_throttle_status=False):
+def fetch_property(cids, name, throttle_threshold=0.5, return_throttle_status=False):
"""
Download the given property for the given CID(s).
@@ -230,15 +230,13 @@ def fetch_property(cids, name,
# Property names may only contain letters and numbers
if not name.isalnum():
- raise ValueError(
- f"Property '{name}' contains invalid characters"
- )
+ raise ValueError(f"Property '{name}' contains invalid characters")
# Use TXT format instead of CSV to avoid issues with ',' characters
# within table elements
r = requests.post(
_base_url + f"compound/cid/property/{name}/TXT",
- data={"cid": ','.join([str(cid) for cid in cids])}
+ data={"cid": ",".join([str(cid) for cid in cids])},
)
if not r.ok:
raise RequestError(parse_error_details(r.text))
diff --git a/src/biotite/database/pubchem/error.py b/src/biotite/database/pubchem/error.py
index cbbdc0dcd..963fac865 100644
--- a/src/biotite/database/pubchem/error.py
+++ b/src/biotite/database/pubchem/error.py
@@ -15,6 +15,6 @@ def parse_error_details(response_text):
for message_line_indicator in ["Detail: ", "Message: "]:
for line in response_text.splitlines():
if line.startswith(message_line_indicator):
- return line[len(message_line_indicator):]
+ return line[len(message_line_indicator) :]
# No 'Detail: ...' or 'Message: ' line found
- return "Unknown error"
\ No newline at end of file
+ return "Unknown error"
diff --git a/src/biotite/database/pubchem/query.py b/src/biotite/database/pubchem/query.py
index bb6eec92d..31a030e4a 100644
--- a/src/biotite/database/pubchem/query.py
+++ b/src/biotite/database/pubchem/query.py
@@ -4,20 +4,28 @@
__name__ = "biotite.database.pubchem"
__author__ = "Patrick Kunzmann"
-__all__ = ["Query", "NameQuery", "SmilesQuery", "InchiQuery", "InchiKeyQuery",
- "FormulaQuery", "SuperstructureQuery", "SubstructureQuery",
- "SimilarityQuery", "IdentityQuery",
- "search"]
+__all__ = [
+ "Query",
+ "NameQuery",
+ "SmilesQuery",
+ "InchiQuery",
+ "InchiKeyQuery",
+ "FormulaQuery",
+ "SuperstructureQuery",
+ "SubstructureQuery",
+ "SimilarityQuery",
+ "IdentityQuery",
+ "search",
+]
-import copy
import abc
import collections
+import copy
import requests
-from .error import parse_error_details
-from .throttle import ThrottleStatus
-from ..error import RequestError
-from ...structure.io.mol.mol import MOLFile
-
+from biotite.database.error import RequestError
+from biotite.database.pubchem.error import parse_error_details
+from biotite.database.pubchem.throttle import ThrottleStatus
+from biotite.structure.io.mol.mol import MOLFile
_base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/"
@@ -258,9 +266,10 @@ def get_params(self):
# Only set maximum number, if provided by the user
# The PubChem default value for this might change over time
if self._number is not None:
- params["MaxRecords"] = self._number
+ params["MaxRecords"] = self._number
return params
+
def _format_element(element, count):
if count == 1:
return element.capitalize()
@@ -318,8 +327,8 @@ def __init__(self, **kwargs):
)
if not query_key_found:
raise TypeError(
- "Expected exactly one of 'smiles', 'smarts', 'inchi', 'sdf' "
- "or 'cid'")
+ "Expected exactly one of 'smiles', 'smarts', 'inchi', 'sdf' " "or 'cid'"
+ )
if "number" in kwargs:
self._number = kwargs["number"]
del kwargs["number"]
@@ -346,14 +355,10 @@ def from_atoms(cls, atoms, *args, **kwargs):
mol_file.set_structure(atoms)
# Every MOL string with "$$$$" is a valid SDF string
# Important: USE MS-style new lines
- return cls(
- *args,
- sdf = "\r\n".join(mol_file.lines) + "\r\n$$$$\r\n",
- **kwargs
- )
+ return cls(*args, sdf="\r\n".join(mol_file.lines) + "\r\n$$$$\r\n", **kwargs)
def get_input_url_path(self):
- input_string = f"compound/{self.search_type()}/{self._query_key}"
+ input_string = f"compound/{self.search_type()}/{self._query_key}"
if self._query_key == "cid":
# Put CID in URL and not in POST payload,
# as PubChem is confused otherwise
@@ -370,7 +375,7 @@ def get_params(self):
# Only set maximum number, if provided by the user
# The PubChem default value for this might change over time
if self._number is not None:
- params["MaxRecords"] = self._number
+ params["MaxRecords"] = self._number
for key, val in self.search_options().items():
# Convert 'snake case' Python parameters
# to 'camel case' request parameters
@@ -472,13 +477,13 @@ class SuperOrSubstructureQuery(StructureQuery, metaclass=abc.ABCMeta):
"""
_option_defaults = {
- "match_charges" : False,
- "match_tautomers" : False,
- "rings_not_embedded" : False,
- "single_double_bonds_match" : True,
- "chains_match_rings" : True,
- "strip_hydrogen" : False,
- "stereo" : "ignore",
+ "match_charges": False,
+ "match_tautomers": False,
+ "rings_not_embedded": False,
+ "single_double_bonds_match": True,
+ "chains_match_rings": True,
+ "strip_hydrogen": False,
+ "stereo": "ignore",
}
def __init__(self, **kwargs):
@@ -706,7 +711,7 @@ def search_type(self):
return f"fastsimilarity_{dim}"
def search_options(self):
- return {"threshold" : int(round(self._threshold * 100))}
+ return {"threshold": int(round(self._threshold * 100))}
class IdentityQuery(StructureQuery):
@@ -766,8 +771,6 @@ def get_params(self):
return params
-
-
def search(query, throttle_threshold=0.5, return_throttle_status=False):
"""
Get all CIDs that meet the given query requirements,
@@ -812,7 +815,7 @@ def search(query, throttle_threshold=0.5, return_throttle_status=False):
r = requests.post(
_base_url + query.get_input_url_path() + "/cids/TXT",
data=query.get_params(),
- files=files
+ files=files,
)
if not r.ok:
raise RequestError(parse_error_details(r.text))
diff --git a/src/biotite/database/pubchem/throttle.py b/src/biotite/database/pubchem/throttle.py
index 27cb09084..171c1a484 100644
--- a/src/biotite/database/pubchem/throttle.py
+++ b/src/biotite/database/pubchem/throttle.py
@@ -7,8 +7,8 @@
__all__ = ["ThrottleStatus"]
-from dataclasses import dataclass
import time
+from dataclasses import dataclass
@dataclass(frozen=True)
@@ -67,8 +67,7 @@ def from_response(response):
"""
throttle_control = response.headers["X-Throttling-Control"]
throttle_status = [
- substring.split(")")[0] for substring
- in throttle_control.split("(")[1:]
+ substring.split(")")[0] for substring in throttle_control.split("(")[1:]
]
# Remove '%' sign and convert to int
count_status, time_status, service_status = [
@@ -96,4 +95,4 @@ def wait_if_busy(self, threshold=0.5, wait_time=1.0):
threshold is exceeded.
"""
if self.count > threshold or self.time > threshold:
- time.sleep(wait_time)
\ No newline at end of file
+ time.sleep(wait_time)
diff --git a/src/biotite/database/rcsb/__init__.py b/src/biotite/database/rcsb/__init__.py
index c36dfb2b8..0e5faf41c 100644
--- a/src/biotite/database/rcsb/__init__.py
+++ b/src/biotite/database/rcsb/__init__.py
@@ -10,4 +10,4 @@
__author__ = "Patrick Kunzmann"
from .download import *
-from .query import *
\ No newline at end of file
+from .query import *
diff --git a/src/biotite/database/rcsb/download.py b/src/biotite/database/rcsb/download.py
index e24255672..230792dae 100644
--- a/src/biotite/database/rcsb/download.py
+++ b/src/biotite/database/rcsb/download.py
@@ -6,13 +6,11 @@
__author__ = "Patrick Kunzmann"
__all__ = ["fetch"]
-import requests
-from os.path import isdir, isfile, join, getsize
-import os
-import glob
import io
-from ..error import RequestError
-
+import os
+from os.path import getsize, isfile, join
+import requests
+from biotite.database.error import RequestError
_standard_url = "https://files.rcsb.org/download/"
_bcif_url = "https://models.rcsb.org/"
@@ -93,8 +91,7 @@ def fetch(pdb_ids, format, target_path=None, overwrite=False, verbose=False):
for i, id in enumerate(pdb_ids):
# Verbose output
if verbose:
- print(f"Fetching file {i+1:d} / {len(pdb_ids):d} ({id})...",
- end="\r")
+ print(f"Fetching file {i+1:d} / {len(pdb_ids):d} ({id})...", end="\r")
# Fetch file from database
if target_path is not None:
@@ -103,38 +100,35 @@ def fetch(pdb_ids, format, target_path=None, overwrite=False, verbose=False):
# 'file = None' -> store content in a file-like object
file = None
- if file is None \
- or not isfile(file) \
- or getsize(file) == 0 \
- or overwrite:
- if format == "pdb":
- r = requests.get(_standard_url + id + ".pdb")
- content = r.text
- _assert_valid_file(content, id)
- elif format in ["cif", "mmcif", "pdbx"]:
- r = requests.get(_standard_url + id + ".cif")
- content = r.text
- _assert_valid_file(content, id)
- elif format in ["bcif"]:
- r = requests.get(_bcif_url + id + ".bcif")
- content = r.content
- _assert_valid_file(r.text, id)
- elif format == "fasta":
- r = requests.get(_fasta_url + id)
- content = r.text
- _assert_valid_file(content, id)
- else:
- raise ValueError(f"Format '{format}' is not supported")
-
- if file is None:
- if format in _binary_formats:
- file = io.BytesIO(content)
- else:
- file = io.StringIO(content)
+ if file is None or not isfile(file) or getsize(file) == 0 or overwrite:
+ if format == "pdb":
+ r = requests.get(_standard_url + id + ".pdb")
+ content = r.text
+ _assert_valid_file(content, id)
+ elif format in ["cif", "mmcif", "pdbx"]:
+ r = requests.get(_standard_url + id + ".cif")
+ content = r.text
+ _assert_valid_file(content, id)
+ elif format in ["bcif"]:
+ r = requests.get(_bcif_url + id + ".bcif")
+ content = r.content
+ _assert_valid_file(r.text, id)
+ elif format == "fasta":
+ r = requests.get(_fasta_url + id)
+ content = r.text
+ _assert_valid_file(content, id)
+ else:
+ raise ValueError(f"Format '{format}' is not supported")
+
+ if file is None:
+ if format in _binary_formats:
+ file = io.BytesIO(content)
else:
- mode = "wb+" if format in _binary_formats else "w+"
- with open(file, mode) as f:
- f.write(content)
+ file = io.StringIO(content)
+ else:
+ mode = "wb+" if format in _binary_formats else "w+"
+ with open(file, mode) as f:
+ f.write(content)
files.append(file)
if verbose:
@@ -153,10 +147,13 @@ def _assert_valid_file(response_text, pdb_id):
"""
# Structure file and FASTA file retrieval
# have different error messages
- if len(response_text) == 0 or any(err_msg in response_text for err_msg in [
- "404 Not Found",
- "RCSB Protein Data Bank Error Page",
- "No fasta files were found.",
- "No valid PDB IDs were submitted.",
- ]):
+ if len(response_text) == 0 or any(
+ err_msg in response_text
+ for err_msg in [
+ "404 Not Found",
+ "RCSB Protein Data Bank Error Page",
+ "No fasta files were found.",
+ "No valid PDB IDs were submitted.",
+ ]
+ ):
raise RequestError("PDB ID {:} is invalid".format(pdb_id))
diff --git a/src/biotite/database/rcsb/query.py b/src/biotite/database/rcsb/query.py
index 7f131f3ee..95d59703e 100644
--- a/src/biotite/database/rcsb/query.py
+++ b/src/biotite/database/rcsb/query.py
@@ -4,28 +4,38 @@
__name__ = "biotite.database.rcsb"
__author__ = "Patrick Kunzmann, Maximilian Dombrowsky"
-__all__ = ["Query", "SingleQuery", "CompositeQuery",
- "BasicQuery", "FieldQuery",
- "SequenceQuery", "StructureQuery", "MotifQuery",
- "Sorting",
- "Grouping", "DepositGrouping", "IdentityGrouping", "UniprotGrouping",
- "search", "count"]
+__all__ = [
+ "Query",
+ "SingleQuery",
+ "CompositeQuery",
+ "BasicQuery",
+ "FieldQuery",
+ "SequenceQuery",
+ "StructureQuery",
+ "MotifQuery",
+ "Sorting",
+ "Grouping",
+ "DepositGrouping",
+ "IdentityGrouping",
+ "UniprotGrouping",
+ "search",
+ "count",
+]
import abc
-import json
import copy
+import json
from datetime import datetime
import numpy as np
import requests
-from ...sequence.seqtypes import NucleotideSequence
-from ..error import RequestError
-
+from biotite.database.error import RequestError
+from biotite.sequence.seqtypes import NucleotideSequence
_search_url = "https://search.rcsb.org/rcsbsearch/v2/query"
_scope_to_target = {
"protein": "pdb_protein_sequence",
- "rna": "pdb_rna_sequence",
- "dna": "pdb_dna_sequence"
+ "rna": "pdb_rna_sequence",
+ "dna": "pdb_dna_sequence",
}
@@ -35,6 +45,7 @@ class Query(metaclass=abc.ABCMeta):
This is the abstract base class for all queries.
"""
+
@abc.abstractmethod
def get_content(self):
"""
@@ -58,7 +69,6 @@ def __or__(self, query):
return CompositeQuery([self, query], "or")
-
class SingleQuery(Query, metaclass=abc.ABCMeta):
"""
A terminal query node for the RCSB search API.
@@ -69,6 +79,7 @@ class SingleQuery(Query, metaclass=abc.ABCMeta):
This is the abstract base class for all queries that are
terminal nodes.
"""
+
@abc.abstractmethod
def get_content(self):
return {"parameters": {}}
@@ -91,12 +102,11 @@ class CompositeQuery(Query):
operator : {'or', 'and'}
The type of combination.
"""
+
def __init__(self, queries, operator):
self._queries = queries
if operator not in ("or", "and"):
- raise ValueError(
- f"Operator must be 'or' or 'and', not '{operator}'"
- )
+ raise ValueError(f"Operator must be 'or' or 'and', not '{operator}'")
self._operator = operator
def get_content(self):
@@ -113,12 +123,11 @@ def get_content(self):
content = {
"type": "group",
"logical_operator": self._operator,
- "nodes": [query.get_content() for query in self._queries]
+ "nodes": [query.get_content() for query in self._queries],
}
return content
-
class BasicQuery(SingleQuery):
"""
A text query for searching for a given term across all available
@@ -141,6 +150,7 @@ class BasicQuery(SingleQuery):
>>> print(sorted(search(query)))
['1L2Y', '8ANG', '8ANH', '8ANI', '8ANM']
"""
+
def __init__(self, term):
super().__init__()
self._term = term
@@ -212,7 +222,10 @@ class FieldQuery(SingleQuery):
>>> print(sorted(search(query)))
['1EJG', '1I0T', '3NIR', '3P4J', '4JLJ', '5D8V', '5NW3', '7ATG', '7R0H']
"""
- def __init__(self, field, molecular_definition=False, case_sensitive=False, **kwargs):
+
+ def __init__(
+ self, field, molecular_definition=False, case_sensitive=False, **kwargs
+ ):
super().__init__()
self._negation = False
self._field = field
@@ -231,20 +244,25 @@ def __init__(self, field, molecular_definition=False, case_sensitive=False, **kw
if self._operator not in [
"exact_match",
- "contains_words", "contains_phrase",
- "greater", "less", "greater_or_equal", "less_or_equal", "equals",
- "range", "range_closed",
+ "contains_words",
+ "contains_phrase",
+ "greater",
+ "less",
+ "greater_or_equal",
+ "less_or_equal",
+ "equals",
+ "range",
+ "range_closed",
"is_in",
- "exists"
+ "exists",
]:
raise TypeError(
- f"Constructor got an unexpected keyword argument "
- f"'{self._operator}'"
+ f"Constructor got an unexpected keyword argument " f"'{self._operator}'"
)
# Convert dates into ISO 8601
if isinstance(self._value, datetime):
- self._value = _to_isoformat(self._value)
+ self._value = _to_isoformat(self._value)
elif isinstance(self._value, (tuple, list, np.ndarray)):
self._value = [
_to_isoformat(val) if isinstance(val, datetime) else val
@@ -257,14 +275,14 @@ def __init__(self, field, molecular_definition=False, case_sensitive=False, **kw
"from": self._value[0],
"include_lower": False,
"to": self._value[1],
- "include_upper": False
+ "include_upper": False,
}
elif self._operator == "range_closed":
self._value = {
"from": self._value[0],
"include_lower": True,
"to": self._value[1],
- "include_upper": True
+ "include_upper": True,
}
# Rename operators to names used in API
@@ -332,8 +350,8 @@ class SequenceQuery(SingleQuery):
>>> print(sorted(search(query)))
['1L2Y', '1RIJ', '2JOF', '2LDJ', '2LL5', '2MJ9', '3UC7', '3UC8']
"""
- def __init__(self, sequence, scope,
- min_identity=0.0, max_expect_value=10000000.0):
+
+ def __init__(self, sequence, scope, min_identity=0.0, max_expect_value=10000000.0):
super().__init__()
self._target = _scope_to_target.get(scope.lower())
if self._target is None:
@@ -381,6 +399,7 @@ class MotifQuery(SingleQuery):
... "protein"
... )
"""
+
def __init__(self, pattern, pattern_type, scope):
super().__init__()
self._pattern = pattern
@@ -424,27 +443,20 @@ class StructureQuery(SingleQuery):
>>> print(sorted(search(query)))
['1L2Y', '1RIJ', '2JOF', '2LDJ', '2M7D', '7MQS']
"""
+
def __init__(self, pdb_id, chain=None, assembly=None, strict=True):
super().__init__()
- if (chain is None and assembly is None) \
- or (chain is not None and assembly is not None):
- raise TypeError(
- "Either the chain ID or assembly ID must be set"
- )
+ if (chain is None and assembly is None) or (
+ chain is not None and assembly is not None
+ ):
+ raise TypeError("Either the chain ID or assembly ID must be set")
elif chain is None:
- self._value = {
- "entry_id": pdb_id,
- "asssembly_id": assembly
- }
+ self._value = {"entry_id": pdb_id, "asssembly_id": assembly}
else:
- self._value = {
- "entry_id": pdb_id,
- "asym_id": chain
- }
+ self._value = {"entry_id": pdb_id, "asym_id": chain}
- self._operator = "strict_shape_match" if strict \
- else "relaxed_shape_match"
+ self._operator = "strict_shape_match" if strict else "relaxed_shape_match"
def get_content(self):
content = super().get_content()
@@ -455,10 +467,7 @@ def get_content(self):
return content
-
-
class Sorting:
-
def __init__(self, field, descending=True):
self._field = field
self._descending = descending
@@ -487,12 +496,7 @@ def get_content(self):
``'ranking_criteria_type'`` attributes.
"""
direction = "desc" if self._descending else "asc"
- return {
- "sort_by" : self._field,
- "direction" : direction
- }
-
-
+ return {"sort_by": self._field, "direction": direction}
class Grouping(metaclass=abc.ABCMeta):
@@ -539,7 +543,7 @@ def get_content(self):
The content dictionary for the ``'group_by'`` attributes.
"""
if self._sorting is not None:
- return {"ranking_criteria_type" : self._sorting.get_content()}
+ return {"ranking_criteria_type": self._sorting.get_content()}
else:
return {}
@@ -627,6 +631,7 @@ class IdentityGrouping(Grouping):
To choose the order a :class:`Sorting` object needs to be
provided.
"""
+
def __init__(self, similarity_cutoff, sort_by=None):
super().__init__(sort_by)
if similarity_cutoff not in (100, 95, 90, 70, 50, 30):
@@ -677,11 +682,7 @@ def is_compatible_return_type(self, return_type):
return return_type == "polymer_entity"
-
-
-
-def count(query, return_type="entry", group_by=None,
- content_types=("experimental",)):
+def count(query, return_type="entry", group_by=None, content_types=("experimental",)):
"""
Count PDB entries that meet the given query requirements,
via the RCSB search API.
@@ -737,9 +738,7 @@ def count(query, return_type="entry", group_by=None,
>>> print(sorted(ids))
['1EJG', '1I0T', '3NIR', '3P4J', '4JLJ', '5D8V', '5NW3', '7ATG', '7R0H']
"""
- query_dict = _initialize_query_dict(
- query, return_type, group_by, content_types
- )
+ query_dict = _initialize_query_dict(query, return_type, group_by, content_types)
query_dict["request_options"]["return_counts"] = True
@@ -761,8 +760,15 @@ def count(query, return_type="entry", group_by=None,
raise RequestError(f"Error {r.status_code}")
-def search(query, return_type="entry", range=None, sort_by=None, group_by=None,
- return_groups=False, content_types=("experimental",)):
+def search(
+ query,
+ return_type="entry",
+ range=None,
+ sort_by=None,
+ group_by=None,
+ return_groups=False,
+ content_types=("experimental",),
+):
"""
Get all PDB IDs that meet the given query requirements,
via the RCSB search API.
@@ -864,17 +870,13 @@ def search(query, return_type="entry", range=None, sort_by=None, group_by=None,
... ))
{'P24297': ['5NW3_1'], 'P27707': ['4JLJ_1'], 'P80176': ['5D8V_1'], 'O29777': ['7R0H_1'], 'P01542': ['3NIR_1', '1EJG_1']}
"""
- query_dict = _initialize_query_dict(
- query, return_type, group_by, content_types
- )
+ query_dict = _initialize_query_dict(query, return_type, group_by, content_types)
if group_by is not None:
if return_groups:
- query_dict["request_options"]["group_by_return_type"] \
- = "groups"
+ query_dict["request_options"]["group_by_return_type"] = "groups"
else:
- query_dict["request_options"]["group_by_return_type"] \
- = "representatives"
+ query_dict["request_options"]["group_by_return_type"] = "representatives"
if sort_by is not None:
if isinstance(sort_by, Sorting):
@@ -890,7 +892,7 @@ def search(query, return_type="entry", range=None, sort_by=None, group_by=None,
else:
query_dict["request_options"]["paginate"] = {
"start": int(range[0]),
- "rows": int(range[1]) - int(range[0])
+ "rows": int(range[1]) - int(range[0]),
}
r = requests.get(_search_url, params={"json": json.dumps(query_dict)})
@@ -900,7 +902,7 @@ def search(query, return_type="entry", range=None, sort_by=None, group_by=None,
return [result["identifier"] for result in r.json()["result_set"]]
else:
return {
- group["identifier"] : [
+ group["identifier"]: [
result["identifier"] for result in group["result_set"]
]
for group in r.json()["group_set"]
@@ -922,8 +924,11 @@ def _initialize_query_dict(query, return_type, group_by, content_types):
`count()` and `search()` have in common.
"""
if return_type not in [
- "entry", "polymer_instance", "assembly",
- "polymer_entity", "non_polymer_entity",
+ "entry",
+ "polymer_instance",
+ "assembly",
+ "polymer_entity",
+ "non_polymer_entity",
]:
raise ValueError(f"'{return_type}' is an invalid return type")
@@ -947,7 +952,7 @@ def _initialize_query_dict(query, return_type, group_by, content_types):
query_dict = {
"query": query.get_content(),
"return_type": return_type,
- "request_options": request_options
+ "request_options": request_options,
}
return query_dict
@@ -956,4 +961,4 @@ def _to_isoformat(object):
"""
Convert a datetime into the specifc ISO 8601 format required by the RCSB.
"""
- return object.strftime("%Y-%m-%dT%H:%M:%SZ")
\ No newline at end of file
+ return object.strftime("%Y-%m-%dT%H:%M:%SZ")
diff --git a/src/biotite/database/uniprot/check.py b/src/biotite/database/uniprot/check.py
index 4b00845d2..a1782e1ba 100644
--- a/src/biotite/database/uniprot/check.py
+++ b/src/biotite/database/uniprot/check.py
@@ -6,7 +6,7 @@
__author__ = "Maximilian Greil"
__all__ = ["assert_valid_response"]
-from ..error import RequestError
+from biotite.database.error import RequestError
# Taken from https://www.uniprot.org/help/api_retrieve_entries
@@ -27,6 +27,9 @@ def assert_valid_response(response_status_code):
raise RequestError("Gone. The resource you requested was removed.")
elif response_status_code == 500:
raise RequestError(
- "Internal server error. Most likely a temporary problem, but if the problem persists please contact UniProt team.")
+ "Internal server error. Most likely a temporary problem, but if the problem persists please contact UniProt team."
+ )
elif response_status_code == 503:
- raise RequestError("Service not available. The server is being updated, try again later.")
+ raise RequestError(
+ "Service not available. The server is being updated, try again later."
+ )
diff --git a/src/biotite/database/uniprot/download.py b/src/biotite/database/uniprot/download.py
index 7faf37954..bacb40e96 100644
--- a/src/biotite/database/uniprot/download.py
+++ b/src/biotite/database/uniprot/download.py
@@ -6,11 +6,11 @@
__author__ = "Maximilian Greil"
__all__ = ["fetch"]
-from os.path import isdir, isfile, join, getsize
-import os
import io
+import os
+from os.path import getsize, isdir, isfile, join
import requests
-from .check import assert_valid_response
+from biotite.database.uniprot.check import assert_valid_response
_fetch_url = "https://rest.uniprot.org/"
@@ -36,8 +36,7 @@ def _get_database_name(id):
return "uniprotkb"
-def fetch(ids, format, target_path=None,
- overwrite=False, verbose=False):
+def fetch(ids, format, target_path=None, overwrite=False, verbose=False):
"""
Download files from the UniProt in various formats.
@@ -101,18 +100,14 @@ def fetch(ids, format, target_path=None,
db_name = _get_database_name(id)
# Verbose output
if verbose:
- print(f"Fetching file {i + 1:d} / {len(ids):d} ({id})...",
- end="\r")
+ print(f"Fetching file {i + 1:d} / {len(ids):d} ({id})...", end="\r")
# Fetch file from database
if target_path is not None:
file = join(target_path, id + "." + format)
else:
# 'file = None' -> store content in a file-like object
file = None
- if file is None \
- or not isfile(file) \
- or getsize(file) == 0 \
- or overwrite:
+ if file is None or not isfile(file) or getsize(file) == 0 or overwrite:
if format in ["fasta", "gff", "txt", "xml", "rdf", "tab"]:
r = requests.get(_fetch_url + db_name + "/" + id + "." + format)
content = r.text
diff --git a/src/biotite/database/uniprot/query.py b/src/biotite/database/uniprot/query.py
index 95e6f391d..687c61f5f 100644
--- a/src/biotite/database/uniprot/query.py
+++ b/src/biotite/database/uniprot/query.py
@@ -6,10 +6,9 @@
__author__ = "Maximilian Greil"
__all__ = ["Query", "SimpleQuery", "CompositeQuery", "search"]
-import requests
import abc
-from .check import assert_valid_response
-
+import requests
+from biotite.database.uniprot.check import assert_valid_response
_base_url = "https://rest.uniprot.org/uniprotkb/search/"
@@ -122,22 +121,114 @@ class SimpleQuery(Query):
# Field identifiers are taken from
# https://www.uniprot.org/help/query-fields
_fields = [
- "accession", "active", "ft_init_met", "ft_signal", "ft_transit", "ft_propep", "ft_chain", "ft_peptide",
- "ft_topo_dom", "ft_transmem", "ft_intramem", "ft_domain", "ft_repeat", "ft_zn_fing", "ft_dna_bind",
- "ft_region", "ft_coiled", "ft_motif", "ft_compbias", "ft_act_site", "ft_binding", "ft_site", "ft_non_std",
- "ft_mod_res", "ft_lipid", "ft_carbohyd", "ft_disulfid", "ft_crosslnk", "ft_var_seq", "ft_variant",
- "ft_mutagen", "ft_unsure", "ft_conflict", "ft_non_cons", "ft_non_ter", "ft_helix", "ft_turn", "ft_strand",
- "lit_author", "protein_name", "chebi", "citation", "uniref_cluster_90", "xrefcount_pdb", "date_created",
- "database", "xref", "ec", "cc_function", "cc_catalytic_activity", "cc_cofactor", "cc_activity_regulation",
- "cc_biophysicochemical_properties", "cc_subunit", "cc_pathway", "cc_scl_term", "cc_tissue_specificity",
- "cc_developmental_stage", "cc_induction", "cc_domain", "cc_ptm cc_rna_editing", "cc_mass_spectrometry",
- "cc_polymorphism", "cc_disease", "cc_disruption_phenotype", "cc_allergen", "cc_toxic_dose", "cc_biotechnology",
- "cc_pharmaceutical", "cc_miscellaneous", "cc_similarity", "cc_caution", "cc_sequence_caution",
- "existence", "family", "fragment", "gene", "gene_exact", "go", "virus_host_name", "virus_host_id",
- "accession_id", "inchikey", "protein_name", "interactor", "keyword", "length", "lineage", "mass",
- "cc_mass_spectrometry", "date_modified", "protein_name", "organelle", "organism_name", "organism_id",
- "plasmid", "proteome", "proteomecomponent", "sec_acc", "reviewed", "scope", "sequence",
- "date_sequence_modified", "strain", "taxonomy_name", "taxonomy_id", "tissue", "cc_webresource"
+ "accession",
+ "active",
+ "ft_init_met",
+ "ft_signal",
+ "ft_transit",
+ "ft_propep",
+ "ft_chain",
+ "ft_peptide",
+ "ft_topo_dom",
+ "ft_transmem",
+ "ft_intramem",
+ "ft_domain",
+ "ft_repeat",
+ "ft_zn_fing",
+ "ft_dna_bind",
+ "ft_region",
+ "ft_coiled",
+ "ft_motif",
+ "ft_compbias",
+ "ft_act_site",
+ "ft_binding",
+ "ft_site",
+ "ft_non_std",
+ "ft_mod_res",
+ "ft_lipid",
+ "ft_carbohyd",
+ "ft_disulfid",
+ "ft_crosslnk",
+ "ft_var_seq",
+ "ft_variant",
+ "ft_mutagen",
+ "ft_unsure",
+ "ft_conflict",
+ "ft_non_cons",
+ "ft_non_ter",
+ "ft_helix",
+ "ft_turn",
+ "ft_strand",
+ "lit_author",
+ "protein_name",
+ "chebi",
+ "citation",
+ "uniref_cluster_90",
+ "xrefcount_pdb",
+ "date_created",
+ "database",
+ "xref",
+ "ec",
+ "cc_function",
+ "cc_catalytic_activity",
+ "cc_cofactor",
+ "cc_activity_regulation",
+ "cc_biophysicochemical_properties",
+ "cc_subunit",
+ "cc_pathway",
+ "cc_scl_term",
+ "cc_tissue_specificity",
+ "cc_developmental_stage",
+ "cc_induction",
+ "cc_domain",
+ "cc_ptm cc_rna_editing",
+ "cc_mass_spectrometry",
+ "cc_polymorphism",
+ "cc_disease",
+ "cc_disruption_phenotype",
+ "cc_allergen",
+ "cc_toxic_dose",
+ "cc_biotechnology",
+ "cc_pharmaceutical",
+ "cc_miscellaneous",
+ "cc_similarity",
+ "cc_caution",
+ "cc_sequence_caution",
+ "existence",
+ "family",
+ "fragment",
+ "gene",
+ "gene_exact",
+ "go",
+ "virus_host_name",
+ "virus_host_id",
+ "accession_id",
+ "inchikey",
+ "protein_name",
+ "interactor",
+ "keyword",
+ "length",
+ "lineage",
+ "mass",
+ "cc_mass_spectrometry",
+ "date_modified",
+ "protein_name",
+ "organelle",
+ "organism_name",
+ "organism_id",
+ "plasmid",
+ "proteome",
+ "proteomecomponent",
+ "sec_acc",
+ "reviewed",
+ "scope",
+ "sequence",
+ "date_sequence_modified",
+ "strain",
+ "taxonomy_name",
+ "taxonomy_id",
+ "tissue",
+ "cc_webresource",
]
def __init__(self, field, term):
@@ -146,14 +237,11 @@ def __init__(self, field, term):
raise ValueError(f"Unknown field identifier '{field}'")
if not _check_brackets(term):
raise ValueError(
- f"Query term contains illegal number of round brackets ( ) and/or square brackets [ ]"
+ "Query term contains illegal number of round brackets ( ) and/or square brackets [ ]"
)
- for invalid_string in \
- ['"', "AND", "OR", "NOT", "\t", "\n"]:
+ for invalid_string in ['"', "AND", "OR", "NOT", "\t", "\n"]:
if invalid_string in term:
- raise ValueError(
- f"Query contains illegal term {invalid_string}"
- )
+ raise ValueError(f"Query contains illegal term {invalid_string}")
if " " in term:
term = f'"{term}"'
self._field = field
@@ -198,12 +286,8 @@ def search(query, number=500):
['P12345']
"""
- params = {
- 'query': str(query),
- 'format': 'list',
- 'size': str(number)
- }
+ params = {"query": str(query), "format": "list", "size": str(number)}
r = requests.get(_base_url, params=params)
content = r.text
assert_valid_response(r.status_code)
- return content.split('\n')[:-1]
+ return content.split("\n")[:-1]
diff --git a/src/biotite/file.py b/src/biotite/file.py
index fa1963b6a..ec7047db6 100644
--- a/src/biotite/file.py
+++ b/src/biotite/file.py
@@ -4,16 +4,19 @@
__name__ = "biotite"
__author__ = "Patrick Kunzmann"
-__all__ = ["File", "TextFile", "InvalidFileError",
- "SerializationError", "DeserializationError"]
+__all__ = [
+ "File",
+ "TextFile",
+ "InvalidFileError",
+ "SerializationError",
+ "DeserializationError",
+]
import abc
+import copy
import io
-import warnings
from os import PathLike
-
-from .copyable import Copyable
-import copy
+from biotite.copyable import Copyable
class File(Copyable, metaclass=abc.ABCMeta):
@@ -185,12 +188,14 @@ class InvalidFileError(Exception):
either because the file does not contain the required data or
because the file is malformed.
"""
+
pass
class SerializationError(Exception):
pass
+
class DeserializationError(Exception):
pass
@@ -205,7 +210,7 @@ def wrap_string(text, width):
"""
lines = []
for i in range(0, len(text), width):
- lines.append(text[i : i+width])
+ lines.append(text[i : i + width])
return lines
diff --git a/src/biotite/sequence/__init__.py b/src/biotite/sequence/__init__.py
index afda0ab34..005a7c88c 100644
--- a/src/biotite/sequence/__init__.py
+++ b/src/biotite/sequence/__init__.py
@@ -76,9 +76,9 @@
__author__ = "Patrick Kunzmann"
from .alphabet import *
+from .annotation import *
+from .codon import *
+from .profile import *
from .search import *
from .seqtypes import *
from .sequence import *
-from .codon import *
-from .annotation import *
-from .profile import *
diff --git a/src/biotite/sequence/align/__init__.py b/src/biotite/sequence/align/__init__.py
index d548b11a3..7e90c32ad 100644
--- a/src/biotite/sequence/align/__init__.py
+++ b/src/biotite/sequence/align/__init__.py
@@ -191,8 +191,8 @@
from .buckets import *
from .cigar import *
from .kmeralphabet import *
-from .kmertable import *
from .kmersimilarity import *
+from .kmertable import *
from .localgapped import *
from .localungapped import *
from .matrix import *
@@ -200,4 +200,4 @@
from .pairwise import *
from .permutation import *
from .selector import *
-from .statistics import *
\ No newline at end of file
+from .statistics import *
diff --git a/src/biotite/sequence/align/alignment.py b/src/biotite/sequence/align/alignment.py
index 7d97d15a8..d33e3d051 100644
--- a/src/biotite/sequence/align/alignment.py
+++ b/src/biotite/sequence/align/alignment.py
@@ -5,16 +5,22 @@
__name__ = "biotite.sequence.align"
__author__ = "Patrick Kunzmann"
-import numpy as np
import numbers
-import copy
import textwrap
-from ..alphabet import LetterAlphabet
-
+from collections.abc import Sequence
+import numpy as np
+from biotite.sequence.alphabet import LetterAlphabet
-__all__ = ["Alignment", "get_codes", "get_symbols",
- "get_sequence_identity", "get_pairwise_sequence_identity",
- "score", "find_terminal_gaps", "remove_terminal_gaps"]
+__all__ = [
+ "Alignment",
+ "get_codes",
+ "get_symbols",
+ "get_sequence_identity",
+ "get_pairwise_sequence_identity",
+ "score",
+ "find_terminal_gaps",
+ "remove_terminal_gaps",
+]
class Alignment(object):
@@ -95,8 +101,10 @@ def __init__(self, sequences, trace, score=None):
def __repr__(self):
"""Represent Alignment a string for debugging."""
- return f"Alignment([{', '.join([seq.__repr__() for seq in self.sequences])}], " \
- f"np.{np.array_repr(self.trace)}, score={self.score})"
+ return (
+ f"Alignment([{', '.join([seq.__repr__() for seq in self.sequences])}], "
+ f"np.{np.array_repr(self.trace)}, score={self.score})"
+ )
def _gapped_str(self, seq_index):
seq_str = ""
@@ -148,17 +156,18 @@ def __getitem__(self, index):
if isinstance(index, tuple):
if len(index) > 2:
raise IndexError("Only 1D or 2D indices are allowed")
- if isinstance(index[0], numbers.Integral) or \
- isinstance(index[0], numbers.Integral):
- raise IndexError(
- "Integers are invalid indices for alignments, "
- "a single sequence or alignment column cannot be "
- "selected"
- )
+ if isinstance(index[0], numbers.Integral) or isinstance(
+ index[0], numbers.Integral
+ ):
+ raise IndexError(
+ "Integers are invalid indices for alignments, "
+ "a single sequence or alignment column cannot be "
+ "selected"
+ )
return Alignment(
Alignment._index_sequences(self.sequences, index[1]),
self.trace[index],
- self.score
+ self.score,
)
else:
return Alignment(self.sequences, self.trace[index], self.score)
@@ -182,17 +191,16 @@ def __eq__(self, item):
@staticmethod
def _index_sequences(sequences, index):
- if isinstance(index, (list, tuple)) or \
- (isinstance(index, np.ndarray) and index.dtype != bool):
- return [sequences[i] for i in index]
+ if isinstance(index, (list, tuple)) or (
+ isinstance(index, np.ndarray) and index.dtype != bool
+ ):
+ return [sequences[i] for i in index]
elif isinstance(index, np.ndarray) and index.dtype == bool:
return [seq for seq, mask in zip(sequences, index) if mask]
if isinstance(index, slice):
return sequences[index]
else:
- raise IndexError(
- f"Invalid alignment index type '{type(index).__name__}'"
- )
+ raise IndexError(f"Invalid alignment index type '{type(index).__name__}'")
@staticmethod
def trace_from_strings(seq_str_list):
@@ -212,12 +220,9 @@ def trace_from_strings(seq_str_list):
The created trace.
"""
if len(seq_str_list) < 2:
- raise ValueError(
- "An alignment must contain at least two sequences"
- )
+ raise ValueError("An alignment must contain at least two sequences")
seq_i = np.zeros(len(seq_str_list))
- trace = np.full(( len(seq_str_list[0]), len(seq_str_list) ),
- -1, dtype=int)
+ trace = np.full((len(seq_str_list[0]), len(seq_str_list)), -1, dtype=int)
# Get length of string (same length for all strings)
# rather than length of list
for pos_i in range(len(seq_str_list[0])):
@@ -275,7 +280,7 @@ def get_codes(alignment):
# of the sequence code is used
# (https://numpy.org/neps/nep-0050-scalar-promotion.html)
codes[i] = np.where(
- trace[:,i] != -1, sequences[i].code[trace[:,i]], np.int64(-1)
+ trace[:, i] != -1, sequences[i].code[trace[:, i]], np.int64(-1)
)
return np.stack(codes)
@@ -366,7 +371,7 @@ def get_sequence_identity(alignment, mode="not_terminal"):
# Count matches
matches = 0
for i in range(codes.shape[1]):
- column = codes[:,i]
+ column = codes[:, i]
# One unique value -> all symbols match
unique_symbols = np.unique(column)
if len(unique_symbols) == 1 and unique_symbols[0] != -1:
@@ -430,9 +435,11 @@ def get_pairwise_sequence_identity(alignment, mode="not_terminal"):
# Count matches
# Calculate at which positions the sequences are identical
# and are not gaps
- equality_matrix = (codes[:, np.newaxis, :] == codes[np.newaxis, :, :]) \
- & (codes[:, np.newaxis, :] != -1) \
- & (codes[np.newaxis, :, :] != -1) \
+ equality_matrix = (
+ (codes[:, np.newaxis, :] == codes[np.newaxis, :, :])
+ & (codes[:, np.newaxis, :] != -1)
+ & (codes[np.newaxis, :, :] != -1)
+ )
# Sum these positions up
matches = np.count_nonzero(equality_matrix, axis=-1)
@@ -444,21 +451,20 @@ def get_pairwise_sequence_identity(alignment, mode="not_terminal"):
for i in range(n_seq):
for j in range(n_seq):
# Find latest start and earliest stop of all sequences
- start, stop = find_terminal_gaps(alignment[:, [i,j]])
+ start, stop = find_terminal_gaps(alignment[:, [i, j]])
if stop <= start:
raise ValueError(
"Cannot calculate non-terminal identity, "
"as the two sequences have no overlap"
)
- length[i,j] = stop - start
+ length[i, j] = stop - start
elif mode == "shortest":
length = np.zeros((n_seq, n_seq))
for i in range(n_seq):
for j in range(n_seq):
- length[i,j] = min([
- len(alignment.sequences[i]),
- len(alignment.sequences[j])
- ])
+ length[i, j] = min(
+ [len(alignment.sequences[i]), len(alignment.sequences[j])]
+ )
else:
raise ValueError(f"'{mode}' is an invalid calculation mode")
@@ -506,7 +512,7 @@ def score(alignment, matrix, gap_penalty=-10, terminal_penalty=True):
# Do not count self-similarity
# and do not count similarity twice (not S(i,j) and S(j,i))
for i in range(codes.shape[0]):
- for j in range(i+1, codes.shape[0]):
+ for j in range(i + 1, codes.shape[0]):
code_i = column[i]
code_j = column[j]
# Ignore gaps
@@ -514,10 +520,10 @@ def score(alignment, matrix, gap_penalty=-10, terminal_penalty=True):
score += matrix[code_i, code_j]
# Sum gap penalties
- if type(gap_penalty) == int:
+ if isinstance(gap_penalty, numbers.Real):
gap_open = gap_penalty
gap_ext = gap_penalty
- elif type(gap_penalty) == tuple:
+ elif isinstance(gap_penalty, Sequence):
gap_open = gap_penalty[0]
gap_ext = gap_penalty[1]
else:
@@ -593,11 +599,11 @@ def find_terminal_gaps(alignment):
"""
trace = alignment.trace
# Find for each sequence the positions of non-gap symbols
- no_gap_pos = [np.where(trace[:,i] != -1)[0] for i in range(trace.shape[1])]
+ no_gap_pos = [np.where(trace[:, i] != -1)[0] for i in range(trace.shape[1])]
# Find for each sequence the positions of the sequence start and end
# in the alignment
- firsts = [no_gap_pos[i][0 ] for i in range(trace.shape[1])]
- lasts = [no_gap_pos[i][-1] for i in range(trace.shape[1])]
+ firsts = [no_gap_pos[i][0] for i in range(trace.shape[1])]
+ lasts = [no_gap_pos[i][-1] for i in range(trace.shape[1])]
# The terminal gaps are before all sequences start and after any
# sequence ends
# Use exclusive stop -> -1
@@ -658,4 +664,4 @@ def remove_terminal_gaps(alignment):
"Cannot remove terminal gaps, since at least two sequences have "
"no overlap and the resulting alignment would be empty"
)
- return alignment[start : stop]
\ No newline at end of file
+ return alignment[start:stop]
diff --git a/src/biotite/sequence/align/buckets.py b/src/biotite/sequence/align/buckets.py
index 79a1afadd..5b99ef890 100644
--- a/src/biotite/sequence/align/buckets.py
+++ b/src/biotite/sequence/align/buckets.py
@@ -6,11 +6,12 @@
__author__ = "Patrick Kunzmann"
__all__ = ["bucket_number"]
-from os.path import realpath, dirname, join
+from os.path import dirname, join, realpath
import numpy as np
-
_primes = None
+
+
def bucket_number(n_kmers, load_factor=0.8):
"""
Find an appropriate number of buckets for a :class:`BucketKmerTable`
@@ -54,16 +55,17 @@ def bucket_number(n_kmers, load_factor=0.8):
"""
global _primes
if _primes is None:
- with open(
- join(dirname(realpath(__file__)), "primes.txt")
- ) as file:
- _primes = np.array([
- int(line) for line in file.read().splitlines()
- if len(line) != 0 and line[0] != "#"
- ])
+ with open(join(dirname(realpath(__file__)), "primes.txt")) as file:
+ _primes = np.array(
+ [
+ int(line)
+ for line in file.read().splitlines()
+ if len(line) != 0 and line[0] != "#"
+ ]
+ )
number = int(n_kmers / load_factor)
index = np.searchsorted(_primes, number, side="left")
if index == len(_primes):
raise ValueError("Number of buckets too large")
- return _primes[index]
\ No newline at end of file
+ return _primes[index]
diff --git a/src/biotite/sequence/align/cigar.py b/src/biotite/sequence/align/cigar.py
index abe76cae6..60366e897 100644
--- a/src/biotite/sequence/align/cigar.py
+++ b/src/biotite/sequence/align/cigar.py
@@ -8,13 +8,14 @@
import enum
import numpy as np
-from .alignment import Alignment, get_codes
+from biotite.sequence.align.alignment import Alignment, get_codes
class CigarOp(enum.IntEnum):
"""
An enum for the different CIGAR operations.
"""
+
MATCH = 0
INSERTION = 1
DELETION = 2
@@ -46,23 +47,23 @@ def from_cigar_symbol(symbol):
def to_cigar_symbol(self):
return _op_to_str[self]
+
_str_to_op = {
- "M" : CigarOp.MATCH,
- "I" : CigarOp.INSERTION,
- "D" : CigarOp.DELETION,
- "N" : CigarOp.INTRON,
- "S" : CigarOp.SOFT_CLIP,
- "H" : CigarOp.HARD_CLIP,
- "P" : CigarOp.PADDING,
- "=" : CigarOp.EQUAL,
- "X" : CigarOp.DIFFERENT,
- "B" : CigarOp.BACK
- }
+ "M": CigarOp.MATCH,
+ "I": CigarOp.INSERTION,
+ "D": CigarOp.DELETION,
+ "N": CigarOp.INTRON,
+ "S": CigarOp.SOFT_CLIP,
+ "H": CigarOp.HARD_CLIP,
+ "P": CigarOp.PADDING,
+ "=": CigarOp.EQUAL,
+ "X": CigarOp.DIFFERENT,
+ "B": CigarOp.BACK,
+}
_op_to_str = {v: k for k, v in _str_to_op.items()}
-def read_alignment_from_cigar(cigar, position,
- reference_sequence, segment_sequence):
+def read_alignment_from_cigar(cigar, position, reference_sequence, segment_sequence):
"""
Create an :class:`Alignment` from a CIGAR string.
@@ -147,20 +148,16 @@ def read_alignment_from_cigar(cigar, position,
else:
operations = np.asarray(cigar, dtype=int)
if operations.ndim != 2:
- raise ValueError(
- "Expected array with shape (n,2)"
- )
+ raise ValueError("Expected array with shape (n,2)")
if operations.shape[1] != 2:
- raise ValueError(
- "Expected (operation, length) pairs"
- )
+ raise ValueError("Expected (operation, length) pairs")
if len(operations) == 0:
return Alignment(
[reference_sequence, segment_sequence], np.zeros((0, 2), dtype=int)
)
- trace = np.zeros((np.sum(operations[:,1]), 2), dtype=int)
+ trace = np.zeros((np.sum(operations[:, 1]), 2), dtype=int)
clip_mask = np.ones(trace.shape[0], dtype=bool)
i = 0
@@ -187,19 +184,23 @@ def read_alignment_from_cigar(cigar, position,
elif op == CigarOp.HARD_CLIP:
clip_mask[i : i + length] = False
else:
- raise ValueError(
- f"CIGAR operation {op} is not implemented"
- )
+ raise ValueError(f"CIGAR operation {op} is not implemented")
i += length
# Remove clipped positions
trace = trace[clip_mask]
return Alignment([reference_sequence, segment_sequence], trace)
-def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1,
- introns=(), distinguish_matches=False,
- hard_clip=False, include_terminal_gaps=False,
- as_string=True):
+def write_alignment_to_cigar(
+ alignment,
+ reference_index=0,
+ segment_index=1,
+ introns=(),
+ distinguish_matches=False,
+ hard_clip=False,
+ include_terminal_gaps=False,
+ as_string=True,
+):
"""
Convert an :class:`Alignment` into a CIGAR string.
@@ -305,8 +306,8 @@ def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1,
seg_trace = alignment.trace[:, segment_index]
operations = np.full(alignment.trace.shape[0], CigarOp.MATCH, dtype=int)
- insertion_mask = (ref_trace == -1)
- deletion_mask = (seg_trace == -1)
+ insertion_mask = ref_trace == -1
+ deletion_mask = seg_trace == -1
if np.any(insertion_mask & deletion_mask):
raise ValueError(
"Alignment contains insertion and deletion at the same position"
@@ -318,35 +319,27 @@ def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1,
intron_mask = np.zeros(operations.shape[0], dtype=bool)
for start, stop in introns:
if start >= stop:
- raise ValueError(
- "Intron start must be smaller than intron stop"
- )
+ raise ValueError("Intron start must be smaller than intron stop")
if start < 0:
- raise ValueError(
- "Intron start must not be negative"
- )
+ raise ValueError("Intron start must not be negative")
intron_mask[(ref_trace >= start) & (ref_trace < stop)] = True
if np.any(intron_mask & ~deletion_mask):
- raise ValueError(
- "Introns must be within gaps in the reference sequence"
- )
+ raise ValueError("Introns must be within gaps in the reference sequence")
operations[intron_mask] = CigarOp.INTRON
if distinguish_matches:
symbol_codes = get_codes(alignment)
ref_codes = symbol_codes[reference_index, :]
seg_codes = symbol_codes[segment_index, :]
- equal_mask = (ref_codes == seg_codes)
- match_mask = (operations == CigarOp.MATCH)
+ equal_mask = ref_codes == seg_codes
+ match_mask = operations == CigarOp.MATCH
operations[equal_mask & match_mask] = CigarOp.EQUAL
operations[~equal_mask & match_mask] = CigarOp.DIFFERENT
op_tuples = _aggregate_consecutive(operations)
clip_op = CigarOp.HARD_CLIP if hard_clip else CigarOp.SOFT_CLIP
- start_clip_length, end_clip_length = _find_clipped_bases(
- alignment, segment_index
- )
+ start_clip_length, end_clip_length = _find_clipped_bases(alignment, segment_index)
if start_clip_length != 0:
start_clip = [(clip_op, start_clip_length)]
else:
@@ -386,9 +379,7 @@ def _find_clipped_bases(alignment, segment_index):
# all previous bases are clipped...
start_clip_length = seg_trace[0]
# ...and the same applies for the last base
- end_clip_length = (
- len(alignment.sequences[segment_index]) - seg_trace[-1] - 1
- )
+ end_clip_length = len(alignment.sequences[segment_index]) - seg_trace[-1] - 1
return start_clip_length, end_clip_length
@@ -431,4 +422,4 @@ def _op_tuples_from_cigar(cigar):
op = CigarOp.from_cigar_symbol(char)
op_tuples.append((op, count))
count = ""
- return np.array(op_tuples, dtype=int)
\ No newline at end of file
+ return np.array(op_tuples, dtype=int)
diff --git a/src/biotite/sequence/align/matrix.py b/src/biotite/sequence/align/matrix.py
index 7f7d4f9eb..2a7d23437 100644
--- a/src/biotite/sequence/align/matrix.py
+++ b/src/biotite/sequence/align/matrix.py
@@ -5,11 +5,9 @@
__name__ = "biotite.sequence.align"
__author__ = "Patrick Kunzmann"
-from ..sequence import Sequence
-from ..seqtypes import NucleotideSequence, ProteinSequence
-from ..alphabet import Alphabet
-import numpy as np
import os
+import numpy as np
+from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence
__all__ = ["SubstitutionMatrix"]
@@ -21,54 +19,54 @@ class SubstitutionMatrix(object):
A :class:`SubstitutionMatrix` maps each possible pairing of a symbol
of a first alphabet with a symbol of a second alphabet to a score
(integer).
-
+
The class uses a 2-D (m x n) :class:`ndarray`
(dtype=:attr:`numpy.int32`),
where each element stores the score for a symbol pairing, indexed
by the symbol codes of the respective symbols in an *m*-length
alphabet 1 and an *n*-length alphabet 2.
-
+
There are 3 ways to creates instances:
-
+
At first a 2-D :class:`ndarray` containing the scores can be
directly provided.
-
+
Secondly a dictionary can be provided, where the keys are pairing
tuples and values are the corresponding scores.
The pairing tuples consist of a symbol of alphabet 1 as first
element and a symbol of alphabet 2 as second element. Parings have
to be provided for each possible combination.
-
+
At last a valid matrix name can be given, which is loaded from the
internal matrix database. The following matrices are avaliable:
-
+
- Nucleotide substitution matrices from NCBI database
- **NUC** - Also usable with ambiguous alphabet
-
+
- Protein substitution matrices from NCBI database
-
+
- **PAM**
- **BLOSUM**
- **MATCH** - Only differentiates between match and mismatch
- **IDENTITY** - Strongly penalizes mismatches
- **GONNET** - Not usable with default protein alphabet
- **DAYHOFF**
-
+
- Corrected protein substitution matrices :footcite:`Hess2016`,
**** is the BLOCKS version, the matrix is based on
-
+
- **BLOSUM_**
- **RBLOSUM_**
- **CorBLOSUM_**
-
+
A list of all available matrix names is returned by
:meth:`list_db()`.
-
+
Since this class can handle two different alphabets, it is possible
to align two different types of sequences.
-
+
Objects of this class are immutable.
-
+
Parameters
----------
alphabet1 : Alphabet, length=m
@@ -79,23 +77,23 @@ class SubstitutionMatrix(object):
Either a symbol code indexed :class:`ndarray` containing the scores,
or a dictionary mapping the symbol pairing to scores,
or a string referencing a matrix in the internal database.
-
+
Raises
------
KeyError
If the matrix dictionary misses a symbol given in the alphabet.
-
+
References
----------
-
+
.. footbibliography::
-
+
Examples
--------
-
+
Creating a matrix for two different (nonsense) alphabets
via a matrix dictionary:
-
+
>>> alph1 = Alphabet(["foo","bar"])
>>> alph2 = Alphabet([1,2,3])
>>> matrix_dict = {("foo",1):5, ("foo",2):10, ("foo",3):15,
@@ -119,17 +117,16 @@ class SubstitutionMatrix(object):
C 0 1 0 0
G 0 0 1 0
T 0 0 0 1
-
+
Creating a matrix via database name:
-
+
>>> alph = ProteinSequence.alphabet
>>> matrix = SubstitutionMatrix(alph, alph, "BLOSUM50")
"""
-
+
# Directory of matrix files
- _db_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
- "matrix_data")
-
+ _db_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "matrix_data")
+
def __init__(self, alphabet1, alphabet2, score_matrix):
self._alph1 = alphabet1
self._alph2 = alphabet2
@@ -147,16 +144,19 @@ def __init__(self, alphabet1, alphabet2, score_matrix):
matrix_dict = SubstitutionMatrix.dict_from_db(score_matrix)
self._fill_with_matrix_dict(matrix_dict)
else:
- raise TypeError("Matrix must be either a dictionary, "
- "an 2-D ndarray or a string")
+ raise TypeError(
+ "Matrix must be either a dictionary, " "an 2-D ndarray or a string"
+ )
# This class is immutable and has a getter function for the
# score matrix -> make the score matrix read-only
self._matrix.setflags(write=False)
def __repr__(self):
"""Represent SubstitutionMatrix as a string for debugging."""
- return f"SubstitutionMatrix({self._alph1.__repr__()}, {self._alph2.__repr__()}, " \
- f"np.{np.array_repr(self._matrix)})"
+ return (
+ f"SubstitutionMatrix({self._alph1.__repr__()}, {self._alph2.__repr__()}, "
+ f"np.{np.array_repr(self._matrix)})"
+ )
def __eq__(self, item):
if not isinstance(item, SubstitutionMatrix):
@@ -173,40 +173,39 @@ def __ne__(self, item):
return not self == item
def _fill_with_matrix_dict(self, matrix_dict):
- self._matrix = np.zeros(( len(self._alph1), len(self._alph2) ),
- dtype=np.int32)
+ self._matrix = np.zeros((len(self._alph1), len(self._alph2)), dtype=np.int32)
for i in range(len(self._alph1)):
for j in range(len(self._alph2)):
sym1 = self._alph1.decode(i)
sym2 = self._alph2.decode(j)
- self._matrix[i,j] = int(matrix_dict[sym1, sym2])
-
+ self._matrix[i, j] = int(matrix_dict[sym1, sym2])
+
def get_alphabet1(self):
"""
- Get the first alphabet.
-
+ Get the first alphabet.
+
Returns
-------
alphabet : Alphabet
The first alphabet.
"""
return self._alph1
-
+
def get_alphabet2(self):
"""
- Get the second alphabet.
-
+ Get the second alphabet.
+
Returns
-------
alphabet : Alphabet
The second alphabet.
"""
return self._alph2
-
+
def score_matrix(self):
"""
Get the 2-D :class:`ndarray` containing the score values.
-
+
Returns
-------
matrix : ndarray, shape=(m,n), dtype=np.int32
@@ -214,12 +213,12 @@ def score_matrix(self):
The array is read-only.
"""
return self._matrix
-
+
def transpose(self):
"""
Get a copy of this instance, where the alphabets are
interchanged.
-
+
Returns
-------
transposed : SubstitutionMatrix
@@ -229,7 +228,7 @@ def transpose(self):
new_alph2 = self._alph1
new_matrix = np.transpose(self._matrix)
return SubstitutionMatrix(new_alph1, new_alph2, new_matrix)
-
+
def is_symmetric(self):
"""
Check whether the substitution matrix is symmetric,
@@ -242,35 +241,36 @@ def is_symmetric(self):
True, if both alphabets are identical and the score matrix
is symmetric, false otherwise.
"""
- return self._alph1 == self._alph2 \
- and np.array_equal(self._matrix, np.transpose(self._matrix))
-
+ return self._alph1 == self._alph2 and np.array_equal(
+ self._matrix, np.transpose(self._matrix)
+ )
+
def get_score_by_code(self, code1, code2):
"""
Get the substitution score of two symbols,
represented by their code.
-
+
Parameters
----------
code1, code2 : int
Symbol codes of the two symbols to be aligned.
-
+
Returns
-------
score : int
The substitution / alignment score.
"""
return self._matrix[code1, code2]
-
+
def get_score(self, symbol1, symbol2):
"""
Get the substitution score of two symbols.
-
+
Parameters
----------
symbol1, symbol2 : object
Symbols to be aligned.
-
+
Returns
-------
score : int
@@ -279,19 +279,19 @@ def get_score(self, symbol1, symbol2):
code1 = self._alph1.encode(symbol1)
code2 = self._alph2.encode(symbol2)
return self._matrix[code1, code2]
-
+
def shape(self):
"""
Get the shape (i.e. the length of both alphabets)
of the subsitution matrix.
-
+
Returns
-------
shape : tuple
Matrix shape.
"""
return (len(self._alph1), len(self._alph2))
-
+
def __str__(self):
# Create matrix in NCBI format
string = " "
@@ -306,18 +306,18 @@ def __str__(self):
# Remove terminal line break
string = string[:-1]
return string
-
+
@staticmethod
def dict_from_str(string):
"""
Create a matrix dictionary from a string in NCBI matrix format.
-
+
Symbols of the first alphabet are taken from the left column,
symbols of the second alphabet are taken from the top row.
-
+
The keys of the dictionary consist of tuples containing the
aligned symbols and the values are the corresponding scores.
-
+
Returns
-------
matrix_dict : dict
@@ -329,22 +329,22 @@ def dict_from_str(string):
symbols2 = [e for e in lines[0].split()]
scores = np.array([line.split()[1:] for line in lines[1:]]).astype(int)
scores = np.transpose(scores)
-
+
matrix_dict = {}
for i in range(len(symbols1)):
for j in range(len(symbols2)):
- matrix_dict[(symbols1[i], symbols2[j])] = scores[i,j]
+ matrix_dict[(symbols1[i], symbols2[j])] = scores[i, j]
return matrix_dict
-
+
@staticmethod
def dict_from_db(matrix_name):
"""
Create a matrix dictionary from a valid matrix name in the
internal matrix database.
-
+
The keys of the dictionary consist of tuples containing the
aligned symbols and the values are the corresponding scores.
-
+
Returns
-------
matrix_dict : dict
@@ -353,12 +353,12 @@ def dict_from_db(matrix_name):
filename = SubstitutionMatrix._db_dir + os.sep + matrix_name + ".mat"
with open(filename, "r") as f:
return SubstitutionMatrix.dict_from_str(f.read())
-
+
@staticmethod
def list_db():
"""
List all matrix names in the internal database.
-
+
Returns
-------
db_list : list
@@ -367,27 +367,26 @@ def list_db():
files = os.listdir(SubstitutionMatrix._db_dir)
# Remove '.mat' from files
return [file[:-4] for file in sorted(files)]
-
-
+
@staticmethod
def std_protein_matrix():
"""
Get the default :class:`SubstitutionMatrix` for protein sequence
alignments, which is BLOSUM62.
-
+
Returns
-------
matrix : SubstitutionMatrix
Default matrix.
"""
return _matrix_blosum62
-
+
@staticmethod
def std_nucleotide_matrix():
"""
Get the default :class:`SubstitutionMatrix` for DNA sequence
alignments.
-
+
Returns
-------
matrix : SubstitutionMatrix
@@ -395,11 +394,11 @@ def std_nucleotide_matrix():
"""
return _matrix_nuc
-# Preformatted BLOSUM62 and NUC substitution matrix from NCBI
-_matrix_blosum62 = SubstitutionMatrix(ProteinSequence.alphabet,
- ProteinSequence.alphabet,
- "BLOSUM62")
-_matrix_nuc = SubstitutionMatrix(NucleotideSequence.alphabet_amb,
- NucleotideSequence.alphabet_amb,
- "NUC")
+# Preformatted BLOSUM62 and NUC substitution matrix from NCBI
+_matrix_blosum62 = SubstitutionMatrix(
+ ProteinSequence.alphabet, ProteinSequence.alphabet, "BLOSUM62"
+)
+_matrix_nuc = SubstitutionMatrix(
+ NucleotideSequence.alphabet_amb, NucleotideSequence.alphabet_amb, "NUC"
+)
diff --git a/src/biotite/sequence/align/statistics.py b/src/biotite/sequence/align/statistics.py
index 19a8c9aba..72a783ac5 100644
--- a/src/biotite/sequence/align/statistics.py
+++ b/src/biotite/sequence/align/statistics.py
@@ -7,8 +7,8 @@
__all__ = ["EValueEstimator"]
import numpy as np
-from ..seqtypes import GeneralSequence
-from .pairwise import align_optimal
+from biotite.sequence.align.pairwise import align_optimal
+from biotite.sequence.seqtypes import GeneralSequence
class EValueEstimator:
@@ -29,7 +29,7 @@ class EValueEstimator:
of random sequence alignments in :meth:`from_samples()`
:footcite:`Altschul1986`, which may be time consuming.
If these parameters are known, the constructor can be used instead.
-
+
Based on the sampled parameters, the decadic logarithm of the
E-value can be quickly calculated via :meth:`log_evalue()`.
@@ -39,7 +39,7 @@ class EValueEstimator:
The :math:`\lambda` parameter.
k : float
The :math:`K` parameter.
-
+
Notes
-----
The calculated E-value is a rough estimation that gets more
@@ -102,8 +102,9 @@ def __init__(self, lam, k):
self._k = k
@staticmethod
- def from_samples(alphabet, matrix, gap_penalty, frequencies,
- sample_length=1000, sample_size=1000):
+ def from_samples(
+ alphabet, matrix, gap_penalty, frequencies, sample_length=1000, sample_size=1000
+ ):
r"""
Create an :class:`EValueEstimator` with :math:`\lambda` and
:math:`K` estimated via sampling alignments of random sequences
@@ -137,13 +138,13 @@ def from_samples(alphabet, matrix, gap_penalty, frequencies,
The number of sampled sequences.
The accuracy of the estimated parameters and E-values,
but also the runtime increases with the sample size.
-
+
Returns
-------
estimator : EValueEstimator
A :class:`EValueEstimator` with sampled :math:`\lambda` and
:math:`K` parameters.
-
+
Notes
-----
The sampling process generates random sequences based on
@@ -167,15 +168,15 @@ def from_samples(alphabet, matrix, gap_penalty, frequencies,
raise ValueError("A symmetric substitution matrix is required")
if not matrix.get_alphabet1().extends(alphabet):
raise ValueError(
- "The substitution matrix is not compatible "
- "with the given alphabet"
+ "The substitution matrix is not compatible " "with the given alphabet"
)
- score_matrix = matrix.score_matrix()[:len(alphabet), :len(alphabet)]
- if np.sum(
- score_matrix \
- * frequencies[np.newaxis, :] \
- * frequencies[:, np.newaxis]
- ) >= 0:
+ score_matrix = matrix.score_matrix()[: len(alphabet), : len(alphabet)]
+ if (
+ np.sum(
+ score_matrix * frequencies[np.newaxis, :] * frequencies[:, np.newaxis]
+ )
+ >= 0
+ ):
raise ValueError(
"Invalid substitution matrix, the expected similarity "
"score between two random symbols is not negative"
@@ -183,9 +184,7 @@ def from_samples(alphabet, matrix, gap_penalty, frequencies,
# Generate the sequence code for the random sequences
random_sequence_code = np.random.choice(
- len(alphabet),
- size=(sample_size, 2, sample_length),
- p=frequencies
+ len(alphabet), size=(sample_size, 2, sample_length), p=frequencies
)
# Sample the alignments of random sequences
@@ -193,28 +192,27 @@ def from_samples(alphabet, matrix, gap_penalty, frequencies,
for i in range(sample_size):
seq1 = GeneralSequence(alphabet)
seq2 = GeneralSequence(alphabet)
- seq1.code = random_sequence_code[i,0]
- seq2.code = random_sequence_code[i,1]
+ seq1.code = random_sequence_code[i, 0]
+ seq2.code = random_sequence_code[i, 1]
sample_scores[i] = align_optimal(
- seq1, seq2, matrix,
- local=True, gap_penalty=gap_penalty, max_number=1
+ seq1, seq2, matrix, local=True, gap_penalty=gap_penalty, max_number=1
)[0].score
-
+
# Use method of moments to estimate parameters
lam = np.pi / np.sqrt(6 * np.var(sample_scores))
u = np.mean(sample_scores) - np.euler_gamma / lam
k = np.exp(lam * u) / sample_length**2
-
+
return EValueEstimator(lam, k)
@property
def lam(self):
return self._lam
-
+
@property
def k(self):
return self._k
-
+
def log_evalue(self, score, seq1_length, seq2_length):
r"""
Calculate the decadic logarithm of the E-value for a given
@@ -223,11 +221,11 @@ def log_evalue(self, score, seq1_length, seq2_length):
The E-value and the logarithm of the E-value is calculated as
.. math::
-
+
E = Kmn e^{-\lambda s}
\log_{10} E = (\log_{10} Kmn) - \frac{\lambda s}{\ln 10},
-
+
where :math:`s` is the similarity score and :math:`m` and
:math:`n` are the lengths of the aligned sequences.
@@ -245,12 +243,12 @@ def log_evalue(self, score, seq1_length, seq2_length):
this is usually either the combined length of all sequences
in the database or the length of the hit sequence multiplied
by the number of sequences in the database.
-
+
Returns
-------
log_e : float
The decadic logarithm of the E-value.
-
+
Notes
-----
This method returns the logarithm of the E-value instead of
@@ -261,5 +259,6 @@ def log_evalue(self, score, seq1_length, seq2_length):
seq1_length = np.asarray(seq1_length)
seq2_length = np.asarray(seq2_length)
- return np.log10(self._k * seq1_length * seq2_length) \
- - self._lam * score / np.log(10)
\ No newline at end of file
+ return np.log10(
+ self._k * seq1_length * seq2_length
+ ) - self._lam * score / np.log(10)
diff --git a/src/biotite/sequence/alphabet.py b/src/biotite/sequence/alphabet.py
index 4b9fe9683..4231817bd 100644
--- a/src/biotite/sequence/alphabet.py
+++ b/src/biotite/sequence/alphabet.py
@@ -4,14 +4,19 @@
__name__ = "biotite.sequence"
__author__ = "Patrick Kunzmann"
-__all__ = ["Alphabet", "LetterAlphabet", "AlphabetMapper", "AlphabetError",
- "common_alphabet"]
+__all__ = [
+ "Alphabet",
+ "LetterAlphabet",
+ "AlphabetMapper",
+ "AlphabetError",
+ "common_alphabet",
+]
import copy
-from numbers import Integral
import string
+from numbers import Integral
import numpy as np
-from .codec import encode_chars, decode_to_chars, map_sequence_code
+from biotite.sequence.codec import decode_to_chars, encode_chars, map_sequence_code
class Alphabet(object):
@@ -107,7 +112,7 @@ def __init__(self, symbols):
def __repr__(self):
"""Represent Alphabet as a string for debugging."""
- return f'Alphabet({self._symbols})'
+ return f"Alphabet({self._symbols})"
def get_symbols(self):
"""
@@ -139,8 +144,7 @@ def extends(self, alphabet):
elif len(alphabet) > len(self):
return False
else:
- return alphabet.get_symbols() \
- == self.get_symbols()[:len(alphabet)]
+ return alphabet.get_symbols() == self.get_symbols()[: len(alphabet)]
def encode(self, symbol):
"""
@@ -164,9 +168,7 @@ def encode(self, symbol):
try:
return self._symbol_dict[symbol]
except KeyError:
- raise AlphabetError(
- f"Symbol {repr(symbol)} is not in the alphabet"
- )
+ raise AlphabetError(f"Symbol {repr(symbol)} is not in the alphabet")
def decode(self, code):
"""
@@ -238,9 +240,8 @@ def is_letter_alphabet(self):
have length 1 and are printable.
"""
for symbol in self:
- if not isinstance(symbol, (str, bytes)) \
- or len(symbol) > 1:
- return False
+ if not isinstance(symbol, (str, bytes)) or len(symbol) > 1:
+ return False
if isinstance(symbol, str):
symbol = symbol.encode("ASCII")
if symbol not in LetterAlphabet.PRINATBLES:
@@ -292,8 +293,9 @@ class LetterAlphabet(Alphabet):
in this list.
"""
- PRINATBLES = (string.digits + string.ascii_letters + string.punctuation) \
- .encode("ASCII")
+ PRINATBLES = (string.digits + string.ascii_letters + string.punctuation).encode(
+ "ASCII"
+ )
def __init__(self, symbols):
if len(symbols) == 0:
@@ -312,13 +314,12 @@ def __init__(self, symbols):
# Direct 'astype' conversion is not allowed by numpy
# -> frombuffer()
self._symbols = np.frombuffer(
- np.array(self._symbols, dtype="|S1"),
- dtype=np.ubyte
+ np.array(self._symbols, dtype="|S1"), dtype=np.ubyte
)
def __repr__(self):
"""Represent LetterAlphabet as a string for debugging."""
- return f'LetterAlphabet({self.get_symbols()})'
+ return f"LetterAlphabet({self.get_symbols()})"
def extends(self, alphabet):
if alphabet is self:
@@ -326,9 +327,7 @@ def extends(self, alphabet):
elif type(alphabet) == LetterAlphabet:
if len(alphabet._symbols) > len(self._symbols):
return False
- return np.all(
- alphabet._symbols == self._symbols[:len(alphabet._symbols)]
- )
+ return np.all(alphabet._symbols == self._symbols[: len(alphabet._symbols)])
else:
return super().extends(alphabet)
@@ -341,17 +340,14 @@ def get_symbols(self):
symbols : list
Copy of the internal list of symbols.
"""
- return [symbol.decode("ASCII") for symbol
- in self._symbols_as_bytes()]
+ return [symbol.decode("ASCII") for symbol in self._symbols_as_bytes()]
def encode(self, symbol):
if not isinstance(symbol, (str, bytes)) or len(symbol) > 1:
raise AlphabetError(f"Symbol '{symbol}' is not a single letter")
indices = np.where(self._symbols == ord(symbol))[0]
if len(indices) == 0:
- raise AlphabetError(
- f"Symbol {repr(symbol)} is not in the alphabet"
- )
+ raise AlphabetError(f"Symbol {repr(symbol)} is not in the alphabet")
return indices[0].item()
def decode(self, code, as_bytes=False):
@@ -382,13 +378,10 @@ def encode_multiple(self, symbols, dtype=None):
elif isinstance(symbols, bytes):
symbols = np.frombuffer(symbols, dtype=np.ubyte)
elif isinstance(symbols, np.ndarray):
- symbols = np.frombuffer(
- symbols.astype(dtype="|S1"), dtype=np.ubyte
- )
+ symbols = np.frombuffer(symbols.astype(dtype="|S1"), dtype=np.ubyte)
else:
symbols = np.frombuffer(
- np.array(list(symbols), dtype="|S1"),
- dtype=np.ubyte
+ np.array(list(symbols), dtype="|S1"), dtype=np.ubyte
)
return encode_chars(alphabet=self._symbols, symbols=symbols)
@@ -435,7 +428,6 @@ def _symbols_as_bytes(self):
return np.frombuffer(self._symbols, dtype="|S1")
-
class AlphabetMapper(object):
"""
This class is used for symbol code conversion from a source
@@ -486,8 +478,7 @@ def __init__(self, source_alphabet, target_alphabet):
else:
self._necessary_mapping = True
self._mapper = np.zeros(
- len(source_alphabet),
- dtype=AlphabetMapper._dtype(len(target_alphabet))
+ len(source_alphabet), dtype=AlphabetMapper._dtype(len(target_alphabet))
)
for old_code in range(len(source_alphabet)):
symbol = source_alphabet.decode(old_code)
@@ -500,26 +491,25 @@ def __getitem__(self, code):
return self._mapper[code]
else:
return code
- if not isinstance(code, np.ndarray) \
- or code.dtype not in (np.uint8, np.uint16, np.uint32, np.uint64):
- code = np.array(code, dtype=np.uint64)
+ if not isinstance(code, np.ndarray) or code.dtype not in (
+ np.uint8,
+ np.uint16,
+ np.uint32,
+ np.uint64,
+ ):
+ code = np.array(code, dtype=np.uint64)
if self._necessary_mapping:
mapped_code = np.empty(len(code), dtype=self._mapper.dtype)
- map_sequence_code(
- self._mapper,
- code,
- mapped_code
- )
+ map_sequence_code(self._mapper, code, mapped_code)
return mapped_code
else:
return code
-
@staticmethod
def _dtype(alphabet_size):
- _size_uint8 = np.iinfo(np.uint8 ).max +1
- _size_uint16 = np.iinfo(np.uint16).max +1
- _size_uint32 = np.iinfo(np.uint32).max +1
+ _size_uint8 = np.iinfo(np.uint8).max + 1
+ _size_uint16 = np.iinfo(np.uint16).max + 1
+ _size_uint32 = np.iinfo(np.uint32).max + 1
if alphabet_size <= _size_uint8:
return np.uint8
elif alphabet_size <= _size_uint16:
@@ -535,6 +525,7 @@ class AlphabetError(Exception):
This exception is raised, when a code or a symbol is not in an
:class:`Alphabet`.
"""
+
pass
@@ -563,4 +554,4 @@ def common_alphabet(alphabets):
common_alphabet = alphabet
else:
return None
- return common_alphabet
\ No newline at end of file
+ return common_alphabet
diff --git a/src/biotite/sequence/annotation.py b/src/biotite/sequence/annotation.py
index cb2a9267e..5843e6bb8 100644
--- a/src/biotite/sequence/annotation.py
+++ b/src/biotite/sequence/annotation.py
@@ -6,17 +6,15 @@
__author__ = "Patrick Kunzmann"
__all__ = ["Location", "Feature", "Annotation", "AnnotatedSequence"]
-import numbers
import copy
+import numbers
import sys
-from enum import Flag, Enum, auto
+from enum import Enum, Flag, auto
import numpy as np
-from .sequence import Sequence
-from ..copyable import Copyable
-from .seqtypes import NucleotideSequence
+from biotite.copyable import Copyable
-class Location():
+class Location:
"""
A :class:`Location` defines at which base(s)/residue(s) a feature is
located.
@@ -63,24 +61,25 @@ class Defect(Flag):
- **BETWEEN** - The position is between to consecutive
bases/residues.
"""
- NONE = 0
- MISS_LEFT = auto()
- MISS_RIGHT = auto()
- BEYOND_LEFT = auto()
+
+ NONE = 0
+ MISS_LEFT = auto()
+ MISS_RIGHT = auto()
+ BEYOND_LEFT = auto()
BEYOND_RIGHT = auto()
- UNK_LOC = auto()
- BETWEEN = auto()
+ UNK_LOC = auto()
+ BETWEEN = auto()
class Strand(Enum):
"""
This enum type describes the strand of the feature location.
This is not relevant for protein sequence features.
"""
+
FORWARD = auto()
REVERSE = auto()
- def __init__(self, first, last, strand=Strand.FORWARD,
- defect=Defect.NONE):
+ def __init__(self, first, last, strand=Strand.FORWARD, defect=Defect.NONE):
if first > last:
raise ValueError(
"The first position cannot be higher than the last position"
@@ -92,8 +91,10 @@ def __init__(self, first, last, strand=Strand.FORWARD,
def __repr__(self):
"""Represent Location as a string for debugging."""
- return f'Location({self._first}, {self._last}, strand={"Location." + str(self._strand)}, ' \
- f'defect={"Location." + str(self._defect)})'
+ return (
+ f'Location({self._first}, {self._last}, strand={"Location." + str(self._strand)}, '
+ f'defect={"Location." + str(self._defect)})'
+ )
@property
def first(self):
@@ -122,10 +123,12 @@ def __str__(self):
def __eq__(self, item):
if not isinstance(item, Location):
return False
- return ( self.first == item.first
- and self.last == item.last
- and self.strand == item.strand
- and self.defect == item.defect)
+ return (
+ self.first == item.first
+ and self.last == item.last
+ and self.strand == item.strand
+ and self.defect == item.defect
+ )
def __hash__(self):
return hash((self._first, self._last, self._strand, self._defect))
@@ -208,9 +211,11 @@ def get_location_range(self):
def __eq__(self, item):
if not isinstance(item, Feature):
return False
- return ( self._key == item._key
- and self._locs == item._locs
- and self._qual == item._qual)
+ return (
+ self._key == item._key
+ and self._locs == item._locs
+ and self._qual == item._qual
+ )
def __lt__(self, item):
if not isinstance(item, Feature):
@@ -223,7 +228,7 @@ def __lt__(self, item):
return True
elif first > it_first:
return False
- else: # First is equal
+ else: # First is equal
return last > it_last
def __gt__(self, item):
@@ -237,7 +242,7 @@ def __gt__(self, item):
return True
elif first < it_first:
return False
- else: # First is equal
+ else: # First is equal
return last < it_last
@property
@@ -253,7 +258,7 @@ def qual(self):
return copy.copy(self._qual)
def __hash__(self):
- return hash(( self._key, self._locs, frozenset(self._qual.items()) ))
+ return hash((self._key, self._locs, frozenset(self._qual.items())))
class Annotation(Copyable):
@@ -350,7 +355,9 @@ def __init__(self, features=None):
def __repr__(self):
"""Represent Annotation as a string for debugging."""
- return f'Annotation([{", ".join([feat.__repr__() for feat in self._features])}])'
+ return (
+ f'Annotation([{", ".join([feat.__repr__() for feat in self._features])}])'
+ )
def __copy_create__(self):
return Annotation(self._features)
@@ -403,7 +410,7 @@ def get_location_range(self):
if loc.last > last:
last = loc.last
# Exclusive stop -> +1
- return first, last+1
+ return first, last + 1
def del_feature(self, feature):
"""
@@ -475,9 +482,7 @@ def __getitem__(self, index):
if loc.last > i_last:
defect |= Location.Defect.MISS_RIGHT
last = i_last
- locs_in_scope.append(Location(
- first, last, loc.strand, defect
- ))
+ locs_in_scope.append(Location(first, last, loc.strand, defect))
if len(locs_in_scope) > 0:
# The feature is present in the new annotation
# if any of the original locations is in the new
@@ -488,15 +493,12 @@ def __getitem__(self, index):
sub_annot.add_feature(new_feature)
return sub_annot
else:
- raise TypeError(
- f"'{type(index).__name__}' instances are invalid indices"
- )
+ raise TypeError(f"'{type(index).__name__}' instances are invalid indices")
def __delitem__(self, item):
if not isinstance(item, Feature):
raise TypeError(
- f"Only 'Feature' objects are supported, "
- f"not {type(item).__name__}"
+ f"Only 'Feature' objects are supported, " f"not {type(item).__name__}"
)
self.del_feature(item)
@@ -626,8 +628,10 @@ def __init__(self, annotation, sequence, sequence_start=1):
def __repr__(self):
"""Represent AnnotatedSequence as a string for debugging."""
- return f'AnnotatedSequence({self._annotation.__repr__()}, {self._sequence.__repr__()}, ' \
- f'sequence_start={self._seqstart})'
+ return (
+ f"AnnotatedSequence({self._annotation.__repr__()}, {self._sequence.__repr__()}, "
+ f"sequence_start={self._seqstart})"
+ )
@property
def sequence_start(self):
@@ -643,7 +647,8 @@ def annotation(self):
def __copy_create__(self):
return AnnotatedSequence(
- self._annotation.copy(), self._sequence.copy, self._seqstart)
+ self._annotation.copy(), self._sequence.copy, self._seqstart
+ )
def reverse_complement(self, sequence_start=1):
"""
@@ -676,10 +681,12 @@ def reverse_complement(self, sequence_start=1):
# (seq_len-1) -> last sequence index
# (loc.last-self._seqstart) -> location to index
# ... + rev_seqstart -> index to location
- rev_loc_first \
- = (seq_len-1) - (loc.last-self._seqstart) + rev_seqstart
- rev_loc_last \
- = (seq_len-1) - (loc.first-self._seqstart) + rev_seqstart
+ rev_loc_first = (
+ (seq_len - 1) - (loc.last - self._seqstart) + rev_seqstart
+ )
+ rev_loc_last = (
+ (seq_len - 1) - (loc.first - self._seqstart) + rev_seqstart
+ )
if loc.strand == Location.Strand.FORWARD:
rev_loc_strand = Location.Strand.REVERSE
@@ -700,17 +707,14 @@ def reverse_complement(self, sequence_start=1):
if loc.defect & Location.Defect.BETWEEN:
rev_loc_defect |= Location.Defect.BETWEEN
- rev_locs.append(Location(
- rev_loc_first, rev_loc_last,
- rev_loc_strand, rev_loc_defect
- ))
- rev_features.append(Feature(
- feature.key, rev_locs, feature.qual
- ))
+ rev_locs.append(
+ Location(
+ rev_loc_first, rev_loc_last, rev_loc_strand, rev_loc_defect
+ )
+ )
+ rev_features.append(Feature(feature.key, rev_locs, feature.qual))
- return AnnotatedSequence(
- Annotation(rev_features), rev_sequence, rev_seqstart
- )
+ return AnnotatedSequence(Annotation(rev_features), rev_sequence, rev_seqstart)
def __getitem__(self, index):
if isinstance(index, Feature):
@@ -730,24 +734,20 @@ def __getitem__(self, index):
pass
elif strand is None:
strand = loc.strand
- else: # loc.strand != strand
+ else: # loc.strand != strand
raise ValueError(
"All locations of the feature must have the same "
"strand direction"
)
if strand == Location.Strand.FORWARD:
- sorted_locs = sorted(
- locs, key=lambda loc: loc.first
- )
+ sorted_locs = sorted(locs, key=lambda loc: loc.first)
else:
- sorted_locs = sorted(
- locs, key=lambda loc: loc.last, reverse=True
- )
+ sorted_locs = sorted(locs, key=lambda loc: loc.last, reverse=True)
# Merge the sequences corresponding to the ordered locations
for loc in sorted_locs:
slice_start = loc.first - self._seqstart
# +1 due to exclusive stop
- slice_stop = loc.last - self._seqstart +1
+ slice_stop = loc.last - self._seqstart + 1
add_seq = self._sequence[slice_start:slice_stop]
if loc.strand == Location.Strand.REVERSE:
add_seq = add_seq.reverse().complement()
@@ -775,17 +775,17 @@ def __getitem__(self, index):
rel_seq_start = self._seqstart
else:
rel_seq_start = index.start
- return AnnotatedSequence(self._annotation[index],
- self._sequence[seq_start:seq_stop],
- rel_seq_start)
+ return AnnotatedSequence(
+ self._annotation[index],
+ self._sequence[seq_start:seq_stop],
+ rel_seq_start,
+ )
elif isinstance(index, numbers.Integral):
return self._sequence[index - self._seqstart]
else:
- raise TypeError(
- f"'{type(index).__name__}' instances are invalid indices"
- )
+ raise TypeError(f"'{type(index).__name__}' instances are invalid indices")
def __setitem__(self, index, item):
if isinstance(index, Feature):
@@ -796,10 +796,11 @@ def __setitem__(self, index, item):
for loc in index.locs:
slice_start = loc.first - self._seqstart
# +1 due to exclusive stop
- slice_stop = loc.last - self._seqstart +1
+ slice_stop = loc.last - self._seqstart + 1
interval_size = slice_stop - slice_start
- self._sequence[slice_start:slice_stop] \
- = sub_seq[sub_seq_i : sub_seq_i + interval_size]
+ self._sequence[slice_start:slice_stop] = sub_seq[
+ sub_seq_i : sub_seq_i + interval_size
+ ]
sub_seq_i += interval_size
elif isinstance(index, slice):
# Sequence start correction
@@ -817,13 +818,13 @@ def __setitem__(self, index, item):
# Item is a symbol
self._sequence[index - self._seqstart] = item
else:
- raise TypeError(
- f"'{type(index).__name__}' instances are invalid indices"
- )
+ raise TypeError(f"'{type(index).__name__}' instances are invalid indices")
def __eq__(self, item):
if not isinstance(item, AnnotatedSequence):
return False
- return ( self.annotation == item.annotation
- and self.sequence == item.sequence
- and self._seqstart == item._seqstart)
+ return (
+ self.annotation == item.annotation
+ and self.sequence == item.sequence
+ and self._seqstart == item._seqstart
+ )
diff --git a/src/biotite/sequence/codon.py b/src/biotite/sequence/codon.py
index fe50c791f..13a5d64d8 100644
--- a/src/biotite/sequence/codon.py
+++ b/src/biotite/sequence/codon.py
@@ -7,11 +7,10 @@
__all__ = ["CodonTable"]
import copy
-from os.path import join, dirname, realpath
-import numpy as np
from numbers import Integral
-from .seqtypes import NucleotideSequence, ProteinSequence
-
+from os.path import dirname, join, realpath
+import numpy as np
+from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence
# Abbreviations
_NUC_ALPH = NucleotideSequence.alphabet_unamb
@@ -20,7 +19,7 @@
# Multiplier array that converts a codon in code representation
# into a unique integer
_radix = len(_NUC_ALPH)
-_radix_multiplier = np.array([_radix**n for n in (2,1,0)], dtype=int)
+_radix_multiplier = np.array([_radix**n for n in (2, 1, 0)], dtype=int)
class CodonTable(object):
@@ -29,14 +28,14 @@ class CodonTable(object):
amino acid.
It also defines start codons. A :class:`CodonTable`
takes/outputs either the symbols or code of the codon/amino acid.
-
+
Furthermore, this class is able to give a list of codons that
corresponds to a given amino acid.
-
+
The :func:`load()` method allows loading of NCBI codon tables.
-
+
Objects of this class are immutable.
-
+
Parameters
----------
codon_dict : dict of (str -> str)
@@ -47,27 +46,27 @@ class CodonTable(object):
starts : iterable object of str
The start codons. Each entry must be a string of length 3
(all upper case).
-
+
Examples
--------
-
+
Get the amino acid coded by a given codon (symbol and code):
-
+
>>> table = CodonTable.default_table()
>>> print(table["ATG"])
M
>>> print(table[(1,2,3)])
14
-
+
Get the codons coding for a given amino acid (symbol and code):
-
+
>>> table = CodonTable.default_table()
>>> print(table["M"])
('ATG',)
>>> print(table[14])
((0, 2, 0), (0, 2, 2), (1, 2, 0), (1, 2, 1), (1, 2, 2), (1, 2, 3))
"""
-
+
# For efficient mapping of codon codes to amino acid codes,
# especially in in the 'map_codon_codes()' function, the class
# maps each possible codon into a unique number using a radix based
@@ -77,7 +76,7 @@ class CodonTable(object):
# file for builtin codon tables from NCBI
_table_file = join(dirname(realpath(__file__)), "codon_tables.txt")
-
+
def __init__(self, codon_dict, starts):
# Check if 'starts' is iterable object of length 3 string
for start in starts:
@@ -100,12 +99,10 @@ def __init__(self, codon_dict, starts):
if (self._codons == -1).any():
# Find the missing codon
missing_index = np.where(self._codons == -1)[0][0]
- codon_code = CodonTable._to_codon(missing_index)
+ codon_code = CodonTable._to_codon(missing_index)
codon = _NUC_ALPH.decode_multiple(codon_code)
codon_str = "".join(codon)
- raise ValueError(
- f"Codon dictionary does not contain codon '{codon_str}'"
- )
+ raise ValueError(f"Codon dictionary does not contain codon '{codon_str}'")
def __repr__(self):
"""Represent CodonTable as a string for debugging."""
@@ -131,8 +128,10 @@ def __getitem__(self, item):
codon_numbers = np.where(self._codons == aa_code)[0]
codon_codes = CodonTable._to_codon(codon_numbers)
codons = tuple(
- ["".join(_NUC_ALPH.decode_multiple(codon_code))
- for codon_code in codon_codes]
+ [
+ "".join(_NUC_ALPH.decode_multiple(codon_code))
+ for codon_code in codon_codes
+ ]
)
return codons
elif len(item) == 3:
@@ -155,30 +154,28 @@ def __getitem__(self, item):
# Code for codon as any iterable object
# Code for codon -> return corresponding amino acid codes
if len(item) != 3:
- raise ValueError(
- f"{item} is an invalid sequence code for a codon"
- )
+ raise ValueError(f"{item} is an invalid sequence code for a codon")
codon_number = CodonTable._to_number(item)
aa_code = self._codons[codon_number]
return aa_code
-
+
def map_codon_codes(self, codon_codes):
"""
Efficiently map multiple codons to the corresponding amino
acids.
-
+
Parameters
----------
codon_codes : ndarray, dtype=int, shape=(n,3)
The codons to be translated into amino acids.
The codons are given as symbol codes.
*n* is the amount of codons.
-
+
Returns
-------
aa_codes : ndarray, dtype=int, shape=(n,)
The amino acids as symbol codes.
-
+
Examples
--------
>>> dna = NucleotideSequence("ATGGTTTAA")
@@ -209,46 +206,50 @@ def map_codon_codes(self, codon_codes):
codon_numbers = CodonTable._to_number(codon_codes)
aa_codes = self._codons[codon_numbers]
return aa_codes
-
+
def codon_dict(self, code=False):
"""
Get the codon to amino acid mappings dictionary.
-
+
Parameters
----------
code : bool
If true, the dictionary contains keys and values as code.
Otherwise, the dictionary contains strings for codons and
amino acid. (Default: False)
-
+
Returns
-------
codon_dict : dict
The dictionary mapping codons to amino acids.
"""
if code:
- return {tuple(CodonTable._to_codon(codon_number)): aa_code
- for codon_number, aa_code in enumerate(self._codons)}
+ return {
+ tuple(CodonTable._to_codon(codon_number)): aa_code
+ for codon_number, aa_code in enumerate(self._codons)
+ }
else:
- return {"".join(_NUC_ALPH.decode_multiple(codon_code)):
- _PROT_ALPH.decode(aa_code)
- for codon_code, aa_code
- in self.codon_dict(code=True).items()}
-
+ return {
+ "".join(_NUC_ALPH.decode_multiple(codon_code)): _PROT_ALPH.decode(
+ aa_code
+ )
+ for codon_code, aa_code in self.codon_dict(code=True).items()
+ }
+
def is_start_codon(self, codon_codes):
codon_numbers = CodonTable._to_number(codon_codes)
return np.isin(codon_numbers, self._starts)
-
+
def start_codons(self, code=False):
"""
Get the start codons of the codon table.
-
+
Parameters
----------
code : bool
If true, the code will be returned instead of strings.
(Default: False)
-
+
Returns
-------
start_codons : tuple
@@ -257,25 +258,29 @@ def start_codons(self, code=False):
"""
if code:
return tuple(
- [tuple(CodonTable._to_codon(codon_number))
- for codon_number in self._starts]
+ [
+ tuple(CodonTable._to_codon(codon_number))
+ for codon_number in self._starts
+ ]
)
else:
return tuple(
- ["".join(_NUC_ALPH.decode_multiple(codon_code))
- for codon_code in self.start_codons(code=True)]
+ [
+ "".join(_NUC_ALPH.decode_multiple(codon_code))
+ for codon_code in self.start_codons(code=True)
+ ]
)
-
+
def with_start_codons(self, starts):
"""
Create an new :class:`CodonTable` with the same codon mappings,
but changed start codons.
-
+
Parameters
----------
starts : iterable object of str
The new start codons.
-
+
Returns
-------
new_table : CodonTable
@@ -288,17 +293,17 @@ def with_start_codons(self, starts):
)
new_table._starts = CodonTable._to_number(start_codon_codes)
return new_table
-
+
def with_codon_mappings(self, codon_dict):
"""
Create an new :class:`CodonTable` with partially changed codon
mappings.
-
+
Parameters
----------
codon_dict : dict of (str -> str)
The changed codon mappings.
-
+
Returns
-------
new_table : CodonTable
@@ -329,9 +334,9 @@ def __str__(self):
else:
string += " "
# Add space for next codon
- string += " "*3
+ string += " " * 3
# Remove terminal space
- string = string [:-6]
+ string = string[:-6]
# Jump to next line
string += "\n"
# Add empty line
@@ -354,10 +359,10 @@ def _to_codon(numbers):
if not isinstance(numbers, np.ndarray):
numbers = np.array(list(numbers), dtype=int)
codons = np.zeros(numbers.shape + (3,), dtype=int)
- for n in (2,1,0):
+ for n in (2, 1, 0):
val = _radix**n
digit = numbers // val
- codons[..., -(n+1)] = digit
+ codons[..., -(n + 1)] = digit
numbers = numbers - digit * val
return codons
@@ -365,14 +370,14 @@ def _to_codon(numbers):
def load(table_name):
"""
Load a NCBI codon table.
-
+
Parameters
----------
table_name : str or int
If a string is given, it is interpreted as official NCBI
codon table name (e.g. "Vertebrate Mitochondrial").
An integer is interpreted as NCBI codon table ID.
-
+
Returns
-------
table : CodonTable
@@ -381,7 +386,7 @@ def load(table_name):
# Loads codon tables from codon_tables.txt
with open(CodonTable._table_file, "r") as f:
lines = f.read().split("\n")
-
+
# Extract data for codon table from file
table_found = False
aa = None
@@ -392,11 +397,11 @@ def load(table_name):
for line in lines:
if not line:
table_found = False
- if type(table_name) == int and line.startswith("id"):
+ if isinstance(table_name, Integral) and line.startswith("id"):
# remove identifier 'id'
if table_name == int(line[2:]):
table_found = True
- elif type(table_name) == str and line.startswith("name"):
+ elif isinstance(table_name, str) and line.startswith("name"):
# Get list of table names from lines
# (separated with ';')
# remove identifier 'name'
@@ -405,7 +410,7 @@ def load(table_name):
table_found = True
if table_found:
if line.startswith("AA"):
- #Remove identifier
+ # Remove identifier
aa = line[5:].strip()
elif line.startswith("Init"):
init = line[5:].strip()
@@ -415,19 +420,24 @@ def load(table_name):
base2 = line[5:].strip()
elif line.startswith("Base3"):
base3 = line[5:].strip()
-
+
# Create codon table from data
- if aa is not None and init is not None \
- and base1 is not None and base2 is not None and base3 is not None:
- symbol_dict = {}
- starts = []
- # aa, init and baseX all have the same length
- for i in range(len(aa)):
- codon = base1[i] + base2[i] + base3[i]
- if init[i] == "i":
- starts.append(codon)
- symbol_dict[codon] = aa[i]
- return CodonTable(symbol_dict, starts)
+ if (
+ aa is not None
+ and init is not None
+ and base1 is not None
+ and base2 is not None
+ and base3 is not None
+ ):
+ symbol_dict = {}
+ starts = []
+ # aa, init and baseX all have the same length
+ for i in range(len(aa)):
+ codon = base1[i] + base2[i] + base3[i]
+ if init[i] == "i":
+ starts.append(codon)
+ symbol_dict[codon] = aa[i]
+ return CodonTable(symbol_dict, starts)
else:
raise ValueError(f"Codon table '{table_name}' was not found")
@@ -435,7 +445,7 @@ def load(table_name):
def table_names():
"""
The possible codon table names for :func:`load()`.
-
+
Returns
-------
names : list of str
@@ -448,14 +458,14 @@ def table_names():
if line.startswith("name"):
names.extend([name.strip() for name in line[4:].split(";")])
return names
-
+
@staticmethod
def default_table():
"""
The default codon table.
The table is equal to the NCBI "Standard" codon table,
with the difference that only "ATG" is a start codon.
-
+
Returns
-------
table : CodonTable
diff --git a/src/biotite/sequence/graphics/__init__.py b/src/biotite/sequence/graphics/__init__.py
index b1dbbf051..4b0b39b9f 100644
--- a/src/biotite/sequence/graphics/__init__.py
+++ b/src/biotite/sequence/graphics/__init__.py
@@ -29,5 +29,5 @@
from .colorschemes import *
from .dendrogram import *
from .features import *
-from .plasmid import *
from .logo import *
+from .plasmid import *
diff --git a/src/biotite/sequence/graphics/alignment.py b/src/biotite/sequence/graphics/alignment.py
index 45b44e326..f3bdb6380 100644
--- a/src/biotite/sequence/graphics/alignment.py
+++ b/src/biotite/sequence/graphics/alignment.py
@@ -4,15 +4,22 @@
__name__ = "biotite.sequence.graphics"
__author__ = "Patrick Kunzmann"
-__all__ = ["SymbolPlotter", "LetterPlotter", "LetterSimilarityPlotter",
- "LetterTypePlotter","ArrayPlotter",
- "plot_alignment", "plot_alignment_similarity_based",
- "plot_alignment_type_based","plot_alignment_array"]
+__all__ = [
+ "SymbolPlotter",
+ "LetterPlotter",
+ "LetterSimilarityPlotter",
+ "LetterTypePlotter",
+ "ArrayPlotter",
+ "plot_alignment",
+ "plot_alignment_similarity_based",
+ "plot_alignment_type_based",
+ "plot_alignment_array",
+]
import abc
import numpy as np
-from ...visualize import colors
-from .colorschemes import get_color_scheme
+from biotite.sequence.graphics.colorschemes import get_color_scheme
+from biotite.visualize import colors
class SymbolPlotter(metaclass=abc.ABCMeta):
@@ -81,8 +88,7 @@ class LetterPlotter(SymbolPlotter, metaclass=abc.ABCMeta):
:class:`matplotlib.Text` instance of each symbol.
"""
- def __init__(self, axes, color_symbols=False,
- font_size=None, font_param=None):
+ def __init__(self, axes, color_symbols=False, font_size=None, font_param=None):
super().__init__(axes)
self._color_symbols = color_symbols
self._font_size = font_size
@@ -101,9 +107,15 @@ def plot_symbol(self, bbox, alignment, column_i, seq_i):
box = Rectangle(bbox.p0, bbox.width, bbox.height)
self.axes.add_patch(box)
text = self.axes.text(
- bbox.x0 + bbox.width/2, bbox.y0 + bbox.height/2,
- symbol, color="black", ha="center", va="center",
- size=self._font_size, **self._font_param)
+ bbox.x0 + bbox.width / 2,
+ bbox.y0 + bbox.height / 2,
+ symbol,
+ color="black",
+ ha="center",
+ va="center",
+ size=self._font_size,
+ **self._font_param,
+ )
text.set_clip_on(True)
if self._color_symbols:
@@ -196,17 +208,16 @@ class LetterSimilarityPlotter(LetterPlotter):
because *a* does also occur in *b*\ :sub:`i`.
"""
- def __init__(self, axes, matrix=None, color_symbols=False,
- font_size=None, font_param=None):
-
+ def __init__(
+ self, axes, matrix=None, color_symbols=False, font_size=None, font_param=None
+ ):
super().__init__(axes, color_symbols, font_size, font_param)
if matrix is not None:
self._matrix = matrix.score_matrix()
else:
self._matrix = None
# Default colormap
- self._cmap = self._generate_colormap(colors["dimgreen"],
- self._color_symbols)
+ self._cmap = self._generate_colormap(colors["dimgreen"], self._color_symbols)
def set_color(self, color=None, cmap=None):
"""
@@ -257,8 +268,7 @@ def get_color(self, alignment, column_i, seq_i):
similarities[i] = 0
else:
code2 = alignment.sequences[i].code[index2]
- similarities[i] = self._get_similarity(self._matrix,
- code1, code2)
+ similarities[i] = self._get_similarity(self._matrix, code1, code2)
# Delete self-similarity
similarities = np.delete(similarities, seq_i)
similarity = np.average(similarities)
@@ -283,14 +293,18 @@ def _generate_colormap(color, to_black):
if to_black:
# From color to black
cmap_val = np.stack(
- [np.interp(np.linspace(0, 1, 100), [0, 1], [color[i], 0])
- for i in range(len(color))]
+ [
+ np.interp(np.linspace(0, 1, 100), [0, 1], [color[i], 0])
+ for i in range(len(color))
+ ]
).transpose()
else:
# From white to color
cmap_val = np.stack(
- [np.interp(np.linspace(0, 1, 100), [0, 1], [1, color[i]])
- for i in range(len(color))]
+ [
+ np.interp(np.linspace(0, 1, 100), [0, 1], [1, color[i]])
+ for i in range(len(color))
+ ]
).transpose()
return ListedColormap(cmap_val)
@@ -325,8 +339,15 @@ class LetterTypePlotter(LetterPlotter):
:class:`matplotlib.Text` instance of each symbol.
"""
- def __init__(self, axes, alphabet, color_scheme=None, color_symbols=False,
- font_size=None, font_param=None):
+ def __init__(
+ self,
+ axes,
+ alphabet,
+ color_scheme=None,
+ color_symbols=False,
+ font_size=None,
+ font_param=None,
+ ):
super().__init__(axes, color_symbols, font_size, font_param)
if color_scheme is None:
@@ -346,7 +367,7 @@ def get_color(self, alignment, column_i, seq_i):
class ArrayPlotter(LetterPlotter):
- '''
+ """
This :class:`SymbolPlotter` quantitatively decorates sequences alignments, with molecular
recognition data obtained from e.g. microarrays. Symbols are visualized as characters
on a colored background box. The color of a given box represents the recognition
@@ -371,15 +392,14 @@ class ArrayPlotter(LetterPlotter):
Additional parameters that is given to the
:class:`matplotlib.Text` instance of each symbol.
- '''
- def __init__(self, axes, fl_score, color_symbols=False,
- font_size=None, font_param=None):
+ """
+ def __init__(
+ self, axes, fl_score, color_symbols=False, font_size=None, font_param=None
+ ):
super().__init__(axes, color_symbols, font_size, font_param)
self.fl_score = fl_score
- self._cmap = self._generate_colormap(colors["dimorange"],
- self._color_symbols)
-
+ self._cmap = self._generate_colormap(colors["dimorange"], self._color_symbols)
def get_color(self, alignment, column_i, seq_i):
index1 = alignment.trace[column_i, seq_i]
@@ -389,7 +409,6 @@ def get_color(self, alignment, column_i, seq_i):
spot_signal = self._get_signal(self.fl_score, column_i, seq_i)
return self._cmap(spot_signal)
-
def _get_signal(self, fl_score, column_i, seq_i):
if fl_score is None:
signal = 0.0
@@ -400,7 +419,6 @@ def _get_signal(self, fl_score, column_i, seq_i):
def get_cmap(self):
return self._cmap
-
def plot_symbol(self, bbox, alignment, column_i, seq_i):
from matplotlib.patches import Rectangle
@@ -422,9 +440,15 @@ def plot_symbol(self, bbox, alignment, column_i, seq_i):
box = Rectangle(bbox.p0, bbox.width, bbox.height)
self.axes.add_patch(box)
text = self.axes.text(
- bbox.x0 + bbox.width/2, bbox.y0 + bbox.height/2,
- symbol, color="black", ha="center", va="center",
- size=self._font_size, **self._font_param)
+ bbox.x0 + bbox.width / 2,
+ bbox.y0 + bbox.height / 2,
+ symbol,
+ color="black",
+ ha="center",
+ va="center",
+ size=self._font_size,
+ **self._font_param,
+ )
text.set_clip_on(True)
if self._color_symbols:
@@ -455,11 +479,20 @@ def _generate_colormap(color, to_black):
return ListedColormap(cmap_val)
-def plot_alignment(axes, alignment, symbol_plotter, symbols_per_line=50,
- show_numbers=False, number_size=None, number_functions=None,
- labels=None, label_size=None,
- show_line_position=False,
- spacing=1, symbol_spacing=None):
+def plot_alignment(
+ axes,
+ alignment,
+ symbol_plotter,
+ symbols_per_line=50,
+ show_numbers=False,
+ number_size=None,
+ number_functions=None,
+ labels=None,
+ label_size=None,
+ show_line_position=False,
+ spacing=1,
+ symbol_spacing=None,
+):
"""
Plot a pairwise or multiple sequence alignment.
@@ -545,7 +578,7 @@ def plot_alignment(axes, alignment, symbol_plotter, symbols_per_line=50,
)
for i, func in enumerate(number_functions):
if func is None:
- number_functions[i] = (lambda x: x + 1)
+ number_functions[i] = lambda x: x + 1
seq_num = alignment.trace.shape[1]
seq_len = alignment.trace.shape[0]
@@ -573,7 +606,7 @@ def plot_alignment(axes, alignment, symbol_plotter, symbols_per_line=50,
for i in range(seq_len):
y = y_start
for j in range(seq_num):
- bbox = Bbox([[x, y], [x+1, y+1]])
+ bbox = Bbox([[x, y], [x + 1, y + 1]])
symbol_plotter.plot_symbol(bbox, alignment, i, j)
y += 1
line_pos += 1
@@ -583,8 +616,7 @@ def plot_alignment(axes, alignment, symbol_plotter, symbols_per_line=50,
y_start += seq_num + spacing
else:
x += 1
- if (symbol_spacing
- and (i + 1) % symbol_spacing == 0):
+ if symbol_spacing and (i + 1) % symbol_spacing == 0:
line_pos += 1
x += 1
@@ -613,14 +645,12 @@ def plot_alignment(axes, alignment, symbol_plotter, symbols_per_line=50,
y = 0.5
for i in range(line_count):
for j in range(seq_num):
- if i == line_count-1:
+ if i == line_count - 1:
# Last line -> get number of last column in trace
trace_pos = len(alignment.trace) - 1
else:
- trace_pos = (i+1) * symbols_per_line - 1
- seq_index = _get_last_valid_index(
- alignment, trace_pos, j
- )
+ trace_pos = (i + 1) * symbols_per_line - 1
+ seq_index = _get_last_valid_index(alignment, trace_pos, j)
# if -1 -> terminal gap
# -> skip number for this sequence in this line
if seq_index != -1:
@@ -636,18 +666,14 @@ def plot_alignment(axes, alignment, symbol_plotter, symbols_per_line=50,
axes.set_xlim(0, symbols_to_print)
# Y-axis starts from top
- lim = seq_num*line_count + spacing*(line_count-1)
+ lim = seq_num * line_count + spacing * (line_count - 1)
axes.set_ylim(lim, 0)
number_axes.set_ylim(lim, 0)
axes.set_frame_on(False)
number_axes.set_frame_on(False)
# Remove ticks and set label and number size
- axes.yaxis.set_tick_params(
- left=False, right=False, labelsize=label_size
- )
- number_axes.yaxis.set_tick_params(
- left=False, right=False, labelsize=number_size
- )
+ axes.yaxis.set_tick_params(left=False, right=False, labelsize=label_size)
+ number_axes.yaxis.set_tick_params(left=False, right=False, labelsize=number_size)
if show_line_position:
axes.xaxis.set_tick_params(
@@ -659,15 +685,25 @@ def plot_alignment(axes, alignment, symbol_plotter, symbols_per_line=50,
)
-def plot_alignment_similarity_based(axes, alignment, symbols_per_line=50,
- show_numbers=False, number_size=None,
- number_functions=None,
- labels=None, label_size=None,
- show_line_position=False,
- spacing=1,
- color=None, cmap=None, matrix=None,
- color_symbols=False, symbol_spacing=None,
- symbol_size=None, symbol_param=None):
+def plot_alignment_similarity_based(
+ axes,
+ alignment,
+ symbols_per_line=50,
+ show_numbers=False,
+ number_size=None,
+ number_functions=None,
+ labels=None,
+ label_size=None,
+ show_line_position=False,
+ spacing=1,
+ color=None,
+ cmap=None,
+ matrix=None,
+ color_symbols=False,
+ symbol_spacing=None,
+ symbol_size=None,
+ symbol_param=None,
+):
r"""
Plot a pairwise or multiple sequence alignment highlighting
the similarity per alignment column.
@@ -788,31 +824,47 @@ def plot_alignment_similarity_based(axes, alignment, symbols_per_line=50,
because *a* does also occur in *b*\ :sub:`i`.
"""
symbol_plotter = LetterSimilarityPlotter(
- axes, matrix=matrix, font_size=symbol_size, font_param=symbol_param,
- color_symbols=color_symbols
+ axes,
+ matrix=matrix,
+ font_size=symbol_size,
+ font_param=symbol_param,
+ color_symbols=color_symbols,
)
if color is not None or cmap is not None:
symbol_plotter.set_color(color=color, cmap=cmap)
plot_alignment(
- axes=axes, alignment=alignment, symbol_plotter=symbol_plotter,
+ axes=axes,
+ alignment=alignment,
+ symbol_plotter=symbol_plotter,
symbols_per_line=symbols_per_line,
- show_numbers=show_numbers, number_size=number_size,
+ show_numbers=show_numbers,
+ number_size=number_size,
number_functions=number_functions,
- labels=labels, label_size=label_size,
+ labels=labels,
+ label_size=label_size,
show_line_position=show_line_position,
- spacing=spacing, symbol_spacing=symbol_spacing
+ spacing=spacing,
+ symbol_spacing=symbol_spacing,
)
-def plot_alignment_type_based(axes, alignment, symbols_per_line=50,
- show_numbers=False, number_size=None,
- number_functions=None,
- labels=None, label_size=None,
- show_line_position=False,
- spacing=1,
- color_scheme=None, color_symbols=False,
- symbol_size=None, symbol_param=None,
- symbol_spacing=None):
+def plot_alignment_type_based(
+ axes,
+ alignment,
+ symbols_per_line=50,
+ show_numbers=False,
+ number_size=None,
+ number_functions=None,
+ labels=None,
+ label_size=None,
+ show_line_position=False,
+ spacing=1,
+ color_scheme=None,
+ color_symbols=False,
+ symbol_size=None,
+ symbol_param=None,
+ symbol_spacing=None,
+):
"""
Plot a pairwise or multiple sequence alignment coloring each symbol
based on the symbol type.
@@ -897,27 +949,48 @@ def plot_alignment_type_based(axes, alignment, symbols_per_line=50,
"""
alphabet = alignment.sequences[0].get_alphabet()
symbol_plotter = LetterTypePlotter(
- axes, alphabet, font_size=symbol_size, font_param=symbol_param,
- color_symbols=color_symbols, color_scheme=color_scheme
+ axes,
+ alphabet,
+ font_size=symbol_size,
+ font_param=symbol_param,
+ color_symbols=color_symbols,
+ color_scheme=color_scheme,
)
plot_alignment(
- axes=axes, alignment=alignment, symbol_plotter=symbol_plotter,
+ axes=axes,
+ alignment=alignment,
+ symbol_plotter=symbol_plotter,
symbols_per_line=symbols_per_line,
- show_numbers=show_numbers, number_size=number_size,
+ show_numbers=show_numbers,
+ number_size=number_size,
number_functions=number_functions,
- labels=labels, label_size=label_size,
+ labels=labels,
+ label_size=label_size,
show_line_position=show_line_position,
- spacing=spacing, symbol_spacing=symbol_spacing
+ spacing=spacing,
+ symbol_spacing=symbol_spacing,
)
-def plot_alignment_array(axes, alignment, fl_score, symbols_per_line=50,
- show_numbers=False, number_size=None,
- number_functions=None, labels=None, label_size=None,
- show_line_position=False, spacing=1, color=None,
- cmap=None, symbol_spacing=None,
- symbol_size=None, symbol_param=None):
- '''
+def plot_alignment_array(
+ axes,
+ alignment,
+ fl_score,
+ symbols_per_line=50,
+ show_numbers=False,
+ number_size=None,
+ number_functions=None,
+ labels=None,
+ label_size=None,
+ show_line_position=False,
+ spacing=1,
+ color=None,
+ cmap=None,
+ symbol_spacing=None,
+ symbol_size=None,
+ symbol_param=None,
+):
+ """
Plot a pairwise sequence alignment using an :class:`ArrayPlotter`
instance.
@@ -995,19 +1068,27 @@ def plot_alignment_array(axes, alignment, fl_score, symbols_per_line=50,
A '*' represents a sequence match on the alignment
A '-' represents a sequence gap on the alignment
- '''
+ """
symbol_plotter = ArrayPlotter(
- axes, fl_score = fl_score, font_size = symbol_size, font_param = symbol_param,
+ axes,
+ fl_score=fl_score,
+ font_size=symbol_size,
+ font_param=symbol_param,
)
plot_alignment(
- axes=axes, alignment=alignment, symbol_plotter=symbol_plotter,
+ axes=axes,
+ alignment=alignment,
+ symbol_plotter=symbol_plotter,
symbols_per_line=symbols_per_line,
- show_numbers=show_numbers, number_size=number_size,
+ show_numbers=show_numbers,
+ number_size=number_size,
number_functions=number_functions,
- labels=labels, label_size=label_size,
+ labels=labels,
+ label_size=label_size,
show_line_position=show_line_position,
- spacing=spacing, symbol_spacing=symbol_spacing
+ spacing=spacing,
+ symbol_spacing=symbol_spacing,
)
diff --git a/src/biotite/sequence/graphics/colorschemes.py b/src/biotite/sequence/graphics/colorschemes.py
index 049cddbb4..d38879c91 100644
--- a/src/biotite/sequence/graphics/colorschemes.py
+++ b/src/biotite/sequence/graphics/colorschemes.py
@@ -6,12 +6,11 @@
__author__ = "Patrick Kunzmann"
__all__ = ["get_color_scheme", "list_color_scheme_names", "load_color_scheme"]
-import numpy as np
-import json
-from os.path import join, dirname, realpath
import glob
+import json
import os
-from ..alphabet import Alphabet
+from os.path import dirname, join, realpath
+from biotite.sequence.alphabet import Alphabet
def load_color_scheme(file_name):
@@ -26,13 +25,13 @@ def load_color_scheme(file_name):
----------
file_name : str
The file name of the JSON file containing the scheme.
-
+
Returns
-------
scheme : dict
A dictionary representing the color scheme, It contains the
following keys, if the input file is proper:
-
+
- **name** - Name of the scheme.
- **alphabet** - :class:`Alphabet` instance describing the
type of sequence the scheme can be used for.
@@ -71,7 +70,7 @@ def get_color_scheme(name, alphabet, default="#FFFFFF"):
default : str or tuple, optional
A *Matplotlib* compatible color that is used for symbols that
have no defined color in the scheme.
-
+
Returns
-------
colors : list
@@ -99,11 +98,10 @@ def get_color_scheme(name, alphabet, default="#FFFFFF"):
if scheme["name"] == name and scheme["alphabet"].extends(alphabet):
colors = scheme["colors"]
# Replace None values with default color
- colors = [color if color is not None else default
- for color in colors]
+ colors = [color if color is not None else default for color in colors]
# Only return colors that are in scope of this alphabet
# and not the extended alphabet
- return colors[:len(alphabet)]
+ return colors[: len(alphabet)]
raise ValueError(f"Unkown scheme '{name}' for given alphabet")
@@ -117,7 +115,7 @@ def list_color_scheme_names(alphabet):
The alphbet to get the color scheme names for.
The alphabet of the scheme must equal or extend this parameter,
to be included in the list.
-
+
Returns
-------
schemes : list of str
@@ -136,4 +134,4 @@ def list_color_scheme_names(alphabet):
for file_name in glob.glob(_scheme_dir + os.sep + "*.json"):
scheme = load_color_scheme(file_name)
- _color_schemes.append(scheme)
\ No newline at end of file
+ _color_schemes.append(scheme)
diff --git a/src/biotite/sequence/graphics/dendrogram.py b/src/biotite/sequence/graphics/dendrogram.py
index f351c891f..254702443 100644
--- a/src/biotite/sequence/graphics/dendrogram.py
+++ b/src/biotite/sequence/graphics/dendrogram.py
@@ -8,9 +8,18 @@
import numpy as np
-def plot_dendrogram(axes, tree, orientation="left", use_distances=True,
- labels=None, label_size=None, color="black",
- show_distance=True, **kwargs):
+
+def plot_dendrogram(
+ axes,
+ tree,
+ orientation="left",
+ use_distances=True,
+ labels=None,
+ label_size=None,
+ color="black",
+ show_distance=True,
+ **kwargs,
+):
"""
Plot a dendrogram from a (phylogenetic) tree.
@@ -24,7 +33,7 @@ def plot_dendrogram(axes, tree, orientation="left", use_distances=True,
If true, the `distance` attribute of the :class:`TreeNode`
objects are used as distance measure.
Otherwise the topological distance is used.
- labels : list of str, optional
+ labels : list of str, optional
The leaf node labels.
The label of a leaf node is the entry at the position of its
`index` attribute.
@@ -40,9 +49,9 @@ def plot_dendrogram(axes, tree, orientation="left", use_distances=True,
Additional parameters that are used to draw the dendrogram
lines.
"""
-
+
indices = tree.root.get_indices()
- leaf_dict = {indices[i] : i for i in indices}
+ leaf_dict = {indices[i]: i for i in indices}
# Required for setting the plot limits
max_distance = 0
@@ -50,12 +59,12 @@ def plot_dendrogram(axes, tree, orientation="left", use_distances=True,
def _plot_node(node, distance):
"""
Draw the lines from the given node to its children.
-
+
Parameters
----------
dist : float
the distance of the node from root
-
+
Returns
-------
pos : float
@@ -88,31 +97,43 @@ def _plot_node(node, distance):
if orientation in ["left", "right"]:
# Line connecting the childs
axes.plot(
- [distance, distance], [child_pos[0], child_pos[-1]],
- color=color, marker="None", **kwargs
+ [distance, distance],
+ [child_pos[0], child_pos[-1]],
+ color=color,
+ marker="None",
+ **kwargs,
)
# Lines depicting the distances of the childs
for child_dist, pos in zip(child_distances, child_pos):
axes.plot(
- [distance, child_dist], [pos, pos],
- color=color, marker="None", **kwargs
+ [distance, child_dist],
+ [pos, pos],
+ color=color,
+ marker="None",
+ **kwargs,
)
elif orientation in ["bottom", "top"]:
# Line connecting the childs
axes.plot(
- [child_pos[0], child_pos[-1]], [distance, distance],
- color=color, marker="None", **kwargs
+ [child_pos[0], child_pos[-1]],
+ [distance, distance],
+ color=color,
+ marker="None",
+ **kwargs,
)
# Lines depicting the distances of the childs
for child_dist, pos in zip(child_distances, child_pos):
axes.plot(
- [pos, pos], [distance, child_dist],
- color=color, marker="None", **kwargs
+ [pos, pos],
+ [distance, child_dist],
+ color=color,
+ marker="None",
+ **kwargs,
)
else:
raise ValueError(f"'{orientation}' is not a valid orientation")
return center_pos
-
+
_plot_node(tree.root, 0)
if labels is not None:
@@ -133,12 +154,18 @@ def _plot_node(node, distance):
axes.set_yticks(np.arange(0, len(indices)))
axes.set_yticklabels(labels)
axes.yaxis.set_tick_params(
- left=False, right=False, labelleft=False, labelright=True,
- labelsize=label_size
+ left=False,
+ right=False,
+ labelleft=False,
+ labelright=True,
+ labelsize=label_size,
)
axes.xaxis.set_tick_params(
- bottom=True, top=False, labelbottom=show_distance, labeltop=False,
- labelsize=label_size
+ bottom=True,
+ top=False,
+ labelbottom=show_distance,
+ labeltop=False,
+ labelsize=label_size,
)
elif orientation == "right":
axes.set_xlim(max_distance, zero_limit)
@@ -146,12 +173,18 @@ def _plot_node(node, distance):
axes.set_yticks(np.arange(0, len(indices)))
axes.set_yticklabels(labels)
axes.yaxis.set_tick_params(
- left=False, right=False, labelleft=True, labelright=False,
- labelsize=label_size
+ left=False,
+ right=False,
+ labelleft=True,
+ labelright=False,
+ labelsize=label_size,
)
axes.xaxis.set_tick_params(
- bottom=True, top=False, labelbottom=show_distance, labeltop=False,
- labelsize=label_size
+ bottom=True,
+ top=False,
+ labelbottom=show_distance,
+ labeltop=False,
+ labelsize=label_size,
)
elif orientation == "bottom":
axes.set_ylim(zero_limit, max_distance)
@@ -159,12 +192,18 @@ def _plot_node(node, distance):
axes.set_xticks(np.arange(0, len(indices)))
axes.set_xticklabels(labels)
axes.xaxis.set_tick_params(
- bottom=False, top=False, labelbottom=False, labeltop=True,
- labelsize=label_size
+ bottom=False,
+ top=False,
+ labelbottom=False,
+ labeltop=True,
+ labelsize=label_size,
)
axes.yaxis.set_tick_params(
- left=True, right=False, labelleft=show_distance, labelright=False,
- labelsize=label_size
+ left=True,
+ right=False,
+ labelleft=show_distance,
+ labelright=False,
+ labelsize=label_size,
)
elif orientation == "top":
axes.set_ylim(max_distance, zero_limit)
@@ -172,13 +211,19 @@ def _plot_node(node, distance):
axes.set_xticks(np.arange(0, len(indices)))
axes.set_xticklabels(labels)
axes.xaxis.set_tick_params(
- bottom=False, top=False, labelbottom=True, labeltop=False,
- labelsize=label_size
+ bottom=False,
+ top=False,
+ labelbottom=True,
+ labeltop=False,
+ labelsize=label_size,
)
axes.yaxis.set_tick_params(
- left=True, right=False, labelleft=show_distance, labelright=False,
- labelsize=label_size
+ left=True,
+ right=False,
+ labelleft=show_distance,
+ labelright=False,
+ labelsize=label_size,
)
else:
raise ValueError(f"'{orientation}' is not a valid orientation")
- axes.set_frame_on(False)
\ No newline at end of file
+ axes.set_frame_on(False)
diff --git a/src/biotite/sequence/graphics/features.py b/src/biotite/sequence/graphics/features.py
index e3c6711ee..6fe25fa41 100644
--- a/src/biotite/sequence/graphics/features.py
+++ b/src/biotite/sequence/graphics/features.py
@@ -4,22 +4,35 @@
__name__ = "biotite.sequence.graphics"
__author__ = "Patrick Kunzmann"
-__all__ = ["plot_feature_map", "FeaturePlotter", "MiscFeaturePlotter",
- "CodingPlotter", "PromoterPlotter", "TerminatorPlotter",
- "RBSPlotter"]
+__all__ = [
+ "plot_feature_map",
+ "FeaturePlotter",
+ "MiscFeaturePlotter",
+ "CodingPlotter",
+ "PromoterPlotter",
+ "TerminatorPlotter",
+ "RBSPlotter",
+]
-import copy
import abc
-import numpy as np
-from ...visualize import colors, AdaptiveFancyArrow
-from ..annotation import Annotation, Feature, Location
-
-
-def plot_feature_map(axes, annotation, loc_range=None,
- multi_line=True, symbols_per_line=1000,
- show_numbers=False, number_size=None, line_width=0.05,
- show_line_position=False, spacing=0.25,
- feature_plotters=None, style_param=None):
+from biotite.sequence.annotation import Location
+from biotite.visualize import AdaptiveFancyArrow, colors
+
+
+def plot_feature_map(
+ axes,
+ annotation,
+ loc_range=None,
+ multi_line=True,
+ symbols_per_line=1000,
+ show_numbers=False,
+ number_size=None,
+ line_width=0.05,
+ show_line_position=False,
+ spacing=0.25,
+ feature_plotters=None,
+ style_param=None,
+):
"""
Plot a sequence annotation, by showing the range of each feature
on one or multiple position depicting line(s).
@@ -87,8 +100,8 @@ def plot_feature_map(axes, annotation, loc_range=None,
features.
When two features overlap, their drawing area does also overlap.
"""
- from matplotlib.transforms import Bbox
from matplotlib.patches import Rectangle
+ from matplotlib.transforms import Bbox
if loc_range is None:
loc_range = annotation.get_location_range()
@@ -98,13 +111,13 @@ def plot_feature_map(axes, annotation, loc_range=None,
else:
# Line length covers the entire location range
symbols_per_line = loc_range_length
-
+
plotters = [
PromoterPlotter(),
TerminatorPlotter(),
RBSPlotter(),
CodingPlotter(),
- MiscFeaturePlotter()
+ MiscFeaturePlotter(),
]
if feature_plotters is not None:
plotters = list(feature_plotters) + plotters
@@ -116,7 +129,6 @@ def plot_feature_map(axes, annotation, loc_range=None,
if loc_range_length % symbols_per_line != 0:
line_count += 1
-
### Draw lines ###
remaining_symbols = loc_range_length
y = 0.5
@@ -127,14 +139,19 @@ def plot_feature_map(axes, annotation, loc_range=None,
else:
# Last line -> Line spans to end of annotation
line_length = remaining_symbols
- axes.add_patch(Rectangle(
- (0, y-line_width/2), line_length, line_width,
- color="gray", linewidth=0
- ))
+ axes.add_patch(
+ Rectangle(
+ (0, y - line_width / 2),
+ line_length,
+ line_width,
+ color="gray",
+ linewidth=0,
+ )
+ )
# Increment by spacing and width (=1) of feature
y += spacing + 1
remaining_symbols -= symbols_per_line
-
+
### Draw features ###
line_start_loc = loc_range[0]
y = 0
@@ -160,15 +177,12 @@ def plot_feature_map(axes, annotation, loc_range=None,
width = loc_len
height = 1
bbox = Bbox.from_bounds(x, y, width, height)
- plotter.draw(
- axes, feature, bbox, loc,
- style_param=style_param
- )
+ plotter.draw(axes, feature, bbox, loc, style_param=style_param)
# Increment by spacing and width (=1) of feature
y += spacing + 1
remaining_symbols += symbols_per_line
line_start_loc += symbols_per_line
-
+
### Draw position numbers ###
ticks = []
tick_labels = []
@@ -176,11 +190,11 @@ def plot_feature_map(axes, annotation, loc_range=None,
# Numbers at center height of each feature line -> 0.5
y = 0.5
for i in range(line_count):
- if i == line_count-1:
+ if i == line_count - 1:
# Last line -> get number of last column in trace
- loc = loc_range[1] -1
+ loc = loc_range[1] - 1
else:
- loc = loc_range[0] + ((i+1) * symbols_per_line) -1
+ loc = loc_range[0] + ((i + 1) * symbols_per_line) - 1
ticks.append(y)
tick_labels.append(str(loc))
# Increment by spacing and width of feature (1)
@@ -188,20 +202,17 @@ def plot_feature_map(axes, annotation, loc_range=None,
axes.set_yticks(ticks)
axes.set_yticklabels(tick_labels)
-
axes.set_xlim(0, symbols_per_line)
# Y-axis starts from top
- axes.set_ylim(1*line_count + spacing*(line_count-1), 0)
+ axes.set_ylim(1 * line_count + spacing * (line_count - 1), 0)
axes.set_frame_on(False)
# Draw location numbers on right side
axes.get_yaxis().set_tick_params(
left=False, right=False, labelleft=False, labelright=True
)
# Remove ticks and set number font size
- axes.yaxis.set_tick_params(
- left=False, right=False, labelsize=number_size
- )
-
+ axes.yaxis.set_tick_params(left=False, right=False, labelsize=number_size)
+
if show_line_position:
axes.xaxis.set_tick_params(
top=False, bottom=True, labeltop=False, labelbottom=True
@@ -236,7 +247,7 @@ def matches(self, feature):
----------
feature : Feature
The sequence feature to be checked.
-
+
Returns
-------
compatibility : bool
@@ -244,7 +255,7 @@ def matches(self, feature):
false otherwise.
"""
pass
-
+
@abc.abstractmethod
def draw(self, axes, feature, bbox, location, style_param):
"""
@@ -284,7 +295,7 @@ class CodingPlotter(FeaturePlotter):
The width of the arrow head
as fraction of the feature drawing area height.
"""
-
+
def __init__(self, tail_width=0.5, head_width=0.8):
self._tail_width = tail_width
self._head_width = head_width
@@ -294,9 +305,9 @@ def matches(self, feature):
return True
else:
return False
-
+
def draw(self, axes, feature, bbox, loc, style_param):
- y = bbox.y0 + bbox.height/2
+ y = bbox.y0 + bbox.height / 2
dy = 0
if loc.strand == Location.Strand.FORWARD:
x = bbox.x0
@@ -304,25 +315,35 @@ def draw(self, axes, feature, bbox, loc, style_param):
else:
x = bbox.x1
dx = -bbox.width
-
- if (
- loc.strand == Location.Strand.FORWARD
- and loc.defect & Location.Defect.MISS_RIGHT
- ) or (
- loc.strand == Location.Strand.REVERSE
- and loc.defect & Location.Defect.MISS_LEFT
- ):
- # If the feature extends into the prevoius or next line
- # do not draw an arrow head
- draw_head = False
+
+ if (
+ loc.strand == Location.Strand.FORWARD
+ and loc.defect & Location.Defect.MISS_RIGHT
+ ) or (
+ loc.strand == Location.Strand.REVERSE
+ and loc.defect & Location.Defect.MISS_LEFT
+ ):
+ # If the feature extends into the previous or next line
+ # do not draw an arrow head
+ draw_head = False
else:
- draw_head = True
-
+ draw_head = True
+
# Create head with 90 degrees tip -> head width/length ratio = 1/2
- axes.add_patch(AdaptiveFancyArrow(
- x, y, dx, dy, self._tail_width, self._head_width, head_ratio=0.5,
- draw_head=draw_head, color=colors["dimgreen"], linewidth=0
- ))
+ axes.add_patch(
+ AdaptiveFancyArrow(
+ x,
+ y,
+ dx,
+ dy,
+ self._tail_width,
+ self._head_width,
+ head_ratio=0.5,
+ draw_head=draw_head,
+ color=colors["dimgreen"],
+ linewidth=0,
+ )
+ )
if feature.key == "CDS":
if "product" not in feature.qual:
@@ -332,17 +353,23 @@ def draw(self, axes, feature, bbox, loc, style_param):
else:
label = feature.qual["product"]
elif feature.key == "gene":
- if "gene" not in feature.qual:
+ if "gene" not in feature.qual:
label = None
else:
label = feature.qual["gene"]
-
+
if label is not None:
- center_x = bbox.x0 + bbox.width/2
- center_y = bbox.y0 + bbox.height/2
+ center_x = bbox.x0 + bbox.width / 2
+ center_y = bbox.y0 + bbox.height / 2
axes.text(
- center_x, center_y, label, color="black",
- ha="center", va="center", size=11)
+ center_x,
+ center_y,
+ label,
+ color="black",
+ ha="center",
+ va="center",
+ size=11,
+ )
class MiscFeaturePlotter(FeaturePlotter):
@@ -363,17 +390,20 @@ def __init__(self, height=0.4):
def matches(self, feature):
return True
-
+
def draw(self, axes, feature, bbox, loc, style_param):
from matplotlib.patches import Rectangle
rect = Rectangle(
- (bbox.x0, bbox.y0 + bbox.height/2 * (1-self._height)),
- bbox.width, bbox.height*self._height,
- color=colors["dimorange"], linewidth=0
+ (bbox.x0, bbox.y0 + bbox.height / 2 * (1 - self._height)),
+ bbox.width,
+ bbox.height * self._height,
+ color=colors["dimorange"],
+ linewidth=0,
)
axes.add_patch(rect)
+
class PromoterPlotter(FeaturePlotter):
"""
A plotter for *regulatory* features with the *promoter* or
@@ -394,8 +424,7 @@ class PromoterPlotter(FeaturePlotter):
as fraction of the halffeature drawing area height.
"""
- def __init__(self, line_width=2, head_width=2,
- head_length=6, head_height=0.8):
+ def __init__(self, line_width=2, head_width=2, head_length=6, head_height=0.8):
self._line_width = line_width
self._head_width = head_width
self._head_length = head_length
@@ -404,43 +433,42 @@ def __init__(self, line_width=2, head_width=2,
def matches(self, feature):
if feature.key == "regulatory":
if "regulatory_class" in feature.qual:
- if feature.qual["regulatory_class"] in ["promoter","TATA_box"]:
+ if feature.qual["regulatory_class"] in ["promoter", "TATA_box"]:
return True
return False
-
+
def draw(self, axes, feature, bbox, loc, style_param):
- from matplotlib.patches import FancyArrowPatch, ArrowStyle
+ from matplotlib.patches import ArrowStyle, FancyArrowPatch
from matplotlib.path import Path
- x_center = bbox.x0 + bbox.width/2
- y_center = bbox.y0 + bbox.height/2
+ x_center = bbox.x0 + bbox.width / 2
+ y_center = bbox.y0 + bbox.height / 2
path = Path(
vertices=[
(bbox.x0, y_center),
- (bbox.x0, y_center - bbox.height/2 * self._head_height),
- (bbox.x1, y_center - bbox.height/2 * self._head_height),
+ (bbox.x0, y_center - bbox.height / 2 * self._head_height),
+ (bbox.x1, y_center - bbox.height / 2 * self._head_height),
],
- codes=[
- Path.MOVETO,
- Path.CURVE3,
- Path.CURVE3
- ]
+ codes=[Path.MOVETO, Path.CURVE3, Path.CURVE3],
)
style = ArrowStyle.CurveFilledB(
head_width=self._head_width, head_length=self._head_length
)
arrow = FancyArrowPatch(
- path=path, arrowstyle=style, linewidth=self._line_width,
- color="black"
+ path=path, arrowstyle=style, linewidth=self._line_width, color="black"
)
axes.add_patch(arrow)
-
+
if "note" in feature.qual:
axes.text(
- x_center, y_center + bbox.height/4, feature.qual["note"],
- color="black", ha="center", va="center",
- size=9
+ x_center,
+ y_center + bbox.height / 4,
+ feature.qual["note"],
+ color="black",
+ ha="center",
+ va="center",
+ size=9,
)
@@ -465,14 +493,17 @@ def matches(self, feature):
if feature.qual["regulatory_class"] == "terminator":
return True
return False
-
- def draw(self, axes, feature, bbox, loc, style_param):
- x = bbox.x0 + bbox.width/2
+ def draw(self, axes, feature, bbox, loc, style_param):
+ x = bbox.x0 + bbox.width / 2
axes.plot(
- (x, x), (bbox.y0, bbox.y1), color="black",
- linestyle="-", linewidth=self._bar_width, marker="None"
+ (x, x),
+ (bbox.y0, bbox.y1),
+ color="black",
+ linestyle="-",
+ linewidth=self._bar_width,
+ marker="None",
)
@@ -499,12 +530,15 @@ def matches(self, feature):
if feature.qual["regulatory_class"] == "ribosome_binding_site":
return True
return False
-
+
def draw(self, axes, feature, bbox, loc, style_param):
from matplotlib.patches import Ellipse
ellipse = Ellipse(
- (bbox.x0 + bbox.width/2, bbox.y0 + bbox.height/2),
- bbox.width, self._height*bbox.height,
- color=colors["dimorange"], linewidth=0)
- axes.add_patch(ellipse)
\ No newline at end of file
+ (bbox.x0 + bbox.width / 2, bbox.y0 + bbox.height / 2),
+ bbox.width,
+ self._height * bbox.height,
+ color=colors["dimorange"],
+ linewidth=0,
+ )
+ axes.add_patch(ellipse)
diff --git a/src/biotite/sequence/graphics/logo.py b/src/biotite/sequence/graphics/logo.py
index 7de7d0c39..3fc32a052 100644
--- a/src/biotite/sequence/graphics/logo.py
+++ b/src/biotite/sequence/graphics/logo.py
@@ -7,12 +7,9 @@
__all__ = ["plot_sequence_logo"]
import numpy as np
-from ...visualize import set_font_size_in_coord
-from ..alphabet import LetterAlphabet
-from .colorschemes import get_color_scheme
-import warnings
-from ..align import Alignment
-from .. import SequenceProfile
+from biotite.sequence.alphabet import LetterAlphabet
+from biotite.sequence.graphics.colorschemes import get_color_scheme
+from biotite.visualize import set_font_size_in_coord
def plot_sequence_logo(axes, profile, scheme=None, **kwargs):
@@ -61,10 +58,10 @@ def plot_sequence_logo(axes, profile, scheme=None, **kwargs):
# 'color' and 'size' property is not passed on to text
kwargs.pop("color", None)
- kwargs.pop("size", None)
+ kwargs.pop("size", None)
frequencies, entropies, max_entropy = _get_entropy(profile)
- stack_heights = (max_entropy - entropies)
+ stack_heights = max_entropy - entropies
symbols_heights = stack_heights[:, np.newaxis] * frequencies
index_order = np.argsort(symbols_heights, axis=1)
for i in range(symbols_heights.shape[0]):
@@ -73,21 +70,25 @@ def plot_sequence_logo(axes, profile, scheme=None, **kwargs):
start_height = 0
for j in index_order[i]:
# Stack the symbols at position on top of the preceeding one
- height = symbols_heights[i,j]
+ height = symbols_heights[i, j]
if height > 0:
symbol = alphabet.decode(j)
text = axes.text(
- i+0.5, start_height, symbol,
- ha="left", va="bottom", color=colors[j],
+ i + 0.5,
+ start_height,
+ symbol,
+ ha="left",
+ va="bottom",
+ color=colors[j],
# Best results are obtained with this font size
size=1,
- **kwargs
+ **kwargs,
)
text.set_clip_on(True)
set_font_size_in_coord(text, width=1, height=height)
start_height += height
- axes.set_xlim(0.5, len(profile.symbols)+0.5)
+ axes.set_xlim(0.5, len(profile.symbols) + 0.5)
axes.set_ylim(0, max_entropy)
@@ -97,8 +98,7 @@ def _get_entropy(profile):
# 0 * log2(0) = 0 -> Convert NaN to 0
no_zeros = freq != 0
pre_entropies = np.zeros(freq.shape)
- pre_entropies[no_zeros] \
- = freq[no_zeros] * np.log2(freq[no_zeros])
+ pre_entropies[no_zeros] = freq[no_zeros] * np.log2(freq[no_zeros])
entropies = -np.sum(pre_entropies, axis=1)
max_entropy = np.log2(len(profile.alphabet))
- return freq, entropies, max_entropy
\ No newline at end of file
+ return freq, entropies, max_entropy
diff --git a/src/biotite/sequence/graphics/plasmid.py b/src/biotite/sequence/graphics/plasmid.py
index 8527dc8d7..08972fce9 100644
--- a/src/biotite/sequence/graphics/plasmid.py
+++ b/src/biotite/sequence/graphics/plasmid.py
@@ -6,20 +6,29 @@
__author__ = "Patrick Kunzmann"
__all__ = ["plot_plasmid_map"]
-import copy
+import re
import warnings
-import abc
import numpy as np
-import re
-from ...visualize import colors
-from ..annotation import Annotation, Feature, Location
-
-
-def plot_plasmid_map(axes, annotation, plasmid_size, tick_length=0.02,
- tick_step=200, ring_width=0.01, feature_width=0.06,
- spacing=0.01, arrow_head_length=0.04, label=None,
- face_properties=None, label_properties=None,
- omit_oversized_labels=True, feature_formatter=None):
+from biotite.sequence.annotation import Feature, Location
+from biotite.visualize import colors
+
+
+def plot_plasmid_map(
+ axes,
+ annotation,
+ plasmid_size,
+ tick_length=0.02,
+ tick_step=200,
+ ring_width=0.01,
+ feature_width=0.06,
+ spacing=0.01,
+ arrow_head_length=0.04,
+ label=None,
+ face_properties=None,
+ label_properties=None,
+ omit_oversized_labels=True,
+ feature_formatter=None,
+):
"""
Plot a plasmid map using the sequence features in the given
:class:`Annotation`.
@@ -84,26 +93,26 @@ def plot_plasmid_map(axes, annotation, plasmid_size, tick_length=0.02,
the following tuple:
- *directional* : bool
-
+
True, if the direction of the feature should be indicated by
an arrow.
Otherwise, the feature is plotted is arc.
-
+
- *face_color* : tuple or str, optional
-
+
A *Matplotlib* compatible color for the feature arrow/arc.
-
+
- *label_color* : tuple or str, optional
-
+
A *Matplotlib* compatible color for the feature label.
-
+
- *label* : str or None
-
+
The label to be displayed for this feature.
None, if no label should be displayed.
"""
from matplotlib.projections.polar import PolarAxes
-
+
if not isinstance(axes, PolarAxes):
raise TypeError("The given axes must be a 'PolarAxes'")
@@ -118,16 +127,13 @@ def plot_plasmid_map(axes, annotation, plasmid_size, tick_length=0.02,
if feature_formatter is None:
feature_formatter = _default_feature_formatter
-
### Setup matplotlib ###
# The x-coordinate is given as angle (rad)
# Full circle -> 2*pi
- axes.set_xlim(0, 2*np.pi)
+ axes.set_xlim(0, 2 * np.pi)
axes.set_ylim(0, 1)
axes.yaxis.set_visible(False)
- axes.xaxis.set_tick_params(
- bottom=False, labelbottom=True
- )
+ axes.xaxis.set_tick_params(bottom=False, labelbottom=True)
axes.set_theta_zero_location("N")
axes.set_theta_direction("clockwise")
axes.spines["polar"].set_visible(False)
@@ -142,32 +148,39 @@ def plot_plasmid_map(axes, annotation, plasmid_size, tick_length=0.02,
axes.xaxis.set_ticks([_loc_to_rad(tick, plasmid_size) for tick in ticks])
axes.xaxis.set_ticklabels(tick_labels)
### Draw plasmid ring with ticks and central label ###
-
+
# Plasmid ring
# Use 'barh()' instead of a Rectangle patch to ensure that the axes
# is properly initialized
# Otherwise the feature rectangles are not curved, but straight
axes.barh(
- 1-ring_width-tick_length, 2*np.pi, ring_width,
- align="edge", color="black"
+ 1 - ring_width - tick_length, 2 * np.pi, ring_width, align="edge", color="black"
)
-
+
# Ticks (ticks itself, not the tick labels)
for tick in ticks:
angle = _loc_to_rad(tick, plasmid_size)
axes.plot(
- (angle, angle), (1-tick_length, 1),
- color="black", linewidth=1, linestyle="-"
+ (angle, angle),
+ (1 - tick_length, 1),
+ color="black",
+ linewidth=1,
+ linestyle="-",
)
-
+
# Central plasmid label
if label is not None:
axes.text(
- 0, 0, label, ha="center", va="center",
- color="black", size=32, fontweight="bold"
+ 0,
+ 0,
+ label,
+ ha="center",
+ va="center",
+ color="black",
+ size=32,
+ fontweight="bold",
)
-
### Draw plasmid interior ###
inner_radius = 1 - ring_width - tick_length
features = sorted(
@@ -177,28 +190,51 @@ def plot_plasmid_map(axes, annotation, plasmid_size, tick_length=0.02,
],
# Features are sorted by the length of their location range
# The shortest come first
- key = lambda feature: np.diff(feature.get_location_range())[0],
- reverse = True
+ key=lambda feature: np.diff(feature.get_location_range())[0],
+ reverse=True,
+ )
+ axes.add_artist(
+ PlasmidMap(
+ axes,
+ 0,
+ features,
+ plasmid_size,
+ inner_radius,
+ feature_width,
+ spacing,
+ arrow_head_length,
+ label,
+ face_properties,
+ label_properties,
+ omit_oversized_labels,
+ feature_formatter,
+ )
)
- axes.add_artist(PlasmidMap(
- axes, 0, features, plasmid_size, inner_radius, feature_width, spacing,
- arrow_head_length, label, face_properties, label_properties,
- omit_oversized_labels, feature_formatter
- ))
try:
# Only create these classes when matplotlib is installed
from matplotlib.artist import Artist
+ from matplotlib.patches import Polygon, Rectangle
from matplotlib.transforms import Bbox
- from matplotlib.patches import Rectangle, Polygon
-
class PlasmidMap(Artist):
- def __init__(self, axes, zorder, features, plasmid_size, radius,
- feature_width, spacing, arrow_head_length, label,
- face_properties, label_properties, omit_oversized_labels,
- feature_formatter):
+ def __init__(
+ self,
+ axes,
+ zorder,
+ features,
+ plasmid_size,
+ radius,
+ feature_width,
+ spacing,
+ arrow_head_length,
+ label,
+ face_properties,
+ label_properties,
+ omit_oversized_labels,
+ feature_formatter,
+ ):
super().__init__()
self._axes = axes
self.zorder = zorder
@@ -212,30 +248,36 @@ def __init__(self, axes, zorder, features, plasmid_size, radius,
for feature in features:
indicators_for_feature = []
for loc in feature.locs:
- # Set proper positions in 'draw()' method
+ # Set proper positions in 'draw()' method
bbox = Bbox.from_extents(0, 0, 0, 0)
# Draw features as curved arrows (feature indicator)
- indicator = axes.add_artist(Feature_Indicator(
- axes, self.zorder + 1, feature, loc, bbox,
- arrow_head_length, face_properties, label_properties,
- omit_oversized_labels, feature_formatter
- ))
+ indicator = axes.add_artist(
+ FeatureIndicator(
+ axes,
+ self.zorder + 1,
+ feature,
+ loc,
+ bbox,
+ arrow_head_length,
+ face_properties,
+ label_properties,
+ omit_oversized_labels,
+ feature_formatter,
+ )
+ )
indicators_for_feature.append(indicator)
self._all_indicators.append(indicators_for_feature)
-
def draw(self, renderer, *args, **kwargs):
# Find the maximum amount of feature rows
# (used for overlapping features)
- row_count = int(
- self._radius // (self._feature_width + self._spacing)
- )
+ row_count = int(self._radius // (self._feature_width + self._spacing))
# Tracks the location ranges of feature that were added to
# a row in order to check if that row is occupied
ranges_in_row = [[] for i in range(row_count)]
# Stores the bottom coordinate (radius) for each row
row_bottoms = [
- self._radius - (row+1) * (self._feature_width + self._spacing)
+ self._radius - (row + 1) * (self._feature_width + self._spacing)
for row in range(row_count)
]
@@ -258,11 +300,13 @@ def draw(self, renderer, *args, **kwargs):
# 'Normal feature'
if first <= curr_last and last >= curr_first:
is_occupied = True
- else: # first < 1
+ else: # first < 1
# Location is over periodic boundary
- if first + self._plasmid_size <= curr_last \
- or last >= curr_first:
- is_occupied = True
+ if (
+ first + self._plasmid_size <= curr_last
+ or last >= curr_first
+ ):
+ is_occupied = True
if not is_occupied:
# Row is not occupied by another feature
# in the location range of the new feature
@@ -273,12 +317,10 @@ def draw(self, renderer, *args, **kwargs):
else:
# Location is over periodic boundary
# Split into 'end' and 'start' part
- ranges_in_row[row_i].append((
- first + self._plasmid_size, self._plasmid_size
- ))
- ranges_in_row[row_i].append((
- 1, last
- ))
+ ranges_in_row[row_i].append(
+ (first + self._plasmid_size, self._plasmid_size)
+ )
+ ranges_in_row[row_i].append((1, last))
row_bottom = row_bottoms[row_i]
break
if row_bottom is None:
@@ -288,24 +330,30 @@ def draw(self, renderer, *args, **kwargs):
"radius or decrease the feature width or spacing"
)
else:
- for loc, indicator in zip(
- feature.locs, indicators_for_feature
- ):
+ for loc, indicator in zip(feature.locs, indicators_for_feature):
# Calculate arrow shape parameters
- row_center = row_bottom + self._feature_width/2
row_top = row_bottom + self._feature_width
start_ang = _loc_to_rad(loc.first, self._plasmid_size)
- stop_ang = _loc_to_rad(loc.last, self._plasmid_size)
+ stop_ang = _loc_to_rad(loc.last, self._plasmid_size)
bbox = Bbox.from_extents(
start_ang, row_bottom, stop_ang, row_top
)
indicator.set_bbox(bbox)
-
- class Feature_Indicator(Artist):
- def __init__(self, axes, zorder, feature, loc, bbox, head_length,
- arrow_properties, label_properties, omit_oversized_labels,
- feature_formatter):
+ class FeatureIndicator(Artist):
+ def __init__(
+ self,
+ axes,
+ zorder,
+ feature,
+ loc,
+ bbox,
+ head_length,
+ arrow_properties,
+ label_properties,
+ omit_oversized_labels,
+ feature_formatter,
+ ):
super().__init__()
self._axes = axes
self.zorder = zorder
@@ -313,44 +361,59 @@ def __init__(self, axes, zorder, feature, loc, bbox, head_length,
self._bbox = bbox
self._head_length = head_length
self._omit_oversized_labels = omit_oversized_labels
-
+
# Determine how to draw the feature
- directional, face_color, label_color, label \
- = feature_formatter(feature)
-
+ directional, face_color, label_color, label = feature_formatter(feature)
+
# Draw arrow as composition of a rectangle and a triangle,
# as FancyArrow does not properly work for polar plots
- self._arrow_tail = axes.add_patch(Rectangle(
- # Set positions in 'draw()' method
- (0, 0), 0, 0,
- # Line width is set to 1 to avoid strange artifact in
- # the transition from rectangle (tail) to polygon (head)
- color=face_color, linewidth=1, zorder = self.zorder + 1,
- **arrow_properties
- ))
-
+ self._arrow_tail = axes.add_patch(
+ Rectangle(
+ # Set positions in 'draw()' method
+ (0, 0),
+ 0,
+ 0,
+ # Line width is set to 1 to avoid strange artifact in
+ # the transition from rectangle (tail) to polygon (head)
+ color=face_color,
+ linewidth=1,
+ zorder=self.zorder + 1,
+ **arrow_properties,
+ )
+ )
+
if directional:
# Only draw any arrow head when feature has a direction,
# otherwise simply draw the tail (rectangle)
- self._arrow_head = axes.add_patch(Polygon(
- # Set positions in 'draw()' method
- [(0, 0), (0, 0), (0, 0)],
- color=face_color, linewidth=1, zorder = self.zorder + 1,
- **arrow_properties
- ))
+ self._arrow_head = axes.add_patch(
+ Polygon(
+ # Set positions in 'draw()' method
+ [(0, 0), (0, 0), (0, 0)],
+ color=face_color,
+ linewidth=1,
+ zorder=self.zorder + 1,
+ **arrow_properties,
+ )
+ )
else:
self._arrow_head = None
if label is not None:
label_properties["color"] = label_color
- self._label = axes.add_artist(CurvedText(
- # Set positions in 'draw()' method
- axes, self.zorder + 1, 0, 0, label, label_properties
- ))
+ self._label = axes.add_artist(
+ CurvedText(
+ # Set positions in 'draw()' method
+ axes,
+ self.zorder + 1,
+ 0,
+ 0,
+ label,
+ label_properties,
+ )
+ )
else:
self._label = None
-
def set_bbox(self, bbox):
self._bbox = bbox
@@ -359,17 +422,15 @@ def set_bbox(self, bbox):
if self._label is not None:
self._label.set_position(center_x, center_y)
-
def draw(self, renderer, *args, **kwargs):
bbox = self._bbox
- center_x = (bbox.x0 + bbox.x1) / 2
center_y = (bbox.y0 + bbox.y1) / 2
# Constant absolute width for all arrows
# irrespective of the radius in the polar plot
# Calculate actual angle from given absolute width
head_length = self._head_length / center_y
-
+
# Check if the head should be drawn
if self._arrow_head is None:
head_length = 0
@@ -382,39 +443,38 @@ def draw(self, renderer, *args, **kwargs):
rect_pos = (bbox.x0, bbox.y0)
# (x0, y0), (x1, y1), (x2, y2)
triangle_coord = [
- (bbox.x1 - head_length, bbox.y0), # base 1
- (bbox.x1 - head_length, bbox.y1), # base 2
- (bbox.x1, center_y) # tip
+ (bbox.x1 - head_length, bbox.y0), # base 1
+ (bbox.x1 - head_length, bbox.y1), # base 2
+ (bbox.x1, center_y), # tip
]
else:
- rect_pos = (bbox.x0+head_length, bbox.y0)
+ rect_pos = (bbox.x0 + head_length, bbox.y0)
triangle_coord = [
- (bbox.x0 + head_length, bbox.y0), # base 1
- (bbox.x0 + head_length, bbox.y1), # base 2
- (bbox.x0, center_y) # tip
+ (bbox.x0 + head_length, bbox.y0), # base 1
+ (bbox.x0 + head_length, bbox.y1), # base 2
+ (bbox.x0, center_y), # tip
]
-
+
# Update coordinates of sub-artists
self._arrow_tail.set_xy(rect_pos)
- self._arrow_tail.set_width(bbox.width-head_length)
+ self._arrow_tail.set_width(bbox.width - head_length)
self._arrow_tail.set_height(bbox.height)
if self._arrow_head is not None:
self._arrow_head.set_xy(triangle_coord)
-
+
if self._label is not None:
# Do not draw the labels if it is larger than the
# indicator
- if self._omit_oversized_labels \
- and self._label.get_total_angle(renderer) > bbox.width:
- self._label.set_visible(False)
+ if (
+ self._omit_oversized_labels
+ and self._label.get_total_angle(renderer) > bbox.width
+ ):
+ self._label.set_visible(False)
else:
self._label.set_visible(True)
-
-
class CurvedText(Artist):
- def __init__(self, axes, zorder, angle, radius, string,
- text_properties):
+ def __init__(self, axes, zorder, angle, radius, string, text_properties):
super().__init__()
self._axes = axes
self.zorder = zorder
@@ -425,44 +485,35 @@ def __init__(self, axes, zorder, angle, radius, string,
for word in _split_into_words(string):
text = axes.text(
# Set position in 'draw()' method
- 0, 0,
+ 0,
+ 0,
word,
- ha="center", va="center",
+ ha="center",
+ va="center",
zorder=self.zorder + 1,
**text_properties,
)
self._texts.append(text)
-
def set_visible(self, visible):
super().set_visible(visible)
for text in self._texts:
text.set_visible(visible)
-
def set_position(self, angle, radius):
self._angle = angle
self._radius = radius
-
def get_total_angle(self, renderer):
return np.sum(self.get_word_angles(renderer))
-
def get_word_angles(self, renderer):
ax_px_radius = self._axes.get_window_extent(renderer).width / 2
ax_unit_radius = self._axes.get_ylim()[1]
- circle_px_circumference = ax_px_radius * 2*np.pi \
- * (self._radius / ax_unit_radius)
+ circle_px_circumference = (
+ ax_px_radius * 2 * np.pi * (self._radius / ax_unit_radius)
+ )
- rad_angle = 360 - np.rad2deg(self._angle)
- # Avoid to draw the text upside down, when drawn on the
- # bottom half of the map
- if rad_angle > 90 and rad_angle < 270:
- turn_around = True
- else:
- turn_around = False
-
angles = []
for text in self._texts:
orig_rot = text.get_rotation()
@@ -477,14 +528,12 @@ def get_word_angles(self, renderer):
# In this case, assign a fixed width
if np.isnan(word_px_width):
word_px_width = 5.0
- word_angle \
- = 2*np.pi * word_px_width / circle_px_circumference
+ word_angle = 2 * np.pi * word_px_width / circle_px_circumference
angles.append(word_angle)
# Restore
text.set_rotation(orig_rot)
text.set_visible(orig_visible)
return angles
-
def draw(self, renderer, *args, **kwargs):
angles = self.get_word_angles(renderer)
@@ -497,7 +546,7 @@ def draw(self, renderer, *args, **kwargs):
turn_around = True
else:
turn_around = False
-
+
# Now that the angle for each word is known,
# the appropriate position and rotation can be set
if turn_around:
@@ -526,20 +575,18 @@ def draw(self, renderer, *args, **kwargs):
pass
-
-
def _loc_to_rad(loc, plasmid_size):
if loc > plasmid_size:
raise ValueError(
f"Location {loc} is larger then the plasmid size of {plasmid_size}"
)
# Location starts at 1 -> (loc-1)
- return ((loc-1) / plasmid_size) * 2*np.pi
+ return ((loc - 1) / plasmid_size) * 2 * np.pi
def _rad_to_loc(rad, plasmid_size):
# Location starts at 1 -> + 1
- return rad / (2*np.pi) * plasmid_size + 1
+ return rad / (2 * np.pi) * plasmid_size + 1
def _merge_over_periodic_boundary(feature, plasmid_size):
@@ -547,7 +594,7 @@ def _merge_over_periodic_boundary(feature, plasmid_size):
# Only one location -> no merge possible
return feature
first_loc = None
- last_loc = None
+ last_loc = None
# Find total first location of the feature
for loc in feature.locs:
if first_loc is None or loc.first < first_loc.first:
@@ -558,38 +605,43 @@ def _merge_over_periodic_boundary(feature, plasmid_size):
last_loc = loc
# If the first and last location meet at the periodic boundary of
# the plasmid -> merge them
- if first_loc.first == 1 and last_loc.last == plasmid_size \
- and first_loc.strand == last_loc.strand:
- new_locs = set(feature.locs)
- new_locs.remove(first_loc)
- new_locs.remove(last_loc)
- new_locs.add(Location(
+ if (
+ first_loc.first == 1
+ and last_loc.last == plasmid_size
+ and first_loc.strand == last_loc.strand
+ ):
+ new_locs = set(feature.locs)
+ new_locs.remove(first_loc)
+ new_locs.remove(last_loc)
+ new_locs.add(
+ Location(
# the fist base is now at negative location
# by shifting by one plasmid 'period'
- first = last_loc.first - plasmid_size,
- last = first_loc.last,
- strand = first_loc.strand,
- defect = first_loc.defect | last_loc.defect
- ))
- return Feature(feature.key, new_locs, feature.qual)
+ first=last_loc.first - plasmid_size,
+ last=first_loc.last,
+ strand=first_loc.strand,
+ defect=first_loc.defect | last_loc.defect,
+ )
+ )
+ return Feature(feature.key, new_locs, feature.qual)
else:
return feature
# ' ', '-' and '_' are word delimiters
separators = re.compile(r"\s|_|-")
+
+
def _split_into_words(string):
- match_indices = sorted(
- [match.start() for match in separators.finditer(string)]
- )
+ match_indices = sorted([match.start() for match in separators.finditer(string)])
current_index = 0
words = []
for i in match_indices:
# Add word up to delimiter
- words.append(string[current_index : i])
+ words.append(string[current_index:i])
# Add delimiter
- words.append(string[i : i+1])
- current_index = i+1
+ words.append(string[i : i + 1])
+ current_index = i + 1
# If there is a word after the last delimiter, add it too
if current_index < len(string):
words.append(string[current_index:])
@@ -618,44 +670,43 @@ def _default_feature_formatter(f):
else:
label = None
return False, "black", "white", label
-
+
# Origin of Replication
elif f.key == "rep_origin":
- return False, "indigo", "white", \
- f.qual.get("standard_name", "ori")
-
+ return False, "indigo", "white", f.qual.get("standard_name", "ori")
+
# Coding sequences
elif f.key in ["gene", "CDS", "rRNA"]:
label = f.qual.get("product")
if label is None:
label = f.qual.get("gene")
return True, colors["orange"], "black", label
-
+
elif f.key == "regulatory":
# Promoters
if f.qual.get("regulatory_class") in [
"promoter",
"TATA_box",
"minus_35_signal",
- "minus_10_signal"
+ "minus_10_signal",
]:
return True, colors["dimgreen"], "black", f.qual.get("note")
-
+
# Terminators
elif f.qual.get("regulatory_class") in "terminator":
return False, "firebrick", "white", f.qual.get("note")
-
+
# RBS
elif f.qual.get("regulatory_class") == "ribosome_binding_site":
return False, colors["brightorange"], "white", None
-
+
# Primers
elif f.key == "primer_bind":
return True, "royalblue", "black", f.qual.get("note")
-
+
# Binding proteins
elif f.key == "protein_bind":
return False, colors["lightgreen"], "black", f.qual.get("note")
-
+
# Misc
- return True, "dimgray", "white", f.qual.get("note")
\ No newline at end of file
+ return True, "dimgray", "white", f.qual.get("note")
diff --git a/src/biotite/sequence/io/fasta/__init__.py b/src/biotite/sequence/io/fasta/__init__.py
index 5aa14febe..8fad54b21 100644
--- a/src/biotite/sequence/io/fasta/__init__.py
+++ b/src/biotite/sequence/io/fasta/__init__.py
@@ -18,5 +18,5 @@
__name__ = "biotite.sequence.io.fasta"
__author__ = "Patrick Kunzmann"
+from .convert import *
from .file import *
-from .convert import *
\ No newline at end of file
diff --git a/src/biotite/sequence/io/fasta/convert.py b/src/biotite/sequence/io/fasta/convert.py
index 0e8ca854a..0a73240dd 100644
--- a/src/biotite/sequence/io/fasta/convert.py
+++ b/src/biotite/sequence/io/fasta/convert.py
@@ -7,13 +7,18 @@
import warnings
from collections import OrderedDict
-from ...sequence import Sequence
-from ...alphabet import AlphabetError, LetterAlphabet
-from ...seqtypes import NucleotideSequence, ProteinSequence
-from ...align.alignment import Alignment
+from biotite.sequence.align.alignment import Alignment
+from biotite.sequence.alphabet import AlphabetError, LetterAlphabet
+from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence
-__all__ = ["get_sequence", "get_sequences", "set_sequence", "set_sequences",
- "get_alignment", "set_alignment"]
+__all__ = [
+ "get_sequence",
+ "get_sequences",
+ "set_sequence",
+ "set_sequences",
+ "get_alignment",
+ "set_alignment",
+]
def get_sequence(fasta_file, header=None, seq_type=None):
@@ -180,8 +185,10 @@ def get_alignment(fasta_file, additional_gap_chars=("_",), seq_type=None):
for i, seq_str in enumerate(seq_strings):
seq_strings[i] = seq_str.replace(char, "-")
# Remove gaps for creation of sequences
- sequences = [_convert_to_sequence(seq_str.replace("-",""), seq_type)
- for seq_str in seq_strings]
+ sequences = [
+ _convert_to_sequence(seq_str.replace("-", ""), seq_type)
+ for seq_str in seq_strings
+ ]
trace = Alignment.trace_from_strings(seq_strings)
return Alignment(sequences, trace, score=None)
@@ -212,44 +219,29 @@ def set_alignment(fasta_file, alignment, seq_names):
def _convert_to_sequence(seq_str, seq_type=None):
-
- # Define preprocessing of preimplemented sequence types
-
- # Replace selenocysteine with cysteine
- # and pyrrolysine with lysine
- process_protein_sequence = (
- lambda x : x.upper().replace("U", "C").replace("O", "K")
- )
- # For nucleotides uracil is represented by thymine and there is only
- # one letter for completely unknown nucleotides
- process_nucleotide_sequence = (
- lambda x : x.upper().replace("U","T").replace("X","N")
- )
-
# Set manually selected sequence type
-
if seq_type is not None:
# Do preprocessing as done without manual selection
if seq_type == NucleotideSequence:
- seq_str = process_nucleotide_sequence(seq_str)
+ seq_str = _process_nucleotide_sequence(seq_str)
elif seq_type == ProteinSequence:
if "U" in seq_str:
warnings.warn(
"ProteinSequence objects do not support selenocysteine "
"(U), occurrences were substituted by cysteine (C)"
)
- seq_str = process_protein_sequence(seq_str)
+ seq_str = _process_protein_sequence(seq_str)
# Return the converted sequence
return seq_type(seq_str)
# Attempt to automatically determine sequence type
try:
- return NucleotideSequence(process_nucleotide_sequence(seq_str))
+ return NucleotideSequence(_process_nucleotide_sequence(seq_str))
except AlphabetError:
pass
try:
- prot_seq = ProteinSequence(process_protein_sequence(seq_str))
+ prot_seq = ProteinSequence(_process_protein_sequence(seq_str))
# Raise Warning after conversion into 'ProteinSequence'
# to wait for potential 'AlphabetError'
if "U" in seq_str:
@@ -259,15 +251,34 @@ def _convert_to_sequence(seq_str, seq_type=None):
)
return prot_seq
except AlphabetError:
- raise ValueError("FASTA data cannot be converted either to "
- "'NucleotideSequence' nor to 'ProteinSequence'")
+ raise ValueError(
+ "FASTA data cannot be converted either to "
+ "'NucleotideSequence' nor to 'ProteinSequence'"
+ )
+
+
+def _process_protein_sequence(x):
+ """
+ Replace selenocysteine with cysteine and pyrrolysine with lysine.
+ """
+ return x.upper().replace("U", "C").replace("O", "K")
+
+
+def _process_nucleotide_sequence(x):
+ """
+ For nucleotides uracil is represented by thymine and there is only
+ one letter for completely unknown nucleotides
+ """
+ return x.upper().replace("U", "T").replace("X", "N")
def _convert_to_string(sequence, as_rna):
if not isinstance(sequence.get_alphabet(), LetterAlphabet):
- raise ValueError("Only sequences using single letter alphabets "
- "can be stored in a FASTA file")
+ raise ValueError(
+ "Only sequences using single letter alphabets "
+ "can be stored in a FASTA file"
+ )
if isinstance(sequence, NucleotideSequence) and as_rna:
- return(str(sequence).replace("T", "U"))
+ return str(sequence).replace("T", "U")
else:
- return(str(sequence))
+ return str(sequence)
diff --git a/src/biotite/sequence/io/fasta/file.py b/src/biotite/sequence/io/fasta/file.py
index 89eab5398..e0fe20ad7 100644
--- a/src/biotite/sequence/io/fasta/file.py
+++ b/src/biotite/sequence/io/fasta/file.py
@@ -6,21 +6,21 @@
__author__ = "Patrick Kunzmann"
__all__ = ["FastaFile"]
-from ....file import TextFile, InvalidFileError, wrap_string
from collections import OrderedDict
from collections.abc import MutableMapping
+from biotite.file import InvalidFileError, TextFile, wrap_string
class FastaFile(TextFile, MutableMapping):
"""
This class represents a file in FASTA format.
-
+
A FASTA file contains so called *header* lines, beginning with
``>``, that describe following sequence.
The corresponding sequence starts at the line after the header line
and ends at the next header line or at the end of file.
The header along with its sequence forms an entry.
-
+
This class is used in a dictionary like manner, implementing the
:class:`MutableMapping` interface:
Headers (without the leading ``>``) are used as keys,
@@ -35,10 +35,10 @@ class FastaFile(TextFile, MutableMapping):
after which a line break is inserted.
Only relevant, when adding sequences to a file.
Default is 80.
-
+
Examples
--------
-
+
>>> import os.path
>>> file = FastaFile()
>>> file["seq1"] = "ATACT"
@@ -61,17 +61,17 @@ class FastaFile(TextFile, MutableMapping):
{'seq2': 'AAAATT'}
>>> file.write(os.path.join(path_to_directory, "test.fasta"))
"""
-
+
def __init__(self, chars_per_line=80):
super().__init__()
self._chars_per_line = chars_per_line
self._entries = OrderedDict()
-
+
@classmethod
def read(cls, file, chars_per_line=80):
"""
Read a FASTA file.
-
+
Parameters
----------
file : file-like object or str
@@ -82,7 +82,7 @@ def read(cls, file, chars_per_line=80):
after which a line break is inserted.
Only relevant, when adding sequences to a file.
Default is 80.
-
+
Returns
-------
file_object : FastaFile
@@ -90,24 +90,23 @@ def read(cls, file, chars_per_line=80):
"""
file = super().read(file, chars_per_line)
# Filter out empty and comment lines
- file.lines = [line for line in file.lines
- if len(line.strip()) != 0 and line[0] != ";"]
+ file.lines = [
+ line for line in file.lines if len(line.strip()) != 0 and line[0] != ";"
+ ]
if len(file.lines) == 0:
raise InvalidFileError("File is empty or contains only comments")
file._find_entries()
return file
-
+
def __setitem__(self, header, seq_str):
if not isinstance(header, str):
- raise IndexError(
- "'FastaFile' only supports header strings as keys"
- )
+ raise IndexError("'FastaFile' only supports header strings as keys")
if not isinstance(seq_str, str):
- raise TypeError("'FastaFile' only supports sequence strings "
- "as values")
+ raise TypeError("'FastaFile' only supports sequence strings " "as values")
# Create lines for new header and sequence (with line breaks)
- new_lines = [">" + header.replace("\n","").strip()] + \
- wrap_string(seq_str, width=self._chars_per_line)
+ new_lines = [">" + header.replace("\n", "").strip()] + wrap_string(
+ seq_str, width=self._chars_per_line
+ )
if header in self:
# Delete lines of entry corresponding to the header,
# if existing
@@ -118,83 +117,75 @@ def __setitem__(self, header, seq_str):
# Simply append lines
# Add entry in a more efficient way than '_find_entries()'
# for this simple case
- self._entries[header] = (
- len(self.lines),
- len(self.lines) + len(new_lines)
- )
+ self._entries[header] = (len(self.lines), len(self.lines) + len(new_lines))
self.lines += new_lines
-
+
def __getitem__(self, header):
if not isinstance(header, str):
- raise IndexError(
- "'FastaFile' only supports header strings as keys"
- )
+ raise IndexError("'FastaFile' only supports header strings as keys")
start, stop = self._entries[header]
# Concatenate sequence string from following lines
- seq_string = "".join(
- [line.strip() for line in self.lines[start+1 : stop]]
- )
+ seq_string = "".join([line.strip() for line in self.lines[start + 1 : stop]])
return seq_string
-
+
def __delitem__(self, header):
start, stop = self._entries[header]
del self.lines[start:stop]
del self._entries[header]
self._find_entries()
-
+
def __len__(self):
return len(self._entries)
-
+
def __iter__(self):
return self._entries.__iter__()
-
+
def __contains__(self, identifer):
return identifer in self._entries
-
+
def _find_entries(self):
if len(self.lines) > 0 and self.lines[0][0] != ">":
raise InvalidFileError(
f"File starts with '{self.lines[0][0]}' instead of '>'"
)
-
+
header_i = []
for i, line in enumerate(self.lines):
if line[0] == ">":
header_i.append(i)
-
+
self._entries = OrderedDict()
for j in range(len(header_i)):
# Remove leading '>' from header
header = self.lines[header_i[j]].strip()[1:]
start = header_i[j]
- if j < len(header_i) -1:
+ if j < len(header_i) - 1:
# Header in mid or start of file
# -> stop is start of next header
- stop = header_i[j+1]
+ stop = header_i[j + 1]
else:
# Last header -> entry stops at end of file
stop = len(self.lines)
self._entries[header] = (start, stop)
-
@staticmethod
def read_iter(file):
"""
Create an iterator over each sequence of the given FASTA file.
-
+
Parameters
----------
file : file-like object or str
The file to be read.
Alternatively a file path can be supplied.
-
+
Yields
------
header : str
The header of the current sequence.
seq_str : str
The current sequence as string.
-
+
Notes
-----
This approach gives the same results as
@@ -221,7 +212,6 @@ def read_iter(file):
# Yield final entry
if header is not None:
yield header, "".join(seq_str_list)
-
@staticmethod
def write_iter(file, items, chars_per_line=80):
@@ -235,7 +225,7 @@ def write_iter(file, items, chars_per_line=80):
Hence, this static method may save a large amount of memory if
a large file should be written, especially if the `items`
are provided as generator.
-
+
Parameters
----------
file : file-like object or str
@@ -256,23 +246,20 @@ def write_iter(file, items, chars_per_line=80):
This method does not test, whether the given identifiers are
unambiguous.
"""
+
def line_generator():
for item in items:
header, seq_str = item
if not isinstance(header, str):
- raise IndexError(
- "'FastaFile' only supports header strings"
- )
+ raise IndexError("'FastaFile' only supports header strings")
if not isinstance(seq_str, str):
- raise TypeError(
- "'FastaFile' only supports sequence strings"
- )
-
+ raise TypeError("'FastaFile' only supports sequence strings")
+
# Yield header line
- yield ">" + header.replace("\n","").strip()
+ yield ">" + header.replace("\n", "").strip()
# Yield sequence line(s)
for line in wrap_string(seq_str, width=chars_per_line):
yield line
-
- TextFile.write_iter(file, line_generator())
\ No newline at end of file
+
+ TextFile.write_iter(file, line_generator())
diff --git a/src/biotite/sequence/io/fastq/__init__.py b/src/biotite/sequence/io/fastq/__init__.py
index d763198b1..cff2e7097 100644
--- a/src/biotite/sequence/io/fastq/__init__.py
+++ b/src/biotite/sequence/io/fastq/__init__.py
@@ -15,5 +15,5 @@
__name__ = "biotite.sequence.io.fastq"
__author__ = "Patrick Kunzmann"
+from .convert import *
from .file import *
-from .convert import *
\ No newline at end of file
diff --git a/src/biotite/sequence/io/fastq/convert.py b/src/biotite/sequence/io/fastq/convert.py
index 868536c6e..5b743fcd7 100644
--- a/src/biotite/sequence/io/fastq/convert.py
+++ b/src/biotite/sequence/io/fastq/convert.py
@@ -6,10 +6,7 @@
__author__ = "Patrick Kunzmann"
from collections import OrderedDict
-from ...sequence import Sequence
-from ...alphabet import AlphabetError, LetterAlphabet
-from ...seqtypes import NucleotideSequence
-from ...align.alignment import Alignment
+from biotite.sequence.seqtypes import NucleotideSequence
__all__ = ["get_sequence", "get_sequences", "set_sequence", "set_sequences"]
@@ -17,7 +14,7 @@
def get_sequence(fastq_file, header=None):
"""
Get a sequence and quality scores from a `FastqFile` instance.
-
+
Parameters
----------
fastq_file : FastqFile
@@ -25,7 +22,7 @@ def get_sequence(fastq_file, header=None):
header : str, optional
The identifier to get the sequence and scores from.
By default, the first sequence of the file is returned.
-
+
Returns
-------
sequence : NucleotideSequence
@@ -43,7 +40,7 @@ def get_sequence(fastq_file, header=None):
break
if seq_str is None:
raise ValueError("File does not contain any sequences")
- processed_seq_str = seq_str.replace("U","T").replace("X","N")
+ processed_seq_str = seq_str.replace("U", "T").replace("X", "N")
return NucleotideSequence(processed_seq_str), scores
@@ -51,12 +48,12 @@ def get_sequences(fastq_file):
"""
Get a dictionary from a `FastqFile` instance,
where identifiers are keys and sequence-score-tuples are values.
-
+
Parameters
----------
fastq_file : FastqFile
The `Fastqile` to be accessed.
-
+
Returns
-------
seq_dict : dict
@@ -65,7 +62,7 @@ def get_sequences(fastq_file):
"""
seq_dict = OrderedDict()
for header, (seq_str, scores) in fastq_file.items():
- processed_seq_str = seq_str.replace("U","T").replace("X","N")
+ processed_seq_str = seq_str.replace("U", "T").replace("X", "N")
seq_dict[header] = NucleotideSequence(processed_seq_str), scores
return seq_dict
@@ -73,7 +70,7 @@ def get_sequences(fastq_file):
def set_sequence(fastq_file, sequence, scores, header=None, as_rna=False):
"""
Set a sequence and a quality score array in a `FastqFile` instance.
-
+
Parameters
----------
fastq_file : FastqFile
@@ -96,7 +93,7 @@ def set_sequence(fastq_file, sequence, scores, header=None, as_rna=False):
def set_sequences(fastq_file, sequence_dict, as_rna=False):
"""
Set sequences in a `FastqFile` instance from a dictionary.
-
+
Parameters
----------
fastq_file : FastqFile
@@ -115,6 +112,6 @@ def set_sequences(fastq_file, sequence_dict, as_rna=False):
def _convert_to_string(sequence, as_rna):
if as_rna:
- return(str(sequence).replace("T", "U"))
+ return str(sequence).replace("T", "U")
else:
- return(str(sequence))
\ No newline at end of file
+ return str(sequence)
diff --git a/src/biotite/sequence/io/fastq/file.py b/src/biotite/sequence/io/fastq/file.py
index c90da37cd..c6c85c6cb 100644
--- a/src/biotite/sequence/io/fastq/file.py
+++ b/src/biotite/sequence/io/fastq/file.py
@@ -5,23 +5,21 @@
__name__ = "biotite.sequence.io.fastq"
__author__ = "Patrick Kunzmann"
-import warnings
-from numbers import Integral
from collections import OrderedDict
from collections.abc import MutableMapping
+from numbers import Integral
import numpy as np
-from ....file import TextFile, InvalidFileError, wrap_string
-from ...seqtypes import NucleotideSequence
+from biotite.file import InvalidFileError, TextFile, wrap_string
__all__ = ["FastqFile"]
_OFFSETS = {
- "Sanger" : 33,
- "Solexa" : 64,
- "Illumina-1.3" : 64,
- "Illumina-1.5" : 64,
- "Illumina-1.8" : 33,
+ "Sanger": 33,
+ "Solexa": 64,
+ "Illumina-1.3": 64,
+ "Illumina-1.5": 64,
+ "Illumina-1.8": 33,
}
@@ -151,13 +149,10 @@ def get_seq_string(self, identifier):
The sequence corresponding to the identifier.
"""
if not isinstance(identifier, str):
- raise IndexError(
- "'FastqFile' only supports identifier strings as keys"
- )
- seq_start, seq_stop, score_start, score_stop \
- = self._entries[identifier]
+ raise IndexError("'FastqFile' only supports identifier strings as keys")
+ seq_start, seq_stop, score_start, score_stop = self._entries[identifier]
# Concatenate sequence string from the sequence lines
- seq_str = "".join(self.lines[seq_start : seq_stop])
+ seq_str = "".join(self.lines[seq_start:seq_stop])
return seq_str
def get_quality(self, identifier):
@@ -175,15 +170,11 @@ def get_quality(self, identifier):
The quality scores corresponding to the identifier.
"""
if not isinstance(identifier, str):
- raise IndexError(
- "'FastqFile' only supports identifier strings as keys"
- )
- seq_start, seq_stop, score_start, score_stop \
- = self._entries[identifier]
+ raise IndexError("'FastqFile' only supports identifier strings as keys")
+ seq_start, seq_stop, score_start, score_stop = self._entries[identifier]
# Concatenate sequence string from the score lines
return _score_str_to_scores(
- "".join(self.lines[score_start : score_stop]),
- self._offset
+ "".join(self.lines[score_start:score_stop]), self._offset
)
def __setitem__(self, identifier, item):
@@ -194,9 +185,7 @@ def __setitem__(self, identifier, item):
f"but score length is {len(scores)}"
)
if not isinstance(identifier, str):
- raise IndexError(
- "'FastqFile' only supports strings as identifier"
- )
+ raise IndexError("'FastqFile' only supports strings as identifier")
# Delete lines of entry corresponding to the identifier,
# if already existing
if identifier in self:
@@ -204,14 +193,14 @@ def __setitem__(self, identifier, item):
# Create new lines
# Start with identifier line
- new_lines = ["@" + identifier.replace("\n","").strip()]
+ new_lines = ["@" + identifier.replace("\n", "").strip()]
# Append new lines with sequence string (with line breaks)
seq_start_i = len(new_lines)
if self._chars_per_line is None:
new_lines.append(str(sequence))
else:
new_lines += wrap_string(sequence, width=self._chars_per_line)
- seq_stop_i =len(new_lines)
+ seq_stop_i = len(new_lines)
# Append sequence-score separator
new_lines += ["+"]
# Append scores
@@ -237,7 +226,7 @@ def __setitem__(self, identifier, item):
len(self.lines) + seq_start_i,
len(self.lines) + seq_stop_i,
len(self.lines) + score_start_i,
- len(self.lines) + score_stop_i
+ len(self.lines) + score_stop_i,
)
self.lines += new_lines
@@ -245,9 +234,8 @@ def __getitem__(self, identifier):
return self.get_seq_string(identifier), self.get_quality(identifier)
def __delitem__(self, identifier):
- seq_start, seq_stop, score_start, score_stop \
- = self._entries[identifier]
- del self.lines[seq_start-1 : score_stop]
+ seq_start, seq_stop, score_start, score_stop = self._entries[identifier]
+ del self.lines[seq_start - 1 : score_stop]
del self._entries[identifier]
self._find_entries()
@@ -278,7 +266,7 @@ def _find_entries(self):
if not in_scores and not in_sequence and line[0] == "@":
# Identifier line
identifier = line[1:]
- seq_start_i = i+1
+ seq_start_i = i + 1
# Next line is sequence
in_sequence = True
# Reset
@@ -290,7 +278,7 @@ def _find_entries(self):
in_sequence = False
in_scores = True
seq_stop_i = i
- score_start_i = i+1
+ score_start_i = i + 1
else:
# Still in sequence
seq_len += len(line)
@@ -306,9 +294,12 @@ def _find_entries(self):
in_scores = False
# Record this entry
self._entries[identifier] = (
- seq_start_i, seq_stop_i, score_start_i, score_stop_i
+ seq_start_i,
+ seq_stop_i,
+ score_start_i,
+ score_stop_i,
)
- else: # score_len > seq_len
+ else: # score_len > seq_len
raise InvalidFileError(
f"The amount of scores is not equal to the sequence "
f"length for the sequence in line {seq_start_i+1} "
@@ -320,7 +311,6 @@ def _find_entries(self):
if in_sequence or in_scores:
raise InvalidFileError("The last entry in the file is incomplete")
-
@staticmethod
def read_iter(file, offset):
"""
@@ -398,20 +388,15 @@ def read_iter(file, offset):
# -> End of entry
in_scores = False
# yield this entry
- scores = _score_str_to_scores(
- "".join(score_str_list),
- offset
- )
+ scores = _score_str_to_scores("".join(score_str_list), offset)
yield identifier, ("".join(seq_str_list), scores)
- else: # score_len > seq_len
+ else: # score_len > seq_len
raise InvalidFileError(
- f"The amount of scores is not equal to the sequence "
- f"length"
+ "The amount of scores is not equal to the sequence " "length"
)
else:
- raise InvalidFileError(f"FASTQ file is invalid")
-
+ raise InvalidFileError("FASTQ file is invalid")
@staticmethod
def write_iter(file, items, offset, chars_per_line=None):
@@ -463,12 +448,10 @@ def line_generator():
f"but score length is {len(scores)}"
)
if not isinstance(identifier, str):
- raise IndexError(
- "'FastqFile' only supports strings as identifier"
- )
+ raise IndexError("'FastqFile' only supports strings as identifier")
# Yield identifier line
- yield "@" + identifier.replace("\n","").strip()
+ yield "@" + identifier.replace("\n", "").strip()
# Yield sequence line(s)
if chars_per_line is None:
@@ -495,15 +478,11 @@ def _score_str_to_scores(score_str, offset):
"""
Convert an ASCII string into actual score values.
"""
- scores = np.frombuffer(
- bytearray(
- score_str, encoding="ascii"
- ),
- dtype=np.int8
- )
+ scores = np.frombuffer(bytearray(score_str, encoding="ascii"), dtype=np.int8)
scores -= offset
return scores
+
def _scores_to_score_str(scores, offset):
"""
Convert score values into an ASCII string.
@@ -511,6 +490,7 @@ def _scores_to_score_str(scores, offset):
scores = np.asarray(scores) + offset
return scores.astype(np.int8, copy=False).tobytes().decode("ascii")
+
def _convert_offset(offset_val_or_string):
"""
If the given offset is a string return the corresponding numerical
@@ -519,9 +499,9 @@ def _convert_offset(offset_val_or_string):
if isinstance(offset_val_or_string, Integral):
return offset_val_or_string
elif isinstance(offset_val_or_string, str):
- return _OFFSETS[offset_val_or_string]
+ return _OFFSETS[offset_val_or_string]
else:
raise TypeError(
f"The offset must be either an integer or a string "
f"indicating the format, not {type(offset_val_or_string).__name__}"
- )
\ No newline at end of file
+ )
diff --git a/src/biotite/sequence/io/genbank/__init__.py b/src/biotite/sequence/io/genbank/__init__.py
index bccb3feab..11f745f10 100644
--- a/src/biotite/sequence/io/genbank/__init__.py
+++ b/src/biotite/sequence/io/genbank/__init__.py
@@ -11,7 +11,7 @@
__name__ = "biotite.sequence.io.genbank"
__author__ = "Patrick Kunzmann"
-from .file import *
from .annotation import *
+from .file import *
+from .metadata import *
from .sequence import *
-from .metadata import *
\ No newline at end of file
diff --git a/src/biotite/sequence/io/genbank/annotation.py b/src/biotite/sequence/io/genbank/annotation.py
index fcd5e072b..223a67ddb 100644
--- a/src/biotite/sequence/io/genbank/annotation.py
+++ b/src/biotite/sequence/io/genbank/annotation.py
@@ -12,10 +12,8 @@
import re
import warnings
-from ....file import InvalidFileError
-from ...annotation import Annotation, Feature, Location
-from .file import GenBankFile
-
+from biotite.file import InvalidFileError
+from biotite.sequence.annotation import Annotation, Feature, Location
_KEY_START = 5
_QUAL_START = 21
@@ -46,7 +44,6 @@ def get_annotation(gb_file, include_only=None):
raise InvalidFileError("File has multiple 'FEATURES' fields")
lines, _ = fields[0]
-
### Parse all lines to create an index of features,
# i.e. pairs of the feature key
# and the text belonging to the respective feature
@@ -60,13 +57,12 @@ def get_annotation(gb_file, include_only=None):
# Store old feature key and value
feature_list.append((feature_key, feature_value))
# Track new key
- feature_key = line[_KEY_START : _QUAL_START-1].strip()
+ feature_key = line[_KEY_START : _QUAL_START - 1].strip()
feature_value = ""
feature_value += line[_QUAL_START:] + " "
# Store last feature key and value (loop already exited)
feature_list.append((feature_key, feature_value))
-
### Process only relevant features and put them into an Annotation
annotation = Annotation()
# Regex to separate qualifiers from each other
@@ -92,7 +88,7 @@ def get_annotation(gb_file, include_only=None):
loc_string = qualifier_parts.pop(0).strip()
try:
locs = _parse_locs(loc_string)
- except:
+ except Exception:
warnings.warn(
f"'{loc_string}' is an unsupported location identifier, "
f"skipping feature"
@@ -114,7 +110,7 @@ def get_annotation(gb_file, include_only=None):
# -> split at whitespaces,
# as keys do not contain whitespaces
for subpart in part.split():
- if not "=" in subpart:
+ if "=" not in subpart:
# Qualifier without value, e.g. '/pseudo'
# -> store immediately
# Remove "/" -> subpart[1:]
@@ -147,11 +143,11 @@ def get_annotation(gb_file, include_only=None):
def _parse_locs(loc_str):
locs = []
if loc_str.startswith(("join", "order")):
- str_list = loc_str[loc_str.index("(")+1:loc_str.rindex(")")].split(",")
+ str_list = loc_str[loc_str.index("(") + 1 : loc_str.rindex(")")].split(",")
for s in str_list:
locs.extend(_parse_locs(s.strip()))
elif loc_str.startswith("complement"):
- compl_str = loc_str[loc_str.index("(")+1:loc_str.rindex(")")]
+ compl_str = loc_str[loc_str.index("(") + 1 : loc_str.rindex(")")]
compl_locs = [
Location(loc.first, loc.last, Location.Strand.REVERSE, loc.defect)
for loc in _parse_locs(compl_str)
@@ -214,8 +210,6 @@ def _set_qual(qual_dict, key, val):
qual_dict[key] = val
-
-
def set_annotation(gb_file, annotation):
"""
Set the *FEATURES* field of a GenBank file with an annotation.
@@ -236,12 +230,12 @@ def set_annotation(gb_file, annotation):
for key, values in feature.qual.items():
if values is None:
line = " " * _QUAL_START
- line += f'/{key}'
+ line += f"/{key}"
lines.append(line)
else:
for val in values.split("\n"):
line = " " * _QUAL_START
- line += f'/{key}="{val}"'
+ line += f'/{key}="{val}"'
lines.append(line)
gb_file.set_field("FEATURES", lines)
@@ -254,11 +248,11 @@ def _convert_to_loc_string(locs):
if len(locs) == 1:
loc = list(locs)[0]
loc_first_str = str(loc.first)
- loc_last_str = str(loc.last)
+ loc_last_str = str(loc.last)
if loc.defect & Location.Defect.BEYOND_LEFT:
loc_first_str = "<" + loc_first_str
if loc.defect & Location.Defect.BEYOND_RIGHT:
- loc_last_str = ">" + loc_last_str
+ loc_last_str = ">" + loc_last_str
if loc.first == loc.last:
loc_string = loc_first_str
elif loc.defect & Location.Defect.UNK_LOC:
@@ -270,8 +264,6 @@ def _convert_to_loc_string(locs):
if loc.strand == Location.Strand.REVERSE:
loc_string = f"complement({loc_string})"
else:
- loc_string = ",".join(
- [_convert_to_loc_string([loc]) for loc in locs]
- )
+ loc_string = ",".join([_convert_to_loc_string([loc]) for loc in locs])
loc_string = f"join({loc_string})"
return loc_string
diff --git a/src/biotite/sequence/io/genbank/file.py b/src/biotite/sequence/io/genbank/file.py
index 72a225647..0fdd99c63 100644
--- a/src/biotite/sequence/io/genbank/file.py
+++ b/src/biotite/sequence/io/genbank/file.py
@@ -6,14 +6,16 @@
__author__ = "Patrick Kunzmann"
__all__ = ["GenBankFile", "MultiFile"]
-#import textwrap
+# import textwrap
import copy
-#import re
+
+# import re
import io
-from ....file import TextFile, InvalidFileError
from collections import OrderedDict
-#from ...annotation import Location, Feature, Annotation, AnnotatedSequence
-#from ...seqtypes import NucleotideSequence, ProteinSequence
+from biotite.file import InvalidFileError, TextFile
+
+# from ...annotation import Location, Feature, Annotation, AnnotatedSequence
+# from ...seqtypes import NucleotideSequence, ProteinSequence
class GenBankFile(TextFile):
@@ -33,7 +35,7 @@ class GenBankFile(TextFile):
Some fields may occur multiple times, e.g. the *REFERENCE* field.
A sample GenBank file can be viewed at
``_.
-
+
This class provides a low-level interface for parsing, editing and
writing GenBank files.
It works like a list of field entries, where a field consists of the
@@ -47,7 +49,7 @@ class GenBankFile(TextFile):
The subfields are represented by a dictionary, with subfield names
being keys and the corresponding lines being values.
The *FEATURES* and *ORIGIN* fields have no subfields.
-
+
Every entry can be obtained, set and deleted via the index operator.
Notes
@@ -55,7 +57,7 @@ class GenBankFile(TextFile):
This class does not support location identifiers with references
to other Entrez database entries, e.g.
``join(1..100,J00194.1:100..202)``.
-
+
Examples
--------
Create a GenBank file from scratch:
@@ -79,9 +81,9 @@ class GenBankFile(TextFile):
['One line', 'A second line']
>>> print(subfields)
OrderedDict([('SUBFIELD1', ['Single Line']), ('SUBFIELD2', ['Two', 'lines'])])
-
+
Adding an additional field:
-
+
>>> file.insert(0, "OTHERFIELD", ["Another line"])
>>> print(len(file))
2
@@ -174,18 +176,18 @@ def __init__(self):
# and names of categories
self._field_pos = []
self._find_field_indices()
-
+
@classmethod
def read(cls, file):
"""
Read a GenBank file.
-
+
Parameters
----------
file : file-like object or str
The file to be read.
Alternatively a file path can be supplied.
-
+
Returns
-------
file_object : GenBankFile
@@ -194,16 +196,16 @@ def read(cls, file):
file = super().read(file)
file._find_field_indices()
return file
-
+
def get_fields(self, name):
"""
Get all *GenBank* fields associated with a given field name.
-
+
Parameters
----------
name : str
The field name.
-
+
Returns
-------
fields : list of (list of str, OrderedDict of str -> str)
@@ -218,17 +220,17 @@ def get_fields(self, name):
indices = self.get_indices(name)
# Omit the field name
return [self[i][1:] for i in indices]
-
+
def get_indices(self, name):
"""
Get the indices to all *GenBank* fields associated with a given
field name.
-
+
Parameters
----------
name : str
The field name.
-
+
Returns
-------
fields : list of int
@@ -242,7 +244,7 @@ def get_indices(self, name):
if fname == name:
indices.append(i)
return indices
-
+
def set_field(self, name, content, subfield_dict=None):
"""
Set a *GenBank* field with the given content.
@@ -250,7 +252,7 @@ def set_field(self, name, content, subfield_dict=None):
If the field already exists in the file, the field is
overwritten, otherwise a new field is created at the end of
the file.
-
+
Parameters
----------
name : str
@@ -261,7 +263,7 @@ def set_field(self, name, content, subfield_dict=None):
The subfields of the field.
The dictionary maps subfield names to the content lines of
the respective subfield.
-
+
Raises
------
InvalidFileError
@@ -283,13 +285,13 @@ def set_field(self, name, content, subfield_dict=None):
def __getitem__(self, index):
index = self._translate_idx(index)
start, stop, name = self._field_pos[index]
-
+
if name in ["FEATURES", "ORIGIN"]:
# For those two fields return the complete lines,
# beginning with the line after the field name
- content = self._get_field_content(start+1, stop, indent=0)
+ content = self._get_field_content(start + 1, stop, indent=0)
subfield_dict = OrderedDict()
-
+
else:
# For all metadata fields use the
# standard GenBank indentation (=12)
@@ -297,11 +299,11 @@ def __getitem__(self, index):
subfield_dict = OrderedDict()
subfield_start = None
first_subfield_start = None
- for i in range(start+1, stop):
+ header = None
+ for i in range(start + 1, stop):
line = self.lines[i]
- # Check if line contains a new subfield
- # (Header beginning from first column)
if len(line) != 0 and line[:12].strip() != "":
+ # New header -> new subfield
if first_subfield_start is None:
first_subfield_start = i
# Store previous subfield
@@ -320,12 +322,10 @@ def __getitem__(self, index):
# that are not part of a subfield
if first_subfield_start is not None:
stop = first_subfield_start
- content = self._get_field_content(
- start, stop, indent=12
- )
-
+ content = self._get_field_content(start, stop, indent=12)
+
return name, content, subfield_dict
-
+
def __setitem__(self, index, item):
index = self._translate_idx(index)
if not isinstance(item, tuple):
@@ -342,7 +342,7 @@ def __setitem__(self, index, item):
"Expected a tuple of name, content and optionally subfields"
)
inserted_lines = self._to_lines(name, content, subfields)
-
+
# Stop of field to be replaced is start of new field
start, old_stop, _ = self._field_pos[index]
# If not the last element is set,
@@ -355,12 +355,12 @@ def __setitem__(self, index, item):
# Shift the start/stop indices of the following fields
# by the amount of created fields
shift = len(inserted_lines) - (old_stop - start)
- for i in range(index+1, len(self._field_pos)):
+ for i in range(index + 1, len(self._field_pos)):
old_start, old_stop, fname = self._field_pos[i]
- self._field_pos[i] = old_start+shift, old_stop+shift, fname
+ self._field_pos[i] = old_start + shift, old_stop + shift, fname
# Add new entry
- self._field_pos[index] = start, start+len(inserted_lines), name.upper()
-
+ self._field_pos[index] = start, start + len(inserted_lines), name.upper()
+
def __delitem__(self, index):
index = self._translate_idx(index)
start, stop, _ = self._field_pos[index]
@@ -369,17 +369,17 @@ def __delitem__(self, index):
shift = stop - start
for i in range(index, len(self._field_pos)):
old_start, old_stop, name = self._field_pos[i]
- self._field_pos[i] = old_start-shift, old_stop-shift, name
- del self.lines[start : stop]
+ self._field_pos[i] = old_start - shift, old_stop - shift, name
+ del self.lines[start:stop]
del self._field_pos[index]
-
+
def __len__(self):
return len(self._field_pos)
def insert(self, index, name, content, subfields=None):
"""
Insert a *GenBank* field at the given position.
-
+
Parameters
----------
index : int
@@ -398,12 +398,12 @@ def insert(self, index, name, content, subfields=None):
"""
index = self._translate_idx(index, length_exclusive=False)
inserted_lines = self._to_lines(name, content, subfields)
-
+
# Stop of previous field is start of new field
if index == 0:
start = 0
else:
- _, start, _ = self._field_pos[index-1]
+ _, start, _ = self._field_pos[index - 1]
# If the new lines are not inserted at the end,
# the following lines need to be added, too
if start is not len(self.lines):
@@ -416,17 +416,16 @@ def insert(self, index, name, content, subfields=None):
shift = len(inserted_lines)
for i in range(index, len(self._field_pos)):
old_start, old_stop, fname = self._field_pos[i]
- self._field_pos[i] = old_start+shift, old_stop+shift, fname
+ self._field_pos[i] = old_start + shift, old_stop + shift, fname
# Add new entry
self._field_pos.insert(
- index,
- (start, start+len(inserted_lines), name.upper())
+ index, (start, start + len(inserted_lines), name.upper())
)
-
+
def append(self, name, content, subfields=None):
"""
Create a new *GenBank* field at the end of the file.
-
+
Parameters
----------
name : str
@@ -440,7 +439,6 @@ def append(self, name, content, subfields=None):
"""
self.insert(len(self), name, content, subfields)
-
def _find_field_indices(self):
"""
Identify the start and exclusive stop indices of lines
@@ -469,10 +467,10 @@ def _find_field_indices(self):
def _get_field_content(self, start, stop, indent):
if indent == 0:
- return self.lines[start : stop]
+ return self.lines[start:stop]
else:
- return [line[12:] for line in self.lines[start : stop]]
-
+ return [line[12:] for line in self.lines[start:stop]]
+
def _to_lines(self, name, content, subfields):
"""
Convert the field name, field content und subfield dictionary
@@ -480,22 +478,22 @@ def _to_lines(self, name, content, subfields):
"""
if subfields is None:
subfields = {}
-
+
name = name.strip().upper()
if len(name) == 0:
- raise ValueError(f"Must give a non emtpy name")
- subfields = OrderedDict({
- subfield_name.upper().strip() : subfield_lines
- for subfield_name, subfield_lines in subfields.items()
- })
-
+ raise ValueError("Must give a non emtpy name")
+ subfields = OrderedDict(
+ {
+ subfield_name.upper().strip(): subfield_lines
+ for subfield_name, subfield_lines in subfields.items()
+ }
+ )
+
# Create lines for new field
if name == "FEATURES":
# Header line plus all actual feature lines
lines = copy.copy(content)
- lines.insert(
- 0, "FEATURES" + " "*13 + "Location/Qualifiers"
- )
+ lines.insert(0, "FEATURES" + " " * 13 + "Location/Qualifiers")
elif name == "ORIGIN":
# Header line plus all actual sequence lines
lines = copy.copy(content)
@@ -504,19 +502,19 @@ def _to_lines(self, name, content, subfields):
name_column = []
content_column = []
# Create a line for the field name and empty lines
- # for each additional line required by the content
- name_column += [name] + [""] * (len(content)-1)
+ # for each additional line required by the content
+ name_column += [name] + [""] * (len(content) - 1)
content_column += content
for subfield_name, subfield_lines in subfields.items():
- name_column += [" " + subfield_name] \
- + [""] * (len(subfield_lines)-1)
+ name_column += [" " + subfield_name] + [""] * (len(subfield_lines) - 1)
content_column += subfield_lines
- lines = [f"{n_col:12}{c_col}" for n_col, c_col
- in zip(name_column, content_column)]
-
+ lines = [
+ f"{n_col:12}{c_col}"
+ for n_col, c_col in zip(name_column, content_column)
+ ]
+
return lines
-
def _translate_idx(self, index, length_exclusive=True):
"""
Check index boundaries and convert negative index to positive
@@ -539,15 +537,15 @@ class MultiFile(TextFile):
"""
This class represents a file in *GenBank* or *GenPept* format,
that contains multiple entries, for more than one UID.
-
+
The information for each UID are appended to each other in such a
file.
Objects of this class can be iterated to obtain a
:class:`GenBankFile` for each entry in the file.
-
+
Examples
--------
-
+
>>> import os.path
>>> file_name = fetch_single_file(
... ["1L2Y_A", "3O5R_A", "5UGO_A"],
@@ -568,8 +566,8 @@ def __iter__(self):
line = self.lines[i]
if line.strip() == "//":
# Create file with lines corresponding to that file
- file_content = "\n".join(self.lines[start_i : i+1])
+ file_content = "\n".join(self.lines[start_i : i + 1])
file = GenBankFile.read(io.StringIO(file_content))
# Reset file start index
start_i = i
- yield file
\ No newline at end of file
+ yield file
diff --git a/src/biotite/sequence/io/genbank/metadata.py b/src/biotite/sequence/io/genbank/metadata.py
index f4d25004f..477c0fbf2 100644
--- a/src/biotite/sequence/io/genbank/metadata.py
+++ b/src/biotite/sequence/io/genbank/metadata.py
@@ -8,17 +8,24 @@
__name__ = "biotite.sequence.io.genbank"
__author__ = "Patrick Kunzmann, Natasha Jaffe"
-__all__ = ["get_locus", "get_definition", "get_accession", "get_version",
- "get_gi", "get_db_link", "get_source",
- "set_locus"]
+__all__ = [
+ "get_locus",
+ "get_definition",
+ "get_accession",
+ "get_version",
+ "get_gi",
+ "get_db_link",
+ "get_source",
+ "set_locus",
+]
+
+from biotite.file import InvalidFileError
-from ....file import InvalidFileError
-from .file import GenBankFile
def get_locus(gb_file):
"""
Parse the *LOCUS* field of a GenBank or GenPept file.
-
+
Parameters
----------
gb_file : GenBankFile
@@ -39,10 +46,10 @@ def get_locus(gb_file):
The GenBank division to which the file belongs.
date : str, optional
The date of last modification.
-
+
Examples
--------
-
+
>>> import os.path
>>> file = GenBankFile.read(os.path.join(path_to_sequences, "ec_bl21.gb"))
>>> name, length, mol_type, is_circular, division, date = get_locus(file)
@@ -68,59 +75,57 @@ def get_locus(gb_file):
# The first field will always be the ID
name = fields[0]
- # The second field will always be the length followed
+ # The second field will always be the length followed
# by units (eg 1224 aa)
length = int(fields[1])
- # The third field *should* be the molecular type
+ # The third field *should* be the molecular type
# but sometimes this is missing. This gets tricky
# because sometimes the next field, circular/linear,
# is missing, too. The field after that, division,
# is a 3 letter all caps token. Unfortunately, mol_type
- # is also often a 3 letter all caps token (eg DNA)!
+ # is also often a 3 letter all caps token (eg DNA)!
# Fortunately, GenBank publishes the set list of divisions
# here: https://www.ncbi.nlm.nih.gov/genbank/samplerecord ,
# so we can check against that set when determining whether
# the current token represents the molecular type.
divisions = (
- 'PRI', # primate sequences
- 'ROD', # rodent sequences
- 'MAM', # other mammalian sequences
- 'VRT', # other vertebrate sequences
- 'INV', # invertebrate sequences
- 'PLN', # plant, fungal, and algal sequences
- 'BCT', # bacterial sequences
- 'VRL', # viral sequences
- 'PHG', # bacteriophage sequences
- 'SYN', # synthetic sequences
- 'UNA', # unannotated sequences
- 'EST', # EST sequences (expressed sequence tags)
- 'PAT', # patent sequences
- 'STS', # STS sequences (sequence tagged sites)
- 'GSS', # GSS sequences (genome survey sequences)
- 'HTG', # HTG sequences (high-throughput genomic sequences)
- 'HTC', # unfinished high-throughput cDNA sequencing
- 'ENV', # environmental sampling sequences
- 'CON',
+ "PRI", # primate sequences
+ "ROD", # rodent sequences
+ "MAM", # other mammalian sequences
+ "VRT", # other vertebrate sequences
+ "INV", # invertebrate sequences
+ "PLN", # plant, fungal, and algal sequences
+ "BCT", # bacterial sequences
+ "VRL", # viral sequences
+ "PHG", # bacteriophage sequences
+ "SYN", # synthetic sequences
+ "UNA", # unannotated sequences
+ "EST", # EST sequences (expressed sequence tags)
+ "PAT", # patent sequences
+ "STS", # STS sequences (sequence tagged sites)
+ "GSS", # GSS sequences (genome survey sequences)
+ "HTG", # HTG sequences (high-throughput genomic sequences)
+ "HTC", # unfinished high-throughput cDNA sequencing
+ "ENV", # environmental sampling sequences
+ "CON",
)
- # NOTE: Remember that fields[2] is the unit for length,
+ # NOTE: Remember that fields[2] is the unit for length,
# eg bp or aa, so we move to fields[3] here.
- if fields[3] not in ('linear', 'circular') \
- and fields[3] not in divisions:
+ if fields[3] not in ("linear", "circular") and fields[3] not in divisions:
mol_type = fields[3]
next_idx = 4
else:
mol_type = None
next_idx = 3
-
- # The next field should be the token 'linear' or 'circular',
+ # The next field should be the token 'linear' or 'circular',
# but sometimes this is missing
- if 'linear' == fields[next_idx]:
+ if "linear" == fields[next_idx]:
is_circular = False
next_idx += 1
- elif 'circular' == fields[next_idx]:
+ elif "circular" == fields[next_idx]:
is_circular = True
next_idx += 1
else:
@@ -136,23 +141,24 @@ def get_locus(gb_file):
return name, length, mol_type, is_circular, division, date
+
def get_definition(gb_file):
"""
Parse the *DEFINITION* field of a GenBank or GenPept file.
-
+
Parameters
----------
gb_file : GenBankFile
The GenBank file to read the *DEFINITION* field from.
-
+
Returns
-------
definition : str
Content of the *DEFINITION* field.
-
+
Examples
--------
-
+
>>> import os.path
>>> file = GenBankFile.read(os.path.join(path_to_sequences, "ec_bl21.gb"))
>>> print(get_definition(file))
@@ -161,23 +167,24 @@ def get_definition(gb_file):
lines, _ = _expect_single_field(gb_file, "DEFINITION")
return " ".join([line.strip() for line in lines])
+
def get_accession(gb_file):
"""
Parse the *ACCESSION* field of a GenBank or GenPept file.
-
+
Parameters
----------
gb_file : GenBankFile
The GenBank file to read the *ACCESSION* field from.
-
+
Returns
-------
accession : str
The accession ID of the file.
-
+
Examples
--------
-
+
>>> import os.path
>>> file = GenBankFile.read(os.path.join(path_to_sequences, "ec_bl21.gb"))
>>> print(get_accession(file))
@@ -187,16 +194,17 @@ def get_accession(gb_file):
# 'ACCESSION' field has only one line
return lines[0]
+
def get_version(gb_file):
"""
Parse the version from the *VERSION* field of a GenBank or GenPept
file.
-
+
Parameters
----------
gb_file : GenBankFile
The GenBank file to read the *VERSION* field from.
-
+
Returns
-------
version : str
@@ -206,16 +214,17 @@ def get_version(gb_file):
# 'VERSION' field has only one line
return lines[0].split()[0]
+
def get_gi(gb_file):
"""
Parse the GI from the *VERSION* field of a GenBank or GenPept
file.
-
+
Parameters
----------
gb_file : GenBankFile
The GenBank file to read the *VERSION* field from.
-
+
Returns
-------
gi : str
@@ -229,24 +238,25 @@ def get_gi(gb_file):
# Truncate GI
return int(version_info[1][3:])
+
def get_db_link(gb_file):
"""
Parse the *DBLINK* field of a GenBank or GenPept file.
-
+
Parameters
----------
gb_file : GenBankFile
The GenBank file to read the *DBLINK* field from.
-
+
Returns
-------
link_dict : dict
A dictionary storing the database links, with the database
name as key, and the corresponding ID as value.
-
+
Examples
--------
-
+
>>> import os.path
>>> file = GenBankFile.read(os.path.join(path_to_sequences, "ec_bl21.gb"))
>>> for key, val in get_db_link(file).items():
@@ -265,12 +275,12 @@ def get_db_link(gb_file):
def get_source(gb_file):
"""
Parse the *SOURCE* field of a GenBank or GenPept file.
-
+
Parameters
----------
gb_file : GenBankFile
The GenBank file to read the *SOURCE* field from.
-
+
Returns
-------
accession : str
@@ -290,12 +300,12 @@ def _expect_single_field(gb_file, name):
return fields[0]
-
-def set_locus(gb_file, name, length, mol_type=None, is_circular=False,
- division=None, date=None):
+def set_locus(
+ gb_file, name, length, mol_type=None, is_circular=False, division=None, date=None
+):
"""
Set the *LOCUS* field of a GenBank file.
-
+
Parameters
----------
gb_file : GenBankFile
@@ -319,6 +329,8 @@ def set_locus(gb_file, name, length, mol_type=None, is_circular=False,
circularity = "circular" if is_circular else "linear"
division = "" if division is None else division
date = "" if date is None else date
- line = f"{name:18} {length:>9} {restype_abbr} {mol_type:^10} " \
- f"{circularity:8} {division:3} {date:11}"
- gb_file.set_field("LOCUS", [line])
\ No newline at end of file
+ line = (
+ f"{name:18} {length:>9} {restype_abbr} {mol_type:^10} "
+ f"{circularity:8} {division:3} {date:11}"
+ )
+ gb_file.set_field("LOCUS", [line])
diff --git a/src/biotite/sequence/io/genbank/sequence.py b/src/biotite/sequence/io/genbank/sequence.py
index 26ec645bb..f5b194746 100644
--- a/src/biotite/sequence/io/genbank/sequence.py
+++ b/src/biotite/sequence/io/genbank/sequence.py
@@ -8,16 +8,19 @@
__name__ = "biotite.sequence.io.genbank"
__author__ = "Patrick Kunzmann"
-__all__ = ["get_raw_sequence", "get_sequence", "get_annotated_sequence",
- "set_sequence", "set_annotated_sequence"]
+__all__ = [
+ "get_raw_sequence",
+ "get_sequence",
+ "get_annotated_sequence",
+ "set_sequence",
+ "set_annotated_sequence",
+]
import re
-from ....file import InvalidFileError
-from ...seqtypes import ProteinSequence, NucleotideSequence
-from ...annotation import AnnotatedSequence
-from .file import GenBankFile
-from .annotation import get_annotation, set_annotation
-
+from biotite.file import InvalidFileError
+from biotite.sequence.annotation import AnnotatedSequence
+from biotite.sequence.io.genbank.annotation import get_annotation, set_annotation
+from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence
_SYMBOLS_PER_CHUNK = 10
_SEQ_CHUNKS_PER_LINE = 6
@@ -112,7 +115,7 @@ def _convert_seq_str(seq_str, format):
if len(seq_str) == 0:
raise InvalidFileError("The file's 'ORIGIN' field is empty")
if format == "gb":
- return NucleotideSequence(seq_str.replace("U","T").replace("X","N"))
+ return NucleotideSequence(seq_str.replace("U", "T").replace("X", "N"))
elif format == "gp":
return ProteinSequence(seq_str.replace("U", "C").replace("O", "K"))
else:
@@ -125,8 +128,6 @@ def _get_seq_start(origin_content):
return int(origin_content[0].split()[0])
-
-
def set_sequence(gb_file, sequence, sequence_start=1):
"""
Set the *ORIGIN* field of a GenBank file with a sequence.
@@ -167,6 +168,4 @@ def set_annotated_sequence(gb_file, annot_sequence):
The annotated sequence that is put into the GenBank file.
"""
set_annotation(gb_file, annot_sequence.annotation)
- set_sequence(
- gb_file, annot_sequence.sequence, annot_sequence.sequence_start
- )
\ No newline at end of file
+ set_sequence(gb_file, annot_sequence.sequence, annot_sequence.sequence_start)
diff --git a/src/biotite/sequence/io/general.py b/src/biotite/sequence/io/general.py
index 09b7c2722..c76e11b72 100644
--- a/src/biotite/sequence/io/general.py
+++ b/src/biotite/sequence/io/general.py
@@ -9,31 +9,27 @@
__name__ = "biotite.sequence.io"
__author__ = "Patrick Kunzmann"
-__all__ = ["load_sequence", "save_sequence",
- "load_sequences", "save_sequences"]
+__all__ = ["load_sequence", "save_sequence", "load_sequences", "save_sequences"]
-import itertools
import os.path
-import io
from collections import OrderedDict
import numpy as np
-from ..seqtypes import NucleotideSequence, ProteinSequence
-from ..alphabet import Alphabet
+from biotite.sequence.seqtypes import NucleotideSequence
def load_sequence(file_path):
"""
Load a sequence from a sequence file without the need
to manually instantiate a :class:`File` object.
-
+
Internally this function uses a :class:`File` object, based on the
file extension.
-
+
Parameters
----------
file_path : str
The path to the sequence file.
-
+
Returns
-------
sequence : Sequence
@@ -42,11 +38,13 @@ def load_sequence(file_path):
# We only need the suffix here
filename, suffix = os.path.splitext(file_path)
if suffix in [".fasta", ".fa", ".mpfa", ".fna", ".fsa"]:
- from .fasta import FastaFile, get_sequence
+ from biotite.sequence.io.fasta import FastaFile, get_sequence
+
file = FastaFile.read(file_path)
return get_sequence(file)
elif suffix in [".fastq", ".fq"]:
- from .fastq import FastqFile
+ from biotite.sequence.io.fastq import FastqFile
+
# Quality scores are irrelevant for this function
# -> Offset is irrelevant
file = FastqFile.read(file_path, offset="Sanger")
@@ -56,7 +54,8 @@ def load_sequence(file_path):
break
return sequence
elif suffix in [".gb", ".gbk", ".gp"]:
- from .genbank import GenBankFile, get_sequence
+ from biotite.sequence.io.genbank import GenBankFile, get_sequence
+
format = "gp" if suffix == ".gp" else "gb"
file = GenBankFile.read(file_path)
return get_sequence(file, format)
@@ -68,10 +67,10 @@ def save_sequence(file_path, sequence):
"""
Save a sequence into a sequence file without the need
to manually instantiate a :class:`File` object.
-
+
Internally this function uses a :class:`File` object, based on the
given file extension.
-
+
Parameters
----------
file_path : str
@@ -82,12 +81,14 @@ def save_sequence(file_path, sequence):
# We only need the suffix here
filename, suffix = os.path.splitext(file_path)
if suffix in [".fasta", ".fa", ".mpfa", ".fna", ".fsa"]:
- from .fasta import FastaFile, set_sequence
+ from biotite.sequence.io.fasta import FastaFile, set_sequence
+
file = FastaFile()
set_sequence(file, sequence)
file.write(file_path)
elif suffix in [".fastq", ".fq"]:
- from .fastq import FastqFile
+ from biotite.sequence.io.fastq import FastqFile
+
# Quality scores are irrelevant for this function
# -> Offset is irrelevant
file = FastqFile(offset="Sanger")
@@ -96,7 +97,8 @@ def save_sequence(file_path, sequence):
file["sequence"] = str(sequence), scores
file.write(file_path)
elif suffix in [".gb", ".gbk", ".gp"]:
- from .genbank import GenBankFile, set_locus, set_sequence
+ from biotite.sequence.io.genbank import GenBankFile, set_locus, set_sequence
+
file = GenBankFile()
set_locus(file, "sequence", len(sequence))
set_sequence(file, sequence)
@@ -109,37 +111,42 @@ def load_sequences(file_path):
"""
Load multiple sequences from a sequence file without the need
to manually instantiate a :class:`File` object.
-
+
Internally this function uses a :class:`File` object, based on the
file extension.
-
+
Parameters
----------
file_path : str
The path to the sequence file.
-
+
Returns
-------
sequences : dict of (str, Sequence)
The sequences in the file.
This dictionary maps each header name to
- the respective sequence.
+ the respective sequence.
"""
# We only need the suffix here
filename, suffix = os.path.splitext(file_path)
if suffix in [".fasta", ".fa", ".mpfa", ".fna", ".fsa"]:
- from .fasta import FastaFile, get_sequences
+ from biotite.sequence.io.fasta import FastaFile, get_sequences
+
file = FastaFile.read(file_path)
return get_sequences(file)
elif suffix in [".fastq", ".fq"]:
- from .fastq import FastqFile
+ from biotite.sequence.io.fastq import FastqFile
+
# Quality scores are irrelevant for this function
# -> Offset is irrelevant
file = FastqFile.read(file_path, offset="Sanger")
- return {identifier : NucleotideSequence(seq_str)
- for identifier, (seq_str, scores) in file.items()}
+ return {
+ identifier: NucleotideSequence(seq_str)
+ for identifier, (seq_str, scores) in file.items()
+ }
elif suffix in [".gb", ".gbk", ".gp"]:
- from .genbank import MultiFile, get_definition, get_sequence
+ from biotite.sequence.io.genbank import MultiFile, get_definition, get_sequence
+
file = MultiFile.read(file_path)
format = "gp" if suffix == ".gp" else "gb"
sequences = OrderedDict()
@@ -154,10 +161,10 @@ def save_sequences(file_path, sequences):
"""
Save multiple sequences into a sequence file without the need
to manually instantiate a :class:`File` object.
-
+
Internally this function uses a :class:`File` object, based on the
given file extension.
-
+
Parameters
----------
file_path : str
@@ -169,12 +176,14 @@ def save_sequences(file_path, sequences):
# We only need the suffix here
filename, suffix = os.path.splitext(file_path)
if suffix in [".fasta", ".fa", ".mpfa", ".fna", ".fsa"]:
- from .fasta import FastaFile, set_sequences
+ from biotite.sequence.io.fasta import FastaFile, set_sequences
+
file = FastaFile()
set_sequences(file, sequences)
file.write(file_path)
elif suffix in [".fastq", ".fq"]:
- from .fastq import FastqFile
+ from biotite.sequence.io.fastq import FastqFile
+
# Quality scores are irrelevant for this function
# -> Offset is irrelevant
file = FastqFile(offset="Sanger")
diff --git a/src/biotite/sequence/io/gff/__init__.py b/src/biotite/sequence/io/gff/__init__.py
index f544a0ddd..52bac129c 100644
--- a/src/biotite/sequence/io/gff/__init__.py
+++ b/src/biotite/sequence/io/gff/__init__.py
@@ -14,7 +14,7 @@
GFF 3 files. This means, that you cannot directly access the the
parent or child of a feature.
However, the ``Id`` and ``Name`` attributes are stored in the
- qualifiers of the created :class:`Feature` objects.
+ qualifiers of the created :class:`Feature` objects.
Hence, it is possible to implement such a data structure from this
information.
"""
@@ -22,5 +22,5 @@
__name__ = "biotite.sequence.io.gff"
__author__ = "Patrick Kunzmann"
+from .convert import *
from .file import *
-from .convert import *
\ No newline at end of file
diff --git a/src/biotite/sequence/io/gff/convert.py b/src/biotite/sequence/io/gff/convert.py
index 9c8782f65..8f3fb75f2 100644
--- a/src/biotite/sequence/io/gff/convert.py
+++ b/src/biotite/sequence/io/gff/convert.py
@@ -6,7 +6,7 @@
__author__ = "Patrick Kunzmann"
__all__ = ["get_annotation", "set_annotation"]
-from ...annotation import Location, Feature, Annotation
+from biotite.sequence.annotation import Annotation, Feature, Location
def get_annotation(gff_file):
@@ -22,12 +22,12 @@ def get_annotation(gff_file):
Thus, for entries with the same ``ID``, the *type* and *attributes*
are only parsed once and the locations are aggregated from each
entry.
-
+
Parameters
----------
gff_file : GFFFile
The file tro extract the :class:`Annotation` object from.
-
+
Returns
-------
annotation : Annotation
@@ -45,9 +45,7 @@ def get_annotation(gff_file):
# (beginning of the file)
if current_key is not None:
# Beginning of new feature -> Save previous feature
- annot.add_feature(
- Feature(current_key, current_locs, current_qual)
- )
+ annot.add_feature(Feature(current_key, current_locs, current_qual))
# Track new feature
current_key = type
current_locs = [Location(start, end, strand)]
@@ -61,15 +59,14 @@ def get_annotation(gff_file):
return annot
-def set_annotation(gff_file, annotation,
- seqid=None, source=None, is_stranded=True):
+def set_annotation(gff_file, annotation, seqid=None, source=None, is_stranded=True):
"""
Write an :class:`Annotation` object into a GFF3 file.
Each feature will get one entry for each location it has.
:class:`Feature` objects with multiple locations require the ``ID``
qualifier in its :attr:`Feature.qual` attribute.
-
+
Parameters
----------
gff_file : GFFFile
@@ -87,14 +84,13 @@ def set_annotation(gff_file, annotation,
for feature in sorted(annotation):
if len(feature.locs) > 1 and "ID" not in feature.qual:
raise ValueError(
- "The 'Id' qualifier is required "
- "for features with multiple locations"
+ "The 'Id' qualifier is required " "for features with multiple locations"
)
## seqid ##
if seqid is not None and " " in seqid:
raise ValueError("The 'seqid' must not contain whitespaces")
## source ##
- #Nothing to be done
+ # Nothing to be done
## type ##
type = feature.key
## strand ##
@@ -128,6 +124,5 @@ def set_annotation(gff_file, annotation,
else:
phase = None
gff_file.append(
- seqid, source, type, start, end,
- score, strand, phase, attributes
- )
\ No newline at end of file
+ seqid, source, type, start, end, score, strand, phase, attributes
+ )
diff --git a/src/biotite/sequence/io/gff/file.py b/src/biotite/sequence/io/gff/file.py
index f708712d2..c151bd869 100644
--- a/src/biotite/sequence/io/gff/file.py
+++ b/src/biotite/sequence/io/gff/file.py
@@ -6,19 +6,17 @@
__author__ = "Patrick Kunzmann"
__all__ = ["GFFFile"]
-import copy
import string
-from urllib.parse import quote, unquote
import warnings
-from ....file import TextFile, InvalidFileError
-from ...annotation import Location
-
+from urllib.parse import quote, unquote
+from biotite.file import InvalidFileError, TextFile
+from biotite.sequence.annotation import Location
# All punctuation characters except
# percent, semicolon, equals, ampersand, comma
-_NOT_QUOTED = "".join(
- [char for char in string.punctuation if char not in "%;=&,"]
-) + " "
+_NOT_QUOTED = (
+ "".join([char for char in string.punctuation if char not in "%;=&,"]) + " "
+)
class GFFFile(TextFile):
@@ -61,7 +59,7 @@ class GFFFile(TextFile):
The content after the ``##FASTA`` directive is simply ignored.
Please provide the sequence via a separate file or read the FASTA
data directly via the :attr:`lines` attribute:
-
+
>>> import os.path
>>> from io import StringIO
>>> gff_file = GFFFile.read(os.path.join(path_to_sequences, "indexing_test.gff3"))
@@ -121,7 +119,7 @@ class GFFFile(TextFile):
##Example directive param1 param2
SomeSeqID Biotite CDS 1 99 . + 0 ID=FeatureID;product=A protein
"""
-
+
def __init__(self):
super().__init__()
# Maps entry indices to line indices
@@ -132,18 +130,18 @@ def __init__(self):
self._has_fasta = None
self._index_entries()
self.append_directive("gff-version", "3")
-
+
@classmethod
def read(cls, file):
"""
Read a GFF3 file.
-
+
Parameters
----------
file : file-like object or str
The file to be read.
Alternatively a file path can be supplied.
-
+
Returns
-------
file_object : GFFFile
@@ -152,18 +150,29 @@ def read(cls, file):
file = super().read(file)
file._index_entries()
return file
-
- def insert(self, index, seqid, source, type, start, end,
- score, strand, phase, attributes=None):
+
+ def insert(
+ self,
+ index,
+ seqid,
+ source,
+ type,
+ start,
+ end,
+ score,
+ strand,
+ phase,
+ attributes=None,
+ ):
"""
Insert an entry at the given index.
-
+
Parameters
----------
index : int
Index where the entry is inserted.
If the index is equal to the length of the file, the entry
- is appended at the end of the file.
+ is appended at the end of the file.
seqid : str
The ID of the reference sequence.
source : str
@@ -184,22 +193,23 @@ def insert(self, index, seqid, source, type, start, end,
Additional properties of the feature.
"""
if index == len(self):
- self.append(seqid, source, type, start, end,
- score, strand, phase, attributes)
+ self.append(
+ seqid, source, type, start, end, score, strand, phase, attributes
+ )
else:
line_index = self._entries[index]
line = GFFFile._create_line(
- seqid, source, type, start, end,
- score, strand, phase, attributes
+ seqid, source, type, start, end, score, strand, phase, attributes
)
self.lines.insert(line_index, line)
self._index_entries()
-
- def append(self, seqid, source, type, start, end,
- score, strand, phase, attributes=None):
+
+ def append(
+ self, seqid, source, type, start, end, score, strand, phase, attributes=None
+ ):
"""
Append an entry to the end of the file.
-
+
Parameters
----------
seqid : str
@@ -232,11 +242,11 @@ def append(self, seqid, source, type, start, end,
self.lines.append(line)
# Fast update of entry index by adding last line
self._entries.append(len(self.lines) - 1)
-
+
def append_directive(self, directive, *args):
"""
Append a directive line to the end of the file.
-
+
Parameters
----------
directive : str
@@ -245,13 +255,13 @@ def append_directive(self, directive, *args):
Optional parameters for the directive.
Each argument is simply appended to the directive, separated
by a single space character.
-
+
Raises
------
NotImplementedError
If the ``##FASTA`` directive is used, which is not
supported.
-
+
Examples
--------
@@ -262,17 +272,15 @@ def append_directive(self, directive, *args):
##Example directive param1 param2
"""
if directive.startswith("FASTA"):
- raise NotImplementedError(
- "Adding FASTA information is not supported"
- )
+ raise NotImplementedError("Adding FASTA information is not supported")
directive_line = "##" + directive + " " + " ".join(args)
self._directives.append((directive_line[2:], len(self.lines)))
self.lines.append(directive_line)
-
+
def directives(self):
"""
Get the directives in the file.
-
+
Returns
-------
directives : list of tuple(str, int)
@@ -283,7 +291,7 @@ def directives(self):
"""
# Sort in line order
return sorted(self._directives, key=lambda directive: directive[1])
-
+
def __setitem__(self, index, item):
seqid, source, type, start, end, score, strand, phase, attrib = item
line = GFFFile._create_line(
@@ -292,15 +300,13 @@ def __setitem__(self, index, item):
line_index = self._entries[index]
self.lines[line_index] = line
-
def __getitem__(self, index):
- if (index >= 0 and index >= len(self)) or \
- (index < 0 and -index > len(self)):
- raise IndexError(
- f"Index {index} is out of range for GFFFile with "
- f"{len(self)} entries"
- )
-
+ if (index >= 0 and index >= len(self)) or (index < 0 and -index > len(self)):
+ raise IndexError(
+ f"Index {index} is out of range for GFFFile with "
+ f"{len(self)} entries"
+ )
+
line_index = self._entries[index]
# Columns are tab separated
s = self.lines[line_index].strip().split("\t")
@@ -324,15 +330,15 @@ def __getitem__(self, index):
attrib = GFFFile._parse_attributes(attrib)
return seqid, source, type, start, end, score, strand, phase, attrib
-
+
def __delitem__(self, index):
line_index = self._entries[index]
del self.lines[line_index]
self._index_entries()
-
+
def __len__(self):
return len(self._entries)
-
+
def _index_entries(self):
"""
Parse the file for comment and directive lines.
@@ -374,15 +380,12 @@ def _index_entries(self):
self._entries = self._entries[:entry_counter]
@staticmethod
- def _create_line(seqid, source, type, start, end,
- score, strand, phase, attributes):
+ def _create_line(seqid, source, type, start, end, score, strand, phase, attributes):
"""
Create a line for a newly created entry.
"""
- seqid = quote(seqid.strip(), safe=_NOT_QUOTED) \
- if seqid is not None else "."
- source = quote(source.strip(), safe=_NOT_QUOTED) \
- if source is not None else "."
+ seqid = quote(seqid.strip(), safe=_NOT_QUOTED) if seqid is not None else "."
+ source = quote(source.strip(), safe=_NOT_QUOTED) if source is not None else "."
type = type.strip()
# Perform checks
@@ -394,7 +397,7 @@ def _create_line(seqid, source, type, start, end,
raise ValueError("'type' must not be empty")
if seqid[0] == ">":
raise ValueError("'seqid' must not start with '>'")
-
+
score = str(score) if score is not None else "."
if strand == Location.Strand.FORWARD:
strand = "+"
@@ -403,16 +406,31 @@ def _create_line(seqid, source, type, start, end,
else:
strand = "."
phase = str(phase) if phase is not None else "."
- attributes = ";".join(
- [quote(key, safe=_NOT_QUOTED) + "=" + quote(val, safe=_NOT_QUOTED)
- for key, val in attributes.items()]
- ) if attributes is not None and len(attributes) > 0 else "."
+ attributes = (
+ ";".join(
+ [
+ quote(key, safe=_NOT_QUOTED) + "=" + quote(val, safe=_NOT_QUOTED)
+ for key, val in attributes.items()
+ ]
+ )
+ if attributes is not None and len(attributes) > 0
+ else "."
+ )
return "\t".join(
- [seqid, source, type, str(start), str(end),
- str(score), strand, phase, attributes]
+ [
+ seqid,
+ source,
+ type,
+ str(start),
+ str(end),
+ str(score),
+ strand,
+ phase,
+ attributes,
+ ]
)
-
+
@staticmethod
def _parse_attributes(attributes):
"""
@@ -426,9 +444,7 @@ def _parse_attributes(attributes):
for entry in attrib_entries:
compounds = entry.split("=")
if len(compounds) != 2:
- raise InvalidFileError(
- f"Attribute entry '{entry}' is invalid"
- )
+ raise InvalidFileError(f"Attribute entry '{entry}' is invalid")
key, val = compounds
attrib_dict[unquote(key)] = unquote(val)
- return attrib_dict
\ No newline at end of file
+ return attrib_dict
diff --git a/src/biotite/sequence/phylo/__init__.py b/src/biotite/sequence/phylo/__init__.py
index d70caa681..5d29f1a9e 100644
--- a/src/biotite/sequence/phylo/__init__.py
+++ b/src/biotite/sequence/phylo/__init__.py
@@ -31,6 +31,6 @@
__name__ = "biotite.sequence.phylo"
__author__ = "Patrick Kunzmann"
+from .nj import *
from .tree import *
from .upgma import *
-from .nj import *
\ No newline at end of file
diff --git a/src/biotite/sequence/profile.py b/src/biotite/sequence/profile.py
index 1a140e1f9..d208b2b3f 100644
--- a/src/biotite/sequence/profile.py
+++ b/src/biotite/sequence/profile.py
@@ -4,9 +4,13 @@
import warnings
import numpy as np
-from .seqtypes import NucleotideSequence, ProteinSequence, GeneralSequence
-from .alphabet import LetterAlphabet
-from .align.alignment import get_codes
+from biotite.sequence.align.alignment import get_codes
+from biotite.sequence.alphabet import LetterAlphabet
+from biotite.sequence.seqtypes import (
+ GeneralSequence,
+ NucleotideSequence,
+ ProteinSequence,
+)
__name__ = "biotite.sequence"
__author__ = "Maximilian Greil"
@@ -73,7 +77,7 @@ class SequenceProfile(object):
be created from an indefinite number of aligned sequences.
With :meth:`sequence_probability_from_matrix()` the probability of a
- sequence can be calculated based on the before calculated position
+ sequence can be calculated based on the before calculated position
probability matrix of this instance of object SequenceProfile.
With :meth:`sequence_score_from_matrix()` the score of a sequence
@@ -154,8 +158,10 @@ def gaps(self, new_gaps):
def __repr__(self):
"""Represent SequenceProfile as a string for debugging."""
- return f"SequenceProfile(np.{np.array_repr(self.symbols)}, " \
- f"np.{np.array_repr(self.gaps)}, Alphabet({self.alphabet}))"
+ return (
+ f"SequenceProfile(np.{np.array_repr(self.symbols)}, "
+ f"np.{np.array_repr(self.gaps)}, Alphabet({self.alphabet}))"
+ )
def __eq__(self, item):
if not isinstance(item, SequenceProfile):
@@ -204,16 +210,16 @@ def from_alignment(alignment, alphabet=None):
for alph in (seq.alphabet for seq in alignment.sequences):
if not alphabet.extends(alph):
raise ValueError(
- f"The given alphabet is incompatible with a least one "
+ "The given alphabet is incompatible with a least one "
"alphabet of the given sequences"
)
symbols = np.zeros((len(sequences[0]), len(alphabet)), dtype=int)
gaps = np.zeros(len(sequences[0]), dtype=int)
sequences = np.transpose(sequences)
for i in range(len(sequences)):
- row = np.where(sequences[i, ] == -1, len(alphabet), sequences[i, ])
+ row = np.where(sequences[i,] == -1, len(alphabet), sequences[i,])
count = np.bincount(row, minlength=len(alphabet) + 1)
- symbols[i, ] = count[0:len(alphabet)]
+ symbols[i,] = count[0 : len(alphabet)]
gaps[i] = count[-1]
return SequenceProfile(symbols, gaps, alphabet)
@@ -248,10 +254,21 @@ def to_consensus(self, as_general=False):
def _dna_to_consensus(self):
codes = {
- (0,): 'A', (1,): 'C', (2,): 'G', (3,): 'T',
- (0, 2): 'R', (1, 3): 'Y', (1, 2): 'S', (0, 3): 'W', (2, 3): 'K', (0, 1): 'M',
- (1, 2, 3): 'B', (0, 2, 3): 'D', (0, 1, 3): 'H', (0, 1, 2): 'V',
- (0, 1, 2, 3): 'N'
+ (0,): "A",
+ (1,): "C",
+ (2,): "G",
+ (3,): "T",
+ (0, 2): "R",
+ (1, 3): "Y",
+ (1, 2): "S",
+ (0, 3): "W",
+ (2, 3): "K",
+ (0, 1): "M",
+ (1, 2, 3): "B",
+ (0, 2, 3): "D",
+ (0, 1, 3): "H",
+ (0, 1, 2): "V",
+ (0, 1, 2, 3): "N",
}
consensus = ""
maxes = np.max(self.symbols, axis=1)
@@ -261,10 +278,21 @@ def _dna_to_consensus(self):
def _rna_to_consensus(self):
codes = {
- (0,): 'A', (1,): 'C', (2,): 'G', (3,): 'U',
- (0, 2): 'R', (1, 3): 'Y', (1, 2): 'S', (0, 3): 'W', (2, 3): 'K', (0, 1): 'M',
- (1, 2, 3): 'B', (0, 2, 3): 'D', (0, 1, 3): 'H', (0, 1, 2): 'V',
- (0, 1, 2, 3): 'N'
+ (0,): "A",
+ (1,): "C",
+ (2,): "G",
+ (3,): "U",
+ (0, 2): "R",
+ (1, 3): "Y",
+ (1, 2): "S",
+ (0, 3): "W",
+ (2, 3): "K",
+ (0, 1): "M",
+ (1, 2, 3): "B",
+ (0, 2, 3): "D",
+ (0, 1, 3): "H",
+ (0, 1, 2): "V",
+ (0, 1, 2, 3): "N",
}
consensus = ""
maxes = np.max(self.symbols, axis=1)
@@ -307,7 +335,7 @@ def probability_matrix(self, pseudocount=0):
.. math::
P(S) = \frac {C_S + \frac{c_p}{k}} {\sum_{i} C_i + c_p}
-
+
:math:`S`: The symbol.
:math:`C_S`: The count of symbol :math:`S` at the sequence
@@ -330,11 +358,10 @@ def probability_matrix(self, pseudocount=0):
The calculated the position probability matrix.
"""
if pseudocount < 0:
- raise ValueError(
- f"Pseudocount can not be smaller than zero."
- )
- return (self.symbols + pseudocount / self.symbols.shape[1]) / \
- (np.sum(self.symbols, axis=1)[:, np.newaxis] + pseudocount)
+ raise ValueError("Pseudocount can not be smaller than zero.")
+ return (self.symbols + pseudocount / self.symbols.shape[1]) / (
+ np.sum(self.symbols, axis=1)[:, np.newaxis] + pseudocount
+ )
def log_odds_matrix(self, background_frequencies=None, pseudocount=0):
r"""
@@ -346,7 +373,7 @@ def log_odds_matrix(self, background_frequencies=None, pseudocount=0):
.. math::
W(S) = \log_2 \left( \frac{P(S)}{B_S} \right)
-
+
:math:`S`: The symbol.
:math:`P(S)`: The probability of symbol :math:`S` at the
@@ -363,7 +390,7 @@ def log_odds_matrix(self, background_frequencies=None, pseudocount=0):
background_frequencies: ndarray, shape=(k,), dtype=float, optional
The background frequencies for each symbol in the alphabet.
By default, a uniform distribution is assumed.
-
+
Returns
-------
pwm: ndarray, dtype=float, shape=(n,k)
@@ -383,7 +410,7 @@ def sequence_probability(self, sequence, pseudocount=0):
Calculate probability of a sequence based on the
position probability matrix (PPM).
- The sequence probability is the product of the probability of
+ The sequence probability is the product of the probability of
the respective symbol over all sequence positions.
Parameters
@@ -419,7 +446,7 @@ def sequence_score(self, sequence, background_frequencies=None, pseudocount=0):
Calculate score of a sequence based on the
position weight matrix (PWM).
- The score is the sum of weights (log-odds scores) of
+ The score is the sum of weights (log-odds scores) of
the respective symbol over all sequence positions.
Parameters
@@ -442,7 +469,9 @@ def sequence_score(self, sequence, background_frequencies=None, pseudocount=0):
"""
if background_frequencies is None:
background_frequencies = 1 / len(self.alphabet)
- pwm = self.log_odds_matrix(background_frequencies=background_frequencies, pseudocount=pseudocount)
+ pwm = self.log_odds_matrix(
+ background_frequencies=background_frequencies, pseudocount=pseudocount
+ )
if len(sequence) != len(pwm):
raise ValueError(
f"The given sequence has a different length ({len(sequence)}) than "
diff --git a/src/biotite/sequence/search.py b/src/biotite/sequence/search.py
index c57e7d119..96af23d03 100644
--- a/src/biotite/sequence/search.py
+++ b/src/biotite/sequence/search.py
@@ -4,8 +4,7 @@
__name__ = "biotite.sequence"
__author__ = "Patrick Kunzmann"
-__all__ = ["find_subsequence", "find_symbol", "find_symbol_first",
- "find_symbol_last"]
+__all__ = ["find_subsequence", "find_symbol", "find_symbol_first", "find_symbol_last"]
import numpy as np
@@ -13,7 +12,7 @@
def find_subsequence(sequence, query):
"""
Find a subsequence in a sequence.
-
+
Parameters
----------
sequence : Sequence
@@ -21,26 +20,26 @@ def find_subsequence(sequence, query):
query : Sequence
The potential subsequence. Its alphabet must extend the
`sequence` alphabet.
-
+
Returns
-------
match_indices : ndarray
The starting indices in `sequence`, where `query` has been
found. The array is empty if no match has been found.
-
+
Raises
------
ValueError
If the `query` alphabet does not extend the `sequence` alphabet.
-
+
Examples
--------
-
+
>>> main_seq = NucleotideSequence("ACTGAATGA")
>>> sub_seq = NucleotideSequence("TGA")
>>> print(find_subsequence(main_seq, sub_seq))
[2 6]
-
+
"""
if not sequence.get_alphabet().extends(query.get_alphabet()):
raise ValueError("The sequences alphabets are not equal")
@@ -52,17 +51,18 @@ def find_subsequence(sequence, query):
match_indices.append(i)
return np.array(match_indices)
+
def find_symbol(sequence, symbol):
"""
Find a symbol in a sequence.
-
+
Parameters
----------
sequence : Sequence
The sequence to find the symbol in.
symbol : object
The symbol to be found in `sequence`.
-
+
Returns
-------
match_indices : ndarray
@@ -71,17 +71,18 @@ def find_symbol(sequence, symbol):
code = sequence.get_alphabet().encode(symbol)
return np.where(sequence.code == code)[0]
+
def find_symbol_first(sequence, symbol):
"""
Find first occurence of a symbol in a sequence.
-
+
Parameters
----------
sequence : Sequence
The sequence to find the symbol in.
symbol : object
The symbol to be found in `sequence`.
-
+
Returns
-------
first_index : int
@@ -92,18 +93,19 @@ def find_symbol_first(sequence, symbol):
if len(match_i) == 0:
return -1
return np.min(match_i)
-
+
+
def find_symbol_last(sequence, symbol):
"""
Find last occurence of a symbol in a sequence.
-
+
Parameters
----------
sequence : Sequence
The sequence to find the symbol in.
symbol : object
The symbol to be found in `sequence`.
-
+
Returns
-------
flast_index : int
diff --git a/src/biotite/sequence/seqtypes.py b/src/biotite/sequence/seqtypes.py
index 76254e13f..e09527c35 100644
--- a/src/biotite/sequence/seqtypes.py
+++ b/src/biotite/sequence/seqtypes.py
@@ -6,17 +6,16 @@
__author__ = "Patrick Kunzmann", "Thomas Nevolianis"
__all__ = ["GeneralSequence", "NucleotideSequence", "ProteinSequence"]
-from .sequence import Sequence
-from .alphabet import LetterAlphabet, AlphabetError, AlphabetMapper
import numpy as np
-import copy
+from biotite.sequence.alphabet import AlphabetError, AlphabetMapper, LetterAlphabet
+from biotite.sequence.sequence import Sequence
class GeneralSequence(Sequence):
"""
This class allows the creation of a sequence with custom
:class:`Alphabet` without the need to subclass :class:`Sequence`.
-
+
Parameters
----------
alphabet : Alphabet
@@ -27,22 +26,24 @@ class GeneralSequence(Sequence):
may also be a :class:`str` object.
By default the sequence is empty.
"""
-
+
def __init__(self, alphabet, sequence=()):
self._alphabet = alphabet
super().__init__(sequence)
def __repr__(self):
"""Represent GeneralSequence as a string for debugging."""
- return f"GeneralSequence(Alphabet({self._alphabet}), " \
- f"[{', '.join([repr(symbol) for symbol in self.symbols])}])"
+ return (
+ f"GeneralSequence(Alphabet({self._alphabet}), "
+ f"[{', '.join([repr(symbol) for symbol in self.symbols])}])"
+ )
def __copy_create__(self):
return GeneralSequence(self._alphabet)
-
+
def get_alphabet(self):
return self._alphabet
-
+
def as_type(self, sequence):
"""
Convert the :class:`GeneralSequence` into a sequence of another
@@ -58,12 +59,12 @@ def as_type(self, sequence):
of this object.
The alphabet must equal or extend the alphabet of this
object.
-
+
Returns
-------
sequence : Sequence
The input `sequence` with replaced sequence code.
-
+
Raises
------
AlphabetError
@@ -78,16 +79,17 @@ def as_type(self, sequence):
sequence.code = self.code
return sequence
+
class NucleotideSequence(Sequence):
"""
Representation of a nucleotide sequence (DNA or RNA).
-
+
This class may have one of two different alphabets:
:attr:`unambiguous_alphabet()` contains only the unambiguous DNA
letters 'A', 'C', 'G' and 'T'.
- :attr:`ambiguous_alphabet()` uses an extended alphabet for ambiguous
+ :attr:`ambiguous_alphabet()` uses an extended alphabet for ambiguous
letters.
-
+
Parameters
----------
sequence : iterable object, optional
@@ -100,35 +102,36 @@ class NucleotideSequence(Sequence):
ambiguous letters in the sequence, the ambiguous alphabet
is used.
"""
-
- alphabet_unamb = LetterAlphabet(["A","C","G","T"])
- alphabet_amb = LetterAlphabet(
- ["A","C","G","T","R","Y","W","S",
- "M","K","H","B","V","D","N"]
+
+ alphabet_unamb = LetterAlphabet(["A", "C", "G", "T"])
+ alphabet_amb = LetterAlphabet(
+ ["A", "C", "G", "T", "R", "Y", "W", "S", "M", "K", "H", "B", "V", "D", "N"]
)
-
- compl_symbol_dict = {"A" : "T",
- "C" : "G",
- "G" : "C",
- "T" : "A",
- "M" : "K",
- "R" : "Y",
- "W" : "W",
- "S" : "S",
- "Y" : "R",
- "K" : "M",
- "V" : "B",
- "H" : "D",
- "D" : "H",
- "B" : "V",
- "N" : "N"}
+
+ compl_symbol_dict = {
+ "A": "T",
+ "C": "G",
+ "G": "C",
+ "T": "A",
+ "M": "K",
+ "R": "Y",
+ "W": "W",
+ "S": "S",
+ "Y": "R",
+ "K": "M",
+ "V": "B",
+ "H": "D",
+ "D": "H",
+ "B": "V",
+ "N": "N",
+ }
# List comprehension does not work in this scope
_compl_symbols = []
for _symbol in alphabet_amb.get_symbols():
_compl_symbols.append(compl_symbol_dict[_symbol])
_compl_alphabet_unamb = LetterAlphabet(_compl_symbols)
_compl_mapper = AlphabetMapper(_compl_alphabet_unamb, alphabet_amb)
-
+
def __init__(self, sequence=[], ambiguous=None):
if isinstance(sequence, str):
sequence = sequence.upper()
@@ -164,28 +167,28 @@ def __copy_create__(self):
else:
seq_copy = NucleotideSequence(ambiguous=False)
return seq_copy
-
+
def get_alphabet(self):
return self._alphabet
-
+
def complement(self):
"""
Get the complement nucleotide sequence.
-
+
Returns
-------
complement : NucleotideSequence
The complement sequence.
-
+
Examples
--------
-
+
>>> dna_seq = NucleotideSequence("ACGCTT")
>>> print(dna_seq.complement())
TGCGAA
>>> print(dna_seq.reverse().complement())
AAGCGT
-
+
"""
# Interpreting the sequence code of this object in the
# complementary alphabet gives the complementary symbols
@@ -194,18 +197,18 @@ def complement(self):
# alphabet into the original alphabet
compl_code = NucleotideSequence._compl_mapper[self.code]
return self.copy(compl_code)
-
+
def translate(self, complete=False, codon_table=None, met_start=False):
"""
Translate the nucleotide sequence into a protein sequence.
-
+
If `complete` is true, the entire sequence is translated,
beginning with the first codon and ending with the last codon,
even if stop codons occur during the translation.
-
+
Otherwise this method returns possible ORFs in the
sequence, even if not stop codon occurs in an ORF.
-
+
Parameters
----------
complete : bool, optional
@@ -222,7 +225,7 @@ def translate(self, complete=False, codon_table=None, met_start=False):
Otherwise the translation starts with the amino acid
the codon codes for. Only applies, if `complete` is false.
(Default: False)
-
+
Returns
-------
protein : ProteinSequence or list of ProteinSequence
@@ -233,15 +236,15 @@ def translate(self, complete=False, codon_table=None, met_start=False):
pos : list of tuple (int, int)
Is only returned if `complete` is false. The list contains
a tuple for each ORF.
- The first element of the tuple is the index of the
+ The first element of the tuple is the index of the
:class:`NucleotideSequence`, where the translation starts.
The second element is the exclusive stop index, it
represents the first nucleotide in the
:class:`NucleotideSequence` after a stop codon.
-
+
Examples
--------
-
+
>>> dna_seq = NucleotideSequence("AATGATGCTATAGAT")
>>> prot_seq = dna_seq.translate(complete=True)
>>> print(prot_seq)
@@ -251,29 +254,32 @@ def translate(self, complete=False, codon_table=None, met_start=False):
... print(seq)
MML*
ML*
-
+
"""
if self._alphabet != NucleotideSequence.alphabet_unamb:
raise AlphabetError("Translation requires unambiguous alphabet")
# Determine codon_table
if codon_table is None:
# Import at this position to avoid circular import
- from .codon import CodonTable
+ from biotite.sequence.codon import CodonTable
+
codon_table = CodonTable.default_table()
-
+
if complete:
if len(self) % 3 != 0:
- raise ValueError("Sequence length needs to be a multiple of 3 "
- "for complete translation")
+ raise ValueError(
+ "Sequence length needs to be a multiple of 3 "
+ "for complete translation"
+ )
# Reshape code into (n,3), with n being the amount of codons
codons = self.code.reshape(-1, 3)
protein_seq = ProteinSequence()
protein_seq.code = codon_table.map_codon_codes(codons)
return protein_seq
-
+
else:
stop_code = ProteinSequence.alphabet.encode("*")
- met_code = ProteinSequence.alphabet.encode("M")
+ met_code = ProteinSequence.alphabet.encode("M")
protein_seqs = []
pos = []
code = self.code
@@ -282,7 +288,7 @@ def translate(self, complete=False, codon_table=None, met_start=False):
# The frame length is always a multiple of 3
# If there is a trailing partial codon, remove it
frame_length = ((len(code) - shift) // 3) * 3
- frame = code[shift : shift+frame_length]
+ frame = code[shift : shift + frame_length]
# Reshape frame into (n,3), with n being the amount of codons
frame_codons = frame.reshape(-1, 3)
# At first, translate frame completely
@@ -297,8 +303,7 @@ def translate(self, complete=False, codon_table=None, met_start=False):
stops = np.where(code_from_start == stop_code)[0]
# Find first stop codon after start codon
# Include stop -> stops[0] + 1
- stop_i = stops[0] + 1 if len(stops) > 0 \
- else len(code_from_start)
+ stop_i = stops[0] + 1 if len(stops) > 0 else len(code_from_start)
code_from_start_to_stop = code_from_start[:stop_i]
prot_seq = ProteinSequence()
if met_start:
@@ -310,13 +315,13 @@ def translate(self, complete=False, codon_table=None, met_start=False):
protein_seqs.append(prot_seq)
# Codon indices are transformed
# to nucleotide sequence indices
- pos.append((shift + start_i*3, shift + (start_i+stop_i)*3))
+ pos.append((shift + start_i * 3, shift + (start_i + stop_i) * 3))
# Sort by start position
order = np.argsort([start for start, stop in pos])
pos = [pos[i] for i in order]
protein_seqs = [protein_seqs[i] for i in order]
return protein_seqs, pos
-
+
@staticmethod
def unambiguous_alphabet():
"""
@@ -329,7 +334,7 @@ def unambiguous_alphabet():
The unambiguous nucleotide alphabet.
"""
return NucleotideSequence.alphabet_unamb
-
+
@staticmethod
def ambiguous_alphabet():
"""
@@ -348,10 +353,10 @@ def ambiguous_alphabet():
class ProteinSequence(Sequence):
"""
Representation of a protein sequence.
-
+
Furthermore this class offers a conversion of amino acids from
3-letter code into 1-letter code and vice versa.
-
+
Parameters
----------
sequence : iterable object, optional
@@ -359,7 +364,7 @@ class ProteinSequence(Sequence):
string. May take upper or lower case letters. If a list is
given, the list elements can be 1-letter or 3-letter amino acid
representations. By default the sequence is empty.
-
+
Notes
-----
The :class:`Alphabet` of this :class:`Sequence` class does not
@@ -370,106 +375,138 @@ class ProteinSequence(Sequence):
"""
_codon_table = None
-
- alphabet = LetterAlphabet(["A","C","D","E","F","G","H","I","K","L",
- "M","N","P","Q","R","S","T","V","W","Y",
- "B","Z","X","*"])
+
+ alphabet = LetterAlphabet(
+ [
+ "A",
+ "C",
+ "D",
+ "E",
+ "F",
+ "G",
+ "H",
+ "I",
+ "K",
+ "L",
+ "M",
+ "N",
+ "P",
+ "Q",
+ "R",
+ "S",
+ "T",
+ "V",
+ "W",
+ "Y",
+ "B",
+ "Z",
+ "X",
+ "*",
+ ]
+ )
# Masses are taken from
# https://web.expasy.org/findmod/findmod_masses.html#AA
- _mol_weight_average = np.array([
- 71.0788, # A
- 103.1388, # C
- 115.0886, # D
- 129.1155, # E
- 147.1766, # F
- 57.0519, # G
- 137.1411, # H
- 113.1594, # I
- 128.1741, # K
- 113.1594, # L
- 131.1926, # M
- 114.1038, # N
- 97.1167, # P
- 128.1307, # Q
- 156.1875, # R
- 87.0782, # S
- 101.1051, # T
- 99.1326, # V
- 186.2132, # W
- 163.1760, # Y
- np.nan, # B
- np.nan, # Z
- np.nan, # X
- np.nan, # *
- ])
-
- _mol_weight_monoisotopic = np.array([
- 71.03711, # A
- 103.00919, # C
- 115.02694, # D
- 129.04259, # E
- 147.06841, # F
- 57.02146, # G
- 137.05891, # H
- 113.08406, # I
- 128.09496, # K
- 113.08406, # L
- 131.04049, # M
- 114.04293, # N
- 97.05276, # P
- 128.05858, # Q
- 156.10111, # R
- 87.03203, # S
- 101.04768, # T
- 99.06841, # V
- 186.07931, # W
- 163.06333, # Y
- np.nan, # B
- np.nan, # Z
- np.nan, # X
- np.nan, # *
- ])
-
- _dict_1to3 = {"A" : "ALA",
- "C" : "CYS",
- "D" : "ASP",
- "E" : "GLU",
- "F" : "PHE",
- "G" : "GLY",
- "H" : "HIS",
- "I" : "ILE",
- "K" : "LYS",
- "L" : "LEU",
- "M" : "MET",
- "N" : "ASN",
- "P" : "PRO",
- "Q" : "GLN",
- "R" : "ARG",
- "S" : "SER",
- "T" : "THR",
- "V" : "VAL",
- "W" : "TRP",
- "Y" : "TYR",
- "B" : "ASX",
- "Z" : "GLX",
- "X" : "UNK",
- "*" : " * "}
-
+ _mol_weight_average = np.array(
+ [
+ 71.0788, # A
+ 103.1388, # C
+ 115.0886, # D
+ 129.1155, # E
+ 147.1766, # F
+ 57.0519, # G
+ 137.1411, # H
+ 113.1594, # I
+ 128.1741, # K
+ 113.1594, # L
+ 131.1926, # M
+ 114.1038, # N
+ 97.1167, # P
+ 128.1307, # Q
+ 156.1875, # R
+ 87.0782, # S
+ 101.1051, # T
+ 99.1326, # V
+ 186.2132, # W
+ 163.1760, # Y
+ np.nan, # B
+ np.nan, # Z
+ np.nan, # X
+ np.nan, # *
+ ]
+ )
+
+ _mol_weight_monoisotopic = np.array(
+ [
+ 71.03711, # A
+ 103.00919, # C
+ 115.02694, # D
+ 129.04259, # E
+ 147.06841, # F
+ 57.02146, # G
+ 137.05891, # H
+ 113.08406, # I
+ 128.09496, # K
+ 113.08406, # L
+ 131.04049, # M
+ 114.04293, # N
+ 97.05276, # P
+ 128.05858, # Q
+ 156.10111, # R
+ 87.03203, # S
+ 101.04768, # T
+ 99.06841, # V
+ 186.07931, # W
+ 163.06333, # Y
+ np.nan, # B
+ np.nan, # Z
+ np.nan, # X
+ np.nan, # *
+ ]
+ )
+
+ _dict_1to3 = {
+ "A": "ALA",
+ "C": "CYS",
+ "D": "ASP",
+ "E": "GLU",
+ "F": "PHE",
+ "G": "GLY",
+ "H": "HIS",
+ "I": "ILE",
+ "K": "LYS",
+ "L": "LEU",
+ "M": "MET",
+ "N": "ASN",
+ "P": "PRO",
+ "Q": "GLN",
+ "R": "ARG",
+ "S": "SER",
+ "T": "THR",
+ "V": "VAL",
+ "W": "TRP",
+ "Y": "TYR",
+ "B": "ASX",
+ "Z": "GLX",
+ "X": "UNK",
+ "*": " * ",
+ }
+
_dict_3to1 = {}
for _key, _value in _dict_1to3.items():
_dict_3to1[_value] = _key
_dict_3to1["SEC"] = "C"
_dict_3to1["MSE"] = "M"
-
+
def __init__(self, sequence=()):
dict_3to1 = ProteinSequence._dict_3to1
- alph = ProteinSequence.alphabet
# Convert 3-letter codes to single letter codes,
# if list contains 3-letter codes
- sequence = [dict_3to1[symbol.upper()] if len(symbol) == 3
- else symbol.upper() for symbol in sequence]
+ sequence = [
+ dict_3to1[symbol.upper()] if len(symbol) == 3 else symbol.upper()
+ for symbol in sequence
+ ]
super().__init__(sequence)
def __repr__(self):
@@ -478,11 +515,11 @@ def __repr__(self):
def get_alphabet(self):
return ProteinSequence.alphabet
-
+
def remove_stops(self):
"""
Remove *stop signals* from the sequence.
-
+
Returns
-------
no_stop : ProteinSequence
@@ -493,34 +530,34 @@ def remove_stops(self):
seq_code = no_stop.code
no_stop.code = seq_code[seq_code != stop_code]
return no_stop
-
+
@staticmethod
def convert_letter_3to1(symbol):
"""
Convert a 3-letter to a 1-letter amino acid representation.
-
+
Parameters
----------
symbol : string
3-letter amino acid representation.
-
+
Returns
-------
convert : string
1-letter amino acid representation.
"""
return ProteinSequence._dict_3to1[symbol.upper()]
-
+
@staticmethod
def convert_letter_1to3(symbol):
"""
Convert a 1-letter to a 3-letter amino acid representation.
-
+
Parameters
----------
symbol : string
1-letter amino acid representation.
-
+
Returns
-------
convert : string
@@ -531,7 +568,7 @@ def convert_letter_1to3(symbol):
def get_molecular_weight(self, monoisotopic=False):
"""
Calculate the molecular weight of this protein.
-
+
Average protein molecular weight is calculated by the addition
of average isotopic masses of the amino acids
in the protein and the average isotopic mass of one water
@@ -550,7 +587,6 @@ def get_molecular_weight(self, monoisotopic=False):
if np.isnan(weight):
raise ValueError(
- "Sequence contains ambiguous amino acids, "
- "cannot calculate weight"
+ "Sequence contains ambiguous amino acids, " "cannot calculate weight"
)
return weight
diff --git a/src/biotite/sequence/sequence.py b/src/biotite/sequence/sequence.py
index f9a69dfb0..4040fcc0e 100644
--- a/src/biotite/sequence/sequence.py
+++ b/src/biotite/sequence/sequence.py
@@ -10,16 +10,15 @@
__author__ = "Patrick Kunzmann"
__all__ = ["Sequence"]
-import numbers
import abc
+import numbers
import numpy as np
-from .alphabet import Alphabet, LetterAlphabet
-from ..copyable import Copyable
-
+from biotite.copyable import Copyable
+from biotite.sequence.alphabet import LetterAlphabet
-_size_uint8 = np.iinfo(np.uint8 ).max +1
-_size_uint16 = np.iinfo(np.uint16).max +1
-_size_uint32 = np.iinfo(np.uint32).max +1
+_size_uint8 = np.iinfo(np.uint8).max + 1
+_size_uint16 = np.iinfo(np.uint16).max + 1
+_size_uint32 = np.iinfo(np.uint32).max + 1
class Sequence(Copyable, metaclass=abc.ABCMeta):
@@ -277,12 +276,10 @@ def get_symbol_frequency(self):
corresponding number of occurences in the sequence as
values.
"""
- counts = np.bincount(
- self._seq_code, minlength=len(self.get_alphabet())
- )
+ counts = np.bincount(self._seq_code, minlength=len(self.get_alphabet()))
return {
- symbol: count for symbol, count
- in zip(self.get_alphabet().get_symbols(), counts)
+ symbol: count
+ for symbol, count in zip(self.get_alphabet().get_symbols(), counts)
}
def __getitem__(self, index):
@@ -329,12 +326,13 @@ def __eq__(self, item):
def __str__(self):
alph = self.get_alphabet()
if isinstance(alph, LetterAlphabet):
- return alph.decode_multiple(self._seq_code, as_bytes=True)\
- .tobytes().decode("ASCII")
- else:
- return ", ".join(
- [str(e) for e in alph.decode_multiple(self._seq_code)]
+ return (
+ alph.decode_multiple(self._seq_code, as_bytes=True)
+ .tobytes()
+ .decode("ASCII")
)
+ else:
+ return ", ".join([str(e) for e in alph.decode_multiple(self._seq_code)])
def __add__(self, sequence):
if self.get_alphabet().extends(sequence.get_alphabet()):
diff --git a/src/biotite/structure/__init__.py b/src/biotite/structure/__init__.py
index 0685b0e61..df9776324 100644
--- a/src/biotite/structure/__init__.py
+++ b/src/biotite/structure/__init__.py
@@ -104,9 +104,11 @@
__author__ = "Patrick Kunzmann"
from .atoms import *
+from .basepairs import *
from .bonds import *
from .box import *
from .celllist import *
+from .chains import *
from .charges import *
from .compare import *
from .density import *
@@ -122,11 +124,9 @@
from .rdf import *
from .repair import *
from .residues import *
-from .chains import *
from .sasa import *
from .sequence import *
from .sse import *
from .superimpose import *
from .transform import *
-from .basepairs import *
-# util and resutil are used internally
+# util and segments are used internally
diff --git a/src/biotite/structure/atoms.py b/src/biotite/structure/atoms.py
index 47f97de7d..d0641a125 100644
--- a/src/biotite/structure/atoms.py
+++ b/src/biotite/structure/atoms.py
@@ -4,19 +4,27 @@
"""
This module contains the main types of the ``structure`` subpackage:
-:class:`Atom`, :class:`AtomArray` and :class:`AtomArrayStack`.
+:class:`Atom`, :class:`AtomArray` and :class:`AtomArrayStack`.
"""
__name__ = "biotite.structure"
__author__ = "Patrick Kunzmann"
-__all__ = ["Atom", "AtomArray", "AtomArrayStack",
- "array", "stack", "repeat", "from_template", "coord"]
+__all__ = [
+ "Atom",
+ "AtomArray",
+ "AtomArrayStack",
+ "array",
+ "stack",
+ "repeat",
+ "from_template",
+ "coord",
+]
-import numbers
import abc
+import numbers
import numpy as np
-from .bonds import BondList
-from ..copyable import Copyable
+from biotite.copyable import Copyable
+from biotite.structure.bonds import BondList
class _AtomArrayBase(Copyable, metaclass=abc.ABCMeta):
@@ -26,7 +34,7 @@ class _AtomArrayBase(Copyable, metaclass=abc.ABCMeta):
It implements functionality for annotation arrays and also
rudimentarily for coordinates.
"""
-
+
def __init__(self, length):
"""
Create the annotation arrays
@@ -43,14 +51,14 @@ def __init__(self, length):
self.add_annotation("hetero", dtype=bool)
self.add_annotation("atom_name", dtype="U6")
self.add_annotation("element", dtype="U2")
-
+
def array_length(self):
"""
Get the length of the atom array.
-
+
This value is equivalent to the length of each annotation array.
For :class:`AtomArray` it is the same as ``len(array)``.
-
+
Returns
-------
length : int
@@ -71,15 +79,15 @@ def shape(self):
shape : tuple of int
Shape of the object.
"""
- return
-
+ return
+
def add_annotation(self, category, dtype):
"""
Add an annotation category, if not already existing.
-
+
Initially the new annotation is filled with the *zero*
representation of the given type.
-
+
Parameters
----------
category : str
@@ -87,19 +95,18 @@ def add_annotation(self, category, dtype):
dtype : type or str
A type instance or a valid *NumPy* *dtype* string.
Defines the type of the annotation
-
+
See Also
--------
set_annotation
"""
if category not in self._annot:
- self._annot[str(category)] = np.zeros(self._array_length,
- dtype=dtype)
-
+ self._annot[str(category)] = np.zeros(self._array_length, dtype=dtype)
+
def del_annotation(self, category):
"""
Removes an annotation category.
-
+
Parameters
----------
category : str
@@ -107,32 +114,30 @@ def del_annotation(self, category):
"""
if category in self._annot:
del self._annot[str(category)]
-
+
def get_annotation(self, category):
"""
Return an annotation array.
-
+
Parameters
----------
category : str
The annotation category to be returned.
-
+
Returns
-------
array : ndarray
The annotation array.
"""
if category not in self._annot:
- raise ValueError(
- f"Annotation category '{category}' is not existing"
- )
+ raise ValueError(f"Annotation category '{category}' is not existing")
return self._annot[category]
-
+
def set_annotation(self, category, array):
"""
Set an annotation array. If the annotation category does not
exist yet, the category is created.
-
+
Parameters
----------
category : str
@@ -143,28 +148,25 @@ def set_annotation(self, category, array):
"""
if len(array) != self._array_length:
raise IndexError(
- f"Expected array length {self._array_length}, "
- f"but got {len(array)}"
+ f"Expected array length {self._array_length}, " f"but got {len(array)}"
)
if category in self._annot:
# Keep the dtype if the annotation already exists
- self._annot[category] = np.asarray(
- array, dtype=self._annot[category].dtype
- )
+ self._annot[category] = np.asarray(array, dtype=self._annot[category].dtype)
else:
self._annot[category] = np.asarray(array)
-
+
def get_annotation_categories(self):
"""
Return a list containing all annotation array categories.
-
+
Returns
-------
categories : list
The list containing the names of each annotation array.
"""
return list(self._annot.keys())
-
+
def _subarray(self, index):
# Index is one dimensional (boolean mask, index array)
new_coord = self._coord[..., index, :]
@@ -180,10 +182,9 @@ def _subarray(self, index):
if self._box is not None:
new_object._box = self._box
for annotation in self._annot:
- new_object._annot[annotation] = (self._annot[annotation]
- .__getitem__(index))
+ new_object._annot[annotation] = self._annot[annotation].__getitem__(index)
return new_object
-
+
def _set_element(self, index, atom):
try:
if isinstance(index, (numbers.Integral, np.ndarray)):
@@ -191,12 +192,10 @@ def _set_element(self, index, atom):
self._annot[name][index] = atom._annot[name]
self._coord[..., index, :] = atom.coord
else:
- raise TypeError(
- f"Index must be integer, not '{type(index).__name__}'"
- )
+ raise TypeError(f"Index must be integer, not '{type(index).__name__}'")
except KeyError:
raise KeyError("The annotations of the 'Atom' are incompatible")
-
+
def _del_element(self, index):
if isinstance(index, numbers.Integral):
for name in self._annot:
@@ -208,20 +207,18 @@ def _del_element(self, index):
mask[index] = False
self._bonds = self._bonds[mask]
else:
- raise TypeError(
- f"Index must be integer, not '{type(index).__name__}'"
- )
-
+ raise TypeError(f"Index must be integer, not '{type(index).__name__}'")
+
def equal_annotations(self, item):
"""
Check, if this object shares equal annotation arrays with the
given :class:`AtomArray` or :class:`AtomArrayStack`.
-
+
Parameters
----------
item : AtomArray or AtomArrayStack
The object to compare the annotation arrays with.
-
+
Returns
-------
equality : bool
@@ -235,24 +232,24 @@ def equal_annotations(self, item):
if not np.array_equal(self._annot[name], item._annot[name]):
return False
return True
-
+
def equal_annotation_categories(self, item):
"""
Check, if this object shares equal annotation array categories
with the given :class:`AtomArray` or :class:`AtomArrayStack`.
-
+
Parameters
----------
item : AtomArray or AtomArrayStack
The object to compare the annotation arrays with.
-
+
Returns
-------
equality : bool
True, if the annotation array names are equal.
"""
return sorted(self._annot.keys()) == sorted(item._annot.keys())
-
+
def __getattr__(self, attr):
"""
If the attribute is an annotation, the annotation is returned
@@ -273,7 +270,7 @@ def __getattr__(self, attr):
raise AttributeError(
f"'{type(self).__name__}' object has no attribute '{attr}'"
)
-
+
def __setattr__(self, attr, value):
"""
If the attribute is an annotation, the :attr:`value` is saved
@@ -287,15 +284,13 @@ def __setattr__(self, attr, value):
if isinstance(self, AtomArray):
if value.ndim != 2:
raise ValueError(
- "A 2-dimensional ndarray is expected "
- "for an AtomArray"
- )
+ "A 2-dimensional ndarray is expected " "for an AtomArray"
+ )
elif isinstance(self, AtomArrayStack):
if value.ndim != 3:
raise ValueError(
- "A 3-dimensional ndarray is expected "
- "for an AtomArrayStack"
- )
+ "A 3-dimensional ndarray is expected " "for an AtomArrayStack"
+ )
if value.shape[-2] != self._array_length:
raise ValueError(
f"Expected array length {self._array_length}, "
@@ -304,7 +299,7 @@ def __setattr__(self, attr, value):
if value.shape[-1] != 3:
raise TypeError("Expected 3 coordinates for each atom")
super().__setattr__("_coord", value.astype(np.float32, copy=False))
-
+
elif attr == "bonds":
if isinstance(value, BondList):
if value.get_atom_count() != self._array_length:
@@ -318,22 +313,21 @@ def __setattr__(self, attr, value):
super().__setattr__("_bonds", None)
else:
raise TypeError("Value must be 'BondList'")
-
+
elif attr == "box":
if isinstance(value, np.ndarray):
if isinstance(self, AtomArray):
if value.ndim != 2:
raise ValueError(
- "A 2-dimensional ndarray is expected "
- "for an AtomArray"
- )
- else: # AtomArrayStack
+ "A 2-dimensional ndarray is expected " "for an AtomArray"
+ )
+ else: # AtomArrayStack
if value.ndim != 3:
raise ValueError(
"A 3-dimensional ndarray is expected "
"for an AtomArrayStack"
- )
- if value.shape[-2:] != (3,3):
+ )
+ if value.shape[-2:] != (3, 3):
raise TypeError("Box must be a 3x3 matrix (three vectors)")
box = value.astype(np.float32, copy=False)
super().__setattr__("_box", box)
@@ -342,14 +336,14 @@ def __setattr__(self, attr, value):
super().__setattr__("_box", None)
else:
raise TypeError("Box must be ndarray of floats or None")
-
+
elif attr == "_annot":
super().__setattr__(attr, value)
elif attr in self._annot:
self.set_annotation(attr, value)
else:
super().__setattr__(attr, value)
-
+
def __dir__(self):
attr = super().__dir__()
attr.append("coord")
@@ -358,7 +352,7 @@ def __dir__(self):
for name in self._annot.keys():
attr.append(name)
return attr
-
+
def __eq__(self, item):
"""
See Also
@@ -376,30 +370,31 @@ def __eq__(self, item):
if not np.array_equal(self._box, item._box):
return False
return np.array_equal(self._coord, item._coord)
-
+
def __len__(self):
"""
The length of the annotation arrays.
-
+
Returns
-------
length : int
Length of the annotation arrays.
"""
return self._array_length
-
+
def __add__(self, array):
if type(self) != type(array):
raise TypeError("Can only concatenate two arrays or two stacks")
# Create either new array or stack, depending of the own type
if isinstance(self, AtomArray):
- concat = AtomArray(length = self._array_length+array._array_length)
+ concat = AtomArray(length=self._array_length + array._array_length)
if isinstance(self, AtomArrayStack):
- concat = AtomArrayStack(self.stack_depth(),
- self._array_length + array._array_length)
-
+ concat = AtomArrayStack(
+ self.stack_depth(), self._array_length + array._array_length
+ )
+
concat._coord = np.concatenate((self._coord, array.coord), axis=-2)
-
+
# Transfer only annotations,
# which are existent in both operands
arr_categories = list(array._annot.keys())
@@ -407,29 +402,29 @@ def __add__(self, array):
if category in arr_categories:
annot = self._annot[category]
arr_annot = array._annot[category]
- concat._annot[category] = np.concatenate((annot,arr_annot))
-
+ concat._annot[category] = np.concatenate((annot, arr_annot))
+
# Concatenate bonds lists,
# if at least one of them contains bond information
if self._bonds is not None or array._bonds is not None:
bonds1 = self._bonds
bonds2 = array._bonds
if bonds1 is None:
- bonds1 = BondList(self._array_length)
+ bonds1 = BondList(self._array_length)
if bonds2 is None:
bonds2 = BondList(array._array_length)
concat._bonds = bonds1 + bonds2
-
+
# Copy box
if self._box is not None:
concat._box = np.copy(self._box)
return concat
-
+
def __copy_fill__(self, clone):
super().__copy_fill__(clone)
self._copy_annotations(clone)
clone._coord = np.copy(self._coord)
-
+
def _copy_annotations(self, clone):
for name in self._annot:
clone._annot[name] = np.copy(self._annot[name])
@@ -437,23 +432,23 @@ def _copy_annotations(self, clone):
clone._box = np.copy(self._box)
if self._bonds is not None:
clone._bonds = self._bonds.copy()
-
+
class Atom(Copyable):
"""
A representation of a single atom.
-
+
The coordinates an annotations can be accessed directly.
A detailed description of each annotation category can be viewed
:doc:`here `.
-
+
Parameters
----------
coord: list or ndarray
The x, y and z coordinates.
kwargs
Atom annotations as key value pair.
-
+
Attributes
----------
{annot} : scalar
@@ -463,19 +458,19 @@ class Atom(Copyable):
shape : tuple of int
Shape of the object.
In case of an :class:`Atom`, the tuple is empty.
-
+
Examples
--------
-
+
>>> atom = Atom([1,2,3], chain_id="A")
>>> atom.atom_name = "CA"
>>> print(atom.atom_name)
CA
>>> print(atom.coord)
[1. 2. 3.]
-
+
"""
-
+
def __init__(self, coord, **kwargs):
self._annot = {}
self._annot["chain_id"] = ""
@@ -500,17 +495,17 @@ def __repr__(self):
"""Represent Atom as a string for debugging."""
# print out key-value pairs and format strings in quotation marks
annot_parts = [
- f'{key}="{value}"' if isinstance(value, str) else f'{key}={value}'
+ f'{key}="{value}"' if isinstance(value, str) else f"{key}={value}"
for key, value in self._annot.items()
]
- annot = ', '.join(annot_parts)
- return f'Atom(np.{np.array_repr(self.coord)}, {annot})'
+ annot = ", ".join(annot_parts)
+ return f"Atom(np.{np.array_repr(self.coord)}, {annot})"
@property
def shape(self):
return ()
-
+
def __getattr__(self, attr):
if attr in super().__getattribute__("_annot"):
return self._annot[attr]
@@ -518,7 +513,7 @@ def __getattr__(self, attr):
raise AttributeError(
f"'{type(self).__name__}' object has no attribute '{attr}'"
)
-
+
def __setattr__(self, attr, value):
if attr == "_annot":
super().__setattr__(attr, value)
@@ -526,16 +521,18 @@ def __setattr__(self, attr, value):
super().__setattr__(attr, value)
else:
self._annot[attr] = value
-
+
def __str__(self):
hetero = "HET" if self.hetero else ""
- return f"{hetero:3} {self.chain_id:3} " \
- f"{self.res_id:5d}{self.ins_code:1} {self.res_name:3} " \
- f"{self.atom_name:6} {self.element:2} " \
- f"{self.coord[0]:8.3f} " \
- f"{self.coord[1]:8.3f} " \
- f"{self.coord[2]:8.3f}"
-
+ return (
+ f"{hetero:3} {self.chain_id:3} "
+ f"{self.res_id:5d}{self.ins_code:1} {self.res_name:3} "
+ f"{self.atom_name:6} {self.element:2} "
+ f"{self.coord[0]:8.3f} "
+ f"{self.coord[1]:8.3f} "
+ f"{self.coord[2]:8.3f}"
+ )
+
def __eq__(self, item):
if not isinstance(item, Atom):
return False
@@ -547,18 +544,18 @@ def __eq__(self, item):
if self._annot[name] != item._annot[name]:
return False
return True
-
+
def __ne__(self, item):
return not self == item
-
+
def __copy_create__(self):
return Atom(self.coord, **self._annot)
-
+
class AtomArray(_AtomArrayBase):
"""
An array representation of a model consisting of multiple atoms.
-
+
An :class:`AtomArray` can be seen as a list of :class:`Atom`
instances.
Instead of using directly a list, this class uses an *NumPy*
@@ -573,14 +570,14 @@ class AtomArray(_AtomArrayBase):
or :func:`set_annotation()`.
A detailed description of each annotation category can be viewed
:doc:`here `.
-
+
In order to get an an subarray of an :class:`AtomArray`,
*NumPy* style indexing is used.
This includes slices, boolean arrays, index arrays and even
*Ellipsis* notation.
Using a single integer as index returns a single :class:`Atom`
instance.
-
+
Inserting or appending an :class:`AtomArray` to another
:class:`AtomArray` is done with the '+' operator.
Only the annotation categories, which are existing in both arrays,
@@ -611,7 +608,7 @@ class AtomArray(_AtomArrayBase):
----------
length : int
The fixed amount of atoms in the array.
-
+
Attributes
----------
{annot} : ndarray
@@ -629,44 +626,44 @@ class AtomArray(_AtomArrayBase):
Shape of the atom array.
The single value in the tuple is
the length of the atom array.
-
+
Examples
--------
Creating an atom array from atoms:
-
+
>>> atom1 = Atom([1,2,3], chain_id="A")
>>> atom2 = Atom([2,3,4], chain_id="A")
>>> atom3 = Atom([3,4,5], chain_id="B")
>>> atom_array = array([atom1, atom2, atom3])
>>> print(atom_array.array_length())
3
-
+
Accessing an annotation array:
-
+
>>> print(atom_array.chain_id)
['A' 'A' 'B']
-
+
Accessing the coordinates:
-
+
>>> print(atom_array.coord)
[[1. 2. 3.]
[2. 3. 4.]
[3. 4. 5.]]
-
+
*NumPy* style filtering:
-
+
>>> atom_array = atom_array[atom_array.chain_id == "A"]
>>> print(atom_array.array_length())
2
-
+
Inserting an atom array:
-
+
>>> insert = array([Atom([7,8,9], chain_id="C")])
>>> atom_array = atom_array[0:1] + insert + atom_array[1:2]
>>> print(atom_array.chain_id)
['A' 'C' 'A']
"""
-
+
def __init__(self, length):
super().__init__(length)
if length is None:
@@ -676,13 +673,13 @@ def __init__(self, length):
def __repr__(self):
"""Represent AtomArray as a string for debugging."""
- atoms = ''
+ atoms = ""
for i in range(0, self.array_length()):
if len(atoms) == 0:
- atoms = '\n\t' + self.get_atom(i).__repr__()
+ atoms = "\n\t" + self.get_atom(i).__repr__()
else:
- atoms = atoms + ',\n\t' + self.get_atom(i).__repr__()
- return f'array([{atoms}\n])'
+ atoms = atoms + ",\n\t" + self.get_atom(i).__repr__()
+ return f"array([{atoms}\n])"
@property
def shape(self):
@@ -703,33 +700,33 @@ def shape(self):
--------
array_length
"""
- return self.array_length(),
+ return (self.array_length(),)
def get_atom(self, index):
"""
Obtain the atom instance of the array at the specified index.
-
+
The same as ``array[index]``, if `index` is an integer.
-
+
Parameters
----------
index : int
Index of the atom.
-
+
Returns
-------
atom : Atom
- Atom at position `index`.
+ Atom at position `index`.
"""
kwargs = {}
for name, annotation in self._annot.items():
kwargs[name] = annotation[index]
- return Atom(coord = self._coord[index], kwargs=kwargs)
-
+ return Atom(coord=self._coord[index], kwargs=kwargs)
+
def __iter__(self):
"""
Iterate through the array.
-
+
Yields
------
atom : Atom
@@ -738,16 +735,16 @@ def __iter__(self):
while i < len(self):
yield self.get_atom(i)
i += 1
-
+
def __getitem__(self, index):
"""
Obtain a subarray or the atom instance at the specified index.
-
+
Parameters
----------
index : object
All index types *NumPy* accepts, are valid.
-
+
Returns
-------
sub_array : Atom or AtomArray
@@ -763,16 +760,14 @@ def __getitem__(self, index):
# If first index is "...", just ignore the first index
return self.__getitem__(index[1])
else:
- raise IndexError(
- "'AtomArray' does not accept multidimensional indices"
- )
+ raise IndexError("'AtomArray' does not accept multidimensional indices")
else:
return self._subarray(index)
-
+
def __setitem__(self, index, atom):
"""
Set the atom at the specified array position.
-
+
Parameters
----------
index : int
@@ -781,38 +776,38 @@ def __setitem__(self, index, atom):
The atom to be set.
"""
self._set_element(index, atom)
-
+
def __delitem__(self, index):
"""
Deletes the atom at the specified array position.
-
+
Parameters
----------
index : int
The position where the atom should be deleted.
"""
self._del_element(index)
-
+
def __len__(self):
"""
The length of the array.
-
+
Returns
-------
length : int
Length of the array.
"""
return self.array_length()
-
+
def __eq__(self, item):
"""
Check if the array equals another :class:`AtomArray`.
-
+
Parameters
----------
item : object
Object to campare the array with.
-
+
Returns
-------
equal : bool
@@ -824,15 +819,15 @@ def __eq__(self, item):
if not isinstance(item, AtomArray):
return False
return True
-
+
def __str__(self):
"""
Get a string representation of the array.
-
+
Each line contains the attributes of one atom.
"""
return "\n".join([str(atom) for atom in self])
-
+
def __copy_create__(self):
return AtomArray(self.array_length())
@@ -841,7 +836,7 @@ class AtomArrayStack(_AtomArrayBase):
"""
A collection of multiple :class:`AtomArray` instances, where each
atom array has equal annotation arrays.
-
+
Effectively, this means that each atom is occuring in every array in
the stack at differing coordinates. This situation arises e.g. in
NMR-elucidated or simulated structures. Since the annotations are
@@ -849,7 +844,7 @@ class AtomArrayStack(_AtomArrayBase):
coordinate array is 3-D (m x n x 3).
A detailed description of each annotation category can be viewed
:doc:`here `.
-
+
Indexing works similar to :class:`AtomArray`, with the difference,
that two index dimensions are possible:
The first index dimension specifies the array(s), the second index
@@ -857,24 +852,24 @@ class AtomArrayStack(_AtomArrayBase):
in :class:`AtomArray`).
Using a single integer as first dimension index returns a single
:class:`AtomArray` instance.
-
+
Concatenation of atoms for each array in the stack is done using the
'+' operator. For addition of atom arrays onto the stack use the
:func:`stack()` method.
The :attr:`box` attribute has the shape *m x 3 x 3*, as the cell
might be different for each frame in the atom array stack.
-
+
Parameters
----------
depth : int
The fixed amount of arrays in the stack. When indexing, this is
the length of the first dimension.
-
+
length : int
The fixed amount of atoms in each array in the stack. When
indexing, this is the length of the second dimension.
-
+
Attributes
----------
{annot} : ndarray, shape=(n,)
@@ -892,15 +887,15 @@ class AtomArrayStack(_AtomArrayBase):
Shape of the stack.
The numbers correspond to the stack depth
and array length, respectively.
-
+
See also
--------
AtomArray
-
+
Examples
--------
Creating an atom array stack from two arrays:
-
+
>>> atom1 = Atom([1,2,3], chain_id="A")
>>> atom2 = Atom([2,3,4], chain_id="A")
>>> atom3 = Atom([3,4,5], chain_id="B")
@@ -925,40 +920,40 @@ class AtomArrayStack(_AtomArrayBase):
[5. 6. 7.]
[6. 7. 8.]]]
"""
-
+
def __init__(self, depth, length):
super().__init__(length)
- if depth == None or length == None:
+ if depth is None or length is None:
self._coord = None
else:
self._coord = np.full((depth, length, 3), np.nan, dtype=np.float32)
def __repr__(self):
"""Represent AtomArrayStack as a string for debugging."""
- arrays = ''
+ arrays = ""
for i in range(0, self.stack_depth()):
if len(arrays) == 0:
- arrays = '\n\t' + self.get_array(i).__repr__()
+ arrays = "\n\t" + self.get_array(i).__repr__()
else:
- arrays = arrays + ',\n\t' + self.get_array(i).__repr__()
- return f'stack([{arrays}\n])'
+ arrays = arrays + ",\n\t" + self.get_array(i).__repr__()
+ return f"stack([{arrays}\n])"
def get_array(self, index):
"""
Obtain the atom array instance of the stack at the specified
index.
-
+
The same as ``stack[index]``, if `index` is an integer.
-
+
Parameters
----------
index : int
Index of the atom array.
-
+
Returns
-------
array : AtomArray
- AtomArray at position `index`.
+ AtomArray at position `index`.
"""
array = AtomArray(self.array_length())
for name in self._annot:
@@ -970,14 +965,14 @@ def get_array(self, index):
array._box = self._box[index]
return array
-
+
def stack_depth(self):
"""
Get the depth of the stack.
-
+
This value represents the amount of atom arrays in the stack.
It is the same as ``len(array)``.
-
+
Returns
-------
length : int
@@ -1005,7 +1000,7 @@ def shape(self):
def __iter__(self):
"""
Iterate through the array.
-
+
Yields
------
array : AtomArray
@@ -1014,17 +1009,17 @@ def __iter__(self):
while i < len(self):
yield self.get_array(i)
i += 1
-
+
def __getitem__(self, index):
"""
Obtain the atom array instance or an substack at the specified
index.
-
+
Parameters
----------
index : object
All index types *NumPy* accepts are valid.
-
+
Returns
-------
sub_array : AtomArray or AtomArrayStack
@@ -1033,7 +1028,7 @@ def __getitem__(self, index):
Otherwise an :class:`AtomArrayStack` with reduced depth and
length is returned.
In case the index is a tuple(int, int) an :class:`Atom`
- instance is returned.
+ instance is returned.
"""
if isinstance(index, numbers.Integral):
return self.get_array(index)
@@ -1050,7 +1045,7 @@ def __getitem__(self, index):
if isinstance(index[1], numbers.Integral):
# Prevent reduction in dimensionality
# in second dimension
- new_stack = self._subarray(slice(index[1], index[1]+1))
+ new_stack = self._subarray(slice(index[1], index[1] + 1))
else:
new_stack = self._subarray(index[1])
if index[0] is not Ellipsis:
@@ -1065,14 +1060,13 @@ def __getitem__(self, index):
if self._box is not None:
new_stack._box = self._box[index]
return new_stack
-
-
+
def __setitem__(self, index, array):
"""
Set the atom array at the specified stack position.
-
+
The array and the stack must have equal annotation arrays.
-
+
Parameters
----------
index : int
@@ -1081,26 +1075,20 @@ def __setitem__(self, index, array):
The atom array to be set.
"""
if not self.equal_annotations(array):
- raise ValueError(
- "The stack and the array have unequal annotations"
- )
+ raise ValueError("The stack and the array have unequal annotations")
if self.bonds != array.bonds:
- raise ValueError(
- "The stack and the array have unequal bonds"
- )
+ raise ValueError("The stack and the array have unequal bonds")
if isinstance(index, numbers.Integral):
self.coord[index] = array.coord
if self.box is not None:
self.box[index] = array.box
else:
- raise TypeError(
- f"Index must be integer, not '{type(index).__name__}'"
- )
-
+ raise TypeError(f"Index must be integer, not '{type(index).__name__}'")
+
def __delitem__(self, index):
"""
Deletes the atom array at the specified stack position.
-
+
Parameters
----------
index : int
@@ -1109,14 +1097,12 @@ def __delitem__(self, index):
if isinstance(index, numbers.Integral):
self._coord = np.delete(self._coord, index, axis=0)
else:
- raise TypeError(
- f"Index must be integer, not '{type(index).__name__}'"
- )
-
+ raise TypeError(f"Index must be integer, not '{type(index).__name__}'")
+
def __len__(self):
"""
The depth of the stack, i.e. the amount of models.
-
+
Returns
-------
depth : int
@@ -1124,16 +1110,16 @@ def __len__(self):
"""
# length is determined by length of coord attribute
return self._coord.shape[0]
-
+
def __eq__(self, item):
"""
Check if the array equals another :class:`AtomArray`
-
+
Parameters
----------
item : object
Object to campare the array with.
-
+
Returns
-------
equal : bool
@@ -1145,20 +1131,20 @@ def __eq__(self, item):
if not isinstance(item, AtomArrayStack):
return False
return True
-
+
def __str__(self):
"""
Get a string representation of the stack.
-
+
:class:`AtomArray` strings eparated by blank lines
and a line indicating the index.
"""
string = ""
for i, array in enumerate(self):
- string += "Model " + str(i+1) + "\n"
+ string += "Model " + str(i + 1) + "\n"
string += str(array) + "\n" + "\n"
return string
-
+
def __copy_create__(self):
return AtomArrayStack(self.stack_depth(), self.array_length())
@@ -1166,23 +1152,23 @@ def __copy_create__(self):
def array(atoms):
"""
Create an :class:`AtomArray` from a list of :class:`Atom`.
-
+
Parameters
----------
atoms : iterable object of Atom
The atoms to be combined in an array.
All atoms must share the same annotation categories.
-
+
Returns
-------
array : AtomArray
The listed atoms as array.
-
+
Examples
--------
-
+
Creating an atom array from atoms:
-
+
>>> atom1 = Atom([1,2,3], chain_id="A")
>>> atom2 = Atom([2,3,4], chain_id="A")
>>> atom3 = Atom([3,4,5], chain_id="B")
@@ -1204,7 +1190,7 @@ def array(atoms):
array = AtomArray(len(atoms))
# Add all (also optional) annotation categories
for name in names:
- array.add_annotation(name, dtype=type(atoms[0]._annot[name]))
+ array.add_annotation(name, dtype=type(atoms[0]._annot[name]))
# Add all atoms to AtomArray
for i in range(len(atoms)):
for name in names:
@@ -1216,23 +1202,23 @@ def array(atoms):
def stack(arrays):
"""
Create an :class:`AtomArrayStack` from a list of :class:`AtomArray`.
-
+
Parameters
----------
arrays : iterable object of AtomArray
The atom arrays to be combined in a stack.
All atom arrays must have an equal number of atoms and equal
annotation arrays.
-
+
Returns
-------
stack : AtomArrayStack
The stacked atom arrays.
-
+
Examples
--------
Creating an atom array stack from two arrays:
-
+
>>> atom1 = Atom([1,2,3], chain_id="A")
>>> atom2 = Atom([2,3,4], chain_id="A")
>>> atom3 = Atom([3,4,5], chain_id="B")
@@ -1272,7 +1258,7 @@ def stack(arrays):
array_stack = AtomArrayStack(array_count, ref_array.array_length())
for name, annotation in ref_array._annot.items():
array_stack._annot[name] = annotation
- coord_list = [array._coord for array in arrays]
+ coord_list = [array._coord for array in arrays]
array_stack._coord = np.stack(coord_list, axis=0)
# Take bond list from first array
array_stack._bonds = ref_array._bonds
@@ -1296,14 +1282,14 @@ def repeat(atoms, coord):
The length of first dimension determines the number of repeats.
If `atoms` is an :class:`AtomArray` 3 dimensions, otherwise
4 dimensions are required.
-
+
Returns
-------
repeated: AtomArray, shape=(n*k,) or AtomArrayStack, shape=(m,n*k)
The repeated atoms.
Whether an :class:`AtomArray` or an :class:`AtomArrayStack` is
returned depends on the input `atoms`.
-
+
Examples
--------
@@ -1336,7 +1322,7 @@ def repeat(atoms, coord):
raise ValueError(
f"Expected 4 dimensions for the coordinate array, got {coord.ndim}"
)
-
+
repetitions = len(coord)
orig_length = atoms.array_length()
new_length = orig_length * repetitions
@@ -1358,24 +1344,24 @@ def repeat(atoms, coord):
)
repeated = AtomArrayStack(atoms.stack_depth(), new_length)
repeated.coord = coord.reshape((atoms.stack_depth(), new_length, 3))
-
+
else:
raise TypeError(
f"Expected 'AtomArray' or 'AtomArrayStack', "
f"but got {type(atoms).__name__}"
)
-
+
for category in atoms.get_annotation_categories():
annot = np.tile(atoms.get_annotation(category), repetitions)
repeated.set_annotation(category, annot)
if atoms.bonds is not None:
repeated_bonds = atoms.bonds.copy()
- for _ in range(repetitions-1):
+ for _ in range(repetitions - 1):
repeated_bonds += atoms.bonds
repeated.bonds = repeated_bonds
if atoms.box is not None:
repeated.box = atoms.box.copy()
-
+
return repeated
@@ -1383,7 +1369,7 @@ def from_template(template, coord, box=None):
"""
Create an :class:`AtomArrayStack` using template atoms and given
coordinates.
-
+
Parameters
----------
template : AtomArray, shape=(n,) or AtomArrayStack, shape=(m,n)
@@ -1393,7 +1379,7 @@ def from_template(template, coord, box=None):
The coordinates for each model of the returned stack.
box : ndarray, optional, dtype=float, shape=(l,3,3)
The box for each model of the returned stack.
-
+
Returns
-------
array_stack : AtomArrayStack
@@ -1409,7 +1395,7 @@ def from_template(template, coord, box=None):
# Create empty stack with no models
new_stack = AtomArrayStack(0, template.array_length())
-
+
for category in template.get_annotation_categories():
annot = template.get_annotation(category)
new_stack.set_annotation(category, annot)
@@ -1417,30 +1403,30 @@ def from_template(template, coord, box=None):
new_stack.bonds = template.bonds.copy()
if box is not None:
new_stack.box = box.copy()
-
+
# After setting the coordinates the number of models is the number
# of models in the new coordinates
new_stack.coord = coord
-
+
return new_stack
def coord(item):
"""
Get the atom coordinates of the given array.
-
+
This may be directly and :class:`Atom`, :class:`AtomArray` or
:class:`AtomArrayStack` or
alternatively an (n x 3) or (m x n x 3) :class:`ndarray`
containing the coordinates.
-
+
Parameters
----------
item : Atom or AtomArray or AtomArrayStack or ndarray
Returns the :attr:`coord` attribute, if `item` is an
:class:`Atom`, :class:`AtomArray` or :class:`AtomArrayStack`.
Directly returns the input, if `item` is a :class:`ndarray`.
-
+
Returns
-------
coord : ndarray
diff --git a/src/biotite/structure/basepairs.py b/src/biotite/structure/basepairs.py
index 371477cd0..19265c756 100644
--- a/src/biotite/structure/basepairs.py
+++ b/src/biotite/structure/basepairs.py
@@ -8,23 +8,33 @@
__name__ = "biotite.structure"
__author__ = "Tom David Müller"
-__all__ = ["base_pairs", "map_nucleotide", "base_stacking", "base_pairs_edge",
- "Edge", "base_pairs_glycosidic_bond", "GlycosidicBond"]
+__all__ = [
+ "base_pairs",
+ "map_nucleotide",
+ "base_stacking",
+ "base_pairs_edge",
+ "Edge",
+ "base_pairs_glycosidic_bond",
+ "GlycosidicBond",
+]
-import numpy as np
import warnings
from enum import IntEnum
-from .atoms import Atom, array
-from .superimpose import superimpose
-from .filter import filter_nucleotides
-from .celllist import CellList
-from .hbond import hbond
-from .error import IncompleteStructureWarning, UnexpectedStructureWarning, \
- BadStructureError
-from .util import distance, norm_vector
-from .residues import get_residue_starts_for, get_residue_masks
-from .info.standardize import standardize_order
-from .compare import rmsd
+import numpy as np
+from biotite.structure.atoms import Atom, array
+from biotite.structure.celllist import CellList
+from biotite.structure.compare import rmsd
+from biotite.structure.error import (
+ BadStructureError,
+ IncompleteStructureWarning,
+ UnexpectedStructureWarning,
+)
+from biotite.structure.filter import filter_nucleotides
+from biotite.structure.hbond import hbond
+from biotite.structure.info.standardize import standardize_order
+from biotite.structure.residues import get_residue_masks, get_residue_starts_for
+from biotite.structure.superimpose import superimpose
+from biotite.structure.util import distance, norm_vector
def _get_std_adenine():
@@ -43,31 +53,29 @@ def _get_std_adenine():
ring center, :class:`ndarray` containing the coordinates of the
imidazole ring center
"""
- atom1 = Atom([-1.291, 4.498, 0.000], atom_name="N9", res_name="A")
- atom2 = Atom([0.024, 4.897, 0.000], atom_name="C8", res_name="A")
- atom3 = Atom([0.877, 3.902, 0.000], atom_name="N7", res_name="A")
- atom4 = Atom([0.071, 2.771, 0.000], atom_name="C5", res_name="A")
- atom5 = Atom([0.369, 1.398, 0.000], atom_name="C6", res_name="A")
- atom6 = Atom([1.611, 0.909, 0.000], atom_name="N6", res_name="A")
- atom7 = Atom([-0.668, 0.532, 0.000], atom_name="N1", res_name="A")
- atom8 = Atom([-1.912, 1.023, 0.000], atom_name="C2", res_name="A")
- atom9 = Atom([-2.320, 2.290, 0.000], atom_name="N3", res_name="A")
- atom10 = Atom([-1.267, 3.124, 0.000], atom_name="C4", res_name="A")
+ atom1 = Atom([-1.291, 4.498, 0.000], atom_name="N9", res_name="A")
+ atom2 = Atom([0.024, 4.897, 0.000], atom_name="C8", res_name="A")
+ atom3 = Atom([0.877, 3.902, 0.000], atom_name="N7", res_name="A")
+ atom4 = Atom([0.071, 2.771, 0.000], atom_name="C5", res_name="A")
+ atom5 = Atom([0.369, 1.398, 0.000], atom_name="C6", res_name="A")
+ atom6 = Atom([1.611, 0.909, 0.000], atom_name="N6", res_name="A")
+ atom7 = Atom([-0.668, 0.532, 0.000], atom_name="N1", res_name="A")
+ atom8 = Atom([-1.912, 1.023, 0.000], atom_name="C2", res_name="A")
+ atom9 = Atom([-2.320, 2.290, 0.000], atom_name="N3", res_name="A")
+ atom10 = Atom([-1.267, 3.124, 0.000], atom_name="C4", res_name="A")
adenine = array(
- [atom1, atom2, atom3, atom4, atom5, atom6, atom7, atom8,
- atom9, atom10]
+ [atom1, atom2, atom3, atom4, atom5, atom6, atom7, atom8, atom9, atom10]
)
# Get the midpoint between the N1 and C4 atoms
midpoint = np.mean([atom7.coord, atom10.coord], axis=-2)
# Calculate the coordinates of the aromatic ring centers
pyrimidine_center = np.mean(
- [atom4.coord, atom5.coord, atom7.coord,
- atom8.coord, atom9.coord, atom10.coord], axis=-2
+ [atom4.coord, atom5.coord, atom7.coord, atom8.coord, atom9.coord, atom10.coord],
+ axis=-2,
)
imidazole_center = np.mean(
- [atom1.coord, atom2.coord, atom3.coord,
- atom4.coord, atom10.coord], axis=-2
+ [atom1.coord, atom2.coord, atom3.coord, atom4.coord, atom10.coord], axis=-2
)
return adenine, (midpoint, pyrimidine_center, imidazole_center)
@@ -75,37 +83,35 @@ def _get_std_adenine():
def _get_std_cytosine():
"""
- Get standard base variables for cytosine.
+ Get standard base variables for cytosine.
- Returns
- -------
- standard_base : AtomArray
- Standard coordinates nomenclature of the cytosine base as
- :class:`AtomArray` with nomenclature of PDB File Format V3
- coordinates : tuple (ndarray, ndarray, dtype=float)
- :class:`ndarray` containing the center according to the SCHNaP-
- paper referenced in the function ``base_pairs``,
- :class:`ndarray` containing the coordinates of the pyrimidine
- ring center
+ Returns
+ -------
+ standard_base : AtomArray
+ Standard coordinates nomenclature of the cytosine base as
+ :class:`AtomArray` with nomenclature of PDB File Format V3
+ coordinates : tuple (ndarray, ndarray, dtype=float)
+ :class:`ndarray` containing the center according to the SCHNaP-
+ paper referenced in the function ``base_pairs``,
+ :class:`ndarray` containing the coordinates of the pyrimidine
+ ring center
"""
- atom1 = Atom([-1.285, 4.542, 0.000], atom_name="N1", res_name="C")
- atom2 = Atom([-1.472, 3.158, 0.000], atom_name="C2", res_name="C")
- atom3 = Atom([-2.628, 2.709, 0.000], atom_name="O2", res_name="C")
- atom4 = Atom([-0.391, 2.344, 0.000], atom_name="N3", res_name="C")
- atom5 = Atom([0.837, 2.868, 0.000], atom_name="C4", res_name="C")
- atom6 = Atom([1.875, 2.027, 0.000], atom_name="N4", res_name="C")
- atom7 = Atom([1.056, 4.275, 0.000], atom_name="C5", res_name="C")
- atom8 = Atom([-0.023, 5.068, 0.000], atom_name="C6", res_name="C")
- cytosine = array(
- [atom1, atom2, atom3, atom4, atom5, atom6, atom7, atom8]
- )
+ atom1 = Atom([-1.285, 4.542, 0.000], atom_name="N1", res_name="C")
+ atom2 = Atom([-1.472, 3.158, 0.000], atom_name="C2", res_name="C")
+ atom3 = Atom([-2.628, 2.709, 0.000], atom_name="O2", res_name="C")
+ atom4 = Atom([-0.391, 2.344, 0.000], atom_name="N3", res_name="C")
+ atom5 = Atom([0.837, 2.868, 0.000], atom_name="C4", res_name="C")
+ atom6 = Atom([1.875, 2.027, 0.000], atom_name="N4", res_name="C")
+ atom7 = Atom([1.056, 4.275, 0.000], atom_name="C5", res_name="C")
+ atom8 = Atom([-0.023, 5.068, 0.000], atom_name="C6", res_name="C")
+ cytosine = array([atom1, atom2, atom3, atom4, atom5, atom6, atom7, atom8])
# Get the midpoint between the N3 and C6 atoms
midpoint = np.mean([atom4.coord, atom8.coord], axis=-2)
# Calculate the coordinates of the aromatic ring center
pyrimidine_center = np.mean(
- [atom1.coord, atom2.coord, atom4.coord,
- atom5.coord, atom7.coord, atom8.coord], axis=-2
+ [atom1.coord, atom2.coord, atom4.coord, atom5.coord, atom7.coord, atom8.coord],
+ axis=-2,
)
return cytosine, (midpoint, pyrimidine_center)
@@ -127,32 +133,37 @@ def _get_std_guanine():
ring center, :class:`ndarray` containing the coordinates of the
imidazole ring center
"""
- atom1 = Atom([-1.289, 4.551, 0.000], atom_name="N9", res_name="G")
- atom2 = Atom([0.023, 4.962, 0.000], atom_name="C8", res_name="G")
- atom3 = Atom([0.870, 3.969, 0.000], atom_name="N7", res_name="G")
- atom4 = Atom([0.071, 2.833, 0.000], atom_name="C5", res_name="G")
- atom5 = Atom([0.424, 1.460, 0.000], atom_name="C6", res_name="G")
- atom6 = Atom([1.554, 0.955, 0.000], atom_name="O6", res_name="G")
- atom7 = Atom([-0.700, 0.641, 0.000], atom_name="N1", res_name="G")
- atom8 = Atom([-1.999, 1.087, 0.000], atom_name="C2", res_name="G")
- atom9 = Atom([-2.949, 0.139, -0.001], atom_name="N2", res_name="G")
- atom10 = Atom([-2.342, 2.364, 0.001], atom_name="N3", res_name="G")
- atom11 = Atom([-1.265, 3.177, 0.000], atom_name="C4", res_name="G")
+ atom1 = Atom([-1.289, 4.551, 0.000], atom_name="N9", res_name="G")
+ atom2 = Atom([0.023, 4.962, 0.000], atom_name="C8", res_name="G")
+ atom3 = Atom([0.870, 3.969, 0.000], atom_name="N7", res_name="G")
+ atom4 = Atom([0.071, 2.833, 0.000], atom_name="C5", res_name="G")
+ atom5 = Atom([0.424, 1.460, 0.000], atom_name="C6", res_name="G")
+ atom6 = Atom([1.554, 0.955, 0.000], atom_name="O6", res_name="G")
+ atom7 = Atom([-0.700, 0.641, 0.000], atom_name="N1", res_name="G")
+ atom8 = Atom([-1.999, 1.087, 0.000], atom_name="C2", res_name="G")
+ atom9 = Atom([-2.949, 0.139, -0.001], atom_name="N2", res_name="G")
+ atom10 = Atom([-2.342, 2.364, 0.001], atom_name="N3", res_name="G")
+ atom11 = Atom([-1.265, 3.177, 0.000], atom_name="C4", res_name="G")
guanine = array(
- [atom1, atom2, atom3, atom4, atom5, atom6, atom7, atom8,
- atom9, atom10, atom11]
+ [atom1, atom2, atom3, atom4, atom5, atom6, atom7, atom8, atom9, atom10, atom11]
)
# Get the midpoint between the N1 and C4 atoms
midpoint = np.mean([atom7.coord, atom11.coord], axis=-2)
# Calculate the coordinates of the aromatic ring centers
pyrimidine_center = np.mean(
- [atom4.coord, atom5.coord, atom7.coord,
- atom8.coord, atom10.coord, atom11.coord], axis=-2
+ [
+ atom4.coord,
+ atom5.coord,
+ atom7.coord,
+ atom8.coord,
+ atom10.coord,
+ atom11.coord,
+ ],
+ axis=-2,
)
imidazole_center = np.mean(
- [atom1.coord, atom2.coord, atom3.coord,
- atom4.coord, atom11.coord], axis=-2
+ [atom1.coord, atom2.coord, atom3.coord, atom4.coord, atom11.coord], axis=-2
)
return guanine, (midpoint, pyrimidine_center, imidazole_center)
@@ -173,25 +184,23 @@ def _get_std_thymine():
:class:`ndarray` containing the coordinates of the pyrimidine
ring center
"""
- atom1 = Atom([-1.284, 4.500, 0.000], atom_name="N1", res_name="T")
- atom2 = Atom([-1.462, 3.135, 0.000], atom_name="C2", res_name="T")
- atom3 = Atom([-2.562, 2.608, 0.000], atom_name="O2", res_name="T")
- atom4 = Atom([-0.298, 2.407, 0.000], atom_name="N3", res_name="T")
- atom5 = Atom([0.994, 2.897, 0.000], atom_name="C4", res_name="T")
- atom6 = Atom([1.944, 2.119, 0.000], atom_name="O4", res_name="T")
- atom7 = Atom([1.106, 4.338, 0.000], atom_name="C5", res_name="T")
- atom8 = Atom([2.466, 4.961, 0.001], atom_name="C7", res_name="T")
- atom9 = Atom([-0.024, 5.057, 0.000], atom_name="C6", res_name="T")
- thymine = array(
- [atom1, atom2, atom3, atom4, atom5, atom6, atom7, atom8, atom9]
- )
+ atom1 = Atom([-1.284, 4.500, 0.000], atom_name="N1", res_name="T")
+ atom2 = Atom([-1.462, 3.135, 0.000], atom_name="C2", res_name="T")
+ atom3 = Atom([-2.562, 2.608, 0.000], atom_name="O2", res_name="T")
+ atom4 = Atom([-0.298, 2.407, 0.000], atom_name="N3", res_name="T")
+ atom5 = Atom([0.994, 2.897, 0.000], atom_name="C4", res_name="T")
+ atom6 = Atom([1.944, 2.119, 0.000], atom_name="O4", res_name="T")
+ atom7 = Atom([1.106, 4.338, 0.000], atom_name="C5", res_name="T")
+ atom8 = Atom([2.466, 4.961, 0.001], atom_name="C7", res_name="T")
+ atom9 = Atom([-0.024, 5.057, 0.000], atom_name="C6", res_name="T")
+ thymine = array([atom1, atom2, atom3, atom4, atom5, atom6, atom7, atom8, atom9])
# Get the midpoint between the N3 and C6 atoms
midpoint = np.mean([atom4.coord, atom9.coord], axis=-2)
# Calculate the coordinates of the aromatic ring center
pyrimidine_center = np.mean(
- [atom1.coord, atom2.coord, atom4.coord,
- atom5.coord, atom7.coord, atom9.coord], axis=-2
+ [atom1.coord, atom2.coord, atom4.coord, atom5.coord, atom7.coord, atom9.coord],
+ axis=-2,
)
return thymine, (midpoint, pyrimidine_center)
@@ -212,30 +221,28 @@ def _get_std_uracil():
:class:`ndarray` containing the coordinates of the pyrimidine
ring center
"""
- atom1 = Atom([-1.284, 4.500, 0.000], atom_name="N1", res_name="U")
- atom2 = Atom([-1.462, 3.131, 0.000], atom_name="C2", res_name="U")
- atom3 = Atom([-2.563, 2.608, 0.000], atom_name="O2", res_name="U")
- atom4 = Atom([-0.302, 2.397, 0.000], atom_name="N3", res_name="U")
- atom5 = Atom([0.989, 2.884, 0.000], atom_name="C4", res_name="U")
- atom6 = Atom([1.935, 2.094, -0.001], atom_name="O4", res_name="U")
- atom7 = Atom([1.089, 4.311, 0.000], atom_name="C5", res_name="U")
- atom8 = Atom([-0.024, 5.053, 0.000], atom_name="C6", res_name="U")
- uracil = array(
- [atom1, atom2, atom3, atom4, atom5, atom6, atom7, atom8]
- )
+ atom1 = Atom([-1.284, 4.500, 0.000], atom_name="N1", res_name="U")
+ atom2 = Atom([-1.462, 3.131, 0.000], atom_name="C2", res_name="U")
+ atom3 = Atom([-2.563, 2.608, 0.000], atom_name="O2", res_name="U")
+ atom4 = Atom([-0.302, 2.397, 0.000], atom_name="N3", res_name="U")
+ atom5 = Atom([0.989, 2.884, 0.000], atom_name="C4", res_name="U")
+ atom6 = Atom([1.935, 2.094, -0.001], atom_name="O4", res_name="U")
+ atom7 = Atom([1.089, 4.311, 0.000], atom_name="C5", res_name="U")
+ atom8 = Atom([-0.024, 5.053, 0.000], atom_name="C6", res_name="U")
+ uracil = array([atom1, atom2, atom3, atom4, atom5, atom6, atom7, atom8])
# Get the midpoint between the N3 and C6 atoms
midpoint = np.mean([atom4.coord, atom8.coord], axis=-2)
# Calculate the coordinates of the aromatic ring center
pyrimidine_center = np.mean(
- [atom1.coord, atom2.coord, atom4.coord,
- atom5.coord, atom7.coord, atom8.coord], axis=-2
+ [atom1.coord, atom2.coord, atom4.coord, atom5.coord, atom7.coord, atom8.coord],
+ axis=-2,
)
return uracil, (midpoint, pyrimidine_center)
-_STD_ADENINE, _STD_ADENINE_RING_CENTERS = _get_std_adenine()
+_STD_ADENINE, _STD_ADENINE_RING_CENTERS = _get_std_adenine()
_STD_CYTOSINE, _STD_CYTOSINE_RING_CENTERS = _get_std_cytosine()
_STD_GUANINE, _STD_GUANINE_RING_CENTERS = _get_std_guanine()
_STD_THYMINE, _STD_THYMINE_RING_CENTERS = _get_std_thymine()
@@ -247,35 +254,35 @@ def _get_std_uracil():
_GUANINE_CONTAINING_NUCLEOTIDES = ["G", "DG"]
_URACIL_CONTAINING_NUCLEOTIDES = ["U", "DU"]
_REFERENCE_NUCLEOTIDE_NAMES = (
- _ADENINE_CONTAINING_NUCLEOTIDES +
- _THYMINE_CONTAINING_NUCLEOTIDES +
- _CYTOSINE_CONTAINING_NUCLEOTIDES +
- _GUANINE_CONTAINING_NUCLEOTIDES +
- _URACIL_CONTAINING_NUCLEOTIDES
+ _ADENINE_CONTAINING_NUCLEOTIDES
+ + _THYMINE_CONTAINING_NUCLEOTIDES
+ + _CYTOSINE_CONTAINING_NUCLEOTIDES
+ + _GUANINE_CONTAINING_NUCLEOTIDES
+ + _URACIL_CONTAINING_NUCLEOTIDES
)
# Atoms that are part of respective base edges according to the
# Leontis-Westhof nomenclature
_WATSON_CRICK_EDGE = {
- "A" : ["N6", "N1"],
- "G" : ["O6", "N1", "N2"],
- "U" : ["O4", "N3", "O2"],
- "T" : ["O4", "N3", "O2"],
- "C" : ["N4", "N3", "O2"]
+ "A": ["N6", "N1"],
+ "G": ["O6", "N1", "N2"],
+ "U": ["O4", "N3", "O2"],
+ "T": ["O4", "N3", "O2"],
+ "C": ["N4", "N3", "O2"],
}
_HOOGSTEEN_EDGE = {
- "A" : ["N6", "N7"],
- "G" : ["O6", "N7"],
- "U" : ["O4"],
- "T" : ["O4"],
- "C" : ["N4"]
+ "A": ["N6", "N7"],
+ "G": ["O6", "N7"],
+ "U": ["O4"],
+ "T": ["O4"],
+ "C": ["N4"],
}
_SUGAR_EDGE = {
- "A" : ["N3", "O2'"],
- "G" : ["N2", "N3", "O2'"],
- "U" : ["O2", "O2'"],
- "T" : ["O2", "O2'"],
- "C" : ["O2", "O2'"]
+ "A": ["N3", "O2'"],
+ "G": ["N2", "N3", "O2'"],
+ "U": ["O2", "O2'"],
+ "T": ["O2", "O2'"],
+ "C": ["O2", "O2'"],
}
_EDGES = [_WATSON_CRICK_EDGE, _HOOGSTEEN_EDGE, _SUGAR_EDGE]
@@ -284,9 +291,10 @@ class Edge(IntEnum):
"""
This enum type represents the interacting edge for a given base.
"""
- INVALID = 0,
- WATSON_CRICK = 1,
- HOOGSTEEN = 2,
+
+ INVALID = (0,)
+ WATSON_CRICK = (1,)
+ HOOGSTEEN = (2,)
SUGAR = 3
@@ -295,9 +303,10 @@ class GlycosidicBond(IntEnum):
This enum type represents the relative glycosidic bond orientation
for a given base pair.
"""
+
INVALID = 0
- CIS = 1,
- TRANS = 2,
+ CIS = (1,)
+ TRANS = (2,)
def base_pairs_edge(atom_array, base_pairs):
@@ -390,7 +399,7 @@ def base_pairs_edge(atom_array, base_pairs):
.. footbibliography::
"""
# Result-``ndarray`` matches the dimensions of the input array
- results = np.zeros_like(base_pairs, dtype='uint8')
+ results = np.zeros_like(base_pairs, dtype="uint8")
# Get the residue masks for each residue
base_pairs_masks = get_residue_masks(atom_array, base_pairs.flatten())
@@ -441,16 +450,15 @@ def _get_edge_matrix(atom_array, base_masks):
)
# filter out donor/acceptor heteroatoms and flatten for easy
# iteration
- hbonds = hbonds[:, (0,2)].flatten()
+ hbonds = hbonds[:, (0, 2)].flatten()
# ``ndarray`` with one row for each base and the number of
# bonded edge heteroatoms as in ``_edge`` as columns
- matrix = np.zeros((2, 3), dtype='int32')
+ matrix = np.zeros((2, 3), dtype="int32")
# Iterate through the atoms and corresponding atoms indices
# that are part of the hydrogen bonds
for atom, atom_index in zip(atom_array[hbonds], hbonds):
-
if atom.res_name not in _REFERENCE_NUCLEOTIDE_NAMES:
continue
@@ -460,8 +468,10 @@ def _get_edge_matrix(atom_array, base_masks):
for base_index, base_mask in enumerate(base_masks):
# If a donor/acceptor atom name matches a name in
# the corresponding edge list increase the tally
- if (base_mask[atom_index] and
- atom.atom_name in edge_type[atom.res_name[-1]]):
+ if (
+ base_mask[atom_index]
+ and atom.atom_name in edge_type[atom.res_name[-1]]
+ ):
matrix[base_index, edge_type_index] += 1
return matrix
@@ -540,7 +550,7 @@ def base_pairs_glycosidic_bond(atom_array, base_pairs):
.. footbibliography::
"""
- results = np.zeros(len(base_pairs), dtype='uint8')
+ results = np.zeros(len(base_pairs), dtype="uint8")
# Get the residue masks for each residue
base_pairs_masks = get_residue_masks(atom_array, base_pairs.flatten())
@@ -552,7 +562,6 @@ def base_pairs_glycosidic_bond(atom_array, base_pairs):
)
for i, pair_masks in enumerate(base_pairs_masks):
-
# position vectors of each bases geometric center
geometric_centers = np.zeros((2, 3))
# direction vectors of the glycosidic bonds
@@ -565,23 +574,22 @@ def base_pairs_glycosidic_bond(atom_array, base_pairs):
# For Purines the glycosidic bond is between the C1' and the
# N9 atoms, for pyrimidines it is between the C1' atom and
# the N1 atom
- if (base.res_name[0] in _ADENINE_CONTAINING_NUCLEOTIDES or
- base.res_name[0] in _GUANINE_CONTAINING_NUCLEOTIDES):
-
- geometric_centers[base_index] = (
- (ring_center[0] + ring_center[1]) / 2
- )
+ if (
+ base.res_name[0] in _ADENINE_CONTAINING_NUCLEOTIDES
+ or base.res_name[0] in _GUANINE_CONTAINING_NUCLEOTIDES
+ ):
+ geometric_centers[base_index] = (ring_center[0] + ring_center[1]) / 2
base_atom = base[base.atom_name == "N9"][0]
- elif (base.res_name[0] in _THYMINE_CONTAINING_NUCLEOTIDES or
- base.res_name[0] in _URACIL_CONTAINING_NUCLEOTIDES or
- base.res_name[0] in _CYTOSINE_CONTAINING_NUCLEOTIDES):
-
+ elif (
+ base.res_name[0] in _THYMINE_CONTAINING_NUCLEOTIDES
+ or base.res_name[0] in _URACIL_CONTAINING_NUCLEOTIDES
+ or base.res_name[0] in _CYTOSINE_CONTAINING_NUCLEOTIDES
+ ):
geometric_centers[base_index] = ring_center[0]
base_atom = base[base.atom_name == "N1"][0]
else:
-
results[i] = GlycosidicBond.INVALID
break
@@ -596,15 +604,16 @@ def base_pairs_glycosidic_bond(atom_array, base_pairs):
geometric_centers_dir = geometric_centers[1] - geometric_centers[0]
# Check the orientation of the glycosidic bonds
- if np.dot(
- np.cross(geometric_centers_dir, glycosidic_bonds[0]),
- np.cross(geometric_centers_dir, glycosidic_bonds[1])
- ) < 0:
-
+ if (
+ np.dot(
+ np.cross(geometric_centers_dir, glycosidic_bonds[0]),
+ np.cross(geometric_centers_dir, glycosidic_bonds[1]),
+ )
+ < 0
+ ):
results[i] = GlycosidicBond.TRANS
else:
-
results[i] = GlycosidicBond.CIS
return results
@@ -723,15 +732,18 @@ def base_stacking(atom_array, min_atoms_per_base=3):
for i in range(2):
base_tuple = _match_base(bases[i], min_atoms_per_base)
- if(base_tuple is None):
+ if base_tuple is None:
break
transformed_std_vectors[i] = base_tuple
- normal_vectors = np.vstack((transformed_std_vectors[0][1],
- transformed_std_vectors[1][1]))
- aromatic_ring_centers = [transformed_std_vectors[0][3:],
- transformed_std_vectors[1][3:]]
+ normal_vectors = np.vstack(
+ (transformed_std_vectors[0][1], transformed_std_vectors[1][1])
+ )
+ aromatic_ring_centers = [
+ transformed_std_vectors[0][3:],
+ transformed_std_vectors[1][3:],
+ ]
# Check if the base pairs are stacked.
stacked = _check_base_stacking(aromatic_ring_centers, normal_vectors)
@@ -744,7 +756,7 @@ def base_stacking(atom_array, min_atoms_per_base=3):
return np.array(stacked_bases)
-def base_pairs(atom_array, min_atoms_per_base = 3, unique = True):
+def base_pairs(atom_array, min_atoms_per_base=3, unique=True):
"""
Use DSSR criteria to find the base pairs in an :class:`AtomArray`.
@@ -854,11 +866,8 @@ def base_pairs(atom_array, min_atoms_per_base = 3, unique = True):
nucleotides_boolean = filter_nucleotides(atom_array)
# Disregard the phosphate-backbone
- non_phosphate_boolean = (
- ~ np.isin(
- atom_array.atom_name,
- ["O5'", "P", "OP1", "OP2", "OP3", "HOP2", "HOP3"]
- )
+ non_phosphate_boolean = ~np.isin(
+ atom_array.atom_name, ["O5'", "P", "OP1", "OP2", "OP3", "HOP2", "HOP3"]
)
# Combine the two boolean masks
@@ -867,7 +876,6 @@ def base_pairs(atom_array, min_atoms_per_base = 3, unique = True):
# Get only nucleosides
nucleosides = atom_array[boolean_mask]
-
# Get the base pair candidates according to a N/O cutoff distance,
# where each base is identified as the first index of its respective
# residue
@@ -896,9 +904,7 @@ def base_pairs(atom_array, min_atoms_per_base = 3, unique = True):
base1 = nucleosides[base1_mask]
base2 = nucleosides[base2_mask]
- hbonds = _check_dssr_criteria(
- (base1, base2), min_atoms_per_base, unique
- )
+ hbonds = _check_dssr_criteria((base1, base2), min_atoms_per_base, unique)
# If no hydrogens are present use the number N/O pairs to
# decide between multiple pairing possibilities.
@@ -906,7 +912,7 @@ def base_pairs(atom_array, min_atoms_per_base = 3, unique = True):
if hbonds is None:
# Each N/O-pair is detected twice. Thus, the number of
# matches must be divided by two.
- hbonds = n_o_pairs/2
+ hbonds = n_o_pairs / 2
if hbonds != -1:
basepairs.append((base1_index, base2_index))
if unique:
@@ -922,20 +928,16 @@ def base_pairs(atom_array, min_atoms_per_base = 3, unique = True):
# Get all bases that have non-unique pairing interactions
base_indices, occurrences = np.unique(basepairs, return_counts=True)
for base_index, occurrence in zip(base_indices, occurrences):
- if(occurrence > 1):
+ if occurrence > 1:
# Write the non-unique base pairs to a dictionary as
# 'index: number of hydrogen bonds'
remove_candidates = {}
- for i, row in enumerate(
- np.asarray(basepair_array == base_index)
- ):
- if(np.any(row)):
+ for i, row in enumerate(np.asarray(basepair_array == base_index)):
+ if np.any(row):
remove_candidates[i] = basepairs_hbonds[i]
# Flag all non-unique base pairs for removal except the
# one that has the most hydrogen bonds
- del remove_candidates[
- max(remove_candidates, key=remove_candidates.get)
- ]
+ del remove_candidates[max(remove_candidates, key=remove_candidates.get)]
to_remove += list(remove_candidates.keys())
# Remove all flagged base pairs from the output `ndarray`
basepair_array = np.delete(basepair_array, to_remove, axis=0)
@@ -984,21 +986,22 @@ def _check_dssr_criteria(basepair, min_atoms_per_base, unique):
# Generate the data necessary for analysis of each base.
for i in range(2):
- transformed_std_vectors[i] = _match_base(
- basepair[i], min_atoms_per_base
- )
+ transformed_std_vectors[i] = _match_base(basepair[i], min_atoms_per_base)
- if(transformed_std_vectors[i] is None):
+ if transformed_std_vectors[i] is None:
return -1
- origins = np.vstack((transformed_std_vectors[0][0],
- transformed_std_vectors[1][0]))
- normal_vectors = np.vstack((transformed_std_vectors[0][1],
- transformed_std_vectors[1][1]))
- schnaap_origins = np.vstack((transformed_std_vectors[0][2],
- transformed_std_vectors[1][2]))
- aromatic_ring_centers = [transformed_std_vectors[0][3:],
- transformed_std_vectors[1][3:]]
+ origins = np.vstack((transformed_std_vectors[0][0], transformed_std_vectors[1][0]))
+ normal_vectors = np.vstack(
+ (transformed_std_vectors[0][1], transformed_std_vectors[1][1])
+ )
+ schnaap_origins = np.vstack(
+ (transformed_std_vectors[0][2], transformed_std_vectors[1][2])
+ )
+ aromatic_ring_centers = [
+ transformed_std_vectors[0][3:],
+ transformed_std_vectors[1][3:],
+ ]
# Criterion 1: Distance between origins <=15 Å
if not (distance(origins[0], origins[1]) <= 15):
@@ -1009,9 +1012,8 @@ def _check_dssr_criteria(basepair, min_atoms_per_base, unique):
# Average the base normal vectors. If the angle between the vectors
# is >=90°, flip one vector before averaging
mean_normal_vector = (
- normal_vectors[0] + (normal_vectors[1] * np.sign(np.dot(
- normal_vectors[0], normal_vectors[1]
- )))
+ normal_vectors[0]
+ + (normal_vectors[1] * np.sign(np.dot(normal_vectors[0], normal_vectors[1])))
) / 2
norm_vector(mean_normal_vector)
# Calculate the distance vector between the two SCHNAaP origins
@@ -1024,8 +1026,9 @@ def _check_dssr_criteria(basepair, min_atoms_per_base, unique):
return -1
# Criterion 3: Angle between normal vectors <=65°
- if not (np.arccos(np.dot(normal_vectors[0], normal_vectors[1]))
- >= ((115*np.pi)/180)):
+ if not (
+ np.arccos(np.dot(normal_vectors[0], normal_vectors[1])) >= ((115 * np.pi) / 180)
+ ):
return -1
# Criterion 4: Absence of stacking
@@ -1035,8 +1038,7 @@ def _check_dssr_criteria(basepair, min_atoms_per_base, unique):
# Criterion 5: Presence of at least one hydrogen bond
#
# Check if both bases came with hydrogens.
- if (("H" in basepair[0].element)
- and ("H" in basepair[1].element)):
+ if ("H" in basepair[0].element) and ("H" in basepair[1].element):
# For Structures that contain hydrogens, check for their
# presence directly.
#
@@ -1044,11 +1046,13 @@ def _check_dssr_criteria(basepair, min_atoms_per_base, unique):
potential_basepair = basepair[0] + basepair[1]
# Get the number of hydrogen bonds
- bonds = len(hbond(
- potential_basepair,
- np.ones_like(potential_basepair, dtype=bool),
- np.ones_like(potential_basepair, dtype=bool)
- ))
+ bonds = len(
+ hbond(
+ potential_basepair,
+ np.ones_like(potential_basepair, dtype=bool),
+ np.ones_like(potential_basepair, dtype=bool),
+ )
+ )
if bonds > 0:
return bonds
@@ -1085,7 +1089,7 @@ def _check_base_stacking(aromatic_ring_centers, normal_vectors):
wrong_distance = True
for ring_center1 in aromatic_ring_centers[0]:
for ring_center2 in aromatic_ring_centers[1]:
- if (distance(ring_center1, ring_center2) <= 4.5):
+ if distance(ring_center1, ring_center2) <= 4.5:
wrong_distance = False
normalized_distance_vectors.append(ring_center2 - ring_center1)
norm_vector(normalized_distance_vectors[-1])
@@ -1106,8 +1110,7 @@ def _check_base_stacking(aromatic_ring_centers, normal_vectors):
dist_normal_vector_angle = np.rad2deg(
np.arccos(np.dot(normal_vector, normalized_dist_vector))
)
- if ((dist_normal_vector_angle >= 40) and
- (dist_normal_vector_angle <= 140)):
+ if (dist_normal_vector_angle >= 40) and (dist_normal_vector_angle <= 140):
return False
return True
@@ -1142,19 +1145,19 @@ def _match_base(nucleotide, min_atoms_per_base):
if one_letter_code is None:
return None
- if (one_letter_code == 'A'):
+ if one_letter_code == "A":
std_base = _STD_ADENINE
std_ring_centers = _STD_ADENINE_RING_CENTERS
- elif (one_letter_code == 'T'):
+ elif one_letter_code == "T":
std_base = _STD_THYMINE
std_ring_centers = _STD_THYMINE_RING_CENTERS
- elif (one_letter_code == 'C'):
+ elif one_letter_code == "C":
std_base = _STD_CYTOSINE
std_ring_centers = _STD_CYTOSINE_RING_CENTERS
- elif (one_letter_code == 'G'):
+ elif one_letter_code == "G":
std_base = _STD_GUANINE
std_ring_centers = _STD_GUANINE_RING_CENTERS
- elif (one_letter_code == 'U'):
+ elif one_letter_code == "U":
std_base = _STD_URACIL
std_ring_centers = _STD_URACIL_RING_CENTERS
@@ -1162,16 +1165,10 @@ def _match_base(nucleotide, min_atoms_per_base):
vectors = np.vstack((vectors, std_ring_centers))
# Select the matching atoms of the nucleotide and the standard base
- nucleotide_matched = nucleotide[
- np.isin(nucleotide.atom_name, std_base.atom_name)
- ]
- std_base_matched = std_base[
- np.isin(std_base.atom_name, nucleotide.atom_name)
- ]
+ nucleotide_matched = nucleotide[np.isin(nucleotide.atom_name, std_base.atom_name)]
+ std_base_matched = std_base[np.isin(std_base.atom_name, nucleotide.atom_name)]
# Ensure the nucleotide does not contain duplicate atom names
- _, unique_indices = np.unique(
- nucleotide_matched.atom_name, return_index=True
- )
+ _, unique_indices = np.unique(nucleotide_matched.atom_name, return_index=True)
nucleotide_matched = nucleotide_matched[unique_indices]
# Only continue if minimum number of matching atoms is reached
if len(nucleotide_matched) < min_atoms_per_base:
@@ -1179,21 +1176,19 @@ def _match_base(nucleotide, min_atoms_per_base):
f"Nucleotide with res_id {nucleotide.res_id[0]} and "
f"chain_id {nucleotide.chain_id[0]} has less than 3 base "
f"atoms, unable to check for base pair.",
- IncompleteStructureWarning
+ IncompleteStructureWarning,
)
return None
# Reorder the atoms of the nucleotide to obtain the standard RCSB
# PDB atom order.
- nucleotide_matched = nucleotide_matched[
- standardize_order(nucleotide_matched)
- ]
+ nucleotide_matched = nucleotide_matched[standardize_order(nucleotide_matched)]
# Match the selected std_base to the base.
_, transformation = superimpose(nucleotide_matched, std_base_matched)
vectors = transformation.apply(vectors)
# Normalize the base-normal-vector
- vectors[1,:] = vectors[1,:]-vectors[0,:]
- norm_vector(vectors[1,:])
+ vectors[1, :] = vectors[1, :] - vectors[0, :]
+ norm_vector(vectors[1, :])
return vectors
@@ -1259,8 +1254,11 @@ def map_nucleotide(residue, min_atoms_per_base=3, rmsd_cutoff=0.28):
# List of the standard bases for easy iteration
std_base_list = [
- _STD_ADENINE, _STD_THYMINE, _STD_CYTOSINE, _STD_GUANINE,
- _STD_URACIL
+ _STD_ADENINE,
+ _STD_THYMINE,
+ _STD_CYTOSINE,
+ _STD_GUANINE,
+ _STD_URACIL,
]
# The number of matched atoms for each 'standard' base
@@ -1275,7 +1273,7 @@ def map_nucleotide(residue, min_atoms_per_base=3, rmsd_cutoff=0.28):
f"{residue.chain_id[0]} has an overlap with the reference "
f"bases which is less than {min_atoms_per_base} atoms. "
f"Unable to map nucleotide.",
- IncompleteStructureWarning
+ IncompleteStructureWarning,
)
return None, False
@@ -1284,7 +1282,7 @@ def map_nucleotide(residue, min_atoms_per_base=3, rmsd_cutoff=0.28):
# Iterate through the reference bases with the maximum number of
# matching atoms
- for ref_base in np.array(std_base_list, dtype='object')[
+ for ref_base in np.array(std_base_list, dtype="object")[
np.array(matched_atom_no) == np.max(matched_atom_no)
]:
# Copy the residue as the res_name property of the ``AtomArray``
@@ -1293,12 +1291,8 @@ def map_nucleotide(residue, min_atoms_per_base=3, rmsd_cutoff=0.28):
# Select the matching atoms of the nucleotide and the reference
# base
- nuc = nuc[
- np.isin(nuc.atom_name, ref_base.atom_name)
- ]
- ref_base_matched = ref_base[
- np.isin(ref_base.atom_name, nuc.atom_name)
- ]
+ nuc = nuc[np.isin(nuc.atom_name, ref_base.atom_name)]
+ ref_base_matched = ref_base[np.isin(ref_base.atom_name, nuc.atom_name)]
# Set the res_name property to the same as the reference base.
# This is a requirement for ``standardize_order``
@@ -1319,14 +1313,14 @@ def map_nucleotide(residue, min_atoms_per_base=3, rmsd_cutoff=0.28):
# If the RMSD is lower than the specified cutoff or better than
# a previous found reference, the current reference is selected
# as best base
- if(rmsd(fitted, ref_base_matched) < rmsd_cutoff):
+ if rmsd(fitted, ref_base_matched) < rmsd_cutoff:
rmsd_cutoff = rmsd(fitted, ref_base_matched)
best_base = ref_base_matched.res_name[0][-1]
if best_base is None:
warnings.warn(
f"Base Type {residue.res_name[0]} not supported. ",
- UnexpectedStructureWarning
+ UnexpectedStructureWarning,
)
return None
@@ -1360,9 +1354,9 @@ def _get_proximate_residues(atom_array, boolean_mask, cutoff):
# Get the indices of the atoms that are within the maximum cutoff
# of each other
- indices = CellList(
- atom_array, cutoff, selection=boolean_mask
- ).get_atoms(atom_array.coord[boolean_mask], cutoff)
+ indices = CellList(atom_array, cutoff, selection=boolean_mask).get_atoms(
+ atom_array.coord[boolean_mask], cutoff
+ )
# Loop through the indices of potential partners
pairs = []
@@ -1375,16 +1369,12 @@ def _get_proximate_residues(atom_array, boolean_mask, cutoff):
# indices.
pairs = np.array(pairs)
basepair_candidates_shape = pairs.shape
- pairs = get_residue_starts_for(
- atom_array, pairs.flatten()
- ).reshape(basepair_candidates_shape)
+ pairs = get_residue_starts_for(atom_array, pairs.flatten()).reshape(
+ basepair_candidates_shape
+ )
# Remove candidates where the pairs are from the same residue
- pairs = np.delete(
- pairs, np.where(
- pairs[:,0] == pairs[:,1]
- ), axis=0
- )
+ pairs = np.delete(pairs, np.where(pairs[:, 0] == pairs[:, 1]), axis=0)
# Sort the residue starts for each pair
for i, candidate in enumerate(pairs):
pairs[i] = sorted(candidate)
@@ -1411,5 +1401,4 @@ def _filter_atom_type(atom_array, atom_names):
This array is ``True`` for all indices in the :class:`AtomArray`
, where the atom has the desired atom names.
"""
- return (np.isin(atom_array.atom_name, atom_names)
- & (atom_array.res_id != -1))
+ return np.isin(atom_array.atom_name, atom_names) & (atom_array.res_id != -1)
diff --git a/src/biotite/structure/bonds.pyx b/src/biotite/structure/bonds.pyx
index 783fbda0e..e3d30105c 100644
--- a/src/biotite/structure/bonds.pyx
+++ b/src/biotite/structure/bonds.pyx
@@ -1330,6 +1330,7 @@ def _invert_index(IndexType[:] index_v, uint32 length):
+# fmt: off
_DEFAULT_DISTANCE_RANGE = {
# Taken from Allen et al.
# min - 2*std max + 2*std
@@ -1376,6 +1377,7 @@ _DEFAULT_DISTANCE_RANGE = {
("SE", "SE") : (2.340 - 2*0.024, 2.340 + 2*0.024),
("SI", "SE") : (2.359 - 2*0.012, 2.359 + 2*0.012),
}
+# fmt: on
def connect_via_distances(atoms, dict distance_range=None, bint inter_residue=True,
default_bond_type=BondType.ANY, bint periodic=False):
diff --git a/src/biotite/structure/box.py b/src/biotite/structure/box.py
index ae4918add..41349bb9d 100644
--- a/src/biotite/structure/box.py
+++ b/src/biotite/structure/box.py
@@ -4,25 +4,33 @@
"""
Functions related to working with the simulation box or unit cell
-of a structure
+of a structure
"""
__name__ = "biotite.structure"
__author__ = "Patrick Kunzmann"
-__all__ = ["vectors_from_unitcell", "unitcell_from_vectors", "box_volume",
- "repeat_box", "repeat_box_coord", "move_inside_box",
- "remove_pbc", "remove_pbc_from_coord",
- "coord_to_fraction", "fraction_to_coord", "is_orthogonal"]
+__all__ = [
+ "vectors_from_unitcell",
+ "unitcell_from_vectors",
+ "box_volume",
+ "repeat_box",
+ "repeat_box_coord",
+ "move_inside_box",
+ "remove_pbc",
+ "remove_pbc_from_coord",
+ "coord_to_fraction",
+ "fraction_to_coord",
+ "is_orthogonal",
+]
-from collections.abc import Iterable
from numbers import Integral
import numpy as np
import numpy.linalg as linalg
-from .util import vector_dot
-from .atoms import repeat
-from .molecules import get_molecule_masks
-from .chains import get_chain_masks, get_chain_starts
-from .error import BadStructureError
+from biotite.structure.atoms import repeat
+from biotite.structure.chains import get_chain_masks, get_chain_starts
+from biotite.structure.error import BadStructureError
+from biotite.structure.molecules import get_molecule_masks
+from biotite.structure.util import vector_dot
def vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma):
@@ -41,7 +49,7 @@ def vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma):
The angles between the box vectors in radians.
*alpha* is the angle between *b* and *c*,
*beta* between *a* and *c*, *gamma* between *a* and *b*
-
+
Returns
-------
box : ndarray, dtype=float, shape=(3,3)
@@ -49,7 +57,7 @@ def vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma):
The vector components are in the last dimension.
The value can be directly used as :attr:`box` attribute in an
atom array.
-
+
See also
--------
unitcell_from_vectors
@@ -58,19 +66,15 @@ def vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma):
b_x = len_b * np.cos(gamma)
b_y = len_b * np.sin(gamma)
c_x = len_c * np.cos(beta)
- c_y = len_c * (np.cos(alpha) - np.cos(beta)*np.cos(gamma)) / np.sin(gamma)
- c_z = np.sqrt(len_c*len_c - c_x*c_x - c_y*c_y)
- box = np.array([
- [a_x, 0, 0],
- [b_x, b_y, 0],
- [c_x, c_y, c_z]
- ], dtype=np.float32)
-
+ c_y = len_c * (np.cos(alpha) - np.cos(beta) * np.cos(gamma)) / np.sin(gamma)
+ c_z = np.sqrt(len_c * len_c - c_x * c_x - c_y * c_y)
+ box = np.array([[a_x, 0, 0], [b_x, b_y, 0], [c_x, c_y, c_z]], dtype=np.float32)
+
# Fix numerical errors, as values, that are actually 0,
# might not be calculated as such
tol = 1e-4 * (len_a + len_b + len_c)
box[np.abs(box) < tol] = 0
-
+
return box
@@ -84,7 +88,7 @@ def unitcell_from_vectors(box):
----------
box : ndarray, shape=(3,3)
The box vectors
-
+
Returns
-------
len_a, len_b, len_c : float
@@ -103,7 +107,7 @@ def unitcell_from_vectors(box):
len_b = linalg.norm(b)
len_c = linalg.norm(c)
alpha = np.arccos(np.dot(b, c) / (len_b * len_c))
- beta = np.arccos(np.dot(a, c) / (len_a * len_c))
+ beta = np.arccos(np.dot(a, c) / (len_a * len_c))
gamma = np.arccos(np.dot(a, b) / (len_a * len_b))
return len_a, len_b, len_c, alpha, beta, gamma
@@ -116,7 +120,7 @@ def box_volume(box):
----------
box : ndarray, shape=(3,3) or shape=(m,3,3)
One or multiple boxes to get the volume for.
-
+
Returns
-------
volume : float or ndarray, shape=(m,)
@@ -159,7 +163,7 @@ def repeat_box(atoms, amount=1):
Indices to the atoms in the original atom array (stack).
Equal to
``numpy.tile(np.arange(atoms.array_length()), (1 + 2 * amount) ** 3)``.
-
+
See also
--------
repeat_box_coord
@@ -232,12 +236,12 @@ def repeat_box(atoms, amount=1):
"""
if atoms.box is None:
raise BadStructureError("Structure has no box")
-
+
repeat_coord, indices = repeat_box_coord(atoms.coord, atoms.box)
# Unroll repeated coordinates for input to 'repeat()'
if repeat_coord.ndim == 2:
repeat_coord = repeat_coord.reshape(-1, atoms.array_length(), 3)
- else: # ndim == 3
+ else: # ndim == 3
repeat_coord = repeat_coord.reshape(
atoms.stack_depth(), -1, atoms.array_length(), 3
)
@@ -283,16 +287,15 @@ def repeat_box_coord(coord, box, amount=1):
raise TypeError("The amount must be an integer")
# List of numpy arrays for each box repeat
coords_for_boxes = [coord]
- for i in range(-amount, amount+1):
- for j in range(-amount, amount+1):
- for k in range(-amount, amount+1):
+ for i in range(-amount, amount + 1):
+ for j in range(-amount, amount + 1):
+ for k in range(-amount, amount + 1):
# Omit the central box
if i != 0 or j != 0 or k != 0:
temp_coord = coord.copy()
# Shift coordinates to adjacent box/unit cell
translation_vec = np.sum(
- box * np.array([i,j,k])[:, np.newaxis],
- axis=-2
+ box * np.array([i, j, k])[:, np.newaxis], axis=-2
)
# 'newaxis' to perform same translation on all
# atoms for each model
@@ -300,7 +303,7 @@ def repeat_box_coord(coord, box, amount=1):
coords_for_boxes.append(temp_coord)
return (
np.concatenate(coords_for_boxes, axis=-2),
- np.tile(np.arange(coord.shape[-2]), (1 + 2 * amount) ** 3)
+ np.tile(np.arange(coord.shape[-2]), (1 + 2 * amount) ** 3),
)
@@ -323,16 +326,16 @@ def move_inside_box(coord, box):
The box(es) for one or multiple models.
When `coord` is given for multiple models, :attr:`box` must be
given for multiple models as well.
-
+
Returns
-------
moved_coord : ndarray, dtype=float, shape=(n,3) or shape=(m,n,3)
The moved coordinates.
Has the same shape is the input `coord`.
-
+
Examples
--------
-
+
>>> box = np.array([[10,0,0], [0,10,0], [0,0,10]], dtype=float)
>>> inside_coord = [ 1, 2, 3]
>>> outside_coord = [ 1, 22, 54]
@@ -363,7 +366,7 @@ def remove_pbc(atoms, selection=None):
To determine the molecules the structure is required to have an
associated `BondList`.
Otherwise segmentation removal is performed on a per-chain basis.
-
+
Parameters
----------
atoms : AtomArray, shape=(n,) or AtomArrayStack, shape=(m,n)
@@ -373,13 +376,13 @@ def remove_pbc(atoms, selection=None):
selection : ndarray, dtype=bool, shape=(n,)
Specifies which parts of `atoms` are sanitized, i.e the
segmentation is removed.
-
+
Returns
-------
sanitized_atoms : AtomArray or AtomArrayStack
The input structure with removed segmentation over periodic
boundaries.
-
+
See also
--------
remove_pbc_from_coord
@@ -392,12 +395,10 @@ def remove_pbc(atoms, selection=None):
half box size.
"""
# Avoid circular import
- from .geometry import centroid
-
+ from biotite.structure.geometry import centroid
+
if atoms.box is None:
- raise BadStructureError(
- "The 'box' attribute must be set in the structure"
- )
+ raise BadStructureError("The 'box' attribute must be set in the structure")
new_atoms = atoms.copy()
if atoms.bonds is not None:
@@ -414,10 +415,8 @@ def remove_pbc(atoms, selection=None):
)
# Put center of molecule into box
center = centroid(new_atoms.coord[..., mask, :])[..., np.newaxis, :]
- center_in_box = move_inside_box(
- center, new_atoms.box
- )
- new_atoms.coord[..., mask, :] += (center_in_box - center)
+ center_in_box = move_inside_box(center, new_atoms.box)
+ new_atoms.coord[..., mask, :] += center_in_box - center
return new_atoms
@@ -433,11 +432,11 @@ def remove_pbc_from_coord(coord, box):
the displacement coordinates in adjacent array positions.
Basically, this function performs the reverse action of
:func:`move_inside_box()`.
-
+
Parameters
----------
coord : ndarray, dtype=float, shape=(m,n,3) or shape=(n,3)
- The coordinates of the potentially segmented structure.
+ The coordinates of the potentially segmented structure.
box : ndarray, dtype=float, shape=(m,3,3) or shape=(3,3)
The simulation box or unit cell that is used as periodic
boundary.
@@ -447,7 +446,7 @@ def remove_pbc_from_coord(coord, box):
-------
sanitized_coord : ndarray, dtype=float, shape=(m,n,3) or shape=(n,3)
The reassembled coordinates.
-
+
See also
--------
remove_pbc_from_coord
@@ -463,19 +462,14 @@ def remove_pbc_from_coord(coord, box):
"""
# Import in function to avoid circular import
- from .geometry import index_displacement
+ from biotite.structure.geometry import index_displacement
+
# Get the PBC-sanitized displacements of all coordinates
# to the respective next coordinate
index_pairs = np.stack(
- [
- np.arange(0, coord.shape[-2] - 1),
- np.arange(1, coord.shape[-2] )
- ],
- axis=1
- )
- neighbour_disp = index_displacement(
- coord, index_pairs, box=box, periodic=True
+ [np.arange(0, coord.shape[-2] - 1), np.arange(1, coord.shape[-2])], axis=1
)
+ neighbour_disp = index_displacement(coord, index_pairs, box=box, periodic=True)
# Get the PBC-sanitized displacements of all but the first
# coordinates to (0,0,0)
absolute_disp = np.cumsum(neighbour_disp, axis=-2)
@@ -501,19 +495,19 @@ def coord_to_fraction(coord, box):
The box(es) for one or multiple models.
When `coord` is given for multiple models, :attr:`box` must be
given for multiple models as well.
-
+
Returns
-------
fraction : ndarray, dtype=float, shape=(n,3) or shape=(m,n,3)
The fractions of the box vectors.
-
+
See also
--------
fraction_to_coord
Examples
--------
-
+
>>> box = np.array([[5,0,0], [0,5,0], [0,5,5]], dtype=float)
>>> coord = np.array(
... [[1,1,1], [10,0,0], [0,0,10], [-5,2,1]],
@@ -548,12 +542,12 @@ def fraction_to_coord(fraction, box):
The box(es) for one or multiple models.
When `coord` is given for multiple models, :attr:`box` must be
given for multiple models as well.
-
+
Returns
-------
coord : ndarray, dtype=float, shape=(n,3) or shape=(m,n,3)
The coordinates.
-
+
See also
--------
coord_to_fraction
@@ -572,12 +566,12 @@ def is_orthogonal(box):
----------
box : ndarray, dtype=float, shape=(3,3) or shape=(m,3,3)
A single box or multiple boxes.
-
+
Returns
-------
is_orthgonal : bool or ndarray, shape=(m,), dtype=bool
True, if the box vectors are orthogonal, false otherwise
-
+
Notes
-----
Due to possible numerical errors, this function also evaluates two
@@ -587,6 +581,8 @@ def is_orthogonal(box):
# Fix numerical errors, as values, that are actually 0,
# might not be calculated as such
tol = 1e-6
- return (np.abs(vector_dot(box[..., 0, :], box[..., 1, :])) < tol) & \
- (np.abs(vector_dot(box[..., 0, :], box[..., 2, :])) < tol) & \
- (np.abs(vector_dot(box[..., 1, :], box[..., 2, :])) < tol)
\ No newline at end of file
+ return (
+ (np.abs(vector_dot(box[..., 0, :], box[..., 1, :])) < tol)
+ & (np.abs(vector_dot(box[..., 0, :], box[..., 2, :])) < tol)
+ & (np.abs(vector_dot(box[..., 1, :], box[..., 2, :])) < tol)
+ )
diff --git a/src/biotite/structure/chains.py b/src/biotite/structure/chains.py
index df3134267..c4bbd4996 100644
--- a/src/biotite/structure/chains.py
+++ b/src/biotite/structure/chains.py
@@ -9,22 +9,38 @@
__name__ = "biotite.structure"
__author__ = "Patrick Kunzmann"
-__all__ = ["get_chain_starts", "apply_chain_wise", "spread_chain_wise",
- "get_chain_masks", "get_chain_starts_for", "get_chain_positions",
- "chain_iter", "get_chains", "get_chain_count", "chain_iter"]
+__all__ = [
+ "get_chain_starts",
+ "apply_chain_wise",
+ "spread_chain_wise",
+ "get_chain_masks",
+ "get_chain_starts_for",
+ "get_chain_positions",
+ "chain_iter",
+ "get_chains",
+ "get_chain_count",
+ "chain_iter",
+]
import numpy as np
-from .resutil import *
+from biotite.structure.segments import (
+ apply_segment_wise,
+ get_segment_masks,
+ get_segment_positions,
+ get_segment_starts_for,
+ segment_iter,
+ spread_segment_wise,
+)
def get_chain_starts(array, add_exclusive_stop=False):
"""
Get the indices in an atom array, which indicates the beginning of
a new chain.
-
+
A new chain starts, when the chain ID changes or when the residue ID
decreases.
-
+
Parameters
----------
array : AtomArray or AtomArrayStack
@@ -33,17 +49,17 @@ def get_chain_starts(array, add_exclusive_stop=False):
If true, the exclusive stop of the input atom array, i.e.
``array.array_length()``, is added to the returned array of
start indices as last element.
-
+
Returns
-------
starts : ndarray, dtype=int
The start indices of new chains in `array`.
-
+
Notes
-----
This method is internally used by all other chain-related
functions.
-
+
See also
--------
get_residue_starts
@@ -51,13 +67,13 @@ def get_chain_starts(array, add_exclusive_stop=False):
diff = np.diff(array.res_id)
res_id_decrement = diff < 0
# This mask is 'true' at indices where the value changes
- chain_id_changes = (array.chain_id[1:] != array.chain_id[:-1])
-
+ chain_id_changes = array.chain_id[1:] != array.chain_id[:-1]
+
# Convert mask to indices
# Add 1, to shift the indices from the end of a chain
# to the start of a new chain
chain_starts = np.where(res_id_decrement | chain_id_changes)[0] + 1
-
+
# The first chain is not included yet -> Insert '[0]'
if add_exclusive_stop:
return np.concatenate(([0], chain_starts, [array.array_length()]))
@@ -69,7 +85,7 @@ def apply_chain_wise(array, data, function, axis=None):
"""
Apply a function to intervals of data, where each interval
corresponds to one chain.
-
+
The function takes an atom array (stack) and an data array
(`ndarray`) of the same length. The function iterates through the
chain IDs of the atom array (stack) and identifies intervals of
@@ -77,8 +93,8 @@ def apply_chain_wise(array, data, function, axis=None):
partitioned into the same intervals, and each interval (also an
:class:`ndarray`) is put as parameter into `function`. Each return value is
stored as element in the resulting :class:`ndarray`, therefore each element
- corresponds to one chain.
-
+ corresponds to one chain.
+
Parameters
----------
array : AtomArray or AtomArrayStack
@@ -92,14 +108,14 @@ def apply_chain_wise(array, data, function, axis=None):
must return a value with the same shape and data type.
axis : int, optional
This value is given to the `axis` parameter of `function`.
-
+
Returns
-------
processed_data : ndarray
Chain-wise evaluation of `data` by `function`. The size of the
first dimension of this array is equal to the amount of
chains.
-
+
See also
--------
apply_residue_wise
@@ -114,11 +130,11 @@ def spread_chain_wise(array, input_data):
Each value in the chain-wise input is assigned to all atoms of
this chain:
-
+
``output_data[i] = input_data[j]``,
*i* is incremented from atom to atom,
*j* is incremented every chain change.
-
+
Parameters
----------
array : AtomArray or AtomArrayStack
@@ -126,13 +142,13 @@ def spread_chain_wise(array, input_data):
input_data : ndarray
The data to be spread. The length of axis=0 must be equal to
the amount of different chain IDs in `array`.
-
+
Returns
-------
output_data : ndarray
Chain-wise spread `input_data`. Length is the same as
`array_length()` of `array`.
-
+
See also
--------
spread_residue_wise
@@ -154,14 +170,14 @@ def get_chain_masks(array, indices):
These indices indicate the atoms to get the corresponding
chains for.
Negative indices are not allowed.
-
+
Returns
-------
chains_masks : ndarray, dtype=bool, shape=(k,n)
Multiple boolean masks, one for each given index in `indices`.
Each array masks the atoms that belong to the same chain as
the atom at the given index.
-
+
See also
--------
get_residue_masks
@@ -183,13 +199,13 @@ def get_chain_starts_for(array, indices):
These indices point to the atoms to get the corresponding
chain starts for.
Negative indices are not allowed.
-
+
Returns
-------
start_indices : ndarray, dtype=int, shape=(k,)
The indices that point to the chain starts for the input
`indices`.
-
+
See also
--------
get_residue_starts_for
@@ -214,12 +230,12 @@ def get_chain_positions(array, indices):
These indices point to the atoms to get the corresponding
chain positions for.
Negative indices are not allowed.
-
+
Returns
-------
start_indices : ndarray, dtype=int, shape=(k,)
The indices that point to the position of the chains.
-
+
See also
--------
get_residue_positions
@@ -231,20 +247,20 @@ def get_chain_positions(array, indices):
def get_chains(array):
"""
Get the chain IDs of an atom array (stack).
-
+
The chains are listed in the same order they occur in the array
(stack).
-
+
Parameters
----------
array : AtomArray or AtomArrayStack
The atom array (stack), where the chains are determined.
-
+
Returns
-------
ids : ndarray, dtype=str
List of chain IDs.
-
+
See also
--------
get_residues
@@ -255,20 +271,20 @@ def get_chains(array):
def get_chain_count(array):
"""
Get the amount of chains in an atom array (stack).
-
+
The count is determined from the `chain_id` annotation.
Each time the chain ID changes, the count is incremented.
-
+
Parameters
----------
array : AtomArray or AtomArrayStack
The atom array (stack), where the chains are counted.
-
+
Returns
-------
count : int
Amount of chains.
-
+
See also
--------
get_residue_count
@@ -279,20 +295,20 @@ def get_chain_count(array):
def chain_iter(array):
"""
Iterate over all chains in an atom array (stack).
-
+
Parameters
----------
array : AtomArray or AtomArrayStack
The atom array (stack) to iterate over.
-
+
Yields
------
chain : AtomArray or AtomArrayStack
A single chain of the input `array`.
-
+
See also
--------
residue_iter
"""
starts = get_chain_starts(array, add_exclusive_stop=True)
- return segment_iter(array, starts)
\ No newline at end of file
+ return segment_iter(array, starts)
diff --git a/src/biotite/structure/compare.py b/src/biotite/structure/compare.py
index abb6b7e9f..bdce1d7a0 100644
--- a/src/biotite/structure/compare.py
+++ b/src/biotite/structure/compare.py
@@ -12,21 +12,21 @@
__all__ = ["rmsd", "rmspd", "rmsf", "average"]
import numpy as np
-from .atoms import Atom, AtomArray, AtomArrayStack, coord
-from .geometry import index_distance
-from .util import vector_dot
+from biotite.structure.atoms import AtomArrayStack, coord
+from biotite.structure.geometry import index_distance
+from biotite.structure.util import vector_dot
def rmsd(reference, subject):
r"""
Calculate the RMSD between two structures.
-
+
The *root-mean-square-deviation* (RMSD) indicates the overall
deviation of each model of a structure to a reference structure.
It is defined as:
-
+
.. math:: RMSD = \sqrt{ \frac{1}{n} \sum\limits_{i=1}^n (x_i - x_{ref,i})^2}
-
+
Parameters
----------
reference : AtomArray or ndarray, dtype=float, shape=(n,3)
@@ -37,7 +37,7 @@ def rmsd(reference, subject):
Structure(s) to be compared with `reference`.
Alternatively, coordinates can be provided directly as
:class:`ndarray`.
-
+
Returns
-------
rmsd : float or ndarray, dtype=float, shape=(m,)
@@ -45,7 +45,7 @@ def rmsd(reference, subject):
If subject is an :class:`AtomArray` a float is returned.
If subject is an :class:`AtomArrayStack` a :class:`ndarray`
containing the RMSD for each model is returned.
-
+
See Also
--------
rmsf
@@ -71,16 +71,17 @@ def rmsd(reference, subject):
"""
return np.sqrt(np.mean(_sq_euclidian(reference, subject), axis=-1))
+
def rmspd(reference, subject, periodic=False, box=None):
r"""
- Calculate the RMSD of atom pair distances for given structures
+ Calculate the RMSD of atom pair distances for given structures
relative to those found in a reference structure.
- Unlike the standard RMSD, the *root-mean-square-pairwise-deviation*
- (RMSPD) is a fit-free method to determine deviations between
+ Unlike the standard RMSD, the *root-mean-square-pairwise-deviation*
+ (RMSPD) is a fit-free method to determine deviations between
a structure and a preset reference.
- .. math:: RMSPD = \sqrt{ \frac{1}{n^2} \sum\limits_{i=1}^n \sum\limits_{j \neq i}^n (d_{ij} - d_{ref,ij})^2}
+ .. math:: RMSPD = \sqrt{ \frac{1}{n^2} \sum\limits_{i=1}^n \sum\limits_{j \neq i}^n (d_{ij} - d_{ref,ij})^2}
Parameters
----------
@@ -102,7 +103,7 @@ def rmspd(reference, subject, periodic=False, box=None):
box : ndarray, shape=(3,3) or shape=(m,3,3), optional
If this parameter is set, the given box is used instead of the
`box` attribute of `atoms`.
-
+
Returns
-------
rmspd : float or ndarray, dtype=float, shape=(m,)
@@ -110,7 +111,7 @@ def rmspd(reference, subject, periodic=False, box=None):
If subject is an :class:`AtomArray` a float is returned.
If subject is an :class:`AtomArrayStack` a :class:`ndarray`
containing the RMSD for each model is returned.
-
+
Warnings
--------
Internally, this function uses :func:`index_distance()`.
@@ -119,7 +120,7 @@ def rmspd(reference, subject, periodic=False, box=None):
prior to the computation of RMSPDs with `periodic` set to false
to ensure correct results.
(e.g. with :func:`remove_pbc()`).
-
+
See also
--------
index_distance
@@ -134,9 +135,10 @@ def rmspd(reference, subject, periodic=False, box=None):
refdist = index_distance(reference, pairs, periodic=periodic, box=box)
subjdist = index_distance(subject, pairs, periodic=periodic, box=box)
- rmspd = np.sqrt(np.sum((subjdist - refdist)**2, axis = -1))/reflen
+ rmspd = np.sqrt(np.sum((subjdist - refdist) ** 2, axis=-1)) / reflen
return rmspd
+
def rmsf(reference, subject):
r"""
Calculate the RMSF between two structures.
@@ -146,9 +148,9 @@ def rmsf(reference, subject):
models.
Usually the reference structure, is the average over all models.
The RMSF is defined as:
-
+
.. math:: RMSF(i) = \sqrt{ \frac{1}{T} \sum\limits_{t=1}^T (x_i(t) - x_{ref,i}(t))^2}
-
+
Parameters
----------
reference : AtomArray or ndarray, dtype=float, shape=(n,3)
@@ -161,14 +163,14 @@ def rmsf(reference, subject):
:class:`AtomArrayStack`.
Alternatively, coordinates can be provided directly as
:class:`ndarray`.
-
+
Returns
-------
rmsf : ndarray, dtype=float, shape=(n,)
RMSF between subject and reference structure.
Each element gives the RMSF for the atom at the respective
index.
-
+
See Also
--------
rmsd
@@ -198,41 +200,39 @@ def rmsf(reference, subject):
def average(atoms):
"""
Calculate an average structure.
-
+
The average structure has the average coordinates
of the input models.
-
+
Parameters
----------
atoms : AtomArrayStack or ndarray, dtype=float, shape=(m,n,3)
The structure models to be averaged.
Alternatively, coordinates can be provided directly as
:class:`ndarray`.
-
+
Returns
-------
average : AtomArray or ndarray, dtype=float, shape=(n,3)
Structure with averaged atom coordinates.
If `atoms` is a :class:`ndarray` and :class:`ndarray` is also
returned.
-
+
See Also
--------
rmsd, rmsf
-
+
Notes
-----
The calculated average structure is not suitable for visualization
or geometric calculations, since bond lengths and angles will
deviate from meaningful values.
This method is rather useful to provide a reference structure for
- calculation of e.g. the RMSD or RMSF.
+ calculation of e.g. the RMSD or RMSF.
"""
coords = coord(atoms)
if coords.ndim != 3:
- raise TypeError(
- "Expected an AtomArrayStack or an ndarray with shape (m,n,3)"
- )
+ raise TypeError("Expected an AtomArrayStack or an ndarray with shape (m,n,3)")
mean_coords = np.mean(coords, axis=0)
if isinstance(atoms, AtomArrayStack):
mean_array = atoms[0].copy()
@@ -246,7 +246,7 @@ def _sq_euclidian(reference, subject):
"""
Calculate squared euclidian distance between atoms in two
structures.
-
+
Parameters
----------
reference : AtomArray or ndarray, dtype=float, shape=(n,3)
@@ -254,7 +254,7 @@ def _sq_euclidian(reference, subject):
subject : AtomArray or AtomArrayStack or ndarray, dtype=float, shape=(n,3) or shape=(m,n,3)
Structure(s) whose atoms squared euclidian distance to
`reference` is measured.
-
+
Returns
-------
ndarray, dtype=float, shape=(n,) or shape=(m,n)
@@ -271,4 +271,4 @@ def _sq_euclidian(reference, subject):
"Expected an AtomArray or an ndarray with shape (n,3) as reference"
)
dif = subject_coord - reference_coord
- return vector_dot(dif, dif)
\ No newline at end of file
+ return vector_dot(dif, dif)
diff --git a/src/biotite/structure/density.py b/src/biotite/structure/density.py
index 5f6043412..86f24d53e 100644
--- a/src/biotite/structure/density.py
+++ b/src/biotite/structure/density.py
@@ -11,11 +11,10 @@
__all__ = ["density"]
import numpy as np
-from .atoms import coord
+from biotite.structure.atoms import coord
-def density(atoms, selection=None, delta=1.0, bins=None,
- density=False, weights=None):
+def density(atoms, selection=None, delta=1.0, bins=None, density=False, weights=None):
r"""
Compute the density of the selected atoms.
@@ -51,13 +50,13 @@ def density(atoms, selection=None, delta=1.0, bins=None,
Otherwise, returns the probability density function of each bin.
See :func:`numpy.histogramdd()` for further details.
weights: ndarray, shape=(n,) or shape=(m,n), optional
- An array of values to weight the contribution of *n* atoms in
+ An array of values to weight the contribution of *n* atoms in
*m* models.
If the shape is *(n,)*, the weights will be interpreted as
*per atom*.
A shape of *(m,n)* allows to additionally weight atoms on a
*per model* basis.
-
+
Returns
-------
H : ndarray, dtype=float
@@ -69,12 +68,12 @@ def density(atoms, selection=None, delta=1.0, bins=None,
A list containing the 3 arrays describing the bin edges.
"""
coords = coord(atoms)
-
+
is_stack = coords.ndim == 3
# Define the grid for coordinate binning based on coordinates of
# supplied atoms
- # This makes the binning independent of a supplied box vector and
+ # This makes the binning independent of a supplied box vector and
# fluctuating box dimensions are not a problem
# However, this means that the user has to make sure the region of
# interest is in the center of the box, i.e. by centering the
@@ -84,19 +83,17 @@ def density(atoms, selection=None, delta=1.0, bins=None,
axis = (0, 1)
else:
axis = 0
- grid_min, grid_max = np.min(
- coords, axis=axis), np.max(coords, axis=axis
- )
+ grid_min, grid_max = np.min(coords, axis=axis), np.max(coords, axis=axis)
bins = [
- np.arange(grid_min[0], grid_max[0]+delta, delta),
- np.arange(grid_min[1], grid_max[1]+delta, delta),
- np.arange(grid_min[2], grid_max[2]+delta, delta),
+ np.arange(grid_min[0], grid_max[0] + delta, delta),
+ np.arange(grid_min[1], grid_max[1] + delta, delta),
+ np.arange(grid_min[2], grid_max[2] + delta, delta),
]
if selection is None:
selected_coords = coords
else:
- selected_coords = coords[...,selection, :]
+ selected_coords = coords[..., selection, :]
# Reshape the coords into Nx3
coords = selected_coords.reshape((np.prod(selected_coords.shape[:-1]), 3))
@@ -106,9 +103,7 @@ def density(atoms, selection=None, delta=1.0, bins=None,
if is_stack and len(weights.shape) < 2:
weights = np.tile(weights, len(selected_coords))
weights = weights.reshape(coords.shape[0])
-
+
# Calculate the histogram
- hist = np.histogramdd(
- coords, bins=bins, density=density, weights=weights
- )
+ hist = np.histogramdd(coords, bins=bins, density=density, weights=weights)
return hist
diff --git a/src/biotite/structure/dotbracket.py b/src/biotite/structure/dotbracket.py
index ebfc3cf7f..66d8af441 100644
--- a/src/biotite/structure/dotbracket.py
+++ b/src/biotite/structure/dotbracket.py
@@ -9,13 +9,12 @@
__name__ = "biotite.structure"
__author__ = "Tom David Müller"
-__all__ = ["dot_bracket_from_structure", "dot_bracket",
- "base_pairs_from_dot_bracket"]
+__all__ = ["dot_bracket_from_structure", "dot_bracket", "base_pairs_from_dot_bracket"]
import numpy as np
-from .basepairs import base_pairs
-from .pseudoknots import pseudoknots
-from .residues import get_residue_count, get_residue_positions
+from biotite.structure.basepairs import base_pairs
+from biotite.structure.pseudoknots import pseudoknots
+from biotite.structure.residues import get_residue_count, get_residue_positions
_OPENING_BRACKETS = "([{ 0:
diff --git a/src/biotite/structure/error.py b/src/biotite/structure/error.py
index 269ee2276..1fe632e97 100644
--- a/src/biotite/structure/error.py
+++ b/src/biotite/structure/error.py
@@ -8,24 +8,32 @@
__name__ = "biotite.structure"
__author__ = "Patrick Kunzmann"
-__all__ = ["BadStructureError", "IncompleteStructureWarning",
- "UnexpectedStructureWarning"]
+__all__ = [
+ "BadStructureError",
+ "IncompleteStructureWarning",
+ "UnexpectedStructureWarning",
+]
class BadStructureError(Exception):
"""
Indicates that a structure is not suitable for a certain operation.
"""
+
pass
+
class IncompleteStructureWarning(Warning):
"""
Indicates that a structure is not complete.
"""
+
pass
+
class UnexpectedStructureWarning(Warning):
"""
Indicates that a structure was not expected.
"""
+
pass
diff --git a/src/biotite/structure/filter.py b/src/biotite/structure/filter.py
index 13ccd486a..c6e4aefd6 100644
--- a/src/biotite/structure/filter.py
+++ b/src/biotite/structure/filter.py
@@ -9,32 +9,64 @@
__name__ = "biotite.structure"
__author__ = "Patrick Kunzmann, Tom David Müller"
-__all__ = ["filter_solvent", "filter_monoatomic_ions", "filter_nucleotides",
- "filter_canonical_nucleotides", "filter_amino_acids",
- "filter_canonical_amino_acids", "filter_carbohydrates",
- "filter_intersection", "filter_first_altloc",
- "filter_highest_occupancy_altloc", "filter_peptide_backbone",
- "filter_phosphate_backbone", "filter_linear_bond_continuity",
- "filter_polymer"]
+__all__ = [
+ "filter_solvent",
+ "filter_monoatomic_ions",
+ "filter_nucleotides",
+ "filter_canonical_nucleotides",
+ "filter_amino_acids",
+ "filter_canonical_amino_acids",
+ "filter_carbohydrates",
+ "filter_intersection",
+ "filter_first_altloc",
+ "filter_highest_occupancy_altloc",
+ "filter_peptide_backbone",
+ "filter_phosphate_backbone",
+ "filter_linear_bond_continuity",
+ "filter_polymer",
+]
-import warnings
-import numpy as np
from functools import partial
-from .atoms import array as atom_array
-from .residues import get_residue_starts, get_residue_count
-from .info.groups import amino_acid_names, carbohydrate_names, nucleotide_names
-
-
-_canonical_aa_list = ["ALA","ARG","ASN","ASP","CYS","GLN","GLU","GLY","HIS",
- "ILE","LEU","LYS","MET","PHE","PRO","PYL","SER","THR",
- "TRP","TYR","VAL", "SEC"]
+import numpy as np
+from biotite.structure.atoms import array as atom_array
+from biotite.structure.info.groups import (
+ amino_acid_names,
+ carbohydrate_names,
+ nucleotide_names,
+)
+from biotite.structure.residues import get_residue_count, get_residue_starts
+
+_canonical_aa_list = [
+ "ALA",
+ "ARG",
+ "ASN",
+ "ASP",
+ "CYS",
+ "GLN",
+ "GLU",
+ "GLY",
+ "HIS",
+ "ILE",
+ "LEU",
+ "LYS",
+ "MET",
+ "PHE",
+ "PRO",
+ "PYL",
+ "SER",
+ "THR",
+ "TRP",
+ "TYR",
+ "VAL",
+ "SEC",
+]
_canonical_nucleotide_list = ["A", "DA", "G", "DG", "C", "DC", "U", "DT"]
-_solvent_list = ["HOH","SOL"]
+_solvent_list = ["HOH", "SOL"]
-_peptide_backbone_atoms = ['N', 'CA', 'C']
-_phosphate_backbone_atoms = ['P', 'O5\'', 'C5\'', 'C4\'', 'C3\'', 'O3\'']
+_peptide_backbone_atoms = ["N", "CA", "C"]
+_phosphate_backbone_atoms = ["P", "O5'", "C5'", "C4'", "C3'", "O3'"]
def filter_monoatomic_ions(array):
@@ -55,7 +87,7 @@ def filter_monoatomic_ions(array):
"""
# Exclusively in monoatomic ions,
# the element name is equal to the residue name
- return (array.res_name == array.element)
+ return array.res_name == array.element
def filter_solvent(array):
@@ -228,8 +260,9 @@ def filter_peptide_backbone(array):
is a part of the peptide backbone.
"""
- return (_filter_atom_names(array, _peptide_backbone_atoms) &
- filter_amino_acids(array))
+ return _filter_atom_names(array, _peptide_backbone_atoms) & filter_amino_acids(
+ array
+ )
def filter_phosphate_backbone(array):
@@ -250,8 +283,9 @@ def filter_phosphate_backbone(array):
is a part of the phosphate backbone.
"""
- return (_filter_atom_names(array, _phosphate_backbone_atoms) &
- filter_nucleotides(array))
+ return _filter_atom_names(array, _phosphate_backbone_atoms) & filter_nucleotides(
+ array
+ )
def filter_linear_bond_continuity(array, min_len=1.2, max_len=1.8):
@@ -297,21 +331,20 @@ def filter_linear_bond_continuity(array, min_len=1.2, max_len=1.8):
def _is_polymer(array, min_size, pol_type):
-
- if pol_type.startswith('p'):
+ if pol_type.startswith("p"):
filt_fn = filter_amino_acids
- elif pol_type.startswith('n'):
+ elif pol_type.startswith("n"):
filt_fn = filter_nucleotides
- elif pol_type.startswith('c'):
+ elif pol_type.startswith("c"):
filt_fn = filter_carbohydrates
else:
- raise ValueError(f'Unsupported polymer type {pol_type}')
+ raise ValueError(f"Unsupported polymer type {pol_type}")
mask = filt_fn(array)
return get_residue_count(array[mask]) >= min_size
-def filter_polymer(array, min_size=2, pol_type='peptide'):
+def filter_polymer(array, min_size=2, pol_type="peptide"):
"""
Filter for atoms that are a part of a consecutive standard macromolecular
polymer entity.
@@ -334,13 +367,14 @@ def filter_polymer(array, min_size=2, pol_type='peptide'):
"""
# Import `check_res_id_continuity` here to avoid circular imports
- from .integrity import check_res_id_continuity
+ from biotite.structure.integrity import check_res_id_continuity
+
split_idx = check_res_id_continuity(array)
check_pol = partial(_is_polymer, min_size=min_size, pol_type=pol_type)
bool_idx = map(
lambda a: np.full(len(a), check_pol(atom_array(a)), dtype=bool),
- np.split(array, split_idx)
+ np.split(array, split_idx),
)
return np.concatenate(list(bool_idx))
@@ -384,13 +418,17 @@ def filter_intersection(array, intersect):
intersect_categories = intersect.get_annotation_categories()
# Check atom equality only for categories,
# which exist in both arrays
- categories = [category for category in array.get_annotation_categories()
- if category in intersect_categories]
+ categories = [
+ category
+ for category in array.get_annotation_categories()
+ if category in intersect_categories
+ ]
for i in range(array.array_length()):
subfilter = np.full(intersect.array_length(), True, dtype=bool)
for category in categories:
- subfilter &= (intersect.get_annotation(category)
- == array.get_annotation(category)[i])
+ subfilter &= (
+ intersect.get_annotation(category) == array.get_annotation(category)[i]
+ )
filter[i] = subfilter.any()
return filter
@@ -453,10 +491,10 @@ def filter_first_altloc(atoms, altloc_ids):
# And filter all atoms for each residue with the first altloc ID
residue_starts = get_residue_starts(atoms, add_exclusive_stop=True)
for start, stop in zip(residue_starts[:-1], residue_starts[1:]):
- letter_altloc_ids = [l for l in altloc_ids[start:stop] if l.isalpha()]
+ letter_altloc_ids = [loc for loc in altloc_ids[start:stop] if loc.isalpha()]
if len(letter_altloc_ids) > 0:
first_id = letter_altloc_ids[0]
- altloc_filter[start:stop] |= (altloc_ids[start:stop] == first_id)
+ altloc_filter[start:stop] |= altloc_ids[start:stop] == first_id
else:
# No altloc ID in this residue -> Nothing to do
pass
@@ -534,19 +572,17 @@ def filter_highest_occupancy_altloc(atoms, altloc_ids, occupancies):
occupancies_in_res = occupancies[start:stop]
altloc_ids_in_res = altloc_ids[start:stop]
- letter_altloc_ids = [l for l in altloc_ids_in_res if l.isalpha()]
+ letter_altloc_ids = [loc for loc in altloc_ids_in_res if loc.isalpha()]
if len(letter_altloc_ids) > 0:
highest = -1.0
highest_id = None
for id in set(letter_altloc_ids):
- occupancy_sum = np.sum(
- occupancies_in_res[altloc_ids_in_res == id]
- )
+ occupancy_sum = np.sum(occupancies_in_res[altloc_ids_in_res == id])
if occupancy_sum > highest:
highest = occupancy_sum
highest_id = id
- altloc_filter[start:stop] |= (altloc_ids[start:stop] == highest_id)
+ altloc_filter[start:stop] |= altloc_ids[start:stop] == highest_id
else:
# No altloc ID in this residue -> Nothing to do
pass
diff --git a/src/biotite/structure/geometry.py b/src/biotite/structure/geometry.py
index ce39d1e82..cc5c59f4e 100644
--- a/src/biotite/structure/geometry.py
+++ b/src/biotite/structure/geometry.py
@@ -9,25 +9,33 @@
__name__ = "biotite.structure"
__author__ = "Patrick Kunzmann"
-__all__ = ["displacement", "index_displacement", "distance", "index_distance",
- "angle", "index_angle", "dihedral", "index_dihedral",
- "dihedral_backbone", "centroid"]
+__all__ = [
+ "displacement",
+ "index_displacement",
+ "distance",
+ "index_distance",
+ "angle",
+ "index_angle",
+ "dihedral",
+ "index_dihedral",
+ "dihedral_backbone",
+ "centroid",
+]
import numpy as np
-from .atoms import Atom, AtomArray, AtomArrayStack, coord
-from .util import vector_dot, norm_vector
-from .filter import filter_peptide_backbone
-from .chains import chain_iter
-from .box import (coord_to_fraction, fraction_to_coord,
- move_inside_box, is_orthogonal)
-from .error import BadStructureError
+from biotite.structure.atoms import AtomArray, AtomArrayStack, coord
+from biotite.structure.box import coord_to_fraction, fraction_to_coord, is_orthogonal
+from biotite.structure.chains import chain_iter
+from biotite.structure.error import BadStructureError
+from biotite.structure.filter import filter_peptide_backbone
+from biotite.structure.util import norm_vector, vector_dot
def displacement(atoms1, atoms2, box=None):
"""
Measure the displacement vector, i.e. the vector difference, from
one array of atom coordinates to another array of coordinates.
-
+
Parameters
----------
atoms1, atoms2 : ndarray, shape=(m,n,3) or ndarray, shape=(n,3) or ndarray, shape=(3,) or Atom or AtomArray or AtomArrayStack
@@ -43,13 +51,13 @@ def displacement(atoms1, atoms2, box=None):
the box vectors given with this parameter.
The shape *(m,3,3)* is only allowed, when the input coordinates
comprise multiple models.
-
+
Returns
-------
disp : ndarray, shape=(m,n,3) or ndarray, shape=(n,3) or ndarray, shape=(3,)
The displacement vector(s). The shape is equal to the shape of
the input `atoms` with the highest dimensionality.
-
+
See also
--------
index_displacement
@@ -62,7 +70,7 @@ def displacement(atoms1, atoms2, box=None):
diff = v2 - v1
else:
diff = -(v1 - v2)
-
+
# Use minimum-image convention if box is given
if box is not None:
# Transform difference vector
@@ -81,28 +89,24 @@ def displacement(atoms1, atoms2, box=None):
fractions = fractions[np.newaxis, :]
disp = disp[np.newaxis, :]
if orthogonality:
- _displacement_orthogonal_box(
- fractions, box, disp
- )
+ _displacement_orthogonal_box(fractions, box, disp)
else:
_displacement_triclinic_box(
fractions.astype(diff.dtype, copy=False),
box.astype(diff.dtype, copy=False),
- disp
+ disp,
)
# Transform back
disp = disp[0]
if fractions.ndim == 2:
# Single model
if orthogonality:
- _displacement_orthogonal_box(
- fractions, box, disp
- )
+ _displacement_orthogonal_box(fractions, box, disp)
else:
_displacement_triclinic_box(
fractions.astype(diff.dtype, copy=False),
box.astype(diff.dtype, copy=False),
- disp
+ disp,
)
elif fractions.ndim == 3:
# Multiple models
@@ -117,21 +121,17 @@ def displacement(atoms1, atoms2, box=None):
else:
raise ValueError(f"{box.ndim} are to many box dimensions")
if orthogonality_for_model:
- _displacement_orthogonal_box(
- fractions[i], box_for_model, disp[i]
- )
+ _displacement_orthogonal_box(fractions[i], box_for_model, disp[i])
else:
_displacement_triclinic_box(
fractions[i].astype(diff.dtype, copy=False),
box_for_model.astype(diff.dtype, copy=False),
- disp[i]
+ disp[i],
)
else:
- raise ValueError(
- f"{diff.shape} is an invalid shape for atom coordinates"
- )
+ raise ValueError(f"{diff.shape} is an invalid shape for atom coordinates")
return disp
-
+
else:
return diff
@@ -139,7 +139,7 @@ def displacement(atoms1, atoms2, box=None):
def index_displacement(*args, **kwargs):
"""
index_displacement(atoms, indices, periodic=False, box=None)
-
+
Measure the displacement, i.e. the vector difference, between pairs
of atoms.
@@ -159,7 +159,7 @@ def index_displacement(*args, **kwargs):
:class:`ndarray`.
indices : ndarray, shape=(k,2)
Pairs of indices that point to `atoms`.
- The displacement is measured from ``indices[x,0]`` to
+ The displacement is measured from ``indices[x,0]`` to
``indices[x,1]``.
periodic : bool, optional
If set to true, periodic boundary conditions are taken into
@@ -171,14 +171,14 @@ def index_displacement(*args, **kwargs):
box : ndarray, shape=(3,3) or shape=(m,3,3), optional
If this parameter is set, the given box is used instead of the
`box` attribute of `atoms`.
-
+
Returns
-------
disp : ndarray, shape=(k,) or shape=(m,k)
The pairwise displacements.
If `atoms` is an atom array stack, The distances are
calculated for each model.
-
+
Warnings
--------
In case `periodic` is set to true and if the box is not orthorhombic
@@ -199,7 +199,7 @@ def index_displacement(*args, **kwargs):
def distance(atoms1, atoms2, box=None):
"""
Measure the euclidian distance between atoms.
-
+
Parameters
----------
atoms1, atoms2 : ndarray or Atom or AtomArray or AtomArrayStack
@@ -214,14 +214,14 @@ def distance(atoms1, atoms2, box=None):
the box vectors given with this parameter.
The shape *(m,3,3)* is only allowed, when the input coordinates
comprise multiple models.
-
+
Returns
-------
dist : float or ndarray
The atom distances.
The shape is equal to the shape of the input `atoms` with the
highest dimensionality minus the last axis.
-
+
See also
--------
index_distance
@@ -233,7 +233,7 @@ def distance(atoms1, atoms2, box=None):
def index_distance(*args, **kwargs):
"""
index_distance(atoms, indices, periodic=False, box=None)
-
+
Measure the euclidian distance between pairs of atoms.
The pairs refer to indices of a given atom array, whose pairwise
@@ -262,14 +262,14 @@ def index_distance(*args, **kwargs):
box : ndarray, shape=(3,3) or shape=(m,3,3), optional
If this parameter is set, the given box is used instead of the
`box` attribute of `atoms`.
-
+
Returns
-------
dist : ndarray, shape=(k,) or shape=(m,k)
The pairwise distances.
If `atoms` is an atom array stack, The distances are
calculated for each model.
-
+
Warnings
--------
In case `periodic` is set to true and if the box is not orthorhombic
@@ -290,7 +290,7 @@ def index_distance(*args, **kwargs):
def angle(atoms1, atoms2, atoms3, box=None):
"""
Measure the angle between 3 atoms.
-
+
Parameters
----------
atoms1, atoms2, atoms3 : ndarray or Atom or AtomArray or AtomArrayStack
@@ -302,14 +302,14 @@ def angle(atoms1, atoms2, atoms3, box=None):
the box vectors given with this parameter.
The shape *(m,3,3)* is only allowed, when the input coordinates
comprise multiple models.
-
+
Returns
-------
angle : float or ndarray
The angle(s) between the atoms. The shape is equal to the shape
of the input `atoms` with the highest dimensionality minus the
last axis.
-
+
See also
--------
index_angle
@@ -318,13 +318,13 @@ def angle(atoms1, atoms2, atoms3, box=None):
v2 = displacement(atoms3, atoms2, box)
norm_vector(v1)
norm_vector(v2)
- return np.arccos(vector_dot(v1,v2))
+ return np.arccos(vector_dot(v1, v2))
def index_angle(*args, **kwargs):
"""
index_angle(atoms, indices, periodic=False, box=None)
-
+
Measure the angle between triples of atoms.
The triples refer to indices of a given atom array, whose triplewise
@@ -351,14 +351,14 @@ def index_angle(*args, **kwargs):
box : ndarray, shape=(3,3) or shape=(m,3,3), optional
If this parameter is set, the given box is used instead of the
`box` attribute of `atoms`.
-
+
Returns
-------
angle : ndarray, shape=(k,) or shape=(m,k)
The triplewise angles.
If `atoms` is an atom array stack, The distances are
calculated for each model.
-
+
Warnings
--------
In case `periodic` is set to true and if the box is not orthorhombic
@@ -379,7 +379,7 @@ def index_angle(*args, **kwargs):
def dihedral(atoms1, atoms2, atoms3, atoms4, box=None):
"""
Measure the dihedral angle between 4 atoms.
-
+
Parameters
----------
atoms1, atoms2, atoms3, atoms4 : ndarray or Atom or AtomArray or AtomArrayStack
@@ -392,14 +392,14 @@ def dihedral(atoms1, atoms2, atoms3, atoms4, box=None):
the box vectors given with this parameter.
The shape *(m,3,3)* is only allowed, when the input coordinates
comprise multiple models.
-
+
Returns
-------
dihed : float or ndarray
The dihedral angle(s) between the atoms. The shape is equal to
the shape of the input `atoms` with the highest dimensionality
minus the last axis.
-
+
See Also
--------
index_dihedral
@@ -411,20 +411,20 @@ def dihedral(atoms1, atoms2, atoms3, atoms4, box=None):
norm_vector(v1)
norm_vector(v2)
norm_vector(v3)
-
+
n1 = np.cross(v1, v2)
n2 = np.cross(v2, v3)
-
- # Calculation using atan2, to ensure the correct sign of the angle
- x = vector_dot(n1,n2)
- y = vector_dot(np.cross(n1,n2), v2)
- return np.arctan2(y,x)
+
+ # Calculation using atan2, to ensure the correct sign of the angle
+ x = vector_dot(n1, n2)
+ y = vector_dot(np.cross(n1, n2), v2)
+ return np.arctan2(y, x)
def index_dihedral(*args, **kwargs):
"""
index_dihedral(atoms, indices, periodic=False, box=None)
-
+
Measure the dihedral angle between quadruples of atoms.
The triples refer to indices of a given atom array, whose
@@ -452,14 +452,14 @@ def index_dihedral(*args, **kwargs):
box : ndarray, shape=(3,3) or shape=(m,3,3), optional
If this parameter is set, the given box is used instead of the
`box` attribute of `atoms`.
-
+
Returns
-------
dihedral : ndarray, shape=(k,) or shape=(m,k)
The quadruplewise dihedral angles.
If `atoms` is an atom array stack, The distances are
calculated for each model.
-
+
Warnings
--------
In case `periodic` is set to true and if the box is not orthorhombic
@@ -482,7 +482,7 @@ def dihedral_backbone(atom_array):
"""
Measure the characteristic backbone dihedral angles of a protein
structure.
-
+
Parameters
----------
atom_array: AtomArray or AtomArrayStack
@@ -492,7 +492,7 @@ def dihedral_backbone(atom_array):
`NaN`.
The order of the backbone atoms for each residue must be
(N, CA, C).
-
+
Returns
-------
phi, psi, omega : ndarray
@@ -502,20 +502,20 @@ def dihedral_backbone(atom_array):
have *NaN* values. If an :class:`AtomArrayStack` is given, the
output angles are 2-dimensional, the first dimension corresponds
to the model number.
-
+
Raises
------
BadStructureError
If the amount of backbone atoms is not equal to amount of
residues times 3 (for N, CA and C).
-
+
See Also
--------
dihedral
-
+
Examples
--------
-
+
>>> phi, psi, omega = dihedral_backbone(atom_array)
>>> print(np.stack([np.rad2deg(phi), np.rad2deg(psi)]).T)
[[ nan -56.145]
@@ -541,15 +541,17 @@ def dihedral_backbone(atom_array):
"""
bb_filter = filter_peptide_backbone(atom_array)
backbone = atom_array[..., bb_filter]
-
- if backbone.array_length() % 3 != 0 \
- or (backbone.atom_name[0::3] != "N" ).any() \
- or (backbone.atom_name[1::3] != "CA").any() \
- or (backbone.atom_name[2::3] != "C" ).any():
- raise BadStructureError(
- "The backbone is invalid, must be repeats of (N, CA, C), "
- "maybe a backbone atom is missing"
- )
+
+ if (
+ backbone.array_length() % 3 != 0
+ or (backbone.atom_name[0::3] != "N").any()
+ or (backbone.atom_name[1::3] != "CA").any()
+ or (backbone.atom_name[2::3] != "C").any()
+ ):
+ raise BadStructureError(
+ "The backbone is invalid, must be repeats of (N, CA, C), "
+ "maybe a backbone atom is missing"
+ )
phis = []
psis = []
omegas = []
@@ -558,9 +560,11 @@ def dihedral_backbone(atom_array):
phis.append(phi)
psis.append(psi)
omegas.append(omega)
- return np.concatenate(phis, axis=-1), np.concatenate(psis, axis=-1), \
- np.concatenate(omegas, axis=-1)
-
+ return (
+ np.concatenate(phis, axis=-1),
+ np.concatenate(psis, axis=-1),
+ np.concatenate(omegas, axis=-1),
+ )
def _dihedral_backbone(chain_bb):
@@ -571,49 +575,57 @@ def _dihedral_backbone(chain_bb):
# Dim 2: X, Y, Z coordinates
# Dim 3: Atoms involved in dihedral angle
if isinstance(chain_bb, AtomArray):
- angle_coord_shape = (len(bb_coord)//3, 3, 4)
+ angle_coord_shape = (len(bb_coord) // 3, 3, 4)
elif isinstance(chain_bb, AtomArrayStack):
- angle_coord_shape = (bb_coord.shape[0], bb_coord.shape[1]//3, 3, 4)
- phi_coord = np.full(angle_coord_shape, np.nan)
- psi_coord = np.full(angle_coord_shape, np.nan)
+ angle_coord_shape = (bb_coord.shape[0], bb_coord.shape[1] // 3, 3, 4)
+ phi_coord = np.full(angle_coord_shape, np.nan)
+ psi_coord = np.full(angle_coord_shape, np.nan)
omega_coord = np.full(angle_coord_shape, np.nan)
-
- # Indices for coordinates of CA atoms
- ca_i = np.arange(bb_coord.shape[-2]//3) * 3 + 1
- phi_coord [..., 1: , :, 0] = bb_coord[..., ca_i[1: ]-2 ,:]
- phi_coord [..., 1: , :, 1] = bb_coord[..., ca_i[1: ]-1 ,:]
- phi_coord [..., 1: , :, 2] = bb_coord[..., ca_i[1: ] ,:]
- phi_coord [..., 1: , :, 3] = bb_coord[..., ca_i[1: ]+1 ,:]
- psi_coord [..., :-1, :, 0] = bb_coord[..., ca_i[:-1]-1 ,:]
- psi_coord [..., :-1, :, 1] = bb_coord[..., ca_i[:-1] ,:]
- psi_coord [..., :-1, :, 2] = bb_coord[..., ca_i[:-1]+1 ,:]
- psi_coord [..., :-1, :, 3] = bb_coord[..., ca_i[:-1]+2 ,:]
- omega_coord[..., :-1, :, 0] = bb_coord[..., ca_i[:-1] ,:]
- omega_coord[..., :-1, :, 1] = bb_coord[..., ca_i[:-1]+1 ,:]
- omega_coord[..., :-1, :, 2] = bb_coord[..., ca_i[:-1]+2 ,:]
- omega_coord[..., :-1, :, 3] = bb_coord[..., ca_i[:-1]+3 ,:]
-
- phi = dihedral(phi_coord[...,0], phi_coord[...,1],
- phi_coord[...,2], phi_coord[...,3])
- psi = dihedral(psi_coord[...,0], psi_coord[...,1],
- psi_coord[...,2], psi_coord[...,3])
- omega = dihedral(omega_coord[...,0], omega_coord[...,1],
- omega_coord[...,2], omega_coord[...,3])
-
+
+ # Indices for coordinates of CA atoms
+ ca_i = np.arange(bb_coord.shape[-2] // 3) * 3 + 1
+ # fmt: off
+ phi_coord [..., 1:, :, 0] = bb_coord[..., ca_i[1: ]-2, :]
+ phi_coord [..., 1:, :, 1] = bb_coord[..., ca_i[1: ]-1, :]
+ phi_coord [..., 1:, :, 2] = bb_coord[..., ca_i[1: ], :]
+ phi_coord [..., 1:, :, 3] = bb_coord[..., ca_i[1: ]+1, :]
+ psi_coord [..., :-1, :, 0] = bb_coord[..., ca_i[:-1]-1, :]
+ psi_coord [..., :-1, :, 1] = bb_coord[..., ca_i[:-1], :]
+ psi_coord [..., :-1, :, 2] = bb_coord[..., ca_i[:-1]+1, :]
+ psi_coord [..., :-1, :, 3] = bb_coord[..., ca_i[:-1]+2, :]
+ omega_coord[..., :-1, :, 0] = bb_coord[..., ca_i[:-1], :]
+ omega_coord[..., :-1, :, 1] = bb_coord[..., ca_i[:-1]+1, :]
+ omega_coord[..., :-1, :, 2] = bb_coord[..., ca_i[:-1]+2, :]
+ omega_coord[..., :-1, :, 3] = bb_coord[..., ca_i[:-1]+3, :]
+ # fmt: on
+
+ phi = dihedral(
+ phi_coord[..., 0], phi_coord[..., 1], phi_coord[..., 2], phi_coord[..., 3]
+ )
+ psi = dihedral(
+ psi_coord[..., 0], psi_coord[..., 1], psi_coord[..., 2], psi_coord[..., 3]
+ )
+ omega = dihedral(
+ omega_coord[..., 0],
+ omega_coord[..., 1],
+ omega_coord[..., 2],
+ omega_coord[..., 3],
+ )
+
return phi, psi, omega
def centroid(atoms):
"""
Measure the centroid of a structure.
-
+
Parameters
----------
atoms: ndarray or AtomArray or AtomArrayStack
The structures to determine the centroid from.
Alternatively an ndarray containing the coordinates can be
provided.
-
+
Returns
-------
centroid : float or ndarray
@@ -623,8 +635,9 @@ def centroid(atoms):
return np.mean(coord(atoms), axis=-2)
-def _call_non_index_function(function, expected_amount,
- atoms, indices, periodic=False, box=None):
+def _call_non_index_function(
+ function, expected_amount, atoms, indices, periodic=False, box=None
+):
"""
Call an `xxx()` function based on the parameters given to a
`index_xxx()` function.
@@ -636,15 +649,14 @@ def _call_non_index_function(function, expected_amount,
)
coord_list = []
for i in range(expected_amount):
- coord_list.append(coord(atoms)[..., indices[:,i], :])
+ coord_list.append(coord(atoms)[..., indices[:, i], :])
if periodic:
if box is None:
if isinstance(atoms, (AtomArray, AtomArrayStack)):
box = atoms.box
else:
raise ValueError(
- "If `atoms` are coordinates, "
- "the box must be set explicitly"
+ "If `atoms` are coordinates, " "the box must be set explicitly"
)
else:
box = None
@@ -656,7 +668,7 @@ def _displacement_orthogonal_box(fractions, box, disp):
Fill in the PBC-aware displacement vector for non-PBC-aware
displacements given as fractions of given box vectors.
"""
- # Fraction components are guaranteed to be positive
+ # Fraction components are guaranteed to be positive
# Use fraction vector components with lower absolute
# -> new_vec[i] = vec[i] - 1 if vec[i] > 0.5 else vec[i]
fractions[fractions > 0.5] -= 1
@@ -669,7 +681,7 @@ def _displacement_triclinic_box(fractions, box, disp):
displacements given as fractions of given box vectors.
"""
diffs = fraction_to_coord(fractions, box)
- # Fraction components are guaranteed to be positive
+ # Fraction components are guaranteed to be positive
# Test all 3 fraction vector components
# with positive and negative sign
# (i,j,k in {-1, 0})
@@ -678,10 +690,10 @@ def _displacement_triclinic_box(fractions, box, disp):
for i in range(-1, 1):
for j in range(-1, 1):
for k in range(-1, 1):
- x = i*box[0,0] + j*box[1,0] + k*box[2,0]
- y = i*box[0,1] + j*box[1,1] + k*box[2,1]
- z = i*box[0,2] + j*box[1,2] + k*box[2,2]
- periodic_shift.append([x,y,z])
+ x = i * box[0, 0] + j * box[1, 0] + k * box[2, 0]
+ y = i * box[0, 1] + j * box[1, 1] + k * box[2, 1]
+ z = i * box[0, 2] + j * box[1, 2] + k * box[2, 2]
+ periodic_shift.append([x, y, z])
periodic_shift = np.array(periodic_shift, dtype=disp.dtype)
# Create 8 periodically shifted variants for each atom
shifted_diffs = diffs[:, np.newaxis, :] + periodic_shift[np.newaxis, :, :]
@@ -692,6 +704,5 @@ def _displacement_triclinic_box(fractions, box, disp):
# for each given non-PBC-aware displacement find the PBC-aware
# displacement with the lowest distance
disp[:] = shifted_diffs[
- np.arange(len(shifted_diffs)),
- np.argmin(sq_distance, axis=1)
+ np.arange(len(shifted_diffs)), np.argmin(sq_distance, axis=1)
]
diff --git a/src/biotite/structure/graphics/atoms.py b/src/biotite/structure/graphics/atoms.py
index bc91492d9..dec54f1fa 100644
--- a/src/biotite/structure/graphics/atoms.py
+++ b/src/biotite/structure/graphics/atoms.py
@@ -7,18 +7,25 @@
__all__ = ["plot_atoms", "plot_ball_and_stick_model"]
import numpy as np
-import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d.art3d import Line3DCollection
-def plot_atoms(axes, atoms, colors, line_width=1.0, background_color=None,
- center=None, size=None, zoom=1.0):
+def plot_atoms(
+ axes,
+ atoms,
+ colors,
+ line_width=1.0,
+ background_color=None,
+ center=None,
+ size=None,
+ zoom=1.0,
+):
"""
Plot an :class:`AtomArray` as lines between bonded atoms.
The z-axis points into the screen plane.
-
+
Parameters
----------
axes : Axes3D
@@ -49,7 +56,7 @@ def plot_atoms(axes, atoms, colors, line_width=1.0, background_color=None,
- ``> 1.0``: Zoom in.
- ``< 1.0``: Zoom out.
-
+
Notes
-----
This is a very simple visualization tools for quick visual analysis
@@ -61,38 +68,37 @@ def plot_atoms(axes, atoms, colors, line_width=1.0, background_color=None,
raise ValueError("The given axes mut be an 'Axes3D'")
if atoms.bonds is None:
raise ValueError("The atom array must have an associated bond list")
-
+
# Calculating connections between atoms
line_coord = []
line_colors = []
- for index1, index2 in atoms.bonds.as_array()[:,:2]:
+ for index1, index2 in atoms.bonds.as_array()[:, :2]:
# Every connection consist of two lines:
# One from the first atom to the center
# and from from the second atom to the center
line_start = atoms.coord[index1]
line_end = atoms.coord[index2]
line_center = (line_start + line_end) / 2
-
+
# Add line from first atom
- line_coord.append((
- line_start, line_center
- ))
+ line_coord.append((line_start, line_center))
line_colors.append(colors[index1])
-
+
# Add line from second atom
- line_coord.append((
- line_end, line_center
- ))
+ line_coord.append((line_end, line_center))
line_colors.append(colors[index2])
# Plot computed line coordinates and colors
# Use 'Line3DCollection' for higher efficiency
lines = Line3DCollection(
- line_coord, color=line_colors, linewidths=line_width,
- capstyle="round", joinstyle="round"
+ line_coord,
+ color=line_colors,
+ linewidths=line_width,
+ capstyle="round",
+ joinstyle="round",
)
axes.add_collection(lines)
-
+
# Set viewing angle
axes.azim = -90
axes.elev = 90
@@ -105,17 +111,25 @@ def plot_atoms(axes, atoms, colors, line_width=1.0, background_color=None,
_set_box(axes, atoms.coord, center, size, zoom)
-def plot_ball_and_stick_model(axes, atoms, colors, ball_size=200,
- line_color="black", line_width=1.0,
- background_color=None, center=None,
- size=None, zoom=1.0):
+def plot_ball_and_stick_model(
+ axes,
+ atoms,
+ colors,
+ ball_size=200,
+ line_color="black",
+ line_width=1.0,
+ background_color=None,
+ center=None,
+ size=None,
+ zoom=1.0,
+):
"""
Plot an :class:`AtomArray` as *ball-and-stick* model.
The z-axis points into the screen plane.
UNSTABLE: This function is probably subject to future changes.
-
+
Parameters
----------
axes : Axes3D
@@ -154,7 +168,7 @@ def plot_ball_and_stick_model(axes, atoms, colors, ball_size=200,
- ``> 1.0``: Zoom in.
- ``< 1.0``: Zoom out.
-
+
Notes
-----
This is a very simple visualization tools for quick visual analysis
@@ -166,26 +180,27 @@ def plot_ball_and_stick_model(axes, atoms, colors, ball_size=200,
raise ValueError("The given axes mut be an 'Axes3D'")
if atoms.bonds is None:
raise ValueError("The atom array must have an associated bond list")
-
+
# Calculating connections between atoms
line_coord = [
(atoms.coord[index1], atoms.coord[index2])
- for index1, index2 in atoms.bonds.as_array()[:,:2]
+ for index1, index2 in atoms.bonds.as_array()[:, :2]
]
# Plot sticks
# Use 'Line3DCollection' for higher efficiency
sticks = Line3DCollection(
- line_coord, color=line_color, linewidths=line_width,
- capstyle="round", joinstyle="round"
+ line_coord,
+ color=line_color,
+ linewidths=line_width,
+ capstyle="round",
+ joinstyle="round",
)
axes.add_collection(sticks)
# Plot balls
- axes.scatter(
- *atoms.coord.T, s=ball_size, c=colors, linewidth=0, alpha=1
- )
-
+ axes.scatter(*atoms.coord.T, s=ball_size, c=colors, linewidth=0, alpha=1)
+
# Set viewing angle
axes.azim = -90
axes.elev = 90
@@ -211,16 +226,18 @@ def _set_box(axes, coord, center, size, zoom):
)
if size is None:
- size = np.array([
- coord[:, 0].max() - coord[:, 0].min(),
- coord[:, 1].max() - coord[:, 1].min(),
- coord[:, 2].max() - coord[:, 2].min()
- ]).max()
-
- axes.set_xlim(center[0] - size/(2*zoom), center[0] + size/(2*zoom))
- axes.set_ylim(center[1] - size/(2*zoom), center[1] + size/(2*zoom))
- axes.set_zlim(center[2] - size/(2*zoom), center[2] + size/(2*zoom))
-
+ size = np.array(
+ [
+ coord[:, 0].max() - coord[:, 0].min(),
+ coord[:, 1].max() - coord[:, 1].min(),
+ coord[:, 2].max() - coord[:, 2].min(),
+ ]
+ ).max()
+
+ axes.set_xlim(center[0] - size / (2 * zoom), center[0] + size / (2 * zoom))
+ axes.set_ylim(center[1] - size / (2 * zoom), center[1] + size / (2 * zoom))
+ axes.set_zlim(center[2] - size / (2 * zoom), center[2] + size / (2 * zoom))
+
# Make the axis lengths of the 'plot box' equal
# The 'plot box' is not visible due to 'axes.axis("off")'
- axes.set_box_aspect([1,1,1])
\ No newline at end of file
+ axes.set_box_aspect([1, 1, 1])
diff --git a/src/biotite/structure/graphics/rna.py b/src/biotite/structure/graphics/rna.py
index b2cf6d198..49648728a 100644
--- a/src/biotite/structure/graphics/rna.py
+++ b/src/biotite/structure/graphics/rna.py
@@ -7,29 +7,43 @@
__all__ = ["plot_nucleotide_secondary_structure"]
import shutil
-import numpy as np
from itertools import repeat
-from .. import pseudoknots
-from ...application.viennarna import RNAplotApp
+import numpy as np
+from biotite.application.viennarna import RNAplotApp
+from biotite.structure import pseudoknots
def plot_nucleotide_secondary_structure(
- axes, base_labels, base_pairs, length,
- layout_type=RNAplotApp.Layout.NAVIEW, draw_pseudoknots=True,
- pseudoknot_order=None, angle=0, bond_linewidth=1, bond_linestyle=None,
- bond_color='black', backbone_linewidth=1, backbone_linestyle='solid',
- backbone_color='grey', base_text=None, base_box=None,
- annotation_positions=None, annotation_offset=8.5, annotation_text=None,
- border=0.03, bin_path="RNAplot"
- ):
+ axes,
+ base_labels,
+ base_pairs,
+ length,
+ layout_type=RNAplotApp.Layout.NAVIEW,
+ draw_pseudoknots=True,
+ pseudoknot_order=None,
+ angle=0,
+ bond_linewidth=1,
+ bond_linestyle=None,
+ bond_color="black",
+ backbone_linewidth=1,
+ backbone_linestyle="solid",
+ backbone_color="grey",
+ base_text=None,
+ base_box=None,
+ annotation_positions=None,
+ annotation_offset=8.5,
+ annotation_text=None,
+ border=0.03,
+ bin_path="RNAplot",
+):
"""
Generate 2D plots of nucleic acid secondary structures using the
interface to *RNAplot*, which is part of the *ViennaRNA* software
package.
- Internally a :class:`biotite.application.viennarna.RNAplotApp`
- instance is created to generate coordinates for each individual base
- on a 2D plane. *ViennaRNA* must be installed in order to use this
+ Internally a :class:`biotite.application.viennarna.RNAplotApp`
+ instance is created to generate coordinates for each individual base
+ on a 2D plane. *ViennaRNA* must be installed in order to use this
function.
Parameters
@@ -49,7 +63,7 @@ def plot_nucleotide_secondary_structure(
Whether pseudoknotted bonds should be drawn.
pseudoknot_order : iterable, optional (default: None)
The pseudoknot order of each pair in the input `base_pairs`.
- If no pseudoknot order is given, a solution determined by
+ If no pseudoknot order is given, a solution determined by
:func:`biotite.structure.pseudoknots` is picked at random.
angle : int or float, optional (default: 0)
The angle the plot should be rotated.
@@ -74,9 +88,9 @@ def plot_nucleotide_secondary_structure(
backbone_color : str or ndarray, shape=(3,) or shape=(4,), dtype=float, optional (default: 'grey')
The *Matplotlib* compatible color of the backbone.
base_text : dict or iterable, optional (default: {'size': 'small'})
- The keyword parameters for the *Matplotlib* ``Text`` objects
- denoting the type of each base. Provide a single value to set
- the parameters for all labels or an iterable to set the
+ The keyword parameters for the *Matplotlib* ``Text`` objects
+ denoting the type of each base. Provide a single value to set
+ the parameters for all labels or an iterable to set the
parameters for each individual label.
base_box : dict or iterable, optional (default: {'pad'=0, 'color'='white'})
The *Matplotlib* compatible properties of the ``FancyBboxPatch``
@@ -91,9 +105,9 @@ def plot_nucleotide_secondary_structure(
annotation_offset : int or float, optional (default: 8.5)
The offset of the annotations from the base labels.
annotation_text : dict or iterable, optional (default: {'size': 'small'})
- The keyword parameters for the *Matplotlib* ``Text`` objects
- annotating the sequence. Provide a single value to set the
- parameters for all annotations or an iterable to set the
+ The keyword parameters for the *Matplotlib* ``Text`` objects
+ annotating the sequence. Provide a single value to set the
+ parameters for all annotations or an iterable to set the
parameters for each individual annotation.
border : float, optional (default: 0.03)
The percentage of the coordinate range to be left as whitespace
@@ -105,8 +119,8 @@ def plot_nucleotide_secondary_structure(
# Check if RNAplot is installed
if shutil.which(bin_path) is None:
raise FileNotFoundError(
- 'RNAplot is not installed at the specified location, unable to '
- 'plot secondary structure.'
+ "RNAplot is not installed at the specified location, unable to "
+ "plot secondary structure."
)
# Get the unknotted base pairs
@@ -127,7 +141,7 @@ def plot_nucleotide_secondary_structure(
# Set the default properties of the Matplotlib `bbox` surrounding
# the base labels
if base_box is None:
- base_box=np.full(length, {'pad': 0, 'color': 'white'})
+ base_box = np.full(length, {"pad": 0, "color": "white"})
# if `base_box` is a dictionary, extrapolate
elif isinstance(base_box, dict):
base_box = np.full(length, base_box)
@@ -135,25 +149,23 @@ def plot_nucleotide_secondary_structure(
# By default pseudoknotted bonds are denoted as dashed lines, while
# unknotted bonds are denoted as solid lines
if bond_linestyle is None:
- bond_linestyle = np.full(base_pairs.shape[0], 'solid', dtype='object')
- bond_linestyle[pseudoknot_order != 0] = 'dashed'
+ bond_linestyle = np.full(base_pairs.shape[0], "solid", dtype="object")
+ bond_linestyle[pseudoknot_order != 0] = "dashed"
# If `bond_linestyle` is a string, extrapolate
elif isinstance(bond_linestyle, str):
- bond_linestyle = np.full(
- base_pairs.shape[0], bond_linestyle, dtype='object'
- )
+ bond_linestyle = np.full(base_pairs.shape[0], bond_linestyle, dtype="object")
# If pseudoknots are not to be drawn, remove pseudoknotted bonds,
# regardless of the given linestyles
if not draw_pseudoknots:
# Ensure that the array can hold the 'None' value
# (not possible with 'U1' dtype for example)
- bond_linestyle = np.asarray(bond_linestyle, dtype='object')
- bond_linestyle[pseudoknot_order != 0] = 'None'
+ bond_linestyle = np.asarray(bond_linestyle, dtype="object")
+ bond_linestyle[pseudoknot_order != 0] = "None"
# Set the default properties of the base labels
if base_text is None:
- base_text = np.full(length, {'size': 'small'})
+ base_text = np.full(length, {"size": "small"})
elif isinstance(base_text, dict):
base_text = np.full(length, base_text)
@@ -164,7 +176,7 @@ def plot_nucleotide_secondary_structure(
# Set the default font properties of the base annotations
if annotation_text is None:
- annotation_text = repeat({'size': 'small'})
+ annotation_text = repeat({"size": "small"})
elif isinstance(annotation_text, dict):
annotation_text = repeat(annotation_text)
@@ -173,15 +185,14 @@ def plot_nucleotide_secondary_structure(
base_pairs=unknotted_base_pairs,
length=length,
bin_path=bin_path,
- layout_type=layout_type
+ layout_type=layout_type,
)
# Rotate Coordinates
if angle != 0:
angle = np.deg2rad(angle)
rot_matrix = np.array(
- [[np.cos(angle), -np.sin(angle)],
- [np.sin(angle), np.cos(angle)]]
+ [[np.cos(angle), -np.sin(angle)], [np.sin(angle), np.cos(angle)]]
)
for i, coord in enumerate(coordinates):
coordinates[i] = np.dot(rot_matrix, coord)
@@ -197,31 +208,32 @@ def plot_nucleotide_secondary_structure(
)
axes.set_frame_on(False)
-
# Define buffer area (Border)
coord_range = abs(np.max(coordinates)) + abs(np.min(coordinates))
- buffer = border*coord_range
+ buffer = border * coord_range
# Adjust display
axes.set_xlim(
- np.min(coordinates[:,0])-buffer, np.max(coordinates[:,0])+buffer
+ np.min(coordinates[:, 0]) - buffer, np.max(coordinates[:, 0]) + buffer
)
axes.set_ylim(
- np.min(coordinates[:,1])-buffer, np.max(coordinates[:,1])+buffer
+ np.min(coordinates[:, 1]) - buffer, np.max(coordinates[:, 1]) + buffer
)
- axes.set_aspect(aspect='equal')
+ axes.set_aspect(aspect="equal")
# Draw backbone
- axes.plot(coordinates[:,0], coordinates[:,1], color=backbone_color,
- linestyle=backbone_linestyle, linewidth=backbone_linewidth)
+ axes.plot(
+ coordinates[:, 0],
+ coordinates[:, 1],
+ color=backbone_color,
+ linestyle=backbone_linestyle,
+ linewidth=backbone_linewidth,
+ )
# Draw base labels
- for coords, label, box, text in zip(
- coordinates, base_labels, base_box, base_text
- ):
+ for coords, label, box, text in zip(coordinates, base_labels, base_box, base_text):
t = axes.text(
- x=coords[0], y=coords[1], s=label,
- ha='center', va='center', **text
+ x=coords[0], y=coords[1], s=label, ha="center", va="center", **text
)
t.set_bbox(box)
@@ -237,37 +249,41 @@ def plot_nucleotide_secondary_structure(
# Draw annotations
for i, text in zip(annotation_positions, annotation_text):
- if (i > 0) and ((i+1) < length):
+ if (i > 0) and ((i + 1) < length):
# Get the average of the direction vectors to the next and
# previous base
vector_to_previous = np.array(
- [coordinates[i-1][0] - coordinates[i][0],
- coordinates[i-1][1] - coordinates[i][1]]
- )
- vector_to_previous = vector_to_previous / np.linalg.norm(
- vector_to_previous
+ [
+ coordinates[i - 1][0] - coordinates[i][0],
+ coordinates[i - 1][1] - coordinates[i][1],
+ ]
)
+ vector_to_previous = vector_to_previous / np.linalg.norm(vector_to_previous)
vector_to_next = np.array(
- [coordinates[i][0] - coordinates[i+1][0],
- coordinates[i][1] - coordinates[i+1][1]]
- )
- vector_to_next = vector_to_next / np.linalg.norm(
- vector_to_next
+ [
+ coordinates[i][0] - coordinates[i + 1][0],
+ coordinates[i][1] - coordinates[i + 1][1],
+ ]
)
+ vector_to_next = vector_to_next / np.linalg.norm(vector_to_next)
vector = (vector_to_next + vector_to_previous) / 2
elif i > 0:
# For the last base get the direction vector to the previous
# base
vector = np.array(
- [coordinates[i-1][0] - coordinates[i][0],
- coordinates[i-1][1] - coordinates[i][1]]
+ [
+ coordinates[i - 1][0] - coordinates[i][0],
+ coordinates[i - 1][1] - coordinates[i][1],
+ ]
)
else:
# For the first base get the direction vector to the next
# base
vector = np.array(
- [coordinates[i][0] - coordinates[i+1][0],
- coordinates[i][1] - coordinates[i+1][1]]
+ [
+ coordinates[i][0] - coordinates[i + 1][0],
+ coordinates[i][1] - coordinates[i + 1][1],
+ ]
)
# Normalize the vector
vector = vector / np.linalg.norm(vector)
@@ -275,8 +291,5 @@ def plot_nucleotide_secondary_structure(
vector = np.array([vector[1], -vector[0]])
# The annotations are offset in the direction of the
# perpendicular vector
- x, y = coordinates[i] + (annotation_offset*vector)
- axes.text(
- x=x, y=y, s=i+1,
- ha='center', va='center', **text
- )
\ No newline at end of file
+ x, y = coordinates[i] + (annotation_offset * vector)
+ axes.text(x=x, y=y, s=i + 1, ha="center", va="center", **text)
diff --git a/src/biotite/structure/hbond.py b/src/biotite/structure/hbond.py
index a23c5cdcd..96d0d87f8 100644
--- a/src/biotite/structure/hbond.py
+++ b/src/biotite/structure/hbond.py
@@ -11,16 +11,23 @@
__all__ = ["hbond", "hbond_frequency"]
import warnings
-from .geometry import distance, angle
import numpy as np
-from .atoms import AtomArrayStack, stack
-from .celllist import CellList
-
-
-def hbond(atoms, selection1=None, selection2=None, selection1_type='both',
- cutoff_dist=2.5, cutoff_angle=120,
- donor_elements=('O', 'N', 'S'), acceptor_elements=('O', 'N', 'S'),
- periodic=False):
+from biotite.structure.atoms import AtomArrayStack, stack
+from biotite.structure.celllist import CellList
+from biotite.structure.geometry import angle, distance
+
+
+def hbond(
+ atoms,
+ selection1=None,
+ selection2=None,
+ selection1_type="both",
+ cutoff_dist=2.5,
+ cutoff_angle=120,
+ donor_elements=("O", "N", "S"),
+ acceptor_elements=("O", "N", "S"),
+ periodic=False,
+):
r"""
Find hydrogen bonds in a structure using the Baker-Hubbard
algorithm. :footcite:`Baker1984`
@@ -31,7 +38,7 @@ def hbond(atoms, selection1=None, selection2=None, selection1_type='both',
and :math:`d_{H,A} \le 2.5 \mathring{A}`.
Consequently, the given structure must contain hydrogen atoms.
Otherwise, no hydrogen bonds will be found.
-
+
Parameters
----------
atoms : AtomArray or AtomArrayStack
@@ -60,7 +67,7 @@ def hbond(atoms, selection1=None, selection2=None, selection1_type='both',
boundary conditions.
The `box` attribute of `atoms` is required in this case.
(Default: False).
-
+
Returns
-------
triplets : ndarray, dtype=int, shape=(n,3)
@@ -74,7 +81,7 @@ def hbond(atoms, selection1=None, selection2=None, selection1_type='both',
*m x n* matrix that shows if an interaction with index *n* in
`triplets` is present in the model *m* of the input `atoms`.
Only returned if `atoms` is an :class:`AtomArrayStack`.
-
+
Notes
-----
The result of this function may include false positives:
@@ -84,19 +91,19 @@ def hbond(atoms, selection1=None, selection2=None, selection1_type='both',
For example, a nitrogen atom with positive charge could be
considered as acceptor atom by this method, although this does
make sense from a chemical perspective.
-
+
Examples
--------
Calculate the total number of hydrogen bonds found in each model:
-
+
>>> triplets, mask = hbond(atom_array_stack)
>>> hbonds_per_model = np.count_nonzero(mask, axis=1)
>>> print(hbonds_per_model)
[14 14 14 12 11 12 9 13 9 14 13 13 14 11 11 12 11 14 14 13 14 13 15 17
14 12 15 12 12 13 13 13 12 12 11 14 10 11]
-
+
Get hydrogen bond donors of third model:
-
+
>>> # Third model -> index 2
>>> triplets = triplets[mask[2,:]]
>>> # First column contains donors
@@ -137,12 +144,12 @@ def hbond(atoms, selection1=None, selection2=None, selection1_type='both',
single_model = True
else:
single_model = False
-
+
if periodic:
box = atoms.box
else:
box = None
-
+
# Mask for donor/acceptor elements
donor_element_mask = np.isin(atoms.element, donor_elements)
acceptor_element_mask = np.isin(atoms.element, acceptor_elements)
@@ -152,69 +159,81 @@ def hbond(atoms, selection1=None, selection2=None, selection1_type='both',
if selection2 is None:
selection2 = np.ones(atoms.array_length(), dtype=bool)
- if selection1_type == 'both':
+ if selection1_type == "both":
# The two selections are separated into three selections:
# the original ones without the overlaping part
# and one containing the overlap
- # This prevents redundant triplets and unnecessary computation
+ # This prevents redundant triplets and unnecessary computation
overlap_selection = selection1 & selection2
# Original selections without overlaping part
exclusive_selection1 = selection1 & (~overlap_selection)
exclusive_selection2 = selection2 & (~overlap_selection)
-
+
# Put selections to list for cleaner iteration
- selections = [
- exclusive_selection1, exclusive_selection2, overlap_selection
- ]
+ selections = [exclusive_selection1, exclusive_selection2, overlap_selection]
selection_combinations = [
- #(0,0), is not included, would be same selection
+ # (0,0) is not included, it would use the same selection
# as donor and acceptor simultaneously
- (0,1),
- (0,2),
- (1,0),
- #(1,1), # same reason above
- (1,2),
- (2,0),
- (2,1),
- (2,2) # overlaping part, combination is necessary
+ (0, 1),
+ (0, 2),
+ (1, 0),
+ # (1,1) is not included, for the same reason as (0,0)
+ (1, 2),
+ (2, 0),
+ (2, 1),
+ (2, 2), # overlapping part, combination is necessary
]
-
+
all_comb_triplets = []
all_comb_mask = []
for selection_index1, selection_index2 in selection_combinations:
donor_mask = selections[selection_index1]
acceptor_mask = selections[selection_index2]
- if np.count_nonzero(donor_mask) != 0 and \
- np.count_nonzero(acceptor_mask) != 0:
- # Calculate triplets and mask
- triplets, mask = _hbond(
- atoms, donor_mask, acceptor_mask,
- donor_element_mask, acceptor_element_mask,
- cutoff_dist, cutoff_angle,
- box
- )
- all_comb_triplets.append(triplets)
- all_comb_mask.append(mask)
+ if (
+ np.count_nonzero(donor_mask) != 0
+ and np.count_nonzero(acceptor_mask) != 0
+ ):
+ # Calculate triplets and mask
+ triplets, mask = _hbond(
+ atoms,
+ donor_mask,
+ acceptor_mask,
+ donor_element_mask,
+ acceptor_element_mask,
+ cutoff_dist,
+ cutoff_angle,
+ box,
+ )
+ all_comb_triplets.append(triplets)
+ all_comb_mask.append(mask)
# Merge results from all combinations
triplets = np.concatenate(all_comb_triplets, axis=0)
mask = np.concatenate(all_comb_mask, axis=1)
- elif selection1_type == 'donor':
+ elif selection1_type == "donor":
triplets, mask = _hbond(
- atoms, selection1, selection2,
- donor_element_mask, acceptor_element_mask,
- cutoff_dist, cutoff_angle,
- box
+ atoms,
+ selection1,
+ selection2,
+ donor_element_mask,
+ acceptor_element_mask,
+ cutoff_dist,
+ cutoff_angle,
+ box,
)
-
- elif selection1_type == 'acceptor':
+
+ elif selection1_type == "acceptor":
triplets, mask = _hbond(
- atoms, selection2, selection1,
- donor_element_mask, acceptor_element_mask,
- cutoff_dist, cutoff_angle,
- box
+ atoms,
+ selection2,
+ selection1,
+ donor_element_mask,
+ acceptor_element_mask,
+ cutoff_dist,
+ cutoff_angle,
+ box,
)
-
+
else:
raise ValueError(f"Unkown selection type '{selection1_type}'")
@@ -228,12 +247,18 @@ def hbond(atoms, selection1=None, selection2=None, selection1_type='both',
return triplets, mask
-def _hbond(atoms, donor_mask, acceptor_mask,
- donor_element_mask, acceptor_element_mask,
- cutoff_dist, cutoff_angle, box):
-
+def _hbond(
+ atoms,
+ donor_mask,
+ acceptor_mask,
+ donor_element_mask,
+ acceptor_element_mask,
+ cutoff_dist,
+ cutoff_angle,
+ box,
+):
# Filter donor/acceptor elements
- donor_mask &= donor_element_mask
+ donor_mask &= donor_element_mask
acceptor_mask &= acceptor_element_mask
first_model_box = box[0] if box is not None else None
@@ -254,47 +279,43 @@ def _hbond(atoms, donor_mask, acceptor_mask,
if len(donor_h_i) == 0 or len(acceptor_i) == 0:
# Return empty triplets and mask
return (
- np.zeros((0,3), dtype=int),
- np.zeros((atoms.stack_depth(),0), dtype=bool)
+ np.zeros((0, 3), dtype=int),
+ np.zeros((atoms.stack_depth(), 0), dtype=bool),
)
-
+
# Narrow the amount of possible acceptor to donor-H connections
# down via the distance cutoff parameter using a cell list
# Save in acceptor-to-hydrogen matrix
# (true when distance smaller than cutoff)
coord = atoms.coord
- possible_bonds = np.zeros(
- (len(acceptor_i), len(donor_h_i)),
- dtype=bool
- )
+ possible_bonds = np.zeros((len(acceptor_i), len(donor_h_i)), dtype=bool)
periodic = False if box is None else True
for model_i in range(atoms.stack_depth()):
donor_h_coord = coord[model_i, donor_h_mask]
acceptor_coord = coord[model_i, acceptor_mask]
box_for_model = box[model_i] if box is not None else None
cell_list = CellList(
- donor_h_coord, cell_size=cutoff_dist,
- periodic=periodic, box=box_for_model
- )
- possible_bonds |= cell_list.get_atoms_in_cells(
- acceptor_coord, as_mask=True
+ donor_h_coord, cell_size=cutoff_dist, periodic=periodic, box=box_for_model
)
+ possible_bonds |= cell_list.get_atoms_in_cells(acceptor_coord, as_mask=True)
possible_bonds_i = np.where(possible_bonds)
# Narrow down
acceptor_i = acceptor_i[possible_bonds_i[0]]
donor_h_i = donor_h_i[possible_bonds_i[1]]
-
+
# Build D-H..A triplets
donor_i = associated_donor_indices[donor_h_i]
triplets = np.stack((donor_i, donor_h_i, acceptor_i), axis=1)
# Remove entries where donor and acceptor are the same
triplets = triplets[donor_i != acceptor_i]
-
+
hbond_mask = _is_hbond(
- coord[:, triplets[:,0]], # donors
- coord[:, triplets[:,1]], # donor hydrogens
- coord[:, triplets[:,2]], # acceptors
- box, cutoff_dist=cutoff_dist, cutoff_angle=cutoff_angle
+ coord[:, triplets[:, 0]], # donors
+ coord[:, triplets[:, 1]], # donor hydrogens
+ coord[:, triplets[:, 2]], # acceptors
+ box,
+ cutoff_dist=cutoff_dist,
+ cutoff_angle=cutoff_angle,
)
# Reduce output to contain only triplets counted at least once
@@ -311,14 +332,14 @@ def _get_bonded_h(array, donor_mask, bonds):
all donors in atoms[donor_mask].
A `BondsList` is used for detecting bonded hydrogen atoms.
"""
- hydrogen_mask = (array.element == "H")
-
+ hydrogen_mask = array.element == "H"
+
donor_hydrogen_mask = np.zeros(len(array), dtype=bool)
associated_donor_indices = np.full(len(array), -1, dtype=int)
all_bond_indices, _ = bonds.get_all_bonds()
donor_indices = np.where(donor_mask)[0]
-
+
for donor_i in donor_indices:
bonded_indices = all_bond_indices[donor_i]
# Remove padding values
@@ -327,7 +348,7 @@ def _get_bonded_h(array, donor_mask, bonds):
bonded_indices = bonded_indices[hydrogen_mask[bonded_indices]]
donor_hydrogen_mask[bonded_indices] = True
associated_donor_indices[bonded_indices] = donor_i
-
+
return donor_hydrogen_mask, associated_donor_indices
@@ -342,22 +363,20 @@ def _get_bonded_h_via_distance(array, donor_mask, box):
coord = array.coord
res_id = array.res_id
- hydrogen_mask = (array.element == "H")
-
+ hydrogen_mask = array.element == "H"
+
donor_hydrogen_mask = np.zeros(len(array), dtype=bool)
associated_donor_indices = np.full(len(array), -1, dtype=int)
donor_indices = np.where(donor_mask)[0]
for donor_i in donor_indices:
candidate_mask = hydrogen_mask & (res_id == res_id[donor_i])
- distances = distance(
- coord[donor_i], coord[candidate_mask], box=box
- )
+ distances = distance(coord[donor_i], coord[candidate_mask], box=box)
donor_h_indices = np.where(candidate_mask)[0][distances <= CUTOFF]
for i in donor_h_indices:
associated_donor_indices[i] = donor_i
donor_hydrogen_mask[i] = True
-
+
return donor_hydrogen_mask, associated_donor_indices
@@ -378,12 +397,12 @@ def hbond_frequency(mask):
The frequency is the amount of models, where the respective bond
exists divided by the total amount of models.
-
+
Parameters
----------
mask: ndarray, dtype=bool, shape=(m,n)
Input mask obtained from `hbond` function.
-
+
Returns
-------
ndarray, dtype=Float
@@ -406,4 +425,4 @@ def hbond_frequency(mask):
0.132 0.053 0.026 0.158 0.026 0.868 0.211 0.026 0.921 0.316 0.079 0.237
0.105 0.421 0.079 0.026 1.000 0.053 0.132 0.026 0.184]
"""
- return mask.sum(axis=0)/len(mask)
+ return mask.sum(axis=0) / len(mask)
diff --git a/src/biotite/structure/info/__init__.py b/src/biotite/structure/info/__init__.py
index 4d754a9b8..3c7078ff7 100644
--- a/src/biotite/structure/info/__init__.py
+++ b/src/biotite/structure/info/__init__.py
@@ -14,8 +14,6 @@
__name__ = "biotite.structure.info"
__author__ = "Patrick Kunzmann, Tom David Müller"
-from .groups import *
-
from .atoms import *
from .bonds import *
from .groups import *
diff --git a/src/biotite/structure/info/atoms.py b/src/biotite/structure/info/atoms.py
index 6ab063a99..349fb40e4 100644
--- a/src/biotite/structure/info/atoms.py
+++ b/src/biotite/structure/info/atoms.py
@@ -6,15 +6,16 @@
__author__ = "Patrick Kunzmann"
__all__ = ["residue"]
-from .ccd import get_ccd
+from biotite.structure.info.ccd import get_ccd
-
-non_hetero_residues = set([
- "ALA","ARG","ASN","ASP","CYS","GLN","GLU","GLY","HIS",
- "ILE","LEU","LYS","MET","PHE","PRO","PYL","SER","THR",
- "TRP","TYR","VAL", "SEC",
+# fmt: off
+NON_HETERO_RESIDUES = set([
+ "ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS",
+ "ILE", "LEU", "LYS", "MET", "PHE", "PRO", "PYL", "SER", "THR",
+ "TRP", "TYR", "VAL", "SEC",
"A", "DA", "G", "DG", "C", "DC", "U", "DT",
])
+# fmt: on
def residue(res_name):
@@ -70,13 +71,11 @@ def residue(res_name):
['OXT' 'HXT']]
"""
# Avoid circular import
- from ..io.pdbx import get_component
+ from biotite.structure.io.pdbx import get_component
try:
component = get_component(get_ccd(), res_name=res_name)
except KeyError:
- raise KeyError(
- f"No atom information found for residue '{res_name}' in CCD"
- )
- component.hetero[:] = res_name not in non_hetero_residues
+ raise KeyError(f"No atom information found for residue '{res_name}' in CCD")
+ component.hetero[:] = res_name not in NON_HETERO_RESIDUES
return component
diff --git a/src/biotite/structure/info/bonds.py b/src/biotite/structure/info/bonds.py
index 421058162..cccd541c5 100644
--- a/src/biotite/structure/info/bonds.py
+++ b/src/biotite/structure/info/bonds.py
@@ -6,18 +6,17 @@
__author__ = "Patrick Kunzmann"
__all__ = ["bond_type", "bonds_in_residue"]
-from ..bonds import BondType
-from .ccd import get_from_ccd
-
+from biotite.structure.bonds import BondType
+from biotite.structure.info.ccd import get_from_ccd
BOND_TYPES = {
- ("SING", "N") : BondType.SINGLE,
- ("DOUB", "N") : BondType.DOUBLE,
- ("TRIP", "N") : BondType.TRIPLE,
- ("QUAD", "N") : BondType.QUADRUPLE,
- ("SING", "Y") : BondType.AROMATIC_SINGLE,
- ("DOUB", "Y") : BondType.AROMATIC_DOUBLE,
- ("TRIP", "Y") : BondType.AROMATIC_TRIPLE,
+ ("SING", "N"): BondType.SINGLE,
+ ("DOUB", "N"): BondType.DOUBLE,
+ ("TRIP", "N"): BondType.TRIPLE,
+ ("QUAD", "N"): BondType.QUADRUPLE,
+ ("SING", "Y"): BondType.AROMATIC_SINGLE,
+ ("DOUB", "Y"): BondType.AROMATIC_DOUBLE,
+ ("TRIP", "Y"): BondType.AROMATIC_TRIPLE,
}
_intra_bonds = {}
@@ -62,8 +61,7 @@ def bond_type(res_name, atom_name1, atom_name2):
return None
# Try both atom orders
bond_type_int = bonds_for_residue.get(
- (atom_name1, atom_name2),
- bonds_for_residue.get((atom_name2, atom_name1))
+ (atom_name1, atom_name2), bonds_for_residue.get((atom_name2, atom_name1))
)
if bond_type_int is not None:
return BondType(bond_type_int)
@@ -137,7 +135,7 @@ def bonds_in_residue(res_name):
chem_comp_bond_dict["atom_id_1"],
chem_comp_bond_dict["atom_id_2"],
chem_comp_bond_dict["value_order"],
- chem_comp_bond_dict["pdbx_aromatic_flag"]
+ chem_comp_bond_dict["pdbx_aromatic_flag"],
):
bond_type = BOND_TYPES[order, aromatic_flag]
bonds_for_residue[atom1.item(), atom2.item()] = bond_type
diff --git a/src/biotite/structure/info/ccd.py b/src/biotite/structure/info/ccd.py
index 8942f59ba..d48ab0f68 100644
--- a/src/biotite/structure/info/ccd.py
+++ b/src/biotite/structure/info/ccd.py
@@ -9,7 +9,6 @@
from pathlib import Path
import numpy as np
-
CCD_DIR = Path(__file__).parent / "ccd"
INDEX_COLUMN_NAME = {
"chem_comp": "id",
@@ -40,7 +39,7 @@ def get_ccd():
"""
# Avoid circular import
- from ..io.pdbx.bcif import BinaryCIFFile
+ from biotite.structure.io.pdbx.bcif import BinaryCIFFile
global _ccd_block
if _ccd_block is None:
@@ -104,7 +103,7 @@ def _index_residues(id_column):
# The final start is the exclusive stop of last residue
residue_starts = np.concatenate(([0], residue_starts, [len(id_column)]))
index = {}
- for i in range(len(residue_starts)-1):
+ for i in range(len(residue_starts) - 1):
comp_id = id_column[residue_starts[i]].item()
- index[comp_id] = (residue_starts[i], residue_starts[i+1])
- return index
\ No newline at end of file
+ index[comp_id] = (residue_starts[i], residue_starts[i + 1])
+ return index
diff --git a/src/biotite/structure/info/groups.py b/src/biotite/structure/info/groups.py
index c719acd3f..781f9c587 100644
--- a/src/biotite/structure/info/groups.py
+++ b/src/biotite/structure/info/groups.py
@@ -7,8 +7,6 @@
__all__ = ["amino_acid_names", "nucleotide_names", "carbohydrate_names"]
from pathlib import Path
-import copy
-
CCD_DIR = Path(__file__).parent / "ccd"
@@ -84,4 +82,4 @@ def _get_group_members(group_name):
if group_name not in group_lists:
with open(CCD_DIR / f"{group_name}.txt", "r") as file:
group_lists[group_name] = tuple(file.read().split())
- return group_lists[group_name]
\ No newline at end of file
+ return group_lists[group_name]
diff --git a/src/biotite/structure/info/masses.py b/src/biotite/structure/info/masses.py
index 73c0b6828..e0ac8cd8d 100644
--- a/src/biotite/structure/info/masses.py
+++ b/src/biotite/structure/info/masses.py
@@ -8,9 +8,8 @@
import json
from pathlib import Path
-from ..atoms import Atom, AtomArray, AtomArrayStack
-from .ccd import get_from_ccd
-
+from biotite.structure.atoms import Atom, AtomArray, AtomArrayStack
+from biotite.structure.info.ccd import get_from_ccd
# Masses are taken from http://www.sbcs.qmul.ac.uk/iupac/AtWt/ (2018/03/01)
ATOM_MASSES_FILE = Path(__file__).parent / "atom_masses.json"
@@ -109,15 +108,11 @@ def mass(item, is_residue=None):
elif isinstance(item, Atom):
result_mass = mass(item.element, is_residue=False)
elif isinstance(item, AtomArray) or isinstance(item, AtomArrayStack):
- result_mass = sum(
- (mass(element, is_residue=False) for element in item.element)
- )
+ result_mass = sum((mass(element, is_residue=False) for element in item.element))
else:
- raise TypeError(
- f"Cannot calculate mass for {type(item).__name__} objects"
- )
+ raise TypeError(f"Cannot calculate mass for {type(item).__name__} objects")
if result_mass is None:
raise KeyError(f"{item} is not known")
- return result_mass
\ No newline at end of file
+ return result_mass
diff --git a/src/biotite/structure/info/misc.py b/src/biotite/structure/info/misc.py
index 2fb9de55a..57e270568 100644
--- a/src/biotite/structure/info/misc.py
+++ b/src/biotite/structure/info/misc.py
@@ -6,7 +6,7 @@
__author__ = "Patrick Kunzmann"
__all__ = ["all_residues", "full_name", "link_type", "one_letter_code"]
-from .ccd import get_ccd, get_from_ccd
+from biotite.structure.info.ccd import get_ccd, get_from_ccd
def all_residues():
diff --git a/src/biotite/structure/info/radii.py b/src/biotite/structure/info/radii.py
index 392dd0c00..64ef734bc 100644
--- a/src/biotite/structure/info/radii.py
+++ b/src/biotite/structure/info/radii.py
@@ -6,9 +6,9 @@
__author__ = "Patrick Kunzmann"
__all__ = ["vdw_radius_protor", "vdw_radius_single"]
-from .bonds import bonds_in_residue
-
+from biotite.structure.info.bonds import bonds_in_residue
+# fmt: off
# Contains tuples for the different ProtOr groups:
# Tuple contains: element, valency, H count
_PROTOR_RADII = {
@@ -35,28 +35,29 @@
_SINGLE_RADII = {
"H": 1.20,
"HE": 1.40,
-
+
"C": 1.70,
"N": 1.55,
"O": 1.52,
"F": 1.47,
"NE": 1.54,
-
+
"SI": 2.10,
"P": 1.80,
"S": 1.80,
"CL": 1.75,
"AR": 1.88,
-
+
"AS": 1.85,
"SE": 1.90,
"BR": 1.85,
"KR": 2.02,
-
+
"TE": 2.06,
"I": 1.98,
"XE": 2.16,
}
+# fmt: on
# A dictionary that caches radii for each residue
_protor_radii = {}
@@ -82,7 +83,7 @@ def vdw_radius_protor(res_name, atom_name):
to.
atom_name : str
The name of the non-hydrogen atom.
-
+
Returns
-------
The Van-der-Waals radius of the given atom.
@@ -91,12 +92,12 @@ def vdw_radius_protor(res_name, atom_name):
See also
--------
vdw_radius_single
-
+
References
----------
-
+
.. footbibliography::
-
+
Examples
--------
@@ -113,8 +114,7 @@ def vdw_radius_protor(res_name, atom_name):
# Use cached radii for the residue, if already calculated
if atom_name not in _protor_radii[res_name]:
raise KeyError(
- f"Residue '{res_name}' does not contain an atom named "
- f"'{atom_name}'"
+ f"Residue '{res_name}' does not contain an atom named " f"'{atom_name}'"
)
return _protor_radii[res_name].get(atom_name)
else:
@@ -124,6 +124,7 @@ def vdw_radius_protor(res_name, atom_name):
# are cached
return vdw_radius_protor(res_name, atom_name)
+
def _calculate_protor_radii(res_name):
"""
Calculate the ProtOr VdW radii for all atoms (atom names) in
@@ -159,8 +160,7 @@ def _calculate_protor_radii(res_name):
group[2] += 1
groups[main_atom] = group
# Get radii based on ProtOr groups
- radii = {atom : _PROTOR_RADII.get(tuple(group))
- for atom, group in groups.items()}
+ radii = {atom: _PROTOR_RADII.get(tuple(group)) for atom, group in groups.items()}
return radii
@@ -173,25 +173,25 @@ def vdw_radius_single(element):
----------
element : str
The chemical element of the atoms.
-
+
Returns
-------
The Van-der-Waals radius of the atom.
If the radius is unknown for the element, `None` is returned.
-
+
See also
--------
vdw_radius_protor
-
+
References
----------
-
+
.. footbibliography::
-
+
Examples
--------
>>> print(vdw_radius_single("C"))
1.7
"""
- return _SINGLE_RADII.get(element.upper())
\ No newline at end of file
+ return _SINGLE_RADII.get(element.upper())
diff --git a/src/biotite/structure/info/standardize.py b/src/biotite/structure/info/standardize.py
index 2b1000265..558b81f41 100644
--- a/src/biotite/structure/info/standardize.py
+++ b/src/biotite/structure/info/standardize.py
@@ -8,9 +8,9 @@
import warnings
import numpy as np
-from .ccd import get_from_ccd
-from ..residues import get_residue_starts
-from ..error import BadStructureError
+from biotite.structure.error import BadStructureError
+from biotite.structure.info.ccd import get_from_ccd
+from biotite.structure.residues import get_residue_starts
def standardize_order(atoms):
@@ -116,26 +116,24 @@ def standardize_order(atoms):
reordered_indices = np.zeros(atoms.array_length(), dtype=int)
starts = get_residue_starts(atoms, add_exclusive_stop=True)
- for i in range(len(starts)-1):
+ for i in range(len(starts) - 1):
start = starts[i]
- stop = starts[i+1]
+ stop = starts[i + 1]
res_name = atoms.res_name[start]
- standard_atom_names = get_from_ccd(
- "chem_comp_atom", res_name, "atom_id"
- )
+ standard_atom_names = get_from_ccd("chem_comp_atom", res_name, "atom_id")
if standard_atom_names is None:
# If the residue is not in the CCD, keep the current order
warnings.warn(
f"Residue '{res_name}' is not in the CCD, "
f"keeping current atom order"
)
- reordered_indices[start : stop] = np.arange(start, stop)
+ reordered_indices[start:stop] = np.arange(start, stop)
continue
- reordered_indices[start : stop] = _reorder(
- atoms.atom_name[start : stop], standard_atom_names
- ) + start
+ reordered_indices[start:stop] = (
+ _reorder(atoms.atom_name[start:stop], standard_atom_names) + start
+ )
return reordered_indices
@@ -164,17 +162,13 @@ def _reorder(origin, target):
Indices for `origin` that that changes the order of `origin`
to the order of `target`.
"""
- target_hits, origin_hits = np.where(
- target[:, np.newaxis] == origin[np.newaxis, :]
- )
+ target_hits, origin_hits = np.where(target[:, np.newaxis] == origin[np.newaxis, :])
counts = np.bincount(target_hits, minlength=len(target))
if (counts > 1).any():
counts = np.bincount(target_hits, minlength=len(target))
# Identify which atom is duplicate
- duplicate_i = np.where(
- counts > 1
- )[0][0]
+ duplicate_i = np.where(counts > 1)[0][0]
duplicate_name = target[duplicate_i]
raise BadStructureError(
f"Input structure has duplicate atom '{duplicate_name}'"
@@ -185,12 +179,7 @@ def _reorder(origin, target):
# to the target structure
# -> Identify which atoms are missing in the target structure
# and append these to the end of the residue
- missing_atom_mask = np.bincount(
- origin_hits, minlength=len(origin)
- ).astype(bool)
- return np.concatenate([
- origin_hits,
- np.where(~missing_atom_mask)[0]
- ])
+ missing_atom_mask = np.bincount(origin_hits, minlength=len(origin)).astype(bool)
+ return np.concatenate([origin_hits, np.where(~missing_atom_mask)[0]])
else:
- return origin_hits
\ No newline at end of file
+ return origin_hits
diff --git a/src/biotite/structure/integrity.py b/src/biotite/structure/integrity.py
index 567908fad..b3e867666 100644
--- a/src/biotite/structure/integrity.py
+++ b/src/biotite/structure/integrity.py
@@ -9,21 +9,26 @@
__name__ = "biotite.structure"
__author__ = "Patrick Kunzmann, Daniel Bauer"
-__all__ = ["check_atom_id_continuity",
- "check_res_id_continuity", "check_backbone_continuity",
- "check_duplicate_atoms",
- "check_linear_continuity"]
+__all__ = [
+ "check_atom_id_continuity",
+ "check_res_id_continuity",
+ "check_backbone_continuity",
+ "check_duplicate_atoms",
+ "check_linear_continuity",
+]
import numpy as np
-import warnings
-from .filter import (
- filter_peptide_backbone, filter_phosphate_backbone, filter_linear_bond_continuity)
-from .box import coord_to_fraction
+from biotite.structure.box import coord_to_fraction
+from biotite.structure.filter import (
+ filter_linear_bond_continuity,
+ filter_peptide_backbone,
+ filter_phosphate_backbone,
+)
def _check_continuity(array):
diff = np.diff(array)
- discontinuity = np.where( ((diff != 0) & (diff != 1)) )
+ discontinuity = np.where(((diff != 0) & (diff != 1)))
return discontinuity[0] + 1
@@ -164,8 +169,9 @@ def check_duplicate_atoms(array):
The first occurence of an atom is not counted as duplicate.
"""
duplicates = []
- annots = [array.get_annotation(category) for category
- in array.get_annotation_categories()]
+ annots = [
+ array.get_annotation(category) for category in array.get_annotation_categories()
+ ]
for i in range(1, array.array_length()):
# Start with assumption that all atoms in the array
# until index i are duplicates of the atom at index i
@@ -174,7 +180,7 @@ def check_duplicate_atoms(array):
# For each annotation array filter out the atoms until
# index i that have an unequal annotation
# to the atom at index i
- is_duplicate &= (annot[:i] == annot[i])
+ is_duplicate &= annot[:i] == annot[i]
# After checking all annotation arrays,
# if there still is any duplicate to the atom at index i,
# add i the the list of duplicate atom indices
diff --git a/src/biotite/structure/io/__init__.py b/src/biotite/structure/io/__init__.py
index 3c3678c0d..510a65cf4 100644
--- a/src/biotite/structure/io/__init__.py
+++ b/src/biotite/structure/io/__init__.py
@@ -26,4 +26,4 @@
__author__ = "Patrick Kunzmann"
from .general import *
-from .trajfile import *
\ No newline at end of file
+from .trajfile import *
diff --git a/src/biotite/structure/io/dcd/__init__.py b/src/biotite/structure/io/dcd/__init__.py
index aa5e79366..1145f2376 100644
--- a/src/biotite/structure/io/dcd/__init__.py
+++ b/src/biotite/structure/io/dcd/__init__.py
@@ -10,4 +10,4 @@
__name__ = "biotite.structure.io.dcd"
__author__ = "Patrick Kunzmann"
-from .file import *
\ No newline at end of file
+from .file import *
diff --git a/src/biotite/structure/io/dcd/file.py b/src/biotite/structure/io/dcd/file.py
index 5aa1071f4..baffebe4e 100644
--- a/src/biotite/structure/io/dcd/file.py
+++ b/src/biotite/structure/io/dcd/file.py
@@ -7,20 +7,21 @@
__all__ = ["DCDFile"]
import numpy as np
-from ..trajfile import TrajectoryFile
-from ...box import vectors_from_unitcell, unitcell_from_vectors
+from biotite.structure.box import unitcell_from_vectors, vectors_from_unitcell
+from biotite.structure.io.trajfile import TrajectoryFile
class DCDFile(TrajectoryFile):
"""
This file class represents a DCD trajectory file.
"""
-
+
@classmethod
def traj_type(cls):
import mdtraj.formats as traj
+
return traj.DCDTrajectoryFile
-
+
@classmethod
def process_read_values(cls, read_values):
# .netcdf files use Angstrom
@@ -28,38 +29,40 @@ def process_read_values(cls, read_values):
cell_lengths = read_values[1]
cell_angles = read_values[2]
if cell_lengths is None or cell_angles is None:
- box = None
+ box = None
else:
box = np.stack(
- [vectors_from_unitcell(a, b, c, alpha, beta, gamma)
- for (a, b, c), (alpha, beta, gamma)
- in zip(cell_lengths, np.deg2rad(cell_angles))],
- axis=0
+ [
+ vectors_from_unitcell(a, b, c, alpha, beta, gamma)
+ for (a, b, c), (alpha, beta, gamma) in zip(
+ cell_lengths, np.deg2rad(cell_angles)
+ )
+ ],
+ axis=0,
)
return coord, box, None
-
+
@classmethod
def prepare_write_values(cls, coord, box, time):
- xyz = coord.astype(np.float32, copy=False) \
- if coord is not None else None
+ xyz = coord.astype(np.float32, copy=False) if coord is not None else None
if box is None:
cell_lengths = None
- cell_angles = None
+ cell_angles = None
else:
cell_lengths = np.zeros((len(box), 3), dtype=np.float32)
- cell_angles = np.zeros((len(box), 3), dtype=np.float32)
+ cell_angles = np.zeros((len(box), 3), dtype=np.float32)
for i, model_box in enumerate(box):
a, b, c, alpha, beta, gamma = unitcell_from_vectors(model_box)
cell_lengths[i] = np.array((a, b, c))
cell_angles[i] = np.rad2deg((alpha, beta, gamma))
return {
- "xyz" : xyz,
- "cell_lengths" : cell_lengths,
- "cell_angles" : cell_angles,
+ "xyz": xyz,
+ "cell_lengths": cell_lengths,
+ "cell_angles": cell_angles,
}
def set_time(self, time):
if time is not None:
raise NotImplementedError(
"This trajectory file does not support writing simulation time"
- )
\ No newline at end of file
+ )
diff --git a/src/biotite/structure/io/general.py b/src/biotite/structure/io/general.py
index ba0e0828b..626a321c3 100644
--- a/src/biotite/structure/io/general.py
+++ b/src/biotite/structure/io/general.py
@@ -12,9 +12,9 @@
__all__ = ["load_structure", "save_structure"]
import datetime
-import os.path
import io
-from ..atoms import AtomArrayStack
+import os.path
+from biotite.structure.atoms import AtomArrayStack
def load_structure(file_path, template=None, **kwargs):
@@ -64,57 +64,60 @@ def load_structure(file_path, template=None, **kwargs):
_, suffix = os.path.splitext(file_path)
match suffix:
case ".pdb":
- from .pdb import PDBFile
+ from biotite.structure.io.pdb import PDBFile
+
file = PDBFile.read(file_path)
array = file.get_structure(**kwargs)
return _as_single_model_if_possible(array)
case ".pdbqt":
- from .pdbqt import PDBQTFile
+ from biotite.structure.io.pdbqt import PDBQTFile
+
file = PDBQTFile.read(file_path)
array = file.get_structure(**kwargs)
return _as_single_model_if_possible(array)
case ".cif" | ".pdbx":
- from .pdbx import CIFFile, get_structure
+ from biotite.structure.io.pdbx import CIFFile, get_structure
+
file = CIFFile.read(file_path)
array = get_structure(file, **kwargs)
return _as_single_model_if_possible(array)
case ".bcif":
- from .pdbx import BinaryCIFFile, get_structure
+ from biotite.structure.io.pdbx import BinaryCIFFile, get_structure
+
file = BinaryCIFFile.read(file_path)
array = get_structure(file, **kwargs)
return _as_single_model_if_possible(array)
case ".gro":
- from .gro import GROFile
+ from biotite.structure.io.gro import GROFile
+
file = GROFile.read(file_path)
array = file.get_structure(**kwargs)
return _as_single_model_if_possible(array)
case ".mol":
- from .mol import MOLFile
+ from biotite.structure.io.mol import MOLFile
+
file = MOLFile.read(file_path)
array = file.get_structure(**kwargs)
# MOL and SDF files only contain a single model
return array
case ".sdf" | ".sd":
- from .mol import SDFile, get_structure
+ from biotite.structure.io.mol import SDFile, get_structure
+
file = SDFile.read(file_path)
array = get_structure(file, **kwargs)
return array
case ".trr" | ".xtc" | ".tng" | ".dcd" | ".netcdf":
if template is None:
- raise TypeError(
- "Template must be specified for trajectory files"
- )
+ raise TypeError("Template must be specified for trajectory files")
# Filter template for atom ids, if an unfiltered template
- if (
- "atom_i" in kwargs
- and template.shape[-1] != len(kwargs["atom_i"])
- ):
+ if "atom_i" in kwargs and template.shape[-1] != len(kwargs["atom_i"]):
template = template[..., kwargs["atom_i"]]
- from .trr import TRRFile
- from .xtc import XTCFile
- from .tng import TNGFile
- from .dcd import DCDFile
- from .netcdf import NetCDFFile
+ from biotite.structure.io.dcd import DCDFile
+ from biotite.structure.io.netcdf import NetCDFFile
+ from biotite.structure.io.tng import TNGFile
+ from biotite.structure.io.trr import TRRFile
+ from biotite.structure.io.xtc import XTCFile
+
if suffix == ".trr":
traj_file_cls = TRRFile
if suffix == ".xtc":
@@ -159,49 +162,57 @@ def save_structure(file_path, array, **kwargs):
_, suffix = os.path.splitext(file_path)
match suffix:
case ".pdb":
- from .pdb import PDBFile
+ from biotite.structure.io.pdb import PDBFile
+
file = PDBFile()
file.set_structure(array, **kwargs)
file.write(file_path)
case ".pdbqt":
- from .pdbqt import PDBQTFile
+ from biotite.structure.io.pdbqt import PDBQTFile
+
file = PDBQTFile()
file.set_structure(array, **kwargs)
file.write(file_path)
case ".cif" | ".pdbx":
- from .pdbx import CIFFile, set_structure
+ from biotite.structure.io.pdbx import CIFFile, set_structure
+
file = CIFFile()
set_structure(file, array, **kwargs)
file.write(file_path)
case ".bcif":
- from .pdbx import BinaryCIFFile, set_structure
+ from biotite.structure.io.pdbx import BinaryCIFFile, set_structure
+
file = BinaryCIFFile()
set_structure(file, array, **kwargs)
file.write(file_path)
case ".gro":
- from .gro import GROFile
+ from biotite.structure.io.gro import GROFile
+
file = GROFile()
file.set_structure(array, **kwargs)
file.write(file_path)
case ".mol":
- from .mol import MOLFile
+ from biotite.structure.io.mol import MOLFile
+
file = MOLFile()
file.set_structure(array, **kwargs)
file.header = _mol_header()
file.write(file_path)
case ".sdf" | ".sd":
- from .mol import SDFile, SDRecord, set_structure
+ from biotite.structure.io.mol import SDFile, SDRecord, set_structure
+
record = SDRecord()
record.set_structure(array, **kwargs)
record.header = _mol_header()
file = SDFile({"Molecule": record})
file.write(file_path)
case ".trr" | ".xtc" | ".tng" | ".dcd" | ".netcdf":
- from .trr import TRRFile
- from .xtc import XTCFile
- from .tng import TNGFile
- from .dcd import DCDFile
- from .netcdf import NetCDFFile
+ from biotite.structure.io.dcd import DCDFile
+ from biotite.structure.io.netcdf import NetCDFFile
+ from biotite.structure.io.tng import TNGFile
+ from biotite.structure.io.trr import TRRFile
+ from biotite.structure.io.xtc import XTCFile
+
if suffix == ".trr":
traj_file_cls = TRRFile
if suffix == ".xtc":
@@ -228,10 +239,11 @@ def _as_single_model_if_possible(atoms):
def _mol_header():
- from .mol import Header
+ from biotite.structure.io.mol import Header
+
return Header(
mol_name="Molecule",
program="Biotite",
time=datetime.datetime.now(),
dimensions="3D",
- )
\ No newline at end of file
+ )
diff --git a/src/biotite/structure/io/gro/__init__.py b/src/biotite/structure/io/gro/__init__.py
index 8d10671b5..e58ccff55 100644
--- a/src/biotite/structure/io/gro/__init__.py
+++ b/src/biotite/structure/io/gro/__init__.py
@@ -11,4 +11,4 @@
__name__ = "biotite.structure.io.gro"
__author__ = "Daniel Bauer"
-from .file import *
\ No newline at end of file
+from .file import *
diff --git a/src/biotite/structure/io/gro/file.py b/src/biotite/structure/io/gro/file.py
index 188338e50..8279b639d 100644
--- a/src/biotite/structure/io/gro/file.py
+++ b/src/biotite/structure/io/gro/file.py
@@ -6,25 +6,27 @@
__author__ = "Daniel Bauer, Patrick Kunzmann"
__all__ = ["GROFile"]
-import numpy as np
-from ...atoms import AtomArray, AtomArrayStack
-from ...box import is_orthogonal
-from ....file import TextFile, InvalidFileError
-from ...repair import infer_elements
-from ...error import BadStructureError
import copy
from datetime import datetime
-
-_atom_records = {"res_id" : (0, 5),
- "res_name" : (5,10),
- "atom_name" : (10,15),
- "atom_id" : (15,20),
- "coord_x" : (20, 28),
- "coord_y" : (28, 36),
- "coord_z" : (36, 44),
- "v_x" : (44, 52),
- "v_y" : (52, 60),
- "v_z" : (60, 68)}
+import numpy as np
+from biotite.file import InvalidFileError, TextFile
+from biotite.structure.atoms import AtomArray, AtomArrayStack
+from biotite.structure.box import is_orthogonal
+from biotite.structure.error import BadStructureError
+from biotite.structure.repair import infer_elements
+
+_atom_records = {
+ "res_id": (0, 5),
+ "res_name": (5, 10),
+ "atom_name": (10, 15),
+ "atom_id": (15, 20),
+ "coord_x": (20, 28),
+ "coord_y": (28, 36),
+ "coord_z": (36, 44),
+ "v_x": (44, 52),
+ "v_y": (52, 60),
+ "v_z": (60, 68),
+}
class GROFile(TextFile):
@@ -48,6 +50,7 @@ class GROFile(TextFile):
>>> file.write(os.path.join(path_to_directory, "1l2y_mod.gro"))
"""
+
def get_model_count(self):
"""
Get the number of models contained in this GRO file.
@@ -63,7 +66,6 @@ def get_model_count(self):
model_count += 1
return model_count
-
def get_structure(self, model=None):
"""
Get an :class:`AtomArray` or :class:`AtomArrayStack` from the
@@ -91,9 +93,7 @@ def get_atom_line_i(model_start_i, model_atom_counts):
"""
Helper function to get the indices of all atoms for a model
"""
- return np.arange(
- model_start_i+1, model_start_i+1+model_atom_counts
- )
+ return np.arange(model_start_i + 1, model_start_i + 1 + model_atom_counts)
def set_box_dimen(box_param):
"""
@@ -114,33 +114,31 @@ def set_box_dimen(box_param):
return None
if len(box_param) == 3:
x, y, z = box_param
- return np.array([[x,0,0], [0,y,0], [0,0,z]], dtype=float)
+ return np.array([[x, 0, 0], [0, y, 0], [0, 0, z]], dtype=float)
elif len(box_param) == 9:
x1, y2, z3, x2, x3, y1, y3, z1, z2 = box_param
- return np.array(
- [[x1,x2,x3], [y1,y2,y3], [z1,z2,z3]], dtype=float
- )
+ return np.array([[x1, x2, x3], [y1, y2, y3], [z1, z2, z3]], dtype=float)
else:
raise InvalidFileError(
f"Invalid amount of box parameters: {len(box_param)}"
)
# Line indices where a new model starts
- model_start_i = np.array([i for i in range(len(self.lines))
- if _is_int(self.lines[i])],
- dtype=int)
+ model_start_i = np.array(
+ [i for i in range(len(self.lines)) if _is_int(self.lines[i])], dtype=int
+ )
# Number of atoms in each model
- model_atom_counts = np.array(
- [int(self.lines[i]) for i in model_start_i]
- )
+ model_atom_counts = np.array([int(self.lines[i]) for i in model_start_i])
if model is None:
# Check if all models have the same length
if np.all(model_atom_counts != model_atom_counts[0]):
- raise BadStructureError("The models in the file have unequal "
- "amount of atoms, give an explicit "
- "model instead")
+ raise BadStructureError(
+ "The models in the file have unequal "
+ "amount of atoms, give an explicit "
+ "model instead"
+ )
depth = len(model_start_i)
length = model_atom_counts[0]
array = AtomArrayStack(depth, length)
@@ -159,10 +157,10 @@ def set_box_dimen(box_param):
f"the given model {model} does not exist"
)
- length = model_atom_counts[model-1]
+ length = model_atom_counts[model - 1]
array = AtomArray(length)
- annot_i = get_atom_line_i(model_start_i[model-1], length)
+ annot_i = get_atom_line_i(model_start_i[model - 1], length)
# Replace empty strings for elements with guessed types
# i is index in array, line_i is line index
@@ -179,27 +177,25 @@ def set_box_dimen(box_param):
for i, line_i in enumerate(atom_i):
line = self.lines[line_i]
# gro files use nm instead of A
- array.coord[i,0] = float(line[20:28])*10
- array.coord[i,1] = float(line[28:36])*10
- array.coord[i,2] = float(line[36:44])*10
+ array.coord[i, 0] = float(line[20:28]) * 10
+ array.coord[i, 1] = float(line[28:36]) * 10
+ array.coord[i, 2] = float(line[36:44]) * 10
# Box is stored in last line (after coordinates)
box_i = atom_i[-1] + 1
- box_param = [float(e)*10 for e in self.lines[box_i].split()]
+ box_param = [float(e) * 10 for e in self.lines[box_i].split()]
array.box = set_box_dimen(box_param)
elif isinstance(array, AtomArrayStack):
for m in range(len(model_start_i)):
- atom_i = get_atom_line_i(
- model_start_i[m], model_atom_counts[m]
- )
+ atom_i = get_atom_line_i(model_start_i[m], model_atom_counts[m])
for i, line_i in enumerate(atom_i):
line = self.lines[line_i]
- array.coord[m,i,0] = float(line[20:28])*10
- array.coord[m,i,1] = float(line[28:36])*10
- array.coord[m,i,2] = float(line[36:44])*10
+ array.coord[m, i, 0] = float(line[20:28]) * 10
+ array.coord[m, i, 1] = float(line[28:36]) * 10
+ array.coord[m, i, 2] = float(line[36:44]) * 10
# Box is stored in last line (after coordinates)
box_i = atom_i[-1] + 1
- box_param = [float(e)*10 for e in self.lines[box_i].split()]
+ box_param = [float(e) * 10 for e in self.lines[box_i].split()]
box = set_box_dimen(box_param)
# Create a box in the stack if not already existing
# and the box is not a dummy
@@ -210,7 +206,6 @@ def set_box_dimen(box_param):
return array
-
def set_structure(self, array):
"""
Set the :class:`AtomArray` or :class:`AtomArrayStack` for the
@@ -223,6 +218,7 @@ def set_structure(self, array):
is given, each array in the stack is saved as separate
model.
"""
+
def get_box_dimen(array):
"""
GRO files have the box dimensions as last line for each
@@ -253,10 +249,15 @@ def get_box_dimen(array):
else:
box = box / 10
box_elements = (
- box[0,0], box[1,1], box[2,2],
- box[0,1], box[0,2],
- box[1,0], box[1,2],
- box[2,0], box[2,1],
+ box[0, 0],
+ box[1, 1],
+ box[2, 2],
+ box[0, 1],
+ box[0, 2],
+ box[1, 0],
+ box[1, 2],
+ box[2, 0],
+ box[2, 1],
)
return " ".join([f"{e:>9.5f}" for e in box_elements])
@@ -266,17 +267,11 @@ def get_box_dimen(array):
atom_id = np.arange(1, array.array_length() + 1)
# Atom IDs are supported up to 99999,
# but negative IDs are also possible
- gro_atom_id = np.where(
- atom_id > 0,
- ((atom_id - 1) % 99999) + 1,
- atom_id
- )
+ gro_atom_id = np.where(atom_id > 0, ((atom_id - 1) % 99999) + 1, atom_id)
# Residue IDs are supported up to 9999,
# but negative IDs are also possible
gro_res_id = np.where(
- array.res_id > 0,
- ((array.res_id - 1) % 99999) + 1,
- array.res_id
+ array.res_id > 0, ((array.res_id - 1) % 99999) + 1, array.res_id
)
if isinstance(array, AtomArray):
@@ -290,10 +285,14 @@ def get_box_dimen(array):
fmt = "{:>5d}{:5s}{:>5s}{:>5d}{:>8.3f}{:>8.3f}{:>8.3f}"
for i in range(array.array_length()):
# gro format is in nm -> multiply coords by 10
- self.lines[i+2] = fmt.format(
- gro_res_id[i], array.res_name[i], array.atom_name[i],
- gro_atom_id[i], array.coord[i,0]/10, array.coord[i,1]/10,
- array.coord[i,2]/10
+ self.lines[i + 2] = fmt.format(
+ gro_res_id[i],
+ array.res_name[i],
+ array.atom_name[i],
+ gro_atom_id[i],
+ array.coord[i, 0] / 10,
+ array.coord[i, 1] / 10,
+ array.coord[i, 2] / 10,
)
# Write box lines
self.lines[-1] = get_box_dimen(array)
@@ -304,10 +303,11 @@ def get_box_dimen(array):
# Therefore template lines are created
# which are afterwards applied for each model
templines = [None] * array.array_length()
- fmt = '{:>5d}{:5s}{:>5s}{:5d}'
+ fmt = "{:>5d}{:5s}{:>5s}{:5d}"
for i in range(array.array_length()):
- templines[i] = fmt.format(gro_res_id[i], array.res_name[i],
- array.atom_name[i], gro_atom_id[i])
+ templines[i] = fmt.format(
+ gro_res_id[i], array.res_name[i], array.atom_name[i], gro_atom_id[i]
+ )
for i in range(array.stack_depth()):
self.lines.append(
@@ -319,10 +319,11 @@ def get_box_dimen(array):
modellines = copy.copy(templines)
for j, line in enumerate(modellines):
# Insert coordinates
- line = (line + "{:>8.3f}{:>8.3f}{:>8.3f}".format(
- array.coord[i,j,0]/10,
- array.coord[i,j,1]/10,
- array.coord[i,j,2]/10))
+ line = line + "{:>8.3f}{:>8.3f}{:>8.3f}".format(
+ array.coord[i, j, 0] / 10,
+ array.coord[i, j, 1] / 10,
+ array.coord[i, j, 2] / 10,
+ )
modellines[j] = line
self.lines.extend(modellines)
self.lines.append(get_box_dimen(array[i]))
@@ -340,4 +341,4 @@ def _is_int(string):
int(string)
return True
except ValueError:
- return False
\ No newline at end of file
+ return False
diff --git a/src/biotite/structure/io/mol/__init__.py b/src/biotite/structure/io/mol/__init__.py
index 9e8ee2097..ba71d85a2 100644
--- a/src/biotite/structure/io/mol/__init__.py
+++ b/src/biotite/structure/io/mol/__init__.py
@@ -17,4 +17,4 @@
from .convert import *
from .header import *
from .mol import *
-from .sdf import *
\ No newline at end of file
+from .sdf import *
diff --git a/src/biotite/structure/io/mol/convert.py b/src/biotite/structure/io/mol/convert.py
index 2961c79c9..64cae7ff3 100644
--- a/src/biotite/structure/io/mol/convert.py
+++ b/src/biotite/structure/io/mol/convert.py
@@ -6,9 +6,9 @@
__author__ = "Patrick Kunzmann"
__all__ = ["get_structure", "set_structure"]
-from .mol import MOLFile
-from .sdf import SDFile, SDRecord
-from ...bonds import BondType
+from biotite.structure.bonds import BondType
+from biotite.structure.io.mol.mol import MOLFile
+from biotite.structure.io.mol.sdf import SDFile, SDRecord
def get_structure(mol_file, record_name=None):
@@ -39,8 +39,9 @@ def get_structure(mol_file, record_name=None):
return record.get_structure()
-def set_structure(mol_file, atoms, default_bond_type=BondType.ANY,
- version=None, record_name=None):
+def set_structure(
+ mol_file, atoms, default_bond_type=BondType.ANY, version=None, record_name=None
+):
"""
Set the :class:`AtomArray` for the MOL file.
@@ -88,9 +89,7 @@ def _get_record(file, record_name):
else:
return file[record_name]
else:
- raise TypeError(
- f"Unsupported file type '{type(file).__name__}'"
- )
+ raise TypeError(f"Unsupported file type '{type(file).__name__}'")
def _get_or_create_record(file, record_name):
@@ -110,6 +109,4 @@ def _get_or_create_record(file, record_name):
file[record_name] = record
return file[record_name]
else:
- raise TypeError(
- f"Unsupported file type '{type(file).__name__}'"
- )
\ No newline at end of file
+ raise TypeError(f"Unsupported file type '{type(file).__name__}'")
diff --git a/src/biotite/structure/io/mol/ctab.py b/src/biotite/structure/io/mol/ctab.py
index e8fff5d10..d4577e382 100644
--- a/src/biotite/structure/io/mol/ctab.py
+++ b/src/biotite/structure/io/mol/ctab.py
@@ -12,13 +12,13 @@
__all__ = ["read_structure_from_ctab", "write_structure_to_ctab"]
import itertools
-import warnings
import shlex
+import warnings
import numpy as np
-from ....file import InvalidFileError
-from ...error import BadStructureError
-from ...atoms import AtomArray, AtomArrayStack
-from ...bonds import BondList, BondType
+from biotite.file import InvalidFileError
+from biotite.structure.atoms import AtomArray, AtomArrayStack
+from biotite.structure.bonds import BondList, BondType
+from biotite.structure.error import BadStructureError
BOND_TYPE_MAPPING = {
1: BondType.SINGLE,
@@ -84,8 +84,7 @@ def read_structure_from_ctab(ctab_lines):
raise InvalidFileError(f"Unknown CTAB version '{unkown_version}'")
-def write_structure_to_ctab(atoms, default_bond_type=BondType.ANY,
- version=None):
+def write_structure_to_ctab(atoms, default_bond_type=BondType.ANY, version=None):
"""
Convert an :class:`AtomArray` into a
*MDL* connection table (Ctab).
@@ -124,8 +123,7 @@ def write_structure_to_ctab(atoms, default_bond_type=BondType.ANY,
"""
if isinstance(atoms, AtomArrayStack):
raise TypeError(
- "An 'AtomArrayStack' was given, "
- "but only a single model can be written"
+ "An 'AtomArrayStack' was given, " "but only a single model can be written"
)
if atoms.bonds is None:
raise BadStructureError("Input AtomArray has no associated BondList")
@@ -134,9 +132,7 @@ def write_structure_to_ctab(atoms, default_bond_type=BondType.ANY,
match version:
case None:
- if _is_v2000_compatible(
- atoms.array_length(), atoms.bonds.get_bond_count()
- ):
+ if _is_v2000_compatible(atoms.array_length(), atoms.bonds.get_bond_count()):
return _write_structure_to_ctab_v2000(atoms, default_bond_type)
else:
return _write_structure_to_ctab_v3000(atoms, default_bond_type)
@@ -160,7 +156,8 @@ def _read_structure_from_ctab_v2000(ctab_lines):
atom_lines = ctab_lines[1 : 1 + n_atoms]
bond_lines = ctab_lines[1 + n_atoms : 1 + n_atoms + n_bonds]
charge_lines = [
- line for line in ctab_lines[1 + n_atoms + n_bonds:]
+ line
+ for line in ctab_lines[1 + n_atoms + n_bonds :]
if line.startswith("M CHG")
]
@@ -208,10 +205,9 @@ def _read_structure_from_ctab_v2000(ctab_lines):
return atoms
+
def _read_structure_from_ctab_v3000(ctab_lines):
- v30_lines = [
- line[6:].strip() for line in ctab_lines if line.startswith("M V30")
- ]
+ v30_lines = [line[6:].strip() for line in ctab_lines if line.startswith("M V30")]
atom_lines = _get_block_v3000(v30_lines, "ATOM")
if len(atom_lines) == 0:
@@ -262,16 +258,20 @@ def _read_structure_from_ctab_v3000(ctab_lines):
return atoms
+
def _get_version(counts_line):
return counts_line[33:39].strip()
+
def _is_v2000_compatible(n_atoms, n_bonds):
# The format uses a maximum of 3 digits for the atom and bond count
return n_atoms < 1000 and n_bonds < 1000
+
def _get_counts_v2000(counts_line):
return int(counts_line[0:3]), int(counts_line[3:6])
+
def _get_block_v3000(v30_lines, block_name):
block_lines = []
in_block = False
@@ -282,13 +282,12 @@ def _get_block_v3000(v30_lines, block_name):
if in_block:
return block_lines
else:
- raise InvalidFileError(
- f"Block '{block_name}' ended before it began"
- )
+ raise InvalidFileError(f"Block '{block_name}' ended before it began")
elif in_block:
block_lines.append(line)
return block_lines
+
def create_property_dict_v3000(property_strings):
properties = {}
for prop in property_strings:
@@ -315,7 +314,8 @@ def _write_structure_to_ctab_v2000(atoms, default_bond_type):
f" {atoms.element[i].capitalize():3}"
f"{0:>2}" # Mass difference -> unused
f"{CHARGE_MAPPING_REV.get(charge[i], 0):>3d}"
- + f"{0:>3d}" * 10 # More unused fields
+ + f"{0:>3d}"
+ * 10 # More unused fields
for i in range(atoms.array_length())
]
@@ -323,7 +323,8 @@ def _write_structure_to_ctab_v2000(atoms, default_bond_type):
bond_lines = [
f"{i+1:>3d}{j+1:>3d}"
f"{BOND_TYPE_MAPPING_REV.get(bond_type, default_bond_value):>3d}"
- + f"{0:>3d}" * 4
+ + f"{0:>3d}"
+ * 4
for i, j, bond_type in atoms.bonds.as_array()
]
@@ -332,8 +333,7 @@ def _write_structure_to_ctab_v2000(atoms, default_bond_type):
charge_lines = []
# Each `M CHG` line can contain up to 8 charges
for batch in _batched(
- [(atom_i, c) for atom_i, c in enumerate(charge) if c != 0],
- N_CHARGES_PER_LINE
+ [(atom_i, c) for atom_i, c in enumerate(charge) if c != 0], N_CHARGES_PER_LINE
):
charge_lines.append(
f"M CHG{len(batch):>3d}"
@@ -349,9 +349,7 @@ def _write_structure_to_ctab_v3000(atoms, default_bond_type):
except AttributeError:
charges = np.zeros(atoms.array_length(), dtype=int)
- counts_line = (
- f"COUNTS {atoms.array_length()} {atoms.bonds.get_bond_count()} 0 0 0"
- )
+ counts_line = f"COUNTS {atoms.array_length()} {atoms.bonds.get_bond_count()} 0 0 0"
atom_lines = [
f"{i + 1}"
@@ -375,32 +373,35 @@ def _write_structure_to_ctab_v3000(atoms, default_bond_type):
]
lines = (
- ["BEGIN CTAB"] +
- [counts_line] +
- ["BEGIN ATOM"] +
- atom_lines +
- ["END ATOM"] +
- ["BEGIN BOND"] +
- bond_lines +
- ["END BOND"] +
- ["END CTAB"]
+ ["BEGIN CTAB"]
+ + [counts_line]
+ + ["BEGIN ATOM"]
+ + atom_lines
+ + ["END ATOM"]
+ + ["BEGIN BOND"]
+ + bond_lines
+ + ["END BOND"]
+ + ["END CTAB"]
)
# Mark lines as V3000 CTAB
lines = ["M V30 " + line for line in lines]
return [V2000_COMPATIBILITY_LINE] + lines + ["M END"]
+
def _to_property(charge):
if charge == 0:
return ""
else:
return f"CHG={charge}"
+
def _quote(string):
if " " in string or len(string) == 0:
return f'"{string}"'
else:
return string
+
def _batched(iterable, n):
"""
Equivalent to :func:`itertools.batched()`.
@@ -411,4 +412,4 @@ def _batched(iterable, n):
"""
iterator = iter(iterable)
while batch := tuple(itertools.islice(iterator, n)):
- yield batch
\ No newline at end of file
+ yield batch
diff --git a/src/biotite/structure/io/mol/header.py b/src/biotite/structure/io/mol/header.py
index 3b4f1b48d..0c459acac 100644
--- a/src/biotite/structure/io/mol/header.py
+++ b/src/biotite/structure/io/mol/header.py
@@ -6,16 +6,15 @@
__author__ = "Patrick Kunzmann"
__all__ = ["Header"]
-import warnings
import datetime
+import warnings
from dataclasses import dataclass
-
_DATE_FORMAT = "%m%d%y%H%M"
@dataclass
-class Header():
+class Header:
"""
The header for connection tables.
@@ -70,20 +69,25 @@ def deserialize(text):
try:
time = datetime.datetime.strptime(time_string, _DATE_FORMAT)
except ValueError:
- warnings.warn(
- f"Invalid time format '{time_string}' in file header"
- )
+ warnings.warn(f"Invalid time format '{time_string}' in file header")
time = None
dimensions = lines[1][20:22].strip()
scaling_factors = lines[1][22:34].strip()
- energy = lines[1][34:46].strip()
+ energy = lines[1][34:46].strip()
registry_number = lines[1][46:52].strip()
comments = lines[2].strip()
return Header(
- mol_name, initials, program, time, dimensions,
- scaling_factors, energy, registry_number, comments
+ mol_name,
+ initials,
+ program,
+ time,
+ dimensions,
+ scaling_factors,
+ energy,
+ registry_number,
+ comments,
)
def serialize(self):
@@ -113,4 +117,4 @@ def serialize(self):
return text
def __str__(self):
- return self.serialize()
\ No newline at end of file
+ return self.serialize()
diff --git a/src/biotite/structure/io/mol/mol.py b/src/biotite/structure/io/mol/mol.py
index de58cdfb7..72122f0ef 100644
--- a/src/biotite/structure/io/mol/mol.py
+++ b/src/biotite/structure/io/mol/mol.py
@@ -6,11 +6,13 @@
__author__ = "Patrick Kunzmann"
__all__ = ["MOLFile"]
-from ....file import TextFile, InvalidFileError
-from .ctab import read_structure_from_ctab, write_structure_to_ctab
-from .header import Header
-from ...bonds import BondType
-
+from biotite.file import InvalidFileError, TextFile
+from biotite.structure.bonds import BondType
+from biotite.structure.io.mol.ctab import (
+ read_structure_from_ctab,
+ write_structure_to_ctab,
+)
+from biotite.structure.io.mol.header import Header
# Number of header lines
N_HEADER = 3
@@ -80,27 +82,23 @@ def __init__(self):
self.lines = [""] * N_HEADER
self._header = None
-
@classmethod
def read(cls, file):
mol_file = super().read(file)
mol_file._header = None
return mol_file
-
@property
def header(self):
if self._header is None:
self._header = Header.deserialize("\n".join(self.lines[0:3]) + "\n")
return self._header
-
@header.setter
def header(self, header):
self._header = header
self.lines[0:3] = self._header.serialize().splitlines()
-
def get_structure(self):
"""
Get an :class:`AtomArray` from the MOL file.
@@ -118,9 +116,7 @@ def get_structure(self):
raise InvalidFileError("File does not contain structure data")
return read_structure_from_ctab(ctab_lines)
-
- def set_structure(self, atoms, default_bond_type=BondType.ANY,
- version=None):
+ def set_structure(self, atoms, default_bond_type=BondType.ANY, version=None):
"""
Set the :class:`AtomArray` for the file.
@@ -146,9 +142,8 @@ def set_structure(self, atoms, default_bond_type=BondType.ANY,
)
-
def _get_ctab_lines(lines):
for i, line in enumerate(lines):
if line.startswith("M END"):
- return lines[N_HEADER:i+1]
+ return lines[N_HEADER : i + 1]
return lines[N_HEADER:]
diff --git a/src/biotite/structure/io/mol/sdf.py b/src/biotite/structure/io/mol/sdf.py
index a2b35096b..2048a482a 100644
--- a/src/biotite/structure/io/mol/sdf.py
+++ b/src/biotite/structure/io/mol/sdf.py
@@ -8,16 +8,24 @@
import re
import warnings
+from collections.abc import Mapping, MutableMapping
from dataclasses import dataclass
-from collections.abc import MutableMapping, Mapping
import numpy as np
-from ....file import File, InvalidFileError, is_open_compatible, is_text, \
- DeserializationError, SerializationError
-from .ctab import read_structure_from_ctab, write_structure_to_ctab
-from .header import Header
-from ...atoms import AtomArray
-from ...bonds import BondList, BondType
-
+from biotite.file import (
+ DeserializationError,
+ File,
+ InvalidFileError,
+ SerializationError,
+ is_open_compatible,
+ is_text,
+)
+from biotite.structure.atoms import AtomArray
+from biotite.structure.bonds import BondList, BondType
+from biotite.structure.io.mol.ctab import (
+ read_structure_from_ctab,
+ write_structure_to_ctab,
+)
+from biotite.structure.io.mol.header import Header
_N_HEADER = 3
# Number of header lines
@@ -96,6 +104,7 @@ class Key:
number, name, registry_internal, registry_external
The same as the parameters.
"""
+
# The characters that can be given as input to `name`
# First character must be alphanumeric,
# following characters may include underscores and periods
@@ -103,7 +112,7 @@ class Key:
# they are still used in practice and therefore allowed here
_NAME_INPUT_REGEX = re.compile(r"^[a-zA-Z0-9][\w.]*$")
# These regexes are used to parse the key from a line
- _COMPONENT_REGEX = {
+ _COMPONENT_REGEX = {
"number": re.compile(r"^DT(\d+)$"),
"name": re.compile(r"^<([a-zA-Z0-9][\w.]*)>$"),
"registry_internal": re.compile(r"^(\d+)$"),
@@ -162,9 +171,7 @@ def deserialize(text):
break
else:
# There is no matching pattern
- raise DeserializationError(
- f"Invalid key component '{component}'"
- )
+ raise DeserializationError(f"Invalid key component '{component}'")
return Metadata.Key(**parsed_component_dict)
def serialize(self):
@@ -190,7 +197,6 @@ def serialize(self):
def __str__(self):
return self.serialize()
-
def __init__(self, metadata=None):
if metadata is None:
metadata = {}
@@ -222,9 +228,7 @@ def deserialize(text):
current_value = None
else:
if current_key is None:
- raise DeserializationError(
- "Value found before metadata key"
- )
+ raise DeserializationError("Value found before metadata key")
if current_value is None:
current_value = line
else:
@@ -388,7 +392,7 @@ def header(self):
if isinstance(self._header, str):
try:
self._header = Header.deserialize(self._header)
- except:
+ except Exception:
raise DeserializationError("Failed to deserialize header")
return self._header
@@ -406,7 +410,7 @@ def metadata(self):
if isinstance(self._metadata, str):
try:
self._metadata = Metadata.deserialize(self._metadata)
- except:
+ except Exception:
raise DeserializationError("Failed to deserialize metadata")
return self._metadata
@@ -483,8 +487,7 @@ def get_structure(self):
raise InvalidFileError("File does not contain structure data")
return read_structure_from_ctab(ctab_lines)
- def set_structure(self, atoms, default_bond_type=BondType.ANY,
- version=None):
+ def set_structure(self, atoms, default_bond_type=BondType.ANY, version=None):
"""
Set the structural data in the SD record.
@@ -505,9 +508,9 @@ def set_structure(self, atoms, default_bond_type=BondType.ANY,
By default, ``"V2000"`` is used, unless the number of atoms
or bonds exceeds 999, in which case ``"V3000"`` is used.
"""
- self._ctab = _join_with_terminal_newline(write_structure_to_ctab(
- atoms, default_bond_type, version
- ))
+ self._ctab = _join_with_terminal_newline(
+ write_structure_to_ctab(atoms, default_bond_type, version)
+ )
def __eq__(self, other):
if not isinstance(other, type(self)):
@@ -736,28 +739,29 @@ def deserialize(text):
The content to be deserialized.
"""
lines = text.splitlines()
- record_ends = np.array([
- i for i, line in enumerate(lines)
- if line.startswith(_RECORD_DELIMITER)
- ], dtype=int)
+ record_ends = np.array(
+ [i for i, line in enumerate(lines) if line.startswith(_RECORD_DELIMITER)],
+ dtype=int,
+ )
if len(record_ends) == 0:
warnings.warn(
"Final record delimiter missing, "
"maybe this is a MOL file instead of a SD file"
)
- record_ends = np.array([len(lines)-1], dtype=int)
+ record_ends = np.array([len(lines) - 1], dtype=int)
# The first record starts at the first line and the last
# delimiter is at the end of the file
# Records in the middle start directly after the delimiter
record_starts = np.concatenate(([0], record_ends[:-1] + 1), dtype=int)
record_names = [lines[start].strip() for start in record_starts]
- return SDFile({
- # Do not include the delimiter
- # -> stop at end (instead of end + 1)
- name: _join_with_terminal_newline(lines[start : end])
- for name, start, end
- in zip(record_names, record_starts, record_ends)
- })
+ return SDFile(
+ {
+ # Do not include the delimiter
+ # -> stop at end (instead of end + 1)
+ name: _join_with_terminal_newline(lines[start:end])
+ for name, start, end in zip(record_names, record_starts, record_ends)
+ }
+ )
def serialize(self):
"""
@@ -776,7 +780,7 @@ def serialize(self):
else:
try:
text_blocks.append(record.serialize())
- except:
+ except Exception:
raise SerializationError(
f"Failed to serialize record '{record_name}'"
)
@@ -835,19 +839,15 @@ def __getitem__(self, key):
# -> must be deserialized first
try:
record = SDRecord.deserialize(record)
- except:
- raise DeserializationError(
- f"Failed to deserialize record '{key}'"
- )
+ except Exception:
+ raise DeserializationError(f"Failed to deserialize record '{key}'")
# Update with deserialized object
self._records[key] = record
return record
def __setitem__(self, key, record):
if not isinstance(record, SDRecord):
- raise TypeError(
- f"Expected 'SDRecord', but got '{type(record).__name__}'"
- )
+ raise TypeError(f"Expected 'SDRecord', but got '{type(record).__name__}'")
# The molecule name in the header is unique across the file
record.header.mol_name = key
self._records[key] = record
@@ -895,22 +895,19 @@ def _to_metadata_key(key):
return Metadata.Key(name=key)
else:
raise TypeError(
- "Expected 'Metadata.Key' or str, "
- f"but got '{type(key).__name__}'"
+ "Expected 'Metadata.Key' or str, " f"but got '{type(key).__name__}'"
)
def _add_key_value_pair(metadata, key, value):
if key is not None:
if value is None:
- raise DeserializationError(
- f"No value found for metadata key {key}"
- )
+ raise DeserializationError(f"No value found for metadata key {key}")
metadata[key] = value
def _get_ctab_stop(lines):
for i in range(_N_HEADER, len(lines)):
if lines[i].startswith("M END"):
- return i+1
- return len(lines)
\ No newline at end of file
+ return i + 1
+ return len(lines)
diff --git a/src/biotite/structure/io/netcdf/__init__.py b/src/biotite/structure/io/netcdf/__init__.py
index 9926d405c..085e0c080 100644
--- a/src/biotite/structure/io/netcdf/__init__.py
+++ b/src/biotite/structure/io/netcdf/__init__.py
@@ -10,4 +10,4 @@
__name__ = "biotite.structure.io.netcdf"
__author__ = "Patrick Kunzmann"
-from .file import *
\ No newline at end of file
+from .file import *
diff --git a/src/biotite/structure/io/netcdf/file.py b/src/biotite/structure/io/netcdf/file.py
index c651657e1..ea0d757c9 100644
--- a/src/biotite/structure/io/netcdf/file.py
+++ b/src/biotite/structure/io/netcdf/file.py
@@ -7,20 +7,21 @@
__all__ = ["NetCDFFile"]
import numpy as np
-from ..trajfile import TrajectoryFile
-from ...box import vectors_from_unitcell, unitcell_from_vectors
+from biotite.structure.box import unitcell_from_vectors, vectors_from_unitcell
+from biotite.structure.io.trajfile import TrajectoryFile
class NetCDFFile(TrajectoryFile):
"""
This file class represents a NetCDF trajectory file.
"""
-
+
@classmethod
def traj_type(cls):
import mdtraj.formats as traj
+
return traj.NetCDFTrajectoryFile
-
+
@classmethod
def process_read_values(cls, read_values):
# .dcd files use Angstrom
@@ -29,35 +30,36 @@ def process_read_values(cls, read_values):
cell_lengths = read_values[2]
cell_angles = read_values[3]
if cell_lengths is None or cell_angles is None:
- box = None
+ box = None
else:
box = np.stack(
- [vectors_from_unitcell(a, b, c, alpha, beta, gamma)
- for (a, b, c), (alpha, beta, gamma)
- in zip(cell_lengths, np.deg2rad(cell_angles))],
- axis=0
+ [
+ vectors_from_unitcell(a, b, c, alpha, beta, gamma)
+ for (a, b, c), (alpha, beta, gamma) in zip(
+ cell_lengths, np.deg2rad(cell_angles)
+ )
+ ],
+ axis=0,
)
return coord, box, time
-
+
@classmethod
def prepare_write_values(cls, coord, box, time):
- coord = coord.astype(np.float32, copy=False) \
- if coord is not None else None
- time = time.astype(np.float32, copy=False) \
- if time is not None else None
+ coord = coord.astype(np.float32, copy=False) if coord is not None else None
+ time = time.astype(np.float32, copy=False) if time is not None else None
if box is None:
cell_lengths = None
- cell_angles = None
+ cell_angles = None
else:
cell_lengths = np.zeros((len(box), 3), dtype=np.float32)
- cell_angles = np.zeros((len(box), 3), dtype=np.float32)
+ cell_angles = np.zeros((len(box), 3), dtype=np.float32)
for i, model_box in enumerate(box):
a, b, c, alpha, beta, gamma = unitcell_from_vectors(model_box)
cell_lengths[i] = np.array((a, b, c))
cell_angles[i] = np.rad2deg((alpha, beta, gamma))
return {
- "coordinates" : coord,
- "time" : time,
- "cell_lengths" : cell_lengths,
- "cell_angles" : cell_angles,
- }
\ No newline at end of file
+ "coordinates": coord,
+ "time": time,
+ "cell_lengths": cell_lengths,
+ "cell_angles": cell_angles,
+ }
diff --git a/src/biotite/structure/io/pdb/__init__.py b/src/biotite/structure/io/pdb/__init__.py
index 1dc97904b..687527d69 100644
--- a/src/biotite/structure/io/pdb/__init__.py
+++ b/src/biotite/structure/io/pdb/__init__.py
@@ -16,5 +16,5 @@
__name__ = "biotite.structure.io.pdb"
__author__ = "Patrick Kunzmann"
+from .convert import *
from .file import *
-from .convert import *
\ No newline at end of file
diff --git a/src/biotite/structure/io/pdb/convert.py b/src/biotite/structure/io/pdb/convert.py
index 7d4bc19dd..127e49fbb 100644
--- a/src/biotite/structure/io/pdb/convert.py
+++ b/src/biotite/structure/io/pdb/convert.py
@@ -9,8 +9,14 @@
__name__ = "biotite.structure.io.pdb"
__author__ = "Patrick Kunzmann"
-__all__ = ["get_model_count", "get_structure", "set_structure",
- "list_assemblies", "get_assembly", "get_symmetry_mates"]
+__all__ = [
+ "get_model_count",
+ "get_structure",
+ "set_structure",
+ "list_assemblies",
+ "get_assembly",
+ "get_symmetry_mates",
+]
def get_model_count(pdb_file):
@@ -30,8 +36,9 @@ def get_model_count(pdb_file):
return pdb_file.get_model_count()
-def get_structure(pdb_file, model=None, altloc="first", extra_fields=[],
- include_bonds=False):
+def get_structure(
+ pdb_file, model=None, altloc="first", extra_fields=[], include_bonds=False
+):
"""
Create an :class:`AtomArray` or :class:`AtomArrayStack` from a
:class:`PDBFile`.
@@ -39,7 +46,7 @@ def get_structure(pdb_file, model=None, altloc="first", extra_fields=[],
This function is a thin wrapper around the :class:`PDBFile` method
:func:`get_structure()` for the sake of consistency with other
``structure.io`` subpackages.
-
+
Parameters
----------
pdb_file : PDBFile
@@ -77,12 +84,12 @@ def get_structure(pdb_file, model=None, altloc="first", extra_fields=[],
(e.g. especially inter-residue bonds),
have :attr:`BondType.ANY`, since the PDB format itself does
not support bond orders.
-
+
Returns
-------
array : AtomArray or AtomArrayStack
The return type depends on the `model` parameter.
-
+
"""
return pdb_file.get_structure(model, altloc, extra_fields, include_bonds)
@@ -95,11 +102,11 @@ def set_structure(pdb_file, array, hybrid36=False):
This function is a thin wrapper around the :class:`PDBFile` method
:func:`set_structure()` for the sake of consistency with other
``structure.io`` subpackages.
-
+
This will save the coordinates, the mandatory annotation categories
and the optional annotation categories
'atom_id', 'b_factor', 'occupancy' and 'charge'.
-
+
Parameters
----------
pdb_file : PDBFile
@@ -137,7 +144,7 @@ def list_assemblies(pdb_file):
-------
assemblies : list of str
A list that contains the available assembly IDs.
-
+
Examples
--------
>>> import os.path
@@ -148,8 +155,14 @@ def list_assemblies(pdb_file):
return pdb_file.list_assemblies()
-def get_assembly(pdb_file, assembly_id=None, model=None, altloc="first",
- extra_fields=[], include_bonds=False):
+def get_assembly(
+ pdb_file,
+ assembly_id=None,
+ model=None,
+ altloc="first",
+ extra_fields=[],
+ include_bonds=False,
+):
"""
Build the given biological assembly.
@@ -205,7 +218,7 @@ def get_assembly(pdb_file, assembly_id=None, model=None, altloc="first",
assembly : AtomArray or AtomArrayStack
The assembly.
The return type depends on the `model` parameter.
-
+
Examples
--------
@@ -218,8 +231,9 @@ def get_assembly(pdb_file, assembly_id=None, model=None, altloc="first",
)
-def get_symmetry_mates(pdb_file, model=None, altloc="first",
- extra_fields=[], include_bonds=False):
+def get_symmetry_mates(
+ pdb_file, model=None, altloc="first", extra_fields=[], include_bonds=False
+):
"""
Build a structure model containing all symmetric copies
of the structure within a single unit cell, given by the space
@@ -274,13 +288,13 @@ def get_symmetry_mates(pdb_file, model=None, altloc="first",
symmetry_mates : AtomArray or AtomArrayStack
All atoms within a single unit cell.
The return type depends on the `model` parameter.
-
+
Notes
-----
To expand the structure beyond a single unit cell, use
:func:`repeat_box()` with the return value as its
input.
-
+
Examples
--------
@@ -288,6 +302,4 @@ def get_symmetry_mates(pdb_file, model=None, altloc="first",
>>> file = PDBFile.read(os.path.join(path_to_structures, "1aki.pdb"))
>>> atoms_in_unit_cell = get_symmetry_mates(file, model=1)
"""
- return pdb_file.get_symmetry_mates(
- model, altloc, extra_fields, include_bonds
- )
\ No newline at end of file
+ return pdb_file.get_symmetry_mates(model, altloc, extra_fields, include_bonds)
diff --git a/src/biotite/structure/io/pdb/file.py b/src/biotite/structure/io/pdb/file.py
index 208f6acfb..6d192dac6 100644
--- a/src/biotite/structure/io/pdb/file.py
+++ b/src/biotite/structure/io/pdb/file.py
@@ -8,20 +8,23 @@
import warnings
import numpy as np
-from ...atoms import AtomArray, AtomArrayStack, repeat
-from ...bonds import BondList, connect_via_residue_names
-from ...box import vectors_from_unitcell, unitcell_from_vectors
-from ....file import TextFile, InvalidFileError
-from ...repair import infer_elements
-from ...error import BadStructureError
-from ...filter import (
+from biotite.file import InvalidFileError, TextFile
+from biotite.structure.atoms import AtomArray, AtomArrayStack, repeat
+from biotite.structure.bonds import BondList, connect_via_residue_names
+from biotite.structure.box import unitcell_from_vectors, vectors_from_unitcell
+from biotite.structure.error import BadStructureError
+from biotite.structure.filter import (
filter_first_altloc,
filter_highest_occupancy_altloc,
filter_solvent,
)
-from ...util import matrix_rotate
-from .hybrid36 import encode_hybrid36, decode_hybrid36, max_hybrid36_number
-
+from biotite.structure.io.pdb.hybrid36 import (
+ decode_hybrid36,
+ encode_hybrid36,
+ max_hybrid36_number,
+)
+from biotite.structure.repair import infer_elements
+from biotite.structure.util import matrix_rotate
_PDB_MAX_ATOMS = 99999
_PDB_MAX_RESIDUES = 9999
@@ -82,6 +85,7 @@ class PDBFile(TextFile):
>>> file.set_structure(array_stack_mod)
>>> file.write(os.path.join(path_to_directory, "1l2y_mod.pdb"))
"""
+
@classmethod
def read(cls, file):
file = super().read(file)
@@ -91,7 +95,6 @@ def read(cls, file):
file._index_models_and_atoms()
return file
-
def get_remark(self, number):
r"""
Get the lines containing the *REMARK* records with the given
@@ -140,7 +143,8 @@ def get_remark(self, number):
remark_string = f"REMARK {number:>3d}"
# Find lines and omit ``REMARK XXX `` part
remark_lines = [
- line[CONTENT_START_COLUMN:] for line in self.lines
+ line[CONTENT_START_COLUMN:]
+ for line in self.lines
if line.startswith(remark_string)
]
if len(remark_lines) == 0:
@@ -149,7 +153,6 @@ def get_remark(self, number):
remark_lines = remark_lines[1:]
return remark_lines
-
def get_model_count(self):
"""
Get the number of models contained in the PDB file.
@@ -161,7 +164,6 @@ def get_model_count(self):
"""
return len(self._model_start_i)
-
def get_coord(self, model=None):
"""
Get only the coordinates from the PDB file.
@@ -239,21 +241,21 @@ def get_coord(self, model=None):
if model is None:
coord = np.zeros(
(len(self._model_start_i), self._get_model_length(), 3),
- dtype=np.float32
+ dtype=np.float32,
)
m = 0
i = 0
for line_i in self._atom_line_i:
if (
- m < len(self._model_start_i)-1
- and line_i > self._model_start_i[m+1]
+ m < len(self._model_start_i) - 1
+ and line_i > self._model_start_i[m + 1]
):
m += 1
i = 0
line = self.lines[line_i]
- coord[m,i,0] = float(line[_coord_x])
- coord[m,i,1] = float(line[_coord_y])
- coord[m,i,2] = float(line[_coord_z])
+ coord[m, i, 0] = float(line[_coord_x])
+ coord[m, i, 1] = float(line[_coord_y])
+ coord[m, i, 2] = float(line[_coord_z])
i += 1
return coord
@@ -262,12 +264,11 @@ def get_coord(self, model=None):
coord = np.zeros((len(coord_i), 3), dtype=np.float32)
for i, line_i in enumerate(coord_i):
line = self.lines[line_i]
- coord[i,0] = float(line[_coord_x])
- coord[i,1] = float(line[_coord_y])
- coord[i,2] = float(line[_coord_z])
+ coord[i, 0] = float(line[_coord_x])
+ coord[i, 1] = float(line[_coord_y])
+ coord[i, 2] = float(line[_coord_z])
return coord
-
def get_b_factor(self, model=None):
"""
Get only the B-factors from the PDB file.
@@ -300,20 +301,19 @@ def get_b_factor(self, model=None):
"""
if model is None:
b_factor = np.zeros(
- (len(self._model_start_i), self._get_model_length()),
- dtype=np.float32
+ (len(self._model_start_i), self._get_model_length()), dtype=np.float32
)
m = 0
i = 0
for line_i in self._atom_line_i:
if (
- m < len(self._model_start_i)-1
- and line_i > self._model_start_i[m+1]
+ m < len(self._model_start_i) - 1
+ and line_i > self._model_start_i[m + 1]
):
m += 1
i = 0
line = self.lines[line_i]
- b_factor[m,i] = float(line[_temp_f])
+ b_factor[m, i] = float(line[_temp_f])
i += 1
return b_factor
@@ -325,9 +325,9 @@ def get_b_factor(self, model=None):
b_factor[i] = float(line[_temp_f])
return b_factor
-
- def get_structure(self, model=None, altloc="first", extra_fields=[],
- include_bonds=False):
+ def get_structure(
+ self, model=None, altloc="first", extra_fields=[], include_bonds=False
+ ):
"""
Get an :class:`AtomArray` or :class:`AtomArrayStack` from the PDB file.
@@ -391,17 +391,17 @@ def get_structure(self, model=None, altloc="first", extra_fields=[],
array = AtomArray(len(coord_i))
# Create mandatory and optional annotation arrays
- chain_id = np.zeros(array.array_length(), array.chain_id.dtype)
- res_id = np.zeros(array.array_length(), array.res_id.dtype)
- ins_code = np.zeros(array.array_length(), array.ins_code.dtype)
- res_name = np.zeros(array.array_length(), array.res_name.dtype)
- hetero = np.zeros(array.array_length(), array.hetero.dtype)
+ chain_id = np.zeros(array.array_length(), array.chain_id.dtype)
+ res_id = np.zeros(array.array_length(), array.res_id.dtype)
+ ins_code = np.zeros(array.array_length(), array.ins_code.dtype)
+ res_name = np.zeros(array.array_length(), array.res_name.dtype)
+ hetero = np.zeros(array.array_length(), array.hetero.dtype)
atom_name = np.zeros(array.array_length(), array.atom_name.dtype)
- element = np.zeros(array.array_length(), array.element.dtype)
+ element = np.zeros(array.array_length(), array.element.dtype)
atom_id_raw = np.zeros(array.array_length(), "U5")
- charge_raw = np.zeros(array.array_length(), "U2")
+ charge_raw = np.zeros(array.array_length(), "U2")
occupancy = np.zeros(array.array_length(), float)
- b_factor = np.zeros(array.array_length(), float)
+ b_factor = np.zeros(array.array_length(), float)
altloc_id = np.zeros(array.array_length(), dtype="U1")
# Fill annotation array
@@ -425,13 +425,11 @@ def get_structure(self, model=None, altloc="first", extra_fields=[],
occupancy[i] = float(line[_occupancy].strip())
b_factor[i] = float(line[_temp_f].strip())
- if include_bonds or \
- (extra_fields is not None and "atom_id" in extra_fields):
- # The atom IDs are only required in these two cases
- atom_id = np.array(
- [decode_hybrid36(raw_id.item()) for raw_id in atom_id_raw],
- dtype=int
- )
+ if include_bonds or (extra_fields is not None and "atom_id" in extra_fields):
+ # The atom IDs are only required in these two cases
+ atom_id = np.array(
+ [decode_hybrid36(raw_id.item()) for raw_id in atom_id_raw], dtype=int
+ )
else:
atom_id = None
@@ -444,16 +442,16 @@ def get_structure(self, model=None, altloc="first", extra_fields=[],
array.atom_name = atom_name
array.element = element
- for field in (extra_fields if extra_fields is not None else []):
+ for field in extra_fields if extra_fields is not None else []:
if field == "atom_id":
# Copy is necessary to avoid double masking in
# later altloc ID filtering
array.set_annotation("atom_id", atom_id.copy())
elif field == "charge":
charge = np.array(charge_raw)
- array.set_annotation("charge", np.where(
- charge == " ", "0", charge
- ).astype(int))
+ array.set_annotation(
+ "charge", np.where(charge == " ", "0", charge).astype(int)
+ )
elif field == "occupancy":
array.set_annotation("occupancy", occupancy)
elif field == "b_factor":
@@ -485,7 +483,10 @@ def get_structure(self, model=None, altloc="first", extra_fields=[],
m = 0
i = 0
for line_i in self._atom_line_i:
- if m < len(self._model_start_i)-1 and line_i > self._model_start_i[m+1]:
+ if (
+ m < len(self._model_start_i) - 1
+ and line_i > self._model_start_i[m + 1]
+ ):
m += 1
i = 0
line = self.lines[line_i]
@@ -506,9 +507,7 @@ def get_structure(self, model=None, altloc="first", extra_fields=[],
alpha = np.deg2rad(float(line[_alpha]))
beta = np.deg2rad(float(line[_beta]))
gamma = np.deg2rad(float(line[_gamma]))
- box = vectors_from_unitcell(
- len_a, len_b, len_c, alpha, beta, gamma
- )
+ box = vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma)
except ValueError:
# File contains invalid 'CRYST1' record
warnings.warn(
@@ -526,9 +525,7 @@ def get_structure(self, model=None, altloc="first", extra_fields=[],
# Filter altloc IDs
if altloc == "occupancy":
- filter = filter_highest_occupancy_altloc(
- array, altloc_id, occupancy
- )
+ filter = filter_highest_occupancy_altloc(array, altloc_id, occupancy)
array = array[..., filter]
atom_id = atom_id[filter] if atom_id is not None else None
elif altloc == "first":
@@ -548,7 +545,6 @@ def get_structure(self, model=None, altloc="first", extra_fields=[],
return array
-
def set_structure(self, array, hybrid36=False):
"""
Set the :class:`AtomArray` or :class:`AtomArrayStack` for the
@@ -596,39 +592,42 @@ def set_structure(self, array, hybrid36=False):
occupancy = np.char.array(np.full(natoms, " 1.00", dtype="U6"))
if "charge" in annot_categories:
charge = np.char.array(
- [str(np.abs(charge)) + "+" if charge > 0 else
- (str(np.abs(charge)) + "-" if charge < 0 else "")
- for charge in array.get_annotation("charge")]
+ [
+ str(np.abs(charge)) + "+"
+ if charge > 0
+ else (str(np.abs(charge)) + "-" if charge < 0 else "")
+ for charge in array.get_annotation("charge")
+ ]
)
else:
charge = np.char.array(np.full(natoms, " ", dtype="U2"))
if hybrid36:
- pdb_atom_id = np.char.array(
- [encode_hybrid36(i, 5) for i in atom_id]
- )
- pdb_res_id = np.char.array(
- [encode_hybrid36(i, 4) for i in array.res_id]
- )
+ pdb_atom_id = np.char.array([encode_hybrid36(i, 5) for i in atom_id])
+ pdb_res_id = np.char.array([encode_hybrid36(i, 4) for i in array.res_id])
else:
# Atom IDs are supported up to 99999,
# but negative IDs are also possible
- pdb_atom_id = np.char.array(np.where(
- atom_id > 0,
- ((atom_id - 1) % _PDB_MAX_ATOMS) + 1,
- atom_id
- ).astype(str))
+ pdb_atom_id = np.char.array(
+ np.where(
+ atom_id > 0, ((atom_id - 1) % _PDB_MAX_ATOMS) + 1, atom_id
+ ).astype(str)
+ )
# Residue IDs are supported up to 9999,
# but negative IDs are also possible
- pdb_res_id = np.char.array(np.where(
- array.res_id > 0,
- ((array.res_id - 1) % _PDB_MAX_RESIDUES) + 1,
- array.res_id
- ).astype(str))
+ pdb_res_id = np.char.array(
+ np.where(
+ array.res_id > 0,
+ ((array.res_id - 1) % _PDB_MAX_RESIDUES) + 1,
+ array.res_id,
+ ).astype(str)
+ )
names = np.char.array(
- [f" {atm}" if len(elem) == 1 and len(atm) < 4 else atm
- for atm, elem in zip(array.atom_name, array.element)]
+ [
+ f" {atm}" if len(elem) == 1 and len(atm) < 4 else atm
+ for atm, elem in zip(array.atom_name, array.element)
+ ]
)
res_names = np.char.array(array.res_name)
chain_ids = np.char.array(array.chain_id)
@@ -637,17 +636,20 @@ def set_structure(self, array, hybrid36=False):
elements = np.char.array(array.element)
first_half = (
- record.ljust(6) +
- pdb_atom_id.rjust(5) +
- spaces +
- names.ljust(4) +
- spaces + res_names.rjust(3) + spaces + chain_ids +
- pdb_res_id.rjust(4) + ins_codes.rjust(1)
+ record.ljust(6)
+ + pdb_atom_id.rjust(5)
+ + spaces
+ + names.ljust(4)
+ + spaces
+ + res_names.rjust(3)
+ + spaces
+ + chain_ids
+ + pdb_res_id.rjust(4)
+ + ins_codes.rjust(1)
)
second_half = (
- occupancy + b_factor + 10 * spaces +
- elements.rjust(2) + charge.rjust(2)
+ occupancy + b_factor + 10 * spaces + elements.rjust(2) + charge.rjust(2)
)
coords = array.coord
@@ -674,9 +676,10 @@ def set_structure(self, array, hybrid36=False):
self.lines.append(f"MODEL {model_num:4}")
# Bundle non-coordinate data to simplify iteration
self.lines.extend(
- [f"{start:27} {x:>8.3f}{y:>8.3f}{z:>8.3f}{end:26}"
- for start, (x, y, z), end in
- zip(first_half, coord_i, second_half)]
+ [
+ f"{start:27} {x:>8.3f}{y:>8.3f}{z:>8.3f}{end:26}"
+ for start, (x, y, z), end in zip(first_half, coord_i, second_half)
+ ]
)
if is_stack:
self.lines.append("ENDMDL")
@@ -688,18 +691,15 @@ def set_structure(self, array, hybrid36=False):
hetero_indices = np.where(array.hetero & ~filter_solvent(array))[0]
bond_array = array.bonds.as_array()
bond_array = bond_array[
- np.isin(bond_array[:,0], hetero_indices) |
- np.isin(bond_array[:,1], hetero_indices) |
- (array.res_id [bond_array[:,0]] != array.res_id [bond_array[:,1]]) |
- (array.chain_id[bond_array[:,0]] != array.chain_id[bond_array[:,1]])
+ np.isin(bond_array[:, 0], hetero_indices)
+ | np.isin(bond_array[:, 1], hetero_indices)
+ | (array.res_id[bond_array[:, 0]] != array.res_id[bond_array[:, 1]])
+ | (array.chain_id[bond_array[:, 0]] != array.chain_id[bond_array[:, 1]])
]
- self._set_bonds(
- BondList(array.array_length(), bond_array), pdb_atom_id
- )
+ self._set_bonds(BondList(array.array_length(), bond_array), pdb_atom_id)
self._index_models_and_atoms()
-
def list_assemblies(self):
"""
List the biological assemblies that are available for the
@@ -727,14 +727,16 @@ def list_assemblies(self):
raise InvalidFileError(
"File does not contain assembly information (REMARK 300)"
)
- return [
- assembly_id.strip()
- for assembly_id in remark_lines[0][12:].split(",")
- ]
-
-
- def get_assembly(self, assembly_id=None, model=None, altloc="first",
- extra_fields=[], include_bonds=False):
+ return [assembly_id.strip() for assembly_id in remark_lines[0][12:].split(",")]
+
+ def get_assembly(
+ self,
+ assembly_id=None,
+ model=None,
+ altloc="first",
+ extra_fields=[],
+ include_bonds=False,
+ ):
"""
Build the given biological assembly.
@@ -829,18 +831,16 @@ def get_assembly(self, assembly_id=None, model=None, altloc="first",
if assembly_start_i is None:
if assembly_id is None:
raise InvalidFileError(
- "File does not contain transformation "
- "expressions for assemblies"
+ "File does not contain transformation " "expressions for assemblies"
)
else:
- raise KeyError(
- f"The assembly ID '{assembly_id}' is not found"
- )
- assembly_lines = remark_lines[assembly_start_i : assembly_stop_i]
+ raise KeyError(f"The assembly ID '{assembly_id}' is not found")
+ assembly_lines = remark_lines[assembly_start_i:assembly_stop_i]
# Get transformations for a set of chains
chain_set_start_indices = [
- i for i, line in enumerate(assembly_lines)
+ i
+ for i, line in enumerate(assembly_lines)
if line.startswith("APPLY THE FOLLOWING TO CHAINS")
]
# Add exclusive stop at end of records
@@ -848,17 +848,17 @@ def get_assembly(self, assembly_id=None, model=None, altloc="first",
assembly = None
for i in range(len(chain_set_start_indices) - 1):
start = chain_set_start_indices[i]
- stop = chain_set_start_indices[i+1]
+ stop = chain_set_start_indices[i + 1]
# Read affected chain IDs from the following line(s)
affected_chain_ids = []
transform_start = None
- for j, line in enumerate(assembly_lines[start : stop]):
- if line.startswith("APPLY THE FOLLOWING TO CHAINS:") or \
- line.startswith(" AND CHAINS:"):
- affected_chain_ids += [
- chain_id.strip()
- for chain_id in line[30:].split(",")
- ]
+ for j, line in enumerate(assembly_lines[start:stop]):
+ if line.startswith("APPLY THE FOLLOWING TO CHAINS:") or line.startswith(
+ " AND CHAINS:"
+ ):
+ affected_chain_ids += [
+ chain_id.strip() for chain_id in line[30:].split(",")
+ ]
else:
# Chain specification has finished
# BIOMT lines start directly after chain specification
@@ -866,11 +866,9 @@ def get_assembly(self, assembly_id=None, model=None, altloc="first",
break
# Parse transformations from BIOMT lines
if transform_start is None:
- raise InvalidFileError(
- "No 'BIOMT' records found for chosen assembly"
- )
+ raise InvalidFileError("No 'BIOMT' records found for chosen assembly")
rotations, translations = _parse_transformations(
- assembly_lines[transform_start : stop]
+ assembly_lines[transform_start:stop]
)
# Filter affected chains
sub_structure = structure[
@@ -888,9 +886,9 @@ def get_assembly(self, assembly_id=None, model=None, altloc="first",
return assembly
-
- def get_symmetry_mates(self, model=None, altloc="first",
- extra_fields=[], include_bonds=False):
+ def get_symmetry_mates(
+ self, model=None, altloc="first", extra_fields=[], include_bonds=False
+ ):
"""
Build a structure model containing all symmetric copies
of the structure within a single unit cell, given by the space
@@ -971,27 +969,15 @@ def get_symmetry_mates(self, model=None, altloc="first",
"File does not contain crystallographic symmetry "
"information (REMARK 350)"
)
- transform_lines = [
- line for line in remark_lines if line.startswith(" SMTRY")
- ]
- rotations, translations = _parse_transformations(
- transform_lines
- )
- return _apply_transformations(
- structure, rotations, translations
- )
-
-
-
+ transform_lines = [line for line in remark_lines if line.startswith(" SMTRY")]
+ rotations, translations = _parse_transformations(transform_lines)
+ return _apply_transformations(structure, rotations, translations)
def _index_models_and_atoms(self):
# Line indices where a new model starts
self._model_start_i = np.array(
- [
- i for i in range(len(self.lines))
- if self.lines[i].startswith(("MODEL"))
- ],
- dtype=int
+ [i for i in range(len(self.lines)) if self.lines[i].startswith(("MODEL"))],
+ dtype=int,
)
if len(self._model_start_i) == 0:
# It could be an empty file or a file with a single model,
@@ -1005,13 +991,13 @@ def _index_models_and_atoms(self):
# Line indices with ATOM or HETATM records
self._atom_line_i = np.array(
[
- i for i in range(len(self.lines))
+ i
+ for i in range(len(self.lines))
if self.lines[i].startswith(("ATOM", "HETATM"))
],
- dtype=int
+ dtype=int,
)
-
def _get_atom_record_indices_for_model(self, model):
last_model = len(self._model_start_i)
if model == 0:
@@ -1020,12 +1006,11 @@ def _get_atom_record_indices_for_model(self, model):
model = last_model + model + 1 if model < 0 else model
if model < last_model:
- line_filter = (
- (self._atom_line_i >= self._model_start_i[model-1]) &
- (self._atom_line_i < self._model_start_i[model ])
+ line_filter = (self._atom_line_i >= self._model_start_i[model - 1]) & (
+ self._atom_line_i < self._model_start_i[model]
)
elif model == last_model:
- line_filter = (self._atom_line_i >= self._model_start_i[model-1])
+ line_filter = self._atom_line_i >= self._model_start_i[model - 1]
else:
raise ValueError(
f"The file has {last_model} models, "
@@ -1033,7 +1018,6 @@ def _get_atom_record_indices_for_model(self, model):
)
return self._atom_line_i[line_filter]
-
def _get_model_length(self):
"""
Determine length of models and check that all models
@@ -1043,11 +1027,13 @@ def _get_model_length(self):
length = None
for model_i in range(len(self._model_start_i)):
model_start = self._model_start_i[model_i]
- model_stop = self._model_start_i[model_i+1] \
- if model_i+1 < n_models else len(self.lines)
+ model_stop = (
+ self._model_start_i[model_i + 1]
+ if model_i + 1 < n_models
+ else len(self.lines)
+ )
model_length = np.count_nonzero(
- (self._atom_line_i >= model_start) &
- (self._atom_line_i < model_stop)
+ (self._atom_line_i >= model_start) & (self._atom_line_i < model_stop)
)
if length is None:
length = model_length
@@ -1058,26 +1044,22 @@ def _get_model_length(self):
)
return length
-
def _get_bonds(self, atom_ids):
- conect_lines = [line for line in self.lines
- if line.startswith("CONECT")]
+ conect_lines = [line for line in self.lines if line.startswith("CONECT")]
# Mapping from atom ids to indices in an AtomArray
- atom_id_to_index = np.zeros(atom_ids[-1]+1, dtype=int)
+ atom_id_to_index = np.zeros(atom_ids[-1] + 1, dtype=int)
try:
for i, id in enumerate(atom_ids):
atom_id_to_index[id] = i
except IndexError as e:
- raise InvalidFileError(
- "Atom IDs are not strictly increasing"
- ) from e
+ raise InvalidFileError("Atom IDs are not strictly increasing") from e
bonds = []
for line in conect_lines:
- center_id = atom_id_to_index[decode_hybrid36(line[6 : 11])]
+ center_id = atom_id_to_index[decode_hybrid36(line[6:11])]
for i in range(11, 31, 5):
- id_string = line[i : i+5]
+ id_string = line[i : i + 5]
try:
id = atom_id_to_index[decode_hybrid36(id_string)]
except ValueError:
@@ -1089,7 +1071,6 @@ def _get_bonds(self, atom_ids):
# is equal to the length of the AtomArray
return BondList(len(atom_ids), np.array(bonds, dtype=np.uint32))
-
def _set_bonds(self, bond_list, atom_ids):
# Bond type is unused since PDB does not support bond orders
bonds, _ = bond_list.get_all_bonds()
@@ -1136,9 +1117,7 @@ def _parse_transformations(lines):
# transformation index) are not used
transformations = [float(e) for e in line.split()[2:]]
if len(transformations) != 4:
- raise InvalidFileError(
- "Invalid number of transformation vector elements"
- )
+ raise InvalidFileError("Invalid number of transformation vector elements")
rotations[transformation_i, component_i, :] = transformations[:3]
translations[transformation_i, component_i] = transformations[3]
@@ -1237,4 +1216,4 @@ def _number_of_integer_digits(values):
n_digits = 0
n_digits = max(n_digits, len(str(np.min(values))))
n_digits = max(n_digits, len(str(np.max(values))))
- return n_digits
\ No newline at end of file
+ return n_digits
diff --git a/src/biotite/structure/io/pdbqt/__init__.py b/src/biotite/structure/io/pdbqt/__init__.py
index 6c406636a..ea81ca4fc 100644
--- a/src/biotite/structure/io/pdbqt/__init__.py
+++ b/src/biotite/structure/io/pdbqt/__init__.py
@@ -11,5 +11,5 @@
__name__ = "biotite.structure.io.pdbqt"
__author__ = "Patrick Kunzmann"
+from .convert import *
from .file import *
-from .convert import *
\ No newline at end of file
diff --git a/src/biotite/structure/io/pdbqt/convert.py b/src/biotite/structure/io/pdbqt/convert.py
index ee335ccc6..051339c4f 100644
--- a/src/biotite/structure/io/pdbqt/convert.py
+++ b/src/biotite/structure/io/pdbqt/convert.py
@@ -18,7 +18,7 @@ def get_structure(pdbqt_file, model=None):
PDBQT file.
EXPERIMENTAL: Future API changes are probable.
-
+
Parameters
----------
pdbqt_file : PDBQTFile
@@ -32,7 +32,7 @@ def get_structure(pdbqt_file, model=None):
If this parameter is omitted, an :class:`AtomArrayStack`
containing all models will be returned, even if the
structure contains only one model.
-
+
Returns
-------
array : AtomArray or AtomArrayStack
@@ -41,13 +41,20 @@ def get_structure(pdbqt_file, model=None):
return pdbqt_file.get_structure(model)
-def set_structure(pdbqt_file, atoms, charges=None, atom_types=None,
- rotatable_bonds=None, root=None, include_torsdof=True):
+def set_structure(
+ pdbqt_file,
+ atoms,
+ charges=None,
+ atom_types=None,
+ rotatable_bonds=None,
+ root=None,
+ include_torsdof=True,
+):
"""
Write an :class:`AtomArray` into a PDBQT file.
EXPERIMENTAL: Future API changes are probable.
-
+
Parameters
----------
pdbqt_file : PDBQTFile
@@ -71,7 +78,7 @@ def set_structure(pdbqt_file, atoms, charges=None, atom_types=None,
be written.
- ``'rigid'`` - The molecule is handled as rigid ligand:
Only a ``ROOT`` line will be written.
- - ``'all'`` - The molecule is handled as flexible
+ - ``'all'`` - The molecule is handled as flexible
ligand:
A ``ROOT`` line will be written and all rotatable
bonds are included using ``BRANCH`` and ``ENDBRANCH``
@@ -81,7 +88,7 @@ def set_structure(pdbqt_file, atoms, charges=None, atom_types=None,
A ``ROOT`` line will be written and all bonds in the
given :class:`BondList` are considered flexible via
``BRANCH`` and ``ENDBRANCH`` lines.
-
+
root : int, optional
Specifies the index of the atom following the ``ROOT`` line.
Setting the root atom is useful for specifying the *anchor*
@@ -93,7 +100,7 @@ def set_structure(pdbqt_file, atoms, charges=None, atom_types=None,
By default, a ``TORSDOF`` (torsional degrees of freedom)
record is written at the end of the file.
By setting this parameter to false, the record is omitted.
-
+
Returns
-------
mask : ndarray, shape=(n,), dtype=bool
@@ -102,6 +109,5 @@ def set_structure(pdbqt_file, atoms, charges=None, atom_types=None,
hydrogen.
"""
return pdbqt_file.set_structure(
- atoms, charges, atom_types, rotatable_bonds, root,
- include_torsdof
- )
\ No newline at end of file
+ atoms, charges, atom_types, rotatable_bonds, root, include_torsdof
+ )
diff --git a/src/biotite/structure/io/pdbqt/file.py b/src/biotite/structure/io/pdbqt/file.py
index 271d4bc69..21f883c0a 100644
--- a/src/biotite/structure/io/pdbqt/file.py
+++ b/src/biotite/structure/io/pdbqt/file.py
@@ -8,17 +8,33 @@
import warnings
import numpy as np
-from ....file import TextFile, InvalidFileError
-from ...error import BadStructureError
-from ...atoms import AtomArray, AtomArrayStack
-from ...charges import partial_charges
-from ...bonds import BondList, BondType, find_connected, find_rotatable_bonds
-
+from biotite.file import InvalidFileError, TextFile
+from biotite.structure.atoms import AtomArray, AtomArrayStack
+from biotite.structure.bonds import (
+ BondList,
+ BondType,
+ find_connected,
+ find_rotatable_bonds,
+)
+from biotite.structure.charges import partial_charges
+from biotite.structure.error import BadStructureError
PARAMETRIZED_ELEMENTS = [
- "H", "C", "N", "O", "P", "S",
- "F", "Cl", "Br", "I",
- "Mg", "Ca", "Mn", "Fe", "Zn"
+ "H",
+ "C",
+ "N",
+ "O",
+ "P",
+ "S",
+ "F",
+ "Cl",
+ "Br",
+ "I",
+ "Mg",
+ "Ca",
+ "Mn",
+ "Fe",
+ "Zn",
]
@@ -116,13 +132,15 @@ def get_remarks(self, model=None):
``'REMARKS'``.
"""
# Line indices where a new model starts
- model_start_i = np.array([i for i in range(len(self.lines))
- if self.lines[i].startswith(("MODEL"))],
- dtype=int)
+ model_start_i = np.array(
+ [i for i in range(len(self.lines)) if self.lines[i].startswith(("MODEL"))],
+ dtype=int,
+ )
# Line indices with ATOM or HETATM records
- remark_line_i = np.array([i for i in range(len(self.lines)) if
- self.lines[i].startswith("REMARK")],
- dtype=int)
+ remark_line_i = np.array(
+ [i for i in range(len(self.lines)) if self.lines[i].startswith("REMARK")],
+ dtype=int,
+ )
# Structures containing only one model may omit MODEL record
# In these cases model starting index is set to 0
if len(model_start_i) == 0:
@@ -131,11 +149,10 @@ def get_remarks(self, model=None):
if model is None:
# Add exclusive end of file
model_start_i = np.concatenate((model_start_i, [len(self.lines)]))
- model_i = 0
remarks = []
for i in range(len(model_start_i) - 1):
start = model_start_i[i]
- stop = model_start_i[i+1]
+ stop = model_start_i[i + 1]
model_remark_line_i = remark_line_i[
(remark_line_i >= start) & (remark_line_i < stop)
]
@@ -152,10 +169,11 @@ def get_remarks(self, model=None):
model = last_model + model + 1 if model < 0 else model
if model < last_model:
- line_filter = ( ( remark_line_i >= model_start_i[model-1] ) &
- ( remark_line_i < model_start_i[model ] ) )
+ line_filter = (remark_line_i >= model_start_i[model - 1]) & (
+ remark_line_i < model_start_i[model]
+ )
elif model == last_model:
- line_filter = (remark_line_i >= model_start_i[model-1])
+ line_filter = remark_line_i >= model_start_i[model - 1]
else:
raise ValueError(
f"The file has {last_model} models, "
@@ -166,7 +184,6 @@ def get_remarks(self, model=None):
# Do not include 'REMARK ' itself -> begin from pos 8
return "\n".join([self.lines[i][7:] for i in remark_line_i])
-
def get_structure(self, model=None):
"""
Get an :class:`AtomArray` or :class:`AtomArrayStack` from the
@@ -190,13 +207,19 @@ def get_structure(self, model=None):
The return type depends on the `model` parameter.
"""
# Line indices where a new model starts
- model_start_i = np.array([i for i in range(len(self.lines))
- if self.lines[i].startswith(("MODEL"))],
- dtype=int)
+ model_start_i = np.array(
+ [i for i in range(len(self.lines)) if self.lines[i].startswith(("MODEL"))],
+ dtype=int,
+ )
# Line indices with ATOM or HETATM records
- atom_line_i = np.array([i for i in range(len(self.lines)) if
- self.lines[i].startswith(("ATOM", "HETATM"))],
- dtype=int)
+ atom_line_i = np.array(
+ [
+ i
+ for i in range(len(self.lines))
+ if self.lines[i].startswith(("ATOM", "HETATM"))
+ ],
+ dtype=int,
+ )
# Structures containing only one model may omit MODEL record
# In these cases model starting index is set to 0
if len(model_start_i) == 0:
@@ -224,10 +247,11 @@ def get_structure(self, model=None):
model = last_model + model + 1 if model < 0 else model
if model < last_model:
- line_filter = ( ( atom_line_i >= model_start_i[model-1] ) &
- ( atom_line_i < model_start_i[model ] ) )
+ line_filter = (atom_line_i >= model_start_i[model - 1]) & (
+ atom_line_i < model_start_i[model]
+ )
elif model == last_model:
- line_filter = (atom_line_i >= model_start_i[model-1])
+ line_filter = atom_line_i >= model_start_i[model - 1]
else:
raise ValueError(
f"The file has {last_model} models, "
@@ -237,16 +261,16 @@ def get_structure(self, model=None):
array = AtomArray(len(coord_i))
# Save atom IDs for later sorting into the original atom order
- atom_id = np.zeros(array.array_length(), int)
+ atom_id = np.zeros(array.array_length(), int)
# Create annotation arrays
- chain_id = np.zeros(array.array_length(), array.chain_id.dtype)
- res_id = np.zeros(array.array_length(), array.res_id.dtype)
- ins_code = np.zeros(array.array_length(), array.ins_code.dtype)
- res_name = np.zeros(array.array_length(), array.res_name.dtype)
- hetero = np.zeros(array.array_length(), array.hetero.dtype)
+ chain_id = np.zeros(array.array_length(), array.chain_id.dtype)
+ res_id = np.zeros(array.array_length(), array.res_id.dtype)
+ ins_code = np.zeros(array.array_length(), array.ins_code.dtype)
+ res_name = np.zeros(array.array_length(), array.res_name.dtype)
+ hetero = np.zeros(array.array_length(), array.hetero.dtype)
atom_name = np.zeros(array.array_length(), array.atom_name.dtype)
- element = np.zeros(array.array_length(), array.element.dtype)
+ element = np.zeros(array.array_length(), array.element.dtype)
# Fill annotation array
# i is index in array, line_i is line index
@@ -258,7 +282,7 @@ def get_structure(self, model=None):
res_id[i] = int(line[22:26])
ins_code[i] = line[26].strip()
res_name[i] = line[17:20].strip()
- hetero[i] = (False if line[0:4] == "ATOM" else True)
+ hetero[i] = line[0:4] != "ATOM"
atom_name[i] = line[12:16].strip()
element[i] = line[76:78].strip()
@@ -275,21 +299,21 @@ def get_structure(self, model=None):
if isinstance(array, AtomArray):
for i, line_i in enumerate(coord_i):
line = self.lines[line_i]
- array.coord[i,0] = float(line[30:38])
- array.coord[i,1] = float(line[38:46])
- array.coord[i,2] = float(line[46:54])
+ array.coord[i, 0] = float(line[30:38])
+ array.coord[i, 1] = float(line[38:46])
+ array.coord[i, 2] = float(line[46:54])
elif isinstance(array, AtomArrayStack):
m = 0
i = 0
for line_i in atom_line_i:
- if m < len(model_start_i)-1 and line_i > model_start_i[m+1]:
+ if m < len(model_start_i) - 1 and line_i > model_start_i[m + 1]:
m += 1
i = 0
line = self.lines[line_i]
- array.coord[m,i,0] = float(line[30:38])
- array.coord[m,i,1] = float(line[38:46])
- array.coord[m,i,2] = float(line[46:54])
+ array.coord[m, i, 0] = float(line[30:38])
+ array.coord[m, i, 1] = float(line[38:46])
+ array.coord[m, i, 2] = float(line[46:54])
i += 1
# Sort into the original atom order
@@ -297,9 +321,15 @@ def get_structure(self, model=None):
return array
-
- def set_structure(self, atoms, charges=None, atom_types=None,
- rotatable_bonds=None, root=None, include_torsdof=True):
+ def set_structure(
+ self,
+ atoms,
+ charges=None,
+ atom_types=None,
+ rotatable_bonds=None,
+ root=None,
+ include_torsdof=True,
+ ):
"""
Write an :class:`AtomArray` into the PDBQT file.
@@ -394,12 +424,8 @@ def set_structure(self, atoms, charges=None, atom_types=None,
use_root = True
else:
if rotatable_bonds.ndim != 2 or rotatable_bonds.shape[1] != 2:
- raise ValueError(
- "An (nx2) array is expected for rotatable bonds"
- )
- rotatable_bonds = BondList(
- len(mask), np.asarray(rotatable_bonds)
- )[mask]
+ raise ValueError("An (nx2) array is expected for rotatable bonds")
+ rotatable_bonds = BondList(len(mask), np.asarray(rotatable_bonds))[mask]
use_root = True
if root is None:
@@ -426,35 +452,51 @@ def set_structure(self, atoms, charges=None, atom_types=None,
# for simple branch determination in '_write_atoms()'
atoms.bonds.remove_bonds(rotatable_bonds)
- hetero = ["ATOM" if e == False else "HETATM" for e in atoms.hetero]
+ hetero = ["HETATM" if e else "ATOM" for e in atoms.hetero]
if "atom_id" in atoms.get_annotation_categories():
atom_id = atoms.atom_id
else:
- atom_id = np.arange(1, atoms.array_length()+1)
+ atom_id = np.arange(1, atoms.array_length() + 1)
occupancy = np.ones(atoms.array_length())
b_factor = np.zeros(atoms.array_length())
# Convert rotatable bonds into array for easier handling
# The bond type is irrelevant from this point on
- rotatable_bonds = rotatable_bonds.as_array()[:,:2]
+ rotatable_bonds = rotatable_bonds.as_array()[:, :2]
self.lines = []
self._write_atoms(
- atoms, charges, types,
- atom_id, hetero, occupancy, b_factor,
- root_index, rotatable_bonds,
- np.zeros(len(rotatable_bonds), dtype=bool), use_root
+ atoms,
+ charges,
+ types,
+ atom_id,
+ hetero,
+ occupancy,
+ b_factor,
+ root_index,
+ rotatable_bonds,
+ np.zeros(len(rotatable_bonds), dtype=bool),
+ use_root,
)
if include_torsdof:
self.lines.append(f"TORSDOF {len(rotatable_bonds)}")
return mask
-
- def _write_atoms(self, atoms, charges, types,
- atom_id, hetero, occupancy, b_factor,
- root_atom, rotatable_bonds, visited_rotatable_bonds,
- is_root):
+ def _write_atoms(
+ self,
+ atoms,
+ charges,
+ types,
+ atom_id,
+ hetero,
+ occupancy,
+ b_factor,
+ root_atom,
+ rotatable_bonds,
+ visited_rotatable_bonds,
+ is_root,
+ ):
if len(rotatable_bonds) != 0:
# Get the indices to atoms of this branch, i.e. a group of
# atoms that are connected by non-rotatable bonds
@@ -465,9 +507,7 @@ def _write_atoms(self, atoms, charges, types,
# the rotatable bond should always be listed first
# -> Remove root atom and insert it at the beginning
this_branch_indices = np.insert(
- this_branch_indices[this_branch_indices != root_atom],
- 0,
- root_atom
+ this_branch_indices[this_branch_indices != root_atom], 0, root_atom
)
else:
# No rotatable bonds
@@ -525,18 +565,24 @@ def _write_atoms(self, atoms, charges, types,
f"BRANCH {atom_id[this_br_i]:>3d} {atom_id[new_br_i]:>3d}"
)
self._write_atoms(
- atoms, charges, types,
- atom_id, hetero, occupancy, b_factor,
+ atoms,
+ charges,
+ types,
+ atom_id,
+ hetero,
+ occupancy,
+ b_factor,
# The root atom of the branch
- #is the other atom of the rotatable bond
- new_br_i, rotatable_bonds, visited_rotatable_bonds,
- False
+ # is the other atom of the rotatable bond
+ new_br_i,
+ rotatable_bonds,
+ visited_rotatable_bonds,
+ False,
)
self.lines.append(
f"ENDBRANCH {atom_id[this_br_i]:>3d} {atom_id[new_br_i]:>3d}"
)
-
def _get_model_length(self, model_start_i, atom_line_i):
"""
Determine length of models and check that all models
@@ -546,8 +592,11 @@ def _get_model_length(self, model_start_i, atom_line_i):
length = None
for model_i in range(len(model_start_i)):
model_start = model_start_i[model_i]
- model_stop = model_start_i[model_i+1] if model_i+1 < n_models \
- else len(self.lines)
+ model_stop = (
+ model_start_i[model_i + 1]
+ if model_i + 1 < n_models
+ else len(self.lines)
+ )
model_length = np.count_nonzero(
(atom_line_i >= model_start) & (atom_line_i < model_stop)
)
@@ -613,8 +662,7 @@ def convert_atoms(atoms, charges):
)
elif element == "C":
if np.isin(
- all_bond_types[i],
- [BondType.AROMATIC_SINGLE, BondType.AROMATIC_DOUBLE]
+ all_bond_types[i], [BondType.AROMATIC_SINGLE, BondType.AROMATIC_DOUBLE]
).any():
# Aromatic carbon
atom_types[i] = "A"
@@ -637,4 +685,4 @@ def convert_atoms(atoms, charges):
atom_types[i] = "H"
mask = ~hydrogen_removal_mask
- return atoms[mask], charges[mask], atom_types[mask], mask
\ No newline at end of file
+ return atoms[mask], charges[mask], atom_types[mask], mask
diff --git a/src/biotite/structure/io/pdbx/__init__.py b/src/biotite/structure/io/pdbx/__init__.py
index ccad4ca21..0b3714b48 100644
--- a/src/biotite/structure/io/pdbx/__init__.py
+++ b/src/biotite/structure/io/pdbx/__init__.py
@@ -15,8 +15,8 @@
__name__ = "biotite.structure.io.pdbx"
__author__ = "Patrick Kunzmann"
-from .convert import *
from .bcif import *
from .cif import *
from .component import *
-from .encoding import *
\ No newline at end of file
+from .convert import *
+from .encoding import *
diff --git a/src/biotite/structure/io/pdbx/bcif.py b/src/biotite/structure/io/pdbx/bcif.py
index 4b9331ff6..4f3aef3a5 100644
--- a/src/biotite/structure/io/pdbx/bcif.py
+++ b/src/biotite/structure/io/pdbx/bcif.py
@@ -4,16 +4,29 @@
__name__ = "biotite.structure.io.pdbx"
__author__ = "Patrick Kunzmann"
-__all__ = ["BinaryCIFFile", "BinaryCIFBlock", "BinaryCIFCategory",
- "BinaryCIFColumn", "BinaryCIFData"]
+__all__ = [
+ "BinaryCIFFile",
+ "BinaryCIFBlock",
+ "BinaryCIFCategory",
+ "BinaryCIFColumn",
+ "BinaryCIFData",
+]
from collections.abc import Sequence
-import numpy as np
import msgpack
-from .component import _Component, _HierarchicalContainer, MaskValue
-from .encoding import decode_stepwise, encode_stepwise, deserialize_encoding, \
- create_uncompressed_encoding
-from ....file import File, is_binary, is_open_compatible, SerializationError
+import numpy as np
+from biotite.file import File, SerializationError, is_binary, is_open_compatible
+from biotite.structure.io.pdbx.component import (
+ MaskValue,
+ _Component,
+ _HierarchicalContainer,
+)
+from biotite.structure.io.pdbx.encoding import (
+ create_uncompressed_encoding,
+ decode_stepwise,
+ deserialize_encoding,
+ encode_stepwise,
+)
class BinaryCIFData(_Component):
@@ -74,10 +87,7 @@ class BinaryCIFData(_Component):
"""
def __init__(self, array, encoding=None):
- if (
- not isinstance(array, (Sequence, np.ndarray))
- or isinstance(array, str)
- ):
+ if not isinstance(array, (Sequence, np.ndarray)) or isinstance(array, str):
array = [array]
array = np.asarray(array)
if np.issubdtype(array.dtype, np.object_):
@@ -107,19 +117,13 @@ def supercomponent_class():
@staticmethod
def deserialize(content):
- encoding = [
- deserialize_encoding(enc) for enc in content["encoding"]
- ]
- return BinaryCIFData(
- decode_stepwise(content["data"], encoding), encoding
- )
+ encoding = [deserialize_encoding(enc) for enc in content["encoding"]]
+ return BinaryCIFData(decode_stepwise(content["data"], encoding), encoding)
def serialize(self):
serialized_data = encode_stepwise(self._array, self._encoding)
if not isinstance(serialized_data, bytes):
- raise SerializationError(
- "Final encoding must return 'bytes'"
- )
+ raise SerializationError("Final encoding must return 'bytes'")
serialized_encoding = [enc.serialize() for enc in self._encoding]
return {"data": serialized_data, "encoding": serialized_encoding}
@@ -190,8 +194,7 @@ def __init__(self, data, mask=None):
mask = BinaryCIFData(mask)
if len(data) != len(mask):
raise IndexError(
- f"Data has length {len(data)}, "
- f"but mask has length {len(mask)}"
+ f"Data has length {len(data)}, but mask has length {len(mask)}"
)
self._data = data
self._mask = mask
@@ -290,9 +293,7 @@ def as_array(self, dtype=None, masked_value=None):
array = np.full(len(self._data), masked_value, dtype=dtype)
present_mask = self._mask.array == MaskValue.PRESENT
- array[present_mask] = (
- self._data.array[present_mask].astype(dtype)
- )
+ array[present_mask] = self._data.array[present_mask].astype(dtype)
return array
@staticmethod
@@ -300,13 +301,14 @@ def deserialize(content):
return BinaryCIFColumn(
BinaryCIFData.deserialize(content["data"]),
BinaryCIFData.deserialize(content["mask"])
- if content["mask"] is not None else None
+ if content["mask"] is not None
+ else None,
)
def serialize(self):
return {
"data": self._data.serialize(),
- "mask": self._mask.serialize() if self._mask is not None else None
+ "mask": self._mask.serialize() if self._mask is not None else None,
}
def __len__(self):
@@ -392,10 +394,8 @@ def supercomponent_class():
@staticmethod
def deserialize(content):
return BinaryCIFCategory(
- BinaryCIFCategory._deserialize_elements(
- content["columns"], "name"
- ),
- content["rowCount"]
+ BinaryCIFCategory._deserialize_elements(content["columns"], "name"),
+ content["rowCount"],
)
def serialize(self):
@@ -470,9 +470,7 @@ def supercomponent_class():
@staticmethod
def deserialize(content):
return BinaryCIFBlock(
- BinaryCIFBlock._deserialize_elements(
- content["categories"], "name"
- )
+ BinaryCIFBlock._deserialize_elements(content["categories"], "name")
)
def serialize(self):
@@ -559,16 +557,14 @@ def supercomponent_class():
@staticmethod
def deserialize(content):
return BinaryCIFFile(
- BinaryCIFFile._deserialize_elements(
- content["dataBlocks"], "header"
- )
+ BinaryCIFFile._deserialize_elements(content["dataBlocks"], "header")
)
def serialize(self):
return {"dataBlocks": self._serialize_elements("header")}
@classmethod
- def read(self, file):
+ def read(cls, file):
"""
Read a *BinaryCIF* file.
@@ -587,18 +583,14 @@ def read(self, file):
if is_open_compatible(file):
with open(file, "rb") as f:
return BinaryCIFFile.deserialize(
- msgpack.unpackb(
- f.read(), use_list=True, raw=False
- )
+ msgpack.unpackb(f.read(), use_list=True, raw=False)
)
# File object
else:
if not is_binary(file):
raise TypeError("A file opened in 'binary' mode is required")
return BinaryCIFFile.deserialize(
- msgpack.unpackb(
- file.read(), use_list=True, raw=False
- )
+ msgpack.unpackb(file.read(), use_list=True, raw=False)
)
def write(self, file):
diff --git a/src/biotite/structure/io/pdbx/cif.py b/src/biotite/structure/io/pdbx/cif.py
index 6ca9bfe66..25cd91387 100644
--- a/src/biotite/structure/io/pdbx/cif.py
+++ b/src/biotite/structure/io/pdbx/cif.py
@@ -10,10 +10,14 @@
import shlex
from collections.abc import MutableMapping, Sequence
import numpy as np
-from .component import _Component, MaskValue
-from ....file import File, is_open_compatible, is_text, DeserializationError, \
- SerializationError
-
+from biotite.file import (
+ DeserializationError,
+ File,
+ SerializationError,
+ is_open_compatible,
+ is_text,
+)
+from biotite.structure.io.pdbx.component import MaskValue, _Component
UNICODE_CHAR_SIZE = 4
@@ -133,9 +137,7 @@ def __init__(self, data, mask=None):
if not isinstance(data, CIFData):
data = CIFData(data, str)
if mask is None:
- mask = np.full(
- len(data), MaskValue.PRESENT, dtype=np.uint8
- )
+ mask = np.full(len(data), MaskValue.PRESENT, dtype=np.uint8)
mask[data.array == "."] = MaskValue.INAPPLICABLE
mask[data.array == "?"] = MaskValue.MISSING
if np.all(mask == MaskValue.PRESENT):
@@ -148,8 +150,7 @@ def __init__(self, data, mask=None):
mask = CIFData(mask, np.uint8)
if len(mask) != len(data):
raise IndexError(
- f"Data has length {len(data)}, "
- f"but mask has length {len(mask)}"
+ f"Data has length {len(data)}, but mask has length {len(mask)}"
)
self._data = data
self._mask = mask
@@ -222,9 +223,7 @@ def as_array(self, dtype=str, masked_value=None):
elif np.issubdtype(dtype, np.str_):
# Limit float precision to 3 decimals
if np.issubdtype(self._data.array.dtype, np.floating):
- array = np.array(
- [f"{e:.3f}" for e in self._data.array], type=dtype
- )
+ array = np.array([f"{e:.3f}" for e in self._data.array], dtype=dtype)
else:
# Copy, as otherwise original data would be overwritten
# with mask values
@@ -247,9 +246,7 @@ def as_array(self, dtype=str, masked_value=None):
array = np.full(len(self._data), masked_value, dtype=dtype)
present_mask = self._mask.array == MaskValue.PRESENT
- array[present_mask] = (
- self._data.array[present_mask].astype(dtype)
- )
+ array[present_mask] = self._data.array[present_mask].astype(dtype)
return array
def __len__(self):
@@ -361,9 +358,7 @@ def supercomponent_class():
@staticmethod
def deserialize(text, expect_whitespace=True):
- lines = [
- line.strip() for line in text.splitlines() if not _is_empty(line)
- ]
+ lines = [line.strip() for line in text.splitlines() if not _is_empty(line)]
if _is_loop_start(lines[0]):
is_looped = True
@@ -373,15 +368,11 @@ def deserialize(text, expect_whitespace=True):
category_name = _parse_category_name(lines[0])
if category_name is None:
- raise DeserializationError(
- "Failed to parse category name"
- )
+ raise DeserializationError("Failed to parse category name")
lines = _to_single(lines, is_looped)
if is_looped:
- category_dict = CIFCategory._deserialize_looped(
- lines, expect_whitespace
- )
+ category_dict = CIFCategory._deserialize_looped(lines, expect_whitespace)
else:
category_dict = CIFCategory._deserialize_single(lines)
return CIFCategory(category_dict, category_name)
@@ -511,27 +502,21 @@ def _serialize_single(self):
]
def _serialize_looped(self):
- key_lines = [
- "_" + self._name + "." + key + " "
- for key in self.keys()
- ]
+ key_lines = ["_" + self._name + "." + key + " " for key in self.keys()]
column_arrays = []
for column in self.values():
array = column.as_array(str)
# Quote before measuring the number of chars,
# as the quote characters modify the length
- array = np.array(
- [_multiline(_quote(element)) for element in array]
- )
+ array = np.array([_multiline(_quote(element)) for element in array])
column_arrays.append(array)
# Number of characters the longest string in the column needs
# This can be deduced from the dtype
# The "+1" is for the small whitespace column
column_n_chars = [
- array.dtype.itemsize // UNICODE_CHAR_SIZE + 1
- for array in column_arrays
+ array.dtype.itemsize // UNICODE_CHAR_SIZE + 1 for array in column_arrays
]
value_lines = [""] * self._row_count
for i in range(self._row_count):
@@ -615,15 +600,11 @@ def deserialize(text):
if is_loop_in_line:
# In case of lines with "loop_" the category is
# in the next line
- category_name_in_line = _parse_category_name(
- lines[i + 1]
- )
+ category_name_in_line = _parse_category_name(lines[i + 1])
current_category_name = category_name_in_line
category_starts.append(i)
category_names.append(current_category_name)
- return CIFBlock(_create_element_dict(
- lines, category_names, category_starts
- ))
+ return CIFBlock(_create_element_dict(lines, category_names, category_starts))
def serialize(self):
text_blocks = []
@@ -635,7 +616,7 @@ def serialize(self):
try:
category.name = category_name
text_blocks.append(category.serialize())
- except:
+ except Exception:
raise SerializationError(
f"Failed to serialize category '{category_name}'"
)
@@ -658,10 +639,8 @@ def __getitem__(self, key):
else:
expect_whitespace = True
category = CIFCategory.deserialize(category, expect_whitespace)
- except:
- raise DeserializationError(
- f"Failed to deserialize category '{key}'"
- )
+ except Exception:
+ raise DeserializationError(f"Failed to deserialize category '{key}'")
# Update with deserialized object
self._categories[key] = category
return category
@@ -809,7 +788,7 @@ def serialize(self):
else:
try:
text_blocks.append(block.serialize())
- except:
+ except Exception:
raise SerializationError(
f"Failed to serialize block '{block_name}'"
)
@@ -869,19 +848,15 @@ def __getitem__(self, key):
# -> must be deserialized first
try:
block = CIFBlock.deserialize(block)
- except:
- raise DeserializationError(
- f"Failed to deserialize block '{key}'"
- )
+ except Exception:
+ raise DeserializationError(f"Failed to deserialize block '{key}'")
# Update with deserialized object
self._blocks[key] = block
return block
def __setitem__(self, key, block):
if not isinstance(block, CIFBlock):
- raise TypeError(
- f"Expected 'CIFBlock', but got '{type(block).__name__}'"
- )
+ raise TypeError(f"Expected 'CIFBlock', but got '{type(block).__name__}'")
self._blocks[key] = block
def __delitem__(self, key):
@@ -919,7 +894,7 @@ def _create_element_dict(lines, element_names, element_starts):
# Lazy deserialization
# -> keep as text for now and deserialize later if needed
return {
- element_name: "\n".join(lines[element_starts[i] : element_starts[i+1]])
+ element_name: "\n".join(lines[element_starts[i] : element_starts[i + 1]])
for i, element_name in enumerate(element_names)
}
diff --git a/src/biotite/structure/io/pdbx/component.py b/src/biotite/structure/io/pdbx/component.py
index 76eb0c8da..fb2f228ed 100644
--- a/src/biotite/structure/io/pdbx/component.py
+++ b/src/biotite/structure/io/pdbx/component.py
@@ -11,10 +11,10 @@
__author__ = "Patrick Kunzmann"
__all__ = ["MaskValue"]
-from enum import IntEnum
from abc import ABCMeta, abstractmethod
from collections.abc import MutableMapping
-from ....file import SerializationError, DeserializationError
+from enum import IntEnum
+from biotite.file import DeserializationError, SerializationError
class MaskValue(IntEnum):
@@ -29,6 +29,7 @@ class MaskValue(IntEnum):
- `MISSING` : For this row the value is missing or unknown
(``?`` in *CIF*).
"""
+
PRESENT = 0
INAPPLICABLE = 1
MISSING = 2
@@ -109,8 +110,7 @@ def __str__(self):
return str(self.serialize())
-class _HierarchicalContainer(_Component, MutableMapping,
- metaclass=ABCMeta):
+class _HierarchicalContainer(_Component, MutableMapping, metaclass=ABCMeta):
"""
A container for hierarchical data in BinaryCIF files.
For example, the file contains multiple blocks, each block contains
@@ -181,10 +181,8 @@ def _serialize_elements(self, store_key_in=None):
if isinstance(element, self.subcomponent_class()):
try:
serialized_element = element.serialize()
- except:
- raise SerializationError(
- f"Failed to serialize element '{key}'"
- )
+ except Exception:
+ raise SerializationError(f"Failed to serialize element '{key}'")
else:
# Element is already stored in serialized form
serialized_element = element
@@ -200,10 +198,8 @@ def __getitem__(self, key):
# -> must be deserialized first
try:
element = self.subcomponent_class().deserialize(element)
- except:
- raise DeserializationError(
- f"Failed to deserialize element '{key}'"
- )
+ except Exception:
+ raise DeserializationError(f"Failed to deserialize element '{key}'")
# Update container with deserialized object
self._elements[key] = element
return element
@@ -220,10 +216,8 @@ def __setitem__(self, key, element):
else:
try:
element = self.subcomponent_class().deserialize(element)
- except:
- raise DeserializationError(
- f"Failed to deserialize given value"
- )
+ except Exception:
+ raise DeserializationError("Failed to deserialize given value")
self._elements[key] = element
def __delitem__(self, key):
diff --git a/src/biotite/structure/io/pdbx/convert.py b/src/biotite/structure/io/pdbx/convert.py
index 712dec0b9..d514ed87c 100644
--- a/src/biotite/structure/io/pdbx/convert.py
+++ b/src/biotite/structure/io/pdbx/convert.py
@@ -18,29 +18,41 @@
import itertools
import warnings
import numpy as np
-from ....file import InvalidFileError
-from ....sequence.seqtypes import NucleotideSequence, ProteinSequence
-from ...atoms import AtomArray, AtomArrayStack, repeat
-from ...bonds import BondList, BondType, connect_via_residue_names
-from ...box import unitcell_from_vectors, vectors_from_unitcell
-from ...filter import filter_first_altloc, filter_highest_occupancy_altloc
-from ...residues import get_residue_count, get_residue_starts_for
-from ...error import BadStructureError
-from ...util import matrix_rotate
-from .component import MaskValue
-from .cif import CIFFile, CIFBlock
-from .bcif import BinaryCIFFile, BinaryCIFBlock, BinaryCIFColumn
-from .encoding import StringArrayEncoding
-
+from biotite.file import InvalidFileError
+from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence
+from biotite.structure.atoms import AtomArray, AtomArrayStack, repeat
+from biotite.structure.bonds import BondList, BondType, connect_via_residue_names
+from biotite.structure.box import unitcell_from_vectors, vectors_from_unitcell
+from biotite.structure.error import BadStructureError
+from biotite.structure.filter import (
+ filter_first_altloc,
+ filter_highest_occupancy_altloc,
+)
+from biotite.structure.io.pdbx.bcif import (
+ BinaryCIFBlock,
+ BinaryCIFColumn,
+ BinaryCIFFile,
+)
+from biotite.structure.io.pdbx.cif import CIFBlock, CIFFile
+from biotite.structure.io.pdbx.component import MaskValue
+from biotite.structure.io.pdbx.encoding import StringArrayEncoding
+from biotite.structure.residues import get_residue_count, get_residue_starts_for
+from biotite.structure.util import matrix_rotate
# Cond types in `struct_conn` category that refer to covalent bonds
PDBX_COVALENT_TYPES = [
- "covale", "covale_base", "covale_phosphate", "covale_sugar",
- "disulf", "modres", "modres_link", "metalc"
+ "covale",
+ "covale_base",
+ "covale_phosphate",
+ "covale_sugar",
+ "disulf",
+ "modres",
+ "modres_link",
+ "metalc",
]
# Map 'struct_conn' bond orders to 'BondType'...
PDBX_BOND_ORDER_TO_TYPE = {
- "": BondType.ANY,
+ "": BondType.ANY,
"sing": BondType.SINGLE,
"doub": BondType.DOUBLE,
"trip": BondType.TRIPLE,
@@ -60,13 +72,13 @@
}
# Map 'chem_comp_bond' bond orders and aromaticity to 'BondType'...
COMP_BOND_ORDER_TO_TYPE = {
- ("SING", "N") : BondType.SINGLE,
- ("DOUB", "N") : BondType.DOUBLE,
- ("TRIP", "N") : BondType.TRIPLE,
- ("QUAD", "N") : BondType.QUADRUPLE,
- ("SING", "Y") : BondType.AROMATIC_SINGLE,
- ("DOUB", "Y") : BondType.AROMATIC_DOUBLE,
- ("TRIP", "Y") : BondType.AROMATIC_TRIPLE,
+ ("SING", "N"): BondType.SINGLE,
+ ("DOUB", "N"): BondType.DOUBLE,
+ ("TRIP", "N"): BondType.TRIPLE,
+ ("QUAD", "N"): BondType.QUADRUPLE,
+ ("SING", "Y"): BondType.AROMATIC_SINGLE,
+ ("DOUB", "Y"): BondType.AROMATIC_DOUBLE,
+ ("TRIP", "Y"): BondType.AROMATIC_TRIPLE,
}
# ...and vice versa
COMP_BOND_TYPE_TO_ORDER = {
@@ -97,16 +109,15 @@ def _filter(category, index):
Column = Category.subcomponent_class()
Data = Column.subcomponent_class()
- return Category({
- key: Column(
- Data(column.data.array[index]),
- (
- Data(column.mask.array[index])
- if column.mask is not None else None
+ return Category(
+ {
+ key: Column(
+ Data(column.data.array[index]),
+ (Data(column.mask.array[index]) if column.mask is not None else None),
)
- )
- for key, column in category.items()
- })
+ for key, column in category.items()
+ }
+ )
def get_sequence(pdbx_file, data_block=None):
@@ -148,7 +159,7 @@ def get_sequence(pdbx_file, data_block=None):
"""
block = _get_block(pdbx_file, data_block)
- poly_category= block["entity_poly"]
+ poly_category = block["entity_poly"]
seq_string = poly_category["pdbx_seq_one_letter_code_can"].as_array(str)
seq_type = poly_category["type"].as_array(str)
@@ -158,7 +169,7 @@ def get_sequence(pdbx_file, data_block=None):
for string, stype in zip(seq_string, seq_type)
]
- strand_ids = poly_category['pdbx_strand_id'].as_array(str)
+ strand_ids = poly_category["pdbx_strand_id"].as_array(str)
strand_ids = [strand_id.split(",") for strand_id in strand_ids]
sequence_dict = {
@@ -192,14 +203,20 @@ def get_model_count(pdbx_file, data_block=None):
The number of models.
"""
block = _get_block(pdbx_file, data_block)
- return len(_get_model_starts(
- block["atom_site"]["pdbx_PDB_model_num"].as_array(np.int32)
- ))
+ return len(
+ _get_model_starts(block["atom_site"]["pdbx_PDB_model_num"].as_array(np.int32))
+ )
-def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
- extra_fields=None, use_author_fields=True,
- include_bonds=False):
+def get_structure(
+ pdbx_file,
+ model=None,
+ data_block=None,
+ altloc="first",
+ extra_fields=None,
+ use_author_fields=True,
+ include_bonds=False,
+):
"""
Create an :class:`AtomArray` or :class:`AtomArrayStack` from the
``atom_site`` category in a file.
@@ -310,12 +327,21 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
"instead"
)
- atoms.coord[:, :, 0] = atom_site["Cartn_x"].as_array(np.float32) \
- .reshape((model_count, model_length))
- atoms.coord[:, :, 1] = atom_site["Cartn_y"].as_array(np.float32) \
- .reshape((model_count, model_length))
- atoms.coord[:, :, 2] = atom_site["Cartn_z"].as_array(np.float32) \
- .reshape((model_count, model_length))
+ atoms.coord[:, :, 0] = (
+ atom_site["Cartn_x"]
+ .as_array(np.float32)
+ .reshape((model_count, model_length))
+ )
+ atoms.coord[:, :, 1] = (
+ atom_site["Cartn_y"]
+ .as_array(np.float32)
+ .reshape((model_count, model_length))
+ )
+ atoms.coord[:, :, 2] = (
+ atom_site["Cartn_z"]
+ .as_array(np.float32)
+ .reshape((model_count, model_length))
+ )
box = _get_box(block)
if box is not None:
@@ -345,31 +371,25 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
atoms.box = _get_box(block)
# The below part is the same for both, AtomArray and AtomArrayStack
- _fill_annotations(
- atoms, model_atom_site, extra_fields, use_author_fields
- )
+ _fill_annotations(atoms, model_atom_site, extra_fields, use_author_fields)
if include_bonds:
if "chem_comp_bond" in block:
try:
- custom_bond_dict = _parse_intra_residue_bonds(
- block["chem_comp_bond"]
- )
+ custom_bond_dict = _parse_intra_residue_bonds(block["chem_comp_bond"])
except KeyError:
warnings.warn(
"The 'chem_comp_bond' category has missing columns, "
"falling back to using Chemical Component Dictionary",
- UserWarning
+ UserWarning,
)
custom_bond_dict = None
- bonds = connect_via_residue_names(
- atoms, custom_bond_dict=custom_bond_dict
- )
+ bonds = connect_via_residue_names(atoms, custom_bond_dict=custom_bond_dict)
else:
bonds = connect_via_residue_names(atoms)
if "struct_conn" in block:
- bonds = bonds.merge(_parse_inter_residue_bonds(
- model_atom_site, block["struct_conn"]
- ))
+ bonds = bonds.merge(
+ _parse_inter_residue_bonds(model_atom_site, block["struct_conn"])
+ )
atoms.bonds = bonds
atoms = _filter_altloc(atoms, model_atom_site, altloc)
@@ -388,24 +408,24 @@ def _get_block(pdbx_component, block_name):
def _get_or_fallback(category, key, fallback_key):
- """
- Return column related to key in category if it exists,
- otherwise try to get the column related to fallback key.
- """
- if key not in category:
- warnings.warn(
- f"Attribute '{key}' not found within 'atom_site' category. "
- f"The fallback attribute '{fallback_key}' will be used instead",
- UserWarning
- )
- try:
- return category[fallback_key]
- except KeyError as key_exc:
- raise InvalidFileError(
- f"Fallback attribute '{fallback_key}' not found within "
- "'atom_site' category"
- ) from key_exc
- return category[key]
+ """
+ Return column related to key in category if it exists,
+ otherwise try to get the column related to fallback key.
+ """
+ if key not in category:
+ warnings.warn(
+ f"Attribute '{key}' not found within 'atom_site' category. "
+ f"The fallback attribute '{fallback_key}' will be used instead",
+ UserWarning,
+ )
+ try:
+ return category[fallback_key]
+ except KeyError as key_exc:
+ raise InvalidFileError(
+ f"Fallback attribute '{fallback_key}' not found within "
+ "'atom_site' category"
+ ) from key_exc
+ return category[key]
def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
@@ -424,78 +444,52 @@ def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
instead of ``label_``.
"""
- prefix, alt_prefix = (
- ("auth", "label") if use_author_fields else ("label", "auth")
- )
+ prefix, alt_prefix = ("auth", "label") if use_author_fields else ("label", "auth")
array.set_annotation(
"chain_id",
_get_or_fallback(
atom_site, f"{prefix}_asym_id", f"{alt_prefix}_asym_id"
- ).as_array("U4")
+ ).as_array("U4"),
)
array.set_annotation(
"res_id",
_get_or_fallback(
atom_site, f"{prefix}_seq_id", f"{alt_prefix}_seq_id"
- ).as_array(int, -1)
- )
- array.set_annotation(
- "ins_code",
- atom_site["pdbx_PDB_ins_code"].as_array("U1", "")
+ ).as_array(int, -1),
)
+ array.set_annotation("ins_code", atom_site["pdbx_PDB_ins_code"].as_array("U1", ""))
array.set_annotation(
"res_name",
_get_or_fallback(
atom_site, f"{prefix}_comp_id", f"{alt_prefix}_comp_id"
- ).as_array("U5")
- )
- array.set_annotation(
- "hetero",
- atom_site["group_PDB"].as_array(str) == "HETATM"
+ ).as_array("U5"),
)
+ array.set_annotation("hetero", atom_site["group_PDB"].as_array(str) == "HETATM")
array.set_annotation(
"atom_name",
_get_or_fallback(
atom_site, f"{prefix}_atom_id", f"{alt_prefix}_atom_id"
- ).as_array("U6")
- )
- array.set_annotation(
- "element",
- atom_site["type_symbol"].as_array("U2")
+ ).as_array("U6"),
)
+ array.set_annotation("element", atom_site["type_symbol"].as_array("U2"))
if "atom_id" in extra_fields:
- array.set_annotation(
- "atom_id",
- atom_site["id"].as_array(int)
- )
+ array.set_annotation("atom_id", atom_site["id"].as_array(int))
extra_fields.remove("atom_id")
if "b_factor" in extra_fields:
- array.set_annotation(
- "b_factor",
- atom_site["B_iso_or_equiv"].as_array(float)
- )
+ array.set_annotation("b_factor", atom_site["B_iso_or_equiv"].as_array(float))
extra_fields.remove("b_factor")
if "occupancy" in extra_fields:
- array.set_annotation(
- "occupancy",
- atom_site["occupancy"].as_array(float)
- )
+ array.set_annotation("occupancy", atom_site["occupancy"].as_array(float))
extra_fields.remove("occupancy")
if "charge" in extra_fields:
- array.set_annotation(
- "charge",
- atom_site["pdbx_formal_charge"].as_array(int, 0)
- )
+ array.set_annotation("charge", atom_site["pdbx_formal_charge"].as_array(int, 0))
extra_fields.remove("charge")
# Handle all remaining custom fields
for field in extra_fields:
- array.set_annotation(
- field,
- atom_site[field].as_array(str)
- )
+ array.set_annotation(field, atom_site[field].as_array(str))
def _parse_intra_residue_bonds(chem_comp_bond):
@@ -509,7 +503,7 @@ def _parse_intra_residue_bonds(chem_comp_bond):
chem_comp_bond["atom_id_1"].as_array(str),
chem_comp_bond["atom_id_2"].as_array(str),
chem_comp_bond["value_order"].as_array(str),
- chem_comp_bond["pdbx_aromatic_flag"].as_array(str)
+ chem_comp_bond["pdbx_aromatic_flag"].as_array(str),
):
if res_name not in custom_bond_dict:
custom_bond_dict[res_name] = {}
@@ -530,33 +524,32 @@ def _parse_inter_residue_bonds(atom_site, struct_conn):
IDENTITY = "1_555"
# Columns in 'atom_site' that should be matched by 'struct_conn'
COLUMNS = [
- "label_asym_id", "label_comp_id", "label_seq_id", "label_atom_id",
- "label_alt_id", "auth_asym_id", "auth_comp_id", "auth_seq_id",
- "pdbx_PDB_ins_code"
+ "label_asym_id",
+ "label_comp_id",
+ "label_seq_id",
+ "label_atom_id",
+ "label_alt_id",
+ "auth_asym_id",
+ "auth_comp_id",
+ "auth_seq_id",
+ "pdbx_PDB_ins_code",
]
covale_mask = np.isin(
struct_conn["conn_type_id"].as_array(str), PDBX_COVALENT_TYPES
)
if "ptnr1_symmetry" in struct_conn:
- covale_mask &= (
- struct_conn["ptnr1_symmetry"].as_array(str, IDENTITY) == IDENTITY
- )
+ covale_mask &= struct_conn["ptnr1_symmetry"].as_array(str, IDENTITY) == IDENTITY
if "ptnr2_symmetry" in struct_conn:
- covale_mask &= (
- struct_conn["ptnr2_symmetry"].as_array(str, IDENTITY) == IDENTITY
- )
+ covale_mask &= struct_conn["ptnr2_symmetry"].as_array(str, IDENTITY) == IDENTITY
atom_indices = [None] * 2
for i in range(2):
reference_arrays = []
query_arrays = []
for col_name in COLUMNS:
- struct_conn_col_name = _get_struct_conn_col_name(col_name, i+1)
- if (
- col_name not in atom_site
- or struct_conn_col_name not in struct_conn
- ):
+ struct_conn_col_name = _get_struct_conn_col_name(col_name, i + 1)
+ if col_name not in atom_site or struct_conn_col_name not in struct_conn:
continue
# Ensure both arrays have the same dtype to allow comparison
reference = atom_site[col_name].as_array()
@@ -593,7 +586,7 @@ def _parse_inter_residue_bonds(atom_site, struct_conn):
return BondList(
atom_site.row_count,
- np.stack([atoms_indices_1, atoms_indices_2, bond_types], axis=-1)
+ np.stack([atoms_indices_1, atoms_indices_2, bond_types], axis=-1),
)
@@ -603,10 +596,13 @@ def _find_matches(query_arrays, reference_arrays):
`reference_arrays` where all query values the reference counterpart.
If no match is found for a query, the corresponding index is -1.
"""
- match_masks_for_all_columns = np.stack([
- query[:, np.newaxis] == reference[np.newaxis, :]
- for query, reference in zip(query_arrays, reference_arrays)
- ], axis=-1)
+ match_masks_for_all_columns = np.stack(
+ [
+ query[:, np.newaxis] == reference[np.newaxis, :]
+ for query, reference in zip(query_arrays, reference_arrays)
+ ],
+ axis=-1,
+ )
match_masks = np.all(match_masks_for_all_columns, axis=-1)
query_matches, reference_matches = np.where(match_masks)
@@ -680,14 +676,8 @@ def _filter_model(atom_site, model_starts, model):
Reduce the ``atom_site`` category to the values for the given
model.
"""
- Category = type(atom_site)
- Column = Category.subcomponent_class()
- Data = Column.subcomponent_class()
-
# Append exclusive stop
- model_starts = np.append(
- model_starts, [atom_site.row_count]
- )
+ model_starts = np.append(model_starts, [atom_site.row_count])
# Indexing starts at 0, but model number starts at 1
model_index = model - 1
index = slice(model_starts[model_index], model_starts[model_index + 1])
@@ -773,9 +763,7 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
# Fill PDBx columns from information
# in structures' attribute arrays as good as possible
atom_site = Category()
- atom_site["group_PDB"] = np.where(
- array.hetero, "HETATM", "ATOM"
- )
+ atom_site["group_PDB"] = np.where(array.hetero, "HETATM", "ATOM")
atom_site["type_symbol"] = np.copy(array.element)
atom_site["label_atom_id"] = np.copy(array.atom_name)
atom_site["label_alt_id"] = Column(
@@ -789,7 +777,7 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
atom_site["label_seq_id"] = np.copy(array.res_id)
atom_site["pdbx_PDB_ins_code"] = Column(
np.copy(array.ins_code),
- np.where(array.ins_code == "", MaskValue.INAPPLICABLE, MaskValue.PRESENT)
+ np.where(array.ins_code == "", MaskValue.INAPPLICABLE, MaskValue.PRESENT),
)
atom_site["auth_seq_id"] = atom_site["label_seq_id"]
atom_site["auth_comp_id"] = atom_site["label_comp_id"]
@@ -806,11 +794,11 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
if "charge" in annot_categories:
atom_site["pdbx_formal_charge"] = Column(
np.array([f"{c:+d}" if c != 0 else "?" for c in array.charge]),
- np.where(array.charge == 0, MaskValue.MISSING, MaskValue.PRESENT)
+ np.where(array.charge == 0, MaskValue.MISSING, MaskValue.PRESENT),
)
if array.bonds is not None:
- struct_conn = _set_inter_residue_bonds(array, atom_site)
+ struct_conn = _set_inter_residue_bonds(array, atom_site)
if struct_conn is not None:
block["struct_conn"] = struct_conn
if include_bonds:
@@ -828,16 +816,12 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
atom_site["Cartn_x"] = np.copy(np.ravel(array.coord[..., 0]))
atom_site["Cartn_y"] = np.copy(np.ravel(array.coord[..., 1]))
atom_site["Cartn_z"] = np.copy(np.ravel(array.coord[..., 2]))
- atom_site["pdbx_PDB_model_num"] = np.ones(
- array.array_length(), dtype=np.int32
- )
+ atom_site["pdbx_PDB_model_num"] = np.ones(array.array_length(), dtype=np.int32)
# In case of multiple models repeat annotations
# and use model specific coordinates
else:
atom_site = _repeat(atom_site, array.stack_depth())
- coord = np.reshape(
- array.coord, (array.stack_depth() * array.array_length(), 3)
- )
+ coord = np.reshape(array.coord, (array.stack_depth() * array.array_length(), 3))
atom_site["Cartn_x"] = np.copy(coord[:, 0])
atom_site["Cartn_y"] = np.copy(coord[:, 1])
atom_site["Cartn_z"] = np.copy(coord[:, 2])
@@ -845,11 +829,9 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
np.arange(1, array.stack_depth() + 1, dtype=np.int32),
repeats=array.array_length(),
)
- if not "atom_id" in annot_categories:
+ if "atom_id" not in annot_categories:
# Count from 1
- atom_site["id"] = np.arange(
- 1, len(atom_site["group_PDB"]) + 1
- )
+ atom_site["id"] = np.arange(1, len(atom_site["group_PDB"]) + 1)
block["atom_site"] = atom_site
# Write box into file
@@ -913,7 +895,7 @@ def _determine_entity_id(chain_id):
for i in range(len(chain_id)):
try:
entity_id[i] = id_translation[chain_id[i]]
- except:
+ except KeyError:
# chain_id is not in dictionary -> new entry
id_translation[chain_id[i]] = id
entity_id[i] = id_translation[chain_id[i]]
@@ -938,8 +920,11 @@ def _repeat(category, repetitions):
data = Data(np.tile(column.data.array, repetitions), data_encoding)
else:
data = Data(np.tile(column.data.array, repetitions))
- mask = Data(np.tile(column.mask.array, repetitions)) \
- if column.mask is not None else None
+ mask = (
+ Data(np.tile(column.mask.array, repetitions))
+ if column.mask is not None
+ else None
+ )
category_dict[key] = Column(data, mask)
return Category(category_dict)
@@ -986,22 +971,18 @@ def _set_intra_residue_bonds(array, atom_site):
chem_comp_bond["atom_id_1"] = array.atom_name[bond_array[:, 0]]
chem_comp_bond["atom_id_2"] = array.atom_name[bond_array[:, 1]]
chem_comp_bond["value_order"] = Column(
- value_order,
- np.where(any_mask, MaskValue.MISSING, MaskValue.PRESENT)
+ value_order, np.where(any_mask, MaskValue.MISSING, MaskValue.PRESENT)
)
chem_comp_bond["pdbx_aromatic_flag"] = Column(
- aromatic_flag,
- np.where(any_mask, MaskValue.MISSING, MaskValue.PRESENT)
+ aromatic_flag, np.where(any_mask, MaskValue.MISSING, MaskValue.PRESENT)
)
# BondList does not contain stereo information
# -> all values are missing
chem_comp_bond["pdbx_stereo_config"] = Column(
np.zeros(len(bond_array), dtype="U1"),
- np.full(len(bond_array), MaskValue.MISSING)
- )
- chem_comp_bond["pdbx_ordinal"] = np.arange(
- 1, len(bond_array) + 1, dtype=np.int32
+ np.full(len(bond_array), MaskValue.MISSING),
)
+ chem_comp_bond["pdbx_ordinal"] = np.arange(1, len(bond_array) + 1, dtype=np.int32)
return chem_comp_bond
@@ -1013,8 +994,11 @@ def _set_inter_residue_bonds(array, atom_site):
``atom_site`` category.
"""
COLUMNS = [
- "label_asym_id", "label_comp_id", "label_seq_id", "label_atom_id",
- "pdbx_PDB_ins_code"
+ "label_asym_id",
+ "label_comp_id",
+ "label_seq_id",
+ "label_atom_id",
+ "pdbx_PDB_ins_code",
]
Category = type(atom_site)
@@ -1027,13 +1011,12 @@ def _set_inter_residue_bonds(array, atom_site):
struct_conn["id"] = np.arange(1, len(bond_array) + 1)
struct_conn["conn_type_id"] = np.full(len(bond_array), "covale")
struct_conn["pdbx_value_order"] = Column(
- np.array(
- [PDBX_BOND_TYPE_TO_ORDER[btype] for btype in bond_array[:, 2]]
- ),
+ np.array([PDBX_BOND_TYPE_TO_ORDER[btype] for btype in bond_array[:, 2]]),
np.where(
bond_array[:, 2] == BondType.ANY,
- MaskValue.MISSING, MaskValue.PRESENT,
- )
+ MaskValue.MISSING,
+ MaskValue.PRESENT,
+ ),
)
# Write the identifying annotation...
for col_name in COLUMNS:
@@ -1041,8 +1024,9 @@ def _set_inter_residue_bonds(array, atom_site):
# ...for each bond partner
for i in range(2):
atom_indices = bond_array[:, i]
- struct_conn[_get_struct_conn_col_name(col_name, i+1)] \
- = annot[atom_indices]
+ struct_conn[_get_struct_conn_col_name(col_name, i + 1)] = annot[
+ atom_indices
+ ]
return struct_conn
@@ -1054,9 +1038,9 @@ def _filter_bonds(array, connection):
bond_array = array.bonds.as_array()
# To save computation time call 'get_residue_starts_for()' only once
# with indices of the first and second atom of each bond
- residue_starts_1, residue_starts_2 = get_residue_starts_for(
- array, bond_array[:, :2].flatten()
- ).reshape(-1, 2).T
+ residue_starts_1, residue_starts_2 = (
+ get_residue_starts_for(array, bond_array[:, :2].flatten()).reshape(-1, 2).T
+ )
if connection == "intra":
return bond_array[residue_starts_1 == residue_starts_2]
elif connection == "inter":
@@ -1065,8 +1049,7 @@ def _filter_bonds(array, connection):
raise ValueError("Invalid 'connection' option")
-def get_component(pdbx_file, data_block=None, use_ideal_coord=True,
- res_name=None):
+def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=None):
"""
Create an :class:`AtomArray` for a chemical component from the
``chem_comp_atom`` and, if available, the ``chem_comp_bond``
@@ -1166,16 +1149,16 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True,
coord_fields, alt_coord_fields = alt_coord_fields, coord_fields
try:
for i, field in enumerate(coord_fields):
- array.coord[:,i] = atom_category[field].as_array(np.float32)
+ array.coord[:, i] = atom_category[field].as_array(np.float32)
except KeyError as err:
key = err.args[0]
warnings.warn(
f"Attribute '{key}' not found within 'chem_comp_atom' category. "
f"The fallback coordinates will be used instead",
- UserWarning
+ UserWarning,
)
for i, field in enumerate(alt_coord_fields):
- array.coord[:,i] = atom_category[field].as_array(np.float32)
+ array.coord[:, i] = atom_category[field].as_array(np.float32)
try:
bond_category = block["chem_comp_bond"]
@@ -1185,9 +1168,8 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True,
)
except KeyError:
warnings.warn(
- f"Category 'chem_comp_bond' not found. "
- f"No bonds will be parsed",
- UserWarning
+ "Category 'chem_comp_bond' not found. " "No bonds will be parsed",
+ UserWarning,
)
else:
bonds = BondList(array.array_length())
@@ -1195,7 +1177,7 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True,
bond_category["atom_id_1"].as_array(str),
bond_category["atom_id_2"].as_array(str),
bond_category["value_order"].as_array(str),
- bond_category["pdbx_aromatic_flag"].as_array(str)
+ bond_category["pdbx_aromatic_flag"].as_array(str),
):
atom_i = np.where(array.atom_name == atom1)[0][0]
atom_j = np.where(array.atom_name == atom2)[0][0]
@@ -1237,9 +1219,7 @@ def set_component(pdbx_file, array, data_block=None):
Category = block.subcomponent_class()
if get_residue_count(array) > 1:
- raise BadStructureError(
- "The input atom array must comprise only one residue"
- )
+ raise BadStructureError("The input atom array must comprise only one residue")
res_name = array.res_name[0]
annot_categories = array.get_annotation_categories()
@@ -1262,31 +1242,28 @@ def set_component(pdbx_file, array, data_block=None):
atom_cat["pdbx_model_Cartn_z_ideal"] = atom_cat["model_Cartn_z"]
atom_cat["pdbx_component_atom_id"] = atom_cat["atom_id"]
atom_cat["pdbx_component_comp_id"] = atom_cat["comp_id"]
- atom_cat["pdbx_ordinal"] = np.arange(
- 1, array.array_length() + 1
- ).astype(str)
+ atom_cat["pdbx_ordinal"] = np.arange(1, array.array_length() + 1).astype(str)
block["chem_comp_atom"] = atom_cat
if array.bonds is not None and array.bonds.get_bond_count() > 0:
bond_array = array.bonds.as_array()
order_flags = []
aromatic_flags = []
- for bond_type in bond_array[:,2]:
+ for bond_type in bond_array[:, 2]:
order_flag, aromatic_flag = COMP_BOND_TYPE_TO_ORDER[bond_type]
order_flags.append(order_flag)
aromatic_flags.append(aromatic_flag)
bond_cat = Category()
bond_cat["comp_id"] = np.full(len(bond_array), res_name)
- bond_cat["atom_id_1"] = array.atom_name[bond_array[:,0]]
- bond_cat["atom_id_2"] = array.atom_name[bond_array[:,1]]
+ bond_cat["atom_id_1"] = array.atom_name[bond_array[:, 0]]
+ bond_cat["atom_id_2"] = array.atom_name[bond_array[:, 1]]
bond_cat["value_order"] = np.array(order_flags)
bond_cat["pdbx_aromatic_flag"] = np.array(aromatic_flags)
- bond_cat["pdbx_ordinal"] = np.arange(
- 1, len(bond_array) + 1
- ).astype(str)
+ bond_cat["pdbx_ordinal"] = np.arange(1, len(bond_array) + 1).astype(str)
block["chem_comp_bond"] = bond_cat
+
def list_assemblies(pdbx_file, data_block=None):
"""
List the biological assemblies that are available for the structure
@@ -1337,14 +1314,21 @@ def list_assemblies(pdbx_file, data_block=None):
id: details
for id, details in zip(
assembly_category["id"].as_array(str),
- assembly_category["details"].as_array(str)
+ assembly_category["details"].as_array(str),
)
}
-def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
- altloc="first", extra_fields=None, use_author_fields=True,
- include_bonds=False):
+def get_assembly(
+ pdbx_file,
+ assembly_id=None,
+ model=None,
+ data_block=None,
+ altloc="first",
+ extra_fields=None,
+ use_author_fields=True,
+ include_bonds=False,
+):
"""
Build the given biological assembly.
@@ -1434,9 +1418,7 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
try:
assembly_gen_category = block["pdbx_struct_assembly_gen"]
except KeyError:
- raise InvalidFileError(
- "File has no 'pdbx_struct_assembly_gen' category"
- )
+ raise InvalidFileError("File has no 'pdbx_struct_assembly_gen' category")
try:
struct_oper_category = block["pdbx_struct_oper_list"]
@@ -1469,7 +1451,7 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
altloc,
extra_fields_and_asym,
use_author_fields,
- include_bonds
+ include_bonds,
)
### Get transformations and apply them to the affected asym IDs
@@ -1485,9 +1467,7 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
operations = _parse_operation_expression(op_expr)
asym_ids = asym_id_expr.split(",")
# Filter affected asym IDs
- sub_structure = structure[
- ..., np.isin(structure.label_asym_id, asym_ids)
- ]
+ sub_structure = structure[..., np.isin(structure.label_asym_id, asym_ids)]
sub_assembly = _apply_transformations(
sub_structure, transformations, operations
)
@@ -1546,10 +1526,9 @@ def _get_transformations(struct_oper):
for i in (1, 2, 3)
]
)
- translation_vector = np.array([
- struct_oper[f"vector[{i}]"].as_array(float)[index]
- for i in (1, 2, 3)
- ])
+ translation_vector = np.array(
+ [struct_oper[f"vector[{i}]"].as_array(float)[index] for i in (1, 2, 3)]
+ )
transformation_dict[id] = (rotation_matrix, translation_vector)
return transformation_dict
@@ -1604,6 +1583,4 @@ def _convert_string_to_sequence(string, stype):
elif stype in _other_type_list:
return None
else:
- raise InvalidFileError(
- "mmCIF _entity_poly.type unsupported" " type: " + stype
- )
+ raise InvalidFileError("mmCIF _entity_poly.type unsupported" " type: " + stype)
diff --git a/src/biotite/structure/io/tng/__init__.py b/src/biotite/structure/io/tng/__init__.py
index b344635fd..250b5b3c3 100644
--- a/src/biotite/structure/io/tng/__init__.py
+++ b/src/biotite/structure/io/tng/__init__.py
@@ -10,4 +10,4 @@
__name__ = "biotite.structure.io.tng"
__author__ = "Patrick Kunzmann"
-from .file import *
\ No newline at end of file
+from .file import *
diff --git a/src/biotite/structure/io/tng/file.py b/src/biotite/structure/io/tng/file.py
index 8666ecc39..fb0705ab9 100644
--- a/src/biotite/structure/io/tng/file.py
+++ b/src/biotite/structure/io/tng/file.py
@@ -7,19 +7,20 @@
__all__ = ["TNGFile"]
import numpy as np
-from ..trajfile import TrajectoryFile
+from biotite.structure.io.trajfile import TrajectoryFile
class TNGFile(TrajectoryFile):
"""
This file class represents a TNG trajectory file.
"""
-
+
@classmethod
def traj_type(cls):
import mdtraj.formats as traj
+
return traj.TNGTrajectoryFile
-
+
@classmethod
def process_read_values(cls, read_values):
# nm to Angstrom
@@ -29,18 +30,15 @@ def process_read_values(cls, read_values):
box *= 10
time = read_values[1]
return coord, box, time
-
+
@classmethod
def prepare_write_values(cls, coord, box, time):
# Angstrom to nm
- xyz = np.divide(coord, 10, dtype=np.float32) \
- if coord is not None else None
- time = time.astype(np.float32, copy=False) \
- if time is not None else None
- box = np.divide(box, 10, dtype=np.float32) \
- if box is not None else None
+ xyz = np.divide(coord, 10, dtype=np.float32) if coord is not None else None
+ time = time.astype(np.float32, copy=False) if time is not None else None
+ box = np.divide(box, 10, dtype=np.float32) if box is not None else None
return {
- "xyz" : xyz,
- "box" : box,
- "time" : time,
+ "xyz": xyz,
+ "box": box,
+ "time": time,
}
diff --git a/src/biotite/structure/io/trajfile.py b/src/biotite/structure/io/trajfile.py
index 23842ea4e..8635bb2e2 100644
--- a/src/biotite/structure/io/trajfile.py
+++ b/src/biotite/structure/io/trajfile.py
@@ -6,18 +6,18 @@
__author__ = "Patrick Kunzmann"
__all__ = ["TrajectoryFile"]
-import itertools
import abc
+import itertools
import numpy as np
-from ..atoms import AtomArray, AtomArrayStack, stack, from_template
-from ...file import File
+from biotite.file import File
+from biotite.structure.atoms import AtomArray, AtomArrayStack, from_template
class TrajectoryFile(File, metaclass=abc.ABCMeta):
"""
This file class represents a trajectory file interfacing a
trajectory file class from `MDtraj`.
-
+
A trajectory file stores atom coordinates over multiple (time)
frames. The file formats are usually binary and involve sometimes
heavy compression, so that a large number of frames can be stored
@@ -34,27 +34,27 @@ class TrajectoryFile(File, metaclass=abc.ABCMeta):
Therefore, it is strongly recommended to make a copy of the
respective array, if the array is modified.
"""
-
+
def __init__(self):
super().__init__()
self._coord = None
self._time = None
self._box = None
self._model_count = None
-
@classmethod
- def read(cls, file_name, start=None, stop=None, step=None,
- atom_i=None, chunk_size=None):
+ def read(
+ cls, file_name, start=None, stop=None, step=None, atom_i=None, chunk_size=None
+ ):
"""
Read a trajectory file.
-
+
A trajectory file can be seen as a file representation of an
:class:`AtomArrayStack`.
Therefore, `start`, `stop` and `step` represent slice parameters
of the index of the first dimension and
`atom_i` represents an index array for the second dimension.
-
+
Parameters
----------
file_name : str
@@ -85,7 +85,7 @@ def read(cls, file_name, start=None, stop=None, step=None,
Although lower values can decrease the memory consumption of
reading trajectories, they also increase the computation
time.
-
+
Returns
-------
file_object : TrajectoryFile
@@ -105,7 +105,6 @@ def read(cls, file_name, start=None, stop=None, step=None,
traj_type = cls.traj_type()
with traj_type(file_name, "r") as f:
-
if start is None:
start = 0
# Discard atoms before start
@@ -116,13 +115,13 @@ def read(cls, file_name, start=None, stop=None, step=None,
TrajectoryFile._read_chunk_wise(
f, start, None, atom_i, chunk_size, discard=True
)
-
+
# The upcoming frames are saved
# Calculate the amount of frames to be read
if stop is None:
n_frames = None
else:
- n_frames = stop-start
+ n_frames = stop - start
if step is not None and n_frames is not None:
# Divide number of frames by 'step' in order to convert
# 'step' into 'stride'
@@ -130,7 +129,7 @@ def read(cls, file_name, start=None, stop=None, step=None,
# the number of frames is decremented before division
# and incremented afterwards again
n_frames = ((n_frames - 1) // step) + 1
-
+
# Read frames
if chunk_size is None:
result = f.read(n_frames, stride=step, atom_indices=atom_i)
@@ -138,7 +137,7 @@ def read(cls, file_name, start=None, stop=None, step=None,
result = TrajectoryFile._read_chunk_wise(
f, n_frames, step, atom_i, chunk_size, discard=False
)
-
+
# nm to Angstrom
coord, box, time = cls.process_read_values(result)
file.set_coord(coord)
@@ -146,15 +145,15 @@ def read(cls, file_name, start=None, stop=None, step=None,
file.set_time(time)
return file
-
@classmethod
- def read_iter(cls, file_name, start=None, stop=None, step=None,
- atom_i=None, stack_size=None):
+ def read_iter(
+ cls, file_name, start=None, stop=None, step=None, atom_i=None, stack_size=None
+ ):
"""
Create an iterator over each frame of the given trajectory file
in the selected range.
-
+
Parameters
----------
file_name : str
@@ -181,7 +180,7 @@ def read_iter(cls, file_name, start=None, stop=None, step=None,
values.
If the number of frames is not a multiple of `stack_size`,
the final stack is smaller than `stack_size`.
-
+
Yields
------
coord : ndarray, dtype=float32, shape=(n,3) or shape=(m,n,3)
@@ -190,30 +189,29 @@ def read_iter(cls, file_name, start=None, stop=None, step=None,
The box vectors of the current frame or stack.
time : float or ndarray, dtype=float32, shape=(n,) or None
The simulation time of the current frame or stack in *ps*.
-
+
See also
--------
read_iter_structure
-
+
Notes
-----
The `step` parameter does currently not work for *DCD* files.
"""
traj_type = cls.traj_type()
with traj_type(file_name, "r") as f:
-
if start is None:
start = 0
# Discard atoms before start
if start != 0:
f.read(n_frames=start, stride=None, atom_indices=atom_i)
-
+
# The upcoming frames are read
# Calculate the amount of frames to be read
if stop is None:
n_frames = None
else:
- n_frames = stop-start
+ n_frames = stop - start
if step is not None and n_frames is not None:
# Divide number of frames by 'step' in order to convert
# 'step' into 'stride'
@@ -221,7 +219,6 @@ def read_iter(cls, file_name, start=None, stop=None, step=None,
# the number of frames is decremented before division
# and incremented afterwards again
n_frames = ((n_frames - 1) // step) + 1
-
# Read frames
if stack_size is None:
@@ -242,7 +239,7 @@ def read_iter(cls, file_name, start=None, stop=None, step=None,
yield coord, box, time
if remaining_frames is not None:
remaining_frames -= 1
-
+
else:
remaining_frames = n_frames
while remaining_frames is None or remaining_frames > 0:
@@ -260,11 +257,18 @@ def read_iter(cls, file_name, start=None, stop=None, step=None,
yield coord, box, time
if remaining_frames is not None:
remaining_frames -= stack_size
-
@classmethod
- def read_iter_structure(cls, file_name, template, start=None, stop=None,
- step=None, atom_i=None, stack_size=None):
+ def read_iter_structure(
+ cls,
+ file_name,
+ template,
+ start=None,
+ stop=None,
+ step=None,
+ atom_i=None,
+ stack_size=None,
+ ):
"""
Create an iterator over each frame of the given trajectory file
in the selected range.
@@ -275,8 +279,8 @@ def read_iter_structure(cls, file_name, template, start=None, stop=None,
information and no topology information, this method requires
a template atom array or stack. This template can be acquired
for example from a PDB file, which is associated with the
- trajectory file.
-
+ trajectory file.
+
Parameters
----------
file_name : str
@@ -306,18 +310,18 @@ def read_iter_structure(cls, file_name, template, start=None, stop=None,
determined by this parameter.
If the number of frames is not a multiple of `stack_size`,
the final stack is smaller than `stack_size`.
-
+
Yields
------
structure : AtomArray or AtomArrayStack
The structure of the current frame as :class:`AtomArray`.
If `stack_size` is set, multiple frames are returned as
:class:`AtomArrayStack`.
-
+
See also
--------
read_iter
-
+
Notes
-----
This iterator creates a new copy of the given template for every
@@ -335,7 +339,7 @@ def read_iter_structure(cls, file_name, template, start=None, stop=None,
f"An 'AtomArray' or 'AtomArrayStack' is expected as template, "
f"not '{type(template).__name__}'"
)
-
+
for coord, box, _ in cls.read_iter(
file_name, start, stop, step, atom_i, stack_size
):
@@ -347,7 +351,6 @@ def read_iter_structure(cls, file_name, template, start=None, stop=None,
else:
yield from_template(template, coord, box)
-
def write(self, file_name):
"""
Write the content into a trajectory file.
@@ -360,9 +363,8 @@ def write(self, file_name):
"""
traj_type = self.traj_type()
param = self.prepare_write_values(self._coord, self._box, self._time)
- with traj_type(file_name, 'w') as f:
+ with traj_type(file_name, "w") as f:
f.write(**param)
-
@classmethod
def write_iter(cls, file_name, coord, box=None, time=None):
@@ -376,7 +378,7 @@ def write_iter(cls, file_name, coord, box=None, time=None):
Hence, this class method may save a large amount of memory if
a large file should be written, if `coord` are provided as
generator.
-
+
Parameters
----------
file_name : str
@@ -399,7 +401,7 @@ def write_iter(cls, file_name, coord, box=None, time=None):
time = itertools.repeat(None)
traj_type = cls.traj_type()
- with traj_type(file_name, 'w') as f:
+ with traj_type(file_name, "w") as f:
for c, b, t in zip(coord, box, time):
if c.ndim != 2:
raise IndexError(
@@ -414,24 +416,22 @@ def write_iter(cls, file_name, coord, box=None, time=None):
t = np.expand_dims(t, axis=0)
param = cls.prepare_write_values(c, b, t)
f.write(**param)
-
def get_coord(self):
"""
Extract only the atom coordinates from the trajectory file.
-
+
Returns
-------
coord : ndarray, dtype=float, shape=(m,n,3)
The coordinates stored in the trajectory file.
"""
return self._coord
-
def get_time(self):
"""
Get the simlation time in *ps* values for each frame.
-
+
Returns
-------
time : ndarray, dtype=float, shape=(m,)
@@ -439,12 +439,11 @@ def get_time(self):
frames, that were read from the file.
"""
return self._time
-
def get_box(self):
"""
Get the box vectors for each frame.
-
+
Returns
-------
box : ndarray, dtype=float, shape=(m,3,3)
@@ -452,12 +451,11 @@ def get_box(self):
frames, that were read from the file.
"""
return self._box
-
def set_coord(self, coord):
"""
Set the atom coordinates in the trajectory file.
-
+
Parameters
----------
coord : ndarray, dtype=float, shape=(m,n,3)
@@ -465,12 +463,11 @@ def set_coord(self, coord):
"""
self._check_model_count(coord)
self._coord = coord
-
def set_time(self, time):
"""
Set the simulation time of each frame in the trajectory file.
-
+
Parameters
----------
time : ndarray, dtype=float, shape=(m,)
@@ -478,13 +475,12 @@ def set_time(self, time):
"""
self._check_model_count(time)
self._time = time
-
def set_box(self, box):
"""
Set the periodic box vectors of each frame in the trajectory
file.
-
+
Parameters
----------
time : ndarray, dtype=float, shape=(m,3,3)
@@ -492,25 +488,24 @@ def set_box(self, box):
"""
self._check_model_count(box)
self._box = box
-
def get_structure(self, template):
"""
Convert the trajectory file content into an
:class:`AtomArrayStack`.
-
+
Since trajectory files usually only contain atom coordinate
information and no topology information, this method requires
a template atom array or stack. This template can be acquired
for example from a PDB file, which is associated with the
- trajectory file.
-
+ trajectory file.
+
Parameters
----------
template : AtomArray or AtomArrayStack
The template array or stack, where the atom annotation data
is taken from.
-
+
Returns
-------
array_stack : AtomArrayStack
@@ -519,15 +514,14 @@ def get_structure(self, template):
trajectory file.
"""
return from_template(template, self.get_coord(), self.get_box())
-
def set_structure(self, structure, time=None):
"""
Write an atom array (stack) into the trajectory file object.
-
+
The topology information (chain, residue, etc.) is not saved in
the file.
-
+
Parameters
----------
structure : AtomArray or AtomArrayStack
@@ -547,34 +541,30 @@ def set_structure(self, structure, time=None):
if time is not None:
self.set_time(time)
-
def copy(self):
"""
This operation is not implemented for trajectory files.
-
+
Raises
------
NotImplementedError
"""
- raise NotImplementedError("Copying is not implemented "
- "for trajectory files")
-
+ raise NotImplementedError("Copying is not implemented " "for trajectory files")
@classmethod
@abc.abstractmethod
def traj_type(cls):
"""
The `MDtraj` files class to be used.
-
+
PROTECTED: Override when inheriting.
-
+
Returns
-------
class
An `MDtraj` subclass of :class:`TrajectoryFile`.
"""
pass
-
@classmethod
@abc.abstractmethod
@@ -583,15 +573,15 @@ def process_read_values(cls, read_values):
Convert the return value of the `read()` method of the
respective :class:`mdtraj.TrajectoryFile` into coordinates,
simulation box and simulation time.
-
+
PROTECTED: Override when inheriting.
-
+
Parameters
----------
read_values : tuple
The return value of the respective
:func:`mdtraj.TrajectoryFile.read()` method.
-
+
Returns
-------
coord : ndarray, dtype=float, shape=(m,n,3)
@@ -602,7 +592,6 @@ def process_read_values(cls, read_values):
The simulation time in ps for each frame.
"""
pass
-
@classmethod
@abc.abstractmethod
@@ -622,7 +611,7 @@ def prepare_write_values(cls, coord, box, time):
The box vectors in Å for each frame.
time : ndarray, dtype=float, shape=(m,)
The simulation time in ps for each frame.
-
+
Returns
-------
parameters : dict
@@ -631,7 +620,6 @@ def prepare_write_values(cls, coord, box, time):
"""
pass
-
def _check_model_count(self, array):
"""
Check if the amount of models in the given array is equal to
@@ -650,11 +638,9 @@ def _check_model_count(self, array):
f"{len(array)} models were given, "
f"but the file contains {self._model_count} models"
)
-
@staticmethod
- def _read_chunk_wise(file, n_frames, step, atom_i, chunk_size,
- discard=False):
+ def _read_chunk_wise(file, n_frames, step, atom_i, chunk_size, discard=False):
"""
Similar to :func:`read()`, just for chunk-wise reading of the
trajectory.
@@ -691,7 +677,7 @@ def _read_chunk_wise(file, n_frames, step, atom_i, chunk_size,
chunks.append(chunk)
if remaining_frames is not None:
remaining_frames -= n
-
+
if not discard:
# Assemble the chunks into contiguous arrays
# for each value (coord, box, time)
@@ -707,4 +693,4 @@ def _read_chunk_wise(file, n_frames, step, atom_i, chunk_size,
result[i] = None
return tuple(result)
else:
- return None
\ No newline at end of file
+ return None
diff --git a/src/biotite/structure/io/trr/__init__.py b/src/biotite/structure/io/trr/__init__.py
index cf2f0510d..c7ed3f8d9 100644
--- a/src/biotite/structure/io/trr/__init__.py
+++ b/src/biotite/structure/io/trr/__init__.py
@@ -10,4 +10,4 @@
__name__ = "biotite.structure.io.trr"
__author__ = "Patrick Kunzmann"
-from .file import *
\ No newline at end of file
+from .file import *
diff --git a/src/biotite/structure/io/trr/file.py b/src/biotite/structure/io/trr/file.py
index 435fd6f7a..4aecb472a 100644
--- a/src/biotite/structure/io/trr/file.py
+++ b/src/biotite/structure/io/trr/file.py
@@ -7,19 +7,20 @@
__all__ = ["TRRFile"]
import numpy as np
-from ..trajfile import TrajectoryFile
+from biotite.structure.io.trajfile import TrajectoryFile
class TRRFile(TrajectoryFile):
"""
This file class represents a TRR trajectory file.
"""
-
+
@classmethod
def traj_type(cls):
import mdtraj.formats as traj
+
return traj.TRRTrajectoryFile
-
+
@classmethod
def process_read_values(cls, read_values):
# nm to Angstrom
@@ -29,18 +30,15 @@ def process_read_values(cls, read_values):
box *= 10
time = read_values[1]
return coord, box, time
-
+
@classmethod
def prepare_write_values(cls, coord, box, time):
# Angstrom to nm
- xyz = np.divide(coord, 10, dtype=np.float32) \
- if coord is not None else None
- time = time.astype(np.float32, copy=False) \
- if time is not None else None
- box = np.divide(box, 10, dtype=np.float32) \
- if box is not None else None
+ xyz = np.divide(coord, 10, dtype=np.float32) if coord is not None else None
+ time = time.astype(np.float32, copy=False) if time is not None else None
+ box = np.divide(box, 10, dtype=np.float32) if box is not None else None
return {
- "xyz" : xyz,
- "box" : box,
- "time" : time,
+ "xyz": xyz,
+ "box": box,
+ "time": time,
}
diff --git a/src/biotite/structure/io/xtc/__init__.py b/src/biotite/structure/io/xtc/__init__.py
index 5803ef784..5fe71216e 100644
--- a/src/biotite/structure/io/xtc/__init__.py
+++ b/src/biotite/structure/io/xtc/__init__.py
@@ -10,4 +10,4 @@
__name__ = "biotite.structure.io.xtc"
__author__ = "Patrick Kunzmann"
-from .file import *
\ No newline at end of file
+from .file import *
diff --git a/src/biotite/structure/io/xtc/file.py b/src/biotite/structure/io/xtc/file.py
index 62d9a977f..0664c49f1 100644
--- a/src/biotite/structure/io/xtc/file.py
+++ b/src/biotite/structure/io/xtc/file.py
@@ -7,17 +7,18 @@
__all__ = ["XTCFile"]
import numpy as np
-from ..trajfile import TrajectoryFile
+from biotite.structure.io.trajfile import TrajectoryFile
class XTCFile(TrajectoryFile):
"""
This file class represents a XTC trajectory file.
"""
-
+
@classmethod
def traj_type(cls):
import mdtraj.formats as traj
+
return traj.XTCTrajectoryFile
@classmethod
@@ -29,18 +30,15 @@ def process_read_values(cls, read_values):
box *= 10
time = read_values[1]
return coord, box, time
-
+
@classmethod
def prepare_write_values(cls, coord, box, time):
# Angstrom to nm
- xyz = np.divide(coord, 10, dtype=np.float32) \
- if coord is not None else None
- time = time.astype(np.float32, copy=False) \
- if time is not None else None
- box = np.divide(box, 10, dtype=np.float32) \
- if box is not None else None
+ xyz = np.divide(coord, 10, dtype=np.float32) if coord is not None else None
+ time = time.astype(np.float32, copy=False) if time is not None else None
+ box = np.divide(box, 10, dtype=np.float32) if box is not None else None
return {
- "xyz" : xyz,
- "box" : box,
- "time" : time,
+ "xyz": xyz,
+ "box": box,
+ "time": time,
}
diff --git a/src/biotite/structure/mechanics.py b/src/biotite/structure/mechanics.py
index d79e23908..6e6ffedcb 100644
--- a/src/biotite/structure/mechanics.py
+++ b/src/biotite/structure/mechanics.py
@@ -12,17 +12,14 @@
__all__ = ["mass_center", "gyration_radius"]
import numpy as np
-from .atoms import Atom, AtomArray, AtomArrayStack, coord
-from .util import vector_dot, norm_vector
-from .error import BadStructureError
-from .geometry import distance
-from .info.masses import mass
+from biotite.structure.geometry import distance
+from biotite.structure.info.masses import mass
def gyration_radius(array, masses=None):
"""
Compute the radius/radii of gyration of an atom array or stack.
-
+
Parameters
----------
array : AtomArray or AtomArrayStack
@@ -33,7 +30,7 @@ def gyration_radius(array, masses=None):
Must have the same length as `array`. By default, the standard
atomic mass for each element is taken.
-
+
Returns
-------
masses : float or ndarray, dtype=float
@@ -46,13 +43,14 @@ def gyration_radius(array, masses=None):
masses = np.array([mass(element) for element in array.element])
center = mass_center(array, masses)
radii = distance(array, center[..., np.newaxis, :])
- inertia_moment = np.sum(masses * radii*radii, axis=-1)
+ inertia_moment = np.sum(masses * radii * radii, axis=-1)
return np.sqrt(inertia_moment / np.sum(masses))
+
def mass_center(array, masses=None):
"""
Calculate the center(s) of mass of an atom array or stack.
-
+
Parameters
----------
array : AtomArray or AtomArrayStack
@@ -61,7 +59,7 @@ def mass_center(array, masses=None):
The masses to use for each atom in the input `array`.
Must have the same length as `array`. By default, the standard
atomic mass for each element is taken.
-
+
Returns
-------
radius : ndarray, ndarray, dtype=float
@@ -72,4 +70,4 @@ def mass_center(array, masses=None):
"""
if masses is None:
masses = np.array([mass(element) for element in array.element])
- return np.sum(masses[:,np.newaxis] * array.coord, axis=-2) / np.sum(masses)
\ No newline at end of file
+ return np.sum(masses[:, np.newaxis] * array.coord, axis=-2) / np.sum(masses)
diff --git a/src/biotite/structure/molecules.py b/src/biotite/structure/molecules.py
index d40920b18..f20a5a1b6 100644
--- a/src/biotite/structure/molecules.py
+++ b/src/biotite/structure/molecules.py
@@ -12,8 +12,8 @@
__all__ = ["get_molecule_indices", "get_molecule_masks", "molecule_iter"]
import numpy as np
-from .atoms import AtomArray, AtomArrayStack
-from .bonds import BondList, find_connected
+from biotite.structure.atoms import AtomArray, AtomArrayStack
+from biotite.structure.bonds import BondList, find_connected
def get_molecule_indices(array):
@@ -244,8 +244,7 @@ def get_molecule_masks(array):
molecule_indices = get_molecule_indices(bonds)
molecule_masks = np.zeros(
- (len(molecule_indices), bonds.get_atom_count()),
- dtype=bool
+ (len(molecule_indices), bonds.get_atom_count()), dtype=bool
)
for i in range(len(molecule_indices)):
molecule_masks[i, molecule_indices[i]] = True
diff --git a/src/biotite/structure/pseudoknots.py b/src/biotite/structure/pseudoknots.py
index 36a877a84..2a065f16b 100644
--- a/src/biotite/structure/pseudoknots.py
+++ b/src/biotite/structure/pseudoknots.py
@@ -10,9 +10,10 @@
__author__ = "Tom David Müller"
__all__ = ["pseudoknots"]
-import numpy as np
-import networkx as nx
from itertools import chain, product
+import networkx as nx
+import numpy as np
+
def pseudoknots(base_pairs, scores=None, max_pseudoknot_order=None):
"""
@@ -118,7 +119,7 @@ def pseudoknots(base_pairs, scores=None, max_pseudoknot_order=None):
return np.array([[]], dtype=np.int32)
# List containing the results
- results = [np.full(len(base_pairs), -1, dtype='int32')]
+ results = [np.full(len(base_pairs), -1, dtype="int32")]
# if no score array is given, each base pairs' score is one
if scores is None:
@@ -126,9 +127,7 @@ def pseudoknots(base_pairs, scores=None, max_pseudoknot_order=None):
# Make sure `base_pairs` has the same length as the score array
if len(base_pairs) != len(scores):
- raise ValueError(
- "'base_pair' and 'scores' must have the same shape"
- )
+ raise ValueError("'base_pair' and 'scores' must have the same shape")
# Split the base pairs in regions
regions = _find_regions(base_pairs, scores)
@@ -139,7 +138,7 @@ def pseudoknots(base_pairs, scores=None, max_pseudoknot_order=None):
return np.vstack(results)
-class _Region():
+class _Region:
"""
This class represents a paired region.
@@ -159,7 +158,7 @@ class _Region():
The score for each base pair.
"""
- def __init__ (self, base_pairs, region_pairs, scores):
+ def __init__(self, base_pairs, region_pairs, scores):
# The Start and Stop indices for each Region
self.start = np.min(base_pairs[region_pairs])
self.stop = np.max(base_pairs[region_pairs])
@@ -245,19 +244,18 @@ def _find_regions(base_pairs, scores):
# Check if the current base pair belongs to the region that is
# currently being defined
- previous_upstream_rank = rank[i-1, 0]
+ previous_upstream_rank = rank[i - 1, 0]
this_upstream_rank = rank[i, 0]
- previous_downstream_rank = rank[i-1, 1]
+ previous_downstream_rank = rank[i - 1, 1]
this_downstream_rank = rank[i, 1]
# if the current base pair belongs to a new region, save the
# current region and start a new region
- if ((previous_downstream_rank - this_downstream_rank) != 1 or
- (this_upstream_rank - previous_upstream_rank) != 1):
- regions.add(
- _Region(base_pairs, np.array(region_pairs), scores)
- )
- region_pairs = []
+ if (previous_downstream_rank - this_downstream_rank) != 1 or (
+ this_upstream_rank - previous_upstream_rank
+ ) != 1:
+ regions.add(_Region(base_pairs, np.array(region_pairs), scores))
+ region_pairs = []
# Append the current base pair to the region
region_pairs.append(original_indices[i])
@@ -296,7 +294,7 @@ def _generate_graphical_representation(regions):
# Get the region array and a boolean array, where the start of each
# region is ``True``.
region_array, (start_stops,) = _get_region_array_for(
- regions, content=[lambda a : [True, False]], dtype=['bool']
+ regions, content=[lambda a: [True, False]], dtype=["bool"]
)
# Check each region for conflicts with other regions
@@ -307,15 +305,15 @@ def _generate_graphical_representation(regions):
# Find the index of the stopping of the region in the region
# array
- stop = _get_first_occurrence_for(region_array[start+1:], region)
- stop += (start + 1)
+ stop = _get_first_occurrence_for(region_array[start + 1 :], region)
+ stop += start + 1
# Store regions the current region conflicts with
conflicts = set()
# Iterate over the regions between the starting and stopping
# point of the current region
- for other_region in region_array[start+1:stop]:
+ for other_region in region_array[start + 1 : stop]:
# If the other region is not already a conflict, add it to
# the conflict set
if other_region not in conflicts:
@@ -389,17 +387,17 @@ def _get_region_array_for(regions, content=[], dtype=[]):
The custom output.
"""
# region_array and index array
- region_array = np.empty(len(regions)*2, dtype=_Region)
- index_array = np.empty(len(regions)*2, dtype='int32')
+ region_array = np.empty(len(regions) * 2, dtype=_Region)
+ index_array = np.empty(len(regions) * 2, dtype="int32")
# Content array for custom return arrays
- content_list = [None]*len(content)
+ content_list = [None] * len(content)
for i in range(len(content)):
- content_list[i] = np.empty(len(regions)*2, dtype=dtype[i])
+ content_list[i] = np.empty(len(regions) * 2, dtype=dtype[i])
# Fill the arrays
for i, reg in enumerate(regions):
- indices = [2*i, 2*i+1]
+ indices = [2 * i, 2 * i + 1]
region_array[indices] = reg
for c in range(len(content_list)):
content_list[c][indices] = content[c](reg)
@@ -443,8 +441,8 @@ def _remove_pseudoknots(regions):
represented as ``set`` of unknotted regions.
"""
# Create dynamic programming matrix
- dp_matrix_shape = len(regions)*2, len(regions)*2
- dp_matrix = np.empty(dp_matrix_shape, dtype='object')
+ dp_matrix_shape = len(regions) * 2, len(regions) * 2
+ dp_matrix = np.empty(dp_matrix_shape, dtype="object")
dp_matrix_solutions_starts = np.zeros_like(dp_matrix)
dp_matrix_solutions_stops = np.zeros_like(dp_matrix)
@@ -452,9 +450,7 @@ def _remove_pseudoknots(regions):
# ``region_array`` contains the region objects and ``start_stops``
# contains the lowest and highest positions of the regions
region_array, (start_stops,) = _get_region_array_for(
- regions,
- [lambda a : (a.start, a.stop)],
- ['int32']
+ regions, [lambda a: (a.start, a.stop)], ["int32"]
)
# Initialise the matrix diagonal with ndarrays of empty frozensets
for i in range(len(dp_matrix)):
@@ -462,11 +458,11 @@ def _remove_pseudoknots(regions):
# Iterate through the top right half of the dynamic programming
# matrix
- for j in range(len(regions)*2):
- for i in range(j-1, -1, -1):
+ for j in range(len(regions) * 2):
+ for i in range(j - 1, -1, -1):
solution_candidates = set()
- left = dp_matrix[i, j-1]
- bottom = dp_matrix[i+1, j]
+ left = dp_matrix[i, j - 1]
+ bottom = dp_matrix[i + 1, j]
# Add all solutions of the cell to the left
for solution in left:
@@ -474,24 +470,21 @@ def _remove_pseudoknots(regions):
# Add all solutions of the cell to the bottom
for solution in bottom:
- solution_candidates.add(solution)
+ solution_candidates.add(solution)
# Check if i and j are start/end-points of the same region
if region_array[i] is region_array[j]:
-
# Add all solutions from the cell to the bottom left
# plus this region
- bottom_left = dp_matrix[i+1, j-1]
+ bottom_left = dp_matrix[i + 1, j - 1]
for solution in bottom_left:
solution_candidates.add(solution | set([region_array[i]]))
# Perform additional tests if solution in the left cell and
# bottom cell both differ from an empty solution
- if (np.any(left != [frozenset()]) and
- np.any(bottom != [frozenset()])):
-
- left_highest = dp_matrix_solutions_stops[i, j-1]
- bottom_lowest = dp_matrix_solutions_starts[i+1, j]
+ if np.any(left != [frozenset()]) and np.any(bottom != [frozenset()]):
+ left_highest = dp_matrix_solutions_stops[i, j - 1]
+ bottom_lowest = dp_matrix_solutions_starts[i + 1, j]
# For each pair of solutions check if solutions are
# disjoint
@@ -504,11 +497,11 @@ def _remove_pseudoknots(regions):
# Both solutions are not disjoint
# Add subsolutions
for k in range(
- np.where(start_stops==lowest)[0][0]-1,
- np.where(start_stops==highest)[0][0]+1
+ np.where(start_stops == lowest)[0][0] - 1,
+ np.where(start_stops == highest)[0][0] + 1,
):
cell1 = dp_matrix[i, k]
- cell2 = dp_matrix[k+1, j]
+ cell2 = dp_matrix[k + 1, j]
for subsolution1 in cell1:
for subsolution2 in cell2:
solution_candidates.add(
@@ -536,16 +529,12 @@ def _remove_pseudoknots(regions):
# Add the solutions to the dynamic programming matrix
dp_matrix[i, j] = solution_candidates
- solution_starts = np.zeros_like(solution_candidates, dtype='int32')
- solution_stops = np.zeros_like(solution_candidates, dtype='int32')
+ solution_starts = np.zeros_like(solution_candidates, dtype="int32")
+ solution_stops = np.zeros_like(solution_candidates, dtype="int32")
for s, solution in enumerate(solution_candidates):
- solution_starts[s] = min(
- [reg.start for reg in solution], default=-1
- )
- solution_stops[s] = max(
- [reg.stop for reg in solution], default=-1
- )
+ solution_starts[s] = min([reg.start for reg in solution], default=-1)
+ solution_stops[s] = max([reg.stop for reg in solution], default=-1)
dp_matrix_solutions_starts[i, j] = solution_starts
dp_matrix_solutions_stops[i, j] = solution_stops
@@ -586,14 +575,11 @@ def _get_results(regions, results, max_pseudoknot_order, order=0):
# Non-conflicting regions are of the current order:
index_list_non_conflicting = list(
- chain(
- *[region.get_index_array() for region in non_conflicting]
- )
- )
+ chain(*[region.get_index_array() for region in non_conflicting])
+ )
for result in results:
result[index_list_non_conflicting] = order
-
# If no conflicts remain, the results are complete
if len(regions) == 0:
return results
@@ -601,9 +587,10 @@ def _get_results(regions, results, max_pseudoknot_order, order=0):
# Get the optimal solutions for given regions. Evaluate each clique
# of mutually conflicting regions seperately
cliques = [component for component in nx.connected_components(regions)]
- solutions = [set(chain(*e)) for e in product(
- *[_remove_pseudoknots(clique) for clique in cliques]
- )]
+ solutions = [
+ set(chain(*e))
+ for e in product(*[_remove_pseudoknots(clique) for clique in cliques])
+ ]
# Get a copy of the current results for each optimal solution
results_list = [
@@ -612,16 +599,13 @@ def _get_results(regions, results, max_pseudoknot_order, order=0):
# Evaluate each optimal solution
for i, solution in enumerate(solutions):
-
# Get the pseudoknotted regions
pseudoknotted_regions = regions.copy()
pseudoknotted_regions.remove_nodes_from(solution)
# Get an index list of the unknotted base pairs
index_list_unknotted = list(
- chain(
- *[region.get_index_array() for region in solution]
- )
+ chain(*[region.get_index_array() for region in solution])
)
# Write results for current solution
@@ -634,8 +618,10 @@ def _get_results(regions, results, max_pseudoknot_order, order=0):
# Evaluate the pseudoknotted region
results_list[i] = _get_results(
- pseudoknotted_regions, results_list[i],
- max_pseudoknot_order, order=order+1
+ pseudoknotted_regions,
+ results_list[i],
+ max_pseudoknot_order,
+ order=order + 1,
)
# Flatten the results
diff --git a/src/biotite/structure/rdf.py b/src/biotite/structure/rdf.py
index 563cd0ae3..448a81ffa 100644
--- a/src/biotite/structure/rdf.py
+++ b/src/biotite/structure/rdf.py
@@ -12,15 +12,16 @@
from numbers import Integral
import numpy as np
-from .atoms import Atom, AtomArray, stack, array, coord, AtomArrayStack
-from .box import box_volume
-from .geometry import displacement
-from .util import vector_dot
-from .celllist import CellList
+from biotite.structure.atoms import AtomArray, coord, stack
+from biotite.structure.box import box_volume
+from biotite.structure.celllist import CellList
+from biotite.structure.geometry import displacement
+from biotite.structure.util import vector_dot
-def rdf(center, atoms, selection=None, interval=(0, 10), bins=100, box=None,
- periodic=False):
+def rdf(
+ center, atoms, selection=None, interval=(0, 10), bins=100, box=None, periodic=False
+):
r"""
Compute the radial distribution function *g(r)* (RDF) for one or
multiple given central positions based on a given system of
@@ -155,7 +156,7 @@ def rdf(center, atoms, selection=None, interval=(0, 10), bins=100, box=None,
Find the radius for the first solvation shell.
In this simple case, the density peak is identified by finding
the maximum of the function.
-
+
>>> peak_position = np.argmax(g_r)
>>> print(f"{bins[peak_position]/10:.2f} nm")
0.29 nm
@@ -165,9 +166,9 @@ def rdf(center, atoms, selection=None, interval=(0, 10), bins=100, box=None,
atoms = stack([atoms])
if selection is not None:
atoms = atoms[..., selection]
-
+
atom_coord = atoms.coord
-
+
if box is None:
if atoms.box is None:
raise ValueError("A box must be supplied")
@@ -175,17 +176,15 @@ def rdf(center, atoms, selection=None, interval=(0, 10), bins=100, box=None,
box = atoms.box
elif box.ndim == 2 and atoms.stack_depth() == 1:
box = box[np.newaxis, :, :]
-
+
center = coord(center)
if center.ndim == 1:
center = center.reshape((1, 1) + center.shape)
elif center.ndim == 2:
center = center.reshape((1,) + center.shape)
-
+
if box.shape[0] != center.shape[0] or box.shape[0] != atom_coord.shape[0]:
- raise ValueError(
- "Center, box, and atoms must have the same model count"
- )
+ raise ValueError("Center, box, and atoms must have the same model count")
# Calculate distance histogram
edges = _calculate_edges(interval, bins)
@@ -209,17 +208,20 @@ def rdf(center, atoms, selection=None, interval=(0, 10), bins=100, box=None,
for j in range(center.shape[1]):
dist_box = box[i] if periodic else None
# Calculate squared distances
- disp.append(displacement(
- center[i,j], atom_coord[i, near_atom_mask[j]], box=dist_box
- ))
+ disp.append(
+ displacement(
+ center[i, j], atom_coord[i, near_atom_mask[j]], box=dist_box
+ )
+ )
# Make one array from multiple arrays with different length
disp = np.concatenate(disp)
sq_distances = vector_dot(disp, disp)
hist, _ = np.histogram(sq_distances, bins=sq_edges)
# Normalize with average particle density (N/V) in each bin
- bin_volume = (4 / 3 * np.pi * np.power(edges[1: ], 3)) \
- - (4 / 3 * np.pi * np.power(edges[:-1], 3))
+ bin_volume = (4 / 3 * np.pi * np.power(edges[1:], 3)) - (
+ 4 / 3 * np.pi * np.power(edges[:-1], 3)
+ )
n_frames = len(atoms)
volume = box_volume(box).mean()
density = atoms.array_length() / volume
@@ -237,7 +239,7 @@ def _calculate_edges(interval, bins):
if isinstance(bins, Integral):
if bins < 1:
raise ValueError("At least one bin is required")
- return np.linspace(*interval, bins+1)
+ return np.linspace(*interval, bins + 1)
else:
# 'bins' contains edges
return np.array(bins, dtype=float)
diff --git a/src/biotite/structure/repair.py b/src/biotite/structure/repair.py
index abc7a96e5..2a567ea4a 100644
--- a/src/biotite/structure/repair.py
+++ b/src/biotite/structure/repair.py
@@ -10,12 +10,12 @@
__author__ = "Patrick Kunzmann, Daniel Bauer"
__all__ = ["create_continuous_res_ids", "infer_elements", "create_atom_names"]
-from collections import Counter
import warnings
+from collections import Counter
import numpy as np
-from .atoms import AtomArray, AtomArrayStack
-from .residues import get_residue_starts
-from .chains import get_chain_starts
+from biotite.structure.atoms import AtomArray, AtomArrayStack
+from biotite.structure.chains import get_chain_starts
+from biotite.structure.residues import get_residue_starts
def create_continuous_res_ids(atoms, restart_each_chain=True):
@@ -151,18 +151,131 @@ def create_atom_names(atoms):
return atom_names
-_elements = [elem.upper() for elem in
-["H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg",
-"Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe",
-"Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y",
-"Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te",
-"I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb",
-"Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt",
-"Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa",
-"U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf",
-"Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Nh", "Fl", "Mc", "Lv", "Ts",
-"Og"]
+_elements = [
+ elem.upper()
+ for elem in [
+ "H",
+ "He",
+ "Li",
+ "Be",
+ "B",
+ "C",
+ "N",
+ "O",
+ "F",
+ "Ne",
+ "Na",
+ "Mg",
+ "Al",
+ "Si",
+ "P",
+ "S",
+ "Cl",
+ "Ar",
+ "K",
+ "Ca",
+ "Sc",
+ "Ti",
+ "V",
+ "Cr",
+ "Mn",
+ "Fe",
+ "Co",
+ "Ni",
+ "Cu",
+ "Zn",
+ "Ga",
+ "Ge",
+ "As",
+ "Se",
+ "Br",
+ "Kr",
+ "Rb",
+ "Sr",
+ "Y",
+ "Zr",
+ "Nb",
+ "Mo",
+ "Tc",
+ "Ru",
+ "Rh",
+ "Pd",
+ "Ag",
+ "Cd",
+ "In",
+ "Sn",
+ "Sb",
+ "Te",
+ "I",
+ "Xe",
+ "Cs",
+ "Ba",
+ "La",
+ "Ce",
+ "Pr",
+ "Nd",
+ "Pm",
+ "Sm",
+ "Eu",
+ "Gd",
+ "Tb",
+ "Dy",
+ "Ho",
+ "Er",
+ "Tm",
+ "Yb",
+ "Lu",
+ "Hf",
+ "Ta",
+ "W",
+ "Re",
+ "Os",
+ "Ir",
+ "Pt",
+ "Au",
+ "Hg",
+ "Tl",
+ "Pb",
+ "Bi",
+ "Po",
+ "At",
+ "Rn",
+ "Fr",
+ "Ra",
+ "Ac",
+ "Th",
+ "Pa",
+ "U",
+ "Np",
+ "Pu",
+ "Am",
+ "Cm",
+ "Bk",
+ "Cf",
+ "Es",
+ "Fm",
+ "Md",
+ "No",
+ "Lr",
+ "Rf",
+ "Db",
+ "Sg",
+ "Bh",
+ "Hs",
+ "Mt",
+ "Ds",
+ "Rg",
+ "Cn",
+ "Nh",
+ "Fl",
+ "Mc",
+ "Lv",
+ "Ts",
+ "Og",
+ ]
]
+
+
def _guess_element(atom_name):
# remove digits (1H -> H)
elem = "".join([i for i in atom_name if not i.isdigit()])
@@ -171,9 +284,13 @@ def _guess_element(atom_name):
return ""
# Some often used elements for biomolecules
- if elem.startswith("C") or elem.startswith("N") or \
- elem.startswith("O") or elem.startswith("S") or \
- elem.startswith("H"):
+ if (
+ elem.startswith("C")
+ or elem.startswith("N")
+ or elem.startswith("O")
+ or elem.startswith("S")
+ or elem.startswith("H")
+ ):
return elem[0]
# Exactly match element abbreviations
@@ -184,4 +301,4 @@ def _guess_element(atom_name):
return _elements[_elements.index(elem[0])]
except ValueError:
warnings.warn(f"Could not infer element for '{atom_name}'")
- return ""
\ No newline at end of file
+ return ""
diff --git a/src/biotite/structure/residues.py b/src/biotite/structure/residues.py
index a32a79f18..61ae1712a 100644
--- a/src/biotite/structure/residues.py
+++ b/src/biotite/structure/residues.py
@@ -9,14 +9,27 @@
__name__ = "biotite.structure"
__author__ = "Patrick Kunzmann"
-__all__ = ["get_residue_starts", "apply_residue_wise", "spread_residue_wise",
- "get_residue_masks", "get_residue_starts_for",
- "get_residue_positions", "get_residues", "get_residue_count",
- "residue_iter"]
+__all__ = [
+ "get_residue_starts",
+ "apply_residue_wise",
+ "spread_residue_wise",
+ "get_residue_masks",
+ "get_residue_starts_for",
+ "get_residue_positions",
+ "get_residues",
+ "get_residue_count",
+ "residue_iter",
+]
import numpy as np
-from .atoms import AtomArray, AtomArrayStack
-from .resutil import *
+from biotite.structure.segments import (
+ apply_segment_wise,
+ get_segment_masks,
+ get_segment_positions,
+ get_segment_starts_for,
+ segment_iter,
+ spread_segment_wise,
+)
def get_residue_starts(array, add_exclusive_stop=False):
@@ -57,23 +70,20 @@ def get_residue_starts(array, add_exclusive_stop=False):
278 292 304]
"""
# These mask are 'true' at indices where the value changes
- chain_id_changes = (array.chain_id[1:] != array.chain_id[:-1])
- res_id_changes = (array.res_id[1:] != array.res_id[:-1] )
- ins_code_changes = (array.ins_code[1:] != array.ins_code[:-1])
- res_name_changes = (array.res_name[1:] != array.res_name[:-1])
+ chain_id_changes = array.chain_id[1:] != array.chain_id[:-1]
+ res_id_changes = array.res_id[1:] != array.res_id[:-1]
+ ins_code_changes = array.ins_code[1:] != array.ins_code[:-1]
+ res_name_changes = array.res_name[1:] != array.res_name[:-1]
# If any of these annotation arrays change, a new residue starts
residue_change_mask = (
- chain_id_changes |
- res_id_changes |
- ins_code_changes |
- res_name_changes
+ chain_id_changes | res_id_changes | ins_code_changes | res_name_changes
)
# Convert mask to indices
# Add 1, to shift the indices from the end of a residue
# to the start of a new residue
- residue_starts = np.where(residue_change_mask)[0] +1
+ residue_starts = np.where(residue_change_mask)[0] + 1
# The first residue is not included yet -> Insert '[0]'
if add_exclusive_stop:
diff --git a/src/biotite/structure/resutil.py b/src/biotite/structure/segments.py
similarity index 83%
rename from src/biotite/structure/resutil.py
rename to src/biotite/structure/segments.py
index 64c5339e1..5841346b3 100644
--- a/src/biotite/structure/resutil.py
+++ b/src/biotite/structure/segments.py
@@ -4,8 +4,14 @@
__name__ = "biotite.structure"
__author__ = "Patrick Kunzmann"
-__all__ = ["apply_segment_wise", "spread_segment_wise", "get_segment_masks",
- "get_segment_starts_for", "get_segment_positions", "segment_iter"]
+__all__ = [
+ "apply_segment_wise",
+ "spread_segment_wise",
+ "get_segment_masks",
+ "get_segment_starts_for",
+ "get_segment_positions",
+ "segment_iter",
+]
import numpy as np
@@ -24,9 +30,9 @@ def apply_segment_wise(starts, data, function, axis):
"""
# The result array
processed_data = None
- for i in range(len(starts)-1):
- segment = data[starts[i]:starts[i+1]]
- if axis == None:
+ for i in range(len(starts) - 1):
+ segment = data[starts[i] : starts[i + 1]]
+ if axis is None:
value = function(segment)
else:
value = function(segment, axis=axis)
@@ -39,13 +45,11 @@ def apply_segment_wise(starts, data, function, axis):
# is length of segment of size 1 -> length of all IDs
# (equal to atom array length)
processed_data = np.zeros(
- (len(starts)-1,) + value.shape, dtype=value.dtype
+ (len(starts) - 1,) + value.shape, dtype=value.dtype
)
else:
# Scalar value -> one dimensional result array
- processed_data = np.zeros(
- len(starts)-1, dtype=type(value)
- )
+ processed_data = np.zeros(len(starts) - 1, dtype=type(value))
# Write values into result arrays
processed_data[i] = value
return processed_data
@@ -64,7 +68,7 @@ def spread_segment_wise(starts, input_data):
atom array.
"""
output_data = np.zeros(starts[-1], dtype=input_data.dtype)
- for i in range(len(starts)-1):
+ for i in range(len(starts) - 1):
start = starts[i]
stop = starts[i + 1]
output_data[start:stop] = input_data[i]
@@ -92,14 +96,13 @@ def get_segment_masks(starts, indices):
if (indices >= length).any():
index = np.min(np.where(indices >= length)[0])
raise ValueError(
- f"Index {index} is out of range for "
- f"an atom array with length {length}"
+ f"Index {index} is out of range for " f"an atom array with length {length}"
)
-
+
insertion_points = np.searchsorted(starts, indices, side="right") - 1
for i, point in enumerate(insertion_points):
- masks[i, starts[point] : starts[point+1]] = True
-
+ masks[i, starts[point] : starts[point + 1]] = True
+
return masks
@@ -125,10 +128,9 @@ def get_segment_starts_for(starts, indices):
if (indices >= length).any():
index = np.min(np.where(indices >= length)[0])
raise ValueError(
- f"Index {index} is out of range for "
- f"an atom array with length {length}"
+ f"Index {index} is out of range for " f"an atom array with length {length}"
)
-
+
insertion_points = np.searchsorted(starts, indices, side="right") - 1
return starts[insertion_points]
@@ -155,10 +157,9 @@ def get_segment_positions(starts, indices):
if (indices >= length).any():
index = np.min(np.where(indices >= length)[0])
raise ValueError(
- f"Index {index} is out of range for "
- f"an atom array with length {length}"
+ f"Index {index} is out of range for " f"an atom array with length {length}"
)
-
+
return np.searchsorted(starts, indices, side="right") - 1
@@ -174,5 +175,5 @@ def segment_iter(array, starts):
Includes exclusive stop, i.e. the length of the corresponding
atom array.
"""
- for i in range(len(starts)-1):
- yield array[..., starts[i] : starts[i+1]]
+ for i in range(len(starts) - 1):
+ yield array[..., starts[i] : starts[i + 1]]
diff --git a/src/biotite/structure/sequence.py b/src/biotite/structure/sequence.py
index 0cad79b73..a0538e314 100644
--- a/src/biotite/structure/sequence.py
+++ b/src/biotite/structure/sequence.py
@@ -11,13 +11,12 @@
__all__ = ["to_sequence"]
import numpy as np
-from .info.misc import one_letter_code
-from .info.groups import amino_acid_names, nucleotide_names
-from .residues import get_residues
-from .chains import get_chain_starts
-from .error import BadStructureError
-from ..sequence.seqtypes import ProteinSequence, NucleotideSequence
-
+from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence
+from biotite.structure.chains import get_chain_starts
+from biotite.structure.error import BadStructureError
+from biotite.structure.info.groups import amino_acid_names, nucleotide_names
+from biotite.structure.info.misc import one_letter_code
+from biotite.structure.residues import get_residues
HETERO_PLACEHOLDER = "."
@@ -63,9 +62,9 @@ def to_sequence(atoms, allow_hetero=False):
"""
sequences = []
chain_start_indices = get_chain_starts(atoms, add_exclusive_stop=True)
- for i in range(len(chain_start_indices)-1):
+ for i in range(len(chain_start_indices) - 1):
start = chain_start_indices[i]
- stop = chain_start_indices[i+1]
+ stop = chain_start_indices[i + 1]
chain = atoms[start:stop]
_, residues = get_residues(chain)
one_letter_symbols = np.array(
@@ -73,7 +72,7 @@ def to_sequence(atoms, allow_hetero=False):
)
hetero_mask = one_letter_symbols == HETERO_PLACEHOLDER
- aa_count = np.count_nonzero(np.isin(residues, amino_acid_names()))
+ aa_count = np.count_nonzero(np.isin(residues, amino_acid_names()))
nuc_count = np.count_nonzero(np.isin(residues, nucleotide_names()))
if aa_count == 0 and nuc_count == 0:
raise BadStructureError(
@@ -109,4 +108,4 @@ def to_sequence(atoms, allow_hetero=False):
sequences.append(NucleotideSequence("".join(one_letter_symbols)))
# Remove exclusive stop
- return sequences, chain_start_indices[:-1]
\ No newline at end of file
+ return sequences, chain_start_indices[:-1]
diff --git a/src/biotite/structure/sse.py b/src/biotite/structure/sse.py
index 0d870071c..ee505716c 100644
--- a/src/biotite/structure/sse.py
+++ b/src/biotite/structure/sse.py
@@ -12,25 +12,23 @@
__all__ = ["annotate_sse"]
import numpy as np
-from .celllist import CellList
-from .geometry import distance, angle, dihedral
-from .filter import filter_amino_acids
-from .residues import get_residue_starts
-from .integrity import check_res_id_continuity
+from biotite.structure.celllist import CellList
+from biotite.structure.filter import filter_amino_acids
+from biotite.structure.geometry import angle, dihedral, distance
+from biotite.structure.integrity import check_res_id_continuity
+from biotite.structure.residues import get_residue_starts
+_r_helix = (np.deg2rad(89 - 12), np.deg2rad(89 + 12))
+_a_helix = (np.deg2rad(50 - 20), np.deg2rad(50 + 20))
+_d2_helix = ((5.5 - 0.5), (5.5 + 0.5)) # Not used in the algorithm description
+_d3_helix = ((5.3 - 0.5), (5.3 + 0.5))
+_d4_helix = ((6.4 - 0.6), (6.4 + 0.6))
-_r_helix = (np.deg2rad(89-12), np.deg2rad(89+12))
-_a_helix = (np.deg2rad(50-20), np.deg2rad(50+20))
-_d2_helix = ((5.5-0.5), (5.5+0.5)) # Not used in the algorithm description
-_d3_helix = ((5.3-0.5), (5.3+0.5))
-_d4_helix = ((6.4-0.6), (6.4+0.6))
-
-_r_strand = (np.deg2rad(124-14), np.deg2rad(124+14))
-_a_strand = (np.deg2rad(-180), np.deg2rad(-125),
- np.deg2rad(145), np.deg2rad(180))
-_d2_strand = ((6.7-0.6), (6.7+0.6))
-_d3_strand = ((9.9-0.9), (9.9+0.9))
-_d4_strand = ((12.4-1.1), (12.4+1.1))
+_r_strand = (np.deg2rad(124 - 14), np.deg2rad(124 + 14))
+_a_strand = (np.deg2rad(-180), np.deg2rad(-125), np.deg2rad(145), np.deg2rad(180))
+_d2_strand = ((6.7 - 0.6), (6.7 + 0.6))
+_d3_strand = ((9.9 - 0.9), (9.9 + 0.9))
+_d4_strand = ((12.4 - 1.1), (12.4 + 1.1))
def annotate_sse(atom_array):
@@ -93,9 +91,9 @@ def annotate_sse(atom_array):
ca_indices = np.where(
filter_amino_acids(atom_array) & (atom_array.atom_name == "CA")
)[0]
- ca_coord[
- np.searchsorted(residue_starts, ca_indices, "right") - 1
- ] = atom_array.coord[ca_indices]
+ ca_coord[np.searchsorted(residue_starts, ca_indices, "right") - 1] = (
+ atom_array.coord[ca_indices]
+ )
if len(ca_coord) <= 5:
# The number of atoms is too small #
@@ -112,12 +110,12 @@ def annotate_sse(atom_array):
# purpose of geometric measurements
# -> the distances/angles spanning discontinuities are NaN
discont_indices = check_res_id_continuity(atom_array)
- discont_res_indices = np.searchsorted(
- residue_starts, discont_indices, "right"
- ) - 1
+ discont_res_indices = np.searchsorted(residue_starts, discont_indices, "right") - 1
ca_coord = np.insert(
- ca_coord, discont_res_indices,
- np.full((len(discont_res_indices),3), np.nan), axis=0
+ ca_coord,
+ discont_res_indices,
+ np.full((len(discont_res_indices), 3), np.nan),
+ axis=0,
)
# Later the SSE for virtual residues are removed again
# via this mask
@@ -126,60 +124,62 @@ def annotate_sse(atom_array):
length = len(ca_coord)
-
# The distances and angles are not defined for the entire interval,
# therefore the indices do not have the full range
# Values that are not defined are NaN
d2i = np.full(length, np.nan)
d3i = np.full(length, np.nan)
d4i = np.full(length, np.nan)
- ri = np.full(length, np.nan)
- ai = np.full(length, np.nan)
-
- d2i[1 : length-1] = distance(ca_coord[0 : length-2], ca_coord[2 : length])
- d3i[1 : length-2] = distance(ca_coord[0 : length-3], ca_coord[3 : length])
- d4i[1 : length-3] = distance(ca_coord[0 : length-4], ca_coord[4 : length])
- ri[1 : length-1] = angle(
- ca_coord[0 : length-2],
- ca_coord[1 : length-1],
- ca_coord[2 : length]
+ ri = np.full(length, np.nan)
+ ai = np.full(length, np.nan)
+
+ d2i[1 : length - 1] = distance(ca_coord[0 : length - 2], ca_coord[2:length])
+ d3i[1 : length - 2] = distance(ca_coord[0 : length - 3], ca_coord[3:length])
+ d4i[1 : length - 3] = distance(ca_coord[0 : length - 4], ca_coord[4:length])
+ ri[1 : length - 1] = angle(
+ ca_coord[0 : length - 2], ca_coord[1 : length - 1], ca_coord[2:length]
)
- ai[1 : length-2] = dihedral(
- ca_coord[0 : length-3],
- ca_coord[1 : length-2],
- ca_coord[2 : length-1],
- ca_coord[3 : length-0]
+ ai[1 : length - 2] = dihedral(
+ ca_coord[0 : length - 3],
+ ca_coord[1 : length - 2],
+ ca_coord[2 : length - 1],
+ ca_coord[3 : length - 0],
)
# Find CA that meet criteria for potential helices and strands
- relaxed_helix = (
- (d3i >= _d3_helix[0]) & (d3i <= _d3_helix[1])
- ) | (
- (ri >= _r_helix[0] ) & ( ri <= _r_helix[1])
+ relaxed_helix = ((d3i >= _d3_helix[0]) & (d3i <= _d3_helix[1])) | (
+ (ri >= _r_helix[0]) & (ri <= _r_helix[1])
)
strict_helix = (
- (d3i >= _d3_helix[0]) & (d3i <= _d3_helix[1]) &
- (d4i >= _d4_helix[0]) & (d4i <= _d4_helix[1])
+ (d3i >= _d3_helix[0])
+ & (d3i <= _d3_helix[1])
+ & (d4i >= _d4_helix[0])
+ & (d4i <= _d4_helix[1])
) | (
- (ri >= _r_helix[0] ) & ( ri <= _r_helix[1]) &
- (ai >= _a_helix[0] ) & ( ai <= _a_helix[1])
+ (ri >= _r_helix[0])
+ & (ri <= _r_helix[1])
+ & (ai >= _a_helix[0])
+ & (ai <= _a_helix[1])
)
relaxed_strand = (d3i >= _d3_strand[0]) & (d3i <= _d3_strand[1])
strict_strand = (
- (d2i >= _d2_strand[0]) & (d2i <= _d2_strand[1]) &
- (d3i >= _d3_strand[0]) & (d3i <= _d3_strand[1]) &
- (d4i >= _d4_strand[0]) & (d4i <= _d4_strand[1])
+ (d2i >= _d2_strand[0])
+ & (d2i <= _d2_strand[1])
+ & (d3i >= _d3_strand[0])
+ & (d3i <= _d3_strand[1])
+ & (d4i >= _d4_strand[0])
+ & (d4i <= _d4_strand[1])
) | (
- (ri >= _r_strand[0] ) & ( ri <= _r_strand[1]) &
- (
+ (ri >= _r_strand[0])
+ & (ri <= _r_strand[1])
+ & (
# Account for periodic boundary of dihedral angle
- ((ai >= _a_strand[0] ) & ( ai <= _a_strand[1])) |
- ((ai >= _a_strand[2] ) & ( ai <= _a_strand[3]))
+ ((ai >= _a_strand[0]) & (ai <= _a_strand[1]))
+ | ((ai >= _a_strand[2]) & (ai <= _a_strand[3]))
)
)
-
helix_mask = _mask_consecutive(strict_helix, 5)
helix_mask = _extend_region(helix_mask, relaxed_helix)
@@ -187,12 +187,11 @@ def annotate_sse(atom_array):
short_strand_mask = _mask_regions_with_contacts(
ca_coord,
_mask_consecutive(strict_strand, 3),
- min_contacts=5, min_distance=4.2, max_distance=5.2
- )
- strand_mask = _extend_region(
- strand_mask | short_strand_mask, relaxed_strand
+ min_contacts=5,
+ min_distance=4.2,
+ max_distance=5.2,
)
-
+ strand_mask = _extend_region(strand_mask | short_strand_mask, relaxed_strand)
sse = np.full(length, "c", dtype="U1")
sse[helix_mask] = "a"
@@ -215,10 +214,10 @@ def _mask_consecutive(mask, number):
# if it and the following `number-1` elements are True
# The elements `mask[-(number-1):]` cannot have the sufficient count
# by this definition, as they are at the end of the array
- counts = np.zeros(len(mask) - (number-1), dtype=int)
+ counts = np.zeros(len(mask) - (number - 1), dtype=int)
for i in range(number):
counts[mask[i : i + len(counts)]] += 1
- consecutive_seed = (counts == number)
+ consecutive_seed = counts == number
# Not only that element, but also the
# following `number-1` elements are in a consecutive region
@@ -257,8 +256,9 @@ def _extend_region(base_condition_mask, extension_condition_mask):
)
-def _mask_regions_with_contacts(coord, candidate_mask,
- min_contacts, min_distance, max_distance):
+def _mask_regions_with_contacts(
+ coord, candidate_mask, min_contacts, min_distance, max_distance
+):
"""
Mask regions of `candidate_mask` that have at least `min_contacts`
contacts with `coord` in the range `min_distance` to `max_distance`.
@@ -269,9 +269,7 @@ def _mask_regions_with_contacts(coord, candidate_mask,
# -> no residue can satisfy 'min_contacts'
return np.zeros(len(candidate_mask), dtype=bool)
- cell_list = CellList(
- potential_contact_coord, max_distance
- )
+ cell_list = CellList(potential_contact_coord, max_distance)
# For each candidate position,
# get all contacts within maximum distance
all_within_max_dist_indices = cell_list.get_atoms(
@@ -282,33 +280,29 @@ def _mask_regions_with_contacts(coord, candidate_mask,
for i, atom_index in enumerate(np.where(candidate_mask)[0]):
within_max_dist_indices = all_within_max_dist_indices[i]
# Remove padding values
- within_max_dist_indices = within_max_dist_indices[
- within_max_dist_indices != -1
- ]
+ within_max_dist_indices = within_max_dist_indices[within_max_dist_indices != -1]
# Now count all contacts within maximum distance
# that also satisfy the minimum distance
contacts[atom_index] = np.count_nonzero(
distance(
- coord[atom_index],
- potential_contact_coord[within_max_dist_indices]
- ) > min_distance
+ coord[atom_index], potential_contact_coord[within_max_dist_indices]
+ )
+ > min_distance
)
# Count the number of contacts per region
# These indices mark the start of either a 'True' or 'False' region
# Prepend absent region to the start to capture the event,
# that the first element is already the start of a region
- region_change_indices = np.where(
- np.diff(np.append([False], candidate_mask))
- )[0]
+ region_change_indices = np.where(np.diff(np.append([False], candidate_mask)))[0]
# Add exclusive stop
region_change_indices = np.append(region_change_indices, [len(coord)])
output_mask = np.zeros(len(candidate_mask), dtype=bool)
for i in range(len(region_change_indices) - 1):
start = region_change_indices[i]
- stop = region_change_indices[i+1]
- total_contacts = np.sum(contacts[start : stop])
+ stop = region_change_indices[i + 1]
+ total_contacts = np.sum(contacts[start:stop])
if total_contacts >= min_contacts:
- output_mask[start : stop] = True
+ output_mask[start:stop] = True
- return output_mask
\ No newline at end of file
+ return output_mask
diff --git a/src/biotite/structure/superimpose.py b/src/biotite/structure/superimpose.py
index d522471fa..d06d0abdb 100755
--- a/src/biotite/structure/superimpose.py
+++ b/src/biotite/structure/superimpose.py
@@ -8,19 +8,22 @@
__name__ = "biotite.structure"
__author__ = "Patrick Kunzmann, Claude J. Rogers"
-__all__ = ["superimpose", "superimpose_homologs",
- "superimpose_without_outliers",
- "AffineTransformation"]
+__all__ = [
+ "superimpose",
+ "superimpose_homologs",
+ "superimpose_without_outliers",
+ "AffineTransformation",
+]
import numpy as np
-from .atoms import coord
-from .geometry import centroid, distance
-from .filter import filter_amino_acids, filter_nucleotides
-from .sequence import to_sequence
-from ..sequence.alphabet import common_alphabet
-from ..sequence.seqtypes import ProteinSequence
-from ..sequence.align import SubstitutionMatrix, align_optimal, get_codes
+from biotite.sequence.align import SubstitutionMatrix, align_optimal, get_codes
+from biotite.sequence.alphabet import common_alphabet
+from biotite.sequence.seqtypes import ProteinSequence
+from biotite.structure.atoms import coord
+from biotite.structure.filter import filter_amino_acids, filter_nucleotides
+from biotite.structure.geometry import centroid, distance
+from biotite.structure.sequence import to_sequence
class AffineTransformation:
@@ -45,12 +48,12 @@ class AffineTransformation:
The dimensions are always expanded to *(m,3)* or *(m,3,3)*,
respectively.
"""
+
def __init__(self, center_translation, rotation, target_translation):
self.center_translation = _expand_dims(center_translation, 2)
self.rotation = _expand_dims(rotation, 3)
self.target_translation = _expand_dims(target_translation, 2)
-
def apply(self, atoms):
"""
Apply this transformation on the given structure.
@@ -118,7 +121,6 @@ def apply(self, atoms):
superimposed.coord = superimposed_coord
return superimposed
-
def as_matrix(self):
"""
Get the translations and rotation as a combined 4x4
@@ -316,16 +318,19 @@ def superimpose(fixed, mobile, atom_mask=None):
mob_centered_filtered = mob_filtered - mob_centroid[:, np.newaxis, :]
fix_centered_filtered = fix_filtered - fix_centroid[:, np.newaxis, :]
- rotation = _get_rotation_matrices(
- fix_centered_filtered, mob_centered_filtered
- )
+ rotation = _get_rotation_matrices(fix_centered_filtered, mob_centered_filtered)
transform = AffineTransformation(-mob_centroid, rotation, fix_centroid)
return transform.apply(mobile), transform
-def superimpose_without_outliers(fixed, mobile, min_anchors=3,
- max_iterations=10, quantiles=(0.25, 0.75),
- outlier_threshold=1.5):
+def superimpose_without_outliers(
+ fixed,
+ mobile,
+ min_anchors=3,
+ max_iterations=10,
+ quantiles=(0.25, 0.75),
+ outlier_threshold=1.5,
+):
r"""
Superimpose structures onto a fixed structure, ignoring
conformational outliers.
@@ -458,8 +463,9 @@ def superimpose_without_outliers(fixed, mobile, min_anchors=3,
return transform.apply(mobile), transform, anchor_indices
-def superimpose_homologs(fixed, mobile, substitution_matrix=None,
- gap_penalty=-10, min_anchors=3, **kwargs):
+def superimpose_homologs(
+ fixed, mobile, substitution_matrix=None, gap_penalty=-10, min_anchors=3, **kwargs
+):
r"""
Superimpose one protein or nucleotide chain onto another one,
considering sequence differences and conformational outliers.
@@ -530,8 +536,8 @@ def superimpose_homologs(fixed, mobile, substitution_matrix=None,
fixed_anchor_indices = _get_backbone_anchor_indices(fixed)
mobile_anchor_indices = _get_backbone_anchor_indices(mobile)
if (
- len(fixed_anchor_indices) < min_anchors or
- len(mobile_anchor_indices) < min_anchors
+ len(fixed_anchor_indices) < min_anchors
+ or len(mobile_anchor_indices) < min_anchors
):
raise ValueError(
"Structures have too few CA atoms for required number of anchors"
@@ -562,7 +568,7 @@ def superimpose_homologs(fixed, mobile, substitution_matrix=None,
fixed[..., fixed_anchor_indices],
mobile[..., mobile_anchor_indices],
min_anchors,
- **kwargs
+ **kwargs,
)
fixed_anchor_indices = fixed_anchor_indices[selected_anchor_indices]
mobile_anchor_indices = mobile_anchor_indices[selected_anchor_indices]
@@ -580,17 +586,13 @@ def _reshape_to_3d(coord):
Reshape the coordinate array to 3D, if it is 2D.
"""
if coord.ndim < 2:
- raise ValueError(
- "Coordinates must be at least two-dimensional"
- )
+ raise ValueError("Coordinates must be at least two-dimensional")
if coord.ndim == 2:
return coord[np.newaxis, ...]
elif coord.ndim == 3:
return coord
else:
- raise ValueError(
- "Coordinates must be at most three-dimensional"
- )
+ raise ValueError("Coordinates must be at most three-dimensional")
def _get_rotation_matrices(fixed, mobile):
@@ -602,10 +604,10 @@ def _get_rotation_matrices(fixed, mobile):
Both sets of coordinates must already be centered at origin.
"""
# Calculate cross-covariance matrices
- cov = np.sum(fixed[:,:,:,np.newaxis] * mobile[:,:,np.newaxis,:], axis=1)
+ cov = np.sum(fixed[:, :, :, np.newaxis] * mobile[:, :, np.newaxis, :], axis=1)
v, s, w = np.linalg.svd(cov)
# Remove possibility of reflected atom coordinates
- reflected_mask = (np.linalg.det(v) * np.linalg.det(w) < 0)
+ reflected_mask = np.linalg.det(v) * np.linalg.det(w) < 0
v[reflected_mask, :, -1] *= -1
matrices = np.matmul(v, w)
return matrices
@@ -617,11 +619,7 @@ def _multi_matmul(matrices, vectors):
with m x n vectors.
"""
return np.transpose(
- np.matmul(
- matrices,
- np.transpose(vectors, axes=(0, 2, 1))
- ),
- axes=(0, 2, 1)
+ np.matmul(matrices, np.transpose(vectors, axes=(0, 2, 1))), axes=(0, 2, 1)
)
@@ -631,8 +629,8 @@ def _get_backbone_anchor_indices(atoms):
nucleotide and return their indices.
"""
return np.where(
- ((filter_amino_acids(atoms)) & (atoms.atom_name == "CA")) |
- ((filter_nucleotides(atoms)) & (atoms.atom_name == "P"))
+ ((filter_amino_acids(atoms)) & (atoms.atom_name == "CA"))
+ | ((filter_nucleotides(atoms)) & (atoms.atom_name == "P"))
)[0]
@@ -685,11 +683,7 @@ def _find_matching_anchors(
def _to_sequence(atoms):
sequences, _ = to_sequence(atoms, allow_hetero=True)
if len(sequences) == 0:
- raise ValueError(
- "Structure does not contain any amino acids or nucleotides"
- )
+ raise ValueError("Structure does not contain any amino acids or nucleotides")
if len(sequences) > 1:
- raise ValueError(
- "Structure contains multiple chains, but only one is allowed"
- )
- return sequences[0]
\ No newline at end of file
+ raise ValueError("Structure contains multiple chains, but only one is allowed")
+ return sequences[0]
diff --git a/src/biotite/structure/transform.py b/src/biotite/structure/transform.py
index 0ab281c8d..c094b7730 100644
--- a/src/biotite/structure/transform.py
+++ b/src/biotite/structure/transform.py
@@ -9,20 +9,25 @@
__name__ = "biotite.structure"
__author__ = "Patrick Kunzmann", "Claude J. Rogers"
-__all__ = ["translate", "rotate", "rotate_centered", "rotate_about_axis",
- "orient_principal_components", "align_vectors"]
+__all__ = [
+ "translate",
+ "rotate",
+ "rotate_centered",
+ "rotate_about_axis",
+ "orient_principal_components",
+ "align_vectors",
+]
import numpy as np
-from .geometry import centroid
-from .error import BadStructureError
-from .atoms import Atom, AtomArray, AtomArrayStack, coord
-from .util import norm_vector, vector_dot, matrix_rotate
+from biotite.structure.atoms import Atom, AtomArray, AtomArrayStack, coord
+from biotite.structure.geometry import centroid
+from biotite.structure.util import matrix_rotate, norm_vector, vector_dot
def translate(atoms, vector):
"""
Translate the given atoms or coordinates by a given vector.
-
+
Parameters
----------
atoms : Atom or AtomArray or AtomArrayStack or ndarray, shape=(3,) or shape=(n,3) or shape=(m,n,3)
@@ -30,7 +35,7 @@ def translate(atoms, vector):
The coordinates can be directly provided as :class:`ndarray`.
vector: array-like, shape=(3,) or shape=(n,3) or shape=(m,n,3)
The translation vector :math:`(x, y, z)`.
-
+
Returns
-------
transformed : Atom or AtomArray or AtomArrayStack or ndarray, shape=(3,) or shape=(n,3) or shape=(m,n,3)
@@ -39,7 +44,7 @@ def translate(atoms, vector):
"""
positions = coord(atoms).copy()
vector = np.asarray(vector)
-
+
if vector.shape[-1] != 3:
raise ValueError("Translation vector must contain 3 coordinates")
positions += vector
@@ -50,10 +55,10 @@ def rotate(atoms, angles):
"""
Rotate the given atoms or coordinates about the *x*, *y* and *z*
axes by given angles.
-
+
The rotations are centered at the origin and are performed
sequentially in the order *x*, *y*, *z*.
-
+
Parameters
----------
atoms : Atom or AtomArray or AtomArrayStack or ndarray, shape=(3,) or shape=(n,3) or shape=(m,n,3)
@@ -61,13 +66,13 @@ def rotate(atoms, angles):
The coordinates can be directly provided as :class:`ndarray`.
angles: array-like, length=3
The rotation angles in radians around *x*, *y* and *z*.
-
+
Returns
-------
transformed : Atom or AtomArray or AtomArrayStack or ndarray, shape=(3,) or shape=(n,3) or shape=(m,n,3)
A copy of the input atoms or coordinates, rotated by the given
angles.
-
+
See Also
--------
rotate_centered
@@ -82,27 +87,39 @@ def rotate(atoms, angles):
>>> print(rotated)
[1.225e-16 2.000e+00 0.000e+00]
"""
- from numpy import sin, cos
+ from numpy import cos, sin
# Check if "angles" contains 3 angles for all dimensions
if len(angles) != 3:
raise ValueError("Translation vector must be container of length 3")
# Create rotation matrices for all 3 dimensions
- rot_x = np.array([[ 1, 0, 0 ],
- [ 0, cos(angles[0]), -sin(angles[0]) ],
- [ 0, sin(angles[0]), cos(angles[0]) ]])
-
- rot_y = np.array([[ cos(angles[1]), 0, sin(angles[1]) ],
- [ 0, 1, 0 ],
- [ -sin(angles[1]), 0, cos(angles[1]) ]])
-
- rot_z = np.array([[ cos(angles[2]), -sin(angles[2]), 0 ],
- [ sin(angles[2]), cos(angles[2]), 0 ],
- [ 0, 0, 1 ]])
-
+ rot_x = np.array(
+ [
+ [1, 0, 0],
+ [0, cos(angles[0]), -sin(angles[0])],
+ [0, sin(angles[0]), cos(angles[0])],
+ ]
+ )
+
+ rot_y = np.array(
+ [
+ [cos(angles[1]), 0, sin(angles[1])],
+ [0, 1, 0],
+ [-sin(angles[1]), 0, cos(angles[1])],
+ ]
+ )
+
+ rot_z = np.array(
+ [
+ [cos(angles[2]), -sin(angles[2]), 0],
+ [sin(angles[2]), cos(angles[2]), 0],
+ [0, 0, 1],
+ ]
+ )
+
positions = coord(atoms).copy()
positions = matrix_rotate(positions, rot_z @ rot_y @ rot_x)
-
+
return _put_back(atoms, positions)
@@ -110,10 +127,10 @@ def rotate_centered(atoms, angles):
"""
Rotate the given atoms or coordinates about the *x*, *y* and *z*
axes by given angles.
-
+
The rotations are centered at the centroid of the corresponding
structure and are performed sequentially in the order *x*, *y*, *z*.
-
+
Parameters
----------
atoms : Atom or AtomArray or AtomArrayStack or ndarray, shape=(3,) or shape=(n,3) or shape=(m,n,3)
@@ -121,13 +138,13 @@ def rotate_centered(atoms, angles):
The coordinates can be directly provided as :class:`ndarray`.
angles: array-like, length=3
The rotation angles in radians around axes *x*, *y* and *z*.
-
+
Returns
-------
transformed : Atom or AtomArray or AtomArrayStack or ndarray, shape=(3,) or shape=(n,3) or shape=(m,n,3)
A copy of the input atoms or coordinates, rotated by the given
angles.
-
+
See Also
--------
rotate
@@ -136,7 +153,7 @@ def rotate_centered(atoms, angles):
if len(coord(atoms).shape) == 1:
# Single value -> centered rotation does not change coordinates
return atoms.copy()
-
+
# Rotation around centroid requires moving centroid to origin
center = coord(centroid(atoms))
# 'centroid()' removes the second last dimesion
@@ -152,7 +169,7 @@ def rotate_about_axis(atoms, axis, angle, support=None):
"""
Rotate the given atoms or coordinates about a given axis by a given
angle.
-
+
Parameters
----------
atoms : Atom or AtomArray or AtomArrayStack or ndarray, shape=(3,) or shape=(n,3) or shape=(m,n,3)
@@ -167,13 +184,13 @@ def rotate_about_axis(atoms, axis, angle, support=None):
An optional support vector for the rotation axis, i.e. the
center of the rotation.
By default, the center of the rotation is at *(0,0,0)*.
-
+
Returns
-------
transformed : Atom or AtomArray or AtomArrayStack or ndarray, shape=(3,) or shape=(n,3) or shape=(m,n,3)
A copy of the input atoms or coordinates, rotated about the
given axis.
-
+
See Also
--------
rotate
@@ -194,7 +211,7 @@ def rotate_about_axis(atoms, axis, angle, support=None):
# Transform coordinates
# so that the axis support vector is at (0,0,0)
positions -= np.asarray(support)
-
+
# Normalize axis
axis = np.asarray(axis, dtype=np.float32).copy()
if np.linalg.norm(axis) == 0:
@@ -205,16 +222,30 @@ def rotate_about_axis(atoms, axis, angle, support=None):
sin_a = np.sin(angle)
cos_a = np.cos(angle)
icos_a = 1 - cos_a
- x = axis[...,0]
- y = axis[...,1]
- z = axis[...,2]
+ x = axis[..., 0]
+ y = axis[..., 1]
+ z = axis[..., 2]
# Rotation matrix is taken from
# https://en.wikipedia.org/wiki/Rotation_matrix#Rotation_matrix_from_axis_and_angle
- rot_matrix = np.array([
- [ cos_a + icos_a*x**2, icos_a*x*y - z*sin_a, icos_a*x*z + y*sin_a],
- [icos_a*x*y + z*sin_a, cos_a + icos_a*y**2, icos_a*y*z - x*sin_a],
- [icos_a*x*z - y*sin_a, icos_a*y*z + x*sin_a, cos_a + icos_a*z**2]
- ])
+ rot_matrix = np.array(
+ [
+ [
+ cos_a + icos_a * x**2,
+ icos_a * x * y - z * sin_a,
+ icos_a * x * z + y * sin_a,
+ ],
+ [
+ icos_a * x * y + z * sin_a,
+ cos_a + icos_a * y**2,
+ icos_a * y * z - x * sin_a,
+ ],
+ [
+ icos_a * x * z - y * sin_a,
+ icos_a * y * z + x * sin_a,
+ cos_a + icos_a * z**2,
+ ],
+ ]
+ )
# For proper rotation reshape into a maximum of 2 dimensions
orig_ndim = positions.ndim
@@ -230,7 +261,7 @@ def rotate_about_axis(atoms, axis, angle, support=None):
if support is not None:
# Transform coordinates back to original support vector position
positions += np.asarray(support)
-
+
return _put_back(atoms, positions)
@@ -298,9 +329,7 @@ def orient_principal_components(atoms, order=None):
else:
order = np.asarray(order, dtype=int)
if order.shape != (3,):
- raise ValueError(
- f"Expected order to have shape (3,), not {order.shape}"
- )
+ raise ValueError(f"Expected order to have shape (3,), not {order.shape}")
if not (np.sort(order) == np.arange(3)).all():
raise ValueError("Expected order to contain [0, 1, 2].")
@@ -333,8 +362,13 @@ def orient_principal_components(atoms, order=None):
return _put_back(atoms, centered)
-def align_vectors(atoms, origin_direction, target_direction,
- origin_position=None, target_position=None):
+def align_vectors(
+ atoms,
+ origin_direction,
+ target_direction,
+ origin_position=None,
+ target_position=None,
+):
"""
Apply a transformation to atoms or coordinates, that would transfer
a origin vector to a target vector.
@@ -345,8 +379,8 @@ def align_vectors(atoms, origin_direction, target_direction,
This means, that the application of the transformation on the
origin vector would give the target vector.
Then the same transformation is applied to the given
- atoms/coordinates.
-
+ atoms/coordinates.
+
Parameters
----------
atoms : Atom or AtomArray or AtomArrayStack or ndarray, shape=(3,) or shape=(n,3) or shape=(m,n,3)
@@ -359,13 +393,13 @@ def align_vectors(atoms, origin_direction, target_direction,
origin_position, target_position : array-like, length=3, optional
Optional support vectors for the origin or target, respectively.
By default, origin and target start at *(0,0,0)*.
-
+
Returns
-------
transformed : Atom or AtomArray or AtomArrayStack or ndarray, shape=(3,) or shape=(n,3) or shape=(m,n,3)
A copy of the input atoms or coordinates with the applied
transformation.
-
+
See Also
--------
rotate
@@ -428,12 +462,8 @@ def align_vectors(atoms, origin_direction, target_direction,
A 2 LEU HD22 H -6.255 7.544 -2.657
A 2 LEU HD23 H -5.592 8.445 -1.281
"""
- origin_direction = np.asarray(
- origin_direction, dtype=np.float32
- ).squeeze()
- target_direction = np.asarray(
- target_direction, dtype=np.float32
- ).squeeze()
+ origin_direction = np.asarray(origin_direction, dtype=np.float32).squeeze()
+ target_direction = np.asarray(target_direction, dtype=np.float32).squeeze()
# check that original and target direction are vectors of shape (3,)
if origin_direction.shape != (3,):
raise ValueError(
@@ -449,9 +479,9 @@ def align_vectors(atoms, origin_direction, target_direction,
raise ValueError("Length of the origin vector is 0")
if np.linalg.norm(target_direction) == 0:
raise ValueError("Length of the target vector is 0")
- if origin_position is not None:
+ if origin_position is not None:
origin_position = np.asarray(origin_position, dtype=np.float32)
- if target_position is not None:
+ if target_position is not None:
target_position = np.asarray(target_position, dtype=np.float32)
positions = coord(atoms).copy()
@@ -459,7 +489,7 @@ def align_vectors(atoms, origin_direction, target_direction,
# Transform coordinates
# so that the position of the origin vector is at (0,0,0)
positions -= origin_position
-
+
# Normalize direction vectors
origin_direction = origin_direction.copy()
norm_vector(origin_direction)
@@ -468,11 +498,7 @@ def align_vectors(atoms, origin_direction, target_direction,
# Formula is taken from
# https://math.stackexchange.com/questions/180418/calculate-rotation-matrix-to-align-vector-a-to-vector-b-in-3d/476311#476311
vx, vy, vz = np.cross(origin_direction, target_direction)
- v_c = np.array([
- [ 0, -vz, vy],
- [ vz, 0, -vx],
- [-vy, vx, 0]
- ], dtype=float)
+ v_c = np.array([[0, -vz, vy], [vz, 0, -vx], [-vy, vx, 0]], dtype=float)
cos_a = vector_dot(origin_direction, target_direction)
if np.all(cos_a == -1):
raise ValueError(
@@ -480,9 +506,9 @@ def align_vectors(atoms, origin_direction, target_direction,
"cannot calculate rotation matrix"
)
rot_matrix = np.identity(3) + v_c + (v_c @ v_c) / (1 + cos_a)
-
+
positions = matrix_rotate(positions, rot_matrix)
-
+
if target_position is not None:
# Transform coordinates to position of the target vector
positions += target_position
@@ -501,4 +527,4 @@ def _put_back(input_atoms, transformed):
moved_atoms.coord = transformed
return moved_atoms
else:
- return transformed
\ No newline at end of file
+ return transformed
diff --git a/src/biotite/structure/util.py b/src/biotite/structure/util.py
index 68f13f20d..cabbdc8f5 100644
--- a/src/biotite/structure/util.py
+++ b/src/biotite/structure/util.py
@@ -11,31 +11,30 @@
__all__ = ["vector_dot", "norm_vector", "distance", "matrix_rotate"]
import numpy as np
-from .atoms import Atom, array
-def vector_dot(v1,v2):
+def vector_dot(v1, v2):
"""
Calculate vector dot product of two vectors.
-
+
Parameters
----------
v1,v2 : ndarray
The arrays to calculate the product from.
The vectors are represented by the last axis.
-
+
Returns
-------
product : float or ndarray
Scalar product over the last dimension of the arrays.
"""
- return (v1*v2).sum(axis=-1)
+ return (v1 * v2).sum(axis=-1)
def norm_vector(v):
"""
Normalise a vector.
-
+
Parameters
----------
v : ndarray
@@ -47,25 +46,25 @@ def norm_vector(v):
v /= factor[..., np.newaxis]
else:
v /= factor
-
-def distance(v1,v2):
+
+def distance(v1, v2):
"""
Calculate the distance between two position vectors.
-
+
Parameters
----------
v1,v2 : ndarray
The arrays to calculate the product from.
The vectors are represented by the last axis.
-
+
Returns
-------
product : float or ndarray
Vector distance over the last dimension of the array.
"""
dif = v1 - v2
- return np.sqrt((dif*dif).sum(axis=-1))
+ return np.sqrt((dif * dif).sum(axis=-1))
def matrix_rotate(v, matrix):
@@ -78,7 +77,7 @@ def matrix_rotate(v, matrix):
The coordinates to rotate.
matrix : ndarray
The rotation matrix.
-
+
Returns
-------
rotated : ndarray
@@ -95,4 +94,3 @@ def matrix_rotate(v, matrix):
if orig_ndim > 2:
v = v.reshape(*orig_shape)
return v
-
diff --git a/src/biotite/visualize.py b/src/biotite/visualize.py
index a2839c6a6..eb7444c54 100644
--- a/src/biotite/visualize.py
+++ b/src/biotite/visualize.py
@@ -6,25 +6,25 @@
__author__ = "Patrick Kunzmann"
__all__ = ["colors", "set_font_size_in_coord", "AdaptiveFancyArrow"]
-import abc
from collections import OrderedDict
import numpy as np
from numpy.linalg import norm
-
# Biotite themed colors
-colors = OrderedDict([
- ("brightorange" , "#ffb569ff"),
- ("lightorange" , "#ff982dff"),
- ("orange" , "#ff8405ff"),
- ("dimorange" , "#dc7000ff"),
- ("darkorange" , "#b45c00ff"),
- ("brightgreen" , "#98e97fff"),
- ("lightgreen" , "#6fe04cff"),
- ("green" , "#52da2aff"),
- ("dimgreen" , "#45bc20ff"),
- ("darkgreen" , "#389a1aff"),
-])
+colors = OrderedDict(
+ [
+ ("brightorange", "#ffb569ff"),
+ ("lightorange", "#ff982dff"),
+ ("orange", "#ff8405ff"),
+ ("dimorange", "#dc7000ff"),
+ ("darkorange", "#b45c00ff"),
+ ("brightgreen", "#98e97fff"),
+ ("lightgreen", "#6fe04cff"),
+ ("green", "#52da2aff"),
+ ("dimgreen", "#45bc20ff"),
+ ("darkgreen", "#389a1aff"),
+ ]
+)
def set_font_size_in_coord(text, width=None, height=None, mode="unlocked"):
@@ -75,8 +75,8 @@ def set_font_size_in_coord(text, width=None, height=None, mode="unlocked"):
This behavior is not equal for all initial font sizes (in 'pt'),
the boundaries for an initial size of 1 'pt' seem to be most exact.
"""
- from matplotlib.transforms import Bbox, Affine2D
from matplotlib.patheffects import AbstractPathEffect
+ from matplotlib.transforms import Affine2D, Bbox
class TextScaler(AbstractPathEffect):
def __init__(self, text, width, height, mode):
@@ -85,11 +85,11 @@ def __init__(self, text, width, height, mode):
self._width = width
self._height = height
- def draw_path(self, renderer, gc, tpath, affine, rgbFace=None):
+ def draw_path(self, renderer, gc, tpath, affine, rgbFace=None): # noqa: N803
ax = self._text.axes
try:
renderer = ax.get_figure().canvas.get_renderer()
- except:
+ except Exception:
# Use cached renderer for backends, where
# `get_renderer()` is not available
# Based on the strategy from `Text.get_window_extent()`
@@ -127,25 +127,21 @@ def draw_path(self, renderer, gc, tpath, affine, rgbFace=None):
if mode in ["unlocked", "minimum", "maximum"]:
if width is None or height is None:
- raise TypeError(
- f"Width and height must be set in '{mode}' mode"
- )
+ raise TypeError(f"Width and height must be set in '{mode}' mode")
elif mode == "proportional":
- if not (width is None and height is not None) or \
- not (height is None and width is not None):
- raise TypeError(
- f"Either width or height must be set in '{mode}' mode"
- )
+ if not (width is None and height is not None) or not (
+ height is None and width is not None
+ ):
+ raise TypeError(f"Either width or height must be set in '{mode}' mode")
else:
- raise ValueError(
- f"Unknown mode '{mode}'"
- )
+ raise ValueError(f"Unknown mode '{mode}'")
text.set_path_effects([TextScaler(text, width, height, mode)])
+
try:
# Only create this class when matplotlib is installed
- from matplotlib.transforms import Bbox
from matplotlib.patches import FancyArrow
+ from matplotlib.transforms import Bbox
class AdaptiveFancyArrow(FancyArrow):
"""
@@ -177,9 +173,19 @@ class AdaptiveFancyArrow(FancyArrow):
`FancyArrow`.
"""
- def __init__(self, x, y, dx, dy,
- tail_width, head_width, head_ratio, draw_head=True,
- shape="full", **kwargs):
+ def __init__(
+ self,
+ x,
+ y,
+ dx,
+ dy,
+ tail_width,
+ head_width,
+ head_ratio,
+ draw_head=True,
+ shape="full",
+ **kwargs,
+ ):
self._x = x
self._y = y
self._dx = dx
@@ -193,23 +199,25 @@ def __init__(self, x, y, dx, dy,
if not draw_head:
head_width = tail_width
super().__init__(
- x, y, dx, dy,
- width=tail_width, head_width=head_width,
- overhang=0, shape=shape,
- length_includes_head=True, **kwargs
+ x,
+ y,
+ dx,
+ dy,
+ width=tail_width,
+ head_width=head_width,
+ overhang=0,
+ shape=shape,
+ length_includes_head=True,
+ **kwargs,
)
def draw(self, renderer):
- arrow_box = Bbox([(0,0), (0,self._head_width)])
+ arrow_box = Bbox([(0, 0), (0, self._head_width)])
arrow_box_display = self.axes.transData.transform_bbox(arrow_box)
- head_length_display = np.abs(
- arrow_box_display.height * self._head_ratio
- )
+ head_length_display = np.abs(arrow_box_display.height * self._head_ratio)
arrow_box_display.x1 = arrow_box_display.x0 + head_length_display
# Transfrom back to data coordinates for plotting
- arrow_box = self.axes.transData.inverted().transform_bbox(
- arrow_box_display
- )
+ arrow_box = self.axes.transData.inverted().transform_bbox(arrow_box_display)
head_length = arrow_box.width
arrow_length = norm((self._dx, self._dy))
if head_length > arrow_length:
@@ -221,11 +229,19 @@ def draw(self, renderer):
# Renew the arrow's properties
super().__init__(
- self._x, self._y, self._dx, self._dy,
- width=self._tail_width, head_width=self._head_width,
- overhang=0, shape=self._shape,
- head_length=head_length, length_includes_head=True,
- axes=self.axes, transform=self.get_transform(), **self._kwargs
+ self._x,
+ self._y,
+ self._dx,
+ self._dy,
+ width=self._tail_width,
+ head_width=self._head_width,
+ overhang=0,
+ shape=self._shape,
+ head_length=head_length,
+ length_includes_head=True,
+ axes=self.axes,
+ transform=self.get_transform(),
+ **self._kwargs,
)
self.set_clip_path(self.axes.patch)
super().draw(renderer)
@@ -234,18 +250,16 @@ def draw(self, renderer):
# Removes warning:
# unknown document: /tutorials/intermediate/constrainedlayout_guide
def get_in_layout(self):
- """
- """
+ """ """
return super().get_in_layout()
+
def set_in_layout(self, in_layout):
- """
- """
+ """ """
return super().set_in_layout(in_layout)
except ImportError:
-
# Dummy class that propagates a meaningful error,
# i.e. that Matplotlib is not installed
- class AdaptiveFancyArrow():
+ class AdaptiveFancyArrow:
def __init__(*args, **kwargs):
- raise ModuleNotFoundError(f"No module named 'matplotlib'")
\ No newline at end of file
+ raise ModuleNotFoundError("No module named 'matplotlib'")
diff --git a/tests/application/test_autodock.py b/tests/application/test_autodock.py
index 126f424d2..846b88f29 100644
--- a/tests/application/test_autodock.py
+++ b/tests/application/test_autodock.py
@@ -9,12 +9,10 @@
import biotite.structure.info as info
import biotite.structure.io.pdbx as pdbx
from biotite.application.autodock import VinaApp
-from ..util import data_dir, is_not_installed
+from tests.util import data_dir, is_not_installed
-@pytest.mark.skipif(
- is_not_installed("vina"), reason="Autodock Vina is not installed"
-)
+@pytest.mark.skipif(is_not_installed("vina"), reason="Autodock Vina is not installed")
@pytest.mark.parametrize("flexible", [False, True])
def test_docking(flexible):
"""
@@ -24,9 +22,7 @@ def test_docking(flexible):
PDB structure.
"""
# A structure of a straptavidin-biotin complex
- pdbx_file = pdbx.BinaryCIFFile.read(
- join(data_dir("application"), "2rtg.bcif")
- )
+ pdbx_file = pdbx.BinaryCIFFile.read(join(data_dir("application"), "2rtg.bcif"))
structure = pdbx.get_structure(
pdbx_file, model=1, extra_fields=["charge"], include_bonds=True
)
@@ -46,8 +42,11 @@ def test_docking(flexible):
flexible_mask = None
app = VinaApp(
- ligand, receptor, struc.centroid(ref_ligand), [20, 20, 20],
- flexible=flexible_mask
+ ligand,
+ receptor,
+ struc.centroid(ref_ligand),
+ [20, 20, 20],
+ flexible=flexible_mask,
)
app.set_seed(0)
app.start()
@@ -65,7 +64,7 @@ def test_docking(flexible):
# Select best binding pose
test_ligand_coord = test_ligand_coord[0]
not_nan_mask = ~np.isnan(test_ligand_coord).any(axis=-1)
- ref_ligand_coord = ref_ligand_coord[not_nan_mask]
+ ref_ligand_coord = ref_ligand_coord[not_nan_mask]
test_ligand_coord = test_ligand_coord[not_nan_mask]
# Check if it least one atom is preserved
assert test_ligand_coord.shape[1] > 0
@@ -78,7 +77,7 @@ def test_docking(flexible):
# Select best binding pose
test_receptor_coord = test_receptor_coord[0]
not_nan_mask = ~np.isnan(test_receptor_coord).any(axis=-1)
- ref_receptor_coord = receptor[not_nan_mask]
+ ref_receptor_coord = receptor[not_nan_mask]
test_receptor_coord = test_receptor_coord[not_nan_mask]
# Check if it least one atom is preserved
assert test_receptor_coord.shape[1] > 0
@@ -86,9 +85,7 @@ def test_docking(flexible):
# from the original conformation
# NOTE: Currently 1.0 Å is sufficient in local testing,
# but not in the CI (1.6 Å)
- assert np.max(
- struc.distance(test_receptor_coord, ref_receptor_coord)
- ) < 1.7
+ assert np.max(struc.distance(test_receptor_coord, ref_receptor_coord)) < 1.7
else:
ref_receptor_coord = receptor.coord
for model_coord in test_receptor_coord:
diff --git a/tests/application/test_blast.py b/tests/application/test_blast.py
index 49bfed2b4..d9bb69f3a 100644
--- a/tests/application/test_blast.py
+++ b/tests/application/test_blast.py
@@ -2,15 +2,12 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
+import os.path
+import pytest
+import biotite.application.blast as blast
import biotite.sequence as seq
import biotite.sequence.io as seqio
-import biotite.application.blast as blast
-import numpy as np
-from requests.exceptions import ConnectionError
-import pytest
-import os.path
-from ..util import data_dir, cannot_connect_to
-
+from tests.util import cannot_connect_to, data_dir
BLAST_URL = "https://blast.ncbi.nlm.nih.gov/Blast.cgi"
@@ -22,10 +19,7 @@
prot_seq = seq.ProteinSequence("MTMITPSFPGNS")
-@pytest.mark.skipif(
- cannot_connect_to(BLAST_URL),
- reason="NCBI BLAST is not available"
-)
+@pytest.mark.skipif(cannot_connect_to(BLAST_URL), reason="NCBI BLAST is not available")
def test_blastn():
app = blast.BlastWebApp("blastn", dna_seq, obey_rules=False)
app.set_max_expect_value(100)
@@ -36,10 +30,8 @@ def test_blastn():
assert dna_seq == alignments[0].sequences[0]
assert dna_seq == alignments[0].sequences[1]
-@pytest.mark.skipif(
- cannot_connect_to(BLAST_URL),
- reason="NCBI BLAST is not available"
-)
+
+@pytest.mark.skipif(cannot_connect_to(BLAST_URL), reason="NCBI BLAST is not available")
def test_blastx():
app = blast.BlastWebApp("blastx", dna_seq, obey_rules=False)
app.set_max_expect_value(100)
@@ -50,10 +42,8 @@ def test_blastx():
assert prot_seq == alignments[0].sequences[0]
assert prot_seq == alignments[0].sequences[1]
-@pytest.mark.skipif(
- cannot_connect_to(BLAST_URL),
- reason="NCBI BLAST is not available"
-)
+
+@pytest.mark.skipif(cannot_connect_to(BLAST_URL), reason="NCBI BLAST is not available")
def test_tblastx():
app = blast.BlastWebApp("tblastx", dna_seq, obey_rules=False)
app.set_max_expect_value(100)
@@ -61,16 +51,14 @@ def test_tblastx():
app.join(timeout=300)
alignments = app.get_alignments()
# BLAST should find original sequence as best hit
- print (alignments[0].sequences[0])
- print (alignments[0].sequences[1])
+ print(alignments[0].sequences[0])
+ print(alignments[0].sequences[1])
rev_prot_seq = dna_seq.reverse().complement().translate(complete=True)
assert rev_prot_seq == alignments[0].sequences[0]
assert rev_prot_seq == alignments[0].sequences[1]
-@pytest.mark.skipif(
- cannot_connect_to(BLAST_URL),
- reason="NCBI BLAST is not available"
-)
+
+@pytest.mark.skipif(cannot_connect_to(BLAST_URL), reason="NCBI BLAST is not available")
def test_blastp():
app = blast.BlastWebApp("blastp", prot_seq, obey_rules=False)
app.set_max_expect_value(100)
@@ -81,10 +69,8 @@ def test_blastp():
assert prot_seq == alignments[0].sequences[0]
assert prot_seq == alignments[0].sequences[1]
-@pytest.mark.skipif(
- cannot_connect_to(BLAST_URL),
- reason="NCBI BLAST is not available"
-)
+
+@pytest.mark.skipif(cannot_connect_to(BLAST_URL), reason="NCBI BLAST is not available")
def test_tblastn():
app = blast.BlastWebApp("tblastn", prot_seq, obey_rules=False)
app.set_max_expect_value(200)
@@ -95,20 +81,20 @@ def test_tblastn():
assert prot_seq == alignments[0].sequences[0]
assert prot_seq == alignments[0].sequences[1]
+
def test_file_input():
path = os.path.join(data_dir("sequence"), "prot.fasta")
- app = blast.BlastWebApp("blastp", path, obey_rules=False)
+ blast.BlastWebApp("blastp", path, obey_rules=False)
+
def test_invalid_query():
with pytest.raises(ValueError):
- app = blast.BlastWebApp("blastn", "ABCDEFGHIJKLMNOP", obey_rules=False)
+ blast.BlastWebApp("blastn", "ABCDEFGHIJKLMNOP", obey_rules=False)
with pytest.raises(ValueError):
- app = blast.BlastWebApp("blastp", "ABCDEFGHIJKLMNOP", obey_rules=False)
-
-@pytest.mark.skipif(
- cannot_connect_to(BLAST_URL),
- reason="NCBI BLAST is not available"
-)
+ blast.BlastWebApp("blastp", "ABCDEFGHIJKLMNOP", obey_rules=False)
+
+
+@pytest.mark.skipif(cannot_connect_to(BLAST_URL), reason="NCBI BLAST is not available")
def test_no_hit():
app = blast.BlastWebApp("blastn", "ACTGTACGAAACTCGGCGTA", obey_rules=False)
app.set_word_size(20)
@@ -118,10 +104,8 @@ def test_no_hit():
# BLAST should find original sequence as best hit
assert len(alignments) == 0
-@pytest.mark.skipif(
- cannot_connect_to(BLAST_URL),
- reason="NCBI BLAST is not available"
-)
+
+@pytest.mark.skipif(cannot_connect_to(BLAST_URL), reason="NCBI BLAST is not available")
def test_invalid_input():
app = blast.BlastWebApp("blastn", dna_seq, obey_rules=False)
# Set some invalid parameters
@@ -132,18 +116,15 @@ def test_invalid_input():
app.join(timeout=300)
-@pytest.mark.skipif(
- cannot_connect_to(BLAST_URL),
- reason="NCBI BLAST is not available"
-)
+@pytest.mark.skipif(cannot_connect_to(BLAST_URL), reason="NCBI BLAST is not available")
def test_hit_with_selenocysteine():
# Sequence is taken from issue #344
query = seqio.load_sequence(
os.path.join(data_dir("sequence"), "selenocysteine.fasta")
)
-
+
# Expect hit containing selenocysteine when searching Swiss-Prot
blast_app = blast.BlastWebApp("blastp", query, "swissprot")
blast_app.start()
# No AlphabetError should be raised here
- blast_app.join()
\ No newline at end of file
+ blast_app.join()
diff --git a/tests/application/test_dssp.py b/tests/application/test_dssp.py
index 197790236..0a201c922 100644
--- a/tests/application/test_dssp.py
+++ b/tests/application/test_dssp.py
@@ -10,14 +10,13 @@
import biotite.structure.io as strucio
import biotite.structure.io.pdbx as pdbx
from biotite.application.dssp import DsspApp
-from ..util import data_dir, is_not_installed
+from tests.util import data_dir, is_not_installed
@pytest.mark.skipif(is_not_installed("mkdssp"), reason="DSSP is not installed")
def test_multiple_chains():
atoms = pdbx.get_structure(
- pdbx.BinaryCIFFile.read(join(data_dir("structure"), "1igy.bcif")),
- model=1
+ pdbx.BinaryCIFFile.read(join(data_dir("structure"), "1igy.bcif")), model=1
)
atoms = atoms[struc.filter_canonical_amino_acids(atoms)]
sse = DsspApp.annotate_sse(atoms)
diff --git a/tests/application/test_msa.py b/tests/application/test_msa.py
index ca0554e1e..942a781e9 100644
--- a/tests/application/test_msa.py
+++ b/tests/application/test_msa.py
@@ -2,64 +2,70 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-from distutils.version import Version
+import numpy as np
+import pytest
import biotite.sequence as seq
-import biotite.sequence.phylo as phylo
import biotite.sequence.align as align
+import biotite.sequence.phylo as phylo
from biotite.application import VersionError
-from biotite.application.muscle import MuscleApp, Muscle5App
-from biotite.application.mafft import MafftApp
from biotite.application.clustalo import ClustalOmegaApp
-import numpy as np
-import pytest
-import shutil
-from ..util import is_not_installed
-
+from biotite.application.mafft import MafftApp
+from biotite.application.muscle import Muscle5App, MuscleApp
+from tests.util import is_not_installed
BIN_PATH = {
- MuscleApp : "muscle",
- Muscle5App : "muscle",
- MafftApp : "mafft",
- ClustalOmegaApp: "clustalo"
+ MuscleApp: "muscle",
+ Muscle5App: "muscle",
+ MafftApp: "mafft",
+ ClustalOmegaApp: "clustalo",
}
@pytest.fixture
def sequences():
- return [seq.ProteinSequence(string) for string in [
- "BIQTITE",
- "TITANITE",
- "BISMITE",
- "IQLITE"
-]]
-
-
-@pytest.mark.parametrize("app_cls, exp_ali, exp_order",
- [(MuscleApp,
- "BIQT-ITE\n"
- "TITANITE\n"
- "BISM-ITE\n"
- "-IQL-ITE",
- [1, 2, 0, 3]),
- (Muscle5App,
- "BI-QTITE\n"
- "TITANITE\n"
- "BI-SMITE\n"
- "-I-QLITE",
- [0, 3, 1, 2]),
- (MafftApp,
- "-BIQTITE\n"
- "TITANITE\n"
- "-BISMITE\n"
- "--IQLITE",
- [0, 3, 2, 1]),
- (ClustalOmegaApp,
- "-BIQTITE\n"
- "TITANITE\n"
- "-BISMITE\n"
- "--IQLITE",
- [1, 2, 0, 3])]
-)
+ return [
+ seq.ProteinSequence(string)
+ for string in ["BIQTITE", "TITANITE", "BISMITE", "IQLITE"]
+ ]
+
+
+@pytest.mark.parametrize(
+ "app_cls, exp_ali, exp_order",
+ [
+ (
+ MuscleApp,
+ "BIQT-ITE\n"
+ "TITANITE\n"
+ "BISM-ITE\n"
+ "-IQL-ITE",
+ [1, 2, 0, 3]
+ ),
+ (
+ Muscle5App,
+ "BI-QTITE\n"
+ "TITANITE\n"
+ "BI-SMITE\n"
+ "-I-QLITE",
+ [0, 3, 1, 2]
+ ),
+ (
+ MafftApp,
+ "-BIQTITE\n"
+ "TITANITE\n"
+ "-BISMITE\n"
+ "--IQLITE",
+ [0, 3, 2, 1]
+ ),
+ (
+ ClustalOmegaApp,
+ "-BIQTITE\n"
+ "TITANITE\n"
+ "-BISMITE\n"
+ "--IQLITE",
+ [1, 2, 0, 3]
+ )
+ ]
+) # fmt: skip
def test_msa(sequences, app_cls, exp_ali, exp_order):
"""
Test MSA software on short toy sequences with known alignment
@@ -72,7 +78,7 @@ def test_msa(sequences, app_cls, exp_ali, exp_order):
try:
app = app_cls(sequences)
except VersionError:
- pytest.skip(f"Invalid software version")
+ pytest.skip("Invalid software version")
app.start()
app.join()
alignment = app.get_alignment()
@@ -104,14 +110,13 @@ def test_large_sequence_number(app_cls):
try:
app = app_cls(sequences)
except VersionError:
- pytest.skip(f"Invalid software version")
+ pytest.skip("Invalid software version")
app.start()
app.join()
alignment = app.get_alignment()
# Expect completely matching sequences
- assert alignment.trace.tolist() == [
- [i]*SEQ_NUMBER for i in range(SEQ_LENGTH)
- ]
+ assert alignment.trace.tolist() == [[i] * SEQ_NUMBER for i in range(SEQ_LENGTH)]
+
def test_additional_options(sequences):
bin_path = BIN_PATH[ClustalOmegaApp]
@@ -120,15 +125,15 @@ def test_additional_options(sequences):
app1 = ClustalOmegaApp(sequences)
app1.start()
-
+
app2 = ClustalOmegaApp(sequences)
app2.add_additional_options(["--full"])
app2.start()
-
+
app1.join()
app2.join()
assert "--full" not in app1.get_command()
- assert "--full" in app2.get_command()
+ assert "--full" in app2.get_command()
assert app1.get_alignment() == app2.get_alignment()
@@ -137,7 +142,7 @@ def test_custom_substitution_matrix(sequences, app_cls):
bin_path = BIN_PATH[app_cls]
if is_not_installed(bin_path):
pytest.skip(f"'{bin_path}' is not installed")
-
+
alph = seq.ProteinSequence.alphabet
# Strong identity matrix
score_matrix = np.identity(len(alph)) * 1000
@@ -147,11 +152,11 @@ def test_custom_substitution_matrix(sequences, app_cls):
"TITANITE\n"
"BI-SMITE\n"
"-I-QLITE"
- )
+ ) # fmt: skip
try:
app = app_cls(sequences, matrix=matrix)
except VersionError:
- pytest.skip(f"Invalid software version")
+ pytest.skip("Invalid software version")
app.start()
app.join()
alignment = app.get_alignment()
@@ -165,21 +170,21 @@ def test_custom_sequence_type(app_cls):
bin_path = BIN_PATH[app_cls]
if is_not_installed(bin_path):
pytest.skip(f"'{bin_path}' is not installed")
-
+
alph = seq.Alphabet(("foo", "bar", 42))
sequences = [seq.GeneralSequence(alph, sequence) for sequence in [
["foo", "bar", 42, "foo", "foo", 42, 42],
["foo", 42, "foo", "bar", "foo", 42, 42],
- ]]
+ ]] # fmt: skip
exp_trace = [
- [ 0, 0],
- [ 1, -1],
- [ 2, 1],
- [ 3, 2],
- [-1, 3],
- [ 4, 4],
- [ 5, 5],
- [ 6, 6],
+ [0, 0],
+ [1, -1],
+ [2, 1],
+ [3, 2],
+ [-1, 3],
+ [4, 4],
+ [5, 5],
+ [6, 6],
]
# Strong identity matrix
score_matrix = np.identity(len(alph))
@@ -189,7 +194,7 @@ def test_custom_sequence_type(app_cls):
try:
app = app_cls(sequences, matrix=matrix)
except VersionError:
- pytest.skip(f"Invalid software version")
+ pytest.skip("Invalid software version")
app.start()
app.join()
alignment = app.get_alignment()
@@ -206,17 +211,17 @@ def test_invalid_sequence_type_no_matrix(app_cls):
bin_path = BIN_PATH[app_cls]
if is_not_installed(bin_path):
pytest.skip(f"'{bin_path}' is not installed")
-
+
alph = seq.Alphabet(("foo", "bar", 42))
sequences = [seq.GeneralSequence(alph, sequence) for sequence in [
["foo", "bar", 42, "foo", "foo", 42, 42],
["foo", 42, "foo", "bar", "foo", 42, 42],
- ]]
+ ]] # fmt: skip
with pytest.raises(TypeError):
try:
app_cls(sequences)
except VersionError:
- pytest.skip(f"Invalid software version")
+ pytest.skip("Invalid software version")
@pytest.mark.parametrize("app_cls", [MuscleApp, MafftApp, ClustalOmegaApp])
@@ -228,17 +233,20 @@ def test_invalid_sequence_type_unsuitable_alphabet(app_cls):
bin_path = BIN_PATH[app_cls]
if is_not_installed(bin_path):
pytest.skip(f"'{bin_path}' is not installed")
-
+
alph = seq.Alphabet(range(50))
- sequences = [seq.GeneralSequence(alph, sequence) for sequence in [
- [1,2,3],
- [1,2,3],
- ]]
+ sequences = [
+ seq.GeneralSequence(alph, sequence)
+ for sequence in [
+ [1, 2, 3],
+ [1, 2, 3],
+ ]
+ ]
with pytest.raises(TypeError):
try:
app_cls(sequences)
except VersionError:
- pytest.skip(f"Invalid software version")
+ pytest.skip("Invalid software version")
def test_invalid_muscle_version(sequences):
@@ -249,9 +257,9 @@ def test_invalid_muscle_version(sequences):
bin_path = BIN_PATH[MuscleApp]
if is_not_installed(bin_path):
pytest.skip(f"'{bin_path}' is not installed")
-
+
if is_not_installed("muscle"):
- pytest.skip(f"'muscle' is not installed")
+ pytest.skip("'muscle' is not installed")
with pytest.raises(VersionError):
MuscleApp(sequences)
@@ -262,13 +270,13 @@ def test_clustalo_matrix(sequences):
bin_path = BIN_PATH[ClustalOmegaApp]
if is_not_installed(bin_path):
pytest.skip(f"'{bin_path}' is not installed")
-
+
ref_matrix = [
[0, 1, 2, 3],
[1, 0, 1, 2],
[2, 1, 0, 1],
[3, 2, 1, 0]
- ]
+ ] # fmt: skip
app = ClustalOmegaApp(sequences)
app.full_matrix_calculation()
app.set_distance_matrix(np.array(ref_matrix))
@@ -282,7 +290,7 @@ def test_clustalo_tree(sequences):
bin_path = BIN_PATH[ClustalOmegaApp]
if is_not_installed(bin_path):
pytest.skip(f"'{bin_path}' is not installed")
-
+
leaves = [phylo.TreeNode(index=i) for i in range(len(sequences))]
inter1 = phylo.TreeNode([leaves[0], leaves[1]], [1.0, 1.0])
inter2 = phylo.TreeNode([leaves[2], leaves[3]], [2.5, 2.5])
@@ -305,7 +313,7 @@ def test_mafft_tree(sequences):
bin_path = BIN_PATH[MafftApp]
if is_not_installed(bin_path):
pytest.skip(f"'{bin_path}' is not installed")
-
+
app = MafftApp(sequences)
app.start()
app.join()
@@ -317,11 +325,11 @@ def test_muscle_tree(sequences):
bin_path = BIN_PATH[MuscleApp]
if is_not_installed(bin_path):
pytest.skip(f"'{bin_path}' is not installed")
-
+
try:
app = MuscleApp(sequences)
except VersionError:
- pytest.skip(f"Invalid software version")
+ pytest.skip("Invalid software version")
app.start()
app.join()
tree1 = app.get_guide_tree(iteration="kmer")
@@ -334,11 +342,11 @@ def test_muscle5_options(sequences):
bin_path = BIN_PATH[Muscle5App]
if is_not_installed(bin_path):
pytest.skip(f"'{bin_path}' is not installed")
-
+
try:
app = Muscle5App(sequences)
except VersionError:
- pytest.skip(f"Invalid software version")
+ pytest.skip("Invalid software version")
app.use_super5()
app.set_iterations(2, 100)
app.set_thread_number(2)
@@ -350,7 +358,9 @@ def test_muscle5_options(sequences):
assert "-threads" in app.get_command()
app.join()
- assert str(app.get_alignment()) == "BI-QTITE\n" \
- "TITANITE\n" \
- "BI-SMITE\n" \
- "-I-QLITE"
\ No newline at end of file
+ assert str(app.get_alignment()) == (
+ "BI-QTITE\n" \
+ "TITANITE\n" \
+ "BI-SMITE\n" \
+ "-I-QLITE"
+ ) # fmt: skip
diff --git a/tests/application/test_rnaalifold.py b/tests/application/test_rnaalifold.py
index f55b6bdb1..a432b65fe 100644
--- a/tests/application/test_rnaalifold.py
+++ b/tests/application/test_rnaalifold.py
@@ -7,7 +7,7 @@
import biotite.sequence as seq
import biotite.sequence.align as align
from biotite.application.viennarna import RNAalifoldApp
-from ..util import is_not_installed
+from tests.util import is_not_installed
@pytest.fixture
@@ -29,7 +29,7 @@ def sample_app():
is_not_installed("RNAalifold"), reason="RNAalifold is not installed"
)
def test_get_dot_bracket(sample_app):
- assert sample_app.get_dot_bracket() == "(((.((((.......)).)))))...."
+ assert sample_app.get_dot_bracket() == "(((.((((.......)).)))))...."
@pytest.mark.skipif(
@@ -38,19 +38,17 @@ def test_get_dot_bracket(sample_app):
def test_get_free_energy(sample_app):
assert sample_app.get_free_energy() == -1.3
+
@pytest.mark.skipif(
is_not_installed("RNAalifold"), reason="RNAalifold is not installed"
)
def test_get_base_pairs(sample_app):
- expected_basepairs = np.array([[ 0, 22],
- [ 1, 21],
- [ 2, 20],
- [ 4, 19],
- [ 5, 18],
- [ 6, 16],
- [ 7, 15]])
+ expected_basepairs = np.array(
+ [[0, 22], [1, 21], [2, 20], [4, 19], [5, 18], [6, 16], [7, 15]]
+ )
assert np.all(sample_app.get_base_pairs() == expected_basepairs)
+
@pytest.mark.skipif(
is_not_installed("RNAalifold"), reason="RNAalifold is not installed"
)
@@ -63,7 +61,7 @@ def test_constraints():
sequence = seq.NucleotideSequence("A" * 20)
matrix = align.SubstitutionMatrix.std_nucleotide_matrix()
alignment = align.align_ungapped(sequence, sequence, matrix)
-
+
# An arbitrary secondary structure
# The loop in the center must probably comprise at least 5 bases
# due to the dynamic programming algorithm
@@ -72,15 +70,18 @@ def test_constraints():
app = RNAalifoldApp(alignment)
app.set_constraints(
- pairs=np.stack([
- np.where(ref_dotbracket_array == "(")[0],
- np.where(ref_dotbracket_array == ")")[0][::-1]
- ], axis=-1),
- unpaired = (ref_dotbracket_array == "x"),
- enforce=True
+ pairs=np.stack(
+ [
+ np.where(ref_dotbracket_array == "(")[0],
+ np.where(ref_dotbracket_array == ")")[0][::-1],
+ ],
+ axis=-1,
+ ),
+ unpaired=(ref_dotbracket_array == "x"),
+ enforce=True,
)
app.start()
app.join()
test_dotbracket = app.get_dot_bracket()
- assert test_dotbracket == ref_dotbracket.replace("x", ".")
\ No newline at end of file
+ assert test_dotbracket == ref_dotbracket.replace("x", ".")
diff --git a/tests/application/test_rnafold.py b/tests/application/test_rnafold.py
index f8b0ccfd7..c40f16070 100644
--- a/tests/application/test_rnafold.py
+++ b/tests/application/test_rnafold.py
@@ -6,7 +6,7 @@
import pytest
import biotite.sequence as seq
from biotite.application.viennarna import RNAfoldApp
-from ..util import is_not_installed
+from tests.util import is_not_installed
@pytest.fixture
@@ -22,36 +22,25 @@ def sample_app():
return app
-@pytest.mark.skipif(
- is_not_installed("RNAfold"), reason="RNAfold is not installed"
-)
+@pytest.mark.skipif(is_not_installed("RNAfold"), reason="RNAfold is not installed")
def test_get_dot_bracket(sample_app):
- assert sample_app.get_dot_bracket() == "(((.((((.......)).)))))...."
+ assert sample_app.get_dot_bracket() == "(((.((((.......)).)))))...."
-@pytest.mark.skipif(
- is_not_installed("RNAfold"), reason="RNAfold is not installed"
-)
+@pytest.mark.skipif(is_not_installed("RNAfold"), reason="RNAfold is not installed")
def test_get_free_energy(sample_app):
assert sample_app.get_free_energy() == -1.3
-@pytest.mark.skipif(
- is_not_installed("RNAfold"), reason="RNAfold is not installed"
-)
+
+@pytest.mark.skipif(is_not_installed("RNAfold"), reason="RNAfold is not installed")
def test_get_base_pairs(sample_app):
- expected_basepairs = np.array([[ 0, 22],
- [ 1, 21],
- [ 2, 20],
- [ 4, 19],
- [ 5, 18],
- [ 6, 16],
- [ 7, 15]])
+ expected_basepairs = np.array(
+ [[0, 22], [1, 21], [2, 20], [4, 19], [5, 18], [6, 16], [7, 15]]
+ )
assert np.all(sample_app.get_base_pairs() == expected_basepairs)
-@pytest.mark.skipif(
- is_not_installed("RNAfold"), reason="RNAfold is not installed"
-)
+@pytest.mark.skipif(is_not_installed("RNAfold"), reason="RNAfold is not installed")
def test_constraints():
"""
Constrain every position of the input sequence and expect that the
@@ -59,7 +48,7 @@ def test_constraints():
"""
# Sequence should not matter
sequence = seq.NucleotideSequence("A" * 20)
-
+
# An arbitrary secondary structure
# The loop in the center must probably comprise at least 5 bases
# due to the dynamic programming algorithm
@@ -68,15 +57,18 @@ def test_constraints():
app = RNAfoldApp(sequence)
app.set_constraints(
- pairs=np.stack([
- np.where(ref_dotbracket_array == "(")[0],
- np.where(ref_dotbracket_array == ")")[0][::-1]
- ], axis=-1),
- unpaired = (ref_dotbracket_array == "x"),
- enforce=True
+ pairs=np.stack(
+ [
+ np.where(ref_dotbracket_array == "(")[0],
+ np.where(ref_dotbracket_array == ")")[0][::-1],
+ ],
+ axis=-1,
+ ),
+ unpaired=(ref_dotbracket_array == "x"),
+ enforce=True,
)
app.start()
app.join()
test_dotbracket = app.get_dot_bracket()
- assert test_dotbracket == ref_dotbracket.replace("x", ".")
\ No newline at end of file
+ assert test_dotbracket == ref_dotbracket.replace("x", ".")
diff --git a/tests/application/test_rnaplot.py b/tests/application/test_rnaplot.py
index 8810d1131..e0eb8649d 100644
--- a/tests/application/test_rnaplot.py
+++ b/tests/application/test_rnaplot.py
@@ -5,7 +5,7 @@
import numpy as np
import pytest
from biotite.application.viennarna import RNAplotApp
-from ..util import is_not_installed
+from tests.util import is_not_installed
@pytest.fixture
@@ -14,23 +14,24 @@ def sample_app():
Provide a `RNAplotApp` object, where *RNAplot* has been executed for
a sample structure.
"""
- app = RNAplotApp('((..))')
+ app = RNAplotApp("((..))")
app.start()
app.join()
return app
-@pytest.mark.skipif(
- is_not_installed("RNAplot"), reason="RNAplot is not installed"
-)
+@pytest.mark.skipif(is_not_installed("RNAplot"), reason="RNAplot is not installed")
def test_get_cooordinates(sample_app):
- assert (
- np.all(
- sample_app.get_coordinates() == np.array([[ -92.5 , 92.5 ],
- [ -92.5 , 77.5 ],
- [ -90.31, 58.24],
- [-109.69, 58.24],
- [-107.5 , 77.5 ],
- [-107.5 , 92.5 ]])
+ assert np.all(
+ sample_app.get_coordinates()
+ == np.array(
+ [
+ [-92.5, 92.5],
+ [-92.5, 77.5],
+ [-90.31, 58.24],
+ [-109.69, 58.24],
+ [-107.5, 77.5],
+ [-107.5, 92.5],
+ ]
)
)
diff --git a/tests/application/test_sra.py b/tests/application/test_sra.py
index 78b471538..7728ae33a 100644
--- a/tests/application/test_sra.py
+++ b/tests/application/test_sra.py
@@ -6,16 +6,14 @@
from os.path import join
from tempfile import gettempdir
import pytest
-from biotite.application.sra import FastqDumpApp, FastaDumpApp
-from biotite.sequence.io.fastq import FastqFile
+from biotite.application.sra import FastaDumpApp, FastqDumpApp
from biotite.sequence.io.fasta import FastaFile
+from biotite.sequence.io.fastq import FastqFile
@pytest.mark.parametrize(
- "app_class, custom_prefix", itertools.product(
- [FastqDumpApp, FastaDumpApp],
- [False, True]
- )
+ "app_class, custom_prefix",
+ itertools.product([FastqDumpApp, FastaDumpApp], [False, True]),
)
def test_objects(app_class, custom_prefix):
"""
@@ -45,10 +43,8 @@ def test_objects(app_class, custom_prefix):
@pytest.mark.parametrize(
- "app_class, custom_prefix", itertools.product(
- [FastqDumpApp, FastaDumpApp],
- [False, True]
- )
+ "app_class, custom_prefix",
+ itertools.product([FastqDumpApp, FastaDumpApp], [False, True]),
)
def test_classmethod(app_class, custom_prefix):
"""
diff --git a/tests/application/test_tantan.py b/tests/application/test_tantan.py
index dd88abd66..91bce56b8 100644
--- a/tests/application/test_tantan.py
+++ b/tests/application/test_tantan.py
@@ -7,24 +7,20 @@
import biotite.sequence as seq
import biotite.sequence.align as align
from biotite.application.tantan import TantanApp
-from ..util import is_not_installed
+from tests.util import is_not_installed
+
@pytest.fixture
def simple_matrix():
alph = seq.NucleotideSequence.alphabet_unamb
return align.SubstitutionMatrix(
- alph, alph, np.array(
- [[ 1, -1, -1, -1],
- [-1, 1, -1, -1],
- [-1, -1, 1, -1],
- [-1, -1, -1, 1]]
- )
+ alph,
+ alph,
+ np.array([[1, -1, -1, -1], [-1, 1, -1, -1], [-1, -1, 1, -1], [-1, -1, -1, 1]]),
)
-@pytest.mark.skipif(
- is_not_installed("tantan"), reason="tantan is not installed"
-)
+@pytest.mark.skipif(is_not_installed("tantan"), reason="tantan is not installed")
@pytest.mark.parametrize("use_custom_matrix", [False, True])
def test_nucleotide(simple_matrix, use_custom_matrix):
"""
@@ -45,9 +41,7 @@ def test_nucleotide(simple_matrix, use_custom_matrix):
assert test_mask.tolist() == ref_mask
-@pytest.mark.skipif(
- is_not_installed("tantan"), reason="tantan is not installed"
-)
+@pytest.mark.skipif(is_not_installed("tantan"), reason="tantan is not installed")
@pytest.mark.parametrize("use_custom_matrix", [False, True])
def test_protein(use_custom_matrix):
"""
@@ -68,16 +62,14 @@ def test_protein(use_custom_matrix):
assert test_mask.tolist() == ref_mask
-@pytest.mark.skipif(
- is_not_installed("tantan"), reason="tantan is not installed"
-)
+@pytest.mark.skipif(is_not_installed("tantan"), reason="tantan is not installed")
def test_multiple_sequences():
"""
Test masking multiple sequences in a single run.
"""
seq_strings = [
"CANYQVcanacanasacannercancanACAN",
- "NEARAnearanearerearanearlyeerieear"
+ "NEARAnearanearerearanearlyeerieear",
]
sequences = [seq.ProteinSequence(seq_string) for seq_string in seq_strings]
@@ -91,4 +83,4 @@ def test_multiple_sequences():
assert len(test_masks) == len(ref_masks)
for test_mask, ref_mask in zip(test_masks, ref_masks):
- assert test_mask.tolist() == ref_mask
\ No newline at end of file
+ assert test_mask.tolist() == ref_mask
diff --git a/tests/conftest.py b/tests/conftest.py
index af3b9597b..7701902e0 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -2,8 +2,6 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-import pytest
-import numpy as np
def pytest_sessionstart(session):
"""
@@ -13,10 +11,11 @@ def pytest_sessionstart(session):
try:
import numpy as np
import pyximport
+
pyximport.install(
build_in_temp=False,
- setup_args={"include_dirs":np.get_include()},
- language_level=3
+ setup_args={"include_dirs": np.get_include()},
+ language_level=3,
)
except ImportError:
- pass
\ No newline at end of file
+ pass
diff --git a/tests/database/test_entrez.py b/tests/database/test_entrez.py
index a0c4dee44..bc1e94f34 100644
--- a/tests/database/test_entrez.py
+++ b/tests/database/test_entrez.py
@@ -4,40 +4,29 @@
import itertools
import tempfile
-import numpy as np
-from requests.exceptions import ConnectionError
import pytest
import biotite.database.entrez as entrez
import biotite.sequence.io.fasta as fasta
from biotite.database import RequestError
-from ..util import cannot_connect_to
-
+from tests.util import cannot_connect_to
NCBI_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/"
-@pytest.mark.skipif(
- cannot_connect_to(NCBI_URL),
- reason="NCBI Entrez is not available"
-)
+@pytest.mark.skipif(cannot_connect_to(NCBI_URL), reason="NCBI Entrez is not available")
@pytest.mark.parametrize(
- "common_name, as_file_like",
- itertools.product([False, True], [False, True])
+ "common_name, as_file_like", itertools.product([False, True], [False, True])
)
def test_fetch(common_name, as_file_like):
path = None if as_file_like else tempfile.gettempdir()
db_name = "Protein" if common_name else "protein"
- file = entrez.fetch(
- "1L2Y_A", path, "fa", db_name, "fasta", overwrite=True
- )
+ file = entrez.fetch("1L2Y_A", path, "fa", db_name, "fasta", overwrite=True)
fasta_file = fasta.FastaFile.read(file)
prot_seq = fasta.get_sequence(fasta_file)
assert len(prot_seq) == 20
-@pytest.mark.skipif(
- cannot_connect_to(NCBI_URL),
- reason="NCBI Entrez is not available"
-)
+
+@pytest.mark.skipif(cannot_connect_to(NCBI_URL), reason="NCBI Entrez is not available")
@pytest.mark.parametrize("as_file_like", [False, True])
def test_fetch_single_file(as_file_like):
if as_file_like:
@@ -45,7 +34,7 @@ def test_fetch_single_file(as_file_like):
else:
file = tempfile.NamedTemporaryFile("r", suffix=".fa")
file_name = file.name
-
+
downloaded_file_name = entrez.fetch_single_file(
["1L2Y_A", "3O5R_A"], file_name, "protein", "fasta"
)
@@ -56,17 +45,12 @@ def test_fetch_single_file(as_file_like):
if not as_file_like:
file.close()
-@pytest.mark.skipif(
- cannot_connect_to(NCBI_URL),
- reason="NCBI Entrez is not available"
-)
+
+@pytest.mark.skipif(cannot_connect_to(NCBI_URL), reason="NCBI Entrez is not available")
def test_fetch_invalid():
with pytest.raises(RequestError):
# Empty ID list
- file = entrez.fetch_single_file(
- [], None, "protein", "fasta", overwrite=True)
+ entrez.fetch_single_file([], None, "protein", "fasta", overwrite=True)
with pytest.raises(RequestError):
# Nonexisting ID
- file = entrez.fetch(
- "xxxx", None, "fa", "protein", "fasta", overwrite=True
- )
\ No newline at end of file
+ entrez.fetch("xxxx", None, "fa", "protein", "fasta", overwrite=True)
diff --git a/tests/database/test_pubchem.py b/tests/database/test_pubchem.py
index 8c26a1ddc..ed84809e3 100644
--- a/tests/database/test_pubchem.py
+++ b/tests/database/test_pubchem.py
@@ -2,27 +2,22 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-import re
import itertools
+import re
import tempfile
-import pytest
import numpy as np
+import pytest
import biotite.database.pubchem as pubchem
import biotite.structure.io.mol as mol
from biotite.database import RequestError
-from ..util import cannot_connect_to
-
+from tests.util import cannot_connect_to
PUBCHEM_URL = "https://pubchem.ncbi.nlm.nih.gov/"
-@pytest.mark.skipif(
- cannot_connect_to(PUBCHEM_URL),
- reason="Pubchem is not available"
-)
+@pytest.mark.skipif(cannot_connect_to(PUBCHEM_URL), reason="Pubchem is not available")
@pytest.mark.parametrize(
- "format, as_file_like",
- itertools.product(["sdf", "png"], [False, True])
+ "format, as_file_like", itertools.product(["sdf", "png"], [False, True])
)
def test_fetch(format, as_file_like):
"""
@@ -39,10 +34,7 @@ def test_fetch(format, as_file_like):
mol_file.get_structure()
-@pytest.mark.skipif(
- cannot_connect_to(PUBCHEM_URL),
- reason="PubChem is not available"
-)
+@pytest.mark.skipif(cannot_connect_to(PUBCHEM_URL), reason="PubChem is not available")
@pytest.mark.parametrize("as_structural_formula", [False, True])
def test_fetch_structural_formula(as_structural_formula):
"""
@@ -52,9 +44,9 @@ def test_fetch_structural_formula(as_structural_formula):
"""
CID = 2244
- mol_file = mol.MOLFile.read(pubchem.fetch(
- 2244, as_structural_formula=as_structural_formula
- ))
+ mol_file = mol.MOLFile.read(
+ pubchem.fetch(CID, as_structural_formula=as_structural_formula)
+ )
atoms = mol_file.get_structure()
if as_structural_formula:
@@ -63,10 +55,7 @@ def test_fetch_structural_formula(as_structural_formula):
assert np.any(atoms.coord[:, 2] != 0)
-@pytest.mark.skipif(
- cannot_connect_to(PUBCHEM_URL),
- reason="PubChem is not available"
-)
+@pytest.mark.skipif(cannot_connect_to(PUBCHEM_URL), reason="PubChem is not available")
def test_fetch_invalid():
"""
An exception is expected when the CID is not available.
@@ -77,10 +66,7 @@ def test_fetch_invalid():
pubchem.fetch(1234567890)
-@pytest.mark.skipif(
- cannot_connect_to(PUBCHEM_URL),
- reason="PubChem is not available"
-)
+@pytest.mark.skipif(cannot_connect_to(PUBCHEM_URL), reason="PubChem is not available")
@pytest.mark.parametrize(
"query, ref_ids",
[
@@ -89,7 +75,7 @@ def test_fetch_invalid():
(pubchem.InchiQuery("InChI=1S/C4H10/c1-3-4-2/h3-4H2,1-2H3"), [7843]),
(pubchem.InchiKeyQuery("IJDNQMDRQITEOD-UHFFFAOYSA-N"), [7843]),
],
- ids=["NameQuery", "SmilesQuery", "InchiQuery", "InchiKeyQuery"]
+ ids=["NameQuery", "SmilesQuery", "InchiQuery", "InchiKeyQuery"],
)
def test_search_simple(query, ref_ids):
"""
@@ -102,10 +88,7 @@ def test_search_simple(query, ref_ids):
assert set(ref_ids).issubset(pubchem.search(query))
-@pytest.mark.skipif(
- cannot_connect_to(PUBCHEM_URL),
- reason="PubChem is not available"
-)
+@pytest.mark.skipif(cannot_connect_to(PUBCHEM_URL), reason="PubChem is not available")
def test_search_formula():
"""
Download a structure and search for its molecular formula in
@@ -115,23 +98,17 @@ def test_search_formula():
CID = 101608985
atoms = mol.MOLFile.read(pubchem.fetch(CID)).get_structure()
- test_cids = pubchem.search(
- pubchem.FormulaQuery.from_atoms(atoms)
- )
+ test_cids = pubchem.search(pubchem.FormulaQuery.from_atoms(atoms))
assert CID in (test_cids)
-@pytest.mark.skipif(
- cannot_connect_to(PUBCHEM_URL),
- reason="PubChem is not available"
-)
+@pytest.mark.skipif(cannot_connect_to(PUBCHEM_URL), reason="PubChem is not available")
@pytest.mark.parametrize(
- "cid, from_atoms, query_type", itertools.product(
- [2244],
- [False, True],
- [pubchem.SuperstructureQuery, pubchem.SubstructureQuery]
- )
+ "cid, from_atoms, query_type",
+ itertools.product(
+ [2244], [False, True], [pubchem.SuperstructureQuery, pubchem.SubstructureQuery]
+ ),
)
def test_search_super_and_substructure(cid, from_atoms, query_type):
"""
@@ -170,16 +147,9 @@ def test_search_super_and_substructure(cid, from_atoms, query_type):
assert atoms.array_length() >= original_atoms.array_length()
-@pytest.mark.skipif(
- cannot_connect_to(PUBCHEM_URL),
- reason="PubChem is not available"
-)
+@pytest.mark.skipif(cannot_connect_to(PUBCHEM_URL), reason="PubChem is not available")
@pytest.mark.parametrize(
- "conformation_based, from_atoms",
- itertools.product(
- [False, True],
- [False, True]
- )
+ "conformation_based, from_atoms", itertools.product([False, True], [False, True])
)
def test_search_similarity(conformation_based, from_atoms):
"""
@@ -192,8 +162,7 @@ def test_search_similarity(conformation_based, from_atoms):
if from_atoms:
original_atoms = mol.MOLFile.read(pubchem.fetch(CID)).get_structure()
query = pubchem.SimilarityQuery.from_atoms(
- original_atoms, threshold=1.0,
- conformation_based=conformation_based
+ original_atoms, threshold=1.0, conformation_based=conformation_based
)
else:
query = pubchem.SimilarityQuery(
@@ -204,10 +173,7 @@ def test_search_similarity(conformation_based, from_atoms):
assert CID in cids
-@pytest.mark.skipif(
- cannot_connect_to(PUBCHEM_URL),
- reason="PubChem is not available"
-)
+@pytest.mark.skipif(cannot_connect_to(PUBCHEM_URL), reason="PubChem is not available")
@pytest.mark.parametrize("from_atoms", [False, True])
def test_search_identity(from_atoms):
"""
@@ -222,4 +188,4 @@ def test_search_identity(from_atoms):
query = pubchem.IdentityQuery(cid=CID)
cids = pubchem.search(query)
- assert cids == [CID]
\ No newline at end of file
+ assert cids == [CID]
diff --git a/tests/database/test_rcsb.py b/tests/database/test_rcsb.py
index b05aa3cef..ec37d6b5a 100644
--- a/tests/database/test_rcsb.py
+++ b/tests/database/test_rcsb.py
@@ -2,32 +2,28 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-from os.path import join
import itertools
import tempfile
-import pytest
+from os.path import join
import numpy as np
+import pytest
import biotite.database.rcsb as rcsb
+import biotite.sequence.align as align
+import biotite.sequence.io.fasta as fasta
import biotite.structure.io.pdb as pdb
import biotite.structure.io.pdbx as pdbx
-import biotite.sequence.io.fasta as fasta
-import biotite.sequence.align as align
from biotite.database import RequestError
-from ..util import cannot_connect_to, data_dir
-
+from tests.util import cannot_connect_to, data_dir
RCSB_URL = "https://www.rcsb.org/"
# Search term that should only find the entry 1L2Y
TC5B_TERM = "Miniprotein Construct TC5b"
-@pytest.mark.skipif(
- cannot_connect_to(RCSB_URL),
- reason="RCSB PDB is not available"
-)
+@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available")
@pytest.mark.parametrize(
"format, as_file_like",
- itertools.product(["pdb", "cif", "bcif", "fasta"], [False, True])
+ itertools.product(["pdb", "cif", "bcif", "fasta"], [False, True]),
)
def test_fetch(format, as_file_like):
path = None if as_file_like else tempfile.gettempdir()
@@ -47,16 +43,11 @@ def test_fetch(format, as_file_like):
assert len(fasta.get_sequences(file)) > 0
-@pytest.mark.skipif(
- cannot_connect_to(RCSB_URL),
- reason="RCSB PDB is not available"
-)
+@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available")
@pytest.mark.parametrize("format", ["pdb", "cif", "bcif", "fasta"])
def test_fetch_invalid(format):
with pytest.raises(RequestError):
- rcsb.fetch(
- "xxxx", format, tempfile.gettempdir(), overwrite=True
- )
+ rcsb.fetch("xxxx", format, tempfile.gettempdir(), overwrite=True)
def test_search_basic():
@@ -72,58 +63,78 @@ def test_search_basic():
"pdbx_serial_crystallography_sample_delivery_injection.preparation",
False,
{},
- ["6IG7", "6IG6", "7JRI", "7JR5", "7QX4", "7QX5", "7QX6", "7QX7",
- "8A2O", "8A2P"]
+ [
+ "6IG7",
+ "6IG6",
+ "7JRI",
+ "7JR5",
+ "7QX4",
+ "7QX5",
+ "7QX6",
+ "7QX7",
+ "8A2O",
+ "8A2P",
+ ],
),
(
"audit_author.name",
False,
{"is_in": ["Neidigh, J.W."]},
- ["1JRJ", "1L2Y", "2O3P", "2O63", "2O64", "2O65"]
+ ["1JRJ", "1L2Y", "2O3P", "2O63", "2O64", "2O65"],
),
(
"rcsb_entity_source_organism.rcsb_gene_name.value",
False,
{"exact_match": "lacA"},
- ["5JUV", "1KQA", "1KRV", "1KRU", "1KRR", "3U7V", "4IUG", "4LFK",
- "4LFL", "4LFM", "4LFN", "5IFP", "5IFT", "5IHR", "4DUW", "5MGD",
- "5MGC"]
+ [
+ "5JUV",
+ "1KQA",
+ "1KRV",
+ "1KRU",
+ "1KRR",
+ "3U7V",
+ "4IUG",
+ "4LFK",
+ "4LFL",
+ "4LFM",
+ "4LFN",
+ "5IFP",
+ "5IFT",
+ "5IHR",
+ "4DUW",
+ "5MGD",
+ "5MGC",
+ ],
),
(
"struct.title",
False,
{"contains_words": "tc5b"},
- ["1L2Y", "8ANH", "8ANM", "8ANG", "8ANI"]
+ ["1L2Y", "8ANH", "8ANM", "8ANG", "8ANI"],
),
(
"reflns.d_resolution_high",
False,
{"less_or_equal": 0.6},
- ["1EJG", "1I0T", "3NIR", "3P4J", "5D8V", "5NW3", "4JLJ", "7ATG",
- "7R0H"]
+ ["1EJG", "1I0T", "3NIR", "3P4J", "5D8V", "5NW3", "4JLJ", "7ATG", "7R0H"],
),
(
"rcsb_entry_info.deposited_model_count",
False,
{"range_closed": (60, 61)},
- ["1BBO", "1GB1", "1O5P", "1XU6", "2LUM", "2NO8"]
+ ["1BBO", "1GB1", "1O5P", "1XU6", "2LUM", "2NO8"],
),
(
"rcsb_id",
True,
{"exact_match": "AIN"},
- ["1OXR", "1TGM", "3IAZ", "3GCL", "6MQF", "2QQT", "4NSB", "8J3W"]
+ ["1OXR", "1TGM", "3IAZ", "3GCL", "6MQF", "2QQT", "4NSB", "8J3W"],
),
- ]
-)
-@pytest.mark.skipif(
- cannot_connect_to(RCSB_URL),
- reason="RCSB PDB is not available"
+ ],
)
+@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available")
def test_search_field(field, molecular_definition, params, ref_ids):
- query = rcsb.FieldQuery(
- field, molecular_definition, **params
- )
+ query = rcsb.FieldQuery(field, molecular_definition, **params)
test_ids = rcsb.search(query)
test_count = rcsb.count(query)
@@ -131,17 +142,12 @@ def test_search_field(field, molecular_definition, params, ref_ids):
assert test_count == len(ref_ids)
-@pytest.mark.skipif(
- cannot_connect_to(RCSB_URL),
- reason="RCSB PDB is not available"
-)
+@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available")
def test_search_sequence():
IDENTIY_CUTOFF = 0.9
pdbx_file = pdbx.BinaryCIFFile.read(join(data_dir("structure"), "1l2y.bcif"))
- ref_sequence = pdbx.get_sequence(pdbx_file)['A']
- query = rcsb.SequenceQuery(
- ref_sequence, "protein", min_identity=IDENTIY_CUTOFF
- )
+ ref_sequence = pdbx.get_sequence(pdbx_file)["A"]
+ query = rcsb.SequenceQuery(ref_sequence, "protein", min_identity=IDENTIY_CUTOFF)
test_ids = rcsb.search(query)
assert len(test_ids) >= 2
@@ -156,20 +162,14 @@ def test_search_sequence():
assert identity >= IDENTIY_CUTOFF
-@pytest.mark.skipif(
- cannot_connect_to(RCSB_URL),
- reason="RCSB PDB is not available"
-)
+@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available")
def test_search_structure():
query = rcsb.StructureQuery("1L2Y", chain="A")
test_ids = rcsb.search(query)
assert "1L2Y" in test_ids
-@pytest.mark.skipif(
- cannot_connect_to(RCSB_URL),
- reason="RCSB PDB is not available"
-)
+@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available")
def test_search_motif():
# motif is taken from official RCSB search API tutorial
MOTIF = "C-x(2,4)-C-x(3)-[LIVMFYWC]-x(8)-H-x(3,5)-H."
@@ -178,25 +178,18 @@ def test_search_motif():
assert test_count == pytest.approx(639, rel=0.1)
-@pytest.mark.skipif(
- cannot_connect_to(RCSB_URL),
- reason="RCSB PDB is not available"
-)
+@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available")
def test_search_composite():
query1 = rcsb.FieldQuery(
- "rcsb_entity_host_organism.scientific_name",
- exact_match="Homo sapiens"
- )
- query2 = rcsb.FieldQuery(
- "exptl.method",
- exact_match="SOLUTION NMR"
+ "rcsb_entity_host_organism.scientific_name", exact_match="Homo sapiens"
)
+ query2 = rcsb.FieldQuery("exptl.method", exact_match="SOLUTION NMR")
ids_1 = set(rcsb.search(query1))
ids_2 = set(rcsb.search(query2))
ids_or = set(rcsb.search(query1 | query2))
ids_and = set(rcsb.search(query1 & query2))
- assert ids_or == ids_1 | ids_2
+ assert ids_or == ids_1 | ids_2
assert ids_and == ids_1 & ids_2
@@ -209,26 +202,19 @@ def test_search_composite():
("non_polymer_entity", [] ),
("polymer_instance", ["1L2Y.A"]),
]
-)
-@pytest.mark.skipif(
- cannot_connect_to(RCSB_URL),
- reason="RCSB PDB is not available"
-)
+) # fmt: skip
+@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available")
def test_search_return_type(return_type, expected):
query = rcsb.BasicQuery(TC5B_TERM)
assert rcsb.search(query, return_type) == expected
assert rcsb.count(query, return_type) == len(expected)
-@pytest.mark.skipif(
- cannot_connect_to(RCSB_URL),
- reason="RCSB PDB is not available"
-)
+@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available")
@pytest.mark.parametrize("seed", np.arange(5))
def test_search_range(seed):
query = rcsb.FieldQuery(
- "rcsb_entity_host_organism.scientific_name",
- exact_match="Homo sapiens"
+ "rcsb_entity_host_organism.scientific_name", exact_match="Homo sapiens"
)
count = rcsb.count(query)
ref_entries = rcsb.search(query)
@@ -241,15 +227,11 @@ def test_search_range(seed):
assert test_entries == ref_entries[range[0] : range[1]]
-@pytest.mark.skipif(
- cannot_connect_to(RCSB_URL),
- reason="RCSB PDB is not available"
-)
+@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available")
@pytest.mark.parametrize("as_sorting_object", [False, True])
def test_search_sort(as_sorting_object):
query = rcsb.FieldQuery(
- "rcsb_entity_host_organism.scientific_name",
- exact_match="Homo sapiens"
+ "rcsb_entity_host_organism.scientific_name", exact_match="Homo sapiens"
)
if as_sorting_object:
sort_by = rcsb.Sorting("reflns.d_resolution_high", descending=False)
@@ -270,20 +252,18 @@ def test_search_sort(as_sorting_object):
assert resolutions == list(reversed(sorted(resolutions)))
-@pytest.mark.skipif(
- cannot_connect_to(RCSB_URL),
- reason="RCSB PDB is not available"
-)
+@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available")
def test_search_content_types():
# Query to limit the number of returned results
# for improved performance
query = rcsb.FieldQuery(
- "rcsb_entity_host_organism.scientific_name",
- exact_match="Homo sapiens"
+ "rcsb_entity_host_organism.scientific_name", exact_match="Homo sapiens"
)
- experimental_set = set(rcsb.search(query, content_types=["experimental"]))
+ experimental_set = set(rcsb.search(query, content_types=["experimental"]))
computational_set = set(rcsb.search(query, content_types=["computational"]))
- combined_set = set(rcsb.search(query, content_types=["experimental", "computational"]))
+ combined_set = set(
+ rcsb.search(query, content_types=["experimental", "computational"])
+ )
# If there are no results, the following tests make no sense
assert len(combined_set) > 0
@@ -294,7 +274,9 @@ def test_search_content_types():
assert rcsb.count(query, content_types=["experimental"]) == len(experimental_set)
assert rcsb.count(query, content_types=["computational"]) == len(computational_set)
- assert rcsb.count(query, content_types=["experimental", "computational"]) == len(combined_set)
+ assert rcsb.count(query, content_types=["experimental", "computational"]) == len(
+ combined_set
+ )
# Expect an exception if no content_type
with pytest.raises(ValueError):
@@ -303,10 +285,7 @@ def test_search_content_types():
rcsb.count(query, content_types=[])
-@pytest.mark.skipif(
- cannot_connect_to(RCSB_URL),
- reason="RCSB PDB is not available"
-)
+@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available")
@pytest.mark.parametrize(
"grouping, resolution_threshold, return_type, ref_groups",
[
@@ -316,79 +295,65 @@ def test_search_content_types():
),
0.7,
"polymer_entity",
- set([
- ("3X2M_1",),
- ("6E6O_1",),
- ("1YK4_1",),
- ("5NW3_1",),
- ("1US0_1",),
- ("4HP2_1",),
- ("2DSX_1",),
- ("2VB1_1",),
- ("7VOS_1", "5D8V_1", "3A38_1"),
- ("1UCS_1",),
- ("3NIR_1", "1EJG_1"),
- ])
+ set(
+ [
+ ("3X2M_1",),
+ ("6E6O_1",),
+ ("1YK4_1",),
+ ("5NW3_1",),
+ ("1US0_1",),
+ ("4HP2_1",),
+ ("2DSX_1",),
+ ("2VB1_1",),
+ ("7VOS_1", "5D8V_1", "3A38_1"),
+ ("1UCS_1",),
+ ("3NIR_1", "1EJG_1"),
+ ]
+ ),
),
-
(
- rcsb.UniprotGrouping(
- sort_by="rcsb_accession_info.initial_release_date"
- ),
+ rcsb.UniprotGrouping(sort_by="rcsb_accession_info.initial_release_date"),
0.7,
"polymer_entity",
- set([
- ("3X2M_1",),
- ("6E6O_1",),
- ("1YK4_1",),
- ("5NW3_1",),
- ("1US0_1",),
- ("4HP2_1",),
- ("2DSX_1",),
- ("2VB1_1",),
- ("7VOS_1", "5D8V_1", "3A38_1"),
- ("1UCS_1",),
- ("3NIR_1", "1EJG_1"),
- ])
+ set(
+ [
+ ("3X2M_1",),
+ ("6E6O_1",),
+ ("1YK4_1",),
+ ("5NW3_1",),
+ ("1US0_1",),
+ ("4HP2_1",),
+ ("2DSX_1",),
+ ("2VB1_1",),
+ ("7VOS_1", "5D8V_1", "3A38_1"),
+ ("1UCS_1",),
+ ("3NIR_1", "1EJG_1"),
+ ]
+ ),
),
-
(
- rcsb.DepositGrouping(
- sort_by="rcsb_accession_info.initial_release_date"
- ),
+ rcsb.DepositGrouping(sort_by="rcsb_accession_info.initial_release_date"),
0.9,
"entry",
- set([
- ("5R32",),
- ("5RDH", "5RBR"),
- ("7G0Z", "7FXV")
- ])
- )
- ]
+ set([("5R32",), ("5RDH", "5RBR"), ("7G0Z", "7FXV")]),
+ ),
+ ],
)
-def test_search_grouping(grouping, resolution_threshold, return_type,
- ref_groups):
+def test_search_grouping(grouping, resolution_threshold, return_type, ref_groups):
"""
Check whether the same result as in a known example is achieved.
"""
- query = (
- rcsb.FieldQuery(
- "exptl.method",
- exact_match="X-RAY DIFFRACTION"
- )
- & rcsb.FieldQuery(
- "rcsb_entry_info.resolution_combined",
- range_closed=(0.0, resolution_threshold)
- )
+ query = rcsb.FieldQuery(
+ "exptl.method", exact_match="X-RAY DIFFRACTION"
+ ) & rcsb.FieldQuery(
+ "rcsb_entry_info.resolution_combined", range_closed=(0.0, resolution_threshold)
)
- test_groups = list(rcsb.search(
- query, return_type,
- group_by=grouping, return_groups=True
- ).values())
+ test_groups = list(
+ rcsb.search(query, return_type, group_by=grouping, return_groups=True).values()
+ )
test_representatives = rcsb.search(
- query, return_type,
- group_by=grouping, return_groups=False
+ query, return_type, group_by=grouping, return_groups=False
)
test_count = rcsb.count(query, return_type, group_by=grouping)
@@ -398,10 +363,7 @@ def test_search_grouping(grouping, resolution_threshold, return_type,
assert test_count == len(ref_groups)
-@pytest.mark.skipif(
- cannot_connect_to(RCSB_URL),
- reason="RCSB PDB is not available"
-)
+@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available")
def test_search_empty():
query = rcsb.BasicQuery("This will not match any ID")
assert rcsb.search(query) == []
@@ -410,21 +372,9 @@ def test_search_empty():
@pytest.mark.parametrize(
"field, params",
- [
- (
- "invalid.field",
- {"exact_match": "Some Value"}
- ),
- (
- "exptl.method",
- {"less": 5}
- )
- ]
-)
-@pytest.mark.skipif(
- cannot_connect_to(RCSB_URL),
- reason="RCSB PDB is not available"
+ [("invalid.field", {"exact_match": "Some Value"}), ("exptl.method", {"less": 5})],
)
+@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available")
def test_search_invalid(field, params):
invalid_query = rcsb.FieldQuery(field, **params)
with pytest.raises(RequestError, match="400"):
diff --git a/tests/database/test_uniprot.py b/tests/database/test_uniprot.py
index 53c12e60b..7af70393a 100644
--- a/tests/database/test_uniprot.py
+++ b/tests/database/test_uniprot.py
@@ -8,76 +8,51 @@
import biotite.database.uniprot as uniprot
import biotite.sequence.io.fasta as fasta
from biotite.database import RequestError
-from ..util import cannot_connect_to
-
+from tests.util import cannot_connect_to
UNIPROT_URL = "https://www.uniprot.org/"
-@pytest.mark.skipif(
- cannot_connect_to(UNIPROT_URL),
- reason="UniProt is not available"
-)
-@pytest.mark.parametrize(
- "as_file_like",
- itertools.product([False, True])
-)
+@pytest.mark.skipif(cannot_connect_to(UNIPROT_URL), reason="UniProt is not available")
+@pytest.mark.parametrize("as_file_like", itertools.product([False, True]))
def test_fetch(as_file_like):
path = None if as_file_like else tempfile.gettempdir()
# UniProtKB
- file = uniprot.fetch(
- "P12345", "fasta", path, overwrite=True
- )
+ file = uniprot.fetch("P12345", "fasta", path, overwrite=True)
fasta_file = fasta.FastaFile.read(file)
prot_seq = fasta.get_sequence(fasta_file)
assert len(prot_seq) == 430
# UniRef
- file = uniprot.fetch(
- "UniRef90_P99999", "fasta", path, overwrite=True
- )
+ file = uniprot.fetch("UniRef90_P99999", "fasta", path, overwrite=True)
fasta_file = fasta.FastaFile.read(file)
prot_seq = fasta.get_sequence(fasta_file)
assert len(prot_seq) == 105
# UniParc
- file = uniprot.fetch(
- "UPI000000001F", "fasta", path, overwrite=True
- )
+ file = uniprot.fetch("UPI000000001F", "fasta", path, overwrite=True)
fasta_file = fasta.FastaFile.read(file)
prot_seq = fasta.get_sequence(fasta_file)
assert len(prot_seq) == 551
-@pytest.mark.skipif(
- cannot_connect_to(UNIPROT_URL),
- reason="UniProt is not available"
-)
+@pytest.mark.skipif(cannot_connect_to(UNIPROT_URL), reason="UniProt is not available")
@pytest.mark.parametrize("format", ["fasta", "gff", "txt", "xml", "rdf", "tab"])
def test_fetch_invalid(format):
with pytest.raises(RequestError):
- file = uniprot.fetch(
- "xxxx", format, tempfile.gettempdir(), overwrite=True
- )
+ uniprot.fetch("xxxx", format, tempfile.gettempdir(), overwrite=True)
-@pytest.mark.skipif(
- cannot_connect_to(UNIPROT_URL),
- reason="UniProt is not available"
-)
+@pytest.mark.skipif(cannot_connect_to(UNIPROT_URL), reason="UniProt is not available")
def test_search_simple():
query = uniprot.SimpleQuery("accession", "P12345")
- assert uniprot.search(query) \
- == ['P12345']
+ assert uniprot.search(query) == ["P12345"]
-@pytest.mark.skipif(
- cannot_connect_to(UNIPROT_URL),
- reason="UniProt is not available"
-)
+@pytest.mark.skipif(cannot_connect_to(UNIPROT_URL), reason="UniProt is not available")
def test_search_composite():
- query = uniprot.SimpleQuery("accession", "P12345") & uniprot.SimpleQuery("reviewed", "true")
- assert uniprot.search(query) \
- == ['P12345']
-
+ query = uniprot.SimpleQuery("accession", "P12345") & uniprot.SimpleQuery(
+ "reviewed", "true"
+ )
+ assert uniprot.search(query) == ["P12345"]
diff --git a/tests/sequence/align/util.py b/tests/sequence/align/conftest.py
similarity index 91%
rename from tests/sequence/align/util.py
rename to tests/sequence/align/conftest.py
index 191fbde6f..3320f4255 100644
--- a/tests/sequence/align/util.py
+++ b/tests/sequence/align/conftest.py
@@ -6,7 +6,7 @@
import pytest
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
-from ...util import data_dir
+from tests.util import data_dir
@pytest.fixture
@@ -15,4 +15,4 @@ def sequences():
10 Cas9 sequences.
"""
fasta_file = fasta.FastaFile.read(join(data_dir("sequence"), "cas9.fasta"))
- return [seq.ProteinSequence(sequence) for sequence in fasta_file.values()]
\ No newline at end of file
+ return [seq.ProteinSequence(sequence) for sequence in fasta_file.values()]
diff --git a/tests/sequence/align/test_alignment.py b/tests/sequence/align/test_alignment.py
index a56cee3c8..971aecb73 100644
--- a/tests/sequence/align/test_alignment.py
+++ b/tests/sequence/align/test_alignment.py
@@ -6,8 +6,6 @@
import pytest
import biotite.sequence as seq
import biotite.sequence.align as align
-from .util import sequences
-
def test_alignment_str():
@@ -16,12 +14,15 @@ def test_alignment_str():
"""
seq1 = seq.NucleotideSequence("ACCTGA")
seq2 = seq.NucleotideSequence("TATGCT")
- ali_str = ["A-CCTGA----",
- "----T-ATGCT"]
+ ali_str = [
+ "A-CCTGA----",
+ "----T-ATGCT"
+ ] # fmt: skip
trace = align.Alignment.trace_from_strings(ali_str)
alignment = align.Alignment([seq1, seq2], trace, None)
assert str(alignment).split("\n") == ali_str
+
def test_conversion_to_symbols():
"""
Test conversion of alignments to strings.
@@ -30,16 +31,20 @@ def test_conversion_to_symbols():
seq_str2 = "HA--PRDDADWKLHH"
seq_str3 = "HA----DDADWKLHH"
seq_strings = [seq_str1, seq_str2, seq_str3]
- sequences = [seq.ProteinSequence(seq_str.replace("-",""))
- for seq_str in seq_strings]
+ sequences = [
+ seq.ProteinSequence(seq_str.replace("-", "")) for seq_str in seq_strings
+ ]
trace = align.Alignment.trace_from_strings(seq_strings)
alignment = align.Alignment(sequences, trace, score=None)
# Test the conversion bach to strings of symbols
symbols = align.get_symbols(alignment)
- symbols = ["".join([sym if sym is not None else "-" for sym in sym_list])
- for sym_list in symbols]
+ symbols = [
+ "".join([sym if sym is not None else "-" for sym in sym_list])
+ for sym_list in symbols
+ ]
assert symbols == seq_strings
+
def test_identity():
"""
Test correct calculation of `get_sequence_identity()` via a known
@@ -48,16 +53,18 @@ def test_identity():
seq_str1 = "--HAKLPRDD--WL--"
seq_str2 = "FRHA--QRTDADWLHH"
seq_strings = [seq_str1, seq_str2]
- sequences = [seq.ProteinSequence(seq_str.replace("-",""))
- for seq_str in seq_strings]
+ sequences = [
+ seq.ProteinSequence(seq_str.replace("-", "")) for seq_str in seq_strings
+ ]
trace = align.Alignment.trace_from_strings(seq_strings)
alignment = align.Alignment(sequences, trace, score=None)
# Assert correct sequence identity calculation
modes = ["all", "not_terminal", "shortest"]
- values = [6/16, 6/12, 6/10]
+ values = [6 / 16, 6 / 12, 6 / 10]
for mode, value in zip(modes, values):
assert align.get_sequence_identity(alignment, mode=mode) == value
+
@pytest.mark.parametrize("mode", ["all", "not_terminal", "shortest"])
def test_pairwise_identity(sequences, mode):
"""
@@ -66,19 +73,18 @@ def test_pairwise_identity(sequences, mode):
"""
sequences = sequences
msa, _, _, _ = align.align_multiple(
- sequences,
- matrix=align.SubstitutionMatrix.std_protein_matrix()
+ sequences, matrix=align.SubstitutionMatrix.std_protein_matrix()
)
-
+
ref_identity_matrix = np.zeros((len(sequences), len(sequences)))
for i in range(len(sequences)):
for j in range(len(sequences)):
- ref_identity_matrix[i,j] = align.get_sequence_identity(
- msa[:, [i,j]], mode=mode
+ ref_identity_matrix[i, j] = align.get_sequence_identity(
+ msa[:, [i, j]], mode=mode
)
-
+
test_identity_matrix = align.get_pairwise_sequence_identity(msa, mode=mode)
-
+
# Identity of two equal sequences should be 1, if only the length of
# the sequence is counted
if mode == "shortest":
@@ -88,4 +94,4 @@ def test_pairwise_identity(sequences, mode):
# Identity matrix is symmetric
assert (test_identity_matrix == test_identity_matrix.T).all()
# Pairwise identity must be equal in the two functions
- assert (test_identity_matrix == ref_identity_matrix).all()
\ No newline at end of file
+ assert (test_identity_matrix == ref_identity_matrix).all()
diff --git a/tests/sequence/align/test_banded.py b/tests/sequence/align/test_banded.py
index 351139925..85e297dcb 100644
--- a/tests/sequence/align/test_banded.py
+++ b/tests/sequence/align/test_banded.py
@@ -3,19 +3,16 @@
# information.
import itertools
-import pytest
import numpy as np
+import pytest
import biotite.sequence as seq
import biotite.sequence.align as align
-from .util import sequences
@pytest.mark.parametrize(
- "gap_penalty, local, band_width", itertools.product(
- [-10, (-10,-1)],
- [False, True],
- [2, 5, 20, 100]
-))
+ "gap_penalty, local, band_width",
+ itertools.product([-10, (-10, -1)], [False, True], [2, 5, 20, 100]),
+)
def test_simple_alignment(gap_penalty, local, band_width):
"""
Test `align_banded()` by comparing the output to `align_optimal()`.
@@ -28,16 +25,19 @@ def test_simple_alignment(gap_penalty, local, band_width):
matrix = align.SubstitutionMatrix.std_protein_matrix()
ref_alignments = align.align_optimal(
- seq1, seq2, matrix,
- gap_penalty=gap_penalty, local=local, terminal_penalty=False
+ seq1, seq2, matrix, gap_penalty=gap_penalty, local=local, terminal_penalty=False
)
# Remove terminal gaps in reference to obtain a true semi-global
# alignment, as returned by align_banded()
ref_alignments = [align.remove_terminal_gaps(al) for al in ref_alignments]
-
+
test_alignments = align.align_banded(
- seq1, seq2, matrix, (-band_width, band_width),
- gap_penalty=gap_penalty, local=local
+ seq1,
+ seq2,
+ matrix,
+ (-band_width, band_width),
+ gap_penalty=gap_penalty,
+ local=local,
)
assert len(test_alignments) == len(ref_alignments)
@@ -46,11 +46,13 @@ def test_simple_alignment(gap_penalty, local, band_width):
@pytest.mark.parametrize(
- "gap_penalty, local, seq_indices", itertools.product(
- [-10, (-10,-1)],
- [False, True],
- [(i,j) for i in range(10) for j in range(i+1)]
-))
+ "gap_penalty, local, seq_indices",
+ itertools.product(
+ [-10, (-10, -1)],
+ [False, True],
+ [(i, j) for i in range(10) for j in range(i + 1)],
+ ),
+)
def test_complex_alignment(sequences, gap_penalty, local, seq_indices):
"""
Test `align_banded()` by comparing the output to `align_optimal()`.
@@ -59,28 +61,37 @@ def test_complex_alignment(sequences, gap_penalty, local, seq_indices):
can return the optimal alignment(s).
"""
MAX_NUMBER = 100
-
+
matrix = align.SubstitutionMatrix.std_protein_matrix()
index1, index2 = seq_indices
seq1 = sequences[index1]
seq2 = sequences[index2]
ref_alignments = align.align_optimal(
- seq1, seq2, matrix,
- gap_penalty=gap_penalty, local=local, terminal_penalty=False,
- max_number=MAX_NUMBER
+ seq1,
+ seq2,
+ matrix,
+ gap_penalty=gap_penalty,
+ local=local,
+ terminal_penalty=False,
+ max_number=MAX_NUMBER,
)
# Remove terminal gaps in reference to obtain a true semi-global
# alignment, as returned by align_banded()
ref_alignments = [align.remove_terminal_gaps(al) for al in ref_alignments]
-
+
identity = align.get_sequence_identity(ref_alignments[0])
# Use a relatively small band width, if the sequences are similar,
# otherwise use the entire search space
band_width = 100 if identity > 0.5 else len(seq1) + len(seq2)
test_alignments = align.align_banded(
- seq1, seq2, matrix, (-band_width, band_width),
- gap_penalty=gap_penalty, local=local, max_number=MAX_NUMBER
+ seq1,
+ seq2,
+ matrix,
+ (-band_width, band_width),
+ gap_penalty=gap_penalty,
+ local=local,
+ max_number=MAX_NUMBER,
)
try:
@@ -103,18 +114,16 @@ def test_complex_alignment(sequences, gap_penalty, local, seq_indices):
@pytest.mark.parametrize(
- "length, excerpt_length, seed", itertools.product(
- [1_000, 1_000_000],
- [50, 500],
- range(10)
-))
+ "length, excerpt_length, seed",
+ itertools.product([1_000, 1_000_000], [50, 500], range(10)),
+)
def test_large_sequence_mapping(length, excerpt_length, seed):
"""
Test whether an excerpt of a very large sequence is aligned to that
sequence at the position, where the excerpt was taken from.
"""
BAND_WIDTH = 100
-
+
np.random.seed(seed)
sequence = seq.NucleotideSequence()
@@ -122,51 +131,37 @@ def test_large_sequence_mapping(length, excerpt_length, seed):
excerpt_pos = np.random.randint(len(sequence) - excerpt_length)
excerpt = sequence[excerpt_pos : excerpt_pos + excerpt_length]
- diagonal = np.random.randint(
- excerpt_pos - BAND_WIDTH,
- excerpt_pos + BAND_WIDTH
- )
- band = (
- diagonal - BAND_WIDTH,
- diagonal + BAND_WIDTH
- )
+ diagonal = np.random.randint(excerpt_pos - BAND_WIDTH, excerpt_pos + BAND_WIDTH)
+ band = (diagonal - BAND_WIDTH, diagonal + BAND_WIDTH)
print(band)
print(len(sequence), len(excerpt))
matrix = align.SubstitutionMatrix.std_nucleotide_matrix()
- test_alignments = align.align_banded(
- excerpt, sequence, matrix, band=band
- )
+ test_alignments = align.align_banded(excerpt, sequence, matrix, band=band)
# The excerpt should be uniquely mappable to a single location on
# the long sequence
assert len(test_alignments) == 1
test_alignment = test_alignments[0]
test_trace = test_alignment.trace
- ref_trace = np.stack([
- np.arange(len(excerpt)),
- np.arange(excerpt_pos, len(excerpt) + excerpt_pos)
- ], axis=1)
+ ref_trace = np.stack(
+ [np.arange(len(excerpt)), np.arange(excerpt_pos, len(excerpt) + excerpt_pos)],
+ axis=1,
+ )
assert np.array_equal(test_trace, ref_trace)
-
@pytest.mark.parametrize(
- "gap_penalty, local, seed", itertools.product(
- [-10, (-10, -1)],
- [False, True],
- range(100)
-))
+ "gap_penalty, local, seed",
+ itertools.product([-10, (-10, -1)], [False, True], range(100)),
+)
def test_swapping(gap_penalty, local, seed):
"""
Check if `align_banded()` returns a 'swapped' alignment, if
the order of input sequences is swapped.
"""
np.random.seed(seed)
- band = (
- np.random.randint(-30, -10),
- np.random.randint( 10, 30)
- )
+ band = (np.random.randint(-30, -10), np.random.randint(10, 30))
seq1, seq2 = _create_random_pair(seed)
matrix = align.SubstitutionMatrix.std_protein_matrix()
@@ -178,7 +173,7 @@ def test_swapping(gap_penalty, local, seed):
seq2, seq1, matrix, band=band, local=local, gap_penalty=gap_penalty
)
- if len(ref_alignments) != 1 or len(test_alignments) != 1:
+ if len(ref_alignments) != 1 or len(test_alignments) != 1:
# If multiple optimal alignments exist,
# it is not easy to assign a swapped one to an original one
# therefore, simply return in this case
@@ -187,16 +182,20 @@ def test_swapping(gap_penalty, local, seed):
return
ref_alignment = ref_alignments[0]
test_alignment = test_alignments[0]
-
+
assert test_alignment.sequences[0] == ref_alignment.sequences[1]
assert test_alignment.sequences[1] == ref_alignment.sequences[0]
assert np.array_equal(test_alignment.trace, ref_alignment.trace[:, ::-1])
-
-def _create_random_pair(seed, length=100, max_subsitutions=5,
- max_insertions=5, max_deletions=5,
- max_truncations=5):
+def _create_random_pair(
+ seed,
+ length=100,
+ max_subsitutions=5,
+ max_insertions=5,
+ max_deletions=5,
+ max_truncations=5,
+):
"""
generate a pair of protein sequences.
Each pair contains
@@ -217,9 +216,7 @@ def _create_random_pair(seed, length=100, max_subsitutions=5,
subsitution_indices = np.random.choice(
np.arange(len(mutant)), size=n_subsitutions, replace=False
)
- subsitution_values = np.random.randint(
- len(original.alphabet), size=n_subsitutions
- )
+ subsitution_values = np.random.randint(len(original.alphabet), size=n_subsitutions)
mutant.code[subsitution_indices] = subsitution_values
# Random insertions
@@ -227,9 +224,7 @@ def _create_random_pair(seed, length=100, max_subsitutions=5,
insertion_indices = np.random.choice(
np.arange(len(mutant)), size=n_insertions, replace=False
)
- insertion_values = np.random.randint(
- len(original.alphabet), size=n_insertions
- )
+ insertion_values = np.random.randint(len(original.alphabet), size=n_insertions)
mutant.code = np.insert(mutant.code, insertion_indices, insertion_values)
# Random deletions
@@ -241,12 +236,10 @@ def _create_random_pair(seed, length=100, max_subsitutions=5,
# Truncate at both ends of original and mutant
original = original[
- np.random.randint(max_truncations) :
- -(1 + np.random.randint(max_truncations))
+ np.random.randint(max_truncations) : -(1 + np.random.randint(max_truncations))
]
mutant = mutant[
- np.random.randint(max_truncations) :
- -(1 + np.random.randint(max_truncations))
+ np.random.randint(max_truncations) : -(1 + np.random.randint(max_truncations))
]
- return original, mutant
\ No newline at end of file
+ return original, mutant
diff --git a/tests/sequence/align/test_cigar.py b/tests/sequence/align/test_cigar.py
index 2c4767ddc..e4ffe4b04 100644
--- a/tests/sequence/align/test_cigar.py
+++ b/tests/sequence/align/test_cigar.py
@@ -18,10 +18,12 @@ def _generate_cigar(seed):
# Alternatingly insert matches and insertions/deletions
cigar += f"{np.random.randint(1, 100)}M"
op = align.CigarOp(
- np.random.choice([
- align.CigarOp.INSERTION,
- align.CigarOp.DELETION,
- ])
+ np.random.choice(
+ [
+ align.CigarOp.INSERTION,
+ align.CigarOp.DELETION,
+ ]
+ )
).to_cigar_symbol()
cigar += f"{np.random.randint(1, 100)}{op}"
# Alignment must end with a match
@@ -34,8 +36,9 @@ def _generate_cigar(seed):
return cigar
-def _mutate_sequence(original,
- max_subsitutions=50, max_insertions=50, max_deletions=50):
+def _mutate_sequence(
+ original, max_subsitutions=50, max_insertions=50, max_deletions=50
+):
"""
Introduce random deletions, insertions and substitutions into a
sequence.
@@ -47,9 +50,7 @@ def _mutate_sequence(original,
subsitution_indices = np.random.choice(
np.arange(len(mutant)), size=n_subsitutions, replace=False
)
- subsitution_values = np.random.randint(
- len(original.alphabet), size=n_subsitutions
- )
+ subsitution_values = np.random.randint(len(original.alphabet), size=n_subsitutions)
mutant.code[subsitution_indices] = subsitution_values
# Random insertions
@@ -57,9 +58,7 @@ def _mutate_sequence(original,
insertion_indices = np.random.choice(
np.arange(len(mutant)), size=n_insertions, replace=False
)
- insertion_values = np.random.randint(
- len(original.alphabet), size=n_insertions
- )
+ insertion_values = np.random.randint(len(original.alphabet), size=n_insertions)
mutant.code = np.insert(mutant.code, insertion_indices, insertion_values)
# Random deletions
@@ -83,8 +82,8 @@ def test_cigar_conversion(cigar):
# The sequences are arbitrary, only the alignment trace matters
# However, they still need to be long enough for the number of CIGAR
# operations
- ref = seq.NucleotideSequence(["A"]*LENGTH)
- seg = seq.NucleotideSequence(["A"]*LENGTH)
+ ref = seq.NucleotideSequence(["A"] * LENGTH)
+ seg = seq.NucleotideSequence(["A"] * LENGTH)
alignment = align.read_alignment_from_cigar(cigar, 0, ref, seg)
print(alignment)
@@ -103,10 +102,9 @@ def test_cigar_conversion(cigar):
[False, True],
[False, True],
[False, True],
- )
+ ),
)
-def test_alignment_conversion(seed, local, distinguish_matches,
- include_terminal_gaps):
+def test_alignment_conversion(seed, local, distinguish_matches, include_terminal_gaps):
"""
Check whether an :class:`Alignment` converted into a CIGAR string
and back again into an :class:`Alignment` gives the same result.
@@ -114,20 +112,16 @@ def test_alignment_conversion(seed, local, distinguish_matches,
REF_LENGTH = 1000
np.random.seed(seed)
ref = seq.NucleotideSequence(ambiguous=False)
- ref.code = np.random.randint(
- 0, len(ref.alphabet), REF_LENGTH, dtype=np.uint8
- )
+ ref.code = np.random.randint(0, len(ref.alphabet), REF_LENGTH, dtype=np.uint8)
excerpt_start = np.random.randint(0, 200)
- excerpt_stop = np.random.randint(REF_LENGTH-200, REF_LENGTH)
- seg = ref[excerpt_start: excerpt_stop]
+ excerpt_stop = np.random.randint(REF_LENGTH - 200, REF_LENGTH)
+ seg = ref[excerpt_start:excerpt_stop]
seg = _mutate_sequence(seg)
matrix = align.SubstitutionMatrix.std_nucleotide_matrix()
if local:
- ref_ali = align.align_optimal(
- ref, seg, matrix, local=True, max_number=1
- )[0]
+ ref_ali = align.align_optimal(ref, seg, matrix, local=True, max_number=1)[0]
else:
ref_ali = align.align_optimal(
ref, seg, matrix, terminal_penalty=False, max_number=1
@@ -138,17 +132,15 @@ def test_alignment_conversion(seed, local, distinguish_matches,
# Remove score as the compared reconstructed alignment does not
# contain it either
ref_ali.score = None
- start_position = ref_ali.trace[0,0]
+ start_position = ref_ali.trace[0, 0]
cigar = align.write_alignment_to_cigar(
ref_ali,
distinguish_matches=distinguish_matches,
- include_terminal_gaps=include_terminal_gaps
+ include_terminal_gaps=include_terminal_gaps,
)
- test_ali = align.read_alignment_from_cigar(
- cigar, start_position, ref, seg
- )
+ test_ali = align.read_alignment_from_cigar(cigar, start_position, ref, seg)
print(cigar)
print("\n\n")
@@ -156,4 +148,4 @@ def test_alignment_conversion(seed, local, distinguish_matches,
print("\n\n")
print(test_ali)
print("\n\n")
- assert test_ali == ref_ali
\ No newline at end of file
+ assert test_ali == ref_ali
diff --git a/tests/sequence/align/test_kmeralphabet.py b/tests/sequence/align/test_kmeralphabet.py
index 1ea31a400..67b3f9b03 100644
--- a/tests/sequence/align/test_kmeralphabet.py
+++ b/tests/sequence/align/test_kmeralphabet.py
@@ -7,7 +7,6 @@
import biotite.sequence as seq
import biotite.sequence.align as align
-
K = 3
@@ -15,21 +14,24 @@
def kmer_alphabet():
return align.KmerAlphabet(seq.ProteinSequence.alphabet, K)
+
@pytest.fixture
def spaced_kmer_alphabet():
- return align.KmerAlphabet(seq.ProteinSequence.alphabet, K, spacing=[0,1,2])
-
+ return align.KmerAlphabet(seq.ProteinSequence.alphabet, K, spacing=[0, 1, 2])
np.random.seed(0)
N = 10
L = 30
+
+
@pytest.mark.parametrize(
"ref_split_kmer_code",
# Test for single instances as input
- list(np.random.randint(len(seq.ProteinSequence.alphabet), size=(N, K))) +
+ list(np.random.randint(len(seq.ProteinSequence.alphabet), size=(N, K)))
+ +
# Test for multiple instances as input
- list(np.random.randint(len(seq.ProteinSequence.alphabet), size=(N, L, K)))
+ list(np.random.randint(len(seq.ProteinSequence.alphabet), size=(N, L, K))),
)
def test_fuse_and_split(kmer_alphabet, ref_split_kmer_code):
"""
@@ -38,15 +40,16 @@ def test_fuse_and_split(kmer_alphabet, ref_split_kmer_code):
"""
fused = kmer_alphabet.fuse(ref_split_kmer_code)
test_split_kmer_code = kmer_alphabet.split(fused)
-
+
assert test_split_kmer_code.tolist() == ref_split_kmer_code.tolist()
np.random.seed(0)
N = 10
+
+
@pytest.mark.parametrize(
- "split_kmer_code",
- np.random.randint(len(seq.ProteinSequence.alphabet), size=(N, K))
+ "split_kmer_code", np.random.randint(len(seq.ProteinSequence.alphabet), size=(N, K))
)
def test_encode_and_decode(kmer_alphabet, split_kmer_code):
"""
@@ -58,7 +61,7 @@ def test_encode_and_decode(kmer_alphabet, split_kmer_code):
ref_kmer_symbol = alph.decode_multiple(split_kmer_code)
kmer_code = kmer_alphabet.encode(ref_kmer_symbol)
test_kmer_symbol = kmer_alphabet.decode(kmer_code)
-
+
assert test_kmer_symbol.tolist() == ref_kmer_symbol.tolist()
@@ -86,6 +89,8 @@ def test_create_continuous_kmers(kmer_alphabet):
N = 50
+
+
@pytest.mark.parametrize("seed", range(N))
def test_create_spaced_kmers(kmer_alphabet, spaced_kmer_alphabet, seed):
"""
@@ -99,8 +104,7 @@ def test_create_spaced_kmers(kmer_alphabet, spaced_kmer_alphabet, seed):
np.random.seed(seed)
sequence = seq.ProteinSequence()
sequence.code = np.random.randint(
- len(sequence.alphabet),
- size=np.random.randint(MIN_LENGTH, MAX_LENGTH)
+ len(sequence.alphabet), size=np.random.randint(MIN_LENGTH, MAX_LENGTH)
)
ref_kmers = kmer_alphabet.create_kmers(sequence.code)
diff --git a/tests/sequence/align/test_kmersimilarity.py b/tests/sequence/align/test_kmersimilarity.py
index a72aeca72..5f1fbbf44 100644
--- a/tests/sequence/align/test_kmersimilarity.py
+++ b/tests/sequence/align/test_kmersimilarity.py
@@ -15,22 +15,24 @@ def kmer_alphabet():
np.random.seed(0)
N = 10
-@pytest.mark.parametrize("ref_kmer, threshold", zip(
- np.random.randint(10000, size=N),
- np.random.randint(-5, 15, size=N)
-))
+
+
+@pytest.mark.parametrize(
+ "ref_kmer, threshold",
+ zip(np.random.randint(10000, size=N), np.random.randint(-5, 15, size=N)),
+)
def test_score_threshold_rule(kmer_alphabet, ref_kmer, threshold):
"""
Test if the similar k-mers given by :class:`ScoreThresholdRule`
are equal to k-mers generated by a brute-force approach.
"""
matrix = align.SubstitutionMatrix.std_protein_matrix()
-
+
ref_kmer_sequence = seq.ProteinSequence()
ref_kmer_sequence.code = kmer_alphabet.split(ref_kmer)
-
+
ref_sim_kmer_set = set()
- # Iterate through all possible k-mers
+ # Iterate through all possible k-mers
for kmer in range(len(kmer_alphabet)):
kmer_sequence = seq.ProteinSequence()
kmer_sequence.code = kmer_alphabet.split(kmer)
@@ -40,7 +42,7 @@ def test_score_threshold_rule(kmer_alphabet, ref_kmer, threshold):
# Add k-mer to list if the threshold score is reached
if score >= threshold:
ref_sim_kmer_set.add(kmer)
-
+
test_rule = align.ScoreThresholdRule(matrix, threshold)
test_sim_kmer_set = set(test_rule.similar_kmers(kmer_alphabet, ref_kmer))
@@ -68,4 +70,4 @@ def test_invalid_kmer(kmer_alphabet, invalid_kmer):
align.SubstitutionMatrix.std_protein_matrix(), 0
)
with pytest.raises(seq.AlphabetError):
- test_rule.similar_kmers(kmer_alphabet, invalid_kmer)
\ No newline at end of file
+ test_rule.similar_kmers(kmer_alphabet, invalid_kmer)
diff --git a/tests/sequence/align/test_kmertable.py b/tests/sequence/align/test_kmertable.py
index 64439bd27..deb4b1923 100644
--- a/tests/sequence/align/test_kmertable.py
+++ b/tests/sequence/align/test_kmertable.py
@@ -4,9 +4,8 @@
import functools
import itertools
-import string
import pickle
-from typing import Any
+import string
import numpy as np
import pytest
import biotite.sequence as seq
@@ -27,9 +26,7 @@ def __init__(self, n_buckets):
def __getattr__(self, name):
attr = getattr(align.BucketKmerTable, name)
- if attr.__name__ in [
- "from_sequences", "from_kmers", "from_kmer_selection"
- ]:
+ if attr.__name__ in ["from_sequences", "from_kmers", "from_kmer_selection"]:
return functools.partial(attr, n_buckets=self._n_buckets)
else:
return attr
@@ -47,10 +44,12 @@ def idfn(val):
def k():
return 8
+
@pytest.fixture
def alphabet():
return seq.NucleotideSequence.unambiguous_alphabet()
+
@pytest.fixture
def random_sequences(k, alphabet):
N_SEQS = 10
@@ -75,10 +74,10 @@ def random_sequences(k, alphabet):
# with less buckets than number of possible kmers ...
FixedBucketKmerTable(1000),
# ... and one test case with more buckets (perfect hashing)
- FixedBucketKmerTable(1000000)
- ]
+ FixedBucketKmerTable(1000000),
+ ],
),
- ids = idfn
+ ids=idfn,
)
def test_from_sequences(k, random_sequences, spacing, table_class):
"""
@@ -86,29 +85,23 @@ def test_from_sequences(k, random_sequences, spacing, table_class):
sequence position, if the position is in the C-array of the
corresponding k-mer.
"""
- table = table_class.from_sequences(
- k, random_sequences, spacing=spacing
- )
+ table = table_class.from_sequences(k, random_sequences, spacing=spacing)
kmer_alph = align.KmerAlphabet(random_sequences[0].alphabet, k, spacing)
assert kmer_alph == table.kmer_alphabet
for i, sequence in enumerate(random_sequences):
for j in range(kmer_alph.kmer_array_length(len(sequence))):
if spacing is None:
- kmer = kmer_alph.fuse(sequence.code[j : j+k])
+ kmer = kmer_alph.fuse(sequence.code[j : j + k])
else:
kmer = kmer_alph.fuse(sequence.code[kmer_alph.spacing + j])
- assert np.array([i,j]) in table[kmer]
+ assert np.array([i, j]) in table[kmer]
@pytest.mark.parametrize(
"table_class",
- [
- align.KmerTable,
- FixedBucketKmerTable(1000),
- FixedBucketKmerTable(1000000)
- ],
- ids = idfn
+ [align.KmerTable, FixedBucketKmerTable(1000), FixedBucketKmerTable(1000000)],
+ ids=idfn,
)
def test_from_kmers(k, random_sequences, table_class):
"""
@@ -128,12 +121,8 @@ def test_from_kmers(k, random_sequences, table_class):
@pytest.mark.parametrize(
"table_class",
- [
- align.KmerTable,
- FixedBucketKmerTable(1000),
- FixedBucketKmerTable(1000000)
- ],
- ids = idfn
+ [align.KmerTable, FixedBucketKmerTable(1000), FixedBucketKmerTable(1000000)],
+ ids=idfn,
)
def test_from_kmer_selection(k, alphabet, random_sequences, table_class):
"""
@@ -149,8 +138,7 @@ def test_from_kmer_selection(k, alphabet, random_sequences, table_class):
]
np.random.seed(0)
filtered_pos_arrays = [
- np.random.randint(len(kmers), size=N_POSITIONS)
- for kmers in kmer_arrays
+ np.random.randint(len(kmers), size=N_POSITIONS) for kmers in kmer_arrays
]
filtered_kmer_arrays = [
kmers[filtered_pos]
@@ -162,8 +150,9 @@ def test_from_kmer_selection(k, alphabet, random_sequences, table_class):
# The total number of k-mers in the table
# should be the total number of input k-mers
- assert np.sum(kmer_table.count(np.arange(len(kmer_alph)))) \
- == np.sum([len(kmers) for kmers in filtered_kmer_arrays])
+ assert np.sum(kmer_table.count(np.arange(len(kmer_alph)))) == np.sum(
+ [len(kmers) for kmers in filtered_kmer_arrays]
+ )
# Each k-mer in the table should be found
# in the original k-mer sequences
for kmer in range(len(kmer_alph)):
@@ -173,12 +162,8 @@ def test_from_kmer_selection(k, alphabet, random_sequences, table_class):
@pytest.mark.parametrize(
"table_class",
- [
- align.KmerTable,
- FixedBucketKmerTable(1000),
- FixedBucketKmerTable(1000000)
- ],
- ids = idfn
+ [align.KmerTable, FixedBucketKmerTable(1000), FixedBucketKmerTable(1000000)],
+ ids=idfn,
)
def test_from_tables(k, random_sequences, table_class):
"""
@@ -205,10 +190,8 @@ def test_from_positions(k, random_sequences):
"""
ref_table = align.KmerTable.from_sequences(k, random_sequences)
- kmer_dict = {kmer : ref_table[kmer] for kmer in range(len(ref_table))}
- test_table = align.KmerTable.from_positions(
- ref_table.kmer_alphabet, kmer_dict
- )
+ kmer_dict = {kmer: ref_table[kmer] for kmer in range(len(ref_table))}
+ test_table = align.KmerTable.from_positions(ref_table.kmer_alphabet, kmer_dict)
assert test_table == ref_table
@@ -216,14 +199,10 @@ def test_from_positions(k, random_sequences):
@pytest.mark.parametrize(
"table_class, use_similarity_rule",
itertools.product(
- [
- align.KmerTable,
- FixedBucketKmerTable(1000),
- FixedBucketKmerTable(10000000)
- ],
- [False, True]
+ [align.KmerTable, FixedBucketKmerTable(1000), FixedBucketKmerTable(10000000)],
+ [False, True],
),
- ids = idfn
+ ids=idfn,
)
def test_match_table(table_class, use_similarity_rule):
"""
@@ -233,8 +212,7 @@ def test_match_table(table_class, use_similarity_rule):
chosen to yield only the same k-mer as similar k-mer.
"""
alphabet = seq.LetterAlphabet(string.ascii_lowercase + "_")
- phrase1 = "how_much_wood_would_a_woodchuck_chuck_if_a_woodchuck_could_" \
- "chuck_wood"
+ phrase1 = "how_much_wood_would_a_woodchuck_chuck_if_a_woodchuck_could_" "chuck_wood"
phrase2 = "woodchuck"
sequence1 = seq.GeneralSequence(alphabet, phrase1)
sequence2 = seq.GeneralSequence(alphabet, phrase2)
@@ -244,30 +222,32 @@ def test_match_table(table_class, use_similarity_rule):
table1 = table_class.from_sequences(4, [sequence1])
table2 = table_class.from_sequences(4, [sequence2])
- ref_matches = set([
- (0, 9),
- (0, 22),
- (1, 23),
- (2, 24),
- (3, 25),
- (4, 26),
- (5, 27),
- (4, 32),
- (5, 33),
- (0, 43),
- (1, 44),
- (2, 45),
- (3, 46),
- (4, 47),
- (5, 48),
- (4, 59),
- (5, 60),
- (0, 65),
- ])
+ ref_matches = set(
+ [
+ (0, 9),
+ (0, 22),
+ (1, 23),
+ (2, 24),
+ (3, 25),
+ (4, 26),
+ (5, 27),
+ (4, 32),
+ (5, 33),
+ (0, 43),
+ (1, 44),
+ (2, 45),
+ (3, 46),
+ (4, 47),
+ (5, 48),
+ (4, 59),
+ (5, 60),
+ (0, 65),
+ ]
+ )
test_matches = table1.match_table(table2, similarity_rule=rule)
# the reference indices are irrelevant for this test
- test_matches = test_matches[:, [1,3]]
+ test_matches = test_matches[:, [1, 3]]
test_matches = set([tuple(match) for match in test_matches])
assert test_matches == ref_matches
@@ -275,14 +255,10 @@ def test_match_table(table_class, use_similarity_rule):
@pytest.mark.parametrize(
"table_class, use_similarity_rule",
itertools.product(
- [
- align.KmerTable,
- FixedBucketKmerTable(1000),
- FixedBucketKmerTable(1000000)
- ],
- [False, True]
+ [align.KmerTable, FixedBucketKmerTable(1000), FixedBucketKmerTable(1000000)],
+ [False, True],
),
- ids = idfn
+ ids=idfn,
)
def test_match(k, random_sequences, table_class, use_similarity_rule):
"""
@@ -301,12 +277,8 @@ def test_match(k, random_sequences, table_class, use_similarity_rule):
for i, kmer in enumerate(kmers):
matches = table[kmer]
matches = np.stack(
- [
- np.full(len(matches), i, dtype=np.uint32),
- matches[:,0],
- matches[:,1]
- ],
- axis=1
+ [np.full(len(matches), i, dtype=np.uint32), matches[:, 0], matches[:, 1]],
+ axis=1,
)
ref_matches.append(matches)
ref_matches = np.concatenate(ref_matches)
@@ -319,12 +291,8 @@ def test_match(k, random_sequences, table_class, use_similarity_rule):
@pytest.mark.parametrize(
"table_class",
- [
- align.KmerTable,
- FixedBucketKmerTable(1000),
- FixedBucketKmerTable(1000000)
- ],
- ids = idfn
+ [align.KmerTable, FixedBucketKmerTable(1000), FixedBucketKmerTable(1000000)],
+ ids=idfn,
)
def test_match_kmer_selection(k, random_sequences, table_class):
"""
@@ -344,12 +312,8 @@ def test_match_kmer_selection(k, random_sequences, table_class):
kmer = kmers[pos]
matches = table[kmer]
matches = np.stack(
- [
- np.full(len(matches), pos, dtype=np.uint32),
- matches[:,0],
- matches[:,1]
- ],
- axis=1
+ [np.full(len(matches), pos, dtype=np.uint32), matches[:, 0], matches[:, 1]],
+ axis=1,
)
ref_matches.append(matches)
ref_matches = np.concatenate(ref_matches)
@@ -362,14 +326,10 @@ def test_match_kmer_selection(k, random_sequences, table_class):
@pytest.mark.parametrize(
"table_class, use_mask",
itertools.product(
- [
- align.KmerTable,
- FixedBucketKmerTable(1000),
- FixedBucketKmerTable(1000000)
- ],
- [False, True]
+ [align.KmerTable, FixedBucketKmerTable(1000), FixedBucketKmerTable(1000000)],
+ [False, True],
),
- ids = idfn
+ ids=idfn,
)
def test_match_equivalence(k, random_sequences, table_class, use_mask):
"""
@@ -391,27 +351,22 @@ def test_match_equivalence(k, random_sequences, table_class, use_mask):
query_mask = removal_masks[0]
table_masks = removal_masks[1:]
- table = table_class.from_sequences(
- k, table_sequences, ignore_masks=table_masks
- )
+ table = table_class.from_sequences(k, table_sequences, ignore_masks=table_masks)
# 42 -> Dummy value that is distinct from all reference indices
ref_table = table_class.from_sequences(
k, [query_sequence], [42], ignore_masks=[query_mask]
)
ref_matches = table.match_table(ref_table)
- assert np.all(ref_matches[:,0] == 42)
+ assert np.all(ref_matches[:, 0] == 42)
# Store matches in set to remove the order dependency
# The first column is not present in the matches
# returned by 'match_sequence()' -> [:, 1:]
ref_matches = set([tuple(match) for match in ref_matches[:, 1:]])
- test_matches = table.match(
- query_sequence, ignore_mask=query_mask
- )
+ test_matches = table.match(query_sequence, ignore_mask=query_mask)
test_matches = set([tuple(match) for match in test_matches])
-
# Check if any match is found at all
assert len(ref_matches) > 0
# The first column is not present in 'test_matches'
@@ -433,7 +388,7 @@ def test_match_equivalence(k, random_sequences, table_class, use_mask):
),
],
ids = idfn
-)
+) # fmt: skip
def test_masking(k, input_mask, ref_output_mask):
"""
Explicitly test the conversion of removal masks to k-mer masks
@@ -446,9 +401,7 @@ def test_masking(k, input_mask, ref_output_mask):
sequence = seq.NucleotideSequence()
sequence.code = np.zeros(len(input_mask))
- table = align.KmerTable.from_sequences(
- k, [sequence], ignore_masks=[input_mask]
- )
+ table = align.KmerTable.from_sequences(k, [sequence], ignore_masks=[input_mask])
# Get the k-mer positions that were masked
test_output_mask = np.zeros(len(ref_output_mask), dtype=bool)
@@ -467,7 +420,7 @@ def test_masking(k, input_mask, ref_output_mask):
(FixedBucketKmerTable(1000), True),
(FixedBucketKmerTable(1000000), True),
],
- ids = idfn
+ ids=idfn,
)
def test_count(k, random_sequences, table_class, selected_kmers):
"""
@@ -476,9 +429,7 @@ def test_count(k, random_sequences, table_class, selected_kmers):
"""
N_KMERS = 100
- table = table_class.from_sequences(
- k, random_sequences
- )
+ table = table_class.from_sequences(k, random_sequences)
if selected_kmers:
np.random.seed(0)
@@ -486,9 +437,7 @@ def test_count(k, random_sequences, table_class, selected_kmers):
ref_counts = [len(table[kmer]) for kmer in kmers]
test_counts = table.count(kmers)
else:
- ref_counts = [
- len(table[kmer]) for kmer in range(len(table.kmer_alphabet))
- ]
+ ref_counts = [len(table[kmer]) for kmer in range(len(table.kmer_alphabet))]
test_counts = table.count()
assert test_counts.tolist() == ref_counts
@@ -496,12 +445,8 @@ def test_count(k, random_sequences, table_class, selected_kmers):
@pytest.mark.parametrize(
"table_class",
- [
- align.KmerTable,
- FixedBucketKmerTable(1000),
- FixedBucketKmerTable(1000000)
- ],
- ids = idfn
+ [align.KmerTable, FixedBucketKmerTable(1000), FixedBucketKmerTable(1000000)],
+ ids=idfn,
)
def test_get_kmers(table_class):
"""
@@ -511,10 +456,7 @@ def test_get_kmers(table_class):
"""
np.random.seed(0)
- kmer_alphabet = align.KmerAlphabet(
- seq.NucleotideSequence.unambiguous_alphabet(),
- 8
- )
+ kmer_alphabet = align.KmerAlphabet(seq.NucleotideSequence.unambiguous_alphabet(), 8)
ref_mask = np.random.choice([False, True], size=len(kmer_alphabet))
ref_kmers = np.where(ref_mask)[0]
table = table_class.from_kmers(kmer_alphabet, [ref_kmers])
@@ -526,12 +468,8 @@ def test_get_kmers(table_class):
@pytest.mark.parametrize(
"table_class",
- [
- align.KmerTable,
- FixedBucketKmerTable(1000),
- FixedBucketKmerTable(1000000)
- ],
- ids = idfn
+ [align.KmerTable, FixedBucketKmerTable(1000), FixedBucketKmerTable(1000000)],
+ ids=idfn,
)
def test_pickle(k, random_sequences, table_class):
"""
@@ -548,10 +486,7 @@ def test_pickle(k, random_sequences, table_class):
@pytest.mark.parametrize(
"n_kmers, load_factor",
- itertools.product(
- [1_000, 100_000, 10_000_000, 1_000_000_000],
- [0.2, 1.0, 2.0]
- )
+ itertools.product([1_000, 100_000, 10_000_000, 1_000_000_000], [0.2, 1.0, 2.0]),
)
def test_bucket_number(n_kmers, load_factor):
"""
@@ -563,7 +498,6 @@ def test_bucket_number(n_kmers, load_factor):
min_n_buckets = int(n_kmers / load_factor)
test_n_buckets = align.bucket_number(n_kmers, load_factor)
-
assert test_n_buckets >= min_n_buckets
assert test_n_buckets <= min_n_buckets * 1.05
@@ -573,4 +507,4 @@ def _identity_rule(alphabet):
np.fill_diagonal(score_matrix, 0)
matrix = align.SubstitutionMatrix(alphabet, alphabet, score_matrix)
rule = align.ScoreThresholdRule(matrix, 0)
- return rule
\ No newline at end of file
+ return rule
diff --git a/tests/sequence/align/test_localgapped.py b/tests/sequence/align/test_localgapped.py
index 7fbe19f48..714004118 100644
--- a/tests/sequence/align/test_localgapped.py
+++ b/tests/sequence/align/test_localgapped.py
@@ -3,25 +3,23 @@
# information.
import itertools
-import pytest
import numpy as np
+import pytest
import biotite.sequence as seq
import biotite.sequence.align as align
-from .util import sequences
@pytest.mark.parametrize(
"gap_penalty, seed, threshold, direction, score_only",
itertools.product(
- [-10, (-10,-1)],
+ [-10, (-10, -1)],
[(0, 0), (11, 11), (20, 19), (30, 29)],
[20, 100, 500],
- ["both", "upstream","downstream"],
- [False, True]
- )
+ ["both", "upstream", "downstream"],
+ [False, True],
+ ),
)
-def test_simple_alignment(gap_penalty, seed, threshold,
- direction, score_only):
+def test_simple_alignment(gap_penalty, seed, threshold, direction, score_only):
"""
Test `align_local_gapped()` by comparing the output to
`align_optimal()`.
@@ -34,22 +32,20 @@ def test_simple_alignment(gap_penalty, seed, threshold,
matrix = align.SubstitutionMatrix.std_protein_matrix()
ref_alignments = align.align_optimal(
- seq1, seq2, matrix,
- gap_penalty=gap_penalty, local=True
+ seq1, seq2, matrix, gap_penalty=gap_penalty, local=True
)
# Limit reference alignment range to seed
# if the alignment does not extend in both directions
for alignment in ref_alignments:
- seed_index = np.where(alignment.trace[:,0] == seed[0])[0][0]
+ seed_index = np.where(alignment.trace[:, 0] == seed[0])[0][0]
if direction == "upstream":
- alignment.trace = alignment.trace[:seed_index + 1]
+ alignment.trace = alignment.trace[: seed_index + 1]
elif direction == "downstream":
alignment.trace = alignment.trace[seed_index:]
alignment.score = align.score(alignment, matrix, gap_penalty)
-
+
test_result = align.align_local_gapped(
- seq1, seq2, matrix, seed, threshold, gap_penalty,
- 1000, direction, score_only
+ seq1, seq2, matrix, seed, threshold, gap_penalty, 1000, direction, score_only
)
if score_only:
@@ -66,13 +62,12 @@ def test_simple_alignment(gap_penalty, seed, threshold,
@pytest.mark.parametrize(
"gap_penalty, score_only, seq_indices",
itertools.product(
- [-10, (-10,-1)],
+ [-10, (-10, -1)],
[False, True],
- [(i,j) for i in range(10) for j in range(i+1)]
- )
+ [(i, j) for i in range(10) for j in range(i + 1)],
+ ),
)
-def test_complex_alignment(sequences, gap_penalty, score_only,
- seq_indices):
+def test_complex_alignment(sequences, gap_penalty, score_only, seq_indices):
"""
Test `align_local_gapped()` by comparing the output to
`align_optimal()`.
@@ -84,24 +79,22 @@ def test_complex_alignment(sequences, gap_penalty, score_only,
# The linear gap penalty for longer gaps easily exceeds
# a small threshold -> increase threshold for linear penalty
THRESHOLD = 200 if isinstance(gap_penalty, int) else 50
-
+
matrix = align.SubstitutionMatrix.std_protein_matrix()
index1, index2 = seq_indices
seq1 = sequences[index1]
seq2 = sequences[index2]
ref_alignments = align.align_optimal(
- seq1, seq2, matrix,
- gap_penalty=gap_penalty, local=True, max_number=MAX_NUMBER
+ seq1, seq2, matrix, gap_penalty=gap_penalty, local=True, max_number=MAX_NUMBER
)
# Select the center of the alignment as seed
trace = ref_alignments[0].trace
trace = trace[(trace != -1).all(axis=1)]
seed = trace[len(trace) // 2]
-
+
test_result = align.align_local_gapped(
- seq1, seq2, matrix, seed, THRESHOLD, gap_penalty,
- MAX_NUMBER, "both", score_only
+ seq1, seq2, matrix, seed, THRESHOLD, gap_penalty, MAX_NUMBER, "both", score_only
)
if score_only:
@@ -113,30 +106,29 @@ def test_complex_alignment(sequences, gap_penalty, score_only,
test_alignments = test_result
assert test_alignments[0].score == ref_alignments[0].score
# Test if the score is also correctly calculated
- assert align.score(test_alignments[0], matrix, gap_penalty) \
+ assert (
+ align.score(test_alignments[0], matrix, gap_penalty)
== ref_alignments[0].score
- if len(ref_alignments) < MAX_NUMBER \
- and len(test_alignments) < MAX_NUMBER:
- # Only test if the exact same alignments were created,
- # if the number of traces was not limited by MAX_NUMBER
- for i, alignment in enumerate(test_alignments):
- try:
- assert alignment in ref_alignments
- except AssertionError:
- # Edge case:
- # In rare case the local alignment may be
- # slightly longer on the upstream side for
- # 'align_local_ungapped()', since the
- # upstream side is handled in an inverted
- # manner
- # However this does not effect the score
- # Consequently, the exception is ignored
- # if the alignment is longer than all
- # reference alignments
- if len(alignment) <= max(
- [len(ali) for ali in ref_alignments]
- ):
- raise
+ )
+ if len(ref_alignments) < MAX_NUMBER and len(test_alignments) < MAX_NUMBER:
+ # Only test if the exact same alignments were created,
+ # if the number of traces was not limited by MAX_NUMBER
+ for i, alignment in enumerate(test_alignments):
+ try:
+ assert alignment in ref_alignments
+ except AssertionError:
+ # Edge case:
+ # In rare case the local alignment may be
+ # slightly longer on the upstream side for
+ # 'align_local_ungapped()', since the
+ # upstream side is handled in an inverted
+ # manner
+ # However this does not effect the score
+ # Consequently, the exception is ignored
+ # if the alignment is longer than all
+ # reference alignments
+ if len(alignment) <= max([len(ali) for ali in ref_alignments]):
+ raise
except AssertionError:
print(f"Missing test alignment at index {i}:")
print()
@@ -151,11 +143,11 @@ def test_complex_alignment(sequences, gap_penalty, score_only,
@pytest.mark.parametrize(
"gap_penalty, direction, score_only, should_raise",
itertools.product(
- [-10, (-10,-1)],
- ["both", "upstream","downstream"],
+ [-10, (-10, -1)],
+ ["both", "upstream", "downstream"],
[False, True],
- [False, True]
- )
+ [False, True],
+ ),
)
def test_max_table_size(gap_penalty, direction, score_only, should_raise):
"""
@@ -171,7 +163,7 @@ def test_max_table_size(gap_penalty, direction, score_only, should_raise):
max_table_size = 1_000_000_000
# Align a long random sequence to itself,
- # effectively resulting in a global alignment
+ # effectively resulting in a global alignment
np.random.seed(0)
seq1 = seq.NucleotideSequence()
seq1.code = np.random.randint(len(seq1.alphabet), size=10000)
@@ -184,15 +176,31 @@ def test_max_table_size(gap_penalty, direction, score_only, should_raise):
if should_raise:
with pytest.raises(MemoryError):
align.align_local_gapped(
- seq1, seq1, matrix, seed, threshold, gap_penalty, 1,
- direction, score_only, max_table_size
+ seq1,
+ seq1,
+ matrix,
+ seed,
+ threshold,
+ gap_penalty,
+ 1,
+ direction,
+ score_only,
+ max_table_size,
)
else:
result = align.align_local_gapped(
- seq1, seq1, matrix, seed, threshold, gap_penalty, 1,
- direction, score_only, max_table_size
+ seq1,
+ seq1,
+ matrix,
+ seed,
+ threshold,
+ gap_penalty,
+ 1,
+ direction,
+ score_only,
+ max_table_size,
)
if not score_only and direction == "both":
alignment = result[0]
# Expect that no gaps are introduced
- assert len(alignment) == len(seq1)
\ No newline at end of file
+ assert len(alignment) == len(seq1)
diff --git a/tests/sequence/align/test_localungapped.py b/tests/sequence/align/test_localungapped.py
index b3f24dc59..11105a11a 100644
--- a/tests/sequence/align/test_localungapped.py
+++ b/tests/sequence/align/test_localungapped.py
@@ -66,15 +66,24 @@
],
[["both"], ["upstream"], ["downstream"]], # direction
-
+
[[False], [True]], # score_only
[[False], [True]], # uint8_code
)]
-)
-def test_simple_alignments(seq_type, seq1, seq2, seed, threshold,
- ref_range1, ref_range2,
- direction, score_only, uint8_code):
+) # fmt: skip
+def test_simple_alignments(
+ seq_type,
+ seq1,
+ seq2,
+ seed,
+ threshold,
+ ref_range1,
+ ref_range2,
+ direction,
+ score_only,
+ uint8_code,
+):
"""
Check if `algin_local_ungapped()` produces correct alignments based on
simple known examples.
@@ -90,29 +99,26 @@ def test_simple_alignments(seq_type, seq1, seq2, seed, threshold,
seq1 = seq_type(seq1)
seq2 = seq_type(seq2)
-
+
if seq_type == seq.NucleotideSequence:
matrix = align.SubstitutionMatrix.std_nucleotide_matrix()
else:
matrix = align.SubstitutionMatrix.std_protein_matrix()
-
+
if not uint8_code:
seq1, seq2, matrix = _convert_to_uint16_code(seq1, seq2, matrix)
-
ref_alignment = align.Alignment(
[seq1, seq2],
- np.stack([
- np.arange(*ref_range1),
- np.arange(*ref_range2)
- ], axis=-1)
+ np.stack([np.arange(*ref_range1), np.arange(*ref_range2)], axis=-1),
)
ref_score = align.score(ref_alignment, matrix)
ref_alignment.score = ref_score
test_result = align.align_local_ungapped(
- seq1, seq2, matrix, seed, threshold, direction, score_only)
-
+ seq1, seq2, matrix, seed, threshold, direction, score_only
+ )
+
if score_only:
assert test_result == ref_score
else:
@@ -120,10 +126,7 @@ def test_simple_alignments(seq_type, seq1, seq2, seed, threshold,
@pytest.mark.parametrize(
- "seed, uint8_code", itertools.product(
- range(100),
- [False, True]
- )
+ "seed, uint8_code", itertools.product(range(100), [False, True])
)
def test_random_alignment(seed, uint8_code):
"""
@@ -141,29 +144,26 @@ def test_random_alignment(seed, uint8_code):
CONSERVED_ENDS = 5
MUTATION_PROB = 0.1
THRESHOLD = 100
-
+
np.random.seed(seed)
# Create conserved regions
conserved1 = ProteinSequence()
- conserved_len = np.random.randint(MIN_CONSERVED_SIZE, MAX_CONSERVED_SIZE+1)
+ conserved_len = np.random.randint(MIN_CONSERVED_SIZE, MAX_CONSERVED_SIZE + 1)
conserved1.code = np.random.randint(
# Do not include stop symbol for aesthetic reasons -> -1
- len(conserved1.alphabet)-1,
- size=conserved_len
+ len(conserved1.alphabet) - 1,
+ size=conserved_len,
)
conserved2 = ProteinSequence()
# The second conserved regions is equal to the first one,
# except a few point mutations
conserved2.code = conserved1.code.copy()
mutation_mask = np.random.choice(
- [False, True],
- size=conserved_len,
- p = [1 - MUTATION_PROB, MUTATION_PROB]
+ [False, True], size=conserved_len, p=[1 - MUTATION_PROB, MUTATION_PROB]
)
conserved2.code[mutation_mask] = np.random.randint(
- len(conserved2.alphabet)-1,
- size=np.count_nonzero(mutation_mask)
+ len(conserved2.alphabet) - 1, size=np.count_nonzero(mutation_mask)
)
# Flank the conserved regions with equal termini to ensure
# that the alignment extends from start to end of the region
@@ -174,36 +174,33 @@ def test_random_alignment(seed, uint8_code):
seq1 = ProteinSequence()
seq2 = ProteinSequence()
offset = []
- for sequence, conserved in zip(
- (seq1, seq2), (conserved1, conserved2)
- ):
+ for sequence, conserved in zip((seq1, seq2), (conserved1, conserved2)):
sequence.code = np.random.randint(
- len(sequence.alphabet)-1,
- size=np.random.randint(MIN_SIZE, MAX_SIZE+1)
+ len(sequence.alphabet) - 1, size=np.random.randint(MIN_SIZE, MAX_SIZE + 1)
)
# Place conserved region randomly within the sequence
conserved_pos = np.random.randint(0, len(sequence) - len(conserved))
- sequence.code[conserved_pos : conserved_pos + len(conserved)] \
- = conserved.code
+ sequence.code[conserved_pos : conserved_pos + len(conserved)] = conserved.code
offset.append(conserved_pos)
# The seed is placed somewhere in the conserved region
seed = np.array(offset) + np.random.randint(len(conserved))
-
matrix = align.SubstitutionMatrix.std_protein_matrix()
if not uint8_code:
seq1, seq2, matrix = _convert_to_uint16_code(seq1, seq2, matrix)
-
+
ref_score = align.align_optimal(
- seq1, seq2, matrix, local=True, max_number=1,
- # High gap penalty to prevent introduction of gaps,
+ seq1,
+ seq2,
+ matrix,
+ local=True,
+ max_number=1,
+ # High gap penalty to prevent introduction of gaps,
# since 'align_local_ungapped()' is also no able to place gaps
- gap_penalty=-1000
+ gap_penalty=-1000,
)[0].score
- test_alignment = align.align_local_ungapped(
- seq1, seq2, matrix, seed, THRESHOLD
- )
+ test_alignment = align.align_local_ungapped(seq1, seq2, matrix, seed, THRESHOLD)
assert test_alignment.score == ref_score
# Test if the score is also correctly calculated
@@ -211,23 +208,23 @@ def test_random_alignment(seed, uint8_code):
def _convert_to_uint16_code(seq1, seq2, matrix):
- """
- Adjust sequences, so that they use 'uint16' as dtype for the
- code.
- This is a necessary test, since 'uint8' uses a separate
- implementation.
- """
- new_alph = seq.Alphabet(np.arange(500))
- code = seq1.code
- seq1 = seq.GeneralSequence(new_alph)
- seq1.code = code
- code = seq2.code
- seq2 = seq.GeneralSequence(new_alph)
- seq2.code = code
- # Adjust the substitution matrix as well,
- # so that it is compatible with the new alphabet
- score_matrix = np.zeros((len(new_alph), len(new_alph)), dtype=np.int32)
- orig_len = len(matrix.score_matrix())
- score_matrix[:orig_len, :orig_len] = matrix.score_matrix()
- matrix = align.SubstitutionMatrix(new_alph, new_alph, score_matrix)
- return seq1, seq2, matrix
\ No newline at end of file
+ """
+ Adjust sequences, so that they use 'uint16' as dtype for the
+ code.
+ This is a necessary test, since 'uint8' uses a separate
+ implementation.
+ """
+ new_alph = seq.Alphabet(np.arange(500))
+ code = seq1.code
+ seq1 = seq.GeneralSequence(new_alph)
+ seq1.code = code
+ code = seq2.code
+ seq2 = seq.GeneralSequence(new_alph)
+ seq2.code = code
+ # Adjust the substitution matrix as well,
+ # so that it is compatible with the new alphabet
+ score_matrix = np.zeros((len(new_alph), len(new_alph)), dtype=np.int32)
+ orig_len = len(matrix.score_matrix())
+ score_matrix[:orig_len, :orig_len] = matrix.score_matrix()
+ matrix = align.SubstitutionMatrix(new_alph, new_alph, score_matrix)
+ return seq1, seq2, matrix
diff --git a/tests/sequence/align/test_matrix.py b/tests/sequence/align/test_matrix.py
index 79763213f..570878945 100644
--- a/tests/sequence/align/test_matrix.py
+++ b/tests/sequence/align/test_matrix.py
@@ -8,16 +8,22 @@
import biotite.sequence.align as align
-@pytest.mark.parametrize("db_entry", [entry for entry
- in align.SubstitutionMatrix.list_db()
- if entry not in ["NUC","GONNET"]])
+@pytest.mark.parametrize(
+ "db_entry",
+ [
+ entry
+ for entry in align.SubstitutionMatrix.list_db()
+ if entry not in ["NUC", "GONNET"]
+ ],
+)
def test_matrices(db_entry):
"""
Test for exceptions when reading matrix files.
"""
alph1 = seq.ProteinSequence.alphabet
alph2 = seq.ProteinSequence.alphabet
- matrix = align.SubstitutionMatrix(alph1, alph2, db_entry)
+ align.SubstitutionMatrix(alph1, alph2, db_entry)
+
def test_matrix_str():
"""
@@ -26,11 +32,11 @@ def test_matrix_str():
"""
alph1 = seq.Alphabet("abc")
alph2 = seq.Alphabet("def")
- score_matrix = np.arange(9).reshape((3,3))
+ score_matrix = np.arange(9).reshape((3, 3))
matrix = align.SubstitutionMatrix(alph1, alph2, score_matrix)
assert str(matrix) == "\n".join(
[" d e f",
"a 0 1 2",
"b 3 4 5",
"c 6 7 8"]
- )
\ No newline at end of file
+ ) # fmt: skip
diff --git a/tests/sequence/align/test_multiple.py b/tests/sequence/align/test_multiple.py
index 3a5a470c9..47f5200c4 100644
--- a/tests/sequence/align/test_multiple.py
+++ b/tests/sequence/align/test_multiple.py
@@ -3,19 +3,14 @@
# information.
import pytest
-import biotite.sequence.align as align
import biotite.application.muscle as muscle
+import biotite.sequence.align as align
from biotite.application import VersionError
-from ...util import is_not_installed
-from .util import sequences
+from tests.util import is_not_installed
-
-@pytest.mark.skipif(
- is_not_installed("muscle"),
- reason="MUSCLE is not installed"
-)
-@pytest.mark.parametrize("gap_penalty", [-10, (-10,-1)])
+@pytest.mark.skipif(is_not_installed("muscle"), reason="MUSCLE is not installed")
+@pytest.mark.parametrize("gap_penalty", [-10, (-10, -1)])
def test_align_multiple(sequences, gap_penalty):
r"""
Test `align_multiple()` function using actual long sequences,
@@ -26,22 +21,18 @@ def test_align_multiple(sequences, gap_penalty):
score of the MUSCLE alignment.
"""
matrix = align.SubstitutionMatrix.std_protein_matrix()
-
+
test_alignment, order, tree, distances = align.align_multiple(
sequences, matrix, gap_penalty=gap_penalty, terminal_penalty=True
)
- test_score = align.score(
- test_alignment, matrix, gap_penalty, terminal_penalty=True
- )
-
+ test_score = align.score(test_alignment, matrix, gap_penalty, terminal_penalty=True)
+
try:
ref_alignment = muscle.MuscleApp.align(
sequences, matrix=matrix, gap_penalty=gap_penalty
)
except VersionError:
- pytest.skip(f"Invalid Muscle software version")
- ref_score = align.score(
- ref_alignment, matrix, gap_penalty, terminal_penalty=True
- )
-
- assert test_score >= ref_score * 0.5
\ No newline at end of file
+ pytest.skip("Invalid Muscle software version")
+ ref_score = align.score(ref_alignment, matrix, gap_penalty, terminal_penalty=True)
+
+ assert test_score >= ref_score * 0.5
diff --git a/tests/sequence/align/test_pairwise.py b/tests/sequence/align/test_pairwise.py
index 00717df15..712dfb6b8 100644
--- a/tests/sequence/align/test_pairwise.py
+++ b/tests/sequence/align/test_pairwise.py
@@ -5,12 +5,11 @@
import itertools
import numpy as np
import pytest
+import biotite.application.muscle as muscle
import biotite.sequence as seq
import biotite.sequence.align as align
-import biotite.application.muscle as muscle
from biotite.application import VersionError
-from ...util import is_not_installed
-from .util import sequences
+from tests.util import is_not_installed
def test_align_ungapped():
@@ -26,32 +25,35 @@ def test_align_ungapped():
# [local, gap_penalty, input1, input2, expect]
-align_cases = [(False,True, -7, "TATGGGTATCC","TATGTATAA",
- ("TATGGGTATCC\nTATG--TATAA",
- "TATGGGTATCC\nTAT-G-TATAA",
- "TATGGGTATCC\nTAT--GTATAA",)),
- (True, True, -6, "TATGGGTATCC","TATGTATAA",
- ("TATGGGTAT\nTATG--TAT",
- "TATGGGTAT\nTAT-G-TAT",
- "TATGGGTAT\nTAT--GTAT",)),
- (False,True, (-7,-1), "TACTATGGGTATCC","TCATATGTATAA",
- ("TACTATGGGTATCC\nTCATATG--TATAA",
- "TACTATGGGTATCC\nTCATAT--GTATAA",)),
- (True, True, (-7,-1), "TACTATGGGTATCC","TCATATGTATAA",
- ("TATGGGTAT\nTATG--TAT",
- "TATGGGTAT\nTAT--GTAT",)),
- (False,True, (-7,-1), "T","TTT",
- ("T--\nTTT",
- "--T\nTTT",)),
- (False,True, -7, "TAAAGCGAAAT","TGCGT",
- ("TAAAGCGAAAT\nT---GCG---T")),
- (False,False,-7, "TAAAGCGAAAT","TGCGT",
- ("TAAAGCGAAAT\n---TGCGT---"))
- ]
-@pytest.mark.parametrize("local, term, gap_penalty, input1, input2, expect",
- align_cases)
-def test_align_optimal_simple(local, term, gap_penalty,
- input1, input2, expect):
+align_cases = [
+ (False,True, -7, "TATGGGTATCC","TATGTATAA",
+ ("TATGGGTATCC\nTATG--TATAA",
+ "TATGGGTATCC\nTAT-G-TATAA",
+ "TATGGGTATCC\nTAT--GTATAA",)),
+ (True, True, -6, "TATGGGTATCC","TATGTATAA",
+ ("TATGGGTAT\nTATG--TAT",
+ "TATGGGTAT\nTAT-G-TAT",
+ "TATGGGTAT\nTAT--GTAT",)),
+ (False,True, (-7,-1), "TACTATGGGTATCC","TCATATGTATAA",
+ ("TACTATGGGTATCC\nTCATATG--TATAA",
+ "TACTATGGGTATCC\nTCATAT--GTATAA",)),
+ (True, True, (-7,-1), "TACTATGGGTATCC","TCATATGTATAA",
+ ("TATGGGTAT\nTATG--TAT",
+ "TATGGGTAT\nTAT--GTAT",)),
+ (False,True, (-7,-1), "T","TTT",
+ ("T--\nTTT",
+ "--T\nTTT",)),
+ (False,True, -7, "TAAAGCGAAAT","TGCGT",
+ ("TAAAGCGAAAT\nT---GCG---T")),
+ (False,False,-7, "TAAAGCGAAAT","TGCGT",
+ ("TAAAGCGAAAT\n---TGCGT---"))
+] # fmt: skip
+
+
+@pytest.mark.parametrize(
+ "local, term, gap_penalty, input1, input2, expect", align_cases
+)
+def test_align_optimal_simple(local, term, gap_penalty, input1, input2, expect):
"""
Test `align_optimal()` function using constructed test cases.
"""
@@ -59,29 +61,27 @@ def test_align_optimal_simple(local, term, gap_penalty,
seq2 = seq.NucleotideSequence(input2)
matrix = align.SubstitutionMatrix.std_nucleotide_matrix()
# Test alignment function
- alignments = align.align_optimal(seq1, seq2,
- matrix,
- gap_penalty=gap_penalty, terminal_penalty=term,
- local=local)
-
+ alignments = align.align_optimal(
+ seq1, seq2, matrix, gap_penalty=gap_penalty, terminal_penalty=term, local=local
+ )
+
for ali in alignments:
assert str(ali) in expect
# Test if separate score function calculates the same score
for ali in alignments:
- score = align.score(ali, matrix,
- gap_penalty=gap_penalty, terminal_penalty=term)
+ score = align.score(ali, matrix, gap_penalty=gap_penalty, terminal_penalty=term)
assert score == ali.score
-@pytest.mark.skipif(
- is_not_installed("muscle"),
- reason="MUSCLE is not installed"
-)
+@pytest.mark.skipif(is_not_installed("muscle"), reason="MUSCLE is not installed")
# Ignore warning about MUSCLE writing no second guide tree
@pytest.mark.filterwarnings("ignore")
-@pytest.mark.parametrize("gap_penalty, seq_indices", itertools.product(
- [-10, (-10,-1)], [(i,j) for i in range(10) for j in range(i+1)]
-))
+@pytest.mark.parametrize(
+ "gap_penalty, seq_indices",
+ itertools.product(
+ [-10, (-10, -1)], [(i, j) for i in range(10) for j in range(i + 1)]
+ ),
+)
def test_align_optimal_complex(sequences, gap_penalty, seq_indices):
"""
Test `align_optimal()` function using real world sequences,
@@ -92,8 +92,7 @@ def test_align_optimal_complex(sequences, gap_penalty, seq_indices):
seq1 = sequences[index1]
seq2 = sequences[index2]
test_alignment = align.align_optimal(
- seq1, seq2, matrix,
- gap_penalty=gap_penalty, terminal_penalty=True, max_number=1
+ seq1, seq2, matrix, gap_penalty=gap_penalty, terminal_penalty=True, max_number=1
)[0]
try:
@@ -101,18 +100,14 @@ def test_align_optimal_complex(sequences, gap_penalty, seq_indices):
[seq1, seq2], matrix=matrix, gap_penalty=gap_penalty
)
except VersionError:
- pytest.skip(f"Invalid Muscle software version")
+ pytest.skip("Invalid Muscle software version")
# Check whether the score of the optimal alignments is the same
# or higher as the MUSCLE alignment
# Direct alignment comparison is not feasible,
# since the treatment of terminal gaps is different in MUSCLE
- test_score = align.score(
- test_alignment, matrix, gap_penalty, terminal_penalty=True
- )
- ref_score = align.score(
- ref_alignment, matrix, gap_penalty, terminal_penalty=True
- )
+ test_score = align.score(test_alignment, matrix, gap_penalty, terminal_penalty=True)
+ ref_score = align.score(ref_alignment, matrix, gap_penalty, terminal_penalty=True)
try:
assert test_score >= ref_score
except AssertionError:
@@ -127,9 +122,8 @@ def test_align_optimal_complex(sequences, gap_penalty, seq_indices):
@pytest.mark.parametrize(
- "local, term, gap_penalty, seed", itertools.product(
- [True, False], [True, False], [-5, -8, -10, -15], range(10)
- )
+ "local, term, gap_penalty, seed",
+ itertools.product([True, False], [True, False], [-5, -8, -10, -15], range(10)),
)
def test_affine_gap_penalty(local, term, gap_penalty, seed):
"""
@@ -144,11 +138,9 @@ def test_affine_gap_penalty(local, term, gap_penalty, seed):
for _ in range(2):
sequence = seq.NucleotideSequence()
length = np.random.randint(*LENGTH_RANGE)
- sequence.code = np.random.randint(
- len(sequence.alphabet), size=length
- )
+ sequence.code = np.random.randint(len(sequence.alphabet), size=length)
sequences.append(sequence)
-
+
matrix = align.SubstitutionMatrix.std_nucleotide_matrix()
ref_alignments = align.align_optimal(
@@ -177,13 +169,15 @@ def test_affine_gap_penalty(local, term, gap_penalty, seed):
@pytest.mark.parametrize(
- "local, term, gap_penalty, seq_indices", itertools.product(
- [True, False], [True, False], [-10, (-10,-1)],
- [(i,j) for i in range(10) for j in range(i+1)]
- )
+ "local, term, gap_penalty, seq_indices",
+ itertools.product(
+ [True, False],
+ [True, False],
+ [-10, (-10, -1)],
+ [(i, j) for i in range(10) for j in range(i + 1)],
+ ),
)
-def test_align_optimal_symmetry(sequences, local, term, gap_penalty,
- seq_indices):
+def test_align_optimal_symmetry(sequences, local, term, gap_penalty, seq_indices):
"""
Alignments should be indifferent about which sequence comes first.
"""
@@ -192,15 +186,23 @@ def test_align_optimal_symmetry(sequences, local, term, gap_penalty,
seq1 = sequences[index1]
seq2 = sequences[index2]
alignment1 = align.align_optimal(
- seq1, seq2, matrix,
- gap_penalty=gap_penalty, terminal_penalty=term, local=local,
- max_number=1
+ seq1,
+ seq2,
+ matrix,
+ gap_penalty=gap_penalty,
+ terminal_penalty=term,
+ local=local,
+ max_number=1,
)[0]
# Swap the sequences
alignment2 = align.align_optimal(
- seq2, seq1, matrix,
- gap_penalty=gap_penalty, terminal_penalty=term, local=local,
- max_number=1
+ seq2,
+ seq1,
+ matrix,
+ gap_penalty=gap_penalty,
+ terminal_penalty=term,
+ local=local,
+ max_number=1,
)[0]
# Comparing all traces of both alignments to each other
# would be unfeasible
@@ -209,10 +211,12 @@ def test_align_optimal_symmetry(sequences, local, term, gap_penalty,
@pytest.mark.parametrize(
- "gap_penalty, term, seq_indices", itertools.product(
- [-10, (-10,-1)], [False, True],
- [(i,j) for i in range(10) for j in range(i+1)]
- )
+ "gap_penalty, term, seq_indices",
+ itertools.product(
+ [-10, (-10, -1)],
+ [False, True],
+ [(i, j) for i in range(10) for j in range(i + 1)],
+ ),
)
def test_scoring(sequences, gap_penalty, term, seq_indices):
"""
@@ -224,12 +228,10 @@ def test_scoring(sequences, gap_penalty, term, seq_indices):
seq1 = sequences[index1]
seq2 = sequences[index2]
alignment = align.align_optimal(
- seq1, seq2, matrix, gap_penalty=gap_penalty, terminal_penalty=term,
- max_number=1
+ seq1, seq2, matrix, gap_penalty=gap_penalty, terminal_penalty=term, max_number=1
)[0]
try:
- assert align.score(alignment, matrix, gap_penalty, term) \
- == alignment.score
+ assert align.score(alignment, matrix, gap_penalty, term) == alignment.score
except AssertionError:
print(alignment)
- raise
\ No newline at end of file
+ raise
diff --git a/tests/sequence/align/test_permutation.py b/tests/sequence/align/test_permutation.py
index 9f9085f52..1b22b5579 100644
--- a/tests/sequence/align/test_permutation.py
+++ b/tests/sequence/align/test_permutation.py
@@ -3,9 +3,9 @@
# information.
import numpy as np
+import pytest
import biotite.sequence as seq
import biotite.sequence.align as align
-import pytest
def _create_frequency_permutation(k):
@@ -34,10 +34,7 @@ def test_random_permutation_modulo():
np.iinfo(np.int64).max + 1, size=SEQ_LENGTH, dtype=np.int64
)
- ref_order = [
- (LCG_A * kmer.item() + LCG_C) % LCG_M
- for kmer in kmers
- ]
+ ref_order = [(LCG_A * kmer.item() + LCG_C) % LCG_M for kmer in kmers]
permutation = align.RandomPermutation()
test_order = permutation.permute(kmers)
@@ -60,11 +57,9 @@ def test_random_permutation_randomness():
kmers = np.arange(0, SEQ_LENGTH, dtype=np.int64)
permutation = align.RandomPermutation()
order = permutation.permute(kmers)
- positive = (np.sign(order) == 1)
- n_positive = np.convolve(positive, np.ones(FRAME_SIZE), mode='valid')
- distribution, _ = np.histogram(
- n_positive, bins=np.arange(0, 10 * FRAME_SIZE)
- )
+ positive = np.sign(order) == 1
+ n_positive = np.convolve(positive, np.ones(FRAME_SIZE), mode="valid")
+ distribution, _ = np.histogram(n_positive, bins=np.arange(0, 10 * FRAME_SIZE))
# Since each value in the k-mer array is unique,
# all mapped values should be unique as well
@@ -76,9 +71,7 @@ def test_random_permutation_randomness():
def test_frequency_permutation():
K = 5
- kmer_alphabet = align.KmerAlphabet(
- seq.NucleotideSequence.alphabet_unamb, K
- )
+ kmer_alphabet = align.KmerAlphabet(seq.NucleotideSequence.alphabet_unamb, K)
np.random.seed(0)
# Generate a random count order for each k-mer
# Use 'np.arange()' to generate a unique order,
@@ -89,21 +82,24 @@ def test_frequency_permutation():
kmer_alphabet,
# The actual k-mer positions are dummy values,
# only the number of each k-mer is important for this test
- {i: np.zeros((count, 2)) for i, count in enumerate(counts)}
+ {i: np.zeros((count, 2)) for i, count in enumerate(counts)},
)
permutation = align.FrequencyPermutation.from_table(kmer_table)
kmers_sorted_by_frequency = np.argsort(counts)
- assert permutation.permute(kmers_sorted_by_frequency).tolist() \
+ assert (
+ permutation.permute(kmers_sorted_by_frequency).tolist()
== np.arange(len(kmer_alphabet), dtype=np.int64).tolist()
+ )
@pytest.mark.parametrize(
- "kmer_range, permutation", [
+ "kmer_range, permutation",
+ [
(np.iinfo(np.int64).max, align.RandomPermutation()),
(int(4**5), _create_frequency_permutation(5)),
(int(4**8), _create_frequency_permutation(8)),
- ]
+ ],
)
def test_min_max(kmer_range, permutation):
"""
diff --git a/tests/sequence/align/test_selector.py b/tests/sequence/align/test_selector.py
index cd2bcc4bb..a062df7eb 100644
--- a/tests/sequence/align/test_selector.py
+++ b/tests/sequence/align/test_selector.py
@@ -11,12 +11,7 @@
@pytest.mark.parametrize(
"seed, window, from_sequence, use_permutation",
- itertools.product(
- range(20),
- [2, 5, 10, 25],
- [False, True],
- [False, True]
- )
+ itertools.product(range(20), [2, 5, 10, 25], [False, True], [False, True]),
)
def test_minimizer(seed, window, from_sequence, use_permutation):
"""
@@ -40,23 +35,20 @@ def test_minimizer(seed, window, from_sequence, use_permutation):
order = kmers
# Use an inefficient but simple algorithm for comparison
- ref_minimizer_pos = np.array([
- np.argmin(order[i : i + window]) + i
- for i in range(len(order) - (window - 1))
- ])
+ ref_minimizer_pos = np.array(
+ [np.argmin(order[i : i + window]) + i for i in range(len(order) - (window - 1))]
+ )
# Remove duplicates
ref_minimizer_pos = np.unique(ref_minimizer_pos)
ref_minimizers = kmers[ref_minimizer_pos]
- minimizer_selector = align.MinimizerSelector(
- kmer_alph, window, permutation
- )
+ minimizer_selector = align.MinimizerSelector(kmer_alph, window, permutation)
if from_sequence:
- test_minimizer_pos, test_minimizers \
- = minimizer_selector.select(sequence)
+ test_minimizer_pos, test_minimizers = minimizer_selector.select(sequence)
else:
- test_minimizer_pos, test_minimizers \
- = minimizer_selector.select_from_kmers(kmers)
+ test_minimizer_pos, test_minimizers = minimizer_selector.select_from_kmers(
+ kmers
+ )
assert test_minimizer_pos.tolist() == ref_minimizer_pos.tolist()
assert test_minimizers.tolist() == ref_minimizers.tolist()
@@ -69,10 +61,10 @@ def test_minimizer(seed, window, from_sequence, use_permutation):
[2, 3, 5, 7],
[(0,), (0, 1, 2), (0, -1), (-2, -1)],
[False, True],
- [False, True]
+ [False, True],
),
# Print tuples in name of test
- ids=lambda x: str(x).replace(" ", "") if isinstance(x, tuple) else None
+ ids=lambda x: str(x).replace(" ", "") if isinstance(x, tuple) else None,
)
def test_syncmer(seed, s, offset, from_sequence, use_permutation):
"""
@@ -113,11 +105,9 @@ def test_syncmer(seed, s, offset, from_sequence, use_permutation):
sequence.alphabet, K, s, permutation, offset
)
if from_sequence:
- test_syncmer_pos, test_syncmers \
- = syncmer_selector.select(sequence)
+ test_syncmer_pos, test_syncmers = syncmer_selector.select(sequence)
else:
- test_syncmer_pos, test_syncmers \
- = syncmer_selector.select_from_kmers(kmers)
+ test_syncmer_pos, test_syncmers = syncmer_selector.select_from_kmers(kmers)
assert test_syncmer_pos.tolist() == ref_syncmer_pos.tolist()
assert test_syncmers.tolist() == ref_syncmers.tolist()
@@ -141,14 +131,10 @@ def test_cached_syncmer():
np.random.seed(0)
sequence.code = np.random.randint(len(sequence.alphabet), size=LENGTH)
- syncmer_selector = align.SyncmerSelector(
- sequence.alphabet, K, S
- )
+ syncmer_selector = align.SyncmerSelector(sequence.alphabet, K, S)
ref_syncmer_pos, ref_syncmers = syncmer_selector.select(sequence)
- cached_syncmer_selector = align.CachedSyncmerSelector(
- sequence.alphabet, K, S
- )
+ cached_syncmer_selector = align.CachedSyncmerSelector(sequence.alphabet, K, S)
test_syncmer_pos, test_syncmers = cached_syncmer_selector.select(sequence)
assert test_syncmer_pos.tolist() == ref_syncmer_pos.tolist()
@@ -159,13 +145,13 @@ def test_cached_syncmer():
"offset, exception_type",
[
# Duplicate values
- ((1, 1), ValueError),
+ ((1, 1), ValueError),
((0, 2, 0), ValueError),
- ((0, -10), ValueError),
+ ((0, -10), ValueError),
# Offset out of window range
- ((-11,), IndexError),
- ((10,), IndexError),
- ]
+ ((-11,), IndexError),
+ ((10,), IndexError),
+ ],
)
def test_syncmer_invalid_offset(offset, exception_type):
"""
@@ -176,7 +162,10 @@ def test_syncmer_invalid_offset(offset, exception_type):
with pytest.raises(exception_type):
align.SyncmerSelector(
# Any alphabet would work here
- seq.NucleotideSequence.alphabet_unamb, K, S, offset=offset
+ seq.NucleotideSequence.alphabet_unamb,
+ K,
+ S,
+ offset=offset,
)
@@ -205,12 +194,9 @@ def test_mincode(use_permutation):
permutation_range = len(kmer_alph)
order = kmers
- mincode_selector = align.MincodeSelector(
- kmer_alph, COMPRESSION, permutation
- )
+ mincode_selector = align.MincodeSelector(kmer_alph, COMPRESSION, permutation)
_, mincode_pos = mincode_selector.select_from_kmers(kmers)
threshold = permutation_offset + permutation_range / COMPRESSION
assert mincode_pos.tolist() == np.where(order < threshold)[0].tolist()
- assert len(mincode_pos) * COMPRESSION \
- == pytest.approx(len(kmers), rel=0.02)
\ No newline at end of file
+ assert len(mincode_pos) * COMPRESSION == pytest.approx(len(kmers), rel=0.02)
diff --git a/tests/sequence/align/test_statistics.py b/tests/sequence/align/test_statistics.py
index b9defcc46..cb0840a16 100644
--- a/tests/sequence/align/test_statistics.py
+++ b/tests/sequence/align/test_statistics.py
@@ -2,50 +2,55 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-import pytest
import numpy as np
+import pytest
import biotite.sequence as seq
import biotite.sequence.align as align
from biotite.sequence.align.statistics import EValueEstimator
-from .util import sequences
-
-
-BACKGROUND = np.array(list({
- "A": 35155,
- "C": 8669,
- "D": 24161,
- "E": 28354,
- "F": 17367,
- "G": 33229,
- "H": 9906,
- "I": 23161,
- "K": 25872,
- "L": 40625,
- "M": 10101,
- "N": 20212,
- "P": 23435,
- "Q": 19208,
- "R": 23105,
- "S": 32070,
- "T": 26311,
- "V": 29012,
- "W": 5990,
- "Y": 14488,
- "B": 0,
- "Z": 0,
- "X": 0,
- "*": 0,
-}.values())) / 450431
+
+BACKGROUND = (
+ np.array(
+ list(
+ {
+ "A": 35155,
+ "C": 8669,
+ "D": 24161,
+ "E": 28354,
+ "F": 17367,
+ "G": 33229,
+ "H": 9906,
+ "I": 23161,
+ "K": 25872,
+ "L": 40625,
+ "M": 10101,
+ "N": 20212,
+ "P": 23435,
+ "Q": 19208,
+ "R": 23105,
+ "S": 32070,
+ "T": 26311,
+ "V": 29012,
+ "W": 5990,
+ "Y": 14488,
+ "B": 0,
+ "Z": 0,
+ "X": 0,
+ "*": 0,
+ }.values()
+ )
+ )
+ / 450431
+)
@pytest.mark.parametrize(
"matrix_name, gap_penalty, ref_lam, ref_k",
[
("BLOSUM62", (-10000, -10000), 0.318, 0.130),
- ("BLOSUM62", ( -12, -2), 0.300, 0.090),
- ("BLOSUM62", ( -5, -5), 0.131, 0.009),
- ( "PAM250", ( -16, -1), 0.172, 0.018),
- ]
+ ("BLOSUM62", (-12, -2), 0.300, 0.090),
+ ("BLOSUM62", (-5, -5), 0.131, 0.009),
+ ("PAM250", (-16, -1), 0.172, 0.018),
+ ],
)
def test_distribution_param(matrix_name, gap_penalty, ref_lam, ref_k):
"""
@@ -55,14 +60,13 @@ def test_distribution_param(matrix_name, gap_penalty, ref_lam, ref_k):
"""
SAMPLE_LENGTH = 500
SAMPLE_SIZE = 1000
-
+
alphabet = seq.ProteinSequence.alphabet
matrix = align.SubstitutionMatrix(alphabet, alphabet, matrix_name)
np.random.seed(0)
estimator = align.EValueEstimator.from_samples(
- alphabet, matrix, gap_penalty, BACKGROUND,
- SAMPLE_LENGTH, SAMPLE_SIZE
+ alphabet, matrix, gap_penalty, BACKGROUND, SAMPLE_LENGTH, SAMPLE_SIZE
)
# Due to relatively low sample size, expect rather large deviation
@@ -85,35 +89,29 @@ def test_evalue():
matrix = align.SubstitutionMatrix.std_protein_matrix()
estimator = align.EValueEstimator.from_samples(
- seq.ProteinSequence.alphabet, matrix, GAP_PENALTY,
- BACKGROUND
+ seq.ProteinSequence.alphabet, matrix, GAP_PENALTY, BACKGROUND
)
# Generate large number of alignments of random sequences
np.random.seed(0)
random_sequence_code = np.random.choice(
- len(seq.ProteinSequence.alphabet),
- size=(N_SAMPLES, 2, SEQ_LENGTH),
- p=BACKGROUND
+ len(seq.ProteinSequence.alphabet), size=(N_SAMPLES, 2, SEQ_LENGTH), p=BACKGROUND
)
sample_scores = np.zeros(N_SAMPLES, dtype=int)
for i in range(N_SAMPLES):
seq1 = seq.ProteinSequence()
seq2 = seq.ProteinSequence()
- seq1.code = random_sequence_code[i,0]
- seq2.code = random_sequence_code[i,1]
+ seq1.code = random_sequence_code[i, 0]
+ seq2.code = random_sequence_code[i, 1]
sample_scores[i] = align.align_optimal(
- seq1, seq2, matrix,
- local=True, gap_penalty=GAP_PENALTY, max_number=1
+ seq1, seq2, matrix, local=True, gap_penalty=GAP_PENALTY, max_number=1
)[0].score
e_values = [
10 ** estimator.log_evalue(score, SEQ_LENGTH, SEQ_LENGTH * N_SAMPLES)
for score in TEST_SCORES
]
- counts = [
- np.count_nonzero(sample_scores >= score) for score in TEST_SCORES
- ]
+ counts = [np.count_nonzero(sample_scores >= score) for score in TEST_SCORES]
assert e_values == pytest.approx(counts, rel=0.5)
@@ -133,45 +131,50 @@ def test_score_scaling(sequences):
np.random.seed(0)
std_estimator = align.EValueEstimator.from_samples(
- seq.ProteinSequence.alphabet, matrix, GAP_PENALTY,
- BACKGROUND
+ seq.ProteinSequence.alphabet, matrix, GAP_PENALTY, BACKGROUND
)
scores = [
align.align_optimal(
- sequences[i], sequences[i+1], matrix, GAP_PENALTY, local=True,
- max_number=1
- )[0].score for i in range(9)
+ sequences[i],
+ sequences[i + 1],
+ matrix,
+ GAP_PENALTY,
+ local=True,
+ max_number=1,
+ )[0].score
+ for i in range(9)
]
- std_log_evalues = std_estimator.log_evalue(
- scores, SEQ_LENGTH, SEQ_LENGTH
- )
+ std_log_evalues = std_estimator.log_evalue(scores, SEQ_LENGTH, SEQ_LENGTH)
scaled_matrix = align.SubstitutionMatrix(
seq.ProteinSequence.alphabet,
seq.ProteinSequence.alphabet,
- matrix.score_matrix() * SCALING_FACTOR
+ matrix.score_matrix() * SCALING_FACTOR,
)
scaled_gap_penalty = (
GAP_PENALTY[0] * SCALING_FACTOR,
- GAP_PENALTY[1] * SCALING_FACTOR
+ GAP_PENALTY[1] * SCALING_FACTOR,
)
scaled_estimator = align.EValueEstimator.from_samples(
- seq.ProteinSequence.alphabet, scaled_matrix, scaled_gap_penalty,
- BACKGROUND
+ seq.ProteinSequence.alphabet, scaled_matrix, scaled_gap_penalty, BACKGROUND
)
scores = [
align.align_optimal(
- sequences[i], sequences[i+1], scaled_matrix, scaled_gap_penalty,
- local=True, max_number=1
- )[0].score for i in range(9)
+ sequences[i],
+ sequences[i + 1],
+ scaled_matrix,
+ scaled_gap_penalty,
+ local=True,
+ max_number=1,
+ )[0].score
+ for i in range(9)
]
- scaled_log_evalues = scaled_estimator.log_evalue(
- scores, SEQ_LENGTH, SEQ_LENGTH
- )
+ scaled_log_evalues = scaled_estimator.log_evalue(scores, SEQ_LENGTH, SEQ_LENGTH)
# Due to relatively low sample size, expect rather large deviation
- assert std_log_evalues.tolist() \
- == pytest.approx(scaled_log_evalues.tolist(), rel=0.2)
+ assert std_log_evalues.tolist() == pytest.approx(
+ scaled_log_evalues.tolist(), rel=0.2
+ )
def test_invalid_scoring_scheme():
@@ -185,6 +188,6 @@ def test_invalid_scoring_scheme():
)
# Uniform background frequencies
freq = np.ones(len(alph))
-
+
with pytest.raises(ValueError):
- estimator = EValueEstimator.from_samples(alph, matrix, -10, freq)
\ No newline at end of file
+ EValueEstimator.from_samples(alph, matrix, -10, freq)
diff --git a/tests/sequence/test_alphabet.py b/tests/sequence/test_alphabet.py
index b99756e79..ba6ef023f 100644
--- a/tests/sequence/test_alphabet.py
+++ b/tests/sequence/test_alphabet.py
@@ -3,16 +3,19 @@
# information.
import itertools
-import pytest
import numpy as np
+import pytest
import biotite.sequence as seq
-
test_cases = {
- "A" : [0],
- "D" : [3],
- "ABC" : [0,1,2,],
- "ABAFF" : [0,1,0,5,5]
+ "A": [0],
+ "D": [3],
+ "ABC": [
+ 0,
+ 1,
+ 2,
+ ],
+ "ABAFF": [0, 1, 0, 5, 5],
}
@@ -24,17 +27,17 @@ def alphabet_symbols():
@pytest.mark.parametrize(
"symbols, exp_code, use_letter_alphabet",
zip(
- list(test_cases.keys() ) * 2,
+ list(test_cases.keys()) * 2,
list(test_cases.values()) * 2,
- [False] * len(test_cases) + [True] * len(test_cases)
- )
+ [False] * len(test_cases) + [True] * len(test_cases),
+ ),
)
def test_encoding(alphabet_symbols, symbols, exp_code, use_letter_alphabet):
if use_letter_alphabet:
alph = seq.LetterAlphabet(alphabet_symbols)
else:
alph = seq.Alphabet(alphabet_symbols)
-
+
if len(symbols) == 1:
assert alph.encode(symbols[0]) == exp_code[0]
else:
@@ -44,17 +47,17 @@ def test_encoding(alphabet_symbols, symbols, exp_code, use_letter_alphabet):
@pytest.mark.parametrize(
"exp_symbols, code, use_letter_alphabet",
zip(
- list(test_cases.keys() ) * 2,
+ list(test_cases.keys()) * 2,
list(test_cases.values()) * 2,
- [False] * len(test_cases) + [True] * len(test_cases)
- )
+ [False] * len(test_cases) + [True] * len(test_cases),
+ ),
)
def test_decoding(alphabet_symbols, exp_symbols, code, use_letter_alphabet):
if use_letter_alphabet:
alph = seq.LetterAlphabet(alphabet_symbols)
else:
alph = seq.Alphabet(alphabet_symbols)
-
+
code = np.array(code, dtype=np.uint8)
if len(code) == 1:
assert alph.decode(code[0]) == exp_symbols[0]
@@ -64,9 +67,7 @@ def test_decoding(alphabet_symbols, exp_symbols, code, use_letter_alphabet):
@pytest.mark.parametrize(
"use_letter_alphabet, is_single_val",
- itertools.product(
- [False, True], [False, True]
- )
+ itertools.product([False, True], [False, True]),
)
def test_error(alphabet_symbols, use_letter_alphabet, is_single_val):
if use_letter_alphabet:
@@ -96,8 +97,13 @@ def test_error(alphabet_symbols, use_letter_alphabet, is_single_val):
@pytest.mark.parametrize(
"symbols",
- ["ABC", b"ABC", ["A","B","C"],
- np.array(["A","B","C"]), np.array([b"A",b"B",b"C"])]
+ [
+ "ABC",
+ b"ABC",
+ ["A", "B", "C"],
+ np.array(["A", "B", "C"]),
+ np.array([b"A", b"B", b"C"]),
+ ],
)
def test_input_types(alphabet_symbols, symbols):
"""
@@ -108,13 +114,14 @@ def test_input_types(alphabet_symbols, symbols):
alph = seq.LetterAlphabet(alphabet_symbols)
code = alph.encode_multiple(symbols)
conv_symbols = alph.decode_multiple(code)
-
-
+
if isinstance(symbols, bytes):
symbols = symbols.decode("ASCII")
assert list(conv_symbols) == list(
- [symbol.decode("ASCII") if isinstance(symbol, bytes) else symbol
- for symbol in symbols]
+ [
+ symbol.decode("ASCII") if isinstance(symbol, bytes) else symbol
+ for symbol in symbols
+ ]
)
@@ -137,26 +144,24 @@ def test_contains(alphabet_symbols, use_letter_alphabet):
@pytest.mark.parametrize(
- "source_alph_symbols, target_alph_symbols",
+ "source_alph_symbols, target_alph_symbols",
[
("A", "AB"),
(["foo", "bar"], ["bar", "foo", 42]),
("ACGT", "AGTC"),
("ACGT", "ACGNT"),
(np.arange(0, 1000), np.arange(999, -1, -1)),
- ]
+ ],
)
def test_alphabet_mapper(source_alph_symbols, target_alph_symbols):
CODE_LENGTH = 10000
source_alph = seq.Alphabet(source_alph_symbols)
target_alph = seq.Alphabet(target_alph_symbols)
mapper = seq.AlphabetMapper(source_alph, target_alph)
-
+
ref_sequence = seq.GeneralSequence(source_alph)
np.random.seed(0)
- ref_sequence.code = np.random.randint(
- len(source_alph), size=CODE_LENGTH, dtype=int
- )
+ ref_sequence.code = np.random.randint(len(source_alph), size=CODE_LENGTH, dtype=int)
test_sequence = seq.GeneralSequence(target_alph)
test_sequence.code = mapper[ref_sequence.code]
@@ -164,22 +169,25 @@ def test_alphabet_mapper(source_alph_symbols, target_alph_symbols):
assert test_sequence.symbols == ref_sequence.symbols
-@pytest.mark.parametrize("alphabets, common_alph", [
- (
- [
+@pytest.mark.parametrize(
+ "alphabets, common_alph",
+ [
+ (
+ [
+ seq.NucleotideSequence.alphabet_amb,
+ seq.NucleotideSequence.alphabet_unamb,
+ ],
seq.NucleotideSequence.alphabet_amb,
- seq.NucleotideSequence.alphabet_unamb,
- ],
- seq.NucleotideSequence.alphabet_amb
- ),
- (
- [
- seq.NucleotideSequence.alphabet_unamb,
+ ),
+ (
+ [
+ seq.NucleotideSequence.alphabet_unamb,
+ seq.NucleotideSequence.alphabet_amb,
+ ],
seq.NucleotideSequence.alphabet_amb,
- ],
- seq.NucleotideSequence.alphabet_amb
- ),
-])
+ ),
+ ],
+)
def test_common_alphabet(alphabets, common_alph):
"""
Check if :func:`common_alphabet()` correctly identifies the common
@@ -188,13 +196,14 @@ def test_common_alphabet(alphabets, common_alph):
seq.common_alphabet(alphabets) == common_alph
-
def test_common_alphabet_no_common():
"""
Check if :func:`common_alphabet()` correctly identifies that no
common alphabet exists in a simple known test case.
"""
- assert seq.common_alphabet([
- seq.NucleotideSequence.alphabet_unamb,
- seq.ProteinSequence.alphabet
- ]) is None
\ No newline at end of file
+ assert (
+ seq.common_alphabet(
+ [seq.NucleotideSequence.alphabet_unamb, seq.ProteinSequence.alphabet]
+ )
+ is None
+ )
diff --git a/tests/sequence/test_annotation.py b/tests/sequence/test_annotation.py
index 4ce771692..b1159933f 100644
--- a/tests/sequence/test_annotation.py
+++ b/tests/sequence/test_annotation.py
@@ -2,58 +2,62 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
+from os.path import join
import biotite.sequence as seq
-from biotite.sequence import Location, Feature, Annotation, AnnotatedSequence
import biotite.sequence.io.genbank as gb
-import numpy as np
-from os.path import join
-from ..util import data_dir
-import pytest
+from biotite.sequence import AnnotatedSequence, Annotation, Feature, Location
+from tests.util import data_dir
def test_annotation_creation():
- feature1 = Feature("CDS", [seq.Location(1,2)], qual={"gene" : "test1"})
- feature2 = Feature("CDS", [seq.Location(3,4)], qual={"gene" : "test2"})
+ feature1 = Feature("CDS", [seq.Location(1, 2)], qual={"gene": "test1"})
+ feature2 = Feature("CDS", [seq.Location(3, 4)], qual={"gene": "test2"})
feature_list = [feature1, feature2]
annotation = Annotation(feature_list)
for feature in annotation:
assert feature.key in [f.key for f in feature_list]
- assert feature.qual["gene"] in [
- f.qual["gene"] for f in feature_list
- ]
+ assert feature.qual["gene"] in [f.qual["gene"] for f in feature_list]
+
def test_annotation_concatenation():
- feature1 = Feature("CDS", [seq.Location(1,1)], qual={"gene" : "test1"})
- feature2 = Feature("CDS", [seq.Location(2,2)], qual={"gene" : "test2"})
+ feature1 = Feature("CDS", [seq.Location(1, 1)], qual={"gene": "test1"})
+ feature2 = Feature("CDS", [seq.Location(2, 2)], qual={"gene": "test2"})
annot1 = Annotation([feature1, feature2])
- feature3 = Feature("CDS", [seq.Location(3,3)], qual={"gene" : "test3"})
- feature4 = Feature("CDS", [seq.Location(4,4)], qual={"gene" : "test4"})
+ feature3 = Feature("CDS", [seq.Location(3, 3)], qual={"gene": "test3"})
+ feature4 = Feature("CDS", [seq.Location(4, 4)], qual={"gene": "test4"})
annot2 = Annotation([feature3, feature4])
- feature5 = Feature("CDS", [seq.Location(5,5)], qual={"gene" : "test5"})
+ feature5 = Feature("CDS", [seq.Location(5, 5)], qual={"gene": "test5"})
concat = annot1 + annot2 + feature5
- assert set([f.qual["gene"] for f in concat]) \
- == set(["test1", "test2", "test3", "test4", "test5"])
+ assert set([f.qual["gene"] for f in concat]) == set(
+ ["test1", "test2", "test3", "test4", "test5"]
+ )
+
def test_annotation_indexing():
- feature1 = Feature("CDS", [Location(-10,30 )], qual={"gene" : "test1"})
- feature2 = Feature("CDS", [Location(20, 50 )], qual={"gene" : "test2"})
- feature3 = Feature("CDS", [Location(100,130)], qual={"gene" : "test3"})
- feature4 = Feature("CDS", [Location(150,250)], qual={"gene" : "test4"})
- feature5 = Feature("CDS", [Location(-50,200)], qual={"gene" : "test5"})
- annotation = Annotation([feature1,feature2,feature3,feature4,feature5])
+ feature1 = Feature("CDS", [Location(-10, 30)], qual={"gene": "test1"})
+ feature2 = Feature("CDS", [Location(20, 50)], qual={"gene": "test2"})
+ feature3 = Feature("CDS", [Location(100, 130)], qual={"gene": "test3"})
+ feature4 = Feature("CDS", [Location(150, 250)], qual={"gene": "test4"})
+ feature5 = Feature("CDS", [Location(-50, 200)], qual={"gene": "test5"})
+ annotation = Annotation([feature1, feature2, feature3, feature4, feature5])
sub_annot = annotation[40:150]
# Only one location per feature
- assert set([list(f.locs)[0].defect for f in sub_annot]) \
- == set([Location.Defect.MISS_LEFT, Location.Defect.NONE,
- (Location.Defect.MISS_LEFT | Location.Defect.MISS_RIGHT)])
- assert set([f.qual["gene"] for f in sub_annot]) \
- == set(["test2", "test3", "test5"])
+ assert set([list(f.locs)[0].defect for f in sub_annot]) == set(
+ [
+ Location.Defect.MISS_LEFT,
+ Location.Defect.NONE,
+ (Location.Defect.MISS_LEFT | Location.Defect.MISS_RIGHT),
+ ]
+ )
+ assert set([f.qual["gene"] for f in sub_annot]) == set(["test2", "test3", "test5"])
+
def test_annotated_sequence():
sequence = seq.NucleotideSequence("ATGGCGTACGATTAGAAAAAAA")
- feature1 = Feature("misc_feature", [Location(1,2), Location(11,12)],
- {"note" : "walker"})
- feature2 = Feature("misc_feature", [Location(16,22)], {"note" : "poly-A"})
+ feature1 = Feature(
+ "misc_feature", [Location(1, 2), Location(11, 12)], {"note": "walker"}
+ )
+ feature2 = Feature("misc_feature", [Location(16, 22)], {"note": "poly-A"})
annotation = Annotation([feature1, feature2])
annot_seq = AnnotatedSequence(annotation, sequence)
assert annot_seq[2] == "T"
@@ -62,17 +66,19 @@ def test_annotated_sequence():
# test slicing with only stop
annot_seq2 = annot_seq[:16]
assert annot_seq2.sequence == seq.NucleotideSequence("ATGGCGTACGATTAG")
- assert set([f.qual['note'] for f in annot_seq2.annotation]) == {'walker'}
+ assert set([f.qual["note"] for f in annot_seq2.annotation]) == {"walker"}
# test slicing with only start
annot_seq3 = annot_seq[16:]
assert annot_seq3.sequence == seq.NucleotideSequence("AAAAAAA")
- assert set([f.qual['note'] for f in annot_seq3.annotation]) == {'poly-A'}
+ assert set([f.qual["note"] for f in annot_seq3.annotation]) == {"poly-A"}
# test slicing with start and stop
annot_seq4 = annot_seq[1:17]
- assert annot_seq4.sequence == seq.NucleotideSequence("ATGGCGTACGATTAGA") # sequences are 1-indexed
- assert set([f.qual['note'] for f in annot_seq4.annotation]) == {'walker', 'poly-A'}
+ assert annot_seq4.sequence == seq.NucleotideSequence(
+ "ATGGCGTACGATTAGA"
+ ) # sequences are 1-indexed
+ assert set([f.qual["note"] for f in annot_seq4.annotation]) == {"walker", "poly-A"}
assert annot_seq[feature1] == seq.NucleotideSequence("ATAT")
assert annot_seq[feature2] == seq.NucleotideSequence("AAAAAAA")
@@ -80,12 +86,17 @@ def test_annotated_sequence():
assert annot_seq.sequence == seq.NucleotideSequence("CCGGCGTACGCCTAGAAAAAAA")
# test slicing with feature on minus strand
- feature3 = Feature("misc_feature", [Location(1,4), Location(8,12)])
- feature4 = Feature("misc_feature_minus", [
- Location(1,4,strand=Location.Strand.REVERSE),
- Location(8,12,strand=Location.Strand.REVERSE)])
+ feature3 = Feature("misc_feature", [Location(1, 4), Location(8, 12)])
+ feature4 = Feature(
+ "misc_feature_minus",
+ [
+ Location(1, 4, strand=Location.Strand.REVERSE),
+ Location(8, 12, strand=Location.Strand.REVERSE),
+ ],
+ )
assert annot_seq[feature4] == annot_seq[feature3].reverse().complement()
+
def test_reverse_complement():
gb_file = gb.GenBankFile.read(join(data_dir("sequence"), "ec_bl21.gb"))
annot_seq = gb.get_annotated_sequence(gb_file)
diff --git a/tests/sequence/test_codon.py b/tests/sequence/test_codon.py
index 315f2b0fc..fe8d38eb4 100644
--- a/tests/sequence/test_codon.py
+++ b/tests/sequence/test_codon.py
@@ -2,14 +2,41 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-import biotite.sequence as seq
import pytest
+import biotite.sequence as seq
-@pytest.mark.parametrize("table_id",
- [1,2,3,4,5,6,9,10,11,12,13,14,16,21,22,23,24,25,26,27,28,29,30,31])
+@pytest.mark.parametrize(
+ "table_id",
+ [
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6,
+ 9,
+ 10,
+ 11,
+ 12,
+ 13,
+ 14,
+ 16,
+ 21,
+ 22,
+ 23,
+ 24,
+ 25,
+ 26,
+ 27,
+ 28,
+ 29,
+ 30,
+ 31,
+ ],
+)
def test_table_load(table_id):
- table = seq.CodonTable.load(table_id)
+ seq.CodonTable.load(table_id)
def test_table_indexing():
diff --git a/tests/sequence/test_fasta.py b/tests/sequence/test_fasta.py
index 1b7103e30..68133f44b 100644
--- a/tests/sequence/test_fasta.py
+++ b/tests/sequence/test_fasta.py
@@ -2,18 +2,18 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-import itertools
import glob
import io
-import biotite.sequence as seq
-import biotite.sequence.io.fasta as fasta
-import numpy as np
+import itertools
import os
import os.path
-from ..util import data_dir
+import numpy as np
import pytest
+import biotite.sequence as seq
+import biotite.sequence.io.fasta as fasta
+from tests.util import data_dir
+
-
def test_access_low_level():
path = os.path.join(data_dir("sequence"), "nuc.fasta")
file = fasta.FastaFile.read(path)
@@ -21,21 +21,21 @@ def test_access_low_level():
assert file["another dna sequence"] == "A"
assert file["third dna sequence"] == "ACGT"
assert dict(file.items()) == {
- "dna sequence" : "ACGCTACGT",
- "another dna sequence" : "A",
- "third dna sequence" : "ACGT",
- "rna sequence" : "ACGU",
- "ambiguous rna sequence" : "ACGUNN",
+ "dna sequence": "ACGCTACGT",
+ "another dna sequence": "A",
+ "third dna sequence": "ACGT",
+ "rna sequence": "ACGU",
+ "ambiguous rna sequence": "ACGUNN",
}
file["another dna sequence"] = "AA"
del file["dna sequence"]
file["yet another sequence"] = "ACGT"
assert dict(file.items()) == {
- "another dna sequence" : "AA",
- "third dna sequence" : "ACGT",
- "rna sequence" : "ACGU",
- "ambiguous rna sequence" : "ACGUNN",
- "yet another sequence" : "ACGT",
+ "another dna sequence": "AA",
+ "third dna sequence": "ACGT",
+ "rna sequence": "ACGU",
+ "ambiguous rna sequence": "ACGUNN",
+ "yet another sequence": "ACGT",
}
@@ -45,16 +45,16 @@ def test_access_high_level(seq_type):
file = fasta.FastaFile.read(path)
sequences = fasta.get_sequences(file, seq_type=seq_type)
assert sequences == {
- "dna sequence" : seq.NucleotideSequence("ACGCTACGT", False),
- "another dna sequence" : seq.NucleotideSequence("A", False),
- "third dna sequence" : seq.NucleotideSequence("ACGT", False),
- "rna sequence" : seq.NucleotideSequence("ACGT", False),
- "ambiguous rna sequence" : seq.NucleotideSequence("ACGTNN", True),
+ "dna sequence": seq.NucleotideSequence("ACGCTACGT", False),
+ "another dna sequence": seq.NucleotideSequence("A", False),
+ "third dna sequence": seq.NucleotideSequence("ACGT", False),
+ "rna sequence": seq.NucleotideSequence("ACGT", False),
+ "ambiguous rna sequence": seq.NucleotideSequence("ACGTNN", True),
}
@pytest.mark.parametrize(
- "seq_type", (None, seq.NucleotideSequence, seq.ProteinSequence)
+ "seq_type", (None, seq.NucleotideSequence, seq.ProteinSequence)
)
def test_sequence_conversion_ambiguous(seq_type):
path = os.path.join(data_dir("sequence"), "nuc.fasta")
@@ -67,10 +67,8 @@ def test_sequence_conversion_ambiguous(seq_type):
file, seq_type=None
)
else:
- assert seq_type(sequence) == fasta.get_sequence(
- file, seq_type=seq_type
- )
-
+ assert seq_type(sequence) == fasta.get_sequence(file, seq_type=seq_type)
+
seq_dict = fasta.get_sequences(file)
file2 = fasta.FastaFile()
fasta.set_sequences(file2, seq_dict)
@@ -84,7 +82,7 @@ def test_sequence_conversion_ambiguous(seq_type):
assert str(seq1) == str(seq2)
else:
assert seq_dict == seq_dict2
-
+
if seq_type is not None:
sequence = "AACCTTGG"
file3 = fasta.FastaFile()
@@ -93,7 +91,7 @@ def test_sequence_conversion_ambiguous(seq_type):
@pytest.mark.parametrize(
- "seq_type", (None, seq.NucleotideSequence, seq.ProteinSequence)
+ "seq_type", (None, seq.NucleotideSequence, seq.ProteinSequence)
)
def test_sequence_conversion_protein(seq_type):
path = os.path.join(data_dir("sequence"), "prot.fasta")
@@ -112,7 +110,7 @@ def test_sequence_conversion_protein(seq_type):
@pytest.mark.parametrize(
- "seq_type", (None, seq.NucleotideSequence, seq.ProteinSequence)
+ "seq_type", (None, seq.NucleotideSequence, seq.ProteinSequence)
)
def test_sequence_conversion_invalid(seq_type):
path = os.path.join(data_dir("sequence"), "invalid.fasta")
@@ -134,32 +132,31 @@ def test_alignment_conversion():
path = os.path.join(data_dir("sequence"), "alignment.fasta")
file = fasta.FastaFile.read(path)
alignment = fasta.get_alignment(file)
- assert str(alignment) == ("ADTRCGTARDCGTR-DRTCGRAGD\n"
- "ADTRCGT---CGTRADRTCGRAGD\n"
- "ADTRCGTARDCGTRADR--GRAGD")
-
+ assert str(alignment) == (
+ "ADTRCGTARDCGTR-DRTCGRAGD\n"
+ "ADTRCGT---CGTRADRTCGRAGD\n"
+ "ADTRCGTARDCGTRADR--GRAGD"
+ )
+
file2 = fasta.FastaFile()
- fasta.set_alignment(file2, alignment, seq_names=["seq1","seq2","seq3"])
+ fasta.set_alignment(file2, alignment, seq_names=["seq1", "seq2", "seq3"])
alignment2 = fasta.get_alignment(file2)
assert str(alignment) == str(alignment2)
+
@pytest.mark.parametrize(
- "file_name",
- glob.glob(os.path.join(data_dir("sequence"), "*.fasta"))
+ "file_name", glob.glob(os.path.join(data_dir("sequence"), "*.fasta"))
)
def test_read_iter(file_name):
ref_dict = dict(fasta.FastaFile.read(file_name).items())
-
+
test_dict = dict(fasta.FastaFile.read_iter(file_name))
assert test_dict == ref_dict
@pytest.mark.parametrize(
- "chars_per_line, n_sequences", itertools.product(
- [80, 200],
- [1, 10]
- )
+ "chars_per_line, n_sequences", itertools.product([80, 200], [1, 10])
)
def test_write_iter(chars_per_line, n_sequences):
"""
@@ -168,7 +165,6 @@ def test_write_iter(chars_per_line, n_sequences):
random sequences.
"""
LENGTH_RANGE = (50, 150)
- SCORE_RANGE = (10, 60)
# Generate random sequences and scores
np.random.seed(0)
@@ -176,28 +172,24 @@ def test_write_iter(chars_per_line, n_sequences):
for i in range(n_sequences):
seq_length = np.random.randint(*LENGTH_RANGE)
code = np.random.randint(
- len(seq.NucleotideSequence.alphabet_unamb),
- size=seq_length
+ len(seq.NucleotideSequence.alphabet_unamb), size=seq_length
)
sequence = seq.NucleotideSequence()
sequence.code = code
sequences.append(sequence)
-
+
fasta_file = fasta.FastaFile(chars_per_line)
for i, sequence in enumerate(sequences):
header = f"seq_{i}"
fasta_file[header] = str(sequence)
ref_file = io.StringIO()
fasta_file.write(ref_file)
-
+
test_file = io.StringIO()
fasta.FastaFile.write_iter(
test_file,
- (
- (f"seq_{i}", str(sequence))
- for i, sequence in enumerate(sequences)
- ),
- chars_per_line
+ ((f"seq_{i}", str(sequence)) for i, sequence in enumerate(sequences)),
+ chars_per_line,
)
- assert test_file.getvalue() == ref_file.getvalue()
\ No newline at end of file
+ assert test_file.getvalue() == ref_file.getvalue()
diff --git a/tests/sequence/test_fastq.py b/tests/sequence/test_fastq.py
index d497787a8..3d0023337 100644
--- a/tests/sequence/test_fastq.py
+++ b/tests/sequence/test_fastq.py
@@ -5,43 +5,40 @@
import glob
import io
import itertools
-from tempfile import TemporaryFile
-import biotite.sequence as seq
-import biotite.sequence.io.fastq as fastq
-import numpy as np
import os
import os.path
-from ..util import data_dir
+from tempfile import TemporaryFile
+import numpy as np
import pytest
+import biotite.sequence as seq
+import biotite.sequence.io.fastq as fastq
+from tests.util import data_dir
+
@pytest.mark.parametrize("chars_per_line", [None, 80])
def test_access(chars_per_line):
path = os.path.join(data_dir("sequence"), "random.fastq")
- file = fastq.FastqFile.read(
- path, offset=33, chars_per_line=chars_per_line
- )
+ file = fastq.FastqFile.read(path, offset=33, chars_per_line=chars_per_line)
assert len(file) == 20
assert list(file.keys()) == [f"Read:{i+1:02d}" for i in range(20)]
- del(file["Read:05"])
+ del file["Read:05"]
assert len(file) == 19
- assert list(file.keys()) == [f"Read:{i+1:02d}" for i in range(20)
- if i+1 != 5]
+ assert list(file.keys()) == [f"Read:{i+1:02d}" for i in range(20) if i + 1 != 5]
for seq_str, scores in file.values():
assert len(seq_str) == len(scores)
assert (scores >= 0).all()
seq_str = "ACTCGGT"
- scores = np.array([10,12,20,11,0,80,42])
+ scores = np.array([10, 12, 20, 11, 0, 80, 42])
file["test"] = seq_str, scores
seq_str2, scores2 = file["test"]
assert seq_str == seq_str2
assert np.array_equal(scores, scores2)
+
@pytest.mark.parametrize("chars_per_line", [None, 80])
def test_conversion(chars_per_line):
path = os.path.join(data_dir("sequence"), "random.fastq")
- fasta_file = fastq.FastqFile.read(
- path, offset=33, chars_per_line=chars_per_line
- )
+ fasta_file = fastq.FastqFile.read(path, offset=33, chars_per_line=chars_per_line)
ref_content = dict(fasta_file.items())
fasta_file = fastq.FastqFile(offset=33, chars_per_line=chars_per_line)
@@ -51,48 +48,46 @@ def test_conversion(chars_per_line):
fasta_file.write(temp)
temp.seek(0)
- fasta_file = fastq.FastqFile.read(
- temp, offset=33, chars_per_line=chars_per_line
- )
+ fasta_file = fastq.FastqFile.read(temp, offset=33, chars_per_line=chars_per_line)
content = dict(fasta_file.items())
temp.close()
-
+
for identifier in ref_content:
ref_sequence, ref_scores = ref_content[identifier]
test_sequence, test_scores = content[identifier]
assert test_sequence == ref_sequence
assert np.array_equal(test_scores, ref_scores)
+
def test_rna_conversion():
sequence = seq.NucleotideSequence("ACGT")
scores = np.array([0, 0, 0, 0])
fastq_file = fastq.FastqFile(offset="Sanger")
fastq.set_sequence(fastq_file, sequence, scores, "seq1", as_rna=False)
fastq.set_sequence(fastq_file, sequence, scores, "seq2", as_rna=True)
- assert fastq_file["seq1"][0] == "ACGT"
+ assert fastq_file["seq1"][0] == "ACGT"
assert fastq_file["seq2"][0] == "ACGU"
+
@pytest.mark.parametrize(
- "file_name",
- glob.glob(os.path.join(data_dir("sequence"), "*.fastq"))
+ "file_name", glob.glob(os.path.join(data_dir("sequence"), "*.fastq"))
)
def test_read_iter(file_name):
ref_dict = dict(fastq.FastqFile.read(file_name, offset="Sanger").items())
-
+
test_dict = dict(fastq.FastqFile.read_iter(file_name, offset="Sanger"))
- for (test_id, (test_seq, test_sc)), (ref_id, (ref_seq, ref_sc)) \
- in zip(test_dict.items(), ref_dict.items()):
- assert test_id == ref_id
- assert test_seq == ref_seq
- assert (test_sc == ref_sc).all()
+ for (test_id, (test_seq, test_sc)), (ref_id, (ref_seq, ref_sc)) in zip(
+ test_dict.items(), ref_dict.items()
+ ):
+ assert test_id == ref_id
+ assert test_seq == ref_seq
+ assert (test_sc == ref_sc).all()
+
@pytest.mark.parametrize(
- "offset, chars_per_line, n_sequences", itertools.product(
- [33, 42, "Solexa"],
- [None, 80],
- [1, 10]
- )
+ "offset, chars_per_line, n_sequences",
+ itertools.product([33, 42, "Solexa"], [None, 80], [1, 10]),
)
def test_write_iter(offset, chars_per_line, n_sequences):
"""
@@ -110,22 +105,21 @@ def test_write_iter(offset, chars_per_line, n_sequences):
for i in range(n_sequences):
seq_length = np.random.randint(*LENGTH_RANGE)
code = np.random.randint(
- len(seq.NucleotideSequence.alphabet_unamb),
- size=seq_length
+ len(seq.NucleotideSequence.alphabet_unamb), size=seq_length
)
sequence = seq.NucleotideSequence()
sequence.code = code
sequences.append(sequence)
score = np.random.randint(*SCORE_RANGE, size=seq_length)
scores.append(score)
-
+
fastq_file = fastq.FastqFile(offset, chars_per_line)
for i, (sequence, score) in enumerate(zip(sequences, scores)):
identifier = f"seq_{i}"
fastq_file[identifier] = (str(sequence), score)
ref_file = io.StringIO()
fastq_file.write(ref_file)
-
+
test_file = io.StringIO()
fastq.FastqFile.write_iter(
test_file,
@@ -133,7 +127,8 @@ def test_write_iter(offset, chars_per_line, n_sequences):
(f"seq_{i}", (str(sequence), score))
for i, (sequence, score) in enumerate(zip(sequences, scores))
),
- offset, chars_per_line
+ offset,
+ chars_per_line,
)
- assert test_file.getvalue() == ref_file.getvalue()
\ No newline at end of file
+ assert test_file.getvalue() == ref_file.getvalue()
diff --git a/tests/sequence/test_genbank.py b/tests/sequence/test_genbank.py
index 6ecefd061..d96cbddc6 100644
--- a/tests/sequence/test_genbank.py
+++ b/tests/sequence/test_genbank.py
@@ -2,20 +2,19 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-from tempfile import TemporaryFile
import glob
from os.path import join
+from tempfile import TemporaryFile
+import pytest
import biotite.sequence as seq
import biotite.sequence.io.genbank as gb
-import numpy as np
-import pytest
-from ..util import data_dir
+from tests.util import data_dir
@pytest.mark.parametrize(
"path",
- glob.glob(join(data_dir("sequence"), "*.gb")) + \
- glob.glob(join(data_dir("sequence"), "[!multifile]*.gp"))
+ glob.glob(join(data_dir("sequence"), "*.gb"))
+ + glob.glob(join(data_dir("sequence"), "[!multifile]*.gp")),
)
def test_contiguous_field_pos(path):
"""
@@ -25,7 +24,7 @@ def test_contiguous_field_pos(path):
assert gb_file._field_pos[0][0] == 0
for i in range(1, len(gb_file._field_pos)):
start, _, _ = gb_file._field_pos[i]
- _, stop, _ = gb_file._field_pos[i-1]
+ _, stop, _ = gb_file._field_pos[i - 1]
assert start == stop
@@ -37,27 +36,23 @@ def test_file_access():
gb_file = gb.GenBankFile()
gb_file.append("SOMEFIELD", ["Some content", "some other content"])
gb_file.insert(0, "OTHERFIELD", ["Additional content"])
- assert gb_file[1] \
- == ("SOMEFIELD", ["Some content", "some other content"], {})
- gb_file[1] \
- = "NEWFIELD", ["Extra content"], {"SUBFIELD" : ["L 1", "L 2"]}
+ assert gb_file[1] == ("SOMEFIELD", ["Some content", "some other content"], {})
+ gb_file[1] = "NEWFIELD", ["Extra content"], {"SUBFIELD": ["L 1", "L 2"]}
gb_file.append("THIRDFIELD", ["Supplementary content"])
assert len(gb_file) == 3
assert gb_file[0] == ("OTHERFIELD", ["Additional content"], {})
del gb_file[0]
- assert gb_file[0] \
- == ("NEWFIELD", ["Extra content"], {"SUBFIELD" : ["L 1", "L 2"]})
+ assert gb_file[0] == ("NEWFIELD", ["Extra content"], {"SUBFIELD": ["L 1", "L 2"]})
del gb_file[0]
assert gb_file[0] == ("THIRDFIELD", ["Supplementary content"], {})
del gb_file[0]
assert len(gb_file) == 0
-
@pytest.mark.parametrize(
"path",
- glob.glob(join(data_dir("sequence"), "*.gb")) + \
- glob.glob(join(data_dir("sequence"), "[!multifile]*.gp"))
+ glob.glob(join(data_dir("sequence"), "*.gb"))
+ + glob.glob(join(data_dir("sequence"), "[!multifile]*.gp")),
)
def test_conversion_lowlevel(path):
"""
@@ -72,7 +67,7 @@ def test_conversion_lowlevel(path):
gb_file.append(name, content, subfields)
temp = TemporaryFile("w+")
gb_file.write(temp)
-
+
temp.seek(0)
gb_file = gb.GenBankFile.read(temp)
temp.close()
@@ -82,8 +77,8 @@ def test_conversion_lowlevel(path):
@pytest.mark.parametrize(
"path",
- glob.glob(join(data_dir("sequence"), "*.gb")) + \
- glob.glob(join(data_dir("sequence"), "[!multifile]*.gp"))
+ glob.glob(join(data_dir("sequence"), "*.gb"))
+ + glob.glob(join(data_dir("sequence"), "[!multifile]*.gp")),
)
def test_conversion_highlevel(path):
"""
@@ -101,44 +96,55 @@ def test_conversion_highlevel(path):
gb.set_annotated_sequence(gb_file, ref_annot_seq)
temp = TemporaryFile("w+")
gb_file.write(temp)
-
+
temp.seek(0)
gb_file = gb.GenBankFile.read(temp)
temp.close()
test_locus = gb.get_locus(gb_file)
test_annot_seq = gb.get_annotated_sequence(gb_file, format=suffix)
assert test_locus == ref_locus
- assert test_annot_seq.sequence == ref_annot_seq.sequence
- assert test_annot_seq.annotation == ref_annot_seq.annotation
+ assert test_annot_seq.sequence == ref_annot_seq.sequence
+ assert test_annot_seq.annotation == ref_annot_seq.annotation
assert test_annot_seq.sequence_start == ref_annot_seq.sequence_start
def test_genbank_utility_gb():
"""
Check whether the high-level utility functions return the expected
- content of a known GenBank file.
+ content of a known GenBank file.
"""
gb_file = gb.GenBankFile.read(join(data_dir("sequence"), "ec_bl21.gb"))
- assert gb.get_locus(gb_file) \
- == ("CP001509", 4558953, "DNA", True, "BCT", "16-FEB-2017")
- assert gb.get_definition(gb_file) \
- == ("Escherichia coli BL21(DE3), complete genome.")
+ assert gb.get_locus(gb_file) == (
+ "CP001509",
+ 4558953,
+ "DNA",
+ True,
+ "BCT",
+ "16-FEB-2017",
+ )
+ assert gb.get_definition(gb_file) == (
+ "Escherichia coli BL21(DE3), complete genome."
+ )
assert gb.get_version(gb_file) == "CP001509.3"
assert gb.get_gi(gb_file) == 296142109
- assert gb.get_db_link(gb_file) \
- == {"BioProject" : "PRJNA20713", "BioSample" : "SAMN02603478"}
+ assert gb.get_db_link(gb_file) == {
+ "BioProject": "PRJNA20713",
+ "BioSample": "SAMN02603478",
+ }
annotation = gb.get_annotation(gb_file, include_only=["CDS"])
feature = seq.Feature(
"CDS",
[seq.Location(5681, 6457, seq.Location.Strand.REVERSE)],
- {"gene": "yaaA", "transl_table": "11"}
+ {"gene": "yaaA", "transl_table": "11"},
)
in_annotation = False
for f in annotation:
- if f.key == feature.key and f.locs == feature.locs and \
- all([(key, val in f.qual.items())
- for key, val in feature.qual.items()]):
- in_annotation = True
+ if (
+ f.key == feature.key
+ and f.locs == feature.locs
+ and all([(key, val in f.qual.items()) for key, val in feature.qual.items()])
+ ):
+ in_annotation = True
assert in_annotation
assert len(gb.get_sequence(gb_file, format="gb")) == 4558953
@@ -146,30 +152,34 @@ def test_genbank_utility_gb():
def test_genbank_utility_gp():
"""
Check whether the high-level utility functions return the expected
- content of a known GenPept file.
+ content of a known GenPept file.
"""
gp_file = gb.GenBankFile.read(join(data_dir("sequence"), "bt_lysozyme.gp"))
- #[print(e) for e in gp_file._field_pos]
- assert gb.get_locus(gp_file) \
- == ("AAC37312", 147, None, False, "MAM", "27-APR-1993")
+ # [print(e) for e in gp_file._field_pos]
+ assert gb.get_locus(gp_file) == ("AAC37312", 147, None, False, "MAM", "27-APR-1993")
assert gb.get_definition(gp_file) == "lysozyme [Bos taurus]."
assert gb.get_version(gp_file) == "AAC37312.1"
assert gb.get_gi(gp_file) == 163334
annotation = gb.get_annotation(gp_file)
feature = seq.Feature(
"Site",
- [seq.Location(start, stop) for start, stop in zip(
- [52,55,62,76,78,81,117,120,125],
- [53,55,62,76,78,81,117,120,126]
- )],
- {"note": "lysozyme catalytic cleft [active]", "site_type": "active"}
+ [
+ seq.Location(start, stop)
+ for start, stop in zip(
+ [52, 55, 62, 76, 78, 81, 117, 120, 125],
+ [53, 55, 62, 76, 78, 81, 117, 120, 126],
+ )
+ ],
+ {"note": "lysozyme catalytic cleft [active]", "site_type": "active"},
)
in_annotation = False
for f in annotation:
- if f.key == feature.key and f.locs == feature.locs and \
- all([(key, val in f.qual.items())
- for key, val in feature.qual.items()]):
- in_annotation = True
+ if (
+ f.key == feature.key
+ and f.locs == feature.locs
+ and all([(key, val in f.qual.items()) for key, val in feature.qual.items()])
+ ):
+ in_annotation = True
assert in_annotation
assert len(gb.get_sequence(gp_file, format="gp")) == 147
@@ -184,21 +194,27 @@ def test_multi_file():
"locus_content, expected_result",
[
(
- "AJ311647LOOOOOOOOOOOOOOOOOOOOOOOOOONGID 1224 bp DNA linear VRT 14-NOV-2006",
- ("AJ311647LOOOOOOOOOOOOOOOOOOOOOOOOOONGID", 1224, "DNA", False, "VRT", "14-NOV-2006")
+ "AJ311647LOOOOOOOOOOOOOOOOOOOOOOOOOONGID 1224 bp DNA linear VRT 14-NOV-2006",
+ (
+ "AJ311647LOOOOOOOOOOOOOOOOOOOOOOOOOONGID",
+ 1224,
+ "DNA",
+ False,
+ "VRT",
+ "14-NOV-2006",
+ ),
),
(
- "SCU49845 5028 bp DNA PLN 21-JUN-1999",
- ("SCU49845", 5028, "DNA", False, "PLN", "21-JUN-1999")
+ "SCU49845 5028 bp DNA PLN 21-JUN-1999",
+ ("SCU49845", 5028, "DNA", False, "PLN", "21-JUN-1999"),
),
(
- "123MissingMolTypeAndCircular 5028 bp PLN 21-JUN-1999",
- ("123MissingMolTypeAndCircular", 5028, None, False, "PLN", "21-JUN-1999")
- )
- ]
+ "123MissingMolTypeAndCircular 5028 bp PLN 21-JUN-1999",
+ ("123MissingMolTypeAndCircular", 5028, None, False, "PLN", "21-JUN-1999"),
+ ),
+ ],
)
def test_parse_locus(locus_content, expected_result):
gb_file = gb.GenBankFile()
gb_file.append("LOCUS", [locus_content])
assert gb.get_locus(gb_file) == expected_result
-
\ No newline at end of file
diff --git a/tests/sequence/test_generalio.py b/tests/sequence/test_generalio.py
index a5b21315b..36ba1a7e4 100644
--- a/tests/sequence/test_generalio.py
+++ b/tests/sequence/test_generalio.py
@@ -2,33 +2,24 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-from tempfile import NamedTemporaryFile
-import biotite
-import biotite.sequence as seq
-import biotite.sequence.io as seqio
-import numpy as np
import glob
from os.path import join
-from ..util import data_dir
+from tempfile import NamedTemporaryFile
import pytest
+import biotite.sequence.io as seqio
+from tests.util import data_dir
-@pytest.mark.parametrize(
- "path", glob.glob(join(data_dir("sequence"), "random.*"))
-)
+@pytest.mark.parametrize("path", glob.glob(join(data_dir("sequence"), "random.*")))
def test_loading_single(path):
- ref_sequence = seqio.load_sequence(
- join(data_dir("sequence"), "random.fasta")
- )
+ ref_sequence = seqio.load_sequence(join(data_dir("sequence"), "random.fasta"))
sequence = seqio.load_sequence(path)
assert ref_sequence == sequence
@pytest.mark.parametrize("suffix", ["fasta", "fastq"])
def test_saving_single(suffix):
- ref_sequence = seqio.load_sequence(
- join(data_dir("sequence"), "random.fasta")
- )
+ ref_sequence = seqio.load_sequence(join(data_dir("sequence"), "random.fasta"))
temp = NamedTemporaryFile("w+", suffix=f".{suffix}")
try:
seqio.save_sequence(temp.name, ref_sequence)
@@ -37,22 +28,16 @@ def test_saving_single(suffix):
pytest.skip("Permission is denied")
-@pytest.mark.parametrize(
- "path", glob.glob(join(data_dir("sequence"), "random.*"))
-)
+@pytest.mark.parametrize("path", glob.glob(join(data_dir("sequence"), "random.*")))
def test_loading_multiple(path):
- ref_sequences = seqio.load_sequences(
- join(data_dir("sequence"), "random.fasta")
- )
+ ref_sequences = seqio.load_sequences(join(data_dir("sequence"), "random.fasta"))
sequences = seqio.load_sequences(path)
assert ref_sequences == sequences
@pytest.mark.parametrize("suffix", ["fasta", "fastq"])
def test_saving_multiple(suffix):
- ref_sequences = seqio.load_sequences(
- join(data_dir("sequence"), "random.fasta")
- )
+ ref_sequences = seqio.load_sequences(join(data_dir("sequence"), "random.fasta"))
temp = NamedTemporaryFile("w+", suffix=f".{suffix}")
try:
seqio.save_sequences(temp.name, ref_sequences)
@@ -60,6 +45,7 @@ def test_saving_multiple(suffix):
# This error might occur on AppVeyor
pytest.skip("Permission is denied")
+
@pytest.mark.parametrize("file_name", ["gg_avidin.gb", "bt_lysozyme.gp"])
def test_genbank(file_name):
"""
@@ -73,4 +59,4 @@ def test_genbank(file_name):
seqio.save_sequence(temp.name, sequence)
except PermissionError:
# This error might occur on AppVeyor
- pytest.skip("Permission is denied")
\ No newline at end of file
+ pytest.skip("Permission is denied")
diff --git a/tests/sequence/test_gff.py b/tests/sequence/test_gff.py
index 5c6ee77b4..0713a8324 100644
--- a/tests/sequence/test_gff.py
+++ b/tests/sequence/test_gff.py
@@ -2,19 +2,17 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-from tempfile import TemporaryFile
from os.path import join
+from tempfile import TemporaryFile
+import pytest
import biotite.sequence as seq
-import biotite.sequence.io.gff as gff
import biotite.sequence.io.genbank as gb
-import numpy as np
-import pytest
-from ..util import data_dir
+import biotite.sequence.io.gff as gff
+from tests.util import data_dir
@pytest.mark.parametrize(
- "path",
- ["bt_lysozyme.gff3", "gg_avidin.gff3", "ec_bl21.gff3", "sc_chrom1.gff3"]
+ "path", ["bt_lysozyme.gff3", "gg_avidin.gff3", "ec_bl21.gff3", "sc_chrom1.gff3"]
)
def test_conversion_lowlevel(path):
"""
@@ -38,8 +36,7 @@ def test_conversion_lowlevel(path):
@pytest.mark.parametrize(
- "path",
- ["bt_lysozyme.gff3", "gg_avidin.gff3", "ec_bl21.gff3", "sc_chrom1.gff3"]
+ "path", ["bt_lysozyme.gff3", "gg_avidin.gff3", "ec_bl21.gff3", "sc_chrom1.gff3"]
)
def test_conversion_highlevel(path):
"""
@@ -69,7 +66,7 @@ def test_conversion_highlevel(path):
for _, _, type, _, _, _, _, phase, _ in gff_file:
if type == "CDS":
test_phases.append(phase)
-
+
assert ref_annot == test_annot
assert test_phases == ref_phases
@@ -87,7 +84,7 @@ def test_genbank_consistency(path):
gff_file = gff.GFFFile.read(join(data_dir("sequence"), path[:-3] + ".gff3"))
test_annot = gff.get_annotation(gff_file)
-
+
# Remove qualifiers, since they will be different
# in GFF3 and GenBank
ref_annot = seq.Annotation(
@@ -115,7 +112,7 @@ def test_file_access():
file.
"""
file = gff.GFFFile()
- entry_scaffold = ("ab", "cd", 1, 2, None, None, None, {"Id":"foo"})
+ entry_scaffold = ("ab", "cd", 1, 2, None, None, None, {"Id": "foo"})
entry = ("a",) + entry_scaffold
file.append(*entry)
assert file[0] == entry
@@ -124,8 +121,11 @@ def test_file_access():
file[1] = ("d",) + entry_scaffold
file.insert(3, *(("e",) + entry_scaffold))
del file[2]
- assert [seqid for seqid, _, _, _, _, _, _, _, _ in file] \
- == ["a", "d", "e", ]
+ assert [seqid for seqid, _, _, _, _, _, _, _, _ in file] == [
+ "a",
+ "d",
+ "e",
+ ]
def test_entry_indexing():
@@ -134,17 +134,14 @@ def test_entry_indexing():
test file with multiple directives, including '##FASTA'.
"""
with pytest.warns(UserWarning):
- file = gff.GFFFile.read(
- join(data_dir("sequence"), "indexing_test.gff3")
- )
+ file = gff.GFFFile.read(join(data_dir("sequence"), "indexing_test.gff3"))
assert file._directives == [
("directive 1", 1),
("directive 2", 2),
("directive 3", 7),
("FASTA", 8),
]
- assert file._entries == [3,4,6]
-
+ assert file._entries == [3, 4, 6]
def test_percent_encoding():
@@ -153,21 +150,19 @@ def test_percent_encoding():
artificial test file.
"""
file = gff.GFFFile.read(join(data_dir("sequence"), "percent_test.gff3"))
- seqid, source, type, start, end, score, strand, phase, attrib \
- = file[0]
+ seqid, source, type, start, end, score, strand, phase, attrib = file[0]
assert seqid == "123,456"
assert source == "ääh"
assert type == "regi&n"
assert attrib == {
- "ID" : "AnID;AnotherID",
- "Name" : "Ångström",
- "c$l$r": "red\tgreen\tblue"
+ "ID": "AnID;AnotherID",
+ "Name": "Ångström",
+ "c$l$r": "red\tgreen\tblue",
}
file2 = gff.GFFFile()
- file.append(seqid, source, type, start, end, score, strand, phase, attrib)
- assert (seqid, source, type, start, end, score, strand, phase, attrib) \
- == file[0]
+ file2.append(seqid, source, type, start, end, score, strand, phase, attrib)
+ assert (seqid, source, type, start, end, score, strand, phase, attrib) == file2[0]
def test_error():
@@ -177,16 +172,17 @@ def test_error():
file = gff.GFFFile()
with pytest.raises(ValueError):
# 'seqid' beginning with '>' is not legal
- file.append(">xyz", "ab", "cd", 1, 2, None, None, None, {"Id":"foo"})
+ file.append(">xyz", "ab", "cd", 1, 2, None, None, None, {"Id": "foo"})
with pytest.raises(ValueError):
# String fields must not be empty
- file.append("", "ab", "cd", 1, 2, None, None, None, {"Id":"foo"})
+ file.append("", "ab", "cd", 1, 2, None, None, None, {"Id": "foo"})
with pytest.raises(ValueError):
# String fields must not be empty
- file.append("xyz", "", "cd", 1, 2, None, None, None, {"Id":"foo"})
+ file.append("xyz", "", "cd", 1, 2, None, None, None, {"Id": "foo"})
with pytest.raises(ValueError):
# String fields must not be empty
- file.append("xyz", "ab", "", 1, 2, None, None, None, {"Id":"foo"})
+ file.append("xyz", "ab", "", 1, 2, None, None, None, {"Id": "foo"})
+
def test_feature_without_id():
"""
@@ -194,12 +190,14 @@ def test_feature_without_id():
locations and consequently multiple entries in the GFF3 file.
"""
annot = seq.Annotation(
- [seq.Feature(
- key = "CDS",
- locs = [seq.Location(1,2), seq.Location(4,5)],
- qual = {"some" : "qualifiers"}
- )]
+ [
+ seq.Feature(
+ key="CDS",
+ locs=[seq.Location(1, 2), seq.Location(4, 5)],
+ qual={"some": "qualifiers"},
+ )
+ ]
)
file = gff.GFFFile()
with pytest.raises(ValueError):
- gff.set_annotation(file, annot)
\ No newline at end of file
+ gff.set_annotation(file, annot)
diff --git a/tests/sequence/test_graphics.py b/tests/sequence/test_graphics.py
index bfad27840..cddb45ad6 100644
--- a/tests/sequence/test_graphics.py
+++ b/tests/sequence/test_graphics.py
@@ -2,23 +2,19 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-from os.path import dirname, abspath, join
import glob
+from os.path import abspath, dirname, join
import pytest
import biotite.sequence as seq
-from ..util import cannot_import
+from tests.util import cannot_import
-@pytest.mark.skipif(
- cannot_import("matplotlib"), reason="Matplotlib is not installed"
-)
+@pytest.mark.skipif(cannot_import("matplotlib"), reason="Matplotlib is not installed")
@pytest.mark.parametrize(
- "scheme_path", glob.glob(
- join(
- dirname(abspath(seq.__file__)),
- "graphics", "color_schemes", "*.json"
- )
- )
+ "scheme_path",
+ glob.glob(
+ join(dirname(abspath(seq.__file__)), "graphics", "color_schemes", "*.json")
+ ),
)
def test_load_color_scheme(scheme_path):
from matplotlib.colors import to_rgb
@@ -27,9 +23,9 @@ def test_load_color_scheme(scheme_path):
supported_alphabets = [
seq.NucleotideSequence.alphabet_amb,
seq.ProteinSequence.alphabet,
- seq.LetterAlphabet("abcdefghijklmnop") # Protein block alphabet
+ seq.LetterAlphabet("abcdefghijklmnop"), # Protein block alphabet
]
-
+
test_scheme = graphics.load_color_scheme(scheme_path)
assert test_scheme["alphabet"] in supported_alphabets
@@ -37,4 +33,4 @@ def test_load_color_scheme(scheme_path):
for color in test_scheme["colors"]:
if color is not None:
# Should not raise error
- to_rgb(color)
\ No newline at end of file
+ to_rgb(color)
diff --git a/tests/sequence/test_phylo.py b/tests/sequence/test_phylo.py
index e2e52349c..0943d6002 100644
--- a/tests/sequence/test_phylo.py
+++ b/tests/sequence/test_phylo.py
@@ -7,7 +7,7 @@
import pytest
import biotite
import biotite.sequence.phylo as phylo
-from ..util import data_dir
+from tests.util import data_dir
@pytest.fixture
@@ -43,10 +43,12 @@ def test_upgma(tree, upgma_newick):
for i in range(len(tree)):
for j in range(len(tree)):
# Check for equal distances and equal topologies
- assert tree.get_distance(i,j) \
- == pytest.approx(ref_tree.get_distance(i,j), abs=1e-3)
- assert tree.get_distance(i,j, topological=True) \
- == ref_tree.get_distance(i,j, topological=True)
+ assert tree.get_distance(i, j) == pytest.approx(
+ ref_tree.get_distance(i, j), abs=1e-3
+ )
+ assert tree.get_distance(i, j, topological=True) == ref_tree.get_distance(
+ i, j, topological=True
+ )
def test_neighbor_joining():
@@ -60,34 +62,36 @@ def test_neighbor_joining():
[ 7, 10, 7, 0, 5, 9],
[ 6, 9, 6, 5, 0, 8],
[ 8, 11, 8, 9, 8, 0],
- ])
-
- ref_tree = phylo.Tree(phylo.TreeNode(
- [
- phylo.TreeNode(
- [
- phylo.TreeNode(
- [
- phylo.TreeNode(index=0),
- phylo.TreeNode(index=1),
- ],
- [1,4]
- ),
- phylo.TreeNode(index=2),
- ],
- [1, 2]
- ),
- phylo.TreeNode(
- [
- phylo.TreeNode(index=3),
- phylo.TreeNode(index=4),
- ],
- [3,2]
- ),
- phylo.TreeNode(index=5),
- ],
- [1,1,5]
- ))
+ ]) # fmt: skip
+
+ ref_tree = phylo.Tree(
+ phylo.TreeNode(
+ [
+ phylo.TreeNode(
+ [
+ phylo.TreeNode(
+ [
+ phylo.TreeNode(index=0),
+ phylo.TreeNode(index=1),
+ ],
+ [1, 4],
+ ),
+ phylo.TreeNode(index=2),
+ ],
+ [1, 2],
+ ),
+ phylo.TreeNode(
+ [
+ phylo.TreeNode(index=3),
+ phylo.TreeNode(index=4),
+ ],
+ [3, 2],
+ ),
+ phylo.TreeNode(index=5),
+ ],
+ [1, 1, 5],
+ )
+ )
test_tree = phylo.neighbor_joining(dist)
@@ -106,20 +110,20 @@ def test_node_distance(tree):
assert leaf.distance_to(tree.root) == dist
# Example topological distances
assert tree.get_distance(0, 19, True) == 9
- assert tree.get_distance(4, 2, True) == 10
-
+ assert tree.get_distance(4, 2, True) == 10
+
# All pairwise leaf node distances should be sufficient
# to reconstruct the same tree via UPGMA
ref_dist_mat = np.zeros((len(tree), len(tree)))
for i in range(len(tree)):
for j in range(len(tree)):
- ref_dist_mat[i,j] = tree.get_distance(i,j)
+ ref_dist_mat[i, j] = tree.get_distance(i, j)
assert np.allclose(ref_dist_mat, ref_dist_mat.T)
new_tree = phylo.upgma(ref_dist_mat)
test_dist_mat = np.zeros((len(tree), len(tree)))
for i in range(len(tree)):
for j in range(len(tree)):
- test_dist_mat[i,j] = new_tree.get_distance(i,j)
+ test_dist_mat[i, j] = new_tree.get_distance(i, j)
assert np.allclose(test_dist_mat, ref_dist_mat)
@@ -136,19 +140,18 @@ def test_distances(tree):
assert leaf.distance_to(tree.root) == dist
# Example topological distances
assert tree.get_distance(0, 19, True) == 9
- assert tree.get_distance(4, 2, True) == 10
+ assert tree.get_distance(4, 2, True) == 10
def test_get_leaves(tree):
# Manual example cases
- node = tree.leaves[6]
assert set(tree.leaves[6].parent.get_indices()) == set(
- [6,11,2,3,13,8,14,5,0,15,16]
+ [6, 11, 2, 3, 13, 8, 14, 5, 0, 15, 16]
)
assert set(tree.leaves[10].get_indices()) == set([10])
assert tree.root.get_leaf_count() == 20
-
+
def test_copy(tree):
assert tree is not tree.copy()
assert tree == tree.copy()
@@ -190,30 +193,33 @@ def test_immutability():
phylo.Tree(node1)
-@pytest.mark.parametrize("newick, labels, error", [
- # Reference index out of range
- ("((1,0),4),2);", None, biotite.InvalidFileError),
- # Empty string
- ("", None, biotite.InvalidFileError),
- # Empty node
- ("();", None, biotite.InvalidFileError),
- # Missing brackets
- ("((0,1,(2,3));", None, biotite.InvalidFileError),
- # A node with three leaves
- ("((0,1),(2,3),(4,5));", None, None),
- # A node with one leaf
- ("((0,1),(2,3),(4));", None, None),
- # Named intermediate nodes
- ("((0,1,3)A,2)B;", None, None),
- # Named intermediate nodes and distances
- ("((0:1.0,1:3.0,3:5.0)A:2.0,2:5.0)B;", None, None),
- # Nodes with labels
- ("((((A,B),(C,D)),E),F);", ["A","B","C","D","E","F"], None),
- # Nodes with labels and distances
- ("((((A:1,B:2),(C:3,D:4)),E:5),F:6);", ["A","B","C","D","E","F"], None),
- # Newick with spaces
- (" ( 0 : 1.0 , 1 : 3.0 ) A ; ", None, None),
-])
+@pytest.mark.parametrize(
+ "newick, labels, error",
+ [
+ # Reference index out of range
+ ("((1,0),4),2);", None, biotite.InvalidFileError),
+ # Empty string
+ ("", None, biotite.InvalidFileError),
+ # Empty node
+ ("();", None, biotite.InvalidFileError),
+ # Missing brackets
+ ("((0,1,(2,3));", None, biotite.InvalidFileError),
+ # A node with three leaves
+ ("((0,1),(2,3),(4,5));", None, None),
+ # A node with one leaf
+ ("((0,1),(2,3),(4));", None, None),
+ # Named intermediate nodes
+ ("((0,1,3)A,2)B;", None, None),
+ # Named intermediate nodes and distances
+ ("((0:1.0,1:3.0,3:5.0)A:2.0,2:5.0)B;", None, None),
+ # Nodes with labels
+ ("((((A,B),(C,D)),E),F);", ["A", "B", "C", "D", "E", "F"], None),
+ # Nodes with labels and distances
+ ("((((A:1,B:2),(C:3,D:4)),E:5),F:6);", ["A", "B", "C", "D", "E", "F"], None),
+ # Newick with spaces
+ (" ( 0 : 1.0 , 1 : 3.0 ) A ; ", None, None),
+ ],
+)
def test_newick_simple(newick, labels, error):
# Read, write and read again a Newick notation and expect
# the same reult from both reads
@@ -223,8 +229,8 @@ def test_newick_simple(newick, labels, error):
tree2 = phylo.Tree.from_newick(newick, labels)
assert tree1 == tree2
else:
- with pytest.raises(error):
- tree1 = phylo.Tree.from_newick(newick, labels)
+ with pytest.raises(error):
+ tree1 = phylo.Tree.from_newick(newick, labels)
@pytest.mark.parametrize("use_labels", [False, True])
@@ -243,14 +249,16 @@ def test_newick_complex(upgma_newick, use_labels):
def test_newick_rounding():
# Create the distance matrix
distances = np.array(
- [[0. , 0.53, 0.93, 0.78, 0.38, 0.99, 1.02, 0.76],
- [0.53, 0. , 0.59, 0.41, 0.35, 0.87, 1.03, 0.83],
- [0.93, 0.59, 0. , 0.16, 0.58, 0.55, 1.59, 1.19],
- [0.78, 0.41, 0.16, 0. , 0.42, 0.69, 1.4 , 1.18],
- [0.38, 0.35, 0.58, 0.42, 0. , 1.02, 1.11, 0.89],
- [0.99, 0.87, 0.55, 0.69, 1.02, 0. , 1.47, 1.26],
- [1.02, 1.03, 1.59, 1.4 , 1.11, 1.47, 0. , 1.39],
- [0.76, 0.83, 1.19, 1.18, 0.89, 1.26, 1.39, 0. ]]
+ [
+ [0.0, 0.53, 0.93, 0.78, 0.38, 0.99, 1.02, 0.76],
+ [0.53, 0.0, 0.59, 0.41, 0.35, 0.87, 1.03, 0.83],
+ [0.93, 0.59, 0.0, 0.16, 0.58, 0.55, 1.59, 1.19],
+ [0.78, 0.41, 0.16, 0.0, 0.42, 0.69, 1.4, 1.18],
+ [0.38, 0.35, 0.58, 0.42, 0.0, 1.02, 1.11, 0.89],
+ [0.99, 0.87, 0.55, 0.69, 1.02, 0.0, 1.47, 1.26],
+ [1.02, 1.03, 1.59, 1.4, 1.11, 1.47, 0.0, 1.39],
+ [0.76, 0.83, 1.19, 1.18, 0.89, 1.26, 1.39, 0.0],
+ ]
)
# Create the tree
tree = phylo.neighbor_joining(distances)
@@ -270,12 +278,15 @@ def test_newick_rounding():
)
-@pytest.mark.parametrize("newick_in, exp_newick_out", [
- ("(0:1.0, 1:2.0);", "(0:1.0,1:2.0):0.0;" ),
- ("(0:1.0, 1:2.0, 2:3.0);", "((0:1.0,1:2.0):0.0,2:3.0):0.0;" ),
- ("(((0:1.0, 1:2.0):10.0):5.0, 2:8.0);", "((0:1.0,1:2.0):15.0,2:8.0):0.0;"),
- ("((0:1.0, 1:2.0):10.0):5.0;", "(0:1.0,1:2.0):0.0;" ),
-])
+@pytest.mark.parametrize(
+ "newick_in, exp_newick_out",
+ [
+ ("(0:1.0, 1:2.0);", "(0:1.0,1:2.0):0.0;"),
+ ("(0:1.0, 1:2.0, 2:3.0);", "((0:1.0,1:2.0):0.0,2:3.0):0.0;"),
+ ("(((0:1.0, 1:2.0):10.0):5.0, 2:8.0);", "((0:1.0,1:2.0):15.0,2:8.0):0.0;"),
+ ("((0:1.0, 1:2.0):10.0):5.0;", "(0:1.0,1:2.0):0.0;"),
+ ],
+)
def test_as_binary_cases(newick_in, exp_newick_out):
"""
Test the `as_binary()` function based on known cases.
@@ -296,13 +307,13 @@ def test_as_binary_distances():
ref_dist_mat = np.zeros((len(tree), len(tree)))
for i in range(len(tree)):
for j in range(len(tree)):
- ref_dist_mat[i,j] = tree.get_distance(i,j)
-
+ ref_dist_mat[i, j] = tree.get_distance(i, j)
+
bin_tree = phylo.as_binary(tree)
test_dist_mat = np.zeros((len(tree), len(tree)))
for i in range(len(tree)):
for j in range(len(tree)):
- test_dist_mat[i,j] = bin_tree.get_distance(i,j)
+ test_dist_mat[i, j] = bin_tree.get_distance(i, j)
assert np.allclose(test_dist_mat, ref_dist_mat)
@@ -313,26 +324,27 @@ def test_equality(tree):
"""
assert tree == tree.copy()
# Order of children is not important
- assert tree == phylo.Tree(phylo.TreeNode(
- [tree.root.children[1].copy(), tree.root.children[0].copy()],
- [tree.root.children[1].distance, tree.root.children[0].distance]
- ))
+ assert tree == phylo.Tree(
+ phylo.TreeNode(
+ [tree.root.children[1].copy(), tree.root.children[0].copy()],
+ [tree.root.children[1].distance, tree.root.children[0].distance],
+ )
+ )
# Different distance -> Unequal tree
- assert tree != phylo.Tree(phylo.TreeNode(
- [tree.root.children[0].copy(), tree.root.children[1].copy()],
- [tree.root.children[0].distance, 42]
- ))
+ assert tree != phylo.Tree(
+ phylo.TreeNode(
+ [tree.root.children[0].copy(), tree.root.children[1].copy()],
+ [tree.root.children[0].distance, 42],
+ )
+ )
# Additional node -> Unequal tree
- assert tree != phylo.Tree(phylo.TreeNode(
- [
- tree.root.children[0].copy(),
- tree.root.children[1].copy(),
- phylo.TreeNode(index=len(tree))
- ],
- [
- tree.root.children[0].distance,
- tree.root.children[1].distance,
- 42
- ]
- ))
-
+ assert tree != phylo.Tree(
+ phylo.TreeNode(
+ [
+ tree.root.children[0].copy(),
+ tree.root.children[1].copy(),
+ phylo.TreeNode(index=len(tree)),
+ ],
+ [tree.root.children[0].distance, tree.root.children[1].distance, 42],
+ )
+ )
diff --git a/tests/sequence/test_profile.py b/tests/sequence/test_profile.py
index 658779bd2..3f7669bbd 100644
--- a/tests/sequence/test_profile.py
+++ b/tests/sequence/test_profile.py
@@ -11,24 +11,43 @@
def test_from_alignment():
seq1 = seq.NucleotideSequence("CGTCAT")
seq2 = seq.NucleotideSequence("TCATGC")
- ali_str = ["CGTCAT--",
- "--TCATGC"]
+ ali_str = ["CGTCAT--", "--TCATGC"]
trace = align.Alignment.trace_from_strings(ali_str)
alignment = align.Alignment([seq1, seq2], trace, None)
profile = seq.SequenceProfile.from_alignment(alignment)
- symbols = np.array([[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 2], [0, 2, 0, 0],
- [2, 0, 0, 0], [0, 0, 0, 2], [0, 0, 1, 0], [0, 1, 0, 0]])
+ symbols = np.array(
+ [
+ [0, 1, 0, 0],
+ [0, 0, 1, 0],
+ [0, 0, 0, 2],
+ [0, 2, 0, 0],
+ [2, 0, 0, 0],
+ [0, 0, 0, 2],
+ [0, 0, 1, 0],
+ [0, 1, 0, 0],
+ ]
+ )
gaps = np.array([1, 1, 0, 0, 0, 0, 1, 1])
alphabet = seq.Alphabet(["A", "C", "G", "T"])
assert np.array_equal(symbols, profile.symbols)
assert np.array_equal(gaps, profile.gaps)
- assert (alphabet == profile.alphabet)
+ assert alphabet == profile.alphabet
def test_to_consensus_nuc():
- symbols = np.array([[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 2], [0, 2, 0, 0],
- [2, 0, 0, 0], [0, 0, 0, 2], [0, 0, 1, 0], [0, 1, 0, 0]])
+ symbols = np.array(
+ [
+ [0, 1, 0, 0],
+ [0, 0, 1, 0],
+ [0, 0, 0, 2],
+ [0, 2, 0, 0],
+ [2, 0, 0, 0],
+ [0, 0, 0, 2],
+ [0, 0, 1, 0],
+ [0, 1, 0, 0],
+ ]
+ )
gaps = np.array([1, 1, 0, 0, 0, 0, 1, 1])
alphabet = seq.Alphabet(["A", "C", "G", "T"])
profile = seq.SequenceProfile(symbols, gaps, alphabet)
@@ -37,8 +56,18 @@ def test_to_consensus_nuc():
def test_to_consensus_nuc_ambiguous():
- symbols = np.array([[1, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 2], [0, 2, 0, 0],
- [2, 0, 0, 0], [0, 0, 0, 2], [0, 0, 1, 0], [0, 1, 0, 0]])
+ symbols = np.array(
+ [
+ [1, 1, 0, 0],
+ [0, 0, 1, 0],
+ [0, 0, 0, 2],
+ [0, 2, 0, 0],
+ [2, 0, 0, 0],
+ [0, 0, 0, 2],
+ [0, 0, 1, 0],
+ [0, 1, 0, 0],
+ ]
+ )
gaps = np.array([1, 1, 0, 0, 0, 0, 1, 1])
alphabet = seq.Alphabet(["A", "C", "G", "T"])
profile = seq.SequenceProfile(symbols, gaps, alphabet)
@@ -48,45 +77,65 @@ def test_to_consensus_nuc_ambiguous():
def test_to_consensus_prot():
# Avidin protein sequence
- seq1 = seq.ProteinSequence("MVHATSPLLLLLLLSLALVAPGLSARKCSLTGKWTNDLGSNMTIGAVNSRGEFTGTYITAVTATSNEIKESPLHGTQNTINKRTQP"
- "TFGFTVNWKFSESTTVFTGQCFIDRNGKEVLKTMWLLRSSVNDIGDDWKATRVGINIFTRLRTQKE")
+ seq1 = seq.ProteinSequence(
+ "MVHATSPLLLLLLLSLALVAPGLSARKCSLTGKWTNDLGSNMTIGAVNSRGEFTGTYITAVTATSNEIKESPLHGTQNTINKRTQP"
+ "TFGFTVNWKFSESTTVFTGQCFIDRNGKEVLKTMWLLRSSVNDIGDDWKATRVGINIFTRLRTQKE"
+ )
# Streptavidin protein sequence
- seq2 = seq.ProteinSequence("MRKIVVAAIAVSLTTVSITASASADPSKDSKAQVSAAEAGITGTWYNQLGSTFIVTAGADGALTGTYESAVGNAESRYVLTGRYDSA"
- "PATDGSGTALGWTVAWKNNYRNAHSATTWSGQYVGGAEARINTQWLLTSGTTEANAWKSTLVGHDTFTKVKPSAASIDAAKKAGVNN"
- "GNPLDAVQQ")
+ seq2 = seq.ProteinSequence(
+ "MRKIVVAAIAVSLTTVSITASASADPSKDSKAQVSAAEAGITGTWYNQLGSTFIVTAGADGALTGTYESAVGNAESRYVLTGRYDSA"
+ "PATDGSGTALGWTVAWKNNYRNAHSATTWSGQYVGGAEARINTQWLLTSGTTEANAWKSTLVGHDTFTKVKPSAASIDAAKKAGVNN"
+ "GNPLDAVQQ"
+ )
matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment = align.align_optimal(seq1, seq2, matrix)[0]
profile = seq.SequenceProfile.from_alignment(alignment)
- assert seq.ProteinSequence("MRHIATAAIALSLLLLSITALASADPGKDSKAQLSAAEAGITGKWTNDLGSNFIIGAVGADGAFTGTYESAVGNAESNEIKEGPLD"
- "GAPATDGKGTALGWTFAFKNNWKFAESATTFSGQCFGGADARINGKELLTKGTMEANAWKSTLLGHDSFSKVKDIAADIDAAKKAG"
- "INIFNPLDAQKE") == profile.to_consensus()
+ assert (
+ seq.ProteinSequence(
+ "MRHIATAAIALSLLLLSITALASADPGKDSKAQLSAAEAGITGKWTNDLGSNFIIGAVGADGAFTGTYESAVGNAESNEIKEGPLD"
+ "GAPATDGKGTALGWTFAFKNNWKFAESATTFSGQCFGGADARINGKELLTKGTMEANAWKSTLLGHDSFSKVKDIAADIDAAKKAG"
+ "INIFNPLDAQKE"
+ )
+ == profile.to_consensus()
+ )
def test_new_position_matrices():
- seqs = [seq.NucleotideSequence("AAGAAT"),
- seq.NucleotideSequence("ATCATA"),
- seq.NucleotideSequence("AAGTAA"),
- seq.NucleotideSequence("AACAAA"),
- seq.NucleotideSequence("ATTAAA"),
- seq.NucleotideSequence("AAGAAT")]
+ seqs = [
+ seq.NucleotideSequence("AAGAAT"),
+ seq.NucleotideSequence("ATCATA"),
+ seq.NucleotideSequence("AAGTAA"),
+ seq.NucleotideSequence("AACAAA"),
+ seq.NucleotideSequence("ATTAAA"),
+ seq.NucleotideSequence("AAGAAT"),
+ ]
alignment = align.Alignment(
sequences=seqs,
- trace=np.tile(np.arange(len(seqs[0])), len(seqs)) \
- .reshape(len(seqs), len(seqs[0])) \
- .transpose(),
- score=0
+ trace=np.tile(np.arange(len(seqs[0])), len(seqs))
+ .reshape(len(seqs), len(seqs[0]))
+ .transpose(),
+ score=0,
)
profile = seq.SequenceProfile.from_alignment(alignment)
- probability_matrix = np.array([[1., 0., 0., 0., ],
- [0.66666667, 0., 0., 0.33333333],
- [0., 0.33333333, 0.5, 0.16666667],
- [0.83333333, 0., 0., 0.16666667],
- [0.83333333, 0., 0., 0.16666667],
- [0.66666667, 0., 0., 0.33333333]])
+ probability_matrix = np.array(
+ [
+ [
+ 1.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ ],
+ [0.66666667, 0.0, 0.0, 0.33333333],
+ [0.0, 0.33333333, 0.5, 0.16666667],
+ [0.83333333, 0.0, 0.0, 0.16666667],
+ [0.83333333, 0.0, 0.0, 0.16666667],
+ [0.66666667, 0.0, 0.0, 0.33333333],
+ ]
+ )
ppm = profile.probability_matrix()
@@ -98,25 +147,35 @@ def test_new_position_matrices():
ppm = profile.probability_matrix(pseudocount=1)
- probability_matrix = np.array([[0.89285714, 0.03571429, 0.03571429, 0.03571429],
- [0.60714286, 0.03571429, 0.03571429, 0.32142857],
- [0.03571429, 0.32142857, 0.46428571, 0.17857143],
- [0.75, 0.03571429, 0.03571429, 0.17857143],
- [0.75, 0.03571429, 0.03571429, 0.17857143],
- [0.60714286, 0.03571429, 0.03571429, 0.32142857]])
+ probability_matrix = np.array(
+ [
+ [0.89285714, 0.03571429, 0.03571429, 0.03571429],
+ [0.60714286, 0.03571429, 0.03571429, 0.32142857],
+ [0.03571429, 0.32142857, 0.46428571, 0.17857143],
+ [0.75, 0.03571429, 0.03571429, 0.17857143],
+ [0.75, 0.03571429, 0.03571429, 0.17857143],
+ [0.60714286, 0.03571429, 0.03571429, 0.32142857],
+ ]
+ )
assert np.allclose(probability_matrix, ppm, atol=1e-3)
- probability = profile.sequence_probability(seq.NucleotideSequence("AAAAAA"), pseudocount=1)
+ probability = profile.sequence_probability(
+ seq.NucleotideSequence("AAAAAA"), pseudocount=1
+ )
assert probability == pytest.approx(0.0066, abs=1e-3)
- log_odds_matrix = np.array([[1.83650127, -2.80735492, -2.80735492, -2.80735492],
- [1.28010792, -2.80735492, -2.80735492, 0.36257008],
- [-2.80735492, 0.36257008, 0.8930848, -0.48542683],
- [1.5849625, -2.80735492, -2.80735492, -0.48542683],
- [1.5849625, -2.80735492, -2.80735492, -0.48542683],
- [1.28010792, -2.80735492, -2.80735492, 0.36257008]])
+ log_odds_matrix = np.array(
+ [
+ [1.83650127, -2.80735492, -2.80735492, -2.80735492],
+ [1.28010792, -2.80735492, -2.80735492, 0.36257008],
+ [-2.80735492, 0.36257008, 0.8930848, -0.48542683],
+ [1.5849625, -2.80735492, -2.80735492, -0.48542683],
+ [1.5849625, -2.80735492, -2.80735492, -0.48542683],
+ [1.28010792, -2.80735492, -2.80735492, 0.36257008],
+ ]
+ )
pwm = profile.log_odds_matrix(pseudocount=1)
diff --git a/tests/sequence/test_search.py b/tests/sequence/test_search.py
index 7ef2b4618..c2150afac 100644
--- a/tests/sequence/test_search.py
+++ b/tests/sequence/test_search.py
@@ -3,8 +3,6 @@
# information.
import biotite.sequence as seq
-import numpy as np
-import pytest
def test_find_subsequence():
@@ -13,12 +11,13 @@ def test_find_subsequence():
main_seq = seq.NucleotideSequence(string)
sub_seq = seq.NucleotideSequence(substring)
matches = seq.find_subsequence(main_seq, sub_seq)
- assert list(matches) == [4,8]
-
+ assert list(matches) == [4, 8]
+
+
def test_find_symbol():
string = "ATACGCTTGCT"
symbol = "T"
dna = seq.NucleotideSequence(string)
- assert list(seq.find_symbol(dna, symbol)) == [1,6,7,10]
+ assert list(seq.find_symbol(dna, symbol)) == [1, 6, 7, 10]
assert seq.find_symbol_first(dna, symbol) == 1
- assert seq.find_symbol_last(dna, symbol) == 10
\ No newline at end of file
+ assert seq.find_symbol_last(dna, symbol) == 10
diff --git a/tests/sequence/test_seqtypes.py b/tests/sequence/test_seqtypes.py
index 157d8d9ff..086f972a9 100644
--- a/tests/sequence/test_seqtypes.py
+++ b/tests/sequence/test_seqtypes.py
@@ -2,9 +2,8 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-import biotite.sequence as seq
-import numpy as np
import pytest
+import biotite.sequence as seq
def test_nucleotide_construction():
@@ -23,26 +22,31 @@ def test_reverse_complement():
dna = seq.NucleotideSequence(string)
assert str(dna.reverse().complement()) == "RNTAACGCATT"
+
def test_stop_removal():
string = "LYG*GR*"
protein = seq.ProteinSequence(string)
assert str(protein.remove_stops()) == string.replace("*", "")
-@pytest.mark.parametrize("dna_str, protein_str",
- [("CACATAGCATGA", "HIA*"),
- ("ATGTAGCTA", "M*L")])
+@pytest.mark.parametrize(
+ "dna_str, protein_str", [("CACATAGCATGA", "HIA*"), ("ATGTAGCTA", "M*L")]
+)
def test_full_translation(dna_str, protein_str):
dna = seq.NucleotideSequence(dna_str)
protein = dna.translate(complete=True)
assert protein_str == str(protein)
-@pytest.mark.parametrize("dna_str, protein_str_list",
- [("CA", []),
- ("GAATGCACTGAGATGCAATAG", ["MH*","MQ*"]),
- ("ATGCACATGTAGGG", ["MHM*","M*"]),
- ("GATGCATGTGAAAA", ["MHVK","M*"])])
+@pytest.mark.parametrize(
+ "dna_str, protein_str_list",
+ [
+ ("CA", []),
+ ("GAATGCACTGAGATGCAATAG", ["MH*", "MQ*"]),
+ ("ATGCACATGTAGGG", ["MHM*", "M*"]),
+ ("GATGCATGTGAAAA", ["MHVK", "M*"]),
+ ],
+)
def test_frame_translation(dna_str, protein_str_list):
dna = seq.NucleotideSequence(dna_str)
proteins, pos = dna.translate(complete=False)
@@ -50,8 +54,8 @@ def test_frame_translation(dna_str, protein_str_list):
assert set([str(protein) for protein in proteins]) == set(protein_str_list)
# Test if the positions are also right
# -> Get sequence slice and translate completely
- assert set([str(dna[start : stop].translate(complete=True))
- for start, stop in pos]
+ assert set(
+ [str(dna[start:stop].translate(complete=True)) for start, stop in pos]
) == set(protein_str_list)
@@ -76,7 +80,7 @@ def test_letter_conversion():
@pytest.mark.parametrize(
"monoisotopic, expected_mol_weight_protein",
# Reference values taken from https://web.expasy.org/compute_pi/
- [(True, 2231.06), (False, 2232.56)]
+ [(True, 2231.06), (False, 2232.56)],
)
def test_get_molecular_weight(monoisotopic, expected_mol_weight_protein):
"""
@@ -84,8 +88,5 @@ def test_get_molecular_weight(monoisotopic, expected_mol_weight_protein):
correctly.
"""
protein = seq.ProteinSequence("ACDEFGHIKLMNPQRSTVW")
- mol_weight_protein = protein.get_molecular_weight(
- monoisotopic=monoisotopic)
- assert mol_weight_protein == \
- pytest.approx(expected_mol_weight_protein, abs=1e-2)
-
+ mol_weight_protein = protein.get_molecular_weight(monoisotopic=monoisotopic)
+ assert mol_weight_protein == pytest.approx(expected_mol_weight_protein, abs=1e-2)
diff --git a/tests/sequence/test_sequence.py b/tests/sequence/test_sequence.py
index 78a815b5f..bfffaedb5 100644
--- a/tests/sequence/test_sequence.py
+++ b/tests/sequence/test_sequence.py
@@ -2,8 +2,8 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-import pytest
import numpy as np
+import pytest
import biotite.sequence as seq
@@ -13,13 +13,15 @@ def test_encoding():
string2 = str(dna)
assert string1 == string2
+
def test_validity_check():
dna = seq.NucleotideSequence()
- dna.code = np.array([0,1,0,3,3])
+ dna.code = np.array([0, 1, 0, 3, 3])
assert dna.is_valid()
- dna.code = np.array([0,1,4,3,3])
+ dna.code = np.array([0, 1, 4, 3, 3])
assert not dna.is_valid()
-
+
+
def test_access():
string = "AATGCGTTA"
dna = seq.NucleotideSequence(string)
@@ -28,6 +30,7 @@ def test_access():
dna = dna[3:-2]
assert "GCGT" == str(dna)
+
def test_manipulation():
dna_seq = seq.NucleotideSequence("ACGTA")
dna_copy = dna_seq.copy()
@@ -37,12 +40,13 @@ def test_manipulation():
dna_copy[0:2] = dna_copy[3:5]
assert "TAGTA" == str(dna_copy)
dna_copy = dna_seq.copy()
- dna_copy[np.array([True,False,False,False,True])] = "T"
+ dna_copy[np.array([True, False, False, False, True])] = "T"
assert "TCGTT" == str(dna_copy)
dna_copy = dna_seq.copy()
- dna_copy[1:4] = np.array([0,1,2])
+ dna_copy[1:4] = np.array([0, 1, 2])
assert "AACGA" == str(dna_copy)
+
def test_concatenation():
str1 = "AAGTTA"
str2 = "CGA"
@@ -54,16 +58,19 @@ def test_concatenation():
concat_seq = seq.NucleotideSequence(str3) + seq.NucleotideSequence(str1)
assert str3 + str1 == str(concat_seq)
+
def test_frequency():
string = "ACGCGAGAAAGCGGG"
dna = seq.NucleotideSequence(string)
assert dna.get_symbol_frequency() == {"A": 5, "C": 3, "G": 7, "T": 0}
-
+
+
def test_alph_error():
string = "AATGCGTUTA"
with pytest.raises(seq.AlphabetError):
seq.NucleotideSequence(string)
+
def test_alphabet_extension():
alph1 = seq.Alphabet("abc")
alph2 = seq.Alphabet("abc")
@@ -73,4 +80,4 @@ def test_alphabet_extension():
assert alph2.extends(alph1)
assert not alph3.extends(alph1)
assert alph4.extends(alph1)
- assert not alph1.extends(alph4)
\ No newline at end of file
+ assert not alph1.extends(alph4)
diff --git a/tests/structure/data/base_pairs/create_bond_orientation_test_data.py b/tests/structure/data/base_pairs/create_bond_orientation_test_data.py
index c81f9e050..e0c2fa1f4 100644
--- a/tests/structure/data/base_pairs/create_bond_orientation_test_data.py
+++ b/tests/structure/data/base_pairs/create_bond_orientation_test_data.py
@@ -1,35 +1,35 @@
-import pandas as pd
import argparse
-import numpy as np
import json
+import numpy as np
+import pandas as pd
+
def process(input, output, chain):
data = pd.read_csv(input)
# Only retain rows with basepair annotation
- data = data[data['Leontis-Westhof'].notna()]
+ data = data[data["Leontis-Westhof"].notna()]
output_list = []
for _, row in data.iterrows():
-
- nucleotides = [row['Nucleotide 1'], row['Nucleotide 2']]
+ nucleotides = [row["Nucleotide 1"], row["Nucleotide 2"]]
# Extract the Leontis-Westhof annotation
- lw_string = row['Leontis-Westhof']
+ lw_string = row["Leontis-Westhof"]
# Some interactions are labelled with `n` for near. These are
# ignored
- if lw_string[0] == 'n':
+ if lw_string[0] == "n":
continue
# Get sugar orientation from string (`c` = cis, `t` = trans)
sugar_orientation = lw_string[0]
# The residue ids of the nucleotides
- res_ids = [None]*2
+ res_ids = [None] * 2
for i, nucleotide in enumerate(nucleotides):
- nucleotide_list = nucleotide.split('.')
+ nucleotide_list = nucleotide.split(".")
# if the nucleotide is not part of the specified chain, skip
# base pair
@@ -41,37 +41,28 @@ def process(input, output, chain):
if None in res_ids:
continue
- if sugar_orientation == 'c':
+ if sugar_orientation == "c":
sugar_orientation = 1
- elif sugar_orientation == 't':
+ elif sugar_orientation == "t":
sugar_orientation = 2
this_output = sorted((int(res_ids[0]), int(res_ids[1])))
this_output.append(int(sugar_orientation))
output_list.append(this_output)
output_list = np.unique(output_list, axis=0).tolist()
- with open(output, 'w') as f:
+ with open(output, "w") as f:
json.dump(output_list, f, indent=1)
+
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Parse the glycosidic bond orientation annotations in the "
"NAKB-database for a specific chain. The annotations can be "
"downloaded in the section 'Base Pairs'."
)
- parser.add_argument(
- "infile",
- help="The path to the input file."
- )
- parser.add_argument(
- "outfile",
- help="The path to the output JSON file."
- )
- parser.add_argument(
- "chain",
- help="The chain ID to be extracted."
- )
+ parser.add_argument("infile", help="The path to the input file.")
+ parser.add_argument("outfile", help="The path to the output JSON file.")
+ parser.add_argument("chain", help="The chain ID to be extracted.")
args = parser.parse_args()
process(args.infile, args.outfile, args.chain)
-
diff --git a/tests/structure/data/base_pairs/create_interacting_edge_test_data.py b/tests/structure/data/base_pairs/create_interacting_edge_test_data.py
index 1a46eb4d3..bdcd1f586 100644
--- a/tests/structure/data/base_pairs/create_interacting_edge_test_data.py
+++ b/tests/structure/data/base_pairs/create_interacting_edge_test_data.py
@@ -1,36 +1,37 @@
-import pandas as pd
import argparse
import json
import numpy as np
+import pandas as pd
+
def process(input, output, chain):
data = pd.read_csv(input)
# Only retain rows with basepair annotation
- data = data[data['Leontis-Westhof'].notna()]
+ data = data[data["Leontis-Westhof"].notna()]
output_list = []
for _, row in data.iterrows():
- nucleotides = [row['Nucleotide 1'], row['Nucleotide 2']]
+ nucleotides = [row["Nucleotide 1"], row["Nucleotide 2"]]
# Extract the Leontis-Westhof annotation
- lw_string = row['Leontis-Westhof']
+ lw_string = row["Leontis-Westhof"]
# Some interactions are labelled with `n` for near. These are
# ignored
- if lw_string[0] == 'n':
+ if lw_string[0] == "n":
continue
# Get edge annotations from string
edges = [lw_string[-2], lw_string[-1]]
-
+
# Dont allow unspecified edges in test data
- if '.' in edges:
+ if "." in edges:
continue
- res_ids = [None]*2
+ res_ids = [None] * 2
for i, nucleotide in enumerate(nucleotides):
- nucleotide_list = nucleotide.split('.')
+ nucleotide_list = nucleotide.split(".")
# if the nucleotide is not part of the specified chain, skip
# base pair
@@ -43,11 +44,11 @@ def process(input, output, chain):
continue
for i, edge in enumerate(edges):
- if edge == 'W':
+ if edge == "W":
edges[i] = 1
- if edge == 'H':
+ if edge == "H":
edges[i] = 2
- if edge == 'S':
+ if edge == "S":
edges[i] = 3
# Lower residue id on the left, higher residue id on the right
@@ -62,28 +63,19 @@ def process(input, output, chain):
)
output_list = np.unique(output_list, axis=0).tolist()
- with open(output, 'w') as f:
+ with open(output, "w") as f:
json.dump(output_list, f, indent=1)
+
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Parse the edge type annotations in the NAKB-database for "
"a specific chain. The annotations can be downloaded in the section "
"'Base Pairs'."
)
- parser.add_argument(
- "infile",
- help="The path to the input file."
- )
- parser.add_argument(
- "outfile",
- help="The path to the output JSON file."
- )
- parser.add_argument(
- "chain",
- help="The chain ID to be extracted."
- )
+ parser.add_argument("infile", help="The path to the input file.")
+ parser.add_argument("outfile", help="The path to the output JSON file.")
+ parser.add_argument("chain", help="The chain ID to be extracted.")
args = parser.parse_args()
process(args.infile, args.outfile, args.chain)
-
diff --git a/tests/structure/data/create_test_structures.py b/tests/structure/data/create_test_structures.py
index 4bf0ae175..da0f0ff48 100644
--- a/tests/structure/data/create_test_structures.py
+++ b/tests/structure/data/create_test_structures.py
@@ -1,12 +1,12 @@
import argparse
-import subprocess
-from os.path import join
import logging
+import subprocess
import sys
+from os.path import join
import biotite
-from biotite.database import RequestError
import biotite.database.rcsb as rcsb
import biotite.structure.io as strucio
+from biotite.database import RequestError
def create(pdb_id, directory, include_gro):
@@ -18,7 +18,7 @@ def create(pdb_id, directory, include_gro):
# PDB entry is not provided in this format
pass
try:
- array = strucio.load_structure(join(directory, pdb_id+".pdb"))
+ array = strucio.load_structure(join(directory, pdb_id + ".pdb"))
except biotite.InvalidFileError:
# Structure probably contains multiple models with different
# number of atoms
@@ -31,41 +31,55 @@ def create(pdb_id, directory, include_gro):
cleaned_file_name = biotite.temp_file("pdb")
strucio.save_structure(cleaned_file_name, array)
# Run GROMACS for file conversion
- subprocess.run([
- "editconf",
- "-f", cleaned_file_name,
- "-o", join(directory, pdb_id+".gro")
- ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+ subprocess.run(
+ [
+ "editconf",
+ "-f",
+ cleaned_file_name,
+ "-o",
+ join(directory, pdb_id + ".gro"),
+ ],
+ stdout=subprocess.DEVNULL,
+ stderr=subprocess.DEVNULL,
+ )
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Create structure files for unit tests "
- "in all supported formats from PDB ID "
- "(excluding GROMACS trajectory files)"
- )
- parser.add_argument(
- "--dir", "-d", dest="directory", default=".",
- help="the Biotite project directory to put the test files into"
+ "in all supported formats from PDB ID "
+ "(excluding GROMACS trajectory files)"
)
parser.add_argument(
- "--id", "-i", dest="id",
- help="the PDB ID"
+ "--dir",
+ "-d",
+ dest="directory",
+ default=".",
+ help="the Biotite project directory to put the test files into",
)
+ parser.add_argument("--id", "-i", dest="id", help="the PDB ID")
parser.add_argument(
- "--file", "-f", dest="file",
- help="read mutliple PDB IDs from text file (line break separated IDs)"
+ "--file",
+ "-f",
+ dest="file",
+        help="read multiple PDB IDs from text file (line break separated IDs)",
)
parser.add_argument(
- "--gromacs", "-g", action="store_true", dest="include_gro",
- help="Create '*.gro' files using the Gromacs software"
+ "--gromacs",
+ "-g",
+ action="store_true",
+ dest="include_gro",
+ help="Create '*.gro' files using the Gromacs software",
)
args = parser.parse_args()
if args.file is not None:
with open(args.file, "r") as file:
- pdb_ids = [pdb_id.strip().lower() for pdb_id
- in file.read().split("\n") if len(pdb_id.strip()) != 0]
+ pdb_ids = [
+ pdb_id.strip().lower()
+ for pdb_id in file.read().split("\n")
+ if len(pdb_id.strip()) != 0
+ ]
elif args.id is not None:
pdb_ids = [args.id.lower()]
else:
@@ -78,4 +92,4 @@ def create(pdb_id, directory, include_gro):
create(pdb_id, args.directory, args.include_gro)
except:
print()
- raise
\ No newline at end of file
+ raise
diff --git a/tests/structure/data/molecules/create_v3000_sdf.py b/tests/structure/data/molecules/create_v3000_sdf.py
index dc313722f..9630e71a1 100644
--- a/tests/structure/data/molecules/create_v3000_sdf.py
+++ b/tests/structure/data/molecules/create_v3000_sdf.py
@@ -11,4 +11,4 @@
writer.SetForceV3000(True)
for molecule in supplier:
writer.write(molecule)
- writer.close()
\ No newline at end of file
+ writer.close()
diff --git a/tests/structure/test_atoms.py b/tests/structure/test_atoms.py
index 93a94ac65..22b75c919 100644
--- a/tests/structure/test_atoms.py
+++ b/tests/structure/test_atoms.py
@@ -10,81 +10,88 @@
@pytest.fixture
def atom_list():
- chain_id = ["A","A","B","B","B"]
- res_id = [1,1,1,1,2]
+ chain_id = ["A", "A", "B", "B", "B"]
+ res_id = [1, 1, 1, 1, 2]
ins_code = [""] * 5
- res_name = ["ALA","ALA","PRO","PRO","MSE"]
+ res_name = ["ALA", "ALA", "PRO", "PRO", "MSE"]
hetero = [False, False, False, False, True]
atom_name = ["N", "CA", "O", "CA", "SE"]
- element = ["N","C","O","C","SE"]
+ element = ["N", "C", "O", "C", "SE"]
atom_list = []
for i in range(5):
- atom_list.append(struc.Atom([i,i,i],
- chain_id = chain_id[i],
- res_id = res_id[i],
- ins_code = ins_code[i],
- res_name = res_name[i],
- hetero = hetero[i],
- atom_name = atom_name[i],
- element = element[i]))
+ atom_list.append(
+ struc.Atom(
+ [i, i, i],
+ chain_id=chain_id[i],
+ res_id=res_id[i],
+ ins_code=ins_code[i],
+ res_name=res_name[i],
+ hetero=hetero[i],
+ atom_name=atom_name[i],
+ element=element[i],
+ )
+ )
return atom_list
+
@pytest.fixture
def atom(atom_list):
return atom_list[2]
+
@pytest.fixture
def array(atom_list):
return struc.array(atom_list)
+
@pytest.fixture
def stack(array):
return struc.stack([array, array.copy(), array.copy()])
+
@pytest.fixture
def array_box():
- return np.array([
- [1,0,0],
- [0,2,0],
- [0,0,3]
- ])
+ return np.array([[1, 0, 0], [0, 2, 0], [0, 0, 3]])
+
@pytest.fixture
def stack_box(stack, array_box):
return np.array([array_box] * stack.stack_depth())
+
def test_shape(array, stack):
assert array.shape == (5,)
assert stack.shape == (3, 5)
+
def test_access(array):
- chain_id = ["A","A","B","B","B"]
- assert array.coord.shape == (5,3)
+ chain_id = ["A", "A", "B", "B", "B"]
+ assert array.coord.shape == (5, 3)
assert array.chain_id.tolist() == chain_id
assert array.get_annotation("chain_id").tolist() == chain_id
array.add_annotation("test1", dtype=int)
- assert array.test1.tolist() == [0,0,0,0,0]
+ assert array.test1.tolist() == [0, 0, 0, 0, 0]
with pytest.raises(IndexError):
- array.set_annotation("test2", np.array([0,1,2,3]))
+ array.set_annotation("test2", np.array([0, 1, 2, 3]))
def test_modification(atom, array, stack):
new_atom = atom
new_atom.chain_id = "C"
del array[2]
- assert array.chain_id.tolist() == ["A","A","B","B"]
+ assert array.chain_id.tolist() == ["A", "A", "B", "B"]
array[-1] = new_atom
- assert array.chain_id.tolist() == ["A","A","B","C"]
+ assert array.chain_id.tolist() == ["A", "A", "B", "C"]
del stack[1]
assert stack.stack_depth() == 2
def test_array_indexing(atom, array):
filtered_array = array[array.chain_id == "B"]
- assert filtered_array.res_name.tolist() == ["PRO","PRO","MSE"]
+ assert filtered_array.res_name.tolist() == ["PRO", "PRO", "MSE"]
assert atom == filtered_array[0]
- filtered_array = array[[0,2,4]]
- assert filtered_array.element.tolist() == ["N","O","SE"]
+ filtered_array = array[[0, 2, 4]]
+ assert filtered_array.element.tolist() == ["N", "O", "SE"]
def test_stack_indexing(stack):
@@ -93,22 +100,22 @@ def test_stack_indexing(stack):
filtered_stack = stack[0]
assert type(filtered_stack) == struc.AtomArray
filtered_stack = stack[0:2, stack.res_name == "PRO"]
- assert filtered_stack.atom_name.tolist() == ["O","CA"]
- filtered_stack = stack[np.array([True,False,True])]
+ assert filtered_stack.atom_name.tolist() == ["O", "CA"]
+ filtered_stack = stack[np.array([True, False, True])]
assert filtered_stack.stack_depth() == 2
assert filtered_stack.array_length() == 5
- filtered_stack = stack[:,0]
+ filtered_stack = stack[:, 0]
assert filtered_stack.stack_depth() == 3
assert filtered_stack.array_length() == 1
-
+
def test_concatenation(array, stack):
concat_array = array[2:] + array[:2]
- assert concat_array.chain_id.tolist() == ["B","B","B","A","A"]
- assert concat_array.coord.shape == (5,3)
- concat_stack = stack[:,2:] + stack[:,:2]
- assert concat_array.chain_id.tolist() == ["B","B","B","A","A"]
- assert concat_stack.coord.shape == (3,5,3)
+ assert concat_array.chain_id.tolist() == ["B", "B", "B", "A", "A"]
+ assert concat_array.coord.shape == (5, 3)
+ concat_stack = stack[:, 2:] + stack[:, :2]
+    assert concat_stack.chain_id.tolist() == ["B", "B", "B", "A", "A"]
+ assert concat_stack.coord.shape == (3, 5, 3)
def test_comparison(array):
@@ -129,23 +136,26 @@ def test_bonds(array):
with pytest.raises(ValueError):
# Expect a BondList with array length as atom count
array.bonds = struc.BondList(13)
- array.bonds = struc.BondList(5, np.array([(0,1),(0,2),(2,3),(2,4)]))
- assert array.bonds.as_array().tolist() == [[0, 1, 0],
- [0, 2, 0],
- [2, 3, 0],
- [2, 4, 0],]
+ array.bonds = struc.BondList(5, np.array([(0, 1), (0, 2), (2, 3), (2, 4)]))
+ assert array.bonds.as_array().tolist() == [
+ [0, 1, 0],
+ [0, 2, 0],
+ [2, 3, 0],
+ [2, 4, 0],
+ ]
filtered_array = array[array.chain_id == "B"]
- assert filtered_array.bonds.as_array().tolist() == [[0, 1, 0],
- [0, 2, 0]]
+ assert filtered_array.bonds.as_array().tolist() == [[0, 1, 0], [0, 2, 0]]
concat_array = array + array
- assert concat_array.bonds.as_array().tolist() == [[0, 1, 0],
- [0, 2, 0],
- [2, 3, 0],
- [2, 4, 0],
- [5, 6, 0],
- [5, 7, 0],
- [7, 8, 0],
- [7, 9, 0]]
+ assert concat_array.bonds.as_array().tolist() == [
+ [0, 1, 0],
+ [0, 2, 0],
+ [2, 3, 0],
+ [2, 4, 0],
+ [5, 6, 0],
+ [5, 7, 0],
+ [7, 8, 0],
+ [7, 9, 0],
+ ]
def test_box(array, stack, array_box, stack_box):
@@ -193,4 +203,4 @@ def test_pickle(atom, array, stack):
assert test_array == array
test_stack = pickle.loads(pickle.dumps(stack))
- assert test_stack == stack
\ No newline at end of file
+ assert test_stack == stack
diff --git a/tests/structure/test_basepairs.py b/tests/structure/test_basepairs.py
index d0b554f27..b11b78ce5 100644
--- a/tests/structure/test_basepairs.py
+++ b/tests/structure/test_basepairs.py
@@ -2,23 +2,22 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-import pytest
import json
-import warnings
+from os.path import join
import numpy as np
+import pytest
import biotite.structure as struc
import biotite.structure.io as strucio
-from biotite.structure.info import residue
-from biotite.structure.residues import get_residue_masks
-from biotite.structure.hbond import hbond
-from os.path import join
-from ..util import data_dir
+
# For ``base_pairs_edge()`` differences to a reference can be ambiguous
# as the number hydrogen bonds between two different edges can be equal.
# In order to distinguish ambiguously identified edges from wrongfully
# identified edges the full edge matrix, listing the number of hydrogen
# bonds for each edge has to be considered.
from biotite.structure.basepairs import _get_edge_matrix
+from biotite.structure.info import residue
+from biotite.structure.residues import get_residue_masks
+from tests.util import data_dir
def reversed_iterator(iter):
@@ -30,9 +29,7 @@ def reversed_iterator(iter):
@pytest.fixture
def nuc_sample_array():
- return strucio.load_structure(
- join(data_dir("structure"), "base_pairs", "1qxb.cif")
- )
+ return strucio.load_structure(join(data_dir("structure"), "base_pairs", "1qxb.cif"))
@pytest.fixture
@@ -40,11 +37,10 @@ def basepairs(nuc_sample_array):
"""
Generate a test output for the base_pairs function.
"""
- residue_indices, residue_names = struc.residues.get_residues(
- nuc_sample_array
- )[0:24]
+ residue_indices, residue_names = struc.residues.get_residues(nuc_sample_array)[0:24]
return np.vstack((residue_indices[:12], np.flip(residue_indices)[:12])).T
+
def check_residue_starts(computed_starts, nuc_sample_array):
"""
Assert that computed starts are residue starts.
@@ -53,6 +49,7 @@ def check_residue_starts(computed_starts, nuc_sample_array):
for start in computed_starts.flatten():
assert start in residue_starts
+
def check_output(computed_basepairs, basepairs):
"""
Check the output of base_pairs.
@@ -60,16 +57,17 @@ def check_output(computed_basepairs, basepairs):
# Check if base pairs are unique in computed_basepairs
seen = set()
- assert (not any(
- (base1, base2) in seen) or (base2, base1 in seen)
- or seen.add((base1, base2)) for base1, base2 in computed_basepairs
- )
+    assert not any(
+        (base1, base2) in seen
+        or (base2, base1) in seen
+        or seen.add((base1, base2))
+        for base1, base2 in computed_basepairs
+    )
# Check if the right number of base pairs is in computed_base pairs
- assert(len(computed_basepairs) == len(basepairs))
+ assert len(computed_basepairs) == len(basepairs)
# Check if the right base pairs are in computed_basepairs
for comp_basepair in computed_basepairs:
- assert ((comp_basepair in basepairs) \
- or (comp_basepair in np.flip(basepairs)))
+ assert (comp_basepair in basepairs) or (comp_basepair in np.flip(basepairs))
@pytest.mark.parametrize("unique_bool", [False, True])
@@ -102,16 +100,12 @@ def test_base_pairs_reverse(nuc_sample_array, basepairs, unique_bool):
# Reverse sequence of residues in nuc_sample_array
reversed_nuc_sample_array = struc.AtomArray(0)
- for residue in reversed_iterator(struc.residue_iter(nuc_sample_array)):
- reversed_nuc_sample_array = reversed_nuc_sample_array + residue
+ for res in reversed_iterator(struc.residue_iter(nuc_sample_array)):
+ reversed_nuc_sample_array = reversed_nuc_sample_array + res
- computed_basepairs = struc.base_pairs(
- reversed_nuc_sample_array, unique=unique_bool
- )
+ computed_basepairs = struc.base_pairs(reversed_nuc_sample_array, unique=unique_bool)
check_residue_starts(computed_basepairs, reversed_nuc_sample_array)
- check_output(
- reversed_nuc_sample_array[computed_basepairs].res_id, basepairs
- )
+ check_output(reversed_nuc_sample_array[computed_basepairs].res_id, basepairs)
def test_base_pairs_reverse_no_hydrogen(nuc_sample_array, basepairs):
@@ -123,14 +117,13 @@ def test_base_pairs_reverse_no_hydrogen(nuc_sample_array, basepairs):
nuc_sample_array = nuc_sample_array[nuc_sample_array.element != "H"]
# Reverse sequence of residues in nuc_sample_array
reversed_nuc_sample_array = struc.AtomArray(0)
- for residue in reversed_iterator(struc.residue_iter(nuc_sample_array)):
- reversed_nuc_sample_array = reversed_nuc_sample_array + residue
+ for res in reversed_iterator(struc.residue_iter(nuc_sample_array)):
+ reversed_nuc_sample_array = reversed_nuc_sample_array + res
computed_basepairs = struc.base_pairs(reversed_nuc_sample_array)
check_residue_starts(computed_basepairs, reversed_nuc_sample_array)
- check_output(
- reversed_nuc_sample_array[computed_basepairs].res_id, basepairs
- )
+ check_output(reversed_nuc_sample_array[computed_basepairs].res_id, basepairs)
+
def test_base_pairs_incomplete_structure(nuc_sample_array):
"""
@@ -142,14 +135,15 @@ def test_base_pairs_incomplete_structure(nuc_sample_array):
"""
nuc_sample_array = nuc_sample_array[
- ~ np.isin(
+ ~np.isin(
nuc_sample_array.atom_name,
- ['N1', 'C2', 'N3', 'C4', 'C5', 'C6', 'N7', 'C8', 'N9', 'O2']
+ ["N1", "C2", "N3", "C4", "C5", "C6", "N7", "C8", "N9", "O2"],
)
]
with pytest.warns(struc.IncompleteStructureWarning):
assert len(struc.base_pairs(nuc_sample_array)) == 0
+
@pytest.mark.parametrize("seed", range(10))
def test_base_pairs_reordered(nuc_sample_array, seed):
"""
@@ -160,52 +154,49 @@ def test_base_pairs_reordered(nuc_sample_array, seed):
nuc_sample_array_reordered = struc.AtomArray(0)
np.random.seed(seed)
- for residue in struc.residue_iter(nuc_sample_array):
- bound = residue.array_length()
- indices = np.random.choice(
- np.arange(bound), bound,replace=False
- )
- nuc_sample_array_reordered += residue[..., indices]
+ for res in struc.residue_iter(nuc_sample_array):
+ bound = res.array_length()
+ indices = np.random.choice(np.arange(bound), bound, replace=False)
+ nuc_sample_array_reordered += res[..., indices]
- assert(np.all(
+ assert np.all(
struc.base_pairs(nuc_sample_array)
== struc.base_pairs(nuc_sample_array_reordered)
- ))
+ )
def test_map_nucleotide():
- """Test the function map_nucleotide with some examples.
- """
- pyrimidines = ['C', 'T', 'U']
- purines = ['A', 'G']
+ """Test the function map_nucleotide with some examples."""
+ pyrimidines = ["C", "T", "U"]
+ purines = ["A", "G"]
# Test that the standard bases are correctly identified
- assert struc.map_nucleotide(residue('U')) == ('U', True)
- assert struc.map_nucleotide(residue('A')) == ('A', True)
- assert struc.map_nucleotide(residue('T')) == ('T', True)
- assert struc.map_nucleotide(residue('G')) == ('G', True)
- assert struc.map_nucleotide(residue('C')) == ('C', True)
+ assert struc.map_nucleotide(residue("U")) == ("U", True)
+ assert struc.map_nucleotide(residue("A")) == ("A", True)
+ assert struc.map_nucleotide(residue("T")) == ("T", True)
+ assert struc.map_nucleotide(residue("G")) == ("G", True)
+ assert struc.map_nucleotide(residue("C")) == ("C", True)
# Test that some non_standard nucleotides are mapped correctly to
# pyrimidine/purine references
- psu_tuple = struc.map_nucleotide(residue('PSU'))
+ psu_tuple = struc.map_nucleotide(residue("PSU"))
assert psu_tuple[0] in pyrimidines
- assert psu_tuple[1] == False
+ assert psu_tuple[1] is False
- psu_tuple = struc.map_nucleotide(residue('3MC'))
+ psu_tuple = struc.map_nucleotide(residue("3MC"))
assert psu_tuple[0] in pyrimidines
- assert psu_tuple[1] == False
+ assert psu_tuple[1] is False
- i_tuple = struc.map_nucleotide(residue('I'))
+ i_tuple = struc.map_nucleotide(residue("I"))
assert i_tuple[0] in purines
- assert i_tuple[1] == False
+ assert i_tuple[1] is False
- m7g_tuple = struc.map_nucleotide(residue('M7G'))
+ m7g_tuple = struc.map_nucleotide(residue("M7G"))
assert m7g_tuple[0] in purines
- assert m7g_tuple[1] == False
+ assert m7g_tuple[1] is False
with pytest.warns(struc.IncompleteStructureWarning):
- assert struc.map_nucleotide(residue('ALA')) == (None, False)
+ assert struc.map_nucleotide(residue("ALA")) == (None, False)
def get_reference(pdb_id, suffix):
@@ -218,12 +209,13 @@ def get_reference(pdb_id, suffix):
)
with open(
- join(data_dir("structure"), "base_pairs", f"{pdb_id}_{suffix}.json"
- ), "r") as file:
+ join(data_dir("structure"), "base_pairs", f"{pdb_id}_{suffix}.json"), "r"
+ ) as file:
reference = np.array(json.load(file))
return structure, reference
+
def get_reference_index(pair, array):
"""
Get the index of the row in a reference array, where the first two
@@ -236,10 +228,7 @@ def get_reference_index(pair, array):
return None
-
-def check_edge_plausibility(
- reference_structure, pair, reference_edges, output_edges
-):
+def check_edge_plausibility(reference_structure, pair, reference_edges, output_edges):
"""
Checks if the difference to a reference edge is at least ambiguous.
A difference is defined as ambiguous, if the number of hydrogen
@@ -280,8 +269,9 @@ def test_base_pairs_edge(pdb_id):
pair_res_ids = reference_structure[pair].res_id
index = get_reference_index(pair_res_ids, reference_edges)
if index is not None:
- pair_reference_edges = [
- reference_edges[index, 2], reference_edges[index, 3]
+ pair_reference_edges = [
+ reference_edges[index, 2],
+ reference_edges[index, 3],
]
check_edge_plausibility(
reference_structure, pair, pair_reference_edges, pair_edges
@@ -309,9 +299,7 @@ def test_base_pairs_glycosidic_bond(pdb_id):
pair_res_ids = reference_structure[pair].res_id
index = get_reference_index(pair_res_ids, reference_gly_bonds)
if index is not None:
- reference_orientation = struc.GlycosidicBond(
- reference_gly_bonds[index, 2]
- )
+ reference_orientation = struc.GlycosidicBond(reference_gly_bonds[index, 2])
assert reference_orientation == pair_orientation
@@ -333,7 +321,7 @@ def test_base_stacking():
# stacked.
expected_stackings = []
for i in range(1, 24):
- expected_stackings.append([i, i+1])
+ expected_stackings.append([i, i + 1])
# Due to distortions in the helix not all adjacent bases have a
# geometry that meets the criteria of `base_stacking`.
@@ -353,5 +341,3 @@ def test_base_stacking():
# Assert the stacking interactions are correct
for interaction in helix[stacking].res_id:
assert list(interaction) in expected_stackings
-
-
diff --git a/tests/structure/test_bonds.py b/tests/structure/test_bonds.py
index a5474ffde..d5c8b3508 100644
--- a/tests/structure/test_bonds.py
+++ b/tests/structure/test_bonds.py
@@ -9,7 +9,7 @@
import biotite.structure.info as info
import biotite.structure.io as strucio
import biotite.structure.io.pdbx as pdbx
-from ..util import data_dir
+from tests.util import data_dir
def generate_random_bond_list(atom_count, bond_count, seed=0):
@@ -23,20 +23,22 @@ def generate_random_bond_list(atom_count, bond_count, seed=0):
# Clip bond types to allowed BondType values
bonds[:, 2] %= len(struc.BondType)
# Remove bonds of atoms to itself
- bonds = bonds[bonds[:,0] != bonds[:,1]]
+ bonds = bonds[bonds[:, 0] != bonds[:, 1]]
assert len(bonds) > 0
return struc.BondList(atom_count, bonds)
@pytest.fixture(
- params=[False, True] # as_negative
+ params=[False, True] # as_negative
)
def bond_list(request):
"""
A toy :class:`BondList`.
"""
as_negative = request.param
- bond_array = np.array([(0,1),(2,1),(3,1),(3,4),(3,1),(1,2),(4,0),(6,4)])
+ bond_array = np.array(
+ [(0, 1), (2, 1), (3, 1), (3, 4), (3, 1), (1, 2), (4, 0), (6, 4)]
+ )
if as_negative:
return struc.BondList(7, -7 + bond_array)
else:
@@ -48,12 +50,14 @@ def test_creation(bond_list):
Test creating a :class:`BondList` on a known example.
"""
# Test includes redundancy removal and max bonds calculation
- assert bond_list.as_array().tolist() == [[0, 1, 0],
- [1, 2, 0],
- [1, 3, 0],
- [3, 4, 0],
- [0, 4, 0],
- [4, 6, 0]]
+ assert bond_list.as_array().tolist() == [
+ [0, 1, 0],
+ [1, 2, 0],
+ [1, 3, 0],
+ [3, 4, 0],
+ [0, 4, 0],
+ [4, 6, 0],
+ ]
assert bond_list._max_bonds_per_atom == 3
assert bond_list._atom_count == 7
@@ -65,46 +69,44 @@ def test_invalid_creation():
"""
# Test invalid input shapes
with pytest.raises(ValueError):
- struc.BondList(
- 5,
- np.array([
- [1,2,3,4]
- ])
- )
+ struc.BondList(5, np.array([[1, 2, 3, 4]]))
with pytest.raises(ValueError):
- struc.BondList(
- 5,
- np.array([1,2])
- )
+ struc.BondList(5, np.array([1, 2]))
# Test invalid atom indices
with pytest.raises(IndexError):
struc.BondList(
5,
- np.array([
- [1,2],
- # 5 is an invalid index for an atom count of 5
- [5,2]
- ])
+ np.array(
+ [
+ [1, 2],
+ # 5 is an invalid index for an atom count of 5
+ [5, 2],
+ ]
+ ),
)
with pytest.raises(IndexError):
struc.BondList(
5,
- np.array([
- # Index -6 is invalid for an atom count of 5
- [-6,3],
- [3,4]
- ])
+ np.array(
+ [
+ # Index -6 is invalid for an atom count of 5
+ [-6, 3],
+ [3, 4],
+ ]
+ ),
)
# Test invalid BondType
with pytest.raises(ValueError):
struc.BondList(
5,
- np.array([
- # BondType '8' does not exist
- [1,2,8]
- ])
+ np.array(
+ [
+ # BondType '8' does not exist
+ [1, 2, 8]
+ ]
+ ),
)
@@ -126,25 +128,21 @@ def test_modification(bond_list):
# Not in list -> Do nothing
bond_list.remove_bond(0, 3)
# Remove mutliple bonds, one of them is not in list
- bond_list.remove_bonds(struc.BondList(10, np.array([(1,0),(1,2),(8,9)])))
- assert bond_list.as_array().tolist() == [[1, 3, 1],
- [3, 4, 0],
- [4, 6, 0],
- [1, 4, 0]]
+ bond_list.remove_bonds(struc.BondList(10, np.array([(1, 0), (1, 2), (8, 9)])))
+ assert bond_list.as_array().tolist() == [[1, 3, 1], [3, 4, 0], [4, 6, 0], [1, 4, 0]]
def test_add_two_bond_list():
"""
Test adding two `BondList` objects.
"""
- bond_list1 = struc.BondList(2, np.array([(0,1)])) # max_bond_per_atom=1
- bond_list2 = struc.BondList(3, np.array([(0,1),(0,2)])) # max_bond_per_atom=2
+ bond_list1 = struc.BondList(2, np.array([(0, 1)])) # max_bond_per_atom=1
+ bond_list2 = struc.BondList(3, np.array([(0, 1), (0, 2)])) # max_bond_per_atom=2
added_list = bond_list1 + bond_list2
assert added_list._max_bonds_per_atom == 2
assert added_list.get_bonds(2)[0].tolist() == [3, 4]
- assert added_list.as_array().tolist() == [[0, 1, 0],
- [2, 3, 0],
- [2, 4, 0]]
+ assert added_list.as_array().tolist() == [[0, 1, 0], [2, 3, 0], [2, 4, 0]]
+
def test_contains(bond_list):
"""
@@ -185,29 +183,33 @@ def test_merge(bond_list):
"""
Test merging two `BondList` objects on a known example.
"""
- merged_list = struc.BondList(8, np.array([(4,6),(6,7)])).merge(bond_list)
- assert merged_list.as_array().tolist() == [[0, 1, 0],
- [1, 2, 0],
- [1, 3, 0],
- [3, 4, 0],
- [0, 4, 0],
- [4, 6, 0],
- [6, 7, 0]]
+ merged_list = struc.BondList(8, np.array([(4, 6), (6, 7)])).merge(bond_list)
+ assert merged_list.as_array().tolist() == [
+ [0, 1, 0],
+ [1, 2, 0],
+ [1, 3, 0],
+ [3, 4, 0],
+ [0, 4, 0],
+ [4, 6, 0],
+ [6, 7, 0],
+ ]
def test_concatenation(bond_list):
"""
Test concatenation of two `BondList` objects on a known example.
"""
- bond_list += struc.BondList(3, np.array([(0,1,2),(1,2,2)]))
- assert bond_list.as_array().tolist() == [[0, 1, 0],
- [1, 2, 0],
- [1, 3, 0],
- [3, 4, 0],
- [0, 4, 0],
- [4, 6, 0],
- [7, 8, 2],
- [8, 9, 2]]
+ bond_list += struc.BondList(3, np.array([(0, 1, 2), (1, 2, 2)]))
+ assert bond_list.as_array().tolist() == [
+ [0, 1, 0],
+ [1, 2, 0],
+ [1, 3, 0],
+ [3, 4, 0],
+ [0, 4, 0],
+ [4, 6, 0],
+ [7, 8, 2],
+ [8, 9, 2],
+ ]
assert bond_list._max_bonds_per_atom == 3
assert bond_list._atom_count == 10
@@ -219,30 +221,27 @@ def test_indexing(bond_list):
sub_list = bond_list[:]
assert sub_list.as_array().tolist() == bond_list.as_array().tolist()
sub_list = bond_list[::-1]
- assert sub_list.as_array().tolist() == [[5, 6, 0],
- [4, 5, 0],
- [3, 5, 0],
- [2, 3, 0],
- [2, 6, 0],
- [0, 2, 0]]
+ assert sub_list.as_array().tolist() == [
+ [5, 6, 0],
+ [4, 5, 0],
+ [3, 5, 0],
+ [2, 3, 0],
+ [2, 6, 0],
+ [0, 2, 0],
+ ]
sub_list = bond_list[1:6:2]
assert sub_list.as_array().tolist() == [[0, 1, 0]]
sub_list = bond_list[:4]
- assert sub_list.as_array().tolist() == [[0, 1, 0],
- [1, 2, 0],
- [1, 3, 0]]
+ assert sub_list.as_array().tolist() == [[0, 1, 0], [1, 2, 0], [1, 3, 0]]
sub_list = bond_list[2:]
- assert sub_list.as_array().tolist() == [[1, 2, 0],
- [2, 4, 0]]
+ assert sub_list.as_array().tolist() == [[1, 2, 0], [2, 4, 0]]
- sub_list = bond_list[[0,3,4]]
- assert sub_list.as_array().tolist() == [[1, 2, 0],
- [0, 2, 0]]
+ sub_list = bond_list[[0, 3, 4]]
+ assert sub_list.as_array().tolist() == [[1, 2, 0], [0, 2, 0]]
+
+ sub_list = bond_list[np.array([True, False, False, True, True, False, True])]
+ assert sub_list.as_array().tolist() == [[1, 2, 0], [0, 2, 0], [2, 3, 0]]
- sub_list = bond_list[np.array([True,False,False,True,True,False,True])]
- assert sub_list.as_array().tolist() == [[1, 2, 0],
- [0, 2, 0],
- [2, 3, 0]]
def test_get_all_bonds():
"""
@@ -261,17 +260,13 @@ def test_get_all_bonds():
assert (bond_types != -1).all(axis=1).any(axis=0)
test_bonds = [
- (
- bonded_i[bonded_i != -1].tolist(),
- bond_type[bond_type != -1].tolist()
- )
+ (bonded_i[bonded_i != -1].tolist(), bond_type[bond_type != -1].tolist())
for bonded_i, bond_type in zip(bonds, bond_types)
]
ref_bonds = [bond_list.get_bonds(i) for i in range(ATOM_COUNT)]
ref_bonds = [
- (bonded_i.tolist(), bond_type.tolist())
- for bonded_i, bond_type in ref_bonds
+ (bonded_i.tolist(), bond_type.tolist()) for bonded_i, bond_type in ref_bonds
]
assert test_bonds == ref_bonds
@@ -330,9 +325,9 @@ def test_sorted_array_indexing():
# Create a sorted array of random indices for the BondList
# Indices may not occur multiple times -> 'replace=False'
- index_array = np.sort(np.random.choice(
- np.arange(ATOM_COUNT), INDEX_SIZE, replace=False
- ))
+ index_array = np.sort(
+ np.random.choice(np.arange(ATOM_COUNT), INDEX_SIZE, replace=False)
+ )
test_bonds = bonds[index_array]
# Create a boolean mask that indexes the same elements as the array
@@ -363,15 +358,13 @@ def test_unsorted_array_indexing():
# Create random bonds between the reference integers
bonds = np.random.randint(ATOM_COUNT, size=(BOND_COUNT, 2))
# Remove bonds of elements to itself
- bonds = bonds[bonds[:,0] != bonds[:,1]]
+ bonds = bonds[bonds[:, 0] != bonds[:, 1]]
assert len(bonds) > 0
bonds = struc.BondList(ATOM_COUNT, bonds)
# Create an unsorted array of random indices for the BondList
# Indices should be unsorted -> 'replace=False'
- unsorted_index = np.random.choice(
- np.arange(ATOM_COUNT), INDEX_SIZE, replace=False
- )
+ unsorted_index = np.random.choice(np.arange(ATOM_COUNT), INDEX_SIZE, replace=False)
test_bonds = bonds[unsorted_index]
# Create a sorted variant of the index array
@@ -385,14 +378,18 @@ def test_unsorted_array_indexing():
# Get the 'atoms', in this case integers, that are connected with a bond
# Use a set for simpler comparison between the sorted and unsorted variant
# Omit the bond type -> 'bonds.as_array()[:, :2]'
- test_integer_pairs = set([
- frozenset((unsorted_indexed_integers[i], unsorted_indexed_integers[j]))
- for i, j in test_bonds.as_array()[:, :2]
- ])
- ref_integer_pairs = set([
- frozenset((sorted_indexed_integers[i], sorted_indexed_integers[j]))
- for i, j in ref_bonds.as_array()[:, :2]
- ])
+ test_integer_pairs = set(
+ [
+ frozenset((unsorted_indexed_integers[i], unsorted_indexed_integers[j]))
+ for i, j in test_bonds.as_array()[:, :2]
+ ]
+ )
+ ref_integer_pairs = set(
+ [
+ frozenset((sorted_indexed_integers[i], sorted_indexed_integers[j]))
+ for i, j in ref_bonds.as_array()[:, :2]
+ ]
+ )
# The BondList entries should be different,
# since they point to different positions in the reference array
@@ -415,18 +412,21 @@ def test_atom_array_consistency():
array = strucio.load_structure(join(data_dir("structure"), "1l2y.bcif"))[0]
ca = array[array.atom_name == "CA"]
# Just for testing, does not reflect real bonds
- bond_list = struc.BondList(ca.array_length(),
- np.array([(0,1),(2,8),(5,15),(1,5),(0,9),(3,18),(2,9)])
+ bond_list = struc.BondList(
+ ca.array_length(),
+ np.array([(0, 1), (2, 8), (5, 15), (1, 5), (0, 9), (3, 18), (2, 9)]),
)
ca.bonds = bond_list
- ref_ids = ca.res_id[bond_list.as_array()[:,:2].flatten()]
+ ref_ids = ca.res_id[bond_list.as_array()[:, :2].flatten()]
# Some random boolean mask as index,
# but all bonded atoms are included
- mask = np.array([1,1,1,1,0,1,0,0,1,1,0,1,1,0,0,1,1,0,1,1], dtype=bool)
+ mask = np.array(
+ [1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1], dtype=bool
+ )
masked_ca = ca[mask]
- test_ids = masked_ca.res_id[masked_ca.bonds.as_array()[:,:2].flatten()]
+ test_ids = masked_ca.res_id[masked_ca.bonds.as_array()[:, :2].flatten()]
# The bonds, should always point to the same atoms (same res_id),
# irrespective of indexing
@@ -442,9 +442,7 @@ def test_method_consistency(periodic):
THRESHOLD_PERCENTAGE = 0.99
# Structure with peptide, nucleotide, small molecules and water
- pdbx_file = pdbx.BinaryCIFFile.read(
- join(data_dir("structure"), "5ugo.bcif")
- )
+ pdbx_file = pdbx.BinaryCIFFile.read(join(data_dir("structure"), "5ugo.bcif"))
atoms = pdbx.get_structure(pdbx_file, model=1)
if periodic:
# Add large dummy box to test parameter
@@ -454,22 +452,22 @@ def test_method_consistency(periodic):
bonds_from_names = struc.connect_via_residue_names(atoms)
bonds_from_names.remove_bond_order()
- bonds_from_distances = struc.connect_via_distances(
- atoms, periodic=periodic
- )
+ bonds_from_distances = struc.connect_via_distances(atoms, periodic=periodic)
# The distance based method may not detect all bonds
assert bonds_from_distances.as_set().issubset(bonds_from_names.as_set())
- assert len(bonds_from_distances.as_array()) \
+ assert (
+ len(bonds_from_distances.as_array())
>= len(bonds_from_names.as_array()) * THRESHOLD_PERCENTAGE
+ )
def test_find_connected(bond_list):
"""
Find all connected atoms to an atom in a known example.
"""
- for index in (0,1,2,3,4,6):
- assert struc.find_connected(bond_list, index).tolist() == [0,1,2,3,4,6]
+ for index in (0, 1, 2, 3, 4, 6):
+ assert struc.find_connected(bond_list, index).tolist() == [0, 1, 2, 3, 4, 6]
assert struc.find_connected(bond_list, 5).tolist() == [5]
@@ -498,7 +496,7 @@ def test_find_connected(bond_list):
("C17", "C22"),
]),
]
-)
+) # fmt: skip
def test_find_rotatable_bonds(res_name, expected_bonds):
"""
Check the :func:`find_rotatable_bonds()` function based on
@@ -513,11 +511,9 @@ def test_find_rotatable_bonds(res_name, expected_bonds):
rotatable_bonds = struc.find_rotatable_bonds(molecule.bonds)
test_bond_set = set()
for i, j, _ in rotatable_bonds.as_array():
- test_bond_set.add(
- tuple(sorted((molecule.atom_name[i], molecule.atom_name[j])))
- )
+ test_bond_set.add(tuple(sorted((molecule.atom_name[i], molecule.atom_name[j]))))
# Compare with reference bonded atom names
assert test_bond_set == ref_bond_set
# All rotatable bonds must be single bonds
- assert np.all(rotatable_bonds.as_array()[:, 2] == struc.BondType.SINGLE)
\ No newline at end of file
+ assert np.all(rotatable_bonds.as_array()[:, 2] == struc.BondType.SINGLE)
diff --git a/tests/structure/test_box.py b/tests/structure/test_box.py
index 513f9cd35..716f2cf26 100644
--- a/tests/structure/test_box.py
+++ b/tests/structure/test_box.py
@@ -2,16 +2,15 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-from os.path import join
import itertools
import warnings
+from os.path import join
import numpy as np
import pytest
import biotite.structure as struc
import biotite.structure.io.pdbx as pdbx
from biotite.structure.io import load_structure
-from ..util import data_dir, cannot_import
-
+from tests.util import cannot_import, data_dir
SAMPLE_BOXES = [
(1, 1, 1, 90, 90, 90),
@@ -21,86 +20,82 @@
(2, 4, 6, 100, 110, 120),
(9, 9, 9, 90, 90, 170),
(9, 8, 7, 50, 80, 50),
-]
+] # fmt: skip
SAMPLE_COORD = [
( 1, 1, 1),
( 5, 10, 20),
(-1, 5, 8),
( 3, 1, 54)
-]
-
+] # fmt: skip
# Ignore warning about dummy unit cell vector
@pytest.mark.filterwarnings("ignore")
-@pytest.mark.skipif(
- cannot_import("mdtraj"),
- reason="MDTraj is not installed"
-)
-@pytest.mark.parametrize(
- "len_a, len_b, len_c, alpha, beta, gamma", SAMPLE_BOXES
-)
+@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed")
+@pytest.mark.parametrize("len_a, len_b, len_c, alpha, beta, gamma", SAMPLE_BOXES)
def test_box_vector_calculation(len_a, len_b, len_c, alpha, beta, gamma):
box = struc.vectors_from_unitcell(
- len_a, len_b, len_c,
- np.deg2rad(alpha), np.deg2rad(beta), np.deg2rad(gamma)
+ len_a, len_b, len_c, np.deg2rad(alpha), np.deg2rad(beta), np.deg2rad(gamma)
)
from mdtraj.utils import lengths_and_angles_to_box_vectors
+
ref_box = np.stack(
- lengths_and_angles_to_box_vectors(
- len_a, len_b, len_c, alpha, beta, gamma
- )
+ lengths_and_angles_to_box_vectors(len_a, len_b, len_c, alpha, beta, gamma)
)
assert np.allclose(box, ref_box)
assert struc.unitcell_from_vectors(box) == pytest.approx(
- (len_a, len_b, len_c,
- alpha * 2*np.pi / 360, beta * 2*np.pi / 360, gamma * 2*np.pi / 360)
+ (
+ len_a,
+ len_b,
+ len_c,
+ alpha * 2 * np.pi / 360,
+ beta * 2 * np.pi / 360,
+ gamma * 2 * np.pi / 360,
+ )
)
def test_volume():
# Very rudimentary test
- box = np.array([
- [5,0,0],
- [0,8,0],
- [0,0,2],
- ])
+ box = np.array(
+ [
+ [5, 0, 0],
+ [0, 8, 0],
+ [0, 0, 2],
+ ]
+ )
assert struc.box_volume(box) == pytest.approx(80)
boxes = np.stack([box, box])
- assert struc.box_volume(boxes) == pytest.approx(80,80)
+ assert struc.box_volume(boxes) == pytest.approx(80, 80)
@pytest.mark.parametrize(
"len_a, len_b, len_c, alpha, beta, gamma, x, y,z",
- [box+coord for box, coord in itertools.product(SAMPLE_BOXES, SAMPLE_COORD)]
+ [box + coord for box, coord in itertools.product(SAMPLE_BOXES, SAMPLE_COORD)],
)
def test_move_into_box(len_a, len_b, len_c, alpha, beta, gamma, x, y, z):
box = struc.vectors_from_unitcell(
- len_a, len_b, len_c,
- np.deg2rad(alpha), np.deg2rad(beta), np.deg2rad(gamma)
+ len_a, len_b, len_c, np.deg2rad(alpha), np.deg2rad(beta), np.deg2rad(gamma)
)
- coord = np.array([x,y,z])
+ coord = np.array([x, y, z])
moved_coord = struc.move_inside_box(coord, box)
fractions = struc.coord_to_fraction(moved_coord, box)
- assert ((fractions >= 0) & (fractions <=1)).all()
+ assert ((fractions >= 0) & (fractions <= 1)).all()
@pytest.mark.parametrize(
"len_a, len_b, len_c, alpha, beta, gamma, x, y,z",
- [box+coord for box, coord in itertools.product(SAMPLE_BOXES, SAMPLE_COORD)]
+ [box + coord for box, coord in itertools.product(SAMPLE_BOXES, SAMPLE_COORD)],
)
-def test_conversion_to_fraction(len_a, len_b, len_c,
- alpha, beta, gamma,
- x, y, z):
+def test_conversion_to_fraction(len_a, len_b, len_c, alpha, beta, gamma, x, y, z):
box = struc.vectors_from_unitcell(
- len_a, len_b, len_c,
- np.deg2rad(alpha), np.deg2rad(beta), np.deg2rad(gamma)
+ len_a, len_b, len_c, np.deg2rad(alpha), np.deg2rad(beta), np.deg2rad(gamma)
)
- coord = np.array([x,y,z])
+ coord = np.array([x, y, z])
fractions = struc.coord_to_fraction(coord, box)
if struc.is_orthogonal(box):
@@ -119,12 +114,11 @@ def test_conversion_to_fraction(len_a, len_b, len_c,
def test_repeat_box(multi_model):
model = None if multi_model else 1
array = pdbx.get_structure(
- pdbx.BinaryCIFFile.read(join(data_dir("structure"), "3o5r.bcif")),
- model=model
+ pdbx.BinaryCIFFile.read(join(data_dir("structure"), "3o5r.bcif")), model=model
)
repeat_array, _ = struc.repeat_box(array)
assert repeat_array.array_length() == array.array_length() * 27
- assert repeat_array[..., :array.array_length()] == array
+ assert repeat_array[..., : array.array_length()] == array
@pytest.mark.parametrize("multi_model", [True, False])
@@ -135,14 +129,12 @@ def test_remove_pbc_unsegmented(multi_model):
"""
model = None if multi_model else 1
ref_array = load_structure(
- join(data_dir("structure"), "3o5r.bcif"),
- model=model,
- include_bonds=True
+ join(data_dir("structure"), "3o5r.bcif"), model=model, include_bonds=True
)
# Center structure in box
centroid = struc.centroid(ref_array)
box_center = np.diag(ref_array.box) / 2
- ref_array = struc.translate(ref_array, box_center-centroid)
+ ref_array = struc.translate(ref_array, box_center - centroid)
test_array = struc.remove_pbc(ref_array)
assert ref_array.equal_annotation_categories(test_array)
@@ -150,11 +142,7 @@ def test_remove_pbc_unsegmented(multi_model):
@pytest.mark.parametrize(
- "multi_model, seed",
- itertools.product(
- [False, True],
- range(10)
- )
+ "multi_model, seed", itertools.product([False, True], range(10))
)
def test_remove_pbc_restore(multi_model, seed):
BUFFER = 5
@@ -162,14 +150,12 @@ def test_remove_pbc_restore(multi_model, seed):
def get_distance_matrices(array):
if isinstance(array, struc.AtomArray):
matrix = struc.distance(
- array.coord[:, np.newaxis, :],
- array.coord[np.newaxis, :, :],
- box=None
+ array.coord[:, np.newaxis, :], array.coord[np.newaxis, :, :], box=None
)
matrix_pbc = struc.distance(
array.coord[:, np.newaxis, :],
array.coord[np.newaxis, :, :],
- box=array.box
+ box=array.box,
)
elif isinstance(array, struc.AtomArrayStack):
matrices = [get_distance_matrices(model) for model in array]
@@ -177,9 +163,7 @@ def get_distance_matrices(array):
matrix_pbc = np.stack([m[1] for m in matrices])
return matrix, matrix_pbc
- stack = load_structure(
- join(data_dir("structure"), "1l2y.bcif"), include_bonds=True
- )
+ stack = load_structure(join(data_dir("structure"), "1l2y.bcif"), include_bonds=True)
# Only consider a single molecule
# -> remove all other atoms (in this case some unbound hydrogen)
@@ -188,10 +172,12 @@ def get_distance_matrices(array):
stack = stack[..., largest_mask]
# Create a relatively tight box around the protein
- stack.box = np.array([
- np.diag(np.max(coord, axis=0) - np.min(coord, axis=0) + BUFFER)
- for coord in stack.coord
- ])
+ stack.box = np.array(
+ [
+ np.diag(np.max(coord, axis=0) - np.min(coord, axis=0) + BUFFER)
+ for coord in stack.coord
+ ]
+ )
stack.coord -= np.min(stack.coord, axis=-2)[:, np.newaxis, :] + BUFFER / 2
if multi_model:
array = stack
@@ -203,8 +189,7 @@ def get_distance_matrices(array):
np.random.seed(seed)
size = (array.stack_depth(), 3) if isinstance(array, struc.AtomArrayStack) else 3
translation_vector = np.sum(
- np.random.uniform(-5, 5, size)[:, np.newaxis] * array.box,
- axis=-2
+ np.random.uniform(-5, 5, size)[:, np.newaxis] * array.box, axis=-2
)[..., np.newaxis, :]
# Move atoms over periodic boundary...
array = struc.translate(array, translation_vector)
@@ -226,10 +211,7 @@ def get_distance_matrices(array):
# The centroid of the structure should be inside the box dimensions
centroid = struc.centroid(array)
- assert np.all(
- (centroid > np.zeros(3)) &
- (centroid < np.sum(array.box, axis=-2))
- )
+ assert np.all((centroid > np.zeros(3)) & (centroid < np.sum(array.box, axis=-2)))
@pytest.mark.parametrize("multi_model", [True, False])
@@ -249,4 +231,4 @@ def test_remove_pbc_selection(multi_model):
# A warning due to a zero-division (centroid of empty list of
# atoms) is raised here
warnings.simplefilter("ignore")
- assert struc.remove_pbc(array, select_none) == array
\ No newline at end of file
+ assert struc.remove_pbc(array, select_none) == array
diff --git a/tests/structure/test_celllist.py b/tests/structure/test_celllist.py
index 13267ce74..8ddef530b 100644
--- a/tests/structure/test_celllist.py
+++ b/tests/structure/test_celllist.py
@@ -2,13 +2,13 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-from os.path import join
import itertools
+from os.path import join
import numpy as np
import pytest
import biotite.structure as struc
import biotite.structure.io as strucio
-from ..util import data_dir
+from tests.util import data_dir
# Result should be independent of cell size
@@ -19,28 +19,20 @@ def test_get_atoms(cell_size):
with known solutions.
"""
array = struc.AtomArray(length=5)
- array.coord = np.array([[0,0,i] for i in range(5)])
+ array.coord = np.array([[0, 0, i] for i in range(5)])
cell_list = struc.CellList(array, cell_size=cell_size)
- assert cell_list.get_atoms(np.array([0,0,0.1]), 1).tolist() == [0,1]
- assert cell_list.get_atoms(np.array([0,0,1.1]), 1).tolist() == [1,2]
- assert cell_list.get_atoms(np.array([0,0,1.1]), 2).tolist() == [0,1,2,3]
+ assert cell_list.get_atoms(np.array([0, 0, 0.1]), 1).tolist() == [0, 1]
+ assert cell_list.get_atoms(np.array([0, 0, 1.1]), 1).tolist() == [1, 2]
+ assert cell_list.get_atoms(np.array([0, 0, 1.1]), 2).tolist() == [0, 1, 2, 3]
# Multiple positions
- pos = np.array([[0,0,0.1],
- [0,0,1.1],
- [0,0,4.1]])
- expected_indices = [0, 1, 2,
- 0, 1, 2, 3,
- 3, 4]
+ pos = np.array([[0, 0, 0.1], [0, 0, 1.1], [0, 0, 4.1]])
+ expected_indices = [0, 1, 2, 0, 1, 2, 3, 3, 4]
indices = cell_list.get_atoms(pos, 2)
assert indices[indices != -1].tolist() == expected_indices
# Multiple positions and multiple radii
- pos = np.array([[0,0,0.1],
- [0,0,1.1],
- [0,0,4.1]])
+ pos = np.array([[0, 0, 0.1], [0, 0, 1.1], [0, 0, 4.1]])
rad = np.array([1.0, 2.0, 3.0])
- expected_indices = [0, 1,
- 0, 1, 2, 3,
- 2, 3, 4]
+ expected_indices = [0, 1, 0, 1, 2, 3, 2, 3, 4]
indices = cell_list.get_atoms(pos, rad)
assert indices[indices != -1].tolist() == expected_indices
@@ -52,7 +44,7 @@ def test_get_atoms(cell_size):
[2, 5, 10],
[False, True],
[False, True],
- )
+ ),
)
def test_adjacency_matrix(cell_size, threshold, periodic, use_selection):
"""
@@ -64,9 +56,7 @@ def test_adjacency_matrix(cell_size, threshold, periodic, use_selection):
if periodic:
# Create an orthorhombic box
# with the outer coordinates as bounds
- array.box = np.diag(
- np.max(array.coord, axis=-2) - np.min(array.coord, axis=-2)
- )
+ array.box = np.diag(np.max(array.coord, axis=-2) - np.min(array.coord, axis=-2))
if use_selection:
np.random.seed(0)
@@ -83,17 +73,14 @@ def test_adjacency_matrix(cell_size, threshold, periodic, use_selection):
distance = struc.index_distance(
array,
np.stack(
- [
- np.repeat(np.arange(length), length),
- np.tile(np.arange(length), length)
- ],
- axis=-1
+ [np.repeat(np.arange(length), length), np.tile(np.arange(length), length)],
+ axis=-1,
),
- periodic
+ periodic,
)
distance = np.reshape(distance, (length, length))
# Create adjacency matrix from distance matrix
- exp_matrix = (distance <= threshold)
+ exp_matrix = distance <= threshold
if use_selection:
# Set rows and columns to False for filtered out atoms
exp_matrix[~selection, :] = False
@@ -145,12 +132,10 @@ def test_empty_coordinates():
array = strucio.load_structure(join(data_dir("structure"), "3o5r.bcif"))
cell_list = struc.CellList(array, cell_size=10)
- for method in (
- struc.CellList.get_atoms, struc.CellList.get_atoms_in_cells
- ):
+ for method in (struc.CellList.get_atoms, struc.CellList.get_atoms_in_cells):
indices = method(cell_list, np.array([]), 1, as_mask=False)
mask = method(cell_list, np.array([]), 1, as_mask=True)
assert len(indices) == 0
assert len(mask) == 0
assert indices.dtype == np.int32
- assert mask.dtype == bool
\ No newline at end of file
+ assert mask.dtype == bool
diff --git a/tests/structure/test_chains.py b/tests/structure/test_chains.py
index ffd5f682b..3c7daa782 100644
--- a/tests/structure/test_chains.py
+++ b/tests/structure/test_chains.py
@@ -2,18 +2,19 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-import biotite.structure as struc
-import biotite.structure.io as strucio
-import numpy as np
from os.path import join
-from ..util import data_dir
+import numpy as np
import pytest
+import biotite.structure as struc
+import biotite.structure.io as strucio
+from tests.util import data_dir
@pytest.fixture
def array():
return strucio.load_structure(join(data_dir("structure"), "1igy.bcif"))
+
def test_get_chain_starts(array):
"""
Compare :func:`test_get_chain_starts()` with :func:`np.unique` in a
@@ -24,6 +25,7 @@ def test_get_chain_starts(array):
# All first occurences of a chain id are automatically chain starts
assert set(ref_starts).issubset(set(test_starts))
+
def test_get_chain_starts_same_id(array):
"""
Expect correct number of chains in a case where two successive
@@ -34,18 +36,20 @@ def test_get_chain_starts_same_id(array):
merged = array + array
assert struc.get_chain_starts(merged).tolist() == [0, array.array_length()]
+
def test_apply_chain_wise(array):
data = struc.apply_chain_wise(array, np.ones(len(array)), np.sum)
assert data.tolist() == [
- len(array[array.chain_id == chain_id])
- for chain_id in np.unique(array.chain_id)
+ len(array[array.chain_id == chain_id]) for chain_id in np.unique(array.chain_id)
]
+
def test_spread_chain_wise(array):
input_data = np.unique(array.chain_id)
output_data = struc.spread_chain_wise(array, input_data)
assert output_data.tolist() == array.chain_id.tolist()
+
def test_get_chain_masks(array):
SAMPLE_SIZE = 100
np.random.seed(0)
@@ -55,26 +59,29 @@ def test_get_chain_masks(array):
ref_mask = array.chain_id == array.chain_id[index]
assert test_mask.tolist() == ref_mask.tolist()
+
def test_get_chain_starts_for(array):
SAMPLE_SIZE = 100
np.random.seed(0)
indices = np.random.randint(0, array.array_length(), SAMPLE_SIZE)
ref_starts = np.array(
- [np.where(mask)[0][0] for mask
- in struc.get_chain_masks(array, indices)]
+ [np.where(mask)[0][0] for mask in struc.get_chain_masks(array, indices)]
)
test_starts = struc.get_chain_starts_for(array, indices)
assert test_starts.tolist() == ref_starts.tolist()
+
def test_get_chains(array):
assert struc.get_chains(array).tolist() == ["A", "B", "C", "D", "E", "F"]
+
def test_get_chain_count(array):
assert struc.get_chain_count(array) == 6
+
def test_chain_iter(array):
n = 0
for chain in struc.get_chains(array):
n += 1
assert isinstance(array, struc.AtomArray)
- assert n == 6
\ No newline at end of file
+ assert n == 6
diff --git a/tests/structure/test_charges.py b/tests/structure/test_charges.py
index 4d85d411b..35a99f11a 100644
--- a/tests/structure/test_charges.py
+++ b/tests/structure/test_charges.py
@@ -3,13 +3,9 @@
# information.
import warnings
-import pytest
import numpy as np
-from biotite.structure import Atom
-from biotite.structure import array
-from biotite.structure import BondList
-from biotite.structure import partial_charges
-
+import pytest
+from biotite.structure import Atom, BondList, array, partial_charges
# Test the partial charge of carbon in the molecules given in table
# 3 of the Gasteiger-Marsili publication
@@ -19,236 +15,236 @@
# the relevant information is the BondList
# Creating atoms to build molecules with
-carbon = Atom([0, 0, 0], element="C")
+carbon = Atom([0, 0, 0], element="C")
hydrogen = Atom([0, 0, 0], element="H")
-oxygen = Atom([0, 0, 0], element="O")
+oxygen = Atom([0, 0, 0], element="O")
nitrogen = Atom([0, 0, 0], element="N")
fluorine = Atom([0, 0, 0], element="F")
-sulfur = Atom([0, 0, 0], element="S")
+sulfur = Atom([0, 0, 0], element="S")
# Building molecules
methane = array([carbon, hydrogen, hydrogen, hydrogen, hydrogen])
methane.bonds = BondList(
- methane.array_length(),
- np.array([[0,1,1], [0,2,1], [0,3,1], [0,4,1]])
+ methane.array_length(), np.array([[0, 1, 1], [0, 2, 1], [0, 3, 1], [0, 4, 1]])
)
mol_length = methane.array_length()
methane.charge = np.array([0] * mol_length)
ethane = array(
- [carbon, carbon, hydrogen, hydrogen, hydrogen, hydrogen, hydrogen,
- hydrogen]
+ [carbon, carbon, hydrogen, hydrogen, hydrogen, hydrogen, hydrogen, hydrogen]
)
ethane.bonds = BondList(
ethane.array_length(),
- np.array([
- [0,1,1], [0,2,1], [0,3,1], [0,4,1], [1,5,1], [1,6,1], [1,7,1]
- ])
+ np.array(
+ [[0, 1, 1], [0, 2, 1], [0, 3, 1], [0, 4, 1], [1, 5, 1], [1, 6, 1], [1, 7, 1]]
+ ),
)
mol_length = ethane.array_length()
ethane.charge = np.array([0] * mol_length)
-ethylene = array(
- [carbon, carbon, hydrogen, hydrogen, hydrogen, hydrogen]
-)
+ethylene = array([carbon, carbon, hydrogen, hydrogen, hydrogen, hydrogen])
ethylene.bonds = BondList(
ethylene.array_length(),
- np.array([[0,1,2], [0,2,1], [0,3,1], [1,4,1], [1,5,1]])
+ np.array([[0, 1, 2], [0, 2, 1], [0, 3, 1], [1, 4, 1], [1, 5, 1]]),
)
mol_length = ethylene.array_length()
ethylene.charge = np.array([0] * mol_length)
-acetylene = array(
- [carbon, carbon, hydrogen, hydrogen]
-)
+acetylene = array([carbon, carbon, hydrogen, hydrogen])
acetylene.bonds = BondList(
- acetylene.array_length(),
- np.array([[0,1,3], [0,2,1], [1,3,1]])
+ acetylene.array_length(), np.array([[0, 1, 3], [0, 2, 1], [1, 3, 1]])
)
mol_length = acetylene.array_length()
acetylene.charge = np.array([0] * mol_length)
-fluoromethane = array(
- [carbon, fluorine, hydrogen, hydrogen, hydrogen]
-)
+fluoromethane = array([carbon, fluorine, hydrogen, hydrogen, hydrogen])
fluoromethane.bonds = BondList(
- fluoromethane.array_length(),
- np.array([[0,1,1], [0,2,1], [0,3,1], [0,4,1]])
+ fluoromethane.array_length(), np.array([[0, 1, 1], [0, 2, 1], [0, 3, 1], [0, 4, 1]])
)
mol_length = fluoromethane.array_length()
fluoromethane.charge = np.array([0] * mol_length)
-difluoromethane = array(
- [carbon, fluorine, fluorine, hydrogen, hydrogen]
-)
+difluoromethane = array([carbon, fluorine, fluorine, hydrogen, hydrogen])
difluoromethane.bonds = BondList(
difluoromethane.array_length(),
- np.array([[0,1,1], [0,2,1], [0,3,1], [0,4,1]])
+ np.array([[0, 1, 1], [0, 2, 1], [0, 3, 1], [0, 4, 1]]),
)
mol_length = difluoromethane.array_length()
difluoromethane.charge = np.array([0] * mol_length)
-trifluoromethane = array(
- [carbon, fluorine, fluorine, fluorine, hydrogen]
-)
+trifluoromethane = array([carbon, fluorine, fluorine, fluorine, hydrogen])
trifluoromethane.bonds = BondList(
trifluoromethane.array_length(),
- np.array([[0,1,1], [0,2,1], [0,3,1], [0,4,1]])
+ np.array([[0, 1, 1], [0, 2, 1], [0, 3, 1], [0, 4, 1]]),
)
mol_length = trifluoromethane.array_length()
trifluoromethane.charge = np.array([0] * mol_length)
-tetrafluoromethane = array(
- [carbon, fluorine, fluorine, fluorine, fluorine]
-)
+tetrafluoromethane = array([carbon, fluorine, fluorine, fluorine, fluorine])
tetrafluoromethane.bonds = BondList(
tetrafluoromethane.array_length(),
- np.array([[0,1,1], [0,2,1], [0,3,1], [0,4,1]])
+ np.array([[0, 1, 1], [0, 2, 1], [0, 3, 1], [0, 4, 1]]),
)
mol_length = tetrafluoromethane.array_length()
tetrafluoromethane.charge = np.array([0] * mol_length)
fluoroethane = array(
- [carbon, carbon, fluorine, hydrogen, hydrogen, hydrogen,
- hydrogen, hydrogen]
+ [carbon, carbon, fluorine, hydrogen, hydrogen, hydrogen, hydrogen, hydrogen]
)
fluoroethane.bonds = BondList(
fluoroethane.array_length(),
- np.array([
- [0,1,1], [0,2,1], [0,3,1], [0,4,1], [1,5,1], [1,6,1], [1,7,1]
- ])
+ np.array(
+ [[0, 1, 1], [0, 2, 1], [0, 3, 1], [0, 4, 1], [1, 5, 1], [1, 6, 1], [1, 7, 1]]
+ ),
)
mol_length = fluoroethane.array_length()
fluoroethane.charge = np.array([0] * mol_length)
trifluoroethane = array(
- [carbon, carbon, fluorine, fluorine, fluorine, hydrogen,
- hydrogen, hydrogen]
+ [carbon, carbon, fluorine, fluorine, fluorine, hydrogen, hydrogen, hydrogen]
)
trifluoroethane.bonds = BondList(
trifluoroethane.array_length(),
- np.array([
- [0,1,1], [0,2,1], [0,3,1], [0,4,1], [1,5,1], [1,6,1], [1,7,1]
- ])
+ np.array(
+ [[0, 1, 1], [0, 2, 1], [0, 3, 1], [0, 4, 1], [1, 5, 1], [1, 6, 1], [1, 7, 1]]
+ ),
)
mol_length = trifluoroethane.array_length()
trifluoroethane.charge = np.array([0] * mol_length)
-methanole = array(
- [carbon, oxygen, hydrogen, hydrogen, hydrogen, hydrogen]
-)
+methanole = array([carbon, oxygen, hydrogen, hydrogen, hydrogen, hydrogen])
methanole.bonds = BondList(
methanole.array_length(),
- np.array([[0,1,1], [0,2,1], [0,3,1], [0,4,1], [1,5,1]])
+ np.array([[0, 1, 1], [0, 2, 1], [0, 3, 1], [0, 4, 1], [1, 5, 1]]),
)
mol_length = methanole.array_length()
methanole.charge = np.array([0] * mol_length)
dimethyl_ether = array(
- [carbon, carbon, oxygen, hydrogen, hydrogen, hydrogen, hydrogen,
- hydrogen, hydrogen]
+ [carbon, carbon, oxygen, hydrogen, hydrogen, hydrogen, hydrogen, hydrogen, hydrogen]
)
dimethyl_ether.bonds = BondList(
dimethyl_ether.array_length(),
- np.array([
- [0,2,1], [1,2,1], [0,3,1], [0,4,1], [0,5,1], [1,6,1], [1,7,1],
- [1,8,1]
- ])
+ np.array(
+ [
+ [0, 2, 1],
+ [1, 2, 1],
+ [0, 3, 1],
+ [0, 4, 1],
+ [0, 5, 1],
+ [1, 6, 1],
+ [1, 7, 1],
+ [1, 8, 1],
+ ]
+ ),
)
mol_length = dimethyl_ether.array_length()
dimethyl_ether.charge = np.array([0] * mol_length)
-formaldehyde = array(
- [carbon, oxygen, hydrogen, hydrogen]
-)
+formaldehyde = array([carbon, oxygen, hydrogen, hydrogen])
formaldehyde.bonds = BondList(
- formaldehyde.array_length(),
- np.array([[0,1,2], [0,2,1], [0,3,1]])
+ formaldehyde.array_length(), np.array([[0, 1, 2], [0, 2, 1], [0, 3, 1]])
)
mol_length = formaldehyde.array_length()
formaldehyde.charge = np.array([0] * mol_length)
-acetaldehyde = array(
- [carbon, carbon, oxygen, hydrogen, hydrogen, hydrogen, hydrogen]
-)
+acetaldehyde = array([carbon, carbon, oxygen, hydrogen, hydrogen, hydrogen, hydrogen])
acetaldehyde.bonds = BondList(
acetaldehyde.array_length(),
- np.array([[0,1,1], [1,2,2], [0,3,1], [0,4,1], [0,5,1], [1,6,1]])
+ np.array([[0, 1, 1], [1, 2, 2], [0, 3, 1], [0, 4, 1], [0, 5, 1], [1, 6, 1]]),
)
mol_length = acetaldehyde.array_length()
acetaldehyde.charge = np.array([0] * mol_length)
acetone = array(
- [carbon, carbon, carbon, oxygen, hydrogen, hydrogen, hydrogen,
- hydrogen, hydrogen, hydrogen]
+ [
+ carbon,
+ carbon,
+ carbon,
+ oxygen,
+ hydrogen,
+ hydrogen,
+ hydrogen,
+ hydrogen,
+ hydrogen,
+ hydrogen,
+ ]
)
acetone.bonds = BondList(
acetone.array_length(),
- np.array([
- [0,1,1], [1,2,1], [1,3,2], [0,4,1], [0,5,1], [0,6,1], [2,7,1],
- [2,8,1], [2,9,1]
- ])
+ np.array(
+ [
+ [0, 1, 1],
+ [1, 2, 1],
+ [1, 3, 2],
+ [0, 4, 1],
+ [0, 5, 1],
+ [0, 6, 1],
+ [2, 7, 1],
+ [2, 8, 1],
+ [2, 9, 1],
+ ]
+ ),
)
mol_length = acetone.array_length()
acetone.charge = np.array([0] * mol_length)
-hydrogen_cyanide = array(
- [carbon, nitrogen, hydrogen]
-)
+hydrogen_cyanide = array([carbon, nitrogen, hydrogen])
hydrogen_cyanide.bonds = BondList(
- hydrogen_cyanide.array_length(),
- np.array([[0,1,3], [0,2,1]])
+ hydrogen_cyanide.array_length(), np.array([[0, 1, 3], [0, 2, 1]])
)
mol_length = hydrogen_cyanide.array_length()
hydrogen_cyanide.charge = np.array([0] * mol_length)
-acetonitrile = array(
- [carbon, carbon, nitrogen, hydrogen, hydrogen, hydrogen]
-)
+acetonitrile = array([carbon, carbon, nitrogen, hydrogen, hydrogen, hydrogen])
acetonitrile.bonds = BondList(
acetonitrile.array_length(),
- np.array([[0,1,1], [1,2,3], [0,3,1], [0,4,1], [0,5,1]])
+ np.array([[0, 1, 1], [1, 2, 3], [0, 3, 1], [0, 4, 1], [0, 5, 1]]),
)
mol_length = acetonitrile.array_length()
acetonitrile.charge = np.array([0] * mol_length)
+
# For this purpose, parametrization via pytest is performed
-@pytest.mark.parametrize("molecule, expected_results", [
- (methane, (-0.078,)),
- (ethane, (-0.068, -0.068)),
- (ethylene, (-0.106, -0.106)),
- (acetylene, (-0.122, -0.122)),
- (fluoromethane, (0.079,)),
- (difluoromethane, (0.23,)),
- (trifluoromethane, (0.38,)),
- (tetrafluoromethane, (0.561,)),
- (fluoroethane, (0.087, -0.037)),
- (trifluoroethane, (0.387, 0.039)),
- (methanole, (0.033,)),
- (dimethyl_ether, (0.036, 0.036)),
- (formaldehyde, (0.115,)),
- (acetaldehyde, (-0.009, 0.123)),
- (acetone, (-0.006, 0.131, -0.006)),
- (hydrogen_cyanide, (0.051,)),
- (acetonitrile, (0.023, 0.06))
-])
+@pytest.mark.parametrize(
+ "molecule, expected_results",
+ [
+ (methane, (-0.078,)),
+ (ethane, (-0.068, -0.068)),
+ (ethylene, (-0.106, -0.106)),
+ (acetylene, (-0.122, -0.122)),
+ (fluoromethane, (0.079,)),
+ (difluoromethane, (0.23,)),
+ (trifluoromethane, (0.38,)),
+ (tetrafluoromethane, (0.561,)),
+ (fluoroethane, (0.087, -0.037)),
+ (trifluoroethane, (0.387, 0.039)),
+ (methanole, (0.033,)),
+ (dimethyl_ether, (0.036, 0.036)),
+ (formaldehyde, (0.115,)),
+ (acetaldehyde, (-0.009, 0.123)),
+ (acetone, (-0.006, 0.131, -0.006)),
+ (hydrogen_cyanide, (0.051,)),
+ (acetonitrile, (0.023, 0.06)),
+ ],
+)
def test_partial_charges(molecule, expected_results):
"""
Test whether the partial charges of the carbon atoms comprised in
@@ -257,29 +253,33 @@ def test_partial_charges(molecule, expected_results):
within a certain tolerance range.
"""
charges = partial_charges(molecule)
- assert charges[molecule.element == "C"].tolist() == \
- pytest.approx(expected_results, abs=1e-2)
-
-
-@pytest.mark.parametrize("molecule", [
- methane,
- ethane,
- ethylene,
- acetylene,
- fluoromethane,
- difluoromethane,
- trifluoromethane,
- tetrafluoromethane,
- fluoroethane,
- trifluoroethane,
- methanole,
- dimethyl_ether,
- formaldehyde,
- acetaldehyde,
- acetone,
- hydrogen_cyanide,
- acetonitrile
-])
+ assert charges[molecule.element == "C"].tolist() == pytest.approx(
+ expected_results, abs=1e-2
+ )
+
+
+@pytest.mark.parametrize(
+ "molecule",
+ [
+ methane,
+ ethane,
+ ethylene,
+ acetylene,
+ fluoromethane,
+ difluoromethane,
+ trifluoromethane,
+ tetrafluoromethane,
+ fluoroethane,
+ trifluoroethane,
+ methanole,
+ dimethyl_ether,
+ formaldehyde,
+ acetaldehyde,
+ acetone,
+ hydrogen_cyanide,
+ acetonitrile,
+ ],
+)
def test_total_charge_zero(molecule):
"""
In the case of the 17 molecules given in table 3, it is verified
@@ -302,14 +302,8 @@ def test_pos_formal_charge():
pos_methane = methane.copy()
pos_methane.charge = np.array([1, 0, 0, 0, 0])
- ref_carb_part_charge = partial_charges(
- methane,
- iteration_step_num=6
- )[0]
- pos_carb_part_charge = partial_charges(
- pos_methane,
- iteration_step_num=6
- )[0]
+ ref_carb_part_charge = partial_charges(methane, iteration_step_num=6)[0]
+ pos_carb_part_charge = partial_charges(pos_methane, iteration_step_num=6)[0]
assert pos_carb_part_charge < 1
assert pos_carb_part_charge > ref_carb_part_charge
@@ -331,16 +325,12 @@ def test_valence_state_not_parametrized():
with pytest.warns(
UserWarning,
match=(
- "Parameters for specific valence states of some atoms "
- "are not available"
- )
+ "Parameters for specific valence states of some atoms " "are not available"
+ ),
):
- thioformaldehyde = array(
- [carbon, sulfur, hydrogen, hydrogen]
- )
+ thioformaldehyde = array([carbon, sulfur, hydrogen, hydrogen])
thioformaldehyde.bonds = BondList(
- thioformaldehyde.array_length(),
- np.array([[0,1,2], [0,2,1], [0,3,1]])
+ thioformaldehyde.array_length(), np.array([[0, 1, 2], [0, 2, 1], [0, 3, 1]])
)
mol_length = thioformaldehyde.array_length()
thioformaldehyde.charge = np.array([0] * mol_length)
@@ -368,9 +358,7 @@ def test_correct_output_ions():
sodium_array.bonds = BondList(sodium_array.array_length())
with warnings.catch_warnings():
warnings.simplefilter("error")
- sodium_charge = partial_charges(
- sodium_array, iteration_step_num=1
- )[0]
+ sodium_charge = partial_charges(sodium_array, iteration_step_num=1)[0]
assert sodium_charge == 1
@@ -414,51 +402,72 @@ def test_correct_output_charged_aa():
unspecified bond types throughout the whole AtomArray is raised.
"""
- glycine_charge = np.array(
- [+1, 0, 0, 0, -1, 0, 0, 0, 0, 0]
- )
+ glycine_charge = np.array([+1, 0, 0, 0, -1, 0, 0, 0, 0, 0])
glycine_with_btype = array(
- [nitrogen, carbon, carbon, oxygen, oxygen, hydrogen, hydrogen,
- hydrogen, hydrogen, hydrogen]
+ [
+ nitrogen,
+ carbon,
+ carbon,
+ oxygen,
+ oxygen,
+ hydrogen,
+ hydrogen,
+ hydrogen,
+ hydrogen,
+ hydrogen,
+ ]
)
glycine_with_btype.charge = glycine_charge
glycine_with_btype.bonds = BondList(
glycine_with_btype.array_length(),
- np.array([
- [0,1,1], [0,5,1], [0,6,1], [0,7,1], [1,2,1], [1,8,1],
- [1,9,1], [2,3,2], [2,4,1]
- ])
+ np.array(
+ [
+ [0, 1, 1],
+ [0, 5, 1],
+ [0, 6, 1],
+ [0, 7, 1],
+ [1, 2, 1],
+ [1, 8, 1],
+ [1, 9, 1],
+ [2, 3, 2],
+ [2, 4, 1],
+ ]
+ ),
)
glycine_without_btype = glycine_with_btype.copy()
glycine_without_btype.charge = glycine_charge
glycine_without_btype.bonds = BondList(
glycine_without_btype.array_length(),
- np.array([
- [0,1,0], [0,5,0], [0,6,0], [0,7,0], [1,2,0], [1,8,0],
- [1,9,0], [2,3,0], [2,4,0]
- ])
+ np.array(
+ [
+ [0, 1, 0],
+ [0, 5, 0],
+ [0, 6, 0],
+ [0, 7, 0],
+ [1, 2, 0],
+ [1, 8, 0],
+ [1, 9, 0],
+ [2, 3, 0],
+ [2, 4, 0],
+ ]
+ ),
)
part_charges_with_btype = partial_charges(glycine_with_btype)
with pytest.warns(UserWarning, match="Each atom's bond type is 0"):
- part_charges_without_btype = partial_charges(
- glycine_without_btype
- )
+ part_charges_without_btype = partial_charges(glycine_without_btype)
# Nitrogen of the amino group has the index 0
nitr_charge_with_btype = part_charges_with_btype[0]
nitr_charge_without_btype = part_charges_without_btype[0]
- assert nitr_charge_with_btype == pytest.approx(
- nitr_charge_without_btype, abs=5e-4
- )
+ assert nitr_charge_with_btype == pytest.approx(nitr_charge_without_btype, abs=5e-4)
# Oxygen of the hydroxyl group in the carboxyl group has the index 2
oxyg_charge_with_btype = part_charges_with_btype[2]
oxyg_charge_without_btype = part_charges_without_btype[2]
assert oxyg_charge_with_btype < oxyg_charge_without_btype
# Assert that difference between the two values is significant
- difference_oxyg_charges = abs(oxyg_charge_with_btype
- - oxyg_charge_without_btype)
- assert difference_oxyg_charges > 3e-2
\ No newline at end of file
+ difference_oxyg_charges = abs(oxyg_charge_with_btype - oxyg_charge_without_btype)
+ assert difference_oxyg_charges > 3e-2
diff --git a/tests/structure/test_compare.py b/tests/structure/test_compare.py
index a4cf024a1..1895c8ba4 100644
--- a/tests/structure/test_compare.py
+++ b/tests/structure/test_compare.py
@@ -2,17 +2,18 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-import biotite.structure as struc
-import biotite.structure.io as strucio
from os.path import join
import numpy as np
import pytest
-from ..util import data_dir
+import biotite.structure as struc
+import biotite.structure.io as strucio
+from tests.util import data_dir
+
@pytest.fixture
def stack():
stack = struc.AtomArrayStack(depth=3, length=5)
- stack.coord = np.arange(45).reshape((3,5,3))
+ stack.coord = np.arange(45).reshape((3, 5, 3))
return stack
@@ -20,92 +21,178 @@ def stack():
def test_rmsd(stack, as_coord):
if as_coord:
stack = stack.coord
- assert struc.rmsd(stack[0], stack).tolist() \
- == pytest.approx([0.0, 25.98076211, 51.96152423])
- assert struc.rmsd(stack[0], stack[1]) \
- == pytest.approx(25.9807621135)
+ assert struc.rmsd(stack[0], stack).tolist() == pytest.approx(
+ [0.0, 25.98076211, 51.96152423]
+ )
+ assert struc.rmsd(stack[0], stack[1]) == pytest.approx(25.9807621135)
@pytest.mark.parametrize("as_coord", [False, True])
def test_rmsf(stack, as_coord):
if as_coord:
stack = stack.coord
- assert struc.rmsf(struc.average(stack), stack).tolist() \
- == pytest.approx([21.21320344] * 5)
+ assert struc.rmsf(struc.average(stack), stack).tolist() == pytest.approx(
+ [21.21320344] * 5
+ )
+
@pytest.fixture
def load_stack_superimpose():
- stack = strucio.load_structure(join(
- data_dir("structure"), "1l2y.bcif"
- ))
+ stack = strucio.load_structure(join(data_dir("structure"), "1l2y.bcif"))
# Superimpose with first frame
bb_mask = struc.filter_peptide_backbone(stack[0])
supimp, _ = struc.superimpose(stack[0], stack, atom_mask=bb_mask)
return stack, supimp
+
def test_rmsd_gmx(load_stack_superimpose):
"""
Comparison of RMSD values computed with Biotite with results
obtained from GROMACS 2021.5.
"""
stack, supimp = load_stack_superimpose
- rmsd = struc.rmsd(stack[0], supimp)/10
+ rmsd = struc.rmsd(stack[0], supimp) / 10
# Gromacs RMSDs -> Without mass-weighting:
# echo "Backbone Protein" | \
# gmx rms -s 1l2y.gro -f 1l2y.xtc -o rmsd.xvg -mw no
- rmsd_gmx = np.array([
- 0.0005037, 0.1957698, 0.2119313, 0.2226127, 0.184382,
- 0.2210998, 0.2712815, 0.1372861, 0.2348654, 0.1848784,
- 0.1893576, 0.2500543, 0.1946374, 0.2101624, 0.2180645,
- 0.1836762, 0.1681345, 0.2363865, 0.2287371, 0.2546207,
- 0.1604872, 0.2167119, 0.2176063, 0.2069806, 0.2535706,
- 0.2682233, 0.2252388, 0.2419151, 0.2343987, 0.1902994,
- 0.2334525, 0.2010523, 0.215444, 0.1786632, 0.2652018,
- 0.174061, 0.2591569, 0.2602662
- ])
+ rmsd_gmx = np.array(
+ [
+ 0.0005037,
+ 0.1957698,
+ 0.2119313,
+ 0.2226127,
+ 0.184382,
+ 0.2210998,
+ 0.2712815,
+ 0.1372861,
+ 0.2348654,
+ 0.1848784,
+ 0.1893576,
+ 0.2500543,
+ 0.1946374,
+ 0.2101624,
+ 0.2180645,
+ 0.1836762,
+ 0.1681345,
+ 0.2363865,
+ 0.2287371,
+ 0.2546207,
+ 0.1604872,
+ 0.2167119,
+ 0.2176063,
+ 0.2069806,
+ 0.2535706,
+ 0.2682233,
+ 0.2252388,
+ 0.2419151,
+ 0.2343987,
+ 0.1902994,
+ 0.2334525,
+ 0.2010523,
+ 0.215444,
+ 0.1786632,
+ 0.2652018,
+ 0.174061,
+ 0.2591569,
+ 0.2602662,
+ ]
+ )
assert np.allclose(rmsd, rmsd_gmx, atol=1e-03)
+
def test_rmspd_gmx(load_stack_superimpose):
"""
Comparison of the RMSPD computed with Biotite with results
obtained from GROMACS 2021.5.
"""
stack, _ = load_stack_superimpose
- rmspd = struc.rmspd(stack[0], stack)/10
+ rmspd = struc.rmspd(stack[0], stack) / 10
# Gromacs RMSDist:
# echo "Protein" | \
# gmx rmsdist -f 1l2y.xtc -s 1l2y.gro -o rmsdist.xvg -sumh no -pbc no
- rmspd_gmx = np.array([
- 0.000401147, 0.125482, 0.138913, 0.138847, 0.113917,
- 0.132915, 0.173084, 0.103089, 0.156309, 0.114694,
- 0.12964, 0.15875, 0.12876, 0.128983, 0.137031,
- 0.126059, 0.106726, 0.154244, 0.144405, 0.174041,
- 0.10417, 0.130936, 0.141216, 0.125559, 0.171342,
- 0.165306, 0.137616, 0.154447, 0.146337, 0.116433,
- 0.154976, 0.128477, 0.150537, 0.111494, 0.173234,
- 0.116638, 0.169524, 0.15953
- ])
+ rmspd_gmx = np.array(
+ [
+ 0.000401147,
+ 0.125482,
+ 0.138913,
+ 0.138847,
+ 0.113917,
+ 0.132915,
+ 0.173084,
+ 0.103089,
+ 0.156309,
+ 0.114694,
+ 0.12964,
+ 0.15875,
+ 0.12876,
+ 0.128983,
+ 0.137031,
+ 0.126059,
+ 0.106726,
+ 0.154244,
+ 0.144405,
+ 0.174041,
+ 0.10417,
+ 0.130936,
+ 0.141216,
+ 0.125559,
+ 0.171342,
+ 0.165306,
+ 0.137616,
+ 0.154447,
+ 0.146337,
+ 0.116433,
+ 0.154976,
+ 0.128477,
+ 0.150537,
+ 0.111494,
+ 0.173234,
+ 0.116638,
+ 0.169524,
+ 0.15953,
+ ]
+ )
assert np.allclose(rmspd, rmspd_gmx, atol=1e-03)
+
def test_rmsf_gmx(load_stack_superimpose):
"""
Comparison of RMSF values computed with Biotite with results
obtained from GROMACS 2021.5.
"""
stack, supimp = load_stack_superimpose
- ca_mask = ((stack[0].atom_name == "CA") & (stack[0].element == "C"))
- rmsf = struc.rmsf(struc.average(supimp[:, ca_mask]), supimp[:, ca_mask])/10
+ ca_mask = (stack[0].atom_name == "CA") & (stack[0].element == "C")
+ rmsf = struc.rmsf(struc.average(supimp[:, ca_mask]), supimp[:, ca_mask]) / 10
# Gromacs RMSF:
# echo "C-alpha" | gmx rmsf -s 1l2y.gro -f 1l2y.xtc -o rmsf.xvg -res
- rmsf_gmx = np.array([
- 0.1379, 0.036, 0.0261, 0.0255, 0.029, 0.0204, 0.0199,
- 0.0317, 0.0365, 0.0249, 0.0269, 0.032, 0.0356, 0.0446,
- 0.059, 0.037, 0.0331, 0.0392, 0.0403, 0.0954
- ])
-
- assert np.allclose(rmsf, rmsf_gmx, atol=1e-02)
\ No newline at end of file
+ rmsf_gmx = np.array(
+ [
+ 0.1379,
+ 0.036,
+ 0.0261,
+ 0.0255,
+ 0.029,
+ 0.0204,
+ 0.0199,
+ 0.0317,
+ 0.0365,
+ 0.0249,
+ 0.0269,
+ 0.032,
+ 0.0356,
+ 0.0446,
+ 0.059,
+ 0.037,
+ 0.0331,
+ 0.0392,
+ 0.0403,
+ 0.0954,
+ ]
+ )
+
+ assert np.allclose(rmsf, rmsf_gmx, atol=1e-02)
diff --git a/tests/structure/test_density.py b/tests/structure/test_density.py
index bfbb3e1e4..012b5eb02 100644
--- a/tests/structure/test_density.py
+++ b/tests/structure/test_density.py
@@ -2,10 +2,11 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-import biotite.structure as struc
-from biotite.structure import Atom
import numpy as np
import pytest
+import biotite.structure as struc
+from biotite.structure import Atom
+
@pytest.fixture
def array():
@@ -18,52 +19,56 @@ def array():
atom_list.append(Atom([2.5, 0.5, 1.1]))
return struc.array(atom_list)
+
@pytest.fixture
def stack(array):
return struc.stack([array, array.copy()])
-
def test_density(array, stack):
density, (x, y, z) = struc.density(array)
assert np.array_equal(x, [0.5, 1.5, 2.5])
assert np.array_equal(y, [0.5, 1.5, 2.5, 3.5])
assert np.array_equal(z, [1.0, 2.0])
assert density.sum() == 6
- assert density[0,2] == 2
- assert density[1,0] == 3
- assert density[1,1] == 1
+ assert density[0, 2] == 2
+ assert density[1, 0] == 3
+ assert density[1, 1] == 1
density, (x, y, z) = struc.density(stack)
assert np.array_equal(x, [0.5, 1.5, 2.5])
assert np.array_equal(y, [0.5, 1.5, 2.5, 3.5])
assert np.array_equal(z, [1.0, 2.0])
assert density.sum() == 12
- assert density[0,2] == 4
- assert density[1,0] == 6
- assert density[1,1] == 2
+ assert density[0, 2] == 4
+ assert density[1, 0] == 6
+ assert density[1, 1] == 2
+
def test_density_with_bins(array):
- bins = np.array([[0, 1, 2, 3],[0, 1, 2, 3],[0, 1, 2, 3]])
+ bins = np.array([[0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3]])
density, (x, y, z) = struc.density(array, bins=bins)
- assert np.array_equal(x, [0,1,2,3])
- assert np.array_equal(y, [0,1,2,3])
- assert np.array_equal(z, [0,1,2,3])
+ assert np.array_equal(x, [0, 1, 2, 3])
+ assert np.array_equal(y, [0, 1, 2, 3])
+ assert np.array_equal(z, [0, 1, 2, 3])
assert density.sum() == 6
- assert density[0,2,1] == 2
- assert density[1,1,1] == 1
- assert density[2,0,1] == 3
+ assert density[0, 2, 1] == 2
+ assert density[1, 1, 1] == 1
+ assert density[2, 0, 1] == 3
+
def test_density_with_delta(array):
density, (x, y, z) = struc.density(array, delta=5.0)
assert density.shape == (1, 1, 1)
assert density.sum() == 6
- assert density[0,0,0] == 6
+ assert density[0, 0, 0] == 6
+
def test_density_normalized(array):
density, (x, y, z) = struc.density(array, density=True)
assert np.abs(density.sum() - 1.0) < 0.0001
- assert np.abs(density[0,2] - 2.0/6.0) < 0.0001
+ assert np.abs(density[0, 2] - 2.0 / 6.0) < 0.0001
+
def test_density_weights(array, stack):
# assign weights to coordinates
@@ -74,15 +79,15 @@ def test_density_weights(array, stack):
assert density.sum() == atomic_weights.sum()
assert density[0, 2] == atomic_weights[0] + atomic_weights[1]
assert density[1, 0] == atomic_weights[3:].sum()
- assert density[1,1] == atomic_weights[2]
+ assert density[1, 1] == atomic_weights[2]
# weights should be repeated along stack dimensions and lead to the same
# result independent of shape
density, (x, y, z) = struc.density(stack, weights=atomic_weights)
- density2, (x, y, z) = struc.density(stack,
- weights=np.array([atomic_weights, atomic_weights]))
+ density2, (x, y, z) = struc.density(
+ stack, weights=np.array([atomic_weights, atomic_weights])
+ )
assert density.sum() == density2.sum()
- assert density[0,2] == density2[0,2]
- assert density[1,0] == density2[1,0]
- assert density[1,1] == density2[1,1]
-
+ assert density[0, 2] == density2[0, 2]
+ assert density[1, 0] == density2[1, 0]
+ assert density[1, 1] == density2[1, 1]
diff --git a/tests/structure/test_dotbracket.py b/tests/structure/test_dotbracket.py
index 4e6827cd7..cbe5ef2e6 100644
--- a/tests/structure/test_dotbracket.py
+++ b/tests/structure/test_dotbracket.py
@@ -2,12 +2,12 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-import pytest
+from os.path import join
import numpy as np
+import pytest
import biotite.structure as struc
import biotite.structure.io as strucio
-from os.path import join
-from ..util import data_dir
+from tests.util import data_dir
@pytest.fixture
@@ -15,11 +15,10 @@ def nuc_sample_array():
"""
Sample structure.
"""
- nuc_sample_array = strucio.load_structure(
- join(data_dir("structure"), "4p5j.cif")
- )
+ nuc_sample_array = strucio.load_structure(join(data_dir("structure"), "4p5j.cif"))
return nuc_sample_array[struc.filter_nucleotides(nuc_sample_array)]
+
@pytest.fixture
def expected_output():
"""
@@ -29,47 +28,51 @@ def expected_output():
".[(((((.[{...)))))(((((((.......)))))))...(((((]}.)..))))[[[...(((((("
"]]].].))))))(.)",
".[(((((.{[...)))))(((((((.......)))))))...(((((}].)..))))[[[...(((((("
- "]]].].))))))(.)"
+ "]]].].))))))(.)",
]
+
@pytest.fixture
def basepair_residue_positions():
"""
The base pairs in the sample array by their residue postions.
"""
return np.array(
- [[1, 73],
- [2, 17],
- [3, 16],
- [4, 15],
- [5, 14],
- [6, 13],
- [8, 47],
- [9, 48],
- [18, 38],
- [19, 37],
- [20, 36],
- [21, 35],
- [22, 34],
- [23, 33],
- [24, 32],
- [42, 56],
- [43, 55],
- [44, 54],
- [45, 53],
- [46, 50],
- [57, 71],
- [58, 70],
- [59, 69],
- [63, 80],
- [64, 79],
- [65, 78],
- [66, 77],
- [67, 76],
- [68, 75],
- [81, 83]]
+ [
+ [1, 73],
+ [2, 17],
+ [3, 16],
+ [4, 15],
+ [5, 14],
+ [6, 13],
+ [8, 47],
+ [9, 48],
+ [18, 38],
+ [19, 37],
+ [20, 36],
+ [21, 35],
+ [22, 34],
+ [23, 33],
+ [24, 32],
+ [42, 56],
+ [43, 55],
+ [44, 54],
+ [45, 53],
+ [46, 50],
+ [57, 71],
+ [58, 70],
+ [59, 69],
+ [63, 80],
+ [64, 79],
+ [65, 78],
+ [66, 77],
+ [67, 76],
+ [68, 75],
+ [81, 83],
+ ]
)
+
def verify_dot_bracket_notation(output, expected_output):
"""
Ensure that the dot_bracket notation matches a reference.
@@ -82,6 +85,7 @@ def verify_dot_bracket_notation(output, expected_output):
unique_solutions = set(output)
assert len(output) == len(unique_solutions)
+
def test_dot_bracket_from_structure(nuc_sample_array, expected_output):
"""
Check the output of ``dot_bracket_from_structure()``.
@@ -89,22 +93,20 @@ def test_dot_bracket_from_structure(nuc_sample_array, expected_output):
output = struc.dot_bracket_from_structure(nuc_sample_array)
verify_dot_bracket_notation(output, expected_output)
+
def test_dot_bracket(basepair_residue_positions, expected_output):
"""
Check the output of ``dot_bracket()``.
"""
- output = struc.dot_bracket(
- basepair_residue_positions, len(expected_output[0])
- )
+ output = struc.dot_bracket(basepair_residue_positions, len(expected_output[0]))
verify_dot_bracket_notation(output, expected_output)
-def test_base_pairs_from_dot_bracket(
- basepair_residue_positions, expected_output
-):
+
+def test_base_pairs_from_dot_bracket(basepair_residue_positions, expected_output):
"""
Ensure that the base pairs are correctly extracted from the
DBL-notation
"""
for notation in expected_output:
test_residue_positions = struc.base_pairs_from_dot_bracket(notation)
- assert np.all(test_residue_positions == basepair_residue_positions)
\ No newline at end of file
+ assert np.all(test_residue_positions == basepair_residue_positions)
diff --git a/tests/structure/test_filter.py b/tests/structure/test_filter.py
index 996593831..5bac8099c 100644
--- a/tests/structure/test_filter.py
+++ b/tests/structure/test_filter.py
@@ -2,110 +2,125 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-import biotite.structure as struc
-import biotite.structure.io as strucio
-import numpy as np
from os.path import join
-from ..util import data_dir
+import numpy as np
import pytest
+import biotite.structure as struc
+import biotite.structure.io as strucio
+from tests.util import data_dir
+
@pytest.fixture
def canonical_sample_protein():
- return strucio.load_structure(
- join(data_dir("structure"), "3o5r.bcif")
- )
+ return strucio.load_structure(join(data_dir("structure"), "3o5r.bcif"))
+
@pytest.fixture
def sample_protein():
- return strucio.load_structure(
- join(data_dir("structure"), "5eil.bcif")
- )
+ return strucio.load_structure(join(data_dir("structure"), "5eil.bcif"))
+
@pytest.fixture
def canonical_sample_nucleotide():
- return strucio.load_structure(
- join(data_dir("structure"), "5ugo.bcif")
- )
+ return strucio.load_structure(join(data_dir("structure"), "5ugo.bcif"))
+
@pytest.fixture
def sample_nucleotide():
- return strucio.load_structure(
- join(data_dir("structure"), "4p5j.bcif")
- )
+ return strucio.load_structure(join(data_dir("structure"), "4p5j.bcif"))
+
@pytest.fixture
def sample_carbohydrate():
- return strucio.load_structure(
- join(data_dir("structure"), "2d0f.bcif")
- )
+ return strucio.load_structure(join(data_dir("structure"), "2d0f.bcif"))
+
@pytest.fixture
def all_atloc_structure():
return strucio.load_structure(
join(data_dir("structure"), "1o1z.bcif"),
- extra_fields = ["occupancy"],
- altloc="all"
+ extra_fields=["occupancy"],
+ altloc="all",
)
+
def test_solvent_filter(canonical_sample_protein):
- assert len(canonical_sample_protein[struc.filter_solvent(canonical_sample_protein)]) == 287
+ assert (
+ len(canonical_sample_protein[struc.filter_solvent(canonical_sample_protein)])
+ == 287
+ )
+
def test_canonical_amino_acid_filter(canonical_sample_protein):
assert (
- len(canonical_sample_protein[
- struc.filter_canonical_amino_acids(canonical_sample_protein)
- ]) == 982
+ len(
+ canonical_sample_protein[
+ struc.filter_canonical_amino_acids(canonical_sample_protein)
+ ]
+ )
+ == 982
)
+
def test_amino_acid_filter(sample_protein):
assert (
- struc.get_residue_count((sample_protein[
- struc.filter_amino_acids(sample_protein)
- ])) ==
- struc.get_residue_count((sample_protein[
- struc.filter_canonical_amino_acids(sample_protein)
- ])) + 3
+ struc.get_residue_count(
+ (sample_protein[struc.filter_amino_acids(sample_protein)])
+ )
+ == struc.get_residue_count(
+ (sample_protein[struc.filter_canonical_amino_acids(sample_protein)])
+ )
+ + 3
)
+
def test_canonical_nucleotide_filter(canonical_sample_nucleotide):
assert (
- len(canonical_sample_nucleotide[
- struc.filter_canonical_nucleotides(canonical_sample_nucleotide)
- ]) == 651
+ len(
+ canonical_sample_nucleotide[
+ struc.filter_canonical_nucleotides(canonical_sample_nucleotide)
+ ]
+ )
+ == 651
)
+
def test_nucleotide_filter(sample_nucleotide):
assert (
- struc.get_residue_count((sample_nucleotide[
- struc.filter_nucleotides(sample_nucleotide)
- ])) ==
- struc.get_residue_count((sample_nucleotide[
- struc.filter_canonical_nucleotides(sample_nucleotide)
- ])) + 1
+ struc.get_residue_count(
+ (sample_nucleotide[struc.filter_nucleotides(sample_nucleotide)])
+ )
+ == struc.get_residue_count(
+ (sample_nucleotide[struc.filter_canonical_nucleotides(sample_nucleotide)])
+ )
+ + 1
)
+
def test_carbohydrate_filter(sample_carbohydrate):
assert (
- struc.get_residue_count((sample_carbohydrate[
- struc.filter_carbohydrates(sample_carbohydrate)
- ])) == 8
+ struc.get_residue_count(
+ (sample_carbohydrate[struc.filter_carbohydrates(sample_carbohydrate)])
+ )
+ == 8
)
def test_peptide_backbone_filter(canonical_sample_protein):
assert (
- len(canonical_sample_protein[
- struc.filter_peptide_backbone(canonical_sample_protein)
- ]) == 384
+ len(
+ canonical_sample_protein[
+ struc.filter_peptide_backbone(canonical_sample_protein)
+ ]
+ )
+ == 384
)
def test_phosphate_backbone_filter(canonical_sample_nucleotide):
# take a chain D with five canonical nucleotides
# => there should be 5 x 6 = 30 backbone atoms
- chain_d = canonical_sample_nucleotide[
- canonical_sample_nucleotide.chain_id == 'D'
- ]
+ chain_d = canonical_sample_nucleotide[canonical_sample_nucleotide.chain_id == "D"]
assert len(chain_d[struc.filter_phosphate_backbone(chain_d)]) == 30
@@ -139,39 +154,45 @@ def test_polymer_filter(canonical_sample_nucleotide, sample_carbohydrate):
a = canonical_sample_nucleotide
# Check for nucleotide filtering
- a_nuc = a[struc.filter_polymer(a, pol_type='n')]
+ a_nuc = a[struc.filter_polymer(a, pol_type="n")]
# Take three nucleic acids chains and remove solvent => the result should
# encompass all nucleotide polymer atoms, which is exactly the output of the
# `filter_polymer()`. In the structure file, the filtered atoms are 1-651.
- a_nuc_manual = a[np.isin(a.chain_id, ['D', 'P', 'T']) & ~struc.filter_solvent(a)]
+ a_nuc_manual = a[np.isin(a.chain_id, ["D", "P", "T"]) & ~struc.filter_solvent(a)]
assert len(a_nuc) == len(a_nuc_manual) == 651
- assert set(a_nuc.chain_id) == {'D', 'P', 'T'}
+ assert set(a_nuc.chain_id) == {"D", "P", "T"}
# chain D should be absent
- a_nuc = a_nuc[struc.filter_polymer(a_nuc, min_size=6, pol_type='n')]
- assert set(a_nuc.chain_id) == {'P', 'T'}
+ a_nuc = a_nuc[struc.filter_polymer(a_nuc, min_size=6, pol_type="n")]
+ assert set(a_nuc.chain_id) == {"P", "T"}
# Single protein chain A: residues 10-335
- a_pep = a[struc.filter_polymer(a, pol_type='p')]
- assert len(a_pep) == len(a[(a.res_id >= 10) & (a.res_id <= 335) & (a.chain_id == 'A')])
+ a_pep = a[struc.filter_polymer(a, pol_type="p")]
+ assert len(a_pep) == len(
+ a[(a.res_id >= 10) & (a.res_id <= 335) & (a.chain_id == "A")]
+ )
# Chain B has five carbohydrate residues
# Chain C has four
# => Only chain B is selected
a = sample_carbohydrate
- a_carb = a[struc.filter_polymer(a, min_size=4, pol_type='carb')]
- assert set(a_carb.chain_id) == {'B'}
+ a_carb = a[struc.filter_polymer(a, min_size=4, pol_type="carb")]
+ assert set(a_carb.chain_id) == {"B"}
assert struc.get_residue_count(a_carb) == 5
def test_intersection_filter(canonical_sample_protein):
assert (
- len(canonical_sample_protein[:200][
- struc.filter_intersection(
- canonical_sample_protein[:200],canonical_sample_protein[100:]
- )
- ]) == 100
+ len(
+ canonical_sample_protein[:200][
+ struc.filter_intersection(
+ canonical_sample_protein[:200], canonical_sample_protein[100:]
+ )
+ ]
+ )
+ == 100
)
+
@pytest.mark.parametrize("filter_func", ["first", "occupancy"])
def test_filter_altloc(all_atloc_structure, filter_func):
"""
@@ -183,21 +204,22 @@ def test_filter_altloc(all_atloc_structure, filter_func):
all_atloc_structure.chain_id,
all_atloc_structure.res_id,
all_atloc_structure.ins_code,
- all_atloc_structure.atom_name
+ all_atloc_structure.atom_name,
):
ref_atom_set.add(atom_tuple)
if filter_func == "first":
- filtered_structure = all_atloc_structure[struc.filter_first_altloc(
- all_atloc_structure,
- all_atloc_structure.altloc_id
- )]
+ filtered_structure = all_atloc_structure[
+ struc.filter_first_altloc(
+ all_atloc_structure, all_atloc_structure.altloc_id
+ )
+ ]
elif filter_func == "occupancy":
filtered_structure = all_atloc_structure[
struc.filter_highest_occupancy_altloc(
all_atloc_structure,
all_atloc_structure.altloc_id,
- all_atloc_structure.occupancy
+ all_atloc_structure.occupancy,
)
]
@@ -206,7 +228,7 @@ def test_filter_altloc(all_atloc_structure, filter_func):
filtered_structure.chain_id,
filtered_structure.res_id,
filtered_structure.ins_code,
- filtered_structure.atom_name
+ filtered_structure.atom_name,
):
try:
# No atom should be present twice
@@ -230,10 +252,9 @@ def test_filter_highest_occupancy_altloc(all_atloc_structure):
all_atloc_structure.occupancy[all_atloc_structure.altloc_id == "B"] = 1.0
# filter_first_altloc
- filtered_structure = all_atloc_structure[struc.filter_first_altloc(
- all_atloc_structure,
- all_atloc_structure.altloc_id
- )]
+ filtered_structure = all_atloc_structure[
+ struc.filter_first_altloc(all_atloc_structure, all_atloc_structure.altloc_id)
+ ]
ref_occupancy_sum = np.average(filtered_structure.occupancy)
# filter_highest_occupancy_altloc
@@ -241,9 +262,9 @@ def test_filter_highest_occupancy_altloc(all_atloc_structure):
struc.filter_highest_occupancy_altloc(
all_atloc_structure,
all_atloc_structure.altloc_id,
- all_atloc_structure.occupancy
+ all_atloc_structure.occupancy,
)
]
test_occupancy_sum = np.average(filtered_structure.occupancy)
- assert test_occupancy_sum > ref_occupancy_sum
\ No newline at end of file
+ assert test_occupancy_sum > ref_occupancy_sum
diff --git a/tests/structure/test_generalio.py b/tests/structure/test_generalio.py
index 1dbeb73a6..9593af1d2 100644
--- a/tests/structure/test_generalio.py
+++ b/tests/structure/test_generalio.py
@@ -2,23 +2,18 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-from tempfile import NamedTemporaryFile
-import biotite.structure as struc
-import biotite.structure.io as strucio
import glob
import os
from os.path import join, splitext
-from ..util import data_dir, cannot_import
+from tempfile import NamedTemporaryFile
import pytest
+import biotite.structure as struc
+import biotite.structure.io as strucio
+from tests.util import cannot_import, data_dir
-@pytest.mark.skipif(
- cannot_import("mdtraj"),
- reason="MDTraj is not installed"
-)
-@pytest.mark.parametrize(
- "path", glob.glob(join(data_dir("structure"), "1l2y.*"))
-)
+@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed")
+@pytest.mark.parametrize("path", glob.glob(join(data_dir("structure"), "1l2y.*")))
def test_loading(path):
"""
Just check if :func:`load_structure()` does not raise an exception
@@ -26,9 +21,7 @@ def test_loading(path):
"""
suffix = splitext(path)[1]
if suffix in [".trr", ".xtc", ".tng", ".dcd", ".netcdf"]:
- template = strucio.load_structure(
- join(data_dir("structure"), "1l2y.bcif")
- )
+ template = strucio.load_structure(join(data_dir("structure"), "1l2y.bcif"))
array = strucio.load_structure(path, template)
else:
array = strucio.load_structure(path)
@@ -40,10 +33,7 @@ def test_loading(path):
assert isinstance(array, struc.AtomArrayStack)
-@pytest.mark.skipif(
- cannot_import("mdtraj"),
- reason="MDTraj is not installed"
-)
+@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed")
def test_loading_template_with_trj():
"""
Check if :func:`load_structure()` using a trajectory file does not
@@ -57,10 +47,7 @@ def test_loading_template_with_trj():
assert len(stack) > 1
-@pytest.mark.skipif(
- cannot_import("mdtraj"),
- reason="MDTraj is not installed"
-)
+@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed")
def test_loading_with_extra_args():
"""
Check if :func:`load_structure()` witt optional arguments does not
@@ -74,9 +61,7 @@ def test_loading_with_extra_args():
assert "b_factor" in structure.get_annotation_categories()
# test if arguments are passed to read for trajectories
- stack = strucio.load_structure(
- trajectory, template=structure[0], start=5, stop=6
- )
+ stack = strucio.load_structure(trajectory, template=structure[0], start=5, stop=6)
assert len(stack) == 1
# loading should fail with wrong arguments
@@ -88,16 +73,10 @@ def test_loading_with_extra_args():
assert stack.shape[1] == 2
-@pytest.mark.skipif(
- cannot_import("mdtraj"),
- reason="MDTraj is not installed"
-)
+@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed")
@pytest.mark.parametrize(
"suffix",
- [
- "pdb", "pdbx", "cif", "bcif", "gro", "trr", "xtc", "tng",
- "dcd", "netcdf"
- ]
+ ["pdb", "pdbx", "cif", "bcif", "gro", "trr", "xtc", "tng", "dcd", "netcdf"],
)
def test_saving(suffix):
"""
@@ -124,23 +103,19 @@ def test_saving(suffix):
if category == "chain_id" and suffix == "gro":
# The chain ID is not written to GRO files
continue
- assert test_array.get_annotation(category).tolist() \
- == ref_array.get_annotation(category).tolist()
+ assert (
+ test_array.get_annotation(category).tolist()
+ == ref_array.get_annotation(category).tolist()
+ )
assert test_array.coord.flatten().tolist() == pytest.approx(
- ref_array.coord.flatten().tolist(), abs=1e-2
+ ref_array.coord.flatten().tolist(), abs=1e-2
)
-@pytest.mark.skipif(
- cannot_import("mdtraj"),
- reason="MDTraj is not installed"
-)
+@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed")
@pytest.mark.parametrize(
"suffix",
- [
- "pdb", "pdbx", "cif", "bcif", "gro", "trr", "xtc", "tng",
- "dcd", "netcdf"
- ]
+ ["pdb", "pdbx", "cif", "bcif", "gro", "trr", "xtc", "tng", "dcd", "netcdf"],
)
def test_saving_with_extra_args(suffix):
"""
@@ -150,9 +125,7 @@ def test_saving_with_extra_args(suffix):
array = strucio.load_structure(join(data_dir("structure"), "1l2y.bcif"))
temp = NamedTemporaryFile("w+", suffix=f".{suffix}")
with pytest.raises(TypeError):
- strucio.save_structure(
- temp.name, array, answer=42
- )
+ strucio.save_structure(temp.name, array, answer=42)
temp.close()
diff --git a/tests/structure/test_geometry.py b/tests/structure/test_geometry.py
index 3239d43b4..d5ab03fdc 100644
--- a/tests/structure/test_geometry.py
+++ b/tests/structure/test_geometry.py
@@ -2,44 +2,43 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-from tempfile import NamedTemporaryFile
-import itertools
import glob
+import itertools
from os.path import join
+from tempfile import NamedTemporaryFile
import numpy as np
import numpy.random as random
import pytest
import biotite.structure as struc
import biotite.structure.io as strucio
import biotite.structure.io.pdbx as pdbx
-from ..util import data_dir, cannot_import
+from tests.util import cannot_import, data_dir
def test_distance():
- coord1 = struc.coord([0,1,1])
- coord2 = struc.coord([0,2,2])
+ coord1 = struc.coord([0, 1, 1])
+ coord2 = struc.coord([0, 2, 2])
assert struc.distance(coord1, coord2) == pytest.approx(np.sqrt(2))
def test_centroid():
- coord = struc.coord([[1,1,1],[0,-1,-1],[-1,0,0]])
- assert struc.centroid(coord).tolist() == [0,0,0]
+ coord = struc.coord([[1, 1, 1], [0, -1, -1], [-1, 0, 0]])
+ assert struc.centroid(coord).tolist() == [0, 0, 0]
def test_angle():
- coord1 = struc.coord([0,0,1])
- coord2 = struc.coord([0,0,0])
- coord3 = struc.coord([0,1,1])
- assert struc.angle(coord1, coord2, coord3) == pytest.approx(0.25*np.pi)
+ coord1 = struc.coord([0, 0, 1])
+ coord2 = struc.coord([0, 0, 0])
+ coord3 = struc.coord([0, 1, 1])
+ assert struc.angle(coord1, coord2, coord3) == pytest.approx(0.25 * np.pi)
def test_dihedral():
- coord1 = struc.coord([-0.5,-1,0])
- coord2 = struc.coord([0,0,0])
- coord3 = struc.coord([1,0,0])
- coord4 = struc.coord([0,0,-1])
- assert struc.dihedral(coord1, coord2, coord3, coord4) \
- == pytest.approx(0.5*np.pi)
+ coord1 = struc.coord([-0.5, -1, 0])
+ coord2 = struc.coord([0, 0, 0])
+ coord3 = struc.coord([1, 0, 0])
+ coord4 = struc.coord([0, 0, -1])
+ assert struc.dihedral(coord1, coord2, coord3, coord4) == pytest.approx(0.5 * np.pi)
@pytest.mark.parametrize("multiple_chains", [False, True])
@@ -55,17 +54,18 @@ def test_dihedral_backbone_general(multiple_chains):
array = stack[0]
# Test array
phi, psi, omega = struc.dihedral_backbone(array)
- assert phi.shape == (n_res,)
- assert psi.shape == (n_res,)
+ assert phi.shape == (n_res,)
+ assert psi.shape == (n_res,)
assert omega.shape == (n_res,)
_assert_plausible_omega(omega)
# Test stack
phi, psi, omega = struc.dihedral_backbone(stack)
- assert phi.shape == (n_models, n_res)
- assert psi.shape == (n_models, n_res)
+ assert phi.shape == (n_models, n_res)
+ assert psi.shape == (n_models, n_res)
assert omega.shape == (n_models, n_res)
_assert_plausible_omega(omega)
+
def _assert_plausible_omega(omega):
# Remove nan values
omega = omega.flatten()
@@ -74,13 +74,8 @@ def _assert_plausible_omega(omega):
assert omega.tolist() == pytest.approx([np.pi] * len(omega), rel=0.6)
-@pytest.mark.skipif(
- cannot_import("mdtraj"),
- reason="MDTraj is not installed"
-)
-@pytest.mark.parametrize(
- "file_name", glob.glob(join(data_dir("structure"), "*.bcif"))
-)
+@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed")
+@pytest.mark.parametrize("file_name", glob.glob(join(data_dir("structure"), "*.bcif")))
def test_dihedral_backbone_result(file_name):
import mdtraj
@@ -113,12 +108,11 @@ def test_dihedral_backbone_result(file_name):
_, ref_ome = mdtraj.compute_omega(traj)
ref_phi, ref_psi, ref_ome = ref_phi[0], ref_psi[0], ref_ome[0]
- assert test_phi[1: ] == pytest.approx(ref_phi, abs=1e-5, rel=5e-3)
+ assert test_phi[1:] == pytest.approx(ref_phi, abs=1e-5, rel=5e-3)
assert test_psi[:-1] == pytest.approx(ref_psi, abs=1e-5, rel=5e-3)
assert test_ome[:-1] == pytest.approx(ref_ome, abs=1e-5, rel=5e-3)
-
def test_index_distance_non_periodic():
"""
Without PBC the result should be equal to the normal distance
@@ -126,26 +120,21 @@ def test_index_distance_non_periodic():
"""
array = strucio.load_structure(join(data_dir("structure"), "3o5r.bcif"))
ref_dist = struc.distance(
- array.coord[np.newaxis, :, :],
- array.coord[:, np.newaxis, :]
+ array.coord[np.newaxis, :, :], array.coord[:, np.newaxis, :]
).flatten()
length = array.array_length()
dist = struc.index_distance(
array,
- indices = np.stack([
- np.repeat(np.arange(length), length),
- np.tile(np.arange(length), length)
- ], axis=1)
+ indices=np.stack(
+ [np.repeat(np.arange(length), length), np.tile(np.arange(length), length)],
+ axis=1,
+ ),
)
assert np.allclose(dist, ref_dist)
@pytest.mark.parametrize(
- "shift", [
- np.array([10, 20, 30]),
- np.array([-8, 12, 28]),
- np.array([ 0, 99, 54])
- ]
+ "shift", [np.array([10, 20, 30]), np.array([-8, 12, 28]), np.array([0, 99, 54])]
)
def test_index_distance_periodic_orthogonal(shift):
"""
@@ -155,15 +144,13 @@ def test_index_distance_periodic_orthogonal(shift):
array = strucio.load_structure(join(data_dir("structure"), "3o5r.bcif"))
# Use a box based on the boundaries of the structure
# '+1' to add a margin
- array.box = np.diag(
- np.max(array.coord, axis=0) - np.min(array.coord, axis=0) + 1
- )
+ array.box = np.diag(np.max(array.coord, axis=0) - np.min(array.coord, axis=0) + 1)
length = array.array_length()
- dist_indices = np.stack([
- np.repeat(np.arange(length), length),
- np.tile(np.arange(length), length)
- ], axis=1)
+ dist_indices = np.stack(
+ [np.repeat(np.arange(length), length), np.tile(np.arange(length), length)],
+ axis=1,
+ )
ref_dist = struc.index_distance(array, dist_indices, periodic=True)
array.coord += shift
@@ -173,23 +160,13 @@ def test_index_distance_periodic_orthogonal(shift):
@pytest.mark.filterwarnings("ignore")
-@pytest.mark.skipif(
- cannot_import("mdtraj"),
- reason="MDTraj is not installed"
-)
+@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed")
@pytest.mark.parametrize(
- "shift, angles", itertools.product(
- [
- np.array([10, 20, 30]),
- np.array([-8, 12, 28]),
- np.array([ 0, 99, 54])
- ],
- [
- np.array([ 50, 90, 90]),
- np.array([ 90, 90, 120]),
- np.array([ 60, 60, 60])
- ]
- )
+ "shift, angles",
+ itertools.product(
+ [np.array([10, 20, 30]), np.array([-8, 12, 28]), np.array([0, 99, 54])],
+ [np.array([50, 90, 90]), np.array([90, 90, 120]), np.array([60, 60, 60])],
+ ),
)
def test_index_distance_periodic_triclinic(shift, angles):
"""
@@ -202,15 +179,14 @@ def test_index_distance_periodic_triclinic(shift, angles):
boundaries = np.max(array.coord, axis=0) - np.min(array.coord, axis=0) + 1
angles = np.deg2rad(angles)
array.box = struc.vectors_from_unitcell(
- boundaries[0], boundaries[1], boundaries[2],
- angles[0], angles[1], angles[2]
+ boundaries[0], boundaries[1], boundaries[2], angles[0], angles[1], angles[2]
)
length = array.array_length()
- dist_indices = np.stack([
- np.repeat(np.arange(length), length),
- np.tile(np.arange(length), length)
- ], axis=1)
+ dist_indices = np.stack(
+ [np.repeat(np.arange(length), length), np.tile(np.arange(length), length)],
+ axis=1,
+ )
# index_distance() creates a large ndarray
try:
ref_dist = struc.index_distance(array, dist_indices, periodic=True)
@@ -219,12 +195,12 @@ def test_index_distance_periodic_triclinic(shift, angles):
# Compare with MDTraj
import mdtraj
+
traj = mdtraj.load(join(data_dir("structure"), "3o5r.pdb"))
# Angstrom to Nanometers
traj.unitcell_vectors = array.box[np.newaxis, :, :] / 10
# Nanometers to Angstrom
mdtraj_dist = mdtraj.compute_distances(traj, dist_indices)[0] * 10
- ind = np.where(~np.isclose(ref_dist, mdtraj_dist, atol=2e-5, rtol=1e-3))[0]
assert np.allclose(ref_dist, mdtraj_dist, atol=2e-5, rtol=1e-3)
# Compare with shifted variant
@@ -249,38 +225,35 @@ def test_index_functions():
samples = (array, stack, struc.coord(array), struc.coord(stack))
# Generate random indices
random.seed(42)
- indices = random.randint(array.array_length(), size=(100,4), dtype=int)
+ indices = random.randint(array.array_length(), size=(100, 4), dtype=int)
for sample in samples:
if isinstance(sample, np.ndarray):
- atoms1 = sample[..., indices[:,0], :]
- atoms2 = sample[..., indices[:,1], :]
- atoms3 = sample[..., indices[:,2], :]
- atoms4 = sample[..., indices[:,3], :]
+ atoms1 = sample[..., indices[:, 0], :]
+ atoms2 = sample[..., indices[:, 1], :]
+ atoms3 = sample[..., indices[:, 2], :]
+ atoms4 = sample[..., indices[:, 3], :]
else:
- atoms1 = sample[..., indices[:,0]]
- atoms2 = sample[..., indices[:,1]]
- atoms3 = sample[..., indices[:,2]]
- atoms4 = sample[..., indices[:,3]]
+ atoms1 = sample[..., indices[:, 0]]
+ atoms2 = sample[..., indices[:, 1]]
+ atoms3 = sample[..., indices[:, 2]]
+ atoms4 = sample[..., indices[:, 3]]
assert np.allclose(
struc.displacement(atoms1, atoms2),
- struc.index_displacement(sample, indices[:,:2]),
- atol=1e-5
+ struc.index_displacement(sample, indices[:, :2]),
+ atol=1e-5,
)
assert np.allclose(
struc.distance(atoms1, atoms2),
- struc.index_distance(sample, indices[:,:2]),
- atol=1e-5
+ struc.index_distance(sample, indices[:, :2]),
+ atol=1e-5,
)
assert np.allclose(
struc.angle(atoms1, atoms2, atoms3),
- struc.index_angle(sample, indices[:,:3]),
- atol=1e-5
+ struc.index_angle(sample, indices[:, :3]),
+ atol=1e-5,
)
assert np.allclose(
struc.dihedral(atoms1, atoms2, atoms3, atoms4),
- struc.index_dihedral(sample, indices[:,:4]),
- atol=1e-5
+ struc.index_dihedral(sample, indices[:, :4]),
+ atol=1e-5,
)
-
-
-
diff --git a/tests/structure/test_gro.py b/tests/structure/test_gro.py
index 02faf6f68..806c65a0d 100644
--- a/tests/structure/test_gro.py
+++ b/tests/structure/test_gro.py
@@ -2,18 +2,18 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-from tempfile import TemporaryFile
import glob
import itertools
from os.path import join, splitext
+from tempfile import TemporaryFile
+import numpy as np
import pytest
from pytest import approx
-import numpy as np
import biotite
import biotite.structure.io.gro as gro
import biotite.structure.io.pdb as pdb
from biotite.structure import Atom, array
-from ..util import data_dir
+from tests.util import data_dir
def test_get_model_count():
@@ -25,10 +25,7 @@ def test_get_model_count():
@pytest.mark.parametrize(
"path, model",
- itertools.product(
- glob.glob(join(data_dir("structure"), "*.gro")),
- [None, 1, -1]
- )
+ itertools.product(glob.glob(join(data_dir("structure"), "*.gro")), [None, 1, -1]),
)
def test_array_conversion(path, model):
gro_file = gro.GROFile.read(path)
@@ -40,8 +37,10 @@ def test_array_conversion(path, model):
assert np.allclose(array1.box, array2.box)
assert array1.bonds == array2.bonds
for category in array1.get_annotation_categories():
- assert array1.get_annotation(category).tolist() == \
- array2.get_annotation(category).tolist()
+ assert (
+ array1.get_annotation(category).tolist()
+ == array2.get_annotation(category).tolist()
+ )
assert array1.coord.tolist() == array2.coord.tolist()
@@ -58,20 +57,17 @@ def test_pdb_consistency(path):
assert a1.array_length() == a2.array_length()
for category in ["res_id", "res_name", "atom_name"]:
- assert a1.get_annotation(category).tolist() == \
- a2.get_annotation(category).tolist()
+ assert (
+ a1.get_annotation(category).tolist() == a2.get_annotation(category).tolist()
+ )
# Mind rounding errors when converting pdb to gro (A -> nm)
- assert a1.coord.flatten().tolist() \
- == approx(a2.coord.flatten().tolist(), abs=1e-2)
+ assert a1.coord.flatten().tolist() == approx(a2.coord.flatten().tolist(), abs=1e-2)
@pytest.mark.parametrize(
"path, model",
- itertools.product(
- glob.glob(join(data_dir("structure"), "*.pdb")),
- [None, 1, -1]
- )
+ itertools.product(glob.glob(join(data_dir("structure"), "*.pdb")), [None, 1, -1]),
)
def test_pdb_to_gro(path, model):
"""
@@ -105,20 +101,24 @@ def test_pdb_to_gro(path, model):
assert a1.array_length() == a2.array_length()
for category in ["res_id", "res_name", "atom_name"]:
- assert a1.get_annotation(category).tolist() == \
- a2.get_annotation(category).tolist()
+ assert (
+ a1.get_annotation(category).tolist() == a2.get_annotation(category).tolist()
+ )
# Mind rounding errors when converting pdb to gro (A -> nm)
- assert a1.coord.flatten().tolist() \
- == approx(a2.coord.flatten().tolist(), abs=1e-2)
+ assert a1.coord.flatten().tolist() == approx(a2.coord.flatten().tolist(), abs=1e-2)
def test_gro_id_overflow():
# Create an oversized AtomArray where atom_id > 100000 and res_id > 10000
num_atoms = 100005
- atoms = array([Atom([1,2,3], atom_name="CA", element="C", res_name="X",
- res_id=i+1) for i in range(num_atoms)])
- atoms.box = np.array([[1,0,0], [0,1,0], [0,0,1]])
+ atoms = array(
+ [
+ Atom([1, 2, 3], atom_name="CA", element="C", res_name="X", res_id=i + 1)
+ for i in range(num_atoms)
+ ]
+ )
+ atoms.box = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
# Write .gro file
temp = TemporaryFile("w+")
@@ -143,7 +143,7 @@ def test_gro_no_box():
"""
# Create an AtomArray
- atom = Atom([1,2,3], atom_name="CA", element="C", res_name="X", res_id=1)
+ atom = Atom([1, 2, 3], atom_name="CA", element="C", res_name="X", res_id=1)
atoms = array([atom])
# Write .gro file
@@ -151,7 +151,7 @@ def test_gro_no_box():
gro_file = gro.GROFile()
gro_file.set_structure(atoms)
gro_file.write(temp)
-
+
# Read in file
temp.seek(0)
gro_file = gro.GROFile.read(temp)
@@ -159,4 +159,4 @@ def test_gro_no_box():
s = gro_file.get_structure()
# Assert no box with 0 dimension
- assert s.box is None
\ No newline at end of file
+ assert s.box is None
diff --git a/tests/structure/test_hbond.py b/tests/structure/test_hbond.py
index 64d4068f9..bd95eb826 100644
--- a/tests/structure/test_hbond.py
+++ b/tests/structure/test_hbond.py
@@ -3,20 +3,18 @@
# information.
import itertools
-from tempfile import NamedTemporaryFile
from os.path import join
+from tempfile import NamedTemporaryFile
import numpy as np
import pytest
import biotite.structure as struc
from biotite.structure.io import load_structure, save_structure
-from ..util import data_dir, cannot_import
+from tests.util import cannot_import, data_dir
@pytest.fixture()
def stack(request):
- stack = load_structure(
- join(data_dir("structure"), "1l2y.bcif")
- )
+ stack = load_structure(join(data_dir("structure"), "1l2y.bcif"))
if request.param:
# Use connect_via_distances, since 1l2y has invalidly bonded
# N-terminal hydrogen atoms
@@ -26,21 +24,15 @@ def stack(request):
# Ignore warning about dummy unit cell vector
@pytest.mark.filterwarnings("ignore")
-@pytest.mark.skipif(
- cannot_import("mdtraj"),
- reason="MDTraj is not installed"
-)
+@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed")
@pytest.mark.parametrize(
- "pdb_id, use_bond_list", itertools.product(
- ["1l2y", "1gya", "1igy"],
- [False, True]
- )
+ "pdb_id, use_bond_list", itertools.product(["1l2y", "1gya", "1igy"], [False, True])
)
def test_hbond_structure(pdb_id, use_bond_list):
"""
Compare hydrogen bond detection with MDTraj
"""
- file_name = join(data_dir("structure"), pdb_id+".bcif")
+ file_name = join(data_dir("structure"), pdb_id + ".bcif")
array = load_structure(file_name)
if use_bond_list:
@@ -58,9 +50,9 @@ def test_hbond_structure(pdb_id, use_bond_list):
if isinstance(array, struc.AtomArrayStack):
# For consistency with MDTraj 'S' cannot be acceptor element
# https://github.com/mdtraj/mdtraj/blob/master/mdtraj/geometry/hbond.py#L365
- triplets, mask = struc.hbond(array, acceptor_elements=("O","N"))
+ triplets, mask = struc.hbond(array, acceptor_elements=("O", "N"))
else:
- triplets = struc.hbond(array, acceptor_elements=("O","N"))
+ triplets = struc.hbond(array, acceptor_elements=("O", "N"))
# Save to new pdb file for consistent treatment of inscode/altloc
# im MDTraj
@@ -69,11 +61,10 @@ def test_hbond_structure(pdb_id, use_bond_list):
# Compare with MDTraj
import mdtraj
+
traj = mdtraj.load(temp.name)
temp.close()
- triplets_ref = mdtraj.baker_hubbard(
- traj, freq=0, periodic=False
- )
+ triplets_ref = mdtraj.baker_hubbard(traj, freq=0, periodic=False)
# Both packages may use different order
# -> use set for comparison
@@ -122,28 +113,27 @@ def test_hbond_with_selections(stack):
of this boundary should be found. Also, hbond should respect the
selection type.
"""
- selection1 = (stack.res_id == 3) & (stack.atom_name == 'O') # 3TYR BB Ox
+ selection1 = (stack.res_id == 3) & (stack.atom_name == "O") # 3TYR BB Ox
selection2 = stack.res_id == 7
# backbone hbond should be found if selection1/2 type is both
- triplets, mask = struc.hbond(stack, selection1, selection2,
- selection1_type="both")
+ triplets, mask = struc.hbond(stack, selection1, selection2, selection1_type="both")
assert len(triplets) == 1
assert triplets[0][0] == 116
assert triplets[0][2] == 38
# backbone hbond should be found if selection1 is acceptor and
# selection2 is donor
- triplets, mask = struc.hbond(stack, selection1, selection2,
- selection1_type="acceptor")
+ triplets, mask = struc.hbond(
+ stack, selection1, selection2, selection1_type="acceptor"
+ )
assert len(triplets) == 1
assert triplets[0][0] == 116
assert triplets[0][2] == 38
# no hbond should be found,
# because the backbone oxygen cannot be a donor
- triplets, mask = struc.hbond(stack, selection1, selection2,
- selection1_type="donor")
+ triplets, mask = struc.hbond(stack, selection1, selection2, selection1_type="donor")
assert len(triplets) == 0
@@ -164,18 +154,20 @@ def test_hbond_single_selection(stack):
def test_hbond_frequency():
- mask = np.array([
- [True, True, True, True, True], # 1.0
- [False, False, False, False, False], # 0.0
- [False, False, False, True, True] # 0.4
- ]).T
+ mask = np.array(
+ [
+ [True, True, True, True, True], # 1.0
+ [False, False, False, False, False], # 0.0
+ [False, False, False, True, True], # 0.4
+ ]
+ ).T
freq = struc.hbond_frequency(mask)
assert not np.isin(False, np.isclose(freq, np.array([1.0, 0.0, 0.4])))
# Ignore warning about missing BondList
@pytest.mark.filterwarnings("ignore")
-@pytest.mark.parametrize("translation_vector", [(10,20,30), (-5, 3, 18)])
+@pytest.mark.parametrize("translation_vector", [(10, 20, 30), (-5, 3, 18)])
def test_hbond_periodicity(translation_vector):
"""
Test whether hydrogen bond identification uses periodic boundary
@@ -197,4 +189,4 @@ def test_hbond_periodicity(translation_vector):
array.coord = struc.move_inside_box(array.coord, array.box)
hbonds = struc.hbond(array, periodic=True)
hbonds = set([tuple(triplet) for triplet in hbonds])
- assert ref_hbonds == hbonds
\ No newline at end of file
+ assert ref_hbonds == hbonds
diff --git a/tests/structure/test_info.py b/tests/structure/test_info.py
index f23c75030..2d823aaf1 100644
--- a/tests/structure/test_info.py
+++ b/tests/structure/test_info.py
@@ -9,7 +9,7 @@
import biotite.structure as struc
import biotite.structure.info as strucinfo
from biotite.structure.io import load_structure
-from ..util import data_dir
+from tests.util import data_dir
@pytest.mark.parametrize(
@@ -18,7 +18,7 @@
(strucinfo.amino_acid_names, ["ALA", "ARG", "ASN", "ASP"], ["HOH"]),
(strucinfo.nucleotide_names, ["A", "C", "G", "U"], ["HOH", "ALA"]),
(strucinfo.carbohydrate_names, ["GLC", "RIB"], ["HOH", "ALA"]),
- ]
+ ],
)
def test_group_names(function, included, excluded):
"""
@@ -49,16 +49,16 @@ def test_mass():
ref_masses = [strucinfo.mass(res) for res in struc.residue_iter(array)]
# Up to three additional/missing hydrogens are allowed
# (protonation state)
- mass_diff = np.abs(np.array(
- [mass - ref_mass for mass, ref_mass in zip(masses, ref_masses)]
- ))
+ mass_diff = np.abs(
+ np.array([mass - ref_mass for mass, ref_mass in zip(masses, ref_masses)])
+ )
assert (mass_diff // strucinfo.mass("H") <= 3).all()
# Check if the mass difference is a multiple of the hydrogen mass
multiple_of_h_masses = mass_diff / strucinfo.mass("H")
assert np.all(np.round(multiple_of_h_masses, decimals=2) % 1 == 0)
-def test_protOr_radii():
+def test_protor_radii():
"""
Assert that ProtOr VdW radii (except hydrogen) can be calculated for
all atoms in the given structure, since the structure (1GYA)
@@ -72,7 +72,7 @@ def test_protOr_radii():
for res_name, atom_name in zip(array.res_name, array.atom_name):
radius = strucinfo.vdw_radius_protor(res_name, atom_name)
assert isinstance(radius, float)
- assert radius != None
+ assert radius is not None
def test_protor_radii_invalid():
@@ -83,7 +83,7 @@ def test_protor_radii_invalid():
# Expect raised exception when a residue does not contain an atom
strucinfo.vdw_radius_protor("ALA", "K")
# For all other unknown radii expect None
- assert strucinfo.vdw_radius_protor("HOH", "O") == None
+ assert strucinfo.vdw_radius_protor("HOH", "O") is None
def test_single_radii():
@@ -105,12 +105,16 @@ def test_link_type():
[
(strucinfo.amino_acid_names(), True, 0.4),
(strucinfo.nucleotide_names(), True, 0.4),
- (sorted(
- set(strucinfo.all_residues())
- - set(strucinfo.amino_acid_names())
- - set(strucinfo.nucleotide_names())
- ), False, 0.01),
- ]
+ (
+ sorted(
+ set(strucinfo.all_residues())
+ - set(strucinfo.amino_acid_names())
+ - set(strucinfo.nucleotide_names())
+ ),
+ False,
+ 0.01,
+ ),
+ ],
)
def test_one_letter_code(residues, should_have_one_letter, exception_ratio):
"""
@@ -145,14 +149,13 @@ def test_standardize_order(multi_model, seed):
reordered = struc.AtomArray(0)
for residue in struc.residue_iter(original):
bound = residue.array_length()
- indices = np.random.choice(
- np.arange(bound), bound,replace=False
- )
+ indices = np.random.choice(np.arange(bound), bound, replace=False)
reordered += residue[..., indices]
# Restore the original PDB standard order
restored = reordered[..., strucinfo.standardize_order(reordered)]
assert restored.shape == original.shape
- assert restored[..., restored.element != "H"] \
- == original[..., original.element != "H"]
+ assert (
+ restored[..., restored.element != "H"] == original[..., original.element != "H"]
+ )
diff --git a/tests/structure/test_integrity.py b/tests/structure/test_integrity.py
index b8fbbb89d..c9f92ccc5 100644
--- a/tests/structure/test_integrity.py
+++ b/tests/structure/test_integrity.py
@@ -2,51 +2,56 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-import biotite.structure as struc
-import biotite.structure.io.pdbx as pdbx
-import numpy as np
from os.path import join
-from ..util import data_dir
+import numpy as np
import pytest
+import biotite.structure as struc
+import biotite.structure.io.pdbx as pdbx
+from tests.util import data_dir
@pytest.fixture
def sample_array():
- pdbx_file = pdbx.BinaryCIFFile.read(
- join(data_dir("structure"), "1l2y.bcif")
- )
+ pdbx_file = pdbx.BinaryCIFFile.read(join(data_dir("structure"), "1l2y.bcif"))
return pdbx.get_structure(pdbx_file, model=1)
+
@pytest.fixture
def gapped_sample_array(sample_array):
- atom_ids = np.arange(1, sample_array.shape[0]+1)
+ atom_ids = np.arange(1, sample_array.shape[0] + 1)
sample_array.add_annotation("atom_id", dtype=int)
sample_array.atom_id = atom_ids
sample_array = sample_array[sample_array.res_id != 5]
- sample_array = sample_array[(sample_array.res_id != 9) |
- (sample_array.atom_name != "N")]
+ sample_array = sample_array[
+ (sample_array.res_id != 9) | (sample_array.atom_name != "N")
+ ]
return sample_array
+
@pytest.fixture
def duplicate_sample_array(sample_array):
sample_array[42] = sample_array[10]
sample_array[234] = sample_array[123]
return sample_array
+
def test_atom_id_continuity_check(gapped_sample_array):
discon = struc.check_atom_id_continuity(gapped_sample_array)
discon_array = gapped_sample_array[discon]
assert discon_array.atom_id.tolist() == [93, 159]
+
def test_res_id_continuity_check(gapped_sample_array):
discon = struc.check_res_id_continuity(gapped_sample_array)
discon_array = gapped_sample_array[discon]
assert discon_array.res_id.tolist() == [6]
+
def test_linear_continuity_check(gapped_sample_array):
# Take the first ASN residue and remove hydrogens
asn = gapped_sample_array[
- (gapped_sample_array.res_id == 1) & (gapped_sample_array.element != 'H')]
+ (gapped_sample_array.res_id == 1) & (gapped_sample_array.element != "H")
+ ]
# The consecutive atom groups are
# (1) N, CA, C, O
# - break
@@ -57,11 +62,13 @@ def test_linear_continuity_check(gapped_sample_array):
discon = struc.check_linear_continuity(asn)
assert discon.tolist() == [4, 7]
+
def test_bond_continuity_check(gapped_sample_array):
discon = struc.check_backbone_continuity(gapped_sample_array)
discon_array = gapped_sample_array[discon]
- assert discon_array.res_id.tolist() == [6,9]
+ assert discon_array.res_id.tolist() == [6, 9]
+
def test_duplicate_atoms_check(duplicate_sample_array):
discon = struc.check_duplicate_atoms(duplicate_sample_array)
- assert discon.tolist() == [42,234]
\ No newline at end of file
+ assert discon.tolist() == [42, 234]
diff --git a/tests/structure/test_mechanics.py b/tests/structure/test_mechanics.py
index 7be195882..34c5d23c3 100644
--- a/tests/structure/test_mechanics.py
+++ b/tests/structure/test_mechanics.py
@@ -1,25 +1,57 @@
-import biotite.structure as struc
-import biotite.structure.io as strucio
-import numpy as np
from os.path import join
-from ..util import data_dir
import pytest
+import biotite.structure as struc
+import biotite.structure.io as strucio
+from tests.util import data_dir
+
def test_gyration_radius():
stack = strucio.load_structure(join(data_dir("structure"), "1l2y.bcif"))
radii = struc.gyration_radius(stack)
# Compare with results from MDTraj
- exp_radii = \
- [7.30527532, 7.34189463, 7.21863721, 7.29877736, 7.25389752, 7.22292189,
- 7.20646252, 7.27215909, 7.30437723, 7.30455437, 7.37979331, 7.14176259,
- 7.20674397, 7.27594995, 7.31665835, 7.29850786, 7.34378951, 7.2642137,
- 7.20727158, 7.16336879, 7.3479218, 7.19362027, 7.24841519, 7.29229237,
- 7.15243826, 7.31285673, 7.22585756, 7.25467109, 7.3493648, 7.34203588,
- 7.3310182, 7.29236536, 7.20527373, 7.33138918, 7.2284936, 7.40374312,
- 7.24856173, 7.25581809]
+ exp_radii = [
+ 7.30527532,
+ 7.34189463,
+ 7.21863721,
+ 7.29877736,
+ 7.25389752,
+ 7.22292189,
+ 7.20646252,
+ 7.27215909,
+ 7.30437723,
+ 7.30455437,
+ 7.37979331,
+ 7.14176259,
+ 7.20674397,
+ 7.27594995,
+ 7.31665835,
+ 7.29850786,
+ 7.34378951,
+ 7.2642137,
+ 7.20727158,
+ 7.16336879,
+ 7.3479218,
+ 7.19362027,
+ 7.24841519,
+ 7.29229237,
+ 7.15243826,
+ 7.31285673,
+ 7.22585756,
+ 7.25467109,
+ 7.3493648,
+ 7.34203588,
+ 7.3310182,
+ 7.29236536,
+ 7.20527373,
+ 7.33138918,
+ 7.2284936,
+ 7.40374312,
+ 7.24856173,
+ 7.25581809,
+ ]
assert radii.tolist() == pytest.approx(exp_radii, abs=2e-2)
# Same for atom array instead of stack
array = stack[0]
radius = struc.gyration_radius(array)
- assert radius == pytest.approx(exp_radii[0], abs=2e-2)
\ No newline at end of file
+ assert radius == pytest.approx(exp_radii[0], abs=2e-2)
diff --git a/tests/structure/test_mol.py b/tests/structure/test_mol.py
index 55ce15f04..ce4378e86 100644
--- a/tests/structure/test_mol.py
+++ b/tests/structure/test_mol.py
@@ -14,16 +14,17 @@
import biotite.structure.io.pdbx as pdbx
from biotite.structure.bonds import BondType
from biotite.structure.io.mol.ctab import BOND_TYPE_MAPPING_REV
-from ..util import data_dir
+from tests.util import data_dir
def list_v2000_sdf_files():
return [
- path for path
- in glob.glob(join(data_dir("structure"), "molecules", "*.sdf"))
- if not "v3000" in path
+ path
+ for path in glob.glob(join(data_dir("structure"), "molecules", "*.sdf"))
+ if "v3000" not in path
]
+
def list_v3000_sdf_files():
return glob.glob(join(data_dir("structure"), "molecules", "*v3000.sdf"))
@@ -79,11 +80,16 @@ def test_header_conversion():
list_v2000_sdf_files(),
["V2000", "V3000"],
[False, True],
- [False, True]
- )
+ [False, True],
+ ),
)
-def test_structure_conversion(FileClass, path, version, omit_charge,
- use_charge_property):
+def test_structure_conversion(
+ FileClass, # noqa: N803
+ path,
+ version,
+ omit_charge,
+ use_charge_property,
+):
"""
After reading a file, writing the structure back to a new file
and reading it again should give the same structure.
@@ -123,9 +129,10 @@ def test_structure_conversion(FileClass, path, version, omit_charge,
@pytest.mark.parametrize(
"path",
[
- file for file in list_v2000_sdf_files() + list_v3000_sdf_files()
+ file
+ for file in list_v2000_sdf_files() + list_v3000_sdf_files()
if file.split(".")[0] + ".cif" in list_cif_files()
- ]
+ ],
)
def test_pdbx_consistency(path):
"""
@@ -145,20 +152,17 @@ def test_pdbx_consistency(path):
test_atoms = mol.get_structure(sdf_file)
assert test_atoms.coord.shape == ref_atoms.coord.shape
- assert test_atoms.coord.flatten().tolist() \
- == ref_atoms.coord.flatten().tolist()
+ assert test_atoms.coord.flatten().tolist() == ref_atoms.coord.flatten().tolist()
assert test_atoms.element.tolist() == ref_atoms.element.tolist()
assert test_atoms.charge.tolist() == ref_atoms.charge.tolist()
- assert set(tuple(bond) for bond in test_atoms.bonds.as_array()) \
- == set(tuple(bond) for bond in ref_atoms.bonds.as_array())
+ assert set(tuple(bond) for bond in test_atoms.bonds.as_array()) == set(
+ tuple(bond) for bond in ref_atoms.bonds.as_array()
+ )
@pytest.mark.parametrize(
"v2000_path, v3000_path",
- zip(
- sorted(list_v2000_sdf_files()),
- sorted(list_v3000_sdf_files())
- )
+ zip(sorted(list_v2000_sdf_files()), sorted(list_v3000_sdf_files())),
)
def test_version_consistency(v2000_path, v3000_path):
"""
@@ -198,10 +202,7 @@ def test_multi_record_files():
temp.seek(0)
sdf_file = mol.SDFile.read(temp)
- test_atom_arrays = [
- sdf_file[res_name].get_structure()
- for res_name in RES_NAMES
- ]
+ test_atom_arrays = [sdf_file[res_name].get_structure() for res_name in RES_NAMES]
assert test_atom_arrays == ref_atom_arrays
@@ -210,9 +211,7 @@ def test_metadata_parsing():
"""
Check if metadata is parsed correctly based on a known example.
"""
- sdf_file = mol.SDFile.read(
- join(data_dir("structure"), "molecules", "13136.sdf")
- )
+ sdf_file = mol.SDFile.read(join(data_dir("structure"), "molecules", "13136.sdf"))
metadata = sdf_file.record.metadata
assert metadata["PUBCHEM_COMPOUND_CID"] == "13136"
@@ -224,10 +223,7 @@ def test_metadata_conversion():
"""
Writing metadata and reading it again should give the same data.
"""
- ref_metadata = {
- "test_1": "value 1",
- "test_2": "value 2\nvalue 3"
- }
+ ref_metadata = {"test_1": "value 1", "test_2": "value 2\nvalue 3"}
record = mol.SDRecord(metadata=ref_metadata)
sdf_file = mol.SDFile({"Molecule": record})
@@ -236,9 +232,7 @@ def test_metadata_conversion():
temp.seek(0)
sdf_file = mol.SDFile.read(temp)
- test_metadata = {
- key.name: val for key, val in sdf_file.record.metadata.items()
- }
+ test_metadata = {key.name: val for key, val in sdf_file.record.metadata.items()}
temp.close()
assert test_metadata == ref_metadata
@@ -248,18 +242,10 @@ def test_metadata_conversion():
"key_string, ref_key_attributes",
[
# Cases from Dalby1992
- (
- "> ",
- (None, "MELTING.POINT", None, None)
- ),
- (
- "> 55 (MD-08974) DT12",
- (12, "BOILING.POINT", 55, "MD-08974")
- ),
- (
- "> DT12 55", (12, None, 55, None)
- ),
- ]
+ ("> ", (None, "MELTING.POINT", None, None)),
+ ("> 55 (MD-08974) DT12", (12, "BOILING.POINT", 55, "MD-08974")),
+ ("> DT12 55", (12, None, 55, None)),
+ ],
)
def test_metadata_key_parsing(key_string, ref_key_attributes):
"""
@@ -270,7 +256,7 @@ def test_metadata_key_parsing(key_string, ref_key_attributes):
number=number,
name=name,
registry_internal=registry_internal,
- registry_external=registry_external
+ registry_external=registry_external,
)
test_key = mol.Metadata.Key.deserialize(key_string)
@@ -292,7 +278,7 @@ def test_structure_bond_type_fallback(path):
# the default bond type
ref_atoms.bonds.add_bond(0, 1, BondType.QUADRUPLE)
updated_bond = ref_atoms.bonds.as_array()[
- np.all(ref_atoms.bonds.as_array()[:,[0,1]] == [0,1], axis=1)
+ np.all(ref_atoms.bonds.as_array()[:, [0, 1]] == [0, 1], axis=1)
]
assert updated_bond.tolist()[0][2] == BondType.QUADRUPLE
test_mol_file = mol.MOLFile()
@@ -300,21 +286,16 @@ def test_structure_bond_type_fallback(path):
# Test bond type fallback to BondType.ANY value (8) in
# MolFile.set_structure during mol_file.lines formatting
updated_line = [
- mol_line
- for mol_line in test_mol_file.lines if mol_line.startswith(' 1 2 ')
+ mol_line for mol_line in test_mol_file.lines if mol_line.startswith(" 1 2 ")
].pop()
- assert int(updated_line[8]) == \
- BOND_TYPE_MAPPING_REV[BondType.ANY]
+ assert int(updated_line[8]) == BOND_TYPE_MAPPING_REV[BondType.ANY]
# Test bond type fallback to BondType.SINGLE value (1) in
# MolFile.set_structure during mol_file.lines formatting
- mol.set_structure(test_mol_file, ref_atoms,
- default_bond_type=BondType.SINGLE)
+ mol.set_structure(test_mol_file, ref_atoms, default_bond_type=BondType.SINGLE)
updated_line = [
- mol_line
- for mol_line in test_mol_file.lines if mol_line.startswith(' 1 2 ')
+ mol_line for mol_line in test_mol_file.lines if mol_line.startswith(" 1 2 ")
].pop()
- assert int(updated_line[8]) == \
- BOND_TYPE_MAPPING_REV[BondType.SINGLE]
+ assert int(updated_line[8]) == BOND_TYPE_MAPPING_REV[BondType.SINGLE]
@pytest.mark.parametrize("atom_type", ["", " ", "A ", " A"])
@@ -396,4 +377,4 @@ def _delete_charge_property(file):
lines = [line for line in lines if not line.startswith("M CHG")]
file.seek(0)
file.truncate()
- file.write("\n".join(lines) + "\n")
\ No newline at end of file
+ file.write("\n".join(lines) + "\n")
diff --git a/tests/structure/test_molecules.py b/tests/structure/test_molecules.py
index d983f9f83..6880cd8cd 100644
--- a/tests/structure/test_molecules.py
+++ b/tests/structure/test_molecules.py
@@ -18,26 +18,24 @@ def array():
:class:`AtomArray`.
"""
MOL_NAMES = [
- "ARG", # Molecule with multiple branches
- "TRP", # Molecule with a cycle
- "GLC", # Molecule with a cycle
+ "ARG", # Molecule with multiple branches
+ "TRP", # Molecule with a cycle
+ "GLC", # Molecule with a cycle
"NA", # A single atom
- "ATP" # Larger molecule
+ "ATP", # Larger molecule
]
N_MOLECULES = 20
np.random.seed(0)
-
+
atom_array = struc.AtomArray(0)
for i, mol_name in enumerate(np.random.choice(MOL_NAMES, N_MOLECULES)):
molecule = info.residue(mol_name)
- molecule.res_id[:] = i+1
+ molecule.res_id[:] = i + 1
atom_array += molecule
-
+
reordered_indices = np.random.choice(
- np.arange(atom_array.array_length()),
- atom_array.array_length(),
- replace=False
+ np.arange(atom_array.array_length()), atom_array.array_length(), replace=False
)
atom_array = atom_array[reordered_indices]
@@ -45,12 +43,7 @@ def array():
@pytest.mark.parametrize(
- "as_stack, as_bonds",
- [
- (False, False),
- (True, False),
- (False, True )
- ]
+ "as_stack, as_bonds", [(False, False), (True, False), (False, True)]
)
def test_get_molecule_indices(array, as_stack, as_bonds):
"""
@@ -59,12 +52,12 @@ def test_get_molecule_indices(array, as_stack, as_bonds):
"""
if as_stack:
array = struc.stack([array])
-
+
if as_bonds:
test_indices = struc.get_molecule_indices(array.bonds)
else:
test_indices = struc.get_molecule_indices(array)
-
+
seen_atoms = 0
for indices in test_indices:
molecule = array[..., indices]
@@ -72,20 +65,16 @@ def test_get_molecule_indices(array, as_stack, as_bonds):
# -> all atoms from the same molecule
assert (molecule.res_id == molecule.res_id[0]).all()
# Assert that no atom is missing from the molecule
- assert molecule.array_length() \
- == info.residue(molecule.res_name[0]).array_length()
+ assert (
+ molecule.array_length() == info.residue(molecule.res_name[0]).array_length()
+ )
seen_atoms += molecule.array_length()
# Assert that all molecules are fond
assert seen_atoms == array.array_length()
@pytest.mark.parametrize(
- "as_stack, as_bonds",
- [
- (False, False),
- (True, False),
- (False, True )
- ]
+ "as_stack, as_bonds", [(False, False), (True, False), (False, True)]
)
def test_get_molecule_masks(array, as_stack, as_bonds):
"""
@@ -95,18 +84,18 @@ def test_get_molecule_masks(array, as_stack, as_bonds):
"""
if as_stack:
array = struc.stack([array])
-
+
if as_bonds:
ref_indices = struc.get_molecule_indices(array.bonds)
test_masks = struc.get_molecule_masks(array.bonds)
else:
ref_indices = struc.get_molecule_indices(array)
test_masks = struc.get_molecule_masks(array)
-
+
for i in range(len(test_masks)):
# Assert that the mask is 'True' for all indices
# and that these 'True' values are the only ones in the mask
- assert (test_masks[i, ref_indices[i]] == True).all()
+ assert test_masks[i, ref_indices[i]].all()
assert np.count_nonzero(test_masks[i]) == len(ref_indices[i])
@@ -123,4 +112,4 @@ def test_molecule_iter(array, as_stack):
test_iterator = struc.molecule_iter(array)
for i, molecule in enumerate(test_iterator):
- assert molecule == array[..., ref_indices[i]]
\ No newline at end of file
+ assert molecule == array[..., ref_indices[i]]
diff --git a/tests/structure/test_pdb.py b/tests/structure/test_pdb.py
index d93974f72..3b1ef07a6 100644
--- a/tests/structure/test_pdb.py
+++ b/tests/structure/test_pdb.py
@@ -2,22 +2,21 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-from tempfile import TemporaryFile
-import warnings
-import itertools
import glob
-from os.path import join, splitext
+import itertools
import sys
+import warnings
+from os.path import join, splitext
+from tempfile import TemporaryFile
+import numpy as np
import pytest
from pytest import approx
-import numpy as np
import biotite
import biotite.structure as struc
import biotite.structure.io.pdb as pdb
import biotite.structure.io.pdb.hybrid36 as hybrid36
import biotite.structure.io.pdbx as pdbx
-import biotite.structure.io as io
-from ..util import data_dir
+from tests.util import data_dir
def test_get_model_count():
@@ -35,17 +34,15 @@ def test_get_model_count():
glob.glob(join(data_dir("structure"), "*.pdb")),
[None, 1, -1],
[False, True],
- [False, True]
- )
+ [False, True],
+ ),
)
def test_array_conversion(path, model, hybrid36, include_bonds):
pdb_file = pdb.PDBFile.read(path)
# Test also the thin wrapper around the methods
# 'get_structure()' and 'set_structure()'
try:
- array1 = pdb.get_structure(
- pdb_file, model=model, include_bonds=include_bonds
- )
+ array1 = pdb.get_structure(pdb_file, model=model, include_bonds=include_bonds)
except biotite.InvalidFileError:
if model is None:
# The file cannot be parsed into an AtomArrayStack,
@@ -58,8 +55,7 @@ def test_array_conversion(path, model, hybrid36, include_bonds):
if hybrid36 and (array1.res_id < 0).any():
with pytest.raises(
ValueError,
- match="Only positive integers can be converted "
- "into hybrid-36 notation"
+ match="Only positive integers can be converted " "into hybrid-36 notation",
):
pdb_file = pdb.PDBFile()
pdb.set_structure(pdb_file, array1, hybrid36=hybrid36)
@@ -68,33 +64,28 @@ def test_array_conversion(path, model, hybrid36, include_bonds):
pdb_file = pdb.PDBFile()
pdb.set_structure(pdb_file, array1, hybrid36=hybrid36)
- array2 = pdb.get_structure(
- pdb_file, model=model, include_bonds=include_bonds
- )
+ array2 = pdb.get_structure(pdb_file, model=model, include_bonds=include_bonds)
if array1.box is not None:
assert np.allclose(array1.box, array2.box)
assert array1.bonds == array2.bonds
for category in array1.get_annotation_categories():
- assert array1.get_annotation(category).tolist() == \
- array2.get_annotation(category).tolist()
+ assert (
+ array1.get_annotation(category).tolist()
+ == array2.get_annotation(category).tolist()
+ )
assert array1.coord.tolist() == array2.coord.tolist()
@pytest.mark.parametrize(
"path, model",
- itertools.product(
- glob.glob(join(data_dir("structure"), "*.pdb")),
- [None, 1, -1]
- )
+ itertools.product(glob.glob(join(data_dir("structure"), "*.pdb")), [None, 1, -1]),
)
def test_pdbx_consistency(path, model):
bcif_path = splitext(path)[0] + ".bcif"
pdbx_file = pdbx.BinaryCIFFile.read(bcif_path)
try:
- ref_atoms = pdbx.get_structure(
- pdbx_file, model=model, include_bonds=True
- )
+ ref_atoms = pdbx.get_structure(pdbx_file, model=model, include_bonds=True)
except biotite.InvalidFileError:
if model is None:
# The file cannot be parsed into an AtomArrayStack,
@@ -134,17 +125,16 @@ def test_pdbx_consistency(path, model):
print(file=sys.stderr)
raise
for category in ref_atoms.get_annotation_categories():
- assert test_atoms.get_annotation(category).tolist() == \
- ref_atoms.get_annotation(category).tolist()
+ assert (
+ test_atoms.get_annotation(category).tolist()
+ == ref_atoms.get_annotation(category).tolist()
+ )
assert test_atoms.coord.tolist() == ref_atoms.coord.tolist()
@pytest.mark.parametrize(
"path, model",
- itertools.product(
- glob.glob(join(data_dir("structure"), "*.pdb")),
- [None, 1]
- )
+ itertools.product(glob.glob(join(data_dir("structure"), "*.pdb")), [None, 1]),
)
def test_pdbx_consistency_assembly(path, model):
"""
@@ -168,10 +158,13 @@ def test_pdbx_consistency_assembly(path, model):
ref_assembly = pdbx.get_assembly(pdbx_file, model=model)
for category in ref_assembly.get_annotation_categories():
- assert test_assembly.get_annotation(category).tolist() == \
- ref_assembly.get_annotation(category).tolist()
- assert test_assembly.coord.flatten().tolist() == \
- approx(ref_assembly.coord.flatten().tolist(), abs=1e-3)
+ assert (
+ test_assembly.get_annotation(category).tolist()
+ == ref_assembly.get_annotation(category).tolist()
+ )
+ assert test_assembly.coord.flatten().tolist() == approx(
+ ref_assembly.coord.flatten().tolist(), abs=1e-3
+ )
@pytest.mark.parametrize("hybrid36", [False, True])
@@ -179,9 +172,7 @@ def test_extra_fields(hybrid36):
path = join(data_dir("structure"), "1l2y.pdb")
pdb_file = pdb.PDBFile.read(path)
stack1 = pdb_file.get_structure(
- extra_fields=[
- "atom_id", "b_factor", "occupancy", "charge"
- ]
+ extra_fields=["atom_id", "b_factor", "occupancy", "charge"]
)
with pytest.raises(ValueError):
@@ -196,9 +187,7 @@ def test_extra_fields(hybrid36):
pdb_file.set_structure(stack1, hybrid36=hybrid36)
stack2 = pdb_file.get_structure(
- extra_fields=[
- "atom_id", "b_factor", "occupancy", "charge"
- ]
+ extra_fields=["atom_id", "b_factor", "occupancy", "charge"]
)
assert stack1.ins_code.tolist() == stack2.ins_code.tolist()
@@ -218,7 +207,7 @@ def test_inferred_elements():
# Remove all elements
removed_stack = stack.copy()
- removed_stack.element[:] = ''
+ removed_stack.element[:] = ""
# Save stack without elements to tmp file
temp = TemporaryFile("w+")
@@ -237,10 +226,7 @@ def test_inferred_elements():
@pytest.mark.parametrize(
"path, model",
- itertools.product(
- glob.glob(join(data_dir("structure"), "*.pdb")),
- [None, 1, -1]
- )
+ itertools.product(glob.glob(join(data_dir("structure"), "*.pdb")), [None, 1, -1]),
)
def test_box_shape(path, model):
pdb_file = pdb.PDBFile.read(path)
@@ -266,14 +252,11 @@ def test_box_parsing():
path = join(data_dir("structure"), "1igy.pdb")
pdb_file = pdb.PDBFile.read(path)
a = pdb_file.get_structure()
- expected_box = np.array([[
- [66.65, 0.00, 0.00],
- [0.00, 190.66, 0.00],
- [-24.59, 0.00, 68.84]
- ]])
+ expected_box = np.array(
+ [[[66.65, 0.00, 0.00], [0.00, 190.66, 0.00], [-24.59, 0.00, 68.84]]]
+ )
- assert expected_box.flatten().tolist() \
- == approx(a.box.flatten().tolist(), abs=1e-2)
+ assert expected_box.flatten().tolist() == approx(a.box.flatten().tolist(), abs=1e-2)
def test_id_overflow():
@@ -283,7 +266,7 @@ def test_id_overflow():
a.coord = np.zeros(a.coord.shape)
a.chain_id = np.full(length, "A")
# Create residue IDs over 10000
- a.res_id = np.arange(1, length+1)
+ a.res_id = np.arange(1, length + 1)
a.res_name = np.full(length, "GLY")
a.hetero = np.full(length, False)
a.atom_name = np.full(length, "CA")
@@ -299,13 +282,13 @@ def test_id_overflow():
# Assert file can be read properly
temp.seek(0)
a2 = pdb.get_structure(pdb.PDBFile.read(temp))
- assert(a2.array_length() == a.array_length())
+ assert a2.array_length() == a.array_length()
# Manually check if the written atom id is correct
temp.seek(0)
last_line = temp.readlines()[-1]
atom_id = int(last_line.split()[1])
- assert(atom_id == 1)
+ assert atom_id == 1
temp.close()
@@ -321,9 +304,9 @@ def test_id_overflow():
temp.seek(0)
last_line = temp.readlines()[-1]
atom_id = last_line.split()[1]
- assert(atom_id == "A0000")
+ assert atom_id == "A0000"
res_id = last_line.split()[4][1:]
- assert(res_id == "BXG0")
+ assert res_id == "BXG0"
temp.close()
@@ -353,38 +336,41 @@ def test_get_b_factor(model):
if model is None:
# The B-factor is an annotation category
# -> it can only be extracted in a per-model basis
- ref_b_factor = np.stack([
- pdb_file.get_structure(
- model=m, extra_fields=["b_factor"]
- ).b_factor
- for m in range(1, pdb_file.get_model_count() + 1)
- ])
+ ref_b_factor = np.stack(
+ [
+ pdb_file.get_structure(model=m, extra_fields=["b_factor"]).b_factor
+ for m in range(1, pdb_file.get_model_count() + 1)
+ ]
+ )
else:
ref_b_factor = pdb_file.get_structure(
model=model, extra_fields=["b_factor"]
).b_factor
- test_b_factor= pdb_file.get_b_factor(model=model)
+ test_b_factor = pdb_file.get_b_factor(model=model)
assert test_b_factor.shape == ref_b_factor.shape
assert (test_b_factor == ref_b_factor).all()
-
np.random.seed(0)
N = 200
LENGTHS = [3, 4, 5]
+
+
@pytest.mark.parametrize(
"number, length",
zip(
- list(itertools.chain(*[
- np.random.randint(0, hybrid36.max_hybrid36_number(length), N)
- for length in LENGTHS
- ])),
- list(itertools.chain(*[
- [length] * N for length in LENGTHS
- ]))
- )
+ list(
+ itertools.chain(
+ *[
+ np.random.randint(0, hybrid36.max_hybrid36_number(length), N)
+ for length in LENGTHS
+ ]
+ )
+ ),
+ list(itertools.chain(*[[length] * N for length in LENGTHS])),
+ ),
)
def test_hybrid36_codec(number, length):
"""
@@ -401,7 +387,6 @@ def test_max_hybrid36_number():
assert hybrid36.max_hybrid36_number(5) == 87440031
-
@pytest.mark.parametrize("hybrid36", [False, True])
def test_bond_records(hybrid36):
"""
@@ -420,7 +405,7 @@ def test_bond_records(hybrid36):
np.random.seed(0)
# Create random bonds four times the number of atoms
- bond_array = np.random.randint(n_atoms, size=(4*n_atoms, 2))
+ bond_array = np.random.randint(n_atoms, size=(4 * n_atoms, 2))
# Remove bonds of atoms to themselves
bond_array = bond_array[bond_array[:, 0] != bond_array[:, 1]]
ref_bonds = struc.BondList(n_atoms, bond_array)
@@ -459,8 +444,8 @@ def test_get_symmetry_mates(model):
Test generated symmetry mates on a known example with a simple
space group and a single chain.
"""
- INVERSION_AXES = [(0,0,0), (0,0,1), (0,1,0), (1,0,0)]
- TRANSLATION_AXES = [(0,0,0), (1,0,1), (0,1,1), (1,1,0)]
+ INVERSION_AXES = [(0, 0, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0)]
+ TRANSLATION_AXES = [(0, 0, 0), (1, 0, 1), (0, 1, 1), (1, 1, 0)]
path = join(data_dir("structure"), "1aki.pdb")
pdb_file = pdb.PDBFile.read(path)
@@ -475,8 +460,7 @@ def test_get_symmetry_mates(model):
symmetry_mates = pdb_file.get_symmetry_mates(model=model)
# Space group has 4 copies in a unit cell
- assert symmetry_mates.array_length() \
- == original_structure.array_length() * 4
+ assert symmetry_mates.array_length() == original_structure.array_length() * 4
if model is None:
assert symmetry_mates.stack_depth() == original_structure.stack_depth()
for chain, inv_axes, trans_axes in zip(
@@ -490,10 +474,13 @@ def test_get_symmetry_mates(model):
chain = struc.rotate(chain, angles)
# Now both mates should be equal
for category in original_structure.get_annotation_categories():
- assert chain.get_annotation(category).tolist() == \
- original_structure.get_annotation(category).tolist()
- assert chain.coord.flatten().tolist() == \
- approx(original_structure.coord.flatten().tolist(), abs=1e-3)
+ assert (
+ chain.get_annotation(category).tolist()
+ == original_structure.get_annotation(category).tolist()
+ )
+ assert chain.coord.flatten().tolist() == approx(
+ original_structure.coord.flatten().tolist(), abs=1e-3
+ )
@pytest.mark.parametrize(
@@ -512,7 +499,7 @@ def test_get_symmetry_mates(model):
("occupancy", 1000, False),
("charge", -10, False),
("charge", 10, False),
- ]
+ ],
)
def test_setting_incompatible_structure(annotation, value, warning_only):
"""
@@ -535,7 +522,7 @@ def test_setting_incompatible_structure(annotation, value, warning_only):
# Set one annotation to a value that exceeds the number of columns
if annotation == "coord":
- atoms.coord[0,0] = value
+ atoms.coord[0, 0] = value
else:
atoms.get_annotation(annotation)[0] = value
diff --git a/tests/structure/test_pdbqt.py b/tests/structure/test_pdbqt.py
index 1a7c2e049..2f18b496c 100644
--- a/tests/structure/test_pdbqt.py
+++ b/tests/structure/test_pdbqt.py
@@ -2,31 +2,30 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-import warnings
-from tempfile import TemporaryFile
import glob
+import warnings
from os.path import join
-import pytest
+from tempfile import TemporaryFile
import numpy as np
+import pytest
import biotite.structure as struc
import biotite.structure.io.pdbqt as pdbqt
import biotite.structure.io.pdbx as pdbx
-from ..util import data_dir
+from tests.util import data_dir
@pytest.mark.parametrize(
"path",
[
- path for path in glob.glob(join(data_dir("structure"), "*.bcif"))
+ path
+ for path in glob.glob(join(data_dir("structure"), "*.bcif"))
# Skip this PDB ID as it contains 5-character residue names
if "7gsa" not in path
- ]
+ ],
)
def test_array_conversion(path):
pdbx_file = pdbx.BinaryCIFFile.read(path)
- ref_structure = pdbx.get_structure(
- pdbx_file, model=1, extra_fields=["charge"]
- )
+ ref_structure = pdbx.get_structure(pdbx_file, model=1, extra_fields=["charge"])
ref_structure.bonds = struc.connect_via_residue_names(ref_structure)
pdbqt_file = pdbqt.PDBQTFile()
@@ -53,7 +52,7 @@ def test_array_conversion(path):
try:
assert np.array_equal(
test_structure.get_annotation(category),
- ref_structure.get_annotation(category)
+ ref_structure.get_annotation(category),
)
except AssertionError:
print(f"Inequality in '{category}' category")
diff --git a/tests/structure/test_pdbx.py b/tests/structure/test_pdbx.py
index a3f88d44c..7a02960f5 100644
--- a/tests/structure/test_pdbx.py
+++ b/tests/structure/test_pdbx.py
@@ -2,9 +2,9 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-import warnings
import glob
import itertools
+import warnings
from os.path import join, splitext
import numpy as np
import pytest
@@ -13,7 +13,7 @@
import biotite.sequence as seq
import biotite.structure as struc
import biotite.structure.io.pdbx as pdbx
-from ..util import data_dir
+from tests.util import data_dir
@pytest.mark.parametrize("format", ["cif", "bcif"])
@@ -22,7 +22,7 @@ def test_get_model_count(format):
Check of :func:`get_model_count()`gives the same number of models
as :func:`get_structure()`.
"""
- base_path = join(data_dir("structure"), f"1l2y")
+ base_path = join(data_dir("structure"), "1l2y")
if format == "cif":
pdbx_file = pdbx.CIFFile.read(base_path + ".cif")
else:
@@ -35,8 +35,17 @@ def test_get_model_count(format):
@pytest.mark.parametrize(
"string, looped",
itertools.product(
- ["", " ", " ", "te xt", "'", '"' ,"te\nxt", "\t",],
- [False, True]
+ [
+ "",
+ " ",
+ " ",
+ "te xt",
+ "'",
+ '"',
+ "te\nxt",
+ "\t",
+ ],
+ [False, True],
),
)
def test_escape(string, looped):
@@ -60,9 +69,7 @@ def test_escape(string, looped):
@pytest.mark.parametrize(
"format, path, model",
itertools.product(
- ["cif", "bcif"],
- glob.glob(join(data_dir("structure"), "*.cif")),
- [None, 1, -1]
+ ["cif", "bcif"], glob.glob(join(data_dir("structure"), "*.cif")), [None, 1, -1]
),
)
def test_conversion(tmpdir, format, path, model):
@@ -82,9 +89,7 @@ def test_conversion(tmpdir, format, path, model):
pdbx_file = File.read(data_path)
try:
- ref_atoms = pdbx.get_structure(
- pdbx_file, model=model, include_bonds=True
- )
+ ref_atoms = pdbx.get_structure(pdbx_file, model=model, include_bonds=True)
except biotite.InvalidFileError:
if model is None:
# The file cannot be parsed into an AtomArrayStack,
@@ -103,9 +108,7 @@ def test_conversion(tmpdir, format, path, model):
# Remove one label section to test fallback to auth fields
del pdbx_file.block["atom_site"][DELETED_ANNOTATION]
with pytest.warns(UserWarning, match=f"'{DELETED_ANNOTATION}' not found"):
- test_atoms = pdbx.get_structure(
- pdbx_file, model=model, include_bonds=True
- )
+ test_atoms = pdbx.get_structure(pdbx_file, model=model, include_bonds=True)
assert ref_atoms.array_length() > 0
if ref_atoms.box is not None:
@@ -144,9 +147,7 @@ def test_bond_conversion(tmpdir, format, path):
File = pdbx.BinaryCIFFile
pdbx_file = File.read(data_path)
- atoms = pdbx.get_structure(
- pdbx_file, model=1, include_bonds=True
- )
+ atoms = pdbx.get_structure(pdbx_file, model=1, include_bonds=True)
ref_bonds = atoms.bonds
pdbx_file = File()
@@ -160,16 +161,12 @@ def test_bond_conversion(tmpdir, format, path):
# i.e. the bonds can be properly read from ``chem_comp_bond``
with warnings.catch_warnings():
warnings.simplefilter("error")
- test_bonds = pdbx.get_structure(
- pdbx_file, model=1, include_bonds=True
- ).bonds
+ test_bonds = pdbx.get_structure(pdbx_file, model=1, include_bonds=True).bonds
assert test_bonds == ref_bonds
-@pytest.mark.parametrize(
- "format", ["cif", "bcif"]
-)
+@pytest.mark.parametrize("format", ["cif", "bcif"])
def test_extra_fields(tmpdir, format):
path = join(data_dir("structure"), f"1l2y.{format}")
if format == "cif":
@@ -208,9 +205,7 @@ def test_intra_bond_residue_parsing():
"""
cif_path = join(data_dir("structure"), "1l2y.cif")
cif_file = pdbx.CIFFile.read(cif_path)
- ref_bonds = pdbx.get_structure(
- cif_file, model=1, include_bonds=True
- ).bonds
+ ref_bonds = pdbx.get_structure(cif_file, model=1, include_bonds=True).bonds
nextgen_cif_path = join(
data_dir("structure"), "nextgen", "pdb_00001l2y_xyz-enrich.cif"
@@ -227,9 +222,7 @@ def test_intra_bond_residue_parsing():
assert test_bonds == ref_bonds
-@pytest.mark.parametrize(
- "format", ["cif", "bcif"]
-)
+@pytest.mark.parametrize("format", ["cif", "bcif"])
def test_any_bonds(tmpdir, format):
"""
Check if ``BondType.ANY`` bonds can be written and read from a PDBx
@@ -266,16 +259,12 @@ def test_any_bonds(tmpdir, format):
# i.e. the bonds can be properly read from ``chem_comp_bond``
with warnings.catch_warnings():
warnings.simplefilter("error")
- test_bonds = pdbx.get_structure(
- pdbx_file, model=1, include_bonds=True
- ).bonds
+ test_bonds = pdbx.get_structure(pdbx_file, model=1, include_bonds=True).bonds
assert test_bonds == ref_bonds
-@pytest.mark.parametrize(
- "format", ["cif", "bcif"]
-)
+@pytest.mark.parametrize("format", ["cif", "bcif"])
def test_unequal_lengths(format):
"""
Check if setting columns with unequal lengths in the same category
@@ -299,9 +288,7 @@ def test_setting_empty_column():
"""
Check if setting an empty column raises an exception.
"""
- with pytest.raises(
- ValueError, match="Array must contain at least one element"
- ):
+ with pytest.raises(ValueError, match="Array must contain at least one element"):
pdbx.CIFCategory({"foo": []})
@@ -324,9 +311,7 @@ def test_setting_empty_structure():
pdbx.set_structure(pdbx.CIFFile(), atoms, include_bonds=True)
-@pytest.mark.parametrize(
- "format", ["cif", "bcif"]
-)
+@pytest.mark.parametrize("format", ["cif", "bcif"])
def test_list_assemblies(format):
"""
Test the :func:`list_assemblies()` function based on a known
@@ -351,11 +336,10 @@ def test_list_assemblies(format):
}
-@pytest.mark.parametrize("format, pdb_id, model", itertools.product(
- ["cif", "bcif"],
- ["1f2n", "5zng"],
- [None, 1, -1]
-))
+@pytest.mark.parametrize(
+ "format, pdb_id, model",
+ itertools.product(["cif", "bcif"], ["1f2n", "5zng"], [None, 1, -1]),
+)
def test_get_assembly(format, pdb_id, model):
"""
Test whether the :func:`get_assembly()` function produces the same
@@ -376,13 +360,11 @@ def test_get_assembly(format, pdb_id, model):
# Test each available assembly
for id, ref_oligomer_count in zip(
assembly_category["id"].as_array(str),
- assembly_category["oligomeric_count"].as_array(int)
+ assembly_category["oligomeric_count"].as_array(int),
):
print("Assembly ID:", id)
try:
- assembly = pdbx.get_assembly(
- pdbx_file, assembly_id=id, model=model
- )
+ assembly = pdbx.get_assembly(pdbx_file, assembly_id=id, model=model)
except biotite.InvalidFileError:
if model is None:
# The file cannot be parsed into an AtomArrayStack,
@@ -409,8 +391,7 @@ def test_get_assembly(format, pdb_id, model):
@pytest.mark.parametrize(
"path, use_ideal_coord",
itertools.product(
- glob.glob(join(data_dir("structure"), "molecules", "*.cif")),
- [False, True]
+ glob.glob(join(data_dir("structure"), "molecules", "*.cif")), [False, True]
),
)
def test_component_conversion(tmpdir, path, use_ideal_coord):
@@ -420,9 +401,7 @@ def test_component_conversion(tmpdir, path, use_ideal_coord):
structure.
"""
cif_file = pdbx.CIFFile.read(path)
- ref_atoms = pdbx.get_component(
- cif_file, use_ideal_coord=use_ideal_coord
- )
+ ref_atoms = pdbx.get_component(cif_file, use_ideal_coord=use_ideal_coord)
cif_file = pdbx.CIFFile()
pdbx.set_component(cif_file, ref_atoms, data_block="test")
@@ -430,9 +409,7 @@ def test_component_conversion(tmpdir, path, use_ideal_coord):
cif_file.write(file_path)
cif_file = pdbx.CIFFile.read(path)
- test_atoms = pdbx.get_component(
- cif_file, use_ideal_coord=use_ideal_coord
- )
+ test_atoms = pdbx.get_component(cif_file, use_ideal_coord=use_ideal_coord)
assert test_atoms == ref_atoms
@@ -452,14 +429,14 @@ def test_get_sequence(format):
sequences_1 = pdbx.get_sequence(pdbx_file)
pdbx_file = File.read(join(data_dir("structure"), f"4gxy.{format}"))
sequences_2 = pdbx.get_sequence(pdbx_file)
- assert str(sequences_1['T']) == "CCGACGGCGCATCAGC"
- assert type(sequences_1['T']) is seq.NucleotideSequence
- assert str(sequences_1['P']) == "GCTGATGCGCC"
- assert type(sequences_1['P']) is seq.NucleotideSequence
- assert str(sequences_1['D']) == "GTCGG"
- assert type(sequences_1['D']) is seq.NucleotideSequence
+ assert str(sequences_1["T"]) == "CCGACGGCGCATCAGC"
+ assert type(sequences_1["T"]) is seq.NucleotideSequence
+ assert str(sequences_1["P"]) == "GCTGATGCGCC"
+ assert type(sequences_1["P"]) is seq.NucleotideSequence
+ assert str(sequences_1["D"]) == "GTCGG"
+ assert type(sequences_1["D"]) is seq.NucleotideSequence
assert (
- str(sequences_1['A']) == "MSKRKAPQETLNGGITDMLTELANFEKNVSQAIHKYN"
+ str(sequences_1["A"]) == "MSKRKAPQETLNGGITDMLTELANFEKNVSQAIHKYN"
"AYRKAASVIAKYPHKIKSGAEAKKLPGVGTKIAEKIDEFLATGKLRKLEKIRQD"
"DTSSSINFLTRVSGIGPSAARKFVDEGIKTLEDLRKNEDKLNHHQRIGLKYFGD"
"FEKRIPREEMLQMQDIVLNEVKKVDSEYIATVCGSFRRGAESSGDMDVLLTHPS"
@@ -467,14 +444,14 @@ def test_get_sequence(format):
"RIDIRLIPKDQYYCGVLYFTGSDIFNKNMRAHALEKGFTINEYTIRPLGVTGVA"
"GEPLPVDSEKDIFDYIQWKYREPKDRSE"
)
- assert type(sequences_1['A']) is seq.ProteinSequence
+ assert type(sequences_1["A"]) is seq.ProteinSequence
assert (
- str(sequences_2['A']) == "GGCGGCAGGTGCTCCCGACCCTGCGGTCGGGAGTTAA"
+ str(sequences_2["A"]) == "GGCGGCAGGTGCTCCCGACCCTGCGGTCGGGAGTTAA"
"AAGGGAAGCCGGTGCAAGTCCGGCACGGTCCCGCCACTGTGACGGGGAGTCGCC"
"CCTCGGGATGTGCCACTGGCCCGAAGGCCGGGAAGGCGGAGGGGCGGCGAGGAT"
"CCGGAGTCAGGAAACCTGCCTGCCGTC"
)
- assert type(sequences_2['A']) is seq.NucleotideSequence
+ assert type(sequences_2["A"]) is seq.NucleotideSequence
def test_bcif_encoding():
@@ -485,21 +462,20 @@ def test_bcif_encoding():
PDB_ID = "1aki"
encodings_used = {
- encoding: False for encoding in [
+ encoding: False
+ for encoding in [
pdbx.ByteArrayEncoding,
pdbx.FixedPointEncoding,
# This encoding is not used in the test file
- #pdbx.IntervalQuantizationEncoding,
+ # pdbx.IntervalQuantizationEncoding,
pdbx.RunLengthEncoding,
pdbx.DeltaEncoding,
pdbx.IntegerPackingEncoding,
- pdbx.StringArrayEncoding
+ pdbx.StringArrayEncoding,
]
}
- bcif_file = pdbx.BinaryCIFFile.read(
- join(data_dir("structure"), f"{PDB_ID}.bcif")
- )
+ bcif_file = pdbx.BinaryCIFFile.read(join(data_dir("structure"), f"{PDB_ID}.bcif"))
for category_name, category in bcif_file[PDB_ID.upper()].items():
for column_name in category.keys():
try:
@@ -519,17 +495,15 @@ def test_bcif_encoding():
test_msgpack = column.serialize()
assert test_msgpack == ref_msgpack
- except:
- raise Exception(
- f"Encoding failed for '{category_name}.{column_name}'"
- )
+ except Exception:
+ raise Exception(f"Encoding failed for '{category_name}.{column_name}'")
# Check if each encoding was used at least once
# to ensure that the test was thorough
for key, was_used in encodings_used.items():
try:
assert was_used
- except:
+ except Exception:
raise Exception(f"Encoding {key} was not used")
@@ -587,14 +561,17 @@ def test_bcif_cif_consistency():
if cif_column.mask is None:
assert bcif_column.mask is None
else:
- assert cif_column.mask.array.tolist() \
+ assert (
+ cif_column.mask.array.tolist()
== bcif_column.mask.array.tolist()
+ )
# In CIF format, all vales are strings
# -> ensure consistency
dtype = bcif_column.data.array.dtype
- assert cif_column.as_array(dtype).tolist() \
- == pytest.approx(bcif_column.as_array(dtype).tolist())
- except:
+ assert cif_column.as_array(dtype).tolist() == pytest.approx(
+ bcif_column.as_array(dtype).tolist()
+ )
+ except Exception:
raise Exception(
f"Comparison failed for '{category_name}.{column_name}'"
)
@@ -606,7 +583,7 @@ def test_bcif_cif_consistency():
("cif", None),
("bcif", False),
("bcif", True),
- ]
+ ],
)
def test_serialization_consistency(format, create_new_encoding):
"""
@@ -626,22 +603,18 @@ def test_serialization_consistency(format, create_new_encoding):
for category_name, ref_category in file.block.items():
if format == "cif":
- test_category = pdbx.CIFCategory.deserialize(
- ref_category.serialize()
- )
+ test_category = pdbx.CIFCategory.deserialize(ref_category.serialize())
elif format == "bcif":
# Access each column to force otherwise lazy deserialization
for _ in ref_category.values():
pass
if create_new_encoding:
ref_category = _clear_encoding(ref_category)
- test_category = pdbx.BinaryCIFCategory.deserialize(
- ref_category.serialize()
- )
+ test_category = pdbx.BinaryCIFCategory.deserialize(ref_category.serialize())
try:
for key in test_category.keys():
assert ref_category[key] == test_category[key]
- except:
+ except Exception:
raise Exception(f"Comparison failed for '{category_name}.{key}'")
diff --git a/tests/structure/test_pseudoknots.py b/tests/structure/test_pseudoknots.py
index d7a594bcf..b263db5af 100644
--- a/tests/structure/test_pseudoknots.py
+++ b/tests/structure/test_pseudoknots.py
@@ -2,14 +2,13 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-import pytest
import json
+from os.path import join
import numpy as np
-import pickle as pkl
+import pytest
import biotite.structure as struc
import biotite.structure.io as strucio
-from os.path import join
-from ..util import data_dir
+from tests.util import data_dir
@pytest.fixture
@@ -19,6 +18,7 @@ def nuc_sample_array():
"""
return strucio.load_structure(join(data_dir("structure"), "4p5j.cif"))
+
def test_pseudoknots(nuc_sample_array):
"""
Check the output of :func:`pseudoknots()`.
@@ -26,11 +26,9 @@ def test_pseudoknots(nuc_sample_array):
# Known base pairs with pseudoknot-order = 1:
pseudoknot_order_one = [{2, 74}, {58, 72}, {59, 71}, {60, 70}]
# Known base pairs that can either be of order one or two
- pseudoknot_order_one_or_two = [{9, 48}, {10, 49}]
- order_one_count = (
- len(pseudoknot_order_one) + (len(pseudoknot_order_one_or_two)/2)
- )
- order_two_count = len(pseudoknot_order_one_or_two)/2
+ pseudoknot_order_one_or_two = [{9, 48}, {10, 49}]
+ order_one_count = len(pseudoknot_order_one) + (len(pseudoknot_order_one_or_two) / 2)
+ order_two_count = len(pseudoknot_order_one_or_two) / 2
base_pairs = struc.base_pairs(nuc_sample_array)
pseudoknot_order = struc.pseudoknots(base_pairs)
@@ -51,15 +49,14 @@ def test_pseudoknots(nuc_sample_array):
for base_pair, order in zip(
nuc_sample_array[base_pairs].res_id, optimal_solution
):
- if(order == 1):
+ if order == 1:
assert (
- set(base_pair) in pseudoknot_order_one or
- set(base_pair) in pseudoknot_order_one_or_two
- )
- elif (order == 2):
- assert (
- set(base_pair) in pseudoknot_order_one_or_two
+ set(base_pair) in pseudoknot_order_one
+ or set(base_pair) in pseudoknot_order_one_or_two
)
+ elif order == 2:
+ assert set(base_pair) in pseudoknot_order_one_or_two
+
def load_test(name):
"""
@@ -67,20 +64,19 @@ def load_test(name):
"""
# Base pairs as numpy array (input for `pseudoknots()`)
with open(
- join(data_dir("structure"), "pseudoknots", f"{name}_knotted.json"),
- "r"
+ join(data_dir("structure"), "pseudoknots", f"{name}_knotted.json"), "r"
) as f:
basepairs = np.array(json.load(f))
# List of solutions (set of tuples)
with open(
- join(data_dir("structure"), "pseudoknots", f"{name}_unknotted.json"),
- "rb"
+ join(data_dir("structure"), "pseudoknots", f"{name}_unknotted.json"), "rb"
) as f:
solutions = json.load(f)
for i, solution in enumerate(solutions):
solutions[i] = set([tuple(pair) for pair in solution])
return basepairs, solutions
+
@pytest.mark.parametrize("name", [f"test{x}" for x in range(21)])
def test_pseudoknot_removal(name):
"""
@@ -116,6 +112,7 @@ def test_pseudoknot_removal(name):
# Verify that the number of solutions matches the reference
assert len(reference_solutions) == solutions_count
+
@pytest.mark.parametrize("seed", range(10))
def test_pseudoknot_orders(seed):
"""
@@ -136,7 +133,7 @@ def test_pseudoknot_orders(seed):
for solution in solutions:
# Number of base pairs in the previous order
previous_order = -1
- for order in range(np.max(solution)+1):
+ for order in range(np.max(solution) + 1):
# Ensure that the base pairs of the same order are unknotted
assert (struc.pseudoknots(basepairs[solution == order]) == 0).all()
@@ -148,9 +145,10 @@ def test_pseudoknot_orders(seed):
assert this_order <= previous_order
previous_order = this_order
+
def test_empty_base_pairs():
"""
Assert than an empty array of base pairs generates an empty array of
- pseudoknot orders.
+ pseudoknot orders.
"""
- assert struc.pseudoknots([]).shape == (1,0)
\ No newline at end of file
+ assert struc.pseudoknots([]).shape == (1, 0)
diff --git a/tests/structure/test_rdf.py b/tests/structure/test_rdf.py
index bd072fbbe..273900dac 100644
--- a/tests/structure/test_rdf.py
+++ b/tests/structure/test_rdf.py
@@ -2,147 +2,170 @@
from os.path import join
import numpy as np
import pytest
+from biotite.structure.box import vectors_from_unitcell
from biotite.structure.io import load_structure
from biotite.structure.rdf import rdf
-from biotite.structure.box import vectors_from_unitcell
-from ..util import data_dir, cannot_import
-
+from tests.util import cannot_import, data_dir
TEST_FILE = join(data_dir("structure"), "waterbox.gro")
-@pytest.mark.skipif(
- cannot_import("mdtraj"),
- reason="MDTraj is not installed"
-)
+@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed")
def test_rdf():
- """ General test to reproduce oxygen RDF for a box of water"""
+ """General test to reproduce oxygen RDF for a box of water"""
test_file = TEST_FILE
stack = load_structure(test_file)
# calculate oxygen RDF for water
- oxygen = stack[:, stack.atom_name == 'OW']
+ oxygen = stack[:, stack.atom_name == "OW"]
interval = np.array([0, 10])
n_bins = 100
- bins, g_r = rdf(oxygen[:, 0].coord, oxygen, interval=interval,
- bins=n_bins, periodic=False)
+ bins, g_r = rdf(
+ oxygen[:, 0].coord, oxygen, interval=interval, bins=n_bins, periodic=False
+ )
# Compare with MDTraj
import mdtraj
+
traj = mdtraj.load(TEST_FILE)
- ow = [a.index for a in traj.topology.atoms if a.name == 'O']
+ ow = [a.index for a in traj.topology.atoms if a.name == "O"]
pairs = itertools.product([ow[0]], ow)
- mdt_bins, mdt_g_r = mdtraj.compute_rdf(traj, list(pairs),
- r_range=interval/10, n_bins=n_bins,
- periodic=False)
+ mdt_bins, mdt_g_r = mdtraj.compute_rdf(
+ traj, list(pairs), r_range=interval / 10, n_bins=n_bins, periodic=False
+ )
- assert np.allclose(bins, mdt_bins*10)
+ assert np.allclose(bins, mdt_bins * 10)
assert np.allclose(g_r, mdt_g_r, rtol=0.0001)
def test_rdf_bins():
- """ Test if RDF produce correct bin ranges """
+ """Test if RDF produce correct bin ranges"""
stack = load_structure(TEST_FILE)
center = stack[:, 0]
num_bins = 44
bin_range = (0, 11.7)
bins, g_r = rdf(center, stack, bins=num_bins, interval=bin_range)
- assert(len(bins) == num_bins)
- assert(bins[0] > bin_range[0])
- assert(bins[1] < bin_range[1])
+ assert len(bins) == num_bins
+ assert bins[0] > bin_range[0]
+ assert bins[1] < bin_range[1]
def test_rdf_with_selection():
- """ Test if the selection argument of rdf function works as expected """
+ """Test if the selection argument of rdf function works as expected"""
stack = load_structure(TEST_FILE)
# calculate oxygen RDF for water with and without a selection
- oxygen = stack[:, stack.atom_name == 'OW']
+ oxygen = stack[:, stack.atom_name == "OW"]
interval = np.array([0, 10])
n_bins = 100
- sele = (stack.atom_name == 'OW') & (stack.res_id >= 3)
- bins, g_r = rdf(oxygen[:, 0].coord, stack, selection=sele,
- interval=interval, bins=n_bins, periodic=False)
-
- nosel_bins, nosel_g_r = rdf(oxygen[:, 0].coord, oxygen[:, 1:],
- interval=interval, bins=n_bins, periodic=False)
+ sele = (stack.atom_name == "OW") & (stack.res_id >= 3)
+ bins, g_r = rdf(
+ oxygen[:, 0].coord,
+ stack,
+ selection=sele,
+ interval=interval,
+ bins=n_bins,
+ periodic=False,
+ )
+
+ nosel_bins, nosel_g_r = rdf(
+ oxygen[:, 0].coord,
+ oxygen[:, 1:],
+ interval=interval,
+ bins=n_bins,
+ periodic=False,
+ )
assert np.allclose(bins, nosel_bins)
assert np.allclose(g_r, nosel_g_r)
def test_rdf_atom_argument():
- """ Test if the first argument allows to use AtomArrayStack """
+ """Test if the first argument allows to use AtomArrayStack"""
stack = load_structure(TEST_FILE)
# calculate oxygen RDF for water with and without a selection
- oxygen = stack[:, stack.atom_name == 'OW']
+ oxygen = stack[:, stack.atom_name == "OW"]
interval = np.array([0, 10])
n_bins = 100
- bins, g_r = rdf(oxygen[:, 0], stack, interval=interval,
- bins=n_bins, periodic=False)
+ bins, g_r = rdf(oxygen[:, 0], stack, interval=interval, bins=n_bins, periodic=False)
- atom_bins, atoms_g_r = rdf(oxygen[:, 0].coord, stack, interval=interval,
- bins=n_bins, periodic=False)
+ atom_bins, atoms_g_r = rdf(
+ oxygen[:, 0].coord, stack, interval=interval, bins=n_bins, periodic=False
+ )
assert np.allclose(g_r, atoms_g_r)
def test_rdf_multiple_center():
- """ Test if the first argument allows to use multiple centers"""
+ """Test if the first argument allows to use multiple centers"""
stack = load_structure(TEST_FILE)
# calculate oxygen RDF for water with and without a selection
- oxygen = stack[:, stack.atom_name == 'OW']
+ oxygen = stack[:, stack.atom_name == "OW"]
interval = np.array([0, 10])
n_bins = 100
# averaging individual calculations
- bins1, g_r1 = rdf(oxygen[:, 1].coord, oxygen[:, 2:], interval=interval,
- bins=n_bins, periodic=False)
- bins2, g_r2 = rdf(oxygen[:, 0].coord, oxygen[:, 2:], interval=interval,
- bins=n_bins, periodic=False)
+ bins1, g_r1 = rdf(
+ oxygen[:, 1].coord,
+ oxygen[:, 2:],
+ interval=interval,
+ bins=n_bins,
+ periodic=False,
+ )
+ bins2, g_r2 = rdf(
+ oxygen[:, 0].coord,
+ oxygen[:, 2:],
+ interval=interval,
+ bins=n_bins,
+ periodic=False,
+ )
mean = np.mean([g_r1, g_r2], axis=0)
# this should give the same result as averaging for oxygen 0 and 1
- bins, g_r = rdf(oxygen[:, 0:2].coord, oxygen[:, 2:], interval=interval,
- bins=n_bins, periodic=False)
+ bins, g_r = rdf(
+ oxygen[:, 0:2].coord,
+ oxygen[:, 2:],
+ interval=interval,
+ bins=n_bins,
+ periodic=False,
+ )
assert np.allclose(g_r, mean, rtol=0.0001)
-@pytest.mark.skipif(
- cannot_import("mdtraj"),
- reason="MDTraj is not installed"
-)
+@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed")
def test_rdf_periodic():
- """ Test if the periodic argument gives the correct results"""
+ """Test if the periodic argument gives the correct results"""
test_file = TEST_FILE
stack = load_structure(test_file)
# calculate oxygen RDF for water
- oxygen = stack[:, stack.atom_name == 'OW']
+ oxygen = stack[:, stack.atom_name == "OW"]
interval = np.array([0, 10])
n_bins = 100
- bins, g_r = rdf(oxygen[:, 0].coord, oxygen[:, 1:], interval=interval,
- bins=n_bins, periodic=True)
+ bins, g_r = rdf(
+ oxygen[:, 0].coord, oxygen[:, 1:], interval=interval, bins=n_bins, periodic=True
+ )
# Compare with MDTraj
import mdtraj
+
traj = mdtraj.load(TEST_FILE)
- ow = [a.index for a in traj.topology.atoms if a.name == 'O']
+ ow = [a.index for a in traj.topology.atoms if a.name == "O"]
pairs = itertools.product([ow[0]], ow[1:])
- mdt_bins, mdt_g_r = mdtraj.compute_rdf(traj, list(pairs),
- r_range=interval/10, n_bins=n_bins,
- periodic=True)
+ mdt_bins, mdt_g_r = mdtraj.compute_rdf(
+ traj, list(pairs), r_range=interval / 10, n_bins=n_bins, periodic=True
+ )
- assert np.allclose(bins, mdt_bins*10)
+ assert np.allclose(bins, mdt_bins * 10)
assert np.allclose(g_r, mdt_g_r, rtol=0.0001)
def test_rdf_box():
- """ Test correct use of simulation boxes """
+ """Test correct use of simulation boxes"""
stack = load_structure(TEST_FILE)
box = vectors_from_unitcell(1, 1, 1, 90, 90, 90)
box_stack = np.repeat(box[np.newaxis, :, :], len(stack), axis=0)
@@ -169,16 +192,14 @@ def test_rdf_box():
def test_rdf_normalized():
- """ Assert that the RDF tail is normalized to 1"""
+ """Assert that the RDF tail is normalized to 1"""
test_file = TEST_FILE
stack = load_structure(test_file)
# calculate oxygen RDF for water
- oxygen = stack[:, stack.atom_name == 'OW']
+ oxygen = stack[:, stack.atom_name == "OW"]
interval = np.array([0, 5])
n_bins = 100
- bins, g_r = rdf(oxygen.coord, oxygen, interval=interval,
- bins=n_bins, periodic=True)
+ bins, g_r = rdf(oxygen.coord, oxygen, interval=interval, bins=n_bins, periodic=True)
assert np.allclose(g_r[-10:], np.ones(10), atol=0.1)
-
diff --git a/tests/structure/test_repair.py b/tests/structure/test_repair.py
index 34ba9f622..52bb168a8 100644
--- a/tests/structure/test_repair.py
+++ b/tests/structure/test_repair.py
@@ -2,19 +2,17 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-import biotite.structure as struc
-import biotite.structure.io.pdbx as pdbx
-import numpy as np
from os.path import join
-from ..util import data_dir
+import numpy as np
import pytest
+import biotite.structure as struc
+import biotite.structure.io.pdbx as pdbx
+from tests.util import data_dir
@pytest.fixture
def single_chain():
- pdbx_file = pdbx.BinaryCIFFile.read(
- join(data_dir("structure"), "1l2y.bcif")
- )
+ pdbx_file = pdbx.BinaryCIFFile.read(join(data_dir("structure"), "1l2y.bcif"))
return pdbx.get_structure(pdbx_file, model=1)
@@ -40,35 +38,37 @@ def test_create_continuous_res_ids(multi_chain, restart_each_chain):
test_res_ids, _ = struc.get_residues(multi_chain)
if restart_each_chain:
- assert test_res_ids.tolist() == np.concatenate(
- [np.arange(len(test_res_ids) // 2) + 1] * 2
- ).tolist()
+ assert (
+ test_res_ids.tolist()
+ == np.concatenate([np.arange(len(test_res_ids) // 2) + 1] * 2).tolist()
+ )
else:
- assert test_res_ids.tolist() \
- == (np.arange(len(test_res_ids)) + 1).tolist()
+ assert test_res_ids.tolist() == (np.arange(len(test_res_ids)) + 1).tolist()
@pytest.mark.parametrize(
"name,expected",
- [("CA", "C"),
- ("C", "C"),
- ("CB", "C"),
- ("OD1", "O"),
- ("HD21", "H"),
- ("1H", "H"),
- #("CL", "CL"), # This is an edge case where inference is difficult
- ("HE", "H"),
- ("SD", "S"),
- ("NA", "N"),
- ("NX", "N"),
- ("BE", "BE"),
- ("BEA", "BE"),
- ("K", "K"),
- ("KA", "K"),
- ("QWERT", "")]
+ [
+ ("CA", "C"),
+ ("C", "C"),
+ ("CB", "C"),
+ ("OD1", "O"),
+ ("HD21", "H"),
+ ("1H", "H"),
+ # ("CL", "CL"), # This is an edge case where inference is difficult
+ ("HE", "H"),
+ ("SD", "S"),
+ ("NA", "N"),
+ ("NX", "N"),
+ ("BE", "BE"),
+ ("BEA", "BE"),
+ ("K", "K"),
+ ("KA", "K"),
+ ("QWERT", ""),
+ ],
)
def test_infer_elements(name, expected):
"""
Check if elements are correctly guessed based on known examples.
"""
- assert struc.infer_elements([name])[0] == expected
\ No newline at end of file
+ assert struc.infer_elements([name])[0] == expected
diff --git a/tests/structure/test_residues.py b/tests/structure/test_residues.py
index c3597a73f..024c3e696 100644
--- a/tests/structure/test_residues.py
+++ b/tests/structure/test_residues.py
@@ -2,12 +2,12 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-import biotite.structure as struc
-import biotite.structure.io as strucio
-import numpy as np
from os.path import join
-from ..util import data_dir
+import numpy as np
import pytest
+import biotite.structure as struc
+import biotite.structure.io as strucio
+from tests.util import data_dir
@pytest.fixture
@@ -17,11 +17,11 @@ def array():
def test_apply_residue_wise(array):
data = struc.apply_residue_wise(array, np.ones(len(array)), np.sum)
- assert data.tolist() == [len(array[array.res_id == i])
- for i in range(1, 21)]
+ assert data.tolist() == [len(array[array.res_id == i]) for i in range(1, 21)]
+
def test_spread_residue_wise(array):
- input_data = np.arange(1,21)
+ input_data = np.arange(1, 21)
output_data = struc.spread_residue_wise(array, input_data)
assert output_data.tolist() == array.res_id.tolist()
@@ -41,8 +41,7 @@ def test_get_residue_starts_for(array):
np.random.seed(0)
indices = np.random.randint(0, array.array_length(), SAMPLE_SIZE)
ref_starts = np.array(
- [np.where(mask)[0][0] for mask
- in struc.get_residue_masks(array, indices)]
+ [np.where(mask)[0][0] for mask in struc.get_residue_masks(array, indices)]
)
test_starts = struc.get_residue_starts_for(array, indices)
assert test_starts.tolist() == ref_starts.tolist()
@@ -51,16 +50,32 @@ def test_get_residue_starts_for(array):
def test_get_residues(array):
ids, names = struc.get_residues(array)
assert ids.tolist() == list(range(1, 21))
- assert names.tolist() == ["ASN","LEU","TYR","ILE","GLN","TRP","LEU","LYS",
- "ASP","GLY","GLY","PRO","SER","SER","GLY","ARG",
- "PRO","PRO","PRO","SER"]
+ assert names.tolist() == [
+ "ASN",
+ "LEU",
+ "TYR",
+ "ILE",
+ "GLN",
+ "TRP",
+ "LEU",
+ "LYS",
+ "ASP",
+ "GLY",
+ "GLY",
+ "PRO",
+ "SER",
+ "SER",
+ "GLY",
+ "ARG",
+ "PRO",
+ "PRO",
+ "PRO",
+ "SER",
+ ]
assert len(ids) == struc.get_residue_count(array)
def test_residue_iter(array):
- centroid = [struc.centroid(res).tolist()
- for res in struc.residue_iter(array)]
- ref_centroid = struc.apply_residue_wise(
- array, array.coord, np.average, axis=0
- )
- assert centroid == ref_centroid.tolist()
\ No newline at end of file
+ centroid = [struc.centroid(res).tolist() for res in struc.residue_iter(array)]
+ ref_centroid = struc.apply_residue_wise(array, array.coord, np.average, axis=0)
+ assert centroid == ref_centroid.tolist()
diff --git a/tests/structure/test_sasa.py b/tests/structure/test_sasa.py
index 12827f533..a5f9d0171 100644
--- a/tests/structure/test_sasa.py
+++ b/tests/structure/test_sasa.py
@@ -3,58 +3,58 @@
# information.
from os.path import join
-import pytest
import numpy as np
+import pytest
import biotite.structure as struc
import biotite.structure.io.pdb as pdb
import biotite.structure.io.pdbx as pdbx
-from ..util import data_dir, cannot_import
+from tests.util import cannot_import, data_dir
# Ignore warning about dummy unit cell vector
@pytest.mark.filterwarnings("ignore")
-@pytest.mark.skipif(
- cannot_import("mdtraj"),
- reason="MDTraj is not installed"
-)
+@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed")
@pytest.mark.parametrize("pdb_id", ["1l2y", "1gya"])
def test_single(pdb_id):
- file_name = join(data_dir("structure"), pdb_id+".pdb")
+ file_name = join(data_dir("structure"), pdb_id + ".pdb")
# Single atom SASA, compare with MDTraj
file = pdb.PDBFile.read(file_name)
array = file.get_structure(model=1)
sasa = struc.sasa(array, vdw_radii="Single", point_number=5000)
- from biotite.structure.info.radii import _SINGLE_RADII as radii
import mdtraj
+ from biotite.structure.info.radii import _SINGLE_RADII as SINGLE_RADII
+
# Use the same atom radii
- radii = {element.capitalize() : radius / 10
- for element, radius in radii.items()}
+ radii = {
+ element.capitalize(): radius / 10 for element, radius in SINGLE_RADII.items()
+ }
traj = mdtraj.load(file_name)
# Conversion from nm^2 to A^2
- sasa_exp = mdtraj.shrake_rupley(
- traj, change_radii=radii, n_sphere_points=5000
- )[0] * 100
-
+ sasa_exp = (
+ mdtraj.shrake_rupley(traj, change_radii=radii, n_sphere_points=5000)[0] * 100
+ )
# Assert that more than 90% of atoms
# have less than 10% SASA difference
- assert np.count_nonzero(
- np.isclose(sasa, sasa_exp, rtol=1e-1, atol=1e-1)
- ) / len(sasa) > 0.9
+ assert (
+ np.count_nonzero(np.isclose(sasa, sasa_exp, rtol=1e-1, atol=1e-1)) / len(sasa)
+ > 0.9
+ )
# Assert that more than 98% of atoms
# have less than 1% SASA difference
- assert np.count_nonzero(
- np.isclose(sasa, sasa_exp, rtol=1e-2, atol=1e-1)
- ) / len(sasa) > 0.98
+ assert (
+ np.count_nonzero(np.isclose(sasa, sasa_exp, rtol=1e-2, atol=1e-1)) / len(sasa)
+ > 0.98
+ )
@pytest.mark.parametrize("pdb_id", ["1l2y", "1gya"])
def test_coarse_grained(pdb_id):
# Multi atom SASA (ProtOr), compare with single atom SASA
# on residue level
- file = pdbx.BinaryCIFFile.read(join(data_dir("structure"), pdb_id+".bcif"))
+ file = pdbx.BinaryCIFFile.read(join(data_dir("structure"), pdb_id + ".bcif"))
array = pdbx.get_structure(file, model=1)
array = array[struc.filter_amino_acids(array)]
sasa = struc.apply_residue_wise(
@@ -66,11 +66,13 @@ def test_coarse_grained(pdb_id):
# Assert that more than 90% of atoms
# have less than 10% SASA difference
- assert np.count_nonzero(
- np.isclose(sasa, sasa_exp, rtol=1e-1, atol=1)
- ) / len(sasa) > 0.9
+ assert (
+ np.count_nonzero(np.isclose(sasa, sasa_exp, rtol=1e-1, atol=1)) / len(sasa)
+ > 0.9
+ )
# Assert that more than 98% of atoms
# have less than 40% SASA difference
- assert np.count_nonzero(
- np.isclose(sasa, sasa_exp, rtol=4e-1, atol=1)
- ) / len(sasa) > 0.98
\ No newline at end of file
+ assert (
+ np.count_nonzero(np.isclose(sasa, sasa_exp, rtol=4e-1, atol=1)) / len(sasa)
+ > 0.98
+ )
diff --git a/tests/structure/test_sequence.py b/tests/structure/test_sequence.py
index 098958824..e8bbd337a 100644
--- a/tests/structure/test_sequence.py
+++ b/tests/structure/test_sequence.py
@@ -5,16 +5,14 @@
import glob
from os.path import join
import pytest
-import biotite.structure as struc
import biotite.sequence as seq
import biotite.sequence.align as align
+import biotite.structure as struc
import biotite.structure.io.pdbx as pdbx
-from ..util import data_dir
+from tests.util import data_dir
-@pytest.mark.parametrize(
- "path", glob.glob(join(data_dir("structure"), "*.bcif"))
-)
+@pytest.mark.parametrize("path", glob.glob(join(data_dir("structure"), "*.bcif")))
def test_pdbx_sequence_consistency(path):
"""
Check if sequences created with :func:`to_sequence()` are equal to
@@ -61,8 +59,7 @@ def _find_best_match(sequence, ref_sequences):
else:
matrix = align.SubstitutionMatrix.std_nucleotide_matrix()
alignment = align.align_optimal(
- sequence, ref_sequence, matrix,
- terminal_penalty=False, max_number=1
+ sequence, ref_sequence, matrix, terminal_penalty=False, max_number=1
)[0]
# The 'shortest' identity is 1.0, if every residue in the
# test sequence is aligned to an identical residue
@@ -70,4 +67,4 @@ def _find_best_match(sequence, ref_sequences):
if identity > best_identity:
best_alignment = alignment
best_identity = identity
- return best_alignment, best_identity
\ No newline at end of file
+ return best_alignment, best_identity
diff --git a/tests/structure/test_sse.py b/tests/structure/test_sse.py
index 30b6d75cf..543175ff3 100644
--- a/tests/structure/test_sse.py
+++ b/tests/structure/test_sse.py
@@ -6,10 +6,10 @@
from os.path import join
import numpy as np
import pytest
+import biotite.sequence.io.fasta as fasta
import biotite.structure as struc
import biotite.structure.io.pdbx as pdbx
-import biotite.sequence.io.fasta as fasta
-from ..util import data_dir
+from tests.util import data_dir
def test_sse():
@@ -23,18 +23,14 @@ def test_sse():
matches = 0
total = 0
- ref_psea_file = fasta.FastaFile.read(
- join(data_dir("structure"), "psea.fasta")
- )
+ ref_psea_file = fasta.FastaFile.read(join(data_dir("structure"), "psea.fasta"))
for pdb_id in ref_psea_file:
ref_sse = np.array(list(ref_psea_file[pdb_id]))
atoms = pdbx.get_structure(
- pdbx.BinaryCIFFile.read(
- join(data_dir("structure"), f"{pdb_id}.bcif")
- ),
- model=1
+ pdbx.BinaryCIFFile.read(join(data_dir("structure"), f"{pdb_id}.bcif")),
+ model=1,
)
atoms = atoms[struc.filter_canonical_amino_acids(atoms)]
if atoms.array_length() == 0:
@@ -51,9 +47,9 @@ def test_sse():
np.random.seed(0)
-@pytest.mark.parametrize(
- "discont_pos", np.random.randint(2, 105, size=100)
-)
+
+
+@pytest.mark.parametrize("discont_pos", np.random.randint(2, 105, size=100))
def test_sse_discontinuity(discont_pos):
"""
Check if discontinuities are properly handled by inserting a
@@ -61,8 +57,7 @@ def test_sse_discontinuity(discont_pos):
proximity becomes 'coil'.
"""
atoms = pdbx.get_structure(
- pdbx.BinaryCIFFile.read(join(data_dir("structure"), "1gya.bcif")),
- model=1
+ pdbx.BinaryCIFFile.read(join(data_dir("structure"), "1gya.bcif")), model=1
)
atoms = atoms[struc.filter_canonical_amino_acids(atoms)]
@@ -72,7 +67,7 @@ def test_sse_discontinuity(discont_pos):
assert len(struc.check_res_id_continuity(atoms)) == 0
# Introduce discontinuity
res_starts = struc.get_residue_starts(atoms)
- atoms.res_id[res_starts[discont_pos]:] += 1
+ atoms.res_id[res_starts[discont_pos] :] += 1
test_sse = struc.annotate_sse(atoms)
assert len(test_sse) == len(ref_sse)
@@ -89,9 +84,7 @@ def test_sse_discontinuity(discont_pos):
assert (test_sse[discont_proximity] == "c").all()
-@pytest.mark.parametrize(
- "file_name", glob.glob(join(data_dir("structure"), "*.bcif"))
-)
+@pytest.mark.parametrize("file_name", glob.glob(join(data_dir("structure"), "*.bcif")))
def test_sse_non_peptide(file_name):
"""
Test whether only amino acids get SSE annotated.
@@ -101,9 +94,7 @@ def test_sse_non_peptide(file_name):
# Special case for PDB 5EIL:
# The residue BP5 is an amino acid, but has no CA
# -> rename analogous atom
- atoms.atom_name[
- (atoms.res_name == "BP5") & (atoms.atom_name == "C13")
- ] = "CA"
+ atoms.atom_name[(atoms.res_name == "BP5") & (atoms.atom_name == "C13")] = "CA"
sse = struc.annotate_sse(atoms)
peptide_mask = struc.filter_amino_acids(atoms)
@@ -111,4 +102,4 @@ def test_sse_non_peptide(file_name):
peptide_mask = peptide_mask[struc.get_residue_starts(atoms)]
assert np.all(np.isin(sse[peptide_mask], ["a", "b", "c"]))
- assert np.all(sse[~peptide_mask] == "")
\ No newline at end of file
+ assert np.all(sse[~peptide_mask] == "")
diff --git a/tests/structure/test_superimpose.py b/tests/structure/test_superimpose.py
index fd9514734..70a70e56e 100755
--- a/tests/structure/test_superimpose.py
+++ b/tests/structure/test_superimpose.py
@@ -9,9 +9,8 @@
import pytest
import biotite.structure as struc
import biotite.structure.io as strucio
-import biotite.structure as struc
from biotite.structure.superimpose import _multi_matmul as multi_matmul
-from ..util import data_dir
+from tests.util import data_dir
def test_transform_as_matrix():
@@ -30,7 +29,7 @@ def test_transform_as_matrix():
# This is not really a rotation matrix,
# but the same maths apply
rotation=np.random.rand(N_MODELS, 3, 3),
- target_translation=np.random.rand(N_MODELS, 3)
+ target_translation=np.random.rand(N_MODELS, 3),
)
ref_coord = transform.apply(orig_coord)
@@ -41,15 +40,13 @@ def test_transform_as_matrix():
test_coord_4 = multi_matmul(transform.as_matrix(), orig_coord_4)
test_coord = test_coord_4[..., :3]
- assert test_coord.flatten().tolist() \
- == pytest.approx(ref_coord.flatten().tolist(), abs=1e-6)
+ assert test_coord.flatten().tolist() == pytest.approx(
+ ref_coord.flatten().tolist(), abs=1e-6
+ )
@pytest.mark.parametrize(
- "seed, multi_model", itertools.product(
- range(10),
- [False, True]
- )
+ "seed, multi_model", itertools.product(range(10), [False, True])
)
def test_restoration(seed, multi_model):
"""
@@ -70,8 +67,9 @@ def test_restoration(seed, multi_model):
test_coord = _transform_random_affine(ref_coord)
test_coord, _ = struc.superimpose(ref_coord, test_coord)
- assert test_coord.flatten().tolist() \
- == pytest.approx(ref_coord.flatten().tolist(), abs=1e-6)
+ assert test_coord.flatten().tolist() == pytest.approx(
+ ref_coord.flatten().tolist(), abs=1e-6
+ )
def test_rotation_matrix():
@@ -83,28 +81,23 @@ def test_rotation_matrix():
N_COORD = 100
# A rotation matrix that rotates 90 degrees around the z-axis
- ref_rotation = np.array([
- [0, -1, 0],
- [1, 0, 0],
- [0, 0, 1]
- ])
+ ref_rotation = np.array([[0, -1, 0], [1, 0, 0], [0, 0, 1]])
np.random.seed(0)
original_coord = np.random.rand(N_COORD, 3)
# Rotate about 90 degrees around z-axis
- rotated_coord = struc.rotate(original_coord, angles=(0, 0, np.pi/2))
+ rotated_coord = struc.rotate(original_coord, angles=(0, 0, np.pi / 2))
_, transform = struc.superimpose(rotated_coord, original_coord)
test_rotation = transform.rotation
- assert test_rotation.flatten().tolist() \
- == pytest.approx(ref_rotation.flatten().tolist(), abs=1e-6)
+ assert test_rotation.flatten().tolist() == pytest.approx(
+ ref_rotation.flatten().tolist(), abs=1e-6
+ )
@pytest.mark.parametrize(
- "path, coord_only", itertools.product(
- glob.glob(join(data_dir("structure"), "*.bcif")),
- [False, True]
- )
+ "path, coord_only",
+ itertools.product(glob.glob(join(data_dir("structure"), "*.bcif")), [False, True]),
)
def test_superimposition_array(path, coord_only):
"""
@@ -116,16 +109,14 @@ def test_superimposition_array(path, coord_only):
fixed = strucio.load_structure(path, model=1)
mobile = fixed.copy()
- mobile = struc.rotate(mobile, (1,2,3))
- mobile = struc.translate(mobile, (1,2,3))
+ mobile = struc.rotate(mobile, (1, 2, 3))
+ mobile = struc.translate(mobile, (1, 2, 3))
if coord_only:
fixed = fixed.coord
mobile = mobile.coord
- fitted, transformation = struc.superimpose(
- fixed, mobile
- )
+ fitted, transformation = struc.superimpose(fixed, mobile)
if coord_only:
assert isinstance(fitted, np.ndarray)
@@ -150,7 +141,7 @@ def test_superimposition_stack(ca_only):
fixed = stack[0]
mobile = stack[1:]
if ca_only:
- mask = (mobile.atom_name == "CA")
+ mask = mobile.atom_name == "CA"
else:
mask = None
@@ -160,15 +151,13 @@ def test_superimposition_stack(ca_only):
# The superimpositions are better for most cases than the
# superimpositions in the structure file
# -> Use average
- assert np.mean(struc.rmsd(fixed, fitted)) \
- < np.mean(struc.rmsd(fixed, mobile))
+ assert np.mean(struc.rmsd(fixed, fitted)) < np.mean(struc.rmsd(fixed, mobile))
else:
# The superimpositions are better than the superimpositions
# in the structure file
assert (struc.rmsd(fixed, fitted) < struc.rmsd(fixed, mobile)).all()
-
@pytest.mark.parametrize("seed", range(5))
def test_masked_superimposition(seed):
"""
@@ -188,25 +177,19 @@ def test_masked_superimposition(seed):
# The distance between the atom in both models should not be
# already 0 prior to superimposition
- assert struc.distance(fixed[mask], mobile[mask])[0] \
- != pytest.approx(0, abs=5e-4)
+ assert struc.distance(fixed[mask], mobile[mask])[0] != pytest.approx(0, abs=5e-4)
- fitted, transformation = struc.superimpose(
- fixed, mobile, mask
- )
+ fitted, transformation = struc.superimpose(fixed, mobile, mask)
- assert struc.distance(fixed[mask], fitted[mask])[0] \
- == pytest.approx(0, abs=5e-4)
+ assert struc.distance(fixed[mask], fitted[mask])[0] == pytest.approx(0, abs=5e-4)
fitted = transformation.apply(mobile)
- struc.distance(fixed[mask], fitted[mask])[0] \
- == pytest.approx(0, abs=5e-4)
+ struc.distance(fixed[mask], fitted[mask])[0] == pytest.approx(0, abs=5e-4)
@pytest.mark.parametrize(
- "single_model, single_atom",
- itertools.product([False, True], [False, True])
+ "single_model, single_atom", itertools.product([False, True], [False, True])
)
def test_input_shapes(single_model, single_atom):
"""
@@ -258,24 +241,25 @@ def test_outlier_detection(seed):
superimposed_coord, _, anchors = struc.superimpose_without_outliers(
# Increase the threshold a bit,
# to ensure that no inlier is classified as outlier
- fixed_coord, mobile_coord, outlier_threshold=3.0
+ fixed_coord,
+ mobile_coord,
+ outlier_threshold=3.0,
)
test_outlier_mask = np.full(N_COORD, True)
test_outlier_mask[anchors] = False
assert test_outlier_mask.tolist() == ref_outlier_mask.tolist()
# Without the outliers, the RMSD should be in the noise range
- assert struc.rmsd(
- fixed_coord[~ref_outlier_mask], superimposed_coord[~ref_outlier_mask]
- ) < NOISE
+ assert (
+ struc.rmsd(
+ fixed_coord[~ref_outlier_mask], superimposed_coord[~ref_outlier_mask]
+ )
+ < NOISE
+ )
@pytest.mark.parametrize(
- "multi_model, coord_only",
- itertools.product(
- [False, True],
- [False, True]
- )
+ "multi_model, coord_only", itertools.product([False, True], [False, True])
)
def test_superimpose_without_outliers_inputs(multi_model, coord_only):
"""
@@ -289,9 +273,7 @@ def test_superimpose_without_outliers_inputs(multi_model, coord_only):
if coord_only:
atoms = atoms.coord
- superimposed, transform, _ = struc.superimpose_without_outliers(
- atoms, atoms
- )
+ superimposed, transform, _ = struc.superimpose_without_outliers(atoms, atoms)
assert type(superimposed) == type(atoms)
assert superimposed.shape == atoms.shape
@@ -313,7 +295,7 @@ def test_superimpose_without_outliers_inputs(multi_model, coord_only):
("1aki", "A", True),
("4gxy", "A", False), # is a nucleic acid
("4gxy", "A", True),
- ]
+ ],
)
def test_superimpose_homologs(pdb_id, chain_id, as_stack):
"""
@@ -342,8 +324,10 @@ def test_superimpose_homologs(pdb_id, chain_id, as_stack):
)
# Check if corresponding residues were superimposed
- assert fixed_atoms.res_id[fix_anchors].tolist() \
+ assert (
+ fixed_atoms.res_id[fix_anchors].tolist()
== mobile_atoms.res_id[mob_anchors].tolist()
+ )
# If a stack, it only contains one model
if as_stack:
fixed_atoms = fixed_atoms[0]
@@ -355,15 +339,14 @@ def test_superimpose_homologs(pdb_id, chain_id, as_stack):
def _transform_random_affine(coord):
coord = struc.translate(coord, np.random.rand(3))
- coord = struc.rotate(coord, np.random.uniform(low=0, high=2*np.pi, size=3))
+ coord = struc.rotate(coord, np.random.uniform(low=0, high=2 * np.pi, size=3))
return coord
def _delete_random_residues(atoms, p_conservation):
residue_starts = struc.get_residue_starts(atoms)
conserved_residue_starts = np.random.choice(
- residue_starts, size=int(p_conservation * len(residue_starts)),
- replace=False
+ residue_starts, size=int(p_conservation * len(residue_starts)), replace=False
)
conservation_mask = np.any(
struc.get_residue_masks(atoms, conserved_residue_starts), axis=0
diff --git a/tests/structure/test_trajectory.py b/tests/structure/test_trajectory.py
index e4a9a1ba3..8bdb0da9a 100644
--- a/tests/structure/test_trajectory.py
+++ b/tests/structure/test_trajectory.py
@@ -2,33 +2,27 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-from tempfile import NamedTemporaryFile
import itertools
-import glob
-from os.path import join, basename
+from os.path import join
+from tempfile import NamedTemporaryFile
import numpy as np
import pytest
import biotite.structure as struc
import biotite.structure.io as strucio
-import biotite.structure.io.xtc as xtc
-import biotite.structure.io.trr as trr
-import biotite.structure.io.tng as tng
import biotite.structure.io.dcd as dcd
import biotite.structure.io.netcdf as netcdf
-from ..util import data_dir, cannot_import
+import biotite.structure.io.tng as tng
+import biotite.structure.io.trr as trr
+import biotite.structure.io.xtc as xtc
+from tests.util import cannot_import, data_dir
-@pytest.mark.skipif(
- cannot_import("mdtraj"),
- reason="MDTraj is not installed"
-)
+@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed")
@pytest.mark.parametrize("format", ["trr", "xtc", "tng", "dcd", "netcdf"])
def test_array_conversion(format):
- template = strucio.load_structure(
- join(data_dir("structure"), "1l2y.bcif")
- )[0]
+ template = strucio.load_structure(join(data_dir("structure"), "1l2y.bcif"))[0]
# Add fake box
- template.box = np.diag([1,2,3])
+ template.box = np.diag([1, 2, 3])
if format == "trr":
traj_file_cls = trr.TRRFile
if format == "xtc":
@@ -39,9 +33,7 @@ def test_array_conversion(format):
traj_file_cls = dcd.DCDFile
if format == "netcdf":
traj_file_cls = netcdf.NetCDFFile
- traj_file = traj_file_cls.read(
- join(data_dir("structure"), f"1l2y.{format}")
- )
+ traj_file = traj_file_cls.read(join(data_dir("structure"), f"1l2y.{format}"))
ref_array = traj_file.get_structure(template)
traj_file = traj_file_cls()
@@ -58,10 +50,7 @@ def test_array_conversion(format):
assert ref_array.coord == pytest.approx(array.coord, abs=1e-2)
-@pytest.mark.skipif(
- cannot_import("mdtraj"),
- reason="MDTraj is not installed"
-)
+@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed")
@pytest.mark.parametrize(
"format, start, stop, step, chunk_size",
itertools.product(
@@ -69,8 +58,8 @@ def test_array_conversion(format):
[None, 2],
[None, 17],
[None, 2],
- [None, 3]
- )
+ [None, 3],
+ ),
)
def test_bcif_consistency(format, start, stop, step, chunk_size):
if format == "netcdf" and stop is not None and step is not None:
@@ -97,7 +86,10 @@ def test_bcif_consistency(format, start, stop, step, chunk_size):
traj_file_cls = netcdf.NetCDFFile
traj_file = traj_file_cls.read(
join(data_dir("structure"), f"1l2y.{format}"),
- start, stop, step, chunk_size=chunk_size
+ start,
+ stop,
+ step,
+ chunk_size=chunk_size,
)
test_traj = traj_file.get_structure(template)
test_traj_time = traj_file.get_time()
@@ -108,10 +100,9 @@ def test_bcif_consistency(format, start, stop, step, chunk_size):
# Shift to ensure time starts at 0
test_traj_time -= 1
start = start if start is not None else 0
- stop = stop if stop is not None else 38 # 38 models in 1l2y
+ stop = stop if stop is not None else 38 # 38 models in 1l2y
step = step if step is not None else 1
- assert test_traj_time.astype(int).tolist() \
- == list(range(start, stop, step))
+ assert test_traj_time.astype(int).tolist() == list(range(start, stop, step))
assert test_traj.stack_depth() == ref_traj.stack_depth()
# 1l2y has no box
@@ -121,10 +112,7 @@ def test_bcif_consistency(format, start, stop, step, chunk_size):
assert test_traj.coord == pytest.approx(ref_traj.coord, abs=1e-2)
-@pytest.mark.skipif(
- cannot_import("mdtraj"),
- reason="MDTraj is not installed"
-)
+@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed")
@pytest.mark.parametrize(
"format, start, stop, step, stack_size",
itertools.product(
@@ -132,8 +120,8 @@ def test_bcif_consistency(format, start, stop, step, chunk_size):
[None, 2],
[None, 17],
[None, 2],
- [None, 2, 3]
- )
+ [None, 2, 3],
+ ),
)
def test_read_iter(format, start, stop, step, stack_size):
"""
@@ -176,7 +164,7 @@ def test_read_iter(format, start, stop, step, stack_size):
# Convert list to NumPy array
combination_func = np.stack if stack_size is None else np.concatenate
- test_coord =combination_func(test_coord)
+ test_coord = combination_func(test_coord)
if test_box[0] is not None:
test_box = combination_func(test_box)
else:
@@ -197,10 +185,7 @@ def test_read_iter(format, start, stop, step, stack_size):
assert test_time.tolist() == ref_time.tolist()
-@pytest.mark.skipif(
- cannot_import("mdtraj"),
- reason="MDTraj is not installed"
-)
+@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed")
@pytest.mark.parametrize(
"format, start, stop, step, stack_size",
itertools.product(
@@ -208,8 +193,8 @@ def test_read_iter(format, start, stop, step, stack_size):
[None, 2],
[None, 17],
[None, 2],
- [None, 2, 3]
- )
+ [None, 2, 3],
+ ),
)
def test_read_iter_structure(format, start, stop, step, stack_size):
"""
@@ -241,9 +226,12 @@ def test_read_iter_structure(format, start, stop, step, stack_size):
traj_file = traj_file_cls.read(file_name, start, stop, step)
ref_traj = traj_file.get_structure(template)
- frames = [frame for frame in traj_file_cls.read_iter_structure(
- file_name, template, start, stop, step, stack_size=stack_size
- )]
+ frames = [
+ frame
+ for frame in traj_file_cls.read_iter_structure(
+ file_name, template, start, stop, step, stack_size=stack_size
+ )
+ ]
if stack_size is None:
assert isinstance(frames[0], struc.AtomArray)
@@ -255,10 +243,7 @@ def test_read_iter_structure(format, start, stop, step, stack_size):
assert test_traj == ref_traj
-@pytest.mark.skipif(
- cannot_import("mdtraj"),
- reason="MDTraj is not installed"
-)
+@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed")
@pytest.mark.parametrize(
"format, n_models, n_atoms, include_box, include_time",
itertools.product(
@@ -267,7 +252,7 @@ def test_read_iter_structure(format, start, stop, step, stack_size):
[1, 1000],
[False, True],
[False, True],
- )
+ ),
)
def test_write_iter(format, n_models, n_atoms, include_box, include_time):
"""
@@ -297,7 +282,6 @@ def test_write_iter(format, n_models, n_atoms, include_box, include_time):
# time is evenly spaced for TNG compatibility
time = np.linspace(0, 10, n_models) if include_time else None
-
ref_file = NamedTemporaryFile("w+b")
traj_file = traj_file_cls()
traj_file.set_coord(coord)
@@ -311,7 +295,6 @@ def test_write_iter(format, n_models, n_atoms, include_box, include_time):
ref_time = traj_file.get_time()
ref_file.close()
-
test_file = NamedTemporaryFile("w+b")
traj_file_cls.write_iter(test_file.name, coord, box, time)
@@ -321,9 +304,8 @@ def test_write_iter(format, n_models, n_atoms, include_box, include_time):
test_time = traj_file.get_time()
test_file.close()
-
assert np.allclose(test_coord, ref_coord, atol=1e-2)
if include_box:
assert np.allclose(test_box, ref_box, atol=1e-2)
if include_time:
- assert np.allclose(test_time, ref_time, atol=1e-2)
\ No newline at end of file
+ assert np.allclose(test_time, ref_time, atol=1e-2)
diff --git a/tests/structure/test_transform.py b/tests/structure/test_transform.py
index 12c42aa0b..33a794c20 100644
--- a/tests/structure/test_transform.py
+++ b/tests/structure/test_transform.py
@@ -8,13 +8,13 @@
import pytest
import biotite.structure as struc
import biotite.structure.io.pdbx as pdbx
-from ..util import data_dir
+from tests.util import data_dir
@pytest.fixture(
params=itertools.product(
- [1, 2, 3], # ndim
- [False, True] # as_coord
+ [1, 2, 3], # ndim
+ [False, True], # as_coord
)
)
def input_atoms(request):
@@ -28,7 +28,7 @@ def input_atoms(request):
atoms = atoms[0]
elif ndim == 1:
# Only one atom
- atoms = atoms[0,0]
+ atoms = atoms[0, 0]
if as_coord:
return atoms.coord
@@ -62,13 +62,11 @@ def test_translate(input_atoms, ndim, as_list, random_seed):
assert type(restored) == type(input_atoms)
assert struc.coord(restored).shape == struc.coord(input_atoms).shape
- assert np.allclose(
- struc.coord(restored), struc.coord(input_atoms), atol=1e-5
- )
+ assert np.allclose(struc.coord(restored), struc.coord(input_atoms), atol=1e-5)
@pytest.mark.parametrize("as_list", [False, True])
-@pytest.mark.parametrize("axis", [0, 1, 2]) # x, y, z
+@pytest.mark.parametrize("axis", [0, 1, 2]) # x, y, z
@pytest.mark.parametrize("random_seed", np.arange(5))
@pytest.mark.parametrize("centered", [False, True])
def test_rotate(input_atoms, as_list, axis, random_seed, centered):
@@ -78,7 +76,7 @@ def test_rotate(input_atoms, as_list, axis, random_seed, centered):
"""
np.random.seed(random_seed)
angles = np.zeros(3)
- angles[axis] = np.random.rand() * 2*np.pi
+ angles[axis] = np.random.rand() * 2 * np.pi
neg_angles = -angles
if as_list:
angles = angles.tolist()
@@ -91,18 +89,16 @@ def test_rotate(input_atoms, as_list, axis, random_seed, centered):
assert type(restored) == type(input_atoms)
assert struc.coord(restored).shape == struc.coord(input_atoms).shape
print(np.max(np.abs(struc.coord(restored) - struc.coord(input_atoms))))
- assert np.allclose(
- struc.coord(restored), struc.coord(input_atoms), atol=1e-5
- )
+ assert np.allclose(struc.coord(restored), struc.coord(input_atoms), atol=1e-5)
if centered and struc.coord(input_atoms).ndim > 1:
assert np.allclose(
struc.centroid(restored), struc.centroid(input_atoms), atol=1e-5
)
-@pytest.mark.parametrize("x", [0, 2*np.pi])
-@pytest.mark.parametrize("y", [0, 2*np.pi])
-@pytest.mark.parametrize("z", [0, 2*np.pi])
+@pytest.mark.parametrize("x", [0, 2 * np.pi])
+@pytest.mark.parametrize("y", [0, 2 * np.pi])
+@pytest.mark.parametrize("z", [0, 2 * np.pi])
@pytest.mark.parametrize("centered", [False, True])
def test_rotate_360(input_atoms, x, y, z, centered):
"""
@@ -114,9 +110,7 @@ def test_rotate_360(input_atoms, x, y, z, centered):
assert type(rotated) == type(input_atoms)
assert struc.coord(rotated).shape == struc.coord(input_atoms).shape
- assert np.allclose(
- struc.coord(rotated), struc.coord(input_atoms), atol=1e-5
- )
+ assert np.allclose(struc.coord(rotated), struc.coord(input_atoms), atol=1e-5)
if centered and struc.coord(input_atoms).ndim > 1:
assert np.allclose(
struc.centroid(rotated), struc.centroid(input_atoms), atol=1e-5
@@ -129,7 +123,7 @@ def test_rotate_known(ndim):
Rotate a vector at the Y-axis about the X-axis by 90 degrees and
expect a rotated vector at the Z-axis.
"""
- shape = (1,) * (ndim-1) + (3,)
+ shape = (1,) * (ndim - 1) + (3,)
vector = np.zeros(shape)
vector[...] = [0, 1, 0]
@@ -143,7 +137,7 @@ def test_rotate_known(ndim):
assert np.allclose(test_rotated, exp_rotated, atol=1e-5)
-@pytest.mark.parametrize("axis", [0, 1, 2]) # x, y, z
+@pytest.mark.parametrize("axis", [0, 1, 2]) # x, y, z
@pytest.mark.parametrize("random_seed", np.arange(5))
def test_rotate_measure(axis, random_seed):
"""
@@ -166,8 +160,7 @@ def test_rotate_measure(axis, random_seed):
test_angle = struc.angle(rotated, 0, input_coord)
# Vector length should be unchanged
- assert np.linalg.norm(rotated) \
- == pytest.approx(np.linalg.norm(input_coord))
+ assert np.linalg.norm(rotated) == pytest.approx(np.linalg.norm(input_coord))
assert test_angle == pytest.approx(ref_angle)
@@ -193,12 +186,10 @@ def test_rotate_about_axis(input_atoms, as_list, use_support, random_seed):
assert type(restored) == type(input_atoms)
assert struc.coord(restored).shape == struc.coord(input_atoms).shape
- assert np.allclose(
- struc.coord(restored), struc.coord(input_atoms), atol=1e-5
- )
+ assert np.allclose(struc.coord(restored), struc.coord(input_atoms), atol=1e-5)
-@pytest.mark.parametrize("axis", [0, 1, 2]) # x, y, z
+@pytest.mark.parametrize("axis", [0, 1, 2]) # x, y, z
@pytest.mark.parametrize("random_seed", np.arange(5))
def test_rotate_about_axis_consistency(input_atoms, axis, random_seed):
"""
@@ -215,13 +206,15 @@ def test_rotate_about_axis_consistency(input_atoms, axis, random_seed):
rot_axis = np.zeros(3)
# Length of axis should be irrelevant
rot_axis[axis] = np.random.rand()
- test_rotated = struc.rotate_about_axis(input_atoms, rot_axis, angle,)
+ test_rotated = struc.rotate_about_axis(
+ input_atoms,
+ rot_axis,
+ angle,
+ )
assert type(test_rotated) == type(ref_rotated)
assert struc.coord(test_rotated).shape == struc.coord(ref_rotated).shape
- assert np.allclose(
- struc.coord(test_rotated), struc.coord(ref_rotated), atol=1e-5
- )
+ assert np.allclose(struc.coord(test_rotated), struc.coord(ref_rotated), atol=1e-5)
@pytest.mark.parametrize("random_seed", np.arange(5))
@@ -233,26 +226,27 @@ def test_rotate_about_axis_360(input_atoms, random_seed, use_support):
"""
np.random.seed(random_seed)
axis = np.random.rand(3)
- support = np.random.rand(3) if use_support else None
+ support = np.random.rand(3) if use_support else None
- rotated = struc.rotate_about_axis(input_atoms, axis, 2*np.pi, support)
+ rotated = struc.rotate_about_axis(input_atoms, axis, 2 * np.pi, support)
assert type(rotated) == type(input_atoms)
assert struc.coord(rotated).shape == struc.coord(input_atoms).shape
- assert np.allclose(
- struc.coord(rotated), struc.coord(input_atoms), atol=1e-5
- )
+ assert np.allclose(struc.coord(rotated), struc.coord(input_atoms), atol=1e-5)
@pytest.mark.parametrize("as_list", [False, True])
-@pytest.mark.parametrize("order", (
- np.array([0, 1, 2]),
- np.array([0, 2, 1]),
- np.array([1, 0, 2]),
- np.array([2, 0, 1]),
- np.array([2, 1, 0]),
- np.array([1, 2, 0]),
-))
+@pytest.mark.parametrize(
+ "order",
+ (
+ np.array([0, 1, 2]),
+ np.array([0, 2, 1]),
+ np.array([1, 0, 2]),
+ np.array([2, 0, 1]),
+ np.array([2, 1, 0]),
+ np.array([1, 2, 0]),
+ ),
+)
def test_orient_principal_components(input_atoms, as_list, order):
"""
Orient atoms such that the variance in each axis is greatest
@@ -295,8 +289,8 @@ def test_align_vectors(input_atoms, as_list, use_support, random_seed):
source_direction = np.random.rand(3)
target_direction = np.random.rand(3)
if use_support:
- source_position = np.random.rand(3)
- target_position = np.random.rand(3)
+ source_position = np.random.rand(3)
+ target_position = np.random.rand(3)
else:
source_position = None
target_position = None
@@ -310,20 +304,22 @@ def test_align_vectors(input_atoms, as_list, use_support, random_seed):
transformed = struc.align_vectors(
input_atoms,
- source_direction, target_direction,
- source_position, target_position
+ source_direction,
+ target_direction,
+ source_position,
+ target_position,
)
restored = struc.align_vectors(
transformed,
- target_direction, source_direction,
- target_position, source_position
+ target_direction,
+ source_direction,
+ target_position,
+ source_position,
)
assert type(restored) == type(input_atoms)
assert struc.coord(restored).shape == struc.coord(input_atoms).shape
- assert np.allclose(
- struc.coord(restored), struc.coord(input_atoms), atol=1e-5
- )
+ assert np.allclose(struc.coord(restored), struc.coord(input_atoms), atol=1e-5)
def test_align_vectors_non_vector_inputs(input_atoms):
diff --git a/tests/test_doctest.py b/tests/test_doctest.py
index 6b6792b69..8293210b6 100644
--- a/tests/test_doctest.py
+++ b/tests/test_doctest.py
@@ -5,15 +5,14 @@
__author__ = "Patrick Kunzmann"
import doctest
-from os.path import join
import tempfile
from importlib import import_module
+from os.path import join
import numpy as np
import pytest
-import biotite.structure.io as strucio
import biotite.structure as struc
-from .util import is_not_installed, cannot_import, cannot_connect_to
-
+import biotite.structure.io as strucio
+from tests.util import cannot_connect_to, cannot_import, is_not_installed
NCBI_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/"
RCSB_URL = "https://www.rcsb.org/"
@@ -23,129 +22,93 @@
# Keep test parameters in separate variable to generate IDs from them
TEST_PARAMETERS = [
- pytest.param(
- "biotite",
- []
- ),
- pytest.param(
- "biotite.sequence",
- []
- ),
- pytest.param(
- "biotite.sequence.align",
- ["biotite.sequence"]
- ),
- pytest.param(
- "biotite.sequence.phylo",
- ["biotite.sequence"]
- ),
+ pytest.param("biotite", []),
+ pytest.param("biotite.sequence", []),
+ pytest.param("biotite.sequence.align", ["biotite.sequence"]),
+ pytest.param("biotite.sequence.phylo", ["biotite.sequence"]),
pytest.param(
"biotite.sequence.graphics",
["biotite.sequence"],
- marks = pytest.mark.skipif(
+ marks=pytest.mark.skipif(
cannot_import("matplotlib"), reason="Matplotlib is not installed"
- )
- ),
- pytest.param(
- "biotite.sequence.io",
- ["biotite.sequence"]
- ),
- pytest.param(
- "biotite.sequence.io.fasta",
- ["biotite.sequence"]
- ),
- pytest.param(
- "biotite.sequence.io.fastq",
- ["biotite.sequence"]
+ ),
),
+ pytest.param("biotite.sequence.io", ["biotite.sequence"]),
+ pytest.param("biotite.sequence.io.fasta", ["biotite.sequence"]),
+ pytest.param("biotite.sequence.io.fastq", ["biotite.sequence"]),
pytest.param(
"biotite.sequence.io.genbank",
["biotite.sequence", "biotite.database.entrez"],
- marks = pytest.mark.skipif(
+ marks=pytest.mark.skipif(
cannot_connect_to(NCBI_URL), reason="NCBI Entrez is not available"
- )
+ ),
),
pytest.param(
"biotite.sequence.io.gff",
["biotite.sequence", "biotite.sequence.io.fasta"],
- marks = pytest.mark.filterwarnings("ignore:")
+ marks=pytest.mark.filterwarnings("ignore:"),
),
pytest.param(
- "biotite.structure",
- ["biotite.structure.io", "biotite.structure.info"]
+ "biotite.structure", ["biotite.structure.io", "biotite.structure.info"]
),
pytest.param(
"biotite.structure.graphics",
["biotite.structure"],
- marks = pytest.mark.skipif(
+ marks=pytest.mark.skipif(
cannot_import("matplotlib"), reason="Matplotlib is not installed"
),
),
+ pytest.param("biotite.structure.io", ["biotite.structure"]),
+ pytest.param("biotite.structure.io.pdb", ["biotite.structure", "biotite"]),
+ pytest.param("biotite.structure.io.pdbx", ["biotite.structure"]),
pytest.param(
- "biotite.structure.io",
- ["biotite.structure"]
- ),
- pytest.param(
- "biotite.structure.io.pdb",
- ["biotite.structure", "biotite"]
- ),
- pytest.param(
- "biotite.structure.io.pdbx",
- ["biotite.structure"]
+ "biotite.structure.io.pdbqt", ["biotite.structure", "biotite.structure.info"]
),
pytest.param(
- "biotite.structure.io.pdbqt",
- ["biotite.structure", "biotite.structure.info"]
- ),
- pytest.param(
- "biotite.structure.io.mol",
- ["biotite.structure", "biotite.structure.info"]
- ),
- pytest.param(
- "biotite.structure.info",
- ["biotite.structure"]
+ "biotite.structure.io.mol", ["biotite.structure", "biotite.structure.info"]
),
+ pytest.param("biotite.structure.info", ["biotite.structure"]),
pytest.param(
"biotite.database.entrez",
[],
- marks = pytest.mark.skipif(
+ marks=pytest.mark.skipif(
cannot_connect_to(NCBI_URL), reason="NCBI Entrez is not available"
- )
+ ),
),
pytest.param(
"biotite.database.rcsb",
[],
- marks = pytest.mark.skipif(
+ marks=pytest.mark.skipif(
cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available"
- )
+ ),
),
pytest.param(
"biotite.database.uniprot",
[],
- marks = pytest.mark.skipif(
+ marks=pytest.mark.skipif(
cannot_connect_to(UNIPROT_URL), reason="UniProt is not available"
- )
+ ),
),
pytest.param(
"biotite.database.pubchem",
["biotite.structure.info"],
- marks = pytest.mark.skipif(
+ marks=pytest.mark.skipif(
cannot_connect_to(PUBCHEM_URL), reason="PubChem is not available"
- )
+ ),
),
pytest.param(
"biotite.application",
["biotite.application.clustalo", "biotite.sequence"],
- marks = pytest.mark.skipif(
+ marks=pytest.mark.skipif(
is_not_installed("clustalo"), reason="Software is not installed"
- )
+ ),
),
pytest.param(
"biotite.application.blast",
[],
),
# Do not test Muscle due to version clash
- #pytest.param(
+ # pytest.param(
# "biotite.application.muscle",
# ["biotite.sequence"],
# marks = pytest.mark.skipif(
@@ -154,50 +117,52 @@
pytest.param(
"biotite.application.clustalo",
["biotite.sequence"],
- marks = pytest.mark.skipif(
+ marks=pytest.mark.skipif(
is_not_installed("clustalo"), reason="Software is not installed"
- )
+ ),
),
pytest.param(
"biotite.application.mafft",
["biotite.sequence"],
- marks = pytest.mark.skipif(
- is_not_installed("mafft"), reason="Software is not installed")
+ marks=pytest.mark.skipif(
+ is_not_installed("mafft"), reason="Software is not installed"
),
+ ),
pytest.param(
- "biotite.application.sra", ["biotite.sequence"],
- marks = pytest.mark.skipif(
- is_not_installed("fasterq-dump"),
- reason="Software is not installed"
- )
+ "biotite.application.sra",
+ ["biotite.sequence"],
+ marks=pytest.mark.skipif(
+ is_not_installed("fasterq-dump"), reason="Software is not installed"
+ ),
),
pytest.param(
"biotite.application.tantan",
["biotite.sequence"],
- marks = pytest.mark.skipif(
- is_not_installed("tantan"), reason="Software is not installed")
+ marks=pytest.mark.skipif(
+ is_not_installed("tantan"), reason="Software is not installed"
),
+ ),
pytest.param(
"biotite.application.viennarna",
["biotite.sequence"],
- marks = pytest.mark.skipif(
+ marks=pytest.mark.skipif(
is_not_installed("RNAfold") | is_not_installed("RNAplot"),
- reason="Software is not installed"
- )
+ reason="Software is not installed",
+ ),
),
pytest.param(
"biotite.application.dssp",
["biotite.structure"],
- marks = pytest.mark.skipif(
+ marks=pytest.mark.skipif(
is_not_installed("mkdssp"), reason="Software is not installed"
- )
+ ),
),
pytest.param(
"biotite.application.autodock",
["biotite.structure", "biotite.structure.info"],
- marks = pytest.mark.skipif(
+ marks=pytest.mark.skipif(
is_not_installed("vina"), reason="Software is not installed"
- )
+ ),
),
]
@@ -205,7 +170,7 @@
@pytest.mark.parametrize(
"package_name, context_package_names",
TEST_PARAMETERS,
- ids=[param.values[0] for param in TEST_PARAMETERS]
+ ids=[param.values[0] for param in TEST_PARAMETERS],
)
def test_doctest(package_name, context_package_names):
"""
@@ -214,18 +179,17 @@ def test_doctest(package_name, context_package_names):
# Collect all attributes of this package and its subpackages
# as globals for the doctests
globs = {}
- #The package itself is also used as context
+ # The package itself is also used as context
for name in context_package_names + [package_name]:
context_package = import_module(name)
globs.update(
- {attr : getattr(context_package, attr)
- for attr in dir(context_package)}
+ {attr: getattr(context_package, attr) for attr in dir(context_package)}
)
# Add fixed names for certain paths
- globs["path_to_directory"] = tempfile.gettempdir()
+ globs["path_to_directory"] = tempfile.gettempdir()
globs["path_to_structures"] = join(".", "tests", "structure", "data")
- globs["path_to_sequences"] = join(".", "tests", "sequence", "data")
+ globs["path_to_sequences"] = join(".", "tests", "sequence", "data")
# Add frequently used modules
globs["np"] = np
# Add frequently used objects
@@ -245,14 +209,14 @@ def test_doctest(package_name, context_package_names):
# More information below
package = import_module(package_name)
runner = doctest.DocTestRunner(
- verbose = False,
- optionflags =
- doctest.ELLIPSIS |
- doctest.REPORT_ONLY_FIRST_FAILURE |
- doctest.NORMALIZE_WHITESPACE
+ verbose=False,
+ optionflags=doctest.ELLIPSIS
+ | doctest.REPORT_ONLY_FIRST_FAILURE
+ | doctest.NORMALIZE_WHITESPACE,
)
for test in doctest.DocTestFinder(exclude_empty=False).find(
- package, package.__name__,
+ package,
+ package.__name__,
# It is necessary to set 'module' to 'False', as otherwise
# Cython functions and classes would be falsely identified
# as members of an external module by 'DocTestFinder._find()'
@@ -263,7 +227,7 @@ def test_doctest(package_name, context_package_names):
# ('__init__.py' modules) should only contain attributes, that
# are part of the package itself.
module=False,
- extraglobs=globs
+ extraglobs=globs,
):
runner.run(test)
results = doctest.TestResults(runner.failures, runner.tries)
@@ -271,4 +235,4 @@ def test_doctest(package_name, context_package_names):
assert results.failed == 0
except AssertionError:
print(f"Failing doctest in module {package}")
- raise
\ No newline at end of file
+ raise
diff --git a/tests/test_init.py b/tests/test_init.py
deleted file mode 100644
index 644659ce9..000000000
--- a/tests/test_init.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# This source code is part of the Biotite package and is distributed
-# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
-# information.
-
-__author__ = "Daniel Bauer"
-
-import biotite
-import pytest
-
-
-def test_version_number():
- version = biotite.__version__
- assert hasattr(biotite, "__version__")
\ No newline at end of file
diff --git a/tests/test_modname.py b/tests/test_modname.py
index 808625f4b..8f8d88b17 100644
--- a/tests/test_modname.py
+++ b/tests/test_modname.py
@@ -2,11 +2,11 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-import pkgutil
-from os.path import dirname, join, isdir, splitext
import importlib
+import pkgutil
+from os.path import dirname, join
import pytest
-from .util import cannot_import
+from tests.util import cannot_import
def find_all_modules(package_name, src_dir):
@@ -18,10 +18,9 @@ def find_all_modules(package_name, src_dir):
for _, module_name, is_package in pkgutil.iter_modules([src_dir]):
full_module_name = f"{package_name}.{module_name}"
if is_package:
- module_names.extend(find_all_modules(
- full_module_name,
- join(src_dir, module_name)
- ))
+ module_names.extend(
+ find_all_modules(full_module_name, join(src_dir, module_name))
+ )
else:
module_names.append(full_module_name)
return module_names
@@ -29,14 +28,11 @@ def find_all_modules(package_name, src_dir):
@pytest.mark.skipif(
cannot_import("matplotlib") | cannot_import("mdtraj"),
- reason="Optional dependencies are not met"
+ reason="Optional dependencies are not met",
)
@pytest.mark.parametrize(
"module_name",
- find_all_modules(
- "biotite",
- join(dirname(dirname(__file__)), "src", "biotite")
- )
+ find_all_modules("biotite", join(dirname(dirname(__file__)), "src", "biotite")),
)
def test_module_name(module_name):
"""
@@ -55,4 +51,4 @@ def test_module_name(module_name):
# Autogenerated module from hatch-vcs
# # It contains no '__name__' attribute on purpose
return
- assert module.__name__ == package_name
\ No newline at end of file
+ assert module.__name__ == package_name
diff --git a/tests/test_repr.py b/tests/test_repr.py
index 5f9714af8..f8bf319c4 100644
--- a/tests/test_repr.py
+++ b/tests/test_repr.py
@@ -2,51 +2,85 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-from biotite.sequence import NucleotideSequence
-from biotite.sequence import ProteinSequence
-from biotite.sequence import Alphabet
-from biotite.sequence import GeneralSequence
-from biotite.sequence import LetterAlphabet
-from biotite.sequence import Location
-from biotite.sequence import Feature
-from biotite.sequence import Annotation
-from biotite.sequence import AnnotatedSequence
-from biotite.sequence.align import Alignment
-from biotite.structure import Atom
import numpy as np
-from numpy import float32, int32
-from biotite.sequence import CodonTable
-from biotite.sequence.align import SubstitutionMatrix
-from biotite.sequence import SequenceProfile
import pytest
+from numpy import float32, int32 # noqa: F401
+from biotite.sequence import (
+ Alphabet,
+ AnnotatedSequence,
+ Annotation,
+ CodonTable,
+ Feature,
+ GeneralSequence,
+ LetterAlphabet,
+ Location,
+ NucleotideSequence,
+ ProteinSequence,
+ SequenceProfile,
+)
+from biotite.sequence.align import Alignment, SubstitutionMatrix
+from biotite.structure import Atom
__author__ = "Maximilian Greil"
-@pytest.mark.parametrize("repr_object",
- [NucleotideSequence("AACTGCTA"),
- NucleotideSequence("AACTGCTA", ambiguous=True),
- ProteinSequence("BIQTITE"),
- Alphabet(["X", "Y", "Z"]),
- GeneralSequence(Alphabet(["X", 42, False]), ["X", 42, "X"]),
- LetterAlphabet(["X", "Y", "Z"]),
- Location(98, 178),
- Feature("CDS", [Location(98, 178)], qual={"gene": "test1"}),
- Annotation([Feature("CDS", [Location(98, 178)], qual={"gene": "test1"})]),
- AnnotatedSequence(Annotation([Feature("CDS", [Location(98, 178)], qual={"gene": "test1"})]),
- NucleotideSequence("AACTGCTA")),
- Alignment([NucleotideSequence("CGTCAT", ambiguous=False),
- NucleotideSequence("TCATGC", ambiguous=False)],
- np.array([[0, -1], [1, -1], [2, 0], [3, 1], [4, 2], [5, 3], [-1, 4], [-1, 5]]),
- score=-20),
- Atom([1, 2, 3], chain_id="A"),
- CodonTable.default_table(),
- SubstitutionMatrix(Alphabet(["foo", "bar"]), Alphabet([1, 2, 3]),
- {("foo", 1): 5, ("foo", 2): 10, ("foo", 3): 15, ("bar", 1): 42,
- ("bar", 2): 42, ("bar", 3): 42}),
- SequenceProfile(np.array([[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 2], [0, 2, 0, 0],
- [2, 0, 0, 0], [0, 0, 0, 2], [0, 0, 1, 0], [0, 1, 0, 0]]),
- np.array([1, 1, 0, 0, 0, 0, 1, 1]),
- Alphabet(["A", "C", "G", "T"]))])
+@pytest.mark.parametrize(
+ "repr_object",
+ [
+ NucleotideSequence("AACTGCTA"),
+ NucleotideSequence("AACTGCTA", ambiguous=True),
+ ProteinSequence("BIQTITE"),
+ Alphabet(["X", "Y", "Z"]),
+ GeneralSequence(Alphabet(["X", 42, False]), ["X", 42, "X"]),
+ LetterAlphabet(["X", "Y", "Z"]),
+ Location(98, 178),
+ Feature("CDS", [Location(98, 178)], qual={"gene": "test1"}),
+ Annotation([Feature("CDS", [Location(98, 178)], qual={"gene": "test1"})]),
+ AnnotatedSequence(
+ Annotation([Feature("CDS", [Location(98, 178)], qual={"gene": "test1"})]),
+ NucleotideSequence("AACTGCTA"),
+ ),
+ Alignment(
+ [
+ NucleotideSequence("CGTCAT", ambiguous=False),
+ NucleotideSequence("TCATGC", ambiguous=False),
+ ],
+ np.array(
+ [[0, -1], [1, -1], [2, 0], [3, 1], [4, 2], [5, 3], [-1, 4], [-1, 5]]
+ ),
+ score=-20,
+ ),
+ Atom([1, 2, 3], chain_id="A"),
+ CodonTable.default_table(),
+ SubstitutionMatrix(
+ Alphabet(["foo", "bar"]),
+ Alphabet([1, 2, 3]),
+ {
+ ("foo", 1): 5,
+ ("foo", 2): 10,
+ ("foo", 3): 15,
+ ("bar", 1): 42,
+ ("bar", 2): 42,
+ ("bar", 3): 42,
+ },
+ ),
+ SequenceProfile(
+ np.array(
+ [
+ [0, 1, 0, 0],
+ [0, 0, 1, 0],
+ [0, 0, 0, 2],
+ [0, 2, 0, 0],
+ [2, 0, 0, 0],
+ [0, 0, 0, 2],
+ [0, 0, 1, 0],
+ [0, 1, 0, 0],
+ ]
+ ),
+ np.array([1, 1, 0, 0, 0, 0, 1, 1]),
+ Alphabet(["A", "C", "G", "T"]),
+ ),
+ ],
+)
def test_repr(repr_object):
assert eval(repr(repr_object)) == repr_object
diff --git a/tests/test_version.py b/tests/test_version.py
index 5f11daa2a..ec7bca6f9 100644
--- a/tests/test_version.py
+++ b/tests/test_version.py
@@ -6,4 +6,4 @@ def test_version():
"""
Check if version imported from version.py is correct.
"""
- assert biotite.__version__ == version("biotite")
\ No newline at end of file
+ assert biotite.__version__ == version("biotite")
diff --git a/tests/util.py b/tests/util.py
index e72cc5cb5..99cf24741 100644
--- a/tests/util.py
+++ b/tests/util.py
@@ -2,11 +2,11 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
-from os.path import join, dirname, realpath
-import urllib.error
-import urllib.request
import importlib
import shutil
+import urllib.error
+import urllib.request
+from os.path import dirname, join, realpath
def data_dir(subdir):
@@ -16,6 +16,8 @@ def data_dir(subdir):
### Functions for conditional test skips ###
tested_urls = {}
+
+
def cannot_connect_to(url):
if url not in tested_urls:
try:
@@ -25,8 +27,10 @@ def cannot_connect_to(url):
tested_urls[url] = True
return tested_urls[url]
+
def cannot_import(module):
return importlib.util.find_spec(module) is None
+
def is_not_installed(program):
- return shutil.which(program) is None
\ No newline at end of file
+ return shutil.which(program) is None