From a1d361eaed9138f25718031bd37932f3062b95d8 Mon Sep 17 00:00:00 2001 From: xvlaurent Date: Fri, 25 Oct 2024 16:23:04 +0000 Subject: [PATCH 1/6] Enhance _split_one_line and remove whitespace parameter --- src/biotite/structure/io/pdbx/cif.py | 81 ++++++++++++---------------- 1 file changed, 33 insertions(+), 48 deletions(-) diff --git a/src/biotite/structure/io/pdbx/cif.py b/src/biotite/structure/io/pdbx/cif.py index 77e0258b7..484dd206b 100644 --- a/src/biotite/structure/io/pdbx/cif.py +++ b/src/biotite/structure/io/pdbx/cif.py @@ -357,7 +357,7 @@ def supercomponent_class(): return CIFBlock @staticmethod - def deserialize(text, expect_whitespace=True): + def deserialize(text): lines = [line.strip() for line in text.splitlines() if not _is_empty(line)] if _is_loop_start(lines[0]): @@ -372,7 +372,7 @@ def deserialize(text, expect_whitespace=True): lines = _to_single(lines) if is_looped: - category_dict = CIFCategory._deserialize_looped(lines, expect_whitespace) + category_dict = CIFCategory._deserialize_looped(lines) else: category_dict = CIFCategory._deserialize_single(lines) return CIFCategory(category_dict, category_name) @@ -467,7 +467,7 @@ def _deserialize_single(lines): return category_dict @staticmethod - def _deserialize_looped(lines, expect_whitespace): + def _deserialize_looped(lines): """ Process a category where each field has multiple values (category is a table). @@ -490,20 +490,7 @@ def _deserialize_looped(lines, expect_whitespace): # row-line-alignment at all and simply cycle through columns column_indices = itertools.cycle(range(len(column_names))) for data_line in data_lines: - # If whitespace is expected in quote protected values, - # use regex-based _split_one_line() to split - # Otherwise use much more faster whitespace split - # and quote removal if applicable. - if expect_whitespace: - values = _split_one_line(data_line) - else: - values = data_line.split() - for k in range(len(values)): - # Remove quotes - if (values[k][0] == '"' and values[k][-1] == '"') or ( - values[k][0] == "'" and values[k][-1] == "'" - ): - values[k] = values[k][1:-1] + values = _split_one_line(data_line) for val in values: column_index = next(column_indices) column_name = column_names[column_index] @@ -685,15 +672,7 @@ def __getitem__(self, key): # Element is stored in serialized form # -> must be deserialized first try: - # Special optimization for "atom_site": - # Even if the values are quote protected, - # no whitespace is expected in escaped values - # Therefore slow regex-based _split_one_line() call is not necessary - if key == "atom_site": - expect_whitespace = False - else: - expect_whitespace = True - category = CIFCategory.deserialize(category, expect_whitespace) + category = CIFCategory.deserialize(category) except Exception: raise DeserializationError(f"Failed to deserialize category '{key}'") # Update with deserialized object @@ -1064,28 +1043,34 @@ def _split_one_line(line): if line[0] == ";": return [line[1:]] - # Define the patterns for different types of fields - single_quote_pattern = r"('(?:'(?! )|[^'])*')(?:\s|$)" - double_quote_pattern = r'("(?:"(?! )|[^"])*")(?:\s|$)' - unquoted_pattern = r"([^\s]+)" - - # Combine the patterns using alternation - combined_pattern = ( - f"{single_quote_pattern}|{double_quote_pattern}|{unquoted_pattern}" - ) - - # Find all matches - matches = re.findall(combined_pattern, line) - - # Extract non-empty groups from the matches - fields = [] - for match in matches: - field = next(group for group in match if group) - if field[0] == field[-1] == "'" or field[0] == field[-1] == '"': - field = field[1:-1] - fields.append(field) - return fields - + # Initialize loop variables + res = [] + separator = " " + # Loop over the line + while line: + # Split the line on separator + word, _, new_line = line.lstrip().partition(separator) + # Handle the case where the word strat with a quote + if word.startswith(("'", '"')): + # Set the separator to the quote found + separator = word[0] + # Handle the case of a quoted word without space + if word.endswith(separator) and len(word) > 1: + res.append(word[1:-1]) + separator = " " + line = new_line + continue + # Reconstruct line without the starting quote + new_line = line.lstrip()[1:] + # Capture the word until the closing quote + word = "".join(itertools.takewhile(lambda x : x != separator, new_line)) + # Reconstruct line without the captured word + new_line = new_line.removeprefix(word + separator) + # Reset separator to whitespace + separator = " " + res.append(word) + line = new_line + return res def _arrayfy(data): if not isinstance(data, (Sequence, np.ndarray)) or isinstance(data, str): From bcf7ba2a16444af6a1536fef14702db7215f1290 Mon Sep 17 00:00:00 2001 From: xvlaurent Date: Fri, 25 Oct 2024 16:23:29 +0000 Subject: [PATCH 2/6] Fix faulty test. --- tests/database/test_rcsb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/database/test_rcsb.py b/tests/database/test_rcsb.py index 42bbc53e4..86bd55a02 100644 --- a/tests/database/test_rcsb.py +++ b/tests/database/test_rcsb.py @@ -31,7 +31,7 @@ def test_fetch(format, as_file_like): if format == "pdb": file = pdb.PDBFile.read(file_path_or_obj) pdb.get_structure(file) - elif format == "pdbx": + elif format == "cif": file = pdbx.CIFFile.read(file_path_or_obj) pdbx.get_structure(file) elif format == "bcif": From f12bc77ee6e12f25605f05a32ff21e330433db5e Mon Sep 17 00:00:00 2001 From: xvlaurent Date: Mon, 28 Oct 2024 10:37:40 +0000 Subject: [PATCH 3/6] Attempt iterator version of _split_one_line --- src/biotite/structure/io/pdbx/cif.py | 55 ++++++++++++---------------- 1 file changed, 23 insertions(+), 32 deletions(-) diff --git a/src/biotite/structure/io/pdbx/cif.py b/src/biotite/structure/io/pdbx/cif.py index 484dd206b..3479e4db7 100644 --- a/src/biotite/structure/io/pdbx/cif.py +++ b/src/biotite/structure/io/pdbx/cif.py @@ -445,7 +445,7 @@ def _deserialize_single(lines): line_i = 0 while line_i < len(lines): line = lines[line_i] - parts = _split_one_line(line) + parts = list(_split_one_line(line)) if len(parts) == 2: # Standard case -> name and value in one line name_part, value_part = parts @@ -453,7 +453,7 @@ def _deserialize_single(lines): elif len(parts) == 1: # Value is a multiline value on the next line name_part = parts[0] - parts = _split_one_line(lines[line_i + 1]) + parts = list(_split_one_line(lines[line_i + 1])) if len(parts) == 1: value_part = parts[0] else: @@ -1041,36 +1041,27 @@ def _split_one_line(line): """ # Special case of multiline value, where the line starts with ';' if line[0] == ";": - return [line[1:]] - - # Initialize loop variables - res = [] - separator = " " - # Loop over the line - while line: - # Split the line on separator - word, _, new_line = line.lstrip().partition(separator) - # Handle the case where the word strat with a quote - if word.startswith(("'", '"')): - # Set the separator to the quote found - separator = word[0] - # Handle the case of a quoted word without space - if word.endswith(separator) and len(word) > 1: - res.append(word[1:-1]) - separator = " " - line = new_line - continue - # Reconstruct line without the starting quote - new_line = line.lstrip()[1:] - # Capture the word until the closing quote - word = "".join(itertools.takewhile(lambda x : x != separator, new_line)) - # Reconstruct line without the captured word - new_line = new_line.removeprefix(word + separator) - # Reset separator to whitespace - separator = " " - res.append(word) - line = new_line - return res + yield line[1:] + else: + # Loop over the line + while line: + # Strip leading whitespace(s) + striped_line = line.lstrip() + # Split the line on whitespace + word, _, line = striped_line.partition(" ") + # Handle the case where the word start with a quote + if word.startswith(("'", '"')): + # Set the separator to the quote found + separator = word[0] + # Handle the case of a quoted word without space + if word.endswith(separator) and len(word) > 1: + # Yield the word without the opening and closing quotes + yield word[1:-1] + continue + # split the word on the separator + word, _, line = striped_line[1:].partition(separator) + + yield word def _arrayfy(data): if not isinstance(data, (Sequence, np.ndarray)) or isinstance(data, str): From 83d5bc0e8152cbb3106066e3e87cc8e1592b6a2c Mon Sep 17 00:00:00 2001 From: xvlaurent Date: Mon, 28 Oct 2024 10:43:09 +0000 Subject: [PATCH 4/6] Format ruff structure/io/pdbx/cif.py with ruff --- src/biotite/structure/io/pdbx/cif.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/biotite/structure/io/pdbx/cif.py b/src/biotite/structure/io/pdbx/cif.py index 3479e4db7..f5defae7d 100644 --- a/src/biotite/structure/io/pdbx/cif.py +++ b/src/biotite/structure/io/pdbx/cif.py @@ -7,7 +7,6 @@ __all__ = ["CIFFile", "CIFBlock", "CIFCategory", "CIFColumn", "CIFData"] import itertools -import re from collections.abc import MutableMapping, Sequence import numpy as np from biotite.file import ( @@ -1063,6 +1062,7 @@ def _split_one_line(line): yield word + def _arrayfy(data): if not isinstance(data, (Sequence, np.ndarray)) or isinstance(data, str): data = [data] From 33f193693f4edb17e37da3b6f92dfc839d2ae1b0 Mon Sep 17 00:00:00 2001 From: xvlaurent Date: Mon, 4 Nov 2024 07:59:45 +0000 Subject: [PATCH 5/6] Fix PDBx test. --- tests/structure/io/test_pdbx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/structure/io/test_pdbx.py b/tests/structure/io/test_pdbx.py index a105a259b..96123294f 100644 --- a/tests/structure/io/test_pdbx.py +++ b/tests/structure/io/test_pdbx.py @@ -85,7 +85,7 @@ def test_split_one_line(cif_line, expected_fields): """ Test whether values that have an embedded quote are properly escaped. """ - assert pdbx.cif._split_one_line(cif_line) == expected_fields + assert list(pdbx.cif._split_one_line(cif_line)) == expected_fields @pytest.mark.parametrize( From c465aa5d3e0f2fbefe37230dfdb97ebae2074fc2 Mon Sep 17 00:00:00 2001 From: xvlaurent Date: Mon, 18 Nov 2024 08:03:13 +0000 Subject: [PATCH 6/6] Fix typo. --- src/biotite/structure/io/pdbx/cif.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/biotite/structure/io/pdbx/cif.py b/src/biotite/structure/io/pdbx/cif.py index f5defae7d..94557ca11 100644 --- a/src/biotite/structure/io/pdbx/cif.py +++ b/src/biotite/structure/io/pdbx/cif.py @@ -1045,9 +1045,9 @@ def _split_one_line(line): # Loop over the line while line: # Strip leading whitespace(s) - striped_line = line.lstrip() + stripped_line = line.lstrip() # Split the line on whitespace - word, _, line = striped_line.partition(" ") + word, _, line = stripped_line.partition(" ") # Handle the case where the word start with a quote if word.startswith(("'", '"')): # Set the separator to the quote found @@ -1058,7 +1058,7 @@ def _split_one_line(line): yield word[1:-1] continue # split the word on the separator - word, _, line = striped_line[1:].partition(separator) + word, _, line = stripped_line[1:].partition(separator) yield word