From a1d361eaed9138f25718031bd37932f3062b95d8 Mon Sep 17 00:00:00 2001
From: xvlaurent <xavier.laurent@discngine.com>
Date: Fri, 25 Oct 2024 16:23:04 +0000
Subject: [PATCH 1/6] Enhance _split_one_line and remove expect_whitespace parameter

---
 src/biotite/structure/io/pdbx/cif.py | 81 ++++++++++++----------------
 1 file changed, 33 insertions(+), 48 deletions(-)

diff --git a/src/biotite/structure/io/pdbx/cif.py b/src/biotite/structure/io/pdbx/cif.py
index 77e0258b7..484dd206b 100644
--- a/src/biotite/structure/io/pdbx/cif.py
+++ b/src/biotite/structure/io/pdbx/cif.py
@@ -357,7 +357,7 @@ def supercomponent_class():
         return CIFBlock
 
     @staticmethod
-    def deserialize(text, expect_whitespace=True):
+    def deserialize(text):
         lines = [line.strip() for line in text.splitlines() if not _is_empty(line)]
 
         if _is_loop_start(lines[0]):
@@ -372,7 +372,7 @@ def deserialize(text, expect_whitespace=True):
 
         lines = _to_single(lines)
         if is_looped:
-            category_dict = CIFCategory._deserialize_looped(lines, expect_whitespace)
+            category_dict = CIFCategory._deserialize_looped(lines)
         else:
             category_dict = CIFCategory._deserialize_single(lines)
         return CIFCategory(category_dict, category_name)
@@ -467,7 +467,7 @@ def _deserialize_single(lines):
         return category_dict
 
     @staticmethod
-    def _deserialize_looped(lines, expect_whitespace):
+    def _deserialize_looped(lines):
         """
         Process a category where each field has multiple values
         (category is a table).
@@ -490,20 +490,7 @@ def _deserialize_looped(lines, expect_whitespace):
         # row-line-alignment at all and simply cycle through columns
         column_indices = itertools.cycle(range(len(column_names)))
         for data_line in data_lines:
-            # If whitespace is expected in quote protected values,
-            # use regex-based _split_one_line() to split
-            # Otherwise use much more faster whitespace split
-            # and quote removal if applicable.
-            if expect_whitespace:
-                values = _split_one_line(data_line)
-            else:
-                values = data_line.split()
-                for k in range(len(values)):
-                    # Remove quotes
-                    if (values[k][0] == '"' and values[k][-1] == '"') or (
-                        values[k][0] == "'" and values[k][-1] == "'"
-                    ):
-                        values[k] = values[k][1:-1]
+            values = _split_one_line(data_line)
             for val in values:
                 column_index = next(column_indices)
                 column_name = column_names[column_index]
@@ -685,15 +672,7 @@ def __getitem__(self, key):
             # Element is stored in serialized form
             # -> must be deserialized first
             try:
-                # Special optimization for "atom_site":
-                # Even if the values are quote protected,
-                # no whitespace is expected in escaped values
-                # Therefore slow regex-based _split_one_line() call is not necessary
-                if key == "atom_site":
-                    expect_whitespace = False
-                else:
-                    expect_whitespace = True
-                category = CIFCategory.deserialize(category, expect_whitespace)
+                category = CIFCategory.deserialize(category)
             except Exception:
                 raise DeserializationError(f"Failed to deserialize category '{key}'")
             # Update with deserialized object
@@ -1064,28 +1043,34 @@ def _split_one_line(line):
     if line[0] == ";":
         return [line[1:]]
 
-    # Define the patterns for different types of fields
-    single_quote_pattern = r"('(?:'(?! )|[^'])*')(?:\s|$)"
-    double_quote_pattern = r'("(?:"(?! )|[^"])*")(?:\s|$)'
-    unquoted_pattern = r"([^\s]+)"
-
-    # Combine the patterns using alternation
-    combined_pattern = (
-        f"{single_quote_pattern}|{double_quote_pattern}|{unquoted_pattern}"
-    )
-
-    # Find all matches
-    matches = re.findall(combined_pattern, line)
-
-    # Extract non-empty groups from the matches
-    fields = []
-    for match in matches:
-        field = next(group for group in match if group)
-        if field[0] == field[-1] == "'" or field[0] == field[-1] == '"':
-            field = field[1:-1]
-        fields.append(field)
-    return fields
-
+    # Initialize loop variables
+    res = []
+    separator = " "
+    # Loop over the line
+    while line:
+        # Split the line on separator
+        word, _, new_line = line.lstrip().partition(separator)
+        # Handle the case where the word strat with a quote
+        if word.startswith(("'", '"')):
+            # Set the separator to the quote found
+            separator = word[0]
+            # Handle the case of a quoted word without space
+            if word.endswith(separator) and len(word) > 1:
+                res.append(word[1:-1])
+                separator = " "
+                line = new_line
+                continue
+            # Reconstruct line without the starting quote
+            new_line = line.lstrip()[1:]
+            # Capture the word until the closing quote
+            word = "".join(itertools.takewhile(lambda x : x != separator, new_line))
+            # Reconstruct line without the captured word
+            new_line = new_line.removeprefix(word + separator)
+        # Reset separator to whitespace
+        separator = " "
+        res.append(word)
+        line = new_line
+    return res
 
 def _arrayfy(data):
     if not isinstance(data, (Sequence, np.ndarray)) or isinstance(data, str):

From bcf7ba2a16444af6a1536fef14702db7215f1290 Mon Sep 17 00:00:00 2001
From: xvlaurent <xavier.laurent@discngine.com>
Date: Fri, 25 Oct 2024 16:23:29 +0000
Subject: [PATCH 2/6] Fix faulty test.

---
 tests/database/test_rcsb.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/database/test_rcsb.py b/tests/database/test_rcsb.py
index 42bbc53e4..86bd55a02 100644
--- a/tests/database/test_rcsb.py
+++ b/tests/database/test_rcsb.py
@@ -31,7 +31,7 @@ def test_fetch(format, as_file_like):
     if format == "pdb":
         file = pdb.PDBFile.read(file_path_or_obj)
         pdb.get_structure(file)
-    elif format == "pdbx":
+    elif format == "cif":
         file = pdbx.CIFFile.read(file_path_or_obj)
         pdbx.get_structure(file)
     elif format == "bcif":

From f12bc77ee6e12f25605f05a32ff21e330433db5e Mon Sep 17 00:00:00 2001
From: xvlaurent <xavier.laurent@discngine.com>
Date: Mon, 28 Oct 2024 10:37:40 +0000
Subject: [PATCH 3/6] Attempt iterator version of _split_one_line

---
 src/biotite/structure/io/pdbx/cif.py | 55 ++++++++++++----------------
 1 file changed, 23 insertions(+), 32 deletions(-)

diff --git a/src/biotite/structure/io/pdbx/cif.py b/src/biotite/structure/io/pdbx/cif.py
index 484dd206b..3479e4db7 100644
--- a/src/biotite/structure/io/pdbx/cif.py
+++ b/src/biotite/structure/io/pdbx/cif.py
@@ -445,7 +445,7 @@ def _deserialize_single(lines):
         line_i = 0
         while line_i < len(lines):
             line = lines[line_i]
-            parts = _split_one_line(line)
+            parts = list(_split_one_line(line))
             if len(parts) == 2:
                 # Standard case -> name and value in one line
                 name_part, value_part = parts
@@ -453,7 +453,7 @@ def _deserialize_single(lines):
             elif len(parts) == 1:
                 # Value is a multiline value on the next line
                 name_part = parts[0]
-                parts = _split_one_line(lines[line_i + 1])
+                parts = list(_split_one_line(lines[line_i + 1]))
                 if len(parts) == 1:
                     value_part = parts[0]
                 else:
@@ -1041,36 +1041,27 @@ def _split_one_line(line):
     """
     # Special case of multiline value, where the line starts with ';'
     if line[0] == ";":
-        return [line[1:]]
-
-    # Initialize loop variables
-    res = []
-    separator = " "
-    # Loop over the line
-    while line:
-        # Split the line on separator
-        word, _, new_line = line.lstrip().partition(separator)
-        # Handle the case where the word strat with a quote
-        if word.startswith(("'", '"')):
-            # Set the separator to the quote found
-            separator = word[0]
-            # Handle the case of a quoted word without space
-            if word.endswith(separator) and len(word) > 1:
-                res.append(word[1:-1])
-                separator = " "
-                line = new_line
-                continue
-            # Reconstruct line without the starting quote
-            new_line = line.lstrip()[1:]
-            # Capture the word until the closing quote
-            word = "".join(itertools.takewhile(lambda x : x != separator, new_line))
-            # Reconstruct line without the captured word
-            new_line = new_line.removeprefix(word + separator)
-        # Reset separator to whitespace
-        separator = " "
-        res.append(word)
-        line = new_line
-    return res
+        yield line[1:]
+    else:
+        # Loop over the line
+        while line:
+            # Strip leading whitespace(s)
+            striped_line = line.lstrip()
+            # Split the line on whitespace
+            word, _, line = striped_line.partition(" ")
+            # Handle the case where the word start with a quote
+            if word.startswith(("'", '"')):
+                # Set the separator to the quote found
+                separator = word[0]
+                # Handle the case of a quoted word without space
+                if word.endswith(separator) and len(word) > 1:
+                    # Yield the word without the opening and closing quotes
+                    yield word[1:-1]
+                    continue
+                # split the word on the separator
+                word, _, line = striped_line[1:].partition(separator)
+
+            yield word
 
 def _arrayfy(data):
     if not isinstance(data, (Sequence, np.ndarray)) or isinstance(data, str):

From 83d5bc0e8152cbb3106066e3e87cc8e1592b6a2c Mon Sep 17 00:00:00 2001
From: xvlaurent <xavier.laurent@discngine.com>
Date: Mon, 28 Oct 2024 10:43:09 +0000
Subject: [PATCH 4/6] Format structure/io/pdbx/cif.py with ruff

---
 src/biotite/structure/io/pdbx/cif.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/biotite/structure/io/pdbx/cif.py b/src/biotite/structure/io/pdbx/cif.py
index 3479e4db7..f5defae7d 100644
--- a/src/biotite/structure/io/pdbx/cif.py
+++ b/src/biotite/structure/io/pdbx/cif.py
@@ -7,7 +7,6 @@
 __all__ = ["CIFFile", "CIFBlock", "CIFCategory", "CIFColumn", "CIFData"]
 
 import itertools
-import re
 from collections.abc import MutableMapping, Sequence
 import numpy as np
 from biotite.file import (
@@ -1063,6 +1062,7 @@ def _split_one_line(line):
 
             yield word
 
+
 def _arrayfy(data):
     if not isinstance(data, (Sequence, np.ndarray)) or isinstance(data, str):
         data = [data]

From 33f193693f4edb17e37da3b6f92dfc839d2ae1b0 Mon Sep 17 00:00:00 2001
From: xvlaurent <xavier.laurent@discngine.com>
Date: Mon, 4 Nov 2024 07:59:45 +0000
Subject: [PATCH 5/6] Fix PDBx test.

---
 tests/structure/io/test_pdbx.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/structure/io/test_pdbx.py b/tests/structure/io/test_pdbx.py
index a105a259b..96123294f 100644
--- a/tests/structure/io/test_pdbx.py
+++ b/tests/structure/io/test_pdbx.py
@@ -85,7 +85,7 @@ def test_split_one_line(cif_line, expected_fields):
     """
     Test whether values that have an embedded quote are properly escaped.
     """
-    assert pdbx.cif._split_one_line(cif_line) == expected_fields
+    assert list(pdbx.cif._split_one_line(cif_line)) == expected_fields
 
 
 @pytest.mark.parametrize(

From c465aa5d3e0f2fbefe37230dfdb97ebae2074fc2 Mon Sep 17 00:00:00 2001
From: xvlaurent <xavier.laurent@discngine.com>
Date: Mon, 18 Nov 2024 08:03:13 +0000
Subject: [PATCH 6/6] Fix typo.

---
 src/biotite/structure/io/pdbx/cif.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/biotite/structure/io/pdbx/cif.py b/src/biotite/structure/io/pdbx/cif.py
index f5defae7d..94557ca11 100644
--- a/src/biotite/structure/io/pdbx/cif.py
+++ b/src/biotite/structure/io/pdbx/cif.py
@@ -1045,9 +1045,9 @@ def _split_one_line(line):
         # Loop over the line
         while line:
             # Strip leading whitespace(s)
-            striped_line = line.lstrip()
+            stripped_line = line.lstrip()
             # Split the line on whitespace
-            word, _, line = striped_line.partition(" ")
+            word, _, line = stripped_line.partition(" ")
             # Handle the case where the word start with a quote
             if word.startswith(("'", '"')):
                 # Set the separator to the quote found
@@ -1058,7 +1058,7 @@ def _split_one_line(line):
                     yield word[1:-1]
                     continue
                 # split the word on the separator
-                word, _, line = striped_line[1:].partition(separator)
+                word, _, line = stripped_line[1:].partition(separator)
 
             yield word