Handle embedded quote in mmcif (#619)

* Handle embedded quotes in data values like 'dog's life': replace shlex.split() with the regex-based _split_one_line(). * Fix #570: add the embedded-quote example from 1n5m.cif to test_pdbx.py. * Replace shlex.quote() with the updated _quote(). * Avoid calling `quote()` twice on a value containing '\n': do not call _quote() in _to_single(). * Test `_split_one_line()` directly.
biotite-dev · Jul 14, 2024 · 0404084 · 0404084
1 parent 93d43c3
commit 0404084
Show file tree

Hide file tree

Showing 2 changed files with 52 additions and 9 deletions.
diff --git a/src/biotite/structure/io/pdbx/cif.py b/src/biotite/structure/io/pdbx/cif.py
@@ -6,8 +6,8 @@
 __author__ = "Patrick Kunzmann"
 __all__ = ["CIFFile", "CIFBlock", "CIFCategory", "CIFColumn", "CIFData"]
 
+import re
 import itertools
-import shlex
 from collections.abc import MutableMapping, Sequence
 import numpy as np
 from .component import _Component, MaskValue
@@ -449,7 +449,7 @@ def _deserialize_single(lines):
         """
         category_dict = {}
         for line in lines:
-            parts = shlex.split(line)
+            parts = _split_one_line(line)
             column_name = parts[0].split(".")[1]
             column = parts[1]
             category_dict[column_name] = CIFColumn(column)
@@ -480,12 +480,11 @@ def _deserialize_looped(lines, expect_whitespace):
         column_names = itertools.cycle(column_names)
         for data_line in data_lines:
             # If whitespace is expected in quote protected values,
-            # use standard shlex split
+            # use regex-based _split_one_line() to split
             # Otherwise use much more faster whitespace split
-            # and quote removal if applicable,
-            # bypassing the slow shlex module
+            # and quote removal if applicable.
             if expect_whitespace:
-                values = shlex.split(data_line)
+                values = _split_one_line(data_line)
             else:
                 values = data_line.split()
                 for k in range(len(values)):
@@ -652,7 +651,7 @@ def __getitem__(self, key):
                 # Special optimization for "atom_site":
                 # Even if the values are quote protected,
                 # no whitespace is expected in escaped values
-                # Therefore slow shlex.split() call is not necessary
+                # Therefore slow regex-based _split_one_line() call is not necessary
                 if key == "atom_site":
                     expect_whitespace = False
                 else:
@@ -973,11 +972,11 @@ def _to_single(lines, is_looped):
                 j += 1
             if is_looped:
                 # Create a line for the multiline string only
-                processed_lines[out_i] = shlex.quote(multi_line_str)
+                processed_lines[out_i] = f"'{multi_line_str}'"
                 out_i += 1
             else:
                 # Append multiline string to previous line
-                processed_lines[out_i - 1] += " " + shlex.quote(multi_line_str)
+                processed_lines[out_i - 1] += " " + f"'{multi_line_str}'"
             in_i = j + 1
 
         elif not is_looped and lines[in_i][0] != "_":
@@ -1024,6 +1023,34 @@ def _multiline(value):
     return value
 
 
+def _split_one_line(line):
+    """
+    Split one CIF line into its whitespace-separated fields.
+
+    A field enclosed in ``'`` or ``"`` may contain whitespace and embedded
+    quotes: a quote only closes the field if whitespace or the end of the
+    line follows, so ``'a dog's life'`` yields ``a dog's life``.
+    Returns the list of fields with their enclosing quotes removed.
+    """
+    # An embedded quote must not be followed by *any* whitespace:
+    # testing for " " only would merge tab-separated quoted fields
+    single_quote_pattern = r"'((?:'(?!\s)|[^'])*)'(?:\s|$)"
+    double_quote_pattern = r'"((?:"(?!\s)|[^"])*)"(?:\s|$)'
+    unquoted_pattern = r"(\S+)"
+
+    # Try the quoted forms first, the unquoted one is only the fallback
+    combined_pattern = (
+        f"{single_quote_pattern}|{double_quote_pattern}|{unquoted_pattern}"
+    )
+    # Exactly one group takes part in each match and `lastindex` names it;
+    # taking its content instead of stripping quote characters afterwards
+    # keeps a lone unquoted ' or " token intact
+    return [
+        match.group(match.lastindex)
+        for match in re.finditer(combined_pattern, line)
+    ]
+
+
 def _arrayfy(data):
     if not isinstance(data, (Sequence, np.ndarray)) or isinstance(data, str):
         data = [data]

diff --git a/tests/structure/test_pdbx.py b/tests/structure/test_pdbx.py
@@ -59,6 +59,22 @@ def test_escape(string, looped):
     assert test_value == ref_value
 
 
+@pytest.mark.parametrize(
+    "cif_line, expected_fields",
+    [
+        ("'' 'embed'quote' ", ["", "embed'quote"]),
+        ('2 "embed"quote" "\t\n"', ["2", 'embed"quote', "\t\n"]),
+        (" 3 '' \"\" 'spac e' 'embed\"quote'", ["3", "", "", "spac e", 'embed"quote']),
+        ("''' \"\"\" ''quoted''", ["'", '"', "'quoted'"]),
+    ],
+)
+def test_split_one_line(cif_line, expected_fields):
+    """
+    Check the field splitting of lines with quoted and embedded-quote values.
+    """
+    assert expected_fields == pdbx.cif._split_one_line(cif_line)
+
+
 @pytest.mark.parametrize(
     "format, path, model",
     itertools.product(