Skip to content

Commit

Permalink
Handle embedded quote in mmcif (#619)
Browse files Browse the repository at this point in the history
* handle embedded quote in data value like 'dog's life', replace shlex.split with _split_one_line() using regex.

* fix #570, add embedded quote example of 1n5m.cif into test_pdbx.py

* replace shlex.quote() with updated _quote()

* Avoid calling `quote()` on the value with '\n' twice.

Do not call `_quote()` in `_to_single()`.

* test `_split_one_line()` directly
  • Loading branch information
0ut0fcontrol authored Jul 14, 2024
1 parent 93d43c3 commit 0404084
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 9 deletions.
45 changes: 36 additions & 9 deletions src/biotite/structure/io/pdbx/cif.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
__author__ = "Patrick Kunzmann"
__all__ = ["CIFFile", "CIFBlock", "CIFCategory", "CIFColumn", "CIFData"]

import re
import itertools
import shlex
from collections.abc import MutableMapping, Sequence
import numpy as np
from .component import _Component, MaskValue
Expand Down Expand Up @@ -449,7 +449,7 @@ def _deserialize_single(lines):
"""
category_dict = {}
for line in lines:
parts = shlex.split(line)
parts = _split_one_line(line)
column_name = parts[0].split(".")[1]
column = parts[1]
category_dict[column_name] = CIFColumn(column)
Expand Down Expand Up @@ -480,12 +480,11 @@ def _deserialize_looped(lines, expect_whitespace):
column_names = itertools.cycle(column_names)
for data_line in data_lines:
# If whitespace is expected in quote protected values,
# use standard shlex split
# use regex-based _split_one_line() to split
# Otherwise use the much faster whitespace split
# and quote removal if applicable,
# bypassing the slow shlex module
# and quote removal if applicable.
if expect_whitespace:
values = shlex.split(data_line)
values = _split_one_line(data_line)
else:
values = data_line.split()
for k in range(len(values)):
Expand Down Expand Up @@ -652,7 +651,7 @@ def __getitem__(self, key):
# Special optimization for "atom_site":
# Even if the values are quote protected,
# no whitespace is expected in escaped values
# Therefore slow shlex.split() call is not necessary
# Therefore slow regex-based _split_one_line() call is not necessary
if key == "atom_site":
expect_whitespace = False
else:
Expand Down Expand Up @@ -973,11 +972,11 @@ def _to_single(lines, is_looped):
j += 1
if is_looped:
# Create a line for the multiline string only
processed_lines[out_i] = shlex.quote(multi_line_str)
processed_lines[out_i] = f"'{multi_line_str}'"
out_i += 1
else:
# Append multiline string to previous line
processed_lines[out_i - 1] += " " + shlex.quote(multi_line_str)
processed_lines[out_i - 1] += " " + f"'{multi_line_str}'"
in_i = j + 1

elif not is_looped and lines[in_i][0] != "_":
Expand Down Expand Up @@ -1024,6 +1023,34 @@ def _multiline(value):
return value


def _split_one_line(line):
"""
Split a line into its fields.
Supporting embedded quotes (' or "), like `'a dog's life'` to `a dog's life`
"""
# Define the patterns for different types of fields
single_quote_pattern = r"('(?:'(?! )|[^'])*')(?:\s|$)"
double_quote_pattern = r'("(?:"(?! )|[^"])*")(?:\s|$)'
unquoted_pattern = r"([^\s]+)"

# Combine the patterns using alternation
combined_pattern = (
f"{single_quote_pattern}|{double_quote_pattern}|{unquoted_pattern}"
)

# Find all matches
matches = re.findall(combined_pattern, line)

# Extract non-empty groups from the matches
fields = []
for match in matches:
field = next(group for group in match if group)
if field[0] == field[-1] == "'" or field[0] == field[-1] == '"':
field = field[1:-1]
fields.append(field)
return fields


def _arrayfy(data):
if not isinstance(data, (Sequence, np.ndarray)) or isinstance(data, str):
data = [data]
Expand Down
16 changes: 16 additions & 0 deletions tests/structure/test_pdbx.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,22 @@ def test_escape(string, looped):
assert test_value == ref_value


@pytest.mark.parametrize(
    "cif_line, expected_fields",
    [
        ("'' 'embed'quote' ", ['', "embed'quote"]),
        ('2 "embed"quote" "\t\n"', ['2', 'embed"quote', '\t\n']),
        (
            " 3 '' \"\" 'spac e' 'embed\"quote'",
            ['3', '', '', 'spac e', 'embed"quote'],
        ),
        ("''' \"\"\" ''quoted''", ["'", '"', "'quoted'"]),
    ],
)
def test_split_one_line(cif_line, expected_fields):
    """
    Check that a single CIF line is split into the expected fields,
    including empty quoted fields, quoted fields containing whitespace
    and quoted fields containing embedded quote characters.
    """
    # Call the private splitter directly instead of going through a file
    parsed_fields = pdbx.cif._split_one_line(cif_line)
    assert parsed_fields == expected_fields


@pytest.mark.parametrize(
"format, path, model",
itertools.product(
Expand Down

0 comments on commit 0404084

Please sign in to comment.