Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update _split_one_line and remove whitespace parameter #686

Merged
merged 6 commits into from
Nov 18, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 28 additions & 52 deletions src/biotite/structure/io/pdbx/cif.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
__all__ = ["CIFFile", "CIFBlock", "CIFCategory", "CIFColumn", "CIFData"]

import itertools
import re
from collections.abc import MutableMapping, Sequence
import numpy as np
from biotite.file import (
Expand Down Expand Up @@ -357,7 +356,7 @@ def supercomponent_class():
return CIFBlock

@staticmethod
def deserialize(text, expect_whitespace=True):
def deserialize(text):
lines = [line.strip() for line in text.splitlines() if not _is_empty(line)]

if _is_loop_start(lines[0]):
Expand All @@ -372,7 +371,7 @@ def deserialize(text, expect_whitespace=True):

lines = _to_single(lines)
if is_looped:
category_dict = CIFCategory._deserialize_looped(lines, expect_whitespace)
category_dict = CIFCategory._deserialize_looped(lines)
else:
category_dict = CIFCategory._deserialize_single(lines)
return CIFCategory(category_dict, category_name)
Expand Down Expand Up @@ -445,15 +444,15 @@ def _deserialize_single(lines):
line_i = 0
while line_i < len(lines):
line = lines[line_i]
parts = _split_one_line(line)
parts = list(_split_one_line(line))
if len(parts) == 2:
# Standard case -> name and value in one line
name_part, value_part = parts
line_i += 1
elif len(parts) == 1:
# Value is a multiline value on the next line
name_part = parts[0]
parts = _split_one_line(lines[line_i + 1])
parts = list(_split_one_line(lines[line_i + 1]))
if len(parts) == 1:
value_part = parts[0]
else:
Expand All @@ -467,7 +466,7 @@ def _deserialize_single(lines):
return category_dict

@staticmethod
def _deserialize_looped(lines, expect_whitespace):
def _deserialize_looped(lines):
"""
Process a category where each field has multiple values
(category is a table).
Expand All @@ -490,20 +489,7 @@ def _deserialize_looped(lines, expect_whitespace):
# row-line-alignment at all and simply cycle through columns
column_indices = itertools.cycle(range(len(column_names)))
for data_line in data_lines:
# If whitespace is expected in quote protected values,
# use regex-based _split_one_line() to split
# Otherwise use the much faster whitespace split
# and quote removal if applicable.
if expect_whitespace:
values = _split_one_line(data_line)
else:
values = data_line.split()
for k in range(len(values)):
# Remove quotes
if (values[k][0] == '"' and values[k][-1] == '"') or (
values[k][0] == "'" and values[k][-1] == "'"
):
values[k] = values[k][1:-1]
values = _split_one_line(data_line)
for val in values:
column_index = next(column_indices)
column_name = column_names[column_index]
Expand Down Expand Up @@ -685,15 +671,7 @@ def __getitem__(self, key):
# Element is stored in serialized form
# -> must be deserialized first
try:
# Special optimization for "atom_site":
# Even if the values are quote protected,
# no whitespace is expected in escaped values
# Therefore slow regex-based _split_one_line() call is not necessary
if key == "atom_site":
expect_whitespace = False
else:
expect_whitespace = True
category = CIFCategory.deserialize(category, expect_whitespace)
category = CIFCategory.deserialize(category)
except Exception:
raise DeserializationError(f"Failed to deserialize category '{key}'")
# Update with deserialized object
Expand Down Expand Up @@ -1062,29 +1040,27 @@ def _split_one_line(line):
"""
# Special case of multiline value, where the line starts with ';'
if line[0] == ";":
return [line[1:]]

# Define the patterns for different types of fields
single_quote_pattern = r"('(?:'(?! )|[^'])*')(?:\s|$)"
double_quote_pattern = r'("(?:"(?! )|[^"])*")(?:\s|$)'
unquoted_pattern = r"([^\s]+)"

# Combine the patterns using alternation
combined_pattern = (
f"{single_quote_pattern}|{double_quote_pattern}|{unquoted_pattern}"
)

# Find all matches
matches = re.findall(combined_pattern, line)

# Extract non-empty groups from the matches
fields = []
for match in matches:
field = next(group for group in match if group)
if field[0] == field[-1] == "'" or field[0] == field[-1] == '"':
field = field[1:-1]
fields.append(field)
return fields
yield line[1:]
else:
# Loop over the line
while line:
# Strip leading whitespace(s)
striped_line = line.lstrip()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only a small typo

Suggested change
striped_line = line.lstrip()
stripped_line = line.lstrip()

# Split the line on whitespace
word, _, line = striped_line.partition(" ")
padix-key marked this conversation as resolved.
Show resolved Hide resolved
# Handle the case where the word starts with a quote
if word.startswith(("'", '"')):
# Set the separator to the quote found
separator = word[0]
# Handle the case of a quoted word without space
if word.endswith(separator) and len(word) > 1:
# Yield the word without the opening and closing quotes
yield word[1:-1]
continue
# split the word on the separator
word, _, line = striped_line[1:].partition(separator)
padix-key marked this conversation as resolved.
Show resolved Hide resolved

yield word


def _arrayfy(data):
Expand Down
2 changes: 1 addition & 1 deletion tests/database/test_rcsb.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def test_fetch(format, as_file_like):
if format == "pdb":
file = pdb.PDBFile.read(file_path_or_obj)
pdb.get_structure(file)
elif format == "pdbx":
elif format == "cif":
padix-key marked this conversation as resolved.
Show resolved Hide resolved
file = pdbx.CIFFile.read(file_path_or_obj)
pdbx.get_structure(file)
elif format == "bcif":
Expand Down
2 changes: 1 addition & 1 deletion tests/structure/io/test_pdbx.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def test_split_one_line(cif_line, expected_fields):
"""
Test whether values that have an embedded quote are properly escaped.
"""
assert pdbx.cif._split_one_line(cif_line) == expected_fields
assert list(pdbx.cif._split_one_line(cif_line)) == expected_fields


@pytest.mark.parametrize(
Expand Down
Loading