AMBER parser had a "too strict" regex to extract values (#273)

This PR addresses issue #272. This issue exposed a nasty bug inside the regex pattern we used to extract sections in the amber output file. we are trying to match a sequence of type filed = int/float extracting the int/float corresponding to the field. Up till now, the pattern was fr' {field}\s+=\s+(\*+|{_FP_RE}|\d+)' were field was the string we want to extract the value for, and _FP_RE is defined to extract the float/int after the equal. The problem arises when the field and/or the value are not separated from the = sign by a space (as was the case in the issue). It's strange this didn't appear before! I just changed the + in \s+=\s+ to *. Thinking about it, this should not break anything else, but I can't be 100% sure and the tests should catch any "macro" problems eventually introduced with this PR.
alchemistry · Nov 14, 2022 · 96d5a50 · 96d5a50
1 parent 6e19d4e
commit 96d5a50
Show file tree

Hide file tree

Showing 3 changed files with 34 additions and 1 deletion.
diff --git a/CHANGES b/CHANGES
@@ -13,6 +13,15 @@ The rules for this file:
   * release numbers follow "Semantic Versioning" https://semver.org
 
 ------------------------------------------------------------------------------
+  * 1.0.1
+
+Fixes
+  - AMBER parser now raises ValueError when the initial simulation time
+    is not found (issue #272, PR #273).
+  - The regex in the AMBER parser now reads also 'field=value' pairs where
+    there are no spaces around the equal sign (issue #272, PR #273).
+
+
 10/31/2022 orbeckst, xiki-tempula, DrDomenicoMarson 
 
   * 1.0.0

diff --git a/src/alchemlyb/parsing/amber.py b/src/alchemlyb/parsing/amber.py
@@ -113,7 +113,7 @@ def extract_section(self, start, end, fields, limit=None, extra=''):
         line = ''.join(lines)
         result = []
         for field in fields:
-            match = re.search(fr' {field}\s+=\s+(\*+|{_FP_RE}|\d+)', line)
+            match = re.search(fr' {field}\s*=\s*(\*+|{_FP_RE}|\d+)', line)
             if match:
                 value = match.group(1)
                 if '*' in value:  # catch fortran format overflow
@@ -250,6 +250,10 @@ def file_validation(outfile):
             raise ValueError(f'no "ATOMIC" section found in file {outfile}')
 
         t0, = secp.extract_section('^ begin time', '^$', ['coords'])
+        if t0 is None:
+            logger.error('No starting simulation time in file %s.', outfile)
+            raise ValueError(f'No starting simulation time in file {outfile}')
+
         if not secp.skip_after('^   4.  RESULTS'):
             logger.error('No "RESULTS" section found in the file %s.', outfile)
             raise ValueError(f'no "RESULTS" section found in file {outfile}')

diff --git a/src/alchemlyb/tests/parsing/test_amber.py b/src/alchemlyb/tests/parsing/test_amber.py
@@ -4,6 +4,7 @@
 import logging
 import pytest
 from numpy.testing import assert_allclose
+import pandas as pd
 
 from alchemlyb.parsing.amber import extract_dHdl
 from alchemlyb.parsing.amber import extract_u_nk
@@ -121,6 +122,25 @@ def test_long_and_wrong_number_MBAR(caplog, testfiles):
     assert 'the number of lambda windows read' in caplog.text
 
 
+def test_no_starting_time(caplog, testfiles):
+    """Test if raise an exception if the starting time is not read"""
+    filename = testfiles["no_starting_simulation_time"][0]
+    with pytest.raises(ValueError, match='No starting simulation time in file'):
+        with caplog.at_level(logging.ERROR):
+            _ = extract(str(filename), T=298.0)
+    assert 'No starting simulation time in file' in caplog.text
+
+
+def test_parse_without_spaces_around_equal(testfiles):
+    """
+    Test if the regex is able to extract values where the are no
+    spaces around the equal sign
+    """
+    filename = testfiles["no_spaces_around_equal"][0]
+    df_dict = extract(str(filename), T=298.0)
+    assert isinstance(df_dict['dHdl'], pd.DataFrame)
+
+
 ##################################################################################
 ################ Check the parser behaviour with standard single files
 ##################################################################################