Skip to content

Commit

Permalink
Detect license trouble in slave Eclipse models
Browse files Browse the repository at this point in the history
In a coupled reservoir simulation, there is a master
Eclipse process, which itself starts up Eclipse processes
for its slaves. If the master process fails due to license
trouble, this is caught by the existing code. However, if the
master passes but any of the slaves do not, the PRT files of the
slaves must be parsed to determine whether the failure was
caused by license trouble.
  • Loading branch information
berland committed Sep 23, 2024
1 parent 22bd48c commit d535c95
Show file tree
Hide file tree
Showing 2 changed files with 141 additions and 6 deletions.
38 changes: 32 additions & 6 deletions src/ert/resources/forward-models/res/script/ecl_run.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import datetime
import glob
import os
import os.path
import re
Expand All @@ -9,6 +10,7 @@
from argparse import ArgumentParser
from collections import namedtuple
from contextlib import contextmanager, suppress
from pathlib import Path
from random import random
from typing import List

Expand All @@ -17,13 +19,30 @@
from packaging import version


def ecl_output_has_license_error(ecl_output: str) -> bool:
    """Return True if the given Eclipse output text mentions license trouble.

    The check is a case-sensitive substring search for the phrases this
    module treats as signs of a license problem.

    Args:
        ecl_output: Text to scan, e.g. the contents of a PRT file or the
            message carried by an EclError.

    Returns:
        True if any of the license phrases occurs in ``ecl_output``.
    """
    license_error_markers = (
        "LICENSE ERROR",
        "LICENSE FAILURE",
        "not allowed in license",
    )
    return any(marker in ecl_output for marker in license_error_markers)


class EclError(RuntimeError):
    """Raised when an Eclipse run is found to have failed.

    ``args[0]`` is expected to hold the multiline ERROR messages and the
    SLAVE startup messages extracted from the PRT file of the run.
    """

    def failed_due_to_license_problems(self) -> bool:
        """Return True if the failure can be attributed to license trouble.

        The error message itself is checked first. If it does not mention
        license trouble but reports that a slave run stopped with an error,
        the PRT files in the run directory of every slave listed in the
        message are read and checked as well.

        Returns:
            True if license trouble is found in the message or in any of
            the slave PRT files, otherwise False.
        """
        # self.args[0] contains the multiline ERROR messages and SLAVE startup messages
        message = self.args[0] if self.args else ""
        if ecl_output_has_license_error(message):
            return True
        if not re.search(a_slave_failed_pattern, message):
            return False

        for match in re.finditer(slave_run_paths, message, re.MULTILINE):
            ecl_case_starts_with, ecl_case_dir = match.groups()
            # The directory is captured to the end of the line, so drop any
            # trailing whitespace, and escape it so that glob metacharacters
            # in the path are taken literally.
            slave_dir = glob.escape(ecl_case_dir.strip())
            # The case name in the message may be truncated, hence the
            # wildcard after the captured prefix.
            for prt_file in glob.glob(f"{slave_dir}/{ecl_case_starts_with}*.PRT"):
                if ecl_output_has_license_error(
                    Path(prt_file).read_text(encoding="utf-8")
                ):
                    return True
        return False


def await_process_tee(process, *out_files) -> int:
Expand Down Expand Up @@ -60,6 +79,12 @@ def await_process_tee(process, *out_files) -> int:
# Tail of a PRT message header line: "AT TIME <days> DAYS (<date>:" up to the
# end of the line. Captures the named groups "Days" and "Date".
date_sub_pattern = r"\s+AT TIME\s+(?P<Days>\d+\.\d+)\s+DAYS\s+\((?P<Date>(.+)):\s*$"
# An "@-- ERROR" header line with a time stamp, followed by the message body
# (body_sub_pattern is defined elsewhere in this file).
error_pattern_e100 = rf"^\s@-- ERROR{date_sub_pattern}${body_sub_pattern}"
# An "@--Error" header line without a time stamp, followed by the message body.
error_pattern_e300 = rf"^\s@--Error${body_sub_pattern}"
# An "@--MESSAGE" header line whose next line announces "STARTING SLAVE",
# followed by the message body. The "^"/"$" anchors are per line, so this
# needs re.MULTILINE.
slave_started_pattern = (
rf"^\s@--MESSAGE{date_sub_pattern}\s^\s@\s+STARTING SLAVE.+${body_sub_pattern}"
)
# Matches the notice in the master output that one of its slave runs failed.
a_slave_failed_pattern = r"\s@\s+SLAVE RUN.*HAS STOPPED WITH AN ERROR CONDITION.\s*"
# Matches a three-line "STARTING SLAVE" message; must be used with re.MULTILINE.
# Group 1 is the (possibly truncated) name of the case the slave runs and
# group 2 is the directory the slave runs in.
slave_run_paths = r"\s@\s+STARTING SLAVE .* RUNNING (\w+)\s*^\s@\s+ON HOST.*IN DIRECTORY\s*^\s@\s+(.*)"


def make_LSB_MCPU_machine_list(LSB_MCPU_HOSTS):
Expand Down Expand Up @@ -510,10 +535,11 @@ def parseErrors(self) -> List[str]:
error_list = []
error_e100_regexp = re.compile(error_pattern_e100, re.MULTILINE)
error_e300_regexp = re.compile(error_pattern_e300, re.MULTILINE)
slave_started_regexp = re.compile(slave_started_pattern, re.MULTILINE)
with open(prt_file, "r", encoding="utf-8") as filehandle:
content = filehandle.read()

for regexp in [error_e100_regexp, error_e300_regexp]:
for regexp in [error_e100_regexp, error_e300_regexp, slave_started_regexp]:
offset = 0
while True:
match = regexp.search(content[offset:])
Expand Down
109 changes: 109 additions & 0 deletions tests/unit_tests/resources/test_ecl_run_new_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,3 +390,112 @@ def test_ecl300_crash_is_not_mistaken_as_license_trouble():
with pytest.raises(ecl_run.EclError) as exception_info:
run.assertECLEND()
assert not exception_info.value.failed_due_to_license_problems()


@pytest.mark.usefixtures("use_tmpdir")
def test_license_error_in_slave_is_caught():
"""If a coupled Eclipse model fails in one of the slave runs
due to license issues, there is no trace of the license problem in the
master PRT file. The master PRT file must be searched for the paths to
the SLAVE runs, and then the PRT files of those runs must be parsed.
Note that the name of the DATA file is truncated in the MESSAGE listing
the slaves.
"""
# Run directories that the master PRT text below points to.
Path("slave1").mkdir()
Path("slave2").mkdir()
# Master PRT excerpt: two slave startup messages followed by the error
# reporting that SLAVE2 stopped. No license text appears here.
master_prt_error = f"""\
@--MESSAGE AT TIME 0.0 DAYS ( 1-JAN-2000):
@ THIS IS JUST A MESSAGE, NOTHING ELSE
@--MESSAGE AT TIME 0.0 DAYS ( 1-JAN-2000):
@ STARTING SLAVE SLAVE1 RUNNING EIGHTCEL
@ ON HOST localhost IN DIRECTORY
@ {os.getcwd()}/slave1
@--MESSAGE AT TIME 0.0 DAYS ( 1-JAN-2000):
@ STARTING SLAVE SLAVE2 RUNNING EIGHTCEL
@ ON HOST localhost IN DIRECTORY
@ {os.getcwd()}/slave2
<various_output>
@-- ERROR AT TIME 0.0 DAYS ( 1-JAN-2000):
@ SLAVE RUN SLAVE2 HAS STOPPED WITH AN ERROR CONDITION.
@ MASTER RUN AND REMAINING SLAVES WILL ALSO STOP.
"""
# Error summary reporting a single error.
master_eclend = """
Error summary
Comments 1
Warnings 1
Problems 0
Errors 1
Bugs 0"""

Path("EIGHTCELLS_MASTER.PRT").write_text(
master_prt_error + "\n" + master_eclend, encoding="utf-8"
)
Path("EIGHTCELLS_MASTER.ECLEND").write_text(master_eclend, encoding="utf-8")

# Only the PRT file of the second slave contains the license error.
slave_prt_error = """\
@-- ERROR AT TIME 0.0 DAYS ( 1-JAN-2000):
@ LICENSE ERROR -15 FOR MULTI-SEGMENT WELL OPTION
@ FEATURE IS INVALID. CHECK YOUR LICENSE FILE AND
@ THE LICENSE LOG FILE
"""
# The slave PRT files carry the full case name, while the master message
# above only lists the truncated name EIGHTCEL.
Path("slave1/EIGHTCELLS_SLAVE.PRT").write_text("", encoding="utf-8")
Path("slave2/EIGHTCELLS_SLAVE.PRT").write_text(slave_prt_error, encoding="utf-8")
Path("EIGHTCELLS_MASTER.DATA").write_text("", encoding="utf-8")

run = ecl_run.EclRun("EIGHTCELLS_MASTER.DATA", "dummysimulatorobject")
with pytest.raises(ecl_run.EclError) as exception_info:
run.assertECLEND()
assert exception_info.value.failed_due_to_license_problems()


@pytest.mark.usefixtures("use_tmpdir")
def test_crash_in_slave_is_not_mistaken_as_license():
"""A slave run that stops with an error unrelated to licensing must not
make the master failure be reported as license trouble.
"""
# Run directories that the master PRT text below points to.
Path("slave1").mkdir()
Path("slave2").mkdir()
# Master PRT excerpt: two slave startup messages followed by the error
# reporting that SLAVE2 stopped.
master_prt_error = f"""\
@--MESSAGE AT TIME 0.0 DAYS ( 1-JAN-2000):
@ THIS IS JUST A MESSAGE, NOTHING ELSE
@--MESSAGE AT TIME 0.0 DAYS ( 1-JAN-2000):
@ STARTING SLAVE SLAVE1 RUNNING EIGHTCEL
@ ON HOST localhost IN DIRECTORY
@ {os.getcwd()}/slave1
@--MESSAGE AT TIME 0.0 DAYS ( 1-JAN-2000):
@ STARTING SLAVE SLAVE2 RUNNING EIGHTCEL
@ ON HOST localhost IN DIRECTORY
@ {os.getcwd()}/slave2
<various_output>
@-- ERROR AT TIME 0.0 DAYS ( 1-JAN-2000):
@ SLAVE RUN SLAVE2 HAS STOPPED WITH AN ERROR CONDITION.
@ MASTER RUN AND REMAINING SLAVES WILL ALSO STOP.
"""
# Error summary reporting a single error.
master_eclend = """
Error summary
Comments 1
Warnings 1
Problems 0
Errors 1
Bugs 0"""

Path("EIGHTCELLS_MASTER.PRT").write_text(
master_prt_error + "\n" + master_eclend, encoding="utf-8"
)
Path("EIGHTCELLS_MASTER.ECLEND").write_text(master_eclend, encoding="utf-8")

# The failing slave reports an error that contains no license text.
slave_prt_error = """\
@-- ERROR AT TIME 0.0 DAYS ( 1-JAN-2000):
@ NON-LINEAR CONVERGENCE FAILURE
"""
Path("slave1/EIGHTCELLS_SLAVE.PRT").write_text("", encoding="utf-8")
Path("slave2/EIGHTCELLS_SLAVE.PRT").write_text(slave_prt_error, encoding="utf-8")
Path("EIGHTCELLS_MASTER.DATA").write_text("", encoding="utf-8")

run = ecl_run.EclRun("EIGHTCELLS_MASTER.DATA", "dummysimulatorobject")
with pytest.raises(ecl_run.EclError) as exception_info:
run.assertECLEND()
assert not exception_info.value.failed_due_to_license_problems()

0 comments on commit d535c95

Please sign in to comment.