diff --git a/.github/workflows/lint_and_test.yml b/.github/workflows/lint_and_test.yml index ed8d822..dc8caf4 100644 --- a/.github/workflows/lint_and_test.yml +++ b/.github/workflows/lint_and_test.yml @@ -15,10 +15,10 @@ jobs: steps: - name: Checkout source repository - uses: actions/checkout@v3.3.0 + uses: actions/checkout@v4 - name: Setup Python - uses: actions/setup-python@v4.5.0 + uses: actions/setup-python@v5 with: python-version: 3.11 architecture: x64 diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 0aadcbb..9523348 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -30,7 +30,7 @@ jobs: contents: read steps: - - uses: actions/checkout@v3.3.0 + - uses: actions/checkout@v4 - name: Build image run: docker build . --file Dockerfile --tag $IMAGE_NAME --label "runnumber=${GITHUB_RUN_ID}" diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index 2c651d1..32c2483 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -14,11 +14,11 @@ jobs: run: | git config user.name github-actions[bot] git config user.email 41898282+github-actions[bot]@users.noreply.github.com - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 with: python-version: 3.x - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV - - uses: actions/cache@v3 + - uses: actions/cache@v4 with: key: mkdocs-material-${{ env.cache_id }} path: .cache diff --git a/app/borehole_map.py b/app/borehole_map.py index 1e5d70c..c1bedbd 100644 --- a/app/borehole_map.py +++ b/app/borehole_map.py @@ -22,7 +22,7 @@ def extract_geojson(filepath: Path) -> dict: Read an AGS4 file and extract geojson represenation of LOCA table and metadata. """ - logger.info("Extracting geojson from %s", filepath.name) + logger.info("Extracting geojson from %s", filepath.name) # Read data file tables, load_error, _ = load_tables_reporting_errors(filepath) diff --git a/app/checkers.py b/app/checkers.py index a473626..04f2145 100644 --- a/app/checkers.py +++ b/app/checkers.py @@ -43,11 +43,11 @@ def check_ags(filename: Path, standard_AGS4_dictionary: Optional[str] = None) -> errors = {'File read error': [{'line': line_no, 'group': '', 'desc': description}]} dictionary = '' - # Discard unecessary summary from errors dictionary - errors.pop('Summary of data', None) + # Use summary from errors dictionary is available + summary = errors.pop('Summary of data', []) return dict(checker=f'python_ags4 v{python_ags4.__version__}', - errors=errors, dictionary=dictionary) + errors=errors, dictionary=dictionary, summary=summary) def check_bgs(filename: Path, **kwargs) -> dict: diff --git a/app/routes.py b/app/routes.py index 7c813fe..7ab96ea 100644 --- a/app/routes.py +++ b/app/routes.py @@ -15,6 +15,7 @@ from requests.exceptions import Timeout, ConnectionError, HTTPError from app import conversion, validation +from app.borehole_map import extract_geojson from app.checkers import check_ags, check_bgs from app.errors import error_responses, InvalidPayloadError from app.schemas import ValidationResponse, BoreholeCountResponse @@ -68,6 +69,14 @@ class Checker(StrEnum): bgs = "bgs" +# Enum for sorting strategy logic +class SortingStrategy(StrEnum): + default = "default" + alphabetical = "alphabetical" + hierarchy = "hierarchy" + dictionary = "dictionary" + + # Enum for pdf response type logic class ResponseType(StrEnum): attachment = "attachment" @@ -85,6 +94,13 @@ class ResponseType(StrEnum): description='Response format: json or text', ) +geometry_form = Form( + default=False, + title='GeoJSON Option', + description=('Return GeoJSON if possible, otherwise return an error message ' + ' Option: True or False'), +) + dictionary_form = Form( default=None, title='Validation Dictionary', @@ -110,10 +126,10 @@ class ResponseType(StrEnum): ) sort_tables_form = Form( - default='default', + default=SortingStrategy.default, title='Sort worksheets', - description=('Sort the worksheets into alphabetical order ' - 'or leave in the order found in the AGS file. ' + description=('Sort the worksheets into alphabetical, hierarchical ' + 'dictionary or default order, that found in the AGS file. ' 'This option is ignored when converting to AGS.'), ) @@ -163,6 +179,7 @@ async def validate(background_tasks: BackgroundTasks, std_dictionary: Dictionary = dictionary_form, checkers: List[Checker] = validate_form, fmt: Format = format_form, + return_geometry: bool = geometry_form, request: Request = None): """ Validate an AGS4 file to the AGS File Format v4.x rules and the NGDC data submission requirements. @@ -178,6 +195,8 @@ async def validate(background_tasks: BackgroundTasks, :param fmt: The format to return the validation results in. Options are "text" or "json". :type fmt: Format + :param return_geometry: Include GeoJSON in validation response. Options are True or False. + :type return_geometry: bool :param request: The request object. :type request: Request :return: A response with the validation results in either plain text or JSON format. @@ -203,6 +222,13 @@ async def validate(background_tasks: BackgroundTasks, local_ags_file.write_bytes(contents) result = validation.validate( local_ags_file, checkers=checkers, standard_AGS4_dictionary=dictionary) + if return_geometry: + try: + geojson = extract_geojson(local_ags_file) + result['geojson'] = geojson + except ValueError as ve: + result['geojson'] = {} + result['geojson_error'] = str(ve) data.append(result) if fmt == Format.TEXT: @@ -258,7 +284,7 @@ async def convert(background_tasks: BackgroundTasks, :raises Exception: If the conversion fails or an unexpected error occurs. """ - if sort_tables == 'default': + if sort_tables == SortingStrategy.default: sort_tables = None if not files[0].filename: raise InvalidPayloadError(request) diff --git a/app/schemas.py b/app/schemas.py index c78477d..859f04f 100644 --- a/app/schemas.py +++ b/app/schemas.py @@ -47,7 +47,10 @@ class Validation(BaseModel): message: str = Field(None, example="7 error(s) found in file!") errors: Dict[str, List[LineError]] = Field(..., example="Rule 1a") valid: bool = Field(..., example='false') + summary: List[dict] = list() additional_metadata: dict = Field(...) + geojson: dict = dict() + geojson_error: str = None @validator('errors') def errors_keys_must_be_known_rules(cls, errors): diff --git a/app/templates/landing_page.html b/app/templates/landing_page.html index 8d9098b..13e4201 100644 --- a/app/templates/landing_page.html +++ b/app/templates/landing_page.html @@ -123,12 +123,20 @@

Future data validation rules: (Coming Soon)


Select response format: - - -
+ + +

+
+ If HTML show LOCA features on a map / If JSON include GeoJSON + + + +
+
+
Select .ags / .AGS file(s) for validation (v4.x only) (50 Mb Maximum) @@ -165,15 +173,15 @@

AGS Converter


- Sort worksheets in .xlsx file using sorting strategy(Warning: .ags to .xlsx only. The original group order will be lost) + Sort worksheets in .xlsx file using a sorting strategy (Warning: .ags to .xlsx only. The original group order will be lost) + +
- +
- +
- - -
+

diff --git a/app/validation.py b/app/validation.py index e4f950a..2f5b73a 100644 --- a/app/validation.py +++ b/app/validation.py @@ -47,6 +47,8 @@ def validate(filename: Path, all_errors = {} all_checkers = [] + bgs_additional_metadata = {} + ags_summary = [] # Don't process if file is not .ags format if filename.suffix.lower() != '.ags': all_errors.update( @@ -57,19 +59,32 @@ def validate(filename: Path, # Run checkers to extract errors and other metadata for checker in checkers: # result is a dictionary with 'errors', 'checker' and other keys - result = checker(filename, standard_AGS4_dictionary=dictionary_file) + result: dict = checker(filename, standard_AGS4_dictionary=dictionary_file) + # Pull 'errors' out to add to running total all_errors.update(result.pop('errors')) all_checkers.append(result.pop('checker')) - # Handle additional metadata + + # Extract checker-dependent additional metadata try: - response['additional_metadata'].update(result.pop('additional_metadata')) + bgs_additional_metadata = result.pop('additional_metadata') except KeyError: - # No additional metadata pass + + try: + ags_summary = result.pop('summary') + except KeyError: + pass + # Add remaining keys to response response.update(result) + # We only want one checker-dependent metadata; use BGS if available. + if bgs_additional_metadata: + response['additional_metadata'] = bgs_additional_metadata + else: + response['summary'] = ags_summary + error_count = len(reduce(lambda total, current: total + current, all_errors.values(), [])) if error_count > 0: message = f'{error_count} error(s) found in file!' diff --git a/test/fixtures_json.py b/test/fixtures_json.py index 309ddeb..4b2f939 100644 --- a/test/fixtures_json.py +++ b/test/fixtures_json.py @@ -10,7 +10,10 @@ 'message': 'All checks passed!', 'errors': {}, 'valid': True, - 'additional_metadata': {} + 'summary': [], + 'additional_metadata': {}, + 'geojson': {}, + 'geojson_error': None }, 'example_broken_ags.ags': { "filename": "example_broken_ags.ags", @@ -95,7 +98,10 @@ ], }, "valid": False, - 'additional_metadata': {} + 'summary': [], + 'additional_metadata': {}, + 'geojson': {}, + 'geojson_error': None }, 'nonsense.AGS': { 'filename': 'nonsense.AGS', @@ -119,7 +125,10 @@ 'AGS Format Rule 15': [{'line': '-', 'group': 'UNIT', 'desc': 'UNIT group not found.'}], 'AGS Format Rule 17': [{'line': '-', 'group': 'TYPE', 'desc': 'TYPE group not found.'}]}, 'valid': False, - 'additional_metadata': {} + 'summary': [], + 'additional_metadata': {}, + 'geojson': {}, + 'geojson_error': None }, 'random_binary.ags': { 'filename': 'random_binary.ags', @@ -281,7 +290,10 @@ "If not 'utf-8', then the encoding is most likely to be 'windows-1252' " "aka 'cp1252')"}]}, 'valid': False, - 'additional_metadata': {} + 'summary': [], + 'additional_metadata': {}, + 'geojson': {}, + 'geojson_error': None }, 'real/AGS3/CG014058_F.ags': { 'filename': 'CG014058_F.ags', @@ -297,7 +309,10 @@ 'group': '', 'line': 1}]}, 'valid': False, - 'additional_metadata': {} + 'summary': [], + 'additional_metadata': {}, + 'geojson': {}, + 'geojson_error': None }, 'real/Blackburn Southern Bypass.ags': { 'filename': 'Blackburn Southern Bypass.ags', @@ -695,7 +710,10 @@ 'It is highly recommended that the file be saved without BOM encoding ' 'to avoid issues with other software.'}]}, 'valid': False, - 'additional_metadata': {} + 'summary': [], + 'additional_metadata': {}, + 'geojson': {}, + 'geojson_error': None }, 'real/AGS3/A3040_03.ags': { 'filename': 'A3040_03.ags', @@ -712,7 +730,10 @@ 'line': 1}]}, 'valid': False, - 'additional_metadata': {} + 'summary': [], + 'additional_metadata': {}, + 'geojson': {}, + 'geojson_error': None }, 'extension_is.bad': { 'filename': 'extension_is.bad', @@ -724,10 +745,173 @@ 'errors': {'File read error': [ {'line': '-', 'group': '', 'desc': 'extension_is.bad is not an .ags file'}]}, 'valid': False, - 'additional_metadata': {} + 'summary': [], + 'additional_metadata': {}, + 'geojson': {}, + 'geojson_error': None }, } +GEOJSON_RESPONSES = { + 'example_ags.ags': { + 'filename': 'example_ags.ags', + 'filesize': 4105, + 'checkers': ['python_ags4 v0.5.0'], + 'dictionary': 'Standard_dictionary_v4_1_1.ags', + 'time': dt.datetime(2021, 8, 23, 14, 25, 43, tzinfo=dt.timezone.utc), + 'message': 'All checks passed!', + 'errors': {}, + 'valid': True, + 'summary': [], + 'additional_metadata': {}, + 'geojson': { + 'features': [{ + 'geometry': { + 'coordinates': [-0.22760675836552394, 51.491649521233036], + 'type': 'Point'}, + 'id': '121415.327-16A', + 'properties': { + 'LOCA_ALID': '', + 'LOCA_CLST': '', + 'LOCA_CNGE': '', + 'LOCA_DATM': '', + 'LOCA_ELAT': '', + 'LOCA_ELON': '', + 'LOCA_ENDD': '', + 'LOCA_ETRV': None, + 'LOCA_FDEP': None, + 'LOCA_FILE_FSET': '', + 'LOCA_GL': None, + 'LOCA_GREF': '', + 'LOCA_ID': '327-16A', + 'LOCA_LAT': '', + 'LOCA_LETT': '', + 'LOCA_LLZ': '', + 'LOCA_LOCA': '', + 'LOCA_LOCM': '', + 'LOCA_LOCX': None, + 'LOCA_LOCY': None, + 'LOCA_LOCZ': None, + 'LOCA_LON': '', + 'LOCA_LREF': '', + 'LOCA_LTRV': None, + 'LOCA_NATE': 523145.0, + 'LOCA_NATN': 178456.12, + 'LOCA_NTRV': None, + 'LOCA_OFFS': None, + 'LOCA_PURP': '', + 'LOCA_REM': '', + 'LOCA_STAR': '', + 'LOCA_STAT': '', + 'LOCA_TERM': '', + 'LOCA_TRAN': '', + 'LOCA_TYPE': '', + 'LOCA_XTRL': None, + 'LOCA_YTRL': None, + 'LOCA_ZTRL': None, + 'PROJ_CLNT': 'ACME Enterprises', + 'PROJ_CONT': 'ACME Drilling Ltd', + 'PROJ_ENG': '', + 'PROJ_FILE_FSET': '', + 'PROJ_ID': '121415', + 'PROJ_LOC': 'Anytown', + 'PROJ_MEMO': '', + 'PROJ_NAME': 'ACME Gas Works Redevelopment', + 'line_no': 1}, + 'type': 'Feature'}], + 'type': 'FeatureCollection'}, + 'geojson_error': None + }, + 'example_broken_ags.ags': { + "filename": "example_broken_ags.ags", + "filesize": 4111, + "checkers": ["python_ags4 v0.5.0"], + 'dictionary': 'Standard_dictionary_v4_1_1.ags', + 'time': dt.datetime(2021, 8, 23, 14, 25, 43, tzinfo=dt.timezone.utc), + "message": "13 error(s) found in file!", + "errors": { + "AGS Format Rule 4": [ + { + "line": 31, + "group": "TYPE", + "desc": "Number of fields does not match the HEADING row." + }, + { + "line": 34, + "group": "TYPE", + "desc": "Number of fields does not match the HEADING row." + }, + { + "line": 36, + "group": "TYPE", + "desc": "Number of fields does not match the HEADING row." + } + ], + "AGS Format Rule 5": [ + { + "line": 31, + "group": "", + "desc": "Contains fields that are not enclosed in double quotes." + }, + { + "line": 32, + "group": "", + "desc": "Contains fields that are not enclosed in double quotes." + }, + { + "line": 34, + "group": "", + "desc": "Contains fields that are not enclosed in double quotes." + }, + { + "line": 35, + "group": "", + "desc": "Contains fields that are not enclosed in double quotes." + }, + { + "line": 36, + "group": "", + "desc": "Contains fields that are not enclosed in double quotes." + }, + { + "line": 37, + "group": "", + "desc": "Contains fields that are not enclosed in double quotes." + } + ], + "AGS Format Rule 3": [ + { + "line": 32, + "group": "", + "desc": "Does not start with a valid data descriptor." + }, + { + "line": 35, + "group": "", + "desc": "Does not start with a valid data descriptor." + }, + { + "line": 37, + "group": "", + "desc": "Does not start with a valid data descriptor." + } + ], + "AGS Format Rule ?": [ + { + "line": "-", + "group": "", + "desc": "Line 31 does not have the same number of entries as the HEADING row in TYPE." + } + ], + }, + "valid": False, + 'summary': [], + 'additional_metadata': {}, + 'geojson': {}, + 'geojson_error': 'Line 31 does not have the same number of entries as the HEADING row in TYPE.' + } +} + # These response values break the schema BROKEN_JSON_RESPONSES = [ { @@ -743,7 +927,10 @@ 'desc': ''}], }, 'valid': False, - 'additional_metadata': {} + 'summary': [], + 'additional_metadata': {}, + 'geojson': {}, + 'geojson_error': None }, { 'filename': 'nonsense.AGS', @@ -758,6 +945,9 @@ 'desc': ''}], }, 'valid': False, - 'additional_metadata': {} + 'summary': [], + 'additional_metadata': {}, + 'geojson': {}, + 'geojson_error': None }, ] diff --git a/test/integration/test_api.py b/test/integration/test_api.py index 8790411..c95c87c 100644 --- a/test/integration/test_api.py +++ b/test/integration/test_api.py @@ -18,7 +18,7 @@ import app.routes as app_routes from test.fixtures import (BAD_FILE_DATA, DICTIONARIES, FROZEN_TIME, GOOD_FILE_DATA) -from test.fixtures_json import JSON_RESPONSES +from test.fixtures_json import JSON_RESPONSES, GEOJSON_RESPONSES from test.fixtures_plain_text import PLAIN_TEXT_RESPONSES IN_GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true" @@ -41,14 +41,17 @@ def test_openapi_json(client): @pytest.mark.parametrize('filename, expected', [item for item in JSON_RESPONSES.items()]) +@pytest.mark.parametrize('return_geometry', [False, None]) @pytest.mark.asyncio -async def test_validate_json(async_client, filename, expected): +async def test_validate_json(async_client, filename, expected, return_geometry): # Arrange filename = TEST_FILE_DIR / filename file = ('files', (filename.name, open(filename, 'rb'), 'text/plain')) fields = [] fields.append(file) fields.append(('checkers', 'ags')) + if return_geometry is not None: + fields.append(('return_geometry', str(return_geometry))) mp_encoder = MultipartEncoder(fields=fields) # Act @@ -69,6 +72,43 @@ async def test_validate_json(async_client, filename, expected): assert len(body['data']) == 1 assert set(body['data'][0]) == set(expected.keys()) assert body['data'][0]['filename'] == expected['filename'] + assert body['data'][0]['geojson'] == expected['geojson'] + assert body['data'][0]['geojson_error'] == expected['geojson_error'] + + +@pytest.mark.parametrize('filename, expected', + [item for item in GEOJSON_RESPONSES.items()]) +@pytest.mark.asyncio +async def test_geojson_response(async_client, filename, expected): + # Arrange + filename = TEST_FILE_DIR / filename + file = ('files', (filename.name, open(filename, 'rb'), 'text/plain')) + fields = [] + fields.append(file) + fields.append(('checkers', 'ags')) + fields.append(('return_geometry', str(True))) + mp_encoder = MultipartEncoder(fields=fields) + + # Act + async with async_client as ac: + response = await ac.post( + '/validate/', + headers={'Content-Type': mp_encoder.content_type}, + data=mp_encoder.to_string()) + + # Assert + assert response.status_code == 200 + assert response.headers['content-type'] == 'application/json' + body = response.json() + assert set(body.keys()) == {'msg', 'type', 'self', 'data'} + assert body['msg'] is not None + assert body['type'] == 'success' + assert body['self'] is not None + assert len(body['data']) == 1 + assert set(body['data'][0]) == set(expected.keys()) + assert body['data'][0]['filename'] == expected['filename'] + assert body['data'][0]['geojson'] == expected['geojson'] + assert body['data'][0]['geojson_error'] == expected['geojson_error'] @pytest.mark.asyncio diff --git a/test/unit/test_validation.py b/test/unit/test_validation.py index 2716409..43f6e53 100644 --- a/test/unit/test_validation.py +++ b/test/unit/test_validation.py @@ -15,12 +15,32 @@ def mock_check_ags(filename, standard_AGS4_dictionary=None): return dict(checker='ags', dictionary='some_dict', - errors={}) + errors={}, + summary=[{'desc': '7 groups identified in file: PROJ ABBR TRAN TYPE UNIT ' + 'LOCA SAMP', + 'group': '', + 'line': ''}, + {'desc': '1 data row(s) in LOCA group', 'group': '', 'line': ''}, + {'desc': 'Optional DICT group present? False', + 'group': '', + 'line': ''}, + {'desc': 'Optional FILE group present? False', + 'group': '', + 'line': ''}] + ) def mock_check_bgs(filename, **kwargs): return dict(checker='bgs', - errors={'BGS': [{}]}) + errors={'BGS': [{}]}, + additional_metadata={ + 'bgs_all_groups': '7 groups identified in file: ' + 'PROJ ABBR TRAN TYPE UNIT LOCA SAMP', + 'bgs_dict': 'Optional DICT group present: False', + 'bgs_file': 'Optional FILE group present: False', + 'bgs_loca_rows': '1 data rows in LOCA group', + 'bgs_projects': '1 projects found: 121415 (ACME Gas Works Redevelopment)' + }) @freeze_time(FROZEN_TIME) @@ -36,6 +56,17 @@ def test_validate_default_checker(): 'filesize': 0, 'message': 'All checks passed!', 'time': dt.datetime(2021, 8, 23, 14, 25, 43, tzinfo=dt.timezone.utc), + 'summary': [{'desc': '7 groups identified in file: PROJ ABBR TRAN TYPE UNIT ' + 'LOCA SAMP', + 'group': '', + 'line': ''}, + {'desc': '1 data row(s) in LOCA group', 'group': '', 'line': ''}, + {'desc': 'Optional DICT group present? False', + 'group': '', + 'line': ''}, + {'desc': 'Optional FILE group present? False', + 'group': '', + 'line': ''}], 'valid': True, 'additional_metadata': {}} @@ -60,7 +91,14 @@ def test_validate_bgs_checker(): 'message': '1 error(s) found in file!', 'time': dt.datetime(2021, 8, 23, 14, 25, 43, tzinfo=dt.timezone.utc), 'valid': False, - 'additional_metadata': {}} + 'additional_metadata': {'bgs_all_groups': '7 groups identified in file: PROJ ' + 'ABBR TRAN TYPE UNIT LOCA SAMP', + 'bgs_dict': 'Optional DICT group present: False', + 'bgs_file': 'Optional FILE group present: False', + 'bgs_loca_rows': '1 data rows in LOCA group', + 'bgs_projects': '1 projects found: 121415 (ACME Gas ' + 'Works Redevelopment)'} + } # Act response = validation.validate(filename, checkers=[mock_check_bgs]) @@ -83,7 +121,14 @@ def test_validate_both_checkers(): 'message': '1 error(s) found in file!', 'time': dt.datetime(2021, 8, 23, 14, 25, 43, tzinfo=dt.timezone.utc), 'valid': False, - 'additional_metadata': {}} + 'additional_metadata': { + 'bgs_all_groups': '7 groups identified in file: ' + 'PROJ ABBR TRAN TYPE UNIT LOCA SAMP', + 'bgs_dict': 'Optional DICT group present: False', + 'bgs_file': 'Optional FILE group present: False', + 'bgs_loca_rows': '1 data rows in LOCA group', + 'bgs_projects': '1 projects found: 121415 (ACME Gas Works Redevelopment)' + }} # Act response = validation.validate(filename, checkers=[mock_check_bgs, mock_check_ags]) @@ -106,6 +151,7 @@ def test_validate_non_ags(): 'message': '1 error(s) found in file!', 'time': dt.datetime(2021, 8, 23, 14, 25, 43, tzinfo=dt.timezone.utc), 'valid': False, + 'summary': [], 'additional_metadata': {}} # Act