From 8411779a6bb8875e3c17268587cfbe1ee264af4c Mon Sep 17 00:00:00 2001
From: paulfouquet <86932794+paulfouquet@users.noreply.github.com>
Date: Wed, 13 Jul 2022 09:27:47 +1200
Subject: [PATCH] feat: Non Visual QA script (TDE-309) (#40)

* refactor: implement a reusable component to run a GDAL command (with AWS permissions)

* wip

* feat: Non Visual QA script running on single file

* test: add Non Visual QA Unit Tests for the check methods

* feat: Non Visual QA script accepts multiple files
---
 scripts/aws_helper.py               |   4 +
 scripts/file_helper.py              |   5 +
 scripts/gdal_helper.py              |  74 ++++++++++++++
 scripts/non_visual_qa.py            | 124 ++++++++++++++++++++++++
 scripts/standardising.py            |  28 ++----
 scripts/tests/non_visual_qa_test.py | 144 ++++++++++++++++++++++++++++
 6 files changed, 357 insertions(+), 22 deletions(-)
 create mode 100644 scripts/file_helper.py
 create mode 100644 scripts/gdal_helper.py
 create mode 100644 scripts/non_visual_qa.py
 create mode 100644 scripts/tests/non_visual_qa_test.py

diff --git a/scripts/aws_helper.py b/scripts/aws_helper.py
index 3904933ea..1c9e86049 100644
--- a/scripts/aws_helper.py
+++ b/scripts/aws_helper.py
@@ -81,3 +81,7 @@ def get_bucket_name_from_path(path: str) -> str:
 def parse_path(path: str) -> Tuple[str, str]:
     parse = urlparse(path, allow_fragments=False)
     return parse.netloc, parse.path
+
+
+def is_s3(path: str) -> bool:
+    return path.startswith("s3://")
diff --git a/scripts/file_helper.py b/scripts/file_helper.py
new file mode 100644
index 000000000..d6d391052
--- /dev/null
+++ b/scripts/file_helper.py
@@ -0,0 +1,5 @@
+import os
+
+
+def get_file_name_from_path(path: str) -> str:
+    return os.path.basename(path)
diff --git a/scripts/gdal_helper.py b/scripts/gdal_helper.py
new file mode 100644
index 000000000..ce674cc60
--- /dev/null
+++ b/scripts/gdal_helper.py
@@ -0,0 +1,74 @@
+import os
+import subprocess
+from typing import List
+
+from aws_helper import get_bucket_name_from_path, get_credentials, is_s3
+from linz_logger import get_log
+
+
+def get_vfs_path(path: str) -> str:
+    """Make the path as a GDAL Virtual File Systems path.
+
+    Args:
+        path (str): a path to a file.
+
+    Returns:
+        str: the path modified to comply with the corresponding storage service.
+    """
+    return path.replace("s3://", "/vsis3/")
+
+
+def command_to_string(command: List[str]) -> str:
+    """Format the command, each arguments separated by a white space.
+
+    Args:
+        command (List[str]): each arguments of the command as a string in a list.
+
+    Returns:
+        str: the formatted command.
+    """
+    return " ".join(command)
+
+
+def run_gdal(command: List[str], input_file: str = "", output_file: str = "") -> "subprocess.CompletedProcess[bytes]":
+    """Run the GDAL command. The permissions to access to the input file are applied to the gdal environment.
+
+    Args:
+        command (List[str]): each arguments of the GDAL command.
+        input_file (str, optional): the input file path. Defaults to "".
+        output_file (str, optional): the output file path. Defaults to "".
+
+    Raises:
+        cpe: CalledProcessError is raised if something goes wrong during the execution of the command.
+
+    Returns:
+        subprocess.CompletedProcess: the output process.
+    """
+    gdal_env = os.environ.copy()
+
+    if input_file:
+        if is_s3(input_file):
+            # Set the credentials for GDAL to be able to read the input file
+            credentials = get_credentials(get_bucket_name_from_path(input_file))
+            gdal_env["AWS_ACCESS_KEY_ID"] = credentials.access_key
+            gdal_env["AWS_SECRET_ACCESS_KEY"] = credentials.secret_key
+            gdal_env["AWS_SESSION_TOKEN"] = credentials.token
+        command.append(get_vfs_path(input_file))
+
+    if output_file:
+        command.append(output_file)
+
+    try:
+        get_log().debug("run_gdal", command=command_to_string(command))
+        proc = subprocess.run(
+            command,
+            env=gdal_env,
+            check=True,
+            capture_output=True,
+        )
+    except subprocess.CalledProcessError as cpe:
+        get_log().error("run_gdal_failed", command=command_to_string(command))
+        raise cpe
+    get_log().debug("run_gdal_translate_succeded", command=command_to_string(command))
+
+    return proc
diff --git a/scripts/non_visual_qa.py b/scripts/non_visual_qa.py
new file mode 100644
index 000000000..0fea83ece
--- /dev/null
+++ b/scripts/non_visual_qa.py
@@ -0,0 +1,124 @@
+import argparse
+import json
+from typing import Any, Dict, List
+
+from format_source import format_source
+from gdal_helper import run_gdal
+from linz_logger import get_log
+
+
+def check_no_data(gdalinfo: Dict[str, Any], errors_list: List[str]) -> None:
+    """Add an error in errors_list if there is no "noDataValue" or the "noDataValue" is not equal to 255 in the "bands".
+
+    Args:
+        gdalinfo (Dict[str, Any]): JSON return of gdalinfo in a Python Dictionary.
+        errors_list (List[str]): List of errors as strings.
+    """
+    bands = gdalinfo["bands"]
+    if "noDataValue" in bands[0]:
+        current_nodata_val = bands[0]["noDataValue"]
+        if current_nodata_val != 255:
+            errors_list.append(f"noDataValue is {int(current_nodata_val)} not 255")
+    else:
+        errors_list.append("noDataValue not set")
+
+
+def check_band_count(gdalinfo: Dict[str, Any], errors_list: List[str]) -> None:
+    """Add an error in errors_list if there is no exactly 3 bands found.
+
+    Args:
+        gdalinfo (Dict[str, Any]): JSON returned by gdalinfo as a Python Dictionary.
+        errors_list (List[str]): List of errors as strings.
+    """
+    bands = gdalinfo["bands"]
+    bands_num = len(bands)
+    if bands_num != 3:
+        errors_list.append(f"not 3 bands, {bands_num} bands found")
+
+
+def check_srs(gdalsrsinfo: bytes, gdalsrsinfo_tif: bytes, errors_list: List[str]) -> None:
+    """Add an error in errors_list if gdalsrsinfo and gdalsrsinfo_tif values are different.
+
+    Args:
+        gdalsrsinfo (str): Value returned by gdalsrsinfo as a string.
+        gdalsrsinfo_tif (str): Value returned by gdalsrsinfo for the tif as a string.
+        errors_list (List[str]): List of errors as strings.
+    """
+    if gdalsrsinfo_tif != gdalsrsinfo:
+        errors_list.append("different srs")
+
+
+def check_color_interpretation(gdalinfo: Dict[str, Any], errors_list: List[str]) -> None:
+    bands = gdalinfo["bands"]
+    missing_bands = []
+    band_colour_ints = {1: "Red", 2: "Green", 3: "Blue"}
+    n = 1
+    for band in bands:
+        colour_int = band["colorInterpretation"]
+        if n in band_colour_ints:
+            if colour_int != band_colour_ints[n]:
+                missing_bands.append(f"band {n} {colour_int}")
+        else:
+            missing_bands.append(f"band {n} {colour_int}")
+        n += 1
+    if missing_bands:
+        missing_bands.sort()
+        errors_list.append(f"unexpected color interpretation bands; {', '.join(missing_bands)}")
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--source", dest="source", nargs="+", required=True)
+    arguments = parser.parse_args()
+    source = arguments.source
+
+    source = format_source(source)
+
+    # Get srs
+    gdalsrsinfo_command = ["gdalsrsinfo", "-o", "wkt", "EPSG:2193"]
+    gdalsrsinfo_result = run_gdal(gdalsrsinfo_command)
+    if gdalsrsinfo_result.stderr:
+        raise Exception(
+            f"Error trying to retrieve srs from epsg code, no files have been checked\n{gdalsrsinfo_result.stderr!r}"
+        )
+    srs = gdalsrsinfo_result.stdout
+
+    for file in source:
+        gdalinfo_command = ["gdalinfo", "-stats", "-json"]
+        gdalinfo_process = run_gdal(gdalinfo_command, file)
+        gdalinfo_result = {}
+        try:
+            gdalinfo_result = json.loads(gdalinfo_process.stdout)
+        except json.JSONDecodeError as e:
+            get_log().error("load_gdalinfo_result_error", file=file, error=e)
+            continue
+
+        gdalinfo_errors = gdalinfo_process.stderr
+
+        # Check result
+        errors: List[str] = []
+        # No data
+        check_no_data(gdalinfo_result, errors)
+
+        # Band count
+        check_band_count(gdalinfo_result, errors)
+
+        # srs
+        gdalsrsinfo_tif_command = ["gdalsrsinfo", "-o", "wkt"]
+        gdalsrsinfo_tif_result = run_gdal(gdalsrsinfo_tif_command, file)
+        check_srs(srs, gdalsrsinfo_tif_result.stdout, errors)
+
+        # Color interpretation
+        check_color_interpretation(gdalinfo_result, errors)
+
+        # gdal errors
+        errors.append(f"{gdalinfo_errors!r}")
+
+        if len(errors) > 0:
+            get_log().info("non_visual_qa_errors_found", file=file, result=errors)
+        else:
+            get_log().info("non_visual_qa_no_error", file=file)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/standardising.py b/scripts/standardising.py
index 474955459..58c4edccc 100644
--- a/scripts/standardising.py
+++ b/scripts/standardising.py
@@ -1,10 +1,11 @@
 import argparse
 import os
-import subprocess
 import tempfile
 
-from aws_helper import get_bucket, get_credentials, parse_path
+from aws_helper import get_bucket, parse_path
+from file_helper import get_file_name_from_path
 from format_source import format_source
+from gdal_helper import run_gdal
 from linz_logger import get_log
 
 parser = argparse.ArgumentParser()
@@ -25,20 +26,10 @@
 for file in source:
     with tempfile.TemporaryDirectory() as tmp_dir:
         src_bucket_name, src_file_path = parse_path(file)
-        get_log().debug("processing_file", bucket=src_bucket_name, file_path=src_file_path)
-        standardized_file_name = f"standardized_{os.path.basename(src_file_path)}"
+        standardized_file_name = f"standardized_{get_file_name_from_path(src_file_path)}"
         tmp_file_path = os.path.join(tmp_dir, standardized_file_name)
-        src_gdal_path = file.replace("s3://", "/vsis3/")
 
-        # Set the credentials for GDAL to be able to read the source file
-        credentials = get_credentials(src_bucket_name)
-        gdal_env["AWS_ACCESS_KEY_ID"] = credentials.access_key
-        gdal_env["AWS_SECRET_ACCESS_KEY"] = credentials.secret_key
-        gdal_env["AWS_SESSION_TOKEN"] = credentials.token
-
-        # Run GDAL to standardized the file
-        get_log().debug("run_gdal_translate", src=src_gdal_path, output=tmp_file_path)
-        gdal_command = [
+        command = [
             "gdal_translate",
             "-q",
             "-scale",
@@ -58,15 +49,8 @@
             "3",
             "-co",
             "compress=lzw",
-            src_gdal_path,
-            tmp_file_path,
         ]
-        try:
-            proc = subprocess.run(gdal_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=gdal_env, check=True)
-        except subprocess.CalledProcessError as cpe:
-            get_log().error("run_gdal_translate_failed", command=" ".join(gdal_command))
-            raise cpe
-        get_log().debug("run_gdal_translate_succeded", command=" ".join(gdal_command))
+        run_gdal(command, file, tmp_file_path)
 
         # Upload the standardized file to destination
         dst_file_path = os.path.join(dst_path, standardized_file_name).strip("/")
diff --git a/scripts/tests/non_visual_qa_test.py b/scripts/tests/non_visual_qa_test.py
new file mode 100644
index 000000000..2711155c1
--- /dev/null
+++ b/scripts/tests/non_visual_qa_test.py
@@ -0,0 +1,144 @@
+from typing import List
+
+from non_visual_qa import check_band_count, check_color_interpretation, check_no_data, check_srs
+
+
+def test_check_band_count_valid() -> None:
+    """
+    tests check_band_count when the input layer has a valid band count
+    which is 3 bands
+    """
+    gdalinfo = {}
+    gdalinfo["bands"] = [{"band": 1}, {"band": 2}, {"band": 3}]
+    errors: List[str] = []
+
+    check_band_count(gdalinfo, errors)
+    assert len(errors) == 0
+
+
+def test_check_band_count_invalid() -> None:
+    """
+    tests check_band_count when the input layer has a invalid band count of 2
+    which is 3 bands to be valid
+    """
+    gdalinfo = {}
+    gdalinfo["bands"] = [{"band": 1}, {"band": 2}]
+    errors: List[str] = []
+
+    check_band_count(gdalinfo, errors)
+    assert len(errors) == 1
+
+
+def test_check_color_interpretation_valid() -> None:
+    """
+    tests check_color_interpretation with the correct color interpretation
+    """
+    gdalinfo = {}
+    gdalinfo["bands"] = [
+        {
+            "colorInterpretation": "Red",
+        },
+        {
+            "colorInterpretation": "Green",
+        },
+        {
+            "colorInterpretation": "Blue",
+        },
+    ]
+    errors: List[str] = []
+
+    check_color_interpretation(gdalinfo, errors)
+    assert len(errors) == 0
+
+
+def test_check_color_interpretation_invalid() -> None:
+    """
+    tests check_color_interpretation with the incorrect color interpretation
+    """
+    gdalinfo = {}
+    gdalinfo["bands"] = [
+        {
+            "colorInterpretation": "Red",
+        },
+        {
+            "colorInterpretation": "Green",
+        },
+        {
+            "colorInterpretation": "Blue",
+        },
+        {
+            "colorInterpretation": "Undefined",
+        },
+    ]
+    errors: List[str] = []
+
+    check_color_interpretation(gdalinfo, errors)
+    assert len(errors) == 1
+
+
+def test_check_no_data_valid() -> None:
+    """
+    tests check_no_data when the input layer has a valid no data value of 255
+    """
+    gdalinfo = {}
+    gdalinfo["bands"] = [
+        {
+            "noDataValue": 255,
+        }
+    ]
+    errors: List[str] = []
+
+    check_no_data(gdalinfo, errors)
+    assert len(errors) == 0
+
+
+def test_check_no_data_no_value() -> None:
+    """
+    tests check_no_data when the input layer has no no_data value assigned
+    """
+    gdalinfo = {}
+    gdalinfo["bands"] = [{"test": 1}]
+    errors: List[str] = []
+
+    check_no_data(gdalinfo, errors)
+    assert len(errors) == 1
+
+
+def test_check_no_data_invalid_value() -> None:
+    """
+    tests check_no_data when the input layer has the wrong value of 0 assigned
+    """
+    gdalinfo = {}
+    gdalinfo["bands"] = [
+        {
+            "noDataValue": 0,
+        }
+    ]
+    errors: List[str] = []
+
+    check_no_data(gdalinfo, errors)
+    assert len(errors) == 1
+
+
+def test_check_srs_valid() -> None:
+    """
+    tests check_srs with the same srs value
+    """
+    srs_to_test_against = b"SRS Test"
+    srs_tif = b"SRS Test"
+    errors: List[str] = []
+
+    check_srs(srs_to_test_against, srs_tif, errors)
+    assert len(errors) == 0
+
+
+def test_check_srs_invalid() -> None:
+    """
+    tests check_srs with a different srs value
+    """
+    srs_to_test_against = b"SRS Test"
+    srs_tif = b"SRS Different"
+    errors: List[str] = []
+
+    check_srs(srs_to_test_against, srs_tif, errors)
+    assert len(errors) == 1