feat (encoding): Added tool to strip non-ascii characters and improve…

…d encoding error handling - Non-ASCII characters are now replaced with a space - A CLI tool was added to remove non-ascii characters from an input file.
idaholab · Feb 13, 2024 · 7f7b606 · 7f7b606
2 parents 5cf1e1e + b75fe65
commit 7f7b606
Show file tree

Hide file tree

Showing 20 changed files with 556 additions and 9 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -21,6 +21,8 @@ jobs:
       - run: pip install .
       - run: pip uninstall -y montepy
       - run: pip install --user dist/*.whl
+        # run scripts
+      - run: change_to_ascii -h
       - run: pip uninstall -y montepy
       - run: pip install --user dist/*.tar.gz
       - run: pip install --user montepy[test]

diff --git a/doc/source/api/montepy.input_parser.input_file.rst b/doc/source/api/montepy.input_parser.input_file.rst
@@ -0,0 +1,9 @@
+montepy.input\_parser.input\_file module
+========================================
+
+
+.. automodule:: montepy.input_parser.input_file
+   :members:
+   :undoc-members:
+   :show-inheritance:
+   :private-members: _convert_to_int, _convert_to_enum
diff --git a/doc/source/api/montepy.input_parser.rst b/doc/source/api/montepy.input_parser.rst
@@ -16,6 +16,7 @@ Submodules
    montepy.input_parser.block_type
    montepy.input_parser.cell_parser
    montepy.input_parser.data_parser
+   montepy.input_parser.input_file
    montepy.input_parser.input_reader
    montepy.input_parser.input_syntax_reader
    montepy.input_parser.mcnp_input

diff --git a/doc/source/faq.rst b/doc/source/faq.rst
@@ -0,0 +1,70 @@
+Frequently Asked Questions
+==========================
+
+Or more likely Frequent Error Debugging.
+
+Encoding Errors: UnicodeDecodeError
+-----------------------------------
+
+If you received the error below while opening a file in MontePy,
+there is likely a non-ASCII character in your input file.
+You can read more about :ref:`Character Encoding here <encoding_background>`.
+
+To solve this problem you can:
+
+1. Try another encoding such as ``'utf8'`` or ``'cp1252'``. Pass it as an argument to :func:`~montepy.input_parser.input_reader.read_input`.
+2. Remove all non-ASCII characters with :ref:`the change_to_ascii utility <convert_ascii>`
+
+.. code-block:: python
+
+      ---------------------------------------------------------------------------
+        UnicodeDecodeError                        Traceback (most recent call last)
+        Cell In[2], line 1
+        ----> 1 problem = montepy.read_input("tests/inputs/bad_encoding.imcnp")
+
+        File ~/dev/montepy/montepy/input_parser/input_reader.py:35, in read_input(input_file, mcnp_version, encoding)
+             33 problem = mcnp_problem.MCNP_Problem(input_file)
+             34 problem.mcnp_version = mcnp_version
+        ---> 35 problem.parse_input(encoding=encoding)
+             36 return problem
+
+        File ~/dev/montepy/montepy/mcnp_problem.py:262, in MCNP_Problem.parse_input(self, check_input, encoding)
+            253 OBJ_MATCHER = {
+            254     block_type.BlockType.CELL: (Cell, self._cells),
+            255     block_type.BlockType.SURFACE: (
+           (...)
+            259     block_type.BlockType.DATA: (parse_data, self._data_inputs),
+            260 }
+            261 try:
+        --> 262     for i, input in enumerate(
+            263         input_syntax_reader.read_input_syntax(
+            264             self._input_file, self.mcnp_version, encoding=encoding
+            265         )
+            266     ):
+            267         self._original_inputs.append(input)
+            268         if i == 0 and isinstance(input, mcnp_input.Message):
+
+        File ~/dev/montepy/montepy/input_parser/input_syntax_reader.py:48, in read_input_syntax(input_file, mcnp_version, encoding)
+             46 reading_queue = deque()
+             47 with input_file.open("r", encoding=encoding) as fh:
+        ---> 48     yield from read_front_matters(fh, mcnp_version)
+             49     yield from read_data(fh, mcnp_version)
+
+        File ~/dev/montepy/montepy/input_parser/input_syntax_reader.py:79, in read_front_matters(fh, mcnp_version)
+             77 lines = []
+             78 raw_lines = []
+        ---> 79 for i, line in enumerate(fh):
+             80     if i == 0 and line.upper().startswith("MESSAGE:"):
+             81         is_in_message_block = True
+
+        File ~/dev/montepy/montepy/input_parser/input_file.py:95, in MCNP_InputFile.__iter__(self)
+             94 def __iter__(self):
+        ---> 95     for lineno, line in enumerate(self._fh):
+             96         self._lineno = lineno + 1
+             97         yield line
+
+        File ~/mambaforge/lib/python3.10/encodings/ascii.py:26, in IncrementalDecoder.decode(self, input, final)
+             25 def decode(self, input, final=False):
+        ---> 26     return codecs.ascii_decode(input, self.errors)[0]
+
+        UnicodeDecodeError: 'ascii' codec can't decode byte 0xff in position 159: ordinal not in range(128)
diff --git a/doc/source/index.rst b/doc/source/index.rst
@@ -12,8 +12,12 @@ Welcome to MontePy's documentation!
 
    starting
 
+   utilities
+
    tricks
 
+   faq
+
    developing
 
    api/modules

diff --git a/doc/source/utilities.rst b/doc/source/utilities.rst
@@ -0,0 +1,115 @@
+Utility Scripts
+===============
+
+Package Level Execution Options
+-------------------------------
+.. code-block:: console
+
+        usage: montepy [-h] [-c [input_file ...]]
+
+        Tool for editing and working with MCNP input files.
+
+        options:
+          -h, --help            show this help message and exit
+          -c [input_file ...], --check [input_file ...]
+                                Check the given input file(s) for errors. Accepts globs, and multiple arguments.
+
+Checking Input Files for Errors
+-------------------------------
+MontePy can be used to check input files for errors.
+MontePy will check for:
+
+* general syntax errors
+* syntax errors for all MCNP Objects supported (e.g., cells, surfaces, materials, etc.)
+* Bad references to other objects, when the type of the referring object is supported.
+* Bad mode options
+
+It will print all errors it found in the input to the terminal.
+
+To use this run:
+
+.. code-block:: console
+
+   python -m montepy -c [files]
+
+.. _convert_ascii:
+
+Converting Encoding to ASCII
+----------------------------
+
+.. _ascii_command:
+
+Command Line Options
+++++++++++++++++++++
+.. code-block:: console
+
+        usage: Change_to_ascii [-h] [-d | -w] in_file out_file
+
+        Change the encoding of a file to strict ASCII. Everything not compliant will be removed.
+
+        positional arguments:
+          in_file           The input file to convert
+          out_file          The output file to write to
+
+        options:
+          -h, --help        show this help message and exit
+          -d, --delete      Delete any non-ascii characters. This is the default.
+          -w, --whitespace  Replace non-ascii characters with a space.
+
+
+.. _encoding_background:
+
+Background
+++++++++++
+`Character encoding <https://en.wikipedia.org/wiki/Character_encoding>`_ is the process of representing all characters as numbers,
+so they may be used by a computer.
+It is the bane of almost all programmers.
+
+The `American Standard Code for Information Interchange (ASCII) <https://en.wikipedia.org/wiki/ASCII>`_ is one of the oldest,
+and simplest encoding standards.
+It uses one byte per character, 
+and only goes from 0 – 127.
+This has some issues, being very American-centric,
+and also only allowing 128 characters, 
+52 of them being the English alphabet.
+One solution to this was `"Extended ASCII" <https://en.wikipedia.org/wiki/Extended_ASCII>`_,
+which used the final bit, and allowed the encoding system
+to include 0 – 255.
+There isn't one "Extended ASCII",
+but one of the most popular encodings is Windows CP-1252.
+This isn't great.
+
+The most commonly used encoding now is `UTF-8 <https://en.wikipedia.org/wiki/UTF-8>`_, often loosely called "unicode".
+UTF-8 can support almost any printable character in any language, including emojis.
+The complexity is that each character is encoded with a variable number of bytes.
+This means that older software, like Fortran, may get confused by it.
+
+As far as I can tell MCNP does not document what encoding it uses.
+ASCII is the most conservative bet, 
+so MontePy by default tries to read input files in strict ASCII.
+
+Dealing with Encoding Issues
+++++++++++++++++++++++++++++
+
+You are likely here because you got an error message something like this:
+
+>>> montepy.read_input("example.imcnp")
+UnicodeDecodeError                        Traceback (most recent call last)
+<snip>
+UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 1132: ordinal not in range(128)
+
+You can either change the encoding used by :func:`~montepy.input_parser.input_reader.read_input`,
+or just force the entire file to be strictly ASCII.
+
+MontePy offers the ``change_to_ascii`` script.
+The options are listed above: :ref:`ascii_command`.
+For any non-ASCII character it will either remove
+the character or replace it with a space (``' '``).
+It defaults to deleting.
+To replace it with a space instead use ``-w``. 
+Otherwise the arguments are the input file to correct,
+and the path to write the output file to.
+
+.. code-block:: console
+
+   change_to_ascii -w unicode_input.imcnp ascii_input.imcnp
diff --git a/montepy/__init__.py b/montepy/__init__.py
@@ -23,7 +23,7 @@
 from montepy.universe import Universe
 import sys
 
-__version__ = "0.2.5"
+__version__ = "0.3.0dev1"
 
 # enable deprecated warnings for users
 if not sys.warnoptions:

diff --git a/montepy/_scripts/__init__.py b/montepy/_scripts/__init__.py
diff --git a/montepy/_scripts/change_to_ascii.py b/montepy/_scripts/change_to_ascii.py
@@ -0,0 +1,87 @@
+import argparse
+import sys
+
+
+def define_args(args):
+    """
+    Parses the arguments from the command line.
+
+    Two mutually exclusive flags select how non-ASCII characters are handled
+    (``-d``/``--delete`` or ``-w``/``--whitespace``), followed by two
+    positional paths. Both positionals use ``nargs=1``, so ``in_file`` and
+    ``out_file`` are each stored as a one-element list.
+
+    :param args: the arguments from the command line.
+    :type args: list
+    :returns: the parsed arguments (with argparse)
+    :rtype: argparse.Namespace
+    """
+    parser = argparse.ArgumentParser(
+        prog="Change_to_ascii",
+        description="Change the encoding of a file to strict ASCII. Everything not compliant will be removed.",
+    )
+    # -d and -w contradict each other, so argparse rejects passing both.
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "-d",
+        "--delete",
+        dest="delete",
+        action="store_true",
+        help="Delete any non-ascii characters. This is the default.",
+    )
+    group.add_argument(
+        "-w",
+        "--whitespace",
+        dest="whitespace",
+        action="store_true",
+        help="Replace non-ascii characters with a space.",
+    )
+    parser.add_argument("in_file", nargs=1, help="The input file to convert")
+    parser.add_argument("out_file", nargs=1, help="The output file to write to")
+    return parser.parse_args(args)
+
+
+def strip_characters(args):
+    """
+    Strips non-ascii characters from the input file, and writes out the output file.
+
+    The input is read as bytes, one line at a time, and decoded as UTF-8.
+    Bytes that are not valid UTF-8 decode to the Unicode replacement character
+    (U+FFFD), which is itself non-ASCII, so invalid bytes and valid non-ASCII
+    characters are handled by the same filter. Each such character is deleted,
+    or replaced with a single space when ``args.whitespace`` is set.
+
+    :param args: the parsed command line arguments.
+    :type args: argparse.Namespace
+    """
+    # --delete is the default, so only --whitespace changes the replacement.
+    replacer = " " if args.whitespace else ""
+    with open(args.in_file[0], "rb") as in_fh, open(args.out_file[0], "wb") as out_fh:
+        for line in in_fh:
+            text = line.decode(encoding="utf8", errors="replace")
+            try:
+                # fast path: most lines are already pure ASCII
+                ascii_line = text.encode(encoding="ascii", errors="strict")
+            except UnicodeEncodeError:
+                # ASCII code points are 0 - 127 inclusive; anything above is swapped out.
+                cleaned = "".join(
+                    char if ord(char) <= 127 else replacer for char in text
+                )
+                ascii_line = cleaned.encode(encoding="ascii", errors="strict")
+            out_fh.write(ascii_line)
+
+
+def main(args=None):
+    """
+    Entry point: parse the command line, then convert the file.
+
+    :param args: The arguments passed from the command line. When ``None``,
+        ``sys.argv[1:]`` is used instead.
+    :type args: list
+    """
+    cli_args = sys.argv[1:] if args is None else args
+    strip_characters(define_args(cli_args))
+
+
+# Allow running this module directly as a script, forwarding the CLI arguments.
+if __name__ == "__main__":
+    main(sys.argv[1:])
diff --git a/montepy/constants.py b/montepy/constants.py
@@ -40,6 +40,13 @@
 How many spaces a tab is expand to.
 """
 
+ASCII_CEILING = 127
+"""
+The maximum code point allowed by ASCII.
+
+Source: `Wikipedia <https://en.wikipedia.org/wiki/ASCII>`_
+"""
+
 
 def get_max_line_length(mcnp_version=DEFAULT_VERSION):
     """