-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat (encoding): Added tool to strip non-ascii characters and improve…
…d encoding error handling - Non-ASCII characters are now replaced with a space - A CLI tool was added to remove non-ascii characters from an input file.
- Loading branch information
Showing
20 changed files
with
556 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
montepy.input\_parser.input\_file module | ||
======================================== | ||
|
||
|
||
.. automodule:: montepy.input_parser.input_file | ||
:members: | ||
:undoc-members: | ||
:show-inheritance: | ||
:private-members: _convert_to_int, _convert_to_enum |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
Frequently Asked Questions | ||
========================== | ||
|
||
Or more likely Frequent Error Debugging. | ||
|
||
Encoding Errors: UnicodeDecodeError | ||
----------------------------------- | ||
|
||
If you received the error below while opening a file in MontePy, | ||
there is likely a non-ASCII character in your input file. | ||
You can read more about :ref:`Character Encoding here <encoding_background>`. | ||
|
||
To solve this problem you can: | ||
|
||
1. Try another encoding such as ``'utf8'`` or ``'cp1252'``. Pass it as an argument to :func:`~montepy.input_parser.input_reader.read_input`. | ||
2. Remove all non-ASCII characters with :ref:`the change_to_ascii utility <convert_ascii>` | ||
|
||
.. code-block:: python | ||
--------------------------------------------------------------------------- | ||
UnicodeDecodeError Traceback (most recent call last) | ||
Cell In[2], line 1 | ||
----> 1 problem = montepy.read_input("tests/inputs/bad_encoding.imcnp") | ||
File ~/dev/montepy/montepy/input_parser/input_reader.py:35, in read_input(input_file, mcnp_version, encoding) | ||
33 problem = mcnp_problem.MCNP_Problem(input_file) | ||
34 problem.mcnp_version = mcnp_version | ||
---> 35 problem.parse_input(encoding=encoding) | ||
36 return problem | ||
File ~/dev/montepy/montepy/mcnp_problem.py:262, in MCNP_Problem.parse_input(self, check_input, encoding) | ||
253 OBJ_MATCHER = { | ||
254 block_type.BlockType.CELL: (Cell, self._cells), | ||
255 block_type.BlockType.SURFACE: ( | ||
(...) | ||
259 block_type.BlockType.DATA: (parse_data, self._data_inputs), | ||
260 } | ||
261 try: | ||
--> 262 for i, input in enumerate( | ||
263 input_syntax_reader.read_input_syntax( | ||
264 self._input_file, self.mcnp_version, encoding=encoding | ||
265 ) | ||
266 ): | ||
267 self._original_inputs.append(input) | ||
268 if i == 0 and isinstance(input, mcnp_input.Message): | ||
File ~/dev/montepy/montepy/input_parser/input_syntax_reader.py:48, in read_input_syntax(input_file, mcnp_version, encoding) | ||
46 reading_queue = deque() | ||
47 with input_file.open("r", encoding=encoding) as fh: | ||
---> 48 yield from read_front_matters(fh, mcnp_version) | ||
49 yield from read_data(fh, mcnp_version) | ||
File ~/dev/montepy/montepy/input_parser/input_syntax_reader.py:79, in read_front_matters(fh, mcnp_version) | ||
77 lines = [] | ||
78 raw_lines = [] | ||
---> 79 for i, line in enumerate(fh): | ||
80 if i == 0 and line.upper().startswith("MESSAGE:"): | ||
81 is_in_message_block = True | ||
File ~/dev/montepy/montepy/input_parser/input_file.py:95, in MCNP_InputFile.__iter__(self) | ||
94 def __iter__(self): | ||
---> 95 for lineno, line in enumerate(self._fh): | ||
96 self._lineno = lineno + 1 | ||
97 yield line | ||
File ~/mambaforge/lib/python3.10/encodings/ascii.py:26, in IncrementalDecoder.decode(self, input, final) | ||
25 def decode(self, input, final=False): | ||
---> 26 return codecs.ascii_decode(input, self.errors)[0] | ||
UnicodeDecodeError: 'ascii' codec can't decode byte 0xff in position 159: ordinal not in range(128) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,8 +12,12 @@ Welcome to MontePy's documentation! | |
|
||
starting | ||
|
||
utilities | ||
|
||
tricks | ||
|
||
faq | ||
|
||
developing | ||
|
||
api/modules | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
Utility Scripts | ||
=============== | ||
|
||
Package Level Execution Options | ||
------------------------------- | ||
.. code-block:: console | ||
usage: montepy [-h] [-c [input_file ...]] | ||
Tool for editing and working with MCNP input files. | ||
options: | ||
-h, --help show this help message and exit | ||
-c [input_file ...], --check [input_file ...] | ||
Check the given input file(s) for errors. Accepts globs, and multiple arguments. | ||
Checking Input Files for Errors | ||
------------------------------- | ||
MontePy can be used to check input files for the kinds of errors it knows how to detect. | ||
MontePy will check for: | ||
|
||
* general syntax errors | ||
* syntax errors for all MCNP Objects supported (e.g., cells, surfaces, materials, etc.) | ||
* Bad references to other objects, when the object making the reference is supported. | ||
* Bad mode options | ||
|
||
It will print all errors it found in the input to the terminal. | ||
|
||
To use this run: | ||
|
||
.. code-block:: console | ||
python -m montepy -c [files] | ||
.. _convert_ascii: | ||
|
||
Converting Encoding to ASCII | ||
---------------------------- | ||
|
||
.. _ascii_command: | ||
|
||
Command Line Options | ||
++++++++++++++++++++ | ||
.. code-block:: console | ||
usage: Change_to_ascii [-h] [-d | -w] in_file out_file | ||
Change the encoding of a file to strict ASCII. Everything not compliant will be removed. | ||
positional arguments: | ||
in_file The input file to convert | ||
out_file The input file to convert | ||
options: | ||
-h, --help show this help message and exit | ||
-d, --delete Delete any non-ascii characters. This is the default. | ||
-w, --whitespace Replace non-ascii characters with a space. | ||
.. _encoding_background: | ||
|
||
Background | ||
++++++++++ | ||
`Character encoding <https://en.wikipedia.org/wiki/Character_encoding>`_ is the process of representing all characters as numbers, | ||
so they may be used by a computer. | ||
It is the bane of almost all programmers. | ||
|
||
The `American Standard Code for Information Interchange (ASCII) <https://en.wikipedia.org/wiki/ASCII>`_ is one of the oldest, | ||
and simplest encoding standards. | ||
It uses one byte per character, | ||
and only goes from 0 – 127. | ||
This has some issues, being very American-centric, | ||
and also only allowing 128 characters, | ||
52 of them being the English alphabet. | ||
One solution to this was `"Extended ASCII" <https://en.wikipedia.org/wiki/Extended_ASCII>`_, | ||
which used the final bit, and allowed the encoding system | ||
to include 0 – 255. | ||
There isn't one "Extended ASCII", | ||
but one of the most popular encodings is Windows CP-1252. | ||
This isn't great. | ||
|
||
The most commonly used encoding now is `UTF-8 <https://en.wikipedia.org/wiki/UTF-8>`_, or "unicode". | ||
UTF-8 can support almost any printable character in any language, including emojis. | ||
The complexity is that each character is encoded as a variable number of bytes. | ||
This means that older software, like Fortran, may get confused by it. | ||
|
||
As far as I can tell MCNP does not document what encoding it uses. | ||
ASCII is the most conservative bet, | ||
so MontePy by default tries to read input files in strict ASCII. | ||
|
||
Dealing with Encoding Issues | ||
++++++++++++++++++++++++++++ | ||
|
||
You are likely here because you got an error message something like this: | ||
|
||
>>> montepy.read_input("example.imcnp") | ||
UnicodeDecodeError Traceback (most recent call last) | ||
<snip> | ||
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 1132: ordinal not in range(128) | ||
|
||
You can either change the encoding used by :func:`~montepy.input_parser.input_reader.read_input`, | ||
or just force the entire file to be strictly ASCII. | ||
|
||
MontePy offers the ``change_to_ascii`` script. | ||
The options are listed above: :ref:`ascii_command`. | ||
For any non-ASCII character it will either remove | ||
the character or replace it with a space (``' '``). | ||
It defaults to deleting. | ||
To replace it with a space instead use ``-w``. | ||
Otherwise the arguments are the input file to correct, | ||
and the path to write the output file to. | ||
|
||
.. code-block:: console | ||
change_to_ascii -w unicode_input.imcnp ascii_input.imcnp |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
import argparse | ||
import sys | ||
|
||
|
||
def define_args(args):
    """
    Parses the arguments from the command line.

    The ``-d``/``--delete`` and ``-w``/``--whitespace`` flags are mutually
    exclusive. Both positional arguments are declared with ``nargs=1``, so
    ``in_file`` and ``out_file`` are each stored as a one-element list.

    :param args: the arguments from the command line.
    :type args: list
    :returns: the parsed arguments (with argparse)
    :rtype: argparse.Namespace
    """
    parser = argparse.ArgumentParser(
        prog="Change_to_ascii",
        description="Change the encoding of a file to strict ASCII. Everything not compliant will be removed.",
    )
    # Deleting and replacing-with-a-space cannot both be requested.
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        "-d",
        "--delete",
        dest="delete",
        action="store_true",
        help="Delete any non-ascii characters. This is the default.",
    )
    group.add_argument(
        "-w",
        "--whitespace",
        dest="whitespace",
        action="store_true",
        help="Replace non-ascii characters with a space.",
    )
    parser.add_argument("in_file", nargs=1, help="The input file to convert")
    # The help text used to be a copy of the in_file help; describe the output.
    parser.add_argument("out_file", nargs=1, help="The output file to write")
    return parser.parse_args(args)
|
||
|
||
def strip_characters(args):
    """
    Strips non-ascii characters from the input file, and writes out the output file.

    Each line is decoded as UTF-8, with undecodable bytes turned into the
    Unicode replacement character. Every character outside the ASCII range
    (code points 0 - 127) is then either deleted, or replaced with a space
    when ``args.whitespace`` is set. Deleting is the default.

    :param args: the parsed command line arguments. ``in_file`` and
        ``out_file`` are read as one-element lists of paths.
    :type args: argparse.Namespace
    """
    # --delete and "no flag given" both mean delete, so only --whitespace matters.
    replacer = " " if args.whitespace else ""
    with open(args.in_file[0], "rb") as in_fh, open(args.out_file[0], "wb") as out_fh:
        for line in in_fh:
            text = line.decode(encoding="utf8", errors="replace")
            # ASCII ends at 127; the previous ``> 128`` test let U+0080 through,
            # which then made the strict ASCII encode below raise.
            cleaned = "".join(
                char if ord(char) < 128 else replacer for char in text
            )
            out_fh.write(cleaned.encode(encoding="ascii", errors="strict"))
|
||
|
||
def main(args=None):
    """
    Main runner function.

    Parses the command line arguments and then strips the non-ascii
    characters from the requested file.

    :param args: The arguments passed from the command line. When ``None``,
        ``sys.argv[1:]`` is used instead.
    :type args: list
    """
    cli_args = sys.argv[1:] if args is None else args
    strip_characters(define_args(cli_args))
|
||
|
||
# Run the converter when this module is executed as a script.
# ``main`` falls back to ``sys.argv[1:]`` when it is called without arguments.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.