🔧 Logging behavior to be less verbose under INFO and DEBUG (#163)
* 🔧 The logging behavior has been completely reworked and now uses only the TRACE and DEBUG levels

Follow-up to #162, #145, and #134.
Ousret authored Jan 29, 2022
1 parent ce38494 commit 0cc1528
Showing 5 changed files with 84 additions and 41 deletions.
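
For context: the mechanism this commit leans on is Python's standard support for custom log levels. A minimal sketch of the pattern, assuming nothing beyond the standard library (the value 5 and the logger name are taken from the diff below; the message itself is illustrative):

```python
import logging

TRACE = 5  # numerically below DEBUG (10), so TRACE records are the most verbose
logging.addLevelName(TRACE, "TRACE")

logger = logging.getLogger("charset_normalizer")

# logging.Logger has no .trace() helper; records at a custom level are
# emitted through the generic .log() call, exactly as api.py does below.
logger.log(TRACE, "Verbose detail: %s", "only visible when TRACE is enabled")
```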
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,13 @@
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [2.0.11.dev0](https://github.com/Ousret/charset_normalizer/compare/2.0.10...master) (unreleased)

- [Short description of non-trivial change.]

### Changed
- The logging behavior has been completely reworked and now uses only the TRACE and DEBUG levels (PR #163)

## [2.0.10](https://github.com/Ousret/charset_normalizer/compare/2.0.9...2.0.10) (2022-01-04)

### Fixed
102 changes: 67 additions & 35 deletions charset_normalizer/api.py
@@ -13,7 +13,7 @@
mb_encoding_languages,
merge_coherence_ratios,
)
from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE
from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
from .md import mess_ratio
from .models import CharsetMatch, CharsetMatches
from .utils import (
@@ -25,6 +25,7 @@
should_strip_sig_or_bom,
)

logging.addLevelName(TRACE, "TRACE")
logger = logging.getLogger("charset_normalizer")
explain_handler = logging.StreamHandler()
explain_handler.setFormatter(
@@ -70,19 +71,20 @@ def from_bytes(
if explain:
previous_logger_level = logger.level # type: int
logger.addHandler(explain_handler)
logger.setLevel(logging.DEBUG)
logger.setLevel(TRACE)

length = len(sequences) # type: int

if length == 0:
logger.warning("Encoding detection on empty bytes, assuming utf_8 intention.")
logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
if explain:
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level or logging.WARNING)
return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])

if cp_isolation is not None:
logger.debug(
logger.log(
TRACE,
"cp_isolation is set. use this flag for debugging purpose. "
"limited list of encoding allowed : %s.",
", ".join(cp_isolation),
@@ -92,7 +94,8 @@ def from_bytes(
cp_isolation = []

if cp_exclusion is not None:
logger.debug(
logger.log(
TRACE,
"cp_exclusion is set. use this flag for debugging purpose. "
"limited list of encoding excluded : %s.",
", ".join(cp_exclusion),
@@ -102,7 +105,8 @@ def from_bytes(
cp_exclusion = []

if length <= (chunk_size * steps):
logger.debug(
logger.log(
TRACE,
"override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
steps,
chunk_size,
@@ -118,16 +122,18 @@
is_too_large_sequence = len(sequences) >= TOO_BIG_SEQUENCE # type: bool

if is_too_small_sequence:
logger.warning(
logger.log(
TRACE,
"Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
length
)
),
)
elif is_too_large_sequence:
logger.info(
logger.log(
TRACE,
"Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
length
)
),
)

prioritized_encodings = [] # type: List[str]
@@ -138,7 +144,8 @@ def from_bytes(

if specified_encoding is not None:
prioritized_encodings.append(specified_encoding)
logger.info(
logger.log(
TRACE,
"Detected declarative mark in sequence. Priority +1 given for %s.",
specified_encoding,
)
Expand All @@ -157,7 +164,8 @@ def from_bytes(

if sig_encoding is not None:
prioritized_encodings.append(sig_encoding)
logger.info(
logger.log(
TRACE,
"Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
len(sig_payload),
sig_encoding,
@@ -188,7 +196,8 @@
) # type: bool

if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
logger.debug(
logger.log(
TRACE,
"Encoding %s wont be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
encoding_iana,
)
@@ -197,8 +206,10 @@
try:
is_multi_byte_decoder = is_multi_byte_encoding(encoding_iana) # type: bool
except (ModuleNotFoundError, ImportError):
logger.debug(
"Encoding %s does not provide an IncrementalDecoder", encoding_iana
logger.log(
TRACE,
"Encoding %s does not provide an IncrementalDecoder",
encoding_iana,
)
continue

@@ -219,7 +230,8 @@
)
except (UnicodeDecodeError, LookupError) as e:
if not isinstance(e, LookupError):
logger.debug(
logger.log(
TRACE,
"Code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
@@ -235,7 +247,8 @@
break

if similar_soft_failure_test:
logger.debug(
logger.log(
TRACE,
"%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
encoding_iana,
encoding_soft_failed,
@@ -255,7 +268,8 @@
) # type: bool

if multi_byte_bonus:
logger.debug(
logger.log(
TRACE,
"Code page %s is a multi byte encoding table and it appear that at least one character "
"was encoded using n-bytes.",
encoding_iana,
@@ -285,7 +299,8 @@
errors="ignore" if is_multi_byte_decoder else "strict",
) # type: str
except UnicodeDecodeError as e: # Lazy str loading may have missed something there
logger.debug(
logger.log(
TRACE,
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
@@ -337,7 +352,8 @@
try:
sequences[int(50e3) :].decode(encoding_iana, errors="strict")
except UnicodeDecodeError as e:
logger.debug(
logger.log(
TRACE,
"LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
@@ -350,7 +366,8 @@
) # type: float
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
tested_but_soft_failure.append(encoding_iana)
logger.info(
logger.log(
TRACE,
"%s was excluded because of initial chaos probing. Gave up %i time(s). "
"Computed mean chaos is %f %%.",
encoding_iana,
@@ -373,7 +390,8 @@
fallback_u8 = fallback_entry
continue

logger.info(
logger.log(
TRACE,
"%s passed initial chaos probing. Mean measured chaos is %f %%",
encoding_iana,
round(mean_mess_ratio * 100, ndigits=3),
@@ -385,10 +403,11 @@
target_languages = mb_encoding_languages(encoding_iana)

if target_languages:
logger.debug(
logger.log(
TRACE,
"{} should target any language(s) of {}".format(
encoding_iana, str(target_languages)
)
),
)

cd_ratios = []
@@ -406,10 +425,11 @@
cd_ratios_merged = merge_coherence_ratios(cd_ratios)

if cd_ratios_merged:
logger.info(
logger.log(
TRACE,
"We detected language {} using {}".format(
cd_ratios_merged, encoding_iana
)
),
)

results.append(
@@ -427,17 +447,18 @@
encoding_iana in [specified_encoding, "ascii", "utf_8"]
and mean_mess_ratio < 0.1
):
logger.info(
"%s is most likely the one. Stopping the process.", encoding_iana
logger.debug(
"Encoding detection: %s is most likely the one.", encoding_iana
)
if explain:
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return CharsetMatches([results[encoding_iana]])

if encoding_iana == sig_encoding:
logger.info(
"%s is most likely the one as we detected a BOM or SIG within the beginning of the sequence.",
logger.debug(
"Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
"the beginning of the sequence.",
encoding_iana,
)
if explain:
@@ -447,13 +468,15 @@

if len(results) == 0:
if fallback_u8 or fallback_ascii or fallback_specified:
logger.debug(
"Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback."
logger.log(
TRACE,
"Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
)

if fallback_specified:
logger.debug(
"%s will be used as a fallback match", fallback_specified.encoding
"Encoding detection: %s will be used as a fallback match",
fallback_specified.encoding,
)
results.append(fallback_specified)
elif (
@@ -465,12 +488,21 @@
)
or (fallback_u8 is not None)
):
logger.warning("utf_8 will be used as a fallback match")
logger.debug("Encoding detection: utf_8 will be used as a fallback match")
results.append(fallback_u8)
elif fallback_ascii:
logger.warning("ascii will be used as a fallback match")
logger.debug("Encoding detection: ascii will be used as a fallback match")
results.append(fallback_ascii)

if results:
logger.debug(
"Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
results.best().encoding, # type: ignore
len(results) - 1,
)
else:
logger.debug("Encoding detection: Unable to determine any suitable charset.")

if explain:
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
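In practice, the user-facing entry point to this verbosity is the explain flag handled above: it attaches explain_handler, lowers the "charset_normalizer" logger to TRACE for the duration of the call, and restores the previous level before returning. A hedged usage sketch (the byte payload and the printed attribute are illustrative, not prescribed):

```python
from charset_normalizer import from_bytes

# explain=True prints TRACE/DEBUG records for this single call only;
# the previous logger level is restored before the function returns.
results = from_bytes(b"Bonjour tout le monde !", explain=True)
print(results.best())
```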
3 changes: 3 additions & 0 deletions charset_normalizer/constant.py
@@ -498,3 +498,6 @@
NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+")

LANGUAGE_SUPPORTED_COUNT = len(FREQUENCIES) # type: int

# Logging LEVEL below DEBUG
TRACE = 5 # type: int
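
Applications that want this verbosity permanently, rather than per-call via explain=True, can wire the logger against this constant themselves. A sketch under that assumption — the handler and format choices below are mine, not the library's:

```python
import logging

from charset_normalizer.constant import TRACE

# Both the logger and the handler must sit at or below TRACE (5)
# for these records to reach the output stream.
handler = logging.StreamHandler()
handler.setLevel(TRACE)
handler.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s"))

logger = logging.getLogger("charset_normalizer")
logger.setLevel(TRACE)
logger.addHandler(handler)
```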
2 changes: 1 addition & 1 deletion charset_normalizer/version.py
@@ -2,5 +2,5 @@
Expose version
"""

__version__ = "2.0.10"
__version__ = "2.0.11.dev0"
VERSION = __version__.split(".")
11 changes: 6 additions & 5 deletions tests/test_logging.py
@@ -3,6 +3,7 @@

from charset_normalizer.utils import set_logging_handler
from charset_normalizer.api import from_bytes, explain_handler
from charset_normalizer.constant import TRACE


class TestLogBehaviorClass:
@@ -17,24 +18,24 @@ def test_explain_true_behavior(self, caplog):
from_bytes(test_sequence, steps=1, chunk_size=50, explain=True)
assert explain_handler not in self.logger.handlers
for record in caplog.records:
assert record.levelname in ["INFO", "DEBUG"]
assert record.levelname in ["TRACE", "DEBUG"]

def test_explain_false_handler_set_behavior(self, caplog):
test_sequence = b'This is a test sequence of bytes that should be sufficient'
set_logging_handler(level=logging.INFO, format_string="%(message)s")
set_logging_handler(level=TRACE, format_string="%(message)s")
from_bytes(test_sequence, steps=1, chunk_size=50, explain=False)
assert any(isinstance(hdl, logging.StreamHandler) for hdl in self.logger.handlers)
for record in caplog.records:
assert record.levelname in ["INFO", "DEBUG"]
assert "ascii is most likely the one. Stopping the process." in caplog.text
assert record.levelname in ["TRACE", "DEBUG"]
assert "Encoding detection: ascii is most likely the one." in caplog.text

def test_set_stream_handler(self, caplog):
set_logging_handler(
"charset_normalizer", level=logging.DEBUG
)
self.logger.debug("log content should log with default format")
for record in caplog.records:
assert record.levelname in ["INFO", "DEBUG"]
assert record.levelname in ["TRACE", "DEBUG"]
assert "log content should log with default format" in caplog.text

def test_set_stream_handler_format(self, caplog):
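The set_logging_handler helper exercised above is the library's own shortcut for that manual wiring. A sketch of calling it from application code (the argument names mirror the calls in tests/test_logging.py; the format string is an assumption):

```python
from charset_normalizer.constant import TRACE
from charset_normalizer.utils import set_logging_handler

# Attach a StreamHandler to the "charset_normalizer" logger at TRACE,
# as tests/test_logging.py does above.
set_logging_handler(
    "charset_normalizer", level=TRACE, format_string="%(levelname)s | %(message)s"
)
```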
