From 2d26aeb7c0a6152427a964e3c391c1b809e88325 Mon Sep 17 00:00:00 2001 From: TAHRI Ahmed R Date: Sat, 12 Nov 2022 14:17:00 +0100 Subject: [PATCH] Improve multi-byte cutter/chunk (#233) * :bug: multi-byte cutter/chunk is not accurate enough on u16, u32 (le/be) * :bookmark: bump version 3.0.1-dev --- CHANGELOG.md | 5 +++++ charset_normalizer/utils.py | 2 +- charset_normalizer/version.py | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 16a52274..15542a76 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,11 @@ All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +## [3.0.1](https://github.com/Ousret/charset_normalizer/compare/3.0.0...master) (unreleased) + +### Fixed +- Multi-bytes cutter/chunk generator did not always cut correctly (PR #233) + ## [3.0.0](https://github.com/Ousret/charset_normalizer/compare/2.1.1...3.0.0) (2022-10-20) ### Added diff --git a/charset_normalizer/utils.py b/charset_normalizer/utils.py index 425d8365..e3536267 100644 --- a/charset_normalizer/utils.py +++ b/charset_normalizer/utils.py @@ -396,7 +396,7 @@ def cut_sequence_chunks( # multi-byte bad cutting detector and adjustment # not the cleanest way to perform that fix but clever enough for now. - if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80: + if is_multi_byte_decoder and i > 0: chunk_partial_size_chk: int = min(chunk_size, 16) diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py index cc238a5a..287d7493 100644 --- a/charset_normalizer/version.py +++ b/charset_normalizer/version.py @@ -2,5 +2,5 @@ Expose version """ -__version__ = "3.0.0" +__version__ = "3.0.1-dev" VERSION = __version__.split(".")