From bd845303ade6e79e06bcafb1f727866e0c555fb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 13 Sep 2021 21:29:38 +0200 Subject: [PATCH] implement a way to shorten filenames with east-asian characters (#1377) Setting 'output.shorten' to "eaw" (East-Asian Width) uses a slower algorithm that also considers characters with a width > 1. --- docs/configuration.rst | 3 + gallery_dl/output.py | 73 +++++++++++++++---- scripts/run_tests.sh | 2 +- test/test_output.py | 156 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 219 insertions(+), 15 deletions(-) create mode 100644 test/test_output.py diff --git a/docs/configuration.rst b/docs/configuration.rst index 986ecc3966..03fc1938ca 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -2531,6 +2531,9 @@ Description Controls whether the output strings should be shortened to fit on one console line. + Set this option to ``"eaw"`` to also work with east-asian characters + with a display width greater than 1. + output.skip ----------- diff --git a/gallery_dl/output.py b/gallery_dl/output.py index 7e1f8c1ec0..4e0439e347 100644 --- a/gallery_dl/output.py +++ b/gallery_dl/output.py @@ -10,6 +10,7 @@ import sys import shutil import logging +import unicodedata from . import config, util @@ -270,9 +271,14 @@ def success(self, path, tries): class TerminalOutput(NullOutput): def __init__(self): - self.short = config.get(("output",), "shorten", True) - if self.short: - self.width = shutil.get_terminal_size().columns - OFFSET + shorten = config.get(("output",), "shorten", True) + if shorten: + func = shorten_string_eaw if shorten == "eaw" else shorten_string + limit = shutil.get_terminal_size().columns - OFFSET + sep = CHAR_ELLIPSIES + self.shorten = lambda txt: func(txt, limit, sep) + else: + self.shorten = util.identity def start(self, path): print(self.shorten(" " + path), end="", flush=True) @@ -283,17 +289,6 @@ def skip(self, path): def success(self, path, tries): print("\r", self.shorten(CHAR_SUCCESS + path), sep="") - def shorten(self, txt): - """Reduce the length of 'txt' to the width of the terminal""" - if self.short and len(txt) > self.width: - hwidth = self.width // 2 - OFFSET - return "".join(( - txt[:hwidth-1], - CHAR_ELLIPSIES, - txt[-hwidth-(self.width % 2):] - )) - return txt - class ColorOutput(TerminalOutput): @@ -307,6 +302,56 @@ def success(self, path, tries): print("\r\033[1;32m", self.shorten(path), "\033[0m", sep="") +class EAWCache(dict): + + def __missing__(self, key): + width = self[key] = \ + 2 if unicodedata.east_asian_width(key) in "WF" else 1 + return width + + +def shorten_string(txt, limit, sep="…"): + """Limit width of 'txt'; assume all characters have a width of 1""" + if len(txt) <= limit: + return txt + limit -= len(sep) + return txt[:limit // 2] + sep + txt[-((limit+1) // 2):] + + +def shorten_string_eaw(txt, limit, sep="…", cache=EAWCache()): + """Limit width of 'txt'; check for east-asian characters with width > 1""" + char_widths = [cache[c] for c in txt] + text_width = sum(char_widths) + + if text_width <= limit: + # no shortening required + return txt + + limit -= len(sep) + if text_width == len(txt): + # all characters have a width of 1 + return txt[:limit // 2] + sep + txt[-((limit+1) // 2):] + + # wide characters + left = 0 + lwidth = limit // 2 + while True: + lwidth -= char_widths[left] + if lwidth < 0: + break + left += 1 + + right = -1 + rwidth = (limit+1) // 2 + (lwidth + char_widths[left]) + while True: + rwidth -= char_widths[right] + if rwidth < 0: + break + right -= 1 + + return txt[:left] + sep + txt[right+1:] + + if util.WINDOWS: ANSI = os.environ.get("TERM") == "ANSI" OFFSET = 1 diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh index 2c7eb48add..0a7f0d6c24 100755 --- a/scripts/run_tests.sh +++ b/scripts/run_tests.sh @@ -2,7 +2,7 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -TESTS_CORE=(cache config cookies downloader extractor job oauth postprocessor text util) +TESTS_CORE=(cache config cookies downloader extractor job oauth output postprocessor text util) TESTS_RESULTS=(results) diff --git a/test/test_output.py b/test/test_output.py new file mode 100644 index 0000000000..84433f0815 --- /dev/null +++ b/test/test_output.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright 2021 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +import os +import sys +import unittest + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from gallery_dl import output # noqa E402 + + +class TestShorten(unittest.TestCase): + + def test_shorten_noop(self, f=output.shorten_string): + self.assertEqual(f("" , 10), "") + self.assertEqual(f("foobar", 10), "foobar") + + def test_shorten(self, f=output.shorten_string): + s = "01234567890123456789" # string of length 20 + self.assertEqual(f(s, 30), s) + self.assertEqual(f(s, 25), s) + self.assertEqual(f(s, 20), s) + self.assertEqual(f(s, 19), "012345678…123456789") + self.assertEqual(f(s, 18), "01234567…123456789") + self.assertEqual(f(s, 17), "01234567…23456789") + self.assertEqual(f(s, 16), "0123456…23456789") + self.assertEqual(f(s, 15), "0123456…3456789") + self.assertEqual(f(s, 14), "012345…3456789") + self.assertEqual(f(s, 13), "012345…456789") + self.assertEqual(f(s, 12), "01234…456789") + self.assertEqual(f(s, 11), "01234…56789") + self.assertEqual(f(s, 10), "0123…56789") + self.assertEqual(f(s, 9) , "0123…6789") + self.assertEqual(f(s, 3) , "0…9") + self.assertEqual(f(s, 2) , "…9") + + def test_shorten_separator(self, f=output.shorten_string): + s = "01234567890123456789" # string of length 20 + self.assertEqual(f(s, 20, "|---|"), s) + self.assertEqual(f(s, 19, "|---|"), "0123456|---|3456789") + self.assertEqual(f(s, 15, "|---|"), "01234|---|56789") + self.assertEqual(f(s, 10, "|---|"), "01|---|789") + + self.assertEqual(f(s, 19, "..."), "01234567...23456789") + self.assertEqual(f(s, 19, "..") , "01234567..123456789") + self.assertEqual(f(s, 19, ".") , "012345678.123456789") + self.assertEqual(f(s, 19, "") , "0123456780123456789") + + +class TestShortenEAW(unittest.TestCase): + + def test_shorten_eaw_noop(self, f=output.shorten_string_eaw): + self.assertEqual(f("" , 10), "") + self.assertEqual(f("foobar", 10), "foobar") + + def test_shorten_eaw(self, f=output.shorten_string_eaw): + s = "01234567890123456789" # 20 ascii characters + self.assertEqual(f(s, 30), s) + self.assertEqual(f(s, 25), s) + self.assertEqual(f(s, 20), s) + self.assertEqual(f(s, 19), "012345678…123456789") + self.assertEqual(f(s, 18), "01234567…123456789") + self.assertEqual(f(s, 17), "01234567…23456789") + self.assertEqual(f(s, 16), "0123456…23456789") + self.assertEqual(f(s, 15), "0123456…3456789") + self.assertEqual(f(s, 14), "012345…3456789") + self.assertEqual(f(s, 13), "012345…456789") + self.assertEqual(f(s, 12), "01234…456789") + self.assertEqual(f(s, 11), "01234…56789") + self.assertEqual(f(s, 10), "0123…56789") + self.assertEqual(f(s, 9) , "0123…6789") + self.assertEqual(f(s, 3) , "0…9") + self.assertEqual(f(s, 2) , "…9") + + def test_shorten_eaw_wide(self, f=output.shorten_string_eaw): + s = "幻想郷幻想郷幻想郷幻想郷" # 12 wide characters + self.assertEqual(f(s, 30), s) + self.assertEqual(f(s, 25), s) + self.assertEqual(f(s, 20), "幻想郷幻…想郷幻想郷") + self.assertEqual(f(s, 19), "幻想郷幻…想郷幻想郷") + self.assertEqual(f(s, 18), "幻想郷幻…郷幻想郷") + self.assertEqual(f(s, 17), "幻想郷幻…郷幻想郷") + self.assertEqual(f(s, 16), "幻想郷…郷幻想郷") + self.assertEqual(f(s, 15), "幻想郷…郷幻想郷") + self.assertEqual(f(s, 14), "幻想郷…幻想郷") + self.assertEqual(f(s, 13), "幻想郷…幻想郷") + self.assertEqual(f(s, 12), "幻想…幻想郷") + self.assertEqual(f(s, 11), "幻想…幻想郷") + self.assertEqual(f(s, 10), "幻想…想郷") + self.assertEqual(f(s, 9) , "幻想…想郷") + self.assertEqual(f(s, 3) , "…郷") + + def test_shorten_eaw_mix(self, f=output.shorten_string_eaw): + s = "幻-想-郷##幻-想-郷##幻-想-郷" # mixed characters + self.assertEqual(f(s, 28), s) + self.assertEqual(f(s, 25), "幻-想-郷##幻…郷##幻-想-郷") + + self.assertEqual(f(s, 20), "幻-想-郷#…##幻-想-郷") + self.assertEqual(f(s, 19), "幻-想-郷#…#幻-想-郷") + self.assertEqual(f(s, 18), "幻-想-郷…#幻-想-郷") + self.assertEqual(f(s, 17), "幻-想-郷…幻-想-郷") + self.assertEqual(f(s, 16), "幻-想-…#幻-想-郷") + self.assertEqual(f(s, 15), "幻-想-…幻-想-郷") + self.assertEqual(f(s, 14), "幻-想-…-想-郷") + self.assertEqual(f(s, 13), "幻-想-…-想-郷") + self.assertEqual(f(s, 12), "幻-想…-想-郷") + self.assertEqual(f(s, 11), "幻-想…想-郷") + self.assertEqual(f(s, 10), "幻-…-想-郷") + self.assertEqual(f(s, 9) , "幻-…想-郷") + self.assertEqual(f(s, 3) , "…郷") + + def test_shorten_eaw_separator(self, f=output.shorten_string_eaw): + s = "01234567890123456789" # 20 ascii characters + self.assertEqual(f(s, 20, "|---|"), s) + self.assertEqual(f(s, 19, "|---|"), "0123456|---|3456789") + self.assertEqual(f(s, 15, "|---|"), "01234|---|56789") + self.assertEqual(f(s, 10, "|---|"), "01|---|789") + + self.assertEqual(f(s, 19, "..."), "01234567...23456789") + self.assertEqual(f(s, 19, "..") , "01234567..123456789") + self.assertEqual(f(s, 19, ".") , "012345678.123456789") + self.assertEqual(f(s, 19, "") , "0123456780123456789") + + def test_shorten_eaw_separator_wide(self, f=output.shorten_string_eaw): + s = "幻想郷幻想郷幻想郷幻想郷" # 12 wide characters + self.assertEqual(f(s, 24, "|---|"), s) + self.assertEqual(f(s, 19, "|---|"), "幻想郷|---|郷幻想郷") + self.assertEqual(f(s, 15, "|---|"), "幻想|---|幻想郷") + self.assertEqual(f(s, 10, "|---|"), "幻|---|郷") + + self.assertEqual(f(s, 19, "..."), "幻想郷幻...郷幻想郷") + self.assertEqual(f(s, 19, "..") , "幻想郷幻..郷幻想郷") + self.assertEqual(f(s, 19, ".") , "幻想郷幻.想郷幻想郷") + self.assertEqual(f(s, 19, "") , "幻想郷幻想郷幻想郷") + + def test_shorten_eaw_separator_mix_(self, f=output.shorten_string_eaw): + s = "幻-想-郷##幻-想-郷##幻-想-郷" # mixed characters + self.assertEqual(f(s, 30, "|---|"), s) + self.assertEqual(f(s, 19, "|---|"), "幻-想-|---|幻-想-郷") + self.assertEqual(f(s, 15, "|---|"), "幻-想|---|想-郷") + self.assertEqual(f(s, 10, "|---|"), "幻|---|-郷") + + self.assertEqual(f(s, 19, "..."), "幻-想-郷...幻-想-郷") + self.assertEqual(f(s, 19, "..") , "幻-想-郷..#幻-想-郷") + self.assertEqual(f(s, 19, ".") , "幻-想-郷#.#幻-想-郷") + self.assertEqual(f(s, 19, "") , "幻-想-郷###幻-想-郷") + + +if __name__ == '__main__': + unittest.main()