implement a way to shorten filenames with east-asian characters

(#1377) Setting 'output.shorten' to "eaw" (East-Asian Width) uses a slower algorithm that also considers characters with a width > 1.
mikf · Sep 13, 2021 · bd84530 · bd84530
1 parent 2ff2974
commit bd84530
Show file tree

Hide file tree

Showing 4 changed files with 219 additions and 15 deletions.
diff --git a/docs/configuration.rst b/docs/configuration.rst
@@ -2531,6 +2531,9 @@ Description
     Controls whether the output strings should be shortened to fit
     on one console line.
 
+    Set this option to ``"eaw"`` (East Asian Width) to also account for
+    East Asian characters with a display width greater than 1.
+    This uses a slower algorithm.
+
 
 output.skip
 -----------

diff --git a/gallery_dl/output.py b/gallery_dl/output.py
@@ -10,6 +10,7 @@
 import sys
 import shutil
 import logging
+import unicodedata
 from . import config, util
 
 
@@ -270,9 +271,14 @@ def success(self, path, tries):
 class TerminalOutput(NullOutput):
 
     def __init__(self):
-        self.short = config.get(("output",), "shorten", True)
-        if self.short:
-            self.width = shutil.get_terminal_size().columns - OFFSET
+        # 'output.shorten' is either falsy (no shortening), the string
+        # "eaw" (width-aware shortening), or any other truthy value
+        # (plain length-based shortening)
+        shorten = config.get(("output",), "shorten", True)
+        if shorten:
+            func = shorten_string_eaw if shorten == "eaw" else shorten_string
+            # the terminal size is read once here and captured by the
+            # lambda below; it is not queried again per call
+            limit = shutil.get_terminal_size().columns - OFFSET
+            sep = CHAR_ELLIPSIES
+            self.shorten = lambda txt: func(txt, limit, sep)
+        else:
+            # shortening disabled: self.shorten is set to util.identity
+            self.shorten = util.identity
 
     def start(self, path):
         print(self.shorten("  " + path), end="", flush=True)
@@ -283,17 +289,6 @@ def skip(self, path):
     def success(self, path, tries):
         print("\r", self.shorten(CHAR_SUCCESS + path), sep="")
 
-    def shorten(self, txt):
-        """Reduce the length of 'txt' to the width of the terminal"""
-        if self.short and len(txt) > self.width:
-            hwidth = self.width // 2 - OFFSET
-            return "".join((
-                txt[:hwidth-1],
-                CHAR_ELLIPSIES,
-                txt[-hwidth-(self.width % 2):]
-            ))
-        return txt
-
 
 class ColorOutput(TerminalOutput):
 
@@ -307,6 +302,56 @@ def success(self, path, tries):
         print("\r\033[1;32m", self.shorten(path), "\033[0m", sep="")
 
 
+class EAWCache(dict):
+
+    def __missing__(self, key):
+        width = self[key] = \
+            2 if unicodedata.east_asian_width(key) in "WF" else 1
+        return width
+
+
+def shorten_string(txt, limit, sep="…"):
+    """Limit width of 'txt'; assume all characters have a width of 1"""
+    if len(txt) <= limit:
+        return txt
+    limit -= len(sep)
+    return txt[:limit // 2] + sep + txt[-((limit+1) // 2):]
+
+
+def shorten_string_eaw(txt, limit, sep="…", cache=EAWCache()):
+    """Limit width of 'txt'; check for east-asian characters with width > 1"""
+    char_widths = [cache[c] for c in txt]
+    text_width = sum(char_widths)
+
+    if text_width <= limit:
+        # no shortening required
+        return txt
+
+    limit -= len(sep)
+    if text_width == len(txt):
+        # all characters have a width of 1
+        return txt[:limit // 2] + sep + txt[-((limit+1) // 2):]
+
+    # wide characters
+    left = 0
+    lwidth = limit // 2
+    while True:
+        lwidth -= char_widths[left]
+        if lwidth < 0:
+            break
+        left += 1
+
+    right = -1
+    rwidth = (limit+1) // 2 + (lwidth + char_widths[left])
+    while True:
+        rwidth -= char_widths[right]
+        if rwidth < 0:
+            break
+        right -= 1
+
+    return txt[:left] + sep + txt[right+1:]
+
+
 if util.WINDOWS:
     ANSI = os.environ.get("TERM") == "ANSI"
     OFFSET = 1

diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh
@@ -2,7 +2,7 @@
 
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
-TESTS_CORE=(cache config cookies downloader extractor job oauth postprocessor text util)
+TESTS_CORE=(cache config cookies downloader extractor job oauth output postprocessor text util)
 TESTS_RESULTS=(results)
 
 

diff --git a/test/test_output.py b/test/test_output.py
@@ -0,0 +1,156 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2021 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+import os
+import sys
+import unittest
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from gallery_dl import output  # noqa E402
+
+
+class TestShorten(unittest.TestCase):
+    """Tests for output.shorten_string()"""
+
+    def test_shorten_noop(self, f=output.shorten_string):
+        # input that is not longer than the limit is returned unchanged
+        self.assertEqual(f(""      , 10), "")
+        self.assertEqual(f("foobar", 10), "foobar")
+
+    def test_shorten(self, f=output.shorten_string):
+        # results are exactly as long as the limit; when the remaining
+        # characters cannot be split evenly, the tail gets the extra one
+        s = "01234567890123456789"  # string of length 20
+        self.assertEqual(f(s, 30), s)
+        self.assertEqual(f(s, 25), s)
+        self.assertEqual(f(s, 20), s)
+        self.assertEqual(f(s, 19), "012345678…123456789")
+        self.assertEqual(f(s, 18), "01234567…123456789")
+        self.assertEqual(f(s, 17), "01234567…23456789")
+        self.assertEqual(f(s, 16), "0123456…23456789")
+        self.assertEqual(f(s, 15), "0123456…3456789")
+        self.assertEqual(f(s, 14), "012345…3456789")
+        self.assertEqual(f(s, 13), "012345…456789")
+        self.assertEqual(f(s, 12), "01234…456789")
+        self.assertEqual(f(s, 11), "01234…56789")
+        self.assertEqual(f(s, 10), "0123…56789")
+        self.assertEqual(f(s, 9) , "0123…6789")
+        self.assertEqual(f(s, 3) , "0…9")
+        self.assertEqual(f(s, 2) , "…9")
+
+    def test_shorten_separator(self, f=output.shorten_string):
+        # the length of a custom separator counts towards the limit
+        s = "01234567890123456789"  # string of length 20
+        self.assertEqual(f(s, 20, "|---|"), s)
+        self.assertEqual(f(s, 19, "|---|"), "0123456|---|3456789")
+        self.assertEqual(f(s, 15, "|---|"), "01234|---|56789")
+        self.assertEqual(f(s, 10, "|---|"), "01|---|789")
+
+        self.assertEqual(f(s, 19, "..."), "01234567...23456789")
+        self.assertEqual(f(s, 19, "..") , "01234567..123456789")
+        self.assertEqual(f(s, 19, ".")  , "012345678.123456789")
+        self.assertEqual(f(s, 19, "")   , "0123456780123456789")
+
+
+class TestShortenEAW(unittest.TestCase):
+    """Tests for output.shorten_string_eaw()"""
+
+    def test_shorten_eaw_noop(self, f=output.shorten_string_eaw):
+        # input that is not wider than the limit is returned unchanged
+        self.assertEqual(f(""      , 10), "")
+        self.assertEqual(f("foobar", 10), "foobar")
+
+    def test_shorten_eaw(self, f=output.shorten_string_eaw):
+        # same input and expected values as TestShorten.test_shorten
+        s = "01234567890123456789"  # 20 ascii characters
+        self.assertEqual(f(s, 30), s)
+        self.assertEqual(f(s, 25), s)
+        self.assertEqual(f(s, 20), s)
+        self.assertEqual(f(s, 19), "012345678…123456789")
+        self.assertEqual(f(s, 18), "01234567…123456789")
+        self.assertEqual(f(s, 17), "01234567…23456789")
+        self.assertEqual(f(s, 16), "0123456…23456789")
+        self.assertEqual(f(s, 15), "0123456…3456789")
+        self.assertEqual(f(s, 14), "012345…3456789")
+        self.assertEqual(f(s, 13), "012345…456789")
+        self.assertEqual(f(s, 12), "01234…456789")
+        self.assertEqual(f(s, 11), "01234…56789")
+        self.assertEqual(f(s, 10), "0123…56789")
+        self.assertEqual(f(s, 9) , "0123…6789")
+        self.assertEqual(f(s, 3) , "0…9")
+        self.assertEqual(f(s, 2) , "…9")
+
+    def test_shorten_eaw_wide(self, f=output.shorten_string_eaw):
+        # every character counts as 2 columns, so a result can be one
+        # column narrower than the limit (e.g. limits 20 and 19 agree)
+        s = "幻想郷幻想郷幻想郷幻想郷"  # 12 wide characters
+        self.assertEqual(f(s, 30), s)
+        self.assertEqual(f(s, 25), s)
+        self.assertEqual(f(s, 20), "幻想郷幻…想郷幻想郷")
+        self.assertEqual(f(s, 19), "幻想郷幻…想郷幻想郷")
+        self.assertEqual(f(s, 18), "幻想郷幻…郷幻想郷")
+        self.assertEqual(f(s, 17), "幻想郷幻…郷幻想郷")
+        self.assertEqual(f(s, 16), "幻想郷…郷幻想郷")
+        self.assertEqual(f(s, 15), "幻想郷…郷幻想郷")
+        self.assertEqual(f(s, 14), "幻想郷…幻想郷")
+        self.assertEqual(f(s, 13), "幻想郷…幻想郷")
+        self.assertEqual(f(s, 12), "幻想…幻想郷")
+        self.assertEqual(f(s, 11), "幻想…幻想郷")
+        self.assertEqual(f(s, 10), "幻想…想郷")
+        self.assertEqual(f(s, 9) , "幻想…想郷")
+        self.assertEqual(f(s, 3) , "…郷")
+
+    def test_shorten_eaw_mix(self, f=output.shorten_string_eaw):
+        # wide characters interleaved with "-" and "#" (1 column each)
+        s = "幻-想-郷##幻-想-郷##幻-想-郷"  # mixed characters
+        self.assertEqual(f(s, 28), s)
+        self.assertEqual(f(s, 25), "幻-想-郷##幻…郷##幻-想-郷")
+
+        self.assertEqual(f(s, 20), "幻-想-郷#…##幻-想-郷")
+        self.assertEqual(f(s, 19), "幻-想-郷#…#幻-想-郷")
+        self.assertEqual(f(s, 18), "幻-想-郷…#幻-想-郷")
+        self.assertEqual(f(s, 17), "幻-想-郷…幻-想-郷")
+        self.assertEqual(f(s, 16), "幻-想-…#幻-想-郷")
+        self.assertEqual(f(s, 15), "幻-想-…幻-想-郷")
+        self.assertEqual(f(s, 14), "幻-想-…-想-郷")
+        self.assertEqual(f(s, 13), "幻-想-…-想-郷")
+        self.assertEqual(f(s, 12), "幻-想…-想-郷")
+        self.assertEqual(f(s, 11), "幻-想…想-郷")
+        self.assertEqual(f(s, 10), "幻-…-想-郷")
+        self.assertEqual(f(s, 9) , "幻-…想-郷")
+        self.assertEqual(f(s, 3) , "…郷")
+
+    def test_shorten_eaw_separator(self, f=output.shorten_string_eaw):
+        # same input and expected values as
+        # TestShorten.test_shorten_separator
+        s = "01234567890123456789"  # 20 ascii characters
+        self.assertEqual(f(s, 20, "|---|"), s)
+        self.assertEqual(f(s, 19, "|---|"), "0123456|---|3456789")
+        self.assertEqual(f(s, 15, "|---|"), "01234|---|56789")
+        self.assertEqual(f(s, 10, "|---|"), "01|---|789")
+
+        self.assertEqual(f(s, 19, "..."), "01234567...23456789")
+        self.assertEqual(f(s, 19, "..") , "01234567..123456789")
+        self.assertEqual(f(s, 19, ".")  , "012345678.123456789")
+        self.assertEqual(f(s, 19, "")   , "0123456780123456789")
+
+    def test_shorten_eaw_separator_wide(self, f=output.shorten_string_eaw):
+        # custom separators combined with wide-only input
+        s = "幻想郷幻想郷幻想郷幻想郷"  # 12 wide characters
+        self.assertEqual(f(s, 24, "|---|"), s)
+        self.assertEqual(f(s, 19, "|---|"), "幻想郷|---|郷幻想郷")
+        self.assertEqual(f(s, 15, "|---|"), "幻想|---|幻想郷")
+        self.assertEqual(f(s, 10, "|---|"), "幻|---|郷")
+
+        self.assertEqual(f(s, 19, "..."), "幻想郷幻...郷幻想郷")
+        self.assertEqual(f(s, 19, "..") , "幻想郷幻..郷幻想郷")
+        self.assertEqual(f(s, 19, ".")  , "幻想郷幻.想郷幻想郷")
+        self.assertEqual(f(s, 19, "")   , "幻想郷幻想郷幻想郷")
+
+    def test_shorten_eaw_separator_mix_(self, f=output.shorten_string_eaw):
+        # custom separators combined with mixed-width input
+        s = "幻-想-郷##幻-想-郷##幻-想-郷"  # mixed characters
+        self.assertEqual(f(s, 30, "|---|"), s)
+        self.assertEqual(f(s, 19, "|---|"), "幻-想-|---|幻-想-郷")
+        self.assertEqual(f(s, 15, "|---|"), "幻-想|---|想-郷")
+        self.assertEqual(f(s, 10, "|---|"), "幻|---|-郷")
+
+        self.assertEqual(f(s, 19, "..."), "幻-想-郷...幻-想-郷")
+        self.assertEqual(f(s, 19, "..") , "幻-想-郷..#幻-想-郷")
+        self.assertEqual(f(s, 19, ".")  , "幻-想-郷#.#幻-想-郷")
+        self.assertEqual(f(s, 19, "")   , "幻-想-郷###幻-想-郷")
+
+
+if __name__ == '__main__':
+    unittest.main()