Skip to content

Commit

Permalink
implement a way to shorten filenames with east-asian characters
Browse files Browse the repository at this point in the history
(#1377)

Setting 'output.shorten' to "eaw" (East-Asian Width) uses a slower
algorithm that also considers characters with a width > 1.
  • Loading branch information
mikf committed Sep 13, 2021
1 parent 2ff2974 commit bd84530
Show file tree
Hide file tree
Showing 4 changed files with 219 additions and 15 deletions.
3 changes: 3 additions & 0 deletions docs/configuration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2531,6 +2531,9 @@ Description
Controls whether the output strings should be shortened to fit
on one console line.

Set this option to ``"eaw"`` to also account for East Asian characters
with a display width greater than 1 (this uses a slower algorithm).


output.skip
-----------
Expand Down
73 changes: 59 additions & 14 deletions gallery_dl/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import sys
import shutil
import logging
import unicodedata
from . import config, util


Expand Down Expand Up @@ -270,9 +271,14 @@ def success(self, path, tries):
class TerminalOutput(NullOutput):

def __init__(self):
self.short = config.get(("output",), "shorten", True)
if self.short:
self.width = shutil.get_terminal_size().columns - OFFSET
shorten = config.get(("output",), "shorten", True)
if shorten:
func = shorten_string_eaw if shorten == "eaw" else shorten_string
limit = shutil.get_terminal_size().columns - OFFSET
sep = CHAR_ELLIPSIES
self.shorten = lambda txt: func(txt, limit, sep)
else:
self.shorten = util.identity

def start(self, path):
print(self.shorten(" " + path), end="", flush=True)
Expand All @@ -283,17 +289,6 @@ def skip(self, path):
def success(self, path, tries):
print("\r", self.shorten(CHAR_SUCCESS + path), sep="")

def shorten(self, txt):
"""Reduce the length of 'txt' to the width of the terminal"""
if self.short and len(txt) > self.width:
hwidth = self.width // 2 - OFFSET
return "".join((
txt[:hwidth-1],
CHAR_ELLIPSIES,
txt[-hwidth-(self.width % 2):]
))
return txt


class ColorOutput(TerminalOutput):

Expand All @@ -307,6 +302,56 @@ def success(self, path, tries):
print("\r\033[1;32m", self.shorten(path), "\033[0m", sep="")


class EAWCache(dict):
    """Memoize the display width (1 or 2 columns) of single characters."""

    def __missing__(self, key):
        # Only the "W" (wide) and "F" (fullwidth) East Asian Width
        # categories count as two columns; everything else counts as one.
        width = self[key] = \
            2 if unicodedata.east_asian_width(key) in ("W", "F") else 1
        return width


def shorten_string(txt, limit, sep="…"):
    """Limit width of 'txt'; assume all characters have a width of 1

    When 'txt' is longer than 'limit', its middle part is replaced by
    'sep' so that the result is 'limit' characters long. The tail gets
    the extra character when the remaining space is odd.

    If 'limit' leaves no room for any text next to 'sep'
    (limit <= len(sep)), 'sep' alone is returned.
    """
    if len(txt) <= limit:
        return txt

    # Space left for actual text; never negative, so that a tiny limit
    # cannot turn into a negative slice bound below.
    budget = max(limit - len(sep), 0)
    head = budget // 2
    tail = budget - head

    # Slice the tail from an explicit non-negative index:
    # 'txt[-0:]' would be the whole string instead of an empty one.
    return txt[:head] + sep + txt[len(txt) - tail:]


def shorten_string_eaw(txt, limit, sep="…", cache=EAWCache()):
    """Limit width of 'txt'; check for east-asian characters with width > 1

    'limit' is measured in display columns. The width of 'sep' is taken
    to be len(sep). 'cache' maps single characters to their width; the
    default instance is created once and deliberately shared between
    calls so that widths are only looked up once per character.

    If 'limit' leaves no room for any text next to 'sep', 'sep' alone
    is returned.
    """
    char_widths = [cache[c] for c in txt]
    text_width = sum(char_widths)

    if text_width <= limit:
        # no shortening required
        return txt

    if text_width == len(txt):
        # all characters have a width of 1
        return shorten_string(txt, limit, sep)

    # wide characters
    limit = max(limit - len(sep), 0)

    # take characters from the start while they fit into the left half
    left = 0
    lwidth = limit // 2
    while True:
        lwidth -= char_widths[left]
        if lwidth < 0:
            break
        left += 1

    # columns the left half could not use (the next character was too
    # wide) are handed over to the right half
    right = -1
    rwidth = (limit + 1) // 2 + (lwidth + char_widths[left])
    while True:
        rwidth -= char_widths[right]
        if rwidth < 0:
            break
        right -= 1

    # 'right' is the negative index of the first character that did NOT
    # fit; convert to a non-negative start index, since 'txt[right+1:]'
    # with right == -1 would be 'txt[0:]', i.e. the whole string.
    return txt[:left] + sep + txt[len(txt) + right + 1:]


if util.WINDOWS:
ANSI = os.environ.get("TERM") == "ANSI"
OFFSET = 1
Expand Down
2 changes: 1 addition & 1 deletion scripts/run_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

TESTS_CORE=(cache config cookies downloader extractor job oauth postprocessor text util)
TESTS_CORE=(cache config cookies downloader extractor job oauth output postprocessor text util)
TESTS_RESULTS=(results)


Expand Down
156 changes: 156 additions & 0 deletions test/test_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

import os
import sys
import unittest

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from gallery_dl import output # noqa E402


class TestShorten(unittest.TestCase):
    """Expectations for output.shorten_string (one column per character)."""

    def test_shorten_noop(self, f=output.shorten_string):
        # text that already fits must come back unchanged
        for text in ("", "foobar"):
            self.assertEqual(f(text, 10), text)

    def test_shorten(self, f=output.shorten_string):
        s = "01234567890123456789"  # string of length 20
        # (limit, expected result) with the default separator
        cases = (
            (30, s),
            (25, s),
            (20, s),
            (19, "012345678…123456789"),
            (18, "01234567…123456789"),
            (17, "01234567…23456789"),
            (16, "0123456…23456789"),
            (15, "0123456…3456789"),
            (14, "012345…3456789"),
            (13, "012345…456789"),
            (12, "01234…456789"),
            (11, "01234…56789"),
            (10, "0123…56789"),
            (9, "0123…6789"),
            (3, "0…9"),
            (2, "…9"),
        )
        for limit, expected in cases:
            self.assertEqual(f(s, limit), expected)

    def test_shorten_separator(self, f=output.shorten_string):
        s = "01234567890123456789"  # string of length 20
        # (limit, separator, expected result)
        cases = (
            (20, "|---|", s),
            (19, "|---|", "0123456|---|3456789"),
            (15, "|---|", "01234|---|56789"),
            (10, "|---|", "01|---|789"),

            (19, "...", "01234567...23456789"),
            (19, "..", "01234567..123456789"),
            (19, ".", "012345678.123456789"),
            (19, "", "0123456780123456789"),
        )
        for limit, sep, expected in cases:
            self.assertEqual(f(s, limit, sep), expected)


class TestShortenEAW(unittest.TestCase):
    """Expectations for output.shorten_string_eaw (width-aware variant)."""

    def test_shorten_eaw_noop(self, f=output.shorten_string_eaw):
        # text that already fits must come back unchanged
        for text in ("", "foobar"):
            self.assertEqual(f(text, 10), text)

    def test_shorten_eaw(self, f=output.shorten_string_eaw):
        s = "01234567890123456789"  # 20 ascii characters
        # (limit, expected result) with the default separator
        cases = (
            (30, s),
            (25, s),
            (20, s),
            (19, "012345678…123456789"),
            (18, "01234567…123456789"),
            (17, "01234567…23456789"),
            (16, "0123456…23456789"),
            (15, "0123456…3456789"),
            (14, "012345…3456789"),
            (13, "012345…456789"),
            (12, "01234…456789"),
            (11, "01234…56789"),
            (10, "0123…56789"),
            (9, "0123…6789"),
            (3, "0…9"),
            (2, "…9"),
        )
        for limit, expected in cases:
            self.assertEqual(f(s, limit), expected)

    def test_shorten_eaw_wide(self, f=output.shorten_string_eaw):
        s = "幻想郷幻想郷幻想郷幻想郷"  # 12 wide characters
        cases = (
            (30, s),
            (25, s),
            (20, "幻想郷幻…想郷幻想郷"),
            (19, "幻想郷幻…想郷幻想郷"),
            (18, "幻想郷幻…郷幻想郷"),
            (17, "幻想郷幻…郷幻想郷"),
            (16, "幻想郷…郷幻想郷"),
            (15, "幻想郷…郷幻想郷"),
            (14, "幻想郷…幻想郷"),
            (13, "幻想郷…幻想郷"),
            (12, "幻想…幻想郷"),
            (11, "幻想…幻想郷"),
            (10, "幻想…想郷"),
            (9, "幻想…想郷"),
            (3, "…郷"),
        )
        for limit, expected in cases:
            self.assertEqual(f(s, limit), expected)

    def test_shorten_eaw_mix(self, f=output.shorten_string_eaw):
        s = "幻-想-郷##幻-想-郷##幻-想-郷"  # mixed characters
        cases = (
            (28, s),
            (25, "幻-想-郷##幻…郷##幻-想-郷"),

            (20, "幻-想-郷#…##幻-想-郷"),
            (19, "幻-想-郷#…#幻-想-郷"),
            (18, "幻-想-郷…#幻-想-郷"),
            (17, "幻-想-郷…幻-想-郷"),
            (16, "幻-想-…#幻-想-郷"),
            (15, "幻-想-…幻-想-郷"),
            (14, "幻-想-…-想-郷"),
            (13, "幻-想-…-想-郷"),
            (12, "幻-想…-想-郷"),
            (11, "幻-想…想-郷"),
            (10, "幻-…-想-郷"),
            (9, "幻-…想-郷"),
            (3, "…郷"),
        )
        for limit, expected in cases:
            self.assertEqual(f(s, limit), expected)

    def test_shorten_eaw_separator(self, f=output.shorten_string_eaw):
        s = "01234567890123456789"  # 20 ascii characters
        # (limit, separator, expected result)
        cases = (
            (20, "|---|", s),
            (19, "|---|", "0123456|---|3456789"),
            (15, "|---|", "01234|---|56789"),
            (10, "|---|", "01|---|789"),

            (19, "...", "01234567...23456789"),
            (19, "..", "01234567..123456789"),
            (19, ".", "012345678.123456789"),
            (19, "", "0123456780123456789"),
        )
        for limit, sep, expected in cases:
            self.assertEqual(f(s, limit, sep), expected)

    def test_shorten_eaw_separator_wide(self, f=output.shorten_string_eaw):
        s = "幻想郷幻想郷幻想郷幻想郷"  # 12 wide characters
        cases = (
            (24, "|---|", s),
            (19, "|---|", "幻想郷|---|郷幻想郷"),
            (15, "|---|", "幻想|---|幻想郷"),
            (10, "|---|", "幻|---|郷"),

            (19, "...", "幻想郷幻...郷幻想郷"),
            (19, "..", "幻想郷幻..郷幻想郷"),
            (19, ".", "幻想郷幻.想郷幻想郷"),
            (19, "", "幻想郷幻想郷幻想郷"),
        )
        for limit, sep, expected in cases:
            self.assertEqual(f(s, limit, sep), expected)

    def test_shorten_eaw_separator_mix_(self, f=output.shorten_string_eaw):
        s = "幻-想-郷##幻-想-郷##幻-想-郷"  # mixed characters
        cases = (
            (30, "|---|", s),
            (19, "|---|", "幻-想-|---|幻-想-郷"),
            (15, "|---|", "幻-想|---|想-郷"),
            (10, "|---|", "幻|---|-郷"),

            (19, "...", "幻-想-郷...幻-想-郷"),
            (19, "..", "幻-想-郷..#幻-想-郷"),
            (19, ".", "幻-想-郷#.#幻-想-郷"),
            (19, "", "幻-想-郷###幻-想-郷"),
        )
        for limit, sep, expected in cases:
            self.assertEqual(f(s, limit, sep), expected)


if __name__ == '__main__':
unittest.main()

0 comments on commit bd84530

Please sign in to comment.