From 90ccc5746626cdd06f52cda3119336fe714cfdc5 Mon Sep 17 00:00:00 2001 From: moto <855818+mthrok@users.noreply.github.com> Date: Thu, 23 Jul 2020 16:15:43 +0000 Subject: [PATCH] Make walk_files traverse in alphabetical, top-down (depth-first) order. --- test/datasets/libritts_test.py | 5 +--- test/datasets/utils_test.py | 44 ++++++++++++++++++++++++++++++++++ test/datasets/yesno_test.py | 4 +--- torchaudio/datasets/utils.py | 8 ++++++- 4 files changed, 53 insertions(+), 8 deletions(-) create mode 100644 test/datasets/utils_test.py diff --git a/test/datasets/libritts_test.py b/test/datasets/libritts_test.py index b84cfdac30..16c3fdf8a7 100644 --- a/test/datasets/libritts_test.py +++ b/test/datasets/libritts_test.py @@ -49,16 +49,13 @@ def setUpClass(cls): def test_libritts(self): dataset = LIBRITTS(self.root_dir) - samples = list(dataset) - samples.sort(key=lambda s: s[4]) - for i, (waveform, sample_rate, original_text, normalized_text, speaker_id, chapter_id, - utterance_id) in enumerate(samples): + utterance_id) in enumerate(dataset): expected_ids = self.utterance_ids[i] expected_data = self.data[i] diff --git a/test/datasets/utils_test.py b/test/datasets/utils_test.py new file mode 100644 index 0000000000..98cac636b0 --- /dev/null +++ b/test/datasets/utils_test.py @@ -0,0 +1,44 @@ +import os +from pathlib import Path + +from torchaudio.datasets import utils as dataset_utils + +from ..common_utils import ( + TempDirMixin, + TorchaudioTestCase, +) + + +class TestWalkFiles(TempDirMixin, TorchaudioTestCase): + root = None + expected = None + + def _add_file(self, *parts): + path = self.get_temp_path(*parts) + self.expected.append(path) + Path(path).touch() + + def setUp(self): + self.root = self.get_temp_path() + self.expected = [] + + # level 1 + for filename in ['a.txt', 'b.txt', 'c.txt']: + self._add_file(filename) + + # level 2 + for dir1 in ['d1', 'd2', 'd3']: + for filename in ['d.txt', 'e.txt', 'f.txt']: + self._add_file(dir1, filename) + # level 3 + for dir2 in 
['d1', 'd2', 'd3']: + for filename in ['g.txt', 'h.txt', 'i.txt']: + self._add_file(dir1, dir2, filename) + + print('\n'.join(self.expected)) + + def test_walk_files(self): + """walk_files should traverse files in alphabetical order""" + for i, path in enumerate(dataset_utils.walk_files(self.root, '.txt', prefix=True)): + found = os.path.join(self.root, path) + assert found == self.expected[i] diff --git a/test/datasets/yesno_test.py b/test/datasets/yesno_test.py index 42b6112e7f..08114c5f2e 100644 --- a/test/datasets/yesno_test.py +++ b/test/datasets/yesno_test.py @@ -38,9 +38,7 @@ def setUpClass(cls): def test_yesno(self): dataset = yesno.YESNO(self.root_dir) - samples = list(dataset) - samples.sort(key=lambda s: s[2]) - for i, (waveform, sample_rate, label) in enumerate(samples): + for i, (waveform, sample_rate, label) in enumerate(dataset): expected_label = self.labels[i] expected_data = self.data[i] self.assertEqual(expected_data, waveform, atol=5e-5, rtol=1e-8) diff --git a/torchaudio/datasets/utils.py b/torchaudio/datasets/utils.py index 2840b43471..d00ffc143b 100644 --- a/torchaudio/datasets/utils.py +++ b/torchaudio/datasets/utils.py @@ -264,7 +264,13 @@ def walk_files(root: str, root = os.path.expanduser(root) - for dirpath, _, files in os.walk(root): + for dirpath, dirs, files in os.walk(root): + dirs.sort() + # `dirs` is the same list object that os.walk uses to decide which subdirectories to visit next, + # so sorting it in place makes os.walk traverse the subdirectories in alphabetical order. + # See also: + # https://stackoverflow.com/questions/6670029/can-i-force-python3s-os-walk-to-visit-directories-in-alphabetical-order-how#comment71993866_6670926 + files.sort() for f in files: if f.endswith(suffix):