Import torchaudio #1639 37dbf29

Summary: Import torchaudio #1639 37dbf29 Reviewed By: carolineechen, mthrok Differential Revision: D29920658 fbshipit-source-id: 94ba8c04edcfb50e355b1ca8e937f612917ecf38
pytorch · Jul 27, 2021 · 3702055 · 3702055
1 parent 1f1bd18
commit 3702055
Show file tree

Hide file tree

Showing 9 changed files with 211 additions and 64 deletions.
diff --git a/examples/pipeline_tacotron2/text/__init__.py b/examples/pipeline_tacotron2/text/__init__.py
diff --git a/examples/pipeline_tacotron2/text/numbers.py b/examples/pipeline_tacotron2/text/numbers.py
@@ -0,0 +1,95 @@
+# *****************************************************************************
+# Copyright (c) 2017 Keith Ito
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#
+# *****************************************************************************
+"""
+Modified from https://github.com/keithito/tacotron
+"""
+
+import inflect
+import re
+
+
+_inflect = inflect.engine()
+_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
+_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
+_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
+_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
+_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
+_number_re = re.compile(r'[0-9]+')
+
+
+def _remove_commas(m: re.Match) -> str:
+    return m.group(1).replace(',', '')
+
+
+def _expand_decimal_point(m: re.Match) -> str:
+    return m.group(1).replace('.', ' point ')
+
+
+def _expand_dollars(m: re.Match) -> str:
+    match = m.group(1)
+    parts = match.split('.')
+    if len(parts) > 2:
+        return match + ' dollars'  # Unexpected format
+    dollars = int(parts[0]) if parts[0] else 0
+    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
+    if dollars and cents:
+        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
+        cent_unit = 'cent' if cents == 1 else 'cents'
+        return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
+    elif dollars:
+        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
+        return '%s %s' % (dollars, dollar_unit)
+    elif cents:
+        cent_unit = 'cent' if cents == 1 else 'cents'
+        return '%s %s' % (cents, cent_unit)
+    else:
+        return 'zero dollars'
+
+
+def _expand_ordinal(m: re.Match) -> str:
+    return _inflect.number_to_words(m.group(0))
+
+
+def _expand_number(m: re.Match) -> str:
+    num = int(m.group(0))
+    if num > 1000 and num < 3000:
+        if num == 2000:
+            return 'two thousand'
+        elif num > 2000 and num < 2010:
+            return 'two thousand ' + _inflect.number_to_words(num % 100)
+        elif num % 100 == 0:
+            return _inflect.number_to_words(num // 100) + ' hundred'
+        else:
+            return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
+    else:
+        return _inflect.number_to_words(num, andword='')
+
+
+def normalize_numbers(text: str) -> str:
+    text = re.sub(_comma_number_re, _remove_commas, text)
+    text = re.sub(_pounds_re, r'\1 pounds', text)
+    text = re.sub(_dollars_re, _expand_dollars, text)
+    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
+    text = re.sub(_ordinal_re, _expand_ordinal, text)
+    text = re.sub(_number_re, _expand_number, text)
+    return text
diff --git a/examples/pipeline_tacotron2/text/test_text.py b/examples/pipeline_tacotron2/text/test_text.py
@@ -0,0 +1,22 @@
+import unittest
+
+from parameterized import parameterized
+
+from .text_preprocessing import text_to_sequence
+
+
+class TestTextPreprocessor(unittest.TestCase):
+
+    @parameterized.expand(
+        [
+            ["dr.  Strange?", [15, 26, 14, 31, 26, 29, 11, 30, 31, 29, 12, 25, 18, 16, 10]],
+            ["ML, is        fun.", [24, 23, 6, 11, 20, 30, 11, 17, 32, 25, 7]],
+            ["I love torchaudio!", [20, 11, 23, 26, 33, 16, 11, 31, 26, 29, 14, 19, 12, 32, 15, 20, 26, 2]],
+            # 'one thousand dollars, twenty cents'
+            ["$1,000.20", [26, 25, 16, 11, 31, 19, 26, 32, 30, 12, 25, 15, 11, 15, 26, 23, 23,
+                           12, 29, 30, 6, 11, 31, 34, 16, 25, 31, 36, 11, 14, 16, 25, 31, 30]],
+        ]
+    )
+    def test_text_to_sequence(self, sent, seq):
+
+        assert (text_to_sequence(sent) == seq)
diff --git a/examples/pipeline_tacotron2/text/text_preprocessing.py b/examples/pipeline_tacotron2/text/text_preprocessing.py
@@ -0,0 +1,85 @@
+# *****************************************************************************
+# Copyright (c) 2017 Keith Ito
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#
+# *****************************************************************************
+"""
+Modified from https://github.com/keithito/tacotron
+"""
+
+from typing import List
+import re
+
+from unidecode import unidecode
+
+from .numbers import normalize_numbers
+
+
+# Regular expression matching whitespace:
+_whitespace_re = re.compile(r'\s+')
+
+# List of (regular expression, replacement) pairs for abbreviations:
+_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
+    ('mrs', 'misess'),
+    ('mr', 'mister'),
+    ('dr', 'doctor'),
+    ('st', 'saint'),
+    ('co', 'company'),
+    ('jr', 'junior'),
+    ('maj', 'major'),
+    ('gen', 'general'),
+    ('drs', 'doctors'),
+    ('rev', 'reverend'),
+    ('lt', 'lieutenant'),
+    ('hon', 'honorable'),
+    ('sgt', 'sergeant'),
+    ('capt', 'captain'),
+    ('esq', 'esquire'),
+    ('ltd', 'limited'),
+    ('col', 'colonel'),
+    ('ft', 'fort'),
+]]
+
+_pad = '_'
+_punctuation = '!\'(),.:;? '
+_special = '-'
+_letters = 'abcdefghijklmnopqrstuvwxyz'
+
+symbols = [_pad] + list(_special) + list(_punctuation) + list(_letters)
+_symbol_to_id = {s: i for i, s in enumerate(symbols)}
+
+
+def text_to_sequence(sent: str) -> List[int]:
+    r'''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
+
+      Args:
+        sent (str): The input sentence to convert to a sequence.
+
+      Returns:
+        List of integers corresponding to the symbols in the sentence.
+    '''
+    sent = unidecode(sent)  # convert to ascii
+    sent = sent.lower()  # lower case
+    sent = normalize_numbers(sent)  # expand numbers
+    for regex, replacement in _abbreviations:  # expand abbreviations
+        sent = re.sub(regex, replacement, sent)
+    sent = re.sub(_whitespace_re, ' ', sent)  # collapse whitespace
+
+    return [_symbol_to_id[s] for s in sent if s in _symbol_to_id]
diff --git a/test/torchaudio_unittest/transforms/batch_consistency_test.py b/test/torchaudio_unittest/transforms/batch_consistency_test.py
@@ -33,15 +33,15 @@ def test_batch_Resample(self):
         self.assertEqual(computed, expected)
 
     def test_batch_MelScale(self):
-        specgram = torch.randn(2, 31, 2786)
+        specgram = torch.randn(2, 201, 256)
 
         # Single then transform then batch
         expected = torchaudio.transforms.MelScale()(specgram).repeat(3, 1, 1, 1)
 
         # Batch then transform
         computed = torchaudio.transforms.MelScale()(specgram.repeat(3, 1, 1, 1))
 
-        # shape = (3, 2, 201, 1394)
+        # shape = (3, 2, 128, 256)
         self.assertEqual(computed, expected)
 
     def test_batch_InverseMelScale(self):

diff --git a/test/torchaudio_unittest/transforms/torchscript_consistency_impl.py b/test/torchaudio_unittest/transforms/torchscript_consistency_impl.py
@@ -59,10 +59,6 @@ def test_AmplitudeToDB(self):
         spec = torch.rand((6, 201))
         self._assert_consistency(T.AmplitudeToDB(), spec)
 
-    def test_MelScale_invalid(self):
-        with self.assertRaises(ValueError):
-            torch.jit.script(T.MelScale())
-
     def test_MelScale(self):
         spec_f = torch.rand((1, 201, 6))
         self._assert_consistency(T.MelScale(n_stft=201), spec_f)

diff --git a/test/torchaudio_unittest/transforms/transforms_test.py b/test/torchaudio_unittest/transforms/transforms_test.py
@@ -55,17 +55,17 @@ def test_AmplitudeToDB(self):
         self.assertEqual(mag_to_db_torch, power_to_db_torch)
 
     def test_melscale_load_save(self):
-        specgram = torch.ones(1, 1000, 100)
+        specgram = torch.ones(1, 201, 100)
         melscale_transform = transforms.MelScale()
         melscale_transform(specgram)
 
-        melscale_transform_copy = transforms.MelScale(n_stft=1000)
+        melscale_transform_copy = transforms.MelScale()
         melscale_transform_copy.load_state_dict(melscale_transform.state_dict())
 
         fb = melscale_transform.fb
         fb_copy = melscale_transform_copy.fb
 
-        self.assertEqual(fb_copy.size(), (1000, 128))
+        self.assertEqual(fb_copy.size(), (201, 128))
         self.assertEqual(fb, fb_copy)
 
     def test_melspectrogram_load_save(self):

diff --git a/test/torchaudio_unittest/transforms/transforms_test_impl.py b/test/torchaudio_unittest/transforms/transforms_test_impl.py
@@ -1,5 +1,3 @@
-import warnings
-
 import torch
 import torchaudio.transforms as T
 
@@ -63,22 +61,6 @@ def test_InverseMelScale(self):
         assert _get_ratio(relative_diff < 1e-3) > 5e-3
         assert _get_ratio(relative_diff < 1e-5) > 1e-5
 
-    def test_melscale_unset_weight_warning(self):
-        """Issue a warning if MelScale initialized without a weight
-
-        As part of the deprecation of lazy intialization behavior (#1510),
-        issue a warning if `n_stft` is not set.
-        """
-        with warnings.catch_warnings(record=True) as caught_warnings:
-            warnings.simplefilter("always")
-            T.MelScale(n_mels=64, sample_rate=8000)
-        assert len(caught_warnings) == 1
-
-        with warnings.catch_warnings(record=True) as caught_warnings:
-            warnings.simplefilter("always")
-            T.MelScale(n_mels=64, sample_rate=8000, n_stft=201)
-        assert len(caught_warnings) == 0
-
     @nested_params(
         ["sinc_interpolation", "kaiser_window"],
         [16000, 44100],

diff --git a/torchaudio/transforms.py b/torchaudio/transforms.py
@@ -244,9 +244,8 @@ class MelScale(torch.nn.Module):
         sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
         f_min (float, optional): Minimum frequency. (Default: ``0.``)
         f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
-        n_stft (int, optional): Number of bins in STFT. Calculated from first input
-            if None is given.  See ``n_fft`` in :class:`Spectrogram`. (Default: ``None``)
-        norm (Optional[str]): If 'slaney', divide the triangular mel weights by the width of the mel band
+        n_stft (int, optional): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`. (Default: ``201``)
+        norm (str or None, optional): If 'slaney', divide the triangular mel weights by the width of the mel band
         (area normalization). (Default: ``None``)
         mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``)
     """
@@ -257,7 +256,7 @@ def __init__(self,
                  sample_rate: int = 16000,
                  f_min: float = 0.,
                  f_max: Optional[float] = None,
-                 n_stft: Optional[int] = None,
+                 n_stft: int = 201,
                  norm: Optional[str] = None,
                  mel_scale: str = "htk") -> None:
         super(MelScale, self).__init__()
@@ -269,35 +268,11 @@ def __init__(self,
         self.mel_scale = mel_scale
 
         assert f_min <= self.f_max, 'Require f_min: {} < f_max: {}'.format(f_min, self.f_max)
-
-        if n_stft is None or n_stft == 0:
-            warnings.warn(
-                'Initialization of torchaudio.transforms.MelScale with an unset weight '
-                '`n_stft=None` is deprecated and will be removed in release 0.10. '
-                'Please set a proper `n_stft` value. Typically this is `n_fft // 2 + 1`. '
-                'Refer to https://github.com/pytorch/audio/issues/1510 '
-                'for more details.'
-            )
-
-        fb = torch.empty(0) if n_stft is None else F.create_fb_matrix(
+        fb = F.create_fb_matrix(
             n_stft, self.f_min, self.f_max, self.n_mels, self.sample_rate, self.norm,
             self.mel_scale)
         self.register_buffer('fb', fb)
 
-    def __prepare_scriptable__(self):
-        r"""If `self.fb` is empty, the `forward` method will try to resize the parameter,
-        which does not work once the transform is scripted. However, this error does not happen
-        until the transform is executed. This is inconvenient especially if the resulting
-        TorchScript object is executed in other environments. Therefore, we check the
-        validity of `self.fb` here and fail if the resulting TS does not work.
-
-        Returns:
-            MelScale: self
-        """
-        if self.fb.numel() == 0:
-            raise ValueError("n_stft must be provided at construction")
-        return self
-
     def forward(self, specgram: Tensor) -> Tensor:
         r"""
         Args:
@@ -311,14 +286,6 @@ def forward(self, specgram: Tensor) -> Tensor:
         shape = specgram.size()
         specgram = specgram.reshape(-1, shape[-2], shape[-1])
 
-        if self.fb.numel() == 0:
-            tmp_fb = F.create_fb_matrix(specgram.size(1), self.f_min, self.f_max,
-                                        self.n_mels, self.sample_rate, self.norm,
-                                        self.mel_scale)
-            # Attributes cannot be reassigned outside __init__ so workaround
-            self.fb.resize_(tmp_fb.size())
-            self.fb.copy_(tmp_fb)
-
         # (channel, frequency, time).transpose(...) dot (frequency, n_mels)
         # -> (channel, time, n_mels).transpose(...)
         mel_specgram = torch.matmul(specgram.transpose(1, 2), self.fb).transpose(1, 2)