[formatter] Implement conversion to TextGrid (#4)

Introduce a class which houses functions to convert the JSON output of the transcriber and diarizer into formats that can be input into various alignment software. This commit introduces a conversion to TextGrids for use with MFA, though other formats may be introduced later.
Forced-Alignment-and-Vowel-Extraction · Feb 8, 2024 · 0e286e7 · 0e286e7
1 parent b958136
commit 0e286e7
Show file tree

Hide file tree

Showing 4 changed files with 204 additions and 0 deletions.
diff --git a/formatter.py b/formatter.py
@@ -0,0 +1,62 @@
+# This program is part of fave-asr
+# Copyright (C) 2024 Christian Brickhouse and FAVE Contributors
+#
+# fave-asr is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation as version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+import textgrid
+
+class Formatter():
+    def __init__(self):
+        pass
+
+    def to_TextGrid(self, diarized_transcription):
+        """
+        Convert a diarized transcription dictionary to a TextGrid
+
+        Args:
+            diarized_transcription: Output of pipeline.assign_speakers()
+
+        Returns:
+            A textgrid.TextGrid object populated with the diarized and
+            transcribed data. Tiers are by speaker and contain word-level
+            intervals not utterance-level.
+        """
+        minTime = diarized_transcription['segments'][0]['start']
+        maxTime = diarized_transcription['segments'][-1]['end']
+        tg = textgrid.TextGrid(minTime=minTime,maxTime=maxTime)
+
+        speakers = [x['speaker'] for x in diarized_transcription['segments']]
+        for speaker in set(speakers):
+            tg.append(textgrid.IntervalTier(name=speaker,minTime=minTime,maxTime=maxTime))
+        # Create a lookup table of tier indices based on the given speaker name
+        tier_key = dict((name,index) for index, name in enumerate([x.name for x in tg.tiers]))
+
+        for segment in diarized_transcription['segments']:
+            # There's no guarantee, weirdly, that a given word's assigned speaker
+            # is the same as the speaker assigned to the whole segment. Since
+            # the tiers are based on assigned /segment/ speakers, not assigned 
+            # word speakers, we need to look up the tier in the segment loop
+            # not in the word loop. See Issue #7
+            tier_index = tier_key[segment['speaker']]
+            tier = tg.tiers[tier_index]
+            minTime = segment['start']
+            maxTime = segment['end']
+            mark = segment['text']
+            tier.add(minTime,maxTime,mark)
+            # In testing, the word-level alignments are not very good. A future version
+            # might want to add an option for end users to enable the following loop.
+            #for word in segment['words']:
+            #    minTime = word['start']
+            #    maxTime = word['end']
+            #    mark = word['word']
+            #    tier.add(minTime,maxTime,mark)
+        return tg
diff --git a/requirements.txt b/requirements.txt
@@ -2,3 +2,4 @@ openai-whisper @ git+https://github.com/openai/whisper.git@b38a1f20f4b23f3f3099a
 whisperx @ git+https://github.com/m-bain/whisperx.git@49e0130e4e0c0d99d60715d76e65a71826a97109
 GPUtil
 psutil
+textgrid
diff --git a/tests/data/TestAudio_SnoopDogg_85SouthMedia.TextGrid b/tests/data/TestAudio_SnoopDogg_85SouthMedia.TextGrid
@@ -0,0 +1,114 @@
+File type = "ooTextFile"
+Object class = "TextGrid"
+
+xmin = 0.362
+xmax = 51.495
+tiers? <exists>
+size = 1
+item []:
+	item [1]:
+		class = "IntervalTier"
+		name = "SPEAKER_00"
+		xmin = 0.362
+		xmax = 51.495
+		intervals: size = 25
+			intervals [1]:
+				xmin = 0.362
+				xmax = 3.238
+				text = " So, you know the pimpin', fuck y'all."
+			intervals [2]:
+				xmin = 4.024
+				xmax = 5.214
+				text = " I'm finna go over to Def Jam"
+			intervals [3]:
+				xmin = 6.286
+				xmax = 7.24
+				text = " and learn a little bit of corporate work,"
+			intervals [4]:
+				xmin = 7.382
+				xmax = 8.618
+				text = " because I don't know corporate yet."
+			intervals [5]:
+				xmin = 8.923
+				xmax = 9.96
+				text = " I only need a few months."
+			intervals [6]:
+				xmin = 10.0
+				xmax = 11.14
+				text = " You give me a few months to run the shit,"
+			intervals [7]:
+				xmin = 11.181
+				xmax = 12.437
+				text = " I'm a fast learner."
+			intervals [8]:
+				xmin = 13.084
+				xmax = 14.859
+				text = " Go to Def Jam, get a job in a position,"
+			intervals [9]:
+				xmin = 15.324
+				xmax = 16.72
+				text = " drop a record, get Benny the Butcher signed,"
+			intervals [10]:
+				xmin = 17.145
+				xmax = 18.277
+				text = " get Hip Hop Harry signed,"
+			intervals [11]:
+				xmin = 19.046
+				xmax = 20.08
+				text = " learn a few tricks of the trade,"
+			intervals [12]:
+				xmin = 20.524
+				xmax = 22.08
+				text = " find out that the niggas that had it"
+			intervals [13]:
+				xmin = 22.465
+				xmax = 23.539
+				text = " that wanted me to hold for them,"
+			intervals [14]:
+				xmin = 24.003
+				xmax = 25.336
+				text = " then sold it to some other people."
+			intervals [15]:
+				xmin = 26.243
+				xmax = 28.28
+				text = " So now, one of my big wig buddies called me"
+			intervals [16]:
+				xmin = 28.72
+				xmax = 31.336
+				text = " and said, hey dog, I know the people that got Def Ro."
+			intervals [17]:
+				xmin = 32.1
+				xmax = 33.714
+				text = " And they don't know what to do with it."
+			intervals [18]:
+				xmin = 34.46
+				xmax = 36.26
+				text = " Let me holler at them, I know just what to do with it."
+			intervals [19]:
+				xmin = 36.28
+				xmax = 39.116
+				text = " So I hit them and like, let me work for y'all."
+			intervals [20]:
+				xmin = 40.8
+				xmax = 43.32
+				text = " The play was cool, but it's like, yeah, fuck that."
+			intervals [21]:
+				xmin = 43.36
+				xmax = 44.818
+				text = " How much to buy this shit?"
+			intervals [22]:
+				xmin = 45.0
+				xmax = 46.24
+				text = " What you talkin' about?"
+			intervals [23]:
+				xmin = 46.24
+				xmax = 47.371
+				text = " How much to buy Def Ro first?"
+			intervals [24]:
+				xmin = 48.24
+				xmax = 49.25
+				text = " How much for my masters?"
+			intervals [25]:
+				xmin = 50.3
+				xmax = 51.495
+				text = " How much for all of the masters?"
diff --git a/tests/test_formatter.py b/tests/test_formatter.py
@@ -0,0 +1,27 @@
+import json
+import numpy.testing as nptest
+import textgrid
+
+import formatter
+
+class TestFormatter():
+    Format = formatter.Formatter()
+
+    def test_to_TextGrid(self):
+        for input_fname, ex_fname in self.provide_to_TextGrid():
+            with open(input_fname) as f:
+                case = json.load(f)
+            observed = self.Format.to_TextGrid(case)
+
+            expected = textgrid.TextGrid()
+            expected.read(ex_fname)
+
+            nptest.assert_array_equal(observed,expected)
+
+    def provide_to_TextGrid(self):
+        return [
+                (
+                    'tests/data/TestAudio_SnoopDogg_85SouthMedia_segments.json',
+                    'tests/data/TestAudio_SnoopDogg_85SouthMedia.TextGrid'
+                ),
+            ]
-Original file line number
+Diff line change
@@ Expand Up @@
     whisperx @ git+https://github.com/m-bain/whisperx.git@49e0130e4e0c0d99d60715d76e65a71826a97109
     GPUtil
     psutil
+    textgrid