[formatter] Implement conversion to TextGrid (#4) (#8)

* [formatter] Implement conversion to TextGrid (#4) Introduce a class which houses functions to convert the JSON output of the transcriber and diarizer into formats that can be input into various alignment software. This commit introduces a conversion to TextGrids for use with MFA, though other formats may be introduced later. * [tests] Only test transcript equivalence (#5) Tests of alignment and speaker assignment no longer test for equivalence of start and end times or assignment weights because different machines have slightly different results, leading to failures. Instead, only the transcriptions themselves are asserted to be equivalent. Fixes #5 * [formatter] Implement conversion to TextGrid (#4) Introduce a class which houses functions to convert the JSON output of the transcriber and diarizer into formats that can be input into various alignment software. This commit introduces a conversion to TextGrids for use with MFA, though other formats may be introduced later. * Avoid gaps between phrases Lots of workarounds to handle the gaps between phrases in the timestamps, but after looking at them, they should probably just be included in the preceding segment because they tend to contain a fair amount of speech. * Revert "Avoid gaps between phrases" This reverts commit 54c0c59.
Forced-Alignment-and-Vowel-Extraction · Mar 20, 2024 · 2b9efaf · 2b9efaf
1 parent 13dc6ca
commit 2b9efaf
Show file tree

Hide file tree

Showing 4 changed files with 204 additions and 0 deletions.
diff --git a/formatter.py b/formatter.py
@@ -0,0 +1,62 @@
+# This program is part of fave-asr
+# Copyright (C) 2024 Christian Brickhouse and FAVE Contributors
+#
+# fave-asr is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation as version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+import textgrid
+
+class Formatter():
+    """
+    Convert transcriber and diarizer output into formats that can be used
+    as input to alignment software.
+    """
+
+    def __init__(self):
+        # The formatter keeps no state; each conversion receives everything
+        # it needs as arguments.
+        pass
+
+    def to_TextGrid(self, diarized_transcription):
+        """
+        Convert a diarized transcription dictionary to a TextGrid
+
+        Args:
+            diarized_transcription: Output of pipeline.assign_speakers().
+                Its 'segments' list is read, and every segment must
+                provide 'start', 'end', 'text' and 'speaker'.
+
+        Returns:
+            A textgrid.TextGrid object populated with the diarized and
+            transcribed data. There is one tier per speaker, created in
+            order of first appearance, and each tier contains
+            utterance-level (segment) intervals, not word-level ones.
+
+        Raises:
+            IndexError: If the transcription contains no segments.
+        """
+        segments = diarized_transcription['segments']
+        if not segments:
+            raise IndexError('diarized_transcription contains no segments')
+
+        # Take the bounds from every segment instead of trusting the first
+        # and last list items, so the grid spans all of the intervals even
+        # if the segments are not in chronological order.
+        minTime = min(segment['start'] for segment in segments)
+        maxTime = max(segment['end'] for segment in segments)
+        tg = textgrid.TextGrid(minTime=minTime,maxTime=maxTime)
+
+        # Lookup table of tiers by speaker name. Tiers are created the first
+        # time a speaker is seen, which keeps the tier order identical from
+        # run to run (iterating over a set of names does not).
+        tiers = {}
+        for segment in segments:
+            # There's no guarantee, weirdly, that a given word's assigned speaker
+            # is the same as the speaker assigned to the whole segment. Since
+            # the tiers are based on assigned /segment/ speakers, not assigned
+            # word speakers, we need to look up the tier in the segment loop
+            # not in the word loop. See Issue #7
+            speaker = segment['speaker']
+            tier = tiers.get(speaker)
+            if tier is None:
+                tier = textgrid.IntervalTier(name=speaker,minTime=minTime,maxTime=maxTime)
+                tg.append(tier)
+                tiers[speaker] = tier
+            tier.add(segment['start'],segment['end'],segment['text'])
+            # In testing, the word-level alignments are not very good. A future version
+            # might want to add an option for end users to enable the following loop.
+            #for word in segment['words']:
+            #    minTime = word['start']
+            #    maxTime = word['end']
+            #    mark = word['word']
+            #    tier.add(minTime,maxTime,mark)
+        return tg
diff --git a/requirements.txt b/requirements.txt
@@ -2,3 +2,4 @@ openai-whisper @ git+https://github.com/openai/whisper.git@b38a1f20f4b23f3f3099a
 whisperx @ git+https://github.com/m-bain/whisperx.git@49e0130e4e0c0d99d60715d76e65a71826a97109
 GPUtil
 psutil
+textgrid
diff --git a/tests/data/TestAudio_SnoopDogg_85SouthMedia.TextGrid b/tests/data/TestAudio_SnoopDogg_85SouthMedia.TextGrid
@@ -0,0 +1,114 @@
+File type = "ooTextFile"
+Object class = "TextGrid"
+
+xmin = 0.362
+xmax = 51.495
+tiers? <exists>
+size = 1
+item []:
+	item [1]:
+		class = "IntervalTier"
+		name = "SPEAKER_00"
+		xmin = 0.362
+		xmax = 51.495
+		intervals: size = 25
+			intervals [1]:
+				xmin = 0.362
+				xmax = 3.238
+				text = " So, you know the pimpin', fuck y'all."
+			intervals [2]:
+				xmin = 4.024
+				xmax = 5.214
+				text = " I'm finna go over to Def Jam"
+			intervals [3]:
+				xmin = 6.286
+				xmax = 7.24
+				text = " and learn a little bit of corporate work,"
+			intervals [4]:
+				xmin = 7.382
+				xmax = 8.618
+				text = " because I don't know corporate yet."
+			intervals [5]:
+				xmin = 8.923
+				xmax = 9.96
+				text = " I only need a few months."
+			intervals [6]:
+				xmin = 10.0
+				xmax = 11.14
+				text = " You give me a few months to run the shit,"
+			intervals [7]:
+				xmin = 11.181
+				xmax = 12.437
+				text = " I'm a fast learner."
+			intervals [8]:
+				xmin = 13.084
+				xmax = 14.859
+				text = " Go to Def Jam, get a job in a position,"
+			intervals [9]:
+				xmin = 15.324
+				xmax = 16.72
+				text = " drop a record, get Benny the Butcher signed,"
+			intervals [10]:
+				xmin = 17.145
+				xmax = 18.277
+				text = " get Hip Hop Harry signed,"
+			intervals [11]:
+				xmin = 19.046
+				xmax = 20.08
+				text = " learn a few tricks of the trade,"
+			intervals [12]:
+				xmin = 20.524
+				xmax = 22.08
+				text = " find out that the niggas that had it"
+			intervals [13]:
+				xmin = 22.465
+				xmax = 23.539
+				text = " that wanted me to hold for them,"
+			intervals [14]:
+				xmin = 24.003
+				xmax = 25.336
+				text = " then sold it to some other people."
+			intervals [15]:
+				xmin = 26.243
+				xmax = 28.28
+				text = " So now, one of my big wig buddies called me"
+			intervals [16]:
+				xmin = 28.72
+				xmax = 31.336
+				text = " and said, hey dog, I know the people that got Def Ro."
+			intervals [17]:
+				xmin = 32.1
+				xmax = 33.714
+				text = " And they don't know what to do with it."
+			intervals [18]:
+				xmin = 34.46
+				xmax = 36.26
+				text = " Let me holler at them, I know just what to do with it."
+			intervals [19]:
+				xmin = 36.28
+				xmax = 39.116
+				text = " So I hit them and like, let me work for y'all."
+			intervals [20]:
+				xmin = 40.8
+				xmax = 43.32
+				text = " The play was cool, but it's like, yeah, fuck that."
+			intervals [21]:
+				xmin = 43.36
+				xmax = 44.818
+				text = " How much to buy this shit?"
+			intervals [22]:
+				xmin = 45.0
+				xmax = 46.24
+				text = " What you talkin' about?"
+			intervals [23]:
+				xmin = 46.24
+				xmax = 47.371
+				text = " How much to buy Def Ro first?"
+			intervals [24]:
+				xmin = 48.24
+				xmax = 49.25
+				text = " How much for my masters?"
+			intervals [25]:
+				xmin = 50.3
+				xmax = 51.495
+				text = " How much for all of the masters?"
diff --git a/tests/test_formatter.py b/tests/test_formatter.py
@@ -0,0 +1,27 @@
+import json
+import numpy.testing as nptest
+import textgrid
+
+import formatter
+
+class TestFormatter():
+    """Tests for formatter.Formatter"""
+    Format = formatter.Formatter()
+
+    def test_to_TextGrid(self):
+        """
+        Check that to_TextGrid() reproduces the reference TextGrid for
+        every case given by provide_to_TextGrid().
+        """
+        for input_fname, ex_fname in self.provide_to_TextGrid():
+            with open(input_fname, encoding='utf-8') as f:
+                case = json.load(f)
+            observed = self.Format.to_TextGrid(case)
+
+            expected = textgrid.TextGrid()
+            expected.read(ex_fname)
+
+            # Compare the grids field by field instead of handing the
+            # TextGrid objects to numpy, so it is explicit what is checked
+            # (tier names, interval text and interval times) and a failure
+            # points at the field that differs. Tiers are matched by name
+            # so the test does not depend on tier order.
+            observed_tiers = dict((tier.name, tier) for tier in observed.tiers)
+            expected_tiers = dict((tier.name, tier) for tier in expected.tiers)
+            assert sorted(observed_tiers) == sorted(expected_tiers)
+
+            for name, ex_tier in expected_tiers.items():
+                ob_intervals = observed_tiers[name].intervals
+                ex_intervals = ex_tier.intervals
+                assert [x.mark for x in ob_intervals] == [x.mark for x in ex_intervals]
+                # Times are floats that went through a text file, so allow
+                # for rounding rather than requiring bit-for-bit equality.
+                nptest.assert_allclose(
+                        [x.minTime for x in ob_intervals],
+                        [x.minTime for x in ex_intervals]
+                    )
+                nptest.assert_allclose(
+                        [x.maxTime for x in ob_intervals],
+                        [x.maxTime for x in ex_intervals]
+                    )
+
+    def provide_to_TextGrid(self):
+        """
+        Provide the test cases for test_to_TextGrid()
+
+        Returns:
+            A list of (input JSON path, expected TextGrid path) tuples. The
+            paths are relative to the repository root.
+        """
+        return [
+                (
+                    'tests/data/TestAudio_SnoopDogg_85SouthMedia_segments.json',
+                    'tests/data/TestAudio_SnoopDogg_85SouthMedia.TextGrid'
+                ),
+            ]
-Original file line number
+Diff line change
@@ Expand Up @@
     whisperx @ git+https://github.com/m-bain/whisperx.git@49e0130e4e0c0d99d60715d76e65a71826a97109
     GPUtil
     psutil
+    textgrid