From 2b9efafe71b721d1ecf0c8b34c3bf28407265eb0 Mon Sep 17 00:00:00 2001
From: Christian Brickhouse <chrisbrickhouse@users.noreply.github.com>
Date: Tue, 19 Mar 2024 17:09:42 -0700
Subject: [PATCH] [formatter] Implement conversion to TextGrid (#4) (#8)

* [formatter] Implement conversion to TextGrid (#4)

Introduce a class which houses functions to convert the json output of
the transcriber and diarizer into formats that can be input into various
alignment software. This commit introduces a conversion to TextGrids
for use with MFA though other formats may be introduced later.

* [tests] Only test transcript equivalence (#5)

Tests of alignment and speaker assignment no longer test for equivalence
of start and end times or assignment weights because different machines
have slightly different results leading to failures. Instead, only the
transcriptions themselves are asserted to be equivalent.

Fixes #5

* [formatter] Implement conversion to TextGrid (#4)

Introduce a class which houses functions to convert the json output of
the transcriber and diarizer into formats that can be input into various
alignment software. This commit introduces a conversion to TextGrids
for use with MFA though other formats may be introduced later.

* Avoid gaps between phrases

Lots of workarounds to handle the gaps between phrases in the
timestamps, but after looking at them, they should probably just be
included in the preceding segment because they tend to contain a fair
amount of speech.

* Revert "Avoid gaps between phrases"

This reverts commit 54c0c59c20c53254f1e550bdbebb518d6c143b02.
---
 formatter.py                                  |  62 ++++++++++
 requirements.txt                              |   1 +
 .../TestAudio_SnoopDogg_85SouthMedia.TextGrid | 114 ++++++++++++++++++
 tests/test_formatter.py                       |  27 +++++
 4 files changed, 204 insertions(+)
 create mode 100644 formatter.py
 create mode 100644 tests/data/TestAudio_SnoopDogg_85SouthMedia.TextGrid
 create mode 100644 tests/test_formatter.py

diff --git a/formatter.py b/formatter.py
new file mode 100644
index 0000000..b35963b
--- /dev/null
+++ b/formatter.py
@@ -0,0 +1,62 @@
+# This program is part of fave-asr
+# Copyright (C) 2024 Christian Brickhouse and FAVE Contributors
+#
+# fave-asr is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation as version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+import textgrid
+
+class Formatter():
+    """Convert transcriber/diarizer output into formats used by aligners."""
+
+    def __init__(self):
+        pass
+
+    def to_TextGrid(self, diarized_transcription):
+        """
+        Convert a diarized transcription dictionary to a TextGrid
+
+        Args:
+            diarized_transcription: Output of pipeline.assign_speakers(); a
+                dict whose 'segments' list holds dicts with 'start', 'end',
+                'text' and 'speaker' keys.
+
+        Returns:
+            A textgrid.TextGrid with one IntervalTier per speaker, in order
+            of first appearance. Intervals are utterance-level (one per
+            segment). An empty TextGrid is returned if there are no segments.
+        """
+        segments = diarized_transcription['segments']
+        if not segments:
+            return textgrid.TextGrid()
+        # Span every segment rather than trusting the first/last entries, so
+        # an out-of-order or late-ending segment still falls inside its tier.
+        minTime = min(segment['start'] for segment in segments)
+        maxTime = max(segment['end'] for segment in segments)
+        tg = textgrid.TextGrid(minTime=minTime, maxTime=maxTime)
+
+        # Tiers are keyed by the /segment/ speaker, not word speakers, which
+        # are not guaranteed to match (see Issue #7). dict.fromkeys keeps
+        # first-appearance order so tier order is stable between runs.
+        tiers = {}
+        for speaker in dict.fromkeys(s['speaker'] for s in segments):
+            tiers[speaker] = textgrid.IntervalTier(
+                name=speaker, minTime=minTime, maxTime=maxTime)
+            tg.append(tiers[speaker])
+
+        # One interval per segment. In testing, the word-level alignments
+        # were not very good; a future version might add an option to emit
+        # an interval per segment['words'] entry ('start', 'end', 'word').
+        for segment in segments:
+            tier = tiers[segment['speaker']]
+            tier.add(segment['start'], segment['end'], segment['text'])
+
+        return tg
diff --git a/requirements.txt b/requirements.txt
index ac89fe2..b18a8e2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,4 @@ openai-whisper @ git+https://github.com/openai/whisper.git@b38a1f20f4b23f3f3099a
 whisperx @ git+https://github.com/m-bain/whisperx.git@49e0130e4e0c0d99d60715d76e65a71826a97109
 GPUtil
 psutil
+textgrid
diff --git a/tests/data/TestAudio_SnoopDogg_85SouthMedia.TextGrid b/tests/data/TestAudio_SnoopDogg_85SouthMedia.TextGrid
new file mode 100644
index 0000000..4f37994
--- /dev/null
+++ b/tests/data/TestAudio_SnoopDogg_85SouthMedia.TextGrid
@@ -0,0 +1,114 @@
+File type = "ooTextFile"
+Object class = "TextGrid"
+
+xmin = 0.362
+xmax = 51.495
+tiers? <exists>
+size = 1
+item []:
+	item [1]:
+		class = "IntervalTier"
+		name = "SPEAKER_00"
+		xmin = 0.362
+		xmax = 51.495
+		intervals: size = 25
+			intervals [1]:
+				xmin = 0.362
+				xmax = 3.238
+				text = " So, you know the pimpin', fuck y'all."
+			intervals [2]:
+				xmin = 4.024
+				xmax = 5.214
+				text = " I'm finna go over to Def Jam"
+			intervals [3]:
+				xmin = 6.286
+				xmax = 7.24
+				text = " and learn a little bit of corporate work,"
+			intervals [4]:
+				xmin = 7.382
+				xmax = 8.618
+				text = " because I don't know corporate yet."
+			intervals [5]:
+				xmin = 8.923
+				xmax = 9.96
+				text = " I only need a few months."
+			intervals [6]:
+				xmin = 10.0
+				xmax = 11.14
+				text = " You give me a few months to run the shit,"
+			intervals [7]:
+				xmin = 11.181
+				xmax = 12.437
+				text = " I'm a fast learner."
+			intervals [8]:
+				xmin = 13.084
+				xmax = 14.859
+				text = " Go to Def Jam, get a job in a position,"
+			intervals [9]:
+				xmin = 15.324
+				xmax = 16.72
+				text = " drop a record, get Benny the Butcher signed,"
+			intervals [10]:
+				xmin = 17.145
+				xmax = 18.277
+				text = " get Hip Hop Harry signed,"
+			intervals [11]:
+				xmin = 19.046
+				xmax = 20.08
+				text = " learn a few tricks of the trade,"
+			intervals [12]:
+				xmin = 20.524
+				xmax = 22.08
+				text = " find out that the niggas that had it"
+			intervals [13]:
+				xmin = 22.465
+				xmax = 23.539
+				text = " that wanted me to hold for them,"
+			intervals [14]:
+				xmin = 24.003
+				xmax = 25.336
+				text = " then sold it to some other people."
+			intervals [15]:
+				xmin = 26.243
+				xmax = 28.28
+				text = " So now, one of my big wig buddies called me"
+			intervals [16]:
+				xmin = 28.72
+				xmax = 31.336
+				text = " and said, hey dog, I know the people that got Def Ro."
+			intervals [17]:
+				xmin = 32.1
+				xmax = 33.714
+				text = " And they don't know what to do with it."
+			intervals [18]:
+				xmin = 34.46
+				xmax = 36.26
+				text = " Let me holler at them, I know just what to do with it."
+			intervals [19]:
+				xmin = 36.28
+				xmax = 39.116
+				text = " So I hit them and like, let me work for y'all."
+			intervals [20]:
+				xmin = 40.8
+				xmax = 43.32
+				text = " The play was cool, but it's like, yeah, fuck that."
+			intervals [21]:
+				xmin = 43.36
+				xmax = 44.818
+				text = " How much to buy this shit?"
+			intervals [22]:
+				xmin = 45.0
+				xmax = 46.24
+				text = " What you talkin' about?"
+			intervals [23]:
+				xmin = 46.24
+				xmax = 47.371
+				text = " How much to buy Def Ro first?"
+			intervals [24]:
+				xmin = 48.24
+				xmax = 49.25
+				text = " How much for my masters?"
+			intervals [25]:
+				xmin = 50.3
+				xmax = 51.495
+				text = " How much for all of the masters?"
diff --git a/tests/test_formatter.py b/tests/test_formatter.py
new file mode 100644
index 0000000..afaeae4
--- /dev/null
+++ b/tests/test_formatter.py
@@ -0,0 +1,27 @@
+import json
+import numpy.testing as nptest
+import textgrid
+
+import formatter
+
+class TestFormatter():
+    """Check Formatter output against reference files in tests/data."""
+    Format = formatter.Formatter()
+
+    def test_to_TextGrid(self):
+        """to_TextGrid reproduces each reference TextGrid for its input."""
+        for input_fname, ex_fname in self.provide_to_TextGrid():
+            with open(input_fname, encoding='utf-8') as f:
+                observed = self.Format.to_TextGrid(json.load(f))
+            expected = textgrid.TextGrid()
+            expected.read(ex_fname)
+            nptest.assert_array_equal(observed, expected)
+            # numpy's coercion may not compare names/text; pin them here.
+            for obs, exp in zip(observed.tiers, expected.tiers):
+                assert obs.name == exp.name
+                assert [i.mark for i in obs.intervals] == [i.mark for i in exp.intervals]
+
+    def provide_to_TextGrid(self):
+        """Return (input JSON path, expected TextGrid path) pairs."""
+        return [('tests/data/TestAudio_SnoopDogg_85SouthMedia_segments.json',
+                 'tests/data/TestAudio_SnoopDogg_85SouthMedia.TextGrid')]