From 0e286e756328e8316936bab10a0f2fe85c1e5fb8 Mon Sep 17 00:00:00 2001 From: Christian Date: Wed, 7 Feb 2024 16:29:11 -0800 Subject: [PATCH] [formatter] Implement conversion to TextGrid (#4) Introduce a class which houses functions to convert the json output of the transcriber and diarizer into formats that can be input into various alignment software. This commit introduces a conversion to TextGrids for use with MFA, though other formats may be introduced later. --- formatter.py | 62 ++++++++++ requirements.txt | 1 + .../TestAudio_SnoopDogg_85SouthMedia.TextGrid | 114 ++++++++++++++++++ tests/test_formatter.py | 27 +++++ 4 files changed, 204 insertions(+) create mode 100644 formatter.py create mode 100644 tests/data/TestAudio_SnoopDogg_85SouthMedia.TextGrid create mode 100644 tests/test_formatter.py diff --git a/formatter.py b/formatter.py new file mode 100644 index 0000000..b35963b --- /dev/null +++ b/formatter.py @@ -0,0 +1,62 @@ +# This program is part of fave-asr +# Copyright (C) 2024 Christian Brickhouse and FAVE Contributors +# +# fave-asr is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation as version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +import textgrid + +class Formatter(): + def __init__(self): + pass + + def to_TextGrid(self, diarized_transcription): + """ + Convert a diarized transcription dictionary to a TextGrid + + Args: + diarized_transcription: Output of pipeline.assign_speakers() + + Returns: + A textgrid.TextGrid object populated with the diarized and + transcribed data. 
Tiers are by speaker and contain utterance-level + intervals, not word-level. + """ + minTime = diarized_transcription['segments'][0]['start'] + maxTime = diarized_transcription['segments'][-1]['end'] + tg = textgrid.TextGrid(minTime=minTime,maxTime=maxTime) + + speakers = [x['speaker'] for x in diarized_transcription['segments']] + for speaker in set(speakers): + tg.append(textgrid.IntervalTier(name=speaker,minTime=minTime,maxTime=maxTime)) + # Create a lookup table of tier indices based on the given speaker name + tier_key = dict((name,index) for index, name in enumerate([x.name for x in tg.tiers])) + + for segment in diarized_transcription['segments']: + # There's no guarantee, weirdly, that a given word's assigned speaker + # is the same as the speaker assigned to the whole segment. Since + # the tiers are based on assigned /segment/ speakers, not assigned + # word speakers, we need to look up the tier in the segment loop + # not in the word loop. See Issue #7 + tier_index = tier_key[segment['speaker']] + tier = tg.tiers[tier_index] + minTime = segment['start'] + maxTime = segment['end'] + mark = segment['text'] + tier.add(minTime,maxTime,mark) + # In testing, the word-level alignments are not very good. A future version + # might want to add an option for end users to enable the following loop. 
+ #for word in segment['words']: + # minTime = word['start'] + # maxTime = word['end'] + # mark = word['word'] + # tier.add(minTime,maxTime,mark) + return tg diff --git a/requirements.txt b/requirements.txt index ac89fe2..b18a8e2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ openai-whisper @ git+https://github.com/openai/whisper.git@b38a1f20f4b23f3f3099a whisperx @ git+https://github.com/m-bain/whisperx.git@49e0130e4e0c0d99d60715d76e65a71826a97109 GPUtil psutil +textgrid diff --git a/tests/data/TestAudio_SnoopDogg_85SouthMedia.TextGrid b/tests/data/TestAudio_SnoopDogg_85SouthMedia.TextGrid new file mode 100644 index 0000000..4f37994 --- /dev/null +++ b/tests/data/TestAudio_SnoopDogg_85SouthMedia.TextGrid @@ -0,0 +1,114 @@ +File type = "ooTextFile" +Object class = "TextGrid" + +xmin = 0.362 +xmax = 51.495 +tiers? +size = 1 +item []: + item [1]: + class = "IntervalTier" + name = "SPEAKER_00" + xmin = 0.362 + xmax = 51.495 + intervals: size = 25 + intervals [1]: + xmin = 0.362 + xmax = 3.238 + text = " So, you know the pimpin', fuck y'all." + intervals [2]: + xmin = 4.024 + xmax = 5.214 + text = " I'm finna go over to Def Jam" + intervals [3]: + xmin = 6.286 + xmax = 7.24 + text = " and learn a little bit of corporate work," + intervals [4]: + xmin = 7.382 + xmax = 8.618 + text = " because I don't know corporate yet." + intervals [5]: + xmin = 8.923 + xmax = 9.96 + text = " I only need a few months." + intervals [6]: + xmin = 10.0 + xmax = 11.14 + text = " You give me a few months to run the shit," + intervals [7]: + xmin = 11.181 + xmax = 12.437 + text = " I'm a fast learner." 
+ intervals [8]: + xmin = 13.084 + xmax = 14.859 + text = " Go to Def Jam, get a job in a position," + intervals [9]: + xmin = 15.324 + xmax = 16.72 + text = " drop a record, get Benny the Butcher signed," + intervals [10]: + xmin = 17.145 + xmax = 18.277 + text = " get Hip Hop Harry signed," + intervals [11]: + xmin = 19.046 + xmax = 20.08 + text = " learn a few tricks of the trade," + intervals [12]: + xmin = 20.524 + xmax = 22.08 + text = " find out that the niggas that had it" + intervals [13]: + xmin = 22.465 + xmax = 23.539 + text = " that wanted me to hold for them," + intervals [14]: + xmin = 24.003 + xmax = 25.336 + text = " then sold it to some other people." + intervals [15]: + xmin = 26.243 + xmax = 28.28 + text = " So now, one of my big wig buddies called me" + intervals [16]: + xmin = 28.72 + xmax = 31.336 + text = " and said, hey dog, I know the people that got Def Ro." + intervals [17]: + xmin = 32.1 + xmax = 33.714 + text = " And they don't know what to do with it." + intervals [18]: + xmin = 34.46 + xmax = 36.26 + text = " Let me holler at them, I know just what to do with it." + intervals [19]: + xmin = 36.28 + xmax = 39.116 + text = " So I hit them and like, let me work for y'all." + intervals [20]: + xmin = 40.8 + xmax = 43.32 + text = " The play was cool, but it's like, yeah, fuck that." + intervals [21]: + xmin = 43.36 + xmax = 44.818 + text = " How much to buy this shit?" + intervals [22]: + xmin = 45.0 + xmax = 46.24 + text = " What you talkin' about?" + intervals [23]: + xmin = 46.24 + xmax = 47.371 + text = " How much to buy Def Ro first?" + intervals [24]: + xmin = 48.24 + xmax = 49.25 + text = " How much for my masters?" + intervals [25]: + xmin = 50.3 + xmax = 51.495 + text = " How much for all of the masters?" 
diff --git a/tests/test_formatter.py b/tests/test_formatter.py new file mode 100644 index 0000000..afaeae4 --- /dev/null +++ b/tests/test_formatter.py @@ -0,0 +1,27 @@ +import json +import numpy.testing as nptest +import textgrid + +import formatter + +class TestFormatter(): + Format = formatter.Formatter() + + def test_to_TextGrid(self): + for input_fname, ex_fname in self.provide_to_TextGrid(): + with open(input_fname) as f: + case = json.load(f) + observed = self.Format.to_TextGrid(case) + + expected = textgrid.TextGrid() + expected.read(ex_fname) + + nptest.assert_array_equal(observed,expected) + + def provide_to_TextGrid(self): + return [ + ( + 'tests/data/TestAudio_SnoopDogg_85SouthMedia_segments.json', + 'tests/data/TestAudio_SnoopDogg_85SouthMedia.TextGrid' + ), + ]