-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[formatter] Implement conversion to TextGrid (#4)
Introduce a class which houses functions to convert the JSON output of the transcriber and diarizer into formats that can be input into various alignment software. This commit introduces a conversion to TextGrids for use with MFA, though other formats may be introduced later.
- Loading branch information
1 parent
b958136
commit 0e286e7
Showing
4 changed files
with
204 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
# This program is part of fave-asr | ||
# Copyright (C) 2024 Christian Brickhouse and FAVE Contributors | ||
# | ||
# fave-asr is free software: you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License as published by | ||
# the Free Software Foundation as version 3 of the License. | ||
# | ||
# This program is distributed in the hope that it will be useful, | ||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
# GNU General Public License for more details. | ||
# | ||
# You should have received a copy of the GNU General Public License | ||
# along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
import textgrid | ||
|
||
class Formatter():
    """
    Convert transcriber/diarizer output into formats usable by alignment
    software. Currently only conversion to a TextGrid is implemented.
    """

    def __init__(self):
        pass

    def to_TextGrid(self, diarized_transcription):
        """
        Convert a diarized transcription dictionary to a TextGrid

        Args:
            diarized_transcription: Output of pipeline.assign_speakers().
                Must contain a 'segments' list whose items each have
                'start', 'end', 'text' and 'speaker' keys.

        Returns:
            A textgrid.TextGrid object populated with the diarized and
            transcribed data. There is one IntervalTier per speaker, added
            in order of each speaker's first appearance. Intervals are
            segment-level (one per transcribed segment), not word-level.

        Raises:
            IndexError: If the transcription contains no segments.
            KeyError: If a segment lacks one of the required keys.
        """
        segments = diarized_transcription['segments']
        if not segments:
            # Indexing the first segment of an empty list used to raise a
            # bare IndexError; keep the exception type but explain the cause.
            raise IndexError("diarized_transcription contains no segments")

        # Use the true extremes rather than the first/last segment so the
        # grid bounds stay correct if segments are unsorted or overlap.
        minTime = min(segment['start'] for segment in segments)
        maxTime = max(segment['end'] for segment in segments)
        tg = textgrid.TextGrid(minTime=minTime, maxTime=maxTime)

        # dict.fromkeys de-duplicates while keeping first-appearance order,
        # so tier order is reproducible (iterating a set of str is not).
        tiers = {}
        for speaker in dict.fromkeys(segment['speaker'] for segment in segments):
            tier = textgrid.IntervalTier(
                name=speaker, minTime=minTime, maxTime=maxTime
            )
            tg.append(tier)
            tiers[speaker] = tier

        for segment in segments:
            # There's no guarantee, weirdly, that a given word's assigned speaker
            # is the same as the speaker assigned to the whole segment. Since
            # the tiers are based on assigned /segment/ speakers, not assigned
            # word speakers, we need to look up the tier in the segment loop
            # not in the word loop. See Issue #7
            tier = tiers[segment['speaker']]
            tier.add(segment['start'], segment['end'], segment['text'])
            # In testing, the word-level alignments are not very good. A future version
            # might want to add an option for end users to enable the following loop.
            #for word in segment['words']:
            #    minTime = word['start']
            #    maxTime = word['end']
            #    mark = word['word']
            #    tier.add(minTime,maxTime,mark)
        return tg
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
File type = "ooTextFile" | ||
Object class = "TextGrid" | ||
|
||
xmin = 0.362 | ||
xmax = 51.495 | ||
tiers? <exists> | ||
size = 1 | ||
item []: | ||
item [1]: | ||
class = "IntervalTier" | ||
name = "SPEAKER_00" | ||
xmin = 0.362 | ||
xmax = 51.495 | ||
intervals: size = 25 | ||
intervals [1]: | ||
xmin = 0.362 | ||
xmax = 3.238 | ||
text = " So, you know the pimpin', fuck y'all." | ||
intervals [2]: | ||
xmin = 4.024 | ||
xmax = 5.214 | ||
text = " I'm finna go over to Def Jam" | ||
intervals [3]: | ||
xmin = 6.286 | ||
xmax = 7.24 | ||
text = " and learn a little bit of corporate work," | ||
intervals [4]: | ||
xmin = 7.382 | ||
xmax = 8.618 | ||
text = " because I don't know corporate yet." | ||
intervals [5]: | ||
xmin = 8.923 | ||
xmax = 9.96 | ||
text = " I only need a few months." | ||
intervals [6]: | ||
xmin = 10.0 | ||
xmax = 11.14 | ||
text = " You give me a few months to run the shit," | ||
intervals [7]: | ||
xmin = 11.181 | ||
xmax = 12.437 | ||
text = " I'm a fast learner." | ||
intervals [8]: | ||
xmin = 13.084 | ||
xmax = 14.859 | ||
text = " Go to Def Jam, get a job in a position," | ||
intervals [9]: | ||
xmin = 15.324 | ||
xmax = 16.72 | ||
text = " drop a record, get Benny the Butcher signed," | ||
intervals [10]: | ||
xmin = 17.145 | ||
xmax = 18.277 | ||
text = " get Hip Hop Harry signed," | ||
intervals [11]: | ||
xmin = 19.046 | ||
xmax = 20.08 | ||
text = " learn a few tricks of the trade," | ||
intervals [12]: | ||
xmin = 20.524 | ||
xmax = 22.08 | ||
text = " find out that the niggas that had it" | ||
intervals [13]: | ||
xmin = 22.465 | ||
xmax = 23.539 | ||
text = " that wanted me to hold for them," | ||
intervals [14]: | ||
xmin = 24.003 | ||
xmax = 25.336 | ||
text = " then sold it to some other people." | ||
intervals [15]: | ||
xmin = 26.243 | ||
xmax = 28.28 | ||
text = " So now, one of my big wig buddies called me" | ||
intervals [16]: | ||
xmin = 28.72 | ||
xmax = 31.336 | ||
text = " and said, hey dog, I know the people that got Def Ro." | ||
intervals [17]: | ||
xmin = 32.1 | ||
xmax = 33.714 | ||
text = " And they don't know what to do with it." | ||
intervals [18]: | ||
xmin = 34.46 | ||
xmax = 36.26 | ||
text = " Let me holler at them, I know just what to do with it." | ||
intervals [19]: | ||
xmin = 36.28 | ||
xmax = 39.116 | ||
text = " So I hit them and like, let me work for y'all." | ||
intervals [20]: | ||
xmin = 40.8 | ||
xmax = 43.32 | ||
text = " The play was cool, but it's like, yeah, fuck that." | ||
intervals [21]: | ||
xmin = 43.36 | ||
xmax = 44.818 | ||
text = " How much to buy this shit?" | ||
intervals [22]: | ||
xmin = 45.0 | ||
xmax = 46.24 | ||
text = " What you talkin' about?" | ||
intervals [23]: | ||
xmin = 46.24 | ||
xmax = 47.371 | ||
text = " How much to buy Def Ro first?" | ||
intervals [24]: | ||
xmin = 48.24 | ||
xmax = 49.25 | ||
text = " How much for my masters?" | ||
intervals [25]: | ||
xmin = 50.3 | ||
xmax = 51.495 | ||
text = " How much for all of the masters?" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
import json | ||
import numpy.testing as nptest | ||
import textgrid | ||
|
||
import formatter | ||
|
||
class TestFormatter():
    """Tests for formatter.Formatter."""

    # Shared Formatter instance used by every test case.
    Format = formatter.Formatter()

    def test_to_TextGrid(self):
        """Check to_TextGrid output against each reference TextGrid file."""
        for json_path, grid_path in self.provide_to_TextGrid():
            with open(json_path) as handle:
                transcription = json.load(handle)
            observed = self.Format.to_TextGrid(transcription)

            # Load the reference grid from disk for comparison.
            expected = textgrid.TextGrid()
            expected.read(grid_path)

            nptest.assert_array_equal(observed, expected)

    def provide_to_TextGrid(self):
        """Return (input JSON path, expected TextGrid path) pairs."""
        cases = []
        cases.append(
            (
                'tests/data/TestAudio_SnoopDogg_85SouthMedia_segments.json',
                'tests/data/TestAudio_SnoopDogg_85SouthMedia.TextGrid',
            )
        )
        return cases