-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
* [formatter] Implement conversion to TextGrid (#4) Introduce a class which houses functions to convert the JSON output of the transcriber and diarizer into formats that can be input into various alignment software. This commit introduces a conversion to TextGrids for use with MFA, though other formats may be introduced later. * [tests] Only test transcript equivalence (#5) Tests of alignment and speaker assignment no longer test for equivalence of start and end times or assignment weights because different machines have slightly different results leading to failures. Instead, only the transcriptions themselves are asserted to be equivalent. Fixes #5 * [formatter] Implement conversion to TextGrid (#4) Introduce a class which houses functions to convert the JSON output of the transcriber and diarizer into formats that can be input into various alignment software. This commit introduces a conversion to TextGrids for use with MFA, though other formats may be introduced later. * Avoid gaps between phrases Lots of workarounds to handle the gaps between phrases in the timestamps, but after looking at them, they should probably just be included in the preceding segment because they tend to contain a fair amount of speech. * Revert "Avoid gaps between phrases" This reverts commit 54c0c59.
- Loading branch information
1 parent
13dc6ca
commit 2b9efaf
Showing
4 changed files
with
204 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
# This program is part of fave-asr | ||
# Copyright (C) 2024 Christian Brickhouse and FAVE Contributors | ||
# | ||
# fave-asr is free software: you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License as published by | ||
# the Free Software Foundation as version 3 of the License. | ||
# | ||
# This program is distributed in the hope that it will be useful, | ||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
# GNU General Public License for more details. | ||
# | ||
# You should have received a copy of the GNU General Public License | ||
# along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
import textgrid | ||
|
||
class Formatter():
    """
    Convert transcriber/diarizer output into formats usable by alignment
    software. Currently only conversion to TextGrid is provided.
    """

    def __init__(self):
        pass

    def to_TextGrid(self, diarized_transcription):
        """
        Convert a diarized transcription dictionary to a TextGrid

        Args:
            diarized_transcription: Output of pipeline.assign_speakers().
                Must contain a non-empty 'segments' list whose items each
                provide 'start', 'end', 'text' and 'speaker'.
        Returns:
            A textgrid.TextGrid object populated with the diarized and
            transcribed data. There is one tier per segment-level speaker,
            created in the order speakers first appear, and each interval
            holds the text of a whole segment (utterance-level, not
            word-level).
        Raises:
            IndexError: If the 'segments' list is empty.
            KeyError: If 'segments', or a required key of a segment, is
                missing.
        """
        segments = diarized_transcription['segments']
        # The grid spans from the start of the first segment to the end of
        # the last one.
        minTime = segments[0]['start']
        maxTime = segments[-1]['end']
        tg = textgrid.TextGrid(minTime=minTime, maxTime=maxTime)

        # De-duplicate speakers while keeping first-appearance order.
        # Iterating a set of strings can yield a different order from one
        # process to the next (string hash randomization), which would make
        # the tier order of the output non-deterministic.
        speakers = dict.fromkeys(segment['speaker'] for segment in segments)
        for speaker in speakers:
            tg.append(
                textgrid.IntervalTier(
                    name=speaker, minTime=minTime, maxTime=maxTime
                )
            )
        # Create a lookup table of tier indices based on the given speaker name
        tier_key = {tier.name: index for index, tier in enumerate(tg.tiers)}

        for segment in segments:
            # There's no guarantee, weirdly, that a given word's assigned speaker
            # is the same as the speaker assigned to the whole segment. Since
            # the tiers are based on assigned /segment/ speakers, not assigned
            # word speakers, we need to look up the tier in the segment loop
            # not in the word loop. See Issue #7
            tier = tg.tiers[tier_key[segment['speaker']]]
            tier.add(segment['start'], segment['end'], segment['text'])
            # In testing, the word-level alignments are not very good. A future version
            # might want to add an option for end users to enable the following loop.
            #for word in segment['words']:
            #    minTime = word['start']
            #    maxTime = word['end']
            #    mark = word['word']
            #    tier.add(minTime,maxTime,mark)
        return tg
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
File type = "ooTextFile" | ||
Object class = "TextGrid" | ||
|
||
xmin = 0.362 | ||
xmax = 51.495 | ||
tiers? <exists> | ||
size = 1 | ||
item []: | ||
item [1]: | ||
class = "IntervalTier" | ||
name = "SPEAKER_00" | ||
xmin = 0.362 | ||
xmax = 51.495 | ||
intervals: size = 25 | ||
intervals [1]: | ||
xmin = 0.362 | ||
xmax = 3.238 | ||
text = " So, you know the pimpin', fuck y'all." | ||
intervals [2]: | ||
xmin = 4.024 | ||
xmax = 5.214 | ||
text = " I'm finna go over to Def Jam" | ||
intervals [3]: | ||
xmin = 6.286 | ||
xmax = 7.24 | ||
text = " and learn a little bit of corporate work," | ||
intervals [4]: | ||
xmin = 7.382 | ||
xmax = 8.618 | ||
text = " because I don't know corporate yet." | ||
intervals [5]: | ||
xmin = 8.923 | ||
xmax = 9.96 | ||
text = " I only need a few months." | ||
intervals [6]: | ||
xmin = 10.0 | ||
xmax = 11.14 | ||
text = " You give me a few months to run the shit," | ||
intervals [7]: | ||
xmin = 11.181 | ||
xmax = 12.437 | ||
text = " I'm a fast learner." | ||
intervals [8]: | ||
xmin = 13.084 | ||
xmax = 14.859 | ||
text = " Go to Def Jam, get a job in a position," | ||
intervals [9]: | ||
xmin = 15.324 | ||
xmax = 16.72 | ||
text = " drop a record, get Benny the Butcher signed," | ||
intervals [10]: | ||
xmin = 17.145 | ||
xmax = 18.277 | ||
text = " get Hip Hop Harry signed," | ||
intervals [11]: | ||
xmin = 19.046 | ||
xmax = 20.08 | ||
text = " learn a few tricks of the trade," | ||
intervals [12]: | ||
xmin = 20.524 | ||
xmax = 22.08 | ||
text = " find out that the niggas that had it" | ||
intervals [13]: | ||
xmin = 22.465 | ||
xmax = 23.539 | ||
text = " that wanted me to hold for them," | ||
intervals [14]: | ||
xmin = 24.003 | ||
xmax = 25.336 | ||
text = " then sold it to some other people." | ||
intervals [15]: | ||
xmin = 26.243 | ||
xmax = 28.28 | ||
text = " So now, one of my big wig buddies called me" | ||
intervals [16]: | ||
xmin = 28.72 | ||
xmax = 31.336 | ||
text = " and said, hey dog, I know the people that got Def Ro." | ||
intervals [17]: | ||
xmin = 32.1 | ||
xmax = 33.714 | ||
text = " And they don't know what to do with it." | ||
intervals [18]: | ||
xmin = 34.46 | ||
xmax = 36.26 | ||
text = " Let me holler at them, I know just what to do with it." | ||
intervals [19]: | ||
xmin = 36.28 | ||
xmax = 39.116 | ||
text = " So I hit them and like, let me work for y'all." | ||
intervals [20]: | ||
xmin = 40.8 | ||
xmax = 43.32 | ||
text = " The play was cool, but it's like, yeah, fuck that." | ||
intervals [21]: | ||
xmin = 43.36 | ||
xmax = 44.818 | ||
text = " How much to buy this shit?" | ||
intervals [22]: | ||
xmin = 45.0 | ||
xmax = 46.24 | ||
text = " What you talkin' about?" | ||
intervals [23]: | ||
xmin = 46.24 | ||
xmax = 47.371 | ||
text = " How much to buy Def Ro first?" | ||
intervals [24]: | ||
xmin = 48.24 | ||
xmax = 49.25 | ||
text = " How much for my masters?" | ||
intervals [25]: | ||
xmin = 50.3 | ||
xmax = 51.495 | ||
text = " How much for all of the masters?" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
import json | ||
import numpy.testing as nptest | ||
import textgrid | ||
|
||
import formatter | ||
|
||
class TestFormatter():
    """Tests for formatter.Formatter."""

    # One Formatter instance, created at class definition and shared by
    # the tests below.
    Format = formatter.Formatter()

    def test_to_TextGrid(self):
        """
        Convert each stored JSON case with to_TextGrid and compare the
        result against the TextGrid read from the matching reference file.
        """
        for json_path, textgrid_path in self.provide_to_TextGrid():
            with open(json_path) as handle:
                transcription = json.load(handle)
            actual = self.Format.to_TextGrid(transcription)

            reference = textgrid.TextGrid()
            reference.read(textgrid_path)

            nptest.assert_array_equal(actual, reference)

    def provide_to_TextGrid(self):
        """Return (input JSON path, expected TextGrid path) pairs."""
        cases = [
            (
                'tests/data/TestAudio_SnoopDogg_85SouthMedia_segments.json',
                'tests/data/TestAudio_SnoopDogg_85SouthMedia.TextGrid',
            ),
        ]
        return cases