Skip to content

Commit

Permalink
[formatter] Implement conversion to TextGrid (#4)
Browse files Browse the repository at this point in the history
Introduce a class which houses functions to convert the json output of
the transcriber and diarizer into formats that can be input into various
alignment software. This commit introduces a conversion to TextGrids
for use with MFA though other formats may be introduced later.
  • Loading branch information
chrisbrickhouse committed Feb 8, 2024
1 parent b958136 commit 0e286e7
Show file tree
Hide file tree
Showing 4 changed files with 204 additions and 0 deletions.
62 changes: 62 additions & 0 deletions formatter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# This program is part of fave-asr
# Copyright (C) 2024 Christian Brickhouse and FAVE Contributors
#
# fave-asr is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation as version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import textgrid

class Formatter():
    """
    Convert transcriber/diarizer output into formats that alignment
    software can read. Currently only TextGrid output is implemented.
    """

    def __init__(self):
        pass

    def to_TextGrid(self, diarized_transcription):
        """
        Convert a diarized transcription dictionary to a TextGrid
        Args:
            diarized_transcription: Output of pipeline.assign_speakers().
                Must have a 'segments' list whose items each carry
                'start', 'end', 'text' and 'speaker' keys.
        Returns:
            A textgrid.TextGrid object populated with the diarized and
            transcribed data. There is one IntervalTier per segment-level
            speaker, in order of first appearance, and each tier contains
            utterance-level (segment) intervals. An empty TextGrid is
            returned when there are no segments.
        Raises:
            KeyError: if 'segments' or one of the per-segment keys listed
                above is missing.
        """
        segments = diarized_transcription['segments']
        if not segments:
            # Nothing to convert; indexing [0]/[-1] below would fail.
            return textgrid.TextGrid()

        # Take the bounds over every segment rather than only the first and
        # last one, so the grid still spans all intervals if the segments
        # are not in chronological order.
        minTime = min(segment['start'] for segment in segments)
        maxTime = max(segment['end'] for segment in segments)
        tg = textgrid.TextGrid(minTime=minTime, maxTime=maxTime)

        # dict.fromkeys de-duplicates while keeping first-appearance order.
        # Iterating a set of strings instead would make the tier order vary
        # from run to run.
        speakers = dict.fromkeys(segment['speaker'] for segment in segments)

        # Lookup table from speaker name to that speaker's tier.
        tier_by_speaker = {}
        for speaker in speakers:
            tier = textgrid.IntervalTier(
                name=speaker, minTime=minTime, maxTime=maxTime
            )
            tg.append(tier)
            tier_by_speaker[speaker] = tier

        for segment in segments:
            # There's no guarantee, weirdly, that a given word's assigned speaker
            # is the same as the speaker assigned to the whole segment. Since
            # the tiers are based on assigned /segment/ speakers, not assigned
            # word speakers, we need to look up the tier in the segment loop
            # not in the word loop. See Issue #7
            tier = tier_by_speaker[segment['speaker']]
            tier.add(segment['start'], segment['end'], segment['text'])
            # In testing, the word-level alignments are not very good. A future version
            # might want to add an option for end users to enable the following loop.
            #for word in segment['words']:
            #    minTime = word['start']
            #    maxTime = word['end']
            #    mark = word['word']
            #    tier.add(minTime,maxTime,mark)
        return tg
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ openai-whisper @ git+https://github.com/openai/whisper.git@b38a1f20f4b23f3f3099a
whisperx @ git+https://github.com/m-bain/whisperx.git@49e0130e4e0c0d99d60715d76e65a71826a97109
GPUtil
psutil
textgrid
114 changes: 114 additions & 0 deletions tests/data/TestAudio_SnoopDogg_85SouthMedia.TextGrid
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
File type = "ooTextFile"
Object class = "TextGrid"

xmin = 0.362
xmax = 51.495
tiers? <exists>
size = 1
item []:
item [1]:
class = "IntervalTier"
name = "SPEAKER_00"
xmin = 0.362
xmax = 51.495
intervals: size = 25
intervals [1]:
xmin = 0.362
xmax = 3.238
text = " So, you know the pimpin', fuck y'all."
intervals [2]:
xmin = 4.024
xmax = 5.214
text = " I'm finna go over to Def Jam"
intervals [3]:
xmin = 6.286
xmax = 7.24
text = " and learn a little bit of corporate work,"
intervals [4]:
xmin = 7.382
xmax = 8.618
text = " because I don't know corporate yet."
intervals [5]:
xmin = 8.923
xmax = 9.96
text = " I only need a few months."
intervals [6]:
xmin = 10.0
xmax = 11.14
text = " You give me a few months to run the shit,"
intervals [7]:
xmin = 11.181
xmax = 12.437
text = " I'm a fast learner."
intervals [8]:
xmin = 13.084
xmax = 14.859
text = " Go to Def Jam, get a job in a position,"
intervals [9]:
xmin = 15.324
xmax = 16.72
text = " drop a record, get Benny the Butcher signed,"
intervals [10]:
xmin = 17.145
xmax = 18.277
text = " get Hip Hop Harry signed,"
intervals [11]:
xmin = 19.046
xmax = 20.08
text = " learn a few tricks of the trade,"
intervals [12]:
xmin = 20.524
xmax = 22.08
text = " find out that the niggas that had it"
intervals [13]:
xmin = 22.465
xmax = 23.539
text = " that wanted me to hold for them,"
intervals [14]:
xmin = 24.003
xmax = 25.336
text = " then sold it to some other people."
intervals [15]:
xmin = 26.243
xmax = 28.28
text = " So now, one of my big wig buddies called me"
intervals [16]:
xmin = 28.72
xmax = 31.336
text = " and said, hey dog, I know the people that got Def Ro."
intervals [17]:
xmin = 32.1
xmax = 33.714
text = " And they don't know what to do with it."
intervals [18]:
xmin = 34.46
xmax = 36.26
text = " Let me holler at them, I know just what to do with it."
intervals [19]:
xmin = 36.28
xmax = 39.116
text = " So I hit them and like, let me work for y'all."
intervals [20]:
xmin = 40.8
xmax = 43.32
text = " The play was cool, but it's like, yeah, fuck that."
intervals [21]:
xmin = 43.36
xmax = 44.818
text = " How much to buy this shit?"
intervals [22]:
xmin = 45.0
xmax = 46.24
text = " What you talkin' about?"
intervals [23]:
xmin = 46.24
xmax = 47.371
text = " How much to buy Def Ro first?"
intervals [24]:
xmin = 48.24
xmax = 49.25
text = " How much for my masters?"
intervals [25]:
xmin = 50.3
xmax = 51.495
text = " How much for all of the masters?"
27 changes: 27 additions & 0 deletions tests/test_formatter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import json
import numpy.testing as nptest
import textgrid

import formatter

class TestFormatter():
    """Tests for formatter.Formatter."""

    Format = formatter.Formatter()

    @staticmethod
    def _summarize(tg):
        """
        Reduce a textgrid.TextGrid to plain Python data for comparison.

        Returns a tuple of (grid minTime, grid maxTime, tiers) where tiers
        maps each tier name to its list of (minTime, maxTime, mark) tuples.
        Tiers are keyed by name so the comparison does not depend on the
        order in which tiers were appended.
        """
        tiers = {
            tier.name: [
                (interval.minTime, interval.maxTime, interval.mark)
                for interval in tier.intervals
            ]
            for tier in tg.tiers
        }
        return (tg.minTime, tg.maxTime, tiers)

    def test_to_TextGrid(self):
        """
        Convert each JSON fixture and compare it with its reference TextGrid.

        The grids are compared field by field (bounds, tier names, interval
        times and interval text) instead of being handed to a numpy array
        comparison, so tier names and marks are checked explicitly.
        """
        for input_fname, ex_fname in self.provide_to_TextGrid():
            with open(input_fname, encoding='utf-8') as f:
                case = json.load(f)
            observed = self.Format.to_TextGrid(case)

            expected = textgrid.TextGrid()
            expected.read(ex_fname)

            # The trailing message identifies the failing fixture.
            assert self._summarize(observed) == self._summarize(expected), input_fname

    def provide_to_TextGrid(self):
        """Return (input JSON path, expected TextGrid path) pairs."""
        return [
            (
                'tests/data/TestAudio_SnoopDogg_85SouthMedia_segments.json',
                'tests/data/TestAudio_SnoopDogg_85SouthMedia.TextGrid'
            ),
        ]

0 comments on commit 0e286e7

Please sign in to comment.