diff --git a/formatter.py b/formatter.py index b35963b..50d42ed 100644 --- a/formatter.py +++ b/formatter.py @@ -12,18 +12,21 @@ # # You should have received a copy of the GNU General Public License # along with this program. If not, see . +import warnings + import textgrid class Formatter(): def __init__(self): pass - def to_TextGrid(self, diarized_transcription): + def to_TextGrid(self, diarized_transcription, by_phrase=True): """ Convert a diarized transcription dictionary to a TextGrid Args: diarized_transcription: Output of pipeline.assign_speakers() + by_phrase: Flag for whether the intervals should be by phrase (True) or word (False) Returns: A textgrid.TextGrid object populated with the diarized and @@ -34,29 +37,44 @@ def to_TextGrid(self, diarized_transcription): maxTime = diarized_transcription['segments'][-1]['end'] tg = textgrid.TextGrid(minTime=minTime,maxTime=maxTime) - speakers = [x['speaker'] for x in diarized_transcription['segments']] + speakers = [x['speaker'] for x in diarized_transcription['segments'] if 'speaker' in x] for speaker in set(speakers): tg.append(textgrid.IntervalTier(name=speaker,minTime=minTime,maxTime=maxTime)) # Create a lookup table of tier indices based on the given speaker name tier_key = dict((name,index) for index, name in enumerate([x.name for x in tg.tiers])) - for segment in diarized_transcription['segments']: + for i in range(len(diarized_transcription['segments'])): + segment = diarized_transcription['segments'][i] # There's no guarantee, weirdly, that a given word's assigned speaker # is the same as the speaker assigned to the whole segment. Since # the tiers are based on assigned /segment/ speakers, not assigned # word speakers, we need to look up the tier in the segment loop # not in the word loop. See Issue #7 + if 'speaker' not in segment: + warnings.warn('No speaker for segment') + #print(segment) + continue tier_index = tier_key[segment['speaker']] tier = tg.tiers[tier_index] minTime = segment['start'] - maxTime = segment['end'] + if i+1 == len(diarized_transcription['segments']): + maxTime = segment['end'] + else: + maxTime = diarized_transcription['segments'][i+1]['start'] mark = segment['text'] - tier.add(minTime,maxTime,mark) - # In testing, the word-level alignments are not very good. A future version - # might want to add an option for end users to enable the following loop. - #for word in segment['words']: - # minTime = word['start'] - # maxTime = word['end'] - # mark = word['word'] - # tier.add(minTime,maxTime,mark) + if by_phrase: + tier.add(minTime,maxTime,mark) + continue + for word in segment['words']: + if 'speaker' not in word: + warnings.warn('No speaker assigned to word, using phrase-level speaker') + elif word['speaker'] != segment['speaker']: + warnings.warn('Mismatched speaker for word and phrase, using phrase-level speaker') + #print(word['speaker'],word) + #print(segment['speaker'],segment) + #raise ValueError('Word and segment have different speakers') + minTime = word['start'] + maxTime = word['end'] + mark = word['text'] + tier.add(minTime,maxTime,mark) return tg diff --git a/pipeline.py b/pipeline.py index 0af857d..4a421c1 100644 --- a/pipeline.py +++ b/pipeline.py @@ -12,11 +12,16 @@ import psutil import GPUtil import matplotlib.pyplot as plt -import whisper +import whisper_timestamped as whisper from whisperx import load_align_model, align from whisperx.diarize import DiarizationPipeline, assign_word_speakers -def transcribe(audio_file: str, model_name: str, device: str = "cpu") -> Dict[str, Any]: +def transcribe( + audio_file: str, + model_name: str, + device: str = "cpu", + detect_disfluencies: bool = True + ) -> Dict[str, Any]: """ Transcribe an audio file using a whisper model. @@ -24,14 +29,16 @@ def transcribe(audio_file: str, model_name: str, device: str = "cpu") -> Dict[st audio_file: Path to the audio file to transcribe. model_name: Name of the model to use for transcription. device: The device to use for inference (e.g., "cpu" or "cuda"). + detect_disfluencies: Flag for whether the transcription should include disfluencies, marked with [*] Returns: A dictionary representing the transcript segments and language code. """ - model = whisper.load_model(model_name, device) - result = model.transcribe(audio_file) + model = whisper.load_model(model_name, device=device) + audio = whisper.load_audio(audio_file) + result = whisper.transcribe(model, audio_file,detect_disfluencies=detect_disfluencies) - language_code = result["language"] + language_code = result['language'] return { "segments": result["segments"], "language_code": language_code, @@ -130,11 +137,11 @@ def transcribe_and_diarize( spoken text, and the speaker ID. """ transcript = transcribe(audio_file, model_name, device) - aligned_segments = align_segments( - transcript["segments"], transcript["language_code"], audio_file, device - ) + #aligned_segments = align_segments( + # transcript["segments"], transcript["language_code"], audio_file, device + #) diarization_result = diarize(audio_file, hf_token) - results_segments_w_speakers = assign_speakers(diarization_result, aligned_segments) + results_segments_w_speakers = assign_speakers(diarization_result, transcript) # Print the results in a user-friendly way for i, segment in enumerate(results_segments_w_speakers['segments']): diff --git a/requirements.txt b/requirements.txt index b18a8e2..4b32241 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -openai-whisper @ git+https://github.com/openai/whisper.git@b38a1f20f4b23f3f3099af2c3e0ca95627276ddf whisperx @ git+https://github.com/m-bain/whisperx.git@49e0130e4e0c0d99d60715d76e65a71826a97109 +whisper_timestamped GPUtil psutil textgrid diff --git a/tests/data/TestAudio_SnoopDogg_85SouthMedia_WhisperTimestampSegments.json b/tests/data/TestAudio_SnoopDogg_85SouthMedia_WhisperTimestampSegments.json new file mode 100644 index 0000000..57b59e4 --- /dev/null +++ b/tests/data/TestAudio_SnoopDogg_85SouthMedia_WhisperTimestampSegments.json @@ -0,0 +1 @@ +{"segments": [{"id": 0, "seek": 0, "start": 0.22, "end": 3.44, "text": " So, you know the pimpin', fuck y'all.", "tokens": [50363, 1406, 11, 345, 760, 262, 279, 11011, 259, 3256, 5089, 331, 6, 439, 13, 50543], "temperature": 0.0, "avg_logprob": -0.2396322811351103, "compression_ratio": 1.6773162939297124, "no_speech_prob": 0.06969249993562698, "confidence": 0.87, "words": [{"text": "So,", "start": 0.22, "end": 0.6, "confidence": 0.88, "speaker": "SPEAKER_00"}, {"text": "you", "start": 0.66, "end": 1.2, "confidence": 0.986, "speaker": "SPEAKER_00"}, {"text": "know", "start": 1.2, "end": 1.34, "confidence": 0.997, "speaker": "SPEAKER_00"}, {"text": "the", "start": 1.34, "end": 1.46, "confidence": 0.656, "speaker": "SPEAKER_00"}, {"text": "pimpin',", "start": 1.46, "end": 2.6, "confidence": 0.877, "speaker": "SPEAKER_00"}, {"text": "[*]", "start": 2.6, "end": 2.84, "confidence": 0.0, "speaker": "SPEAKER_00"}, {"text": "fuck", "start": 2.84, "end": 2.98, "confidence": 0.79, "speaker": "SPEAKER_00"}, {"text": "y'all.", "start": 2.98, "end": 3.44, "confidence": 0.895, "speaker": "SPEAKER_00"}], "speaker": "SPEAKER_00"}, {"id": 1, "seek": 0, "start": 3.96, "end": 5.28, "text": " I'm finna go over to Def Jam", "tokens": [50543, 314, 1101, 957, 2616, 467, 625, 284, 2896, 9986, 50658], "temperature": 0.0, "avg_logprob": -0.2396322811351103, "compression_ratio": 1.6773162939297124, "no_speech_prob": 0.06969249993562698, "confidence": 0.868, "words": [{"text": "I'm", "start": 3.96, "end": 4.18, "confidence": 0.984, "speaker": "SPEAKER_00"}, {"text": "finna", "start": 4.18, "end": 4.3, "confidence": 0.924, "speaker": "SPEAKER_00"}, {"text": "go", "start": 4.3, "end": 4.44, "confidence": 0.964, "speaker": "SPEAKER_00"}, {"text": "over", "start": 4.44, "end": 4.58, "confidence": 0.891, "speaker": "SPEAKER_00"}, {"text": "to", "start": 4.58, "end": 4.7, "confidence": 0.847, "speaker": "SPEAKER_00"}, {"text": "Def", "start": 4.7, "end": 4.94, "confidence": 0.485, "speaker": "SPEAKER_00"}, {"text": "Jam", "start": 4.94, "end": 5.28, "confidence": 0.964, "speaker": "SPEAKER_00"}], "speaker": "SPEAKER_00"}, {"id": 2, "seek": 0, "start": 6.24, "end": 7.54, "text": " and learn a little bit of corporate work,", "tokens": [50658, 290, 2193, 257, 1310, 1643, 286, 6355, 670, 11, 50725], "temperature": 0.0, "avg_logprob": -0.2396322811351103, "compression_ratio": 1.6773162939297124, "no_speech_prob": 0.06969249993562698, "confidence": 0.981, "words": [{"text": "and", "start": 6.24, "end": 6.4, "confidence": 0.989, "speaker": "SPEAKER_00"}, {"text": "learn", "start": 6.4, "end": 6.56, "confidence": 0.995, "speaker": "SPEAKER_00"}, {"text": "a", "start": 6.56, "end": 6.68, "confidence": 0.998, "speaker": "SPEAKER_00"}, {"text": "little", "start": 6.68, "end": 6.78, "confidence": 0.991, "speaker": "SPEAKER_00"}, {"text": "bit", "start": 6.78, "end": 6.92, "confidence": 0.992, "speaker": "SPEAKER_00"}, {"text": "of", "start": 6.92, "end": 7.02, "confidence": 0.985, "speaker": "SPEAKER_00"}, {"text": "corporate", "start": 7.02, "end": 7.32, "confidence": 0.905, "speaker": "SPEAKER_00"}, {"text": "work,", "start": 7.32, "end": 7.54, "confidence": 0.994, "speaker": "SPEAKER_00"}], "speaker": "SPEAKER_00"}, {"id": 3, "seek": 0, "start": 7.58, "end": 8.7, "text": " because I don't know corporate yet.", "tokens": [50725, 780, 314, 836, 470, 760, 6355, 1865, 13, 50800], "temperature": 0.0, "avg_logprob": -0.2396322811351103, "compression_ratio": 1.6773162939297124, "no_speech_prob": 0.06969249993562698, "confidence": 0.899, "words": [{"text": "because", "start": 7.58, "end": 7.7, "confidence": 0.548, "speaker": "SPEAKER_00"}, {"text": "I", "start": 7.7, "end": 7.82, "confidence": 0.997, "speaker": "SPEAKER_00"}, {"text": "don't", "start": 7.82, "end": 7.96, "confidence": 0.997, "speaker": "SPEAKER_00"}, {"text": "know", "start": 7.96, "end": 8.08, "confidence": 0.988, "speaker": "SPEAKER_00"}, {"text": "corporate", "start": 8.08, "end": 8.4, "confidence": 0.957, "speaker": "SPEAKER_00"}, {"text": "yet.", "start": 8.4, "end": 8.7, "confidence": 0.924, "speaker": "SPEAKER_00"}], "speaker": "SPEAKER_00"}, {"id": 4, "seek": 0, "start": 8.82, "end": 9.96, "text": " I only need a few months.", "tokens": [50800, 314, 691, 761, 257, 1178, 1933, 13, 50862], "temperature": 0.0, "avg_logprob": -0.2396322811351103, "compression_ratio": 1.6773162939297124, "no_speech_prob": 0.06969249993562698, "confidence": 0.953, "words": [{"text": "I", "start": 8.82, "end": 9.2, "confidence": 0.815, "speaker": "SPEAKER_01"}, {"text": "only", "start": 9.2, "end": 9.32, "confidence": 0.937, "speaker": "SPEAKER_00"}, {"text": "need", "start": 9.32, "end": 9.44, "confidence": 0.987, "speaker": "SPEAKER_00"}, {"text": "a", "start": 9.44, "end": 9.54, "confidence": 0.998, "speaker": "SPEAKER_00"}, {"text": "few", "start": 9.54, "end": 9.68, "confidence": 1.0, "speaker": "SPEAKER_00"}, {"text": "months.", "start": 9.68, "end": 9.96, "confidence": 0.998, "speaker": "SPEAKER_00"}], "speaker": "SPEAKER_00"}, {"id": 5, "seek": 0, "start": 10.34, "end": 11.42, "text": " You give me a few months to run the shit,", "tokens": [50862, 921, 1577, 502, 257, 1178, 1933, 284, 1057, 262, 7510, 11, 50920], "temperature": 0.0, "avg_logprob": -0.2396322811351103, "compression_ratio": 1.6773162939297124, "no_speech_prob": 0.06969249993562698, "confidence": 0.803, "words": [{"text": "You", "start": 10.34, "end": 10.48, "confidence": 0.699, "speaker": "SPEAKER_00"}, {"text": "give", "start": 10.48, "end": 10.54, "confidence": 0.946, "speaker": "SPEAKER_00"}, {"text": "me", "start": 10.54, "end": 10.64, "confidence": 0.995, "speaker": "SPEAKER_00"}, {"text": "a", "start": 10.64, "end": 10.72, "confidence": 0.995, "speaker": "SPEAKER_00"}, {"text": "few", "start": 10.72, "end": 10.84, "confidence": 0.998, "speaker": "SPEAKER_00"}, {"text": "months", "start": 10.84, "end": 11.02, "confidence": 0.994, "speaker": "SPEAKER_00"}, {"text": "to", "start": 11.02, "end": 11.14, "confidence": 0.728, "speaker": "SPEAKER_00"}, {"text": "run", "start": 11.14, "end": 11.24, "confidence": 0.763, "speaker": "SPEAKER_00"}, {"text": "the", "start": 11.24, "end": 11.34, "confidence": 0.399, "speaker": "SPEAKER_00"}, {"text": "shit,", "start": 11.34, "end": 11.42, "confidence": 0.778, "speaker": "SPEAKER_00"}], "speaker": "SPEAKER_00"}, {"id": 6, "seek": 0, "start": 11.42, "end": 12.88, "text": " I'm a fast learner.", "tokens": [50920, 314, 1101, 257, 3049, 22454, 1008, 13, 50997], "temperature": 0.0, "avg_logprob": -0.2396322811351103, "compression_ratio": 1.6773162939297124, "no_speech_prob": 0.06969249993562698, "confidence": 0.76, "words": [{"text": "I'm", "start": 11.42, "end": 11.66, "confidence": 0.78, "speaker": "SPEAKER_00"}, {"text": "a", "start": 11.66, "end": 11.74, "confidence": 0.391, "speaker": "SPEAKER_00"}, {"text": "fast", "start": 11.74, "end": 12.14, "confidence": 0.821, "speaker": "SPEAKER_00"}, {"text": "learner.", "start": 12.14, "end": 12.88, "confidence": 0.996, "speaker": "SPEAKER_00"}], "speaker": "SPEAKER_00"}, {"id": 7, "seek": 0, "start": 12.98, "end": 14.86, "text": " Go to Def Jam, get a job in a position,", "tokens": [50997, 1514, 284, 2896, 9986, 11, 651, 257, 1693, 287, 257, 2292, 11, 51109], "temperature": 0.0, "avg_logprob": -0.2396322811351103, "compression_ratio": 1.6773162939297124, "no_speech_prob": 0.06969249993562698, "confidence": 0.957, "words": [{"text": "Go", "start": 12.98, "end": 13.2, "confidence": 0.983, "speaker": "SPEAKER_00"}, {"text": "to", "start": 13.2, "end": 13.32, "confidence": 0.997, "speaker": "SPEAKER_00"}, {"text": "Def", "start": 13.32, "end": 13.54, "confidence": 0.987, "speaker": "SPEAKER_00"}, {"text": "Jam,", "start": 13.54, "end": 13.76, "confidence": 0.993, "speaker": "SPEAKER_00"}, {"text": "get", "start": 13.88, "end": 13.92, "confidence": 0.986, "speaker": "SPEAKER_00"}, {"text": "a", "start": 13.92, "end": 14.06, "confidence": 0.998, "speaker": "SPEAKER_00"}, {"text": "job", "start": 14.06, "end": 14.24, "confidence": 0.999, "speaker": "SPEAKER_00"}, {"text": "in", "start": 14.24, "end": 14.38, "confidence": 0.808, "speaker": "SPEAKER_00"}, {"text": "a", "start": 14.38, "end": 14.46, "confidence": 0.85, "speaker": "SPEAKER_00"}, {"text": "[*]", "start": 14.46, "end": 14.72, "confidence": 0.0, "speaker": "SPEAKER_00"}, {"text": "position,", "start": 14.72, "end": 14.86, "confidence": 0.994, "speaker": "SPEAKER_00"}], "speaker": "SPEAKER_00"}, {"id": 8, "seek": 0, "start": 15.24, "end": 17.02, "text": " drop a record, get Benny the Butcher signed,", "tokens": [51109, 4268, 257, 1700, 11, 651, 44275, 262, 39680, 4488, 11, 51199], "temperature": 0.0, "avg_logprob": -0.2396322811351103, "compression_ratio": 1.6773162939297124, "no_speech_prob": 0.06969249993562698, "confidence": 0.963, "words": [{"text": "drop", "start": 15.24, "end": 15.52, "confidence": 0.995, "speaker": "SPEAKER_00"}, {"text": "a", "start": 15.52, "end": 15.68, "confidence": 0.998, "speaker": "SPEAKER_00"}, {"text": "record,", "start": 15.68, "end": 15.9, "confidence": 0.999, "speaker": "SPEAKER_00"}, {"text": "get", "start": 16.02, "end": 16.12, "confidence": 0.961, "speaker": "SPEAKER_00"}, {"text": "Benny", "start": 16.12, "end": 16.32, "confidence": 0.97, "speaker": "SPEAKER_00"}, {"text": "the", "start": 16.32, "end": 16.46, "confidence": 0.92, "speaker": "SPEAKER_00"}, {"text": "Butcher", "start": 16.46, "end": 16.7, "confidence": 0.988, "speaker": "SPEAKER_00"}, {"text": "signed,", "start": 16.7, "end": 17.02, "confidence": 0.879, "speaker": "SPEAKER_00"}], "speaker": "SPEAKER_00"}, {"id": 9, "seek": 0, "start": 17.08, "end": 18.3, "text": " get Hip Hop Harry signed,", "tokens": [51199, 651, 29437, 9996, 5850, 4488, 11, 51292], "temperature": 0.0, "avg_logprob": -0.2396322811351103, "compression_ratio": 1.6773162939297124, "no_speech_prob": 0.06969249993562698, "confidence": 0.927, "words": [{"text": "get", "start": 17.08, "end": 17.26, "confidence": 0.992, "speaker": "SPEAKER_00"}, {"text": "Hip", "start": 17.26, "end": 17.44, "confidence": 0.924, "speaker": "SPEAKER_00"}, {"text": "Hop", "start": 17.44, "end": 17.62, "confidence": 0.76, "speaker": "SPEAKER_00"}, {"text": "Harry", "start": 17.62, "end": 17.88, "confidence": 0.997, "speaker": "SPEAKER_00"}, {"text": "signed,", "start": 17.88, "end": 18.3, "confidence": 0.985, "speaker": "SPEAKER_00"}], "speaker": "SPEAKER_00"}, {"id": 10, "seek": 0, "start": 18.9, "end": 20.32, "text": " learn a few tricks of the trade,", "tokens": [51292, 2193, 257, 1178, 15910, 286, 262, 3292, 11, 51367], "temperature": 0.0, "avg_logprob": -0.2396322811351103, "compression_ratio": 1.6773162939297124, "no_speech_prob": 0.06969249993562698, "confidence": 0.976, "words": [{"text": "learn", "start": 18.9, "end": 19.18, "confidence": 0.959, "speaker": "SPEAKER_00"}, {"text": "a", "start": 19.18, "end": 19.34, "confidence": 0.995, "speaker": "SPEAKER_00"}, {"text": "few", "start": 19.34, "end": 19.48, "confidence": 1.0, "speaker": "SPEAKER_00"}, {"text": "tricks", "start": 19.48, "end": 19.74, "confidence": 0.997, "speaker": "SPEAKER_00"}, {"text": "of", "start": 19.74, "end": 19.88, "confidence": 0.898, "speaker": "SPEAKER_00"}, {"text": "the", "start": 19.88, "end": 19.98, "confidence": 0.997, "speaker": "SPEAKER_00"}, {"text": "trade,", "start": 19.98, "end": 20.32, "confidence": 0.994, "speaker": "SPEAKER_00"}], "speaker": "SPEAKER_00"}, {"id": 11, "seek": 0, "start": 20.4, "end": 22.46, "text": " find out that the niggas that had it", "tokens": [51367, 1064, 503, 326, 262, 299, 6950, 292, 326, 550, 340, 51467], "temperature": 0.0, "avg_logprob": -0.2396322811351103, "compression_ratio": 1.6773162939297124, "no_speech_prob": 0.06969249993562698, "confidence": 0.978, "words": [{"text": "find", "start": 20.4, "end": 20.74, "confidence": 0.947, "speaker": "SPEAKER_00"}, {"text": "out", "start": 20.74, "end": 21.1, "confidence": 0.998, "speaker": "SPEAKER_00"}, {"text": "that", "start": 21.1, "end": 21.58, "confidence": 0.981, "speaker": "SPEAKER_00"}, {"text": "the", "start": 21.58, "end": 21.7, "confidence": 0.994, "speaker": "SPEAKER_00"}, {"text": "niggas", "start": 21.7, "end": 21.96, "confidence": 0.978, "speaker": "SPEAKER_00"}, {"text": "that", "start": 21.96, "end": 22.12, "confidence": 0.945, "speaker": "SPEAKER_00"}, {"text": "had", "start": 22.12, "end": 22.32, "confidence": 0.995, "speaker": "SPEAKER_00"}, {"text": "it", "start": 22.32, "end": 22.46, "confidence": 0.993, "speaker": "SPEAKER_00"}], "speaker": "SPEAKER_00"}, {"id": 12, "seek": 0, "start": 22.46, "end": 23.72, "text": " that wanted me to hold for them,", "tokens": [51467, 326, 2227, 502, 284, 1745, 329, 606, 11, 51544], "temperature": 0.0, "avg_logprob": -0.2396322811351103, "compression_ratio": 1.6773162939297124, "no_speech_prob": 0.06969249993562698, "confidence": 0.935, "words": [{"text": "that", "start": 22.46, "end": 22.6, "confidence": 0.976, "speaker": "SPEAKER_00"}, {"text": "wanted", "start": 22.6, "end": 22.76, "confidence": 0.996, "speaker": "SPEAKER_00"}, {"text": "me", "start": 22.76, "end": 22.9, "confidence": 0.995, "speaker": "SPEAKER_00"}, {"text": "to", "start": 22.9, "end": 23.04, "confidence": 0.984, "speaker": "SPEAKER_00"}, {"text": "hold", "start": 23.04, "end": 23.2, "confidence": 0.856, "speaker": "SPEAKER_00"}, {"text": "for", "start": 23.2, "end": 23.42, "confidence": 0.99, "speaker": "SPEAKER_00"}, {"text": "them,", "start": 23.42, "end": 23.72, "confidence": 0.772, "speaker": "SPEAKER_00"}], "speaker": "SPEAKER_00"}, {"id": 13, "seek": 0, "start": 23.88, "end": 25.4, "text": " then sold it to some other people.", "tokens": [51544, 788, 2702, 340, 284, 617, 584, 661, 13, 51654], "temperature": 0.0, "avg_logprob": -0.2396322811351103, "compression_ratio": 1.6773162939297124, "no_speech_prob": 0.06969249993562698, "confidence": 0.987, "words": [{"text": "then", "start": 23.88, "end": 24.18, "confidence": 0.923, "speaker": "SPEAKER_00"}, {"text": "sold", "start": 24.18, "end": 24.46, "confidence": 0.997, "speaker": "SPEAKER_00"}, {"text": "it", "start": 24.46, "end": 24.58, "confidence": 0.996, "speaker": "SPEAKER_00"}, {"text": "to", "start": 24.58, "end": 24.72, "confidence": 0.998, "speaker": "SPEAKER_00"}, {"text": "some", "start": 24.72, "end": 24.88, "confidence": 0.998, "speaker": "SPEAKER_00"}, {"text": "other", "start": 24.88, "end": 25.1, "confidence": 0.999, "speaker": "SPEAKER_00"}, {"text": "people.", "start": 25.1, "end": 25.4, "confidence": 1.0, "speaker": "SPEAKER_00"}], "speaker": "SPEAKER_00"}, {"id": 14, "seek": 0, "start": 26.14, "end": 28.69, "text": " So now, one of my big wig buddies called me", "tokens": [51654, 1406, 783, 11, 530, 286, 616, 1263, 45678, 35548, 1444, 502, 51779], "temperature": 0.0, "avg_logprob": -0.2396322811351103, "compression_ratio": 1.6773162939297124, "no_speech_prob": 0.06969249993562698, "confidence": 0.923, "words": [{"text": "So", "start": 26.14, "end": 26.4, "confidence": 0.991, "speaker": "SPEAKER_00"}, {"text": "now,", "start": 26.4, "end": 26.76, "confidence": 0.83, "speaker": "SPEAKER_00"}, {"text": "one", "start": 27.12, "end": 27.26, "confidence": 0.993, "speaker": "SPEAKER_00"}, {"text": "of", "start": 27.26, "end": 27.36, "confidence": 0.998, "speaker": "SPEAKER_00"}, {"text": "my", "start": 27.36, "end": 27.52, "confidence": 0.998, "speaker": "SPEAKER_00"}, {"text": "big", "start": 27.52, "end": 27.78, "confidence": 0.92, "speaker": "SPEAKER_00"}, {"text": "wig", "start": 27.78, "end": 28.0, "confidence": 0.715, "speaker": "SPEAKER_00"}, {"text": "buddies", "start": 28.0, "end": 28.26, "confidence": 0.999, "speaker": "SPEAKER_00"}, {"text": "called", "start": 28.26, "end": 28.54, "confidence": 0.841, "speaker": "SPEAKER_00"}, {"text": "me", "start": 28.54, "end": 28.69, "confidence": 0.997, "speaker": "SPEAKER_00"}], "speaker": "SPEAKER_00"}, {"id": 15, "seek": 2832, "start": 28.69, "end": 31.38, "text": " and said, hey dog, I know the people that got Def Ro.", "tokens": [50383, 290, 531, 11, 17207, 3290, 11, 314, 760, 262, 661, 326, 1392, 2896, 5564, 13, 50550], "temperature": 0.0, "avg_logprob": -0.23926071048707, "compression_ratio": 1.7264573991031391, "no_speech_prob": 0.014520817436277866, "confidence": 0.713, "words": [{"text": "and", "start": 28.69, "end": 28.84, "confidence": 0.979, "speaker": "SPEAKER_00"}, {"text": "said,", "start": 28.84, "end": 28.98, "confidence": 0.765, "speaker": "SPEAKER_00"}, {"text": "hey", "start": 29.1, "end": 29.18, "confidence": 0.816, "speaker": "SPEAKER_00"}, {"text": "dog,", "start": 29.18, "end": 29.48, "confidence": 0.455, "speaker": "SPEAKER_00"}, {"text": "I", "start": 29.94, "end": 30.04, "confidence": 0.94, "speaker": "SPEAKER_00"}, {"text": "know", "start": 30.04, "end": 30.18, "confidence": 0.998, "speaker": "SPEAKER_00"}, {"text": "the", "start": 30.18, "end": 30.32, "confidence": 0.992, "speaker": "SPEAKER_00"}, {"text": "people", "start": 30.32, "end": 30.54, "confidence": 0.999, "speaker": "SPEAKER_00"}, {"text": "that", "start": 30.54, "end": 30.7, "confidence": 0.845, "speaker": "SPEAKER_00"}, {"text": "got", "start": 30.7, "end": 30.92, "confidence": 0.982, "speaker": "SPEAKER_00"}, {"text": "Def", "start": 30.92, "end": 31.18, "confidence": 0.362, "speaker": "SPEAKER_00"}, {"text": "Ro.", "start": 31.18, "end": 31.38, "confidence": 0.224, "speaker": "SPEAKER_00"}], "speaker": "SPEAKER_00"}, {"id": 16, "seek": 2832, "start": 32.02, "end": 33.44, "text": " And they don't know what to do with it.", "tokens": [50550, 843, 484, 836, 470, 760, 644, 284, 466, 351, 340, 13, 50668], "temperature": 0.0, "avg_logprob": -0.23926071048707, "compression_ratio": 1.7264573991031391, "no_speech_prob": 0.014520817436277866, "confidence": 0.992, "words": [{"text": "And", "start": 32.02, "end": 32.2, "confidence": 0.977, "speaker": "SPEAKER_00"}, {"text": "they", "start": 32.2, "end": 32.3, "confidence": 0.987, "speaker": "SPEAKER_00"}, {"text": "don't", "start": 32.3, "end": 32.46, "confidence": 0.998, "speaker": "SPEAKER_00"}, {"text": "know", "start": 32.46, "end": 32.58, "confidence": 0.996, "speaker": "SPEAKER_00"}, {"text": "what", "start": 32.58, "end": 32.7, "confidence": 0.998, "speaker": "SPEAKER_00"}, {"text": "to", "start": 32.7, "end": 32.84, "confidence": 0.995, "speaker": "SPEAKER_00"}, {"text": "do", "start": 32.84, "end": 33.02, "confidence": 0.995, "speaker": "SPEAKER_00"}, {"text": "with", "start": 33.02, "end": 33.18, "confidence": 0.995, "speaker": "SPEAKER_00"}, {"text": "it.", "start": 33.18, "end": 33.44, "confidence": 0.985}], "speaker": "SPEAKER_00"}, {"id": 17, "seek": 2832, "start": 34.22, "end": 36.24, "text": " Let me holler at them, I know just what to do with it.", "tokens": [50668, 3914, 502, 289, 49252, 379, 606, 11, 314, 760, 655, 644, 284, 466, 351, 340, 13, 50760], "temperature": 0.0, "avg_logprob": -0.23926071048707, "compression_ratio": 1.7264573991031391, "no_speech_prob": 0.014520817436277866, "confidence": 0.928, "words": [{"text": "Let", "start": 34.22, "end": 34.56, "confidence": 0.848, "speaker": "SPEAKER_02"}, {"text": "me", "start": 34.56, "end": 34.7, "confidence": 0.998, "speaker": "SPEAKER_00"}, {"text": "holler", "start": 34.7, "end": 34.9, "confidence": 0.827, "speaker": "SPEAKER_00"}, {"text": "at", "start": 34.9, "end": 35.08, "confidence": 0.993, "speaker": "SPEAKER_00"}, {"text": "them,", "start": 35.08, "end": 35.18, "confidence": 0.843, "speaker": "SPEAKER_00"}, {"text": "I", "start": 35.24, "end": 35.3, "confidence": 0.854, "speaker": "SPEAKER_00"}, {"text": "know", "start": 35.3, "end": 35.48, "confidence": 0.963, "speaker": "SPEAKER_00"}, {"text": "just", "start": 35.48, "end": 35.68, "confidence": 0.995, "speaker": "SPEAKER_00"}, {"text": "what", "start": 35.68, "end": 35.86, "confidence": 0.994, "speaker": "SPEAKER_00"}, {"text": "to", "start": 35.86, "end": 35.98, "confidence": 0.988, "speaker": "SPEAKER_00"}, {"text": "do", "start": 35.98, "end": 36.1, "confidence": 0.996, "speaker": "SPEAKER_00"}, {"text": "with", "start": 36.1, "end": 36.2, "confidence": 0.987, "speaker": "SPEAKER_00"}, {"text": "it.", "start": 36.2, "end": 36.24, "confidence": 0.916, "speaker": "SPEAKER_00"}], "speaker": "SPEAKER_00"}, {"id": 18, "seek": 2832, "start": 36.24, "end": 39.68, "text": " So I hit them and like, let me work for y'all.", "tokens": [50760, 1406, 314, 2277, 606, 290, 588, 11, 1309, 502, 670, 329, 331, 6, 439, 13, 50937], "temperature": 0.0, "avg_logprob": -0.23926071048707, "compression_ratio": 1.7264573991031391, "no_speech_prob": 0.014520817436277866, "confidence": 0.833, "words": [{"text": "So", "start": 36.24, "end": 36.4, "confidence": 0.998, "speaker": "SPEAKER_00"}, {"text": "I", "start": 36.4, "end": 36.52, "confidence": 0.9, "speaker": "SPEAKER_00"}, {"text": "hit", "start": 36.52, "end": 36.66, "confidence": 0.987, "speaker": "SPEAKER_00"}, {"text": "them", "start": 36.66, "end": 36.82, "confidence": 0.995, "speaker": "SPEAKER_00"}, {"text": "and", "start": 36.82, "end": 36.98, "confidence": 0.486, "speaker": "SPEAKER_00"}, {"text": "like,", "start": 36.98, "end": 37.2, "confidence": 0.514, "speaker": "SPEAKER_00"}, {"text": "let", "start": 37.56, "end": 37.68, "confidence": 0.801, "speaker": "SPEAKER_00"}, {"text": "me", "start": 37.68, "end": 37.88, "confidence": 0.997, "speaker": "SPEAKER_00"}, {"text": "[*]", "start": 37.88, "end": 39.1, "confidence": 0.0, "speaker": "SPEAKER_00"}, {"text": "work", "start": 39.1, "end": 39.24, "confidence": 0.555, "speaker": "SPEAKER_00"}, {"text": "for", "start": 39.24, "end": 39.44, "confidence": 0.996, "speaker": "SPEAKER_00"}, {"text": "y'all.", "start": 39.44, "end": 39.68, "confidence": 0.986, "speaker": "SPEAKER_00"}], "speaker": "SPEAKER_00"}, {"id": 19, "seek": 2832, "start": 40.76, "end": 43.32, "text": " The play was cool, but it's like, yeah, fuck that.", "tokens": [50987, 383, 711, 373, 3608, 11, 475, 340, 338, 588, 11, 10194, 11, 5089, 326, 13, 51115], "temperature": 0.0, "avg_logprob": -0.23926071048707, "compression_ratio": 1.7264573991031391, "no_speech_prob": 0.014520817436277866, "confidence": 0.85, "words": [{"text": "The", "start": 40.76, "end": 40.9, "confidence": 0.989, "speaker": "SPEAKER_00"}, {"text": "play", "start": 40.9, "end": 41.1, "confidence": 0.846, "speaker": "SPEAKER_00"}, {"text": "was", "start": 41.1, "end": 41.28, "confidence": 0.999, "speaker": "SPEAKER_00"}, {"text": "cool,", "start": 41.28, "end": 41.56, "confidence": 0.997, "speaker": "SPEAKER_00"}, {"text": "but", "start": 41.66, "end": 41.76, "confidence": 0.977, "speaker": "SPEAKER_00"}, {"text": "it's", "start": 41.76, "end": 41.96, "confidence": 0.947, "speaker": "SPEAKER_00"}, {"text": "like,", "start": 41.96, "end": 42.2, "confidence": 0.997, "speaker": "SPEAKER_00"}, {"text": "[*]", "start": 42.32, "end": 42.48, "confidence": 0.0, "speaker": "SPEAKER_00"}, {"text": "yeah,", "start": 42.48, "end": 42.7, "confidence": 0.243, "speaker": "SPEAKER_00"}, {"text": "fuck", "start": 42.94, "end": 43.1, "confidence": 0.951, "speaker": "SPEAKER_00"}, {"text": "that.", "start": 43.1, "end": 43.32, "confidence": 0.997, "speaker": "SPEAKER_00"}], "speaker": "SPEAKER_00"}, {"id": 20, "seek": 2832, "start": 43.34, "end": 44.83, "text": " How much to buy this shit?", "tokens": [51115, 1374, 881, 284, 2822, 428, 7510, 30, 51197], "temperature": 0.0, "avg_logprob": -0.23926071048707, "compression_ratio": 1.7264573991031391, "no_speech_prob": 0.014520817436277866, "confidence": 0.954, "words": [{"text": "How", "start": 43.34, "end": 43.48, "confidence": 0.988, "speaker": "SPEAKER_00"}, {"text": "[*]", "start": 43.48, "end": 44.0, "confidence": 0.0, "speaker": "SPEAKER_00"}, {"text": "much", "start": 44.0, "end": 44.12, "confidence": 0.998, "speaker": "SPEAKER_00"}, {"text": "to", "start": 44.12, "end": 44.26, "confidence": 0.922, "speaker": "SPEAKER_00"}, {"text": "buy", "start": 44.26, "end": 44.46, "confidence": 0.995, "speaker": "SPEAKER_00"}, {"text": "this", "start": 44.46, "end": 44.64, "confidence": 0.833, "speaker": "SPEAKER_00"}, {"text": "shit?", "start": 44.64, "end": 44.83, "confidence": 0.998, "speaker": "SPEAKER_00"}], "speaker": "SPEAKER_00"}, {"id": 21, "seek": 2832, "start": 44.83, "end": 45.64, "text": " What you talkin' about?", "tokens": [51197, 1867, 345, 1561, 259, 6, 546, 30, 51259], "temperature": 0.0, "avg_logprob": -0.23926071048707, "compression_ratio": 1.7264573991031391, "no_speech_prob": 0.014520817436277866, "confidence": 0.802, "words": [{"text": "What", "start": 44.83, "end": 44.96, "confidence": 0.895, "speaker": "SPEAKER_00"}, {"text": "you", "start": 44.96, "end": 45.06, "confidence": 0.594, "speaker": "SPEAKER_00"}, {"text": "talkin'", "start": 45.06, "end": 45.46, "confidence": 0.862, "speaker": "SPEAKER_00"}, {"text": "about?", "start": 45.46, "end": 45.64, "confidence": 0.785}], "speaker": "SPEAKER_00"}, {"id": 22, "seek": 2832, "start": 45.98, "end": 47.38, "text": " How much to buy Def Ro first?", "tokens": [51259, 1374, 881, 284, 2822, 2896, 5564, 717, 30, 51359], "temperature": 0.0, "avg_logprob": -0.23926071048707, "compression_ratio": 1.7264573991031391, "no_speech_prob": 0.014520817436277866, "confidence": 0.959, "words": [{"text": "How", "start": 45.98, "end": 46.18, "confidence": 0.996, "speaker": "SPEAKER_00"}, {"text": "much", "start": 46.18, "end": 46.32, "confidence": 0.999, "speaker": "SPEAKER_00"}, {"text": "to", "start": 46.32, "end": 46.46, "confidence": 0.989, "speaker": "SPEAKER_00"}, {"text": "buy", "start": 46.46, "end": 46.64, "confidence": 0.998, "speaker": "SPEAKER_00"}, {"text": "Def", "start": 46.64, "end": 46.88, "confidence": 0.833, "speaker": "SPEAKER_00"}, {"text": "Ro", "start": 46.88, "end": 47.08, "confidence": 0.996, "speaker": "SPEAKER_00"}, {"text": "first?", "start": 47.08, "end": 47.38, "confidence": 0.913, "speaker": "SPEAKER_00"}], "speaker": "SPEAKER_00"}, {"id": 23, "seek": 2832, "start": 48.14, "end": 49.22, "text": " How much for my masters?", "tokens": [51359, 1374, 881, 329, 616, 18159, 30, 51462], "temperature": 0.0, "avg_logprob": -0.23926071048707, "compression_ratio": 1.7264573991031391, "no_speech_prob": 0.014520817436277866, "confidence": 0.931, "words": [{"text": "How", "start": 48.14, "end": 48.34, "confidence": 0.999, "speaker": "SPEAKER_00"}, {"text": "much", "start": 48.34, "end": 48.48, "confidence": 0.999, "speaker": "SPEAKER_00"}, {"text": "for", "start": 48.48, "end": 48.62, "confidence": 0.991, "speaker": "SPEAKER_00"}, {"text": "my", "start": 48.62, "end": 48.8, "confidence": 0.997, "speaker": "SPEAKER_00"}, {"text": "[*]", "start": 48.8, "end": 49.06, "confidence": 0.0, "speaker": "SPEAKER_00"}, {"text": "masters?", "start": 49.06, "end": 49.22, "confidence": 0.71, "speaker": "SPEAKER_00"}], "speaker": "SPEAKER_00"}, {"id": 24, "seek": 2832, "start": 50.18, "end": 51.48, "text": " How much for all of the masters?", "tokens": [51462, 1374, 881, 329, 477, 286, 262, 18159, 30, 51542], "temperature": 0.0, "avg_logprob": -0.23926071048707, "compression_ratio": 1.7264573991031391, "no_speech_prob": 0.014520817436277866, "confidence": 0.982, "words": [{"text": "How", "start": 50.18, "end": 50.4, "confidence": 0.998, "speaker": "SPEAKER_00"}, {"text": "much", "start": 50.4, "end": 50.54, "confidence": 0.999, "speaker": "SPEAKER_00"}, {"text": "for", "start": 50.54, "end": 50.7, "confidence": 0.998, "speaker": "SPEAKER_00"}, {"text": "all", "start": 50.7, "end": 50.88, "confidence": 0.999, "speaker": "SPEAKER_00"}, {"text": "of", "start": 50.88, "end": 50.98, "confidence": 0.971, "speaker": "SPEAKER_00"}, {"text": "the", "start": 50.98, "end": 51.08, "confidence": 0.919, "speaker": "SPEAKER_00"}, {"text": "masters?", "start": 51.08, "end": 51.48, "confidence": 0.994, "speaker": "SPEAKER_00"}], "speaker": "SPEAKER_00"}], "language_code": "en"} \ No newline at end of file diff --git a/tests/test_formatter.py b/tests/test_formatter.py index afaeae4..0509e57 100644 --- a/tests/test_formatter.py +++ b/tests/test_formatter.py @@ -1,6 +1,9 @@ +import math import json import numpy.testing as nptest +import pytest import textgrid +import warnings import formatter @@ -8,20 +11,34 @@ class TestFormatter(): Format = formatter.Formatter() def test_to_TextGrid(self): - for input_fname, ex_fname in self.provide_to_TextGrid(): + for input_fname, by_phrase in self.provide_to_TextGrid(): with open(input_fname) as f: case = json.load(f) - observed = self.Format.to_TextGrid(case) + observed = self.Format.to_TextGrid(case, by_phrase=by_phrase) - expected = textgrid.TextGrid() - expected.read(ex_fname) + assert observed.maxTime is not None + assert len(observed.tiers) > 0 - nptest.assert_array_equal(observed,expected) + def test_no_speaker_warning(self): + for input_fname in self.provide_no_speaker_warning(): + with open(input_fname) as f: + case = json.load(f) + with pytest.warns(UserWarning, match="No speaker for segment") as record: + _ = self.Format.to_TextGrid(case, by_phrase=False) def provide_to_TextGrid(self): return [ ( - 'tests/data/TestAudio_SnoopDogg_85SouthMedia_segments.json', - 'tests/data/TestAudio_SnoopDogg_85SouthMedia.TextGrid' + 'tests/data/TestAudio_SnoopDogg_85SouthMedia_WhisperTimestampSegments.json', + True + ), + ( + 'tests/data/TestAudio_SnoopDogg_85SouthMedia_WhisperTimestampSegments.json', + False ), ] + + def provide_no_speaker_warning(self): + return [ + 'tests/data/TestAudio_SnoopDogg_85SouthMedia.json', + ]