Skip to content

Commit

Permalink
add new cli commands
Browse files Browse the repository at this point in the history
Add new cli commands to make it easier to process YouTube playlists and
RSS feeds.
- The user can now use `preprocess-sources` to supply the sources.
Preprocessing will fetch all the given sources, and output them in JSON
alongside the available metadata.
- The JSON can then be edited (add missing metadata) and piped to
`transcribe-from-json` which will transcribe all the given sources.
  • Loading branch information
kouloumos committed Nov 24, 2023
1 parent 64dbbe0 commit 98e976d
Show file tree
Hide file tree
Showing 5 changed files with 190 additions and 23 deletions.
8 changes: 4 additions & 4 deletions Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ To check the version:

## Usage

`tstbtc {source_file/url} {directory}` transcribe the given source
`tstbtc transcribe {source_file/url} {directory}` transcribes the given source

Supported sources:
- YouTube videos
Expand Down Expand Up @@ -120,13 +120,13 @@ from Stephan Livera's podcast and add the associated metadata, we would run eith
of the below commands. The first uses short argument tags, while the second uses
long argument tags. The result is the same.

- `tstbtc Nq6WxJ0PgJ4 bitcointranscripts/stephan-livera-podcast -t 'OP_Vault - A New Way to HODL?' -d '2023-01-30' -T 'script' -T 'op_vault' -s 'James O’Beirne' -s 'Stephan Livera' -c ‘podcast’`
- `tstbtc Nq6WxJ0PgJ4 bitcointranscripts/stephan-livera-podcast --title 'OP_Vault - A New Way to HODL?' --date '2023-01-30' --tags 'script' --tags 'op_vault' --speakers 'James O’Beirne' --speakers 'Stephan Livera' --category ‘podcast’`
- `tstbtc transcribe Nq6WxJ0PgJ4 bitcointranscripts/stephan-livera-podcast -t 'OP_Vault - A New Way to HODL?' -d '2023-01-30' -T 'script' -T 'op_vault' -s 'James O’Beirne' -s 'Stephan Livera' -c 'podcast'`
- `tstbtc transcribe Nq6WxJ0PgJ4 bitcointranscripts/stephan-livera-podcast --title 'OP_Vault - A New Way to HODL?' --date '2023-01-30' --tags 'script' --tags 'op_vault' --speakers 'James O’Beirne' --speakers 'Stephan Livera' --category 'podcast'`

You can also transcribe a remote audio/mp3 link, such as the following from Stephan Livera's podcast:
```shell
mp3_link="https://anchor.fm/s/7d083a4/podcast/play/64348045/https%3A%2F%2Fd3ctxlq1ktw2nl.cloudfront.net%2Fstaging%2F2023-1-1%2Ff7fafb12-9441-7d85-d557-e9e5d18ab788.mp3"
tstbtc $mp3_link bitcointranscripts/stephan-livera-podcast --title 'SLP455 Anant Tapadia - Single Sig or Multi Sig?' --date '2023-02-01' --tags 'multisig' --speakers 'Anant Tapadia' --speakers 'Stephan Livera' --category 'podcast'
tstbtc transcribe $mp3_link bitcointranscripts/stephan-livera-podcast --title 'SLP455 Anant Tapadia - Single Sig or Multi Sig?' --date '2023-02-01' --tags 'multisig' --speakers 'Anant Tapadia' --speakers 'Stephan Livera' --category 'podcast'
```

## Testing
Expand Down
34 changes: 32 additions & 2 deletions app/transcript.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,12 +235,14 @@ def initialize(self):


class Audio(Source):
def __init__(self, source):
def __init__(self, source, description=None, chapters=None):
    """Build an Audio source from an already-populated base Source.

    Args:
        source: Source whose fields (source_file, link, loc, local, title,
            event_date, tags, category, speakers, preprocess) are copied.
        description: Optional free-text description of the audio.
        chapters: Optional list of chapters. When omitted, a fresh empty
            list is created for this instance.

    Raises:
        Exception: If the base initialization or the source configuration
            fails; the message wraps the underlying error.
    """
    try:
        # initialize source using a base Source
        super().__init__(source_file=source.source_file, link=source.link, loc=source.loc, local=source.local, title=source.title,
                         date=source.event_date, tags=source.tags, category=source.category, speakers=source.speakers, preprocess=source.preprocess)
        self.type = "audio"
        self.description = description
        # A `chapters=[]` default would be one list shared by every
        # instance created without chapters; allocate per instance instead.
        self.chapters = [] if chapters is None else chapters
        self.__config_source()
    except Exception as e:
        raise Exception(f"Error during Audio creation: {e}") from e
Expand Down Expand Up @@ -299,6 +301,20 @@ def download_audio():
except Exception as e:
raise Exception(f"Error processing audio file: {e}")

def to_json(self):
    """Return this audio source's metadata as a JSON-serializable dict.

    The date is formatted as ``YYYY-MM-DD``. When no event date is set the
    ``date`` entry is ``None`` instead of raising ``AttributeError``.
    """
    event_date = self.event_date
    return {
        'type': self.type,
        'loc': self.loc,
        "source_file": self.source_file,
        "media": self.media,
        'title': self.title,
        'categories': self.category,
        'tags': self.tags,
        'speakers': self.speakers,
        # Guard the missing-date case so one undated source cannot abort
        # the serialization of every other source.
        'date': event_date.strftime("%Y-%m-%d") if event_date is not None else None,
        'description': self.description,
        'chapters': self.chapters,
    }


class Video(Source):
Expand Down Expand Up @@ -418,6 +434,20 @@ def extract_chapters_from_downloaded_video_metadata():
except Exception as e:
raise Exception(f"Error processing video file: {e}")

def to_json(self):
    """Return this video source's metadata as a JSON-serializable dict.

    The date is formatted as ``YYYY-MM-DD``. When no event date is set the
    ``date`` entry is ``None`` instead of raising ``AttributeError``.
    """
    event_date = self.event_date
    return {
        'type': self.type,
        'loc': self.loc,
        "source_file": self.source_file,
        'title': self.title,
        'categories': self.category,
        'tags': self.tags,
        'speakers': self.speakers,
        # Guard the missing-date case so one undated source cannot abort
        # the serialization of every other source.
        'date': event_date.strftime("%Y-%m-%d") if event_date is not None else None,
        'chapters': self.chapters,
        'youtube': self.youtube_metadata
    }


class Playlist(Source):
def __init__(self, source, entries, preprocess=False):
Expand Down Expand Up @@ -462,7 +492,7 @@ def __config_source(self):
if enclosure.type in ['audio/mpeg', 'audio/wav', 'audio/x-m4a']:
published_date = date(*entry.published_parsed[:3])
source = Audio(Source(enclosure.href, self.loc, self.local, entry.title, published_date, self.tags,
self.category, self.speakers, self.preprocess, link=entry.link))
self.category, self.speakers, self.preprocess, link=entry.link), description=entry.description)
self.entries.append(source)
else:
self.logger.warning(
Expand Down
5 changes: 5 additions & 0 deletions app/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ def write_to_json(json_data, output_dir, filename, add_timestamp=True):
return file_path


def check_if_valid_file_path(file_path):
    """Raise if *file_path* does not point to an existing regular file.

    Args:
        file_path: A ``str`` or ``os.PathLike`` (e.g. ``pathlib.Path``)
            naming the file to check.

    Raises:
        Exception: If *file_path* is neither a string nor a path-like
            object, or if no file exists at that location.
    """
    # Accept path-like objects as well as plain strings; anything else
    # (None, bytes, numbers) is rejected before touching the filesystem.
    is_path = isinstance(file_path, (str, os.PathLike))
    if not is_path or not os.path.isfile(file_path):
        raise Exception(f"Not a valid file: {file_path}")


def get_status():
"""Helper method to fetch and store status.json locally"""
STATUS_FILE_PATH = "status.json" # the file path for storing the status locally
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,6 @@
],
entry_points="""
[console_scripts]
tstbtc=transcriber:add
tstbtc=transcriber:cli
""",
)
164 changes: 148 additions & 16 deletions transcriber.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import logging
import tempfile

Expand All @@ -7,22 +8,32 @@
from app.transcript import Transcript
from app.transcription import Transcription
from app.logging import configure_logger, get_logger
from app.utils import check_if_valid_file_path, write_to_json

logger = get_logger()


# Root click command group; subcommands are registered on it with
# `@cli.command()`.
@click.group()
def cli():
    # The group itself performs no work.
    pass


def print_version(ctx, param, value):
    """Option callback: print the application name and version, then exit.

    Nothing happens when the flag was not supplied (``value`` is falsy) or
    when the context is in resilient-parsing mode.
    """
    version_requested = value and not ctx.resilient_parsing
    if version_requested:
        click.echo(f"{__app_name__} v{__version__}")
        ctx.exit()


# Root click command group; subcommands are registered on it with
# `@cli.command()`. The `-v/--version` flag is eager and does not expose a
# value to `cli`; it is handled entirely by the `print_version` callback.
@click.option(
    "-v",
    "--version",
    is_flag=True,
    callback=print_version,
    expose_value=False,
    is_eager=True,
    help="Show the application's version and exit.",
)
@click.group()
def cli():
    # The group itself performs no work.
    pass


def print_help(ctx, param, value):
if not value or ctx.resilient_parsing:
return
Expand Down Expand Up @@ -130,7 +141,7 @@ def print_help(ctx, param, value):

@cli.command()
@click.argument("source", nargs=1)
@click.argument("loc", nargs=1) # location in the bitcointranscripts hierarchy
@click.argument("loc", nargs=1) # location in the bitcointranscripts hierarchy
# Available transcription models and services
@whisper
@deepgram
Expand Down Expand Up @@ -176,16 +187,7 @@ def print_help(ctx, param, value):
@model_output_dir
@nocleanup
@verbose_logging
@click.option(
"-v",
"--version",
is_flag=True,
callback=print_version,
expose_value=False,
is_eager=True,
help="Show the application's version and exit.",
)
def add(
def transcribe(
source: str,
loc: str,
model: str,
Expand Down Expand Up @@ -245,3 +247,133 @@ def add(
except Exception as e:
logger.error(e)
logger.info(f"Exited with error, not cleaning up temp files: {tmp_dir}")


# CLI command: read a JSON list of sources and transcribe each of them.
# Each entry must provide "source_file", "title" and "date"; "speakers",
# "categories", "tags", "loc", "youtube" and "chapters" are optional.
# The docstring below is left unchanged.
@cli.command()
@click.argument("json_file", nargs=1)
@whisper
@deepgram
@diarize
@summarize
@use_youtube_chapters
@open_pr
@upload_to_s3
@save_to_markdown
@noqueue
@model_output_dir
@nocleanup
@verbose_logging
def transcribe_from_json(
    json_file: str,
    model: str,
    chapters: bool,
    deepgram: bool,
    diarize: bool,
    summarize: bool,
    pr: bool,
    upload: bool,
    markdown: bool,
    noqueue: bool,
    model_output_dir: str,
    nocleanup: bool,
    verbose: bool,
):
    """Supply sources in a JSON file for transcription.
    The JSON can be generated by `preprocess-sources` or created manually.
    """
    try:
        check_if_valid_file_path(json_file)
        tmp_dir = tempfile.mkdtemp()
        configure_logger(logging.DEBUG if verbose else logging.INFO, tmp_dir)
        logger.info(f"Adding transcripts from {json_file}")
        # NOTE(review): `pr` is accepted but not forwarded to Transcription
        # here -- confirm whether that is intentional.
        transcription = Transcription(
            model=model,
            deepgram=deepgram,
            chapters=chapters,
            diarize=diarize,
            summarize=summarize,
            upload=upload,
            markdown=markdown,
            queue=not noqueue,
            model_output_dir=model_output_dir,
            nocleanup=nocleanup,
            working_dir=tmp_dir
        )

        # Decode explicitly as UTF-8 so non-ASCII titles and speaker names
        # do not depend on the platform's locale encoding.
        with open(json_file, 'r', encoding="utf-8") as file:
            sources = json.load(file)

        for source in sources:
            # Configure metadata given from JSON
            speakers = source.get("speakers", [])
            category = source.get("categories", [])
            tags = source.get("tags", [])
            loc = source.get("loc", "")
            youtube_metadata = source.get("youtube", None)
            # Hand-written JSON may omit "chapters"; treat it as optional
            # like the other metadata instead of failing with KeyError.
            source_chapters = source.get("chapters", [])
            transcription.add_transcription_source(
                source_file=source["source_file"], loc=loc,
                title=source["title"], category=category, tags=tags,
                speakers=speakers, date=source["date"],
                youtube_metadata=youtube_metadata,
                chapters=source_chapters
            )

        transcription.start()
        if nocleanup:
            logger.info("Not cleaning up temp files...")
        else:
            transcription.clean_up()

    except Exception as e:
        logger.error(e)


# CLI command: read a JSON list of sources, preprocess each one, and write
# the collected results to a "preprocessed_sources" JSON file.
# Each entry must provide "title" and "source"; "speakers", "categories",
# "tags", "loc" and the excluded-media list are optional.
# The docstring below is left unchanged.
@cli.command()
@click.argument("json_file", nargs=1)
@click.option(
    "--nocheck",
    is_flag=True,
    default=False,
    help="Do not check for existing sources using btctranscripts.com/status.json",
)
def preprocess_sources(json_file, nocheck):
    """Supply sources in a JSON file for preprocess. Preprocessing will fetch
    all the given sources, and output them in a JSON alongside the available
    metadata. The JSON can then be edited and piped to `transcribe-from-json`
    """
    try:
        configure_logger(log_level=logging.INFO)
        check_if_valid_file_path(json_file)
        transcription = Transcription()
        # The `with` block closes the file; decode explicitly as UTF-8 so
        # non-ASCII metadata does not depend on the platform locale.
        with open(json_file, "r", encoding="utf-8") as infile:
            sources = json.load(infile)
        logger.info(f"Sources detected: {len(sources)}")
        transcription_sources = []
        for source in sources:
            logger.info(f"Preprocessing {source['title']}: {source['source']}")
            # Configure metadata given from source
            speakers = source.get("speakers", [])
            category = source.get("categories", [])
            tags = source.get("tags", [])
            loc = source.get("loc", "")
            excluded_media = source.get(
                "existing_entries_not_covered_by_btctranscripts/status.json", [])
            excluded_media = [entry["media"] for entry in excluded_media]
            # Keep the call result under its own name so the collection
            # step below does not rebind the variable it is reading from.
            result = transcription.add_transcription_source(
                source['source'], loc=loc, tags=tags, category=category,
                speakers=speakers, nocheck=nocheck, preprocess=True,
                excluded_media=excluded_media
            )
            transcription_sources.extend(result["added"])
        # Write all preprocessed sources to JSON
        write_to_json([entry.to_json() for entry in transcription_sources],
                      transcription.model_output_dir, "preprocessed_sources")
    except Exception as e:
        logger.error(e)
        logger.info("Exited with error")


if __name__ == '__main__':
cli()

0 comments on commit 98e976d

Please sign in to comment.