add new cli commands

Add new CLI commands to make it easier to process YouTube playlists and RSS feeds. The user can now use `preprocess-sources` to supply the sources: preprocessing fetches all the given sources and outputs them as JSON alongside the available metadata. The JSON can then be edited (to add missing metadata) and passed to `transcribe-from-json`, which transcribes all the given sources.
bitcointranscripts · Nov 24, 2023 · 98e976d · 98e976d
1 parent 64dbbe0
commit 98e976d
Show file tree

Hide file tree

Showing 5 changed files with 190 additions and 23 deletions.
diff --git a/Readme.md b/Readme.md
@@ -79,7 +79,7 @@ To check the version:
 
 ## Usage
 
-`tstbtc {source_file/url} {directory}` transcribe the given source
+`tstbtc transcribe {source_file/url} {directory}` transcribe the given source
 
 Supported sources:
   - YouTube videos
@@ -120,13 +120,13 @@ from Stephan Livera's podcast and add the associated metadata, we would run eith
 of the below commands. The first uses short argument tags, while the second uses
 long argument tags. The result is the same.
 
-- `tstbtc Nq6WxJ0PgJ4 bitcointranscripts/stephan-livera-podcast -t 'OP_Vault - A New Way to HODL?' -d '2023-01-30' -T 'script' -T 'op_vault' -s 'James O’Beirne' -s 'Stephan Livera' -c ‘podcast’`
-- `tstbtc Nq6WxJ0PgJ4 bitcointranscripts/stephan-livera-podcast --title 'OP_Vault - A New Way to HODL?' --date '2023-01-30' --tags 'script' --tags 'op_vault' --speakers 'James O’Beirne' --speakers 'Stephan Livera' --category ‘podcast’`
+- `tstbtc transcribe Nq6WxJ0PgJ4 bitcointranscripts/stephan-livera-podcast -t 'OP_Vault - A New Way to HODL?' -d '2023-01-30' -T 'script' -T 'op_vault' -s 'James O’Beirne' -s 'Stephan Livera' -c 'podcast'`
+- `tstbtc transcribe Nq6WxJ0PgJ4 bitcointranscripts/stephan-livera-podcast --title 'OP_Vault - A New Way to HODL?' --date '2023-01-30' --tags 'script' --tags 'op_vault' --speakers 'James O’Beirne' --speakers 'Stephan Livera' --category 'podcast'`
 
 You can also transcribe a remote audio/mp3 link, such as the following from Stephan Livera's podcast: 
 ```shell
 mp3_link="https://anchor.fm/s/7d083a4/podcast/play/64348045/https%3A%2F%2Fd3ctxlq1ktw2nl.cloudfront.net%2Fstaging%2F2023-1-1%2Ff7fafb12-9441-7d85-d557-e9e5d18ab788.mp3"
-tstbtc $mp3_link bitcointranscripts/stephan-livera-podcast --title 'SLP455 Anant Tapadia - Single Sig or Multi Sig?' --date '2023-02-01' --tags 'multisig' --speakers 'Anant Tapadia' --speakers 'Stephan Livera' --category 'podcast'
+tstbtc transcribe $mp3_link bitcointranscripts/stephan-livera-podcast --title 'SLP455 Anant Tapadia - Single Sig or Multi Sig?' --date '2023-02-01' --tags 'multisig' --speakers 'Anant Tapadia' --speakers 'Stephan Livera' --category 'podcast'
 ```
 
 ## Testing

diff --git a/app/transcript.py b/app/transcript.py
@@ -235,12 +235,14 @@ def initialize(self):
 
 
 class Audio(Source):
-    def __init__(self, source):
+    def __init__(self, source, description=None, chapters=[]):
         try:
             # initialize source using a base Source
             super().__init__(source_file=source.source_file, link=source.link, loc=source.loc, local=source.local, title=source.title,
                              date=source.event_date, tags=source.tags, category=source.category, speakers=source.speakers, preprocess=source.preprocess)
             self.type = "audio"
+            self.description = description
+            self.chapters = chapters
             self.__config_source()
         except Exception as e:
             raise Exception(f"Error during Audio creation: {e}")
@@ -299,6 +301,20 @@ def download_audio():
         except Exception as e:
             raise Exception(f"Error processing audio file: {e}")
 
+    def to_json(self):
+        return {
+            'type': self.type,
+            'loc': self.loc,
+            "source_file": self.source_file,
+            "media": self.media,
+            'title': self.title,
+            'categories': self.category,
+            'tags': self.tags,
+            'speakers': self.speakers,
+            'date': self.event_date.strftime("%Y-%m-%d"),
+            'description': self.description,
+            'chapters': self.chapters,
+        }
 
 
 class Video(Source):
@@ -418,6 +434,20 @@ def extract_chapters_from_downloaded_video_metadata():
         except Exception as e:
             raise Exception(f"Error processing video file: {e}")
 
+    def to_json(self):
+        return {
+            'type': self.type,
+            'loc': self.loc,
+            "source_file": self.source_file,
+            'title': self.title,
+            'categories': self.category,
+            'tags': self.tags,
+            'speakers': self.speakers,
+            'date': self.event_date.strftime("%Y-%m-%d"),
+            'chapters': self.chapters,
+            'youtube': self.youtube_metadata
+        }
+
 
 class Playlist(Source):
     def __init__(self, source, entries, preprocess=False):
@@ -462,7 +492,7 @@ def __config_source(self):
             if enclosure.type in ['audio/mpeg', 'audio/wav', 'audio/x-m4a']:
                 published_date = date(*entry.published_parsed[:3])
                 source = Audio(Source(enclosure.href, self.loc, self.local, entry.title, published_date, self.tags,
-                               self.category, self.speakers, self.preprocess, link=entry.link))
+                               self.category, self.speakers, self.preprocess, link=entry.link), description=entry.description)
                 self.entries.append(source)
             else:
                 self.logger.warning(

diff --git a/app/utils.py b/app/utils.py
@@ -26,6 +26,11 @@ def write_to_json(json_data, output_dir, filename, add_timestamp=True):
     return file_path
 
 
+def check_if_valid_file_path(file_path):
+    if not isinstance(file_path, str) or not os.path.isfile(file_path):
+        raise Exception(f"Not a valid file: {file_path}")
+
+
 def get_status():
     """Helper method to fetch and store status.json locally"""
     STATUS_FILE_PATH = "status.json"  # the file path for storing the status locally

diff --git a/setup.py b/setup.py
@@ -27,6 +27,6 @@
     ],
     entry_points="""
         [console_scripts]
-        tstbtc=transcriber:add
+        tstbtc=transcriber:cli
     """,
 )
diff --git a/transcriber.py b/transcriber.py
@@ -1,3 +1,4 @@
+import json
 import logging
 import tempfile
 
@@ -7,22 +8,32 @@
 from app.transcript import Transcript
 from app.transcription import Transcription
 from app.logging import configure_logger, get_logger
+from app.utils import check_if_valid_file_path, write_to_json
 
 logger = get_logger()
 
 
-@click.group()
-def cli():
-    pass
-
-
 def print_version(ctx, param, value):
     """Click option callback: print the app name and version, then exit.

     Does nothing when the flag is not set or click is in resilient parsing mode.
     """
     if value and not ctx.resilient_parsing:
         click.echo(f"{__app_name__} v{__version__}")
         ctx.exit()
 
 
+@click.option(
+    "-v",
+    "--version",
+    is_flag=True,
+    callback=print_version,
+    expose_value=False,
+    is_eager=True,
+    help="Show the application's version and exit.",
+)
+@click.group()
+def cli():
+    pass
+
+
 def print_help(ctx, param, value):
     if not value or ctx.resilient_parsing:
         return
@@ -130,7 +141,7 @@ def print_help(ctx, param, value):
 
 @cli.command()
 @click.argument("source", nargs=1)
-@click.argument("loc", nargs=1) # location in the bitcointranscripts hierarchy
+@click.argument("loc", nargs=1)  # location in the bitcointranscripts hierarchy
 # Available transcription models and services
 @whisper
 @deepgram
@@ -176,16 +187,7 @@ def print_help(ctx, param, value):
 @model_output_dir
 @nocleanup
 @verbose_logging
-@click.option(
-    "-v",
-    "--version",
-    is_flag=True,
-    callback=print_version,
-    expose_value=False,
-    is_eager=True,
-    help="Show the application's version and exit.",
-)
-def add(
+def transcribe(
     source: str,
     loc: str,
     model: str,
@@ -245,3 +247,133 @@ def add(
     except Exception as e:
         logger.error(e)
         logger.info(f"Exited with error, not cleaning up temp files: {tmp_dir}")
+
+
+@cli.command()
+@click.argument("json_file", nargs=1)
+@whisper
+@deepgram
+@diarize
+@summarize
+@use_youtube_chapters
+@open_pr
+@upload_to_s3
+@save_to_markdown
+@noqueue
+@model_output_dir
+@nocleanup
+@verbose_logging
+def transcribe_from_json(
+    json_file: str,
+    model: str,
+    chapters: bool,
+    deepgram: bool,
+    diarize: bool,
+    summarize: bool,
+    pr: bool,
+    upload: bool,
+    markdown: bool,
+    noqueue: bool,
+    model_output_dir: str,
+    nocleanup: bool,
+    verbose: bool,
+):
+    """Supply sources in a JSON file for transcription.
+    The JSON can be generated by `preprocess-sources` or created manually.
+    """
+    try:
+        check_if_valid_file_path(json_file)
+        tmp_dir = tempfile.mkdtemp()
+        configure_logger(logging.DEBUG if verbose else logging.INFO, tmp_dir)
+        logger.info(f"Adding transcripts from {json_file}")
+        transcription = Transcription(
+            model=model,
+            deepgram=deepgram,
+            chapters=chapters,
+            diarize=diarize,
+            summarize=summarize,
+            upload=upload,
+            markdown=markdown,
+            queue=not noqueue,
+            model_output_dir=model_output_dir,
+            nocleanup=nocleanup,
+            working_dir=tmp_dir
+        )
+
+        with open(json_file, 'r') as file:
+            sources = json.load(file)
+
+        for source in sources:
+            # Configure metadata given from JSON
+            speakers = source.get("speakers", [])
+            category = source.get("categories", [])
+            tags = source.get("tags", [])
+            loc = source.get("loc", "")
+            youtube_metadata = source.get("youtube", None)
+            transcription.add_transcription_source(
+                source_file=source["source_file"], loc=loc,
+                title=source["title"], category=category, tags=tags,
+                speakers=speakers, date=source["date"],
+                youtube_metadata=youtube_metadata,
+                chapters=source["chapters"]
+            )
+
+        transcription.start()
+        if nocleanup:
+            logger.info("Not cleaning up temp files...")
+        else:
+            transcription.clean_up()
+
+    except Exception as e:
+        logger.error(e)
+
+
+@cli.command()
+@click.argument("json_file", nargs=1)
+@click.option(
+    "--nocheck",
+    is_flag=True,
+    default=False,
+    help="Do not check for existing sources using btctranscripts.com/status.json",
+)
+def preprocess_sources(json_file, nocheck):
+    """Supply sources in a JSON file for preprocess. Preprocessing will fetch
+    all the given sources, and output them in a JSON alongside the available
+    metadata. The JSON can then be edited and piped to `transcribe-from-json`
+    """
+    try:
+        configure_logger(log_level=logging.INFO)
+        check_if_valid_file_path(json_file)
+        transcription = Transcription()
+        with open(json_file, "r") as outfile:
+            sources = json.load(outfile)
+            outfile.close()
+        logger.info(f"Sources detected: {len(sources)}")
+        transcription_sources = []
+        for source in sources:
+            logger.info(f"Preprocessing {source['title']}: {source['source']}")
+            # Configure metadata given from source
+            speakers = source.get("speakers", [])
+            category = source.get("categories", [])
+            tags = source.get("tags", [])
+            loc = source.get("loc", "")
+            excluded_media = source.get(
+                "existing_entries_not_covered_by_btctranscripts/status.json", [])
+            excluded_media = [entry["media"] for entry in excluded_media]
+            transcription_source = transcription.add_transcription_source(
+                source['source'], loc=loc, tags=tags, category=category,
+                speakers=speakers, nocheck=nocheck, preprocess=True,
+                excluded_media=excluded_media
+            )
+            for transcription_source in transcription_source["added"]:
+                transcription_sources.append(transcription_source)
+        # Write all preprocessed sources to JSON
+        write_to_json([source.to_json() for source in transcription_sources],
+                      transcription.model_output_dir, "preprocessed_sources")
+    except Exception as e:
+        logger.error(e)
+        logger.info(f"Exited with error")
+
+
+if __name__ == '__main__':
+    cli()