Skip to content

Commit

Permalink
Merge pull request #1127 from jqnatividad/describegpt/jsonl
Browse files Browse the repository at this point in the history
describegpt: add --jsonl option (resolves #1086)
  • Loading branch information
jqnatividad authored Jul 12, 2023
2 parents de7b220 + e3e56be commit 73db2ba
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 26 deletions.
31 changes: 30 additions & 1 deletion docs/Describegpt.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,34 @@ You may often see this error when `--max-tokens` is set too low and therefore th

The invalid output will be printed in `stderr`.

Note that `--json` may not be used alongside `--jsonl`, nor may they both be set to true in a prompt file at the same time. This will result in an error.

## `--jsonl`

Similar to `--json`, you can use the `--jsonl` option to expect [JSON Lines](https://jsonlines.org/) output.

If you use `--output` with `--jsonl`, the output is appended to the file, creating the file first if it does not already exist. Each inference option (`--dictionary`, `--description`, `--tags`) is written as its own line in the file.

If you use `--prompt-file` with `--jsonl`, the prompt name and timestamp will also be included in the JSONL output for each inference option.

Note that **the `--jsonl` option does not indicate to your prompt that you want to generate JSONL output based on your dataset**. It instead ensures the command output is in JSONL format. You must specify in your prompt to make a completion in JSON format, such as adding the phrase "in JSON format" to your prompt, and this will then be parsed into JSONL format by `describegpt`.

If the prompt output is not in valid JSON format but the `--jsonl` option is specified, the command will generate a default error JSON output printed to `stdout`, such as the following:

```json
{
"option": {
"error": "Invalid JSON output for option."
}
}
```

You may often see this error when `--max-tokens` is set too low and therefore the output is incomplete.

The invalid output will be printed in `stderr`.

Note that `--jsonl` may not be used alongside `--json`, nor may they both be set to true in a prompt file at the same time. This will result in an error.

## `--max-tokens <value>`

`--max-tokens` is an option that allows you to specify the maximum number of tokens in the completion **output**. This is limited by the maximum number of tokens allowed by the model including the input tokens.
Expand Down Expand Up @@ -80,7 +108,8 @@ Here is an example of a prompt:
"dictionary_prompt": "Here are the columns for each field in a data dictionary:\n\n- Type: the data type of this column\n- Label: a human-friendly label for this column\n- Description: a full description for this column (can be multiple sentences)\n\nGenerate a data dictionary as aforementioned{json_add} where each field has Name, Type, Label, and Description (so four columns in total) based on the following summary statistics and frequency data from a CSV file.\n\nSummary Statistics:\n\n{stats}\n\nFrequency:\n\n{frequency}",
"description_prompt": "Generate only a description that is within 8 sentences about the entire dataset{json_add} based on the following summary statistics and frequency data derived from the CSV file it came from.\n\nSummary Statistics:\n\n{stats}\n\nFrequency:\n\n{frequency}\n\nDo not output the summary statistics for each field. Do not output the frequency for each field. Do not output data about each field individually, but instead output about the dataset as a whole in one 1-8 sentence description.",
"tags_prompt": "A tag is a keyword or label that categorizes datasets with other, similar datasets. Using the right tags makes it easier for others to find and use datasets.\n\nGenerate single-word tags{json_add} about the dataset (lowercase only and remove all whitespace) based on the following summary statistics and frequency data from a CSV file.\n\nSummary Statistics:\n\n{stats}\n\nFrequency:\n\n{frequency}",
"json": true
"json": true,
"jsonl": false
}
```

Expand Down
107 changes: 82 additions & 25 deletions src/cmd/describegpt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ describegpt options:
--max-tokens <value> Limits the number of generated tokens in the output.
[default: 50]
--json Return results in JSON format.
--jsonl Return results in JSON Lines format.
--prompt-file <file> The JSON file containing the prompts to use for inferencing.
If not specified, default prompts will be used.
--model <model> The model to use for inferencing.
Expand Down Expand Up @@ -61,6 +62,7 @@ struct Args {
flag_max_tokens: u16,
flag_model: Option<String>,
flag_json: bool,
flag_jsonl: bool,
flag_prompt_file: Option<String>,
flag_user_agent: Option<String>,
flag_timeout: u16,
Expand All @@ -78,6 +80,7 @@ struct PromptFile {
description_prompt: String,
tags_prompt: String,
json: bool,
jsonl: bool,
}

const OPENAI_KEY_ERROR: &str = "Error: QSV_OPENAI_KEY environment variable not found.\nNote that \
Expand Down Expand Up @@ -224,6 +227,7 @@ fn get_prompt_file(args: &Args) -> CliResult<PromptFile> {
Statistics:\n\n{stats}\n\nFrequency:\n\n{frequency}"
.to_string(),
json: true,
jsonl: false,
};
default_prompt_file
};
Expand Down Expand Up @@ -255,7 +259,10 @@ fn get_prompt(
.replace("{frequency}", frequency.unwrap_or(""))
.replace(
"{json_add}",
if prompt_file.json || (args.flag_prompt_file.is_none() && args.flag_json) {
if prompt_file.json
|| prompt_file.jsonl
|| (args.flag_prompt_file.is_none() && (args.flag_json || args.flag_jsonl))
{
" (in JSON format)"
} else {
""
Expand Down Expand Up @@ -327,6 +334,41 @@ fn get_completion(
Ok(completion.to_string())
}

// Check if JSON output is expected.
//
// A prompt file, when supplied via --prompt-file, takes precedence: its
// "json" field decides. Otherwise the --json CLI flag decides. Defaults
// to plaintext output (false) when neither is set.
fn is_json_output(args: &Args) -> CliResult<bool> {
    let json_output = if args.flag_prompt_file.is_some() {
        // --prompt-file is used: honor the prompt file's "json" field
        get_prompt_file(args)?.json
    } else {
        // No prompt file: fall back to the --json CLI flag
        args.flag_json
    };
    Ok(json_output)
}
// Check if JSONL output is expected.
//
// A prompt file, when supplied via --prompt-file, takes precedence: its
// "jsonl" field decides. Otherwise the --jsonl CLI flag decides. Defaults
// to plaintext output (false) when neither is set.
fn is_jsonl_output(args: &Args) -> CliResult<bool> {
    let jsonl_output = if args.flag_prompt_file.is_some() {
        // --prompt-file is used: honor the prompt file's "jsonl" field
        get_prompt_file(args)?.jsonl
    } else {
        // No prompt file: fall back to the --jsonl CLI flag
        args.flag_jsonl
    };
    Ok(jsonl_output)
}

// Generates output for all inference options
fn run_inference_options(
args: &Args,
Expand All @@ -351,32 +393,15 @@ fn run_inference_options(
.replace("\\'", "'")
.replace("\\`", "`")
}
// Check if JSON output is expected
// NOTE(review): this nested helper appears to duplicate the module-level
// is_json_output function shown earlier in this diff — presumably this is the
// pre-refactor copy being hoisted out of run_inference_options; confirm only
// one copy remains in the final file.
fn is_json_output(args: &Args) -> CliResult<bool> {
// By default expect plaintext output
let mut json_output = false;
// Set expect_json to true if --prompt-file is used & the "json" field is true
if args.flag_prompt_file.is_some() {
// Re-parses the prompt file on every call; get_prompt_file is the source of truth
let prompt_file = get_prompt_file(args)?;
if prompt_file.json {
json_output = true;
}
}
// Set expect_json to true if --prompt-file is not used & --json is used
else if args.flag_json {
json_output = true;
}
Ok(json_output)
}
// Generate the plaintext and/or JSON output of an inference option
fn process_output(
option: &str,
output: &str,
total_json_output: &mut serde_json::Value,
args: &Args,
) -> CliResult<()> {
// Process JSON output
if is_json_output(args)? {
// Process JSON output if expected or JSONL output is expected
if is_json_output(args)? || is_jsonl_output(args)? {
// Parse the completion JSON
let completion_json: serde_json::Value = match serde_json::from_str(output) {
// Output is valid JSON
Expand Down Expand Up @@ -462,14 +487,35 @@ fn run_inference_options(
process_output("tags", &completion, &mut total_json_output, args)?;
}

if is_json_output(args)? {
// Print all JSON output
// Expecting JSON output
if is_json_output(args)? && !is_jsonl_output(args)? {
// Format & print JSON output
let formatted_output =
format_output(&serde_json::to_string_pretty(&total_json_output).unwrap());
println!("{formatted_output}");
// If --output is used, write JSON to file
if let Some(output) = args.flag_output.clone() {
fs::write(output, formatted_output)?;
// Write to file if --output is used, or overwrite if already exists
if let Some(output_file_path) = args.flag_output.clone() {
fs::write(output_file_path, formatted_output)?;
}
}
// Expecting JSONL output
else if is_jsonl_output(args)? {
// If --prompt-file is used, add prompt file name and timestamp to JSONL output
if args.flag_prompt_file.clone().is_some() {
let prompt_file = get_prompt_file(args)?;
total_json_output["prompt_file"] = json!(prompt_file.name);
total_json_output["timestamp"] = json!(chrono::offset::Utc::now().to_rfc3339());
}
// Format & print JSONL output
let formatted_output = format_output(&serde_json::to_string(&total_json_output).unwrap());
println!("{formatted_output}");
// Write to file if --output is used, or append if already exists
if let Some(output_file_path) = args.flag_output.clone() {
fs::OpenOptions::new()
.create(true)
.append(true)
.open(output_file_path)?
.write_all(format!("\n{formatted_output}").as_bytes())?;
}
}

Expand Down Expand Up @@ -535,6 +581,17 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
} else if args.flag_all && (args.flag_dictionary || args.flag_description || args.flag_tags) {
return fail!("Error: --all option cannot be specified with other inference flags.");
}
// If --prompt-file flag is specified but the prompt file does not exist, print error message.
if let Some(prompt_file) = args.flag_prompt_file.clone() {
if !PathBuf::from(prompt_file.clone()).exists() {
let error_msg = format!("Error: Prompt file '{prompt_file}' does not exist.");
return fail!(error_msg);
}
}
// If --json and --jsonl flags are specified, print error message.
if is_json_output(&args)? && is_jsonl_output(&args)? {
return fail!("Error: --json and --jsonl options cannot be specified together.");
}

// Get qsv executable's path
let qsv_path = env::current_exe().unwrap();
Expand Down

0 comments on commit 73db2ba

Please sign in to comment.