From dab0f50d93c1f9f35c5113e309001d113cdaab39 Mon Sep 17 00:00:00 2001
From: Jiayu Liu
Date: Thu, 1 Feb 2024 09:50:37 +0800
Subject: [PATCH] Mkl, accelerate, readme, heatmap chunking (#18)

---
 Cargo.toml         |  14 +++-
 README.md          |  44 ++++++++----
 src/bbox.rs        |   6 +-
 src/bin/main.rs    | 168 ++++++++++++++++++++++++++++++++++++---------
 src/postprocess.rs |  86 +++++++++++++++++------
 src/preprocess.rs  |  36 +---------
 6 files changed, 248 insertions(+), 106 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 356bac0..481726b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -4,6 +4,7 @@ version = "0.3.0"
 edition = "2021"
 description = "Surya is a multilingual document OCR toolkit, original implementation in Python and PyTorch"
 license = "Apache-2.0"
+authors = ["Jiayu Liu "]
 repository = "https://github.com/jimexist/surya-rs"
 default-run = "surya"
 
@@ -21,10 +22,21 @@ opencv = { version = "0.88.8", default-features = false, features = [
 ] }
 serde = { version = "1.0.196" }
 serde_json = { version = "1.0.112" }
-thiserror = "1.0.56"
+accelerate-src = { version = "0.3.2", optional = true }
+intel-mkl-src = { version = "0.8.1", features = [
+    "mkl-static-lp64-iomp",
+], optional = true }
+thiserror = { version = "1.0.56" }
 
 [features]
+default = ["cli"]
 metal = ["candle-core/metal", "candle-nn/metal"]
+accelerate = [
+    "accelerate-src",
+    "candle-core/accelerate",
+    "candle-nn/accelerate",
+]
+mkl = ["intel-mkl-src", "candle-core/mkl", "candle-nn/mkl"]
 cli = ["clap", "anyhow"]
 
 [[bin]]
diff --git a/README.md b/README.md
index 92e64d9..e9b9e0d 100644
--- a/README.md
+++ b/README.md
@@ -31,6 +31,12 @@ Setup rust toolchain if you haven't yet:
 curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
 ```
 
+Install `llvm` and `opencv` (example on Mac):
+
+```bash
+brew install llvm opencv
+```
+
 Build and install the binary:
 
 ```bash
@@ -38,10 +44,8 @@ Build and install the binary:
 export DYLD_FALLBACK_LIBRARY_PATH="$(xcode-select --print-path)/usr/lib/"
 # run this first on other Mac
 export DYLD_FALLBACK_LIBRARY_PATH="$(xcode-select --print-path)/Toolchains/XcodeDefault.xctoolchain/"
-# run this if you have a mac with Metal support
-cargo install --path . --features=cli,metal --bin surya
-# run this on other architectures
-cargo install --path . --features=cli --bin surya
+# optionally you can include features like accelerate, metal, mkl, etc.
+cargo install --path . --features=cli
 ```
 
 The binary when built does _not_ include the weights file itself, and will instead download via the HuggingFace Hub API. Once downloaded, the weights file will be cached in the HuggingFace cache directory.
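For reference, the weight download described in that paragraph goes through the `hf-hub` sync API that this patch switches `src/bin/main.rs` to. Below is a minimal sketch only; `fetch_detection_model_files` is a hypothetical helper, and the repo and file names are the CLI defaults from this patch (`vikp/line_detector`, `model.safetensors`, `config.json`):

```rust
// Sketch: resolve the detection model's weights and config through the
// HuggingFace Hub sync API, mirroring what Cli::get_model does in this patch.
use hf_hub::api::sync::ApiBuilder;

fn fetch_detection_model_files() -> Result<(), Box<dyn std::error::Error>> {
    // build the sync API client with a progress bar, as the patch does
    let api = ApiBuilder::new().with_progress(true).build()?;
    let repo = api.model("vikp/line_detector".to_string());
    // downloaded once, then served from the HuggingFace cache directory
    let weights = repo.get("model.safetensors")?;
    let config = repo.get("config.json")?;
    println!("weights: {}, config: {}", weights.display(), config.display());
    Ok(())
}
```

Subsequent runs reuse the cached files, so only the first invocation needs network access.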
@@ -57,32 +61,44 @@ Arguments:
   <IMAGE>  path to image
 
 Options:
+      --batch-size <BATCH_SIZE>
+          detection batch size, if not supplied defaults to 2 on CPU and 16 on GPU
       --model-repo <MODEL_REPO>
-          model's hugging face repo [default: vikp/line_detector]
+          detection model's hugging face repo [default: vikp/line_detector]
       --weights-file-name <WEIGHTS_FILE_NAME>
-          model's weights file name [default: model.safetensors]
+          detection model's weights file name [default: model.safetensors]
       --config-file-name <CONFIG_FILE_NAME>
-          model's config file name [default: config.json]
-      --generate-bbox-image
+          detection model's config file name [default: config.json]
+      --non-max-suppression-threshold <NON_MAX_SUPPRESSION_THRESHOLD>
+          a value between 0.0 and 1.0 to filter low density part of heatmap [default: 0.35]
+      --extract-text-threshold <EXTRACT_TEXT_THRESHOLD>
+          a value between 0.0 and 1.0 to filter out bbox with low heatmap density [default: 0.6]
+      --bbox-area-threshold <BBOX_AREA_THRESHOLD>
+          a pixel threshold to filter out small area bbox [default: 10]
+      --polygons
+          whether to output polygons json file
+      --image
           whether to generate bbox image
-      --generate-heatmap
+      --heatmap
           whether to generate heatmap
-      --generate-affinity-map
+      --affinity-map
           whether to generate affinity map
       --output-dir <OUTPUT_DIR>
-          output directory, each file will be generating a subdirectory under this directory [default: ./surya_output]
-      --device-type <DEVICE_TYPE>
+          output directory, under which the input image will be generating a subdirectory [default: ./surya_output]
+      --device <DEVICE_TYPE>
           [default: cpu] [possible values: cpu, gpu, metal]
+      --verbose
+          whether to enable verbose mode
   -h, --help
           Print help
   -V, --version
           Print version
 ```
 
-You can use this to control logging level:
+You can also use this to control logging level:
 
 ```bash
-export RUST_LOG=info # or debug, warn, etc.
+export SURYA_LOG=warn # or debug, warn, etc.
 ```
 
 ## Library
diff --git a/src/bbox.rs b/src/bbox.rs
index 38a1c9d..075a352 100644
--- a/src/bbox.rs
+++ b/src/bbox.rs
@@ -159,7 +159,7 @@ pub fn draw_bboxes<P: AsRef<Path>>(
     image: &mut Mat,
     heatmap_size: Size,
     image_with_padding_size: Size,
-    bboxes: Vec<BBox>,
+    bboxes: &[BBox],
     output_file: P,
 ) -> crate::Result<()> {
     debug!(
@@ -185,7 +185,7 @@ pub fn draw_bboxes<P: AsRef<Path>>(
 pub fn generate_bbox(
     heatmap: &Mat,
     non_max_suppression_threshold: f64,
-    text_threshold: f64,
+    extract_text_threshold: f64,
     bbox_area_threshold: i32,
 ) -> crate::Result<Vec<BBox>> {
     let labels = image_threshold(heatmap, non_max_suppression_threshold)?;
@@ -212,7 +212,7 @@ pub fn generate_bbox(
             continue;
         }
         let max_value = heatmap_label_max(heatmap, &labels, label)?;
-        if max_value < text_threshold {
+        if max_value < extract_text_threshold {
             continue;
         }
         let polygon = connected_area_to_bbox(&labels, stats_row, label)?;
diff --git a/src/bin/main.rs b/src/bin/main.rs
index 08de5c4..1bd5ca1 100644
--- a/src/bin/main.rs
+++ b/src/bin/main.rs
@@ -1,13 +1,22 @@
+#[cfg(feature = "accelerate")]
+extern crate accelerate_src;
+#[cfg(feature = "mkl")]
+extern crate intel_mkl_src;
+
 use candle_core::{Device, IndexOp, Module, Tensor};
 use candle_nn::VarBuilder;
 use clap::{Parser, ValueEnum};
-use hf_hub::api::sync::Api;
+use env_logger::Env;
+use hf_hub::api::sync::ApiBuilder;
 use log::{debug, info};
 use opencv::hub_prelude::MatTraitConst;
+use std::fs::File;
+use std::io::BufWriter;
+use std::io::Write;
 use std::path::PathBuf;
 use std::time::Instant;
 use surya::bbox::{draw_bboxes, generate_bbox};
-use surya::postprocess::save_grayscale_image_with_resize;
+use surya::postprocess::save_image;
 use surya::preprocess::{image_to_tensor, read_chunked_resized_image, read_image};
 use surya::segformer::SemanticSegmentationModel;
 
@@ -34,61 +43,118 @@ impl TryInto<Device> for DeviceType {
 
 #[derive(Parser, Debug)]
 #[command(author, version, about, long_about = None)]
-struct Args {
+struct Cli {
     #[arg(help = "path to image")]
     image: PathBuf,
+
+    #[arg(
+        long,
+        help = "detection batch size, if not supplied defaults to 2 on CPU and 16 on GPU"
+    )]
+    batch_size: Option<usize>,
+
     #[arg(
         long,
         default_value = "vikp/line_detector",
-        help = "model's hugging face repo"
+        help = "detection model's hugging face repo"
     )]
     model_repo: String,
+
     #[arg(
         long,
         default_value = "model.safetensors",
-        help = "model's weights file name"
+        help = "detection model's weights file name"
    )]
     weights_file_name: String,
-    #[arg(long, default_value = "config.json", help = "model's config file name")]
+
+    #[arg(
+        long,
+        default_value = "config.json",
+        help = "detection model's config file name"
+    )]
     config_file_name: String,
-    #[arg(long, default_value_t = true, help = "whether to generate bbox image")]
+
+    #[arg(
+        long,
+        default_value_t = 0.35,
+        help = "a value between 0.0 and 1.0 to filter low density part of heatmap"
+    )]
+    non_max_suppression_threshold: f64,
+
+    #[arg(
+        long,
+        default_value_t = 0.6,
+        help = "a value between 0.0 and 1.0 to filter out bbox with low heatmap density"
+    )]
+    extract_text_threshold: f64,
+
+    #[arg(
+        long,
+        default_value_t = 10,
+        help = "a pixel threshold to filter out small area bbox"
+    )]
+    bbox_area_threshold: usize,
+
+    #[arg(
+        long = "polygons",
+        default_value_t = true,
+        help = "whether to output polygons json file"
+    )]
+    output_polygons: bool,
+
+    #[arg(
+        long = "image",
+        default_value_t = true,
+        help = "whether to generate bbox image"
+    )]
     generate_bbox_image: bool,
-    #[arg(long, default_value_t = true, help = "whether to generate heatmap")]
+
+    #[arg(
+        long = "heatmap",
+        default_value_t = true,
+        help = "whether to generate heatmap"
+    )]
     generate_heatmap: bool,
+
     #[arg(
-        long,
+        long = "affinity-map",
         default_value_t = true,
         help = "whether to generate affinity map"
     )]
     generate_affinity_map: bool,
+
     #[arg(
         long,
         default_value = "./surya_output",
-        help = "output directory, each file will be generating a subdirectory under this directory"
+        help = "output directory, under which the input image will be generating a subdirectory"
     )]
     output_dir: PathBuf,
-    #[arg(long, value_enum, default_value_t = DeviceType::Cpu)]
+
+    #[arg(long = "device", value_enum, default_value_t = DeviceType::Cpu)]
     device_type: DeviceType,
+
+    #[arg(long, help = "whether to enable verbose mode")]
+    verbose: bool,
 }
 
-impl Args {
+impl Cli {
     fn get_model(
         &self,
         device: &Device,
         num_labels: usize,
     ) -> surya::Result<SemanticSegmentationModel> {
-        let api = Api::new()?;
+        let api = ApiBuilder::new().with_progress(true).build()?;
         let repo = api.model(self.model_repo.clone());
-        info!("using model from HuggingFace repo {0}", self.model_repo);
+        debug!("using model from HuggingFace repo {0}", self.model_repo);
         let model_file = repo.get(&self.weights_file_name)?;
-        info!("using weights file '{0}'", self.weights_file_name);
+        debug!("using weights file '{0}'", self.weights_file_name);
         let vb = unsafe {
             VarBuilder::from_mmaped_safetensors(&[model_file], candle_core::DType::F32, device)?
         };
         let config_file = repo.get(&self.config_file_name)?;
-        info!("using config file '{0}'", self.config_file_name);
+        debug!("using config file '{0}'", self.config_file_name);
         let config = serde_json::from_str(&std::fs::read_to_string(config_file)?)?;
-        info!("loaded config: {:?}, num_labels {}", config, num_labels);
+        debug!("loaded config: {:?}, num_labels {}", config, num_labels);
         Ok(SemanticSegmentationModel::new(&config, num_labels, vb)?)
     }
 }
@@ -96,12 +162,25 @@ impl Args {
 const NUM_LABELS: usize = 2;
 
 fn main() -> surya::Result<()> {
-    env_logger::init();
+    let args = Cli::parse();
+    let env = Env::new().filter_or("SURYA_LOG", if args.verbose { "debug" } else { "info" });
+    env_logger::init_from_env(env);
 
-    let args = Args::parse();
+    assert!(
+        0.0 <= args.non_max_suppression_threshold && args.non_max_suppression_threshold <= 1.0,
+        "non-max-suppression-threshold must be between 0.0 and 1.0"
+    );
+    assert!(
+        0.0 <= args.extract_text_threshold && args.extract_text_threshold <= 1.0,
+        "extract-text-threshold must be between 0.0 and 1.0"
+    );
+    assert!(
+        args.bbox_area_threshold > 0,
+        "bbox-area-threshold must be > 0"
+    );
 
     let device = args.device_type.try_into()?;
-    info!("using device {:?}", device);
+    debug!("using device {:?}", device);
 
     let image_chunks = read_chunked_resized_image(&args.image)?;
 
@@ -115,7 +194,10 @@ fn main() -> surya::Result<()> {
 
     let model = args.get_model(&device, NUM_LABELS)?;
 
-    let batch_size = 2;
+    let batch_size = args.batch_size.unwrap_or_else(|| match device {
+        Device::Cpu => 2,
+        Device::Cuda(_) | Device::Metal(_) => 16,
+    });
     let image_tensors: Vec<Tensor> = image_chunks
         .resized_chunks
         .iter()
@@ -125,7 +207,12 @@ fn main() -> surya::Result<()> {
     let mut heatmaps = Vec::new();
     let mut affinity_maps = Vec::new();
     for batch in image_tensors.chunks(batch_size) {
+        let batch_size = batch.len();
         let batch = Tensor::stack(batch, 0)?;
+        info!(
+            "starting segformer inference with batch size {}...",
+            batch_size,
+        );
         let now = Instant::now();
         let segmentation = model.forward(&batch)?;
         info!("inference took {:.3}s", now.elapsed().as_secs_f32());
@@ -143,17 +230,34 @@ fn main() -> surya::Result<()> {
     debug!("heatmap {:?}", heatmap);
     debug!("affinity_map {:?}", affinity_map);
 
-    let non_max_suppression_threshold = 0.35;
-    let extract_text_threshold = 0.6;
-    let bbox_area_threshold = 10;
-
     let bboxes = generate_bbox(
         &heatmap,
-        non_max_suppression_threshold,
-        extract_text_threshold,
-        bbox_area_threshold,
+        args.non_max_suppression_threshold,
+        args.extract_text_threshold,
+        args.bbox_area_threshold as i32,
     )?;
 
+    if args.output_polygons {
+        let output_file = output_dir.join("polygons.jsonl");
+        let mut buf_writer = BufWriter::new(File::create(&output_file)?);
+        for bbox in &bboxes {
+            let polygons: Vec<(f32, f32)> = bbox
+                .polygon
+                .iter()
+                .map(|p| {
+                    let precision = 1.0e3;
+                    let x = (p.x * precision).round() / precision;
+                    let y = (p.y * precision).round() / precision;
+                    (x, y)
+                })
+                .collect();
+            serde_json::to_writer(&mut buf_writer, &polygons)?;
+            writeln!(&mut buf_writer)?;
+        }
+        buf_writer.flush()?;
+        info!("polygons json file {:?} generated", output_file);
+    }
+
     if args.generate_bbox_image {
         let mut image = read_image(args.image)?;
         let output_file = output_dir.join("bbox.png");
@@ -161,7 +265,7 @@ fn main() -> surya::Result<()> {
             &mut image,
             heatmap.size()?,
             image_chunks.original_size_with_padding,
-            bboxes,
+            &bboxes,
             &output_file,
         )?;
         info!("bbox image {:?} generated", output_file);
@@ -169,13 +273,15 @@ fn main() -> surya::Result<()> {
 
     if args.generate_heatmap {
         let output_file = output_dir.join("heatmap.png");
-        save_grayscale_image_with_resize(&heatmap, image_chunks.original_size, &output_file)?;
+        let image = image_chunks.resize_heatmap_to_image(heatmap)?;
+        save_image(&image, &output_file)?;
         info!("heatmap image {:?} generated", output_file);
     }
 
     if args.generate_affinity_map {
         let output_file = output_dir.join("affinity_map.png");
-        save_grayscale_image_with_resize(&affinity_map, image_chunks.original_size, &output_file)?;
+        let image = image_chunks.resize_heatmap_to_image(affinity_map)?;
+        save_image(&image, &output_file)?;
         info!("affinity map image {:?} generated", output_file);
     }
 
diff --git a/src/postprocess.rs b/src/postprocess.rs
index 12e202d..3031dc8 100644
--- a/src/postprocess.rs
+++ b/src/postprocess.rs
@@ -1,30 +1,72 @@
-use opencv::core::{self, Mat, Size};
-use opencv::prelude::MatTraitConst;
-use opencv::{imgcodecs, imgproc};
 use std::path::Path;
 
+use candle_core::Tensor;
+use opencv::core::{self, Mat};
+use opencv::imgcodecs;
+use opencv::imgproc;
+use opencv::prelude::*;
+use opencv::types::VectorOfMat;
+
+pub struct ImageChunks {
+    pub resized_chunks: Vec<Mat>,
+    pub padding: i32,
+    pub original_size: core::Size,
+    pub original_size_with_padding: core::Size,
+}
+
+impl ImageChunks {
+    pub fn stitch_image_tensors(&self, images: Vec<Tensor>) -> crate::Result<Mat> {
+        let image_chunks = images
+            .into_iter()
+            .map(heatmap_tensor_to_mat)
+            .collect::<crate::Result<Vec<Mat>>>()?;
+        let mut image = Mat::default();
+        let image_chunks = VectorOfMat::from_iter(image_chunks);
+        core::vconcat(&image_chunks, &mut image)?;
+        Ok(image)
+    }
+
+    pub fn resize_heatmap_to_image(&self, heatmap: Mat) -> crate::Result<Mat> {
+        // convert image [0,1) to 255 grayscale image
+        let mut gray_scale_image = Mat::default();
+        heatmap.convert_to(&mut gray_scale_image, core::CV_8UC1, 255.0, 0.0)?;
+        // resize image
+        let mut resized_image = Mat::default();
+        imgproc::resize(
+            &gray_scale_image,
+            &mut resized_image,
+            self.original_size_with_padding,
+            0.0,
+            0.0,
+            opencv::imgproc::INTER_LINEAR,
+        )?;
+        let result = Mat::roi(
+            &resized_image,
+            core::Rect::new(0, 0, self.original_size.width, self.original_size.height),
+        )?;
+        Ok(result)
+    }
+}
+
+fn heatmap_tensor_to_mat(heatmap: Tensor) -> crate::Result<Mat> {
+    let (height, width) = heatmap.dims2()?;
+    debug_assert_eq!(height, width, "original heatmap must be square");
+    let heatmap: Vec<Vec<f32>> = heatmap.to_vec2()?;
+    let mut img =
+        unsafe { Mat::new_size(core::Size::new(width as i32, height as i32), core::CV_32F)? };
+    for (x, row) in heatmap.iter().enumerate() {
+        for (y, &value) in row.iter().enumerate() {
+            *(img.at_2d_mut::<f32>(x as i32, y as i32)?) = value;
+        }
+    }
+    Ok(img)
+}
+
 /// convert an image from map to gray scale image and save it to output_path
-pub fn save_grayscale_image_with_resize<P: AsRef<Path>>(
-    image: &Mat,
-    size: Size,
-    output_path: P,
-) -> crate::Result<()> {
-    // convert image [0,1) to 255 grayscale image
-    let mut gray_scale_image = Mat::default();
-    image.convert_to(&mut gray_scale_image, core::CV_8UC1, 255.0, 0.0)?;
-    // resize image
-    let mut resized_image = Mat::default();
-    imgproc::resize(
-        &gray_scale_image,
-        &mut resized_image,
-        size,
-        0.0,
-        0.0,
-        opencv::imgproc::INTER_LINEAR,
-    )?;
+pub fn save_image<P: AsRef<Path>>(image: &Mat, output_path: P) -> crate::Result<()> {
     imgcodecs::imwrite(
         output_path.as_ref().as_os_str().to_str().unwrap(),
-        &resized_image,
+        image,
         &core::Vector::new(),
     )?;
     Ok(())
diff --git a/src/preprocess.rs b/src/preprocess.rs
index 913549f..efe4a47 100644
--- a/src/preprocess.rs
+++ b/src/preprocess.rs
@@ -1,3 +1,4 @@
+use crate::postprocess::ImageChunks;
 use candle_core::{Device, Tensor};
 use log::debug;
 use opencv::{
@@ -5,33 +6,12 @@ use opencv::{
     imgcodecs::{self, IMREAD_COLOR},
     imgproc,
     prelude::*,
-    types::VectorOfMat,
 };
 use std::path::Path;
 
 const INPUT_IMAGE_SIZE: i32 = 896;
 const IMAGE_CHUNK_HEIGHT: i32 = 1200;
 
-pub struct ImageChunks {
-    pub resized_chunks: Vec<Mat>,
-    pub padding: i32,
-    pub original_size: core::Size,
-    pub original_size_with_padding: core::Size,
-}
-
-impl ImageChunks {
-    pub fn stitch_image_tensors(&self, images: Vec<Tensor>) -> crate::Result<Mat> {
-        let image_chunks = images
-            .into_iter()
-            .map(heatmap_tensor_to_mat)
-            .collect::<crate::Result<Vec<Mat>>>()?;
-        let mut image = Mat::default();
-        let image_chunks = VectorOfMat::from_iter(image_chunks);
-        core::vconcat(&image_chunks, &mut image)?;
-        Ok(image)
-    }
-}
-
 /// load image from path and resize it to [INPUT_IMAGE_SIZE] and return the resized image and
 /// its original size
 pub fn read_chunked_resized_image<P: AsRef<Path>>(image_path: P) -> crate::Result<ImageChunks> {
@@ -125,20 +105,6 @@ pub fn image_to_tensor(input: &Mat, device: &Device) -> crate::Result<Tensor> {
     .broadcast_div(&std)?)
 }
 
-fn heatmap_tensor_to_mat(heatmap: Tensor) -> crate::Result<Mat> {
-    let (height, width) = heatmap.dims2()?;
-    debug_assert_eq!(height, width, "original heatmap must be square");
-    let heatmap: Vec<Vec<f32>> = heatmap.to_vec2()?;
-    let mut img =
-        unsafe { Mat::new_size(core::Size::new(width as i32, height as i32), core::CV_32F)? };
-    for (x, row) in heatmap.iter().enumerate() {
-        for (y, &value) in row.iter().enumerate() {
-            *(img.at_2d_mut::<f32>(x as i32, y as i32)?) = value;
-        }
-    }
-    Ok(img)
-}
-
 fn resize(image: Mat, new_size: core::Size) -> crate::Result<Mat> {
     let mut resized_image = Mat::default();
     imgproc::resize(