From dab0f50d93c1f9f35c5113e309001d113cdaab39 Mon Sep 17 00:00:00 2001
From: Jiayu Liu
Date: Thu, 1 Feb 2024 09:50:37 +0800
Subject: [PATCH] Mkl, accelerate, readme, heatmap chunking (#18)

---
 Cargo.toml         |  14 +++-
 README.md          |  44 ++++++++----
 src/bbox.rs        |   6 +-
 src/bin/main.rs    | 168 ++++++++++++++++++++++++++++++++++++---------
 src/postprocess.rs |  86 +++++++++++++++++------
 src/preprocess.rs  |  36 +---------
 6 files changed, 248 insertions(+), 106 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 356bac0..481726b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -4,6 +4,7 @@ version = "0.3.0"
 edition = "2021"
 description = "Surya is a multilingual document OCR toolkit, original implementation in Python and PyTorch"
 license = "Apache-2.0"
+authors = ["Jiayu Liu "]
 repository = "https://github.com/jimexist/surya-rs"
 default-run = "surya"
 
@@ -21,10 +22,21 @@ opencv = { version = "0.88.8", default-features = false, features = [
 ] }
 serde = { version = "1.0.196" }
 serde_json = { version = "1.0.112" }
-thiserror = "1.0.56"
+accelerate-src = { version = "0.3.2", optional = true }
+intel-mkl-src = { version = "0.8.1", features = [
+    "mkl-static-lp64-iomp",
+], optional = true }
+thiserror = { version = "1.0.56" }
 
 [features]
+default = ["cli"]
 metal = ["candle-core/metal", "candle-nn/metal"]
+accelerate = [
+    "accelerate-src",
+    "candle-core/accelerate",
+    "candle-nn/accelerate",
+]
+mkl = ["intel-mkl-src", "candle-core/mkl", "candle-nn/mkl"]
 cli = ["clap", "anyhow"]
 
 [[bin]]
diff --git a/README.md b/README.md
index 92e64d9..e9b9e0d 100644
--- a/README.md
+++ b/README.md
@@ -31,6 +31,12 @@ Setup rust toolchain if you haven't yet:
 curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
 ```
 
+Install `llvm` and `opencv` (example on Mac):
+
+```bash
+brew install llvm opencv
+```
+
 Build and install the binary:
 
 ```bash
@@ -38,10 +44,8 @@ Build and install the binary:
 export DYLD_FALLBACK_LIBRARY_PATH="$(xcode-select --print-path)/usr/lib/"
 # run this first on other Mac
 export DYLD_FALLBACK_LIBRARY_PATH="$(xcode-select --print-path)/Toolchains/XcodeDefault.xctoolchain/"
-# run this if you have a mac with Metal support
-cargo install --path . --features=cli,metal --bin surya
-# run this on other architectures
-cargo install --path . --features=cli --bin surya
+# optionally you can include features like accelerate, metal, mkl, etc.
+cargo install --path . --features=cli
 ```
 
 The binary when built does _not_ include the weights file itself, and will instead download via the HuggingFace Hub API. Once downloaded, the weights file will be cached in the HuggingFace cache directory.
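For reference, the weight download described in that paragraph goes through the `hf-hub` sync API that this patch switches `src/bin/main.rs` to. Below is a minimal sketch only; `fetch_detection_model_files` is a hypothetical helper, and the repo and file names are the CLI defaults from this patch (`vikp/line_detector`, `model.safetensors`, `config.json`):

```rust
// Sketch: resolve the detection model's weights and config through the
// HuggingFace Hub sync API, mirroring what Cli::get_model does in this patch.
use hf_hub::api::sync::ApiBuilder;

fn fetch_detection_model_files() -> Result<(), Box<dyn std::error::Error>> {
    // build the sync API client with a progress bar, as the patch does
    let api = ApiBuilder::new().with_progress(true).build()?;
    let repo = api.model("vikp/line_detector".to_string());
    // downloaded once, then served from the HuggingFace cache directory
    let weights = repo.get("model.safetensors")?;
    let config = repo.get("config.json")?;
    println!("weights: {}, config: {}", weights.display(), config.display());
    Ok(())
}
```

Subsequent runs reuse the cached files, so only the first invocation needs network access.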
@@ -57,32 +61,44 @@ Arguments:
   <IMAGE>  path to image
 
 Options:
+      --batch-size <BATCH_SIZE>
+          detection batch size, if not supplied defaults to 2 on CPU and 16 on GPU
       --model-repo <MODEL_REPO>
-          model's hugging face repo [default: vikp/line_detector]
+          detection model's hugging face repo [default: vikp/line_detector]
       --weights-file-name <WEIGHTS_FILE_NAME>
-          model's weights file name [default: model.safetensors]
+          detection model's weights file name [default: model.safetensors]
       --config-file-name <CONFIG_FILE_NAME>
-          model's config file name [default: config.json]
-      --generate-bbox-image
+          detection model's config file name [default: config.json]
+      --non-max-suppression-threshold <NON_MAX_SUPPRESSION_THRESHOLD>
+          a value between 0.0 and 1.0 to filter low density part of heatmap [default: 0.35]
+      --extract-text-threshold <EXTRACT_TEXT_THRESHOLD>
+          a value between 0.0 and 1.0 to filter out bbox with low heatmap density [default: 0.6]
+      --bbox-area-threshold <BBOX_AREA_THRESHOLD>
+          a pixel threshold to filter out small area bbox [default: 10]
+      --polygons
+          whether to output polygons json file
+      --image
           whether to generate bbox image
-      --generate-heatmap
+      --heatmap
           whether to generate heatmap
-      --generate-affinity-map
+      --affinity-map
           whether to generate affinity map
       --output-dir <OUTPUT_DIR>
-          output directory, each file will be generating a subdirectory under this directory [default: ./surya_output]
-      --device-type <DEVICE_TYPE>
+          output directory, under which the input image will be generating a subdirectory [default: ./surya_output]
+      --device <DEVICE_TYPE>
           [default: cpu] [possible values: cpu, gpu, metal]
+      --verbose
+          whether to enable verbose mode
   -h, --help
           Print help
   -V, --version
           Print version
 ```
 
-You can use this to control logging level:
+You can also use this to control logging level:
 
 ```bash
-export RUST_LOG=info # or debug, warn, etc.
+export SURYA_LOG=warn # or debug, warn, etc.
 ```
 
 ## Library
diff --git a/src/bbox.rs b/src/bbox.rs
index 38a1c9d..075a352 100644
--- a/src/bbox.rs
+++ b/src/bbox.rs
@@ -159,7 +159,7 @@ pub fn draw_bboxes<P: AsRef<Path>>(
     image: &mut Mat,
     heatmap_size: Size,
     image_with_padding_size: Size,
-    bboxes: Vec<BBox>,
+    bboxes: &[BBox],
     output_file: P,
 ) -> crate::Result<()> {
     debug!(
@@ -185,7 +185,7 @@ pub fn draw_bboxes<P: AsRef<Path>>(
 pub fn generate_bbox(
     heatmap: &Mat,
     non_max_suppression_threshold: f64,
-    text_threshold: f64,
+    extract_text_threshold: f64,
     bbox_area_threshold: i32,
 ) -> crate::Result<Vec<BBox>> {
     let labels = image_threshold(heatmap, non_max_suppression_threshold)?;
@@ -212,7 +212,7 @@ pub fn generate_bbox(
             continue;
         }
         let max_value = heatmap_label_max(heatmap, &labels, label)?;
-        if max_value < text_threshold {
+        if max_value < extract_text_threshold {
             continue;
         }
         let polygon = connected_area_to_bbox(&labels, stats_row, label)?;
diff --git a/src/bin/main.rs b/src/bin/main.rs
index 08de5c4..1bd5ca1 100644
--- a/src/bin/main.rs
+++ b/src/bin/main.rs
@@ -1,13 +1,22 @@
+#[cfg(feature = "accelerate")]
+extern crate accelerate_src;
+#[cfg(feature = "mkl")]
+extern crate intel_mkl_src;
+
 use candle_core::{Device, IndexOp, Module, Tensor};
 use candle_nn::VarBuilder;
 use clap::{Parser, ValueEnum};
-use hf_hub::api::sync::Api;
+use env_logger::Env;
+use hf_hub::api::sync::ApiBuilder;
 use log::{debug, info};
 use opencv::hub_prelude::MatTraitConst;
+use std::fs::File;
+use std::io::BufWriter;
+use std::io::Write;
 use std::path::PathBuf;
 use std::time::Instant;
 use surya::bbox::{draw_bboxes, generate_bbox};
-use surya::postprocess::save_grayscale_image_with_resize;
+use surya::postprocess::save_image;
 use surya::preprocess::{image_to_tensor, read_chunked_resized_image, read_image};
 use surya::segformer::SemanticSegmentationModel;
 
@@ -34,61 +43,118 @@ impl TryInto<Device> for DeviceType {
 
 #[derive(Parser, Debug)]
 #[command(author, version, about, long_about = None)]
-struct Args {
+struct Cli {
     #[arg(help = "path to image")]
     image: PathBuf,
+
+    #[arg(
+        long,
+        help = "detection batch size, if not supplied defaults to 2 on CPU and 16 on GPU"
+    )]
+    batch_size: Option<usize>,
+
     #[arg(
         long,
         default_value = "vikp/line_detector",
-        help = "model's hugging face repo"
+        help = "detection model's hugging face repo"
     )]
     model_repo: String,
+
     #[arg(
         long,
         default_value = "model.safetensors",
-        help = "model's weights file name"
+        help = "detection model's weights file name"
    )]
     weights_file_name: String,
-    #[arg(long, default_value = "config.json", help = "model's config file name")]
+
+    #[arg(
+        long,
+        default_value = "config.json",
+        help = "detection model's config file name"
+    )]
     config_file_name: String,
-    #[arg(long, default_value_t = true, help = "whether to generate bbox image")]
+
+    #[arg(
+        long,
+        default_value_t = 0.35,
+        help = "a value between 0.0 and 1.0 to filter low density part of heatmap"
+    )]
+    non_max_suppression_threshold: f64,
+
+    #[arg(
+        long,
+        default_value_t = 0.6,
+        help = "a value between 0.0 and 1.0 to filter out bbox with low heatmap density"
+    )]
+    extract_text_threshold: f64,
+
+    #[arg(
+        long,
+        default_value_t = 10,
+        help = "a pixel threshold to filter out small area bbox"
+    )]
+    bbox_area_threshold: usize,
+
+    #[arg(
+        long = "polygons",
+        default_value_t = true,
+        help = "whether to output polygons json file"
+    )]
+    output_polygons: bool,
+
+    #[arg(
+        long = "image",
+        default_value_t = true,
+        help = "whether to generate bbox image"
+    )]
     generate_bbox_image: bool,
-    #[arg(long, default_value_t = true, help = "whether to generate heatmap")]
+
+    #[arg(
+        long = "heatmap",
+        default_value_t = true,
+        help = "whether to generate heatmap"
+    )]
     generate_heatmap: bool,
+
     #[arg(
-        long,
+        long = "affinity-map",
         default_value_t = true,
         help = "whether to generate affinity map"
     )]
     generate_affinity_map: bool,
+
     #[arg(
         long,
         default_value = "./surya_output",
-        help = "output directory, each file will be generating a subdirectory under this directory"
+        help = "output directory, under which the input image will be generating a subdirectory"
     )]
     output_dir: PathBuf,
-    #[arg(long, value_enum, default_value_t = DeviceType::Cpu)]
+
+    #[arg(long = "device", value_enum, default_value_t = DeviceType::Cpu)]
     device_type: DeviceType,
+
+    #[arg(long, help = "whether to enable verbose mode")]
+    verbose: bool,
 }
 
-impl Args {
+impl Cli {
     fn get_model(
         &self,
         device: &Device,
         num_labels: usize,
     ) -> surya::Result<SemanticSegmentationModel> {
-        let api = Api::new()?;
+        let api = ApiBuilder::new().with_progress(true).build()?;
         let repo = api.model(self.model_repo.clone());
-        info!("using model from HuggingFace repo {0}", self.model_repo);
+        debug!("using model from HuggingFace repo {0}", self.model_repo);
         let model_file = repo.get(&self.weights_file_name)?;
-        info!("using weights file '{0}'", self.weights_file_name);
+        debug!("using weights file '{0}'", self.weights_file_name);
         let vb = unsafe {
             VarBuilder::from_mmaped_safetensors(&[model_file], candle_core::DType::F32, device)?
         };
         let config_file = repo.get(&self.config_file_name)?;
-        info!("using config file '{0}'", self.config_file_name);
+        debug!("using config file '{0}'", self.config_file_name);
         let config = serde_json::from_str(&std::fs::read_to_string(config_file)?)?;
-        info!("loaded config: {:?}, num_labels {}", config, num_labels);
+        debug!("loaded config: {:?}, num_labels {}", config, num_labels);
         Ok(SemanticSegmentationModel::new(&config, num_labels, vb)?)
     }
 }
@@ -96,12 +162,25 @@ impl Args {
 const NUM_LABELS: usize = 2;
 
 fn main() -> surya::Result<()> {
-    env_logger::init();
+    let args = Cli::parse();
+    let env = Env::new().filter_or("SURYA_LOG", if args.verbose { "debug" } else { "info" });
+    env_logger::init_from_env(env);
 
-    let args = Args::parse();
+    assert!(
+        0.0 <= args.non_max_suppression_threshold && args.non_max_suppression_threshold <= 1.0,
+        "non-max-suppression-threshold must be between 0.0 and 1.0"
+    );
+    assert!(
+        0.0 <= args.extract_text_threshold && args.extract_text_threshold <= 1.0,
+        "extract-text-threshold must be between 0.0 and 1.0"
+    );
+    assert!(
+        args.bbox_area_threshold > 0,
+        "bbox-area-threshold must be > 0"
+    );
 
     let device = args.device_type.try_into()?;
-    info!("using device {:?}", device);
+    debug!("using device {:?}", device);
 
     let image_chunks = read_chunked_resized_image(&args.image)?;
 
@@ -115,7 +194,10 @@ fn main() -> surya::Result<()> {
 
     let model = args.get_model(&device, NUM_LABELS)?;
 
-    let batch_size = 2;
+    let batch_size = args.batch_size.unwrap_or_else(|| match device {
+        Device::Cpu => 2,
+        Device::Cuda(_) | Device::Metal(_) => 16,
+    });
     let image_tensors: Vec<Tensor> = image_chunks
         .resized_chunks
         .iter()
@@ -125,7 +207,12 @@ fn main() -> surya::Result<()> {
     let mut heatmaps = Vec::new();
     let mut affinity_maps = Vec::new();
     for batch in image_tensors.chunks(batch_size) {
+        let batch_size = batch.len();
         let batch = Tensor::stack(batch, 0)?;
+        info!(
+            "starting segformer inference with batch size {}...",
+            batch_size,
+        );
         let now = Instant::now();
         let segmentation = model.forward(&batch)?;
         info!("inference took {:.3}s", now.elapsed().as_secs_f32());
@@ -143,17 +230,34 @@ fn main() -> surya::Result<()> {
     debug!("heatmap {:?}", heatmap);
     debug!("affinity_map {:?}", affinity_map);
 
-    let non_max_suppression_threshold = 0.35;
-    let extract_text_threshold = 0.6;
-    let bbox_area_threshold = 10;
-
     let bboxes = generate_bbox(
         &heatmap,
-        non_max_suppression_threshold,
-        extract_text_threshold,
-        bbox_area_threshold,
+        args.non_max_suppression_threshold,
+        args.extract_text_threshold,
+        args.bbox_area_threshold as i32,
     )?;
 
+    if args.output_polygons {
+        let output_file = output_dir.join("polygons.jsonl");
+        let mut buf_writer = BufWriter::new(File::create(&output_file)?);
+        for bbox in &bboxes {
+            let polygons: Vec<(f32, f32)> = bbox
+                .polygon
+                .iter()
+                .map(|p| {
+                    let precision = 1.0e3;
+                    let x = (p.x * precision).round() / precision;
+                    let y = (p.y * precision).round() / precision;
+                    (x, y)
+                })
+                .collect();
+            serde_json::to_writer(&mut buf_writer, &polygons)?;
+            writeln!(&mut buf_writer)?;
+        }
+        buf_writer.flush()?;
+        info!("polygons json file {:?} generated", output_file);
+    }
+
     if args.generate_bbox_image {
         let mut image = read_image(args.image)?;
         let output_file = output_dir.join("bbox.png");
@@ -161,7 +265,7 @@ fn main() -> surya::Result<()> {
             &mut image,
             heatmap.size()?,
             image_chunks.original_size_with_padding,
-            bboxes,
+            &bboxes,
             &output_file,
         )?;
         info!("bbox image {:?} generated", output_file);
@@ -169,13 +273,15 @@ fn main() -> surya::Result<()> {
 
     if args.generate_heatmap {
         let output_file = output_dir.join("heatmap.png");
-        save_grayscale_image_with_resize(&heatmap, image_chunks.original_size, &output_file)?;
+        let image = image_chunks.resize_heatmap_to_image(heatmap)?;
+        save_image(&image, &output_file)?;
         info!("heatmap image {:?} generated", output_file);
     }
 
     if args.generate_affinity_map {
         let output_file = output_dir.join("affinity_map.png");
-        save_grayscale_image_with_resize(&affinity_map, image_chunks.original_size, &output_file)?;
+        let image = image_chunks.resize_heatmap_to_image(affinity_map)?;
+        save_image(&image, &output_file)?;
         info!("affinity map image {:?} generated", output_file);
     }
 
diff --git a/src/postprocess.rs b/src/postprocess.rs
index 12e202d..3031dc8 100644
--- a/src/postprocess.rs
+++ b/src/postprocess.rs
@@ -1,30 +1,72 @@
-use opencv::core::{self, Mat, Size};
-use opencv::prelude::MatTraitConst;
-use opencv::{imgcodecs, imgproc};
 use std::path::Path;
 
+use candle_core::Tensor;
+use opencv::core::{self, Mat};
+use opencv::imgcodecs;
+use opencv::imgproc;
+use opencv::prelude::*;
+use opencv::types::VectorOfMat;
+
+pub struct ImageChunks {
+    pub resized_chunks: Vec<Mat>,
+    pub padding: i32,
+    pub original_size: core::Size,
+    pub original_size_with_padding: core::Size,
+}
+
+impl ImageChunks {
+    pub fn stitch_image_tensors(&self, images: Vec<Tensor>) -> crate::Result<Mat> {
+        let image_chunks = images
+            .into_iter()
+            .map(heatmap_tensor_to_mat)
+            .collect::<crate::Result<Vec<Mat>>>()?;
+        let mut image = Mat::default();
+        let image_chunks = VectorOfMat::from_iter(image_chunks);
+        core::vconcat(&image_chunks, &mut image)?;
+        Ok(image)
+    }
+
+    pub fn resize_heatmap_to_image(&self, heatmap: Mat) -> crate::Result<Mat> {
+        // convert image [0,1) to 255 grayscale image
+        let mut gray_scale_image = Mat::default();
+        heatmap.convert_to(&mut gray_scale_image, core::CV_8UC1, 255.0, 0.0)?;
+        // resize image
+        let mut resized_image = Mat::default();
+        imgproc::resize(
+            &gray_scale_image,
+            &mut resized_image,
+            self.original_size_with_padding,
+            0.0,
+            0.0,
+            opencv::imgproc::INTER_LINEAR,
+        )?;
+        let result = Mat::roi(
+            &resized_image,
+            core::Rect::new(0, 0, self.original_size.width, self.original_size.height),
+        )?;
+        Ok(result)
+    }
+}
+
+fn heatmap_tensor_to_mat(heatmap: Tensor) -> crate::Result<Mat> {
+    let (height, width) = heatmap.dims2()?;
+    debug_assert_eq!(height, width, "original heatmap must be square");
+    let heatmap: Vec<Vec<f32>> = heatmap.to_vec2()?;
+    let mut img =
+        unsafe { Mat::new_size(core::Size::new(width as i32, height as i32), core::CV_32F)? };
+    for (x, row) in heatmap.iter().enumerate() {
+        for (y, &value) in row.iter().enumerate() {
+            *(img.at_2d_mut::<f32>(x as i32, y as i32)?) = value;
+        }
+    }
+    Ok(img)
+}
+
 /// convert an image from map to gray scale image and save it to output_path
-pub fn save_grayscale_image_with_resize<P: AsRef<Path>>(
-    image: &Mat,
-    size: Size,
-    output_path: P,
-) -> crate::Result<()> {
-    // convert image [0,1) to 255 grayscale image
-    let mut gray_scale_image = Mat::default();
-    image.convert_to(&mut gray_scale_image, core::CV_8UC1, 255.0, 0.0)?;
-    // resize image
-    let mut resized_image = Mat::default();
-    imgproc::resize(
-        &gray_scale_image,
-        &mut resized_image,
-        size,
-        0.0,
-        0.0,
-        opencv::imgproc::INTER_LINEAR,
-    )?;
+pub fn save_image<P: AsRef<Path>>(image: &Mat, output_path: P) -> crate::Result<()> {
     imgcodecs::imwrite(
         output_path.as_ref().as_os_str().to_str().unwrap(),
-        &resized_image,
+        image,
         &core::Vector::new(),
     )?;
     Ok(())
diff --git a/src/preprocess.rs b/src/preprocess.rs
index 913549f..efe4a47 100644
--- a/src/preprocess.rs
+++ b/src/preprocess.rs
@@ -1,3 +1,4 @@
+use crate::postprocess::ImageChunks;
 use candle_core::{Device, Tensor};
 use log::debug;
 use opencv::{
@@ -5,33 +6,12 @@ use opencv::{
     imgcodecs::{self, IMREAD_COLOR},
     imgproc,
     prelude::*,
-    types::VectorOfMat,
 };
 use std::path::Path;
 
 const INPUT_IMAGE_SIZE: i32 = 896;
 const IMAGE_CHUNK_HEIGHT: i32 = 1200;
 
-pub struct ImageChunks {
-    pub resized_chunks: Vec<Mat>,
-    pub padding: i32,
-    pub original_size: core::Size,
-    pub original_size_with_padding: core::Size,
-}
-
-impl ImageChunks {
-    pub fn stitch_image_tensors(&self, images: Vec<Tensor>) -> crate::Result<Mat> {
-        let image_chunks = images
-            .into_iter()
-            .map(heatmap_tensor_to_mat)
-            .collect::<crate::Result<Vec<Mat>>>()?;
-        let mut image = Mat::default();
-        let image_chunks = VectorOfMat::from_iter(image_chunks);
-        core::vconcat(&image_chunks, &mut image)?;
-        Ok(image)
-    }
-}
-
 /// load image from path and resize it to [INPUT_IMAGE_SIZE] and return the resized image and
 /// its original size
 pub fn read_chunked_resized_image<P: AsRef<Path>>(image_path: P) -> crate::Result<ImageChunks> {
@@ -125,20 +105,6 @@ pub fn image_to_tensor(input: &Mat, device: &Device) -> crate::Result<Tensor> {
     .broadcast_div(&std)?)
 }
 
-fn heatmap_tensor_to_mat(heatmap: Tensor) -> crate::Result<Mat> {
-    let (height, width) = heatmap.dims2()?;
-    debug_assert_eq!(height, width, "original heatmap must be square");
-    let heatmap: Vec<Vec<f32>> = heatmap.to_vec2()?;
-    let mut img =
-        unsafe { Mat::new_size(core::Size::new(width as i32, height as i32), core::CV_32F)? };
-    for (x, row) in heatmap.iter().enumerate() {
-        for (y, &value) in row.iter().enumerate() {
-            *(img.at_2d_mut::<f32>(x as i32, y as i32)?) = value;
-        }
-    }
-    Ok(img)
-}
-
 fn resize(image: Mat, new_size: core::Size) -> crate::Result<Mat> {
     let mut resized_image = Mat::default();
     imgproc::resize(