diff --git a/src/sauvola.jl b/src/sauvola.jl
index ffe32f9..c16815c 100644
--- a/src/sauvola.jl
+++ b/src/sauvola.jl
@@ -1,19 +1,17 @@
-struct Sauvola <: AbstractImageBinarizationAlgorithm
-    window_size::Int
-    bias::Float32
-end
-
 """
-```
-binarize(Sauvola(; window_size = 7, bias = 0.2), img)
-```
+    Sauvola <: AbstractImageBinarizationAlgorithm
+    Sauvola(; bias = 0.2, window_size=7)
+
+    binarize([T,] img, f::Sauvola)
+    binarize!([out,] img, f::Sauvola)
 
 Applies Sauvola--Pietikäinen adaptive image binarization [1] under the
 assumption that the input image is textual.
 
 # Output
 
-Returns the binarized image as an `Array{Gray{Bool},2}`.
+Return the binarized image as an `Array{Gray{T}}` of size `size(img)`. If
+`T` is not specified, it is inferred from `out` and `img`.
 
 # Details
 
@@ -55,16 +53,17 @@ source image, runtime is significantly improved.
 An image which is binarized according to a per-pixel adaptive
 threshold into background (0) and foreground (1) pixel values.
 
-## `window_size` (denoted by ``w`` in the publication)
+## `window_size::Integer` (denoted by ``w`` in the publication)
 
 The threshold for each pixel is a function of the distribution of the intensities
 of all neighboring pixels in a square window around it. The side length of this
 window is ``2w + 1``, with the target pixel in the center position.
 
-## `bias` (denoted by ``k`` in the publication)
+## `bias::Real` (denoted by ``k`` in the publication)
 
 A user-defined biasing parameter. This can take negative values, though values
-in the range [0.2, 0.5] are typical.
+in the range [0.2, 0.5] are typical. According to [1], this algorithm is not too
+sensitive to the value of ``k``.
 
 # Example
 
@@ -74,7 +73,7 @@ Binarize the "cameraman" image in the `TestImages` package.
 using TestImages, ImageBinarization
 
 img = testimage("cameraman")
-img_binary = binarize(Sauvola(window_size = 9, bias = 0.2), img)
+img_binary = binarize(img, Sauvola(window_size = 9, bias = 0.2))
 ```
 
 # References
@@ -83,31 +82,42 @@ img_binary = binarize(Sauvola(window_size = 9, bias = 0.2), img)
 2. Wayne Niblack (1986). *An Introduction to Image Processing*. Prentice-Hall, Englewood Cliffs, NJ: 115-16.
 3. Faisal Shafait, Daniel Keysers and Thomas M. Breuel (2008). "Efficient implementation of local adaptive thresholding techniques using integral images". Proc. SPIE 6815, Document Recognition and Retrieval XV, 681510 (28 January 2008). [doi:10.1117/12.767755](https://doi.org/10.1117/12.767755)
 """
+struct Sauvola <: AbstractImageBinarizationAlgorithm
+    window_size::Int # ``w`` in [1]; the square window has side length ``2w + 1``
+    bias::Float32 # ``k`` in [1], the user-defined biasing parameter
+end
+
+Sauvola(; window_size::Integer = 7, bias::Real = 0.2) = Sauvola(window_size, bias) # keyword constructor; any Integer is converted to the Int field
+
 function binarize(algorithm::Sauvola, img::AbstractArray{T,2}) where T <: Colorant
-    binarize(algorithm, Gray.(img))
+    return binarize(img, algorithm) # legacy argument order: forward to binarize(img, f); re-dispatching on Gray.(img) would re-enter this method forever
 end
 
-function binarize(algorithm::Sauvola, img::AbstractArray{T,2}) where T <: AbstractGray
-    w = algorithm.window_size
-    k = algorithm.bias
-    img₀₁ = zeros(Gray{Bool}, axes(img))
+function (f::Sauvola)(out::GenericGrayImage, img::GenericGrayImage) # binarizes `img` into `out` and returns `out`
+    window_size = f.window_size
+    k = f.bias
+    # Reject invalid arguments before any work is done.
+    window_size < 0 && throw(ArgumentError("window_size should be non-negative."))
+    size(out) == size(img) || throw(ArgumentError("out and img should have the same shape, instead they are $(size(out)) and $(size(img))"))
+    # Integral images of the intensities and of their squares feed μ_in_window and σ_in_window below.
     img_raw = channelview(img)
     I = integral_image(img_raw)
     I² = integral_image(img_raw.^2)
-    R = 0.5
+    R = 0.5 # dynamic range of standard deviation, in [1] it's set to 128 for 8-bit image
 
     function threshold(pixel::CartesianIndex{2})
-        row₀, col₀, row₁, col₁ = get_window_bounds(img, pixel, w)
+        row₀, col₀, row₁, col₁ = get_window_bounds(img, pixel, window_size)
         m = μ_in_window(I, row₀, col₀, row₁, col₁)
         s = σ_in_window(I², m, row₀, col₀, row₁, col₁)
         return m * (1 + (k * ((s / R) - 1)))
     end
 
-    for pixel in CartesianIndices(img)
-        img₀₁[pixel] = img[pixel] <= threshold(pixel) ? 0 : 1
+    @simd for pixel in CartesianIndices(img)
+        out[pixel] = img[pixel] <= threshold(pixel) ? 0 : 1 # at or below the local threshold -> 0, otherwise 1
     end
 
-    return img₀₁
+    return out
 end
 
-Sauvola(; window_size::Int = 7, bias::Real = 0.2) = Sauvola(window_size, bias)
+(f::Sauvola)(out::GenericGrayImage, img::AbstractArray{<:Color3}) =
+    f(out, of_eltype(Gray, img)) # Color3 input: re-express img with eltype Gray, then call f again on that
diff --git a/test/References/Sauvola_Color3.png b/test/References/Sauvola_Color3.png
new file mode 100644
index 0000000..6c4f381
Binary files /dev/null and b/test/References/Sauvola_Color3.png differ
diff --git a/test/References/Sauvola_Gray.png b/test/References/Sauvola_Gray.png
new file mode 100644
index 0000000..6c4f381
Binary files /dev/null and b/test/References/Sauvola_Gray.png differ
diff --git a/test/runtests.jl b/test/runtests.jl
index c0163d3..c025b89 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -18,7 +18,7 @@ include("testutils.jl")
     include("niblack.jl")
     include("otsu.jl")
     # include("polysegment.jl")
-    # include("sauvola.jl")
+    include("sauvola.jl")
     include("unimodal.jl")
     include("yen.jl")
 end
diff --git a/test/sauvola.jl b/test/sauvola.jl
index 6074d49..96d9234 100644
--- a/test/sauvola.jl
+++ b/test/sauvola.jl
@@ -1,19 +1,85 @@
-@testset "Sauvola" begin
-
-    for T in (Gray{N0f8}, Gray{N0f16}, Gray{Float32}, Gray{Float64})
-        img = T.([i <= 25 && j <= 25 ? 0.8 : 1.0 for i = 1:50, j = 1:50])
-        target_row = target_col = 13
-        img[target_row,target_col] = 0
-
-        for i in 0:10:50, j in 0:10:50
-            img₀ = circshift(img, (i,j))
-            target_row₀ = (target_row + i) % 50
-            target_col₀ = (target_col + j) % 50
-
-            img_bin = binarize(Sauvola(window_size = 7, bias = 0.21), img₀)
-            @test eltype(img_bin) == Gray{Bool}
-            @test sum(img_bin .== 0) == 1
-            @test img_bin[target_row₀, target_col₀] == 0
+@testset "sauvola" begin
+    @info "Test: Sauvola"
+
+    @testset "API" begin
+        reference = imresize(testimage("lena_gray_256"); ratio=0.25)
+        input = copy(reference)
+        algo = Sauvola(window_size=7, bias=0.2)
+
+        # out-of-place, output element type not specified
+        inferred = binarize(input, algo)
+        @test input == reference # input must not be mutated
+        @test eltype(inferred) == Gray{N0f8}
+        # out-of-place, output element type requested explicitly
+        typed = binarize(Gray{Bool}, input, algo)
+        @test input == reference # input must not be mutated
+        @test eltype(typed) == Gray{Bool}
+        # in-place into a separately allocated Bool buffer
+        buffer = similar(input, Bool)
+        binarize!(buffer, input, algo)
+        @test input == reference # input must not be mutated
+        @test eltype(buffer) == Bool
+        # in-place with a single image argument
+        overwritten = copy(reference)
+        binarize!(overwritten, algo)
+        @test eltype(overwritten) == Gray{N0f8}
+        # every entry point must agree on the result
+        @test inferred == typed
+        @test inferred == buffer
+        @test inferred == overwritten
+    end
+
+    @testset "Types" begin
+        # Gray input: each generated element type is checked against the same reference image
+        img_gray = imresize(testimage("lena_gray_256"); ratio=0.25)
+        f = Sauvola(window_size=7, bias=0.2)
+
+        type_list = generate_test_types([Float32, N0f8], [Gray])
+        for T in type_list
+            img = T.(img_gray)
+            @test_reference "References/Sauvola_Gray.png" Gray.(binarize(img, f))
+        end
+
+        # Color3 input: convert the *color* image so the Color3 method is exercised with color data
+        img_color = imresize(testimage("lena_color_256"); ratio=0.25)
+        f = Sauvola(window_size=7, bias=0.2)
+        # NOTE(review): regenerate References/Sauvola_Color3.png if it was recorded from img_gray
+        type_list = generate_test_types([Float32, N0f8], [RGB, Lab])
+        for T in type_list
+            img = T.(img_color)
+            @test_reference "References/Sauvola_Color3.png" Gray.(binarize(img, f))
+        end
+    end
+
+    @testset "Numerical" begin
+        # The output must contain nothing but zeros and ones.
+        source = imresize(testimage("lena_gray_256"); ratio=0.25)
+        algo = Sauvola(window_size=7, bias=0.2)
+        binary = binarize(source, algo)
+        @test all(x -> x == 0.0 || x == 1.0, binary)
+
+        # The brightest pixel must come out as 1 and the darkest as 0.
+        brightest = argmax(Gray.(source))
+        @test binary[brightest] == 1
+        darkest = argmin(Gray.(source))
+        @test binary[darkest] == 0
+
+        # A single dark pixel on a two-level background must be the only zero,
+        # wherever the image is circularly shifted to.
+        for T in (Gray{N0f8}, Gray{N0f16}, Gray{Float32}, Gray{Float64})
+            canvas = T.([r <= 25 && c <= 25 ? 0.8 : 1.0 for r = 1:50, c = 1:50])
+            row = col = 13
+            canvas[row, col] = 0
+            detector = Sauvola(window_size = 7, bias = 0.21)
+
+            for Δrow in 0:10:50, Δcol in 0:10:50
+                shifted = circshift(canvas, (Δrow, Δcol))
+                expected = CartesianIndex(rem(row + Δrow, 50), rem(col + Δcol, 50))
+                result = binarize(shifted, detector)
+                # exactly one zero, located where the dark pixel moved to
+                @test count(x -> x == 0, result) == 1
+                @test result[expected] == 0
+            end
         end
     end