Skip to content

Commit

Permalink
Introduce ParsingOptions.FilterProvider and BaseFilterProvider and make CcittFaxCompressionType a byte
Browse files Browse the repository at this point in the history
  • Loading branch information
BobLd committed Oct 17, 2024
1 parent 4b5cb47 commit 8cee4f4
Show file tree
Hide file tree
Showing 8 changed files with 242 additions and 81 deletions.
126 changes: 126 additions & 0 deletions src/UglyToad.PdfPig.Tests/Integration/FilterTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
namespace UglyToad.PdfPig.Tests.Integration
{
using PdfPig.Filters;
using PdfPig.Tokens;
using System;
using System.Collections.Generic;
using System.Linq;

/// <summary>
/// Integration tests that open the sample documents with a custom <see cref="IFilterProvider"/>
/// (<see cref="MyFilterProvider"/>) supplied through <see cref="ParsingOptions.FilterProvider"/>.
/// </summary>
public class FilterTests
{
    private static readonly Lazy<string> DocumentFolder = new Lazy<string>(() => Path.GetFullPath(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "..", "..", "..", "Integration", "Documents")));

    // Documents excluded from the theory data produced by GetAllDocuments.
    private static readonly HashSet<string> _documentsToIgnore =
    [
        "issue_671.pdf",
        "GHOSTSCRIPT-698363-0.pdf",
        "ErcotFacts.pdf"
    ];

    /// <summary>
    /// Opens the document with <see cref="MyFilterProvider"/> and asserts that no PNG can be produced
    /// for any image whose single-name filter is neither Flate nor LZW.
    /// </summary>
    /// <param name="documentName">File name of the document, relative to the documents folder.</param>
    [Theory]
    [MemberData(nameof(GetAllDocuments))]
    public void NoImageDecoding(string documentName)
    {
        // Add the full path back on, we removed it so we could see it in the test explorer.
        documentName = Path.Combine(DocumentFolder.Value, documentName);

        var parsingOptions = new ParsingOptions
        {
            UseLenientParsing = true,
            FilterProvider = MyFilterProvider.Instance
        };

        using var document = PdfDocument.Open(documentName, parsingOptions);

        // GetPage is called with 1-based page numbers.
        for (var pageNumber = 1; pageNumber <= document.NumberOfPages; pageNumber++)
        {
            var page = document.GetPage(pageNumber);

            foreach (var pdfImage in page.GetImages())
            {
                // Images whose dictionary has no Filter entry readable as a single name are skipped.
                if (!pdfImage.ImageDictionary.TryGet(NameToken.Filter, out NameToken filter))
                {
                    continue;
                }

                // Flate and LZW are still mapped to real filters by MyFilterProvider, so they are skipped too.
                if (IsFlateOrLzw(filter.Data))
                {
                    continue;
                }

                Assert.False(pdfImage.TryGetPng(out _));
            }
        }
    }

    /// <summary>
    /// Returns <c>true</c> when the filter name is the full or abbreviated name of the Flate or LZW filter.
    /// </summary>
    private static bool IsFlateOrLzw(string filterName)
    {
        return filterName.Equals(NameToken.FlateDecode.Data) ||
               filterName.Equals(NameToken.FlateDecodeAbbreviation.Data) ||
               filterName.Equals(NameToken.LzwDecode.Data) ||
               filterName.Equals(NameToken.LzwDecodeAbbreviation.Data);
    }

    /// <summary>
    /// Placeholder filter that reports itself as unsupported and cannot decode anything.
    /// </summary>
    public sealed class NoFilter : IFilter
    {
        /// <summary>
        /// Always <c>false</c>.
        /// </summary>
        public bool IsSupported => false;

        /// <summary>
        /// Not implemented; always throws.
        /// </summary>
        /// <exception cref="NotImplementedException">Thrown on every call.</exception>
        public ReadOnlyMemory<byte> Decode(ReadOnlySpan<byte> input, DictionaryToken streamDictionary, int filterIndex)
        {
            throw new NotImplementedException();
        }
    }

    /// <summary>
    /// Filter provider that keeps the ASCII85, ASCIIHex, Flate, RunLength and LZW filters and maps
    /// the CCITT fax, DCT, JBIG2 and JPX filter names to <see cref="NoFilter"/>.
    /// </summary>
    public class MyFilterProvider : BaseFilterProvider
    {
        /// <summary>
        /// The single instance of this provider.
        /// </summary>
        public static readonly IFilterProvider Instance = new MyFilterProvider();

        /// <summary>
        /// Create the provider with the filter table built by <see cref="GetDictionary"/>.
        /// </summary>
        protected MyFilterProvider() : base(GetDictionary())
        {
        }

        /// <summary>
        /// Builds the filter-name to filter-instance table; full and abbreviated names share one instance.
        /// </summary>
        private static Dictionary<string, IFilter> GetDictionary()
        {
            var ascii85 = new Ascii85Filter();
            var asciiHex = new AsciiHexDecodeFilter();
            var flate = new FlateFilter();
            var runLength = new RunLengthFilter();
            var lzw = new LzwFilter();

            var noFilter = new NoFilter();

            return new Dictionary<string, IFilter>
            {
                { NameToken.Ascii85Decode.Data, ascii85 },
                { NameToken.Ascii85DecodeAbbreviation.Data, ascii85 },
                { NameToken.AsciiHexDecode.Data, asciiHex },
                { NameToken.AsciiHexDecodeAbbreviation.Data, asciiHex },
                { NameToken.CcittfaxDecode.Data, noFilter },
                { NameToken.CcittfaxDecodeAbbreviation.Data, noFilter },
                { NameToken.DctDecode.Data, noFilter },
                { NameToken.DctDecodeAbbreviation.Data, noFilter },
                { NameToken.FlateDecode.Data, flate },
                { NameToken.FlateDecodeAbbreviation.Data, flate },
                { NameToken.Jbig2Decode.Data, noFilter },
                { NameToken.JpxDecode.Data, noFilter },
                { NameToken.RunLengthDecode.Data, runLength },
                { NameToken.RunLengthDecodeAbbreviation.Data, runLength },
                // Use .Data explicitly, like every other entry, rather than the implicit string conversion.
                { NameToken.LzwDecode.Data, lzw },
                { NameToken.LzwDecodeAbbreviation.Data, lzw }
            };
        }
    }

    /// <summary>
    /// Theory data: the file name of every PDF in the documents folder that is not in the ignore list.
    /// </summary>
    public static IEnumerable<object[]> GetAllDocuments
    {
        get
        {
            var files = Directory.GetFiles(DocumentFolder.Value, "*.pdf");

            // File paths are not linguistic text, so the suffix match is ordinal rather than culture-sensitive.
            // Return the shortname so we can see it in the test explorer.
            return files
                .Where(x => !_documentsToIgnore.Any(i => x.EndsWith(i, StringComparison.Ordinal)))
                .Select(x => new object[] { Path.GetFileName(x) });
        }
    }
}
}
1 change: 1 addition & 0 deletions src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ public void OnlyExposedApiIsPublic()
"UglyToad.PdfPig.CrossReference.CrossReferenceType",
"UglyToad.PdfPig.CrossReference.TrailerDictionary",
"UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException",
"UglyToad.PdfPig.Filters.BaseFilterProvider",
"UglyToad.PdfPig.Filters.DefaultFilterProvider",
"UglyToad.PdfPig.Filters.IFilter",
"UglyToad.PdfPig.Filters.IFilterProvider",
Expand Down
96 changes: 96 additions & 0 deletions src/UglyToad.PdfPig/Filters/BaseFilterProvider.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
namespace UglyToad.PdfPig.Filters
{
using Core;
using System;
using System.Collections.Generic;
using System.Linq;
using Tokens;
using Util;

/// <summary>
/// Base abstract class for FilterProvider.
/// </summary>
public abstract class BaseFilterProvider : IFilterProvider
{
    /// <summary>
    /// Dictionary of filters, keyed by filter name.
    /// </summary>
    protected readonly IReadOnlyDictionary<string, IFilter> FilterInstances;

    /// <summary>
    /// Create a new <see cref="BaseFilterProvider"/> with the given filters.
    /// </summary>
    /// <param name="filterInstances">The filters to serve, keyed by filter name.</param>
    /// <exception cref="ArgumentNullException"><paramref name="filterInstances"/> is <c>null</c>.</exception>
    protected BaseFilterProvider(IReadOnlyDictionary<string, IFilter> filterInstances)
    {
        // Fail at construction instead of with a NullReferenceException on the first lookup.
        FilterInstances = filterInstances ?? throw new ArgumentNullException(nameof(filterInstances));
    }

    /// <summary>
    /// Get the filters named by the filter entry of the dictionary, in the order they are listed.
    /// The entry is queried with both <see cref="NameToken.Filter"/> and <see cref="NameToken.F"/>.
    /// </summary>
    /// <param name="dictionary">The dictionary to read the filter entry from.</param>
    /// <returns>
    /// An empty list when there is no filter entry, a single filter when the entry is a name,
    /// or one filter per element when the entry is an array.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="dictionary"/> is <c>null</c>.</exception>
    /// <exception cref="PdfDocumentFormatException">
    /// The filter entry is neither a name nor an array, or the array contains an element that is not a name.
    /// </exception>
    /// <exception cref="NotSupportedException">A filter name is not registered with this provider.</exception>
    public IReadOnlyList<IFilter> GetFilters(DictionaryToken dictionary)
    {
        if (dictionary is null)
        {
            throw new ArgumentNullException(nameof(dictionary));
        }

        var token = dictionary.GetObjectOrDefault(NameToken.Filter, NameToken.F);
        if (token is null)
        {
            return Array.Empty<IFilter>();
        }

        switch (token)
        {
            case ArrayToken filters:
                var result = new IFilter[filters.Data.Count];
                for (var i = 0; i < filters.Data.Count; i++)
                {
                    // Report a malformed array the same way as a malformed entry, not as an InvalidCastException.
                    if (!(filters.Data[i] is NameToken filterName))
                    {
                        throw new PdfDocumentFormatException($"The filter array for the stream contained an invalid entry. Expected name, instead got: {filters.Data[i]}.");
                    }

                    result[i] = GetFilterStrict(filterName.Data);
                }

                return result;
            case NameToken name:
                return new[] { GetFilterStrict(name.Data) };
            default:
                throw new PdfDocumentFormatException($"The filter for the stream was not a valid object. Expected name or array, instead got: {token}.");
        }
    }

    /// <summary>
    /// Get the filters registered under the given names, in the same order as <paramref name="names"/>.
    /// </summary>
    /// <param name="names">The filter names to resolve.</param>
    /// <returns>One filter per name.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="names"/> is <c>null</c>.</exception>
    /// <exception cref="NotSupportedException">A filter name is not registered with this provider.</exception>
    public IReadOnlyList<IFilter> GetNamedFilters(IReadOnlyList<NameToken> names)
    {
        if (names is null)
        {
            throw new ArgumentNullException(nameof(names));
        }

        // The final size is known up front, so presize the list.
        var result = new List<IFilter>(names.Count);

        foreach (var name in names)
        {
            result.Add(GetFilterStrict(name));
        }

        return result;
    }

    /// <summary>
    /// Look up a filter by name, throwing when the name is not registered.
    /// </summary>
    private IFilter GetFilterStrict(string name)
    {
        if (!FilterInstances.TryGetValue(name, out var filter))
        {
            throw new NotSupportedException($"The filter with the name {name} is not supported yet. Please raise an issue.");
        }

        return filter;
    }

    /// <summary>
    /// Get every distinct filter instance registered with this provider.
    /// </summary>
    /// <returns>The registered filter instances, with duplicates removed.</returns>
    public IReadOnlyList<IFilter> GetAllFilters()
    {
        // The same instance may be registered under several names (e.g. a full name and an abbreviation).
        return FilterInstances.Values.Distinct().ToList();
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
/// <summary>
/// Specifies the compression type to use with <see cref="T:UglyToad.PdfPig.Filters.CcittFaxDecoderStream" />.
/// </summary>
internal enum CcittFaxCompressionType
internal enum CcittFaxCompressionType : byte
{
/// <summary>
/// Modified Huffman (MH) - Group 3 variation (T2)
Expand Down
89 changes: 10 additions & 79 deletions src/UglyToad.PdfPig/Filters/DefaultFilterProvider.cs
Original file line number Diff line number Diff line change
@@ -1,25 +1,24 @@
namespace UglyToad.PdfPig.Filters
{
using System;
using System.Collections.Generic;
using System.Linq;
using Core;
using Tokens;
using Util;

/// <summary>
/// The default implementation of the <see cref="T:UglyToad.PdfPig.Filters.IFilterProvider" />.
/// </summary>
public class DefaultFilterProvider : IFilterProvider
public sealed class DefaultFilterProvider : BaseFilterProvider
{
private readonly IReadOnlyDictionary<string, IFilter> filterInstances;

/// <summary>
/// The single instance of this provider.
/// </summary>
public static readonly IFilterProvider Instance = new DefaultFilterProvider();

private DefaultFilterProvider()
/// <inheritdoc/>
private DefaultFilterProvider() : base(GetDictionary())
{
}

private static Dictionary<string, IFilter> GetDictionary()
{
var ascii85 = new Ascii85Filter();
var asciiHex = new AsciiHexDecodeFilter();
Expand All @@ -31,7 +30,7 @@ private DefaultFilterProvider()
var runLength = new RunLengthFilter();
var lzw = new LzwFilter();

filterInstances = new Dictionary<string, IFilter>
return new Dictionary<string, IFilter>
{
{ NameToken.Ascii85Decode.Data, ascii85 },
{ NameToken.Ascii85DecodeAbbreviation.Data, ascii85 },
Expand All @@ -47,77 +46,9 @@ private DefaultFilterProvider()
{ NameToken.JpxDecode.Data, jpx },
{ NameToken.RunLengthDecode.Data, runLength },
{ NameToken.RunLengthDecodeAbbreviation.Data, runLength },
{NameToken.LzwDecode, lzw },
{NameToken.LzwDecodeAbbreviation, lzw }
{ NameToken.LzwDecode.Data, lzw },
{ NameToken.LzwDecodeAbbreviation.Data, lzw }
};
}

/// <inheritdoc />
public IReadOnlyList<IFilter> GetFilters(DictionaryToken dictionary)
{
if (dictionary is null)
{
throw new ArgumentNullException(nameof(dictionary));
}

var token = dictionary.GetObjectOrDefault(NameToken.Filter, NameToken.F);
if (token is null)
{
return Array.Empty<IFilter>();
}

switch (token)
{
case ArrayToken filters:
var result = new IFilter[filters.Data.Count];
for (var i = 0; i < filters.Data.Count; i++)
{
var filterToken = filters.Data[i];
var filterName = ((NameToken) filterToken).Data;
result[i] = GetFilterStrict(filterName);
}

return result;
case NameToken name:
return new[] { GetFilterStrict(name.Data) };
default:
throw new PdfDocumentFormatException($"The filter for the stream was not a valid object. Expected name or array, instead got: {token}.");
}
}

/// <inheritdoc />
public IReadOnlyList<IFilter> GetNamedFilters(IReadOnlyList<NameToken> names)
{
if (names is null)
{
throw new ArgumentNullException(nameof(names));
}

var result = new List<IFilter>();

foreach (var name in names)
{
result.Add(GetFilterStrict(name));
}

return result;
}

private IFilter GetFilterStrict(string name)
{
if (!filterInstances.TryGetValue(name, out var factory))
{
throw new NotSupportedException($"The filter with the name {name} is not supported yet. Please raise an issue.");
}

return factory;
}

/// <inheritdoc />
public IReadOnlyList<IFilter> GetAllFilters()
{
return filterInstances.Values.Distinct().ToList();
}

}
}
1 change: 1 addition & 0 deletions src/UglyToad.PdfPig/Filters/FlateFilter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ private static byte[] Decompress(byte[] input)
}
}

/// <inheritdoc />
public byte[] Encode(Stream input, DictionaryToken streamDictionary, int index)
{
const int headerLength = 2;
Expand Down
2 changes: 1 addition & 1 deletion src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ private static PdfDocument OpenDocument(
ISeekableTokenScanner scanner,
ParsingOptions parsingOptions)
{
var filterProvider = new FilterProviderWithLookup(DefaultFilterProvider.Instance);
var filterProvider = new FilterProviderWithLookup(parsingOptions.FilterProvider ?? DefaultFilterProvider.Instance);

CrossReferenceTable? crossReferenceTable = null;

Expand Down
6 changes: 6 additions & 0 deletions src/UglyToad.PdfPig/ParsingOptions.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
namespace UglyToad.PdfPig
{
using Filters;
using System.Collections.Generic;
using Logging;

Expand Down Expand Up @@ -50,5 +51,10 @@ public sealed class ParsingOptions
/// forms and images when missing.
/// </summary>
public bool SkipMissingFonts { get; set; } = false;

/// <summary>
/// Filter provider to use while parsing the document. The <see cref="DefaultFilterProvider"/> will be used if set to <c>null</c>.
/// </summary>
public IFilterProvider? FilterProvider { get; set; } = null;
}
}

0 comments on commit 8cee4f4

Please sign in to comment.