From 8cee4f480fc4d2c8661778851650ccc57f862fa4 Mon Sep 17 00:00:00 2001
From: BobLd <38405645+BobLd@users.noreply.github.com>
Date: Tue, 15 Oct 2024 22:51:22 +0100
Subject: [PATCH] Introduce ParsingOptions.FilterProvider and
 BaseFilterProvider and make CcittFaxCompressionType a byte

---
 .../Integration/FilterTests.cs                | 126 ++++++++++++++++++
 .../PublicApiScannerTests.cs                  |   1 +
 .../Filters/BaseFilterProvider.cs             |  96 +++++++++++++
 .../CcittFax/CcittFaxCompressionType.cs       |   2 +-
 .../Filters/DefaultFilterProvider.cs          |  89 ++-----------
 src/UglyToad.PdfPig/Filters/FlateFilter.cs    |   1 +
 .../Parser/PdfDocumentFactory.cs              |   2 +-
 src/UglyToad.PdfPig/ParsingOptions.cs         |   6 +
 8 files changed, 242 insertions(+), 81 deletions(-)
 create mode 100644 src/UglyToad.PdfPig.Tests/Integration/FilterTests.cs
 create mode 100644 src/UglyToad.PdfPig/Filters/BaseFilterProvider.cs

diff --git a/src/UglyToad.PdfPig.Tests/Integration/FilterTests.cs b/src/UglyToad.PdfPig.Tests/Integration/FilterTests.cs
new file mode 100644
index 000000000..735b6ae10
--- /dev/null
+++ b/src/UglyToad.PdfPig.Tests/Integration/FilterTests.cs
@@ -0,0 +1,126 @@
+namespace UglyToad.PdfPig.Tests.Integration
+{
+    using PdfPig.Filters;
+    using PdfPig.Tokens;
+    using System;
+    using System.Collections.Generic;
+    using System.Linq;
+
+    public class FilterTests
+    {
+        private static readonly Lazy<string> DocumentFolder = new Lazy<string>(() => Path.GetFullPath(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "..", "..", "..", "Integration", "Documents")));
+        private static readonly HashSet<string> _documentsToIgnore =
+        [
+            "issue_671.pdf",
+            "GHOSTSCRIPT-698363-0.pdf",
+            "ErcotFacts.pdf"
+        ];
+
+        [Theory]
+        [MemberData(nameof(GetAllDocuments))]
+        public void NoImageDecoding(string documentName)
+        {
+            // Add the full path back on, we removed it so we could see it in the test explorer.
+            documentName = Path.Combine(DocumentFolder.Value, documentName);
+
+            var parsingOptions = new ParsingOptions
+            {
+                UseLenientParsing = true,
+                FilterProvider = MyFilterProvider.Instance
+            };
+
+            using (var document = PdfDocument.Open(documentName, parsingOptions))
+            {
+                for (var i = 0; i < document.NumberOfPages; i++)
+                {
+                    var page = document.GetPage(i + 1);
+
+                    foreach (var pdfImage in page.GetImages())
+                    {
+                        if (pdfImage.ImageDictionary.TryGet(NameToken.Filter, out NameToken filter))
+                        {
+                            if (filter.Data.Equals(NameToken.FlateDecode.Data) ||
+                                filter.Data.Equals(NameToken.FlateDecodeAbbreviation.Data) ||
+                                filter.Data.Equals(NameToken.LzwDecode.Data) ||
+                                filter.Data.Equals(NameToken.LzwDecodeAbbreviation.Data))
+                            {
+                                continue;
+                            }
+                        }
+                        else
+                        {
+                            continue;
+                        }
+
+                        Assert.False(pdfImage.TryGetPng(out _));
+                    }
+                }
+            }
+        }
+
+        public sealed class NoFilter : IFilter
+        {
+            public bool IsSupported => false;
+
+            public ReadOnlyMemory<byte> Decode(ReadOnlySpan<byte> input, DictionaryToken streamDictionary, int filterIndex)
+            {
+                throw new NotImplementedException();
+            }
+        }
+
+        public class MyFilterProvider : BaseFilterProvider
+        {
+            /// <summary>
+            /// The single instance of this provider.
+            /// </summary>
+            public static readonly IFilterProvider Instance = new MyFilterProvider();
+
+            /// <summary>Creates the provider with the CCITT, DCT, JBIG2 and JPX entries mapped to <see cref="NoFilter"/>.</summary>
+            protected MyFilterProvider() : base(GetDictionary())
+            {
+            }
+
+            private static Dictionary<string, IFilter> GetDictionary()
+            {
+                var ascii85 = new Ascii85Filter();
+                var asciiHex = new AsciiHexDecodeFilter();
+                var flate = new FlateFilter();
+                var runLength = new RunLengthFilter();
+                var lzw = new LzwFilter();
+
+                var noFilter = new NoFilter();
+
+                return new Dictionary<string, IFilter>
+                {
+                    { NameToken.Ascii85Decode.Data, ascii85 },
+                    { NameToken.Ascii85DecodeAbbreviation.Data, ascii85 },
+                    { NameToken.AsciiHexDecode.Data, asciiHex },
+                    { NameToken.AsciiHexDecodeAbbreviation.Data, asciiHex },
+                    { NameToken.CcittfaxDecode.Data, noFilter },
+                    { NameToken.CcittfaxDecodeAbbreviation.Data, noFilter },
+                    { NameToken.DctDecode.Data, noFilter },
+                    { NameToken.DctDecodeAbbreviation.Data, noFilter },
+                    { NameToken.FlateDecode.Data, flate },
+                    { NameToken.FlateDecodeAbbreviation.Data, flate },
+                    { NameToken.Jbig2Decode.Data, noFilter },
+                    { NameToken.JpxDecode.Data, noFilter },
+                    { NameToken.RunLengthDecode.Data, runLength },
+                    { NameToken.RunLengthDecodeAbbreviation.Data, runLength },
+                    { NameToken.LzwDecode.Data, lzw },
+                    { NameToken.LzwDecodeAbbreviation.Data, lzw }
+                };
+            }
+        }
+
+        public static IEnumerable<object[]> GetAllDocuments
+        {
+            get
+            {
+                var files = Directory.GetFiles(DocumentFolder.Value, "*.pdf");
+
+                // Return the shortname so we can see it in the test explorer; file names are matched ordinally.
+                return files.Where(x => !_documentsToIgnore.Any(i => x.EndsWith(i, StringComparison.Ordinal))).Select(x => new object[] { Path.GetFileName(x) });
+            }
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
index 6690d9240..364bb53b3 100644
--- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
+++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
@@ -97,6 +97,7 @@ public void OnlyExposedApiIsPublic()
                 "UglyToad.PdfPig.CrossReference.CrossReferenceType",
                 "UglyToad.PdfPig.CrossReference.TrailerDictionary",
                 "UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException",
+                "UglyToad.PdfPig.Filters.BaseFilterProvider",
                 "UglyToad.PdfPig.Filters.DefaultFilterProvider",
                 "UglyToad.PdfPig.Filters.IFilter",
                 "UglyToad.PdfPig.Filters.IFilterProvider",
diff --git a/src/UglyToad.PdfPig/Filters/BaseFilterProvider.cs b/src/UglyToad.PdfPig/Filters/BaseFilterProvider.cs
new file mode 100644
index 000000000..0e51f4cff
--- /dev/null
+++ b/src/UglyToad.PdfPig/Filters/BaseFilterProvider.cs
@@ -0,0 +1,96 @@
+namespace UglyToad.PdfPig.Filters
+{
+    using Core;
+    using System;
+    using System.Collections.Generic;
+    using System.Linq;
+    using Tokens;
+    using Util;
+
+    /// <summary>
+    /// Base abstract class for FilterProvider.
+    /// </summary>
+    public abstract class BaseFilterProvider : IFilterProvider
+    {
+        /// <summary>
+        /// Dictionary of filters.
+        /// </summary>
+        protected readonly IReadOnlyDictionary<string, IFilter> FilterInstances;
+
+        /// <summary>
+        /// Create a new <see cref="BaseFilterProvider"/> with the given filters.
+        /// </summary>
+        /// <param name="filterInstances">The filters to serve, keyed by filter name. Must not be <c>null</c>.</param>
+        protected BaseFilterProvider(IReadOnlyDictionary<string, IFilter> filterInstances)
+        {
+            FilterInstances = filterInstances ?? throw new ArgumentNullException(nameof(filterInstances));
+        }
+
+        /// <inheritdoc />
+        public IReadOnlyList<IFilter> GetFilters(DictionaryToken dictionary)
+        {
+            if (dictionary is null)
+            {
+                throw new ArgumentNullException(nameof(dictionary));
+            }
+
+            var token = dictionary.GetObjectOrDefault(NameToken.Filter, NameToken.F);
+            if (token is null)
+            {
+                return Array.Empty<IFilter>();
+            }
+
+            switch (token)
+            {
+                case ArrayToken filters:
+                    var result = new IFilter[filters.Data.Count];
+                    for (var i = 0; i < filters.Data.Count; i++)
+                    {
+                        var filterToken = filters.Data[i];
+                        var filterName = ((NameToken)filterToken).Data;
+                        result[i] = GetFilterStrict(filterName);
+                    }
+
+                    return result;
+                case NameToken name:
+                    return new[] { GetFilterStrict(name.Data) };
+                default:
+                    throw new PdfDocumentFormatException($"The filter for the stream was not a valid object. Expected name or array, instead got: {token}.");
+            }
+        }
+
+        /// <inheritdoc />
+        public IReadOnlyList<IFilter> GetNamedFilters(IReadOnlyList<string> names)
+        {
+            if (names is null)
+            {
+                throw new ArgumentNullException(nameof(names));
+            }
+
+            var result = new List<IFilter>();
+
+            foreach (var name in names)
+            {
+                result.Add(GetFilterStrict(name));
+            }
+
+            return result;
+        }
+
+        private IFilter GetFilterStrict(string name)
+        {
+            if (!FilterInstances.TryGetValue(name, out var factory))
+            {
+                throw new NotSupportedException($"The filter with the name {name} is not supported yet. Please raise an issue.");
+            }
+
+            return factory;
+        }
+
+        /// <inheritdoc />
+        public IReadOnlyList<IFilter> GetAllFilters()
+        {
+            return FilterInstances.Values.Distinct().ToList();
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/Filters/CcittFax/CcittFaxCompressionType.cs b/src/UglyToad.PdfPig/Filters/CcittFax/CcittFaxCompressionType.cs
index 85f2ba817..723494116 100644
--- a/src/UglyToad.PdfPig/Filters/CcittFax/CcittFaxCompressionType.cs
+++ b/src/UglyToad.PdfPig/Filters/CcittFax/CcittFaxCompressionType.cs
@@ -3,7 +3,7 @@
     /// <summary>
     /// Specifies the compression type to use with <see cref="T:UglyToad.PdfPig.Filters.CcittFaxDecoderStream" />.
     /// </summary>
-    internal enum CcittFaxCompressionType
+    internal enum CcittFaxCompressionType : byte
     {
         /// <summary>
         /// Modified Huffman (MH) - Group 3 variation (T2)
diff --git a/src/UglyToad.PdfPig/Filters/DefaultFilterProvider.cs b/src/UglyToad.PdfPig/Filters/DefaultFilterProvider.cs
index 61a4efdf2..0c69890e4 100644
--- a/src/UglyToad.PdfPig/Filters/DefaultFilterProvider.cs
+++ b/src/UglyToad.PdfPig/Filters/DefaultFilterProvider.cs
@@ -1,25 +1,24 @@
 namespace UglyToad.PdfPig.Filters
 {
-    using System;
     using System.Collections.Generic;
-    using System.Linq;
-    using Core;
     using Tokens;
-    using Util;
 
     /// <summary>
     /// The default implementation of the <see cref="T:UglyToad.PdfPig.Filters.IFilterProvider" />.
     /// </summary>
-    public class DefaultFilterProvider : IFilterProvider
+    public sealed class DefaultFilterProvider : BaseFilterProvider
     {
-        private readonly IReadOnlyDictionary<string, IFilter> filterInstances;
-
         /// <summary>
         /// The single instance of this provider.
         /// </summary>
         public static readonly IFilterProvider Instance = new DefaultFilterProvider();
 
-        private DefaultFilterProvider()
+        /// <inheritdoc/>
+        private DefaultFilterProvider() : base(GetDictionary())
+        {
+        }
+
+        private static Dictionary<string, IFilter> GetDictionary()
         {
             var ascii85 = new Ascii85Filter();
             var asciiHex = new AsciiHexDecodeFilter();
@@ -31,7 +30,7 @@ private DefaultFilterProvider()
             var runLength = new RunLengthFilter();
             var lzw = new LzwFilter();
 
-            filterInstances = new Dictionary<string, IFilter>
+            return new Dictionary<string, IFilter>
             {
                 { NameToken.Ascii85Decode.Data, ascii85 },
                 { NameToken.Ascii85DecodeAbbreviation.Data, ascii85 },
@@ -47,77 +46,9 @@ private DefaultFilterProvider()
                 { NameToken.JpxDecode.Data, jpx },
                 { NameToken.RunLengthDecode.Data, runLength },
                 { NameToken.RunLengthDecodeAbbreviation.Data, runLength },
-                {NameToken.LzwDecode, lzw },
-                {NameToken.LzwDecodeAbbreviation, lzw }
+                { NameToken.LzwDecode.Data, lzw },
+                { NameToken.LzwDecodeAbbreviation.Data, lzw }
             };
         }
-
-        /// <inheritdoc />
-        public IReadOnlyList<IFilter> GetFilters(DictionaryToken dictionary)
-        {
-            if (dictionary is null)
-            {
-                throw new ArgumentNullException(nameof(dictionary));
-            }
-
-            var token = dictionary.GetObjectOrDefault(NameToken.Filter, NameToken.F);
-            if (token is null)
-            {
-                return Array.Empty<IFilter>();
-            }
-
-            switch (token)
-            {
-                case ArrayToken filters:
-                    var result = new IFilter[filters.Data.Count];
-                    for (var i = 0; i < filters.Data.Count; i++)
-                    {
-                        var filterToken = filters.Data[i];
-                        var filterName = ((NameToken) filterToken).Data;
-                        result[i] = GetFilterStrict(filterName);
-                    }
-
-                    return result;
-                case NameToken name:
-                    return new[] { GetFilterStrict(name.Data) };
-                default:
-                    throw new PdfDocumentFormatException($"The filter for the stream was not a valid object. Expected name or array, instead got: {token}.");
-            }
-        }
-
-        /// <inheritdoc />
-        public IReadOnlyList<IFilter> GetNamedFilters(IReadOnlyList<string> names)
-        {
-            if (names is null)
-            {
-                throw new ArgumentNullException(nameof(names));
-            }
-
-            var result = new List<IFilter>();
-
-            foreach (var name in names)
-            {
-                result.Add(GetFilterStrict(name));
-            }
-
-            return result;
-        }
-
-        private IFilter GetFilterStrict(string name)
-        {
-            if (!filterInstances.TryGetValue(name, out var factory))
-            {
-                throw new NotSupportedException($"The filter with the name {name} is not supported yet. Please raise an issue.");
-            }
-
-            return factory;
-        }
-
-        /// <inheritdoc />
-        public IReadOnlyList<IFilter> GetAllFilters()
-        {
-            return filterInstances.Values.Distinct().ToList();
-        }
-
     }
 }
\ No newline at end of file
diff --git a/src/UglyToad.PdfPig/Filters/FlateFilter.cs b/src/UglyToad.PdfPig/Filters/FlateFilter.cs
index aa6a2cc3b..35099c926 100644
--- a/src/UglyToad.PdfPig/Filters/FlateFilter.cs
+++ b/src/UglyToad.PdfPig/Filters/FlateFilter.cs
@@ -85,6 +85,7 @@ private static byte[] Decompress(byte[] input)
             }
         }
 
+        /// <inheritdoc />
         public byte[] Encode(Stream input, DictionaryToken streamDictionary, int index)
         {
             const int headerLength = 2;
diff --git a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs
index 7e5d6ca1d..8cf852927 100644
--- a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs
+++ b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs
@@ -106,7 +106,7 @@ private static PdfDocument OpenDocument(
             ISeekableTokenScanner scanner,
             ParsingOptions parsingOptions)
         {
-            var filterProvider = new FilterProviderWithLookup(DefaultFilterProvider.Instance);
+            var filterProvider = new FilterProviderWithLookup(parsingOptions.FilterProvider ?? DefaultFilterProvider.Instance);
 
             CrossReferenceTable? crossReferenceTable = null;
 
diff --git a/src/UglyToad.PdfPig/ParsingOptions.cs b/src/UglyToad.PdfPig/ParsingOptions.cs
index d509bb00c..c4191c411 100644
--- a/src/UglyToad.PdfPig/ParsingOptions.cs
+++ b/src/UglyToad.PdfPig/ParsingOptions.cs
@@ -1,5 +1,6 @@
 namespace UglyToad.PdfPig
 {
+    using Filters;
     using System.Collections.Generic;
     using Logging;
 
@@ -50,5 +51,10 @@ public sealed class ParsingOptions
         /// forms and images when missing.
         /// </summary>
         public bool SkipMissingFonts { get; set; } = false;
+
+        /// <summary>
+        /// Filter provider to use while parsing the document. The <see cref="DefaultFilterProvider"/> will be used if set to <c>null</c>.
+        /// </summary>
+        public IFilterProvider? FilterProvider { get; set; } = null;
     }
 }
\ No newline at end of file