Implement more pdf importers (#7947)

* GrobidPdfMetadataImporter implemented Implemented an Importer that queries Grobid for metadata of a pdf. The necessary Grobid functionality (retrieving BibTeX for a pdf) is not yet available in Grobid, but we opened a PR that implements it (kermitt2/grobid#800). * Fixed class when accessing resources * Use FileHelper method to get extension * Use jsoup to issue POST request * Removed unnecessary field * Reverted URLDownload It's no longer necessary to set the POST data by bytes as we use JSoup for that. * Changelog entry * Add pdf link to imported entry * Remove citationkey from Grobid Grobid cannot predict a citationkey * FirstPageImporter * Fixed grammar mistake in CHANGELOG.md Co-authored-by: Christoph <[email protected]> * Fixed Grobid tests * Fixed Grobid URL * Checkstyle * Fixed doc * Checkstyle * Use JSoup for plaintext citations as well * Renamed FirstPageImporter to PdfVerbatimBibTextImporter * Fixed getName (no importer) * Renamed Grobid importer to match convention * PdfEmbeddedBibTeXImporter * Renamed PdfEmbeddedBibTeXImporter to PdfEmbeddedBibFileImporter * Checkstyle * Remove debug output * Checkstyle * PdfMergeMetadataImporter * Add DOI and ISBN fetching in PdfMergeMetadataImporter * Fixed concurrent list access * Adapted tests to contain fetchable ID's * Derive XMP preferences from importFormatPreferences * Localization * Use Importers in JabRef * Remove unnecessary test documents * Checkstyle * Grobid Timeout * Null-check * Use MergeImporter as WebFetcher Users can perform a PDF import on already imported pdf's to improve the quality of the entry * Only force BibTeX import if everything else fails Fixes #7984 * Prioritize non-bruteforce importers that When importing, try importers that can tell if they are suitable for a certain file format or not. Some importers only check if a file is present, not if it is in the correct format (isRecognizedFormat is always true if an existing file is given). They are used last. 
The List of importers now reflects that prioritization. It is not sorted by importer names anymore. The getter-methods getImportFormats and getImportFormatList still sort the List by name for the View. * Checkstyle * Fixed WebFetchersTest * Grobid does not need localization * Followup on removed Grobid localization * Fixed tests * Checkstyle * Grobid Fetcher and Tests adapted to updated Grobid * Adapted GrobidServiceTest to updated Grobid Co-authored-by: Christoph <[email protected]>
JabRef · Aug 18, 2021 · 0b02dd4 · 0b02dd4
1 parent a80435f
commit 0b02dd4
Show file tree

Hide file tree

Showing 21 changed files with 993 additions and 47 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,7 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve
 
 - We added the option to copy the DOI of an entry directly from the context menu copy submenu. [#7826](https://github.com/JabRef/jabref/issues/7826)
 - We added a fulltext search feature. [#2838](https://github.com/JabRef/jabref/pull/2838)
+- We improved the deduction of bib-entries from imported fulltext pdfs. [#7947](https://github.com/JabRef/jabref/pull/7947)
 - We added unprotect_terms to the list of bracketed pattern modifiers [#7826](https://github.com/JabRef/jabref/pull/7960)
 - We added an icon picker in group edit dialog. [#6142](https://github.com/JabRef/jabref/issues/6142)
 

diff --git a/src/main/java/org/jabref/gui/entryeditor/EntryEditor.java b/src/main/java/org/jabref/gui/entryeditor/EntryEditor.java
@@ -355,7 +355,7 @@ private void setupToolBar() {
 
         // Add menu for fetching bibliographic information
         ContextMenu fetcherMenu = new ContextMenu();
-        for (EntryBasedFetcher fetcher : WebFetchers.getEntryBasedFetchers(preferencesService.getImportFormatPreferences())) {
+        for (EntryBasedFetcher fetcher : WebFetchers.getEntryBasedFetchers(preferencesService.getImportFormatPreferences(), preferencesService.getFilePreferences(), databaseContext, preferencesService.getDefaultEncoding())) {
             MenuItem fetcherMenuItem = new MenuItem(fetcher.getName());
             fetcherMenuItem.setOnAction(event -> fetchAndMerge(fetcher));
             fetcherMenu.getItems().add(fetcherMenuItem);

diff --git a/src/main/java/org/jabref/logic/externalfiles/ExternalFilesContentImporter.java b/src/main/java/org/jabref/logic/externalfiles/ExternalFilesContentImporter.java
@@ -7,7 +7,7 @@
 import org.jabref.logic.importer.ImportFormatPreferences;
 import org.jabref.logic.importer.OpenDatabase;
 import org.jabref.logic.importer.ParserResult;
-import org.jabref.logic.importer.fileformat.PdfContentImporter;
+import org.jabref.logic.importer.fileformat.PdfMergeMetadataImporter;
 import org.jabref.logic.importer.fileformat.PdfXmpImporter;
 import org.jabref.logic.preferences.TimestampPreferences;
 import org.jabref.model.util.FileUpdateMonitor;
@@ -23,7 +23,11 @@ public ExternalFilesContentImporter(ImportFormatPreferences importFormatPreferen
     }
 
     public ParserResult importPDFContent(Path file) {
-        return new PdfContentImporter(importFormatPreferences).importDatabase(file, StandardCharsets.UTF_8);
+        try {
+            return new PdfMergeMetadataImporter(importFormatPreferences).importDatabase(file, StandardCharsets.UTF_8);
+        } catch (IOException e) {
+           return ParserResult.fromError(e);
+        }
     }
 
     public ParserResult importXMPContent(Path file) {

diff --git a/src/main/java/org/jabref/logic/importer/ImportFormatReader.java b/src/main/java/org/jabref/logic/importer/ImportFormatReader.java
@@ -2,12 +2,14 @@
 
 import java.io.IOException;
 import java.nio.file.Path;
+import java.util.ArrayList;
 import java.util.List;
 import java.util.Objects;
 import java.util.Optional;
 import java.util.SortedSet;
 import java.util.TreeSet;
 
+import org.jabref.logic.importer.fetcher.GrobidCitationFetcher;
 import org.jabref.logic.importer.fileformat.BibTeXMLImporter;
 import org.jabref.logic.importer.fileformat.BiblioscapeImporter;
 import org.jabref.logic.importer.fileformat.BibtexImporter;
@@ -22,6 +24,10 @@
 import org.jabref.logic.importer.fileformat.MsBibImporter;
 import org.jabref.logic.importer.fileformat.OvidImporter;
 import org.jabref.logic.importer.fileformat.PdfContentImporter;
+import org.jabref.logic.importer.fileformat.PdfEmbeddedBibFileImporter;
+import org.jabref.logic.importer.fileformat.PdfGrobidImporter;
+import org.jabref.logic.importer.fileformat.PdfMergeMetadataImporter;
+import org.jabref.logic.importer.fileformat.PdfVerbatimBibTextImporter;
 import org.jabref.logic.importer.fileformat.PdfXmpImporter;
 import org.jabref.logic.importer.fileformat.RepecNepImporter;
 import org.jabref.logic.importer.fileformat.RisImporter;
@@ -42,7 +48,7 @@ public class ImportFormatReader {
      * All import formats.
      * Sorted accordingly to {@link Importer#compareTo}, which defaults to alphabetically by the name
      */
-    private final SortedSet<Importer> formats = new TreeSet<>();
+    private final List<Importer> formats = new ArrayList<>();
 
     private ImportFormatPreferences importFormatPreferences;
 
@@ -51,8 +57,6 @@ public void resetImportFormats(ImportFormatPreferences newImportFormatPreference
 
         formats.clear();
 
-        formats.add(new BiblioscapeImporter());
-        formats.add(new BibtexImporter(importFormatPreferences, fileMonitor));
         formats.add(new BibTeXMLImporter());
         formats.add(new CopacImporter());
         formats.add(new EndnoteImporter(importFormatPreferences));
@@ -64,11 +68,17 @@ public void resetImportFormats(ImportFormatPreferences newImportFormatPreference
         formats.add(new ModsImporter(importFormatPreferences));
         formats.add(new MsBibImporter());
         formats.add(new OvidImporter());
+        formats.add(new PdfMergeMetadataImporter(importFormatPreferences));
+        formats.add(new PdfVerbatimBibTextImporter(importFormatPreferences));
         formats.add(new PdfContentImporter(importFormatPreferences));
+        formats.add(new PdfEmbeddedBibFileImporter(importFormatPreferences));
+        formats.add(new PdfGrobidImporter(GrobidCitationFetcher.GROBID_URL, importFormatPreferences));
         formats.add(new PdfXmpImporter(xmpPreferences));
         formats.add(new RepecNepImporter(importFormatPreferences));
         formats.add(new RisImporter());
         formats.add(new SilverPlatterImporter());
+        formats.add(new BiblioscapeImporter());
+        formats.add(new BibtexImporter(importFormatPreferences, fileMonitor));
 
         // Get custom import formats
         formats.addAll(importFormatPreferences.getCustomImportList());
@@ -110,26 +120,26 @@ public ParserResult importFromFile(String format, Path file) throws ImportExcept
      * All importers.
      * <p>
      * <p>
-     * Elements are in default order.
+     * Elements are sorted by name.
      * </p>
      *
      * @return all custom importers, elements are of type InputFormat
      */
     public SortedSet<Importer> getImportFormats() {
-        return this.formats;
+        return new TreeSet<>(this.formats);
     }
 
     /**
      * Human readable list of all known import formats (name and CLI Id).
      * <p>
-     * <p>List is in default-order.</p>
+     * <p>List is sorted by importer name.</p>
      *
      * @return human readable list of all known import formats
      */
     public String getImportFormatList() {
         StringBuilder sb = new StringBuilder();
 
-        for (Importer imFo : formats) {
+        for (Importer imFo : getImportFormats()) {
             int pad = Math.max(0, 14 - imFo.getName().length());
             sb.append("  ");
             sb.append(imFo.getName());
@@ -166,20 +176,25 @@ public UnknownFormatImport(String format, ParserResult parserResult) {
     public UnknownFormatImport importUnknownFormat(Path filePath, TimestampPreferences timestampPreferences, FileUpdateMonitor fileMonitor) throws ImportException {
         Objects.requireNonNull(filePath);
 
-        // First, see if it is a BibTeX file:
         try {
-            ParserResult parserResult = OpenDatabase.loadDatabase(filePath, importFormatPreferences, timestampPreferences, fileMonitor);
-            if (parserResult.getDatabase().hasEntries() || !parserResult.getDatabase().hasNoStrings()) {
-                parserResult.setFile(filePath.toFile());
-                return new UnknownFormatImport(ImportFormatReader.BIBTEX_FORMAT, parserResult);
+            UnknownFormatImport unknownFormatImport = importUnknownFormat(importer -> importer.importDatabase(filePath, importFormatPreferences.getEncoding()), importer -> importer.isRecognizedFormat(filePath, importFormatPreferences.getEncoding()));
+            unknownFormatImport.parserResult.setFile(filePath.toFile());
+            return unknownFormatImport;
+        } catch (ImportException e) {
+            // If all importers fail, try to read the file as BibTeX
+            try {
+                ParserResult parserResult = OpenDatabase.loadDatabase(filePath, importFormatPreferences, timestampPreferences, fileMonitor);
+                if (parserResult.getDatabase().hasEntries() || !parserResult.getDatabase().hasNoStrings()) {
+                    parserResult.setFile(filePath.toFile());
+                    return new UnknownFormatImport(ImportFormatReader.BIBTEX_FORMAT, parserResult);
+                } else {
+                    throw new ImportException(Localization.lang("Could not find a suitable import format."));
+                }
+            } catch (IOException ignore) {
+                // Ignored
+                throw new ImportException(Localization.lang("Could not find a suitable import format."));
             }
-        } catch (IOException ignore) {
-            // Ignored
         }
-
-        UnknownFormatImport unknownFormatImport = importUnknownFormat(importer -> importer.importDatabase(filePath, importFormatPreferences.getEncoding()), importer -> importer.isRecognizedFormat(filePath, importFormatPreferences.getEncoding()));
-        unknownFormatImport.parserResult.setFile(filePath.toFile());
-        return unknownFormatImport;
     }
 
     /**
@@ -198,7 +213,7 @@ private UnknownFormatImport importUnknownFormat(CheckedFunction<Importer, Parser
         String bestFormatName = null;
 
         // Cycle through all importers:
-        for (Importer imFo : getImportFormats()) {
+        for (Importer imFo : formats) {
             try {
                 if (!isRecognizedFormat.apply(imFo)) {
                     continue;

diff --git a/src/main/java/org/jabref/logic/importer/WebFetchers.java b/src/main/java/org/jabref/logic/importer/WebFetchers.java
@@ -1,5 +1,6 @@
 package org.jabref.logic.importer;
 
+import java.nio.charset.Charset;
 import java.util.Comparator;
 import java.util.HashSet;
 import java.util.Optional;
@@ -37,10 +38,13 @@
 import org.jabref.logic.importer.fetcher.SpringerLink;
 import org.jabref.logic.importer.fetcher.TitleFetcher;
 import org.jabref.logic.importer.fetcher.ZbMATH;
+import org.jabref.logic.importer.fileformat.PdfMergeMetadataImporter;
+import org.jabref.model.database.BibDatabaseContext;
 import org.jabref.model.entry.field.Field;
 import org.jabref.model.entry.field.StandardField;
 import org.jabref.model.entry.identifier.DOI;
 import org.jabref.model.entry.identifier.Identifier;
+import org.jabref.preferences.FilePreferences;
 
 import static org.jabref.model.entry.field.StandardField.EPRINT;
 import static org.jabref.model.entry.field.StandardField.ISBN;
@@ -133,14 +137,15 @@ public static SortedSet<IdBasedFetcher> getIdBasedFetchers(ImportFormatPreferenc
     /**
      * @return sorted set containing entry based fetchers
      */
-    public static SortedSet<EntryBasedFetcher> getEntryBasedFetchers(ImportFormatPreferences importFormatPreferences) {
+    public static SortedSet<EntryBasedFetcher> getEntryBasedFetchers(ImportFormatPreferences importFormatPreferences, FilePreferences filePreferences, BibDatabaseContext databaseContext, Charset defaultEncoding) {
         SortedSet<EntryBasedFetcher> set = new TreeSet<>(Comparator.comparing(WebFetcher::getName));
         set.add(new AstrophysicsDataSystem(importFormatPreferences));
         set.add(new DoiFetcher(importFormatPreferences));
         set.add(new IsbnFetcher(importFormatPreferences));
         set.add(new MathSciNet(importFormatPreferences));
         set.add(new CrossRef());
         set.add(new ZbMATH(importFormatPreferences));
+        set.add(new PdfMergeMetadataImporter.EntryBasedFetcherWrapper(importFormatPreferences, filePreferences, databaseContext, defaultEncoding));
         return set;
     }
 

diff --git a/src/main/java/org/jabref/logic/importer/fetcher/GrobidCitationFetcher.java b/src/main/java/org/jabref/logic/importer/fetcher/GrobidCitationFetcher.java
@@ -23,9 +23,10 @@
 
 public class GrobidCitationFetcher implements SearchBasedFetcher {
 
+    public static final String GROBID_URL = "http://grobid.jabref.org:8070";
+
     private static final Logger LOGGER = LoggerFactory.getLogger(GrobidCitationFetcher.class);
 
-    private static final String GROBID_URL = "http://grobid.jabref.org:8070";
     private ImportFormatPreferences importFormatPreferences;
     private GrobidService grobidService;
 

diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfEmbeddedBibFileImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfEmbeddedBibFileImporter.java
@@ -0,0 +1,166 @@
+package org.jabref.logic.importer.fileformat;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+
+import org.jabref.logic.importer.ImportFormatPreferences;
+import org.jabref.logic.importer.Importer;
+import org.jabref.logic.importer.ParseException;
+import org.jabref.logic.importer.ParserResult;
+import org.jabref.logic.l10n.Localization;
+import org.jabref.logic.util.StandardFileType;
+import org.jabref.logic.util.io.FileUtil;
+import org.jabref.logic.xmp.EncryptedPdfsNotSupportedException;
+import org.jabref.logic.xmp.XmpUtilReader;
+import org.jabref.model.entry.BibEntry;
+import org.jabref.model.util.DummyFileUpdateMonitor;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
+import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment;
+
+/**
+ * PdfEmbeddedBibFileImporter imports an embedded Bib-File from the PDF.
+ */
+public class PdfEmbeddedBibFileImporter extends Importer {
+
+    private final ImportFormatPreferences importFormatPreferences;
+    private final BibtexParser bibtexParser;
+
+    public PdfEmbeddedBibFileImporter(ImportFormatPreferences importFormatPreferences) {
+        this.importFormatPreferences = importFormatPreferences;
+        bibtexParser = new BibtexParser(importFormatPreferences, new DummyFileUpdateMonitor());
+    }
+
+    @Override
+    public boolean isRecognizedFormat(BufferedReader input) throws IOException {
+        return input.readLine().startsWith("%PDF");
+    }
+
+    @Override
+    public ParserResult importDatabase(BufferedReader reader) throws IOException {
+        Objects.requireNonNull(reader);
+        throw new UnsupportedOperationException("PdfEmbeddedBibFileImporter does not support importDatabase(BufferedReader reader)."
+                + "Instead use importDatabase(Path filePath, Charset defaultEncoding).");
+    }
+
+    @Override
+    public ParserResult importDatabase(String data) throws IOException {
+        Objects.requireNonNull(data);
+        throw new UnsupportedOperationException("PdfEmbeddedBibFileImporter does not support importDatabase(String data)."
+                + "Instead use importDatabase(Path filePath, Charset defaultEncoding).");
+    }
+
+    @Override
+    public ParserResult importDatabase(Path filePath, Charset defaultEncoding) {
+        try (PDDocument document = XmpUtilReader.loadWithAutomaticDecryption(filePath)) {
+            return new ParserResult(getEmbeddedBibFileEntries(document));
+        } catch (EncryptedPdfsNotSupportedException e) {
+            return ParserResult.fromErrorMessage(Localization.lang("Decryption not supported."));
+        } catch (IOException | ParseException e) {
+            return ParserResult.fromError(e);
+        }
+    }
+
+    /**
+     * Extraction of embedded files in pdfs adapted from:
+     * Adapted from https://svn.apache.org/repos/asf/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/pdmodel/ExtractEmbeddedFiles.javaj
+     */
+
+    private List<BibEntry> getEmbeddedBibFileEntries(PDDocument document) throws IOException, ParseException {
+        List<BibEntry> allParsedEntries = new ArrayList<>();
+        PDDocumentNameDictionary nameDictionary = document.getDocumentCatalog().getNames();
+        if (nameDictionary != null) {
+            PDEmbeddedFilesNameTreeNode efTree = nameDictionary.getEmbeddedFiles();
+            if (efTree != null) {
+                Map<String, PDComplexFileSpecification> names = efTree.getNames();
+                if (names != null) {
+                    allParsedEntries.addAll(extractAndParseFiles(names));
+                } else {
+                    List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids();
+                    for (PDNameTreeNode<PDComplexFileSpecification> node : kids) {
+                        names = node.getNames();
+                        allParsedEntries.addAll(extractAndParseFiles(names));
+                    }
+                }
+            }
+        }
+        // extract files from annotations
+        for (PDPage page : document.getPages()) {
+            for (PDAnnotation annotation : page.getAnnotations()) {
+                if (annotation instanceof PDAnnotationFileAttachment) {
+                    PDAnnotationFileAttachment annotationFileAttachment = (PDAnnotationFileAttachment) annotation;
+                    PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) annotationFileAttachment.getFile();
+                    allParsedEntries.addAll(extractAndParseFile(getEmbeddedFile(fileSpec)));
+                }
+            }
+        }
+        return allParsedEntries;
+    }
+
+    private List<BibEntry> extractAndParseFiles(Map<String, PDComplexFileSpecification> names) throws IOException, ParseException {
+        List<BibEntry> allParsedEntries = new ArrayList<>();
+        for (Map.Entry<String, PDComplexFileSpecification> entry : names.entrySet()) {
+            String filename = entry.getKey();
+            FileUtil.getFileExtension(filename);
+            if (FileUtil.isBibFile(Path.of(filename))) {
+                PDComplexFileSpecification fileSpec = entry.getValue();
+                allParsedEntries.addAll(extractAndParseFile(getEmbeddedFile(fileSpec)));
+            }
+        }
+        return allParsedEntries;
+    }
+
+    private List<BibEntry> extractAndParseFile(PDEmbeddedFile embeddedFile) throws IOException, ParseException {
+        return bibtexParser.parseEntries(embeddedFile.createInputStream());
+    }
+
+    private static PDEmbeddedFile getEmbeddedFile(PDComplexFileSpecification fileSpec) {
+        // search for the first available alternative of the embedded file
+        PDEmbeddedFile embeddedFile = null;
+        if (fileSpec != null) {
+            embeddedFile = fileSpec.getEmbeddedFileUnicode();
+            if (embeddedFile == null) {
+                embeddedFile = fileSpec.getEmbeddedFileDos();
+            }
+            if (embeddedFile == null) {
+                embeddedFile = fileSpec.getEmbeddedFileMac();
+            }
+            if (embeddedFile == null) {
+                embeddedFile = fileSpec.getEmbeddedFileUnix();
+            }
+            if (embeddedFile == null) {
+                embeddedFile = fileSpec.getEmbeddedFile();
+            }
+        }
+        return embeddedFile;
+    }
+
+    @Override
+    public String getName() {
+        return "PDFembeddedbibfile";
+    }
+
+    @Override
+    public StandardFileType getFileType() {
+        return StandardFileType.PDF;
+    }
+
+    @Override
+    public String getDescription() {
+        return "PdfEmbeddedBibFileImporter imports an embedded Bib-File from the PDF.";
+    }
+
+}