JabRef · XYZ567AB · Oct 25, 2024 · Oct 25, 2024 · Oct 25, 2024 · Oct 25, 2024
diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java
@@ -21,6 +21,7 @@
 import org.jabref.model.entry.BibEntry;
 import org.jabref.model.entry.LinkedFile;
 import org.jabref.model.entry.field.StandardField;
+import org.jabref.model.entry.identifier.ArXivIdentifier;
 import org.jabref.model.entry.identifier.DOI;
 import org.jabref.model.entry.types.EntryType;
 import org.jabref.model.entry.types.StandardEntryType;
@@ -244,6 +245,7 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
         String title;
         String conference = null;
         String doi = null;
+        String arxivId = null;
         String series = null;
         String volume = null;
         String number = null;
@@ -256,6 +258,7 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
             // special case: possibly conference as first line on the page
             extractYear();
             doi = getDoi(null);
+            arxivId = getArxivId(null);
             if (curString.contains("Conference")) {
                 fillCurStringWithNonEmptyLines();
                 conference = curString;
@@ -388,6 +391,7 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
                 }
             } else {
                 doi = getDoi(doi);
+                arxivId = getArxivId(arxivId);
 
                 if ((publisher == null) && curString.contains("IEEE")) {
                     // IEEE has the conference things at the end
@@ -445,6 +449,9 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
         if (doi != null) {
             entry.setField(StandardField.DOI, doi);
         }
+        if (arxivId != null) {
+            entry.setField(StandardField.EPRINT, arxivId);
+        }
         if (series != null) {
             entry.setField(StandardField.SERIES, series);
         }
@@ -458,6 +465,9 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
             entry.setField(StandardField.PAGES, pages);
         }
         if (year != null) {
+            if (arxivId != null) {
+                year = "20" + arxivId.substring(0, 2);
+            }
             entry.setField(StandardField.YEAR, year);
         }
         if (publisher != null) {
@@ -480,6 +490,21 @@ private String getDoi(String doi) {
         return doi;
     }
 
+    /** Keeps an already-found arXiv id; otherwise parses one from curString starting at "arxiv"/"arXiv", or yields null. */
+    private String getArxivId(String arxivId) {
+        if (arxivId != null) {
+            return arxivId;
+        }
+        int start = curString.indexOf("arxiv");
+        if (start < 0) {
+            start = curString.indexOf("arXiv");
+        }
+        if (start < 0) {
+            return null;
+        }
+        return ArXivIdentifier.parse(curString.substring(start)).map(ArXivIdentifier::asString).orElse(null);
+    }
+
     private String getFirstPageContents(PDDocument document) throws IOException {
         PDFTextStripper stripper = new PDFTextStripper();
 

diff --git a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java
@@ -123,4 +123,38 @@ British Journal of Nutrition (2008), 99, 1–11 doi: 10.1017/S0007114507795296
 
         assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContent, "\n"));
     }
+
+    @Test
+    void extractArxivIdFromPage1() {
+        // The first line carries both a DOI and an arXiv id; the expected entry must contain DOI and EPRINT.
+        String pageText = """
+                British Journal of Nutrition (2008), 99, 1–11 doi: 10.1017/S0007114507795296 arXiv:2408.06224v1
+                q The Authors 2024
+
+                Review Article
+
+                Cocoa and health: a decade of research
+
+                Karen A. Cooper1, Jennifer L. Donovan2, Andrew L. Waterhouse3 and Gary Williamson1*
+                1Nestlé Research Center, Vers-Chez-les-Blanc, PO Box 44, CH-1000 Lausanne 26, Switzerland
+                2Department of Psychiatry and Behavioural Sciences, Medical University of South Carolina, Charleston, SC 29425, USA
+                3Department of Viticulture & Enology, University of California, Davis, CA 95616, USA
+
+                (Received 5 December 2006 – Revised 29 May 2007 – Accepted 31 May 2007)
+
+                Abbreviations: FMD, flow-mediated dilation; NO, nitirc oxide.
+
+                *Corresponding author: Dr Gary Williamson, fax þ41 21 785 8544, email [email protected]
+
+                British Journal of Nutrition
+                https://doi.org/10.1017/S0007114507795296 Published online by Cambridge University Press""";
+        BibEntry expected = new BibEntry(StandardEntryType.InProceedings)
+                .withField(StandardField.DOI, "10.1017/S0007114507795296")
+                .withField(StandardField.AUTHOR, "Review Article")
+                .withField(StandardField.TITLE, "British Journal of Nutrition (2008), 99, 1–11 doi: 10.1017/S0007114507795296 arXiv:2408.06224v1 q The Authors")
+                .withField(StandardField.YEAR, "2024")
+                .withField(StandardField.EPRINT, "2408.06224v1");
+        Optional<BibEntry> actual = importer.getEntryFromPDFContent(pageText, "\n");
+        assertEquals(Optional.of(expected), actual);
+    }
 }