JabRef · XYZ567AB · Oct 25, 2024 · Oct 25, 2024 · Oct 25, 2024 · Oct 25, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,7 @@ Note that this project **does not** adhere to [Semantic Versioning](https://semv
 
 ### Added
 
+- We added support for extracting arXiv identifiers in `PdfContentImporter`, together with a related test case. [#12000](https://github.com/JabRef/jabref/issues/12000)
 - We added a "view as BibTeX" option before importing an entry from the citation relation tab. [#11826](https://github.com/JabRef/jabref/issues/11826)
 - We added support finding LaTeX-encoded special characters based on plain Unicode and vice versa. [#11542](https://github.com/JabRef/jabref/pull/11542)
 - When a search hits a file, the file icon of that entry is changed accordingly. [#11542](https://github.com/JabRef/jabref/pull/11542)

diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java
@@ -21,6 +21,7 @@
 import org.jabref.model.entry.BibEntry;
 import org.jabref.model.entry.LinkedFile;
 import org.jabref.model.entry.field.StandardField;
+import org.jabref.model.entry.identifier.ArXivIdentifier;
 import org.jabref.model.entry.identifier.DOI;
 import org.jabref.model.entry.types.EntryType;
 import org.jabref.model.entry.types.StandardEntryType;
@@ -244,6 +245,7 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
         String title;
         String conference = null;
         String doi = null;
+        String arxivId = null;
         String series = null;
         String volume = null;
         String number = null;
@@ -256,6 +258,7 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
             // special case: possibly conference as first line on the page
             extractYear();
             doi = getDoi(null);
+            arxivId = getArxivId(null);
             if (curString.contains("Conference")) {
                 fillCurStringWithNonEmptyLines();
                 conference = curString;
@@ -388,6 +391,7 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
                 }
             } else {
                 doi = getDoi(doi);
+                arxivId = getArxivId(arxivId);
 
                 if ((publisher == null) && curString.contains("IEEE")) {
                     // IEEE has the conference things at the end
@@ -445,6 +449,9 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
         if (doi != null) {
             entry.setField(StandardField.DOI, doi);
         }
+        if (arxivId != null) {
+            entry.setField(StandardField.EPRINT, arxivId);
+        }
         if (series != null) {
             entry.setField(StandardField.SERIES, series);
         }
@@ -458,6 +465,9 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
             entry.setField(StandardField.PAGES, pages);
         }
         if (year != null) {
+            if (arxivId != null) {
+                year = "20" + arxivId.substring(0, 2);
+            }
             entry.setField(StandardField.YEAR, year);
         }
         if (publisher != null) {
@@ -480,6 +490,21 @@ private String getDoi(String doi) {
         return doi;
     }
 
+    /** Returns {@code arxivId} when already set; otherwise parses {@code curString} from its first "arxiv" marker (any letter case), or returns null. */
+    private String getArxivId(String arxivId) {
+        if (arxivId != null) {
+            return arxivId; // an identifier found earlier wins
+        }
+        final String marker = "arxiv";
+        for (int pos = 0; pos + marker.length() <= curString.length(); pos++) {
+            if (curString.regionMatches(true, pos, marker, 0, marker.length())) {
+                // Only the first marker is tried; the rest of the line goes to the parser as-is (null if it is rejected).
+                return ArXivIdentifier.parse(curString.substring(pos)).map(ArXivIdentifier::asString).orElse(null);
+            }
+        }
+        return null;
+    }
+
     private String getFirstPageContents(PDDocument document) throws IOException {
         PDFTextStripper stripper = new PDFTextStripper();
 

diff --git a/src/main/java/org/jabref/model/entry/identifier/ArXivIdentifier.java b/src/main/java/org/jabref/model/entry/identifier/ArXivIdentifier.java
@@ -45,13 +45,13 @@ public static Optional<ArXivIdentifier> parse(String value) {
             return getArXivIdentifier(identifierMatcher);
         }
 
-        Pattern oldIdentifierPattern = Pattern.compile("(" + ARXIV_PREFIX + ")?\\s?:?\\s?(?<id>(?<classification>[a-z\\-]+(\\.[A-Z]{2})?)/\\d{7})(v(?<version>\\d+))?");
-        Matcher oldIdentifierMatcher = oldIdentifierPattern.matcher(identifier);
-        if (oldIdentifierMatcher.matches()) {
-            return getArXivIdentifier(oldIdentifierMatcher);
-        }
+       Pattern oldIdentifierPattern = Pattern.compile("(" + ARXIV_PREFIX + ")?\\s?:?\\s?(?<id>(?<classification>[a-z\\-]+(\\.[A-Z]{2})?)/\\d{7})(v(?<version>\\d+))?");
+       Matcher oldIdentifierMatcher = oldIdentifierPattern.matcher(identifier);
+       if (oldIdentifierMatcher.matches()) {
+           return getArXivIdentifier(oldIdentifierMatcher);
+       }
 
-        return Optional.empty();
+       return Optional.empty();
     }
 
     private static Optional<ArXivIdentifier> getArXivIdentifier(Matcher matcher) {

diff --git a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java
@@ -123,4 +123,38 @@ British Journal of Nutrition (2008), 99, 1–11 doi: 10.1017/S0007114507795296
 
         assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContent, "\n"));
     }
+
+    @Test
+    void extractArxivIdFromPage1() {
+        String firstPageContent = """
+                British Journal of Nutrition (2008), 99, 1–11 doi: 10.1017/S0007114507795296 arXiv:2408.06224v1
+                q The Authors 2024

+                Review Article

+                Cocoa and health: a decade of research

+                Karen A. Cooper1, Jennifer L. Donovan2, Andrew L. Waterhouse3 and Gary Williamson1*
+                1Nestlé Research Center, Vers-Chez-les-Blanc, PO Box 44, CH-1000 Lausanne 26, Switzerland
+                2Department of Psychiatry and Behavioural Sciences, Medical University of South Carolina, Charleston, SC 29425, USA
+                3Department of Viticulture & Enology, University of California, Davis, CA 95616, USA

+                (Received 5 December 2006 – Revised 29 May 2007 – Accepted 31 May 2007)

+                Abbreviations: FMD, flow-mediated dilation; NO, nitirc oxide.

+                *Corresponding author: Dr Gary Williamson, fax þ41 21 785 8544, email [email protected]

+                British Journal of Nutrition
+                https://doi.org/10.1017/S0007114507795296 Published online by Cambridge University Press""";
+        // Expected entry: the arXiv identifier from the header line lands in EPRINT, version suffix included.
+        BibEntry expected = new BibEntry(StandardEntryType.InProceedings)
+                .withField(StandardField.DOI, "10.1017/S0007114507795296")
+                .withField(StandardField.AUTHOR, "Review Article")
+                .withField(StandardField.TITLE, "British Journal of Nutrition (2008), 99, 1–11 doi: 10.1017/S0007114507795296 arXiv:2408.06224v1 q The Authors")
+                .withField(StandardField.YEAR, "2024")
+                .withField(StandardField.EPRINT, "2408.06224v1");
+        Optional<BibEntry> actual = importer.getEntryFromPDFContent(firstPageContent, "\n");
+        assertEquals(Optional.of(expected), actual);
+    }
 }