Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix issue #12000: Parsing arXiv Id when importing a PDF with arXiv Id #12079

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.LinkedFile;
import org.jabref.model.entry.field.StandardField;
import org.jabref.model.entry.identifier.ArXivIdentifier;
import org.jabref.model.entry.identifier.DOI;
import org.jabref.model.entry.types.EntryType;
import org.jabref.model.entry.types.StandardEntryType;
Expand Down Expand Up @@ -244,6 +245,7 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
String title;
String conference = null;
String doi = null;
String arxivId = null;
String series = null;
String volume = null;
String number = null;
Expand All @@ -256,6 +258,7 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
// special case: possibly conference as first line on the page
extractYear();
doi = getDoi(null);
arxivId = getArxivId(null);
if (curString.contains("Conference")) {
fillCurStringWithNonEmptyLines();
conference = curString;
Expand Down Expand Up @@ -388,6 +391,7 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
}
} else {
doi = getDoi(doi);
arxivId = getArxivId(arxivId);

if ((publisher == null) && curString.contains("IEEE")) {
// IEEE has the conference things at the end
Expand Down Expand Up @@ -445,6 +449,9 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
if (doi != null) {
entry.setField(StandardField.DOI, doi);
}
if (arxivId != null) {
entry.setField(StandardField.EPRINT, arxivId);
}
if (series != null) {
entry.setField(StandardField.SERIES, series);
}
Expand All @@ -458,6 +465,9 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
entry.setField(StandardField.PAGES, pages);
}
if (year != null) {
if (arxivId != null) {
year = "20" + arxivId.substring(0, 2);
}
entry.setField(StandardField.YEAR, year);
}
if (publisher != null) {
Expand All @@ -480,6 +490,21 @@ private String getDoi(String doi) {
return doi;
}

/**
 * Looks for an arXiv identifier in {@code curString}, unless one has already been found.
 * <p>
 * The text is scanned for the marker "arxiv" in any letter case (e.g. "arXiv", "ArXiv",
 * "ARXIV"). For each occurrence, the text from the marker onwards is handed to
 * {@link ArXivIdentifier#parse}; the first occurrence that parses successfully wins.
 * Trying later occurrences means an earlier mention that is not an identifier does not
 * hide a real one further along in the text.
 *
 * @param arxivId the identifier found so far, or {@code null} if none has been found yet
 * @return {@code arxivId} unchanged if it is non-null; otherwise the identifier parsed
 *         from {@code curString}, or {@code null} if no occurrence could be parsed
 */
private String getArxivId(String arxivId) {
    if (arxivId != null) {
        // An identifier was already extracted earlier; keep it.
        return arxivId;
    }

    final String marker = "arxiv";
    final int lastStart = curString.length() - marker.length();
    for (int pos = 0; pos <= lastStart; pos++) {
        // regionMatches(ignoreCase = true, ...) compares in place, so indices stay valid
        // (unlike searching in a lower-cased copy, whose length may differ).
        if (!curString.regionMatches(true, pos, marker, 0, marker.length())) {
            continue;
        }
        String parsed = ArXivIdentifier.parse(curString.substring(pos))
                                       .map(ArXivIdentifier::asString)
                                       .orElse(null);
        if (parsed != null) {
            return parsed;
        }
    }
    return null;
}
Comment on lines +493 to +506
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No! We already have the class org.jabref.model.entry.identifier.ArXivIdentifier; please use it.


private String getFirstPageContents(PDDocument document) throws IOException {
PDFTextStripper stripper = new PDFTextStripper();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,4 +123,38 @@ British Journal of Nutrition (2008), 99, 1–11 doi: 10.1017/S0007114507795296

assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContent, "\n"));
}

@Test
void extractArxivIdFromPage1() {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please also use a real arXiv PDF. I think there is a link to one in the issue?

BibEntry entry = new BibEntry(StandardEntryType.InProceedings)
.withField(StandardField.DOI, "10.1017/S0007114507795296")
.withField(StandardField.AUTHOR, "Review Article")
.withField(StandardField.TITLE, "British Journal of Nutrition (2008), 99, 1–11 doi: 10.1017/S0007114507795296 arXiv:2408.06224v1 q The Authors")
.withField(StandardField.YEAR, "2024")
.withField(StandardField.EPRINT, "2408.06224v1");

String firstPageContent = """
British Journal of Nutrition (2008), 99, 1–11 doi: 10.1017/S0007114507795296 arXiv:2408.06224v1
q The Authors 2024

Review Article

Cocoa and health: a decade of research

Karen A. Cooper1, Jennifer L. Donovan2, Andrew L. Waterhouse3 and Gary Williamson1*
1Nestlé Research Center, Vers-Chez-les-Blanc, PO Box 44, CH-1000 Lausanne 26, Switzerland
2Department of Psychiatry and Behavioural Sciences, Medical University of South Carolina, Charleston, SC 29425, USA
3Department of Viticulture & Enology, University of California, Davis, CA 95616, USA

(Received 5 December 2006 – Revised 29 May 2007 – Accepted 31 May 2007)

Abbreviations: FMD, flow-mediated dilation; NO, nitirc oxide.

*Corresponding author: Dr Gary Williamson, fax þ41 21 785 8544, email [email protected]

British Journal of Nutrition
https://doi.org/10.1017/S0007114507795296 Published online by Cambridge University Press""";

assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContent, "\n"));
}
}
Loading