From 15c7981b23d46b06da9b7b468d42fc179e01d3ec Mon Sep 17 00:00:00 2001 From: Stefan Kolb Date: Sun, 15 Mar 2020 14:52:42 +0100 Subject: [PATCH] Improve DoiResolution fetcher --- .../logic/importer/fetcher/DoiResolution.java | 130 +++++++++++------- .../importer/fetcher/DoiResolutionTest.java | 28 +++- 2 files changed, 103 insertions(+), 55 deletions(-) diff --git a/src/main/java/org/jabref/logic/importer/fetcher/DoiResolution.java b/src/main/java/org/jabref/logic/importer/fetcher/DoiResolution.java index beb1370586e..5cc2d16eb8a 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/DoiResolution.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/DoiResolution.java @@ -7,9 +7,11 @@ import java.util.Locale; import java.util.Objects; import java.util.Optional; +import java.util.stream.Collectors; import org.jabref.logic.importer.FulltextFetcher; import org.jabref.logic.net.URLDownload; +import org.jabref.logic.util.strings.StringSimilarity; import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.field.StandardField; import org.jabref.model.entry.identifier.DOI; @@ -27,68 +29,94 @@ * FulltextFetcher implementation that follows the DOI resolution redirects and scans for a full-text PDF URL. */ public class DoiResolution implements FulltextFetcher { - private static final Logger LOGGER = LoggerFactory.getLogger(DoiResolution.class); @Override public Optional findFullText(BibEntry entry) throws IOException { Objects.requireNonNull(entry); - Optional pdfLink = Optional.empty(); Optional doi = entry.getField(StandardField.DOI).flatMap(DOI::parse); - if (doi.isPresent()) { - String sciLink = doi.get().getURIAsASCIIString(); - - // follow all redirects and scan for a single pdf link - if (!sciLink.isEmpty()) { - try { - Connection connection = Jsoup.connect(sciLink); - // pretend to be a browser (agent & referrer) - connection.userAgent(URLDownload.USER_AGENT); - connection.referrer("http://www.google.com"); - connection.followRedirects(true); - connection.ignoreHttpErrors(true); - // some publishers are quite slow (default is 3s) - connection.timeout(10000); - - Document html = connection.get(); - - // scan for PDF - Elements elements = html.body().select("a[href]"); - List> links = new ArrayList<>(); - - for (Element element : elements) { - String href = element.attr("abs:href").toLowerCase(Locale.ENGLISH); - String hrefText = element.text().toLowerCase(Locale.ENGLISH); - // Only check if pdf is included in the link or inside the text - // ACM uses tokens without PDF inside the link - // See https://github.com/lehner/LocalCopy for more scrape ideas - if (element.attr("title").toLowerCase(Locale.ENGLISH).contains("pdf") && new URLDownload(href).isPdf()) { - return Optional.of(new URL(href)); - } - - if (href.contains("pdf") || hrefText.contains("pdf") && new URLDownload(href).isPdf()) { - links.add(Optional.of(new URL(href))); - } - } - // return if only one link was found (high accuracy) - if (links.size() == 1) { - LOGGER.info("Fulltext PDF found @ " + sciLink); - pdfLink = links.get(0); - } - } catch (UnsupportedMimeTypeException type) { - // this might be the PDF already as we follow redirects - if (type.getMimeType().startsWith("application/pdf")) { - return Optional.of(new URL(type.getUrl())); - } - LOGGER.warn("DoiResolution fetcher failed: ", type); - } catch (IOException e) { - LOGGER.warn("DoiResolution fetcher failed: ", e); + if (!doi.isPresent()) { + return Optional.empty(); + } + + String doiLink = doi.get().getURIAsASCIIString(); + if (doiLink.isEmpty()) { + return Optional.empty(); + } + + // follow all redirects and scan for a single pdf link + try { + Connection connection = Jsoup.connect(doiLink); + // pretend to be a browser (agent & referrer) + connection.userAgent(URLDownload.USER_AGENT); + connection.referrer("http://www.google.com"); + connection.followRedirects(true); + connection.ignoreHttpErrors(true); + // some publishers are quite slow (default is 3s) + connection.timeout(10000); + + Document html = connection.get(); + // scan for PDF + Elements hrefElements = html.body().select("a[href]"); + + List links = new ArrayList<>(); + for (Element element : hrefElements) { + String href = element.attr("abs:href").toLowerCase(Locale.ENGLISH); + String hrefText = element.text().toLowerCase(Locale.ENGLISH); + // Only check if pdf is included in the link or inside the text + // ACM uses tokens without PDF inside the link + // See https://github.com/lehner/LocalCopy for more scrape ideas + // link with "PDF" in title tag + if (element.attr("title").toLowerCase(Locale.ENGLISH).contains("pdf") && new URLDownload(href).isPdf()) { + return Optional.of(new URL(href)); } + + if (href.contains("pdf") || hrefText.contains("pdf") && new URLDownload(href).isPdf()) { + links.add(new URL(href)); + } + } + + // return if only one link was found (high accuracy) + if (links.size() == 1) { + LOGGER.info("Fulltext PDF found @ " + doiLink); + return Optional.of(links.get(0)); + } + // return if links are similar or multiple links are similar + return findSimilarLinks(links); + } catch (UnsupportedMimeTypeException type) { + // this might be the PDF already as we follow redirects + if (type.getMimeType().startsWith("application/pdf")) { + return Optional.of(new URL(type.getUrl())); } + LOGGER.warn("DoiResolution fetcher failed: ", type); + } catch (IOException e) { + LOGGER.warn("DoiResolution fetcher failed: ", e); + } + + return Optional.empty(); + } + + private Optional findSimilarLinks(List urls) { + List distinctLinks = urls.stream().distinct().collect(Collectors.toList()); + + if (distinctLinks.isEmpty()) { + return Optional.empty(); + } + // equal + if (distinctLinks.size() == 1) { + return Optional.of(distinctLinks.get(0)); } - return pdfLink; + // similar + final String firstElement = distinctLinks.get(0).toString(); + StringSimilarity similarity = new StringSimilarity(); + List similarLinks = distinctLinks.stream().filter(elem -> similarity.isSimilar(firstElement, elem.toString())).collect(Collectors.toList()); + if (similarLinks.size() == distinctLinks.size()) { + return Optional.of(similarLinks.get(0)); + } + + return Optional.empty(); } @Override diff --git a/src/test/java/org/jabref/logic/importer/fetcher/DoiResolutionTest.java b/src/test/java/org/jabref/logic/importer/fetcher/DoiResolutionTest.java index fd9e6024451..5461e383dbf 100644 --- a/src/test/java/org/jabref/logic/importer/fetcher/DoiResolutionTest.java +++ b/src/test/java/org/jabref/logic/importer/fetcher/DoiResolutionTest.java @@ -27,8 +27,7 @@ void setUp() { } @Test - @DisabledOnCIServer("CI server is blocked") - void findByDOI() throws IOException { + void linkWithPdfInTitleTag() throws IOException { entry.setField(StandardField.DOI, "10.1051/0004-6361/201527330"); assertEquals( @@ -37,17 +36,38 @@ void findByDOI() throws IOException { ); } + @Test + void linkWithPdfStringLeadsToFulltext() throws IOException { + entry.setField(StandardField.DOI, "10.1002/acr2.11101"); + assertEquals(Optional.of(new URL("https://onlinelibrary.wiley.com/doi/epdf/10.1002/acr2.11101")), finder.findFullText(entry)); + } + + @Test + void multipleLinksWithSmallEditDistanceLeadToFulltext() throws IOException { + entry.setField(StandardField.DOI, "10.1002/acr2.11101"); + assertEquals(Optional.of(new URL("https://onlinelibrary.wiley.com/doi/epdf/10.1002/acr2.11101")), finder.findFullText(entry)); + } + @Test void notReturnAnythingWhenMultipleLinksAreFound() throws IOException { - entry.setField(StandardField.DOI, "10.1051/0004-6361/201527330; 10.1051/0004-6361/20152711233"); + entry.setField(StandardField.DOI, "10.1109/JXCDC.2019.2911135"); assertEquals(Optional.empty(), finder.findFullText(entry)); } @Test - @DisabledOnCIServer("CI server is blocked") void notFoundByDOI() throws IOException { entry.setField(StandardField.DOI, "10.1186/unknown-doi"); assertEquals(Optional.empty(), finder.findFullText(entry)); } + + @Test + void entityWithoutDoi() throws IOException { + assertEquals(Optional.empty(), finder.findFullText(entry)); + } + + @Test + void trustLevel() { + assertEquals(TrustLevel.SOURCE, finder.getTrustLevel()); + } }