diff --git a/src/main/java/org/jabref/gui/importer/fetcher/CiteSeerXFetcher.java b/src/main/java/org/jabref/gui/importer/fetcher/CiteSeerXFetcher.java deleted file mode 100644 index 78ca7062d55..00000000000 --- a/src/main/java/org/jabref/gui/importer/fetcher/CiteSeerXFetcher.java +++ /dev/null @@ -1,162 +0,0 @@ -package org.jabref.gui.importer.fetcher; - -import java.io.IOException; -import java.net.URLEncoder; -import java.nio.charset.StandardCharsets; -import java.util.ArrayList; -import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import javax.swing.JPanel; - -import org.jabref.Globals; -import org.jabref.gui.importer.ImportInspectionDialog; -import org.jabref.logic.formatter.bibtexfields.NormalizeNamesFormatter; -import org.jabref.logic.help.HelpFile; -import org.jabref.logic.importer.ImportInspector; -import org.jabref.logic.importer.OutputPrinter; -import org.jabref.logic.net.URLDownload; -import org.jabref.model.entry.BibEntry; -import org.jabref.model.entry.FieldName; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class CiteSeerXFetcher implements EntryFetcher { - - private static final int MAX_PAGES_TO_LOAD = 8; - private static final String QUERY_MARKER = "___QUERY___"; - private static final String URL_START = "http://citeseer.ist.psu.edu"; - private static final String SEARCH_URL = CiteSeerXFetcher.URL_START + "/search?q=" + CiteSeerXFetcher.QUERY_MARKER - + "&submit=Search&sort=rlv&t=doc"; - private static final Pattern CITE_LINK_PATTERN = Pattern.compile(""); - - private static final String BASE_PATTERN = ""; - private static final Pattern TITLE_PATTERN = Pattern - .compile(CiteSeerXFetcher.BASE_PATTERN.replace(CiteSeerXFetcher.QUERY_MARKER, "citation_title")); - private static final Pattern AUTHOR_PATTERN = Pattern - .compile(CiteSeerXFetcher.BASE_PATTERN.replace(CiteSeerXFetcher.QUERY_MARKER, "citation_authors")); - private static final Pattern YEAR_PATTERN = Pattern - 
.compile(CiteSeerXFetcher.BASE_PATTERN.replace(CiteSeerXFetcher.QUERY_MARKER, "citation_year")); - private static final Pattern ABSTRACT_PATTERN = Pattern.compile("

Abstract

\\s*

(.*)

"); - - private static final Logger LOGGER = LoggerFactory.getLogger(CiteSeerXFetcher.class); - - private boolean stopFetching; - - @Override - public boolean processQuery(String query, ImportInspector inspector, OutputPrinter status) { - stopFetching = false; - try { - List citations = getCitations(query); - for (String citation : citations) { - if (stopFetching) { - break; - } - BibEntry entry = getSingleCitation(citation); - if (entry != null) { - inspector.addEntry(entry); - } - } - - return true; - } catch (IOException e) { - LOGGER.error("Error while fetching from " + getTitle(), e); - ((ImportInspectionDialog)inspector).showErrorMessage(this.getTitle(), e.getLocalizedMessage()); - return false; - } - } - - @Override - public String getTitle() { - return "CiteSeerX"; - } - - @Override - public HelpFile getHelpPage() { - return HelpFile.FETCHER_CITESEERX; - } - - @Override - public JPanel getOptionsPanel() { - return null; - } - - @Override - public void stopFetching() { - stopFetching = true; - } - - /** - * - * @param query - * The search term to query JStor for. 
- * @return a list of IDs - * @throws java.io.IOException - */ - private List getCitations(String query) throws IOException { - String urlQuery; - List ids = new ArrayList<>(); - urlQuery = CiteSeerXFetcher.SEARCH_URL.replace(CiteSeerXFetcher.QUERY_MARKER, - URLEncoder.encode(query, StandardCharsets.UTF_8.name())); - int count = 1; - String nextPage; - while (((nextPage = getCitationsFromUrl(urlQuery, ids)) != null) - && (count < CiteSeerXFetcher.MAX_PAGES_TO_LOAD)) { - urlQuery = nextPage; - count++; - if (stopFetching) { - break; - } - } - return ids; - } - - private static String getCitationsFromUrl(String urlQuery, List ids) throws IOException { - String cont = new URLDownload(urlQuery).asString(Globals.prefs.getDefaultEncoding()); - Matcher m = CiteSeerXFetcher.CITE_LINK_PATTERN.matcher(cont); - while (m.find()) { - ids.add(CiteSeerXFetcher.URL_START + m.group(1)); - } - - return null; - } - - private static BibEntry getSingleCitation(String urlString) throws IOException { - String cont = new URLDownload(urlString).asString(); - - // Find title, and create entry if we do. 
Otherwise assume we did not get an entry: - Matcher m = CiteSeerXFetcher.TITLE_PATTERN.matcher(cont); - if (m.find()) { - BibEntry entry = new BibEntry(); - entry.setField(FieldName.TITLE, m.group(1)); - - // Find authors: - m = CiteSeerXFetcher.AUTHOR_PATTERN.matcher(cont); - if (m.find()) { - String authors = m.group(1); - entry.setField(FieldName.AUTHOR, new NormalizeNamesFormatter().format(authors)); - } - - // Find year: - m = CiteSeerXFetcher.YEAR_PATTERN.matcher(cont); - if (m.find()) { - entry.setField(FieldName.YEAR, m.group(1)); - } - - // Find abstract: - m = CiteSeerXFetcher.ABSTRACT_PATTERN.matcher(cont); - if (m.find()) { - entry.setField(FieldName.ABSTRACT, m.group(1)); - } - - return entry; - } else { - return null; - } - - } - -} diff --git a/src/main/java/org/jabref/gui/importer/fetcher/EntryFetchers.java b/src/main/java/org/jabref/gui/importer/fetcher/EntryFetchers.java index f0f4689a7f9..353eddde4ae 100644 --- a/src/main/java/org/jabref/gui/importer/fetcher/EntryFetchers.java +++ b/src/main/java/org/jabref/gui/importer/fetcher/EntryFetchers.java @@ -12,9 +12,7 @@ public class EntryFetchers { private final List entryFetchers = new LinkedList<>(); - public EntryFetchers(JournalAbbreviationLoader abbreviationLoader) { - entryFetchers.add(new CiteSeerXFetcher()); - + public EntryFetchers(JournalAbbreviationLoader abbreviationLoader) { WebFetchers.getSearchBasedFetchers(Globals.prefs.getImportFormatPreferences()).stream() .map(SearchBasedEntryFetcher::new) .forEach(entryFetchers::add); diff --git a/src/main/java/org/jabref/logic/importer/Parser.java b/src/main/java/org/jabref/logic/importer/Parser.java index 3279906d534..4b9c77c3783 100644 --- a/src/main/java/org/jabref/logic/importer/Parser.java +++ b/src/main/java/org/jabref/logic/importer/Parser.java @@ -1,5 +1,6 @@ package org.jabref.logic.importer; +import java.io.ByteArrayInputStream; import java.io.InputStream; import java.util.List; @@ -11,4 +12,8 @@ public interface Parser { List 
parseEntries(InputStream inputStream) throws ParseException; + + default List parseEntries(String dataString) throws ParseException { + return parseEntries(new ByteArrayInputStream(dataString.getBytes(java.nio.charset.StandardCharsets.UTF_8))); + } } diff --git a/src/main/java/org/jabref/logic/importer/WebFetchers.java b/src/main/java/org/jabref/logic/importer/WebFetchers.java index 824db853094..3329a6215f2 100644 --- a/src/main/java/org/jabref/logic/importer/WebFetchers.java +++ b/src/main/java/org/jabref/logic/importer/WebFetchers.java @@ -9,6 +9,7 @@ import org.jabref.logic.importer.fetcher.ACS; import org.jabref.logic.importer.fetcher.ArXiv; import org.jabref.logic.importer.fetcher.AstrophysicsDataSystem; +import org.jabref.logic.importer.fetcher.CiteSeer; import org.jabref.logic.importer.fetcher.CrossRef; import org.jabref.logic.importer.fetcher.DBLPFetcher; import org.jabref.logic.importer.fetcher.DOAJFetcher; @@ -89,6 +90,7 @@ public static List getSearchBasedFetchers(ImportFormatPrefer list.add(new DBLPFetcher(importFormatPreferences)); list.add(new SpringerFetcher()); list.add(new CrossRef()); + list.add(new CiteSeer()); list.add(new DOAJFetcher(importFormatPreferences)); list.add(new IEEE(importFormatPreferences)); list.sort(Comparator.comparing(WebFetcher::getName)); diff --git a/src/main/java/org/jabref/logic/importer/fetcher/CiteSeer.java b/src/main/java/org/jabref/logic/importer/fetcher/CiteSeer.java new file mode 100644 index 00000000000..8d244b101f8 --- /dev/null +++ b/src/main/java/org/jabref/logic/importer/fetcher/CiteSeer.java @@ -0,0 +1,94 @@ +package org.jabref.logic.importer.fetcher; + +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.net.MalformedURLException; +import java.net.URISyntaxException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +import org.jabref.logic.formatter.bibtexfields.HtmlToUnicodeFormatter; +import 
org.jabref.logic.formatter.casechanger.TitleCaseFormatter; +import org.jabref.logic.help.HelpFile; +import org.jabref.logic.importer.FetcherException; +import org.jabref.logic.importer.Parser; +import org.jabref.logic.importer.SearchBasedParserFetcher; +import org.jabref.logic.importer.fileformat.CoinsParser; +import org.jabref.logic.util.OS; +import org.jabref.model.cleanup.FieldFormatterCleanup; +import org.jabref.model.cleanup.Formatter; +import org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.FieldName; + +import org.apache.http.client.utils.URIBuilder; + +public class CiteSeer implements SearchBasedParserFetcher { + + public CiteSeer() { + } + + @Override + public String getName() { + return "CiteSeerX"; + } + + @Override + public HelpFile getHelpPage() { + return HelpFile.FETCHER_CITESEERX; + } + + @Override + public URL getURLForQuery(String query) throws URISyntaxException, MalformedURLException, FetcherException { + URIBuilder uriBuilder = new URIBuilder("https://citeseer.ist.psu.edu/search"); + uriBuilder.addParameter("sort", "rlv"); // Sort by relevance + uriBuilder.addParameter("q", query); // Query + uriBuilder.addParameter("t", "doc"); // Type: documents + //uriBuilder.addParameter("start", "0"); // Start index (not supported at the moment) + return uriBuilder.build().toURL(); + } + + @Override + public Parser getParser() { + // CiteSeerX returns COinS results embedded in HTML, one <span class="Z3988" title="..."></span> per hit + // So we extract the data string from the title attribute of these tags and pass the content to the COinS parser (group 1 of the pattern is that attribute value) + return inputStream -> { + String response = new BufferedReader(new InputStreamReader(inputStream)).lines().collect(Collectors.joining(OS.NEWLINE)); + + List entries = new ArrayList<>(); + CoinsParser parser = new CoinsParser(); + Pattern pattern = Pattern.compile("<span class=\"Z3988\" title=\"(.*)\"></span>"); + Matcher matcher = pattern.matcher(response); + while (matcher.find()) { + String encodedDataString = matcher.group(1); + entries.addAll(parser.parseEntries(encodedDataString)); + } + return entries; + }; + } 
+ + @Override + public void doPostCleanup(BibEntry entry) { + // CiteSeer escapes some characters in a way that is not recognized by the normal html to unicode formatter + // We, of course, also want to convert these special characters: "%3A" becomes ':', the percent-encoded <em> and </em> tags are stripped, and "%2C+" as well as '+' become a space + Formatter extendedHtmlFormatter = new HtmlToUnicodeFormatter() { + @Override + public String format(String fieldText) { + String formatted = super.format(fieldText); + formatted = formatted.replaceAll("%3A", ":"); + formatted = formatted.replaceAll("%3Cem%3E", ""); + formatted = formatted.replaceAll("%3C%2Fem%3E", ""); + formatted = formatted.replaceAll("%2C\\+", " "); + formatted = formatted.replaceAll("\\+", " "); + return formatted; + } + }; + new FieldFormatterCleanup(FieldName.INTERNAL_ALL_FIELD, extendedHtmlFormatter).cleanup(entry); + + // Many titles in the CiteSeer database have all-capital titles, for convenience we convert them to title case + new FieldFormatterCleanup(FieldName.TITLE, new TitleCaseFormatter()).cleanup(entry); + } +} diff --git a/src/main/java/org/jabref/logic/importer/fileformat/BibtexParser.java b/src/main/java/org/jabref/logic/importer/fileformat/BibtexParser.java index d9e3f5d4e83..16093dca0b3 100644 --- a/src/main/java/org/jabref/logic/importer/fileformat/BibtexParser.java +++ b/src/main/java/org/jabref/logic/importer/fileformat/BibtexParser.java @@ -6,7 +6,6 @@ import java.io.InputStreamReader; import java.io.PushbackReader; import java.io.Reader; -import java.io.StringReader; import java.nio.charset.StandardCharsets; import java.util.Collection; import java.util.Deque; @@ -110,10 +109,6 @@ public List parseEntries(Reader reader) throws ParseException { } } - public List parseEntries(String bibtexString) throws ParseException { - return parseEntries(new StringReader(bibtexString)); - } - public Optional parseSingleEntry(String bibtexString) throws ParseException { return parseEntries(bibtexString).stream().findFirst(); } diff --git 
a/src/main/java/org/jabref/logic/importer/fileformat/CoinsParser.java b/src/main/java/org/jabref/logic/importer/fileformat/CoinsParser.java new file mode 100644 index 00000000000..156dc551b5e --- /dev/null +++ b/src/main/java/org/jabref/logic/importer/fileformat/CoinsParser.java @@ -0,0 +1,78 @@ +package org.jabref.logic.importer.fileformat; + +import java.io.BufferedReader; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +import org.jabref.logic.importer.ParseException; +import org.jabref.logic.importer.Parser; +import org.jabref.logic.util.OS; +import org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.BiblatexEntryTypes; +import org.jabref.model.entry.FieldName; + +/** + * @implNote implemented by reverse-engineering
the implementation by CiteSeerX + */ +public class CoinsParser implements Parser { + + private static final Pattern DOI = Pattern.compile("%3Fdoi%3D([^&]+)"); + private static final Pattern TITLE = Pattern.compile("&rft\\.atitle=([^&]+)"); + private static final Pattern JOURNAL = Pattern.compile("&rft\\.jtitle=([^&]+)"); + private static final Pattern YEAR = Pattern.compile("&rft\\.date=([^&]+)"); + private static final Pattern VOLUME = Pattern.compile("&rft\\.volume=([^&]+)"); + private static final Pattern PAGES = Pattern.compile("&rft\\.pages=([^&]+)"); + private static final Pattern ISSUE = Pattern.compile("&rft\\.issue=([^&]+)"); + private static final Pattern TYPE = Pattern.compile("&rft\\.genre=([^&]+)"); + private static final Pattern AUTHOR = Pattern.compile("&rft\\.au=([^&]+)"); + + @Override + public List parseEntries(InputStream inputStream) throws ParseException { + String data = new BufferedReader(new InputStreamReader(inputStream)).lines().collect(Collectors.joining(OS.NEWLINE)); + BibEntry entry = new BibEntry(); + + appendData(data, entry, DOI, FieldName.DOI); + appendData(data, entry, TITLE, FieldName.TITLE); + appendData(data, entry, JOURNAL, FieldName.JOURNALTITLE); + appendData(data, entry, YEAR, FieldName.YEAR); + appendData(data, entry, VOLUME, FieldName.VOLUME); + appendData(data, entry, PAGES, FieldName.PAGES); + appendData(data, entry, ISSUE, FieldName.ISSUE); + + Matcher matcherType = TYPE.matcher(data); + if (matcherType.find()) { + switch (matcherType.group(1)) { + case "article": + entry.setType(BiblatexEntryTypes.ARTICLE); + break; + case "unknown": + default: + entry.setType(BiblatexEntryTypes.MISC); + break; + } + } + + List authors = new ArrayList<>(); + Matcher matcherAuthors = AUTHOR.matcher(data); + while (matcherAuthors.find()) { + String author = matcherAuthors.group(1); + authors.add(author); + } + entry.setField(FieldName.AUTHOR, authors.stream().collect(Collectors.joining(" and "))); + + return Collections.singletonList(entry); + } + + private void appendData(String data, BibEntry entry, Pattern 
pattern, String fieldName) { + Matcher matcher = pattern.matcher(data); + if (matcher.find()) { + entry.setField(fieldName, matcher.group(1)); + } + } +} diff --git a/src/test/java/org/jabref/logic/importer/fetcher/ACMPortalFetcherTest.java b/src/test/java/org/jabref/logic/importer/fetcher/ACMPortalFetcherTest.java index 4bd9848ab4d..df85c30857f 100644 --- a/src/test/java/org/jabref/logic/importer/fetcher/ACMPortalFetcherTest.java +++ b/src/test/java/org/jabref/logic/importer/fetcher/ACMPortalFetcherTest.java @@ -7,6 +7,7 @@ import org.jabref.logic.importer.ImportFormatPreferences; import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.BibtexEntryTypes; +import org.jabref.testutils.category.FetcherTest; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -15,6 +16,7 @@ import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; +@FetcherTest class ACMPortalFetcherTest { ACMPortalFetcher fetcher; diff --git a/src/test/java/org/jabref/logic/importer/fetcher/CiteSeerTest.java b/src/test/java/org/jabref/logic/importer/fetcher/CiteSeerTest.java new file mode 100644 index 00000000000..5460e4d3abb --- /dev/null +++ b/src/test/java/org/jabref/logic/importer/fetcher/CiteSeerTest.java @@ -0,0 +1,50 @@ +package org.jabref.logic.importer.fetcher; + +import java.util.Collections; +import java.util.List; + +import org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.BibtexEntryTypes; +import org.jabref.testutils.category.FetcherTest; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +@FetcherTest +class CiteSeerTest { + + CiteSeer fetcher; + + @BeforeEach + void setUp() throws Exception { + fetcher = new CiteSeer(); + } + + @Test + void searchByQueryFindsEntry() throws Exception { + BibEntry expected = new BibEntry(); + expected.setType(BibtexEntryTypes.MISC); + expected.setField("author", "Wang Wei and Zhang 
Pingwen and Zhang Zhifei"); + expected.setField("title", "Rigorous Derivation from Landau-de Gennes Theory to Ericksen-leslie Theory"); + expected.setField("doi", "10.1.1.744.5780"); + + List fetchedEntries = fetcher.performSearch("title:Ericksen-Leslie AND venue:q AND ncites:[10 TO 15000]"); + assertEquals(Collections.singletonList(expected), fetchedEntries); + } + + @Test + void searchByQueryFindsEntry2() throws Exception { + BibEntry expected = new BibEntry(); + expected.setType(BibtexEntryTypes.MISC); + expected.setField("author", "Lazarus Richard S."); + expected.setField("title", "Coping Theory and Research: Past Present and Future"); + expected.setField("doi", "10.1.1.115.9665"); + expected.setField("year", "1993"); + expected.setField("journaltitle", "PSYCHOSOMATIC MEDICINE"); + + List fetchedEntries = fetcher.performSearch("JabRef"); + assertEquals(expected, fetchedEntries.get(4)); + } +} diff --git a/src/test/java/org/jabref/logic/importer/fetcher/GvkParserTest.java b/src/test/java/org/jabref/logic/importer/fetcher/GvkParserTest.java index 443b46be140..5e650d3378b 100644 --- a/src/test/java/org/jabref/logic/importer/fetcher/GvkParserTest.java +++ b/src/test/java/org/jabref/logic/importer/fetcher/GvkParserTest.java @@ -9,13 +9,14 @@ import org.jabref.logic.bibtex.BibEntryAssert; import org.jabref.logic.importer.fileformat.GvkParser; import org.jabref.model.entry.BibEntry; +import org.jabref.testutils.category.FetcherTest; import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; - +@FetcherTest public class GvkParserTest { private void doTest(String xmlName, int expectedSize, List resourceNames) throws Exception { diff --git a/src/test/java/org/jabref/logic/importer/fetcher/INSPIREFetcherTest.java b/src/test/java/org/jabref/logic/importer/fetcher/INSPIREFetcherTest.java index 5f436313d5a..d919f96ce1d 100644 --- 
a/src/test/java/org/jabref/logic/importer/fetcher/INSPIREFetcherTest.java +++ b/src/test/java/org/jabref/logic/importer/fetcher/INSPIREFetcherTest.java @@ -7,6 +7,7 @@ import org.jabref.logic.importer.ImportFormatPreferences; import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.BibtexEntryTypes; +import org.jabref.testutils.category.FetcherTest; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -15,6 +16,7 @@ import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; +@FetcherTest class INSPIREFetcherTest { private INSPIREFetcher fetcher; diff --git a/src/test/java/org/jabref/logic/importer/fetcher/SpringerFetcherTest.java b/src/test/java/org/jabref/logic/importer/fetcher/SpringerFetcherTest.java index f224476c553..7123d5c3e50 100644 --- a/src/test/java/org/jabref/logic/importer/fetcher/SpringerFetcherTest.java +++ b/src/test/java/org/jabref/logic/importer/fetcher/SpringerFetcherTest.java @@ -6,6 +6,7 @@ import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.BibtexEntryTypes; +import org.jabref.testutils.category.FetcherTest; import org.json.JSONObject; import org.junit.jupiter.api.BeforeEach; @@ -13,6 +14,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; +@FetcherTest class SpringerFetcherTest { SpringerFetcher fetcher;