Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/enable paginated fetchers #7082

Merged
merged 9 commits into from
Nov 15, 2020
Original file line number Diff line number Diff line change
@@ -1,17 +1,27 @@
package org.jabref.logic.importer;

import org.jabref.logic.importer.fetcher.ComplexSearchQuery;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.paging.Page;

public interface PagedSearchBasedFetcher extends SearchBasedFetcher {

/**
 * Searches the endpoint and returns one page of results.
 *
 * @param query search query sent to the endpoint
 * @param pageNumber requested page number, indexed from 0
 * @return Page with search results
 * @throws FetcherException if the request to the endpoint fails
 */
Page<BibEntry> performSearchPaged(String query, int pageNumber) throws FetcherException;

/**
 * Searches the endpoint using the string representation of the complex query and returns one page of results.
 *
 * @param query complex search query sent to the endpoint
 * @param pageNumber requested page number, indexed from 0
 * @return Page with search results
 * @throws FetcherException if the request to the endpoint fails
 */
default Page<BibEntry> performComplexSearchPaged(ComplexSearchQuery query, int pageNumber) throws FetcherException {
return performSearchPaged(query.toString(), pageNumber);
}

/**
* @return default pageSize
*/
Expand Down
Original file line number Diff line number Diff line change
@@ -1,16 +1,75 @@
package org.jabref.logic.importer;

import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Collections;
import java.util.List;

import org.jabref.logic.importer.fetcher.ComplexSearchQuery;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.paging.Page;
import org.jabref.model.strings.StringUtil;

public interface PagedSearchBasedParserFetcher extends SearchBasedParserFetcher, PagedSearchBasedFetcher {

@Override
default Page<BibEntry> performSearchPaged(String query, int pageNumber) throws FetcherException {
if (StringUtil.isBlank(query)) {
return new Page<BibEntry>(query, pageNumber, Collections.emptyList());
}

// ADR-0014
URL urlForQuery;
try {
urlForQuery = getURLForQuery(query, pageNumber);
} catch (URISyntaxException | MalformedURLException e) {
throw new FetcherException(String.format("Search URI crafted from query %s is malformed", query), e);
}
return new Page<>(query, pageNumber, getBibEntries(urlForQuery));
}

private List<BibEntry> getBibEntries(URL urlForQuery) throws FetcherException {
try (InputStream stream = getUrlDownload(urlForQuery).asInputStream()) {
List<BibEntry> fetchedEntries = getParser().parseEntries(stream);
fetchedEntries.forEach(this::doPostCleanup);
return fetchedEntries;
} catch (IOException e) {
throw new FetcherException("A network error occurred while fetching from " + urlForQuery, e);
} catch (ParseException e) {
throw new FetcherException("An internal parser error occurred while fetching from " + urlForQuery, e);
}
}

@Override
default Page<BibEntry> performComplexSearchPaged(ComplexSearchQuery complexSearchQuery, int pageNumber) throws FetcherException {
// ADR-0014
URL urlForQuery;
try {
urlForQuery = getComplexQueryURL(complexSearchQuery, pageNumber);
} catch (URISyntaxException | MalformedURLException e) {
throw new FetcherException("Search URI crafted from complex search query is malformed", e);
}
return new Page<>(complexSearchQuery.toString(), pageNumber, getBibEntries(urlForQuery));
}

/**
* Constructs a URL based on the query, size and page number.
*
* @param query the search query
* @param pageNumber the number of the page indexed from 0
*/
URL getURLForQuery(String query, int pageNumber) throws URISyntaxException, MalformedURLException;

/**
* Constructs a URL based on the query, size and page number.
* @param query the search query
* @param size the size of the page
* @param pageNumber the number of the page
* */
URL getURLForQuery(String query, int size, int pageNumber) throws URISyntaxException, MalformedURLException, FetcherException;
*
* @param complexSearchQuery the search query
* @param pageNumber the number of the page indexed from 0
*/
default URL getComplexQueryURL(ComplexSearchQuery complexSearchQuery, int pageNumber) throws URISyntaxException, MalformedURLException {
return getURLForQuery(complexSearchQuery.toString(), pageNumber);
}
}
24 changes: 21 additions & 3 deletions src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import org.jabref.logic.importer.IdBasedFetcher;
import org.jabref.logic.importer.IdFetcher;
import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.logic.importer.SearchBasedFetcher;
import org.jabref.logic.importer.PagedSearchBasedFetcher;
import org.jabref.logic.util.io.XMLUtil;
import org.jabref.logic.util.strings.StringSimilarity;
import org.jabref.model.entry.BibEntry;
Expand All @@ -31,6 +31,7 @@
import org.jabref.model.entry.identifier.ArXivIdentifier;
import org.jabref.model.entry.identifier.DOI;
import org.jabref.model.entry.types.StandardEntryType;
import org.jabref.model.paging.Page;
import org.jabref.model.strings.StringUtil;
import org.jabref.model.util.OptionalUtil;

Expand All @@ -52,7 +53,7 @@
* <a href="https://github.com/nathangrigg/arxiv2bib">arxiv2bib</a> which is <a href="https://arxiv2bibtex.org/">live</a>
<a href="https://gitlab.c3sl.ufpr.br/portalmec/dspace-portalmec/blob/aa209d15082a9870f9daac42c78a35490ce77b52/dspace-api/src/main/java/org/dspace/submit/lookup/ArXivService.java">dspace-portalmec</a>
*/
public class ArXiv implements FulltextFetcher, SearchBasedFetcher, IdBasedFetcher, IdFetcher<ArXivIdentifier> {
public class ArXiv implements FulltextFetcher, PagedSearchBasedFetcher, IdBasedFetcher, IdFetcher<ArXivIdentifier> {

private static final Logger LOGGER = LoggerFactory.getLogger(ArXiv.class);

Expand Down Expand Up @@ -157,6 +158,10 @@ private List<ArXivEntry> searchForEntries(String searchQuery) throws FetcherExce
return queryApi(searchQuery, Collections.emptyList(), 0, 10);
}

/**
 * Fetches one page of arXiv entries for the given query.
 *
 * @param searchQuery query string passed to the arXiv API
 * @param pageNumber requested page, indexed from 0
 */
private List<ArXivEntry> searchForEntries(String searchQuery, int pageNumber) throws FetcherException {
    int pageSize = getPageSize();
    int startIndex = pageSize * pageNumber;
    return queryApi(searchQuery, Collections.emptyList(), startIndex, pageSize);
}

private List<ArXivEntry> queryApi(String searchQuery, List<ArXivIdentifier> ids, int start, int maxResults)
throws FetcherException {
Document result = callApi(searchQuery, ids, start, maxResults);
Expand Down Expand Up @@ -263,6 +268,19 @@ public List<BibEntry> performSearch(String query) throws FetcherException {
*/
@Override
public List<BibEntry> performComplexSearch(ComplexSearchQuery complexSearchQuery) throws FetcherException {
    // Delegate to the paged variant and materialize the first page as a mutable list.
    Page<BibEntry> firstPage = performComplexSearchPaged(complexSearchQuery, 0);
    return new ArrayList<>(firstPage.getContent());
}

@Override
public Page<BibEntry> performSearchPaged(String query, int pageNumber) throws FetcherException {
    // Convert each fetched arXiv entry into a BibEntry using the configured keyword separator.
    List<BibEntry> convertedEntries = new ArrayList<>();
    for (ArXivEntry arXivEntry : searchForEntries(query, pageNumber)) {
        convertedEntries.add(arXivEntry.toBibEntry(importFormatPreferences.getKeywordSeparator()));
    }
    return new Page<>(query, pageNumber, convertedEntries);
}

@Override
public Page<BibEntry> performComplexSearchPaged(ComplexSearchQuery complexSearchQuery, int pageNumber) throws FetcherException {
List<String> searchTerms = new ArrayList<>();
complexSearchQuery.getAuthors().forEach(author -> searchTerms.add("au:" + author));
complexSearchQuery.getTitlePhrases().forEach(title -> searchTerms.add("ti:" + title));
Expand All @@ -272,7 +290,7 @@ public List<BibEntry> performComplexSearch(ComplexSearchQuery complexSearchQuery
complexSearchQuery.getToYear().ifPresent(year -> searchTerms.add(year.toString()));
searchTerms.addAll(complexSearchQuery.getDefaultFieldPhrases());
String complexQueryString = String.join(" AND ", searchTerms);
return performSearch(complexQueryString);
return performSearchPaged(complexQueryString, pageNumber);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,12 +91,12 @@ public URL getURLForQuery(String query) throws URISyntaxException, MalformedURLE
}

@Override
public URL getURLForQuery(String query, int size, int pageNumber) throws URISyntaxException, MalformedURLException, FetcherException {
public URL getURLForQuery(String query, int pageNumber) throws URISyntaxException, MalformedURLException {
URIBuilder builder = new URIBuilder(API_SEARCH_URL);
builder.addParameter("q", query);
builder.addParameter("fl", "bibcode");
builder.addParameter("rows", String.valueOf(size));
builder.addParameter("start", String.valueOf(size * pageNumber));
builder.addParameter("rows", String.valueOf(getPageSize()));
builder.addParameter("start", String.valueOf(getPageSize() * pageNumber));
return builder.build().toURL();
}

Expand All @@ -105,7 +105,7 @@ public URL getURLForQuery(String query, int size, int pageNumber) throws URISynt
* @return URL which points to a search request for given entry
*/
@Override
public URL getURLForEntry(BibEntry entry) throws URISyntaxException, MalformedURLException, FetcherException {
public URL getURLForEntry(BibEntry entry) throws URISyntaxException, MalformedURLException {
StringBuilder stringBuilder = new StringBuilder();

Optional<String> title = entry.getFieldOrAlias(StandardField.TITLE).map(t -> "title:\"" + t + "\"");
Expand Down Expand Up @@ -300,12 +300,11 @@ private List<BibEntry> performSearchByIds(Collection<String> identifiers) throws

@Override
public Page<BibEntry> performSearchPaged(String query, int pageNumber) throws FetcherException {

if (StringUtil.isBlank(query)) {
return new Page<>(query, pageNumber);
}
try {
List<String> bibcodes = fetchBibcodes(getURLForQuery(query, getPageSize(), pageNumber));
List<String> bibcodes = fetchBibcodes(getURLForQuery(query, pageNumber));
Collection<BibEntry> results = performSearchByIds(bibcodes);
return new Page<>(query, pageNumber, results);
} catch (URISyntaxException e) {
Expand Down
79 changes: 46 additions & 33 deletions src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,14 @@
import org.jabref.logic.importer.FetcherException;
import org.jabref.logic.importer.FulltextFetcher;
import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.logic.importer.PagedSearchBasedFetcher;
import org.jabref.logic.importer.ParserResult;
import org.jabref.logic.importer.SearchBasedFetcher;
import org.jabref.logic.importer.fileformat.BibtexParser;
import org.jabref.logic.l10n.Localization;
import org.jabref.logic.net.URLDownload;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.StandardField;
import org.jabref.model.paging.Page;
import org.jabref.model.util.DummyFileUpdateMonitor;

import org.apache.http.client.utils.URIBuilder;
Expand All @@ -38,7 +39,7 @@
* <p>
* Search String infos: https://scholar.google.com/intl/en/scholar/help.html#searching
*/
public class GoogleScholar implements FulltextFetcher, SearchBasedFetcher {
public class GoogleScholar implements FulltextFetcher, PagedSearchBasedFetcher {
private static final Logger LOGGER = LoggerFactory.getLogger(GoogleScholar.class);

private static final Pattern LINK_TO_BIB_PATTERN = Pattern.compile("(https:\\/\\/scholar.googleusercontent.com\\/scholar.bib[^\"]*)");
Expand Down Expand Up @@ -128,37 +129,7 @@ public Optional<HelpFile> getHelpPage() {

/**
 * Performs an unpaged search by returning the content of the first result page.
 *
 * @param query search query sent to Google Scholar
 * @return the entries of the first result page as a mutable list
 * @throws FetcherException if fetching from Google Scholar fails
 */
@Override
public List<BibEntry> performSearch(String query) throws FetcherException {
    return new ArrayList<>(performSearchPaged(query, 0).getContent());
}

@Override
Expand Down Expand Up @@ -259,4 +230,46 @@ private void obtainAndModifyCookie() throws FetcherException {
throw new FetcherException("Cookie configuration for Google Scholar failed.", e);
}
}

@Override
public Page<BibEntry> performSearchPaged(String query, int pageNumber) throws FetcherException {
LOGGER.debug("Using URL {}", query);
obtainAndModifyCookie();
List<BibEntry> foundEntries = new ArrayList<>(20);

URIBuilder uriBuilder = null;
try {
uriBuilder = new URIBuilder(BASIC_SEARCH_URL);
} catch (URISyntaxException e) {
throw new FetcherException("Error while fetching from " + getName() + " at URL " + BASIC_SEARCH_URL, e);
}

uriBuilder.addParameter("hl", "en");
uriBuilder.addParameter("start", String.valueOf(pageNumber * getPageSize()));
uriBuilder.addParameter("num", String.valueOf(getPageSize()));
uriBuilder.addParameter("btnG", "Search");
uriBuilder.addParameter("q", query);
String queryURL = uriBuilder.toString();

try {
addHitsFromQuery(foundEntries, queryURL);
} catch (IOException e) {
// if there are too much requests from the same IP address google is answering with a 503 and redirecting to a captcha challenge
// The caught IOException looks for example like this:
// java.io.IOException: Server returned HTTP response code: 503 for URL: https://ipv4.google.com/sorry/index?continue=https://scholar.google.com/scholar%3Fhl%3Den%26btnG%3DSearch%26q%3Dbpmn&hl=en&q=CGMSBI0NBDkYuqy9wAUiGQDxp4NLQCWbIEY1HjpH5zFJhv4ANPGdWj0
if (e.getMessage().contains("Server returned HTTP response code: 503 for URL")) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This struck me recently, isn't there a way to check if the exception is a more specific exception so that one could check for the status code number?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do not quite understand what the benefits of that would be.
Could you provide me with an example to grasp your idea, please? :)

throw new FetcherException("Fetching from Google Scholar at URL " + queryURL + " failed.",
Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), e);
} else {
throw new FetcherException("Error while fetching from " + getName() + " at URL " + queryURL, e);
}
}

return new Page<>(query, pageNumber, foundEntries);
}

/**
 * Performs a paged complex search by falling back to the string representation of the query,
 * mirroring the {@code PagedSearchBasedFetcher} default behavior.
 * Previously this returned {@code null}, which would cause a NullPointerException in callers.
 *
 * @param query complex search query sent to Google Scholar
 * @param pageNumber requested page number, indexed from 0
 * @return Page with the fetched entries
 * @throws FetcherException if fetching from Google Scholar fails
 */
@Override
public Page<BibEntry> performComplexSearchPaged(ComplexSearchQuery query, int pageNumber) throws FetcherException {
    return performSearchPaged(query.toString(), pageNumber);
}
}
35 changes: 26 additions & 9 deletions src/main/java/org/jabref/logic/importer/fetcher/IEEE.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
import org.jabref.logic.help.HelpFile;
import org.jabref.logic.importer.FulltextFetcher;
import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.logic.importer.PagedSearchBasedParserFetcher;
import org.jabref.logic.importer.Parser;
import org.jabref.logic.importer.SearchBasedParserFetcher;
import org.jabref.logic.net.URLDownload;
import org.jabref.logic.util.BuildInfo;
import org.jabref.logic.util.OS;
Expand All @@ -41,7 +41,7 @@
*
* @implNote <a href="https://developer.ieee.org/docs">API documentation</a>
*/
public class IEEE implements FulltextFetcher, SearchBasedParserFetcher {
public class IEEE implements FulltextFetcher, PagedSearchBasedParserFetcher {

private static final Logger LOGGER = LoggerFactory.getLogger(IEEE.class);
private static final String STAMP_BASE_STRING_DOCUMENT = "/stamp/stamp.jsp?tp=&arnumber=";
Expand Down Expand Up @@ -193,13 +193,7 @@ public TrustLevel getTrustLevel() {

/**
 * Constructs the search URL for the first result page.
 *
 * @param query the search query
 * @return URL of the IEEE Xplore search request for page 0
 */
@Override
public URL getURLForQuery(String query) throws URISyntaxException, MalformedURLException {
    return getURLForQuery(query, 0);
}

@Override
Expand Down Expand Up @@ -234,8 +228,31 @@ public Optional<HelpFile> getHelpPage() {

@Override
public URL getComplexQueryURL(ComplexSearchQuery complexSearchQuery) throws URISyntaxException, MalformedURLException {
return getComplexQueryURL(complexSearchQuery, 0);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be the default implementation, i.e. push to PagedSearchBasedParserFetcher

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You have currently quite a lot of code duplications. The main sources are:

  1. Complex and string-based queries have to be implemented separately, leading to mostly the same code. Suggestion: remove string-based queries completely
  2. Unpaged search methods always fall back to paged search method with page number 0. Suggestion: put these fallbacks in the general PagedSearchFetcher interface.

Thanks for your feedback! :)
Regarding 1:
I think that this is a good idea but will require more work than just replacing the method.
This is because for the WebSearchPane for instance the normal string-based search is used as a fallback case when the query could not be parsed.
Therefore I do not feel comfortable just tossing the string-based version.
However, I will address this in the upcoming weeks :)!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I implemented all other suggestions

Copy link
Member

@tobiasdiez tobiasdiez Nov 11, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good!

Concerning the first point, doesn't it work to parse the queries as follows:
author=me and title=something -> ComplexQuery[author: "me", title: "something", rest: ""]
author=me something -> ComplexQuery[author: "me", title: "", rest: "something"]
something -> ComplexQuery[author: "", title: "", rest: ""something]

Then you don't need any fall-back to a purely string-based search.

In my opinion, this question should be resolved before changing the fetcher in other ways. Otherwise you have a lot of overhead/code duplication now, that will be removed later.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I now removed the normal performSearch all together :)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks a lot! If you now also remove the "complex" in the names, I'm super happy ;-)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I now removed the complex from perform search :)
Regarding the query object, I believe that it is sensible to keep that name as it will be extended in the future such as structured information.

}

/**
 * Constructs the IEEE Xplore search URL for the given query and page.
 *
 * @param query the search query
 * @param pageNumber the number of the page, indexed from 0
 */
@Override
public URL getURLForQuery(String query, int pageNumber) throws URISyntaxException, MalformedURLException {
    URIBuilder uriBuilder = new URIBuilder("https://ieeexploreapi.ieee.org/api/v1/search/articles");
    uriBuilder.addParameter("apikey", API_KEY);
    uriBuilder.addParameter("querytext", query);
    uriBuilder.addParameter("max_records", String.valueOf(getPageSize()));
    // The API indexes records starting at 1, so the first record of page n is n * pageSize + 1.
    // BUG FIX: the previous code appended 1 to the String ("20" + 1 -> "201") instead of adding it numerically.
    uriBuilder.addParameter("start_record", String.valueOf(getPageSize() * pageNumber + 1));

    URLDownload.bypassSSLVerification();

    return uriBuilder.build().toURL();
}

@Override
public URL getComplexQueryURL(ComplexSearchQuery complexSearchQuery, int pageNumber) throws URISyntaxException, MalformedURLException {
URIBuilder uriBuilder = new URIBuilder("https://ieeexploreapi.ieee.org/api/v1/search/articles");
uriBuilder.addParameter("apikey", API_KEY);
uriBuilder.addParameter("max_records", String.valueOf(getPageSize()));
// Starts to index at 1 for the first entry
uriBuilder.addParameter("start_record", String.valueOf(getPageSize() * pageNumber) + 1);

if (!complexSearchQuery.getDefaultFieldPhrases().isEmpty()) {
uriBuilder.addParameter("querytext", String.join(" AND ", complexSearchQuery.getDefaultFieldPhrases()));
}
Expand Down
Loading