Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Re-use CURLs to go to the same host again and again #4399

Merged
merged 9 commits into from
Feb 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 26 additions & 16 deletions Template.php
Original file line number Diff line number Diff line change
Expand Up @@ -1979,6 +1979,10 @@
}

public function get_doi_from_crossref() : void {
static $ch = NULL;
if ($ch === NULL) {
$ch = curl_init_array(1.0, [CURLOPT_USERAGENT => BOT_CROSSREF_USER_AGENT]);
}
set_time_limit(120);
if ($this->has('doi')) return;
report_action("Checking CrossRef database for doi. ");
Expand Down Expand Up @@ -2024,11 +2028,8 @@
. ($data['volume'] ? "&volume=" . urlencode($data['volume']) : '')
. ($data['issn'] ? "&issn=" . urlencode($data['issn']) : "&title=" . urlencode($data['journal']))
. "&mailto=".CROSSREFUSERNAME; // do not encode crossref email
$ch = curl_init_array(1.0,
[CURLOPT_URL => $url,
CURLOPT_USERAGENT => BOT_CROSSREF_USER_AGENT]);
curl_setopt($ch, CURLOPT_URL, $url);

Check failure on line 2031 in Template.php

View workflow job for this annotation

GitHub Actions / build

TaintedSSRF

Template.php:2031:34: TaintedSSRF: Detected tainted network request (see https://psalm.dev/253)
$xml = curl_exec($ch);
unset($ch);
if (is_string($xml) && (strlen($xml) > 0)) {
$result = @simplexml_load_string($xml);
} else {
Expand Down Expand Up @@ -2617,14 +2618,18 @@
}

protected function get_semanticscholar_url(string $doi) : void {
static $ch = NULL;
if ($ch === NULL) {
$ch = curl_init_array(0.5, [CURLOPT_HTTPHEADER => HEADER_S2]);
}
set_time_limit(120);
if( $this->has('pmc') ||
($this->has('doi') && $this->get('doi-access') === 'free') ||
($this->has('jstor') && $this->get('jstor-access') === 'free')
) return; // do not add url if have OA already. Do indlude preprints in list
if ($this->has('s2cid') || $this->has('S2CID')) return;
$url = 'https://api.semanticscholar.org/v1/paper/' . doi_encode(urldecode($doi));
$ch = curl_init_array(0.5, [CURLOPT_HTTPHEADER => HEADER_S2, CURLOPT_URL => $url]);
curl_setopt($ch, CURLOPT_URL, $url);
$response = (string) curl_exec($ch);
if ($response) {
$oa = @json_decode($response);
Expand All @@ -2635,13 +2640,14 @@
}

public function get_unpaywall_url(string $doi) : string {
static $ch_oa = NULL;
if ($ch_oa === NULL) {
$ch_oa = curl_init_array(0.5, [CURLOPT_USERAGENT => BOT_CROSSREF_USER_AGENT]);
}
set_time_limit(120);
$url = "https://api.unpaywall.org/v2/$doi?email=" . CROSSREFUSERNAME;
$ch = curl_init_array(1.0,
[CURLOPT_URL => $url,
CURLOPT_USERAGENT => BOT_CROSSREF_USER_AGENT]);
$json = (string) @curl_exec($ch);
unset($ch);
curl_setopt($ch_oa, CURLOPT_URL, $url);

Check failure on line 2649 in Template.php

View workflow job for this annotation

GitHub Actions / build

TaintedSSRF

Template.php:2649:35: TaintedSSRF: Detected tainted network request (see https://psalm.dev/253)
$json = (string) @curl_exec($ch_oa);
if ($json) {
$oa = @json_decode($json);
if ($oa !== FALSE && isset($oa->best_oa_location)) {
Expand Down Expand Up @@ -2862,6 +2868,10 @@
}

protected function expand_by_google_books_inner(string $url_type, bool $use_it) : bool {
static $ch = NULL;
if ($ch === NULL) {
$ch = curl_init_array(1.0, []);
}
set_time_limit(120);
if ($url_type) {
$url = $this->get($url_type);
Expand Down Expand Up @@ -2897,10 +2907,8 @@
}
if ($isbn) { // Try Books.Google.Com
$google_book_url = 'https://www.google.com/search?tbo=p&tbm=bks&q=isbn:' . $isbn;
$ch = curl_init_array(1.0,
[CURLOPT_URL => $google_book_url]);
curl_setopt($ch, CURLOPT_URL, $google_book_url);

Check failure on line 2910 in Template.php

View workflow job for this annotation

GitHub Actions / build

TaintedSSRF

Template.php:2910:33: TaintedSSRF: Detected tainted network request (see https://psalm.dev/253)
$google_content = (string) @curl_exec($ch);
unset($ch);
if ($google_content && preg_match_all('~[Bb]ooks\.[Gg]oogle\.com/books\?id=(............)&amp~', $google_content, $google_results)) {
$google_results = $google_results[1];
$google_results = array_unique($google_results);
Expand Down Expand Up @@ -2940,12 +2948,14 @@
}

protected function google_book_details(string $gid) : void {
static $ch = NULL;
if ($ch === NULL) {
$ch = curl_init_array(1.0, []);
}
set_time_limit(120);
$google_book_url = "https://books.google.com/books/feeds/volumes/" . $gid;
$ch = curl_init_array(1.0,
[CURLOPT_URL => $google_book_url]);
curl_setopt($ch, CURLOPT_URL, $google_book_url);
$data = (string) @curl_exec($ch);
unset($ch);
if ($data === '') return;
$simplified_xml = str_replace('http___//www.w3.org/2005/Atom', 'http://www.w3.org/2005/Atom',
str_replace(":", "___", $data));
Expand Down
86 changes: 58 additions & 28 deletions apiFunctions.php
Original file line number Diff line number Diff line change
Expand Up @@ -223,12 +223,15 @@
@param array<Template> $templates
**/
function arxiv_api(array $ids, array &$templates) : bool { // Pointer to save memory
static $ch = NULL;
if ($ch === NULL) {
$ch = curl_init_array(1.0, []);
}
set_time_limit(120);
if (count($ids) === 0) return FALSE;
report_action("Getting data from arXiv API");
$request = "https://export.arxiv.org/api/query?start=0&max_results=2000&id_list=" . implode(',', $ids);
$ch = curl_init_array(1.0,
[CURLOPT_URL => $request]);
curl_setopt($ch, CURLOPT_URL, $request);

Check failure on line 234 in apiFunctions.php

View workflow job for this annotation

GitHub Actions / build

TaintedSSRF

apiFunctions.php:234:33: TaintedSSRF: Detected tainted network request (see https://psalm.dev/253)
$response = (string) @curl_exec($ch);
if ($response) {
$xml = @simplexml_load_string(
Expand Down Expand Up @@ -594,12 +597,15 @@
}

function query_crossref(string $doi) : ?object {
static $ch = NULL;
if ($ch === NULL) {
$ch = curl_init_array(1.0, []);
}
if (strpos($doi, '10.2307') === 0) return NULL; // jstor API is better
set_time_limit(120);
$doi = str_replace(DOI_URL_DECODE, DOI_URL_ENCODE, $doi);
$url = "https://www.crossref.org/openurl/?pid=" . CROSSREFUSERNAME . "&id=doi:$doi&noredirect=TRUE";
$ch = curl_init_array(1.0,
[CURLOPT_URL => $url]);
curl_setopt($ch, CURLOPT_URL, $url);

Check failure on line 608 in apiFunctions.php

View workflow job for this annotation

GitHub Actions / build

TaintedSSRF

apiFunctions.php:608:33: TaintedSSRF: Detected tainted network request (see https://psalm.dev/253)
for ($i = 0; $i < 2; $i++) {
$raw_xml = (string) @curl_exec($ch);
if (!$raw_xml) {
Expand All @@ -613,7 +619,6 @@
$raw_xml);
$xml = @simplexml_load_string($raw_xml);
if (is_object($xml) && isset($xml->query_result->body->query)) {
unset($ch);
$result = $xml->query_result->body->query;
if ((string) @$result["status"] === "resolved") {
if (stripos($doi, '10.1515/crll') === 0) {
Expand All @@ -638,7 +643,6 @@
// Keep trying...
}
}
unset($ch); // @codeCoverageIgnore
report_warning("Error loading CrossRef file from DOI " . echoable($doi) . "!"); // @codeCoverageIgnore
return NULL; // @codeCoverageIgnore
}
Expand All @@ -650,6 +654,11 @@
// Examples of DOI usage https://www.doi.org/demos.html
// This basically does this:
// curl -LH "Accept: application/vnd.citationstyles.csl+json" https://dx.doi.org/10.5524/100077
static $ch = NULL;
if ($ch === NULL) {
$ch = curl_init_array(1.5, // can take a long time when nothing to be found
[CURLOPT_HTTPHEADER => ["Accept: application/vnd.citationstyles.csl+json"]]);
}
if (strpos($doi, '10.2307') === 0) return FALSE; // jstor API is better
if (strpos($doi, '10.24436') === 0) return FALSE; // They have horrible meta-data
if (strpos($doi, '10.5284/1028203') === 0) return FALSE; // database
Expand All @@ -666,17 +675,14 @@
return $template->add_if_new($name, (string) $data, 'dx');
};
if (!$doi) return FALSE;
$ch = curl_init_array(1.5, // can take a long time when nothing to be found
[CURLOPT_URL => 'https://doi.org/' . $doi,
CURLOPT_HTTPHEADER => ["Accept: application/vnd.citationstyles.csl+json"]]);
curl_setopt($ch, CURLOPT_URL, 'https://doi.org/' . $doi);

Check failure on line 678 in apiFunctions.php

View workflow job for this annotation

GitHub Actions / build

TaintedSSRF

apiFunctions.php:678:36: TaintedSSRF: Detected tainted network request (see https://psalm.dev/253)
report_action("Querying dx.doi.org: doi:" . doi_link($doi));
try {
$data = (string) @curl_exec($ch);
} catch (Exception $e) { // @codeCoverageIgnoreStart
$template->mark_inactive_doi();
return FALSE;
} // @codeCoverageIgnoreEnd
unset($ch);
if ($data === "" || stripos($data, 'DOI Not Found') !== FALSE || stripos($data, 'DOI prefix') !== FALSE) {
$template->mark_inactive_doi();
return FALSE;
Expand Down Expand Up @@ -789,6 +795,10 @@
}

function expand_by_jstor(Template $template) : bool {
static $ch = NULL;
if ($ch === NULL) {
$ch = curl_init_array(1.0, []);
}
set_time_limit(120);
if ($template->incomplete() === FALSE) return FALSE;
if ($template->has('jstor')) {
Expand All @@ -804,10 +814,8 @@
$jstor = trim($jstor);
if (strpos($jstor, ' ') !== FALSE) return FALSE ; // Comment/template found
if (substr($jstor, 0, 1) === 'i') return FALSE ; // We do not want i12342 kind
$ch = curl_init_array(1.0,
[CURLOPT_URL => 'https://www.jstor.org/citation/ris/' . $jstor ]);
curl_setopt($ch, CURLOPT_URL, 'https://www.jstor.org/citation/ris/' . $jstor);

Check failure on line 817 in apiFunctions.php

View workflow job for this annotation

GitHub Actions / build

TaintedSSRF

apiFunctions.php:817:33: TaintedSSRF: Detected tainted network request (see https://psalm.dev/253)
$dat = (string) @curl_exec($ch);
unset($ch);
if ($dat === '') {
report_info("JSTOR API returned nothing for ". jstor_link($jstor)); // @codeCoverageIgnore
return FALSE; // @codeCoverageIgnore
Expand Down Expand Up @@ -1047,8 +1055,12 @@
}

function getS2CID(string $url) : string {
static $ch = NULL;
if ($ch === NULL) {
$ch = curl_init_array(0.5, [CURLOPT_HTTPHEADER => HEADER_S2]);
}
$url = 'https://api.semanticscholar.org/v1/paper/URL:' . urlencode(urldecode($url));
$ch = curl_init_array(0.5, [CURLOPT_HTTPHEADER => HEADER_S2, CURLOPT_URL => $url]);
curl_setopt($ch, CURLOPT_URL, $url);
$response = (string) @curl_exec($ch);
if (!$response) {
report_warning("No response from semanticscholar."); // @codeCoverageIgnore
Expand All @@ -1071,8 +1083,12 @@
}

function ConvertS2CID_DOI(string $s2cid) : string {
static $ch = NULL;
if ($ch === NULL) {
$ch = curl_init_array(0.5, [CURLOPT_HTTPHEADER => HEADER_S2]);
}
$url = 'https://api.semanticscholar.org/v1/paper/CorpusID:' . urlencode($s2cid);
$ch = curl_init_array(0.5, [CURLOPT_HTTPHEADER => HEADER_S2, CURLOPT_URL => $url]);
curl_setopt($ch, CURLOPT_URL, $url);

Check failure on line 1091 in apiFunctions.php

View workflow job for this annotation

GitHub Actions / build

TaintedSSRF

apiFunctions.php:1091:33: TaintedSSRF: Detected tainted network request (see https://psalm.dev/253)
$response = (string) @curl_exec($ch);
if (!$response) {
report_warning("No response from semanticscholar."); // @codeCoverageIgnore
Expand Down Expand Up @@ -1101,8 +1117,12 @@
}

function get_semanticscholar_license(string $s2cid) : ?bool {
static $ch = NULL;
if ($ch === NULL) {
$ch = curl_init_array(0.5, [CURLOPT_HTTPHEADER => HEADER_S2]);
}
$url = 'https://api.semanticscholar.org/v1/paper/CorpusID:' . urlencode($s2cid);
$ch = curl_init_array(0.5, [CURLOPT_HTTPHEADER => HEADER_S2, CURLOPT_URL => $url]);
curl_setopt($ch, CURLOPT_URL, $url);
$response = (string) @curl_exec($ch);
if ($response === '') return NULL;
if (stripos($response, 'Paper not found') !== FALSE) return FALSE;
Expand All @@ -1116,9 +1136,11 @@
@param array<Template> $templates
**/
function expand_templates_from_archives(array &$templates) : void { // This is done very late as a latch ditch effort // Pointer to save memory
static $ch = NULL;
set_time_limit(120);
$ch = curl_init_array(1.0,
[CURLOPT_HEADER => TRUE]);
if ($ch === NULL) {
$ch = curl_init_array(0.5, [CURLOPT_HEADER => TRUE]);
}
foreach ($templates as $template) {
set_time_limit(120);
if ($template->has('script-title') && (strtolower($template->get('title')) === 'usurped title' || strtolower($template->get('title')) === 'archived copy' || strtolower($template->get('title')) === 'archive copy')) {
Expand Down Expand Up @@ -1233,7 +1255,7 @@
/** @param array<int|string|bool|array<string>> $curl_opts **/
function Bibcode_Response_Processing(array $curl_opts, string $adsabs_url) : object {
try {
$ch = curl_init_array(1.0, $curl_opts);
$ch = curl_init_array(1.0, $curl_opts); // Type varies greatly
$return = (string) @curl_exec($ch);
if ($return === "") {
// @codeCoverageIgnoreStart
Expand Down Expand Up @@ -1384,14 +1406,19 @@
}
// Must use post in order to get DOIs with <, >, [, and ] in them and other problems
function xml_post(string $url, string $post) : ?SimpleXMLElement {
$ch = curl_init_array(1.0,
[CURLOPT_URL => $url,
CURLOPT_POST => TRUE,
CURLOPT_POSTFIELDS => $post,
CURLOPT_HTTPHEADER => array(
"Content-Type: application/x-www-form-urlencoded",
static $ch = NULL;
if ($ch === NULL) {
$ch = curl_init_array(1.0,
[CURLOPT_POST => TRUE,
CURLOPT_HTTPHEADER => array(
"Content-Type: application/x-www-form-urlencoded",
"Accept: application/xml")
]);
}
curl_setopt_array($ch,
[CURLOPT_URL => $url,
CURLOPT_POSTFIELDS => $post,
]);
$output = (string) @curl_exec($ch);
$xml = @simplexml_load_string($output);
if ($xml === FALSE) $xml = NULL;
Expand Down Expand Up @@ -1510,10 +1537,13 @@

// Might want to look at using instead https://doi.crossref.org/openurl/[email protected]&id=doi:10.1080/00222938700771131&redirect=no&format=unixref
function CrossRefTitle(string $doi) : string {
static $ch = NULL;
if ($ch === NULL) {
$ch = curl_init_array(1.0,
[CURLOPT_USERAGENT => BOT_CROSSREF_USER_AGENT]);
}
$url = "https://api.crossref.org/v1/works/".str_replace(DOI_URL_DECODE, DOI_URL_ENCODE, $doi)."?mailto=".CROSSREFUSERNAME; // do not encode crossref email
$ch = curl_init_array(1.0,
[CURLOPT_URL => $url,
CURLOPT_USERAGENT => BOT_CROSSREF_USER_AGENT]);
curl_setopt($ch, CURLOPT_URL, $url);

Check failure on line 1546 in apiFunctions.php

View workflow job for this annotation

GitHub Actions / build

TaintedSSRF

apiFunctions.php:1546:36: TaintedSSRF: Detected tainted network request (see https://psalm.dev/253)
$json = (string) @curl_exec($ch);
$json = @json_decode($json);
if (isset($json->message->title[0]) && !isset($json->message->title[1])) {
Expand Down
7 changes: 5 additions & 2 deletions expandFns.php
Original file line number Diff line number Diff line change
Expand Up @@ -1205,14 +1205,17 @@
}

function check_doi_for_jstor(string $doi, Template $template) : void {
static $ch = NULL;
if ($ch === NULL) {
$ch = curl_init_array(1.0, []);
}
if ($template->has('jstor')) return;
$doi = trim($doi);
if ($doi === '') return;
if (strpos($doi, '10.2307') === 0) { // special case
$doi = substr($doi, 8);
}
$ch = curl_init_array(1.0,
[CURLOPT_URL => "https://www.jstor.org/citation/ris/" . $doi]);
curl_setopt($ch, CURLOPT_URL, "https://www.jstor.org/citation/ris/" . $doi);

Check failure on line 1218 in expandFns.php

View workflow job for this annotation

GitHub Actions / build

TaintedSSRF

expandFns.php:1218:33: TaintedSSRF: Detected tainted network request (see https://psalm.dev/253)
$ris = (string) @curl_exec($ch);
$httpCode = (int) @curl_getinfo($ch, CURLINFO_HTTP_CODE);
if ($httpCode === 200 &&
Expand Down
Loading