Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added (and set as default) a cURL-based method for fetching HTTP content #624

Merged
merged 1 commit into from
Aug 9, 2016
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
160 changes: 150 additions & 10 deletions application/HttpUtils.php
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
<?php
/**
* GET an HTTP URL to retrieve its content
* Uses the cURL library or a fallback method
*
* @param string $url URL to get (http://...)
* @param int $timeout network timeout (in seconds)
Expand All @@ -20,38 +21,177 @@
* echo 'There was an error: '.htmlspecialchars($headers[0]);
* }
*
* @see http://php.net/manual/en/function.file-get-contents.php
* @see http://php.net/manual/en/function.stream-context-create.php
* @see http://php.net/manual/en/function.get-headers.php
* @see https://secure.php.net/manual/en/ref.curl.php
* @see https://secure.php.net/manual/en/functions.anonymous.php
* @see https://secure.php.net/manual/en/function.preg-split.php
* @see https://secure.php.net/manual/en/function.explode.php
* @see http://stackoverflow.com/q/17641073
* @see http://stackoverflow.com/q/9183178
* @see http://stackoverflow.com/q/1462720
*/
function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
{
$urlObj = new Url($url);
$cleanUrl = $urlObj->idnToAscii();

if (! filter_var($cleanUrl, FILTER_VALIDATE_URL) || ! $urlObj->isHttp()) {
if (!filter_var($cleanUrl, FILTER_VALIDATE_URL) || !$urlObj->isHttp()) {
return array(array(0 => 'Invalid HTTP Url'), false);
}

$userAgent =
'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:45.0)'
. ' Gecko/20100101 Firefox/45.0';
$acceptLanguage =
substr(setlocale(LC_COLLATE, 0), 0, 2) . ',en-US;q=0.7,en;q=0.3';
$maxRedirs = 3;

if (!function_exists('curl_init')) {
return get_http_response_fallback(
$cleanUrl,
$timeout,
$maxBytes,
$userAgent,
$acceptLanguage,
$maxRedirs
);
}

$ch = curl_init($cleanUrl);
if ($ch === false) {
return array(array(0 => 'curl_init() error'), false);
}

// General cURL settings
curl_setopt($ch, CURLOPT_AUTOREFERER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt(
$ch,
CURLOPT_HTTPHEADER,
array('Accept-Language: ' . $acceptLanguage)
);
curl_setopt($ch, CURLOPT_MAXREDIRS, $maxRedirs);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);

// Max download size management
curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024);
curl_setopt($ch, CURLOPT_NOPROGRESS, false);
curl_setopt($ch, CURLOPT_PROGRESSFUNCTION,
function($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes)
{
if (version_compare(phpversion(), '5.5', '<')) {
// PHP version lower than 5.5
// Callback has 4 arguments
$downloaded = $arg1;
} else {
// Callback has 5 arguments
$downloaded = $arg2;
}
// Non-zero return stops downloading
return ($downloaded > $maxBytes) ? 1 : 0;
}
);

$response = curl_exec($ch);
$errorNo = curl_errno($ch);
$errorStr = curl_error($ch);
$headSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
curl_close($ch);

if ($response === false) {
if ($errorNo == CURLE_COULDNT_RESOLVE_HOST) {
/*
* Workaround to match fallback method behaviour
* Removing this would require updating
* GetHttpUrlTest::testGetInvalidRemoteUrl()
*/
return array(false, false);
}
return array(array(0 => 'curl_exec() error: ' . $errorStr), false);
}

// Formatting output like the fallback method
$rawHeaders = substr($response, 0, $headSize);

// Keep only headers from latest redirection
$rawHeadersArrayRedirs = explode("\r\n\r\n", trim($rawHeaders));
$rawHeadersLastRedir = end($rawHeadersArrayRedirs);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are redirections always the last headers to be set?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the latest header set come from the most recent redirect, e.g.:

original link headers
<blank line>
1st redirect headers
<blank line>
2nd redirect headers
<blank line>


$content = substr($response, $headSize);
$headers = array();
foreach (preg_split('~[\r\n]+~', $rawHeadersLastRedir) as $line) {
if (empty($line) or ctype_space($line)) {
continue;
}
$splitLine = explode(': ', $line, 2);
if (count($splitLine) > 1) {
$key = $splitLine[0];
$value = $splitLine[1];
if (array_key_exists($key, $headers)) {
if (!is_array($headers[$key])) {
$headers[$key] = array(0 => $headers[$key]);
}
$headers[$key][] = $value;
} else {
$headers[$key] = $value;
}
} else {
$headers[] = $splitLine[0];
}
}

return array($headers, $content);
}

/**
* GET an HTTP URL to retrieve its content (fallback method)
*
* @param string $cleanUrl URL to get (http://... valid and in ASCII form)
* @param int $timeout network timeout (in seconds)
* @param int $maxBytes maximum downloaded bytes
* @param string $userAgent "User-Agent" header
* @param string $acceptLanguage "Accept-Language" header
* @param int $maxRedr maximum amount of redirections followed
*
* @return array HTTP response headers, downloaded content
*
* Output format:
* [0] = associative array containing HTTP response headers
* [1] = URL content (downloaded data)
*
* @see http://php.net/manual/en/function.file-get-contents.php
* @see http://php.net/manual/en/function.stream-context-create.php
* @see http://php.net/manual/en/function.get-headers.php
*/
function get_http_response_fallback(
$cleanUrl,
$timeout,
$maxBytes,
$userAgent,
$acceptLanguage,
$maxRedr
) {
$options = array(
'http' => array(
'method' => 'GET',
'timeout' => $timeout,
'user_agent' => 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:45.0)'
.' Gecko/20100101 Firefox/45.0',
'accept_language' => substr(setlocale(LC_COLLATE, 0), 0, 2) . ',en-US;q=0.7,en;q=0.3',
'user_agent' => $userAgent,
'header' => "Accept: */*\r\n"
. 'Accept-Language: ' . $acceptLanguage
)
);

stream_context_set_default($options);
list($headers, $finalUrl) = get_redirected_headers($cleanUrl);
list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr);
if (! $headers || strpos($headers[0], '200 OK') === false) {
$options['http']['request_fulluri'] = true;
stream_context_set_default($options);
list($headers, $finalUrl) = get_redirected_headers($cleanUrl);
list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr);
}

if (! $headers || strpos($headers[0], '200 OK') === false) {
if (! $headers) {
return array($headers, false);
}

Expand Down