From 31058ecdaafe888fa0d18fa2ff689578b5828dd7 Mon Sep 17 00:00:00 2001 From: Menno Holtkamp Date: Fri, 22 Jan 2021 17:04:02 +0100 Subject: [PATCH] Prevent blocking operation when loading sitemap files by using promises --- composer.json | 1 + src/Config.php | 2 + src/Processor.php | 46 +++++--- src/Result.php | 37 +++--- src/TargetManager.php | 24 ---- src/TargetManager/NullTargetManager.php | 73 ------------ src/TargetManager/StreamTargetManager.php | 125 +++++++++++---------- src/TargetManager/TargetManagerFactory.php | 20 ---- 8 files changed, 115 insertions(+), 213 deletions(-) delete mode 100644 src/TargetManager.php delete mode 100644 src/TargetManager/NullTargetManager.php delete mode 100644 src/TargetManager/TargetManagerFactory.php diff --git a/composer.json b/composer.json index aa1b5a4..c198512 100644 --- a/composer.json +++ b/composer.json @@ -12,6 +12,7 @@ "ext-simplexml": "*", "clue/reactphp-flux": "^1.3", "psr/log": "^1.1", + "react/filesystem": "^0.1", "react/http": "^1.2", "symfony/console": "^5.2", "teapot/status-code": "^1.1" diff --git a/src/Config.php b/src/Config.php index f0a088e..661a12c 100644 --- a/src/Config.php +++ b/src/Config.php @@ -89,6 +89,8 @@ class Config /** * An array of some additional response headers to count. + * + * @var array */ public array $additionalResponseHeadersToCount = []; diff --git a/src/Processor.php b/src/Processor.php index 89b9e8f..8b3e98c 100644 --- a/src/Processor.php +++ b/src/Processor.php @@ -6,19 +6,18 @@ use Clue\React\Flux\Transformer; use Exception; -use Octopus\TargetManager\TargetManagerFactory; +use Octopus\TargetManager\StreamTargetManager; use Psr\Http\Message\ResponseInterface; use Psr\Log\LoggerInterface; use Psr\Log\NullLogger; use React\EventLoop\Factory as EventLoopFactory; use React\EventLoop\LoopInterface; +use React\Filesystem\Filesystem; use React\Http\Browser; use React\Http\Message\ResponseException; use React\Promise\ExtendedPromiseInterface; use React\Promise\PromiseInterface; use React\Promise\Timer\TimeoutException; -use React\Stream\ReadableResourceStream; -use React\Stream\ReadableStreamInterface; use RuntimeException; use Teapot\StatusCode\Http; @@ -53,7 +52,7 @@ class Processor private Browser $browser; private LoggerInterface $logger; private LoopInterface $loop; - private TargetManager$targetManager; + private StreamTargetManager $targetManager; private Transformer $transformer; private bool $followRedirects = false; private Presenter $presenter; @@ -122,21 +121,34 @@ private function getLoop(): LoopInterface return $this->loop ??= EventLoopFactory::create(); } - private function getTargetManager(): TargetManager + private function getTargetManager(): StreamTargetManager { - return $this->targetManager ??= TargetManagerFactory::getInstance($this->getStream(), $this->logger); + return $this->targetManager ??= new StreamTargetManager($this->getStream(), $this->logger); } - private function getStream(): ?ReadableStreamInterface + private function getStream(): PromiseInterface { - $handle = @\fopen($this->config->targetFile, 'r'); - if (!$handle) { - $this->logger->error(\sprintf('Could not open target file "%s"', $this->config->targetFile)); + return \filter_var($this->config->targetFile, \FILTER_VALIDATE_URL) + ? $this->getStreamForUrl($this->config->targetFile) + : $this->getStreamForLocalFile($this->config->targetFile); + } - return null; - } + private function getStreamForUrl(string $url): PromiseInterface + { + $this->logger->info('acquire stream for URL '.$url); + $browser = new Browser($this->getLoop()); + + return $browser->requestStreaming('GET', $url); + } + + private function getStreamForLocalFile(string $filename): PromiseInterface + { + $this->logger->info('acquire stream for local file '.$filename); - return new ReadableResourceStream($handle, $this->getLoop()); + $filesystem = Filesystem::create($this->getLoop()); + $file = $filesystem->file($filename); + + return $file->getContents(); } private function isCompleted(): bool @@ -230,10 +242,10 @@ private function getOnFulfilledCallback(string $url): callable /* if ($this->saveEnabled) { - $path = $this->savePath.$this->makeFilename($url); - if (\file_put_contents($path, $response->getBody(), FILE_APPEND) === false) { - throw new Exception("Cannot write file: $path"); - } + $path = $this->savePath.$this->makeFilename($url); + if (\file_put_contents($path, $response->getBody(), FILE_APPEND) === false) { + throw new Exception("Cannot write file: $path"); + } } */ diff --git a/src/Result.php b/src/Result.php index 5ecfcc1..441643e 100644 --- a/src/Result.php +++ b/src/Result.php @@ -6,53 +6,43 @@ class Result { - /** - * @var Config - */ - public $config; + public Config $config; /** - * @var array + * @var array */ - private $additionalResponseHeadersToCount = []; + private array $additionalResponseHeadersToCount = []; /** * URLs that could not be loaded. * - * @var array + * @var array */ - private $brokenUrls = []; + private array $brokenUrls = []; /** - * @var array + * @var array */ - private $finishedUrls = []; + private array $finishedUrls = []; /** * URLs that were redirected to another location. * - * @var array + * @var array */ - private $redirectedUrls = []; + private array $redirectedUrls = []; /** * Timestamp to track execution time. - * - * @var float */ - private $started; + private float $started; - /** - * @var array - */ - private $statusCodes = []; + private array $statusCodes = []; /** * Total amount of processed data. - * - * @var int */ - private $totalData = 0; + private int $totalData = 0; public function __construct(Config $config) { @@ -60,6 +50,9 @@ public function __construct(Config $config) $this->started = \microtime(true); } + /** + * @param array $additionalResponseHeadersToCount + */ public function setAdditionalResponseHeadersToCount(array $additionalResponseHeadersToCount): void { $this->additionalResponseHeadersToCount = $additionalResponseHeadersToCount; diff --git a/src/TargetManager.php b/src/TargetManager.php deleted file mode 100644 index 4398f4c..0000000 --- a/src/TargetManager.php +++ /dev/null @@ -1,24 +0,0 @@ -input = $input; - $this->logger = $logger ?? new NullLogger(); - } - - public function close(): void - { - $this->logger->debug(\sprintf('received "%s" request', __FUNCTION__)); - } - - public function getNumberOfUrls(): int - { - return 0; - } - - public function isInitialized(): bool - { - return true; - } - - public function isReadable(): bool - { - return false; - } - - public function pause(): void - { - $this->logger->debug(\sprintf('received "%s" request', __FUNCTION__)); - } - - public function resume(): void - { - $this->logger->debug(\sprintf('received "%s" request', __FUNCTION__)); - } - - public function pipe(WritableStreamInterface $destination, array $options = []): WritableStreamInterface - { - $this->logger->debug(\sprintf('received "%s" request', __FUNCTION__)); - - return $destination; - } - - public function addUrl(string $url): void - { - $this->logger->debug(\sprintf('adding URL "%s"', $url)); - } -} diff --git a/src/TargetManager/StreamTargetManager.php b/src/TargetManager/StreamTargetManager.php index 3a51e1d..f2efc84 100644 --- a/src/TargetManager/StreamTargetManager.php +++ b/src/TargetManager/StreamTargetManager.php @@ -3,9 +3,11 @@ namespace Octopus\TargetManager; use Evenement\EventEmitter; -use Octopus\TargetManager; +use Psr\Http\Message\ResponseInterface; +use Psr\Http\Message\StreamInterface; use Psr\Log\LoggerInterface; use Psr\Log\NullLogger; +use React\Promise\PromiseInterface; use React\Stream\ReadableStreamInterface; use React\Stream\Util; use React\Stream\WritableStreamInterface; @@ -14,7 +16,7 @@ /** * The TargetManager reads URLs from a plain stream and emits them. */ -class StreamTargetManager extends EventEmitter implements ReadableStreamInterface, TargetManager +class StreamTargetManager extends EventEmitter implements ReadableStreamInterface { /** * @see https://www.sitemaps.org/protocol.html @@ -44,42 +46,30 @@ class StreamTargetManager extends EventEmitter implements ReadableStreamInterfac */ private const XML_SITEMAP_ROOT_ELEMENT = 'urlset'; - /** - * @var string - */ - private $buffer = ''; + private string $buffer = ''; - /** - * @var bool - */ - private $closed = false; + private bool $closed = false; - /** - * @var ReadableStreamInterface - */ - private $input; + private ReadableStreamInterface $input; - /** - * @var LoggerInterface - */ - private $logger; + private LoggerInterface $logger; - /** - * @var int - */ - private $numberOfUrls = 0; + private int $numberOfUrls = 0; /** * Flag to indicated whether this TargetManager has been initialized: were URLs loaded. - * - * @var bool */ - private $initialized = false; + private bool $initialized = false; - public function __construct(ReadableStreamInterface $input = null, LoggerInterface $logger = null) + public function __construct(PromiseInterface $promise, LoggerInterface $logger = null) { $this->logger = $logger ?? new NullLogger(); - if ($input) { + + $promise->then(function (ResponseInterface $response) { + $input = $response->getBody(); + \assert($input instanceof StreamInterface); + \assert($input instanceof ReadableStreamInterface); + if (!$input->isReadable()) { $this->logger->info('Input is not readable, closing'); @@ -90,7 +80,9 @@ public function __construct(ReadableStreamInterface $input = null, LoggerInterfa } $this->setInput($input); - } + }, function (\Exception $exception) { + $this->logger->critical((string) $exception); + }); } public function close(): void @@ -196,15 +188,15 @@ private function processBufferAsXml(): bool { $xmlElement = $this->getSimpleXMLElement($this->buffer); - if ($xmlElement instanceof SimpleXMLElement) { - $this->logger->notice('Instantiated SimpleXMLElement with "{elementCount}" children', ['elementCount' => $xmlElement->count()]); + if ($xmlElement === null) { + return false; + } - $this->processSimpleXMLElement($xmlElement); + $this->logger->notice('Instantiated SimpleXMLElement with "{elementCount}" children', ['elementCount' => $xmlElement->count()]); - return true; - } + $this->processSimpleXMLElement($xmlElement); - return false; + return true; } private function getSimpleXMLElement(string $data): ?SimpleXMLElement @@ -241,51 +233,70 @@ private function isXmlSitemapIndex(SimpleXMLElement $xmlElement): bool private function processSitemapIndex(SimpleXMLElement $sitemapIndexElement): void { - if ($sitemapLocationElements = $this->getSitemapLocationElements($sitemapIndexElement)) { - foreach ($sitemapLocationElements as $sitemapLocationElement) { - $sitemapUrl = (string) $sitemapLocationElement; - $this->processSitemapUrl($sitemapUrl); - $this->logger->info('processed '.$sitemapUrl.', #URLs: '.$this->getNumberOfUrls()); - } + $sitemapLocationElements = $this->getSitemapLocationElements($sitemapIndexElement); + if ($sitemapLocationElements === null) { + return; } + + $this->processSitemapLocationElementsContainingSitemapUrls($sitemapLocationElements); } private function getSitemapLocationElements(SimpleXMLElement $xmlElement): ?array { $xmlElement->registerXPathNamespace('sitemap', self::XML_SITEMAP_NAMESPACE); - if ($sitemapLocationElements = $xmlElement->xpath('//sitemap:loc')) { - return $sitemapLocationElements; - } + $sitemapLocationElements = $xmlElement->xpath('//sitemap:loc'); - return null; + return \is_array($sitemapLocationElements) ? $sitemapLocationElements : null; + } + + private function processSitemapLocationElementsContainingSitemapUrls(array $sitemapLocationElements): void + { + foreach ($sitemapLocationElements as $sitemapLocationElement) { + $sitemapUrl = (string) $sitemapLocationElement; + $this->processSitemapUrl($sitemapUrl); + $this->logger->info('processed '.$sitemapUrl.', #URLs: '.$this->getNumberOfUrls()); + } } private function processSitemapUrl(string $sitemapUrl): void { - if ($data = $this->loadExternalData($sitemapUrl)) { - if ($sitemapElement = $this->getSimpleXMLElement($data)) { - $this->processSitemapElement($sitemapElement); - } + $data = $this->loadExternalData($sitemapUrl); + if ($data === null) { + return; } + $sitemapElement = $this->getSimpleXMLElement($data); + if ($sitemapElement === null) { + return; + } + + $this->processSitemapElement($sitemapElement); } private function loadExternalData(string $file): ?string { - if ($data = @\file_get_contents($file)) { - return $data; - } + $data = \file_get_contents($file); - return null; + return \is_string($data) ? $data : null; } private function processSitemapElement(SimpleXMLElement $sitemapElement): void { - if ($sitemapLocationElements = $this->getSitemapLocationElements($sitemapElement)) { - foreach ($sitemapLocationElements as $sitemapLocationElement) { - $sitemapUrl = (string) $sitemapLocationElement; + $sitemapLocationElements = $this->getSitemapLocationElements($sitemapElement); - $this->addUrl($sitemapUrl); - } + if ($sitemapLocationElements === null) { + return; + } + + $this->processSitemapLocationElementsContainingUrls($sitemapLocationElements); + } + + private function processSitemapLocationElementsContainingUrls(array $sitemapLocationElements): void + { + $this->logger->info(\sprintf('process %d SitemapLocation elements containing URLs', \count($sitemapLocationElements))); + foreach ($sitemapLocationElements as $sitemapLocationElement) { + $sitemapUrl = (string) $sitemapLocationElement; + + $this->addUrl($sitemapUrl); } } diff --git a/src/TargetManager/TargetManagerFactory.php b/src/TargetManager/TargetManagerFactory.php deleted file mode 100644 index 4e80c2c..0000000 --- a/src/TargetManager/TargetManagerFactory.php +++ /dev/null @@ -1,20 +0,0 @@ -