From 82272742e94008b893964b1be5d6c3588696578a Mon Sep 17 00:00:00 2001 From: tuutti Date: Fri, 25 Oct 2024 12:00:15 +0300 Subject: [PATCH 01/11] UHF-10891: Initial command to transliterate filenames embedded in text fields --- .../Commands/TransliterateFilesCommands.php | 155 +++++++++++++++++- 1 file changed, 149 insertions(+), 6 deletions(-) diff --git a/src/Drush/Commands/TransliterateFilesCommands.php b/src/Drush/Commands/TransliterateFilesCommands.php index 8c961e2..8e414a9 100644 --- a/src/Drush/Commands/TransliterateFilesCommands.php +++ b/src/Drush/Commands/TransliterateFilesCommands.php @@ -4,16 +4,20 @@ namespace Drupal\helfi_azure_fs\Drush\Commands; +use Drupal\Component\Utility\Html; +use Drupal\Core\Entity\ContentEntityInterface; +use Drupal\Core\Entity\EntityFieldManagerInterface; use Drupal\Core\Entity\EntityTypeManagerInterface; use Drupal\Core\File\Event\FileUploadSanitizeNameEvent; use Drupal\Core\File\Exception\FileException; use Drupal\Core\File\FileSystemInterface; use Drupal\Core\StreamWrapper\StreamWrapperManagerInterface; use Drupal\file\Entity\File; -use Drupal\file\FileInterface; use Drush\Attributes\Command; use Drush\Commands\AutowireTrait; use Drush\Commands\DrushCommands; +use GuzzleHttp\ClientInterface; +use GuzzleHttp\Exception\ClientException; use Symfony\Component\Console\Style\SymfonyStyle; use Symfony\Contracts\EventDispatcher\EventDispatcherInterface; @@ -38,6 +42,8 @@ public function __construct( private readonly EventDispatcherInterface $eventDispatcher, private readonly EntityTypeManagerInterface $entityTypeManager, private readonly FileSystemInterface $fileSystem, + private readonly EntityFieldManagerInterface $entityFieldManager, + private readonly ClientInterface $httpClient, ) { $this->io = new SymfonyStyle($this->input(), $this->output()); parent::__construct(); @@ -46,26 +52,163 @@ public function __construct( /** * Gets the sanitized filename. 
* - * @param \Drupal\file\FileInterface $file + * @param string $filename * The file to sanitize. * * @return string * The sanitized filename. */ - private function getSanitizedFilename(FileInterface $file): string { - $event = new FileUploadSanitizeNameEvent($file->getFilename(), ''); + private function getSanitizedFilename(string $filename): string { + $event = new FileUploadSanitizeNameEvent($filename, ''); $this->eventDispatcher->dispatch($event); return $event->getFilename(); } + /** + * Processes all fields for given entity type. + * + * @param string $entityType + * The entity type to process. + * @param array $fields + * The fields to process. + */ + private function processEntityType(string $entityType, array $fields) : void { + foreach ($fields as $name => $field) { + $query = $this->entityTypeManager + ->getStorage($entityType) + ->getQuery(); + // Only load entities that has link to a local or MS blob + // storage file. + $conditionGroup = $query->orConditionGroup(); + $conditionGroup + ->condition($name, '%blob.core.windows.net%', 'LIKE'); + $conditionGroup + ->condition($name, '/sites/default/files%', 'LIKE'); + $query->exists($name) + ->condition($conditionGroup) + ->accessCheck(FALSE); + $ids = $query->execute(); + + foreach ($ids as $id) { + $entity = $this->entityTypeManager->getStorage($entityType) + ->load($id); + + foreach ($entity->getTranslationLanguages() as $language) { + $this->processFieldLinks($entity->getTranslation($language->getId()), $name); + } + } + } + } + + /** + * Checks if the given remote file exists. + * + * @param string $url + * The url to check. + * + * @return bool + * TRUE if remote file exists, FALSE if not. + */ + private function remoteFileExists(string $url) : bool { + try { + $this->httpClient->head($url); + + return TRUE; + } + catch (ClientException) { + } + return FALSE; + } + + /** + * Sanitize filenames inside text fields. 
+ * + * @param \Drupal\Core\Entity\ContentEntityInterface $entity + * The entity translation to process. + * @param string $fieldName + * The field name. + */ + private function processFieldLinks(ContentEntityInterface $entity, string $fieldName) : void { + if (!$value = $entity->get($fieldName)->value) { + return; + } + + $hasChanges = FALSE; + $dom = Html::load($value); + /** @var \DOMElement $node */ + foreach ($dom->getElementsByTagName('a') as $node) { + // Nothing to do if link has no href. + if (!$href = $node->getAttribute('href')) { + continue; + } + // Do nothing if file exists already. + if ($this->remoteFileExists($href)) { + continue; + } + $basename = basename($href); + + // Test sanitized filename and urldecoded+sanitized filename. + $candidates = [ + $this->getSanitizedFilename($basename), + $this->getSanitizedFilename(urldecode($basename)), + ]; + + $newUrl = NULL; + foreach ($candidates as $candidate) { + $sanitizedUrl = str_replace($basename, $candidate, $href); + + if (!$this->remoteFileExists($sanitizedUrl)) { + continue; + } + $newUrl = $sanitizedUrl; + } + + if (!$newUrl) { + $this->io()->warning(sprintf('Failed to process the link "%s" for "%s"', $href, $entity->toUrl()->toString())); + + continue; + } + $hasChanges = TRUE; + $value = str_replace($href, $newUrl, $value); + } + + if ($hasChanges) { + $entity->set($fieldName, $value); + $entity->save(); + } + } + + /** + * Transliterates all files embedded in text fields. + * + * @return int + * The exit code. + */ + #[Command(name: 'helfi:transliterate:fields')] + public function transliterateTextFields() : int { + $fieldTypes = [ + 'text_with_summary', + 'text', + 'text_long', + ]; + foreach ($fieldTypes as $fieldType) { + $fieldMap = $this->entityFieldManager->getFieldMapByFieldType($fieldType); + + foreach ($fieldMap as $entityType => $fields) { + $this->processEntityType($entityType, $fields); + } + } + return DrushCommands::EXIT_SUCCESS; + } + /** * Transliterates the existing filenames. 
* * @return int * The exit code. */ - #[Command(name: 'helfi:files:transliterate')] + #[Command(name: 'helfi:transliterate:files')] public function transliterate() : int { $ids = $this->entityTypeManager ->getStorage('file') @@ -78,7 +221,7 @@ public function transliterate() : int { continue; } - $sanitizedFilename = $this->getSanitizedFilename($file); + $sanitizedFilename = $this->getSanitizedFilename($file->getFilename()); if ($sanitizedFilename === $file->getFilename()) { continue; From 1af52b1dc569cb4feb6a34e0b8a69a95c5fb9880 Mon Sep 17 00:00:00 2001 From: tuutti Date: Fri, 25 Oct 2024 12:04:09 +0300 Subject: [PATCH 02/11] UHF-10891: Debug message --- src/Drush/Commands/TransliterateFilesCommands.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Drush/Commands/TransliterateFilesCommands.php b/src/Drush/Commands/TransliterateFilesCommands.php index 8e414a9..512d0c1 100644 --- a/src/Drush/Commands/TransliterateFilesCommands.php +++ b/src/Drush/Commands/TransliterateFilesCommands.php @@ -146,6 +146,7 @@ private function processFieldLinks(ContentEntityInterface $entity, string $field if ($this->remoteFileExists($href)) { continue; } + $this->io()->note(sprintf('Found a broken link [%s]: "%s"', $entity->toUrl()->toString(), $href)); $basename = basename($href); // Test sanitized filename and urldecoded+sanitized filename. 
@@ -165,7 +166,7 @@ private function processFieldLinks(ContentEntityInterface $entity, string $field } if (!$newUrl) { - $this->io()->warning(sprintf('Failed to process the link "%s" for "%s"', $href, $entity->toUrl()->toString())); + $this->io()->warning(sprintf('Failed to process [%s]: "%s"', $entity->toUrl()->toString(), $href)); continue; } From ae402323053d711c0b894a26953df69fe0cc72ff Mon Sep 17 00:00:00 2001 From: tuutti Date: Fri, 25 Oct 2024 12:25:34 +0300 Subject: [PATCH 03/11] UHF-10891: phpstan fixes --- src/Drush/Commands/TransliterateFilesCommands.php | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/Drush/Commands/TransliterateFilesCommands.php b/src/Drush/Commands/TransliterateFilesCommands.php index 512d0c1..b4ca7dc 100644 --- a/src/Drush/Commands/TransliterateFilesCommands.php +++ b/src/Drush/Commands/TransliterateFilesCommands.php @@ -8,6 +8,7 @@ use Drupal\Core\Entity\ContentEntityInterface; use Drupal\Core\Entity\EntityFieldManagerInterface; use Drupal\Core\Entity\EntityTypeManagerInterface; +use Drupal\Core\Entity\TranslatableInterface; use Drupal\Core\File\Event\FileUploadSanitizeNameEvent; use Drupal\Core\File\Exception\FileException; use Drupal\Core\File\FileSystemInterface; @@ -86,14 +87,15 @@ private function processEntityType(string $entityType, array $fields) : void { $conditionGroup ->condition($name, '/sites/default/files%', 'LIKE'); $query->exists($name) - ->condition($conditionGroup) - ->accessCheck(FALSE); + ->condition($conditionGroup); + $query->accessCheck(FALSE); $ids = $query->execute(); foreach ($ids as $id) { $entity = $this->entityTypeManager->getStorage($entityType) ->load($id); + assert($entity instanceof TranslatableInterface); foreach ($entity->getTranslationLanguages() as $language) { $this->processFieldLinks($entity->getTranslation($language->getId()), $name); } @@ -112,7 +114,7 @@ private function processEntityType(string $entityType, array $fields) : void { */ private function 
remoteFileExists(string $url) : bool { try { - $this->httpClient->head($url); + $this->httpClient->request('HEAD', $url); return TRUE; } From ddd281e99420ebd8981acc4e6432e665a9e93309 Mon Sep 17 00:00:00 2001 From: tuutti Date: Mon, 28 Oct 2024 10:00:32 +0200 Subject: [PATCH 04/11] UHF-10891: Fix tests --- tests/src/Kernel/TransliterateFilesCommandsTest.php | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/src/Kernel/TransliterateFilesCommandsTest.php b/tests/src/Kernel/TransliterateFilesCommandsTest.php index 2a6d436..560a20a 100644 --- a/tests/src/Kernel/TransliterateFilesCommandsTest.php +++ b/tests/src/Kernel/TransliterateFilesCommandsTest.php @@ -8,6 +8,7 @@ use Drupal\Tests\field\Kernel\FieldKernelTestBase; use Drupal\helfi_azure_fs\AzureFileSystem; use Drupal\helfi_azure_fs\Drush\Commands\TransliterateFilesCommands; +use Drupal\Tests\helfi_api_base\Traits\ApiTestTrait; /** * Tests transliterate file Drush command. @@ -16,6 +17,8 @@ */ class TransliterateFilesCommandsTest extends FieldKernelTestBase { + use ApiTestTrait; + /** * {@inheritdoc} */ @@ -72,12 +75,7 @@ public function testTransliterateFilesCommand() : void { 'replacement_character' => '_', ])->save(); - $command = new TransliterateFilesCommands( - $this->container->get('stream_wrapper_manager'), - $this->container->get('event_dispatcher'), - $this->container->get('entity_type.manager'), - $fileSystem, - ); + $command = TransliterateFilesCommands::create($this->container); $command->transliterate(); foreach ($files as $expected) { From 06d30b201ef640b84122215ff1536c074616074b Mon Sep 17 00:00:00 2001 From: tuutti Date: Mon, 28 Oct 2024 10:03:27 +0200 Subject: [PATCH 05/11] UHF-10891: phpcs fixes --- tests/src/Kernel/TransliterateFilesCommandsTest.php | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/src/Kernel/TransliterateFilesCommandsTest.php b/tests/src/Kernel/TransliterateFilesCommandsTest.php index 560a20a..fb0f36a 100644 --- 
a/tests/src/Kernel/TransliterateFilesCommandsTest.php +++ b/tests/src/Kernel/TransliterateFilesCommandsTest.php @@ -8,7 +8,6 @@ use Drupal\Tests\field\Kernel\FieldKernelTestBase; use Drupal\helfi_azure_fs\AzureFileSystem; use Drupal\helfi_azure_fs\Drush\Commands\TransliterateFilesCommands; -use Drupal\Tests\helfi_api_base\Traits\ApiTestTrait; /** * Tests transliterate file Drush command. @@ -17,8 +16,6 @@ */ class TransliterateFilesCommandsTest extends FieldKernelTestBase { - use ApiTestTrait; - /** * {@inheritdoc} */ From 0459dcf349e4dea53c2eae38b9cceeac87878174 Mon Sep 17 00:00:00 2001 From: tuutti Date: Mon, 28 Oct 2024 10:14:58 +0200 Subject: [PATCH 06/11] UHF-10891: More fixes --- src/Drush/Commands/TransliterateFilesCommands.php | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Drush/Commands/TransliterateFilesCommands.php b/src/Drush/Commands/TransliterateFilesCommands.php index b4ca7dc..ae3907b 100644 --- a/src/Drush/Commands/TransliterateFilesCommands.php +++ b/src/Drush/Commands/TransliterateFilesCommands.php @@ -46,7 +46,6 @@ public function __construct( private readonly EntityFieldManagerInterface $entityFieldManager, private readonly ClientInterface $httpClient, ) { - $this->io = new SymfonyStyle($this->input(), $this->output()); parent::__construct(); } From 7f39e314c2165d2eee3e23b9b81ce95a420bded4 Mon Sep 17 00:00:00 2001 From: tuutti Date: Mon, 28 Oct 2024 10:22:46 +0200 Subject: [PATCH 07/11] UHF-10891: phpcs fixes --- src/Drush/Commands/TransliterateFilesCommands.php | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Drush/Commands/TransliterateFilesCommands.php b/src/Drush/Commands/TransliterateFilesCommands.php index ae3907b..2ac0a9f 100644 --- a/src/Drush/Commands/TransliterateFilesCommands.php +++ b/src/Drush/Commands/TransliterateFilesCommands.php @@ -19,7 +19,6 @@ use Drush\Commands\DrushCommands; use GuzzleHttp\ClientInterface; use GuzzleHttp\Exception\ClientException; -use Symfony\Component\Console\Style\SymfonyStyle; use 
Symfony\Contracts\EventDispatcher\EventDispatcherInterface; /** From 065eab6ef5a844617b10ba42872d0191c938e72c Mon Sep 17 00:00:00 2001 From: tuutti Date: Mon, 28 Oct 2024 13:15:08 +0200 Subject: [PATCH 08/11] UHF-10891: Skip non-404 responses, skip URL that seems to timeout without VPN --- .../Commands/TransliterateFilesCommands.php | 34 ++++++++++++++++--- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/src/Drush/Commands/TransliterateFilesCommands.php b/src/Drush/Commands/TransliterateFilesCommands.php index 2ac0a9f..c327047 100644 --- a/src/Drush/Commands/TransliterateFilesCommands.php +++ b/src/Drush/Commands/TransliterateFilesCommands.php @@ -19,6 +19,7 @@ use Drush\Commands\DrushCommands; use GuzzleHttp\ClientInterface; use GuzzleHttp\Exception\ClientException; +use GuzzleHttp\Exception\GuzzleException; use Symfony\Contracts\EventDispatcher\EventDispatcherInterface; /** @@ -111,12 +112,35 @@ private function processEntityType(string $entityType, array $fields) : void { * TRUE if remote file exists, FALSE if not. */ private function remoteFileExists(string $url) : bool { + // Skip wps since it seems to require a VPN. + if (str_contains('https://www.hel.fi/wps/', $url)) { + return TRUE; + } + try { - $this->httpClient->request('HEAD', $url); + $this->httpClient->request('HEAD', $url, ['timeout' => 15]); return TRUE; } - catch (ClientException) { + catch (ClientException $e) { + $response = $e->getResponse(); + + // Skip non-404 responses. + if ($response->getStatusCode() !== 404) { + return TRUE; + } + $skip = [ + 'text/html', + 'text/plain', + ]; + foreach ($skip as $type) { + // Skip html content. 
+ if (str_contains($response->getHeaderLine('Content-Type'), $type)) { + return TRUE; + } + } + } + catch (GuzzleException) { } return FALSE; } @@ -142,11 +166,13 @@ private function processFieldLinks(ContentEntityInterface $entity, string $field if (!$href = $node->getAttribute('href')) { continue; } + $href = trim($href); + // Do nothing if file exists already. if ($this->remoteFileExists($href)) { continue; } - $this->io()->note(sprintf('Found a broken link [%s]: "%s"', $entity->toUrl()->toString(), $href)); + $this->io()->note(sprintf('Found a broken link "%s"', $href)); $basename = basename($href); // Test sanitized filename and urldecoded+sanitized filename. @@ -166,7 +192,7 @@ private function processFieldLinks(ContentEntityInterface $entity, string $field } if (!$newUrl) { - $this->io()->warning(sprintf('Failed to process [%s]: "%s"', $entity->toUrl()->toString(), $href)); + $this->io()->warning(sprintf('Failed to process: "%s"', $href)); continue; } From 6da0b037025a1dc9ae56eb23cf74f69a5f39c2a7 Mon Sep 17 00:00:00 2001 From: tuutti Date: Mon, 28 Oct 2024 15:34:13 +0200 Subject: [PATCH 09/11] UHF-10891: Better url validation --- .../Commands/TransliterateFilesCommands.php | 45 +++++++++---------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/src/Drush/Commands/TransliterateFilesCommands.php b/src/Drush/Commands/TransliterateFilesCommands.php index c327047..a87fec5 100644 --- a/src/Drush/Commands/TransliterateFilesCommands.php +++ b/src/Drush/Commands/TransliterateFilesCommands.php @@ -102,6 +102,24 @@ private function processEntityType(string $entityType, array $fields) : void { } } + /** + * Checks if the given link is valid. + * + * @param string $url + * The URL. + * + * @return bool + * TRUE if link is valid, FALSE if not. 
+ */ + private function isValidLink(string $url) : bool { + $validLinks = [ + 'blob.core.windows.net', + '/sites/default/files/', + ]; + + return (bool) array_filter($validLinks, fn ($link) => str_contains($url, $link)); + } + /** * Checks if the given remote file exists. * @@ -112,34 +130,11 @@ private function processEntityType(string $entityType, array $fields) : void { * TRUE if remote file exists, FALSE if not. */ private function remoteFileExists(string $url) : bool { - // Skip wps since it seems to require a VPN. - if (str_contains('https://www.hel.fi/wps/', $url)) { - return TRUE; - } - try { $this->httpClient->request('HEAD', $url, ['timeout' => 15]); return TRUE; } - catch (ClientException $e) { - $response = $e->getResponse(); - - // Skip non-404 responses. - if ($response->getStatusCode() !== 404) { - return TRUE; - } - $skip = [ - 'text/html', - 'text/plain', - ]; - foreach ($skip as $type) { - // Skip html content. - if (str_contains($response->getHeaderLine('Content-Type'), $type)) { - return TRUE; - } - } - } catch (GuzzleException) { } return FALSE; @@ -168,8 +163,8 @@ private function processFieldLinks(ContentEntityInterface $entity, string $field } $href = trim($href); - // Do nothing if file exists already. - if ($this->remoteFileExists($href)) { + // Skip invalid links or links that do not result in a 404 error. 
+ if (!$this->isValidLink($href) || $this->remoteFileExists($href)) { continue; } $this->io()->note(sprintf('Found a broken link "%s"', $href)); From d6dc857e570711fdc70b5e91930336d78e85a447 Mon Sep 17 00:00:00 2001 From: tuutti Date: Mon, 28 Oct 2024 15:39:15 +0200 Subject: [PATCH 10/11] UHF-10891: Output entity type and id --- src/Drush/Commands/TransliterateFilesCommands.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Drush/Commands/TransliterateFilesCommands.php b/src/Drush/Commands/TransliterateFilesCommands.php index a87fec5..d314955 100644 --- a/src/Drush/Commands/TransliterateFilesCommands.php +++ b/src/Drush/Commands/TransliterateFilesCommands.php @@ -187,7 +187,7 @@ private function processFieldLinks(ContentEntityInterface $entity, string $field } if (!$newUrl) { - $this->io()->warning(sprintf('Failed to process: "%s"', $href)); + $this->io()->warning(sprintf('Failed to process [entity id: %s, entity type: %s]: "%s"', $entity->id(), $entity->getEntityTypeId(), $href)); continue; } From bd2991198c180ddf597ac647d9e4270bfa9ec69f Mon Sep 17 00:00:00 2001 From: tuutti Date: Mon, 28 Oct 2024 15:40:48 +0200 Subject: [PATCH 11/11] UHF-10891: phpcs fixes --- src/Drush/Commands/TransliterateFilesCommands.php | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Drush/Commands/TransliterateFilesCommands.php b/src/Drush/Commands/TransliterateFilesCommands.php index d314955..3bef2a0 100644 --- a/src/Drush/Commands/TransliterateFilesCommands.php +++ b/src/Drush/Commands/TransliterateFilesCommands.php @@ -18,7 +18,6 @@ use Drush\Commands\AutowireTrait; use Drush\Commands\DrushCommands; use GuzzleHttp\ClientInterface; -use GuzzleHttp\Exception\ClientException; use GuzzleHttp\Exception\GuzzleException; use Symfony\Contracts\EventDispatcher\EventDispatcherInterface;