diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2d18f4ee143..f5a7a94d6ad 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -36,7 +36,7 @@ repos: - id: check-xml - id: check-yaml - id: end-of-file-fixer - exclude: frontend/test/tapes/.+\.json5 + exclude: (frontend/test/tapes/.+\.json5|frontend/.*snapshots.*) - id: check-symlinks - id: mixed-line-ending - id: fix-encoding-pragma diff --git a/frontend/nuxt.config.ts b/frontend/nuxt.config.ts index 0e8597ecded..d8bd3b6b3d9 100644 --- a/frontend/nuxt.config.ts +++ b/frontend/nuxt.config.ts @@ -74,14 +74,18 @@ export default defineNuxtConfig({ }, /** * Robots.txt rules are configured here via the \@nuxtjs/robots package. - * @see {@link https://nuxtseo.com/robots/guides/nuxt-config|Robots Config Rules} + * @see {@link https://nuxtseo.com/robots/guides/nuxt-config} */ robots: { - disallow: ["/search", "/search/audio", "/search/image"], + disallow: [ + // robots rules are prefix-based, so there's no need to configure specific media type searches + "/search", + // Other routes have more complex requirements; we configure those with `useRobotsRule` as needed + ], groups: [ ...disallowedBots.map((bot) => ({ userAgent: [bot], - disallow: ["/"], // block bots from all routes + disallow: ["/"], // block disallowed bots from all routes })), ], }, diff --git a/frontend/src/composables/use-page-robots-rule.ts b/frontend/src/composables/use-page-robots-rule.ts new file mode 100644 index 00000000000..d999fe3e15d --- /dev/null +++ b/frontend/src/composables/use-page-robots-rule.ts @@ -0,0 +1,30 @@ +import { useRobotsRule, useSiteConfig } from "#imports" + +/** + * Robots meta tag and header instructions for pages + * These are distinct from robots.txt rules because we do not + * want to prevent bots from viewing the pages altogether + * in case they are visiting for e.g., embed information. 
+ * We _do_ want to disallow following links that will cause + * rapid and unwanted crawling behaviour (e.g., related + * results on a single result page, collection results, etc) + * + * Pages not listed here are either covered by the robots.txt + * rules configured in nuxt.config.ts or are allowed to be + * crawled with default settings (index and follow links) + */ +const pageRobots = { + "single-result": "noindex, nofollow", + "tag-collection": "noindex, nofollow", + "source-collection": "index, nofollow", + "creator-collection": "noindex, nofollow", +} as const + +export const usePageRobotsRule = (page: keyof typeof pageRobots) => { + const siteConfig = useSiteConfig() + if (!siteConfig.indexable) { + useRobotsRule("noindex, nofollow") + } else { + useRobotsRule(pageRobots[page]) + } +} diff --git a/frontend/src/constants/deploy-env.ts b/frontend/src/constants/deploy-env.ts index f71d7482f23..303372ae590 100644 --- a/frontend/src/constants/deploy-env.ts +++ b/frontend/src/constants/deploy-env.ts @@ -1,10 +1,9 @@ export const LOCAL = "local" -export const DEVELOPMENT = "development" export const STAGING = "staging" export const PRODUCTION = "production" // The order of the environments is important. They should be arranged in // increasing order of code-readiness, from local to production. 
-export const DEPLOY_ENVS = [LOCAL, DEVELOPMENT, STAGING, PRODUCTION] as const +export const DEPLOY_ENVS = [LOCAL, STAGING, PRODUCTION] as const export type DeployEnv = (typeof DEPLOY_ENVS)[number] diff --git a/frontend/src/middleware/single-result.ts b/frontend/src/middleware/single-result.ts index 7df3f081523..3f4d4964df5 100644 --- a/frontend/src/middleware/single-result.ts +++ b/frontend/src/middleware/single-result.ts @@ -34,7 +34,9 @@ export default defineNuxtRouteMiddleware(async (to, from) => { if (!mediaId) { return } + singleResultStore.setMediaById(mediaType, mediaId) + if (import.meta.server) { await Promise.allSettled([ singleResultStore.fetch(mediaType, mediaId), @@ -42,6 +44,7 @@ export default defineNuxtRouteMiddleware(async (to, from) => { ]) const fetchingError = singleResultStore.fetchState.fetchingError + if ( !singleResultStore.mediaItem && fetchingError && diff --git a/frontend/src/pages/audio/[id]/index.vue b/frontend/src/pages/audio/[id]/index.vue index 9c60341f1b5..a0f5393d1ad 100644 --- a/frontend/src/pages/audio/[id]/index.vue +++ b/frontend/src/pages/audio/[id]/index.vue @@ -24,6 +24,8 @@ import { useSensitiveMedia } from "~/composables/use-sensitive-media" import { useSingleResultStore } from "~/stores/media/single-result" import singleResultMiddleware from "~/middleware/single-result" +import { usePageRobotsRule } from "~/composables/use-page-robots-rule" + import VAudioTrack from "~/components/VAudioTrack/VAudioTrack.vue" import VMediaReuse from "~/components/VMediaInfo/VMediaReuse.vue" import VRelatedMedia from "~/components/VMediaInfo/VRelatedMedia.vue" @@ -42,6 +44,8 @@ definePageMeta({ middleware: singleResultMiddleware, }) +usePageRobotsRule("single-result") + const singleResultStore = useSingleResultStore() const route = useRoute() diff --git a/frontend/src/pages/audio/collection.vue b/frontend/src/pages/audio/collection.vue index 92016df667b..d218c472c87 100644 --- a/frontend/src/pages/audio/collection.vue +++ 
b/frontend/src/pages/audio/collection.vue @@ -6,8 +6,11 @@ import { collectionMiddleware } from "~/middleware/collection" import { skipToContentTargetId } from "~/constants/window" import { useCollection } from "~/composables/use-collection" +import { usePageRobotsRule } from "~/composables/use-page-robots-rule" import { AUDIO } from "~/constants/media" +import { CollectionParams } from "~/types/search" + import VCollectionResults from "~/components/VSearchResultsGrid/VCollectionResults.vue" defineOptions({ @@ -30,10 +33,17 @@ const { pageTitle, } = useCollection({ mediaType: AUDIO }) +// Collection params are not nullable in the collections route, this is enforced by the middleware +// Question: should this non-nullability be filtered in the type and enforced in runtime by `useCollection`? +usePageRobotsRule( + `${(collectionParams.value as NonNullable<CollectionParams>).collection}-collection` +) + useHead({ meta: [{ hid: "og:title", property: "og:title", content: pageTitle.value }], title: pageTitle.value, }) + /** * Media is not empty when we navigate back to this page, so we don't need to fetch * it again to make sure that all the previously fetched media is displayed. 
diff --git a/frontend/src/pages/image/[id]/index.vue b/frontend/src/pages/image/[id]/index.vue index f147e4f717b..643f801ae70 100644 --- a/frontend/src/pages/image/[id]/index.vue +++ b/frontend/src/pages/image/[id]/index.vue @@ -25,6 +25,8 @@ import { useSingleResultPageMeta } from "~/composables/use-single-result-page-me import { useSingleResultStore } from "~/stores/media/single-result" import singleResultMiddleware from "~/middleware/single-result" +import { usePageRobotsRule } from "~/composables/use-page-robots-rule" + import VBone from "~/components/VSkeleton/VBone.vue" import VMediaReuse from "~/components/VMediaInfo/VMediaReuse.vue" import VRelatedMedia from "~/components/VMediaInfo/VRelatedMedia.vue" @@ -47,6 +49,8 @@ definePageMeta({ middleware: singleResultMiddleware, }) +usePageRobotsRule("single-result") + const singleResultStore = useSingleResultStore() const nuxtApp = useNuxtApp() diff --git a/frontend/src/pages/image/collection.vue b/frontend/src/pages/image/collection.vue index 095bc3cf49a..485c320642f 100644 --- a/frontend/src/pages/image/collection.vue +++ b/frontend/src/pages/image/collection.vue @@ -7,6 +7,9 @@ import { skipToContentTargetId } from "~/constants/window" import { useCollection } from "~/composables/use-collection" import { IMAGE } from "~/constants/media" +import { usePageRobotsRule } from "~/composables/use-page-robots-rule" +import { CollectionParams } from "~/types/search" + import VCollectionResults from "~/components/VSearchResultsGrid/VCollectionResults.vue" defineOptions({ @@ -34,6 +37,12 @@ useHead(() => ({ title: pageTitle.value, })) +// Collection params are not nullable in the collections route, this is enforced by the middleware +// Question: should this non-nullability be filtered in the type and enforced in runtime by `useCollection`? 
+usePageRobotsRule( + `${(collectionParams.value as NonNullable<CollectionParams>).collection}-collection` +) + /** * Media is not empty when we navigate back to this page, so we don't need * to fetch it again to make sure that all the previously fetched media is displayed. diff --git a/frontend/test/playwright/e2e/seo.spec.ts b/frontend/test/playwright/e2e/seo.spec.ts index 36ac12048c4..9826f7d9abe 100644 --- a/frontend/test/playwright/e2e/seo.spec.ts +++ b/frontend/test/playwright/e2e/seo.spec.ts @@ -8,6 +8,7 @@ const DESCRIPTION = const NO_INDEX = "noindex, nofollow" const INDEX = "index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1" +const INDEX_NO_FOLLOW = "index, nofollow" const DEFAULT_IMAGE = "/openverse-default.jpg" const pages = { @@ -46,7 +47,7 @@ const pages = { "/v1/images/da5cb478-c093-4d62-b721-cda18797e3fb/thumb/" ), ogTitle: "bird", - robots: INDEX, + robots: NO_INDEX, }, audioDetail: { url: "/audio/7e063ee6-343f-48e4-a4a5-f436393730f6", @@ -55,7 +56,7 @@ const pages = { "/v1/audio/7e063ee6-343f-48e4-a4a5-f436393730f6/thumb/" ), ogTitle: "I Love My Dog You Love your Cat", - robots: INDEX, + robots: NO_INDEX, }, about: { url: "/about", @@ -69,21 +70,21 @@ const pages = { title: "cat images | Openverse", ogImage: DEFAULT_IMAGE, ogTitle: "cat images | Openverse", - robots: INDEX, + robots: NO_INDEX, }, source: { url: "/image/collection?source=flickr", title: "Flickr images | Openverse", ogImage: DEFAULT_IMAGE, ogTitle: "Flickr images | Openverse", - robots: INDEX, + robots: INDEX_NO_FOLLOW, }, creator: { url: "/image/collection?source=flickr&creator=strogoscope", title: "strogoscope | Openverse", ogImage: DEFAULT_IMAGE, ogTitle: "strogoscope | Openverse", - robots: INDEX, + robots: NO_INDEX, }, } test.describe("page metadata", () => { @@ -112,3 +113,12 @@ test.describe("page metadata", () => { }) } }) + +test.describe("robots.txt", () => { + test("snapshot", async ({ page }) => { + await page.goto("/robots.txt") + const robotsText = await 
page.innerText("body") + + expect(robotsText).toMatchSnapshot({ name: "robots.txt" }) + }) +}) diff --git a/frontend/test/playwright/e2e/seo.spec.ts-snapshots/robots-linux.txt b/frontend/test/playwright/e2e/seo.spec.ts-snapshots/robots-linux.txt new file mode 100644 index 00000000000..512f75fb93a --- /dev/null +++ b/frontend/test/playwright/e2e/seo.spec.ts-snapshots/robots-linux.txt @@ -0,0 +1,81 @@ +# START nuxt-robots (indexable) +User-agent: * +Disallow: /search +Disallow: /ar/search +Disallow: /es/search +Disallow: /ru/search + +User-agent: GPTBot +Disallow: / +Disallow: /ar/ +Disallow: /es/ +Disallow: /ru/ + +User-agent: CCBot +Disallow: / +Disallow: /ar/ +Disallow: /es/ +Disallow: /ru/ + +User-agent: ChatGPT-User +Disallow: / +Disallow: /ar/ +Disallow: /es/ +Disallow: /ru/ + +User-agent: Google-Extended +Disallow: / +Disallow: /ar/ +Disallow: /es/ +Disallow: /ru/ + +User-agent: anthropic-ai +Disallow: / +Disallow: /ar/ +Disallow: /es/ +Disallow: /ru/ + +User-agent: Omgilibot +Disallow: / +Disallow: /ar/ +Disallow: /es/ +Disallow: /ru/ + +User-agent: Omgili +Disallow: / +Disallow: /ar/ +Disallow: /es/ +Disallow: /ru/ + +User-agent: FacebookBot +Disallow: / +Disallow: /ar/ +Disallow: /es/ +Disallow: /ru/ + +User-agent: Diffbot +Disallow: / +Disallow: /ar/ +Disallow: /es/ +Disallow: /ru/ + +User-agent: Bytespider +Disallow: / +Disallow: /ar/ +Disallow: /es/ +Disallow: /ru/ + +User-agent: ImagesiftBot +Disallow: / +Disallow: /ar/ +Disallow: /es/ +Disallow: /ru/ + +User-agent: cohere-ai +Disallow: / +Disallow: /ar/ +Disallow: /es/ +Disallow: /ru/ + +Sitemap: http://localhost:8443/sitemap_index.xml +# END nuxt-robots \ No newline at end of file diff --git a/frontend/test/playwright/playwright.config.ts b/frontend/test/playwright/playwright.config.ts index 6ccc3e9d108..2ad345677aa 100644 --- a/frontend/test/playwright/playwright.config.ts +++ b/frontend/test/playwright/playwright.config.ts @@ -27,14 +27,15 @@ const config: PlaywrightTestConfig = { port: 8443, 
reuseExistingServer: !process.env.CI || process.env.PWDEBUG === "1", env: { - UPDATE_TAPES, + UPDATE_TAPES: UPDATE_TAPES, NUXT_PUBLIC_API_URL: API_URL, + // Must be true for seo tests to receive appropriate values + NUXT_PUBLIC_SITE_INDEXABLE: "true", NUXT_PUBLIC_DEPLOYMENT_ENV: STAGING, NUXT_PUBLIC_PLAUSIBLE_DOMAIN: "localhost", NUXT_PUBLIC_PLAUSIBLE_API_HOST: "http://localhost:50290", NUXT_PUBLIC_PLAUSIBLE_AUTO_PAGEVIEWS: "false", NUXT_PUBLIC_PLAUSIBLE_IGNORED_HOSTNAMES: "[]", - NUXT_PUBLIC_SITE_INDEXABLE: "true", }, }, use: {