docs: reflect Glean-derived nature of popularities.json (#10609)

also remove duplicated command from build
mdn · Feb 28, 2024 · 02f2010 · 02f2010
1 parent 1751cc1
commit 02f2010
Show file tree

Hide file tree

Showing 7 changed files with 8 additions and 116 deletions.
diff --git a/.github/workflows/dev-build.yml b/.github/workflows/dev-build.yml
@@ -136,11 +136,6 @@ jobs:
           echo "CONTENT_TRANSLATED_ROOT=$CONTENT_TRANSLATED_ROOT"
           yarn build:prepare
 
-          # (July 15, 2021) This is a temporary solution. This should become an
-          # integrated part of 'build:prepare'.
-          # See https://github.com/mdn/yari/issues/4217
-          yarn tool popularities
-
           yarn tool sync-translated-content
 
           # Spread the work across 2 processes. Why 2? Because that's what you

diff --git a/.github/workflows/prod-build.yml b/.github/workflows/prod-build.yml
@@ -259,11 +259,6 @@ jobs:
           yarn build:sw
           yarn build:prepare
 
-          # (July 15, 2021) This is a temporary solution. This should become an
-          # integrated part of 'build:prepare'.
-          # See https://github.com/mdn/yari/issues/4217
-          yarn tool popularities
-
           yarn tool sync-translated-content
 
           # Build using one process per locale.

diff --git a/.github/workflows/stage-build.yml b/.github/workflows/stage-build.yml
@@ -252,11 +252,6 @@ jobs:
           yarn build:sw
           yarn build:prepare
 
-          # (July 15, 2021) This is a temporary solution. This should become an
-          # integrated part of 'build:prepare'.
-          # See https://github.com/mdn/yari/issues/4217
-          yarn tool popularities
-
           yarn tool sync-translated-content
 
           # Build using one process per locale.

diff --git a/.github/workflows/xyz-build.yml b/.github/workflows/xyz-build.yml
@@ -171,11 +171,6 @@ jobs:
           yarn build:sw
           yarn build:prepare
 
-          # (July 15, 2021) This is a temporary solution. This should become an
-          # integrated part of 'build:prepare'.
-          # See https://github.com/mdn/yari/issues/4217
-          yarn tool popularities
-
           yarn tool sync-translated-content
 
           # Build using one process per locale.

diff --git a/docs/popularities.md b/docs/popularities.md
@@ -5,8 +5,8 @@ logs. Being popular helps search because when a certain search term matches many
 documents, too many to display all, we need to sort them to try to predict which
 one the user most probably wanted to find.
 
-To accomplish this we check-in a file in the content repo called
-`popularities.json` which looks like this:
+To accomplish this, we create a file called `popularities.json` during the
+build, which looks like this:
 
 ```json
 {
@@ -30,82 +30,8 @@ a popularity. So don't expect every known URL in the content to appear in the
 
 ## Where's the data from
 
-Popularities are based on our CDN access logs. We use CloudFront for our CDN.
-Access logs are post processed using an
-[AWS Lambda function](https://github.com/aws-samples/amazon-cloudfront-access-logs-queries).
-
-Every month these logs are aggregated by another Lambda called
-`popularitiesCron` using AWS Athena:
-
-```python
-import time
-import boto3
-
-from datetime import datetime, timezone, timedelta
-
-last_month = datetime.now(timezone.utc) - timedelta(weeks=1)
-
-month = "{:0>2}".format(last_month.month)
-year = "{}".format(last_month.year)
-
-query = """
-SELECT u AS Page,
-         count(*) AS Pageviews
-FROM
-    (SELECT replace(uri,
-         '/index.json', '') AS u
-    FROM partitioned_parquet
-    WHERE year = '{}'
-            AND month = '{}'
-            AND status = 200
-            AND user_agent LIKE 'Mozilla%'
-            AND uri NOT LIKE '%/_sample_%'
-            AND (uri LIKE '/%/docs/%'
-            AND sc_content_type = 'text/html;%20charset=utf-8'
-            OR uri LIKE '/%/docs/%/index.json'))
-GROUP BY  u
-ORDER BY  Pageviews DESC
-""".format(year, month)
-
-DATABASE = 'yariprod_cf_access_logs_db'
-output='s3://mdn-popularities-prod/{}/{}/'.format(year, month)
-
-def lambda_handler(event, context):
-    client = boto3.client('athena')
-
-    # Execution
-    response = client.start_query_execution(
-        QueryString=query,
-        QueryExecutionContext={
-            'Database': DATABASE
-        },
-        ResultConfiguration={
-            'OutputLocation': output,
-        }
-    )
-    s3 = boto3.resource('s3')
-    uuid=response["QueryExecutionId"]
-
-    current = None
-    if uuid:
-        current = "https://mdn-popularities-prod.s3.amazonaws.com/{year}/{month}/{uuid}.csv".format(year=year, month=month, uuid=uuid)
-        s3.Object('mdn-popularities-prod', 'current.txt').put(Body=current, ContentType="text/plain; charset=utf-8")
-
-    bucket = s3.Bucket('mdn-popularities-prod')
-    history = ["https://mdn-popularities-prod.s3.amazonaws.com/{}".format(o.key) for o in bucket.objects.all() if o.key.endswith(".csv")]
-    if current is not None and (len(history) == 0 or history[-1] != current):
-        history.append(current)
-    body = "\n".join(history)
-    s3.Object('mdn-popularities-prod', 'history.txt').put(Body=body, ContentType="text/plain; charset=utf-8")
-
-    return response
-```
-
-This is trigger at via a CloudWatch cron job (`popularities-cron-trigger`) every
-1st of the month.
-
-Output is stored in an S3 bucket named `mdn-popularities-prod`.
-<'s3://mdn-popularities-prod/current.txt> points to the current file.
+Popularities are based on our Glean data, exposed at
+https://popularities.mdn.mozilla.net/current.csv.
 
 ## Run the CLI tool
 
@@ -114,14 +40,4 @@ yarn tool popularities
 ```
 
 This should now download the latest popularities csv and update the file
-`files/popularities.json` in your `mdn/content` repo. It takes the value of the
-`CONTENT_ROOT` constant.
-
-Once you've done this, you need to make a pull request on the new `mdn/content`
-repo.
-
-## The future
-
-One idea would be that we instead use Kuma to collect this. Then Yari could
-download it from Kuma right before the build starts. If we do this we would
-fully automate everything and the data would be more up-to-date.
+`popularities.json`.
diff --git a/tool/cli.ts b/tool/cli.ts
@@ -808,7 +808,7 @@ program
 
   .command(
     "popularities",
-    "Convert an AWS Athena log aggregation CSV into a popularities.json file"
+    "Convert Glean-derived page view CSV into a popularities.json file"
   )
   .option("--outfile <path>", "output file", {
     default: fileURLToPath(new URL("../popularities.json", import.meta.url)),

diff --git a/tool/popularities.ts b/tool/popularities.ts
@@ -1,13 +1,9 @@
 /**
  * This script exists only to periodically generate a
- * 'popularities.json' file from a Cloudfront access CSV export.
+ * 'popularities.json' file from a Glean page view CSV export.
  *
  * Generally, only the core MDN Web Docs team needs to run this. The output
- * file gets checked into git so it's easily available to everyone.
- *
- * In production build it might be a future option to generate this
- * dynamically on every single production build.
- *
+ * file gets added to our npm package so it's easily available to everyone.
  */
 import fs from "node:fs";