diff --git a/.gitignore b/.gitignore
index c77d95e90b1..207b05ca6d2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,6 @@ jekyll-algolia-*
 # Jetbrains IDEs
 .idea/
 braze-docs.iml
+
+# Braze scripts
+scripts/temp/*
diff --git a/_docs/_contributing/bdocs.md b/_docs/_contributing/bdocs.md
index d4c3f6047f0..126e427a787 100644
--- a/_docs/_contributing/bdocs.md
+++ b/_docs/_contributing/bdocs.md
@@ -15,13 +15,13 @@ page_order: 8.5
 
 To run a command, use the following syntax. Replace `COMMAND` with one of the [available commands](#list-of-commands).
 
-```terminal
+```bash
 ./bdocs COMMAND
 ```
 
 To see the list of commands in your terminal, use the `help` command:
 
-```terminal
+```bash
 $ ./bdocs help
 
 bdocs is a CLI tool for executing Braze Docs scripts.
@@ -38,13 +38,11 @@ OPTIONS:
   help        Display this help message and exit
 ```
 
-
-
 ## Copying to your clipboard
 
 If you're on MacOS, you can copy the output of `bdocs` directly to your clipboard by using the following command. The `|` means to "pipe" (or send) the output of the first command to the next command. `pbcopy` means to write the output to your clipboard instead of the terminal. By combining these commands, you're sending the output from `bdocs` to `pbcopy` using a pipe.
 
-```terminal
+```bash
 ./bdocs COMMAND | pbcopy
 ```
 
@@ -56,7 +54,7 @@ This command creates the pull request description for weekly deployments by comp
 
 {% tabs local %}
 {% tab usage example %}
-```terminal
+```bash
 $ ./bdocs deploy
 
 - [#6980](https://github.com/braze-inc/braze-docs/pull/6980) - Update index.md
@@ -72,7 +70,7 @@ This command creates the pull request description for monthly releases by compar
 
 {% tabs local %}
 {% tab usage example %}
-```terminal
+```bash
 $ ./bdocs release
 
 ## Deploy - September 17, 2024
@@ -96,19 +94,19 @@ $ ./bdocs release
 
 Reference-style links are not supported within Liquid `{% raw %}{% tab %}{% endraw %}` tags. `tlinks` (short for "transform links") transforms all the reference-style links on a file into [in-line links]({{site.baseurl}}/contributing/content_management/cross_referencing)—whether it be a normal URL, a `{% raw %}{{site.baseurl}}{% endraw %}`, an image, or other link. This command takes a single file or an entire directory as an argument.
 
-{% alert tip %}
-After you run `tlinks`, you'll be asked if you'd like to run [`rlinks`](#rlinks) next.
+{% alert note %}
+After you run `tlinks`, [`rlinks`](#rlinks) will be automatically run against the same file or directory.
 {% endalert %}
 
 {% tabs local %}
 {% tab usage example %}
 #### Example command
 
-```terminal
+```bash
 ./bdocs tlinks _docs/_user_guide/onboarding_faq.md
 ```
 
-#### Example page: before
+#### Example page: Before
 
 {% raw %}
 ```markdown
@@ -119,7 +117,7 @@ Before continuing, [create your SSH token][2]. When you're finished, see [Step 2
 ```
 {% endraw %}
 
-#### Example page: after
+#### Example page: After
 
 {% raw %}
 ```markdown
@@ -136,19 +134,19 @@ Before continuing, [create your SSH token]({{site.baseurl}}/developer_guide/plat
 
 `rlinks` (short for "remove links") removes any unused reference links from the bottom of a Markdown file. This command takes a single file or an entire directory as an argument.
 
-{% alert tip %}
-After you run `tlinks`, you'll be asked if you'd like to run `rlinks` next.
+{% alert note %}
+After you run `tlinks`, `rlinks` will be automatically run against the same file or directory.
 {% endalert %}
 
 {% tabs local %}
 {% tab usage example %}
 #### Example command
 
-```terminal
+```bash
 ./bdocs rlinks _docs/_user_guide/onboarding_faq.md
 ```
 
-#### Example page: before
+#### Example page: Before
 
 {% raw %}
 ```markdown
@@ -159,7 +157,7 @@ Before continuing, [create your SSH token]({{site.baseurl}}/developer_guide/plat
 ```
 {% endraw %}
 
-#### Example Page: After
+#### Example page: After
 
 {% raw %}
 ```markdown
@@ -170,9 +168,50 @@ Before continuing, [create your SSH token]({{site.baseurl}}/developer_guide/plat
 {% endtab %}
 {% endtabs %}
 
-### `redirects`
+### `ulinks`
+
+`ulinks` (short for "update links") takes a file or directory and updates any old links listed in [`broken_redirect_list.js`](https://github.com/braze-inc/braze-docs/blob/develop/assets/js/broken_redirect_list.js) with the newest possible link. For example, if link `one` redirects to link `two`, and link `two` redirects to link `three`, `ulinks` will replace both link `one` and link `two` with link `three`. This command only updates links starting with {% raw %}`{{site.baseurl}}`{% endraw %}.
+
+{% tabs local %}
+{% tab usage example %}
+#### Example command
+
+```bash
+$ ./bdocs ulinks _docs/_developer_guide/content_cards/creating_custom_content_cards.md
+In 'creating_custom_content_cards.md', made 1 replacement.
+Total replacements made: 1
+```
+
+#### Example page: Before
+
+{% raw %}
+```markdown
+Learn how to [log analytics]({{site.baseurl}}/developer_guide/customization_guides/content_cards/logging_analytics) for your custom Content Cards.
+```
+{% endraw %}
+
+#### Example page: After
+
+{% raw %}
+```markdown
+Learn how to [log analytics]({{site.baseurl}}/developer_guide/content_cards/logging_analytics/) for your custom Content Cards.
+```
+{% endraw %}
+{% endtab %}
+{% endtabs %}
+
+#### Why you should update old links
+
+Ideally, redirects added to [`assets/js/broken_redirect_list.js`](https://github.com/braze-inc/braze-docs/blob/develop/assets/js/broken_redirect_list.js) should only be used to:
+
+- Redirect traffic from outside of Braze Docs to the correct content (such as traffic coming from Stack Overflow, [Braze Learning](https://learning.braze.com/), the [Braze Blog]({{site.baseurl}}/resources/articles), etc.).
+- Prevent existing bookmarks from breaking.
+
+They should not be used to redirect URLs on an existing Braze Docs page to another existing Braze Docs page. Instead, these URLs should be updated with the newest possible link. We want to avoid cases in which someone reading an existing Braze Docs page clicks a link and is redirected from one page, to another page, to another page, and so on. `ulinks` helps solve this issue, improving the end-user experience.
+
+### `lredirects`
 
-This command checks if any new redirects have been added to [`broken_redirect_list.js`](https://github.com/braze-inc/braze-docs/blob/develop/assets/js/broken_redirect_list.js), then lists all of the old URLs using a base URL of your choice. For more general information, see [Redirecting URLs]({{site.baseurl}}/contributing/content_management/redirecting_urls).
+`lredirects` (short for "list redirects") checks if any new redirects have been added to [`broken_redirect_list.js`](https://github.com/braze-inc/braze-docs/blob/develop/assets/js/broken_redirect_list.js), then lists all of the old URLs using a base URL of your choice. For more general information, see [Redirecting URLs]({{site.baseurl}}/contributing/content_management/redirecting_urls).
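+
+You can pass the base URL as an argument, or omit it and `lredirects` will prompt you for one. For example, with a hypothetical local preview server at `http://localhost:4000`:
+
+```bash
+./bdocs lredirects http://localhost:4000
+```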
 
 {% alert tip %}
 If you're using VS Code, hold **CMD** while right-clicking a link to open it in your default browser. Because these are the old links, they should all redirect to the new URL specified in the redirect file. If it doesn't, there's an issue with the redirect.
@@ -182,7 +221,7 @@ If you're using VS Code, hold **CMD** while right-clicking a link to open it in
 {% tab usage example %}
 The following example uses the [Sage AI rebrand PR](https://github.com/braze-inc/braze-docs/pull/8040).
 
-```terminal
+```bash
 $ git checkout bd-3442
 $ ./bdocs redirects https://braze-docs-gtcavota9-braze.vercel.app/
 
diff --git a/bdocs b/bdocs
index 38c1f3014f0..6ea1777f27e 100755
--- a/bdocs
+++ b/bdocs
@@ -1,14 +1,26 @@
 #!/bin/bash
 
-# This is a bash script for interacting with the various files in './scripts/'.
+# This is a wrapper script for interacting with the files in './scripts/'.
 #
 # Usage: ./bdocs [option]
 
 set -e
 
-# The project's root directory.
-export PROJECT_ROOT
-PROJECT_ROOT="$(dirname "$(realpath "$0")")"
+# The project's root directory and redirect file.
+export PROJECT_ROOT="$(dirname "$(realpath "$0")")"
+export REDIRECT_FILE="./assets/js/broken_redirect_list.js"
+export REDIRECT_MATCHES="./scripts/temp/redirect_matches.json"
+
+# All scripts exported so they can source bdocs and call each other if needed.
+export DEPLOY="$PROJECT_ROOT/scripts/create_deploy_text.sh"
+export RELEASE="$PROJECT_ROOT/scripts/create_release_text.sh"
+export TLINKS="$PROJECT_ROOT/scripts/transform_reference_links.py"
+export RLINKS="$PROJECT_ROOT/scripts/remove_unused_reference_links.rb"
+export ULINKS="$PROJECT_ROOT/scripts/update_old_links.py"
+export LREDIRECTS="$PROJECT_ROOT/scripts/list_new_redirect_urls.sh"
+# Utility scripts and paths that are not exposed as standalone bdocs commands:
+export MRD="$PROJECT_ROOT/scripts/utils/merge_redirect_descendants.py"
+export TEMP_DIR="$PROJECT_ROOT/scripts/temp"
 
 # Displays usage for bdocs
 display_help() {
@@ -19,72 +31,84 @@ USAGE: ./bdocs [option]
 
 OPTIONS:
-  deploy      Create the deploy body text for weekly deployments
-  release     Create the release body text for monthly releases
-  tlinks      Transform reference links to inline links on 1 or more pages
-  rlinks      Remove reference links that are not being used on 1 or more pages
-  redirects   List the old URLs for all new redirects in this branch
-  help        Display this help message and exit
+  deploy       Create the deploy body text for weekly deployments
+  release      Create the release body text for monthly releases
+  tlinks       Transform reference links to inline links on 1 or more pages
+  rlinks       Remove unused reference links on 1 or more pages
+  ulinks       Update old links using the newest redirect on 1 or more pages
+  lredirects   Test new redirects by listing old URLs in this branch
+  help         Display this help message and exit
 EOF
 }
 
+# If there's no './scripts/temp' directory, create one.
+if [ ! -d "$TEMP_DIR" ]; then
+  mkdir "$TEMP_DIR"
+fi
+
+# Fail if a file or directory path was not provided.
+require_path_or_file() {
+  if [[ -z "$1" ]]; then
+    echo "Error: A file or directory path is required."
+    exit 1
+  fi
+}
+
+# Fail if there are no new merges into 'develop'.
+require_new_merges() {
+  LATEST_COMMIT_HASH=$(git log --max-count=1 --format="%H" origin/master ^origin/develop)
+  COMMIT_LOGS=$(git log --first-parent "$LATEST_COMMIT_HASH"..origin/develop)
+  if [ -z "$COMMIT_LOGS" ]; then
+    echo "Error: No new merges into 'develop' since the last deployment."
+    exit 1
+  fi
+}
+
 # Check if no arguments were provided
 if [[ $# -eq 0 ]]; then
   display_help
   exit 1
 fi
 
+# Fetch the latest changes from the remote quietly.
+git fetch origin develop --quiet
+
 # Argument parsing
 case $1 in
   deploy)
+    require_new_merges
     if [[ $# -eq 3 ]]; then
-      "$PROJECT_ROOT/scripts/create_deploy_text.sh" "$2" "$3"
+      "$DEPLOY" "$2" "$3"
     else
-      "$PROJECT_ROOT/scripts/create_deploy_text.sh"
+      "$DEPLOY"
     fi
     ;;
   release)
-    "$PROJECT_ROOT/scripts/create_release_text.sh"
+    require_new_merges
+    "$RELEASE"
     ;;
   tlinks)
-    if [[ -z "$2" ]]; then
-      echo "Error: A file or directory path is required."
-      exit 1
-    fi
-    python3 "$PROJECT_ROOT/scripts/transform_reference_links.py" "$2"
-    echo "Success!"
-    while true; do
-      echo "Do you want to remove the unused reference links? [Y/n]."
-      read -r opt
-      case $opt in
-        y*|Y*)
-          ruby "$PROJECT_ROOT/scripts/remove_unused_reference_links.rb" "$2"
-          echo "Success!"
-          break
-          ;;
-        n*|N*)
-          echo "The unused reference links were left untouched."
-          break
-          ;;
-        *) echo "Error: Invalid choice."
-          echo ""
-          ;;
-      esac
-    done
+    require_path_or_file "$2"
+    "$TLINKS" "$2"
+    "$RLINKS" "$2" # Run rlinks next to clean up unused reference links.
     ;;
   rlinks)
-    if [[ -z "$2" ]]; then
-      echo "Error: The path to file or directory is required."
-      exit 1
-    fi
-    ruby "$PROJECT_ROOT/scripts/remove_unused_reference_links.rb" "$2"
+    require_path_or_file "$2"
+    "$RLINKS" "$2"
+    ;;
+  ulinks)
+    require_path_or_file "$2"
+    touch "$REDIRECT_MATCHES"
+    "$MRD"
+    "$ULINKS" "$2"
+    # rm "$REDIRECT_MATCHES"
     ;;
-  redirects)
+  lredirects)
     if [[ $# -eq 2 ]]; then
-      "$PROJECT_ROOT/scripts/list_redirect_urls.sh" "$2"
+      "$LREDIRECTS" "$2"
     else
-      "$PROJECT_ROOT/scripts/list_redirect_urls.sh"
+      "$LREDIRECTS"
     fi
     ;;
   help)
diff --git a/scripts/create_release_text.sh b/scripts/create_release_text.sh
index 33eebece9b2..9acb433f500 100755
--- a/scripts/create_release_text.sh
+++ b/scripts/create_release_text.sh
@@ -44,9 +44,9 @@ main() {
     COMMIT_TITLE=$(echo "$commit" | jq -r '.title')
     COMMIT_BODY=$(echo "$commit" | jq -r '.body')
 
-    # Print the deploy text for each deployment.
+    # Print the deploy text for each deployment using the sourced DEPLOY.
     echo "## $COMMIT_BODY"
-    ./scripts/create_deploy_text.sh "$PREV_COMMIT_DATE" "$COMMIT_DATE"
+    "$DEPLOY" "$PREV_COMMIT_DATE" "$COMMIT_DATE"
     echo ""
 
     # Get the next range of commits by increasing 'PREV_COMMIT_DATE'.
diff --git a/scripts/list_redirect_urls.sh b/scripts/list_new_redirect_urls.sh
similarity index 84%
rename from scripts/list_redirect_urls.sh
rename to scripts/list_new_redirect_urls.sh
index 5acc2380fea..409b3aaef5f 100755
--- a/scripts/list_redirect_urls.sh
+++ b/scripts/list_new_redirect_urls.sh
@@ -4,13 +4,10 @@
 # base URL to list all old URLs so the user can open old links directly from
 # the terminal to test redirects.
 #
-# Usage: ./bdocs redirects
-
-# Fetch the latest changes from the remote.
-git fetch origin develop --quiet
+# Usage: ./bdocs lredirects
 
 # Check new redirects by comparing the current branch to develop.
-NEW_REDIRECTS=$(git diff develop -- $PROJECT_ROOT/assets/js/broken_redirect_list.js)
+NEW_REDIRECTS=$(git diff develop -- $REDIRECT_FILE)
 
 # If there's no differences, print an error message and exit.
 if [[ -z "$NEW_REDIRECTS" ]]; then
@@ -22,7 +19,6 @@ fi
 # Check if a base URL was passed as an argument from bdocs, otherwise prompt the user.
 if [[ -z "$1" ]]; then
   echo "Which base URL would you like to use? Note: You can use a local or deployment base URL."
-  echo ""
   read BASE_URL
 else
   BASE_URL=$1
diff --git a/scripts/remove_unused_reference_links.rb b/scripts/remove_unused_reference_links.rb
old mode 100644
new mode 100755
index 795ed331de7..20ab1b47a29
--- a/scripts/remove_unused_reference_links.rb
+++ b/scripts/remove_unused_reference_links.rb
@@ -1,3 +1,5 @@
+#!/usr/bin/env ruby
+
 # Removes unused reference-style links from the bottom of a file, such as:
 # [1]: {{site.baseurl}}/contributing/your_first_contribution/
 #
diff --git a/scripts/tests/update_old_links.md b/scripts/tests/update_old_links.md
new file mode 100644
index 00000000000..89345ba5d99
--- /dev/null
+++ b/scripts/tests/update_old_links.md
@@ -0,0 +1,14 @@
+This is a test page for testing `ulinks`. When adding "bad links", the syntax needs to match the syntax on this page:
+
+https://www.braze.com/docs/contributing/content_management/cross_referencing
+
+That is, `/docs` should not appear in links that use `{{site.baseurl}}`.
+
+Here are some bad links:
+1. [Bad link 1]({{site.baseurl}}/best_practices/).
+2. [Bad link 2]({{site.baseurl}}/best_practices/#android-push-category).
+3. [Bad link 3]({{site.baseurl}}/user_guide/message_building_by_channel/email/link_templates/).
+
+Here are two good links:
+1. [Good link 1](https://www.braze.com/docs/developer_guide/platform_wide/getting_started/analytics_overview)
+2. [Good link 2]({{site.baseurl}}/developer_guide/getting_started/analytics_overview)
diff --git a/scripts/transform_reference_links.py b/scripts/transform_reference_links.py
index 12d0fa23c8e..75c7e7a03d5 100755
--- a/scripts/transform_reference_links.py
+++ b/scripts/transform_reference_links.py
@@ -1,4 +1,6 @@
-# Converts Markdown reference links to in-line links. Created because
+#!/usr/bin/env python3
+
+# Transforms Markdown reference links to in-line links. Created because
 # reference links cannot be placed inside Liquid {% tab %} tags.
 #
 # For more information, see:
@@ -14,6 +16,7 @@ import sys
 import re
 
+
 # Create a dictionary with all reference links at the bottom of the file.
 def create_link_dictionary(file_path):
     link_dict = {}
@@ -40,6 +43,7 @@ def create_link_dictionary(file_path):
 
     return link_dict
 
+
 # Use the dictionary to find and replace all references with the full link.
 def replace_links(file_path, link_dict):
     with open(file_path, 'r') as file:
@@ -57,6 +61,8 @@ def replace_links(file_path, link_dict):
     with open(file_path, 'w') as file:
         file.writelines(updated_lines)
 
+
+# TODO: Move this to bdocs directly for easier reuse.
 # Recursively convert links for all Markdown files in given directory.
 def process_directory(directory):
     for root, dirs, files in os.walk(directory):
@@ -66,6 +72,7 @@ def process_directory(directory):
             link_dict = create_link_dictionary(file_path)
             replace_links(file_path, link_dict)
 
+
 # If arg == directory, convert links for all Markdown files in that directory.
 # If arg == file, convert links for that Markdown file only.
 def main(path):
diff --git a/scripts/update_old_links.py b/scripts/update_old_links.py
new file mode 100755
index 00000000000..0f4d02610bb
--- /dev/null
+++ b/scripts/update_old_links.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+
+# Uses 'assets/js/broken_redirect_list.js' to determine the newest version of
+# a link, then updates all 'OLD' links with the 'NEW' link for the given
+# page or directory.
+#
+# Requires: 'scripts/utils/merge_redirect_descendants.py'
+#
+# Usage: ./bdocs ulinks [FILE|DIRECTORY]
+#
+# Options:
+#   FILE         Updates old links in a single file.
+#   DIRECTORY    Recursively updates old links in a directory.
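+#
+# This script reads the JSON dictionary that 'scripts/utils/merge_redirect_descendants.py'
+# writes to REDIRECT_MATCHES. Illustrative shape (entry key and URLs are made up):
+#
+#   {
+#       "entry_42": {
+#           "new_url": "/page_c/",
+#           "old_urls": ["/page_a/", "/page_b/"]
+#       }
+#   }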
+
+import os
+import json
+import re
+import sys
+
+# Get project root
+PROJECT_ROOT = os.environ.get('PROJECT_ROOT')
+REDIRECT_MATCHES = os.environ.get('REDIRECT_MATCHES')
+
+
+def update_old_links(filepath, redirects):
+    # redirects: { key: { "new_url": str, "old_urls": [str, ...] }, ... }
+    # Replace ({{site.baseurl}}old_url) with ({{site.baseurl}}new_url)
+    if not os.path.isfile(filepath):
+        return 0
+
+    with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
+        content = f.read()
+
+    original_content = content
+    total_replacements = 0
+
+    for entry_key, data in redirects.items():
+        new_url = data["new_url"]
+        old_urls = data["old_urls"]
+
+        for old in old_urls:
+            # If there's an anchor (#), match exactly.
+            # Otherwise, match with or without trailing slash.
+            if "#" in old:
+                # Exact match
+                pattern = r"\(" + re.escape("{{site.baseurl}}") + re.escape(old) + r"\)"
+            else:
+                # old.rstrip('/') removes any trailing slash
+                # The /? makes the slash optional in the pattern
+                old_noslash = old.rstrip('/')
+                pattern = r"\(" + re.escape("{{site.baseurl}}") + re.escape(old_noslash) + r"/?\)"
+
+            # Count how many times the old link appears
+            count_before = len(re.findall(pattern, content))
+            if count_before > 0:
+                # Replace all occurrences with the new URL
+                content = re.sub(pattern,
+                                 "(" + "{{site.baseurl}}" + new_url + ")",
+                                 content)
+                total_replacements += count_before
+
+    # If content changed, write the file back out
+    if content != original_content:
+        with open(filepath, 'w', encoding='utf-8') as f:
+            f.write(content)
+
+    return total_replacements
+
+
+def get_redirect_matches(json_file):
+    if not os.path.exists(json_file):
+        print(f"Error: '{json_file}' not found.")
+        exit(1)
+    with open(json_file, 'r') as f:
+        data_dict = json.load(f)
+    return data_dict
+
+
+# TODO: Move this to bdocs directly for easier reuse.
+def process_directory(directory, redirects):
+    total_global_replacements = 0
+    for root, dirs, files in os.walk(directory):
+        for fn in files:
+            file_path = os.path.join(root, fn)
+            replacements = update_old_links(file_path, redirects)
+            if replacements > 0:
+                # Print relative path from the given directory
+                relative_path = os.path.relpath(file_path, start=directory)
+                print(f"In '{relative_path}', made {replacements} replacements.")
+                total_global_replacements += replacements
+    return total_global_replacements
+
+
+# TODO: Move this to bdocs directly for easier reuse.
+def process_single_file(filepath, redirects):
+    replacements = update_old_links(filepath, redirects)
+    if replacements > 0:
+        # When given a single file, just print the filename
+        print(f"In '{os.path.basename(filepath)}', made {replacements} replacements.")
+    return replacements
+
+
+def main(path):
+    redirects = get_redirect_matches(REDIRECT_MATCHES)
+
+    if os.path.isdir(path):
+        # Process directory
+        total_replacements = process_directory(path, redirects)
+        print(f"Total replacements made across all files: {total_replacements}")
+    elif os.path.isfile(path):
+        # Process single file
+        total_replacements = process_single_file(path, redirects)
+        print(f"Total replacements made: {total_replacements}")
+    else:
+        print(f"Invalid path: {path}. Please provide a valid directory or file.")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print("Usage: python script.py <path>")
+        sys.exit(1)
+    user_path = sys.argv[1]
+    user_path = os.path.abspath(user_path)
+    main(user_path)
diff --git a/scripts/utils/merge_redirect_descendants.py b/scripts/utils/merge_redirect_descendants.py
new file mode 100755
index 00000000000..5c182cf2453
--- /dev/null
+++ b/scripts/utils/merge_redirect_descendants.py
@@ -0,0 +1,214 @@
+#!/usr/bin/env python3
+
+# This script uses 'assets/js/broken_redirect_list.js' to create a JSON dictionary
+# of merged redirect descendants by finding all ancestors and merging them
+# into the newest descendant. This script is necessary for 'update_old_links.py'.
+#
+# As of now, bdocs runs this script as part of the 'ulinks' command (before
+# './scripts/update_old_links.py'), but it is not exposed as a separate bdocs
+# command. You can also use the following command to call it directly and
+# generate a JSON file of merged duplicates.
+#
+# Usage: ./scripts/utils/merge_redirect_descendants.py
+#
+# This script takes no arguments. It reads REDIRECT_FILE and writes the merged
+# dictionary to REDIRECT_MATCHES (both are exported by bdocs).
+
+import os
+import json
+import re
+import subprocess
+
+PROJECT_ROOT = os.environ.get('PROJECT_ROOT')
+REDIRECT_FILE = os.environ.get('REDIRECT_FILE')
+REDIRECT_MATCHES = os.environ.get('REDIRECT_MATCHES')
+
+
+# Create JSON dictionary from 'assets/js/broken_redirect_list.js'. Syntax:
+# validurls['OLD'] = 'NEW';
+def create_dict():
+    # print("Running build_dict...")
+    data_dict = {}
+    total_old_urls = 0
+
+    with open(REDIRECT_FILE, 'r') as f:
+        for index, line in enumerate(f, start=1):
+            match = re.match(r"validurls\['/docs([^']+)'\] = '/docs([^']+)';", line.strip())
+            if match:
+                old_url, new_url = match.groups()
+
+                # Remove leading slash
+                old_url = old_url.lstrip('/')
+                new_url = new_url.lstrip('/')
+
+                # 1) If '#' not in old_url, ensure it ends with '/'
+                if '#' not in old_url:
+                    if not old_url.endswith('/'):
+                        old_url += '/'
+
+                # 2) If '#' not in new_url, ensure it ends with '/'
+                if '#' not in new_url:
+                    if not new_url.endswith('/'):
+                        new_url += '/'
+
+                # Put the leading slash back into each
+                old_url = f"/{old_url}"
+                new_url = f"/{new_url}"
+
+                data_dict[f"entry_{index}"] = {
+                    "new_url": new_url,
+                    "old_urls": [old_url]
+                }
+
+                total_old_urls += 1
+
+    with open(REDIRECT_MATCHES, 'w') as f:
+        json.dump(data_dict, f, indent=4)
+
+    # After building
+    # print(f"# of new_urls: {len(data_dict)}")
+    # print(f"# of old_urls: {total_old_urls}\n")
+
+
+# If 2 or more entries share the same 'new_url', merge all of their 'old_urls'
+# into the entry with the highest 'entry_key' number, then delete the other entries.
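+# Hypothetical example (entry keys and URLs are made up):
+#   Before: entry_3 = {"new_url": "/c/", "old_urls": ["/a/"]}
+#           entry_7 = {"new_url": "/c/", "old_urls": ["/b/"]}
+#   After:  entry_7 = {"new_url": "/c/", "old_urls": ["/b/", "/a/"]}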
+def merge_duplicate_keys():
+    # print("Running merge_duplicate_keys...")
+    with open(REDIRECT_MATCHES, 'r') as f:
+        data_dict = json.load(f)
+
+    # Group entries by new_url
+    new_url_map = {}
+    for entry_key, data in data_dict.items():
+        nu = data["new_url"]
+        new_url_map.setdefault(nu, []).append(entry_key)
+
+    merged_count = 0
+    for nu, entries in new_url_map.items():
+        if len(entries) > 1:
+            entries_with_num = [(int(e.split('_')[1]), e) for e in entries]
+            entries_with_num.sort(key=lambda x: x[0])  # ascending by number
+            largest_num, largest_entry = entries_with_num[-1]
+
+            for (num, e) in entries_with_num[:-1]:
+                data_dict[largest_entry]["old_urls"].extend(data_dict[e]["old_urls"])
+                del data_dict[e]
+                merged_count += 1
+
+    with open(REDIRECT_MATCHES, 'w') as f:
+        json.dump(data_dict, f, indent=4)
+
+    # print(f"# of keys merged: {merged_count}")
+    # print_counts(data_dict)
+
+
+# If a key contains more than one identical value, keep one and delete the rest.
+def delete_duplicate_values():
+    # print("Running delete_duplicate_values...")
+    with open(REDIRECT_MATCHES, 'r') as f:
+        data_dict = json.load(f)
+
+    values_deleted = 0
+    for entry_key, data in data_dict.items():
+        old_urls = data["old_urls"]
+        unique = list(set(old_urls))
+        diff = len(old_urls) - len(unique)
+        if diff > 0:
+            values_deleted += diff
+        data["old_urls"] = unique
+
+    with open(REDIRECT_MATCHES, 'w') as f:
+        json.dump(data_dict, f, indent=4)
+
+    # print(f"# of values deleted: {values_deleted}")
+    # print_counts(data_dict)
+
+
+# If A redirects to B, and B to C, both A and B are considered descendants of C.
+# This function merges all descendants into the newest parent (in this case, C):
+# If the 1st entry exists as a value in the last entry, reassign all of its
+# values to the last entry, then remove the 1st entry.
+# If there's no match, continue to the 2nd-to-last entry, and so on.
+# If no descendants are found, keep the 1st entry, then check the 2nd entry, and so on.
+def merge_descendants():
+    # print("Running merge_descendants...")
+    with open(REDIRECT_MATCHES, 'r') as f:
+        data_dict = json.load(f)
+
+    entry_keys = list(data_dict.keys())
+
+    merged_count = 0
+    # Must be in ascending order (i.e. smallest entry number first).
+    for i, current_entry in enumerate(entry_keys):
+        if current_entry not in data_dict:
+            continue
+
+        current_new_url = data_dict[current_entry]["new_url"]
+        current_old_urls = data_dict[current_entry]["old_urls"]
+
+        # Must be in descending order (i.e. largest entry number first).
+        for j in range(len(entry_keys) - 1, i, -1):
+            other_entry = entry_keys[j]
+            if other_entry not in data_dict or other_entry == current_entry:
+                continue
+
+            other_data = data_dict[other_entry]
+            if current_new_url in other_data["old_urls"]:
+                other_data["old_urls"].extend(current_old_urls)
+                other_data["old_urls"] = list(set(other_data["old_urls"]))
+                del data_dict[current_entry]
+                merged_count += 1
+                break
+
+    with open(REDIRECT_MATCHES, 'w') as f:
+        json.dump(data_dict, f, indent=4)
+
+    # print(f"# of descendants merged: {merged_count}")
+    # print_counts(data_dict)
+
+
+# If a key contains a value of itself, remove that value.
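+# Hypothetical example (URLs are made up):
+#   Before: entry_5 = {"new_url": "/a/", "old_urls": ["/a/", "/b/"]}
+#   After:  entry_5 = {"new_url": "/a/", "old_urls": ["/b/"]}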
+def remove_self_references():
+    # print("Running remove_self_references...")
+    with open(REDIRECT_MATCHES, 'r') as f:
+        data_dict = json.load(f)
+
+    self_removed = 0
+    for entry_key, data in data_dict.items():
+        nu = data["new_url"]
+        old_urls = data["old_urls"]
+        if nu in old_urls:
+            old_len = len(old_urls)
+            data["old_urls"] = [v for v in old_urls if v != nu]
+            new_len = len(data["old_urls"])
+            self_removed += (old_len - new_len)
+
+    with open(REDIRECT_MATCHES, 'w') as f:
+        json.dump(data_dict, f, indent=4)
+
+    # print(f"# of self refs removed: {self_removed}")
+    with open(REDIRECT_MATCHES, 'r') as f:
+        data_dict = json.load(f)
+    # print_counts(data_dict)
+
+
+# Debugging: Counts the number of new and old URLs currently in the file when called.
+def print_counts(data_dict):
+    # Count how many entries (new_urls)
+    new_urls_count = len(data_dict)
+    # Count how many old_urls total
+    old_urls_count = sum(len(data["old_urls"]) for data in data_dict.values())
+    # print(f"# of new_urls: {new_urls_count}")
+    # print(f"# of old_urls: {old_urls_count}\n")
+
+
+def main():
+    create_dict()
+    merge_duplicate_keys()
+    delete_duplicate_values()
+    merge_descendants()
+    remove_self_references()
+
+
+if __name__ == "__main__":
+    main()