From c496baa74e14103ba72c7ac2d35121e1a72e01d7 Mon Sep 17 00:00:00 2001 From: Ed Morley <501702+edmorley@users.noreply.github.com> Date: Fri, 8 Nov 2024 19:15:28 +0000 Subject: [PATCH] Improve NLTK downloader log output and error messages (#1690) Improves the build log output, error messages and buildpack failure metrics for the NLTK downloader feature, and adds a test for the app having an invalid `nltk.txt` (since previously only the successful cases were tested). This should be the last failure type seen in the wild that doesn't emit a `failure_reason` metric. GUS-W-8059892. --- CHANGELOG.md | 3 +- bin/steps/nltk | 24 +++++++--- spec/fixtures/nltk_txt_invalid/nltk.txt | 1 + .../nltk_txt_invalid/requirements.txt | 1 + spec/hatchet/nltk_spec.rb | 46 +++++++++++++++---- 5 files changed, 60 insertions(+), 15 deletions(-) create mode 100644 spec/fixtures/nltk_txt_invalid/nltk.txt create mode 100644 spec/fixtures/nltk_txt_invalid/requirements.txt diff --git a/CHANGELOG.md b/CHANGELOG.md index ffccb62c6..6ad5059c4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,8 +4,9 @@ - Added a warning when the files for multiple package managers are found. In the future this warning will become an error. ([#1692](https://github.com/heroku/heroku-buildpack-python/pull/1692)) - Updated the build log message shown when installing dependencies to include the package manager command being run. ([#1689](https://github.com/heroku/heroku-buildpack-python/pull/1689)) -- Improved the error messages and buildpack metrics for package manager related failures. ([#1689](https://github.com/heroku/heroku-buildpack-python/pull/1689)) - Changed test dependency installation on Heroku CI to now install `requirements.txt` and `requirements-test.txt` in a single `pip install` invocation rather than separately. This allows pip's resolver to resolve any version conflicts between the two files. ([#1689](https://github.com/heroku/heroku-buildpack-python/pull/1689)) +- Improved the error messages and buildpack metrics for package manager related failures. ([#1689](https://github.com/heroku/heroku-buildpack-python/pull/1689)) +- Improved the build log output, error messages and buildpack failure metrics for the NLTK downloader feature. ([#1690](https://github.com/heroku/heroku-buildpack-python/pull/1690)) ## [v265] - 2024-11-06 diff --git a/bin/steps/nltk b/bin/steps/nltk index 804e7a41d..ad99493fe 100755 --- a/bin/steps/nltk +++ b/bin/steps/nltk @@ -1,5 +1,4 @@ #!/usr/bin/env bash -# shellcheck disable=SC2250 # TODO: Use braces around variable references even when not strictly required. # This script is run in a subshell via sub_env so doesn't inherit the options/vars/utils from `bin/compile`. # TODO: Stop running this script in a subshell. @@ -21,17 +20,30 @@ EXPORT_PATH="${BUILDPACK_DIR}/export" if is_module_available 'nltk'; then output::step "Downloading NLTK corpora..." - nltk_packages_definition="$BUILD_DIR/nltk.txt" + nltk_packages_definition="${BUILD_DIR}/nltk.txt" - if [[ -f "$nltk_packages_definition" ]]; then + if [[ -f "${nltk_packages_definition}" ]]; then meta_set "nltk_downloader" "enabled" - readarray -t nltk_packages <"$nltk_packages_definition" + readarray -t nltk_packages <"${nltk_packages_definition}" output::step "Downloading NLTK packages: ${nltk_packages[*]}" - python -m nltk.downloader -d "$BUILD_DIR/.heroku/python/nltk_data" "${nltk_packages[@]}" | output::indent - set_env NLTK_DATA "/app/.heroku/python/nltk_data" + nltk_data_dir="/app/.heroku/python/nltk_data" + if ! python -m nltk.downloader -d "${nltk_data_dir}" "${nltk_packages[@]}" |& output::indent; then + output::error <<-EOF + Error: Unable to download NLTK data. + + The 'python -m nltk.downloader' command to download NLTK + data did not exit successfully. + + See the log output above for more information. + EOF + meta_set "failure_reason" "nltk-downloader" + exit 1 + fi + + set_env NLTK_DATA "${nltk_data_dir}" else meta_set "nltk_downloader" "skipped-no-nltk-file" echo " 'nltk.txt' not found, not downloading any corpora" diff --git a/spec/fixtures/nltk_txt_invalid/nltk.txt b/spec/fixtures/nltk_txt_invalid/nltk.txt new file mode 100644 index 000000000..719ce91ff --- /dev/null +++ b/spec/fixtures/nltk_txt_invalid/nltk.txt @@ -0,0 +1 @@ +invalid! diff --git a/spec/fixtures/nltk_txt_invalid/requirements.txt b/spec/fixtures/nltk_txt_invalid/requirements.txt new file mode 100644 index 000000000..846929621 --- /dev/null +++ b/spec/fixtures/nltk_txt_invalid/requirements.txt @@ -0,0 +1 @@ +nltk diff --git a/spec/hatchet/nltk_spec.rb b/spec/hatchet/nltk_spec.rb index 28b1ab660..d61451278 100644 --- a/spec/hatchet/nltk_spec.rb +++ b/spec/hatchet/nltk_spec.rb @@ -11,13 +11,13 @@ expect(clean_output(app.output)).to match(Regexp.new(<<~REGEX)) remote: -----> Downloading NLTK corpora... remote: -----> Downloading NLTK packages: city_database stopwords - remote: .*: RuntimeWarning: 'nltk.downloader' found in sys.modules after import of package 'nltk', but prior to execution of 'nltk.downloader'; this may result in unpredictable behaviour - remote: \\[nltk_data\\] Downloading package city_database to - remote: \\[nltk_data\\] /tmp/build_.*/.heroku/python/nltk_data... - remote: \\[nltk_data\\] Unzipping corpora/city_database.zip. - remote: \\[nltk_data\\] Downloading package stopwords to - remote: \\[nltk_data\\] /tmp/build_.*/.heroku/python/nltk_data... - remote: \\[nltk_data\\] Unzipping corpora/stopwords.zip. + remote: .*: RuntimeWarning: 'nltk.downloader' found in sys.modules after import of package 'nltk', but prior to execution of 'nltk.downloader'; this may result in unpredictable behaviour + remote: \\[nltk_data\\] Downloading package city_database to + remote: \\[nltk_data\\] /app/.heroku/python/nltk_data... + remote: \\[nltk_data\\] Unzipping corpora/city_database.zip. + remote: \\[nltk_data\\] Downloading package stopwords to + remote: \\[nltk_data\\] /app/.heroku/python/nltk_data... + remote: \\[nltk_data\\] Unzipping corpora/stopwords.zip. REGEX # TODO: Add a test that the downloaded corpora can be found at runtime. @@ -44,7 +44,37 @@ it 'does not try to install the specified NLTK corpora' do app.deploy do |app| - expect(app.output.downcase).not_to include('nltk') + expect(app.output).not_to include('NLTK') + expect(app.output).not_to include('nltk_data') + end + end + end + + context 'when nltk.txt contains invalid entries' do + let(:app) { Hatchet::Runner.new('spec/fixtures/nltk_txt_invalid', allow_failure: true) } + + it 'fails the build' do + app.deploy do |app| + expect(clean_output(app.output)).to match(Regexp.new(<<~REGEX, Regexp::MULTILINE)) + remote: -----> Downloading NLTK corpora... + remote: -----> Downloading NLTK packages: invalid! + remote: .+: RuntimeWarning: 'nltk.downloader' found in sys.modules after import of package 'nltk', but prior to execution of 'nltk.downloader'; this may result in unpredictable behaviour + remote: \\[nltk_data\\] Error loading invalid!: Package 'invalid!' not found in + remote: \\[nltk_data\\] index + remote: Error installing package. Retry\\? \\[n/y/e\\] + remote: Traceback \\(most recent call last\\): + remote: .+ + remote: EOFError: EOF when reading a line + remote: + remote: ! Error: Unable to download NLTK data. + remote: ! + remote: ! The 'python -m nltk.downloader' command to download NLTK + remote: ! data did not exit successfully. + remote: ! + remote: ! See the log output above for more information. + remote: + remote: ! Push rejected, failed to compile Python app. + REGEX end end end