From 8bc745944e50badff7a7eb2a67e8898d3a1c4538 Mon Sep 17 00:00:00 2001 From: Geoffrey Booth Date: Wed, 20 Mar 2024 08:48:05 -0700 Subject: [PATCH] module: eliminate performance cost of detection for cjs entry PR-URL: https://github.com/nodejs/node/pull/52093 Reviewed-By: Matteo Collina Reviewed-By: Yagiz Nizipli Reviewed-By: Joyee Cheung Reviewed-By: Jacob Smith Reviewed-By: Richard Lau --- benchmark/misc/startup-core.js | 1 + lib/internal/modules/cjs/loader.js | 12 +- lib/internal/modules/helpers.js | 35 ++++++ lib/internal/modules/run_main.js | 42 +++++-- src/node_contextify.cc | 196 ++++++++++++++++------------- src/node_contextify.h | 3 + 6 files changed, 192 insertions(+), 97 deletions(-) diff --git a/benchmark/misc/startup-core.js b/benchmark/misc/startup-core.js index 07c0701d128899..d16a989fd2ce75 100644 --- a/benchmark/misc/startup-core.js +++ b/benchmark/misc/startup-core.js @@ -9,6 +9,7 @@ const bench = common.createBenchmark(main, { script: [ 'benchmark/fixtures/require-builtins', 'test/fixtures/semicolon', + 'test/fixtures/snapshot/typescript', ], mode: ['process', 'worker'], count: [30], diff --git a/lib/internal/modules/cjs/loader.js b/lib/internal/modules/cjs/loader.js index 85bf598192d8f1..22e77d8bff70c0 100644 --- a/lib/internal/modules/cjs/loader.js +++ b/lib/internal/modules/cjs/loader.js @@ -75,6 +75,7 @@ module.exports = { cjsExportsCache, cjsSourceCache, initializeCJS, + entryPointSource: undefined, // Set below. Module, wrapSafe, }; @@ -1337,8 +1338,15 @@ function wrapSafe(filename, content, cjsModuleInstance, codeCache) { return result; } catch (err) { if (process.mainModule === cjsModuleInstance) { - const { enrichCJSError } = require('internal/modules/esm/translators'); - enrichCJSError(err, content, filename); + if (getOptionValue('--experimental-detect-module')) { + // For the main entry point, cache the source to potentially retry as ESM. + module.exports.entryPointSource = content; + } else { + // We only enrich the error (print a warning) if we're sure we're going to for-sure throw it; so if we're + // retrying as ESM, wait until we know whether we're going to retry before calling `enrichCJSError`. + const { enrichCJSError } = require('internal/modules/esm/translators'); + enrichCJSError(err, content, filename); + } } throw err; } diff --git a/lib/internal/modules/helpers.js b/lib/internal/modules/helpers.js index 665019e93d1ade..940f83e49af86f 100644 --- a/lib/internal/modules/helpers.js +++ b/lib/internal/modules/helpers.js @@ -19,6 +19,15 @@ const { } = require('internal/errors').codes; const { BuiltinModule } = require('internal/bootstrap/realm'); +const { + shouldRetryAsESM: contextifyShouldRetryAsESM, + constants: { + syntaxDetectionErrors: { + esmSyntaxErrorMessages, + throwsOnlyInCommonJSErrorMessages, + }, + }, +} = internalBinding('contextify'); const { validateString } = require('internal/validators'); const fs = require('fs'); // Import all of `fs` so that it can be monkey-patched. const internalFS = require('internal/fs/utils'); @@ -320,6 +329,31 @@ function normalizeReferrerURL(referrerName) { } +let esmSyntaxErrorMessagesSet; // Declared lazily in shouldRetryAsESM +let throwsOnlyInCommonJSErrorMessagesSet; // Declared lazily in shouldRetryAsESM +/** + * After an attempt to parse a module as CommonJS throws an error, should we try again as ESM? + * We only want to try again as ESM if the error is due to syntax that is only valid in ESM; and if the CommonJS parse + * throws on an error that would not have been a syntax error in ESM (like via top-level `await` or a lexical + * redeclaration of one of the CommonJS variables) then we need to parse again to see if it would have thrown in ESM. + * @param {string} errorMessage The string message thrown by V8 when attempting to parse as CommonJS + * @param {string} source Module contents + */ +function shouldRetryAsESM(errorMessage, source) { + esmSyntaxErrorMessagesSet ??= new SafeSet(esmSyntaxErrorMessages); + if (esmSyntaxErrorMessagesSet.has(errorMessage)) { + return true; + } + + throwsOnlyInCommonJSErrorMessagesSet ??= new SafeSet(throwsOnlyInCommonJSErrorMessages); + if (throwsOnlyInCommonJSErrorMessagesSet.has(errorMessage)) { + return /** @type {boolean} */(contextifyShouldRetryAsESM(source)); + } + + return false; +} + + // Whether we have started executing any user-provided CJS code. // This is set right before we call the wrapped CJS code (not after, // in case we are half-way in the execution when internals check this). @@ -339,6 +373,7 @@ module.exports = { loadBuiltinModule, makeRequireFunction, normalizeReferrerURL, + shouldRetryAsESM, stripBOM, toRealPath, hasStartedUserCJSExecution() { diff --git a/lib/internal/modules/run_main.js b/lib/internal/modules/run_main.js index b134a54cfec92a..c06c8cc72cc9f4 100644 --- a/lib/internal/modules/run_main.js +++ b/lib/internal/modules/run_main.js @@ -5,7 +5,6 @@ const { globalThis, } = primordials; -const { containsModuleSyntax } = internalBinding('contextify'); const { getNearestParentPackageJSONType } = internalBinding('modules'); const { getOptionValue } = require('internal/options'); const { checkPackageJSONIntegrity } = require('internal/modules/package_json_reader'); @@ -87,10 +86,6 @@ function shouldUseESMLoader(mainPath) { // No package.json or no `type` field. if (response === undefined || response[0] === 'none') { - if (getOptionValue('--experimental-detect-module')) { - // If the first argument of `containsModuleSyntax` is undefined, it will read `mainPath` from the file system. - return containsModuleSyntax(undefined, mainPath); - } return false; } @@ -157,12 +152,43 @@ function runEntryPointWithESMLoader(callback) { * by `require('module')`) even when the entry point is ESM. * This monkey-patchable code is bypassed under `--experimental-default-type=module`. * Because of backwards compatibility, this function is exposed publicly via `import { runMain } from 'node:module'`. + * When `--experimental-detect-module` is passed, this function will attempt to run ambiguous (no explicit extension, no + * `package.json` type field) entry points as CommonJS first; under certain conditions, it will retry running as ESM. * @param {string} main - First positional CLI argument, such as `'entry.js'` from `node entry.js` */ function executeUserEntryPoint(main = process.argv[1]) { const resolvedMain = resolveMainPath(main); const useESMLoader = shouldUseESMLoader(resolvedMain); - if (useESMLoader) { + + // Unless we know we should use the ESM loader to handle the entry point per the checks in `shouldUseESMLoader`, first + // try to run the entry point via the CommonJS loader; and if that fails under certain conditions, retry as ESM. + let retryAsESM = false; + if (!useESMLoader) { + const cjsLoader = require('internal/modules/cjs/loader'); + const { Module } = cjsLoader; + if (getOptionValue('--experimental-detect-module')) { + try { + // Module._load is the monkey-patchable CJS module loader. + Module._load(main, null, true); + } catch (error) { + const source = cjsLoader.entryPointSource; + const { shouldRetryAsESM } = require('internal/modules/helpers'); + retryAsESM = shouldRetryAsESM(error.message, source); + // In case the entry point is a large file, such as a bundle, + // ensure no further references can prevent it being garbage-collected. + cjsLoader.entryPointSource = undefined; + if (!retryAsESM) { + const { enrichCJSError } = require('internal/modules/esm/translators'); + enrichCJSError(error, source, resolvedMain); + throw error; + } + } + } else { // `--experimental-detect-module` is not passed + Module._load(main, null, true); + } + } + + if (useESMLoader || retryAsESM) { const mainPath = resolvedMain || main; const mainURL = pathToFileURL(mainPath).href; @@ -171,10 +197,6 @@ function executeUserEntryPoint(main = process.argv[1]) { // even after the event loop stops running. return cascadedLoader.import(mainURL, undefined, { __proto__: null }, true); }); - } else { - // Module._load is the monkey-patchable CJS module loader. - const { Module } = require('internal/modules/cjs/loader'); - Module._load(main, null, true); } } diff --git a/src/node_contextify.cc b/src/node_contextify.cc index 09079e963c7036..1bc99765ba9f6e 100644 --- a/src/node_contextify.cc +++ b/src/node_contextify.cc @@ -345,6 +345,7 @@ void ContextifyContext::CreatePerIsolateProperties( SetMethod(isolate, target, "makeContext", MakeContext); SetMethod(isolate, target, "compileFunction", CompileFunction); SetMethod(isolate, target, "containsModuleSyntax", ContainsModuleSyntax); + SetMethod(isolate, target, "shouldRetryAsESM", ShouldRetryAsESM); } void ContextifyContext::RegisterExternalReferences( @@ -352,6 +353,7 @@ void ContextifyContext::RegisterExternalReferences( registry->Register(MakeContext); registry->Register(CompileFunction); registry->Register(ContainsModuleSyntax); + registry->Register(ShouldRetryAsESM); registry->Register(PropertyGetterCallback); registry->Register(PropertySetterCallback); registry->Register(PropertyDescriptorCallback); @@ -1404,7 +1406,7 @@ Local ContextifyContext::CompileFunctionAndCacheResult( // While top-level `await` is not permitted in CommonJS, it returns the same // error message as when `await` is used in a sync function, so we don't use it // as a disambiguation. -constexpr std::array esm_syntax_error_messages = { +static std::vector esm_syntax_error_messages = { "Cannot use import statement outside a module", // `import` statements "Unexpected token 'export'", // `export` statements "Cannot use 'import.meta' outside a module"}; // `import.meta` references @@ -1419,7 +1421,7 @@ constexpr std::array esm_syntax_error_messages = { // - Top-level `await`: if the user writes `await` at the top level of a // CommonJS module, it will throw a syntax error; but the same code is valid // in ESM. -constexpr std::array throws_only_in_cjs_error_messages = { +static std::vector throws_only_in_cjs_error_messages = { "Identifier 'module' has already been declared", "Identifier 'exports' has already been declared", "Identifier 'require' has already been declared", @@ -1439,6 +1441,10 @@ void ContextifyContext::ContainsModuleSyntax( env, "containsModuleSyntax needs at least 1 argument"); } + // Argument 1: source code + CHECK(args[0]->IsString()); + auto code = args[0].As(); + // Argument 2: filename; if undefined, use empty string Local filename = String::Empty(isolate); if (!args[1]->IsUndefined()) { @@ -1446,28 +1452,6 @@ void ContextifyContext::ContainsModuleSyntax( filename = args[1].As(); } - // Argument 1: source code; if undefined, read from filename in argument 2 - Local code; - if (args[0]->IsUndefined()) { - CHECK(!filename.IsEmpty()); - const char* filename_str = Utf8Value(isolate, filename).out(); - std::string contents; - int result = ReadFileSync(&contents, filename_str); - if (result != 0) { - isolate->ThrowException( - ERR_MODULE_NOT_FOUND(isolate, "Cannot read file %s", filename_str)); - return; - } - code = String::NewFromUtf8(isolate, - contents.c_str(), - v8::NewStringType::kNormal, - contents.length()) - .ToLocalChecked(); - } else { - CHECK(args[0]->IsString()); - code = args[0].As(); - } - // TODO(geoffreybooth): Centralize this rather than matching the logic in // cjs/loader.js and translators.js Local script_id = String::Concat( @@ -1497,73 +1481,95 @@ void ContextifyContext::ContainsModuleSyntax( bool should_retry_as_esm = false; if (try_catch.HasCaught() && !try_catch.HasTerminated()) { - Utf8Value message_value(env->isolate(), try_catch.Message()->Get()); - auto message = message_value.ToStringView(); + should_retry_as_esm = + ContextifyContext::ShouldRetryAsESMInternal(env, code); + } + args.GetReturnValue().Set(should_retry_as_esm); +} - for (const auto& error_message : esm_syntax_error_messages) { - if (message.find(error_message) != std::string_view::npos) { - should_retry_as_esm = true; - break; - } - } +void ContextifyContext::ShouldRetryAsESM( + const FunctionCallbackInfo& args) { + Environment* env = Environment::GetCurrent(args); + + CHECK_EQ(args.Length(), 1); // code + + // Argument 1: source code + Local code; + CHECK(args[0]->IsString()); + code = args[0].As(); - if (!should_retry_as_esm) { - for (const auto& error_message : throws_only_in_cjs_error_messages) { - if (message.find(error_message) != std::string_view::npos) { - // Try parsing again where the CommonJS wrapper is replaced by an - // async function wrapper. If the new parse succeeds, then the error - // was caused by either a top-level declaration of one of the CommonJS - // module variables, or a top-level `await`. - TryCatchScope second_parse_try_catch(env); - code = - String::Concat(isolate, - String::NewFromUtf8(isolate, "(async function() {") - .ToLocalChecked(), - code); - code = String::Concat( - isolate, - code, - String::NewFromUtf8(isolate, "})();").ToLocalChecked()); - ScriptCompiler::Source wrapped_source = GetCommonJSSourceInstance( - isolate, code, filename, 0, 0, host_defined_options, nullptr); - std::ignore = ScriptCompiler::CompileFunction( - context, - &wrapped_source, - params.size(), - params.data(), - 0, - nullptr, - options, - v8::ScriptCompiler::NoCacheReason::kNoCacheNoReason); - if (!second_parse_try_catch.HasTerminated()) { - if (second_parse_try_catch.HasCaught()) { - // If on the second parse an error is thrown by ESM syntax, then - // what happened was that the user had top-level `await` or a - // top-level declaration of one of the CommonJS module variables - // above the first `import` or `export`. - Utf8Value second_message_value( - env->isolate(), second_parse_try_catch.Message()->Get()); - auto second_message = second_message_value.ToStringView(); - for (const auto& error_message : esm_syntax_error_messages) { - if (second_message.find(error_message) != - std::string_view::npos) { - should_retry_as_esm = true; - break; - } - } - } else { - // No errors thrown in the second parse, so most likely the error - // was caused by a top-level `await` or a top-level declaration of - // one of the CommonJS module variables. - should_retry_as_esm = true; - } - } - break; + bool should_retry_as_esm = + ContextifyContext::ShouldRetryAsESMInternal(env, code); + + args.GetReturnValue().Set(should_retry_as_esm); +} + +bool ContextifyContext::ShouldRetryAsESMInternal(Environment* env, + Local code) { + Isolate* isolate = env->isolate(); + + Local script_id = + FIXED_ONE_BYTE_STRING(isolate, "[retry_as_esm_check]"); + Local id_symbol = Symbol::New(isolate, script_id); + + Local host_defined_options = + GetHostDefinedOptions(isolate, id_symbol); + ScriptCompiler::Source source = + GetCommonJSSourceInstance(isolate, + code, + script_id, // filename + 0, // line offset + 0, // column offset + host_defined_options, + nullptr); // cached_data + + TryCatchScope try_catch(env); + ShouldNotAbortOnUncaughtScope no_abort_scope(env); + + // Try parsing where instead of the CommonJS wrapper we use an async function + // wrapper. If the parse succeeds, then any CommonJS parse error for this + // module was caused by either a top-level declaration of one of the CommonJS + // module variables, or a top-level `await`. + code = String::Concat( + isolate, FIXED_ONE_BYTE_STRING(isolate, "(async function() {"), code); + code = String::Concat(isolate, code, FIXED_ONE_BYTE_STRING(isolate, "})();")); + + ScriptCompiler::Source wrapped_source = GetCommonJSSourceInstance( + isolate, code, script_id, 0, 0, host_defined_options, nullptr); + + Local context = env->context(); + std::vector> params = GetCJSParameters(env->isolate_data()); + USE(ScriptCompiler::CompileFunction( + context, + &wrapped_source, + params.size(), + params.data(), + 0, + nullptr, + ScriptCompiler::kNoCompileOptions, + v8::ScriptCompiler::NoCacheReason::kNoCacheNoReason)); + + if (!try_catch.HasTerminated()) { + if (try_catch.HasCaught()) { + // If on the second parse an error is thrown by ESM syntax, then + // what happened was that the user had top-level `await` or a + // top-level declaration of one of the CommonJS module variables + // above the first `import` or `export`. + Utf8Value message_value(env->isolate(), try_catch.Message()->Get()); + auto message_view = message_value.ToStringView(); + for (const auto& error_message : esm_syntax_error_messages) { + if (message_view.find(error_message) != std::string_view::npos) { + return true; } } + } else { + // No errors thrown in the second parse, so most likely the error + // was caused by a top-level `await` or a top-level declaration of + // one of the CommonJS module variables. + return true; } } - args.GetReturnValue().Set(should_retry_as_esm); + return false; } static void CompileFunctionForCJSLoader( @@ -1724,6 +1730,7 @@ static void CreatePerContextProperties(Local target, Local constants = Object::New(env->isolate()); Local measure_memory = Object::New(env->isolate()); Local memory_execution = Object::New(env->isolate()); + Local syntax_detection_errors = Object::New(env->isolate()); { Local memory_mode = Object::New(env->isolate()); @@ -1744,6 +1751,25 @@ static void CreatePerContextProperties(Local target, READONLY_PROPERTY(constants, "measureMemory", measure_memory); + { + Local esm_syntax_error_messages_array = + ToV8Value(context, esm_syntax_error_messages).ToLocalChecked(); + READONLY_PROPERTY(syntax_detection_errors, + "esmSyntaxErrorMessages", + esm_syntax_error_messages_array); + } + + { + Local throws_only_in_cjs_error_messages_array = + ToV8Value(context, throws_only_in_cjs_error_messages).ToLocalChecked(); + READONLY_PROPERTY(syntax_detection_errors, + "throwsOnlyInCommonJSErrorMessages", + throws_only_in_cjs_error_messages_array); + } + + READONLY_PROPERTY( + constants, "syntaxDetectionErrors", syntax_detection_errors); + target->Set(context, env->constants_string(), constants).Check(); } diff --git a/src/node_contextify.h b/src/node_contextify.h index e96df803b7ec2a..66a9f765fe9212 100644 --- a/src/node_contextify.h +++ b/src/node_contextify.h @@ -106,6 +106,9 @@ class ContextifyContext : public BaseObject { const v8::ScriptCompiler::Source& source); static void ContainsModuleSyntax( const v8::FunctionCallbackInfo& args); + static void ShouldRetryAsESM(const v8::FunctionCallbackInfo& args); + static bool ShouldRetryAsESMInternal(Environment* env, + v8::Local code); static void WeakCallback( const v8::WeakCallbackInfo& data); static void PropertyGetterCallback(