From 4223ee1ad4490e94a6273b24413cb3a44f7a6bbb Mon Sep 17 00:00:00 2001 From: David Coles Date: Mon, 27 May 2019 22:43:04 -0700 Subject: [PATCH 1/2] Add RegEx support using RE2 Introduces 5 new built-in methods to the stdlib: - `regexFullMatch(pattern, str)` -- Full match regex - `regexPartialMatch(pattern, str)` -- Partial match regex - `regexQuoteMeta(str)` -- Escape regex metacharacters - `regexReplace(str, pattern, to)` -- Replace single occurrence using regex - `regexGlobalReplace(str, pattern, to)` -- Replace globally using regex Since both `regexFullMatch` and `regexPartialMatch` can perform captures these functions return a "match" object upon match or `null` otherwise. For example: ``` $ ./jsonnet -e 'std.regexFullMatch("h(?P<mid>.*)o", "hello")' { "captures": [ "ell" ], "namedCaptures": { "mid": "ell" }, "string": "hello" } ``` Introduces a dependency on RE2 2019-06-01. Builds tested using make, CMake and Bazel on Ubuntu 18.04. --- .travis.yml | 2 + CMakeLists.txt | 45 +++++- ...ists.txt.in => GoogleTestCMakeLists.txt.in | 0 Makefile | 7 +- RE2CMakeLists.txt.in | 18 +++ WORKSPACE | 11 +- core/BUILD | 1 + core/CMakeLists.txt | 9 +- core/desugarer.cpp | 7 +- core/vm.cpp | 133 ++++++++++++++++++ test_suite/stdlib.jsonnet | 71 +++++++++- 11 files changed, 289 insertions(+), 15 deletions(-) rename CMakeLists.txt.in => GoogleTestCMakeLists.txt.in (100%) create mode 100644 RE2CMakeLists.txt.in diff --git a/.travis.yml b/.travis.yml index 96ee51e77..6cbd626d6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,6 +9,7 @@ matrix: apt: packages: - g++-5 + - libre2-dev sources: &sources - llvm-toolchain-bionic-11 - sourceline: 'deb https://apt.llvm.org/bionic/ llvm-toolchain-bionic-11 main' @@ -20,6 +21,7 @@ matrix: apt: packages: - clang-11 + - libre2-dev sources: *sources - os: osx osx_image: xcode11.3 diff --git a/CMakeLists.txt b/CMakeLists.txt index 5df20ca27..971afc6b7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,9 +45,50 @@ endif() set(CMAKE_CXX_STANDARD
11) +# Include external RE2 project. This runs a CMake sub-script +# (RE2CMakeLists.txt.in) that downloads RE2 source. It's then built as part +# of the jsonnet project. The conventional way of handling CMake dependencies is +# to use a find_package script, which finds and installs the library from +# known locations on the local machine. Downloading the library ourselves +# allows us to pin to a specific version and makes things easier for users +# who don't have package managers. + +# Generate and download RE2 project. +set(RE2_DIR ${GLOBAL_OUTPUT_PATH}/re2-download) +configure_file(RE2CMakeLists.txt.in ${RE2_DIR}/CMakeLists.txt) +execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . + RESULT_VARIABLE result + WORKING_DIRECTORY ${RE2_DIR} +) +if(result) + message(FATAL_ERROR "RE2 download failed: ${result}") +endif() + +# Build the generated download project (its configure/build/install commands are empty, so this only fetches RE2). +execute_process(COMMAND ${CMAKE_COMMAND} --build . + RESULT_VARIABLE result + WORKING_DIRECTORY ${RE2_DIR}) +if(result) + message(FATAL_ERROR "Build step for re2 failed: ${result}") +endif() + +# Add RE2 directly to our build. This defines +# the re2 target. +add_subdirectory(${GLOBAL_OUTPUT_PATH}/re2-src + ${GLOBAL_OUTPUT_PATH}/re2-build) + +# Include RE2 headers; they are included as "re2/re2.h", so the RE2 source root is the include root. +include_directories("${RE2_SOURCE_DIR}") + +# Allow linking into a shared library. +set_property(TARGET re2 PROPERTY POSITION_INDEPENDENT_CODE ON) + +# RE2 requires pthreads on UNIX. $<BOOL:...> keeps the expression valid when UNIX is unset; APPEND preserves RE2's own usage requirements. +set_property(TARGET re2 APPEND PROPERTY INTERFACE_COMPILE_OPTIONS "$<$<BOOL:${UNIX}>:-pthread>") +set_property(TARGET re2 APPEND PROPERTY INTERFACE_LINK_LIBRARIES "$<$<BOOL:${UNIX}>:-pthread>") # Include external googletest project. This runs a CMake sub-script -# (CMakeLists.txt.in) that downloads googletest source. It's then built as part +# (GoogleTestCMakeLists.txt.in) that downloads googletest source. It's then built as part # of the jsonnet project.
The conventional way of handling CMake dependencies is # to use a find_package script, which finds and installs the library from # known locations on the local machine. Downloading the library ourselves @@ -58,7 +99,7 @@ if (BUILD_TESTS AND NOT USE_SYSTEM_GTEST) # Generate and download googletest project. set(GOOGLETEST_DIR ${GLOBAL_OUTPUT_PATH}/googletest-download) - configure_file(CMakeLists.txt.in ${GOOGLETEST_DIR}/CMakeLists.txt) + configure_file(GoogleTestCMakeLists.txt.in ${GOOGLETEST_DIR}/CMakeLists.txt) execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . RESULT_VARIABLE result WORKING_DIRECTORY ${GOOGLETEST_DIR} diff --git a/CMakeLists.txt.in b/GoogleTestCMakeLists.txt.in similarity index 100% rename from CMakeLists.txt.in rename to GoogleTestCMakeLists.txt.in diff --git a/Makefile b/Makefile index 262e791ba..c302ffb30 100644 --- a/Makefile +++ b/Makefile @@ -32,8 +32,7 @@ CXXFLAGS += -Iinclude -Ithird_party/md5 -Ithird_party/json -Ithird_party/rapidya CFLAGS ?= -g $(OPT) -Wall -Wextra -pedantic -std=c99 -fPIC CFLAGS += -Iinclude MAKEDEPENDFLAGS += -Iinclude -Ithird_party/md5 -Ithird_party/json -Ithird_party/rapidyaml/rapidyaml/src/ -Ithird_party/rapidyaml/rapidyaml/ext/c4core/src/ -LDFLAGS ?= - +LDFLAGS += -lre2 # appended (not ?=) so an LDFLAGS set in the environment still links RE2 SHARED_LDFLAGS ?= -shared @@ -160,11 +159,11 @@ core/desugarer.cpp: core/std.jsonnet.h # Commandline executable. jsonnet: cmd/jsonnet.cpp cmd/utils.cpp $(LIB_OBJ) - $(CXX) $(CXXFLAGS) $(LDFLAGS) $< cmd/utils.cpp $(LIB_SRC:.cpp=.o) -o $@ + $(CXX) $(CXXFLAGS) $< cmd/utils.cpp $(LIB_SRC:.cpp=.o) -o $@ $(LDFLAGS) # Commandline executable (reformatter). jsonnetfmt: cmd/jsonnetfmt.cpp cmd/utils.cpp $(LIB_OBJ) - $(CXX) $(CXXFLAGS) $(LDFLAGS) $< cmd/utils.cpp $(LIB_SRC:.cpp=.o) -o $@ + $(CXX) $(CXXFLAGS) $< cmd/utils.cpp $(LIB_SRC:.cpp=.o) -o $@ $(LDFLAGS) # C binding.
libjsonnet.so.$(VERSION): $(LIB_OBJ) diff --git a/RE2CMakeLists.txt.in b/RE2CMakeLists.txt.in new file mode 100644 index 000000000..808b92359 --- /dev/null +++ b/RE2CMakeLists.txt.in @@ -0,0 +1,18 @@ +# CMake script run at generation time. This must be separate from the main +# CMakeLists.txt file to allow downloading and building RE2 at generation +# time. +cmake_minimum_required(VERSION 2.8.2) + +project(re2-download NONE) + +include(ExternalProject) +ExternalProject_Add(re2 + GIT_REPOSITORY https://github.com/google/re2.git + GIT_TAG 2019-06-01 + SOURCE_DIR "${GLOBAL_OUTPUT_PATH}/re2-src" + BINARY_DIR "${GLOBAL_OUTPUT_PATH}/re2-build" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) \ No newline at end of file diff --git a/WORKSPACE b/WORKSPACE index 97dfb640e..9e2d27333 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -12,15 +12,22 @@ git_repository( git_repository( name = "com_google_googletest", remote = "https://github.com/google/googletest.git", - # If updating googletest version, also update CMakeLists.txt.in. + # If updating googletest version, also update GoogleTestCMakeLists.txt.in. commit = "2fe3bd994b3189899d93f1d5a881e725e046fdc2", # release: release-1.8.1 shallow_since = "1535728917 -0400", ) +git_repository( + name = "com_googlesource_code_re2", + remote = "https://github.com/google/re2.git", + # If updating RE2 version, also update RE2CMakeLists.txt.in. + commit = "0c95bcce2f1f0f071a786ca2c42384b211b8caba", # release: 2019-06-01 + shallow_since = "1558525654 +0000", +) + # This allows using py_test and py_library against python3. register_toolchains("//platform_defs:default_python3_toolchain") # This allows building C++ against python3 headers.
load("//tools/build_defs:python_repo.bzl", "python_headers") python_headers(name = "default_python3_headers") - diff --git a/core/BUILD b/core/BUILD index 06a5f6773..977abba5f 100644 --- a/core/BUILD +++ b/core/BUILD @@ -37,6 +37,7 @@ cc_library( "//third_party/json", "//third_party/md5:libmd5", "//third_party/rapidyaml:ryml", + "@com_googlesource_code_re2//:re2", ], ) diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index e62a85811..4cd04e860 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -29,15 +29,14 @@ set(LIBJSONNET_SOURCE vm.cpp) add_library(libjsonnet SHARED ${LIBJSONNET_HEADERS} ${LIBJSONNET_SOURCE}) -add_dependencies(libjsonnet md5 stdlib) -target_link_libraries(libjsonnet md5 nlohmann_json::nlohmann_json ryml) +add_dependencies(libjsonnet md5 re2 stdlib) +target_link_libraries(libjsonnet md5 nlohmann_json::nlohmann_json ryml re2) file(STRINGS ${CMAKE_CURRENT_SOURCE_DIR}/../include/libjsonnet.h JSONNET_VERSION_DEF REGEX "[#]define[ \t]+LIB_JSONNET_VERSION[ \t]+") string(REGEX REPLACE ".*\"v([^\"]+)\".*" "\\1" JSONNET_VERSION ${JSONNET_VERSION_DEF}) message("Extracted Jsonnet version: " ${JSONNET_VERSION}) - # CMake prepends CMAKE_SHARED_LIBRARY_PREFIX to shared libraries, so without # this step the output would be |liblibjsonnet|. set_target_properties(libjsonnet PROPERTIES OUTPUT_NAME jsonnet @@ -54,8 +53,8 @@ target_include_directories(libjsonnet INTERFACE if (BUILD_STATIC_LIBS) # Static library for jsonnet command-line tool. 
add_library(libjsonnet_static STATIC ${LIBJSONNET_SOURCE}) - add_dependencies(libjsonnet_static md5 stdlib) - target_link_libraries(libjsonnet_static md5 nlohmann_json::nlohmann_json ryml) + add_dependencies(libjsonnet_static md5 re2 stdlib) + target_link_libraries(libjsonnet_static md5 nlohmann_json::nlohmann_json ryml re2) set_target_properties(libjsonnet_static PROPERTIES OUTPUT_NAME jsonnet) install(TARGETS libjsonnet_static DESTINATION "${CMAKE_INSTALL_LIBDIR}") target_include_directories(libjsonnet_static INTERFACE diff --git a/core/desugarer.cpp b/core/desugarer.cpp index 956be9eab..db63c92de 100644 --- a/core/desugarer.cpp +++ b/core/desugarer.cpp @@ -34,7 +34,7 @@ struct BuiltinDecl { std::vector params; }; -static unsigned long max_builtin = 38; +static unsigned long max_builtin = 43; BuiltinDecl jsonnet_builtin_decl(unsigned long builtin) { switch (builtin) { @@ -77,6 +77,11 @@ BuiltinDecl jsonnet_builtin_decl(unsigned long builtin) case 36: return {U"parseYaml", {U"str"}}; case 37: return {U"encodeUTF8", {U"str"}}; case 38: return {U"decodeUTF8", {U"arr"}}; + case 39: return {U"regexFullMatch", {U"pattern", U"str"}}; + case 40: return {U"regexPartialMatch", {U"pattern", U"str"}}; + case 41: return {U"regexQuoteMeta", {U"str"}}; + case 42: return {U"regexReplace", {U"str", U"pattern", U"to"}}; + case 43: return {U"regexGlobalReplace", {U"str", U"pattern", U"to"}}; default: std::cerr << "INTERNAL ERROR: Unrecognized builtin function: " << builtin << std::endl; std::abort(); diff --git a/core/vm.cpp b/core/vm.cpp index 464170d05..e9673a88c 100644 --- a/core/vm.cpp +++ b/core/vm.cpp @@ -28,6 +28,7 @@ limitations under the License. #include "parser.h" #include "ryml_std.hpp" // include this before any other ryml header #include "ryml.hpp" +#include "re2/re2.h" #include "state.h" #include "static_analysis.h" #include "string_utils.h" @@ -49,6 +50,10 @@ using json = nlohmann::json; namespace { +static const Fodder EF; // Empty fodder. 
+ +static const LocationRange E; // Empty. + /** Turn a path e.g. "/a/b/c" into a dir, e.g. "/a/b/". If there is no path returns "". */ std::string dir_name(const std::string &path) @@ -938,6 +943,11 @@ class Interpreter { builtins["parseYaml"] = &Interpreter::builtinParseYaml; builtins["encodeUTF8"] = &Interpreter::builtinEncodeUTF8; builtins["decodeUTF8"] = &Interpreter::builtinDecodeUTF8; + builtins["regexFullMatch"] = &Interpreter::builtinRegexFullMatch; + builtins["regexPartialMatch"] = &Interpreter::builtinRegexPartialMatch; + builtins["regexQuoteMeta"] = &Interpreter::builtinRegexQuoteMeta; + builtins["regexReplace"] = &Interpreter::builtinRegexReplace; + builtins["regexGlobalReplace"] = &Interpreter::builtinRegexGlobalReplace; DesugaredObject *stdlib = makeStdlibAST(alloc, "__internal__"); jsonnet_static_analysis(stdlib); @@ -1440,6 +1450,129 @@ class Interpreter { return decodeUTF8(); } + const AST *regexMatch(const std::string &pattern, const std::string &string, bool full) + { + RE2 re(pattern, RE2::CannedOptions::Quiet); + if (!re.ok()) { + std::stringstream ss; + ss << "Invalid regex '" << re.pattern() << "': " << re.error(); + throw makeError(stack.top().location, ss.str()); + } + + int num_groups = re.NumberOfCapturingGroups(); + + std::vector rcaptures(num_groups); + std::vector rargv(num_groups); + std::vector rargs(num_groups); + for (int i = 0; i < num_groups; ++i) { + rargs[i] = &rargv[i]; + rargv[i] = &rcaptures[i]; + } + + if (full ? 
RE2::FullMatchN(string, re, rargs.data(), num_groups) + : RE2::PartialMatchN(string, re, rargs.data(), num_groups)) { + std::map fields; + + const Identifier *fid = alloc->makeIdentifier(U"string"); + fields[fid].hide = ObjectField::VISIBLE; + fields[fid].body = alloc->make(E, EF, decode_utf8(string), LiteralString::DOUBLE, "", ""); + + fid = alloc->makeIdentifier(U"captures"); + fields[fid].hide = ObjectField::VISIBLE; + std::vector captures; + for (int i = 0; i < num_groups; ++i) { + captures.push_back(Array::Element( + alloc->make(E, EF, decode_utf8(rcaptures[i]), LiteralString::DOUBLE, "", ""), + EF)); + } + fields[fid].body = alloc->make(E, EF, captures, false, EF); + + fid = alloc->makeIdentifier(U"namedCaptures"); + fields[fid].hide = ObjectField::VISIBLE; + DesugaredObject::Fields named_captures; + const std::map &named_groups = re.NamedCapturingGroups(); + for (auto it = named_groups.cbegin(); it != named_groups.cend(); ++it) { + named_captures.push_back(DesugaredObject::Field( + ObjectField::VISIBLE, + alloc->make(E, EF, decode_utf8(it->first), LiteralString::DOUBLE, "", ""), + alloc->make(E, EF, decode_utf8(rcaptures[it->second-1]), LiteralString::DOUBLE, "", ""))); + } + fields[fid].body = alloc->make(E, ASTs{}, named_captures); + + scratch = makeObject(BindingFrame{}, fields, ASTs{}); + } else { + scratch = makeNull(); + } + return nullptr; + } + + const AST *builtinRegexFullMatch(const LocationRange &loc, const std::vector &args) + { + validateBuiltinArgs(loc, "regexFullMatch", args, {Value::STRING, Value::STRING}); + + std::string pattern = encode_utf8(static_cast(args[0].v.h)->value); + std::string string = encode_utf8(static_cast(args[1].v.h)->value); + + return regexMatch(pattern, string, true); + } + + const AST *builtinRegexPartialMatch(const LocationRange &loc, const std::vector &args) + { + validateBuiltinArgs(loc, "regexPartialMatch", args, {Value::STRING, Value::STRING}); + + std::string pattern = 
encode_utf8(static_cast(args[0].v.h)->value); + std::string string = encode_utf8(static_cast(args[1].v.h)->value); + + return regexMatch(pattern, string, false); + } + + const AST *builtinRegexQuoteMeta(const LocationRange &loc, const std::vector &args) + { + validateBuiltinArgs(loc, "regexQuoteMeta", args, {Value::STRING}); + scratch = makeString(decode_utf8(RE2::QuoteMeta(encode_utf8(static_cast(args[0].v.h)->value)))); + return nullptr; + } + + const AST *builtinRegexReplace(const LocationRange &loc, const std::vector &args) + { + validateBuiltinArgs(loc, "regexReplace", args, {Value::STRING, Value::STRING, Value::STRING}); + + std::string string = encode_utf8(static_cast(args[0].v.h)->value); + std::string pattern = encode_utf8(static_cast(args[1].v.h)->value); + std::string replace = encode_utf8(static_cast(args[2].v.h)->value); + + RE2 re(pattern, RE2::CannedOptions::Quiet); + if(!re.ok()) { + std::stringstream ss; + ss << "Invalid regex '" << re.pattern() << "': " << re.error(); + throw makeError(stack.top().location, ss.str()); + } + + RE2::Replace(&string, re, replace); + scratch = makeString(decode_utf8(string)); + return nullptr; + } + + const AST *builtinRegexGlobalReplace(const LocationRange &loc, const std::vector &args) + { + validateBuiltinArgs(loc, "regexGlobalReplace", args, {Value::STRING, Value::STRING, Value::STRING}); + + std::string string = encode_utf8(static_cast(args[0].v.h)->value); + std::string pattern = encode_utf8(static_cast(args[1].v.h)->value); + std::string replace = encode_utf8(static_cast(args[2].v.h)->value); + + RE2 re(pattern, RE2::CannedOptions::Quiet); + if(!re.ok()) { + std::stringstream ss; + ss << "Invalid regex '" << re.pattern() << "': " << re.error(); + throw makeError(stack.top().location, ss.str()); + } + + RE2::GlobalReplace(&string, re, replace); + scratch = makeString(decode_utf8(string)); + return nullptr; + } + const AST *builtinTrace(const LocationRange &loc, const std::vector &args) { if(args[0].t != 
Value::STRING) { diff --git a/test_suite/stdlib.jsonnet b/test_suite/stdlib.jsonnet index bf77a859a..599f4dad5 100644 --- a/test_suite/stdlib.jsonnet +++ b/test_suite/stdlib.jsonnet @@ -1521,7 +1521,6 @@ std.assertEqual(std.decodeUTF8([65 + 1 - 1]), 'A') && std.assertEqual(std.decodeUTF8([90, 97, 197, 188, 195, 179, 197, 130, 196, 135, 32, 103, 196, 153, 197, 155, 108, 196, 133, 32, 106, 97, 197, 186, 197, 132]), 'Zażółć gęślą jaźń') && std.assertEqual(std.decodeUTF8([240, 159, 152, 131]), '😃') && - std.assertEqual(std.any([true, false]), true) && std.assertEqual(std.any([false, false]), false) && std.assertEqual(std.any([]), false) && @@ -1530,4 +1529,74 @@ std.assertEqual(std.all([true, false]), false) && std.assertEqual(std.all([true, true]), true) && std.assertEqual(std.all([]), true) && +std.assertEqual(std.regexFullMatch(@'e', 'hello'), null) && + +std.assertEqual( + std.regexFullMatch(@'h.*o', 'hello'), + { + string: 'hello', + captures: [], + namedCaptures: {}, + } +) && + +std.assertEqual( + std.regexFullMatch(@'h(.*)o', 'hello'), + { + string: 'hello', + captures: ['ell'], + namedCaptures: {}, + } +) && + +std.assertEqual( + std.regexFullMatch(@'h(?P<mid>.*)o', 'hello'), + { + string: 'hello', + captures: ['ell'], + namedCaptures: { + mid: 'ell', + }, + } +) && + +std.assertEqual(std.regexPartialMatch(@'world', 'hello'), null) && + +std.assertEqual( + std.regexPartialMatch(@'e', 'hello'), + { + string: 'hello', + captures: [], + namedCaptures: {}, + } +) && + +std.assertEqual( + std.regexPartialMatch(@'e(.*)o', 'hello'), + { + string: 'hello', + captures: ['ll'], + namedCaptures: {}, + } +) && + +std.assertEqual( + std.regexPartialMatch(@'e(?P<mid>.*)o', 'hello'), + { + string: 'hello', + captures: ['ll'], + namedCaptures: { + mid: 'll', + }, + } +) && + +std.assertEqual(std.regexQuoteMeta(@'1.5-2.0?'), '1\\.5\\-2\\.0\\?') && + +std.assertEqual(std.regexReplace('wishyfishyisishy', @'ish', 'and'), 'wandyfishyisishy') && +std.assertEqual(std.regexReplace('yabba dabba
doo', @'b+', 'd'), 'yada dabba doo') && + +std.assertEqual(std.regexGlobalReplace('wishyfishyisishy', @'ish', 'and'), 'wandyfandyisandy') && +std.assertEqual(std.regexGlobalReplace('yabba dabba doo', @'b+', 'd'), 'yada dada doo') && + true From e548dd9f5550ad406e180a0c9d5ef02b2a505789 Mon Sep 17 00:00:00 2001 From: Duologic Date: Tue, 29 Nov 2022 20:18:04 +0100 Subject: [PATCH 2/2] chore: use newer google/re2 version --- RE2CMakeLists.txt.in | 2 +- WORKSPACE | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/RE2CMakeLists.txt.in b/RE2CMakeLists.txt.in index 808b92359..2b0d53ac1 100644 --- a/RE2CMakeLists.txt.in +++ b/RE2CMakeLists.txt.in @@ -8,7 +8,7 @@ project(re2-download NONE) include(ExternalProject) ExternalProject_Add(re2 GIT_REPOSITORY https://github.com/google/re2.git - GIT_TAG 2019-06-01 + GIT_TAG 2022-06-01 SOURCE_DIR "${GLOBAL_OUTPUT_PATH}/re2-src" BINARY_DIR "${GLOBAL_OUTPUT_PATH}/re2-build" CONFIGURE_COMMAND "" diff --git a/WORKSPACE b/WORKSPACE index 9e2d27333..3357de04c 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -21,7 +21,7 @@ git_repository( name = "com_googlesource_code_re2", remote = "https://github.com/google/re2.git", # If updating RE2 version, also update RE2CMakeLists.txt.in. - commit = "0c95bcce2f1f0f071a786ca2c42384b211b8caba", # release: 2019-06-01 + commit = "5723bb8950318135ed9cf4fc76bed988a087f536", # release: 2022-06-01 shallow_since = "1558525654 +0000", ) @@ -31,3 +31,4 @@ register_toolchains("//platform_defs:default_python3_toolchain") # This allows building C++ against python3 headers. load("//tools/build_defs:python_repo.bzl", "python_headers") python_headers(name = "default_python3_headers") +