From 9154949fb24a9416e1392e0681c6961dae7fffb7 Mon Sep 17 00:00:00 2001 From: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Thu, 7 Nov 2024 22:04:31 -0500 Subject: [PATCH 01/10] fix(taskfiles): Trim trailing slash from URL prefix in `download-and-extract-tar` (fixes #577). (#578) --- Taskfile.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Taskfile.yml b/Taskfile.yml index 5912bd579..02786f9af 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -348,7 +348,7 @@ tasks: - "mkdir -p '{{.OUTPUT_TMP_DIR}}'" - >- curl --fail --location --show-error - "{{.URL_PREFIX}}/{{.TAR_NAME}}" + "{{trimSuffix "/" .URL_PREFIX}}/{{.TAR_NAME}}" --output "{{.TAR_PATH}}" - "tar xf '{{.TAR_PATH}}' --directory '{{.OUTPUT_TMP_DIR}}'" - "mv '{{.EXTRACTED_DIR}}' '{{.OUTPUT_DIR}}'" From 31de766ecc3175b1fa472d12881587e3673294de Mon Sep 17 00:00:00 2001 From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> Date: Fri, 8 Nov 2024 03:58:28 -0500 Subject: [PATCH 02/10] fix(ffi): Correct `clp::ffi::ir_stream::Deserializer::deserialize_next_ir_unit`'s return value when failing to read the next IR unit's type tag. (#579) --- components/core/src/clp/ffi/ir_stream/Deserializer.hpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/components/core/src/clp/ffi/ir_stream/Deserializer.hpp b/components/core/src/clp/ffi/ir_stream/Deserializer.hpp index 45cca8f26..3418a39ae 100644 --- a/components/core/src/clp/ffi/ir_stream/Deserializer.hpp +++ b/components/core/src/clp/ffi/ir_stream/Deserializer.hpp @@ -14,7 +14,6 @@ #include "../../ReaderInterface.hpp" #include "../../time_types.hpp" -#include "../KeyValuePairLogEvent.hpp" #include "../SchemaTree.hpp" #include "decoding_methods.hpp" #include "ir_unit_deserialization_methods.hpp" @@ -66,8 +65,8 @@ class Deserializer { /** * Deserializes the stream from the given reader up to and including the next log event IR unit. * @param reader - * @return std::errc::no_message_available if no tag bytes can be read to determine the next IR - * unit type. + * @return Forwards `deserialize_tag`s return values if no tag bytes can be read to determine + * the next IR unit type. * @return std::errc::protocol_not_supported if the IR unit type is not supported. * @return std::errc::operation_not_permitted if the deserializer already reached the end of * stream by deserializing an end-of-stream IR unit in the previous calls. @@ -172,8 +171,8 @@ auto Deserializer::deserialize_next_ir_unit(ReaderInterface& read } encoded_tag_t tag{}; - if (IRErrorCode::IRErrorCode_Success != deserialize_tag(reader, tag)) { - return std::errc::no_message_available; + if (auto const err{deserialize_tag(reader, tag)}; IRErrorCode::IRErrorCode_Success != err) { + return ir_error_code_to_errc(err); } auto const optional_ir_unit_type{get_ir_unit_type_from_tag(tag)}; From 7ea39416b713c0214d2f045f9af9d1822c61cdfa Mon Sep 17 00:00:00 2001 From: Junhao Liao Date: Fri, 8 Nov 2024 12:21:09 -0500 Subject: [PATCH 03/10] fix(taskfiles): Update `yscope-log-viewer` sources in `log-viewer-webui-clients` sources list (fixes #576). (#580) --- Taskfile.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Taskfile.yml b/Taskfile.yml index 02786f9af..5c20ef460 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -228,10 +228,10 @@ tasks: - "client/src/**/*.css" - "client/src/**/*.jsx" - "client/src/webpack.config.js" - - "yscope-log-viewer/.babelrc" - - "yscope-log-viewer/customized-packages/**/*" - "yscope-log-viewer/package.json" + - "yscope-log-viewer/public/**/*" - "yscope-log-viewer/src/**/*" + - "yscope-log-viewer/tsconfig.json" - "yscope-log-viewer/webpack.common.js" - "yscope-log-viewer/webpack.prod.js" dir: "components/log-viewer-webui" From 4446eb56e315c30f8ccc7a779f0b5a8cc3544575 Mon Sep 17 00:00:00 2001 From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> Date: Sat, 9 Nov 2024 23:00:11 -0500 Subject: [PATCH 04/10] fix(cmake): Add Homebrew path detection for `mariadb-connector-c` to fix macOS build failure. (#582) Co-authored-by: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com> --- .../cmake/Modules/FindMariaDBClient.cmake | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/components/core/cmake/Modules/FindMariaDBClient.cmake b/components/core/cmake/Modules/FindMariaDBClient.cmake index 543f31a6b..5801be2e6 100644 --- a/components/core/cmake/Modules/FindMariaDBClient.cmake +++ b/components/core/cmake/Modules/FindMariaDBClient.cmake @@ -20,6 +20,28 @@ include(cmake/Modules/FindLibraryDependencies.cmake) find_package(PkgConfig) pkg_check_modules(mariadbclient_PKGCONF QUIET "lib${mariadbclient_LIBNAME}") +if(NOT mariadbclient_PKGCONF_FOUND AND APPLE) + execute_process( + COMMAND brew --prefix mariadb-connector-c + RESULT_VARIABLE mariadbclient_BREW_RESULT + OUTPUT_VARIABLE mariadbclient_MACOS_PREFIX + ) + if(NOT mariadbclient_BREW_RESULT EQUAL 0) + message( + FATAL_ERROR + "pkg-config cannot find ${mariadbclient_LIBNAME} and mariadb-connector-c isn't" + " installed via Homebrew" + ) + endif() + string(STRIP "${mariadbclient_MACOS_PREFIX}" mariadbclient_MACOS_PREFIX) + list(PREPEND CMAKE_PREFIX_PATH ${mariadbclient_MACOS_PREFIX}) + pkg_check_modules(mariadbclient_PKGCONF QUIET "lib${mariadbclient_LIBNAME}") +endif() + +if(NOT mariadbclient_PKGCONF_FOUND) + message(FATAL_ERROR "pkg-config cannot find ${mariadbclient_LIBNAME}") +endif() + # Set include directory find_path(MariaDBClient_INCLUDE_DIR mysql.h HINTS ${mariadbclient_PKGCONF_INCLUDEDIR} From 5559c1ec56e4a16ca623110d67f4a9d26cdb5586 Mon Sep 17 00:00:00 2001 From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> Date: Sat, 9 Nov 2024 23:41:26 -0500 Subject: [PATCH 05/10] refactor(ffi): Make `get_schema_subtree_bitmap` a public method of `KeyValuePairLogEvent`. (#581) --- .../core/src/clp/ffi/KeyValuePairLogEvent.cpp | 80 +++++++------------ .../core/src/clp/ffi/KeyValuePairLogEvent.hpp | 12 +++ 2 files changed, 43 insertions(+), 49 deletions(-) diff --git a/components/core/src/clp/ffi/KeyValuePairLogEvent.cpp b/components/core/src/clp/ffi/KeyValuePairLogEvent.cpp index 2a84795f7..a8a8cf617 100644 --- a/components/core/src/clp/ffi/KeyValuePairLogEvent.cpp +++ b/components/core/src/clp/ffi/KeyValuePairLogEvent.cpp @@ -153,20 +153,6 @@ node_type_matches_value_type(SchemaTree::Node::Type type, Value const& value) -> KeyValuePairLogEvent::NodeIdValuePairs const& node_id_value_pairs ) -> bool; -/** - * @param node_id_value_pairs - * @param schema_tree - * @return A result containing a bitmap where every bit corresponds to the ID of a node in the - * schema tree, and the set bits correspond to the nodes in the subtree defined by all paths from - * the root node to the nodes in `node_id_value_pairs`; or an error code indicating a failure: - * - std::errc::result_out_of_range if a node ID in `node_id_value_pairs` doesn't exist in the - * schema tree. - */ -[[nodiscard]] auto get_schema_subtree_bitmap( - KeyValuePairLogEvent::NodeIdValuePairs const& node_id_value_pairs, - SchemaTree const& schema_tree -) -> OUTCOME_V2_NAMESPACE::std_result>; - /** * Inserts the given key-value pair into the JSON object (map). * @param node The schema tree node of the key to insert. @@ -283,38 +269,6 @@ auto is_leaf_node( return true; } -auto get_schema_subtree_bitmap( - KeyValuePairLogEvent::NodeIdValuePairs const& node_id_value_pairs, - SchemaTree const& schema_tree -) -> OUTCOME_V2_NAMESPACE::std_result> { - auto schema_subtree_bitmap{vector(schema_tree.get_size(), false)}; - for (auto const& [node_id, val] : node_id_value_pairs) { - if (node_id >= schema_subtree_bitmap.size()) { - return std::errc::result_out_of_range; - } - schema_subtree_bitmap[node_id] = true; - - // Iteratively mark the parents as true - auto optional_parent_id{schema_tree.get_node(node_id).get_parent_id()}; - while (true) { - // Ideally, we'd use this if statement as the loop condition, but clang-tidy will - // complain about an unchecked `optional` access. - if (false == optional_parent_id.has_value()) { - // Reached the root - break; - } - auto const parent_id{optional_parent_id.value()}; - if (schema_subtree_bitmap[parent_id]) { - // Parent already set by other child - break; - } - schema_subtree_bitmap[parent_id] = true; - optional_parent_id = schema_tree.get_node(parent_id).get_parent_id(); - } - } - return schema_subtree_bitmap; -} - auto insert_kv_pair_into_json_obj( SchemaTree::Node const& node, std::optional const& optional_val, @@ -393,6 +347,36 @@ auto KeyValuePairLogEvent::create( return KeyValuePairLogEvent{std::move(schema_tree), std::move(node_id_value_pairs), utc_offset}; } +auto KeyValuePairLogEvent::get_schema_subtree_bitmap( +) const -> OUTCOME_V2_NAMESPACE::std_result> { + auto schema_subtree_bitmap{vector(m_schema_tree->get_size(), false)}; + for (auto const& [node_id, val] : m_node_id_value_pairs) { + if (node_id >= schema_subtree_bitmap.size()) { + return std::errc::result_out_of_range; + } + schema_subtree_bitmap[node_id] = true; + + // Iteratively mark the parents as true + auto optional_parent_id{m_schema_tree->get_node(node_id).get_parent_id()}; + while (true) { + // Ideally, we'd use this if statement as the loop condition, but clang-tidy will + // complain about an unchecked `optional` access. + if (false == optional_parent_id.has_value()) { + // Reached the root + break; + } + auto const parent_id{optional_parent_id.value()}; + if (schema_subtree_bitmap[parent_id]) { + // Parent already set by other child + break; + } + schema_subtree_bitmap[parent_id] = true; + optional_parent_id = m_schema_tree->get_node(parent_id).get_parent_id(); + } + } + return schema_subtree_bitmap; +} + auto KeyValuePairLogEvent::serialize_to_json( ) const -> OUTCOME_V2_NAMESPACE::std_result { if (m_node_id_value_pairs.empty()) { @@ -409,9 +393,7 @@ auto KeyValuePairLogEvent::serialize_to_json( // vector grows). std::stack dfs_stack; - auto const schema_subtree_bitmap_ret{ - get_schema_subtree_bitmap(m_node_id_value_pairs, *m_schema_tree) - }; + auto const schema_subtree_bitmap_ret{get_schema_subtree_bitmap()}; if (schema_subtree_bitmap_ret.has_error()) { return schema_subtree_bitmap_ret.error(); } diff --git a/components/core/src/clp/ffi/KeyValuePairLogEvent.hpp b/components/core/src/clp/ffi/KeyValuePairLogEvent.hpp index 04fd31c9e..f6334d378 100644 --- a/components/core/src/clp/ffi/KeyValuePairLogEvent.hpp +++ b/components/core/src/clp/ffi/KeyValuePairLogEvent.hpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -60,6 +61,17 @@ class KeyValuePairLogEvent { [[nodiscard]] auto get_utc_offset() const -> UtcOffset { return m_utc_offset; } + /** + * @return A result containing a bitmap where every bit corresponds to the ID of a node in the + * schema tree, and the set bits correspond to the nodes in the subtree defined by all paths + * from the root node to the nodes in `node_id_value_pairs`; or an error code indicating a + * failure: + * - std::errc::result_out_of_range if a node ID in `node_id_value_pairs` doesn't exist in the + * schema tree. + */ + [[nodiscard]] auto get_schema_subtree_bitmap( + ) const -> OUTCOME_V2_NAMESPACE::std_result>; + /** * Serializes the log event into a `nlohmann::json` object. * @return A result containing the serialized JSON object or an error code indicating the From 8b3fd63fdcbc5eb3628673f8ff79f9399d244d29 Mon Sep 17 00:00:00 2001 From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> Date: Sun, 10 Nov 2024 17:38:33 -0500 Subject: [PATCH 06/10] ci: Schedule GitHub workflows to daily run to detect failures due to upgraded dependencies or environments. (#583) --- .github/workflows/clp-core-build-macos.yaml | 3 +++ .github/workflows/clp-core-build.yaml | 3 +++ .github/workflows/clp-docs.yaml | 3 +++ .github/workflows/clp-execution-image-build.yaml | 3 +++ .github/workflows/clp-lint.yaml | 2 +- 5 files changed, 13 insertions(+), 1 deletion(-) diff --git a/.github/workflows/clp-core-build-macos.yaml b/.github/workflows/clp-core-build-macos.yaml index 85d04d9c7..8196e75d8 100644 --- a/.github/workflows/clp-core-build-macos.yaml +++ b/.github/workflows/clp-core-build-macos.yaml @@ -27,6 +27,9 @@ on: - "deps-tasks.yml" - "Taskfile.yml" - "tools/scripts/deps-download/**" + schedule: + # Run daily at 00:15 UTC (the 15 is to avoid periods of high load) + - cron: "15 0 * * *" workflow_dispatch: concurrency: diff --git a/.github/workflows/clp-core-build.yaml b/.github/workflows/clp-core-build.yaml index 9046f15da..20b305f8d 100644 --- a/.github/workflows/clp-core-build.yaml +++ b/.github/workflows/clp-core-build.yaml @@ -23,6 +23,9 @@ on: - "Taskfile.yml" - "tools/scripts/deps-download/**" - "!components/core/tools/scripts/lib_install/macos/**" + schedule: + # Run daily at 00:15 UTC (the 15 is to avoid periods of high load) + - cron: "15 0 * * *" workflow_dispatch: env: diff --git a/.github/workflows/clp-docs.yaml b/.github/workflows/clp-docs.yaml index 2f0a68e77..38e4cb172 100644 --- a/.github/workflows/clp-docs.yaml +++ b/.github/workflows/clp-docs.yaml @@ -3,6 +3,9 @@ name: "clp-docs" on: pull_request: push: + schedule: + # Run daily at 00:15 UTC (the 15 is to avoid periods of high load) + - cron: "15 0 * * *" workflow_dispatch: concurrency: diff --git a/.github/workflows/clp-execution-image-build.yaml b/.github/workflows/clp-execution-image-build.yaml index d0bc5b017..058e23d5f 100644 --- a/.github/workflows/clp-execution-image-build.yaml +++ b/.github/workflows/clp-execution-image-build.yaml @@ -11,6 +11,9 @@ on: - ".github/actions/clp-execution-image-build/action.yaml" - ".github/workflows/clp-execution-image-build.yaml" - "tools/docker-images/**/*" + schedule: + # Run daily at 00:15 UTC (the 15 is to avoid periods of high load) + - cron: "15 0 * * *" workflow_dispatch: concurrency: diff --git a/.github/workflows/clp-lint.yaml b/.github/workflows/clp-lint.yaml index 75f74fe4a..bbe485c5d 100644 --- a/.github/workflows/clp-lint.yaml +++ b/.github/workflows/clp-lint.yaml @@ -4,7 +4,7 @@ on: pull_request: push: schedule: - # Run at midnight UTC every day with 15 minutes delay added to avoid high load periods + # Run daily at 00:15 UTC (the 15 is to avoid periods of high load) - cron: "15 0 * * *" workflow_dispatch: From f3b1cf61a28fec4e21e8dd2e8b5f6095ea689fc8 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Mon, 11 Nov 2024 14:34:23 -0500 Subject: [PATCH 07/10] docs: Update the required version of task. (#567) --- docs/README.md | 3 ++- docs/src/dev-guide/building-package.md | 2 +- docs/src/dev-guide/components-core/index.md | 2 +- docs/src/dev-guide/contributing-linting.md | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/README.md b/docs/README.md index 722bd625c..8f3d7207c 100644 --- a/docs/README.md +++ b/docs/README.md @@ -13,7 +13,7 @@ this project: the size of repo as we add and update images. * [Node.js] >= 16 to be able to [view the output](#viewing-the-output) * Python 3.10 or later -* [Task] >= 3.35 +* [Task] >= 3.38.0 * We constrain the version because, in lower versions, the Taskfile syntax we use has bugs. ## Build Commands @@ -44,3 +44,4 @@ the address it binds to (usually http://localhost:8080). [git-lfs]: https://git-lfs.com [http-server]: https://www.npmjs.com/package/http-server [Node.js]: https://nodejs.org/en/download/current +[Task]: https://taskfile.dev/ diff --git a/docs/src/dev-guide/building-package.md b/docs/src/dev-guide/building-package.md index c0778b1c4..6d47185f4 100644 --- a/docs/src/dev-guide/building-package.md +++ b/docs/src/dev-guide/building-package.md @@ -13,7 +13,7 @@ prebuilt version instead, check out the [releases](https://github.com/y-scope/cl * Python 3.8 or newer * python3-dev * python3-venv -* [Task](https://taskfile.dev/) +* [Task](https://taskfile.dev/) >= 3.38.0 ## Setup diff --git a/docs/src/dev-guide/components-core/index.md b/docs/src/dev-guide/components-core/index.md index 1406f1bd2..1af0fb13e 100644 --- a/docs/src/dev-guide/components-core/index.md +++ b/docs/src/dev-guide/components-core/index.md @@ -7,7 +7,7 @@ CLP core is the low-level component that performs compression, decompression, an * We have built and tested CLP on the OSes listed [below](#native-environment). * If you have trouble building for another OS, file an issue, and we may be able to help. * A compiler that supports C++20 and std::span (e.g., gcc-10) -* [Task](https://taskfile.dev/) +* [Task](https://taskfile.dev/) >= 3.38.0 To build, we require some source dependencies, packages from package managers, and libraries built from source. diff --git a/docs/src/dev-guide/contributing-linting.md b/docs/src/dev-guide/contributing-linting.md index 599935157..fb246d045 100644 --- a/docs/src/dev-guide/contributing-linting.md +++ b/docs/src/dev-guide/contributing-linting.md @@ -15,7 +15,7 @@ To run the linting tools, besides commonly installed tools like `tar`, you'll ne * `md5sum` * Python 3.8 or newer * python3-venv -* [Task] +* [Task] >= 3.38.0 ## Running the linters From 9a8bb5937970e163448a8e08239d5735a5facad2 Mon Sep 17 00:00:00 2001 From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> Date: Tue, 12 Nov 2024 18:09:10 -0500 Subject: [PATCH 08/10] docs(ffi): Update `ffi::ir_stream::Serializer`'s doc string to remove "work-in-progress". (#586) --- components/core/src/clp/ffi/ir_stream/Serializer.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp/ffi/ir_stream/Serializer.hpp b/components/core/src/clp/ffi/ir_stream/Serializer.hpp index 292b360a3..14077ffba 100644 --- a/components/core/src/clp/ffi/ir_stream/Serializer.hpp +++ b/components/core/src/clp/ffi/ir_stream/Serializer.hpp @@ -14,7 +14,7 @@ namespace clp::ffi::ir_stream { /** - * A work-in-progress class for serializing log events into the kv-pair IR format. + * Class for serializing log events into the kv-pair IR format. * * This class: * - maintains all necessary internal data structures to track serialization state; From 2dfcd8a8eac498858814e08c7d64483daabbd18a Mon Sep 17 00:00:00 2001 From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> Date: Tue, 12 Nov 2024 18:56:10 -0500 Subject: [PATCH 09/10] ci: Add GH workflow to validate PR titles follow Conventional Commits. (#587) Co-authored-by: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com> --- .github/PULL_REQUEST_TEMPLATE.md | 8 +++++--- .github/workflows/clp-pr-title-checks.yaml | 23 ++++++++++++++++++++++ 2 files changed, 28 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/clp-pr-title-checks.yaml diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 91462f325..74460716f 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,7 +1,9 @@ # Description diff --git a/.github/workflows/clp-pr-title-checks.yaml b/.github/workflows/clp-pr-title-checks.yaml new file mode 100644 index 000000000..428e9f21d --- /dev/null +++ b/.github/workflows/clp-pr-title-checks.yaml @@ -0,0 +1,23 @@ +name: "clp-pr-title-checks" + +on: + pull_request_target: + types: ["edited", "opened", "reopened"] + branches: ["main"] + +concurrency: + group: "${{github.workflow}}-${{github.ref}}" + + # Cancel in-progress jobs for efficiency + cancel-in-progress: true + +jobs: + conventional-commits: + permissions: + # For amannn/action-semantic-pull-request + pull-requests: "read" + runs-on: "ubuntu-latest" + steps: + - uses: "amannn/action-semantic-pull-request@v5" + env: + GITHUB_TOKEN: "${{secrets.GITHUB_TOKEN}}" From 53c4f52b3ae2ef0e990a8d66a7be87fd1e85cfbe Mon Sep 17 00:00:00 2001 From: Devin Gibson Date: Wed, 13 Nov 2024 11:09:23 -0500 Subject: [PATCH 10/10] feat: Add support for escaping characters in KQL key names. (#560) --- components/core/src/clp_s/JsonParser.cpp | 7 ++- .../src/clp_s/TimestampDictionaryReader.cpp | 4 +- components/core/src/clp_s/Utils.cpp | 34 ++++++++++---- components/core/src/clp_s/Utils.hpp | 4 +- components/core/src/clp_s/clp-s.cpp | 5 ++- components/core/src/clp_s/search/kql/Kql.g4 | 2 +- components/core/src/clp_s/search/kql/kql.cpp | 11 ++++- components/core/tests/test-kql.cpp | 45 +++++++++++++++++++ 8 files changed, 95 insertions(+), 17 deletions(-) diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp index 7d4af1469..5336c367a 100644 --- a/components/core/src/clp_s/JsonParser.cpp +++ b/components/core/src/clp_s/JsonParser.cpp @@ -21,7 +21,12 @@ JsonParser::JsonParser(JsonParserOption const& option) } if (false == m_timestamp_key.empty()) { - clp_s::StringUtils::tokenize_column_descriptor(m_timestamp_key, m_timestamp_column); + if (false + == clp_s::StringUtils::tokenize_column_descriptor(m_timestamp_key, m_timestamp_column)) + { + SPDLOG_ERROR("Can not parse invalid timestamp key: \"{}\"", m_timestamp_key); + throw OperationFailed(ErrorCodeBadParam, __FILENAME__, __LINE__); + } } for (auto& file_path : option.file_paths) { diff --git a/components/core/src/clp_s/TimestampDictionaryReader.cpp b/components/core/src/clp_s/TimestampDictionaryReader.cpp index c366e4f59..15685a97e 100644 --- a/components/core/src/clp_s/TimestampDictionaryReader.cpp +++ b/components/core/src/clp_s/TimestampDictionaryReader.cpp @@ -44,7 +44,9 @@ void TimestampDictionaryReader::read_new_entries() { TimestampEntry entry; std::vector tokens; entry.try_read_from_file(m_dictionary_decompressor); - StringUtils::tokenize_column_descriptor(entry.get_key_name(), tokens); + if (false == StringUtils::tokenize_column_descriptor(entry.get_key_name(), tokens)) { + throw OperationFailed(ErrorCodeCorrupt, __FILENAME__, __LINE__); + } m_entries.emplace_back(std::move(entry)); // TODO: Currently, we only allow a single authoritative timestamp column at ingestion time, diff --git a/components/core/src/clp_s/Utils.cpp b/components/core/src/clp_s/Utils.cpp index f429fb4a3..acee48851 100644 --- a/components/core/src/clp_s/Utils.cpp +++ b/components/core/src/clp_s/Utils.cpp @@ -427,18 +427,34 @@ bool StringUtils::convert_string_to_double(std::string const& raw, double& conve return true; } -void StringUtils::tokenize_column_descriptor( +bool StringUtils::tokenize_column_descriptor( std::string const& descriptor, std::vector& tokens ) { - // TODO: handle escaped . correctly - auto start = 0U; - auto end = descriptor.find('.'); - while (end != std::string::npos) { - tokens.push_back(descriptor.substr(start, end - start)); - start = end + 1; - end = descriptor.find('.', start); + // TODO: add support for unicode sequences e.g. \u263A + std::string cur_tok; + for (size_t cur = 0; cur < descriptor.size(); ++cur) { + if ('\\' == descriptor[cur]) { + ++cur; + if (cur >= descriptor.size()) { + return false; + } + } else if ('.' == descriptor[cur]) { + if (cur_tok.empty()) { + return false; + } + tokens.push_back(cur_tok); + cur_tok.clear(); + continue; + } + cur_tok.push_back(descriptor[cur]); } - tokens.push_back(descriptor.substr(start)); + + if (cur_tok.empty()) { + return false; + } + + tokens.push_back(cur_tok); + return true; } } // namespace clp_s diff --git a/components/core/src/clp_s/Utils.hpp b/components/core/src/clp_s/Utils.hpp index de33b7728..d6deb3280 100644 --- a/components/core/src/clp_s/Utils.hpp +++ b/components/core/src/clp_s/Utils.hpp @@ -211,9 +211,9 @@ class StringUtils { * Converts a string column descriptor delimited by '.' into a list of tokens * @param descriptor * @param tokens - * @return the list of tokens pushed into the 'tokens' parameter + * @return true if the descriptor was tokenized successfully, false otherwise */ - static void + [[nodiscard]] static bool tokenize_column_descriptor(std::string const& descriptor, std::vector& tokens); private: diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp index 5f4384a1c..8752384ae 100644 --- a/components/core/src/clp_s/clp-s.cpp +++ b/components/core/src/clp_s/clp-s.cpp @@ -191,7 +191,10 @@ bool search_archive( try { for (auto const& column : command_line_arguments.get_projection_columns()) { std::vector descriptor_tokens; - StringUtils::tokenize_column_descriptor(column, descriptor_tokens); + if (false == StringUtils::tokenize_column_descriptor(column, descriptor_tokens)) { + SPDLOG_ERROR("Can not tokenize invalid column: \"{}\"", column); + return false; + } projection->add_column(ColumnDescriptor::create(descriptor_tokens)); } } catch (clp_s::TraceableException& e) { diff --git a/components/core/src/clp_s/search/kql/Kql.g4 b/components/core/src/clp_s/search/kql/Kql.g4 index 2ddef732c..33abf66bd 100644 --- a/components/core/src/clp_s/search/kql/Kql.g4 +++ b/components/core/src/clp_s/search/kql/Kql.g4 @@ -96,7 +96,7 @@ fragment ESCAPED_SPACE ; fragment SPECIAL_CHARACTER - : [\\():<>"*?{}] + : [\\():<>"*?{}.] ; diff --git a/components/core/src/clp_s/search/kql/kql.cpp b/components/core/src/clp_s/search/kql/kql.cpp index fa560cef5..972e44ad7 100644 --- a/components/core/src/clp_s/search/kql/kql.cpp +++ b/components/core/src/clp_s/search/kql/kql.cpp @@ -112,7 +112,10 @@ class ParseTreeVisitor : public KqlBaseVisitor { std::string column = unquote_string(ctx->LITERAL()->getText()); std::vector descriptor_tokens; - StringUtils::tokenize_column_descriptor(column, descriptor_tokens); + if (false == StringUtils::tokenize_column_descriptor(column, descriptor_tokens)) { + SPDLOG_ERROR("Can not tokenize invalid column: \"{}\"", column); + return nullptr; + } return ColumnDescriptor::create(descriptor_tokens); } @@ -248,6 +251,10 @@ std::shared_ptr parse_kql_expression(std::istream& in) { } ParseTreeVisitor visitor; - return std::any_cast>(visitor.visitStart(tree)); + try { + return std::any_cast>(visitor.visitStart(tree)); + } catch (std::exception& e) { + return {}; + } } } // namespace clp_s::search::kql diff --git a/components/core/tests/test-kql.cpp b/components/core/tests/test-kql.cpp index 6b9eb594f..2646ff5e0 100644 --- a/components/core/tests/test-kql.cpp +++ b/components/core/tests/test-kql.cpp @@ -187,4 +187,49 @@ TEST_CASE("Test parsing KQL", "[KQL]") { auto failure = parse_kql_expression(incorrect_query); REQUIRE(nullptr == failure); } + + SECTION("Escape sequences in column name") { + auto query = GENERATE( + "a\\.b.c: *", + "\"a\\.b.c\": *", + "a\\.b: {c: *}", + "\"a\\.b\": {\"c\": *}" + ); + stringstream escaped_column_query{query}; + auto filter + = std::dynamic_pointer_cast(parse_kql_expression(escaped_column_query)); + REQUIRE(nullptr != filter); + REQUIRE(nullptr != filter->get_operand()); + REQUIRE(nullptr != filter->get_column()); + REQUIRE(false == filter->has_only_expression_operands()); + REQUIRE(false == filter->is_inverted()); + REQUIRE(FilterOperation::EQ == filter->get_operation()); + REQUIRE(2 == filter->get_column()->get_descriptor_list().size()); + auto it = filter->get_column()->descriptor_begin(); + REQUIRE(DescriptorToken{"a.b"} == *it++); + REQUIRE(DescriptorToken{"c"} == *it++); + } + + SECTION("Illegal escape sequences in column name") { + auto query = GENERATE( + //"a\\:*", this case is technically legal since ':' gets escaped + "\"a\\\":*", + "a\\ :*", + "\"a\\\" :*", + "a.:*", + "\"a.\":*", + "a. :*", + "\"a.\" :*" + ); + stringstream illegal_escape{query}; + auto filter = parse_kql_expression(illegal_escape); + REQUIRE(nullptr == filter); + } + + SECTION("Empty token in column name") { + auto query = GENERATE(".a:*", "a.:*", "a..c:*", "a.b.:*"); + stringstream empty_token_column{query}; + auto filter = parse_kql_expression(empty_token_column); + REQUIRE(nullptr == filter); + } }