diff --git a/rust/test/BUILD b/rust/test/BUILD index b13d70df324b..f9bb85cd4262 100644 --- a/rust/test/BUILD +++ b/rust/test/BUILD @@ -1,3 +1,9 @@ +# Protocol Buffers - Google's data interchange format +# Copyright 2023 Google LLC. All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file or at +# https://developers.google.com/open-source/licenses/bsd + load( "//rust:defs.bzl", "rust_cc_proto_library", @@ -15,7 +21,10 @@ UNITTEST_EDITION_TARGET = "//src/google/protobuf:test_protos" rust_upb_proto_library( name = "unittest_upb_rust_proto", testonly = True, - visibility = ["//rust/test/shared:__subpackages__"], + visibility = [ + "//rust/test/shared:__subpackages__", + "//rust/test/upb:__subpackages__", + ], deps = [UNITTEST_PROTO_TARGET], ) @@ -70,7 +79,10 @@ rust_cc_proto_library( rust_upb_proto_library( name = "unittest_edition_upb_rust_proto", testonly = True, - visibility = ["//rust/test/shared:__subpackages__"], + visibility = [ + "//rust/test/shared:__subpackages__", + "//rust/test/upb:__subpackages__", + ], deps = [UNITTEST_EDITION_TARGET], ) @@ -379,6 +391,7 @@ rust_upb_proto_library( testonly = True, visibility = [ "//rust/test/shared:__subpackages__", + "//rust/test/upb:__subpackages__", ], deps = ["//src/google/protobuf:map_unittest_proto"], ) diff --git a/rust/test/upb/BUILD b/rust/test/upb/BUILD index 4777021b3bdb..8cfc78f4c7e3 100644 --- a/rust/test/upb/BUILD +++ b/rust/test/upb/BUILD @@ -1,3 +1,9 @@ +# Protocol Buffers - Google's data interchange format +# Copyright 2023 Google LLC. All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file or at +# https://developers.google.com/open-source/licenses/bsd + # Tests specific to upb kernel. 
# # Only add tests that are cpp kernel specific and it is not possible to make them work for upb ( @@ -13,6 +19,8 @@ load("@rules_rust//rust:defs.bzl", "rust_test") +licenses(["notice"]) + # TODO: Enable this for the cpp kernel and move these tests to shared. rust_test( name = "string_ctypes_test_upb_test", @@ -26,3 +34,17 @@ rust_test( "@crate_index//:googletest", ], ) + +# blaze test //rust/test/upb:debug_string_test --test_arg=--nocapture -c dbg +# --test_output=all to see debug string in test output logs. +rust_test( + name = "debug_string_test", + srcs = ["debug_string_test.rs"], + deps = [ + "//rust:protobuf_upb", + "//rust/test:map_unittest_upb_rust_proto", + "//rust/test:unittest_edition_upb_rust_proto", + "//rust/test:unittest_upb_rust_proto", + "@crate_index//:googletest", + ], +) diff --git a/rust/test/upb/debug_string_test.rs b/rust/test/upb/debug_string_test.rs new file mode 100644 index 000000000000..fa2f429c1c55 --- /dev/null +++ b/rust/test/upb/debug_string_test.rs @@ -0,0 +1,88 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2024 Google LLC. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file or at +// https://developers.google.com/open-source/licenses/bsd + +use googletest::prelude::*; +use map_unittest_rust_proto::TestMapWithMessages; +use protobuf_upb::proto; +use unittest_rust_proto::{ + test_all_types::NestedEnum as NestedEnumProto2, + test_all_types::NestedMessage as NestedMessageProto2, TestAllTypes as TestAllTypesProto2, +}; + +#[test] +fn test_debug_string() { + let mut msg = proto!(TestAllTypesProto2 { + optional_int32: 42, + optional_string: "Hello World", + optional_nested_enum: NestedEnumProto2::Bar, + oneof_uint32: 452235, + optional_nested_message: proto!(NestedMessageProto2 { bb: 100 }), + }); + let mut repeated_string = msg.repeated_string_mut(); + repeated_string.push("Hello World"); + repeated_string.push("Hello World"); + repeated_string.push("Hello World"); + + let mut msg_map = TestMapWithMessages::new(); + println!("EMPTY MSG: {:?}", msg_map); // Make sure that we can print an empty message. 
+ msg_map.map_string_all_types_mut().insert("hello", msg.as_view()); + msg_map.map_string_all_types_mut().insert("fizz", msg.as_view()); + msg_map.map_string_all_types_mut().insert("boo", msg.as_view()); + + println!("{:?}", msg_map); + println!("{:?}", msg_map.as_view()); // Make sure that we can print as_view + println!("{:?}", msg_map.as_mut()); // Make sure that we can print as_mut + let golden = r#"12 { + key: "hello" + value { + 1: 42 + 14: "Hello World" + 18 { + 1: 100 + } + 21: 2 + 44: "Hello World" + 44: "Hello World" + 44: "Hello World" + 111: 452235 + } +} +12 { + key: "fizz" + value { + 1: 42 + 14: "Hello World" + 18 { + 1: 100 + } + 21: 2 + 44: "Hello World" + 44: "Hello World" + 44: "Hello World" + 111: 452235 + } +} +12 { + key: "boo" + value { + 1: 42 + 14: "Hello World" + 18 { + 1: 100 + } + 21: 2 + 44: "Hello World" + 44: "Hello World" + 44: "Hello World" + 111: 452235 + } +} +"#; + // `debug_string` drops the NUL terminator that upb writes, so the Debug output + // must match the golden text exactly. + assert_that!(format!("{:?}", msg_map), eq(golden)); +} diff --git a/rust/upb/BUILD b/rust/upb/BUILD index 83322121d167..8fa34f623756 100644 --- a/rust/upb/BUILD +++ b/rust/upb/BUILD @@ -25,6 +25,7 @@ rust_library( "opaque_pointee.rs", "owned_arena_box.rs", "string_view.rs", + "text.rs", "wire.rs", ], visibility = [ @@ -48,5 +49,6 @@ cc_library( "//upb:message_compare", "//upb:message_copy", "//upb/mini_table", + "//upb/text:debug", ], ) diff --git a/rust/upb/lib.rs b/rust/upb/lib.rs index 4e0acb90d7bf..bbfc106b6b9c 100644 --- a/rust/upb/lib.rs +++ b/rust/upb/lib.rs @@ -50,5 +50,8 @@ pub use owned_arena_box::OwnedArenaBox; mod string_view; pub use string_view::StringView; +mod text; +pub use text::debug_string; + pub mod wire; pub use wire::{upb_Decode, DecodeStatus, EncodeStatus}; diff --git a/rust/upb/text.rs b/rust/upb/text.rs new file mode 100644 index 000000000000..2e6f84adaf7d --- /dev/null +++ b/rust/upb/text.rs @@ -0,0 +1,71 
@@ +// Protocol Buffers - Google's data interchange format +// Copyright 2024 Google LLC. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file or at +// https://developers.google.com/open-source/licenses/bsd + +use crate::{upb_MiniTable, RawMessage}; + +extern "C" { + /// Returns the minimum needed length (excluding NULL) that `buf` has to be + /// to hold the `msg`s debug string. + /// + /// SAFETY: + /// - `msg` is pointing at a valid upb_Message with associated minitable + /// `mt` + /// - `buf` is legally writable for `size` bytes (`buf` may be nullptr if + /// `size` is 0) + fn upb_DebugString( + msg: RawMessage, + mt: *const upb_MiniTable, + options: i32, + buf: *mut u8, + size: usize, + ) -> usize; +} + +#[allow(dead_code)] +#[repr(i32)] +enum Options { + // When set, prints everything on a single line. + SingleLine = 1, + + // When set, unknown fields are not printed. + SkipUnknown = 2, + + // When set, maps are *not* sorted (this avoids allocating tmp mem). + NoSortMaps = 4, +} + +/// Returns a string of field number to value entries of a message. +/// +/// The returned string contains only the text produced by upb; the NUL +/// terminator that `upb_DebugString` writes at the end of the buffer is +/// dropped, because a Rust `String` carries its own length. +/// +/// # Safety +/// - `mt` must correspond to the `msg`s minitable. +pub unsafe fn debug_string(msg: RawMessage, mt: *const upb_MiniTable) -> String { + // Only find out the length first to then allocate a buffer of the minimum size + // needed. 
+ // SAFETY: + // - `msg` is a legally dereferencable upb_Message whose associated minitable is + // `mt` + // - `buf` is nullptr and `buf_len` is 0 + let len = + unsafe { upb_DebugString(msg, mt, Options::NoSortMaps as i32, std::ptr::null_mut(), 0) }; + assert!(len < isize::MAX as usize); + // +1 for the trailing NULL + let mut buf = vec![0u8; len + 1]; + // SAFETY: + // - `msg` is a legally dereferencable upb_Message whose associated minitable is + // `mt` + // - `buf` is legally writable for 'buf_len' bytes + let written_len = unsafe { + upb_DebugString(msg, mt, Options::NoSortMaps as i32, buf.as_mut_ptr(), buf.len()) + }; + assert_eq!(len, written_len); + // `buf[len]` holds the NUL terminator written by upb; exclude it from the result. + String::from_utf8_lossy(&buf[..len]).into_owned() +} diff --git a/rust/upb/upb_api.c b/rust/upb/upb_api.c index 952021a3d79a..16cc9e081c40 100644 --- a/rust/upb/upb_api.c +++ b/rust/upb/upb_api.c @@ -19,6 +19,7 @@ #include "upb/message/map.h" // IWYU pragma: keep #include "upb/message/merge.h" // IWYU pragma: keep #include "upb/mini_table/message.h" // IWYU pragma: keep +#include "upb/text/debug_string.h" // IWYU pragma: keep // go/keep-sorted end const size_t __rust_proto_kUpb_Map_Begin = kUpb_Map_Begin; diff --git a/src/google/protobuf/compiler/rust/message.cc b/src/google/protobuf/compiler/rust/message.cc index 9eac1897cc86..23875cbe9f7e 100644 --- a/src/google/protobuf/compiler/rust/message.cc +++ b/src/google/protobuf/compiler/rust/message.cc @@ -156,11 +156,16 @@ void MessageDebug(Context& ctx, const Descriptor& msg) { return; case Kernel::kUpb: - ctx.Emit({}, + ctx.Emit({{"minitable", UpbMinitableName(msg)}}, R"rs( - f.debug_struct(std::any::type_name::<Self>()) - .field("raw_msg", &self.raw_msg()) - .finish() + let mini_table = unsafe { $std$::ptr::addr_of!($minitable$) }; + let string = unsafe { + $pbr$::debug_string( + self.raw_msg(), + mini_table, + ) + }; + write!(f, "{}", string) )rs"); return; } diff --git a/upb/BUILD b/upb/BUILD index fc2d1b372e6a..42797dc2ecb0 100644 --- a/upb/BUILD +++ b/upb/BUILD 
@@ -281,6 +281,7 @@ upb_amalgamation( "//upb/lex:lex", "//upb/mem:internal", "//upb/message:internal", + "//upb/message:iterator", "//upb/message:types", "//upb/mini_descriptor:internal", "//upb/mini_table:internal", @@ -327,6 +328,7 @@ upb_amalgamation( "//upb/lex:lex", "//upb/mem:internal", "//upb/message:internal", + "//upb/message:iterator", "//upb/message:types", "//upb/mini_descriptor:internal", "//upb/mini_table:internal", @@ -374,6 +376,7 @@ upb_amalgamation( "//upb/lex:lex", "//upb/mem:internal", "//upb/message:internal", + "//upb/message:iterator", "//upb/message:types", "//upb/mini_descriptor:internal", "//upb/mini_table:internal", diff --git a/upb/message/BUILD b/upb/message/BUILD index 1e2fe17ea358..0e8b5883c9d6 100644 --- a/upb/message/BUILD +++ b/upb/message/BUILD @@ -85,6 +85,24 @@ cc_library( ], ) +cc_library( + name = "iterator", + srcs = [ + "internal/iterator.c", + ], + hdrs = [ + "internal/iterator.h", + ], + copts = UPB_DEFAULT_COPTS, + visibility = ["//visibility:public"], + deps = [ + ":internal", + ":message", + "//upb:mini_table", + "//upb:port", + ], +) + cc_library( name = "compare", srcs = [ @@ -97,6 +115,7 @@ cc_library( visibility = ["//visibility:public"], deps = [ ":internal", + ":iterator", ":message", "//upb:base", "//upb:mini_table", diff --git a/upb/message/compare.c b/upb/message/compare.c index d2c88b6595cb..70ad026bc49d 100644 --- a/upb/message/compare.c +++ b/upb/message/compare.c @@ -15,6 +15,7 @@ #include "upb/message/internal/accessors.h" #include "upb/message/internal/compare_unknown.h" #include "upb/message/internal/extension.h" +#include "upb/message/internal/iterator.h" #include "upb/message/map.h" #include "upb/message/message.h" #include "upb/mini_table/extension.h" @@ -25,74 +26,18 @@ // Must be last. 
#include "upb/port/def.inc" -#define kUpb_BaseField_Begin ((size_t)-1) -#define kUpb_Extension_Begin ((size_t)-1) #ifdef __cplusplus extern "C" { #endif -static bool _upb_Message_NextBaseField(const upb_Message* msg, - const upb_MiniTable* m, - const upb_MiniTableField** out_f, - upb_MessageValue* out_v, size_t* iter) { - const size_t count = upb_MiniTable_FieldCount(m); - size_t i = *iter; - - while (++i < count) { - const upb_MiniTableField* f = upb_MiniTable_GetFieldByIndex(m, i); - const void* src = UPB_PRIVATE(_upb_Message_DataPtr)(msg, f); - - upb_MessageValue val; - UPB_PRIVATE(_upb_MiniTableField_DataCopy)(f, &val, src); - - // Skip field if unset or empty. - if (upb_MiniTableField_HasPresence(f)) { - if (!upb_Message_HasBaseField(msg, f)) continue; - } else { - if (UPB_PRIVATE(_upb_MiniTableField_DataIsZero)(f, src)) continue; - - if (upb_MiniTableField_IsArray(f)) { - if (upb_Array_Size(val.array_val) == 0) continue; - } else if (upb_MiniTableField_IsMap(f)) { - if (upb_Map_Size(val.map_val) == 0) continue; - } - } - - *out_f = f; - *out_v = val; - *iter = i; - return true; - } - - return false; -} - -static bool _upb_Message_NextExtension(const upb_Message* msg, - const upb_MiniTable* m, - const upb_MiniTableExtension** out_e, - upb_MessageValue* out_v, size_t* iter) { - size_t count; - const upb_Extension* exts = UPB_PRIVATE(_upb_Message_Getexts)(msg, &count); - size_t i = *iter; - - if (++i < count) { - *out_e = exts[i].ext; - *out_v = exts[i].data; - *iter = i; - return true; - } - - return false; -} - bool upb_Message_IsEmpty(const upb_Message* msg, const upb_MiniTable* m) { if (upb_Message_ExtensionCount(msg)) return false; const upb_MiniTableField* f; upb_MessageValue v; size_t iter = kUpb_BaseField_Begin; - return !_upb_Message_NextBaseField(msg, m, &f, &v, &iter); + return !UPB_PRIVATE(_upb_Message_NextBaseField)(msg, m, &f, &v, &iter); } static bool _upb_Array_IsEqual(const upb_Array* arr1, const upb_Array* arr2, @@ -154,8 +99,10 @@ static bool 
_upb_Message_BaseFieldsAreEqual(const upb_Message* msg1, const upb_MiniTableField *f1, *f2; upb_MessageValue val1, val2; - const bool got1 = _upb_Message_NextBaseField(msg1, m, &f1, &val1, &iter1); - const bool got2 = _upb_Message_NextBaseField(msg2, m, &f2, &val2, &iter2); + const bool got1 = + UPB_PRIVATE(_upb_Message_NextBaseField)(msg1, m, &f1, &val1, &iter1); + const bool got2 = + UPB_PRIVATE(_upb_Message_NextBaseField)(msg2, m, &f2, &val2, &iter2); if (got1 != got2) return false; // Must have identical field counts. if (!got1) return true; // Loop termination condition. @@ -195,7 +142,7 @@ static bool _upb_Message_ExtensionsAreEqual(const upb_Message* msg1, // Iterate over all extensions for msg1, and search msg2 for each extension. size_t iter1 = kUpb_Extension_Begin; - while (_upb_Message_NextExtension(msg1, m, &e, &val1, &iter1)) { + while (UPB_PRIVATE(_upb_Message_NextExtension)(msg1, m, &e, &val1, &iter1)) { const upb_Extension* ext2 = UPB_PRIVATE(_upb_Message_Getext)(msg2, e); if (!ext2) return false; diff --git a/upb/message/internal/iterator.c b/upb/message/internal/iterator.c new file mode 100644 index 000000000000..82f2b378b91a --- /dev/null +++ b/upb/message/internal/iterator.c @@ -0,0 +1,78 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2024 Google LLC. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file or at +// https://developers.google.com/open-source/licenses/bsd + +#include "upb/message/internal/iterator.h" // IWYU pragma: keep + +#include + +#include "upb/message/accessors.h" +#include "upb/message/array.h" +#include "upb/message/internal/accessors.h" +#include "upb/message/internal/extension.h" +#include "upb/message/map.h" +#include "upb/message/message.h" +#include "upb/mini_table/extension.h" +#include "upb/mini_table/field.h" +#include "upb/mini_table/message.h" + +// Must be last. 
+#include "upb/port/def.inc" + +bool UPB_PRIVATE(_upb_Message_NextBaseField)(const upb_Message* msg, + const upb_MiniTable* m, + const upb_MiniTableField** out_f, + upb_MessageValue* out_v, + size_t* iter) { + const size_t count = upb_MiniTable_FieldCount(m); + size_t i = *iter; + + while (++i < count) { + const upb_MiniTableField* f = upb_MiniTable_GetFieldByIndex(m, i); + const void* src = UPB_PRIVATE(_upb_Message_DataPtr)(msg, f); + + upb_MessageValue val; + UPB_PRIVATE(_upb_MiniTableField_DataCopy)(f, &val, src); + + // Skip field if unset or empty. + if (upb_MiniTableField_HasPresence(f)) { + if (!upb_Message_HasBaseField(msg, f)) continue; + } else { + if (UPB_PRIVATE(_upb_MiniTableField_DataIsZero)(f, src)) continue; + + if (upb_MiniTableField_IsArray(f)) { + if (upb_Array_Size(val.array_val) == 0) continue; + } else if (upb_MiniTableField_IsMap(f)) { + if (upb_Map_Size(val.map_val) == 0) continue; + } + } + + *out_f = f; + *out_v = val; + *iter = i; + return true; + } + + return false; +} + +bool UPB_PRIVATE(_upb_Message_NextExtension)( + const upb_Message* msg, const upb_MiniTable* m, + const upb_MiniTableExtension** out_e, upb_MessageValue* out_v, + size_t* iter) { + size_t count; + const upb_Extension* exts = UPB_PRIVATE(_upb_Message_Getexts)(msg, &count); + size_t i = *iter; + + if (++i < count) { + *out_e = exts[i].ext; + *out_v = exts[i].data; + *iter = i; + return true; + } + + return false; +} \ No newline at end of file diff --git a/upb/message/internal/iterator.h b/upb/message/internal/iterator.h new file mode 100644 index 000000000000..ad080db70da9 --- /dev/null +++ b/upb/message/internal/iterator.h @@ -0,0 +1,35 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2024 Google LLC. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file or at +// https://developers.google.com/open-source/licenses/bsd + +#ifndef THIRD_PARTY_UPB_UPB_MESSAGE_INTERNAL_ITERATOR_H_ +#define THIRD_PARTY_UPB_UPB_MESSAGE_INTERNAL_ITERATOR_H_ + +#include + +#include "upb/message/message.h" +#include "upb/message/value.h" +#include "upb/mini_table/extension.h" +#include "upb/mini_table/field.h" +#include "upb/mini_table/message.h" + +// Must be last. +#include "upb/port/def.inc" + +#define kUpb_BaseField_Begin ((size_t)-1) +#define kUpb_Extension_Begin ((size_t)-1) + +bool UPB_PRIVATE(_upb_Message_NextBaseField)(const upb_Message* msg, + const upb_MiniTable* m, + const upb_MiniTableField** out_f, + upb_MessageValue* out_v, + size_t* iter); + +bool UPB_PRIVATE(_upb_Message_NextExtension)( + const upb_Message* msg, const upb_MiniTable* m, + const upb_MiniTableExtension** out_e, upb_MessageValue* out_v, + size_t* iter); +#endif // THIRD_PARTY_UPB_UPB_MESSAGE_INTERNAL_ITERATOR_H_ diff --git a/upb/text/BUILD b/upb/text/BUILD index 0f96b2db7dff..8b79887ef3b2 100644 --- a/upb/text/BUILD +++ b/upb/text/BUILD @@ -14,23 +14,95 @@ cc_library( ], hdrs = [ "encode.h", + "options.h", ], copts = UPB_DEFAULT_COPTS, visibility = ["//visibility:public"], deps = [ - "//third_party/utf8_range", + ":internal", "//upb:base", "//upb:eps_copy_input_stream", "//upb:message", "//upb:port", "//upb:reflection", - "//upb:wire_reader", "//upb/lex", "//upb/message:internal", "//upb/message:types", ], ) +cc_library( + name = "debug", + srcs = [ + "debug_string.c", + ], + hdrs = [ + "debug_string.h", + "options.h", + ], + copts = UPB_DEFAULT_COPTS, + visibility = ["//visibility:public"], + deps = [ + ":internal", + "//upb:base", + "//upb:eps_copy_input_stream", + "//upb:message", + "//upb:mini_table", + "//upb:port", + "//upb/lex", + "//upb/message:internal", + "//upb/message:iterator", + "//upb/message:types", + "//upb/mini_table:internal", + ], +) 
+ +cc_library( + name = "internal", + srcs = [ + "internal/encode.c", + ], + hdrs = [ + "internal/encode.h", + "options.h", + ], + copts = UPB_DEFAULT_COPTS, + visibility = ["//visibility:public"], + deps = [ + "//third_party/utf8_range", + "//upb:base", + "//upb:eps_copy_input_stream", + "//upb:message", + "//upb:port", + "//upb:wire_reader", + "//upb/lex", + "//upb/message:internal", + ], +) + +cc_test( + name = "encode_debug_test", + srcs = [ + "encode_debug_test.cc", + ], + deps = [ + ":debug", + "//upb:base", + "//upb:eps_copy_input_stream", + "//upb:mem", + "//upb:message", + "//upb:mini_table", + "//upb:port", + "//upb:wire_reader", + "//upb/message:internal", + "//upb/test:test_proto_upb_minitable", + "//upb/test:test_upb_proto", + "@com_google_absl//absl/log:absl_log", + "@com_google_googletest//:gtest", + "@com_google_googletest//:gtest_main", + ], +) + # begin:github_only filegroup( name = "source_files", diff --git a/upb/text/debug_string.c b/upb/text/debug_string.c new file mode 100644 index 000000000000..731cd1cfe0d8 --- /dev/null +++ b/upb/text/debug_string.c @@ -0,0 +1,235 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2024 Google LLC. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file or at +// https://developers.google.com/open-source/licenses/bsd + +#include "upb/text/debug_string.h" + +#include +#include +#include +#include +#include + +#include "upb/base/descriptor_constants.h" +#include "upb/message/array.h" +#include "upb/message/internal/iterator.h" +#include "upb/message/internal/map_entry.h" +#include "upb/message/internal/map_sorter.h" +#include "upb/message/map.h" +#include "upb/message/message.h" +#include "upb/message/value.h" +#include "upb/mini_table/extension.h" +#include "upb/mini_table/field.h" +#include "upb/mini_table/internal/field.h" +#include "upb/mini_table/internal/message.h" +#include "upb/mini_table/message.h" +#include "upb/text/internal/encode.h" +#include "upb/wire/eps_copy_input_stream.h" + +// Must be last. +#include "upb/port/def.inc" + +static void _upb_MessageDebugString(txtenc* e, const upb_Message* msg, + const upb_MiniTable* mt); + +static void _upb_FieldDebugString(txtenc* e, upb_MessageValue val, + const upb_MiniTableField* f, + const upb_MiniTable* mt, const char* label, + const upb_MiniTableExtension* ext) { + UPB_PRIVATE(_upb_TextEncode_Indent)(e); + const upb_CType ctype = upb_MiniTableField_CType(f); + const bool is_ext = upb_MiniTableField_IsExtension(f); + char number[10]; // A 32-bit integer can hold up to 10 digits. + snprintf(number, sizeof(number), "%" PRIu32, upb_MiniTableField_Number(f)); + // label is to pass down whether we're dealing with a "key" of a map or + // a "value" of a map. + if (!label) label = number; + + if (is_ext) { + UPB_PRIVATE(_upb_TextEncode_Printf)(e, "[%s]", label); + } else { + UPB_PRIVATE(_upb_TextEncode_Printf)(e, "%s", label); + } + + if (ctype == kUpb_CType_Message) { + UPB_PRIVATE(_upb_TextEncode_Printf)(e, " {"); + UPB_PRIVATE(_upb_TextEncode_EndField)(e); + e->indent_depth++; + const upb_MiniTable* subm = ext ? 
upb_MiniTableExtension_GetSubMessage(ext) + : upb_MiniTable_SubMessage(mt, f); + _upb_MessageDebugString(e, val.msg_val, subm); + e->indent_depth--; + UPB_PRIVATE(_upb_TextEncode_Indent)(e); + UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "}"); + UPB_PRIVATE(_upb_TextEncode_EndField)(e); + return; + } + + UPB_PRIVATE(_upb_TextEncode_Printf)(e, ": "); + + if (ctype == + kUpb_CType_Enum) { // Enum has to be processed separately because of + // divergent behavior between encoders + UPB_PRIVATE(_upb_TextEncode_Printf)(e, "%" PRId32, val.int32_val); + } else { + UPB_PRIVATE(_upb_TextEncode_Scalar)(e, val, ctype); + } + + UPB_PRIVATE(_upb_TextEncode_EndField)(e); +} + +/* + * Arrays print as simple repeated elements, eg. + * + * 5: 1 + * 5: 2 + * 5: 3 + */ +static void _upb_ArrayDebugString(txtenc* e, const upb_Array* arr, + const upb_MiniTableField* f, + const upb_MiniTable* mt, + const upb_MiniTableExtension* ext) { + for (size_t i = 0, n = upb_Array_Size(arr); i < n; i++) { + _upb_FieldDebugString(e, upb_Array_Get(arr, i), f, mt, NULL, ext); + } +} + +static void _upb_MapEntryDebugString(txtenc* e, upb_MessageValue key, + upb_MessageValue val, + const upb_MiniTableField* f, + const upb_MiniTable* mt) { + const upb_MiniTable* entry = upb_MiniTable_SubMessage(mt, f); + const upb_MiniTableField* key_f = upb_MiniTable_MapKey(entry); + const upb_MiniTableField* val_f = upb_MiniTable_MapValue(entry); + + UPB_PRIVATE(_upb_TextEncode_Indent)(e); + UPB_PRIVATE(_upb_TextEncode_Printf)(e, "%u {", upb_MiniTableField_Number(f)); + UPB_PRIVATE(_upb_TextEncode_EndField)(e); + e->indent_depth++; + + _upb_FieldDebugString(e, key, key_f, entry, "key", NULL); + _upb_FieldDebugString(e, val, val_f, entry, "value", NULL); + + e->indent_depth--; + UPB_PRIVATE(_upb_TextEncode_Indent)(e); + UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "}"); + UPB_PRIVATE(_upb_TextEncode_EndField)(e); +} + +/* + * Maps print as messages of key/value, etc. 
+ * + * 1 { + * key: "abc" + * value: 123 + * } + * 2 { + * key: "def" + * value: 456 + * } + */ +static void _upb_MapDebugString(txtenc* e, const upb_Map* map, + const upb_MiniTableField* f, + const upb_MiniTable* mt) { + if (e->options & UPB_TXTENC_NOSORT) { + size_t iter = kUpb_Map_Begin; + upb_MessageValue key, val; + while (upb_Map_Next(map, &key, &val, &iter)) { + _upb_MapEntryDebugString(e, key, val, f, mt); + } + } else { + if (upb_Map_Size(map) == 0) return; + + const upb_MiniTable* entry = upb_MiniTable_SubMessage(mt, f); + const upb_MiniTableField* key_f = upb_MiniTable_GetFieldByIndex(entry, 0); + _upb_sortedmap sorted; + upb_MapEntry ent; + + _upb_mapsorter_pushmap(&e->sorter, upb_MiniTableField_Type(key_f), map, + &sorted); + while (_upb_sortedmap_next(&e->sorter, map, &sorted, &ent)) { + upb_MessageValue key, val; + memcpy(&key, &ent.k, sizeof(key)); + memcpy(&val, &ent.v, sizeof(val)); + _upb_MapEntryDebugString(e, key, val, f, mt); + } + _upb_mapsorter_popmap(&e->sorter, &sorted); + } +} + +static void _upb_MessageDebugString(txtenc* e, const upb_Message* msg, + const upb_MiniTable* mt) { + size_t iter = kUpb_BaseField_Begin; + const upb_MiniTableField* f; + upb_MessageValue val; + + // Base fields will be printed out first, followed by extension fields, and + // finally unknown fields. 
+ + while (UPB_PRIVATE(_upb_Message_NextBaseField)(msg, mt, &f, &val, &iter)) { + if (upb_MiniTableField_IsMap(f)) { + _upb_MapDebugString(e, val.map_val, f, mt); + } else if (upb_MiniTableField_IsArray(f)) { + // ext set to NULL as we're not dealing with extensions yet + _upb_ArrayDebugString(e, val.array_val, f, mt, NULL); + } else { + // ext set to NULL as we're not dealing with extensions yet + // label set to NULL as we're not currently working with a MapEntry + _upb_FieldDebugString(e, val, f, mt, NULL, NULL); + } + } + + const upb_MiniTableExtension* ext; + upb_MessageValue val_ext; + iter = kUpb_Extension_Begin; + while ( + UPB_PRIVATE(_upb_Message_NextExtension)(msg, mt, &ext, &val_ext, &iter)) { + const upb_MiniTableField* f = &ext->UPB_PRIVATE(field); + // It is not sufficient to only pass |f| as we lose valuable information + // about sub-messages. It is required that we pass |ext|. + if (upb_MiniTableField_IsMap(f)) { + UPB_UNREACHABLE(); // Maps cannot be extensions. + break; + } else if (upb_MiniTableField_IsArray(f)) { + _upb_ArrayDebugString(e, val_ext.array_val, f, mt, ext); + } else { + // label set to NULL as we're not currently working with a MapEntry + _upb_FieldDebugString(e, val_ext, f, mt, NULL, ext); + } + } + + if ((e->options & UPB_TXTENC_SKIPUNKNOWN) == 0) { + size_t size; + const char* ptr = upb_Message_GetUnknown(msg, &size); + if (size != 0) { + char* start = e->ptr; + upb_EpsCopyInputStream stream; + upb_EpsCopyInputStream_Init(&stream, &ptr, size, true); + if (!UPB_PRIVATE(_upb_TextEncode_Unknown)(e, ptr, &stream, -1)) { + /* Unknown failed to parse, back up and don't print it at all. 
*/ + e->ptr = start; + } + } + } +} + +size_t upb_DebugString(const upb_Message* msg, const upb_MiniTable* mt, + int options, char* buf, size_t size) { + txtenc e; + + e.buf = buf; + e.ptr = buf; + e.end = UPB_PTRADD(buf, size); + e.overflow = 0; + e.indent_depth = 0; + e.options = options; + e.ext_pool = NULL; + _upb_mapsorter_init(&e.sorter); + + _upb_MessageDebugString(&e, msg, mt); + _upb_mapsorter_destroy(&e.sorter); + return UPB_PRIVATE(_upb_TextEncode_Nullz)(&e, size); +} diff --git a/upb/text/debug_string.h b/upb/text/debug_string.h new file mode 100644 index 000000000000..cef8ad5231df --- /dev/null +++ b/upb/text/debug_string.h @@ -0,0 +1,42 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2024 Google LLC. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file or at +// https://developers.google.com/open-source/licenses/bsd + +#ifndef UPB_TEXT_ENCODE_DEBUG_H_ +#define UPB_TEXT_ENCODE_DEBUG_H_ + +#include <stddef.h> + +#include "upb/message/message.h" +#include "upb/mini_table/message.h" +#include "upb/text/options.h" // IWYU pragma: export + +// Must be last. +#include "upb/port/def.inc" + +#ifdef __cplusplus +extern "C" { +#endif + +/* Encodes the given |msg| to a pseudo-text format: Instead of printing field + * name to value entries, it will print field number to value entries; much like + * how unknown fields are printed in upb_TextEncode in this directory's + * encode.h. |mt| should correspond to the |msg|'s minitable. + * + * Output is placed in the given buffer, and always NULL-terminated. The output + * size (excluding NULL) is returned. This means that a return value >= |size| + * implies that the output was truncated. (These are the same semantics as + * snprintf()). 
*/ +UPB_API size_t upb_DebugString(const upb_Message* msg, const upb_MiniTable* mt, + int options, char* buf, size_t size); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#include "upb/port/undef.inc" + +#endif /* UPB_TEXT_ENCODE_DEBUG_H_ */ diff --git a/upb/text/encode.c b/upb/text/encode.c index 055d005f5805..9e3a5356c522 100644 --- a/upb/text/encode.c +++ b/upb/text/encode.c @@ -7,10 +7,9 @@ #include "upb/text/encode.h" -#include -#include #include #include +#include #include #include @@ -23,213 +22,32 @@ #include "upb/message/map.h" #include "upb/message/message.h" #include "upb/message/value.h" -#include "upb/port/vsnprintf_compat.h" #include "upb/reflection/def.h" #include "upb/reflection/message.h" +#include "upb/text/internal/encode.h" #include "upb/wire/eps_copy_input_stream.h" -#include "upb/wire/reader.h" -#include "upb/wire/types.h" -#include "utf8_range.h" // Must be last. #include "upb/port/def.inc" -typedef struct { - char *buf, *ptr, *end; - size_t overflow; - int indent_depth; - int options; - const upb_DefPool* ext_pool; - _upb_mapsorter sorter; -} txtenc; +static void _upb_TextEncode_Msg(txtenc* e, const upb_Message* msg, + const upb_MessageDef* m); -static void txtenc_msg(txtenc* e, const upb_Message* msg, - const upb_MessageDef* m); - -static void txtenc_putbytes(txtenc* e, const void* data, size_t len) { - size_t have = e->end - e->ptr; - if (UPB_LIKELY(have >= len)) { - memcpy(e->ptr, data, len); - e->ptr += len; - } else { - if (have) { - memcpy(e->ptr, data, have); - e->ptr += have; - } - e->overflow += (len - have); - } -} - -static void txtenc_putstr(txtenc* e, const char* str) { - txtenc_putbytes(e, str, strlen(str)); -} - -static void txtenc_printf(txtenc* e, const char* fmt, ...) 
{ - size_t n; - size_t have = e->end - e->ptr; - va_list args; - - va_start(args, fmt); - n = _upb_vsnprintf(e->ptr, have, fmt, args); - va_end(args); - - if (UPB_LIKELY(have > n)) { - e->ptr += n; - } else { - e->ptr = UPB_PTRADD(e->ptr, have); - e->overflow += (n - have); - } -} - -static void txtenc_indent(txtenc* e) { - if ((e->options & UPB_TXTENC_SINGLELINE) == 0) { - int i = e->indent_depth; - while (i-- > 0) { - txtenc_putstr(e, " "); - } - } -} - -static void txtenc_endfield(txtenc* e) { - if (e->options & UPB_TXTENC_SINGLELINE) { - txtenc_putstr(e, " "); - } else { - txtenc_putstr(e, "\n"); - } -} - -static void txtenc_enum(int32_t val, const upb_FieldDef* f, txtenc* e) { +static void _upb_TextEncode_Enum(int32_t val, const upb_FieldDef* f, + txtenc* e) { const upb_EnumDef* e_def = upb_FieldDef_EnumSubDef(f); const upb_EnumValueDef* ev = upb_EnumDef_FindValueByNumber(e_def, val); if (ev) { - txtenc_printf(e, "%s", upb_EnumValueDef_Name(ev)); + UPB_PRIVATE(_upb_TextEncode_Printf)(e, "%s", upb_EnumValueDef_Name(ev)); } else { - txtenc_printf(e, "%" PRId32, val); - } -} - -static void txtenc_escaped(txtenc* e, unsigned char ch) { - switch (ch) { - case '\n': - txtenc_putstr(e, "\\n"); - break; - case '\r': - txtenc_putstr(e, "\\r"); - break; - case '\t': - txtenc_putstr(e, "\\t"); - break; - case '\"': - txtenc_putstr(e, "\\\""); - break; - case '\'': - txtenc_putstr(e, "\\'"); - break; - case '\\': - txtenc_putstr(e, "\\\\"); - break; - default: - txtenc_printf(e, "\\%03o", ch); - break; - } -} - -// Returns true if `ch` needs to be escaped in TextFormat, independent of any -// UTF-8 validity issues. -static bool upb_DefinitelyNeedsEscape(unsigned char ch) { - if (ch < 32) return true; - switch (ch) { - case '\"': - case '\'': - case '\\': - case 127: - return true; - } - return false; -} - -static bool upb_AsciiIsPrint(unsigned char ch) { return ch >= 32 && ch < 127; } - -// Returns true if this is a high byte that requires UTF-8 validation. 
If the -// UTF-8 validation fails, we must escape the byte. -static bool upb_NeedsUtf8Validation(unsigned char ch) { return ch > 127; } - -// Returns the number of bytes in the prefix of `val` that do not need escaping. -// This is like utf8_range::SpanStructurallyValid(), except that it also -// terminates at any ASCII char that needs to be escaped in TextFormat (any char -// that has `DefinitelyNeedsEscape(ch) == true`). -// -// If we could get a variant of utf8_range::SpanStructurallyValid() that could -// terminate on any of these chars, that might be more efficient, but it would -// be much more complicated to modify that heavily SIMD code. -static size_t SkipPassthroughBytes(const char* ptr, size_t size) { - for (size_t i = 0; i < size; i++) { - unsigned char uc = ptr[i]; - if (upb_DefinitelyNeedsEscape(uc)) return i; - if (upb_NeedsUtf8Validation(uc)) { - // Find the end of this region of consecutive high bytes, so that we only - // give high bytes to the UTF-8 checker. This avoids needing to perform - // a second scan of the ASCII characters looking for characters that - // need escaping. - // - // We assume that high bytes are less frequent than plain, printable ASCII - // bytes, so we accept the double-scan of high bytes. - size_t end = i + 1; - for (; end < size; end++) { - if (!upb_NeedsUtf8Validation(ptr[end])) break; - } - size_t n = end - i; - size_t ok = utf8_range_ValidPrefix(ptr + i, n); - if (ok != n) return i + ok; - i += ok - 1; - } - } - return size; -} - -static void upb_HardenedPrintString(txtenc* e, const char* ptr, size_t len) { - // Print as UTF-8, while guarding against any invalid UTF-8 in the string - // field. - // - // If in the future we have a guaranteed invariant that invalid UTF-8 will - // never be present, we could avoid the UTF-8 check here. 
- txtenc_putstr(e, "\""); - const char* end = ptr + len; - while (ptr < end) { - size_t n = SkipPassthroughBytes(ptr, end - ptr); - if (n != 0) { - txtenc_putbytes(e, ptr, n); - ptr += n; - if (ptr == end) break; - } - - // If repeated calls to CEscape() and PrintString() are expensive, we could - // consider batching them, at the cost of some complexity. - txtenc_escaped(e, *ptr); - ptr++; - } - txtenc_putstr(e, "\""); -} - -static void txtenc_bytes(txtenc* e, upb_StringView data) { - const char* ptr = data.data; - const char* end = ptr + data.size; - txtenc_putstr(e, "\""); - for (; ptr < end; ptr++) { - unsigned char uc = *ptr; - if (upb_AsciiIsPrint(uc)) { - txtenc_putbytes(e, ptr, 1); - } else { - txtenc_escaped(e, uc); - } + UPB_PRIVATE(_upb_TextEncode_Printf)(e, "%" PRId32, val); } - txtenc_putstr(e, "\""); } -static void txtenc_field(txtenc* e, upb_MessageValue val, - const upb_FieldDef* f) { - txtenc_indent(e); +static void _upb_TextEncode_Field(txtenc* e, upb_MessageValue val, + const upb_FieldDef* f) { + UPB_PRIVATE(_upb_TextEncode_Indent)(e); const upb_CType ctype = upb_FieldDef_CType(f); const bool is_ext = upb_FieldDef_IsExtension(f); const char* full = upb_FieldDef_FullName(f); @@ -244,68 +62,33 @@ static void txtenc_field(txtenc* e, upb_MessageValue val, // } // end:google_only if (is_ext) { - txtenc_printf(e, "[%s] {", full); + UPB_PRIVATE(_upb_TextEncode_Printf)(e, "[%s] {", full); } else { - txtenc_printf(e, "%s {", name); + UPB_PRIVATE(_upb_TextEncode_Printf)(e, "%s {", name); } - txtenc_endfield(e); + UPB_PRIVATE(_upb_TextEncode_EndField)(e); e->indent_depth++; - txtenc_msg(e, val.msg_val, upb_FieldDef_MessageSubDef(f)); + _upb_TextEncode_Msg(e, val.msg_val, upb_FieldDef_MessageSubDef(f)); e->indent_depth--; - txtenc_indent(e); - txtenc_putstr(e, "}"); - txtenc_endfield(e); + UPB_PRIVATE(_upb_TextEncode_Indent)(e); + UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "}"); + UPB_PRIVATE(_upb_TextEncode_EndField)(e); return; } if (is_ext) { - 
txtenc_printf(e, "[%s]: ", full); + UPB_PRIVATE(_upb_TextEncode_Printf)(e, "[%s]: ", full); } else { - txtenc_printf(e, "%s: ", name); + UPB_PRIVATE(_upb_TextEncode_Printf)(e, "%s: ", name); } - switch (ctype) { - case kUpb_CType_Bool: - txtenc_putstr(e, val.bool_val ? "true" : "false"); - break; - case kUpb_CType_Float: { - char buf[32]; - _upb_EncodeRoundTripFloat(val.float_val, buf, sizeof(buf)); - txtenc_putstr(e, buf); - break; - } - case kUpb_CType_Double: { - char buf[32]; - _upb_EncodeRoundTripDouble(val.double_val, buf, sizeof(buf)); - txtenc_putstr(e, buf); - break; - } - case kUpb_CType_Int32: - txtenc_printf(e, "%" PRId32, val.int32_val); - break; - case kUpb_CType_UInt32: - txtenc_printf(e, "%" PRIu32, val.uint32_val); - break; - case kUpb_CType_Int64: - txtenc_printf(e, "%" PRId64, val.int64_val); - break; - case kUpb_CType_UInt64: - txtenc_printf(e, "%" PRIu64, val.uint64_val); - break; - case kUpb_CType_String: - upb_HardenedPrintString(e, val.str_val.data, val.str_val.size); - break; - case kUpb_CType_Bytes: - txtenc_bytes(e, val.str_val); - break; - case kUpb_CType_Enum: - txtenc_enum(val.int32_val, f, e); - break; - default: - UPB_UNREACHABLE(); + if (ctype == kUpb_CType_Enum) { + _upb_TextEncode_Enum(val.int32_val, f, e); + } else { + UPB_PRIVATE(_upb_TextEncode_Scalar)(e, val, ctype); } - txtenc_endfield(e); + UPB_PRIVATE(_upb_TextEncode_EndField)(e); } /* @@ -315,33 +98,34 @@ static void txtenc_field(txtenc* e, upb_MessageValue val, * foo_field: 2 * foo_field: 3 */ -static void txtenc_array(txtenc* e, const upb_Array* arr, - const upb_FieldDef* f) { +static void _upb_TextEncode_Array(txtenc* e, const upb_Array* arr, + const upb_FieldDef* f) { size_t i; size_t size = upb_Array_Size(arr); for (i = 0; i < size; i++) { - txtenc_field(e, upb_Array_Get(arr, i), f); + _upb_TextEncode_Field(e, upb_Array_Get(arr, i), f); } } -static void txtenc_mapentry(txtenc* e, upb_MessageValue key, - upb_MessageValue val, const upb_FieldDef* f) { +static void 
_upb_TextEncode_MapEntry(txtenc* e, upb_MessageValue key, + upb_MessageValue val, + const upb_FieldDef* f) { const upb_MessageDef* entry = upb_FieldDef_MessageSubDef(f); const upb_FieldDef* key_f = upb_MessageDef_Field(entry, 0); const upb_FieldDef* val_f = upb_MessageDef_Field(entry, 1); - txtenc_indent(e); - txtenc_printf(e, "%s {", upb_FieldDef_Name(f)); - txtenc_endfield(e); + UPB_PRIVATE(_upb_TextEncode_Indent)(e); + UPB_PRIVATE(_upb_TextEncode_Printf)(e, "%s {", upb_FieldDef_Name(f)); + UPB_PRIVATE(_upb_TextEncode_EndField)(e); e->indent_depth++; - txtenc_field(e, key, key_f); - txtenc_field(e, val, val_f); + _upb_TextEncode_Field(e, key, key_f); + _upb_TextEncode_Field(e, val, val_f); e->indent_depth--; - txtenc_indent(e); - txtenc_putstr(e, "}"); - txtenc_endfield(e); + UPB_PRIVATE(_upb_TextEncode_Indent)(e); + UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "}"); + UPB_PRIVATE(_upb_TextEncode_EndField)(e); } /* @@ -356,12 +140,13 @@ static void txtenc_mapentry(txtenc* e, upb_MessageValue key, * value: 456 * } */ -static void txtenc_map(txtenc* e, const upb_Map* map, const upb_FieldDef* f) { +static void _upb_TextEncode_Map(txtenc* e, const upb_Map* map, + const upb_FieldDef* f) { if (e->options & UPB_TXTENC_NOSORT) { size_t iter = kUpb_Map_Begin; upb_MessageValue key, val; while (upb_Map_Next(map, &key, &val, &iter)) { - txtenc_mapentry(e, key, val, f); + _upb_TextEncode_MapEntry(e, key, val, f); } } else { if (upb_Map_Size(map) == 0) return; @@ -376,135 +161,25 @@ static void txtenc_map(txtenc* e, const upb_Map* map, const upb_FieldDef* f) { upb_MessageValue key, val; memcpy(&key, &ent.k, sizeof(key)); memcpy(&val, &ent.v, sizeof(val)); - txtenc_mapentry(e, key, val, f); + _upb_TextEncode_MapEntry(e, key, val, f); } _upb_mapsorter_popmap(&e->sorter, &sorted); } } -#define CHK(x) \ - do { \ - if (!(x)) { \ - return false; \ - } \ - } while (0) - -/* - * Unknown fields are printed by number. 
- * - * 1001: 123 - * 1002: "hello" - * 1006: 0xdeadbeef - * 1003: { - * 1: 111 - * } - */ -static const char* txtenc_unknown(txtenc* e, const char* ptr, - upb_EpsCopyInputStream* stream, - int groupnum) { - // We are guaranteed that the unknown data is valid wire format, and will not - // contain tag zero. - uint32_t end_group = groupnum > 0 - ? ((groupnum << kUpb_WireReader_WireTypeBits) | - kUpb_WireType_EndGroup) - : 0; - - while (!upb_EpsCopyInputStream_IsDone(stream, &ptr)) { - uint32_t tag; - CHK(ptr = upb_WireReader_ReadTag(ptr, &tag)); - if (tag == end_group) return ptr; - - txtenc_indent(e); - txtenc_printf(e, "%d: ", (int)upb_WireReader_GetFieldNumber(tag)); - - switch (upb_WireReader_GetWireType(tag)) { - case kUpb_WireType_Varint: { - uint64_t val; - CHK(ptr = upb_WireReader_ReadVarint(ptr, &val)); - txtenc_printf(e, "%" PRIu64, val); - break; - } - case kUpb_WireType_32Bit: { - uint32_t val; - ptr = upb_WireReader_ReadFixed32(ptr, &val); - txtenc_printf(e, "0x%08" PRIu32, val); - break; - } - case kUpb_WireType_64Bit: { - uint64_t val; - ptr = upb_WireReader_ReadFixed64(ptr, &val); - txtenc_printf(e, "0x%016" PRIu64, val); - break; - } - case kUpb_WireType_Delimited: { - int size; - char* start = e->ptr; - size_t start_overflow = e->overflow; - CHK(ptr = upb_WireReader_ReadSize(ptr, &size)); - CHK(upb_EpsCopyInputStream_CheckDataSizeAvailable(stream, ptr, size)); - - // Speculatively try to parse as message. - txtenc_putstr(e, "{"); - txtenc_endfield(e); - - // EpsCopyInputStream can't back up, so create a sub-stream for the - // speculative parse. 
- upb_EpsCopyInputStream sub_stream; - const char* sub_ptr = upb_EpsCopyInputStream_GetAliasedPtr(stream, ptr); - upb_EpsCopyInputStream_Init(&sub_stream, &sub_ptr, size, true); - - e->indent_depth++; - if (txtenc_unknown(e, sub_ptr, &sub_stream, -1)) { - ptr = upb_EpsCopyInputStream_Skip(stream, ptr, size); - e->indent_depth--; - txtenc_indent(e); - txtenc_putstr(e, "}"); - } else { - // Didn't work out, print as raw bytes. - e->indent_depth--; - e->ptr = start; - e->overflow = start_overflow; - const char* str = ptr; - ptr = upb_EpsCopyInputStream_ReadString(stream, &str, size, NULL); - UPB_ASSERT(ptr); - txtenc_bytes(e, (upb_StringView){.data = str, .size = size}); - } - break; - } - case kUpb_WireType_StartGroup: - txtenc_putstr(e, "{"); - txtenc_endfield(e); - e->indent_depth++; - CHK(ptr = txtenc_unknown(e, ptr, stream, - upb_WireReader_GetFieldNumber(tag))); - e->indent_depth--; - txtenc_indent(e); - txtenc_putstr(e, "}"); - break; - default: - return NULL; - } - txtenc_endfield(e); - } - - return end_group == 0 && !upb_EpsCopyInputStream_IsError(stream) ? 
ptr : NULL; -} - -#undef CHK - -static void txtenc_msg(txtenc* e, const upb_Message* msg, - const upb_MessageDef* m) { +static void _upb_TextEncode_Msg(txtenc* e, const upb_Message* msg, + const upb_MessageDef* m) { size_t iter = kUpb_Message_Begin; const upb_FieldDef* f; upb_MessageValue val; while (upb_Message_Next(msg, m, e->ext_pool, &f, &val, &iter)) { if (upb_FieldDef_IsMap(f)) { - txtenc_map(e, val.map_val, f); + _upb_TextEncode_Map(e, val.map_val, f); } else if (upb_FieldDef_IsRepeated(f)) { - txtenc_array(e, val.array_val, f); + _upb_TextEncode_Array(e, val.array_val, f); } else { - txtenc_field(e, val, f); + _upb_TextEncode_Field(e, val, f); } } @@ -515,7 +190,7 @@ static void txtenc_msg(txtenc* e, const upb_Message* msg, char* start = e->ptr; upb_EpsCopyInputStream stream; upb_EpsCopyInputStream_Init(&stream, &ptr, size, true); - if (!txtenc_unknown(e, ptr, &stream, -1)) { + if (!UPB_PRIVATE(_upb_TextEncode_Unknown)(e, ptr, &stream, -1)) { /* Unknown failed to parse, back up and don't print it at all. 
*/ e->ptr = start; } @@ -523,17 +198,6 @@ static void txtenc_msg(txtenc* e, const upb_Message* msg, } } -size_t txtenc_nullz(txtenc* e, size_t size) { - size_t ret = e->ptr - e->buf + e->overflow; - - if (size > 0) { - if (e->ptr == e->end) e->ptr--; - *e->ptr = '\0'; - } - - return ret; -} - size_t upb_TextEncode(const upb_Message* msg, const upb_MessageDef* m, const upb_DefPool* ext_pool, int options, char* buf, size_t size) { @@ -548,7 +212,7 @@ size_t upb_TextEncode(const upb_Message* msg, const upb_MessageDef* m, e.ext_pool = ext_pool; _upb_mapsorter_init(&e.sorter); - txtenc_msg(&e, msg, m); + _upb_TextEncode_Msg(&e, msg, m); _upb_mapsorter_destroy(&e.sorter); - return txtenc_nullz(&e, size); + return UPB_PRIVATE(_upb_TextEncode_Nullz)(&e, size); } diff --git a/upb/text/encode.h b/upb/text/encode.h index 9374dc3d30a1..7aea54c5511d 100644 --- a/upb/text/encode.h +++ b/upb/text/encode.h @@ -9,6 +9,7 @@ #define UPB_TEXT_ENCODE_H_ #include "upb/reflection/def.h" +#include "upb/text/options.h" // IWYU pragma: export // Must be last. #include "upb/port/def.inc" @@ -17,17 +18,6 @@ extern "C" { #endif -enum { - // When set, prints everything on a single line. - UPB_TXTENC_SINGLELINE = 1, - - // When set, unknown fields are not printed. - UPB_TXTENC_SKIPUNKNOWN = 2, - - // When set, maps are *not* sorted (this avoids allocating tmp mem). - UPB_TXTENC_NOSORT = 4 -}; - /* Encodes the given |msg| to text format. The message's reflection is given in * |m|. The symtab in |symtab| is used to find extensions (if NULL, extensions * will not be printed). diff --git a/upb/text/encode_debug_test.cc b/upb/text/encode_debug_test.cc new file mode 100644 index 000000000000..b8820141a49e --- /dev/null +++ b/upb/text/encode_debug_test.cc @@ -0,0 +1,63 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2024 Google LLC. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file or at +// https://developers.google.com/open-source/licenses/bsd + +#include + +#include + +#include +#include "absl/log/absl_log.h" +#include "upb/base/string_view.h" +#include "upb/base/upcast.h" +#include "upb/mem/arena.h" +#include "upb/message/message.h" +#include "upb/mini_table/message.h" +#include "upb/test/test.upb.h" +#include "upb/test/test.upb_minitable.h" +#include "upb/text/debug_string.h" + +TEST(TextNoReflection, Extensions) { + const upb_MiniTable* mt_main = upb_0test__ModelWithExtensions_msg_init_ptr; + upb_Arena* arena = upb_Arena_New(); + + upb_test_ModelExtension1* extension1 = upb_test_ModelExtension1_new(arena); + upb_test_ModelExtension1_set_str(extension1, + upb_StringView_FromString("Hello")); + + upb_test_ModelExtension2* extension2 = upb_test_ModelExtension2_new(arena); + upb_test_ModelExtension2_set_i(extension2, 5); + + upb_test_ModelWithExtensions* msg = upb_test_ModelWithExtensions_new(arena); + + upb_test_ModelExtension1_set_model_ext(msg, extension1, arena); + upb_test_ModelExtension2_set_model_ext(msg, extension2, arena); + + // Convert to a type of upb_Message* + upb_Message* input = UPB_UPCAST(msg); + // Resizing/reallocation of the buffer is not necessary since we're only + // testing that we get the expected debug string. + char* buf = new char[100]; + int options = + UPB_TXTENC_NOSORT; // Does not matter, but maps will not be sorted. 
+ size_t size = 100; + size_t real_size = upb_DebugString(input, mt_main, options, buf, size); + ABSL_LOG(INFO) << "Buffer: \n" + << buf << "\n" + << "Size:" << real_size << "\n"; + std::string golden = R"([4135] { + 9: 5 +} +[1547] { + 25: "Hello" +} +)"; + ASSERT_EQ(buf[real_size], '\0'); + std::string str(buf); + ASSERT_EQ(buf, golden); + delete[] buf; + upb_Arena_Free(arena); +} \ No newline at end of file diff --git a/upb/text/internal/encode.c b/upb/text/internal/encode.c new file mode 100644 index 000000000000..fc9cc6fd2737 --- /dev/null +++ b/upb/text/internal/encode.c @@ -0,0 +1,180 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2024 Google LLC. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file or at +// https://developers.google.com/open-source/licenses/bsd + +#include "upb/text/internal/encode.h" + +#include +#include +#include + +#include "upb/base/descriptor_constants.h" +#include "upb/base/string_view.h" +#include "upb/lex/round_trip.h" +#include "upb/message/array.h" +#include "upb/wire/eps_copy_input_stream.h" +#include "upb/wire/reader.h" +#include "upb/wire/types.h" + +// Must be last. +#include "upb/port/def.inc" + +#define CHK(x) \ + do { \ + if (!(x)) { \ + return false; \ + } \ + } while (0) + +/* + * Unknown fields are printed by number. + * + * 1001: 123 + * 1002: "hello" + * 1006: 0xdeadbeef + * 1003: { + * 1: 111 + * } + */ +const char* UPB_PRIVATE(_upb_TextEncode_Unknown)(txtenc* e, const char* ptr, + upb_EpsCopyInputStream* stream, + int groupnum) { + // We are guaranteed that the unknown data is valid wire format, and will not + // contain tag zero. + uint32_t end_group = groupnum > 0 + ? 
((groupnum << kUpb_WireReader_WireTypeBits) | + kUpb_WireType_EndGroup) + : 0; + + while (!upb_EpsCopyInputStream_IsDone(stream, &ptr)) { + uint32_t tag; + CHK(ptr = upb_WireReader_ReadTag(ptr, &tag)); + if (tag == end_group) return ptr; + + UPB_PRIVATE(_upb_TextEncode_Indent)(e); + UPB_PRIVATE(_upb_TextEncode_Printf) + (e, "%d: ", (int)upb_WireReader_GetFieldNumber(tag)); + + switch (upb_WireReader_GetWireType(tag)) { + case kUpb_WireType_Varint: { + uint64_t val; + CHK(ptr = upb_WireReader_ReadVarint(ptr, &val)); + UPB_PRIVATE(_upb_TextEncode_Printf)(e, "%" PRIu64, val); + break; + } + case kUpb_WireType_32Bit: { + uint32_t val; + ptr = upb_WireReader_ReadFixed32(ptr, &val); + UPB_PRIVATE(_upb_TextEncode_Printf)(e, "0x%08" PRIu32, val); + break; + } + case kUpb_WireType_64Bit: { + uint64_t val; + ptr = upb_WireReader_ReadFixed64(ptr, &val); + UPB_PRIVATE(_upb_TextEncode_Printf)(e, "0x%016" PRIu64, val); + break; + } + case kUpb_WireType_Delimited: { + int size; + char* start = e->ptr; + size_t start_overflow = e->overflow; + CHK(ptr = upb_WireReader_ReadSize(ptr, &size)); + CHK(upb_EpsCopyInputStream_CheckDataSizeAvailable(stream, ptr, size)); + + // Speculatively try to parse as message. + UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "{"); + UPB_PRIVATE(_upb_TextEncode_EndField)(e); + + // EpsCopyInputStream can't back up, so create a sub-stream for the + // speculative parse. + upb_EpsCopyInputStream sub_stream; + const char* sub_ptr = upb_EpsCopyInputStream_GetAliasedPtr(stream, ptr); + upb_EpsCopyInputStream_Init(&sub_stream, &sub_ptr, size, true); + + e->indent_depth++; + if (UPB_PRIVATE(_upb_TextEncode_Unknown)(e, sub_ptr, &sub_stream, -1)) { + ptr = upb_EpsCopyInputStream_Skip(stream, ptr, size); + e->indent_depth--; + UPB_PRIVATE(_upb_TextEncode_Indent)(e); + UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "}"); + } else { + // Didn't work out, print as raw bytes. 
+ e->indent_depth--; + e->ptr = start; + e->overflow = start_overflow; + const char* str = ptr; + ptr = upb_EpsCopyInputStream_ReadString(stream, &str, size, NULL); + UPB_ASSERT(ptr); + UPB_PRIVATE(_upb_TextEncode_Bytes) + (e, (upb_StringView){.data = str, .size = size}); + } + break; + } + case kUpb_WireType_StartGroup: + UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "{"); + UPB_PRIVATE(_upb_TextEncode_EndField)(e); + e->indent_depth++; + CHK(ptr = UPB_PRIVATE(_upb_TextEncode_Unknown)( + e, ptr, stream, upb_WireReader_GetFieldNumber(tag))); + e->indent_depth--; + UPB_PRIVATE(_upb_TextEncode_Indent)(e); + UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "}"); + break; + default: + return NULL; + } + UPB_PRIVATE(_upb_TextEncode_EndField)(e); + } + + return end_group == 0 && !upb_EpsCopyInputStream_IsError(stream) ? ptr : NULL; +} + +#undef CHK + +void UPB_PRIVATE(_upb_TextEncode_Scalar)(txtenc* e, upb_MessageValue val, + upb_CType ctype) { + switch (ctype) { + case kUpb_CType_Bool: + UPB_PRIVATE(_upb_TextEncode_PutStr)(e, val.bool_val ? 
"true" : "false"); + break; + case kUpb_CType_Float: { + char buf[32]; + _upb_EncodeRoundTripFloat(val.float_val, buf, sizeof(buf)); + UPB_PRIVATE(_upb_TextEncode_PutStr)(e, buf); + break; + } + case kUpb_CType_Double: { + char buf[32]; + _upb_EncodeRoundTripDouble(val.double_val, buf, sizeof(buf)); + UPB_PRIVATE(_upb_TextEncode_PutStr)(e, buf); + break; + } + case kUpb_CType_Int32: + UPB_PRIVATE(_upb_TextEncode_Printf)(e, "%" PRId32, val.int32_val); + break; + case kUpb_CType_UInt32: + UPB_PRIVATE(_upb_TextEncode_Printf)(e, "%" PRIu32, val.uint32_val); + break; + case kUpb_CType_Int64: + UPB_PRIVATE(_upb_TextEncode_Printf)(e, "%" PRId64, val.int64_val); + break; + case kUpb_CType_UInt64: + UPB_PRIVATE(_upb_TextEncode_Printf)(e, "%" PRIu64, val.uint64_val); + break; + case kUpb_CType_String: + UPB_PRIVATE(_upb_HardenedPrintString) + (e, val.str_val.data, val.str_val.size); + break; + case kUpb_CType_Bytes: + UPB_PRIVATE(_upb_TextEncode_Bytes)(e, val.str_val); + break; + case kUpb_CType_Enum: + UPB_ASSERT(false); // handled separately in each encoder + break; + default: + UPB_UNREACHABLE(); + } +} \ No newline at end of file diff --git a/upb/text/internal/encode.h b/upb/text/internal/encode.h new file mode 100644 index 000000000000..598b2a0c45c0 --- /dev/null +++ b/upb/text/internal/encode.h @@ -0,0 +1,240 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2023 Google LLC. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file or at +// https://developers.google.com/open-source/licenses/bsd + +#ifndef UPB_TEXT_ENCODE_INTERNAL_H_ +#define UPB_TEXT_ENCODE_INTERNAL_H_ + +#include +#include + +#include "upb/base/descriptor_constants.h" +#include "upb/base/string_view.h" +#include "upb/message/array.h" +#include "upb/message/internal/map_sorter.h" +#include "upb/port/vsnprintf_compat.h" +#include "upb/text/options.h" +#include "upb/wire/eps_copy_input_stream.h" +#include "utf8_range.h" + +// Must be last. +#include "upb/port/def.inc" + +typedef struct { + char *buf, *ptr, *end; + size_t overflow; + int indent_depth; + int options; + const struct upb_DefPool* ext_pool; + _upb_mapsorter sorter; +} txtenc; + +UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_PutBytes)(txtenc* e, + const void* data, + size_t len) { + size_t have = e->end - e->ptr; + if (UPB_LIKELY(have >= len)) { + memcpy(e->ptr, data, len); + e->ptr += len; + } else { + if (have) { + memcpy(e->ptr, data, have); + e->ptr += have; + } + e->overflow += (len - have); + } +} + +UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_PutStr)(txtenc* e, + const char* str) { + UPB_PRIVATE(_upb_TextEncode_PutBytes)(e, str, strlen(str)); +} + +UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_Printf)(txtenc* e, const char* fmt, + ...) 
{ + size_t n; + size_t have = e->end - e->ptr; + va_list args; + + va_start(args, fmt); + n = _upb_vsnprintf(e->ptr, have, fmt, args); + va_end(args); + + if (UPB_LIKELY(have > n)) { + e->ptr += n; + } else { + e->ptr = UPB_PTRADD(e->ptr, have); + e->overflow += (n - have); + } +} + +UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_Indent)(txtenc* e) { + if ((e->options & UPB_TXTENC_SINGLELINE) == 0) { + int i = e->indent_depth; + while (i-- > 0) { + UPB_PRIVATE(_upb_TextEncode_PutStr)(e, " "); + } + } +} + +UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_EndField)(txtenc* e) { + if (e->options & UPB_TXTENC_SINGLELINE) { + UPB_PRIVATE(_upb_TextEncode_PutStr)(e, " "); + } else { + UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\n"); + } +} + +UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_Escaped)(txtenc* e, + unsigned char ch) { + switch (ch) { + case '\n': + UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\\n"); + break; + case '\r': + UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\\r"); + break; + case '\t': + UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\\t"); + break; + case '\"': + UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\\\""); + break; + case '\'': + UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\\'"); + break; + case '\\': + UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\\\\"); + break; + default: + UPB_PRIVATE(_upb_TextEncode_Printf)(e, "\\%03o", ch); + break; + } +} + +// Returns true if `ch` needs to be escaped in TextFormat, independent of any +// UTF-8 validity issues. +UPB_INLINE bool UPB_PRIVATE(_upb_DefinitelyNeedsEscape)(unsigned char ch) { + if (ch < 32) return true; + switch (ch) { + case '\"': + case '\'': + case '\\': + case 127: + return true; + } + return false; +} + +UPB_INLINE bool UPB_PRIVATE(_upb_AsciiIsPrint)(unsigned char ch) { + return ch >= 32 && ch < 127; +} + +// Returns true if this is a high byte that requires UTF-8 validation. If the +// UTF-8 validation fails, we must escape the byte. 
+UPB_INLINE bool UPB_PRIVATE(_upb_NeedsUtf8Validation)(unsigned char ch) { + return ch > 127; +} + +// Returns the number of bytes in the prefix of `val` that do not need escaping. +// This is like utf8_range::SpanStructurallyValid(), except that it also +// terminates at any ASCII char that needs to be escaped in TextFormat (any char +// that has `DefinitelyNeedsEscape(ch) == true`). +// +// If we could get a variant of utf8_range::SpanStructurallyValid() that could +// terminate on any of these chars, that might be more efficient, but it would +// be much more complicated to modify that heavily SIMD code. +UPB_INLINE size_t UPB_PRIVATE(_SkipPassthroughBytes)(const char* ptr, + size_t size) { + for (size_t i = 0; i < size; i++) { + unsigned char uc = ptr[i]; + if (UPB_PRIVATE(_upb_DefinitelyNeedsEscape)(uc)) return i; + if (UPB_PRIVATE(_upb_NeedsUtf8Validation)(uc)) { + // Find the end of this region of consecutive high bytes, so that we only + // give high bytes to the UTF-8 checker. This avoids needing to perform + // a second scan of the ASCII characters looking for characters that + // need escaping. + // + // We assume that high bytes are less frequent than plain, printable ASCII + // bytes, so we accept the double-scan of high bytes. + size_t end = i + 1; + for (; end < size; end++) { + if (!UPB_PRIVATE(_upb_NeedsUtf8Validation)(ptr[end])) break; + } + size_t n = end - i; + size_t ok = utf8_range_ValidPrefix(ptr + i, n); + if (ok != n) return i + ok; + i += ok - 1; + } + } + return size; +} + +UPB_INLINE void UPB_PRIVATE(_upb_HardenedPrintString)(txtenc* e, + const char* ptr, + size_t len) { + // Print as UTF-8, while guarding against any invalid UTF-8 in the string + // field. + // + // If in the future we have a guaranteed invariant that invalid UTF-8 will + // never be present, we could avoid the UTF-8 check here. 
+ UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\""); + const char* end = ptr + len; + while (ptr < end) { + size_t n = UPB_PRIVATE(_SkipPassthroughBytes)(ptr, end - ptr); + if (n != 0) { + UPB_PRIVATE(_upb_TextEncode_PutBytes)(e, ptr, n); + ptr += n; + if (ptr == end) break; + } + + // If repeated calls to CEscape() and PrintString() are expensive, we could + // consider batching them, at the cost of some complexity. + UPB_PRIVATE(_upb_TextEncode_Escaped)(e, *ptr); + ptr++; + } + UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\""); +} + +UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_Bytes)(txtenc* e, + upb_StringView data) { + const char* ptr = data.data; + const char* end = ptr + data.size; + UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\""); + for (; ptr < end; ptr++) { + unsigned char uc = *ptr; + if (UPB_PRIVATE(_upb_AsciiIsPrint)(uc)) { + UPB_PRIVATE(_upb_TextEncode_PutBytes)(e, ptr, 1); + } else { + UPB_PRIVATE(_upb_TextEncode_Escaped)(e, uc); + } + } + UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\""); +} + +UPB_INLINE size_t UPB_PRIVATE(_upb_TextEncode_Nullz)(txtenc* e, size_t size) { + size_t ret = e->ptr - e->buf + e->overflow; + + if (size > 0) { + if (e->ptr == e->end) e->ptr--; + *e->ptr = '\0'; + } + + return ret; +} + +const char* UPB_PRIVATE(_upb_TextEncode_Unknown)(txtenc* e, const char* ptr, + upb_EpsCopyInputStream* stream, + int groupnum); + +// Must not be called for ctype = kUpb_CType_Enum, as they require different +// handling depending on whether or not we're doing reflection-based encoding. +void UPB_PRIVATE(_upb_TextEncode_Scalar)(txtenc* e, upb_MessageValue val, + upb_CType ctype); + +#include "upb/port/undef.inc" + +#endif // UPB_TEXT_ENCODE_INTERNAL_H_ diff --git a/upb/text/options.h b/upb/text/options.h new file mode 100644 index 000000000000..dcaa8bd7d231 --- /dev/null +++ b/upb/text/options.h @@ -0,0 +1,22 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2024 Google LLC. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file or at +// https://developers.google.com/open-source/licenses/bsd + +#ifndef UPB_TEXT_OPTIONS_H_ +#define UPB_TEXT_OPTIONS_H_ + +enum { + // When set, prints everything on a single line. + UPB_TXTENC_SINGLELINE = 1, + + // When set, unknown fields are not printed. + UPB_TXTENC_SKIPUNKNOWN = 2, + + // When set, maps are *not* sorted (this avoids allocating tmp mem). + UPB_TXTENC_NOSORT = 4 +}; + +#endif // UPB_TEXT_OPTIONS_H_