diff --git a/Cargo.lock b/Cargo.lock
index 86c5827f0fda..7404b2a25f2b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -35,7 +35,7 @@ version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47"
 dependencies = [
- "getrandom 0.2.7",
+ "getrandom 0.2.8",
  "once_cell",
  "version_check",
 ]
@@ -48,16 +48,16 @@ checksum = "bf6ccdb167abbf410dcb915cabd428929d7f6a04980b54a11f26a39f1c7f7107"
 dependencies = [
  "cfg-if 1.0.0",
  "const-random",
- "getrandom 0.2.7",
+ "getrandom 0.2.8",
  "once_cell",
  "version_check",
 ]

 [[package]]
 name = "aho-corasick"
-version = "0.7.19"
+version = "0.7.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b4f55bd91a0978cbfd91c457a164bab8b4001c833b7f323132c0a4e1922dd44e"
+checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac"
 dependencies = [
  "memchr",
 ]
@@ -68,7 +68,7 @@ version = "0.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "befdff0b4683a0824fc8719ce639a252d9d62cd89c8d0004c39e2417128c1eb8"
 dependencies = [
- "axum 0.6.1",
+ "axum",
  "bytes",
  "cfg-if 1.0.0",
  "http",
@@ -123,9 +123,9 @@ dependencies = [

 [[package]]
 name = "anyhow"
-version = "1.0.65"
+version = "1.0.66"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "98161a4e3e2184da77bb14f02184cdd111e83bbbcc9979dfee3c44b9a85f5602"
+checksum = "216261ddc8289130e551ddcd5ce8a064710c0d064a4d2895c67151c92b5443f6"

 [[package]]
 name = "anymap"
@@ -141,7 +141,7 @@ dependencies = [
  "common-error",
  "common-time",
  "datatypes",
- "prost 0.11.0",
+ "prost 0.11.3",
  "snafu",
  "tonic",
  "tonic-build",
@@ -192,30 +192,6 @@ version = "0.7.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6"

-[[package]]
-name = "arrow"
-version = "10.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1328dbc6d5d76a08b13df3ac630f61a6a31276d9e9d08eb813e98efa624c2382"
-dependencies = [
- "bitflags",
- "chrono",
- "csv",
- "flatbuffers 2.1.1",
- "half 1.8.2",
- "hex",
- "indexmap",
- "lazy_static",
- "lexical-core",
- "multiversion",
- "num",
- "rand 0.8.5",
- "regex",
- "serde",
- "serde_derive",
- "serde_json",
-]
-
 [[package]]
 name = "arrow"
 version = "26.0.0"
@@ -230,10 +206,11 @@ dependencies = [
  "arrow-select",
  "bitflags",
  "chrono",
+ "comfy-table",
  "csv",
- "flatbuffers 22.9.29",
+ "flatbuffers",
  "half 2.1.0",
- "hashbrown",
+ "hashbrown 0.12.3",
  "indexmap",
  "lazy_static",
  "lexical-core",
@@ -256,7 +233,7 @@ dependencies = [
  "arrow-schema",
  "chrono",
  "half 2.1.0",
- "hashbrown",
+ "hashbrown 0.12.3",
  "num",
 ]
@@ -282,21 +259,14 @@ dependencies = [
  "num",
 ]

-[[package]]
-name = "arrow-format"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2333f8ccf0d597ba779863c57a0b61f635721187fb2fdeabae92691d7d582fe5"
-dependencies = [
- "planus",
- "serde",
-]
-
 [[package]]
 name = "arrow-schema"
 version = "26.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0f9406eb7834ca6bd8350d1baa515d18b9fcec487eddacfb62f5e19511f7bd37"
+dependencies = [
+ "serde",
+]

 [[package]]
 name = "arrow-select"
@@ -311,38 +281,6 @@ dependencies = [
  "num",
 ]

-[[package]]
-name = "arrow2"
-version = "0.10.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2e387b20dd573a96f36b173d9027483898f944d696521afd74e2caa3c813d86e"
-dependencies = [
- "ahash 0.7.6",
- "arrow-format",
- "base64",
- "bytemuck",
- "chrono",
- "csv",
- "csv-core",
- "either",
- "fallible-streaming-iterator",
- "futures",
- "hash_hasher",
- "indexmap",
- "itertools",
- "lexical-core",
- "multiversion",
- "num-traits",
- "parquet2",
- "regex",
- "serde",
- "serde_derive",
- "serde_json",
- "simdutf8",
- "streaming-iterator",
- "strength_reduce",
-]
-
 [[package]]
 name = "ascii"
 version = "1.1.0"
@@ -360,9 +298,9 @@ dependencies = [

 [[package]]
 name = "async-channel"
-version = "1.7.1"
+version = "1.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e14485364214912d3b19cc3435dde4df66065127f05fa0d75c712f36f12c2f28"
+checksum = "cf46fee83e5ccffc220104713af3292ff9bc7c64c7de289f66dae8e38d826833"
 dependencies = [
  "concurrent-queue",
  "event-listener",
@@ -384,13 +322,15 @@ dependencies = [

 [[package]]
 name = "async-compression"
-version = "0.3.14"
+version = "0.3.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "345fd392ab01f746c717b1357165b76f0b67a60192007b234058c9045fdcf695"
+checksum = "942c7cd7ae39e91bde4820d74132e9862e62c2f386c3aa90ccf55949f5bad63a"
 dependencies = [
  "brotli",
+ "bzip2",
  "flate2",
  "futures-core",
+ "futures-io",
  "memchr",
  "pin-project-lite",
  "tokio",
@@ -398,9 +338,9 @@ dependencies = [

 [[package]]
 name = "async-io"
-version = "1.10.0"
+version = "1.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e8121296a9f05be7f34aa4196b1747243b3b62e048bb7906f644f3fbfc490cf7"
+checksum = "8c374dda1ed3e7d8f0d9ba58715f924862c63eae6849c92d3a18e7fbde9e2794"
 dependencies = [
  "async-lock",
  "autocfg",
@@ -413,7 +353,7 @@ dependencies = [
  "slab",
  "socket2",
  "waker-fn",
- "winapi",
+ "windows-sys 0.42.0",
 ]
@@ -449,9 +389,9 @@ dependencies = [

 [[package]]
 name = "async-trait"
-version = "0.1.57"
+version = "0.1.59"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "76464446b8bc32758d7e88ee1a804d9914cd9b1cb264c029899680b0be29826f"
+checksum = "31e6e93155431f3931513b243d371981bb2770112b370c82745a1d19d2f99364"
 dependencies = [
  "proc-macro2",
  "quote",
  "syn",
@@ -479,7 +419,7 @@ version = "0.2.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
 dependencies = [
- "hermit-abi",
+ "hermit-abi 0.1.19",
  "libc",
  "winapi",
 ]
@@ -496,37 +436,6 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"

-[[package]]
-name = "axum"
-version = "0.5.16"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c9e3356844c4d6a6d6467b8da2cffb4a2820be256f50a3a386c9d152bab31043"
-dependencies = [
- "async-trait",
- "axum-core 0.2.8",
- "bitflags",
- "bytes",
- "futures-util",
- "http",
- "http-body",
- "hyper",
- "itoa 1.0.3",
- "matchit 0.5.0",
- "memchr",
- "mime",
- "percent-encoding",
- "pin-project-lite",
- "serde",
- "serde_json",
- "serde_urlencoded",
- "sync_wrapper",
- "tokio",
- "tower",
- "tower-http",
- "tower-layer",
- "tower-service",
-]
-
 [[package]]
 name = "axum"
 version = "0.6.1"
@@ -534,15 +443,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "08b108ad2665fa3f6e6a517c3d80ec3e77d224c47d605167aefaa5d7ef97fa48"
 dependencies = [
  "async-trait",
- "axum-core 0.3.0",
+ "axum-core",
  "bitflags",
  "bytes",
  "futures-util",
  "http",
  "http-body",
  "hyper",
- "itoa 1.0.3",
- "matchit 0.7.0",
+ "itoa 1.0.4",
+ "matchit",
  "memchr",
  "mime",
  "percent-encoding",
@@ -560,22 +469,6 @@ dependencies = [
  "tower-service",
 ]

-[[package]]
-name = "axum-core"
-version = "0.2.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d9f0c0a60006f2a293d82d571f635042a72edf927539b7685bd62d361963839b"
-dependencies = [
- "async-trait",
- "bytes",
- "futures-util",
- "http",
- "http-body",
- "mime",
- "tower-layer",
- "tower-service",
-]
-
 [[package]]
 name = "axum-core"
 version = "0.3.0"
@@ -608,9 +501,9 @@ dependencies = [
 [[package]]
 name = "axum-test-helper"
 version = "0.1.1"
-source = "git+https://github.com/sunng87/axum-test-helper.git?branch=patch-1#c90b5fed699080636330f3a97c1ee20d845329f0"
+source = "git+https://github.com/sunng87/axum-test-helper.git?branch=patch-1#5aa7843ce2250144ea1b7f589f274c00cf1af4ab"
 dependencies = [
- "axum 0.5.16",
+ "axum",
  "bytes",
  "http",
  "http-body",
@@ -629,7 +522,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b62ddb9cb1ec0a098ad4bbf9344d0713fa193ae1a80af55febcff2627b6a00c1"
 dependencies = [
  "futures-core",
- "getrandom 0.2.7",
+ "getrandom 0.2.8",
  "instant",
  "pin-project-lite",
  "rand 0.8.5",
@@ -658,23 +551,23 @@ dependencies = [
  "cc",
  "cfg-if 1.0.0",
  "libc",
- "miniz_oxide",
+ "miniz_oxide 0.5.4",
  "object",
  "rustc-demangle",
 ]

 [[package]]
 name = "base64"
-version = "0.13.0"
+version = "0.13.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd"
+checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"

 [[package]]
 name = "benchmarks"
 version = "0.1.0"
 dependencies = [
- "arrow 10.0.0",
- "clap 4.0.18",
+ "arrow",
+ "clap 4.0.29",
  "client",
  "indicatif",
  "itertools",
@@ -761,15 +654,6 @@ version = "1.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"

-[[package]]
-name = "bitpacking"
-version = "0.8.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a8c7d2ac73c167c06af4a5f37e6e59d84148d57ccbe4480b76f0273eefea82d7"
-dependencies = [
- "crunchy",
-]
-
 [[package]]
 name = "bitvec"
 version = "1.0.1"
@@ -784,9 +668,9 @@ dependencies = [

 [[package]]
 name = "blake2"
-version = "0.10.4"
+version = "0.10.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b9cf849ee05b2ee5fba5e36f97ff8ec2533916700fc0758d40d92136a42f3388"
+checksum = "b12e5fd123190ce1c2e559308a94c9bacad77907d4c6005d9e58fe1a0689e55e"
 dependencies = [
  "digest",
 ]
@@ -799,20 +683,20 @@ checksum = "afa748e348ad3be8263be728124b24a24f268266f6f5d58af9d75f6a40b5c587"
 dependencies = [
  "arrayref",
  "arrayvec 0.5.2",
- "constant_time_eq",
+ "constant_time_eq 0.1.5",
 ]

 [[package]]
 name = "blake3"
-version = "1.3.1"
+version = "1.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a08e53fc5a564bb15bfe6fae56bd71522205f1f91893f9c0116edad6496c183f"
+checksum = "42ae2468a89544a466886840aa467a25b766499f4f04bf7d9fcd10ecee9fccef"
 dependencies = [
  "arrayref",
  "arrayvec 0.7.2",
  "cc",
  "cfg-if 1.0.0",
- "constant_time_eq",
+ "constant_time_eq 0.2.4",
  "digest",
 ]
@@ -825,6 +709,51 @@ dependencies = [
  "generic-array",
 ]

+[[package]]
+name = "borsh"
+version = "0.9.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "15bf3650200d8bffa99015595e10f1fbd17de07abbc25bb067da79e769939bfa"
+dependencies = [
+ "borsh-derive",
+ "hashbrown 0.11.2",
+]
+
+[[package]]
+name = "borsh-derive"
+version = "0.9.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6441c552f230375d18e3cc377677914d2ca2b0d36e52129fe15450a2dce46775"
+dependencies = [
+ "borsh-derive-internal",
+ "borsh-schema-derive-internal",
+ "proc-macro-crate 0.1.5",
+ "proc-macro2",
+ "syn",
+]
+
+[[package]]
+name = "borsh-derive-internal"
+version = "0.9.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5449c28a7b352f2d1e592a8a28bf139bc71afb0764a14f3c02500935d8c44065"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "borsh-schema-derive-internal"
+version = "0.9.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cdbd5696d8bfa21d53d9fe39a714a18538bad11492a42d066dbbc395fb1951c0"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
 [[package]]
 name = "brotli"
 version = "3.3.4"
@@ -871,36 +800,37 @@ dependencies = [

 [[package]]
 name = "bumpalo"
-version = "3.11.0"
+version = "3.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c1ad822118d20d2c234f427000d5acc36eabe1e29a348c89b63dd60b13f28e5d"
+checksum = "572f695136211188308f16ad2ca5c851a712c464060ae6974944458eb83880ba"

 [[package]]
-name = "bytecount"
-version = "0.6.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c"
-
-[[package]]
-name = "bytemuck"
-version = "1.12.1"
+name = "bytecheck"
+version = "0.6.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2f5715e491b5a1598fc2bef5a606847b5dc1d48ea625bd3c02c00de8285591da"
+checksum = "d11cac2c12b5adc6570dad2ee1b87eff4955dac476fe12d81e5fdd352e52406f"
 dependencies = [
- "bytemuck_derive",
+ "bytecheck_derive",
+ "ptr_meta",
 ]

 [[package]]
-name = "bytemuck_derive"
-version = "1.2.1"
+name = "bytecheck_derive"
+version = "0.6.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1b9e1f5fa78f69496407a27ae9ed989e3c3b072310286f5ef385525e4cbc24a9"
+checksum = "13e576ebe98e605500b3c8041bb888e966653577172df6dd97398714eb30b9bf"
 dependencies = [
  "proc-macro2",
  "quote",
  "syn",
 ]

+[[package]]
+name = "bytecount"
+version = "0.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c"
+
 [[package]]
 name = "byteorder"
 version = "1.4.3"
@@ -909,18 +839,33 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"

 [[package]]
 name = "bytes"
-version = "1.2.1"
+version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ec8a7b6a70fde80372154c65702f00a0f56f3e1c36abbc6c440484be248856db"
+checksum = "dfb24e866b15a1af2a1b663f10c6b6b8f397a84aadb828f12e5b289ec23a3a3c"
 dependencies = [
  "serde",
 ]

 [[package]]
-name = "cache-padded"
-version = "1.2.0"
+name = "bzip2"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6afcd980b5f3a45017c57e57a2fcccbb351cc43a356ce117ef760ef8052b89b0"
+dependencies = [
+ "bzip2-sys",
+ "libc",
+]
+
+[[package]]
+name = "bzip2-sys"
+version = "0.1.11+1.0.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c1db59621ec70f09c5e9b597b220c7a2b43611f4710dc03ceb8748637775692c"
+checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc"
+dependencies = [
+ "cc",
+ "libc",
+ "pkg-config",
+]

 [[package]]
 name = "cactus"
@@ -1014,9 +959,9 @@ dependencies = [

 [[package]]
 name = "cc"
-version = "1.0.73"
+version = "1.0.77"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11"
+checksum = "e9f73505338f7d905b19d18738976aae232eb46b8efc15554ffc56deb5d9ebe4"
 dependencies = [
  "jobserver",
 ]
@@ -1058,9 +1003,9 @@ dependencies = [

 [[package]]
 name = "chrono"
-version = "0.4.22"
+version = "0.4.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bfd4d1b31faaa3a89d7934dbded3111da0d2ef28e3ebccdb4f0179f5929d1ef1"
+checksum = "16b0a3d9ed01224b22057780a37bb8c5dbfe1be8ba48678e7bf57ec4b385411f"
 dependencies = [
  "iana-time-zone",
  "js-sys",
@@ -1155,9 +1100,9 @@ dependencies = [

 [[package]]
 name = "clap"
-version = "3.2.22"
+version = "3.2.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "86447ad904c7fb335a790c9d7fe3d0d971dc523b8ccd1561a520de9a85302750"
+checksum = "71655c45cb9845d3270c9d6df84ebe72b4dad3c2ba3f7023ad47c144e4e473a5"
 dependencies = [
  "atty",
  "bitflags",
@@ -1167,19 +1112,19 @@ dependencies = [
  "once_cell",
  "strsim 0.10.0",
  "termcolor",
- "textwrap 0.15.1",
+ "textwrap 0.16.0",
 ]

 [[package]]
 name = "clap"
-version = "4.0.18"
+version = "4.0.29"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "335867764ed2de42325fafe6d18b8af74ba97ee0c590fa016f157535b42ab04b"
+checksum = "4d63b9e9c07271b9957ad22c173bae2a4d9a81127680962039296abcd2f8251d"
 dependencies = [
- "atty",
  "bitflags",
- "clap_derive 4.0.18",
+ "clap_derive 4.0.21",
  "clap_lex 0.3.0",
+ "is-terminal",
  "once_cell",
  "strsim 0.10.0",
  "termcolor",
@@ -1200,9 +1145,9 @@ dependencies = [

 [[package]]
 name = "clap_derive"
-version = "4.0.18"
+version = "4.0.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "16a1b0f6422af32d5da0c58e2703320f379216ee70198241c84173a8c5ac28f3"
+checksum = "0177313f9f02afc995627906bbd8967e2be069f5261954222dac78290c2b9014"
 dependencies = [
  "heck 0.4.0",
  "proc-macro-error",
@@ -1271,9 +1216,9 @@ dependencies = [

 [[package]]
 name = "cmake"
-version = "0.1.48"
+version = "0.1.49"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e8ad8cef104ac57b68b89df3208164d228503abbdce70f6880ffa3d970e7443a"
+checksum = "db34956e100b30725f2eb215f90d4871051239535632f84fea3bc92722c66b7c"
 dependencies = [
  "cc",
 ]
@@ -1284,7 +1229,7 @@ version = "0.1.0"
 dependencies = [
  "anymap",
  "build-data",
- "clap 3.2.22",
+ "clap 3.2.23",
  "common-error",
  "common-telemetry",
  "datanode",
@@ -1301,25 +1246,24 @@ dependencies = [
 ]

 [[package]]
-name = "comfy-table"
-version = "5.0.1"
+name = "codespan-reporting"
+version = "0.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b103d85ca6e209388771bfb7aa6b68a7aeec4afbf6f0a0264bfbf50360e5212e"
+checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e"
 dependencies = [
- "strum 0.23.0",
- "strum_macros 0.23.1",
+ "termcolor",
  "unicode-width",
 ]

 [[package]]
 name = "comfy-table"
-version = "6.1.2"
+version = "6.1.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1090f39f45786ec6dc6286f8ea9c75d0a7ef0a0d3cda674cef0c3af7b307fbc2"
+checksum = "e621e7e86c46fd8a14c32c6ae3cb95656621b4743a27d0cffedb831d46e7ad21"
 dependencies = [
  "crossterm",
- "strum 0.24.1",
- "strum_macros 0.24.3",
+ "strum",
+ "strum_macros",
  "unicode-width",
 ]
@@ -1370,7 +1314,7 @@ dependencies = [
  "common-function-macro",
  "common-query",
  "common-time",
- "datafusion-common 7.0.0",
+ "datafusion-common",
  "datatypes",
  "libc",
  "num",
@@ -1446,7 +1390,7 @@ dependencies = [
  "common-recordbatch",
  "common-time",
  "datafusion",
- "datafusion-common 7.0.0",
+ "datafusion-common",
  "datafusion-expr",
  "datatypes",
  "snafu",
@@ -1460,7 +1404,7 @@ version = "0.1.0"
 dependencies = [
  "common-error",
  "datafusion",
- "datafusion-common 7.0.0",
+ "datafusion-common",
  "datatypes",
  "futures",
  "paste",
@@ -1518,22 +1462,22 @@ dependencies = [

 [[package]]
 name = "concurrent-queue"
-version = "1.2.4"
+version = "2.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "af4780a44ab5696ea9e28294517f1fffb421a83a25af521333c838635509db9c"
+checksum = "bd7bef69dc86e3c610e4e7aed41035e2a7ed12e72dd7530f61327a6579a4390b"
 dependencies = [
- "cache-padded",
+ "crossbeam-utils",
 ]

 [[package]]
 name = "console"
-version = "0.15.1"
+version = "0.15.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "89eab4d20ce20cea182308bca13088fecea9c05f6776cf287205d41a0ed3c847"
+checksum = "c050367d967ced717c04b65d8c619d863ef9292ce0c5760028655a2fb298718c"
 dependencies = [
  "encode_unicode",
+ "lazy_static",
  "libc",
- "once_cell",
  "terminal_size",
  "unicode-width",
  "winapi",
@@ -1545,8 +1489,8 @@ version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e57ff02e8ad8e06ab9731d5dc72dc23bef9200778eae1a89d555d8c42e5d4a86"
 dependencies = [
- "prost 0.11.0",
- "prost-types 0.11.1",
+ "prost 0.11.3",
+ "prost-types 0.11.2",
  "tonic",
  "tracing-core",
 ]
@@ -1563,7 +1507,7 @@ dependencies = [
  "futures",
  "hdrhistogram",
  "humantime",
- "prost-types 0.11.1",
+ "prost-types 0.11.2",
  "serde",
  "serde_json",
  "thread_local",
@@ -1591,7 +1535,7 @@ version = "0.1.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9d7d6ab3c3a2282db210df5f02c4dab6e0a7057af0fb7ebd4070f30fe05c0ddb"
 dependencies = [
- "getrandom 0.2.7",
+ "getrandom 0.2.8",
  "once_cell",
  "proc-macro-hack",
  "tiny-keccak",
@@ -1603,6 +1547,12 @@ version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc"

+[[package]]
+name = "constant_time_eq"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f3ad85c1f65dc7b37604eb0e89748faf0b9653065f2a8ef69f96a687ec1e9279"
+
 [[package]]
 name = "core-foundation"
 version = "0.9.3"
@@ -1688,7 +1638,7 @@ dependencies = [
  "atty",
  "cast",
  "ciborium",
- "clap 3.2.22",
+ "clap 3.2.23",
  "criterion-plot 0.5.0",
  "itertools",
  "lazy_static",
@@ -1761,23 +1711,22 @@ dependencies = [

 [[package]]
 name = "crossbeam-epoch"
-version = "0.9.10"
+version = "0.9.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "045ebe27666471bb549370b4b0b3e51b07f56325befa4284db65fc89c02511b1"
+checksum = "01a9af1f4c2ef74bb8aa1f7e19706bc72d03598c8a570bb5de72243c7a9d9d5a"
 dependencies = [
  "autocfg",
  "cfg-if 1.0.0",
  "crossbeam-utils",
- "memoffset",
- "once_cell",
+ "memoffset 0.7.1",
  "scopeguard",
 ]

 [[package]]
 name = "crossbeam-queue"
-version = "0.3.6"
+version = "0.3.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1cd42583b04998a5363558e5f9291ee5a5ff6b49944332103f251e7479a82aa7"
+checksum = "d1cfb3ea8a53f37c40dea2c7bedcbd88bdfae54f5e2175d6ecaff1c988353add"
 dependencies = [
  "cfg-if 1.0.0",
  "crossbeam-utils",
@@ -1785,12 +1734,11 @@ dependencies = [

 [[package]]
 name = "crossbeam-utils"
-version = "0.8.11"
+version = "0.8.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "51887d4adc7b564537b15adcfb307936f8075dfcd5f00dde9a9f1d29383682bc"
+checksum = "4fb766fa798726286dbbb842f174001dab8abc7b627a1dd86e0b7222a95d929f"
 dependencies = [
  "cfg-if 1.0.0",
- "once_cell",
 ]

 [[package]]
@@ -1856,11 +1804,55 @@ dependencies = [
  "memchr",
 ]

+[[package]]
+name = "cxx"
+version = "1.0.83"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bdf07d07d6531bfcdbe9b8b739b104610c6508dcc4d63b410585faf338241daf"
+dependencies = [
+ "cc",
+ "cxxbridge-flags",
+ "cxxbridge-macro",
+ "link-cplusplus",
+]
+
+[[package]]
+name = "cxx-build"
+version = "1.0.83"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d2eb5b96ecdc99f72657332953d4d9c50135af1bac34277801cc3937906ebd39"
+dependencies = [
+ "cc",
+ "codespan-reporting",
+ "once_cell",
+ "proc-macro2",
+ "quote",
+ "scratch",
+ "syn",
+]
+
+[[package]]
+name = "cxxbridge-flags"
+version = "1.0.83"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac040a39517fd1674e0f32177648334b0f4074625b5588a64519804ba0553b12"
+
+[[package]]
+name = "cxxbridge-macro"
+version = "1.0.83"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1362b0ddcfc4eb0a1f57b68bd77dd99f0e826958a96abd0ae9bd092e114ffed6"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
 [[package]]
 name = "darling"
-version = "0.14.1"
+version = "0.14.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4529658bdda7fd6769b8614be250cdcfc3aeb0ee72fe66f9e41e5e5eb73eac02"
+checksum = "b0dd3cd20dc6b5a876612a6e5accfe7f3dd883db6d07acfbf14c128f61550dfa"
 dependencies = [
  "darling_core",
  "darling_macro",
@@ -1868,9 +1860,9 @@ dependencies = [

 [[package]]
 name = "darling_core"
-version = "0.14.1"
+version = "0.14.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "649c91bc01e8b1eac09fb91e8dbc7d517684ca6be8ebc75bb9cafc894f9fdb6f"
+checksum = "a784d2ccaf7c98501746bf0be29b2022ba41fd62a2e622af997a03e9f972859f"
 dependencies = [
  "fnv",
  "ident_case",
@@ -1882,9 +1874,9 @@ dependencies = [

 [[package]]
 name = "darling_macro"
-version = "0.14.1"
+version = "0.14.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ddfc69c5bfcbd2fc09a0f38451d2daf0e372e367986a83906d1b0dbc88134fb5"
+checksum = "7618812407e9402654622dd402b0a89dff9ba93badd6540781526117b92aab7e"
 dependencies = [
  "darling_core",
  "quote",
@@ -1898,7 +1890,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "907076dfda823b0b36d2a1bb5f90c96660a5bbcd7729e10727f07858f22c4edc"
 dependencies = [
  "cfg-if 1.0.0",
- "hashbrown",
+ "hashbrown 0.12.3",
  "lock_api",
  "once_cell",
  "parking_lot_core",
@@ -1906,90 +1898,145 @@ dependencies = [

 [[package]]
 name = "datafusion"
-version = "7.0.0"
-source = "git+https://github.com/apache/arrow-datafusion.git?branch=arrow2#744b2626081db95a254fc882820fc7812f95aa51"
+version = "14.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e7a8411475928479fe57af18698626f0a44f3c29153e051dce45f7455c08a6d5"
 dependencies = [
- "ahash 0.7.6",
- "arrow2",
+ "ahash 0.8.2",
+ "arrow",
+ "async-compression",
  "async-trait",
+ "bytes",
+ "bzip2",
  "chrono",
- "comfy-table 5.0.1",
- "datafusion-common 7.0.0",
+ "datafusion-common",
  "datafusion-expr",
+ "datafusion-optimizer",
  "datafusion-physical-expr",
+ "datafusion-row",
+ "datafusion-sql",
+ "flate2",
  "futures",
- "hashbrown",
+ "glob",
+ "hashbrown 0.12.3",
+ "itertools",
  "lazy_static",
  "log",
  "num_cpus",
- "ordered-float 2.10.0",
+ "object_store",
+ "ordered-float 3.4.0",
  "parking_lot",
- "parquet2",
+ "parquet",
  "paste",
+ "percent-encoding",
  "pin-project-lite",
  "rand 0.8.5",
  "smallvec",
- "sqlparser 0.15.0",
+ "sqlparser",
  "tempfile",
  "tokio",
  "tokio-stream",
+ "tokio-util",
+ "url",
+ "uuid",
 ]

 [[package]]
 name = "datafusion-common"
-version = "7.0.0"
-source = "git+https://github.com/apache/arrow-datafusion.git?branch=arrow2#744b2626081db95a254fc882820fc7812f95aa51"
+version = "14.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "15f1ffcbc1f040c9ab99f41db1c743d95aff267bb2e7286aaa010738b7402251"
 dependencies = [
- "arrow2",
- "ordered-float 2.10.0",
- "parquet2",
- "sqlparser 0.15.0",
+ "arrow",
+ "chrono",
+ "object_store",
+ "ordered-float 3.4.0",
+ "parquet",
+ "sqlparser",
 ]

 [[package]]
-name = "datafusion-common"
+name = "datafusion-expr"
 version = "14.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "15f1ffcbc1f040c9ab99f41db1c743d95aff267bb2e7286aaa010738b7402251"
+checksum = "1883d9590d303ef38fa295567e7fdb9f8f5f511fcc167412d232844678cd295c"
 dependencies = [
- "arrow 26.0.0",
- "chrono",
- "ordered-float 3.1.0",
- "sqlparser 0.26.0",
+ "ahash 0.8.2",
+ "arrow",
+ "datafusion-common",
+ "log",
+ "sqlparser",
 ]

 [[package]]
-name = "datafusion-expr"
-version = "7.0.0"
-source = "git+https://github.com/apache/arrow-datafusion.git?branch=arrow2#744b2626081db95a254fc882820fc7812f95aa51"
+name = "datafusion-optimizer"
+version = "14.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2127d46d566ab3463d70da9675fc07b9d634be8d17e80d0e1ce79600709fe651"
 dependencies = [
- "ahash 0.7.6",
- "arrow2",
- "datafusion-common 7.0.0",
- "sqlparser 0.15.0",
+ "arrow",
+ "async-trait",
+ "chrono",
+ "datafusion-common",
+ "datafusion-expr",
+ "datafusion-physical-expr",
+ "hashbrown 0.12.3",
+ "log",
 ]

 [[package]]
 name = "datafusion-physical-expr"
-version = "7.0.0"
-source = "git+https://github.com/apache/arrow-datafusion.git?branch=arrow2#744b2626081db95a254fc882820fc7812f95aa51"
+version = "14.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0d108b6fe8eeb317ecad1d74619e8758de49cccc8c771b56c97962fd52eaae23"
 dependencies = [
- "ahash 0.7.6",
- "arrow2",
+ "ahash 0.8.2",
+ "arrow",
+ "arrow-buffer",
+ "arrow-schema",
  "blake2",
  "blake3",
  "chrono",
- "datafusion-common 7.0.0",
+ "datafusion-common",
  "datafusion-expr",
- "hashbrown",
+ "datafusion-row",
+ "half 2.1.0",
+ "hashbrown 0.12.3",
+ "itertools",
  "lazy_static",
  "md-5",
- "ordered-float 2.10.0",
+ "num-traits",
+ "ordered-float 3.4.0",
  "paste",
  "rand 0.8.5",
  "regex",
  "sha2",
  "unicode-segmentation",
+ "uuid",
+]
+
+[[package]]
+name = "datafusion-row"
+version = "14.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "43537b6377d506e4788bf21e9ed943340e076b48ca4d077e6ea4405ca5e54a1c"
+dependencies = [
+ "arrow",
+ "datafusion-common",
+ "paste",
+ "rand 0.8.5",
+]
+
+[[package]]
+name = "datafusion-sql"
+version = "14.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "244d08d4710e1088d9c0949c9b5b8d68d9cf2cde7203134a4cc389e870fe2354"
+dependencies = [
+ "arrow",
+ "datafusion-common",
+ "datafusion-expr",
+ "sqlparser",
 ]

 [[package]]
@@ -1998,7 +2045,7 @@ version = "0.1.0"
 dependencies = [
  "api",
  "async-trait",
- "axum 0.6.1",
+ "axum",
  "axum-macros",
  "axum-test-helper",
  "backon",
@@ -2015,7 +2062,7 @@ dependencies = [
  "common-telemetry",
  "common-time",
  "datafusion",
- "datafusion-common 7.0.0",
+ "datafusion-common",
  "datatypes",
  "futures",
  "hyper",
@@ -2049,34 +2096,16 @@ dependencies = [
 name = "datatypes"
 version = "0.1.0"
 dependencies = [
- "arrow2",
- "common-base",
- "common-error",
- "common-time",
- "datafusion-common 7.0.0",
- "enum_dispatch",
- "num",
- "num-traits",
- "ordered-float 3.1.0",
- "paste",
- "serde",
- "serde_json",
- "snafu",
-]
-
-[[package]]
-name = "datatypes2"
-version = "0.1.0"
-dependencies = [
- "arrow 26.0.0",
+ "arrow",
+ "arrow-schema",
  "common-base",
  "common-error",
  "common-time",
- "datafusion-common 14.0.0",
+ "datafusion-common",
  "enum_dispatch",
  "num",
  "num-traits",
- "ordered-float 3.1.0",
+ "ordered-float 3.4.0",
  "paste",
  "serde",
  "serde_json",
@@ -2133,9 +2162,9 @@ checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8"

 [[package]]
 name = "digest"
-version = "0.10.5"
+version = "0.10.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "adfbc57365a37acbd2ebf2b64d7e69bb766e2fea813521ed536f5d0520dcf86c"
+checksum = "8168378f4e5023e7218c89c891c0fd8ecdb5e5e4f18cb78f38cf245dd021e76f"
 dependencies = [
  "block-buffer",
  "crypto-common",
@@ -2345,7 +2374,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b1259da3b15ec7e54bd7203adb2c4335adb9ca1d47b56220d650e52c247e824a"
 dependencies = [
  "http",
- "prost 0.11.0",
+ "prost 0.11.3",
  "tokio",
  "tokio-stream",
  "tonic",
@@ -2372,12 +2401,6 @@ version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7"

-[[package]]
-name = "fallible-streaming-iterator"
-version = "0.1.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
-
 [[package]]
 name = "fastrand"
 version = "1.8.0"
@@ -2389,13 +2412,13 @@ dependencies = [

 [[package]]
 name = "fd-lock"
-version = "3.0.6"
+version = "3.0.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e11dcc7e4d79a8c89b9ab4c6f5c30b1fc4a83c420792da3542fd31179ed5f517"
+checksum = "bb21c69b9fea5e15dbc1049e4b77145dd0ba1c84019c488102de0dc4ea4b0a27"
 dependencies = [
  "cfg-if 1.0.0",
  "rustix",
- "windows-sys 0.36.1",
+ "windows-sys 0.42.0",
 ]

 [[package]]
@@ -2422,17 +2445,6 @@ version = "0.4.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cda653ca797810c02f7ca4b804b40b8b95ae046eb989d356bce17919a8c25499"

-[[package]]
-name = "flatbuffers"
-version = "2.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ea97b4fe4b84e2f2765449bcea21cbdb3ee28cecb88afbf38a0c2e1639f5eb5"
-dependencies = [
- "bitflags",
- "smallvec",
- "thiserror",
-]
-
 [[package]]
 name = "flatbuffers"
 version = "22.9.29"
@@ -2445,13 +2457,13 @@ dependencies = [

 [[package]]
 name = "flate2"
-version = "1.0.24"
+version = "1.0.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f82b0f4c27ad9f8bfd1f3208d882da2b09c301bc1c828fd3a00d0216d2fbbff6"
+checksum = "a8a2db397cb1c8772f31494cb8917e48cd1e64f0fa7efac59fbd741a0a8ce841"
 dependencies = [
  "crc32fast",
  "libz-sys",
- "miniz_oxide",
+ "miniz_oxide 0.6.2",
 ]

 [[package]]
@@ -2500,7 +2512,7 @@ dependencies = [
  "common-telemetry",
  "common-time",
  "datafusion",
- "datafusion-common 7.0.0",
+ "datafusion-common",
  "datafusion-expr",
  "datanode",
  "datatypes",
@@ -2511,7 +2523,7 @@ dependencies = [
  "meta-srv",
  "moka",
  "openmetrics-parser",
- "prost 0.11.0",
+ "prost 0.11.3",
  "query",
  "rustls",
  "serde",
@@ -2520,7 +2532,6 @@ dependencies = [
  "session",
  "snafu",
  "sql",
- "sqlparser 0.15.0",
  "store-api",
  "substrait 0.1.0",
  "table",
@@ -2532,9 +2543,9 @@ dependencies = [

 [[package]]
 name = "frunk"
-version = "0.4.0"
+version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0cd67cf7d54b7e72d0ea76f3985c3747d74aee43e0218ad993b7903ba7a5395e"
+checksum = "a89c703bf50009f383a0873845357cc400a95fc535f836feddfe015d7df6e1e0"
 dependencies = [
  "frunk_core",
  "frunk_derives",
@@ -2543,15 +2554,15 @@ dependencies = [

 [[package]]
 name = "frunk_core"
-version = "0.4.0"
+version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1246cf43ec80bf8b2505b5c360b8fb999c97dabd17dbb604d85558d5cbc25482"
+checksum = "2a446d01a558301dca28ef43222864a9fa2bd9a2e71370f769d5d5d5ec9f3537"

 [[package]]
 name = "frunk_derives"
-version = "0.4.0"
+version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3dbc4f084ec5a3f031d24ccedeb87ab2c3189a2f33b8d070889073837d5ea09e"
+checksum = "b83164912bb4c97cfe0772913c7af7387ee2e00cb6d4636fb65a35b3d0c8f173"
 dependencies = [
  "frunk_proc_macro_helpers",
  "quote",
@@ -2560,9 +2571,9 @@ dependencies = [

 [[package]]
 name = "frunk_proc_macro_helpers"
-version = "0.1.0"
+version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "99f11257f106c6753f5ffcb8e601fb39c390a088017aaa55b70c526bff15f63e"
+checksum = "015425591bbeb0f5b8a75593340f1789af428e9f887a4f1e36c0c471f067ef50"
 dependencies = [
  "frunk_core",
  "proc-macro2",
@@ -2572,9 +2583,9 @@ dependencies = [

 [[package]]
 name = "frunk_proc_macros"
-version = "0.1.0"
+version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a078bd8459eccbb85e0b007b8f756585762a72a9efc53f359b371c3b6351dbcc"
+checksum = "ea01524f285deab48affffb342b97f186e657b119c3f1821ac531780e0fbfae0"
 dependencies = [
  "frunk_core",
  "frunk_proc_macros_impl",
@@ -2583,9 +2594,9 @@ dependencies = [

 [[package]]
 name = "frunk_proc_macros_impl"
-version = "0.1.0"
+version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ffba99f0fa4f57e42f57388fbb9a0ca863bc2b4261f3c5570fed579d5df6c32"
+checksum = "0a802d974cc18ee7fe1a7868fc9ce31086294fd96ba62f8da64ecb44e92a2653"
 dependencies = [
  "frunk_core",
  "frunk_proc_macro_helpers",
@@ -2608,9 +2619,9 @@ checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c"

 [[package]]
 name = "futures"
-version = "0.3.24"
+version = "0.3.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7f21eda599937fba36daeb58a22e8f5cee2d14c4a17b5b7739c7c8e5e3b8230c"
+checksum = "38390104763dc37a5145a53c29c63c1290b5d316d6086ec32c293f6736051bb0"
 dependencies = [
  "futures-channel",
  "futures-core",
@@ -2623,9 +2634,9 @@ dependencies = [

 [[package]]
 name = "futures-channel"
-version = "0.3.24"
+version = "0.3.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "30bdd20c28fadd505d0fd6712cdfcb0d4b5648baf45faef7f852afb2399bb050"
+checksum = "52ba265a92256105f45b719605a571ffe2d1f0fea3807304b522c1d778f79eed"
 dependencies = [
  "futures-core",
  "futures-sink",
@@ -2633,15 +2644,15 @@ dependencies = [

 [[package]]
 name = "futures-core"
-version = "0.3.24"
+version = "0.3.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4e5aa3de05362c3fb88de6531e6296e85cde7739cccad4b9dfeeb7f6ebce56bf"
+checksum = "04909a7a7e4633ae6c4a9ab280aeb86da1236243a77b694a49eacd659a4bd3ac"

 [[package]]
 name = "futures-executor"
-version = "0.3.24"
+version = "0.3.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ff63c23854bee61b6e9cd331d523909f238fc7636290b96826e9cfa5faa00ab"
+checksum = "7acc85df6714c176ab5edf386123fafe217be88c0840ec11f199441134a074e2"
 dependencies = [
  "futures-core",
  "futures-task",
@@ -2650,9 +2661,9 @@ dependencies = [

 [[package]]
 name = "futures-io"
-version = "0.3.24"
+version = "0.3.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bbf4d2a7a308fd4578637c0b17c7e1c7ba127b8f6ba00b29f717e9655d85eb68"
+checksum = "00f5fb52a06bdcadeb54e8d3671f8888a39697dcb0b81b23b55174030427f4eb"

 [[package]]
 name = "futures-lite"
@@ -2671,9 +2682,9 @@ dependencies = [

 [[package]]
 name = "futures-macro"
-version = "0.3.24"
+version = "0.3.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "42cd15d1c7456c04dbdf7e88bcd69760d74f3a798d6444e16974b505b0e62f17"
+checksum = "bdfb8ce053d86b91919aad980c220b1fb8401a9394410e1c289ed7e66b61835d"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -2682,21 +2693,21 @@ dependencies = [

 [[package]]
 name = "futures-sink"
-version = "0.3.24"
+version = "0.3.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "21b20ba5a92e727ba30e72834706623d94ac93a725410b6a6b6fbc1b07f7ba56"
+checksum = "39c15cf1a4aa79df40f1bb462fb39676d0ad9e366c2a33b590d7c66f4f81fcf9"

 [[package]]
 name = "futures-task"
-version = "0.3.24"
+version = "0.3.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a6508c467c73851293f390476d4491cf4d227dbabcd4170f3bb6044959b294f1"
+checksum = "2ffb393ac5d9a6eaa9d3fdf37ae2776656b706e200c8e16b1bdb227f5198e6ea"

 [[package]]
 name = "futures-util"
-version = "0.3.24"
+version = "0.3.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "44fb6cb1be61cc1d2e43b262516aafcf63b241cffdb1d3fa115f91d9c7b09c90"
+checksum = "197676987abd2f9cadff84926f410af1c183608d36641465df73ae8211dc65d6"
 dependencies = [
  "futures-channel",
  "futures-core",
@@ -2752,9 +2763,9 @@ dependencies = [

 [[package]]
 name = "getrandom"
-version = "0.2.7"
+version = "0.2.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4eb1a864a501629691edf6c15a593b7a51eebaa1e8468e9ddc623de7c9b58ec6"
+checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31"
 dependencies = [
  "cfg-if 1.0.0",
  "js-sys",
@@ -2789,9 +2800,9 @@ checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574"

 [[package]]
 name = "h2"
-version = "0.3.14"
+version = "0.3.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5ca32592cf21ac7ccab1825cd87f6c9b3d9022c44d086172ed0966bec8af30be"
+checksum = "5f9f29bc9dda355256b2916cf526ab02ce0aeaaaf2bad60d65ef3f12f11dd0f4"
 dependencies = [
  "bytes",
  "fnv",
@@ -2823,10 +2834,13 @@ dependencies = [
 ]

 [[package]]
-name = "hash_hasher"
-version = "2.0.3"
+name = "hashbrown"
+version = "0.11.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "74721d007512d0cb3338cd20f0654ac913920061a4c4d0d8708edb3f2a698c0c"
+checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e"
+dependencies = [
+ "ahash 0.7.6",
+]

 [[package]]
 name = "hashbrown"
@@ -2874,6 +2888,15 @@ dependencies = [
  "libc",
 ]

+[[package]]
+name = "hermit-abi"
+version = "0.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "hex"
 version = "0.4.3"
@@ -2903,7 +2926,7 @@ checksum = "75f43d41e26995c17e71ee126451dd3941010b0514a81a9d11f3b341debc2399"
 dependencies = [
  "bytes",
  "fnv",
- "itoa 1.0.3",
+ "itoa 1.0.4",
 ]

 [[package]]
@@ -2953,9 +2976,9 @@ dependencies = [

 [[package]]
 name = "hyper"
-version = "0.14.20"
+version = "0.14.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "02c929dc5c39e335a03c405292728118860721b10190d98c2a0f0efd5baafbac"
+checksum = "034711faac9d2166cb1baf1a2fb0b60b1f277f8492fd72176c17f3515e1abd3c"
 dependencies = [
  "bytes",
  "futures-channel",
@@ -2966,7 +2989,7 @@ dependencies = [
  "http-body",
  "httparse",
  "httpdate",
- "itoa 1.0.3",
+ "itoa 1.0.4",
  "pin-project-lite",
  "socket2",
  "tokio",
@@ -2977,9 +3000,9 @@ dependencies = [

 [[package]]
 name = "hyper-rustls"
-version = "0.23.0"
+version = "0.23.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d87c48c02e0dc5e3b849a2041db3029fd066650f8f717c07bf8ed78ccb895cac"
+checksum = "59df7c4e19c950e6e0e868dcc0a300b09a9b88e9ec55bd879ca819087a77355d"
 dependencies = [
  "http",
  "hyper",
@@ -3002,17 +3025,28 @@ dependencies = [

 [[package]]
 name = "iana-time-zone"
-version = "0.1.49"
+version = "0.1.53"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3bbaead50122b06e9a973ac20bc7445074d99ad9a0a0654934876908a9cec82c"
+checksum = "64c122667b287044802d6ce17ee2ddf13207ed924c712de9a66a5814d5b64765"
 dependencies = [
  "android_system_properties",
  "core-foundation-sys",
+ "iana-time-zone-haiku",
  "js-sys",
  "wasm-bindgen",
  "winapi",
 ]

+[[package]]
+name = "iana-time-zone-haiku"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0703ae284fc167426161c2e3f1da3ea71d94b21bedbcc9494e92b28e334e3dca"
+dependencies = [
+ "cxx",
+ "cxx-build",
+]
+
 [[package]]
 name = "ident_case"
 version = "1.0.1"
@@ -3031,23 +3065,24 @@ dependencies = [

 [[package]]
 name = "indexmap"
-version = "1.9.1"
+version = "1.9.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e"
+checksum = "1885e79c1fc4b10f0e172c475f458b7f7b93061064d98c3293e98c5ba0c8b399"
 dependencies = [
  "autocfg",
- "hashbrown",
+ "hashbrown 0.12.3",
  "serde",
 ]

 [[package]]
 name = "indicatif"
-version = "0.17.1"
+version = "0.17.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bfddc9561e8baf264e0e45e197fd7696320026eb10a8180340debc27b18f535b"
+checksum = "4295cbb7573c16d310e99e713cf9e75101eb190ab31fccd35f2d2691b4352b19"
 dependencies = [
  "console",
  "number_prefix",
+ "portable-atomic",
  "unicode-width",
 ]
@@ -3071,12 +3106,6 @@ dependencies = [
  "cfg-if 1.0.0",
 ]

-[[package]]
-name = "integer-encoding"
-version = "1.1.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "48dc51180a9b377fd75814d0cc02199c20f8e99433d6762f650d39cdbbd3b56f"
-
 [[package]]
 name = "integer-encoding"
 version = "3.0.4"
@@ -3089,15 +3118,19 @@ dependencies = [

 [[package]]
 name = "io-lifetimes"
-version = "0.7.3"
+version = "1.0.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1ea37f355c05dde75b84bba2d767906ad522e97cd9e2eef2be7a4ab7fb442c06"
+checksum = "46112a93252b123d31a119a8d1a1ac19deac4fac6e0e8b0df58f0d4e5870e63c"
+dependencies = [
+ "libc",
+ "windows-sys 0.42.0",
+]

 [[package]]
 name = "ipnet"
-version = "2.5.0"
+version = "2.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "879d54834c8c76457ef4293a689b2a8c59b076067ad77b15efafbb05f92a592b"
+checksum = "f88c5561171189e69df9d98bcf18fd5f9558300f7ea7b801eb8a0fd748bd8745"

 [[package]]
 name = "iri-string"
@@ -3121,6 +3154,18 @@ dependencies = [
  "syn",
 ]

+[[package]]
+name = "is-terminal"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "927609f78c2913a6f6ac3c27a4fe87f43e2a35367c0c4b0f8265e8f49a104330"
+dependencies = [
+ "hermit-abi 0.2.6",
+ "io-lifetimes",
+ "rustix",
+ "windows-sys 0.42.0",
+]
+
 [[package]]
 name = "itertools"
 version = "0.10.5"
@@ -3138,15 +3183,15 @@ checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"

 [[package]]
 name = "itoa"
-version = "1.0.3"
+version = "1.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6c8af84674fe1f223a982c933a0ee1086ac4d4052aa0fb8060c12c6ad838e754"
+checksum = "4217ad341ebadf8d8e724e264f13e593e0648f5b3e94b3896a5df283be015ecc"

 [[package]]
 name = "jobserver"
-version = "0.1.24"
+version = "0.1.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "af25a77299a7f711a01975c35a6a424eb6862092cc2d6c72c4ed6cbc56dfc1fa"
+checksum = "068b1ee6743e4d11fb9c6a1e6064b3693a1b600e7f5f5988047d98b3dc9fb90b"
 dependencies = [
  "libc",
 ]
@@ -3162,9 +3207,9 @@ dependencies = [

 [[package]]
 name = "jsonwebtoken"
-version = "8.1.1"
+version = "8.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1aa4b4af834c6cfd35d8763d359661b90f2e45d8f750a0849156c7f4671af09c"
+checksum = "09f4f04699947111ec1733e71778d763555737579e44b85844cae8e1940a1828"
 dependencies = [
  "base64",
  "pem",
@@ -3302,15 +3347,15 @@ dependencies = [

 [[package]]
 name = "libc"
-version = "0.2.133"
+version = "0.2.138"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c0f80d65747a3e43d1596c7c5492d95d5edddaabd45a7fcdb02b95f644164966"
+checksum = "db6d7e329c562c5dfab7a46a2afabc8b987ab9a4834c9d1ca04dc54c1546cef8"

 [[package]]
 name = "libloading"
-version = "0.7.3"
+version = "0.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "efbc0f03f9a775e9f6aed295c6a1ba2253c5757a9e03d55c6caa46a681abcddd"
+checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f"
 dependencies = [
  "cfg-if 1.0.0",
  "winapi",
@@ -3318,9 +3363,9 @@ dependencies = [

 [[package]]
 name = "libm"
-version = "0.2.5"
+version = "0.2.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "292a948cd991e376cf75541fe5b97a1081d713c618b4f1b9500f8844e49eb565"
+checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb"

 [[package]]
 name = "libz-sys"
@@ -3333,11 +3378,20 @@ dependencies = [
  "vcpkg",
 ]

+[[package]]
+name = "link-cplusplus"
+version = "1.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9272ab7b96c9046fbc5bc56c06c117cb639fe2d509df0c421cad82d2915cf369"
+dependencies = [
+ "cc",
+]
+
 [[package]]
 name = "linux-raw-sys"
-version = "0.0.46"
+version = "0.1.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d4d2456c373231a208ad294c33dc5bff30051eafd954cd4caae83a712b12854d"
+checksum = "8f9f08d8963a6c613f4b1a78f4f4a4dbfadf8e6545b2d72861731e4858b8b47f"

 [[package]]
 name = "lock_api"
@@ -3443,7 +3497,7 @@ version = "0.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b6e8aaa3f231bb4bd57b84b2d5dc3ae7f350265df8aa96492e0bc394a1571909"
 dependencies = [
- "hashbrown",
+ "hashbrown 0.12.3",
 ]

 [[package]]
@@ -3481,7 +3535,7 @@ version = "1.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b238e3235c8382b7653c6408ed1b08dd379bdb9fdf990fb0bbae3db2cc0ae963"
 dependencies = [
- "nix 0.23.1",
+ "nix 0.23.2",
  "winapi",
 ]
@@ -3515,12 +3569,6 @@ version = "0.1.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a3e378b66a060d48947b590737b30a1be76706c8dd7b8ba0f2fe3989c68a853f"

-[[package]]
-name = "matchit"
-version = "0.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "73cbba799671b762df5a175adf59ce145165747bb891505c43d09aefbbf38beb"
-
 [[package]]
 name = "matchit"
 version = "0.7.0"
@@ -3575,6 +3623,15 @@ dependencies = [
  "autocfg",
 ]

+[[package]]
+name = "memoffset"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4"
+dependencies = [
+ "autocfg",
+]
+
 [[package]]
 name = "meta-client"
 version = "0.1.0"
@@ -3618,7 +3675,7 @@ dependencies = [
  "http-body",
  "lazy_static",
  "parking_lot",
- "prost 0.11.0",
+ "prost 0.11.3",
  "regex",
  "serde",
  "serde_json",
@@ -3677,7 +3734,7 @@ checksum = "f7d24dc2dbae22bff6f1f9326ffce828c9f07ef9cc1e8002e5279f845432a30a"
 dependencies = [
  "crossbeam-epoch",
  "crossbeam-utils",
- "hashbrown",
+ "hashbrown 0.12.3",
  "metrics",
  "num_cpus",
  "parking_lot",
@@ -3717,16 +3774,25 @@ dependencies = [
  "adler",
 ]

+[[package]]
+name = "miniz_oxide"
+version = "0.6.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b275950c28b37e794e8c55d88aeb5e139d0ce23fdbbeda68f8d7174abdf9e8fa"
+dependencies = [
+ "adler",
+]
+
 [[package]]
 name = "mio"
-version = "0.8.4"
+version = "0.8.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "57ee1c23c7c63b0c9250c339ffdc69255f110b298b901b9f6c82547b7b87caaf"
+checksum = "e5d732bc30207a6423068df043e3d02e0735b155ad7ce1a6f76fe2baa5b158de"
 dependencies = [
  "libc",
  "log",
  "wasi 0.11.0+wasi-snapshot-preview1",
- "windows-sys 0.36.1",
+ "windows-sys 0.42.0",
 ]

 [[package]]
@@ -3744,7 +3810,7 @@ dependencies = [
  "common-telemetry",
  "common-time",
  "datafusion",
- "datafusion-common 7.0.0",
+ "datafusion-common",
  "datatypes",
  "futures",
  "log-store",
@@ -3822,9 +3888,9 @@ dependencies = [

 [[package]]
 name = "mysql_async"
-version = "0.31.0"
+version = "0.31.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e8fbd756177cfa8248baa7c5f555b9446349822bb94810c22336ec7597a72652"
+checksum = "52d8156a1f6a19224593c556c8aac642cf8070abd53d563405da92879dcf341b"
 dependencies = [
  "bytes",
  "crossbeam",
@@ -3840,6 +3906,7 @@ dependencies = [
  "pem",
  "percent-encoding",
  "pin-project",
+ "priority-queue",
  "rustls",
  "rustls-pemfile",
  "serde",
@@ -3889,7 +3956,7 @@ dependencies = [
  "smallvec",
  "subprocess",
  "thiserror",
- "time 0.3.14",
+ "time 0.3.17",
  "uuid",
 ]
@@ -3939,27 +4006,27 @@ dependencies = [

 [[package]]
 name = "nix"
-version = "0.23.1"
+version = "0.23.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9f866317acbd3a240710c63f065ffb1e4fd466259045ccb504130b7f668f35c6"
+checksum = "8f3790c00a0150112de0f4cd161e3d7fc4b2d8a5542ffc35f099a2562aecb35c"
 dependencies = [
  "bitflags",
  "cc",
  "cfg-if 1.0.0",
  "libc",
- "memoffset",
+ "memoffset 0.6.5",
 ]

 [[package]]
 name = "nix"
-version = "0.24.2"
+version = "0.24.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "195cdbc1741b8134346d515b3a56a1c94b0912758009cfd53f99ea0f57b065fc"
+checksum = "fa52e972a9a719cecb6864fb88568781eb706bac2cd1d4f04a648542dbf78069"
 dependencies = [
  "bitflags",
  "cfg-if 1.0.0",
  "libc",
- "memoffset",
+ "memoffset 0.6.5",
 ]

 [[package]]
@@ -3972,6 +4039,16 @@ dependencies = [
  "minimal-lexical",
 ]

+[[package]]
+name = "nu-ansi-term"
+version = "0.46.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
+dependencies = [
+ "overload",
+ "winapi",
+]
+
 [[package]]
 name = "num"
 version = "0.4.0"
@@ -4053,11 +4130,11 @@ dependencies = [

 [[package]]
 name = "num_cpus"
-version = "1.13.1"
+version = "1.14.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1"
+checksum = "f6058e64324c71e02bc2b150e4f3bc8286db6c83092132ffa3f6b1eab0f9def5"
 dependencies = [
- "hermit-abi",
+ "hermit-abi 0.1.19",
  "libc",
 ]
@@ -4076,21 +4153,12 @@ version = "0.5.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3b0498641e53dd6ac1a4f22547548caa6864cc4933784319cd1775271c5a46ce"
 dependencies = [
- "proc-macro-crate",
+ "proc-macro-crate 1.2.1",
  "proc-macro2",
  "quote",
  "syn",
 ]

-[[package]]
-name = "num_threads"
-version = "0.1.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44"
-dependencies = [
- "libc",
-]
-
 [[package]]
 name = "number_prefix"
 version = "0.4.0"
@@ -4119,6 +4187,26 @@ dependencies = [
  "uuid",
 ]

+[[package]]
+name = "object_store"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0014545954c5023b5fb8260415e54467cde434db6c824c9028a4b329f1b28e48"
+dependencies = [
+ "async-trait",
+ "bytes",
+ "chrono",
+ "futures",
+ "itertools",
+ "parking_lot",
+ "percent-encoding",
+ "snafu",
+ "tokio",
+ "tracing",
+ "url",
+ "walkdir",
+]
+
 [[package]]
 name = "once_cell"
 version = "1.16.0"
@@ -4159,7 +4247,7 @@ dependencies = [
  "reqwest",
  "serde",
  "serde_json",
- "time 0.3.14",
+ "time 0.3.17",
  "tokio",
  "tracing",
  "ureq",
@@ -4261,18 +4349,9 @@ dependencies = [

 [[package]]
 name = "ordered-float"
-version = "2.10.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7940cf2ca942593318d07fcf2596cdca60a85c9e7fab408a5e21a4f9dcd40d87"
-dependencies = [
- "num-traits",
-]
-
-[[package]]
-name = "ordered-float"
-version = "3.1.0"
+version = "3.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "98ffdb14730ed2ef599c65810c15b000896e21e8776b512de0db0c3d7335cc2a"
+checksum = "d84eb1409416d254e4a9c8fa56cc24701755025b458f0fcd8e59e1f5f40c23bf"
 dependencies = [
  "num-traits",
  "serde",
@@ -4285,14 +4364,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ccd746e37177e1711c20dd619a1620f34f5c8b569c53590a72dedd5344d8924a"
 dependencies = [
  "dlv-list",
- "hashbrown",
+ "hashbrown 0.12.3",
 ]

 [[package]]
 name = "os_str_bytes"
-version = "6.3.0"
+version = "6.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ff7415e9ae3fff1225851df9e0d9e4e5479f947619774677a63572e55e80eff"
+checksum = "9b7820b9daea5457c9f21c69448905d723fbd21136ccf521748f23fd49e723ee"
+
+[[package]]
+name = "overload"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"

 [[package]]
 name = "packedvec"
@@ -4332,9 +4417,9 @@ dependencies = [

 [[package]]
 name = "parking_lot_core"
-version = "0.9.3"
+version = "0.9.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "09a279cbf25cb0757810394fbc1e359949b59e348145c643a939a525692e6929"
+checksum = "7ff9f3fef3968a3ec5945535ed654cb38ff72d7495a25619e2247fb15a2ed9ba"
 dependencies = [
  "backtrace",
  "cfg-if 1.0.0",
@@ -4343,40 +4428,34 @@ dependencies = [
  "redox_syscall 0.2.16",
  "smallvec",
  "thread-id",
- "windows-sys 0.36.1",
+ "windows-sys 0.42.0",
 ]

 [[package]]
 name = "parquet"
-version = "10.0.0"
+version = "26.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "53e9c8fc20af9b92d85d42ec86e5217b2eaf1340fbba75c4b4296de764ea7921"
+checksum = "3bf8fa7ab6572791325a8595f55dc532dde88b996ae10a5ca8a2db746784ecc4"
 dependencies = [
- "arrow 10.0.0",
+ "ahash 0.8.2",
+ "arrow",
  "base64",
  "brotli",
- "byteorder",
+ "bytes",
  "chrono",
  "flate2",
+ "futures",
+ "hashbrown 0.12.3",
  "lz4",
  "num",
  "num-bigint",
- "parquet-format",
- "rand 0.8.5",
+ "seq-macro",
  "snap",
- "thrift 0.13.0",
+ "thrift 0.16.0",
+ "tokio",
  "zstd",
 ]

-[[package]]
-name = "parquet-format"
-version = "4.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1f0c06cdcd5460967c485f9c40a821746f5955ad81990533c7fae95dbd9bc0b5"
-dependencies = [
- "thrift 0.13.0",
-]
-
 [[package]]
 name = "parquet-format-async-temp"
 version = "0.2.0"
@@ -4386,28 +4465,10 @@ dependencies = [
  "async-trait",
  "byteorder",
  "futures",
- "integer-encoding 3.0.4",
+ "integer-encoding",
  "ordered-float 1.1.1",
 ]

-[[package]]
-name = "parquet2"
-version = "0.10.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6b085f9e78e4842865151b693f6d94bdf7b280af66daa6e3587adeb3106a07e9"
-dependencies = [
- "async-stream",
- "bitpacking",
- "brotli",
- "flate2",
- "futures",
- "lz4",
- "parquet-format-async-temp",
- "snap",
- "streaming-decompression",
- "zstd",
-]
-
 [[package]]
 name = "parse-zoneinfo"
 version = "0.3.0"
@@ -4446,9 +4507,9 @@ checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e"

 [[package]]
 name = "pest"
-version = "2.4.0"
+version = "2.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dbc7bc69c062e492337d74d59b120c274fd3d261b6bf6d3207d499b4b379c41a"
+checksum = "cc8bed3549e0f9b0a2a78bf7c0018237a2cdf085eecbbc048e52612438e4e9d0"
 dependencies = [
  "thiserror",
  "ucd-trie",
@@ -4456,9 +4517,9 @@ dependencies = [

 [[package]]
 name = "pest_derive"
-version = "2.4.0"
+version = "2.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "60b75706b9642ebcb34dab3bc7750f811609a0eb1dd8b88c2d15bf628c1c65b2"
+checksum = "cdc078600d06ff90d4ed238f0119d84ab5d43dbaad278b0e33a8820293b32344"
 dependencies = [
  "pest",
  "pest_generator",
@@ -4466,9 +4527,9 @@ dependencies = [

 [[package]]
 name = "pest_generator"
-version = "2.4.0"
+version = "2.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f4f9272122f5979a6511a749af9db9bfc810393f63119970d7085fed1c4ea0db"
+checksum = "28a1af60b1c4148bb269006a750cff8e2ea36aff34d2d96cf7be0b14d1bed23c"
 dependencies = [
  "pest",
  "pest_meta",
@@ -4479,9 +4540,9 @@ dependencies = [

 [[package]]
 name = "pest_meta"
-version = "2.4.0"
+version = "2.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c8717927f9b79515e565a64fe46c38b8cd0427e64c40680b14a7365ab09ac8d"
+checksum = "fec8605d59fc2ae0c6c1aefc0c7c7a9769732017c0ce07f7a9cfffa7b4404f20"
 dependencies = [
  "once_cell",
  "pest",
@@ -4515,7 +4576,7 @@ dependencies = [
  "postgres-types",
  "rand 0.8.5",
  "thiserror",
- "time 0.3.14",
+ "time 0.3.17",
  "tokio",
  "tokio-rustls",
  "tokio-util",
@@ -4638,9 +4699,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"

 [[package]]
 name = "pkg-config"
-version = "0.3.25"
+version = "0.3.26"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1df8c4ec4b0627e53bdf214615ad287367e482558cf84b109250b37464dc03ae"
+checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160"

 [[package]]
 name = "planus"
@@ -4692,16 +4753,16 @@ dependencies = [

 [[package]]
 name = "polling"
-version = "2.4.0"
+version = "2.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ab4609a838d88b73d8238967b60dd115cc08d38e2bbaf51ee1e4b695f89122e2"
+checksum = "166ca89eb77fd403230b9c156612965a81e094ec6ec3aa13663d4c8b113fa748"
 dependencies = [
  "autocfg",
  "cfg-if 1.0.0",
  "libc",
  "log",
  "wepoll-ffi",
- "winapi",
+ "windows-sys 0.42.0",
 ]

 [[package]]
@@ -4741,9 +4802,9 @@ dependencies = [

 [[package]]
 name = "ppv-lite86"
-version = "0.2.16"
+version = "0.2.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872"
+checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"

 [[package]]
 name = "precomputed-hash"
@@ -4764,9 +4825,9 @@ dependencies = [

 [[package]]
 name = "prettyplease"
-version = "0.1.19"
+version = "0.1.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a49e86d2c26a24059894a3afa13fd17d063419b05dfb83f06d9c3566060c3f5a"
+checksum = "c142c0e46b57171fe0c528bee8c5b7569e80f0c17e377cd0e30ea57dbc11bb51"
 dependencies = [
  "proc-macro2",
  "syn",
@@ -4786,6 +4847,25 @@ dependencies = [
  "unicode-width",
 ]

+[[package]]
+name = "priority-queue"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d7685ca4cc0b3ad748c22ce6803e23b55b9206ef7715b965ebeaf41639238fdc"
+dependencies = [
+ "autocfg",
+ "indexmap",
+]
+
+[[package]]
+name = "proc-macro-crate"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d6ea3c4595b96363c13943497db34af4460fb474a95c43f4446ad341b8c9785"
+dependencies = [
+ "toml",
+]
+
 [[package]]
 name = "proc-macro-crate"
 version = "1.2.1"
@@ -4829,9 +4909,9 @@ checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5"

 [[package]]
 name = "proc-macro2"
-version = "1.0.43"
+version = "1.0.47"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0a2ca2c61bc9f3d74d2886294ab7b9853abd9c1ad903a3ac7815c58989bb7bab"
+checksum = "5ea3d908b0e36316caf9e9e2c4625cdde190a7e6f440d794667ed17a1855e725"
 dependencies = [
  "unicode-ident",
 ]
@@ -4869,12 +4949,12 @@ dependencies = [

 [[package]]
 name = "prost"
-version = "0.11.0"
+version = "0.11.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "399c3c31cdec40583bb68f0b18403400d01ec4289c383aa047560439952c4dd7"
+checksum = "c0b18e655c21ff5ac2084a5ad0611e827b3f92badf79f4910b5a5c58f4d87ff0"
 dependencies = [
  "bytes",
- "prost-derive 0.11.0",
+ "prost-derive 0.11.2",
 ]

 [[package]]
@@ -4899,9 +4979,9 @@ dependencies = [

 [[package]]
 name = "prost-build"
-version = "0.11.1"
+version = "0.11.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7f835c582e6bd972ba8347313300219fed5bfa52caf175298d860b61ff6069bb"
+checksum = "e330bf1316db56b12c2bcfa399e8edddd4821965ea25ddb2c134b610b1c1c604"
 dependencies = [
  "bytes",
  "heck 0.4.0",
@@ -4910,9 +4990,11 @@ dependencies = [
  "log",
  "multimap",
  "petgraph",
- "prost 0.11.0",
- "prost-types 0.11.1",
+ "prettyplease",
+ "prost 0.11.3",
+ "prost-types 0.11.2",
  "regex",
+ "syn",
  "tempfile",
  "which",
 ]
@@ -4932,9 +5014,9 @@ dependencies = [

 [[package]]
 name = "prost-derive"
-version = "0.11.0"
+version = "0.11.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7345d5f0e08c0536d7ac7229952590239e77abf0a0100a1b1d890add6ea96364"
+checksum = "164ae68b6587001ca506d3bf7f1000bfa248d0e1217b618108fba4ec1d0cc306"
 dependencies = [
  "anyhow",
  "itertools",
@@ -4955,12 +5037,32 @@ dependencies = [

 [[package]]
 name = "prost-types"
-version = "0.11.1"
+version = "0.11.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4dfaa718ad76a44b3415e6c4d53b17c8f99160dcb3a99b10470fce8ad43f6e3e"
+checksum = "747761bc3dc48f9a34553bf65605cf6cb6288ba219f3450b4275dbd81539551a"
 dependencies = [
  "bytes",
- "prost 0.11.0",
+ "prost 0.11.3",
+]
+
+[[package]]
+name = "ptr_meta"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0738ccf7ea06b608c10564b31debd4f5bc5e197fc8bfe088f68ae5ce81e7a4f1"
+dependencies = [
+ "ptr_meta_derive",
+]
+
+[[package]]
+name = "ptr_meta_derive"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "16b845dbfca988fa33db069c0e230574d15a3088f147a87b64c7589eb662c9ac"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
 ]

 [[package]]
@@ -5013,8 +5115,11 @@ dependencies = [
  "common-telemetry",
  "common-time",
  "datafusion",
- "datafusion-common 7.0.0",
+ "datafusion-common",
+ "datafusion-expr",
+ "datafusion-optimizer",
  "datafusion-physical-expr",
+ "datafusion-sql",
  "datatypes",
  "format_num",
  "futures",
@@ -5128,7 +5233,7 @@ version = "0.6.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
 dependencies = [
- "getrandom 0.2.7",
+ "getrandom 0.2.8",
 ]

 [[package]]
@@ -5158,11 +5263,10 @@ checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3"

 [[package]]
 name = "rayon"
-version = "1.5.3"
+version = "1.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bd99e5772ead8baa5215278c9b15bf92087709e9c1b2d1f97cdb5a183c933a7d"
+checksum = "1e060280438193c554f654141c9ea9417886713b7acd75974c85b18a69a88e0b"
 dependencies = [
- "autocfg",
  "crossbeam-deque",
  "either",
  "rayon-core",
@@ -5170,9 +5274,9 @@ dependencies = [

 [[package]]
 name = "rayon-core"
-version = "1.9.3"
+version = "1.10.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f"
+checksum = "cac410af5d00ab6884528b4ab69d1e8e146e8d471201800fa1b4524126de6ad3"
 dependencies = [
  "crossbeam-channel",
  "crossbeam-deque",
@@ -5221,16 +5325,16 @@ version = "0.4.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b"
 dependencies = [
- "getrandom 0.2.7",
+ "getrandom 0.2.8",
  "redox_syscall 0.2.16",
  "thiserror",
 ]

 [[package]]
 name = "regex"
-version = "1.6.0"
+version = "1.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c4eb3267174b8c6c2f654116623910a0fef09c4753f8dd83db29c48a0df988b"
+checksum = "e076559ef8e241f2ae3479e36f97bd5741c0330689e217ad51ce2c76808b868a"
 dependencies = [
  "aho-corasick",
  "memchr",
@@ -5248,9 +5352,9 @@ dependencies = [

 [[package]]
 name = "regex-syntax"
-version = "0.6.27"
+version = "0.6.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244"
+checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848"

 [[package]]
 name = "remove_dir_all"
@@ -5261,6 +5365,15 @@ dependencies = [
  "winapi",
 ]

+[[package]]
+name = "rend"
+version = "0.3.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "79af64b4b6362ffba04eef3a4e10829718a4896dac19daa741851c86781edf95"
+dependencies = [
+ "bytecheck",
+]
+
 [[package]]
 name = "reqsign"
 version = "0.6.9"
@@ -5286,15 +5399,15 @@ dependencies = [
  "serde_json",
  "sha1",
  "sha2",
- "time 0.3.14",
+ "time 0.3.17",
  "ureq",
 ]

 [[package]]
 name = "reqwest"
-version = "0.11.12"
+version = "0.11.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "431949c384f4e2ae07605ccaa56d1d9d2ecdb5cadd4f9577ccfab29f2e5149fc"
+checksum = "68cc60575865c7831548863cc02356512e3f1dc2f3f82cb837d7fc4cc8f3c97c"
 dependencies = [
  "base64",
  "bytes",
@@ -5368,6 +5481,31 @@ dependencies = [
  "winapi",
 ]

+[[package]]
+name = "rkyv"
+version = "0.7.39"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cec2b3485b07d96ddfd3134767b8a447b45ea4eb91448d0a35180ec0ffd5ed15"
+dependencies = [
+ "bytecheck",
+ "hashbrown 0.12.3",
+ "ptr_meta",
+ "rend",
+ "rkyv_derive",
+ "seahash",
+]
+
+[[package]]
+name = "rkyv_derive"
+version = "0.7.39"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6eaedadc88b53e36dd32d940ed21ae4d850d5916f2581526921f553a72ac34c4"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
 [[package]]
 name = "ron"
 version = "0.7.1"
@@ -5387,7 +5525,7 @@ checksum = "4b18820d944b33caa75a71378964ac46f58517c92b6ae5f762636247c09e78fb"
 dependencies = [
  "base64",
  "blake2b_simd",
- "constant_time_eq",
+ "constant_time_eq 0.1.5",
  "crossbeam-utils",
 ]
@@ -5403,13 +5541,20 @@ dependencies = [

 [[package]]
 name = "rust_decimal"
-version = "1.26.1"
+version = "1.27.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ee9164faf726e4f3ece4978b25ca877ddc6802fa77f38cdccb32c7f805ecd70c"
+checksum = "33c321ee4e17d2b7abe12b5d20c1231db708dd36185c8a21e9de5fed6da4dbe9"
 dependencies = [
  "arrayvec 0.7.2",
+ "borsh",
+ "bytecheck",
+ "byteorder",
+ "bytes",
  "num-traits",
+ "rand 0.8.5",
+ "rkyv",
  "serde",
+ "serde_json",
 ]

 [[package]]
@@ -5444,16 +5589,16 @@ dependencies = [

 [[package]]
 name = "rustix"
-version = "0.35.10"
+version = "0.36.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "af895b90e5c071badc3136fc10ff0bcfc98747eadbaf43ed8f214e07ba8f8477"
+checksum = "a3807b5d10909833d3e9acd1eb5fb988f79376ff10fce42937de71a449c4c588"
 dependencies = [
  "bitflags",
  "errno",
  "io-lifetimes",
  "libc",
  "linux-raw-sys",
- "windows-sys 0.36.1",
+ "windows-sys 0.42.0",
 ]

 [[package]]
@@ -5583,7 +5728,7 @@ dependencies = [
  "rustpython-doc",
  "syn",
  "syn-ext",
- "textwrap 0.15.1",
+ "textwrap 0.15.2",
 ]

 [[package]]
@@ -5655,7 +5800,7 @@ dependencies = [
  "memchr",
  "memmap2",
  "mt19937",
- "nix 0.24.2",
+ "nix 0.24.3",
  "num-bigint",
  "num-complex",
  "num-integer",
@@ -5709,7 +5854,7 @@
@@ dependencies = [ "crossbeam-utils", "exitcode", "flate2", - "getrandom 0.2.7", + "getrandom 0.2.8", "glob", "half 1.8.2", "hex", @@ -5720,8 +5865,8 @@ dependencies = [ "libc", "log", "memchr", - "memoffset", - "nix 0.24.2", + "memoffset 0.6.5", + "nix 0.24.3", "num-bigint", "num-complex", "num-integer", @@ -5748,8 +5893,8 @@ dependencies = [ "serde", "sre-engine", "static_assertions", - "strum 0.24.1", - "strum_macros 0.24.3", + "strum", + "strum_macros", "thiserror", "thread_local", "timsort", @@ -5787,7 +5932,7 @@ dependencies = [ "libc", "log", "memchr", - "nix 0.24.2", + "nix 0.24.3", "radix_trie", "scopeguard", "unicode-segmentation", @@ -5920,6 +6065,12 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +[[package]] +name = "scratch" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8132065adcfd6e02db789d9285a0deb2f3fcb04002865ab67d5fb103533898" + [[package]] name = "script" version = "0.1.0" @@ -5935,7 +6086,7 @@ dependencies = [ "common-time", "console", "datafusion", - "datafusion-common 7.0.0", + "datafusion-common", "datafusion-expr", "datafusion-physical-expr", "datatypes", @@ -5975,6 +6126,12 @@ dependencies = [ "untrusted", ] +[[package]] +name = "seahash" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b" + [[package]] name = "security-framework" version = "2.7.0" @@ -6025,11 +6182,17 @@ dependencies = [ "pest", ] +[[package]] +name = "seq-macro" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0772c5c30e1a0d91f6834f8e545c69281c099dfa9a3ac58d96a9fd629c8d4898" + [[package]] name = "serde" -version = "1.0.145" +version = "1.0.149" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "728eb6351430bccb993660dfffc5a72f91ccc1295abaa8ce19b27ebe4f75568b" +checksum = "256b9932320c590e707b94576e3cc1f7c9024d0ee6612dfbcf1cb106cbe8e055" dependencies = [ "serde_derive", ] @@ -6046,9 +6209,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.145" +version = "1.0.149" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81fa1584d3d1bcacd84c277a0dfe21f5b0f6accf4a23d04d4c6d61f1af522b4c" +checksum = "b4eae9b04cbffdfd550eb462ed33bc6a1b68c935127d008b27444d08380f94e4" dependencies = [ "proc-macro2", "quote", @@ -6068,12 +6231,11 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.85" +version = "1.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e55a28e3aaef9d5ce0506d0a14dbba8054ddc7e499ef522dd8b26859ec9d4a44" +checksum = "020ff22c755c2ed3f8cf162dbb41a7268d934702f3ed3631656ea597e08fc3db" dependencies = [ - "indexmap", - "itoa 1.0.3", + "itoa 1.0.4", "ryu", "serde", ] @@ -6094,7 +6256,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" dependencies = [ "form_urlencoded", - "itoa 1.0.3", + "itoa 1.0.4", "ryu", "serde", ] @@ -6106,7 +6268,7 @@ dependencies = [ "aide", "api", "async-trait", - "axum 0.6.1", + "axum", "axum-macros", "axum-test-helper", "base64", @@ -6136,7 +6298,7 @@ dependencies = [ "openmetrics-parser", "opensrv-mysql", "pgwire", - "prost 0.11.0", + "prost 0.11.3", "query", "rand 0.8.5", "regex", @@ -6150,7 +6312,7 @@ dependencies = [ "sha1", "snafu", 
"snap", - "strum 0.24.1", + "strum", "table", "tempdir", "tokio", @@ -6175,9 +6337,9 @@ dependencies = [ [[package]] name = "sha-1" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "028f48d513f9678cda28f6e4064755b3fbb2af6acd672f2c209b62323f7aea0f" +checksum = "f5058ada175748e33390e40e872bd0fe59a19f265d0158daa551c5a88a76009c" dependencies = [ "cfg-if 1.0.0", "cpufeatures", @@ -6273,12 +6435,6 @@ dependencies = [ "paste", ] -[[package]] -name = "simdutf8" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f27f6278552951f1f2b8cf9da965d10969b2efdea95a6ec47987ab46edfe263a" - [[package]] name = "simple_asn1" version = "0.6.2" @@ -6288,7 +6444,7 @@ dependencies = [ "num-bigint", "num-traits", "thiserror", - "time 0.3.14", + "time 0.3.17", ] [[package]] @@ -6340,15 +6496,15 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fd0db749597d91ff862fd1d55ea87f7855a744a8425a64695b6fca237d1dad1" +checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" [[package]] name = "snafu" -version = "0.7.1" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5177903bf45656592d9eb5c0e22f408fc023aae51dbe2088889b71633ba451f2" +checksum = "a152ba99b054b22972ee794cf04e5ef572da1229e33b65f3c57abbff0525a454" dependencies = [ "backtrace", "doc-comment", @@ -6357,9 +6513,9 @@ dependencies = [ [[package]] name = "snafu-derive" -version = "0.7.1" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "410b26ed97440d90ced3e2488c868d56a86e2064f5d7d6f417909b286afe25e5" +checksum = "d5e79cdebbabaebb06a9bdbaedc7f159b410461f63611d4d0e3fb0fab8fed850" dependencies = [ "heck 0.4.0", "proc-macro2", @@ -6369,9 +6525,9 @@ dependencies = [ [[package]] name = "snap" -version = "1.0.5" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45456094d1983e2ee2a18fdfebce3189fa451699d0502cb8e3b49dba5ba41451" +checksum = "5e9f0ab6ef7eb7353d9119c170a436d1bf248eea575ac42d19d12f4e34130831" [[package]] name = "socket2" @@ -6415,7 +6571,7 @@ dependencies = [ "mito", "once_cell", "snafu", - "sqlparser 0.15.0", + "sqlparser", ] [[package]] @@ -6439,20 +6595,11 @@ version = "0.1.0" dependencies = [ "async-trait", "client", - "comfy-table 6.1.2", + "comfy-table", "sqlness", "tokio", ] -[[package]] -name = "sqlparser" -version = "0.15.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adbbea2526ad0d02ad9414a07c396078a5b944bbf9ca4fbab8f01bb4cb579081" -dependencies = [ - "log", -] - [[package]] name = "sqlparser" version = "0.26.0" @@ -6507,7 +6654,7 @@ name = "storage" version = "0.1.0" dependencies = [ "arc-swap", - "arrow-format", + "async-compat", "async-stream", "async-trait", "atomic_float", @@ -6525,9 +6672,10 @@ dependencies = [ "lazy_static", "log-store", "object-store", + "parquet", "paste", "planus", - "prost 0.11.0", + "prost 0.11.3", "rand 0.8.5", "regex", "serde", @@ -6569,21 +6717,6 @@ version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e08d8363704e6c71fc928674353e6b7c23dcea9d82d7012c8faf2a3a025f8d0" -[[package]] -name = "streaming-decompression" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"bf6cc3b19bfb128a8ad11026086e31d3ce9ad23f8ea37354b31383a187c44cf3" -dependencies = [ - "fallible-streaming-iterator", -] - -[[package]] -name = "streaming-iterator" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0085b81d5d4e57f264d492641cf80ea508c96d9a0e47c6296e8f016504e28fd7" - [[package]] name = "streaming-stats" version = "0.2.3" @@ -6593,12 +6726,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "strength_reduce" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3ff2f71c82567c565ba4b3009a9350a96a7269eaa4001ebedae926230bc2254" - [[package]] name = "string_cache" version = "0.8.4" @@ -6658,32 +6785,13 @@ dependencies = [ "syn", ] -[[package]] -name = "strum" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cae14b91c7d11c9a851d3fbc80a963198998c2a64eec840477fa92d8ce9b70bb" - [[package]] name = "strum" version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f" dependencies = [ - "strum_macros 0.24.3", -] - -[[package]] -name = "strum_macros" -version = "0.23.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bb0dc7ee9c15cea6199cde9a127fa16a4c5819af85395457ad72d68edc85a38" -dependencies = [ - "heck 0.3.3", - "proc-macro2", - "quote", - "rustversion", - "syn", + "strum_macros", ] [[package]] @@ -6749,9 +6857,9 @@ checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" [[package]] name = "syn" -version = "1.0.100" +version = "1.0.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52205623b1b0f064a4e71182c3b18ae902267282930c6d5462c91b859668426e" +checksum = "60b9b43d45702de4c839cb9b51d9f529c5dd26a4aff255b42b1ebc03e88ee908" dependencies = [ "proc-macro2", "quote", @@ -6806,11 +6914,12 @@ dependencies = [ "common-recordbatch", "common-telemetry", "datafusion", - "datafusion-common 7.0.0", + "datafusion-common", "datafusion-expr", "datatypes", "derive_builder", "futures", + "parquet", "parquet-format-async-temp", "paste", "serde", @@ -6912,7 +7021,7 @@ name = "tests-integration" version = "0.1.0" dependencies = [ "api", - "axum 0.6.1", + "axum", "axum-test-helper", "catalog", "client", @@ -6950,24 +7059,30 @@ dependencies = [ [[package]] name = "textwrap" -version = "0.15.1" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7b3e525a49ec206798b40326a44121291b530c963cfb01018f63e135bac543d" + +[[package]] +name = "textwrap" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "949517c0cf1bf4ee812e2e07e08ab448e3ae0d23472aee8a06c985f0c8815b16" +checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d" [[package]] name = "thiserror" -version = "1.0.35" +version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c53f98874615aea268107765aa1ed8f6116782501d18e53d08b471733bea6c85" +checksum = "10deb33631e3c9018b9baf9dcbbc4f737320d2b576bac10f6aefa048fa407e3e" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.35" +version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8b463991b4eab2d801e724172285ec4195c650e8ec79b149e6c2a8e6dd3f783" +checksum = "982d17546b47146b28f7c22e3d08465f6b8903d0ea13c1660d9d84a6e7adcdbb" dependencies = [ 
"proc-macro2", "quote", @@ -7005,12 +7120,12 @@ dependencies = [ [[package]] name = "thrift" -version = "0.13.0" +version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c6d965454947cc7266d22716ebfd07b18d84ebaf35eec558586bbb2a8cb6b5b" +checksum = "b82ca8f46f95b3ce96081fe3dd89160fdea970c254bb72925255d1b62aae692e" dependencies = [ "byteorder", - "integer-encoding 1.1.7", + "integer-encoding", "log", "ordered-float 1.1.1", "threadpool", @@ -7018,15 +7133,13 @@ dependencies = [ [[package]] name = "thrift" -version = "0.15.0" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b82ca8f46f95b3ce96081fe3dd89160fdea970c254bb72925255d1b62aae692e" +checksum = "09678c4cdbb4eed72e18b7c2af1329c69825ed16fcbac62d083fc3e2b0590ff0" dependencies = [ "byteorder", - "integer-encoding 3.0.4", - "log", + "integer-encoding", "ordered-float 1.1.1", - "threadpool", ] [[package]] @@ -7041,22 +7154,30 @@ dependencies = [ [[package]] name = "time" -version = "0.3.14" +version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c3f9a28b618c3a6b9251b6908e9c99e04b9e5c02e6581ccbb67d59c34ef7f9b" +checksum = "a561bf4617eebd33bca6434b988f39ed798e527f51a1e797d0ee4f61c0a38376" dependencies = [ - "itoa 1.0.3", - "libc", - "num_threads", + "itoa 1.0.4", "serde", + "time-core", "time-macros", ] +[[package]] +name = "time-core" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e153e1f1acaef8acc537e68b44906d2db6436e2b35ac2c6b42640fff91f00fd" + [[package]] name = "time-macros" -version = "0.2.4" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42657b1a6f4d817cda8e7a0ace261fe0cc946cf3a80314390b22cc61ae080792" +checksum = "d967f99f534ca7e495c575c62638eebc2898a8c84c119b89e250477bc4ba16b2" +dependencies = [ + "time-core", +] [[package]] name = "timsort" @@ -7100,9 +7221,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] name = "tokio" -version = "1.21.1" +version = "1.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0020c875007ad96677dcc890298f4b942882c5d4eb7cc8f439fc3bf813dc9c95" +checksum = "eab6d665857cc6ca78d6e80303a02cea7a7851e85dfbd77cbdc09bd129f1ef46" dependencies = [ "autocfg", "bytes", @@ -7110,14 +7231,13 @@ dependencies = [ "memchr", "mio", "num_cpus", - "once_cell", "parking_lot", "pin-project-lite", "signal-hook-registry", "socket2", "tokio-macros", "tracing", - "winapi", + "windows-sys 0.42.0", ] [[package]] @@ -7132,9 +7252,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "1.8.0" +version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9724f9a975fb987ef7a3cd9be0350edcbe130698af5b8f7a631e23d42d052484" +checksum = "d266c00fde287f55d3f1c3e96c500c362a2b8c695076ec180f27918820bc6df8" dependencies = [ "proc-macro2", "quote", @@ -7192,9 +7312,9 @@ dependencies = [ [[package]] name = "tokio-stream" -version = "0.1.10" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6edf2d6bc038a43d31353570e27270603f4648d18f5ed10c0e179abe43255af" +checksum = "d660770404473ccd7bc9f8b28494a811bc18542b915c0855c51e8f419d5223ce" dependencies = [ "futures-core", "pin-project-lite", @@ -7240,13 +7360,13 @@ dependencies = [ [[package]] name = "tonic" -version = "0.8.1" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "11cd56bdb54ef93935a6a79dbd1d91f1ebd4c64150fd61654031fd6b8b775c91" +checksum = "8f219fad3b929bef19b1f86fbc0358d35daed8f2cac972037ac0dc10bbb8d5fb" dependencies = [ "async-stream", "async-trait", - "axum 0.5.16", + "axum", "base64", "bytes", "futures-core", @@ -7258,8 +7378,8 @@ dependencies = [ "hyper-timeout", "percent-encoding", "pin-project", - "prost 0.11.0", - "prost-derive 0.11.0", + "prost 0.11.3", + "prost-derive 0.11.2", "tokio", "tokio-stream", "tokio-util", @@ -7272,13 +7392,13 @@ dependencies = [ [[package]] name = "tonic-build" -version = "0.8.0" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fbcd2800e34e743b9ae795867d5f77b535d3a3be69fd731e39145719752df8c" +checksum = "5bf5e9b9c0f7e0a7c027dcfaba7b2c60816c7049171f679d99ee2ff65d0de8c4" dependencies = [ "prettyplease", "proc-macro2", - "prost-build 0.11.1", + "prost-build 0.11.3", "quote", "syn", ] @@ -7290,8 +7410,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0455f730d540a1484bffc3c55c94100b18a662597b982c2e9073f2c55c602616" dependencies = [ "bytes", - "prost 0.11.0", - "prost-types 0.11.1", + "prost 0.11.3", + "prost-types 0.11.2", "tokio", "tokio-stream", "tonic", @@ -7321,9 +7441,9 @@ dependencies = [ [[package]] name = "tower-http" -version = "0.3.4" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c530c8675c1dbf98facee631536fa116b5fb6382d7dd6dc1b118d970eafe3ba" +checksum = "f873044bf02dd1e8239e9c1293ea39dad76dc594ec16185d0a1bf31d8dc8d858" dependencies = [ "async-compression", "base64", @@ -7363,9 +7483,9 @@ checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" [[package]] name = "tracing" -version = "0.1.36" +version = "0.1.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fce9567bd60a67d08a16488756721ba392f24f29006402881e43b19aac64307" +checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" dependencies = [ "cfg-if 1.0.0", "log", @@ -7381,15 +7501,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09d48f71a791638519505cefafe162606f706c25592e4bde4d97600c0195312e" dependencies = [ "crossbeam-channel", - "time 0.3.14", + "time 0.3.17", "tracing-subscriber", ] [[package]] name = "tracing-attributes" -version = "0.1.22" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11c75893af559bc8e10716548bdef5cb2b983f8e637db9d0e15126b61b484ee2" +checksum = "4017f8f45139870ca7e672686113917c71c7a6e02d4924eda67186083c03081a" dependencies = [ "proc-macro2", "quote", @@ -7398,15 +7518,15 @@ dependencies = [ [[package]] name = "tracing-bunyan-formatter" -version = "0.3.3" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a788f2119fde477cd33823330c14004fa8cdac6892fd6f12181bbda9dbf14fc9" +checksum = "a2445962f94a813b2aaea29ceeccb6dce9fd3aa5b1cb45595cde755b00d021ad" dependencies = [ "gethostname", "log", "serde", "serde_json", - "time 0.3.14", + "time 0.3.17", "tracing", "tracing-core", "tracing-log", @@ -7415,9 +7535,9 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.29" +version = "0.1.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aeea4303076558a00714b823f9ad67d58a3bbda1df83d8827d21193156e22f7" +checksum = "24eb03ba0eab1fd845050058ce5e616558e8f8d8fca633e6b163fe25c797213a" dependencies = [ "once_cell", "valuable", @@ -7462,12 +7582,12 @@ 
dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.15" +version = "0.3.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60db860322da191b40952ad9affe65ea23e7dd6a5c442c2c42865810c6ab8e6b" +checksum = "a6176eae26dd70d0c919749377897b54a9276bd7061339665dd68777926b5a70" dependencies = [ - "ansi_term", "matchers", + "nu-ansi-term", "once_cell", "regex", "sharded-slab", @@ -7512,9 +7632,9 @@ dependencies = [ [[package]] name = "typenum" -version = "1.15.0" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987" +checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" [[package]] name = "ucd-trie" @@ -7679,9 +7799,9 @@ checksum = "623f59e6af2a98bdafeb93fa277ac8e1e40440973001ca15cf4ae1541cd16d56" [[package]] name = "unicode-ident" -version = "1.0.4" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcc811dc4066ac62f84f11307873c4850cb653bfa9b1719cee2bd2204a4bc5dd" +checksum = "6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3" [[package]] name = "unicode-normalization" @@ -7758,12 +7878,12 @@ checksum = "936e4b492acfd135421d8dca4b1aa80a7bfc26e702ef3af710e0752684df5372" [[package]] name = "uuid" -version = "1.1.2" +version = "1.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd6469f4314d5f1ffec476e05f17cc9a78bc7a27a6a857842170bdf8d6f98d2f" +checksum = "422ee0de9031b5b948b97a8fc04e3aa35230001a722ddd27943e0be31564ce4c" dependencies = [ "atomic", - "getrandom 0.2.7", + "getrandom 0.2.8", "rand 0.8.5", "serde", "uuid-macro-internal", @@ -7810,7 +7930,7 @@ dependencies = [ "getset", "rustversion", "thiserror", - "time 0.3.14", + "time 0.3.17", ] [[package]] @@ -7975,9 +8095,9 @@ dependencies = [ [[package]] name = "webpki-roots" -version = "0.22.4" +version = "0.22.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1c760f0d366a6c24a02ed7816e23e691f5d92291f94d15e836006fd11b04daf" +checksum = "368bfe657969fb01238bb756d351dcade285e0f6fcbd36dcb23359a5169975be" dependencies = [ "webpki", ] @@ -8193,9 +8313,9 @@ dependencies = [ [[package]] name = "wyz" -version = "0.5.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30b31594f29d27036c383b53b59ed3476874d518f0efb151b27a4c275141390e" +checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed" dependencies = [ "tap", ] @@ -8208,18 +8328,18 @@ checksum = "d2d7d3948613f75c98fd9328cfdcc45acc4d360655289d0a7d4ec931392200a3" [[package]] name = "zstd" -version = "0.10.2+zstd.1.5.2" +version = "0.11.2+zstd.1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f4a6bd64f22b5e3e94b4e238669ff9f10815c27a5180108b849d24174a83847" +checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4" dependencies = [ "zstd-safe", ] [[package]] name = "zstd-safe" -version = "4.1.6+zstd.1.5.2" +version = "5.0.2+zstd.1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94b61c51bb270702d6167b8ce67340d2754b088d0c091b06e593aa772c3ee9bb" +checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db" dependencies = [ "libc", "zstd-sys", @@ -8227,9 +8347,9 @@ dependencies = [ [[package]] name = "zstd-sys" -version = "1.6.3+zstd.1.5.2" +version = "2.0.4+zstd.1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "fc49afa5c8d634e75761feda8c592051e7eeb4683ba827211eb0d731d3402ea8" +checksum = "4fa202f2ef00074143e219d15b62ffc317d17cc33909feac471c044087cad7b0" dependencies = [ "cc", "libc", diff --git a/Cargo.toml b/Cargo.toml index a960138d4b7f..80a592b46439 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,7 +20,6 @@ members = [ "src/common/time", "src/datanode", "src/datatypes", - "src/datatypes2", "src/frontend", "src/log-store", "src/meta-client", diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 2d2819178eb8..ea8d78ef529b 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -5,10 +5,10 @@ edition = "2021" license = "Apache-2.0" [dependencies] -arrow = "10" +arrow = "26.0.0" clap = { version = "4.0", features = ["derive"] } client = { path = "../src/client" } indicatif = "0.17.1" itertools = "0.10.5" -parquet = { version = "*" } +parquet = "26.0.0" tokio = { version = "1.21", features = ["full"] } diff --git a/benchmarks/src/bin/nyc-taxi.rs b/benchmarks/src/bin/nyc-taxi.rs index 0ca1f33182ed..f39b48c87e76 100644 --- a/benchmarks/src/bin/nyc-taxi.rs +++ b/benchmarks/src/bin/nyc-taxi.rs @@ -20,7 +20,6 @@ use std::collections::HashMap; use std::path::{Path, PathBuf}; -use std::sync::Arc; use std::time::Instant; use arrow::array::{ArrayRef, PrimitiveArray, StringArray, TimestampNanosecondArray}; @@ -32,9 +31,7 @@ use client::api::v1::column::Values; use client::api::v1::{Column, ColumnDataType, ColumnDef, CreateExpr, InsertExpr}; use client::{Client, Database, Select}; use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; -use parquet::arrow::{ArrowReader, ParquetFileArrowReader}; -use parquet::file::reader::FileReader; -use parquet::file::serialized_reader::SerializedFileReader; +use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use tokio::task::JoinSet; const DATABASE_NAME: &str = "greptime"; @@ -86,10 +83,14 @@ async fn write_data( pb_style: ProgressStyle, ) -> u128 { let file = std::fs::File::open(&path).unwrap(); - let file_reader = Arc::new(SerializedFileReader::new(file).unwrap()); - let row_num = file_reader.metadata().file_metadata().num_rows(); - let record_batch_reader = ParquetFileArrowReader::new(file_reader) - .get_record_reader(batch_size) + let record_batch_reader_builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap(); + let row_num = record_batch_reader_builder + .metadata() + .file_metadata() + .num_rows(); + let record_batch_reader = record_batch_reader_builder + .with_batch_size(batch_size) + .build() .unwrap(); let progress_bar = mpb.add(ProgressBar::new(row_num as _)); progress_bar.set_style(pb_style); @@ -210,9 +211,10 @@ fn build_values(column: &ArrayRef) -> Values { | DataType::FixedSizeList(_, _) | DataType::LargeList(_) | DataType::Struct(_) - | DataType::Union(_, _) + | DataType::Union(_, _, _) | DataType::Dictionary(_, _) - | DataType::Decimal(_, _) + | DataType::Decimal128(_, _) + | DataType::Decimal256(_, _) | DataType::Map(_, _) => todo!(), } } diff --git a/src/api/greptime/v1/column.proto b/src/api/greptime/v1/column.proto index ec6993abe943..6f5692747e77 100644 --- a/src/api/greptime/v1/column.proto +++ b/src/api/greptime/v1/column.proto @@ -32,7 +32,10 @@ message Column { repeated int32 date_values = 14; repeated int64 datetime_values = 15; - repeated int64 ts_millis_values = 16; + repeated int64 ts_second_values = 16; + repeated int64 ts_millisecond_values = 17; + repeated int64 ts_microsecond_values = 18; + repeated int64 ts_nanosecond_values = 19; } // The array of non-null values in this 
column. // @@ -75,5 +78,8 @@ enum ColumnDataType { STRING = 12; DATE = 13; DATETIME = 14; - TIMESTAMP = 15; + TIMESTAMP_SECOND = 15; + TIMESTAMP_MILLISECOND = 16; + TIMESTAMP_MICROSECOND = 17; + TIMESTAMP_NANOSECOND = 18; } diff --git a/src/api/src/helper.rs b/src/api/src/helper.rs index cdcf704c8dcf..72fb0c507bcd 100644 --- a/src/api/src/helper.rs +++ b/src/api/src/helper.rs @@ -15,6 +15,7 @@ use common_base::BitVec; use common_time::timestamp::TimeUnit; use datatypes::prelude::ConcreteDataType; +use datatypes::types::TimestampType; use datatypes::value::Value; use datatypes::vectors::VectorRef; use snafu::prelude::*; @@ -56,7 +57,16 @@ impl From<ColumnDataType> for ConcreteDataType { ColumnDataType::String => ConcreteDataType::string_datatype(), ColumnDataType::Date => ConcreteDataType::date_datatype(), ColumnDataType::Datetime => ConcreteDataType::datetime_datatype(), - ColumnDataType::Timestamp => ConcreteDataType::timestamp_millis_datatype(), + ColumnDataType::TimestampSecond => ConcreteDataType::timestamp_second_datatype(), + ColumnDataType::TimestampMillisecond => { + ConcreteDataType::timestamp_millisecond_datatype() + } + ColumnDataType::TimestampMicrosecond => { + ConcreteDataType::timestamp_microsecond_datatype() + } + ColumnDataType::TimestampNanosecond => { + ConcreteDataType::timestamp_nanosecond_datatype() + } } } } @@ -81,7 +91,12 @@ impl TryFrom<ConcreteDataType> for ColumnDataTypeWrapper { ConcreteDataType::String(_) => ColumnDataType::String, ConcreteDataType::Date(_) => ColumnDataType::Date, ConcreteDataType::DateTime(_) => ColumnDataType::Datetime, - ConcreteDataType::Timestamp(_) => ColumnDataType::Timestamp, + ConcreteDataType::Timestamp(unit) => match unit { + TimestampType::Second(_) => ColumnDataType::TimestampSecond, + TimestampType::Millisecond(_) => ColumnDataType::TimestampMillisecond, + TimestampType::Microsecond(_) => ColumnDataType::TimestampMicrosecond, + TimestampType::Nanosecond(_) => ColumnDataType::TimestampNanosecond, + }, ConcreteDataType::Null(_) | ConcreteDataType::List(_) => { return error::IntoColumnDataTypeSnafu { from: datatype }.fail() } } @@ -153,8 +168,20 @@ impl Values { datetime_values: Vec::with_capacity(capacity), ..Default::default() }, - ColumnDataType::Timestamp => Values { - ts_millis_values: Vec::with_capacity(capacity), + ColumnDataType::TimestampSecond => Values { + ts_second_values: Vec::with_capacity(capacity), + ..Default::default() + }, + ColumnDataType::TimestampMillisecond => Values { + ts_millisecond_values: Vec::with_capacity(capacity), + ..Default::default() + }, + ColumnDataType::TimestampMicrosecond => Values { + ts_microsecond_values: Vec::with_capacity(capacity), + ..Default::default() + }, + ColumnDataType::TimestampNanosecond => Values { + ts_nanosecond_values: Vec::with_capacity(capacity), + ..Default::default() }, } } @@ -187,9 +214,12 @@ impl Column { Value::Binary(val) => values.binary_values.push(val.to_vec()), Value::Date(val) => values.date_values.push(val.val()), Value::DateTime(val) => values.datetime_values.push(val.val()), - Value::Timestamp(val) => values - .ts_millis_values - .push(val.convert_to(TimeUnit::Millisecond)), + Value::Timestamp(val) => match val.unit() { + TimeUnit::Second => values.ts_second_values.push(val.value()), + TimeUnit::Millisecond => values.ts_millisecond_values.push(val.value()), + TimeUnit::Microsecond => values.ts_microsecond_values.push(val.value()), + TimeUnit::Nanosecond => values.ts_nanosecond_values.push(val.value()), + }, Value::List(_) => unreachable!(), }); self.null_mask = null_mask.into_vec(); @@ -200,7
+230,10 @@ impl Column { mod tests { use std::sync::Arc; - use datatypes::vectors::BooleanVector; + use datatypes::vectors::{ + BooleanVector, TimestampMicrosecondVector, TimestampMillisecondVector, + TimestampNanosecondVector, TimestampSecondVector, + }; use super::*; @@ -258,8 +291,8 @@ mod tests { let values = values.datetime_values; assert_eq!(2, values.capacity()); - let values = Values::with_capacity(ColumnDataType::Timestamp, 2); - let values = values.ts_millis_values; + let values = Values::with_capacity(ColumnDataType::TimestampMillisecond, 2); + let values = values.ts_millisecond_values; assert_eq!(2, values.capacity()); } @@ -326,8 +359,8 @@ mod tests { ColumnDataTypeWrapper(ColumnDataType::Datetime).into() ); assert_eq!( - ConcreteDataType::timestamp_millis_datatype(), - ColumnDataTypeWrapper(ColumnDataType::Timestamp).into() + ConcreteDataType::timestamp_millisecond_datatype(), + ColumnDataTypeWrapper(ColumnDataType::TimestampMillisecond).into() ); } @@ -394,8 +427,8 @@ mod tests { ConcreteDataType::datetime_datatype().try_into().unwrap() ); assert_eq!( - ColumnDataTypeWrapper(ColumnDataType::Timestamp), - ConcreteDataType::timestamp_millis_datatype() + ColumnDataTypeWrapper(ColumnDataType::TimestampMillisecond), + ConcreteDataType::timestamp_millisecond_datatype() .try_into() .unwrap() ); @@ -412,7 +445,48 @@ mod tests { assert!(result.is_err()); assert_eq!( result.unwrap_err().to_string(), - "Failed to create column datatype from List(ListType { inner: Boolean(BooleanType) })" + "Failed to create column datatype from List(ListType { item_type: Boolean(BooleanType) })" + ); + } + + #[test] + fn test_column_put_timestamp_values() { + let mut column = Column { + column_name: "test".to_string(), + semantic_type: 0, + values: Some(Values { + ..Default::default() + }), + null_mask: vec![], + datatype: 0, + }; + + let vector = Arc::new(TimestampNanosecondVector::from_vec(vec![1, 2, 3])); + column.push_vals(3, vector); + assert_eq!( + vec![1, 2, 3], + column.values.as_ref().unwrap().ts_nanosecond_values + ); + + let vector = Arc::new(TimestampMillisecondVector::from_vec(vec![4, 5, 6])); + column.push_vals(3, vector); + assert_eq!( + vec![4, 5, 6], + column.values.as_ref().unwrap().ts_millisecond_values + ); + + let vector = Arc::new(TimestampMicrosecondVector::from_vec(vec![7, 8, 9])); + column.push_vals(3, vector); + assert_eq!( + vec![7, 8, 9], + column.values.as_ref().unwrap().ts_microsecond_values + ); + + let vector = Arc::new(TimestampSecondVector::from_vec(vec![10, 11, 12])); + column.push_vals(3, vector); + assert_eq!( + vec![10, 11, 12], + column.values.as_ref().unwrap().ts_second_values ); } diff --git a/src/catalog/Cargo.toml b/src/catalog/Cargo.toml index 1c6f7a063ef5..90adcf8e8ab5 100644 --- a/src/catalog/Cargo.toml +++ b/src/catalog/Cargo.toml @@ -19,9 +19,7 @@ common-recordbatch = { path = "../common/recordbatch" } common-runtime = { path = "../common/runtime" } common-telemetry = { path = "../common/telemetry" } common-time = { path = "../common/time" } -datafusion = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2", features = [ - "simd", -] } +datafusion = "14.0.0" datatypes = { path = "../datatypes" } futures = "0.3" futures-util = "0.3" diff --git a/src/catalog/src/error.rs b/src/catalog/src/error.rs index 05e6944cd54c..f344ae3bb8a4 100644 --- a/src/catalog/src/error.rs +++ b/src/catalog/src/error.rs @@ -17,7 +17,7 @@ use std::any::Any; use common_error::ext::{BoxedError, ErrorExt}; use common_error::prelude::{Snafu, StatusCode}; use 
datafusion::error::DataFusionError; -use datatypes::arrow; +use datatypes::prelude::ConcreteDataType; use datatypes::schema::RawSchema; use snafu::{Backtrace, ErrorCompat}; @@ -51,14 +51,12 @@ pub enum Error { SystemCatalog { msg: String, backtrace: Backtrace }, #[snafu(display( - "System catalog table type mismatch, expected: binary, found: {:?} source: {}", + "System catalog table type mismatch, expected: binary, found: {:?}", data_type, - source ))] SystemCatalogTypeMismatch { - data_type: arrow::datatypes::DataType, - #[snafu(backtrace)] - source: datatypes::error::Error, + data_type: ConcreteDataType, + backtrace: Backtrace, }, #[snafu(display("Invalid system catalog entry type: {:?}", entry_type))] @@ -222,10 +220,11 @@ impl ErrorExt for Error { | Error::ValueDeserialize { .. } | Error::Io { .. } => StatusCode::StorageUnavailable, - Error::RegisterTable { .. } => StatusCode::Internal, + Error::RegisterTable { .. } | Error::SystemCatalogTypeMismatch { .. } => { + StatusCode::Internal + } Error::ReadSystemCatalog { source, .. } => source.status_code(), - Error::SystemCatalogTypeMismatch { source, .. } => source.status_code(), Error::InvalidCatalogValue { source, .. } => source.status_code(), Error::TableExists { .. } => StatusCode::TableAlreadyExists, @@ -265,7 +264,6 @@ impl From<Error> for DataFusionError { #[cfg(test)] mod tests { use common_error::mock::MockError; - use datatypes::arrow::datatypes::DataType; use snafu::GenerateImplicitData; use super::*; @@ -314,11 +312,8 @@ assert_eq!( StatusCode::Internal, Error::SystemCatalogTypeMismatch { - data_type: DataType::Boolean, - source: datatypes::error::Error::UnsupportedArrowType { - arrow_type: DataType::Boolean, - backtrace: Backtrace::generate() - } + data_type: ConcreteDataType::binary_datatype(), + backtrace: Backtrace::generate(), } .status_code() ); diff --git a/src/catalog/src/helper.rs b/src/catalog/src/helper.rs index 2caf098865b8..062d07bc1997 100644 --- a/src/catalog/src/helper.rs +++ b/src/catalog/src/helper.rs @@ -138,7 +138,7 @@ impl TableGlobalKey { /// Table global info contains necessary info for a datanode to create table regions, including /// table id, table meta(schema...), region id allocation across datanodes. -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct TableGlobalValue { /// Id of datanode that created the global table info kv. only for debugging. pub node_id: u64, diff --git a/src/catalog/src/local/manager.rs b/src/catalog/src/local/manager.rs index d09411cbaa2a..e4c89933e04d 100644 --- a/src/catalog/src/local/manager.rs +++ b/src/catalog/src/local/manager.rs @@ -145,27 +145,34 @@ impl LocalCatalogManager { /// Convert `RecordBatch` to a vector of `Entry`.
fn record_batch_to_entry(rb: RecordBatch) -> Result<Vec<Entry>> { ensure!( - rb.df_recordbatch.columns().len() >= 6, + rb.num_columns() >= 6, SystemCatalogSnafu { - msg: format!("Length mismatch: {}", rb.df_recordbatch.columns().len()) + msg: format!("Length mismatch: {}", rb.num_columns()) } ); - let entry_type = UInt8Vector::try_from_arrow_array(&rb.df_recordbatch.columns()[0]) - .with_context(|_| SystemCatalogTypeMismatchSnafu { - data_type: rb.df_recordbatch.columns()[ENTRY_TYPE_INDEX] - .data_type() - .clone(), + let entry_type = rb + .column(ENTRY_TYPE_INDEX) + .as_any() + .downcast_ref::<UInt8Vector>() + .with_context(|| SystemCatalogTypeMismatchSnafu { + data_type: rb.column(ENTRY_TYPE_INDEX).data_type(), })?; - let key = BinaryVector::try_from_arrow_array(&rb.df_recordbatch.columns()[1]) - .with_context(|_| SystemCatalogTypeMismatchSnafu { - data_type: rb.df_recordbatch.columns()[KEY_INDEX].data_type().clone(), + let key = rb + .column(KEY_INDEX) + .as_any() + .downcast_ref::<BinaryVector>() + .with_context(|| SystemCatalogTypeMismatchSnafu { + data_type: rb.column(KEY_INDEX).data_type(), })?; - let value = BinaryVector::try_from_arrow_array(&rb.df_recordbatch.columns()[3]) - .with_context(|_| SystemCatalogTypeMismatchSnafu { - data_type: rb.df_recordbatch.columns()[VALUE_INDEX].data_type().clone(), + let value = rb + .column(VALUE_INDEX) + .as_any() + .downcast_ref::<BinaryVector>() + .with_context(|| SystemCatalogTypeMismatchSnafu { + data_type: rb.column(VALUE_INDEX).data_type(), })?; let mut res = Vec::with_capacity(rb.num_rows()); diff --git a/src/catalog/src/system.rs b/src/catalog/src/system.rs index b6555b935339..960be1fa24f3 100644 --- a/src/catalog/src/system.rs +++ b/src/catalog/src/system.rs @@ -21,14 +21,13 @@ use common_catalog::consts::{ SYSTEM_CATALOG_TABLE_ID, SYSTEM_CATALOG_TABLE_NAME, }; use common_query::logical_plan::Expr; -use common_query::physical_plan::{PhysicalPlanRef, RuntimeEnv}; +use common_query::physical_plan::{PhysicalPlanRef, SessionContext}; use common_recordbatch::SendableRecordBatchStream; use common_telemetry::debug; -use common_time::timestamp::Timestamp; use common_time::util; use datatypes::prelude::{ConcreteDataType, ScalarVector}; use datatypes::schema::{ColumnSchema, Schema, SchemaBuilder, SchemaRef}; -use datatypes::vectors::{BinaryVector, TimestampVector, UInt8Vector}; +use datatypes::vectors::{BinaryVector, TimestampMillisecondVector, UInt8Vector}; use serde::{Deserialize, Serialize}; use snafu::{ensure, OptionExt, ResultExt}; use table::engine::{EngineContext, TableEngineRef}; @@ -127,13 +126,14 @@ impl SystemCatalogTable { /// Create a stream of all entries inside system catalog table pub async fn records(&self) -> Result<SendableRecordBatchStream> { let full_projection = None; + let ctx = SessionContext::new(); let scan = self .table .scan(&full_projection, &[], None) .await .context(error::SystemCatalogTableScanSnafu)?; let stream = scan - .execute(0, Arc::new(RuntimeEnv::default())) + .execute(0, ctx.task_ctx()) .context(error::SystemCatalogTableScanExecSnafu)?; Ok(stream) } @@ -161,7 +161,7 @@ fn build_system_catalog_schema() -> Schema { ), ColumnSchema::new( "timestamp".to_string(), - ConcreteDataType::timestamp_millis_datatype(), + ConcreteDataType::timestamp_millisecond_datatype(), false, ) .with_time_index(true), @@ -172,12 +172,12 @@ ), ColumnSchema::new( "gmt_created".to_string(), - ConcreteDataType::timestamp_millis_datatype(), + ConcreteDataType::timestamp_millisecond_datatype(), false, ), ColumnSchema::new( "gmt_modified".to_string(), -
ConcreteDataType::timestamp_millis_datatype(), + ConcreteDataType::timestamp_millisecond_datatype(), false, ), ]; @@ -222,7 +222,7 @@ pub fn build_insert_request(entry_type: EntryType, key: &[u8], value: &[u8]) -> // Timestamp in key part is intentionally left to 0 columns_values.insert( "timestamp".to_string(), - Arc::new(TimestampVector::from_slice(&[Timestamp::from_millis(0)])) as _, + Arc::new(TimestampMillisecondVector::from_slice(&[0])) as _, ); columns_values.insert( @@ -230,18 +230,15 @@ pub fn build_insert_request(entry_type: EntryType, key: &[u8], value: &[u8]) -> Arc::new(BinaryVector::from_slice(&[value])) as _, ); + let now = util::current_time_millis(); columns_values.insert( "gmt_created".to_string(), - Arc::new(TimestampVector::from_slice(&[Timestamp::from_millis( - util::current_time_millis(), - )])) as _, + Arc::new(TimestampMillisecondVector::from_slice(&[now])) as _, ); columns_values.insert( "gmt_modified".to_string(), - Arc::new(TimestampVector::from_slice(&[Timestamp::from_millis( - util::current_time_millis(), - )])) as _, + Arc::new(TimestampMillisecondVector::from_slice(&[now])) as _, ); InsertRequest { diff --git a/src/catalog/src/tables.rs b/src/catalog/src/tables.rs index b11fc870deab..8dd59fb1bf3f 100644 --- a/src/catalog/src/tables.rs +++ b/src/catalog/src/tables.rs @@ -26,9 +26,9 @@ use common_query::logical_plan::Expr; use common_query::physical_plan::PhysicalPlanRef; use common_recordbatch::error::Result as RecordBatchResult; use common_recordbatch::{RecordBatch, RecordBatchStream}; -use datatypes::prelude::{ConcreteDataType, VectorBuilder}; +use datatypes::prelude::{ConcreteDataType, DataType}; use datatypes::schema::{ColumnSchema, Schema, SchemaRef}; -use datatypes::value::Value; +use datatypes::value::ValueRef; use datatypes::vectors::VectorRef; use futures::Stream; use snafu::ResultExt; @@ -149,26 +149,33 @@ fn tables_to_record_batch( engine: &str, ) -> Vec<VectorRef> { let mut catalog_vec = - VectorBuilder::with_capacity(ConcreteDataType::string_datatype(), table_names.len()); + ConcreteDataType::string_datatype().create_mutable_vector(table_names.len()); let mut schema_vec = - VectorBuilder::with_capacity(ConcreteDataType::string_datatype(), table_names.len()); + ConcreteDataType::string_datatype().create_mutable_vector(table_names.len()); let mut table_name_vec = - VectorBuilder::with_capacity(ConcreteDataType::string_datatype(), table_names.len()); + ConcreteDataType::string_datatype().create_mutable_vector(table_names.len()); let mut engine_vec = - VectorBuilder::with_capacity(ConcreteDataType::string_datatype(), table_names.len()); + ConcreteDataType::string_datatype().create_mutable_vector(table_names.len()); for table_name in table_names { - catalog_vec.push(&Value::String(catalog_name.into())); - schema_vec.push(&Value::String(schema_name.into())); - table_name_vec.push(&Value::String(table_name.into())); - engine_vec.push(&Value::String(engine.into())); + // Safety: All these vectors are string type.
+ catalog_vec + .push_value_ref(ValueRef::String(catalog_name)) + .unwrap(); + schema_vec + .push_value_ref(ValueRef::String(schema_name)) + .unwrap(); + table_name_vec + .push_value_ref(ValueRef::String(&table_name)) + .unwrap(); + engine_vec.push_value_ref(ValueRef::String(engine)).unwrap(); } vec![ - catalog_vec.finish(), - schema_vec.finish(), - table_name_vec.finish(), - engine_vec.finish(), + catalog_vec.to_vector(), + schema_vec.to_vector(), + table_name_vec.to_vector(), + engine_vec.to_vector(), ] } @@ -340,9 +347,7 @@ fn build_schema_for_tables() -> Schema { #[cfg(test)] mod tests { use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; - use common_query::physical_plan::RuntimeEnv; - use datatypes::arrow::array::Utf8Array; - use datatypes::arrow::datatypes::DataType; + use common_query::physical_plan::SessionContext; use futures_util::StreamExt; use table::table::numbers::NumbersTable; @@ -366,56 +371,47 @@ let tables = Tables::new(catalog_list, "test_engine".to_string()); let tables_stream = tables.scan(&None, &[], None).await.unwrap(); - let mut tables_stream = tables_stream - .execute(0, Arc::new(RuntimeEnv::default())) - .unwrap(); + let session_ctx = SessionContext::new(); + let mut tables_stream = tables_stream.execute(0, session_ctx.task_ctx()).unwrap(); if let Some(t) = tables_stream.next().await { - let batch = t.unwrap().df_recordbatch; + let batch = t.unwrap(); assert_eq!(1, batch.num_rows()); assert_eq!(4, batch.num_columns()); - assert_eq!(&DataType::Utf8, batch.column(0).data_type()); - assert_eq!(&DataType::Utf8, batch.column(1).data_type()); - assert_eq!(&DataType::Utf8, batch.column(2).data_type()); - assert_eq!(&DataType::Utf8, batch.column(3).data_type()); + assert_eq!( + ConcreteDataType::string_datatype(), + batch.column(0).data_type() ); + assert_eq!( + ConcreteDataType::string_datatype(), + batch.column(1).data_type() ); + assert_eq!( + ConcreteDataType::string_datatype(), + batch.column(2).data_type() ); + assert_eq!( + ConcreteDataType::string_datatype(), + batch.column(3).data_type() ); assert_eq!( "greptime", - batch - .column(0) - .as_any() - .downcast_ref::<Utf8Array<i32>>() - .unwrap() - .value(0) + batch.column(0).get_ref(0).as_string().unwrap().unwrap() ); assert_eq!( "public", - batch - .column(1) - .as_any() - .downcast_ref::<Utf8Array<i32>>() - .unwrap() - .value(0) + batch.column(1).get_ref(0).as_string().unwrap().unwrap() ); assert_eq!( "test_table", - batch - .column(2) - .as_any() - .downcast_ref::<Utf8Array<i32>>() - .unwrap() - .value(0) + batch.column(2).get_ref(0).as_string().unwrap().unwrap() ); assert_eq!( "test_engine", - batch - .column(3) - .as_any() - .downcast_ref::<Utf8Array<i32>>() - .unwrap() - .value(0) + batch.column(3).get_ref(0).as_string().unwrap().unwrap() ); } else { panic!("Record batch should not be empty!") diff --git a/src/client/Cargo.toml b/src/client/Cargo.toml index da58e9c88475..5c19f89970e8 100644 --- a/src/client/Cargo.toml +++ b/src/client/Cargo.toml @@ -15,9 +15,7 @@ common-grpc-expr = { path = "../common/grpc-expr" } common-query = { path = "../common/query" } common-recordbatch = { path = "../common/recordbatch" } common-time = { path = "../common/time" } -datafusion = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2", features = [ - "simd", -] } +datafusion = "14.0.0" datatypes = { path = "../datatypes" } enum_dispatch = "0.3" parking_lot = "0.12" diff --git a/src/client/examples/logical.rs b/src/client/examples/logical.rs index 9e00269f2fb3..9ea6cdc42fdd 100644 --- a/src/client/examples/logical.rs
+++ b/src/client/examples/logical.rs @@ -41,7 +41,7 @@ async fn run() { column_defs: vec![ ColumnDef { name: "timestamp".to_string(), - datatype: ColumnDataType::Timestamp as i32, + datatype: ColumnDataType::TimestampMillisecond as i32, is_nullable: false, default_constraint: None, }, diff --git a/src/client/src/database.rs b/src/client/src/database.rs index 54ab889bf5d9..2dcc62569b62 100644 --- a/src/client/src/database.rs +++ b/src/client/src/database.rs @@ -318,12 +318,11 @@ mod tests { fn create_test_column(vector: VectorRef) -> Column { let wrapper: ColumnDataTypeWrapper = vector.data_type().try_into().unwrap(); - let array = vector.to_arrow_array(); Column { column_name: "test".to_string(), semantic_type: 1, - values: Some(values(&[array.clone()]).unwrap()), - null_mask: null_mask(&vec![array], vector.len()), + values: Some(values(&[vector.clone()]).unwrap()), + null_mask: null_mask(&[vector.clone()], vector.len()), datatype: wrapper.datatype() as i32, } } diff --git a/src/cmd/src/frontend.rs b/src/cmd/src/frontend.rs index 3b98332b3357..6bea05ce676d 100644 --- a/src/cmd/src/frontend.rs +++ b/src/cmd/src/frontend.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::sync::Arc; - use anymap::AnyMap; use clap::Parser; use frontend::frontend::{Frontend, FrontendOptions}; @@ -138,14 +136,14 @@ impl TryFrom for FrontendOptions { if let Some(addr) = cmd.mysql_addr { opts.mysql_options = Some(MysqlOptions { addr, - tls: Arc::new(tls_option.clone()), + tls: tls_option.clone(), ..Default::default() }); } if let Some(addr) = cmd.postgres_addr { opts.postgres_options = Some(PostgresOptions { addr, - tls: Arc::new(tls_option), + tls: tls_option, ..Default::default() }); } diff --git a/src/cmd/src/standalone.rs b/src/cmd/src/standalone.rs index 42f1e0a71e0f..d4b65c3a85c7 100644 --- a/src/cmd/src/standalone.rs +++ b/src/cmd/src/standalone.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::sync::Arc; - use anymap::AnyMap; use clap::Parser; use common_telemetry::info; @@ -262,12 +260,12 @@ impl TryFrom for FrontendOptions { let tls_option = TlsOption::new(cmd.tls_mode, cmd.tls_cert_path, cmd.tls_key_path); if let Some(mut mysql_options) = opts.mysql_options { - mysql_options.tls = Arc::new(tls_option.clone()); + mysql_options.tls = tls_option.clone(); opts.mysql_options = Some(mysql_options); } if let Some(mut postgres_options) = opts.postgres_options { - postgres_options.tls = Arc::new(tls_option); + postgres_options.tls = tls_option; opts.postgres_options = Some(postgres_options); } diff --git a/src/common/function/Cargo.toml b/src/common/function/Cargo.toml index b14738fe94ae..ce49cb5e5b83 100644 --- a/src/common/function/Cargo.toml +++ b/src/common/function/Cargo.toml @@ -11,7 +11,7 @@ common-error = { path = "../error" } common-function-macro = { path = "../function-macro" } common-query = { path = "../query" } common-time = { path = "../time" } -datafusion-common = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" } +datafusion-common = "14.0.0" datatypes = { path = "../../datatypes" } libc = "0.2" num = "0.4" diff --git a/src/common/function/src/error.rs b/src/common/function/src/error.rs deleted file mode 100644 index 73c3928a009c..000000000000 --- a/src/common/function/src/error.rs +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::any::Any; - -use common_error::prelude::*; -pub use common_query::error::{Error, Result}; -use datatypes::error::Error as DataTypeError; - -#[derive(Debug, Snafu)] -#[snafu(visibility(pub))] -pub enum InnerError { - #[snafu(display("Fail to get scalar vector, {}", source))] - GetScalarVector { - source: DataTypeError, - backtrace: Backtrace, - }, -} - -impl ErrorExt for InnerError { - fn backtrace_opt(&self) -> Option<&Backtrace> { - ErrorCompat::backtrace(self) - } - - fn as_any(&self) -> &dyn Any { - self - } -} - -impl From<InnerError> for Error { - fn from(err: InnerError) -> Self { - Self::new(err) - } -} - -#[cfg(test)] -mod tests { - use snafu::GenerateImplicitData; - - use super::*; - - fn raise_datatype_error() -> std::result::Result<(), DataTypeError> { - Err(DataTypeError::Conversion { - from: "test".to_string(), - backtrace: Backtrace::generate(), - }) - } - - #[test] - fn test_get_scalar_vector_error() { - let err: Error = raise_datatype_error() - .context(GetScalarVectorSnafu) - .err() - .unwrap() - .into(); - assert!(err.backtrace_opt().is_some()); - } -} diff --git a/src/common/function/src/lib.rs b/src/common/function/src/lib.rs index 5a1b8edacb69..8d15fe0b25c2 100644 --- a/src/common/function/src/lib.rs +++ b/src/common/function/src/lib.rs @@ -12,5 +12,4 @@ // See the License for the specific language governing permissions and // limitations under the License.
-pub mod error; pub mod scalars; diff --git a/src/common/function/src/scalars.rs b/src/common/function/src/scalars.rs index d362ea5f897d..e9499b215140 100644 --- a/src/common/function/src/scalars.rs +++ b/src/common/function/src/scalars.rs @@ -23,6 +23,5 @@ pub(crate) mod test; mod timestamp; pub mod udf; -pub use aggregate::MedianAccumulatorCreator; pub use function::{Function, FunctionRef}; pub use function_registry::{FunctionRegistry, FUNCTION_REGISTRY}; diff --git a/src/common/function/src/scalars/aggregate/mod.rs b/src/common/function/src/scalars/aggregate.rs similarity index 96% rename from src/common/function/src/scalars/aggregate/mod.rs rename to src/common/function/src/scalars/aggregate.rs index 8a4712a1b825..f605fff2f2f6 100644 --- a/src/common/function/src/scalars/aggregate/mod.rs +++ b/src/common/function/src/scalars/aggregate.rs @@ -16,7 +16,6 @@ mod argmax; mod argmin; mod diff; mod mean; -mod median; mod percentile; mod polyval; mod scipy_stats_norm_cdf; @@ -29,7 +28,6 @@ pub use argmin::ArgminAccumulatorCreator; use common_query::logical_plan::AggregateFunctionCreatorRef; pub use diff::DiffAccumulatorCreator; pub use mean::MeanAccumulatorCreator; -pub use median::MedianAccumulatorCreator; pub use percentile::PercentileAccumulatorCreator; pub use polyval::PolyvalAccumulatorCreator; pub use scipy_stats_norm_cdf::ScipyStatsNormCdfAccumulatorCreator; @@ -88,7 +86,6 @@ impl AggregateFunctions { }; } - register_aggr_func!("median", 1, MedianAccumulatorCreator); register_aggr_func!("diff", 1, DiffAccumulatorCreator); register_aggr_func!("mean", 1, MeanAccumulatorCreator); register_aggr_func!("polyval", 2, PolyvalAccumulatorCreator); diff --git a/src/common/function/src/scalars/aggregate/argmax.rs b/src/common/function/src/scalars/aggregate/argmax.rs index 0b63a766bd41..d42d4550c65a 100644 --- a/src/common/function/src/scalars/aggregate/argmax.rs +++ b/src/common/function/src/scalars/aggregate/argmax.rs @@ -20,24 +20,22 @@ use common_query::error::{BadAccumulatorImplSnafu, CreateAccumulatorSnafu, Resul use common_query::logical_plan::{Accumulator, AggregateFunctionCreator}; use common_query::prelude::*; use datatypes::prelude::*; -use datatypes::vectors::ConstantVector; +use datatypes::types::{LogicalPrimitiveType, WrapperType}; +use datatypes::vectors::{ConstantVector, Helper}; use datatypes::with_match_primitive_type_id; use snafu::ensure; // https://numpy.org/doc/stable/reference/generated/numpy.argmax.html // return the index of the max value #[derive(Debug, Default)] -pub struct Argmax -where - T: Primitive + PartialOrd, -{ +pub struct Argmax { max: Option, n: u64, } impl Argmax where - T: Primitive + PartialOrd, + T: PartialOrd + Copy, { fn update(&mut self, value: T, index: u64) { if let Some(Ordering::Less) = self.max.partial_cmp(&Some(value)) { @@ -49,8 +47,7 @@ where impl Accumulator for Argmax where - T: Primitive + PartialOrd, - for<'a> T: Scalar = T>, + T: WrapperType + PartialOrd, { fn state(&self) -> Result> { match self.max { @@ -66,10 +63,10 @@ where let column = &values[0]; let column: &::VectorType = if column.is_const() { - let column: &ConstantVector = unsafe { VectorHelper::static_cast(column) }; - unsafe { VectorHelper::static_cast(column.inner()) } + let column: &ConstantVector = unsafe { Helper::static_cast(column) }; + unsafe { Helper::static_cast(column.inner()) } } else { - unsafe { VectorHelper::static_cast(column) } + unsafe { Helper::static_cast(column) } }; for (i, v) in column.iter_data().enumerate() { if let Some(value) = v { @@ -93,8 +90,8 @@ 
where let max = &states[0]; let index = &states[1]; - let max: &::VectorType = unsafe { VectorHelper::static_cast(max) }; - let index: &::VectorType = unsafe { VectorHelper::static_cast(index) }; + let max: &::VectorType = unsafe { Helper::static_cast(max) }; + let index: &::VectorType = unsafe { Helper::static_cast(index) }; index .iter_data() .flatten() @@ -122,7 +119,7 @@ impl AggregateFunctionCreator for ArgmaxAccumulatorCreator { with_match_primitive_type_id!( input_type.logical_type_id(), |$S| { - Ok(Box::new(Argmax::<$S>::default())) + Ok(Box::new(Argmax::<<$S as LogicalPrimitiveType>::Wrapper>::default())) }, { let err_msg = format!( @@ -154,7 +151,7 @@ impl AggregateFunctionCreator for ArgmaxAccumulatorCreator { #[cfg(test)] mod test { - use datatypes::vectors::PrimitiveVector; + use datatypes::vectors::Int32Vector; use super::*; #[test] @@ -166,21 +163,19 @@ mod test { // test update one not-null value let mut argmax = Argmax::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![Some(42)]))]; + let v: Vec = vec![Arc::new(Int32Vector::from(vec![Some(42)]))]; assert!(argmax.update_batch(&v).is_ok()); assert_eq!(Value::from(0_u64), argmax.evaluate().unwrap()); // test update one null value let mut argmax = Argmax::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![ - Option::::None, - ]))]; + let v: Vec = vec![Arc::new(Int32Vector::from(vec![Option::::None]))]; assert!(argmax.update_batch(&v).is_ok()); assert_eq!(Value::Null, argmax.evaluate().unwrap()); // test update no null-value batch let mut argmax = Argmax::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![ + let v: Vec = vec![Arc::new(Int32Vector::from(vec![ Some(-1i32), Some(1), Some(3), @@ -190,7 +185,7 @@ mod test { // test update null-value batch let mut argmax = Argmax::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![ + let v: Vec = vec![Arc::new(Int32Vector::from(vec![ Some(-2i32), None, Some(4), @@ -201,7 +196,7 @@ mod test { // test update with constant vector let mut argmax = Argmax::::default(); let v: Vec = vec![Arc::new(ConstantVector::new( - Arc::new(PrimitiveVector::::from_vec(vec![4])), + Arc::new(Int32Vector::from_vec(vec![4])), 10, ))]; assert!(argmax.update_batch(&v).is_ok()); diff --git a/src/common/function/src/scalars/aggregate/argmin.rs b/src/common/function/src/scalars/aggregate/argmin.rs index bcbd6571c5b8..5b9356128613 100644 --- a/src/common/function/src/scalars/aggregate/argmin.rs +++ b/src/common/function/src/scalars/aggregate/argmin.rs @@ -20,23 +20,20 @@ use common_query::error::{BadAccumulatorImplSnafu, CreateAccumulatorSnafu, Resul use common_query::logical_plan::{Accumulator, AggregateFunctionCreator}; use common_query::prelude::*; use datatypes::prelude::*; -use datatypes::vectors::ConstantVector; +use datatypes::vectors::{ConstantVector, Helper}; use datatypes::with_match_primitive_type_id; use snafu::ensure; // // https://numpy.org/doc/stable/reference/generated/numpy.argmin.html #[derive(Debug, Default)] -pub struct Argmin -where - T: Primitive + PartialOrd, -{ +pub struct Argmin { min: Option, n: u32, } impl Argmin where - T: Primitive + PartialOrd, + T: Copy + PartialOrd, { fn update(&mut self, value: T, index: u32) { match self.min { @@ -56,8 +53,7 @@ where impl Accumulator for Argmin where - T: Primitive + PartialOrd, - for<'a> T: Scalar = T>, + T: WrapperType + PartialOrd, { fn state(&self) -> Result> { match self.min { @@ -75,10 +71,10 @@ where let column = &values[0]; let column: &::VectorType = if 
column.is_const() { - let column: &ConstantVector = unsafe { VectorHelper::static_cast(column) }; - unsafe { VectorHelper::static_cast(column.inner()) } + let column: &ConstantVector = unsafe { Helper::static_cast(column) }; + unsafe { Helper::static_cast(column.inner()) } } else { - unsafe { VectorHelper::static_cast(column) } + unsafe { Helper::static_cast(column) } }; for (i, v) in column.iter_data().enumerate() { if let Some(value) = v { @@ -102,8 +98,8 @@ where let min = &states[0]; let index = &states[1]; - let min: &::VectorType = unsafe { VectorHelper::static_cast(min) }; - let index: &::VectorType = unsafe { VectorHelper::static_cast(index) }; + let min: &::VectorType = unsafe { Helper::static_cast(min) }; + let index: &::VectorType = unsafe { Helper::static_cast(index) }; index .iter_data() .flatten() @@ -131,7 +127,7 @@ impl AggregateFunctionCreator for ArgminAccumulatorCreator { with_match_primitive_type_id!( input_type.logical_type_id(), |$S| { - Ok(Box::new(Argmin::<$S>::default())) + Ok(Box::new(Argmin::<<$S as LogicalPrimitiveType>::Wrapper>::default())) }, { let err_msg = format!( @@ -163,7 +159,7 @@ impl AggregateFunctionCreator for ArgminAccumulatorCreator { #[cfg(test)] mod test { - use datatypes::vectors::PrimitiveVector; + use datatypes::vectors::Int32Vector; use super::*; #[test] @@ -175,21 +171,19 @@ mod test { // test update one not-null value let mut argmin = Argmin::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![Some(42)]))]; + let v: Vec = vec![Arc::new(Int32Vector::from(vec![Some(42)]))]; assert!(argmin.update_batch(&v).is_ok()); assert_eq!(Value::from(0_u32), argmin.evaluate().unwrap()); // test update one null value let mut argmin = Argmin::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![ - Option::::None, - ]))]; + let v: Vec = vec![Arc::new(Int32Vector::from(vec![Option::::None]))]; assert!(argmin.update_batch(&v).is_ok()); assert_eq!(Value::Null, argmin.evaluate().unwrap()); // test update no null-value batch let mut argmin = Argmin::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![ + let v: Vec = vec![Arc::new(Int32Vector::from(vec![ Some(-1i32), Some(1), Some(3), @@ -199,7 +193,7 @@ mod test { // test update null-value batch let mut argmin = Argmin::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![ + let v: Vec = vec![Arc::new(Int32Vector::from(vec![ Some(-2i32), None, Some(4), @@ -210,7 +204,7 @@ mod test { // test update with constant vector let mut argmin = Argmin::::default(); let v: Vec = vec![Arc::new(ConstantVector::new( - Arc::new(PrimitiveVector::::from_vec(vec![4])), + Arc::new(Int32Vector::from_vec(vec![4])), 10, ))]; assert!(argmin.update_batch(&v).is_ok()); diff --git a/src/common/function/src/scalars/aggregate/diff.rs b/src/common/function/src/scalars/aggregate/diff.rs index d0e7ca340646..3f7ecc24004f 100644 --- a/src/common/function/src/scalars/aggregate/diff.rs +++ b/src/common/function/src/scalars/aggregate/diff.rs @@ -22,40 +22,32 @@ use common_query::error::{ use common_query::logical_plan::{Accumulator, AggregateFunctionCreator}; use common_query::prelude::*; use datatypes::prelude::*; -use datatypes::types::PrimitiveType; use datatypes::value::ListValue; -use datatypes::vectors::{ConstantVector, ListVector}; +use datatypes::vectors::{ConstantVector, Helper, ListVector}; use datatypes::with_match_primitive_type_id; use num_traits::AsPrimitive; use snafu::{ensure, OptionExt, ResultExt}; // 
https://numpy.org/doc/stable/reference/generated/numpy.diff.html +// I is the input type, O is the output type. #[derive(Debug, Default)] -pub struct Diff -where - T: Primitive + AsPrimitive, - SubT: Primitive + std::ops::Sub, -{ - values: Vec, - _phantom: PhantomData, +pub struct Diff { + values: Vec, + _phantom: PhantomData, } -impl Diff -where - T: Primitive + AsPrimitive, - SubT: Primitive + std::ops::Sub, -{ - fn push(&mut self, value: T) { +impl Diff { + fn push(&mut self, value: I) { self.values.push(value); } } -impl Accumulator for Diff +impl Accumulator for Diff where - T: Primitive + AsPrimitive, - for<'a> T: Scalar = T>, - SubT: Primitive + std::ops::Sub, - for<'a> SubT: Scalar = SubT>, + I: WrapperType, + O: WrapperType, + I::Native: AsPrimitive, + O::Native: std::ops::Sub, { fn state(&self) -> Result> { let nums = self @@ -65,7 +57,7 @@ where .collect::>(); Ok(vec![Value::List(ListValue::new( Some(Box::new(nums)), - T::default().into().data_type(), + I::LogicalType::build_data_type(), ))]) } @@ -78,12 +70,12 @@ where let column = &values[0]; let mut len = 1; - let column: &::VectorType = if column.is_const() { + let column: &::VectorType = if column.is_const() { len = column.len(); - let column: &ConstantVector = unsafe { VectorHelper::static_cast(column) }; - unsafe { VectorHelper::static_cast(column.inner()) } + let column: &ConstantVector = unsafe { Helper::static_cast(column) }; + unsafe { Helper::static_cast(column.inner()) } } else { - unsafe { VectorHelper::static_cast(column) } + unsafe { Helper::static_cast(column) } }; (0..len).for_each(|_| { for v in column.iter_data().flatten() { @@ -109,8 +101,9 @@ where ), })?; for state in states.values_iter() { - let state = state.context(FromScalarValueSnafu)?; - self.update_batch(&[state])? + if let Some(state) = state.context(FromScalarValueSnafu)? 
{ + self.update_batch(&[state])?; + } } Ok(()) } @@ -122,11 +115,14 @@ where let diff = self .values .windows(2) - .map(|x| (x[1].as_() - x[0].as_()).into()) + .map(|x| { + let native = x[1].into_native().as_() - x[0].into_native().as_(); + O::from_native(native).into() + }) .collect::>(); let diff = Value::List(ListValue::new( Some(Box::new(diff)), - SubT::default().into().data_type(), + O::LogicalType::build_data_type(), )); Ok(diff) } @@ -143,7 +139,7 @@ impl AggregateFunctionCreator for DiffAccumulatorCreator { with_match_primitive_type_id!( input_type.logical_type_id(), |$S| { - Ok(Box::new(Diff::<$S,<$S as Primitive>::LargestType>::default())) + Ok(Box::new(Diff::<<$S as LogicalPrimitiveType>::Wrapper, <<$S as LogicalPrimitiveType>::LargestType as LogicalPrimitiveType>::Wrapper>::default())) }, { let err_msg = format!( @@ -163,7 +159,7 @@ impl AggregateFunctionCreator for DiffAccumulatorCreator { with_match_primitive_type_id!( input_types[0].logical_type_id(), |$S| { - Ok(ConcreteDataType::list_datatype(PrimitiveType::<<$S as Primitive>::LargestType>::default().into())) + Ok(ConcreteDataType::list_datatype($S::default().into())) }, { unreachable!() @@ -177,7 +173,7 @@ impl AggregateFunctionCreator for DiffAccumulatorCreator { with_match_primitive_type_id!( input_types[0].logical_type_id(), |$S| { - Ok(vec![ConcreteDataType::list_datatype(PrimitiveType::<$S>::default().into())]) + Ok(vec![ConcreteDataType::list_datatype($S::default().into())]) }, { unreachable!() @@ -188,9 +184,10 @@ impl AggregateFunctionCreator for DiffAccumulatorCreator { #[cfg(test)] mod test { - use datatypes::vectors::PrimitiveVector; + use datatypes::vectors::Int32Vector; use super::*; + #[test] fn test_update_batch() { // test update empty batch, expect not updating anything @@ -201,21 +198,19 @@ mod test { // test update one not-null value let mut diff = Diff::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![Some(42)]))]; + let v: Vec = vec![Arc::new(Int32Vector::from(vec![Some(42)]))]; assert!(diff.update_batch(&v).is_ok()); assert_eq!(Value::Null, diff.evaluate().unwrap()); // test update one null value let mut diff = Diff::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![ - Option::::None, - ]))]; + let v: Vec = vec![Arc::new(Int32Vector::from(vec![Option::::None]))]; assert!(diff.update_batch(&v).is_ok()); assert_eq!(Value::Null, diff.evaluate().unwrap()); // test update no null-value batch let mut diff = Diff::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![ + let v: Vec = vec![Arc::new(Int32Vector::from(vec![ Some(-1i32), Some(1), Some(2), @@ -232,7 +227,7 @@ mod test { // test update null-value batch let mut diff = Diff::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![ + let v: Vec = vec![Arc::new(Int32Vector::from(vec![ Some(-2i32), None, Some(3), @@ -251,7 +246,7 @@ mod test { // test update with constant vector let mut diff = Diff::::default(); let v: Vec = vec![Arc::new(ConstantVector::new( - Arc::new(PrimitiveVector::::from_vec(vec![4])), + Arc::new(Int32Vector::from_vec(vec![4])), 4, ))]; let values = vec![Value::from(0_i64), Value::from(0_i64), Value::from(0_i64)]; diff --git a/src/common/function/src/scalars/aggregate/mean.rs b/src/common/function/src/scalars/aggregate/mean.rs index 2393a58cd2ef..ce619bb2532a 100644 --- a/src/common/function/src/scalars/aggregate/mean.rs +++ b/src/common/function/src/scalars/aggregate/mean.rs @@ -22,16 +22,14 @@ use common_query::error::{ use 
common_query::logical_plan::{Accumulator, AggregateFunctionCreator}; use common_query::prelude::*; use datatypes::prelude::*; -use datatypes::vectors::{ConstantVector, Float64Vector, UInt64Vector}; +use datatypes::types::WrapperType; +use datatypes::vectors::{ConstantVector, Float64Vector, Helper, UInt64Vector}; use datatypes::with_match_primitive_type_id; use num_traits::AsPrimitive; use snafu::{ensure, OptionExt}; #[derive(Debug, Default)] -pub struct Mean -where - T: Primitive + AsPrimitive, -{ +pub struct Mean { sum: f64, n: u64, _phantom: PhantomData, @@ -39,11 +37,12 @@ where impl Mean where - T: Primitive + AsPrimitive, + T: WrapperType, + T::Native: AsPrimitive, { #[inline(always)] fn push(&mut self, value: T) { - self.sum += value.as_(); + self.sum += value.into_native().as_(); self.n += 1; } @@ -56,8 +55,8 @@ where impl Accumulator for Mean where - T: Primitive + AsPrimitive, - for<'a> T: Scalar = T>, + T: WrapperType, + T::Native: AsPrimitive, { fn state(&self) -> Result> { Ok(vec![self.sum.into(), self.n.into()]) @@ -73,10 +72,10 @@ where let mut len = 1; let column: &::VectorType = if column.is_const() { len = column.len(); - let column: &ConstantVector = unsafe { VectorHelper::static_cast(column) }; - unsafe { VectorHelper::static_cast(column.inner()) } + let column: &ConstantVector = unsafe { Helper::static_cast(column) }; + unsafe { Helper::static_cast(column.inner()) } } else { - unsafe { VectorHelper::static_cast(column) } + unsafe { Helper::static_cast(column) } }; (0..len).for_each(|_| { for v in column.iter_data().flatten() { @@ -150,7 +149,7 @@ impl AggregateFunctionCreator for MeanAccumulatorCreator { with_match_primitive_type_id!( input_type.logical_type_id(), |$S| { - Ok(Box::new(Mean::<$S>::default())) + Ok(Box::new(Mean::<<$S as LogicalPrimitiveType>::Native>::default())) }, { let err_msg = format!( @@ -182,7 +181,7 @@ impl AggregateFunctionCreator for MeanAccumulatorCreator { #[cfg(test)] mod test { - use datatypes::vectors::PrimitiveVector; + use datatypes::vectors::Int32Vector; use super::*; #[test] @@ -194,21 +193,19 @@ mod test { // test update one not-null value let mut mean = Mean::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![Some(42)]))]; + let v: Vec = vec![Arc::new(Int32Vector::from(vec![Some(42)]))]; assert!(mean.update_batch(&v).is_ok()); assert_eq!(Value::from(42.0_f64), mean.evaluate().unwrap()); // test update one null value let mut mean = Mean::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![ - Option::::None, - ]))]; + let v: Vec = vec![Arc::new(Int32Vector::from(vec![Option::::None]))]; assert!(mean.update_batch(&v).is_ok()); assert_eq!(Value::Null, mean.evaluate().unwrap()); // test update no null-value batch let mut mean = Mean::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![ + let v: Vec = vec![Arc::new(Int32Vector::from(vec![ Some(-1i32), Some(1), Some(2), @@ -218,7 +215,7 @@ mod test { // test update null-value batch let mut mean = Mean::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![ + let v: Vec = vec![Arc::new(Int32Vector::from(vec![ Some(-2i32), None, Some(3), @@ -230,7 +227,7 @@ mod test { // test update with constant vector let mut mean = Mean::::default(); let v: Vec = vec![Arc::new(ConstantVector::new( - Arc::new(PrimitiveVector::::from_vec(vec![4])), + Arc::new(Int32Vector::from_vec(vec![4])), 10, ))]; assert!(mean.update_batch(&v).is_ok()); diff --git a/src/common/function/src/scalars/aggregate/median.rs 
b/src/common/function/src/scalars/aggregate/median.rs
deleted file mode 100644
index 4c445c0fb912..000000000000
--- a/src/common/function/src/scalars/aggregate/median.rs
+++ /dev/null
@@ -1,289 +0,0 @@
-// Copyright 2022 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::cmp::Reverse;
-use std::collections::BinaryHeap;
-use std::sync::Arc;
-
-use common_function_macro::{as_aggr_func_creator, AggrFuncTypeStore};
-use common_query::error::{
-    CreateAccumulatorSnafu, DowncastVectorSnafu, FromScalarValueSnafu, Result,
-};
-use common_query::logical_plan::{Accumulator, AggregateFunctionCreator};
-use common_query::prelude::*;
-use datatypes::prelude::*;
-use datatypes::types::OrdPrimitive;
-use datatypes::value::ListValue;
-use datatypes::vectors::{ConstantVector, ListVector};
-use datatypes::with_match_primitive_type_id;
-use num::NumCast;
-use snafu::{ensure, OptionExt, ResultExt};
-
-// This median calculation algorithm's details can be found at
-// https://leetcode.cn/problems/find-median-from-data-stream/
-//
-// Basically, it uses two heaps, a maximum heap and a minimum. The maximum heap stores numbers that
-// are not greater than the median, and the minimum heap stores the greater. In a streaming of
-// numbers, when a number is arrived, we adjust the heaps' tops, so that either one top is the
-// median or both tops can be averaged to get the median.
-//
-// The time complexity to update the median is O(logn), O(1) to get the median; and the space
-// complexity is O(n). (Ignore the costs for heap expansion.)
-//
-// From the point of algorithm, [quick select](https://en.wikipedia.org/wiki/Quickselect) might be
-// better. But to use quick select here, we need a mutable self in the final calculation(`evaluate`)
-// to swap stored numbers in the states vector. Though we can make our `evaluate` received
-// `&mut self`, DataFusion calls our accumulator with `&self` (see `DfAccumulatorAdaptor`). That
-// means we have to introduce some kinds of interior mutability, and the overhead is not neglectable.
-//
-// TODO(LFC): Use quick select to get median when we can modify DataFusion's code, and benchmark with two-heap algorithm.
-#[derive(Debug, Default)]
-pub struct Median<T>
-where
-    T: Primitive,
-{
-    greater: BinaryHeap<Reverse<OrdPrimitive<T>>>,
-    not_greater: BinaryHeap<OrdPrimitive<T>>,
-}
-
-impl<T> Median<T>
-where
-    T: Primitive,
-{
-    fn push(&mut self, value: T) {
-        let value = OrdPrimitive::<T>(value);
-
-        if self.not_greater.is_empty() {
-            self.not_greater.push(value);
-            return;
-        }
-        // The `unwrap`s below are safe because there are `push`s before them.
-        if value <= *self.not_greater.peek().unwrap() {
-            self.not_greater.push(value);
-            if self.not_greater.len() > self.greater.len() + 1 {
-                self.greater.push(Reverse(self.not_greater.pop().unwrap()));
-            }
-        } else {
-            self.greater.push(Reverse(value));
-            if self.greater.len() > self.not_greater.len() {
-                self.not_greater.push(self.greater.pop().unwrap().0);
-            }
-        }
-    }
-}
-
-// UDAFs are built using the trait `Accumulator`, that offers DataFusion the necessary functions
-// to use them.
-impl<T> Accumulator for Median<T>
-where
-    T: Primitive,
-    for<'a> T: Scalar<RefType<'a> = T>,
-{
-    // This function serializes our state to `ScalarValue`, which DataFusion uses to pass this
-    // state between execution stages. Note that this can be arbitrary data.
-    //
-    // The `ScalarValue`s returned here will be passed in as argument `states: &[VectorRef]` to
-    // `merge_batch` function.
-    fn state(&self) -> Result<Vec<Value>> {
-        let nums = self
-            .greater
-            .iter()
-            .map(|x| &x.0)
-            .chain(self.not_greater.iter())
-            .map(|&n| n.into())
-            .collect::<Vec<Value>>();
-        Ok(vec![Value::List(ListValue::new(
-            Some(Box::new(nums)),
-            T::default().into().data_type(),
-        ))])
-    }
-
-    // DataFusion calls this function to update the accumulator's state for a batch of inputs rows.
-    // It is expected this function to update the accumulator's state.
-    fn update_batch(&mut self, values: &[VectorRef]) -> Result<()> {
-        if values.is_empty() {
-            return Ok(());
-        }
-
-        ensure!(values.len() == 1, InvalidInputStateSnafu);
-
-        // This is a unary accumulator, so only one column is provided.
-        let column = &values[0];
-        let mut len = 1;
-        let column: &<T as Scalar>::VectorType = if column.is_const() {
-            len = column.len();
-            let column: &ConstantVector = unsafe { VectorHelper::static_cast(column) };
-            unsafe { VectorHelper::static_cast(column.inner()) }
-        } else {
-            unsafe { VectorHelper::static_cast(column) }
-        };
-        (0..len).for_each(|_| {
-            for v in column.iter_data().flatten() {
-                self.push(v);
-            }
-        });
-        Ok(())
-    }
-
-    // DataFusion executes accumulators in partitions. In some execution stage, DataFusion will
-    // merge states from other accumulators (returned by `state()` method).
-    fn merge_batch(&mut self, states: &[VectorRef]) -> Result<()> {
-        if states.is_empty() {
-            return Ok(());
-        }
-
-        // The states here are returned by the `state` method. Since we only returned a vector
-        // with one value in that method, `states[0]` is fine.
-        let states = &states[0];
-        let states = states
-            .as_any()
-            .downcast_ref::<ListVector>()
-            .with_context(|| DowncastVectorSnafu {
-                err_msg: format!(
-                    "expect ListVector, got vector type {}",
-                    states.vector_type_name()
-                ),
-            })?;
-        for state in states.values_iter() {
-            let state = state.context(FromScalarValueSnafu)?;
-            // merging state is simply accumulate stored numbers from others', so just call update
-            self.update_batch(&[state])?
-        }
-        Ok(())
-    }
-
-    // DataFusion expects this function to return the final value of this aggregator.
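(Aside, before the deleted `evaluate` resumes below: the two-heap technique this file implemented is worth a standalone illustration. StreamingMedian is a minimal sketch over plain i64, sidestepping the OrdPrimitive wrapper the real code needed for float ordering; the name and API are illustrative only.)

use std::cmp::Reverse;
use std::collections::BinaryHeap;

struct StreamingMedian {
    not_greater: BinaryHeap<i64>,      // max-heap: values not greater than the median
    greater: BinaryHeap<Reverse<i64>>, // min-heap: values greater than the median
}

impl StreamingMedian {
    fn new() -> Self {
        StreamingMedian { not_greater: BinaryHeap::new(), greater: BinaryHeap::new() }
    }

    fn push(&mut self, value: i64) {
        if self.not_greater.peek().map_or(true, |&top| value <= top) {
            self.not_greater.push(value);
            // Rebalance so not_greater holds at most one extra element.
            if self.not_greater.len() > self.greater.len() + 1 {
                self.greater.push(Reverse(self.not_greater.pop().unwrap()));
            }
        } else {
            self.greater.push(Reverse(value));
            if self.greater.len() > self.not_greater.len() {
                self.not_greater.push(self.greater.pop().unwrap().0);
            }
        }
    }

    fn median(&self) -> Option<f64> {
        let low = *self.not_greater.peek()?; // None only when no values were pushed
        if self.not_greater.len() > self.greater.len() {
            Some(low as f64)
        } else {
            // Equal sizes: average the two middle elements.
            let Reverse(high) = *self.greater.peek().unwrap();
            Some((low as f64 + high as f64) / 2.0)
        }
    }
}

(Pushing 1, 2, 3, 4 yields median() == Some(2.5); a fifth value 5 tips not_greater one element larger and the median becomes Some(3.0).)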
- fn evaluate(&self) -> Result { - if self.not_greater.is_empty() { - assert!( - self.greater.is_empty(), - "not expected in two-heap median algorithm, there must be a bug when implementing it" - ); - return Ok(Value::Null); - } - - // unwrap is safe because we checked not_greater heap's len above - let not_greater = *self.not_greater.peek().unwrap(); - let median = if self.not_greater.len() > self.greater.len() { - not_greater.into() - } else { - // unwrap is safe because greater heap len >= not_greater heap len, which is > 0 - let greater = self.greater.peek().unwrap(); - - // the following three NumCast's `unwrap`s are safe because T is primitive - let not_greater_v: f64 = NumCast::from(not_greater.as_primitive()).unwrap(); - let greater_v: f64 = NumCast::from(greater.0.as_primitive()).unwrap(); - let median: T = NumCast::from((not_greater_v + greater_v) / 2.0).unwrap(); - median.into() - }; - Ok(median) - } -} - -#[as_aggr_func_creator] -#[derive(Debug, Default, AggrFuncTypeStore)] -pub struct MedianAccumulatorCreator {} - -impl AggregateFunctionCreator for MedianAccumulatorCreator { - fn creator(&self) -> AccumulatorCreatorFunction { - let creator: AccumulatorCreatorFunction = Arc::new(move |types: &[ConcreteDataType]| { - let input_type = &types[0]; - with_match_primitive_type_id!( - input_type.logical_type_id(), - |$S| { - Ok(Box::new(Median::<$S>::default())) - }, - { - let err_msg = format!( - "\"MEDIAN\" aggregate function not support data type {:?}", - input_type.logical_type_id(), - ); - CreateAccumulatorSnafu { err_msg }.fail()? - } - ) - }); - creator - } - - fn output_type(&self) -> Result { - let input_types = self.input_types()?; - ensure!(input_types.len() == 1, InvalidInputStateSnafu); - // unwrap is safe because we have checked input_types len must equals 1 - Ok(input_types.into_iter().next().unwrap()) - } - - fn state_types(&self) -> Result> { - Ok(vec![ConcreteDataType::list_datatype(self.output_type()?)]) - } -} - -#[cfg(test)] -mod test { - use datatypes::vectors::PrimitiveVector; - - use super::*; - #[test] - fn test_update_batch() { - // test update empty batch, expect not updating anything - let mut median = Median::::default(); - assert!(median.update_batch(&[]).is_ok()); - assert!(median.not_greater.is_empty()); - assert!(median.greater.is_empty()); - assert_eq!(Value::Null, median.evaluate().unwrap()); - - // test update one not-null value - let mut median = Median::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![Some(42)]))]; - assert!(median.update_batch(&v).is_ok()); - assert_eq!(Value::Int32(42), median.evaluate().unwrap()); - - // test update one null value - let mut median = Median::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![ - Option::::None, - ]))]; - assert!(median.update_batch(&v).is_ok()); - assert_eq!(Value::Null, median.evaluate().unwrap()); - - // test update no null-value batch - let mut median = Median::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![ - Some(-1i32), - Some(1), - Some(2), - ]))]; - assert!(median.update_batch(&v).is_ok()); - assert_eq!(Value::Int32(1), median.evaluate().unwrap()); - - // test update null-value batch - let mut median = Median::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![ - Some(-2i32), - None, - Some(3), - Some(4), - ]))]; - assert!(median.update_batch(&v).is_ok()); - assert_eq!(Value::Int32(3), median.evaluate().unwrap()); - - // test update with constant vector - let mut median = Median::::default(); - let v: 
Vec = vec![Arc::new(ConstantVector::new( - Arc::new(PrimitiveVector::::from_vec(vec![4])), - 10, - ))]; - assert!(median.update_batch(&v).is_ok()); - assert_eq!(Value::Int32(4), median.evaluate().unwrap()); - } -} diff --git a/src/common/function/src/scalars/aggregate/percentile.rs b/src/common/function/src/scalars/aggregate/percentile.rs index 1b642dd27412..1517f90e6282 100644 --- a/src/common/function/src/scalars/aggregate/percentile.rs +++ b/src/common/function/src/scalars/aggregate/percentile.rs @@ -26,7 +26,7 @@ use common_query::prelude::*; use datatypes::prelude::*; use datatypes::types::OrdPrimitive; use datatypes::value::{ListValue, OrderedFloat}; -use datatypes::vectors::{ConstantVector, Float64Vector, ListVector}; +use datatypes::vectors::{ConstantVector, Float64Vector, Helper, ListVector}; use datatypes::with_match_primitive_type_id; use num::NumCast; use snafu::{ensure, OptionExt, ResultExt}; @@ -44,15 +44,15 @@ use snafu::{ensure, OptionExt, ResultExt}; // This optional method parameter specifies the method to use when the desired quantile lies between two data points i < j. // If g is the fractional part of the index surrounded by i and alpha and beta are correction constants modifying i and j. // i+g = (q-alpha)/(n-alpha-beta+1) -// Below, ‘q’ is the quantile value, ‘n’ is the sample size and alpha and beta are constants. The following formula gives an interpolation “i + g” of where the quantile would be in the sorted sample. -// With ‘i’ being the floor and ‘g’ the fractional part of the result. +// Below, 'q' is the quantile value, 'n' is the sample size and alpha and beta are constants. The following formula gives an interpolation "i + g" of where the quantile would be in the sorted sample. +// With 'i' being the floor and 'g' the fractional part of the result. 
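(Aside: a concrete reading of that formula, before the comment resumes with the linear defaults below. With alpha = beta = 1 the interpolated position collapses to p/100 * (n - 1) over the sorted sample; percentile_linear is an illustrative standalone function, not the accumulator's API.)

// Illustrative percentile with linear interpolation, numpy's default method.
// `sorted` must be ascending; `p` is in [0.0, 100.0]; returns None for empty input.
fn percentile_linear(sorted: &[f64], p: f64) -> Option<f64> {
    if sorted.is_empty() {
        return None;
    }
    let pos = p / 100.0 * (sorted.len() - 1) as f64; // the "i + g" from the comment above
    let i = pos.floor() as usize;
    let g = pos - pos.floor();
    if i + 1 < sorted.len() {
        Some(sorted[i] + g * (sorted[i + 1] - sorted[i]))
    } else {
        Some(sorted[i]) // p == 100.0 lands exactly on the last element
    }
}

(For the sample [4.0, 7.0, 10.0], p = 40 gives position 0.8, hence 4 + 0.8 * 3 = 6.4, matching the numpy cross-checks quoted in the tests further down.)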
// the default method is linear where // alpha = 1 // beta = 1 #[derive(Debug, Default)] pub struct Percentile where - T: Primitive, + T: WrapperType, { greater: BinaryHeap>>, not_greater: BinaryHeap>, @@ -62,7 +62,7 @@ where impl Percentile where - T: Primitive, + T: WrapperType, { fn push(&mut self, value: T) { let value = OrdPrimitive::(value); @@ -93,8 +93,7 @@ where impl Accumulator for Percentile where - T: Primitive, - for<'a> T: Scalar = T>, + T: WrapperType, { fn state(&self) -> Result> { let nums = self @@ -107,7 +106,7 @@ where Ok(vec![ Value::List(ListValue::new( Some(Box::new(nums)), - T::default().into().data_type(), + T::LogicalType::build_data_type(), )), self.p.into(), ]) @@ -129,14 +128,14 @@ where let mut len = 1; let column: &::VectorType = if column.is_const() { len = column.len(); - let column: &ConstantVector = unsafe { VectorHelper::static_cast(column) }; - unsafe { VectorHelper::static_cast(column.inner()) } + let column: &ConstantVector = unsafe { Helper::static_cast(column) }; + unsafe { Helper::static_cast(column.inner()) } } else { - unsafe { VectorHelper::static_cast(column) } + unsafe { Helper::static_cast(column) } }; let x = &values[1]; - let x = VectorHelper::check_get_scalar::(x).context(error::InvalidInputsSnafu { + let x = Helper::check_get_scalar::(x).context(error::InvalidInputTypeSnafu { err_msg: "expecting \"POLYVAL\" function's second argument to be float64", })?; // `get(0)` is safe because we have checked `values[1].len() == values[0].len() != 0` @@ -209,10 +208,11 @@ where ), })?; for value in values.values_iter() { - let value = value.context(FromScalarValueSnafu)?; - let column: &::VectorType = unsafe { VectorHelper::static_cast(&value) }; - for v in column.iter_data().flatten() { - self.push(v); + if let Some(value) = value.context(FromScalarValueSnafu)? 
{ + let column: &::VectorType = unsafe { Helper::static_cast(&value) }; + for v in column.iter_data().flatten() { + self.push(v); + } } } Ok(()) @@ -259,7 +259,7 @@ impl AggregateFunctionCreator for PercentileAccumulatorCreator { with_match_primitive_type_id!( input_type.logical_type_id(), |$S| { - Ok(Box::new(Percentile::<$S>::default())) + Ok(Box::new(Percentile::<<$S as LogicalPrimitiveType>::Wrapper>::default())) }, { let err_msg = format!( @@ -292,7 +292,7 @@ impl AggregateFunctionCreator for PercentileAccumulatorCreator { #[cfg(test)] mod test { - use datatypes::vectors::PrimitiveVector; + use datatypes::vectors::{Float64Vector, Int32Vector}; use super::*; #[test] @@ -307,8 +307,8 @@ mod test { // test update one not-null value let mut percentile = Percentile::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![Some(42)])), - Arc::new(PrimitiveVector::::from(vec![Some(100.0_f64)])), + Arc::new(Int32Vector::from(vec![Some(42)])), + Arc::new(Float64Vector::from(vec![Some(100.0_f64)])), ]; assert!(percentile.update_batch(&v).is_ok()); assert_eq!(Value::from(42.0_f64), percentile.evaluate().unwrap()); @@ -316,8 +316,8 @@ mod test { // test update one null value let mut percentile = Percentile::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![Option::::None])), - Arc::new(PrimitiveVector::::from(vec![Some(100.0_f64)])), + Arc::new(Int32Vector::from(vec![Option::::None])), + Arc::new(Float64Vector::from(vec![Some(100.0_f64)])), ]; assert!(percentile.update_batch(&v).is_ok()); assert_eq!(Value::Null, percentile.evaluate().unwrap()); @@ -325,12 +325,8 @@ mod test { // test update no null-value batch let mut percentile = Percentile::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![ - Some(-1i32), - Some(1), - Some(2), - ])), - Arc::new(PrimitiveVector::::from(vec![ + Arc::new(Int32Vector::from(vec![Some(-1i32), Some(1), Some(2)])), + Arc::new(Float64Vector::from(vec![ Some(100.0_f64), Some(100.0_f64), Some(100.0_f64), @@ -342,13 +338,8 @@ mod test { // test update null-value batch let mut percentile = Percentile::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![ - Some(-2i32), - None, - Some(3), - Some(4), - ])), - Arc::new(PrimitiveVector::::from(vec![ + Arc::new(Int32Vector::from(vec![Some(-2i32), None, Some(3), Some(4)])), + Arc::new(Float64Vector::from(vec![ Some(100.0_f64), Some(100.0_f64), Some(100.0_f64), @@ -362,13 +353,10 @@ mod test { let mut percentile = Percentile::::default(); let v: Vec = vec![ Arc::new(ConstantVector::new( - Arc::new(PrimitiveVector::::from_vec(vec![4])), + Arc::new(Int32Vector::from_vec(vec![4])), 2, )), - Arc::new(PrimitiveVector::::from(vec![ - Some(100.0_f64), - Some(100.0_f64), - ])), + Arc::new(Float64Vector::from(vec![Some(100.0_f64), Some(100.0_f64)])), ]; assert!(percentile.update_batch(&v).is_ok()); assert_eq!(Value::from(4_f64), percentile.evaluate().unwrap()); @@ -376,12 +364,8 @@ mod test { // test left border let mut percentile = Percentile::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![ - Some(-1i32), - Some(1), - Some(2), - ])), - Arc::new(PrimitiveVector::::from(vec![ + Arc::new(Int32Vector::from(vec![Some(-1i32), Some(1), Some(2)])), + Arc::new(Float64Vector::from(vec![ Some(0.0_f64), Some(0.0_f64), Some(0.0_f64), @@ -393,12 +377,8 @@ mod test { // test medium let mut percentile = Percentile::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![ - Some(-1i32), - Some(1), - Some(2), - ])), - 
Arc::new(PrimitiveVector::::from(vec![ + Arc::new(Int32Vector::from(vec![Some(-1i32), Some(1), Some(2)])), + Arc::new(Float64Vector::from(vec![ Some(50.0_f64), Some(50.0_f64), Some(50.0_f64), @@ -410,12 +390,8 @@ mod test { // test right border let mut percentile = Percentile::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![ - Some(-1i32), - Some(1), - Some(2), - ])), - Arc::new(PrimitiveVector::::from(vec![ + Arc::new(Int32Vector::from(vec![Some(-1i32), Some(1), Some(2)])), + Arc::new(Float64Vector::from(vec![ Some(100.0_f64), Some(100.0_f64), Some(100.0_f64), @@ -431,12 +407,8 @@ mod test { // >> 6.400000000000 let mut percentile = Percentile::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![ - Some(10i32), - Some(7), - Some(4), - ])), - Arc::new(PrimitiveVector::::from(vec![ + Arc::new(Int32Vector::from(vec![Some(10i32), Some(7), Some(4)])), + Arc::new(Float64Vector::from(vec![ Some(40.0_f64), Some(40.0_f64), Some(40.0_f64), @@ -451,12 +423,8 @@ mod test { // >> 9.7000000000000011 let mut percentile = Percentile::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![ - Some(10i32), - Some(7), - Some(4), - ])), - Arc::new(PrimitiveVector::::from(vec![ + Arc::new(Int32Vector::from(vec![Some(10i32), Some(7), Some(4)])), + Arc::new(Float64Vector::from(vec![ Some(95.0_f64), Some(95.0_f64), Some(95.0_f64), diff --git a/src/common/function/src/scalars/aggregate/polyval.rs b/src/common/function/src/scalars/aggregate/polyval.rs index 75a9d809f7f3..0a8fc818c5da 100644 --- a/src/common/function/src/scalars/aggregate/polyval.rs +++ b/src/common/function/src/scalars/aggregate/polyval.rs @@ -23,9 +23,9 @@ use common_query::error::{ use common_query::logical_plan::{Accumulator, AggregateFunctionCreator}; use common_query::prelude::*; use datatypes::prelude::*; -use datatypes::types::PrimitiveType; +use datatypes::types::{LogicalPrimitiveType, WrapperType}; use datatypes::value::ListValue; -use datatypes::vectors::{ConstantVector, Int64Vector, ListVector}; +use datatypes::vectors::{ConstantVector, Helper, Int64Vector, ListVector}; use datatypes::with_match_primitive_type_id; use num_traits::AsPrimitive; use snafu::{ensure, OptionExt, ResultExt}; @@ -34,8 +34,10 @@ use snafu::{ensure, OptionExt, ResultExt}; #[derive(Debug, Default)] pub struct Polyval where - T: Primitive + AsPrimitive, - PolyT: Primitive + std::ops::Mul, + T: WrapperType, + T::Native: AsPrimitive, + PolyT: WrapperType, + PolyT::Native: std::ops::Mul, { values: Vec, // DataFusion casts constant in into i64 type. 
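(Aside: polyval's `evaluate`, further down, multiplies each stored value by x^(len - 1 - i) and sums, i.e. numpy.polyval semantics with coefficients ordered from the highest power to the constant term. A standalone sketch of the same computation; the fold form is Horner's rule, which avoids computing explicit powers.)

// Illustrative numpy.polyval: coefficients from highest power to constant term.
fn polyval(coeffs: &[i64], x: i64) -> i64 {
    // Horner's rule: ((c0 * x + c1) * x + c2) ... equals sum(ci * x^(n-1-i)).
    coeffs.iter().fold(0, |acc, &c| acc * x + c)
}

(polyval(&[3, 0, 1], 2) evaluates 3*x^2 + 1 at x = 2, giving 13; polyval(&[4, 4], 5) gives 24, which agrees with the constant-vector test expectation below.)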
@@ -45,8 +47,10 @@ where impl Polyval where - T: Primitive + AsPrimitive, - PolyT: Primitive + std::ops::Mul, + T: WrapperType, + T::Native: AsPrimitive, + PolyT: WrapperType, + PolyT::Native: std::ops::Mul, { fn push(&mut self, value: T) { self.values.push(value); @@ -55,11 +59,11 @@ where impl Accumulator for Polyval where - T: Primitive + AsPrimitive, - PolyT: Primitive + std::ops::Mul + std::iter::Sum, - for<'a> T: Scalar = T>, - for<'a> PolyT: Scalar = PolyT>, - i64: AsPrimitive, + T: WrapperType, + T::Native: AsPrimitive, + PolyT: WrapperType + std::iter::Sum<::Native>, + PolyT::Native: std::ops::Mul + std::iter::Sum, + i64: AsPrimitive<::Native>, { fn state(&self) -> Result> { let nums = self @@ -70,7 +74,7 @@ where Ok(vec![ Value::List(ListValue::new( Some(Box::new(nums)), - T::default().into().data_type(), + T::LogicalType::build_data_type(), )), self.x.into(), ]) @@ -91,10 +95,10 @@ where let mut len = 1; let column: &::VectorType = if column.is_const() { len = column.len(); - let column: &ConstantVector = unsafe { VectorHelper::static_cast(column) }; - unsafe { VectorHelper::static_cast(column.inner()) } + let column: &ConstantVector = unsafe { Helper::static_cast(column) }; + unsafe { Helper::static_cast(column.inner()) } } else { - unsafe { VectorHelper::static_cast(column) } + unsafe { Helper::static_cast(column) } }; (0..len).for_each(|_| { for v in column.iter_data().flatten() { @@ -103,7 +107,7 @@ where }); let x = &values[1]; - let x = VectorHelper::check_get_scalar::(x).context(error::InvalidInputsSnafu { + let x = Helper::check_get_scalar::(x).context(error::InvalidInputTypeSnafu { err_msg: "expecting \"POLYVAL\" function's second argument to be a positive integer", })?; // `get(0)` is safe because we have checked `values[1].len() == values[0].len() != 0` @@ -172,12 +176,14 @@ where ), })?; for value in values.values_iter() { - let value = value.context(FromScalarValueSnafu)?; - let column: &::VectorType = unsafe { VectorHelper::static_cast(&value) }; - for v in column.iter_data().flatten() { - self.push(v); + if let Some(value) = value.context(FromScalarValueSnafu)? 
{ + let column: &::VectorType = unsafe { Helper::static_cast(&value) }; + for v in column.iter_data().flatten() { + self.push(v); + } } } + Ok(()) } @@ -196,7 +202,7 @@ where .values .iter() .enumerate() - .map(|(i, &value)| value.as_() * (x.pow((len - 1 - i) as u32)).as_()) + .map(|(i, &value)| value.into_native().as_() * x.pow((len - 1 - i) as u32).as_()) .sum(); Ok(polyval.into()) } @@ -213,7 +219,7 @@ impl AggregateFunctionCreator for PolyvalAccumulatorCreator { with_match_primitive_type_id!( input_type.logical_type_id(), |$S| { - Ok(Box::new(Polyval::<$S,<$S as Primitive>::LargestType>::default())) + Ok(Box::new(Polyval::<<$S as LogicalPrimitiveType>::Wrapper, <<$S as LogicalPrimitiveType>::LargestType as LogicalPrimitiveType>::Wrapper>::default())) }, { let err_msg = format!( @@ -234,7 +240,7 @@ impl AggregateFunctionCreator for PolyvalAccumulatorCreator { with_match_primitive_type_id!( input_type, |$S| { - Ok(PrimitiveType::<<$S as Primitive>::LargestType>::default().into()) + Ok(<<$S as LogicalPrimitiveType>::LargestType as LogicalPrimitiveType>::build_data_type()) }, { unreachable!() @@ -254,7 +260,7 @@ impl AggregateFunctionCreator for PolyvalAccumulatorCreator { #[cfg(test)] mod test { - use datatypes::vectors::PrimitiveVector; + use datatypes::vectors::Int32Vector; use super::*; #[test] @@ -268,8 +274,8 @@ mod test { // test update one not-null value let mut polyval = Polyval::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![Some(3)])), - Arc::new(PrimitiveVector::::from(vec![Some(2_i64)])), + Arc::new(Int32Vector::from(vec![Some(3)])), + Arc::new(Int64Vector::from(vec![Some(2_i64)])), ]; assert!(polyval.update_batch(&v).is_ok()); assert_eq!(Value::Int64(3), polyval.evaluate().unwrap()); @@ -277,8 +283,8 @@ mod test { // test update one null value let mut polyval = Polyval::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![Option::::None])), - Arc::new(PrimitiveVector::::from(vec![Some(2_i64)])), + Arc::new(Int32Vector::from(vec![Option::::None])), + Arc::new(Int64Vector::from(vec![Some(2_i64)])), ]; assert!(polyval.update_batch(&v).is_ok()); assert_eq!(Value::Null, polyval.evaluate().unwrap()); @@ -286,12 +292,8 @@ mod test { // test update no null-value batch let mut polyval = Polyval::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![ - Some(3), - Some(0), - Some(1), - ])), - Arc::new(PrimitiveVector::::from(vec![ + Arc::new(Int32Vector::from(vec![Some(3), Some(0), Some(1)])), + Arc::new(Int64Vector::from(vec![ Some(2_i64), Some(2_i64), Some(2_i64), @@ -303,13 +305,8 @@ mod test { // test update null-value batch let mut polyval = Polyval::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![ - Some(3), - Some(0), - None, - Some(1), - ])), - Arc::new(PrimitiveVector::::from(vec![ + Arc::new(Int32Vector::from(vec![Some(3), Some(0), None, Some(1)])), + Arc::new(Int64Vector::from(vec![ Some(2_i64), Some(2_i64), Some(2_i64), @@ -323,10 +320,10 @@ mod test { let mut polyval = Polyval::::default(); let v: Vec = vec![ Arc::new(ConstantVector::new( - Arc::new(PrimitiveVector::::from_vec(vec![4])), + Arc::new(Int32Vector::from_vec(vec![4])), 2, )), - Arc::new(PrimitiveVector::::from(vec![Some(5_i64), Some(5_i64)])), + Arc::new(Int64Vector::from(vec![Some(5_i64), Some(5_i64)])), ]; assert!(polyval.update_batch(&v).is_ok()); assert_eq!(Value::Int64(24), polyval.evaluate().unwrap()); diff --git a/src/common/function/src/scalars/aggregate/scipy_stats_norm_cdf.rs 
b/src/common/function/src/scalars/aggregate/scipy_stats_norm_cdf.rs index 8f43b64e9273..caa07248a33b 100644 --- a/src/common/function/src/scalars/aggregate/scipy_stats_norm_cdf.rs +++ b/src/common/function/src/scalars/aggregate/scipy_stats_norm_cdf.rs @@ -23,7 +23,7 @@ use common_query::logical_plan::{Accumulator, AggregateFunctionCreator}; use common_query::prelude::*; use datatypes::prelude::*; use datatypes::value::{ListValue, OrderedFloat}; -use datatypes::vectors::{ConstantVector, Float64Vector, ListVector}; +use datatypes::vectors::{ConstantVector, Float64Vector, Helper, ListVector}; use datatypes::with_match_primitive_type_id; use num_traits::AsPrimitive; use snafu::{ensure, OptionExt, ResultExt}; @@ -33,18 +33,12 @@ use statrs::statistics::Statistics; // https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.norm.html #[derive(Debug, Default)] -pub struct ScipyStatsNormCdf -where - T: Primitive + AsPrimitive + std::iter::Sum, -{ +pub struct ScipyStatsNormCdf { values: Vec, x: Option, } -impl ScipyStatsNormCdf -where - T: Primitive + AsPrimitive + std::iter::Sum, -{ +impl ScipyStatsNormCdf { fn push(&mut self, value: T) { self.values.push(value); } @@ -52,8 +46,8 @@ where impl Accumulator for ScipyStatsNormCdf where - T: Primitive + AsPrimitive + std::iter::Sum, - for<'a> T: Scalar = T>, + T: WrapperType + std::iter::Sum, + T::Native: AsPrimitive, { fn state(&self) -> Result> { let nums = self @@ -64,7 +58,7 @@ where Ok(vec![ Value::List(ListValue::new( Some(Box::new(nums)), - T::default().into().data_type(), + T::LogicalType::build_data_type(), )), self.x.into(), ]) @@ -86,14 +80,14 @@ where let mut len = 1; let column: &::VectorType = if column.is_const() { len = column.len(); - let column: &ConstantVector = unsafe { VectorHelper::static_cast(column) }; - unsafe { VectorHelper::static_cast(column.inner()) } + let column: &ConstantVector = unsafe { Helper::static_cast(column) }; + unsafe { Helper::static_cast(column.inner()) } } else { - unsafe { VectorHelper::static_cast(column) } + unsafe { Helper::static_cast(column) } }; let x = &values[1]; - let x = VectorHelper::check_get_scalar::(x).context(error::InvalidInputsSnafu { + let x = Helper::check_get_scalar::(x).context(error::InvalidInputTypeSnafu { err_msg: "expecting \"SCIPYSTATSNORMCDF\" function's second argument to be a positive integer", })?; let first = x.get(0); @@ -160,19 +154,19 @@ where ), })?; for value in values.values_iter() { - let value = value.context(FromScalarValueSnafu)?; - let column: &::VectorType = unsafe { VectorHelper::static_cast(&value) }; - for v in column.iter_data().flatten() { - self.push(v); + if let Some(value) = value.context(FromScalarValueSnafu)? 
{ + let column: &::VectorType = unsafe { Helper::static_cast(&value) }; + for v in column.iter_data().flatten() { + self.push(v); + } } } Ok(()) } fn evaluate(&self) -> Result { - let values = self.values.iter().map(|&v| v.as_()).collect::>(); - let mean = values.clone().mean(); - let std_dev = values.std_dev(); + let mean = self.values.iter().map(|v| v.into_native().as_()).mean(); + let std_dev = self.values.iter().map(|v| v.into_native().as_()).std_dev(); if mean.is_nan() || std_dev.is_nan() { Ok(Value::Null) } else { @@ -198,7 +192,7 @@ impl AggregateFunctionCreator for ScipyStatsNormCdfAccumulatorCreator { with_match_primitive_type_id!( input_type.logical_type_id(), |$S| { - Ok(Box::new(ScipyStatsNormCdf::<$S>::default())) + Ok(Box::new(ScipyStatsNormCdf::<<$S as LogicalPrimitiveType>::Wrapper>::default())) }, { let err_msg = format!( @@ -230,7 +224,7 @@ impl AggregateFunctionCreator for ScipyStatsNormCdfAccumulatorCreator { #[cfg(test)] mod test { - use datatypes::vectors::PrimitiveVector; + use datatypes::vectors::{Float64Vector, Int32Vector}; use super::*; #[test] @@ -244,12 +238,8 @@ mod test { // test update no null-value batch let mut scipy_stats_norm_cdf = ScipyStatsNormCdf::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![ - Some(-1i32), - Some(1), - Some(2), - ])), - Arc::new(PrimitiveVector::::from(vec![ + Arc::new(Int32Vector::from(vec![Some(-1i32), Some(1), Some(2)])), + Arc::new(Float64Vector::from(vec![ Some(2.0_f64), Some(2.0_f64), Some(2.0_f64), @@ -264,13 +254,8 @@ mod test { // test update null-value batch let mut scipy_stats_norm_cdf = ScipyStatsNormCdf::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![ - Some(-2i32), - None, - Some(3), - Some(4), - ])), - Arc::new(PrimitiveVector::::from(vec![ + Arc::new(Int32Vector::from(vec![Some(-2i32), None, Some(3), Some(4)])), + Arc::new(Float64Vector::from(vec![ Some(2.0_f64), None, Some(2.0_f64), diff --git a/src/common/function/src/scalars/aggregate/scipy_stats_norm_pdf.rs b/src/common/function/src/scalars/aggregate/scipy_stats_norm_pdf.rs index e381d11b546a..186d59a89084 100644 --- a/src/common/function/src/scalars/aggregate/scipy_stats_norm_pdf.rs +++ b/src/common/function/src/scalars/aggregate/scipy_stats_norm_pdf.rs @@ -23,7 +23,7 @@ use common_query::logical_plan::{Accumulator, AggregateFunctionCreator}; use common_query::prelude::*; use datatypes::prelude::*; use datatypes::value::{ListValue, OrderedFloat}; -use datatypes::vectors::{ConstantVector, Float64Vector, ListVector}; +use datatypes::vectors::{ConstantVector, Float64Vector, Helper, ListVector}; use datatypes::with_match_primitive_type_id; use num_traits::AsPrimitive; use snafu::{ensure, OptionExt, ResultExt}; @@ -33,18 +33,12 @@ use statrs::statistics::Statistics; // https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.norm.html #[derive(Debug, Default)] -pub struct ScipyStatsNormPdf -where - T: Primitive + AsPrimitive + std::iter::Sum, -{ +pub struct ScipyStatsNormPdf { values: Vec, x: Option, } -impl ScipyStatsNormPdf -where - T: Primitive + AsPrimitive + std::iter::Sum, -{ +impl ScipyStatsNormPdf { fn push(&mut self, value: T) { self.values.push(value); } @@ -52,8 +46,8 @@ where impl Accumulator for ScipyStatsNormPdf where - T: Primitive + AsPrimitive + std::iter::Sum, - for<'a> T: Scalar = T>, + T: WrapperType, + T::Native: AsPrimitive + std::iter::Sum, { fn state(&self) -> Result> { let nums = self @@ -64,7 +58,7 @@ where Ok(vec![ Value::List(ListValue::new( Some(Box::new(nums)), - 
T::default().into().data_type(), + T::LogicalType::build_data_type(), )), self.x.into(), ]) @@ -86,14 +80,14 @@ where let mut len = 1; let column: &::VectorType = if column.is_const() { len = column.len(); - let column: &ConstantVector = unsafe { VectorHelper::static_cast(column) }; - unsafe { VectorHelper::static_cast(column.inner()) } + let column: &ConstantVector = unsafe { Helper::static_cast(column) }; + unsafe { Helper::static_cast(column.inner()) } } else { - unsafe { VectorHelper::static_cast(column) } + unsafe { Helper::static_cast(column) } }; let x = &values[1]; - let x = VectorHelper::check_get_scalar::(x).context(error::InvalidInputsSnafu { + let x = Helper::check_get_scalar::(x).context(error::InvalidInputTypeSnafu { err_msg: "expecting \"SCIPYSTATSNORMPDF\" function's second argument to be a positive integer", })?; let first = x.get(0); @@ -160,19 +154,20 @@ where ), })?; for value in values.values_iter() { - let value = value.context(FromScalarValueSnafu)?; - let column: &::VectorType = unsafe { VectorHelper::static_cast(&value) }; - for v in column.iter_data().flatten() { - self.push(v); + if let Some(value) = value.context(FromScalarValueSnafu)? { + let column: &::VectorType = unsafe { Helper::static_cast(&value) }; + for v in column.iter_data().flatten() { + self.push(v); + } } } Ok(()) } fn evaluate(&self) -> Result { - let values = self.values.iter().map(|&v| v.as_()).collect::>(); - let mean = values.clone().mean(); - let std_dev = values.std_dev(); + let mean = self.values.iter().map(|v| v.into_native().as_()).mean(); + let std_dev = self.values.iter().map(|v| v.into_native().as_()).std_dev(); + if mean.is_nan() || std_dev.is_nan() { Ok(Value::Null) } else { @@ -198,7 +193,7 @@ impl AggregateFunctionCreator for ScipyStatsNormPdfAccumulatorCreator { with_match_primitive_type_id!( input_type.logical_type_id(), |$S| { - Ok(Box::new(ScipyStatsNormPdf::<$S>::default())) + Ok(Box::new(ScipyStatsNormPdf::<<$S as LogicalPrimitiveType>::Wrapper>::default())) }, { let err_msg = format!( @@ -230,7 +225,7 @@ impl AggregateFunctionCreator for ScipyStatsNormPdfAccumulatorCreator { #[cfg(test)] mod test { - use datatypes::vectors::PrimitiveVector; + use datatypes::vectors::{Float64Vector, Int32Vector}; use super::*; #[test] @@ -244,12 +239,8 @@ mod test { // test update no null-value batch let mut scipy_stats_norm_pdf = ScipyStatsNormPdf::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![ - Some(-1i32), - Some(1), - Some(2), - ])), - Arc::new(PrimitiveVector::::from(vec![ + Arc::new(Int32Vector::from(vec![Some(-1i32), Some(1), Some(2)])), + Arc::new(Float64Vector::from(vec![ Some(2.0_f64), Some(2.0_f64), Some(2.0_f64), @@ -264,13 +255,8 @@ mod test { // test update null-value batch let mut scipy_stats_norm_pdf = ScipyStatsNormPdf::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![ - Some(-2i32), - None, - Some(3), - Some(4), - ])), - Arc::new(PrimitiveVector::::from(vec![ + Arc::new(Int32Vector::from(vec![Some(-2i32), None, Some(3), Some(4)])), + Arc::new(Float64Vector::from(vec![ Some(2.0_f64), None, Some(2.0_f64), diff --git a/src/common/function/src/scalars/expression/mod.rs b/src/common/function/src/scalars/expression.rs similarity index 100% rename from src/common/function/src/scalars/expression/mod.rs rename to src/common/function/src/scalars/expression.rs diff --git a/src/common/function/src/scalars/expression/binary.rs b/src/common/function/src/scalars/expression/binary.rs index b02e46c93734..d1a9db8eb997 100644 --- 
a/src/common/function/src/scalars/expression/binary.rs +++ b/src/common/function/src/scalars/expression/binary.rs @@ -14,10 +14,10 @@ use std::iter; +use common_query::error::Result; use datatypes::prelude::*; -use datatypes::vectors::ConstantVector; +use datatypes::vectors::{ConstantVector, Helper}; -use crate::error::Result; use crate::scalars::expression::ctx::EvalContext; pub fn scalar_binary_op( @@ -36,10 +36,9 @@ where let result = match (l.is_const(), r.is_const()) { (false, true) => { - let left: &::VectorType = unsafe { VectorHelper::static_cast(l) }; - let right: &ConstantVector = unsafe { VectorHelper::static_cast(r) }; - let right: &::VectorType = - unsafe { VectorHelper::static_cast(right.inner()) }; + let left: &::VectorType = unsafe { Helper::static_cast(l) }; + let right: &ConstantVector = unsafe { Helper::static_cast(r) }; + let right: &::VectorType = unsafe { Helper::static_cast(right.inner()) }; let b = right.get_data(0); let it = left.iter_data().map(|a| f(a, b, ctx)); @@ -47,8 +46,8 @@ where } (false, false) => { - let left: &::VectorType = unsafe { VectorHelper::static_cast(l) }; - let right: &::VectorType = unsafe { VectorHelper::static_cast(r) }; + let left: &::VectorType = unsafe { Helper::static_cast(l) }; + let right: &::VectorType = unsafe { Helper::static_cast(r) }; let it = left .iter_data() @@ -58,25 +57,22 @@ where } (true, false) => { - let left: &ConstantVector = unsafe { VectorHelper::static_cast(l) }; - let left: &::VectorType = - unsafe { VectorHelper::static_cast(left.inner()) }; + let left: &ConstantVector = unsafe { Helper::static_cast(l) }; + let left: &::VectorType = unsafe { Helper::static_cast(left.inner()) }; let a = left.get_data(0); - let right: &::VectorType = unsafe { VectorHelper::static_cast(r) }; + let right: &::VectorType = unsafe { Helper::static_cast(r) }; let it = right.iter_data().map(|b| f(a, b, ctx)); ::VectorType::from_owned_iterator(it) } (true, true) => { - let left: &ConstantVector = unsafe { VectorHelper::static_cast(l) }; - let left: &::VectorType = - unsafe { VectorHelper::static_cast(left.inner()) }; + let left: &ConstantVector = unsafe { Helper::static_cast(l) }; + let left: &::VectorType = unsafe { Helper::static_cast(left.inner()) }; let a = left.get_data(0); - let right: &ConstantVector = unsafe { VectorHelper::static_cast(r) }; - let right: &::VectorType = - unsafe { VectorHelper::static_cast(right.inner()) }; + let right: &ConstantVector = unsafe { Helper::static_cast(r) }; + let right: &::VectorType = unsafe { Helper::static_cast(right.inner()) }; let b = right.get_data(0); let it = iter::repeat(a) diff --git a/src/common/function/src/scalars/expression/ctx.rs b/src/common/function/src/scalars/expression/ctx.rs index 7910bb82b8d2..c6735bd1d0d5 100644 --- a/src/common/function/src/scalars/expression/ctx.rs +++ b/src/common/function/src/scalars/expression/ctx.rs @@ -13,8 +13,7 @@ // limitations under the License. use chrono_tz::Tz; - -use crate::error::Error; +use common_query::error::Error; pub struct EvalContext { _tz: Tz, diff --git a/src/common/function/src/scalars/expression/unary.rs b/src/common/function/src/scalars/expression/unary.rs index a3434a2b0e4d..0862f711e1f4 100644 --- a/src/common/function/src/scalars/expression/unary.rs +++ b/src/common/function/src/scalars/expression/unary.rs @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
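(Aside: the four-arm match in scalar_binary_op above handles every (const, non-const) pairing of the two inputs. Stripped of the vector machinery, the broadcasting idea looks like the sketch below; Operand and binary_op are illustrative names over plain f64 slices, not the real API.)

// Illustrative broadcast of a binary op over two operands, each constant or a slice.
enum Operand<'a> {
    Const(f64),
    Values(&'a [f64]),
}

fn binary_op(l: Operand, r: Operand, f: impl Fn(f64, f64) -> f64) -> Vec<f64> {
    match (l, r) {
        // The real code repeats the const/const result out to the row count.
        (Operand::Const(a), Operand::Const(b)) => vec![f(a, b)],
        (Operand::Const(a), Operand::Values(rs)) => rs.iter().map(|&b| f(a, b)).collect(),
        (Operand::Values(ls), Operand::Const(b)) => ls.iter().map(|&a| f(a, b)).collect(),
        (Operand::Values(ls), Operand::Values(rs)) => {
            ls.iter().zip(rs.iter()).map(|(&a, &b)| f(a, b)).collect()
        }
    }
}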
+use common_query::error::{self, Result}; use datatypes::prelude::*; +use datatypes::vectors::Helper; use snafu::ResultExt; -use crate::error::{GetScalarVectorSnafu, Result}; use crate::scalars::expression::ctx::EvalContext; /// TODO: remove the allow_unused when it's used. @@ -28,7 +29,7 @@ pub fn scalar_unary_op( where F: Fn(Option>, &mut EvalContext) -> Option, { - let left = VectorHelper::check_get_scalar::(l).context(GetScalarVectorSnafu)?; + let left = Helper::check_get_scalar::(l).context(error::GetScalarVectorSnafu)?; let it = left.iter_data().map(|a| f(a, ctx)); let result = ::VectorType::from_owned_iterator(it); diff --git a/src/common/function/src/scalars/function.rs b/src/common/function/src/scalars/function.rs index 353f524ea99d..6f70bca4a0d5 100644 --- a/src/common/function/src/scalars/function.rs +++ b/src/common/function/src/scalars/function.rs @@ -16,12 +16,11 @@ use std::fmt; use std::sync::Arc; use chrono_tz::Tz; +use common_query::error::Result; use common_query::prelude::Signature; use datatypes::data_type::ConcreteDataType; use datatypes::vectors::VectorRef; -use crate::error::Result; - #[derive(Clone)] pub struct FunctionContext { pub tz: Tz, diff --git a/src/common/function/src/scalars/math/mod.rs b/src/common/function/src/scalars/math.rs similarity index 100% rename from src/common/function/src/scalars/math/mod.rs rename to src/common/function/src/scalars/math.rs diff --git a/src/common/function/src/scalars/math/pow.rs b/src/common/function/src/scalars/math/pow.rs index fcbb87724039..6a4e1937ddc3 100644 --- a/src/common/function/src/scalars/math/pow.rs +++ b/src/common/function/src/scalars/math/pow.rs @@ -15,15 +15,16 @@ use std::fmt; use std::sync::Arc; +use common_query::error::Result; use common_query::prelude::{Signature, Volatility}; use datatypes::data_type::DataType; use datatypes::prelude::ConcreteDataType; +use datatypes::types::LogicalPrimitiveType; use datatypes::vectors::VectorRef; use datatypes::with_match_primitive_type_id; use num::traits::Pow; use num_traits::AsPrimitive; -use crate::error::Result; use crate::scalars::expression::{scalar_binary_op, EvalContext}; use crate::scalars::function::{Function, FunctionContext}; @@ -46,7 +47,7 @@ impl Function for PowFunction { fn eval(&self, _func_ctx: FunctionContext, columns: &[VectorRef]) -> Result { with_match_primitive_type_id!(columns[0].data_type().logical_type_id(), |$S| { with_match_primitive_type_id!(columns[1].data_type().logical_type_id(), |$T| { - let col = scalar_binary_op::<$S, $T, f64, _>(&columns[0], &columns[1], scalar_pow, &mut EvalContext::default())?; + let col = scalar_binary_op::<<$S as LogicalPrimitiveType>::Native, <$T as LogicalPrimitiveType>::Native, f64, _>(&columns[0], &columns[1], scalar_pow, &mut EvalContext::default())?; Ok(Arc::new(col)) },{ unreachable!() diff --git a/src/common/function/src/scalars/math/rate.rs b/src/common/function/src/scalars/math/rate.rs index 628a19408ac7..ad03485a36aa 100644 --- a/src/common/function/src/scalars/math/rate.rs +++ b/src/common/function/src/scalars/math/rate.rs @@ -14,10 +14,10 @@ use std::fmt; -use arrow::array::Array; -use common_query::error::{FromArrowArraySnafu, Result, TypeCastSnafu}; +use common_query::error::{self, Result}; use common_query::prelude::{Signature, Volatility}; -use datatypes::arrow; +use datatypes::arrow::compute::kernels::{arithmetic, cast}; +use datatypes::arrow::datatypes::DataType; use datatypes::prelude::*; use datatypes::vectors::{Helper, VectorRef}; use snafu::ResultExt; @@ -51,28 +51,21 @@ impl Function for 
RateFunction { let val = &columns[0].to_arrow_array(); let val_0 = val.slice(0, val.len() - 1); let val_1 = val.slice(1, val.len() - 1); - let dv = arrow::compute::arithmetics::sub(&*val_1, &*val_0); + let dv = arithmetic::subtract_dyn(&val_1, &val_0).context(error::ArrowComputeSnafu)?; let ts = &columns[1].to_arrow_array(); let ts_0 = ts.slice(0, ts.len() - 1); let ts_1 = ts.slice(1, ts.len() - 1); - let dt = arrow::compute::arithmetics::sub(&*ts_1, &*ts_0); - fn all_to_f64(array: &dyn Array) -> Result> { - Ok(arrow::compute::cast::cast( - array, - &arrow::datatypes::DataType::Float64, - arrow::compute::cast::CastOptions { - wrapped: true, - partial: true, - }, - ) - .context(TypeCastSnafu { - typ: arrow::datatypes::DataType::Float64, - })?) - } - let dv = all_to_f64(&*dv)?; - let dt = all_to_f64(&*dt)?; - let rate = arrow::compute::arithmetics::div(&*dv, &*dt); - let v = Helper::try_into_vector(&rate).context(FromArrowArraySnafu)?; + let dt = arithmetic::subtract_dyn(&ts_1, &ts_0).context(error::ArrowComputeSnafu)?; + + let dv = cast::cast(&dv, &DataType::Float64).context(error::TypeCastSnafu { + typ: DataType::Float64, + })?; + let dt = cast::cast(&dt, &DataType::Float64).context(error::TypeCastSnafu { + typ: DataType::Float64, + })?; + let rate = arithmetic::divide_dyn(&dv, &dt).context(error::ArrowComputeSnafu)?; + let v = Helper::try_into_vector(&rate).context(error::FromArrowArraySnafu)?; + Ok(v) } } @@ -81,9 +74,8 @@ impl Function for RateFunction { mod tests { use std::sync::Arc; - use arrow::array::Float64Array; use common_query::prelude::TypeSignature; - use datatypes::vectors::{Float32Vector, Int64Vector}; + use datatypes::vectors::{Float32Vector, Float64Vector, Int64Vector}; use super::*; #[test] @@ -108,9 +100,7 @@ mod tests { Arc::new(Int64Vector::from_vec(ts)), ]; let vector = rate.eval(FunctionContext::default(), &args).unwrap(); - let arr = vector.to_arrow_array(); - let expect = Arc::new(Float64Array::from_vec(vec![2.0, 3.0])); - let res = arrow::compute::comparison::eq(&*arr, &*expect); - res.iter().for_each(|x| assert!(matches!(x, Some(true)))); + let expect: VectorRef = Arc::new(Float64Vector::from_vec(vec![2.0, 3.0])); + assert_eq!(expect, vector); } } diff --git a/src/common/function/src/scalars/numpy/mod.rs b/src/common/function/src/scalars/numpy.rs similarity index 98% rename from src/common/function/src/scalars/numpy/mod.rs rename to src/common/function/src/scalars/numpy.rs index 76140fb7def3..ed8d9b6f3009 100644 --- a/src/common/function/src/scalars/numpy/mod.rs +++ b/src/common/function/src/scalars/numpy.rs @@ -13,7 +13,6 @@ // limitations under the License. 
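// ---------------------------------------------------------------------------
// Editor's note (illustration only, not part of the patch): the rate()
// rewrite above swaps arrow2's compute::arithmetics for arrow-rs dyn
// kernels. A minimal sketch of the same pipeline, assuming the arrow-rs APIs
// imported in the diff (subtract_dyn, divide_dyn, cast); the numbers mirror
// the updated unit test, which expects [2.0, 3.0]:
//
// use datatypes::arrow::array::Int64Array;
// use datatypes::arrow::compute::kernels::{arithmetic, cast};
// use datatypes::arrow::datatypes::DataType;
//
// let v = Int64Array::from(vec![1i64, 3, 6]);    // values
// let t = Int64Array::from(vec![10i64, 11, 12]); // timestamps
// let dv = arithmetic::subtract_dyn(&v.slice(1, 2), &v.slice(0, 2))?; // [2, 3]
// let dt = arithmetic::subtract_dyn(&t.slice(1, 2), &t.slice(0, 2))?; // [1, 1]
// let dv = cast::cast(&dv, &DataType::Float64)?;
// let dt = cast::cast(&dt, &DataType::Float64)?;
// let rate = arithmetic::divide_dyn(&dv, &dt)?;  // [2.0, 3.0]
// ---------------------------------------------------------------------------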
mod clip; -#[allow(unused)] mod interp; use std::sync::Arc; diff --git a/src/common/function/src/scalars/numpy/clip.rs b/src/common/function/src/scalars/numpy/clip.rs index f743bf5ff56d..888a080f3fcf 100644 --- a/src/common/function/src/scalars/numpy/clip.rs +++ b/src/common/function/src/scalars/numpy/clip.rs @@ -15,14 +15,15 @@ use std::fmt; use std::sync::Arc; +use common_query::error::Result; use common_query::prelude::{Signature, Volatility}; -use datatypes::data_type::{ConcreteDataType, DataType}; -use datatypes::prelude::{Scalar, VectorRef}; -use datatypes::with_match_primitive_type_id; -use num_traits::AsPrimitive; +use datatypes::arrow::compute; +use datatypes::arrow::datatypes::ArrowPrimitiveType; +use datatypes::data_type::ConcreteDataType; +use datatypes::prelude::*; +use datatypes::vectors::PrimitiveVector; use paste::paste; -use crate::error::Result; use crate::scalars::expression::{scalar_binary_op, EvalContext}; use crate::scalars::function::{Function, FunctionContext}; @@ -34,25 +35,32 @@ macro_rules! define_eval { ($O: ident) => { paste! { fn [](columns: &[VectorRef]) -> Result { - with_match_primitive_type_id!(columns[0].data_type().logical_type_id(), |$S| { - with_match_primitive_type_id!(columns[1].data_type().logical_type_id(), |$T| { - with_match_primitive_type_id!(columns[2].data_type().logical_type_id(), |$R| { - // clip(a, min, max) is equals to min(max(a, min), max) - let col: VectorRef = Arc::new(scalar_binary_op::<$S, $T, $O, _>(&columns[0], &columns[1], scalar_max, &mut EvalContext::default())?); - let col = scalar_binary_op::<$O, $R, $O, _>(&col, &columns[2], scalar_min, &mut EvalContext::default())?; - Ok(Arc::new(col)) - }, { - unreachable!() - }) - }, { - unreachable!() - }) - }, { - unreachable!() - }) + fn cast_vector(input: &VectorRef) -> VectorRef { + Arc::new(PrimitiveVector::<<$O as WrapperType>::LogicalType>::try_from_arrow_array( + compute::cast(&input.to_arrow_array(), &<<<$O as WrapperType>::LogicalType as LogicalPrimitiveType>::ArrowPrimitive as ArrowPrimitiveType>::DATA_TYPE).unwrap() + ).unwrap()) as _ + } + let operator_1 = cast_vector(&columns[0]); + let operator_2 = cast_vector(&columns[1]); + let operator_3 = cast_vector(&columns[2]); + + // clip(a, min, max) is equal to min(max(a, min), max) + let col: VectorRef = Arc::new(scalar_binary_op::<$O, $O, $O, _>( + &operator_1, + &operator_2, + scalar_max, + &mut EvalContext::default(), + )?); + let col = scalar_binary_op::<$O, $O, $O, _>( + &col, + &operator_3, + scalar_min, + &mut EvalContext::default(), + )?; + Ok(Arc::new(col)) } } - } + }; } define_eval!(i64); @@ -108,27 +116,23 @@ pub fn max(input: T, max: T) -> T { } #[inline] -fn scalar_min(left: Option, right: Option, _ctx: &mut EvalContext) -> Option +fn scalar_min(left: Option, right: Option, _ctx: &mut EvalContext) -> Option where - S: AsPrimitive, - T: AsPrimitive, O: Scalar + Copy + PartialOrd, { match (left, right) { - (Some(left), Some(right)) => Some(min(left.as_(), right.as_())), + (Some(left), Some(right)) => Some(min(left, right)), _ => None, } } #[inline] -fn scalar_max(left: Option, right: Option, _ctx: &mut EvalContext) -> Option +fn scalar_max(left: Option, right: Option, _ctx: &mut EvalContext) -> Option where - S: AsPrimitive, - T: AsPrimitive, O: Scalar + Copy + PartialOrd, { match (left, right) { - (Some(left), Some(right)) => Some(max(left.as_(), right.as_())), + (Some(left), Some(right)) => Some(max(left, right)), _ => None, } } @@ -143,11 +147,15 @@ impl fmt::Display for ClipFunction { mod tests { use 
common_query::prelude::TypeSignature; use datatypes::value::Value; - use datatypes::vectors::{ConstantVector, Float32Vector, Int32Vector, UInt32Vector}; + use datatypes::vectors::{ + ConstantVector, Float32Vector, Int16Vector, Int32Vector, Int8Vector, UInt16Vector, + UInt32Vector, UInt8Vector, + }; use super::*; + #[test] - fn test_clip_function() { + fn test_clip_signature() { let clip = ClipFunction::default(); assert_eq!("clip", clip.name()); @@ -190,16 +198,21 @@ mod tests { volatility: Volatility::Immutable } if valid_types == ConcreteDataType::numerics() )); + } + + #[test] + fn test_clip_fn_signed() { + let clip = ClipFunction::default(); // eval with signed integers let args: Vec = vec![ Arc::new(Int32Vector::from_values(0..10)), Arc::new(ConstantVector::new( - Arc::new(Int32Vector::from_vec(vec![3])), + Arc::new(Int8Vector::from_vec(vec![3])), 10, )), Arc::new(ConstantVector::new( - Arc::new(Int32Vector::from_vec(vec![6])), + Arc::new(Int16Vector::from_vec(vec![6])), 10, )), ]; @@ -217,16 +230,21 @@ mod tests { assert!(matches!(vector.get(i), Value::Int64(v) if v == 6)); } } + } + + #[test] + fn test_clip_fn_unsigned() { + let clip = ClipFunction::default(); // eval with unsigned integers let args: Vec = vec![ - Arc::new(UInt32Vector::from_values(0..10)), + Arc::new(UInt8Vector::from_values(0..10)), Arc::new(ConstantVector::new( Arc::new(UInt32Vector::from_vec(vec![3])), 10, )), Arc::new(ConstantVector::new( - Arc::new(UInt32Vector::from_vec(vec![6])), + Arc::new(UInt16Vector::from_vec(vec![6])), 10, )), ]; @@ -244,12 +262,17 @@ mod tests { assert!(matches!(vector.get(i), Value::UInt64(v) if v == 6)); } } + } + + #[test] + fn test_clip_fn_float() { + let clip = ClipFunction::default(); // eval with floats let args: Vec = vec![ - Arc::new(Int32Vector::from_values(0..10)), + Arc::new(Int8Vector::from_values(0..10)), Arc::new(ConstantVector::new( - Arc::new(Int32Vector::from_vec(vec![3])), + Arc::new(UInt32Vector::from_vec(vec![3])), 10, )), Arc::new(ConstantVector::new( diff --git a/src/common/function/src/scalars/numpy/interp.rs b/src/common/function/src/scalars/numpy/interp.rs index 68981c25566c..c4bb6e981103 100644 --- a/src/common/function/src/scalars/numpy/interp.rs +++ b/src/common/function/src/scalars/numpy/interp.rs @@ -14,41 +14,18 @@ use std::sync::Arc; -use datatypes::arrow::array::PrimitiveArray; -use datatypes::arrow::compute::cast::primitive_to_primitive; -use datatypes::arrow::datatypes::DataType::Float64; +use common_query::error::{self, Result}; +use datatypes::arrow::compute::cast; +use datatypes::arrow::datatypes::DataType as ArrowDataType; use datatypes::data_type::DataType; use datatypes::prelude::ScalarVector; -use datatypes::type_id::LogicalTypeId; use datatypes::value::Value; -use datatypes::vectors::{Float64Vector, PrimitiveVector, Vector, VectorRef}; -use datatypes::{arrow, with_match_primitive_type_id}; -use snafu::{ensure, Snafu}; - -#[derive(Debug, Snafu)] -pub enum Error { - #[snafu(display( - "The length of the args is not enough, expect at least: {}, have: {}", - expect, - actual, - ))] - ArgsLenNotEnough { expect: usize, actual: usize }, - - #[snafu(display("The sample {} is empty", name))] - SampleEmpty { name: String }, - - #[snafu(display( - "The length of the len1: {} don't match the length of the len2: {}", - len1, - len2, - ))] - LenNotEquals { len1: usize, len2: usize }, -} - -pub type Result = std::result::Result; +use datatypes::vectors::{Float64Vector, Vector, VectorRef}; +use datatypes::with_match_primitive_type_id; +use snafu::{ensure, 
ResultExt}; /* search the biggest number that smaller than x in xp */ -fn linear_search_ascending_vector(x: Value, xp: &PrimitiveVector) -> usize { +fn linear_search_ascending_vector(x: Value, xp: &Float64Vector) -> usize { for i in 0..xp.len() { if x < xp.get(i) { return i - 1; @@ -58,7 +35,7 @@ fn linear_search_ascending_vector(x: Value, xp: &PrimitiveVector) -> usize } /* search the biggest number that smaller than x in xp */ -fn binary_search_ascending_vector(key: Value, xp: &PrimitiveVector) -> usize { +fn binary_search_ascending_vector(key: Value, xp: &Float64Vector) -> usize { let mut left = 0; let mut right = xp.len(); /* If len <= 4 use linear search. */ @@ -77,27 +54,33 @@ fn binary_search_ascending_vector(key: Value, xp: &PrimitiveVector) -> usiz left - 1 } -fn concrete_type_to_primitive_vector(arg: &VectorRef) -> Result> { +fn concrete_type_to_primitive_vector(arg: &VectorRef) -> Result { with_match_primitive_type_id!(arg.data_type().logical_type_id(), |$S| { let tmp = arg.to_arrow_array(); - let from = tmp.as_any().downcast_ref::>().expect("cast failed"); - let array = primitive_to_primitive(from, &Float64); - Ok(PrimitiveVector::new(array)) + let array = cast(&tmp, &ArrowDataType::Float64).context(error::TypeCastSnafu { + typ: ArrowDataType::Float64, + })?; + // Safety: array has been cast to Float64Array. + Ok(Float64Vector::try_from_arrow_array(array).unwrap()) },{ unreachable!() }) } /// https://github.com/numpy/numpy/blob/b101756ac02e390d605b2febcded30a1da50cc2c/numpy/core/src/multiarray/compiled_base.c#L491 +#[allow(unused)] pub fn interp(args: &[VectorRef]) -> Result { let mut left = None; let mut right = None; ensure!( args.len() >= 3, - ArgsLenNotEnoughSnafu { - expect: 3_usize, - actual: args.len() + error::InvalidFuncArgsSnafu { + err_msg: format!( + "The length of the args is not enough, expect at least: {}, have: {}", + 3, + args.len() + ), } ); @@ -109,9 +92,12 @@ pub fn interp(args: &[VectorRef]) -> Result { if args.len() > 3 { ensure!( args.len() == 5, - ArgsLenNotEnoughSnafu { - expect: 5_usize, - actual: args.len() + error::InvalidFuncArgsSnafu { + err_msg: format!( + "The length of the args is invalid, expect exactly: {}, have: {}", + 5, + args.len() + ), } ); @@ -123,14 +109,32 @@ pub fn interp(args: &[VectorRef]) -> Result { .get_data(0); } - ensure!(x.len() != 0, SampleEmptySnafu { name: "x" }); - ensure!(xp.len() != 0, SampleEmptySnafu { name: "xp" }); - ensure!(fp.len() != 0, SampleEmptySnafu { name: "fp" }); + ensure!( + x.len() != 0, + error::InvalidFuncArgsSnafu { + err_msg: "The sample x is empty", + } + ); + ensure!( + xp.len() != 0, + error::InvalidFuncArgsSnafu { + err_msg: "The sample xp is empty", + } + ); + ensure!( + fp.len() != 0, + error::InvalidFuncArgsSnafu { + err_msg: "The sample fp is empty", + } + ); ensure!( xp.len() == fp.len(), - LenNotEqualsSnafu { - len1: xp.len(), - len2: fp.len(), + error::InvalidFuncArgsSnafu { + err_msg: format!( + "The length of xp: {} doesn't match the length of fp: {}", + xp.len(), + fp.len() + ), } ); @@ -147,7 +151,7 @@ pub fn interp(args: &[VectorRef]) -> Result { let res; if xp.len() == 1 { - res = x + let datas = x .iter_data() .map(|x| { if Value::from(x) < xp.get(0) { @@ -158,7 +162,8 @@ pub fn interp(args: &[VectorRef]) -> Result { fp.get_data(0) } }) - .collect::(); + .collect::>(); + res = Float64Vector::from(datas); } else { let mut j = 0; /* only pre-calculate slopes if there are relatively few of them. 
*/ @@ -185,7 +190,7 @@ pub fn interp(args: &[VectorRef]) -> Result { } slopes = Some(slopes_tmp); } - res = x + let datas = x .iter_data() .map(|x| match x { Some(xi) => { @@ -248,7 +253,8 @@ pub fn interp(args: &[VectorRef]) -> Result { } _ => None, }) - .collect::(); + .collect::>(); + res = Float64Vector::from(datas); } Ok(Arc::new(res) as _) } @@ -257,8 +263,7 @@ pub fn interp(args: &[VectorRef]) -> Result { mod tests { use std::sync::Arc; - use datatypes::prelude::ScalarVectorBuilder; - use datatypes::vectors::{Int32Vector, Int64Vector, PrimitiveVectorBuilder}; + use datatypes::vectors::{Int32Vector, Int64Vector}; use super::*; #[test] @@ -341,12 +346,8 @@ mod tests { assert!(matches!(vector.get(0), Value::Float64(v) if v==x[0] as f64)); // x=None output:Null - let input = [None, Some(0.0), Some(0.3)]; - let mut builder = PrimitiveVectorBuilder::with_capacity(input.len()); - for v in input { - builder.push(v); - } - let x = builder.finish(); + let input = vec![None, Some(0.0), Some(0.3)]; + let x = Float64Vector::from(input); let args: Vec = vec![ Arc::new(x), Arc::new(Int64Vector::from_vec(xp)), diff --git a/src/common/function/src/scalars/test.rs b/src/common/function/src/scalars/test.rs index 7d74ff5d83e1..8e81d1f025ac 100644 --- a/src/common/function/src/scalars/test.rs +++ b/src/common/function/src/scalars/test.rs @@ -15,11 +15,11 @@ use std::fmt; use std::sync::Arc; +use common_query::error::Result; use common_query::prelude::{Signature, Volatility}; use datatypes::data_type::ConcreteDataType; use datatypes::prelude::VectorRef; -use crate::error::Result; use crate::scalars::expression::{scalar_binary_op, EvalContext}; use crate::scalars::function::{Function, FunctionContext}; diff --git a/src/common/function/src/scalars/timestamp/mod.rs b/src/common/function/src/scalars/timestamp.rs similarity index 100% rename from src/common/function/src/scalars/timestamp/mod.rs rename to src/common/function/src/scalars/timestamp.rs diff --git a/src/common/function/src/scalars/timestamp/from_unixtime.rs b/src/common/function/src/scalars/timestamp/from_unixtime.rs index 4462672c8ce9..c8adc01f8c6e 100644 --- a/src/common/function/src/scalars/timestamp/from_unixtime.rs +++ b/src/common/function/src/scalars/timestamp/from_unixtime.rs @@ -17,16 +17,17 @@ use std::fmt; use std::sync::Arc; -use common_query::error::{IntoVectorSnafu, UnsupportedInputDataTypeSnafu}; +use common_query::error::{ + ArrowComputeSnafu, IntoVectorSnafu, Result, TypeCastSnafu, UnsupportedInputDataTypeSnafu, +}; use common_query::prelude::{Signature, Volatility}; -use datatypes::arrow::compute::arithmetics; -use datatypes::arrow::datatypes::DataType as ArrowDatatype; -use datatypes::arrow::scalar::PrimitiveScalar; +use datatypes::arrow::compute; +use datatypes::arrow::datatypes::{DataType as ArrowDatatype, Int64Type}; +use datatypes::data_type::DataType; use datatypes::prelude::ConcreteDataType; -use datatypes::vectors::{TimestampVector, VectorRef}; +use datatypes::vectors::{TimestampMillisecondVector, VectorRef}; use snafu::ResultExt; -use crate::error::Result; use crate::scalars::function::{Function, FunctionContext}; #[derive(Clone, Debug, Default)] @@ -40,7 +41,7 @@ impl Function for FromUnixtimeFunction { } fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result { - Ok(ConcreteDataType::timestamp_millis_datatype()) + Ok(ConcreteDataType::timestamp_millisecond_datatype()) } fn signature(&self) -> Signature { @@ -56,14 +57,18 @@ impl Function for FromUnixtimeFunction { ConcreteDataType::Int64(_) => { let array = 
columns[0].to_arrow_array(); // Our timestamp vector's time unit is millisecond - let array = arithmetics::mul_scalar( - &*array, - &PrimitiveScalar::new(ArrowDatatype::Int64, Some(1000i64)), - ); + let array = compute::multiply_scalar_dyn::(&array, 1000i64) + .context(ArrowComputeSnafu)?; + let arrow_datatype = &self.return_type(&[]).unwrap().as_arrow_type(); Ok(Arc::new( - TimestampVector::try_from_arrow_array(array).context(IntoVectorSnafu { - data_type: ArrowDatatype::Int64, + TimestampMillisecondVector::try_from_arrow_array( + compute::cast(&array, arrow_datatype).context(TypeCastSnafu { + typ: ArrowDatatype::Int64, + })?, + ) + .context(IntoVectorSnafu { + data_type: arrow_datatype.clone(), })?, )) } @@ -71,8 +76,7 @@ impl Function for FromUnixtimeFunction { function: NAME, datatypes: columns.iter().map(|c| c.data_type()).collect::>(), } - .fail() - .map_err(|e| e.into()), + .fail(), } } } @@ -96,7 +100,7 @@ mod tests { let f = FromUnixtimeFunction::default(); assert_eq!("from_unixtime", f.name()); assert_eq!( - ConcreteDataType::timestamp_millis_datatype(), + ConcreteDataType::timestamp_millisecond_datatype(), f.return_type(&[]).unwrap() ); diff --git a/src/common/function/src/scalars/udf.rs b/src/common/function/src/scalars/udf.rs index b2d47af34db3..f6a7dcee874d 100644 --- a/src/common/function/src/scalars/udf.rs +++ b/src/common/function/src/scalars/udf.rs @@ -19,7 +19,8 @@ use common_query::prelude::{ ColumnarValue, ReturnTypeFunction, ScalarFunctionImplementation, ScalarUdf, ScalarValue, }; use datatypes::error::Error as DataTypeError; -use datatypes::prelude::{ConcreteDataType, VectorHelper}; +use datatypes::prelude::*; +use datatypes::vectors::Helper; use snafu::ResultExt; use crate::scalars::function::{FunctionContext, FunctionRef}; @@ -47,7 +48,7 @@ pub fn create_udf(func: FunctionRef) -> ScalarUdf { let args: Result, DataTypeError> = args .iter() .map(|arg| match arg { - ColumnarValue::Scalar(v) => VectorHelper::try_from_scalar_value(v.clone(), rows), + ColumnarValue::Scalar(v) => Helper::try_from_scalar_value(v.clone(), rows), ColumnarValue::Vector(v) => Ok(v.clone()), }) .collect(); diff --git a/src/common/grpc-expr/src/insert.rs b/src/common/grpc-expr/src/insert.rs index d7687d078976..f968ff9b56f7 100644 --- a/src/common/grpc-expr/src/insert.rs +++ b/src/common/grpc-expr/src/insert.rs @@ -22,11 +22,11 @@ use api::v1::{AddColumn, AddColumns, Column, ColumnDataType, ColumnDef, CreateEx use common_base::BitVec; use common_time::timestamp::Timestamp; use common_time::{Date, DateTime}; -use datatypes::data_type::ConcreteDataType; +use datatypes::data_type::{ConcreteDataType, DataType}; use datatypes::prelude::{ValueRef, VectorRef}; use datatypes::schema::SchemaRef; use datatypes::value::Value; -use datatypes::vectors::VectorBuilder; +use datatypes::vectors::MutableVector; use snafu::{ensure, OptionExt, ResultExt}; use table::metadata::TableId; use table::requests::{AddColumnRequest, AlterKind, AlterTableRequest, InsertRequest}; @@ -99,7 +99,7 @@ pub fn column_to_vector(column: &Column, rows: u32) -> Result { let column_datatype = wrapper.datatype(); let rows = rows as usize; - let mut vector = VectorBuilder::with_capacity(wrapper.into(), rows); + let mut vector = ConcreteDataType::from(wrapper).create_mutable_vector(rows); if let Some(values) = &column.values { let values = collect_column_values(column_datatype, values); @@ -110,21 +110,31 @@ pub fn column_to_vector(column: &Column, rows: u32) -> Result { for i in 0..rows { if let Some(true) = nulls_iter.next() { - 
vector.push_null(); + vector + .push_value_ref(ValueRef::Null) + .context(CreateVectorSnafu)?; } else { - let value_ref = values_iter.next().context(InvalidColumnProtoSnafu { - err_msg: format!( - "value not found at position {} of column {}", - i, &column.column_name - ), - })?; - vector.try_push_ref(value_ref).context(CreateVectorSnafu)?; + let value_ref = values_iter + .next() + .with_context(|| InvalidColumnProtoSnafu { + err_msg: format!( + "value not found at position {} of column {}", + i, &column.column_name + ), + })?; + vector + .push_value_ref(value_ref) + .context(CreateVectorSnafu)?; } } } else { - (0..rows).for_each(|_| vector.push_null()); + (0..rows).try_for_each(|_| { + vector + .push_value_ref(ValueRef::Null) + .context(CreateVectorSnafu) + })?; } - Ok(vector.finish()) + Ok(vector.to_vector()) } fn collect_column_values(column_datatype: ColumnDataType, values: &Values) -> Vec { @@ -174,9 +184,24 @@ fn collect_column_values(column_datatype: ColumnDataType, values: &Values) -> Ve DateTime::new(*v) )) } - ColumnDataType::Timestamp => { - collect_values!(values.ts_millis_values, |v| ValueRef::Timestamp( - Timestamp::from_millis(*v) + ColumnDataType::TimestampSecond => { + collect_values!(values.ts_second_values, |v| ValueRef::Timestamp( + Timestamp::new_second(*v) + )) + } + ColumnDataType::TimestampMillisecond => { + collect_values!(values.ts_millisecond_values, |v| ValueRef::Timestamp( + Timestamp::new_millisecond(*v) + )) + } + ColumnDataType::TimestampMicrosecond => { + collect_values!(values.ts_microsecond_values, |v| ValueRef::Timestamp( + Timestamp::new_microsecond(*v) + )) + } + ColumnDataType::TimestampNanosecond => { + collect_values!(values.ts_nanosecond_values, |v| ValueRef::Timestamp( + Timestamp::new_nanosecond(*v) + )) + } } @@ -289,10 +314,7 @@ pub fn insertion_expr_to_request( }, )?; let data_type = &column_schema.data_type; - entry.insert(VectorBuilder::with_capacity( - data_type.clone(), - row_count as usize, - )) + entry.insert(data_type.create_mutable_vector(row_count as usize)) } }; add_values_to_builder(vector_builder, values, row_count as usize, null_mask)?; @@ -300,7 +322,7 @@ } let columns_values = columns_builders .into_iter() - .map(|(column_name, mut vector_builder)| (column_name, vector_builder.finish())) + .map(|(column_name, mut vector_builder)| (column_name, vector_builder.to_vector())) .collect(); Ok(InsertRequest { @@ -312,7 +334,7 @@ } fn add_values_to_builder( - builder: &mut VectorBuilder, + builder: &mut Box, values: Values, row_count: usize, null_mask: Vec, @@ -323,9 +345,11 @@ fn add_values_to_builder( if null_mask.is_empty() { ensure!(values.len() == row_count, IllegalInsertDataSnafu); - values.iter().for_each(|value| { - builder.push(value); - }); + values.iter().try_for_each(|value| { + builder + .push_value_ref(value.as_value_ref()) + .context(CreateVectorSnafu) + })?; } else { let null_mask = BitVec::from_vec(null_mask); ensure!( @@ -336,9 +360,13 @@ fn add_values_to_builder( let mut idx_of_values = 0; for idx in 0..row_count { match is_null(&null_mask, idx) { - Some(true) => builder.push(&Value::Null), + Some(true) => builder + .push_value_ref(ValueRef::Null) + .context(CreateVectorSnafu)?, _ => { - builder.push(&values[idx_of_values]); + builder + .push_value_ref(values[idx_of_values].as_value_ref()) + .context(CreateVectorSnafu)?; idx_of_values += 1 } } @@ -418,9 +446,9 @@ fn convert_values(data_type: &ConcreteDataType, values: Values) -> Vec { .map(|v| 
Value::Date(v.into())) .collect(), ConcreteDataType::Timestamp(_) => values - .ts_millis_values + .ts_millisecond_values .into_iter() - .map(|v| Value::Timestamp(Timestamp::from_millis(v))) + .map(|v| Value::Timestamp(Timestamp::new_millisecond(v))) .collect(), ConcreteDataType::Null(_) => unreachable!(), ConcreteDataType::List(_) => unreachable!(), @@ -543,7 +571,7 @@ mod tests { ); assert_eq!( - ConcreteDataType::timestamp_millis_datatype(), + ConcreteDataType::timestamp_millisecond_datatype(), ConcreteDataType::from( ColumnDataTypeWrapper::try_new( column_defs @@ -624,8 +652,8 @@ mod tests { assert_eq!(Value::Float64(0.1.into()), memory.get(1)); let ts = insert_req.columns_values.get("ts").unwrap(); - assert_eq!(Value::Timestamp(Timestamp::from_millis(100)), ts.get(0)); - assert_eq!(Value::Timestamp(Timestamp::from_millis(101)), ts.get(1)); + assert_eq!(Value::Timestamp(Timestamp::new_millisecond(100)), ts.get(0)); + assert_eq!(Value::Timestamp(Timestamp::new_millisecond(101)), ts.get(1)); } #[test] @@ -675,8 +703,12 @@ mod tests { ColumnSchema::new("host", ConcreteDataType::string_datatype(), false), ColumnSchema::new("cpu", ConcreteDataType::float64_datatype(), true), ColumnSchema::new("memory", ConcreteDataType::float64_datatype(), true), - ColumnSchema::new("ts", ConcreteDataType::timestamp_millis_datatype(), true) - .with_time_index(true), + ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + true, + ) + .with_time_index(true), ]; Arc::new( @@ -741,7 +773,7 @@ mod tests { }; let ts_vals = column::Values { - ts_millis_values: vec![100, 101], + ts_millisecond_values: vec![100, 101], ..Default::default() }; let ts_column = Column { @@ -749,7 +781,7 @@ mod tests { semantic_type: TIMESTAMP_SEMANTIC_TYPE, values: Some(ts_vals), null_mask: vec![0], - datatype: ColumnDataType::Timestamp as i32, + datatype: ColumnDataType::TimestampMillisecond as i32, }; ( diff --git a/src/common/grpc/Cargo.toml b/src/common/grpc/Cargo.toml index f1a60addbab9..b1b5a25b6ed3 100644 --- a/src/common/grpc/Cargo.toml +++ b/src/common/grpc/Cargo.toml @@ -13,9 +13,7 @@ common-query = { path = "../query" } common-recordbatch = { path = "../recordbatch" } common-runtime = { path = "../runtime" } dashmap = "5.4" -datafusion = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2", features = [ - "simd", -] } +datafusion = "14.0.0" datatypes = { path = "../../datatypes" } snafu = { version = "0.7", features = ["backtraces"] } tokio = { version = "1.0", features = ["full"] } diff --git a/src/common/grpc/src/select.rs b/src/common/grpc/src/select.rs index 516f697d3bb1..3a572ab13744 100644 --- a/src/common/grpc/src/select.rs +++ b/src/common/grpc/src/select.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
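// ---------------------------------------------------------------------------
// Editor's note (illustration only, not part of the patch): how the reworked
// column_to_vector consumes a proto null mask. BitVec reads mask bytes
// LSB-first (the null_mask tests later in this patch assert 0b0010_0101 for
// nulls at rows 0, 2 and 5), so 0b0000_0101 over four rows marks rows 0 and
// 2 null, and the supplied values fill the remaining rows in order. The
// field values below are made up.
//
// let column = Column {
//     column_name: "cpu".to_string(),
//     values: Some(Values { f64_values: vec![0.5, 0.6], ..Default::default() }),
//     null_mask: vec![0b0000_0101],
//     datatype: ColumnDataType::Float64 as i32,
//     ..Default::default()
// };
// let vector = column_to_vector(&column, 4)?;
// // vector now holds [Null, 0.5, Null, 0.6]
// ---------------------------------------------------------------------------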
-use std::sync::Arc; - use api::helper::ColumnDataTypeWrapper; use api::result::{build_err_result, ObjectResultBuilder}; use api::v1::codec::SelectResult; @@ -24,9 +22,14 @@ use common_error::prelude::ErrorExt; use common_error::status_code::StatusCode; use common_query::Output; use common_recordbatch::{RecordBatches, SendableRecordBatchStream}; -use datatypes::arrow::array::{Array, BooleanArray, PrimitiveArray}; -use datatypes::arrow_array::{BinaryArray, StringArray}; use datatypes::schema::SchemaRef; +use datatypes::types::{TimestampType, WrapperType}; +use datatypes::vectors::{ + BinaryVector, BooleanVector, DateTimeVector, DateVector, Float32Vector, Float64Vector, + Int16Vector, Int32Vector, Int64Vector, Int8Vector, StringVector, TimestampMicrosecondVector, + TimestampMillisecondVector, TimestampNanosecondVector, TimestampSecondVector, UInt16Vector, + UInt32Vector, UInt64Vector, UInt8Vector, VectorRef, +}; use snafu::{OptionExt, ResultExt}; use crate::error::{self, ConversionSnafu, Result}; @@ -46,6 +49,7 @@ pub async fn to_object_result(output: std::result::Result Err(e) => build_err_result(&e), } } + async fn collect(stream: SendableRecordBatchStream) -> Result { let recordbatches = RecordBatches::try_collect(stream) .await @@ -78,10 +82,7 @@ fn try_convert(record_batches: RecordBatches) -> Result { let schema = record_batches.schema(); let record_batches = record_batches.take(); - let row_count: usize = record_batches - .iter() - .map(|r| r.df_recordbatch.num_rows()) - .sum(); + let row_count: usize = record_batches.iter().map(|r| r.num_rows()).sum(); let schemas = schema.column_schemas(); let mut columns = Vec::with_capacity(schemas.len()); @@ -89,9 +90,9 @@ fn try_convert(record_batches: RecordBatches) -> Result { for (idx, column_schema) in schemas.iter().enumerate() { let column_name = column_schema.name.clone(); - let arrays: Vec> = record_batches + let arrays: Vec<_> = record_batches .iter() - .map(|r| r.df_recordbatch.columns()[idx].clone()) + .map(|r| r.column(idx).clone()) .collect(); let column = Column { @@ -112,7 +113,7 @@ fn try_convert(record_batches: RecordBatches) -> Result { }) } -pub fn null_mask(arrays: &Vec>, row_count: usize) -> Vec { +pub fn null_mask(arrays: &[VectorRef], row_count: usize) -> Vec { let null_count: usize = arrays.iter().map(|a| a.null_count()).sum(); if null_count == 0 { @@ -122,10 +123,12 @@ pub fn null_mask(arrays: &Vec>, row_count: usize) -> Vec { let mut null_mask = BitVec::with_capacity(row_count); for array in arrays { let validity = array.validity(); - if let Some(v) = validity { - v.iter().for_each(|x| null_mask.push(!x)); - } else { + if validity.is_all_valid() { null_mask.extend_from_bitslice(&BitVec::repeat(false, array.len())); + } else { + for i in 0..array.len() { + null_mask.push(!validity.is_set(i)); + } } } null_mask.into_vec() @@ -133,7 +136,9 @@ pub fn null_mask(arrays: &Vec>, row_count: usize) -> Vec { macro_rules! convert_arrow_array_to_grpc_vals { ($data_type: expr, $arrays: ident, $(($Type: pat, $CastType: ty, $field: ident, $MapFunction: expr)), +) => {{ - use datatypes::arrow::datatypes::{DataType, TimeUnit}; + use datatypes::data_type::{ConcreteDataType}; + use datatypes::prelude::ScalarVector; + match $data_type { $( $Type => { @@ -143,52 +148,114 @@ macro_rules! 
convert_arrow_array_to_grpc_vals { from: format!("{:?}", $data_type), })?; vals.$field.extend(array - .iter() + .iter_data() .filter_map(|i| i.map($MapFunction)) .collect::>()); } return Ok(vals); }, )+ - _ => unimplemented!(), + ConcreteDataType::Null(_) | ConcreteDataType::List(_) => unreachable!("Should not send {:?} in gRPC", $data_type), } }}; } -pub fn values(arrays: &[Arc]) -> Result { +pub fn values(arrays: &[VectorRef]) -> Result { if arrays.is_empty() { return Ok(Values::default()); } let data_type = arrays[0].data_type(); convert_arrow_array_to_grpc_vals!( - data_type, arrays, - - (DataType::Boolean, BooleanArray, bool_values, |x| {x}), - - (DataType::Int8, PrimitiveArray, i8_values, |x| {*x as i32}), - (DataType::Int16, PrimitiveArray, i16_values, |x| {*x as i32}), - (DataType::Int32, PrimitiveArray, i32_values, |x| {*x}), - (DataType::Int64, PrimitiveArray, i64_values, |x| {*x}), - - (DataType::UInt8, PrimitiveArray, u8_values, |x| {*x as u32}), - (DataType::UInt16, PrimitiveArray, u16_values, |x| {*x as u32}), - (DataType::UInt32, PrimitiveArray, u32_values, |x| {*x}), - (DataType::UInt64, PrimitiveArray, u64_values, |x| {*x}), - - (DataType::Float32, PrimitiveArray, f32_values, |x| {*x}), - (DataType::Float64, PrimitiveArray, f64_values, |x| {*x}), - - (DataType::Binary, BinaryArray, binary_values, |x| {x.into()}), - (DataType::LargeBinary, BinaryArray, binary_values, |x| {x.into()}), - - (DataType::Utf8, StringArray, string_values, |x| {x.into()}), - (DataType::LargeUtf8, StringArray, string_values, |x| {x.into()}), - - (DataType::Date32, PrimitiveArray, date_values, |x| {*x as i32}), - (DataType::Date64, PrimitiveArray, datetime_values,|x| {*x as i64}), - - (DataType::Timestamp(TimeUnit::Millisecond, _), PrimitiveArray, ts_millis_values, |x| {*x}) + data_type, + arrays, + ( + ConcreteDataType::Boolean(_), + BooleanVector, + bool_values, + |x| { x } + ), + (ConcreteDataType::Int8(_), Int8Vector, i8_values, |x| { + i32::from(x) + }), + (ConcreteDataType::Int16(_), Int16Vector, i16_values, |x| { + i32::from(x) + }), + (ConcreteDataType::Int32(_), Int32Vector, i32_values, |x| { + x + }), + (ConcreteDataType::Int64(_), Int64Vector, i64_values, |x| { + x + }), + (ConcreteDataType::UInt8(_), UInt8Vector, u8_values, |x| { + u32::from(x) + }), + (ConcreteDataType::UInt16(_), UInt16Vector, u16_values, |x| { + u32::from(x) + }), + (ConcreteDataType::UInt32(_), UInt32Vector, u32_values, |x| { + x + }), + (ConcreteDataType::UInt64(_), UInt64Vector, u64_values, |x| { + x + }), + ( + ConcreteDataType::Float32(_), + Float32Vector, + f32_values, + |x| { x } + ), + ( + ConcreteDataType::Float64(_), + Float64Vector, + f64_values, + |x| { x } + ), + ( + ConcreteDataType::Binary(_), + BinaryVector, + binary_values, + |x| { x.into() } + ), + ( + ConcreteDataType::String(_), + StringVector, + string_values, + |x| { x.into() } + ), + (ConcreteDataType::Date(_), DateVector, date_values, |x| { + x.val() + }), + ( + ConcreteDataType::DateTime(_), + DateTimeVector, + datetime_values, + |x| { x.val() } + ), + ( + ConcreteDataType::Timestamp(TimestampType::Second(_)), + TimestampSecondVector, + ts_second_values, + |x| { x.into_native() } + ), + ( + ConcreteDataType::Timestamp(TimestampType::Millisecond(_)), + TimestampMillisecondVector, + ts_millisecond_values, + |x| { x.into_native() } + ), + ( + ConcreteDataType::Timestamp(TimestampType::Microsecond(_)), + TimestampMicrosecondVector, + ts_microsecond_values, + |x| { x.into_native() } + ), + ( + 
ConcreteDataType::Timestamp(TimestampType::Nanosecond(_)), + TimestampNanosecondVector, + ts_nanosecond_values, + |x| { x.into_native() } + ) ) } @@ -197,14 +264,10 @@ mod tests { use std::sync::Arc; use common_recordbatch::{RecordBatch, RecordBatches}; - use datafusion::field_util::SchemaExt; - use datatypes::arrow::array::{Array, BooleanArray, PrimitiveArray}; - use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; - use datatypes::arrow_array::StringArray; - use datatypes::schema::Schema; - use datatypes::vectors::{UInt32Vector, VectorRef}; + use datatypes::data_type::ConcreteDataType; + use datatypes::schema::{ColumnSchema, Schema}; - use crate::select::{null_mask, try_convert, values}; + use super::*; #[test] fn test_convert_record_batches_to_select_result() { @@ -230,9 +293,8 @@ mod tests { #[test] fn test_convert_arrow_arrays_i32() { - let array: PrimitiveArray = - PrimitiveArray::from(vec![Some(1), Some(2), None, Some(3)]); - let array: Arc = Arc::new(array); + let array = Int32Vector::from(vec![Some(1), Some(2), None, Some(3)]); + let array: VectorRef = Arc::new(array); let values = values(&[array]).unwrap(); @@ -241,14 +303,14 @@ mod tests { #[test] fn test_convert_arrow_arrays_string() { - let array = StringArray::from(vec![ + let array = StringVector::from(vec![ Some("1".to_string()), Some("2".to_string()), None, Some("3".to_string()), None, ]); - let array: Arc = Arc::new(array); + let array: VectorRef = Arc::new(array); let values = values(&[array]).unwrap(); @@ -257,8 +319,8 @@ mod tests { #[test] fn test_convert_arrow_arrays_bool() { - let array = BooleanArray::from(vec![Some(true), Some(false), None, Some(false), None]); - let array: Arc = Arc::new(array); + let array = BooleanVector::from(vec![Some(true), Some(false), None, Some(false), None]); + let array: VectorRef = Arc::new(array); let values = values(&[array]).unwrap(); @@ -267,43 +329,42 @@ mod tests { #[test] fn test_convert_arrow_arrays_empty() { - let array = BooleanArray::from(vec![None, None, None, None, None]); - let array: Arc = Arc::new(array); + let array = BooleanVector::from(vec![None, None, None, None, None]); + let array: VectorRef = Arc::new(array); let values = values(&[array]).unwrap(); - assert_eq!(Vec::::default(), values.bool_values); + assert!(values.bool_values.is_empty()); } #[test] fn test_null_mask() { - let a1: Arc = Arc::new(PrimitiveArray::from(vec![None, Some(2), None])); - let a2: Arc = - Arc::new(PrimitiveArray::from(vec![Some(1), Some(2), None, Some(4)])); - let mask = null_mask(&vec![a1, a2], 3 + 4); + let a1: VectorRef = Arc::new(Int32Vector::from(vec![None, Some(2), None])); + let a2: VectorRef = Arc::new(Int32Vector::from(vec![Some(1), Some(2), None, Some(4)])); + let mask = null_mask(&[a1, a2], 3 + 4); assert_eq!(vec![0b0010_0101], mask); - let empty: Arc = Arc::new(PrimitiveArray::::from(vec![None, None, None])); - let mask = null_mask(&vec![empty.clone(), empty.clone(), empty], 9); + let empty: VectorRef = Arc::new(Int32Vector::from(vec![None, None, None])); + let mask = null_mask(&[empty.clone(), empty.clone(), empty], 9); assert_eq!(vec![0b1111_1111, 0b0000_0001], mask); - let a1: Arc = Arc::new(PrimitiveArray::from(vec![Some(1), Some(2), Some(3)])); - let a2: Arc = Arc::new(PrimitiveArray::from(vec![Some(4), Some(5), Some(6)])); - let mask = null_mask(&vec![a1, a2], 3 + 3); + let a1: VectorRef = Arc::new(Int32Vector::from(vec![Some(1), Some(2), Some(3)])); + let a2: VectorRef = Arc::new(Int32Vector::from(vec![Some(4), Some(5), Some(6)])); + let mask = 
null_mask(&[a1, a2], 3 + 3); assert_eq!(Vec::::default(), mask); - let a1: Arc = Arc::new(PrimitiveArray::from(vec![Some(1), Some(2), Some(3)])); - let a2: Arc = Arc::new(PrimitiveArray::from(vec![Some(4), Some(5), None])); - let mask = null_mask(&vec![a1, a2], 3 + 3); + let a1: VectorRef = Arc::new(Int32Vector::from(vec![Some(1), Some(2), Some(3)])); + let a2: VectorRef = Arc::new(Int32Vector::from(vec![Some(4), Some(5), None])); + let mask = null_mask(&[a1, a2], 3 + 3); assert_eq!(vec![0b0010_0000], mask); } fn mock_record_batch() -> RecordBatch { - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("c1", DataType::UInt32, false), - Field::new("c2", DataType::UInt32, false), - ])); - let schema = Arc::new(Schema::try_from(arrow_schema).unwrap()); + let column_schemas = vec![ + ColumnSchema::new("c1", ConcreteDataType::uint32_datatype(), true), + ColumnSchema::new("c2", ConcreteDataType::uint32_datatype(), true), + ]; + let schema = Arc::new(Schema::try_new(column_schemas).unwrap()); let v1 = Arc::new(UInt32Vector::from(vec![Some(1), Some(2), None])); let v2 = Arc::new(UInt32Vector::from(vec![Some(1), None, None])); diff --git a/src/common/grpc/src/writer.rs b/src/common/grpc/src/writer.rs index 2cd28f45af9d..d05a2908e179 100644 --- a/src/common/grpc/src/writer.rs +++ b/src/common/grpc/src/writer.rs @@ -45,11 +45,11 @@ impl LinesWriter { pub fn write_ts(&mut self, column_name: &str, value: (i64, Precision)) -> Result<()> { let (idx, column) = self.mut_column( column_name, - ColumnDataType::Timestamp, + ColumnDataType::TimestampMillisecond, SemanticType::Timestamp, ); ensure!( - column.datatype == ColumnDataType::Timestamp as i32, + column.datatype == ColumnDataType::TimestampMillisecond as i32, TypeMismatchSnafu { column_name, expected: "timestamp", @@ -58,7 +58,9 @@ impl LinesWriter { ); // It is safe to use unwrap here, because values has been initialized in mut_column() let values = column.values.as_mut().unwrap(); - values.ts_millis_values.push(to_ms_ts(value.1, value.0)); + values + .ts_millisecond_values + .push(to_ms_ts(value.1, value.0)); self.null_masks[idx].push(false); Ok(()) } @@ -224,23 +226,23 @@ impl LinesWriter { pub fn to_ms_ts(p: Precision, ts: i64) -> i64 { match p { - Precision::NANOSECOND => ts / 1_000_000, - Precision::MICROSECOND => ts / 1000, - Precision::MILLISECOND => ts, - Precision::SECOND => ts * 1000, - Precision::MINUTE => ts * 1000 * 60, - Precision::HOUR => ts * 1000 * 60 * 60, + Precision::Nanosecond => ts / 1_000_000, + Precision::Microsecond => ts / 1000, + Precision::Millisecond => ts, + Precision::Second => ts * 1000, + Precision::Minute => ts * 1000 * 60, + Precision::Hour => ts * 1000 * 60 * 60, } } #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum Precision { - NANOSECOND, - MICROSECOND, - MILLISECOND, - SECOND, - MINUTE, - HOUR, + Nanosecond, + Microsecond, + Millisecond, + Second, + Minute, + Hour, } #[cfg(test)] @@ -261,13 +263,13 @@ mod tests { writer.write_f64("memory", 0.4).unwrap(); writer.write_string("name", "name1").unwrap(); writer - .write_ts("ts", (101011000, Precision::MILLISECOND)) + .write_ts("ts", (101011000, Precision::Millisecond)) .unwrap(); writer.commit(); writer.write_tag("host", "host2").unwrap(); writer - .write_ts("ts", (102011001, Precision::MILLISECOND)) + .write_ts("ts", (102011001, Precision::Millisecond)) .unwrap(); writer.write_bool("enable_reboot", true).unwrap(); writer.write_u64("year_of_service", 2).unwrap(); @@ -278,7 +280,7 @@ mod tests { writer.write_f64("cpu", 0.4).unwrap(); 
writer.write_u64("cpu_core_num", 16).unwrap(); writer - .write_ts("ts", (103011002, Precision::MILLISECOND)) + .write_ts("ts", (103011002, Precision::Millisecond)) .unwrap(); writer.commit(); @@ -321,11 +323,11 @@ mod tests { let column = &columns[4]; assert_eq!("ts", column.column_name); - assert_eq!(ColumnDataType::Timestamp as i32, column.datatype); + assert_eq!(ColumnDataType::TimestampMillisecond as i32, column.datatype); assert_eq!(SemanticType::Timestamp as i32, column.semantic_type); assert_eq!( vec![101011000, 102011001, 103011002], - column.values.as_ref().unwrap().ts_millis_values + column.values.as_ref().unwrap().ts_millisecond_values ); verify_null_mask(&column.null_mask, vec![false, false, false]); @@ -367,16 +369,16 @@ mod tests { #[test] fn test_to_ms() { - assert_eq!(100, to_ms_ts(Precision::NANOSECOND, 100110000)); - assert_eq!(100110, to_ms_ts(Precision::MICROSECOND, 100110000)); - assert_eq!(100110000, to_ms_ts(Precision::MILLISECOND, 100110000)); + assert_eq!(100, to_ms_ts(Precision::Nanosecond, 100110000)); + assert_eq!(100110, to_ms_ts(Precision::Microsecond, 100110000)); + assert_eq!(100110000, to_ms_ts(Precision::Millisecond, 100110000)); assert_eq!( 100110000 * 1000 * 60, - to_ms_ts(Precision::MINUTE, 100110000) + to_ms_ts(Precision::Minute, 100110000) ); assert_eq!( 100110000 * 1000 * 60 * 60, - to_ms_ts(Precision::HOUR, 100110000) + to_ms_ts(Precision::Hour, 100110000) ); } } diff --git a/src/common/query/Cargo.toml b/src/common/query/Cargo.toml index 7b9f87617b78..bd8f0bbf3a62 100644 --- a/src/common/query/Cargo.toml +++ b/src/common/query/Cargo.toml @@ -9,11 +9,9 @@ async-trait = "0.1" common-error = { path = "../error" } common-recordbatch = { path = "../recordbatch" } common-time = { path = "../time" } -datafusion = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2", features = [ - "simd", -] } -datafusion-common = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" } -datafusion-expr = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" } +datafusion = "14.0.0" +datafusion-common = "14.0.0" +datafusion-expr = "14.0.0" datatypes = { path = "../../datatypes" } snafu = { version = "0.7", features = ["backtraces"] } statrs = "0.15" diff --git a/src/common/query/src/error.rs b/src/common/query/src/error.rs index 7c5c224d1eb1..25c169baa5b2 100644 --- a/src/common/query/src/error.rs +++ b/src/common/query/src/error.rs @@ -23,16 +23,9 @@ use datatypes::error::Error as DataTypeError; use datatypes::prelude::ConcreteDataType; use statrs::StatsError; -common_error::define_opaque_error!(Error); - #[derive(Debug, Snafu)] #[snafu(visibility(pub))] -pub enum InnerError { - #[snafu(display("Fail to cast array to {:?}, source: {}", typ, source))] - TypeCast { - source: ArrowError, - typ: arrow::datatypes::DataType, - }, +pub enum Error { #[snafu(display("Fail to execute function, source: {}", source))] ExecuteFunction { source: DataFusionError, @@ -83,8 +76,8 @@ pub enum InnerError { backtrace: Backtrace, }, - #[snafu(display("Invalid inputs: {}", err_msg))] - InvalidInputs { + #[snafu(display("Invalid input type: {}", err_msg))] + InvalidInputType { #[snafu(backtrace)] source: DataTypeError, err_msg: String, @@ -133,37 +126,74 @@ pub enum InnerError { #[snafu(backtrace)] source: BoxedError, }, + + #[snafu(display("Failed to cast array to {:?}, source: {}", typ, source))] + TypeCast { + source: ArrowError, + typ: arrow::datatypes::DataType, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to 
perform compute operation on arrow arrays, source: {}", source ))] + ArrowCompute { + source: ArrowError, + backtrace: Backtrace, + }, + + #[snafu(display("Query engine failed to cast value: {}", source))] + ToScalarValue { + #[snafu(backtrace)] + source: DataTypeError, + }, + + #[snafu(display("Failed to get scalar vector, {}", source))] + GetScalarVector { + #[snafu(backtrace)] + source: DataTypeError, + }, + + #[snafu(display("Invalid function args: {}", err_msg))] + InvalidFuncArgs { + err_msg: String, + backtrace: Backtrace, + }, } pub type Result = std::result::Result; -impl ErrorExt for InnerError { +impl ErrorExt for Error { fn status_code(&self) -> StatusCode { match self { - InnerError::ExecuteFunction { .. } - | InnerError::GenerateFunction { .. } - | InnerError::CreateAccumulator { .. } - | InnerError::DowncastVector { .. } - | InnerError::InvalidInputState { .. } - | InnerError::InvalidInputCol { .. } - | InnerError::BadAccumulatorImpl { .. } => StatusCode::EngineExecuteQuery, - - InnerError::InvalidInputs { source, .. } - | InnerError::IntoVector { source, .. } - | InnerError::FromScalarValue { source } - | InnerError::ConvertArrowSchema { source } - | InnerError::FromArrowArray { source } => source.status_code(), - - InnerError::ExecuteRepeatedly { .. } - | InnerError::GeneralDataFusion { .. } - | InnerError::DataFusionExecutionPlan { .. } => StatusCode::Unexpected, - - InnerError::UnsupportedInputDataType { .. } | InnerError::TypeCast { .. } => { - StatusCode::InvalidArguments - } - - InnerError::ConvertDfRecordBatchStream { source, .. } => source.status_code(), - InnerError::ExecutePhysicalPlan { source } => source.status_code(), + Error::ExecuteFunction { .. } + | Error::GenerateFunction { .. } + | Error::CreateAccumulator { .. } + | Error::DowncastVector { .. } + | Error::InvalidInputState { .. } + | Error::InvalidInputCol { .. } + | Error::BadAccumulatorImpl { .. } + | Error::ToScalarValue { .. } + | Error::GetScalarVector { .. } + | Error::ArrowCompute { .. } => StatusCode::EngineExecuteQuery, + + Error::InvalidInputType { source, .. } + | Error::IntoVector { source, .. } + | Error::FromScalarValue { source } + | Error::ConvertArrowSchema { source } + | Error::FromArrowArray { source } => source.status_code(), + + Error::ExecuteRepeatedly { .. } + | Error::GeneralDataFusion { .. } + | Error::DataFusionExecutionPlan { .. } => StatusCode::Unexpected, + + Error::UnsupportedInputDataType { .. } + | Error::TypeCast { .. } + | Error::InvalidFuncArgs { .. } => StatusCode::InvalidArguments, + + Error::ConvertDfRecordBatchStream { source, .. 
} => source.status_code(), + Error::ExecutePhysicalPlan { source } => source.status_code(), } } @@ -176,12 +206,6 @@ impl ErrorExt for InnerError { } } -impl From for Error { - fn from(e: InnerError) -> Error { - Error::new(e) - } -} - impl From for DataFusionError { fn from(e: Error) -> DataFusionError { DataFusionError::External(Box::new(e)) @@ -190,7 +214,7 @@ impl From for DataFusionError { impl From for Error { fn from(source: BoxedError) -> Self { - InnerError::ExecutePhysicalPlan { source }.into() + Error::ExecutePhysicalPlan { source } } } @@ -206,60 +230,51 @@ mod tests { } fn assert_error(err: &Error, code: StatusCode) { - let inner_err = err.as_any().downcast_ref::().unwrap(); + let inner_err = err.as_any().downcast_ref::().unwrap(); assert_eq!(code, inner_err.status_code()); assert!(inner_err.backtrace_opt().is_some()); } #[test] fn test_datafusion_as_source() { - let err: Error = throw_df_error() + let err = throw_df_error() .context(ExecuteFunctionSnafu) .err() - .unwrap() - .into(); + .unwrap(); assert_error(&err, StatusCode::EngineExecuteQuery); let err: Error = throw_df_error() .context(GeneralDataFusionSnafu) .err() - .unwrap() - .into(); + .unwrap(); assert_error(&err, StatusCode::Unexpected); - let err: Error = throw_df_error() + let err = throw_df_error() .context(DataFusionExecutionPlanSnafu) .err() - .unwrap() - .into(); + .unwrap(); assert_error(&err, StatusCode::Unexpected); } #[test] fn test_execute_repeatedly_error() { - let error: Error = None:: - .context(ExecuteRepeatedlySnafu) - .err() - .unwrap() - .into(); - assert_eq!(error.inner.status_code(), StatusCode::Unexpected); + let error = None::.context(ExecuteRepeatedlySnafu).err().unwrap(); + assert_eq!(error.status_code(), StatusCode::Unexpected); assert!(error.backtrace_opt().is_some()); } #[test] fn test_convert_df_recordbatch_stream_error() { let result: std::result::Result = - Err(common_recordbatch::error::InnerError::PollStream { - source: ArrowError::Overflow, + Err(common_recordbatch::error::Error::PollStream { + source: ArrowError::DivideByZero, backtrace: Backtrace::generate(), - } - .into()); - let error: Error = result + }); + let error = result .context(ConvertDfRecordBatchStreamSnafu) .err() - .unwrap() - .into(); - assert_eq!(error.inner.status_code(), StatusCode::Internal); + .unwrap(); + assert_eq!(error.status_code(), StatusCode::Internal); assert!(error.backtrace_opt().is_some()); } @@ -272,13 +287,12 @@ mod tests { #[test] fn test_into_vector_error() { - let err: Error = raise_datatype_error() + let err = raise_datatype_error() .context(IntoVectorSnafu { data_type: ArrowDatatype::Int32, }) .err() - .unwrap() - .into(); + .unwrap(); assert!(err.backtrace_opt().is_some()); let datatype_err = raise_datatype_error().err().unwrap(); assert_eq!(datatype_err.status_code(), err.status_code()); diff --git a/src/common/query/src/logical_plan/mod.rs b/src/common/query/src/logical_plan.rs similarity index 97% rename from src/common/query/src/logical_plan/mod.rs rename to src/common/query/src/logical_plan.rs index 5f57cd96aac4..a0df518ce7f2 100644 --- a/src/common/query/src/logical_plan/mod.rs +++ b/src/common/query/src/logical_plan.rs @@ -22,7 +22,7 @@ use std::sync::Arc; use datatypes::prelude::ConcreteDataType; pub use self::accumulator::{Accumulator, AggregateFunctionCreator, AggregateFunctionCreatorRef}; -pub use self::expr::Expr; +pub use self::expr::{DfExpr, Expr}; pub use self::udaf::AggregateFunction; pub use self::udf::ScalarUdf; use crate::function::{ReturnTypeFunction, 
ScalarFunctionImplementation}; @@ -148,9 +148,7 @@ mod tests { let args = vec![ DfColumnarValue::Scalar(ScalarValue::Boolean(Some(true))), - DfColumnarValue::Array(Arc::new(BooleanArray::from_slice(vec![ - true, false, false, true, - ]))), + DfColumnarValue::Array(Arc::new(BooleanArray::from(vec![true, false, false, true]))), ]; // call the function diff --git a/src/common/query/src/logical_plan/accumulator.rs b/src/common/query/src/logical_plan/accumulator.rs index 717214f3ff12..cce139094e1b 100644 --- a/src/common/query/src/logical_plan/accumulator.rs +++ b/src/common/query/src/logical_plan/accumulator.rs @@ -17,12 +17,10 @@ use std::fmt::Debug; use std::sync::Arc; -use common_time::timestamp::TimeUnit; use datafusion_common::Result as DfResult; -use datafusion_expr::Accumulator as DfAccumulator; +use datafusion_expr::{Accumulator as DfAccumulator, AggregateState}; use datatypes::arrow::array::ArrayRef; use datatypes::prelude::*; -use datatypes::value::ListValue; use datatypes::vectors::{Helper as VectorHelper, VectorRef}; use snafu::ResultExt; @@ -128,356 +126,53 @@ impl DfAccumulatorAdaptor { } impl DfAccumulator for DfAccumulatorAdaptor { - fn state(&self) -> DfResult> { + fn state(&self) -> DfResult> { let state_values = self.accumulator.state()?; let state_types = self.creator.state_types()?; if state_values.len() != state_types.len() { return error::BadAccumulatorImplSnafu { err_msg: format!("Accumulator {:?} returned state values size do not match its state types size.", self), } - .fail() - .map_err(Error::from)?; + .fail()?; } Ok(state_values .into_iter() .zip(state_types.iter()) - .map(|(v, t)| try_into_scalar_value(v, t)) - .collect::>>() - .map_err(Error::from)?) + .map(|(v, t)| { + let scalar = v + .try_to_scalar_value(t) + .context(error::ToScalarValueSnafu)?; + Ok(AggregateState::Scalar(scalar)) + }) + .collect::>>()?) } fn update_batch(&mut self, values: &[ArrayRef]) -> DfResult<()> { - let vectors = VectorHelper::try_into_vectors(values) - .context(FromScalarValueSnafu) - .map_err(Error::from)?; - self.accumulator - .update_batch(&vectors) - .map_err(|e| e.into()) + let vectors = VectorHelper::try_into_vectors(values).context(FromScalarValueSnafu)?; + self.accumulator.update_batch(&vectors)?; + Ok(()) } fn merge_batch(&mut self, states: &[ArrayRef]) -> DfResult<()> { let mut vectors = Vec::with_capacity(states.len()); for array in states.iter() { vectors.push( - VectorHelper::try_into_vector(array) - .context(IntoVectorSnafu { - data_type: array.data_type().clone(), - }) - .map_err(Error::from)?, + VectorHelper::try_into_vector(array).context(IntoVectorSnafu { + data_type: array.data_type().clone(), + })?, ); } - self.accumulator.merge_batch(&vectors).map_err(|e| e.into()) + self.accumulator.merge_batch(&vectors)?; + Ok(()) } fn evaluate(&self) -> DfResult { let value = self.accumulator.evaluate()?; let output_type = self.creator.output_type()?; - Ok(try_into_scalar_value(value, &output_type)?) 
- } -} - -fn try_into_scalar_value(value: Value, datatype: &ConcreteDataType) -> Result { - if !matches!(value, Value::Null) && datatype != &value.data_type() { - return error::BadAccumulatorImplSnafu { - err_msg: format!( - "expect value to return datatype {:?}, actual: {:?}", - datatype, - value.data_type() - ), - } - .fail()?; - } - - Ok(match value { - Value::Boolean(v) => ScalarValue::Boolean(Some(v)), - Value::UInt8(v) => ScalarValue::UInt8(Some(v)), - Value::UInt16(v) => ScalarValue::UInt16(Some(v)), - Value::UInt32(v) => ScalarValue::UInt32(Some(v)), - Value::UInt64(v) => ScalarValue::UInt64(Some(v)), - Value::Int8(v) => ScalarValue::Int8(Some(v)), - Value::Int16(v) => ScalarValue::Int16(Some(v)), - Value::Int32(v) => ScalarValue::Int32(Some(v)), - Value::Int64(v) => ScalarValue::Int64(Some(v)), - Value::Float32(v) => ScalarValue::Float32(Some(v.0)), - Value::Float64(v) => ScalarValue::Float64(Some(v.0)), - Value::String(v) => ScalarValue::Utf8(Some(v.as_utf8().to_string())), - Value::Binary(v) => ScalarValue::LargeBinary(Some(v.to_vec())), - Value::Date(v) => ScalarValue::Date32(Some(v.val())), - Value::DateTime(v) => ScalarValue::Date64(Some(v.val())), - Value::Null => try_convert_null_value(datatype)?, - Value::List(list) => try_convert_list_value(list)?, - Value::Timestamp(t) => timestamp_to_scalar_value(t.unit(), Some(t.value())), - }) -} - -fn timestamp_to_scalar_value(unit: TimeUnit, val: Option) -> ScalarValue { - match unit { - TimeUnit::Second => ScalarValue::TimestampSecond(val, None), - TimeUnit::Millisecond => ScalarValue::TimestampMillisecond(val, None), - TimeUnit::Microsecond => ScalarValue::TimestampMicrosecond(val, None), - TimeUnit::Nanosecond => ScalarValue::TimestampNanosecond(val, None), - } -} - -fn try_convert_null_value(datatype: &ConcreteDataType) -> Result { - Ok(match datatype { - ConcreteDataType::Boolean(_) => ScalarValue::Boolean(None), - ConcreteDataType::Int8(_) => ScalarValue::Int8(None), - ConcreteDataType::Int16(_) => ScalarValue::Int16(None), - ConcreteDataType::Int32(_) => ScalarValue::Int32(None), - ConcreteDataType::Int64(_) => ScalarValue::Int64(None), - ConcreteDataType::UInt8(_) => ScalarValue::UInt8(None), - ConcreteDataType::UInt16(_) => ScalarValue::UInt16(None), - ConcreteDataType::UInt32(_) => ScalarValue::UInt32(None), - ConcreteDataType::UInt64(_) => ScalarValue::UInt64(None), - ConcreteDataType::Float32(_) => ScalarValue::Float32(None), - ConcreteDataType::Float64(_) => ScalarValue::Float64(None), - ConcreteDataType::Binary(_) => ScalarValue::LargeBinary(None), - ConcreteDataType::String(_) => ScalarValue::Utf8(None), - ConcreteDataType::Timestamp(t) => timestamp_to_scalar_value(t.unit, None), - _ => { - return error::BadAccumulatorImplSnafu { - err_msg: format!( - "undefined transition from null value to datatype {:?}", - datatype - ), - } - .fail()? 
- } - }) -} - -fn try_convert_list_value(list: ListValue) -> Result<ScalarValue> { - let vs = if let Some(items) = list.items() { - Some(Box::new( - items - .iter() - .map(|v| try_into_scalar_value(v.clone(), list.datatype())) - .collect::<Result<Vec<ScalarValue>>>()?, - )) - } else { - None - }; - Ok(ScalarValue::List( - vs, - Box::new(list.datatype().as_arrow_type()), - )) -} - -#[cfg(test)] -mod tests { - use common_base::bytes::{Bytes, StringBytes}; - use datafusion_common::ScalarValue; - use datatypes::arrow::datatypes::DataType; - use datatypes::value::{ListValue, OrderedFloat}; - - use super::*; - - #[test] - fn test_not_null_value_to_scalar_value() { - assert_eq!( - ScalarValue::Boolean(Some(true)), - try_into_scalar_value(Value::Boolean(true), &ConcreteDataType::boolean_datatype()) - .unwrap() - ); - assert_eq!( - ScalarValue::Boolean(Some(false)), - try_into_scalar_value(Value::Boolean(false), &ConcreteDataType::boolean_datatype()) - .unwrap() - ); - assert_eq!( - ScalarValue::UInt8(Some(u8::MIN + 1)), - try_into_scalar_value( - Value::UInt8(u8::MIN + 1), - &ConcreteDataType::uint8_datatype() - ) - .unwrap() - ); - assert_eq!( - ScalarValue::UInt16(Some(u16::MIN + 2)), - try_into_scalar_value( - Value::UInt16(u16::MIN + 2), - &ConcreteDataType::uint16_datatype() - ) - .unwrap() - ); - assert_eq!( - ScalarValue::UInt32(Some(u32::MIN + 3)), - try_into_scalar_value( - Value::UInt32(u32::MIN + 3), - &ConcreteDataType::uint32_datatype() - ) - .unwrap() - ); - assert_eq!( - ScalarValue::UInt64(Some(u64::MIN + 4)), - try_into_scalar_value( - Value::UInt64(u64::MIN + 4), - &ConcreteDataType::uint64_datatype() - ) - .unwrap() - ); - assert_eq!( - ScalarValue::Int8(Some(i8::MIN + 4)), - try_into_scalar_value(Value::Int8(i8::MIN + 4), &ConcreteDataType::int8_datatype()) - .unwrap() - ); - assert_eq!( - ScalarValue::Int16(Some(i16::MIN + 5)), - try_into_scalar_value( - Value::Int16(i16::MIN + 5), - &ConcreteDataType::int16_datatype() - ) - .unwrap() - ); - assert_eq!( - ScalarValue::Int32(Some(i32::MIN + 6)), - try_into_scalar_value( - Value::Int32(i32::MIN + 6), - &ConcreteDataType::int32_datatype() - ) - .unwrap() - ); - assert_eq!( - ScalarValue::Int64(Some(i64::MIN + 7)), - try_into_scalar_value( - Value::Int64(i64::MIN + 7), - &ConcreteDataType::int64_datatype() - ) - .unwrap() - ); - assert_eq!( - ScalarValue::Float32(Some(8.0f32)), - try_into_scalar_value( - Value::Float32(OrderedFloat(8.0f32)), - &ConcreteDataType::float32_datatype() - ) - .unwrap() - ); - assert_eq!( - ScalarValue::Float64(Some(9.0f64)), - try_into_scalar_value( - Value::Float64(OrderedFloat(9.0f64)), - &ConcreteDataType::float64_datatype() - ) - .unwrap() - ); - assert_eq!( - ScalarValue::Utf8(Some("hello".to_string())), - try_into_scalar_value( - Value::String(StringBytes::from("hello")), - &ConcreteDataType::string_datatype(), - ) - .unwrap() - ); - assert_eq!( - ScalarValue::LargeBinary(Some("world".as_bytes().to_vec())), - try_into_scalar_value( - Value::Binary(Bytes::from("world".as_bytes())), - &ConcreteDataType::binary_datatype() - ) - .unwrap() - ); - } - - #[test] - fn test_null_value_to_scalar_value() { - assert_eq!( - ScalarValue::Boolean(None), - try_into_scalar_value(Value::Null, &ConcreteDataType::boolean_datatype()).unwrap() - ); - assert_eq!( - ScalarValue::UInt8(None), - try_into_scalar_value(Value::Null, &ConcreteDataType::uint8_datatype()).unwrap() - ); - assert_eq!( - ScalarValue::UInt16(None), - try_into_scalar_value(Value::Null, &ConcreteDataType::uint16_datatype()).unwrap() - ); - assert_eq!( - ScalarValue::UInt32(None), - 
try_into_scalar_value(Value::Null, &ConcreteDataType::uint32_datatype()).unwrap() - ); - assert_eq!( - ScalarValue::UInt64(None), - try_into_scalar_value(Value::Null, &ConcreteDataType::uint64_datatype()).unwrap() - ); - assert_eq!( - ScalarValue::Int8(None), - try_into_scalar_value(Value::Null, &ConcreteDataType::int8_datatype()).unwrap() - ); - assert_eq!( - ScalarValue::Int16(None), - try_into_scalar_value(Value::Null, &ConcreteDataType::int16_datatype()).unwrap() - ); - assert_eq!( - ScalarValue::Int32(None), - try_into_scalar_value(Value::Null, &ConcreteDataType::int32_datatype()).unwrap() - ); - assert_eq!( - ScalarValue::Int64(None), - try_into_scalar_value(Value::Null, &ConcreteDataType::int64_datatype()).unwrap() - ); - assert_eq!( - ScalarValue::Float32(None), - try_into_scalar_value(Value::Null, &ConcreteDataType::float32_datatype()).unwrap() - ); - assert_eq!( - ScalarValue::Float64(None), - try_into_scalar_value(Value::Null, &ConcreteDataType::float64_datatype()).unwrap() - ); - assert_eq!( - ScalarValue::Utf8(None), - try_into_scalar_value(Value::Null, &ConcreteDataType::string_datatype()).unwrap() - ); - assert_eq!( - ScalarValue::LargeBinary(None), - try_into_scalar_value(Value::Null, &ConcreteDataType::binary_datatype()).unwrap() - ); - } - - #[test] - fn test_list_value_to_scalar_value() { - let items = Some(Box::new(vec![Value::Int32(-1), Value::Null])); - let list = Value::List(ListValue::new(items, ConcreteDataType::int32_datatype())); - let df_list = try_into_scalar_value( - list, - &ConcreteDataType::list_datatype(ConcreteDataType::int32_datatype()), - ) - .unwrap(); - assert!(matches!(df_list, ScalarValue::List(_, _))); - match df_list { - ScalarValue::List(vs, datatype) => { - assert_eq!(*datatype, DataType::Int32); - - assert!(vs.is_some()); - let vs = *vs.unwrap(); - assert_eq!( - vs, - vec![ScalarValue::Int32(Some(-1)), ScalarValue::Int32(None)] - ); - } - _ => unreachable!(), - } - } - - #[test] - pub fn test_timestamp_to_scalar_value() { - assert_eq!( - ScalarValue::TimestampSecond(Some(1), None), - timestamp_to_scalar_value(TimeUnit::Second, Some(1)) - ); - assert_eq!( - ScalarValue::TimestampMillisecond(Some(1), None), - timestamp_to_scalar_value(TimeUnit::Millisecond, Some(1)) - ); - assert_eq!( - ScalarValue::TimestampMicrosecond(Some(1), None), - timestamp_to_scalar_value(TimeUnit::Microsecond, Some(1)) - ); - assert_eq!( - ScalarValue::TimestampNanosecond(Some(1), None), - timestamp_to_scalar_value(TimeUnit::Nanosecond, Some(1)) - ); + let scalar_value = value + .try_to_scalar_value(&output_type) + .context(error::ToScalarValueSnafu) + .map_err(Error::from)?; + Ok(scalar_value) } } diff --git a/src/common/query/src/logical_plan/expr.rs b/src/common/query/src/logical_plan/expr.rs index 45cb12cdebc9..cc8aa1bea330 100644 --- a/src/common/query/src/logical_plan/expr.rs +++ b/src/common/query/src/logical_plan/expr.rs @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -use datafusion::logical_plan::Expr as DfExpr; +pub use datafusion_expr::expr::Expr as DfExpr; /// Central struct of query API. /// Represent logical expressions such as `A + 1`, or `CAST(c1 AS int)`. 
-#[derive(Clone, PartialEq, Hash, Debug)] +#[derive(Clone, PartialEq, Eq, Hash, Debug)] pub struct Expr { df_expr: DfExpr, } diff --git a/src/common/query/src/logical_plan/udaf.rs b/src/common/query/src/logical_plan/udaf.rs index 6fb4a2f68a0e..1f3fb26a9824 100644 --- a/src/common/query/src/logical_plan/udaf.rs +++ b/src/common/query/src/logical_plan/udaf.rs @@ -104,7 +104,7 @@ fn to_df_accumulator_func( accumulator: AccumulatorFunctionImpl, creator: AggregateFunctionCreatorRef, ) -> DfAccumulatorFunctionImplementation { - Arc::new(move || { + Arc::new(move |_| { let accumulator = accumulator()?; let creator = creator.clone(); Ok(Box::new(DfAccumulatorAdaptor::new(accumulator, creator))) diff --git a/src/common/query/src/physical_plan.rs b/src/common/query/src/physical_plan.rs index fae044389723..42bb70087e07 100644 --- a/src/common/query/src/physical_plan.rs +++ b/src/common/query/src/physical_plan.rs @@ -16,12 +16,11 @@ use std::any::Any; use std::fmt::Debug; use std::sync::Arc; -use async_trait::async_trait; -use common_recordbatch::adapter::{AsyncRecordBatchStreamAdapter, DfRecordBatchStreamAdapter}; +use common_recordbatch::adapter::{DfRecordBatchStreamAdapter, RecordBatchStreamAdapter}; use common_recordbatch::{DfSendableRecordBatchStream, SendableRecordBatchStream}; use datafusion::arrow::datatypes::SchemaRef as DfSchemaRef; use datafusion::error::Result as DfResult; -pub use datafusion::execution::runtime_env::RuntimeEnv; +pub use datafusion::execution::context::{SessionContext, TaskContext}; use datafusion::physical_plan::expressions::PhysicalSortExpr; pub use datafusion::physical_plan::Partitioning; use datafusion::physical_plan::Statistics; @@ -63,7 +62,7 @@ pub trait PhysicalPlan: Debug + Send + Sync { fn execute( &self, partition: usize, - runtime: Arc<RuntimeEnv>, + context: Arc<TaskContext>, ) -> Result<SendableRecordBatchStream>; } @@ -111,6 +110,7 @@ impl PhysicalPlan for PhysicalPlanAdapter { .collect(); let plan = self .df_plan + .clone() .with_new_children(children) .context(error::GeneralDataFusionSnafu)?; Ok(Arc::new(PhysicalPlanAdapter::new(self.schema(), plan))) @@ -119,20 +119,22 @@ impl PhysicalPlan for PhysicalPlanAdapter { fn execute( &self, partition: usize, - runtime: Arc<RuntimeEnv>, + context: Arc<TaskContext>, ) -> Result<SendableRecordBatchStream> { let df_plan = self.df_plan.clone(); - let stream = Box::pin(async move { df_plan.execute(partition, runtime).await }); - let stream = AsyncRecordBatchStreamAdapter::new(self.schema(), stream); + let stream = df_plan + .execute(partition, context) + .context(error::GeneralDataFusionSnafu)?; + let adapter = RecordBatchStreamAdapter::try_new(stream) + .context(error::ConvertDfRecordBatchStreamSnafu)?; - Ok(Box::pin(stream)) + Ok(Box::pin(adapter)) } } #[derive(Debug)] pub struct DfPhysicalPlanAdapter(pub PhysicalPlanRef); -#[async_trait] impl DfPhysicalPlan for DfPhysicalPlanAdapter { fn as_any(&self) -> &dyn Any { self @@ -159,15 +161,14 @@ impl DfPhysicalPlan for DfPhysicalPlanAdapter { } fn with_new_children( - &self, + self: Arc<Self>, children: Vec<Arc<dyn DfPhysicalPlan>>, ) -> DfResult<Arc<dyn DfPhysicalPlan>> { let df_schema = self.schema(); let schema: SchemaRef = Arc::new( df_schema .try_into() - .context(error::ConvertArrowSchemaSnafu) - .map_err(error::Error::from)?, + .context(error::ConvertArrowSchemaSnafu)?, ); let children = children .into_iter() @@ -177,12 +178,12 @@ impl DfPhysicalPlan for DfPhysicalPlanAdapter { Ok(Arc::new(DfPhysicalPlanAdapter(plan))) } - async fn execute( + fn execute( &self, partition: usize, - runtime: Arc<RuntimeEnv>, + context: Arc<TaskContext>, ) -> DfResult<DfSendableRecordBatchStream> { - let stream = self.0.execute(partition, 
context)?; Ok(Box::pin(DfRecordBatchStreamAdapter::new(stream))) } @@ -194,16 +195,16 @@ impl DfPhysicalPlan for DfPhysicalPlanAdapter { #[cfg(test)] mod test { + use async_trait::async_trait; use common_recordbatch::{RecordBatch, RecordBatches}; - use datafusion::arrow_print; - use datafusion::datasource::TableProvider as DfTableProvider; - use datafusion::logical_plan::LogicalPlanBuilder; + use datafusion::datasource::{DefaultTableSource, TableProvider as DfTableProvider, TableType}; + use datafusion::execution::context::{SessionContext, SessionState}; use datafusion::physical_plan::collect; use datafusion::physical_plan::empty::EmptyExec; - use datafusion::prelude::ExecutionContext; - use datafusion_common::field_util::SchemaExt; - use datafusion_expr::Expr; + use datafusion_expr::logical_plan::builder::LogicalPlanBuilder; + use datafusion_expr::{Expr, TableSource}; use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + use datatypes::arrow::util::pretty; use datatypes::schema::Schema; use datatypes::vectors::Int32Vector; @@ -225,8 +226,13 @@ mod test { )])) } + fn table_type(&self) -> TableType { + TableType::Base + } + async fn scan( &self, + _ctx: &SessionState, _projection: &Option<Vec<usize>>, _filters: &[Expr], _limit: Option<usize>, ) @@ -240,6 +246,14 @@ } } + impl MyDfTableProvider { + fn table_source() -> Arc<dyn TableSource> { + Arc::new(DefaultTableSource { + table_provider: Arc::new(Self), + }) + } + } + #[derive(Debug)] struct MyExecutionPlan { schema: SchemaRef, } @@ -269,7 +283,7 @@ mod test { fn execute( &self, _partition: usize, - _runtime: Arc<RuntimeEnv>, + _context: Arc<TaskContext>, ) -> Result<SendableRecordBatchStream> { let schema = self.schema(); let recordbatches = RecordBatches::try_new( @@ -295,20 +309,26 @@ mod test { // Test our physical plan can be executed by DataFusion, through adapters. #[tokio::test] async fn test_execute_physical_plan() { - let ctx = ExecutionContext::new(); - let logical_plan = LogicalPlanBuilder::scan("test", Arc::new(MyDfTableProvider), None) - .unwrap() - .build() - .unwrap(); + let ctx = SessionContext::new(); + let logical_plan = + LogicalPlanBuilder::scan("test", MyDfTableProvider::table_source(), None) + .unwrap() + .build() + .unwrap(); let physical_plan = ctx.create_physical_plan(&logical_plan).await.unwrap(); - let df_recordbatches = collect(physical_plan, Arc::new(RuntimeEnv::default())) + let df_recordbatches = collect(physical_plan, Arc::new(TaskContext::from(&ctx))) .await .unwrap(); - let pretty_print = arrow_print::write(&df_recordbatches); - let pretty_print = pretty_print.lines().collect::<Vec<&str>>(); + let pretty_print = pretty::pretty_format_batches(&df_recordbatches).unwrap(); assert_eq!( - pretty_print, - vec!["+---+", "| a |", "+---+", "| 1 |", "| 2 |", "| 3 |", "+---+",] + pretty_print.to_string(), + r#"+---+ +| a | ++---+ +| 1 | +| 2 | +| 3 | ++---+"# ); } diff --git a/src/common/query/src/signature.rs b/src/common/query/src/signature.rs index c8d4963b6eb5..1d57ee7992cd 100644 --- a/src/common/query/src/signature.rs +++ b/src/common/query/src/signature.rs @@ -15,7 +15,7 @@ //! Signature module contains foundational types that are used to represent signatures, types, //! and return types of functions. //! Copied and modified from datafusion. 
-pub use datafusion::physical_plan::functions::Volatility; +pub use datafusion_expr::Volatility; use datafusion_expr::{Signature as DfSignature, TypeSignature as DfTypeSignature}; use datatypes::arrow::datatypes::DataType as ArrowDataType; use datatypes::data_type::DataType; diff --git a/src/common/recordbatch/Cargo.toml b/src/common/recordbatch/Cargo.toml index a8236120617e..634ec6441056 100644 --- a/src/common/recordbatch/Cargo.toml +++ b/src/common/recordbatch/Cargo.toml @@ -6,10 +6,8 @@ license = "Apache-2.0" [dependencies] common-error = { path = "../error" } -datafusion = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2", features = [ - "simd", -] } -datafusion-common = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" } +datafusion = "14.0.0" +datafusion-common = "14.0.0" datatypes = { path = "../../datatypes" } futures = "0.3" paste = "1.0" diff --git a/src/common/recordbatch/src/adapter.rs b/src/common/recordbatch/src/adapter.rs index 2994d8f0788d..2b8436ec4e43 100644 --- a/src/common/recordbatch/src/adapter.rs +++ b/src/common/recordbatch/src/adapter.rs @@ -19,7 +19,6 @@ use std::task::{Context, Poll}; use datafusion::arrow::datatypes::SchemaRef as DfSchemaRef; use datafusion::physical_plan::RecordBatchStream as DfRecordBatchStream; -use datafusion_common::record_batch::RecordBatch as DfRecordBatch; use datafusion_common::DataFusionError; use datatypes::arrow::error::{ArrowError, Result as ArrowResult}; use datatypes::schema::{Schema, SchemaRef}; @@ -28,7 +27,8 @@ use snafu::ResultExt; use crate::error::{self, Result}; use crate::{ - DfSendableRecordBatchStream, RecordBatch, RecordBatchStream, SendableRecordBatchStream, Stream, + DfRecordBatch, DfSendableRecordBatchStream, RecordBatch, RecordBatchStream, + SendableRecordBatchStream, Stream, }; type FutureStream = Pin< @@ -63,8 +63,8 @@ impl Stream for DfRecordBatchStreamAdapter { match Pin::new(&mut self.stream).poll_next(cx) { Poll::Pending => Poll::Pending, Poll::Ready(Some(recordbatch)) => match recordbatch { - Ok(recordbatch) => Poll::Ready(Some(Ok(recordbatch.df_recordbatch))), - Err(e) => Poll::Ready(Some(Err(ArrowError::External("".to_owned(), Box::new(e))))), + Ok(recordbatch) => Poll::Ready(Some(Ok(recordbatch.into_df_record_batch()))), + Err(e) => Poll::Ready(Some(Err(ArrowError::ExternalError(Box::new(e))))), }, Poll::Ready(None) => Poll::Ready(None), } @@ -102,10 +102,13 @@ impl Stream for RecordBatchStreamAdapter { fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> { match Pin::new(&mut self.stream).poll_next(cx) { Poll::Pending => Poll::Pending, - Poll::Ready(Some(df_recordbatch)) => Poll::Ready(Some(Ok(RecordBatch { - schema: self.schema(), - df_recordbatch: df_recordbatch.context(error::PollStreamSnafu)?, - }))), + Poll::Ready(Some(df_record_batch)) => { + let df_record_batch = df_record_batch.context(error::PollStreamSnafu)?; + Poll::Ready(Some(RecordBatch::try_from_df_record_batch( + self.schema(), + df_record_batch, + ))) + } Poll::Ready(None) => Poll::Ready(None), } } @@ -157,10 +160,8 @@ impl Stream for AsyncRecordBatchStreamAdapter { AsyncRecordBatchStreamAdapterState::Inited(stream) => match stream { Ok(stream) => { return Poll::Ready(ready!(Pin::new(stream).poll_next(cx)).map(|df| { - Ok(RecordBatch { - schema: self.schema(), - df_recordbatch: df.context(error::PollStreamSnafu)?, - }) + let df_record_batch = df.context(error::PollStreamSnafu)?; + RecordBatch::try_from_df_record_batch(self.schema(), df_record_batch) })); } Err(e) => { @@ 
-168,8 +169,7 @@ impl Stream for AsyncRecordBatchStreamAdapter { error::CreateRecordBatchesSnafu { reason: format!("Read error {:?} from stream", e), } - .fail() - .map_err(|e| e.into()), + .fail(), )) } }, diff --git a/src/common/recordbatch/src/error.rs b/src/common/recordbatch/src/error.rs index 2425defad8a0..09374413381a 100644 --- a/src/common/recordbatch/src/error.rs +++ b/src/common/recordbatch/src/error.rs @@ -17,13 +17,12 @@ use std::any::Any; use common_error::ext::BoxedError; use common_error::prelude::*; -common_error::define_opaque_error!(Error); pub type Result<T> = std::result::Result<T, Error>; #[derive(Debug, Snafu)] #[snafu(visibility(pub))] -pub enum InnerError { +pub enum Error { #[snafu(display("Fail to create datafusion record batch, source: {}", source))] NewDfRecordBatch { source: datatypes::arrow::error::ArrowError, backtrace: Backtrace, }, @@ -59,20 +58,27 @@ pub enum InnerError { source: datatypes::arrow::error::ArrowError, backtrace: Backtrace, }, + + #[snafu(display("Fail to format record batch, source: {}", source))] + Format { + source: datatypes::arrow::error::ArrowError, + backtrace: Backtrace, + }, } -impl ErrorExt for InnerError { +impl ErrorExt for Error { fn status_code(&self) -> StatusCode { match self { - InnerError::NewDfRecordBatch { .. } => StatusCode::InvalidArguments, + Error::NewDfRecordBatch { .. } => StatusCode::InvalidArguments, - InnerError::DataTypes { .. } - | InnerError::CreateRecordBatches { .. } - | InnerError::PollStream { .. } => StatusCode::Internal, + Error::DataTypes { .. } + | Error::CreateRecordBatches { .. } + | Error::PollStream { .. } + | Error::Format { .. } => StatusCode::Internal, - InnerError::External { source } => source.status_code(), + Error::External { source } => source.status_code(), - InnerError::SchemaConversion { source, .. 
} => source.status_code(), } } @@ -84,9 +90,3 @@ impl ErrorExt for InnerError { self } } - -impl From<InnerError> for Error { - fn from(e: InnerError) -> Error { - Error::new(e) - } -} diff --git a/src/common/recordbatch/src/lib.rs b/src/common/recordbatch/src/lib.rs index 2809040326ff..be96a94a50d8 100644 --- a/src/common/recordbatch/src/lib.rs +++ b/src/common/recordbatch/src/lib.rs @@ -20,16 +20,17 @@ pub mod util; use std::pin::Pin; use std::sync::Arc; -use datafusion::arrow_print; use datafusion::physical_plan::memory::MemoryStream; pub use datafusion::physical_plan::SendableRecordBatchStream as DfSendableRecordBatchStream; +pub use datatypes::arrow::record_batch::RecordBatch as DfRecordBatch; +use datatypes::arrow::util::pretty; use datatypes::prelude::VectorRef; use datatypes::schema::{Schema, SchemaRef}; use error::Result; use futures::task::{Context, Poll}; use futures::{Stream, TryStreamExt}; pub use recordbatch::RecordBatch; -use snafu::ensure; +use snafu::{ensure, ResultExt}; pub trait RecordBatchStream: Stream<Item = Result<RecordBatch>> { fn schema(&self) -> SchemaRef; @@ -65,7 +66,7 @@ impl Stream for EmptyRecordBatchStream { } } -#[derive(Debug)] +#[derive(Debug, PartialEq)] pub struct RecordBatches { schema: SchemaRef, batches: Vec<RecordBatch>, } @@ -98,17 +99,18 @@ impl RecordBatches { self.batches.iter() } - pub fn pretty_print(&self) -> String { - arrow_print::write( - &self - .iter() - .map(|x| x.df_recordbatch.clone()) - .collect::<Vec<_>>(), - ) + pub fn pretty_print(&self) -> Result<String> { + let df_batches = &self + .iter() + .map(|x| x.df_record_batch().clone()) + .collect::<Vec<_>>(); + let result = pretty::pretty_format_batches(df_batches).context(error::FormatSnafu)?; + + Ok(result.to_string()) } pub fn try_new(schema: SchemaRef, batches: Vec<RecordBatch>) -> Result<Self> { - for batch in batches.iter() { + for batch in &batches { ensure!( batch.schema == schema, error::CreateRecordBatchesSnafu { @@ -144,7 +146,7 @@ impl RecordBatches { let df_record_batches = self .batches .into_iter() - .map(|batch| batch.df_recordbatch) + .map(|batch| batch.into_df_record_batch()) .collect(); // unwrap safety: `MemoryStream::try_new` won't fail Box::pin( @@ -242,7 +244,7 @@ mod tests { | 1 | hello | | 2 | world | +---+-------+"; - assert_eq!(batches.pretty_print(), expected); + assert_eq!(batches.pretty_print().unwrap(), expected); assert_eq!(schema1, batches.schema()); assert_eq!(vec![batch1], batches.take()); diff --git a/src/common/recordbatch/src/recordbatch.rs b/src/common/recordbatch/src/recordbatch.rs index 5fc886f8b9d1..6b24a9c5a9fc 100644 --- a/src/common/recordbatch/src/recordbatch.rs +++ b/src/common/recordbatch/src/recordbatch.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -use datafusion_common::record_batch::RecordBatch as DfRecordBatch; -use datatypes::arrow_array::arrow_array_get; use datatypes::schema::SchemaRef; use datatypes::value::Value; use datatypes::vectors::{Helper, VectorRef}; @@ -22,32 +20,88 @@ use serde::{Serialize, Serializer}; use snafu::ResultExt; use crate::error::{self, Result}; +use crate::DfRecordBatch; -// TODO(yingwen): We should hold vectors in the RecordBatch. +/// A two-dimensional batch of column-oriented data with a defined schema. #[derive(Clone, Debug, PartialEq)] pub struct RecordBatch { pub schema: SchemaRef, - pub df_recordbatch: DfRecordBatch, + columns: Vec<VectorRef>, + df_record_batch: DfRecordBatch, } impl RecordBatch { + /// Create a new [`RecordBatch`] from `schema` and `columns`. 
pub fn new<I: IntoIterator<Item = VectorRef>>( schema: SchemaRef, columns: I, ) -> Result<RecordBatch> { - let arrow_arrays = columns.into_iter().map(|v| v.to_arrow_array()).collect(); + let columns: Vec<_> = columns.into_iter().collect(); + let arrow_arrays = columns.iter().map(|v| v.to_arrow_array()).collect(); - let df_recordbatch = DfRecordBatch::try_new(schema.arrow_schema().clone(), arrow_arrays) + let df_record_batch = DfRecordBatch::try_new(schema.arrow_schema().clone(), arrow_arrays) .context(error::NewDfRecordBatchSnafu)?; Ok(RecordBatch { schema, - df_recordbatch, + columns, + df_record_batch, }) } + /// Create a new [`RecordBatch`] from `schema` and `df_record_batch`. + /// + /// This method doesn't check the schema. + pub fn try_from_df_record_batch( + schema: SchemaRef, + df_record_batch: DfRecordBatch, + ) -> Result<RecordBatch> { + let columns = df_record_batch + .columns() + .iter() + .map(|c| Helper::try_into_vector(c.clone()).context(error::DataTypesSnafu)) + .collect::<Result<Vec<_>>>()?; + + Ok(RecordBatch { + schema, + columns, + df_record_batch, + }) + } + + #[inline] + pub fn df_record_batch(&self) -> &DfRecordBatch { + &self.df_record_batch + } + + #[inline] + pub fn into_df_record_batch(self) -> DfRecordBatch { + self.df_record_batch + } + + #[inline] + pub fn columns(&self) -> &[VectorRef] { + &self.columns + } + + #[inline] + pub fn column(&self, idx: usize) -> &VectorRef { + &self.columns[idx] + } + + pub fn column_by_name(&self, name: &str) -> Option<&VectorRef> { + let idx = self.schema.column_index_by_name(name)?; + Some(&self.columns[idx]) + } + + #[inline] + pub fn num_columns(&self) -> usize { + self.columns.len() + } + + #[inline] pub fn num_rows(&self) -> usize { - self.df_recordbatch.num_rows() + self.df_record_batch.num_rows() } /// Create an iterator to traverse the data by row @@ -61,14 +115,15 @@ impl Serialize for RecordBatch { where S: Serializer, { + // TODO(yingwen): arrow and arrow2's schemas have different fields, so + // it might be better to use our `RawSchema` as serialized field. let mut s = serializer.serialize_struct("record", 2)?; - s.serialize_field("schema", &self.schema.arrow_schema())?; + s.serialize_field("schema", &**self.schema.arrow_schema())?; - let df_columns = self.df_recordbatch.columns(); - - let vec = df_columns + let vec = self + .columns .iter() - .map(|c| Helper::try_into_vector(c.clone())?.serialize_to_json()) + .map(|c| c.serialize_to_json()) .collect::<Result<Vec<_>, _>>() .map_err(S::Error::custom)?; @@ -88,8 +143,8 @@ impl<'a> RecordBatchRowIterator<'a> { fn new(record_batch: &'a RecordBatch) -> RecordBatchRowIterator { RecordBatchRowIterator { record_batch, - rows: record_batch.df_recordbatch.num_rows(), - columns: record_batch.df_recordbatch.num_columns(), + rows: record_batch.df_record_batch.num_rows(), + columns: record_batch.df_record_batch.num_columns(), row_cursor: 0, } } @@ -104,15 +159,9 @@ impl<'a> Iterator for RecordBatchRowIterator<'a> { } else { let mut row = Vec::with_capacity(self.columns); - // TODO(yingwen): Get from the vector if RecordBatch also holds vectors. 
for col in 0..self.columns { - let column_array = self.record_batch.df_recordbatch.column(col); - match arrow_array_get(column_array.as_ref(), self.row_cursor) - .context(error::DataTypesSnafu) - { - Ok(field) => row.push(field), - Err(e) => return Some(Err(e.into())), - } + let column = self.record_batch.column(col); + row.push(column.get(self.row_cursor)); } self.row_cursor += 1; @@ -125,63 +174,60 @@ mod tests { use std::sync::Arc; - use datafusion_common::field_util::SchemaExt; - use datafusion_common::record_batch::RecordBatch as DfRecordBatch; - use datatypes::arrow::array::UInt32Array; use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; - use datatypes::prelude::*; + use datatypes::data_type::ConcreteDataType; use datatypes::schema::{ColumnSchema, Schema}; - use datatypes::vectors::{StringVector, UInt32Vector, Vector}; + use datatypes::vectors::{StringVector, UInt32Vector}; use super::*; #[test] - fn test_new_record_batch() { + fn test_record_batch() { let arrow_schema = Arc::new(ArrowSchema::new(vec![ Field::new("c1", DataType::UInt32, false), Field::new("c2", DataType::UInt32, false), ])); let schema = Arc::new(Schema::try_from(arrow_schema).unwrap()); - let v = Arc::new(UInt32Vector::from_slice(&[1, 2, 3])); - let columns: Vec<VectorRef> = vec![v.clone(), v.clone()]; - - let batch = RecordBatch::new(schema.clone(), columns).unwrap(); - let expect = v.to_arrow_array(); - for column in batch.df_recordbatch.columns() { - let array = column.as_any().downcast_ref::<UInt32Array>().unwrap(); - assert_eq!( - expect.as_any().downcast_ref::<UInt32Array>().unwrap(), - array - ); + let c1 = Arc::new(UInt32Vector::from_slice(&[1, 2, 3])); + let c2 = Arc::new(UInt32Vector::from_slice(&[4, 5, 6])); + let columns: Vec<VectorRef> = vec![c1, c2]; + + let batch = RecordBatch::new(schema.clone(), columns.clone()).unwrap(); + assert_eq!(3, batch.num_rows()); + assert_eq!(&columns, batch.columns()); + for (i, expect) in columns.iter().enumerate().take(batch.num_columns()) { + let column = batch.column(i); + assert_eq!(expect, column); } assert_eq!(schema, batch.schema); + + assert_eq!(columns[0], *batch.column_by_name("c1").unwrap()); + assert_eq!(columns[1], *batch.column_by_name("c2").unwrap()); + assert!(batch.column_by_name("c3").is_none()); + + let converted = + RecordBatch::try_from_df_record_batch(schema, batch.df_record_batch().clone()).unwrap(); + assert_eq!(batch, converted); + assert_eq!(*batch.df_record_batch(), converted.into_df_record_batch()); } #[test] pub fn test_serialize_recordbatch() { - let arrow_schema = Arc::new(ArrowSchema::new(vec![Field::new( + let column_schemas = vec![ColumnSchema::new( "number", - DataType::UInt32, + ConcreteDataType::uint32_datatype(), false, - )])); - let schema = Arc::new(Schema::try_from(arrow_schema.clone()).unwrap()); + )]; + let schema = Arc::new(Schema::try_new(column_schemas).unwrap()); let numbers: Vec<u32> = (0..10).collect(); - let df_batch = DfRecordBatch::try_new( - arrow_schema, - vec![Arc::new(UInt32Array::from_slice(&numbers))], - ) - .unwrap(); - - let batch = RecordBatch { - schema, - df_recordbatch: df_batch, - }; + let columns = vec![Arc::new(UInt32Vector::from_slice(&numbers)) as VectorRef]; + let batch = RecordBatch::new(schema, columns).unwrap(); let output = serde_json::to_string(&batch).unwrap(); assert_eq!( - r#"{"schema":{"fields":[{"name":"number","data_type":"UInt32","is_nullable":false,"metadata":{}}],"metadata":{}},"columns":[[0,1,2,3,4,5,6,7,8,9]]}"#, + 
r#"{"schema":{"fields":[{"name":"number","data_type":"UInt32","nullable":false,"dict_id":0,"dict_is_ordered":false}],"metadata":{"greptime:version":"0"}},"columns":[[0,1,2,3,4,5,6,7,8,9]]}"#, output ); } diff --git a/src/common/recordbatch/src/util.rs b/src/common/recordbatch/src/util.rs index efe34dbfed44..4b2f1a67c84d 100644 --- a/src/common/recordbatch/src/util.rs +++ b/src/common/recordbatch/src/util.rs @@ -15,23 +15,29 @@ use futures::TryStreamExt; use crate::error::Result; -use crate::{RecordBatch, SendableRecordBatchStream}; +use crate::{RecordBatch, RecordBatches, SendableRecordBatchStream}; +/// Collect all the items from the stream into a vector of [`RecordBatch`]. pub async fn collect(stream: SendableRecordBatchStream) -> Result> { stream.try_collect::>().await } +/// Collect all the items from the stream into [RecordBatches]. +pub async fn collect_batches(stream: SendableRecordBatchStream) -> Result { + let schema = stream.schema(); + let batches = stream.try_collect::>().await?; + RecordBatches::try_new(schema, batches) +} + #[cfg(test)] mod tests { use std::mem; use std::pin::Pin; use std::sync::Arc; - use datafusion_common::field_util::SchemaExt; - use datafusion_common::record_batch::RecordBatch as DfRecordBatch; - use datatypes::arrow::array::UInt32Array; - use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; - use datatypes::schema::{Schema, SchemaRef}; + use datatypes::prelude::*; + use datatypes::schema::{ColumnSchema, Schema, SchemaRef}; + use datatypes::vectors::UInt32Vector; use futures::task::{Context, Poll}; use futures::Stream; @@ -65,12 +71,13 @@ mod tests { #[tokio::test] async fn test_collect() { - let arrow_schema = Arc::new(ArrowSchema::new(vec![Field::new( + let column_schemas = vec![ColumnSchema::new( "number", - DataType::UInt32, + ConcreteDataType::uint32_datatype(), false, - )])); - let schema = Arc::new(Schema::try_from(arrow_schema.clone()).unwrap()); + )]; + + let schema = Arc::new(Schema::try_new(column_schemas).unwrap()); let stream = MockRecordBatchStream { schema: schema.clone(), @@ -81,24 +88,23 @@ mod tests { assert_eq!(0, batches.len()); let numbers: Vec = (0..10).collect(); - let df_batch = DfRecordBatch::try_new( - arrow_schema.clone(), - vec![Arc::new(UInt32Array::from_slice(&numbers))], - ) - .unwrap(); - - let batch = RecordBatch { - schema: schema.clone(), - df_recordbatch: df_batch, - }; + let columns = [Arc::new(UInt32Vector::from_vec(numbers)) as _]; + let batch = RecordBatch::new(schema.clone(), columns).unwrap(); let stream = MockRecordBatchStream { - schema: Arc::new(Schema::try_from(arrow_schema).unwrap()), + schema: schema.clone(), batch: Some(batch.clone()), }; let batches = collect(Box::pin(stream)).await.unwrap(); assert_eq!(1, batches.len()); - assert_eq!(batch, batches[0]); + + let stream = MockRecordBatchStream { + schema: schema.clone(), + batch: Some(batch.clone()), + }; + let batches = collect_batches(Box::pin(stream)).await.unwrap(); + let expect_batches = RecordBatches::try_new(schema.clone(), vec![batch]).unwrap(); + assert_eq!(expect_batches, batches); } } diff --git a/src/common/substrait/Cargo.toml b/src/common/substrait/Cargo.toml index 9f9aea0b5e48..815a986d1e93 100644 --- a/src/common/substrait/Cargo.toml +++ b/src/common/substrait/Cargo.toml @@ -10,10 +10,8 @@ catalog = { path = "../../catalog" } common-catalog = { path = "../catalog" } common-error = { path = "../error" } common-telemetry = { path = "../telemetry" } -datafusion = { git = "https://github.com/apache/arrow-datafusion.git", 
branch = "arrow2", features = [ - "simd", -] } -datafusion-expr = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" } +datafusion = "14.0.0" +datafusion-expr = "14.0.0" datatypes = { path = "../../datatypes" } futures = "0.3" prost = "0.9" diff --git a/src/common/substrait/src/context.rs b/src/common/substrait/src/context.rs index b017e9cc9aa9..af4a07b788d9 100644 --- a/src/common/substrait/src/context.rs +++ b/src/common/substrait/src/context.rs @@ -14,7 +14,7 @@ use std::collections::HashMap; -use datafusion::logical_plan::DFSchemaRef; +use datafusion::common::DFSchemaRef; use substrait_proto::protobuf::extensions::simple_extension_declaration::{ ExtensionFunction, MappingType, }; diff --git a/src/common/substrait/src/df_expr.rs b/src/common/substrait/src/df_expr.rs index d924e7b08508..b8d77a113c7c 100644 --- a/src/common/substrait/src/df_expr.rs +++ b/src/common/substrait/src/df_expr.rs @@ -15,8 +15,8 @@ use std::collections::VecDeque; use std::str::FromStr; -use datafusion::logical_plan::{Column, Expr}; -use datafusion_expr::{expr_fn, lit, BuiltinScalarFunction, Operator}; +use datafusion::common::Column; +use datafusion_expr::{expr_fn, lit, Between, BinaryExpr, BuiltinScalarFunction, Expr, Operator}; use datatypes::schema::Schema; use snafu::{ensure, OptionExt}; use substrait_proto::protobuf::expression::field_reference::ReferenceType as FieldReferenceType; @@ -311,21 +311,21 @@ pub fn convert_scalar_function( // skip GetIndexedField, unimplemented. "between" => { ensure_arg_len(3)?; - Expr::Between { + Expr::Between(Between { expr: Box::new(inputs.pop_front().unwrap()), negated: false, low: Box::new(inputs.pop_front().unwrap()), high: Box::new(inputs.pop_front().unwrap()), - } + }) } "not_between" => { ensure_arg_len(3)?; - Expr::Between { + Expr::Between(Between { expr: Box::new(inputs.pop_front().unwrap()), negated: true, low: Box::new(inputs.pop_front().unwrap()), high: Box::new(inputs.pop_front().unwrap()), - } + }) } // skip Case, is covered in substrait::SwitchExpression. // skip Cast and TryCast, is covered in substrait::Cast. @@ -477,7 +477,7 @@ pub fn expression_from_df_expr( rex_type: Some(RexType::Literal(l)), } } - Expr::BinaryExpr { left, op, right } => { + Expr::BinaryExpr(BinaryExpr { left, op, right }) => { let left = expression_from_df_expr(ctx, left, schema)?; let right = expression_from_df_expr(ctx, right, schema)?; let arguments = utils::expression_to_argument(vec![left, right]); @@ -518,12 +518,12 @@ pub fn expression_from_df_expr( name: expr.to_string(), } .fail()?, - Expr::Between { + Expr::Between(Between { expr, negated, low, high, - } => { + }) => { let expr = expression_from_df_expr(ctx, expr, schema)?; let low = expression_from_df_expr(ctx, low, schema)?; let high = expression_from_df_expr(ctx, high, schema)?; @@ -564,7 +564,21 @@ pub fn expression_from_df_expr( | Expr::WindowFunction { .. } | Expr::AggregateUDF { .. } | Expr::InList { .. } - | Expr::Wildcard => UnsupportedExprSnafu { + | Expr::Wildcard + | Expr::Like(_) + | Expr::ILike(_) + | Expr::SimilarTo(_) + | Expr::IsTrue(_) + | Expr::IsFalse(_) + | Expr::IsUnknown(_) + | Expr::IsNotTrue(_) + | Expr::IsNotFalse(_) + | Expr::IsNotUnknown(_) + | Expr::Exists { .. } + | Expr::InSubquery { .. } + | Expr::ScalarSubquery(..) + | Expr::QualifiedWildcard { .. 
} => todo!(), + Expr::GroupingSet(_) => UnsupportedExprSnafu { name: expr.to_string(), } .fail()?, @@ -628,6 +642,10 @@ mod utils { Operator::RegexNotIMatch => "regex_not_i_match", Operator::BitwiseAnd => "bitwise_and", Operator::BitwiseOr => "bitwise_or", + Operator::BitwiseXor => "bitwise_xor", + Operator::BitwiseShiftRight => "bitwise_shift_right", + Operator::BitwiseShiftLeft => "bitwise_shift_left", + Operator::StringConcat => "string_concat", } } @@ -679,7 +697,6 @@ mod utils { BuiltinScalarFunction::Sqrt => "sqrt", BuiltinScalarFunction::Tan => "tan", BuiltinScalarFunction::Trunc => "trunc", - BuiltinScalarFunction::Array => "make_array", BuiltinScalarFunction::Ascii => "ascii", BuiltinScalarFunction::BitLength => "bit_length", BuiltinScalarFunction::Btrim => "btrim", @@ -723,6 +740,17 @@ mod utils { BuiltinScalarFunction::Trim => "trim", BuiltinScalarFunction::Upper => "upper", BuiltinScalarFunction::RegexpMatch => "regexp_match", + BuiltinScalarFunction::Atan2 => "atan2", + BuiltinScalarFunction::Coalesce => "coalesce", + BuiltinScalarFunction::Power => "power", + BuiltinScalarFunction::MakeArray => "make_array", + BuiltinScalarFunction::DateBin => "date_bin", + BuiltinScalarFunction::FromUnixtime => "from_unixtime", + BuiltinScalarFunction::CurrentDate => "current_date", + BuiltinScalarFunction::CurrentTime => "current_time", + BuiltinScalarFunction::Uuid => "uuid", + BuiltinScalarFunction::Struct => "struct", + BuiltinScalarFunction::ArrowTypeof => "arrow_type_of", } } } diff --git a/src/common/substrait/src/df_logical.rs b/src/common/substrait/src/df_logical.rs index 81909cf38d26..a6a81fb6f52a 100644 --- a/src/common/substrait/src/df_logical.rs +++ b/src/common/substrait/src/df_logical.rs @@ -19,10 +19,10 @@ use catalog::CatalogManagerRef; use common_error::prelude::BoxedError; use common_telemetry::debug; use datafusion::arrow::datatypes::SchemaRef as ArrowSchemaRef; -use datafusion::datasource::TableProvider; -use datafusion::logical_plan::plan::Filter; -use datafusion::logical_plan::{LogicalPlan, TableScan, ToDFSchema}; +use datafusion::common::ToDFSchema; +use datafusion::datasource::DefaultTableSource; use datafusion::physical_plan::project_schema; +use datafusion_expr::{Filter, LogicalPlan, TableScan, TableSource}; use prost::Message; use snafu::{ensure, OptionExt, ResultExt}; use substrait_proto::protobuf::expression::mask_expression::{StructItem, StructSelect}; @@ -144,7 +144,7 @@ impl DFLogicalSubstraitConvertor { .context(error::ConvertDfSchemaSnafu)?; let predicate = to_df_expr(ctx, *condition, &schema)?; - LogicalPlan::Filter(Filter { predicate, input }) + LogicalPlan::Filter(Filter::try_new(predicate, input).context(DFInternalSnafu)?) } RelType::Fetch(_fetch_rel) => UnsupportedPlanSnafu { name: "Fetch Relation", @@ -238,7 +238,9 @@ impl DFLogicalSubstraitConvertor { .context(TableNotFoundSnafu { name: format!("{}.{}.{}", catalog_name, schema_name, table_name), })?; - let adapter = Arc::new(DfTableProviderAdapter::new(table_ref)); + let adapter = Arc::new(DefaultTableSource::new(Arc::new( + DfTableProviderAdapter::new(table_ref), + ))); // Get schema directly from the table, and compare it with the schema retrieved from substrait proto. 
let stored_schema = adapter.schema(); @@ -267,14 +269,14 @@ impl DFLogicalSubstraitConvertor { ctx.set_df_schema(projected_schema.clone()); - // TODO(ruihang): Support limit + // TODO(ruihang): Support limit(fetch) Ok(LogicalPlan::TableScan(TableScan { table_name: format!("{}.{}.{}", catalog_name, schema_name, table_name), source: adapter, projection, projected_schema, filters, - limit: None, + fetch: None, })) } @@ -302,7 +304,7 @@ impl DFLogicalSubstraitConvertor { .fail()?, LogicalPlan::Filter(filter) => { let input = Some(Box::new( - self.logical_plan_to_rel(ctx, filter.input.clone())?, + self.logical_plan_to_rel(ctx, filter.input().clone())?, )); let schema = plan @@ -312,7 +314,7 @@ impl DFLogicalSubstraitConvertor { .context(error::ConvertDfSchemaSnafu)?; let condition = Some(Box::new(expression_from_df_expr( ctx, - &filter.predicate, + filter.predicate(), &schema, )?)); @@ -368,7 +370,16 @@ impl DFLogicalSubstraitConvertor { name: "DataFusion Logical Limit", } .fail()?, - LogicalPlan::CreateExternalTable(_) + + LogicalPlan::Subquery(_) + | LogicalPlan::SubqueryAlias(_) + | LogicalPlan::CreateView(_) + | LogicalPlan::CreateCatalogSchema(_) + | LogicalPlan::CreateCatalog(_) + | LogicalPlan::DropView(_) + | LogicalPlan::Distinct(_) + | LogicalPlan::SetVariable(_) + | LogicalPlan::CreateExternalTable(_) | LogicalPlan::CreateMemoryTable(_) | LogicalPlan::DropTable(_) | LogicalPlan::Values(_) @@ -414,6 +425,10 @@ impl DFLogicalSubstraitConvertor { let provider = table_scan .source .as_any() + .downcast_ref::<DefaultTableSource>() + .context(UnknownPlanSnafu)? + .table_provider + .as_any() .downcast_ref::<DfTableProviderAdapter>() .context(UnknownPlanSnafu)?; let table_info = provider.table().table_info(); @@ -485,7 +500,9 @@ impl DFLogicalSubstraitConvertor { fn same_schema_without_metadata(lhs: &ArrowSchemaRef, rhs: &ArrowSchemaRef) -> bool { lhs.fields.len() == rhs.fields.len() && lhs.fields.iter().zip(rhs.fields.iter()).all(|(x, y)| { - x.name == y.name && x.data_type == y.data_type && x.is_nullable == y.is_nullable + x.name() == y.name() + && x.data_type() == y.data_type() + && x.is_nullable() == y.is_nullable() }) } @@ -494,7 +511,7 @@ mod test { use catalog::local::{LocalCatalogManager, MemoryCatalogProvider, MemorySchemaProvider}; use catalog::{CatalogList, CatalogProvider, RegisterTableRequest}; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; - use datafusion::logical_plan::DFSchema; + use datafusion::common::{DFSchema, ToDFSchema}; use datatypes::schema::Schema; use table::requests::CreateTableRequest; use table::test_util::{EmptyTable, MockTableEngine}; @@ -564,7 +581,9 @@ mod test { }) .await .unwrap(); - let adapter = Arc::new(DfTableProviderAdapter::new(table_ref)); + let adapter = Arc::new(DefaultTableSource::new(Arc::new( + DfTableProviderAdapter::new(table_ref), + ))); let projection = vec![1, 3, 5]; let df_schema = adapter.schema().to_dfschema().unwrap(); @@ -584,7 +603,7 @@ mod test { projection: Some(projection), projected_schema, filters: vec![], - limit: None, + fetch: None, }); logical_plan_round_trip(table_scan_plan, catalog_manager).await; diff --git a/src/common/time/src/date.rs b/src/common/time/src/date.rs index 30e452906398..b12eb9f50da7 100644 --- a/src/common/time/src/date.rs +++ b/src/common/time/src/date.rs @@ -55,8 +55,11 @@ impl From<i32> for Date { impl Display for Date { /// [Date] is formatted according to ISO-8601 standard. 
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - let abs_date = NaiveDate::from_num_days_from_ce(UNIX_EPOCH_FROM_CE + self.0); - f.write_str(&abs_date.format("%F").to_string()) + if let Some(abs_date) = NaiveDate::from_num_days_from_ce_opt(UNIX_EPOCH_FROM_CE + self.0) { + write!(f, "{}", abs_date.format("%F")) + } else { + write!(f, "Date({})", self.0) + } } } @@ -95,7 +98,7 @@ mod tests { Date::from_str("1969-01-01").unwrap().to_string() ); - let now = Utc::now().date().format("%F").to_string(); + let now = Utc::now().date_naive().format("%F").to_string(); assert_eq!(now, Date::from_str(&now).unwrap().to_string()); } diff --git a/src/common/time/src/datetime.rs b/src/common/time/src/datetime.rs index 4055a07429c5..73d465babed5 100644 --- a/src/common/time/src/datetime.rs +++ b/src/common/time/src/datetime.rs @@ -31,8 +31,11 @@ pub struct DateTime(i64); impl Display for DateTime { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - let abs_time = NaiveDateTime::from_timestamp(self.0, 0); - write!(f, "{}", abs_time.format(DATETIME_FORMAT)) + if let Some(abs_time) = NaiveDateTime::from_timestamp_opt(self.0, 0) { + write!(f, "{}", abs_time.format(DATETIME_FORMAT)) + } else { + write!(f, "DateTime({})", self.0) + } } } diff --git a/src/common/time/src/timestamp.rs b/src/common/time/src/timestamp.rs index 5ff20f702be3..b3de23d01d70 100644 --- a/src/common/time/src/timestamp.rs +++ b/src/common/time/src/timestamp.rs @@ -14,6 +14,7 @@ use core::default::Default; use std::cmp::Ordering; +use std::fmt::{Display, Formatter}; use std::hash::{Hash, Hasher}; use std::str::FromStr; @@ -34,13 +35,34 @@ impl Timestamp { Self { unit, value } } - pub fn from_millis(value: i64) -> Self { + pub fn new_second(value: i64) -> Self { + Self { + value, + unit: TimeUnit::Second, + } + } + + pub fn new_millisecond(value: i64) -> Self { Self { value, unit: TimeUnit::Millisecond, } } + pub fn new_microsecond(value: i64) -> Self { + Self { + value, + unit: TimeUnit::Microsecond, + } + } + + pub fn new_nanosecond(value: i64) -> Self { + Self { + value, + unit: TimeUnit::Nanosecond, + } + } + pub fn unit(&self) -> TimeUnit { self.unit } @@ -54,6 +76,8 @@ impl Timestamp { self.value * self.unit.factor() / unit.factor() } + /// Format timestamp to ISO8601 string. If the timestamp exceeds what chrono timestamp can + /// represent, this function simply prints the timestamp unit and value in plain string. 
pub fn to_iso8601_string(&self) -> String { let nano_factor = TimeUnit::Second.factor() / TimeUnit::Nanosecond.factor(); @@ -65,8 +89,11 @@ impl Timestamp { nsecs += nano_factor; } - let datetime = Utc.timestamp(secs, nsecs as u32); - format!("{}", datetime.format("%Y-%m-%d %H:%M:%S%.f%z")) + if let LocalResult::Single(datetime) = Utc.timestamp_opt(secs, nsecs as u32) { + format!("{}", datetime.format("%Y-%m-%d %H:%M:%S%.f%z")) + } else { + format!("[Timestamp{}: {}]", self.unit, self.value) + } } } @@ -168,6 +195,25 @@ pub enum TimeUnit { Nanosecond, } +impl Display for TimeUnit { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + TimeUnit::Second => { + write!(f, "Second") + } + TimeUnit::Millisecond => { + write!(f, "Millisecond") + } + TimeUnit::Microsecond => { + write!(f, "Microsecond") + } + TimeUnit::Nanosecond => { + write!(f, "Nanosecond") + } + } + } +} + impl TimeUnit { pub fn factor(&self) -> i64 { match self { @@ -249,10 +295,11 @@ mod tests { // but expected timestamp is in UTC timezone fn check_from_str(s: &str, expect: &str) { let ts = Timestamp::from_str(s).unwrap(); - let time = NaiveDateTime::from_timestamp( + let time = NaiveDateTime::from_timestamp_opt( ts.value / 1_000_000_000, (ts.value % 1_000_000_000) as u32, - ); + ) + .unwrap(); assert_eq!(expect, time.to_string()); } @@ -265,7 +312,13 @@ mod tests { check_from_str( "2020-09-08 13:42:29", &NaiveDateTime::from_timestamp_opt( - 1599572549 - Local.timestamp(0, 0).offset().fix().local_minus_utc() as i64, + 1599572549 + - Local + .timestamp_opt(0, 0) + .unwrap() + .offset() + .fix() + .local_minus_utc() as i64, 0, ) .unwrap() @@ -275,7 +328,13 @@ mod tests { check_from_str( "2020-09-08T13:42:29", &NaiveDateTime::from_timestamp_opt( - 1599572549 - Local.timestamp(0, 0).offset().fix().local_minus_utc() as i64, + 1599572549 + - Local + .timestamp_opt(0, 0) + .unwrap() + .offset() + .fix() + .local_minus_utc() as i64, 0, ) .unwrap() @@ -285,7 +344,13 @@ mod tests { check_from_str( "2020-09-08 13:42:29.042", &NaiveDateTime::from_timestamp_opt( - 1599572549 - Local.timestamp(0, 0).offset().fix().local_minus_utc() as i64, + 1599572549 + - Local + .timestamp_opt(0, 0) + .unwrap() + .offset() + .fix() + .local_minus_utc() as i64, 42000000, ) .unwrap() @@ -296,7 +361,13 @@ mod tests { check_from_str( "2020-09-08T13:42:29.042", &NaiveDateTime::from_timestamp_opt( - 1599572549 - Local.timestamp(0, 0).offset().fix().local_minus_utc() as i64, + 1599572549 + - Local + .timestamp_opt(0, 0) + .unwrap() + .offset() + .fix() + .local_minus_utc() as i64, 42000000, ) .unwrap() @@ -316,19 +387,19 @@ mod tests { assert_eq!(datetime_str, ts.to_iso8601_string()); let ts_millis = 1668070237000; - let ts = Timestamp::from_millis(ts_millis); + let ts = Timestamp::new_millisecond(ts_millis); assert_eq!("2022-11-10 08:50:37+0000", ts.to_iso8601_string()); let ts_millis = -1000; - let ts = Timestamp::from_millis(ts_millis); + let ts = Timestamp::new_millisecond(ts_millis); assert_eq!("1969-12-31 23:59:59+0000", ts.to_iso8601_string()); let ts_millis = -1; - let ts = Timestamp::from_millis(ts_millis); + let ts = Timestamp::new_millisecond(ts_millis); assert_eq!("1969-12-31 23:59:59.999+0000", ts.to_iso8601_string()); let ts_millis = -1001; - let ts = Timestamp::from_millis(ts_millis); + let ts = Timestamp::new_millisecond(ts_millis); assert_eq!("1969-12-31 23:59:58.999+0000", ts.to_iso8601_string()); } diff --git a/src/common/time/src/util.rs b/src/common/time/src/util.rs index 3d3baebc2ee6..1917ce3456a2 100644 --- 
a/src/common/time/src/util.rs +++ b/src/common/time/src/util.rs @@ -33,8 +33,8 @@ mod tests { .duration_since(time::UNIX_EPOCH) .unwrap() .as_millis() as i64; - let datetime_now = chrono::Utc.timestamp_millis(now); - let datetime_std = chrono::Utc.timestamp_millis(millis_from_std); + let datetime_now = chrono::Utc.timestamp_millis_opt(now).unwrap(); + let datetime_std = chrono::Utc.timestamp_millis_opt(millis_from_std).unwrap(); assert_eq!(datetime_std.year(), datetime_now.year()); assert_eq!(datetime_std.month(), datetime_now.month()); diff --git a/src/datanode/Cargo.toml b/src/datanode/Cargo.toml index 47f34d218625..a245340ad87e 100644 --- a/src/datanode/Cargo.toml +++ b/src/datanode/Cargo.toml @@ -25,9 +25,7 @@ common-recordbatch = { path = "../common/recordbatch" } common-runtime = { path = "../common/runtime" } common-telemetry = { path = "../common/telemetry" } common-time = { path = "../common/time" } -datafusion = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2", features = [ - "simd", -] } +datafusion = "14.0.0" datatypes = { path = "../datatypes" } futures = "0.3" hyper = { version = "0.14", features = ["full"] } @@ -59,8 +57,5 @@ tower-http = { version = "0.3", features = ["full"] } axum-test-helper = { git = "https://github.com/sunng87/axum-test-helper.git", branch = "patch-1" } client = { path = "../client" } common-query = { path = "../common/query" } -datafusion = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2", features = [ - "simd", -] } -datafusion-common = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" } +datafusion-common = "14.0.0" tempdir = "0.3" diff --git a/src/datanode/src/server/grpc.rs b/src/datanode/src/server/grpc.rs index 26108eb02004..5109522541c9 100644 --- a/src/datanode/src/server/grpc.rs +++ b/src/datanode/src/server/grpc.rs @@ -260,7 +260,7 @@ mod tests { }, ColumnDef { name: "ts".to_string(), - datatype: ColumnDataType::Timestamp as i32, + datatype: ColumnDataType::TimestampMillisecond as i32, is_nullable: false, default_constraint: None, }, @@ -295,8 +295,12 @@ mod tests { fn expected_table_schema() -> SchemaRef { let column_schemas = vec![ ColumnSchema::new("host", ConcreteDataType::string_datatype(), false), - ColumnSchema::new("ts", ConcreteDataType::timestamp_millis_datatype(), false) - .with_time_index(true), + ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ) + .with_time_index(true), ColumnSchema::new("cpu", ConcreteDataType::float32_datatype(), true), ColumnSchema::new("memory", ConcreteDataType::float64_datatype(), true), ]; diff --git a/src/datanode/src/sql.rs b/src/datanode/src/sql.rs index 0a3b4a999e70..e578bec1e9f4 100644 --- a/src/datanode/src/sql.rs +++ b/src/datanode/src/sql.rs @@ -154,8 +154,12 @@ mod tests { ColumnSchema::new("host", ConcreteDataType::string_datatype(), false), ColumnSchema::new("cpu", ConcreteDataType::float64_datatype(), true), ColumnSchema::new("memory", ConcreteDataType::float64_datatype(), true), - ColumnSchema::new("ts", ConcreteDataType::timestamp_millis_datatype(), true) - .with_time_index(true), + ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + true, + ) + .with_time_index(true), ]; Arc::new( @@ -284,11 +288,11 @@ mod tests { let ts = &columns_values["ts"]; assert_eq!(2, ts.len()); assert_eq!( - Value::from(Timestamp::from_millis(1655276557000i64)), + Value::from(Timestamp::new_millisecond(1655276557000i64)), ts.get(0) ); assert_eq!( - 
Value::from(Timestamp::from_millis(1655276558000i64)), + Value::from(Timestamp::new_millisecond(1655276558000i64)), ts.get(1) ); } diff --git a/src/datanode/src/sql/create.rs b/src/datanode/src/sql/create.rs index 8b75bdef3f74..ac80338aa860 100644 --- a/src/datanode/src/sql/create.rs +++ b/src/datanode/src/sql/create.rs @@ -375,7 +375,7 @@ mod tests { .data_type ); assert_eq!( - ConcreteDataType::timestamp_millis_datatype(), + ConcreteDataType::timestamp_millisecond_datatype(), request .schema .column_schema_by_name("ts") diff --git a/src/datanode/src/sql/insert.rs b/src/datanode/src/sql/insert.rs index 8c2dae5c4a6e..6c99b7172921 100644 --- a/src/datanode/src/sql/insert.rs +++ b/src/datanode/src/sql/insert.rs @@ -14,7 +14,9 @@ use catalog::CatalogManagerRef; use common_query::Output; -use datatypes::prelude::{ConcreteDataType, VectorBuilder}; +use datatypes::data_type::DataType; +use datatypes::prelude::ConcreteDataType; +use datatypes::vectors::MutableVector; use snafu::{ensure, OptionExt, ResultExt}; use sql::ast::Value as SqlValue; use sql::statements::insert::Insert; @@ -70,7 +72,7 @@ impl SqlHandler { }; let rows_num = values.len(); - let mut columns_builders: Vec<(&String, &ConcreteDataType, VectorBuilder)> = + let mut columns_builders: Vec<(&String, &ConcreteDataType, Box)> = Vec::with_capacity(columns_num); if columns.is_empty() { @@ -79,7 +81,7 @@ impl SqlHandler { columns_builders.push(( &column_schema.name, data_type, - VectorBuilder::with_capacity(data_type.clone(), rows_num), + data_type.create_mutable_vector(rows_num), )); } } else { @@ -95,7 +97,7 @@ impl SqlHandler { columns_builders.push(( column_name, data_type, - VectorBuilder::with_capacity(data_type.clone(), rows_num), + data_type.create_mutable_vector(rows_num), )); } } @@ -123,7 +125,7 @@ impl SqlHandler { table_name: table_ref.table.to_string(), columns_values: columns_builders .into_iter() - .map(|(c, _, mut b)| (c.to_owned(), b.finish())) + .map(|(c, _, mut b)| (c.to_owned(), b.to_vector())) .collect(), })) } @@ -133,11 +135,11 @@ fn add_row_to_vector( column_name: &str, data_type: &ConcreteDataType, sql_val: &SqlValue, - builder: &mut VectorBuilder, + builder: &mut Box, ) -> Result<()> { let value = statements::sql_value_to_value(column_name, data_type, sql_val) .context(ParseSqlValueSnafu)?; - builder.push(&value); + builder.push_value_ref(value.as_value_ref()).unwrap(); Ok(()) } diff --git a/src/datanode/src/tests/instance_test.rs b/src/datanode/src/tests/instance_test.rs index 1b01d05eae84..26ba03da73ec 100644 --- a/src/datanode/src/tests/instance_test.rs +++ b/src/datanode/src/tests/instance_test.rs @@ -17,11 +17,8 @@ use std::sync::Arc; use common_catalog::consts::DEFAULT_SCHEMA_NAME; use common_query::Output; use common_recordbatch::util; -use datafusion::arrow_print; -use datafusion_common::record_batch::RecordBatch as DfRecordBatch; -use datatypes::arrow::array::{Int64Array, UInt64Array, Utf8Array}; -use datatypes::arrow_array::StringArray; -use datatypes::prelude::ConcreteDataType; +use datatypes::data_type::ConcreteDataType; +use datatypes::vectors::{Int64Vector, StringVector, UInt64Vector, VectorRef}; use session::context::QueryContext; use crate::instance::Instance; @@ -66,11 +63,13 @@ async fn test_create_database_and_insert_query() { match query_output { Output::Stream(s) => { let batches = util::collect(s).await.unwrap(); - let columns = batches[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); + assert_eq!(1, batches[0].num_columns()); assert_eq!( - 
&Int64Array::from_slice(&[1655276557000, 1655276558000]), - columns[0].as_any().downcast_ref::<Int64Array>().unwrap() + Arc::new(Int64Vector::from_vec(vec![ + 1655276557000_i64, + 1655276558000_i64 + ])) as VectorRef, + *batches[0].column(0) ); } _ => unreachable!(), @@ -155,18 +154,14 @@ async fn assert_query_result(instance: &Instance, sql: &str, ts: i64, host: &str match query_output { Output::Stream(s) => { let batches = util::collect(s).await.unwrap(); - let columns = batches[0].df_recordbatch.columns(); - assert_eq!(2, columns.len()); + assert_eq!(2, batches[0].num_columns()); assert_eq!( - &Utf8Array::<i32>::from_slice(&[host]), - columns[0] - .as_any() - .downcast_ref::<Utf8Array<i32>>() - .unwrap() + Arc::new(StringVector::from(vec![host])) as VectorRef, + *batches[0].column(0) ); assert_eq!( - &Int64Array::from_slice(&[ts]), - columns[1].as_any().downcast_ref::<Int64Array>().unwrap() + Arc::new(Int64Vector::from_vec(vec![ts])) as VectorRef, + *batches[0].column(1) ); } _ => unreachable!(), @@ -183,7 +179,7 @@ async fn setup_test_instance(test_name: &str) -> Instance { test_util::create_test_table( instance.catalog_manager(), instance.sql_handler(), - ConcreteDataType::timestamp_millis_datatype(), + ConcreteDataType::timestamp_millisecond_datatype(), ) .await .unwrap(); @@ -235,11 +231,13 @@ async fn test_execute_insert_query_with_i64_timestamp() { match query_output { Output::Stream(s) => { let batches = util::collect(s).await.unwrap(); - let columns = batches[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); + assert_eq!(1, batches[0].num_columns()); assert_eq!( - &Int64Array::from_slice(&[1655276557000, 1655276558000]), - columns[0].as_any().downcast_ref::<Int64Array>().unwrap() + Arc::new(Int64Vector::from_vec(vec![ + 1655276557000_i64, + 1655276558000_i64 + ])) as VectorRef, + *batches[0].column(0) ); } _ => unreachable!(), @@ -249,11 +247,13 @@ match query_output { Output::Stream(s) => { let batches = util::collect(s).await.unwrap(); - let columns = batches[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); + assert_eq!(1, batches[0].num_columns()); assert_eq!( - &Int64Array::from_slice(&[1655276557000, 1655276558000]), - columns[0].as_any().downcast_ref::<Int64Array>().unwrap() + Arc::new(Int64Vector::from_vec(vec![ + 1655276557000_i64, + 1655276558000_i64 + ])) as VectorRef, + *batches[0].column(0) ); } _ => unreachable!(), @@ -270,13 +270,12 @@ async fn test_execute_query() { match output { Output::Stream(recordbatch) => { let numbers = util::collect(recordbatch).await.unwrap(); - let columns = numbers[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 1); + assert_eq!(1, numbers[0].num_columns()); + assert_eq!(numbers[0].column(0).len(), 1); assert_eq!( - *columns[0].as_any().downcast_ref::<UInt64Array>().unwrap(), - UInt64Array::from_slice(&[4950]) + Arc::new(UInt64Vector::from_vec(vec![4950_u64])) as VectorRef, + *numbers[0].column(0), ); } _ => unreachable!(), @@ -294,13 +293,12 @@ async fn test_execute_show_databases_tables() { match output { Output::RecordBatches(databases) => { let databases = databases.take(); - let columns = databases[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 1); + assert_eq!(1, databases[0].num_columns()); + assert_eq!(databases[0].column(0).len(), 1); assert_eq!( - *columns[0].as_any().downcast_ref::<StringArray>().unwrap(), - StringArray::from(vec![Some("public")]) + *databases[0].column(0), + 
Arc::new(StringVector::from(vec![Some("public")])) as VectorRef ); } _ => unreachable!(), @@ -310,13 +308,12 @@ async fn test_execute_show_databases_tables() { match output { Output::RecordBatches(databases) => { let databases = databases.take(); - let columns = databases[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 1); + assert_eq!(1, databases[0].num_columns()); + assert_eq!(databases[0].column(0).len(), 1); assert_eq!( - *columns[0].as_any().downcast_ref::().unwrap(), - StringArray::from(vec![Some("public")]) + *databases[0].column(0), + Arc::new(StringVector::from(vec![Some("public")])) as VectorRef ); } _ => unreachable!(), @@ -326,9 +323,8 @@ async fn test_execute_show_databases_tables() { match output { Output::RecordBatches(databases) => { let databases = databases.take(); - let columns = databases[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 2); + assert_eq!(1, databases[0].num_columns()); + assert_eq!(databases[0].column(0).len(), 2); } _ => unreachable!(), } @@ -337,7 +333,7 @@ async fn test_execute_show_databases_tables() { test_util::create_test_table( instance.catalog_manager(), instance.sql_handler(), - ConcreteDataType::timestamp_millis_datatype(), + ConcreteDataType::timestamp_millisecond_datatype(), ) .await .unwrap(); @@ -346,9 +342,8 @@ async fn test_execute_show_databases_tables() { match output { Output::RecordBatches(databases) => { let databases = databases.take(); - let columns = databases[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 3); + assert_eq!(1, databases[0].num_columns()); + assert_eq!(databases[0].column(0).len(), 3); } _ => unreachable!(), } @@ -358,13 +353,12 @@ async fn test_execute_show_databases_tables() { match output { Output::RecordBatches(databases) => { let databases = databases.take(); - let columns = databases[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 1); + assert_eq!(1, databases[0].num_columns()); + assert_eq!(databases[0].column(0).len(), 1); assert_eq!( - *columns[0].as_any().downcast_ref::().unwrap(), - StringArray::from(vec![Some("demo")]) + *databases[0].column(0), + Arc::new(StringVector::from(vec![Some("demo")])) as VectorRef ); } _ => unreachable!(), @@ -394,18 +388,13 @@ pub async fn test_execute_create() { assert!(matches!(output, Output::AffectedRows(1))); } -async fn check_output_stream(output: Output, expected: Vec<&str>) { +async fn check_output_stream(output: Output, expected: String) { let recordbatches = match output { - Output::Stream(stream) => util::collect(stream).await.unwrap(), - Output::RecordBatches(recordbatches) => recordbatches.take(), + Output::Stream(stream) => util::collect_batches(stream).await.unwrap(), + Output::RecordBatches(recordbatches) => recordbatches, _ => unreachable!(), }; - let recordbatches = recordbatches - .into_iter() - .map(|r| r.df_recordbatch) - .collect::>(); - let pretty_print = arrow_print::write(&recordbatches); - let pretty_print = pretty_print.lines().collect::>(); + let pretty_print = recordbatches.pretty_print().unwrap(); assert_eq!(pretty_print, expected); } @@ -438,15 +427,16 @@ async fn test_alter_table() { assert!(matches!(output, Output::AffectedRows(1))); let output = execute_sql(&instance, "select * from demo order by ts").await; - let expected = vec![ - "+-------+-----+--------+---------------------+--------+", - "| host | cpu | memory | ts | my_tag |", - 
"+-------+-----+--------+---------------------+--------+", - "| host1 | 1.1 | 100 | 1970-01-01 00:00:01 | |", - "| host2 | 2.2 | 200 | 1970-01-01 00:00:02 | hello |", - "| host3 | 3.3 | 300 | 1970-01-01 00:00:03 | |", - "+-------+-----+--------+---------------------+--------+", - ]; + let expected = "\ ++-------+-----+--------+---------------------+--------+ +| host | cpu | memory | ts | my_tag | ++-------+-----+--------+---------------------+--------+ +| host1 | 1.1 | 100 | 1970-01-01T00:00:01 | | +| host2 | 2.2 | 200 | 1970-01-01T00:00:02 | hello | +| host3 | 3.3 | 300 | 1970-01-01T00:00:03 | | ++-------+-----+--------+---------------------+--------+\ + " + .to_string(); check_output_stream(output, expected).await; // Drop a column @@ -454,15 +444,16 @@ async fn test_alter_table() { assert!(matches!(output, Output::AffectedRows(0))); let output = execute_sql(&instance, "select * from demo order by ts").await; - let expected = vec![ - "+-------+-----+---------------------+--------+", - "| host | cpu | ts | my_tag |", - "+-------+-----+---------------------+--------+", - "| host1 | 1.1 | 1970-01-01 00:00:01 | |", - "| host2 | 2.2 | 1970-01-01 00:00:02 | hello |", - "| host3 | 3.3 | 1970-01-01 00:00:03 | |", - "+-------+-----+---------------------+--------+", - ]; + let expected = "\ ++-------+-----+---------------------+--------+ +| host | cpu | ts | my_tag | ++-------+-----+---------------------+--------+ +| host1 | 1.1 | 1970-01-01T00:00:01 | | +| host2 | 2.2 | 1970-01-01T00:00:02 | hello | +| host3 | 3.3 | 1970-01-01T00:00:03 | | ++-------+-----+---------------------+--------+\ + " + .to_string(); check_output_stream(output, expected).await; // insert a new row @@ -474,16 +465,17 @@ async fn test_alter_table() { assert!(matches!(output, Output::AffectedRows(1))); let output = execute_sql(&instance, "select * from demo order by ts").await; - let expected = vec![ - "+-------+-----+---------------------+--------+", - "| host | cpu | ts | my_tag |", - "+-------+-----+---------------------+--------+", - "| host1 | 1.1 | 1970-01-01 00:00:01 | |", - "| host2 | 2.2 | 1970-01-01 00:00:02 | hello |", - "| host3 | 3.3 | 1970-01-01 00:00:03 | |", - "| host4 | 400 | 1970-01-01 00:00:04 | world |", - "+-------+-----+---------------------+--------+", - ]; + let expected = "\ ++-------+-----+---------------------+--------+ +| host | cpu | ts | my_tag | ++-------+-----+---------------------+--------+ +| host1 | 1.1 | 1970-01-01T00:00:01 | | +| host2 | 2.2 | 1970-01-01T00:00:02 | hello | +| host3 | 3.3 | 1970-01-01T00:00:03 | | +| host4 | 400 | 1970-01-01T00:00:04 | world | ++-------+-----+---------------------+--------+\ + " + .to_string(); check_output_stream(output, expected).await; } @@ -522,14 +514,15 @@ async fn test_insert_with_default_value_for_type(type_name: &str) { assert!(matches!(output, Output::AffectedRows(1))); let output = execute_sql(&instance, "select host, cpu from test_table").await; - let expected = vec![ - "+-------+-----+", - "| host | cpu |", - "+-------+-----+", - "| host1 | 1.1 |", - "| host2 | 2.2 |", - "+-------+-----+", - ]; + let expected = "\ ++-------+-----+ +| host | cpu | ++-------+-----+ +| host1 | 1.1 | +| host2 | 2.2 | ++-------+-----+\ + " + .to_string(); check_output_stream(output, expected).await; } @@ -559,13 +552,14 @@ async fn test_use_database() { assert!(matches!(output, Output::AffectedRows(1))); let output = execute_sql_in_db(&instance, "show tables", "db1").await; - let expected = vec![ - "+--------+", - "| Tables |", - "+--------+", - "| tb1 |", - 
"+--------+", - ]; + let expected = "\ ++--------+ +| Tables | ++--------+ +| tb1 | ++--------+\ + " + .to_string(); check_output_stream(output, expected).await; let output = execute_sql_in_db( @@ -577,25 +571,27 @@ async fn test_use_database() { assert!(matches!(output, Output::AffectedRows(1))); let output = execute_sql_in_db(&instance, "select col_i32 from tb1", "db1").await; - let expected = vec![ - "+---------+", - "| col_i32 |", - "+---------+", - "| 1 |", - "+---------+", - ]; + let expected = "\ ++---------+ +| col_i32 | ++---------+ +| 1 | ++---------+\ + " + .to_string(); check_output_stream(output, expected).await; // Making a particular database the default by means of the USE statement does not preclude // accessing tables in other databases. let output = execute_sql(&instance, "select number from public.numbers limit 1").await; - let expected = vec![ - "+--------+", - "| number |", - "+--------+", - "| 0 |", - "+--------+", - ]; + let expected = "\ ++--------+ +| number | ++--------+ +| 0 | ++--------+\ + " + .to_string(); check_output_stream(output, expected).await; } diff --git a/src/datatypes/Cargo.toml b/src/datatypes/Cargo.toml index 5c66508dd146..2841decb676f 100644 --- a/src/datatypes/Cargo.toml +++ b/src/datatypes/Cargo.toml @@ -9,10 +9,12 @@ default = [] test = [] [dependencies] +arrow = { version = "26.0" } +arrow-schema = { version = "26.0", features = ["serde"] } common-base = { path = "../common/base" } common-error = { path = "../common/error" } common-time = { path = "../common/time" } -datafusion-common = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" } +datafusion-common = "14.0" enum_dispatch = "0.3" num = "0.4" num-traits = "0.2" @@ -21,17 +23,3 @@ paste = "1.0" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" snafu = { version = "0.7", features = ["backtraces"] } - -[dependencies.arrow] -package = "arrow2" -version = "0.10" -features = [ - "io_csv", - "io_json", - "io_parquet", - "io_parquet_compression", - "io_ipc", - "ahash", - "compute", - "serde_types", -] diff --git a/src/datatypes/src/arrow_array.rs b/src/datatypes/src/arrow_array.rs index ca2cb6cc48cc..72de42214200 100644 --- a/src/datatypes/src/arrow_array.rs +++ b/src/datatypes/src/arrow_array.rs @@ -12,216 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use arrow::array::{ - self, Array, BinaryArray as ArrowBinaryArray, ListArray, - MutableBinaryArray as ArrowMutableBinaryArray, MutableUtf8Array, PrimitiveArray, Utf8Array, -}; -use arrow::datatypes::DataType as ArrowDataType; -use common_time::timestamp::Timestamp; -use snafu::OptionExt; - -use crate::error::{ConversionSnafu, Result}; -use crate::prelude::ConcreteDataType; -use crate::value::{ListValue, Value}; - -pub type BinaryArray = ArrowBinaryArray; -pub type MutableBinaryArray = ArrowMutableBinaryArray; -pub type MutableStringArray = MutableUtf8Array; -pub type StringArray = Utf8Array; - -macro_rules! cast_array { - ($arr: ident, $CastType: ty) => { - $arr.as_any() - .downcast_ref::<$CastType>() - .with_context(|| ConversionSnafu { - from: format!("{:?}", $arr.data_type()), - })? 
- }; -} - -pub fn arrow_array_get(array: &dyn Array, idx: usize) -> Result { - if array.is_null(idx) { - return Ok(Value::Null); - } - - let result = match array.data_type() { - ArrowDataType::Null => Value::Null, - ArrowDataType::Boolean => { - Value::Boolean(cast_array!(array, array::BooleanArray).value(idx)) - } - ArrowDataType::Binary | ArrowDataType::LargeBinary => { - Value::Binary(cast_array!(array, BinaryArray).value(idx).into()) - } - ArrowDataType::Int8 => Value::Int8(cast_array!(array, PrimitiveArray::).value(idx)), - ArrowDataType::Int16 => Value::Int16(cast_array!(array, PrimitiveArray::).value(idx)), - ArrowDataType::Int32 => Value::Int32(cast_array!(array, PrimitiveArray::).value(idx)), - ArrowDataType::Int64 => Value::Int64(cast_array!(array, PrimitiveArray::).value(idx)), - ArrowDataType::UInt8 => Value::UInt8(cast_array!(array, PrimitiveArray::).value(idx)), - ArrowDataType::UInt16 => { - Value::UInt16(cast_array!(array, PrimitiveArray::).value(idx)) - } - ArrowDataType::UInt32 => { - Value::UInt32(cast_array!(array, PrimitiveArray::).value(idx)) - } - ArrowDataType::UInt64 => { - Value::UInt64(cast_array!(array, PrimitiveArray::).value(idx)) - } - ArrowDataType::Float32 => { - Value::Float32(cast_array!(array, PrimitiveArray::).value(idx).into()) - } - ArrowDataType::Float64 => { - Value::Float64(cast_array!(array, PrimitiveArray::).value(idx).into()) - } - ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => { - Value::String(cast_array!(array, StringArray).value(idx).into()) - } - ArrowDataType::Timestamp(t, _) => { - let value = cast_array!(array, PrimitiveArray::).value(idx); - let unit = match ConcreteDataType::from_arrow_time_unit(t) { - ConcreteDataType::Timestamp(t) => t.unit, - _ => unreachable!(), - }; - Value::Timestamp(Timestamp::new(value, unit)) - } - ArrowDataType::List(_) => { - let array = cast_array!(array, ListArray::).value(idx); - let inner_datatype = ConcreteDataType::try_from(array.data_type())?; - let values = (0..array.len()) - .map(|i| arrow_array_get(&*array, i)) - .collect::>>()?; - Value::List(ListValue::new(Some(Box::new(values)), inner_datatype)) - } - _ => unimplemented!("Arrow array datatype: {:?}", array.data_type()), - }; - - Ok(result) -} - -#[cfg(test)] -mod test { - use arrow::array::{ - BooleanArray, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, - MutableListArray, MutablePrimitiveArray, TryExtend, UInt16Array, UInt32Array, UInt64Array, - UInt8Array, - }; - use arrow::buffer::Buffer; - use arrow::datatypes::{DataType, TimeUnit as ArrowTimeUnit}; - use common_time::timestamp::{TimeUnit, Timestamp}; - - use super::*; - use crate::prelude::Vector; - use crate::vectors::TimestampVector; - - #[test] - fn test_arrow_array_access() { - let array1 = BooleanArray::from_slice(vec![true, true, false, false]); - assert_eq!(Value::Boolean(true), arrow_array_get(&array1, 1).unwrap()); - let array1 = Int8Array::from_vec(vec![1, 2, 3, 4]); - assert_eq!(Value::Int8(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = UInt8Array::from_vec(vec![1, 2, 3, 4]); - assert_eq!(Value::UInt8(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = Int16Array::from_vec(vec![1, 2, 3, 4]); - assert_eq!(Value::Int16(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = UInt16Array::from_vec(vec![1, 2, 3, 4]); - assert_eq!(Value::UInt16(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = Int32Array::from_vec(vec![1, 2, 3, 4]); - assert_eq!(Value::Int32(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = 
UInt32Array::from_vec(vec![1, 2, 3, 4]); - assert_eq!(Value::UInt32(2), arrow_array_get(&array1, 1).unwrap()); - let array = Int64Array::from_vec(vec![1, 2, 3, 4]); - assert_eq!(Value::Int64(2), arrow_array_get(&array, 1).unwrap()); - let array1 = UInt64Array::from_vec(vec![1, 2, 3, 4]); - assert_eq!(Value::UInt64(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = Float32Array::from_vec(vec![1f32, 2f32, 3f32, 4f32]); - assert_eq!( - Value::Float32(2f32.into()), - arrow_array_get(&array1, 1).unwrap() - ); - let array1 = Float64Array::from_vec(vec![1f64, 2f64, 3f64, 4f64]); - assert_eq!( - Value::Float64(2f64.into()), - arrow_array_get(&array1, 1).unwrap() - ); - - let array2 = StringArray::from(vec![Some("hello"), None, Some("world")]); - assert_eq!( - Value::String("hello".into()), - arrow_array_get(&array2, 0).unwrap() - ); - assert_eq!(Value::Null, arrow_array_get(&array2, 1).unwrap()); - - let array3 = super::BinaryArray::from(vec![ - Some("hello".as_bytes()), - None, - Some("world".as_bytes()), - ]); - assert_eq!( - Value::Binary("hello".as_bytes().into()), - arrow_array_get(&array3, 0).unwrap() - ); - assert_eq!(Value::Null, arrow_array_get(&array3, 1).unwrap()); - - let vector = TimestampVector::new(Int64Array::from_vec(vec![1, 2, 3, 4])); - let array = vector.to_boxed_arrow_array(); - let value = arrow_array_get(&*array, 1).unwrap(); - assert_eq!( - value, - Value::Timestamp(Timestamp::new(2, TimeUnit::Millisecond)) - ); - - let array4 = PrimitiveArray::::from_data( - DataType::Timestamp(ArrowTimeUnit::Millisecond, None), - Buffer::from_slice(&vec![1, 2, 3, 4]), - None, - ); - assert_eq!( - Value::Timestamp(Timestamp::new(1, TimeUnit::Millisecond)), - arrow_array_get(&array4, 0).unwrap() - ); - - let array4 = PrimitiveArray::::from_data( - DataType::Timestamp(ArrowTimeUnit::Nanosecond, None), - Buffer::from_slice(&vec![1, 2, 3, 4]), - None, - ); - assert_eq!( - Value::Timestamp(Timestamp::new(1, TimeUnit::Nanosecond)), - arrow_array_get(&array4, 0).unwrap() - ); - - // test list array - let data = vec![ - Some(vec![Some(1i32), Some(2), Some(3)]), - None, - Some(vec![Some(4), None, Some(6)]), - ]; - - let mut arrow_array = MutableListArray::>::new(); - arrow_array.try_extend(data).unwrap(); - let arrow_array: ListArray = arrow_array.into(); - - let v0 = arrow_array_get(&arrow_array, 0).unwrap(); - match v0 { - Value::List(list) => { - assert!(matches!(list.datatype(), ConcreteDataType::Int32(_))); - let items = list.items().as_ref().unwrap(); - assert_eq!( - **items, - vec![Value::Int32(1), Value::Int32(2), Value::Int32(3)] - ); - } - _ => unreachable!(), - } - - assert_eq!(Value::Null, arrow_array_get(&arrow_array, 1).unwrap()); - let v2 = arrow_array_get(&arrow_array, 2).unwrap(); - match v2 { - Value::List(list) => { - assert!(matches!(list.datatype(), ConcreteDataType::Int32(_))); - let items = list.items().as_ref().unwrap(); - assert_eq!(**items, vec![Value::Int32(4), Value::Null, Value::Int32(6)]); - } - _ => unreachable!(), - } - } -} +pub type BinaryArray = arrow::array::LargeBinaryArray; +pub type MutableBinaryArray = arrow::array::LargeBinaryBuilder; +pub type StringArray = arrow::array::StringArray; +pub type MutableStringArray = arrow::array::StringBuilder; diff --git a/src/datatypes/src/data_type.rs b/src/datatypes/src/data_type.rs index e14a3d8e8480..9e4641defa30 100644 --- a/src/datatypes/src/data_type.rs +++ b/src/datatypes/src/data_type.rs @@ -14,7 +14,7 @@ use std::sync::Arc; -use arrow::datatypes::DataType as ArrowDataType; +use arrow::datatypes::{DataType 
as ArrowDataType, TimeUnit as ArrowTimeUnit}; use common_time::timestamp::TimeUnit; use paste::paste; use serde::{Deserialize, Serialize}; @@ -23,13 +23,14 @@ use crate::error::{self, Error, Result}; use crate::type_id::LogicalTypeId; use crate::types::{ BinaryType, BooleanType, DateTimeType, DateType, Float32Type, Float64Type, Int16Type, - Int32Type, Int64Type, Int8Type, ListType, NullType, StringType, TimestampType, UInt16Type, - UInt32Type, UInt64Type, UInt8Type, + Int32Type, Int64Type, Int8Type, ListType, NullType, StringType, TimestampMicrosecondType, + TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, TimestampType, + UInt16Type, UInt32Type, UInt64Type, UInt8Type, }; use crate::value::Value; use crate::vectors::MutableVector; -#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] #[enum_dispatch::enum_dispatch(DataType)] pub enum ConcreteDataType { Null(NullType), @@ -47,17 +48,21 @@ pub enum ConcreteDataType { Float32(Float32Type), Float64(Float64Type), - // String types + // String types: Binary(BinaryType), String(StringType), + // Date types: Date(DateType), DateTime(DateTimeType), Timestamp(TimestampType), + // Compound types: List(ListType), } +// TODO(yingwen): Refactor these `is_xxx()` methods, such as adding a `properties()` method +// returning all these properties to the `DataType` trait impl ConcreteDataType { pub fn is_float(&self) -> bool { matches!( @@ -70,7 +75,7 @@ impl ConcreteDataType { matches!(self, ConcreteDataType::Boolean(_)) } - pub fn stringifiable(&self) -> bool { + pub fn is_stringifiable(&self) -> bool { matches!( self, ConcreteDataType::String(_) @@ -103,13 +108,6 @@ impl ConcreteDataType { ) } - pub fn is_timestamp(&self) -> bool { - matches!( - self, - ConcreteDataType::Timestamp(_) | ConcreteDataType::Int64(_) - ) - } - pub fn numerics() -> Vec { vec![ ConcreteDataType::int8_datatype(), @@ -136,6 +134,14 @@ impl ConcreteDataType { pub fn is_null(&self) -> bool { matches!(self, ConcreteDataType::Null(NullType)) } + + /// Try to cast the type as a [`ListType`]. + pub fn as_list(&self) -> Option<&ListType> { + match self { + ConcreteDataType::List(t) => Some(t), + _ => None, + } + } } impl TryFrom<&ArrowDataType> for ConcreteDataType { @@ -161,7 +167,7 @@ impl TryFrom<&ArrowDataType> for ConcreteDataType { ArrowDataType::Binary | ArrowDataType::LargeBinary => Self::binary_datatype(), ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => Self::string_datatype(), ArrowDataType::List(field) => Self::List(ListType::new( - ConcreteDataType::from_arrow_type(&field.data_type), + ConcreteDataType::from_arrow_type(field.data_type()), )), _ => { return error::UnsupportedArrowTypeSnafu { @@ -191,38 +197,52 @@ macro_rules! 
impl_new_concrete_type_functions { impl_new_concrete_type_functions!( Null, Boolean, UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64, - Binary, String, Date, DateTime + Binary, Date, DateTime, String ); impl ConcreteDataType { - pub fn list_datatype(inner_type: ConcreteDataType) -> ConcreteDataType { - ConcreteDataType::List(ListType::new(inner_type)) + pub fn timestamp_second_datatype() -> Self { + ConcreteDataType::Timestamp(TimestampType::Second(TimestampSecondType::default())) } - pub fn timestamp_datatype(unit: TimeUnit) -> Self { - ConcreteDataType::Timestamp(TimestampType::new(unit)) + pub fn timestamp_millisecond_datatype() -> Self { + ConcreteDataType::Timestamp(TimestampType::Millisecond( + TimestampMillisecondType::default(), + )) + } + + pub fn timestamp_microsecond_datatype() -> Self { + ConcreteDataType::Timestamp(TimestampType::Microsecond( + TimestampMicrosecondType::default(), + )) + } + + pub fn timestamp_nanosecond_datatype() -> Self { + ConcreteDataType::Timestamp(TimestampType::Nanosecond(TimestampNanosecondType::default())) } - pub fn timestamp_millis_datatype() -> Self { - ConcreteDataType::Timestamp(TimestampType::new(TimeUnit::Millisecond)) + pub fn timestamp_datatype(unit: TimeUnit) -> Self { + match unit { + TimeUnit::Second => Self::timestamp_second_datatype(), + TimeUnit::Millisecond => Self::timestamp_millisecond_datatype(), + TimeUnit::Microsecond => Self::timestamp_microsecond_datatype(), + TimeUnit::Nanosecond => Self::timestamp_nanosecond_datatype(), + } } /// Converts from arrow timestamp unit to - // TODO(hl): maybe impl From for our timestamp ? - pub fn from_arrow_time_unit(t: &arrow::datatypes::TimeUnit) -> Self { + pub fn from_arrow_time_unit(t: &ArrowTimeUnit) -> Self { match t { - arrow::datatypes::TimeUnit::Second => Self::timestamp_datatype(TimeUnit::Second), - arrow::datatypes::TimeUnit::Millisecond => { - Self::timestamp_datatype(TimeUnit::Millisecond) - } - arrow::datatypes::TimeUnit::Microsecond => { - Self::timestamp_datatype(TimeUnit::Microsecond) - } - arrow::datatypes::TimeUnit::Nanosecond => { - Self::timestamp_datatype(TimeUnit::Nanosecond) - } + ArrowTimeUnit::Second => Self::timestamp_second_datatype(), + ArrowTimeUnit::Millisecond => Self::timestamp_millisecond_datatype(), + ArrowTimeUnit::Microsecond => Self::timestamp_microsecond_datatype(), + ArrowTimeUnit::Nanosecond => Self::timestamp_nanosecond_datatype(), } } + + pub fn list_datatype(item_type: ConcreteDataType) -> ConcreteDataType { + ConcreteDataType::List(ListType::new(item_type)) + } } /// Data type abstraction. @@ -237,11 +257,15 @@ pub trait DataType: std::fmt::Debug + Send + Sync { /// Returns the default value of this type. fn default_value(&self) -> Value; - /// Convert this type as [arrow2::datatypes::DataType]. + /// Convert this type as [arrow::datatypes::DataType]. fn as_arrow_type(&self) -> ArrowDataType; - /// Create a mutable vector with given `capacity` of this type. + /// Creates a mutable vector with given `capacity` of this type. fn create_mutable_vector(&self, capacity: usize) -> Box; + + /// Returns true if the data type is compatible with timestamp type so we can + /// use it as a timestamp. 
+ fn is_timestamp_compatible(&self) -> bool; } pub type DataTypeRef = Arc; @@ -324,10 +348,6 @@ mod tests { ConcreteDataType::from_arrow_type(&ArrowDataType::Utf8), ConcreteDataType::String(_) )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::Utf8), - ConcreteDataType::String(_) - )); assert_eq!( ConcreteDataType::from_arrow_type(&ArrowDataType::List(Box::new(Field::new( "item", @@ -345,31 +365,48 @@ mod tests { #[test] fn test_from_arrow_timestamp() { assert_eq!( - ConcreteDataType::timestamp_millis_datatype(), - ConcreteDataType::from_arrow_time_unit(&arrow::datatypes::TimeUnit::Millisecond) + ConcreteDataType::timestamp_millisecond_datatype(), + ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Millisecond) ); assert_eq!( - ConcreteDataType::timestamp_datatype(TimeUnit::Microsecond), - ConcreteDataType::from_arrow_time_unit(&arrow::datatypes::TimeUnit::Microsecond) + ConcreteDataType::timestamp_microsecond_datatype(), + ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Microsecond) ); assert_eq!( - ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond), - ConcreteDataType::from_arrow_time_unit(&arrow::datatypes::TimeUnit::Nanosecond) + ConcreteDataType::timestamp_nanosecond_datatype(), + ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Nanosecond) ); assert_eq!( - ConcreteDataType::timestamp_datatype(TimeUnit::Second), - ConcreteDataType::from_arrow_time_unit(&arrow::datatypes::TimeUnit::Second) + ConcreteDataType::timestamp_second_datatype(), + ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Second) ); } #[test] - fn test_is_timestamp() { - assert!(ConcreteDataType::timestamp_millis_datatype().is_timestamp()); - assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Second).is_timestamp()); - assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Millisecond).is_timestamp()); - assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Microsecond).is_timestamp()); - assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond).is_timestamp()); - assert!(ConcreteDataType::int64_datatype().is_timestamp()); + fn test_is_timestamp_compatible() { + assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Second).is_timestamp_compatible()); + assert!( + ConcreteDataType::timestamp_datatype(TimeUnit::Millisecond).is_timestamp_compatible() + ); + assert!( + ConcreteDataType::timestamp_datatype(TimeUnit::Microsecond).is_timestamp_compatible() + ); + assert!( + ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond).is_timestamp_compatible() + ); + assert!(ConcreteDataType::timestamp_second_datatype().is_timestamp_compatible()); + assert!(ConcreteDataType::timestamp_millisecond_datatype().is_timestamp_compatible()); + assert!(ConcreteDataType::timestamp_microsecond_datatype().is_timestamp_compatible()); + assert!(ConcreteDataType::timestamp_nanosecond_datatype().is_timestamp_compatible()); + assert!(ConcreteDataType::int64_datatype().is_timestamp_compatible()); + assert!(!ConcreteDataType::null_datatype().is_timestamp_compatible()); + assert!(!ConcreteDataType::binary_datatype().is_timestamp_compatible()); + assert!(!ConcreteDataType::boolean_datatype().is_timestamp_compatible()); + assert!(!ConcreteDataType::date_datatype().is_timestamp_compatible()); + assert!(!ConcreteDataType::datetime_datatype().is_timestamp_compatible()); + assert!(!ConcreteDataType::string_datatype().is_timestamp_compatible()); + assert!(!ConcreteDataType::int32_datatype().is_timestamp_compatible()); + 
assert!(!ConcreteDataType::uint64_datatype().is_timestamp_compatible()); } #[test] @@ -377,4 +414,91 @@ mod tests { assert!(ConcreteDataType::null_datatype().is_null()); assert!(!ConcreteDataType::int32_datatype().is_null()); } + + #[test] + fn test_is_float() { + assert!(!ConcreteDataType::int32_datatype().is_float()); + assert!(ConcreteDataType::float32_datatype().is_float()); + assert!(ConcreteDataType::float64_datatype().is_float()); + } + + #[test] + fn test_is_boolean() { + assert!(!ConcreteDataType::int32_datatype().is_boolean()); + assert!(!ConcreteDataType::float32_datatype().is_boolean()); + assert!(ConcreteDataType::boolean_datatype().is_boolean()); + } + + #[test] + fn test_is_stringifiable() { + assert!(!ConcreteDataType::int32_datatype().is_stringifiable()); + assert!(!ConcreteDataType::float32_datatype().is_stringifiable()); + assert!(ConcreteDataType::string_datatype().is_stringifiable()); + assert!(ConcreteDataType::date_datatype().is_stringifiable()); + assert!(ConcreteDataType::datetime_datatype().is_stringifiable()); + assert!(ConcreteDataType::timestamp_second_datatype().is_stringifiable()); + assert!(ConcreteDataType::timestamp_millisecond_datatype().is_stringifiable()); + assert!(ConcreteDataType::timestamp_microsecond_datatype().is_stringifiable()); + assert!(ConcreteDataType::timestamp_nanosecond_datatype().is_stringifiable()); + } + + #[test] + fn test_is_signed() { + assert!(ConcreteDataType::int8_datatype().is_signed()); + assert!(ConcreteDataType::int16_datatype().is_signed()); + assert!(ConcreteDataType::int32_datatype().is_signed()); + assert!(ConcreteDataType::int64_datatype().is_signed()); + assert!(ConcreteDataType::date_datatype().is_signed()); + assert!(ConcreteDataType::datetime_datatype().is_signed()); + assert!(ConcreteDataType::timestamp_second_datatype().is_signed()); + assert!(ConcreteDataType::timestamp_millisecond_datatype().is_signed()); + assert!(ConcreteDataType::timestamp_microsecond_datatype().is_signed()); + assert!(ConcreteDataType::timestamp_nanosecond_datatype().is_signed()); + + assert!(!ConcreteDataType::uint8_datatype().is_signed()); + assert!(!ConcreteDataType::uint16_datatype().is_signed()); + assert!(!ConcreteDataType::uint32_datatype().is_signed()); + assert!(!ConcreteDataType::uint64_datatype().is_signed()); + + assert!(!ConcreteDataType::float32_datatype().is_signed()); + assert!(!ConcreteDataType::float64_datatype().is_signed()); + } + + #[test] + fn test_is_unsigned() { + assert!(!ConcreteDataType::int8_datatype().is_unsigned()); + assert!(!ConcreteDataType::int16_datatype().is_unsigned()); + assert!(!ConcreteDataType::int32_datatype().is_unsigned()); + assert!(!ConcreteDataType::int64_datatype().is_unsigned()); + assert!(!ConcreteDataType::date_datatype().is_unsigned()); + assert!(!ConcreteDataType::datetime_datatype().is_unsigned()); + assert!(!ConcreteDataType::timestamp_second_datatype().is_unsigned()); + assert!(!ConcreteDataType::timestamp_millisecond_datatype().is_unsigned()); + assert!(!ConcreteDataType::timestamp_microsecond_datatype().is_unsigned()); + assert!(!ConcreteDataType::timestamp_nanosecond_datatype().is_unsigned()); + + assert!(ConcreteDataType::uint8_datatype().is_unsigned()); + assert!(ConcreteDataType::uint16_datatype().is_unsigned()); + assert!(ConcreteDataType::uint32_datatype().is_unsigned()); + assert!(ConcreteDataType::uint64_datatype().is_unsigned()); + + assert!(!ConcreteDataType::float32_datatype().is_unsigned()); + assert!(!ConcreteDataType::float64_datatype().is_unsigned()); + } + + #[test] + 
fn test_numerics() { + let nums = ConcreteDataType::numerics(); + assert_eq!(10, nums.len()); + } + + #[test] + fn test_as_list() { + let list_type = ConcreteDataType::list_datatype(ConcreteDataType::int32_datatype()); + assert_eq!( + ListType::new(ConcreteDataType::int32_datatype()), + *list_type.as_list().unwrap() + ); + assert!(ConcreteDataType::int32_datatype().as_list().is_none()); + } } diff --git a/src/datatypes/src/error.rs b/src/datatypes/src/error.rs index 50b49cf2b4bb..2cb8553a900d 100644 --- a/src/datatypes/src/error.rs +++ b/src/datatypes/src/error.rs @@ -99,6 +99,12 @@ pub enum Error { #[snafu(display("Duplicated metadata for {}", key))] DuplicateMeta { key: String, backtrace: Backtrace }, + + #[snafu(display("Failed to convert value into scalar value, reason: {}", reason))] + ToScalarValue { + reason: String, + backtrace: Backtrace, + }, } impl ErrorExt for Error { diff --git a/src/datatypes/src/lib.rs b/src/datatypes/src/lib.rs index f6f6db112a2c..3051c7a4b3e3 100644 --- a/src/datatypes/src/lib.rs +++ b/src/datatypes/src/lib.rs @@ -20,9 +20,10 @@ pub mod data_type; pub mod error; pub mod macros; pub mod prelude; -mod scalars; +pub mod scalars; pub mod schema; pub mod serialize; +pub mod timestamp; pub mod type_id; pub mod types; pub mod value; diff --git a/src/datatypes/src/macros.rs b/src/datatypes/src/macros.rs index 18be9fa375b3..37c0a42e3f55 100644 --- a/src/datatypes/src/macros.rs +++ b/src/datatypes/src/macros.rs @@ -12,27 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -///! Some helper macros for datatypes, copied from databend. -#[macro_export] -macro_rules! for_all_scalar_types { - ($macro:tt $(, $x:tt)*) => { - $macro! { - [$($x),*], - { i8 }, - { i16 }, - { i32 }, - { i64 }, - { u8 }, - { u16 }, - { u32 }, - { u64 }, - { f32 }, - { f64 }, - { bool }, - } - }; -} +//! Some helper macros for datatypes, copied from databend. +/// Apply the macro rules to all primitive types. #[macro_export] macro_rules! for_all_primitive_types { ($macro:tt $(, $x:tt)*) => { @@ -52,6 +34,8 @@ macro_rules! for_all_primitive_types { }; } +/// Match the logical type and apply `$body` to all primitive types and +/// `nbody` to other types. #[macro_export] macro_rules! with_match_primitive_type_id { ($key_type:expr, | $_:tt $T:ident | $body:tt, $nbody:tt) => {{ @@ -62,17 +46,21 @@ macro_rules! with_match_primitive_type_id { } use $crate::type_id::LogicalTypeId; + use $crate::types::{ + Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, + UInt32Type, UInt64Type, UInt8Type, + }; match $key_type { - LogicalTypeId::Int8 => __with_ty__! { i8 }, - LogicalTypeId::Int16 => __with_ty__! { i16 }, - LogicalTypeId::Int32 => __with_ty__! { i32 }, - LogicalTypeId::Int64 => __with_ty__! { i64 }, - LogicalTypeId::UInt8 => __with_ty__! { u8 }, - LogicalTypeId::UInt16 => __with_ty__! { u16 }, - LogicalTypeId::UInt32 => __with_ty__! { u32 }, - LogicalTypeId::UInt64 => __with_ty__! { u64 }, - LogicalTypeId::Float32 => __with_ty__! { f32 }, - LogicalTypeId::Float64 => __with_ty__! { f64 }, + LogicalTypeId::Int8 => __with_ty__! { Int8Type }, + LogicalTypeId::Int16 => __with_ty__! { Int16Type }, + LogicalTypeId::Int32 => __with_ty__! { Int32Type }, + LogicalTypeId::Int64 => __with_ty__! { Int64Type }, + LogicalTypeId::UInt8 => __with_ty__! { UInt8Type }, + LogicalTypeId::UInt16 => __with_ty__! { UInt16Type }, + LogicalTypeId::UInt32 => __with_ty__! { UInt32Type }, + LogicalTypeId::UInt64 => __with_ty__! 
{ UInt64Type },
+        LogicalTypeId::Float32 => __with_ty__! { Float32Type },
+        LogicalTypeId::Float64 => __with_ty__! { Float64Type },
         _ => $nbody,
     }
diff --git a/src/datatypes/src/prelude.rs b/src/datatypes/src/prelude.rs
index 014a40efaf65..b1afe93042f4 100644
--- a/src/datatypes/src/prelude.rs
+++ b/src/datatypes/src/prelude.rs
@@ -16,8 +16,6 @@
 pub use crate::data_type::{ConcreteDataType, DataType, DataTypeRef};
 pub use crate::macros::*;
 pub use crate::scalars::{Scalar, ScalarRef, ScalarVector, ScalarVectorBuilder};
 pub use crate::type_id::LogicalTypeId;
-pub use crate::types::Primitive;
+pub use crate::types::{LogicalPrimitiveType, WrapperType};
 pub use crate::value::{Value, ValueRef};
-pub use crate::vectors::{
-    Helper as VectorHelper, MutableVector, Validity, Vector, VectorBuilder, VectorRef,
-};
+pub use crate::vectors::{MutableVector, Validity, Vector, VectorRef};
diff --git a/src/datatypes/src/scalars.rs b/src/datatypes/src/scalars.rs
index ddb8eff007ed..327ebaa629a2 100644
--- a/src/datatypes/src/scalars.rs
+++ b/src/datatypes/src/scalars.rs
@@ -14,11 +14,17 @@
 use std::any::Any;

-use common_time::{Date, DateTime, Timestamp};
-
-use crate::prelude::*;
-use crate::value::{ListValue, ListValueRef};
-use crate::vectors::*;
+use common_time::{Date, DateTime};
+
+use crate::types::{
+    Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, UInt32Type,
+    UInt64Type, UInt8Type,
+};
+use crate::value::{ListValue, ListValueRef, Value};
+use crate::vectors::{
+    BinaryVector, BooleanVector, DateTimeVector, DateVector, ListVector, MutableVector,
+    PrimitiveVector, StringVector, Vector,
+};

 fn get_iter_capacity<T, I: Iterator<Item = T>>(iter: &I) -> usize {
     match iter.size_hint() {
@@ -35,7 +41,7 @@
 where
     for<'a> Self::VectorType: ScalarVector<RefItem<'a> = Self::RefType<'a>>,
 {
     type VectorType: ScalarVector;
-    type RefType<'a>: ScalarRef<'a, ScalarType = Self, VectorType = Self::VectorType>
+    type RefType<'a>: ScalarRef<'a, ScalarType = Self>
     where
         Self: 'a;

     /// Get a reference of the current value.
@@ -46,7 +52,6 @@
 }

 pub trait ScalarRef<'a>: std::fmt::Debug + Clone + Copy + Send + 'a {
-    type VectorType: ScalarVector<RefItem<'a> = Self>;
     /// The corresponding [`Scalar`] type.
     type ScalarType: Scalar<RefType<'a> = Self>;
@@ -63,7 +68,7 @@
 where
 {
     type OwnedItem: Scalar;
     /// The reference item of this vector.
-    type RefItem<'a>: ScalarRef<'a, ScalarType = Self::OwnedItem, VectorType = Self>
+    type RefItem<'a>: ScalarRef<'a, ScalarType = Self::OwnedItem>
     where
         Self: 'a;
@@ -137,47 +142,46 @@ pub trait ScalarVectorBuilder: MutableVector {
     fn finish(&mut self) -> Self::VectorType;
 }

-macro_rules! impl_primitive_scalar_type {
-    ($native:ident) => {
-        impl Scalar for $native {
-            type VectorType = PrimitiveVector<$native>;
-            type RefType<'a> = $native;
+macro_rules! impl_scalar_for_native {
+    ($Native: ident, $DataType: ident) => {
+        impl Scalar for $Native {
+            type VectorType = PrimitiveVector<$DataType>;
+            type RefType<'a> = $Native;

             #[inline]
-            fn as_scalar_ref(&self) -> $native {
+            fn as_scalar_ref(&self) -> $Native {
                 *self
             }

             #[allow(clippy::needless_lifetimes)]
             #[inline]
-            fn upcast_gat<'short, 'long: 'short>(long: $native) -> $native {
+            fn upcast_gat<'short, 'long: 'short>(long: $Native) -> $Native {
                 long
             }
         }

         /// Implement [`ScalarRef`] for primitive types. Note that primitive types are both [`Scalar`] and [`ScalarRef`].
- impl<'a> ScalarRef<'a> for $native { - type VectorType = PrimitiveVector<$native>; - type ScalarType = $native; + impl<'a> ScalarRef<'a> for $Native { + type ScalarType = $Native; #[inline] - fn to_owned_scalar(&self) -> $native { + fn to_owned_scalar(&self) -> $Native { *self } } }; } -impl_primitive_scalar_type!(u8); -impl_primitive_scalar_type!(u16); -impl_primitive_scalar_type!(u32); -impl_primitive_scalar_type!(u64); -impl_primitive_scalar_type!(i8); -impl_primitive_scalar_type!(i16); -impl_primitive_scalar_type!(i32); -impl_primitive_scalar_type!(i64); -impl_primitive_scalar_type!(f32); -impl_primitive_scalar_type!(f64); +impl_scalar_for_native!(u8, UInt8Type); +impl_scalar_for_native!(u16, UInt16Type); +impl_scalar_for_native!(u32, UInt32Type); +impl_scalar_for_native!(u64, UInt64Type); +impl_scalar_for_native!(i8, Int8Type); +impl_scalar_for_native!(i16, Int16Type); +impl_scalar_for_native!(i32, Int32Type); +impl_scalar_for_native!(i64, Int64Type); +impl_scalar_for_native!(f32, Float32Type); +impl_scalar_for_native!(f64, Float64Type); impl Scalar for bool { type VectorType = BooleanVector; @@ -196,7 +200,6 @@ impl Scalar for bool { } impl<'a> ScalarRef<'a> for bool { - type VectorType = BooleanVector; type ScalarType = bool; #[inline] @@ -221,7 +224,6 @@ impl Scalar for String { } impl<'a> ScalarRef<'a> for &'a str { - type VectorType = StringVector; type ScalarType = String; #[inline] @@ -246,7 +248,6 @@ impl Scalar for Vec { } impl<'a> ScalarRef<'a> for &'a [u8] { - type VectorType = BinaryVector; type ScalarType = Vec; #[inline] @@ -269,7 +270,6 @@ impl Scalar for Date { } impl<'a> ScalarRef<'a> for Date { - type VectorType = DateVector; type ScalarType = Date; fn to_owned_scalar(&self) -> Self::ScalarType { @@ -291,7 +291,6 @@ impl Scalar for DateTime { } impl<'a> ScalarRef<'a> for DateTime { - type VectorType = DateTimeVector; type ScalarType = DateTime; fn to_owned_scalar(&self) -> Self::ScalarType { @@ -299,27 +298,7 @@ impl<'a> ScalarRef<'a> for DateTime { } } -impl Scalar for Timestamp { - type VectorType = TimestampVector; - type RefType<'a> = Timestamp; - - fn as_scalar_ref(&self) -> Self::RefType<'_> { - *self - } - - fn upcast_gat<'short, 'long: 'short>(long: Self::RefType<'long>) -> Self::RefType<'short> { - long - } -} - -impl<'a> ScalarRef<'a> for Timestamp { - type VectorType = TimestampVector; - type ScalarType = Timestamp; - - fn to_owned_scalar(&self) -> Self::ScalarType { - *self - } -} +// Timestamp types implement Scalar and ScalarRef in `src/timestamp.rs`. 
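(Aside on the `Scalar`/`ScalarVector` refactor in this hunk: with `VectorType` removed from `ScalarRef`, the trait link now flows one way, from scalar to vector. Below is a minimal sketch of generic code written against these traits as they appear in the diff; `collect_non_null` is a hypothetical helper, not part of the patch, and assumes the `get_data`/`to_owned_scalar` signatures visible in the surrounding hunks.)

```rust
use datatypes::prelude::{ScalarRef, ScalarVector, Vector};

// Hypothetical helper: collect the non-null values of any ScalarVector
// (Int32Vector, StringVector, ...) as owned scalars. It only needs the
// one-way bound kept above: RefItem<'a>: ScalarRef<'a, ScalarType = OwnedItem>.
fn collect_non_null<V: ScalarVector>(vector: &V) -> Vec<V::OwnedItem> {
    (0..vector.len())
        .filter_map(|i| vector.get_data(i)) // None for null slots
        .map(|item| item.to_owned_scalar()) // RefItem<'_> -> OwnedItem
        .collect()
}
```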
impl Scalar for ListValue {
     type VectorType = ListVector;
@@ -335,7 +314,6 @@ }

 impl<'a> ScalarRef<'a> for ListValueRef<'a> {
-    type VectorType = ListVector;
     type ScalarType = ListValue;

     fn to_owned_scalar(&self) -> Self::ScalarType {
@@ -357,8 +335,9 @@
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::vectors::binary::BinaryVector;
-    use crate::vectors::primitive::Int32Vector;
+    use crate::data_type::ConcreteDataType;
+    use crate::timestamp::TimestampSecond;
+    use crate::vectors::{BinaryVector, Int32Vector, ListVectorBuilder, TimestampSecondVector};

     fn build_vector_from_slice<T: ScalarVector>(items: &[Option<T::RefItem<'_>>]) -> T {
         let mut builder = T::Builder::with_capacity(items.len());
@@ -454,11 +433,11 @@
     #[test]
     fn test_build_timestamp_vector() {
-        let expect: Vec<Option<Timestamp>> = vec![Some(10.into()), None, Some(42.into())];
-        let vector: TimestampVector = build_vector_from_slice(&expect);
+        let expect: Vec<Option<TimestampSecond>> = vec![Some(10.into()), None, Some(42.into())];
+        let vector: TimestampSecondVector = build_vector_from_slice(&expect);
         assert_vector_eq(&expect, &vector);
         let val = vector.get_data(0).unwrap();
         assert_eq!(val, val.as_scalar_ref());
-        assert_eq!(10, val.to_owned_scalar().value());
+        assert_eq!(TimestampSecond::from(10), val.to_owned_scalar());
     }
 }
diff --git a/src/datatypes/src/schema.rs b/src/datatypes/src/schema.rs
index e3a5661dfd38..4952e36cc0fd 100644
--- a/src/datatypes/src/schema.rs
+++ b/src/datatypes/src/schema.rs
@@ -12,129 +12,28 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+mod column_schema;
 mod constraint;
 mod raw;

 use std::collections::HashMap;
 use std::sync::Arc;

-pub use arrow::datatypes::Metadata;
 use arrow::datatypes::{Field, Schema as ArrowSchema};
 use datafusion_common::DFSchemaRef;
-use serde::{Deserialize, Serialize};
 use snafu::{ensure, ResultExt};

-use crate::data_type::{ConcreteDataType, DataType};
-use crate::error::{self, DeserializeSnafu, Error, Result, SerializeSnafu};
+use crate::data_type::DataType;
+use crate::error::{self, Error, Result};
+pub use crate::schema::column_schema::{ColumnSchema, Metadata};
 pub use crate::schema::constraint::ColumnDefaultConstraint;
 pub use crate::schema::raw::RawSchema;
-use crate::vectors::VectorRef;

-/// Key used to store whether the column is time index in arrow field's metadata.
-const TIME_INDEX_KEY: &str = "greptime:time_index";
 /// Key used to store version number of the schema in metadata.
 const VERSION_KEY: &str = "greptime:version";
-/// Key used to store default constraint in arrow field's metadata.
-const ARROW_FIELD_DEFAULT_CONSTRAINT_KEY: &str = "greptime:default_constraint";
-
-/// Schema of a column, used as an immutable struct.
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -pub struct ColumnSchema { - pub name: String, - pub data_type: ConcreteDataType, - is_nullable: bool, - is_time_index: bool, - default_constraint: Option, - metadata: Metadata, -} - -impl ColumnSchema { - pub fn new>( - name: T, - data_type: ConcreteDataType, - is_nullable: bool, - ) -> ColumnSchema { - ColumnSchema { - name: name.into(), - data_type, - is_nullable, - is_time_index: false, - default_constraint: None, - metadata: Metadata::new(), - } - } - - #[inline] - pub fn is_time_index(&self) -> bool { - self.is_time_index - } - - #[inline] - pub fn is_nullable(&self) -> bool { - self.is_nullable - } - - #[inline] - pub fn default_constraint(&self) -> Option<&ColumnDefaultConstraint> { - self.default_constraint.as_ref() - } - - #[inline] - pub fn metadata(&self) -> &Metadata { - &self.metadata - } - - pub fn with_time_index(mut self, is_time_index: bool) -> Self { - self.is_time_index = is_time_index; - if is_time_index { - self.metadata - .insert(TIME_INDEX_KEY.to_string(), "true".to_string()); - } else { - self.metadata.remove(TIME_INDEX_KEY); - } - self - } - - pub fn with_default_constraint( - mut self, - default_constraint: Option, - ) -> Result { - if let Some(constraint) = &default_constraint { - constraint.validate(&self.data_type, self.is_nullable)?; - } - - self.default_constraint = default_constraint; - Ok(self) - } - - /// Creates a new [`ColumnSchema`] with given metadata. - pub fn with_metadata(mut self, metadata: Metadata) -> Self { - self.metadata = metadata; - self - } - - pub fn create_default_vector(&self, num_rows: usize) -> Result> { - match &self.default_constraint { - Some(c) => c - .create_default_vector(&self.data_type, self.is_nullable, num_rows) - .map(Some), - None => { - if self.is_nullable { - // No default constraint, use null as default value. - // TODO(yingwen): Use NullVector once it supports setting logical type. - ColumnDefaultConstraint::null_value() - .create_default_vector(&self.data_type, self.is_nullable, num_rows) - .map(Some) - } else { - Ok(None) - } - } - } - } -} /// A common schema, should be immutable. 
-#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Eq)] pub struct Schema { column_schemas: Vec, name_to_index: HashMap, @@ -232,7 +131,7 @@ impl Schema { } #[inline] - pub fn metadata(&self) -> &Metadata { + pub fn metadata(&self) -> &HashMap { &self.arrow_schema.metadata } } @@ -244,7 +143,7 @@ pub struct SchemaBuilder { fields: Vec, timestamp_index: Option, version: u32, - metadata: Metadata, + metadata: HashMap, } impl TryFrom> for SchemaBuilder { @@ -293,7 +192,7 @@ impl SchemaBuilder { self.metadata .insert(VERSION_KEY.to_string(), self.version.to_string()); - let arrow_schema = ArrowSchema::from(self.fields).with_metadata(self.metadata); + let arrow_schema = ArrowSchema::new(self.fields).with_metadata(self.metadata); Ok(Schema { column_schemas: self.column_schemas, @@ -348,7 +247,7 @@ fn validate_timestamp_index(column_schemas: &[ColumnSchema], timestamp_index: us let column_schema = &column_schemas[timestamp_index]; ensure!( - column_schema.data_type.is_timestamp(), + column_schema.data_type.is_timestamp_compatible(), error::InvalidTimestampIndexSnafu { index: timestamp_index, } @@ -365,58 +264,6 @@ fn validate_timestamp_index(column_schemas: &[ColumnSchema], timestamp_index: us pub type SchemaRef = Arc; -impl TryFrom<&Field> for ColumnSchema { - type Error = Error; - - fn try_from(field: &Field) -> Result { - let data_type = ConcreteDataType::try_from(&field.data_type)?; - let mut metadata = field.metadata.clone(); - let default_constraint = match metadata.remove(ARROW_FIELD_DEFAULT_CONSTRAINT_KEY) { - Some(json) => Some(serde_json::from_str(&json).context(DeserializeSnafu { json })?), - None => None, - }; - let is_time_index = metadata.contains_key(TIME_INDEX_KEY); - - Ok(ColumnSchema { - name: field.name.clone(), - data_type, - is_nullable: field.is_nullable, - is_time_index, - default_constraint, - metadata, - }) - } -} - -impl TryFrom<&ColumnSchema> for Field { - type Error = Error; - - fn try_from(column_schema: &ColumnSchema) -> Result { - let mut metadata = column_schema.metadata.clone(); - if let Some(value) = &column_schema.default_constraint { - // Adds an additional metadata to store the default constraint. 
- let old = metadata.insert( - ARROW_FIELD_DEFAULT_CONSTRAINT_KEY.to_string(), - serde_json::to_string(&value).context(SerializeSnafu)?, - ); - - ensure!( - old.is_none(), - error::DuplicateMetaSnafu { - key: ARROW_FIELD_DEFAULT_CONSTRAINT_KEY, - } - ); - } - - Ok(Field::new( - column_schema.name.clone(), - column_schema.data_type.as_arrow_type(), - column_schema.is_nullable(), - ) - .with_metadata(metadata)) - } -} - impl TryFrom> for Schema { type Error = Error; @@ -425,7 +272,7 @@ impl TryFrom> for Schema { let mut name_to_index = HashMap::with_capacity(arrow_schema.fields.len()); for field in &arrow_schema.fields { let column_schema = ColumnSchema::try_from(field)?; - name_to_index.insert(field.name.clone(), column_schemas.len()); + name_to_index.insert(field.name().to_string(), column_schemas.len()); column_schemas.push(column_schema); } @@ -475,7 +322,7 @@ impl TryFrom for Schema { } } -fn try_parse_version(metadata: &Metadata, key: &str) -> Result { +fn try_parse_version(metadata: &HashMap, key: &str) -> Result { if let Some(value) = metadata.get(key) { let version = value .parse() @@ -489,127 +336,8 @@ fn try_parse_version(metadata: &Metadata, key: &str) -> Result { #[cfg(test)] mod tests { - use arrow::datatypes::DataType as ArrowDataType; - use super::*; - use crate::value::Value; - - #[test] - fn test_column_schema() { - let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true); - let field = Field::try_from(&column_schema).unwrap(); - assert_eq!("test", field.name); - assert_eq!(ArrowDataType::Int32, field.data_type); - assert!(field.is_nullable); - - let new_column_schema = ColumnSchema::try_from(&field).unwrap(); - assert_eq!(column_schema, new_column_schema); - } - - #[test] - fn test_column_schema_with_default_constraint() { - let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true) - .with_default_constraint(Some(ColumnDefaultConstraint::Value(Value::from(99)))) - .unwrap(); - assert!(column_schema - .metadata() - .get(ARROW_FIELD_DEFAULT_CONSTRAINT_KEY) - .is_none()); - - let field = Field::try_from(&column_schema).unwrap(); - assert_eq!("test", field.name); - assert_eq!(ArrowDataType::Int32, field.data_type); - assert!(field.is_nullable); - assert_eq!( - "{\"Value\":{\"Int32\":99}}", - field - .metadata - .get(ARROW_FIELD_DEFAULT_CONSTRAINT_KEY) - .unwrap() - ); - - let new_column_schema = ColumnSchema::try_from(&field).unwrap(); - assert_eq!(column_schema, new_column_schema); - } - - #[test] - fn test_column_schema_with_metadata() { - let mut metadata = Metadata::new(); - metadata.insert("k1".to_string(), "v1".to_string()); - let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true) - .with_metadata(metadata) - .with_default_constraint(Some(ColumnDefaultConstraint::null_value())) - .unwrap(); - assert_eq!("v1", column_schema.metadata().get("k1").unwrap()); - assert!(column_schema - .metadata() - .get(ARROW_FIELD_DEFAULT_CONSTRAINT_KEY) - .is_none()); - - let field = Field::try_from(&column_schema).unwrap(); - assert_eq!("v1", field.metadata.get("k1").unwrap()); - assert!(field - .metadata - .get(ARROW_FIELD_DEFAULT_CONSTRAINT_KEY) - .is_some()); - - let new_column_schema = ColumnSchema::try_from(&field).unwrap(); - assert_eq!(column_schema, new_column_schema); - } - - #[test] - fn test_column_schema_with_duplicate_metadata() { - let mut metadata = Metadata::new(); - metadata.insert( - ARROW_FIELD_DEFAULT_CONSTRAINT_KEY.to_string(), - "v1".to_string(), - ); - let column_schema = 
ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true) - .with_metadata(metadata) - .with_default_constraint(Some(ColumnDefaultConstraint::null_value())) - .unwrap(); - Field::try_from(&column_schema).unwrap_err(); - } - - #[test] - fn test_column_schema_invalid_default_constraint() { - ColumnSchema::new("test", ConcreteDataType::int32_datatype(), false) - .with_default_constraint(Some(ColumnDefaultConstraint::null_value())) - .unwrap_err(); - } - - #[test] - fn test_column_default_constraint_try_into_from() { - let default_constraint = ColumnDefaultConstraint::Value(Value::from(42i64)); - - let bytes: Vec = default_constraint.clone().try_into().unwrap(); - let from_value = ColumnDefaultConstraint::try_from(&bytes[..]).unwrap(); - - assert_eq!(default_constraint, from_value); - } - - #[test] - fn test_column_schema_create_default_null() { - // Implicit default null. - let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true); - let v = column_schema.create_default_vector(5).unwrap().unwrap(); - assert_eq!(5, v.len()); - assert!(v.only_null()); - - // Explicit default null. - let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true) - .with_default_constraint(Some(ColumnDefaultConstraint::null_value())) - .unwrap(); - let v = column_schema.create_default_vector(5).unwrap().unwrap(); - assert_eq!(5, v.len()); - assert!(v.only_null()); - } - - #[test] - fn test_column_schema_no_default() { - let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), false); - assert!(column_schema.create_default_vector(5).unwrap().is_none()); - } + use crate::data_type::ConcreteDataType; #[test] fn test_build_empty_schema() { @@ -664,8 +392,12 @@ mod tests { fn test_schema_with_timestamp() { let column_schemas = vec![ ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true), - ColumnSchema::new("ts", ConcreteDataType::timestamp_millis_datatype(), false) - .with_time_index(true), + ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ) + .with_time_index(true), ]; let schema = SchemaBuilder::try_from(column_schemas.clone()) .unwrap() diff --git a/src/datatypes2/src/schema/column_schema.rs b/src/datatypes/src/schema/column_schema.rs similarity index 100% rename from src/datatypes2/src/schema/column_schema.rs rename to src/datatypes/src/schema/column_schema.rs diff --git a/src/datatypes/src/schema/constraint.rs b/src/datatypes/src/schema/constraint.rs index 3750fcebcfac..4dd3ecc14b7f 100644 --- a/src/datatypes/src/schema/constraint.rs +++ b/src/datatypes/src/schema/constraint.rs @@ -22,7 +22,7 @@ use snafu::{ensure, ResultExt}; use crate::data_type::{ConcreteDataType, DataType}; use crate::error::{self, Result}; use crate::value::Value; -use crate::vectors::{Int64Vector, TimestampVector, VectorRef}; +use crate::vectors::{Int64Vector, TimestampMillisecondVector, VectorRef}; const CURRENT_TIMESTAMP: &str = "current_timestamp()"; @@ -81,7 +81,7 @@ impl ColumnDefaultConstraint { error::UnsupportedDefaultExprSnafu { expr } ); ensure!( - data_type.is_timestamp(), + data_type.is_timestamp_compatible(), error::DefaultValueTypeSnafu { reason: "return value of the function must has timestamp type", } @@ -162,8 +162,10 @@ fn create_current_timestamp_vector( data_type: &ConcreteDataType, num_rows: usize, ) -> Result { + // FIXME(yingwen): We should implements cast in VectorOp so we could cast the millisecond vector + // to other data type and avoid this match. 
match data_type { - ConcreteDataType::Timestamp(_) => Ok(Arc::new(TimestampVector::from_values( + ConcreteDataType::Timestamp(_) => Ok(Arc::new(TimestampMillisecondVector::from_values( std::iter::repeat(util::current_time_millis()).take(num_rows), ))), ConcreteDataType::Int64(_) => Ok(Arc::new(Int64Vector::from_values( @@ -217,7 +219,7 @@ mod tests { fn test_validate_function_constraint() { let constraint = ColumnDefaultConstraint::Function(CURRENT_TIMESTAMP.to_string()); constraint - .validate(&ConcreteDataType::timestamp_millis_datatype(), false) + .validate(&ConcreteDataType::timestamp_millisecond_datatype(), false) .unwrap(); constraint .validate(&ConcreteDataType::boolean_datatype(), false) @@ -225,7 +227,7 @@ mod tests { let constraint = ColumnDefaultConstraint::Function("hello()".to_string()); constraint - .validate(&ConcreteDataType::timestamp_millis_datatype(), false) + .validate(&ConcreteDataType::timestamp_millisecond_datatype(), false) .unwrap_err(); } @@ -262,7 +264,7 @@ mod tests { fn test_create_default_vector_by_func() { let constraint = ColumnDefaultConstraint::Function(CURRENT_TIMESTAMP.to_string()); // Timestamp type. - let data_type = ConcreteDataType::timestamp_millis_datatype(); + let data_type = ConcreteDataType::timestamp_millisecond_datatype(); let v = constraint .create_default_vector(&data_type, false, 4) .unwrap(); @@ -286,7 +288,7 @@ mod tests { ); let constraint = ColumnDefaultConstraint::Function("no".to_string()); - let data_type = ConcreteDataType::timestamp_millis_datatype(); + let data_type = ConcreteDataType::timestamp_millisecond_datatype(); constraint .create_default_vector(&data_type, false, 4) .unwrap_err(); diff --git a/src/datatypes/src/schema/raw.rs b/src/datatypes/src/schema/raw.rs index f415a1ab85c1..75f0853b4b74 100644 --- a/src/datatypes/src/schema/raw.rs +++ b/src/datatypes/src/schema/raw.rs @@ -20,7 +20,7 @@ use crate::schema::{ColumnSchema, Schema, SchemaBuilder}; /// Struct used to serialize and deserialize [`Schema`](crate::schema::Schema). /// /// This struct only contains necessary data to recover the Schema. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct RawSchema { pub column_schemas: Vec, pub timestamp_index: Option, @@ -56,8 +56,12 @@ mod tests { fn test_raw_convert() { let column_schemas = vec![ ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true), - ColumnSchema::new("ts", ConcreteDataType::timestamp_millis_datatype(), false) - .with_time_index(true), + ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ) + .with_time_index(true), ]; let schema = SchemaBuilder::try_from(column_schemas) .unwrap() diff --git a/src/datatypes2/src/timestamp.rs b/src/datatypes/src/timestamp.rs similarity index 89% rename from src/datatypes2/src/timestamp.rs rename to src/datatypes/src/timestamp.rs index f14e91a6c614..82d68ae66234 100644 --- a/src/datatypes2/src/timestamp.rs +++ b/src/datatypes/src/timestamp.rs @@ -104,6 +104,12 @@ macro_rules! 
define_timestamp_with_unit { []::from_native(val) } } + + impl From<[]> for i64{ + fn from(val: []) -> Self { + val.0.value() + } + } } }; } @@ -117,6 +123,18 @@ define_timestamp_with_unit!(Nanosecond); mod tests { use super::*; + #[test] + fn test_to_serde_json_value() { + let ts = TimestampSecond::new(123); + let val = serde_json::Value::from(ts); + match val { + serde_json::Value::String(s) => { + assert_eq!("1970-01-01 00:02:03+0000", s); + } + _ => unreachable!(), + } + } + #[test] fn test_timestamp_scalar() { let ts = TimestampSecond::new(123); diff --git a/src/datatypes/src/type_id.rs b/src/datatypes/src/type_id.rs index fa11430dec99..bcb7ea52b129 100644 --- a/src/datatypes/src/type_id.rs +++ b/src/datatypes/src/type_id.rs @@ -42,7 +42,10 @@ pub enum LogicalTypeId { /// seconds/milliseconds/microseconds/nanoseconds, determined by precision. DateTime, - Timestamp, + TimestampSecond, + TimestampMillisecond, + TimestampMicrosecond, + TimestampNanosecond, List, } @@ -74,7 +77,14 @@ impl LogicalTypeId { LogicalTypeId::Binary => ConcreteDataType::binary_datatype(), LogicalTypeId::Date => ConcreteDataType::date_datatype(), LogicalTypeId::DateTime => ConcreteDataType::datetime_datatype(), - LogicalTypeId::Timestamp => ConcreteDataType::timestamp_millis_datatype(), // to timestamp type with default time unit + LogicalTypeId::TimestampSecond => ConcreteDataType::timestamp_second_datatype(), + LogicalTypeId::TimestampMillisecond => { + ConcreteDataType::timestamp_millisecond_datatype() + } + LogicalTypeId::TimestampMicrosecond => { + ConcreteDataType::timestamp_microsecond_datatype() + } + LogicalTypeId::TimestampNanosecond => ConcreteDataType::timestamp_nanosecond_datatype(), LogicalTypeId::List => { ConcreteDataType::list_datatype(ConcreteDataType::null_datatype()) } diff --git a/src/datatypes/src/types.rs b/src/datatypes/src/types.rs index aabeb59db350..8f40c563defe 100644 --- a/src/datatypes/src/types.rs +++ b/src/datatypes/src/types.rs @@ -14,25 +14,27 @@ mod binary_type; mod boolean_type; -mod date; -mod datetime; +mod date_type; +mod datetime_type; mod list_type; mod null_type; -mod primitive_traits; mod primitive_type; mod string_type; -mod timestamp; + +mod timestamp_type; pub use binary_type::BinaryType; pub use boolean_type::BooleanType; -pub use date::DateType; -pub use datetime::DateTimeType; +pub use date_type::DateType; +pub use datetime_type::DateTimeType; pub use list_type::ListType; pub use null_type::NullType; -pub use primitive_traits::{OrdPrimitive, Primitive}; pub use primitive_type::{ - Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, PrimitiveElement, - PrimitiveType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, + Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, LogicalPrimitiveType, + NativeType, OrdPrimitive, UInt16Type, UInt32Type, UInt64Type, UInt8Type, WrapperType, }; pub use string_type::StringType; -pub use timestamp::TimestampType; +pub use timestamp_type::{ + TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, + TimestampSecondType, TimestampType, +}; diff --git a/src/datatypes/src/types/binary_type.rs b/src/datatypes/src/types/binary_type.rs index 13922ff06310..0d06724fffb4 100644 --- a/src/datatypes/src/types/binary_type.rs +++ b/src/datatypes/src/types/binary_type.rs @@ -53,4 +53,8 @@ impl DataType for BinaryType { fn create_mutable_vector(&self, capacity: usize) -> Box { Box::new(BinaryVectorBuilder::with_capacity(capacity)) } + + fn is_timestamp_compatible(&self) -> bool { + false + } } 
diff --git a/src/datatypes/src/types/boolean_type.rs b/src/datatypes/src/types/boolean_type.rs index 4566f1d82630..36d92169eb01 100644 --- a/src/datatypes/src/types/boolean_type.rs +++ b/src/datatypes/src/types/boolean_type.rs @@ -52,4 +52,8 @@ impl DataType for BooleanType { fn create_mutable_vector(&self, capacity: usize) -> Box { Box::new(BooleanVectorBuilder::with_capacity(capacity)) } + + fn is_timestamp_compatible(&self) -> bool { + false + } } diff --git a/src/datatypes/src/types/date.rs b/src/datatypes/src/types/date.rs deleted file mode 100644 index 8d2cca12fa3f..000000000000 --- a/src/datatypes/src/types/date.rs +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::sync::Arc; - -use arrow::datatypes::DataType as ArrowDataType; -use serde::{Deserialize, Serialize}; - -use crate::data_type::DataType; -use crate::prelude::{DataTypeRef, LogicalTypeId, Value}; -use crate::scalars::ScalarVectorBuilder; -use crate::vectors::{DateVectorBuilder, MutableVector}; - -#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct DateType; - -impl DataType for DateType { - fn name(&self) -> &str { - "Date" - } - - fn logical_type_id(&self) -> LogicalTypeId { - LogicalTypeId::Date - } - - fn default_value(&self) -> Value { - Value::Date(Default::default()) - } - - fn as_arrow_type(&self) -> ArrowDataType { - ArrowDataType::Date32 - } - - fn create_mutable_vector(&self, capacity: usize) -> Box { - Box::new(DateVectorBuilder::with_capacity(capacity)) - } -} - -impl DateType { - pub fn arc() -> DataTypeRef { - Arc::new(Self) - } -} diff --git a/src/datatypes2/src/types/date_type.rs b/src/datatypes/src/types/date_type.rs similarity index 98% rename from src/datatypes2/src/types/date_type.rs rename to src/datatypes/src/types/date_type.rs index 052b837a3d58..afd482359d71 100644 --- a/src/datatypes2/src/types/date_type.rs +++ b/src/datatypes/src/types/date_type.rs @@ -59,6 +59,7 @@ impl LogicalPrimitiveType for DateType { type ArrowPrimitive = Date32Type; type Native = i32; type Wrapper = Date; + type LargestType = Self; fn build_data_type() -> ConcreteDataType { ConcreteDataType::date_datatype() diff --git a/src/datatypes/src/types/datetime.rs b/src/datatypes/src/types/datetime.rs deleted file mode 100644 index 6166c73f37d5..000000000000 --- a/src/datatypes/src/types/datetime.rs +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::sync::Arc; - -use arrow::datatypes::DataType as ArrowDataType; -use serde::{Deserialize, Serialize}; - -use crate::data_type::{DataType, DataTypeRef}; -use crate::prelude::{LogicalTypeId, Value}; -use crate::scalars::ScalarVectorBuilder; -use crate::vectors::{DateTimeVectorBuilder, MutableVector}; - -#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct DateTimeType; - -const DATE_TIME_TYPE_NAME: &str = "DateTime"; - -/// [DateTimeType] represents the seconds elapsed since UNIX EPOCH. -impl DataType for DateTimeType { - fn name(&self) -> &str { - DATE_TIME_TYPE_NAME - } - - fn logical_type_id(&self) -> LogicalTypeId { - LogicalTypeId::DateTime - } - - fn default_value(&self) -> Value { - Value::DateTime(Default::default()) - } - - fn as_arrow_type(&self) -> ArrowDataType { - ArrowDataType::Date64 - } - - fn create_mutable_vector(&self, capacity: usize) -> Box { - Box::new(DateTimeVectorBuilder::with_capacity(capacity)) - } -} - -impl DateTimeType { - pub fn arc() -> DataTypeRef { - Arc::new(Self) - } - - pub fn name() -> &'static str { - DATE_TIME_TYPE_NAME - } -} diff --git a/src/datatypes2/src/types/datetime_type.rs b/src/datatypes/src/types/datetime_type.rs similarity index 98% rename from src/datatypes2/src/types/datetime_type.rs rename to src/datatypes/src/types/datetime_type.rs index d74a02effe4f..ccd810eee746 100644 --- a/src/datatypes2/src/types/datetime_type.rs +++ b/src/datatypes/src/types/datetime_type.rs @@ -57,6 +57,7 @@ impl LogicalPrimitiveType for DateTimeType { type ArrowPrimitive = Date64Type; type Native = i64; type Wrapper = DateTime; + type LargestType = Self; fn build_data_type() -> ConcreteDataType { ConcreteDataType::datetime_datatype() diff --git a/src/datatypes/src/types/list_type.rs b/src/datatypes/src/types/list_type.rs index 1ada1090111f..3c8535810d6c 100644 --- a/src/datatypes/src/types/list_type.rs +++ b/src/datatypes/src/types/list_type.rs @@ -15,15 +15,17 @@ use arrow::datatypes::{DataType as ArrowDataType, Field}; use serde::{Deserialize, Serialize}; -use crate::prelude::*; -use crate::value::ListValue; +use crate::data_type::{ConcreteDataType, DataType}; +use crate::type_id::LogicalTypeId; +use crate::value::{ListValue, Value}; use crate::vectors::{ListVectorBuilder, MutableVector}; /// Used to represent the List datatype. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct ListType { - /// The type of List's inner data. - inner: Box, + /// The type of List's item. + // Use Box to avoid recursive dependency, as enum ConcreteDataType depends on ListType. + item_type: Box, } impl Default for ListType { @@ -33,11 +35,18 @@ impl Default for ListType { } impl ListType { - pub fn new(datatype: ConcreteDataType) -> Self { + /// Create a new `ListType` whose item's data type is `item_type`. + pub fn new(item_type: ConcreteDataType) -> Self { ListType { - inner: Box::new(datatype), + item_type: Box::new(item_type), } } + + /// Returns the item data type. 
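+ /// For example, `ListType::new(ConcreteDataType::boolean_datatype()).item_type()`
+ /// yields the boolean datatype (illustrative; mirrors the test at the end of this hunk).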
+ #[inline] + pub fn item_type(&self) -> &ConcreteDataType { + &self.item_type + } } impl DataType for ListType { @@ -50,20 +59,24 @@ impl DataType for ListType { } fn default_value(&self) -> Value { - Value::List(ListValue::new(None, *self.inner.clone())) + Value::List(ListValue::new(None, *self.item_type.clone())) } fn as_arrow_type(&self) -> ArrowDataType { - let field = Box::new(Field::new("item", self.inner.as_arrow_type(), true)); + let field = Box::new(Field::new("item", self.item_type.as_arrow_type(), true)); ArrowDataType::List(field) } fn create_mutable_vector(&self, capacity: usize) -> Box { Box::new(ListVectorBuilder::with_type_capacity( - *self.inner.clone(), + *self.item_type.clone(), capacity, )) } + + fn is_timestamp_compatible(&self) -> bool { + false + } } #[cfg(test)] @@ -84,5 +97,6 @@ mod tests { ArrowDataType::List(Box::new(Field::new("item", ArrowDataType::Boolean, true))), t.as_arrow_type() ); + assert_eq!(ConcreteDataType::boolean_datatype(), *t.item_type()); } } diff --git a/src/datatypes/src/types/null_type.rs b/src/datatypes/src/types/null_type.rs index a0b027dd1492..b9bb2dc7526d 100644 --- a/src/datatypes/src/types/null_type.rs +++ b/src/datatypes/src/types/null_type.rs @@ -27,7 +27,7 @@ pub struct NullType; impl NullType { pub fn arc() -> DataTypeRef { - Arc::new(Self) + Arc::new(NullType) } } @@ -51,4 +51,8 @@ impl DataType for NullType { fn create_mutable_vector(&self, _capacity: usize) -> Box { Box::new(NullVectorBuilder::default()) } + + fn is_timestamp_compatible(&self) -> bool { + false + } } diff --git a/src/datatypes/src/types/primitive_traits.rs b/src/datatypes/src/types/primitive_traits.rs deleted file mode 100644 index e900ba217e15..000000000000 --- a/src/datatypes/src/types/primitive_traits.rs +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::cmp::Ordering; - -use arrow::compute::arithmetics::basic::NativeArithmetics; -use arrow::types::NativeType; -use num::NumCast; - -use crate::prelude::Scalar; -use crate::value::{IntoValueRef, Value}; - -/// Primitive type. -pub trait Primitive: - PartialOrd - + Default - + Clone - + Copy - + Into - + IntoValueRef<'static> - + NativeType - + serde::Serialize - + NativeArithmetics - + NumCast - + Scalar -{ - /// Largest numeric type this primitive type can be cast to. - type LargestType: Primitive; -} - -macro_rules! impl_primitive { - ($Type:ident, $LargestType: ident) => { - impl Primitive for $Type { - type LargestType = $LargestType; - } - }; -} - -impl_primitive!(u8, u64); -impl_primitive!(u16, u64); -impl_primitive!(u32, u64); -impl_primitive!(u64, u64); -impl_primitive!(i8, i64); -impl_primitive!(i16, i64); -impl_primitive!(i32, i64); -impl_primitive!(i64, i64); -impl_primitive!(f32, f64); -impl_primitive!(f64, f64); - -/// A new type for [Primitive], complement the `Ord` feature for it. Wrapping not ordered -/// primitive types like `f32` and `f64` in `OrdPrimitive` can make them be used in places that -/// require `Ord`. 
For example, in `Median` or `Percentile` UDAFs. -#[derive(Debug, Clone, Copy, PartialEq)] -pub struct OrdPrimitive(pub T); - -impl OrdPrimitive { - pub fn as_primitive(&self) -> T { - self.0 - } -} - -impl Eq for OrdPrimitive {} - -impl PartialOrd for OrdPrimitive { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl Ord for OrdPrimitive { - fn cmp(&self, other: &Self) -> Ordering { - self.0.into().cmp(&other.0.into()) - } -} - -impl From> for Value { - fn from(p: OrdPrimitive) -> Self { - p.0.into() - } -} - -#[cfg(test)] -mod tests { - use std::collections::BinaryHeap; - - use super::*; - - #[test] - fn test_ord_primitive() { - struct Foo - where - T: Primitive, - { - heap: BinaryHeap>, - } - - impl Foo - where - T: Primitive, - { - fn push(&mut self, value: T) { - let value = OrdPrimitive::(value); - self.heap.push(value); - } - } - - macro_rules! test { - ($Type:ident) => { - let mut foo = Foo::<$Type> { - heap: BinaryHeap::new(), - }; - foo.push($Type::default()); - }; - } - - test!(u8); - test!(u16); - test!(u32); - test!(u64); - test!(i8); - test!(i16); - test!(i32); - test!(i64); - test!(f32); - test!(f64); - } -} diff --git a/src/datatypes/src/types/primitive_type.rs b/src/datatypes/src/types/primitive_type.rs index b9f07ce82c83..ea752cf8debb 100644 --- a/src/datatypes/src/types/primitive_type.rs +++ b/src/datatypes/src/types/primitive_type.rs @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::any::TypeId; -use std::marker::PhantomData; +use std::cmp::Ordering; +use std::fmt; -use arrow::array::PrimitiveArray; -use arrow::datatypes::DataType as ArrowDataType; -use paste::paste; +use arrow::datatypes::{ArrowNativeType, ArrowPrimitiveType, DataType as ArrowDataType}; +use common_time::{Date, DateTime}; +use num::NumCast; use serde::{Deserialize, Serialize}; use snafu::OptionExt; @@ -25,92 +25,227 @@ use crate::data_type::{ConcreteDataType, DataType}; use crate::error::{self, Result}; use crate::scalars::{Scalar, ScalarRef, ScalarVectorBuilder}; use crate::type_id::LogicalTypeId; -use crate::types::primitive_traits::Primitive; +use crate::types::{DateTimeType, DateType}; use crate::value::{Value, ValueRef}; use crate::vectors::{MutableVector, PrimitiveVector, PrimitiveVectorBuilder, Vector}; -#[derive(Clone, Serialize, Deserialize)] -pub struct PrimitiveType { - #[serde(skip)] - _phantom: PhantomData, -} +/// Data types that can be used as arrow's native type. +pub trait NativeType: ArrowNativeType + NumCast {} -impl PartialEq> for PrimitiveType { - fn eq(&self, _other: &PrimitiveType) -> bool { - TypeId::of::() == TypeId::of::() - } +macro_rules! impl_native_type { + ($Type: ident) => { + impl NativeType for $Type {} + }; } -impl Eq for PrimitiveType {} +impl_native_type!(u8); +impl_native_type!(u16); +impl_native_type!(u32); +impl_native_type!(u64); +impl_native_type!(i8); +impl_native_type!(i16); +impl_native_type!(i32); +impl_native_type!(i64); +impl_native_type!(f32); +impl_native_type!(f64); -/// A trait that provide helper methods for a primitive type to implementing the [PrimitiveVector]. -pub trait PrimitiveElement -where - for<'a> Self: Primitive - + Scalar> - + ScalarRef<'a, ScalarType = Self, VectorType = PrimitiveVector> - + Scalar = Self>, +/// Represents the wrapper type that wraps a native type using the `newtype pattern`, +/// such as [Date](`common_time::Date`) is a wrapper type for the underlying native +/// type `i32`. 
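+///
+/// For example (illustrative usage, mirroring the `Date` impl further below):
+/// `Date::from_native(7)` wraps the native `i32` day offset, and
+/// `date.into_native()` returns that `i32` back unchanged.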
+pub trait WrapperType: + Copy + + Send + + Sync + + fmt::Debug + + for<'a> Scalar = Self> + + PartialEq + + Into + + Into> + + Serialize + + Into { + /// Logical primitive type that this wrapper type belongs to. + type LogicalType: LogicalPrimitiveType; + /// The underlying native type. + type Native: NativeType; + + /// Convert native type into this wrapper type. + fn from_native(value: Self::Native) -> Self; + + /// Convert this wrapper type into native type. + fn into_native(self) -> Self::Native; +} + +/// Trait bridging the logical primitive type with [ArrowPrimitiveType]. +pub trait LogicalPrimitiveType: 'static + Sized { + /// Arrow primitive type of this logical type. + type ArrowPrimitive: ArrowPrimitiveType; + /// Native (physical) type of this logical type. + type Native: NativeType; + /// Wrapper type that the vector returns. + type Wrapper: WrapperType + + for<'a> Scalar, RefType<'a> = Self::Wrapper> + + for<'a> ScalarRef<'a, ScalarType = Self::Wrapper>; + /// Largest type this primitive type can cast to. + type LargestType: LogicalPrimitiveType; + /// Construct the data type struct. fn build_data_type() -> ConcreteDataType; - /// Returns the name of the type id. - fn type_name() -> String; + /// Return the name of the type. + fn type_name() -> &'static str; /// Dynamic cast the vector to the concrete vector type. - fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveArray>; + fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveVector>; /// Cast value ref to the primitive type. - fn cast_value_ref(value: ValueRef) -> Result>; + fn cast_value_ref(value: ValueRef) -> Result>; } -macro_rules! impl_primitive_element { - ($Type:ident, $TypeId:ident) => { - paste::paste! { - impl PrimitiveElement for $Type { - fn build_data_type() -> ConcreteDataType { - ConcreteDataType::$TypeId(PrimitiveType::<$Type>::default()) - } +/// A new type for [WrapperType], complementing the `Ord` feature for it. Wrapping non-ordered +/// primitive types like `f32` and `f64` in `OrdPrimitive` can make them usable in places that +/// require `Ord`. For example, in `Median` or `Percentile` UDAFs. +#[derive(Debug, Clone, Copy, PartialEq)] +pub struct OrdPrimitive(pub T); - fn type_name() -> String { - stringify!($TypeId).to_string() - } +impl OrdPrimitive { + pub fn as_primitive(&self) -> T::Native { + self.0.into_native() + } +} - fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveArray<$Type>> { - let primitive_vector = vector - .as_any() - .downcast_ref::>() - .with_context(|| error::CastTypeSnafu { - msg: format!( - "Failed to cast {} to vector of primitive type {}", - vector.vector_type_name(), - stringify!($TypeId) - ), - })?; - Ok(&primitive_vector.array) - } +impl Eq for OrdPrimitive {} + +impl PartialOrd for OrdPrimitive { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} - fn cast_value_ref(value: ValueRef) -> Result> { - match value { - ValueRef::Null => Ok(None), - ValueRef::$TypeId(v) => Ok(Some(v.into())), - other => error::CastTypeSnafu { - msg: format!( - "Failed to cast value {:?} to primitive type {}", - other, - stringify!($TypeId), - ), - }.fail(), +impl Ord for OrdPrimitive { + fn cmp(&self, other: &Self) -> Ordering { + Into::::into(self.0).cmp(&Into::::into(other.0)) + } +} + +impl From> for Value { + fn from(p: OrdPrimitive) -> Self { + p.0.into() + } +} + +macro_rules!
impl_wrapper { + ($Type: ident, $LogicalType: ident) => { + impl WrapperType for $Type { + type LogicalType = $LogicalType; + type Native = $Type; + + fn from_native(value: Self::Native) -> Self { + value + } + + fn into_native(self) -> Self::Native { + self + } + } + }; +} + +impl_wrapper!(u8, UInt8Type); +impl_wrapper!(u16, UInt16Type); +impl_wrapper!(u32, UInt32Type); +impl_wrapper!(u64, UInt64Type); +impl_wrapper!(i8, Int8Type); +impl_wrapper!(i16, Int16Type); +impl_wrapper!(i32, Int32Type); +impl_wrapper!(i64, Int64Type); +impl_wrapper!(f32, Float32Type); +impl_wrapper!(f64, Float64Type); + +impl WrapperType for Date { + type LogicalType = DateType; + type Native = i32; + + fn from_native(value: i32) -> Self { + Date::new(value) + } + + fn into_native(self) -> i32 { + self.val() + } +} + +impl WrapperType for DateTime { + type LogicalType = DateTimeType; + type Native = i64; + + fn from_native(value: Self::Native) -> Self { + DateTime::new(value) + } + + fn into_native(self) -> Self::Native { + self.val() + } +} + +macro_rules! define_logical_primitive_type { + ($Native: ident, $TypeId: ident, $DataType: ident, $Largest: ident) => { + // We need to define it as an empty struct `struct DataType {}` instead of a unit struct + // `struct DataType;` to ensure the serialized JSON string is compatible with the previous + // implementation. + #[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] + pub struct $DataType {} + + impl LogicalPrimitiveType for $DataType { + type ArrowPrimitive = arrow::datatypes::$DataType; + type Native = $Native; + type Wrapper = $Native; + type LargestType = $Largest; + + fn build_data_type() -> ConcreteDataType { + ConcreteDataType::$TypeId($DataType::default()) + } + + fn type_name() -> &'static str { + stringify!($TypeId) + } + + fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveVector<$DataType>> { + vector + .as_any() + .downcast_ref::>() + .with_context(|| error::CastTypeSnafu { + msg: format!( + "Failed to cast {} to vector of primitive type {}", + vector.vector_type_name(), + stringify!($TypeId) + ), + }) + } + + fn cast_value_ref(value: ValueRef) -> Result> { + match value { + ValueRef::Null => Ok(None), + ValueRef::$TypeId(v) => Ok(Some(v.into())), + other => error::CastTypeSnafu { + msg: format!( + "Failed to cast value {:?} to primitive type {}", + other, + stringify!($TypeId), + ), } + .fail(), } } } }; } -macro_rules! impl_numeric { - ($Type:ident, $TypeId:ident) => { - impl DataType for PrimitiveType<$Type> { fn name(&self) -> &str { stringify!($TypeId) } @@ -120,7 +255,7 @@ macro_rules! impl_numeric { } fn default_value(&self) -> Value { - $Type::default().into() + $Native::default().into() } fn as_arrow_type(&self) -> ArrowDataType { @@ -128,61 +263,99 @@ macro_rules!
impl_numeric { } fn create_mutable_vector(&self, capacity: usize) -> Box { - Box::new(PrimitiveVectorBuilder::<$Type>::with_capacity(capacity)) + Box::new(PrimitiveVectorBuilder::<$DataType>::with_capacity(capacity)) } - } - impl std::fmt::Debug for PrimitiveType<$Type> { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "{}", self.name()) + fn is_timestamp_compatible(&self) -> bool { + false } } + }; +} - impl Default for PrimitiveType<$Type> { - fn default() -> Self { - Self { - _phantom: PhantomData, - } - } - } +define_non_timestamp_primitive!(u8, UInt8, UInt8Type, UInt64Type); +define_non_timestamp_primitive!(u16, UInt16, UInt16Type, UInt64Type); +define_non_timestamp_primitive!(u32, UInt32, UInt32Type, UInt64Type); +define_non_timestamp_primitive!(u64, UInt64, UInt64Type, UInt64Type); +define_non_timestamp_primitive!(i8, Int8, Int8Type, Int64Type); +define_non_timestamp_primitive!(i16, Int16, Int16Type, Int64Type); +define_non_timestamp_primitive!(i32, Int32, Int32Type, Int64Type); +define_non_timestamp_primitive!(f32, Float32, Float32Type, Float64Type); +define_non_timestamp_primitive!(f64, Float64, Float64Type, Float64Type); - impl_primitive_element!($Type, $TypeId); +// Timestamp primitive: +define_logical_primitive_type!(i64, Int64, Int64Type, Int64Type); - paste! { - pub type [<$TypeId Type>]=PrimitiveType<$Type>; - } - }; -} +impl DataType for Int64Type { + fn name(&self) -> &str { + "Int64" + } + + fn logical_type_id(&self) -> LogicalTypeId { + LogicalTypeId::Int64 + } + + fn default_value(&self) -> Value { + Value::Int64(0) + } + + fn as_arrow_type(&self) -> ArrowDataType { + ArrowDataType::Int64 + } + + fn create_mutable_vector(&self, capacity: usize) -> Box { + Box::new(PrimitiveVectorBuilder::::with_capacity(capacity)) + } -impl_numeric!(u8, UInt8); -impl_numeric!(u16, UInt16); -impl_numeric!(u32, UInt32); -impl_numeric!(u64, UInt64); -impl_numeric!(i8, Int8); -impl_numeric!(i16, Int16); -impl_numeric!(i32, Int32); -impl_numeric!(i64, Int64); -impl_numeric!(f32, Float32); -impl_numeric!(f64, Float64); + fn is_timestamp_compatible(&self) -> bool { + true + } +} #[cfg(test)] mod tests { + use std::collections::BinaryHeap; + use super::*; #[test] - fn test_eq() { - assert_eq!(UInt8Type::default(), UInt8Type::default()); - assert_eq!(UInt16Type::default(), UInt16Type::default()); - assert_eq!(UInt32Type::default(), UInt32Type::default()); - assert_eq!(UInt64Type::default(), UInt64Type::default()); - assert_eq!(Int8Type::default(), Int8Type::default()); - assert_eq!(Int16Type::default(), Int16Type::default()); - assert_eq!(Int32Type::default(), Int32Type::default()); - assert_eq!(Int64Type::default(), Int64Type::default()); - assert_eq!(Float32Type::default(), Float32Type::default()); - assert_eq!(Float64Type::default(), Float64Type::default()); - - assert_ne!(Float32Type::default(), Float64Type::default()); - assert_ne!(Float32Type::default(), Int32Type::default()); + fn test_ord_primitive() { + struct Foo + where + T: WrapperType, + { + heap: BinaryHeap>, + } + + impl Foo + where + T: WrapperType, + { + fn push(&mut self, value: T) { + let value = OrdPrimitive::(value); + self.heap.push(value); + } + } + + macro_rules! 
test { + ($Type:ident) => { + let mut foo = Foo::<$Type> { + heap: BinaryHeap::new(), + }; + foo.push($Type::default()); + assert_eq!($Type::default(), foo.heap.pop().unwrap().as_primitive()); + }; + } + + test!(u8); + test!(u16); + test!(u32); + test!(u64); + test!(i8); + test!(i16); + test!(i32); + test!(i64); + test!(f32); + test!(f64); } } diff --git a/src/datatypes/src/types/string_type.rs b/src/datatypes/src/types/string_type.rs index 736a3faac942..799cbbbdd345 100644 --- a/src/datatypes/src/types/string_type.rs +++ b/src/datatypes/src/types/string_type.rs @@ -18,9 +18,10 @@ use arrow::datatypes::DataType as ArrowDataType; use common_base::bytes::StringBytes; use serde::{Deserialize, Serialize}; -use crate::data_type::DataType; -use crate::prelude::{DataTypeRef, LogicalTypeId, Value}; -use crate::scalars::ScalarVectorBuilder; +use crate::data_type::{DataType, DataTypeRef}; +use crate::prelude::ScalarVectorBuilder; +use crate::type_id::LogicalTypeId; +use crate::value::Value; use crate::vectors::{MutableVector, StringVectorBuilder}; #[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -52,4 +53,8 @@ impl DataType for StringType { fn create_mutable_vector(&self, capacity: usize) -> Box { Box::new(StringVectorBuilder::with_capacity(capacity)) } + + fn is_timestamp_compatible(&self) -> bool { + false + } } diff --git a/src/datatypes/src/types/timestamp.rs b/src/datatypes/src/types/timestamp.rs deleted file mode 100644 index b80d16a64f3b..000000000000 --- a/src/datatypes/src/types/timestamp.rs +++ /dev/null @@ -1,125 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use arrow::datatypes::{DataType as ArrowDataType, TimeUnit as ArrowTimeUnit}; -use common_time::timestamp::{TimeUnit, Timestamp}; -use serde::{Deserialize, Serialize}; - -use crate::data_type::DataType; -use crate::prelude::{LogicalTypeId, MutableVector, ScalarVectorBuilder, Value}; -use crate::vectors::TimestampVectorBuilder; - -#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct TimestampType { - pub unit: TimeUnit, -} - -impl TimestampType { - pub fn new(unit: TimeUnit) -> Self { - Self { unit } - } -} - -impl DataType for TimestampType { - fn name(&self) -> &str { - "Timestamp" - } - - fn logical_type_id(&self) -> LogicalTypeId { - LogicalTypeId::Timestamp - } - - fn default_value(&self) -> Value { - Value::Timestamp(Timestamp::new(0, self.unit)) - } - - fn as_arrow_type(&self) -> ArrowDataType { - match self.unit { - TimeUnit::Second => ArrowDataType::Timestamp(ArrowTimeUnit::Second, None), - TimeUnit::Millisecond => ArrowDataType::Timestamp(ArrowTimeUnit::Millisecond, None), - TimeUnit::Microsecond => ArrowDataType::Timestamp(ArrowTimeUnit::Microsecond, None), - TimeUnit::Nanosecond => ArrowDataType::Timestamp(ArrowTimeUnit::Nanosecond, None), - } - } - - fn create_mutable_vector(&self, capacity: usize) -> Box { - Box::new(TimestampVectorBuilder::with_capacity(capacity)) - } -} - -#[cfg(test)] -mod tests { - use arrow::datatypes::TimeUnit as ArrowTimeUnit; - use common_time::timestamp::TimeUnit::Microsecond; - - use super::*; - use crate::prelude::{ConcreteDataType, ValueRef}; - - #[test] - pub fn test_timestamp_type() { - assert_eq!( - LogicalTypeId::Timestamp, - TimestampType::new(TimeUnit::Microsecond).logical_type_id() - ); - } - - #[test] - pub fn test_as_arrow_type() { - assert_eq!( - ArrowDataType::Timestamp(ArrowTimeUnit::Nanosecond, None), - TimestampType::new(TimeUnit::Nanosecond).as_arrow_type() - ); - assert_eq!( - ArrowDataType::Timestamp(ArrowTimeUnit::Microsecond, None), - TimestampType::new(TimeUnit::Microsecond).as_arrow_type() - ); - assert_eq!( - ArrowDataType::Timestamp(ArrowTimeUnit::Millisecond, None), - TimestampType::new(TimeUnit::Millisecond).as_arrow_type() - ); - assert_eq!( - ArrowDataType::Timestamp(ArrowTimeUnit::Second, None), - TimestampType::new(TimeUnit::Second).as_arrow_type() - ); - } - - #[test] - pub fn test_default_value() { - assert_eq!( - Value::Timestamp(Timestamp::new(0, Microsecond)), - TimestampType::new(TimeUnit::Microsecond).default_value() - ); - } - - #[test] - pub fn test_create_mutable_vector() { - let mut builder = TimestampType::new(TimeUnit::Microsecond).create_mutable_vector(10); - builder - .push_value_ref(ValueRef::Timestamp(Timestamp::new( - 42, - TimeUnit::Millisecond, - ))) - .unwrap(); - builder.push_value_ref(ValueRef::Null).unwrap(); - builder - .push_value_ref(ValueRef::Timestamp(Timestamp::new(96, TimeUnit::Second))) - .unwrap(); - let v = builder.to_vector(); - assert_eq!(ConcreteDataType::timestamp_millis_datatype(), v.data_type()); - assert_eq!(Value::Timestamp(Timestamp::from_millis(42)), v.get(0)); - assert_eq!(Value::Null, v.get(1)); - // Push a timestamp with different unit will convert the value to value with time unit millisecond. 
- assert_eq!(Value::Timestamp(Timestamp::from_millis(96_000)), v.get(2)); - } -} diff --git a/src/datatypes2/src/types/timestamp_type.rs b/src/datatypes/src/types/timestamp_type.rs similarity index 81% rename from src/datatypes2/src/types/timestamp_type.rs rename to src/datatypes/src/types/timestamp_type.rs index fe86eeb8fdbc..629d901cc835 100644 --- a/src/datatypes2/src/types/timestamp_type.rs +++ b/src/datatypes/src/types/timestamp_type.rs @@ -50,6 +50,18 @@ pub enum TimestampType { Nanosecond(TimestampNanosecondType), } +impl TimestampType { + /// Returns the [`TimeUnit`] of this type. + pub fn unit(&self) -> TimeUnit { + match self { + TimestampType::Second(_) => TimeUnit::Second, + TimestampType::Millisecond(_) => TimeUnit::Millisecond, + TimestampType::Microsecond(_) => TimeUnit::Microsecond, + TimestampType::Nanosecond(_) => TimeUnit::Nanosecond, + } + } +} + macro_rules! impl_data_type_for_timestamp { ($unit: ident) => { paste! { @@ -58,7 +70,7 @@ macro_rules! impl_data_type_for_timestamp { impl DataType for [] { fn name(&self) -> &str { - stringify!([]) + stringify!([]) } fn logical_type_id(&self) -> LogicalTypeId { @@ -82,11 +94,11 @@ macro_rules! impl_data_type_for_timestamp { } } - impl LogicalPrimitiveType for [] { type ArrowPrimitive = []; type Native = i64; type Wrapper = []; + type LargestType = Self; fn build_data_type() -> ConcreteDataType { ConcreteDataType::Timestamp(TimestampType::$unit( @@ -113,6 +125,9 @@ macro_rules! impl_data_type_for_timestamp { fn cast_value_ref(value: ValueRef) -> crate::Result> { match value { ValueRef::Null => Ok(None), + ValueRef::Int64(v) =>{ + Ok(Some([]::from(v))) + } ValueRef::Timestamp(t) => match t.unit() { TimeUnit::$unit => Ok(Some([](t))), other => error::CastTypeSnafu { @@ -138,3 +153,28 @@ impl_data_type_for_timestamp!(Nanosecond); impl_data_type_for_timestamp!(Second); impl_data_type_for_timestamp!(Millisecond); impl_data_type_for_timestamp!(Microsecond); + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_timestamp_type_unit() { + assert_eq!( + TimeUnit::Second, + TimestampType::Second(TimestampSecondType).unit() + ); + assert_eq!( + TimeUnit::Millisecond, + TimestampType::Millisecond(TimestampMillisecondType).unit() + ); + assert_eq!( + TimeUnit::Microsecond, + TimestampType::Microsecond(TimestampMicrosecondType).unit() + ); + assert_eq!( + TimeUnit::Nanosecond, + TimestampType::Nanosecond(TimestampNanosecondType).unit() + ); + } +} diff --git a/src/datatypes/src/value.rs b/src/datatypes/src/value.rs index d5e0ae3e9f06..457c774606d9 100644 --- a/src/datatypes/src/value.rs +++ b/src/datatypes/src/value.rs @@ -15,6 +15,7 @@ use std::cmp::Ordering; use std::fmt::{Display, Formatter}; +use arrow::datatypes::{DataType as ArrowDataType, Field}; use common_base::bytes::{Bytes, StringBytes}; use common_time::date::Date; use common_time::datetime::DateTime; @@ -22,10 +23,12 @@ use common_time::timestamp::{TimeUnit, Timestamp}; use datafusion_common::ScalarValue; pub use ordered_float::OrderedFloat; use serde::{Deserialize, Serialize}; +use snafu::ensure; use crate::error::{self, Result}; use crate::prelude::*; use crate::type_id::LogicalTypeId; +use crate::types::ListType; use crate::vectors::ListVector; pub type OrderedF32 = OrderedFloat; @@ -125,10 +128,10 @@ impl Value { Value::Float64(_) => ConcreteDataType::float64_datatype(), Value::String(_) => ConcreteDataType::string_datatype(), Value::Binary(_) => ConcreteDataType::binary_datatype(), - Value::List(list) => ConcreteDataType::list_datatype(list.datatype().clone()), 
Value::Date(_) => ConcreteDataType::date_datatype(), Value::DateTime(_) => ConcreteDataType::datetime_datatype(), Value::Timestamp(v) => ConcreteDataType::timestamp_datatype(v.unit()), + Value::List(list) => ConcreteDataType::list_datatype(list.datatype().clone()), } } @@ -193,11 +196,97 @@ impl Value { Value::List(_) => LogicalTypeId::List, Value::Date(_) => LogicalTypeId::Date, Value::DateTime(_) => LogicalTypeId::DateTime, - Value::Timestamp(_) => LogicalTypeId::Timestamp, + Value::Timestamp(t) => match t.unit() { + TimeUnit::Second => LogicalTypeId::TimestampSecond, + TimeUnit::Millisecond => LogicalTypeId::TimestampMillisecond, + TimeUnit::Microsecond => LogicalTypeId::TimestampMicrosecond, + TimeUnit::Nanosecond => LogicalTypeId::TimestampNanosecond, + }, + } + } + + /// Convert the value into [`ScalarValue`] according to the `output_type`. + pub fn try_to_scalar_value(&self, output_type: &ConcreteDataType) -> Result { + // Compare logical type, since value might not contain full type information. + let value_type_id = self.logical_type_id(); + let output_type_id = output_type.logical_type_id(); + ensure!( + output_type_id == value_type_id || self.is_null(), + error::ToScalarValueSnafu { + reason: format!( + "expect value to return output_type {:?}, actual: {:?}", + output_type_id, value_type_id, + ), + } + ); + + let scalar_value = match self { + Value::Boolean(v) => ScalarValue::Boolean(Some(*v)), + Value::UInt8(v) => ScalarValue::UInt8(Some(*v)), + Value::UInt16(v) => ScalarValue::UInt16(Some(*v)), + Value::UInt32(v) => ScalarValue::UInt32(Some(*v)), + Value::UInt64(v) => ScalarValue::UInt64(Some(*v)), + Value::Int8(v) => ScalarValue::Int8(Some(*v)), + Value::Int16(v) => ScalarValue::Int16(Some(*v)), + Value::Int32(v) => ScalarValue::Int32(Some(*v)), + Value::Int64(v) => ScalarValue::Int64(Some(*v)), + Value::Float32(v) => ScalarValue::Float32(Some(v.0)), + Value::Float64(v) => ScalarValue::Float64(Some(v.0)), + Value::String(v) => ScalarValue::Utf8(Some(v.as_utf8().to_string())), + Value::Binary(v) => ScalarValue::LargeBinary(Some(v.to_vec())), + Value::Date(v) => ScalarValue::Date32(Some(v.val())), + Value::DateTime(v) => ScalarValue::Date64(Some(v.val())), + Value::Null => to_null_value(output_type), + Value::List(list) => { + // Safety: The logical type of the value and output_type are the same. + let list_type = output_type.as_list().unwrap(); + list.try_to_scalar_value(list_type)?
+ } + Value::Timestamp(t) => timestamp_to_scalar_value(t.unit(), Some(t.value())), + }; + + Ok(scalar_value) + } +} + +fn to_null_value(output_type: &ConcreteDataType) -> ScalarValue { + match output_type { + ConcreteDataType::Null(_) => ScalarValue::Null, + ConcreteDataType::Boolean(_) => ScalarValue::Boolean(None), + ConcreteDataType::Int8(_) => ScalarValue::Int8(None), + ConcreteDataType::Int16(_) => ScalarValue::Int16(None), + ConcreteDataType::Int32(_) => ScalarValue::Int32(None), + ConcreteDataType::Int64(_) => ScalarValue::Int64(None), + ConcreteDataType::UInt8(_) => ScalarValue::UInt8(None), + ConcreteDataType::UInt16(_) => ScalarValue::UInt16(None), + ConcreteDataType::UInt32(_) => ScalarValue::UInt32(None), + ConcreteDataType::UInt64(_) => ScalarValue::UInt64(None), + ConcreteDataType::Float32(_) => ScalarValue::Float32(None), + ConcreteDataType::Float64(_) => ScalarValue::Float64(None), + ConcreteDataType::Binary(_) => ScalarValue::LargeBinary(None), + ConcreteDataType::String(_) => ScalarValue::Utf8(None), + ConcreteDataType::Date(_) => ScalarValue::Date32(None), + ConcreteDataType::DateTime(_) => ScalarValue::Date64(None), + ConcreteDataType::Timestamp(t) => timestamp_to_scalar_value(t.unit(), None), + ConcreteDataType::List(_) => { + ScalarValue::List(None, Box::new(new_item_field(output_type.as_arrow_type()))) } } } +fn new_item_field(data_type: ArrowDataType) -> Field { + Field::new("item", data_type, false) +} + +fn timestamp_to_scalar_value(unit: TimeUnit, val: Option) -> ScalarValue { + match unit { + TimeUnit::Second => ScalarValue::TimestampSecond(val, None), + TimeUnit::Millisecond => ScalarValue::TimestampMillisecond(val, None), + TimeUnit::Microsecond => ScalarValue::TimestampMicrosecond(val, None), + TimeUnit::Nanosecond => ScalarValue::TimestampNanosecond(val, None), + } +} + macro_rules! impl_ord_for_value_like { ($Type: ident, $left: ident, $right: ident) => { if $left.is_null() && !$right.is_null() { @@ -277,6 +366,9 @@ impl_value_from!(Float32, f32); impl_value_from!(Float64, f64); impl_value_from!(String, StringBytes); impl_value_from!(Binary, Bytes); +impl_value_from!(Date, Date); +impl_value_from!(DateTime, DateTime); +impl_value_from!(Timestamp, Timestamp); impl From for Value { fn from(string: String) -> Value { @@ -296,12 +388,6 @@ impl From> for Value { } } -impl From for Value { - fn from(v: Timestamp) -> Self { - Value::Timestamp(v) - } -} - impl From<&[u8]> for Value { fn from(bytes: &[u8]) -> Value { Value::Binary(bytes.into()) @@ -337,6 +423,7 @@ impl TryFrom for serde_json::Value { } } +// TODO(yingwen): Consider removing the `datatype` field from `ListValue`. /// List value. 
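/// For example (an illustrative construction, mirroring the tests below),
/// `ListValue::new(Some(Box::new(vec![Value::Int32(1), Value::Null])), ConcreteDataType::int32_datatype())`
/// holds a list of nullable 32-bit integers.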
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct ListValue { @@ -362,6 +449,24 @@ impl ListValue { pub fn datatype(&self) -> &ConcreteDataType { &self.datatype } + + fn try_to_scalar_value(&self, output_type: &ListType) -> Result { + let vs = if let Some(items) = self.items() { + Some( + items + .iter() + .map(|v| v.try_to_scalar_value(output_type.item_type())) + .collect::>>()?, + ) + } else { + None + }; + + Ok(ScalarValue::List( + vs, + Box::new(new_item_field(output_type.item_type().as_arrow_type())), + )) + } } impl Default for ListValue { @@ -391,6 +496,7 @@ impl TryFrom for Value { fn try_from(v: ScalarValue) -> Result { let v = match v { + ScalarValue::Null => Value::Null, ScalarValue::Boolean(b) => Value::from(b), ScalarValue::Float32(f) => Value::from(f), ScalarValue::Float64(f) => Value::from(f), @@ -405,8 +511,10 @@ impl TryFrom for Value { ScalarValue::Utf8(s) | ScalarValue::LargeUtf8(s) => { Value::from(s.map(StringBytes::from)) } - ScalarValue::Binary(b) | ScalarValue::LargeBinary(b) => Value::from(b.map(Bytes::from)), - ScalarValue::List(vs, t) => { + ScalarValue::Binary(b) + | ScalarValue::LargeBinary(b) + | ScalarValue::FixedSizeBinary(_, b) => Value::from(b.map(Bytes::from)), + ScalarValue::List(vs, field) => { let items = if let Some(vs) = vs { let vs = vs .into_iter() @@ -416,7 +524,7 @@ impl TryFrom for Value { } else { None }; - let datatype = t.as_ref().try_into()?; + let datatype = ConcreteDataType::try_from(field.data_type())?; Value::List(ListValue::new(items, datatype)) } ScalarValue::Date32(d) => d.map(|x| Value::Date(Date::new(x))).unwrap_or(Value::Null), @@ -435,7 +543,13 @@ impl TryFrom for Value { ScalarValue::TimestampNanosecond(t, _) => t .map(|x| Value::Timestamp(Timestamp::new(x, TimeUnit::Nanosecond))) .unwrap_or(Value::Null), - _ => { + ScalarValue::Decimal128(_, _, _) + | ScalarValue::Time64(_) + | ScalarValue::IntervalYearMonth(_) + | ScalarValue::IntervalDayTime(_) + | ScalarValue::IntervalMonthDayNano(_) + | ScalarValue::Struct(_, _) + | ScalarValue::Dictionary(_, _) => { return error::UnsupportedArrowTypeSnafu { arrow_type: v.get_datatype(), } @@ -545,15 +659,6 @@ impl<'a> Ord for ValueRef<'a> { } } -/// A helper trait to convert copyable types to `ValueRef`. -/// -/// It could replace the usage of `Into>`, thus avoid confusion between `Into` -/// and `Into>` in generic codes. One typical usage is the [`Primitive`](crate::primitive_traits::Primitive) trait. -pub trait IntoValueRef<'a> { - /// Convert itself to [ValueRef]. - fn into_value_ref(self) -> ValueRef<'a>; -} - macro_rules! impl_value_ref_from { ($Variant:ident, $Type:ident) => { impl From<$Type> for ValueRef<'_> { @@ -562,12 +667,6 @@ macro_rules! impl_value_ref_from { } } - impl<'a> IntoValueRef<'a> for $Type { - fn into_value_ref(self) -> ValueRef<'a> { - ValueRef::$Variant(self.into()) - } - } - impl From> for ValueRef<'_> { fn from(value: Option<$Type>) -> Self { match value { @@ -576,15 +675,6 @@ macro_rules! 
impl_value_ref_from { } } } - - impl<'a> IntoValueRef<'a> for Option<$Type> { - fn into_value_ref(self) -> ValueRef<'a> { - match self { - Some(v) => ValueRef::$Variant(v.into()), - None => ValueRef::Null, - } - } - } }; } @@ -599,6 +689,9 @@ impl_value_ref_from!(Int32, i32); impl_value_ref_from!(Int64, i64); impl_value_ref_from!(Float32, f32); impl_value_ref_from!(Float64, f64); +impl_value_ref_from!(Date, Date); +impl_value_ref_from!(DateTime, DateTime); +impl_value_ref_from!(Timestamp, Timestamp); impl<'a> From<&'a str> for ValueRef<'a> { fn from(string: &'a str) -> ValueRef<'a> { @@ -628,6 +721,7 @@ impl<'a> From>> for ValueRef<'a> { /// if it becomes bottleneck. #[derive(Debug, Clone, Copy)] pub enum ListValueRef<'a> { + // TODO(yingwen): Consider replace this by VectorRef. Indexed { vector: &'a ListVector, idx: usize }, Ref { val: &'a ListValue }, } @@ -785,19 +879,16 @@ mod tests { Some(Box::new(vec![Value::Int32(1), Value::Null])), ConcreteDataType::int32_datatype() )), - ScalarValue::List( - Some(Box::new(vec![ - ScalarValue::Int32(Some(1)), - ScalarValue::Int32(None) - ])), - Box::new(ArrowDataType::Int32) + ScalarValue::new_list( + Some(vec![ScalarValue::Int32(Some(1)), ScalarValue::Int32(None)]), + ArrowDataType::Int32, ) .try_into() .unwrap() ); assert_eq!( Value::List(ListValue::new(None, ConcreteDataType::uint32_datatype())), - ScalarValue::List(None, Box::new(ArrowDataType::UInt32)) + ScalarValue::new_list(None, ArrowDataType::UInt32) .try_into() .unwrap() ); @@ -980,6 +1071,10 @@ mod tests { ConcreteDataType::int32_datatype(), )), ); + check_type_and_value( + &ConcreteDataType::list_datatype(ConcreteDataType::null_datatype()), + &Value::List(ListValue::default()), + ); check_type_and_value( &ConcreteDataType::date_datatype(), &Value::Date(Date::new(1)), @@ -989,8 +1084,8 @@ mod tests { &Value::DateTime(DateTime::new(1)), ); check_type_and_value( - &ConcreteDataType::timestamp_millis_datatype(), - &Value::Timestamp(Timestamp::from_millis(1)), + &ConcreteDataType::timestamp_millisecond_datatype(), + &Value::Timestamp(Timestamp::new_millisecond(1)), ); } @@ -1085,7 +1180,7 @@ mod tests { assert_eq!( serde_json::Value::Number(1.into()), - to_json(Value::Timestamp(Timestamp::from_millis(1))) + to_json(Value::Timestamp(Timestamp::new_millisecond(1))) ); let json_value: serde_json::Value = @@ -1143,7 +1238,7 @@ mod tests { check_as_value_ref!(Int64, -12); check_as_value_ref!(Float32, OrderedF32::from(16.0)); check_as_value_ref!(Float64, OrderedF64::from(16.0)); - check_as_value_ref!(Timestamp, Timestamp::from_millis(1)); + check_as_value_ref!(Timestamp, Timestamp::new_millisecond(1)); assert_eq!( ValueRef::String("hello"), @@ -1208,59 +1303,6 @@ mod tests { assert!(wrong_value.as_list().is_err()); } - #[test] - fn test_into_value_ref() { - macro_rules! check_into_value_ref { - ($Variant: ident, $data: expr, $PrimitiveType: ident, $Wrapper: ident) => { - let data: $PrimitiveType = $data; - assert_eq!( - ValueRef::$Variant($Wrapper::from(data)), - data.into_value_ref() - ); - assert_eq!( - ValueRef::$Variant($Wrapper::from(data)), - ValueRef::from(data) - ); - assert_eq!( - ValueRef::$Variant($Wrapper::from(data)), - Some(data).into_value_ref() - ); - assert_eq!( - ValueRef::$Variant($Wrapper::from(data)), - ValueRef::from(Some(data)) - ); - let x: Option<$PrimitiveType> = None; - assert_eq!(ValueRef::Null, x.into_value_ref()); - assert_eq!(ValueRef::Null, x.into()); - }; - } - - macro_rules! 
check_primitive_into_value_ref { - ($Variant: ident, $data: expr, $PrimitiveType: ident) => { - check_into_value_ref!($Variant, $data, $PrimitiveType, $PrimitiveType) - }; - } - - check_primitive_into_value_ref!(Boolean, true, bool); - check_primitive_into_value_ref!(UInt8, 10, u8); - check_primitive_into_value_ref!(UInt16, 20, u16); - check_primitive_into_value_ref!(UInt32, 30, u32); - check_primitive_into_value_ref!(UInt64, 40, u64); - check_primitive_into_value_ref!(Int8, -10, i8); - check_primitive_into_value_ref!(Int16, -20, i16); - check_primitive_into_value_ref!(Int32, -30, i32); - check_primitive_into_value_ref!(Int64, -40, i64); - check_into_value_ref!(Float32, 10.0, f32, OrderedF32); - check_into_value_ref!(Float64, 10.0, f64, OrderedF64); - - let hello = "hello"; - assert_eq!( - ValueRef::Binary(hello.as_bytes()), - ValueRef::from(hello.as_bytes()) - ); - assert_eq!(ValueRef::String(hello), ValueRef::from(hello)); - } - #[test] fn test_display() { assert_eq!(Value::Null.to_string(), "Null"); @@ -1301,10 +1343,248 @@ mod tests { assert_eq!( Value::List(ListValue::new( Some(Box::new(vec![])), - ConcreteDataType::timestamp_datatype(TimeUnit::Millisecond), + ConcreteDataType::timestamp_second_datatype(), + )) + .to_string(), + "TimestampSecond[]" + ); + assert_eq!( + Value::List(ListValue::new( + Some(Box::new(vec![])), + ConcreteDataType::timestamp_millisecond_datatype(), )) .to_string(), - "Timestamp[]" + "TimestampMillisecond[]" + ); + assert_eq!( + Value::List(ListValue::new( + Some(Box::new(vec![])), + ConcreteDataType::timestamp_microsecond_datatype(), + )) + .to_string(), + "TimestampMicrosecond[]" + ); + assert_eq!( + Value::List(ListValue::new( + Some(Box::new(vec![])), + ConcreteDataType::timestamp_nanosecond_datatype(), + )) + .to_string(), + "TimestampNanosecond[]" + ); + } + + #[test] + fn test_not_null_value_to_scalar_value() { + assert_eq!( + ScalarValue::Boolean(Some(true)), + Value::Boolean(true) + .try_to_scalar_value(&ConcreteDataType::boolean_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::Boolean(Some(false)), + Value::Boolean(false) + .try_to_scalar_value(&ConcreteDataType::boolean_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::UInt8(Some(u8::MIN + 1)), + Value::UInt8(u8::MIN + 1) + .try_to_scalar_value(&ConcreteDataType::uint8_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::UInt16(Some(u16::MIN + 2)), + Value::UInt16(u16::MIN + 2) + .try_to_scalar_value(&ConcreteDataType::uint16_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::UInt32(Some(u32::MIN + 3)), + Value::UInt32(u32::MIN + 3) + .try_to_scalar_value(&ConcreteDataType::uint32_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::UInt64(Some(u64::MIN + 4)), + Value::UInt64(u64::MIN + 4) + .try_to_scalar_value(&ConcreteDataType::uint64_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::Int8(Some(i8::MIN + 4)), + Value::Int8(i8::MIN + 4) + .try_to_scalar_value(&ConcreteDataType::int8_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::Int16(Some(i16::MIN + 5)), + Value::Int16(i16::MIN + 5) + .try_to_scalar_value(&ConcreteDataType::int16_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::Int32(Some(i32::MIN + 6)), + Value::Int32(i32::MIN + 6) + .try_to_scalar_value(&ConcreteDataType::int32_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::Int64(Some(i64::MIN + 7)), + Value::Int64(i64::MIN + 7) + .try_to_scalar_value(&ConcreteDataType::int64_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::Float32(Some(8.0f32)), + 
Value::Float32(OrderedFloat(8.0f32)) + .try_to_scalar_value(&ConcreteDataType::float32_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::Float64(Some(9.0f64)), + Value::Float64(OrderedFloat(9.0f64)) + .try_to_scalar_value(&ConcreteDataType::float64_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::Utf8(Some("hello".to_string())), + Value::String(StringBytes::from("hello")) + .try_to_scalar_value(&ConcreteDataType::string_datatype(),) + .unwrap() + ); + assert_eq!( + ScalarValue::LargeBinary(Some("world".as_bytes().to_vec())), + Value::Binary(Bytes::from("world".as_bytes())) + .try_to_scalar_value(&ConcreteDataType::binary_datatype()) + .unwrap() + ); + } + + #[test] + fn test_null_value_to_scalar_value() { + assert_eq!( + ScalarValue::Boolean(None), + Value::Null + .try_to_scalar_value(&ConcreteDataType::boolean_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::UInt8(None), + Value::Null + .try_to_scalar_value(&ConcreteDataType::uint8_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::UInt16(None), + Value::Null + .try_to_scalar_value(&ConcreteDataType::uint16_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::UInt32(None), + Value::Null + .try_to_scalar_value(&ConcreteDataType::uint32_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::UInt64(None), + Value::Null + .try_to_scalar_value(&ConcreteDataType::uint64_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::Int8(None), + Value::Null + .try_to_scalar_value(&ConcreteDataType::int8_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::Int16(None), + Value::Null + .try_to_scalar_value(&ConcreteDataType::int16_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::Int32(None), + Value::Null + .try_to_scalar_value(&ConcreteDataType::int32_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::Int64(None), + Value::Null + .try_to_scalar_value(&ConcreteDataType::int64_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::Float32(None), + Value::Null + .try_to_scalar_value(&ConcreteDataType::float32_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::Float64(None), + Value::Null + .try_to_scalar_value(&ConcreteDataType::float64_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::Utf8(None), + Value::Null + .try_to_scalar_value(&ConcreteDataType::string_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::LargeBinary(None), + Value::Null + .try_to_scalar_value(&ConcreteDataType::binary_datatype()) + .unwrap() + ); + } + + #[test] + fn test_list_value_to_scalar_value() { + let items = Some(Box::new(vec![Value::Int32(-1), Value::Null])); + let list = Value::List(ListValue::new(items, ConcreteDataType::int32_datatype())); + let df_list = list + .try_to_scalar_value(&ConcreteDataType::list_datatype( + ConcreteDataType::int32_datatype(), + )) + .unwrap(); + assert!(matches!(df_list, ScalarValue::List(_, _))); + match df_list { + ScalarValue::List(vs, field) => { + assert_eq!(ArrowDataType::Int32, *field.data_type()); + + let vs = vs.unwrap(); + assert_eq!( + vs, + vec![ScalarValue::Int32(Some(-1)), ScalarValue::Int32(None)] + ); + } + _ => unreachable!(), + } + } + + #[test] + fn test_timestamp_to_scalar_value() { + assert_eq!( + ScalarValue::TimestampSecond(Some(1), None), + timestamp_to_scalar_value(TimeUnit::Second, Some(1)) + ); + assert_eq!( + ScalarValue::TimestampMillisecond(Some(1), None), + timestamp_to_scalar_value(TimeUnit::Millisecond, Some(1)) + ); + assert_eq!( + ScalarValue::TimestampMicrosecond(Some(1), None), + 
timestamp_to_scalar_value(TimeUnit::Microsecond, Some(1)) + ); + assert_eq!( + ScalarValue::TimestampNanosecond(Some(1), None), + timestamp_to_scalar_value(TimeUnit::Nanosecond, Some(1)) ); } } diff --git a/src/datatypes/src/vectors.rs b/src/datatypes/src/vectors.rs index 6c9402849fff..fe71a6a7c37a 100644 --- a/src/datatypes/src/vectors.rs +++ b/src/datatypes/src/vectors.rs @@ -12,68 +12,59 @@ // See the License for the specific language governing permissions and // limitations under the License. -pub mod binary; -pub mod boolean; -mod builder; -pub mod constant; -pub mod date; -pub mod datetime; -mod eq; -mod helper; -mod list; -pub mod mutable; -pub mod null; -mod operations; -pub mod primitive; -mod string; -mod timestamp; - use std::any::Any; use std::fmt::Debug; use std::sync::Arc; use arrow::array::{Array, ArrayRef}; -use arrow::bitmap::Bitmap; -pub use binary::*; -pub use boolean::*; -pub use builder::VectorBuilder; -pub use constant::*; -pub use date::*; -pub use datetime::*; -pub use helper::Helper; -pub use list::*; -pub use mutable::MutableVector; -pub use null::*; -pub use operations::VectorOp; -pub use primitive::*; use snafu::ensure; -pub use string::*; -pub use timestamp::*; use crate::data_type::ConcreteDataType; use crate::error::{self, Result}; use crate::serialize::Serializable; use crate::value::{Value, ValueRef}; +use crate::vectors::operations::VectorOp; -#[derive(Debug, PartialEq)] -pub enum Validity<'a> { - /// Whether the array slot is valid or not (null). - Slots(&'a Bitmap), - /// All slots are valid. - AllValid, - /// All slots are null. - AllNull, -} +mod binary; +mod boolean; +mod constant; +mod date; +mod datetime; +mod eq; +mod helper; +mod list; +mod null; +mod operations; +mod primitive; +mod string; +mod timestamp; +mod validity; -impl<'a> Validity<'a> { - pub fn slots(&self) -> Option<&Bitmap> { - match self { - Validity::Slots(bitmap) => Some(bitmap), - _ => None, - } - } -} +pub use binary::{BinaryVector, BinaryVectorBuilder}; +pub use boolean::{BooleanVector, BooleanVectorBuilder}; +pub use constant::ConstantVector; +pub use date::{DateVector, DateVectorBuilder}; +pub use datetime::{DateTimeVector, DateTimeVectorBuilder}; +pub use helper::Helper; +pub use list::{ListIter, ListVector, ListVectorBuilder}; +pub use null::{NullVector, NullVectorBuilder}; +pub use primitive::{ + Float32Vector, Float32VectorBuilder, Float64Vector, Float64VectorBuilder, Int16Vector, + Int16VectorBuilder, Int32Vector, Int32VectorBuilder, Int64Vector, Int64VectorBuilder, + Int8Vector, Int8VectorBuilder, PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder, + UInt16Vector, UInt16VectorBuilder, UInt32Vector, UInt32VectorBuilder, UInt64Vector, + UInt64VectorBuilder, UInt8Vector, UInt8VectorBuilder, +}; +pub use string::{StringVector, StringVectorBuilder}; +pub use timestamp::{ + TimestampMicrosecondVector, TimestampMicrosecondVectorBuilder, TimestampMillisecondVector, + TimestampMillisecondVectorBuilder, TimestampNanosecondVector, TimestampNanosecondVectorBuilder, + TimestampSecondVector, TimestampSecondVectorBuilder, +}; +pub use validity::Validity; +// TODO(yingwen): arrow 28.0 implements Clone for all arrays; we could upgrade to it and simplify +// some code in methods such as `to_arrow_array()` and `to_boxed_arrow_array()`. /// Vector of data values. pub trait Vector: Send + Sync + Serializable + Debug + VectorOp { /// Returns the data type of the vector.
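With `Validity` now rebuilt from arrow `ArrayData` and `null_count` promoted to a required trait method, a concrete vector can implement both as thin delegations to arrow. A sketch of the pattern (simplified from the macro and `BinaryVector` changes below):

    fn validity(&self) -> Validity {
        // Reconstructed from the arrow ArrayData instead of wrapping an arrow2 Bitmap.
        Validity::from_array_data(self.array.data())
    }

    fn null_count(&self) -> usize {
        // Arrow arrays already track their null count, so this stays O(1).
        self.array.null_count()
    }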
@@ -110,13 +101,7 @@ pub trait Vector: Send + Sync + Serializable + Debug + VectorOp { /// The number of null slots on this [`Vector`]. /// # Implementation /// This is `O(1)`. - fn null_count(&self) -> usize { - match self.validity() { - Validity::Slots(bitmap) => bitmap.null_count(), - Validity::AllValid => 0, - Validity::AllNull => self.len(), - } - } + fn null_count(&self) -> usize; /// Returns true when it's a ConstantColumn fn is_const(&self) -> bool { @@ -165,6 +150,42 @@ pub trait Vector: Send + Sync + Serializable + Debug + VectorOp { pub type VectorRef = Arc; +/// Mutable vector that could be used to build an immutable vector. +pub trait MutableVector: Send + Sync { + /// Returns the data type of the vector. + fn data_type(&self) -> ConcreteDataType; + + /// Returns the length of the vector. + fn len(&self) -> usize; + + /// Returns whether the vector is empty. + fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Convert to Any, to enable dynamic casting. + fn as_any(&self) -> &dyn Any; + + /// Convert to mutable Any, to enable dynamic casting. + fn as_mut_any(&mut self) -> &mut dyn Any; + + /// Convert `self` to an (immutable) [VectorRef] and reset `self`. + fn to_vector(&mut self) -> VectorRef; + + /// Push value ref to this mutable vector. + /// + /// Returns error if data types mismatch. + fn push_value_ref(&mut self, value: ValueRef) -> Result<()>; + + /// Extend this mutable vector by slice of `vector`. + /// + /// Returns error if data types mismatch. + /// + /// # Panics + /// Panics if `offset + length > vector.len()`. + fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()>; +} + /// Helper to define `try_from_arrow_array(array: arrow::array::ArrayRef)` function. macro_rules! impl_try_from_arrow_array_for_vector { ($Array: ident, $Vector: ident) => { @@ -172,16 +193,20 @@ macro_rules! impl_try_from_arrow_array_for_vector { pub fn try_from_arrow_array( array: impl AsRef, ) -> crate::error::Result<$Vector> { - Ok($Vector::from( - array - .as_ref() - .as_any() - .downcast_ref::<$Array>() - .with_context(|| crate::error::ConversionSnafu { - from: std::format!("{:?}", array.as_ref().data_type()), - })? - .clone(), - )) + use snafu::OptionExt; + + let data = array + .as_ref() + .as_any() + .downcast_ref::<$Array>() + .with_context(|| crate::error::ConversionSnafu { + from: std::format!("{:?}", array.as_ref().data_type()), + })? + .data() + .clone(); + + let concrete_array = $Array::from(data); + Ok($Vector::from(concrete_array)) } } }; @@ -189,10 +214,7 @@ macro_rules! impl_try_from_arrow_array_for_vector { macro_rules! impl_validity_for_vector { ($array: expr) => { - match $array.validity() { - Some(bitmap) => Validity::Slots(bitmap), - None => Validity::AllValid, - } + Validity::from_array_data($array.data()) }; } @@ -219,10 +241,11 @@ macro_rules! impl_get_ref_for_vector { } macro_rules! impl_extend_for_builder { - ($mutable_array: expr, $vector: ident, $VectorType: ident, $offset: ident, $length: ident) => {{ + ($mutable_vector: expr, $vector: ident, $VectorType: ident, $offset: ident, $length: ident) => {{ use snafu::OptionExt; - let concrete_vector = $vector + let sliced_vector = $vector.slice($offset, $length); + let concrete_vector = sliced_vector .as_any() .downcast_ref::<$VectorType>() .with_context(|| crate::error::CastTypeSnafu { @@ -232,8 +255,9 @@ macro_rules! 
impl_extend_for_builder { stringify!($VectorType) ), })?; - let slice = concrete_vector.array.slice($offset, $length); - $mutable_array.extend_trusted_len(slice.iter()); + for value in concrete_vector.iter_data() { + $mutable_vector.push(value); + } Ok(()) }}; } @@ -245,27 +269,27 @@ pub(crate) use { #[cfg(test)] pub mod tests { - use arrow::array::{Array, PrimitiveArray}; + use arrow::array::{Array, Int32Array, UInt8Array}; use serde_json; - use super::helper::Helper; use super::*; use crate::data_type::DataType; - use crate::types::PrimitiveElement; + use crate::types::{Int32Type, LogicalPrimitiveType}; + use crate::vectors::helper::Helper; #[test] fn test_df_columns_to_vector() { - let df_column: Arc = Arc::new(PrimitiveArray::from_slice(vec![1, 2, 3])); + let df_column: Arc = Arc::new(Int32Array::from(vec![1, 2, 3])); let vector = Helper::try_into_vector(df_column).unwrap(); assert_eq!( - i32::build_data_type().as_arrow_type(), + Int32Type::build_data_type().as_arrow_type(), vector.data_type().as_arrow_type() ); } #[test] fn test_serialize_i32_vector() { - let df_column: Arc = Arc::new(PrimitiveArray::::from_slice(vec![1, 2, 3])); + let df_column: Arc = Arc::new(Int32Array::from(vec![1, 2, 3])); let json_value = Helper::try_into_vector(df_column) .unwrap() .serialize_to_json() @@ -275,7 +299,7 @@ pub mod tests { #[test] fn test_serialize_i8_vector() { - let df_column: Arc = Arc::new(PrimitiveArray::from_slice(vec![1u8, 2u8, 3u8])); + let df_column: Arc = Arc::new(UInt8Array::from(vec![1, 2, 3])); let json_value = Helper::try_into_vector(df_column) .unwrap() .serialize_to_json() diff --git a/src/datatypes/src/vectors/binary.rs b/src/datatypes/src/vectors/binary.rs index 817b29bca0fa..3b5defc8ec6e 100644 --- a/src/datatypes/src/vectors/binary.rs +++ b/src/datatypes/src/vectors/binary.rs @@ -15,9 +15,8 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::{Array, ArrayRef, BinaryValueIter, MutableArray}; -use arrow::bitmap::utils::ZipValidity; -use snafu::{OptionExt, ResultExt}; +use arrow::array::{Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef}; +use snafu::ResultExt; use crate::arrow_array::{BinaryArray, MutableBinaryArray}; use crate::data_type::ConcreteDataType; @@ -37,6 +36,16 @@ impl BinaryVector { pub(crate) fn as_arrow(&self) -> &dyn Array { &self.array } + + fn to_array_data(&self) -> ArrayData { + self.array.data().clone() + } + + fn from_array_data(data: ArrayData) -> BinaryVector { + BinaryVector { + array: BinaryArray::from(data), + } + } } impl From for BinaryVector { @@ -48,7 +57,7 @@ impl From for BinaryVector { impl From>>> for BinaryVector { fn from(data: Vec>>) -> Self { Self { - array: BinaryArray::from(data), + array: BinaryArray::from_iter(data), } } } @@ -71,11 +80,13 @@ impl Vector for BinaryVector { } fn to_arrow_array(&self) -> ArrayRef { - Arc::new(self.array.clone()) + let data = self.to_array_data(); + Arc::new(BinaryArray::from(data)) } fn to_boxed_arrow_array(&self) -> Box { - Box::new(self.array.clone()) + let data = self.to_array_data(); + Box::new(BinaryArray::from(data)) } fn validity(&self) -> Validity { @@ -83,7 +94,11 @@ impl Vector for BinaryVector { } fn memory_size(&self) -> usize { - self.array.values().len() + self.array.offsets().len() * std::mem::size_of::() + self.array.get_buffer_memory_size() + } + + fn null_count(&self) -> usize { + self.array.null_count() } fn is_null(&self, row: usize) -> bool { @@ -91,7 +106,8 @@ impl Vector for BinaryVector { } fn slice(&self, offset: usize, length: usize) -> VectorRef { - 
Arc::new(Self::from(self.array.slice(offset, length))) + let data = self.array.data().slice(offset, length); + Arc::new(Self::from_array_data(data)) } fn get(&self, index: usize) -> Value { @@ -106,7 +122,7 @@ impl Vector for BinaryVector { impl ScalarVector for BinaryVector { type OwnedItem = Vec; type RefItem<'a> = &'a [u8]; - type Iter<'a> = ZipValidity<'a, &'a [u8], BinaryValueIter<'a, i64>>; + type Iter<'a> = ArrayIter<&'a BinaryArray>; type Builder = BinaryVectorBuilder; fn get_data(&self, idx: usize) -> Option> { @@ -148,12 +164,15 @@ impl MutableVector for BinaryVectorBuilder { } fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { - self.mutable_array.push(value.as_binary()?); + match value.as_binary()? { + Some(v) => self.mutable_array.append_value(v), + None => self.mutable_array.append_null(), + } Ok(()) } fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { - vectors::impl_extend_for_builder!(self.mutable_array, vector, BinaryVector, offset, length) + vectors::impl_extend_for_builder!(self, vector, BinaryVector, offset, length) } } @@ -162,17 +181,20 @@ impl ScalarVectorBuilder for BinaryVectorBuilder { fn with_capacity(capacity: usize) -> Self { Self { - mutable_array: MutableBinaryArray::with_capacity(capacity), + mutable_array: MutableBinaryArray::with_capacity(capacity, 0), } } fn push(&mut self, value: Option<::RefItem<'_>>) { - self.mutable_array.push(value); + match value { + Some(v) => self.mutable_array.append_value(v), + None => self.mutable_array.append_null(), + } } fn finish(&mut self) -> Self::VectorType { BinaryVector { - array: std::mem::take(&mut self.mutable_array).into(), + array: self.mutable_array.finish(), } } } @@ -205,14 +227,17 @@ mod tests { #[test] fn test_binary_vector_misc() { - let v = BinaryVector::from(BinaryArray::from_slice(&[vec![1, 2, 3], vec![1, 2, 3]])); + let v = BinaryVector::from(BinaryArray::from_iter_values(&[ + vec![1, 2, 3], + vec![1, 2, 3], + ])); assert_eq!(2, v.len()); assert_eq!("BinaryVector", v.vector_type_name()); assert!(!v.is_const()); - assert_eq!(Validity::AllValid, v.validity()); + assert!(v.validity().is_all_valid()); assert!(!v.only_null()); - assert_eq!(30, v.memory_size()); + assert_eq!(128, v.memory_size()); for i in 0..2 { assert!(!v.is_null(i)); @@ -227,7 +252,10 @@ mod tests { #[test] fn test_serialize_binary_vector_to_json() { - let vector = BinaryVector::from(BinaryArray::from_slice(&[vec![1, 2, 3], vec![1, 2, 3]])); + let vector = BinaryVector::from(BinaryArray::from_iter_values(&[ + vec![1, 2, 3], + vec![1, 2, 3], + ])); let json_value = vector.serialize_to_json().unwrap(); assert_eq!( @@ -253,8 +281,8 @@ mod tests { #[test] fn test_from_arrow_array() { - let arrow_array = BinaryArray::from_slice(&[vec![1, 2, 3], vec![1, 2, 3]]); - let original = arrow_array.clone(); + let arrow_array = BinaryArray::from_iter_values(&[vec![1, 2, 3], vec![1, 2, 3]]); + let original = BinaryArray::from(arrow_array.data().clone()); let vector = BinaryVector::from(arrow_array); assert_eq!(original, vector.array); } @@ -289,7 +317,7 @@ mod tests { builder.push(Some(b"world")); let vector = builder.finish(); assert_eq!(0, vector.null_count()); - assert_eq!(Validity::AllValid, vector.validity()); + assert!(vector.validity().is_all_valid()); let mut builder = BinaryVectorBuilder::with_capacity(3); builder.push(Some(b"hello")); @@ -298,9 +326,10 @@ mod tests { let vector = builder.finish(); assert_eq!(1, vector.null_count()); let validity = vector.validity(); - let slots = 
validity.slots().unwrap(); - assert_eq!(1, slots.null_count()); - assert!(!slots.get_bit(1)); + assert_eq!(1, validity.null_count()); + assert!(!validity.is_set(1)); } #[test] diff --git a/src/datatypes/src/vectors/boolean.rs b/src/datatypes/src/vectors/boolean.rs index 11c40bd66157..2b4e5b8e10d9 100644 --- a/src/datatypes/src/vectors/boolean.rs +++ b/src/datatypes/src/vectors/boolean.rs @@ -16,9 +16,10 @@ use std::any::Any; use std::borrow::Borrow; use std::sync::Arc; -use arrow::array::{Array, ArrayRef, BooleanArray, MutableArray, MutableBooleanArray}; -use arrow::bitmap::utils::{BitmapIter, ZipValidity}; -use snafu::{OptionExt, ResultExt}; +use arrow::array::{ + Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef, BooleanArray, BooleanBuilder, +}; +use snafu::ResultExt; use crate::data_type::ConcreteDataType; use crate::error::Result; @@ -41,12 +42,26 @@ impl BooleanVector { pub(crate) fn as_boolean_array(&self) -> &BooleanArray { &self.array } + + fn to_array_data(&self) -> ArrayData { + self.array.data().clone() + } + + fn from_array_data(data: ArrayData) -> BooleanVector { + BooleanVector { + array: BooleanArray::from(data), + } + } + + pub(crate) fn false_count(&self) -> usize { + self.array.false_count() + } } impl From> for BooleanVector { fn from(data: Vec) -> Self { BooleanVector { - array: BooleanArray::from_slice(&data), + array: BooleanArray::from(data), } } } @@ -91,11 +106,13 @@ impl Vector for BooleanVector { } fn to_arrow_array(&self) -> ArrayRef { - Arc::new(self.array.clone()) + let data = self.to_array_data(); + Arc::new(BooleanArray::from(data)) } fn to_boxed_arrow_array(&self) -> Box { - Box::new(self.array.clone()) + let data = self.to_array_data(); + Box::new(BooleanArray::from(data)) } fn validity(&self) -> Validity { @@ -103,7 +120,11 @@ impl Vector for BooleanVector { } fn memory_size(&self) -> usize { - self.array.values().as_slice().0.len() + self.array.get_buffer_memory_size() + } + + fn null_count(&self) -> usize { + self.array.null_count() + } fn is_null(&self, row: usize) -> bool { @@ -111,7 +132,8 @@ impl Vector for BooleanVector { } fn slice(&self, offset: usize, length: usize) -> VectorRef { - Arc::new(Self::from(self.array.slice(offset, length))) + let data = self.array.data().slice(offset, length); + Arc::new(Self::from_array_data(data)) } fn get(&self, index: usize) -> Value { @@ -126,7 +148,7 @@ impl Vector for BooleanVector { impl ScalarVector for BooleanVector { type OwnedItem = bool; type RefItem<'a> = bool; - type Iter<'a> = ZipValidity<'a, bool, BitmapIter<'a>>; + type Iter<'a> = ArrayIter<&'a BooleanArray>; type Builder = BooleanVectorBuilder; fn get_data(&self, idx: usize) -> Option> { @@ -143,7 +165,7 @@ impl ScalarVector for BooleanVector { } pub struct BooleanVectorBuilder { - mutable_array: MutableBooleanArray, + mutable_array: BooleanBuilder, } impl MutableVector for BooleanVectorBuilder { @@ -168,12 +190,15 @@ impl MutableVector for BooleanVectorBuilder { } fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { - self.mutable_array.push(value.as_boolean()?); + match value.as_boolean()?
{ + Some(v) => self.mutable_array.append_value(v), + None => self.mutable_array.append_null(), + } Ok(()) } fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { - vectors::impl_extend_for_builder!(self.mutable_array, vector, BooleanVector, offset, length) + vectors::impl_extend_for_builder!(self, vector, BooleanVector, offset, length) } } @@ -182,17 +207,20 @@ impl ScalarVectorBuilder for BooleanVectorBuilder { fn with_capacity(capacity: usize) -> Self { Self { - mutable_array: MutableBooleanArray::with_capacity(capacity), + mutable_array: BooleanBuilder::with_capacity(capacity), } } fn push(&mut self, value: Option<::RefItem<'_>>) { - self.mutable_array.push(value); + match value { + Some(v) => self.mutable_array.append_value(v), + None => self.mutable_array.append_null(), + } } fn finish(&mut self) -> Self::VectorType { BooleanVector { - array: std::mem::take(&mut self.mutable_array).into(), + array: self.mutable_array.finish(), } } } @@ -225,9 +253,9 @@ mod tests { assert_eq!(9, v.len()); assert_eq!("BooleanVector", v.vector_type_name()); assert!(!v.is_const()); - assert_eq!(Validity::AllValid, v.validity()); + assert!(v.validity().is_all_valid()); assert!(!v.only_null()); - assert_eq!(2, v.memory_size()); + assert_eq!(64, v.memory_size()); for (i, b) in bools.iter().enumerate() { assert!(!v.is_null(i)); @@ -316,13 +344,12 @@ mod tests { let vector = BooleanVector::from(vec![Some(true), None, Some(false)]); assert_eq!(1, vector.null_count()); let validity = vector.validity(); - let slots = validity.slots().unwrap(); - assert_eq!(1, slots.null_count()); - assert!(!slots.get_bit(1)); + assert_eq!(1, validity.null_count()); + assert!(!validity.is_set(1)); let vector = BooleanVector::from(vec![true, false, false]); assert_eq!(0, vector.null_count()); - assert_eq!(Validity::AllValid, vector.validity()); + assert!(vector.validity().is_all_valid()); } #[test] diff --git a/src/datatypes/src/vectors/builder.rs b/src/datatypes/src/vectors/builder.rs deleted file mode 100644 index 67ab2513ab3e..000000000000 --- a/src/datatypes/src/vectors/builder.rs +++ /dev/null @@ -1,494 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::sync::Arc; - -use common_time::date::Date; -use common_time::datetime::DateTime; -use common_time::timestamp::Timestamp; - -use crate::data_type::ConcreteDataType; -use crate::error::{self, Result}; -use crate::prelude::ValueRef; -use crate::scalars::ScalarVectorBuilder; -use crate::value::Value; -use crate::vectors::date::DateVectorBuilder; -use crate::vectors::datetime::DateTimeVectorBuilder; -use crate::vectors::{ - BinaryVectorBuilder, BooleanVectorBuilder, Float32VectorBuilder, Float64VectorBuilder, - Int16VectorBuilder, Int32VectorBuilder, Int64VectorBuilder, Int8VectorBuilder, MutableVector, - NullVector, StringVectorBuilder, TimestampVectorBuilder, UInt16VectorBuilder, - UInt32VectorBuilder, UInt64VectorBuilder, UInt8VectorBuilder, VectorRef, -}; - -pub enum VectorBuilder { - Null(usize), - - // Numeric types: - Boolean(BooleanVectorBuilder), - UInt8(UInt8VectorBuilder), - UInt16(UInt16VectorBuilder), - UInt32(UInt32VectorBuilder), - UInt64(UInt64VectorBuilder), - Int8(Int8VectorBuilder), - Int16(Int16VectorBuilder), - Int32(Int32VectorBuilder), - Int64(Int64VectorBuilder), - Float32(Float32VectorBuilder), - Float64(Float64VectorBuilder), - - // String types: - String(StringVectorBuilder), - Binary(BinaryVectorBuilder), - - Date(DateVectorBuilder), - DateTime(DateTimeVectorBuilder), - Timestamp(TimestampVectorBuilder), -} - -impl VectorBuilder { - pub fn new(data_type: ConcreteDataType) -> VectorBuilder { - VectorBuilder::with_capacity(data_type, 0) - } - - pub fn with_capacity(data_type: ConcreteDataType, capacity: usize) -> VectorBuilder { - match data_type { - ConcreteDataType::Null(_) => VectorBuilder::Null(0), - ConcreteDataType::Boolean(_) => { - VectorBuilder::Boolean(BooleanVectorBuilder::with_capacity(capacity)) - } - ConcreteDataType::UInt8(_) => { - VectorBuilder::UInt8(UInt8VectorBuilder::with_capacity(capacity)) - } - ConcreteDataType::UInt16(_) => { - VectorBuilder::UInt16(UInt16VectorBuilder::with_capacity(capacity)) - } - ConcreteDataType::UInt32(_) => { - VectorBuilder::UInt32(UInt32VectorBuilder::with_capacity(capacity)) - } - ConcreteDataType::UInt64(_) => { - VectorBuilder::UInt64(UInt64VectorBuilder::with_capacity(capacity)) - } - ConcreteDataType::Int8(_) => { - VectorBuilder::Int8(Int8VectorBuilder::with_capacity(capacity)) - } - ConcreteDataType::Int16(_) => { - VectorBuilder::Int16(Int16VectorBuilder::with_capacity(capacity)) - } - ConcreteDataType::Int32(_) => { - VectorBuilder::Int32(Int32VectorBuilder::with_capacity(capacity)) - } - ConcreteDataType::Int64(_) => { - VectorBuilder::Int64(Int64VectorBuilder::with_capacity(capacity)) - } - ConcreteDataType::Float32(_) => { - VectorBuilder::Float32(Float32VectorBuilder::with_capacity(capacity)) - } - ConcreteDataType::Float64(_) => { - VectorBuilder::Float64(Float64VectorBuilder::with_capacity(capacity)) - } - ConcreteDataType::String(_) => { - VectorBuilder::String(StringVectorBuilder::with_capacity(capacity)) - } - ConcreteDataType::Binary(_) => { - VectorBuilder::Binary(BinaryVectorBuilder::with_capacity(capacity)) - } - ConcreteDataType::Date(_) => { - VectorBuilder::Date(DateVectorBuilder::with_capacity(capacity)) - } - ConcreteDataType::DateTime(_) => { - VectorBuilder::DateTime(DateTimeVectorBuilder::with_capacity(capacity)) - } - ConcreteDataType::Timestamp(_) => { - VectorBuilder::Timestamp(TimestampVectorBuilder::with_capacity(capacity)) - } - _ => unimplemented!(), - } - } - - pub fn data_type(&self) -> ConcreteDataType { - match self { - VectorBuilder::Null(_) => 
ConcreteDataType::null_datatype(), - VectorBuilder::Boolean(b) => b.data_type(), - VectorBuilder::UInt8(b) => b.data_type(), - VectorBuilder::UInt16(b) => b.data_type(), - VectorBuilder::UInt32(b) => b.data_type(), - VectorBuilder::UInt64(b) => b.data_type(), - VectorBuilder::Int8(b) => b.data_type(), - VectorBuilder::Int16(b) => b.data_type(), - VectorBuilder::Int32(b) => b.data_type(), - VectorBuilder::Int64(b) => b.data_type(), - VectorBuilder::Float32(b) => b.data_type(), - VectorBuilder::Float64(b) => b.data_type(), - VectorBuilder::String(b) => b.data_type(), - VectorBuilder::Binary(b) => b.data_type(), - VectorBuilder::Date(b) => b.data_type(), - VectorBuilder::DateTime(b) => b.data_type(), - VectorBuilder::Timestamp(b) => b.data_type(), - } - } - - pub fn push(&mut self, value: &Value) { - if value.is_null() { - self.push_null(); - return; - } - - match (&mut *self, value) { - (VectorBuilder::Boolean(b), Value::Boolean(v)) => b.push(Some(*v)), - (VectorBuilder::UInt8(b), Value::UInt8(v)) => b.push(Some(*v)), - (VectorBuilder::UInt16(b), Value::UInt16(v)) => b.push(Some(*v)), - (VectorBuilder::UInt32(b), Value::UInt32(v)) => b.push(Some(*v)), - (VectorBuilder::UInt64(b), Value::UInt64(v)) => b.push(Some(*v)), - (VectorBuilder::Int8(b), Value::Int8(v)) => b.push(Some(*v)), - (VectorBuilder::Int16(b), Value::Int16(v)) => b.push(Some(*v)), - (VectorBuilder::Int32(b), Value::Int32(v)) => b.push(Some(*v)), - (VectorBuilder::Int64(b), Value::Int64(v)) => b.push(Some(*v)), - (VectorBuilder::Float32(b), Value::Float32(v)) => b.push(Some(v.into_inner())), - (VectorBuilder::Float64(b), Value::Float64(v)) => b.push(Some(v.into_inner())), - (VectorBuilder::String(b), Value::String(v)) => b.push(Some(v.as_utf8())), - (VectorBuilder::Binary(b), Value::Binary(v)) => b.push(Some(v)), - (VectorBuilder::Date(b), Value::Date(v)) => b.push(Some(*v)), - (VectorBuilder::Date(b), Value::Int32(v)) => b.push(Some(Date::new(*v))), - (VectorBuilder::DateTime(b), Value::DateTime(v)) => b.push(Some(*v)), - (VectorBuilder::DateTime(b), Value::Int64(v)) => b.push(Some(DateTime::new(*v))), - (VectorBuilder::Timestamp(b), Value::Timestamp(t)) => b.push(Some(*t)), - (VectorBuilder::Timestamp(b), Value::Int64(v)) => { - b.push(Some(Timestamp::from_millis(*v))) - } - - _ => panic!( - "Value {:?} does not match builder type {:?}", - value, - self.data_type() - ), - } - } - - pub fn try_push_ref(&mut self, value: ValueRef) -> Result<()> { - match &mut *self { - VectorBuilder::Null(b) => { - if !value.is_null() { - return error::CastTypeSnafu { - msg: "unable to accept non-null value in NullVectorBuilder", - } - .fail(); - } - *b += 1; - Ok(()) - } - VectorBuilder::Boolean(b) => b.push_value_ref(value), - VectorBuilder::UInt8(b) => b.push_value_ref(value), - VectorBuilder::UInt16(b) => b.push_value_ref(value), - VectorBuilder::UInt32(b) => b.push_value_ref(value), - VectorBuilder::UInt64(b) => b.push_value_ref(value), - VectorBuilder::Int8(b) => b.push_value_ref(value), - VectorBuilder::Int16(b) => b.push_value_ref(value), - VectorBuilder::Int32(b) => b.push_value_ref(value), - VectorBuilder::Int64(b) => b.push_value_ref(value), - VectorBuilder::Float32(b) => b.push_value_ref(value), - VectorBuilder::Float64(b) => b.push_value_ref(value), - VectorBuilder::String(b) => b.push_value_ref(value), - VectorBuilder::Binary(b) => b.push_value_ref(value), - VectorBuilder::Date(b) => b.push_value_ref(value), - VectorBuilder::DateTime(b) => b.push_value_ref(value), - VectorBuilder::Timestamp(b) => b.push_value_ref(value), - } - } - 
- pub fn push_null(&mut self) { - match self { - VectorBuilder::Null(v) => *v += 1, - VectorBuilder::Boolean(b) => b.push(None), - VectorBuilder::UInt8(b) => b.push(None), - VectorBuilder::UInt16(b) => b.push(None), - VectorBuilder::UInt32(b) => b.push(None), - VectorBuilder::UInt64(b) => b.push(None), - VectorBuilder::Int8(b) => b.push(None), - VectorBuilder::Int16(b) => b.push(None), - VectorBuilder::Int32(b) => b.push(None), - VectorBuilder::Int64(b) => b.push(None), - VectorBuilder::Float32(b) => b.push(None), - VectorBuilder::Float64(b) => b.push(None), - VectorBuilder::String(b) => b.push(None), - VectorBuilder::Binary(b) => b.push(None), - VectorBuilder::Date(b) => b.push(None), - VectorBuilder::DateTime(b) => b.push(None), - VectorBuilder::Timestamp(b) => b.push(None), - } - } - - pub fn finish(&mut self) -> VectorRef { - match self { - VectorBuilder::Null(v) => Arc::new(NullVector::new(*v)), - VectorBuilder::Boolean(b) => Arc::new(b.finish()), - VectorBuilder::UInt8(b) => Arc::new(b.finish()), - VectorBuilder::UInt16(b) => Arc::new(b.finish()), - VectorBuilder::UInt32(b) => Arc::new(b.finish()), - VectorBuilder::UInt64(b) => Arc::new(b.finish()), - VectorBuilder::Int8(b) => Arc::new(b.finish()), - VectorBuilder::Int16(b) => Arc::new(b.finish()), - VectorBuilder::Int32(b) => Arc::new(b.finish()), - VectorBuilder::Int64(b) => Arc::new(b.finish()), - VectorBuilder::Float32(b) => Arc::new(b.finish()), - VectorBuilder::Float64(b) => Arc::new(b.finish()), - VectorBuilder::String(b) => Arc::new(b.finish()), - VectorBuilder::Binary(b) => Arc::new(b.finish()), - VectorBuilder::Date(b) => Arc::new(b.finish()), - VectorBuilder::DateTime(b) => Arc::new(b.finish()), - VectorBuilder::Timestamp(b) => Arc::new(b.finish()), - } - } -} - -#[cfg(test)] -mod tests { - use ordered_float::OrderedFloat; - - use super::*; - use crate::prelude::Vector; - use crate::vectors::date::DateVector; - use crate::vectors::datetime::DateTimeVector; - - macro_rules! 
impl_integer_builder_test { - ($Type: ident, $datatype: ident) => { - let data_type = ConcreteDataType::$datatype(); - let mut builder = VectorBuilder::with_capacity(data_type.clone(), 10); - assert_eq!(data_type, builder.data_type()); - - for i in 0..10 { - builder.push(&Value::$Type(i)); - } - for i in 10..20 { - builder.try_push_ref(ValueRef::$Type(i)).unwrap(); - } - let vector = builder.finish(); - - for i in 0..20 { - assert_eq!(Value::$Type(i), vector.get(i as usize)); - } - - let mut builder = VectorBuilder::new(ConcreteDataType::$datatype()); - builder.push(&Value::Null); - builder.push(&Value::$Type(100)); - builder.try_push_ref(ValueRef::Null).unwrap(); - builder.try_push_ref(ValueRef::$Type(101)).unwrap(); - - let result = builder.try_push_ref(ValueRef::Boolean(true)); - assert!(result.is_err()); - assert_eq!( - result.unwrap_err().to_string(), - format!( - "Failed to cast value Boolean(true) to primitive type {}", - stringify!($Type) - ), - ); - - let vector = builder.finish(); - - assert!(vector.is_null(0)); - assert_eq!(Value::$Type(100), vector.get(1)); - assert!(vector.is_null(2)); - assert_eq!(Value::$Type(101), vector.get(3)); - }; - } - - #[test] - fn test_null_vector_builder() { - let mut builder = VectorBuilder::new(ConcreteDataType::null_datatype()); - assert_eq!(ConcreteDataType::null_datatype(), builder.data_type()); - builder.push(&Value::Null); - - let result = builder.try_push_ref(ValueRef::Boolean(true)); - assert!(result.is_err()); - assert_eq!( - result.unwrap_err().to_string(), - "unable to accept non-null value in NullVectorBuilder" - ); - - builder.try_push_ref(ValueRef::Null).unwrap(); - - let vector = builder.finish(); - assert!(vector.is_null(0)); - assert!(vector.is_null(1)); - } - - #[test] - fn test_integer_vector_builder() { - impl_integer_builder_test!(UInt8, uint8_datatype); - impl_integer_builder_test!(UInt16, uint16_datatype); - impl_integer_builder_test!(UInt32, uint32_datatype); - impl_integer_builder_test!(UInt64, uint64_datatype); - impl_integer_builder_test!(Int8, int8_datatype); - impl_integer_builder_test!(Int16, int16_datatype); - impl_integer_builder_test!(Int32, int32_datatype); - impl_integer_builder_test!(Int64, int64_datatype); - } - - #[test] - fn test_float_vector_builder() { - let data_type = ConcreteDataType::float32_datatype(); - let mut builder = VectorBuilder::new(data_type.clone()); - assert_eq!(data_type, builder.data_type()); - - builder.push(&Value::Float32(OrderedFloat(1.0))); - - let result = builder.try_push_ref(ValueRef::Boolean(true)); - assert!(result.is_err()); - assert_eq!( - result.unwrap_err().to_string(), - "Failed to cast value Boolean(true) to primitive type Float32" - ); - - builder - .try_push_ref(ValueRef::Float32(OrderedFloat(2.0))) - .unwrap(); - builder.try_push_ref(ValueRef::Null).unwrap(); - - let vector = builder.finish(); - assert_eq!(Value::Float32(OrderedFloat(1.0)), vector.get(0)); - assert_eq!(Value::Float32(OrderedFloat(2.0)), vector.get(1)); - assert_eq!(Value::Null, vector.get(2)); - - let mut builder = VectorBuilder::new(ConcreteDataType::float64_datatype()); - builder.push(&Value::Float64(OrderedFloat(2.0))); - - let result = builder.try_push_ref(ValueRef::Boolean(true)); - assert!(result.is_err()); - assert_eq!( - result.unwrap_err().to_string(), - "Failed to cast value Boolean(true) to primitive type Float64" - ); - - builder - .try_push_ref(ValueRef::Float64(OrderedFloat(3.0))) - .unwrap(); - builder.try_push_ref(ValueRef::Null).unwrap(); - - let vector = builder.finish(); - 
assert_eq!(Value::Float64(OrderedFloat(2.0)), vector.get(0)); - assert_eq!(Value::Float64(OrderedFloat(3.0)), vector.get(1)); - assert_eq!(Value::Null, vector.get(2)); - } - - #[test] - fn test_binary_vector_builder() { - let data_type = ConcreteDataType::binary_datatype(); - let hello: &[u8] = b"hello"; - let mut builder = VectorBuilder::new(data_type.clone()); - assert_eq!(data_type, builder.data_type()); - builder.push(&Value::Binary(hello.into())); - - let result = builder.try_push_ref(ValueRef::Boolean(true)); - assert!(result.is_err()); - assert_eq!( - result.unwrap_err().to_string(), - "Failed to cast value ref Boolean(true) to Binary" - ); - - builder.try_push_ref(ValueRef::Binary(b"world")).unwrap(); - builder.try_push_ref(ValueRef::Null).unwrap(); - - let vector = builder.finish(); - assert_eq!(Value::Binary(hello.into()), vector.get(0)); - assert_eq!(ValueRef::Binary(b"world"), vector.get_ref(1)); - assert_eq!(Value::Null, vector.get(2)); - } - - #[test] - fn test_string_vector_builder() { - let data_type = ConcreteDataType::string_datatype(); - let hello = "hello"; - let mut builder = VectorBuilder::new(data_type.clone()); - assert_eq!(data_type, builder.data_type()); - builder.push(&Value::String(hello.into())); - - let result = builder.try_push_ref(ValueRef::Boolean(true)); - assert!(result.is_err()); - assert_eq!( - result.unwrap_err().to_string(), - "Failed to cast value ref Boolean(true) to String" - ); - - builder.try_push_ref(ValueRef::String("world")).unwrap(); - builder.try_push_ref(ValueRef::Null).unwrap(); - - let vector = builder.finish(); - assert_eq!(Value::String(hello.into()), vector.get(0)); - assert_eq!(ValueRef::String("world"), vector.get_ref(1)); - assert_eq!(Value::Null, vector.get(2)); - } - - #[test] - pub fn test_date_vector_builder() { - let mut builder = VectorBuilder::with_capacity(ConcreteDataType::date_datatype(), 3); - assert_eq!(ConcreteDataType::date_datatype(), builder.data_type()); - builder.push_null(); - builder.push(&Value::Date(Date::new(123))); - - let result = builder.try_push_ref(ValueRef::Boolean(true)); - assert!(result.is_err()); - assert_eq!( - result.unwrap_err().to_string(), - "Failed to cast value ref Boolean(true) to Date" - ); - - builder - .try_push_ref(ValueRef::Date(Date::new(456))) - .unwrap(); - builder.try_push_ref(ValueRef::Null).unwrap(); - - let v = builder.finish(); - let v = v.as_any().downcast_ref::().unwrap(); - assert_eq!(Value::Null, v.get(0)); - assert_eq!(Value::Date(Date::new(123)), v.get(1)); - assert_eq!(ValueRef::Date(Date::new(456)), v.get_ref(2)); - assert_eq!(ValueRef::Null, v.get_ref(3)); - assert_eq!( - &arrow::datatypes::DataType::Date32, - v.to_arrow_array().data_type() - ); - } - - #[test] - pub fn test_datetime_vector_builder() { - let mut builder = VectorBuilder::with_capacity(ConcreteDataType::datetime_datatype(), 3); - assert_eq!(ConcreteDataType::datetime_datatype(), builder.data_type()); - builder.push_null(); - builder.push(&Value::DateTime(DateTime::new(123))); - - let result = builder.try_push_ref(ValueRef::Boolean(true)); - assert!(result.is_err()); - assert_eq!( - result.unwrap_err().to_string(), - "Failed to cast value ref Boolean(true) to DateTime" - ); - - builder - .try_push_ref(ValueRef::DateTime(DateTime::new(456))) - .unwrap(); - builder.try_push_ref(ValueRef::Null).unwrap(); - - let v = builder.finish(); - let v = v.as_any().downcast_ref::().unwrap(); - assert_eq!(Value::Null, v.get(0)); - assert_eq!(Value::DateTime(DateTime::new(123)), v.get(1)); - 
assert_eq!(ValueRef::DateTime(DateTime::new(456)), v.get_ref(2)); - assert_eq!(ValueRef::Null, v.get_ref(3)); - assert_eq!( - &arrow::datatypes::DataType::Date64, - v.to_arrow_array().data_type() - ); - } -} diff --git a/src/datatypes/src/vectors/constant.rs b/src/datatypes/src/vectors/constant.rs index d5522007a125..87739e91318b 100644 --- a/src/datatypes/src/vectors/constant.rs +++ b/src/datatypes/src/vectors/constant.rs @@ -55,6 +55,27 @@ impl ConstantVector { pub fn get_constant_ref(&self) -> ValueRef { self.vector.get_ref(0) } + + pub(crate) fn replicate_vector(&self, offsets: &[usize]) -> VectorRef { + assert_eq!(offsets.len(), self.len()); + + if offsets.is_empty() { + return self.slice(0, 0); + } + + Arc::new(ConstantVector::new( + self.vector.clone(), + *offsets.last().unwrap(), + )) + } + + pub(crate) fn filter_vector(&self, filter: &BooleanVector) -> Result { + let length = self.len() - filter.false_count(); + if length == self.len() { + return Ok(Arc::new(self.clone())); + } + Ok(Arc::new(ConstantVector::new(self.inner().clone(), length))) + } } impl Vector for ConstantVector { @@ -90,9 +111,9 @@ impl Vector for ConstantVector { fn validity(&self) -> Validity { if self.vector.is_null(0) { - Validity::AllNull + Validity::all_null(self.length) } else { - Validity::AllValid + Validity::all_valid(self.length) } } @@ -122,6 +143,14 @@ impl Vector for ConstantVector { fn get_ref(&self, _index: usize) -> ValueRef { self.vector.get_ref(0) } + + fn null_count(&self) -> usize { + if self.only_null() { + self.len() + } else { + 0 + } + } } impl fmt::Debug for ConstantVector { @@ -140,33 +169,6 @@ impl Serializable for ConstantVector { } } -pub(crate) fn replicate_constant(vector: &ConstantVector, offsets: &[usize]) -> VectorRef { - assert_eq!(offsets.len(), vector.len()); - - if offsets.is_empty() { - return vector.slice(0, 0); - } - - Arc::new(ConstantVector::new( - vector.vector.clone(), - *offsets.last().unwrap(), - )) -} - -pub(crate) fn filter_constant( - vector: &ConstantVector, - filter: &BooleanVector, -) -> Result { - let length = filter.len() - filter.as_boolean_array().values().null_count(); - if length == vector.len() { - return Ok(Arc::new(vector.clone())); - } - Ok(Arc::new(ConstantVector::new( - vector.inner().clone(), - length, - ))) -} - #[cfg(test)] mod tests { use arrow::datatypes::DataType as ArrowDataType; @@ -182,9 +184,9 @@ mod tests { assert_eq!("ConstantVector", c.vector_type_name()); assert!(c.is_const()); assert_eq!(10, c.len()); - assert_eq!(Validity::AllValid, c.validity()); + assert!(c.validity().is_all_valid()); assert!(!c.only_null()); - assert_eq!(4, c.memory_size()); + assert_eq!(64, c.memory_size()); for i in 0..10 { assert!(!c.is_null(i)); diff --git a/src/datatypes/src/vectors/date.rs b/src/datatypes/src/vectors/date.rs index 0198b3622f34..d0a66b80fb63 100644 --- a/src/datatypes/src/vectors/date.rs +++ b/src/datatypes/src/vectors/date.rs @@ -12,258 +12,28 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::any::Any; -use std::sync::Arc; +use crate::types::DateType; +use crate::vectors::{PrimitiveVector, PrimitiveVectorBuilder}; -use arrow::array::{Array, ArrayRef, PrimitiveArray}; -use common_time::date::Date; -use snafu::OptionExt; - -use crate::data_type::ConcreteDataType; -use crate::error::{self, Result}; -use crate::prelude::*; -use crate::scalars::ScalarVector; -use crate::serialize::Serializable; -use crate::vectors::{MutableVector, PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder}; - -#[derive(Debug, Clone, PartialEq)] -pub struct DateVector { - array: PrimitiveVector, -} - -impl DateVector { - pub fn new(array: PrimitiveArray) -> Self { - Self { - array: PrimitiveVector { array }, - } - } - - pub fn try_from_arrow_array(array: impl AsRef) -> Result { - Ok(Self::new( - array - .as_ref() - .as_any() - .downcast_ref::>() - .with_context(|| error::ConversionSnafu { - from: format!("{:?}", array.as_ref().data_type()), - })? - .clone(), - )) - } - - pub(crate) fn as_arrow(&self) -> &dyn Array { - self.array.as_arrow() - } -} - -impl Vector for DateVector { - fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::date_datatype() - } - - fn vector_type_name(&self) -> String { - "DateVector".to_string() - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn len(&self) -> usize { - self.array.len() - } - - fn to_arrow_array(&self) -> ArrayRef { - let validity = self.array.array.validity().cloned(); - let buffer = self.array.array.values().clone(); - Arc::new(PrimitiveArray::new( - arrow::datatypes::DataType::Date32, - buffer, - validity, - )) - } - - fn to_boxed_arrow_array(&self) -> Box { - let validity = self.array.array.validity().cloned(); - let buffer = self.array.array.values().clone(); - Box::new(PrimitiveArray::new( - arrow::datatypes::DataType::Date32, - buffer, - validity, - )) - } - - fn validity(&self) -> Validity { - self.array.validity() - } - - fn memory_size(&self) -> usize { - self.array.memory_size() - } - - fn is_null(&self, row: usize) -> bool { - self.array.is_null(row) - } - - fn slice(&self, offset: usize, length: usize) -> VectorRef { - Arc::new(Self { - array: PrimitiveVector::new(self.array.array.slice(offset, length)), - }) - } - - fn get(&self, index: usize) -> Value { - match self.array.get(index) { - Value::Int32(v) => Value::Date(Date::new(v)), - Value::Null => Value::Null, - _ => { - unreachable!() - } - } - } - - fn get_ref(&self, index: usize) -> ValueRef { - match self.array.get(index) { - Value::Int32(v) => ValueRef::Date(Date::new(v)), - Value::Null => ValueRef::Null, - _ => { - unreachable!() - } - } - } -} - -impl From>> for DateVector { - fn from(data: Vec>) -> Self { - Self { - array: PrimitiveVector::::from(data), - } - } -} - -pub struct DateIter<'a> { - iter: PrimitiveIter<'a, i32>, -} - -impl<'a> Iterator for DateIter<'a> { - type Item = Option; - - fn next(&mut self) -> Option { - self.iter.next().map(|v| v.map(Date::new)) - } -} - -impl ScalarVector for DateVector { - type OwnedItem = Date; - type RefItem<'a> = Date; - type Iter<'a> = DateIter<'a>; - - type Builder = DateVectorBuilder; - - fn get_data(&self, idx: usize) -> Option> { - self.array.get_data(idx).map(Date::new) - } - - fn iter_data(&self) -> Self::Iter<'_> { - DateIter { - iter: self.array.iter_data(), - } - } -} - -impl Serializable for DateVector { - fn serialize_to_json(&self) -> Result> { - Ok(self - .array - .iter_data() - .map(|v| v.map(Date::new)) - .map(|v| match v { - None => serde_json::Value::Null, - Some(v) => v.into(), - }) - .collect::>()) - } 
-} - -pub struct DateVectorBuilder { - buffer: PrimitiveVectorBuilder, -} - -impl MutableVector for DateVectorBuilder { - fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::date_datatype() - } - - fn len(&self) -> usize { - self.buffer.len() - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn as_mut_any(&mut self) -> &mut dyn Any { - self - } - - fn to_vector(&mut self) -> VectorRef { - Arc::new(self.finish()) - } - - fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { - self.buffer.push(value.as_date()?.map(|d| d.val())); - Ok(()) - } - - fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { - let concrete_vector = vector - .as_any() - .downcast_ref::() - .with_context(|| error::CastTypeSnafu { - msg: format!( - "Failed to convert vector from {} to DateVector", - vector.vector_type_name() - ), - })?; - self.buffer - .extend_slice_of(&concrete_vector.array, offset, length)?; - Ok(()) - } -} - -impl ScalarVectorBuilder for DateVectorBuilder { - type VectorType = DateVector; - - fn with_capacity(capacity: usize) -> Self { - Self { - buffer: PrimitiveVectorBuilder::with_capacity(capacity), - } - } - - fn push(&mut self, value: Option<::RefItem<'_>>) { - self.buffer.push(value.map(|d| d.val())) - } - - fn finish(&mut self) -> Self::VectorType { - Self::VectorType { - array: self.buffer.finish(), - } - } -} - -pub(crate) fn replicate_date(vector: &DateVector, offsets: &[usize]) -> VectorRef { - let array = crate::vectors::primitive::replicate_primitive_with_type( - &vector.array, - offsets, - vector.data_type(), - ); - Arc::new(DateVector { array }) -} +/// Vector for [`Date`](common_time::Date). +pub type DateVector = PrimitiveVector; +/// Builder to build [`DateVector`]. +pub type DateVectorBuilder = PrimitiveVectorBuilder; #[cfg(test)] mod tests { + use std::sync::Arc; + + use arrow::array::Array; + use common_time::date::Date; + use super::*; use crate::data_type::DataType; + use crate::scalars::{ScalarVector, ScalarVectorBuilder}; + use crate::serialize::Serializable; use crate::types::DateType; + use crate::value::{Value, ValueRef}; + use crate::vectors::{Vector, VectorRef}; #[test] fn test_build_date_vector() { @@ -288,7 +58,7 @@ mod tests { #[test] fn test_date_scalar() { - let vector = DateVector::from_slice(&[Date::new(1), Date::new(2)]); + let vector = DateVector::from_slice(&[1, 2]); assert_eq!(2, vector.len()); assert_eq!(Some(Date::new(1)), vector.get_data(0)); assert_eq!(Some(Date::new(2)), vector.get_data(1)); @@ -296,7 +66,7 @@ mod tests { #[test] fn test_date_vector_builder() { - let input = DateVector::from_slice(&[Date::new(1), Date::new(2), Date::new(3)]); + let input = DateVector::from_slice(&[1, 2, 3]); let mut builder = DateType::default().create_mutable_vector(3); builder @@ -309,19 +79,25 @@ mod tests { .is_err()); let vector = builder.to_vector(); - let expect: VectorRef = Arc::new(DateVector::from_slice(&[ - Date::new(5), - Date::new(2), - Date::new(3), - ])); + let expect: VectorRef = Arc::new(DateVector::from_slice(&[5, 2, 3])); assert_eq!(expect, vector); } #[test] fn test_date_from_arrow() { - let vector = DateVector::from_slice(&[Date::new(1), Date::new(2)]); + let vector = DateVector::from_slice(&[1, 2]); let arrow = vector.as_arrow().slice(0, vector.len()); let vector2 = DateVector::try_from_arrow_array(&arrow).unwrap(); assert_eq!(vector, vector2); } + + #[test] + fn test_serialize_date_vector() { + let vector = DateVector::from_slice(&[-1, 0, 1]); + let serialized_json =
serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(); + assert_eq!( + r#"["1969-12-31","1970-01-01","1970-01-02"]"#, + serialized_json + ); + } } diff --git a/src/datatypes/src/vectors/datetime.rs b/src/datatypes/src/vectors/datetime.rs index 732e56004c33..a40a3e54d330 100644 --- a/src/datatypes/src/vectors/datetime.rs +++ b/src/datatypes/src/vectors/datetime.rs @@ -12,264 +12,32 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::any::Any; -use std::sync::Arc; +use crate::types::DateTimeType; +use crate::vectors::{PrimitiveVector, PrimitiveVectorBuilder}; -use arrow::array::{Array, ArrayRef, PrimitiveArray}; -use common_time::datetime::DateTime; -use snafu::OptionExt; - -use crate::data_type::ConcreteDataType; -use crate::error::{self, Result}; -use crate::prelude::{ - MutableVector, ScalarVector, ScalarVectorBuilder, Validity, Value, ValueRef, Vector, VectorRef, -}; -use crate::serialize::Serializable; -use crate::vectors::{PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder}; - -#[derive(Debug, Clone, PartialEq)] -pub struct DateTimeVector { - array: PrimitiveVector, -} - -impl DateTimeVector { - pub fn new(array: PrimitiveArray) -> Self { - Self { - array: PrimitiveVector { array }, - } - } - - pub fn try_from_arrow_array(array: impl AsRef) -> Result { - Ok(Self::new( - array - .as_ref() - .as_any() - .downcast_ref::>() - .with_context(|| error::ConversionSnafu { - from: format!("{:?}", array.as_ref().data_type()), - })? - .clone(), - )) - } - - pub(crate) fn as_arrow(&self) -> &dyn Array { - self.array.as_arrow() - } -} - -impl Vector for DateTimeVector { - fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::datetime_datatype() - } - - fn vector_type_name(&self) -> String { - "DateTimeVector".to_string() - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn len(&self) -> usize { - self.array.len() - } - - fn to_arrow_array(&self) -> ArrayRef { - let validity = self.array.array.validity().cloned(); - let buffer = self.array.array.values().clone(); - Arc::new(PrimitiveArray::new( - arrow::datatypes::DataType::Date64, - buffer, - validity, - )) - } - - fn to_boxed_arrow_array(&self) -> Box { - let validity = self.array.array.validity().cloned(); - let buffer = self.array.array.values().clone(); - Box::new(PrimitiveArray::new( - arrow::datatypes::DataType::Date64, - buffer, - validity, - )) - } - - fn validity(&self) -> Validity { - self.array.validity() - } - - fn memory_size(&self) -> usize { - self.array.memory_size() - } - - fn is_null(&self, row: usize) -> bool { - self.array.is_null(row) - } - - fn slice(&self, offset: usize, length: usize) -> VectorRef { - Arc::new(Self { - array: PrimitiveVector::new(self.array.array.slice(offset, length)), - }) - } - - fn get(&self, index: usize) -> Value { - match self.array.get(index) { - Value::Int64(v) => Value::DateTime(DateTime::new(v)), - Value::Null => Value::Null, - _ => { - unreachable!() - } - } - } - - fn get_ref(&self, index: usize) -> ValueRef { - match self.array.get(index) { - Value::Int64(v) => ValueRef::DateTime(DateTime::new(v)), - Value::Null => ValueRef::Null, - _ => { - unreachable!() - } - } - } -} - -impl Serializable for DateTimeVector { - fn serialize_to_json(&self) -> crate::Result> { - Ok(self - .array - .iter_data() - .map(|v| v.map(DateTime::new)) - .map(|v| match v { - None => serde_json::Value::Null, - Some(v) => v.into(), - }) - .collect::>()) - } -} - -impl From>> for DateTimeVector { - fn from(data: Vec>) -> Self { 
- Self { - array: PrimitiveVector::::from(data), - } - } -} - -pub struct DateTimeVectorBuilder { - buffer: PrimitiveVectorBuilder, -} - -impl ScalarVectorBuilder for DateTimeVectorBuilder { - type VectorType = DateTimeVector; - - fn with_capacity(capacity: usize) -> Self { - Self { - buffer: PrimitiveVectorBuilder::with_capacity(capacity), - } - } - - fn push(&mut self, value: Option<::RefItem<'_>>) { - self.buffer.push(value.map(|d| d.val())) - } - - fn finish(&mut self) -> Self::VectorType { - Self::VectorType { - array: self.buffer.finish(), - } - } -} - -impl MutableVector for DateTimeVectorBuilder { - fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::datetime_datatype() - } - - fn len(&self) -> usize { - self.buffer.len() - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn as_mut_any(&mut self) -> &mut dyn Any { - self - } - - fn to_vector(&mut self) -> VectorRef { - Arc::new(self.finish()) - } - - fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { - self.buffer.push(value.as_datetime()?.map(|d| d.val())); - Ok(()) - } - - fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { - let concrete_vector = vector - .as_any() - .downcast_ref::() - .with_context(|| error::CastTypeSnafu { - msg: format!( - "Failed to convert vector from {} to DateVector", - vector.vector_type_name() - ), - })?; - self.buffer - .extend_slice_of(&concrete_vector.array, offset, length)?; - Ok(()) - } -} - -pub struct DateTimeIter<'a> { - iter: PrimitiveIter<'a, i64>, -} - -impl<'a> Iterator for DateTimeIter<'a> { - type Item = Option; - - fn next(&mut self) -> Option { - self.iter.next().map(|v| v.map(DateTime::new)) - } -} - -impl ScalarVector for DateTimeVector { - type OwnedItem = DateTime; - type RefItem<'a> = DateTime; - type Iter<'a> = DateTimeIter<'a>; - type Builder = DateTimeVectorBuilder; - - fn get_data(&self, idx: usize) -> Option> { - self.array.get_data(idx).map(DateTime::new) - } - - fn iter_data(&self) -> Self::Iter<'_> { - DateTimeIter { - iter: self.array.iter_data(), - } - } -} - -pub(crate) fn replicate_datetime(vector: &DateTimeVector, offsets: &[usize]) -> VectorRef { - let array = crate::vectors::primitive::replicate_primitive_with_type( - &vector.array, - offsets, - vector.data_type(), - ); - Arc::new(DateTimeVector { array }) -} +/// Vector of [`DateTime`](common_time::DateTime). +pub type DateTimeVector = PrimitiveVector; +/// Builder for [`DateTimeVector`].
+pub type DateTimeVectorBuilder = PrimitiveVectorBuilder; #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::sync::Arc; + + use arrow::array::{Array, PrimitiveArray}; + use common_time::DateTime; + use datafusion_common::from_slice::FromSlice; use super::*; use crate::data_type::DataType; - use crate::types::DateTimeType; + use crate::prelude::{ + ConcreteDataType, ScalarVector, ScalarVectorBuilder, Value, ValueRef, Vector, VectorRef, + }; + use crate::serialize::Serializable; #[test] fn test_datetime_vector() { - let v = DateTimeVector::new(PrimitiveArray::from_vec(vec![1, 2, 3])); + let v = DateTimeVector::new(PrimitiveArray::from_slice(&[1, 2, 3])); assert_eq!(ConcreteDataType::datetime_datatype(), v.data_type()); assert_eq!(3, v.len()); assert_eq!("DateTimeVector", v.vector_type_name()); @@ -287,9 +55,8 @@ mod tests { assert_eq!(Some(DateTime::new(2)), iter.next().unwrap()); assert_eq!(Some(DateTime::new(3)), iter.next().unwrap()); assert!(!v.is_null(0)); - assert_eq!(24, v.memory_size()); // size of i64 * 3 + assert_eq!(64, v.memory_size()); - assert_matches!(v.validity(), Validity::AllValid); if let Value::DateTime(d) = v.get(0) { assert_eq!(1, d.val()); } else { @@ -314,8 +81,11 @@ mod tests { assert_eq!(Value::Null, v.get(1)); assert_eq!(Value::DateTime(DateTime::new(-1)), v.get(2)); - let input = - DateTimeVector::from_slice(&[DateTime::new(1), DateTime::new(2), DateTime::new(3)]); + let input = DateTimeVector::from_wrapper_slice(&[ + DateTime::new(1), + DateTime::new(2), + DateTime::new(3), + ]); let mut builder = DateTimeType::default().create_mutable_vector(3); builder @@ -328,7 +98,7 @@ mod tests { .is_err()); let vector = builder.to_vector(); - let expect: VectorRef = Arc::new(DateTimeVector::from_slice(&[ + let expect: VectorRef = Arc::new(DateTimeVector::from_wrapper_slice(&[ DateTime::new(5), DateTime::new(2), DateTime::new(3), @@ -338,7 +108,7 @@ mod tests { #[test] fn test_datetime_from_arrow() { - let vector = DateTimeVector::from_slice(&[DateTime::new(1), DateTime::new(2)]); + let vector = DateTimeVector::from_wrapper_slice(&[DateTime::new(1), DateTime::new(2)]); let arrow = vector.as_arrow().slice(0, vector.len()); let vector2 = DateTimeVector::try_from_arrow_array(&arrow).unwrap(); assert_eq!(vector, vector2); diff --git a/src/datatypes/src/vectors/eq.rs b/src/datatypes/src/vectors/eq.rs index d47167c3f93e..55359026d479 100644 --- a/src/datatypes/src/vectors/eq.rs +++ b/src/datatypes/src/vectors/eq.rs @@ -15,9 +15,12 @@ use std::sync::Arc; use crate::data_type::DataType; +use crate::types::TimestampType; +use crate::vectors::constant::ConstantVector; use crate::vectors::{ - BinaryVector, BooleanVector, ConstantVector, DateTimeVector, DateVector, ListVector, - PrimitiveVector, StringVector, TimestampVector, Vector, + BinaryVector, BooleanVector, DateTimeVector, DateVector, ListVector, PrimitiveVector, + StringVector, TimestampMicrosecondVector, TimestampMillisecondVector, + TimestampNanosecondVector, TimestampSecondVector, Vector, }; use crate::with_match_primitive_type_id; @@ -76,7 +79,20 @@ fn equal(lhs: &dyn Vector, rhs: &dyn Vector) -> bool { String(_) => is_vector_eq!(StringVector, lhs, rhs), Date(_) => is_vector_eq!(DateVector, lhs, rhs), DateTime(_) => is_vector_eq!(DateTimeVector, lhs, rhs), - Timestamp(_) => is_vector_eq!(TimestampVector, lhs, rhs), + Timestamp(t) => match t { + TimestampType::Second(_) => { + is_vector_eq!(TimestampSecondVector, lhs, rhs) + } + TimestampType::Millisecond(_) => { + 
is_vector_eq!(TimestampMillisecondVector, lhs, rhs) + } + TimestampType::Microsecond(_) => { + is_vector_eq!(TimestampMicrosecondVector, lhs, rhs) + } + TimestampType::Nanosecond(_) => { + is_vector_eq!(TimestampNanosecondVector, lhs, rhs) + } + }, List(_) => is_vector_eq!(ListVector, lhs, rhs), UInt8(_) | UInt16(_) | UInt32(_) | UInt64(_) | Int8(_) | Int16(_) | Int32(_) | Int64(_) | Float32(_) | Float64(_) => { @@ -95,13 +111,10 @@ fn equal(lhs: &dyn Vector, rhs: &dyn Vector) -> bool { #[cfg(test)] mod tests { - use arrow::array::{ListArray, MutableListArray, MutablePrimitiveArray, TryExtend}; - use super::*; use crate::vectors::{ - Float32Vector, Float64Vector, Int16Vector, Int32Vector, Int64Vector, Int8Vector, - NullVector, TimestampVector, UInt16Vector, UInt32Vector, UInt64Vector, UInt8Vector, - VectorRef, + list, Float32Vector, Float64Vector, Int16Vector, Int32Vector, Int64Vector, Int8Vector, + NullVector, UInt16Vector, UInt32Vector, UInt64Vector, UInt8Vector, VectorRef, }; fn assert_vector_ref_eq(vector: VectorRef) { @@ -132,14 +145,21 @@ mod tests { assert_vector_ref_eq(Arc::new(BooleanVector::from(vec![true, false]))); assert_vector_ref_eq(Arc::new(DateVector::from(vec![Some(100), Some(120)]))); assert_vector_ref_eq(Arc::new(DateTimeVector::from(vec![Some(100), Some(120)]))); - assert_vector_ref_eq(Arc::new(TimestampVector::from_values([100, 120]))); - - let mut arrow_array = MutableListArray::>::new(); - arrow_array - .try_extend(vec![Some(vec![Some(1), Some(2), Some(3)])]) - .unwrap(); - let arrow_array: ListArray = arrow_array.into(); - assert_vector_ref_eq(Arc::new(ListVector::from(arrow_array))); + assert_vector_ref_eq(Arc::new(TimestampSecondVector::from_values([100, 120]))); + assert_vector_ref_eq(Arc::new(TimestampMillisecondVector::from_values([ + 100, 120, + ]))); + assert_vector_ref_eq(Arc::new(TimestampMicrosecondVector::from_values([ + 100, 120, + ]))); + assert_vector_ref_eq(Arc::new(TimestampNanosecondVector::from_values([100, 120]))); + + let list_vector = list::tests::new_list_vector(&[ + Some(vec![Some(1), Some(2)]), + None, + Some(vec![Some(3), Some(4)]), + ]); + assert_vector_ref_eq(Arc::new(list_vector)); assert_vector_ref_eq(Arc::new(NullVector::new(4))); assert_vector_ref_eq(Arc::new(StringVector::from(vec![ diff --git a/src/datatypes/src/vectors/helper.rs b/src/datatypes/src/vectors/helper.rs index 60a9f8511fab..f3236ca0ec42 100644 --- a/src/datatypes/src/vectors/helper.rs +++ b/src/datatypes/src/vectors/helper.rs @@ -17,19 +17,26 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::Array; +use arrow::array::{Array, ArrayRef, StringArray}; use arrow::compute; -use arrow::datatypes::DataType as ArrowDataType; +use arrow::compute::kernels::comparison; +use arrow::datatypes::{DataType as ArrowDataType, TimeUnit}; use datafusion_common::ScalarValue; use snafu::{OptionExt, ResultExt}; -use crate::arrow_array::StringArray; -use crate::error::{ConversionSnafu, Result, UnknownVectorSnafu}; -use crate::scalars::*; -use crate::vectors::date::DateVector; -use crate::vectors::datetime::DateTimeVector; -use crate::vectors::*; +use crate::data_type::ConcreteDataType; +use crate::error::{self, Result}; +use crate::scalars::{Scalar, ScalarVectorBuilder}; +use crate::value::{ListValue, ListValueRef}; +use crate::vectors::{ + BinaryVector, BooleanVector, ConstantVector, DateTimeVector, DateVector, Float32Vector, + Float64Vector, Int16Vector, Int32Vector, Int64Vector, Int8Vector, ListVector, + ListVectorBuilder, MutableVector, NullVector, StringVector, 
TimestampMicrosecondVector, + TimestampMillisecondVector, TimestampNanosecondVector, TimestampSecondVector, UInt16Vector, + UInt32Vector, UInt64Vector, UInt8Vector, Vector, VectorRef, +}; +/// Helper functions for `Vector`. pub struct Helper; impl Helper { @@ -47,7 +54,7 @@ impl Helper { let arr = vector .as_any() .downcast_ref::<::VectorType>() - .with_context(|| UnknownVectorSnafu { + .with_context(|| error::UnknownVectorSnafu { msg: format!( "downcast vector error, vector type: {:?}, expected vector: {:?}", vector.vector_type_name(), @@ -61,7 +68,7 @@ impl Helper { let arr = vector .as_any() .downcast_ref::() - .with_context(|| UnknownVectorSnafu { + .with_context(|| error::UnknownVectorSnafu { msg: format!( "downcast vector error, vector type: {:?}, expected vector: {:?}", vector.vector_type_name(), @@ -78,7 +85,7 @@ impl Helper { let arr = vector .as_mut_any() .downcast_mut() - .with_context(|| UnknownVectorSnafu { + .with_context(|| error::UnknownVectorSnafu { msg: format!( "downcast vector error, vector type: {:?}, expected vector: {:?}", ty, @@ -94,7 +101,7 @@ impl Helper { let arr = vector .as_any() .downcast_ref::<::VectorType>() - .with_context(|| UnknownVectorSnafu { + .with_context(|| error::UnknownVectorSnafu { msg: format!( "downcast vector error, vector type: {:?}, expected vector: {:?}", vector.vector_type_name(), @@ -105,11 +112,9 @@ impl Helper { } /// Try to cast an arrow scalar value into vector - /// - /// # Panics - /// Panic if given scalar value is not supported. pub fn try_from_scalar_value(value: ScalarValue, length: usize) -> Result { let vector = match value { + ScalarValue::Null => ConstantVector::new(Arc::new(NullVector::new(1)), length), ScalarValue::Boolean(v) => { ConstantVector::new(Arc::new(BooleanVector::from(vec![v])), length) } @@ -143,17 +148,29 @@ impl Helper { ScalarValue::UInt64(v) => { ConstantVector::new(Arc::new(UInt64Vector::from(vec![v])), length) } - ScalarValue::Utf8(v) => { - ConstantVector::new(Arc::new(StringVector::from(vec![v])), length) - } - ScalarValue::LargeUtf8(v) => { + ScalarValue::Utf8(v) | ScalarValue::LargeUtf8(v) => { ConstantVector::new(Arc::new(StringVector::from(vec![v])), length) } - ScalarValue::Binary(v) => { + ScalarValue::Binary(v) + | ScalarValue::LargeBinary(v) + | ScalarValue::FixedSizeBinary(_, v) => { ConstantVector::new(Arc::new(BinaryVector::from(vec![v])), length) } - ScalarValue::LargeBinary(v) => { - ConstantVector::new(Arc::new(BinaryVector::from(vec![v])), length) + ScalarValue::List(v, field) => { + let item_type = ConcreteDataType::try_from(field.data_type())?; + let mut builder = ListVectorBuilder::with_type_capacity(item_type.clone(), 1); + if let Some(values) = v { + let values = values + .into_iter() + .map(ScalarValue::try_into) + .collect::>()?; + let list_value = ListValue::new(Some(Box::new(values)), item_type); + builder.push(Some(ListValueRef::Ref { val: &list_value })); + } else { + builder.push(None); + } + let list_vector = builder.to_vector(); + ConstantVector::new(list_vector, length) } ScalarValue::Date32(v) => { ConstantVector::new(Arc::new(DateVector::from(vec![v])), length) @@ -161,8 +178,30 @@ impl Helper { ScalarValue::Date64(v) => { ConstantVector::new(Arc::new(DateTimeVector::from(vec![v])), length) } - _ => { - return ConversionSnafu { + ScalarValue::TimestampSecond(v, _) => { + // Timezone is unimplemented now. + ConstantVector::new(Arc::new(TimestampSecondVector::from(vec![v])), length) + } + ScalarValue::TimestampMillisecond(v, _) => { + // Timezone is unimplemented now. 
+ ConstantVector::new(Arc::new(TimestampMillisecondVector::from(vec![v])), length) + } + ScalarValue::TimestampMicrosecond(v, _) => { + // Timezone is unimplemented now. + ConstantVector::new(Arc::new(TimestampMicrosecondVector::from(vec![v])), length) + } + ScalarValue::TimestampNanosecond(v, _) => { + // Timezone is unimplemented now. + ConstantVector::new(Arc::new(TimestampNanosecondVector::from(vec![v])), length) + } + ScalarValue::Decimal128(_, _, _) + | ScalarValue::Time64(_) + | ScalarValue::IntervalYearMonth(_) + | ScalarValue::IntervalDayTime(_) + | ScalarValue::IntervalMonthDayNano(_) + | ScalarValue::Struct(_, _) + | ScalarValue::Dictionary(_, _) => { + return error::ConversionSnafu { from: format!("Unsupported scalar value: {}", value), } .fail() @@ -180,9 +219,7 @@ impl Helper { Ok(match array.as_ref().data_type() { ArrowDataType::Null => Arc::new(NullVector::try_from_arrow_array(array)?), ArrowDataType::Boolean => Arc::new(BooleanVector::try_from_arrow_array(array)?), - ArrowDataType::Binary | ArrowDataType::LargeBinary => { - Arc::new(BinaryVector::try_from_arrow_array(array)?) - } + ArrowDataType::LargeBinary => Arc::new(BinaryVector::try_from_arrow_array(array)?), ArrowDataType::Int8 => Arc::new(Int8Vector::try_from_arrow_array(array)?), ArrowDataType::Int16 => Arc::new(Int16Vector::try_from_arrow_array(array)?), ArrowDataType::Int32 => Arc::new(Int32Vector::try_from_arrow_array(array)?), @@ -193,48 +230,80 @@ impl Helper { ArrowDataType::UInt64 => Arc::new(UInt64Vector::try_from_arrow_array(array)?), ArrowDataType::Float32 => Arc::new(Float32Vector::try_from_arrow_array(array)?), ArrowDataType::Float64 => Arc::new(Float64Vector::try_from_arrow_array(array)?), - ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => { - Arc::new(StringVector::try_from_arrow_array(array)?) - } + ArrowDataType::Utf8 => Arc::new(StringVector::try_from_arrow_array(array)?), ArrowDataType::Date32 => Arc::new(DateVector::try_from_arrow_array(array)?), ArrowDataType::Date64 => Arc::new(DateTimeVector::try_from_arrow_array(array)?), ArrowDataType::List(_) => Arc::new(ListVector::try_from_arrow_array(array)?), - ArrowDataType::Timestamp(_, _) => { - Arc::new(TimestampVector::try_from_arrow_array(array)?) + ArrowDataType::Timestamp(unit, _) => match unit { + TimeUnit::Second => Arc::new(TimestampSecondVector::try_from_arrow_array(array)?), + TimeUnit::Millisecond => { + Arc::new(TimestampMillisecondVector::try_from_arrow_array(array)?) + } + TimeUnit::Microsecond => { + Arc::new(TimestampMicrosecondVector::try_from_arrow_array(array)?) + } + TimeUnit::Nanosecond => { + Arc::new(TimestampNanosecondVector::try_from_arrow_array(array)?) + } + }, + ArrowDataType::Float16 + | ArrowDataType::Time32(_) + | ArrowDataType::Time64(_) + | ArrowDataType::Duration(_) + | ArrowDataType::Interval(_) + | ArrowDataType::Binary + | ArrowDataType::FixedSizeBinary(_) + | ArrowDataType::LargeUtf8 + | ArrowDataType::LargeList(_) + | ArrowDataType::FixedSizeList(_, _) + | ArrowDataType::Struct(_) + | ArrowDataType::Union(_, _, _) + | ArrowDataType::Dictionary(_, _) + | ArrowDataType::Decimal128(_, _) + | ArrowDataType::Decimal256(_, _) + | ArrowDataType::Map(_, _) => { + unimplemented!("Arrow array datatype: {:?}", array.as_ref().data_type()) } - _ => unimplemented!("Arrow array datatype: {:?}", array.as_ref().data_type()), }) } + /// Try to cast slice of `arrays` to vectors. 
    pub fn try_into_vectors(arrays: &[ArrayRef]) -> Result<Vec<VectorRef>> {
         arrays.iter().map(Self::try_into_vector).collect()
     }
 
+    /// Perform SQL like operation on `names` and a scalar `s`.
     pub fn like_utf8(names: Vec<String>, s: &str) -> Result<VectorRef> {
-        let array = StringArray::from_slice(&names);
+        let array = StringArray::from(names);
 
-        let filter =
-            compute::like::like_utf8_scalar(&array, s).context(error::ArrowComputeSnafu)?;
+        let filter = comparison::like_utf8_scalar(&array, s).context(error::ArrowComputeSnafu)?;
 
-        let result = compute::filter::filter(&array, &filter).context(error::ArrowComputeSnafu)?;
+        let result = compute::filter(&array, &filter).context(error::ArrowComputeSnafu)?;
 
         Helper::try_into_vector(result)
     }
 }
 
 #[cfg(test)]
 mod tests {
-    use arrow::array::Int32Array;
-    use common_time::date::Date;
-    use common_time::datetime::DateTime;
+    use arrow::array::{
+        ArrayRef, BooleanArray, Date32Array, Date64Array, Float32Array, Float64Array, Int16Array,
+        Int32Array, Int64Array, Int8Array, LargeBinaryArray, ListArray, NullArray,
+        TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
+        TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
+    };
+    use arrow::datatypes::{Field, Int32Type};
+    use common_time::{Date, DateTime};
 
     use super::*;
+    use crate::value::Value;
+    use crate::vectors::ConcreteDataType;
 
     #[test]
     fn test_try_into_vectors() {
-        let arrays: Vec = vec![
-            Arc::new(Int32Array::from_vec(vec![1])),
-            Arc::new(Int32Array::from_vec(vec![2])),
-            Arc::new(Int32Array::from_vec(vec![3])),
+        let arrays: Vec<ArrayRef> = vec![
+            Arc::new(Int32Array::from(vec![1])),
+            Arc::new(Int32Array::from(vec![2])),
+            Arc::new(Int32Array::from(vec![3])),
         ];
         let vectors = Helper::try_into_vectors(&arrays);
         assert!(vectors.is_ok());
@@ -246,10 +315,10 @@
     }
 
     #[test]
-    pub fn test_try_into_date_vector() {
+    fn test_try_into_date_vector() {
         let vector = DateVector::from(vec![Some(1), Some(2), None]);
         let arrow_array = vector.to_arrow_array();
-        assert_eq!(&arrow::datatypes::DataType::Date32, arrow_array.data_type());
+        assert_eq!(&ArrowDataType::Date32, arrow_array.data_type());
         let vector_converted = Helper::try_into_vector(arrow_array).unwrap();
         assert_eq!(vector.len(), vector_converted.len());
         for i in 0..vector_converted.len() {
@@ -258,7 +327,7 @@
     }
 
     #[test]
-    pub fn test_try_from_scalar_date_value() {
+    fn test_try_from_scalar_date_value() {
         let vector = Helper::try_from_scalar_value(ScalarValue::Date32(Some(42)), 3).unwrap();
         assert_eq!(ConcreteDataType::date_datatype(), vector.data_type());
         assert_eq!(3, vector.len());
@@ -268,7 +337,7 @@
     }
 
     #[test]
-    pub fn test_try_from_scalar_datetime_value() {
+    fn test_try_from_scalar_datetime_value() {
         let vector = Helper::try_from_scalar_value(ScalarValue::Date64(Some(42)), 3).unwrap();
         assert_eq!(ConcreteDataType::datetime_datatype(), vector.data_type());
         assert_eq!(3, vector.len());
@@ -277,6 +346,28 @@
         }
     }
 
+    #[test]
+    fn test_try_from_list_value() {
+        let value = ScalarValue::List(
+            Some(vec![
+                ScalarValue::Int32(Some(1)),
+                ScalarValue::Int32(Some(2)),
+            ]),
+            Box::new(Field::new("item", ArrowDataType::Int32, true)),
+        );
+        let vector = Helper::try_from_scalar_value(value, 3).unwrap();
+        assert_eq!(
+            ConcreteDataType::list_datatype(ConcreteDataType::int32_datatype()),
+            vector.data_type()
+        );
+        assert_eq!(3, vector.len());
+        for i in 0..vector.len() {
+            let v = vector.get(i);
+            let items = v.as_list().unwrap().unwrap().items().as_ref().unwrap();
+            assert_eq!(vec![Value::Int32(1), Value::Int32(2)], **items);
+        }
+    }
+
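> Editor's note, not part of the patch: the tests above exercise `Helper::try_from_scalar_value` one scalar kind at a time. As orientation, here is a hedged caller-side sketch of the same API; the `datatypes::` module paths are assumptions based on this repository's layout, and only functions visible in this diff are used.

```rust
use datafusion_common::ScalarValue;
use datatypes::data_type::ConcreteDataType; // assumed crate path
use datatypes::error::Result;
use datatypes::vectors::Helper;

fn broadcast_scalar() -> Result<()> {
    // Wraps the scalar in a one-element vector, then logically repeats it
    // `length` (here 3) times as a `ConstantVector`.
    let vector = Helper::try_from_scalar_value(ScalarValue::Int64(Some(42)), 3)?;
    assert_eq!(ConcreteDataType::int64_datatype(), vector.data_type());
    assert_eq!(3, vector.len());
    for i in 0..vector.len() {
        // Every slot reads back the same `Value`.
        assert_eq!("Int64(42)", format!("{:?}", vector.get(i)));
    }
    Ok(())
}
```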
#[test] fn test_like_utf8() { fn assert_vector(expected: Vec<&str>, actual: &VectorRef) { @@ -301,4 +392,40 @@ mod tests { let ret = Helper::like_utf8(names, "%").unwrap(); assert_vector(vec!["greptime", "hello", "public", "world"], &ret); } + + fn check_try_into_vector(array: impl Array + 'static) { + let array: ArrayRef = Arc::new(array); + let vector = Helper::try_into_vector(array.clone()).unwrap(); + assert_eq!(&array, &vector.to_arrow_array()); + } + + #[test] + fn test_try_into_vector() { + check_try_into_vector(NullArray::new(2)); + check_try_into_vector(BooleanArray::from(vec![true, false])); + check_try_into_vector(LargeBinaryArray::from(vec![ + "hello".as_bytes(), + "world".as_bytes(), + ])); + check_try_into_vector(Int8Array::from(vec![1, 2, 3])); + check_try_into_vector(Int16Array::from(vec![1, 2, 3])); + check_try_into_vector(Int32Array::from(vec![1, 2, 3])); + check_try_into_vector(Int64Array::from(vec![1, 2, 3])); + check_try_into_vector(UInt8Array::from(vec![1, 2, 3])); + check_try_into_vector(UInt16Array::from(vec![1, 2, 3])); + check_try_into_vector(UInt32Array::from(vec![1, 2, 3])); + check_try_into_vector(UInt64Array::from(vec![1, 2, 3])); + check_try_into_vector(Float32Array::from(vec![1.0, 2.0, 3.0])); + check_try_into_vector(Float64Array::from(vec![1.0, 2.0, 3.0])); + check_try_into_vector(StringArray::from(vec!["hello", "world"])); + check_try_into_vector(Date32Array::from(vec![1, 2, 3])); + check_try_into_vector(Date64Array::from(vec![1, 2, 3])); + let data = vec![None, Some(vec![Some(6), Some(7)])]; + let list_array = ListArray::from_iter_primitive::(data); + check_try_into_vector(list_array); + check_try_into_vector(TimestampSecondArray::from(vec![1, 2, 3])); + check_try_into_vector(TimestampMillisecondArray::from(vec![1, 2, 3])); + check_try_into_vector(TimestampMicrosecondArray::from(vec![1, 2, 3])); + check_try_into_vector(TimestampNanosecondArray::from(vec![1, 2, 3])); + } } diff --git a/src/datatypes/src/vectors/list.rs b/src/datatypes/src/vectors/list.rs index 76d9dd8717eb..747e03557ba2 100644 --- a/src/datatypes/src/vectors/list.rs +++ b/src/datatypes/src/vectors/list.rs @@ -13,39 +13,48 @@ // limitations under the License. use std::any::Any; -use std::ops::Range; use std::sync::Arc; -use arrow::array::{Array, ArrayRef, ListArray}; -use arrow::bitmap::utils::ZipValidity; -use arrow::bitmap::MutableBitmap; +use arrow::array::{ + Array, ArrayData, ArrayRef, BooleanBufferBuilder, Int32BufferBuilder, ListArray, +}; +use arrow::buffer::Buffer; use arrow::datatypes::DataType as ArrowDataType; use serde_json::Value as JsonValue; -use snafu::prelude::*; +use crate::data_type::{ConcreteDataType, DataType}; use crate::error::Result; -use crate::prelude::*; +use crate::scalars::{ScalarVector, ScalarVectorBuilder}; use crate::serialize::Serializable; use crate::types::ListType; -use crate::value::{ListValue, ListValueRef}; -use crate::vectors::{impl_try_from_arrow_array_for_vector, impl_validity_for_vector}; - -type ArrowListArray = ListArray; +use crate::value::{ListValue, ListValueRef, Value, ValueRef}; +use crate::vectors::{self, Helper, MutableVector, Validity, Vector, VectorRef}; /// Vector of Lists, basically backed by Arrow's `ListArray`. -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, PartialEq)] pub struct ListVector { - array: ArrowListArray, - inner_datatype: ConcreteDataType, + array: ListArray, + /// The datatype of the items in the list. + item_type: ConcreteDataType, } impl ListVector { - /// Only iterate values in the [ListVector]. 
-    ///
-    /// Be careful to use this method as it would ignore validity and replace null
-    /// by empty vector.
-    pub fn values_iter(&self) -> Box<dyn Iterator<Item = Result<VectorRef>> + '_> {
-        Box::new(self.array.values_iter().map(VectorHelper::try_into_vector))
+    /// Iterate elements as [VectorRef].
+    pub fn values_iter(&self) -> impl Iterator<Item = Result<Option<VectorRef>>> + '_ {
+        self.array
+            .iter()
+            .map(|value_opt| value_opt.map(Helper::try_into_vector).transpose())
+    }
+
+    fn to_array_data(&self) -> ArrayData {
+        self.array.data().clone()
+    }
+
+    fn from_array_data_and_type(data: ArrayData, item_type: ConcreteDataType) -> Self {
+        Self {
+            array: ListArray::from(data),
+            item_type,
+        }
     }
 
     pub(crate) fn as_arrow(&self) -> &dyn Array {
@@ -55,7 +64,7 @@ impl ListVector {
 
 impl Vector for ListVector {
     fn data_type(&self) -> ConcreteDataType {
-        ConcreteDataType::List(ListType::new(self.inner_datatype.clone()))
+        ConcreteDataType::List(ListType::new(self.item_type.clone()))
     }
 
     fn vector_type_name(&self) -> String {
@@ -71,21 +80,25 @@ impl Vector for ListVector {
     }
 
     fn to_arrow_array(&self) -> ArrayRef {
-        Arc::new(self.array.clone())
+        let data = self.to_array_data();
+        Arc::new(ListArray::from(data))
     }
 
     fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
-        Box::new(self.array.clone())
+        let data = self.to_array_data();
+        Box::new(ListArray::from(data))
     }
 
     fn validity(&self) -> Validity {
-        impl_validity_for_vector!(self.array)
+        vectors::impl_validity_for_vector!(self.array)
     }
 
     fn memory_size(&self) -> usize {
-        let offsets_bytes = self.array.offsets().len() * std::mem::size_of::<i32>();
-        let value_refs_bytes = self.array.values().len() * std::mem::size_of::<Arc<dyn Array>>();
-        offsets_bytes + value_refs_bytes
+        self.array.get_buffer_memory_size()
+    }
+
+    fn null_count(&self) -> usize {
+        self.array.null_count()
     }
 
     fn is_null(&self, row: usize) -> bool {
@@ -93,7 +106,8 @@ impl Vector for ListVector {
     }
 
     fn slice(&self, offset: usize, length: usize) -> VectorRef {
-        Arc::new(ListVector::from(self.array.slice(offset, length)))
+        let data = self.array.data().slice(offset, length);
+        Arc::new(Self::from_array_data_and_type(data, self.item_type.clone()))
     }
 
     fn get(&self, index: usize) -> Value {
@@ -102,7 +116,7 @@ impl Vector for ListVector {
         }
 
         let array = &self.array.value(index);
-        let vector = VectorHelper::try_into_vector(array).unwrap_or_else(|_| {
+        let vector = Helper::try_into_vector(array).unwrap_or_else(|_| {
             panic!(
                 "arrow array with datatype {:?} cannot converted to our vector",
                 array.data_type()
@@ -113,7 +127,7 @@
             .collect::<Vec<Value>>();
         Value::List(ListValue::new(
             Some(Box::new(values)),
-            self.inner_datatype.clone(),
+            self.item_type.clone(),
         ))
     }
 
@@ -131,7 +145,7 @@ impl Serializable for ListVector {
             .iter()
             .map(|v| match v {
                 None => Ok(JsonValue::Null),
-                Some(v) => VectorHelper::try_into_vector(v)
+                Some(v) => Helper::try_into_vector(v)
                     .and_then(|v| v.serialize_to_json())
                     .map(JsonValue::Array),
             })
     }
 }
 
-impl From<ArrowListArray> for ListVector {
-    fn from(array: ArrowListArray) -> Self {
-        let inner_datatype = ConcreteDataType::from_arrow_type(match array.data_type() {
-            ArrowDataType::List(field) => &field.data_type,
-            _ => unreachable!(),
+impl From<ListArray> for ListVector {
+    fn from(array: ListArray) -> Self {
+        let item_type = ConcreteDataType::from_arrow_type(match array.data_type() {
+            ArrowDataType::List(field) => field.data_type(),
+            other => panic!(
+                "Try to create ListVector from an arrow array with type {:?}",
+                other
+            ),
         });
-        Self {
-            array,
-            inner_datatype,
-        }
+        Self { array, item_type }
     }
 }
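> Editor's note, not part of the patch: the new `values_iter` changes each yielded element from `Result<VectorRef>` to `Result<Option<VectorRef>>`, so null slots now surface as `None` instead of being silently replaced by empty vectors (the hazard the removed doc comment warned about). A hedged caller-side sketch, with assumed `datatypes::` paths:

```rust
use datatypes::error::Result;
use datatypes::vectors::ListVector; // assumed crate path

/// Sums the lengths of all non-null list elements. Illustrative only.
fn total_items(list: &ListVector) -> Result<usize> {
    let mut total = 0;
    for element in list.values_iter() {
        // The outer `Result` reports arrow-to-vector conversion failures;
        // the inner `Option` distinguishes a null slot from an empty list.
        if let Some(values) = element? {
            total += values.len();
        }
    }
    Ok(total)
}
```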
-impl_try_from_arrow_array_for_vector!(ArrowListArray, ListVector);
+vectors::impl_try_from_arrow_array_for_vector!(ListArray, ListVector);
 
-pub struct ListVectorIter<'a> {
+pub struct ListIter<'a> {
     vector: &'a ListVector,
-    iter: ZipValidity<'a, usize, Range<usize>>,
+    idx: usize,
 }
 
-impl<'a> ListVectorIter<'a> {
-    pub fn new(vector: &'a ListVector) -> ListVectorIter<'a> {
-        let iter = ZipValidity::new(
-            0..vector.len(),
-            vector.array.validity().as_ref().map(|x| x.iter()),
-        );
-
-        Self { vector, iter }
+impl<'a> ListIter<'a> {
+    fn new(vector: &'a ListVector) -> ListIter {
+        ListIter { vector, idx: 0 }
     }
 }
 
-impl<'a> Iterator for ListVectorIter<'a> {
+impl<'a> Iterator for ListIter<'a> {
     type Item = Option<ListValueRef<'a>>;
 
     #[inline]
     fn next(&mut self) -> Option<Self::Item> {
-        self.iter.next().map(|idx_opt| {
-            idx_opt.map(|idx| ListValueRef::Indexed {
-                vector: self.vector,
-                idx,
-            })
-        })
-    }
+        if self.idx >= self.vector.len() {
+            return None;
+        }
 
-    #[inline]
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        self.iter.size_hint()
+        let idx = self.idx;
+        self.idx += 1;
+
+        if self.vector.is_null(idx) {
+            return Some(None);
+        }
+
+        Some(Some(ListValueRef::Indexed {
+            vector: self.vector,
+            idx,
+        }))
     }
 
     #[inline]
-    fn nth(&mut self, n: usize) -> Option<Self::Item> {
-        self.iter.nth(n).map(|idx_opt| {
-            idx_opt.map(|idx| ListValueRef::Indexed {
-                vector: self.vector,
-                idx,
-            })
-        })
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        (self.vector.len(), Some(self.vector.len()))
     }
 }
 
 impl ScalarVector for ListVector {
     type OwnedItem = ListValue;
     type RefItem<'a> = ListValueRef<'a>;
-    type Iter<'a> = ListVectorIter<'a>;
+    type Iter<'a> = ListIter<'a>;
     type Builder = ListVectorBuilder;
 
     fn get_data(&self, idx: usize) -> Option<ListValueRef<'_>> {
@@ -214,86 +222,68 @@ impl ScalarVector for ListVector {
     }
 
     fn iter_data(&self) -> Self::Iter<'_> {
-        ListVectorIter::new(self)
+        ListIter::new(self)
     }
 }
 
-// Some codes are ported from arrow2's MutableListArray.
+// Ports from arrow's GenericListBuilder.
+// See https://github.com/apache/arrow-rs/blob/94565bca99b5d9932a3e9a8e094aaf4e4384b1e5/arrow-array/src/builder/generic_list_builder.rs
+
+/// [ListVector] builder.
 pub struct ListVectorBuilder {
-    inner_type: ConcreteDataType,
-    offsets: Vec<i32>,
-    values: Box<dyn MutableVector>,
-    validity: Option<MutableBitmap>,
+    item_type: ConcreteDataType,
+    offsets_builder: Int32BufferBuilder,
+    null_buffer_builder: NullBufferBuilder,
+    values_builder: Box<dyn MutableVector>,
 }
 
 impl ListVectorBuilder {
-    pub fn with_type_capacity(inner_type: ConcreteDataType, capacity: usize) -> ListVectorBuilder {
-        let mut offsets = Vec::with_capacity(capacity + 1);
-        offsets.push(0);
-        // The actual required capacity might greater than the capacity of the `ListVector`
-        // if there exists child vector that has more than one element.
-        let values = inner_type.create_mutable_vector(capacity);
+    /// Creates a new [`ListVectorBuilder`]. `item_type` is the data type of the list item, `capacity`
+    /// is the number of items to pre-allocate space for in this builder.
+    pub fn with_type_capacity(item_type: ConcreteDataType, capacity: usize) -> ListVectorBuilder {
+        let mut offsets_builder = Int32BufferBuilder::new(capacity + 1);
+        offsets_builder.append(0);
+        // The actual required capacity might be greater than the capacity of the `ListVector`
+        // if the child vector has more than one element.
+ let values_builder = item_type.create_mutable_vector(capacity); ListVectorBuilder { - inner_type, - offsets, - values, - validity: None, + item_type, + offsets_builder, + null_buffer_builder: NullBufferBuilder::new(capacity), + values_builder, } } - #[inline] - fn last_offset(&self) -> i32 { - *self.offsets.last().unwrap() + /// Finish the current variable-length list vector slot. + fn finish_list(&mut self, is_valid: bool) { + self.offsets_builder + .append(i32::try_from(self.values_builder.len()).unwrap()); + self.null_buffer_builder.append(is_valid); } fn push_null(&mut self) { - self.offsets.push(self.last_offset()); - match &mut self.validity { - Some(validity) => validity.push(false), - None => self.init_validity(), - } - } - - fn init_validity(&mut self) { - let len = self.offsets.len() - 1; - - let mut validity = MutableBitmap::with_capacity(self.offsets.capacity()); - validity.extend_constant(len, true); - validity.set(len - 1, false); - self.validity = Some(validity) + self.finish_list(false); } fn push_list_value(&mut self, list_value: &ListValue) -> Result<()> { if let Some(items) = list_value.items() { for item in &**items { - self.values.push_value_ref(item.as_value_ref())?; + self.values_builder.push_value_ref(item.as_value_ref())?; } } - self.push_valid(); - Ok(()) - } - - /// Needs to be called when a valid value was extended to this builder. - fn push_valid(&mut self) { - let size = self.values.len(); - let size = i32::try_from(size).unwrap(); - assert!(size >= *self.offsets.last().unwrap()); - self.offsets.push(size); - if let Some(validity) = &mut self.validity { - validity.push(true) - } + self.finish_list(true); + Ok(()) } } impl MutableVector for ListVectorBuilder { fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::list_datatype(self.inner_type.clone()) + ConcreteDataType::list_datatype(self.item_type.clone()) } fn len(&self) -> usize { - self.offsets.len() - 1 + self.null_buffer_builder.len() } fn as_any(&self) -> &dyn Any { @@ -348,51 +338,181 @@ impl ScalarVectorBuilder for ListVectorBuilder { self.push_value_ref(value.into()).unwrap_or_else(|e| { panic!( "Failed to push value, expect value type {:?}, err:{}", - self.inner_type, e + self.item_type, e ); }); } fn finish(&mut self) -> Self::VectorType { - let array = ArrowListArray::try_new( - ConcreteDataType::list_datatype(self.inner_type.clone()).as_arrow_type(), - std::mem::take(&mut self.offsets).into(), - self.values.to_vector().to_arrow_array(), - std::mem::take(&mut self.validity).map(|x| x.into()), - ) - .unwrap(); // The `ListVectorBuilder` itself should ensure it always builds a valid array. + let len = self.len(); + let values_vector = self.values_builder.to_vector(); + let values_arr = values_vector.to_arrow_array(); + let values_data = values_arr.data(); + + let offset_buffer = self.offsets_builder.finish(); + let null_bit_buffer = self.null_buffer_builder.finish(); + // Re-initialize the offsets_builder. 
+ self.offsets_builder.append(0); + let data_type = ConcreteDataType::list_datatype(self.item_type.clone()).as_arrow_type(); + let array_data_builder = ArrayData::builder(data_type) + .len(len) + .add_buffer(offset_buffer) + .add_child_data(values_data.clone()) + .null_bit_buffer(null_bit_buffer); + + let array_data = unsafe { array_data_builder.build_unchecked() }; + let array = ListArray::from(array_data); ListVector { array, - inner_datatype: self.inner_type.clone(), + item_type: self.item_type.clone(), + } + } +} + +// Ports from https://github.com/apache/arrow-rs/blob/94565bca99b5d9932a3e9a8e094aaf4e4384b1e5/arrow-array/src/builder/null_buffer_builder.rs +/// Builder for creating the null bit buffer. +/// This builder only materializes the buffer when we append `false`. +/// If you only append `true`s to the builder, what you get will be +/// `None` when calling [`finish`](#method.finish). +/// This optimization is **very** important for the performance. +#[derive(Debug)] +struct NullBufferBuilder { + bitmap_builder: Option, + /// Store the length of the buffer before materializing. + len: usize, + capacity: usize, +} + +impl NullBufferBuilder { + /// Creates a new empty builder. + /// `capacity` is the number of bits in the null buffer. + fn new(capacity: usize) -> Self { + Self { + bitmap_builder: None, + len: 0, + capacity, + } + } + + fn len(&self) -> usize { + if let Some(b) = &self.bitmap_builder { + b.len() + } else { + self.len + } + } + + /// Appends a `true` into the builder + /// to indicate that this item is not null. + #[inline] + fn append_non_null(&mut self) { + if let Some(buf) = self.bitmap_builder.as_mut() { + buf.append(true) + } else { + self.len += 1; + } + } + + /// Appends a `false` into the builder + /// to indicate that this item is null. + #[inline] + fn append_null(&mut self) { + self.materialize_if_needed(); + self.bitmap_builder.as_mut().unwrap().append(false); + } + + /// Appends a boolean value into the builder. + #[inline] + fn append(&mut self, not_null: bool) { + if not_null { + self.append_non_null() + } else { + self.append_null() + } + } + + /// Builds the null buffer and resets the builder. + /// Returns `None` if the builder only contains `true`s. 
+ fn finish(&mut self) -> Option { + let buf = self.bitmap_builder.as_mut().map(|b| b.finish()); + self.bitmap_builder = None; + self.len = 0; + buf + } + + #[inline] + fn materialize_if_needed(&mut self) { + if self.bitmap_builder.is_none() { + self.materialize() + } + } + + #[cold] + fn materialize(&mut self) { + if self.bitmap_builder.is_none() { + let mut b = BooleanBufferBuilder::new(self.len.max(self.capacity)); + b.append_n(self.len, true); + self.bitmap_builder = Some(b); } } } #[cfg(test)] -mod tests { - use arrow::array::{MutableListArray, MutablePrimitiveArray, TryExtend}; +pub mod tests { + use arrow::array::{Int32Array, Int32Builder, ListBuilder}; use serde_json::json; use super::*; + use crate::scalars::ScalarRef; use crate::types::ListType; + use crate::vectors::Int32Vector; + + pub fn new_list_vector(data: &[Option>>]) -> ListVector { + let mut builder = + ListVectorBuilder::with_type_capacity(ConcreteDataType::int32_datatype(), 8); + for vec_opt in data { + if let Some(vec) = vec_opt { + let values = vec.iter().map(|v| Value::from(*v)).collect(); + let values = Some(Box::new(values)); + let list_value = ListValue::new(values, ConcreteDataType::int32_datatype()); + + builder.push(Some(ListValueRef::Ref { val: &list_value })); + } else { + builder.push(None); + } + } + + builder.finish() + } + + fn new_list_array(data: &[Option>>]) -> ListArray { + let mut builder = ListBuilder::new(Int32Builder::new()); + for vec_opt in data { + if let Some(vec) = vec_opt { + for value_opt in vec { + builder.values().append_option(*value_opt); + } + + builder.append(true); + } else { + builder.append(false); + } + } + + builder.finish() + } #[test] fn test_list_vector() { let data = vec![ - Some(vec![Some(1i32), Some(2), Some(3)]), + Some(vec![Some(1), Some(2), Some(3)]), None, Some(vec![Some(4), None, Some(6)]), ]; - let mut arrow_array = MutableListArray::>::new(); - arrow_array.try_extend(data).unwrap(); - let arrow_array: ArrowListArray = arrow_array.into(); + let list_vector = new_list_vector(&data); - let list_vector = ListVector { - array: arrow_array.clone(), - inner_datatype: ConcreteDataType::int32_datatype(), - }; assert_eq!( ConcreteDataType::List(ListType::new(ConcreteDataType::int32_datatype())), list_vector.data_type() @@ -403,30 +523,34 @@ mod tests { assert!(list_vector.is_null(1)); assert!(!list_vector.is_null(2)); + let arrow_array = new_list_array(&data); assert_eq!( arrow_array, - list_vector + *list_vector .to_arrow_array() .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap() - .clone() - ); - assert_eq!( - Validity::Slots(arrow_array.validity().unwrap()), - list_vector.validity() - ); - assert_eq!( - arrow_array.offsets().len() * std::mem::size_of::() - + arrow_array.values().len() * std::mem::size_of::>(), - list_vector.memory_size() ); - - let slice = list_vector.slice(0, 2); + let validity = list_vector.validity(); + assert!(!validity.is_all_null()); + assert!(!validity.is_all_valid()); + assert!(validity.is_set(0)); + assert!(!validity.is_set(1)); + assert!(validity.is_set(2)); + assert_eq!(256, list_vector.memory_size()); + + let slice = list_vector.slice(0, 2).to_arrow_array(); + let sliced_array = slice.as_any().downcast_ref::().unwrap(); assert_eq!( - "ListArray[[1, 2, 3], None]", - format!("{:?}", slice.to_arrow_array()) + Int32Array::from_iter_values([1, 2, 3]), + *sliced_array + .value(0) + .as_any() + .downcast_ref::() + .unwrap() ); + assert!(sliced_array.is_null(1)); assert_eq!( Value::List(ListValue::new( @@ -467,52 +591,48 @@ mod tests { #[test] 
fn test_from_arrow_array() { let data = vec![ - Some(vec![Some(1u32), Some(2), Some(3)]), + Some(vec![Some(1), Some(2), Some(3)]), None, Some(vec![Some(4), None, Some(6)]), ]; - let mut arrow_array = MutableListArray::>::new(); - arrow_array.try_extend(data).unwrap(); - let arrow_array: ArrowListArray = arrow_array.into(); + let arrow_array = new_list_array(&data); let array_ref: ArrayRef = Arc::new(arrow_array); + let expect = new_list_vector(&data); + // Test try from ArrayRef let list_vector = ListVector::try_from_arrow_array(array_ref).unwrap(); - assert_eq!( - "ListVector { array: ListArray[[1, 2, 3], None, [4, None, 6]], inner_datatype: UInt32(UInt32) }", - format!("{:?}", list_vector) - ); + assert_eq!(expect, list_vector); + + // Test from + let arrow_array = new_list_array(&data); + let list_vector = ListVector::from(arrow_array); + assert_eq!(expect, list_vector); } #[test] fn test_iter_list_vector_values() { let data = vec![ - Some(vec![Some(1i64), Some(2), Some(3)]), + Some(vec![Some(1), Some(2), Some(3)]), None, Some(vec![Some(4), None, Some(6)]), ]; - let mut arrow_array = MutableListArray::>::new(); - arrow_array.try_extend(data).unwrap(); - let arrow_array: ArrowListArray = arrow_array.into(); + let list_vector = new_list_vector(&data); - let list_vector = ListVector::from(arrow_array); assert_eq!( - ConcreteDataType::List(ListType::new(ConcreteDataType::int64_datatype())), + ConcreteDataType::List(ListType::new(ConcreteDataType::int32_datatype())), list_vector.data_type() ); let mut iter = list_vector.values_iter(); assert_eq!( - "Int64[1, 2, 3]", - format!("{:?}", iter.next().unwrap().unwrap().to_arrow_array()) - ); - assert_eq!( - "Int64[]", - format!("{:?}", iter.next().unwrap().unwrap().to_arrow_array()) + Arc::new(Int32Vector::from_slice(&[1, 2, 3])) as VectorRef, + *iter.next().unwrap().unwrap().unwrap() ); + assert!(iter.next().unwrap().unwrap().is_none()); assert_eq!( - "Int64[4, None, 6]", - format!("{:?}", iter.next().unwrap().unwrap().to_arrow_array()) + Arc::new(Int32Vector::from(vec![Some(4), None, Some(6)])) as VectorRef, + *iter.next().unwrap().unwrap().unwrap(), ); assert!(iter.next().is_none()) } @@ -520,30 +640,18 @@ mod tests { #[test] fn test_serialize_to_json() { let data = vec![ - Some(vec![Some(1i64), Some(2), Some(3)]), + Some(vec![Some(1), Some(2), Some(3)]), None, Some(vec![Some(4), None, Some(6)]), ]; - let mut arrow_array = MutableListArray::>::new(); - arrow_array.try_extend(data).unwrap(); - let arrow_array: ArrowListArray = arrow_array.into(); - - let list_vector = ListVector::from(arrow_array); + let list_vector = new_list_vector(&data); assert_eq!( vec![json!([1, 2, 3]), json!(null), json!([4, null, 6]),], list_vector.serialize_to_json().unwrap() ); } - fn new_list_vector(data: Vec>>>) -> ListVector { - let mut arrow_array = MutableListArray::>::new(); - arrow_array.try_extend(data).unwrap(); - let arrow_array: ArrowListArray = arrow_array.into(); - - ListVector::from(arrow_array) - } - #[test] fn test_list_vector_builder() { let mut builder = @@ -567,14 +675,14 @@ mod tests { None, Some(vec![Some(7), Some(8), None]), ]; - let input = new_list_vector(data); + let input = new_list_vector(&data); builder.extend_slice_of(&input, 1, 2).unwrap(); assert!(builder .extend_slice_of(&crate::vectors::Int32Vector::from_slice(&[13]), 0, 1) .is_err()); let vector = builder.to_vector(); - let expect: VectorRef = Arc::new(new_list_vector(vec![ + let expect: VectorRef = Arc::new(new_list_vector(&[ Some(vec![Some(4), None, Some(6)]), None, 
Some(vec![Some(7), Some(8), None]), @@ -599,7 +707,7 @@ mod tests { })); let vector = builder.finish(); - let expect = new_list_vector(vec![None, Some(vec![Some(4), None, Some(6)])]); + let expect = new_list_vector(&[None, Some(vec![Some(4), None, Some(6)])]); assert_eq!(expect, vector); assert!(vector.get_data(0).is_none()); diff --git a/src/datatypes/src/vectors/mutable.rs b/src/datatypes/src/vectors/mutable.rs deleted file mode 100644 index 5f949574602e..000000000000 --- a/src/datatypes/src/vectors/mutable.rs +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::any::Any; - -use crate::error::Result; -use crate::prelude::*; - -/// Mutable vector that could be used to build an immutable vector. -pub trait MutableVector: Send + Sync { - /// Returns the data type of the vector. - fn data_type(&self) -> ConcreteDataType; - - /// Returns the length of the vector. - fn len(&self) -> usize; - - /// Returns whether the vector is empty. - fn is_empty(&self) -> bool { - self.len() == 0 - } - - /// Convert to Any, to enable dynamic casting. - fn as_any(&self) -> &dyn Any; - - /// Convert to mutable Any, to enable dynamic casting. - fn as_mut_any(&mut self) -> &mut dyn Any; - - /// Convert `self` to an (immutable) [VectorRef] and reset `self`. - fn to_vector(&mut self) -> VectorRef; - - /// Push value ref to this mutable vector. - /// - /// Returns error if data type unmatch. - fn push_value_ref(&mut self, value: ValueRef) -> Result<()>; - - /// Extend this mutable vector by slice of `vector`. - /// - /// Returns error if data type unmatch. - /// - /// # Panics - /// Panics if `offset + length > vector.len()`. - fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()>; -} diff --git a/src/datatypes/src/vectors/null.rs b/src/datatypes/src/vectors/null.rs index 64974d99b0c0..bb66e09b392b 100644 --- a/src/datatypes/src/vectors/null.rs +++ b/src/datatypes/src/vectors/null.rs @@ -16,8 +16,7 @@ use std::any::Any; use std::fmt; use std::sync::Arc; -use arrow::array::{Array, ArrayRef, NullArray}; -use arrow::datatypes::DataType as ArrowDataType; +use arrow::array::{Array, ArrayData, ArrayRef, NullArray}; use snafu::{ensure, OptionExt}; use crate::data_type::ConcreteDataType; @@ -27,21 +26,28 @@ use crate::types::NullType; use crate::value::{Value, ValueRef}; use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef}; +/// A vector where all elements are nulls. #[derive(PartialEq)] pub struct NullVector { array: NullArray, } +// TODO(yingwen): Support null vector with other logical types. impl NullVector { + /// Create a new `NullVector` with `n` elements. 
pub fn new(n: usize) -> Self { Self { - array: NullArray::new(ArrowDataType::Null, n), + array: NullArray::new(n), } } pub(crate) fn as_arrow(&self) -> &dyn Array { &self.array } + + fn to_array_data(&self) -> ArrayData { + self.array.data().clone() + } } impl From for NullVector { @@ -68,21 +74,28 @@ impl Vector for NullVector { } fn to_arrow_array(&self) -> ArrayRef { - Arc::new(self.array.clone()) + // TODO(yingwen): Replaced by clone after upgrading to arrow 28.0. + let data = self.to_array_data(); + Arc::new(NullArray::from(data)) } fn to_boxed_arrow_array(&self) -> Box { - Box::new(self.array.clone()) + let data = self.to_array_data(); + Box::new(NullArray::from(data)) } fn validity(&self) -> Validity { - Validity::AllNull + Validity::all_null(self.array.len()) } fn memory_size(&self) -> usize { 0 } + fn null_count(&self) -> usize { + self.array.null_count() + } + fn is_null(&self, _row: usize) -> bool { true } @@ -217,7 +230,7 @@ mod tests { assert_eq!("NullVector", v.vector_type_name()); assert!(!v.is_const()); - assert_eq!(Validity::AllNull, v.validity()); + assert!(v.validity().is_all_null()); assert!(v.only_null()); for i in 0..32 { @@ -246,7 +259,7 @@ mod tests { #[test] fn test_null_vector_validity() { let vector = NullVector::new(5); - assert_eq!(Validity::AllNull, vector.validity()); + assert!(vector.validity().is_all_null()); assert_eq!(5, vector.null_count()); } diff --git a/src/datatypes/src/vectors/operations.rs b/src/datatypes/src/vectors/operations.rs index e63f338a0546..70ddb4a0317a 100644 --- a/src/datatypes/src/vectors/operations.rs +++ b/src/datatypes/src/vectors/operations.rs @@ -19,10 +19,11 @@ mod replicate; use common_base::BitVec; use crate::error::Result; -use crate::types::PrimitiveElement; +use crate::types::LogicalPrimitiveType; +use crate::vectors::constant::ConstantVector; use crate::vectors::{ - BinaryVector, BooleanVector, ConstantVector, DateTimeVector, DateVector, ListVector, - NullVector, PrimitiveVector, StringVector, TimestampVector, Vector, VectorRef, + BinaryVector, BooleanVector, ListVector, NullVector, PrimitiveVector, StringVector, Vector, + VectorRef, }; /// Vector compute operations. @@ -59,10 +60,10 @@ pub trait VectorOp { } macro_rules! impl_scalar_vector_op { - ($( { $VectorType: ident, $replicate: ident } ),+) => {$( + ($($VectorType: ident),+) => {$( impl VectorOp for $VectorType { fn replicate(&self, offsets: &[usize]) -> VectorRef { - replicate::$replicate(self, offsets) + replicate::replicate_scalar(self, offsets) } fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) { @@ -77,28 +78,21 @@ macro_rules! 
impl_scalar_vector_op { )+}; } -impl_scalar_vector_op!( - { BinaryVector, replicate_scalar }, - { BooleanVector, replicate_scalar }, - { ListVector, replicate_scalar }, - { StringVector, replicate_scalar }, - { DateVector, replicate_date }, - { DateTimeVector, replicate_datetime }, - { TimestampVector, replicate_timestamp } -); +impl_scalar_vector_op!(BinaryVector, BooleanVector, ListVector, StringVector); -impl VectorOp for ConstantVector { +impl VectorOp for PrimitiveVector { fn replicate(&self, offsets: &[usize]) -> VectorRef { - replicate::replicate_constant(self, offsets) + std::sync::Arc::new(replicate::replicate_primitive(self, offsets)) } fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) { - let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::()); - find_unique::find_unique_constant(self, selected, prev_vector); + let prev_vector = + prev_vector.and_then(|pv| pv.as_any().downcast_ref::>()); + find_unique::find_unique_scalar(self, selected, prev_vector); } fn filter(&self, filter: &BooleanVector) -> Result { - filter::filter_constant(self, filter) + filter::filter_non_constant!(self, PrimitiveVector, filter) } } @@ -117,21 +111,17 @@ impl VectorOp for NullVector { } } -impl VectorOp for PrimitiveVector -where - T: PrimitiveElement, -{ +impl VectorOp for ConstantVector { fn replicate(&self, offsets: &[usize]) -> VectorRef { - replicate::replicate_primitive(self, offsets) + self.replicate_vector(offsets) } fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) { - let prev_vector = - prev_vector.and_then(|pv| pv.as_any().downcast_ref::>()); - find_unique::find_unique_scalar(self, selected, prev_vector); + let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::()); + find_unique::find_unique_constant(self, selected, prev_vector); } fn filter(&self, filter: &BooleanVector) -> Result { - filter::filter_non_constant!(self, PrimitiveVector, filter) + self.filter_vector(filter) } } diff --git a/src/datatypes/src/vectors/operations/filter.rs b/src/datatypes/src/vectors/operations/filter.rs index 7a9f514a1621..8368a6afb4c4 100644 --- a/src/datatypes/src/vectors/operations/filter.rs +++ b/src/datatypes/src/vectors/operations/filter.rs @@ -12,16 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -pub(crate) use crate::vectors::constant::filter_constant; - macro_rules! 
filter_non_constant { ($vector: expr, $VectorType: ty, $filter: ident) => {{ use std::sync::Arc; + use arrow::compute; use snafu::ResultExt; let arrow_array = $vector.as_arrow(); - let filtered = arrow::compute::filter::filter(arrow_array, $filter.as_boolean_array()) + let filtered = compute::filter(arrow_array, $filter.as_boolean_array()) .context(crate::error::ArrowComputeSnafu)?; Ok(Arc::new(<$VectorType>::try_from_arrow_array(filtered)?)) }}; @@ -33,9 +32,16 @@ pub(crate) use filter_non_constant; mod tests { use std::sync::Arc; + use common_time::{Date, DateTime}; + use crate::scalars::ScalarVector; + use crate::timestamp::{ + TimestampMicrosecond, TimestampMillisecond, TimestampNanosecond, TimestampSecond, + }; + use crate::types::WrapperType; + use crate::vectors::constant::ConstantVector; use crate::vectors::{ - BooleanVector, ConstantVector, Int32Vector, NullVector, StringVector, VectorOp, VectorRef, + BooleanVector, Int32Vector, NullVector, StringVector, VectorOp, VectorRef, }; fn check_filter_primitive(expect: &[i32], input: &[i32], filter: &[bool]) { @@ -105,7 +111,6 @@ mod tests { ($VectorType: ident, $ValueType: ident, $method: ident) => {{ use std::sync::Arc; - use common_time::$ValueType; use $crate::vectors::{$VectorType, VectorRef}; let v = $VectorType::from_iterator((0..5).map($ValueType::$method)); @@ -123,6 +128,18 @@ mod tests { fn test_filter_date_like() { impl_filter_date_like_test!(DateVector, Date, new); impl_filter_date_like_test!(DateTimeVector, DateTime, new); - impl_filter_date_like_test!(TimestampVector, Timestamp, from_millis); + + impl_filter_date_like_test!(TimestampSecondVector, TimestampSecond, from_native); + impl_filter_date_like_test!( + TimestampMillisecondVector, + TimestampMillisecond, + from_native + ); + impl_filter_date_like_test!( + TimestampMicrosecondVector, + TimestampMicrosecond, + from_native + ); + impl_filter_date_like_test!(TimestampNanosecondVector, TimestampNanosecond, from_native); } } diff --git a/src/datatypes/src/vectors/operations/find_unique.rs b/src/datatypes/src/vectors/operations/find_unique.rs index d63a3c66b9d2..7116a9e90d53 100644 --- a/src/datatypes/src/vectors/operations/find_unique.rs +++ b/src/datatypes/src/vectors/operations/find_unique.rs @@ -15,7 +15,8 @@ use common_base::BitVec; use crate::scalars::ScalarVector; -use crate::vectors::{ConstantVector, NullVector, Vector}; +use crate::vectors::constant::ConstantVector; +use crate::vectors::{NullVector, Vector}; // To implement `find_unique()` correctly, we need to keep in mind that always marks an element as // selected when it is different from the previous one, and leaves the `selected` unchanged @@ -70,7 +71,7 @@ pub(crate) fn find_unique_null( return; } - let is_first_not_duplicate = prev_vector.map(|pv| pv.is_empty()).unwrap_or(true); + let is_first_not_duplicate = prev_vector.map(NullVector::is_empty).unwrap_or(true); if is_first_not_duplicate { selected.set(0, true); } @@ -104,8 +105,11 @@ pub(crate) fn find_unique_constant( mod tests { use std::sync::Arc; + use common_time::{Date, DateTime}; + use super::*; - use crate::vectors::{Int32Vector, StringVector, VectorOp}; + use crate::timestamp::*; + use crate::vectors::{Int32Vector, StringVector, Vector, VectorOp}; fn check_bitmap(expect: &[bool], selected: &BitVec) { let actual = selected.iter().collect::>(); @@ -121,7 +125,7 @@ mod tests { input: impl Iterator>, prev: Option<&[i32]>, ) { - let input = Int32Vector::from_iter(input); + let input = Int32Vector::from(input.collect::>()); let prev = 
prev.map(Int32Vector::from_slice); let mut selected = BitVec::repeat(false, input.len()); @@ -341,7 +345,6 @@ mod tests { macro_rules! impl_find_unique_date_like_test { ($VectorType: ident, $ValueType: ident, $method: ident) => {{ - use common_time::$ValueType; use $crate::vectors::$VectorType; let v = $VectorType::from_iterator([8, 8, 9, 10].into_iter().map($ValueType::$method)); @@ -356,6 +359,9 @@ mod tests { fn test_find_unique_date_like() { impl_find_unique_date_like_test!(DateVector, Date, new); impl_find_unique_date_like_test!(DateTimeVector, DateTime, new); - impl_find_unique_date_like_test!(TimestampVector, Timestamp, from_millis); + impl_find_unique_date_like_test!(TimestampSecondVector, TimestampSecond, from); + impl_find_unique_date_like_test!(TimestampMillisecondVector, TimestampMillisecond, from); + impl_find_unique_date_like_test!(TimestampMicrosecondVector, TimestampMicrosecond, from); + impl_find_unique_date_like_test!(TimestampNanosecondVector, TimestampNanosecond, from); } } diff --git a/src/datatypes/src/vectors/operations/replicate.rs b/src/datatypes/src/vectors/operations/replicate.rs index 7fb93134eda5..8216517fc62d 100644 --- a/src/datatypes/src/vectors/operations/replicate.rs +++ b/src/datatypes/src/vectors/operations/replicate.rs @@ -13,12 +13,8 @@ // limitations under the License. use crate::prelude::*; -pub(crate) use crate::vectors::constant::replicate_constant; -pub(crate) use crate::vectors::date::replicate_date; -pub(crate) use crate::vectors::datetime::replicate_datetime; pub(crate) use crate::vectors::null::replicate_null; pub(crate) use crate::vectors::primitive::replicate_primitive; -pub(crate) use crate::vectors::timestamp::replicate_timestamp; pub(crate) fn replicate_scalar(c: &C, offsets: &[usize]) -> VectorRef { assert_eq!(offsets.len(), c.len()); @@ -43,8 +39,13 @@ pub(crate) fn replicate_scalar(c: &C, offsets: &[usize]) -> Vec mod tests { use std::sync::Arc; + use common_time::timestamp::TimeUnit; + use common_time::{Date, DateTime, Timestamp}; + use paste::paste; + use super::*; - use crate::vectors::{ConstantVector, Int32Vector, NullVector, StringVector, VectorOp}; + use crate::vectors::constant::ConstantVector; + use crate::vectors::{Int32Vector, NullVector, StringVector, VectorOp}; #[test] fn test_replicate_primitive() { @@ -120,7 +121,6 @@ mod tests { macro_rules! impl_replicate_date_like_test { ($VectorType: ident, $ValueType: ident, $method: ident) => {{ - use common_time::$ValueType; use $crate::vectors::$VectorType; let v = $VectorType::from_iterator((0..5).map($ValueType::$method)); @@ -138,10 +138,33 @@ mod tests { }}; } + macro_rules! 
impl_replicate_timestamp_test {
+        ($unit: ident) => {{
+            paste!{
+                use $crate::vectors::[<Timestamp $unit Vector>];
+                use $crate::timestamp::[<Timestamp $unit>];
+                let v = [<Timestamp $unit Vector>]::from_iterator((0..5).map([<Timestamp $unit>]::from));
+                let offsets = [0, 1, 2, 3, 4];
+                let v = v.replicate(&offsets);
+                assert_eq!(4, v.len());
+                for i in 0..4 {
+                    assert_eq!(
+                        Value::Timestamp(Timestamp::new(i as i64 + 1, TimeUnit::$unit)),
+                        v.get(i)
+                    );
+                }
+            }
+        }};
+    }
+
     #[test]
     fn test_replicate_date_like() {
         impl_replicate_date_like_test!(DateVector, Date, new);
         impl_replicate_date_like_test!(DateTimeVector, DateTime, new);
-        impl_replicate_date_like_test!(TimestampVector, Timestamp, from_millis);
+
+        impl_replicate_timestamp_test!(Second);
+        impl_replicate_timestamp_test!(Millisecond);
+        impl_replicate_timestamp_test!(Microsecond);
+        impl_replicate_timestamp_test!(Nanosecond);
     }
 }
diff --git a/src/datatypes/src/vectors/primitive.rs b/src/datatypes/src/vectors/primitive.rs
index c49295630cff..7829c3173131 100644
--- a/src/datatypes/src/vectors/primitive.rs
+++ b/src/datatypes/src/vectors/primitive.rs
@@ -13,75 +13,111 @@
 // limitations under the License.
 
 use std::any::Any;
-use std::iter::FromIterator;
-use std::slice::Iter;
+use std::fmt;
 use std::sync::Arc;
 
-use arrow::array::{Array, ArrayRef, MutableArray, MutablePrimitiveArray, PrimitiveArray};
-use arrow::bitmap::utils::ZipValidity;
+use arrow::array::{
+    Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef, PrimitiveArray, PrimitiveBuilder,
+};
 use serde_json::Value as JsonValue;
-use snafu::{OptionExt, ResultExt};
+use snafu::OptionExt;
 
-use crate::data_type::{ConcreteDataType, DataType};
-use crate::error::{ConversionSnafu, Result, SerializeSnafu};
+use crate::data_type::ConcreteDataType;
+use crate::error::{self, Result};
 use crate::scalars::{Scalar, ScalarRef, ScalarVector, ScalarVectorBuilder};
 use crate::serialize::Serializable;
-use crate::types::{Primitive, PrimitiveElement};
+use crate::types::{
+    Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, LogicalPrimitiveType,
+    UInt16Type, UInt32Type, UInt64Type, UInt8Type, WrapperType,
+};
 use crate::value::{Value, ValueRef};
 use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef};
 
+pub type UInt8Vector = PrimitiveVector<UInt8Type>;
+pub type UInt16Vector = PrimitiveVector<UInt16Type>;
+pub type UInt32Vector = PrimitiveVector<UInt32Type>;
+pub type UInt64Vector = PrimitiveVector<UInt64Type>;
+
+pub type Int8Vector = PrimitiveVector<Int8Type>;
+pub type Int16Vector = PrimitiveVector<Int16Type>;
+pub type Int32Vector = PrimitiveVector<Int32Type>;
+pub type Int64Vector = PrimitiveVector<Int64Type>;
+
+pub type Float32Vector = PrimitiveVector<Float32Type>;
+pub type Float64Vector = PrimitiveVector<Float64Type>;
+
 /// Vector for primitive data types.
-#[derive(Debug, Clone, PartialEq)]
-pub struct PrimitiveVector<T: Primitive> {
-    pub(crate) array: PrimitiveArray<T>,
+pub struct PrimitiveVector<T: LogicalPrimitiveType> {
+    array: PrimitiveArray<T::ArrowPrimitive>,
 }
 
-impl<T: Primitive> PrimitiveVector<T> {
-    pub fn new(array: PrimitiveArray<T>) -> Self {
+impl<T: LogicalPrimitiveType> PrimitiveVector<T> {
+    pub fn new(array: PrimitiveArray<T::ArrowPrimitive>) -> Self {
         Self { array }
     }
 
     pub fn try_from_arrow_array(array: impl AsRef<dyn Array>) -> Result<Self> {
-        Ok(Self::new(
-            array
-                .as_ref()
-                .as_any()
-                .downcast_ref::<PrimitiveArray<T>>()
-                .with_context(|| ConversionSnafu {
-                    from: format!("{:?}", array.as_ref().data_type()),
-                })?
-                .clone(),
-        ))
-    }
-
-    pub fn from_slice<P: AsRef<[T]>>(slice: P) -> Self {
+        let data = array
+            .as_ref()
+            .as_any()
+            .downcast_ref::<PrimitiveArray<T::ArrowPrimitive>>()
+            .with_context(|| error::ConversionSnafu {
+                from: format!("{:?}", array.as_ref().data_type()),
+            })?
+ .data() + .clone(); + let concrete_array = PrimitiveArray::::from(data); + Ok(Self::new(concrete_array)) + } + + pub fn from_slice>(slice: P) -> Self { + let iter = slice.as_ref().iter().copied(); + Self { + array: PrimitiveArray::from_iter_values(iter), + } + } + + pub fn from_wrapper_slice>(slice: P) -> Self { + let iter = slice.as_ref().iter().copied().map(WrapperType::into_native); Self { - array: PrimitiveArray::from_slice(slice), + array: PrimitiveArray::from_iter_values(iter), } } - pub fn from_vec(array: Vec) -> Self { + pub fn from_vec(array: Vec) -> Self { Self { - array: PrimitiveArray::from_vec(array), + array: PrimitiveArray::from_iter_values(array), } } - pub fn from_values>(iter: I) -> Self { + pub fn from_values>(iter: I) -> Self { Self { - array: PrimitiveArray::from_values(iter), + array: PrimitiveArray::from_iter_values(iter), } } - pub(crate) fn as_arrow(&self) -> &dyn Array { + pub(crate) fn as_arrow(&self) -> &PrimitiveArray { &self.array } - fn slice(&self, offset: usize, length: usize) -> Self { - Self::from(self.array.slice(offset, length)) + fn to_array_data(&self) -> ArrayData { + self.array.data().clone() + } + + fn from_array_data(data: ArrayData) -> Self { + Self { + array: PrimitiveArray::from(data), + } + } + + // To distinguish with `Vector::slice()`. + fn get_slice(&self, offset: usize, length: usize) -> Self { + let data = self.array.data().slice(offset, length); + Self::from_array_data(data) } } -impl Vector for PrimitiveVector { +impl Vector for PrimitiveVector { fn data_type(&self) -> ConcreteDataType { T::build_data_type() } @@ -99,11 +135,13 @@ impl Vector for PrimitiveVector { } fn to_arrow_array(&self) -> ArrayRef { - Arc::new(self.array.clone()) + let data = self.to_array_data(); + Arc::new(PrimitiveArray::::from(data)) } fn to_boxed_arrow_array(&self) -> Box { - Box::new(self.array.clone()) + let data = self.to_array_data(); + Box::new(PrimitiveArray::::from(data)) } fn validity(&self) -> Validity { @@ -111,7 +149,11 @@ impl Vector for PrimitiveVector { } fn memory_size(&self) -> usize { - self.array.values().len() * std::mem::size_of::() + self.array.get_buffer_memory_size() + } + + fn null_count(&self) -> usize { + self.array.null_count() } fn is_null(&self, row: usize) -> bool { @@ -119,57 +161,80 @@ impl Vector for PrimitiveVector { } fn slice(&self, offset: usize, length: usize) -> VectorRef { - Arc::new(self.slice(offset, length)) + let data = self.array.data().slice(offset, length); + Arc::new(Self::from_array_data(data)) } fn get(&self, index: usize) -> Value { - vectors::impl_get_for_vector!(self.array, index) + if self.array.is_valid(index) { + // Safety: The index have been checked by `is_valid()`. + let wrapper = unsafe { T::Wrapper::from_native(self.array.value_unchecked(index)) }; + wrapper.into() + } else { + Value::Null + } } fn get_ref(&self, index: usize) -> ValueRef { if self.array.is_valid(index) { // Safety: The index have been checked by `is_valid()`. 
- unsafe { self.array.value_unchecked(index).into_value_ref() } + let wrapper = unsafe { T::Wrapper::from_native(self.array.value_unchecked(index)) }; + wrapper.into() } else { ValueRef::Null } } } -impl From> for PrimitiveVector { - fn from(array: PrimitiveArray) -> Self { +impl fmt::Debug for PrimitiveVector { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("PrimitiveVector") + .field("array", &self.array) + .finish() + } +} + +impl From> for PrimitiveVector { + fn from(array: PrimitiveArray) -> Self { Self { array } } } -impl From>> for PrimitiveVector { - fn from(v: Vec>) -> Self { +impl From>> for PrimitiveVector { + fn from(v: Vec>) -> Self { Self { - array: PrimitiveArray::::from(v), + array: PrimitiveArray::from_iter(v), } } } -impl>> FromIterator for PrimitiveVector { - fn from_iter>(iter: I) -> Self { - Self { - array: MutablePrimitiveArray::::from_iter(iter).into(), - } +pub struct PrimitiveIter<'a, T: LogicalPrimitiveType> { + iter: ArrayIter<&'a PrimitiveArray>, +} + +impl<'a, T: LogicalPrimitiveType> Iterator for PrimitiveIter<'a, T> { + type Item = Option; + + fn next(&mut self) -> Option> { + self.iter + .next() + .map(|item| item.map(T::Wrapper::from_native)) + } + + fn size_hint(&self) -> (usize, Option) { + self.iter.size_hint() } } -impl ScalarVector for PrimitiveVector -where - T: PrimitiveElement, -{ - type OwnedItem = T; - type RefItem<'a> = T; +impl ScalarVector for PrimitiveVector { + type OwnedItem = T::Wrapper; + type RefItem<'a> = T::Wrapper; type Iter<'a> = PrimitiveIter<'a, T>; type Builder = PrimitiveVectorBuilder; fn get_data(&self, idx: usize) -> Option> { if self.array.is_valid(idx) { - Some(self.array.value(idx)) + Some(T::Wrapper::from_native(self.array.value(idx))) } else { None } @@ -182,59 +247,47 @@ where } } -pub type UInt8Vector = PrimitiveVector; -pub type UInt16Vector = PrimitiveVector; -pub type UInt32Vector = PrimitiveVector; -pub type UInt64Vector = PrimitiveVector; - -pub type Int8Vector = PrimitiveVector; -pub type Int16Vector = PrimitiveVector; -pub type Int32Vector = PrimitiveVector; -pub type Int64Vector = PrimitiveVector; - -pub type Float32Vector = PrimitiveVector; -pub type Float64Vector = PrimitiveVector; - -pub struct PrimitiveIter<'a, T> { - iter: ZipValidity<'a, &'a T, Iter<'a, T>>, -} - -impl<'a, T: Copy> Iterator for PrimitiveIter<'a, T> { - type Item = Option; - - fn next(&mut self) -> Option> { - self.iter.next().map(|v| v.copied()) +impl Serializable for PrimitiveVector { + fn serialize_to_json(&self) -> Result> { + let res = self + .iter_data() + .map(|v| match v { + None => serde_json::Value::Null, + // use WrapperType's Into bound instead of + // serde_json::to_value to facilitate customized serialization + // for WrapperType + Some(v) => v.into(), + }) + .collect::>(); + Ok(res) } } -impl Serializable for PrimitiveVector { - fn serialize_to_json(&self) -> Result> { - self.array - .iter() - .map(serde_json::to_value) - .collect::>() - .context(SerializeSnafu) +impl PartialEq for PrimitiveVector { + fn eq(&self, other: &PrimitiveVector) -> bool { + self.array == other.array } } -pub struct PrimitiveVectorBuilder { - pub(crate) mutable_array: MutablePrimitiveArray, -} +pub type UInt8VectorBuilder = PrimitiveVectorBuilder; +pub type UInt16VectorBuilder = PrimitiveVectorBuilder; +pub type UInt32VectorBuilder = PrimitiveVectorBuilder; +pub type UInt64VectorBuilder = PrimitiveVectorBuilder; -pub type UInt8VectorBuilder = PrimitiveVectorBuilder; -pub type UInt16VectorBuilder = PrimitiveVectorBuilder; -pub 
type UInt32VectorBuilder = PrimitiveVectorBuilder; -pub type UInt64VectorBuilder = PrimitiveVectorBuilder; +pub type Int8VectorBuilder = PrimitiveVectorBuilder; +pub type Int16VectorBuilder = PrimitiveVectorBuilder; +pub type Int32VectorBuilder = PrimitiveVectorBuilder; +pub type Int64VectorBuilder = PrimitiveVectorBuilder; -pub type Int8VectorBuilder = PrimitiveVectorBuilder; -pub type Int16VectorBuilder = PrimitiveVectorBuilder; -pub type Int32VectorBuilder = PrimitiveVectorBuilder; -pub type Int64VectorBuilder = PrimitiveVectorBuilder; +pub type Float32VectorBuilder = PrimitiveVectorBuilder; +pub type Float64VectorBuilder = PrimitiveVectorBuilder; -pub type Float32VectorBuilder = PrimitiveVectorBuilder; -pub type Float64VectorBuilder = PrimitiveVectorBuilder; +/// Builder to build a primitive vector. +pub struct PrimitiveVectorBuilder { + mutable_array: PrimitiveBuilder, +} -impl MutableVector for PrimitiveVectorBuilder { +impl MutableVector for PrimitiveVectorBuilder { fn data_type(&self) -> ConcreteDataType { T::build_data_type() } @@ -257,81 +310,62 @@ impl MutableVector for PrimitiveVectorBuilder { fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { let primitive = T::cast_value_ref(value)?; - self.mutable_array.push(primitive); + match primitive { + Some(v) => self.mutable_array.append_value(v.into_native()), + None => self.mutable_array.append_null(), + } Ok(()) } fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { let primitive = T::cast_vector(vector)?; // Slice the underlying array to avoid creating a new Arc. - let slice = primitive.slice(offset, length); - self.mutable_array.extend_trusted_len(slice.iter()); + let slice = primitive.get_slice(offset, length); + for v in slice.iter_data() { + self.push(v); + } Ok(()) } } impl ScalarVectorBuilder for PrimitiveVectorBuilder where - T: Scalar> + PrimitiveElement, - for<'a> T: ScalarRef<'a, ScalarType = T, VectorType = PrimitiveVector>, - for<'a> T: Scalar = T>, + T: LogicalPrimitiveType, + T::Wrapper: Scalar>, + for<'a> T::Wrapper: ScalarRef<'a, ScalarType = T::Wrapper>, + for<'a> T::Wrapper: Scalar = T::Wrapper>, { type VectorType = PrimitiveVector; fn with_capacity(capacity: usize) -> Self { Self { - mutable_array: MutablePrimitiveArray::with_capacity(capacity), + mutable_array: PrimitiveBuilder::with_capacity(capacity), } } fn push(&mut self, value: Option<::RefItem<'_>>) { - self.mutable_array.push(value); + self.mutable_array + .append_option(value.map(|v| v.into_native())); } fn finish(&mut self) -> Self::VectorType { PrimitiveVector { - array: std::mem::take(&mut self.mutable_array).into(), + array: self.mutable_array.finish(), } } } -impl PrimitiveVectorBuilder { - fn with_type_capacity(data_type: ConcreteDataType, capacity: usize) -> Self { - Self { - mutable_array: MutablePrimitiveArray::with_capacity_from( - capacity, - data_type.as_arrow_type(), - ), - } - } -} - -pub(crate) fn replicate_primitive( - vector: &PrimitiveVector, - offsets: &[usize], -) -> VectorRef { - Arc::new(replicate_primitive_with_type( - vector, - offsets, - T::build_data_type(), - )) -} - -pub(crate) fn replicate_primitive_with_type( +pub(crate) fn replicate_primitive( vector: &PrimitiveVector, offsets: &[usize], - data_type: ConcreteDataType, ) -> PrimitiveVector { assert_eq!(offsets.len(), vector.len()); if offsets.is_empty() { - return vector.slice(0, 0); + return vector.get_slice(0, 0); } - let mut builder = PrimitiveVectorBuilder::::with_type_capacity( - data_type, - 
-pub(crate) fn replicate_primitive<T: PrimitiveElement>(
-    vector: &PrimitiveVector<T>,
-    offsets: &[usize],
-) -> VectorRef {
-    Arc::new(replicate_primitive_with_type(
-        vector,
-        offsets,
-        T::build_data_type(),
-    ))
-}
-
-pub(crate) fn replicate_primitive_with_type<T: PrimitiveElement>(
+pub(crate) fn replicate_primitive<T: LogicalPrimitiveType>(
     vector: &PrimitiveVector<T>,
     offsets: &[usize],
-    data_type: ConcreteDataType,
 ) -> PrimitiveVector<T> {
     assert_eq!(offsets.len(), vector.len());
 
     if offsets.is_empty() {
-        return vector.slice(0, 0);
+        return vector.get_slice(0, 0);
     }
 
-    let mut builder = PrimitiveVectorBuilder::<T>::with_type_capacity(
-        data_type,
-        *offsets.last().unwrap() as usize,
-    );
+    let mut builder = PrimitiveVectorBuilder::<T>::with_capacity(*offsets.last().unwrap() as usize);
 
     let mut previous_offset = 0;
 
@@ -339,14 +373,15 @@ pub(crate) fn replicate_primitive_with_type(
         let repeat_times = *offset - previous_offset;
         match value {
             Some(data) => {
-                builder.mutable_array.extend_trusted_len(
-                    std::iter::repeat(*data)
-                        .take(repeat_times)
-                        .map(Option::Some),
-                );
+                unsafe {
+                    // Safety: std::iter::Repeat and std::iter::Take implement TrustedLen.
+                    builder
+                        .mutable_array
+                        .append_trusted_len_iter(std::iter::repeat(data).take(repeat_times));
+                }
             }
             None => {
-                builder.mutable_array.extend_constant(repeat_times, None);
+                builder.mutable_array.append_nulls(repeat_times);
             }
         }
         previous_offset = *offset;
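The `offsets` contract of `replicate_primitive` is easy to misread: each entry is a cumulative output length, not a per-element repeat count. A worked illustration:

```rust
// offsets[i] is the cumulative output length after element i has been
// repeated, so element i is emitted offsets[i] - offsets[i - 1] times.
//
// vector  = [10, 11, 12]
// offsets = [ 2,  2,  5]
// result  = [10, 10, 12, 12, 12]   // 11 is repeated zero times
```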
@@ -356,6 +391,7 @@
 
 #[cfg(test)]
 mod tests {
+    use arrow::array::Int32Array;
     use arrow::datatypes::DataType as ArrowDataType;
     use serde_json;
 
@@ -364,11 +400,11 @@ mod tests {
     use crate::serialize::Serializable;
     use crate::types::Int64Type;
 
-    fn check_vec(v: PrimitiveVector<i32>) {
+    fn check_vec(v: Int32Vector) {
         assert_eq!(4, v.len());
         assert_eq!("Int32Vector", v.vector_type_name());
         assert!(!v.is_const());
-        assert_eq!(Validity::AllValid, v.validity());
+        assert!(v.validity().is_all_valid());
         assert!(!v.only_null());
 
         for i in 0..4 {
@@ -387,26 +423,26 @@
 
     #[test]
     fn test_from_values() {
-        let v = PrimitiveVector::<i32>::from_values(vec![1, 2, 3, 4]);
+        let v = Int32Vector::from_values(vec![1, 2, 3, 4]);
         check_vec(v);
     }
 
     #[test]
     fn test_from_vec() {
-        let v = PrimitiveVector::<i32>::from_vec(vec![1, 2, 3, 4]);
+        let v = Int32Vector::from_vec(vec![1, 2, 3, 4]);
         check_vec(v);
     }
 
     #[test]
     fn test_from_slice() {
-        let v = PrimitiveVector::<i32>::from_slice(vec![1, 2, 3, 4]);
+        let v = Int32Vector::from_slice(vec![1, 2, 3, 4]);
         check_vec(v);
     }
 
     #[test]
     fn test_serialize_primitive_vector_with_null_to_json() {
         let input = [Some(1i32), Some(2i32), None, Some(4i32), None];
-        let mut builder = PrimitiveVectorBuilder::with_capacity(input.len());
+        let mut builder = Int32VectorBuilder::with_capacity(input.len());
         for v in input {
             builder.push(v);
         }
@@ -421,15 +457,15 @@
 
     #[test]
     fn test_from_arrow_array() {
-        let arrow_array = PrimitiveArray::from_slice(vec![1, 2, 3, 4]);
-        let v = PrimitiveVector::from(arrow_array);
+        let arrow_array = Int32Array::from(vec![1, 2, 3, 4]);
+        let v = Int32Vector::from(arrow_array);
         check_vec(v);
     }
 
     #[test]
     fn test_primitive_vector_build_get() {
         let input = [Some(1i32), Some(2i32), None, Some(4i32), None];
-        let mut builder = PrimitiveVectorBuilder::with_capacity(input.len());
+        let mut builder = Int32VectorBuilder::with_capacity(input.len());
         for v in input {
             builder.push(v);
         }
@@ -448,29 +484,28 @@
 
     #[test]
     fn test_primitive_vector_validity() {
         let input = [Some(1i32), Some(2i32), None, None];
-        let mut builder = PrimitiveVectorBuilder::with_capacity(input.len());
+        let mut builder = Int32VectorBuilder::with_capacity(input.len());
         for v in input {
             builder.push(v);
         }
         let vector = builder.finish();
         assert_eq!(2, vector.null_count());
         let validity = vector.validity();
-        let slots = validity.slots().unwrap();
-        assert_eq!(2, slots.null_count());
-        assert!(!slots.get_bit(2));
-        assert!(!slots.get_bit(3));
+        assert_eq!(2, validity.null_count());
+        assert!(!validity.is_set(2));
+        assert!(!validity.is_set(3));
 
-        let vector = PrimitiveVector::<i32>::from_slice(vec![1, 2, 3, 4]);
+        let vector = Int32Vector::from_slice(vec![1, 2, 3, 4]);
         assert_eq!(0, vector.null_count());
-        assert_eq!(Validity::AllValid, vector.validity());
+        assert!(vector.validity().is_all_valid());
     }
 
     #[test]
     fn test_memory_size() {
-        let v = PrimitiveVector::<i32>::from_slice((0..5).collect::<Vec<i32>>());
-        assert_eq!(20, v.memory_size());
-        let v = PrimitiveVector::<i64>::from(vec![Some(0i64), Some(1i64), Some(2i64), None, None]);
-        assert_eq!(40, v.memory_size());
+        let v = Int32Vector::from_slice((0..5).collect::<Vec<i32>>());
+        assert_eq!(64, v.memory_size());
+        let v = Int64Vector::from(vec![Some(0i64), Some(1i64), Some(2i64), None, None]);
+        assert_eq!(128, v.memory_size());
     }
 
     #[test]
@@ -489,4 +524,29 @@
         let expect: VectorRef = Arc::new(Int64Vector::from_slice(&[123, 8, 9]));
         assert_eq!(expect, vector);
     }
+
+    #[test]
+    fn test_from_wrapper_slice() {
+        macro_rules! test_from_wrapper_slice {
+            ($vec: ident, $ty: ident) => {
+                let from_wrapper_slice = $vec::from_wrapper_slice(&[
+                    $ty::from_native($ty::MAX),
+                    $ty::from_native($ty::MIN),
+                ]);
+                let from_slice = $vec::from_slice(&[$ty::MAX, $ty::MIN]);
+                assert_eq!(from_wrapper_slice, from_slice);
+            };
+        }
+
+        test_from_wrapper_slice!(UInt8Vector, u8);
+        test_from_wrapper_slice!(Int8Vector, i8);
+        test_from_wrapper_slice!(UInt16Vector, u16);
+        test_from_wrapper_slice!(Int16Vector, i16);
+        test_from_wrapper_slice!(UInt32Vector, u32);
+        test_from_wrapper_slice!(Int32Vector, i32);
+        test_from_wrapper_slice!(UInt64Vector, u64);
+        test_from_wrapper_slice!(Int64Vector, i64);
+        test_from_wrapper_slice!(Float32Vector, f32);
+        test_from_wrapper_slice!(Float64Vector, f64);
+    }
 }
diff --git a/src/datatypes/src/vectors/string.rs b/src/datatypes/src/vectors/string.rs
index 638b04dd3eea..252116b3b2dd 100644
--- a/src/datatypes/src/vectors/string.rs
+++ b/src/datatypes/src/vectors/string.rs
@@ -15,22 +15,19 @@
 use std::any::Any;
 use std::sync::Arc;
 
-use arrow::array::{Array, ArrayRef, MutableArray, Utf8ValuesIter};
-use arrow::bitmap::utils::ZipValidity;
-use serde_json::Value as JsonValue;
-use snafu::{OptionExt, ResultExt};
+use arrow::array::{Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef};
+use snafu::ResultExt;
 
 use crate::arrow_array::{MutableStringArray, StringArray};
 use crate::data_type::ConcreteDataType;
-use crate::error::{Result, SerializeSnafu};
+use crate::error::{self, Result};
 use crate::scalars::{ScalarVector, ScalarVectorBuilder};
 use crate::serialize::Serializable;
-use crate::types::StringType;
 use crate::value::{Value, ValueRef};
 use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef};
 
-/// String array wrapper
-#[derive(Debug, Clone, PartialEq)]
+/// Vector of strings.
+#[derive(Debug, PartialEq)]
 pub struct StringVector {
     array: StringArray,
 }
@@ -39,6 +36,16 @@ impl StringVector {
     pub(crate) fn as_arrow(&self) -> &dyn Array {
         &self.array
     }
+
+    fn to_array_data(&self) -> ArrayData {
+        self.array.data().clone()
+    }
+
+    fn from_array_data(data: ArrayData) -> Self {
+        Self {
+            array: StringArray::from(data),
+        }
+    }
 }
 
 impl From<StringArray> for StringVector {
@@ -50,27 +57,39 @@ impl From<StringArray> for StringVector {
 impl From<Vec<Option<String>>> for StringVector {
     fn from(data: Vec<Option<String>>) -> Self {
         Self {
-            array: StringArray::from(data),
+            array: StringArray::from_iter(data),
         }
     }
 }
 
-impl From<Vec<String>> for StringVector {
-    fn from(data: Vec<String>) -> Self {
+impl From<Vec<Option<&str>>> for StringVector {
+    fn from(data: Vec<Option<&str>>) -> Self {
         Self {
-            array: StringArray::from(
-                data.into_iter()
-                    .map(Option::Some)
-                    .collect::<Vec<Option<String>>>(),
-            ),
+            array: StringArray::from_iter(data),
         }
     }
 }
 
-impl From<Vec<Option<&str>>> for StringVector {
-    fn from(data: Vec<Option<&str>>) -> Self {
+impl From<&[Option<String>]> for StringVector {
+    fn from(data: &[Option<String>]) -> Self {
         Self {
-            array: StringArray::from(data),
+            array: StringArray::from_iter(data),
+        }
+    }
+}
+
+impl From<&[Option<&str>]> for StringVector {
+    fn from(data: &[Option<&str>]) -> Self {
+        Self {
+            array: StringArray::from_iter(data),
+        }
+    }
+}
+
+impl From<Vec<String>> for StringVector {
+    fn from(data: Vec<String>) -> Self {
+        Self {
+            array: StringArray::from_iter(data.into_iter().map(Some)),
         }
     }
 }
@@ -78,18 +97,14 @@ impl From<Vec<Option<&str>>> for StringVector {
 impl From<Vec<&str>> for StringVector {
     fn from(data: Vec<&str>) -> Self {
         Self {
-            array: StringArray::from(
-                data.into_iter()
-                    .map(Option::Some)
-                    .collect::<Vec<Option<&str>>>(),
-            ),
+            array: StringArray::from_iter(data.into_iter().map(Some)),
         }
     }
 }
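With every conversion now routed through `StringArray::from_iter`, the `From` impls behave uniformly; a small sketch under the same assumed paths:

```rust
// Sketch only (assumed crate/module paths; not part of the patch).
use datatypes::vectors::StringVector;

fn main() {
    // All of these conversions funnel through StringArray::from_iter.
    let a = StringVector::from(vec!["a", "b"]);
    let b = StringVector::from(vec![Some("a"), Some("b")]);
    let c = StringVector::from(vec!["a".to_string(), "b".to_string()]);
    assert_eq!(a, b);
    assert_eq!(b, c);
}
```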
 impl Vector for StringVector {
     fn data_type(&self) -> ConcreteDataType {
-        ConcreteDataType::String(StringType::default())
+        ConcreteDataType::string_datatype()
     }
 
     fn vector_type_name(&self) -> String {
@@ -105,11 +120,13 @@ impl Vector for StringVector {
     }
 
     fn to_arrow_array(&self) -> ArrayRef {
-        Arc::new(self.array.clone())
+        let data = self.to_array_data();
+        Arc::new(StringArray::from(data))
     }
 
     fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
-        Box::new(self.array.clone())
+        let data = self.to_array_data();
+        Box::new(StringArray::from(data))
     }
 
     fn validity(&self) -> Validity {
@@ -117,7 +134,11 @@ impl Vector for StringVector {
     }
 
     fn memory_size(&self) -> usize {
-        self.len() * std::mem::size_of::<u8>() + self.array.values().len()
+        self.array.get_buffer_memory_size()
+    }
+
+    fn null_count(&self) -> usize {
+        self.array.null_count()
     }
 
     fn is_null(&self, row: usize) -> bool {
@@ -125,7 +146,8 @@ impl Vector for StringVector {
     }
 
     fn slice(&self, offset: usize, length: usize) -> VectorRef {
-        Arc::new(Self::from(self.array.slice(offset, length)))
+        let data = self.array.data().slice(offset, length);
+        Arc::new(Self::from_array_data(data))
     }
 
     fn get(&self, index: usize) -> Value {
@@ -140,7 +162,7 @@ impl Vector for StringVector {
 impl ScalarVector for StringVector {
     type OwnedItem = String;
     type RefItem<'a> = &'a str;
-    type Iter<'a> = ZipValidity<'a, &'a str, Utf8ValuesIter<'a, i32>>;
+    type Iter<'a> = ArrayIter<&'a StringArray>;
     type Builder = StringVectorBuilder;
 
     fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>> {
@@ -157,7 +179,7 @@ impl ScalarVector for StringVector {
 }
 
 pub struct StringVectorBuilder {
-    buffer: MutableStringArray,
+    mutable_array: MutableStringArray,
 }
 
 impl MutableVector for StringVectorBuilder {
@@ -166,7 +188,7 @@ impl MutableVector for StringVectorBuilder {
     }
 
     fn len(&self) -> usize {
-        self.buffer.len()
+        self.mutable_array.len()
     }
 
     fn as_any(&self) -> &dyn Any {
@@ -182,12 +204,15 @@ impl MutableVector for StringVectorBuilder {
     }
 
     fn push_value_ref(&mut self, value: ValueRef) -> Result<()> {
-        self.buffer.push(value.as_string()?);
+        match value.as_string()? {
+            Some(v) => self.mutable_array.append_value(v),
+            None => self.mutable_array.append_null(),
+        }
         Ok(())
     }
 
     fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> {
-        vectors::impl_extend_for_builder!(self.buffer, vector, StringVector, offset, length)
+        vectors::impl_extend_for_builder!(self, vector, StringVector, offset, length)
     }
 }
 
@@ -196,30 +221,30 @@ impl ScalarVectorBuilder for StringVectorBuilder {
 
     fn with_capacity(capacity: usize) -> Self {
         Self {
-            buffer: MutableStringArray::with_capacity(capacity),
+            mutable_array: MutableStringArray::with_capacity(capacity, 0),
        }
     }
 
     fn push(&mut self, value: Option<<Self::VectorType as ScalarVector>::RefItem<'_>>) {
-        self.buffer.push(value)
+        match value {
+            Some(v) => self.mutable_array.append_value(v),
+            None => self.mutable_array.append_null(),
+        }
     }
 
     fn finish(&mut self) -> Self::VectorType {
-        Self::VectorType {
-            array: std::mem::take(&mut self.buffer).into(),
+        StringVector {
+            array: self.mutable_array.finish(),
         }
     }
 }
 
 impl Serializable for StringVector {
-    fn serialize_to_json(&self) -> crate::error::Result<Vec<serde_json::Value>> {
+    fn serialize_to_json(&self) -> Result<Vec<serde_json::Value>> {
         self.iter_data()
-            .map(|v| match v {
-                None => Ok(serde_json::Value::Null),
-                Some(s) => serde_json::to_value(s),
-            })
+            .map(serde_json::to_value)
             .collect::<serde_json::Result<_>>()
-            .context(SerializeSnafu)
+            .context(error::SerializeSnafu)
     }
 }
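Since `StringVector` now serializes each element through `serde_json::to_value`, nulls come out as JSON `null`; a minimal sketch under the same assumed paths:

```rust
// Sketch only (assumed crate/module paths; not part of the patch).
use datatypes::serialize::Serializable;
use datatypes::vectors::StringVector;

fn main() {
    let v = StringVector::from(vec![Some("hello"), None]);
    let json = v.serialize_to_json().unwrap();
    assert_eq!(r#"["hello",null]"#, serde_json::to_string(&json).unwrap());
}
```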
@@ -227,11 +252,52 @@ vectors::impl_try_from_arrow_array_for_vector!(StringArray, StringVector);
 
 #[cfg(test)]
 mod tests {
-    use arrow::datatypes::DataType as ArrowDataType;
-    use serde_json;
+    use arrow::datatypes::DataType;
 
     use super::*;
-    use crate::data_type::DataType;
+
+    #[test]
+    fn test_string_vector_build_get() {
+        let mut builder = StringVectorBuilder::with_capacity(4);
+        builder.push(Some("hello"));
+        builder.push(None);
+        builder.push(Some("world"));
+        let vector = builder.finish();
+
+        assert_eq!(Some("hello"), vector.get_data(0));
+        assert_eq!(None, vector.get_data(1));
+        assert_eq!(Some("world"), vector.get_data(2));
+
+        // Get out of bound
+        assert!(vector.try_get(3).is_err());
+
+        assert_eq!(Value::String("hello".into()), vector.get(0));
+        assert_eq!(Value::Null, vector.get(1));
+        assert_eq!(Value::String("world".into()), vector.get(2));
+
+        let mut iter = vector.iter_data();
+        assert_eq!("hello", iter.next().unwrap().unwrap());
+        assert_eq!(None, iter.next().unwrap());
+        assert_eq!("world", iter.next().unwrap().unwrap());
+        assert_eq!(None, iter.next());
+    }
+
+    #[test]
+    fn test_string_vector_builder() {
+        let mut builder = StringVectorBuilder::with_capacity(3);
+        builder.push_value_ref(ValueRef::String("hello")).unwrap();
+        assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err());
+
+        let input = StringVector::from_slice(&["world", "one", "two"]);
+        builder.extend_slice_of(&input, 1, 2).unwrap();
+        assert!(builder
+            .extend_slice_of(&crate::vectors::Int32Vector::from_slice(&[13]), 0, 1)
+            .is_err());
+        let vector = builder.to_vector();
+
+        let expect: VectorRef = Arc::new(StringVector::from_slice(&["hello", "one", "two"]));
+        assert_eq!(expect, vector);
+    }
 
     #[test]
     fn test_string_vector_misc() {
@@ -240,9 +306,9 @@ mod tests {
         assert_eq!(3, v.len());
         assert_eq!("StringVector", v.vector_type_name());
         assert!(!v.is_const());
-        assert_eq!(Validity::AllValid, v.validity());
+        assert!(v.validity().is_all_valid());
         assert!(!v.only_null());
-        assert_eq!(41, v.memory_size());
+        assert_eq!(128, v.memory_size());
 
         for (i, s) in strs.iter().enumerate() {
             assert_eq!(Value::from(*s), v.get(i));
@@ -252,7 +318,7 @@ mod tests {
 
         let arrow_arr = v.to_arrow_array();
         assert_eq!(3, arrow_arr.len());
-        assert_eq!(&ArrowDataType::Utf8, arrow_arr.data_type());
+        assert_eq!(&DataType::Utf8, arrow_arr.data_type());
     }
 
     #[test]
@@ -270,11 +336,11 @@ mod tests {
     #[test]
     fn test_from_arrow_array() {
         let mut builder = MutableStringArray::new();
-        builder.push(Some("A"));
-        builder.push(Some("B"));
-        builder.push::<&str>(None);
-        builder.push(Some("D"));
-        let string_array: StringArray = builder.into();
+        builder.append_option(Some("A"));
+        builder.append_option(Some("B"));
+        builder.append_null();
+        builder.append_option(Some("D"));
+        let string_array: StringArray = builder.finish();
         let vector = StringVector::from(string_array);
         assert_eq!(
             r#"["A","B",null,"D"]"#,
@@ -283,45 +349,22 @@ mod tests {
     }
 
     #[test]
-    fn test_string_vector_build_get() {
-        let mut builder = StringVectorBuilder::with_capacity(4);
-        builder.push(Some("hello"));
-        builder.push(None);
-        builder.push(Some("world"));
-        let vector = builder.finish();
-
-        assert_eq!(Some("hello"), vector.get_data(0));
-        assert_eq!(None, vector.get_data(1));
-        assert_eq!(Some("world"), vector.get_data(2));
-
-        // Get out of bound
-        assert!(vector.try_get(3).is_err());
-
-        assert_eq!(Value::String("hello".into()), vector.get(0));
-        assert_eq!(Value::Null, vector.get(1));
-        assert_eq!(Value::String("world".into()), vector.get(2));
-
-        let mut iter = vector.iter_data();
-        assert_eq!("hello", iter.next().unwrap().unwrap());
-        assert_eq!(None, iter.next().unwrap());
-        assert_eq!("world", iter.next().unwrap().unwrap());
-        assert_eq!(None, iter.next());
-    }
-
-    #[test]
-    fn test_string_vector_builder() {
-        let mut builder = StringType::default().create_mutable_vector(3);
-        builder.push_value_ref(ValueRef::String("hello")).unwrap();
-        assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err());
-
-        let input = StringVector::from_slice(&["world", "one", "two"]);
-        builder.extend_slice_of(&input, 1, 2).unwrap();
-        assert!(builder
-            .extend_slice_of(&crate::vectors::Int32Vector::from_slice(&[13]), 0, 1)
-            .is_err());
-        let vector = builder.to_vector();
-
-        let expect: VectorRef = Arc::new(StringVector::from_slice(&["hello", "one", "two"]));
-        assert_eq!(expect, vector);
+    fn test_from_non_option_string() {
+        let nul = String::from_utf8(vec![0]).unwrap();
+        let corpus = vec!["😅😅😅", "😍😍😍😍", "🥵🥵", nul.as_str()];
+        let vector = StringVector::from(corpus);
+        let serialized = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap();
+        assert_eq!(r#"["😅😅😅","😍😍😍😍","🥵🥵","\u0000"]"#, serialized);
+
+        let corpus = vec![
+            "🀀🀀🀀".to_string(),
+            "🀁🀁🀁".to_string(),
+            "🀂🀂🀂".to_string(),
+            "🀃🀃🀃".to_string(),
+            "🀆🀆".to_string(),
+        ];
+        let vector = StringVector::from(corpus);
+        let serialized = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap();
+        assert_eq!(r#"["🀀🀀🀀","🀁🀁🀁","🀂🀂🀂","🀃🀃🀃","🀆🀆"]"#, serialized);
     }
 }
diff --git a/src/datatypes/src/vectors/timestamp.rs b/src/datatypes/src/vectors/timestamp.rs
index 62b8332c895e..5d9f7f2ed1fc 100644
--- a/src/datatypes/src/vectors/timestamp.rs
+++ b/src/datatypes/src/vectors/timestamp.rs
@@ -12,308 +12,20 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use std::any::Any;
-use std::sync::Arc;
-
-use arrow::array::{Array, ArrayRef, PrimitiveArray};
-use common_time::timestamp::{TimeUnit, Timestamp};
-use snafu::OptionExt;
-
-use crate::data_type::{ConcreteDataType, DataType};
-use crate::error;
-use crate::error::Result;
-use crate::prelude::{
-    MutableVector, ScalarVector, ScalarVectorBuilder, Validity, Value, ValueRef, Vector, VectorRef,
+use crate::types::{
+    TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType,
+    TimestampSecondType,
 };
-use crate::serialize::Serializable;
-use crate::types::TimestampType;
-use crate::vectors::{PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder};
-
-/// `TimestampVector` stores timestamp in millisecond since UNIX Epoch.
-#[derive(Debug, Clone, PartialEq)]
-pub struct TimestampVector {
-    array: PrimitiveVector<i64>,
-}
-
-impl TimestampVector {
-    pub fn new(array: PrimitiveArray<i64>) -> Self {
-        Self {
-            array: PrimitiveVector { array },
-        }
-    }
-
-    pub fn try_from_arrow_array(array: impl AsRef<dyn Array>) -> Result<Self> {
-        Ok(Self::new(
-            array
-                .as_ref()
-                .as_any()
-                .downcast_ref::<PrimitiveArray<i64>>()
-                .with_context(|| error::ConversionSnafu {
-                    from: format!("{:?}", array.as_ref().data_type()),
-                })?
-                .clone(),
-        ))
-    }
-
-    pub fn from_values<I: IntoIterator<Item = i64>>(iter: I) -> Self {
-        Self {
-            array: PrimitiveVector {
-                array: PrimitiveArray::from_values(iter),
-            },
-        }
-    }
-
-    pub(crate) fn as_arrow(&self) -> &dyn Array {
-        self.array.as_arrow()
-    }
-}
-
-impl Vector for TimestampVector {
-    fn data_type(&self) -> ConcreteDataType {
-        ConcreteDataType::timestamp_millis_datatype()
-    }
-
-    fn vector_type_name(&self) -> String {
-        "TimestampVector".to_string()
-    }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn len(&self) -> usize {
-        self.array.len()
-    }
-
-    fn to_arrow_array(&self) -> ArrayRef {
-        let validity = self.array.array.validity().cloned();
-        let buffer = self.array.array.values().clone();
-        Arc::new(PrimitiveArray::new(
-            TimestampType::new(TimeUnit::Millisecond).as_arrow_type(),
-            buffer,
-            validity,
-        ))
-    }
-
-    fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
-        let validity = self.array.array.validity().cloned();
-        let values = self.array.array.values().clone();
-        Box::new(PrimitiveArray::new(
-            arrow::datatypes::DataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None),
-            values,
-            validity,
-        ))
-    }
-
-    fn validity(&self) -> Validity {
-        self.array.validity()
-    }
-
-    fn memory_size(&self) -> usize {
-        self.array.memory_size()
-    }
-
-    fn is_null(&self, row: usize) -> bool {
-        self.array.is_null(row)
-    }
-
-    fn slice(&self, offset: usize, length: usize) -> VectorRef {
-        Arc::new(Self {
-            array: PrimitiveVector {
-                array: self.array.array.slice(offset, length),
-            },
-        })
-    }
-
-    fn get(&self, index: usize) -> Value {
-        match self.array.get(index) {
-            Value::Null => Value::Null,
-            Value::Int64(v) => Value::Timestamp(Timestamp::from_millis(v)),
-            _ => {
-                unreachable!()
-            }
-        }
-    }
-
-    fn get_ref(&self, index: usize) -> ValueRef {
-        match self.array.get(index) {
-            Value::Int64(v) => ValueRef::Timestamp(Timestamp::from_millis(v)),
-            Value::Null => ValueRef::Null,
-            _ => unreachable!(),
-        }
-    }
-}
-
-impl Serializable for TimestampVector {
-    fn serialize_to_json(&self) -> Result<Vec<serde_json::Value>> {
-        Ok(self
-            .array
-            .iter_data()
-            .map(|v| match v {
-                None => serde_json::Value::Null,
-                Some(v) => v.into(),
-            })
-            .collect::<Vec<_>>())
-    }
-}
-
-impl ScalarVector for TimestampVector {
-    type OwnedItem = Timestamp;
-    type RefItem<'a> = Timestamp;
-    type Iter<'a> = TimestampDataIter<'a>;
-    type Builder = TimestampVectorBuilder;
-
-    fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>> {
-        self.array.get_data(idx).map(Timestamp::from_millis)
-    }
-
-    fn iter_data(&self) -> Self::Iter<'_> {
-        TimestampDataIter {
-            iter: self.array.iter_data(),
-        }
-    }
-}
-
-pub struct TimestampDataIter<'a> {
-    iter: PrimitiveIter<'a, i64>,
-}
-
-impl<'a> Iterator for TimestampDataIter<'a> {
-    type Item = Option<Timestamp>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        self.iter.next().map(|v| v.map(Timestamp::from_millis))
-    }
-}
-
-pub struct TimestampVectorBuilder {
-    buffer: PrimitiveVectorBuilder<i64>,
-}
-
-impl MutableVector for TimestampVectorBuilder {
-    fn data_type(&self) -> ConcreteDataType {
-        ConcreteDataType::timestamp_millis_datatype()
-    }
-
-    fn len(&self) -> usize {
-        self.buffer.len()
-    }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn as_mut_any(&mut self) -> &mut dyn Any {
-        self
-    }
-
-    fn to_vector(&mut self) -> VectorRef {
-        Arc::new(self.finish())
-    }
-
-    fn push_value_ref(&mut self, value: ValueRef) -> Result<()> {
-        // TODO(hl): vector and vector builder should also support customized time unit.
-        self.buffer.push(
-            value
-                .as_timestamp()?
-                .map(|t| t.convert_to(TimeUnit::Millisecond)),
-        );
-        Ok(())
-    }
-
-    fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> {
-        let concrete_vector = vector
-            .as_any()
-            .downcast_ref::<TimestampVector>()
-            .with_context(|| error::CastTypeSnafu {
-                msg: format!(
-                    "Failed to convert vector from {} to DateVector",
-                    vector.vector_type_name()
-                ),
-            })?;
-
-        self.buffer
-            .extend_slice_of(&concrete_vector.array, offset, length)?;
-        Ok(())
-    }
-}
-
-impl ScalarVectorBuilder for TimestampVectorBuilder {
-    type VectorType = TimestampVector;
-
-    fn with_capacity(capacity: usize) -> Self {
-        Self {
-            buffer: PrimitiveVectorBuilder::with_capacity(capacity),
-        }
-    }
-
-    /// Pushes a Timestamp value into vector builder. The timestamp must be with time unit
-    /// `Second`/`MilliSecond`/`Microsecond`.
-    fn push(&mut self, value: Option<<Self::VectorType as ScalarVector>::RefItem<'_>>) {
-        self.buffer
-            .push(value.map(|v| v.convert_to(TimeUnit::Millisecond)));
-    }
-
-    fn finish(&mut self) -> Self::VectorType {
-        Self::VectorType {
-            array: self.buffer.finish(),
-        }
-    }
-}
-
-pub(crate) fn replicate_timestamp(vector: &TimestampVector, offsets: &[usize]) -> VectorRef {
-    let array = crate::vectors::primitive::replicate_primitive_with_type(
-        &vector.array,
-        offsets,
-        vector.data_type(),
-    );
-    Arc::new(TimestampVector { array })
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    pub fn test_build_timestamp_vector() {
-        let mut builder = TimestampVectorBuilder::with_capacity(3);
-        builder.push(Some(Timestamp::new(1, TimeUnit::Second)));
-        builder.push(None);
-        builder.push(Some(Timestamp::new(2, TimeUnit::Millisecond)));
+use crate::vectors::{PrimitiveVector, PrimitiveVectorBuilder};
 
-        let vector = builder.finish();
-        assert_eq!(
-            ConcreteDataType::timestamp_millis_datatype(),
-            vector.data_type()
-        );
-        assert_eq!(3, vector.len());
-        assert_eq!(
-            Value::Timestamp(Timestamp::new(1000, TimeUnit::Millisecond)),
-            vector.get(0)
-        );
+pub type TimestampSecondVector = PrimitiveVector<TimestampSecondType>;
+pub type TimestampSecondVectorBuilder = PrimitiveVectorBuilder<TimestampSecondType>;
 
-        assert_eq!(Value::Null, vector.get(1));
-        assert_eq!(
-            Value::Timestamp(Timestamp::new(2, TimeUnit::Millisecond)),
-            vector.get(2)
-        );
+pub type TimestampMillisecondVector = PrimitiveVector<TimestampMillisecondType>;
+pub type TimestampMillisecondVectorBuilder = PrimitiveVectorBuilder<TimestampMillisecondType>;
 
-        assert_eq!(
-            vec![
-                Some(Timestamp::new(1000, TimeUnit::Millisecond)),
-                None,
-                Some(Timestamp::new(2, TimeUnit::Millisecond)),
-            ],
-            vector.iter_data().collect::<Vec<_>>()
-        );
-    }
+pub type TimestampMicrosecondVector = PrimitiveVector<TimestampMicrosecondType>;
+pub type TimestampMicrosecondVectorBuilder = PrimitiveVectorBuilder<TimestampMicrosecondType>;
 
-    #[test]
-    fn test_timestamp_from_arrow() {
-        let vector =
-            TimestampVector::from_slice(&[Timestamp::from_millis(1), Timestamp::from_millis(2)]);
-        let arrow = vector.as_arrow().slice(0, vector.len());
-        let vector2 = TimestampVector::try_from_arrow_array(&arrow).unwrap();
-        assert_eq!(vector, vector2);
-    }
-}
+pub type TimestampNanosecondVector = PrimitiveVector<TimestampNanosecondType>;
+pub type TimestampNanosecondVectorBuilder = PrimitiveVectorBuilder<TimestampNanosecondType>;
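The per-unit aliases replace the old millisecond-only `TimestampVector`; a minimal sketch under the same assumed paths:

```rust
// Sketch only (assumed crate/module paths; not part of the patch).
use datatypes::prelude::Vector;
use datatypes::vectors::{TimestampMillisecondVector, TimestampSecondVector};

fn main() {
    // Values are raw i64 counts in the unit carried by the type itself;
    // nothing converts seconds to milliseconds implicitly anymore.
    let millis = TimestampMillisecondVector::from_values(vec![1_000, 2_000]);
    let secs = TimestampSecondVector::from_values(vec![1, 2]);
    assert_eq!(2, millis.len());
    assert_eq!(2, secs.len());
}
```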
diff --git a/src/datatypes2/src/vectors/validity.rs b/src/datatypes/src/vectors/validity.rs
similarity index 100%
rename from src/datatypes2/src/vectors/validity.rs
rename to src/datatypes/src/vectors/validity.rs
diff --git a/src/datatypes2/Cargo.toml b/src/datatypes2/Cargo.toml
deleted file mode 100644
index ea6021954425..000000000000
--- a/src/datatypes2/Cargo.toml
+++ /dev/null
@@ -1,24 +0,0 @@
-[package]
-name = "datatypes2"
-version = "0.1.0"
-edition = "2021"
-license = "Apache-2.0"
-
-[features]
-default = []
-test = []
-
-[dependencies]
-arrow = "26.0"
-common-base = { path = "../common/base" }
-common-error = { path = "../common/error" }
-common-time = { path = "../common/time" }
-datafusion-common = "14.0"
-enum_dispatch = "0.3"
-num = "0.4"
-num-traits = "0.2"
-ordered-float = { version = "3.0", features = ["serde"] }
-paste = "1.0"
-serde = { version = "1.0", features = ["derive"] }
-serde_json = "1.0"
-snafu = { version = "0.7", features = ["backtraces"] }
diff --git a/src/datatypes2/src/arrow_array.rs b/src/datatypes2/src/arrow_array.rs
deleted file mode 100644
index 7405c8a665af..000000000000
--- a/src/datatypes2/src/arrow_array.rs
+++ /dev/null
@@ -1,242 +0,0 @@
-// Copyright 2022 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the 
"License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use arrow::array::{ - Array, BooleanArray, Date32Array, Date64Array, Float32Array, Float64Array, Int16Array, - Int32Array, Int64Array, Int8Array, ListArray, UInt16Array, UInt32Array, UInt64Array, - UInt8Array, -}; -use arrow::datatypes::DataType; -use common_time::timestamp::TimeUnit; -use common_time::Timestamp; -use snafu::OptionExt; - -use crate::data_type::ConcreteDataType; -use crate::error::{ConversionSnafu, Result}; -use crate::value::{ListValue, Value}; - -pub type BinaryArray = arrow::array::LargeBinaryArray; -pub type MutableBinaryArray = arrow::array::LargeBinaryBuilder; -pub type StringArray = arrow::array::StringArray; -pub type MutableStringArray = arrow::array::StringBuilder; - -macro_rules! cast_array { - ($arr: ident, $CastType: ty) => { - $arr.as_any() - .downcast_ref::<$CastType>() - .with_context(|| ConversionSnafu { - from: format!("{:?}", $arr.data_type()), - })? - }; -} - -// TODO(yingwen): Remove this function. -pub fn arrow_array_get(array: &dyn Array, idx: usize) -> Result { - if array.is_null(idx) { - return Ok(Value::Null); - } - - let result = match array.data_type() { - DataType::Null => Value::Null, - DataType::Boolean => Value::Boolean(cast_array!(array, BooleanArray).value(idx)), - DataType::Binary => Value::Binary(cast_array!(array, BinaryArray).value(idx).into()), - DataType::Int8 => Value::Int8(cast_array!(array, Int8Array).value(idx)), - DataType::Int16 => Value::Int16(cast_array!(array, Int16Array).value(idx)), - DataType::Int32 => Value::Int32(cast_array!(array, Int32Array).value(idx)), - DataType::Int64 => Value::Int64(cast_array!(array, Int64Array).value(idx)), - DataType::UInt8 => Value::UInt8(cast_array!(array, UInt8Array).value(idx)), - DataType::UInt16 => Value::UInt16(cast_array!(array, UInt16Array).value(idx)), - DataType::UInt32 => Value::UInt32(cast_array!(array, UInt32Array).value(idx)), - DataType::UInt64 => Value::UInt64(cast_array!(array, UInt64Array).value(idx)), - DataType::Float32 => Value::Float32(cast_array!(array, Float32Array).value(idx).into()), - DataType::Float64 => Value::Float64(cast_array!(array, Float64Array).value(idx).into()), - DataType::Utf8 => Value::String(cast_array!(array, StringArray).value(idx).into()), - DataType::Date32 => Value::Date(cast_array!(array, Date32Array).value(idx).into()), - DataType::Date64 => Value::DateTime(cast_array!(array, Date64Array).value(idx).into()), - DataType::Timestamp(t, _) => match t { - arrow::datatypes::TimeUnit::Second => Value::Timestamp(Timestamp::new( - cast_array!(array, arrow::array::TimestampSecondArray).value(idx), - TimeUnit::Second, - )), - arrow::datatypes::TimeUnit::Millisecond => Value::Timestamp(Timestamp::new( - cast_array!(array, arrow::array::TimestampMillisecondArray).value(idx), - TimeUnit::Millisecond, - )), - arrow::datatypes::TimeUnit::Microsecond => Value::Timestamp(Timestamp::new( - cast_array!(array, arrow::array::TimestampMicrosecondArray).value(idx), - TimeUnit::Microsecond, - )), - arrow::datatypes::TimeUnit::Nanosecond => Value::Timestamp(Timestamp::new( 
- cast_array!(array, arrow::array::TimestampNanosecondArray).value(idx), - TimeUnit::Nanosecond, - )), - }, - DataType::List(_) => { - let array = cast_array!(array, ListArray).value(idx); - let item_type = ConcreteDataType::try_from(array.data_type())?; - let values = (0..array.len()) - .map(|i| arrow_array_get(&*array, i)) - .collect::>>()?; - Value::List(ListValue::new(Some(Box::new(values)), item_type)) - } - _ => unimplemented!("Arrow array datatype: {:?}", array.data_type()), - }; - - Ok(result) -} - -#[cfg(test)] -mod test { - use std::sync::Arc; - - use arrow::array::{ - BooleanArray, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, - LargeBinaryArray, TimestampMicrosecondArray, TimestampMillisecondArray, - TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, - UInt8Array, - }; - use arrow::datatypes::Int32Type; - use common_time::timestamp::{TimeUnit, Timestamp}; - use paste::paste; - - use super::*; - use crate::data_type::ConcreteDataType; - use crate::types::TimestampType; - - macro_rules! test_arrow_array_get_for_timestamps { - ( $($unit: ident), *) => { - $( - paste! { - let mut builder = arrow::array::[]::builder(3); - builder.append_value(1); - builder.append_value(0); - builder.append_value(-1); - let ts_array = Arc::new(builder.finish()) as Arc; - let v = arrow_array_get(&ts_array, 1).unwrap(); - assert_eq!( - ConcreteDataType::Timestamp(TimestampType::$unit( - $crate::types::[]::default(), - )), - v.data_type() - ); - } - )* - }; - } - - #[test] - fn test_timestamp_array() { - test_arrow_array_get_for_timestamps![Second, Millisecond, Microsecond, Nanosecond]; - } - - #[test] - fn test_arrow_array_access() { - let array1 = BooleanArray::from(vec![true, true, false, false]); - assert_eq!(Value::Boolean(true), arrow_array_get(&array1, 1).unwrap()); - let array1 = Int8Array::from(vec![1, 2, 3, 4]); - assert_eq!(Value::Int8(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = UInt8Array::from(vec![1, 2, 3, 4]); - assert_eq!(Value::UInt8(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = Int16Array::from(vec![1, 2, 3, 4]); - assert_eq!(Value::Int16(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = UInt16Array::from(vec![1, 2, 3, 4]); - assert_eq!(Value::UInt16(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = Int32Array::from(vec![1, 2, 3, 4]); - assert_eq!(Value::Int32(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = UInt32Array::from(vec![1, 2, 3, 4]); - assert_eq!(Value::UInt32(2), arrow_array_get(&array1, 1).unwrap()); - let array = Int64Array::from(vec![1, 2, 3, 4]); - assert_eq!(Value::Int64(2), arrow_array_get(&array, 1).unwrap()); - let array1 = UInt64Array::from(vec![1, 2, 3, 4]); - assert_eq!(Value::UInt64(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = Float32Array::from(vec![1f32, 2f32, 3f32, 4f32]); - assert_eq!( - Value::Float32(2f32.into()), - arrow_array_get(&array1, 1).unwrap() - ); - let array1 = Float64Array::from(vec![1f64, 2f64, 3f64, 4f64]); - assert_eq!( - Value::Float64(2f64.into()), - arrow_array_get(&array1, 1).unwrap() - ); - - let array2 = StringArray::from(vec![Some("hello"), None, Some("world")]); - assert_eq!( - Value::String("hello".into()), - arrow_array_get(&array2, 0).unwrap() - ); - assert_eq!(Value::Null, arrow_array_get(&array2, 1).unwrap()); - - let array3 = LargeBinaryArray::from(vec![ - Some("hello".as_bytes()), - None, - Some("world".as_bytes()), - ]); - assert_eq!(Value::Null, arrow_array_get(&array3, 1).unwrap()); - - let array 
= TimestampSecondArray::from(vec![1, 2, 3]); - let value = arrow_array_get(&array, 1).unwrap(); - assert_eq!(value, Value::Timestamp(Timestamp::new(2, TimeUnit::Second))); - let array = TimestampMillisecondArray::from(vec![1, 2, 3]); - let value = arrow_array_get(&array, 1).unwrap(); - assert_eq!( - value, - Value::Timestamp(Timestamp::new(2, TimeUnit::Millisecond)) - ); - let array = TimestampMicrosecondArray::from(vec![1, 2, 3]); - let value = arrow_array_get(&array, 1).unwrap(); - assert_eq!( - value, - Value::Timestamp(Timestamp::new(2, TimeUnit::Microsecond)) - ); - let array = TimestampNanosecondArray::from(vec![1, 2, 3]); - let value = arrow_array_get(&array, 1).unwrap(); - assert_eq!( - value, - Value::Timestamp(Timestamp::new(2, TimeUnit::Nanosecond)) - ); - - // test list array - let data = vec![ - Some(vec![Some(1), Some(2), Some(3)]), - None, - Some(vec![Some(4), None, Some(6)]), - ]; - let arrow_array = ListArray::from_iter_primitive::(data); - - let v0 = arrow_array_get(&arrow_array, 0).unwrap(); - match v0 { - Value::List(list) => { - assert!(matches!(list.datatype(), ConcreteDataType::Int32(_))); - let items = list.items().as_ref().unwrap(); - assert_eq!( - **items, - vec![Value::Int32(1), Value::Int32(2), Value::Int32(3)] - ); - } - _ => unreachable!(), - } - - assert_eq!(Value::Null, arrow_array_get(&arrow_array, 1).unwrap()); - let v2 = arrow_array_get(&arrow_array, 2).unwrap(); - match v2 { - Value::List(list) => { - assert!(matches!(list.datatype(), ConcreteDataType::Int32(_))); - let items = list.items().as_ref().unwrap(); - assert_eq!(**items, vec![Value::Int32(4), Value::Null, Value::Int32(6)]); - } - _ => unreachable!(), - } - } -} diff --git a/src/datatypes2/src/data_type.rs b/src/datatypes2/src/data_type.rs deleted file mode 100644 index 0d06d566b667..000000000000 --- a/src/datatypes2/src/data_type.rs +++ /dev/null @@ -1,486 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::sync::Arc; - -use arrow::datatypes::{DataType as ArrowDataType, TimeUnit as ArrowTimeUnit}; -use common_time::timestamp::TimeUnit; -use paste::paste; -use serde::{Deserialize, Serialize}; - -use crate::error::{self, Error, Result}; -use crate::type_id::LogicalTypeId; -use crate::types::{ - BinaryType, BooleanType, DateTimeType, DateType, Float32Type, Float64Type, Int16Type, - Int32Type, Int64Type, Int8Type, ListType, NullType, StringType, TimestampMicrosecondType, - TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, TimestampType, - UInt16Type, UInt32Type, UInt64Type, UInt8Type, -}; -use crate::value::Value; -use crate::vectors::MutableVector; - -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] -#[enum_dispatch::enum_dispatch(DataType)] -pub enum ConcreteDataType { - Null(NullType), - Boolean(BooleanType), - - // Numeric types: - Int8(Int8Type), - Int16(Int16Type), - Int32(Int32Type), - Int64(Int64Type), - UInt8(UInt8Type), - UInt16(UInt16Type), - UInt32(UInt32Type), - UInt64(UInt64Type), - Float32(Float32Type), - Float64(Float64Type), - - // String types: - Binary(BinaryType), - String(StringType), - - // Date types: - Date(DateType), - DateTime(DateTimeType), - Timestamp(TimestampType), - - // Compound types: - List(ListType), -} - -// TODO(yingwen): Refactor these `is_xxx()` methods, such as adding a `properties()` method -// returning all these properties to the `DataType` trait -impl ConcreteDataType { - pub fn is_float(&self) -> bool { - matches!( - self, - ConcreteDataType::Float64(_) | ConcreteDataType::Float32(_) - ) - } - - pub fn is_boolean(&self) -> bool { - matches!(self, ConcreteDataType::Boolean(_)) - } - - pub fn is_stringifiable(&self) -> bool { - matches!( - self, - ConcreteDataType::String(_) - | ConcreteDataType::Date(_) - | ConcreteDataType::DateTime(_) - | ConcreteDataType::Timestamp(_) - ) - } - - pub fn is_signed(&self) -> bool { - matches!( - self, - ConcreteDataType::Int8(_) - | ConcreteDataType::Int16(_) - | ConcreteDataType::Int32(_) - | ConcreteDataType::Int64(_) - | ConcreteDataType::Date(_) - | ConcreteDataType::DateTime(_) - | ConcreteDataType::Timestamp(_) - ) - } - - pub fn is_unsigned(&self) -> bool { - matches!( - self, - ConcreteDataType::UInt8(_) - | ConcreteDataType::UInt16(_) - | ConcreteDataType::UInt32(_) - | ConcreteDataType::UInt64(_) - ) - } - - pub fn numerics() -> Vec { - vec![ - ConcreteDataType::int8_datatype(), - ConcreteDataType::int16_datatype(), - ConcreteDataType::int32_datatype(), - ConcreteDataType::int64_datatype(), - ConcreteDataType::uint8_datatype(), - ConcreteDataType::uint16_datatype(), - ConcreteDataType::uint32_datatype(), - ConcreteDataType::uint64_datatype(), - ConcreteDataType::float32_datatype(), - ConcreteDataType::float64_datatype(), - ] - } - - /// Convert arrow data type to [ConcreteDataType]. - /// - /// # Panics - /// Panic if given arrow data type is not supported. 
- pub fn from_arrow_type(dt: &ArrowDataType) -> Self { - ConcreteDataType::try_from(dt).expect("Unimplemented type") - } - - pub fn is_null(&self) -> bool { - matches!(self, ConcreteDataType::Null(NullType)) - } -} - -impl TryFrom<&ArrowDataType> for ConcreteDataType { - type Error = Error; - - fn try_from(dt: &ArrowDataType) -> Result { - let concrete_type = match dt { - ArrowDataType::Null => Self::null_datatype(), - ArrowDataType::Boolean => Self::boolean_datatype(), - ArrowDataType::UInt8 => Self::uint8_datatype(), - ArrowDataType::UInt16 => Self::uint16_datatype(), - ArrowDataType::UInt32 => Self::uint32_datatype(), - ArrowDataType::UInt64 => Self::uint64_datatype(), - ArrowDataType::Int8 => Self::int8_datatype(), - ArrowDataType::Int16 => Self::int16_datatype(), - ArrowDataType::Int32 => Self::int32_datatype(), - ArrowDataType::Int64 => Self::int64_datatype(), - ArrowDataType::Float32 => Self::float32_datatype(), - ArrowDataType::Float64 => Self::float64_datatype(), - ArrowDataType::Date32 => Self::date_datatype(), - ArrowDataType::Date64 => Self::datetime_datatype(), - ArrowDataType::Timestamp(u, _) => ConcreteDataType::from_arrow_time_unit(u), - ArrowDataType::Binary | ArrowDataType::LargeBinary => Self::binary_datatype(), - ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => Self::string_datatype(), - ArrowDataType::List(field) => Self::List(ListType::new( - ConcreteDataType::from_arrow_type(field.data_type()), - )), - _ => { - return error::UnsupportedArrowTypeSnafu { - arrow_type: dt.clone(), - } - .fail() - } - }; - - Ok(concrete_type) - } -} - -macro_rules! impl_new_concrete_type_functions { - ($($Type: ident), +) => { - paste! { - impl ConcreteDataType { - $( - pub fn [<$Type:lower _datatype>]() -> ConcreteDataType { - ConcreteDataType::$Type([<$Type Type>]::default()) - } - )+ - } - } - } -} - -impl_new_concrete_type_functions!( - Null, Boolean, UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64, - Binary, Date, DateTime, String -); - -impl ConcreteDataType { - pub fn timestamp_second_datatype() -> Self { - ConcreteDataType::Timestamp(TimestampType::Second(TimestampSecondType::default())) - } - - pub fn timestamp_millisecond_datatype() -> Self { - ConcreteDataType::Timestamp(TimestampType::Millisecond( - TimestampMillisecondType::default(), - )) - } - - pub fn timestamp_microsecond_datatype() -> Self { - ConcreteDataType::Timestamp(TimestampType::Microsecond( - TimestampMicrosecondType::default(), - )) - } - - pub fn timestamp_nanosecond_datatype() -> Self { - ConcreteDataType::Timestamp(TimestampType::Nanosecond(TimestampNanosecondType::default())) - } - - pub fn timestamp_datatype(unit: TimeUnit) -> Self { - match unit { - TimeUnit::Second => Self::timestamp_second_datatype(), - TimeUnit::Millisecond => Self::timestamp_millisecond_datatype(), - TimeUnit::Microsecond => Self::timestamp_microsecond_datatype(), - TimeUnit::Nanosecond => Self::timestamp_nanosecond_datatype(), - } - } - - /// Converts from arrow timestamp unit to - pub fn from_arrow_time_unit(t: &ArrowTimeUnit) -> Self { - match t { - ArrowTimeUnit::Second => Self::timestamp_second_datatype(), - ArrowTimeUnit::Millisecond => Self::timestamp_millisecond_datatype(), - ArrowTimeUnit::Microsecond => Self::timestamp_microsecond_datatype(), - ArrowTimeUnit::Nanosecond => Self::timestamp_nanosecond_datatype(), - } - } - - pub fn list_datatype(item_type: ConcreteDataType) -> ConcreteDataType { - ConcreteDataType::List(ListType::new(item_type)) - } -} - -/// Data type abstraction. 
-#[enum_dispatch::enum_dispatch] -pub trait DataType: std::fmt::Debug + Send + Sync { - /// Name of this data type. - fn name(&self) -> &str; - - /// Returns id of the Logical data type. - fn logical_type_id(&self) -> LogicalTypeId; - - /// Returns the default value of this type. - fn default_value(&self) -> Value; - - /// Convert this type as [arrow::datatypes::DataType]. - fn as_arrow_type(&self) -> ArrowDataType; - - /// Creates a mutable vector with given `capacity` of this type. - fn create_mutable_vector(&self, capacity: usize) -> Box; - - /// Returns true if the data type is compatible with timestamp type so we can - /// use it as a timestamp. - fn is_timestamp_compatible(&self) -> bool; -} - -pub type DataTypeRef = Arc; - -#[cfg(test)] -mod tests { - use arrow::datatypes::Field; - - use super::*; - - #[test] - fn test_concrete_type_as_datatype_trait() { - let concrete_type = ConcreteDataType::boolean_datatype(); - - assert_eq!("Boolean", concrete_type.name()); - assert_eq!(Value::Boolean(false), concrete_type.default_value()); - assert_eq!(LogicalTypeId::Boolean, concrete_type.logical_type_id()); - assert_eq!(ArrowDataType::Boolean, concrete_type.as_arrow_type()); - } - - #[test] - fn test_from_arrow_type() { - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::Null), - ConcreteDataType::Null(_) - )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::Boolean), - ConcreteDataType::Boolean(_) - )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::Binary), - ConcreteDataType::Binary(_) - )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::LargeBinary), - ConcreteDataType::Binary(_) - )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::Int8), - ConcreteDataType::Int8(_) - )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::Int16), - ConcreteDataType::Int16(_) - )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::Int32), - ConcreteDataType::Int32(_) - )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::Int64), - ConcreteDataType::Int64(_) - )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::UInt8), - ConcreteDataType::UInt8(_) - )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::UInt16), - ConcreteDataType::UInt16(_) - )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::UInt32), - ConcreteDataType::UInt32(_) - )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::UInt64), - ConcreteDataType::UInt64(_) - )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::Float32), - ConcreteDataType::Float32(_) - )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::Float64), - ConcreteDataType::Float64(_) - )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::Utf8), - ConcreteDataType::String(_) - )); - assert_eq!( - ConcreteDataType::from_arrow_type(&ArrowDataType::List(Box::new(Field::new( - "item", - ArrowDataType::Int32, - true, - )))), - ConcreteDataType::List(ListType::new(ConcreteDataType::int32_datatype())) - ); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::Date32), - ConcreteDataType::Date(_) - )); - } - - #[test] - fn test_from_arrow_timestamp() { - assert_eq!( - ConcreteDataType::timestamp_millisecond_datatype(), - ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Millisecond) - ); - assert_eq!( - 
ConcreteDataType::timestamp_microsecond_datatype(), - ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Microsecond) - ); - assert_eq!( - ConcreteDataType::timestamp_nanosecond_datatype(), - ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Nanosecond) - ); - assert_eq!( - ConcreteDataType::timestamp_second_datatype(), - ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Second) - ); - } - - #[test] - fn test_is_timestamp_compatible() { - assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Second).is_timestamp_compatible()); - assert!( - ConcreteDataType::timestamp_datatype(TimeUnit::Millisecond).is_timestamp_compatible() - ); - assert!( - ConcreteDataType::timestamp_datatype(TimeUnit::Microsecond).is_timestamp_compatible() - ); - assert!( - ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond).is_timestamp_compatible() - ); - assert!(ConcreteDataType::timestamp_second_datatype().is_timestamp_compatible()); - assert!(ConcreteDataType::timestamp_millisecond_datatype().is_timestamp_compatible()); - assert!(ConcreteDataType::timestamp_microsecond_datatype().is_timestamp_compatible()); - assert!(ConcreteDataType::timestamp_nanosecond_datatype().is_timestamp_compatible()); - assert!(ConcreteDataType::int64_datatype().is_timestamp_compatible()); - assert!(!ConcreteDataType::null_datatype().is_timestamp_compatible()); - assert!(!ConcreteDataType::binary_datatype().is_timestamp_compatible()); - assert!(!ConcreteDataType::boolean_datatype().is_timestamp_compatible()); - assert!(!ConcreteDataType::date_datatype().is_timestamp_compatible()); - assert!(!ConcreteDataType::datetime_datatype().is_timestamp_compatible()); - assert!(!ConcreteDataType::string_datatype().is_timestamp_compatible()); - assert!(!ConcreteDataType::int32_datatype().is_timestamp_compatible()); - assert!(!ConcreteDataType::uint64_datatype().is_timestamp_compatible()); - } - - #[test] - fn test_is_null() { - assert!(ConcreteDataType::null_datatype().is_null()); - assert!(!ConcreteDataType::int32_datatype().is_null()); - } - - #[test] - fn test_is_float() { - assert!(!ConcreteDataType::int32_datatype().is_float()); - assert!(ConcreteDataType::float32_datatype().is_float()); - assert!(ConcreteDataType::float64_datatype().is_float()); - } - - #[test] - fn test_is_boolean() { - assert!(!ConcreteDataType::int32_datatype().is_boolean()); - assert!(!ConcreteDataType::float32_datatype().is_boolean()); - assert!(ConcreteDataType::boolean_datatype().is_boolean()); - } - - #[test] - fn test_is_stringifiable() { - assert!(!ConcreteDataType::int32_datatype().is_stringifiable()); - assert!(!ConcreteDataType::float32_datatype().is_stringifiable()); - assert!(ConcreteDataType::string_datatype().is_stringifiable()); - assert!(ConcreteDataType::date_datatype().is_stringifiable()); - assert!(ConcreteDataType::datetime_datatype().is_stringifiable()); - assert!(ConcreteDataType::timestamp_second_datatype().is_stringifiable()); - assert!(ConcreteDataType::timestamp_millisecond_datatype().is_stringifiable()); - assert!(ConcreteDataType::timestamp_microsecond_datatype().is_stringifiable()); - assert!(ConcreteDataType::timestamp_nanosecond_datatype().is_stringifiable()); - } - - #[test] - fn test_is_signed() { - assert!(ConcreteDataType::int8_datatype().is_signed()); - assert!(ConcreteDataType::int16_datatype().is_signed()); - assert!(ConcreteDataType::int32_datatype().is_signed()); - assert!(ConcreteDataType::int64_datatype().is_signed()); - assert!(ConcreteDataType::date_datatype().is_signed()); - 
assert!(ConcreteDataType::datetime_datatype().is_signed()); - assert!(ConcreteDataType::timestamp_second_datatype().is_signed()); - assert!(ConcreteDataType::timestamp_millisecond_datatype().is_signed()); - assert!(ConcreteDataType::timestamp_microsecond_datatype().is_signed()); - assert!(ConcreteDataType::timestamp_nanosecond_datatype().is_signed()); - - assert!(!ConcreteDataType::uint8_datatype().is_signed()); - assert!(!ConcreteDataType::uint16_datatype().is_signed()); - assert!(!ConcreteDataType::uint32_datatype().is_signed()); - assert!(!ConcreteDataType::uint64_datatype().is_signed()); - - assert!(!ConcreteDataType::float32_datatype().is_signed()); - assert!(!ConcreteDataType::float64_datatype().is_signed()); - } - - #[test] - fn test_is_unsigned() { - assert!(!ConcreteDataType::int8_datatype().is_unsigned()); - assert!(!ConcreteDataType::int16_datatype().is_unsigned()); - assert!(!ConcreteDataType::int32_datatype().is_unsigned()); - assert!(!ConcreteDataType::int64_datatype().is_unsigned()); - assert!(!ConcreteDataType::date_datatype().is_unsigned()); - assert!(!ConcreteDataType::datetime_datatype().is_unsigned()); - assert!(!ConcreteDataType::timestamp_second_datatype().is_unsigned()); - assert!(!ConcreteDataType::timestamp_millisecond_datatype().is_unsigned()); - assert!(!ConcreteDataType::timestamp_microsecond_datatype().is_unsigned()); - assert!(!ConcreteDataType::timestamp_nanosecond_datatype().is_unsigned()); - - assert!(ConcreteDataType::uint8_datatype().is_unsigned()); - assert!(ConcreteDataType::uint16_datatype().is_unsigned()); - assert!(ConcreteDataType::uint32_datatype().is_unsigned()); - assert!(ConcreteDataType::uint64_datatype().is_unsigned()); - - assert!(!ConcreteDataType::float32_datatype().is_unsigned()); - assert!(!ConcreteDataType::float64_datatype().is_unsigned()); - } - - #[test] - fn test_numerics() { - let nums = ConcreteDataType::numerics(); - assert_eq!(10, nums.len()); - } -} diff --git a/src/datatypes2/src/error.rs b/src/datatypes2/src/error.rs deleted file mode 100644 index 50b49cf2b4bb..000000000000 --- a/src/datatypes2/src/error.rs +++ /dev/null @@ -1,144 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::any::Any; - -use common_error::prelude::{ErrorCompat, ErrorExt, Snafu, StatusCode}; -use snafu::Backtrace; - -#[derive(Debug, Snafu)] -#[snafu(visibility(pub))] -pub enum Error { - #[snafu(display("Failed to serialize data, source: {}", source))] - Serialize { - source: serde_json::Error, - backtrace: Backtrace, - }, - - #[snafu(display("Failed to deserialize data, source: {}, json: {}", source, json))] - Deserialize { - source: serde_json::Error, - backtrace: Backtrace, - json: String, - }, - - #[snafu(display("Failed to convert datafusion type: {}", from))] - Conversion { from: String, backtrace: Backtrace }, - - #[snafu(display("Bad array access, Index out of bounds: {}, size: {}", index, size))] - BadArrayAccess { - index: usize, - size: usize, - backtrace: Backtrace, - }, - - #[snafu(display("Unknown vector, {}", msg))] - UnknownVector { msg: String, backtrace: Backtrace }, - - #[snafu(display("Unsupported arrow data type, type: {:?}", arrow_type))] - UnsupportedArrowType { - arrow_type: arrow::datatypes::DataType, - backtrace: Backtrace, - }, - - #[snafu(display("Timestamp column {} not found", name,))] - TimestampNotFound { name: String, backtrace: Backtrace }, - - #[snafu(display( - "Failed to parse version in schema meta, value: {}, source: {}", - value, - source - ))] - ParseSchemaVersion { - value: String, - source: std::num::ParseIntError, - backtrace: Backtrace, - }, - - #[snafu(display("Invalid timestamp index: {}", index))] - InvalidTimestampIndex { index: usize, backtrace: Backtrace }, - - #[snafu(display("Duplicate timestamp index, exists: {}, new: {}", exists, new))] - DuplicateTimestampIndex { - exists: usize, - new: usize, - backtrace: Backtrace, - }, - - #[snafu(display("{}", msg))] - CastType { msg: String, backtrace: Backtrace }, - - #[snafu(display("Arrow failed to compute, source: {}", source))] - ArrowCompute { - source: arrow::error::ArrowError, - backtrace: Backtrace, - }, - - #[snafu(display("Unsupported column default constraint expression: {}", expr))] - UnsupportedDefaultExpr { expr: String, backtrace: Backtrace }, - - #[snafu(display("Default value should not be null for non null column"))] - NullDefault { backtrace: Backtrace }, - - #[snafu(display("Incompatible default value type, reason: {}", reason))] - DefaultValueType { - reason: String, - backtrace: Backtrace, - }, - - #[snafu(display("Duplicated metadata for {}", key))] - DuplicateMeta { key: String, backtrace: Backtrace }, -} - -impl ErrorExt for Error { - fn status_code(&self) -> StatusCode { - // Inner encoding and decoding error should not be exposed to users. 
- StatusCode::Internal - } - - fn backtrace_opt(&self) -> Option<&Backtrace> { - ErrorCompat::backtrace(self) - } - - fn as_any(&self) -> &dyn Any { - self - } -} - -pub type Result = std::result::Result; - -#[cfg(test)] -mod tests { - use std::collections::HashMap; - - use snafu::ResultExt; - - use super::*; - - #[test] - pub fn test_error() { - let mut map = HashMap::new(); - map.insert(true, 1); - map.insert(false, 2); - - let result = serde_json::to_string(&map).context(SerializeSnafu); - assert!(result.is_err(), "serialize result is: {:?}", result); - let err = serde_json::to_string(&map) - .context(SerializeSnafu) - .err() - .unwrap(); - assert!(err.backtrace_opt().is_some()); - assert_eq!(StatusCode::Internal, err.status_code()); - } -} diff --git a/src/datatypes2/src/lib.rs b/src/datatypes2/src/lib.rs deleted file mode 100644 index 256d347eacb0..000000000000 --- a/src/datatypes2/src/lib.rs +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#![feature(generic_associated_types)] -#![feature(assert_matches)] - -pub mod arrow_array; -pub mod data_type; -pub mod error; -pub mod macros; -pub mod prelude; -mod scalars; -pub mod schema; -pub mod serialize; -mod timestamp; -pub mod type_id; -pub mod types; -pub mod value; -pub mod vectors; - -pub use arrow; -pub use error::{Error, Result}; diff --git a/src/datatypes2/src/macros.rs b/src/datatypes2/src/macros.rs deleted file mode 100644 index 37c0a42e3f55..000000000000 --- a/src/datatypes2/src/macros.rs +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! Some helper macros for datatypes, copied from databend. - -/// Apply the macro rules to all primitive types. -#[macro_export] -macro_rules! for_all_primitive_types { - ($macro:tt $(, $x:tt)*) => { - $macro! { - [$($x),*], - { i8 }, - { i16 }, - { i32 }, - { i64 }, - { u8 }, - { u16 }, - { u32 }, - { u64 }, - { f32 }, - { f64 } - } - }; -} - -/// Match the logical type and apply `$body` to all primitive types and -/// `nbody` to other types. -#[macro_export] -macro_rules! with_match_primitive_type_id { - ($key_type:expr, | $_:tt $T:ident | $body:tt, $nbody:tt) => {{ - macro_rules! 
__with_ty__ { - ( $_ $T:ident ) => { - $body - }; - } - - use $crate::type_id::LogicalTypeId; - use $crate::types::{ - Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, - UInt32Type, UInt64Type, UInt8Type, - }; - match $key_type { - LogicalTypeId::Int8 => __with_ty__! { Int8Type }, - LogicalTypeId::Int16 => __with_ty__! { Int16Type }, - LogicalTypeId::Int32 => __with_ty__! { Int32Type }, - LogicalTypeId::Int64 => __with_ty__! { Int64Type }, - LogicalTypeId::UInt8 => __with_ty__! { UInt8Type }, - LogicalTypeId::UInt16 => __with_ty__! { UInt16Type }, - LogicalTypeId::UInt32 => __with_ty__! { UInt32Type }, - LogicalTypeId::UInt64 => __with_ty__! { UInt64Type }, - LogicalTypeId::Float32 => __with_ty__! { Float32Type }, - LogicalTypeId::Float64 => __with_ty__! { Float64Type }, - - _ => $nbody, - } - }}; -} diff --git a/src/datatypes2/src/prelude.rs b/src/datatypes2/src/prelude.rs deleted file mode 100644 index f6bd298316db..000000000000 --- a/src/datatypes2/src/prelude.rs +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -pub use crate::data_type::{ConcreteDataType, DataType, DataTypeRef}; -pub use crate::macros::*; -pub use crate::scalars::{Scalar, ScalarRef, ScalarVector, ScalarVectorBuilder}; -pub use crate::type_id::LogicalTypeId; -pub use crate::value::{Value, ValueRef}; -pub use crate::vectors::{MutableVector, Validity, Vector, VectorRef}; diff --git a/src/datatypes2/src/scalars.rs b/src/datatypes2/src/scalars.rs deleted file mode 100644 index 327ebaa629a2..000000000000 --- a/src/datatypes2/src/scalars.rs +++ /dev/null @@ -1,443 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::any::Any; - -use common_time::{Date, DateTime}; - -use crate::types::{ - Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, UInt32Type, - UInt64Type, UInt8Type, -}; -use crate::value::{ListValue, ListValueRef, Value}; -use crate::vectors::{ - BinaryVector, BooleanVector, DateTimeVector, DateVector, ListVector, MutableVector, - PrimitiveVector, StringVector, Vector, -}; - -fn get_iter_capacity>(iter: &I) -> usize { - match iter.size_hint() { - (_lower, Some(upper)) => upper, - (0, None) => 1024, - (lower, None) => lower, - } -} - -/// Owned scalar value -/// primitive types, bool, Vec ... 
-pub trait Scalar: 'static + Sized + Default + Any
-where
-    for<'a> Self::VectorType: ScalarVector<RefItem<'a> = Self::RefType<'a>>,
-{
-    type VectorType: ScalarVector<OwnedItem = Self>;
-    type RefType<'a>: ScalarRef<'a, ScalarType = Self>
-    where
-        Self: 'a;
-    /// Get a reference of the current value.
-    fn as_scalar_ref(&self) -> Self::RefType<'_>;
-
-    /// Upcast GAT type's lifetime.
-    fn upcast_gat<'short, 'long: 'short>(long: Self::RefType<'long>) -> Self::RefType<'short>;
-}
-
-pub trait ScalarRef<'a>: std::fmt::Debug + Clone + Copy + Send + 'a {
-    /// The corresponding [`Scalar`] type.
-    type ScalarType: Scalar<RefType<'a> = Self>;
-
-    /// Convert the reference into an owned value.
-    fn to_owned_scalar(&self) -> Self::ScalarType;
-}
-
-/// A sub trait of Vector to add scalar operation support.
-// This implementation refers to Databend's [ScalarColumn](https://github.com/datafuselabs/databend/blob/main/common/datavalues/src/scalars/type_.rs)
-// and skyzh's [type-exercise-in-rust](https://github.com/skyzh/type-exercise-in-rust).
-pub trait ScalarVector: Vector + Send + Sync + Sized + 'static
-where
-    for<'a> Self::OwnedItem: Scalar<RefType<'a> = Self::RefItem<'a>>,
-{
-    type OwnedItem: Scalar<VectorType = Self>;
-    /// The reference item of this vector.
-    type RefItem<'a>: ScalarRef<'a, ScalarType = Self::OwnedItem>
-    where
-        Self: 'a;
-
-    /// Iterator type of this vector.
-    type Iter<'a>: Iterator<Item = Option<Self::RefItem<'a>>>
-    where
-        Self: 'a;
-
-    /// Builder type to build this vector.
-    type Builder: ScalarVectorBuilder<VectorType = Self>;
-
-    /// Returns the reference to an element at given position.
-    ///
-    /// Note: `get()` has bad performance, avoid calling this function inside a loop.
-    ///
-    /// # Panics
-    /// Panics if `idx >= self.len()`.
-    fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>>;
-
-    /// Returns iterator of current vector.
-    fn iter_data(&self) -> Self::Iter<'_>;
-
-    fn from_slice(data: &[Self::RefItem<'_>]) -> Self {
-        let mut builder = Self::Builder::with_capacity(data.len());
-        for item in data {
-            builder.push(Some(*item));
-        }
-        builder.finish()
-    }
-
-    fn from_iterator<'a>(it: impl Iterator<Item = Self::RefItem<'a>>) -> Self {
-        let mut builder = Self::Builder::with_capacity(get_iter_capacity(&it));
-        for item in it {
-            builder.push(Some(item));
-        }
-        builder.finish()
-    }
-
-    fn from_owned_iterator(it: impl Iterator<Item = Option<Self::OwnedItem>>) -> Self {
-        let mut builder = Self::Builder::with_capacity(get_iter_capacity(&it));
-        for item in it {
-            match item {
-                Some(item) => builder.push(Some(item.as_scalar_ref())),
-                None => builder.push(None),
-            }
-        }
-        builder.finish()
-    }
-
-    fn from_vec<I: Into<Self::OwnedItem>>(values: Vec<I>) -> Self {
-        let it = values.into_iter();
-        let mut builder = Self::Builder::with_capacity(get_iter_capacity(&it));
-        for item in it {
-            builder.push(Some(item.into().as_scalar_ref()));
-        }
-        builder.finish()
-    }
-}
-
-/// A trait over all vector builders.
-pub trait ScalarVectorBuilder: MutableVector {
-    type VectorType: ScalarVector<Builder = Self>;
-
-    /// Create a new builder with initial `capacity`.
-    fn with_capacity(capacity: usize) -> Self;
-
-    /// Push a value into the builder.
-    fn push(&mut self, value: Option<<Self::VectorType as ScalarVector>::RefItem<'_>>);
-
-    /// Finish build and return a new vector.
-    fn finish(&mut self) -> Self::VectorType;
-}
-
-macro_rules!
impl_scalar_for_native { - ($Native: ident, $DataType: ident) => { - impl Scalar for $Native { - type VectorType = PrimitiveVector<$DataType>; - type RefType<'a> = $Native; - - #[inline] - fn as_scalar_ref(&self) -> $Native { - *self - } - - #[allow(clippy::needless_lifetimes)] - #[inline] - fn upcast_gat<'short, 'long: 'short>(long: $Native) -> $Native { - long - } - } - - /// Implement [`ScalarRef`] for primitive types. Note that primitive types are both [`Scalar`] and [`ScalarRef`]. - impl<'a> ScalarRef<'a> for $Native { - type ScalarType = $Native; - - #[inline] - fn to_owned_scalar(&self) -> $Native { - *self - } - } - }; -} - -impl_scalar_for_native!(u8, UInt8Type); -impl_scalar_for_native!(u16, UInt16Type); -impl_scalar_for_native!(u32, UInt32Type); -impl_scalar_for_native!(u64, UInt64Type); -impl_scalar_for_native!(i8, Int8Type); -impl_scalar_for_native!(i16, Int16Type); -impl_scalar_for_native!(i32, Int32Type); -impl_scalar_for_native!(i64, Int64Type); -impl_scalar_for_native!(f32, Float32Type); -impl_scalar_for_native!(f64, Float64Type); - -impl Scalar for bool { - type VectorType = BooleanVector; - type RefType<'a> = bool; - - #[inline] - fn as_scalar_ref(&self) -> bool { - *self - } - - #[allow(clippy::needless_lifetimes)] - #[inline] - fn upcast_gat<'short, 'long: 'short>(long: bool) -> bool { - long - } -} - -impl<'a> ScalarRef<'a> for bool { - type ScalarType = bool; - - #[inline] - fn to_owned_scalar(&self) -> bool { - *self - } -} - -impl Scalar for String { - type VectorType = StringVector; - type RefType<'a> = &'a str; - - #[inline] - fn as_scalar_ref(&self) -> &str { - self - } - - #[inline] - fn upcast_gat<'short, 'long: 'short>(long: &'long str) -> &'short str { - long - } -} - -impl<'a> ScalarRef<'a> for &'a str { - type ScalarType = String; - - #[inline] - fn to_owned_scalar(&self) -> String { - self.to_string() - } -} - -impl Scalar for Vec { - type VectorType = BinaryVector; - type RefType<'a> = &'a [u8]; - - #[inline] - fn as_scalar_ref(&self) -> &[u8] { - self - } - - #[inline] - fn upcast_gat<'short, 'long: 'short>(long: &'long [u8]) -> &'short [u8] { - long - } -} - -impl<'a> ScalarRef<'a> for &'a [u8] { - type ScalarType = Vec; - - #[inline] - fn to_owned_scalar(&self) -> Vec { - self.to_vec() - } -} - -impl Scalar for Date { - type VectorType = DateVector; - type RefType<'a> = Date; - - fn as_scalar_ref(&self) -> Self::RefType<'_> { - *self - } - - fn upcast_gat<'short, 'long: 'short>(long: Self::RefType<'long>) -> Self::RefType<'short> { - long - } -} - -impl<'a> ScalarRef<'a> for Date { - type ScalarType = Date; - - fn to_owned_scalar(&self) -> Self::ScalarType { - *self - } -} - -impl Scalar for DateTime { - type VectorType = DateTimeVector; - type RefType<'a> = DateTime; - - fn as_scalar_ref(&self) -> Self::RefType<'_> { - *self - } - - fn upcast_gat<'short, 'long: 'short>(long: Self::RefType<'long>) -> Self::RefType<'short> { - long - } -} - -impl<'a> ScalarRef<'a> for DateTime { - type ScalarType = DateTime; - - fn to_owned_scalar(&self) -> Self::ScalarType { - *self - } -} - -// Timestamp types implement Scalar and ScalarRef in `src/timestamp.rs`. 
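// [Editor's sketch, not part of the original file] A minimal illustration of
// how the Scalar / ScalarRef / ScalarVector triad above fits together: an
// owned value lends out its `RefType`, a reference converts back to its owned
// `Scalar`, and a `ScalarVector` is built from reference items. Assumes the
// `StringVector` impls declared elsewhere in this crate.
fn _scalar_round_trip_sketch() {
    // `String` is a `Scalar` whose `RefType<'a>` is `&'a str`.
    let owned: String = "hello".to_string();
    let borrowed: &str = owned.as_scalar_ref();
    assert_eq!(owned, borrowed.to_owned_scalar());

    // Build a `ScalarVector` from reference items and read one back as
    // `Option<&str>` without cloning.
    let vector = StringVector::from_slice(&["a", "b"]);
    assert_eq!(Some("a"), vector.get_data(0));
}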
- -impl Scalar for ListValue { - type VectorType = ListVector; - type RefType<'a> = ListValueRef<'a>; - - fn as_scalar_ref(&self) -> Self::RefType<'_> { - ListValueRef::Ref { val: self } - } - - fn upcast_gat<'short, 'long: 'short>(long: Self::RefType<'long>) -> Self::RefType<'short> { - long - } -} - -impl<'a> ScalarRef<'a> for ListValueRef<'a> { - type ScalarType = ListValue; - - fn to_owned_scalar(&self) -> Self::ScalarType { - match self { - ListValueRef::Indexed { vector, idx } => match vector.get(*idx) { - // Normally should not get `Value::Null` if the `ListValueRef` comes - // from the iterator of the ListVector, but we avoid panic and just - // returns a default list value in such case since `ListValueRef` may - // be constructed manually. - Value::Null => ListValue::default(), - Value::List(v) => v, - _ => unreachable!(), - }, - ListValueRef::Ref { val } => (*val).clone(), - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::data_type::ConcreteDataType; - use crate::timestamp::TimestampSecond; - use crate::vectors::{BinaryVector, Int32Vector, ListVectorBuilder, TimestampSecondVector}; - - fn build_vector_from_slice(items: &[Option>]) -> T { - let mut builder = T::Builder::with_capacity(items.len()); - for item in items { - builder.push(*item); - } - builder.finish() - } - - fn assert_vector_eq<'a, T: ScalarVector>(expect: &[Option>], vector: &'a T) - where - T::RefItem<'a>: PartialEq + std::fmt::Debug, - { - for (a, b) in expect.iter().zip(vector.iter_data()) { - assert_eq!(*a, b); - } - } - - #[test] - fn test_build_i32_vector() { - let expect = vec![Some(1), Some(2), Some(3), None, Some(5)]; - let vector: Int32Vector = build_vector_from_slice(&expect); - assert_vector_eq(&expect, &vector); - } - - #[test] - fn test_build_binary_vector() { - let expect: Vec> = vec![ - Some(b"a"), - Some(b"b"), - Some(b"c"), - None, - Some(b"e"), - Some(b""), - ]; - let vector: BinaryVector = build_vector_from_slice(&expect); - assert_vector_eq(&expect, &vector); - } - - #[test] - fn test_build_date_vector() { - let expect: Vec> = vec![ - Some(Date::new(0)), - Some(Date::new(-1)), - None, - Some(Date::new(1)), - ]; - let vector: DateVector = build_vector_from_slice(&expect); - assert_vector_eq(&expect, &vector); - } - - #[test] - fn test_date_scalar() { - let date = Date::new(1); - assert_eq!(date, date.as_scalar_ref()); - assert_eq!(date, date.to_owned_scalar()); - } - - #[test] - fn test_datetime_scalar() { - let dt = DateTime::new(123); - assert_eq!(dt, dt.as_scalar_ref()); - assert_eq!(dt, dt.to_owned_scalar()); - } - - #[test] - fn test_list_value_scalar() { - let list_value = ListValue::new( - Some(Box::new(vec![Value::Int32(123)])), - ConcreteDataType::int32_datatype(), - ); - let list_ref = ListValueRef::Ref { val: &list_value }; - assert_eq!(list_ref, list_value.as_scalar_ref()); - assert_eq!(list_value, list_ref.to_owned_scalar()); - - let mut builder = - ListVectorBuilder::with_type_capacity(ConcreteDataType::int32_datatype(), 1); - builder.push(None); - builder.push(Some(list_value.as_scalar_ref())); - let vector = builder.finish(); - - let ref_on_vec = ListValueRef::Indexed { - vector: &vector, - idx: 0, - }; - assert_eq!(ListValue::default(), ref_on_vec.to_owned_scalar()); - let ref_on_vec = ListValueRef::Indexed { - vector: &vector, - idx: 1, - }; - assert_eq!(list_value, ref_on_vec.to_owned_scalar()); - } - - #[test] - fn test_build_timestamp_vector() { - let expect: Vec> = vec![Some(10.into()), None, Some(42.into())]; - let vector: TimestampSecondVector = 
build_vector_from_slice(&expect);
-        assert_vector_eq(&expect, &vector);
-        let val = vector.get_data(0).unwrap();
-        assert_eq!(val, val.as_scalar_ref());
-        assert_eq!(TimestampSecond::from(10), val.to_owned_scalar());
-    }
-}
diff --git a/src/datatypes2/src/schema.rs b/src/datatypes2/src/schema.rs
deleted file mode 100644
index 328fe0de24dc..000000000000
--- a/src/datatypes2/src/schema.rs
+++ /dev/null
@@ -1,430 +0,0 @@
-// Copyright 2022 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-mod column_schema;
-mod constraint;
-mod raw;
-
-use std::collections::HashMap;
-use std::sync::Arc;
-
-use arrow::datatypes::{Field, Schema as ArrowSchema};
-use snafu::{ensure, ResultExt};
-
-use crate::data_type::DataType;
-use crate::error::{self, Error, Result};
-pub use crate::schema::column_schema::{ColumnSchema, Metadata};
-pub use crate::schema::constraint::ColumnDefaultConstraint;
-pub use crate::schema::raw::RawSchema;
-
-/// Key used to store version number of the schema in metadata.
-const VERSION_KEY: &str = "greptime:version";
-
-/// A common schema, should be immutable.
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub struct Schema {
-    column_schemas: Vec<ColumnSchema>,
-    name_to_index: HashMap<String, usize>,
-    arrow_schema: Arc<ArrowSchema>,
-    /// Index of the timestamp key column.
-    ///
-    /// Timestamp key column is the column that holds the timestamp and forms part of
-    /// the primary key. None means there is no timestamp key column.
-    timestamp_index: Option<usize>,
-    /// Version of the schema.
-    ///
-    /// Initial value is zero. The version should bump after altering schema.
-    version: u32,
-}
-
-impl Schema {
-    /// Initial version of the schema.
-    pub const INITIAL_VERSION: u32 = 0;
-
-    /// Create a schema from a vector of [ColumnSchema].
-    ///
-    /// # Panics
-    /// Panics when ColumnSchema's `default_constraint` can't be serialized into json.
-    pub fn new(column_schemas: Vec<ColumnSchema>) -> Schema {
-        // Builder won't fail in this case
-        SchemaBuilder::try_from(column_schemas)
-            .unwrap()
-            .build()
-            .unwrap()
-    }
-
-    /// Try to create a schema from a vector of [ColumnSchema].
-    pub fn try_new(column_schemas: Vec<ColumnSchema>) -> Result<Schema> {
-        SchemaBuilder::try_from(column_schemas)?.build()
-    }
-
-    #[inline]
-    pub fn arrow_schema(&self) -> &Arc<ArrowSchema> {
-        &self.arrow_schema
-    }
-
-    #[inline]
-    pub fn column_schemas(&self) -> &[ColumnSchema] {
-        &self.column_schemas
-    }
-
-    pub fn column_schema_by_name(&self, name: &str) -> Option<&ColumnSchema> {
-        self.name_to_index
-            .get(name)
-            .map(|index| &self.column_schemas[*index])
-    }
-
-    /// Retrieve the column's name by index.
-    /// # Panics
-    /// This method **may** panic if the index is out of range of column schemas.
-    #[inline]
-    pub fn column_name_by_index(&self, idx: usize) -> &str {
-        &self.column_schemas[idx].name
-    }
-
-    #[inline]
-    pub fn column_index_by_name(&self, name: &str) -> Option<usize> {
-        self.name_to_index.get(name).copied()
-    }
-
-    #[inline]
-    pub fn contains_column(&self, name: &str) -> bool {
-        self.name_to_index.contains_key(name)
-    }
-
-    #[inline]
-    pub fn num_columns(&self) -> usize {
-        self.column_schemas.len()
-    }
-
-    #[inline]
-    pub fn is_empty(&self) -> bool {
-        self.column_schemas.is_empty()
-    }
-
-    /// Returns index of the timestamp key column.
-    #[inline]
-    pub fn timestamp_index(&self) -> Option<usize> {
-        self.timestamp_index
-    }
-
-    #[inline]
-    pub fn timestamp_column(&self) -> Option<&ColumnSchema> {
-        self.timestamp_index.map(|idx| &self.column_schemas[idx])
-    }
-
-    #[inline]
-    pub fn version(&self) -> u32 {
-        self.version
-    }
-
-    #[inline]
-    pub fn metadata(&self) -> &HashMap<String, String> {
-        &self.arrow_schema.metadata
-    }
-}
-
-#[derive(Default)]
-pub struct SchemaBuilder {
-    column_schemas: Vec<ColumnSchema>,
-    name_to_index: HashMap<String, usize>,
-    fields: Vec<Field>,
-    timestamp_index: Option<usize>,
-    version: u32,
-    metadata: HashMap<String, String>,
-}
-
-impl TryFrom<Vec<ColumnSchema>> for SchemaBuilder {
-    type Error = Error;
-
-    fn try_from(column_schemas: Vec<ColumnSchema>) -> Result<SchemaBuilder> {
-        SchemaBuilder::try_from_columns(column_schemas)
-    }
-}
-
-impl SchemaBuilder {
-    pub fn try_from_columns(column_schemas: Vec<ColumnSchema>) -> Result<SchemaBuilder> {
-        let FieldsAndIndices {
-            fields,
-            name_to_index,
-            timestamp_index,
-        } = collect_fields(&column_schemas)?;
-
-        Ok(Self {
-            column_schemas,
-            name_to_index,
-            fields,
-            timestamp_index,
-            ..Default::default()
-        })
-    }
-
-    pub fn version(mut self, version: u32) -> Self {
-        self.version = version;
-        self
-    }
-
-    /// Add key value pair to metadata.
-    ///
-    /// Old metadata with same key would be overwritten.
-    pub fn add_metadata(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
-        self.metadata.insert(key.into(), value.into());
-        self
-    }
-
-    pub fn build(mut self) -> Result<Schema> {
-        if let Some(timestamp_index) = self.timestamp_index {
-            validate_timestamp_index(&self.column_schemas, timestamp_index)?;
-        }
-
-        self.metadata
-            .insert(VERSION_KEY.to_string(), self.version.to_string());
-
-        let arrow_schema = ArrowSchema::new(self.fields).with_metadata(self.metadata);
-
-        Ok(Schema {
-            column_schemas: self.column_schemas,
-            name_to_index: self.name_to_index,
-            arrow_schema: Arc::new(arrow_schema),
-            timestamp_index: self.timestamp_index,
-            version: self.version,
-        })
-    }
-}
-
-struct FieldsAndIndices {
-    fields: Vec<Field>,
-    name_to_index: HashMap<String, usize>,
-    timestamp_index: Option<usize>,
-}
-
-fn collect_fields(column_schemas: &[ColumnSchema]) -> Result<FieldsAndIndices> {
-    let mut fields = Vec::with_capacity(column_schemas.len());
-    let mut name_to_index = HashMap::with_capacity(column_schemas.len());
-    let mut timestamp_index = None;
-    for (index, column_schema) in column_schemas.iter().enumerate() {
-        if column_schema.is_time_index() {
-            ensure!(
-                timestamp_index.is_none(),
-                error::DuplicateTimestampIndexSnafu {
-                    exists: timestamp_index.unwrap(),
-                    new: index,
-                }
-            );
-            timestamp_index = Some(index);
-        }
-        let field = Field::try_from(column_schema)?;
-        fields.push(field);
-        name_to_index.insert(column_schema.name.clone(), index);
-    }
-
-    Ok(FieldsAndIndices {
-        fields,
-        name_to_index,
-        timestamp_index,
-    })
-}
-
-fn validate_timestamp_index(column_schemas: &[ColumnSchema], timestamp_index: usize) -> Result<()> {
-    ensure!(
-        timestamp_index < column_schemas.len(),
-        error::InvalidTimestampIndexSnafu {
-            index: timestamp_index,
-        }
-    );
-
-    let column_schema = &column_schemas[timestamp_index];
-    ensure!(
-        column_schema.data_type.is_timestamp_compatible(),
-        error::InvalidTimestampIndexSnafu {
-            index: timestamp_index,
-        }
-    );
-    ensure!(
-        column_schema.is_time_index(),
-        error::InvalidTimestampIndexSnafu {
-            index: timestamp_index,
-        }
-    );
-
-    Ok(())
-}
-
-pub type SchemaRef = Arc<Schema>;
-
-impl TryFrom<Arc<ArrowSchema>> for Schema {
-    type Error = Error;
-
-    fn try_from(arrow_schema: Arc<ArrowSchema>) -> Result<Schema> {
-        let mut column_schemas = Vec::with_capacity(arrow_schema.fields.len());
-        let mut name_to_index = HashMap::with_capacity(arrow_schema.fields.len());
-        for field in &arrow_schema.fields {
-            let column_schema = ColumnSchema::try_from(field)?;
-            name_to_index.insert(field.name().to_string(), column_schemas.len());
-            column_schemas.push(column_schema);
-        }
-
-        let mut timestamp_index = None;
-        for (index, column_schema) in column_schemas.iter().enumerate() {
-            if column_schema.is_time_index() {
-                validate_timestamp_index(&column_schemas, index)?;
-                ensure!(
-                    timestamp_index.is_none(),
-                    error::DuplicateTimestampIndexSnafu {
-                        exists: timestamp_index.unwrap(),
-                        new: index,
-                    }
-                );
-                timestamp_index = Some(index);
-            }
-        }
-
-        let version = try_parse_version(&arrow_schema.metadata, VERSION_KEY)?;
-
-        Ok(Self {
-            column_schemas,
-            name_to_index,
-            arrow_schema,
-            timestamp_index,
-            version,
-        })
-    }
-}
-
-impl TryFrom<ArrowSchema> for Schema {
-    type Error = Error;
-
-    fn try_from(arrow_schema: ArrowSchema) -> Result<Schema> {
-        let arrow_schema = Arc::new(arrow_schema);

-        Schema::try_from(arrow_schema)
-    }
-}
-
-fn try_parse_version(metadata: &HashMap<String, String>, key: &str) -> Result<u32> {
-    if let Some(value) = metadata.get(key) {
-        let version = value
-            .parse()
-            .context(error::ParseSchemaVersionSnafu { value })?;
-
-        Ok(version)
-    } else {
-        Ok(Schema::INITIAL_VERSION)
-    }
}
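// [Editor's sketch, not part of the original file] Typical `SchemaBuilder`
// usage, mirroring the builder API above; the column names and metadata are
// illustrative only, and `ConcreteDataType` comes from `crate::data_type`.
fn _schema_builder_sketch() -> Result<Schema> {
    use crate::data_type::ConcreteDataType;

    let column_schemas = vec![
        ColumnSchema::new("host", ConcreteDataType::string_datatype(), false),
        ColumnSchema::new(
            "ts",
            ConcreteDataType::timestamp_millisecond_datatype(),
            false,
        )
        .with_time_index(true),
    ];
    // `build` validates the time index and stores the version under the
    // `greptime:version` metadata key.
    SchemaBuilder::try_from(column_schemas)?
        .version(1)
        .add_metadata("comment", "demo schema")
        .build()
}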
- -#[cfg(test)] -mod tests { - use super::*; - use crate::data_type::ConcreteDataType; - - #[test] - fn test_build_empty_schema() { - let schema = SchemaBuilder::default().build().unwrap(); - assert_eq!(0, schema.num_columns()); - assert!(schema.is_empty()); - } - - #[test] - fn test_schema_no_timestamp() { - let column_schemas = vec![ - ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), false), - ColumnSchema::new("col2", ConcreteDataType::float64_datatype(), true), - ]; - let schema = Schema::new(column_schemas.clone()); - - assert_eq!(2, schema.num_columns()); - assert!(!schema.is_empty()); - assert!(schema.timestamp_index().is_none()); - assert!(schema.timestamp_column().is_none()); - assert_eq!(Schema::INITIAL_VERSION, schema.version()); - - for column_schema in &column_schemas { - let found = schema.column_schema_by_name(&column_schema.name).unwrap(); - assert_eq!(column_schema, found); - } - assert!(schema.column_schema_by_name("col3").is_none()); - - let new_schema = Schema::try_from(schema.arrow_schema().clone()).unwrap(); - - assert_eq!(schema, new_schema); - assert_eq!(column_schemas, schema.column_schemas()); - } - - #[test] - fn test_metadata() { - let column_schemas = vec![ColumnSchema::new( - "col1", - ConcreteDataType::int32_datatype(), - false, - )]; - let schema = SchemaBuilder::try_from(column_schemas) - .unwrap() - .add_metadata("k1", "v1") - .build() - .unwrap(); - - assert_eq!("v1", schema.metadata().get("k1").unwrap()); - } - - #[test] - fn test_schema_with_timestamp() { - let column_schemas = vec![ - ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true), - ColumnSchema::new( - "ts", - ConcreteDataType::timestamp_millisecond_datatype(), - false, - ) - .with_time_index(true), - ]; - let schema = SchemaBuilder::try_from(column_schemas.clone()) - .unwrap() - .version(123) - .build() - .unwrap(); - - assert_eq!(1, schema.timestamp_index().unwrap()); - assert_eq!(&column_schemas[1], schema.timestamp_column().unwrap()); - assert_eq!(123, schema.version()); - - let new_schema = Schema::try_from(schema.arrow_schema().clone()).unwrap(); - assert_eq!(1, schema.timestamp_index().unwrap()); - assert_eq!(schema, new_schema); - } - - #[test] - fn test_schema_wrong_timestamp() { - let column_schemas = vec![ - ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true) - .with_time_index(true), - ColumnSchema::new("col2", ConcreteDataType::float64_datatype(), false), - ]; - assert!(SchemaBuilder::try_from(column_schemas) - .unwrap() - .build() - .is_err()); - - let column_schemas = vec![ - ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true), - ColumnSchema::new("col2", ConcreteDataType::float64_datatype(), false) - .with_time_index(true), - ]; - - assert!(SchemaBuilder::try_from(column_schemas) - .unwrap() - .build() - .is_err()); - } -} diff --git a/src/datatypes2/src/schema/constraint.rs b/src/datatypes2/src/schema/constraint.rs deleted file mode 100644 index 4dd3ecc14b7f..000000000000 --- a/src/datatypes2/src/schema/constraint.rs +++ /dev/null @@ -1,306 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::fmt::{Display, Formatter}; -use std::sync::Arc; - -use common_time::util; -use serde::{Deserialize, Serialize}; -use snafu::{ensure, ResultExt}; - -use crate::data_type::{ConcreteDataType, DataType}; -use crate::error::{self, Result}; -use crate::value::Value; -use crate::vectors::{Int64Vector, TimestampMillisecondVector, VectorRef}; - -const CURRENT_TIMESTAMP: &str = "current_timestamp()"; - -/// Column's default constraint. -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub enum ColumnDefaultConstraint { - // A function invocation - // TODO(dennis): we save the function expression here, maybe use a struct in future. - Function(String), - // A value - Value(Value), -} - -impl TryFrom<&[u8]> for ColumnDefaultConstraint { - type Error = error::Error; - - fn try_from(bytes: &[u8]) -> Result { - let json = String::from_utf8_lossy(bytes); - serde_json::from_str(&json).context(error::DeserializeSnafu { json }) - } -} - -impl TryFrom for Vec { - type Error = error::Error; - - fn try_from(value: ColumnDefaultConstraint) -> std::result::Result { - let s = serde_json::to_string(&value).context(error::SerializeSnafu)?; - Ok(s.into_bytes()) - } -} - -impl Display for ColumnDefaultConstraint { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - match self { - ColumnDefaultConstraint::Function(expr) => write!(f, "{}", expr), - ColumnDefaultConstraint::Value(v) => write!(f, "{}", v), - } - } -} - -impl ColumnDefaultConstraint { - /// Returns a default null constraint. - pub fn null_value() -> ColumnDefaultConstraint { - ColumnDefaultConstraint::Value(Value::Null) - } - - /// Check whether the constraint is valid for columns with given `data_type` - /// and `is_nullable` attributes. - pub fn validate(&self, data_type: &ConcreteDataType, is_nullable: bool) -> Result<()> { - ensure!(is_nullable || !self.maybe_null(), error::NullDefaultSnafu); - - match self { - ColumnDefaultConstraint::Function(expr) => { - ensure!( - expr == CURRENT_TIMESTAMP, - error::UnsupportedDefaultExprSnafu { expr } - ); - ensure!( - data_type.is_timestamp_compatible(), - error::DefaultValueTypeSnafu { - reason: "return value of the function must has timestamp type", - } - ); - } - ColumnDefaultConstraint::Value(v) => { - if !v.is_null() { - // Whether the value could be nullable has been checked before, only need - // to check the type compatibility here. - ensure!( - data_type.logical_type_id() == v.logical_type_id(), - error::DefaultValueTypeSnafu { - reason: format!( - "column has type {:?} but default value has type {:?}", - data_type.logical_type_id(), - v.logical_type_id() - ), - } - ); - } - } - } - - Ok(()) - } - - /// Create a vector that contains `num_rows` default values for given `data_type`. - /// - /// If `is_nullable` is `true`, then this method would returns error if the created - /// default value is null. - /// - /// # Panics - /// Panics if `num_rows == 0`. 
- pub fn create_default_vector( - &self, - data_type: &ConcreteDataType, - is_nullable: bool, - num_rows: usize, - ) -> Result { - assert!(num_rows > 0); - - match self { - ColumnDefaultConstraint::Function(expr) => { - // Functions should also ensure its return value is not null when - // is_nullable is true. - match &expr[..] { - // TODO(dennis): we only supports current_timestamp right now, - // it's better to use a expression framework in future. - CURRENT_TIMESTAMP => create_current_timestamp_vector(data_type, num_rows), - _ => error::UnsupportedDefaultExprSnafu { expr }.fail(), - } - } - ColumnDefaultConstraint::Value(v) => { - ensure!(is_nullable || !v.is_null(), error::NullDefaultSnafu); - - // TODO(yingwen): - // 1. For null value, we could use NullVector once it supports custom logical type. - // 2. For non null value, we could use ConstantVector, but it would cause all codes - // attempt to downcast the vector fail if they don't check whether the vector is const - // first. - let mut mutable_vector = data_type.create_mutable_vector(1); - mutable_vector.push_value_ref(v.as_value_ref())?; - let base_vector = mutable_vector.to_vector(); - Ok(base_vector.replicate(&[num_rows])) - } - } - } - - /// Returns true if this constraint might creates NULL. - fn maybe_null(&self) -> bool { - // Once we support more functions, we may return true if given function - // could return null. - matches!(self, ColumnDefaultConstraint::Value(Value::Null)) - } -} - -fn create_current_timestamp_vector( - data_type: &ConcreteDataType, - num_rows: usize, -) -> Result { - // FIXME(yingwen): We should implements cast in VectorOp so we could cast the millisecond vector - // to other data type and avoid this match. - match data_type { - ConcreteDataType::Timestamp(_) => Ok(Arc::new(TimestampMillisecondVector::from_values( - std::iter::repeat(util::current_time_millis()).take(num_rows), - ))), - ConcreteDataType::Int64(_) => Ok(Arc::new(Int64Vector::from_values( - std::iter::repeat(util::current_time_millis()).take(num_rows), - ))), - _ => error::DefaultValueTypeSnafu { - reason: format!( - "Not support to assign current timestamp to {:?} type", - data_type - ), - } - .fail(), - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::error::Error; - use crate::vectors::Int32Vector; - - #[test] - fn test_null_default_constraint() { - let constraint = ColumnDefaultConstraint::null_value(); - assert!(constraint.maybe_null()); - let constraint = ColumnDefaultConstraint::Value(Value::Int32(10)); - assert!(!constraint.maybe_null()); - } - - #[test] - fn test_validate_null_constraint() { - let constraint = ColumnDefaultConstraint::null_value(); - let data_type = ConcreteDataType::int32_datatype(); - constraint.validate(&data_type, false).unwrap_err(); - constraint.validate(&data_type, true).unwrap(); - } - - #[test] - fn test_validate_value_constraint() { - let constraint = ColumnDefaultConstraint::Value(Value::Int32(10)); - let data_type = ConcreteDataType::int32_datatype(); - constraint.validate(&data_type, false).unwrap(); - constraint.validate(&data_type, true).unwrap(); - - constraint - .validate(&ConcreteDataType::uint32_datatype(), true) - .unwrap_err(); - } - - #[test] - fn test_validate_function_constraint() { - let constraint = ColumnDefaultConstraint::Function(CURRENT_TIMESTAMP.to_string()); - constraint - .validate(&ConcreteDataType::timestamp_millisecond_datatype(), false) - .unwrap(); - constraint - .validate(&ConcreteDataType::boolean_datatype(), false) - .unwrap_err(); - - let constraint = 
ColumnDefaultConstraint::Function("hello()".to_string()); - constraint - .validate(&ConcreteDataType::timestamp_millisecond_datatype(), false) - .unwrap_err(); - } - - #[test] - fn test_create_default_vector_by_null() { - let constraint = ColumnDefaultConstraint::null_value(); - let data_type = ConcreteDataType::int32_datatype(); - constraint - .create_default_vector(&data_type, false, 10) - .unwrap_err(); - - let constraint = ColumnDefaultConstraint::null_value(); - let v = constraint - .create_default_vector(&data_type, true, 3) - .unwrap(); - assert_eq!(3, v.len()); - for i in 0..v.len() { - assert_eq!(Value::Null, v.get(i)); - } - } - - #[test] - fn test_create_default_vector_by_value() { - let constraint = ColumnDefaultConstraint::Value(Value::Int32(10)); - let data_type = ConcreteDataType::int32_datatype(); - let v = constraint - .create_default_vector(&data_type, false, 4) - .unwrap(); - let expect: VectorRef = Arc::new(Int32Vector::from_values(vec![10; 4])); - assert_eq!(expect, v); - } - - #[test] - fn test_create_default_vector_by_func() { - let constraint = ColumnDefaultConstraint::Function(CURRENT_TIMESTAMP.to_string()); - // Timestamp type. - let data_type = ConcreteDataType::timestamp_millisecond_datatype(); - let v = constraint - .create_default_vector(&data_type, false, 4) - .unwrap(); - assert_eq!(4, v.len()); - assert!( - matches!(v.get(0), Value::Timestamp(_)), - "v {:?} is not timestamp", - v.get(0) - ); - - // Int64 type. - let data_type = ConcreteDataType::int64_datatype(); - let v = constraint - .create_default_vector(&data_type, false, 4) - .unwrap(); - assert_eq!(4, v.len()); - assert!( - matches!(v.get(0), Value::Int64(_)), - "v {:?} is not timestamp", - v.get(0) - ); - - let constraint = ColumnDefaultConstraint::Function("no".to_string()); - let data_type = ConcreteDataType::timestamp_millisecond_datatype(); - constraint - .create_default_vector(&data_type, false, 4) - .unwrap_err(); - } - - #[test] - fn test_create_by_func_and_invalid_type() { - let constraint = ColumnDefaultConstraint::Function(CURRENT_TIMESTAMP.to_string()); - let data_type = ConcreteDataType::boolean_datatype(); - let err = constraint - .create_default_vector(&data_type, false, 4) - .unwrap_err(); - assert!(matches!(err, Error::DefaultValueType { .. }), "{:?}", err); - } -} diff --git a/src/datatypes2/src/schema/raw.rs b/src/datatypes2/src/schema/raw.rs deleted file mode 100644 index 75f0853b4b74..000000000000 --- a/src/datatypes2/src/schema/raw.rs +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use serde::{Deserialize, Serialize}; - -use crate::error::{Error, Result}; -use crate::schema::{ColumnSchema, Schema, SchemaBuilder}; - -/// Struct used to serialize and deserialize [`Schema`](crate::schema::Schema). -/// -/// This struct only contains necessary data to recover the Schema. 
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct RawSchema { - pub column_schemas: Vec, - pub timestamp_index: Option, - pub version: u32, -} - -impl TryFrom for Schema { - type Error = Error; - - fn try_from(raw: RawSchema) -> Result { - SchemaBuilder::try_from(raw.column_schemas)? - .version(raw.version) - .build() - } -} - -impl From<&Schema> for RawSchema { - fn from(schema: &Schema) -> RawSchema { - RawSchema { - column_schemas: schema.column_schemas.clone(), - timestamp_index: schema.timestamp_index, - version: schema.version, - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::data_type::ConcreteDataType; - - #[test] - fn test_raw_convert() { - let column_schemas = vec![ - ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true), - ColumnSchema::new( - "ts", - ConcreteDataType::timestamp_millisecond_datatype(), - false, - ) - .with_time_index(true), - ]; - let schema = SchemaBuilder::try_from(column_schemas) - .unwrap() - .version(123) - .build() - .unwrap(); - - let raw = RawSchema::from(&schema); - let schema_new = Schema::try_from(raw).unwrap(); - - assert_eq!(schema, schema_new); - } -} diff --git a/src/datatypes2/src/serialize.rs b/src/datatypes2/src/serialize.rs deleted file mode 100644 index 1cbf04cedd79..000000000000 --- a/src/datatypes2/src/serialize.rs +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use crate::error::Result; - -pub trait Serializable: Send + Sync { - /// Serialize a column of value with given type to JSON value - fn serialize_to_json(&self) -> Result>; -} diff --git a/src/datatypes2/src/type_id.rs b/src/datatypes2/src/type_id.rs deleted file mode 100644 index bcb7ea52b129..000000000000 --- a/src/datatypes2/src/type_id.rs +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/// Unique identifier for logical data type. -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub enum LogicalTypeId { - Null, - - // Numeric types: - Boolean, - Int8, - Int16, - Int32, - Int64, - UInt8, - UInt16, - UInt32, - UInt64, - Float32, - Float64, - - // String types: - String, - Binary, - - // Date & Time types: - /// Date representing the elapsed time since UNIX epoch (1970-01-01) - /// in days (32 bits). - Date, - /// Datetime representing the elapsed time since UNIX epoch (1970-01-01) in - /// seconds/milliseconds/microseconds/nanoseconds, determined by precision. 
- DateTime, - - TimestampSecond, - TimestampMillisecond, - TimestampMicrosecond, - TimestampNanosecond, - - List, -} - -impl LogicalTypeId { - /// Create ConcreteDataType based on this id. This method is for test only as it - /// would lost some info. - /// - /// # Panics - /// Panics if data type is not supported. - #[cfg(any(test, feature = "test"))] - pub fn data_type(&self) -> crate::data_type::ConcreteDataType { - use crate::data_type::ConcreteDataType; - - match self { - LogicalTypeId::Null => ConcreteDataType::null_datatype(), - LogicalTypeId::Boolean => ConcreteDataType::boolean_datatype(), - LogicalTypeId::Int8 => ConcreteDataType::int8_datatype(), - LogicalTypeId::Int16 => ConcreteDataType::int16_datatype(), - LogicalTypeId::Int32 => ConcreteDataType::int32_datatype(), - LogicalTypeId::Int64 => ConcreteDataType::int64_datatype(), - LogicalTypeId::UInt8 => ConcreteDataType::uint8_datatype(), - LogicalTypeId::UInt16 => ConcreteDataType::uint16_datatype(), - LogicalTypeId::UInt32 => ConcreteDataType::uint32_datatype(), - LogicalTypeId::UInt64 => ConcreteDataType::uint64_datatype(), - LogicalTypeId::Float32 => ConcreteDataType::float32_datatype(), - LogicalTypeId::Float64 => ConcreteDataType::float64_datatype(), - LogicalTypeId::String => ConcreteDataType::string_datatype(), - LogicalTypeId::Binary => ConcreteDataType::binary_datatype(), - LogicalTypeId::Date => ConcreteDataType::date_datatype(), - LogicalTypeId::DateTime => ConcreteDataType::datetime_datatype(), - LogicalTypeId::TimestampSecond => ConcreteDataType::timestamp_second_datatype(), - LogicalTypeId::TimestampMillisecond => { - ConcreteDataType::timestamp_millisecond_datatype() - } - LogicalTypeId::TimestampMicrosecond => { - ConcreteDataType::timestamp_microsecond_datatype() - } - LogicalTypeId::TimestampNanosecond => ConcreteDataType::timestamp_nanosecond_datatype(), - LogicalTypeId::List => { - ConcreteDataType::list_datatype(ConcreteDataType::null_datatype()) - } - } - } -} diff --git a/src/datatypes2/src/types.rs b/src/datatypes2/src/types.rs deleted file mode 100644 index 186704fdfdb3..000000000000 --- a/src/datatypes2/src/types.rs +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
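// [Editor's sketch, not part of the original sources] The
// `LogicalTypeId::data_type` helper in `type_id.rs` above is essentially the
// inverse of `ConcreteDataType::logical_type_id`, and is typically exercised
// as a round-trip; assumes the `test` feature that gates the method.
fn _logical_type_id_round_trip_sketch() {
    use crate::data_type::DataType;
    use crate::type_id::LogicalTypeId;

    let id = LogicalTypeId::TimestampMillisecond;
    // Construct the concrete type from the id, then map it back.
    let data_type = id.data_type();
    assert_eq!(id, data_type.logical_type_id());
}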
- -mod binary_type; -mod boolean_type; -mod date_type; -mod datetime_type; -mod list_type; -mod null_type; -mod primitive_type; -mod string_type; - -mod timestamp_type; - -pub use binary_type::BinaryType; -pub use boolean_type::BooleanType; -pub use date_type::DateType; -pub use datetime_type::DateTimeType; -pub use list_type::ListType; -pub use null_type::NullType; -pub use primitive_type::{ - Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, LogicalPrimitiveType, - NativeType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, WrapperType, -}; -pub use string_type::StringType; -pub use timestamp_type::*; diff --git a/src/datatypes2/src/types/binary_type.rs b/src/datatypes2/src/types/binary_type.rs deleted file mode 100644 index 0d06724fffb4..000000000000 --- a/src/datatypes2/src/types/binary_type.rs +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::sync::Arc; - -use arrow::datatypes::DataType as ArrowDataType; -use common_base::bytes::StringBytes; -use serde::{Deserialize, Serialize}; - -use crate::data_type::{DataType, DataTypeRef}; -use crate::scalars::ScalarVectorBuilder; -use crate::type_id::LogicalTypeId; -use crate::value::Value; -use crate::vectors::{BinaryVectorBuilder, MutableVector}; - -#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct BinaryType; - -impl BinaryType { - pub fn arc() -> DataTypeRef { - Arc::new(Self) - } -} - -impl DataType for BinaryType { - fn name(&self) -> &str { - "Binary" - } - - fn logical_type_id(&self) -> LogicalTypeId { - LogicalTypeId::Binary - } - - fn default_value(&self) -> Value { - StringBytes::default().into() - } - - fn as_arrow_type(&self) -> ArrowDataType { - ArrowDataType::LargeBinary - } - - fn create_mutable_vector(&self, capacity: usize) -> Box { - Box::new(BinaryVectorBuilder::with_capacity(capacity)) - } - - fn is_timestamp_compatible(&self) -> bool { - false - } -} diff --git a/src/datatypes2/src/types/boolean_type.rs b/src/datatypes2/src/types/boolean_type.rs deleted file mode 100644 index 36d92169eb01..000000000000 --- a/src/datatypes2/src/types/boolean_type.rs +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::sync::Arc; - -use arrow::datatypes::DataType as ArrowDataType; -use serde::{Deserialize, Serialize}; - -use crate::data_type::{DataType, DataTypeRef}; -use crate::scalars::ScalarVectorBuilder; -use crate::type_id::LogicalTypeId; -use crate::value::Value; -use crate::vectors::{BooleanVectorBuilder, MutableVector}; - -#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct BooleanType; - -impl BooleanType { - pub fn arc() -> DataTypeRef { - Arc::new(Self) - } -} - -impl DataType for BooleanType { - fn name(&self) -> &str { - "Boolean" - } - - fn logical_type_id(&self) -> LogicalTypeId { - LogicalTypeId::Boolean - } - - fn default_value(&self) -> Value { - bool::default().into() - } - - fn as_arrow_type(&self) -> ArrowDataType { - ArrowDataType::Boolean - } - - fn create_mutable_vector(&self, capacity: usize) -> Box { - Box::new(BooleanVectorBuilder::with_capacity(capacity)) - } - - fn is_timestamp_compatible(&self) -> bool { - false - } -} diff --git a/src/datatypes2/src/types/list_type.rs b/src/datatypes2/src/types/list_type.rs deleted file mode 100644 index b9875ca36263..000000000000 --- a/src/datatypes2/src/types/list_type.rs +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use arrow::datatypes::{DataType as ArrowDataType, Field}; -use serde::{Deserialize, Serialize}; - -use crate::data_type::{ConcreteDataType, DataType}; -use crate::type_id::LogicalTypeId; -use crate::value::{ListValue, Value}; -use crate::vectors::{ListVectorBuilder, MutableVector}; - -/// Used to represent the List datatype. -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct ListType { - /// The type of List's item. - // Use Box to avoid recursive dependency, as enum ConcreteDataType depends on ListType. - item_type: Box, -} - -impl Default for ListType { - fn default() -> Self { - ListType::new(ConcreteDataType::null_datatype()) - } -} - -impl ListType { - /// Create a new `ListType` whose item's data type is `item_type`. 
- pub fn new(item_type: ConcreteDataType) -> Self { - ListType { - item_type: Box::new(item_type), - } - } -} - -impl DataType for ListType { - fn name(&self) -> &str { - "List" - } - - fn logical_type_id(&self) -> LogicalTypeId { - LogicalTypeId::List - } - - fn default_value(&self) -> Value { - Value::List(ListValue::new(None, *self.item_type.clone())) - } - - fn as_arrow_type(&self) -> ArrowDataType { - let field = Box::new(Field::new("item", self.item_type.as_arrow_type(), true)); - ArrowDataType::List(field) - } - - fn create_mutable_vector(&self, capacity: usize) -> Box { - Box::new(ListVectorBuilder::with_type_capacity( - *self.item_type.clone(), - capacity, - )) - } - - fn is_timestamp_compatible(&self) -> bool { - false - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::value::ListValue; - - #[test] - fn test_list_type() { - let t = ListType::new(ConcreteDataType::boolean_datatype()); - assert_eq!("List", t.name()); - assert_eq!(LogicalTypeId::List, t.logical_type_id()); - assert_eq!( - Value::List(ListValue::new(None, ConcreteDataType::boolean_datatype())), - t.default_value() - ); - assert_eq!( - ArrowDataType::List(Box::new(Field::new("item", ArrowDataType::Boolean, true))), - t.as_arrow_type() - ); - } -} diff --git a/src/datatypes2/src/types/null_type.rs b/src/datatypes2/src/types/null_type.rs deleted file mode 100644 index b9bb2dc7526d..000000000000 --- a/src/datatypes2/src/types/null_type.rs +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::sync::Arc; - -use arrow::datatypes::DataType as ArrowDataType; -use serde::{Deserialize, Serialize}; - -use crate::data_type::{DataType, DataTypeRef}; -use crate::type_id::LogicalTypeId; -use crate::value::Value; -use crate::vectors::{MutableVector, NullVectorBuilder}; - -#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct NullType; - -impl NullType { - pub fn arc() -> DataTypeRef { - Arc::new(NullType) - } -} - -impl DataType for NullType { - fn name(&self) -> &str { - "Null" - } - - fn logical_type_id(&self) -> LogicalTypeId { - LogicalTypeId::Null - } - - fn default_value(&self) -> Value { - Value::Null - } - - fn as_arrow_type(&self) -> ArrowDataType { - ArrowDataType::Null - } - - fn create_mutable_vector(&self, _capacity: usize) -> Box { - Box::new(NullVectorBuilder::default()) - } - - fn is_timestamp_compatible(&self) -> bool { - false - } -} diff --git a/src/datatypes2/src/types/primitive_type.rs b/src/datatypes2/src/types/primitive_type.rs deleted file mode 100644 index e389ca13bf91..000000000000 --- a/src/datatypes2/src/types/primitive_type.rs +++ /dev/null @@ -1,358 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::cmp::Ordering;
-
-use arrow::datatypes::{ArrowNativeType, ArrowPrimitiveType, DataType as ArrowDataType};
-use common_time::{Date, DateTime};
-use num::NumCast;
-use serde::{Deserialize, Serialize};
-use snafu::OptionExt;
-
-use crate::data_type::{ConcreteDataType, DataType};
-use crate::error::{self, Result};
-use crate::scalars::{Scalar, ScalarRef, ScalarVectorBuilder};
-use crate::type_id::LogicalTypeId;
-use crate::types::{DateTimeType, DateType};
-use crate::value::{Value, ValueRef};
-use crate::vectors::{MutableVector, PrimitiveVector, PrimitiveVectorBuilder, Vector};
-
-/// Data types that can be used as arrow's native type.
-pub trait NativeType: ArrowNativeType + NumCast {
-    /// Largest numeric type this primitive type can be cast to.
-    type LargestType: NativeType;
-}
-
-macro_rules! impl_native_type {
-    ($Type: ident, $LargestType: ident) => {
-        impl NativeType for $Type {
-            type LargestType = $LargestType;
-        }
-    };
-}
-
-impl_native_type!(u8, u64);
-impl_native_type!(u16, u64);
-impl_native_type!(u32, u64);
-impl_native_type!(u64, u64);
-impl_native_type!(i8, i64);
-impl_native_type!(i16, i64);
-impl_native_type!(i32, i64);
-impl_native_type!(i64, i64);
-impl_native_type!(f32, f64);
-impl_native_type!(f64, f64);
-
-/// Represents the wrapper type that wraps a native type using the `newtype pattern`,
-/// such as [Date](`common_time::Date`) is a wrapper type for the underlying native
-/// type `i32`.
-pub trait WrapperType:
-    Copy
-    + Scalar
-    + PartialEq
-    + Into<Value>
-    + Into<ValueRef<'static>>
-    + Serialize
-    + Into<serde_json::Value>
-{
-    /// Logical primitive type that this wrapper type belongs to.
-    type LogicalType: LogicalPrimitiveType<Wrapper = Self, Native = Self::Native>;
-    /// The underlying native type.
-    type Native: NativeType;
-
-    /// Convert native type into this wrapper type.
-    fn from_native(value: Self::Native) -> Self;
-
-    /// Convert this wrapper type into native type.
-    fn into_native(self) -> Self::Native;
-}
-
-/// Trait bridging the logical primitive type with [ArrowPrimitiveType].
-pub trait LogicalPrimitiveType: 'static + Sized {
-    /// Arrow primitive type of this logical type.
-    type ArrowPrimitive: ArrowPrimitiveType<Native = Self::Native>;
-    /// Native (physical) type of this logical type.
-    type Native: NativeType;
-    /// Wrapper type that the vector returns.
-    type Wrapper: WrapperType<LogicalType = Self, Native = Self::Native>
-        + for<'a> Scalar<VectorType = PrimitiveVector<Self>, RefType<'a> = Self::Wrapper>
-        + for<'a> ScalarRef<'a, ScalarType = Self::Wrapper>;
-
-    /// Construct the data type struct.
-    fn build_data_type() -> ConcreteDataType;
-
-    /// Return the name of the type.
-    fn type_name() -> &'static str;
-
-    /// Dynamic cast the vector to the concrete vector type.
-    fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveVector<Self>>;
-
-    /// Cast value ref to the primitive type.
-    fn cast_value_ref(value: ValueRef) -> Result<Option<Self::Wrapper>>;
-}
-
-/// A new type for [WrapperType], complementing the `Ord` feature for it. Wrapping non-ordered
-/// primitive types like `f32` and `f64` in `OrdPrimitive` makes them usable in places that
-/// require `Ord`. For example, in `Median` or `Percentile` UDAFs.
-#[derive(Debug, Clone, Copy, PartialEq)]
-pub struct OrdPrimitive<T: WrapperType>(pub T);
-
-impl<T: WrapperType> OrdPrimitive<T> {
-    pub fn as_primitive(&self) -> T {
-        self.0
-    }
-}
-
-impl<T: WrapperType> Eq for OrdPrimitive<T> {}
-
-impl<T: WrapperType> PartialOrd for OrdPrimitive<T> {
-    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-        Some(self.cmp(other))
-    }
-}
-
-impl<T: WrapperType> Ord for OrdPrimitive<T> {
-    fn cmp(&self, other: &Self) -> Ordering {
-        Into::<Value>::into(self.0).cmp(&Into::<Value>::into(other.0))
-    }
-}
-
-impl<T: WrapperType> From<OrdPrimitive<T>> for Value {
-    fn from(p: OrdPrimitive<T>) -> Self {
-        p.0.into()
-    }
-}
-
-macro_rules! impl_wrapper {
-    ($Type: ident, $LogicalType: ident) => {
-        impl WrapperType for $Type {
-            type LogicalType = $LogicalType;
-            type Native = $Type;
-
-            fn from_native(value: Self::Native) -> Self {
-                value
-            }
-
-            fn into_native(self) -> Self::Native {
-                self
-            }
-        }
-    };
-}
-
-impl_wrapper!(u8, UInt8Type);
-impl_wrapper!(u16, UInt16Type);
-impl_wrapper!(u32, UInt32Type);
-impl_wrapper!(u64, UInt64Type);
-impl_wrapper!(i8, Int8Type);
-impl_wrapper!(i16, Int16Type);
-impl_wrapper!(i32, Int32Type);
-impl_wrapper!(i64, Int64Type);
-impl_wrapper!(f32, Float32Type);
-impl_wrapper!(f64, Float64Type);
-
-impl WrapperType for Date {
-    type LogicalType = DateType;
-    type Native = i32;
-
-    fn from_native(value: i32) -> Self {
-        Date::new(value)
-    }
-
-    fn into_native(self) -> i32 {
-        self.val()
-    }
-}
-
-impl WrapperType for DateTime {
-    type LogicalType = DateTimeType;
-    type Native = i64;
-
-    fn from_native(value: Self::Native) -> Self {
-        DateTime::new(value)
-    }
-
-    fn into_native(self) -> Self::Native {
-        self.val()
-    }
-}
-
-macro_rules! define_logical_primitive_type {
-    ($Native: ident, $TypeId: ident, $DataType: ident) => {
-        // We need to define it as an empty struct `struct DataType {}` instead of a struct-unit
-        // `struct DataType;` to ensure the serialized JSON string is compatible with previous
-        // implementation.
-        #[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
-        pub struct $DataType {}
-
-        impl LogicalPrimitiveType for $DataType {
-            type ArrowPrimitive = arrow::datatypes::$DataType;
-            type Native = $Native;
-            type Wrapper = $Native;
-
-            fn build_data_type() -> ConcreteDataType {
-                ConcreteDataType::$TypeId($DataType::default())
-            }
-
-            fn type_name() -> &'static str {
-                stringify!($TypeId)
-            }
-
-            fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveVector<$DataType>> {
-                vector
-                    .as_any()
-                    .downcast_ref::<PrimitiveVector<$DataType>>()
-                    .with_context(|| error::CastTypeSnafu {
-                        msg: format!(
-                            "Failed to cast {} to vector of primitive type {}",
-                            vector.vector_type_name(),
-                            stringify!($TypeId)
-                        ),
-                    })
-            }
-
-            fn cast_value_ref(value: ValueRef) -> Result<Option<Self::Wrapper>> {
-                match value {
-                    ValueRef::Null => Ok(None),
-                    ValueRef::$TypeId(v) => Ok(Some(v.into())),
-                    other => error::CastTypeSnafu {
-                        msg: format!(
-                            "Failed to cast value {:?} to primitive type {}",
-                            other,
-                            stringify!($TypeId),
-                        ),
-                    }
-                    .fail(),
-                }
-            }
-        }
-    };
-}
-
-macro_rules!
define_non_timestamp_primitive { - ($Native: ident, $TypeId: ident, $DataType: ident) => { - define_logical_primitive_type!($Native, $TypeId, $DataType); - - impl DataType for $DataType { - fn name(&self) -> &str { - stringify!($TypeId) - } - - fn logical_type_id(&self) -> LogicalTypeId { - LogicalTypeId::$TypeId - } - - fn default_value(&self) -> Value { - $Native::default().into() - } - - fn as_arrow_type(&self) -> ArrowDataType { - ArrowDataType::$TypeId - } - - fn create_mutable_vector(&self, capacity: usize) -> Box { - Box::new(PrimitiveVectorBuilder::<$DataType>::with_capacity(capacity)) - } - - fn is_timestamp_compatible(&self) -> bool { - false - } - } - }; -} - -define_non_timestamp_primitive!(u8, UInt8, UInt8Type); -define_non_timestamp_primitive!(u16, UInt16, UInt16Type); -define_non_timestamp_primitive!(u32, UInt32, UInt32Type); -define_non_timestamp_primitive!(u64, UInt64, UInt64Type); -define_non_timestamp_primitive!(i8, Int8, Int8Type); -define_non_timestamp_primitive!(i16, Int16, Int16Type); -define_non_timestamp_primitive!(i32, Int32, Int32Type); -define_non_timestamp_primitive!(f32, Float32, Float32Type); -define_non_timestamp_primitive!(f64, Float64, Float64Type); - -// Timestamp primitive: -define_logical_primitive_type!(i64, Int64, Int64Type); - -impl DataType for Int64Type { - fn name(&self) -> &str { - "Int64" - } - - fn logical_type_id(&self) -> LogicalTypeId { - LogicalTypeId::Int64 - } - - fn default_value(&self) -> Value { - Value::Int64(0) - } - - fn as_arrow_type(&self) -> ArrowDataType { - ArrowDataType::Int64 - } - - fn create_mutable_vector(&self, capacity: usize) -> Box { - Box::new(PrimitiveVectorBuilder::::with_capacity(capacity)) - } - - fn is_timestamp_compatible(&self) -> bool { - true - } -} - -#[cfg(test)] -mod tests { - use std::collections::BinaryHeap; - - use super::*; - - #[test] - fn test_ord_primitive() { - struct Foo - where - T: WrapperType, - { - heap: BinaryHeap>, - } - - impl Foo - where - T: WrapperType, - { - fn push(&mut self, value: T) { - let value = OrdPrimitive::(value); - self.heap.push(value); - } - } - - macro_rules! test { - ($Type:ident) => { - let mut foo = Foo::<$Type> { - heap: BinaryHeap::new(), - }; - foo.push($Type::default()); - }; - } - - test!(u8); - test!(u16); - test!(u32); - test!(u64); - test!(i8); - test!(i16); - test!(i32); - test!(i64); - test!(f32); - test!(f64); - } -} diff --git a/src/datatypes2/src/types/string_type.rs b/src/datatypes2/src/types/string_type.rs deleted file mode 100644 index 799cbbbdd345..000000000000 --- a/src/datatypes2/src/types/string_type.rs +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::sync::Arc; - -use arrow::datatypes::DataType as ArrowDataType; -use common_base::bytes::StringBytes; -use serde::{Deserialize, Serialize}; - -use crate::data_type::{DataType, DataTypeRef}; -use crate::prelude::ScalarVectorBuilder; -use crate::type_id::LogicalTypeId; -use crate::value::Value; -use crate::vectors::{MutableVector, StringVectorBuilder}; - -#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct StringType; - -impl StringType { - pub fn arc() -> DataTypeRef { - Arc::new(Self) - } -} - -impl DataType for StringType { - fn name(&self) -> &str { - "String" - } - - fn logical_type_id(&self) -> LogicalTypeId { - LogicalTypeId::String - } - - fn default_value(&self) -> Value { - StringBytes::default().into() - } - - fn as_arrow_type(&self) -> ArrowDataType { - ArrowDataType::Utf8 - } - - fn create_mutable_vector(&self, capacity: usize) -> Box { - Box::new(StringVectorBuilder::with_capacity(capacity)) - } - - fn is_timestamp_compatible(&self) -> bool { - false - } -} diff --git a/src/datatypes2/src/value.rs b/src/datatypes2/src/value.rs deleted file mode 100644 index bade88d419af..000000000000 --- a/src/datatypes2/src/value.rs +++ /dev/null @@ -1,1275 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::cmp::Ordering; -use std::fmt::{Display, Formatter}; - -use common_base::bytes::{Bytes, StringBytes}; -use common_time::date::Date; -use common_time::datetime::DateTime; -use common_time::timestamp::{TimeUnit, Timestamp}; -use datafusion_common::ScalarValue; -pub use ordered_float::OrderedFloat; -use serde::{Deserialize, Serialize}; - -use crate::error::{self, Result}; -use crate::prelude::*; -use crate::type_id::LogicalTypeId; -use crate::vectors::ListVector; - -pub type OrderedF32 = OrderedFloat; -pub type OrderedF64 = OrderedFloat; - -/// Value holds a single arbitrary value of any [DataType](crate::data_type::DataType). -/// -/// Comparison between values with different types (expect Null) is not allowed. 
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub enum Value { - Null, - - // Numeric types: - Boolean(bool), - UInt8(u8), - UInt16(u16), - UInt32(u32), - UInt64(u64), - Int8(i8), - Int16(i16), - Int32(i32), - Int64(i64), - Float32(OrderedF32), - Float64(OrderedF64), - - // String types: - String(StringBytes), - Binary(Bytes), - - // Date & Time types: - Date(Date), - DateTime(DateTime), - Timestamp(Timestamp), - - List(ListValue), -} - -impl Display for Value { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - match self { - Value::Null => write!(f, "{}", self.data_type().name()), - Value::Boolean(v) => write!(f, "{}", v), - Value::UInt8(v) => write!(f, "{}", v), - Value::UInt16(v) => write!(f, "{}", v), - Value::UInt32(v) => write!(f, "{}", v), - Value::UInt64(v) => write!(f, "{}", v), - Value::Int8(v) => write!(f, "{}", v), - Value::Int16(v) => write!(f, "{}", v), - Value::Int32(v) => write!(f, "{}", v), - Value::Int64(v) => write!(f, "{}", v), - Value::Float32(v) => write!(f, "{}", v), - Value::Float64(v) => write!(f, "{}", v), - Value::String(v) => write!(f, "{}", v.as_utf8()), - Value::Binary(v) => { - let hex = v - .iter() - .map(|b| format!("{:02x}", b)) - .collect::>() - .join(""); - write!(f, "{}", hex) - } - Value::Date(v) => write!(f, "{}", v), - Value::DateTime(v) => write!(f, "{}", v), - Value::Timestamp(v) => write!(f, "{}", v.to_iso8601_string()), - Value::List(v) => { - let default = Box::new(vec![]); - let items = v.items().as_ref().unwrap_or(&default); - let items = items - .iter() - .map(|i| i.to_string()) - .collect::>() - .join(", "); - write!(f, "{}[{}]", v.datatype.name(), items) - } - } - } -} - -impl Value { - /// Returns data type of the value. - /// - /// # Panics - /// Panics if the data type is not supported. - pub fn data_type(&self) -> ConcreteDataType { - // TODO(yingwen): Implement this once all data types are implemented. - match self { - Value::Null => ConcreteDataType::null_datatype(), - Value::Boolean(_) => ConcreteDataType::boolean_datatype(), - Value::UInt8(_) => ConcreteDataType::uint8_datatype(), - Value::UInt16(_) => ConcreteDataType::uint16_datatype(), - Value::UInt32(_) => ConcreteDataType::uint32_datatype(), - Value::UInt64(_) => ConcreteDataType::uint64_datatype(), - Value::Int8(_) => ConcreteDataType::int8_datatype(), - Value::Int16(_) => ConcreteDataType::int16_datatype(), - Value::Int32(_) => ConcreteDataType::int32_datatype(), - Value::Int64(_) => ConcreteDataType::int64_datatype(), - Value::Float32(_) => ConcreteDataType::float32_datatype(), - Value::Float64(_) => ConcreteDataType::float64_datatype(), - Value::String(_) => ConcreteDataType::string_datatype(), - Value::Binary(_) => ConcreteDataType::binary_datatype(), - Value::Date(_) => ConcreteDataType::date_datatype(), - Value::DateTime(_) => ConcreteDataType::datetime_datatype(), - Value::Timestamp(v) => ConcreteDataType::timestamp_datatype(v.unit()), - Value::List(list) => ConcreteDataType::list_datatype(list.datatype().clone()), - } - } - - /// Returns true if this is a null value. - pub fn is_null(&self) -> bool { - matches!(self, Value::Null) - } - - /// Cast itself to [ListValue]. - pub fn as_list(&self) -> Result> { - match self { - Value::Null => Ok(None), - Value::List(v) => Ok(Some(v)), - other => error::CastTypeSnafu { - msg: format!("Failed to cast {:?} to list value", other), - } - .fail(), - } - } - - /// Cast itself to [ValueRef]. 
- pub fn as_value_ref(&self) -> ValueRef { - match self { - Value::Null => ValueRef::Null, - Value::Boolean(v) => ValueRef::Boolean(*v), - Value::UInt8(v) => ValueRef::UInt8(*v), - Value::UInt16(v) => ValueRef::UInt16(*v), - Value::UInt32(v) => ValueRef::UInt32(*v), - Value::UInt64(v) => ValueRef::UInt64(*v), - Value::Int8(v) => ValueRef::Int8(*v), - Value::Int16(v) => ValueRef::Int16(*v), - Value::Int32(v) => ValueRef::Int32(*v), - Value::Int64(v) => ValueRef::Int64(*v), - Value::Float32(v) => ValueRef::Float32(*v), - Value::Float64(v) => ValueRef::Float64(*v), - Value::String(v) => ValueRef::String(v.as_utf8()), - Value::Binary(v) => ValueRef::Binary(v), - Value::Date(v) => ValueRef::Date(*v), - Value::DateTime(v) => ValueRef::DateTime(*v), - Value::List(v) => ValueRef::List(ListValueRef::Ref { val: v }), - Value::Timestamp(v) => ValueRef::Timestamp(*v), - } - } - - /// Returns the logical type of the value. - pub fn logical_type_id(&self) -> LogicalTypeId { - match self { - Value::Null => LogicalTypeId::Null, - Value::Boolean(_) => LogicalTypeId::Boolean, - Value::UInt8(_) => LogicalTypeId::UInt8, - Value::UInt16(_) => LogicalTypeId::UInt16, - Value::UInt32(_) => LogicalTypeId::UInt32, - Value::UInt64(_) => LogicalTypeId::UInt64, - Value::Int8(_) => LogicalTypeId::Int8, - Value::Int16(_) => LogicalTypeId::Int16, - Value::Int32(_) => LogicalTypeId::Int32, - Value::Int64(_) => LogicalTypeId::Int64, - Value::Float32(_) => LogicalTypeId::Float32, - Value::Float64(_) => LogicalTypeId::Float64, - Value::String(_) => LogicalTypeId::String, - Value::Binary(_) => LogicalTypeId::Binary, - Value::List(_) => LogicalTypeId::List, - Value::Date(_) => LogicalTypeId::Date, - Value::DateTime(_) => LogicalTypeId::DateTime, - Value::Timestamp(t) => match t.unit() { - TimeUnit::Second => LogicalTypeId::TimestampSecond, - TimeUnit::Millisecond => LogicalTypeId::TimestampMillisecond, - TimeUnit::Microsecond => LogicalTypeId::TimestampMicrosecond, - TimeUnit::Nanosecond => LogicalTypeId::TimestampNanosecond, - }, - } - } -} - -macro_rules! 
impl_ord_for_value_like { - ($Type: ident, $left: ident, $right: ident) => { - if $left.is_null() && !$right.is_null() { - return Ordering::Less; - } else if !$left.is_null() && $right.is_null() { - return Ordering::Greater; - } else { - match ($left, $right) { - ($Type::Null, $Type::Null) => Ordering::Equal, - ($Type::Boolean(v1), $Type::Boolean(v2)) => v1.cmp(v2), - ($Type::UInt8(v1), $Type::UInt8(v2)) => v1.cmp(v2), - ($Type::UInt16(v1), $Type::UInt16(v2)) => v1.cmp(v2), - ($Type::UInt32(v1), $Type::UInt32(v2)) => v1.cmp(v2), - ($Type::UInt64(v1), $Type::UInt64(v2)) => v1.cmp(v2), - ($Type::Int8(v1), $Type::Int8(v2)) => v1.cmp(v2), - ($Type::Int16(v1), $Type::Int16(v2)) => v1.cmp(v2), - ($Type::Int32(v1), $Type::Int32(v2)) => v1.cmp(v2), - ($Type::Int64(v1), $Type::Int64(v2)) => v1.cmp(v2), - ($Type::Float32(v1), $Type::Float32(v2)) => v1.cmp(v2), - ($Type::Float64(v1), $Type::Float64(v2)) => v1.cmp(v2), - ($Type::String(v1), $Type::String(v2)) => v1.cmp(v2), - ($Type::Binary(v1), $Type::Binary(v2)) => v1.cmp(v2), - ($Type::Date(v1), $Type::Date(v2)) => v1.cmp(v2), - ($Type::DateTime(v1), $Type::DateTime(v2)) => v1.cmp(v2), - ($Type::Timestamp(v1), $Type::Timestamp(v2)) => v1.cmp(v2), - ($Type::List(v1), $Type::List(v2)) => v1.cmp(v2), - _ => panic!( - "Cannot compare different values {:?} and {:?}", - $left, $right - ), - } - } - }; -} - -impl PartialOrd for Value { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl Ord for Value { - fn cmp(&self, other: &Self) -> Ordering { - impl_ord_for_value_like!(Value, self, other) - } -} - -macro_rules! impl_value_from { - ($Variant: ident, $Type: ident) => { - impl From<$Type> for Value { - fn from(value: $Type) -> Self { - Value::$Variant(value.into()) - } - } - - impl From> for Value { - fn from(value: Option<$Type>) -> Self { - match value { - Some(v) => Value::$Variant(v.into()), - None => Value::Null, - } - } - } - }; -} - -impl_value_from!(Boolean, bool); -impl_value_from!(UInt8, u8); -impl_value_from!(UInt16, u16); -impl_value_from!(UInt32, u32); -impl_value_from!(UInt64, u64); -impl_value_from!(Int8, i8); -impl_value_from!(Int16, i16); -impl_value_from!(Int32, i32); -impl_value_from!(Int64, i64); -impl_value_from!(Float32, f32); -impl_value_from!(Float64, f64); -impl_value_from!(String, StringBytes); -impl_value_from!(Binary, Bytes); -impl_value_from!(Date, Date); -impl_value_from!(DateTime, DateTime); -impl_value_from!(Timestamp, Timestamp); - -impl From for Value { - fn from(string: String) -> Value { - Value::String(string.into()) - } -} - -impl From<&str> for Value { - fn from(string: &str) -> Value { - Value::String(string.into()) - } -} - -impl From> for Value { - fn from(bytes: Vec) -> Value { - Value::Binary(bytes.into()) - } -} - -impl From<&[u8]> for Value { - fn from(bytes: &[u8]) -> Value { - Value::Binary(bytes.into()) - } -} - -impl TryFrom for serde_json::Value { - type Error = serde_json::Error; - - fn try_from(value: Value) -> serde_json::Result { - let json_value = match value { - Value::Null => serde_json::Value::Null, - Value::Boolean(v) => serde_json::Value::Bool(v), - Value::UInt8(v) => serde_json::Value::from(v), - Value::UInt16(v) => serde_json::Value::from(v), - Value::UInt32(v) => serde_json::Value::from(v), - Value::UInt64(v) => serde_json::Value::from(v), - Value::Int8(v) => serde_json::Value::from(v), - Value::Int16(v) => serde_json::Value::from(v), - Value::Int32(v) => serde_json::Value::from(v), - Value::Int64(v) => serde_json::Value::from(v), - Value::Float32(v) => 
serde_json::Value::from(v.0), - Value::Float64(v) => serde_json::Value::from(v.0), - Value::String(bytes) => serde_json::Value::String(bytes.as_utf8().to_string()), - Value::Binary(bytes) => serde_json::to_value(bytes)?, - Value::Date(v) => serde_json::Value::Number(v.val().into()), - Value::DateTime(v) => serde_json::Value::Number(v.val().into()), - Value::List(v) => serde_json::to_value(v)?, - Value::Timestamp(v) => serde_json::to_value(v.value())?, - }; - - Ok(json_value) - } -} - -// TODO(yingwen): Consider removing the `datatype` field from `ListValue`. -/// List value. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -pub struct ListValue { - /// List of nested Values (boxed to reduce size_of(Value)) - #[allow(clippy::box_collection)] - items: Option>>, - /// Inner values datatype, to distinguish empty lists of different datatypes. - /// Restricted by DataFusion, cannot use null datatype for empty list. - datatype: ConcreteDataType, -} - -impl Eq for ListValue {} - -impl ListValue { - pub fn new(items: Option>>, datatype: ConcreteDataType) -> Self { - Self { items, datatype } - } - - pub fn items(&self) -> &Option>> { - &self.items - } - - pub fn datatype(&self) -> &ConcreteDataType { - &self.datatype - } -} - -impl Default for ListValue { - fn default() -> ListValue { - ListValue::new(None, ConcreteDataType::null_datatype()) - } -} - -impl PartialOrd for ListValue { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl Ord for ListValue { - fn cmp(&self, other: &Self) -> Ordering { - assert_eq!( - self.datatype, other.datatype, - "Cannot compare different datatypes!" - ); - self.items.cmp(&other.items) - } -} - -impl TryFrom for Value { - type Error = error::Error; - - fn try_from(v: ScalarValue) -> Result { - let v = match v { - ScalarValue::Null => Value::Null, - ScalarValue::Boolean(b) => Value::from(b), - ScalarValue::Float32(f) => Value::from(f), - ScalarValue::Float64(f) => Value::from(f), - ScalarValue::Int8(i) => Value::from(i), - ScalarValue::Int16(i) => Value::from(i), - ScalarValue::Int32(i) => Value::from(i), - ScalarValue::Int64(i) => Value::from(i), - ScalarValue::UInt8(u) => Value::from(u), - ScalarValue::UInt16(u) => Value::from(u), - ScalarValue::UInt32(u) => Value::from(u), - ScalarValue::UInt64(u) => Value::from(u), - ScalarValue::Utf8(s) | ScalarValue::LargeUtf8(s) => { - Value::from(s.map(StringBytes::from)) - } - ScalarValue::Binary(b) - | ScalarValue::LargeBinary(b) - | ScalarValue::FixedSizeBinary(_, b) => Value::from(b.map(Bytes::from)), - ScalarValue::List(vs, field) => { - let items = if let Some(vs) = vs { - let vs = vs - .into_iter() - .map(ScalarValue::try_into) - .collect::>()?; - Some(Box::new(vs)) - } else { - None - }; - let datatype = ConcreteDataType::try_from(field.data_type())?; - Value::List(ListValue::new(items, datatype)) - } - ScalarValue::Date32(d) => d.map(|x| Value::Date(Date::new(x))).unwrap_or(Value::Null), - ScalarValue::Date64(d) => d - .map(|x| Value::DateTime(DateTime::new(x))) - .unwrap_or(Value::Null), - ScalarValue::TimestampSecond(t, _) => t - .map(|x| Value::Timestamp(Timestamp::new(x, TimeUnit::Second))) - .unwrap_or(Value::Null), - ScalarValue::TimestampMillisecond(t, _) => t - .map(|x| Value::Timestamp(Timestamp::new(x, TimeUnit::Millisecond))) - .unwrap_or(Value::Null), - ScalarValue::TimestampMicrosecond(t, _) => t - .map(|x| Value::Timestamp(Timestamp::new(x, TimeUnit::Microsecond))) - .unwrap_or(Value::Null), - ScalarValue::TimestampNanosecond(t, _) => t - .map(|x| 
Value::Timestamp(Timestamp::new(x, TimeUnit::Nanosecond))) - .unwrap_or(Value::Null), - ScalarValue::Decimal128(_, _, _) - | ScalarValue::Time64(_) - | ScalarValue::IntervalYearMonth(_) - | ScalarValue::IntervalDayTime(_) - | ScalarValue::IntervalMonthDayNano(_) - | ScalarValue::Struct(_, _) - | ScalarValue::Dictionary(_, _) => { - return error::UnsupportedArrowTypeSnafu { - arrow_type: v.get_datatype(), - } - .fail() - } - }; - Ok(v) - } -} - -/// Reference to [Value]. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum ValueRef<'a> { - Null, - - // Numeric types: - Boolean(bool), - UInt8(u8), - UInt16(u16), - UInt32(u32), - UInt64(u64), - Int8(i8), - Int16(i16), - Int32(i32), - Int64(i64), - Float32(OrderedF32), - Float64(OrderedF64), - - // String types: - String(&'a str), - Binary(&'a [u8]), - - // Date & Time types: - Date(Date), - DateTime(DateTime), - Timestamp(Timestamp), - List(ListValueRef<'a>), -} - -macro_rules! impl_as_for_value_ref { - ($value: ident, $Variant: ident) => { - match $value { - ValueRef::Null => Ok(None), - ValueRef::$Variant(v) => Ok(Some(*v)), - other => error::CastTypeSnafu { - msg: format!( - "Failed to cast value ref {:?} to {}", - other, - stringify!($Variant) - ), - } - .fail(), - } - }; -} - -impl<'a> ValueRef<'a> { - /// Returns true if this is null. - pub fn is_null(&self) -> bool { - matches!(self, ValueRef::Null) - } - - /// Cast itself to binary slice. - pub fn as_binary(&self) -> Result> { - impl_as_for_value_ref!(self, Binary) - } - - /// Cast itself to string slice. - pub fn as_string(&self) -> Result> { - impl_as_for_value_ref!(self, String) - } - - /// Cast itself to boolean. - pub fn as_boolean(&self) -> Result> { - impl_as_for_value_ref!(self, Boolean) - } - - /// Cast itself to [Date]. - pub fn as_date(&self) -> Result> { - impl_as_for_value_ref!(self, Date) - } - - /// Cast itself to [DateTime]. - pub fn as_datetime(&self) -> Result> { - impl_as_for_value_ref!(self, DateTime) - } - - pub fn as_timestamp(&self) -> Result> { - impl_as_for_value_ref!(self, Timestamp) - } - - /// Cast itself to [ListValueRef]. - pub fn as_list(&self) -> Result> { - impl_as_for_value_ref!(self, List) - } -} - -impl<'a> PartialOrd for ValueRef<'a> { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl<'a> Ord for ValueRef<'a> { - fn cmp(&self, other: &Self) -> Ordering { - impl_ord_for_value_like!(ValueRef, self, other) - } -} - -macro_rules! 
impl_value_ref_from { - ($Variant:ident, $Type:ident) => { - impl From<$Type> for ValueRef<'_> { - fn from(value: $Type) -> Self { - ValueRef::$Variant(value.into()) - } - } - - impl From> for ValueRef<'_> { - fn from(value: Option<$Type>) -> Self { - match value { - Some(v) => ValueRef::$Variant(v.into()), - None => ValueRef::Null, - } - } - } - }; -} - -impl_value_ref_from!(Boolean, bool); -impl_value_ref_from!(UInt8, u8); -impl_value_ref_from!(UInt16, u16); -impl_value_ref_from!(UInt32, u32); -impl_value_ref_from!(UInt64, u64); -impl_value_ref_from!(Int8, i8); -impl_value_ref_from!(Int16, i16); -impl_value_ref_from!(Int32, i32); -impl_value_ref_from!(Int64, i64); -impl_value_ref_from!(Float32, f32); -impl_value_ref_from!(Float64, f64); -impl_value_ref_from!(Date, Date); -impl_value_ref_from!(DateTime, DateTime); -impl_value_ref_from!(Timestamp, Timestamp); - -impl<'a> From<&'a str> for ValueRef<'a> { - fn from(string: &'a str) -> ValueRef<'a> { - ValueRef::String(string) - } -} - -impl<'a> From<&'a [u8]> for ValueRef<'a> { - fn from(bytes: &'a [u8]) -> ValueRef<'a> { - ValueRef::Binary(bytes) - } -} - -impl<'a> From>> for ValueRef<'a> { - fn from(list: Option) -> ValueRef { - match list { - Some(v) => ValueRef::List(v), - None => ValueRef::Null, - } - } -} - -/// Reference to a [ListValue]. -/// -/// Now comparison still requires some allocation (call of `to_value()`) and -/// might be avoidable by downcasting and comparing the underlying array slice -/// if it becomes bottleneck. -#[derive(Debug, Clone, Copy)] -pub enum ListValueRef<'a> { - // TODO(yingwen): Consider replace this by VectorRef. - Indexed { vector: &'a ListVector, idx: usize }, - Ref { val: &'a ListValue }, -} - -impl<'a> ListValueRef<'a> { - /// Convert self to [Value]. This method would clone the underlying data. - fn to_value(self) -> Value { - match self { - ListValueRef::Indexed { vector, idx } => vector.get(idx), - ListValueRef::Ref { val } => Value::List(val.clone()), - } - } -} - -impl<'a> PartialEq for ListValueRef<'a> { - fn eq(&self, other: &Self) -> bool { - self.to_value().eq(&other.to_value()) - } -} - -impl<'a> Eq for ListValueRef<'a> {} - -impl<'a> Ord for ListValueRef<'a> { - fn cmp(&self, other: &Self) -> Ordering { - // Respect the order of `Value` by converting into value before comparison. 
- self.to_value().cmp(&other.to_value()) - } -} - -impl<'a> PartialOrd for ListValueRef<'a> { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -#[cfg(test)] -mod tests { - use arrow::datatypes::DataType as ArrowDataType; - use num_traits::Float; - - use super::*; - - #[test] - fn test_try_from_scalar_value() { - assert_eq!( - Value::Boolean(true), - ScalarValue::Boolean(Some(true)).try_into().unwrap() - ); - assert_eq!( - Value::Boolean(false), - ScalarValue::Boolean(Some(false)).try_into().unwrap() - ); - assert_eq!(Value::Null, ScalarValue::Boolean(None).try_into().unwrap()); - - assert_eq!( - Value::Float32(1.0f32.into()), - ScalarValue::Float32(Some(1.0f32)).try_into().unwrap() - ); - assert_eq!(Value::Null, ScalarValue::Float32(None).try_into().unwrap()); - - assert_eq!( - Value::Float64(2.0f64.into()), - ScalarValue::Float64(Some(2.0f64)).try_into().unwrap() - ); - assert_eq!(Value::Null, ScalarValue::Float64(None).try_into().unwrap()); - - assert_eq!( - Value::Int8(i8::MAX), - ScalarValue::Int8(Some(i8::MAX)).try_into().unwrap() - ); - assert_eq!(Value::Null, ScalarValue::Int8(None).try_into().unwrap()); - - assert_eq!( - Value::Int16(i16::MAX), - ScalarValue::Int16(Some(i16::MAX)).try_into().unwrap() - ); - assert_eq!(Value::Null, ScalarValue::Int16(None).try_into().unwrap()); - - assert_eq!( - Value::Int32(i32::MAX), - ScalarValue::Int32(Some(i32::MAX)).try_into().unwrap() - ); - assert_eq!(Value::Null, ScalarValue::Int32(None).try_into().unwrap()); - - assert_eq!( - Value::Int64(i64::MAX), - ScalarValue::Int64(Some(i64::MAX)).try_into().unwrap() - ); - assert_eq!(Value::Null, ScalarValue::Int64(None).try_into().unwrap()); - - assert_eq!( - Value::UInt8(u8::MAX), - ScalarValue::UInt8(Some(u8::MAX)).try_into().unwrap() - ); - assert_eq!(Value::Null, ScalarValue::UInt8(None).try_into().unwrap()); - - assert_eq!( - Value::UInt16(u16::MAX), - ScalarValue::UInt16(Some(u16::MAX)).try_into().unwrap() - ); - assert_eq!(Value::Null, ScalarValue::UInt16(None).try_into().unwrap()); - - assert_eq!( - Value::UInt32(u32::MAX), - ScalarValue::UInt32(Some(u32::MAX)).try_into().unwrap() - ); - assert_eq!(Value::Null, ScalarValue::UInt32(None).try_into().unwrap()); - - assert_eq!( - Value::UInt64(u64::MAX), - ScalarValue::UInt64(Some(u64::MAX)).try_into().unwrap() - ); - assert_eq!(Value::Null, ScalarValue::UInt64(None).try_into().unwrap()); - - assert_eq!( - Value::from("hello"), - ScalarValue::Utf8(Some("hello".to_string())) - .try_into() - .unwrap() - ); - assert_eq!(Value::Null, ScalarValue::Utf8(None).try_into().unwrap()); - - assert_eq!( - Value::from("large_hello"), - ScalarValue::LargeUtf8(Some("large_hello".to_string())) - .try_into() - .unwrap() - ); - assert_eq!( - Value::Null, - ScalarValue::LargeUtf8(None).try_into().unwrap() - ); - - assert_eq!( - Value::from("world".as_bytes()), - ScalarValue::Binary(Some("world".as_bytes().to_vec())) - .try_into() - .unwrap() - ); - assert_eq!(Value::Null, ScalarValue::Binary(None).try_into().unwrap()); - - assert_eq!( - Value::from("large_world".as_bytes()), - ScalarValue::LargeBinary(Some("large_world".as_bytes().to_vec())) - .try_into() - .unwrap() - ); - assert_eq!( - Value::Null, - ScalarValue::LargeBinary(None).try_into().unwrap() - ); - - assert_eq!( - Value::List(ListValue::new( - Some(Box::new(vec![Value::Int32(1), Value::Null])), - ConcreteDataType::int32_datatype() - )), - ScalarValue::new_list( - Some(vec![ScalarValue::Int32(Some(1)), ScalarValue::Int32(None)]), - ArrowDataType::Int32, - ) - .try_into() 
- .unwrap() - ); - assert_eq!( - Value::List(ListValue::new(None, ConcreteDataType::uint32_datatype())), - ScalarValue::new_list(None, ArrowDataType::UInt32) - .try_into() - .unwrap() - ); - - assert_eq!( - Value::Date(Date::new(123)), - ScalarValue::Date32(Some(123)).try_into().unwrap() - ); - assert_eq!(Value::Null, ScalarValue::Date32(None).try_into().unwrap()); - - assert_eq!( - Value::DateTime(DateTime::new(456)), - ScalarValue::Date64(Some(456)).try_into().unwrap() - ); - assert_eq!(Value::Null, ScalarValue::Date64(None).try_into().unwrap()); - - assert_eq!( - Value::Timestamp(Timestamp::new(1, TimeUnit::Second)), - ScalarValue::TimestampSecond(Some(1), None) - .try_into() - .unwrap() - ); - assert_eq!( - Value::Null, - ScalarValue::TimestampSecond(None, None).try_into().unwrap() - ); - - assert_eq!( - Value::Timestamp(Timestamp::new(1, TimeUnit::Millisecond)), - ScalarValue::TimestampMillisecond(Some(1), None) - .try_into() - .unwrap() - ); - assert_eq!( - Value::Null, - ScalarValue::TimestampMillisecond(None, None) - .try_into() - .unwrap() - ); - - assert_eq!( - Value::Timestamp(Timestamp::new(1, TimeUnit::Microsecond)), - ScalarValue::TimestampMicrosecond(Some(1), None) - .try_into() - .unwrap() - ); - assert_eq!( - Value::Null, - ScalarValue::TimestampMicrosecond(None, None) - .try_into() - .unwrap() - ); - - assert_eq!( - Value::Timestamp(Timestamp::new(1, TimeUnit::Nanosecond)), - ScalarValue::TimestampNanosecond(Some(1), None) - .try_into() - .unwrap() - ); - assert_eq!( - Value::Null, - ScalarValue::TimestampNanosecond(None, None) - .try_into() - .unwrap() - ); - - let result: Result = ScalarValue::Decimal128(Some(1), 0, 0).try_into(); - result - .unwrap_err() - .to_string() - .contains("Unsupported arrow data type, type: Decimal(0, 0)"); - } - - #[test] - fn test_value_from_inner() { - assert_eq!(Value::Boolean(true), Value::from(true)); - assert_eq!(Value::Boolean(false), Value::from(false)); - - assert_eq!(Value::UInt8(u8::MIN), Value::from(u8::MIN)); - assert_eq!(Value::UInt8(u8::MAX), Value::from(u8::MAX)); - - assert_eq!(Value::UInt16(u16::MIN), Value::from(u16::MIN)); - assert_eq!(Value::UInt16(u16::MAX), Value::from(u16::MAX)); - - assert_eq!(Value::UInt32(u32::MIN), Value::from(u32::MIN)); - assert_eq!(Value::UInt32(u32::MAX), Value::from(u32::MAX)); - - assert_eq!(Value::UInt64(u64::MIN), Value::from(u64::MIN)); - assert_eq!(Value::UInt64(u64::MAX), Value::from(u64::MAX)); - - assert_eq!(Value::Int8(i8::MIN), Value::from(i8::MIN)); - assert_eq!(Value::Int8(i8::MAX), Value::from(i8::MAX)); - - assert_eq!(Value::Int16(i16::MIN), Value::from(i16::MIN)); - assert_eq!(Value::Int16(i16::MAX), Value::from(i16::MAX)); - - assert_eq!(Value::Int32(i32::MIN), Value::from(i32::MIN)); - assert_eq!(Value::Int32(i32::MAX), Value::from(i32::MAX)); - - assert_eq!(Value::Int64(i64::MIN), Value::from(i64::MIN)); - assert_eq!(Value::Int64(i64::MAX), Value::from(i64::MAX)); - - assert_eq!( - Value::Float32(OrderedFloat(f32::MIN)), - Value::from(f32::MIN) - ); - assert_eq!( - Value::Float32(OrderedFloat(f32::MAX)), - Value::from(f32::MAX) - ); - - assert_eq!( - Value::Float64(OrderedFloat(f64::MIN)), - Value::from(f64::MIN) - ); - assert_eq!( - Value::Float64(OrderedFloat(f64::MAX)), - Value::from(f64::MAX) - ); - - let string_bytes = StringBytes::from("hello"); - assert_eq!( - Value::String(string_bytes.clone()), - Value::from(string_bytes) - ); - - let bytes = Bytes::from(b"world".as_slice()); - assert_eq!(Value::Binary(bytes.clone()), Value::from(bytes)); - } - - fn 
check_type_and_value(data_type: &ConcreteDataType, value: &Value) { - assert_eq!(*data_type, value.data_type()); - assert_eq!(data_type.logical_type_id(), value.logical_type_id()); - } - - #[test] - fn test_value_datatype() { - check_type_and_value(&ConcreteDataType::boolean_datatype(), &Value::Boolean(true)); - check_type_and_value(&ConcreteDataType::uint8_datatype(), &Value::UInt8(u8::MIN)); - check_type_and_value( - &ConcreteDataType::uint16_datatype(), - &Value::UInt16(u16::MIN), - ); - check_type_and_value( - &ConcreteDataType::uint16_datatype(), - &Value::UInt16(u16::MAX), - ); - check_type_and_value( - &ConcreteDataType::uint32_datatype(), - &Value::UInt32(u32::MIN), - ); - check_type_and_value( - &ConcreteDataType::uint64_datatype(), - &Value::UInt64(u64::MIN), - ); - check_type_and_value(&ConcreteDataType::int8_datatype(), &Value::Int8(i8::MIN)); - check_type_and_value(&ConcreteDataType::int16_datatype(), &Value::Int16(i16::MIN)); - check_type_and_value(&ConcreteDataType::int32_datatype(), &Value::Int32(i32::MIN)); - check_type_and_value(&ConcreteDataType::int64_datatype(), &Value::Int64(i64::MIN)); - check_type_and_value( - &ConcreteDataType::float32_datatype(), - &Value::Float32(OrderedFloat(f32::MIN)), - ); - check_type_and_value( - &ConcreteDataType::float64_datatype(), - &Value::Float64(OrderedFloat(f64::MIN)), - ); - check_type_and_value( - &ConcreteDataType::string_datatype(), - &Value::String(StringBytes::from("hello")), - ); - check_type_and_value( - &ConcreteDataType::binary_datatype(), - &Value::Binary(Bytes::from(b"world".as_slice())), - ); - check_type_and_value( - &ConcreteDataType::list_datatype(ConcreteDataType::int32_datatype()), - &Value::List(ListValue::new( - Some(Box::new(vec![Value::Int32(10)])), - ConcreteDataType::int32_datatype(), - )), - ); - check_type_and_value( - &ConcreteDataType::list_datatype(ConcreteDataType::null_datatype()), - &Value::List(ListValue::default()), - ); - check_type_and_value( - &ConcreteDataType::date_datatype(), - &Value::Date(Date::new(1)), - ); - check_type_and_value( - &ConcreteDataType::datetime_datatype(), - &Value::DateTime(DateTime::new(1)), - ); - check_type_and_value( - &ConcreteDataType::timestamp_millisecond_datatype(), - &Value::Timestamp(Timestamp::from_millis(1)), - ); - } - - #[test] - fn test_value_from_string() { - let hello = "hello".to_string(); - assert_eq!( - Value::String(StringBytes::from(hello.clone())), - Value::from(hello) - ); - - let world = "world"; - assert_eq!(Value::String(StringBytes::from(world)), Value::from(world)); - } - - #[test] - fn test_value_from_bytes() { - let hello = b"hello".to_vec(); - assert_eq!( - Value::Binary(Bytes::from(hello.clone())), - Value::from(hello) - ); - - let world: &[u8] = b"world"; - assert_eq!(Value::Binary(Bytes::from(world)), Value::from(world)); - } - - fn to_json(value: Value) -> serde_json::Value { - value.try_into().unwrap() - } - - #[test] - fn test_to_json_value() { - assert_eq!(serde_json::Value::Null, to_json(Value::Null)); - assert_eq!(serde_json::Value::Bool(true), to_json(Value::Boolean(true))); - assert_eq!( - serde_json::Value::Number(20u8.into()), - to_json(Value::UInt8(20)) - ); - assert_eq!( - serde_json::Value::Number(20i8.into()), - to_json(Value::Int8(20)) - ); - assert_eq!( - serde_json::Value::Number(2000u16.into()), - to_json(Value::UInt16(2000)) - ); - assert_eq!( - serde_json::Value::Number(2000i16.into()), - to_json(Value::Int16(2000)) - ); - assert_eq!( - serde_json::Value::Number(3000u32.into()), - to_json(Value::UInt32(3000)) - ); - 
assert_eq!( - serde_json::Value::Number(3000i32.into()), - to_json(Value::Int32(3000)) - ); - assert_eq!( - serde_json::Value::Number(4000u64.into()), - to_json(Value::UInt64(4000)) - ); - assert_eq!( - serde_json::Value::Number(4000i64.into()), - to_json(Value::Int64(4000)) - ); - assert_eq!( - serde_json::Value::from(125.0f32), - to_json(Value::Float32(125.0.into())) - ); - assert_eq!( - serde_json::Value::from(125.0f64), - to_json(Value::Float64(125.0.into())) - ); - assert_eq!( - serde_json::Value::String(String::from("hello")), - to_json(Value::String(StringBytes::from("hello"))) - ); - assert_eq!( - serde_json::Value::from(b"world".as_slice()), - to_json(Value::Binary(Bytes::from(b"world".as_slice()))) - ); - assert_eq!( - serde_json::Value::Number(5000i32.into()), - to_json(Value::Date(Date::new(5000))) - ); - assert_eq!( - serde_json::Value::Number(5000i64.into()), - to_json(Value::DateTime(DateTime::new(5000))) - ); - - assert_eq!( - serde_json::Value::Number(1.into()), - to_json(Value::Timestamp(Timestamp::from_millis(1))) - ); - - let json_value: serde_json::Value = - serde_json::from_str(r#"{"items":[{"Int32":123}],"datatype":{"Int32":{}}}"#).unwrap(); - assert_eq!( - json_value, - to_json(Value::List(ListValue { - items: Some(Box::new(vec![Value::Int32(123)])), - datatype: ConcreteDataType::int32_datatype(), - })) - ); - } - - #[test] - fn test_null_value() { - assert!(Value::Null.is_null()); - assert!(!Value::Boolean(true).is_null()); - assert!(Value::Null < Value::Boolean(false)); - assert!(Value::Boolean(true) > Value::Null); - assert!(Value::Null < Value::Int32(10)); - assert!(Value::Int32(10) > Value::Null); - } - - #[test] - fn test_null_value_ref() { - assert!(ValueRef::Null.is_null()); - assert!(!ValueRef::Boolean(true).is_null()); - assert!(ValueRef::Null < ValueRef::Boolean(false)); - assert!(ValueRef::Boolean(true) > ValueRef::Null); - assert!(ValueRef::Null < ValueRef::Int32(10)); - assert!(ValueRef::Int32(10) > ValueRef::Null); - } - - #[test] - fn test_as_value_ref() { - macro_rules! check_as_value_ref { - ($Variant: ident, $data: expr) => { - let value = Value::$Variant($data); - let value_ref = value.as_value_ref(); - let expect_ref = ValueRef::$Variant($data); - - assert_eq!(expect_ref, value_ref); - }; - } - - assert_eq!(ValueRef::Null, Value::Null.as_value_ref()); - check_as_value_ref!(Boolean, true); - check_as_value_ref!(UInt8, 123); - check_as_value_ref!(UInt16, 123); - check_as_value_ref!(UInt32, 123); - check_as_value_ref!(UInt64, 123); - check_as_value_ref!(Int8, -12); - check_as_value_ref!(Int16, -12); - check_as_value_ref!(Int32, -12); - check_as_value_ref!(Int64, -12); - check_as_value_ref!(Float32, OrderedF32::from(16.0)); - check_as_value_ref!(Float64, OrderedF64::from(16.0)); - check_as_value_ref!(Timestamp, Timestamp::from_millis(1)); - - assert_eq!( - ValueRef::String("hello"), - Value::String("hello".into()).as_value_ref() - ); - assert_eq!( - ValueRef::Binary(b"hello"), - Value::Binary("hello".as_bytes().into()).as_value_ref() - ); - - check_as_value_ref!(Date, Date::new(103)); - check_as_value_ref!(DateTime, DateTime::new(1034)); - - let list = ListValue { - items: None, - datatype: ConcreteDataType::int32_datatype(), - }; - assert_eq!( - ValueRef::List(ListValueRef::Ref { val: &list }), - Value::List(list.clone()).as_value_ref() - ); - } - - #[test] - fn test_value_ref_as() { - macro_rules! 
check_as_null { - ($method: ident) => { - assert_eq!(None, ValueRef::Null.$method().unwrap()); - }; - } - - check_as_null!(as_binary); - check_as_null!(as_string); - check_as_null!(as_boolean); - check_as_null!(as_date); - check_as_null!(as_datetime); - check_as_null!(as_list); - - macro_rules! check_as_correct { - ($data: expr, $Variant: ident, $method: ident) => { - assert_eq!(Some($data), ValueRef::$Variant($data).$method().unwrap()); - }; - } - - check_as_correct!("hello", String, as_string); - check_as_correct!("hello".as_bytes(), Binary, as_binary); - check_as_correct!(true, Boolean, as_boolean); - check_as_correct!(Date::new(123), Date, as_date); - check_as_correct!(DateTime::new(12), DateTime, as_datetime); - let list = ListValue { - items: None, - datatype: ConcreteDataType::int32_datatype(), - }; - check_as_correct!(ListValueRef::Ref { val: &list }, List, as_list); - - let wrong_value = ValueRef::Int32(12345); - assert!(wrong_value.as_binary().is_err()); - assert!(wrong_value.as_string().is_err()); - assert!(wrong_value.as_boolean().is_err()); - assert!(wrong_value.as_date().is_err()); - assert!(wrong_value.as_datetime().is_err()); - assert!(wrong_value.as_list().is_err()); - } - - #[test] - fn test_display() { - assert_eq!(Value::Null.to_string(), "Null"); - assert_eq!(Value::UInt8(8).to_string(), "8"); - assert_eq!(Value::UInt16(16).to_string(), "16"); - assert_eq!(Value::UInt32(32).to_string(), "32"); - assert_eq!(Value::UInt64(64).to_string(), "64"); - assert_eq!(Value::Int8(-8).to_string(), "-8"); - assert_eq!(Value::Int16(-16).to_string(), "-16"); - assert_eq!(Value::Int32(-32).to_string(), "-32"); - assert_eq!(Value::Int64(-64).to_string(), "-64"); - assert_eq!(Value::Float32((-32.123).into()).to_string(), "-32.123"); - assert_eq!(Value::Float64((-64.123).into()).to_string(), "-64.123"); - assert_eq!(Value::Float64(OrderedF64::infinity()).to_string(), "inf"); - assert_eq!(Value::Float64(OrderedF64::nan()).to_string(), "NaN"); - assert_eq!(Value::String(StringBytes::from("123")).to_string(), "123"); - assert_eq!( - Value::Binary(Bytes::from(vec![1, 2, 3])).to_string(), - "010203" - ); - assert_eq!(Value::Date(Date::new(0)).to_string(), "1970-01-01"); - assert_eq!( - Value::DateTime(DateTime::new(0)).to_string(), - "1970-01-01 00:00:00" - ); - assert_eq!( - Value::Timestamp(Timestamp::new(1000, TimeUnit::Millisecond)).to_string(), - "1970-01-01 00:00:01+0000" - ); - assert_eq!( - Value::List(ListValue::new( - Some(Box::new(vec![Value::Int8(1), Value::Int8(2)])), - ConcreteDataType::int8_datatype(), - )) - .to_string(), - "Int8[1, 2]" - ); - assert_eq!( - Value::List(ListValue::new( - Some(Box::new(vec![])), - ConcreteDataType::timestamp_second_datatype(), - )) - .to_string(), - "TimestampSecondType[]" - ); - assert_eq!( - Value::List(ListValue::new( - Some(Box::new(vec![])), - ConcreteDataType::timestamp_millisecond_datatype(), - )) - .to_string(), - "TimestampMillisecondType[]" - ); - assert_eq!( - Value::List(ListValue::new( - Some(Box::new(vec![])), - ConcreteDataType::timestamp_microsecond_datatype(), - )) - .to_string(), - "TimestampMicrosecondType[]" - ); - assert_eq!( - Value::List(ListValue::new( - Some(Box::new(vec![])), - ConcreteDataType::timestamp_nanosecond_datatype(), - )) - .to_string(), - "TimestampNanosecondType[]" - ); - } -} diff --git a/src/datatypes2/src/vectors.rs b/src/datatypes2/src/vectors.rs deleted file mode 100644 index 38fa762d4b3c..000000000000 --- a/src/datatypes2/src/vectors.rs +++ /dev/null @@ -1,309 +0,0 @@ -// Copyright 2022 Greptime Team 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::any::Any; -use std::fmt::Debug; -use std::sync::Arc; - -use arrow::array::{Array, ArrayRef}; -use snafu::ensure; - -use crate::data_type::ConcreteDataType; -use crate::error::{self, Result}; -use crate::serialize::Serializable; -use crate::value::{Value, ValueRef}; -use crate::vectors::operations::VectorOp; - -mod binary; -mod boolean; -mod constant; -mod date; -mod datetime; -mod eq; -mod helper; -mod list; -mod null; -mod operations; -mod primitive; -mod string; -mod timestamp; -mod validity; - -pub use binary::{BinaryVector, BinaryVectorBuilder}; -pub use boolean::{BooleanVector, BooleanVectorBuilder}; -pub use constant::ConstantVector; -pub use date::{DateVector, DateVectorBuilder}; -pub use datetime::{DateTimeVector, DateTimeVectorBuilder}; -pub use helper::Helper; -pub use list::{ListIter, ListVector, ListVectorBuilder}; -pub use null::{NullVector, NullVectorBuilder}; -pub use primitive::{ - Float32Vector, Float32VectorBuilder, Float64Vector, Float64VectorBuilder, Int16Vector, - Int16VectorBuilder, Int32Vector, Int32VectorBuilder, Int64Vector, Int64VectorBuilder, - Int8Vector, Int8VectorBuilder, PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder, - UInt16Vector, UInt16VectorBuilder, UInt32Vector, UInt32VectorBuilder, UInt64Vector, - UInt64VectorBuilder, UInt8Vector, UInt8VectorBuilder, -}; -pub use string::{StringVector, StringVectorBuilder}; -pub use timestamp::{ - TimestampMicrosecondVector, TimestampMicrosecondVectorBuilder, TimestampMillisecondVector, - TimestampMillisecondVectorBuilder, TimestampNanosecondVector, TimestampNanosecondVectorBuilder, - TimestampSecondVector, TimestampSecondVectorBuilder, -}; -pub use validity::Validity; - -// TODO(yingwen): arrow 28.0 implements Clone for all arrays, we could upgrade to it and simplify -// some codes in methods such as `to_arrow_array()` and `to_boxed_arrow_array()`. -/// Vector of data values. -pub trait Vector: Send + Sync + Serializable + Debug + VectorOp { - /// Returns the data type of the vector. - /// - /// This may require heap allocation. - fn data_type(&self) -> ConcreteDataType; - - fn vector_type_name(&self) -> String; - - /// Returns the vector as [Any](std::any::Any) so that it can be - /// downcast to a specific implementation. - fn as_any(&self) -> &dyn Any; - - /// Returns number of elements in the vector. - fn len(&self) -> usize; - - /// Returns whether the vector is empty. - fn is_empty(&self) -> bool { - self.len() == 0 - } - - /// Convert this vector to a new arrow [ArrayRef]. - fn to_arrow_array(&self) -> ArrayRef; - - /// Convert this vector to a new boxed arrow [Array]. - fn to_boxed_arrow_array(&self) -> Box; - - /// Returns the validity of the Array. - fn validity(&self) -> Validity; - - /// Returns the memory size of vector. - fn memory_size(&self) -> usize; - - /// The number of null slots on this [`Vector`]. - /// # Implementation - /// This is `O(1)`. 
- fn null_count(&self) -> usize; - - /// Returns true when it's a ConstantColumn - fn is_const(&self) -> bool { - false - } - - /// Returns whether row is null. - fn is_null(&self, row: usize) -> bool; - - /// If the only value vector can contain is NULL. - fn only_null(&self) -> bool { - self.null_count() == self.len() - } - - /// Slices the `Vector`, returning a new `VectorRef`. - /// - /// # Panics - /// This function panics if `offset + length > self.len()`. - fn slice(&self, offset: usize, length: usize) -> VectorRef; - - /// Returns the clone of value at `index`. - /// - /// # Panics - /// Panic if `index` is out of bound. - fn get(&self, index: usize) -> Value; - - /// Returns the clone of value at `index` or error if `index` - /// is out of bound. - fn try_get(&self, index: usize) -> Result { - ensure!( - index < self.len(), - error::BadArrayAccessSnafu { - index, - size: self.len() - } - ); - Ok(self.get(index)) - } - - /// Returns the reference of value at `index`. - /// - /// # Panics - /// Panic if `index` is out of bound. - fn get_ref(&self, index: usize) -> ValueRef; -} - -pub type VectorRef = Arc; - -/// Mutable vector that could be used to build an immutable vector. -pub trait MutableVector: Send + Sync { - /// Returns the data type of the vector. - fn data_type(&self) -> ConcreteDataType; - - /// Returns the length of the vector. - fn len(&self) -> usize; - - /// Returns whether the vector is empty. - fn is_empty(&self) -> bool { - self.len() == 0 - } - - /// Convert to Any, to enable dynamic casting. - fn as_any(&self) -> &dyn Any; - - /// Convert to mutable Any, to enable dynamic casting. - fn as_mut_any(&mut self) -> &mut dyn Any; - - /// Convert `self` to an (immutable) [VectorRef] and reset `self`. - fn to_vector(&mut self) -> VectorRef; - - /// Push value ref to this mutable vector. - /// - /// Returns error if data type unmatch. - fn push_value_ref(&mut self, value: ValueRef) -> Result<()>; - - /// Extend this mutable vector by slice of `vector`. - /// - /// Returns error if data type unmatch. - /// - /// # Panics - /// Panics if `offset + length > vector.len()`. - fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()>; -} - -/// Helper to define `try_from_arrow_array(array: arrow::array::ArrayRef)` function. -macro_rules! impl_try_from_arrow_array_for_vector { - ($Array: ident, $Vector: ident) => { - impl $Vector { - pub fn try_from_arrow_array( - array: impl AsRef, - ) -> crate::error::Result<$Vector> { - use snafu::OptionExt; - - let data = array - .as_ref() - .as_any() - .downcast_ref::<$Array>() - .with_context(|| crate::error::ConversionSnafu { - from: std::format!("{:?}", array.as_ref().data_type()), - })? - .data() - .clone(); - - let concrete_array = $Array::from(data); - Ok($Vector::from(concrete_array)) - } - } - }; -} - -macro_rules! impl_validity_for_vector { - ($array: expr) => { - Validity::from_array_data($array.data()) - }; -} - -macro_rules! impl_get_for_vector { - ($array: expr, $index: ident) => { - if $array.is_valid($index) { - // Safety: The index have been checked by `is_valid()`. - unsafe { $array.value_unchecked($index).into() } - } else { - Value::Null - } - }; -} - -macro_rules! impl_get_ref_for_vector { - ($array: expr, $index: ident) => { - if $array.is_valid($index) { - // Safety: The index have been checked by `is_valid()`. - unsafe { $array.value_unchecked($index).into() } - } else { - ValueRef::Null - } - }; -} - -macro_rules! 
impl_extend_for_builder { - ($mutable_vector: expr, $vector: ident, $VectorType: ident, $offset: ident, $length: ident) => {{ - use snafu::OptionExt; - - let sliced_vector = $vector.slice($offset, $length); - let concrete_vector = sliced_vector - .as_any() - .downcast_ref::<$VectorType>() - .with_context(|| crate::error::CastTypeSnafu { - msg: format!( - "Failed to cast vector from {} to {}", - $vector.vector_type_name(), - stringify!($VectorType) - ), - })?; - for value in concrete_vector.iter_data() { - $mutable_vector.push(value); - } - Ok(()) - }}; -} - -pub(crate) use { - impl_extend_for_builder, impl_get_for_vector, impl_get_ref_for_vector, - impl_try_from_arrow_array_for_vector, impl_validity_for_vector, -}; - -#[cfg(test)] -pub mod tests { - use arrow::array::{Array, Int32Array, UInt8Array}; - use serde_json; - - use super::*; - use crate::data_type::DataType; - use crate::types::{Int32Type, LogicalPrimitiveType}; - use crate::vectors::helper::Helper; - - #[test] - fn test_df_columns_to_vector() { - let df_column: Arc = Arc::new(Int32Array::from(vec![1, 2, 3])); - let vector = Helper::try_into_vector(df_column).unwrap(); - assert_eq!( - Int32Type::build_data_type().as_arrow_type(), - vector.data_type().as_arrow_type() - ); - } - - #[test] - fn test_serialize_i32_vector() { - let df_column: Arc = Arc::new(Int32Array::from(vec![1, 2, 3])); - let json_value = Helper::try_into_vector(df_column) - .unwrap() - .serialize_to_json() - .unwrap(); - assert_eq!("[1,2,3]", serde_json::to_string(&json_value).unwrap()); - } - - #[test] - fn test_serialize_i8_vector() { - let df_column: Arc = Arc::new(UInt8Array::from(vec![1, 2, 3])); - let json_value = Helper::try_into_vector(df_column) - .unwrap() - .serialize_to_json() - .unwrap(); - assert_eq!("[1,2,3]", serde_json::to_string(&json_value).unwrap()); - } -} diff --git a/src/datatypes2/src/vectors/binary.rs b/src/datatypes2/src/vectors/binary.rs deleted file mode 100644 index 3b5defc8ec6e..000000000000 --- a/src/datatypes2/src/vectors/binary.rs +++ /dev/null @@ -1,353 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::any::Any; -use std::sync::Arc; - -use arrow::array::{Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef}; -use snafu::ResultExt; - -use crate::arrow_array::{BinaryArray, MutableBinaryArray}; -use crate::data_type::ConcreteDataType; -use crate::error::{self, Result}; -use crate::scalars::{ScalarVector, ScalarVectorBuilder}; -use crate::serialize::Serializable; -use crate::value::{Value, ValueRef}; -use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef}; - -/// Vector of binary strings. 
-#[derive(Debug, PartialEq)] -pub struct BinaryVector { - array: BinaryArray, -} - -impl BinaryVector { - pub(crate) fn as_arrow(&self) -> &dyn Array { - &self.array - } - - fn to_array_data(&self) -> ArrayData { - self.array.data().clone() - } - - fn from_array_data(data: ArrayData) -> BinaryVector { - BinaryVector { - array: BinaryArray::from(data), - } - } -} - -impl From for BinaryVector { - fn from(array: BinaryArray) -> Self { - Self { array } - } -} - -impl From>>> for BinaryVector { - fn from(data: Vec>>) -> Self { - Self { - array: BinaryArray::from_iter(data), - } - } -} - -impl Vector for BinaryVector { - fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::binary_datatype() - } - - fn vector_type_name(&self) -> String { - "BinaryVector".to_string() - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn len(&self) -> usize { - self.array.len() - } - - fn to_arrow_array(&self) -> ArrayRef { - let data = self.to_array_data(); - Arc::new(BinaryArray::from(data)) - } - - fn to_boxed_arrow_array(&self) -> Box { - let data = self.to_array_data(); - Box::new(BinaryArray::from(data)) - } - - fn validity(&self) -> Validity { - vectors::impl_validity_for_vector!(self.array) - } - - fn memory_size(&self) -> usize { - self.array.get_buffer_memory_size() - } - - fn null_count(&self) -> usize { - self.array.null_count() - } - - fn is_null(&self, row: usize) -> bool { - self.array.is_null(row) - } - - fn slice(&self, offset: usize, length: usize) -> VectorRef { - let data = self.array.data().slice(offset, length); - Arc::new(Self::from_array_data(data)) - } - - fn get(&self, index: usize) -> Value { - vectors::impl_get_for_vector!(self.array, index) - } - - fn get_ref(&self, index: usize) -> ValueRef { - vectors::impl_get_ref_for_vector!(self.array, index) - } -} - -impl ScalarVector for BinaryVector { - type OwnedItem = Vec; - type RefItem<'a> = &'a [u8]; - type Iter<'a> = ArrayIter<&'a BinaryArray>; - type Builder = BinaryVectorBuilder; - - fn get_data(&self, idx: usize) -> Option> { - if self.array.is_valid(idx) { - Some(self.array.value(idx)) - } else { - None - } - } - - fn iter_data(&self) -> Self::Iter<'_> { - self.array.iter() - } -} - -pub struct BinaryVectorBuilder { - mutable_array: MutableBinaryArray, -} - -impl MutableVector for BinaryVectorBuilder { - fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::binary_datatype() - } - - fn len(&self) -> usize { - self.mutable_array.len() - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn as_mut_any(&mut self) -> &mut dyn Any { - self - } - - fn to_vector(&mut self) -> VectorRef { - Arc::new(self.finish()) - } - - fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { - match value.as_binary()? 
{ - Some(v) => self.mutable_array.append_value(v), - None => self.mutable_array.append_null(), - } - Ok(()) - } - - fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { - vectors::impl_extend_for_builder!(self, vector, BinaryVector, offset, length) - } -} - -impl ScalarVectorBuilder for BinaryVectorBuilder { - type VectorType = BinaryVector; - - fn with_capacity(capacity: usize) -> Self { - Self { - mutable_array: MutableBinaryArray::with_capacity(capacity, 0), - } - } - - fn push(&mut self, value: Option<::RefItem<'_>>) { - match value { - Some(v) => self.mutable_array.append_value(v), - None => self.mutable_array.append_null(), - } - } - - fn finish(&mut self) -> Self::VectorType { - BinaryVector { - array: self.mutable_array.finish(), - } - } -} - -impl Serializable for BinaryVector { - fn serialize_to_json(&self) -> Result> { - self.iter_data() - .map(|v| match v { - None => Ok(serde_json::Value::Null), // if binary vector not present, map to NULL - Some(vec) => serde_json::to_value(vec), - }) - .collect::>() - .context(error::SerializeSnafu) - } -} - -vectors::impl_try_from_arrow_array_for_vector!(BinaryArray, BinaryVector); - -#[cfg(test)] -mod tests { - use arrow::datatypes::DataType as ArrowDataType; - use common_base::bytes::Bytes; - use serde_json; - - use super::*; - use crate::arrow_array::BinaryArray; - use crate::data_type::DataType; - use crate::serialize::Serializable; - use crate::types::BinaryType; - - #[test] - fn test_binary_vector_misc() { - let v = BinaryVector::from(BinaryArray::from_iter_values(&[ - vec![1, 2, 3], - vec![1, 2, 3], - ])); - - assert_eq!(2, v.len()); - assert_eq!("BinaryVector", v.vector_type_name()); - assert!(!v.is_const()); - assert!(v.validity().is_all_valid()); - assert!(!v.only_null()); - assert_eq!(128, v.memory_size()); - - for i in 0..2 { - assert!(!v.is_null(i)); - assert_eq!(Value::Binary(Bytes::from(vec![1, 2, 3])), v.get(i)); - assert_eq!(ValueRef::Binary(&[1, 2, 3]), v.get_ref(i)); - } - - let arrow_arr = v.to_arrow_array(); - assert_eq!(2, arrow_arr.len()); - assert_eq!(&ArrowDataType::LargeBinary, arrow_arr.data_type()); - } - - #[test] - fn test_serialize_binary_vector_to_json() { - let vector = BinaryVector::from(BinaryArray::from_iter_values(&[ - vec![1, 2, 3], - vec![1, 2, 3], - ])); - - let json_value = vector.serialize_to_json().unwrap(); - assert_eq!( - "[[1,2,3],[1,2,3]]", - serde_json::to_string(&json_value).unwrap() - ); - } - - #[test] - fn test_serialize_binary_vector_with_null_to_json() { - let mut builder = BinaryVectorBuilder::with_capacity(4); - builder.push(Some(&[1, 2, 3])); - builder.push(None); - builder.push(Some(&[4, 5, 6])); - let vector = builder.finish(); - - let json_value = vector.serialize_to_json().unwrap(); - assert_eq!( - "[[1,2,3],null,[4,5,6]]", - serde_json::to_string(&json_value).unwrap() - ); - } - - #[test] - fn test_from_arrow_array() { - let arrow_array = BinaryArray::from_iter_values(&[vec![1, 2, 3], vec![1, 2, 3]]); - let original = BinaryArray::from(arrow_array.data().clone()); - let vector = BinaryVector::from(arrow_array); - assert_eq!(original, vector.array); - } - - #[test] - fn test_binary_vector_build_get() { - let mut builder = BinaryVectorBuilder::with_capacity(4); - builder.push(Some(b"hello")); - builder.push(Some(b"happy")); - builder.push(Some(b"world")); - builder.push(None); - - let vector = builder.finish(); - assert_eq!(b"hello", vector.get_data(0).unwrap()); - assert_eq!(None, vector.get_data(3)); - - 
assert_eq!(Value::Binary(b"hello".as_slice().into()), vector.get(0)); - assert_eq!(Value::Null, vector.get(3)); - - let mut iter = vector.iter_data(); - assert_eq!(b"hello", iter.next().unwrap().unwrap()); - assert_eq!(b"happy", iter.next().unwrap().unwrap()); - assert_eq!(b"world", iter.next().unwrap().unwrap()); - assert_eq!(None, iter.next().unwrap()); - assert_eq!(None, iter.next()); - } - - #[test] - fn test_binary_vector_validity() { - let mut builder = BinaryVectorBuilder::with_capacity(4); - builder.push(Some(b"hello")); - builder.push(Some(b"world")); - let vector = builder.finish(); - assert_eq!(0, vector.null_count()); - assert!(vector.validity().is_all_valid()); - - let mut builder = BinaryVectorBuilder::with_capacity(3); - builder.push(Some(b"hello")); - builder.push(None); - builder.push(Some(b"world")); - let vector = builder.finish(); - assert_eq!(1, vector.null_count()); - let validity = vector.validity(); - assert!(!validity.is_set(1)); - - assert_eq!(1, validity.null_count()); - assert!(!validity.is_set(1)); - } - - #[test] - fn test_binary_vector_builder() { - let input = BinaryVector::from_slice(&[b"world", b"one", b"two"]); - - let mut builder = BinaryType::default().create_mutable_vector(3); - builder - .push_value_ref(ValueRef::Binary("hello".as_bytes())) - .unwrap(); - assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err()); - builder.extend_slice_of(&input, 1, 2).unwrap(); - assert!(builder - .extend_slice_of(&crate::vectors::Int32Vector::from_slice(&[13]), 0, 1) - .is_err()); - let vector = builder.to_vector(); - - let expect: VectorRef = Arc::new(BinaryVector::from_slice(&[b"hello", b"one", b"two"])); - assert_eq!(expect, vector); - } -} diff --git a/src/datatypes2/src/vectors/boolean.rs b/src/datatypes2/src/vectors/boolean.rs deleted file mode 100644 index 2b4e5b8e10d9..000000000000 --- a/src/datatypes2/src/vectors/boolean.rs +++ /dev/null @@ -1,371 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::any::Any; -use std::borrow::Borrow; -use std::sync::Arc; - -use arrow::array::{ - Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef, BooleanArray, BooleanBuilder, -}; -use snafu::ResultExt; - -use crate::data_type::ConcreteDataType; -use crate::error::Result; -use crate::scalars::{ScalarVector, ScalarVectorBuilder}; -use crate::serialize::Serializable; -use crate::value::{Value, ValueRef}; -use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef}; - -/// Vector of boolean. 
-#[derive(Debug, PartialEq)]
-pub struct BooleanVector {
-    array: BooleanArray,
-}
-
-impl BooleanVector {
-    pub(crate) fn as_arrow(&self) -> &dyn Array {
-        &self.array
-    }
-
-    pub(crate) fn as_boolean_array(&self) -> &BooleanArray {
-        &self.array
-    }
-
-    fn to_array_data(&self) -> ArrayData {
-        self.array.data().clone()
-    }
-
-    fn from_array_data(data: ArrayData) -> BooleanVector {
-        BooleanVector {
-            array: BooleanArray::from(data),
-        }
-    }
-
-    pub(crate) fn false_count(&self) -> usize {
-        self.array.false_count()
-    }
-}
-
-impl From<Vec<bool>> for BooleanVector {
-    fn from(data: Vec<bool>) -> Self {
-        BooleanVector {
-            array: BooleanArray::from(data),
-        }
-    }
-}
-
-impl From<BooleanArray> for BooleanVector {
-    fn from(array: BooleanArray) -> Self {
-        Self { array }
-    }
-}
-
-impl From<Vec<Option<bool>>> for BooleanVector {
-    fn from(data: Vec<Option<bool>>) -> Self {
-        BooleanVector {
-            array: BooleanArray::from(data),
-        }
-    }
-}
-
-impl<Ptr: Borrow<Option<bool>>> FromIterator<Ptr> for BooleanVector {
-    fn from_iter<I: IntoIterator<Item = Ptr>>(iter: I) -> Self {
-        BooleanVector {
-            array: BooleanArray::from_iter(iter),
-        }
-    }
-}
-
-impl Vector for BooleanVector {
-    fn data_type(&self) -> ConcreteDataType {
-        ConcreteDataType::boolean_datatype()
-    }
-
-    fn vector_type_name(&self) -> String {
-        "BooleanVector".to_string()
-    }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn len(&self) -> usize {
-        self.array.len()
-    }
-
-    fn to_arrow_array(&self) -> ArrayRef {
-        let data = self.to_array_data();
-        Arc::new(BooleanArray::from(data))
-    }
-
-    fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
-        let data = self.to_array_data();
-        Box::new(BooleanArray::from(data))
-    }
-
-    fn validity(&self) -> Validity {
-        vectors::impl_validity_for_vector!(self.array)
-    }
-
-    fn memory_size(&self) -> usize {
-        self.array.get_buffer_memory_size()
-    }
-
-    fn null_count(&self) -> usize {
-        self.array.null_count()
-    }
-
-    fn is_null(&self, row: usize) -> bool {
-        self.array.is_null(row)
-    }
-
-    fn slice(&self, offset: usize, length: usize) -> VectorRef {
-        let data = self.array.data().slice(offset, length);
-        Arc::new(Self::from_array_data(data))
-    }
-
-    fn get(&self, index: usize) -> Value {
-        vectors::impl_get_for_vector!(self.array, index)
-    }
-
-    fn get_ref(&self, index: usize) -> ValueRef {
-        vectors::impl_get_ref_for_vector!(self.array, index)
-    }
-}
-
-impl ScalarVector for BooleanVector {
-    type OwnedItem = bool;
-    type RefItem<'a> = bool;
-    type Iter<'a> = ArrayIter<&'a BooleanArray>;
-    type Builder = BooleanVectorBuilder;
-
-    fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>> {
-        if self.array.is_valid(idx) {
-            Some(self.array.value(idx))
-        } else {
-            None
-        }
-    }
-
-    fn iter_data(&self) -> Self::Iter<'_> {
-        self.array.iter()
-    }
-}
-
-pub struct BooleanVectorBuilder {
-    mutable_array: BooleanBuilder,
-}
-
-impl MutableVector for BooleanVectorBuilder {
-    fn data_type(&self) -> ConcreteDataType {
-        ConcreteDataType::boolean_datatype()
-    }
-
-    fn len(&self) -> usize {
-        self.mutable_array.len()
-    }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn as_mut_any(&mut self) -> &mut dyn Any {
-        self
-    }
-
-    fn to_vector(&mut self) -> VectorRef {
-        Arc::new(self.finish())
-    }
-
-    fn push_value_ref(&mut self, value: ValueRef) -> Result<()> {
-        match value.as_boolean()? {
-            Some(v) => self.mutable_array.append_value(v),
-            None => self.mutable_array.append_null(),
-        }
-        Ok(())
-    }
-
-    fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> {
-        vectors::impl_extend_for_builder!(self, vector, BooleanVector, offset, length)
-    }
-}
-
-impl ScalarVectorBuilder for BooleanVectorBuilder {
-    type VectorType = BooleanVector;
-
-    fn with_capacity(capacity: usize) -> Self {
-        Self {
-            mutable_array: BooleanBuilder::with_capacity(capacity),
-        }
-    }
-
-    fn push(&mut self, value: Option<<Self::VectorType as ScalarVector>::RefItem<'_>>) {
-        match value {
-            Some(v) => self.mutable_array.append_value(v),
-            None => self.mutable_array.append_null(),
-        }
-    }
-
-    fn finish(&mut self) -> Self::VectorType {
-        BooleanVector {
-            array: self.mutable_array.finish(),
-        }
-    }
-}
-
-impl Serializable for BooleanVector {
-    fn serialize_to_json(&self) -> Result<Vec<serde_json::Value>> {
-        self.iter_data()
-            .map(serde_json::to_value)
-            .collect::<serde_json::Result<_>>()
-            .context(crate::error::SerializeSnafu)
-    }
-}
-
-vectors::impl_try_from_arrow_array_for_vector!(BooleanArray, BooleanVector);
-
-#[cfg(test)]
-mod tests {
-    use arrow::datatypes::DataType as ArrowDataType;
-    use serde_json;
-
-    use super::*;
-    use crate::data_type::DataType;
-    use crate::serialize::Serializable;
-    use crate::types::BooleanType;
-
-    #[test]
-    fn test_boolean_vector_misc() {
-        let bools = vec![true, false, true, true, false, false, true, true, false];
-        let v = BooleanVector::from(bools.clone());
-        assert_eq!(9, v.len());
-        assert_eq!("BooleanVector", v.vector_type_name());
-        assert!(!v.is_const());
-        assert!(v.validity().is_all_valid());
-        assert!(!v.only_null());
-        assert_eq!(64, v.memory_size());
-
-        for (i, b) in bools.iter().enumerate() {
-            assert!(!v.is_null(i));
-            assert_eq!(Value::Boolean(*b), v.get(i));
-            assert_eq!(ValueRef::Boolean(*b), v.get_ref(i));
-        }
-
-        let arrow_arr = v.to_arrow_array();
-        assert_eq!(9, arrow_arr.len());
-        assert_eq!(&ArrowDataType::Boolean, arrow_arr.data_type());
-    }
-
-    #[test]
-    fn test_serialize_boolean_vector_to_json() {
-        let vector = BooleanVector::from(vec![true, false, true, true, false, false]);
-
-        let json_value = vector.serialize_to_json().unwrap();
-        assert_eq!(
-            "[true,false,true,true,false,false]",
-            serde_json::to_string(&json_value).unwrap(),
-        );
-    }
-
-    #[test]
-    fn test_serialize_boolean_vector_with_null_to_json() {
-        let vector = BooleanVector::from(vec![Some(true), None, Some(false)]);
-
-        let json_value = vector.serialize_to_json().unwrap();
-        assert_eq!(
-            "[true,null,false]",
-            serde_json::to_string(&json_value).unwrap(),
-        );
-    }
-
-    #[test]
-    fn test_boolean_vector_from_vec() {
-        let input = vec![false, true, false, true];
-        let vec = BooleanVector::from(input.clone());
-        assert_eq!(4, vec.len());
-        for (i, v) in input.into_iter().enumerate() {
-            assert_eq!(Some(v), vec.get_data(i), "failed at {}", i)
-        }
-    }
-
-    #[test]
-    fn test_boolean_vector_from_iter() {
-        let input = vec![Some(false), Some(true), Some(false), Some(true)];
-        let vec = input.iter().collect::<BooleanVector>();
-        assert_eq!(4, vec.len());
-        for (i, v) in input.into_iter().enumerate() {
-            assert_eq!(v, vec.get_data(i), "failed at {}", i)
-        }
-    }
-
-    #[test]
-    fn test_boolean_vector_from_vec_option() {
-        let input = vec![Some(false), Some(true), None, Some(true)];
-        let vec = BooleanVector::from(input.clone());
-        assert_eq!(4, vec.len());
-        for (i, v) in input.into_iter().enumerate() {
-            assert_eq!(v, vec.get_data(i), "failed at {}", i)
-        }
-    }
-
-    #[test]
-    fn test_boolean_vector_build_get() {
-        let input = [Some(true), None, Some(false)];
-        let mut builder = BooleanVectorBuilder::with_capacity(3);
-        for v in input {
-            builder.push(v);
-        }
-        let vector = builder.finish();
-        assert_eq!(input.len(), vector.len());
-
-        let res: Vec<_> = vector.iter_data().collect();
-        assert_eq!(input, &res[..]);
-
-        for (i, v) in input.into_iter().enumerate() {
-            assert_eq!(v, vector.get_data(i));
-            assert_eq!(Value::from(v), vector.get(i));
-        }
-    }
-
-    #[test]
-    fn test_boolean_vector_validity() {
-        let vector = BooleanVector::from(vec![Some(true), None, Some(false)]);
-        assert_eq!(1, vector.null_count());
-        let validity = vector.validity();
-        assert_eq!(1, validity.null_count());
-        assert!(!validity.is_set(1));
-
-        let vector = BooleanVector::from(vec![true, false, false]);
-        assert_eq!(0, vector.null_count());
-        assert!(vector.validity().is_all_valid());
-    }
-
-    #[test]
-    fn test_boolean_vector_builder() {
-        let input = BooleanVector::from_slice(&[true, false, true]);
-
-        let mut builder = BooleanType::default().create_mutable_vector(3);
-        builder.push_value_ref(ValueRef::Boolean(true)).unwrap();
-        assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err());
-        builder.extend_slice_of(&input, 1, 2).unwrap();
-        assert!(builder
-            .extend_slice_of(&crate::vectors::Int32Vector::from_slice(&[13]), 0, 1)
-            .is_err());
-        let vector = builder.to_vector();
-
-        let expect: VectorRef = Arc::new(BooleanVector::from_slice(&[true, false, true]));
-        assert_eq!(expect, vector);
-    }
-}
diff --git a/src/datatypes2/src/vectors/constant.rs b/src/datatypes2/src/vectors/constant.rs
deleted file mode 100644
index 87739e91318b..000000000000
--- a/src/datatypes2/src/vectors/constant.rs
+++ /dev/null
@@ -1,218 +0,0 @@
-// Copyright 2022 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::any::Any;
-use std::fmt;
-use std::sync::Arc;
-
-use arrow::array::{Array, ArrayRef};
-use snafu::ResultExt;
-
-use crate::data_type::ConcreteDataType;
-use crate::error::{Result, SerializeSnafu};
-use crate::serialize::Serializable;
-use crate::value::{Value, ValueRef};
-use crate::vectors::{BooleanVector, Helper, Validity, Vector, VectorRef};
-
-#[derive(Clone)]
-pub struct ConstantVector {
-    length: usize,
-    vector: VectorRef,
-}
-
-impl ConstantVector {
-    /// Create a new [ConstantVector].
-    ///
-    /// # Panics
-    /// Panics if `vector.len() != 1`.
-    pub fn new(vector: VectorRef, length: usize) -> Self {
-        assert_eq!(1, vector.len());
-
-        // Avoid const recursion.
-        if vector.is_const() {
-            let vec: &ConstantVector = unsafe { Helper::static_cast(&vector) };
-            return Self::new(vec.inner().clone(), length);
-        }
-        Self { vector, length }
-    }
-
-    pub fn inner(&self) -> &VectorRef {
-        &self.vector
-    }
-
-    /// Returns the constant value.
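-    ///
-    /// Illustrative note (added commentary, not in the original docs): the constant
-    /// is held in a length-1 inner vector, so this just returns `self.vector.get_ref(0)`,
-    /// e.g. `ValueRef::Int32(1)` for `ConstantVector::new(Arc::new(Int32Vector::from_slice(vec![1])), 10)`.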
-    pub fn get_constant_ref(&self) -> ValueRef {
-        self.vector.get_ref(0)
-    }
-
-    pub(crate) fn replicate_vector(&self, offsets: &[usize]) -> VectorRef {
-        assert_eq!(offsets.len(), self.len());
-
-        if offsets.is_empty() {
-            return self.slice(0, 0);
-        }
-
-        Arc::new(ConstantVector::new(
-            self.vector.clone(),
-            *offsets.last().unwrap(),
-        ))
-    }
-
-    pub(crate) fn filter_vector(&self, filter: &BooleanVector) -> Result<VectorRef> {
-        let length = self.len() - filter.false_count();
-        if length == self.len() {
-            return Ok(Arc::new(self.clone()));
-        }
-        Ok(Arc::new(ConstantVector::new(self.inner().clone(), length)))
-    }
-}
-
-impl Vector for ConstantVector {
-    fn data_type(&self) -> ConcreteDataType {
-        self.vector.data_type()
-    }
-
-    fn vector_type_name(&self) -> String {
-        "ConstantVector".to_string()
-    }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn len(&self) -> usize {
-        self.length
-    }
-
-    fn to_arrow_array(&self) -> ArrayRef {
-        let v = self.vector.replicate(&[self.length]);
-        v.to_arrow_array()
-    }
-
-    fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
-        let v = self.vector.replicate(&[self.length]);
-        v.to_boxed_arrow_array()
-    }
-
-    fn is_const(&self) -> bool {
-        true
-    }
-
-    fn validity(&self) -> Validity {
-        if self.vector.is_null(0) {
-            Validity::all_null(self.length)
-        } else {
-            Validity::all_valid(self.length)
-        }
-    }
-
-    fn memory_size(&self) -> usize {
-        self.vector.memory_size()
-    }
-
-    fn is_null(&self, _row: usize) -> bool {
-        self.vector.is_null(0)
-    }
-
-    fn only_null(&self) -> bool {
-        self.vector.is_null(0)
-    }
-
-    fn slice(&self, _offset: usize, length: usize) -> VectorRef {
-        Arc::new(Self {
-            vector: self.vector.clone(),
-            length,
-        })
-    }
-
-    fn get(&self, _index: usize) -> Value {
-        self.vector.get(0)
-    }
-
-    fn get_ref(&self, _index: usize) -> ValueRef {
-        self.vector.get_ref(0)
-    }
-
-    fn null_count(&self) -> usize {
-        if self.only_null() {
-            self.len()
-        } else {
-            0
-        }
-    }
-}
-
-impl fmt::Debug for ConstantVector {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "ConstantVector([{:?}; {}])", self.get(0), self.len())
-    }
-}
-
-impl Serializable for ConstantVector {
-    fn serialize_to_json(&self) -> Result<Vec<serde_json::Value>> {
-        std::iter::repeat(self.get(0))
-            .take(self.len())
-            .map(serde_json::Value::try_from)
-            .collect::<serde_json::Result<_>>()
-            .context(SerializeSnafu)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use arrow::datatypes::DataType as ArrowDataType;
-
-    use super::*;
-    use crate::vectors::Int32Vector;
-
-    #[test]
-    fn test_constant_vector_misc() {
-        let a = Int32Vector::from_slice(vec![1]);
-        let c = ConstantVector::new(Arc::new(a), 10);
-
-        assert_eq!("ConstantVector", c.vector_type_name());
-        assert!(c.is_const());
-        assert_eq!(10, c.len());
-        assert!(c.validity().is_all_valid());
-        assert!(!c.only_null());
-        assert_eq!(64, c.memory_size());
-
-        for i in 0..10 {
-            assert!(!c.is_null(i));
-            assert_eq!(Value::Int32(1), c.get(i));
-        }
-
-        let arrow_arr = c.to_arrow_array();
-        assert_eq!(10, arrow_arr.len());
-        assert_eq!(&ArrowDataType::Int32, arrow_arr.data_type());
-    }
-
-    #[test]
-    fn test_debug_null_array() {
-        let a = Int32Vector::from_slice(vec![1]);
-        let c = ConstantVector::new(Arc::new(a), 10);
-
-        let s = format!("{:?}", c);
-        assert_eq!(s, "ConstantVector([Int32(1); 10])");
-    }
-
-    #[test]
-    fn test_serialize_json() {
-        let a = Int32Vector::from_slice(vec![1]);
-        let c = ConstantVector::new(Arc::new(a), 10);
-
-        let s = serde_json::to_string(&c.serialize_to_json().unwrap()).unwrap();
-        assert_eq!(s, "[1,1,1,1,1,1,1,1,1,1]");
-    }
-}
diff --git a/src/datatypes2/src/vectors/date.rs b/src/datatypes2/src/vectors/date.rs
deleted file mode 100644
index d0a66b80fb63..000000000000
--- a/src/datatypes2/src/vectors/date.rs
+++ /dev/null
@@ -1,103 +0,0 @@
-// Copyright 2022 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use crate::types::DateType;
-use crate::vectors::{PrimitiveVector, PrimitiveVectorBuilder};
-
-// Vector for [`Date`](common_time::Date).
-pub type DateVector = PrimitiveVector<DateType>;
-// Builder to build DateVector.
-pub type DateVectorBuilder = PrimitiveVectorBuilder<DateType>;
-
-#[cfg(test)]
-mod tests {
-    use std::sync::Arc;
-
-    use arrow::array::Array;
-    use common_time::date::Date;
-
-    use super::*;
-    use crate::data_type::DataType;
-    use crate::scalars::{ScalarVector, ScalarVectorBuilder};
-    use crate::serialize::Serializable;
-    use crate::types::DateType;
-    use crate::value::{Value, ValueRef};
-    use crate::vectors::{Vector, VectorRef};
-
-    #[test]
-    fn test_build_date_vector() {
-        let mut builder = DateVectorBuilder::with_capacity(4);
-        builder.push(Some(Date::new(1)));
-        builder.push(None);
-        builder.push(Some(Date::new(-1)));
-        let vector = builder.finish();
-        assert_eq!(3, vector.len());
-        assert_eq!(Value::Date(Date::new(1)), vector.get(0));
-        assert_eq!(ValueRef::Date(Date::new(1)), vector.get_ref(0));
-        assert_eq!(Some(Date::new(1)), vector.get_data(0));
-        assert_eq!(None, vector.get_data(1));
-        assert_eq!(Value::Null, vector.get(1));
-        assert_eq!(ValueRef::Null, vector.get_ref(1));
-        assert_eq!(Some(Date::new(-1)), vector.get_data(2));
-        let mut iter = vector.iter_data();
-        assert_eq!(Some(Date::new(1)), iter.next().unwrap());
-        assert_eq!(None, iter.next().unwrap());
-        assert_eq!(Some(Date::new(-1)), iter.next().unwrap());
-    }
-
-    #[test]
-    fn test_date_scalar() {
-        let vector = DateVector::from_slice(&[1, 2]);
-        assert_eq!(2, vector.len());
-        assert_eq!(Some(Date::new(1)), vector.get_data(0));
-        assert_eq!(Some(Date::new(2)), vector.get_data(1));
-    }
-
-    #[test]
-    fn test_date_vector_builder() {
-        let input = DateVector::from_slice(&[1, 2, 3]);
-
-        let mut builder = DateType::default().create_mutable_vector(3);
-        builder
-            .push_value_ref(ValueRef::Date(Date::new(5)))
-            .unwrap();
-        assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err());
-        builder.extend_slice_of(&input, 1, 2).unwrap();
-        assert!(builder
-            .extend_slice_of(&crate::vectors::Int32Vector::from_slice(&[13]), 0, 1)
-            .is_err());
-        let vector = builder.to_vector();
-
-        let expect: VectorRef = Arc::new(DateVector::from_slice(&[5, 2, 3]));
-        assert_eq!(expect, vector);
-    }
-
-    #[test]
-    fn test_date_from_arrow() {
-        let vector = DateVector::from_slice(&[1, 2]);
-        let arrow = vector.as_arrow().slice(0, vector.len());
-        let vector2 = DateVector::try_from_arrow_array(&arrow).unwrap();
-        assert_eq!(vector, vector2);
-    }
-
-    #[test]
-    fn test_serialize_date_vector() {
-        let vector = DateVector::from_slice(&[-1, 0, 1]);
-        let serialized_json = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap();
-        assert_eq!(
r#"["1969-12-31","1970-01-01","1970-01-02"]"#, - serialized_json - ); - } -} diff --git a/src/datatypes2/src/vectors/datetime.rs b/src/datatypes2/src/vectors/datetime.rs deleted file mode 100644 index a40a3e54d330..000000000000 --- a/src/datatypes2/src/vectors/datetime.rs +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use crate::types::DateTimeType; -use crate::vectors::{PrimitiveVector, PrimitiveVectorBuilder}; - -/// Vector of [`DateTime`](common_time::Date) -pub type DateTimeVector = PrimitiveVector; -/// Builder for [`DateTimeVector`]. -pub type DateTimeVectorBuilder = PrimitiveVectorBuilder; - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use arrow::array::{Array, PrimitiveArray}; - use common_time::DateTime; - use datafusion_common::from_slice::FromSlice; - - use super::*; - use crate::data_type::DataType; - use crate::prelude::{ - ConcreteDataType, ScalarVector, ScalarVectorBuilder, Value, ValueRef, Vector, VectorRef, - }; - use crate::serialize::Serializable; - - #[test] - fn test_datetime_vector() { - let v = DateTimeVector::new(PrimitiveArray::from_slice(&[1, 2, 3])); - assert_eq!(ConcreteDataType::datetime_datatype(), v.data_type()); - assert_eq!(3, v.len()); - assert_eq!("DateTimeVector", v.vector_type_name()); - assert_eq!( - &arrow::datatypes::DataType::Date64, - v.to_arrow_array().data_type() - ); - - assert_eq!(Some(DateTime::new(1)), v.get_data(0)); - assert_eq!(Value::DateTime(DateTime::new(1)), v.get(0)); - assert_eq!(ValueRef::DateTime(DateTime::new(1)), v.get_ref(0)); - - let mut iter = v.iter_data(); - assert_eq!(Some(DateTime::new(1)), iter.next().unwrap()); - assert_eq!(Some(DateTime::new(2)), iter.next().unwrap()); - assert_eq!(Some(DateTime::new(3)), iter.next().unwrap()); - assert!(!v.is_null(0)); - assert_eq!(64, v.memory_size()); - - if let Value::DateTime(d) = v.get(0) { - assert_eq!(1, d.val()); - } else { - unreachable!() - } - assert_eq!( - "[\"1970-01-01 00:00:01\",\"1970-01-01 00:00:02\",\"1970-01-01 00:00:03\"]", - serde_json::to_string(&v.serialize_to_json().unwrap()).unwrap() - ); - } - - #[test] - fn test_datetime_vector_builder() { - let mut builder = DateTimeVectorBuilder::with_capacity(3); - builder.push(Some(DateTime::new(1))); - builder.push(None); - builder.push(Some(DateTime::new(-1))); - - let v = builder.finish(); - assert_eq!(ConcreteDataType::datetime_datatype(), v.data_type()); - assert_eq!(Value::DateTime(DateTime::new(1)), v.get(0)); - assert_eq!(Value::Null, v.get(1)); - assert_eq!(Value::DateTime(DateTime::new(-1)), v.get(2)); - - let input = DateTimeVector::from_wrapper_slice(&[ - DateTime::new(1), - DateTime::new(2), - DateTime::new(3), - ]); - - let mut builder = DateTimeType::default().create_mutable_vector(3); - builder - .push_value_ref(ValueRef::DateTime(DateTime::new(5))) - .unwrap(); - assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err()); - builder.extend_slice_of(&input, 1, 2).unwrap(); - assert!(builder - 
-            .extend_slice_of(&crate::vectors::Int32Vector::from_slice(&[13]), 0, 1)
-            .is_err());
-        let vector = builder.to_vector();
-
-        let expect: VectorRef = Arc::new(DateTimeVector::from_wrapper_slice(&[
-            DateTime::new(5),
-            DateTime::new(2),
-            DateTime::new(3),
-        ]));
-        assert_eq!(expect, vector);
-    }
-
-    #[test]
-    fn test_datetime_from_arrow() {
-        let vector = DateTimeVector::from_wrapper_slice(&[DateTime::new(1), DateTime::new(2)]);
-        let arrow = vector.as_arrow().slice(0, vector.len());
-        let vector2 = DateTimeVector::try_from_arrow_array(&arrow).unwrap();
-        assert_eq!(vector, vector2);
-    }
-}
diff --git a/src/datatypes2/src/vectors/eq.rs b/src/datatypes2/src/vectors/eq.rs
deleted file mode 100644
index 55359026d479..000000000000
--- a/src/datatypes2/src/vectors/eq.rs
+++ /dev/null
@@ -1,228 +0,0 @@
-// Copyright 2022 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::sync::Arc;
-
-use crate::data_type::DataType;
-use crate::types::TimestampType;
-use crate::vectors::constant::ConstantVector;
-use crate::vectors::{
-    BinaryVector, BooleanVector, DateTimeVector, DateVector, ListVector, PrimitiveVector,
-    StringVector, TimestampMicrosecondVector, TimestampMillisecondVector,
-    TimestampNanosecondVector, TimestampSecondVector, Vector,
-};
-use crate::with_match_primitive_type_id;
-
-impl Eq for dyn Vector + '_ {}
-
-impl PartialEq for dyn Vector + '_ {
-    fn eq(&self, other: &dyn Vector) -> bool {
-        equal(self, other)
-    }
-}
-
-impl PartialEq<dyn Vector> for Arc<dyn Vector + '_> {
-    fn eq(&self, other: &dyn Vector) -> bool {
-        equal(&**self, other)
-    }
-}
-
-macro_rules! is_vector_eq {
-    ($VectorType: ident, $lhs: ident, $rhs: ident) => {{
-        let lhs = $lhs.as_any().downcast_ref::<$VectorType>().unwrap();
-        let rhs = $rhs.as_any().downcast_ref::<$VectorType>().unwrap();
-
-        lhs == rhs
-    }};
-}
-
-fn equal(lhs: &dyn Vector, rhs: &dyn Vector) -> bool {
-    if lhs.data_type() != rhs.data_type() || lhs.len() != rhs.len() {
-        return false;
-    }
-
-    if lhs.is_const() || rhs.is_const() {
-        // Length has been checked before, so we only need to compare inner
-        // vector here.
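-        // Illustrative note (added commentary): e.g. two constant `true` vectors of
-        // length 5 reduce to comparing their length-1 inner BooleanVectors, since the
-        // data types and lengths already matched above.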
-        return equal(
-            &**lhs
-                .as_any()
-                .downcast_ref::<ConstantVector>()
-                .unwrap()
-                .inner(),
-            &**rhs
-                .as_any()
-                .downcast_ref::<ConstantVector>()
-                .unwrap()
-                .inner(),
-        );
-    }
-
-    use crate::data_type::ConcreteDataType::*;
-
-    let lhs_type = lhs.data_type();
-    match lhs.data_type() {
-        Null(_) => true,
-        Boolean(_) => is_vector_eq!(BooleanVector, lhs, rhs),
-        Binary(_) => is_vector_eq!(BinaryVector, lhs, rhs),
-        String(_) => is_vector_eq!(StringVector, lhs, rhs),
-        Date(_) => is_vector_eq!(DateVector, lhs, rhs),
-        DateTime(_) => is_vector_eq!(DateTimeVector, lhs, rhs),
-        Timestamp(t) => match t {
-            TimestampType::Second(_) => {
-                is_vector_eq!(TimestampSecondVector, lhs, rhs)
-            }
-            TimestampType::Millisecond(_) => {
-                is_vector_eq!(TimestampMillisecondVector, lhs, rhs)
-            }
-            TimestampType::Microsecond(_) => {
-                is_vector_eq!(TimestampMicrosecondVector, lhs, rhs)
-            }
-            TimestampType::Nanosecond(_) => {
-                is_vector_eq!(TimestampNanosecondVector, lhs, rhs)
-            }
-        },
-        List(_) => is_vector_eq!(ListVector, lhs, rhs),
-        UInt8(_) | UInt16(_) | UInt32(_) | UInt64(_) | Int8(_) | Int16(_) | Int32(_) | Int64(_)
-        | Float32(_) | Float64(_) => {
-            with_match_primitive_type_id!(lhs_type.logical_type_id(), |$T| {
-                let lhs = lhs.as_any().downcast_ref::<PrimitiveVector<$T>>().unwrap();
-                let rhs = rhs.as_any().downcast_ref::<PrimitiveVector<$T>>().unwrap();
-
-                lhs == rhs
-            },
-            {
-                unreachable!("should not compare {} with {}", lhs.vector_type_name(), rhs.vector_type_name())
-            })
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::vectors::{
-        list, Float32Vector, Float64Vector, Int16Vector, Int32Vector, Int64Vector, Int8Vector,
-        NullVector, UInt16Vector, UInt32Vector, UInt64Vector, UInt8Vector, VectorRef,
-    };
-
-    fn assert_vector_ref_eq(vector: VectorRef) {
-        let rhs = vector.clone();
-        assert_eq!(vector, rhs);
-        assert_dyn_vector_eq(&*vector, &*rhs);
-    }
-
-    fn assert_dyn_vector_eq(lhs: &dyn Vector, rhs: &dyn Vector) {
-        assert_eq!(lhs, rhs);
-    }
-
-    fn assert_vector_ref_ne(lhs: VectorRef, rhs: VectorRef) {
-        assert_ne!(lhs, rhs);
-    }
-
-    #[test]
-    fn test_vector_eq() {
-        assert_vector_ref_eq(Arc::new(BinaryVector::from(vec![
-            Some(b"hello".to_vec()),
-            Some(b"world".to_vec()),
-        ])));
-        assert_vector_ref_eq(Arc::new(BooleanVector::from(vec![true, false])));
-        assert_vector_ref_eq(Arc::new(ConstantVector::new(
-            Arc::new(BooleanVector::from(vec![true])),
-            5,
-        )));
-        assert_vector_ref_eq(Arc::new(BooleanVector::from(vec![true, false])));
-        assert_vector_ref_eq(Arc::new(DateVector::from(vec![Some(100), Some(120)])));
-        assert_vector_ref_eq(Arc::new(DateTimeVector::from(vec![Some(100), Some(120)])));
-        assert_vector_ref_eq(Arc::new(TimestampSecondVector::from_values([100, 120])));
-        assert_vector_ref_eq(Arc::new(TimestampMillisecondVector::from_values([
-            100, 120,
-        ])));
-        assert_vector_ref_eq(Arc::new(TimestampMicrosecondVector::from_values([
-            100, 120,
-        ])));
-        assert_vector_ref_eq(Arc::new(TimestampNanosecondVector::from_values([100, 120])));
-
-        let list_vector = list::tests::new_list_vector(&[
-            Some(vec![Some(1), Some(2)]),
-            None,
-            Some(vec![Some(3), Some(4)]),
-        ]);
-        assert_vector_ref_eq(Arc::new(list_vector));
-
-        assert_vector_ref_eq(Arc::new(NullVector::new(4)));
-        assert_vector_ref_eq(Arc::new(StringVector::from(vec![
-            Some("hello"),
-            Some("world"),
-        ])));
-
-        assert_vector_ref_eq(Arc::new(Int8Vector::from_slice(&[1, 2, 3, 4])));
-        assert_vector_ref_eq(Arc::new(UInt8Vector::from_slice(&[1, 2, 3, 4])));
-        assert_vector_ref_eq(Arc::new(Int16Vector::from_slice(&[1, 2, 3, 4])));
-        assert_vector_ref_eq(Arc::new(UInt16Vector::from_slice(&[1, 2, 3, 4])));
-        assert_vector_ref_eq(Arc::new(Int32Vector::from_slice(&[1, 2, 3, 4])));
-        assert_vector_ref_eq(Arc::new(UInt32Vector::from_slice(&[1, 2, 3, 4])));
-        assert_vector_ref_eq(Arc::new(Int64Vector::from_slice(&[1, 2, 3, 4])));
-        assert_vector_ref_eq(Arc::new(UInt64Vector::from_slice(&[1, 2, 3, 4])));
-        assert_vector_ref_eq(Arc::new(Float32Vector::from_slice(&[1.0, 2.0, 3.0, 4.0])));
-        assert_vector_ref_eq(Arc::new(Float64Vector::from_slice(&[1.0, 2.0, 3.0, 4.0])));
-    }
-
-    #[test]
-    fn test_vector_ne() {
-        assert_vector_ref_ne(
-            Arc::new(Int32Vector::from_slice(&[1, 2, 3, 4])),
-            Arc::new(Int32Vector::from_slice(&[1, 2])),
-        );
-        assert_vector_ref_ne(
-            Arc::new(Int32Vector::from_slice(&[1, 2, 3, 4])),
-            Arc::new(Int8Vector::from_slice(&[1, 2, 3, 4])),
-        );
-        assert_vector_ref_ne(
-            Arc::new(Int32Vector::from_slice(&[1, 2, 3, 4])),
-            Arc::new(BooleanVector::from(vec![true, true])),
-        );
-        assert_vector_ref_ne(
-            Arc::new(ConstantVector::new(
-                Arc::new(BooleanVector::from(vec![true])),
-                5,
-            )),
-            Arc::new(ConstantVector::new(
-                Arc::new(BooleanVector::from(vec![true])),
-                4,
-            )),
-        );
-        assert_vector_ref_ne(
-            Arc::new(ConstantVector::new(
-                Arc::new(BooleanVector::from(vec![true])),
-                5,
-            )),
-            Arc::new(ConstantVector::new(
-                Arc::new(BooleanVector::from(vec![false])),
-                4,
-            )),
-        );
-        assert_vector_ref_ne(
-            Arc::new(ConstantVector::new(
-                Arc::new(BooleanVector::from(vec![true])),
-                5,
-            )),
-            Arc::new(ConstantVector::new(
-                Arc::new(Int32Vector::from_slice(vec![1])),
-                4,
-            )),
-        );
-        assert_vector_ref_ne(Arc::new(NullVector::new(5)), Arc::new(NullVector::new(8)));
-    }
-}
diff --git a/src/datatypes2/src/vectors/helper.rs b/src/datatypes2/src/vectors/helper.rs
deleted file mode 100644
index f3236ca0ec42..000000000000
--- a/src/datatypes2/src/vectors/helper.rs
+++ /dev/null
@@ -1,431 +0,0 @@
-// Copyright 2022 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//! Vector helper functions, inspired by databend Series mod
-
-use std::any::Any;
-use std::sync::Arc;
-
-use arrow::array::{Array, ArrayRef, StringArray};
-use arrow::compute;
-use arrow::compute::kernels::comparison;
-use arrow::datatypes::{DataType as ArrowDataType, TimeUnit};
-use datafusion_common::ScalarValue;
-use snafu::{OptionExt, ResultExt};
-
-use crate::data_type::ConcreteDataType;
-use crate::error::{self, Result};
-use crate::scalars::{Scalar, ScalarVectorBuilder};
-use crate::value::{ListValue, ListValueRef};
-use crate::vectors::{
-    BinaryVector, BooleanVector, ConstantVector, DateTimeVector, DateVector, Float32Vector,
-    Float64Vector, Int16Vector, Int32Vector, Int64Vector, Int8Vector, ListVector,
-    ListVectorBuilder, MutableVector, NullVector, StringVector, TimestampMicrosecondVector,
-    TimestampMillisecondVector, TimestampNanosecondVector, TimestampSecondVector, UInt16Vector,
-    UInt32Vector, UInt64Vector, UInt8Vector, Vector, VectorRef,
-};
-
-/// Helper functions for `Vector`.
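-///
-/// Illustrative examples (added commentary, not in the original docs):
-/// `Helper::try_into_vector` turns an Arrow `ArrayRef` into a `VectorRef`, while
-/// `Helper::try_from_scalar_value` expands a DataFusion `ScalarValue` into a
-/// `ConstantVector` of the requested length; both are exercised in the tests below.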
-pub struct Helper;
-
-impl Helper {
-    /// Get a pointer to the underlying data of this vectors.
-    /// Can be useful for fast comparisons.
-    /// # Safety
-    /// Assumes that the `vector` is T.
-    pub unsafe fn static_cast<T: Any>(vector: &VectorRef) -> &T {
-        let object = vector.as_ref();
-        debug_assert!(object.as_any().is::<T>());
-        &*(object as *const dyn Vector as *const T)
-    }
-
-    pub fn check_get_scalar<T: Scalar>(vector: &VectorRef) -> Result<&<T as Scalar>::VectorType> {
-        let arr = vector
-            .as_any()
-            .downcast_ref::<<T as Scalar>::VectorType>()
-            .with_context(|| error::UnknownVectorSnafu {
-                msg: format!(
-                    "downcast vector error, vector type: {:?}, expected vector: {:?}",
-                    vector.vector_type_name(),
-                    std::any::type_name::<T>(),
-                ),
-            });
-        arr
-    }
-
-    pub fn check_get<T: 'static + Vector>(vector: &VectorRef) -> Result<&T> {
-        let arr = vector
-            .as_any()
-            .downcast_ref::<T>()
-            .with_context(|| error::UnknownVectorSnafu {
-                msg: format!(
-                    "downcast vector error, vector type: {:?}, expected vector: {:?}",
-                    vector.vector_type_name(),
-                    std::any::type_name::<T>(),
-                ),
-            });
-        arr
-    }
-
-    pub fn check_get_mutable_vector<T: 'static + MutableVector>(
-        vector: &mut dyn MutableVector,
-    ) -> Result<&mut T> {
-        let ty = vector.data_type();
-        let arr = vector
-            .as_mut_any()
-            .downcast_mut()
-            .with_context(|| error::UnknownVectorSnafu {
-                msg: format!(
-                    "downcast vector error, vector type: {:?}, expected vector: {:?}",
-                    ty,
-                    std::any::type_name::<T>(),
-                ),
-            });
-        arr
-    }
-
-    pub fn check_get_scalar_vector<T: Scalar>(
-        vector: &VectorRef,
-    ) -> Result<&<T as Scalar>::VectorType> {
-        let arr = vector
-            .as_any()
-            .downcast_ref::<<T as Scalar>::VectorType>()
-            .with_context(|| error::UnknownVectorSnafu {
-                msg: format!(
-                    "downcast vector error, vector type: {:?}, expected vector: {:?}",
-                    vector.vector_type_name(),
-                    std::any::type_name::<T>(),
-                ),
-            });
-        arr
-    }
-
-    /// Try to cast an arrow scalar value into vector
-    pub fn try_from_scalar_value(value: ScalarValue, length: usize) -> Result<VectorRef> {
-        let vector = match value {
-            ScalarValue::Null => ConstantVector::new(Arc::new(NullVector::new(1)), length),
-            ScalarValue::Boolean(v) => {
-                ConstantVector::new(Arc::new(BooleanVector::from(vec![v])), length)
-            }
-            ScalarValue::Float32(v) => {
-                ConstantVector::new(Arc::new(Float32Vector::from(vec![v])), length)
-            }
-            ScalarValue::Float64(v) => {
-                ConstantVector::new(Arc::new(Float64Vector::from(vec![v])), length)
-            }
-            ScalarValue::Int8(v) => {
-                ConstantVector::new(Arc::new(Int8Vector::from(vec![v])), length)
-            }
-            ScalarValue::Int16(v) => {
-                ConstantVector::new(Arc::new(Int16Vector::from(vec![v])), length)
-            }
-            ScalarValue::Int32(v) => {
-                ConstantVector::new(Arc::new(Int32Vector::from(vec![v])), length)
-            }
-            ScalarValue::Int64(v) => {
-                ConstantVector::new(Arc::new(Int64Vector::from(vec![v])), length)
-            }
-            ScalarValue::UInt8(v) => {
-                ConstantVector::new(Arc::new(UInt8Vector::from(vec![v])), length)
-            }
-            ScalarValue::UInt16(v) => {
-                ConstantVector::new(Arc::new(UInt16Vector::from(vec![v])), length)
-            }
-            ScalarValue::UInt32(v) => {
-                ConstantVector::new(Arc::new(UInt32Vector::from(vec![v])), length)
-            }
-            ScalarValue::UInt64(v) => {
-                ConstantVector::new(Arc::new(UInt64Vector::from(vec![v])), length)
-            }
-            ScalarValue::Utf8(v) | ScalarValue::LargeUtf8(v) => {
-                ConstantVector::new(Arc::new(StringVector::from(vec![v])), length)
-            }
-            ScalarValue::Binary(v)
-            | ScalarValue::LargeBinary(v)
-            | ScalarValue::FixedSizeBinary(_, v) => {
-                ConstantVector::new(Arc::new(BinaryVector::from(vec![v])), length)
-            }
-            ScalarValue::List(v, field) => {
-                let item_type = ConcreteDataType::try_from(field.data_type())?;
-                let mut builder = ListVectorBuilder::with_type_capacity(item_type.clone(), 1);
-                if let Some(values) = v {
-                    let values = values
-                        .into_iter()
-                        .map(ScalarValue::try_into)
-                        .collect::<Result<_>>()?;
-                    let list_value = ListValue::new(Some(Box::new(values)), item_type);
-                    builder.push(Some(ListValueRef::Ref { val: &list_value }));
-                } else {
-                    builder.push(None);
-                }
-                let list_vector = builder.to_vector();
-                ConstantVector::new(list_vector, length)
-            }
-            ScalarValue::Date32(v) => {
-                ConstantVector::new(Arc::new(DateVector::from(vec![v])), length)
-            }
-            ScalarValue::Date64(v) => {
-                ConstantVector::new(Arc::new(DateTimeVector::from(vec![v])), length)
-            }
-            ScalarValue::TimestampSecond(v, _) => {
-                // Timezone is unimplemented now.
-                ConstantVector::new(Arc::new(TimestampSecondVector::from(vec![v])), length)
-            }
-            ScalarValue::TimestampMillisecond(v, _) => {
-                // Timezone is unimplemented now.
-                ConstantVector::new(Arc::new(TimestampMillisecondVector::from(vec![v])), length)
-            }
-            ScalarValue::TimestampMicrosecond(v, _) => {
-                // Timezone is unimplemented now.
-                ConstantVector::new(Arc::new(TimestampMicrosecondVector::from(vec![v])), length)
-            }
-            ScalarValue::TimestampNanosecond(v, _) => {
-                // Timezone is unimplemented now.
-                ConstantVector::new(Arc::new(TimestampNanosecondVector::from(vec![v])), length)
-            }
-            ScalarValue::Decimal128(_, _, _)
-            | ScalarValue::Time64(_)
-            | ScalarValue::IntervalYearMonth(_)
-            | ScalarValue::IntervalDayTime(_)
-            | ScalarValue::IntervalMonthDayNano(_)
-            | ScalarValue::Struct(_, _)
-            | ScalarValue::Dictionary(_, _) => {
-                return error::ConversionSnafu {
-                    from: format!("Unsupported scalar value: {}", value),
-                }
-                .fail()
-            }
-        };
-
-        Ok(Arc::new(vector))
-    }
-
-    /// Try to cast an arrow array into vector
-    ///
-    /// # Panics
-    /// Panic if given arrow data type is not supported.
-    pub fn try_into_vector(array: impl AsRef<dyn Array>) -> Result<VectorRef> {
-        Ok(match array.as_ref().data_type() {
-            ArrowDataType::Null => Arc::new(NullVector::try_from_arrow_array(array)?),
-            ArrowDataType::Boolean => Arc::new(BooleanVector::try_from_arrow_array(array)?),
-            ArrowDataType::LargeBinary => Arc::new(BinaryVector::try_from_arrow_array(array)?),
-            ArrowDataType::Int8 => Arc::new(Int8Vector::try_from_arrow_array(array)?),
-            ArrowDataType::Int16 => Arc::new(Int16Vector::try_from_arrow_array(array)?),
-            ArrowDataType::Int32 => Arc::new(Int32Vector::try_from_arrow_array(array)?),
-            ArrowDataType::Int64 => Arc::new(Int64Vector::try_from_arrow_array(array)?),
-            ArrowDataType::UInt8 => Arc::new(UInt8Vector::try_from_arrow_array(array)?),
-            ArrowDataType::UInt16 => Arc::new(UInt16Vector::try_from_arrow_array(array)?),
-            ArrowDataType::UInt32 => Arc::new(UInt32Vector::try_from_arrow_array(array)?),
-            ArrowDataType::UInt64 => Arc::new(UInt64Vector::try_from_arrow_array(array)?),
-            ArrowDataType::Float32 => Arc::new(Float32Vector::try_from_arrow_array(array)?),
-            ArrowDataType::Float64 => Arc::new(Float64Vector::try_from_arrow_array(array)?),
-            ArrowDataType::Utf8 => Arc::new(StringVector::try_from_arrow_array(array)?),
-            ArrowDataType::Date32 => Arc::new(DateVector::try_from_arrow_array(array)?),
-            ArrowDataType::Date64 => Arc::new(DateTimeVector::try_from_arrow_array(array)?),
-            ArrowDataType::List(_) => Arc::new(ListVector::try_from_arrow_array(array)?),
-            ArrowDataType::Timestamp(unit, _) => match unit {
-                TimeUnit::Second => Arc::new(TimestampSecondVector::try_from_arrow_array(array)?),
-                TimeUnit::Millisecond => {
-                    Arc::new(TimestampMillisecondVector::try_from_arrow_array(array)?)
-                }
-                TimeUnit::Microsecond => {
-                    Arc::new(TimestampMicrosecondVector::try_from_arrow_array(array)?)
-                }
-                TimeUnit::Nanosecond => {
-                    Arc::new(TimestampNanosecondVector::try_from_arrow_array(array)?)
-                }
-            },
-            ArrowDataType::Float16
-            | ArrowDataType::Time32(_)
-            | ArrowDataType::Time64(_)
-            | ArrowDataType::Duration(_)
-            | ArrowDataType::Interval(_)
-            | ArrowDataType::Binary
-            | ArrowDataType::FixedSizeBinary(_)
-            | ArrowDataType::LargeUtf8
-            | ArrowDataType::LargeList(_)
-            | ArrowDataType::FixedSizeList(_, _)
-            | ArrowDataType::Struct(_)
-            | ArrowDataType::Union(_, _, _)
-            | ArrowDataType::Dictionary(_, _)
-            | ArrowDataType::Decimal128(_, _)
-            | ArrowDataType::Decimal256(_, _)
-            | ArrowDataType::Map(_, _) => {
-                unimplemented!("Arrow array datatype: {:?}", array.as_ref().data_type())
-            }
-        })
-    }
-
-    /// Try to cast slice of `arrays` to vectors.
-    pub fn try_into_vectors(arrays: &[ArrayRef]) -> Result<Vec<VectorRef>> {
-        arrays.iter().map(Self::try_into_vector).collect()
-    }
-
-    /// Perform SQL like operation on `names` and a scalar `s`.
-    pub fn like_utf8(names: Vec<String>, s: &str) -> Result<VectorRef> {
-        let array = StringArray::from(names);
-
-        let filter = comparison::like_utf8_scalar(&array, s).context(error::ArrowComputeSnafu)?;
-
-        let result = compute::filter(&array, &filter).context(error::ArrowComputeSnafu)?;
-        Helper::try_into_vector(result)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use arrow::array::{
-        ArrayRef, BooleanArray, Date32Array, Date64Array, Float32Array, Float64Array, Int16Array,
-        Int32Array, Int64Array, Int8Array, LargeBinaryArray, ListArray, NullArray,
-        TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
-        TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
-    };
-    use arrow::datatypes::{Field, Int32Type};
-    use common_time::{Date, DateTime};
-
-    use super::*;
-    use crate::value::Value;
-    use crate::vectors::ConcreteDataType;
-
-    #[test]
-    fn test_try_into_vectors() {
-        let arrays: Vec<ArrayRef> = vec![
-            Arc::new(Int32Array::from(vec![1])),
-            Arc::new(Int32Array::from(vec![2])),
-            Arc::new(Int32Array::from(vec![3])),
-        ];
-        let vectors = Helper::try_into_vectors(&arrays);
-        assert!(vectors.is_ok());
-        let vectors = vectors.unwrap();
-        vectors.iter().for_each(|v| assert_eq!(1, v.len()));
-        assert_eq!(Value::Int32(1), vectors[0].get(0));
-        assert_eq!(Value::Int32(2), vectors[1].get(0));
-        assert_eq!(Value::Int32(3), vectors[2].get(0));
-    }
-
-    #[test]
-    fn test_try_into_date_vector() {
-        let vector = DateVector::from(vec![Some(1), Some(2), None]);
-        let arrow_array = vector.to_arrow_array();
-        assert_eq!(&ArrowDataType::Date32, arrow_array.data_type());
-        let vector_converted = Helper::try_into_vector(arrow_array).unwrap();
-        assert_eq!(vector.len(), vector_converted.len());
-        for i in 0..vector_converted.len() {
-            assert_eq!(vector.get(i), vector_converted.get(i));
-        }
-    }
-
-    #[test]
-    fn test_try_from_scalar_date_value() {
-        let vector = Helper::try_from_scalar_value(ScalarValue::Date32(Some(42)), 3).unwrap();
-        assert_eq!(ConcreteDataType::date_datatype(), vector.data_type());
-        assert_eq!(3, vector.len());
-        for i in 0..vector.len() {
-            assert_eq!(Value::Date(Date::new(42)), vector.get(i));
-        }
-    }
-
-    #[test]
-    fn test_try_from_scalar_datetime_value() {
-        let vector = Helper::try_from_scalar_value(ScalarValue::Date64(Some(42)), 3).unwrap();
-        assert_eq!(ConcreteDataType::datetime_datatype(), vector.data_type());
-        assert_eq!(3, vector.len());
-        for i in 0..vector.len() {
-            assert_eq!(Value::DateTime(DateTime::new(42)), vector.get(i));
-        }
-    }
-
-    #[test]
-    fn test_try_from_list_value() {
-        let value = ScalarValue::List(
-            Some(vec![
-                ScalarValue::Int32(Some(1)),
-                ScalarValue::Int32(Some(2)),
-            ]),
-            Box::new(Field::new("item", ArrowDataType::Int32, true)),
-        );
-        let vector = Helper::try_from_scalar_value(value, 3).unwrap();
-        assert_eq!(
-            ConcreteDataType::list_datatype(ConcreteDataType::int32_datatype()),
-            vector.data_type()
-        );
-        assert_eq!(3, vector.len());
-        for i in 0..vector.len() {
-            let v = vector.get(i);
-            let items = v.as_list().unwrap().unwrap().items().as_ref().unwrap();
-            assert_eq!(vec![Value::Int32(1), Value::Int32(2)], **items);
-        }
-    }
-
-    #[test]
-    fn test_like_utf8() {
-        fn assert_vector(expected: Vec<&str>, actual: &VectorRef) {
-            let actual = actual.as_any().downcast_ref::<StringVector>().unwrap();
-            assert_eq!(*actual, StringVector::from(expected));
-        }
-
-        let names: Vec<String> = vec!["greptime", "hello", "public", "world"]
-            .into_iter()
-            .map(|x| x.to_string())
-            .collect();
-
-        let ret = Helper::like_utf8(names.clone(), "%ll%").unwrap();
-        assert_vector(vec!["hello"], &ret);
-
-        let ret = Helper::like_utf8(names.clone(), "%time").unwrap();
-        assert_vector(vec!["greptime"], &ret);
-
-        let ret = Helper::like_utf8(names.clone(), "%ld").unwrap();
-        assert_vector(vec!["world"], &ret);
-
-        let ret = Helper::like_utf8(names, "%").unwrap();
-        assert_vector(vec!["greptime", "hello", "public", "world"], &ret);
-    }
-
-    fn check_try_into_vector(array: impl Array + 'static) {
-        let array: ArrayRef = Arc::new(array);
-        let vector = Helper::try_into_vector(array.clone()).unwrap();
-        assert_eq!(&array, &vector.to_arrow_array());
-    }
-
-    #[test]
-    fn test_try_into_vector() {
-        check_try_into_vector(NullArray::new(2));
-        check_try_into_vector(BooleanArray::from(vec![true, false]));
-        check_try_into_vector(LargeBinaryArray::from(vec![
-            "hello".as_bytes(),
-            "world".as_bytes(),
-        ]));
-        check_try_into_vector(Int8Array::from(vec![1, 2, 3]));
-        check_try_into_vector(Int16Array::from(vec![1, 2, 3]));
-        check_try_into_vector(Int32Array::from(vec![1, 2, 3]));
-        check_try_into_vector(Int64Array::from(vec![1, 2, 3]));
-        check_try_into_vector(UInt8Array::from(vec![1, 2, 3]));
-        check_try_into_vector(UInt16Array::from(vec![1, 2, 3]));
-        check_try_into_vector(UInt32Array::from(vec![1, 2, 3]));
-        check_try_into_vector(UInt64Array::from(vec![1, 2, 3]));
-        check_try_into_vector(Float32Array::from(vec![1.0, 2.0, 3.0]));
-        check_try_into_vector(Float64Array::from(vec![1.0, 2.0, 3.0]));
-        check_try_into_vector(StringArray::from(vec!["hello", "world"]));
-        check_try_into_vector(Date32Array::from(vec![1, 2, 3]));
-        check_try_into_vector(Date64Array::from(vec![1, 2, 3]));
-        let data = vec![None, Some(vec![Some(6), Some(7)])];
-        let list_array = ListArray::from_iter_primitive::<Int32Type, _, _>(data);
-        check_try_into_vector(list_array);
-        check_try_into_vector(TimestampSecondArray::from(vec![1, 2, 3]));
-        check_try_into_vector(TimestampMillisecondArray::from(vec![1, 2, 3]));
-        check_try_into_vector(TimestampMicrosecondArray::from(vec![1, 2, 3]));
-        check_try_into_vector(TimestampNanosecondArray::from(vec![1, 2, 3]));
-    }
-}
diff --git a/src/datatypes2/src/vectors/list.rs b/src/datatypes2/src/vectors/list.rs
deleted file mode 100644
index 747e03557ba2..000000000000
--- a/src/datatypes2/src/vectors/list.rs
+++ /dev/null
@@ -1,747 +0,0 @@
-// Copyright 2022 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::any::Any;
-use std::sync::Arc;
-
-use arrow::array::{
-    Array, ArrayData, ArrayRef, BooleanBufferBuilder, Int32BufferBuilder, ListArray,
-};
-use arrow::buffer::Buffer;
-use arrow::datatypes::DataType as ArrowDataType;
-use serde_json::Value as JsonValue;
-
-use crate::data_type::{ConcreteDataType, DataType};
-use crate::error::Result;
-use crate::scalars::{ScalarVector, ScalarVectorBuilder};
-use crate::serialize::Serializable;
-use crate::types::ListType;
-use crate::value::{ListValue, ListValueRef, Value, ValueRef};
-use crate::vectors::{self, Helper, MutableVector, Validity, Vector, VectorRef};
-
-/// Vector of Lists, basically backed by Arrow's `ListArray`.
-#[derive(Debug, PartialEq)]
-pub struct ListVector {
-    array: ListArray,
-    /// The datatype of the items in the list.
-    item_type: ConcreteDataType,
-}
-
-impl ListVector {
-    /// Iterate elements as [VectorRef].
-    pub fn values_iter(&self) -> impl Iterator<Item = Result<Option<VectorRef>>> + '_ {
-        self.array
-            .iter()
-            .map(|value_opt| value_opt.map(Helper::try_into_vector).transpose())
-    }
-
-    fn to_array_data(&self) -> ArrayData {
-        self.array.data().clone()
-    }
-
-    fn from_array_data_and_type(data: ArrayData, item_type: ConcreteDataType) -> Self {
-        Self {
-            array: ListArray::from(data),
-            item_type,
-        }
-    }
-
-    pub(crate) fn as_arrow(&self) -> &dyn Array {
-        &self.array
-    }
-}
-
-impl Vector for ListVector {
-    fn data_type(&self) -> ConcreteDataType {
-        ConcreteDataType::List(ListType::new(self.item_type.clone()))
-    }
-
-    fn vector_type_name(&self) -> String {
-        "ListVector".to_string()
-    }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn len(&self) -> usize {
-        self.array.len()
-    }
-
-    fn to_arrow_array(&self) -> ArrayRef {
-        let data = self.to_array_data();
-        Arc::new(ListArray::from(data))
-    }
-
-    fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
-        let data = self.to_array_data();
-        Box::new(ListArray::from(data))
-    }
-
-    fn validity(&self) -> Validity {
-        vectors::impl_validity_for_vector!(self.array)
-    }
-
-    fn memory_size(&self) -> usize {
-        self.array.get_buffer_memory_size()
-    }
-
-    fn null_count(&self) -> usize {
-        self.array.null_count()
-    }
-
-    fn is_null(&self, row: usize) -> bool {
-        self.array.is_null(row)
-    }
-
-    fn slice(&self, offset: usize, length: usize) -> VectorRef {
-        let data = self.array.data().slice(offset, length);
-        Arc::new(Self::from_array_data_and_type(data, self.item_type.clone()))
-    }
-
-    fn get(&self, index: usize) -> Value {
-        if !self.array.is_valid(index) {
-            return Value::Null;
-        }
-
-        let array = &self.array.value(index);
-        let vector = Helper::try_into_vector(array).unwrap_or_else(|_| {
-            panic!(
-                "arrow array with datatype {:?} cannot be converted to our vector",
-                array.data_type()
-            )
-        });
-        let values = (0..vector.len())
-            .map(|i| vector.get(i))
-            .collect::<Vec<_>>();
-        Value::List(ListValue::new(
-            Some(Box::new(values)),
-            self.item_type.clone(),
-        ))
-    }
-
-    fn get_ref(&self, index: usize) -> ValueRef {
-        ValueRef::List(ListValueRef::Indexed {
-            vector: self,
-            idx: index,
-        })
-    }
-}
-
-impl Serializable for ListVector {
-    fn serialize_to_json(&self) -> Result<Vec<JsonValue>> {
-        self.array
-            .iter()
-            .map(|v| match v {
-                None => Ok(JsonValue::Null),
-                Some(v) => Helper::try_into_vector(v)
-                    .and_then(|v| v.serialize_to_json())
-                    .map(JsonValue::Array),
-            })
-            .collect()
-    }
-}
-
-impl From<ListArray> for ListVector {
-    fn from(array: ListArray) -> Self {
-        let item_type = ConcreteDataType::from_arrow_type(match array.data_type() {
-            ArrowDataType::List(field) => field.data_type(),
-            other => panic!(
-                "Try to create ListVector from an arrow array with type {:?}",
-                other
-            ),
-        });
-        Self { array, item_type }
-    }
-}
-
-vectors::impl_try_from_arrow_array_for_vector!(ListArray, ListVector);
-
-pub struct ListIter<'a> {
-    vector: &'a ListVector,
-    idx: usize,
-}
-
-impl<'a> ListIter<'a> {
-    fn new(vector: &'a ListVector) -> ListIter {
-        ListIter { vector, idx: 0 }
-    }
-}
-
-impl<'a> Iterator for ListIter<'a> {
-    type Item = Option<ListValueRef<'a>>;
-
-    #[inline]
-    fn next(&mut self) -> Option<Self::Item> {
-        if self.idx >= self.vector.len() {
-            return None;
-        }
-
-        let idx = self.idx;
-        self.idx += 1;
-
-        if self.vector.is_null(idx) {
-            return Some(None);
-        }
-
-        Some(Some(ListValueRef::Indexed {
-            vector: self.vector,
-            idx,
-        }))
-    }
-
-    #[inline]
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        (self.vector.len(), Some(self.vector.len()))
-    }
-}
-
-impl ScalarVector for ListVector {
-    type OwnedItem = ListValue;
-    type RefItem<'a> = ListValueRef<'a>;
-    type Iter<'a> = ListIter<'a>;
-    type Builder = ListVectorBuilder;
-
-    fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>> {
-        if self.array.is_valid(idx) {
-            Some(ListValueRef::Indexed { vector: self, idx })
-        } else {
-            None
-        }
-    }
-
-    fn iter_data(&self) -> Self::Iter<'_> {
-        ListIter::new(self)
-    }
-}
-
-// Ports from arrow's GenericListBuilder.
-// See https://github.com/apache/arrow-rs/blob/94565bca99b5d9932a3e9a8e094aaf4e4384b1e5/arrow-array/src/builder/generic_list_builder.rs
-/// [ListVector] builder.
-pub struct ListVectorBuilder {
-    item_type: ConcreteDataType,
-    offsets_builder: Int32BufferBuilder,
-    null_buffer_builder: NullBufferBuilder,
-    values_builder: Box<dyn MutableVector>,
-}
-
-impl ListVectorBuilder {
-    /// Creates a new [`ListVectorBuilder`]. `item_type` is the data type of the list item, `capacity`
-    /// is the number of items to pre-allocate space for in this builder.
-    pub fn with_type_capacity(item_type: ConcreteDataType, capacity: usize) -> ListVectorBuilder {
-        let mut offsets_builder = Int32BufferBuilder::new(capacity + 1);
-        offsets_builder.append(0);
-        // The actual required capacity might be greater than the capacity of the `ListVector`
-        // if the child vector has more than one element.
-        let values_builder = item_type.create_mutable_vector(capacity);
-
-        ListVectorBuilder {
-            item_type,
-            offsets_builder,
-            null_buffer_builder: NullBufferBuilder::new(capacity),
-            values_builder,
-        }
-    }
-
-    /// Finish the current variable-length list vector slot.
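-    ///
-    /// Illustrative note (added commentary): appends the child vector's current
-    /// length as the next end offset and records one validity bit, so list slot `i`
-    /// covers child elements `offsets[i]..offsets[i + 1]`.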
-    fn finish_list(&mut self, is_valid: bool) {
-        self.offsets_builder
-            .append(i32::try_from(self.values_builder.len()).unwrap());
-        self.null_buffer_builder.append(is_valid);
-    }
-
-    fn push_null(&mut self) {
-        self.finish_list(false);
-    }
-
-    fn push_list_value(&mut self, list_value: &ListValue) -> Result<()> {
-        if let Some(items) = list_value.items() {
-            for item in &**items {
-                self.values_builder.push_value_ref(item.as_value_ref())?;
-            }
-        }
-
-        self.finish_list(true);
-        Ok(())
-    }
-}
-
-impl MutableVector for ListVectorBuilder {
-    fn data_type(&self) -> ConcreteDataType {
-        ConcreteDataType::list_datatype(self.item_type.clone())
-    }
-
-    fn len(&self) -> usize {
-        self.null_buffer_builder.len()
-    }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn as_mut_any(&mut self) -> &mut dyn Any {
-        self
-    }
-
-    fn to_vector(&mut self) -> VectorRef {
-        Arc::new(self.finish())
-    }
-
-    fn push_value_ref(&mut self, value: ValueRef) -> Result<()> {
-        if let Some(list_ref) = value.as_list()? {
-            match list_ref {
-                ListValueRef::Indexed { vector, idx } => match vector.get(idx).as_list()? {
-                    Some(list_value) => self.push_list_value(list_value)?,
-                    None => self.push_null(),
-                },
-                ListValueRef::Ref { val } => self.push_list_value(val)?,
-            }
-        } else {
-            self.push_null();
-        }
-
-        Ok(())
-    }
-
-    fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> {
-        for idx in offset..offset + length {
-            let value = vector.get_ref(idx);
-            self.push_value_ref(value)?;
-        }
-
-        Ok(())
-    }
-}
-
-impl ScalarVectorBuilder for ListVectorBuilder {
-    type VectorType = ListVector;
-
-    fn with_capacity(_capacity: usize) -> Self {
-        panic!("Must use ListVectorBuilder::with_type_capacity()");
-    }
-
-    fn push(&mut self, value: Option<<Self::VectorType as ScalarVector>::RefItem<'_>>) {
-        // We expect the input ListValue has the same inner type as the builder when using
-        // push(), so just panic if `push_value_ref()` returns error, which indicates an
-        // invalid input value type.
-        self.push_value_ref(value.into()).unwrap_or_else(|e| {
-            panic!(
-                "Failed to push value, expect value type {:?}, err:{}",
-                self.item_type, e
-            );
-        });
-    }
-
-    fn finish(&mut self) -> Self::VectorType {
-        let len = self.len();
-        let values_vector = self.values_builder.to_vector();
-        let values_arr = values_vector.to_arrow_array();
-        let values_data = values_arr.data();
-
-        let offset_buffer = self.offsets_builder.finish();
-        let null_bit_buffer = self.null_buffer_builder.finish();
-        // Re-initialize the offsets_builder.
-        self.offsets_builder.append(0);
-        let data_type = ConcreteDataType::list_datatype(self.item_type.clone()).as_arrow_type();
-        let array_data_builder = ArrayData::builder(data_type)
-            .len(len)
-            .add_buffer(offset_buffer)
-            .add_child_data(values_data.clone())
-            .null_bit_buffer(null_bit_buffer);
-
-        let array_data = unsafe { array_data_builder.build_unchecked() };
-        let array = ListArray::from(array_data);
-
-        ListVector {
-            array,
-            item_type: self.item_type.clone(),
-        }
-    }
-}
-
-// Ports from https://github.com/apache/arrow-rs/blob/94565bca99b5d9932a3e9a8e094aaf4e4384b1e5/arrow-array/src/builder/null_buffer_builder.rs
-/// Builder for creating the null bit buffer.
-/// This builder only materializes the buffer when we append `false`.
-/// If you only append `true`s to the builder, what you get will be
-/// `None` when calling [`finish`](#method.finish).
-/// This optimization is **very** important for the performance.
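-/// Illustrative note (added commentary): appending only `true`s just bumps `len`;
-/// the first `false` triggers `materialize`, which allocates a `BooleanBufferBuilder`
-/// and backfills it with `len` leading `true`s before recording the `false`.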
-#[derive(Debug)] -struct NullBufferBuilder { - bitmap_builder: Option, - /// Store the length of the buffer before materializing. - len: usize, - capacity: usize, -} - -impl NullBufferBuilder { - /// Creates a new empty builder. - /// `capacity` is the number of bits in the null buffer. - fn new(capacity: usize) -> Self { - Self { - bitmap_builder: None, - len: 0, - capacity, - } - } - - fn len(&self) -> usize { - if let Some(b) = &self.bitmap_builder { - b.len() - } else { - self.len - } - } - - /// Appends a `true` into the builder - /// to indicate that this item is not null. - #[inline] - fn append_non_null(&mut self) { - if let Some(buf) = self.bitmap_builder.as_mut() { - buf.append(true) - } else { - self.len += 1; - } - } - - /// Appends a `false` into the builder - /// to indicate that this item is null. - #[inline] - fn append_null(&mut self) { - self.materialize_if_needed(); - self.bitmap_builder.as_mut().unwrap().append(false); - } - - /// Appends a boolean value into the builder. - #[inline] - fn append(&mut self, not_null: bool) { - if not_null { - self.append_non_null() - } else { - self.append_null() - } - } - - /// Builds the null buffer and resets the builder. - /// Returns `None` if the builder only contains `true`s. - fn finish(&mut self) -> Option { - let buf = self.bitmap_builder.as_mut().map(|b| b.finish()); - self.bitmap_builder = None; - self.len = 0; - buf - } - - #[inline] - fn materialize_if_needed(&mut self) { - if self.bitmap_builder.is_none() { - self.materialize() - } - } - - #[cold] - fn materialize(&mut self) { - if self.bitmap_builder.is_none() { - let mut b = BooleanBufferBuilder::new(self.len.max(self.capacity)); - b.append_n(self.len, true); - self.bitmap_builder = Some(b); - } - } -} - -#[cfg(test)] -pub mod tests { - use arrow::array::{Int32Array, Int32Builder, ListBuilder}; - use serde_json::json; - - use super::*; - use crate::scalars::ScalarRef; - use crate::types::ListType; - use crate::vectors::Int32Vector; - - pub fn new_list_vector(data: &[Option>>]) -> ListVector { - let mut builder = - ListVectorBuilder::with_type_capacity(ConcreteDataType::int32_datatype(), 8); - for vec_opt in data { - if let Some(vec) = vec_opt { - let values = vec.iter().map(|v| Value::from(*v)).collect(); - let values = Some(Box::new(values)); - let list_value = ListValue::new(values, ConcreteDataType::int32_datatype()); - - builder.push(Some(ListValueRef::Ref { val: &list_value })); - } else { - builder.push(None); - } - } - - builder.finish() - } - - fn new_list_array(data: &[Option>>]) -> ListArray { - let mut builder = ListBuilder::new(Int32Builder::new()); - for vec_opt in data { - if let Some(vec) = vec_opt { - for value_opt in vec { - builder.values().append_option(*value_opt); - } - - builder.append(true); - } else { - builder.append(false); - } - } - - builder.finish() - } - - #[test] - fn test_list_vector() { - let data = vec![ - Some(vec![Some(1), Some(2), Some(3)]), - None, - Some(vec![Some(4), None, Some(6)]), - ]; - - let list_vector = new_list_vector(&data); - - assert_eq!( - ConcreteDataType::List(ListType::new(ConcreteDataType::int32_datatype())), - list_vector.data_type() - ); - assert_eq!("ListVector", list_vector.vector_type_name()); - assert_eq!(3, list_vector.len()); - assert!(!list_vector.is_null(0)); - assert!(list_vector.is_null(1)); - assert!(!list_vector.is_null(2)); - - let arrow_array = new_list_array(&data); - assert_eq!( - arrow_array, - *list_vector - .to_arrow_array() - .as_any() - .downcast_ref::() - .unwrap() - ); - let validity = 
list_vector.validity(); - assert!(!validity.is_all_null()); - assert!(!validity.is_all_valid()); - assert!(validity.is_set(0)); - assert!(!validity.is_set(1)); - assert!(validity.is_set(2)); - assert_eq!(256, list_vector.memory_size()); - - let slice = list_vector.slice(0, 2).to_arrow_array(); - let sliced_array = slice.as_any().downcast_ref::().unwrap(); - assert_eq!( - Int32Array::from_iter_values([1, 2, 3]), - *sliced_array - .value(0) - .as_any() - .downcast_ref::() - .unwrap() - ); - assert!(sliced_array.is_null(1)); - - assert_eq!( - Value::List(ListValue::new( - Some(Box::new(vec![ - Value::Int32(1), - Value::Int32(2), - Value::Int32(3) - ])), - ConcreteDataType::int32_datatype() - )), - list_vector.get(0) - ); - let value_ref = list_vector.get_ref(0); - assert!(matches!( - value_ref, - ValueRef::List(ListValueRef::Indexed { .. }) - )); - let value_ref = list_vector.get_ref(1); - if let ValueRef::List(ListValueRef::Indexed { idx, .. }) = value_ref { - assert_eq!(1, idx); - } else { - unreachable!() - } - assert_eq!(Value::Null, list_vector.get(1)); - assert_eq!( - Value::List(ListValue::new( - Some(Box::new(vec![ - Value::Int32(4), - Value::Null, - Value::Int32(6) - ])), - ConcreteDataType::int32_datatype() - )), - list_vector.get(2) - ); - } - - #[test] - fn test_from_arrow_array() { - let data = vec![ - Some(vec![Some(1), Some(2), Some(3)]), - None, - Some(vec![Some(4), None, Some(6)]), - ]; - - let arrow_array = new_list_array(&data); - let array_ref: ArrayRef = Arc::new(arrow_array); - let expect = new_list_vector(&data); - - // Test try from ArrayRef - let list_vector = ListVector::try_from_arrow_array(array_ref).unwrap(); - assert_eq!(expect, list_vector); - - // Test from - let arrow_array = new_list_array(&data); - let list_vector = ListVector::from(arrow_array); - assert_eq!(expect, list_vector); - } - - #[test] - fn test_iter_list_vector_values() { - let data = vec![ - Some(vec![Some(1), Some(2), Some(3)]), - None, - Some(vec![Some(4), None, Some(6)]), - ]; - - let list_vector = new_list_vector(&data); - - assert_eq!( - ConcreteDataType::List(ListType::new(ConcreteDataType::int32_datatype())), - list_vector.data_type() - ); - let mut iter = list_vector.values_iter(); - assert_eq!( - Arc::new(Int32Vector::from_slice(&[1, 2, 3])) as VectorRef, - *iter.next().unwrap().unwrap().unwrap() - ); - assert!(iter.next().unwrap().unwrap().is_none()); - assert_eq!( - Arc::new(Int32Vector::from(vec![Some(4), None, Some(6)])) as VectorRef, - *iter.next().unwrap().unwrap().unwrap(), - ); - assert!(iter.next().is_none()) - } - - #[test] - fn test_serialize_to_json() { - let data = vec![ - Some(vec![Some(1), Some(2), Some(3)]), - None, - Some(vec![Some(4), None, Some(6)]), - ]; - - let list_vector = new_list_vector(&data); - assert_eq!( - vec![json!([1, 2, 3]), json!(null), json!([4, null, 6]),], - list_vector.serialize_to_json().unwrap() - ); - } - - #[test] - fn test_list_vector_builder() { - let mut builder = - ListType::new(ConcreteDataType::int32_datatype()).create_mutable_vector(3); - builder - .push_value_ref(ValueRef::List(ListValueRef::Ref { - val: &ListValue::new( - Some(Box::new(vec![ - Value::Int32(4), - Value::Null, - Value::Int32(6), - ])), - ConcreteDataType::int32_datatype(), - ), - })) - .unwrap(); - assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err()); - - let data = vec![ - Some(vec![Some(1), Some(2), Some(3)]), - None, - Some(vec![Some(7), Some(8), None]), - ]; - let input = new_list_vector(&data); - builder.extend_slice_of(&input, 1, 2).unwrap(); - 
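// Hedged sketch of the layout the ListVectorBuilder above maintains: `offsets`
// always starts at 0 and gains one entry per finished list (finish_list
// appends the child length, so a null slot just repeats the previous offset),
// while `validity` carries one bit per slot. Plain Vecs stand in for the
// arrow buffers used by the real code.

fn main() {
    // Logical value, matching the tests around here: [[1, 2, 3], NULL, [4, NULL, 6]]
    let values: Vec<Option<i32>> = vec![Some(1), Some(2), Some(3), Some(4), None, Some(6)];
    let offsets: Vec<i32> = vec![0, 3, 3, 6]; // len + 1 entries; the null list spans 3..3
    let validity = [true, false, true];

    // Reading list `i` back out of the flattened child values:
    for (i, &is_valid) in validity.iter().enumerate() {
        if !is_valid {
            println!("list {i}: NULL");
            continue;
        }
        let (start, end) = (offsets[i] as usize, offsets[i + 1] as usize);
        println!("list {i}: {:?}", &values[start..end]);
    }
}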
assert!(builder - .extend_slice_of(&crate::vectors::Int32Vector::from_slice(&[13]), 0, 1) - .is_err()); - let vector = builder.to_vector(); - - let expect: VectorRef = Arc::new(new_list_vector(&[ - Some(vec![Some(4), None, Some(6)]), - None, - Some(vec![Some(7), Some(8), None]), - ])); - assert_eq!(expect, vector); - } - - #[test] - fn test_list_vector_for_scalar() { - let mut builder = - ListVectorBuilder::with_type_capacity(ConcreteDataType::int32_datatype(), 2); - builder.push(None); - builder.push(Some(ListValueRef::Ref { - val: &ListValue::new( - Some(Box::new(vec![ - Value::Int32(4), - Value::Null, - Value::Int32(6), - ])), - ConcreteDataType::int32_datatype(), - ), - })); - let vector = builder.finish(); - - let expect = new_list_vector(&[None, Some(vec![Some(4), None, Some(6)])]); - assert_eq!(expect, vector); - - assert!(vector.get_data(0).is_none()); - assert_eq!( - ListValueRef::Indexed { - vector: &vector, - idx: 1 - }, - vector.get_data(1).unwrap() - ); - assert_eq!( - *vector.get(1).as_list().unwrap().unwrap(), - vector.get_data(1).unwrap().to_owned_scalar() - ); - - let mut iter = vector.iter_data(); - assert!(iter.next().unwrap().is_none()); - assert_eq!( - ListValueRef::Indexed { - vector: &vector, - idx: 1 - }, - iter.next().unwrap().unwrap() - ); - assert!(iter.next().is_none()); - - let mut iter = vector.iter_data(); - assert_eq!(2, iter.size_hint().0); - assert_eq!( - ListValueRef::Indexed { - vector: &vector, - idx: 1 - }, - iter.nth(1).unwrap().unwrap() - ); - } -} diff --git a/src/datatypes2/src/vectors/null.rs b/src/datatypes2/src/vectors/null.rs deleted file mode 100644 index bb66e09b392b..000000000000 --- a/src/datatypes2/src/vectors/null.rs +++ /dev/null @@ -1,282 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::any::Any; -use std::fmt; -use std::sync::Arc; - -use arrow::array::{Array, ArrayData, ArrayRef, NullArray}; -use snafu::{ensure, OptionExt}; - -use crate::data_type::ConcreteDataType; -use crate::error::{self, Result}; -use crate::serialize::Serializable; -use crate::types::NullType; -use crate::value::{Value, ValueRef}; -use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef}; - -/// A vector where all elements are nulls. -#[derive(PartialEq)] -pub struct NullVector { - array: NullArray, -} - -// TODO(yingwen): Support null vector with other logical types. -impl NullVector { - /// Create a new `NullVector` with `n` elements. 
- pub fn new(n: usize) -> Self { - Self { - array: NullArray::new(n), - } - } - - pub(crate) fn as_arrow(&self) -> &dyn Array { - &self.array - } - - fn to_array_data(&self) -> ArrayData { - self.array.data().clone() - } -} - -impl From for NullVector { - fn from(array: NullArray) -> Self { - Self { array } - } -} - -impl Vector for NullVector { - fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::Null(NullType::default()) - } - - fn vector_type_name(&self) -> String { - "NullVector".to_string() - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn len(&self) -> usize { - self.array.len() - } - - fn to_arrow_array(&self) -> ArrayRef { - // TODO(yingwen): Replaced by clone after upgrading to arrow 28.0. - let data = self.to_array_data(); - Arc::new(NullArray::from(data)) - } - - fn to_boxed_arrow_array(&self) -> Box { - let data = self.to_array_data(); - Box::new(NullArray::from(data)) - } - - fn validity(&self) -> Validity { - Validity::all_null(self.array.len()) - } - - fn memory_size(&self) -> usize { - 0 - } - - fn null_count(&self) -> usize { - self.array.null_count() - } - - fn is_null(&self, _row: usize) -> bool { - true - } - - fn only_null(&self) -> bool { - true - } - - fn slice(&self, _offset: usize, length: usize) -> VectorRef { - Arc::new(Self::new(length)) - } - - fn get(&self, _index: usize) -> Value { - // Skips bound check for null array. - Value::Null - } - - fn get_ref(&self, _index: usize) -> ValueRef { - // Skips bound check for null array. - ValueRef::Null - } -} - -impl fmt::Debug for NullVector { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "NullVector({})", self.len()) - } -} - -impl Serializable for NullVector { - fn serialize_to_json(&self) -> Result> { - Ok(std::iter::repeat(serde_json::Value::Null) - .take(self.len()) - .collect()) - } -} - -vectors::impl_try_from_arrow_array_for_vector!(NullArray, NullVector); - -#[derive(Default)] -pub struct NullVectorBuilder { - length: usize, -} - -impl MutableVector for NullVectorBuilder { - fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::null_datatype() - } - - fn len(&self) -> usize { - self.length - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn as_mut_any(&mut self) -> &mut dyn Any { - self - } - - fn to_vector(&mut self) -> VectorRef { - let vector = Arc::new(NullVector::new(self.length)); - self.length = 0; - vector - } - - fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { - ensure!( - value.is_null(), - error::CastTypeSnafu { - msg: format!("Failed to cast value ref {:?} to null", value), - } - ); - - self.length += 1; - Ok(()) - } - - fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { - vector - .as_any() - .downcast_ref::() - .with_context(|| error::CastTypeSnafu { - msg: format!( - "Failed to convert vector from {} to NullVector", - vector.vector_type_name() - ), - })?; - assert!( - offset + length <= vector.len(), - "offset {} + length {} must less than {}", - offset, - length, - vector.len() - ); - - self.length += length; - Ok(()) - } -} - -pub(crate) fn replicate_null(vector: &NullVector, offsets: &[usize]) -> VectorRef { - assert_eq!(offsets.len(), vector.len()); - - if offsets.is_empty() { - return vector.slice(0, 0); - } - - Arc::new(NullVector::new(*offsets.last().unwrap())) -} - -#[cfg(test)] -mod tests { - use serde_json; - - use super::*; - use crate::data_type::DataType; - - #[test] - fn test_null_vector_misc() { - let v = NullVector::new(32); - - assert_eq!(v.len(), 32); - 
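// NullVector above is the degenerate case of the validity trick: length is
// the only state, every read is null, and memory_size() can honestly return 0.
// A minimal stand-alone sketch of the same shape (AllNulls is an illustrative
// name, not part of the real code):

#[derive(Debug, PartialEq)]
struct AllNulls {
    len: usize,
}

impl AllNulls {
    fn get(&self, _idx: usize) -> Option<i32> {
        None // every slot is null, so reads need no bound check
    }

    fn slice(&self, _offset: usize, length: usize) -> AllNulls {
        AllNulls { len: length } // a slice of nulls is just a shorter AllNulls
    }
}

fn main() {
    let v = AllNulls { len: 32 };
    assert_eq!(v.get(7), None);
    assert_eq!(v.slice(8, 16), AllNulls { len: 16 });
}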
assert_eq!(0, v.memory_size()); - let arrow_arr = v.to_arrow_array(); - assert_eq!(arrow_arr.null_count(), 32); - - let array2 = arrow_arr.slice(8, 16); - assert_eq!(array2.len(), 16); - assert_eq!(array2.null_count(), 16); - - assert_eq!("NullVector", v.vector_type_name()); - assert!(!v.is_const()); - assert!(v.validity().is_all_null()); - assert!(v.only_null()); - - for i in 0..32 { - assert!(v.is_null(i)); - assert_eq!(Value::Null, v.get(i)); - assert_eq!(ValueRef::Null, v.get_ref(i)); - } - } - - #[test] - fn test_debug_null_vector() { - let array = NullVector::new(1024 * 1024); - assert_eq!(format!("{:?}", array), "NullVector(1048576)"); - } - - #[test] - fn test_serialize_json() { - let vector = NullVector::new(3); - let json_value = vector.serialize_to_json().unwrap(); - assert_eq!( - "[null,null,null]", - serde_json::to_string(&json_value).unwrap() - ); - } - - #[test] - fn test_null_vector_validity() { - let vector = NullVector::new(5); - assert!(vector.validity().is_all_null()); - assert_eq!(5, vector.null_count()); - } - - #[test] - fn test_null_vector_builder() { - let mut builder = NullType::default().create_mutable_vector(3); - builder.push_value_ref(ValueRef::Null).unwrap(); - assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err()); - - let input = NullVector::new(3); - builder.extend_slice_of(&input, 1, 2).unwrap(); - assert!(builder - .extend_slice_of(&crate::vectors::Int32Vector::from_slice(&[13]), 0, 1) - .is_err()); - let vector = builder.to_vector(); - - let expect: VectorRef = Arc::new(input); - assert_eq!(expect, vector); - } -} diff --git a/src/datatypes2/src/vectors/operations.rs b/src/datatypes2/src/vectors/operations.rs deleted file mode 100644 index 70ddb4a0317a..000000000000 --- a/src/datatypes2/src/vectors/operations.rs +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -mod filter; -mod find_unique; -mod replicate; - -use common_base::BitVec; - -use crate::error::Result; -use crate::types::LogicalPrimitiveType; -use crate::vectors::constant::ConstantVector; -use crate::vectors::{ - BinaryVector, BooleanVector, ListVector, NullVector, PrimitiveVector, StringVector, Vector, - VectorRef, -}; - -/// Vector compute operations. -pub trait VectorOp { - /// Copies each element according `offsets` parameter. - /// - `i-th` element should be copied `offsets[i] - offsets[i - 1]` times - /// - `0-th` element would be copied `offsets[0]` times - /// - /// # Panics - /// Panics if `offsets.len() != self.len()`. - fn replicate(&self, offsets: &[usize]) -> VectorRef; - - /// Mark `i-th` bit of `selected` to `true` if the `i-th` element of `self` is unique, which - /// means there is no elements behind it have same value as it. - /// - /// The caller should ensure - /// 1. the length of `selected` bitmap is equal to `vector.len()`. - /// 2. `vector` and `prev_vector` are sorted. - /// - /// If there are multiple duplicate elements, this function retains the **first** element. 
- /// The first element is considered as unique if the first element of `self` is different - /// from its previous element, that is the last element of `prev_vector`. - /// - /// # Panics - /// Panics if - /// - `selected.len() < self.len()`. - /// - `prev_vector` and `self` have different data types. - fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>); - - /// Filters the vector, returns elements matching the `filter` (i.e. where the values are true). - /// - /// Note that the nulls of `filter` are interpreted as `false` will lead to these elements being masked out. - fn filter(&self, filter: &BooleanVector) -> Result; -} - -macro_rules! impl_scalar_vector_op { - ($($VectorType: ident),+) => {$( - impl VectorOp for $VectorType { - fn replicate(&self, offsets: &[usize]) -> VectorRef { - replicate::replicate_scalar(self, offsets) - } - - fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) { - let prev_vector = prev_vector.map(|pv| pv.as_any().downcast_ref::<$VectorType>().unwrap()); - find_unique::find_unique_scalar(self, selected, prev_vector); - } - - fn filter(&self, filter: &BooleanVector) -> Result { - filter::filter_non_constant!(self, $VectorType, filter) - } - } - )+}; -} - -impl_scalar_vector_op!(BinaryVector, BooleanVector, ListVector, StringVector); - -impl VectorOp for PrimitiveVector { - fn replicate(&self, offsets: &[usize]) -> VectorRef { - std::sync::Arc::new(replicate::replicate_primitive(self, offsets)) - } - - fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) { - let prev_vector = - prev_vector.and_then(|pv| pv.as_any().downcast_ref::>()); - find_unique::find_unique_scalar(self, selected, prev_vector); - } - - fn filter(&self, filter: &BooleanVector) -> Result { - filter::filter_non_constant!(self, PrimitiveVector, filter) - } -} - -impl VectorOp for NullVector { - fn replicate(&self, offsets: &[usize]) -> VectorRef { - replicate::replicate_null(self, offsets) - } - - fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) { - let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::()); - find_unique::find_unique_null(self, selected, prev_vector); - } - - fn filter(&self, filter: &BooleanVector) -> Result { - filter::filter_non_constant!(self, NullVector, filter) - } -} - -impl VectorOp for ConstantVector { - fn replicate(&self, offsets: &[usize]) -> VectorRef { - self.replicate_vector(offsets) - } - - fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) { - let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::()); - find_unique::find_unique_constant(self, selected, prev_vector); - } - - fn filter(&self, filter: &BooleanVector) -> Result { - self.filter_vector(filter) - } -} diff --git a/src/datatypes2/src/vectors/operations/filter.rs b/src/datatypes2/src/vectors/operations/filter.rs deleted file mode 100644 index 8368a6afb4c4..000000000000 --- a/src/datatypes2/src/vectors/operations/filter.rs +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
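// Restating the find_unique contract documented on VectorOp above, on plain
// sorted slices: set bit i whenever element i differs from its predecessor,
// and treat the last element of `prev` as the predecessor of element 0; bits
// are only ever set, never cleared. A hedged sketch with Vec<bool> standing
// in for BitVec:

fn find_unique(sorted: &[i32], selected: &mut [bool], prev: Option<&[i32]>) {
    assert!(selected.len() >= sorted.len());
    if sorted.is_empty() {
        return;
    }
    for i in 1..sorted.len() {
        if sorted[i] != sorted[i - 1] {
            selected[i] = true;
        }
    }
    // Element 0 is unique unless it equals the last element of `prev`.
    let first_is_new = match prev.and_then(|p| p.last()) {
        Some(last) => *last != sorted[0],
        None => true,
    };
    if first_is_new {
        selected[0] = true;
    }
}

fn main() {
    // Mirrors one of the check_find_unique_scalar cases in the tests below.
    let mut selected = vec![false; 5];
    find_unique(&[5, 6, 6, 7, 8], &mut selected, Some(&[4, 5]));
    assert_eq!(selected, vec![false, true, false, true, true]);
}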
-// See the License for the specific language governing permissions and -// limitations under the License. - -macro_rules! filter_non_constant { - ($vector: expr, $VectorType: ty, $filter: ident) => {{ - use std::sync::Arc; - - use arrow::compute; - use snafu::ResultExt; - - let arrow_array = $vector.as_arrow(); - let filtered = compute::filter(arrow_array, $filter.as_boolean_array()) - .context(crate::error::ArrowComputeSnafu)?; - Ok(Arc::new(<$VectorType>::try_from_arrow_array(filtered)?)) - }}; -} - -pub(crate) use filter_non_constant; - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use common_time::{Date, DateTime}; - - use crate::scalars::ScalarVector; - use crate::timestamp::{ - TimestampMicrosecond, TimestampMillisecond, TimestampNanosecond, TimestampSecond, - }; - use crate::types::WrapperType; - use crate::vectors::constant::ConstantVector; - use crate::vectors::{ - BooleanVector, Int32Vector, NullVector, StringVector, VectorOp, VectorRef, - }; - - fn check_filter_primitive(expect: &[i32], input: &[i32], filter: &[bool]) { - let v = Int32Vector::from_slice(&input); - let filter = BooleanVector::from_slice(filter); - let out = v.filter(&filter).unwrap(); - - let expect: VectorRef = Arc::new(Int32Vector::from_slice(&expect)); - assert_eq!(expect, out); - } - - #[test] - fn test_filter_primitive() { - check_filter_primitive(&[], &[], &[]); - check_filter_primitive(&[5], &[5], &[true]); - check_filter_primitive(&[], &[5], &[false]); - check_filter_primitive(&[], &[5, 6], &[false, false]); - check_filter_primitive(&[5, 6], &[5, 6], &[true, true]); - check_filter_primitive(&[], &[5, 6, 7], &[false, false, false]); - check_filter_primitive(&[5], &[5, 6, 7], &[true, false, false]); - check_filter_primitive(&[6], &[5, 6, 7], &[false, true, false]); - check_filter_primitive(&[7], &[5, 6, 7], &[false, false, true]); - check_filter_primitive(&[5, 7], &[5, 6, 7], &[true, false, true]); - } - - fn check_filter_constant(expect_length: usize, input_length: usize, filter: &[bool]) { - let v = ConstantVector::new(Arc::new(Int32Vector::from_slice(&[123])), input_length); - let filter = BooleanVector::from_slice(filter); - let out = v.filter(&filter).unwrap(); - - assert!(out.is_const()); - assert_eq!(expect_length, out.len()); - } - - #[test] - fn test_filter_constant() { - check_filter_constant(0, 0, &[]); - check_filter_constant(1, 1, &[true]); - check_filter_constant(0, 1, &[false]); - check_filter_constant(1, 2, &[false, true]); - check_filter_constant(2, 2, &[true, true]); - check_filter_constant(1, 4, &[false, false, false, true]); - check_filter_constant(2, 4, &[false, true, false, true]); - } - - #[test] - fn test_filter_scalar() { - let v = StringVector::from_slice(&["0", "1", "2", "3"]); - let filter = BooleanVector::from_slice(&[false, true, false, true]); - let out = v.filter(&filter).unwrap(); - - let expect: VectorRef = Arc::new(StringVector::from_slice(&["1", "3"])); - assert_eq!(expect, out); - } - - #[test] - fn test_filter_null() { - let v = NullVector::new(5); - let filter = BooleanVector::from_slice(&[false, true, false, true, true]); - let out = v.filter(&filter).unwrap(); - - let expect: VectorRef = Arc::new(NullVector::new(3)); - assert_eq!(expect, out); - } - - macro_rules! 
impl_filter_date_like_test { - ($VectorType: ident, $ValueType: ident, $method: ident) => {{ - use std::sync::Arc; - - use $crate::vectors::{$VectorType, VectorRef}; - - let v = $VectorType::from_iterator((0..5).map($ValueType::$method)); - let filter = BooleanVector::from_slice(&[false, true, false, true, true]); - let out = v.filter(&filter).unwrap(); - - let expect: VectorRef = Arc::new($VectorType::from_iterator( - [1, 3, 4].into_iter().map($ValueType::$method), - )); - assert_eq!(expect, out); - }}; - } - - #[test] - fn test_filter_date_like() { - impl_filter_date_like_test!(DateVector, Date, new); - impl_filter_date_like_test!(DateTimeVector, DateTime, new); - - impl_filter_date_like_test!(TimestampSecondVector, TimestampSecond, from_native); - impl_filter_date_like_test!( - TimestampMillisecondVector, - TimestampMillisecond, - from_native - ); - impl_filter_date_like_test!( - TimestampMicrosecondVector, - TimestampMicrosecond, - from_native - ); - impl_filter_date_like_test!(TimestampNanosecondVector, TimestampNanosecond, from_native); - } -} diff --git a/src/datatypes2/src/vectors/operations/find_unique.rs b/src/datatypes2/src/vectors/operations/find_unique.rs deleted file mode 100644 index 7116a9e90d53..000000000000 --- a/src/datatypes2/src/vectors/operations/find_unique.rs +++ /dev/null @@ -1,367 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use common_base::BitVec; - -use crate::scalars::ScalarVector; -use crate::vectors::constant::ConstantVector; -use crate::vectors::{NullVector, Vector}; - -// To implement `find_unique()` correctly, we need to keep in mind that always marks an element as -// selected when it is different from the previous one, and leaves the `selected` unchanged -// in any other case. -pub(crate) fn find_unique_scalar<'a, T: ScalarVector>( - vector: &'a T, - selected: &'a mut BitVec, - prev_vector: Option<&'a T>, -) where - T::RefItem<'a>: PartialEq, -{ - assert!(selected.len() >= vector.len()); - - if vector.is_empty() { - return; - } - - for ((i, current), next) in vector - .iter_data() - .enumerate() - .zip(vector.iter_data().skip(1)) - { - if current != next { - // If next element is a different element, we mark it as selected. - selected.set(i + 1, true); - } - } - - // Marks first element as selected if it is different from previous element, otherwise - // keep selected bitmap unchanged. 
- let is_first_not_duplicate = prev_vector - .map(|pv| { - if pv.is_empty() { - true - } else { - let last = pv.get_data(pv.len() - 1); - last != vector.get_data(0) - } - }) - .unwrap_or(true); - if is_first_not_duplicate { - selected.set(0, true); - } -} - -pub(crate) fn find_unique_null( - vector: &NullVector, - selected: &mut BitVec, - prev_vector: Option<&NullVector>, -) { - if vector.is_empty() { - return; - } - - let is_first_not_duplicate = prev_vector.map(NullVector::is_empty).unwrap_or(true); - if is_first_not_duplicate { - selected.set(0, true); - } -} - -pub(crate) fn find_unique_constant( - vector: &ConstantVector, - selected: &mut BitVec, - prev_vector: Option<&ConstantVector>, -) { - if vector.is_empty() { - return; - } - - let is_first_not_duplicate = prev_vector - .map(|pv| { - if pv.is_empty() { - true - } else { - vector.get_constant_ref() != pv.get_constant_ref() - } - }) - .unwrap_or(true); - - if is_first_not_duplicate { - selected.set(0, true); - } -} - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use common_time::{Date, DateTime}; - - use super::*; - use crate::timestamp::*; - use crate::vectors::{Int32Vector, StringVector, Vector, VectorOp}; - - fn check_bitmap(expect: &[bool], selected: &BitVec) { - let actual = selected.iter().collect::>(); - assert_eq!(expect, actual); - } - - fn check_find_unique_scalar(expect: &[bool], input: &[i32], prev: Option<&[i32]>) { - check_find_unique_scalar_opt(expect, input.iter().map(|v| Some(*v)), prev); - } - - fn check_find_unique_scalar_opt( - expect: &[bool], - input: impl Iterator>, - prev: Option<&[i32]>, - ) { - let input = Int32Vector::from(input.collect::>()); - let prev = prev.map(Int32Vector::from_slice); - - let mut selected = BitVec::repeat(false, input.len()); - input.find_unique(&mut selected, prev.as_ref().map(|v| v as _)); - - check_bitmap(expect, &selected); - } - - #[test] - fn test_find_unique_scalar() { - check_find_unique_scalar(&[], &[], None); - check_find_unique_scalar(&[true], &[1], None); - check_find_unique_scalar(&[true, false], &[1, 1], None); - check_find_unique_scalar(&[true, true], &[1, 2], None); - check_find_unique_scalar(&[true, true, true, true], &[1, 2, 3, 4], None); - check_find_unique_scalar(&[true, false, true, false], &[1, 1, 3, 3], None); - check_find_unique_scalar(&[true, false, false, false, true], &[2, 2, 2, 2, 3], None); - - check_find_unique_scalar(&[true], &[5], Some(&[])); - check_find_unique_scalar(&[true], &[5], Some(&[3])); - check_find_unique_scalar(&[false], &[5], Some(&[5])); - check_find_unique_scalar(&[false], &[5], Some(&[4, 5])); - check_find_unique_scalar(&[false, true], &[5, 6], Some(&[4, 5])); - check_find_unique_scalar(&[false, true, false], &[5, 6, 6], Some(&[4, 5])); - check_find_unique_scalar( - &[false, true, false, true, true], - &[5, 6, 6, 7, 8], - Some(&[4, 5]), - ); - - check_find_unique_scalar_opt( - &[true, true, false, true, false], - [Some(1), Some(2), Some(2), None, None].into_iter(), - None, - ); - } - - #[test] - fn test_find_unique_scalar_multi_times_with_prev() { - let prev = Int32Vector::from_slice(&[1]); - - let v1 = Int32Vector::from_slice(&[2, 3, 4]); - let mut selected = BitVec::repeat(false, v1.len()); - v1.find_unique(&mut selected, Some(&prev)); - - // Though element in v2 are the same as prev, but we should still keep them. 
- let v2 = Int32Vector::from_slice(&[1, 1, 1]); - v2.find_unique(&mut selected, Some(&prev)); - - check_bitmap(&[true, true, true], &selected); - } - - fn new_bitmap(bits: &[bool]) -> BitVec { - BitVec::from_iter(bits) - } - - #[test] - fn test_find_unique_scalar_with_prev() { - let prev = Int32Vector::from_slice(&[1]); - - let mut selected = new_bitmap(&[true, false, true, false]); - let v = Int32Vector::from_slice(&[2, 3, 4, 5]); - v.find_unique(&mut selected, Some(&prev)); - // All elements are different. - check_bitmap(&[true, true, true, true], &selected); - - let mut selected = new_bitmap(&[true, false, true, false]); - let v = Int32Vector::from_slice(&[1, 2, 3, 4]); - v.find_unique(&mut selected, Some(&prev)); - // Though first element is duplicate, but we keep the flag unchanged. - check_bitmap(&[true, true, true, true], &selected); - - // Same case as above, but now `prev` is None. - let mut selected = new_bitmap(&[true, false, true, false]); - let v = Int32Vector::from_slice(&[1, 2, 3, 4]); - v.find_unique(&mut selected, None); - check_bitmap(&[true, true, true, true], &selected); - - // Same case as above, but now `prev` is empty. - let mut selected = new_bitmap(&[true, false, true, false]); - let v = Int32Vector::from_slice(&[1, 2, 3, 4]); - v.find_unique(&mut selected, Some(&Int32Vector::from_slice(&[]))); - check_bitmap(&[true, true, true, true], &selected); - - let mut selected = new_bitmap(&[false, false, false, false]); - let v = Int32Vector::from_slice(&[2, 2, 4, 5]); - v.find_unique(&mut selected, Some(&prev)); - // only v[1] is duplicate. - check_bitmap(&[true, false, true, true], &selected); - } - - fn check_find_unique_null(len: usize) { - let input = NullVector::new(len); - let mut selected = BitVec::repeat(false, input.len()); - input.find_unique(&mut selected, None); - - let mut expect = vec![false; len]; - if !expect.is_empty() { - expect[0] = true; - } - check_bitmap(&expect, &selected); - - let mut selected = BitVec::repeat(false, input.len()); - let prev = Some(NullVector::new(1)); - input.find_unique(&mut selected, prev.as_ref().map(|v| v as _)); - let expect = vec![false; len]; - check_bitmap(&expect, &selected); - } - - #[test] - fn test_find_unique_null() { - for len in 0..5 { - check_find_unique_null(len); - } - } - - #[test] - fn test_find_unique_null_with_prev() { - let prev = NullVector::new(1); - - // Keep flags unchanged. - let mut selected = new_bitmap(&[true, false, true, false]); - let v = NullVector::new(4); - v.find_unique(&mut selected, Some(&prev)); - check_bitmap(&[true, false, true, false], &selected); - - // Keep flags unchanged. - let mut selected = new_bitmap(&[false, false, true, false]); - v.find_unique(&mut selected, Some(&prev)); - check_bitmap(&[false, false, true, false], &selected); - - // Prev is None, select first element. - let mut selected = new_bitmap(&[false, false, true, false]); - v.find_unique(&mut selected, None); - check_bitmap(&[true, false, true, false], &selected); - - // Prev is empty, select first element. 
- let mut selected = new_bitmap(&[false, false, true, false]); - v.find_unique(&mut selected, Some(&NullVector::new(0))); - check_bitmap(&[true, false, true, false], &selected); - } - - fn check_find_unique_constant(len: usize) { - let input = ConstantVector::new(Arc::new(Int32Vector::from_slice(&[8])), len); - let mut selected = BitVec::repeat(false, len); - input.find_unique(&mut selected, None); - - let mut expect = vec![false; len]; - if !expect.is_empty() { - expect[0] = true; - } - check_bitmap(&expect, &selected); - - let mut selected = BitVec::repeat(false, len); - let prev = Some(ConstantVector::new( - Arc::new(Int32Vector::from_slice(&[8])), - 1, - )); - input.find_unique(&mut selected, prev.as_ref().map(|v| v as _)); - let expect = vec![false; len]; - check_bitmap(&expect, &selected); - } - - #[test] - fn test_find_unique_constant() { - for len in 0..5 { - check_find_unique_constant(len); - } - } - - #[test] - fn test_find_unique_constant_with_prev() { - let prev = ConstantVector::new(Arc::new(Int32Vector::from_slice(&[1])), 1); - - // Keep flags unchanged. - let mut selected = new_bitmap(&[true, false, true, false]); - let v = ConstantVector::new(Arc::new(Int32Vector::from_slice(&[1])), 4); - v.find_unique(&mut selected, Some(&prev)); - check_bitmap(&[true, false, true, false], &selected); - - // Keep flags unchanged. - let mut selected = new_bitmap(&[false, false, true, false]); - v.find_unique(&mut selected, Some(&prev)); - check_bitmap(&[false, false, true, false], &selected); - - // Prev is None, select first element. - let mut selected = new_bitmap(&[false, false, true, false]); - v.find_unique(&mut selected, None); - check_bitmap(&[true, false, true, false], &selected); - - // Prev is empty, select first element. - let mut selected = new_bitmap(&[false, false, true, false]); - v.find_unique( - &mut selected, - Some(&ConstantVector::new( - Arc::new(Int32Vector::from_slice(&[1])), - 0, - )), - ); - check_bitmap(&[true, false, true, false], &selected); - - // Different constant vector. - let mut selected = new_bitmap(&[false, false, true, false]); - let v = ConstantVector::new(Arc::new(Int32Vector::from_slice(&[2])), 4); - v.find_unique(&mut selected, Some(&prev)); - check_bitmap(&[true, false, true, false], &selected); - } - - #[test] - fn test_find_unique_string() { - let input = StringVector::from_slice(&["a", "a", "b", "c"]); - let mut selected = BitVec::repeat(false, 4); - input.find_unique(&mut selected, None); - let expect = vec![true, false, true, true]; - check_bitmap(&expect, &selected); - } - - macro_rules! 
impl_find_unique_date_like_test { - ($VectorType: ident, $ValueType: ident, $method: ident) => {{ - use $crate::vectors::$VectorType; - - let v = $VectorType::from_iterator([8, 8, 9, 10].into_iter().map($ValueType::$method)); - let mut selected = BitVec::repeat(false, 4); - v.find_unique(&mut selected, None); - let expect = vec![true, false, true, true]; - check_bitmap(&expect, &selected); - }}; - } - - #[test] - fn test_find_unique_date_like() { - impl_find_unique_date_like_test!(DateVector, Date, new); - impl_find_unique_date_like_test!(DateTimeVector, DateTime, new); - impl_find_unique_date_like_test!(TimestampSecondVector, TimestampSecond, from); - impl_find_unique_date_like_test!(TimestampMillisecondVector, TimestampMillisecond, from); - impl_find_unique_date_like_test!(TimestampMicrosecondVector, TimestampMicrosecond, from); - impl_find_unique_date_like_test!(TimestampNanosecondVector, TimestampNanosecond, from); - } -} diff --git a/src/datatypes2/src/vectors/operations/replicate.rs b/src/datatypes2/src/vectors/operations/replicate.rs deleted file mode 100644 index 8216517fc62d..000000000000 --- a/src/datatypes2/src/vectors/operations/replicate.rs +++ /dev/null @@ -1,170 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
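// replicate.rs below expands a vector by cumulative offsets: element i is
// repeated offsets[i] - offsets[i - 1] times (offsets[0] times for i == 0),
// and offsets.len() must equal the input length. A minimal sketch of that
// contract over plain Vecs:

fn replicate(values: &[i32], offsets: &[usize]) -> Vec<i32> {
    assert_eq!(offsets.len(), values.len());
    let mut out = Vec::with_capacity(offsets.last().copied().unwrap_or(0));
    let mut previous = 0;
    for (&value, &offset) in values.iter().zip(offsets) {
        out.extend(std::iter::repeat(value).take(offset - previous));
        previous = offset;
    }
    out
}

fn main() {
    // Mirrors test_replicate_scalar below, where offsets are [1, 3, 5, 6].
    assert_eq!(
        replicate(&[0, 1, 2, 3], &[1, 3, 5, 6]),
        vec![0, 1, 1, 2, 2, 3]
    );
}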
- -use crate::prelude::*; -pub(crate) use crate::vectors::null::replicate_null; -pub(crate) use crate::vectors::primitive::replicate_primitive; - -pub(crate) fn replicate_scalar(c: &C, offsets: &[usize]) -> VectorRef { - assert_eq!(offsets.len(), c.len()); - - if offsets.is_empty() { - return c.slice(0, 0); - } - let mut builder = <::Builder>::with_capacity(c.len()); - - let mut previous_offset = 0; - for (i, offset) in offsets.iter().enumerate() { - let data = c.get_data(i); - for _ in previous_offset..*offset { - builder.push(data); - } - previous_offset = *offset; - } - builder.to_vector() -} - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use common_time::timestamp::TimeUnit; - use common_time::{Date, DateTime, Timestamp}; - use paste::paste; - - use super::*; - use crate::vectors::constant::ConstantVector; - use crate::vectors::{Int32Vector, NullVector, StringVector, VectorOp}; - - #[test] - fn test_replicate_primitive() { - let v = Int32Vector::from_iterator(0..5); - let offsets = [0, 1, 2, 3, 4]; - - let v = v.replicate(&offsets); - assert_eq!(4, v.len()); - - for i in 0..4 { - assert_eq!(Value::Int32(i as i32 + 1), v.get(i)); - } - } - - #[test] - fn test_replicate_nullable_primitive() { - let v = Int32Vector::from(vec![None, Some(1), None, Some(2)]); - let offsets = [2, 4, 6, 8]; - let v = v.replicate(&offsets); - assert_eq!(8, v.len()); - - let expect: VectorRef = Arc::new(Int32Vector::from(vec![ - None, - None, - Some(1), - Some(1), - None, - None, - Some(2), - Some(2), - ])); - assert_eq!(expect, v); - } - - #[test] - fn test_replicate_scalar() { - let v = StringVector::from_slice(&["0", "1", "2", "3"]); - let offsets = [1, 3, 5, 6]; - - let v = v.replicate(&offsets); - assert_eq!(6, v.len()); - - let expect: VectorRef = Arc::new(StringVector::from_slice(&["0", "1", "1", "2", "2", "3"])); - assert_eq!(expect, v); - } - - #[test] - fn test_replicate_constant() { - let v = Arc::new(StringVector::from_slice(&["hello"])); - let cv = ConstantVector::new(v.clone(), 2); - let offsets = [1, 4]; - - let cv = cv.replicate(&offsets); - assert_eq!(4, cv.len()); - - let expect: VectorRef = Arc::new(ConstantVector::new(v, 4)); - assert_eq!(expect, cv); - } - - #[test] - fn test_replicate_null() { - let v = NullVector::new(0); - let offsets = []; - let v = v.replicate(&offsets); - assert!(v.is_empty()); - - let v = NullVector::new(3); - let offsets = [1, 3, 5]; - - let v = v.replicate(&offsets); - assert_eq!(5, v.len()); - } - - macro_rules! impl_replicate_date_like_test { - ($VectorType: ident, $ValueType: ident, $method: ident) => {{ - use $crate::vectors::$VectorType; - - let v = $VectorType::from_iterator((0..5).map($ValueType::$method)); - let offsets = [0, 1, 2, 3, 4]; - - let v = v.replicate(&offsets); - assert_eq!(4, v.len()); - - for i in 0..4 { - assert_eq!( - Value::$ValueType($ValueType::$method((i as i32 + 1).into())), - v.get(i) - ); - } - }}; - } - - macro_rules! 
impl_replicate_timestamp_test { - ($unit: ident) => {{ - paste!{ - use $crate::vectors::[]; - use $crate::timestamp::[]; - let v = []::from_iterator((0..5).map([]::from)); - let offsets = [0, 1, 2, 3, 4]; - let v = v.replicate(&offsets); - assert_eq!(4, v.len()); - for i in 0..4 { - assert_eq!( - Value::Timestamp(Timestamp::new(i as i64 + 1, TimeUnit::$unit)), - v.get(i) - ); - } - } - }}; - } - - #[test] - fn test_replicate_date_like() { - impl_replicate_date_like_test!(DateVector, Date, new); - impl_replicate_date_like_test!(DateTimeVector, DateTime, new); - - impl_replicate_timestamp_test!(Second); - impl_replicate_timestamp_test!(Millisecond); - impl_replicate_timestamp_test!(Microsecond); - impl_replicate_timestamp_test!(Nanosecond); - } -} diff --git a/src/datatypes2/src/vectors/primitive.rs b/src/datatypes2/src/vectors/primitive.rs deleted file mode 100644 index 7829c3173131..000000000000 --- a/src/datatypes2/src/vectors/primitive.rs +++ /dev/null @@ -1,552 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::any::Any; -use std::fmt; -use std::sync::Arc; - -use arrow::array::{ - Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef, PrimitiveArray, PrimitiveBuilder, -}; -use serde_json::Value as JsonValue; -use snafu::OptionExt; - -use crate::data_type::ConcreteDataType; -use crate::error::{self, Result}; -use crate::scalars::{Scalar, ScalarRef, ScalarVector, ScalarVectorBuilder}; -use crate::serialize::Serializable; -use crate::types::{ - Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, LogicalPrimitiveType, - UInt16Type, UInt32Type, UInt64Type, UInt8Type, WrapperType, -}; -use crate::value::{Value, ValueRef}; -use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef}; - -pub type UInt8Vector = PrimitiveVector; -pub type UInt16Vector = PrimitiveVector; -pub type UInt32Vector = PrimitiveVector; -pub type UInt64Vector = PrimitiveVector; - -pub type Int8Vector = PrimitiveVector; -pub type Int16Vector = PrimitiveVector; -pub type Int32Vector = PrimitiveVector; -pub type Int64Vector = PrimitiveVector; - -pub type Float32Vector = PrimitiveVector; -pub type Float64Vector = PrimitiveVector; - -/// Vector for primitive data types. -pub struct PrimitiveVector { - array: PrimitiveArray, -} - -impl PrimitiveVector { - pub fn new(array: PrimitiveArray) -> Self { - Self { array } - } - - pub fn try_from_arrow_array(array: impl AsRef) -> Result { - let data = array - .as_ref() - .as_any() - .downcast_ref::>() - .with_context(|| error::ConversionSnafu { - from: format!("{:?}", array.as_ref().data_type()), - })? 
- .data() - .clone(); - let concrete_array = PrimitiveArray::::from(data); - Ok(Self::new(concrete_array)) - } - - pub fn from_slice>(slice: P) -> Self { - let iter = slice.as_ref().iter().copied(); - Self { - array: PrimitiveArray::from_iter_values(iter), - } - } - - pub fn from_wrapper_slice>(slice: P) -> Self { - let iter = slice.as_ref().iter().copied().map(WrapperType::into_native); - Self { - array: PrimitiveArray::from_iter_values(iter), - } - } - - pub fn from_vec(array: Vec) -> Self { - Self { - array: PrimitiveArray::from_iter_values(array), - } - } - - pub fn from_values>(iter: I) -> Self { - Self { - array: PrimitiveArray::from_iter_values(iter), - } - } - - pub(crate) fn as_arrow(&self) -> &PrimitiveArray { - &self.array - } - - fn to_array_data(&self) -> ArrayData { - self.array.data().clone() - } - - fn from_array_data(data: ArrayData) -> Self { - Self { - array: PrimitiveArray::from(data), - } - } - - // To distinguish with `Vector::slice()`. - fn get_slice(&self, offset: usize, length: usize) -> Self { - let data = self.array.data().slice(offset, length); - Self::from_array_data(data) - } -} - -impl Vector for PrimitiveVector { - fn data_type(&self) -> ConcreteDataType { - T::build_data_type() - } - - fn vector_type_name(&self) -> String { - format!("{}Vector", T::type_name()) - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn len(&self) -> usize { - self.array.len() - } - - fn to_arrow_array(&self) -> ArrayRef { - let data = self.to_array_data(); - Arc::new(PrimitiveArray::::from(data)) - } - - fn to_boxed_arrow_array(&self) -> Box { - let data = self.to_array_data(); - Box::new(PrimitiveArray::::from(data)) - } - - fn validity(&self) -> Validity { - vectors::impl_validity_for_vector!(self.array) - } - - fn memory_size(&self) -> usize { - self.array.get_buffer_memory_size() - } - - fn null_count(&self) -> usize { - self.array.null_count() - } - - fn is_null(&self, row: usize) -> bool { - self.array.is_null(row) - } - - fn slice(&self, offset: usize, length: usize) -> VectorRef { - let data = self.array.data().slice(offset, length); - Arc::new(Self::from_array_data(data)) - } - - fn get(&self, index: usize) -> Value { - if self.array.is_valid(index) { - // Safety: The index have been checked by `is_valid()`. - let wrapper = unsafe { T::Wrapper::from_native(self.array.value_unchecked(index)) }; - wrapper.into() - } else { - Value::Null - } - } - - fn get_ref(&self, index: usize) -> ValueRef { - if self.array.is_valid(index) { - // Safety: The index have been checked by `is_valid()`. 
- let wrapper = unsafe { T::Wrapper::from_native(self.array.value_unchecked(index)) }; - wrapper.into() - } else { - ValueRef::Null - } - } -} - -impl fmt::Debug for PrimitiveVector { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_struct("PrimitiveVector") - .field("array", &self.array) - .finish() - } -} - -impl From> for PrimitiveVector { - fn from(array: PrimitiveArray) -> Self { - Self { array } - } -} - -impl From>> for PrimitiveVector { - fn from(v: Vec>) -> Self { - Self { - array: PrimitiveArray::from_iter(v), - } - } -} - -pub struct PrimitiveIter<'a, T: LogicalPrimitiveType> { - iter: ArrayIter<&'a PrimitiveArray>, -} - -impl<'a, T: LogicalPrimitiveType> Iterator for PrimitiveIter<'a, T> { - type Item = Option; - - fn next(&mut self) -> Option> { - self.iter - .next() - .map(|item| item.map(T::Wrapper::from_native)) - } - - fn size_hint(&self) -> (usize, Option) { - self.iter.size_hint() - } -} - -impl ScalarVector for PrimitiveVector { - type OwnedItem = T::Wrapper; - type RefItem<'a> = T::Wrapper; - type Iter<'a> = PrimitiveIter<'a, T>; - type Builder = PrimitiveVectorBuilder; - - fn get_data(&self, idx: usize) -> Option> { - if self.array.is_valid(idx) { - Some(T::Wrapper::from_native(self.array.value(idx))) - } else { - None - } - } - - fn iter_data(&self) -> Self::Iter<'_> { - PrimitiveIter { - iter: self.array.iter(), - } - } -} - -impl Serializable for PrimitiveVector { - fn serialize_to_json(&self) -> Result> { - let res = self - .iter_data() - .map(|v| match v { - None => serde_json::Value::Null, - // use WrapperType's Into bound instead of - // serde_json::to_value to facilitate customized serialization - // for WrapperType - Some(v) => v.into(), - }) - .collect::>(); - Ok(res) - } -} - -impl PartialEq for PrimitiveVector { - fn eq(&self, other: &PrimitiveVector) -> bool { - self.array == other.array - } -} - -pub type UInt8VectorBuilder = PrimitiveVectorBuilder; -pub type UInt16VectorBuilder = PrimitiveVectorBuilder; -pub type UInt32VectorBuilder = PrimitiveVectorBuilder; -pub type UInt64VectorBuilder = PrimitiveVectorBuilder; - -pub type Int8VectorBuilder = PrimitiveVectorBuilder; -pub type Int16VectorBuilder = PrimitiveVectorBuilder; -pub type Int32VectorBuilder = PrimitiveVectorBuilder; -pub type Int64VectorBuilder = PrimitiveVectorBuilder; - -pub type Float32VectorBuilder = PrimitiveVectorBuilder; -pub type Float64VectorBuilder = PrimitiveVectorBuilder; - -/// Builder to build a primitive vector. -pub struct PrimitiveVectorBuilder { - mutable_array: PrimitiveBuilder, -} - -impl MutableVector for PrimitiveVectorBuilder { - fn data_type(&self) -> ConcreteDataType { - T::build_data_type() - } - - fn len(&self) -> usize { - self.mutable_array.len() - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn as_mut_any(&mut self) -> &mut dyn Any { - self - } - - fn to_vector(&mut self) -> VectorRef { - Arc::new(self.finish()) - } - - fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { - let primitive = T::cast_value_ref(value)?; - match primitive { - Some(v) => self.mutable_array.append_value(v.into_native()), - None => self.mutable_array.append_null(), - } - Ok(()) - } - - fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { - let primitive = T::cast_vector(vector)?; - // Slice the underlying array to avoid creating a new Arc. 
- let slice = primitive.get_slice(offset, length); - for v in slice.iter_data() { - self.push(v); - } - Ok(()) - } -} - -impl ScalarVectorBuilder for PrimitiveVectorBuilder -where - T: LogicalPrimitiveType, - T::Wrapper: Scalar>, - for<'a> T::Wrapper: ScalarRef<'a, ScalarType = T::Wrapper>, - for<'a> T::Wrapper: Scalar = T::Wrapper>, -{ - type VectorType = PrimitiveVector; - - fn with_capacity(capacity: usize) -> Self { - Self { - mutable_array: PrimitiveBuilder::with_capacity(capacity), - } - } - - fn push(&mut self, value: Option<::RefItem<'_>>) { - self.mutable_array - .append_option(value.map(|v| v.into_native())); - } - - fn finish(&mut self) -> Self::VectorType { - PrimitiveVector { - array: self.mutable_array.finish(), - } - } -} - -pub(crate) fn replicate_primitive( - vector: &PrimitiveVector, - offsets: &[usize], -) -> PrimitiveVector { - assert_eq!(offsets.len(), vector.len()); - - if offsets.is_empty() { - return vector.get_slice(0, 0); - } - - let mut builder = PrimitiveVectorBuilder::::with_capacity(*offsets.last().unwrap() as usize); - - let mut previous_offset = 0; - - for (offset, value) in offsets.iter().zip(vector.array.iter()) { - let repeat_times = *offset - previous_offset; - match value { - Some(data) => { - unsafe { - // Safety: std::iter::Repeat and std::iter::Take implement TrustedLen. - builder - .mutable_array - .append_trusted_len_iter(std::iter::repeat(data).take(repeat_times)); - } - } - None => { - builder.mutable_array.append_nulls(repeat_times); - } - } - previous_offset = *offset; - } - builder.finish() -} - -#[cfg(test)] -mod tests { - use arrow::array::Int32Array; - use arrow::datatypes::DataType as ArrowDataType; - use serde_json; - - use super::*; - use crate::data_type::DataType; - use crate::serialize::Serializable; - use crate::types::Int64Type; - - fn check_vec(v: Int32Vector) { - assert_eq!(4, v.len()); - assert_eq!("Int32Vector", v.vector_type_name()); - assert!(!v.is_const()); - assert!(v.validity().is_all_valid()); - assert!(!v.only_null()); - - for i in 0..4 { - assert!(!v.is_null(i)); - assert_eq!(Value::Int32(i as i32 + 1), v.get(i)); - assert_eq!(ValueRef::Int32(i as i32 + 1), v.get_ref(i)); - } - - let json_value = v.serialize_to_json().unwrap(); - assert_eq!("[1,2,3,4]", serde_json::to_string(&json_value).unwrap(),); - - let arrow_arr = v.to_arrow_array(); - assert_eq!(4, arrow_arr.len()); - assert_eq!(&ArrowDataType::Int32, arrow_arr.data_type()); - } - - #[test] - fn test_from_values() { - let v = Int32Vector::from_values(vec![1, 2, 3, 4]); - check_vec(v); - } - - #[test] - fn test_from_vec() { - let v = Int32Vector::from_vec(vec![1, 2, 3, 4]); - check_vec(v); - } - - #[test] - fn test_from_slice() { - let v = Int32Vector::from_slice(vec![1, 2, 3, 4]); - check_vec(v); - } - - #[test] - fn test_serialize_primitive_vector_with_null_to_json() { - let input = [Some(1i32), Some(2i32), None, Some(4i32), None]; - let mut builder = Int32VectorBuilder::with_capacity(input.len()); - for v in input { - builder.push(v); - } - let vector = builder.finish(); - - let json_value = vector.serialize_to_json().unwrap(); - assert_eq!( - "[1,2,null,4,null]", - serde_json::to_string(&json_value).unwrap(), - ); - } - - #[test] - fn test_from_arrow_array() { - let arrow_array = Int32Array::from(vec![1, 2, 3, 4]); - let v = Int32Vector::from(arrow_array); - check_vec(v); - } - - #[test] - fn test_primitive_vector_build_get() { - let input = [Some(1i32), Some(2i32), None, Some(4i32), None]; - let mut builder = Int32VectorBuilder::with_capacity(input.len()); - 
for v in input { - builder.push(v); - } - let vector = builder.finish(); - assert_eq!(input.len(), vector.len()); - - for (i, v) in input.into_iter().enumerate() { - assert_eq!(v, vector.get_data(i)); - assert_eq!(Value::from(v), vector.get(i)); - } - - let res: Vec<_> = vector.iter_data().collect(); - assert_eq!(input, &res[..]); - } - - #[test] - fn test_primitive_vector_validity() { - let input = [Some(1i32), Some(2i32), None, None]; - let mut builder = Int32VectorBuilder::with_capacity(input.len()); - for v in input { - builder.push(v); - } - let vector = builder.finish(); - assert_eq!(2, vector.null_count()); - let validity = vector.validity(); - assert_eq!(2, validity.null_count()); - assert!(!validity.is_set(2)); - assert!(!validity.is_set(3)); - - let vector = Int32Vector::from_slice(vec![1, 2, 3, 4]); - assert_eq!(0, vector.null_count()); - assert!(vector.validity().is_all_valid()); - } - - #[test] - fn test_memory_size() { - let v = Int32Vector::from_slice((0..5).collect::>()); - assert_eq!(64, v.memory_size()); - let v = Int64Vector::from(vec![Some(0i64), Some(1i64), Some(2i64), None, None]); - assert_eq!(128, v.memory_size()); - } - - #[test] - fn test_primitive_vector_builder() { - let mut builder = Int64Type::default().create_mutable_vector(3); - builder.push_value_ref(ValueRef::Int64(123)).unwrap(); - assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err()); - - let input = Int64Vector::from_slice(&[7, 8, 9]); - builder.extend_slice_of(&input, 1, 2).unwrap(); - assert!(builder - .extend_slice_of(&Int32Vector::from_slice(&[13]), 0, 1) - .is_err()); - let vector = builder.to_vector(); - - let expect: VectorRef = Arc::new(Int64Vector::from_slice(&[123, 8, 9])); - assert_eq!(expect, vector); - } - - #[test] - fn test_from_wrapper_slice() { - macro_rules! test_from_wrapper_slice { - ($vec: ident, $ty: ident) => { - let from_wrapper_slice = $vec::from_wrapper_slice(&[ - $ty::from_native($ty::MAX), - $ty::from_native($ty::MIN), - ]); - let from_slice = $vec::from_slice(&[$ty::MAX, $ty::MIN]); - assert_eq!(from_wrapper_slice, from_slice); - }; - } - - test_from_wrapper_slice!(UInt8Vector, u8); - test_from_wrapper_slice!(Int8Vector, i8); - test_from_wrapper_slice!(UInt16Vector, u16); - test_from_wrapper_slice!(Int16Vector, i16); - test_from_wrapper_slice!(UInt32Vector, u32); - test_from_wrapper_slice!(Int32Vector, i32); - test_from_wrapper_slice!(UInt64Vector, u64); - test_from_wrapper_slice!(Int64Vector, i64); - test_from_wrapper_slice!(Float32Vector, f32); - test_from_wrapper_slice!(Float64Vector, f64); - } -} diff --git a/src/datatypes2/src/vectors/string.rs b/src/datatypes2/src/vectors/string.rs deleted file mode 100644 index 252116b3b2dd..000000000000 --- a/src/datatypes2/src/vectors/string.rs +++ /dev/null @@ -1,370 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
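// string.rs below wraps arrow's StringArray behind the same Vector and
// ScalarVector traits; the builder surface it relies on is append_value /
// append_null / finish, with finish resetting the builder for reuse. A
// pure-std stand-in that models the same nullable string column (illustrative
// names only):

#[derive(Debug, PartialEq)]
struct StringColumn {
    data: Vec<Option<String>>,
}

#[derive(Default)]
struct StringColumnBuilder {
    data: Vec<Option<String>>,
}

impl StringColumnBuilder {
    fn append_value(&mut self, v: &str) {
        self.data.push(Some(v.to_string()));
    }

    fn append_null(&mut self) {
        self.data.push(None);
    }

    fn finish(&mut self) -> StringColumn {
        StringColumn {
            data: std::mem::take(&mut self.data), // builder is reusable afterwards
        }
    }
}

fn main() {
    let mut b = StringColumnBuilder::default();
    b.append_value("hello");
    b.append_null();
    b.append_value("world");
    let col = b.finish();
    assert_eq!(col.data, vec![Some("hello".into()), None, Some("world".into())]);
}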
- -use std::any::Any; -use std::sync::Arc; - -use arrow::array::{Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef}; -use snafu::ResultExt; - -use crate::arrow_array::{MutableStringArray, StringArray}; -use crate::data_type::ConcreteDataType; -use crate::error::{self, Result}; -use crate::scalars::{ScalarVector, ScalarVectorBuilder}; -use crate::serialize::Serializable; -use crate::value::{Value, ValueRef}; -use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef}; - -/// Vector of strings. -#[derive(Debug, PartialEq)] -pub struct StringVector { - array: StringArray, -} - -impl StringVector { - pub(crate) fn as_arrow(&self) -> &dyn Array { - &self.array - } - - fn to_array_data(&self) -> ArrayData { - self.array.data().clone() - } - - fn from_array_data(data: ArrayData) -> Self { - Self { - array: StringArray::from(data), - } - } -} - -impl From for StringVector { - fn from(array: StringArray) -> Self { - Self { array } - } -} - -impl From>> for StringVector { - fn from(data: Vec>) -> Self { - Self { - array: StringArray::from_iter(data), - } - } -} - -impl From>> for StringVector { - fn from(data: Vec>) -> Self { - Self { - array: StringArray::from_iter(data), - } - } -} - -impl From<&[Option]> for StringVector { - fn from(data: &[Option]) -> Self { - Self { - array: StringArray::from_iter(data), - } - } -} - -impl From<&[Option<&str>]> for StringVector { - fn from(data: &[Option<&str>]) -> Self { - Self { - array: StringArray::from_iter(data), - } - } -} - -impl From> for StringVector { - fn from(data: Vec) -> Self { - Self { - array: StringArray::from_iter(data.into_iter().map(Some)), - } - } -} - -impl From> for StringVector { - fn from(data: Vec<&str>) -> Self { - Self { - array: StringArray::from_iter(data.into_iter().map(Some)), - } - } -} - -impl Vector for StringVector { - fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::string_datatype() - } - - fn vector_type_name(&self) -> String { - "StringVector".to_string() - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn len(&self) -> usize { - self.array.len() - } - - fn to_arrow_array(&self) -> ArrayRef { - let data = self.to_array_data(); - Arc::new(StringArray::from(data)) - } - - fn to_boxed_arrow_array(&self) -> Box { - let data = self.to_array_data(); - Box::new(StringArray::from(data)) - } - - fn validity(&self) -> Validity { - vectors::impl_validity_for_vector!(self.array) - } - - fn memory_size(&self) -> usize { - self.array.get_buffer_memory_size() - } - - fn null_count(&self) -> usize { - self.array.null_count() - } - - fn is_null(&self, row: usize) -> bool { - self.array.is_null(row) - } - - fn slice(&self, offset: usize, length: usize) -> VectorRef { - let data = self.array.data().slice(offset, length); - Arc::new(Self::from_array_data(data)) - } - - fn get(&self, index: usize) -> Value { - vectors::impl_get_for_vector!(self.array, index) - } - - fn get_ref(&self, index: usize) -> ValueRef { - vectors::impl_get_ref_for_vector!(self.array, index) - } -} - -impl ScalarVector for StringVector { - type OwnedItem = String; - type RefItem<'a> = &'a str; - type Iter<'a> = ArrayIter<&'a StringArray>; - type Builder = StringVectorBuilder; - - fn get_data(&self, idx: usize) -> Option> { - if self.array.is_valid(idx) { - Some(self.array.value(idx)) - } else { - None - } - } - - fn iter_data(&self) -> Self::Iter<'_> { - self.array.iter() - } -} - -pub struct StringVectorBuilder { - mutable_array: MutableStringArray, -} - -impl MutableVector for StringVectorBuilder { - fn data_type(&self) -> 
ConcreteDataType { - ConcreteDataType::string_datatype() - } - - fn len(&self) -> usize { - self.mutable_array.len() - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn as_mut_any(&mut self) -> &mut dyn Any { - self - } - - fn to_vector(&mut self) -> VectorRef { - Arc::new(self.finish()) - } - - fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { - match value.as_string()? { - Some(v) => self.mutable_array.append_value(v), - None => self.mutable_array.append_null(), - } - Ok(()) - } - - fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { - vectors::impl_extend_for_builder!(self, vector, StringVector, offset, length) - } -} - -impl ScalarVectorBuilder for StringVectorBuilder { - type VectorType = StringVector; - - fn with_capacity(capacity: usize) -> Self { - Self { - mutable_array: MutableStringArray::with_capacity(capacity, 0), - } - } - - fn push(&mut self, value: Option<::RefItem<'_>>) { - match value { - Some(v) => self.mutable_array.append_value(v), - None => self.mutable_array.append_null(), - } - } - - fn finish(&mut self) -> Self::VectorType { - StringVector { - array: self.mutable_array.finish(), - } - } -} - -impl Serializable for StringVector { - fn serialize_to_json(&self) -> Result> { - self.iter_data() - .map(serde_json::to_value) - .collect::>() - .context(error::SerializeSnafu) - } -} - -vectors::impl_try_from_arrow_array_for_vector!(StringArray, StringVector); - -#[cfg(test)] -mod tests { - use arrow::datatypes::DataType; - - use super::*; - - #[test] - fn test_string_vector_build_get() { - let mut builder = StringVectorBuilder::with_capacity(4); - builder.push(Some("hello")); - builder.push(None); - builder.push(Some("world")); - let vector = builder.finish(); - - assert_eq!(Some("hello"), vector.get_data(0)); - assert_eq!(None, vector.get_data(1)); - assert_eq!(Some("world"), vector.get_data(2)); - - // Get out of bound - assert!(vector.try_get(3).is_err()); - - assert_eq!(Value::String("hello".into()), vector.get(0)); - assert_eq!(Value::Null, vector.get(1)); - assert_eq!(Value::String("world".into()), vector.get(2)); - - let mut iter = vector.iter_data(); - assert_eq!("hello", iter.next().unwrap().unwrap()); - assert_eq!(None, iter.next().unwrap()); - assert_eq!("world", iter.next().unwrap().unwrap()); - assert_eq!(None, iter.next()); - } - - #[test] - fn test_string_vector_builder() { - let mut builder = StringVectorBuilder::with_capacity(3); - builder.push_value_ref(ValueRef::String("hello")).unwrap(); - assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err()); - - let input = StringVector::from_slice(&["world", "one", "two"]); - builder.extend_slice_of(&input, 1, 2).unwrap(); - assert!(builder - .extend_slice_of(&crate::vectors::Int32Vector::from_slice(&[13]), 0, 1) - .is_err()); - let vector = builder.to_vector(); - - let expect: VectorRef = Arc::new(StringVector::from_slice(&["hello", "one", "two"])); - assert_eq!(expect, vector); - } - - #[test] - fn test_string_vector_misc() { - let strs = vec!["hello", "greptime", "rust"]; - let v = StringVector::from(strs.clone()); - assert_eq!(3, v.len()); - assert_eq!("StringVector", v.vector_type_name()); - assert!(!v.is_const()); - assert!(v.validity().is_all_valid()); - assert!(!v.only_null()); - assert_eq!(128, v.memory_size()); - - for (i, s) in strs.iter().enumerate() { - assert_eq!(Value::from(*s), v.get(i)); - assert_eq!(ValueRef::from(*s), v.get_ref(i)); - assert_eq!(Value::from(*s), v.try_get(i).unwrap()); - } - - let arrow_arr = v.to_arrow_array(); 
- assert_eq!(3, arrow_arr.len()); - assert_eq!(&DataType::Utf8, arrow_arr.data_type()); - } - - #[test] - fn test_serialize_string_vector() { - let mut builder = StringVectorBuilder::with_capacity(3); - builder.push(Some("hello")); - builder.push(None); - builder.push(Some("world")); - let string_vector = builder.finish(); - let serialized = - serde_json::to_string(&string_vector.serialize_to_json().unwrap()).unwrap(); - assert_eq!(r#"["hello",null,"world"]"#, serialized); - } - - #[test] - fn test_from_arrow_array() { - let mut builder = MutableStringArray::new(); - builder.append_option(Some("A")); - builder.append_option(Some("B")); - builder.append_null(); - builder.append_option(Some("D")); - let string_array: StringArray = builder.finish(); - let vector = StringVector::from(string_array); - assert_eq!( - r#"["A","B",null,"D"]"#, - serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(), - ); - } - - #[test] - fn test_from_non_option_string() { - let nul = String::from_utf8(vec![0]).unwrap(); - let corpus = vec!["😅😅😅", "😍😍😍😍", "🥵🥵", nul.as_str()]; - let vector = StringVector::from(corpus); - let serialized = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(); - assert_eq!(r#"["😅😅😅","😍😍😍😍","🥵🥵","\u0000"]"#, serialized); - - let corpus = vec![ - "🀀🀀🀀".to_string(), - "🀁🀁🀁".to_string(), - "🀂🀂🀂".to_string(), - "🀃🀃🀃".to_string(), - "🀆🀆".to_string(), - ]; - let vector = StringVector::from(corpus); - let serialized = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(); - assert_eq!(r#"["🀀🀀🀀","🀁🀁🀁","🀂🀂🀂","🀃🀃🀃","🀆🀆"]"#, serialized); - } -} diff --git a/src/datatypes2/src/vectors/timestamp.rs b/src/datatypes2/src/vectors/timestamp.rs deleted file mode 100644 index 5d9f7f2ed1fc..000000000000 --- a/src/datatypes2/src/vectors/timestamp.rs +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use crate::types::{ - TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, - TimestampSecondType, -}; -use crate::vectors::{PrimitiveVector, PrimitiveVectorBuilder}; - -pub type TimestampSecondVector = PrimitiveVector; -pub type TimestampSecondVectorBuilder = PrimitiveVectorBuilder; - -pub type TimestampMillisecondVector = PrimitiveVector; -pub type TimestampMillisecondVectorBuilder = PrimitiveVectorBuilder; - -pub type TimestampMicrosecondVector = PrimitiveVector; -pub type TimestampMicrosecondVectorBuilder = PrimitiveVectorBuilder; - -pub type TimestampNanosecondVector = PrimitiveVector; -pub type TimestampNanosecondVectorBuilder = PrimitiveVectorBuilder; diff --git a/src/frontend/Cargo.toml b/src/frontend/Cargo.toml index 56065fe1c08d..90c712067130 100644 --- a/src/frontend/Cargo.toml +++ b/src/frontend/Cargo.toml @@ -22,11 +22,9 @@ common-recordbatch = { path = "../common/recordbatch" } common-runtime = { path = "../common/runtime" } common-telemetry = { path = "../common/telemetry" } common-time = { path = "../common/time" } -datafusion = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2", features = [ - "simd", -] } -datafusion-common = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" } -datafusion-expr = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" } +datafusion = "14.0.0" +datafusion-common = "14.0.0" +datafusion-expr = "14.0.0" datanode = { path = "../datanode" } datatypes = { path = "../datatypes" } futures = "0.3" @@ -44,7 +42,6 @@ servers = { path = "../servers" } session = { path = "../session" } snafu = { version = "0.7", features = ["backtraces"] } sql = { path = "../sql" } -sqlparser = "0.15" store-api = { path = "../store-api" } substrait = { path = "../common/substrait" } table = { path = "../table" } diff --git a/src/frontend/src/error.rs b/src/frontend/src/error.rs index eae56a12f837..2f40eec4b2ab 100644 --- a/src/frontend/src/error.rs +++ b/src/frontend/src/error.rs @@ -17,6 +17,7 @@ use std::any::Any; use common_error::prelude::*; use common_query::logical_plan::Expr; use datafusion_common::ScalarValue; +use datatypes::prelude::Value; use store_api::storage::RegionId; #[derive(Debug, Snafu)] @@ -437,6 +438,17 @@ pub enum Error { source: substrait::error::Error, }, + #[snafu(display( + "Failed to build a vector from values, value: {}, source: {}", + value, + source + ))] + BuildVector { + value: Value, + #[snafu(backtrace)] + source: datatypes::error::Error, + }, + #[snafu(display("Failed to invoke GRPC server, source: {}", source))] InvokeGrpcServer { #[snafu(backtrace)] @@ -533,6 +545,7 @@ impl ErrorExt for Error { Error::LeaderNotFound { .. } => StatusCode::StorageUnavailable, Error::TableAlreadyExist { .. } => StatusCode::TableAlreadyExists, Error::EncodeSubstraitLogicalPlan { source } => source.status_code(), + Error::BuildVector { source, .. 
} => source.status_code(), } } diff --git a/src/frontend/src/expr_factory.rs b/src/frontend/src/expr_factory.rs index 9f406ace0b71..204eb42d92b3 100644 --- a/src/frontend/src/expr_factory.rs +++ b/src/frontend/src/expr_factory.rs @@ -19,9 +19,9 @@ use api::helper::ColumnDataTypeWrapper; use api::v1::{Column, ColumnDataType, CreateExpr}; use datatypes::schema::ColumnSchema; use snafu::{ensure, ResultExt}; +use sql::ast::{ColumnDef, TableConstraint}; use sql::statements::create::{CreateTable, TIME_INDEX}; use sql::statements::{column_def_to_schema, table_idents_to_full_name}; -use sqlparser::ast::{ColumnDef, TableConstraint}; use crate::error::{ BuildCreateExprOnInsertionSnafu, ColumnDataTypeSnafu, ConvertColumnDefaultConstraintSnafu, diff --git a/src/frontend/src/instance.rs b/src/frontend/src/instance.rs index 64b2bac22a34..730c16d3b4c8 100644 --- a/src/frontend/src/instance.rs +++ b/src/frontend/src/instance.rs @@ -695,22 +695,26 @@ mod tests { .await .unwrap(); match output { - Output::Stream(stream) => { - let recordbatches = RecordBatches::try_collect(stream).await.unwrap(); - let pretty_print = recordbatches.pretty_print(); - let pretty_print = pretty_print.lines().collect::>(); - let expected = vec![ - "+----------------+---------------------+-----+--------+-----------+", - "| host | ts | cpu | memory | disk_util |", - "+----------------+---------------------+-----+--------+-----------+", - "| frontend.host1 | 1970-01-01 00:00:01 | 1.1 | 100 | 9.9 |", - "| frontend.host2 | 1970-01-01 00:00:02 | | | 9.9 |", - "| frontend.host3 | 1970-01-01 00:00:03 | 3.3 | 300 | 9.9 |", - "+----------------+---------------------+-----+--------+-----------+", - ]; + Output::RecordBatches(_) => { + unreachable!("Output::RecordBatches"); + } + Output::AffectedRows(_) => { + unreachable!("Output::AffectedRows"); + } + Output::Stream(s) => { + let batches = common_recordbatch::util::collect_batches(s).await.unwrap(); + let pretty_print = batches.pretty_print().unwrap(); + let expected = "\ ++----------------+---------------------+-----+--------+-----------+ +| host | ts | cpu | memory | disk_util | ++----------------+---------------------+-----+--------+-----------+ +| frontend.host1 | 1970-01-01T00:00:01 | 1.1 | 100 | 9.9 | +| frontend.host2 | 1970-01-01T00:00:02 | | | 9.9 | +| frontend.host3 | 1970-01-01T00:00:03 | 3.3 | 300 | 9.9 | ++----------------+---------------------+-----+--------+-----------+\ + "; assert_eq!(pretty_print, expected); } - _ => unreachable!(), }; let sql = "select * from demo where ts>cast(1000000000 as timestamp)"; // use nanoseconds as where condition @@ -718,21 +722,26 @@ mod tests { .await .unwrap(); match output { - Output::Stream(stream) => { - let recordbatches = RecordBatches::try_collect(stream).await.unwrap(); - let pretty_print = recordbatches.pretty_print(); - let pretty_print = pretty_print.lines().collect::>(); - let expected = vec![ - "+----------------+---------------------+-----+--------+-----------+", - "| host | ts | cpu | memory | disk_util |", - "+----------------+---------------------+-----+--------+-----------+", - "| frontend.host2 | 1970-01-01 00:00:02 | | | 9.9 |", - "| frontend.host3 | 1970-01-01 00:00:03 | 3.3 | 300 | 9.9 |", - "+----------------+---------------------+-----+--------+-----------+", - ]; - assert_eq!(pretty_print, expected); + Output::RecordBatches(_) => { + unreachable!("Output::RecordBatches") + } + Output::AffectedRows(_) => { + unreachable!("Output::AffectedRows") + } + Output::Stream(s) => { + let recordbatches = 
common_recordbatch::util::collect_batches(s).await.unwrap(); + let pretty = recordbatches.pretty_print().unwrap(); + let expected = "\ ++----------------+---------------------+-----+--------+-----------+ +| host | ts | cpu | memory | disk_util | ++----------------+---------------------+-----+--------+-----------+ +| frontend.host2 | 1970-01-01T00:00:02 | | | 9.9 | +| frontend.host3 | 1970-01-01T00:00:03 | 3.3 | 300 | 9.9 | ++----------------+---------------------+-----+--------+-----------+\ + " + .to_string(); + assert_eq!(pretty, expected); } - _ => unreachable!(), }; } @@ -787,11 +796,11 @@ mod tests { let expected_ts_col = Column { column_name: "ts".to_string(), values: Some(column::Values { - ts_millis_values: vec![1000, 2000, 3000, 4000], + ts_millisecond_values: vec![1000, 2000, 3000, 4000], ..Default::default() }), semantic_type: SemanticType::Timestamp as i32, - datatype: ColumnDataType::Timestamp as i32, + datatype: ColumnDataType::TimestampMillisecond as i32, ..Default::default() }; @@ -909,7 +918,7 @@ mod tests { }, GrpcColumnDef { name: "ts".to_string(), - datatype: ColumnDataType::Timestamp as i32, + datatype: ColumnDataType::TimestampMillisecond as i32, is_nullable: true, default_constraint: None, }, diff --git a/src/frontend/src/instance/distributed.rs b/src/frontend/src/instance/distributed.rs index 2613654f8fbd..a44e4596fa13 100644 --- a/src/frontend/src/instance/distributed.rs +++ b/src/frontend/src/instance/distributed.rs @@ -43,10 +43,10 @@ use servers::error as server_error; use servers::query_handler::{GrpcAdminHandler, GrpcQueryHandler, SqlQueryHandler}; use session::context::QueryContextRef; use snafu::{ensure, OptionExt, ResultExt}; +use sql::ast::Value as SqlValue; use sql::statements::create::Partitions; use sql::statements::sql_value_to_value; use sql::statements::statement::Statement; -use sqlparser::ast::Value as SqlValue; use table::metadata::{RawTableInfo, RawTableMeta, TableIdent, TableType}; use crate::catalog::FrontendCatalogManager; @@ -522,11 +522,12 @@ fn find_partition_columns( #[cfg(test)] mod test { + use itertools::Itertools; use servers::query_handler::SqlQueryHandlerRef; use session::context::QueryContext; + use sql::dialect::GenericDialect; use sql::parser::ParserContext; use sql::statements::statement::Statement; - use sqlparser::dialect::GenericDialect; use super::*; use crate::expr_factory::{CreateExprFactory, DefaultCreateExprFactory}; @@ -604,7 +605,9 @@ ENGINE=mito", "| public |", "| test_show_databases |", "+---------------------+", - ]; + ] + .into_iter() + .join("\n"); let expected2 = vec![ "+---------------------+", "| Schemas |", @@ -612,9 +615,10 @@ ENGINE=mito", "| test_show_databases |", "| public |", "+---------------------+", - ]; - let pretty = r.pretty_print(); - let lines = pretty.lines().collect::>(); + ] + .into_iter() + .join("\n"); + let lines = r.pretty_print().unwrap(); assert!(lines == expected1 || lines == expected2) } _ => unreachable!(), @@ -654,14 +658,12 @@ ENGINE=mito", let output = instance.do_query(sql, QueryContext::arc()).await.unwrap(); match output { Output::RecordBatches(r) => { - let expected = vec![ - "+--------------+", - "| Tables |", - "+--------------+", - "| dist_numbers |", - "+--------------+", - ]; - assert_eq!(r.pretty_print().lines().collect::>(), expected); + let expected = r#"+--------------+ +| Tables | ++--------------+ +| dist_numbers | ++--------------+"#; + assert_eq!(r.pretty_print().unwrap(), expected); } _ => unreachable!(), } diff --git a/src/frontend/src/instance/opentsdb.rs 
b/src/frontend/src/instance/opentsdb.rs index 842a45240e75..9bcec20bb75c 100644 --- a/src/frontend/src/instance/opentsdb.rs +++ b/src/frontend/src/instance/opentsdb.rs @@ -63,7 +63,7 @@ mod tests { use common_query::Output; use common_recordbatch::RecordBatches; - use datafusion::arrow_print; + use itertools::Itertools; use servers::query_handler::SqlQueryHandler; use session::context::QueryContext; @@ -134,22 +134,18 @@ mod tests { match output { Output::Stream(stream) => { let recordbatches = RecordBatches::try_collect(stream).await.unwrap(); - let recordbatches = recordbatches - .take() - .into_iter() - .map(|r| r.df_recordbatch) - .collect::>(); - let pretty_print = arrow_print::write(&recordbatches); - let pretty_print = pretty_print.lines().collect::>(); + let pretty_print = recordbatches.pretty_print().unwrap(); let expected = vec![ "+---------------------+----------------+-------+-------+-------+", "| greptime_timestamp | greptime_value | tagk1 | tagk2 | tagk3 |", "+---------------------+----------------+-------+-------+-------+", - "| 1970-01-01 00:00:01 | 1 | tagv1 | tagv2 | |", - "| 1970-01-01 00:00:02 | 2 | | tagv2 | tagv3 |", - "| 1970-01-01 00:00:03 | 3 | | | |", + "| 1970-01-01T00:00:01 | 1 | tagv1 | tagv2 | |", + "| 1970-01-01T00:00:02 | 2 | | tagv2 | tagv3 |", + "| 1970-01-01T00:00:03 | 3 | | | |", "+---------------------+----------------+-------+-------+-------+", - ]; + ] + .into_iter() + .join("\n"); assert_eq!(pretty_print, expected); } _ => unreachable!(), diff --git a/src/frontend/src/mysql.rs b/src/frontend/src/mysql.rs index a0f8ef796138..87888b147b2b 100644 --- a/src/frontend/src/mysql.rs +++ b/src/frontend/src/mysql.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::sync::Arc; - use serde::{Deserialize, Serialize}; use servers::tls::TlsOption; @@ -22,7 +20,7 @@ pub struct MysqlOptions { pub addr: String, pub runtime_size: usize, #[serde(default = "Default::default")] - pub tls: Arc, + pub tls: TlsOption, } impl Default for MysqlOptions { @@ -30,7 +28,7 @@ impl Default for MysqlOptions { Self { addr: "127.0.0.1:4002".to_string(), runtime_size: 2, - tls: Arc::new(TlsOption::default()), + tls: TlsOption::default(), } } } diff --git a/src/frontend/src/postgres.rs b/src/frontend/src/postgres.rs index c2df2f54dc14..144758f315e3 100644 --- a/src/frontend/src/postgres.rs +++ b/src/frontend/src/postgres.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::sync::Arc; - use serde::{Deserialize, Serialize}; use servers::tls::TlsOption; @@ -22,7 +20,7 @@ pub struct PostgresOptions { pub addr: String, pub runtime_size: usize, #[serde(default = "Default::default")] - pub tls: Arc, + pub tls: TlsOption, } impl Default for PostgresOptions { diff --git a/src/frontend/src/spliter.rs b/src/frontend/src/spliter.rs index eb87907651fd..f70116b69e25 100644 --- a/src/frontend/src/spliter.rs +++ b/src/frontend/src/spliter.rs @@ -14,8 +14,10 @@ use std::collections::HashMap; +use datatypes::data_type::DataType; +use datatypes::prelude::MutableVector; use datatypes::value::Value; -use datatypes::vectors::{VectorBuilder, VectorRef}; +use datatypes::vectors::VectorRef; use snafu::{ensure, OptionExt}; use store_api::storage::RegionNumber; use table::requests::InsertRequest; @@ -125,9 +127,16 @@ fn partition_insert_request( insert: &InsertRequest, region_map: HashMap>, ) -> DistInsertRequest { - let mut dist_insert: HashMap> = + let mut dist_insert: HashMap>> = HashMap::with_capacity(region_map.len()); + let row_num = insert + .columns_values + .values() + .next() + .map(|v| v.len()) + .unwrap_or(0); + let column_count = insert.columns_values.len(); for (column_name, vector) in &insert.columns_values { for (region_id, val_idxs) in ®ion_map { @@ -136,10 +145,13 @@ fn partition_insert_request( .or_insert_with(|| HashMap::with_capacity(column_count)); let builder = region_insert .entry(column_name) - .or_insert_with(|| VectorBuilder::new(vector.data_type())); - val_idxs - .iter() - .for_each(|idx| builder.push(&vector.get(*idx))); + .or_insert_with(|| vector.data_type().create_mutable_vector(row_num)); + val_idxs.iter().for_each(|idx| { + // Safety: MutableVector is built according to column data type. + builder + .push_value_ref(vector.get(*idx).as_value_ref()) + .unwrap(); + }); } } @@ -151,7 +163,7 @@ fn partition_insert_request( .map(|(region_id, vector_map)| { let columns_values = vector_map .into_iter() - .map(|(column_name, mut builder)| (column_name.to_string(), builder.finish())) + .map(|(column_name, mut builder)| (column_name.to_string(), builder.to_vector())) .collect(); ( region_id, @@ -175,9 +187,12 @@ mod tests { use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use datatypes::data_type::ConcreteDataType; - use datatypes::types::{BooleanType, StringType}; + use datatypes::prelude::ScalarVectorBuilder; + use datatypes::types::StringType; use datatypes::value::Value; - use datatypes::vectors::VectorBuilder; + use datatypes::vectors::{ + BooleanVectorBuilder, Int16VectorBuilder, MutableVector, StringVectorBuilder, + }; use serde::{Deserialize, Serialize}; use store_api::storage::RegionNumber; use table::requests::InsertRequest; @@ -339,17 +354,17 @@ mod tests { #[test] fn test_partition_values() { - let mut builder = VectorBuilder::new(ConcreteDataType::Boolean(BooleanType)); - builder.push(&true.into()); - builder.push(&false.into()); - builder.push(&true.into()); - let v1 = builder.finish(); - - let mut builder = VectorBuilder::new(ConcreteDataType::String(StringType)); - builder.push(&"host1".into()); - builder.push_null(); - builder.push(&"host3".into()); - let v2 = builder.finish(); + let mut builder = BooleanVectorBuilder::with_capacity(3); + builder.push(Some(true)); + builder.push(Some(false)); + builder.push(Some(true)); + let v1 = builder.to_vector(); + + let mut builder = StringVectorBuilder::with_capacity(3); + builder.push(Some("host1")); + builder.push(None); + builder.push(Some("host3")); + let v2 = 
builder.to_vector(); let vectors = vec![v1, v2]; @@ -368,23 +383,23 @@ mod tests { fn mock_insert_request() -> InsertRequest { let mut columns_values = HashMap::with_capacity(4); - let mut builder = VectorBuilder::new(ConcreteDataType::Boolean(BooleanType)); - builder.push(&true.into()); - builder.push(&false.into()); - builder.push(&true.into()); - columns_values.insert("enable_reboot".to_string(), builder.finish()); - - let mut builder = VectorBuilder::new(ConcreteDataType::String(StringType)); - builder.push(&"host1".into()); - builder.push_null(); - builder.push(&"host3".into()); - columns_values.insert("host".to_string(), builder.finish()); - - let mut builder = VectorBuilder::new(ConcreteDataType::int16_datatype()); - builder.push(&1_i16.into()); - builder.push(&2_i16.into()); - builder.push(&3_i16.into()); - columns_values.insert("id".to_string(), builder.finish()); + let mut builder = BooleanVectorBuilder::with_capacity(3); + builder.push(Some(true)); + builder.push(Some(false)); + builder.push(Some(true)); + columns_values.insert("enable_reboot".to_string(), builder.to_vector()); + + let mut builder = StringVectorBuilder::with_capacity(3); + builder.push(Some("host1")); + builder.push(None); + builder.push(Some("host3")); + columns_values.insert("host".to_string(), builder.to_vector()); + + let mut builder = Int16VectorBuilder::with_capacity(3); + builder.push(Some(1_i16)); + builder.push(Some(2_i16)); + builder.push(Some(3_i16)); + columns_values.insert("id".to_string(), builder.to_vector()); InsertRequest { catalog_name: DEFAULT_CATALOG_NAME.to_string(), @@ -396,22 +411,22 @@ mod tests { fn mock_wrong_insert_request() -> InsertRequest { let mut columns_values = HashMap::with_capacity(4); - let mut builder = VectorBuilder::new(ConcreteDataType::Boolean(BooleanType)); - builder.push(&true.into()); - builder.push(&false.into()); - builder.push(&true.into()); - columns_values.insert("enable_reboot".to_string(), builder.finish()); - - let mut builder = VectorBuilder::new(ConcreteDataType::String(StringType)); - builder.push(&"host1".into()); - builder.push_null(); - builder.push(&"host3".into()); - columns_values.insert("host".to_string(), builder.finish()); - - let mut builder = VectorBuilder::new(ConcreteDataType::int16_datatype()); - builder.push(&1_i16.into()); + let mut builder = BooleanVectorBuilder::with_capacity(3); + builder.push(Some(true)); + builder.push(Some(false)); + builder.push(Some(true)); + columns_values.insert("enable_reboot".to_string(), builder.to_vector()); + + let mut builder = StringVectorBuilder::with_capacity(3); + builder.push(Some("host1")); + builder.push(None); + builder.push(Some("host3")); + columns_values.insert("host".to_string(), builder.to_vector()); + + let mut builder = Int16VectorBuilder::with_capacity(1); + builder.push(Some(1_i16)); // two values are missing - columns_values.insert("id".to_string(), builder.finish()); + columns_values.insert("id".to_string(), builder.to_vector()); InsertRequest { catalog_name: DEFAULT_CATALOG_NAME.to_string(), diff --git a/src/frontend/src/sql.rs b/src/frontend/src/sql.rs index f888d5e83617..8814ef2bf5dd 100644 --- a/src/frontend/src/sql.rs +++ b/src/frontend/src/sql.rs @@ -14,15 +14,15 @@ use catalog::SchemaProviderRef; use common_error::snafu::ensure; -use datatypes::prelude::ConcreteDataType; -use datatypes::vectors::VectorBuilder; +use datatypes::data_type::DataType; +use datatypes::prelude::{ConcreteDataType, MutableVector}; use snafu::{OptionExt, ResultExt}; use sql::ast::Value as SqlValue; use 
sql::statements; use sql::statements::insert::Insert; use table::requests::InsertRequest; -use crate::error::{self, Result}; +use crate::error::{self, BuildVectorSnafu, Result}; // TODO(fys): Extract the common logic in datanode and frontend in the future. #[allow(dead_code)] @@ -49,7 +49,7 @@ pub(crate) fn insert_to_request( }; let rows_num = values.len(); - let mut columns_builders: Vec<(&String, &ConcreteDataType, VectorBuilder)> = + let mut columns_builders: Vec<(&String, &ConcreteDataType, Box)> = Vec::with_capacity(columns_num); if columns.is_empty() { @@ -58,7 +58,7 @@ pub(crate) fn insert_to_request( columns_builders.push(( &column_schema.name, data_type, - VectorBuilder::with_capacity(data_type.clone(), rows_num), + data_type.create_mutable_vector(rows_num), )); } } else { @@ -73,7 +73,7 @@ pub(crate) fn insert_to_request( columns_builders.push(( column_name, data_type, - VectorBuilder::with_capacity(data_type.clone(), rows_num), + data_type.create_mutable_vector(rows_num), )); } } @@ -100,7 +100,7 @@ pub(crate) fn insert_to_request( table_name, columns_values: columns_builders .into_iter() - .map(|(c, _, mut b)| (c.to_owned(), b.finish())) + .map(|(c, _, mut b)| (c.to_owned(), b.to_vector())) .collect(), }) } @@ -109,11 +109,12 @@ fn add_row_to_vector( column_name: &str, data_type: &ConcreteDataType, sql_val: &SqlValue, - builder: &mut VectorBuilder, + builder: &mut Box, ) -> Result<()> { let value = statements::sql_value_to_value(column_name, data_type, sql_val) .context(error::ParseSqlSnafu)?; - builder.push(&value); - + builder + .push_value_ref(value.as_value_ref()) + .context(BuildVectorSnafu { value })?; Ok(()) } diff --git a/src/frontend/src/table.rs b/src/frontend/src/table.rs index ac97d2dc3ca2..2d157f30a8dd 100644 --- a/src/frontend/src/table.rs +++ b/src/frontend/src/table.rs @@ -29,12 +29,13 @@ use common_query::physical_plan::{PhysicalPlan, PhysicalPlanRef}; use common_recordbatch::adapter::AsyncRecordBatchStreamAdapter; use common_recordbatch::{RecordBatches, SendableRecordBatchStream}; use common_telemetry::debug; -use datafusion::execution::runtime_env::RuntimeEnv; -use datafusion::logical_plan::Expr as DfExpr; +use datafusion::execution::context::TaskContext; use datafusion::physical_plan::{ Partitioning, SendableRecordBatchStream as DfSendableRecordBatchStream, }; use datafusion_common::DataFusionError; +use datafusion_expr::expr::Expr as DfExpr; +use datafusion_expr::BinaryExpr; use datatypes::prelude::Value; use datatypes::schema::{ColumnSchema, Schema, SchemaRef}; use meta_client::rpc::{Peer, TableName}; @@ -198,7 +199,7 @@ impl DistTable { ) -> Result> { let expr = filter.df_expr(); match expr { - DfExpr::BinaryExpr { left, op, right } if is_compare_op(op) => { + DfExpr::BinaryExpr(BinaryExpr { left, op, right }) if is_compare_op(op) => { let column_op_value = match (left.as_ref(), right.as_ref()) { (DfExpr::Column(c), DfExpr::Literal(v)) => Some((&c.name, *op, v)), (DfExpr::Literal(v), DfExpr::Column(c)) => { @@ -217,7 +218,7 @@ impl DistTable { .collect::>()); } } - DfExpr::BinaryExpr { left, op, right } + DfExpr::BinaryExpr(BinaryExpr { left, op, right }) if matches!(op, Operator::And | Operator::Or) => { let left_regions = @@ -449,7 +450,7 @@ impl PhysicalPlan for DistTableScan { fn execute( &self, partition: usize, - _runtime: Arc, + _context: Arc, ) -> QueryResult { let exec = self.partition_execs[partition].clone(); let stream = Box::pin(async move { @@ -515,18 +516,20 @@ mod test { use datafusion::physical_plan::expressions::{col as physical_col, 
PhysicalSortExpr}; use datafusion::physical_plan::sorts::sort::SortExec; use datafusion::physical_plan::ExecutionPlan; + use datafusion::prelude::SessionContext; + use datafusion::sql::sqlparser; use datafusion_expr::expr_fn::{and, binary_expr, col, or}; use datafusion_expr::lit; use datanode::instance::Instance; - use datatypes::arrow::compute::sort::SortOptions; + use datatypes::arrow::compute::SortOptions; use datatypes::prelude::ConcreteDataType; use datatypes::schema::{ColumnSchema, Schema}; + use itertools::Itertools; use meta_client::client::MetaClient; use meta_client::rpc::router::RegionRoute; use meta_client::rpc::{Region, Table, TableRoute}; use sql::parser::ParserContext; use sql::statements::statement::Statement; - use sqlparser::dialect::GenericDialect; use table::metadata::{TableInfoBuilder, TableMetaBuilder}; use table::TableRef; @@ -733,7 +736,6 @@ mod test { #[tokio::test(flavor = "multi_thread")] async fn test_dist_table_scan() { let table = Arc::new(new_dist_table().await); - // should scan all regions // select a, row_id from numbers let projection = Some(vec![1, 2]); @@ -859,6 +861,7 @@ mod test { expected_partitions: usize, expected_output: Vec<&str>, ) { + let expected_output = expected_output.into_iter().join("\n"); let table_scan = table .scan(&projection, filters.as_slice(), None) .await @@ -877,21 +880,17 @@ mod test { options: SortOptions::default(), }], Arc::new(merge), + None, ) .unwrap(); assert_eq!(sort.output_partitioning().partition_count(), 1); - let stream = sort - .execute(0, Arc::new(RuntimeEnv::default())) - .await - .unwrap(); + let session_ctx = SessionContext::new(); + let stream = sort.execute(0, session_ctx.task_ctx()).unwrap(); let stream = Box::pin(RecordBatchStreamAdapter::try_new(stream).unwrap()); let recordbatches = RecordBatches::try_collect(stream).await.unwrap(); - assert_eq!( - recordbatches.pretty_print().lines().collect::>(), - expected_output - ); + assert_eq!(recordbatches.pretty_print().unwrap(), expected_output); } async fn new_dist_table() -> DistTable { @@ -923,14 +922,16 @@ mod test { PARTITION r3 VALUES LESS THAN (MAXVALUE), ) ENGINE=mito"; - let create_table = match ParserContext::create_with_dialect(sql, &GenericDialect {}) - .unwrap() - .pop() - .unwrap() - { - Statement::CreateTable(c) => c, - _ => unreachable!(), - }; + + let create_table = + match ParserContext::create_with_dialect(sql, &sqlparser::dialect::GenericDialect {}) + .unwrap() + .pop() + .unwrap() + { + Statement::CreateTable(c) => c, + _ => unreachable!(), + }; let mut expr = DefaultCreateExprFactory .create_expr_by_stmt(&create_table) diff --git a/src/frontend/src/table/insert.rs b/src/frontend/src/table/insert.rs index 409632474f80..fb23b0e7925f 100644 --- a/src/frontend/src/table/insert.rs +++ b/src/frontend/src/table/insert.rs @@ -107,7 +107,7 @@ pub fn insert_request_to_insert_batch(insert: &InsertRequest) -> Result<(Vec InsertRequest { let mut columns_values = HashMap::with_capacity(4); - let mut builder = VectorBuilder::new(ConcreteDataType::String(StringType)); - builder.push(&"host1".into()); - builder.push_null(); - builder.push(&"host3".into()); - columns_values.insert("host".to_string(), builder.finish()); - - let mut builder = VectorBuilder::new(ConcreteDataType::int16_datatype()); - builder.push(&1_i16.into()); - builder.push(&2_i16.into()); - builder.push(&3_i16.into()); - columns_values.insert("id".to_string(), builder.finish()); + let mut builder = StringVectorBuilder::with_capacity(3); + builder.push(Some("host1")); + builder.push(None); + 
builder.push(Some("host3")); + columns_values.insert("host".to_string(), builder.to_vector()); + + let mut builder = Int16VectorBuilder::with_capacity(3); + builder.push(Some(1_i16)); + builder.push(Some(2_i16)); + builder.push(Some(3_i16)); + columns_values.insert("id".to_string(), builder.to_vector()); InsertRequest { catalog_name: DEFAULT_CATALOG_NAME.to_string(), diff --git a/src/frontend/src/table/scan.rs b/src/frontend/src/table/scan.rs index 14ea9a6a93df..3d9f623aeb37 100644 --- a/src/frontend/src/table/scan.rs +++ b/src/frontend/src/table/scan.rs @@ -20,7 +20,8 @@ use client::{Database, ObjectResult}; use common_query::prelude::Expr; use common_query::Output; use common_recordbatch::{util, RecordBatches}; -use datafusion::logical_plan::{LogicalPlan, LogicalPlanBuilder}; +use datafusion::datasource::DefaultTableSource; +use datafusion_expr::{LogicalPlan, LogicalPlanBuilder}; use meta_client::rpc::TableName; use snafu::ResultExt; use substrait::{DFLogicalSubstraitConvertor, SubstraitPlan}; @@ -82,7 +83,7 @@ impl DatanodeInstance { let mut builder = LogicalPlanBuilder::scan_with_filters( &table_scan.table_name.to_string(), - table_provider, + Arc::new(DefaultTableSource::new(table_provider)), table_scan.projection.clone(), table_scan .filters @@ -104,11 +105,9 @@ impl DatanodeInstance { .context(error::BuildDfLogicalPlanSnafu)?; } - if let Some(limit) = table_scan.limit { - builder = builder - .limit(limit) - .context(error::BuildDfLogicalPlanSnafu)?; - } + builder + .limit(0, table_scan.limit) + .context(error::BuildDfLogicalPlanSnafu)?; builder.build().context(error::BuildDfLogicalPlanSnafu) } diff --git a/src/mito/Cargo.toml b/src/mito/Cargo.toml index 63612075f77d..583ff9f3f56c 100644 --- a/src/mito/Cargo.toml +++ b/src/mito/Cargo.toml @@ -19,10 +19,8 @@ common-query = { path = "../common/query" } common-recordbatch = { path = "../common/recordbatch" } common-telemetry = { path = "../common/telemetry" } common-time = { path = "../common/time" } -datafusion = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2", features = [ - "simd", -] } -datafusion-common = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" } +datafusion = "14.0.0" +datafusion-common = "14.0.0" datatypes = { path = "../datatypes" } futures = "0.3" log-store = { path = "../log-store" } diff --git a/src/mito/src/engine.rs b/src/mito/src/engine.rs index 48d334482189..480dd16cea60 100644 --- a/src/mito/src/engine.rs +++ b/src/mito/src/engine.rs @@ -519,13 +519,14 @@ impl MitoEngineInner { #[cfg(test)] mod tests { - use common_query::physical_plan::RuntimeEnv; + use common_query::physical_plan::SessionContext; use common_recordbatch::util; - use datafusion_common::field_util::{FieldExt, SchemaExt}; - use datatypes::prelude::{ConcreteDataType, ScalarVector}; + use datatypes::prelude::ConcreteDataType; use datatypes::schema::{ColumnDefaultConstraint, ColumnSchema, SchemaBuilder}; use datatypes::value::Value; - use datatypes::vectors::*; + use datatypes::vectors::{ + Float64Vector, Int32Vector, StringVector, TimestampMillisecondVector, VectorRef, + }; use log_store::fs::noop::NoopLogStore; use storage::config::EngineConfig as StorageEngineConfig; use storage::EngineImpl; @@ -600,30 +601,29 @@ mod tests { let (_dir, table_name, table) = setup_table_with_column_default_constraint().await; let mut columns_values: HashMap = HashMap::with_capacity(4); - let names = StringVector::from(vec!["first", "second"]); - let tss = TimestampVector::from_vec(vec![1, 2]); + let names: 
VectorRef = Arc::new(StringVector::from(vec!["first", "second"])); + let tss: VectorRef = Arc::new(TimestampMillisecondVector::from_vec(vec![1, 2])); - columns_values.insert("name".to_string(), Arc::new(names.clone())); - columns_values.insert("ts".to_string(), Arc::new(tss.clone())); + columns_values.insert("name".to_string(), names.clone()); + columns_values.insert("ts".to_string(), tss.clone()); let insert_req = new_insert_request(table_name.to_string(), columns_values); assert_eq!(2, table.insert(insert_req).await.unwrap()); + let session_ctx = SessionContext::new(); let stream = table.scan(&None, &[], None).await.unwrap(); - let stream = stream.execute(0, Arc::new(RuntimeEnv::default())).unwrap(); + let stream = stream.execute(0, session_ctx.task_ctx()).unwrap(); let batches = util::collect(stream).await.unwrap(); assert_eq!(1, batches.len()); - let record = &batches[0].df_recordbatch; + let record = &batches[0]; assert_eq!(record.num_columns(), 3); - let columns = record.columns(); - assert_eq!(3, columns.len()); - assert_eq!(names.to_arrow_array(), columns[0]); + assert_eq!(names, *record.column(0)); assert_eq!( - Int32Vector::from_vec(vec![42, 42]).to_arrow_array(), - columns[1] + Arc::new(Int32Vector::from_vec(vec![42, 42])) as VectorRef, + *record.column(1) ); - assert_eq!(tss.to_arrow_array(), columns[2]); + assert_eq!(tss, *record.column(2)); } #[tokio::test] @@ -631,29 +631,28 @@ mod tests { let (_dir, table_name, table) = setup_table_with_column_default_constraint().await; let mut columns_values: HashMap = HashMap::with_capacity(4); - let names = StringVector::from(vec!["first", "second"]); - let nums = Int32Vector::from(vec![None, Some(66)]); - let tss = TimestampVector::from_vec(vec![1, 2]); + let names: VectorRef = Arc::new(StringVector::from(vec!["first", "second"])); + let nums: VectorRef = Arc::new(Int32Vector::from(vec![None, Some(66)])); + let tss: VectorRef = Arc::new(TimestampMillisecondVector::from_vec(vec![1, 2])); - columns_values.insert("name".to_string(), Arc::new(names.clone())); - columns_values.insert("n".to_string(), Arc::new(nums.clone())); - columns_values.insert("ts".to_string(), Arc::new(tss.clone())); + columns_values.insert("name".to_string(), names.clone()); + columns_values.insert("n".to_string(), nums.clone()); + columns_values.insert("ts".to_string(), tss.clone()); let insert_req = new_insert_request(table_name.to_string(), columns_values); assert_eq!(2, table.insert(insert_req).await.unwrap()); + let session_ctx = SessionContext::new(); let stream = table.scan(&None, &[], None).await.unwrap(); - let stream = stream.execute(0, Arc::new(RuntimeEnv::default())).unwrap(); + let stream = stream.execute(0, session_ctx.task_ctx()).unwrap(); let batches = util::collect(stream).await.unwrap(); assert_eq!(1, batches.len()); - let record = &batches[0].df_recordbatch; + let record = &batches[0]; assert_eq!(record.num_columns(), 3); - let columns = record.columns(); - assert_eq!(3, columns.len()); - assert_eq!(names.to_arrow_array(), columns[0]); - assert_eq!(nums.to_arrow_array(), columns[1]); - assert_eq!(tss.to_arrow_array(), columns[2]); + assert_eq!(names, *record.column(0)); + assert_eq!(nums, *record.column(1)); + assert_eq!(tss, *record.column(2)); } #[test] @@ -724,73 +723,73 @@ mod tests { assert_eq!(0, table.insert(insert_req).await.unwrap()); let mut columns_values: HashMap = HashMap::with_capacity(4); - let hosts = StringVector::from(vec!["host1", "host2"]); - let cpus = Float64Vector::from_vec(vec![55.5, 66.6]); - let memories = 
Float64Vector::from_vec(vec![1024f64, 4096f64]); - let tss = TimestampVector::from_vec(vec![1, 2]); + let hosts: VectorRef = Arc::new(StringVector::from(vec!["host1", "host2"])); + let cpus: VectorRef = Arc::new(Float64Vector::from_vec(vec![55.5, 66.6])); + let memories: VectorRef = Arc::new(Float64Vector::from_vec(vec![1024f64, 4096f64])); + let tss: VectorRef = Arc::new(TimestampMillisecondVector::from_vec(vec![1, 2])); - columns_values.insert("host".to_string(), Arc::new(hosts.clone())); - columns_values.insert("cpu".to_string(), Arc::new(cpus.clone())); - columns_values.insert("memory".to_string(), Arc::new(memories.clone())); - columns_values.insert("ts".to_string(), Arc::new(tss.clone())); + columns_values.insert("host".to_string(), hosts.clone()); + columns_values.insert("cpu".to_string(), cpus.clone()); + columns_values.insert("memory".to_string(), memories.clone()); + columns_values.insert("ts".to_string(), tss.clone()); let insert_req = new_insert_request("demo".to_string(), columns_values); assert_eq!(2, table.insert(insert_req).await.unwrap()); + let session_ctx = SessionContext::new(); let stream = table.scan(&None, &[], None).await.unwrap(); - let stream = stream.execute(0, Arc::new(RuntimeEnv::default())).unwrap(); + let stream = stream.execute(0, session_ctx.task_ctx()).unwrap(); let batches = util::collect(stream).await.unwrap(); assert_eq!(1, batches.len()); - assert_eq!(batches[0].df_recordbatch.num_columns(), 4); - - let arrow_schema = batches[0].schema.arrow_schema(); - assert_eq!(arrow_schema.fields().len(), 4); - - assert_eq!(arrow_schema.field(0).name(), "host"); - assert_eq!(arrow_schema.field(1).name(), "cpu"); - assert_eq!(arrow_schema.field(2).name(), "memory"); - assert_eq!(arrow_schema.field(3).name(), "ts"); - - let columns = batches[0].df_recordbatch.columns(); - assert_eq!(4, columns.len()); - assert_eq!(hosts.to_arrow_array(), columns[0]); - assert_eq!(cpus.to_arrow_array(), columns[1]); - assert_eq!(memories.to_arrow_array(), columns[2]); - assert_eq!(tss.to_arrow_array(), columns[3]); + assert_eq!(batches[0].num_columns(), 4); + + let batch_schema = &batches[0].schema; + assert_eq!(batch_schema.num_columns(), 4); + assert_eq!(batch_schema.column_schemas()[0].name, "host"); + assert_eq!(batch_schema.column_schemas()[1].name, "cpu"); + assert_eq!(batch_schema.column_schemas()[2].name, "memory"); + assert_eq!(batch_schema.column_schemas()[3].name, "ts"); + + let batch = &batches[0]; + assert_eq!(4, batch.num_columns()); + assert_eq!(hosts, *batch.column(0)); + assert_eq!(cpus, *batch.column(1)); + assert_eq!(memories, *batch.column(2)); + assert_eq!(tss, *batch.column(3)); // Scan with projections: cpu and memory let stream = table.scan(&Some(vec![1, 2]), &[], None).await.unwrap(); - let stream = stream.execute(0, Arc::new(RuntimeEnv::default())).unwrap(); + let stream = stream.execute(0, session_ctx.task_ctx()).unwrap(); let batches = util::collect(stream).await.unwrap(); assert_eq!(1, batches.len()); - assert_eq!(batches[0].df_recordbatch.num_columns(), 2); + assert_eq!(batches[0].num_columns(), 2); - let arrow_schema = batches[0].schema.arrow_schema(); - assert_eq!(arrow_schema.fields().len(), 2); + let batch_schema = &batches[0].schema; + assert_eq!(batch_schema.num_columns(), 2); - assert_eq!(arrow_schema.field(0).name(), "cpu"); - assert_eq!(arrow_schema.field(1).name(), "memory"); + assert_eq!(batch_schema.column_schemas()[0].name, "cpu"); + assert_eq!(batch_schema.column_schemas()[1].name, "memory"); - let columns = 
batches[0].df_recordbatch.columns(); - assert_eq!(2, columns.len()); - assert_eq!(cpus.to_arrow_array(), columns[0]); - assert_eq!(memories.to_arrow_array(), columns[1]); + let batch = &batches[0]; + assert_eq!(2, batch.num_columns()); + assert_eq!(cpus, *batch.column(0)); + assert_eq!(memories, *batch.column(1)); // Scan with projections: only ts let stream = table.scan(&Some(vec![3]), &[], None).await.unwrap(); - let stream = stream.execute(0, Arc::new(RuntimeEnv::default())).unwrap(); + let stream = stream.execute(0, session_ctx.task_ctx()).unwrap(); let batches = util::collect(stream).await.unwrap(); assert_eq!(1, batches.len()); - assert_eq!(batches[0].df_recordbatch.num_columns(), 1); + assert_eq!(batches[0].num_columns(), 1); - let arrow_schema = batches[0].schema.arrow_schema(); - assert_eq!(arrow_schema.fields().len(), 1); + let batch_schema = &batches[0].schema; + assert_eq!(batch_schema.num_columns(), 1); - assert_eq!(arrow_schema.field(0).name(), "ts"); + assert_eq!(batch_schema.column_schemas()[0].name, "ts"); - let columns = batches[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(tss.to_arrow_array(), columns[0]); + let record = &batches[0]; + assert_eq!(1, record.num_columns()); + assert_eq!(tss, *record.column(0)); } #[tokio::test] @@ -804,28 +803,31 @@ mod tests { // Insert more than batch size rows to the table. let test_batch_size = default_batch_size * 4; let mut columns_values: HashMap = HashMap::with_capacity(4); - let hosts = StringVector::from(vec!["host1"; test_batch_size]); - let cpus = Float64Vector::from_vec(vec![55.5; test_batch_size]); - let memories = Float64Vector::from_vec(vec![1024f64; test_batch_size]); - let tss = TimestampVector::from_values((0..test_batch_size).map(|v| v as i64)); - - columns_values.insert("host".to_string(), Arc::new(hosts)); - columns_values.insert("cpu".to_string(), Arc::new(cpus)); - columns_values.insert("memory".to_string(), Arc::new(memories)); - columns_values.insert("ts".to_string(), Arc::new(tss.clone())); + let hosts: VectorRef = Arc::new(StringVector::from(vec!["host1"; test_batch_size])); + let cpus: VectorRef = Arc::new(Float64Vector::from_vec(vec![55.5; test_batch_size])); + let memories: VectorRef = Arc::new(Float64Vector::from_vec(vec![1024f64; test_batch_size])); + let tss: VectorRef = Arc::new(TimestampMillisecondVector::from_values( + (0..test_batch_size).map(|v| v as i64), + )); + + columns_values.insert("host".to_string(), hosts); + columns_values.insert("cpu".to_string(), cpus); + columns_values.insert("memory".to_string(), memories); + columns_values.insert("ts".to_string(), tss.clone()); let insert_req = new_insert_request("demo".to_string(), columns_values); assert_eq!(test_batch_size, table.insert(insert_req).await.unwrap()); + let session_ctx = SessionContext::new(); let stream = table.scan(&None, &[], None).await.unwrap(); - let stream = stream.execute(0, Arc::new(RuntimeEnv::default())).unwrap(); + let stream = stream.execute(0, session_ctx.task_ctx()).unwrap(); let batches = util::collect(stream).await.unwrap(); let mut total = 0; for batch in batches { - assert_eq!(batch.df_recordbatch.num_columns(), 4); - let ts = batch.df_recordbatch.column(3); + assert_eq!(batch.num_columns(), 4); + let ts = batch.column(3); let expect = tss.slice(total, ts.len()); - assert_eq!(expect.to_arrow_array(), *ts); + assert_eq!(expect, *ts); total += ts.len(); } assert_eq!(test_batch_size, total); diff --git a/src/mito/src/manifest/action.rs b/src/mito/src/manifest/action.rs index 
f8428367d463..4e2ba43db44a 100644 --- a/src/mito/src/manifest/action.rs +++ b/src/mito/src/manifest/action.rs @@ -26,7 +26,7 @@ use store_api::manifest::action::{ProtocolAction, ProtocolVersion, VersionHeader use store_api::manifest::{ManifestVersion, MetaAction}; use table::metadata::{RawTableInfo, TableIdent}; -#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)] +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] pub struct TableChange { pub table_info: RawTableInfo, } @@ -37,7 +37,7 @@ pub struct TableRemove { pub table_name: String, } -#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)] +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] pub enum TableMetaAction { Protocol(ProtocolAction), // Boxed TableChange to reduce the total size of enum @@ -45,7 +45,7 @@ pub enum TableMetaAction { Remove(TableRemove), } -#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)] +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] pub struct TableMetaActionList { pub actions: Vec, pub prev_version: ManifestVersion, diff --git a/src/mito/src/table.rs b/src/mito/src/table.rs index 689a2b4c1b18..d5f554a994bf 100644 --- a/src/mito/src/table.rs +++ b/src/mito/src/table.rs @@ -21,9 +21,10 @@ use std::sync::Arc; use arc_swap::ArcSwap; use async_trait::async_trait; +use common_error::ext::BoxedError; use common_query::logical_plan::Expr; use common_query::physical_plan::PhysicalPlanRef; -use common_recordbatch::error::{Error as RecordBatchError, Result as RecordBatchResult}; +use common_recordbatch::error::{ExternalSnafu, Result as RecordBatchResult}; use common_recordbatch::{RecordBatch, RecordBatchStream}; use common_telemetry::logging; use datatypes::schema::ColumnSchema; @@ -189,7 +190,7 @@ impl Table for MitoTable { let stream_schema = schema.clone(); let stream = Box::pin(async_stream::try_stream! { - while let Some(chunk) = reader.next_chunk().await.map_err(RecordBatchError::new)? { + while let Some(chunk) = reader.next_chunk().await.map_err(BoxedError::new).context(ExternalSnafu)? { yield RecordBatch::new(stream_schema.clone(), chunk.columns)? 
} }); diff --git a/src/mito/src/table/test_util/mock_engine.rs b/src/mito/src/table/test_util/mock_engine.rs index 08b137cdc7c4..54b845bc514b 100644 --- a/src/mito/src/table/test_util/mock_engine.rs +++ b/src/mito/src/table/test_util/mock_engine.rs @@ -21,7 +21,7 @@ use arc_swap::ArcSwap; use async_trait::async_trait; use common_error::mock::MockError; use common_telemetry::logging; -use datatypes::prelude::{Value, VectorBuilder, VectorRef}; +use datatypes::prelude::{DataType, Value, VectorRef}; use datatypes::schema::{ColumnSchema, Schema}; use storage::metadata::{RegionMetaImpl, RegionMetadata}; use storage::write_batch::{Mutation, WriteBatch}; @@ -58,12 +58,11 @@ impl ChunkReader for MockChunkReader { .iter() .map(|column_schema| { let data = self.memtable.get(&column_schema.name).unwrap(); - let mut builder = - VectorBuilder::with_capacity(column_schema.data_type.clone(), data.len()); + let mut builder = column_schema.data_type.create_mutable_vector(data.len()); for v in data { - builder.push(v); + builder.push_value_ref(v.as_value_ref()).unwrap(); } - builder.finish() + builder.to_vector() }) .collect::>(); self.read = true; diff --git a/src/query/Cargo.toml b/src/query/Cargo.toml index 9676a81a3956..1bb9da358a5b 100644 --- a/src/query/Cargo.toml +++ b/src/query/Cargo.toml @@ -15,11 +15,12 @@ common-query = { path = "../common/query" } common-recordbatch = { path = "../common/recordbatch" } common-telemetry = { path = "../common/telemetry" } common-time = { path = "../common/time" } -datafusion = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2", features = [ - "simd", -] } -datafusion-common = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" } -datafusion-physical-expr = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" } +datafusion = "14.0.0" +datafusion-common = "14.0.0" +datafusion-expr = "14.0.0" +datafusion-optimizer = "14.0.0" +datafusion-physical-expr = "14.0.0" +datafusion-sql = "14.0.0" datatypes = { path = "../datatypes" } futures = "0.3" futures-util = "0.3" diff --git a/src/query/src/datafusion.rs b/src/query/src/datafusion.rs index 8dda26a5dbde..0968d9935795 100644 --- a/src/query/src/datafusion.rs +++ b/src/query/src/datafusion.rs @@ -141,7 +141,6 @@ impl LogicalOptimizer for DatafusionQueryEngine { LogicalPlan::DfPlan(df_plan) => { let optimized_plan = self.state - .df_context() .optimize(df_plan) .context(error::DatafusionSnafu { msg: "Fail to optimize logical plan", @@ -163,14 +162,11 @@ impl PhysicalPlanner for DatafusionQueryEngine { let _timer = timer!(metric::METRIC_CREATE_PHYSICAL_ELAPSED); match logical_plan { LogicalPlan::DfPlan(df_plan) => { - let physical_plan = self - .state - .df_context() - .create_physical_plan(df_plan) - .await - .context(error::DatafusionSnafu { + let physical_plan = self.state.create_physical_plan(df_plan).await.context( + error::DatafusionSnafu { msg: "Fail to create physical plan", - })?; + }, + )?; Ok(Arc::new(PhysicalPlanAdapter::new( Arc::new( @@ -193,22 +189,19 @@ impl PhysicalOptimizer for DatafusionQueryEngine { plan: Arc, ) -> Result> { let _timer = timer!(metric::METRIC_OPTIMIZE_PHYSICAL_ELAPSED); - let config = &self.state.df_context().state.lock().config; - let optimizers = &config.physical_optimizers; - let mut new_plan = plan + let new_plan = plan .as_any() .downcast_ref::() .context(error::PhysicalPlanDowncastSnafu)? 
.df_plan(); - for optimizer in optimizers { - new_plan = optimizer - .optimize(new_plan, config) + let new_plan = + self.state + .optimize_physical_plan(new_plan) .context(error::DatafusionSnafu { msg: "Fail to optimize physical plan", })?; - } Ok(Arc::new(PhysicalPlanAdapter::new(plan.schema(), new_plan))) } } @@ -224,7 +217,7 @@ impl QueryExecutor for DatafusionQueryEngine { match plan.output_partitioning().partition_count() { 0 => Ok(Box::pin(EmptyRecordBatchStream::new(plan.schema()))), 1 => Ok(plan - .execute(0, ctx.state().runtime()) + .execute(0, ctx.state().task_ctx()) .context(error::ExecutePhysicalPlanSnafu)?), _ => { // merge into a single partition @@ -232,11 +225,11 @@ impl QueryExecutor for DatafusionQueryEngine { CoalescePartitionsExec::new(Arc::new(DfPhysicalPlanAdapter(plan.clone()))); // CoalescePartitionsExec must produce a single partition assert_eq!(1, plan.output_partitioning().partition_count()); - let df_stream = plan.execute(0, ctx.state().runtime()).await.context( - error::DatafusionSnafu { - msg: "Failed to execute DataFusion merge exec", - }, - )?; + let df_stream = + plan.execute(0, ctx.state().task_ctx()) + .context(error::DatafusionSnafu { + msg: "Failed to execute DataFusion merge exec", + })?; let stream = RecordBatchStreamAdapter::try_new(df_stream) .context(error::ConvertDfRecordBatchStreamSnafu)?; Ok(Box::pin(stream)) @@ -254,8 +247,7 @@ mod tests { use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use common_query::Output; use common_recordbatch::util; - use datafusion::field_util::{FieldExt, SchemaExt}; - use datatypes::arrow::array::UInt64Array; + use datatypes::vectors::{UInt64Vector, VectorRef}; use session::context::QueryContext; use table::table::numbers::NumbersTable; @@ -290,10 +282,10 @@ mod tests { assert_eq!( format!("{:?}", plan), - r#"DfPlan(Limit: 20 - Projection: #SUM(numbers.number) - Aggregate: groupBy=[[]], aggr=[[SUM(#numbers.number)]] - TableScan: numbers projection=None)"# + r#"DfPlan(Limit: skip=0, fetch=20 + Projection: SUM(numbers.number) + Aggregate: groupBy=[[]], aggr=[[SUM(numbers.number)]] + TableScan: numbers)"# ); } @@ -311,20 +303,20 @@ mod tests { Output::Stream(recordbatch) => { let numbers = util::collect(recordbatch).await.unwrap(); assert_eq!(1, numbers.len()); - assert_eq!(numbers[0].df_recordbatch.num_columns(), 1); - assert_eq!(1, numbers[0].schema.arrow_schema().fields().len()); + assert_eq!(numbers[0].num_columns(), 1); + assert_eq!(1, numbers[0].schema.num_columns()); assert_eq!( "SUM(numbers.number)", - numbers[0].schema.arrow_schema().field(0).name() + numbers[0].schema.column_schemas()[0].name ); - let columns = numbers[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 1); + let batch = &numbers[0]; + assert_eq!(1, batch.num_columns()); + assert_eq!(batch.column(0).len(), 1); assert_eq!( - *columns[0].as_any().downcast_ref::().unwrap(), - UInt64Array::from_slice(&[4950]) + *batch.column(0), + Arc::new(UInt64Vector::from_slice(&[4950])) as VectorRef ); } _ => unreachable!(), diff --git a/src/query/src/datafusion/planner.rs b/src/query/src/datafusion/planner.rs index 6d70109e74af..4c87654e3c42 100644 --- a/src/query/src/datafusion/planner.rs +++ b/src/query/src/datafusion/planner.rs @@ -12,14 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use std::collections::HashMap; use std::sync::Arc; use common_query::logical_plan::create_aggregate_function; use datafusion::catalog::TableReference; -use datafusion::datasource::TableProvider; +use datafusion::error::Result as DfResult; use datafusion::physical_plan::udaf::AggregateUDF; use datafusion::physical_plan::udf::ScalarUDF; use datafusion::sql::planner::{ContextProvider, SqlToRel}; +use datafusion_expr::TableSource; use datatypes::arrow::datatypes::DataType; use session::context::QueryContextRef; use snafu::ResultExt; @@ -50,7 +52,7 @@ impl<'a, S: ContextProvider + Send + Sync> DfPlanner<'a, S> { let sql = query.inner.to_string(); let result = self .sql_to_rel - .query_to_plan(query.inner) + .query_to_plan(query.inner, &mut HashMap::new()) .context(error::PlanSqlSnafu { sql })?; Ok(LogicalPlan::DfPlan(result)) @@ -103,26 +105,14 @@ impl DfContextProviderAdapter { } } -/// TODO(dennis): Delegate all requests to ExecutionContext right now, -/// manage UDFs, UDAFs, variables by ourself in future. impl ContextProvider for DfContextProviderAdapter { - fn get_table_provider(&self, name: TableReference) -> Option> { + fn get_table_provider(&self, name: TableReference) -> DfResult> { let schema = self.query_ctx.current_schema(); - let execution_ctx = self.state.df_context().state.lock(); - match name { - TableReference::Bare { table } if schema.is_some() => { - execution_ctx.get_table_provider(TableReference::Partial { - // unwrap safety: checked in this match's arm - schema: &schema.unwrap(), - table, - }) - } - _ => execution_ctx.get_table_provider(name), - } + self.state.get_table_provider(schema.as_deref(), name) } fn get_function_meta(&self, name: &str) -> Option> { - self.state.df_context().state.lock().get_function_meta(name) + self.state.get_function_meta(name) } fn get_aggregate_meta(&self, name: &str) -> Option> { @@ -134,10 +124,6 @@ impl ContextProvider for DfContextProviderAdapter { } fn get_variable_type(&self, variable_names: &[String]) -> Option { - self.state - .df_context() - .state - .lock() - .get_variable_type(variable_names) + self.state.get_variable_type(variable_names) } } diff --git a/src/query/src/expr.rs b/src/query/src/expr.rs deleted file mode 100644 index 3a2a59181e92..000000000000 --- a/src/query/src/expr.rs +++ /dev/null @@ -1,13 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
diff --git a/src/query/src/optimizer.rs b/src/query/src/optimizer.rs index cee0c2727aef..2e6658876933 100644 --- a/src/query/src/optimizer.rs +++ b/src/query/src/optimizer.rs @@ -16,16 +16,14 @@ use std::str::FromStr; use std::sync::Arc; use common_time::timestamp::{TimeUnit, Timestamp}; -use datafusion::execution::context::ExecutionProps; -use datafusion::logical_plan::plan::Filter; -use datafusion::logical_plan::{ - Expr, ExprRewritable, ExprRewriter, ExprSchemable, LogicalPlan, Operator, TableScan, -}; use datafusion::optimizer::optimizer::OptimizerRule; -use datafusion::optimizer::utils; +use datafusion::optimizer::OptimizerConfig; use datafusion_common::{DFSchemaRef, DataFusionError, Result, ScalarValue}; +use datafusion_expr::expr_rewriter::{ExprRewritable, ExprRewriter}; +use datafusion_expr::{ + Between, BinaryExpr, Expr, ExprSchemable, Filter, LogicalPlan, Operator, TableScan, +}; use datatypes::arrow::compute; -use datatypes::arrow::compute::cast::CastOptions; use datatypes::arrow::datatypes::DataType; /// TypeConversionRule converts some literal values in logical plan to other types according @@ -39,24 +37,24 @@ impl OptimizerRule for TypeConversionRule { fn optimize( &self, plan: &LogicalPlan, - execution_props: &ExecutionProps, + optimizer_config: &mut OptimizerConfig, ) -> Result { let mut converter = TypeConverter { schemas: plan.all_schemas(), }; match plan { - LogicalPlan::Filter(Filter { predicate, input }) => Ok(LogicalPlan::Filter(Filter { - predicate: predicate.clone().rewrite(&mut converter)?, - input: Arc::new(self.optimize(input, execution_props)?), - })), + LogicalPlan::Filter(filter) => Ok(LogicalPlan::Filter(Filter::try_new( + filter.predicate().clone().rewrite(&mut converter)?, + Arc::new(self.optimize(filter.input(), optimizer_config)?), + )?)), LogicalPlan::TableScan(TableScan { table_name, source, projection, projected_schema, filters, - limit, + fetch, }) => { let rewrite_filters = filters .clone() @@ -69,7 +67,7 @@ impl OptimizerRule for TypeConversionRule { projection: projection.clone(), projected_schema: projected_schema.clone(), filters: rewrite_filters, - limit: *limit, + fetch: *fetch, })) } LogicalPlan::Projection { .. } @@ -86,12 +84,15 @@ impl OptimizerRule for TypeConversionRule { | LogicalPlan::CrossJoin { .. } | LogicalPlan::CreateMemoryTable { .. } | LogicalPlan::DropTable { .. } + | LogicalPlan::DropView { .. } + | LogicalPlan::Distinct { .. } | LogicalPlan::Values { .. } + | LogicalPlan::SetVariable { .. } | LogicalPlan::Analyze { .. } => { let inputs = plan.inputs(); let new_inputs = inputs .iter() - .map(|plan| self.optimize(plan, execution_props)) + .map(|plan| self.optimize(plan, optimizer_config)) .collect::>>()?; let expr = plan @@ -100,10 +101,15 @@ impl OptimizerRule for TypeConversionRule { .map(|e| e.rewrite(&mut converter)) .collect::>>()?; - utils::from_plan(plan, &expr, &new_inputs) + datafusion_expr::utils::from_plan(plan, &expr, &new_inputs) } - LogicalPlan::EmptyRelation { .. } => Ok(plan.clone()), + LogicalPlan::Subquery { .. } + | LogicalPlan::SubqueryAlias { .. } + | LogicalPlan::CreateView { .. } + | LogicalPlan::CreateCatalogSchema { .. } + | LogicalPlan::CreateCatalog { .. } + | LogicalPlan::EmptyRelation { .. 
} => Ok(plan.clone()), } } @@ -139,12 +145,11 @@ impl<'a> TypeConverter<'a> { (target_type, value) => { let value_arr = value.to_array(); let arr = - compute::cast::cast(value_arr.as_ref(), target_type, CastOptions::default()) - .map_err(DataFusionError::ArrowError)?; + compute::cast(&value_arr, target_type).map_err(DataFusionError::ArrowError)?; ScalarValue::try_from_array( - &Arc::from(arr), // index: Converts a value in `array` at `index` into a ScalarValue - 0, + &arr, + 0, // index: Converts a value in `array` at `index` into a ScalarValue ) } } @@ -188,7 +193,7 @@ impl<'a> TypeConverter<'a> { impl<'a> ExprRewriter for TypeConverter<'a> { fn mutate(&mut self, expr: Expr) -> Result { let new_expr = match expr { - Expr::BinaryExpr { left, op, right } => match op { + Expr::BinaryExpr(BinaryExpr { left, op, right }) => match op { Operator::Eq | Operator::NotEq | Operator::Lt @@ -196,28 +201,28 @@ impl<'a> ExprRewriter for TypeConverter<'a> { | Operator::Gt | Operator::GtEq => { let (left, right) = self.convert_type(&left, &right)?; - Expr::BinaryExpr { + Expr::BinaryExpr(BinaryExpr { left: Box::new(left), op, right: Box::new(right), - } + }) } - _ => Expr::BinaryExpr { left, op, right }, + _ => Expr::BinaryExpr(BinaryExpr { left, op, right }), }, - Expr::Between { + Expr::Between(Between { expr, negated, low, high, - } => { + }) => { let (expr, low) = self.convert_type(&expr, &low)?; let (expr, high) = self.convert_type(&expr, &high)?; - Expr::Between { + Expr::Between(Between { expr: Box::new(expr), negated, low: Box::new(low), high: Box::new(high), - } + }) } Expr::InList { expr, diff --git a/src/query/src/plan.rs b/src/query/src/plan.rs index c7e337c0e98d..5182db4f6aea 100644 --- a/src/query/src/plan.rs +++ b/src/query/src/plan.rs @@ -14,7 +14,7 @@ use std::fmt::Debug; -use datafusion::logical_plan::LogicalPlan as DfLogicalPlan; +use datafusion_expr::LogicalPlan as DfLogicalPlan; /// A LogicalPlan represents the different types of relational /// operators (such as Projection, Filter, etc) and can be created by diff --git a/src/query/src/query_engine/state.rs b/src/query/src/query_engine/state.rs index 36bd331b36ca..a72b0203e30f 100644 --- a/src/query/src/query_engine/state.rs +++ b/src/query/src/query_engine/state.rs @@ -19,16 +19,18 @@ use std::sync::{Arc, RwLock}; use catalog::CatalogListRef; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use common_function::scalars::aggregate::AggregateFunctionMetaRef; -use common_query::physical_plan::RuntimeEnv; +use common_query::physical_plan::{SessionContext, TaskContext}; use common_query::prelude::ScalarUdf; -use datafusion::optimizer::common_subexpr_eliminate::CommonSubexprEliminate; -use datafusion::optimizer::eliminate_limit::EliminateLimit; -use datafusion::optimizer::filter_push_down::FilterPushDown; -use datafusion::optimizer::limit_push_down::LimitPushDown; -use datafusion::optimizer::projection_push_down::ProjectionPushDown; -use datafusion::optimizer::single_distinct_to_groupby::SingleDistinctToGroupBy; -use datafusion::optimizer::to_approx_perc::ToApproxPerc; -use datafusion::prelude::{ExecutionConfig, ExecutionContext}; +use datafusion::catalog::TableReference; +use datafusion::error::Result as DfResult; +use datafusion::execution::context::{SessionConfig, SessionState}; +use datafusion::execution::runtime_env::RuntimeEnv; +use datafusion::physical_plan::udf::ScalarUDF; +use datafusion::physical_plan::ExecutionPlan; +use datafusion_expr::{LogicalPlan as DfLogicalPlan, TableSource}; +use 
datafusion_optimizer::optimizer::{Optimizer, OptimizerConfig}; +use datafusion_sql::planner::ContextProvider; +use datatypes::arrow::datatypes::DataType; use crate::datafusion::DfCatalogListAdapter; use crate::optimizer::TypeConversionRule; @@ -39,7 +41,7 @@ use crate::optimizer::TypeConversionRule; // type in QueryEngine trait. #[derive(Clone)] pub struct QueryEngineState { - df_context: ExecutionContext, + df_context: SessionContext, catalog_list: CatalogListRef, aggregate_functions: Arc>>, } @@ -53,25 +55,18 @@ impl fmt::Debug for QueryEngineState { impl QueryEngineState { pub(crate) fn new(catalog_list: CatalogListRef) -> Self { - let config = ExecutionConfig::new() - .with_default_catalog_and_schema(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME) - .with_optimizer_rules(vec![ - // TODO(hl): SimplifyExpressions is not exported. - Arc::new(TypeConversionRule {}), - // These are the default optimizer in datafusion - Arc::new(CommonSubexprEliminate::new()), - Arc::new(EliminateLimit::new()), - Arc::new(ProjectionPushDown::new()), - Arc::new(FilterPushDown::new()), - Arc::new(LimitPushDown::new()), - Arc::new(SingleDistinctToGroupBy::new()), - Arc::new(ToApproxPerc::new()), - ]); - - let df_context = ExecutionContext::with_config(config); - - df_context.state.lock().catalog_list = - Arc::new(DfCatalogListAdapter::new(catalog_list.clone())); + let runtime_env = Arc::new(RuntimeEnv::default()); + let session_config = SessionConfig::new() + .with_default_catalog_and_schema(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME); + let mut optimizer = Optimizer::new(&OptimizerConfig::new()); + // Apply the type conversion rule first. + optimizer.rules.insert(0, Arc::new(TypeConversionRule {})); + + let mut session_state = SessionState::with_config_rt(session_config, runtime_env); + session_state.optimizer = optimizer; + session_state.catalog_list = Arc::new(DfCatalogListAdapter::new(catalog_list.clone())); + + let df_context = SessionContext::with_state(session_state); Self { df_context, @@ -81,11 +76,15 @@ impl QueryEngineState { } /// Register a udf function - /// TODO(dennis): manage UDFs by ourself. + // TODO(dennis): manage UDFs by ourself. pub fn register_udf(&self, udf: ScalarUdf) { + // `SessionContext` has a `register_udf()` method, which requires `&mut self`; this is + // a workaround. + // TODO(yingwen): Use `SessionContext::register_udf()` once it takes `&self`.
+ // It's implemented in https://github.com/apache/arrow-datafusion/pull/4612 self.df_context .state - .lock() + .write() .scalar_functions .insert(udf.name.clone(), Arc::new(udf.into_df_udf())); } @@ -113,12 +112,59 @@ impl QueryEngineState { } #[inline] - pub(crate) fn df_context(&self) -> &ExecutionContext { - &self.df_context + pub(crate) fn task_ctx(&self) -> Arc { + self.df_context.task_ctx() } - #[inline] - pub(crate) fn runtime(&self) -> Arc { - self.df_context.runtime_env() + pub(crate) fn get_table_provider( + &self, + schema: Option<&str>, + name: TableReference, + ) -> DfResult> { + let state = self.df_context.state.read(); + match name { + TableReference::Bare { table } if schema.is_some() => { + state.get_table_provider(TableReference::Partial { + // unwrap safety: checked in this match's arm + schema: schema.unwrap(), + table, + }) + } + _ => state.get_table_provider(name), + } + } + + pub(crate) fn get_function_meta(&self, name: &str) -> Option> { + let state = self.df_context.state.read(); + state.get_function_meta(name) + } + + pub(crate) fn get_variable_type(&self, variable_names: &[String]) -> Option { + let state = self.df_context.state.read(); + state.get_variable_type(variable_names) + } + + pub(crate) fn optimize(&self, plan: &DfLogicalPlan) -> DfResult { + self.df_context.optimize(plan) + } + + pub(crate) async fn create_physical_plan( + &self, + logical_plan: &DfLogicalPlan, + ) -> DfResult> { + self.df_context.create_physical_plan(logical_plan).await + } + + pub(crate) fn optimize_physical_plan( + &self, + mut plan: Arc, + ) -> DfResult> { + let state = self.df_context.state.read(); + let config = &state.config; + for optimizer in &state.physical_optimizers { + plan = optimizer.optimize(plan, config)?; + } + + Ok(plan) } } diff --git a/src/query/src/sql.rs b/src/query/src/sql.rs index 2854fed7fc85..327394416eb0 100644 --- a/src/query/src/sql.rs +++ b/src/query/src/sql.rs @@ -261,10 +261,9 @@ mod test { use common_query::Output; use common_recordbatch::{RecordBatch, RecordBatches}; use common_time::timestamp::TimeUnit; - use datatypes::arrow::array::PrimitiveArray; use datatypes::prelude::ConcreteDataType; use datatypes::schema::{ColumnDefaultConstraint, ColumnSchema, Schema, SchemaRef}; - use datatypes::vectors::{StringVector, TimestampVector, UInt32Vector, VectorRef}; + use datatypes::vectors::{StringVector, TimestampMillisecondVector, UInt32Vector, VectorRef}; use snafu::ResultExt; use sql::statements::describe::DescribeTable; use table::test_util::MemTable; @@ -379,12 +378,12 @@ mod test { .with_time_index(true), ]; let data = vec![ - Arc::new(UInt32Vector::from_vec(vec![0])) as _, - Arc::new(TimestampVector::new(PrimitiveArray::from_vec(vec![0]))) as _, + Arc::new(UInt32Vector::from_slice(&[0])) as _, + Arc::new(TimestampMillisecondVector::from_slice(&[0])) as _, ]; let expected_columns = vec![ Arc::new(StringVector::from(vec!["t1", "t2"])) as _, - Arc::new(StringVector::from(vec!["UInt32", "Timestamp"])) as _, + Arc::new(StringVector::from(vec!["UInt32", "TimestampMillisecond"])) as _, Arc::new(StringVector::from(vec![NULLABLE_YES, NULLABLE_NO])) as _, Arc::new(StringVector::from(vec!["", "current_timestamp()"])) as _, Arc::new(StringVector::from(vec![ diff --git a/src/query/tests/argmax_test.rs b/src/query/tests/argmax_test.rs index 11f0167a096c..cbf1ae931dc9 100644 --- a/src/query/tests/argmax_test.rs +++ b/src/query/tests/argmax_test.rs @@ -12,16 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the 
License. -use std::sync::Arc; mod function; + +use std::sync::Arc; + use common_query::Output; use common_recordbatch::error::Result as RecordResult; use common_recordbatch::{util, RecordBatch}; -use datafusion::field_util::{FieldExt, SchemaExt}; use datatypes::for_all_primitive_types; use datatypes::prelude::*; -use datatypes::types::PrimitiveElement; -use function::{create_query_engine, get_numbers_from_table}; +use datatypes::types::WrapperType; use query::error::Result; use query::QueryEngine; use session::context::QueryContext; @@ -29,7 +29,7 @@ use session::context::QueryContext; #[tokio::test] async fn test_argmax_aggregator() -> Result<()> { common_telemetry::init_default_ut_logging(); - let engine = create_query_engine(); + let engine = function::create_query_engine(); macro_rules! test_argmax { ([], $( { $T:ty } ),*) => { @@ -49,33 +49,23 @@ async fn test_argmax_success( engine: Arc, ) -> Result<()> where - T: PrimitiveElement + PartialOrd, - for<'a> T: Scalar = T>, + T: WrapperType + PartialOrd, { let result = execute_argmax(column_name, table_name, engine.clone()) .await .unwrap(); - assert_eq!(1, result.len()); - assert_eq!(result[0].df_recordbatch.num_columns(), 1); - assert_eq!(1, result[0].schema.arrow_schema().fields().len()); - assert_eq!("argmax", result[0].schema.arrow_schema().field(0).name()); - - let columns = result[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 1); - let v = VectorHelper::try_into_vector(&columns[0]).unwrap(); - assert_eq!(1, v.len()); - let value = v.get(0); + let value = function::get_value_from_batches("argmax", result); - let numbers = get_numbers_from_table::(column_name, table_name, engine.clone()).await; + let numbers = + function::get_numbers_from_table::(column_name, table_name, engine.clone()).await; let expected_value = match numbers.len() { 0 => 0_u64, _ => { let mut index = 0; - let mut max = numbers[0].into(); + let mut max = numbers[0]; for (i, &number) in numbers.iter().enumerate() { - if max < number.into() { - max = number.into(); + if max < number { + max = number; index = i; } } diff --git a/src/query/tests/argmin_test.rs b/src/query/tests/argmin_test.rs index 2a509f05fdc1..546fa9ae23f3 100644 --- a/src/query/tests/argmin_test.rs +++ b/src/query/tests/argmin_test.rs @@ -12,17 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::sync::Arc; mod function; +use std::sync::Arc; + use common_query::Output; use common_recordbatch::error::Result as RecordResult; use common_recordbatch::{util, RecordBatch}; -use datafusion::field_util::{FieldExt, SchemaExt}; use datatypes::for_all_primitive_types; use datatypes::prelude::*; -use datatypes::types::PrimitiveElement; -use function::{create_query_engine, get_numbers_from_table}; +use datatypes::types::WrapperType; use query::error::Result; use query::QueryEngine; use session::context::QueryContext; @@ -30,7 +29,7 @@ use session::context::QueryContext; #[tokio::test] async fn test_argmin_aggregator() -> Result<()> { common_telemetry::init_default_ut_logging(); - let engine = create_query_engine(); + let engine = function::create_query_engine(); macro_rules! 
test_argmin { ([], $( { $T:ty } ),*) => { @@ -50,33 +49,23 @@ async fn test_argmin_success( engine: Arc, ) -> Result<()> where - T: PrimitiveElement + PartialOrd, - for<'a> T: Scalar = T>, + T: WrapperType + PartialOrd, { let result = execute_argmin(column_name, table_name, engine.clone()) .await .unwrap(); - assert_eq!(1, result.len()); - assert_eq!(result[0].df_recordbatch.num_columns(), 1); - assert_eq!(1, result[0].schema.arrow_schema().fields().len()); - assert_eq!("argmin", result[0].schema.arrow_schema().field(0).name()); - - let columns = result[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 1); - let v = VectorHelper::try_into_vector(&columns[0]).unwrap(); - assert_eq!(1, v.len()); - let value = v.get(0); + let value = function::get_value_from_batches("argmin", result); - let numbers = get_numbers_from_table::(column_name, table_name, engine.clone()).await; + let numbers = + function::get_numbers_from_table::(column_name, table_name, engine.clone()).await; let expected_value = match numbers.len() { 0 => 0_u32, _ => { let mut index = 0; - let mut min = numbers[0].into(); + let mut min = numbers[0]; for (i, &number) in numbers.iter().enumerate() { - if min > number.into() { - min = number.into(); + if min > number { + min = number; index = i; } } diff --git a/src/query/tests/function.rs index 040dfa7a6b0a..7de93a6265ec 100644 --- a/src/query/tests/function.rs +++ b/src/query/tests/function.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +// FIXME(yingwen): Consider moving all tests under query/tests to query/src so we could reuse +// more code. use std::sync::Arc; use catalog::local::{MemoryCatalogManager, MemoryCatalogProvider, MemorySchemaProvider}; @@ -22,8 +24,8 @@ use common_recordbatch::{util, RecordBatch}; use datatypes::for_all_primitive_types; use datatypes::prelude::*; use datatypes::schema::{ColumnSchema, Schema}; -use datatypes::types::PrimitiveElement; -use datatypes::vectors::PrimitiveVector; +use datatypes::types::WrapperType; +use datatypes::vectors::Helper; use query::query_engine::QueryEngineFactory; use query::QueryEngine; use rand::Rng; @@ -47,7 +49,7 @@ pub fn create_query_engine() -> Arc { column_schemas.push(column_schema); let numbers = (1..=10).map(|_| rng.gen::<$T>()).collect::>(); - let column: VectorRef = Arc::new(PrimitiveVector::<$T>::from_vec(numbers.to_vec())); + let column: VectorRef = Arc::new(<$T as Scalar>::VectorType::from_vec(numbers.to_vec())); columns.push(column); )* } @@ -77,8 +79,7 @@ pub async fn get_numbers_from_table<'s, T>( engine: Arc, ) -> Vec where - T: PrimitiveElement, - for<'a> T: Scalar = T>, + T: WrapperType, { let sql = format!("SELECT {} FROM {}", column_name, table_name); let plan = engine .sql_to_plan(&sql, Arc::new(QueryContext::new())) .unwrap(); @@ -92,8 +93,21 @@ where }; let numbers = util::collect(recordbatch_stream).await.unwrap(); - let columns = numbers[0].df_recordbatch.columns(); - let column = VectorHelper::try_into_vector(&columns[0]).unwrap(); - let column: &::VectorType = unsafe { VectorHelper::static_cast(&column) }; + let column = numbers[0].column(0); + let column: &::VectorType = unsafe { Helper::static_cast(column) }; column.iter_data().flatten().collect::>() } + +pub fn get_value_from_batches(column_name: &str, batches: Vec) -> Value { + assert_eq!(1, batches.len()); + assert_eq!(batches[0].num_columns(), 1); + assert_eq!(1, batches[0].schema.num_columns()); + assert_eq!(column_name,
batches[0].schema.column_schemas()[0].name); + + let batch = &batches[0]; + assert_eq!(1, batch.num_columns()); + assert_eq!(batch.column(0).len(), 1); + let v = batch.column(0); + assert_eq!(1, v.len()); + v.get(0) +} diff --git a/src/query/tests/mean_test.rs b/src/query/tests/mean_test.rs index 705dea797db1..000323fb2192 100644 --- a/src/query/tests/mean_test.rs +++ b/src/query/tests/mean_test.rs @@ -12,19 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::sync::Arc; mod function; +use std::sync::Arc; + use common_query::Output; use common_recordbatch::error::Result as RecordResult; use common_recordbatch::{util, RecordBatch}; -use datafusion::field_util::{FieldExt, SchemaExt}; use datatypes::for_all_primitive_types; use datatypes::prelude::*; -use datatypes::types::PrimitiveElement; +use datatypes::types::WrapperType; use datatypes::value::OrderedFloat; use format_num::NumberFormat; -use function::{create_query_engine, get_numbers_from_table}; use num_traits::AsPrimitive; use query::error::Result; use query::QueryEngine; @@ -33,7 +32,7 @@ use session::context::QueryContext; #[tokio::test] async fn test_mean_aggregator() -> Result<()> { common_telemetry::init_default_ut_logging(); - let engine = create_query_engine(); + let engine = function::create_query_engine(); macro_rules! test_mean { ([], $( { $T:ty } ),*) => { @@ -53,25 +52,15 @@ async fn test_mean_success( engine: Arc, ) -> Result<()> where - T: PrimitiveElement + AsPrimitive, - for<'a> T: Scalar = T>, + T: WrapperType + AsPrimitive, { let result = execute_mean(column_name, table_name, engine.clone()) .await .unwrap(); - assert_eq!(1, result.len()); - assert_eq!(result[0].df_recordbatch.num_columns(), 1); - assert_eq!(1, result[0].schema.arrow_schema().fields().len()); - assert_eq!("mean", result[0].schema.arrow_schema().field(0).name()); - - let columns = result[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 1); - let v = VectorHelper::try_into_vector(&columns[0]).unwrap(); - assert_eq!(1, v.len()); - let value = v.get(0); + let value = function::get_value_from_batches("mean", result); - let numbers = get_numbers_from_table::(column_name, table_name, engine.clone()).await; + let numbers = + function::get_numbers_from_table::(column_name, table_name, engine.clone()).await; let expected_value = numbers.iter().map(|&n| n.as_()).collect::>(); let expected_value = inc_stats::mean(expected_value.iter().cloned()).unwrap(); diff --git a/src/query/tests/my_sum_udaf_example.rs b/src/query/tests/my_sum_udaf_example.rs index 4e05183861ee..54d3a62a5b98 100644 --- a/src/query/tests/my_sum_udaf_example.rs +++ b/src/query/tests/my_sum_udaf_example.rs @@ -26,12 +26,10 @@ use common_query::logical_plan::{Accumulator, AggregateFunctionCreator}; use common_query::prelude::*; use common_query::Output; use common_recordbatch::{util, RecordBatch}; -use datafusion::arrow_print; -use datafusion_common::record_batch::RecordBatch as DfRecordBatch; use datatypes::prelude::*; use datatypes::schema::{ColumnSchema, Schema}; -use datatypes::types::{PrimitiveElement, PrimitiveType}; -use datatypes::vectors::PrimitiveVector; +use datatypes::types::{LogicalPrimitiveType, WrapperType}; +use datatypes::vectors::Helper; use datatypes::with_match_primitive_type_id; use num_traits::AsPrimitive; use query::error::Result; @@ -40,28 +38,30 @@ use session::context::QueryContext; use table::test_util::MemTable; #[derive(Debug, Default)] -struct 
MySumAccumulator -where - T: Primitive + AsPrimitive, - SumT: Primitive + std::ops::AddAssign, -{ +struct MySumAccumulator { sum: SumT, _phantom: PhantomData, } impl MySumAccumulator where - T: Primitive + AsPrimitive, - SumT: Primitive + std::ops::AddAssign, + T: WrapperType, + SumT: WrapperType, + T::Native: AsPrimitive, + SumT::Native: std::ops::AddAssign, { #[inline(always)] fn add(&mut self, v: T) { - self.sum += v.as_(); + let mut sum_native = self.sum.into_native(); + sum_native += v.into_native().as_(); + self.sum = SumT::from_native(sum_native); } #[inline(always)] fn merge(&mut self, s: SumT) { - self.sum += s; + let mut sum_native = self.sum.into_native(); + sum_native += s.into_native(); + self.sum = SumT::from_native(sum_native); } } @@ -76,7 +76,7 @@ impl AggregateFunctionCreator for MySumAccumulatorCreator { with_match_primitive_type_id!( input_type.logical_type_id(), |$S| { - Ok(Box::new(MySumAccumulator::<$S, <$S as Primitive>::LargestType>::default())) + Ok(Box::new(MySumAccumulator::<<$S as LogicalPrimitiveType>::Wrapper, <<$S as LogicalPrimitiveType>::LargestType as LogicalPrimitiveType>::Wrapper>::default())) }, { let err_msg = format!( @@ -95,7 +95,7 @@ impl AggregateFunctionCreator for MySumAccumulatorCreator { with_match_primitive_type_id!( input_type.logical_type_id(), |$S| { - Ok(PrimitiveType::<<$S as Primitive>::LargestType>::default().logical_type_id().data_type()) + Ok(<<$S as LogicalPrimitiveType>::LargestType>::build_data_type()) }, { unreachable!() @@ -110,10 +110,10 @@ impl AggregateFunctionCreator for MySumAccumulatorCreator { impl Accumulator for MySumAccumulator where - T: Primitive + AsPrimitive, - for<'a> T: Scalar = T>, - SumT: Primitive + std::ops::AddAssign, - for<'a> SumT: Scalar = SumT>, + T: WrapperType, + SumT: WrapperType, + T::Native: AsPrimitive, + SumT::Native: std::ops::AddAssign, { fn state(&self) -> QueryResult> { Ok(vec![self.sum.into()]) @@ -124,7 +124,7 @@ where return Ok(()); }; let column = &values[0]; - let column: &::VectorType = unsafe { VectorHelper::static_cast(column) }; + let column: &::VectorType = unsafe { Helper::static_cast(column) }; for v in column.iter_data().flatten() { self.add(v) } @@ -136,7 +136,7 @@ where return Ok(()); }; let states = &states[0]; - let states: &::VectorType = unsafe { VectorHelper::static_cast(states) }; + let states: &::VectorType = unsafe { Helper::static_cast(states) }; for s in states.iter_data().flatten() { self.merge(s) } @@ -154,65 +154,57 @@ async fn test_my_sum() -> Result<()> { test_my_sum_with( (1..=10).collect::>(), - vec![ - "+--------+", - "| my_sum |", - "+--------+", - "| 55 |", - "+--------+", - ], + r#"+--------+ +| my_sum | ++--------+ +| 55 | ++--------+"#, ) .await?; test_my_sum_with( (-10..=11).collect::>(), - vec![ - "+--------+", - "| my_sum |", - "+--------+", - "| 11 |", - "+--------+", - ], + r#"+--------+ +| my_sum | ++--------+ +| 11 | ++--------+"#, ) .await?; test_my_sum_with( vec![-1.0f32, 1.0, 2.0, 3.0, 4.0], - vec![ - "+--------+", - "| my_sum |", - "+--------+", - "| 9 |", - "+--------+", - ], + r#"+--------+ +| my_sum | ++--------+ +| 9 | ++--------+"#, ) .await?; test_my_sum_with( vec![u32::MAX, u32::MAX], - vec![ - "+------------+", - "| my_sum |", - "+------------+", - "| 8589934590 |", - "+------------+", - ], + r#"+------------+ +| my_sum | ++------------+ +| 8589934590 | ++------------+"#, ) .await?; Ok(()) } -async fn test_my_sum_with(numbers: Vec, expected: Vec<&str>) -> Result<()> +async fn test_my_sum_with(numbers: Vec, expected: &str) -> Result<()> 
where - T: PrimitiveElement, + T: WrapperType, { let table_name = format!("{}_numbers", std::any::type_name::()); let column_name = format!("{}_number", std::any::type_name::()); let column_schemas = vec![ColumnSchema::new( column_name.clone(), - T::build_data_type(), + T::LogicalType::build_data_type(), true, )]; let schema = Arc::new(Schema::new(column_schemas.clone())); - let column: VectorRef = Arc::new(PrimitiveVector::::from_vec(numbers)); + let column: VectorRef = Arc::new(T::VectorType::from_vec(numbers)); let recordbatch = RecordBatch::new(schema, vec![column]).unwrap(); let testing_table = MemTable::new(&table_name, recordbatch); @@ -236,14 +228,9 @@ where Output::Stream(batch) => batch, _ => unreachable!(), }; - let recordbatch = util::collect(recordbatch_stream).await.unwrap(); - let df_recordbatch = recordbatch - .into_iter() - .map(|r| r.df_recordbatch) - .collect::>(); + let batches = util::collect_batches(recordbatch_stream).await.unwrap(); - let pretty_print = arrow_print::write(&df_recordbatch); - let pretty_print = pretty_print.lines().collect::>(); + let pretty_print = batches.pretty_print().unwrap(); assert_eq!(expected, pretty_print); Ok(()) } diff --git a/src/query/tests/percentile_test.rs b/src/query/tests/percentile_test.rs index 6e210a0494e0..e639d4b3e63f 100644 --- a/src/query/tests/percentile_test.rs +++ b/src/query/tests/percentile_test.rs @@ -20,12 +20,10 @@ use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use common_query::Output; use common_recordbatch::error::Result as RecordResult; use common_recordbatch::{util, RecordBatch}; -use datafusion::field_util::{FieldExt, SchemaExt}; use datatypes::for_all_primitive_types; use datatypes::prelude::*; use datatypes::schema::{ColumnSchema, Schema}; -use datatypes::types::PrimitiveElement; -use datatypes::vectors::PrimitiveVector; +use datatypes::vectors::Int32Vector; use function::{create_query_engine, get_numbers_from_table}; use num_traits::AsPrimitive; use query::error::Result; @@ -64,9 +62,8 @@ async fn test_percentile_correctness() -> Result<()> { _ => unreachable!(), }; let record_batch = util::collect(recordbatch_stream).await.unwrap(); - let columns = record_batch[0].df_recordbatch.columns(); - let v = VectorHelper::try_into_vector(&columns[0]).unwrap(); - let value = v.get(0); + let column = record_batch[0].column(0); + let value = column.get(0); assert_eq!(value, Value::from(9.280_000_000_000_001_f64)); Ok(()) } @@ -77,26 +74,12 @@ async fn test_percentile_success( engine: Arc, ) -> Result<()> where - T: PrimitiveElement + AsPrimitive, - for<'a> T: Scalar = T>, + T: WrapperType + AsPrimitive, { let result = execute_percentile(column_name, table_name, engine.clone()) .await .unwrap(); - assert_eq!(1, result.len()); - assert_eq!(result[0].df_recordbatch.num_columns(), 1); - assert_eq!(1, result[0].schema.arrow_schema().fields().len()); - assert_eq!( - "percentile", - result[0].schema.arrow_schema().field(0).name() - ); - - let columns = result[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 1); - let v = VectorHelper::try_into_vector(&columns[0]).unwrap(); - assert_eq!(1, v.len()); - let value = v.get(0); + let value = function::get_value_from_batches("percentile", result); let numbers = get_numbers_from_table::(column_name, table_name, engine.clone()).await; let expected_value = numbers.iter().map(|&n| n.as_()).collect::>(); @@ -140,9 +123,9 @@ fn create_correctness_engine() -> Arc { let column_schema = ColumnSchema::new("corr_number", 
ConcreteDataType::int32_datatype(), true); column_schemas.push(column_schema); - let numbers = vec![3_i32, 6_i32, 8_i32, 10_i32]; + let numbers = [3_i32, 6_i32, 8_i32, 10_i32]; - let column: VectorRef = Arc::new(PrimitiveVector::::from_vec(numbers.to_vec())); + let column: VectorRef = Arc::new(Int32Vector::from_slice(&numbers)); columns.push(column); let schema = Arc::new(Schema::new(column_schemas)); diff --git a/src/query/tests/polyval_test.rs b/src/query/tests/polyval_test.rs index f2e60c0217ca..248c0d42d74e 100644 --- a/src/query/tests/polyval_test.rs +++ b/src/query/tests/polyval_test.rs @@ -18,11 +18,9 @@ mod function; use common_query::Output; use common_recordbatch::error::Result as RecordResult; use common_recordbatch::{util, RecordBatch}; -use datafusion::field_util::{FieldExt, SchemaExt}; use datatypes::for_all_primitive_types; use datatypes::prelude::*; -use datatypes::types::PrimitiveElement; -use function::{create_query_engine, get_numbers_from_table}; +use datatypes::types::WrapperType; use num_traits::AsPrimitive; use query::error::Result; use query::QueryEngine; @@ -31,13 +29,13 @@ use session::context::QueryContext; #[tokio::test] async fn test_polyval_aggregator() -> Result<()> { common_telemetry::init_default_ut_logging(); - let engine = create_query_engine(); + let engine = function::create_query_engine(); macro_rules! test_polyval { ([], $( { $T:ty } ),*) => { $( let column_name = format!("{}_number", std::any::type_name::<$T>()); - test_polyval_success::<$T,<$T as Primitive>::LargestType>(&column_name, "numbers", engine.clone()).await?; + test_polyval_success::<$T, <<<$T as WrapperType>::LogicalType as LogicalPrimitiveType>::LargestType as LogicalPrimitiveType>::Wrapper>(&column_name, "numbers", engine.clone()).await?; )* } } @@ -51,36 +49,27 @@ async fn test_polyval_success( engine: Arc, ) -> Result<()> where - T: Primitive + AsPrimitive + PrimitiveElement, - PolyT: Primitive + std::ops::Mul + std::iter::Sum, - for<'a> T: Scalar = T>, - for<'a> PolyT: Scalar = PolyT>, - i64: AsPrimitive, + T: WrapperType, + PolyT: WrapperType, + T::Native: AsPrimitive, + PolyT::Native: std::ops::Mul + std::iter::Sum, + i64: AsPrimitive, { let result = execute_polyval(column_name, table_name, engine.clone()) .await .unwrap(); - assert_eq!(1, result.len()); - assert_eq!(result[0].df_recordbatch.num_columns(), 1); - assert_eq!(1, result[0].schema.arrow_schema().fields().len()); - assert_eq!("polyval", result[0].schema.arrow_schema().field(0).name()); + let value = function::get_value_from_batches("polyval", result); - let columns = result[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 1); - let v = VectorHelper::try_into_vector(&columns[0]).unwrap(); - assert_eq!(1, v.len()); - let value = v.get(0); - - let numbers = get_numbers_from_table::(column_name, table_name, engine.clone()).await; + let numbers = + function::get_numbers_from_table::(column_name, table_name, engine.clone()).await; let expected_value = numbers.iter().copied(); let x = 0i64; let len = expected_value.len(); - let expected_value: PolyT = expected_value + let expected_native: PolyT::Native = expected_value .enumerate() - .map(|(i, value)| value.as_() * (x.pow((len - 1 - i) as u32)).as_()) + .map(|(i, v)| v.into_native().as_() * (x.pow((len - 1 - i) as u32)).as_()) .sum(); - assert_eq!(value, expected_value.into()); + assert_eq!(value, PolyT::from_native(expected_native).into()); Ok(()) } diff --git a/src/query/tests/pow.rs b/src/query/tests/pow.rs index 
4d9006ca29fa..d48c28b22051 100644 --- a/src/query/tests/pow.rs +++ b/src/query/tests/pow.rs @@ -32,7 +32,7 @@ pub fn pow(args: &[VectorRef]) -> Result { assert_eq!(exponent.len(), base.len()); - let v = base + let iter = base .iter_data() .zip(exponent.iter_data()) .map(|(base, exponent)| { @@ -42,8 +42,8 @@ pub fn pow(args: &[VectorRef]) -> Result { (Some(base), Some(exponent)) => Some(base.pow(exponent)), _ => None, } - }) - .collect::(); + }); + let v = UInt32Vector::from_owned_iterator(iter); Ok(Arc::new(v) as _) } diff --git a/src/query/tests/query_engine_test.rs b/src/query/tests/query_engine_test.rs index cf640afba48e..05bb32a2c415 100644 --- a/src/query/tests/query_engine_test.rs +++ b/src/query/tests/query_engine_test.rs @@ -13,30 +13,28 @@ // limitations under the License. mod pow; +// This is used to suppress the warning: function `create_query_engine` is never used. +// FIXME(yingwen): We eventually need to refactor these tests and move them to `query/src` +// so tests can share code with other mods. +#[allow(unused)] +mod function; use std::sync::Arc; -use catalog::local::{MemoryCatalogManager, MemoryCatalogProvider, MemorySchemaProvider}; +use catalog::local::{MemoryCatalogProvider, MemorySchemaProvider}; use catalog::{CatalogList, CatalogProvider, SchemaProvider}; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use common_query::prelude::{create_udf, make_scalar_function, Volatility}; use common_query::Output; -use common_recordbatch::error::Result as RecordResult; use common_recordbatch::{util, RecordBatch}; -use datafusion::field_util::{FieldExt, SchemaExt}; -use datafusion::logical_plan::LogicalPlanBuilder; -use datatypes::arrow::array::UInt32Array; -use datatypes::for_all_primitive_types; +use datafusion::datasource::DefaultTableSource; +use datafusion_expr::logical_plan::builder::LogicalPlanBuilder; use datatypes::prelude::*; use datatypes::schema::{ColumnSchema, Schema}; -use datatypes::types::{OrdPrimitive, PrimitiveElement}; -use datatypes::vectors::{PrimitiveVector, UInt32Vector}; -use num::NumCast; +use datatypes::vectors::UInt32Vector; use query::error::Result; use query::plan::LogicalPlan; use query::query_engine::QueryEngineFactory; -use query::QueryEngine; -use rand::Rng; use session::context::QueryContext; use table::table::adapter::DfTableProviderAdapter; use table::table::numbers::NumbersTable; @@ -66,12 +64,16 @@ async fn test_datafusion_query_engine() -> Result<()> { let limit = 10; let table_provider = Arc::new(DfTableProviderAdapter::new(table.clone())); let plan = LogicalPlan::DfPlan( - LogicalPlanBuilder::scan("numbers", table_provider, None) - .unwrap() - .limit(limit) - .unwrap() - .build() - .unwrap(), + LogicalPlanBuilder::scan( + "numbers", + Arc::new(DefaultTableSource { table_provider }), + None, + ) + .unwrap() + .limit(0, Some(limit)) + .unwrap() + .build() + .unwrap(), ); let output = engine.execute(&plan).await?; @@ -84,17 +86,17 @@ async fn test_datafusion_query_engine() -> Result<()> { let numbers = util::collect(recordbatch).await.unwrap(); assert_eq!(1, numbers.len()); - assert_eq!(numbers[0].df_recordbatch.num_columns(), 1); - assert_eq!(1, numbers[0].schema.arrow_schema().fields().len()); - assert_eq!("number", numbers[0].schema.arrow_schema().field(0).name()); + assert_eq!(numbers[0].num_columns(), 1); + assert_eq!(1, numbers[0].schema.num_columns()); + assert_eq!("number", numbers[0].schema.column_schemas()[0].name); - let columns = numbers[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); -
assert_eq!(columns[0].len(), limit); + let batch = &numbers[0]; + assert_eq!(1, batch.num_columns()); + assert_eq!(batch.column(0).len(), limit); let expected: Vec = (0u32..limit as u32).collect(); assert_eq!( - *columns[0].as_any().downcast_ref::().unwrap(), - UInt32Array::from_slice(&expected) + *batch.column(0), + Arc::new(UInt32Vector::from_slice(&expected)) as VectorRef ); Ok(()) @@ -123,7 +125,8 @@ async fn test_udf() -> Result<()> { let pow = make_scalar_function(pow); let udf = create_udf( - "pow", + // datafusion already supports pow, so we use a different name. + "my_pow", vec![ ConcreteDataType::uint32_datatype(), ConcreteDataType::uint32_datatype(), @@ -136,7 +139,7 @@ async fn test_udf() -> Result<()> { engine.register_udf(udf); let plan = engine.sql_to_plan( - "select pow(number, number) as p from numbers limit 10", + "select my_pow(number, number) as p from numbers limit 10", Arc::new(QueryContext::new()), )?; @@ -148,202 +151,18 @@ async fn test_udf() -> Result<()> { let numbers = util::collect(recordbatch).await.unwrap(); assert_eq!(1, numbers.len()); - assert_eq!(numbers[0].df_recordbatch.num_columns(), 1); - assert_eq!(1, numbers[0].schema.arrow_schema().fields().len()); - assert_eq!("p", numbers[0].schema.arrow_schema().field(0).name()); + assert_eq!(numbers[0].num_columns(), 1); + assert_eq!(1, numbers[0].schema.num_columns()); + assert_eq!("p", numbers[0].schema.column_schemas()[0].name); - let columns = numbers[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 10); + let batch = &numbers[0]; + assert_eq!(1, batch.num_columns()); + assert_eq!(batch.column(0).len(), 10); let expected: Vec = vec![1, 1, 4, 27, 256, 3125, 46656, 823543, 16777216, 387420489]; assert_eq!( - *columns[0].as_any().downcast_ref::().unwrap(), - UInt32Array::from_slice(&expected) + *batch.column(0), + Arc::new(UInt32Vector::from_slice(&expected)) as VectorRef ); Ok(()) } - -fn create_query_engine() -> Arc { - let schema_provider = Arc::new(MemorySchemaProvider::new()); - let catalog_provider = Arc::new(MemoryCatalogProvider::new()); - let catalog_list = Arc::new(MemoryCatalogManager::default()); - - // create table with primitives, and all columns' length are even - let mut column_schemas = vec![]; - let mut columns = vec![]; - macro_rules! create_even_number_table { - ([], $( { $T:ty } ),*) => { - $( - let mut rng = rand::thread_rng(); - - let column_name = format!("{}_number_even", std::any::type_name::<$T>()); - let column_schema = ColumnSchema::new(column_name, Value::from(<$T>::default()).data_type(), true); - column_schemas.push(column_schema); - - let numbers = (1..=100).map(|_| rng.gen::<$T>()).collect::>(); - let column: VectorRef = Arc::new(PrimitiveVector::<$T>::from_vec(numbers.to_vec())); - columns.push(column); - )* - } - } - for_all_primitive_types! { create_even_number_table } - - let schema = Arc::new(Schema::new(column_schemas.clone())); - let recordbatch = RecordBatch::new(schema, columns).unwrap(); - let even_number_table = Arc::new(MemTable::new("even_numbers", recordbatch)); - schema_provider - .register_table( - even_number_table.table_name().to_string(), - even_number_table, - ) - .unwrap(); - - // create table with primitives, and all columns' length are odd - let mut column_schemas = vec![]; - let mut columns = vec![]; - macro_rules! 
create_odd_number_table { - ([], $( { $T:ty } ),*) => { - $( - let mut rng = rand::thread_rng(); - - let column_name = format!("{}_number_odd", std::any::type_name::<$T>()); - let column_schema = ColumnSchema::new(column_name, Value::from(<$T>::default()).data_type(), true); - column_schemas.push(column_schema); - - let numbers = (1..=99).map(|_| rng.gen::<$T>()).collect::>(); - let column: VectorRef = Arc::new(PrimitiveVector::<$T>::from_vec(numbers.to_vec())); - columns.push(column); - )* - } - } - for_all_primitive_types! { create_odd_number_table } - - let schema = Arc::new(Schema::new(column_schemas.clone())); - let recordbatch = RecordBatch::new(schema, columns).unwrap(); - let odd_number_table = Arc::new(MemTable::new("odd_numbers", recordbatch)); - schema_provider - .register_table(odd_number_table.table_name().to_string(), odd_number_table) - .unwrap(); - - catalog_provider - .register_schema(DEFAULT_SCHEMA_NAME.to_string(), schema_provider) - .unwrap(); - catalog_list - .register_catalog(DEFAULT_CATALOG_NAME.to_string(), catalog_provider) - .unwrap(); - - QueryEngineFactory::new(catalog_list).query_engine() -} - -async fn get_numbers_from_table<'s, T>( - column_name: &'s str, - table_name: &'s str, - engine: Arc, -) -> Vec> -where - T: PrimitiveElement, - for<'a> T: Scalar = T>, -{ - let sql = format!("SELECT {} FROM {}", column_name, table_name); - let plan = engine - .sql_to_plan(&sql, Arc::new(QueryContext::new())) - .unwrap(); - - let output = engine.execute(&plan).await.unwrap(); - let recordbatch_stream = match output { - Output::Stream(batch) => batch, - _ => unreachable!(), - }; - let numbers = util::collect(recordbatch_stream).await.unwrap(); - - let columns = numbers[0].df_recordbatch.columns(); - let column = VectorHelper::try_into_vector(&columns[0]).unwrap(); - let column: &::VectorType = unsafe { VectorHelper::static_cast(&column) }; - column - .iter_data() - .flatten() - .map(|x| OrdPrimitive::(x)) - .collect::>>() -} - -#[tokio::test] -async fn test_median_aggregator() -> Result<()> { - common_telemetry::init_default_ut_logging(); - - let engine = create_query_engine(); - - macro_rules! test_median { - ([], $( { $T:ty } ),*) => { - $( - let column_name = format!("{}_number_even", std::any::type_name::<$T>()); - test_median_success::<$T>(&column_name, "even_numbers", engine.clone()).await?; - - let column_name = format!("{}_number_odd", std::any::type_name::<$T>()); - test_median_success::<$T>(&column_name, "odd_numbers", engine.clone()).await?; - )* - } - } - for_all_primitive_types! 
{ test_median } - Ok(()) -} - -async fn test_median_success( - column_name: &str, - table_name: &str, - engine: Arc, -) -> Result<()> -where - T: PrimitiveElement, - for<'a> T: Scalar = T>, -{ - let result = execute_median(column_name, table_name, engine.clone()) - .await - .unwrap(); - assert_eq!(1, result.len()); - assert_eq!(result[0].df_recordbatch.num_columns(), 1); - assert_eq!(1, result[0].schema.arrow_schema().fields().len()); - assert_eq!("median", result[0].schema.arrow_schema().field(0).name()); - - let columns = result[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 1); - let v = VectorHelper::try_into_vector(&columns[0]).unwrap(); - assert_eq!(1, v.len()); - let median = v.get(0); - - let mut numbers = get_numbers_from_table::(column_name, table_name, engine.clone()).await; - numbers.sort(); - let len = numbers.len(); - let expected_median: Value = if len % 2 == 1 { - numbers[len / 2] - } else { - let a: f64 = NumCast::from(numbers[len / 2 - 1].as_primitive()).unwrap(); - let b: f64 = NumCast::from(numbers[len / 2].as_primitive()).unwrap(); - OrdPrimitive::(NumCast::from(a / 2.0 + b / 2.0).unwrap()) - } - .into(); - assert_eq!(expected_median, median); - Ok(()) -} - -async fn execute_median<'a>( - column_name: &'a str, - table_name: &'a str, - engine: Arc, -) -> RecordResult> { - let sql = format!( - "select MEDIAN({}) as median from {}", - column_name, table_name - ); - let plan = engine - .sql_to_plan(&sql, Arc::new(QueryContext::new())) - .unwrap(); - - let output = engine.execute(&plan).await.unwrap(); - let recordbatch_stream = match output { - Output::Stream(batch) => batch, - _ => unreachable!(), - }; - util::collect(recordbatch_stream).await -} diff --git a/src/query/tests/scipy_stats_norm_cdf_test.rs b/src/query/tests/scipy_stats_norm_cdf_test.rs index 815501a314cb..dee8f5c87ee3 100644 --- a/src/query/tests/scipy_stats_norm_cdf_test.rs +++ b/src/query/tests/scipy_stats_norm_cdf_test.rs @@ -18,11 +18,8 @@ mod function; use common_query::Output; use common_recordbatch::error::Result as RecordResult; use common_recordbatch::{util, RecordBatch}; -use datafusion::field_util::{FieldExt, SchemaExt}; use datatypes::for_all_primitive_types; -use datatypes::prelude::*; -use datatypes::types::PrimitiveElement; -use function::{create_query_engine, get_numbers_from_table}; +use datatypes::types::WrapperType; use num_traits::AsPrimitive; use query::error::Result; use query::QueryEngine; @@ -33,7 +30,7 @@ use statrs::statistics::Statistics; #[tokio::test] async fn test_scipy_stats_norm_cdf_aggregator() -> Result<()> { common_telemetry::init_default_ut_logging(); - let engine = create_query_engine(); + let engine = function::create_query_engine(); macro_rules! 
test_scipy_stats_norm_cdf { ([], $( { $T:ty } ),*) => { @@ -53,28 +50,15 @@ async fn test_scipy_stats_norm_cdf_success( engine: Arc, ) -> Result<()> where - T: PrimitiveElement + AsPrimitive, - for<'a> T: Scalar = T>, + T: WrapperType + AsPrimitive, { let result = execute_scipy_stats_norm_cdf(column_name, table_name, engine.clone()) .await .unwrap(); - assert_eq!(1, result.len()); - assert_eq!(result[0].df_recordbatch.num_columns(), 1); - assert_eq!(1, result[0].schema.arrow_schema().fields().len()); - assert_eq!( - "scipy_stats_norm_cdf", - result[0].schema.arrow_schema().field(0).name() - ); - - let columns = result[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 1); - let v = VectorHelper::try_into_vector(&columns[0]).unwrap(); - assert_eq!(1, v.len()); - let value = v.get(0); + let value = function::get_value_from_batches("scipy_stats_norm_cdf", result); - let numbers = get_numbers_from_table::(column_name, table_name, engine.clone()).await; + let numbers = + function::get_numbers_from_table::(column_name, table_name, engine.clone()).await; let expected_value = numbers.iter().map(|&n| n.as_()).collect::>(); let mean = expected_value.clone().mean(); let stddev = expected_value.std_dev(); diff --git a/src/query/tests/scipy_stats_norm_pdf.rs b/src/query/tests/scipy_stats_norm_pdf.rs index dd5e0fc7fc5b..03e4cf129220 100644 --- a/src/query/tests/scipy_stats_norm_pdf.rs +++ b/src/query/tests/scipy_stats_norm_pdf.rs @@ -18,11 +18,8 @@ mod function; use common_query::Output; use common_recordbatch::error::Result as RecordResult; use common_recordbatch::{util, RecordBatch}; -use datafusion::field_util::{FieldExt, SchemaExt}; use datatypes::for_all_primitive_types; -use datatypes::prelude::*; -use datatypes::types::PrimitiveElement; -use function::{create_query_engine, get_numbers_from_table}; +use datatypes::types::WrapperType; use num_traits::AsPrimitive; use query::error::Result; use query::QueryEngine; @@ -33,7 +30,7 @@ use statrs::statistics::Statistics; #[tokio::test] async fn test_scipy_stats_norm_pdf_aggregator() -> Result<()> { common_telemetry::init_default_ut_logging(); - let engine = create_query_engine(); + let engine = function::create_query_engine(); macro_rules! 
test_scipy_stats_norm_pdf { ([], $( { $T:ty } ),*) => { @@ -53,28 +50,15 @@ async fn test_scipy_stats_norm_pdf_success( engine: Arc, ) -> Result<()> where - T: PrimitiveElement + AsPrimitive, - for<'a> T: Scalar = T>, + T: WrapperType + AsPrimitive, { let result = execute_scipy_stats_norm_pdf(column_name, table_name, engine.clone()) .await .unwrap(); - assert_eq!(1, result.len()); - assert_eq!(result[0].df_recordbatch.num_columns(), 1); - assert_eq!(1, result[0].schema.arrow_schema().fields().len()); - assert_eq!( - "scipy_stats_norm_pdf", - result[0].schema.arrow_schema().field(0).name() - ); - - let columns = result[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 1); - let v = VectorHelper::try_into_vector(&columns[0]).unwrap(); - assert_eq!(1, v.len()); - let value = v.get(0); + let value = function::get_value_from_batches("scipy_stats_norm_pdf", result); - let numbers = get_numbers_from_table::(column_name, table_name, engine.clone()).await; + let numbers = + function::get_numbers_from_table::(column_name, table_name, engine.clone()).await; let expected_value = numbers.iter().map(|&n| n.as_()).collect::>(); let mean = expected_value.clone().mean(); let stddev = expected_value.std_dev(); diff --git a/src/script/Cargo.toml b/src/script/Cargo.toml index 43206c3ba573..3c3663264752 100644 --- a/src/script/Cargo.toml +++ b/src/script/Cargo.toml @@ -8,6 +8,7 @@ license = "Apache-2.0" default = ["python"] python = [ "dep:datafusion", + "dep:datafusion-common", "dep:datafusion-expr", "dep:datafusion-physical-expr", "dep:rustpython-vm", @@ -32,10 +33,10 @@ common-recordbatch = { path = "../common/recordbatch" } common-telemetry = { path = "../common/telemetry" } common-time = { path = "../common/time" } console = "0.15" -datafusion = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2", optional = true } -datafusion-common = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" } -datafusion-expr = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2", optional = true } -datafusion-physical-expr = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2", optional = true } +datafusion = { version = "14.0.0", optional = true } +datafusion-common = { version = "14.0.0", optional = true } +datafusion-expr = { version = "14.0.0", optional = true } +datafusion-physical-expr = { version = "14.0.0", optional = true } datatypes = { path = "../datatypes" } futures = "0.3" futures-util = "0.3" diff --git a/src/script/src/python/builtins/mod.rs b/src/script/src/python/builtins/mod.rs index 679d91289b52..4cd52cc609f6 100644 --- a/src/script/src/python/builtins/mod.rs +++ b/src/script/src/python/builtins/mod.rs @@ -20,10 +20,9 @@ mod test; use datafusion_common::{DataFusionError, ScalarValue}; use datafusion_expr::ColumnarValue as DFColValue; use datafusion_physical_expr::AggregateExpr; -use datatypes::arrow; use datatypes::arrow::array::ArrayRef; -use datatypes::arrow::compute::cast::CastOptions; -use datatypes::arrow::datatypes::DataType; +use datatypes::arrow::compute; +use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field}; use datatypes::vectors::Helper as HelperVec; use rustpython_vm::builtins::{PyBaseExceptionRef, PyBool, PyFloat, PyInt, PyList, PyStr}; use rustpython_vm::{pymodule, AsObject, PyObjectRef, PyPayload, PyResult, VirtualMachine}; @@ -36,7 +35,7 @@ fn type_cast_error(name: &str, ty: &str, vm: &VirtualMachine) -> PyBaseException 
vm.new_type_error(format!("Can't cast operand of type `{name}` into `{ty}`.")) } -fn collect_diff_types_string(values: &[ScalarValue], ty: &DataType) -> String { +fn collect_diff_types_string(values: &[ScalarValue], ty: &ArrowDataType) -> String { values .iter() .enumerate() @@ -55,6 +54,10 @@ fn collect_diff_types_string(values: &[ScalarValue], ty: &DataType) -> String { .unwrap_or_else(|| "Nothing".to_string()) } +fn new_item_field(data_type: ArrowDataType) -> Field { + Field::new("item", data_type, false) +} + /// try to turn a Python Object into a PyVector or a scalar that can be use for calculate /// /// supported scalar are(leftside is python data type, right side is rust type): @@ -108,7 +111,7 @@ pub fn try_into_columnar_value(obj: PyObjectRef, vm: &VirtualMachine) -> PyResul // TODO(dennis): empty list, we set type as null. return Ok(DFColValue::Scalar(ScalarValue::List( None, - Box::new(DataType::Null), + Box::new(new_item_field(ArrowDataType::Null)), ))); } @@ -120,8 +123,8 @@ pub fn try_into_columnar_value(obj: PyObjectRef, vm: &VirtualMachine) -> PyResul ))); } Ok(DFColValue::Scalar(ScalarValue::List( - Some(Box::new(ret)), - Box::new(ty), + Some(ret), + Box::new(new_item_field(ty)), ))) } else { Err(vm.new_type_error(format!( @@ -183,22 +186,14 @@ fn scalar_val_try_into_py_obj(val: ScalarValue, vm: &VirtualMachine) -> PyResult fn all_to_f64(col: DFColValue, vm: &VirtualMachine) -> PyResult { match col { DFColValue::Array(arr) => { - let res = arrow::compute::cast::cast( - arr.as_ref(), - &DataType::Float64, - CastOptions { - wrapped: true, - partial: true, - }, - ) - .map_err(|err| { + let res = compute::cast(&arr, &ArrowDataType::Float64).map_err(|err| { vm.new_type_error(format!( "Arrow Type Cast Fail(from {:#?} to {:#?}): {err:#?}", arr.data_type(), - DataType::Float64 + ArrowDataType::Float64 )) })?; - Ok(DFColValue::Array(res.into())) + Ok(DFColValue::Array(res)) } DFColValue::Scalar(val) => { let val_in_f64 = match val { @@ -209,7 +204,7 @@ fn all_to_f64(col: DFColValue, vm: &VirtualMachine) -> PyResult { return Err(vm.new_type_error(format!( "Can't cast type {:#?} to {:#?}", val.get_datatype(), - DataType::Float64 + ArrowDataType::Float64 ))) } }; @@ -283,17 +278,16 @@ pub(crate) mod greptime_builtin { // P.S.: not extract to file because not-inlined proc macro attribute is *unstable* use std::sync::Arc; + use arrow::compute::kernels::{aggregate, boolean, comparison}; use common_function::scalars::function::FunctionContext; use common_function::scalars::math::PowFunction; use common_function::scalars::{Function, FunctionRef, FUNCTION_REGISTRY}; - use datafusion::arrow::compute::comparison::{gt_eq_scalar, lt_eq_scalar}; - use datafusion::arrow::datatypes::DataType; - use datafusion::arrow::error::ArrowError; - use datafusion::arrow::scalar::{PrimitiveScalar, Scalar}; + use datafusion::arrow::datatypes::DataType as ArrowDataType; use datafusion::physical_plan::expressions; use datafusion_expr::ColumnarValue as DFColValue; use datafusion_physical_expr::math_expressions; - use datatypes::arrow::array::{ArrayRef, NullArray}; + use datatypes::arrow::array::{ArrayRef, Int64Array, NullArray}; + use datatypes::arrow::error::ArrowError; use datatypes::arrow::{self, compute}; use datatypes::vectors::{ConstantVector, Float64Vector, Helper, Int64Vector, VectorRef}; use paste::paste; @@ -386,11 +380,6 @@ pub(crate) mod greptime_builtin { eval_func("clip", &[v0, v1, v2], vm) } - #[pyfunction] - fn median(v: PyVectorRef, vm: &VirtualMachine) -> PyResult { - eval_aggr_func("median", 
&[v], vm) - } - #[pyfunction] fn diff(v: PyVectorRef, vm: &VirtualMachine) -> PyResult { eval_aggr_func("diff", &[v], vm) @@ -552,7 +541,7 @@ pub(crate) mod greptime_builtin { fn random(len: usize, vm: &VirtualMachine) -> PyResult { // This is in a proc macro so using full path to avoid strange things // more info at: https://doc.rust-lang.org/reference/procedural-macros.html#procedural-macro-hygiene - let arg = NullArray::new(arrow::datatypes::DataType::Null, len); + let arg = NullArray::new(len); let args = &[DFColValue::Array(std::sync::Arc::new(arg) as _)]; let res = math_expressions::random(args).map_err(|err| from_df_err(err, vm))?; let ret = try_into_py_obj(res, vm)?; @@ -571,6 +560,17 @@ pub(crate) mod greptime_builtin { ); } + #[pyfunction] + fn median(values: PyVectorRef, vm: &VirtualMachine) -> PyResult { + bind_aggr_fn!( + Median, + vm, + &[values.to_arrow_array()], + values.to_arrow_array().data_type(), + expr0 + ); + } + /// Not implement in datafusion /// TODO(discord9): use greptime's own impl instead /* @@ -807,12 +807,16 @@ pub(crate) mod greptime_builtin { Ok(res.into()) } - fn gen_none_array(data_type: DataType, len: usize, vm: &VirtualMachine) -> PyResult { + fn gen_none_array( + data_type: ArrowDataType, + len: usize, + vm: &VirtualMachine, + ) -> PyResult { macro_rules! match_none_array { ($VAR:ident, $LEN: ident, [$($TY:ident),*]) => { paste!{ match $VAR{ - $(DataType::$TY => Arc::new(arrow::array::[<$TY Array>]::from(vec![None;$LEN])), )* + $(ArrowDataType::$TY => Arc::new(arrow::array::[<$TY Array>]::from(vec![None;$LEN])), )* _ => return Err(vm.new_type_error(format!("gen_none_array() does not support {:?}", data_type))) } } @@ -828,10 +832,10 @@ pub(crate) mod greptime_builtin { #[pyfunction] fn prev(cur: PyVectorRef, vm: &VirtualMachine) -> PyResult { - let cur: ArrayRef = cur.to_arrow_array(); + let cur = cur.to_arrow_array(); if cur.len() == 0 { let ret = cur.slice(0, 0); - let ret = Helper::try_into_vector(&*ret).map_err(|e| { + let ret = Helper::try_into_vector(ret.clone()).map_err(|e| { vm.new_type_error(format!( "Can't cast result into vector, result: {:?}, err: {:?}", ret, e @@ -841,10 +845,10 @@ pub(crate) mod greptime_builtin { } let cur = cur.slice(0, cur.len() - 1); // except the last one that is let fill = gen_none_array(cur.data_type().to_owned(), 1, vm)?; - let ret = compute::concatenate::concatenate(&[&*fill, &*cur]).map_err(|err| { + let ret = compute::concat(&[&*fill, &*cur]).map_err(|err| { vm.new_runtime_error(format!("Can't concat array[0] with array[0:-1]!{err:#?}")) })?; - let ret = Helper::try_into_vector(&*ret).map_err(|e| { + let ret = Helper::try_into_vector(ret.clone()).map_err(|e| { vm.new_type_error(format!( "Can't cast result into vector, result: {:?}, err: {:?}", ret, e @@ -855,10 +859,10 @@ pub(crate) mod greptime_builtin { #[pyfunction] fn next(cur: PyVectorRef, vm: &VirtualMachine) -> PyResult { - let cur: ArrayRef = cur.to_arrow_array(); + let cur = cur.to_arrow_array(); if cur.len() == 0 { let ret = cur.slice(0, 0); - let ret = Helper::try_into_vector(&*ret).map_err(|e| { + let ret = Helper::try_into_vector(ret.clone()).map_err(|e| { vm.new_type_error(format!( "Can't cast result into vector, result: {:?}, err: {:?}", ret, e @@ -868,10 +872,10 @@ pub(crate) mod greptime_builtin { } let cur = cur.slice(1, cur.len() - 1); // except the last one that is let fill = gen_none_array(cur.data_type().to_owned(), 1, vm)?; - let ret = compute::concatenate::concatenate(&[&*cur, &*fill]).map_err(|err| { + let ret = compute::concat(&[&*cur, 
&*fill]).map_err(|err| { vm.new_runtime_error(format!("Can't concat array[0] with array[0:-1]!{err:#?}")) })?; - let ret = Helper::try_into_vector(&*ret).map_err(|e| { + let ret = Helper::try_into_vector(ret.clone()).map_err(|e| { vm.new_type_error(format!( "Can't cast result into vector, result: {:?}, err: {:?}", ret, e @@ -880,55 +884,24 @@ pub(crate) mod greptime_builtin { Ok(ret.into()) } - fn try_scalar_to_value(scalar: &dyn Scalar, vm: &VirtualMachine) -> PyResult<i64> { - let ty_error = |s: String| vm.new_type_error(s); - scalar - .as_any() - .downcast_ref::<PrimitiveScalar<i64>>() - .ok_or_else(|| { - ty_error(format!( - "expect scalar to be i64, found{:?}", - scalar.data_type() - )) - })? - .value() - .ok_or_else(|| ty_error("All element is Null in a time series array".to_string())) - } - /// generate interval time point fn gen_inteveral( - oldest: &dyn Scalar, - newest: &dyn Scalar, + oldest: i64, + newest: i64, duration: i64, vm: &VirtualMachine, - ) -> PyResult<Vec<PrimitiveScalar<i64>>> { - use datatypes::arrow::datatypes::DataType; - match (oldest.data_type(), newest.data_type()) { - (DataType::Int64, DataType::Int64) => (), - _ => { - return Err(vm.new_type_error(format!( - "Expect int64, found {:?} and {:?}", - oldest.data_type(), - newest.data_type() - ))); - } - } - - let oldest = try_scalar_to_value(oldest, vm)?; - let newest = try_scalar_to_value(newest, vm)?; + ) -> PyResult<Vec<i64>> { if oldest > newest { return Err(vm.new_value_error(format!("{oldest} is greater than {newest}"))); } - let ret = if duration > 0 { - (oldest..=newest) + if duration > 0 { + let ret = (oldest..=newest) .step_by(duration as usize) - .map(|v| PrimitiveScalar::new(DataType::Int64, Some(v))) - .collect::<Vec<_>>() + .collect::<Vec<_>>(); + Ok(ret) } else { - return Err(vm.new_value_error(format!("duration: {duration} is not positive number."))); - }; - - Ok(ret) + Err(vm.new_value_error(format!("duration: {duration} is not a positive number."))) + } } /// `func`: exec on sliding window slice of given `arr`, expect it to always return a PyVector of one element @@ -951,12 +924,19 @@ pub(crate) mod greptime_builtin { let arrow_error = |err: ArrowError| vm.new_runtime_error(format!("Arrow Error: {err:#?}")); let datatype_error = |err: datatypes::Error| vm.new_runtime_error(format!("DataType Errors!: {err:#?}")); - let ts: ArrayRef = ts.to_arrow_array(); - let arr: ArrayRef = arr.to_arrow_array(); + let ts_array_ref: ArrayRef = ts.to_arrow_array(); + let ts = ts_array_ref + .as_any() + .downcast_ref::<Int64Array>() + .ok_or_else(|| { + vm.new_type_error(format!("ts must be int64, found: {:?}", ts_array_ref)) + })?; let slices = { - let oldest = compute::aggregate::min(&*ts).map_err(arrow_error)?; - let newest = compute::aggregate::max(&*ts).map_err(arrow_error)?; - gen_inteveral(&*oldest, &*newest, duration, vm)? + let oldest = aggregate::min(ts) + .ok_or_else(|| vm.new_runtime_error("ts must have a min value".to_string()))?; + let newest = aggregate::max(ts) + .ok_or_else(|| vm.new_runtime_error("ts must have a max value".to_string()))?; + gen_inteveral(oldest, newest, duration, vm)?
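For reference, the shape of the arrow-rs kernels this hunk switches to; a minimal sketch assuming the arrow 26 module paths used throughout this diff (`ts_bounds` and `ts_window_mask` are illustrative names, not code from this PR):

```rust
use arrow::array::{BooleanArray, Int64Array};
use arrow::compute::kernels::{aggregate, boolean, comparison};
use arrow::error::Result as ArrowResult;

// arrow-rs aggregates return Option<T::Native> (None for an empty or
// all-null array) instead of arrow2's Result<Box<dyn Scalar>>, hence the
// switch from map_err to ok_or_else in the hunk above.
fn ts_bounds(ts: &Int64Array) -> Option<(i64, i64)> {
    Some((aggregate::min(ts)?, aggregate::max(ts)?))
}

// Builds a window mask `lo <= ts && ts <= hi` from the typed scalar
// comparison kernels, as the windowing code below does per interval.
fn ts_window_mask(ts: &Int64Array, lo: i64, hi: i64) -> ArrowResult<BooleanArray> {
    let ge = comparison::gt_eq_scalar(ts, lo)?;
    let le = comparison::lt_eq_scalar(ts, hi)?;
    boolean::and(&ge, &le)
}
```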
}; let windows = { @@ -968,11 +948,15 @@ pub(crate) mod greptime_builtin { it }) .map(|(first, second)| { - compute::boolean::and(>_eq_scalar(&*ts, first), <_eq_scalar(&*ts, second)) - .map_err(arrow_error) + let left = comparison::gt_eq_scalar(ts, *first).map_err(arrow_error)?; + let right = comparison::lt_eq_scalar(ts, *second).map_err(arrow_error)?; + boolean::and(&left, &right).map_err(arrow_error) }) .map(|mask| match mask { - Ok(mask) => compute::filter::filter(&*arr, &mask).map_err(arrow_error), + Ok(mask) => { + let arrow_arr = arr.to_arrow_array(); + compute::filter(&arrow_arr, &mask).map_err(arrow_error) + } Err(e) => Err(e), }) .collect::, _>>()? @@ -1012,16 +996,17 @@ pub(crate) mod greptime_builtin { .map(apply_interval_function) .collect::, _>>()?; - // 3. get returen vector and concat them - let ret = fn_results - .into_iter() - .try_reduce(|acc, x| { - compute::concatenate::concatenate(&[acc.as_ref(), x.as_ref()]).map(Arc::from) - }) - .map_err(arrow_error)? - .unwrap_or_else(|| Arc::from(arr.slice(0, 0))); + // 3. get returned vector and concat them + let result_arrays: Vec<_> = fn_results + .iter() + .map(|vector| vector.to_arrow_array()) + .collect(); + let result_dyn_arrays: Vec<_> = result_arrays.iter().map(|v| v.as_ref()).collect(); + let concat_array = compute::concat(&result_dyn_arrays).map_err(arrow_error)?; + let vector = Helper::try_into_vector(concat_array).map_err(datatype_error)?; + // 4. return result vector - Ok(Helper::try_into_vector(ret).map_err(datatype_error)?.into()) + Ok(PyVector::from(vector)) } /// return first element in a `PyVector` in sliced new `PyVector`, if vector's length is zero, return a zero sized slice instead @@ -1032,7 +1017,7 @@ pub(crate) mod greptime_builtin { 0 => arr.slice(0, 0), _ => arr.slice(0, 1), }; - let ret = Helper::try_into_vector(&*ret).map_err(|e| { + let ret = Helper::try_into_vector(ret.clone()).map_err(|e| { vm.new_type_error(format!( "Can't cast result into vector, result: {:?}, err: {:?}", ret, e @@ -1049,7 +1034,7 @@ pub(crate) mod greptime_builtin { 0 => arr.slice(0, 0), _ => arr.slice(arr.len() - 1, 1), }; - let ret = Helper::try_into_vector(&*ret).map_err(|e| { + let ret = Helper::try_into_vector(ret.clone()).map_err(|e| { vm.new_type_error(format!( "Can't cast result into vector, result: {:?}, err: {:?}", ret, e diff --git a/src/script/src/python/builtins/test.rs b/src/script/src/python/builtins/test.rs index 39caf399e222..16828ba8836f 100644 --- a/src/script/src/python/builtins/test.rs +++ b/src/script/src/python/builtins/test.rs @@ -19,10 +19,10 @@ use std::path::Path; use std::sync::Arc; use common_telemetry::{error, info}; -use datatypes::arrow::array::{Float64Array, Int64Array, PrimitiveArray}; -use datatypes::arrow::compute::cast::CastOptions; -use datatypes::arrow::datatypes::DataType; -use datatypes::vectors::VectorRef; +use datatypes::arrow::array::{Float64Array, Int64Array}; +use datatypes::arrow::compute; +use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field}; +use datatypes::vectors::{Float64Vector, Int64Vector, VectorRef}; use ron::from_str as from_ron_string; use rustpython_vm::builtins::{PyFloat, PyInt, PyList}; use rustpython_vm::class::PyClassImpl; @@ -69,17 +69,17 @@ fn convert_scalar_to_py_obj_and_back() { panic!("Convert errors, expect 1") } let col = DFColValue::Scalar(ScalarValue::List( - Some(Box::new(vec![ + Some(vec![ ScalarValue::Int64(Some(1)), ScalarValue::Int64(Some(2)), - ])), - Box::new(DataType::Int64), + ]), + Box::new(Field::new("item", ArrowDataType::Int64, 
false)), )); let to = try_into_py_obj(col, vm).unwrap(); let back = try_into_columnar_value(to, vm).unwrap(); - if let DFColValue::Scalar(ScalarValue::List(Some(list), ty)) = back { + if let DFColValue::Scalar(ScalarValue::List(Some(list), field)) = back { assert_eq!(list.len(), 2); - assert_eq!(ty.as_ref(), &DataType::Int64); + assert_eq!(*field.data_type(), ArrowDataType::Int64); } let list: Vec = vec![vm.ctx.new_int(1).into(), vm.ctx.new_int(2).into()]; let nested_list: Vec = @@ -93,12 +93,10 @@ fn convert_scalar_to_py_obj_and_back() { )); } - let list: PyVector = PyVector::from( - HelperVec::try_into_vector( - Arc::new(PrimitiveArray::from_slice([0.1f64, 0.2, 0.3, 0.4])) as ArrayRef, - ) - .unwrap(), - ); + let list: PyVector = + PyVector::from( + Arc::new(Float64Vector::from_slice([0.1f64, 0.2, 0.3, 0.4])) as VectorRef + ); let nested_list: Vec = vec![list.into_pyobject(vm), vm.ctx.new_int(3).into()]; let list_obj = vm.ctx.new_list(nested_list).into(); let expect_err = try_into_columnar_value(list_obj, vm); @@ -116,7 +114,7 @@ struct TestCase { #[derive(Debug, Serialize, Deserialize)] struct Var { value: PyValue, - ty: DataType, + ty: ArrowDataType, } /// for floating number comparison @@ -190,25 +188,25 @@ impl PyValue { } } -fn is_float(ty: &DataType) -> bool { +fn is_float(ty: &ArrowDataType) -> bool { matches!( ty, - DataType::Float16 | DataType::Float32 | DataType::Float64 + ArrowDataType::Float16 | ArrowDataType::Float32 | ArrowDataType::Float64 ) } /// unsigned included -fn is_int(ty: &DataType) -> bool { +fn is_int(ty: &ArrowDataType) -> bool { matches!( ty, - DataType::UInt8 - | DataType::UInt16 - | DataType::UInt32 - | DataType::UInt64 - | DataType::Int8 - | DataType::Int16 - | DataType::Int32 - | DataType::Int64 + ArrowDataType::UInt8 + | ArrowDataType::UInt16 + | ArrowDataType::UInt32 + | ArrowDataType::UInt64 + | ArrowDataType::Int8 + | ArrowDataType::Int16 + | ArrowDataType::Int32 + | ArrowDataType::Int64 ) } @@ -218,7 +216,7 @@ impl PyValue { PyValue::FloatVec(v) => { Arc::new(datatypes::vectors::Float64Vector::from_vec(v.clone())) } - PyValue::IntVec(v) => Arc::new(datatypes::vectors::Int64Vector::from_vec(v.clone())), + PyValue::IntVec(v) => Arc::new(Int64Vector::from_vec(v.clone())), PyValue::Int(v) => return Ok(vm.ctx.new_int(*v).into()), PyValue::Float(v) => return Ok(vm.ctx.new_float(*v).into()), Self::Bool(v) => return Ok(vm.ctx.new_bool(*v).into()), @@ -235,16 +233,9 @@ impl PyValue { let res = res.to_arrow_array(); let ty = res.data_type(); if is_float(ty) { - let vec_f64 = arrow::compute::cast::cast( - res.as_ref(), - &DataType::Float64, - CastOptions { - wrapped: true, - partial: true, - }, - ) - .map_err(|err| format!("{err:#?}"))?; - assert_eq!(vec_f64.data_type(), &DataType::Float64); + let vec_f64 = compute::cast(&res, &ArrowDataType::Float64) + .map_err(|err| format!("{err:#?}"))?; + assert_eq!(vec_f64.data_type(), &ArrowDataType::Float64); let vec_f64 = vec_f64 .as_any() .downcast_ref::() @@ -252,13 +243,6 @@ impl PyValue { let ret = vec_f64 .into_iter() .map(|v| v.map(|inner| inner.to_owned())) - /* .enumerate() - .map(|(idx, v)| { - v.ok_or(format!( - "No null element expected, found one in {idx} position" - )) - .map(|v| v.to_owned()) - })*/ .collect::>(); if ret.iter().all(|x| x.is_some()) { Ok(Self::FloatVec( @@ -268,16 +252,9 @@ impl PyValue { Ok(Self::FloatVecWithNull(ret)) } } else if is_int(ty) { - let vec_int = arrow::compute::cast::cast( - res.as_ref(), - &DataType::Int64, - CastOptions { - wrapped: true, - partial: true, - }, - ) - 
.map_err(|err| format!("{err:#?}"))?; - assert_eq!(vec_int.data_type(), &DataType::Int64); + let vec_int = compute::cast(&res, &ArrowDataType::Int64) + .map_err(|err| format!("{err:#?}"))?; + assert_eq!(vec_int.data_type(), &ArrowDataType::Int64); let vec_i64 = vec_int .as_any() .downcast_ref::<Int64Array>() @@ -294,7 +271,7 @@ impl PyValue { .collect::>()?; Ok(Self::IntVec(ret)) } else { - Err(format!("unspupported DataType:{ty:#?}")) + Err(format!("unsupported ArrowDataType:{ty:#?}")) } } else if is_instance::(obj, vm) { let res = obj diff --git a/src/script/src/python/coprocessor.rs b/src/script/src/python/coprocessor.rs index 3bc5c39f2a5c..3dcc34856216 100644 --- a/src/script/src/python/coprocessor.rs +++ b/src/script/src/python/coprocessor.rs @@ -16,19 +16,18 @@ pub mod compile; pub mod parse; use std::cell::RefCell; -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; use std::result::Result as StdResult; use std::sync::Arc; use common_recordbatch::RecordBatch; use common_telemetry::info; -use datafusion_common::record_batch::RecordBatch as DfRecordBatch; -use datatypes::arrow; -use datatypes::arrow::array::{Array, ArrayRef}; -use datatypes::arrow::compute::cast::CastOptions; -use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; -use datatypes::schema::Schema; -use datatypes::vectors::{BooleanVector, Helper, StringVector, Vector, VectorRef}; +use datatypes::arrow::array::Array; +use datatypes::arrow::compute; +use datatypes::arrow::datatypes::DataType as ArrowDataType; +use datatypes::data_type::{ConcreteDataType, DataType}; +use datatypes::schema::{ColumnSchema, Schema, SchemaRef}; +use datatypes::vectors::{Helper, VectorRef}; use rustpython_compiler_core::CodeObject; use rustpython_vm as vm; use rustpython_vm::class::PyClassImpl; @@ -43,7 +42,8 @@ use vm::{Interpreter, PyObjectRef, VirtualMachine}; use crate::python::builtins::greptime_builtin; use crate::python::coprocessor::parse::DecoratorArgs; use crate::python::error::{ - ensure, ret_other_error_with, ArrowSnafu, OtherSnafu, Result, TypeCastSnafu, + ensure, ret_other_error_with, ArrowSnafu, NewRecordBatchSnafu, OtherSnafu, Result, + TypeCastSnafu, }; use crate::python::utils::{format_py_error, is_instance, py_vec_obj_to_array}; use crate::python::PyVector; @@ -54,7 +54,8 @@ thread_local!(static INTERPRETER: RefCell>> = RefCell::n #[derive(Debug, Clone, PartialEq, Eq)] pub struct AnnotationInfo { /// if None, use types inferred by PyVector - pub datatype: Option, + // TODO(yingwen): We should use our data type. i.e. ConcreteDataType.
+ pub datatype: Option, pub is_nullable: bool, } @@ -95,7 +96,7 @@ impl Coprocessor { /// generate [`Schema`] according to return names, types, /// if no annotation /// the datatypes of the actual columns is used directly - fn gen_schema(&self, cols: &[ArrayRef]) -> Result> { + fn gen_schema(&self, cols: &[VectorRef]) -> Result { let names = &self.deco_args.ret_names; let anno = &self.return_types; ensure!( @@ -109,35 +110,38 @@ impl Coprocessor { ) } ); - Ok(Arc::new(ArrowSchema::from( - names - .iter() - .enumerate() - .map(|(idx, name)| { - let real_ty = cols[idx].data_type().to_owned(); - let AnnotationInfo { - datatype: ty, - is_nullable, - } = anno[idx].to_owned().unwrap_or_else(|| { - // default to be not nullable and use DataType inferred by PyVector itself - AnnotationInfo { - datatype: Some(real_ty.to_owned()), - is_nullable: false, - } - }); - Field::new( - name, - // if type is like `_` or `_ | None` - ty.unwrap_or(real_ty), - is_nullable, - ) - }) - .collect::>(), - ))) + + let column_schemas = names + .iter() + .enumerate() + .map(|(idx, name)| { + let real_ty = cols[idx].data_type(); + let AnnotationInfo { + datatype: ty, + is_nullable, + } = anno[idx].to_owned().unwrap_or_else(|| { + // default to be not nullable and use DataType inferred by PyVector itself + AnnotationInfo { + datatype: Some(real_ty.as_arrow_type()), + is_nullable: false, + } + }); + let column_type = match ty { + Some(arrow_type) => { + ConcreteDataType::try_from(&arrow_type).context(TypeCastSnafu)? + } + // if type is like `_` or `_ | None` + None => real_ty, + }; + Ok(ColumnSchema::new(name, column_type, is_nullable)) + }) + .collect::>>()?; + + Ok(Arc::new(Schema::new(column_schemas))) } /// check if real types and annotation types(if have) is the same, if not try cast columns to annotated type - fn check_and_cast_type(&self, cols: &mut [ArrayRef]) -> Result<()> { + fn check_and_cast_type(&self, cols: &mut [VectorRef]) -> Result<()> { let return_types = &self.return_types; // allow ignore Return Type Annotation if return_types.is_empty() { @@ -161,21 +165,10 @@ impl Coprocessor { { let real_ty = col.data_type(); let anno_ty = datatype; - if real_ty != anno_ty { - { - // This`CastOption` allow for overflowly cast and int to float loosely cast etc.., - // check its doc for more information - *col = arrow::compute::cast::cast( - col.as_ref(), - anno_ty, - CastOptions { - wrapped: true, - partial: true, - }, - ) - .context(ArrowSnafu)? 
- .into(); - } + if real_ty.as_arrow_type() != *anno_ty { + let array = col.to_arrow_array(); + let array = compute::cast(&array, anno_ty).context(ArrowSnafu)?; + *col = Helper::try_into_vector(array).context(TypeCastSnafu)?; } } } @@ -183,47 +176,6 @@ impl Coprocessor { } } -/// cast a `dyn Array` of type unsigned/int/float into a `dyn Vector` -fn try_into_vector(arg: Arc) -> Result> { - // wrap try_into_vector in here to convert `datatypes::error::Error` to `python::error::Error` - Helper::try_into_vector(arg).context(TypeCastSnafu) -} - -/// convert a `Vec` into a `Vec` only when they are of supported types -/// PyVector now only support unsigned&int8/16/32/64, float32/64 and bool when doing meanful arithmetics operation -fn try_into_py_vector(fetch_args: Vec) -> Result> { - let mut args: Vec = Vec::with_capacity(fetch_args.len()); - for (idx, arg) in fetch_args.into_iter().enumerate() { - let v: VectorRef = match arg.data_type() { - DataType::Float32 => try_into_vector::(arg)?, - DataType::Float64 => try_into_vector::(arg)?, - DataType::UInt8 => try_into_vector::(arg)?, - DataType::UInt16 => try_into_vector::(arg)?, - DataType::UInt32 => try_into_vector::(arg)?, - DataType::UInt64 => try_into_vector::(arg)?, - DataType::Int8 => try_into_vector::(arg)?, - DataType::Int16 => try_into_vector::(arg)?, - DataType::Int32 => try_into_vector::(arg)?, - DataType::Int64 => try_into_vector::(arg)?, - DataType::Utf8 => { - Arc::new(StringVector::try_from_arrow_array(arg).context(TypeCastSnafu)?) as _ - } - DataType::Boolean => { - Arc::new(BooleanVector::try_from_arrow_array(arg).context(TypeCastSnafu)?) as _ - } - _ => { - return ret_other_error_with(format!( - "Unsupported data type at column {idx}: {:?} for coprocessor", - arg.data_type() - )) - .fail() - } - }; - args.push(PyVector::from(v)); - } - Ok(args) -} - /// convert a tuple of `PyVector` or one `PyVector`(wrapped in a Python Object Ref[`PyObjectRef`]) /// to a `Vec` /// by default, a constant(int/float/bool) gives the a constant array of same length with input args @@ -231,7 +183,7 @@ fn try_into_columns( obj: &PyObjectRef, vm: &VirtualMachine, col_len: usize, -) -> Result> { +) -> Result> { if is_instance::(obj, vm) { let tuple = obj.payload::().with_context(|| { ret_other_error_with(format!("can't cast obj {:?} to PyTuple)", obj)) @@ -239,7 +191,7 @@ fn try_into_columns( let cols = tuple .iter() .map(|obj| py_vec_obj_to_array(obj, vm, col_len)) - .collect::>>()?; + .collect::>>()?; Ok(cols) } else { let col = py_vec_obj_to_array(obj, vm, col_len)?; @@ -249,27 +201,16 @@ fn try_into_columns( /// select columns according to `fetch_names` from `rb` /// and cast them into a Vec of PyVector -fn select_from_rb(rb: &DfRecordBatch, fetch_names: &[String]) -> Result> { - let field_map: HashMap<&String, usize> = rb - .schema() - .fields - .iter() - .enumerate() - .map(|(idx, field)| (&field.name, idx)) - .collect(); - let fetch_idx: Vec = fetch_names +fn select_from_rb(rb: &RecordBatch, fetch_names: &[String]) -> Result> { + fetch_names .iter() - .map(|field| { - field_map.get(field).copied().context(OtherSnafu { - reason: format!("Can't found field name {field}"), - }) + .map(|name| { + let vector = rb.column_by_name(name).with_context(|| OtherSnafu { + reason: format!("Can't find field name {}", name), + })?; + Ok(PyVector::from(vector.clone())) }) - .collect::>>()?; - let fetch_args: Vec> = fetch_idx - .into_iter() - .map(|idx| rb.column(idx).clone()) - .collect(); - try_into_py_vector(fetch_args) + .collect() } /// match between arguments' 
real type and annotation types @@ -277,12 +218,12 @@ fn select_from_rb(rb: &DfRecordBatch, fetch_names: &[String]) -> Result<Vec<PyVector>> fn check_args_anno_real_type( args: &[PyVector], copr: &Coprocessor, - rb: &DfRecordBatch, + rb: &RecordBatch, ) -> Result<()> { for (idx, arg) in args.iter().enumerate() { let anno_ty = copr.arg_types[idx].to_owned(); let real_ty = arg.to_arrow_array().data_type().to_owned(); - let is_nullable: bool = rb.schema().fields[idx].is_nullable; + let is_nullable: bool = rb.schema.column_schemas()[idx].is_nullable(); ensure!( anno_ty .to_owned() @@ -323,31 +264,32 @@ fn set_items_in_scope( /// The coprocessor function accept a python script and a Record Batch: /// ## What it does -/// 1. it take a python script and a [`DfRecordBatch`], extract columns and annotation info according to `args` given in decorator in python script +/// 1. it takes a python script and a [`RecordBatch`], extracts columns and annotation info according to `args` given in the decorator in the python script /// 2. execute python code and return a vector or a tuple of vector, -/// 3. the returning vector(s) is assembled into a new [`DfRecordBatch`] according to `returns` in python decorator and return to caller +/// 3. the returned vector(s) are assembled into a new [`RecordBatch`] according to `returns` in the python decorator and returned to the caller /// /// # Example /// /// ```ignore /// use std::sync::Arc; -/// use datafusion_common::record_batch::RecordBatch as DfRecordBatch; -/// use datatypes::arrow::array::PrimitiveArray; -/// use datatypes::arrow::datatypes::{DataType, Field, Schema}; +/// use common_recordbatch::RecordBatch; +/// use datatypes::prelude::*; +/// use datatypes::schema::{ColumnSchema, Schema}; +/// use datatypes::vectors::{Float32Vector, Float64Vector}; /// use common_function::scalars::python::exec_coprocessor; /// let python_source = r#" /// @copr(args=["cpu", "mem"], returns=["perf", "what"]) /// def a(cpu, mem): /// return cpu + mem, cpu - mem /// "#; -/// let cpu_array = PrimitiveArray::from_slice([0.9f32, 0.8, 0.7, 0.6]); -/// let mem_array = PrimitiveArray::from_slice([0.1f64, 0.2, 0.3, 0.4]); -/// let schema = Arc::new(Schema::from(vec![ -/// Field::new("cpu", DataType::Float32, false), -/// Field::new("mem", DataType::Float64, false), +/// let cpu_array = Float32Vector::from_slice([0.9f32, 0.8, 0.7, 0.6]); +/// let mem_array = Float64Vector::from_slice([0.1f64, 0.2, 0.3, 0.4]); +/// let schema = Arc::new(Schema::new(vec![ +/// ColumnSchema::new("cpu", ConcreteDataType::float32_datatype(), false), +/// ColumnSchema::new("mem", ConcreteDataType::float64_datatype(), false), /// ])); /// let rb = -/// DfRecordBatch::try_new(schema, vec![Arc::new(cpu_array), Arc::new(mem_array)]).unwrap(); +/// RecordBatch::new(schema, vec![Arc::new(cpu_array), Arc::new(mem_array)]).unwrap(); /// let ret = exec_coprocessor(python_source, &rb).unwrap(); /// assert_eq!(ret.column(0).len(), 4); /// ``` @@ -357,7 +299,7 @@ fn set_items_in_scope( /// /// Currently support types are `u8`, `u16`, `u32`, `u64`, `i8`, `i16`, `i32`, `i64` and `f16`, `f32`, `f64` /// -/// use `f64 | None` to mark if returning column is nullable like in [`DfRecordBatch`]'s schema's [`Field`]'s is_nullable +/// use `f64 | None` to mark the returning column as nullable, like the is_nullable flag in the [`RecordBatch`] schema's [`ColumnSchema`] /// /// you can also use single underscore `_` to let coprocessor infer what type it is, so `_` and `_ | None` are both valid in type annotation.
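/// For example, a concrete annotation and an inferred nullable one can be mixed; a sketch, not a test from this PR, and the `vector[...]` wrapper syntax is assumed from the coprocessor's conventions:
/// ```ignore
/// @copr(args=["cpu"], returns=["doubled", "maybe_null"])
/// def t(cpu) -> (vector[f64], vector[_ | None]):
///     return cpu * 2.0, cpu
/// ```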
/// Note: using `_` means not nullable column, using `_ | None` means nullable column @@ -373,7 +315,7 @@ fn set_items_in_scope( /// You can return constant in python code like `return 1, 1.0, True` /// which create a constant array(with same value)(currently support int, float and bool) as column on return #[cfg(test)] -pub fn exec_coprocessor(script: &str, rb: &DfRecordBatch) -> Result { +pub fn exec_coprocessor(script: &str, rb: &RecordBatch) -> Result { // 1. parse the script and check if it's only a function with `@coprocessor` decorator, and get `args` and `returns`, // 2. also check for exist of `args` in `rb`, if not found, return error // TODO(discord9): cache the result of parse_copr @@ -383,7 +325,7 @@ pub fn exec_coprocessor(script: &str, rb: &DfRecordBatch) -> Result pub(crate) fn exec_with_cached_vm( copr: &Coprocessor, - rb: &DfRecordBatch, + rb: &RecordBatch, args: Vec, vm: &Arc, ) -> Result { @@ -401,7 +343,7 @@ pub(crate) fn exec_with_cached_vm( // 5. get returns as either a PyVector or a PyTuple, and naming schema them according to `returns` let col_len = rb.num_rows(); - let mut cols: Vec = try_into_columns(&ret, vm, col_len)?; + let mut cols = try_into_columns(&ret, vm, col_len)?; ensure!( cols.len() == copr.deco_args.ret_names.len(), OtherSnafu { @@ -417,11 +359,7 @@ pub(crate) fn exec_with_cached_vm( copr.check_and_cast_type(&mut cols)?; // 6. return a assembled DfRecordBatch let schema = copr.gen_schema(&cols)?; - let res_rb = DfRecordBatch::try_new(schema.clone(), cols).context(ArrowSnafu)?; - Ok(RecordBatch { - schema: Arc::new(Schema::try_from(schema).context(TypeCastSnafu)?), - df_recordbatch: res_rb, - }) + RecordBatch::new(schema, cols).context(NewRecordBatchSnafu) }) } @@ -459,7 +397,7 @@ pub(crate) fn init_interpreter() -> Arc { } /// using a parsed `Coprocessor` struct as input to execute python code -pub(crate) fn exec_parsed(copr: &Coprocessor, rb: &DfRecordBatch) -> Result { +pub(crate) fn exec_parsed(copr: &Coprocessor, rb: &RecordBatch) -> Result { // 3. 
get args from `rb`, and cast them into PyVector let args: Vec = select_from_rb(rb, &copr.deco_args.arg_names)?; check_args_anno_real_type(&args, copr, rb)?; @@ -477,7 +415,7 @@ pub(crate) fn exec_parsed(copr: &Coprocessor, rb: &DfRecordBatch) -> Result StdResult { diff --git a/src/script/src/python/engine.rs b/src/script/src/python/engine.rs index 7ad5390f7b1b..848bf71d8b12 100644 --- a/src/script/src/python/engine.rs +++ b/src/script/src/python/engine.rs @@ -59,7 +59,7 @@ impl Stream for CoprStream { match Pin::new(&mut self.stream).poll_next(cx) { Poll::Pending => Poll::Pending, Poll::Ready(Some(Ok(recordbatch))) => { - let batch = exec_parsed(&self.copr, &recordbatch.df_recordbatch) + let batch = exec_parsed(&self.copr, &recordbatch) .map_err(BoxedError::new) .context(ExternalSnafu)?; @@ -149,8 +149,8 @@ mod tests { use catalog::{CatalogList, CatalogProvider, SchemaProvider}; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use common_recordbatch::util; - use datafusion_common::field_util::{FieldExt, SchemaExt}; - use datatypes::arrow::array::{Float64Array, Int64Array}; + use datatypes::prelude::ScalarVector; + use datatypes::vectors::{Float64Vector, Int64Vector}; use query::QueryEngineFactory; use table::table::numbers::NumbersTable; @@ -177,6 +177,7 @@ mod tests { let script_engine = PyEngine::new(query_engine.clone()); + // To avoid divide by zero, the script divides `add(a, b)` by `g.sqrt(c + 1)` instead of `g.sqrt(c)` let script = r#" import greptime as g def add(a, b): @@ -184,7 +185,7 @@ def add(a, b): @copr(args=["a", "b", "c"], returns = ["r"], sql="select number as a,number as b,number as c from numbers limit 100") def test(a, b, c): - return add(a, b) / g.sqrt(c) + return add(a, b) / g.sqrt(c + 1) "#; let script = script_engine .compile(script, CompileContext::default()) @@ -197,15 +198,18 @@ def test(a, b, c): assert_eq!(1, numbers.len()); let number = &numbers[0]; - assert_eq!(number.df_recordbatch.num_columns(), 1); - assert_eq!("r", number.schema.arrow_schema().field(0).name()); - - let columns = number.df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(100, columns[0].len()); - let rows = columns[0].as_any().downcast_ref::().unwrap(); - assert!(rows.value(0).is_nan()); - assert_eq!((99f64 + 99f64) / 99f64.sqrt(), rows.value(99)) + assert_eq!(number.num_columns(), 1); + assert_eq!("r", number.schema.column_schemas()[0].name); + + assert_eq!(1, number.num_columns()); + assert_eq!(100, number.column(0).len()); + let rows = number + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(0f64, rows.get_data(0).unwrap()); + assert_eq!((99f64 + 99f64) / 100f64.sqrt(), rows.get_data(99).unwrap()) } _ => unreachable!(), } @@ -229,15 +233,18 @@ def test(a): assert_eq!(1, numbers.len()); let number = &numbers[0]; - assert_eq!(number.df_recordbatch.num_columns(), 1); - assert_eq!("r", number.schema.arrow_schema().field(0).name()); - - let columns = number.df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(50, columns[0].len()); - let rows = columns[0].as_any().downcast_ref::().unwrap(); - assert_eq!(0, rows.value(0)); - assert_eq!(98, rows.value(49)) + assert_eq!(number.num_columns(), 1); + assert_eq!("r", number.schema.column_schemas()[0].name); + + assert_eq!(1, number.num_columns()); + assert_eq!(50, number.column(0).len()); + let rows = number + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(0, rows.get_data(0).unwrap()); + assert_eq!(98, rows.get_data(49).unwrap()) } _ => 
unreachable!(), } diff --git a/src/script/src/python/error.rs b/src/script/src/python/error.rs index 9a77984149cf..6e20e86db004 100644 --- a/src/script/src/python/error.rs +++ b/src/script/src/python/error.rs @@ -105,6 +105,12 @@ pub enum Error { #[snafu(backtrace)] source: common_recordbatch::error::Error, }, + + #[snafu(display("Failed to create record batch, source: {}", source))] + NewRecordBatch { + #[snafu(backtrace)] + source: common_recordbatch::error::Error, + }, } impl From for Error { @@ -121,7 +127,9 @@ impl ErrorExt for Error { | Error::PyRuntime { .. } | Error::Other { .. } => StatusCode::Internal, - Error::RecordBatch { source } => source.status_code(), + Error::RecordBatch { source } | Error::NewRecordBatch { source } => { + source.status_code() + } Error::DatabaseQuery { source } => source.status_code(), Error::TypeCast { source } => source.status_code(), diff --git a/src/script/src/python/test.rs b/src/script/src/python/test.rs index 4c0bcdcd25f3..49b511c10137 100644 --- a/src/script/src/python/test.rs +++ b/src/script/src/python/test.rs @@ -12,19 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. -#![allow(clippy::print_stdout, clippy::print_stderr)] -// for debug purpose, also this is already a -// test module so allow print_stdout shouldn't be a problem? use std::fs::File; use std::io::prelude::*; use std::path::Path; use std::sync::Arc; +use common_recordbatch::RecordBatch; use common_telemetry::{error, info}; use console::style; -use datafusion_common::record_batch::RecordBatch as DfRecordBatch; -use datatypes::arrow::array::PrimitiveArray; -use datatypes::arrow::datatypes::{DataType, Field, Schema}; +use datatypes::arrow::datatypes::DataType as ArrowDataType; +use datatypes::data_type::{ConcreteDataType, DataType}; +use datatypes::schema::{ColumnSchema, Schema}; +use datatypes::vectors::{Float32Vector, Float64Vector, Int64Vector, VectorRef}; use ron::from_str as from_ron_string; use rustpython_parser::parser; use serde::{Deserialize, Serialize}; @@ -63,19 +62,26 @@ enum Predicate { #[derive(Serialize, Deserialize, Debug)] struct ColumnInfo { - pub ty: DataType, + pub ty: ArrowDataType, pub len: usize, } -fn create_sample_recordbatch() -> DfRecordBatch { - let cpu_array = PrimitiveArray::from_slice([0.9f32, 0.8, 0.7, 0.6]); - let mem_array = PrimitiveArray::from_slice([0.1f64, 0.2, 0.3, 0.4]); - let schema = Arc::new(Schema::from(vec![ - Field::new("cpu", DataType::Float32, false), - Field::new("mem", DataType::Float64, false), +fn create_sample_recordbatch() -> RecordBatch { + let cpu_array = Float32Vector::from_slice([0.9f32, 0.8, 0.7, 0.6]); + let mem_array = Float64Vector::from_slice([0.1f64, 0.2, 0.3, 0.4]); + let schema = Arc::new(Schema::new(vec![ + ColumnSchema::new("cpu", ConcreteDataType::float32_datatype(), false), + ColumnSchema::new("mem", ConcreteDataType::float64_datatype(), false), ])); - DfRecordBatch::try_new(schema, vec![Arc::new(cpu_array), Arc::new(mem_array)]).unwrap() + RecordBatch::new( + schema, + [ + Arc::new(cpu_array) as VectorRef, + Arc::new(mem_array) as VectorRef, + ], + ) + .unwrap() } /// test cases which read from a .ron file, deser, @@ -120,37 +126,27 @@ fn run_ron_testcases() { } Predicate::ExecIsOk { fields, columns } => { let rb = create_sample_recordbatch(); - let res = coprocessor::exec_coprocessor(&testcase.code, &rb); - if res.is_err() { - dbg!(&res); - } - assert!(res.is_ok()); - let res = res.unwrap(); + let res = coprocessor::exec_coprocessor(&testcase.code, 
&rb).unwrap(); fields .iter() - .zip(&res.schema.arrow_schema().fields) - .map(|(anno, real)| { + .zip(res.schema.column_schemas()) + .for_each(|(anno, real)| { assert!( - anno.datatype.clone().unwrap() == real.data_type - && anno.is_nullable == real.is_nullable, + anno.datatype.as_ref().unwrap() == &real.data_type.as_arrow_type() + && anno.is_nullable == real.is_nullable(), "Fields expected to be {anno:#?}, actual {real:#?}" ); - }) - .count(); - columns - .iter() - .zip(res.df_recordbatch.columns()) - .map(|(anno, real)| { - assert!( - &anno.ty == real.data_type() && anno.len == real.len(), - "Type or length not match! Expect [{:#?}; {}], actual [{:#?}; {}]", - anno.ty, - anno.len, - real.data_type(), - real.len() - ); - }) - .count(); + }); + columns.iter().zip(res.columns()).for_each(|(anno, real)| { + assert!( + anno.ty == real.data_type().as_arrow_type() && anno.len == real.len(), + "Type or length not match! Expect [{:#?}; {}], actual [{:#?}; {}]", + anno.ty, + anno.len, + real.data_type(), + real.len() + ); + }); } Predicate::ExecIsErr { reason: part_reason, @@ -229,7 +225,7 @@ def calc_rvs(open_time, close): rv_180d = vector([calc_rv(close, open_time, timepoint, datetime("180d"))]) return rv_7d, rv_15d, rv_30d, rv_60d, rv_90d, rv_180d "#; - let close_array = PrimitiveArray::from_slice([ + let close_array = Float32Vector::from_slice([ 10106.79f32, 10106.09, 10108.73, @@ -242,17 +238,20 @@ def calc_rvs(open_time, close): 10117.08, 10120.43, ]); - let open_time_array = PrimitiveArray::from_slice([ + let open_time_array = Int64Vector::from_slice([ 300i64, 900i64, 1200i64, 1800i64, 2400i64, 3000i64, 3600i64, 4200i64, 4800i64, 5400i64, 6000i64, ]); - let schema = Arc::new(Schema::from(vec![ - Field::new("close", DataType::Float32, false), - Field::new("open_time", DataType::Int64, false), + let schema = Arc::new(Schema::new(vec![ + ColumnSchema::new("close", ConcreteDataType::float32_datatype(), false), + ColumnSchema::new("open_time", ConcreteDataType::int64_datatype(), false), ])); - let rb = DfRecordBatch::try_new( + let rb = RecordBatch::new( schema, - vec![Arc::new(close_array), Arc::new(open_time_array)], + [ + Arc::new(close_array) as VectorRef, + Arc::new(open_time_array) as VectorRef, + ], ) .unwrap(); let ret = coprocessor::exec_coprocessor(python_source, &rb); @@ -291,14 +290,20 @@ def a(cpu, mem): ref = log2(fed/prev(fed)) return (0.5 < cpu) & ~( cpu >= 0.75) "#; - let cpu_array = PrimitiveArray::from_slice([0.9f32, 0.8, 0.7, 0.3]); - let mem_array = PrimitiveArray::from_slice([0.1f64, 0.2, 0.3, 0.4]); - let schema = Arc::new(Schema::from(vec![ - Field::new("cpu", DataType::Float32, false), - Field::new("mem", DataType::Float64, false), + let cpu_array = Float32Vector::from_slice([0.9f32, 0.8, 0.7, 0.3]); + let mem_array = Float64Vector::from_slice([0.1f64, 0.2, 0.3, 0.4]); + let schema = Arc::new(Schema::new(vec![ + ColumnSchema::new("cpu", ConcreteDataType::float32_datatype(), false), + ColumnSchema::new("mem", ConcreteDataType::float64_datatype(), false), ])); - let rb = - DfRecordBatch::try_new(schema, vec![Arc::new(cpu_array), Arc::new(mem_array)]).unwrap(); + let rb = RecordBatch::new( + schema, + [ + Arc::new(cpu_array) as VectorRef, + Arc::new(mem_array) as VectorRef, + ], + ) + .unwrap(); let ret = coprocessor::exec_coprocessor(python_source, &rb); if let Err(Error::PyParse { backtrace: _, diff --git a/src/script/src/python/utils.rs b/src/script/src/python/utils.rs index fcc0bf39565c..8f078c163cbf 100644 --- a/src/script/src/python/utils.rs +++ 
b/src/script/src/python/utils.rs @@ -14,10 +14,12 @@ use std::sync::Arc; -use datafusion::arrow::array::{ArrayRef, BooleanArray, NullArray, PrimitiveArray, Utf8Array}; use datafusion_common::ScalarValue; use datafusion_expr::ColumnarValue as DFColValue; -use datatypes::arrow::datatypes::DataType; +use datatypes::prelude::ScalarVector; +use datatypes::vectors::{ + BooleanVector, Float64Vector, Helper, Int64Vector, NullVector, StringVector, VectorRef, +}; use rustpython_vm::builtins::{PyBaseExceptionRef, PyBool, PyFloat, PyInt, PyList, PyStr}; use rustpython_vm::{PyObjectRef, PyPayload, PyRef, VirtualMachine}; use snafu::{Backtrace, GenerateImplicitData, OptionExt, ResultExt}; @@ -54,26 +56,26 @@ pub fn py_vec_obj_to_array( obj: &PyObjectRef, vm: &VirtualMachine, col_len: usize, -) -> Result { +) -> Result { // It's ugly, but we can't find a better way right now. if is_instance::(obj, vm) { let pyv = obj.payload::().with_context(|| { ret_other_error_with(format!("can't cast obj {:?} to PyVector", obj)) })?; - Ok(pyv.to_arrow_array()) + Ok(pyv.as_vector_ref()) } else if is_instance::(obj, vm) { let val = obj .to_owned() .try_into_value::(vm) .map_err(|e| format_py_error(e, vm))?; - let ret = PrimitiveArray::from_vec(vec![val; col_len]); + let ret = Int64Vector::from_iterator(std::iter::repeat(val).take(col_len)); Ok(Arc::new(ret) as _) } else if is_instance::(obj, vm) { let val = obj .to_owned() .try_into_value::(vm) .map_err(|e| format_py_error(e, vm))?; - let ret = PrimitiveArray::from_vec(vec![val; col_len]); + let ret = Float64Vector::from_iterator(std::iter::repeat(val).take(col_len)); Ok(Arc::new(ret) as _) } else if is_instance::(obj, vm) { let val = obj @@ -81,7 +83,7 @@ pub fn py_vec_obj_to_array( .try_into_value::(vm) .map_err(|e| format_py_error(e, vm))?; - let ret = BooleanArray::from_iter(std::iter::repeat(Some(val)).take(col_len)); + let ret = BooleanVector::from_iterator(std::iter::repeat(val).take(col_len)); Ok(Arc::new(ret) as _) } else if is_instance::(obj, vm) { let val = obj @@ -89,7 +91,7 @@ pub fn py_vec_obj_to_array( .try_into_value::(vm) .map_err(|e| format_py_error(e, vm))?; - let ret = Utf8Array::::from_iter(std::iter::repeat(Some(val)).take(col_len)); + let ret = StringVector::from_iterator(std::iter::repeat(val.as_str()).take(col_len)); Ok(Arc::new(ret) as _) } else if is_instance::(obj, vm) { let columnar_value = @@ -101,9 +103,9 @@ pub fn py_vec_obj_to_array( let array = ScalarValue::iter_to_array(scalars.into_iter()) .context(error::DataFusionSnafu)?; - Ok(array) + Helper::try_into_vector(array).context(error::TypeCastSnafu) } - None => Ok(Arc::new(NullArray::new(DataType::Null, 0))), + None => Ok(Arc::new(NullVector::new(0))), }, _ => unreachable!(), } diff --git a/src/script/src/python/vector.rs b/src/script/src/python/vector.rs index 448df3e62eed..47fae45ed13f 100644 --- a/src/script/src/python/vector.rs +++ b/src/script/src/python/vector.rs @@ -19,17 +19,17 @@ use std::sync::Arc; use common_time::date::Date; use common_time::datetime::DateTime; use common_time::timestamp::Timestamp; -use datatypes::arrow::array::{Array, ArrayRef, BooleanArray, PrimitiveArray}; +use datatypes::arrow::array::{ + Array, ArrayRef, BooleanArray, Float64Array, Int64Array, UInt64Array, +}; use datatypes::arrow::compute; -use datatypes::arrow::compute::cast::{self, CastOptions}; -use datatypes::arrow::compute::{arithmetics, comparison}; -use datatypes::arrow::datatypes::DataType; -use datatypes::arrow::scalar::{PrimitiveScalar, Scalar}; -use datatypes::data_type::ConcreteDataType; 
+use datatypes::arrow::compute::kernels::{arithmetic, boolean, comparison}; +use datatypes::arrow::datatypes::DataType as ArrowDataType; +use datatypes::arrow::error::Result as ArrowResult; +use datatypes::data_type::{ConcreteDataType, DataType}; use datatypes::prelude::Value; -use datatypes::value::OrderedFloat; -use datatypes::vectors::{Helper, NullVector, VectorBuilder, VectorRef}; -use datatypes::{arrow, value}; +use datatypes::value::{self, OrderedFloat}; +use datatypes::vectors::{Helper, NullVector, VectorRef}; use rustpython_vm::builtins::{PyBaseExceptionRef, PyBool, PyBytes, PyFloat, PyInt, PyNone, PyStr}; use rustpython_vm::function::{Either, OptionalArg, PyComparisonValue}; use rustpython_vm::protocol::{PyMappingMethods, PySequenceMethods}; @@ -55,120 +55,71 @@ impl From for PyVector { fn emit_cast_error( vm: &VirtualMachine, - src_ty: &DataType, - dst_ty: &DataType, + src_ty: &ArrowDataType, + dst_ty: &ArrowDataType, ) -> PyBaseExceptionRef { vm.new_type_error(format!( "Can't cast source operand of type {:?} into target type of {:?}", src_ty, dst_ty )) } -fn arrow2_rsub_scalar( - arr: &dyn Array, - val: &dyn Scalar, - _vm: &VirtualMachine, -) -> PyResult> { - // b - a => a * (-1) + b - let neg = arithmetics::mul_scalar(arr, &PrimitiveScalar::new(DataType::Int64, Some(-1i64))); - Ok(arithmetics::add_scalar(neg.as_ref(), val)) + +/// Performs `val - arr`. +fn arrow_rsub(arr: &dyn Array, val: &dyn Array, vm: &VirtualMachine) -> PyResult { + arithmetic::subtract_dyn(val, arr).map_err(|e| vm.new_type_error(format!("rsub error: {}", e))) } -fn arrow2_rtruediv_scalar( - arr: &dyn Array, - val: &dyn Scalar, - vm: &VirtualMachine, -) -> PyResult> { - // val / arr => one_arr / arr * val (this is simpler to write) - let one_arr: Box = if is_float(arr.data_type()) { - Box::new(PrimitiveArray::from_values(vec![1f64; arr.len()])) - } else if is_integer(arr.data_type()) { - Box::new(PrimitiveArray::from_values(vec![1i64; arr.len()])) - } else { - return Err(vm.new_not_implemented_error(format!( - "truediv of {:?} Scalar with {:?} Array is not supported", - val.data_type(), - arr.data_type() - ))); - }; - let tmp = arithmetics::mul_scalar(one_arr.as_ref(), val); - Ok(arithmetics::div(tmp.as_ref(), arr)) +/// Performs `val / arr` +fn arrow_rtruediv(arr: &dyn Array, val: &dyn Array, vm: &VirtualMachine) -> PyResult { + arithmetic::divide_dyn(val, arr) + .map_err(|e| vm.new_type_error(format!("rtruediv error: {}", e))) } -fn arrow2_rfloordiv_scalar( - arr: &dyn Array, - val: &dyn Scalar, - vm: &VirtualMachine, -) -> PyResult> { - // val // arr => one_arr // arr * val (this is simpler to write) - let one_arr: Box = if is_float(arr.data_type()) { - Box::new(PrimitiveArray::from_values(vec![1f64; arr.len()])) - } else if is_integer(arr.data_type()) { - Box::new(PrimitiveArray::from_values(vec![1i64; arr.len()])) - } else { - return Err(vm.new_not_implemented_error(format!( - "truediv of {:?} Scalar with {:?} Array is not supported", - val.data_type(), - arr.data_type() - ))); - }; - let tmp = arithmetics::mul_scalar(one_arr.as_ref(), val); - - Ok(arrow::compute::cast::cast( - arithmetics::div(tmp.as_ref(), arr).as_ref(), - &DataType::Int64, - cast::CastOptions { - wrapped: false, - partial: true, - }, - ) - .unwrap()) +/// Performs `val / arr`, but cast to i64. 
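Since the arrow-rs `*_dyn` kernels take two arrays rather than an array plus a scalar, these reflected helpers broadcast the scalar operand to a constant array first. A minimal sketch of the pattern (`rdiv_scalar` is an illustrative name, not part of this module):

```rust
use arrow::array::{Array, ArrayRef, Float64Array};
use arrow::compute::kernels::arithmetic;
use arrow::error::Result as ArrowResult;

// val / arr, i.e. the reflected operand order used by __rtruediv__.
fn rdiv_scalar(arr: &ArrayRef, val: f64) -> ArrowResult<ArrayRef> {
    // Broadcast the scalar to a constant array of matching length, then
    // call the type-erased kernel.
    let val = Float64Array::from_value(val, arr.len());
    arithmetic::divide_dyn(&val, arr.as_ref())
}
```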
+fn arrow_rfloordiv(arr: &dyn Array, val: &dyn Array, vm: &VirtualMachine) -> PyResult { + let array = arithmetic::divide_dyn(val, arr) + .map_err(|e| vm.new_type_error(format!("rfloordiv divide error: {}", e)))?; + compute::cast(&array, &ArrowDataType::Int64) + .map_err(|e| vm.new_type_error(format!("rfloordiv cast error: {}", e))) } -fn wrap_result( - f: F, -) -> impl Fn(&dyn Array, &dyn Scalar, &VirtualMachine) -> PyResult> +fn wrap_result(f: F) -> impl Fn(&dyn Array, &dyn Array, &VirtualMachine) -> PyResult where - F: Fn(&dyn Array, &dyn Scalar) -> Box, + F: Fn(&dyn Array, &dyn Array) -> ArrowResult, { - move |left, right, _vm| Ok(f(left, right)) + move |left, right, vm| { + f(left, right).map_err(|e| vm.new_type_error(format!("arithmetic error {}", e))) + } } -fn is_float(datatype: &DataType) -> bool { +fn is_float(datatype: &ArrowDataType) -> bool { matches!( datatype, - DataType::Float16 | DataType::Float32 | DataType::Float64 + ArrowDataType::Float16 | ArrowDataType::Float32 | ArrowDataType::Float64 ) } -fn is_integer(datatype: &DataType) -> bool { - is_signed(datatype) || is_unsigned(datatype) -} - -fn is_signed(datatype: &DataType) -> bool { +fn is_signed(datatype: &ArrowDataType) -> bool { matches!( datatype, - DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 + ArrowDataType::Int8 | ArrowDataType::Int16 | ArrowDataType::Int32 | ArrowDataType::Int64 ) } -fn is_unsigned(datatype: &DataType) -> bool { +fn is_unsigned(datatype: &ArrowDataType) -> bool { matches!( datatype, - DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64 + ArrowDataType::UInt8 + | ArrowDataType::UInt16 + | ArrowDataType::UInt32 + | ArrowDataType::UInt64 ) } -fn cast(array: ArrayRef, target_type: &DataType, vm: &VirtualMachine) -> PyResult> { - cast::cast( - array.as_ref(), - target_type, - CastOptions { - wrapped: true, - partial: true, - }, - ) - .map_err(|e| vm.new_type_error(e.to_string())) +fn cast(array: ArrayRef, target_type: &ArrowDataType, vm: &VirtualMachine) -> PyResult { + compute::cast(&array, target_type).map_err(|e| vm.new_type_error(e.to_string())) } + fn from_debug_error(err: impl std::fmt::Debug, vm: &VirtualMachine) -> PyBaseExceptionRef { vm.new_runtime_error(format!("Runtime Error: {err:#?}")) } @@ -194,7 +145,7 @@ impl PyVector { } let datatype = get_concrete_type(&elements[0], vm)?; - let mut buf = VectorBuilder::with_capacity(datatype.clone(), elements.len()); + let mut buf = datatype.create_mutable_vector(elements.len()); for obj in elements.drain(..) { let val = if let Some(v) = @@ -207,11 +158,12 @@ impl PyVector { obj, datatype ))); }; - buf.push(&val); + // Safety: `pyobj_try_to_typed_val()` has checked the data type. + buf.push_value_ref(val.as_value_ref()).unwrap(); } Ok(PyVector { - vector: buf.finish(), + vector: buf.to_vector(), }) } else { Ok(PyVector::default()) @@ -232,23 +184,26 @@ impl PyVector { fn scalar_arith_op( &self, other: PyObjectRef, - target_type: Option, + target_type: Option, op: F, vm: &VirtualMachine, ) -> PyResult where - F: Fn(&dyn Array, &dyn Scalar, &VirtualMachine) -> PyResult>, + F: Fn(&dyn Array, &dyn Array, &VirtualMachine) -> PyResult, { // the right operand only support PyInt or PyFloat, let (right, right_type) = { if is_instance::(&other, vm) { other .try_into_value::(vm) .map(|v| (value::Value::Int64(v), ArrowDataType::Int64))?
} else if is_instance::(&other, vm) { - other - .try_into_value::(vm) - .map(|v| (value::Value::Float64(OrderedFloat(v)), DataType::Float64))? + other.try_into_value::(vm).map(|v| { + ( + value::Value::Float64(OrderedFloat(v)), + ArrowDataType::Float64, + ) + })? } else { return Err(vm.new_type_error(format!( "Can't cast right operand into Scalar of Int or Float, actual: {}", @@ -264,45 +219,38 @@ impl PyVector { // TODO(discord9): found better way to cast between signed and unsigned type let target_type = target_type.unwrap_or_else(|| { if is_signed(left_type) && is_signed(right_type) { - DataType::Int64 + ArrowDataType::Int64 } else if is_unsigned(left_type) && is_unsigned(right_type) { - DataType::UInt64 + ArrowDataType::UInt64 } else { - DataType::Float64 + ArrowDataType::Float64 } }); let left = cast(left, &target_type, vm)?; - let right: Box = if is_float(&target_type) { + let left_len = left.len(); + + // Convert `right` to an array of `target_type`. + let right: Box = if is_float(&target_type) { match right { - value::Value::Int64(v) => { - Box::new(PrimitiveScalar::new(target_type, Some(v as f64))) - } - value::Value::UInt64(v) => { - Box::new(PrimitiveScalar::new(target_type, Some(v as f64))) - } + value::Value::Int64(v) => Box::new(Float64Array::from_value(v as f64, left_len)), + value::Value::UInt64(v) => Box::new(Float64Array::from_value(v as f64, left_len)), value::Value::Float64(v) => { - Box::new(PrimitiveScalar::new(target_type, Some(f64::from(v)))) + Box::new(Float64Array::from_value(f64::from(v), left_len)) } _ => unreachable!(), } } else if is_signed(&target_type) { match right { - value::Value::Int64(v) => Box::new(PrimitiveScalar::new(target_type, Some(v))), - value::Value::UInt64(v) => { - Box::new(PrimitiveScalar::new(target_type, Some(v as i64))) - } - value::Value::Float64(v) => { - Box::new(PrimitiveScalar::new(DataType::Float64, Some(v.0 as i64))) - } + value::Value::Int64(v) => Box::new(Int64Array::from_value(v, left_len)), + value::Value::UInt64(v) => Box::new(Int64Array::from_value(v as i64, left_len)), + value::Value::Float64(v) => Box::new(Int64Array::from_value(v.0 as i64, left_len)), _ => unreachable!(), } } else if is_unsigned(&target_type) { match right { - value::Value::Int64(v) => Box::new(PrimitiveScalar::new(target_type, Some(v))), - value::Value::UInt64(v) => Box::new(PrimitiveScalar::new(target_type, Some(v))), - value::Value::Float64(v) => { - Box::new(PrimitiveScalar::new(target_type, Some(f64::from(v)))) - } + value::Value::Int64(v) => Box::new(UInt64Array::from_value(v as u64, left_len)), + value::Value::UInt64(v) => Box::new(UInt64Array::from_value(v, left_len)), + value::Value::Float64(v) => Box::new(UInt64Array::from_value(v.0 as u64, left_len)), _ => unreachable!(), } } else { @@ -311,7 +259,7 @@ impl PyVector { let result = op(left.as_ref(), right.as_ref(), vm)?; - Ok(Helper::try_into_vector(&*result) + Ok(Helper::try_into_vector(result.clone()) .map_err(|e| { vm.new_type_error(format!( "Can't cast result into vector, result: {:?}, err: {:?}", @@ -324,12 +272,12 @@ impl PyVector { fn arith_op( &self, other: PyObjectRef, - target_type: Option, + target_type: Option, op: F, vm: &VirtualMachine, ) -> PyResult where - F: Fn(&dyn Array, &dyn Array) -> Box, + F: Fn(&dyn Array, &dyn Array) -> ArrowResult, { let right = other.downcast_ref::().ok_or_else(|| { vm.new_type_error(format!( @@ -345,20 +293,21 @@ impl PyVector { let target_type = target_type.unwrap_or_else(|| { if is_signed(left_type) && is_signed(right_type) { - DataType::Int64 + 
ArrowDataType::Int64 } else if is_unsigned(left_type) && is_unsigned(right_type) { - DataType::UInt64 + ArrowDataType::UInt64 } else { - DataType::Float64 + ArrowDataType::Float64 } }); let left = cast(left, &target_type, vm)?; let right = cast(right, &target_type, vm)?; - let result = op(left.as_ref(), right.as_ref()); + let result = op(left.as_ref(), right.as_ref()) + .map_err(|e| vm.new_type_error(format!("Can't compute op, error: {}", e)))?; - Ok(Helper::try_into_vector(&*result) + Ok(Helper::try_into_vector(result.clone()) .map_err(|e| { vm.new_type_error(format!( "Can't cast result into vector, result: {:?}, err: {:?}", @@ -372,27 +321,27 @@ impl PyVector { #[pymethod(magic)] fn add(&self, other: PyObjectRef, vm: &VirtualMachine) -> PyResult { if is_pyobj_scalar(&other, vm) { - self.scalar_arith_op(other, None, wrap_result(arithmetics::add_scalar), vm) + self.scalar_arith_op(other, None, wrap_result(arithmetic::add_dyn), vm) } else { - self.arith_op(other, None, arithmetics::add, vm) + self.arith_op(other, None, arithmetic::add_dyn, vm) } } #[pymethod(magic)] fn sub(&self, other: PyObjectRef, vm: &VirtualMachine) -> PyResult { if is_pyobj_scalar(&other, vm) { - self.scalar_arith_op(other, None, wrap_result(arithmetics::sub_scalar), vm) + self.scalar_arith_op(other, None, wrap_result(arithmetic::subtract_dyn), vm) } else { - self.arith_op(other, None, arithmetics::sub, vm) + self.arith_op(other, None, arithmetic::subtract_dyn, vm) } } #[pymethod(magic)] fn rsub(&self, other: PyObjectRef, vm: &VirtualMachine) -> PyResult { if is_pyobj_scalar(&other, vm) { - self.scalar_arith_op(other, None, arrow2_rsub_scalar, vm) + self.scalar_arith_op(other, None, arrow_rsub, vm) } else { - self.arith_op(other, None, |a, b| arithmetics::sub(b, a), vm) + self.arith_op(other, None, |a, b| arithmetic::subtract_dyn(b, a), vm) } } @@ -400,9 +349,9 @@ impl PyVector { #[pymethod(magic)] fn mul(&self, other: PyObjectRef, vm: &VirtualMachine) -> PyResult { if is_pyobj_scalar(&other, vm) { - self.scalar_arith_op(other, None, wrap_result(arithmetics::mul_scalar), vm) + self.scalar_arith_op(other, None, wrap_result(arithmetic::multiply_dyn), vm) } else { - self.arith_op(other, None, arithmetics::mul, vm) + self.arith_op(other, None, arithmetic::multiply_dyn, vm) } } @@ -411,24 +360,29 @@ impl PyVector { if is_pyobj_scalar(&other, vm) { self.scalar_arith_op( other, - Some(DataType::Float64), - wrap_result(arithmetics::div_scalar), + Some(ArrowDataType::Float64), + wrap_result(arithmetic::divide_dyn), vm, ) } else { - self.arith_op(other, Some(DataType::Float64), arithmetics::div, vm) + self.arith_op( + other, + Some(ArrowDataType::Float64), + arithmetic::divide_dyn, + vm, + ) } } #[pymethod(magic)] fn rtruediv(&self, other: PyObjectRef, vm: &VirtualMachine) -> PyResult { if is_pyobj_scalar(&other, vm) { - self.scalar_arith_op(other, Some(DataType::Float64), arrow2_rtruediv_scalar, vm) + self.scalar_arith_op(other, Some(ArrowDataType::Float64), arrow_rtruediv, vm) } else { self.arith_op( other, - Some(DataType::Float64), - |a, b| arithmetics::div(b, a), + Some(ArrowDataType::Float64), + |a, b| arithmetic::divide_dyn(b, a), vm, ) } @@ -439,12 +393,17 @@ impl PyVector { if is_pyobj_scalar(&other, vm) { self.scalar_arith_op( other, - Some(DataType::Int64), - wrap_result(arithmetics::div_scalar), + Some(ArrowDataType::Int64), + wrap_result(arithmetic::divide_dyn), vm, ) } else { - self.arith_op(other, Some(DataType::Int64), arithmetics::div, vm) + self.arith_op( + other, + Some(ArrowDataType::Int64), + 
arithmetic::divide_dyn, + vm, + ) } } @@ -452,12 +411,12 @@ impl PyVector { fn rfloordiv(&self, other: PyObjectRef, vm: &VirtualMachine) -> PyResult { if is_pyobj_scalar(&other, vm) { // FIXME: DataType convert problem, target_type should be inferred? - self.scalar_arith_op(other, Some(DataType::Int64), arrow2_rfloordiv_scalar, vm) + self.scalar_arith_op(other, Some(ArrowDataType::Int64), arrow_rfloordiv, vm) } else { self.arith_op( other, - Some(DataType::Int64), - |a, b| arithmetics::div(b, a), + Some(ArrowDataType::Int64), + |a, b| arithmetic::divide_dyn(b, a), vm, ) } @@ -533,9 +492,9 @@ impl PyVector { .as_any() .downcast_ref::() .ok_or_else(|| vm.new_type_error(format!("Can't cast {left:#?} as a Boolean Array")))?; - let res = compute::boolean::and(left, right).map_err(|err| from_debug_error(err, vm))?; + let res = boolean::and(left, right).map_err(|err| from_debug_error(err, vm))?; let res = Arc::new(res) as ArrayRef; - let ret = Helper::try_into_vector(&*res).map_err(|err| from_debug_error(err, vm))?; + let ret = Helper::try_into_vector(res.clone()).map_err(|err| from_debug_error(err, vm))?; Ok(ret.into()) } @@ -551,9 +510,9 @@ impl PyVector { .as_any() .downcast_ref::() .ok_or_else(|| vm.new_type_error(format!("Can't cast {left:#?} as a Boolean Array")))?; - let res = compute::boolean::or(left, right).map_err(|err| from_debug_error(err, vm))?; + let res = boolean::or(left, right).map_err(|err| from_debug_error(err, vm))?; let res = Arc::new(res) as ArrayRef; - let ret = Helper::try_into_vector(&*res).map_err(|err| from_debug_error(err, vm))?; + let ret = Helper::try_into_vector(res.clone()).map_err(|err| from_debug_error(err, vm))?; Ok(ret.into()) } @@ -565,9 +524,9 @@ impl PyVector { .as_any() .downcast_ref::() .ok_or_else(|| vm.new_type_error(format!("Can't cast {left:#?} as a Boolean Array")))?; - let res = compute::boolean::not(left); + let res = boolean::not(left).map_err(|err| from_debug_error(err, vm))?; let res = Arc::new(res) as ArrayRef; - let ret = Helper::try_into_vector(&*res).map_err(|err| from_debug_error(err, vm))?; + let ret = Helper::try_into_vector(res.clone()).map_err(|err| from_debug_error(err, vm))?; Ok(ret.into()) } @@ -580,15 +539,15 @@ impl PyVector { #[pymethod(name = "filter")] fn filter(&self, other: PyVectorRef, vm: &VirtualMachine) -> PyResult { let left = self.to_arrow_array(); - let right: ArrayRef = other.to_arrow_array(); + let right = other.to_arrow_array(); let filter = right.as_any().downcast_ref::(); match filter { Some(filter) => { - let res = compute::filter::filter(left.as_ref(), filter); + let res = compute::filter(left.as_ref(), filter); let res = res.map_err(|err| vm.new_runtime_error(format!("Arrow Error: {err:#?}")))?; - let ret = Helper::try_into_vector(&*res).map_err(|e| { + let ret = Helper::try_into_vector(res.clone()).map_err(|e| { vm.new_type_error(format!( "Can't cast result into vector, result: {:?}, err: {:?}", res, e @@ -618,14 +577,10 @@ impl PyVector { .ok_or_else(|| { vm.new_type_error(format!("Can't cast {seq:#?} as a Boolean Array")) })?; - // let left = self.to_arrow_array(); - let res = compute::filter::filter(self.to_arrow_array().as_ref(), mask) + let res = compute::filter(self.to_arrow_array().as_ref(), mask) .map_err(|err| vm.new_runtime_error(format!("Arrow Error: {err:#?}")))?; - let ret = Helper::try_into_vector(&*res).map_err(|e| { - vm.new_type_error(format!( - "Can't cast result into vector, result: {:?}, err: {:?}", - res, e - )) + let ret = Helper::try_into_vector(res.clone()).map_err(|e| { + 
vm.new_type_error(format!("Can't cast result into vector, err: {:?}", e)) })?; Ok(Self::from(ret).into_pyobject(vm)) } else { @@ -654,9 +609,9 @@ impl PyVector { let (mut range, step, slice_len) = slice.adjust_indices(self.len()); let vector = self.as_vector_ref(); - let mut buf = VectorBuilder::with_capacity(vector.data_type(), slice_len); + let mut buf = vector.data_type().create_mutable_vector(slice_len); if slice_len == 0 { - let v: PyVector = buf.finish().into(); + let v: PyVector = buf.to_vector().into(); Ok(v.into_pyobject(vm)) } else if step == 1 { let v: PyVector = vector.slice(range.next().unwrap_or(0), slice_len).into(); @@ -664,15 +619,17 @@ impl PyVector { } else if step.is_negative() { // Negative step require special treatment for i in range.rev().step_by(step.unsigned_abs()) { - buf.push(&vector.get(i)) + // Safety: This mutable vector is created from the vector's data type. + buf.push_value_ref(vector.get_ref(i)).unwrap(); } - let v: PyVector = buf.finish().into(); + let v: PyVector = buf.to_vector().into(); Ok(v.into_pyobject(vm)) } else { for i in range.step_by(step.unsigned_abs()) { - buf.push(&vector.get(i)) + // Safety: This mutable vector is created from the vector's data type. + buf.push_value_ref(vector.get_ref(i)).unwrap(); } - let v: PyVector = buf.finish().into(); + let v: PyVector = buf.to_vector().into(); Ok(v.into_pyobject(vm)) } } @@ -693,19 +650,19 @@ impl PyVector { /// get corresponding arrow op function according to given PyComaprsionOp /// /// TODO(discord9): impl scalar version function -fn get_arrow_op(op: PyComparisonOp) -> impl Fn(&dyn Array, &dyn Array) -> Box { +fn get_arrow_op(op: PyComparisonOp) -> impl Fn(&dyn Array, &dyn Array) -> ArrowResult { let op_bool_arr = match op { - PyComparisonOp::Eq => comparison::eq, - PyComparisonOp::Ne => comparison::neq, - PyComparisonOp::Gt => comparison::gt, - PyComparisonOp::Lt => comparison::lt, - PyComparisonOp::Ge => comparison::gt_eq, - PyComparisonOp::Le => comparison::lt_eq, + PyComparisonOp::Eq => comparison::eq_dyn, + PyComparisonOp::Ne => comparison::neq_dyn, + PyComparisonOp::Gt => comparison::gt_dyn, + PyComparisonOp::Lt => comparison::lt_dyn, + PyComparisonOp::Ge => comparison::gt_eq_dyn, + PyComparisonOp::Le => comparison::lt_eq_dyn, }; - move |a: &dyn Array, b: &dyn Array| -> Box { - let ret = op_bool_arr(a, b); - Box::new(ret) as _ + move |a: &dyn Array, b: &dyn Array| -> ArrowResult { + let array = op_bool_arr(a, b)?; + Ok(Arc::new(array)) } } @@ -714,19 +671,20 @@ fn get_arrow_op(op: PyComparisonOp) -> impl Fn(&dyn Array, &dyn Array) -> Box impl Fn(&dyn Array, &dyn Scalar, &VirtualMachine) -> PyResult> { +) -> impl Fn(&dyn Array, &dyn Array, &VirtualMachine) -> PyResult { let op_bool_arr = match op { - PyComparisonOp::Eq => comparison::eq_scalar, - PyComparisonOp::Ne => comparison::neq_scalar, - PyComparisonOp::Gt => comparison::gt_scalar, - PyComparisonOp::Lt => comparison::lt_scalar, - PyComparisonOp::Ge => comparison::gt_eq_scalar, - PyComparisonOp::Le => comparison::lt_eq_scalar, + PyComparisonOp::Eq => comparison::eq_dyn, + PyComparisonOp::Ne => comparison::neq_dyn, + PyComparisonOp::Gt => comparison::gt_dyn, + PyComparisonOp::Lt => comparison::lt_dyn, + PyComparisonOp::Ge => comparison::gt_eq_dyn, + PyComparisonOp::Le => comparison::lt_eq_dyn, }; - move |a: &dyn Array, b: &dyn Scalar, _vm| -> PyResult> { - let ret = op_bool_arr(a, b); - Ok(Box::new(ret) as _) + move |a: &dyn Array, b: &dyn Array, vm| -> PyResult { + let array = + op_bool_arr(a, b).map_err(|e| 
vm.new_type_error(format!("scalar op error: {}", e)))?; + Ok(Arc::new(array)) } } @@ -875,7 +833,7 @@ pub fn pyobj_try_to_typed_val( // FIXME(dennis): we always consider the timestamp unit is millis, it's not correct if user define timestamp column with other units. obj.try_into_value::(vm) .ok() - .map(Timestamp::from_millis) + .map(Timestamp::new_millisecond) .map(value::Value::Timestamp) } _ => unreachable!(), diff --git a/src/script/src/table.rs b/src/script/src/table.rs index abc0279a3f23..7c1570d8d1c9 100644 --- a/src/script/src/table.rs +++ b/src/script/src/table.rs @@ -21,12 +21,10 @@ use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, SCRIPTS_ use common_query::Output; use common_recordbatch::util as record_util; use common_telemetry::logging; -use common_time::timestamp::Timestamp; use common_time::util; -use datatypes::arrow::array::Utf8Array; use datatypes::prelude::{ConcreteDataType, ScalarVector}; use datatypes::schema::{ColumnSchema, Schema, SchemaBuilder}; -use datatypes::vectors::{StringVector, TimestampVector, VectorRef}; +use datatypes::vectors::{StringVector, TimestampMillisecondVector, Vector, VectorRef}; use query::QueryEngineRef; use session::context::QueryContext; use snafu::{ensure, OptionExt, ResultExt}; @@ -104,19 +102,16 @@ impl ScriptsTable { // Timestamp in key part is intentionally left to 0 columns_values.insert( "timestamp".to_string(), - Arc::new(TimestampVector::from_slice(&[Timestamp::from_millis(0)])) as _, + Arc::new(TimestampMillisecondVector::from_slice(&[0])) as _, ); + let now = util::current_time_millis(); columns_values.insert( "gmt_created".to_string(), - Arc::new(TimestampVector::from_slice(&[Timestamp::from_millis( - util::current_time_millis(), - )])) as _, + Arc::new(TimestampMillisecondVector::from_slice(&[now])) as _, ); columns_values.insert( "gmt_modified".to_string(), - Arc::new(TimestampVector::from_slice(&[Timestamp::from_millis( - util::current_time_millis(), - )])) as _, + Arc::new(TimestampMillisecondVector::from_slice(&[now])) as _, ); let table = self @@ -171,23 +166,21 @@ impl ScriptsTable { ensure!(!records.is_empty(), ScriptNotFoundSnafu { name }); assert_eq!(records.len(), 1); - assert_eq!(records[0].df_recordbatch.num_columns(), 1); + assert_eq!(records[0].num_columns(), 1); - let record = &records[0].df_recordbatch; - - let script_column = record - .column(0) + let script_column = records[0].column(0); + let script_column = script_column .as_any() - .downcast_ref::>() - .context(CastTypeSnafu { + .downcast_ref::() + .with_context(|| CastTypeSnafu { msg: format!( - "can't downcast {:?} array into utf8 array", - record.column(0).data_type() + "can't downcast {:?} array into string vector", + script_column.data_type() ), })?; assert_eq!(script_column.len(), 1); - Ok(script_column.value(0).to_string()) + Ok(script_column.get_data(0).unwrap().to_string()) } #[inline] @@ -216,18 +209,18 @@ fn build_scripts_schema() -> Schema { ), ColumnSchema::new( "timestamp".to_string(), - ConcreteDataType::timestamp_millis_datatype(), + ConcreteDataType::timestamp_millisecond_datatype(), false, ) .with_time_index(true), ColumnSchema::new( "gmt_created".to_string(), - ConcreteDataType::timestamp_millis_datatype(), + ConcreteDataType::timestamp_millisecond_datatype(), false, ), ColumnSchema::new( "gmt_modified".to_string(), - ConcreteDataType::timestamp_millis_datatype(), + ConcreteDataType::timestamp_millisecond_datatype(), false, ), ]; diff --git a/src/servers/src/http.rs b/src/servers/src/http.rs index 
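The script lookup now goes through the vector layer instead of downcasting to an arrow `Utf8Array`. A hedged sketch of that pattern, assuming the datatypes crate's `ScalarVector` API as it is used in this patch (`first_script` is an illustrative helper, not a crate function):

```rust
use std::sync::Arc;

use datatypes::prelude::ScalarVector;
use datatypes::vectors::{StringVector, VectorRef};

// Illustrative helper: pull the first non-null string out of a column.
fn first_script(column: &VectorRef) -> Option<String> {
    let strings = column.as_any().downcast_ref::<StringVector>()?;
    // get_data returns Option<&str>; None marks a NULL slot.
    strings.get_data(0).map(|s| s.to_string())
}

fn main() {
    let col: VectorRef = Arc::new(StringVector::from(vec!["select 1"]));
    assert_eq!(first_script(&col), Some("select 1".to_string()));
}
```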
04158cdbd389..3885543e9876 100644 --- a/src/servers/src/http.rs +++ b/src/servers/src/http.rs @@ -277,7 +277,7 @@ impl JsonResponse { } } -async fn serve_api(Extension(api): Extension>) -> impl IntoApiResponse { +async fn serve_api(Extension(api): Extension) -> impl IntoApiResponse { Json(api) } diff --git a/src/servers/src/http/influxdb.rs b/src/servers/src/http/influxdb.rs index 06929eb3410c..b68cb3616f14 100644 --- a/src/servers/src/http/influxdb.rs +++ b/src/servers/src/http/influxdb.rs @@ -48,12 +48,12 @@ pub async fn influxdb_write( fn parse_time_precision(value: &str) -> Result { match value { - "n" => Ok(Precision::NANOSECOND), - "u" => Ok(Precision::MICROSECOND), - "ms" => Ok(Precision::MILLISECOND), - "s" => Ok(Precision::SECOND), - "m" => Ok(Precision::MINUTE), - "h" => Ok(Precision::HOUR), + "n" => Ok(Precision::Nanosecond), + "u" => Ok(Precision::Microsecond), + "ms" => Ok(Precision::Millisecond), + "s" => Ok(Precision::Second), + "m" => Ok(Precision::Minute), + "h" => Ok(Precision::Hour), unknown => TimePrecisionSnafu { name: unknown.to_string(), } @@ -69,12 +69,12 @@ mod tests { #[test] fn test_parse_time_precision() { - assert_eq!(Precision::NANOSECOND, parse_time_precision("n").unwrap()); - assert_eq!(Precision::MICROSECOND, parse_time_precision("u").unwrap()); - assert_eq!(Precision::MILLISECOND, parse_time_precision("ms").unwrap()); - assert_eq!(Precision::SECOND, parse_time_precision("s").unwrap()); - assert_eq!(Precision::MINUTE, parse_time_precision("m").unwrap()); - assert_eq!(Precision::HOUR, parse_time_precision("h").unwrap()); + assert_eq!(Precision::Nanosecond, parse_time_precision("n").unwrap()); + assert_eq!(Precision::Microsecond, parse_time_precision("u").unwrap()); + assert_eq!(Precision::Millisecond, parse_time_precision("ms").unwrap()); + assert_eq!(Precision::Second, parse_time_precision("s").unwrap()); + assert_eq!(Precision::Minute, parse_time_precision("m").unwrap()); + assert_eq!(Precision::Hour, parse_time_precision("h").unwrap()); assert!(parse_time_precision("unknown").is_err()); } } diff --git a/src/servers/src/influxdb.rs b/src/servers/src/influxdb.rs index 0766d6584360..870f6918b8a9 100644 --- a/src/servers/src/influxdb.rs +++ b/src/servers/src/influxdb.rs @@ -24,7 +24,7 @@ use crate::error::{Error, InfluxdbLineProtocolSnafu, InfluxdbLinesWriteSnafu}; use crate::line_writer::LineWriter; pub const INFLUXDB_TIMESTAMP_COLUMN_NAME: &str = "ts"; -pub const DEFAULT_TIME_PRECISION: Precision = Precision::NANOSECOND; +pub const DEFAULT_TIME_PRECISION: Precision = Precision::Nanosecond; pub struct InfluxdbRequest { pub precision: Option, @@ -359,11 +359,11 @@ monitor2,host=host4 cpu=66.3,memory=1029 1663840496400340003"; verify_column( &columns[3], "ts", - ColumnDataType::Timestamp, + ColumnDataType::TimestampMillisecond, SemanticType::Timestamp, Vec::new(), Values { - ts_millis_values: vec![1663840496100, 1663840496400], + ts_millisecond_values: vec![1663840496100, 1663840496400], ..Default::default() }, ); @@ -398,11 +398,11 @@ monitor2,host=host4 cpu=66.3,memory=1029 1663840496400340003"; verify_column( &columns[2], "ts", - ColumnDataType::Timestamp, + ColumnDataType::TimestampMillisecond, SemanticType::Timestamp, Vec::new(), Values { - ts_millis_values: vec![1663840496100, 1663840496400], + ts_millisecond_values: vec![1663840496100, 1663840496400], ..Default::default() }, ); diff --git a/src/servers/src/line_writer.rs b/src/servers/src/line_writer.rs index cbb2aff9873b..211e720399f8 100644 --- a/src/servers/src/line_writer.rs +++ 
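The `Precision` variants switch from SCREAMING_CASE to idiomatic PascalCase. A self-contained sketch of the parser above, with a local stand-in for `common_grpc::writer::Precision`:

```rust
// Local stand-in for common_grpc::writer::Precision; variant names
// follow the PascalCase convention the patch adopts.
#[derive(Debug, PartialEq, Eq)]
enum Precision {
    Nanosecond,
    Microsecond,
    Millisecond,
    Second,
    Minute,
    Hour,
}

fn parse_time_precision(value: &str) -> Result<Precision, String> {
    match value {
        "n" => Ok(Precision::Nanosecond),
        "u" => Ok(Precision::Microsecond),
        "ms" => Ok(Precision::Millisecond),
        "s" => Ok(Precision::Second),
        "m" => Ok(Precision::Minute),
        "h" => Ok(Precision::Hour),
        unknown => Err(format!("unknown time precision: {unknown}")),
    }
}

fn main() {
    assert_eq!(parse_time_precision("ms"), Ok(Precision::Millisecond));
    assert!(parse_time_precision("x").is_err());
}
```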
b/src/servers/src/line_writer.rs @@ -18,12 +18,16 @@ use common_catalog::consts::DEFAULT_CATALOG_NAME; use common_grpc::writer::{to_ms_ts, Precision}; use common_time::timestamp::TimeUnit::Millisecond; use common_time::Timestamp; +use datatypes::data_type::DataType; use datatypes::prelude::ConcreteDataType; -use datatypes::types::TimestampType; -use datatypes::value::Value; -use datatypes::vectors::{VectorBuilder, VectorRef}; +use datatypes::types::{TimestampMillisecondType, TimestampType}; +use datatypes::value::{Value, ValueRef}; +use datatypes::vectors::{MutableVector, VectorRef}; +use snafu::ResultExt; use table::requests::InsertRequest; +use crate::error::VectorConversionSnafu; + type ColumnLen = usize; type ColumnName = String; @@ -32,7 +36,7 @@ pub struct LineWriter { table_name: String, expected_rows: usize, current_rows: usize, - columns_builders: HashMap, + columns_builders: HashMap, ColumnLen)>, } impl LineWriter { @@ -48,7 +52,8 @@ impl LineWriter { pub fn write_ts(&mut self, column_name: &str, value: (i64, Precision)) { let (val, precision) = value; - let datatype = ConcreteDataType::Timestamp(TimestampType { unit: Millisecond }); + let datatype = + ConcreteDataType::Timestamp(TimestampType::Millisecond(TimestampMillisecondType)); let ts_val = Value::Timestamp(Timestamp::new(to_ms_ts(precision, val), Millisecond)); self.write(column_name, datatype, ts_val); } @@ -104,8 +109,12 @@ impl LineWriter { fn write(&mut self, column_name: &str, datatype: ConcreteDataType, value: Value) { let or_insert = || { let rows = self.current_rows; - let mut builder = VectorBuilder::with_capacity(datatype, self.expected_rows); - (0..rows).into_iter().for_each(|_| builder.push_null()); + let mut builder = datatype.create_mutable_vector(self.expected_rows); + (0..rows) + .into_iter() + .try_for_each(|_| builder.push_value_ref(ValueRef::Null)) + .context(VectorConversionSnafu) + .unwrap(); (builder, rows) }; let (builder, column_len) = self @@ -113,7 +122,7 @@ impl LineWriter { .entry(column_name.to_string()) .or_insert_with(or_insert); - builder.push(&value); + builder.push_value_ref(value.as_value_ref()).unwrap(); *column_len += 1; } @@ -122,18 +131,22 @@ impl LineWriter { self.columns_builders .values_mut() .into_iter() - .for_each(|(builder, len)| { + .try_for_each(|(builder, len)| { if self.current_rows > *len { - builder.push(&Value::Null) + builder.push_value_ref(ValueRef::Null) + } else { + Ok(()) } - }); + }) + .context(VectorConversionSnafu) + .unwrap(); } pub fn finish(self) -> InsertRequest { let columns_values: HashMap = self .columns_builders .into_iter() - .map(|(column_name, (mut builder, _))| (column_name, builder.finish())) + .map(|(column_name, (mut builder, _))| (column_name, builder.to_vector())) .collect(); InsertRequest { catalog_name: DEFAULT_CATALOG_NAME.to_string(), @@ -158,18 +171,18 @@ mod tests { #[test] fn test_writer() { let mut writer = LineWriter::with_lines(DEFAULT_SCHEMA_NAME, "demo".to_string(), 4); - writer.write_ts("ts", (1665893727685, Precision::MILLISECOND)); + writer.write_ts("ts", (1665893727685, Precision::Millisecond)); writer.write_tag("host", "host-1"); writer.write_i64("memory", 10_i64); writer.commit(); - writer.write_ts("ts", (1665893727686, Precision::MILLISECOND)); + writer.write_ts("ts", (1665893727686, Precision::Millisecond)); writer.write_tag("host", "host-2"); writer.write_tag("region", "region-2"); writer.write_i64("memory", 9_i64); writer.commit(); - writer.write_ts("ts", (1665893727689, Precision::MILLISECOND)); + writer.write_ts("ts", 
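`VectorBuilder` is replaced by per-type mutable vectors obtained from the data type itself. A hedged sketch of that flow, assuming the datatypes crate APIs exactly as they appear in this patch:

```rust
use datatypes::data_type::{ConcreteDataType, DataType};
use datatypes::value::ValueRef;
use datatypes::vectors::VectorRef;

// Back-fill nulls for rows written before a column first appeared,
// then append one real value -- the same shape as LineWriter::write.
fn pad_then_push(rows_behind: usize) -> VectorRef {
    let mut builder = ConcreteDataType::int64_datatype().create_mutable_vector(rows_behind + 1);
    for _ in 0..rows_behind {
        builder.push_value_ref(ValueRef::Null).unwrap();
    }
    builder.push_value_ref(ValueRef::Int64(42)).unwrap();
    builder.to_vector()
}
```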
(1665893727689, Precision::Millisecond)); writer.write_tag("host", "host-3"); writer.write_tag("region", "region-3"); writer.write_i64("cpu", 19_i64); @@ -195,9 +208,9 @@ mod tests { let cpu = columns.get("cpu").unwrap(); let expected: Vec = vec![ - Value::Timestamp(Timestamp::from_millis(1665893727685_i64)), - Value::Timestamp(Timestamp::from_millis(1665893727686_i64)), - Value::Timestamp(Timestamp::from_millis(1665893727689_i64)), + Value::Timestamp(Timestamp::new_millisecond(1665893727685_i64)), + Value::Timestamp(Timestamp::new_millisecond(1665893727686_i64)), + Value::Timestamp(Timestamp::new_millisecond(1665893727689_i64)), ]; assert_vector(&expected, ts); diff --git a/src/servers/src/mysql/federated.rs b/src/servers/src/mysql/federated.rs index f2f1a8caedb3..1736ae67feca 100644 --- a/src/servers/src/mysql/federated.rs +++ b/src/servers/src/mysql/federated.rs @@ -310,90 +310,85 @@ mod test { let output = check(query, Arc::new(QueryContext::new())); assert!(output.is_none()); - fn test(query: &str, expected: Vec<&str>) { + fn test(query: &str, expected: &str) { let output = check(query, Arc::new(QueryContext::new())); match output.unwrap() { Output::RecordBatches(r) => { - assert_eq!(r.pretty_print().lines().collect::>(), expected) + assert_eq!(&r.pretty_print().unwrap(), expected) } _ => unreachable!(), } } let query = "select version()"; - let expected = vec![ - "+-----------+", - "| version() |", - "+-----------+", - "| 8.0.26 |", - "+-----------+", - ]; + let expected = "\ ++-----------+ +| version() | ++-----------+ +| 8.0.26 | ++-----------+"; test(query, expected); let query = "SELECT @@version_comment LIMIT 1"; - let expected = vec![ - "+-------------------+", - "| @@version_comment |", - "+-------------------+", - "| Greptime |", - "+-------------------+", - ]; + let expected = "\ ++-------------------+ +| @@version_comment | ++-------------------+ +| Greptime | ++-------------------+"; test(query, expected); // variables let query = "select @@tx_isolation, @@session.tx_isolation"; - let expected = vec![ - "+-----------------+------------------------+", - "| @@tx_isolation | @@session.tx_isolation |", - "+-----------------+------------------------+", - "| REPEATABLE-READ | REPEATABLE-READ |", - "+-----------------+------------------------+", - ]; + let expected = "\ ++-----------------+------------------------+ +| @@tx_isolation | @@session.tx_isolation | ++-----------------+------------------------+ +| REPEATABLE-READ | REPEATABLE-READ | ++-----------------+------------------------+"; test(query, expected); // complex variables let query = "/* mysql-connector-java-8.0.17 (Revision: 16a712ddb3f826a1933ab42b0039f7fb9eebc6ec) */SELECT @@session.auto_increment_increment AS auto_increment_increment, @@character_set_client AS character_set_client, @@character_set_connection AS character_set_connection, @@character_set_results AS character_set_results, @@character_set_server AS character_set_server, @@collation_server AS collation_server, @@collation_connection AS collation_connection, @@init_connect AS init_connect, @@interactive_timeout AS interactive_timeout, @@license AS license, @@lower_case_table_names AS lower_case_table_names, @@max_allowed_packet AS max_allowed_packet, @@net_write_timeout AS net_write_timeout, @@performance_schema AS performance_schema, @@sql_mode AS sql_mode, @@system_time_zone AS system_time_zone, @@time_zone AS time_zone, @@transaction_isolation AS transaction_isolation, @@wait_timeout AS wait_timeout;"; - let expected = vec![ - 
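`pretty_print` now returns a single `Result<String>`, so each expected table becomes one string literal. The trailing-backslash idiom keeps the first border flush with the left margin; this part is plain Rust:

```rust
fn main() {
    // A `\` right after the opening quote swallows the newline and any
    // leading whitespace, so the literal begins at the first `+`.
    let expected = "\
+-----------+
| version() |
+-----------+
| 8.0.26    |
+-----------+";
    assert!(expected.starts_with('+'));
    assert_eq!(expected.lines().count(), 5);
}
```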
"+--------------------------+----------------------+--------------------------+-----------------------+----------------------+------------------+----------------------+--------------+---------------------+---------+------------------------+--------------------+-------------------+--------------------+----------+------------------+-----------+-----------------------+---------------+", - "| auto_increment_increment | character_set_client | character_set_connection | character_set_results | character_set_server | collation_server | collation_connection | init_connect | interactive_timeout | license | lower_case_table_names | max_allowed_packet | net_write_timeout | performance_schema | sql_mode | system_time_zone | time_zone | transaction_isolation | wait_timeout; |", - "+--------------------------+----------------------+--------------------------+-----------------------+----------------------+------------------+----------------------+--------------+---------------------+---------+------------------------+--------------------+-------------------+--------------------+----------+------------------+-----------+-----------------------+---------------+", - "| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 31536000 | 0 | 0 | 134217728 | 31536000 | 0 | 0 | UTC | UTC | REPEATABLE-READ | 31536000 |", - "+--------------------------+----------------------+--------------------------+-----------------------+----------------------+------------------+----------------------+--------------+---------------------+---------+------------------------+--------------------+-------------------+--------------------+----------+------------------+-----------+-----------------------+---------------+", - ]; + let expected = "\ ++--------------------------+----------------------+--------------------------+-----------------------+----------------------+------------------+----------------------+--------------+---------------------+---------+------------------------+--------------------+-------------------+--------------------+----------+------------------+-----------+-----------------------+---------------+ +| auto_increment_increment | character_set_client | character_set_connection | character_set_results | character_set_server | collation_server | collation_connection | init_connect | interactive_timeout | license | lower_case_table_names | max_allowed_packet | net_write_timeout | performance_schema | sql_mode | system_time_zone | time_zone | transaction_isolation | wait_timeout; | ++--------------------------+----------------------+--------------------------+-----------------------+----------------------+------------------+----------------------+--------------+---------------------+---------+------------------------+--------------------+-------------------+--------------------+----------+------------------+-----------+-----------------------+---------------+ +| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 31536000 | 0 | 0 | 134217728 | 31536000 | 0 | 0 | UTC | UTC | REPEATABLE-READ | 31536000 | ++--------------------------+----------------------+--------------------------+-----------------------+----------------------+------------------+----------------------+--------------+---------------------+---------+------------------------+--------------------+-------------------+--------------------+----------+------------------+-----------+-----------------------+---------------+"; test(query, expected); let query = "show variables"; - let expected = vec![ - "+---------------+-------+", - "| Variable_name | Value |", - "+---------------+-------+", - "| | 
|", - "+---------------+-------+", - ]; + let expected = "\ ++---------------+-------+ +| Variable_name | Value | ++---------------+-------+ +| | | ++---------------+-------+"; test(query, expected); let query = "show variables like 'lower_case_table_names'"; - let expected = vec![ - "+------------------------+-------+", - "| Variable_name | Value |", - "+------------------------+-------+", - "| lower_case_table_names | 0 |", - "+------------------------+-------+", - ]; + let expected = "\ ++------------------------+-------+ +| Variable_name | Value | ++------------------------+-------+ +| lower_case_table_names | 0 | ++------------------------+-------+"; test(query, expected); let query = "show collation"; - let expected = vec!["++", "++"]; // empty + let expected = "\ +++ +++"; // empty test(query, expected); let query = "SELECT TIMEDIFF(NOW(), UTC_TIMESTAMP())"; - let expected = vec![ - "+----------------------------------+", - "| TIMEDIFF(NOW(), UTC_TIMESTAMP()) |", - "+----------------------------------+", - "| 00:00:00 |", - "+----------------------------------+", - ]; + let expected = "\ ++----------------------------------+ +| TIMEDIFF(NOW(), UTC_TIMESTAMP()) | ++----------------------------------+ +| 00:00:00 | ++----------------------------------+"; test(query, expected); } } diff --git a/src/servers/src/mysql/server.rs b/src/servers/src/mysql/server.rs index 79a3bd3a6660..3bec0ebbbc28 100644 --- a/src/servers/src/mysql/server.rs +++ b/src/servers/src/mysql/server.rs @@ -41,7 +41,7 @@ const DEFAULT_RESULT_SET_WRITE_BUFFER_SIZE: usize = 100 * 1024; pub struct MysqlServer { base_server: BaseTcpServer, query_handler: SqlQueryHandlerRef, - tls: Arc, + tls: TlsOption, user_provider: Option, } @@ -49,7 +49,7 @@ impl MysqlServer { pub fn create_server( query_handler: SqlQueryHandlerRef, io_runtime: Arc, - tls: Arc, + tls: TlsOption, user_provider: Option, ) -> Box { Box::new(MysqlServer { diff --git a/src/servers/src/opentsdb/codec.rs b/src/servers/src/opentsdb/codec.rs index 260a206fe536..49fccc48489a 100644 --- a/src/servers/src/opentsdb/codec.rs +++ b/src/servers/src/opentsdb/codec.rs @@ -132,7 +132,7 @@ impl DataPoint { let mut line_writer = LineWriter::with_lines(DEFAULT_SCHEMA_NAME, self.metric.clone(), 1); line_writer.write_ts( OPENTSDB_TIMESTAMP_COLUMN_NAME, - (self.ts_millis(), Precision::MILLISECOND), + (self.ts_millis(), Precision::Millisecond), ); line_writer.write_f64(OPENTSDB_VALUE_COLUMN_NAME, self.value); @@ -152,11 +152,11 @@ impl DataPoint { let ts_column = Column { column_name: OPENTSDB_TIMESTAMP_COLUMN_NAME.to_string(), values: Some(column::Values { - ts_millis_values: vec![self.ts_millis], + ts_millisecond_values: vec![self.ts_millis], ..Default::default() }), semantic_type: SemanticType::Timestamp as i32, - datatype: ColumnDataType::Timestamp as i32, + datatype: ColumnDataType::TimestampMillisecond as i32, ..Default::default() }; columns.push(ts_column); @@ -336,7 +336,7 @@ mod test { assert_eq!(columns[0].column_name, OPENTSDB_TIMESTAMP_COLUMN_NAME); assert_eq!( - columns[0].values.as_ref().unwrap().ts_millis_values, + columns[0].values.as_ref().unwrap().ts_millisecond_values, vec![1000] ); diff --git a/src/servers/src/postgres/handler.rs b/src/servers/src/postgres/handler.rs index 36dbd80d334b..3d9b11c0770c 100644 --- a/src/servers/src/postgres/handler.rs +++ b/src/servers/src/postgres/handler.rs @@ -237,7 +237,7 @@ mod test { ColumnSchema::new("strings", ConcreteDataType::string_datatype(), true), ColumnSchema::new( "timestamps", - 
ConcreteDataType::timestamp_millis_datatype(), + ConcreteDataType::timestamp_millisecond_datatype(), true, ), ColumnSchema::new("dates", ConcreteDataType::date_datatype(), true), diff --git a/src/servers/src/postgres/server.rs b/src/servers/src/postgres/server.rs index d2b88449926f..5003af92c53b 100644 --- a/src/servers/src/postgres/server.rs +++ b/src/servers/src/postgres/server.rs @@ -37,14 +37,14 @@ pub struct PostgresServer { base_server: BaseTcpServer, auth_handler: Arc, query_handler: Arc, - tls: Arc, + tls: TlsOption, } impl PostgresServer { /// Creates a new Postgres server with provided query_handler and async runtime pub fn new( query_handler: SqlQueryHandlerRef, - tls: Arc, + tls: TlsOption, io_runtime: Arc, user_provider: Option, ) -> PostgresServer { diff --git a/src/servers/src/prometheus.rs b/src/servers/src/prometheus.rs index 1c2b035ec0ea..80d9db0b743a 100644 --- a/src/servers/src/prometheus.rs +++ b/src/servers/src/prometheus.rs @@ -22,7 +22,7 @@ use api::prometheus::remote::{Label, Query, Sample, TimeSeries, WriteRequest}; use api::v1::codec::SelectResult; use api::v1::column::SemanticType; use api::v1::{column, Column, ColumnDataType, InsertExpr}; -use common_grpc::writer::Precision::MILLISECOND; +use common_grpc::writer::Precision::Millisecond; use openmetrics_parser::{MetricsExposition, PrometheusType, PrometheusValue}; use snafu::{OptionExt, ResultExt}; use snap::raw::{Decoder, Encoder}; @@ -279,7 +279,7 @@ pub fn select_result_to_timeseries( timestamp: ts_column .values .as_ref() - .map(|vs| vs.ts_millis_values[ts_row]) + .map(|vs| vs.ts_millisecond_values[ts_row]) .unwrap_or(0i64), }; @@ -325,7 +325,7 @@ fn timeseries_to_insert_request(db: &str, mut timeseries: TimeSeries) -> Result< let ts_millis = sample.timestamp; let val = sample.value; - line_writer.write_ts(TIMESTAMP_COLUMN_NAME, (ts_millis, MILLISECOND)); + line_writer.write_ts(TIMESTAMP_COLUMN_NAME, (ts_millis, Millisecond)); line_writer.write_f64(VALUE_COLUMN_NAME, val); labels @@ -368,11 +368,11 @@ fn timeseries_to_insert_expr(database: &str, mut timeseries: TimeSeries) -> Resu let ts_column = Column { column_name: TIMESTAMP_COLUMN_NAME.to_string(), values: Some(column::Values { - ts_millis_values: samples.iter().map(|x| x.timestamp).collect(), + ts_millisecond_values: samples.iter().map(|x| x.timestamp).collect(), ..Default::default() }), semantic_type: SemanticType::Timestamp as i32, - datatype: ColumnDataType::Timestamp as i32, + datatype: ColumnDataType::TimestampMillisecond as i32, ..Default::default() }; columns.push(ts_column); @@ -686,7 +686,7 @@ mod tests { assert_eq!(columns[0].column_name, TIMESTAMP_COLUMN_NAME); assert_eq!( - columns[0].values.as_ref().unwrap().ts_millis_values, + columns[0].values.as_ref().unwrap().ts_millisecond_values, vec![1000, 2000] ); @@ -712,7 +712,7 @@ mod tests { assert_eq!(columns[0].column_name, TIMESTAMP_COLUMN_NAME); assert_eq!( - columns[0].values.as_ref().unwrap().ts_millis_values, + columns[0].values.as_ref().unwrap().ts_millisecond_values, vec![1000, 2000] ); @@ -743,7 +743,7 @@ mod tests { assert_eq!(columns[0].column_name, TIMESTAMP_COLUMN_NAME); assert_eq!( - columns[0].values.as_ref().unwrap().ts_millis_values, + columns[0].values.as_ref().unwrap().ts_millisecond_values, vec![1000, 2000, 3000] ); @@ -773,7 +773,7 @@ mod tests { Column { column_name: TIMESTAMP_COLUMN_NAME.to_string(), values: Some(column::Values { - ts_millis_values: vec![1000, 2000], + ts_millisecond_values: vec![1000, 2000], ..Default::default() }), ..Default::default() diff --git 
a/src/servers/tests/mysql/mysql_server_test.rs b/src/servers/tests/mysql/mysql_server_test.rs index fc0ef36f2a09..2d1aac91a98b 100644 --- a/src/servers/tests/mysql/mysql_server_test.rs +++ b/src/servers/tests/mysql/mysql_server_test.rs @@ -33,7 +33,7 @@ use table::test_util::MemTable; use crate::create_testing_sql_query_handler; use crate::mysql::{all_datatype_testing_data, MysqlTextRow, TestingData}; -fn create_mysql_server(table: MemTable, tls: Arc) -> Result> { +fn create_mysql_server(table: MemTable, tls: TlsOption) -> Result> { let query_handler = create_testing_sql_query_handler(table); let io_runtime = Arc::new( RuntimeBuilder::default() @@ -125,7 +125,7 @@ async fn test_shutdown_mysql_server() -> Result<()> { async fn test_query_all_datatypes() -> Result<()> { common_telemetry::init_default_ut_logging(); - let server_tls = Arc::new(TlsOption::default()); + let server_tls = TlsOption::default(); let client_tls = false; do_test_query_all_datatypes(server_tls, client_tls).await?; @@ -134,11 +134,11 @@ async fn test_query_all_datatypes() -> Result<()> { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn test_server_prefer_secure_client_plain() -> Result<()> { - let server_tls = Arc::new(TlsOption { + let server_tls = TlsOption { mode: servers::tls::TlsMode::Prefer, cert_path: "tests/ssl/server.crt".to_owned(), key_path: "tests/ssl/server.key".to_owned(), - }); + }; let client_tls = false; do_test_query_all_datatypes(server_tls, client_tls).await?; @@ -147,11 +147,11 @@ async fn test_server_prefer_secure_client_plain() -> Result<()> { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn test_server_prefer_secure_client_secure() -> Result<()> { - let server_tls = Arc::new(TlsOption { + let server_tls = TlsOption { mode: servers::tls::TlsMode::Prefer, cert_path: "tests/ssl/server.crt".to_owned(), key_path: "tests/ssl/server.key".to_owned(), - }); + }; let client_tls = true; do_test_query_all_datatypes(server_tls, client_tls).await?; @@ -160,11 +160,11 @@ async fn test_server_prefer_secure_client_secure() -> Result<()> { #[tokio::test(flavor = "multi_thread", worker_threads = 4)] async fn test_server_require_secure_client_secure() -> Result<()> { - let server_tls = Arc::new(TlsOption { + let server_tls = TlsOption { mode: servers::tls::TlsMode::Require, cert_path: "tests/ssl/server.crt".to_owned(), key_path: "tests/ssl/server.key".to_owned(), - }); + }; let client_tls = true; do_test_query_all_datatypes(server_tls, client_tls).await?; @@ -173,11 +173,11 @@ async fn test_server_require_secure_client_secure() -> Result<()> { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn test_server_required_secure_client_plain() -> Result<()> { - let server_tls = Arc::new(TlsOption { + let server_tls = TlsOption { mode: servers::tls::TlsMode::Require, cert_path: "tests/ssl/server.crt".to_owned(), key_path: "tests/ssl/server.key".to_owned(), - }); + }; let client_tls = false; @@ -202,7 +202,7 @@ async fn test_server_required_secure_client_plain() -> Result<()> { Ok(()) } -async fn do_test_query_all_datatypes(server_tls: Arc, client_tls: bool) -> Result<()> { +async fn do_test_query_all_datatypes(server_tls: TlsOption, client_tls: bool) -> Result<()> { common_telemetry::init_default_ut_logging(); let TestingData { column_schemas, diff --git a/src/servers/tests/postgres/mod.rs b/src/servers/tests/postgres/mod.rs index f7cdec12b2e1..5653251c0d7c 100644 --- a/src/servers/tests/postgres/mod.rs +++ b/src/servers/tests/postgres/mod.rs @@ -36,7 +36,7 @@ use 
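`TlsOption` is now passed by value everywhere instead of behind an `Arc`, which suits a small, cloneable config struct. A sketch with local stand-ins (field and variant names follow the tests above):

```rust
// Local stand-ins for servers::tls::{TlsOption, TlsMode}.
#[derive(Clone, Default)]
struct TlsOption {
    mode: TlsMode,
    cert_path: String,
    key_path: String,
}

#[derive(Clone, Default)]
enum TlsMode {
    #[default]
    Disable,
    Prefer,
    Require,
}

struct MysqlServer {
    tls: TlsOption, // previously Arc<TlsOption>
}

fn create_server(tls: TlsOption) -> MysqlServer {
    // Each server keeps its own copy; no shared ownership is needed.
    MysqlServer { tls }
}

fn main() {
    let tls = TlsOption {
        mode: TlsMode::Require,
        cert_path: "tests/ssl/server.crt".to_owned(),
        key_path: "tests/ssl/server.key".to_owned(),
    };
    let _server = create_server(tls.clone());
    let _other = create_server(tls);
}
```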
crate::create_testing_sql_query_handler; fn create_postgres_server( table: MemTable, check_pwd: bool, - tls: Arc, + tls: TlsOption, ) -> Result> { let query_handler = create_testing_sql_query_handler(table); let io_runtime = Arc::new( @@ -194,11 +194,11 @@ async fn test_query_pg_concurrently() -> Result<()> { async fn test_server_secure_prefer_client_plain() -> Result<()> { common_telemetry::init_default_ut_logging(); - let server_tls = Arc::new(TlsOption { + let server_tls = TlsOption { mode: servers::tls::TlsMode::Prefer, cert_path: "tests/ssl/server.crt".to_owned(), key_path: "tests/ssl/server.key".to_owned(), - }); + }; let client_tls = false; do_simple_query(server_tls, client_tls).await?; @@ -209,11 +209,11 @@ async fn test_server_secure_prefer_client_plain() -> Result<()> { async fn test_server_secure_require_client_plain() -> Result<()> { common_telemetry::init_default_ut_logging(); - let server_tls = Arc::new(TlsOption { + let server_tls = TlsOption { mode: servers::tls::TlsMode::Require, cert_path: "tests/ssl/server.crt".to_owned(), key_path: "tests/ssl/server.key".to_owned(), - }); + }; let server_port = start_test_server(server_tls).await?; let r = create_plain_connection(server_port, false).await; assert!(r.is_err()); @@ -224,11 +224,11 @@ async fn test_server_secure_require_client_plain() -> Result<()> { async fn test_server_secure_require_client_secure() -> Result<()> { common_telemetry::init_default_ut_logging(); - let server_tls = Arc::new(TlsOption { + let server_tls = TlsOption { mode: servers::tls::TlsMode::Require, cert_path: "tests/ssl/server.crt".to_owned(), key_path: "tests/ssl/server.key".to_owned(), - }); + }; let client_tls = true; do_simple_query(server_tls, client_tls).await?; @@ -237,7 +237,7 @@ async fn test_server_secure_require_client_secure() -> Result<()> { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn test_using_db() -> Result<()> { - let server_port = start_test_server(Arc::new(TlsOption::default())).await?; + let server_port = start_test_server(TlsOption::default()).await?; let client = create_connection_with_given_db(server_port, "testdb") .await @@ -253,7 +253,7 @@ async fn test_using_db() -> Result<()> { Ok(()) } -async fn start_test_server(server_tls: Arc) -> Result { +async fn start_test_server(server_tls: TlsOption) -> Result { common_telemetry::init_default_ut_logging(); let table = MemTable::default_numbers_table(); let pg_server = create_postgres_server(table, false, server_tls)?; @@ -262,7 +262,7 @@ async fn start_test_server(server_tls: Arc) -> Result { Ok(server_addr.port()) } -async fn do_simple_query(server_tls: Arc, client_tls: bool) -> Result<()> { +async fn do_simple_query(server_tls: TlsOption, client_tls: bool) -> Result<()> { let server_port = start_test_server(server_tls).await?; if !client_tls { diff --git a/src/sql/Cargo.toml b/src/sql/Cargo.toml index 6f7f40b017bd..ebdd0f172b0d 100644 --- a/src/sql/Cargo.toml +++ b/src/sql/Cargo.toml @@ -15,4 +15,4 @@ itertools = "0.10" mito = { path = "../mito" } once_cell = "1.10" snafu = { version = "0.7", features = ["backtraces"] } -sqlparser = "0.15.0" +sqlparser = "0.26" diff --git a/src/sql/src/ast.rs b/src/sql/src/ast.rs index 11636df8c078..7388b9453cc4 100644 --- a/src/sql/src/ast.rs +++ b/src/sql/src/ast.rs @@ -14,5 +14,5 @@ pub use sqlparser::ast::{ ColumnDef, ColumnOption, ColumnOptionDef, DataType, Expr, Function, FunctionArg, - FunctionArgExpr, Ident, ObjectName, SqlOption, TableConstraint, Value, + FunctionArgExpr, Ident, ObjectName, SqlOption, 
TableConstraint, TimezoneInfo, Value, }; diff --git a/src/sql/src/parser.rs b/src/sql/src/parser.rs index 254982e88e5a..3a14fb066619 100644 --- a/src/sql/src/parser.rs +++ b/src/sql/src/parser.rs @@ -505,11 +505,7 @@ mod tests { assert_matches!( &stmts[0], Statement::ShowTables(ShowTables { - kind: ShowKind::Where(sqlparser::ast::Expr::BinaryOp { - left: _, - right: _, - op: sqlparser::ast::BinaryOperator::Like, - }), + kind: ShowKind::Where(sqlparser::ast::Expr::Like { .. }), database: None, }) ); @@ -522,11 +518,7 @@ mod tests { assert_matches!( &stmts[0], Statement::ShowTables(ShowTables { - kind: ShowKind::Where(sqlparser::ast::Expr::BinaryOp { - left: _, - right: _, - op: sqlparser::ast::BinaryOperator::Like, - }), + kind: ShowKind::Where(sqlparser::ast::Expr::Like { .. }), database: Some(_), }) ); @@ -543,11 +535,12 @@ mod tests { distinct: false, top: None, projection: vec![sqlparser::ast::SelectItem::Wildcard], + into: None, from: vec![sqlparser::ast::TableWithJoins { relation: sqlparser::ast::TableFactor::Table { name: sqlparser::ast::ObjectName(vec![sqlparser::ast::Ident::new("foo")]), alias: None, - args: vec![], + args: None, with_hints: vec![], }, joins: vec![], @@ -559,11 +552,12 @@ mod tests { distribute_by: vec![], sort_by: vec![], having: None, + qualify: None, }; let sp_statement = SpStatement::Query(Box::new(SpQuery { with: None, - body: sqlparser::ast::SetExpr::Select(Box::new(select)), + body: Box::new(sqlparser::ast::SetExpr::Select(Box::new(select))), order_by: vec![], limit: None, offset: None, @@ -576,6 +570,7 @@ mod tests { analyze: false, verbose: false, statement: Box::new(sp_statement), + format: None, }) .unwrap(); diff --git a/src/sql/src/parsers/create_parser.rs b/src/sql/src/parsers/create_parser.rs index 0ddb8e6ff550..12dc8fa58e26 100644 --- a/src/sql/src/parsers/create_parser.rs +++ b/src/sql/src/parsers/create_parser.rs @@ -253,7 +253,7 @@ impl<'a> ParserContext<'a> { .parse_column_def() .context(SyntaxSnafu { sql: self.sql })?; - if !matches!(column.data_type, DataType::Timestamp) + if !matches!(column.data_type, DataType::Timestamp(_)) || matches!(self.parser.peek_token(), Token::Comma) { columns.push(column); diff --git a/src/sql/src/statements.rs b/src/sql/src/statements.rs index bcdc099265be..ba8397ca01ed 100644 --- a/src/sql/src/statements.rs +++ b/src/sql/src/statements.rs @@ -21,12 +21,12 @@ pub mod insert; pub mod query; pub mod show; pub mod statement; - use std::str::FromStr; use api::helper::ColumnDataTypeWrapper; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use common_time::Timestamp; +use datatypes::data_type::DataType; use datatypes::prelude::ConcreteDataType; use datatypes::schema::{ColumnDefaultConstraint, ColumnSchema}; use datatypes::types::DateTimeType; @@ -79,7 +79,7 @@ fn parse_string_to_value( data_type: &ConcreteDataType, ) -> Result { ensure!( - data_type.stringifiable(), + data_type.is_stringifiable(), ColumnTypeMismatchSnafu { column_name, expect: data_type.clone(), @@ -112,8 +112,8 @@ fn parse_string_to_value( ConcreteDataType::Timestamp(t) => { if let Ok(ts) = Timestamp::from_str(&s) { Ok(Value::Timestamp(Timestamp::new( - ts.convert_to(t.unit), - t.unit, + ts.convert_to(t.unit()), + t.unit(), ))) } else { ParseSqlValueSnafu { @@ -301,7 +301,10 @@ pub fn sql_data_type_to_concrete_data_type(data_type: &SqlDataType) -> Result Ok(ConcreteDataType::date_datatype()), SqlDataType::Custom(obj_name) => match &obj_name.0[..] 
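Under sqlparser 0.26, `LIKE` parses to a dedicated `Expr::Like` variant rather than `Expr::BinaryOp` with a `Like` operator, and `Query::body` is boxed. A small sketch against the 0.26 API:

```rust
use sqlparser::ast::{Expr, SetExpr, Statement};
use sqlparser::dialect::GenericDialect;
use sqlparser::parser::Parser;

fn main() {
    let sql = "SELECT * FROM t WHERE name LIKE 'demo%'";
    let stmts = Parser::parse_sql(&GenericDialect {}, sql).unwrap();

    // Statement::Query holds Box<Query>, whose body is now Box<SetExpr>.
    if let Statement::Query(query) = &stmts[0] {
        if let SetExpr::Select(select) = &*query.body {
            let selection = select.selection.as_ref().unwrap();
            // 0.26 gives LIKE its own AST node.
            assert!(matches!(selection, Expr::Like { .. }));
        }
    }
}
```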
{ [type_name] => { - if type_name.value.eq_ignore_ascii_case(DateTimeType::name()) { + if type_name + .value + .eq_ignore_ascii_case(DateTimeType::default().name()) + { Ok(ConcreteDataType::datetime_datatype()) } else { error::SqlTypeNotSupportedSnafu { @@ -315,7 +318,7 @@ pub fn sql_data_type_to_concrete_data_type(data_type: &SqlDataType) -> Result Ok(ConcreteDataType::timestamp_millis_datatype()), + SqlDataType::Timestamp(_) => Ok(ConcreteDataType::timestamp_millisecond_datatype()), _ => error::SqlTypeNotSupportedSnafu { t: data_type.clone(), } @@ -333,7 +336,7 @@ mod tests { use datatypes::value::OrderedFloat; use super::*; - use crate::ast::{DataType, Ident}; + use crate::ast::{Ident, TimezoneInfo}; use crate::statements::ColumnOption; fn check_type(sql_type: SqlDataType, data_type: ConcreteDataType) { @@ -373,8 +376,8 @@ mod tests { ConcreteDataType::datetime_datatype(), ); check_type( - SqlDataType::Timestamp, - ConcreteDataType::timestamp_millis_datatype(), + SqlDataType::Timestamp(TimezoneInfo::None), + ConcreteDataType::timestamp_millisecond_datatype(), ); } @@ -419,9 +422,13 @@ mod tests { let sql_val = SqlValue::Boolean(true); let v = sql_value_to_value("a", &ConcreteDataType::float64_datatype(), &sql_val); assert!(v.is_err()); - assert!(format!("{:?}", v).contains( - "column_name: \"a\", expect: Float64(Float64), actual: Boolean(BooleanType)" - )); + assert!( + format!("{:?}", v).contains( + "column_name: \"a\", expect: Float64(Float64Type), actual: Boolean(BooleanType)" + ), + "v is {:?}", + v + ); } #[test] @@ -471,7 +478,7 @@ mod tests { match parse_string_to_value( "timestamp_col", "2022-02-22T00:01:01+08:00".to_string(), - &ConcreteDataType::timestamp_millis_datatype(), + &ConcreteDataType::timestamp_millisecond_datatype(), ) .unwrap() { @@ -570,7 +577,7 @@ mod tests { // test basic let column_def = ColumnDef { name: "col".into(), - data_type: DataType::Double, + data_type: SqlDataType::Double, collation: None, options: vec![], }; @@ -585,7 +592,7 @@ mod tests { // test not null let column_def = ColumnDef { name: "col".into(), - data_type: DataType::Double, + data_type: SqlDataType::Double, collation: None, options: vec![ColumnOptionDef { name: None, diff --git a/src/sql/src/statements/insert.rs b/src/sql/src/statements/insert.rs index 410c0d09cb6b..f105648ea826 100644 --- a/src/sql/src/statements/insert.rs +++ b/src/sql/src/statements/insert.rs @@ -49,7 +49,7 @@ impl Insert { pub fn values(&self) -> Result>> { let values = match &self.inner { - Statement::Insert { source, .. } => match &source.body { + Statement::Insert { source, .. 
} => match &*source.body { SetExpr::Values(Values(exprs)) => sql_exprs_to_values(exprs)?, _ => unreachable!(), }, diff --git a/src/storage/Cargo.toml b/src/storage/Cargo.toml index 94d106f69957..9c107d3e6479 100644 --- a/src/storage/Cargo.toml +++ b/src/storage/Cargo.toml @@ -6,7 +6,7 @@ license = "Apache-2.0" [dependencies] arc-swap = "1.0" -arrow-format = { version = "0.4", features = ["ipc"] } +async-compat = "0.2" async-stream = "0.3" async-trait = "0.1" bytes = "1.1" @@ -22,6 +22,7 @@ futures-util = "0.3" lazy_static = "1.4" object-store = { path = "../object-store" } paste = "1.0" +parquet = { version = "26", features = ["async"] } planus = "0.2" prost = "0.11" regex = "1.5" diff --git a/src/storage/benches/memtable/mod.rs b/src/storage/benches/memtable/mod.rs index 462c3edc28b5..eb12b11ab079 100644 --- a/src/storage/benches/memtable/mod.rs +++ b/src/storage/benches/memtable/mod.rs @@ -20,9 +20,11 @@ pub mod util; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; -use common_time::Timestamp; use datatypes::prelude::ScalarVectorBuilder; -use datatypes::vectors::{StringVectorBuilder, TimestampVectorBuilder, UInt64VectorBuilder}; +use datatypes::timestamp::TimestampMillisecond; +use datatypes::vectors::{ + StringVectorBuilder, TimestampMillisecondVectorBuilder, UInt64VectorBuilder, +}; use rand::distributions::Alphanumeric; use rand::prelude::ThreadRng; use rand::Rng; @@ -69,11 +71,11 @@ fn kvs_with_index( values: &[(Option, String)], ) -> KeyValues { let mut key_builders = ( - TimestampVectorBuilder::with_capacity(keys.len()), + TimestampMillisecondVectorBuilder::with_capacity(keys.len()), UInt64VectorBuilder::with_capacity(keys.len()), ); for key in keys { - key_builders.0.push(Some(Timestamp::from_millis(key.0))); + key_builders.0.push(Some(TimestampMillisecond::from(key.0))); key_builders.1.push(Some(key.1)); } let row_keys = vec![ diff --git a/src/storage/benches/memtable/util/regiondesc_util.rs b/src/storage/benches/memtable/util/regiondesc_util.rs index 51dcb8795ad5..e8f71c71bd34 100644 --- a/src/storage/benches/memtable/util/regiondesc_util.rs +++ b/src/storage/benches/memtable/util/regiondesc_util.rs @@ -34,7 +34,7 @@ impl RegionDescBuilder { ColumnDescriptorBuilder::new( 1, TIMESTAMP_NAME, - ConcreteDataType::timestamp_millis_datatype(), + ConcreteDataType::timestamp_millisecond_datatype(), ) .is_nullable(false) .build() diff --git a/src/storage/benches/wal/util/mod.rs b/src/storage/benches/wal/util/mod.rs index 47bd3766c098..477297074afc 100644 --- a/src/storage/benches/wal/util/mod.rs +++ b/src/storage/benches/wal/util/mod.rs @@ -19,7 +19,7 @@ use std::sync::Arc; use datatypes::prelude::ScalarVector; use datatypes::type_id::LogicalTypeId; use datatypes::vectors::{ - BooleanVector, Float64Vector, StringVector, TimestampVector, UInt64Vector, + BooleanVector, Float64Vector, StringVector, TimestampMillisecondVector, UInt64Vector, }; use rand::Rng; use storage::proto; @@ -31,7 +31,7 @@ pub fn new_test_batch() -> WriteBatch { &[ ("k1", LogicalTypeId::UInt64, false), (consts::VERSION_COLUMN_NAME, LogicalTypeId::UInt64, false), - ("ts", LogicalTypeId::Timestamp, false), + ("ts", LogicalTypeId::TimestampMillisecond, false), ("v1", LogicalTypeId::Boolean, true), ("4", LogicalTypeId::Float64, false), ("5", LogicalTypeId::Float64, false), @@ -71,7 +71,7 @@ pub fn gen_new_batch_and_types(putdate_nums: usize) -> (WriteBatch, Vec) { rng.fill(&mut fvs[..]); let intv = Arc::new(UInt64Vector::from_slice(&intvs)); let boolv = Arc::new(BooleanVector::from(boolvs.to_vec())); - 
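The storage crate drops `arrow-format` in favor of the `parquet` crate with its `async` feature. A hedged sketch of the async reader that feature unlocks, assuming the parquet 26 API (path and batch size are illustrative):

```rust
use futures_util::TryStreamExt;
use parquet::arrow::async_reader::ParquetRecordBatchStreamBuilder;

async fn count_rows(path: &str) -> Result<usize, Box<dyn std::error::Error>> {
    // tokio::fs::File implements AsyncFileReader under the "async" feature.
    let file = tokio::fs::File::open(path).await?;
    let stream = ParquetRecordBatchStreamBuilder::new(file)
        .await?
        .with_batch_size(1024)
        .build()?;
    let batches = stream.try_collect::<Vec<_>>().await?;
    Ok(batches.iter().map(|b| b.num_rows()).sum())
}
```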
let tsv = Arc::new(TimestampVector::from_values(tsvs)); + let tsv = Arc::new(TimestampMillisecondVector::from_values(tsvs)); let fvs = Arc::new(Float64Vector::from_slice(&fvs)); let svs = Arc::new(StringVector::from_slice(&svs)); let mut put_data = PutData::default(); diff --git a/src/storage/proto/write_batch.proto b/src/storage/proto/write_batch.proto index 6f0ec4a388a7..ed1813aa556d 100644 --- a/src/storage/proto/write_batch.proto +++ b/src/storage/proto/write_batch.proto @@ -61,7 +61,12 @@ enum DataType { FLOAT64 = 11; STRING = 12; BINARY = 13; - TIMESTAMP = 14; + DATE = 14; + DATETIME = 15; + TIMESTAMP_SECOND = 16; + TIMESTAMP_MILLISECOND = 17; + TIMESTAMP_MICROSECOND = 18; + TIMESTAMP_NANOSECOND = 19; } message Values { @@ -81,5 +86,10 @@ message Values { repeated bool bool_values = 11; repeated bytes binary_values = 12; repeated string string_values = 13; - repeated int64 timestamp_values = 14; + repeated int32 date_values = 14; + repeated int64 datetime_values = 15; + repeated int64 ts_second_values = 16; + repeated int64 ts_millisecond_values = 17; + repeated int64 ts_microsecond_values = 18; + repeated int64 ts_nanosecond_values = 19; } diff --git a/src/storage/src/error.rs b/src/storage/src/error.rs index bc86199f23ec..53c34f8ecc83 100644 --- a/src/storage/src/error.rs +++ b/src/storage/src/error.rs @@ -18,7 +18,6 @@ use std::str::Utf8Error; use common_error::prelude::*; use datatypes::arrow; -use datatypes::arrow::error::ArrowError; use datatypes::prelude::ConcreteDataType; use serde_json::error::Error as JsonError; use store_api::manifest::action::ProtocolVersion; @@ -54,10 +53,16 @@ pub enum Error { #[snafu(display("Failed to write parquet file, source: {}", source))] WriteParquet { - source: arrow::error::ArrowError, + source: parquet::errors::ParquetError, backtrace: Backtrace, }, + #[snafu(display("Failed to create RecordBatch from vectors, source: {}", source))] + NewRecordBatch { + backtrace: Backtrace, + source: arrow::error::ArrowError, + }, + #[snafu(display("Fail to read object from path: {}, source: {}", path, source))] ReadObject { path: String, @@ -180,7 +185,7 @@ pub enum Error { #[snafu(display("Failed to read Parquet file: {}, source: {}", file, source))] ReadParquet { file: String, - source: ArrowError, + source: parquet::errors::ParquetError, backtrace: Backtrace, }, @@ -396,7 +401,8 @@ impl ErrorExt for Error { | AlterMetadata { .. } | CompatRead { .. } | CreateDefaultToRead { .. } - | NoDefaultToRead { .. } => StatusCode::Unexpected, + | NoDefaultToRead { .. } + | NewRecordBatch { .. } => StatusCode::Unexpected, FlushIo { .. } | WriteParquet { .. 
} @@ -484,14 +490,14 @@ mod tests { #[test] pub fn test_arrow_error() { fn throw_arrow_error() -> std::result::Result<(), ArrowError> { - Err(ArrowError::ExternalFormat("Lorem ipsum".to_string())) + Err(ArrowError::IoError("Lorem ipsum".to_string())) } let error = throw_arrow_error() - .context(WriteParquetSnafu) + .context(NewRecordBatchSnafu) .err() .unwrap(); - assert_eq!(StorageUnavailable, error.status_code()); + assert_eq!(Unexpected, error.status_code()); assert!(error.backtrace_opt().is_some()); } } diff --git a/src/storage/src/manifest/action.rs b/src/storage/src/manifest/action.rs index 6c53aaf94128..690fe679c44f 100644 --- a/src/storage/src/manifest/action.rs +++ b/src/storage/src/manifest/action.rs @@ -30,7 +30,7 @@ use crate::metadata::{ColumnFamilyMetadata, ColumnMetadata, VersionNumber}; use crate::sst::FileMeta; /// Minimal data that could be used to persist and recover [RegionMetadata](crate::metadata::RegionMetadata). -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct RawRegionMetadata { pub id: RegionId, pub name: String, @@ -40,7 +40,7 @@ pub struct RawRegionMetadata { } /// Minimal data that could be used to persist and recover [ColumnsMetadata](crate::metadata::ColumnsMetadata). -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct RawColumnsMetadata { pub columns: Vec, pub row_key_end: usize, @@ -55,7 +55,7 @@ pub struct RawColumnFamiliesMetadata { pub column_families: Vec, } -#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)] +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] pub struct RegionChange { /// The committed sequence of the region when this change happens. 
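The new `NewRecordBatch` variant follows the crate's snafu pattern: a backtrace-carrying variant wrapping the arrow error, raised through its generated context selector. A minimal sketch with snafu 0.7 (the selector name is derived from the variant):

```rust
use snafu::{Backtrace, ResultExt, Snafu};

#[derive(Debug, Snafu)]
enum Error {
    #[snafu(display("Failed to create RecordBatch from vectors, source: {}", source))]
    NewRecordBatch {
        backtrace: Backtrace,
        source: arrow::error::ArrowError,
    },
}

fn build() -> Result<(), Error> {
    // snafu 0.7 generates a `NewRecordBatchSnafu` selector; `context`
    // attaches it, plus a backtrace, to the underlying ArrowError.
    Err(arrow::error::ArrowError::IoError("Lorem ipsum".to_string()))
        .context(NewRecordBatchSnafu)
}
```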
So the /// data with sequence **greater than** this sequence would use the new @@ -78,7 +78,7 @@ pub struct RegionEdit { pub files_to_remove: Vec, } -#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)] +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] pub enum RegionMetaAction { Protocol(ProtocolAction), Change(RegionChange), @@ -86,7 +86,7 @@ pub enum RegionMetaAction { Edit(RegionEdit), } -#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)] +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] pub struct RegionMetaActionList { pub actions: Vec, pub prev_version: ManifestVersion, diff --git a/src/storage/src/memtable/btree.rs b/src/storage/src/memtable/btree.rs index a06c6ee5a7b7..e1da00a33d73 100644 --- a/src/storage/src/memtable/btree.rs +++ b/src/storage/src/memtable/btree.rs @@ -18,11 +18,10 @@ use std::ops::Bound; use std::sync::atomic::{AtomicUsize, Ordering as AtomicOrdering}; use std::sync::{Arc, RwLock}; +use datatypes::data_type::DataType; use datatypes::prelude::*; use datatypes::value::Value; -use datatypes::vectors::{ - UInt64Vector, UInt64VectorBuilder, UInt8Vector, UInt8VectorBuilder, VectorBuilder, -}; +use datatypes::vectors::{UInt64Vector, UInt64VectorBuilder, UInt8Vector, UInt8VectorBuilder}; use store_api::storage::{OpType, SequenceNumber}; use crate::error::Result; @@ -441,7 +440,7 @@ fn rows_to_vectors, T: RowsProvider>( let row_num = provider.row_num(); let mut builders = Vec::with_capacity(column_num); for data_type in data_types { - builders.push(VectorBuilder::with_capacity(data_type, row_num)); + builders.push(data_type.create_mutable_vector(row_num)); } let mut vectors = Vec::with_capacity(column_num); @@ -453,10 +452,13 @@ fn rows_to_vectors, T: RowsProvider>( for row_idx in 0..row_num { let row = provider.row_by_index(row_idx); let value = &row[col_idx]; - builder.push(value); + builder + .as_mut() + .push_value_ref(value.as_value_ref()) + .unwrap(); } - vectors.push(builder.finish()); + vectors.push(builder.to_vector()); } vectors diff --git a/src/storage/src/memtable/inserter.rs b/src/storage/src/memtable/inserter.rs index 6f0ea70b0fa2..a876f7c4c4f9 100644 --- a/src/storage/src/memtable/inserter.rs +++ b/src/storage/src/memtable/inserter.rs @@ -140,7 +140,7 @@ mod tests { use common_time::timestamp::Timestamp; use datatypes::type_id::LogicalTypeId; use datatypes::value::Value; - use datatypes::vectors::{Int64Vector, TimestampVector}; + use datatypes::vectors::{Int64Vector, TimestampMillisecondVector}; use store_api::storage::{PutOperation, WriteRequest}; use super::*; @@ -153,7 +153,7 @@ mod tests { fn new_test_write_batch() -> WriteBatch { write_batch_util::new_write_batch( &[ - ("ts", LogicalTypeId::Timestamp, false), + ("ts", LogicalTypeId::TimestampMillisecond, false), ("value", LogicalTypeId::Int64, true), ], Some(0), @@ -162,7 +162,7 @@ mod tests { fn new_region_schema() -> RegionSchemaRef { let desc = RegionDescBuilder::new("test") - .timestamp(("ts", LogicalTypeId::Timestamp, false)) + .timestamp(("ts", LogicalTypeId::TimestampMillisecond, false)) .push_value_column(("value", LogicalTypeId::Int64, true)) .enable_version_column(false) .build(); @@ -173,9 +173,9 @@ mod tests { fn put_batch(batch: &mut WriteBatch, data: &[(i64, Option)]) { let mut put_data = PutData::with_num_columns(2); - let ts = TimestampVector::from_values(data.iter().map(|v| v.0)); + let ts = TimestampMillisecondVector::from_values(data.iter().map(|v| v.0)); put_data.add_key_column("ts", Arc::new(ts)).unwrap(); - let value = 
Int64Vector::from_iter(data.iter().map(|v| v.1)); + let value = Int64Vector::from(data.iter().map(|v| v.1).collect::>()); put_data.add_value_column("value", Arc::new(value)).unwrap(); batch.put(put_data).unwrap(); @@ -195,7 +195,10 @@ mod tests { for i in 0..row_num { let ts = batch.column(0).get(i); let v = batch.column(1).get(i); - assert_eq!(Value::Timestamp(Timestamp::from_millis(data[index].0)), ts); + assert_eq!( + Value::Timestamp(Timestamp::new_millisecond(data[index].0)), + ts + ); assert_eq!(Value::from(data[index].1), v); assert_eq!(Value::from(sequence), batch.column(2).get(i)); diff --git a/src/storage/src/memtable/tests.rs b/src/storage/src/memtable/tests.rs index d51cc844ca01..2ede68cc0f21 100644 --- a/src/storage/src/memtable/tests.rs +++ b/src/storage/src/memtable/tests.rs @@ -12,12 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -use common_time::timestamp::Timestamp; -use datatypes::arrow; -use datatypes::arrow::array::{Int64Array, PrimitiveArray, UInt64Array, UInt8Array}; use datatypes::prelude::*; +use datatypes::timestamp::TimestampMillisecond; use datatypes::type_id::LogicalTypeId; -use datatypes::vectors::{TimestampVectorBuilder, UInt64VectorBuilder}; +use datatypes::vectors::{ + TimestampMillisecondVector, TimestampMillisecondVectorBuilder, UInt64Vector, + UInt64VectorBuilder, UInt8Vector, +}; use super::*; use crate::metadata::RegionMetadata; @@ -43,13 +44,13 @@ fn kvs_for_test_with_index( sequence: SequenceNumber, op_type: OpType, start_index_in_batch: usize, - keys: &[(Timestamp, u64)], + keys: &[(TimestampMillisecond, u64)], values: &[(Option, Option)], ) -> KeyValues { assert_eq!(keys.len(), values.len()); let mut key_builders = ( - TimestampVectorBuilder::with_capacity(keys.len()), + TimestampMillisecondVectorBuilder::with_capacity(keys.len()), UInt64VectorBuilder::with_capacity(keys.len()), ); for key in keys { @@ -91,7 +92,7 @@ fn kvs_for_test_with_index( fn kvs_for_test( sequence: SequenceNumber, op_type: OpType, - keys: &[(Timestamp, u64)], + keys: &[(TimestampMillisecond, u64)], values: &[(Option, Option)], ) -> KeyValues { kvs_for_test_with_index(sequence, op_type, 0, keys, values) @@ -104,7 +105,8 @@ pub fn write_kvs( keys: &[(i64, u64)], values: &[(Option, Option)], ) { - let keys: Vec<(Timestamp, u64)> = keys.iter().map(|(l, r)| ((*l).into(), *r)).collect(); + let keys: Vec<(TimestampMillisecond, u64)> = + keys.iter().map(|(l, r)| ((*l).into(), *r)).collect(); let kvs = kvs_for_test(sequence, op_type, &keys, values); @@ -126,7 +128,8 @@ fn check_iter_content( op_types: &[OpType], values: &[(Option, Option)], ) { - let keys: Vec<(Timestamp, u64)> = keys.iter().map(|(l, r)| ((*l).into(), *r)).collect(); + let keys: Vec<(TimestampMillisecond, u64)> = + keys.iter().map(|(l, r)| ((*l).into(), *r)).collect(); let mut index = 0; for batch in iter { @@ -239,7 +242,7 @@ fn write_iter_memtable_case(ctx: &TestContext) { ); // 9 key value pairs (6 + 3). 
- assert_eq!(288, ctx.memtable.bytes_allocated()); + assert_eq!(704, ctx.memtable.bytes_allocated()); let batch_sizes = [1, 4, 8, consts::READ_BATCH_SIZE]; for batch_size in batch_sizes { @@ -576,22 +579,16 @@ fn test_memtable_projection() { assert!(iter.next().is_none()); assert_eq!(5, batch.num_columns()); - let k0 = Int64Array::from_slice(&[1000, 1001, 1002]); - let k0 = PrimitiveArray::new( - arrow::datatypes::DataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None), - k0.values().clone(), - k0.validity().cloned(), - ); - - let k1 = UInt64Array::from_slice(&[0, 1, 2]); - let v0 = UInt64Array::from_slice(&[10, 11, 12]); - let sequences = UInt64Array::from_slice(&[9, 9, 9]); - let op_types = UInt8Array::from_slice(&[0, 0, 0]); - - assert_eq!(k0, &*batch.column(0).to_arrow_array()); - assert_eq!(k1, &*batch.column(1).to_arrow_array()); - assert_eq!(v0, &*batch.column(2).to_arrow_array()); - assert_eq!(sequences, &*batch.column(3).to_arrow_array()); - assert_eq!(op_types, &*batch.column(4).to_arrow_array()); + let k0 = Arc::new(TimestampMillisecondVector::from_slice(&[1000, 1001, 1002])) as VectorRef; + let k1 = Arc::new(UInt64Vector::from_slice(&[0, 1, 2])) as VectorRef; + let v0 = Arc::new(UInt64Vector::from_slice(&[10, 11, 12])) as VectorRef; + let sequences = Arc::new(UInt64Vector::from_slice(&[9, 9, 9])) as VectorRef; + let op_types = Arc::new(UInt8Vector::from_slice(&[0, 0, 0])) as VectorRef; + + assert_eq!(k0, *batch.column(0)); + assert_eq!(k1, *batch.column(1)); + assert_eq!(v0, *batch.column(2)); + assert_eq!(sequences, *batch.column(3)); + assert_eq!(op_types, *batch.column(4)); }); } diff --git a/src/storage/src/metadata.rs b/src/storage/src/metadata.rs index 3808f4f14ff1..02fc437509fc 100644 --- a/src/storage/src/metadata.rs +++ b/src/storage/src/metadata.rs @@ -186,7 +186,7 @@ pub type VersionNumber = u32; // TODO(yingwen): We may need to hold a list of history schema. /// In memory metadata of region. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, Eq)] pub struct RegionMetadata { // The following fields are immutable. id: RegionId, @@ -376,7 +376,7 @@ const METADATA_CF_ID_KEY: &str = "greptime:storage:cf_id"; const METADATA_COLUMN_ID_KEY: &str = "greptime:storage:column_id"; const METADATA_COMMENT_KEY: &str = "greptime:storage:comment"; -#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct ColumnMetadata { pub cf_id: ColumnFamilyId, pub desc: ColumnDescriptor, @@ -458,7 +458,7 @@ where default_value.context(MetaNotFoundSnafu { key }) } -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, Eq)] pub struct ColumnsMetadata { /// All columns. 
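The projection test now builds expected columns as `VectorRef`s and compares whole vectors instead of element-wise arrow arrays. A hedged sketch of that style, assuming the vector constructors used throughout this patch:

```rust
use std::sync::Arc;

use datatypes::vectors::{TimestampMillisecondVector, UInt64Vector, VectorRef};

fn main() {
    // Expected columns are built as type-erased VectorRefs; the
    // timestamp unit now lives in the vector type itself.
    let k0 = Arc::new(TimestampMillisecondVector::from_slice(&[1000, 1001, 1002])) as VectorRef;
    let v0 = Arc::new(UInt64Vector::from_slice(&[10, 11, 12])) as VectorRef;

    // Whole-vector equality replaces the old element-wise arrow-array
    // comparisons from the arrow2 version of this test.
    assert_eq!(k0, k0.clone());
    assert_eq!(v0.len(), 3);
}
```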
/// @@ -926,7 +926,7 @@ mod tests { fn test_descriptor_to_region_metadata() { let region_name = "region-0"; let desc = RegionDescBuilder::new(region_name) - .timestamp(("ts", LogicalTypeId::Timestamp, false)) + .timestamp(("ts", LogicalTypeId::TimestampMillisecond, false)) .enable_version_column(false) .push_key_column(("k1", LogicalTypeId::Int32, false)) .push_value_column(("v1", LogicalTypeId::Float32, true)) @@ -935,7 +935,7 @@ mod tests { let expect_schema = schema_util::new_schema_ref( &[ ("k1", LogicalTypeId::Int32, false), - ("ts", LogicalTypeId::Timestamp, false), + ("ts", LogicalTypeId::TimestampMillisecond, false), ("v1", LogicalTypeId::Float32, true), ], Some(1), @@ -1036,12 +1036,15 @@ mod tests { } fn new_metadata(enable_version_column: bool) -> RegionMetadata { - let timestamp = - ColumnDescriptorBuilder::new(2, "ts", ConcreteDataType::timestamp_millis_datatype()) - .is_nullable(false) - .is_time_index(true) - .build() - .unwrap(); + let timestamp = ColumnDescriptorBuilder::new( + 2, + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + ) + .is_nullable(false) + .is_time_index(true) + .build() + .unwrap(); let row_key = RowKeyDescriptorBuilder::new(timestamp) .push_column( ColumnDescriptorBuilder::new(3, "k1", ConcreteDataType::int64_datatype()) @@ -1078,7 +1081,7 @@ mod tests { let expect_schema = schema_util::new_schema_ref( &[ ("k1", LogicalTypeId::Int64, false), - ("ts", LogicalTypeId::Timestamp, false), + ("ts", LogicalTypeId::TimestampMillisecond, false), ("v1", LogicalTypeId::Int64, true), ], Some(1), @@ -1125,7 +1128,7 @@ mod tests { let expect_schema = schema_util::new_schema_ref( &[ ("k1", LogicalTypeId::Int64, false), - ("ts", LogicalTypeId::Timestamp, false), + ("ts", LogicalTypeId::TimestampMillisecond, false), (consts::VERSION_COLUMN_NAME, LogicalTypeId::UInt64, false), ("v1", LogicalTypeId::Int64, true), ], @@ -1266,7 +1269,7 @@ mod tests { fn test_validate_alter_request() { let builder = RegionDescBuilder::new("region-alter") .enable_version_column(false) - .timestamp(("ts", LogicalTypeId::Timestamp, false)) + .timestamp(("ts", LogicalTypeId::TimestampMillisecond, false)) .push_key_column(("k0", LogicalTypeId::Int32, false)) .push_value_column(("v0", LogicalTypeId::Float32, true)) .push_value_column(("v1", LogicalTypeId::Float32, true)); diff --git a/src/storage/src/proto/write_batch.rs b/src/storage/src/proto/write_batch.rs index 7d5ef21aedb2..d710df9dc2a0 100644 --- a/src/storage/src/proto/write_batch.rs +++ b/src/storage/src/proto/write_batch.rs @@ -22,13 +22,18 @@ use common_error::prelude::*; use datatypes::data_type::ConcreteDataType; use datatypes::prelude::{ScalarVector, ScalarVectorBuilder}; use datatypes::schema; +use datatypes::types::TimestampType; use datatypes::vectors::{ - BinaryVector, BinaryVectorBuilder, BooleanVector, BooleanVectorBuilder, Float32Vector, - Float32VectorBuilder, Float64Vector, Float64VectorBuilder, Int16Vector, Int16VectorBuilder, - Int32Vector, Int32VectorBuilder, Int64Vector, Int64VectorBuilder, Int8Vector, - Int8VectorBuilder, StringVector, StringVectorBuilder, TimestampVector, TimestampVectorBuilder, - UInt16Vector, UInt16VectorBuilder, UInt32Vector, UInt32VectorBuilder, UInt64Vector, - UInt64VectorBuilder, UInt8Vector, UInt8VectorBuilder, Vector, VectorRef, + BinaryVector, BinaryVectorBuilder, BooleanVector, BooleanVectorBuilder, DateTimeVector, + DateTimeVectorBuilder, DateVector, DateVectorBuilder, Float32Vector, Float32VectorBuilder, + Float64Vector, Float64VectorBuilder, Int16Vector, Int16VectorBuilder, 
+    Int32VectorBuilder, Int64Vector, Int64VectorBuilder, Int8Vector, Int8VectorBuilder,
+    StringVector, StringVectorBuilder, TimestampMicrosecondVector,
+    TimestampMicrosecondVectorBuilder, TimestampMillisecondVector,
+    TimestampMillisecondVectorBuilder, TimestampNanosecondVector, TimestampNanosecondVectorBuilder,
+    TimestampSecondVector, TimestampSecondVectorBuilder, UInt16Vector, UInt16VectorBuilder,
+    UInt32Vector, UInt32VectorBuilder, UInt64Vector, UInt64VectorBuilder, UInt8Vector,
+    UInt8VectorBuilder, Vector, VectorRef,
 };
 use paste::paste;
 use snafu::OptionExt;
@@ -148,7 +153,12 @@ impl From<&ConcreteDataType> for DataType {
             ConcreteDataType::String(_) => DataType::String,
             ConcreteDataType::Null(_) => DataType::Null,
             ConcreteDataType::Binary(_) => DataType::Binary,
-            ConcreteDataType::Timestamp(_) => DataType::Timestamp,
+            ConcreteDataType::Timestamp(unit) => match unit {
+                TimestampType::Second(_) => DataType::TimestampSecond,
+                TimestampType::Millisecond(_) => DataType::TimestampMillisecond,
+                TimestampType::Microsecond(_) => DataType::TimestampMicrosecond,
+                TimestampType::Nanosecond(_) => DataType::TimestampNanosecond,
+            },
             ConcreteDataType::Date(_)
             | ConcreteDataType::DateTime(_)
             | ConcreteDataType::List(_) => {
@@ -176,7 +186,12 @@ impl From<DataType> for ConcreteDataType {
             DataType::String => ConcreteDataType::string_datatype(),
             DataType::Binary => ConcreteDataType::binary_datatype(),
             DataType::Null => ConcreteDataType::null_datatype(),
-            DataType::Timestamp => ConcreteDataType::timestamp_millis_datatype(),
+            DataType::Date => ConcreteDataType::date_datatype(),
+            DataType::Datetime => ConcreteDataType::datetime_datatype(),
+            DataType::TimestampSecond => ConcreteDataType::timestamp_second_datatype(),
+            DataType::TimestampMillisecond => ConcreteDataType::timestamp_millisecond_datatype(),
+            DataType::TimestampMicrosecond => ConcreteDataType::timestamp_microsecond_datatype(),
+            DataType::TimestampNanosecond => ConcreteDataType::timestamp_nanosecond_datatype(),
         }
     }
 }
@@ -239,7 +254,12 @@ gen_columns!(f64, Float64Vector, v, v);
 gen_columns!(bool, BooleanVector, v, v);
 gen_columns!(binary, BinaryVector, v, v.to_vec());
 gen_columns!(string, StringVector, v, v.to_string());
-gen_columns!(timestamp, TimestampVector, v, v.value());
+gen_columns!(date, DateVector, v, v.val());
+gen_columns!(datetime, DateTimeVector, v, v.val());
+gen_columns!(ts_second, TimestampSecondVector, v, v.into());
+gen_columns!(ts_millisecond, TimestampMillisecondVector, v, v.into());
+gen_columns!(ts_microsecond, TimestampMicrosecondVector, v, v.into());
+gen_columns!(ts_nanosecond, TimestampNanosecondVector, v, v.into());
 
 #[macro_export]
 macro_rules! gen_put_data {
@@ -287,7 +307,27 @@ gen_put_data!(f64, Float64VectorBuilder, v, *v as f64);
 gen_put_data!(bool, BooleanVectorBuilder, v, *v);
 gen_put_data!(binary, BinaryVectorBuilder, v, v.as_slice());
 gen_put_data!(string, StringVectorBuilder, v, v.as_str());
-gen_put_data!(timestamp, TimestampVectorBuilder, v, (*v).into());
+gen_put_data!(date, DateVectorBuilder, v, (*v).into());
+gen_put_data!(datetime, DateTimeVectorBuilder, v, (*v).into());
+gen_put_data!(ts_second, TimestampSecondVectorBuilder, v, (*v).into());
+gen_put_data!(
+    ts_millisecond,
+    TimestampMillisecondVectorBuilder,
+    v,
+    (*v).into()
+);
+gen_put_data!(
+    ts_microsecond,
+    TimestampMicrosecondVectorBuilder,
+    v,
+    (*v).into()
+);
+gen_put_data!(
+    ts_nanosecond,
+    TimestampNanosecondVectorBuilder,
+    v,
+    (*v).into()
+);
 
 pub fn gen_columns(vector: &VectorRef) -> Result<Column> {
     let data_type = vector.data_type();
@@ -305,11 +345,15 @@ pub fn gen_columns(vector: &VectorRef) -> Result<Column> {
         ConcreteDataType::Float64(_) => gen_columns_f64(vector),
         ConcreteDataType::Binary(_) => gen_columns_binary(vector),
         ConcreteDataType::String(_) => gen_columns_string(vector),
-        ConcreteDataType::Timestamp(_) => gen_columns_timestamp(vector),
-        ConcreteDataType::Null(_)
-        | ConcreteDataType::Date(_)
-        | ConcreteDataType::DateTime(_)
-        | ConcreteDataType::List(_) => {
+        ConcreteDataType::Date(_) => gen_columns_date(vector),
+        ConcreteDataType::DateTime(_) => gen_columns_datetime(vector),
+        ConcreteDataType::Timestamp(t) => match t {
+            TimestampType::Second(_) => gen_columns_ts_second(vector),
+            TimestampType::Millisecond(_) => gen_columns_ts_millisecond(vector),
+            TimestampType::Microsecond(_) => gen_columns_ts_microsecond(vector),
+            TimestampType::Nanosecond(_) => gen_columns_ts_nanosecond(vector),
+        },
+        ConcreteDataType::Null(_) | ConcreteDataType::List(_) => {
             // TODO(jiachun): Maybe support some composite types in the future, such as list, struct, etc.
             unimplemented!("data type {:?} is not supported", data_type)
         }
@@ -331,11 +375,15 @@ pub fn gen_put_data_vector(data_type: ConcreteDataType, column: Column) -> Result<VectorRef> {
         ConcreteDataType::Float64(_) => gen_put_data_f64(column),
         ConcreteDataType::Binary(_) => gen_put_data_binary(column),
         ConcreteDataType::String(_) => gen_put_data_string(column),
-        ConcreteDataType::Timestamp(_) => gen_put_data_timestamp(column),
-        ConcreteDataType::Null(_)
-        | ConcreteDataType::Date(_)
-        | ConcreteDataType::DateTime(_)
-        | ConcreteDataType::List(_) => {
+        ConcreteDataType::Date(_) => gen_put_data_date(column),
+        ConcreteDataType::DateTime(_) => gen_put_data_datetime(column),
+        ConcreteDataType::Timestamp(t) => match t {
+            TimestampType::Second(_) => gen_put_data_ts_second(column),
+            TimestampType::Millisecond(_) => gen_put_data_ts_millisecond(column),
+            TimestampType::Microsecond(_) => gen_put_data_ts_microsecond(column),
+            TimestampType::Nanosecond(_) => gen_put_data_ts_nanosecond(column),
+        },
+        ConcreteDataType::Null(_) | ConcreteDataType::List(_) => {
             // TODO(jiachun): Maybe support some composite types in the future, such as list, struct, etc.
             unimplemented!("data type {:?} is not supported", data_type)
         }
diff --git a/src/storage/src/read/merge.rs b/src/storage/src/read/merge.rs
index 75c5d112dd22..b4f76b1f4197 100644
--- a/src/storage/src/read/merge.rs
+++ b/src/storage/src/read/merge.rs
@@ -605,7 +605,7 @@ impl MergeReader {
 #[cfg(test)]
 mod tests {
     use datatypes::prelude::ScalarVector;
-    use datatypes::vectors::{Int64Vector, TimestampVector};
+    use datatypes::vectors::{Int64Vector, TimestampMillisecondVector};
 
     use super::*;
     use crate::test_util::read_util;
@@ -692,7 +692,7 @@ mod tests {
             let key = batch
                 .column(0)
                 .as_any()
-                .downcast_ref::<TimestampVector>()
+                .downcast_ref::<TimestampMillisecondVector>()
                 .unwrap();
             let value = batch
                 .column(1)
@@ -703,7 +703,7 @@ mod tests {
             let batch: Vec<_> = key
                 .iter_data()
                 .zip(value.iter_data())
-                .map(|(k, v)| (k.unwrap().value(), v))
+                .map(|(k, v)| (k.unwrap().into(), v))
                 .collect();
             result.push(batch);
         }
diff --git a/src/storage/src/region/tests.rs b/src/storage/src/region/tests.rs
index 57357f2da0d8..798eca19c10b 100644
--- a/src/storage/src/region/tests.rs
+++ b/src/storage/src/region/tests.rs
@@ -20,10 +20,10 @@ mod flush;
 mod projection;
 
 use common_telemetry::logging;
-use common_time::timestamp::Timestamp;
-use datatypes::prelude::ScalarVector;
+use datatypes::prelude::{ScalarVector, WrapperType};
+use datatypes::timestamp::TimestampMillisecond;
 use datatypes::type_id::LogicalTypeId;
-use datatypes::vectors::{Int64Vector, TimestampVector};
+use datatypes::vectors::{Int64Vector, TimestampMillisecondVector};
 use log_store::fs::log::LocalFileLogStore;
 use log_store::fs::noop::NoopLogStore;
 use object_store::backend::fs;
@@ -70,7 +70,7 @@ impl TesterBase {
     ///
     /// Format of data: (timestamp, v0), timestamp is key, v0 is value.
     pub async fn put(&self, data: &[(i64, Option<i64>)]) -> WriteResponse {
-        let data: Vec<(Timestamp, Option<i64>)> =
+        let data: Vec<(TimestampMillisecond, Option<i64>)> =
             data.iter().map(|(l, r)| ((*l).into(), *r)).collect();
         // Build a batch without version.
         let mut batch = new_write_batch_for_test(false);
@@ -82,7 +82,7 @@ impl TesterBase {
 
     /// Put without version specified directly to inner writer.
     pub async fn put_inner(&self, data: &[(i64, Option<i64>)]) -> WriteResponse {
-        let data: Vec<(Timestamp, Option<i64>)> =
+        let data: Vec<(TimestampMillisecond, Option<i64>)> =
             data.iter().map(|(l, r)| ((*l).into(), *r)).collect();
         let mut batch = new_write_batch_for_test(false);
         let put_data = new_put_data(&data);
@@ -131,7 +131,11 @@ fn new_write_batch_for_test(enable_version_column: bool) -> WriteBatch {
     if enable_version_column {
         write_batch_util::new_write_batch(
             &[
-                (test_util::TIMESTAMP_NAME, LogicalTypeId::Timestamp, false),
+                (
+                    test_util::TIMESTAMP_NAME,
+                    LogicalTypeId::TimestampMillisecond,
+                    false,
+                ),
                 (consts::VERSION_COLUMN_NAME, LogicalTypeId::UInt64, false),
                 ("v0", LogicalTypeId::Int64, true),
             ],
@@ -140,7 +144,11 @@ fn new_write_batch_for_test(enable_version_column: bool) -> WriteBatch {
     } else {
         write_batch_util::new_write_batch(
             &[
-                (test_util::TIMESTAMP_NAME, LogicalTypeId::Timestamp, false),
+                (
+                    test_util::TIMESTAMP_NAME,
+                    LogicalTypeId::TimestampMillisecond,
+                    false,
+                ),
                 ("v0", LogicalTypeId::Int64, true),
             ],
             Some(0),
@@ -148,11 +156,12 @@
     }
 }
 
-fn new_put_data(data: &[(Timestamp, Option<i64>)]) -> PutData {
+fn new_put_data(data: &[(TimestampMillisecond, Option<i64>)]) -> PutData {
     let mut put_data = PutData::with_num_columns(2);
 
-    let timestamps = TimestampVector::from_vec(data.iter().map(|v| v.0).collect());
-    let values = Int64Vector::from_iter(data.iter().map(|kv| kv.1));
+    let timestamps =
+        TimestampMillisecondVector::from_vec(data.iter().map(|v| v.0.into()).collect());
+    let values = Int64Vector::from_owned_iterator(data.iter().map(|kv| kv.1));
 
     put_data
         .add_key_column(test_util::TIMESTAMP_NAME, Arc::new(timestamps))
@@ -167,14 +176,14 @@ fn append_chunk_to(chunk: &Chunk, dst: &mut Vec<(i64, Option<i64>)>) {
     let timestamps = chunk.columns[0]
         .as_any()
-        .downcast_ref::<TimestampVector>()
+        .downcast_ref::<TimestampMillisecondVector>()
         .unwrap();
     let values = chunk.columns[1]
         .as_any()
         .downcast_ref::<Int64Vector>()
         .unwrap();
     for (ts, value) in timestamps.iter_data().zip(values.iter_data()) {
-        dst.push((ts.unwrap().value(), value));
+        dst.push((ts.unwrap().into_native(), value));
     }
 }
 
@@ -207,7 +216,11 @@ async fn test_new_region() {
     let expect_schema = schema_util::new_schema_ref(
         &[
             ("k1", LogicalTypeId::Int32, false),
-            (test_util::TIMESTAMP_NAME, LogicalTypeId::Timestamp, false),
+            (
+                test_util::TIMESTAMP_NAME,
+                LogicalTypeId::TimestampMillisecond,
+                false,
+            ),
             (consts::VERSION_COLUMN_NAME, LogicalTypeId::UInt64, false),
             ("v0", LogicalTypeId::Float32, true),
         ],
diff --git a/src/storage/src/region/tests/alter.rs b/src/storage/src/region/tests/alter.rs
index 3ab273f1bbce..4372e96c9506 100644
--- a/src/storage/src/region/tests/alter.rs
+++ b/src/storage/src/region/tests/alter.rs
@@ -15,9 +15,9 @@
 use std::collections::BTreeMap;
 use std::sync::Arc;
 
-use common_time::Timestamp;
 use datatypes::prelude::*;
-use datatypes::vectors::{Int64Vector, TimestampVector};
+use datatypes::timestamp::TimestampMillisecond;
+use datatypes::vectors::{Int64Vector, TimestampMillisecondVector};
 use log_store::fs::log::LocalFileLogStore;
 use store_api::storage::{
     AddColumn, AlterOperation, AlterRequest, Chunk, ChunkReader, ColumnDescriptor,
@@ -53,7 +53,7 @@ struct AlterTester {
 #[derive(Debug, Clone, PartialEq)]
 struct DataRow {
     key: Option<i64>,
-    ts: Timestamp,
+    ts: TimestampMillisecond,
     v0: Option<i64>,
     v1: Option<i64>,
 }
@@ -71,11 +71,14 @@ impl DataRow {
 
 fn new_put_data(data: &[DataRow]) -> PutData {
     let mut put_data = PutData::with_num_columns(4);
-
-    let keys = Int64Vector::from_iter(data.iter().map(|v| v.key));
-    let timestamps = TimestampVector::from_vec(data.iter().map(|v| v.ts).collect());
-    let values1 = Int64Vector::from_iter(data.iter().map(|kv| kv.v0));
-    let values2 = Int64Vector::from_iter(data.iter().map(|kv| kv.v1));
+    let keys = Int64Vector::from(data.iter().map(|v| v.key).collect::<Vec<_>>());
+    let timestamps = TimestampMillisecondVector::from(
+        data.iter()
+            .map(|v| Some(v.ts.into_native()))
+            .collect::<Vec<_>>(),
+    );
+    let values1 = Int64Vector::from(data.iter().map(|kv| kv.v0).collect::<Vec<_>>());
+    let values2 = Int64Vector::from(data.iter().map(|kv| kv.v1).collect::<Vec<_>>());
 
     put_data.add_key_column("k0", Arc::new(keys)).unwrap();
     put_data
@@ -193,7 +196,7 @@ fn append_chunk_to(chunk: &Chunk, dst: &mut Vec<DataRow>) {
         .unwrap();
     let ts_vector = chunk.columns[1]
         .as_any()
-        .downcast_ref::<TimestampVector>()
+        .downcast_ref::<TimestampMillisecondVector>()
         .unwrap();
     let v0_vector = chunk.columns[2]
         .as_any()
@@ -206,7 +209,7 @@ fn append_chunk_to(chunk: &Chunk, dst: &mut Vec<DataRow>) {
     for i in 0..k0_vector.len() {
         dst.push(DataRow::new(
             k0_vector.get_data(i),
-            ts_vector.get_data(i).unwrap().value(),
+            ts_vector.get_data(i).unwrap().into(),
             v0_vector.get_data(i),
             v1_vector.get_data(i),
         ));
diff --git a/src/storage/src/region/tests/projection.rs b/src/storage/src/region/tests/projection.rs
index d607f4113305..98d0e5026adc 100644
--- a/src/storage/src/region/tests/projection.rs
+++ b/src/storage/src/region/tests/projection.rs
@@ -17,7 +17,7 @@
 use std::sync::Arc;
 
 use datatypes::data_type::ConcreteDataType;
 use datatypes::prelude::ScalarVector;
 use datatypes::type_id::LogicalTypeId;
-use datatypes::vectors::{Int64Vector, TimestampVector};
+use datatypes::vectors::{Int64Vector, TimestampMillisecondVector};
 use log_store::fs::log::LocalFileLogStore;
 use store_api::logstore::LogStore;
 use store_api::storage::{
@@ -40,7 +40,11 @@ fn new_write_batch_for_test() -> WriteBatch {
     write_batch_util::new_write_batch(
         &[
             ("k0", LogicalTypeId::Int64, false),
-            (test_util::TIMESTAMP_NAME, LogicalTypeId::Timestamp, false),
+            (
+                test_util::TIMESTAMP_NAME,
+                LogicalTypeId::TimestampMillisecond,
+                false,
+            ),
             ("v0", LogicalTypeId::Int64, true),
             ("v1", LogicalTypeId::Int64, true),
         ],
@@ -60,7 +64,7 @@ fn new_put_data(len: usize, key_start: i64, ts_start: i64, initial_value: i64) -> PutData {
     let mut put_data = PutData::with_num_columns(4);
 
     let k0 = Int64Vector::from_values((0..len).map(|v| key_start + v as i64));
-    let ts = TimestampVector::from_values((0..len).map(|v| ts_start + v as i64));
+    let ts = TimestampMillisecondVector::from_values((0..len).map(|v| ts_start + v as i64));
     let v0 = Int64Vector::from_values(std::iter::repeat(initial_value).take(len));
     let v1 = Int64Vector::from_values((0..len).map(|v| initial_value + v as i64));
 
@@ -95,11 +99,11 @@ fn append_chunk_to(chunk: &Chunk, dst: &mut Vec<Vec<i64>>) {
             ConcreteDataType::Timestamp(_) => {
                 let val = col
                     .as_any()
-                    .downcast_ref::<TimestampVector>()
+                    .downcast_ref::<TimestampMillisecondVector>()
                     .unwrap()
                     .get_data(i)
                     .unwrap();
-                row.push(val.value());
+                row.push(val.into());
             }
             _ => unreachable!(),
         }
diff --git a/src/storage/src/schema.rs b/src/storage/src/schema.rs
index 6bc344f3a701..dcec7ef1d19c 100644
--- a/src/storage/src/schema.rs
+++ b/src/storage/src/schema.rs
@@ -25,7 +25,9 @@ pub use crate::schema::store::{StoreSchema, StoreSchemaRef};
 mod tests {
     use std::sync::Arc;
 
-    use datatypes::vectors::{Int64Vector, UInt64Vector, UInt8Vector, VectorRef};
+    use datatypes::vectors::{
+        Int64Vector, TimestampMillisecondVector, UInt64Vector, UInt8Vector, VectorRef,
+    };
 
     use crate::read::Batch;
 
@@ -37,7 +39,8 @@ mod tests {
     pub(crate) fn new_batch_with_num_values(num_value_columns: usize) -> Batch {
         let k0 = Int64Vector::from_slice(&[1, 2, 3]);
-        let timestamp = Int64Vector::from_slice(&[4, 5, 6]);
+        let timestamp = TimestampMillisecondVector::from_vec(vec![4, 5, 6]);
+
         let mut columns: Vec<VectorRef> = vec![Arc::new(k0), Arc::new(timestamp)];
 
         for i in 0..num_value_columns {
diff --git a/src/storage/src/schema/compat.rs b/src/storage/src/schema/compat.rs
index 5e1c22cd773f..d8d5f9a08a0f 100644
--- a/src/storage/src/schema/compat.rs
+++ b/src/storage/src/schema/compat.rs
@@ -14,11 +14,7 @@
 //! Utilities for resolving schema compatibility problems.
 
-use std::sync::Arc;
-
-use datatypes::arrow::array::Array;
-use datatypes::arrow::chunk::Chunk;
-use datatypes::arrow::datatypes::Field;
+use datatypes::arrow::record_batch::RecordBatch;
 use datatypes::schema::SchemaRef;
 use datatypes::vectors::{Helper, VectorRef};
 use snafu::{ensure, OptionExt, ResultExt};
@@ -230,36 +226,19 @@ impl ReadAdapter {
         self.source_columns_to_batch(source, num_rows)
     }
 
-    /// Returns list of fields need to read from the parquet file.
-    pub fn fields_to_read(&self) -> Vec<Field> {
-        if !self.need_compat() {
-            return self
-                .dest_schema
-                .schema_to_read()
-                .arrow_schema()
-                .fields
-                .clone();
-        }
-
-        self.source_schema
-            .arrow_schema()
-            .fields
+    /// Returns list of fields indices need to read from the parquet file.
+    pub fn fields_to_read(&self) -> Vec<usize> {
+        self.is_source_needed
             .iter()
-            .zip(self.is_source_needed.iter())
-            .filter_map(|(field, is_needed)| {
-                if *is_needed {
-                    Some(field.clone())
-                } else {
-                    None
-                }
-            })
-            .collect()
+            .enumerate()
+            .filter_map(|(idx, needed)| if *needed { Some(idx) } else { None })
+            .collect::<Vec<_>>()
     }
 
-    /// Convert chunk read from the parquet file into [Batch].
+    /// Convert [RecordBatch] read from the parquet file into [Batch].
     ///
-    /// The chunk should have the same schema as [`ReadAdapter::fields_to_read()`].
-    pub fn arrow_chunk_to_batch(&self, chunk: &Chunk<Arc<dyn Array>>) -> Result<Batch> {
+    /// The [RecordBatch] should have the same schema as [`ReadAdapter::fields_to_read()`].
+    pub fn arrow_record_batch_to_batch(&self, record_batch: &RecordBatch) -> Result<Batch> {
         let names = self
             .source_schema
             .schema()
             .column_schemas()
             .iter()
             .zip(self.is_source_needed.iter())
             .filter_map(|(column_schema, is_needed)| {
                 if *is_needed {
                     Some(&column_schema.name)
                 } else {
                     None
                 }
             });
-        let source = chunk
+        let source = record_batch
+            .columns()
             .iter()
             .zip(names)
             .map(|(column, name)| {
@@ -281,11 +261,11 @@ impl ReadAdapter {
             })
             .collect::<Result<Vec<_>>>()?;
 
-        if !self.need_compat() || chunk.is_empty() {
+        if !self.need_compat() || record_batch.num_rows() == 0 {
             return Ok(Batch::new(source));
         }
 
-        let num_rows = chunk.len();
+        let num_rows = record_batch.num_rows();
         self.source_columns_to_batch(source, num_rows)
     }
 
@@ -323,8 +303,11 @@ impl ReadAdapter {
 
 #[cfg(test)]
 mod tests {
+    use std::sync::Arc;
+
     use datatypes::data_type::ConcreteDataType;
-    use store_api::storage::{consts, ColumnDescriptorBuilder};
+    use datatypes::schema::Schema;
+    use store_api::storage::ColumnDescriptorBuilder;
 
     use super::*;
     use crate::error::Error;
@@ -332,12 +315,6 @@ mod tests {
     use crate::schema::{tests, ProjectedSchema, RegionSchema};
     use crate::test_util::{descriptor_util, schema_util};
 
-    fn check_fields(fields: &[Field], names: &[&str]) {
-        for (field, name) in fields.iter().zip(names) {
-            assert_eq!(&field.name, name);
-        }
-    }
-
     fn call_batch_from_parts(
         adapter: &ReadAdapter,
         batch: &Batch,
@@ -363,9 +340,26 @@ mod tests {
     }
 
     fn call_arrow_chunk_to_batch(adapter: &ReadAdapter, batch: &Batch) -> Batch {
+        let columns_schema = adapter
+            .source_schema
+            .columns()
+            .iter()
+            .zip(adapter.is_source_needed.iter())
+            .filter_map(|(field, is_needed)| {
+                if *is_needed {
+                    Some(field.to_column_schema().unwrap())
+                } else {
+                    None
+                }
+            })
+            .collect::<Vec<_>>();
+        let arrow_schema = Schema::try_new(columns_schema)
+            .unwrap()
+            .arrow_schema()
+            .clone();
         let arrays = batch.columns().iter().map(|v| v.to_arrow_array()).collect();
-        let chunk = Chunk::new(arrays);
-        adapter.arrow_chunk_to_batch(&chunk).unwrap()
+        let chunk = RecordBatch::try_new(arrow_schema, arrays).unwrap();
+        adapter.arrow_record_batch_to_batch(&chunk).unwrap()
     }
 
     fn check_arrow_chunk_to_batch_without_padding(adapter: &ReadAdapter, batch: &Batch) {
@@ -404,7 +398,6 @@ mod tests {
         // (k0, timestamp, v0, v1) with version 0.
         let region_schema = Arc::new(schema_util::new_region_schema(0, 2));
         let projected_schema = Arc::new(ProjectedSchema::no_projection(region_schema.clone()));
-
         let source_schema = region_schema.store_schema().clone();
         let adapter = ReadAdapter::new(source_schema, projected_schema).unwrap();
@@ -414,17 +407,7 @@ mod tests {
         let batch = tests::new_batch_with_num_values(2);
         check_batch_from_parts_without_padding(&adapter, &batch, 2);
 
-        check_fields(
-            &adapter.fields_to_read(),
-            &[
-                "k0",
-                "timestamp",
-                "v0",
-                "v1",
-                consts::SEQUENCE_COLUMN_NAME,
-                consts::OP_TYPE_COLUMN_NAME,
-            ],
-        );
+        assert_eq!(&adapter.fields_to_read(), &[0, 1, 2, 3, 4, 5],);
 
         check_arrow_chunk_to_batch_without_padding(&adapter, &batch);
     }
@@ -447,16 +430,7 @@ mod tests {
         let batch = tests::new_batch_with_num_values(1);
         check_batch_from_parts_without_padding(&adapter, &batch, 1);
 
-        check_fields(
-            &adapter.fields_to_read(),
-            &[
-                "k0",
-                "timestamp",
-                "v0",
-                consts::SEQUENCE_COLUMN_NAME,
-                consts::OP_TYPE_COLUMN_NAME,
-            ],
-        );
+        assert_eq!(&adapter.fields_to_read(), &[0, 1, 2, 4, 5]);
 
         check_arrow_chunk_to_batch_without_padding(&adapter, &batch);
     }
@@ -481,16 +455,7 @@ mod tests {
         let batch = tests::new_batch_with_num_values(1);
         check_batch_from_parts_without_padding(&adapter, &batch, 1);
 
-        check_fields(
-            &adapter.fields_to_read(),
-            &[
-                "k0",
-                "timestamp",
-                "v0",
-                consts::SEQUENCE_COLUMN_NAME,
-                consts::OP_TYPE_COLUMN_NAME,
-            ],
-        );
+        assert_eq!(&adapter.fields_to_read(), &[0, 1, 2, 3, 4],);
 
         check_arrow_chunk_to_batch_without_padding(&adapter, &batch);
     }
@@ -519,16 +484,7 @@ mod tests {
         // v2 is filled by null.
         check_batch_with_null_padding(&batch, &new_batch, &[3]);
 
-        check_fields(
-            &adapter.fields_to_read(),
-            &[
-                "k0",
-                "timestamp",
-                "v0",
-                consts::SEQUENCE_COLUMN_NAME,
-                consts::OP_TYPE_COLUMN_NAME,
-            ],
-        );
+        assert_eq!(&adapter.fields_to_read(), &[0, 1, 2, 4, 5],);
 
         let new_batch = call_arrow_chunk_to_batch(&adapter, &batch);
         check_batch_with_null_padding(&batch, &new_batch, &[3]);
@@ -567,16 +523,7 @@ mod tests {
         // v0 is filled by null.
         check_batch_with_null_padding(&batch, &new_batch, &[2]);
 
-        check_fields(
-            &adapter.fields_to_read(),
-            &[
-                "k0",
-                "timestamp",
-                "v1",
-                consts::SEQUENCE_COLUMN_NAME,
-                consts::OP_TYPE_COLUMN_NAME,
-            ],
-        );
+        assert_eq!(&adapter.fields_to_read(), &[0, 1, 3, 4, 5],);
 
         let new_batch = call_arrow_chunk_to_batch(&adapter, &batch);
         check_batch_with_null_padding(&batch, &new_batch, &[2]);
diff --git a/src/storage/src/schema/projected.rs b/src/storage/src/schema/projected.rs
index 6e746c9ff923..f50d43143370 100644
--- a/src/storage/src/schema/projected.rs
+++ b/src/storage/src/schema/projected.rs
@@ -186,7 +186,6 @@ impl ProjectedSchema {
                 .collect()
             }
         };
-
         Chunk::new(columns)
     }
 
@@ -337,7 +336,7 @@ impl BatchOp for ProjectedSchema {
 mod tests {
     use datatypes::prelude::ScalarVector;
     use datatypes::type_id::LogicalTypeId;
-    use datatypes::vectors::{TimestampVector, VectorRef};
+    use datatypes::vectors::{TimestampMillisecondVector, VectorRef};
     use store_api::storage::OpType;
 
     use super::*;
@@ -398,7 +397,7 @@ mod tests {
         let expect_user = schema_util::new_schema_with_version(
             &[
                 ("v1", LogicalTypeId::Int64, true),
-                ("timestamp", LogicalTypeId::Timestamp, false),
+                ("timestamp", LogicalTypeId::TimestampMillisecond, false),
             ],
             Some(1),
             123,
@@ -524,7 +523,7 @@ mod tests {
         let filter = BooleanVector::from_slice(&[true, false, true]);
         let res = schema.filter(&batch, &filter).unwrap();
 
-        let expect: VectorRef = Arc::new(TimestampVector::from_values([1000, 3000]));
+        let expect: VectorRef = Arc::new(TimestampMillisecondVector::from_values([1000, 3000]));
         assert_eq!(expect, *res.column(0));
     }
 }
diff --git a/src/storage/src/schema/region.rs b/src/storage/src/schema/region.rs
index bfc046c868ed..b6c0ef2a4e96 100644
--- a/src/storage/src/schema/region.rs
+++ b/src/storage/src/schema/region.rs
@@ -32,7 +32,7 @@ use crate::schema::{StoreSchema, StoreSchemaRef};
 ///
 /// The user schema is the schema that only contains columns that user could visit,
 /// as well as what the schema user created.
-#[derive(Debug, PartialEq)]
+#[derive(Debug, PartialEq, Eq)]
 pub struct RegionSchema {
     /// Schema that only contains columns that user defined, excluding internal columns
     /// that are reserved and used by the storage engine.
@@ -162,7 +162,7 @@ mod tests {
         let expect_schema = schema_util::new_schema_with_version(
             &[
                 ("k0", LogicalTypeId::Int64, false),
-                ("timestamp", LogicalTypeId::Timestamp, false),
+                ("timestamp", LogicalTypeId::TimestampMillisecond, false),
                 ("v0", LogicalTypeId::Int64, true),
             ],
             Some(1),
diff --git a/src/storage/src/schema/store.rs b/src/storage/src/schema/store.rs
index 681059b256d8..691320e8bd53 100644
--- a/src/storage/src/schema/store.rs
+++ b/src/storage/src/schema/store.rs
@@ -12,15 +12,16 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
+use std::collections::HashMap;
 use std::sync::Arc;
 
 use common_error::prelude::*;
-use datatypes::arrow::array::Array;
-use datatypes::arrow::chunk::Chunk as ArrowChunk;
 use datatypes::arrow::datatypes::Schema as ArrowSchema;
-use datatypes::schema::{Metadata, Schema, SchemaBuilder, SchemaRef};
+use datatypes::arrow::record_batch::RecordBatch;
+use datatypes::schema::{Schema, SchemaBuilder, SchemaRef};
 use store_api::storage::consts;
 
+use crate::error::NewRecordBatchSnafu;
 use crate::metadata::{self, ColumnMetadata, ColumnsMetadata, Error, Result};
 use crate::read::Batch;
 
@@ -31,7 +32,7 @@ const USER_COLUMN_END_KEY: &str = "greptime:storage:user_column_end";
 ///
 /// Used internally, contains all row key columns, internal columns and a sub set of
 /// value columns in a region. The columns are organized in `key, value, internal` order.
-#[derive(Debug, PartialEq)]
+#[derive(Debug, PartialEq, Eq)]
 pub struct StoreSchema {
     columns: Vec<ColumnMetadata>,
     schema: SchemaRef,
@@ -57,10 +58,16 @@ impl StoreSchema {
         self.schema.arrow_schema()
     }
 
-    pub fn batch_to_arrow_chunk(&self, batch: &Batch) -> ArrowChunk<Arc<dyn Array>> {
-        assert_eq!(self.schema.num_columns(), batch.num_columns());
-
-        ArrowChunk::new(batch.columns().iter().map(|v| v.to_arrow_array()).collect())
+    pub fn batch_to_arrow_record_batch(
+        &self,
+        batch: &Batch,
+    ) -> std::result::Result<RecordBatch, crate::error::Error> {
+        assert_eq!(self.schema.num_columns(), batch.num_columns(),);
+        RecordBatch::try_new(
+            self.schema.arrow_schema().clone(),
+            batch.columns().iter().map(|v| v.to_arrow_array()).collect(),
+        )
+        .context(NewRecordBatchSnafu)
     }
 
     pub(crate) fn contains_column(&self, name: &str) -> bool {
@@ -181,10 +188,10 @@ impl StoreSchema {
     }
 }
 
-impl TryFrom<ArrowSchema> for StoreSchema {
+impl TryFrom<Arc<ArrowSchema>> for StoreSchema {
     type Error = Error;
 
-    fn try_from(arrow_schema: ArrowSchema) -> Result<StoreSchema> {
+    fn try_from(arrow_schema: Arc<ArrowSchema>) -> std::result::Result<StoreSchema, Self::Error> {
         let schema = Schema::try_from(arrow_schema).context(metadata::ConvertArrowSchemaSnafu)?;
         // Recover other metadata from schema.
         let row_key_end = parse_index_from_metadata(schema.metadata(), ROW_KEY_END_KEY)?;
@@ -216,7 +223,15 @@ impl TryFrom<Arc<ArrowSchema>> for StoreSchema {
     }
 }
 
-fn parse_index_from_metadata(metadata: &Metadata, key: &str) -> Result<usize> {
+impl TryFrom<ArrowSchema> for StoreSchema {
+    type Error = Error;
+
+    fn try_from(arrow_schema: ArrowSchema) -> std::result::Result<StoreSchema, Self::Error> {
+        StoreSchema::try_from(Arc::new(arrow_schema))
+    }
+}
+
+fn parse_index_from_metadata(metadata: &HashMap<String, String>, key: &str) -> Result<usize> {
     let value = metadata
         .get(key)
         .context(metadata::MetaNotFoundSnafu { key })?;
@@ -227,20 +242,17 @@ fn parse_index_from_metadata(metadata: &Metadata, key: &str) -> Result<usize> {
 
 #[cfg(test)]
 mod tests {
-    use datatypes::arrow::array::Array;
-    use datatypes::arrow::chunk::Chunk as ArrowChunk;
-
     use super::*;
     use crate::read::Batch;
     use crate::schema::tests;
     use crate::test_util::schema_util;
 
-    fn check_chunk_batch(chunk: &ArrowChunk<Arc<dyn Array>>, batch: &Batch) {
-        assert_eq!(5, chunk.columns().len());
-        assert_eq!(3, chunk.len());
+    fn check_chunk_batch(record_batch: &RecordBatch, batch: &Batch) {
+        assert_eq!(5, record_batch.num_columns());
+        assert_eq!(3, record_batch.num_rows());
 
         for i in 0..5 {
-            assert_eq!(chunk[i], batch.column(i).to_arrow_array());
+            assert_eq!(record_batch.column(i), &batch.column(i).to_arrow_array());
         }
     }
 
@@ -280,7 +292,7 @@ mod tests {
         // Test batch and chunk conversion.
         let batch = tests::new_batch();
         // Convert batch to chunk.
-        let chunk = store_schema.batch_to_arrow_chunk(&batch);
+        let chunk = store_schema.batch_to_arrow_record_batch(&batch).unwrap();
         check_chunk_batch(&chunk, &batch);
     }
 }
diff --git a/src/storage/src/sst.rs b/src/storage/src/sst.rs
index 273841809e16..afe1c10fdff7 100644
--- a/src/storage/src/sst.rs
+++ b/src/storage/src/sst.rs
@@ -259,7 +259,7 @@ impl AccessLayer for FsAccessLayer {
             opts.predicate.clone(),
         );
 
-        let stream = reader.chunk_stream(opts.batch_size).await?;
+        let stream = reader.chunk_stream().await?;
         Ok(Box::new(stream))
     }
 }
diff --git a/src/storage/src/sst/parquet.rs b/src/storage/src/sst/parquet.rs
index 1244582b69a4..5bde4ac4e4c0 100644
--- a/src/storage/src/sst/parquet.rs
+++ b/src/storage/src/sst/parquet.rs
@@ -18,28 +18,23 @@
 use std::collections::HashMap;
 use std::pin::Pin;
 use std::sync::Arc;
 
+use async_compat::CompatExt;
 use async_stream::try_stream;
 use async_trait::async_trait;
-use common_telemetry::debug;
-use datatypes::arrow::array::Array;
-use datatypes::arrow::chunk::Chunk;
-use datatypes::arrow::datatypes::{DataType, Schema};
-use datatypes::arrow::io::parquet::read::{
-    infer_schema, read_columns_many_async, read_metadata_async, RowGroupDeserializer,
-};
-use datatypes::arrow::io::parquet::write::{
-    Compression, Encoding, FileSink, Version, WriteOptions,
-};
-use futures::io::BufReader;
-use futures::AsyncWriteExt;
-use futures_util::sink::SinkExt;
-use futures_util::{try_join, Stream, TryStreamExt};
-use object_store::{ObjectStore, SeekableReader};
-use sluice::pipe;
+use datatypes::arrow::record_batch::RecordBatch;
+use futures_util::{Stream, StreamExt, TryStreamExt};
+use object_store::ObjectStore;
+use parquet::arrow::{ArrowWriter, ParquetRecordBatchStreamBuilder, ProjectionMask};
+use parquet::basic::{Compression, Encoding};
+use parquet::file::metadata::KeyValue;
+use parquet::file::properties::WriterProperties;
 use snafu::ResultExt;
 use table::predicate::Predicate;
+use tokio::io::BufReader;
 
-use crate::error::{self, Result};
+use crate::error::{
+    self, NewRecordBatchSnafu, ReadParquetSnafu, Result, WriteObjectSnafu, WriteParquetSnafu,
+};
 use crate::memtable::BoxedBatchIterator;
 use crate::read::{Batch, BatchReader};
 use crate::schema::compat::ReadAdapter;
@@ -51,6 +46,7 @@ pub struct ParquetWriter<'a> {
     file_path: &'a str,
     iter: BoxedBatchIterator,
     object_store: ObjectStore,
+    max_row_group_size: usize,
 }
 
 impl<'a> ParquetWriter<'a> {
@@ -63,6 +59,7 @@ impl<'a> ParquetWriter<'a> {
             file_path,
             iter,
             object_store,
+            max_row_group_size: 4096, // TODO(hl): make this configurable
         }
     }
 
@@ -76,122 +73,46 @@ impl<'a> ParquetWriter<'a> {
     async fn write_rows(self, extra_meta: Option<HashMap<String, String>>) -> Result<()> {
         let projected_schema = self.iter.schema();
         let store_schema = projected_schema.schema_to_read();
-        let schema = store_schema.arrow_schema();
+        let schema = store_schema.arrow_schema().clone();
         let object = self.object_store.object(self.file_path);
 
-        let (reader, mut writer) = pipe::pipe();
-
-        // now all physical types use plain encoding, maybe let caller to choose encoding for each type.
-        let encodings = get_encoding_for_schema(schema, |_| Encoding::Plain);
-        try_join!(
-            async {
-                // FIXME(hl): writer size is not used in fs backend so just leave it to 0,
-                // but in s3/azblob backend the Content-Length field of HTTP request is set
-                // to this value.
-                object
-                    .write_from(0, reader)
-                    .await
-                    .context(error::FlushIoSnafu)
-            },
-            async {
-                let mut sink = FileSink::try_new(
-                    &mut writer,
-                    // The file sink needs the `Schema` instead of a reference.
-                    (**schema).clone(),
-                    encodings,
-                    WriteOptions {
-                        write_statistics: true,
-                        compression: Compression::Gzip,
-                        version: Version::V2,
-                    },
-                )
-                .context(error::WriteParquetSnafu)?;
-
-                for batch in self.iter {
-                    let batch = batch?;
-                    sink.send(store_schema.batch_to_arrow_chunk(&batch))
-                        .await
-                        .context(error::WriteParquetSnafu)?;
-                }
-
-                if let Some(meta) = extra_meta {
-                    for (k, v) in meta {
-                        sink.metadata.insert(k, Some(v));
-                    }
-                }
-                sink.close().await.context(error::WriteParquetSnafu)?;
-                drop(sink);
-
-                writer
-                    .close()
-                    .await
-                    .map_err(|err| {
-                        object_store::Error::new(
-                            object_store::ErrorKind::Unexpected,
-                            "writer close failed",
-                        )
-                        .set_source(err)
-                    })
-                    .context(error::WriteObjectSnafu {
-                        path: self.file_path,
-                    })
-            }
-        )
-        .map(|_| ())
-    }
-}
-
-fn get_encoding_for_schema<F: Fn(&DataType) -> Encoding + Clone>(
-    schema: &Schema,
-    map: F,
-) -> Vec<Encoding> {
-    schema
-        .fields
-        .iter()
-        .flat_map(|f| transverse(&f.data_type, map.clone()))
-        .collect()
-}
-
-// TODO(hl): backport from arrow2 v0.12 (https://github.com/jorgecarleitao/arrow2/blob/f57dbd5dbc61b940a71decd5f81d0fd4c93b158d/src/io/parquet/write/mod.rs#L454-L509)
-// remove it when upgrade to newer version
-pub fn transverse<T, F: Fn(&DataType) -> T + Clone>(data_type: &DataType, map: F) -> Vec<T> {
-    let mut encodings = vec![];
-    transverse_recursive(data_type, map, &mut encodings);
-    encodings
-}
-
-fn transverse_recursive<T, F: Fn(&DataType) -> T + Clone>(
-    data_type: &DataType,
-    map: F,
-    encodings: &mut Vec<T>,
-) {
-    use datatypes::arrow::datatypes::PhysicalType::*;
-    match data_type.to_physical_type() {
-        Null | Boolean | Primitive(_) | Binary | FixedSizeBinary | LargeBinary | Utf8
-        | Dictionary(_) | LargeUtf8 => encodings.push(map(data_type)),
-        List | FixedSizeList | LargeList => {
-            let a = data_type.to_logical_type();
-            if let DataType::List(inner) = a {
-                transverse_recursive(&inner.data_type, map, encodings)
-            } else if let DataType::LargeList(inner) = a {
-                transverse_recursive(&inner.data_type, map, encodings)
-            } else if let DataType::FixedSizeList(inner, _) = a {
-                transverse_recursive(&inner.data_type, map, encodings)
-            } else {
-                unreachable!()
-            }
+        let writer_props = WriterProperties::builder()
+            .set_compression(Compression::ZSTD)
+            .set_encoding(Encoding::PLAIN)
+            .set_max_row_group_size(self.max_row_group_size)
+            .set_key_value_metadata(extra_meta.map(|map| {
+                map.iter()
+                    .map(|(k, v)| KeyValue::new(k.clone(), v.clone()))
+                    .collect::<Vec<KeyValue>>()
+            }))
+            .build();
+
+        // TODO(hl): Since OpenDAL's writer is async and ArrowWriter requires a `std::io::Write`,
+        // here we use a Vec to buffer all parquet bytes in memory and write to object store
+        // at a time. Maybe we should find a better way to bridge ArrowWriter and OpenDAL's object.
+        let mut buf = vec![];
+        let mut arrow_writer = ArrowWriter::try_new(&mut buf, schema.clone(), Some(writer_props))
+            .context(WriteParquetSnafu)?;
+        for batch in self.iter {
+            let batch = batch?;
+            let arrow_batch = RecordBatch::try_new(
+                schema.clone(),
+                batch
+                    .columns()
+                    .iter()
+                    .map(|v| v.to_arrow_array())
+                    .collect::<Vec<_>>(),
+            )
+            .context(NewRecordBatchSnafu)?;
+            arrow_writer
+                .write(&arrow_batch)
+                .context(WriteParquetSnafu)?;
         }
-        Struct => {
-            if let DataType::Struct(fields) = data_type.to_logical_type() {
-                for field in fields {
-                    transverse_recursive(&field.data_type, map.clone(), encodings)
-                }
-            } else {
-                unreachable!()
-            }
-        }
-        Union => todo!(),
-        Map => todo!(),
+        arrow_writer.close().context(WriteParquetSnafu)?;
+        object.write(buf).await.context(WriteObjectSnafu {
+            path: object.path(),
+        })?;
+        Ok(())
     }
 }
 
@@ -202,9 +123,6 @@ pub struct ParquetReader<'a> {
     predicate: Predicate,
 }
 
-type ReaderFactoryFuture<'a, R> =
-    Pin<Box<dyn Future<Output = Result<R>> + Send + 'a>>;
-
 impl<'a> ParquetReader<'a> {
     pub fn new(
         file_path: &str,
@@ -220,61 +138,48 @@ impl<'a> ParquetReader<'a> {
         }
     }
 
-    pub async fn chunk_stream(&self, chunk_size: usize) -> Result<ChunkStream> {
-        let file_path = self.file_path.to_string();
+    pub async fn chunk_stream(&self) -> Result<ChunkStream> {
         let operator = self.object_store.clone();
 
-        let reader_factory = move || -> ReaderFactoryFuture<SeekableReader> {
-            let file_path = file_path.clone();
-            let operator = operator.clone();
-            Box::pin(async move { Ok(operator.object(&file_path).seekable_reader(..)) })
-        };
-
-        let file_path = self.file_path.to_string();
-        let reader = reader_factory()
+        let reader = operator.object(self.file_path).seekable_reader(..).compat();
+        let buf_reader = BufReader::new(reader);
+        let builder = ParquetRecordBatchStreamBuilder::new(buf_reader)
             .await
-            .context(error::ReadParquetIoSnafu { file: &file_path })?;
-        // Use BufReader to alleviate consumption bring by random seek and small IO.
-        let mut buf_reader = BufReader::new(reader);
-        let metadata = read_metadata_async(&mut buf_reader)
-            .await
-            .context(error::ReadParquetSnafu { file: &file_path })?;
-
-        let arrow_schema =
-            infer_schema(&metadata).context(error::ReadParquetSnafu { file: &file_path })?;
-        let store_schema = Arc::new(
-            StoreSchema::try_from(arrow_schema)
-                .context(error::ConvertStoreSchemaSnafu { file: &file_path })?,
-        );
+            .context(ReadParquetSnafu {
+                file: self.file_path,
+            })?;
+        let arrow_schema = builder.schema().clone();
+
+        let store_schema = Arc::new(StoreSchema::try_from(arrow_schema).context(
+            error::ConvertStoreSchemaSnafu {
+                file: self.file_path,
+            },
+        )?);
 
         let adapter = ReadAdapter::new(store_schema.clone(), self.projected_schema.clone())?;
 
-        let pruned_row_groups = self
-            .predicate
-            .prune_row_groups(store_schema.schema().clone(), &metadata.row_groups);
+        let pruned_row_groups = self.predicate.prune_row_groups(
+            store_schema.schema().clone(),
+            builder.metadata().row_groups(),
+        );
 
-        let projected_fields = adapter.fields_to_read();
-        let chunk_stream = try_stream!({
-            for (idx, valid) in pruned_row_groups.iter().enumerate() {
-                if !valid {
-                    debug!("Pruned {} row groups", idx);
-                    continue;
-                }
+        let projection = ProjectionMask::roots(
+            builder.metadata().file_metadata().schema_descr(),
+            adapter.fields_to_read(),
+        );
 
-                let rg = &metadata.row_groups[idx];
-                let column_chunks = read_columns_many_async(
-                    &reader_factory,
-                    rg,
-                    projected_fields.clone(),
-                    Some(chunk_size),
-                )
-                .await
-                .context(error::ReadParquetSnafu { file: &file_path })?;
+        let mut masked_stream = builder
+            .with_projection(projection)
+            .build()
+            .context(ReadParquetSnafu {
+                file: self.file_path,
+            })?
+            .zip(futures_util::stream::iter(pruned_row_groups.into_iter()));
 
-                let chunks = RowGroupDeserializer::new(column_chunks, rg.num_rows() as usize, None);
-                for maybe_chunk in chunks {
-                    let columns_in_chunk =
-                        maybe_chunk.context(error::ReadParquetSnafu { file: &file_path })?;
-                    yield columns_in_chunk;
+        let file_name = self.file_path.to_string();
+        let chunk_stream = try_stream!({
+            while let Some((record_batch, valid)) = masked_stream.next().await {
+                if valid {
+                    yield record_batch.context(ReadParquetSnafu { file: &file_name })?
                 }
             }
        });
@@ -283,7 +188,7 @@ impl<'a> ParquetReader<'a> {
     }
 }
 
-pub type SendableChunkStream = Pin<Box<dyn Stream<Item = Result<Chunk<Arc<dyn Array>>>> + Send>>;
+pub type SendableChunkStream = Pin<Box<dyn Stream<Item = Result<RecordBatch>> + Send>>;
 
 pub struct ChunkStream {
     adapter: ReadAdapter,
@@ -302,7 +207,7 @@ impl BatchReader for ChunkStream {
         self.stream
             .try_next()
             .await?
-            .map(|chunk| self.adapter.arrow_chunk_to_batch(&chunk))
+            .map(|rb| self.adapter.arrow_record_batch_to_batch(&rb))
             .transpose()
     }
 }
@@ -311,10 +216,9 @@ impl BatchReader for ChunkStream {
 mod tests {
     use std::sync::Arc;
 
-    use datatypes::arrow::array::{Array, UInt64Array, UInt8Array};
-    use datatypes::arrow::io::parquet::read::FileReader;
-    use datatypes::prelude::{ScalarVector, Vector};
-    use datatypes::vectors::TimestampVector;
+    use datatypes::arrow::array::{Array, ArrayRef, UInt64Array, UInt8Array};
+    use datatypes::prelude::Vector;
+    use datatypes::vectors::TimestampMillisecondVector;
     use object_store::backend::fs::Builder;
     use store_api::storage::OpType;
     use tempdir::TempDir;
@@ -323,6 +227,7 @@ mod tests {
     use crate::memtable::{
         tests as memtable_tests, DefaultMemtableBuilder, IterContext, MemtableBuilder,
     };
+    use crate::schema::ProjectedSchema;
 
     #[tokio::test]
     async fn test_parquet_writer() {
@@ -357,7 +262,7 @@ mod tests {
         let object_store = ObjectStore::new(backend);
         let sst_file_name = "test-flush.parquet";
         let iter = memtable.iter(&IterContext::default()).unwrap();
-        let writer = ParquetWriter::new(sst_file_name, iter, object_store);
+        let writer = ParquetWriter::new(sst_file_name, iter, object_store.clone());
 
         writer
             .write_sst(&sst::WriteOptions::default())
             .await
             .unwrap();
 
         // verify parquet file
+        let reader = BufReader::new(
+            object_store
+                .object(sst_file_name)
+                .seekable_reader(..)
+                .compat(),
+        );
-        let reader = std::fs::File::open(dir.path().join(sst_file_name)).unwrap();
-        let mut file_reader = FileReader::try_new(reader, None, Some(128), None, None).unwrap();
+        let builder = ParquetRecordBatchStreamBuilder::new(reader).await.unwrap();
+        let mut stream = builder.build().unwrap();
 
         // chunk schema: timestamp, __version, v1, __sequence, __op_type
-        let chunk = file_reader.next().unwrap().unwrap();
-        assert_eq!(6, chunk.arrays().len());
+        let chunk = stream.next().await.unwrap().unwrap();
+        assert_eq!(6, chunk.columns().len());
 
         // timestamp
         assert_eq!(
-            TimestampVector::from_slice(&[
+            &TimestampMillisecondVector::from_slice(&[
                 1000.into(),
                 1000.into(),
                 1001.into(),
                 2002.into(),
                 2003.into(),
                 2003.into()
             ])
             .to_arrow_array(),
-            chunk.arrays()[0]
+            chunk.column(0)
        );
@@ -384,39 +295,107 @@ mod tests {
         // version
         assert_eq!(
-            Arc::new(UInt64Array::from_slice(&[1, 2, 1, 1, 1, 5])) as Arc<dyn Array>,
-            chunk.arrays()[1]
+            &(Arc::new(UInt64Array::from(vec![1, 2, 1, 1, 1, 5])) as ArrayRef),
+            chunk.column(1)
         );
         // v0
         assert_eq!(
-            Arc::new(UInt64Array::from_slice(&[1, 2, 3, 7, 8, 9])) as Arc<dyn Array>,
-            chunk.arrays()[2]
+            &(Arc::new(UInt64Array::from(vec![1, 2, 3, 7, 8, 9])) as Arc<dyn Array>),
+            chunk.column(2)
         );
         // v1
         assert_eq!(
-            Arc::new(UInt64Array::from_slice(&[
-                1234, 1234, 1234, 1234, 1234, 1234
-            ])) as Arc<dyn Array>,
-            chunk.arrays()[3]
+            &(Arc::new(UInt64Array::from(vec![1234, 1234, 1234, 1234, 1234, 1234]))
+                as Arc<dyn Array>),
+            chunk.column(3)
         );
         // sequence
         assert_eq!(
-            Arc::new(UInt64Array::from_slice(&[10, 10, 10, 10, 10, 10])) as Arc<dyn Array>,
-            chunk.arrays()[4]
+            &(Arc::new(UInt64Array::from(vec![10, 10, 10, 10, 10, 10])) as Arc<dyn Array>),
+            chunk.column(4)
         );
         // op_type
         assert_eq!(
-            Arc::new(UInt8Array::from_slice(&[0, 0, 0, 0, 0, 0])) as Arc<dyn Array>,
-            chunk.arrays()[5]
+            &(Arc::new(UInt8Array::from(vec![0, 0, 0, 0, 0, 0])) as Arc<dyn Array>),
+            chunk.column(5)
+        );
+    }
+
+    #[tokio::test]
+    async fn test_parquet_reader() {
+        common_telemetry::init_default_ut_logging();
+        let schema = memtable_tests::schema_for_test();
+        let memtable = DefaultMemtableBuilder::default().build(schema.clone());
+
+        memtable_tests::write_kvs(
+            &*memtable,
+            10, // sequence
+            OpType::Put,
+            &[
+                (1000, 1),
+                (1000, 2),
+                (2002, 1),
+                (2003, 1),
+                (2003, 5),
+                (1001, 1),
+            ], // keys
+            &[
+                (Some(1), Some(1234)),
+                (Some(2), Some(1234)),
+                (Some(7), Some(1234)),
+                (Some(8), Some(1234)),
+                (Some(9), Some(1234)),
+                (Some(3), Some(1234)),
+            ], // values
+        );
+
+        let dir = TempDir::new("write_parquet").unwrap();
+        let path = dir.path().to_str().unwrap();
+        let backend = Builder::default().root(path).build().unwrap();
+        let object_store = ObjectStore::new(backend);
+        let sst_file_name = "test-read.parquet";
+        let iter = memtable.iter(&IterContext::default()).unwrap();
+        let writer = ParquetWriter::new(sst_file_name, iter, object_store.clone());
+
+        writer
+            .write_sst(&sst::WriteOptions::default())
+            .await
+            .unwrap();
+
+        let operator = ObjectStore::new(
+            object_store::backend::fs::Builder::default()
+                .root(dir.path().to_str().unwrap())
+                .build()
+                .unwrap(),
+        );
+
+        let projected_schema = Arc::new(ProjectedSchema::new(schema, Some(vec![1])).unwrap());
+        let reader = ParquetReader::new(
+            "test-read.parquet",
+            operator,
+            projected_schema,
+            Predicate::empty(),
+        );
+
+        let mut stream = reader.chunk_stream().await.unwrap();
+        assert_eq!(
+            6,
+            stream
+                .next_batch()
+                .await
+                .transpose()
+                .unwrap()
+                .unwrap()
+                .num_rows()
+        );
     }
 }
diff --git a/src/storage/src/test_util/descriptor_util.rs b/src/storage/src/test_util/descriptor_util.rs
index 50c8c2613e15..10d682745b06 100644
--- a/src/storage/src/test_util/descriptor_util.rs
+++ b/src/storage/src/test_util/descriptor_util.rs
@@ -37,7 +37,7 @@ impl RegionDescBuilder {
             ColumnDescriptorBuilder::new(
                 1,
                 test_util::TIMESTAMP_NAME,
-                ConcreteDataType::timestamp_millis_datatype(),
+                ConcreteDataType::timestamp_millisecond_datatype(),
             )
             .is_nullable(false)
             .is_time_index(true)
diff --git a/src/storage/src/test_util/read_util.rs b/src/storage/src/test_util/read_util.rs
index 1b62611dc610..fe231de8ae93 100644
--- a/src/storage/src/test_util/read_util.rs
+++ b/src/storage/src/test_util/read_util.rs
@@ -15,9 +15,9 @@
 use std::sync::Arc;
 
 use async_trait::async_trait;
-use datatypes::prelude::ScalarVector;
+use datatypes::prelude::{ScalarVector, WrapperType};
 use datatypes::type_id::LogicalTypeId;
-use datatypes::vectors::{Int64Vector, TimestampVector, UInt64Vector, UInt8Vector};
+use datatypes::vectors::{Int64Vector, TimestampMillisecondVector, UInt64Vector, UInt8Vector};
 use store_api::storage::OpType;
 
 use crate::error::Result;
@@ -45,8 +45,12 @@ pub fn new_projected_schema() -> ProjectedSchemaRef {
 
 /// Build a new batch, with 0 sequence and op_type.
 pub fn new_kv_batch(key_values: &[(i64, Option<i64>)]) -> Batch {
-    let key = Arc::new(TimestampVector::from_values(key_values.iter().map(|v| v.0)));
-    let value = Arc::new(Int64Vector::from_iter(key_values.iter().map(|v| v.1)));
+    let key = Arc::new(TimestampMillisecondVector::from_values(
+        key_values.iter().map(|v| v.0),
+    ));
+    let value = Arc::new(Int64Vector::from(
+        key_values.iter().map(|v| v.1).collect::<Vec<_>>(),
+    ));
     let sequences = Arc::new(UInt64Vector::from_vec(vec![0; key_values.len()]));
     let op_types = Arc::new(UInt8Vector::from_vec(vec![0; key_values.len()]));
 
@@ -55,7 +59,9 @@ pub fn new_kv_batch(key_values: &[(i64, Option<i64>)]) -> Batch {
 
 /// Build a new batch from (key, value, sequence, op_type)
 pub fn new_full_kv_batch(all_values: &[(i64, i64, u64, OpType)]) -> Batch {
-    let key = Arc::new(TimestampVector::from_values(all_values.iter().map(|v| v.0)));
+    let key = Arc::new(TimestampMillisecondVector::from_values(
+        all_values.iter().map(|v| v.0),
+    ));
     let value = Arc::new(Int64Vector::from_values(all_values.iter().map(|v| v.1)));
     let sequences = Arc::new(UInt64Vector::from_values(all_values.iter().map(|v| v.2)));
     let op_types = Arc::new(UInt8Vector::from_values(
@@ -70,7 +76,7 @@ fn check_kv_batch(batches: &[Batch], expect: &[&[(i64, Option<i64>)]]) {
         let key = batch
             .column(0)
             .as_any()
-            .downcast_ref::<TimestampVector>()
+            .downcast_ref::<TimestampMillisecondVector>()
            .unwrap();
         let value = batch
             .column(1)
@@ -79,7 +85,7 @@ fn check_kv_batch(batches: &[Batch], expect: &[&[(i64, Option<i64>)]]) {
 
         for (i, (k, v)) in key_values.iter().enumerate() {
-            assert_eq!(key.get_data(i).unwrap().value(), *k);
+            assert_eq!(key.get_data(i).unwrap().into_native(), *k);
             assert_eq!(value.get_data(i), *v,);
         }
     }
@@ -92,7 +98,7 @@ pub async fn collect_kv_batch(reader: &mut dyn BatchReader) -> Vec<(i64, Option<i64>)> {
         let key = batch
             .column(0)
             .as_any()
-            .downcast_ref::<TimestampVector>()
+            .downcast_ref::<TimestampMillisecondVector>()
             .unwrap();
         let value = batch
             .column(1)
@@ -101,7 +107,7 @@ pub async fn collect_kv_batch(reader: &mut dyn BatchReader) -> Vec<(i64, Option<i64>)> {
 
         for (k, v) in key.iter_data().zip(value.iter_data()) {
-            result.push((k.unwrap().value(), v));
+            result.push((k.unwrap().into(), v));
         }
     }
diff --git a/src/storage/src/write_batch.rs b/src/storage/src/write_batch.rs
index 10d66bcd25b1..7f8768298fbf 100644
--- a/src/storage/src/write_batch.rs
+++ b/src/storage/src/write_batch.rs
@@ -26,7 +26,7 @@ use datatypes::arrow::error::ArrowError;
 use datatypes::data_type::ConcreteDataType;
 use datatypes::prelude::{ScalarVector, Value};
 use datatypes::schema::{ColumnSchema, SchemaRef};
-use datatypes::vectors::{Int64Vector, TimestampVector, VectorRef};
+use datatypes::vectors::{Int64Vector, TimestampMillisecondVector, VectorRef};
 use prost::{DecodeError, EncodeError};
 use snafu::{ensure, OptionExt, ResultExt};
 use store_api::storage::{consts, PutOperation, WriteRequest};
@@ -116,9 +116,6 @@ pub enum Error {
         source: datatypes::error::Error,
     },
 
-    #[snafu(display("Failed to decode, in stream waiting state"))]
-    StreamWaiting { backtrace: Backtrace },
-
     #[snafu(display("Failed to decode, corrupted data {}", message))]
     DataCorrupted {
         message: String,
@@ -230,11 +227,13 @@ impl WriteRequest for WriteBatch {
             } else {
                 match column.data_type() {
                     ConcreteDataType::Timestamp(_) => {
-                        let ts_vector =
-                            column.as_any().downcast_ref::<TimestampVector>().unwrap();
+                        let ts_vector = column
+                            .as_any()
+                            .downcast_ref::<TimestampMillisecondVector>()
+                            .unwrap();
                         for ts in ts_vector.iter_data().flatten() {
-                            let aligned = align_timestamp(ts.value(), durations_millis)
-                                .context(TimestampOverflowSnafu { ts: ts.value() })?;
+                            let aligned = align_timestamp(ts.into(), durations_millis)
+                                .context(TimestampOverflowSnafu { ts: i64::from(ts) })?;
 
                             aligned_timestamps.insert(aligned);
                         }
                    }
@@ -505,9 +504,9 @@ pub mod codec {
     use std::io::Cursor;
     use std::sync::Arc;
 
-    use datatypes::arrow::chunk::Chunk as ArrowChunk;
-    use datatypes::arrow::io::ipc::read::{self, StreamReader, StreamState};
-    use datatypes::arrow::io::ipc::write::{StreamWriter, WriteOptions};
+    use datatypes::arrow::ipc::reader::StreamReader;
+    use datatypes::arrow::ipc::writer::{IpcWriteOptions, StreamWriter};
+    use datatypes::arrow::record_batch::RecordBatch;
     use datatypes::schema::{Schema, SchemaRef};
     use datatypes::vectors::Helper;
     use prost::Message;
@@ -520,8 +519,8 @@ pub mod codec {
     use crate::write_batch::{
         DataCorruptedSnafu, DecodeArrowSnafu, DecodeProtobufSnafu, DecodeVectorSnafu,
         EncodeArrowSnafu, EncodeProtobufSnafu, Error as WriteBatchError, FromProtobufSnafu,
-        MissingColumnSnafu, Mutation, ParseSchemaSnafu, PutData, Result, StreamWaitingSnafu,
-        ToProtobufSnafu, WriteBatch,
+        MissingColumnSnafu, Mutation, ParseSchemaSnafu, PutData, Result, ToProtobufSnafu,
+        WriteBatch,
     };
 
     // TODO(jiachun): We can make a comparison with protobuf, including performance, storage cost,
@@ -543,12 +542,12 @@ pub mod codec {
             let item_schema = item.schema();
             let arrow_schema = item_schema.arrow_schema();
 
-            let opts = WriteOptions { compression: None };
-            let mut writer = StreamWriter::new(dst, opts);
-            writer.start(arrow_schema, None).context(EncodeArrowSnafu)?;
+            let opts = IpcWriteOptions::default();
+            let mut writer = StreamWriter::try_new_with_options(dst, arrow_schema, opts)
+                .context(EncodeArrowSnafu)?;
 
             for mutation in item.iter() {
-                let chunk = match mutation {
+                let rb = match mutation {
                     Mutation::Put(put) => {
                         let arrays = item_schema
                             .column_schemas()
                             .iter()
@@ -563,15 +562,13 @@ pub mod codec {
                            })
                             .collect::<Result<Vec<_>>>()?;
 
-                        ArrowChunk::try_new(arrays).context(EncodeArrowSnafu)?
+                        RecordBatch::try_new(arrow_schema.clone(), arrays)
+                            .context(EncodeArrowSnafu)?
                     }
                 };
-
-                writer.write(&chunk, None).context(EncodeArrowSnafu)?;
+                writer.write(&rb).context(EncodeArrowSnafu)?;
             }
-
             writer.finish().context(EncodeArrowSnafu)?;
-
             Ok(())
         }
     }
@@ -591,20 +588,14 @@ pub mod codec {
         type Error = WriteBatchError;
 
         fn decode(&self, src: &[u8]) -> Result<WriteBatch> {
-            let mut reader = Cursor::new(src);
-            let metadata = read::read_stream_metadata(&mut reader).context(DecodeArrowSnafu)?;
-            let mut reader = StreamReader::new(reader, metadata);
-            let arrow_schema = reader.metadata().schema.clone();
-
+            let reader = Cursor::new(src);
+            let mut reader = StreamReader::try_new(reader, None).context(DecodeArrowSnafu)?;
+            let arrow_schema = reader.schema();
             let mut chunks = Vec::with_capacity(self.mutation_types.len());
 
-            for stream_state in reader.by_ref() {
-                let stream_state = stream_state.context(DecodeArrowSnafu)?;
-                let chunk = match stream_state {
-                    StreamState::Some(chunk) => chunk,
-                    StreamState::Waiting => return StreamWaitingSnafu {}.fail(),
-                };
-                chunks.push(chunk);
+            for maybe_record_batch in reader.by_ref() {
+                let record_batch = maybe_record_batch.context(DecodeArrowSnafu)?;
+                chunks.push(record_batch);
             }
 
             // check if exactly finished
@@ -629,12 +620,15 @@ pub mod codec {
             let schema = Arc::new(Schema::try_from(arrow_schema).context(ParseSchemaSnafu)?);
             let mut write_batch = WriteBatch::new(schema.clone());
 
-            for (mutation_type, chunk) in self.mutation_types.iter().zip(chunks.into_iter()) {
+            for (mutation_type, record_batch) in self.mutation_types.iter().zip(chunks.into_iter())
+            {
                 match MutationType::from_i32(*mutation_type) {
                     Some(MutationType::Put) => {
                         let mut put_data = PutData::with_num_columns(schema.num_columns());
-                        for (column_schema, array) in
-                            schema.column_schemas().iter().zip(chunk.arrays().iter())
+                        for (column_schema, array) in schema
+                            .column_schemas()
+                            .iter()
+                            .zip(record_batch.columns().iter())
                         {
                             let vector =
                                 Helper::try_into_vector(array).context(DecodeVectorSnafu)?;
@@ -654,7 +648,6 @@ pub mod codec {
                     }
                 }
             }
-
             Ok(write_batch)
         }
     }
@@ -787,7 +780,8 @@ mod tests {
 
     use datatypes::type_id::LogicalTypeId;
     use datatypes::vectors::{
-        BooleanVector, ConstantVector, Int32Vector, Int64Vector, UInt64Vector,
+        BooleanVector, ConstantVector, Int32Vector, Int64Vector, TimestampMillisecondVector,
+        UInt64Vector,
     };
 
     use super::*;
@@ -835,7 +829,7 @@ mod tests {
         &[
             ("k1", LogicalTypeId::UInt64, false),
             (consts::VERSION_COLUMN_NAME, LogicalTypeId::UInt64, false),
-            ("ts", LogicalTypeId::Timestamp, false),
+            ("ts", LogicalTypeId::TimestampMillisecond, false),
             ("v1", LogicalTypeId::Boolean, true),
         ],
         Some(2),
@@ -846,7 +840,7 @@ mod tests {
     fn test_write_batch_put() {
         let intv = Arc::new(UInt64Vector::from_slice(&[1, 2, 3]));
         let boolv = Arc::new(BooleanVector::from(vec![true, false, true]));
-        let tsv = Arc::new(TimestampVector::from_vec(vec![0, 0, 0]));
+        let tsv = Arc::new(TimestampMillisecondVector::from_vec(vec![0, 0, 0]));
 
         let mut put_data = PutData::new();
         put_data.add_key_column("k1", intv.clone()).unwrap();
@@ -872,8 +866,8 @@ mod tests {
 
     #[test]
     fn test_write_batch_too_large() {
-        let boolv = Arc::new(BooleanVector::from_iter(
-            iter::repeat(Some(true)).take(MAX_BATCH_SIZE + 1),
+        let boolv = Arc::new(BooleanVector::from_iterator(
+            iter::repeat(true).take(MAX_BATCH_SIZE + 1),
         ));
 
         let mut put_data = PutData::new();
@@ -922,7 +916,7 @@ mod tests {
 
     #[test]
     fn test_put_type_has_null() {
-        let intv = Arc::new(UInt64Vector::from_iter(&[Some(1), None, Some(3)]));
+        let intv = Arc::new(UInt64Vector::from(vec![Some(1), None, Some(3)]));
         let tsv = Arc::new(Int64Vector::from_vec(vec![0, 0, 0]));
 
         let mut put_data = PutData::new();
@@ -950,7 +944,7 @@ mod tests {
     #[test]
     fn test_put_unknown_column() {
         let intv = Arc::new(UInt64Vector::from_slice(&[1, 2, 3]));
-        let tsv = Arc::new(TimestampVector::from_vec(vec![0, 0, 0]));
+        let tsv = Arc::new(TimestampMillisecondVector::from_vec(vec![0, 0, 0]));
         let boolv = Arc::new(BooleanVector::from(vec![true, false, true]));
 
         let mut put_data = PutData::new();
@@ -990,7 +984,9 @@ mod tests {
     #[test]
     pub fn test_write_batch_time_range() {
         let intv = Arc::new(UInt64Vector::from_slice(&[1, 2, 3, 4, 5, 6]));
-        let tsv = Arc::new(TimestampVector::from_vec(vec![-21, -20, -1, 0, 1, 20]));
+        let tsv = Arc::new(TimestampMillisecondVector::from_vec(vec![
+            -21, -20, -1, 0, 1, 20,
+        ]));
         let boolv = Arc::new(BooleanVector::from(vec![
             true, false, true, false, false, false,
         ]));
@@ -1018,7 +1014,7 @@ mod tests {
     pub fn test_write_batch_time_range_const_vector() {
         let intv = Arc::new(UInt64Vector::from_slice(&[1, 2, 3, 4, 5, 6]));
         let tsv = Arc::new(ConstantVector::new(
-            Arc::new(TimestampVector::from_vec(vec![20])),
+            Arc::new(TimestampMillisecondVector::from_vec(vec![20])),
             6,
         ));
         let boolv = Arc::new(BooleanVector::from(vec![
@@ -1049,7 +1045,7 @@ mod tests {
         for i in 0..10 {
             let intv = Arc::new(UInt64Vector::from_slice(&[1, 2, 3]));
             let boolv = Arc::new(BooleanVector::from(vec![Some(true), Some(false), None]));
-            let tsv = Arc::new(TimestampVector::from_vec(vec![i, i, i]));
+            let tsv = Arc::new(TimestampMillisecondVector::from_vec(vec![i, i, i]));
 
             let mut put_data = PutData::new();
             put_data.add_key_column("k1", intv.clone()).unwrap();
@@ -1103,7 +1099,7 @@ mod tests {
         let mut batch = new_test_batch();
         for _ in 0..10 {
             let intv = Arc::new(UInt64Vector::from_slice(&[1, 2, 3]));
-            let tsv = Arc::new(TimestampVector::from_vec(vec![0, 0, 0]));
+            let tsv = Arc::new(TimestampMillisecondVector::from_vec(vec![0, 0, 0]));
 
             let mut put_data = PutData::new();
             put_data.add_key_column("k1", intv.clone()).unwrap();
diff --git a/src/storage/src/write_batch/compat.rs b/src/storage/src/write_batch/compat.rs
index dcd9d155c33b..ce45ffc1db4c 100644
--- a/src/storage/src/write_batch/compat.rs
+++ b/src/storage/src/write_batch/compat.rs
@@ -99,7 +99,7 @@ mod tests {
 
     use datatypes::data_type::ConcreteDataType;
     use datatypes::schema::{ColumnDefaultConstraint, SchemaBuilder};
-    use datatypes::vectors::{Int32Vector, TimestampVector};
+    use datatypes::vectors::{Int32Vector, TimestampMillisecondVector};
     use store_api::storage::{PutOperation, WriteRequest};
 
     use super::*;
@@ -110,8 +110,12 @@ mod tests {
     ) -> SchemaBuilder {
         let mut column_schemas = vec![
             ColumnSchema::new("k0", ConcreteDataType::int32_datatype(), false),
-            ColumnSchema::new("ts", ConcreteDataType::timestamp_millis_datatype(), false)
-                .with_time_index(true),
+            ColumnSchema::new(
+                "ts",
+                ConcreteDataType::timestamp_millisecond_datatype(),
+                false,
+            )
+            .with_time_index(true),
         ];
 
         if let Some(v0_constraint) = v0_constraint {
@@ -134,7 +138,7 @@ mod tests {
     fn new_put_data() -> PutData {
         let mut put_data = PutData::new();
         let k0 = Arc::new(Int32Vector::from_slice(&[1, 2, 3]));
-        let ts = Arc::new(TimestampVector::from_values([11, 12, 13]));
+        let ts = Arc::new(TimestampMillisecondVector::from_values([11, 12, 13]));
 
         put_data.add_key_column("k0", k0).unwrap();
         put_data.add_key_column("ts", ts).unwrap();
diff --git a/src/store-api/src/storage/chunk.rs b/src/store-api/src/storage/chunk.rs
index ca7a8e1736c7..32fedc2df1dc 100644
--- a/src/store-api/src/storage/chunk.rs
@@ -19,6 +19,7 @@ use datatypes::vectors::VectorRef;
 use crate::storage::SchemaRef;
 
 /// A bunch of rows in columnar format.
+#[derive(Debug)]
 pub struct Chunk {
     pub columns: Vec<VectorRef>,
     // TODO(yingwen): Sequences.
diff --git a/src/store-api/src/storage/descriptors.rs b/src/store-api/src/storage/descriptors.rs
index 994223928697..b31159b1bddd 100644
--- a/src/store-api/src/storage/descriptors.rs
+++ b/src/store-api/src/storage/descriptors.rs
@@ -27,7 +27,7 @@ pub type RegionId = u64;
 pub type RegionNumber = u32;
 
 /// A [ColumnDescriptor] contains information to create a column.
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Builder)]
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Builder)]
 #[builder(pattern = "owned", build_fn(validate = "Self::validate"))]
 pub struct ColumnDescriptor {
     pub id: ColumnId,
@@ -107,7 +107,7 @@ impl ColumnDescriptorBuilder {
 }
 
 /// A [RowKeyDescriptor] contains information about row key.
-#[derive(Debug, Clone, PartialEq, Builder)]
+#[derive(Debug, Clone, PartialEq, Eq, Builder)]
 #[builder(pattern = "owned")]
 pub struct RowKeyDescriptor {
     #[builder(default, setter(each(name = "push_column")))]
@@ -122,7 +122,7 @@ pub struct RowKeyDescriptor {
 }
 
 /// A [ColumnFamilyDescriptor] contains information to create a column family.
-#[derive(Debug, Clone, PartialEq, Builder)]
+#[derive(Debug, Clone, PartialEq, Eq, Builder)]
 #[builder(pattern = "owned")]
 pub struct ColumnFamilyDescriptor {
     #[builder(default = "consts::DEFAULT_CF_ID")]
@@ -135,7 +135,7 @@ pub struct ColumnFamilyDescriptor {
 }
 
 /// A [RegionDescriptor] contains information to create a region.
-#[derive(Debug, Clone, PartialEq, Builder)]
+#[derive(Debug, Clone, PartialEq, Eq, Builder)]
 #[builder(pattern = "owned")]
 pub struct RegionDescriptor {
     pub id: RegionId,
diff --git a/src/table/Cargo.toml b/src/table/Cargo.toml
index 8e7cebb40d68..59d9a901bd8a 100644
--- a/src/table/Cargo.toml
+++ b/src/table/Cargo.toml
@@ -12,10 +12,9 @@ common-error = { path = "../common/error" }
 common-query = { path = "../common/query" }
 common-recordbatch = { path = "../common/recordbatch" }
 common-telemetry = { path = "../common/telemetry" }
-datafusion = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2", features = [
-    "simd",
-] }
-datafusion-common = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" }
+datafusion = "14.0.0"
+datafusion-common = "14.0.0"
+datafusion-expr = "14.0.0"
 datatypes = { path = "../datatypes" }
 derive_builder = "0.11"
 futures = "0.3"
@@ -27,6 +26,7 @@ store-api = { path = "../store-api" }
 tokio = { version = "1.18", features = ["full"] }
 
 [dev-dependencies]
-datafusion-expr = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" }
+datafusion-expr = "14.0.0"
 tempdir = "0.3"
 tokio-util = { version = "0.7", features = ["compat"] }
+parquet = { version = "26", features = ["async"] }
diff --git a/src/table/src/error.rs b/src/table/src/error.rs
index ed18c471ce0c..3605ab0a1ae5 100644
--- a/src/table/src/error.rs
+++ b/src/table/src/error.rs
@@ -152,7 +152,9 @@ impl From<InnerError> for DataFusionError {
 
 impl From<InnerError> for RecordBatchError {
     fn from(e: InnerError) -> RecordBatchError {
-        RecordBatchError::new(e)
+        RecordBatchError::External {
+            source: BoxedError::new(e),
+        }
     }
 }
 
@@ -173,7 +175,7 @@ mod tests {
     }
 
     fn throw_arrow() -> Result<()> {
-        Err(ArrowError::Overflow).context(PollStreamSnafu)?
+        Err(ArrowError::ComputeError("Overflow".to_string())).context(PollStreamSnafu)?
     }
 
     #[test]
diff --git a/src/table/src/metadata.rs b/src/table/src/metadata.rs
index 2e0f72235250..e481acbc7ec9 100644
--- a/src/table/src/metadata.rs
+++ b/src/table/src/metadata.rs
@@ -68,7 +68,7 @@ pub struct TableIdent {
     pub version: TableVersion,
 }
 
-#[derive(Clone, Debug, Builder, PartialEq)]
+#[derive(Clone, Debug, Builder, PartialEq, Eq)]
 #[builder(pattern = "mutable")]
 pub struct TableMeta {
     pub schema: SchemaRef,
@@ -322,7 +322,7 @@ impl TableMeta {
     }
 }
 
-#[derive(Clone, Debug, PartialEq, Builder)]
+#[derive(Clone, Debug, PartialEq, Eq, Builder)]
 #[builder(pattern = "owned")]
 pub struct TableInfo {
     /// Id and version of the table.
@@ -383,7 +383,7 @@ impl From for TableIdent {
 }
 
 /// Struct used to serialize and deserialize [`TableMeta`].
-#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)]
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
 pub struct RawTableMeta {
     pub schema: RawSchema,
     pub primary_key_indices: Vec<usize>,
@@ -431,7 +431,7 @@ impl TryFrom<RawTableMeta> for TableMeta {
 }
 
 /// Struct used to serialize and deserialize [`TableInfo`].
-#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)]
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
 pub struct RawTableInfo {
     pub ident: TableIdent,
     pub name: String,
@@ -483,8 +483,12 @@ mod tests {
     fn new_test_schema() -> Schema {
         let column_schemas = vec![
             ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true),
-            ColumnSchema::new("ts", ConcreteDataType::timestamp_millis_datatype(), false)
-                .with_time_index(true),
+            ColumnSchema::new(
+                "ts",
+                ConcreteDataType::timestamp_millisecond_datatype(),
+                false,
+            )
+            .with_time_index(true),
             ColumnSchema::new("col2", ConcreteDataType::int32_datatype(), true),
         ];
         SchemaBuilder::try_from(column_schemas)
@@ -607,8 +611,12 @@ mod tests {
             ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true),
             ColumnSchema::new("col2", ConcreteDataType::int32_datatype(), true),
             ColumnSchema::new("col3", ConcreteDataType::int32_datatype(), true),
-            ColumnSchema::new("ts", ConcreteDataType::timestamp_millis_datatype(), false)
-                .with_time_index(true),
+            ColumnSchema::new(
+                "ts",
+                ConcreteDataType::timestamp_millisecond_datatype(),
+                false,
+            )
+            .with_time_index(true),
         ];
         let schema = Arc::new(
             SchemaBuilder::try_from(column_schemas)
diff --git a/src/table/src/predicate.rs b/src/table/src/predicate.rs
index 64d32d57f409..6e61415cbe9f 100644
--- a/src/table/src/predicate.rs
+++ b/src/table/src/predicate.rs
@@ -16,8 +16,8 @@ mod stats;
 
 use common_query::logical_plan::Expr;
 use common_telemetry::{error, warn};
+use datafusion::parquet::file::metadata::RowGroupMetaData;
 use datafusion::physical_optimizer::pruning::PruningPredicate;
-use datatypes::arrow::io::parquet::read::RowGroupMetaData;
 use datatypes::schema::SchemaRef;
 
 use crate::predicate::stats::RowGroupPruningStatistics;
@@ -70,19 +70,17 @@ impl Predicate {
 mod tests {
     use std::sync::Arc;
 
-    pub use datafusion::parquet::schema::types::{BasicTypeInfo, PhysicalType};
-    use datafusion_common::Column;
-    use datafusion_expr::{Expr, Literal, Operator};
-    use datatypes::arrow::array::{Int32Array, Utf8Array};
-    use datatypes::arrow::chunk::Chunk;
+    use datafusion::parquet::arrow::ArrowWriter;
+    pub use datafusion::parquet::schema::types::BasicTypeInfo;
+    use datafusion_common::{Column, ScalarValue};
+    use datafusion_expr::{BinaryExpr, Expr, Literal, Operator};
+    use datatypes::arrow::array::Int32Array;
     use datatypes::arrow::datatypes::{DataType, Field, Schema};
-    use datatypes::arrow::io::parquet::read::FileReader;
-    use datatypes::arrow::io::parquet::write::{
-        Compression, Encoding, FileSink, Version, WriteOptions,
-    };
-    use futures::{AsyncWriteExt, SinkExt};
+    use datatypes::arrow::record_batch::RecordBatch;
+    use datatypes::arrow_array::StringArray;
+    use parquet::arrow::ParquetRecordBatchStreamBuilder;
+    use parquet::file::properties::WriterProperties;
     use tempdir::TempDir;
-    use tokio_util::compat::TokioAsyncWriteCompatExt;
 
     use super::*;
@@ -95,80 +93,62 @@
         let name_field = Field::new("name", DataType::Utf8, true);
         let count_field = Field::new("cnt", DataType::Int32, true);
+        let schema = Arc::new(Schema::new(vec![name_field, count_field]));
-        let schema = Schema::from(vec![name_field, count_field]);
-
-        // now all physical types use plain encoding, maybe let caller to choose encoding for each type.
-        let encodings = vec![Encoding::Plain].repeat(schema.fields.len());
-
-        let mut writer = tokio::fs::OpenOptions::new()
+        let file = std::fs::OpenOptions::new()
             .write(true)
             .create(true)
-            .open(&path)
-            .await
-            .unwrap()
-            .compat_write();
-
-        let mut sink = FileSink::try_new(
-            &mut writer,
-            schema.clone(),
-            encodings,
-            WriteOptions {
-                write_statistics: true,
-                compression: Compression::Gzip,
-                version: Version::V2,
-            },
-        )
-        .unwrap();
+            .open(path.clone())
+            .unwrap();
+
+        let write_props = WriterProperties::builder()
+            .set_max_row_group_size(10)
+            .build();
+        let mut writer = ArrowWriter::try_new(file, schema.clone(), Some(write_props)).unwrap();
 
         for i in (0..cnt).step_by(10) {
-            let name_array = Utf8Array::<i32>::from(
-                &(i..(i + 10).min(cnt))
-                    .map(|i| Some(i.to_string()))
+            let name_array = Arc::new(StringArray::from(
+                (i..(i + 10).min(cnt))
+                    .map(|i| i.to_string())
                     .collect::<Vec<_>>(),
-            );
-            let count_array = Int32Array::from(
-                &(i..(i + 10).min(cnt))
-                    .map(|i| Some(i as i32))
-                    .collect::<Vec<_>>(),
-            );
-
-            sink.send(Chunk::new(vec![
-                Arc::new(name_array),
-                Arc::new(count_array),
-            ]))
-            .await
-            .unwrap();
+            )) as Arc<_>;
+            let count_array = Arc::new(Int32Array::from(
+                (i..(i + 10).min(cnt)).map(|i| i as i32).collect::<Vec<_>>(),
+            )) as Arc<_>;
+            let rb = RecordBatch::try_new(schema.clone(), vec![name_array, count_array]).unwrap();
+            writer.write(&rb).unwrap();
         }
-        sink.close().await.unwrap();
-
-        drop(sink);
-        writer.flush().await.unwrap();
-
-        (path, Arc::new(schema))
+        writer.close().unwrap();
+        (path, schema)
     }
 
     async fn assert_prune(array_cnt: usize, predicate: Predicate, expect: Vec<bool>) {
         let dir = TempDir::new("prune_parquet").unwrap();
         let (path, schema) = gen_test_parquet_file(&dir, array_cnt).await;
-        let file_reader =
-            FileReader::try_new(std::fs::File::open(path).unwrap(), None, None, None, None)
-                .unwrap();
         let schema = Arc::new(datatypes::schema::Schema::try_from(schema).unwrap());
-
-        let vec = file_reader.metadata().row_groups.clone();
-        let res = predicate.prune_row_groups(schema, &vec);
+        let builder = ParquetRecordBatchStreamBuilder::new(
+            tokio::fs::OpenOptions::new()
+                .read(true)
+                .open(path)
+                .await
+                .unwrap(),
+        )
+        .await
+        .unwrap();
+        let metadata = builder.metadata().clone();
+        let row_groups = metadata.row_groups();
+        let res = predicate.prune_row_groups(schema, row_groups);
         assert_eq!(expect, res);
     }
 
     fn gen_predicate(max_val: i32, op: Operator) -> Predicate {
-        Predicate::new(vec![Expr::BinaryExpr {
-            left: Box::new(Expr::Column(Column::from_name("cnt".to_string()))),
-            op,
-            right: Box::new(max_val.lit()),
-        }
-        .into()])
+        Predicate::new(vec![common_query::logical_plan::Expr::from(
+            Expr::BinaryExpr(BinaryExpr {
+                left: Box::new(Expr::Column(Column::from_name("cnt"))),
+                op,
+                right: Box::new(Expr::Literal(ScalarValue::Int32(Some(max_val)))),
+            }),
+        )])
     }
 
     #[tokio::test]
diff --git a/src/table/src/predicate/stats.rs b/src/table/src/predicate/stats.rs
index b474eddeb190..f092cd5418fc 100644
--- a/src/table/src/predicate/stats.rs
+++ b/src/table/src/predicate/stats.rs
@@ -12,17 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use datafusion::parquet::metadata::RowGroupMetaData;
-use datafusion::parquet::statistics::{
-    BinaryStatistics, BooleanStatistics, FixedLenStatistics, PrimitiveStatistics,
-};
+use std::sync::Arc;
+
+use datafusion::parquet::file::metadata::RowGroupMetaData;
+use datafusion::parquet::file::statistics::Statistics as ParquetStats;
 use datafusion::physical_optimizer::pruning::PruningStatistics;
 use datafusion_common::{Column, ScalarValue};
-use datatypes::arrow::array::ArrayRef;
+use datatypes::arrow::array::{ArrayRef, UInt64Array};
 use datatypes::arrow::datatypes::DataType;
-use datatypes::arrow::io::parquet::read::PhysicalType;
-use datatypes::prelude::Vector;
-use datatypes::vectors::Int64Vector;
 use paste::paste;
 
 pub struct RowGroupPruningStatistics<'a> {
@@ -40,92 +37,58 @@ impl<'a> RowGroupPruningStatistics<'a> {
 
     fn field_by_name(&self, name: &str) -> Option<(usize, &DataType)> {
         let idx = self.schema.column_index_by_name(name)?;
-        let data_type = &self.schema.arrow_schema().fields.get(idx)?.data_type;
+        let data_type = &self.schema.arrow_schema().fields.get(idx)?.data_type();
         Some((idx, data_type))
     }
 }
 
 macro_rules! impl_min_max_values {
-    ($self:ident, $col:ident, $min_max: ident) => {
-        paste! {
-            {
-                let (column_index, data_type) = $self.field_by_name(&$col.name)?;
-                let null_scalar: ScalarValue = data_type.try_into().ok()?;
-                let scalar_values: Vec<ScalarValue> = $self
-                    .meta_data
-                    .iter()
-                    .flat_map(|meta| meta.column(column_index).statistics())
-                    .map(|stats| {
-                        let stats = stats.ok()?;
-                        let res = match stats.physical_type() {
-                            PhysicalType::Boolean => {
-                                let $min_max = stats.as_any().downcast_ref::<BooleanStatistics>().unwrap().[<$min_max _value>];
-                                Some(ScalarValue::Boolean($min_max))
-                            }
-                            PhysicalType::Int32 => {
-                                let $min_max = stats
-                                    .as_any()
-                                    .downcast_ref::<PrimitiveStatistics<i32>>()
-                                    .unwrap()
-                                    .[<$min_max _value>];
-                                Some(ScalarValue::Int32($min_max))
-                            }
-                            PhysicalType::Int64 => {
-                                let $min_max = stats
-                                    .as_any()
-                                    .downcast_ref::<PrimitiveStatistics<i64>>()
-                                    .unwrap()
-                                    .[<$min_max _value>];
-                                Some(ScalarValue::Int64($min_max))
-                            }
-                            PhysicalType::Int96 => {
-                                // INT96 currently not supported
-                                None
-                            }
-                            PhysicalType::Float => {
-                                let $min_max = stats
-                                    .as_any()
-                                    .downcast_ref::<PrimitiveStatistics<f32>>()
-                                    .unwrap()
-                                    .[<$min_max _value>];
-                                Some(ScalarValue::Float32($min_max))
-                            }
-                            PhysicalType::Double => {
-                                let $min_max = stats
-                                    .as_any()
-                                    .downcast_ref::<PrimitiveStatistics<f64>>()
-                                    .unwrap()
-                                    .[<$min_max _value>];
-                                Some(ScalarValue::Float64($min_max))
-                            }
-                            PhysicalType::ByteArray => {
-                                let $min_max = stats
-                                    .as_any()
-                                    .downcast_ref::<BinaryStatistics>()
-                                    .unwrap()
-                                    .[<$min_max _value>]
-                                    .clone();
-                                Some(ScalarValue::Binary($min_max))
-                            }
-                            PhysicalType::FixedLenByteArray(_) => {
-                                let $min_max = stats
-                                    .as_any()
-                                    .downcast_ref::<FixedLenStatistics>()
-                                    .unwrap()
-                                    .[<$min_max _value>]
-                                    .clone();
-                                Some(ScalarValue::Binary($min_max))
-                            }
-                        };
+    ($self:ident, $col:ident, $min_max: ident) => {{
+        let arrow_schema = $self.schema.arrow_schema().clone();
+        let (column_index, field) = if let Some((v, f)) = arrow_schema.column_with_name(&$col.name)
+        {
+            (v, f)
+        } else {
+            return None;
+        };
+        let data_type = field.data_type();
+        let null_scalar: ScalarValue = if let Ok(v) = data_type.try_into() {
+            v
+        } else {
+            return None;
+        };
 
-                        res
-                    })
-                    .map(|maybe_scalar| maybe_scalar.unwrap_or_else(|| null_scalar.clone()))
-                    .collect::<Vec<_>>();
-                ScalarValue::iter_to_array(scalar_values).ok()
-            }
-        }
-    };
+        let scalar_values = $self
+            .meta_data
+            .iter()
+            .map(|meta| {
+                let stats = meta.column(column_index).statistics()?;
+                if !stats.has_min_max_set() {
+                    return None;
+                }
+                match stats {
+                    ParquetStats::Boolean(s) => Some(ScalarValue::Boolean(Some(*s.$min_max()))),
+                    ParquetStats::Int32(s) => Some(ScalarValue::Int32(Some(*s.$min_max()))),
+                    ParquetStats::Int64(s) => Some(ScalarValue::Int64(Some(*s.$min_max()))),
+
+                    ParquetStats::Int96(_) => None,
+                    ParquetStats::Float(s) => Some(ScalarValue::Float32(Some(*s.$min_max()))),
+                    ParquetStats::Double(s) => Some(ScalarValue::Float64(Some(*s.$min_max()))),
+                    ParquetStats::ByteArray(s) => {
+                        paste! {
+                            let s = String::from_utf8(s.[<$min_max _bytes>]().to_owned()).ok();
+                        }
+                        Some(ScalarValue::Utf8(s))
+                    }
+
+                    ParquetStats::FixedLenByteArray(_) => None,
+                }
+            })
+            .map(|maybe_scalar| maybe_scalar.unwrap_or_else(|| null_scalar.clone()))
+            .collect::<Vec<_>>();
+        debug_assert_eq!(scalar_values.len(), $self.meta_data.len());
+        ScalarValue::iter_to_array(scalar_values).ok()
+    }};
 }
 
 impl<'a> PruningStatistics for RowGroupPruningStatistics<'a> {
@@ -143,14 +106,13 @@ impl<'a> PruningStatistics for RowGroupPruningStatistics<'a> {
 
     fn null_counts(&self, column: &Column) -> Option<ArrayRef> {
         let (idx, _) = self.field_by_name(&column.name)?;
-        let mut values: Vec<Option<i64>> = Vec::with_capacity(self.meta_data.len());
+        let mut values: Vec<Option<u64>> = Vec::with_capacity(self.meta_data.len());
         for m in self.meta_data {
             let col = m.column(idx);
-            let stat = col.statistics()?.ok()?;
+            let stat = col.statistics()?;
             let bs = stat.null_count();
-            values.push(bs);
+            values.push(Some(bs));
         }
-
-        Some(Int64Vector::from(values).to_arrow_array())
+        Some(Arc::new(UInt64Array::from(values)))
     }
 }
diff --git a/src/table/src/table/adapter.rs b/src/table/src/table/adapter.rs
index 32824e7a49e1..98ff82d08a3b 100644
--- a/src/table/src/table/adapter.rs
+++ b/src/table/src/table/adapter.rs
@@ -23,7 +23,9 @@ use datafusion::arrow::datatypes::SchemaRef as DfSchemaRef;
 use datafusion::datasource::datasource::TableProviderFilterPushDown as DfTableProviderFilterPushDown;
 use datafusion::datasource::{TableProvider, TableType as DfTableType};
 use datafusion::error::Result as DfResult;
-use datafusion::logical_plan::Expr as DfExpr;
+use datafusion::execution::context::SessionState;
+use datafusion::prelude::SessionContext;
+use datafusion_expr::expr::Expr as DfExpr;
 use datatypes::schema::{SchemaRef as TableSchemaRef, SchemaRef};
 use snafu::prelude::*;
 
@@ -66,6 +68,7 @@ impl TableProvider for DfTableProviderAdapter {
     async fn scan(
         &self,
+        _ctx: &SessionState,
         projection: &Option<Vec<usize>>,
         filters: &[DfExpr],
         limit: Option<usize>,
@@ -135,11 +138,12 @@ impl Table for TableAdapter {
         filters: &[Expr],
         limit: Option<usize>,
     ) -> Result<PhysicalPlanRef> {
+        let ctx = SessionContext::new();
         let filters: Vec<DfExpr> = filters.iter().map(|e| e.df_expr().clone()).collect();
         debug!("TableScan filter size: {}", filters.len());
         let execution_plan = self
             .table_provider
-            .scan(projection, &filters, limit)
+            .scan(&ctx.state(), projection, &filters, limit)
             .await
             .context(error::DatafusionSnafu)?;
         let schema: SchemaRef = Arc::new(
@@ -168,7 +172,6 @@ impl Table for TableAdapter {
 mod tests {
     use datafusion::arrow;
     use datafusion::datasource::empty::EmptyTable;
-    use datafusion_common::field_util::SchemaExt;
 
     use super::*;
     use crate::metadata::TableType::Base;
diff --git a/src/table/src/table/numbers.rs b/src/table/src/table/numbers.rs
index db33769c312b..7664d8f0fd9d 100644
--- a/src/table/src/table/numbers.rs
+++ b/src/table/src/table/numbers.rs
@@ -19,7 +19,8 @@ use std::sync::Arc;
 use common_query::physical_plan::PhysicalPlanRef;
 use common_recordbatch::error::Result as RecordBatchResult;
 use common_recordbatch::{RecordBatch, RecordBatchStream};
-use datafusion_common::record_batch::RecordBatch as DfRecordBatch;
+use datafusion::arrow::record_batch::RecordBatch as DfRecordBatch;
+use datafusion_common::from_slice::FromSlice;
 use datatypes::arrow::array::UInt32Array;
 use datatypes::data_type::ConcreteDataType;
 use datatypes::schema::{ColumnSchema, SchemaBuilder, SchemaRef};
@@ -139,9 +140,9 @@ impl Stream for NumbersStream {
         )
         .unwrap();
 
-        Poll::Ready(Some(Ok(RecordBatch {
-            schema: self.schema.clone(),
-            df_recordbatch: batch,
-        })))
+        Poll::Ready(Some(RecordBatch::try_from_df_record_batch(
+            self.schema.clone(),
+            batch,
+        )))
     }
 }
diff --git a/src/table/src/table/scan.rs b/src/table/src/table/scan.rs
index 4e1ef884e744..b9078befa8aa 100644
--- a/src/table/src/table/scan.rs
+++ b/src/table/src/table/scan.rs
@@ -18,8 +18,9 @@ use std::sync::{Arc, Mutex};
 
 use common_query::error as query_error;
 use common_query::error::Result as QueryResult;
-use common_query::physical_plan::{Partitioning, PhysicalPlan, PhysicalPlanRef, RuntimeEnv};
+use common_query::physical_plan::{Partitioning, PhysicalPlan, PhysicalPlanRef};
 use common_recordbatch::SendableRecordBatchStream;
+use datafusion::execution::context::TaskContext;
 use datatypes::schema::SchemaRef;
 use snafu::OptionExt;
 
@@ -71,16 +72,17 @@ impl PhysicalPlan for SimpleTableScan {
     fn execute(
         &self,
         _partition: usize,
-        _runtime: Arc<RuntimeEnv>,
+        _context: Arc<TaskContext>,
     ) -> QueryResult<SendableRecordBatchStream> {
         let mut stream = self.stream.lock().unwrap();
-        Ok(stream.take().context(query_error::ExecuteRepeatedlySnafu)?)
+        stream.take().context(query_error::ExecuteRepeatedlySnafu)
     }
 }
 
 #[cfg(test)]
 mod test {
     use common_recordbatch::{util, RecordBatch, RecordBatches};
+    use datafusion::prelude::SessionContext;
     use datatypes::data_type::ConcreteDataType;
     use datatypes::schema::{ColumnSchema, Schema};
     use datatypes::vectors::Int32Vector;
@@ -89,6 +91,7 @@ mod test {
 
     #[tokio::test]
     async fn test_simple_table_scan() {
+        let ctx = SessionContext::new();
         let schema = Arc::new(Schema::new(vec![ColumnSchema::new(
             "a",
             ConcreteDataType::int32_datatype(),
@@ -114,13 +117,12 @@
 
         assert_eq!(scan.schema(), schema);
 
-        let runtime = Arc::new(RuntimeEnv::default());
-        let stream = scan.execute(0, runtime.clone()).unwrap();
+        let stream = scan.execute(0, ctx.task_ctx()).unwrap();
         let recordbatches = util::collect(stream).await.unwrap();
         assert_eq!(recordbatches[0], batch1);
         assert_eq!(recordbatches[1], batch2);
 
-        let result = scan.execute(0, runtime);
+        let result = scan.execute(0, ctx.task_ctx());
         assert!(result.is_err());
         match result {
             Err(e) => assert!(e
diff --git a/src/table/src/test_util/memtable.rs b/src/table/src/test_util/memtable.rs
index 5f35e73c82b9..c0cd028f457f 100644
--- a/src/table/src/test_util/memtable.rs
+++ b/src/table/src/test_util/memtable.rs
@@ -17,6 +17,7 @@ use std::pin::Pin;
 use std::sync::Arc;
 
 use async_trait::async_trait;
+use common_error::prelude::BoxedError;
 use common_query::physical_plan::PhysicalPlanRef;
 use common_query::prelude::Expr;
 use common_recordbatch::error::Result as RecordBatchResult;
@@ -29,7 +30,7 @@ use futures::Stream;
 use snafu::prelude::*;
 use store_api::storage::RegionNumber;
 
-use crate::error::{Result, SchemaConversionSnafu, TableProjectionSnafu};
+use crate::error::{Result, SchemaConversionSnafu, TableProjectionSnafu, TablesRecordBatchSnafu};
 use crate::metadata::{
     TableId, TableInfoBuilder, TableInfoRef, TableMetaBuilder, TableType, TableVersion,
 };
@@ -145,11 +146,11 @@ impl Table for MemTable {
     ) -> Result<PhysicalPlanRef> {
         let df_recordbatch = if let Some(indices) = projection {
             self.recordbatch
-                .df_recordbatch
+                .df_record_batch()
                 .project(indices)
                 .context(TableProjectionSnafu)?
         } else {
-            self.recordbatch.df_recordbatch.clone()
+            self.recordbatch.df_record_batch().clone()
         };
 
         let rows = df_recordbatch.num_rows();
@@ -160,12 +161,12 @@ impl Table for MemTable {
         };
 
         let df_recordbatch = df_recordbatch.slice(0, limit);
-        let recordbatch = RecordBatch {
-            schema: Arc::new(
-                Schema::try_from(df_recordbatch.schema().clone()).context(SchemaConversionSnafu)?,
-            ),
+        let recordbatch = RecordBatch::try_from_df_record_batch(
+            Arc::new(Schema::try_from(df_recordbatch.schema()).context(SchemaConversionSnafu)?),
             df_recordbatch,
-        };
+        )
+        .map_err(BoxedError::new)
+        .context(TablesRecordBatchSnafu)?;
 
         Ok(Arc::new(SimpleTableScan::new(Box::pin(MemtableStream {
             schema: recordbatch.schema.clone(),
             recordbatch: Some(recordbatch),
@@ -197,28 +198,27 @@ impl Stream for MemtableStream {
 
 #[cfg(test)]
 mod test {
-    use common_query::physical_plan::RuntimeEnv;
     use common_recordbatch::util;
+    use datafusion::prelude::SessionContext;
     use datatypes::prelude::*;
     use datatypes::schema::ColumnSchema;
-    use datatypes::vectors::{Int32Vector, StringVector};
+    use datatypes::vectors::{Helper, Int32Vector, StringVector};
 
     use super::*;
 
     #[tokio::test]
     async fn test_scan_with_projection() {
+        let ctx = SessionContext::new();
         let table = build_testing_table();
 
         let scan_stream = table.scan(&Some(vec![1]), &[], None).await.unwrap();
-        let scan_stream = scan_stream
-            .execute(0, Arc::new(RuntimeEnv::default()))
-            .unwrap();
+        let scan_stream = scan_stream.execute(0, ctx.task_ctx()).unwrap();
         let recordbatch = util::collect(scan_stream).await.unwrap();
         assert_eq!(1, recordbatch.len());
 
-        let columns = recordbatch[0].df_recordbatch.columns();
+        let columns = recordbatch[0].df_record_batch().columns();
         assert_eq!(1, columns.len());
 
-        let string_column = VectorHelper::try_into_vector(&columns[0]).unwrap();
+        let string_column = Helper::try_into_vector(&columns[0]).unwrap();
         let string_column = string_column
             .as_any()
             .downcast_ref::<StringVector>()
             .unwrap()
@@ -229,23 +229,22 @@ mod test {
 
     #[tokio::test]
     async fn test_scan_with_limit() {
+        let ctx = SessionContext::new();
         let table = build_testing_table();
 
         let scan_stream = table.scan(&None, &[], Some(2)).await.unwrap();
-        let scan_stream = scan_stream
-            .execute(0, Arc::new(RuntimeEnv::default()))
-            .unwrap();
+        let scan_stream = scan_stream.execute(0, ctx.task_ctx()).unwrap();
         let recordbatch = util::collect(scan_stream).await.unwrap();
         assert_eq!(1, recordbatch.len());
 
-        let columns = recordbatch[0].df_recordbatch.columns();
+        let columns = recordbatch[0].df_record_batch().columns();
         assert_eq!(2, columns.len());
 
-        let i32_column = VectorHelper::try_into_vector(&columns[0]).unwrap();
+        let i32_column = Helper::try_into_vector(&columns[0]).unwrap();
         let i32_column = i32_column.as_any().downcast_ref::<Int32Vector>().unwrap();
         let i32_column = i32_column.iter_data().flatten().collect::<Vec<_>>();
         assert_eq!(vec![-100], i32_column);
 
-        let string_column = VectorHelper::try_into_vector(&columns[1]).unwrap();
+        let string_column = Helper::try_into_vector(&columns[1]).unwrap();
         let string_column = string_column
             .as_any()
             .downcast_ref::<StringVector>()
diff --git a/tests-integration/src/test_util.rs b/tests-integration/src/test_util.rs
index 958bcf2fb88a..6c77b28d52d4 100644
--- a/tests-integration/src/test_util.rs
+++ b/tests-integration/src/test_util.rs
@@ -225,7 +225,7 @@ pub async fn setup_test_app(store_type: StorageType, name: &str) -> (Router, Tes
     create_test_table(
         instance.catalog_manager(),
         instance.sql_handler(),
-        ConcreteDataType::timestamp_millis_datatype(),
+        ConcreteDataType::timestamp_millisecond_datatype(),
     )
     .await
     .unwrap();
@@ -244,7 +244,7 @@ pub async fn setup_test_app_with_frontend(
     create_test_table(
         frontend.catalog_manager(),
         instance.sql_handler(),
-        ConcreteDataType::timestamp_millis_datatype(),
+        ConcreteDataType::timestamp_millisecond_datatype(),
     )
     .await
     .unwrap();
diff --git a/tests-integration/tests/grpc.rs b/tests-integration/tests/grpc.rs
index 6f94aff3e54a..7ebce045096f 100644
--- a/tests-integration/tests/grpc.rs
+++ b/tests-integration/tests/grpc.rs
@@ -109,11 +109,11 @@ fn expect_data() -> (Column, Column, Column, Column) {
     let expected_ts_col = Column {
         column_name: "ts".to_string(),
         values: Some(column::Values {
-            ts_millis_values: vec![100, 101, 102, 103],
+            ts_millisecond_values: vec![100, 101, 102, 103],
             ..Default::default()
         }),
         semantic_type: SemanticType::Timestamp as i32,
-        datatype: ColumnDataType::Timestamp as i32,
+        datatype: ColumnDataType::TimestampMillisecond as i32,
         ..Default::default()
     };
@@ -244,7 +244,7 @@ fn testing_create_expr() -> CreateExpr {
         },
         ColumnDef {
             name: "ts".to_string(),
-            datatype: 15, // timestamp
+            datatype: ColumnDataType::TimestampMillisecond as i32, // timestamp
             is_nullable: true,
             default_constraint: None,
         },
diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs
index 8d074bba67df..267c49e8249c 100644
--- a/tests-integration/tests/http.rs
+++ b/tests-integration/tests/http.rs
@@ -116,7 +116,7 @@ pub async fn test_sql_api(store_type: StorageType) {
     assert_eq!(
         output[0],
         serde_json::from_value::<JsonOutput>(json!({
-            "records":{"schema":{"column_schemas":[{"name":"host","data_type":"String"},{"name":"cpu","data_type":"Float64"},{"name":"memory","data_type":"Float64"},{"name":"ts","data_type":"Timestamp"}]},"rows":[["host",66.6,1024.0,0]]}
+            "records":{"schema":{"column_schemas":[{"name":"host","data_type":"String"},{"name":"cpu","data_type":"Float64"},{"name":"memory","data_type":"Float64"},{"name":"ts","data_type":"TimestampMillisecond"}]},"rows":[["host",66.6,1024.0,0]]}
         })).unwrap()
     );
 
@@ -138,7 +138,7 @@
     assert_eq!(
         output[0],
         serde_json::from_value::<JsonOutput>(json!({
-            "records":{"schema":{"column_schemas":[{"name":"cpu","data_type":"Float64"},{"name":"ts","data_type":"Timestamp"}]},"rows":[[66.6,0]]}
+            "records":{"schema":{"column_schemas":[{"name":"cpu","data_type":"Float64"},{"name":"ts","data_type":"TimestampMillisecond"}]},"rows":[[66.6,0]]}
         })).unwrap()
     );
 
@@ -159,7 +159,7 @@
     assert_eq!(
         output[0],
         serde_json::from_value::<JsonOutput>(json!({
-            "records":{"schema":{"column_schemas":[{"name":"c","data_type":"Float64"},{"name":"time","data_type":"Timestamp"}]},"rows":[[66.6,0]]}
+            "records":{"schema":{"column_schemas":[{"name":"c","data_type":"Float64"},{"name":"time","data_type":"TimestampMillisecond"}]},"rows":[[66.6,0]]}
         })).unwrap()
     );
diff --git a/tests/runner/src/util.rs b/tests/runner/src/util.rs
index a6accc9ed71c..6c42d4391d4f 100644
--- a/tests/runner/src/util.rs
+++ b/tests/runner/src/util.rs
@@ -98,8 +98,23 @@ pub fn values_to_string(data_type: ColumnDataType, values: Values) -> Vec<String
-        ColumnDataType::Timestamp => values
-            .ts_millis_values
+        ColumnDataType::TimestampSecond => values
+            .ts_second_values
+            .into_iter()
+            .map(|v| v.to_string())
+            .collect(),
+        ColumnDataType::TimestampMillisecond => values
+            .ts_millisecond_values
+            .into_iter()
+            .map(|v| v.to_string())
+            .collect(),
+        ColumnDataType::TimestampMicrosecond => values
+            .ts_microsecond_values
+            .into_iter()
+            .map(|v| v.to_string())
+            .collect(),
+        ColumnDataType::TimestampNanosecond => values
+            .ts_nanosecond_values
             .into_iter()
             .map(|v| v.to_string())
             .collect(),