diff --git a/Cargo.lock b/Cargo.lock index 7f9efb7d47229..6ffa8611f6ea2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -614,7 +614,7 @@ source = "git+https://github.com/datafuse-extras/async-backtrace.git?rev=dea4553 dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -717,7 +717,7 @@ source = "git+https://github.com/datafuse-extras/async-recursion.git?rev=a353334 dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -739,7 +739,7 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -756,7 +756,7 @@ checksum = "e539d3fca749fcee5236ab05e93a52867dd549cc157c8cb7f99595f3cedffdb5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -1441,7 +1441,7 @@ dependencies = [ "regex", "rustc-hash 1.1.0", "shlex", - "syn 2.0.101", + "syn 2.0.106", "which 4.4.2", ] @@ -1460,7 +1460,7 @@ dependencies = [ "regex", "rustc-hash 2.1.1", "shlex", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -1628,6 +1628,31 @@ dependencies = [ "serde_with", ] +[[package]] +name = "bon" +version = "3.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebeb9aaf9329dff6ceb65c689ca3db33dbf15f324909c60e4e5eef5701ce31b1" +dependencies = [ + "bon-macros", + "rustversion", +] + +[[package]] +name = "bon-macros" +version = "3.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77e9d642a7e3a318e37c2c9427b5a6a48aa1ad55dcd986f3034ab2239045a645" +dependencies = [ + "darling 0.21.3", + "ident_case", + "prettyplease", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.106", +] + [[package]] name = "borsh" version = "1.5.7" @@ -1648,7 +1673,7 @@ dependencies = [ "proc-macro-crate 3.3.0", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -1778,7 +1803,7 @@ checksum = "7ecc273b49b3205b83d648f0690daa588925572cc5063745bfe547fe7ec8e1a1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -2049,7 +2074,7 @@ checksum = "d59ae0466b83e838b81a54256c39d5d7c20b9d7daa10510a242d9b75abd5936e" dependencies = [ "chrono", "chrono-tz-build 0.2.1", - "phf", + "phf 0.11.3", "serde", ] @@ -2061,7 +2086,7 @@ checksum = "efdce149c370f133a071ca8ef6ea340b7b88748ab0810097a9e2976eaa34b4f3" dependencies = [ "chrono", "chrono-tz-build 0.4.1", - "phf", + "phf 0.11.3", ] [[package]] @@ -2071,8 +2096,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "433e39f13c9a060046954e0592a8d0a4bcb1040125cbf91cb8ee58964cfb350f" dependencies = [ "parse-zoneinfo", - "phf", - "phf_codegen", + "phf 0.11.3", + "phf_codegen 0.11.3", ] [[package]] @@ -2082,7 +2107,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f10f8c9340e31fc120ff885fcdb54a0b48e474bbd77cab557f0c30a3e569402" dependencies = [ "parse-zoneinfo", - "phf_codegen", + "phf_codegen 0.11.3", ] [[package]] @@ -2195,7 +2220,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -2828,7 +2853,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a2785755761f3ddc1492979ce1e48d2c00d09311c39e4466429188f3dd6501" dependencies = [ "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -2856,8 +2881,18 @@ version = "0.20.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" dependencies = [ - "darling_core", - "darling_macro", + "darling_core 0.20.11", + "darling_macro 0.20.11", +] + +[[package]] +name = "darling" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cdf337090841a411e2a7f3deb9187445851f91b309c0c0a29e05f74a00a48c0" +dependencies = [ + "darling_core 0.21.3", + "darling_macro 0.21.3", ] [[package]] @@ -2871,7 +2906,21 @@ dependencies = [ "proc-macro2", "quote", "strsim 0.11.1", - "syn 2.0.101", + "syn 2.0.106", +] + +[[package]] +name = "darling_core" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1247195ecd7e3c85f83c8d2a366e4210d588e802133e1e355180a9870b517ea4" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim 0.11.1", + "syn 2.0.106", ] [[package]] @@ -2880,9 +2929,20 @@ version = "0.20.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" dependencies = [ - "darling_core", + "darling_core 0.20.11", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "darling_macro" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" +dependencies = [ + "darling_core 0.21.3", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -3381,7 +3441,7 @@ dependencies = [ "proc-macro2", "quote", "sha2", - "syn 2.0.101", + "syn 2.0.106", "trybuild", ] @@ -5441,6 +5501,7 @@ dependencies = [ "bytemuck", "bytes", "cbordata", + "crc32fast", "databend-common-ast", "databend-common-exception", "databend-common-expression", @@ -5578,7 +5639,7 @@ dependencies = [ "enum-ordinalize", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -5677,7 +5738,7 @@ checksum = "f7b49a2e67ebafbe644e36f251ee985f237bfb39e4ef1e312eb5876535bc449e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -5750,7 +5811,7 @@ dependencies = [ "itertools 0.14.0", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -5815,7 +5876,7 @@ checksum = "30542c1ad912e0e3d22a1935c290e12e8a29d704a420177a31faad4a601a0800" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -5833,10 +5894,10 @@ version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" dependencies = [ - "darling", + "darling 0.20.11", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -5846,7 +5907,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ "derive_builder_core", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -5867,7 +5928,7 @@ dependencies = [ "convert_case 0.6.0", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", "unicode-xid", ] @@ -5953,7 +6014,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -5978,7 +6039,7 @@ checksum = "9556bc800956545d6420a640173e5ba7dfa82f38d3ea5a167eb555bc69ac3323" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -6024,9 +6085,9 @@ checksum = "1435fa1053d8b2fbbe9be7e97eca7f33d37b28409959813daefc1446a14247f1" [[package]] name = "downcast-rs" -version = "1.2.1" +version = "2.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75b325c5dbd37f80359721ad39aca5a29fb04c89279657cffdda8736d0c0b9d2" +checksum = "117240f60069e65410b3ae1bb213295bd828f707b5bec6596a1afc8793ce0cbc" [[package]] name = "dtoa" @@ -6100,7 +6161,7 @@ dependencies = [ "enum-ordinalize", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -6178,7 +6239,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -6198,7 +6259,7 @@ checksum = "0d28318a75d4aead5c4db25382e8ef717932d0346600cacae6357eb5941bc5ff" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -6210,7 +6271,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -6231,7 +6292,7 @@ checksum = "fc4caf64a58d7a6d65ab00639b046ff54399a39f5f2554728895ace4b297cd79" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -6303,7 +6364,7 @@ checksum = "44f23cf4b44bfce11a86ace86f8a73ffdec849c9fd00a386a53d278bd9e81fb3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -6472,7 +6533,7 @@ dependencies = [ "proc-macro-error2", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -6710,7 +6771,7 @@ checksum = "e99b8b3c28ae0e84b604c75f721c21dc77afb3706076af5e8216d15fd1deaae3" dependencies = [ "frunk_proc_macro_helpers", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -6722,7 +6783,7 @@ dependencies = [ "frunk_core", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -6734,7 +6795,7 @@ dependencies = [ "frunk_core", "frunk_proc_macro_helpers", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -6769,12 +6830,12 @@ dependencies = [ [[package]] name = "fs4" -version = "0.8.4" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7e180ac76c23b45e767bd7ae9579bc0bb458618c4bc71835926e098e61d15f8" +checksum = "8640e34b88f7652208ce9e88b1a37a2ae95227d84abec377ccd3c5cfeb141ed4" dependencies = [ - "rustix 0.38.44", - "windows-sys 0.52.0", + "rustix 1.0.7", + "windows-sys 0.59.0", ] [[package]] @@ -6832,7 +6893,7 @@ checksum = "f664c1c2186b81f798ac765d661fb8cefd74fdb398fd23c76c3fb3c1aec760e8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -6897,7 +6958,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -8726,6 +8787,15 @@ dependencies = [ "tower-service", ] +[[package]] +name = "hyperloglogplus" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "621debdf94dcac33e50475fdd76d34d5ea9c0362a834b9db08c3024696c1fbe3" +dependencies = [ + "serde", +] + [[package]] name = "iana-time-zone" version = "0.1.63" @@ -9019,25 +9089,39 @@ dependencies = [ [[package]] name = "include-flate" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df49c16750695486c1f34de05da5b7438096156466e7f76c38fcdf285cf0113e" +checksum = "e01b7cb6ca682a621e7cda1c358c9724b53a7b4409be9be1dd443b7f3a26f998" dependencies = [ "include-flate-codegen", - "lazy_static", + "include-flate-compress", "libflate", + "zstd 0.13.3", ] [[package]] name = "include-flate-codegen" -version = "0.2.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c5b246c6261be723b85c61ecf87804e8ea4a35cb68be0ff282ed84b95ffe7d7" +checksum = "4f49bf5274aebe468d6e6eba14a977eaf1efa481dc173f361020de70c1c48050" dependencies = [ + "include-flate-compress", "libflate", + "proc-macro-error 1.0.4", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", + "zstd 0.13.3", +] + +[[package]] +name = "include-flate-compress" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eae6a40e716bcd5931f5dbb79cd921512a4f647e2e9413fded3171fca3824dbc" +dependencies = [ + "libflate", + "zstd 0.13.3", ] [[package]] @@ -9135,9 +9219,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" dependencies = [ "cfg-if", - "js-sys", - "wasm-bindgen", - "web-sys", ] [[package]] @@ -9368,26 +9449,25 @@ dependencies = [ [[package]] name = "jieba-macros" -version = "0.7.1" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c676b32a471d3cfae8dac2ad2f8334cd52e53377733cca8c1fb0a5062fec192" +checksum = "348294e44ee7e3c42685da656490f8febc7359632544019621588902216da95c" dependencies = [ - "phf_codegen", + "phf_codegen 0.13.1", ] [[package]] name = "jieba-rs" -version = "0.7.2" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d1bcad6332969e4d48ee568d430e14ee6dea70740c2549d005d87677ebefb0c" +checksum = "766bd7012aa5ba49411ebdf4e93bddd59b182d2918e085d58dec5bb9b54b7105" dependencies = [ "cedarwood", - "fxhash", "include-flate", "jieba-macros", - "lazy_static", - "phf", + "phf 0.13.1", "regex", + "rustc-hash 2.1.1", ] [[package]] @@ -9414,7 +9494,7 @@ checksum = "f3c30758ddd7188629c6713fc45d1188af4f44c90582311d0c8d8c9907f60c48" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -9888,7 +9968,7 @@ dependencies = [ "proc-macro-error 1.0.4", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -10146,7 +10226,7 @@ checksum = "5cf92c10c7e361d6b99666ec1c6f9805b0bea2c3bd8c78dc6fe98ac5bd78db11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -10167,11 +10247,10 @@ dependencies = [ [[package]] name = "measure_time" -version = "0.8.3" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbefd235b0aadd181626f281e1d684e116972988c14c264e42069d5e8a5775cc" +checksum = "51c55d61e72fc3ab704396c5fa16f4c184db37978ae4e94ca8959693a235fc0e" dependencies = [ - "instant", "log", ] @@ -10394,7 +10473,7 @@ checksum = "b40e46c845ac234bcba19db7ab252bc2778cbadd516a466d2f12b1580852d136" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -10438,7 +10517,7 @@ checksum = "0ac7d860b767c6398e88fe93db73ce53eb496057aa6895ffa4d60cb02e1d1c6b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -10465,14 +10544,14 @@ version = "0.31.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "63c3512cf11487168e0e9db7157801bf5273be13055a9cc95356dc9e0035e49c" dependencies = [ - "darling", + "darling 0.20.11", "heck 0.5.0", "num-bigint", "proc-macro-crate 3.3.0", "proc-macro-error2", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", "termcolor", "thiserror 1.0.69", ] @@ -10650,7 +10729,7 @@ dependencies = [ "proc-macro-error 1.0.4", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -10763,7 +10842,7 @@ checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -10845,7 +10924,7 @@ dependencies = [ "proc-macro-crate 3.3.0", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -11022,7 +11101,7 @@ dependencies = [ "proc-macro2", "quote", "semver", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -11064,7 +11143,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -11339,8 +11418,8 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "ownedbytes" -version = "0.7.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=7502370#7502370b68e6822a687ee071660e350b67808533" +version = "0.9.0" +source = "git+https://github.com/datafuse-extras/tantivy?rev=9065a4d#9065a4de248d7b077560dd3602e0ced82471d8b5" dependencies = [ "stable_deref_trait", ] @@ -11487,7 +11566,7 @@ dependencies = [ "regex", "regex-syntax 0.8.5", "structmeta", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -11577,7 +11656,17 @@ version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" dependencies = [ - "phf_shared", + "phf_shared 0.11.3", +] + +[[package]] +name = "phf" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf" +dependencies = [ + "phf_shared 0.13.1", + "serde", ] [[package]] @@ -11586,8 +11675,18 @@ version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" dependencies = [ - "phf_generator", - "phf_shared", + "phf_generator 0.11.3", + "phf_shared 0.11.3", +] + +[[package]] +name = "phf_codegen" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49aa7f9d80421bca176ca8dbfebe668cc7a2684708594ec9f3c0db0805d5d6e1" +dependencies = [ + "phf_generator 0.13.1", + "phf_shared 0.13.1", ] [[package]] @@ -11596,10 +11695,20 @@ version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" dependencies = [ - "phf_shared", + "phf_shared 0.11.3", "rand 0.8.5", ] +[[package]] +name = "phf_generator" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737" +dependencies = [ + "fastrand", + "phf_shared 0.13.1", +] + [[package]] name = "phf_shared" version = "0.11.3" @@ -11609,6 +11718,15 @@ dependencies = [ "siphasher 1.0.1", ] +[[package]] +name = "phf_shared" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266" +dependencies = [ + "siphasher 1.0.1", +] + [[package]] name = "pilota" version = "0.11.8" @@ -11649,7 +11767,7 @@ checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -11768,7 +11886,7 @@ dependencies = [ "proc-macro-crate 3.3.0", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -11916,7 +12034,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "664ec5419c51e34154eec046ebcba56312d5a2fc3b09a06da188e1ad21afadf6" dependencies = [ "proc-macro2", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -12016,7 +12134,7 @@ dependencies = [ "proc-macro-error-attr2", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -12119,7 +12237,7 @@ checksum = "440f724eba9f6996b75d63681b0a92b06947f1457076d503a4d2e2c8f56442b8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -12176,7 +12294,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.101", + "syn 2.0.106", "tempfile", ] @@ -12190,7 +12308,7 @@ dependencies = [ "itertools 0.14.0", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -12370,7 +12488,7 @@ checksum = "ca414edb151b4c8d125c12566ab0d74dc9cdba36fb80eb7b848c15f495fd32d1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -12442,7 +12560,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -12455,7 +12573,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -12710,7 +12828,7 @@ checksum = "f5135143cb48d14289139e4615bffec0d59b4cbfd4ea2398a3770bd2abfc4aa2" dependencies = [ "proc-macro-hack", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -12763,7 +12881,7 @@ version = "0.1.1" source = "git+https://github.com/datafuse-extras/recursive.git?rev=16e433a#16e433ab3f291512b437206f7ce7b8d078e84195" dependencies = [ "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -12847,7 +12965,7 @@ checksum = "1165225c21bff1f3bbce98f5a1f889949bc902d3575308cc7b0de30b4f6d27c7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -13156,7 +13274,7 @@ checksum = "246b40ac189af6c675d124b802e8ef6d5246c53e17367ce9501f8f66a81abb7a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -13262,7 +13380,7 @@ dependencies = [ "proc-macro2", "quote", "rquickjs-core", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -13780,7 +13898,7 @@ checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -13822,7 +13940,7 @@ checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -13898,10 +14016,10 @@ version = "3.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8d00caa5193a3c8362ac2b73be6b9e768aa5a4b2f721d8f4b339600c3cb51f8e" dependencies = [ - "darling", + "darling 0.20.11", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -14117,9 +14235,9 @@ dependencies = [ [[package]] name = "sketches-ddsketch" -version = "0.2.2" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85636c14b73d81f541e525f585c0a2109e6744e1565b5c1668e31c70c10ed65c" +checksum = "c1e9a774a6c28142ac54bb25d25562e6bcf957493a184f15ad4eebccb23e410a" dependencies = [ "serde", ] @@ -14186,7 +14304,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -14417,7 +14535,7 @@ dependencies = [ "quote", "sqlx-core", "sqlx-macros-core", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -14440,7 +14558,7 @@ dependencies = [ "sqlx-mysql", "sqlx-postgres", "sqlx-sqlite", - "syn 2.0.101", + "syn 2.0.106", "tempfile", "tokio", "url", @@ -14656,7 +14774,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta-derive", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -14667,7 +14785,7 @@ checksum = "152a0b65a590ff6c3da95cabe2353ee04e6167c896b28e3b14478c2636c922fc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -14717,7 +14835,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -14730,7 +14848,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -14899,9 +15017,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.101" +version = "2.0.106" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" +checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" dependencies = [ "proc-macro2", "quote", @@ -14936,7 +15054,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -15007,13 +15125,14 @@ checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" [[package]] name = "tantivy" -version = "0.22.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=7502370#7502370b68e6822a687ee071660e350b67808533" +version = "0.25.0" +source = "git+https://github.com/datafuse-extras/tantivy?rev=9065a4d#9065a4de248d7b077560dd3602e0ced82471d8b5" dependencies = [ "aho-corasick", "arc-swap", "base64 0.22.1", "bitpacking 0.9.2", + "bon", "byteorder", "census", "crc32fast", @@ -15023,20 +15142,20 @@ dependencies = [ "fnv", "fs4", "htmlescape", - "itertools 0.12.1", + "hyperloglogplus", + "itertools 0.14.0", "levenshtein_automata", "log", "lru", "lz4_flex", "measure_time", "memmap2", - "num_cpus", "once_cell", "oneshot", "rayon", "regex", "rust-stemmers", - "rustc-hash 1.1.0", + "rustc-hash 2.1.1", "serde", "serde_json", "sketches-ddsketch", @@ -15049,7 +15168,7 @@ dependencies = [ "tantivy-stacker", "tantivy-tokenizer-api", "tempfile", - "thiserror 1.0.69", + "thiserror 2.0.12", "time", "uuid", "winapi", @@ -15057,20 +15176,20 @@ dependencies = [ [[package]] name = "tantivy-bitpacker" -version = "0.6.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=7502370#7502370b68e6822a687ee071660e350b67808533" +version = "0.9.0" +source = "git+https://github.com/datafuse-extras/tantivy?rev=9065a4d#9065a4de248d7b077560dd3602e0ced82471d8b5" dependencies = [ "bitpacking 0.9.2", ] [[package]] name = "tantivy-columnar" -version = "0.3.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=7502370#7502370b68e6822a687ee071660e350b67808533" +version = "0.6.0" +source = "git+https://github.com/datafuse-extras/tantivy?rev=9065a4d#9065a4de248d7b077560dd3602e0ced82471d8b5" dependencies = [ "downcast-rs", "fastdivide", - "itertools 0.12.1", + "itertools 0.14.0", "serde", "tantivy-bitpacker", "tantivy-common", @@ -15080,8 +15199,8 @@ dependencies = [ [[package]] name = "tantivy-common" -version = "0.7.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=7502370#7502370b68e6822a687ee071660e350b67808533" +version = "0.10.0" +source = "git+https://github.com/datafuse-extras/tantivy?rev=9065a4d#9065a4de248d7b077560dd3602e0ced82471d8b5" dependencies = [ "async-trait", "byteorder", @@ -15103,8 +15222,8 @@ dependencies = [ [[package]] name = "tantivy-jieba" -version = "0.11.0" -source = "git+https://github.com/datafuse-extras/tantivy-jieba?rev=0e300e9#0e300e9085651b7e6659dfcc7b0ea0fa9cab09c2" +version = "0.17.0" +source = "git+https://github.com/datafuse-extras/tantivy-jieba?rev=ac27464#ac27464d5d2f35320b83cd7cb66df68052d9bc18" dependencies = [ "jieba-rs", "lazy_static", @@ -15113,17 +15232,23 @@ dependencies = [ [[package]] name = "tantivy-query-grammar" -version = "0.22.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=7502370#7502370b68e6822a687ee071660e350b67808533" +version = "0.25.0" +source = "git+https://github.com/datafuse-extras/tantivy?rev=9065a4d#9065a4de248d7b077560dd3602e0ced82471d8b5" dependencies = [ + "fnv", "nom 7.1.3", + "ordered-float 5.0.0", + "serde", + "serde_json", ] [[package]] name = "tantivy-sstable" -version = "0.3.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=7502370#7502370b68e6822a687ee071660e350b67808533" +version = "0.6.0" +source = "git+https://github.com/datafuse-extras/tantivy?rev=9065a4d#9065a4de248d7b077560dd3602e0ced82471d8b5" dependencies = [ + "futures-util", + "itertools 0.14.0", "tantivy-bitpacker", "tantivy-common", "tantivy-fst", @@ -15132,8 +15257,8 @@ dependencies = [ [[package]] name = "tantivy-stacker" -version = "0.3.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=7502370#7502370b68e6822a687ee071660e350b67808533" +version = "0.6.0" +source = "git+https://github.com/datafuse-extras/tantivy?rev=9065a4d#9065a4de248d7b077560dd3602e0ced82471d8b5" dependencies = [ "murmurhash32", "rand_distr", @@ -15142,8 +15267,8 @@ dependencies = [ [[package]] name = "tantivy-tokenizer-api" -version = "0.3.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=7502370#7502370b68e6822a687ee071660e350b67808533" +version = "0.6.0" +source = "git+https://github.com/datafuse-extras/tantivy?rev=9065a4d#9065a4de248d7b077560dd3602e0ced82471d8b5" dependencies = [ "serde", ] @@ -15227,7 +15352,7 @@ checksum = "ae861f7d521762a2e5524ceeb3a518fab2c06c25e217a1d7270b8c5e158c141b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -15294,7 +15419,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -15305,7 +15430,7 @@ checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -15455,7 +15580,7 @@ checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -15688,7 +15813,7 @@ dependencies = [ "prost-build", "prost-types", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -15771,7 +15896,7 @@ checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -15926,7 +16051,7 @@ checksum = "f9534daa9fd3ed0bd911d462a37f172228077e7abf18c18a5f67199d959205f8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -15937,7 +16062,7 @@ checksum = "3c36781cc0e46a83726d9879608e4cf6c2505237e263a8eb8c24502989cfdb28" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -15973,7 +16098,7 @@ checksum = "35f5380909ffc31b4de4f4bdf96b877175a016aa2ca98cee39fcfd8c4d53d952" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -16452,7 +16577,7 @@ dependencies = [ "log", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", "wasm-bindgen-shared", ] @@ -16487,7 +16612,7 @@ checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -16665,7 +16790,7 @@ dependencies = [ "anyhow", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", "wasmtime-component-util", "wasmtime-wit-bindgen", "wit-parser", @@ -16781,7 +16906,7 @@ checksum = "df09be00c38f49172ca9936998938476e3f2df782673a39ae2ef9fb0838341b6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -16960,7 +17085,7 @@ dependencies = [ "proc-macro2", "quote", "shellexpand", - "syn 2.0.101", + "syn 2.0.106", "witx", ] @@ -16972,7 +17097,7 @@ checksum = "9b8eb1a5783540696c59cefbfc9e52570c2d5e62bd47bdf0bdcef29231879db2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", "wiggle-generate", ] @@ -17114,7 +17239,7 @@ checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -17125,7 +17250,7 @@ checksum = "a47fddd13af08290e67f4acabf4b459f647552718f683a7b415d290ac744a836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -17136,7 +17261,7 @@ checksum = "29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -17147,7 +17272,7 @@ checksum = "bd9211b69f8dcdfa817bfd14bf1c97c9188afa36f4750130fcdf3f400eca9fa8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -17684,7 +17809,7 @@ checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", "synstructure", ] @@ -17711,7 +17836,7 @@ checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -17731,7 +17856,7 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", "synstructure", ] @@ -17752,7 +17877,7 @@ checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] @@ -17785,7 +17910,7 @@ checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.106", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 41eb259989423..328b61ac2f79f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -506,10 +506,10 @@ strum = "0.24.1" sub-cache = "0.2.1" sys-info = "0.9" sysinfo = "0.34.2" -tantivy = "0.22.0" -tantivy-common = "0.7.0" +tantivy = "0.25.0" +tantivy-common = "0.10.0" tantivy-fst = "0.5" -tantivy-jieba = "0.11.0" +tantivy-jieba = "0.17.0" temp-env = "0.3.0" tempfile = "3.4.0" terminal_size = "0.4.2" @@ -665,8 +665,8 @@ recursive = { git = "https://github.com/datafuse-extras/recursive.git", rev = "1 sled = { git = "https://github.com/datafuse-extras/sled", tag = "v0.34.7-datafuse.1" } state-machine-api = { git = "https://github.com/databendlabs/state-machine-api.git", tag = "v0.3.4" } sub-cache = { git = "https://github.com/databendlabs/sub-cache", tag = "v0.2.1" } -tantivy = { git = "https://github.com/datafuse-extras/tantivy", rev = "7502370" } -tantivy-common = { git = "https://github.com/datafuse-extras/tantivy", rev = "7502370", package = "tantivy-common" } -tantivy-jieba = { git = "https://github.com/datafuse-extras/tantivy-jieba", rev = "0e300e9" } +tantivy = { git = "https://github.com/datafuse-extras/tantivy", rev = "9065a4d" } +tantivy-common = { git = "https://github.com/datafuse-extras/tantivy", rev = "9065a4d", package = "tantivy-common" } +tantivy-jieba = { git = "https://github.com/datafuse-extras/tantivy-jieba", rev = "ac27464" } watcher = { git = "https://github.com/databendlabs/watcher", tag = "v0.4.2" } xorfilter-rs = { git = "https://github.com/datafuse-extras/xorfilter", tag = "databend-alpha.4" } diff --git a/src/common/metrics/src/metrics/storage.rs b/src/common/metrics/src/metrics/storage.rs index 8059be0b39645..f8606b4fb7a14 100644 --- a/src/common/metrics/src/metrics/storage.rs +++ b/src/common/metrics/src/metrics/storage.rs @@ -174,6 +174,8 @@ static BLOCK_INVERTED_INDEX_GENERATE_MILLISECONDS: LazyLock = LazyLoc static BLOCK_INVERTED_INDEX_READ_MILLISECONDS: LazyLock = LazyLock::new(|| { register_histogram_in_milliseconds("fuse_block_inverted_index_read_milliseconds") }); +static BLOCK_INVERTED_INDEX_READ_BYTES: LazyLock = + LazyLock::new(|| register_counter("fuse_block_inverted_index_read_bytes")); static BLOCK_INVERTED_INDEX_SEARCH_MILLISECONDS: LazyLock = LazyLock::new(|| { register_histogram_in_milliseconds("fuse_block_inverted_index_search_milliseconds") }); @@ -602,6 +604,10 @@ pub fn metrics_inc_block_inverted_index_read_milliseconds(c: u64) { BLOCK_INVERTED_INDEX_READ_MILLISECONDS.observe(c as f64); } +pub fn metrics_inc_block_inverted_index_read_bytes(c: u64) { + BLOCK_INVERTED_INDEX_READ_BYTES.inc_by(c); +} + pub fn metrics_inc_block_inverted_index_search_milliseconds(c: u64) { BLOCK_INVERTED_INDEX_SEARCH_MILLISECONDS.observe(c as f64); } diff --git a/src/query/config/src/config.rs b/src/query/config/src/config.rs index 58ba2e397a79f..c31a01a97edbd 100644 --- a/src/query/config/src/config.rs +++ b/src/query/config/src/config.rs @@ -3318,7 +3318,7 @@ pub struct CacheConfig { #[clap( long = "cache-inverted-index-meta-count", value_name = "VALUE", - default_value = "3000" + default_value = "30000" )] pub inverted_index_meta_count: u64, @@ -3326,7 +3326,7 @@ pub struct CacheConfig { #[clap( long = "cache-inverted-index-filter-size", value_name = "VALUE", - default_value = "2147483648" + default_value = "64424509440" )] pub inverted_index_filter_size: u64, diff --git a/src/query/config/src/inner.rs b/src/query/config/src/inner.rs index fd04a81e4d298..bf4def1fcf506 100644 --- a/src/query/config/src/inner.rs +++ b/src/query/config/src/inner.rs @@ -756,8 +756,8 @@ impl Default for CacheConfig { table_bloom_index_filter_size: 2147483648, disk_cache_table_bloom_index_data_size: 0, disk_cache_table_bloom_index_meta_size: 0, - inverted_index_meta_count: 3000, - inverted_index_filter_size: 2147483648, + inverted_index_meta_count: 30000, + inverted_index_filter_size: 64424509440, inverted_index_filter_memory_ratio: 0, vector_index_meta_count: 30000, vector_index_filter_size: 64424509440, diff --git a/src/query/ee/tests/it/inverted_index/index_refresh.rs b/src/query/ee/tests/it/inverted_index/index_refresh.rs index ca6ffdcee4331..d1157f8d48037 100644 --- a/src/query/ee/tests/it/inverted_index/index_refresh.rs +++ b/src/query/ee/tests/it/inverted_index/index_refresh.rs @@ -160,7 +160,7 @@ async fn test_fuse_do_refresh_inverted_index() -> Result<()> { let queries = vec![ ("rust".to_string(), vec![0, 1]), ("java".to_string(), vec![2]), - ("data".to_string(), vec![1, 4, 5]), + ("data".to_string(), vec![4, 1, 5]), ]; for (query_text, ids) in queries.into_iter() { diff --git a/src/query/service/tests/it/storages/testdata/caches_table.txt b/src/query/service/tests/it/storages/testdata/caches_table.txt index ed7132592449b..4400fcd3c111a 100644 --- a/src/query/service/tests/it/storages/testdata/caches_table.txt +++ b/src/query/service/tests/it/storages/testdata/caches_table.txt @@ -1,21 +1,23 @@ ---------- TABLE INFO ------------ DB.Table: 'system'.'caches', Table: caches-table_id:1, ver:0, Engine: SystemCache -------- TABLE CONTENTS ---------- -+-------------+----------------------------------------------+----------+----------+------------+----------+----------+----------+----------+ -| Column 0 | Column 1 | Column 2 | Column 3 | Column 4 | Column 5 | Column 6 | Column 7 | Column 8 | -+-------------+----------------------------------------------+----------+----------+------------+----------+----------+----------+----------+ -| 'test-node' | 'memory_cache_bloom_index_file_meta_data' | 0 | 0 | 3000 | 'count' | 0 | 0 | 0 | -| 'test-node' | 'memory_cache_bloom_index_filter' | 0 | 0 | 2147483648 | 'bytes' | 0 | 0 | 0 | -| 'test-node' | 'memory_cache_column_oriented_segment_info' | 0 | 0 | 1073741824 | 'bytes' | 0 | 0 | 0 | -| 'test-node' | 'memory_cache_compact_segment_info' | 0 | 0 | 1073741824 | 'bytes' | 0 | 0 | 0 | -| 'test-node' | 'memory_cache_iceberg_table' | 0 | 0 | 1024 | 'count' | 0 | 0 | 0 | -| 'test-node' | 'memory_cache_inverted_index_file' | 0 | 0 | 2147483648 | 'bytes' | 0 | 0 | 0 | -| 'test-node' | 'memory_cache_inverted_index_file_meta_data' | 0 | 0 | 3000 | 'count' | 0 | 0 | 0 | -| 'test-node' | 'memory_cache_parquet_meta_data' | 0 | 0 | 3000 | 'count' | 0 | 0 | 0 | -| 'test-node' | 'memory_cache_prune_partitions' | 0 | 0 | 256 | 'count' | 0 | 0 | 0 | -| 'test-node' | 'memory_cache_segment_statistics' | 0 | 0 | 1073741824 | 'bytes' | 0 | 0 | 0 | -| 'test-node' | 'memory_cache_table_snapshot' | 0 | 0 | 256 | 'count' | 0 | 0 | 0 | -| 'test-node' | 'memory_cache_table_statistics' | 0 | 0 | 256 | 'count' | 0 | 0 | 0 | -+-------------+----------------------------------------------+----------+----------+------------+----------+----------+----------+----------+ ++-------------+----------------------------------------------+----------+----------+-------------+----------+----------+----------+----------+ +| Column 0 | Column 1 | Column 2 | Column 3 | Column 4 | Column 5 | Column 6 | Column 7 | Column 8 | ++-------------+----------------------------------------------+----------+----------+-------------+----------+----------+----------+----------+ +| 'test-node' | 'memory_cache_bloom_index_file_meta_data' | 0 | 0 | 3000 | 'count' | 0 | 0 | 0 | +| 'test-node' | 'memory_cache_bloom_index_filter' | 0 | 0 | 2147483648 | 'bytes' | 0 | 0 | 0 | +| 'test-node' | 'memory_cache_column_oriented_segment_info' | 0 | 0 | 1073741824 | 'bytes' | 0 | 0 | 0 | +| 'test-node' | 'memory_cache_compact_segment_info' | 0 | 0 | 1073741824 | 'bytes' | 0 | 0 | 0 | +| 'test-node' | 'memory_cache_iceberg_table' | 0 | 0 | 1024 | 'count' | 0 | 0 | 0 | +| 'test-node' | 'memory_cache_inverted_index_file' | 0 | 0 | 64424509440 | 'bytes' | 0 | 0 | 0 | +| 'test-node' | 'memory_cache_inverted_index_file_meta_data' | 0 | 0 | 30000 | 'count' | 0 | 0 | 0 | +| 'test-node' | 'memory_cache_parquet_meta_data' | 0 | 0 | 3000 | 'count' | 0 | 0 | 0 | +| 'test-node' | 'memory_cache_prune_partitions' | 0 | 0 | 256 | 'count' | 0 | 0 | 0 | +| 'test-node' | 'memory_cache_segment_statistics' | 0 | 0 | 1073741824 | 'bytes' | 0 | 0 | 0 | +| 'test-node' | 'memory_cache_table_snapshot' | 0 | 0 | 256 | 'count' | 0 | 0 | 0 | +| 'test-node' | 'memory_cache_table_statistics' | 0 | 0 | 256 | 'count' | 0 | 0 | 0 | +| 'test-node' | 'memory_cache_vector_index_file' | 0 | 0 | 64424509440 | 'bytes' | 0 | 0 | 0 | +| 'test-node' | 'memory_cache_vector_index_file_meta_data' | 0 | 0 | 30000 | 'count' | 0 | 0 | 0 | ++-------------+----------------------------------------------+----------+----------+-------------+----------+----------+----------+----------+ diff --git a/src/query/service/tests/it/storages/testdata/configs_table_basic.txt b/src/query/service/tests/it/storages/testdata/configs_table_basic.txt index f5172fc9d79c9..3623b3d3f55e7 100644 --- a/src/query/service/tests/it/storages/testdata/configs_table_basic.txt +++ b/src/query/service/tests/it/storages/testdata/configs_table_basic.txt @@ -17,8 +17,8 @@ DB.Table: 'system'.'configs', Table: configs-table_id:1, ver:0, Engine: SystemCo | 'cache' | 'enable_table_meta_cache' | 'true' | '' | | 'cache' | 'iceberg_table_meta_count' | '1024' | '' | | 'cache' | 'inverted_index_filter_memory_ratio' | '0' | '' | -| 'cache' | 'inverted_index_filter_size' | '2147483648' | '' | -| 'cache' | 'inverted_index_meta_count' | '3000' | '' | +| 'cache' | 'inverted_index_filter_size' | '64424509440' | '' | +| 'cache' | 'inverted_index_meta_count' | '30000' | '' | | 'cache' | 'meta_service_ownership_cache' | 'false' | '' | | 'cache' | 'segment_block_metas_count' | '0' | '' | | 'cache' | 'segment_statistics_bytes' | '1073741824' | '' | diff --git a/src/query/storages/common/index/Cargo.toml b/src/query/storages/common/index/Cargo.toml index 2a203af3eb13d..e40720b047164 100644 --- a/src/query/storages/common/index/Cargo.toml +++ b/src/query/storages/common/index/Cargo.toml @@ -22,6 +22,7 @@ bitvec = { workspace = true } bytemuck = { workspace = true } bytes = { workspace = true } cbordata = { workspace = true } +crc32fast = { workspace = true } fastrace = { workspace = true } feistel-permutation-rs = { workspace = true } goldenfile = { workspace = true } diff --git a/src/query/storages/common/index/src/inverted_index.rs b/src/query/storages/common/index/src/inverted_index.rs index 907f24f1ec6f5..1aa0b4847ad47 100644 --- a/src/query/storages/common/index/src/inverted_index.rs +++ b/src/query/storages/common/index/src/inverted_index.rs @@ -50,6 +50,7 @@ use std::path::PathBuf; use std::result; use std::sync::Arc; +use crc32fast::Hasher; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::types::F32; @@ -63,6 +64,7 @@ use levenshtein_automata::Distance; use levenshtein_automata::LevenshteinAutomatonBuilder; use levenshtein_automata::DFA; use log::warn; +use parquet::format::FileMetaData; use roaring::RoaringTreemap; use tantivy::directory::error::DeleteError; use tantivy::directory::error::OpenReadError; @@ -104,6 +106,68 @@ use tantivy_fst::IntoStreamer; use tantivy_fst::Regex; use tantivy_fst::Streamer; +// tantivy version is used to generate the footer data + +// Index major version. +const INDEX_MAJOR_VERSION: u32 = 0; +// Index minor version. +const INDEX_MINOR_VERSION: u32 = 25; +// Index patch version. +const INDEX_PATCH_VERSION: u32 = 0; +// Index format version. +const INDEX_FORMAT_VERSION: u32 = 7; +// The magic byte of the footer to identify corruption +// or an old version of the footer. +const FOOTER_MAGIC_NUMBER: u32 = 1337; + +type CrcHashU32 = u32; + +/// Structure version for the index. +#[derive(Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct Version { + major: u32, + minor: u32, + patch: u32, + index_format_version: u32, +} +/// A Footer is appended every part of data, like tantivy file. +#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)] +struct Footer { + version: Version, + crc: CrcHashU32, +} + +impl Footer { + fn new(crc: CrcHashU32) -> Self { + let version = Version { + major: INDEX_MAJOR_VERSION, + minor: INDEX_MINOR_VERSION, + patch: INDEX_PATCH_VERSION, + index_format_version: INDEX_FORMAT_VERSION, + }; + Footer { version, crc } + } + + fn append_footer(&self, write: &mut W) -> Result<()> { + let footer_payload_len = write.write(serde_json::to_string(&self)?.as_ref())?; + BinarySerializable::serialize(&(footer_payload_len as u32), write)?; + BinarySerializable::serialize(&FOOTER_MAGIC_NUMBER, write)?; + Ok(()) + } +} + +// Build footer for tantivy files. +// Footer is used to check whether the data is valid when open a file. +pub fn build_tantivy_footer(bytes: &[u8]) -> Result> { + let mut hasher = Hasher::new(); + hasher.update(bytes); + let crc = hasher.finalize(); + let footer = Footer::new(crc); + let mut buf = Vec::new(); + footer.append_footer(&mut buf)?; + Ok(buf) +} + fn extract_footer(data: FileSlice) -> Result<(Vec, Vec)> { // The following code is copied from tantivy `CompositeFile::open` function. // extract field number and offsets of each fields. @@ -661,7 +725,7 @@ impl DocIdsCollector { Ok(matched) } else if let Some(boost_query) = query.downcast_ref::() { Self::check_term_fsts_match( - boost_query.query(), + boost_query.underlying_query(), fst_maps, fuzziness, matched_terms, @@ -670,7 +734,7 @@ impl DocIdsCollector { ) } else if let Some(const_query) = query.downcast_ref::() { Self::check_term_fsts_match( - const_query.query(), + const_query.underlying_query(), fst_maps, fuzziness, matched_terms, @@ -708,11 +772,11 @@ impl DocIdsCollector { pub fn collect_phrase_matched_doc_ids( &mut self, query_key: String, - phrase_terms: Vec<(usize, Term)>, + phrase_terms: &[(usize, Term)], prefix_term: Option<(usize, &Vec)>, ) -> Result> { let mut query_term_poses = Vec::with_capacity(phrase_terms.len()); - for (term_pos, term) in &phrase_terms { + for (term_pos, term) in phrase_terms { // term not exist means this phrase in not matched. let Some(term_id) = self.term_reader.term_id(term) else { return Ok(None); @@ -1012,11 +1076,11 @@ impl DocIdsCollector { } } else if let Some(phrase_query) = query.downcast_ref::() { let query_key = format!("{:?}", phrase_query); - let phrase_terms = phrase_query.phrase_terms_with_offsets(); + let phrase_terms = phrase_query.get_phrase_terms_with_offsets(); self.collect_phrase_matched_doc_ids(query_key, phrase_terms, None) } else if let Some(phrase_prefix_query) = query.downcast_ref::() { let query_key = format!("{:?}", phrase_prefix_query); - let phrase_terms = phrase_prefix_query.phrase_terms_with_offsets(); + let phrase_terms = phrase_prefix_query.get_phrase_terms_with_offsets(); let (prefix_term_pos, prefix_term) = phrase_prefix_query.prefix_term_with_offset(); let Some(prefix_term_ids) = prefix_terms.get(&prefix_term) else { @@ -1047,9 +1111,17 @@ impl DocIdsCollector { Ok(None) } } else if let Some(boost_query) = query.downcast_ref::() { - self.collect_matched_doc_ids(boost_query.query(), prefix_terms, fuzziness_terms) + self.collect_matched_doc_ids( + boost_query.underlying_query(), + prefix_terms, + fuzziness_terms, + ) } else if let Some(const_query) = query.downcast_ref::() { - self.collect_matched_doc_ids(const_query.query(), prefix_terms, fuzziness_terms) + self.collect_matched_doc_ids( + const_query.underlying_query(), + prefix_terms, + fuzziness_terms, + ) } else if let Some(_empty_query) = query.downcast_ref::() { Ok(None) } else if let Some(_all_query) = query.downcast_ref::() { @@ -1167,10 +1239,10 @@ impl DocIdsCollector { } Ok(scores) } else if let Some(boost_query) = query.downcast_ref::() { - let boost = boost_query.boost(); - self.calculate_scores(boost_query.query(), doc_ids, Some(boost)) + let boost = boost_query.get_boost(); + self.calculate_scores(boost_query.underlying_query(), doc_ids, Some(boost)) } else if let Some(const_query) = query.downcast_ref::() { - let score = const_query.score(); + let score = const_query.get_const_score(); let scores = vec![F32::from(score); doc_ids.len() as usize]; Ok(scores) } else if let Some(_all_query) = query.downcast_ref::() { @@ -1185,9 +1257,55 @@ impl DocIdsCollector { #[derive(Clone)] pub struct InvertedIndexMeta { + pub version: usize, pub columns: Vec<(String, SingleColumnMeta)>, } +impl TryFrom for InvertedIndexMeta { + type Error = ErrorCode; + + fn try_from(mut meta: FileMetaData) -> std::result::Result { + let rg = meta.row_groups.remove(0); + let mut col_metas = Vec::with_capacity(rg.columns.len()); + for x in &rg.columns { + match &x.meta_data { + Some(chunk_meta) => { + let col_start = + if let Some(dict_page_offset) = chunk_meta.dictionary_page_offset { + dict_page_offset + } else { + chunk_meta.data_page_offset + }; + let col_len = chunk_meta.total_compressed_size; + assert!( + col_start >= 0 && col_len >= 0, + "column start and length should not be negative" + ); + let num_values = chunk_meta.num_values as u64; + let res = SingleColumnMeta { + offset: col_start as u64, + len: col_len as u64, + num_values, + }; + let column_name = chunk_meta.path_in_schema[0].to_owned(); + col_metas.push((column_name, res)); + } + None => { + panic!( + "expecting chunk meta data while converting ThriftFileMetaData to BloomIndexMeta" + ) + } + } + } + col_metas.shrink_to_fit(); + + Ok(Self { + version: 3, + columns: col_metas, + }) + } +} + #[derive(Clone, Debug)] pub struct InvertedIndexFile { pub name: String, diff --git a/src/query/storages/common/index/src/lib.rs b/src/query/storages/common/index/src/lib.rs index cd6cbb69a581f..3ad4d62d94944 100644 --- a/src/query/storages/common/index/src/lib.rs +++ b/src/query/storages/common/index/src/lib.rs @@ -39,6 +39,7 @@ pub use hnsw_index::ScoredPointOffset; pub use hnsw_index::VectorIndexFile; pub use hnsw_index::VectorIndexMeta; pub use index::Index; +pub use inverted_index::build_tantivy_footer; pub use inverted_index::extract_component_fields; pub use inverted_index::extract_fsts; pub use inverted_index::DocIdsCollector; diff --git a/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_loader.rs b/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_loader.rs index f64adab27e756..0ddfd84914e70 100644 --- a/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_loader.rs +++ b/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_loader.rs @@ -18,11 +18,16 @@ use std::ops::Range; use std::sync::Arc; use std::time::Instant; +use arrow::datatypes::Field; +use arrow::datatypes::Fields; +use arrow::datatypes::Schema; use databend_common_base::runtime::GlobalIORuntime; use databend_common_base::runtime::Runtime; use databend_common_base::runtime::TrySpawn; use databend_common_exception::ErrorCode; use databend_common_exception::Result; +use databend_common_expression::Column; +use databend_common_metrics::storage::metrics_inc_block_inverted_index_read_bytes; use databend_common_metrics::storage::metrics_inc_block_inverted_index_read_milliseconds; use databend_storages_common_cache::CacheAccessor; use databend_storages_common_cache::CacheManager; @@ -32,9 +37,15 @@ use databend_storages_common_index::InvertedIndexMeta; use databend_storages_common_io::MergeIOReader; use databend_storages_common_io::ReadSettings; use databend_storages_common_table_meta::meta::SingleColumnMeta; +use databend_storages_common_table_meta::table::TableCompression; use opendal::Operator; +use parquet::arrow::arrow_reader::ParquetRecordBatchReader; +use parquet::arrow::parquet_to_arrow_field_levels; +use parquet::arrow::ArrowSchemaConverter; +use parquet::arrow::ProjectionMask; use crate::index::InvertedIndexFile; +use crate::io::read::block::parquet::RowGroupImplBuilder; use crate::io::MetaReaders; const INDEX_COLUMN_NAMES: [&str; 8] = [ @@ -95,10 +106,9 @@ pub(crate) async fn load_inverted_index_meta( .await? } -/// Loads bytes of each inverted index files -/// read data from cache, or populate cache items if possible +// Used to read inverted index data in old versions; will be removed in the future. #[fastrace::trace] -pub(crate) async fn load_inverted_index_files<'a>( +pub(crate) async fn legacy_load_inverted_index_files<'a>( settings: &ReadSettings, columns: Vec<(String, Range)>, location: &'a str, @@ -126,6 +136,7 @@ pub(crate) async fn load_inverted_index_files<'a>( column_id += 1; } + let mut inverted_bytes_len = 0; if !ranges.is_empty() { let merge_io_result = MergeIOReader::merge_io_read(settings, operator.clone(), location, &ranges).await?; @@ -136,6 +147,7 @@ pub(crate) async fn load_inverted_index_files<'a>( .owner_memory .get_chunk(*chunk_idx, &merge_io_result.block_path)?; let data = chunk.slice(range.clone()).to_vec(); + inverted_bytes_len += data.len(); let (name, cache_key) = names_map.remove(column_id).unwrap(); let file = InvertedIndexFile::create(name, data); @@ -148,18 +160,134 @@ pub(crate) async fn load_inverted_index_files<'a>( // Perf. { + metrics_inc_block_inverted_index_read_bytes(inverted_bytes_len as u64); metrics_inc_block_inverted_index_read_milliseconds(start.elapsed().as_millis() as u64); } Ok(files) } +/// Loads bytes of each inverted index files +/// read data from cache, or populate cache items if possible +#[fastrace::trace] +pub(crate) async fn load_inverted_index_files<'a>( + settings: &ReadSettings, + inverted_index_meta_map: HashMap, + location: &'a str, + operator: &'a Operator, +) -> Result>> { + let start = Instant::now(); + + let mut inverted_index_fields = Vec::with_capacity(inverted_index_meta_map.len()); + for (name, _) in inverted_index_meta_map.iter() { + let field = Field::new(name, arrow::datatypes::DataType::Binary, false); + inverted_index_fields.push(field); + } + + // 1. read column data, first try to read from cache, + // if not exists, fetch from object storage + let mut ranges = Vec::new(); + let mut names_map = HashMap::new(); + let mut inverted_files = Vec::with_capacity(inverted_index_fields.len()); + let inverted_index_file_cache = CacheManager::instance().get_inverted_index_file_cache(); + for (idx, index_field) in inverted_index_fields.iter().enumerate() { + let name = index_field.name(); + let col_meta = inverted_index_meta_map.get(name).unwrap(); + let cache_key = cache_key_of_column(location, name); + if let Some(cache_file) = inverted_index_file_cache.get_sized(&cache_key, col_meta.len) { + inverted_files.push(cache_file); + continue; + } + + // if cache missed, prepare the ranges to be read + let col_range = col_meta.offset..(col_meta.offset + col_meta.len); + + ranges.push((idx as u32, col_range)); + names_map.insert(idx as u32, (name, cache_key)); + } + + let mut inverted_bytes_len = 0; + if !ranges.is_empty() { + // 2. read data from object store. + let merge_io_result = + MergeIOReader::merge_io_read(settings, operator.clone(), location, &ranges).await?; + + let mut raw_column_data = HashMap::with_capacity(ranges.len()); + for (idx, (chunk_idx, range)) in &merge_io_result.columns_chunk_offsets { + let chunk = merge_io_result + .owner_memory + .get_chunk(*chunk_idx, &merge_io_result.block_path)?; + let data = chunk.slice(range.clone()); + + raw_column_data.insert(*idx as usize, data); + } + let mut column_indices = Vec::with_capacity(ranges.len()); + for (idx, _) in &ranges { + column_indices.push(*idx as usize); + } + + let inverted_index_schema = Schema::new(Fields::from(inverted_index_fields.clone())); + let inverted_index_schema_desc = + Arc::new(ArrowSchemaConverter::new().convert(&inverted_index_schema)?); + + // 3. deserialize raw data to inverted index data + let mut builder = RowGroupImplBuilder::new( + 1, + &inverted_index_schema_desc, + TableCompression::Zstd.into(), + ); + + for (idx, column_data) in raw_column_data.into_iter() { + builder.add_column_chunk(idx, column_data); + } + let row_group = Box::new(builder.build()); + let field_levels = parquet_to_arrow_field_levels( + inverted_index_schema_desc.as_ref(), + ProjectionMask::leaves(&inverted_index_schema_desc, column_indices), + None, + )?; + let mut record_reader = ParquetRecordBatchReader::try_new_with_row_groups( + &field_levels, + row_group.as_ref(), + 1, + None, + )?; + let record = record_reader.next().unwrap()?; + assert!(record_reader.next().is_none()); + + for (i, (idx, _)) in ranges.iter().enumerate() { + let (name, cache_key) = names_map.remove(idx).unwrap(); + let inverted_binary = record.column(i).clone(); + let column = Column::from_arrow_rs( + inverted_binary, + &databend_common_expression::types::DataType::Binary, + )?; + inverted_bytes_len += column.memory_size(); + let value = unsafe { column.index_unchecked(0) }; + let bytes = value.as_binary().unwrap(); + let file = InvertedIndexFile::create(name.clone(), bytes.to_vec()); + // add index file to cache + inverted_index_file_cache.insert(cache_key, file.clone()); + inverted_files.push(Arc::new(file)); + } + } + + // Perf. + { + metrics_inc_block_inverted_index_read_bytes(inverted_bytes_len as u64); + metrics_inc_block_inverted_index_read_milliseconds(start.elapsed().as_millis() as u64); + } + + Ok(inverted_files) +} + /// load inverted index directory #[fastrace::trace] pub(crate) async fn load_inverted_index_directory<'a>( settings: &ReadSettings, location: &'a str, operator: &'a Operator, + version: usize, inverted_index_meta_map: HashMap, ) -> Result { // load inverted index files, usually including following eight files: @@ -171,13 +299,20 @@ pub(crate) async fn load_inverted_index_directory<'a>( // 6. term file // 7. meta.json file // 8. .managed.json file - let mut columns = Vec::with_capacity(inverted_index_meta_map.len()); - for (col_name, col_meta) in inverted_index_meta_map { - let col_range = col_meta.offset..(col_meta.offset + col_meta.len); - columns.push((col_name, col_range)); + if version == 1 { + let mut columns = Vec::with_capacity(inverted_index_meta_map.len()); + for (col_name, col_meta) in inverted_index_meta_map { + let col_range = col_meta.offset..(col_meta.offset + col_meta.len); + columns.push((col_name, col_range)); + } + let files = legacy_load_inverted_index_files(settings, columns, location, operator).await?; + // use those files to create inverted index directory + let directory = InvertedIndexDirectory::try_create(files)?; + return Ok(directory); } - let files = load_inverted_index_files(settings, columns, location, operator).await?; + let files = + load_inverted_index_files(settings, inverted_index_meta_map, location, operator).await?; // use those files to create inverted index directory let directory = InvertedIndexDirectory::try_create(files)?; diff --git a/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs b/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs index b9edfde8662d0..6a00462daa912 100644 --- a/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs +++ b/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs @@ -43,8 +43,8 @@ use tantivy_fst::raw::Fst; use crate::index::DocIdsCollector; use crate::index::TermReader; use crate::io::read::inverted_index::inverted_index_loader::cache_key_of_index_columns; +use crate::io::read::inverted_index::inverted_index_loader::legacy_load_inverted_index_files; use crate::io::read::inverted_index::inverted_index_loader::load_inverted_index_directory; -use crate::io::read::inverted_index::inverted_index_loader::load_inverted_index_files; use crate::io::read::inverted_index::inverted_index_loader::load_inverted_index_meta; #[derive(Clone)] @@ -105,17 +105,68 @@ impl InvertedIndexReader { Ok(matched_rows) } - // legacy query search function, using tantivy searcher. - async fn legacy_search( + async fn search( + &self, + settings: &ReadSettings, + index_path: &str, + query: Box, + field_ids: &HashSet, + index_record: &IndexRecordOption, + fuzziness: &Option, + ) -> Result)>>> { + // read index meta. + let inverted_index_meta = load_inverted_index_meta(self.dal.clone(), index_path).await?; + let version = inverted_index_meta.version; + + let inverted_index_meta_map = inverted_index_meta + .columns + .clone() + .into_iter() + .collect::>(); + + // The first and third versions utilize tantivy's search function, + // while the second version employs a custom search function. + if version == 2 { + // To maintain compatibility with legacy data, will be removed in the future + self.custom_search_impl( + settings, + index_path, + query, + field_ids, + index_record, + fuzziness, + inverted_index_meta_map, + ) + .await + } else { + self.tantivy_search_impl( + settings, + index_path, + query, + version, + inverted_index_meta_map, + ) + .await + } + } + + // query search function, using tantivy searcher. + async fn tantivy_search_impl( &self, settings: &ReadSettings, index_path: &str, query: Box, + version: usize, inverted_index_meta_map: HashMap, ) -> Result)>>> { - let directory = - load_inverted_index_directory(settings, index_path, &self.dal, inverted_index_meta_map) - .await?; + let directory = load_inverted_index_directory( + settings, + index_path, + &self.dal, + version, + inverted_index_meta_map, + ) + .await?; let mut index = Index::open(directory)?; index.set_tokenizers(self.tokenizer_manager.clone()); @@ -151,6 +202,8 @@ impl InvertedIndexReader { } } + // Self-developed search function, will be removed in the future + // // Follow the process below to perform the query search: // // 1. Read the `fst` first, check if the term in the query matches. @@ -180,7 +233,7 @@ impl InvertedIndexReader { // If the term matches, the `term_dict` and `postings`, `positions` // data of the related terms need to be read instead of all // the `postings` and `positions` data. - async fn search( + async fn custom_search_impl( &self, settings: &ReadSettings, index_path: &str, @@ -188,26 +241,9 @@ impl InvertedIndexReader { field_ids: &HashSet, index_record: &IndexRecordOption, fuzziness: &Option, + mut inverted_index_meta_map: HashMap, ) -> Result)>>> { - // 1. read index meta. - let inverted_index_meta = load_inverted_index_meta(self.dal.clone(), index_path).await?; - - let mut inverted_index_meta_map = inverted_index_meta - .columns - .clone() - .into_iter() - .collect::>(); - - // if meta contains `meta.json` columns, - // the index file is the first version implementation - // use compatible search function to read. - if inverted_index_meta_map.contains_key("meta.json") { - return self - .legacy_search(settings, index_path, query, inverted_index_meta_map) - .await; - } - - // 2. read fst and term files. + // 1. read fst and term files. let mut columns = Vec::with_capacity(field_ids.len() * 2); for field_id in field_ids { let fst_col_name = format!("fst-{}", field_id); @@ -224,7 +260,7 @@ impl InvertedIndexReader { } let column_files = - load_inverted_index_files(settings, columns, index_path, &self.dal).await?; + legacy_load_inverted_index_files(settings, columns, index_path, &self.dal).await?; let mut column_files_map = column_files .into_iter() .map(|f| (f.name.clone(), f.data.clone())) @@ -252,7 +288,7 @@ impl InvertedIndexReader { fst_maps.insert(*field_id, fst_map); } - // 3. check whether query is matched in the fsts. + // 2. check whether query is matched in the fsts. let mut matched_terms = HashMap::new(); let mut prefix_terms = HashMap::new(); let mut fuzziness_terms = HashMap::new(); @@ -270,7 +306,7 @@ impl InvertedIndexReader { return Ok(None); } - // 4. collect term infos for each terms. + // 3. collect term infos for each terms. let mut term_infos = HashMap::with_capacity(matched_terms.len()); let mut field_term_ids = HashMap::with_capacity(field_ids.len()); for field_id in field_ids { @@ -292,7 +328,7 @@ impl InvertedIndexReader { } } - // 5. read postings and optional positions. + // 4. read postings and optional positions. let mut term_slice_len = if self.need_position { term_infos.len() * 2 } else { @@ -370,7 +406,8 @@ impl InvertedIndexReader { } let slice_column_files = - load_inverted_index_files(settings, slice_columns, index_path, &self.dal).await?; + legacy_load_inverted_index_files(settings, slice_columns, index_path, &self.dal) + .await?; let slice_column_files_map = slice_column_files .into_iter() .map(|f| (f.name.clone(), f.data.clone())) @@ -420,7 +457,7 @@ impl InvertedIndexReader { } } - // 6. collect matched doc ids. + // 5. collect matched doc ids. let term_reader = TermReader::create( self.row_count, self.need_position, diff --git a/src/query/storages/fuse/src/io/read/meta/meta_readers.rs b/src/query/storages/fuse/src/io/read/meta/meta_readers.rs index 33d564862b0a3..07ba4e81668f5 100644 --- a/src/query/storages/fuse/src/io/read/meta/meta_readers.rs +++ b/src/query/storages/fuse/src/io/read/meta/meta_readers.rs @@ -201,6 +201,13 @@ impl Loader for LoaderWrapper { meta.content_length() }; + // read the ThriftFileMetaData, omit unnecessary conversions + if let Ok(meta) = + read_thrift_file_metadata(operator.clone(), ¶ms.location, params.len_hint).await + { + return InvertedIndexMeta::try_from(meta); + } + // read and cache up to DEFAULT_FOOTER_READ_SIZE bytes from the end and process the footer let end_len = std::cmp::min(DEFAULT_FOOTER_READ_SIZE, file_size) as usize; @@ -250,7 +257,10 @@ impl Loader for LoaderWrapper { prev_offset = offset; columns.push((name, column_meta)); } - return Ok(InvertedIndexMeta { columns }); + return Ok(InvertedIndexMeta { + version: 1, + columns, + }); } let schema_len = @@ -279,7 +289,10 @@ impl Loader for LoaderWrapper { columns.push((field.name().clone(), column_meta)); } - Ok(InvertedIndexMeta { columns }) + Ok(InvertedIndexMeta { + version: 2, + columns, + }) } } diff --git a/src/query/storages/fuse/src/io/write/inverted_index_writer.rs b/src/query/storages/fuse/src/io/write/inverted_index_writer.rs index 8cf0b5f2355f0..26acc47c7424e 100644 --- a/src/query/storages/fuse/src/io/write/inverted_index_writer.rs +++ b/src/query/storages/fuse/src/io/write/inverted_index_writer.rs @@ -14,36 +14,35 @@ use std::collections::BTreeMap; use std::collections::HashSet; +use std::path::Path; use std::sync::Arc; use std::time::Instant; -use arrow_ipc::writer::write_message; -use arrow_ipc::writer::IpcDataGenerator; -use arrow_ipc::writer::IpcWriteOptions; -use arrow_schema::Schema as ArrowSchema; use databend_common_exception::ErrorCode; use databend_common_exception::Result; -use databend_common_expression::types::BinaryType; use databend_common_expression::types::DataType; use databend_common_expression::BlockEntry; use databend_common_expression::DataBlock; use databend_common_expression::DataField; use databend_common_expression::DataSchema; use databend_common_expression::DataSchemaRef; +use databend_common_expression::Scalar; use databend_common_expression::ScalarRef; -use databend_common_expression::TableSchema; +use databend_common_expression::TableDataType; +use databend_common_expression::TableField; use databend_common_expression::TableSchemaRef; -use databend_common_expression::Value; +use databend_common_expression::TableSchemaRefExt; use databend_common_io::constants::DEFAULT_BLOCK_BUFFER_SIZE; use databend_common_io::constants::DEFAULT_BLOCK_INDEX_BUFFER_SIZE; use databend_common_meta_app::schema::TableIndexType; use databend_common_meta_app::schema::TableMeta; use databend_common_metrics::storage::metrics_inc_block_inverted_index_generate_milliseconds; -use databend_storages_common_index::extract_component_fields; -use databend_storages_common_index::extract_fsts; +use databend_storages_common_blocks::blocks_to_parquet; use databend_storages_common_table_meta::meta::Location; +use databend_storages_common_table_meta::table::TableCompression; use jsonb::from_raw_jsonb; use jsonb::RawJsonb; +use tantivy::index::SegmentComponent; use tantivy::indexer::UserOperation; use tantivy::schema::Field; use tantivy::schema::IndexRecordOption; @@ -60,12 +59,13 @@ use tantivy::tokenizer::Stemmer; use tantivy::tokenizer::StopWordFilter; use tantivy::tokenizer::TextAnalyzer; use tantivy::tokenizer::TokenizerManager; +use tantivy::Directory; use tantivy::IndexBuilder; use tantivy::IndexSettings; use tantivy::IndexWriter; -use tantivy::SegmentComponent; use tantivy_jieba::JiebaTokenizer; +use crate::index::build_tantivy_footer; use crate::io::TableMetaLocationGenerator; #[derive(Clone)] @@ -183,7 +183,7 @@ impl InvertedIndexWriter { let (index_schema, _) = create_index_schema(schema.clone(), index_options)?; let index_settings = IndexSettings { - sort_by_field: None, + // sort_by_field: None, ..Default::default() }; @@ -221,16 +221,19 @@ impl InvertedIndexWriter { match unsafe { column.index_unchecked(i) } { ScalarRef::String(text) => doc.add_text(field, text), ScalarRef::Variant(jsonb_val) => { - // only support object JSON, other JSON type will not add index. let raw_jsonb = RawJsonb::new(jsonb_val); - if let Ok(obj_val) = - from_raw_jsonb::>(&raw_jsonb) - { - let object: BTreeMap = obj_val - .into_iter() - .map(|(key, value)| (key, OwnedValue::from(value))) - .collect(); - doc.add_object(field, object); + if let Ok(value) = from_raw_jsonb::(&raw_jsonb) { + if value.is_object() { + let owned_value = OwnedValue::from(value); + doc.add_field_value(field, &owned_value); + } else { + // tantivy only support object JSON, + // convert other JSON to object with an empty key. + let owned_value = OwnedValue::from(value); + let mut wrap_owned_value = BTreeMap::new(); + wrap_owned_value.insert("".to_string(), owned_value); + doc.add_object(field, wrap_owned_value); + } } else { doc.add_object(field, BTreeMap::new()); } @@ -255,89 +258,74 @@ impl InvertedIndexWriter { let _ = self.index_writer.run(self.operations); let _ = self.index_writer.commit()?; let index = self.index_writer.index(); + let directory = index.directory(); - let mut fields = Vec::new(); - let mut values = Vec::new(); + let mut index_columns = Vec::with_capacity(8); - let segments = index.searchable_segments()?; - let segment = &segments[0]; - - let termdict_file = segment.open_read(SegmentComponent::Terms)?; - extract_fsts(termdict_file, &mut fields, &mut values)?; + let managed_filepath = Path::new(".managed.json"); + let managed_bytes = directory.atomic_read(managed_filepath)?; + let managed_scalar = Scalar::Binary(managed_bytes); + let managed_block_entry = BlockEntry::new_const_column(DataType::Binary, managed_scalar, 1); + index_columns.push(managed_block_entry); - let field_norms_file = segment.open_read(SegmentComponent::FieldNorms)?; - extract_component_fields("fieldnorm", field_norms_file, &mut fields, &mut values)?; + let meta_filepath = Path::new("meta.json"); + let meta_data = directory.atomic_read(meta_filepath)?; + let meta_string = std::str::from_utf8(&meta_data)?; + let meta_val: serde_json::Value = serde_json::from_str(meta_string)?; + let meta_json: String = serde_json::to_string(&meta_val)?; + let meta_scalar = Scalar::Binary(meta_json.into_bytes()); + let meta_block_entry = BlockEntry::new_const_column(DataType::Binary, meta_scalar, 1); + index_columns.push(meta_block_entry); - let posting_file = segment.open_read(SegmentComponent::Postings)?; - extract_component_fields("idx", posting_file, &mut fields, &mut values)?; - - let position_file = segment.open_read(SegmentComponent::Positions)?; - extract_component_fields("pos", position_file, &mut fields, &mut values)?; + let segments = index.searchable_segments()?; + let segment = &segments[0]; + let components = vec![ + SegmentComponent::FastFields, + SegmentComponent::Store, + SegmentComponent::FieldNorms, + SegmentComponent::Positions, + SegmentComponent::Postings, + SegmentComponent::Terms, + ]; + for component in components { + let component_field = segment.open_read(component)?; + let bytes = component_field.read_bytes()?; + let mut value = bytes.as_slice().to_vec(); + let footer = build_tantivy_footer(&value)?; + value.extend_from_slice(&footer); + + let scalar = Scalar::Binary(value); + let block_entry = BlockEntry::new_const_column(DataType::Binary, scalar, 1); + index_columns.push(block_entry); + } - let inverted_index_schema = TableSchema::new(fields); + let index_fields = vec![ + TableField::new(".managed.json", TableDataType::Binary), + TableField::new("meta.json", TableDataType::Binary), + TableField::new("fast", TableDataType::Binary), + TableField::new("store", TableDataType::Binary), + TableField::new("fieldnorm", TableDataType::Binary), + TableField::new("pos", TableDataType::Binary), + TableField::new("idx", TableDataType::Binary), + TableField::new("term", TableDataType::Binary), + ]; - let index_columns = values - .into_iter() - .map(|v| BlockEntry::new_const_column(DataType::Binary, v, 1)) - .collect(); - let inverted_index_block = DataBlock::new(index_columns, 1); + let index_schema = TableSchemaRefExt::create(index_fields); + let index_block = DataBlock::new(index_columns, 1); let mut data = Vec::with_capacity(DEFAULT_BLOCK_INDEX_BUFFER_SIZE); - block_to_inverted_index(&inverted_index_schema, inverted_index_block, &mut data)?; - Ok(data) - } -} - -// inverted index block include 5 types of data, -// and each of which may have multiple fields. -// 1. `fst` used to check whether a term exist. -// for example: fst-0, fst-1, .. -// 2. `term dict` records the idx and pos locations of each terms. -// for example: term-0, term-1, .. -// 3. `idx` records the doc ids of each terms. -// for example: idx-0, idx-1, .. -// 4. `pos` records the positions of each terms in doc. -// for example: pos-0, pos-1, .. -// 5. `fieldnorms` records the number of tokens in each doc. -// for example: fieldnorms-0, fieldnorms-1, .. -// -// write the value of columns first, -// and then the offsets of columns, -// finally the number of columns. -fn block_to_inverted_index( - table_schema: &TableSchema, - block: DataBlock, - write_buffer: &mut Vec, -) -> Result<()> { - let mut offsets = Vec::with_capacity(block.num_columns()); - for column in block.columns() { - let value: Value = column.value().try_downcast().unwrap(); - write_buffer.extend_from_slice(value.as_scalar().unwrap()); - let offset = write_buffer.len() as u32; - offsets.push(offset); - } + let _ = blocks_to_parquet( + index_schema.as_ref(), + vec![index_block], + &mut data, + // Zstd has the best compression ratio + TableCompression::Zstd, + // Some(metadata), + None, + )?; - // footer: schema + offsets + schema_len + meta_len - let arrow_schema = Arc::new(ArrowSchema::from(table_schema)); - let generator = IpcDataGenerator {}; - let write_options = IpcWriteOptions::default(); - #[allow(deprecated)] - let encoded = generator.schema_to_bytes(&arrow_schema, &write_options); - let mut schema_buf = Vec::new(); - let (schema_len, _) = write_message(&mut schema_buf, encoded, &write_options)?; - write_buffer.extend_from_slice(&schema_buf); - - let schema_len = schema_len as u32; - let offset_len = (offsets.len() * 4) as u32; - for offset in offsets { - write_buffer.extend_from_slice(&offset.to_le_bytes()); + Ok(data) } - let meta_len = schema_len + offset_len + 8; - - write_buffer.extend_from_slice(&schema_len.to_le_bytes()); - write_buffer.extend_from_slice(&meta_len.to_le_bytes()); - - Ok(()) } // Create tokenizer can handle both Chinese and English @@ -357,7 +345,7 @@ pub(crate) fn create_tokenizer_manager( let english_analyzer = TextAnalyzer::builder(SimpleTokenizer::default()) .filter(LowerCaser) .build(); - let chinese_analyzer = TextAnalyzer::builder(JiebaTokenizer {}) + let chinese_analyzer = TextAnalyzer::builder(JiebaTokenizer::new()) .filter(LowerCaser) .build(); @@ -366,7 +354,7 @@ pub(crate) fn create_tokenizer_manager( let mut english_analyzer = TextAnalyzer::builder(SimpleTokenizer::default()).filter_dynamic(LowerCaser); let mut chinese_analyzer = - TextAnalyzer::builder(JiebaTokenizer {}).filter_dynamic(LowerCaser); + TextAnalyzer::builder(JiebaTokenizer::new()).filter_dynamic(LowerCaser); // add optional filters // remove English stop words, like "a", "an", "and", etc. @@ -476,7 +464,9 @@ pub(crate) fn create_index_schema( .set_tokenizer(&tokenizer_name) .set_index_option(index_record); let text_options = TextOptions::default().set_indexing_options(text_field_indexing.clone()); - let json_options = JsonObjectOptions::default().set_indexing_options(text_field_indexing); + let json_options = JsonObjectOptions::default() + .set_indexing_options(text_field_indexing) + .set_fast(None); let mut schema_builder = Schema::builder(); let mut index_fields = Vec::with_capacity(schema.fields.len()); diff --git a/src/query/storages/fuse/src/operations/gc.rs b/src/query/storages/fuse/src/operations/gc.rs index 46d3db9b2cf49..d61456bf6fe18 100644 --- a/src/query/storages/fuse/src/operations/gc.rs +++ b/src/query/storages/fuse/src/operations/gc.rs @@ -28,7 +28,6 @@ use databend_storages_common_cache::CacheAccessor; use databend_storages_common_cache::CachedObject; use databend_storages_common_cache::LoadParams; use databend_storages_common_index::BloomIndexMeta; -use databend_storages_common_index::InvertedIndexFile; use databend_storages_common_index::InvertedIndexMeta; use databend_storages_common_io::Files; use databend_storages_common_table_meta::meta::column_oriented_segment::ColumnOrientedSegment; @@ -43,6 +42,7 @@ use log::error; use log::info; use log::warn; +use crate::index::InvertedIndexFile; use crate::io::read::ColumnOrientedSegmentReader; use crate::io::read::RowOrientedSegmentReader; use crate::io::InvertedIndexReader; diff --git a/src/query/storages/system/src/caches_table.rs b/src/query/storages/system/src/caches_table.rs index 88e9b01d4903e..df15dd2c36f90 100644 --- a/src/query/storages/system/src/caches_table.rs +++ b/src/query/storages/system/src/caches_table.rs @@ -85,6 +85,8 @@ impl SyncSystemTable for CachesTable { let block_meta_cache = cache_manager.get_block_meta_cache(); let inverted_index_meta_cache = cache_manager.get_inverted_index_meta_cache(); let inverted_index_file_cache = cache_manager.get_inverted_index_file_cache(); + let vector_index_meta_cache = cache_manager.get_vector_index_meta_cache(); + let vector_index_file_cache = cache_manager.get_vector_index_file_cache(); let prune_partitions_cache = cache_manager.get_prune_partitions_cache(); let parquet_meta_data_cache = cache_manager.get_parquet_meta_data_cache(); let column_data_cache = cache_manager.get_column_data_cache(); @@ -144,6 +146,14 @@ impl SyncSystemTable for CachesTable { Self::append_row(&inverted_index_file_cache, &local_node, &mut columns); } + if let Some(vector_index_meta_cache) = vector_index_meta_cache { + Self::append_row(&vector_index_meta_cache, &local_node, &mut columns); + } + + if let Some(vector_index_file_cache) = vector_index_file_cache { + Self::append_row(&vector_index_file_cache, &local_node, &mut columns); + } + if let Some(prune_partitions_cache) = prune_partitions_cache { Self::append_row(&prune_partitions_cache, &local_node, &mut columns); } diff --git a/tests/sqllogictests/suites/ee/04_ee_inverted_index/04_0000_inverted_index_base.test b/tests/sqllogictests/suites/ee/04_ee_inverted_index/04_0000_inverted_index_base.test index 960d42b930adf..928e423cbfd33 100644 --- a/tests/sqllogictests/suites/ee/04_ee_inverted_index/04_0000_inverted_index_base.test +++ b/tests/sqllogictests/suites/ee/04_ee_inverted_index/04_0000_inverted_index_base.test @@ -429,33 +429,51 @@ CREATE INVERTED INDEX IF NOT EXISTS idx ON t1(body) tokenizer = 'chinese' statement ok INSERT INTO t1 VALUES -(1, '{"title":"The Psychology of Persuasion","metadata":{"author":"Oliver","publishedDate":"2021-06-15","tags":["psychology","persuasion","behavior"]}}'), -(2, '{"title":"Sustainable Energy Solutions","metadata":{"author":"Pamela","publishedDate":"2023-12-01","tags":["sustainable energy","solutions","environment"]}}'), -(3, '{"title":"The Future of Autonomous Vehicles","metadata":{"author":"Quincy","publishedDate":"2022-05-05","tags":["autonomous vehicles","future","technology"]}}'), -(4, '{"title":"The Role of AI in Customer Service","metadata":{"author":"Rachel","publishedDate":"2021-09-20","tags":["AI","customer service","automation"]}}'), -(5, '{"title":"Internet of Things Applications","metadata":{"author":"Samuel","publishedDate":"2023-12-15","tags":["IoT","applications","technology"]}}'), -(6, '{"title":"人工智能与机器学习","metadata":{"author":"张三","publishedDate":"2023-10-23","tags":["人工智能","机器学习","技术"]}}'), -(7, '{"title":"区块链在金融行业的应用","metadata":{"author":"李四","publishedDate":"2023-09-18","tags":["区块链","金融行业","金融科技"]}}'), -(8, '{"title":"物联网与智能家居","metadata":{"author":"王五","publishedDate":"2023-08-15","tags":["物联网","智能家居","生活"]}}'), -(9, '{"title":"量子计算的未来","metadata":{"author":"赵六","publishedDate":"2023-07-20","tags":["量子计算","未来科技","物理学"]}}'), -(10, '{"title":"网络安全与隐私保护","metadata":{"author":"刘七","publishedDate":"2023-06-25","tags":["网络安全","隐私保护","信息技术"]}}') +(1, '{"title":"The Psychology of Persuasion","metadata":{"author":"Oliver","publishedDate":"2021-06-15","tags":["psychology","persuasion","behavior"],"price":15.44}}'), +(2, '{"title":"Sustainable Energy Solutions","metadata":{"author":"Pamela","publishedDate":"2023-12-01","tags":["sustainable energy","solutions","environment"],"price":63.69}}'), +(3, '{"title":"The Future of Autonomous Vehicles","metadata":{"author":"Quincy","publishedDate":"2022-05-05","tags":["autonomous vehicles","future","technology"],"price":42.92}}'), +(4, '{"title":"The Role of AI in Customer Service","metadata":{"author":"Rachel","publishedDate":"2021-09-20","tags":["AI","customer service","automation"],"price":9.99}}'), +(5, '{"title":"Internet of Things Applications","metadata":{"author":"Samuel","publishedDate":"2023-12-15","tags":["IoT","applications","technology"],"price":69.99}}'), +(6, '{"title":"人工智能与机器学习","metadata":{"author":"张三","publishedDate":"2023-10-23","tags":["人工智能","机器学习","技术"],"price":154.50}}'), +(7, '{"title":"区块链在金融行业的应用","metadata":{"author":"李四","publishedDate":"2023-09-18","tags":["区块链","金融行业","金融科技"],"price":33.90}}'), +(8, '{"title":"物联网与智能家居","metadata":{"author":"王五","publishedDate":"2023-08-15","tags":["物联网","智能家居","生活"],"price":47.70}}'), +(9, '{"title":"量子计算的未来","metadata":{"author":"赵六","publishedDate":"2023-07-20","tags":["量子计算","未来科技","物理学"],"price":29.91}}'), +(10, '{"title":"网络安全与隐私保护","metadata":{"author":"刘七","publishedDate":"2023-06-25","tags":["网络安全","隐私保护","信息技术"],"price":29.00}}') query IFT SELECT id, score(), body FROM t1 WHERE query('body.title:energy') ---- -2 3.2352333 {"metadata":{"author":"Pamela","publishedDate":"2023-12-01","tags":["sustainable energy","solutions","environment"]},"title":"Sustainable Energy Solutions"} +2 3.2427819 {"metadata":{"author":"Pamela","price":63.69,"publishedDate":"2023-12-01","tags":["sustainable energy","solutions","environment"]},"title":"Sustainable Energy Solutions"} query IFT SELECT id, score(), body FROM t1 WHERE query('body.metadata.tags:technology') ---- -3 2.4057739 {"metadata":{"author":"Quincy","publishedDate":"2022-05-05","tags":["autonomous vehicles","future","technology"]},"title":"The Future of Autonomous Vehicles"} -5 2.4057739 {"metadata":{"author":"Samuel","publishedDate":"2023-12-15","tags":["IoT","applications","technology"]},"title":"Internet of Things Applications"} +3 2.411387 {"metadata":{"author":"Quincy","price":42.92,"publishedDate":"2022-05-05","tags":["autonomous vehicles","future","technology"]},"title":"The Future of Autonomous Vehicles"} +5 2.411387 {"metadata":{"author":"Samuel","price":69.99,"publishedDate":"2023-12-15","tags":["IoT","applications","technology"]},"title":"Internet of Things Applications"} query IFT SELECT id, score(), body FROM t1 WHERE query('body.metadata.tags:技术') ---- -6 2.4057739 {"metadata":{"author":"张三","publishedDate":"2023-10-23","tags":["人工智能","机器学习","技术"]},"title":"人工智能与机器学习"} -10 2.4057739 {"metadata":{"author":"刘七","publishedDate":"2023-06-25","tags":["网络安全","隐私保护","信息技术"]},"title":"网络安全与隐私保护"} +6 2.411387 {"metadata":{"author":"张三","price":154.50,"publishedDate":"2023-10-23","tags":["人工智能","机器学习","技术"]},"title":"人工智能与机器学习"} +10 2.411387 {"metadata":{"author":"刘七","price":29.00,"publishedDate":"2023-06-25","tags":["网络安全","隐私保护","信息技术"]},"title":"网络安全与隐私保护"} + +query IFT +SELECT id, score(), body FROM t1 WHERE query('body.metadata.tags: IN [技术 物理学]') +---- +6 1.0 {"metadata":{"author":"张三","price":154.50,"publishedDate":"2023-10-23","tags":["人工智能","机器学习","技术"]},"title":"人工智能与机器学习"} +9 1.0 {"metadata":{"author":"赵六","price":29.91,"publishedDate":"2023-07-20","tags":["量子计算","未来科技","物理学"]},"title":"量子计算的未来"} +10 1.0 {"metadata":{"author":"刘七","price":29.00,"publishedDate":"2023-06-25","tags":["网络安全","隐私保护","信息技术"]},"title":"网络安全与隐私保护"} + +query IFT +SELECT id, score(), body FROM t1 WHERE query('body.metadata.price: [40 TO 60]') +---- +3 1.0 {"metadata":{"author":"Quincy","price":42.92,"publishedDate":"2022-05-05","tags":["autonomous vehicles","future","technology"]},"title":"The Future of Autonomous Vehicles"} +8 1.0 {"metadata":{"author":"王五","price":47.70,"publishedDate":"2023-08-15","tags":["物联网","智能家居","生活"]},"title":"物联网与智能家居"} + +query IFT +SELECT id, score(), body FROM t1 WHERE query('body.metadata.price: [29.91 TO 33.90}') +---- +9 1.0 {"metadata":{"author":"赵六","price":29.91,"publishedDate":"2023-07-20","tags":["量子计算","未来科技","物理学"]},"title":"量子计算的未来"} statement error 1111 ALTER TABLE t1 DROP COLUMN body @@ -473,26 +491,6 @@ idx INVERTED t1(body)tokenizer='chinese' idx1 INVERTED t(content)index_record='"basic"' tokenizer='chinese' idx2 INVERTED books(title, author, description)index_record='"basic"' tokenizer='chinese' -query TII -select name, index_size, inverted_index_size from system.tables where name='t1' and database='test_inverted_index'; ----- -t1 2828 2390 - -query III -select row_count, bloom_filter_size, inverted_index_size from fuse_block('test_inverted_index', 't1') ----- -10 438 2390 - -query IIII -select block_count, row_count, index_size, inverted_index_size from fuse_segment('test_inverted_index', 't1'); ----- -1 10 2828 2390 - -query IIII -select block_count, row_count, index_size, inverted_index_size from fuse_snapshot('test_inverted_index', 't1'); ----- -1 10 2828 2390 - statement ok CREATE TABLE t2 (id int, body string)