From b0dd0a3a5c222b560a82cc3a81d41f60ee02b293 Mon Sep 17 00:00:00 2001 From: Mateusz Gienieczko Date: Sun, 20 Apr 2025 19:01:32 +0200 Subject: [PATCH 1/2] feat: experimental unicode-enabled string matching for avx2 --- Cargo.lock | 305 +++--- .../rsonpath-lib/src/classification/memmem.rs | 17 +- .../src/classification/memmem/avx2_32.rs | 108 ++- .../src/classification/memmem/avx2_64.rs | 135 ++- .../src/classification/memmem/nosimd.rs | 46 +- .../src/classification/memmem/shared.rs | 15 +- .../classification/memmem/shared/mask_32.rs | 32 +- .../classification/memmem/shared/mask_64.rs | 34 +- .../memmem/shared/vector_128.rs | 23 +- .../memmem/shared/vector_256.rs | 23 +- .../src/classification/memmem/sse2_32.rs | 126 ++- .../src/classification/memmem/sse2_64.rs | 145 ++- .../rsonpath-lib/src/classification/simd.rs | 42 +- .../rsonpath-lib/src/engine/head_skipping.rs | 32 +- crates/rsonpath-lib/src/engine/main.rs | 10 +- crates/rsonpath-lib/src/input.rs | 44 +- crates/rsonpath-lib/src/input/borrowed.rs | 49 +- crates/rsonpath-lib/src/input/buffered.rs | 54 +- crates/rsonpath-lib/src/input/mmap.rs | 50 +- crates/rsonpath-lib/src/input/owned.rs | 33 +- crates/rsonpath-lib/src/input/padding.rs | 75 +- crates/rsonpath-lib/src/input/slice.rs | 54 +- crates/rsonpath-lib/src/lib.rs | 2 +- crates/rsonpath-lib/src/string_pattern.rs | 333 ++++++- .../src/string_pattern/matcher.rs | 153 +++ .../src/string_pattern/matcher/avx2_64.rs | 334 +++++++ .../src/string_pattern/matcher/nosimd.rs | 135 +++ .../src/string_pattern/matcher/shared.rs | 115 +++ .../tests/input_implementation_tests.rs | 20 +- ...napshots__ron__jsonpath_example_query.snap | 892 +++++++++++++++++- ...lization_snapshots__ron__readme_query.snap | 96 +- ...ation_snapshots__ron__real_life_query.snap | 561 ++++++++++- ...e_serialization_snapshots__ron__slice.snap | 280 +++++- crates/rsonpath-test/Cargo.toml | 2 +- .../rsonpath-test/documents/toml/escapes.toml | 4 - .../documents/toml/extremely_long_key.toml | 2 - 
.../documents/toml/memchr_trap.toml | 2 - .../documents/toml/quote_escape.toml | 2 - .../documents/toml/unicode_escape.toml | 50 + 39 files changed, 3876 insertions(+), 559 deletions(-) create mode 100644 crates/rsonpath-lib/src/string_pattern/matcher.rs create mode 100644 crates/rsonpath-lib/src/string_pattern/matcher/avx2_64.rs create mode 100644 crates/rsonpath-lib/src/string_pattern/matcher/nosimd.rs create mode 100644 crates/rsonpath-lib/src/string_pattern/matcher/shared.rs create mode 100644 crates/rsonpath-test/documents/toml/unicode_escape.toml diff --git a/Cargo.lock b/Cargo.lock index 8e6afac7..43fea63c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -67,19 +67,20 @@ dependencies = [ [[package]] name = "anstyle-wincon" -version = "3.0.6" +version = "3.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125" +checksum = "ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e" dependencies = [ "anstyle", + "once_cell", "windows-sys 0.59.0", ] [[package]] name = "anyhow" -version = "1.0.95" +version = "1.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04" +checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" [[package]] name = "arbitrary" @@ -98,13 +99,13 @@ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name = "automod" -version = "1.0.14" +version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edf3ee19dbc0a46d740f6f0926bde8c50f02bdbc7b536842da28f6ac56513a8b" +checksum = "ebb4bd301db2e2ca1f5be131c24eb8ebf2d9559bc3744419e93baf8ddea7e670" dependencies = [ "proc-macro2", "quote", - "syn 2.0.93", + "syn 2.0.100", ] [[package]] @@ -151,9 +152,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version 
= "2.6.0" +version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" +checksum = "5c8214115b7bf84099f1309324e63141d4c5d7cc26862f97a0a857dbefe165bd" [[package]] name = "byteorder" @@ -181,9 +182,9 @@ dependencies = [ [[package]] name = "cargo_metadata" -version = "0.19.1" +version = "0.19.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8769706aad5d996120af43197bf46ef6ad0fda35216b4505f926a365a232d924" +checksum = "dd5eb614ed4c27c5d706420e4320fbe3216ab31fa1c33cd8246ac36dae4479ba" dependencies = [ "camino", "cargo-platform", @@ -195,9 +196,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.6" +version = "1.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d6dbb628b8f8555f86d0323c2eb39e3ec81901f4b83e091db8a6a76d316a333" +checksum = "8e3a13707ac958681c13b39b458c073d0d9bc8a22cb1b2f4c8e55eb72c13f362" dependencies = [ "jobserver", "libc", @@ -239,9 +240,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.36" +version = "4.5.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2df961d8c8a0d08aa9945718ccf584145eee3f3aa06cddbeac12933781102e04" +checksum = "eccb054f56cbd38340b380d4a8e69ef1f02f1af43db2f0cc817a4774d80ae071" dependencies = [ "clap_builder", "clap_derive", @@ -249,9 +250,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.36" +version = "4.5.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "132dbda40fb6753878316a489d5a1242a8ef2f0d9e47ba01c951ea8aa7d013a5" +checksum = "efd9466fac8543255d3b1fcad4762c5e116ffe808c8a3043d4263cd4fd4862a2" dependencies = [ "anstream", "anstyle", @@ -269,7 +270,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.93", + "syn 2.0.100", ] [[package]] @@ -309,9 +310,9 @@ dependencies = [ [[package]] name = "console" -version = "0.15.10" +version = "0.15.11" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea3c6ecd8059b57859df5c69830340ed3c41d30e3da0c1cbed90a96ac853041b" +checksum = "054ccb5b10f9f2cbf51eb355ca1d05c2d279ce1804688d0db74b4733a5aeafd8" dependencies = [ "encode_unicode", "libc", @@ -366,15 +367,15 @@ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "crunchy" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" +checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929" [[package]] name = "darling" -version = "0.20.10" +version = "0.20.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989" +checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" dependencies = [ "darling_core", "darling_macro", @@ -382,34 +383,34 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.20.10" +version = "0.20.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5" +checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" dependencies = [ "fnv", "ident_case", "proc-macro2", "quote", "strsim", - "syn 2.0.93", + "syn 2.0.100", ] [[package]] name = "darling_macro" -version = "0.20.10" +version = "0.20.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" +checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" dependencies = [ "darling_core", "quote", - "syn 2.0.93", + "syn 2.0.100", ] [[package]] name = "deranged" -version = "0.3.11" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" +checksum = "9c9e6a11ca8224451684bc0d7d5a7adbf8f2fd6887261a1cfc3c0432f9d4068e" dependencies = [ "powerfmt", ] @@ -422,7 +423,7 @@ checksum = "30542c1ad912e0e3d22a1935c290e12e8a29d704a420177a31faad4a601a0800" dependencies = [ "proc-macro2", "quote", - "syn 2.0.93", + "syn 2.0.100", ] [[package]] @@ -443,7 +444,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.93", + "syn 2.0.100", ] [[package]] @@ -453,7 +454,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ "derive_builder_core", - "syn 2.0.93", + "syn 2.0.100", ] [[package]] @@ -470,7 +471,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.93", + "syn 2.0.100", ] [[package]] @@ -481,9 +482,9 @@ checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" [[package]] name = "either" -version = "1.13.0" +version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" [[package]] name = "encode_unicode" @@ -493,15 +494,15 @@ checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" [[package]] name = "equivalent" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" [[package]] name = "errno" -version = "0.3.10" +version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d" +checksum = 
"976dd42dc7e85965fe702eb8164f21f450704bdde31faefd6471dba214cb594e" dependencies = [ "libc", "windows-sys 0.59.0", @@ -572,6 +573,18 @@ dependencies = [ "wasi 0.11.0+wasi-snapshot-preview1", ] +[[package]] +name = "getrandom" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73fea8450eea4bac3940448fb7ae50d91f034f941199fcd9d909a5a07aa455f0" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasi 0.14.2+wasi-0.2.4", +] + [[package]] name = "gimli" version = "0.28.1" @@ -584,7 +597,7 @@ version = "0.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5220b8ba44c68a9a7f7a7659e864dd73692e417ef0211bea133c7b74e031eeb9" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.9.0", "libc", "libgit2-sys", "log", @@ -599,9 +612,9 @@ checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" [[package]] name = "half" -version = "2.4.1" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" +checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9" dependencies = [ "cfg-if", "crunchy", @@ -621,9 +634,9 @@ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] name = "humantime" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" +checksum = "9b112acc8b3adf4b107a8ec20977da0273a8c386765a3ec0229bd500a1443f9f" [[package]] name = "humantime-serde" @@ -676,9 +689,9 @@ dependencies = [ [[package]] name = "icu_locid_transform_data" -version = "1.5.0" +version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e" +checksum = "7515e6d781098bf9f7205ab3fc7e9709d34554ae0b21ddbcb5febfa4bc7df11d" [[package]] name 
= "icu_normalizer" @@ -700,9 +713,9 @@ dependencies = [ [[package]] name = "icu_normalizer_data" -version = "1.5.0" +version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516" +checksum = "c5e8338228bdc8ab83303f16b797e177953730f601a96c25d10cb3ab0daa0cb7" [[package]] name = "icu_properties" @@ -721,9 +734,9 @@ dependencies = [ [[package]] name = "icu_properties_data" -version = "1.5.0" +version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569" +checksum = "85fb8799753b75aee8d2a21d7c14d9f38921b54b3dbda10f5a3c7a7b82dba5e2" [[package]] name = "icu_provider" @@ -750,7 +763,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.93", + "syn 2.0.100", ] [[package]] @@ -788,9 +801,9 @@ checksum = "ce23b50ad8242c51a442f3ff322d56b02f08852c77e4c0b4d3fd684abc89c683" [[package]] name = "indexmap" -version = "2.7.0" +version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f" +checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" dependencies = [ "equivalent", "hashbrown", @@ -828,16 +841,17 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.14" +version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" [[package]] name = "jobserver" -version = "0.1.32" +version = "0.1.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" +checksum = 
"38f262f097c174adebe41eb73d66ae9c06b2844fb0da69969647bbddd9b0538a" dependencies = [ + "getrandom 0.3.2", "libc", ] @@ -849,9 +863,9 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "libc" -version = "0.2.169" +version = "0.2.172" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" +checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" [[package]] name = "libgit2-sys" @@ -871,16 +885,16 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.9.0", "libc", "redox_syscall", ] [[package]] name = "libz-sys" -version = "1.1.20" +version = "1.1.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2d16453e800a8cf6dd2fc3eb4bc99b786a9b90c663b8559a5b1a041bf89e472" +checksum = "8b70e7a7df205e92a1a4cd9aaae7898dac0aa555503cc0a649494d0d60e7651d" dependencies = [ "cc", "libc", @@ -896,15 +910,15 @@ checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f" [[package]] name = "linux-raw-sys" -version = "0.4.14" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" +checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" [[package]] name = "litemap" -version = "0.7.4" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104" +checksum = "23fb14cb19457329c82206317a5663005a4d404783dc74f4252769b0d5f42856" [[package]] name = "log" @@ -992,9 +1006,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.20.2" +version = "1.21.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" [[package]] name = "os_pipe" @@ -1032,29 +1046,29 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "pin-project" -version = "1.1.8" +version = "1.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e2ec53ad785f4d35dac0adea7f7dc6f1bb277ad84a680c7afefeae05d1f5916" +checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.1.8" +version = "1.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d56a66c0c55993aa927429d0f8a0abfd74f084e4d9c192cffed01e418d83eefb" +checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" dependencies = [ "proc-macro2", "quote", - "syn 2.0.93", + "syn 2.0.100", ] [[package]] name = "pkg-config" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" [[package]] name = "powerfmt" @@ -1064,9 +1078,9 @@ checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" [[package]] name = "ppv-lite86" -version = "0.2.20" +version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" dependencies = [ "zerocopy", ] @@ -1083,9 +1097,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.94" +version = "1.0.95" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" dependencies = [ "unicode-ident", ] @@ -1098,7 +1112,7 @@ checksum = "14cae93065090804185d3b75f0bf93b8eeda30c7a9b4a33d3bdb3988d6229e50" dependencies = [ "bit-set", "bit-vec", - "bitflags 2.6.0", + "bitflags 2.9.0", "lazy_static", "num-traits", "rand 0.8.5", @@ -1125,6 +1139,12 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r-efi" +version = "5.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" + [[package]] name = "rand" version = "0.7.3" @@ -1227,11 +1247,11 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.5.8" +version = "0.5.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834" +checksum = "d2f103c6d277498fbceb16e84d317e2a400f160f46904d5f5410848c829511a3" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.9.0", ] [[package]] @@ -1416,11 +1436,11 @@ checksum = "a39e0e9135d7a7208ee80aa4e3e4b88f0f5ad7be92153ed70686c38a03db2e63" [[package]] name = "rustix" -version = "0.38.42" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f93dc38ecbab2eb790ff964bb77fa94faf256fd3e73285fd7ba0903b76bedb85" +checksum = "d97817398dd4bb2e6da002002db259209759911da105da92bec29ccb12cf58bf" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.9.0", "errno", "libc", "linux-raw-sys", @@ -1429,9 +1449,9 @@ dependencies = [ [[package]] name = "rustversion" -version = "1.0.19" +version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7c45b9784283f1b2e7fb61b42047c2fd678ef0960d4f6f1eba131594cc369d4" +checksum = "eded382c5f5f786b989652c49544c4877d9f015cc22e145a5ea8ea66c2921cd2" [[package]] name = "rusty-fork" @@ -1447,9 +1467,9 @@ 
dependencies = [ [[package]] name = "ryu" -version = "1.0.18" +version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" [[package]] name = "same-file" @@ -1462,9 +1482,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.24" +version = "1.0.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cb6eb87a131f756572d7fb904f6e7b68633f09cca868c5df1c4b8d1a694bbba" +checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0" dependencies = [ "serde", ] @@ -1486,7 +1506,7 @@ checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" dependencies = [ "proc-macro2", "quote", - "syn 2.0.93", + "syn 2.0.100", ] [[package]] @@ -1518,9 +1538,9 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "similar" -version = "2.6.0" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1de1d4f81173b03af4c0cbed3c898f6bff5b870e4a7f5d6f4057d62a7a4b686e" +checksum = "bbbb5d9659141646ae647b42fe094daf6c6192d1620870b449d9557f748b2daa" [[package]] name = "simple_logger" @@ -1605,9 +1625,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.93" +version = "2.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c786062daee0d6db1132800e623df74274a0a87322d8e183338e01b3d98d058" +checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0" dependencies = [ "proc-macro2", "quote", @@ -1622,17 +1642,17 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.93", + "syn 2.0.100", ] [[package]] name = "tempfile" -version = "3.14.0" +version = "3.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum 
= "28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c" +checksum = "7437ac7763b9b123ccf33c338a5cc1bac6f69b45a136c19bdd8a65e3916435bf" dependencies = [ - "cfg-if", "fastrand", + "getrandom 0.3.2", "once_cell", "rustix", "windows-sys 0.59.0", @@ -1640,9 +1660,9 @@ dependencies = [ [[package]] name = "terminal_size" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5352447f921fda68cf61b4101566c0bdb5104eff6804d0678e5227580ab6a4e9" +checksum = "45c6481c4829e4cc63825e62c49186a34538b7b2750b73b266581ffb612fb5ed" dependencies = [ "rustix", "windows-sys 0.59.0", @@ -1666,7 +1686,7 @@ dependencies = [ "cfg-if", "proc-macro2", "quote", - "syn 2.0.93", + "syn 2.0.100", ] [[package]] @@ -1677,7 +1697,7 @@ checksum = "5c89e72a01ed4c579669add59014b9a524d609c0c88c6a585ce37485879f6ffb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.93", + "syn 2.0.100", "test-case-core", ] @@ -1698,14 +1718,14 @@ checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.93", + "syn 2.0.100", ] [[package]] name = "time" -version = "0.3.37" +version = "0.3.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35e7868883861bd0e56d9ac6efcaaca0d6d5d82a2a7ec8209ff492c07cf37b21" +checksum = "8a7619e19bc266e0f9c5e6686659d394bc57973859340060a69221e57dbc0c40" dependencies = [ "deranged", "itoa", @@ -1720,15 +1740,15 @@ dependencies = [ [[package]] name = "time-core" -version = "0.1.2" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" +checksum = "c9e9a38711f559d9e3ce1cdb06dd7c5b8ea546bc90052da6d06bb76da74bb07c" [[package]] name = "time-macros" -version = "0.2.19" +version = "0.2.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"2834e6017e3e5e4b9834939793b282bc03b37a3336245fa820e35e233e2a85de" +checksum = "3526739392ec93fd8b359c8e98514cb3e8e021beb4e5f597b00a0221f8ed8a49" dependencies = [ "num-conv", "time-core", @@ -1804,9 +1824,9 @@ checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" [[package]] name = "unicode-ident" -version = "1.0.14" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" [[package]] name = "unicode-width" @@ -1916,9 +1936,9 @@ dependencies = [ [[package]] name = "wait-timeout" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f200f5b12eb75f8c1ed65abd4b2db8a6e1b138a20de009dacee265a2498f3f6" +checksum = "09ac3b126d3914f9849036f826e054cbabdc8519970b8998ddaf3b5bd3c65f11" dependencies = [ "libc", ] @@ -1945,6 +1965,15 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "wasi" +version = "0.14.2+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3" +dependencies = [ + "wit-bindgen-rt", +] + [[package]] name = "winapi-util" version = "0.1.9" @@ -2095,13 +2124,22 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" -version = "0.7.4" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e97b544156e9bebe1a0ffbc03484fc1ffe3100cbce3ffb17eac35f7cdd7ab36" +checksum = "63d3fcd9bba44b03821e7d699eeee959f3126dcc4aa8e4ae18ec617c2a5cea10" dependencies = [ "memchr", ] +[[package]] +name = "wit-bindgen-rt" +version = "0.39.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" +dependencies = [ + "bitflags 2.9.0", +] + [[package]] name = "write16" version = "1.0.0" @@ -2140,49 +2178,48 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn 2.0.93", + "syn 2.0.100", "synstructure", ] [[package]] name = "zerocopy" -version = "0.7.35" +version = "0.8.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +checksum = "2586fea28e186957ef732a5f8b3be2da217d65c5969d4b1e17f973ebbe876879" dependencies = [ - "byteorder", "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.7.35" +version = "0.8.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +checksum = "a996a8f63c5c4448cd959ac1bab0aaa3306ccfd060472f85943ee0750f0169be" dependencies = [ "proc-macro2", "quote", - "syn 2.0.93", + "syn 2.0.100", ] [[package]] name = "zerofrom" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e" +checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" dependencies = [ "zerofrom-derive", ] [[package]] name = "zerofrom-derive" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" +checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", - "syn 2.0.93", + "syn 2.0.100", "synstructure", ] @@ -2205,5 +2242,5 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", 
"quote", - "syn 2.0.93", + "syn 2.0.100", ] diff --git a/crates/rsonpath-lib/src/classification/memmem.rs b/crates/rsonpath-lib/src/classification/memmem.rs index 7ffc9980..9bc0f7c0 100644 --- a/crates/rsonpath-lib/src/classification/memmem.rs +++ b/crates/rsonpath-lib/src/classification/memmem.rs @@ -3,7 +3,7 @@ use crate::{ input::{error::InputError, Input}, result::InputRecorder, - string_pattern::StringPattern, + string_pattern::{matcher::StringPatternMatcher, StringPattern}, BLOCK_SIZE, }; @@ -16,6 +16,11 @@ pub trait Memmem<'i, 'b, 'r, I: Input, const N: usize> { /// - `start_idx` – index of the start of search, either falling inside `first_block`, /// or at the start of the next block. /// + /// # Returns + /// None if there was no match. + /// Otherwise, `Some((i, j, block))` where `i` and `j` delimit the match exactly, + /// and `block` is the input block in which the start of the match occurred. + /// /// # Errors /// Errors when reading the underlying [`Input`] are propagated. 
fn find_label( @@ -23,7 +28,7 @@ pub trait Memmem<'i, 'b, 'r, I: Input, const N: usize> { first_block: Option>, start_idx: usize, label: &StringPattern, - ) -> Result)>, InputError>; + ) -> Result)>, InputError>; } pub(crate) mod nosimd; @@ -39,19 +44,21 @@ pub(crate) mod sse2_32; pub(crate) mod sse2_64; pub(crate) trait MemmemImpl { - type Classifier<'i, 'b, 'r, I, R>: Memmem<'i, 'b, 'r, I, BLOCK_SIZE> + type Classifier<'i, 'b, 'r, I, SM, R>: Memmem<'i, 'b, 'r, I, BLOCK_SIZE> where I: Input + 'i, + SM: StringPatternMatcher, ::BlockIterator<'i, 'r, R, BLOCK_SIZE>: 'b, R: InputRecorder<::Block<'i, BLOCK_SIZE>> + 'r, 'i: 'r; - fn memmem<'i, 'b, 'r, I, R>( + fn memmem<'i, 'b, 'r, I, SM, R>( input: &'i I, iter: &'b mut ::BlockIterator<'i, 'r, R, BLOCK_SIZE>, - ) -> Self::Classifier<'i, 'b, 'r, I, R> + ) -> Self::Classifier<'i, 'b, 'r, I, SM, R> where I: Input, + SM: StringPatternMatcher, R: InputRecorder<::Block<'i, BLOCK_SIZE>>, 'i: 'r; } diff --git a/crates/rsonpath-lib/src/classification/memmem/avx2_32.rs b/crates/rsonpath-lib/src/classification/memmem/avx2_32.rs index c2b189fd..ca350d8c 100644 --- a/crates/rsonpath-lib/src/classification/memmem/avx2_32.rs +++ b/crates/rsonpath-lib/src/classification/memmem/avx2_32.rs @@ -1,59 +1,68 @@ use super::{shared::mask_32, shared::vector_256, *}; use crate::input::{error::InputErrorConvertible, InputBlockIterator}; +use std::marker::PhantomData; const SIZE: usize = 32; pub(crate) struct Constructor; impl MemmemImpl for Constructor { - type Classifier<'i, 'b, 'r, I, R> - = Avx2MemmemClassifier32<'i, 'b, 'r, I, R> + type Classifier<'i, 'b, 'r, I, SM, R> + = Avx2MemmemClassifier32<'i, 'b, 'r, I, SM, R> where I: Input + 'i, + SM: StringPatternMatcher, ::BlockIterator<'i, 'r, R, BLOCK_SIZE>: 'b, R: InputRecorder<::Block<'i, BLOCK_SIZE>> + 'r, 'i: 'r; - fn memmem<'i, 'b, 'r, I, R>( + fn memmem<'i, 'b, 'r, I, SM, R>( input: &'i I, iter: &'b mut ::BlockIterator<'i, 'r, R, BLOCK_SIZE>, - ) -> Self::Classifier<'i, 'b, 'r, I, R> + ) -> 
Self::Classifier<'i, 'b, 'r, I, SM, R> where I: Input, + SM: StringPatternMatcher, R: InputRecorder<::Block<'i, BLOCK_SIZE>>, 'i: 'r, { - Self::Classifier { input, iter } + Self::Classifier::new(input, iter) } } -pub(crate) struct Avx2MemmemClassifier32<'i, 'b, 'r, I, R> +pub(crate) struct Avx2MemmemClassifier32<'i, 'b, 'r, I, SM, R> where I: Input, R: InputRecorder> + 'r, { input: &'i I, iter: &'b mut I::BlockIterator<'i, 'r, R, SIZE>, + phantom_data: PhantomData, } -impl<'i, 'b, 'r, I, R> Avx2MemmemClassifier32<'i, 'b, 'r, I, R> +impl<'i, 'b, 'r, I, SM, R> Avx2MemmemClassifier32<'i, 'b, 'r, I, SM, R> where I: Input, + SM: StringPatternMatcher, R: InputRecorder>, 'i: 'r, { #[inline] #[allow(dead_code)] pub(crate) fn new(input: &'i I, iter: &'b mut I::BlockIterator<'i, 'r, R, SIZE>) -> Self { - Self { input, iter } + Self { + input, + iter, + phantom_data: PhantomData, + } } #[inline(always)] unsafe fn find_empty( &mut self, - label: &StringPattern, + pattern: &StringPattern, mut offset: usize, - ) -> Result)>, InputError> { + ) -> Result)>, InputError> { let classifier = vector_256::BlockClassifier256::new(b'"', b'"'); let mut previous_block: u32 = 0; @@ -63,12 +72,8 @@ where let mut result = (previous_block | (classified.first << 1)) & classified.second; while result != 0 { let idx = result.trailing_zeros() as usize; - if self - .input - .is_member_match(offset + idx - 1, offset + idx + 1, label) - .e()? - { - return Ok(Some((offset + idx - 1, block))); + if let Some(to) = self.input.pattern_match_from::(offset + idx - 1, pattern).e()? 
{ + return Ok(Some((offset + idx - 1, to, block))); } result &= !(1 << idx); } @@ -86,28 +91,36 @@ where #[inline(always)] unsafe fn find_letter( &mut self, - label: &StringPattern, + pattern: &StringPattern, mut offset: usize, - ) -> Result)>, InputError> { - let classifier = vector_256::BlockClassifier256::new(label.unquoted()[0], b'"'); - let mut previous_block: u32 = 0; + ) -> Result)>, InputError> { + let classifier = vector_256::BlockClassifier256::new(pattern.unquoted()[0], b'"'); + let mut previous_slash: u32 = 0; + let mut previous_first: u32 = 0; + let mut previous_quote: u32 = 0; while let Some(block) = self.iter.next().e()? { let classified = classifier.classify_block(&block); - if let Some(res) = mask_32::find_in_mask( + if let Some((from, to)) = mask_32::find_in_mask::<_, SM>( self.input, - label, - previous_block, + pattern, + previous_slash, + previous_quote, + previous_first, classified.first, classified.second, + classified.slashes, + classified.quotes, offset, )? { - return Ok(Some((res, block))); + return Ok(Some((from, to, block))); } offset += SIZE; - previous_block = classified.first >> (SIZE - 1); + previous_slash = classified.slashes >> (SIZE - 1); + previous_first = classified.first >> (SIZE - 1); + previous_quote = classified.quotes >> (SIZE - 2); } Ok(None) @@ -116,43 +129,52 @@ where #[inline(always)] unsafe fn find_label_avx2( &mut self, - label: &StringPattern, + pattern: &StringPattern, mut offset: usize, - ) -> Result)>, InputError> { - if label.unquoted().is_empty() { - return self.find_empty(label, offset); - } else if label.unquoted().len() == 1 { - return self.find_letter(label, offset); + ) -> Result)>, InputError> { + if pattern.unquoted().is_empty() { + return self.find_empty(pattern, offset); + } else if pattern.unquoted().len() == 1 { + return self.find_letter(pattern, offset); } - let classifier = vector_256::BlockClassifier256::new(label.unquoted()[0], label.unquoted()[1]); - let mut previous_block: u32 = 0; + let 
classifier = vector_256::BlockClassifier256::new(pattern.unquoted()[0], pattern.unquoted()[1]); + let mut previous_slash: u32 = 0; + let mut previous_first: u32 = 0; + let mut previous_quote: u32 = 0; while let Some(block) = self.iter.next().e()? { let classified = classifier.classify_block(&block); - if let Some(res) = mask_32::find_in_mask( + if let Some((from, to)) = mask_32::find_in_mask::<_, SM>( self.input, - label, - previous_block, + pattern, + previous_slash, + previous_quote, + previous_first, classified.first, classified.second, + classified.slashes, + classified.quotes, offset, )? { - return Ok(Some((res, block))); + return Ok(Some((from, to, block))); } offset += SIZE; - previous_block = classified.first >> (SIZE - 1); + previous_slash = classified.slashes >> (SIZE - 1); + previous_first = classified.first >> (SIZE - 1); + previous_quote = classified.quotes >> (SIZE - 2); } Ok(None) } } -impl<'i, 'b, 'r, I, R> Memmem<'i, 'b, 'r, I, SIZE> for Avx2MemmemClassifier32<'i, 'b, 'r, I, R> +impl<'i, 'b, 'r, I, SM, R> Memmem<'i, 'b, 'r, I, SIZE> for Avx2MemmemClassifier32<'i, 'b, 'r, I, SM, R> where I: Input, + SM: StringPatternMatcher, R: InputRecorder>, 'i: 'r, { @@ -161,15 +183,15 @@ where &mut self, first_block: Option>, start_idx: usize, - label: &StringPattern, - ) -> Result)>, InputError> { + pattern: &StringPattern, + ) -> Result)>, InputError> { if let Some(b) = first_block { - if let Some(res) = shared::find_label_in_first_block(self.input, b, start_idx, label)? { + if let Some(res) = shared::find_pattern_in_first_block::<_, SM, SIZE>(self.input, b, start_idx, pattern)? 
{ return Ok(Some(res)); } } let next_block_offset = self.iter.get_offset(); // SAFETY: target feature invariant - unsafe { self.find_label_avx2(label, next_block_offset) } + unsafe { self.find_label_avx2(pattern, next_block_offset) } } } diff --git a/crates/rsonpath-lib/src/classification/memmem/avx2_64.rs b/crates/rsonpath-lib/src/classification/memmem/avx2_64.rs index 36937ad8..7a1c9587 100644 --- a/crates/rsonpath-lib/src/classification/memmem/avx2_64.rs +++ b/crates/rsonpath-lib/src/classification/memmem/avx2_64.rs @@ -1,62 +1,72 @@ use super::{shared::mask_64, shared::vector_256, *}; use crate::{ + bin_u64, classification::mask::m64, input::{error::InputErrorConvertible, InputBlock, InputBlockIterator}, }; +use std::marker::PhantomData; const SIZE: usize = 64; pub(crate) struct Constructor; impl MemmemImpl for Constructor { - type Classifier<'i, 'b, 'r, I, R> - = Avx2MemmemClassifier64<'i, 'b, 'r, I, R> + type Classifier<'i, 'b, 'r, I, SM, R> + = Avx2MemmemClassifier64<'i, 'b, 'r, I, SM, R> where I: Input + 'i, + SM: StringPatternMatcher, ::BlockIterator<'i, 'r, R, BLOCK_SIZE>: 'b, R: InputRecorder<::Block<'i, BLOCK_SIZE>> + 'r, 'i: 'r; - fn memmem<'i, 'b, 'r, I, R>( + fn memmem<'i, 'b, 'r, I, SM, R>( input: &'i I, iter: &'b mut ::BlockIterator<'i, 'r, R, BLOCK_SIZE>, - ) -> Self::Classifier<'i, 'b, 'r, I, R> + ) -> Self::Classifier<'i, 'b, 'r, I, SM, R> where I: Input, + SM: StringPatternMatcher, R: InputRecorder<::Block<'i, BLOCK_SIZE>>, 'i: 'r, { - Self::Classifier { input, iter } + Self::Classifier::new(input, iter) } } -pub(crate) struct Avx2MemmemClassifier64<'i, 'b, 'r, I, R> +pub(crate) struct Avx2MemmemClassifier64<'i, 'b, 'r, I, SM, R> where I: Input, R: InputRecorder> + 'r, { input: &'i I, iter: &'b mut I::BlockIterator<'i, 'r, R, SIZE>, + phantom_data: PhantomData, } -impl<'i, 'b, 'r, I, R> Avx2MemmemClassifier64<'i, 'b, 'r, I, R> +impl<'i, 'b, 'r, I, SM, R> Avx2MemmemClassifier64<'i, 'b, 'r, I, SM, R> where I: Input, + SM: StringPatternMatcher, R: 
InputRecorder>, 'i: 'r, { #[inline] #[allow(dead_code)] pub(crate) fn new(input: &'i I, iter: &'b mut I::BlockIterator<'i, 'r, R, SIZE>) -> Self { - Self { input, iter } + Self { + input, + iter, + phantom_data: PhantomData, + } } #[inline(always)] unsafe fn find_empty( &mut self, - label: &StringPattern, + pattern: &StringPattern, mut offset: usize, - ) -> Result)>, InputError> { + ) -> Result)>, InputError> { let classifier = vector_256::BlockClassifier256::new(b'"', b'"'); let mut previous_block: u64 = 0; @@ -71,12 +81,8 @@ where let mut result = (previous_block | (first_bitmask << 1)) & second_bitmask; while result != 0 { let idx = result.trailing_zeros() as usize; - if self - .input - .is_member_match(offset + idx - 1, offset + idx + 1, label) - .e()? - { - return Ok(Some((offset + idx - 1, block))); + if let Some(to) = self.input.pattern_match_from::(offset + idx - 1, pattern).e()? { + return Ok(Some((offset + idx - 1, to, block))); } result &= !(1 << idx); } @@ -94,11 +100,13 @@ where #[inline(always)] unsafe fn find_letter( &mut self, - label: &StringPattern, + pattern: &StringPattern, mut offset: usize, - ) -> Result)>, InputError> { - let classifier = vector_256::BlockClassifier256::new(label.unquoted()[0], b'"'); - let mut previous_block: u64 = 0; + ) -> Result)>, InputError> { + let classifier = vector_256::BlockClassifier256::new(pattern.unquoted()[0], b'"'); + let mut previous_slash: u64 = 0; + let mut previous_first: u64 = 0; + let mut previous_quote: u64 = 0; while let Some(block) = self.iter.next().e()? { let (block1, block2) = block.halves(); @@ -107,15 +115,36 @@ where let first_bitmask = m64::combine_32(classified1.first, classified2.first); let second_bitmask = m64::combine_32(classified1.second, classified2.second); - - if let Some(res) = - mask_64::find_in_mask(self.input, label, previous_block, first_bitmask, second_bitmask, offset)? 
- { - return Ok(Some((res, block))); + let slash_bitmask = m64::combine_32(classified1.slashes, classified2.slashes); + let quote_bitmask = m64::combine_32(classified1.quotes, classified2.quotes); + + bin_u64!("first_bitmask", first_bitmask); + bin_u64!("second_bitmask", second_bitmask); + bin_u64!("slash_bitmask", slash_bitmask); + bin_u64!("quote_bitmask", quote_bitmask); + bin_u64!("previous_slash", previous_slash); + bin_u64!("previous_first", previous_first); + bin_u64!("previous_quote", previous_quote); + + if let Some((from, to)) = mask_64::find_in_mask::<_, SM>( + self.input, + pattern, + previous_slash, + previous_quote, + previous_first, + first_bitmask, + second_bitmask, + slash_bitmask, + quote_bitmask, + offset, + )? { + return Ok(Some((from, to, block))); } offset += SIZE; - previous_block = first_bitmask >> (SIZE - 1); + previous_slash = (slash_bitmask & (quote_bitmask << 1)) >> (SIZE - 1); + previous_first = (first_bitmask & (quote_bitmask << 1)) >> (SIZE - 1); + previous_quote = quote_bitmask >> (SIZE - 2); } Ok(None) @@ -124,17 +153,19 @@ where #[inline(always)] unsafe fn find_label_avx2( &mut self, - label: &StringPattern, + pattern: &StringPattern, mut offset: usize, - ) -> Result)>, InputError> { - if label.unquoted().is_empty() { - return self.find_empty(label, offset); - } else if label.unquoted().len() == 1 { - return self.find_letter(label, offset); + ) -> Result)>, InputError> { + if pattern.unquoted().is_empty() { + return self.find_empty(pattern, offset); + } else if pattern.unquoted().len() == 1 { + return self.find_letter(pattern, offset); } - let classifier = vector_256::BlockClassifier256::new(label.unquoted()[0], label.unquoted()[1]); - let mut previous_block: u64 = 0; + let classifier = vector_256::BlockClassifier256::new(pattern.unquoted()[0], pattern.unquoted()[1]); + let mut previous_slash: u64 = 0; + let mut previous_first: u64 = 0; + let mut previous_quote: u64 = 0; while let Some(block) = self.iter.next().e()? 
{ let (block1, block2) = block.halves(); @@ -143,24 +174,38 @@ where let first_bitmask = m64::combine_32(classified1.first, classified2.first); let second_bitmask = m64::combine_32(classified1.second, classified2.second); - - if let Some(res) = - mask_64::find_in_mask(self.input, label, previous_block, first_bitmask, second_bitmask, offset)? - { - return Ok(Some((res, block))); + let slash_bitmask = m64::combine_32(classified1.slashes, classified2.slashes); + let quote_bitmask = m64::combine_32(classified1.quotes, classified2.quotes); + + if let Some((from, to)) = mask_64::find_in_mask::<_, SM>( + self.input, + pattern, + previous_slash, + previous_quote, + previous_first, + first_bitmask, + second_bitmask, + slash_bitmask, + quote_bitmask, + offset, + )? { + return Ok(Some((from, to, block))); } offset += SIZE; - previous_block = first_bitmask >> (SIZE - 1); + previous_slash = (slash_bitmask & (quote_bitmask << 1)) >> (SIZE - 1); + previous_first = (first_bitmask & (quote_bitmask << 1)) >> (SIZE - 1); + previous_quote = quote_bitmask >> (SIZE - 2); } Ok(None) } } -impl<'i, 'b, 'r, I, R> Memmem<'i, 'b, 'r, I, SIZE> for Avx2MemmemClassifier64<'i, 'b, 'r, I, R> +impl<'i, 'b, 'r, I, SM, R> Memmem<'i, 'b, 'r, I, SIZE> for Avx2MemmemClassifier64<'i, 'b, 'r, I, SM, R> where I: Input, + SM: StringPatternMatcher, R: InputRecorder>, 'i: 'r, { @@ -169,15 +214,15 @@ where &mut self, first_block: Option>, start_idx: usize, - label: &StringPattern, - ) -> Result)>, InputError> { + pattern: &StringPattern, + ) -> Result)>, InputError> { if let Some(b) = first_block { - if let Some(res) = shared::find_label_in_first_block(self.input, b, start_idx, label)? { + if let Some(res) = shared::find_pattern_in_first_block::<_, SM, SIZE>(self.input, b, start_idx, pattern)? 
{ return Ok(Some(res)); } } let next_block_offset = self.iter.get_offset(); // SAFETY: target feature invariant - unsafe { self.find_label_avx2(label, next_block_offset) } + unsafe { self.find_label_avx2(pattern, next_block_offset) } } } diff --git a/crates/rsonpath-lib/src/classification/memmem/nosimd.rs b/crates/rsonpath-lib/src/classification/memmem/nosimd.rs index 19948437..e2b42170 100644 --- a/crates/rsonpath-lib/src/classification/memmem/nosimd.rs +++ b/crates/rsonpath-lib/src/classification/memmem/nosimd.rs @@ -1,42 +1,55 @@ +use std::marker::PhantomData; + use super::*; -use crate::input::{error::InputErrorConvertible, InputBlockIterator}; +use crate::{ + input::{error::InputErrorConvertible, InputBlockIterator}, + string_pattern::StringPattern, +}; pub(crate) struct Constructor; impl MemmemImpl for Constructor { - type Classifier<'i, 'b, 'r, I, R> - = SequentialMemmemClassifier<'i, 'b, 'r, I, R, BLOCK_SIZE> + type Classifier<'i, 'b, 'r, I, SM, R> + = SequentialMemmemClassifier<'i, 'b, 'r, I, SM, R, BLOCK_SIZE> where I: Input + 'i, + SM: StringPatternMatcher, ::BlockIterator<'i, 'r, R, BLOCK_SIZE>: 'b, R: InputRecorder<::Block<'i, BLOCK_SIZE>> + 'r, 'i: 'r; - fn memmem<'i, 'b, 'r, I, R>( + fn memmem<'i, 'b, 'r, I, SM, R>( input: &'i I, iter: &'b mut ::BlockIterator<'i, 'r, R, BLOCK_SIZE>, - ) -> Self::Classifier<'i, 'b, 'r, I, R> + ) -> Self::Classifier<'i, 'b, 'r, I, SM, R> where I: Input, + SM: StringPatternMatcher, R: InputRecorder<::Block<'i, BLOCK_SIZE>>, 'i: 'r, { - Self::Classifier { input, iter } + Self::Classifier { + input, + iter, + phantom: PhantomData, + } } } -pub(crate) struct SequentialMemmemClassifier<'i, 'b, 'r, I, R, const N: usize> +pub(crate) struct SequentialMemmemClassifier<'i, 'b, 'r, I, SM, R, const N: usize> where I: Input, R: InputRecorder> + 'r, { input: &'i I, iter: &'b mut I::BlockIterator<'i, 'r, R, N>, + phantom: PhantomData, } -impl<'i, 'r, I, R, const N: usize> SequentialMemmemClassifier<'i, '_, 'r, I, R, N> +impl<'i, 'b, 
'r, I, SM, R, const N: usize> SequentialMemmemClassifier<'i, 'b, 'r, I, SM, R, N> where I: Input, + SM: StringPatternMatcher, R: InputRecorder> + 'r, { #[inline] @@ -44,8 +57,7 @@ where &mut self, label: &StringPattern, mut offset: usize, - ) -> Result)>, InputError> { - let label_size = label.quoted().len(); + ) -> Result)>, InputError> { let first_c = if label.unquoted().is_empty() { b'"' } else { @@ -56,8 +68,10 @@ where for (i, c) in block.iter().copied().enumerate() { let j = offset + i; - if c == first_c && j > 0 && self.input.is_member_match(j - 1, j + label_size - 1, label).e()? { - return Ok(Some((j - 1, block))); + if (c == first_c || c == b'\\') && j > 0 { + if let Some(to) = self.input.pattern_match_from::(j - 1, label).e()? { + return Ok(Some((j - 1, to, block))); + } } } @@ -68,9 +82,11 @@ where } } -impl<'i, 'b, 'r, I, R, const N: usize> Memmem<'i, 'b, 'r, I, N> for SequentialMemmemClassifier<'i, 'b, 'r, I, R, N> +impl<'i, 'b, 'r, I, SM, R, const N: usize> Memmem<'i, 'b, 'r, I, N> + for SequentialMemmemClassifier<'i, 'b, 'r, I, SM, R, N> where I: Input, + SM: StringPatternMatcher, R: InputRecorder> + 'r, { // Output the relative offsets @@ -79,9 +95,9 @@ where first_block: Option>, start_idx: usize, label: &StringPattern, - ) -> Result)>, InputError> { + ) -> Result)>, InputError> { if let Some(b) = first_block { - if let Some(res) = shared::find_label_in_first_block(self.input, b, start_idx, label)? { + if let Some(res) = shared::find_pattern_in_first_block::<_, SM, N>(self.input, b, start_idx, label)? 
{ return Ok(Some(res)); } } diff --git a/crates/rsonpath-lib/src/classification/memmem/shared.rs b/crates/rsonpath-lib/src/classification/memmem/shared.rs index adef4b2b..076fb324 100644 --- a/crates/rsonpath-lib/src/classification/memmem/shared.rs +++ b/crates/rsonpath-lib/src/classification/memmem/shared.rs @@ -1,3 +1,4 @@ +use crate::string_pattern::matcher::StringPatternMatcher; use crate::{ input::{ error::{InputError, InputErrorConvertible}, @@ -15,24 +16,26 @@ pub(super) mod vector_128; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub(super) mod vector_256; -pub(crate) fn find_label_in_first_block<'i, 'r, I, const N: usize>( +pub(crate) fn find_pattern_in_first_block<'i, 'r, I, SM, const N: usize>( input: &I, first_block: I::Block<'i, N>, start_idx: usize, - label: &StringPattern, -) -> Result)>, InputError> + pattern: &StringPattern, +) -> Result)>, InputError> where I: Input, + SM: StringPatternMatcher, 'i: 'r, { let block_idx = start_idx % N; - let label_size = label.quoted().len(); for (i, c) in first_block[block_idx..].iter().copied().enumerate() { let j = start_idx + i; - if c == b'"' && input.is_member_match(j, j + label_size, label).e()? { - return Ok(Some((j, first_block))); + if c == b'"' { + if let Some(to) = input.pattern_match_from::(j, pattern).e()? 
{ + return Ok(Some((j, to, first_block))); + } } } diff --git a/crates/rsonpath-lib/src/classification/memmem/shared/mask_32.rs b/crates/rsonpath-lib/src/classification/memmem/shared/mask_32.rs index 6ba662c0..47552e07 100644 --- a/crates/rsonpath-lib/src/classification/memmem/shared/mask_32.rs +++ b/crates/rsonpath-lib/src/classification/memmem/shared/mask_32.rs @@ -3,28 +3,34 @@ use crate::{ error::{InputError, InputErrorConvertible}, Input, }, - string_pattern::StringPattern, + string_pattern::{matcher::StringPatternMatcher, StringPattern}, }; #[inline(always)] -pub(crate) fn find_in_mask( +#[allow(clippy::too_many_arguments)] +pub(crate) fn find_in_mask( input: &I, - label: &StringPattern, - previous_block: u32, + pattern: &StringPattern, + previous_slash: u32, + previous_quote: u32, + previous_first: u32, first: u32, second: u32, + slash: u32, + quotes: u32, offset: usize, -) -> Result, InputError> { - let label_size = label.quoted().len(); - let mut result = (previous_block | (first << 1)) & second; +) -> Result, InputError> { + let slash_override = previous_slash | (slash << 1) | slash; + let first_mask = (first << 1) | previous_first; + let quote_mask = (quotes << 2) | previous_quote; + let character_mask = first_mask & second & quote_mask; + let mut result = slash_override | character_mask; while result != 0 { let idx = result.trailing_zeros() as usize; - if offset + idx > 1 - && input - .is_member_match(offset + idx - 2, offset + idx + label_size - 2, label) - .e()? - { - return Ok(Some(offset + idx - 2)); + if offset + idx > 1 { + if let Some(to) = input.pattern_match_from::(offset + idx - 2, pattern).e()? 
{ + return Ok(Some((offset + idx - 2, to))); + } } result &= !(1 << idx); } diff --git a/crates/rsonpath-lib/src/classification/memmem/shared/mask_64.rs b/crates/rsonpath-lib/src/classification/memmem/shared/mask_64.rs index 97c2f7f5..e98c6208 100644 --- a/crates/rsonpath-lib/src/classification/memmem/shared/mask_64.rs +++ b/crates/rsonpath-lib/src/classification/memmem/shared/mask_64.rs @@ -1,32 +1,36 @@ use crate::{ - debug, input::{ error::{InputError, InputErrorConvertible}, Input, }, - string_pattern::StringPattern, + string_pattern::{matcher::StringPatternMatcher, StringPattern}, }; #[inline(always)] -pub(crate) fn find_in_mask( +#[allow(clippy::too_many_arguments)] +pub(crate) fn find_in_mask( input: &I, - label: &StringPattern, - previous_block: u64, + pattern: &StringPattern, + previous_slash: u64, + previous_quote: u64, + previous_first: u64, first: u64, second: u64, + slash: u64, + quotes: u64, offset: usize, -) -> Result, InputError> { - let label_size = label.quoted().len(); - let mut result = (previous_block | (first << 1)) & second; +) -> Result, InputError> { + let slash_override = previous_slash | (slash << 1) | slash; + let first_mask = (first << 1) | previous_first; + let quote_mask = (quotes << 2) | previous_quote; + let character_mask = first_mask & second & quote_mask; + let mut result = slash_override | character_mask; while result != 0 { let idx = result.trailing_zeros() as usize; - debug!("{offset} + {idx} - 2 to {offset} + {idx} + {label_size} - 3"); - if offset + idx > 1 - && input - .is_member_match(offset + idx - 2, offset + idx + label_size - 2, label) - .e()? - { - return Ok(Some(offset + idx - 2)); + if offset + idx > 1 { + if let Some(to) = input.pattern_match_from::(offset + idx - 2, pattern).e()? 
{ + return Ok(Some((offset + idx - 2, to))); + } } result &= !(1 << idx); } diff --git a/crates/rsonpath-lib/src/classification/memmem/shared/vector_128.rs b/crates/rsonpath-lib/src/classification/memmem/shared/vector_128.rs index e87d4c8a..9eae537c 100644 --- a/crates/rsonpath-lib/src/classification/memmem/shared/vector_128.rs +++ b/crates/rsonpath-lib/src/classification/memmem/shared/vector_128.rs @@ -8,6 +8,16 @@ pub(crate) struct BlockClassifier128 { second: __m128i, } +#[inline(always)] +pub(crate) unsafe fn slash_mask() -> __m128i { + _mm_set1_epi8(b'\\' as i8) +} + +#[inline(always)] +pub(crate) unsafe fn quote_mask() -> __m128i { + _mm_set1_epi8(b'"' as i8) +} + impl BlockClassifier128 { #[target_feature(enable = "sse2")] pub(crate) unsafe fn new(first: u8, second: u8) -> Self { @@ -23,15 +33,26 @@ impl BlockClassifier128 { let first_cmp_vector = _mm_cmpeq_epi8(byte_vector, self.first); let second_cmp_vector = _mm_cmpeq_epi8(byte_vector, self.second); + let slash_cmp_vector = _mm_cmpeq_epi8(byte_vector, slash_mask()); + let quote_cmp_vector = _mm_cmpeq_epi8(byte_vector, quote_mask()); let first = _mm_movemask_epi8(first_cmp_vector) as u16; let second = _mm_movemask_epi8(second_cmp_vector) as u16; + let slashes = _mm_movemask_epi8(slash_cmp_vector) as u16; + let quotes = _mm_movemask_epi8(quote_cmp_vector) as u16; - BlockClassification128 { first, second } + BlockClassification128 { + first, + second, + slashes, + quotes, + } } } pub(crate) struct BlockClassification128 { pub(crate) first: u16, pub(crate) second: u16, + pub(crate) slashes: u16, + pub(crate) quotes: u16, } diff --git a/crates/rsonpath-lib/src/classification/memmem/shared/vector_256.rs b/crates/rsonpath-lib/src/classification/memmem/shared/vector_256.rs index 74018435..aeb356be 100644 --- a/crates/rsonpath-lib/src/classification/memmem/shared/vector_256.rs +++ b/crates/rsonpath-lib/src/classification/memmem/shared/vector_256.rs @@ -8,6 +8,16 @@ pub(crate) struct BlockClassifier256 { second: 
__m256i, } +#[inline(always)] +pub(crate) unsafe fn slash_mask() -> __m256i { + _mm256_set1_epi8(b'\\' as i8) +} + +#[inline(always)] +pub(crate) unsafe fn quote_mask() -> __m256i { + _mm256_set1_epi8(b'"' as i8) +} + impl BlockClassifier256 { #[target_feature(enable = "avx2")] pub(crate) unsafe fn new(first: u8, second: u8) -> Self { @@ -23,15 +33,26 @@ impl BlockClassifier256 { let first_cmp_vector = _mm256_cmpeq_epi8(byte_vector, self.first); let second_cmp_vector = _mm256_cmpeq_epi8(byte_vector, self.second); + let slash_cmp_vector = _mm256_cmpeq_epi8(byte_vector, slash_mask()); + let quote_cmp_vector = _mm256_cmpeq_epi8(byte_vector, quote_mask()); let first = _mm256_movemask_epi8(first_cmp_vector) as u32; let second = _mm256_movemask_epi8(second_cmp_vector) as u32; + let slashes = _mm256_movemask_epi8(slash_cmp_vector) as u32; + let quotes = _mm256_movemask_epi8(quote_cmp_vector) as u32; - BlockClassification256 { first, second } + BlockClassification256 { + first, + second, + slashes, + quotes, + } } } pub(crate) struct BlockClassification256 { pub(crate) first: u32, pub(crate) second: u32, + pub(crate) slashes: u32, + pub(crate) quotes: u32, } diff --git a/crates/rsonpath-lib/src/classification/memmem/sse2_32.rs b/crates/rsonpath-lib/src/classification/memmem/sse2_32.rs index 3df64a94..307e3806 100644 --- a/crates/rsonpath-lib/src/classification/memmem/sse2_32.rs +++ b/crates/rsonpath-lib/src/classification/memmem/sse2_32.rs @@ -3,60 +3,69 @@ use crate::{ classification::mask::m32, input::{error::InputErrorConvertible, InputBlock, InputBlockIterator}, }; +use std::marker::PhantomData; const SIZE: usize = 32; pub(crate) struct Constructor; impl MemmemImpl for Constructor { - type Classifier<'i, 'b, 'r, I, R> - = Sse2MemmemClassifier32<'i, 'b, 'r, I, R> + type Classifier<'i, 'b, 'r, I, SM, R> + = Sse2MemmemClassifier32<'i, 'b, 'r, I, SM, R> where I: Input + 'i, + SM: StringPatternMatcher, ::BlockIterator<'i, 'r, R, BLOCK_SIZE>: 'b, R: InputRecorder<::Block<'i, 
BLOCK_SIZE>> + 'r, 'i: 'r; - fn memmem<'i, 'b, 'r, I, R>( + fn memmem<'i, 'b, 'r, I, SM, R>( input: &'i I, iter: &'b mut ::BlockIterator<'i, 'r, R, BLOCK_SIZE>, - ) -> Self::Classifier<'i, 'b, 'r, I, R> + ) -> Self::Classifier<'i, 'b, 'r, I, SM, R> where I: Input, + SM: StringPatternMatcher, R: InputRecorder<::Block<'i, BLOCK_SIZE>>, 'i: 'r, { - Self::Classifier { input, iter } + Self::Classifier::new(input, iter) } } -pub(crate) struct Sse2MemmemClassifier32<'i, 'b, 'r, I, R> +pub(crate) struct Sse2MemmemClassifier32<'i, 'b, 'r, I, SM, R> where I: Input, R: InputRecorder> + 'r, { input: &'i I, iter: &'b mut I::BlockIterator<'i, 'r, R, SIZE>, + phantom_data: PhantomData, } -impl<'i, 'b, 'r, I, R> Sse2MemmemClassifier32<'i, 'b, 'r, I, R> +impl<'i, 'b, 'r, I, SM, R> Sse2MemmemClassifier32<'i, 'b, 'r, I, SM, R> where I: Input, + SM: StringPatternMatcher, R: InputRecorder>, 'i: 'r, { #[inline] #[allow(dead_code)] pub(crate) fn new(input: &'i I, iter: &'b mut I::BlockIterator<'i, 'r, R, SIZE>) -> Self { - Self { input, iter } + Self { + input, + iter, + phantom_data: PhantomData, + } } #[inline(always)] unsafe fn find_empty( &mut self, - label: &StringPattern, + pattern: &StringPattern, mut offset: usize, - ) -> Result)>, InputError> { + ) -> Result)>, InputError> { let classifier = vector_128::BlockClassifier128::new(b'"', b'"'); let mut previous_block: u32 = 0; @@ -71,12 +80,8 @@ where let mut result = (previous_block | (first_bitmask << 1)) & second_bitmask; while result != 0 { let idx = result.trailing_zeros() as usize; - if self - .input - .is_member_match(offset + idx - 1, offset + idx + 1, label) - .e()? - { - return Ok(Some((offset + idx - 1, block))); + if let Some(to) = self.input.pattern_match_from::(offset + idx - 1, pattern).e()? 
{ + return Ok(Some((offset + idx - 1, to, block))); } result &= !(1 << idx); } @@ -94,11 +99,13 @@ where #[inline(always)] unsafe fn find_letter( &mut self, - label: &StringPattern, + pattern: &StringPattern, mut offset: usize, - ) -> Result)>, InputError> { - let classifier = vector_128::BlockClassifier128::new(label.unquoted()[0], b'"'); - let mut previous_block: u32 = 0; + ) -> Result)>, InputError> { + let classifier = vector_128::BlockClassifier128::new(pattern.unquoted()[0], b'"'); + let mut previous_slash: u32 = 0; + let mut previous_first: u32 = 0; + let mut previous_quote: u32 = 0; while let Some(block) = self.iter.next().e()? { let (block1, block2) = block.halves(); @@ -107,15 +114,28 @@ where let first_bitmask = m32::combine_16(classified1.first, classified2.first); let second_bitmask = m32::combine_16(classified1.second, classified2.second); - - if let Some(res) = - mask_32::find_in_mask(self.input, label, previous_block, first_bitmask, second_bitmask, offset)? - { - return Ok(Some((res, block))); + let slash_bitmask = m32::combine_16(classified1.slashes, classified2.slashes); + let quote_bitmask = m32::combine_16(classified1.quotes, classified2.quotes); + + if let Some((from, to)) = mask_32::find_in_mask::<_, SM>( + self.input, + pattern, + previous_slash, + previous_quote, + previous_first, + first_bitmask, + second_bitmask, + slash_bitmask, + quote_bitmask, + offset, + )? 
{ + return Ok(Some((from, to, block))); } offset += SIZE; - previous_block = first_bitmask >> (SIZE - 1); + previous_slash = (slash_bitmask & (quote_bitmask << 1)) >> (SIZE - 1); + previous_first = (first_bitmask & (quote_bitmask << 1)) >> (SIZE - 1); + previous_quote = quote_bitmask >> (SIZE - 2); } Ok(None) @@ -124,17 +144,19 @@ where #[inline(always)] unsafe fn find_label_sse2( &mut self, - label: &StringPattern, + pattern: &StringPattern, mut offset: usize, - ) -> Result)>, InputError> { - if label.unquoted().is_empty() { - return self.find_empty(label, offset); - } else if label.unquoted().len() == 1 { - return self.find_letter(label, offset); + ) -> Result)>, InputError> { + if pattern.unquoted().is_empty() { + return self.find_empty(pattern, offset); + } else if pattern.unquoted().len() == 1 { + return self.find_letter(pattern, offset); } - let classifier = vector_128::BlockClassifier128::new(label.unquoted()[0], label.unquoted()[1]); - let mut previous_block: u32 = 0; + let classifier = vector_128::BlockClassifier128::new(pattern.unquoted()[0], pattern.unquoted()[1]); + let mut previous_slash: u32 = 0; + let mut previous_first: u32 = 0; + let mut previous_quote: u32 = 0; while let Some(block) = self.iter.next().e()? { let (block1, block2) = block.halves(); @@ -143,24 +165,38 @@ where let first_bitmask = m32::combine_16(classified1.first, classified2.first); let second_bitmask = m32::combine_16(classified1.second, classified2.second); - - if let Some(res) = - mask_32::find_in_mask(self.input, label, previous_block, first_bitmask, second_bitmask, offset)? 
- { - return Ok(Some((res, block))); + let slash_bitmask = m32::combine_16(classified1.slashes, classified2.slashes); + let quote_bitmask = m32::combine_16(classified1.quotes, classified2.quotes); + + if let Some((from, to)) = mask_32::find_in_mask::<_, SM>( + self.input, + pattern, + previous_slash, + previous_quote, + previous_first, + first_bitmask, + second_bitmask, + slash_bitmask, + quote_bitmask, + offset, + )? { + return Ok(Some((from, to, block))); } offset += SIZE; - previous_block = first_bitmask >> (SIZE - 1); + previous_slash = (slash_bitmask & (quote_bitmask << 1)) >> (SIZE - 1); + previous_first = (first_bitmask & (quote_bitmask << 1)) >> (SIZE - 1); + previous_quote = quote_bitmask >> (SIZE - 2); } Ok(None) } } -impl<'i, 'b, 'r, I, R> Memmem<'i, 'b, 'r, I, SIZE> for Sse2MemmemClassifier32<'i, 'b, 'r, I, R> +impl<'i, 'b, 'r, I, SM, R> Memmem<'i, 'b, 'r, I, SIZE> for Sse2MemmemClassifier32<'i, 'b, 'r, I, SM, R> where I: Input, + SM: StringPatternMatcher, R: InputRecorder>, 'i: 'r, { @@ -169,15 +205,15 @@ where &mut self, first_block: Option>, start_idx: usize, - label: &StringPattern, - ) -> Result)>, InputError> { + pattern: &StringPattern, + ) -> Result)>, InputError> { if let Some(b) = first_block { - if let Some(res) = shared::find_label_in_first_block(self.input, b, start_idx, label)? { + if let Some(res) = shared::find_pattern_in_first_block::<_, SM, SIZE>(self.input, b, start_idx, pattern)? 
{ return Ok(Some(res)); } } let next_block_offset = self.iter.get_offset(); // SAFETY: target feature invariant - unsafe { self.find_label_sse2(label, next_block_offset) } + unsafe { self.find_label_sse2(pattern, next_block_offset) } } } diff --git a/crates/rsonpath-lib/src/classification/memmem/sse2_64.rs b/crates/rsonpath-lib/src/classification/memmem/sse2_64.rs index 26775e35..45c84dc3 100644 --- a/crates/rsonpath-lib/src/classification/memmem/sse2_64.rs +++ b/crates/rsonpath-lib/src/classification/memmem/sse2_64.rs @@ -3,52 +3,66 @@ use crate::{ classification::mask::m64, input::{error::InputErrorConvertible, InputBlock, InputBlockIterator}, }; +use std::marker::PhantomData; const SIZE: usize = 64; pub(crate) struct Constructor; impl MemmemImpl for Constructor { - type Classifier<'i, 'b, 'r, I, R> - = Sse2MemmemClassifier64<'i, 'b, 'r, I, R> + type Classifier<'i, 'b, 'r, I, SM, R> + = Sse2MemmemClassifier64<'i, 'b, 'r, I, SM, R> where I: Input + 'i, + SM: StringPatternMatcher, ::BlockIterator<'i, 'r, R, BLOCK_SIZE>: 'b, R: InputRecorder<::Block<'i, BLOCK_SIZE>> + 'r, 'i: 'r; - fn memmem<'i, 'b, 'r, I, R>( + fn memmem<'i, 'b, 'r, I, SM, R>( input: &'i I, iter: &'b mut ::BlockIterator<'i, 'r, R, BLOCK_SIZE>, - ) -> Self::Classifier<'i, 'b, 'r, I, R> + ) -> Self::Classifier<'i, 'b, 'r, I, SM, R> where I: Input, + SM: StringPatternMatcher, R: InputRecorder<::Block<'i, BLOCK_SIZE>>, 'i: 'r, { - Self::Classifier { input, iter } + Self::Classifier { + input, + iter, + phantom_data: PhantomData, + } } } -pub(crate) struct Sse2MemmemClassifier64<'i, 'b, 'r, I, R> +pub(crate) struct Sse2MemmemClassifier64<'i, 'b, 'r, I, SM, R> where I: Input, + SM: StringPatternMatcher, R: InputRecorder> + 'r, { input: &'i I, iter: &'b mut I::BlockIterator<'i, 'r, R, SIZE>, + phantom_data: PhantomData, } -impl<'i, 'b, 'r, I, R> Sse2MemmemClassifier64<'i, 'b, 'r, I, R> +impl<'i, 'b, 'r, I, SM, R> Sse2MemmemClassifier64<'i, 'b, 'r, I, SM, R> where I: Input, + SM: StringPatternMatcher, R: 
InputRecorder>, 'i: 'r, { #[inline] #[allow(dead_code)] pub(crate) fn new(input: &'i I, iter: &'b mut I::BlockIterator<'i, 'r, R, SIZE>) -> Self { - Self { input, iter } + Self { + input, + iter, + phantom_data: PhantomData, + } } #[inline(always)] @@ -56,7 +70,7 @@ where &mut self, label: &StringPattern, mut offset: usize, - ) -> Result)>, InputError> { + ) -> Result)>, InputError> { let classifier = vector_128::BlockClassifier128::new(b'"', b'"'); let mut previous_block: u64 = 0; @@ -83,12 +97,8 @@ where let mut result = (previous_block | (first_bitmask << 1)) & second_bitmask; while result != 0 { let idx = result.trailing_zeros() as usize; - if self - .input - .is_member_match(offset + idx - 1, offset + idx + 1, label) - .e()? - { - return Ok(Some((offset + idx - 1, block))); + if let Some(to) = self.input.pattern_match_from::(offset + idx - 1, label).e()? { + return Ok(Some((offset + idx - 1, to, block))); } result &= !(1 << idx); } @@ -106,11 +116,13 @@ where #[inline(always)] unsafe fn find_letter( &mut self, - label: &StringPattern, + pattern: &StringPattern, mut offset: usize, - ) -> Result)>, InputError> { - let classifier = vector_128::BlockClassifier128::new(label.unquoted()[0], b'"'); - let mut previous_block: u64 = 0; + ) -> Result)>, InputError> { + let classifier = vector_128::BlockClassifier128::new(pattern.unquoted()[0], b'"'); + let mut previous_slash: u64 = 0; + let mut previous_first: u64 = 0; + let mut previous_quote: u64 = 0; while let Some(block) = self.iter.next().e()? 
{ let (block1, block2, block3, block4) = block.quarters(); @@ -131,15 +143,38 @@ where classified3.second, classified4.second, ); + let slash_bitmask = m64::combine_16( + classified1.slashes, + classified2.slashes, + classified3.slashes, + classified4.slashes, + ); + let quote_bitmask = m64::combine_16( + classified1.quotes, + classified2.quotes, + classified3.quotes, + classified4.quotes, + ); - if let Some(res) = - mask_64::find_in_mask(self.input, label, previous_block, first_bitmask, second_bitmask, offset)? - { - return Ok(Some((res, block))); + if let Some((from, to)) = mask_64::find_in_mask::<_, SM>( + self.input, + pattern, + previous_slash, + previous_quote, + previous_first, + first_bitmask, + second_bitmask, + slash_bitmask, + quote_bitmask, + offset, + )? { + return Ok(Some((from, to, block))); } offset += SIZE; - previous_block = first_bitmask >> (SIZE - 1); + previous_slash = (slash_bitmask & (quote_bitmask << 1)) >> (SIZE - 1); + previous_first = (first_bitmask & (quote_bitmask << 1)) >> (SIZE - 1); + previous_quote = quote_bitmask >> (SIZE - 2); } Ok(None) @@ -148,17 +183,19 @@ where #[inline(always)] unsafe fn find_label_sse2( &mut self, - label: &StringPattern, + pattern: &StringPattern, mut offset: usize, - ) -> Result)>, InputError> { - if label.unquoted().is_empty() { - return self.find_empty(label, offset); - } else if label.unquoted().len() == 1 { - return self.find_letter(label, offset); + ) -> Result)>, InputError> { + if pattern.unquoted().is_empty() { + return self.find_empty(pattern, offset); + } else if pattern.unquoted().len() == 1 { + return self.find_letter(pattern, offset); } - let classifier = vector_128::BlockClassifier128::new(label.unquoted()[0], label.unquoted()[1]); - let mut previous_block: u64 = 0; + let classifier = vector_128::BlockClassifier128::new(pattern.unquoted()[0], pattern.unquoted()[1]); + let mut previous_slash: u64 = 0; + let mut previous_first: u64 = 0; + let mut previous_quote: u64 = 0; while let Some(block) = 
self.iter.next().e()? { let (block1, block2, block3, block4) = block.quarters(); @@ -179,24 +216,48 @@ where classified3.second, classified4.second, ); + let slash_bitmask = m64::combine_16( + classified1.slashes, + classified2.slashes, + classified3.slashes, + classified4.slashes, + ); + let quote_bitmask = m64::combine_16( + classified1.quotes, + classified2.quotes, + classified3.quotes, + classified4.quotes, + ); - if let Some(res) = - mask_64::find_in_mask(self.input, label, previous_block, first_bitmask, second_bitmask, offset)? - { - return Ok(Some((res, block))); + if let Some((from, to)) = mask_64::find_in_mask::<_, SM>( + self.input, + pattern, + previous_slash, + previous_quote, + previous_first, + first_bitmask, + second_bitmask, + slash_bitmask, + quote_bitmask, + offset, + )? { + return Ok(Some((from, to, block))); } offset += SIZE; - previous_block = first_bitmask >> (SIZE - 1); + previous_slash = (slash_bitmask & (quote_bitmask << 1)) >> (SIZE - 1); + previous_first = (first_bitmask & (quote_bitmask << 1)) >> (SIZE - 1); + previous_quote = quote_bitmask >> (SIZE - 2); } Ok(None) } } -impl<'i, 'b, 'r, I, R> Memmem<'i, 'b, 'r, I, SIZE> for Sse2MemmemClassifier64<'i, 'b, 'r, I, R> +impl<'i, 'b, 'r, I, SM, R> Memmem<'i, 'b, 'r, I, SIZE> for Sse2MemmemClassifier64<'i, 'b, 'r, I, SM, R> where I: Input, + SM: StringPatternMatcher, R: InputRecorder>, 'i: 'r, { @@ -205,15 +266,15 @@ where &mut self, first_block: Option>, start_idx: usize, - label: &StringPattern, - ) -> Result)>, InputError> { + pattern: &StringPattern, + ) -> Result)>, InputError> { if let Some(b) = first_block { - if let Some(res) = shared::find_label_in_first_block(self.input, b, start_idx, label)? { + if let Some(res) = shared::find_pattern_in_first_block::<_, SM, SIZE>(self.input, b, start_idx, pattern)? 
{ return Ok(Some(res)); } } let next_block_offset = self.iter.get_offset(); // SAFETY: target feature invariant - unsafe { self.find_label_sse2(label, next_block_offset) } + unsafe { self.find_label_sse2(pattern, next_block_offset) } } } diff --git a/crates/rsonpath-lib/src/classification/simd.rs b/crates/rsonpath-lib/src/classification/simd.rs index d8f98945..ea554db3 100644 --- a/crates/rsonpath-lib/src/classification/simd.rs +++ b/crates/rsonpath-lib/src/classification/simd.rs @@ -212,6 +212,7 @@ use super::{ use crate::{ input::{Input, InputBlockIterator}, result::InputRecorder, + string_pattern::matcher::StringPatternMatcher, MaskType, BLOCK_SIZE, }; use cfg_if::cfg_if; @@ -242,6 +243,9 @@ pub(crate) trait Simd: Copy { R: InputRecorder<::Block<'i, BLOCK_SIZE>> + 'r, 'i: 'r; + /// The implementation of [`StringPatternMatcher`] of this SIMD configuration. + type StringPatternMatcher: StringPatternMatcher; + /// Get a unique descriptor of the enabled SIMD capabilities. /// /// The value should correspond to the `const`s defined in [`simd`](`self`), @@ -318,30 +322,31 @@ pub(crate) trait Simd: Copy { 'i: 'r; } -pub(crate) struct ResolvedSimd { - phantom: PhantomData<(Q, S, D, M)>, +pub(crate) struct ResolvedSimd { + phantom: PhantomData<(Q, S, D, M, SM)>, } -impl Clone for ResolvedSimd { +impl Clone for ResolvedSimd { fn clone(&self) -> Self { *self } } -impl Copy for ResolvedSimd {} +impl Copy for ResolvedSimd {} -impl ResolvedSimd { +impl ResolvedSimd { pub(crate) fn new() -> Self { Self { phantom: PhantomData } } } -impl Simd for ResolvedSimd +impl Simd for ResolvedSimd where Q: QuotesImpl, S: StructuralImpl, D: DepthImpl, M: MemmemImpl, + SM: StringPatternMatcher, { type QuotesClassifier<'i, I> = Q::Classifier<'i, I> @@ -359,13 +364,15 @@ where I: InputBlockIterator<'i, BLOCK_SIZE>; type MemmemClassifier<'i, 'b, 'r, I, R> - = M::Classifier<'i, 'b, 'r, I, R> + = M::Classifier<'i, 'b, 'r, I, SM, R> where I: Input + 'i, ::BlockIterator<'i, 'r, R, BLOCK_SIZE>: 'b, 
R: InputRecorder<::Block<'i, BLOCK_SIZE>> + 'r, 'i: 'r; + type StringPatternMatcher = SM; + #[inline(always)] fn dispatch_tag(self) -> usize { TARGET @@ -686,6 +693,7 @@ cfg_if! { $crate::classification::structural::avx2_64::Constructor, $crate::classification::depth::avx2_64::Constructor, $crate::classification::memmem::avx2_64::Constructor, + $crate::string_pattern::matcher::avx2_64::Avx2StringMatcher64, {$crate::classification::simd::AVX2_PCLMULQDQ_POPCNT}, >::new(); $b @@ -699,6 +707,7 @@ cfg_if! { $crate::classification::structural::ssse3_64::Constructor, $crate::classification::depth::sse2_64::Constructor, $crate::classification::memmem::sse2_64::Constructor, + $crate::string_pattern::matcher::nosimd::NosimdStringMatcher, {$crate::classification::simd::SSSE3_PCLMULQDQ_POPCNT}, >::new(); $b @@ -709,6 +718,7 @@ cfg_if! { $crate::classification::structural::ssse3_64::Constructor, $crate::classification::depth::nosimd::Constructor, $crate::classification::memmem::sse2_64::Constructor, + $crate::string_pattern::matcher::nosimd::NosimdStringMatcher, {$crate::classification::simd::SSSE3_PCLMULQDQ}, >::new(); $b @@ -719,6 +729,7 @@ cfg_if! { $crate::classification::structural::ssse3_64::Constructor, $crate::classification::depth::sse2_64::Constructor, $crate::classification::memmem::sse2_64::Constructor, + $crate::string_pattern::matcher::nosimd::NosimdStringMatcher, {$crate::classification::simd::SSSE3_POPCNT}, >::new(); $b @@ -729,6 +740,7 @@ cfg_if! { $crate::classification::structural::ssse3_64::Constructor, $crate::classification::depth::nosimd::Constructor, $crate::classification::memmem::sse2_64::Constructor, + $crate::string_pattern::matcher::nosimd::NosimdStringMatcher, {$crate::classification::simd::SSSE3}, >::new(); $b @@ -745,6 +757,7 @@ cfg_if! 
{ $crate::classification::structural::nosimd::Constructor, $crate::classification::depth::sse2_64::Constructor, $crate::classification::memmem::sse2_64::Constructor, + $crate::string_pattern::matcher::nosimd::NosimdStringMatcher, {$crate::classification::simd::SSE2_PCLMULQDQ_POPCNT}, >::new(); $b @@ -755,6 +768,7 @@ cfg_if! { $crate::classification::structural::nosimd::Constructor, $crate::classification::depth::nosimd::Constructor, $crate::classification::memmem::sse2_64::Constructor, + $crate::string_pattern::matcher::nosimd::NosimdStringMatcher, {$crate::classification::simd::SSE2_PCLMULQDQ}, >::new(); $b @@ -765,6 +779,7 @@ cfg_if! { $crate::classification::structural::nosimd::Constructor, $crate::classification::depth::sse2_64::Constructor, $crate::classification::memmem::sse2_64::Constructor, + $crate::string_pattern::matcher::nosimd::NosimdStringMatcher, {$crate::classification::simd::SSE2_POPCNT}, >::new(); $b @@ -775,6 +790,7 @@ cfg_if! { $crate::classification::structural::nosimd::Constructor, $crate::classification::depth::nosimd::Constructor, $crate::classification::memmem::sse2_64::Constructor, + $crate::string_pattern::matcher::nosimd::NosimdStringMatcher, {$crate::classification::simd::SSE2}, >::new(); $b @@ -788,6 +804,7 @@ cfg_if! { $crate::classification::structural::nosimd::Constructor, $crate::classification::depth::nosimd::Constructor, $crate::classification::memmem::nosimd::Constructor, + $crate::string_pattern::matcher::nosimd::NosimdStringMatcher, {$crate::classification::simd::NOSIMD} >::new(); $b @@ -813,6 +830,7 @@ cfg_if! { $crate::classification::structural::avx2_32::Constructor, $crate::classification::depth::avx2_32::Constructor, $crate::classification::memmem::avx2_32::Constructor, + $crate::string_pattern::matcher::nosimd::NosimdStringMatcher, {$crate::classification::simd::AVX2_PCLMULQDQ_POPCNT}, >::new(); $b @@ -826,6 +844,7 @@ cfg_if! 
{ $crate::classification::structural::ssse3_32::Constructor, $crate::classification::depth::sse2_32::Constructor, $crate::classification::memmem::sse2_32::Constructor, + $crate::string_pattern::matcher::nosimd::NosimdStringMatcher, {$crate::classification::simd::SSSE3_PCLMULQDQ_POPCNT} >::new(); $b @@ -836,6 +855,7 @@ cfg_if! { $crate::classification::structural::ssse3_32::Constructor, $crate::classification::depth::nosimd::Constructor, $crate::classification::memmem::sse2_32::Constructor, + $crate::string_pattern::matcher::nosimd::NosimdStringMatcher, {$crate::classification::simd::SSSE3_PCLMULQDQ} >::new(); $b @@ -846,6 +866,7 @@ cfg_if! { $crate::classification::structural::ssse3_32::Constructor, $crate::classification::depth::sse2_32::Constructor, $crate::classification::memmem::sse2_32::Constructor, + $crate::string_pattern::matcher::nosimd::NosimdStringMatcher, {$crate::classification::simd::SSSE3_POPCNT} >::new(); $b @@ -856,6 +877,7 @@ cfg_if! { $crate::classification::structural::ssse3_32::Constructor, $crate::classification::depth::nosimd::Constructor, $crate::classification::memmem::sse2_32::Constructor, + $crate::string_pattern::matcher::nosimd::NosimdStringMatcher, {$crate::classification::simd::SSSE3} >::new(); $b @@ -872,6 +894,7 @@ cfg_if! { $crate::classification::structural::nosimd::Constructor, $crate::classification::depth::sse2_32::Constructor, $crate::classification::memmem::sse2_32::Constructor, + $crate::string_pattern::matcher::nosimd::NosimdStringMatcher, {$crate::classification::simd::SSE2_PCLMULQDQ_POPCNT} >::new(); $b @@ -882,6 +905,7 @@ cfg_if! { $crate::classification::structural::nosimd::Constructor, $crate::classification::depth::nosimd::Constructor, $crate::classification::memmem::sse2_32::Constructor, + $crate::string_pattern::matcher::nosimd::NosimdStringMatcher, {$crate::classification::simd::SSE2_PCLMULQDQ} >::new(); $b @@ -892,6 +916,7 @@ cfg_if! 
{ $crate::classification::structural::nosimd::Constructor, $crate::classification::depth::sse2_32::Constructor, $crate::classification::memmem::sse2_32::Constructor, + $crate::string_pattern::matcher::nosimd::NosimdStringMatcher, {$crate::classification::simd::SSE2_POPCNT} >::new(); $b @@ -902,6 +927,7 @@ cfg_if! { $crate::classification::structural::nosimd::Constructor, $crate::classification::depth::nosimd::Constructor, $crate::classification::memmem::sse2_32::Constructor, + $crate::string_pattern::matcher::nosimd::NosimdStringMatcher, {$crate::classification::simd::SSE2} >::new(); $b @@ -915,6 +941,7 @@ cfg_if! { $crate::classification::structural::nosimd::Constructor, $crate::classification::depth::nosimd::Constructor, $crate::classification::memmem::nosimd::Constructor, + $crate::string_pattern::matcher::nosimd::NosimdStringMatcher, {$crate::classification::simd::NOSIMD} >::new(); $b @@ -937,6 +964,7 @@ cfg_if! { $crate::classification::structural::nosimd::Constructor, $crate::classification::depth::nosimd::Constructor, $crate::classification::memmem::nosimd::Constructor, + $crate::string_pattern::matcher::nosimd::NosimdStringMatcher, {$crate::classification::simd::NOSIMD}, >::new(); $b diff --git a/crates/rsonpath-lib/src/engine/head_skipping.rs b/crates/rsonpath-lib/src/engine/head_skipping.rs index 847d766c..0a98a424 100644 --- a/crates/rsonpath-lib/src/engine/head_skipping.rs +++ b/crates/rsonpath-lib/src/engine/head_skipping.rs @@ -1,9 +1,6 @@ //! Engine decorator that performs **head skipping** – an extremely optimized search for //! the first matching member name in a query starting with a self-looping state. //! This happens in queries starting with a descendant selector. 
- -use std::sync::Arc; - use crate::{ automaton::{Automaton, State}, classification::{ @@ -68,7 +65,7 @@ pub(super) struct HeadSkip<'b, I, V, const N: usize> { bytes: &'b I, state: State, is_accepting: bool, - member_name: Arc, + member_pattern: &'b StringPattern, simd: V, } @@ -95,7 +92,7 @@ impl<'b, I: Input, V: Simd> HeadSkip<'b, I, V, BLOCK_SIZE> { /// extremely quickly with [`classification::memmem`](crate::classification::memmem). /// /// In all other cases, head-skipping is not supported. - pub(super) fn new(bytes: &'b I, automaton: &Automaton, simd: V) -> Option { + pub(super) fn new(bytes: &'b I, automaton: &'b Automaton, simd: V) -> Option { let initial_state = automaton.initial_state(); let fallback_state = automaton[initial_state].fallback_state(); let transitions = automaton[initial_state].member_transitions(); @@ -104,13 +101,13 @@ impl<'b, I: Input, V: Simd> HeadSkip<'b, I, V, BLOCK_SIZE> { && transitions.len() == 1 && automaton[initial_state].array_transitions().is_empty() { - let (member_name, target_state) = &transitions[0]; + let (member_pattern, target_state) = &transitions[0]; debug!("Automaton starts with a descendant search, using memmem heuristic."); return Some(Self { bytes, state: *target_state, is_accepting: automaton.is_accepting(*target_state), - member_name: member_name.clone(), + member_pattern: member_pattern.as_ref(), simd, }); } @@ -143,20 +140,21 @@ impl<'b, I: Input, V: Simd> HeadSkip<'b, I, V, BLOCK_SIZE> { let mut memmem = head_skip.simd.memmem(head_skip.bytes, &mut input_iter); debug!("Starting memmem search from {idx}"); - if let Some((starting_quote_idx, last_block)) = memmem.find_label(first_block, idx, head_skip.member_name.as_ref())? { + if let Some((starting_quote_idx, ending_quote_idx, last_block)) = memmem.find_label(first_block, idx, head_skip.member_pattern)? 
{ drop(memmem); first_block = Some(last_block); idx = starting_quote_idx; debug!("Needle found at {idx}"); - let seek_start_idx = idx + head_skip.member_name.quoted().len(); + let seek_start_idx = ending_quote_idx + 1; + debug!("Seeking from {seek_start_idx}"); - match head_skip.bytes.seek_non_whitespace_forward(seek_start_idx).e()? { - Some((colon_idx, b':')) => { - let (next_idx, next_c) = head_skip - .bytes - .seek_non_whitespace_forward(colon_idx + 1).e()? - .ok_or(EngineError::MissingItem())?; + match head_skip.bytes.seek_non_whitespace_forward(seek_start_idx).e()? { + Some((colon_idx, b':')) => { + let (next_idx, next_c) = head_skip + .bytes + .seek_non_whitespace_forward(colon_idx + 1).e()? + .ok_or(EngineError::MissingItem())?; let ResumedQuoteClassifier { classifier: quote_classifier, @@ -214,6 +212,9 @@ impl<'b, I: Input, V: Simd> HeadSkip<'b, I, V, BLOCK_SIZE> { } } + debug!("is accepting? {}", head_skip.is_accepting); + debug!("next_c is {next_c}"); + classifier_state = match next_c { b'{' | b'[' => { debug!("resuming"); @@ -234,6 +235,7 @@ impl<'b, I: Input, V: Simd> HeadSkip<'b, I, V, BLOCK_SIZE> { .0 } _ if head_skip.is_accepting => { + debug!("recording atomic match at {next_idx}"); engine.recorder().record_match( next_idx, Depth::ZERO, diff --git a/crates/rsonpath-lib/src/engine/main.rs b/crates/rsonpath-lib/src/engine/main.rs index f45c6337..ebe38708 100644 --- a/crates/rsonpath-lib/src/engine/main.rs +++ b/crates/rsonpath-lib/src/engine/main.rs @@ -663,10 +663,10 @@ where } } - /// Check if the label ended with a colon at index `idx` matches the `member_name`. + /// Check if the label ended with a colon at index `idx` matches the `pattern`. #[inline(always)] - fn is_match(&self, idx: usize, member_name: &StringPattern) -> Result { - let len = member_name.quoted().len(); + fn is_match(&self, idx: usize, pattern: &StringPattern) -> Result { + let len = pattern.quoted().len(); // The colon can be preceded by whitespace before the actual label. 
let closing_quote_idx = match self.input.seek_backward(idx - 1, b'"') { @@ -680,9 +680,9 @@ where } // Do the expensive memcmp. - let start_idx = closing_quote_idx + 1 - len; self.input - .is_member_match(start_idx, closing_quote_idx + 1, member_name) + .pattern_match_to::(closing_quote_idx + 1, pattern) + .map(|x| x.is_some()) .map_err(|x| x.into().into()) } diff --git a/crates/rsonpath-lib/src/input.rs b/crates/rsonpath-lib/src/input.rs index 05a022ec..734329eb 100644 --- a/crates/rsonpath-lib/src/input.rs +++ b/crates/rsonpath-lib/src/input.rs @@ -36,6 +36,7 @@ macro_rules! repr_align_block_size { $it }; } +use crate::string_pattern::matcher::StringPatternMatcher; pub(crate) use repr_align_block_size; /// Global padding guarantee for all [`Input`] implementations. @@ -140,17 +141,44 @@ pub trait Input: Sized { #[must_use] fn seek_non_whitespace_backward(&self, from: usize) -> Option<(usize, u8)>; - /// Decide whether the slice of input between `from` (inclusive) - /// and `to` (exclusive) matches the `member` (comparing bitwise, - /// including double quotes delimiters). + /// Decide whether a `pattern` matches the input bytes starting from `from`. /// /// This will also check if the leading double quote is not /// escaped by a backslash character. /// + /// # Returns + /// If matched, `Some` containing the index of the last character of the match. + /// If not matched, `None`. + /// /// # Errors - /// This function can read more data from the input if `to` falls beyond - /// the range that was already read, and the read operation can fail. - fn is_member_match(&self, from: usize, to: usize, member: &StringPattern) -> Result; + /// This function can read more data from the input if needed to conclusively + /// decide the result, and the read operation can fail depending on the input implementation. 
+ fn pattern_match_from( + &self, + from: usize, + pattern: &StringPattern, + ) -> Result, Self::Error>; + + /// Decide whether a `pattern` matches the input bytes ending at `to` (inclusive). + /// + /// This is similar to [`pattern_match_from`](Input::pattern_match_from), + /// but matches backwards from `to`. + /// + /// This will also check if the leading double quote is not + /// escaped by a backslash character. + /// + /// # Returns + /// If matched, `Some` containing the index of the first character of the match. + /// If not matched, `None`. + /// + /// # Errors + /// This function can read more data from the input if `to` falls after the bytes + /// already read, and the read operation can fail depending on the input implementation. + fn pattern_match_to( + &self, + to: usize, + pattern: &StringPattern, + ) -> Result, Self::Error>; } /// An iterator over blocks of input of size `N`. @@ -210,7 +238,9 @@ impl<'i, const N: usize> InputBlock<'i, N> for &'i [u8] { } pub(super) trait SliceSeekable { - fn is_member_match(&self, from: usize, to: usize, member: &StringPattern) -> bool; + fn pattern_match_from(&self, from: usize, pattern: &StringPattern) -> Option; + + fn pattern_match_to(&self, to: usize, pattern: &StringPattern) -> Option; fn seek_backward(&self, from: usize, needle: u8) -> Option; diff --git a/crates/rsonpath-lib/src/input/borrowed.rs b/crates/rsonpath-lib/src/input/borrowed.rs index 70c77796..a7924c89 100644 --- a/crates/rsonpath-lib/src/input/borrowed.rs +++ b/crates/rsonpath-lib/src/input/borrowed.rs @@ -18,6 +18,7 @@ use super::{ padding::{EndPaddedInput, PaddedBlock, TwoSidesPaddedInput}, Input, InputBlockIterator, SliceSeekable, MAX_BLOCK_SIZE, }; +use crate::string_pattern::matcher::StringPatternMatcher; use crate::{debug, result::InputRecorder, string_pattern::StringPattern}; /// Input wrapping a borrowed [`[u8]`] buffer. 
@@ -218,22 +219,58 @@ impl Input for BorrowedBytes<'_> { } } - #[inline(always)] - fn is_member_match(&self, from: usize, to: usize, member: &StringPattern) -> Result { - debug_assert!(from < to); + #[inline] + fn pattern_match_from( + &self, + from: usize, + pattern: &StringPattern, + ) -> Result, Self::Error> { + let pessimistic_to = from + pattern.len_limit(); // The hot path is when we're checking fully within the middle section. // This has to be as fast as possible, so the "cold" path referring to the TwoSidesPaddedInput // impl is explicitly marked with #[cold]. - if from > MAX_BLOCK_SIZE && to < self.middle_bytes.len() + MAX_BLOCK_SIZE { + if from > MAX_BLOCK_SIZE && pessimistic_to < self.middle_bytes.len() + MAX_BLOCK_SIZE { // This is the hot path -- do the bounds check and memcmp. let bytes = self.middle_bytes; let from = from - MAX_BLOCK_SIZE; + let to = pessimistic_to - MAX_BLOCK_SIZE; + let slice = &bytes[from..to]; + if let Some(idx) = M::pattern_match_forward(pattern, slice) { + Ok((from == 0 || bytes[from - 1] != b'\\').then_some(idx + from + MAX_BLOCK_SIZE)) + } else { + Ok(None) + } + } else { + // This is a very expensive, cold path. + Ok(self.as_padded_input().pattern_match_from::(from, pattern)) + } + } + + #[inline] + fn pattern_match_to( + &self, + to: usize, + pattern: &StringPattern, + ) -> Result, Self::Error> { + let pessimistic_from = to.saturating_sub(pattern.len_limit()); + // The hot path is when we're checking fully within the middle section. + // This has to be as fast as possible, so the "cold" path referring to the TwoSidesPaddedInput + // impl is explicitly marked with #[cold]. + if pessimistic_from > MAX_BLOCK_SIZE && to < self.middle_bytes.len() + MAX_BLOCK_SIZE { + // This is the hot path -- do the bounds check and memcmp. 
+ let bytes = self.middle_bytes; + let from = pessimistic_from - MAX_BLOCK_SIZE; let to = to - MAX_BLOCK_SIZE; let slice = &bytes[from..to]; - Ok(member.quoted() == slice && (from == 0 || bytes[from - 1] != b'\\')) + if let Some(idx) = M::pattern_match_backward(pattern, slice) { + let in_bytes_idx = from + idx; + Ok((in_bytes_idx == 0 || bytes[in_bytes_idx - 1] != b'\\').then_some(in_bytes_idx + MAX_BLOCK_SIZE)) + } else { + Ok(None) + } } else { // This is a very expensive, cold path. - Ok(self.as_padded_input().is_member_match(from, to, member)) + Ok(self.as_padded_input().pattern_match_to::(to, pattern)) } } } diff --git a/crates/rsonpath-lib/src/input/buffered.rs b/crates/rsonpath-lib/src/input/buffered.rs index f764ed6d..6b2c3429 100644 --- a/crates/rsonpath-lib/src/input/buffered.rs +++ b/crates/rsonpath-lib/src/input/buffered.rs @@ -19,8 +19,13 @@ use super::{ error::InputError, repr_align_block_size, Input, InputBlock, InputBlockIterator, SliceSeekable, MAX_BLOCK_SIZE, }; -use crate::{error::InternalRsonpathError, result::InputRecorder, string_pattern::StringPattern, JSON_SPACE_BYTE}; -use std::{cell::RefCell, io::Read, ops::Deref, slice}; +use crate::{ + error::InternalRsonpathError, + result::InputRecorder, + string_pattern::{matcher::StringPatternMatcher, StringPattern}, + JSON_SPACE_BYTE, +}; +use std::{cell::RefCell, cmp, io::Read, ops::Deref, slice}; // The buffer has to be a multiple of MAX_BLOCK_SIZE. // It could technically be as small as MAX_BLOCK_SIZE, but there is a performance consideration. 
@@ -217,19 +222,54 @@ impl Input for BufferedInput { buf.as_slice().seek_non_whitespace_backward(from) } - #[inline(always)] - fn is_member_match(&self, from: usize, to: usize, member: &StringPattern) -> Result { + #[inline] + fn pattern_match_from( + &self, + from: usize, + pattern: &StringPattern, + ) -> Result, Self::Error> { let mut buf = self.0.borrow_mut(); + let pessimistic_to = from + pattern.len_limit(); - while buf.len() < to { + while buf.len() < pessimistic_to { if !buf.read_more()? { - return Ok(false); + break; } } let bytes = buf.as_slice(); + let to = cmp::min(bytes.len(), pessimistic_to); let slice = &bytes[from..to]; - Ok(member.quoted() == slice && (from == 0 || bytes[from - 1] != b'\\')) + if let Some(idx) = M::pattern_match_forward(pattern, slice) { + Ok((from == 0 || bytes[from - 1] != b'\\').then_some(idx + from)) + } else { + Ok(None) + } + } + + #[inline] + fn pattern_match_to( + &self, + to: usize, + pattern: &StringPattern, + ) -> Result, Self::Error> { + let mut buf = self.0.borrow_mut(); + let pessimistic_from = to.saturating_sub(pattern.len_limit()); + + while buf.len() < to { + if !buf.read_more()? 
{ + return Ok(None); + } + } + + let bytes = buf.as_slice(); + let slice = &bytes[pessimistic_from..to]; + if let Some(idx) = M::pattern_match_backward(pattern, slice) { + let in_bytes_idx = pessimistic_from + idx; + Ok((in_bytes_idx == 0 || bytes[in_bytes_idx - 1] != b'\\').then_some(in_bytes_idx)) + } else { + Ok(None) + } } } diff --git a/crates/rsonpath-lib/src/input/mmap.rs b/crates/rsonpath-lib/src/input/mmap.rs index 59252f36..d8154da8 100644 --- a/crates/rsonpath-lib/src/input/mmap.rs +++ b/crates/rsonpath-lib/src/input/mmap.rs @@ -21,7 +21,11 @@ use super::{ padding::PaddedBlock, Input, SliceSeekable, MAX_BLOCK_SIZE, }; -use crate::{input::padding::EndPaddedInput, result::InputRecorder, string_pattern::StringPattern}; +use crate::{ + input::padding::EndPaddedInput, + result::InputRecorder, + string_pattern::{matcher::StringPatternMatcher, StringPattern}, +}; use memmap2::{Mmap, MmapAsRawDesc}; /// Input wrapping a memory mapped file. @@ -161,19 +165,53 @@ impl Input for MmapInput { } #[inline] - fn is_member_match(&self, from: usize, to: usize, member: &StringPattern) -> Result { - debug_assert!(from < to); + fn pattern_match_from( + &self, + from: usize, + pattern: &StringPattern, + ) -> Result, Self::Error> { + let pessimistic_to = from + pattern.len_limit(); + // The hot path is when we're checking fully within the middle section. + // This has to be as fast as possible, so the "cold" path referring to the TwoSidesPaddedInput + // impl is explicitly marked with #[cold]. + if pessimistic_to < self.last_block_start { + // This is the hot path -- do the bounds check and memcmp. + let bytes = &self.mmap; + let slice = &bytes[from..pessimistic_to]; + if let Some(idx) = M::pattern_match_forward(pattern, slice) { + Ok((from == 0 || bytes[from - 1] != b'\\').then_some(idx + from)) + } else { + Ok(None) + } + } else { + // This is a very expensive, cold path. 
+ Ok(self.as_padded_input().pattern_match_from::(from, pattern)) + } + } + + #[inline] + fn pattern_match_to( + &self, + to: usize, + pattern: &StringPattern, + ) -> Result, Self::Error> { + let pessimistic_from = to.saturating_sub(pattern.len_limit()); // The hot path is when we're checking fully within the middle section. // This has to be as fast as possible, so the "cold" path referring to the TwoSidesPaddedInput // impl is explicitly marked with #[cold]. if to < self.last_block_start { // This is the hot path -- do the bounds check and memcmp. let bytes = &self.mmap; - let slice = &bytes[from..to]; - Ok(member.quoted() == slice && (from == 0 || bytes[from - 1] != b'\\')) + let slice = &bytes[pessimistic_from..to]; + if let Some(idx) = M::pattern_match_backward(pattern, slice) { + let in_bytes_idx = pessimistic_from + idx; + Ok((in_bytes_idx == 0 || bytes[in_bytes_idx - 1] != b'\\').then_some(in_bytes_idx)) + } else { + Ok(None) + } } else { // This is a very expensive, cold path. - Ok(self.as_padded_input().is_member_match(from, to, member)) + Ok(self.as_padded_input().pattern_match_to::(to, pattern)) } } } diff --git a/crates/rsonpath-lib/src/input/owned.rs b/crates/rsonpath-lib/src/input/owned.rs index b82d4ea0..be1c20f2 100644 --- a/crates/rsonpath-lib/src/input/owned.rs +++ b/crates/rsonpath-lib/src/input/owned.rs @@ -24,7 +24,10 @@ use super::{ padding::{PaddedBlock, TwoSidesPaddedInput}, Input, SliceSeekable, MAX_BLOCK_SIZE, }; -use crate::{result::InputRecorder, string_pattern::StringPattern}; +use crate::{ + result::InputRecorder, + string_pattern::{matcher::StringPatternMatcher, StringPattern}, +}; use std::borrow::Borrow; /// Input wrapping a buffer borrowable as a slice of bytes. 
@@ -159,12 +162,34 @@ where } #[inline] - fn is_member_match(&self, from: usize, to: usize, member: &StringPattern) -> Result { + fn pattern_match_from( + &self, + from: usize, + pattern: &StringPattern, + ) -> Result, Self::Error> { let offset = self.leading_padding_len(); let Some(from) = from.checked_sub(offset) else { - return Ok(false); + return Ok(None); }; - Ok(self.bytes.borrow().is_member_match(from, to - offset, member)) + Ok(self + .bytes + .borrow() + .pattern_match_from::(from, pattern) + .map(|x| x + offset)) + } + + #[inline] + fn pattern_match_to( + &self, + to: usize, + pattern: &StringPattern, + ) -> Result, Self::Error> { + let offset = self.leading_padding_len(); + Ok(self + .bytes + .borrow() + .pattern_match_to::(to - offset, pattern) + .map(|x| x + offset)) } } diff --git a/crates/rsonpath-lib/src/input/padding.rs b/crates/rsonpath-lib/src/input/padding.rs index 93d8363c..4c68563e 100644 --- a/crates/rsonpath-lib/src/input/padding.rs +++ b/crates/rsonpath-lib/src/input/padding.rs @@ -1,5 +1,8 @@ use super::{SliceSeekable, MAX_BLOCK_SIZE}; -use crate::{string_pattern::StringPattern, JSON_SPACE_BYTE}; +use crate::{ + string_pattern::{self, StringPattern}, + JSON_SPACE_BYTE, +}; pub(super) struct PaddedBlock { bytes: [u8; MAX_BLOCK_SIZE], @@ -101,10 +104,14 @@ impl SliceSeekable for EndPaddedInput<'_> { #[cold] #[inline(never)] - fn is_member_match(&self, from: usize, to: usize, member: &StringPattern) -> bool { - debug_assert!(from < to); - let other = member.quoted(); - self.cold_member_match(other, from, to) + fn pattern_match_from(&self, from: usize, pattern: &StringPattern) -> Option { + self.cold_pattern_match_forward(pattern, from, from + pattern.len_limit()) + } + + #[cold] + #[inline(never)] + fn pattern_match_to(&self, to: usize, pattern: &StringPattern) -> Option { + self.cold_pattern_match_backward(pattern, to.saturating_sub(pattern.len_limit()), to) } } @@ -159,10 +166,14 @@ impl SliceSeekable for TwoSidesPaddedInput<'_> { #[cold] 
#[inline(never)] - fn is_member_match(&self, from: usize, to: usize, member: &StringPattern) -> bool { - debug_assert!(from < to); - let other = member.quoted(); - self.cold_member_match(other, from, to) + fn pattern_match_from(&self, from: usize, pattern: &StringPattern) -> Option { + self.cold_pattern_match_forward(pattern, from, from + pattern.len_limit()) + } + + #[cold] + #[inline(never)] + fn pattern_match_to(&self, to: usize, pattern: &StringPattern) -> Option { + self.cold_pattern_match_backward(pattern, to.saturating_sub(pattern.len_limit()), to) } } @@ -291,13 +302,26 @@ impl<'a> EndPaddedInput<'a> { } } - fn cold_member_match(&self, other: &[u8], from: usize, to: usize) -> bool { + fn cold_pattern_match_forward(&self, pattern: &StringPattern, from: usize, to: usize) -> Option { let (middle_self, last_self) = self.slice_parts(from, to); - let middle_other = &other[..middle_self.len()]; - let last_other = &other[middle_self.len()..]; let preceding_char = from.checked_sub(1).and_then(|x| self.get_at(x)); - middle_self == middle_other && last_self == last_other && preceding_char != Some(b'\\') + let idx = string_pattern::matcher::nosimd::NosimdStringMatcher::pattern_match_forward( + pattern, + (middle_self, last_self), + )?; + (preceding_char != Some(b'\\')).then_some(from + idx) + } + + fn cold_pattern_match_backward(&self, pattern: &StringPattern, from: usize, to: usize) -> Option { + let (middle_self, last_self) = self.slice_parts(from, to); + + let idx = string_pattern::matcher::nosimd::NosimdStringMatcher::pattern_match_backward( + pattern, + (middle_self, last_self), + )?; + let preceding_char = (from + idx).checked_sub(1).and_then(|x| self.get_at(x)); + (preceding_char != Some(b'\\')).then_some(from + idx) } } @@ -504,17 +528,26 @@ impl<'a> TwoSidesPaddedInput<'a> { } } - fn cold_member_match(&self, other: &[u8], from: usize, to: usize) -> bool { + fn cold_pattern_match_forward(&self, pattern: &StringPattern, from: usize, to: usize) -> Option { let 
(first_self, middle_self, last_self) = self.slice_parts(from, to); - let first_other = &other[..first_self.len()]; - let middle_other = &other[first_self.len()..first_self.len() + middle_self.len()]; - let last_other = &other[first_self.len() + middle_self.len()..]; let preceding_char = from.checked_sub(1).and_then(|x| self.get_at(x)); - first_self == first_other - && middle_self == middle_other - && last_self == last_other - && preceding_char != Some(b'\\') + let idx = string_pattern::matcher::nosimd::NosimdStringMatcher::pattern_match_forward( + pattern, + (first_self, middle_self, last_self), + )?; + preceding_char.map_or(Some(from + idx), |x| (x != b'\\').then_some(from + idx)) + } + + fn cold_pattern_match_backward(&self, pattern: &StringPattern, from: usize, to: usize) -> Option { + let (first_self, middle_self, last_self) = self.slice_parts(from, to); + + let idx = string_pattern::matcher::nosimd::NosimdStringMatcher::pattern_match_backward( + pattern, + (first_self, middle_self, last_self), + )?; + let preceding_char = (from + idx).checked_sub(1).and_then(|x| self.get_at(x)); + preceding_char.map_or(Some(from + idx), |x| (x != b'\\').then_some(from + idx)) } } diff --git a/crates/rsonpath-lib/src/input/slice.rs b/crates/rsonpath-lib/src/input/slice.rs index ca369008..de07c71d 100644 --- a/crates/rsonpath-lib/src/input/slice.rs +++ b/crates/rsonpath-lib/src/input/slice.rs @@ -1,15 +1,31 @@ use super::SliceSeekable; -use crate::string_pattern::StringPattern; +use crate::string_pattern::{matcher::StringPatternMatcher, StringPattern}; +use std::cmp; impl> SliceSeekable for T { - fn is_member_match(&self, from: usize, to: usize, member: &StringPattern) -> bool { + fn pattern_match_from(&self, from: usize, pattern: &StringPattern) -> Option { let bytes = self.as_ref(); + let to = from + pattern.len_limit(); - if to > bytes.len() { - return false; + if from >= bytes.len() { + return None; } + + let slice = &bytes[from..cmp::min(to, bytes.len())]; + let res = 
M::pattern_match_forward(pattern, slice)?; + + (from == 0 || bytes[from - 1] != b'\\').then_some(from + res) + } + + fn pattern_match_to(&self, to: usize, pattern: &StringPattern) -> Option { + let bytes = self.as_ref(); + let from = to.saturating_sub(pattern.len_limit()); + let slice = &bytes[from..to]; - member.quoted() == slice && (from == 0 || bytes[from - 1] != b'\\') + let idx = M::pattern_match_backward(pattern, slice)?; + let in_bytes_idx = from + idx; + + (in_bytes_idx == 0 || bytes[in_bytes_idx - 1] != b'\\').then_some(in_bytes_idx) } fn seek_backward(&self, from: usize, needle: u8) -> Option { @@ -308,9 +324,11 @@ mod tests { } } - mod is_member_match { - use crate::input::SliceSeekable; - use crate::string_pattern::StringPattern; + mod pattern_match_from { + use crate::{ + input::SliceSeekable, + string_pattern::{matcher::nosimd::NosimdStringMatcher, StringPattern}, + }; use pretty_assertions::assert_eq; use rsonpath_syntax::str::JsonString; @@ -318,27 +336,30 @@ mod tests { fn on_exact_match_returns_true() { let bytes = r#"{"needle":42,"other":37}"#.as_bytes(); - let result = bytes.is_member_match(1, 9, &StringPattern::new(&JsonString::new("needle"))); + let result = + bytes.pattern_match_from::(1, &StringPattern::new(&JsonString::new("needle"))); - assert_eq!(result, true); + assert_eq!(result, Some(8)); } #[test] fn matching_without_double_quotes_returns_false() { let bytes = r#"{"needle":42,"other":37}"#.as_bytes(); - let result = bytes.is_member_match(2, 8, &StringPattern::new(&JsonString::new("needle"))); + let result = + bytes.pattern_match_from::(2, &StringPattern::new(&JsonString::new("needle"))); - assert_eq!(result, false); + assert_eq!(result, None); } #[test] fn when_match_is_partial_due_to_escaped_double_quote_returns_false() { let bytes = r#"{"fake\"needle":42,"other":37}"#.as_bytes(); - let result = bytes.is_member_match(7, 15, &StringPattern::new(&JsonString::new("needle"))); + let result = + bytes.pattern_match_from::(7, 
&StringPattern::new(&JsonString::new("needle"))); - assert_eq!(result, false); + assert_eq!(result, None); } #[test] @@ -346,9 +367,10 @@ mod tests { fn when_looking_for_string_with_escaped_double_quote_returns_true() { let bytes = r#"{"fake\"needle":42,"other":37}"#.as_bytes(); - let result = bytes.is_member_match(1, 15, &StringPattern::new(&JsonString::new(r#"fake"needle"#))); + let result = bytes + .pattern_match_from::(1, &StringPattern::new(&JsonString::new(r#"fake"needle"#))); - assert_eq!(result, true); + assert_eq!(result, Some(15)); } } } diff --git a/crates/rsonpath-lib/src/lib.rs b/crates/rsonpath-lib/src/lib.rs index 396199cb..e3e35359 100644 --- a/crates/rsonpath-lib/src/lib.rs +++ b/crates/rsonpath-lib/src/lib.rs @@ -148,7 +148,7 @@ pub mod input; pub mod result; pub(crate) mod string_pattern; -pub use string_pattern::StringPattern; +pub use string_pattern::{matcher::nosimd::NosimdStringMatcher as DefaultStringMatcher, StringPattern}; cfg_if::cfg_if! { if #[cfg(target_pointer_width = "32")] { diff --git a/crates/rsonpath-lib/src/string_pattern.rs b/crates/rsonpath-lib/src/string_pattern.rs index f6a9c688..36863c87 100644 --- a/crates/rsonpath-lib/src/string_pattern.rs +++ b/crates/rsonpath-lib/src/string_pattern.rs @@ -1,69 +1,305 @@ +//! JSONString unicode-aware pattern matching. +//! +//! A [`JsonString`] can be turned into a [`StringPattern`] that contains all data necessary +//! to match a candidate string against it in a unicode-aware manner. This is more involved than +//! just bytewise equality. For example, a logical string "ab" can be represented in four unique +//! but equivalent ways in a JSON: +//! - `"ab"` +//! - `"\u0061b"` +//! - `"a\u0062"` +//! - `"\u0061\u0062"` +//! +//! The [`StringPattern`] itself contains no matching logic. The functions [`cmpeq_forward`] and +//! [`cmpeq_backward`] allow matching a pattern against an input. +//! 
+pub(crate) mod matcher; use rsonpath_syntax::str::JsonString; +use std::fmt::Debug; -/// String pattern coming from a JSONPath query that can be matched against strings in a JSON. +use crate::{BLOCK_SIZE, JSON_SPACE_BYTE}; + +/// Compiled JSONString representation allowing pattern-matching JSON strings. +/// +/// Any non-empty JSON string has multiple textual representations. For example, +/// `"a"` can also be written as `"\u0061"`. This structure precomputes the alternative +/// representations and allows efficient pattern-matching against JSON bytes. /// -/// Right now the only pattern is matching against a given [`JsonString`]. +/// A compiled pattern takes more space than a raw [`JsonString`], but is efficient to match. #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] -#[derive(Debug, Clone)] -pub struct StringPattern(JsonString); +#[derive(Clone)] +pub struct StringPattern { + bytes: Vec, + alternatives: Vec, + len: usize, + len_limit: usize, +} impl std::hash::Hash for StringPattern { #[inline] fn hash(&self, state: &mut H) { - self.0.hash(state); + self.bytes.hash(state); } } impl PartialOrd for StringPattern { #[inline] fn partial_cmp(&self, other: &Self) -> Option { - Some(self.0.unquoted().cmp(other.0.unquoted())) + Some(self.bytes.cmp(&other.bytes)) } } impl Ord for StringPattern { #[inline] fn cmp(&self, other: &Self) -> std::cmp::Ordering { - self.0.unquoted().cmp(other.0.unquoted()) + self.bytes.cmp(&other.bytes) } } impl PartialEq for StringPattern { #[inline] fn eq(&self, other: &Self) -> bool { - self.0 == other.0 + self.bytes == other.bytes } } impl Eq for StringPattern {} +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +#[derive(Clone, Copy, PartialEq, Eq)] +enum AlternativeRepresentation { + SlashUSingle(u32, u8), + SlashUPair(u32, u32, u8), + USingle(u32), + SlashByteOrUSingle(u8, u32), + None, +} + +struct StringPatternBuilder { + bytes: Vec, + alternatives: Vec, + len_limit: usize, 
+} + impl StringPattern { - /// Get the underlying [`JsonString`] as bytes, including the delimiting double quote symbols. + /// Returns bytes of the canonical representation of the string pattern without the delimiting + /// quotes. + /// + /// # Examples + /// ```rust + /// # use rsonpath::StringPattern; + /// # use rsonpath_syntax::str::JsonString; + /// let simple_pattern = StringPattern::new(&JsonString::new("ab")); + /// let complex_pattern = StringPattern::new(&JsonString::new("\n")); + /// + /// assert_eq!(simple_pattern.unquoted(), "ab".as_bytes()); + /// assert_eq!(complex_pattern.unquoted(), r"\n".as_bytes()); + /// ``` #[inline] #[must_use] - pub fn quoted(&self) -> &[u8] { - self.0.quoted().as_bytes() + pub fn unquoted(&self) -> &[u8] { + &self.quoted()[1..self.len - 1] } - /// Get the underlying [`JsonString`] as bytes, without the delimiting quotes. + /// Returns bytes of the canonical representation of the string pattern including the delimiting + /// quotes. + /// + /// # Examples + /// ```rust + /// # use rsonpath::StringPattern; + /// # use rsonpath_syntax::str::JsonString; + /// let simple_pattern = StringPattern::new(&JsonString::new("ab")); + /// let complex_pattern = StringPattern::new(&JsonString::new("\n")); + /// + /// assert_eq!(simple_pattern.quoted(), r#""ab""#.as_bytes()); + /// assert_eq!(complex_pattern.quoted(), r#""\n""#.as_bytes()); + /// ``` #[inline] #[must_use] - pub fn unquoted(&self) -> &[u8] { - self.0.unquoted().as_bytes() + pub fn quoted(&self) -> &[u8] { + &self.bytes[..self.len] + } + + /// Returns the maximum length of JSON text (in bytes) that can possibly match this pattern. + /// The length DOES include the delimiting quotes. + /// + /// In other words: if a JSON string contains more bytes than this, it definitely does not + /// match this pattern. 
+ /// + /// # Examples + /// ```rust + /// # use rsonpath::StringPattern; + /// # use rsonpath_syntax::str::JsonString; + /// let pattern = StringPattern::new(&JsonString::new("ab")); + /// // The pattern can be represented as: "\u0061\u0062", which is 14 bytes. + /// assert_eq!(pattern.len_limit(), 14); + /// ``` + #[inline(always)] + #[must_use] + pub fn len_limit(&self) -> usize { + self.len_limit } - /// Create a new pattern from a given [`JsonString`]. + /// Build a [`StringPattern`] for a given [`JsonString`]. #[inline] #[must_use] pub fn new(string: &JsonString) -> Self { - Self(string.clone()) + // A pattern to be matched consists of the bytes that should be matched in the "canonical" + // representation of the string (the shortest possible valid representation), and possible + // alternative escapes that should be considered if a mismatch occurs + // at a given position relative to the canonical bytes. + // We have the following cases: + // - The character is a control character or a special symbol that is canonically represented + // as backslash-itself. If it is mismatched at the backslash, there is no match alternative + // representation; on the second byte it can be replaced with uXXXX. + // - The character is a control character that can only be represented as a unicode escape; + // it has no alternative encodings. + // - The character is one of the two awfully designed JSON special cases: + // forward slash (/) or single quote ('). The canonical form of them is themselves, but they + // can also be present escaped (\/ or \'), or as a unicode escape. + // - The character is a "regular" character; it has only one alternative encoding - unicode + // escape, which is either a single sequence \uXXXX or a pair \uXXXX\uXXXX. 
+ let byte_length = string.quoted().len(); + let mut builder = StringPatternBuilder::new(byte_length); + + for char in string.unquoted().chars() { + match char { + '\u{0008}' => builder.short_escape(b'b', char), + '\u{000C}' => builder.short_escape(b'f', char), + '\n' => builder.short_escape(b'n', char), + '\r' => builder.short_escape(b'r', char), + '\t' => builder.short_escape(b't', char), + '"' => builder.short_escape(b'"', char), + '\\' => builder.short_escape(b'\\', char), + '\u{0000}'..='\u{001F}' => builder.long_escape(char), + '/' | '\'' => builder.special_escape(char), + _ => builder.regular_escape(char), + }; + } + + builder.into_pattern() } } -impl From for StringPattern { - #[inline(always)] - fn from(value: JsonString) -> Self { - Self::new(&value) +impl StringPatternBuilder { + fn new(byte_len: usize) -> Self { + let mut this = Self { + bytes: Vec::with_capacity(byte_len), + alternatives: Vec::with_capacity(byte_len), + len_limit: 0, + }; + this.bytes.push(b'"'); + this.alternatives.push(AlternativeRepresentation::None); + this.len_limit += 1; + + this + } + + fn into_pattern(mut self) -> StringPattern { + self.bytes.push(b'"'); + self.alternatives.push(AlternativeRepresentation::None); + self.len_limit += 1; + let len = self.bytes.len(); + for _ in 0..BLOCK_SIZE { + self.bytes.push(JSON_SPACE_BYTE); + } + + StringPattern { + bytes: self.bytes, + alternatives: self.alternatives, + len_limit: self.len_limit, + len, + } + } + + fn short_escape(&mut self, code_letter: u8, c: char) { + self.bytes.push(b'\\'); + self.bytes.push(code_letter); + + let mut utf16_buf = [0; 1]; + let utf16 = c.encode_utf16(&mut utf16_buf); + let code = Self::encode(utf16[0]); + + self.alternatives.push(AlternativeRepresentation::None); + self.alternatives.push(AlternativeRepresentation::USingle(code)); + + self.len_limit += 6; + } + + fn long_escape(&mut self, c: char) { + self.bytes.push(b'\\'); + self.bytes.push(b'u'); + self.bytes.push(b'0'); + self.bytes.push(b'0'); + 
self.bytes.push(Self::encode_nibble((c as u8 & 0xF0) >> 4)); + self.bytes.push(Self::encode_nibble(c as u8 & 0x0F)); + + for _ in 0..6 { + self.alternatives.push(AlternativeRepresentation::None); + } + + self.len_limit += 6; + } + + fn special_escape(&mut self, c: char) { + self.bytes.push(c as u8); + + let mut utf16_buf = [0; 1]; + let utf16 = c.encode_utf16(&mut utf16_buf); + let code = Self::encode(utf16[0]); + + self.alternatives + .push(AlternativeRepresentation::SlashByteOrUSingle(c as u8, code)); + + self.len_limit += 6; + } + + fn regular_escape(&mut self, c: char) { + let mut utf8_buf = [0; 4]; + let mut utf16_buf = [0; 2]; + let utf8 = c.encode_utf8(&mut utf8_buf); + let utf16 = c.encode_utf16(&mut utf16_buf); + + self.bytes.extend_from_slice(utf8.as_bytes()); + let len = utf8.len(); + let repr; + + if utf16.len() == 1 { + let code = Self::encode(utf16[0]); + repr = AlternativeRepresentation::SlashUSingle(code, len as u8); + self.alternatives.push(repr); + self.len_limit += 6; + } else { + let code1 = Self::encode(utf16[0]); + let code2 = Self::encode(utf16[1]); + repr = AlternativeRepresentation::SlashUPair(code1, code2, len as u8); + self.alternatives.push(repr); + self.len_limit += 12; + } + + for _ in 1..utf8.len() { + self.alternatives.push(AlternativeRepresentation::None); + } + let last_idx = self.alternatives.len() - 1; + self.alternatives[last_idx] = repr; + } + + fn encode(utf16: u16) -> u32 { + let bytes = utf16.to_be_bytes(); + let mut result = [0; 4]; + result[0] = Self::encode_nibble((bytes[0] & 0xF0) >> 4); + result[1] = Self::encode_nibble(bytes[0] & 0x0F); + result[2] = Self::encode_nibble((bytes[1] & 0xF0) >> 4); + result[3] = Self::encode_nibble(bytes[1] & 0x0F); + + u32::from_ne_bytes(result) + } + + fn encode_nibble(nibble: u8) -> u8 { + match nibble { + 0x00..=0x09 => b'0' + nibble, + 0x0A..=0x0F => b'a' + nibble - 0x0A, + _ => unreachable!(), + } } } @@ -73,3 +309,60 @@ impl From<&JsonString> for StringPattern { Self::new(value) } } 
+ +impl Debug for StringPattern { + #[inline] + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("StringPattern") + .field( + "bytes", + &self.quoted().iter().copied().map(DebugByte).collect::>(), + ) + .field("as_string", &String::from_utf8_lossy(self.quoted())) + .field("alternatives", &self.alternatives) + .finish() + } +} + +impl Debug for AlternativeRepresentation { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::SlashUSingle(arg0, arg1) => f + .debug_tuple("SlashUSingle") + .field(&DebugCode(*arg0)) + .field(arg1) + .finish(), + Self::SlashUPair(arg0, arg1, arg2) => f + .debug_tuple("SlashUPair") + .field(&DebugCode(*arg0)) + .field(&DebugCode(*arg1)) + .field(arg2) + .finish(), + Self::USingle(arg0) => f.debug_tuple("USingle").field(&DebugCode(*arg0)).finish(), + Self::SlashByteOrUSingle(arg0, arg1) => f + .debug_tuple("SlashByteOrUSingle") + .field(&DebugByte(*arg0)) + .field(&DebugCode(*arg1)) + .finish(), + Self::None => write!(f, "None"), + } + } +} + +struct DebugByte(u8); +struct DebugCode(u32); + +impl Debug for DebugByte { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self.0 { + 0x20..=0x7F => write!(f, "b'{}'", self.0 as char), + _ => write!(f, "0x{:0>2x}", self.0), + } + } +} + +impl Debug for DebugCode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "0x{:0>8x}", self.0) + } +} diff --git a/crates/rsonpath-lib/src/string_pattern/matcher.rs b/crates/rsonpath-lib/src/string_pattern/matcher.rs new file mode 100644 index 00000000..d22ab2f5 --- /dev/null +++ b/crates/rsonpath-lib/src/string_pattern/matcher.rs @@ -0,0 +1,153 @@ +#[cfg(target_arch = "x86_64")] +pub(crate) mod avx2_64; +pub(crate) mod nosimd; +mod shared; +use crate::StringPattern; + +pub trait StringPatternMatcher { + fn pattern_match_forward(pattern: &StringPattern, input: &[u8]) -> Option; + fn pattern_match_backward(pattern: 
&StringPattern, input: &[u8]) -> Option; +} + +pub(crate) trait MatcherInput { + fn len(&self) -> usize; + + fn offset(&mut self, offset: usize); + + fn offset_back(&mut self, offset: usize); + + fn read_u8(&self, idx: usize) -> u8; + + fn read_u16(&self, idx: usize) -> u16; + + fn read_u32(&self, idx: usize) -> u32; +} + +impl MatcherInput for &[u8] { + #[inline(always)] + fn len(&self) -> usize { + <[u8]>::len(self) + } + + #[inline(always)] + fn offset(&mut self, offset: usize) { + *self = &self[offset..]; + } + + #[inline(always)] + fn offset_back(&mut self, offset: usize) { + *self = &self[..self.len() - offset]; + } + + #[inline(always)] + fn read_u8(&self, idx: usize) -> u8 { + self[idx] + } + + #[inline(always)] + fn read_u16(&self, idx: usize) -> u16 { + u16::from_ne_bytes(self[idx..idx + 2].try_into().expect("length 2")) + } + + #[inline(always)] + fn read_u32(&self, idx: usize) -> u32 { + u32::from_ne_bytes(self[idx..idx + 4].try_into().expect("length 4")) + } +} + +impl MatcherInput for (&[u8], &[u8]) { + fn len(&self) -> usize { + self.0.len() + self.1.len() + } + + fn offset(&mut self, offset: usize) { + let first_offset = std::cmp::min(self.0.len(), offset); + let second_offset = offset - first_offset; + *self = (&self.0[first_offset..], &self.1[second_offset..]) + } + + fn offset_back(&mut self, offset: usize) { + let second_offset = self.1.len().saturating_sub(offset); + let first_offset = self.0.len() - (offset - (self.1.len() - second_offset)); + *self = (&self.0[..first_offset], &self.1[..second_offset]) + } + + fn read_u8(&self, idx: usize) -> u8 { + if idx < self.0.len() { + self.0[idx] + } else { + self.1[idx - self.0.len()] + } + } + + fn read_u16(&self, idx: usize) -> u16 { + let b1 = self.read_u8(idx); + let b2 = self.read_u8(idx + 1); + u16::from_ne_bytes([b1, b2]) + } + + fn read_u32(&self, idx: usize) -> u32 { + let b1 = self.read_u8(idx); + let b2 = self.read_u8(idx + 1); + let b3 = self.read_u8(idx + 2); + let b4 = self.read_u8(idx + 
3); + u32::from_ne_bytes([b1, b2, b3, b4]) + } +} + +impl MatcherInput for (&[u8], &[u8], &[u8]) { + fn len(&self) -> usize { + self.0.len() + self.1.len() + self.2.len() + } + + fn offset(&mut self, offset: usize) { + let first_offset = std::cmp::min(self.0.len(), offset); + let second_offset_base = offset - first_offset; + let second_offset = std::cmp::min(self.1.len(), second_offset_base); + let third_offset = second_offset_base - second_offset; + *self = ( + &self.0[first_offset..], + &self.1[second_offset..], + &self.2[third_offset..], + ) + } + + fn offset_back(&mut self, offset: usize) { + let third_offset = self.2.len().saturating_sub(offset); + let second_offset_base = offset - (self.2.len() - third_offset); + let second_offset = self.1.len().saturating_sub(second_offset_base); + let first_offset = self.0.len() - (second_offset_base - (self.1.len() - second_offset)); + *self = ( + &self.0[..first_offset], + &self.1[..second_offset], + &self.2[..third_offset], + ) + } + + fn read_u8(&self, idx: usize) -> u8 { + if idx < self.0.len() { + self.0[idx] + } else { + let idx_2 = idx - self.0.len(); + if idx_2 < self.1.len() { + self.1[idx_2] + } else { + self.2[idx_2 - self.1.len()] + } + } + } + + fn read_u16(&self, idx: usize) -> u16 { + let b1 = self.read_u8(idx); + let b2 = self.read_u8(idx + 1); + u16::from_ne_bytes([b1, b2]) + } + + fn read_u32(&self, idx: usize) -> u32 { + let b1 = self.read_u8(idx); + let b2 = self.read_u8(idx + 1); + let b3 = self.read_u8(idx + 2); + let b4 = self.read_u8(idx + 3); + u32::from_ne_bytes([b1, b2, b3, b4]) + } +} diff --git a/crates/rsonpath-lib/src/string_pattern/matcher/avx2_64.rs b/crates/rsonpath-lib/src/string_pattern/matcher/avx2_64.rs new file mode 100644 index 00000000..8e337e09 --- /dev/null +++ b/crates/rsonpath-lib/src/string_pattern/matcher/avx2_64.rs @@ -0,0 +1,334 @@ +use std::{arch::x86_64::*, hint::unreachable_unchecked}; + +use super::{MatcherInput, StringPatternMatcher}; +use 
crate::string_pattern::matcher::shared::AlternativeMatchResult; +use crate::{debug, StringPattern}; + +pub(crate) struct Avx2StringMatcher64; + +impl StringPatternMatcher for Avx2StringMatcher64 { + fn pattern_match_forward(pattern: &StringPattern, input: &[u8]) -> Option { + // SAFETY: Avx2StringMatcher64 is only resolved in the simd module when the target features + // are enabled. + unsafe { + return impl_(pattern, input); + } + + #[target_feature(enable = "avx2")] + #[target_feature(enable = "popcnt")] + unsafe fn impl_(pattern: &StringPattern, input: &[u8]) -> Option { + let mut rem_pattern: &[u8] = pattern.quoted(); + let mut input = input; + let mut pat_idx = 0; + let mut input_idx = 0; + debug!( + "{} cmpeq {}", + std::str::from_utf8(rem_pattern).unwrap_or("[invalid UTF8]"), + std::str::from_utf8(input).unwrap_or("[invalid UTF8]") + ); + + while !rem_pattern.is_empty() && rem_pattern.len() <= input.len() { + let rem_ptr = rem_pattern.as_ptr(); + let in_ptr = input.as_ptr(); + match rem_pattern.len() { + 0 => unreachable_unchecked(), + 1 => { + if rem_ptr.read() == in_ptr.read() { + return Some(input_idx); + } + // Fallthrough to alt match. + } + 2 => { + let rem_code = rem_ptr.cast::().read_unaligned(); + let in_code = in_ptr.cast::().read_unaligned(); + let xor = rem_code ^ in_code; + if xor == 0 { + return Some(input_idx + 1); + } else if xor > 0x00FF { + rem_pattern = &rem_pattern[1..]; + input = &input[1..]; + input_idx += 1; + pat_idx += 1; + } + // Fallthrough to alt match. 
+ } + 3 => { + let rem_code = rem_ptr.cast::().read_unaligned(); + let in_code = in_ptr.cast::().read_unaligned(); + let xor = rem_code ^ in_code; + if xor == 0 { + if rem_ptr.add(2).read() == in_ptr.add(2).read() { + return Some(input_idx + 2); + } else { + rem_pattern = &rem_pattern[2..]; + input = &input[2..]; + input_idx += 2; + pat_idx += 2; + } + } else if xor & 0x00FF == 0 { + rem_pattern = &rem_pattern[1..]; + input = &input[1..]; + input_idx += 1; + pat_idx += 1; + } + // Fallthrough to alt match. + } + 4 => { + let rem_code = rem_ptr.cast::().read_unaligned(); + let in_code = in_ptr.cast::().read_unaligned(); + let xor = rem_code ^ in_code; + if xor == 0 { + return Some(input_idx + 3); + } else { + let mismatch = (xor.trailing_zeros() / 8) as usize; + rem_pattern = &rem_pattern[mismatch..]; + input = &input[mismatch..]; + input_idx += mismatch; + pat_idx += mismatch; + } + // Fallthrough to alt match. + } + 5..=7 => { + let rem_code = rem_ptr.cast::().read_unaligned(); + let in_code = in_ptr.cast::().read_unaligned(); + let xor = rem_code ^ in_code; + if xor == 0 { + let offset = rem_pattern.len() ^ 4; + let rem_code = rem_ptr.add(offset).cast::().read_unaligned(); + let in_code = in_ptr.add(offset).cast::().read_unaligned(); + let xor = rem_code ^ in_code; + if xor == 0 { + return Some(input_idx + rem_pattern.len() - 1); + } else { + let mismatch = (xor.trailing_zeros() / 8) as usize + offset; + rem_pattern = &rem_pattern[mismatch..]; + input = &input[mismatch..]; + input_idx += mismatch; + pat_idx += mismatch; + } + } else { + let mismatch = (xor.trailing_zeros() / 8) as usize; + rem_pattern = &rem_pattern[mismatch..]; + input = &input[mismatch..]; + input_idx += mismatch; + pat_idx += mismatch; + } + // Fallthrough to alt match. 
+ } + 8 => { + let rem_code = rem_ptr.cast::().read_unaligned(); + let in_code = in_ptr.cast::().read_unaligned(); + let xor = rem_code ^ in_code; + if xor == 0 { + return Some(input_idx + 7); + } else { + let mismatch = (xor.trailing_zeros() / 8) as usize; + rem_pattern = &rem_pattern[mismatch..]; + input = &input[mismatch..]; + input_idx += mismatch; + pat_idx += mismatch; + } + // Fallthrough to alt match. + } + 9..=15 => { + let rem_code = rem_ptr.cast::().read_unaligned(); + let in_code = in_ptr.cast::().read_unaligned(); + let xor = rem_code ^ in_code; + if xor == 0 { + let offset = rem_pattern.len() ^ 8; + let rem_code = rem_ptr.add(offset).cast::().read_unaligned(); + let in_code = in_ptr.add(offset).cast::().read_unaligned(); + let xor = rem_code ^ in_code; + if xor == 0 { + return Some(input_idx + rem_pattern.len() - 1); + } else { + let mismatch = (xor.trailing_zeros() / 8) as usize + offset; + rem_pattern = &rem_pattern[mismatch..]; + input = &input[mismatch..]; + input_idx += mismatch; + pat_idx += mismatch; + } + } else { + let mismatch = (xor.trailing_zeros() / 8) as usize; + rem_pattern = &rem_pattern[mismatch..]; + input = &input[mismatch..]; + input_idx += mismatch; + pat_idx += mismatch; + } + // Fallthrough to alt match. + } + 16 => { + let rem_vec = _mm_loadu_si128(rem_ptr.cast()); + let in_vec = _mm_loadu_si128(in_ptr.cast()); + let cmpeq = _mm_cmpeq_epi8(rem_vec, in_vec); + let mask = _mm_movemask_epi8(cmpeq); + if mask == 0xFFFF { + return Some(input_idx + 15); + } else { + let mismatch = mask.trailing_ones() as usize; + rem_pattern = &rem_pattern[mismatch..]; + input = &input[mismatch..]; + input_idx += mismatch; + pat_idx += mismatch; + } + // Fallthrough to alt match. 
+ } + 17..=31 => { + let rem_vec = _mm_loadu_si128(rem_ptr.cast()); + let in_vec = _mm_loadu_si128(in_ptr.cast()); + let cmpeq = _mm_cmpeq_epi8(rem_vec, in_vec); + let mask = _mm_movemask_epi8(cmpeq); + if mask == 0xFFFF { + let offset = rem_pattern.len() ^ 16; + let rem_vec = _mm_loadu_si128(rem_ptr.add(offset).cast()); + let in_vec = _mm_loadu_si128(in_ptr.add(offset).cast()); + let cmpeq = _mm_cmpeq_epi8(rem_vec, in_vec); + let mask = _mm_movemask_epi8(cmpeq); + if mask == 0xFFFF { + return Some(input_idx + rem_pattern.len() - 1); + } else { + let mismatch = mask.trailing_ones() as usize + offset; + rem_pattern = &rem_pattern[mismatch..]; + input = &input[mismatch..]; + input_idx += mismatch; + pat_idx += mismatch; + } + } else { + let mismatch = mask.trailing_ones() as usize; + rem_pattern = &rem_pattern[mismatch..]; + input = &input[mismatch..]; + input_idx += mismatch; + pat_idx += mismatch; + } + // Fallthrough to alt match. + } + 32 => { + let rem_vec = _mm256_loadu_si256(rem_ptr.cast()); + let in_vec = _mm256_loadu_si256(in_ptr.cast()); + let cmpeq = _mm256_cmpeq_epi8(rem_vec, in_vec); + let mask = _mm256_movemask_epi8(cmpeq) as u32; + if mask == 0xFFFF_FFFF { + return Some(input_idx + 31); + } else { + let mismatch = mask.trailing_ones() as usize; + rem_pattern = &rem_pattern[mismatch..]; + input = &input[mismatch..]; + input_idx += mismatch; + pat_idx += mismatch; + } + // Fallthrough to alt match. 
+ } + 33..=63 => { + let rem_vec = _mm256_loadu_si256(rem_ptr.cast()); + let in_vec = _mm256_loadu_si256(in_ptr.cast()); + let cmpeq = _mm256_cmpeq_epi8(rem_vec, in_vec); + let mask = _mm256_movemask_epi8(cmpeq) as u32; + if mask == 0xFFFF_FFFF { + let offset = rem_pattern.len() ^ 32; + let rem_vec = _mm256_loadu_si256(rem_ptr.add(offset).cast()); + let in_vec = _mm256_loadu_si256(in_ptr.add(offset).cast()); + let cmpeq = _mm256_cmpeq_epi8(rem_vec, in_vec); + let mask = _mm256_movemask_epi8(cmpeq) as u32; + if mask == 0xFFFF_FFFF { + return Some(input_idx + rem_pattern.len() - 1); + } else { + let mismatch = mask.trailing_ones() as usize + offset; + rem_pattern = &rem_pattern[mismatch..]; + input = &input[mismatch..]; + input_idx += mismatch; + pat_idx += mismatch; + } + } else { + let mismatch = mask.trailing_ones() as usize; + rem_pattern = &rem_pattern[mismatch..]; + input = &input[mismatch..]; + input_idx += mismatch; + pat_idx += mismatch; + } + // Fallthrough to alt match. + } + 64.. => { + let rem_vec = _mm256_loadu_si256(rem_ptr.cast()); + let in_vec = _mm256_loadu_si256(in_ptr.cast()); + let cmpeq = _mm256_cmpeq_epi8(rem_vec, in_vec); + let mask = _mm256_movemask_epi8(cmpeq) as u32; + if mask == 0xFFFF_FFFF { + let rem_vec = _mm256_loadu_si256(rem_ptr.add(32).cast()); + let in_vec = _mm256_loadu_si256(in_ptr.add(32).cast()); + let cmpeq = _mm256_cmpeq_epi8(rem_vec, in_vec); + let mask = _mm256_movemask_epi8(cmpeq) as u32; + if mask == 0xFFFF_FFFF { + rem_pattern = &rem_pattern[64..]; + input = &input[64..]; + input_idx += 64; + pat_idx += 64; + continue; + } else { + let mismatch = mask.trailing_ones() as usize + 32; + rem_pattern = &rem_pattern[mismatch..]; + input = &input[mismatch..]; + input_idx += mismatch; + pat_idx += mismatch; + } + } else { + let mismatch = mask.trailing_ones() as usize; + rem_pattern = &rem_pattern[mismatch..]; + input = &input[mismatch..]; + input_idx += mismatch; + pat_idx += mismatch; + } + // Fallthrough to alt match. 
+ } + } + match super::shared::attempt_alt_match_forward(pattern, &input, pat_idx) { + AlternativeMatchResult::Continue(input_offset, pat_offset) => { + rem_pattern = &rem_pattern[pat_offset..]; + input.offset(input_offset); + input_idx += input_offset; + pat_idx += pat_offset; + } + AlternativeMatchResult::Mismatch => return None, + } + } + + #[allow(clippy::if_then_some_else_none)] // The -1 can overflow if the condition is false. + if rem_pattern.is_empty() { + Some(input_idx - 1) + } else { + None + } + } + } + + fn pattern_match_backward(pattern: &crate::StringPattern, input: &[u8]) -> Option { + super::nosimd::NosimdStringMatcher::pattern_match_backward(pattern, input) + } +} + +#[cfg(test)] +mod tests { + use crate::{ + string_pattern::matcher::{avx2_64::Avx2StringMatcher64, StringPatternMatcher}, + StringPattern, + }; + use rsonpath_syntax::str::JsonString; + use test_case::test_case; + + #[test_case("abc\n\u{01F980}'abc", "\"abc\\n\u{01F980}'abc\""; "str1")] + #[test_case("abc\n\u{01F980}'abc", "\"\\u0061bc\\n\u{01F980}'abc\""; "str2")] + #[test_case("abc\n\u{01F980}'abc", "\"\\u0061bc\\u000a\u{01F980}'abc\""; "str3")] + #[test_case("abc\n\u{01F980}'abc", "\"\\u0061bc\\u000A\u{01F980}'abc\""; "str4")] + #[test_case("abc\n\u{01F980}'abc", "\"\\u0061bc\\u000A\u{01F980}\\'abc\""; "str5")] + #[test_case("abc\n\u{01F980}'abc", "\"\\u0061bc\\u000A\\uD83E\\uDd80\\'abc\""; "str6")] + fn test(pat: &str, str: &str) { + let js = JsonString::new(pat); + let pattern = StringPattern::new(&js); + + let str = str.as_bytes(); + let res_forward = Avx2StringMatcher64::pattern_match_forward(&pattern, str); + let res_backward = Avx2StringMatcher64::pattern_match_backward(&pattern, str); + + assert_eq!(res_forward, Some(str.len() - 1)); + assert_eq!(res_backward, Some(0)); + } +} diff --git a/crates/rsonpath-lib/src/string_pattern/matcher/nosimd.rs b/crates/rsonpath-lib/src/string_pattern/matcher/nosimd.rs new file mode 100644 index 00000000..e9023acf --- /dev/null +++ 
b/crates/rsonpath-lib/src/string_pattern/matcher/nosimd.rs
@@ -0,0 +1,153 @@
+use super::{shared::AlternativeMatchResult, MatcherInput, StringPatternMatcher};
+use crate::string_pattern::AlternativeRepresentation;
+
+/// Default, non-vectorised [`StringPattern`](crate::string_pattern::StringPattern) matcher.
+///
+/// This is mostly exposed for testing purposes.
+pub struct NosimdStringMatcher;
+
+impl NosimdStringMatcher {
+    /// Matches `pattern` against the beginning of `input`, byte by byte.
+    ///
+    /// Each input byte is compared with the quoted pattern; where they differ, the alternative
+    /// representation recorded for that pattern position is tried instead.
+    ///
+    /// Returns the index of the last input byte consumed by the match, or `None` if the input
+    /// does not match or the loop stops before the whole pattern has been consumed.
+    pub(crate) fn pattern_match_forward<I: MatcherInput>(pattern: &crate::StringPattern, input: I) -> Option<usize> {
+        let mut rem_pattern: &[u8] = pattern.quoted();
+        let mut input = input;
+        let mut pat_idx = 0;
+        let mut input_idx = 0;
+
+        while !rem_pattern.is_empty() && rem_pattern.len() <= input.len() {
+            if rem_pattern[0] == input.read_u8(0) {
+                rem_pattern = &rem_pattern[1..];
+                input.offset(1);
+                input_idx += 1;
+                pat_idx += 1;
+            } else {
+                match super::shared::attempt_alt_match_forward(pattern, &input, pat_idx) {
+                    AlternativeMatchResult::Continue(input_offset, pat_offset) => {
+                        rem_pattern = &rem_pattern[pat_offset..];
+                        input.offset(input_offset);
+                        input_idx += input_offset;
+                        pat_idx += pat_offset;
+                    }
+                    AlternativeMatchResult::Mismatch => return None,
+                }
+            }
+        }
+
+        #[allow(clippy::if_then_some_else_none)] // The -1 can overflow if the condition is false.
+        if rem_pattern.is_empty() {
+            Some(input_idx - 1)
+        } else {
+            None
+        }
+    }
+
+    /// Matches `pattern` against the end of `input`, byte by byte, walking backwards.
+    ///
+    /// Mirrors [`Self::pattern_match_forward`]: bytes are compared with the quoted pattern and
+    /// differences fall back to the alternative representation of that pattern position.
+    ///
+    /// Returns the index of the first input byte of the match (the number of input bytes left
+    /// in front of it), or `None` if the input does not match.
+    pub(crate) fn pattern_match_backward<I: MatcherInput>(pattern: &crate::StringPattern, input: I) -> Option<usize> {
+        let mut rem_pattern: &[u8] = pattern.quoted();
+        let mut input = input;
+        let mut pat_len = rem_pattern.len();
+        let mut input_len = input.len();
+
+        while !rem_pattern.is_empty() && rem_pattern.len() <= input.len() {
+            if rem_pattern[pat_len - 1] == input.read_u8(input_len - 1) {
+                rem_pattern = &rem_pattern[..pat_len - 1];
+                input.offset_back(1);
+                input_len -= 1;
+                pat_len -= 1;
+                continue;
+            } else if pat_len < pattern.alternatives.len() && input.read_u8(input_len - 1) == b'\\' {
+                // When going backwards there's one nasty special case.
+                // If the character ' or / is escaped it did match bytewise in the previous
+                // iteration, but a backslash here should also be accepted.
+                if let AlternativeRepresentation::SlashByteOrUSingle(_, _) = pattern.alternatives[pat_len] {
+                    input.offset_back(1);
+                    input_len -= 1;
+                    continue;
+                }
+            }
+            match super::shared::attempt_alt_match_backward(pattern, &input, input_len, pat_len - 1) {
+                AlternativeMatchResult::Continue(input_offset, pat_offset) => {
+                    rem_pattern = &rem_pattern[..pat_len - pat_offset];
+                    input.offset_back(input_offset);
+                    input_len -= input_offset;
+                    pat_len -= pat_offset;
+                }
+                AlternativeMatchResult::Mismatch => return None,
+            }
+        }
+
+        rem_pattern.is_empty().then_some(input_len)
+    }
+}
+
+// Inside this impl `Self::pattern_match_*` resolves to the inherent generic functions above
+// (inherent associated functions take precedence over trait ones), so the calls do not recurse.
+impl StringPatternMatcher for NosimdStringMatcher {
+    fn pattern_match_forward(pattern: &crate::StringPattern, input: &[u8]) -> Option<usize> {
+        Self::pattern_match_forward(pattern, input)
+    }
+
+    fn pattern_match_backward(pattern: &crate::StringPattern, input: &[u8]) -> Option<usize> {
+        Self::pattern_match_backward(pattern, input)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::{string_pattern::matcher::nosimd::NosimdStringMatcher, StringPattern};
+    use rsonpath_syntax::str::JsonString;
+    use test_case::test_case;
+
+    #[test_case("abc\n\u{01F980}'abc", "\"abc\\n\u{01F980}'abc\""; "str1")]
+    #[test_case("abc\n\u{01F980}'abc", "\"\\u0061bc\\n\u{01F980}'abc\""; "str2")]
+    #[test_case("abc\n\u{01F980}'abc", "\"\\u0061bc\\u000a\u{01F980}'abc\""; "str3")]
+    #[test_case("abc\n\u{01F980}'abc", "\"\\u0061bc\\u000A\u{01F980}'abc\""; "str4")]
+    #[test_case("abc\n\u{01F980}'abc", "\"\\u0061bc\\u000A\u{01F980}\\'abc\""; "str5")]
+    #[test_case("abc\n\u{01F980}'abc", "\"\\u0061bc\\u000A\\uD83E\\uDd80\\'abc\""; "str6")]
+    fn test(pat: &str, str: &str) {
+        let js = JsonString::new(pat);
+        let pattern = StringPattern::new(&js);
+
+        let str = str.as_bytes();
+        let res_forward = NosimdStringMatcher::pattern_match_forward(&pattern, str);
+        let res_backward = NosimdStringMatcher::pattern_match_backward(&pattern, str);
+
+        assert_eq!(res_forward, Some(str.len() - 1));
+        assert_eq!(res_backward, Some(0));
+
+        // The same bytes, handed over as two chunks split at every index `i`.
+        for i in 0..str.len() {
+            let (first, second) = str.split_at(i);
+            let res_forward = NosimdStringMatcher::pattern_match_forward(&pattern, (first, second));
+            let res_backward = NosimdStringMatcher::pattern_match_backward(&pattern, (first, second));
+
+            assert_eq!(res_forward, Some(str.len() - 1));
+            assert_eq!(res_backward, Some(0));
+        }
+
+        // The same bytes, handed over as three chunks split at every pair of indices `i`, `j`.
+        for i in 0..str.len() {
+            let (first, second_and_third) = str.split_at(i);
+            for j in 0..second_and_third.len() {
+                let (second, third) = second_and_third.split_at(j);
+                let res_forward = NosimdStringMatcher::pattern_match_forward(&pattern, (first, second, third));
+                let res_backward = NosimdStringMatcher::pattern_match_backward(&pattern, (first, second, third));
+
+                assert_eq!(res_forward, Some(str.len() - 1));
+                assert_eq!(res_backward, Some(0));
+            }
+        }
+    }
+}
diff --git a/crates/rsonpath-lib/src/string_pattern/matcher/shared.rs b/crates/rsonpath-lib/src/string_pattern/matcher/shared.rs
new file mode 100644
index 00000000..3cf78bbe
--- /dev/null
+++ b/crates/rsonpath-lib/src/string_pattern/matcher/shared.rs
@@ -0,0 +1,139 @@
+use super::MatcherInput;
+use crate::{string_pattern::AlternativeRepresentation, StringPattern};
+
+/// The byte pair `\u`, packed with native endianness for comparison with `read_u16` results.
+const SLASH_U_CODE: u16 = u16::from_ne_bytes([b'\\', b'u']);
+/// OR-ed into the four bytes that follow `\u` before they are compared with a stored code.
+/// Setting bit `0x20` in every byte folds ASCII `A`-`F` onto `a`-`f` and leaves ASCII digits
+/// unchanged, so the hex digits are matched case-insensitively.
+const ASCII_LOWERCASE: u32 = 0x2020_2020;
+
+/// Outcome of trying the alternative representation stored for one pattern position.
+pub(super) enum AlternativeMatchResult {
+    /// The alternative matched: `Continue(input_bytes_consumed, pattern_bytes_consumed)`.
+    Continue(usize, usize),
+    /// The alternative does not match the input at this position.
+    Mismatch,
+}
+
+/// Tries to match the alternative representation stored at `pattern.alternatives[pat_idx]`
+/// at the start of `input`.
+///
+/// The forward matchers call this when the plain byte comparison fails. On success the result
+/// says how many bytes to advance in the input and in the pattern.
+///
+/// Note that `|` binds tighter than `==` in Rust, so `read_u32(..) | ASCII_LOWERCASE == code`
+/// compares the case-folded bytes with `code`.
+#[inline(always)]
+pub(super) fn attempt_alt_match_forward<I: MatcherInput>(
+    pattern: &StringPattern,
+    input: &I,
+    pat_idx: usize,
+) -> AlternativeMatchResult {
+    match pattern.alternatives[pat_idx] {
+        AlternativeRepresentation::None => AlternativeMatchResult::Mismatch,
+        AlternativeRepresentation::SlashUSingle(code, pat_offset) => {
+            if input.len() >= 6 && input.read_u16(0) == SLASH_U_CODE && input.read_u32(2) | ASCII_LOWERCASE == code {
+                AlternativeMatchResult::Continue(6, pat_offset as usize)
+            } else {
+                AlternativeMatchResult::Mismatch
+            }
+        }
+        AlternativeRepresentation::SlashUPair(code_1, code_2, pat_offset) => {
+            if input.len() >= 12
+                && input.read_u16(0) == SLASH_U_CODE
+                && input.read_u32(2) | ASCII_LOWERCASE == code_1
+                && input.read_u16(6) == SLASH_U_CODE
+                && input.read_u32(8) | ASCII_LOWERCASE == code_2
+            {
+                AlternativeMatchResult::Continue(12, pat_offset as usize)
+            } else {
+                AlternativeMatchResult::Mismatch
+            }
+        }
+        AlternativeRepresentation::USingle(code) => {
+            if input.len() >= 5 && input.read_u8(0) == b'u' && input.read_u32(1) | ASCII_LOWERCASE == code {
+                AlternativeMatchResult::Continue(5, 1)
+            } else {
+                AlternativeMatchResult::Mismatch
+            }
+        }
+        AlternativeRepresentation::SlashByteOrUSingle(byte, code) => {
+            if input.len() >= 2 && input.read_u8(0) == b'\\' && input.read_u8(1) == byte {
+                AlternativeMatchResult::Continue(2, 1)
+            } else if input.len() >= 6
+                && input.read_u16(0) == SLASH_U_CODE
+                && input.read_u32(2) | ASCII_LOWERCASE == code
+            {
+                AlternativeMatchResult::Continue(6, 1)
+            } else {
+                AlternativeMatchResult::Mismatch
+            }
+        }
+    }
+}
+
+/// Tries to match the alternative representation stored at `pattern.alternatives[pat_idx]`
+/// at the end of `input`.
+///
+/// `input_len` is the caller's count of input bytes not yet consumed from the back; all reads are
+/// addressed relative to it. The length guards check `input_len` itself, i.e. the very value
+/// the offsets are subtracted from, so the subtractions cannot underflow.
+///
+/// On success the result says how many bytes to drop from the back of the input and of the
+/// pattern.
+#[inline(always)]
+pub(super) fn attempt_alt_match_backward<I: MatcherInput>(
+    pattern: &StringPattern,
+    input: &I,
+    input_len: usize,
+    pat_idx: usize,
+) -> AlternativeMatchResult {
+    match pattern.alternatives[pat_idx] {
+        AlternativeRepresentation::None => AlternativeMatchResult::Mismatch,
+        AlternativeRepresentation::SlashUSingle(code, pat_offset) => {
+            if input_len >= 6
+                && input.read_u16(input_len - 6) == SLASH_U_CODE
+                && input.read_u32(input_len - 4) | ASCII_LOWERCASE == code
+            {
+                AlternativeMatchResult::Continue(6, pat_offset as usize)
+            } else {
+                AlternativeMatchResult::Mismatch
+            }
+        }
+        AlternativeRepresentation::SlashUPair(code_1, code_2, pat_offset) => {
+            if input_len >= 12
+                && input.read_u16(input_len - 12) == SLASH_U_CODE
+                && input.read_u32(input_len - 10) | ASCII_LOWERCASE == code_1
+                && input.read_u16(input_len - 6) == SLASH_U_CODE
+                && input.read_u32(input_len - 4) | ASCII_LOWERCASE == code_2
+            {
+                AlternativeMatchResult::Continue(12, pat_offset as usize)
+            } else {
+                AlternativeMatchResult::Mismatch
+            }
+        }
+        AlternativeRepresentation::USingle(code) => {
+            if input_len >= 5
+                && input.read_u8(input_len - 5) == b'u'
+                && input.read_u32(input_len - 4) | ASCII_LOWERCASE == code
+            {
+                AlternativeMatchResult::Continue(5, 1)
+            } else {
+                AlternativeMatchResult::Mismatch
+            }
+        }
+        AlternativeRepresentation::SlashByteOrUSingle(byte, code) => {
+            if input_len >= 2 && input.read_u8(input_len - 2) == b'\\' && input.read_u8(input_len - 1) == byte {
+                AlternativeMatchResult::Continue(2, 1)
+            } else if input_len >= 6
+                && input.read_u16(input_len - 6) == SLASH_U_CODE
+                && input.read_u32(input_len - 4) | ASCII_LOWERCASE == code
+            {
+                AlternativeMatchResult::Continue(6, 1)
+            } else {
+                AlternativeMatchResult::Mismatch
+            }
+        }
+    }
+}
diff --git a/crates/rsonpath-lib/tests/input_implementation_tests.rs b/crates/rsonpath-lib/tests/input_implementation_tests.rs
index 2a57bb45..62a0473c 100644
--- a/crates/rsonpath-lib/tests/input_implementation_tests.rs
+++ b/crates/rsonpath-lib/tests/input_implementation_tests.rs
@@ -2,7 +2,7 @@ use pretty_assertions::assert_eq;
 use rsonpath::{
input::{error::InputError, *}, result::empty::EmptyRecorder, - StringPattern, + DefaultStringMatcher, StringPattern, }; use std::{cmp, fs, fs::File, io::Read, iter}; use test_case::test_case; @@ -261,10 +261,12 @@ impl InMemoryTestInput { fn test_positive_is_member_match_buffered(bytes: &[u8], from: usize, to: usize, json_string: &StringPattern) { let input = create_buffered(bytes); - let result = input.is_member_match(from, to, json_string).expect("match succeeds"); + let result = input + .pattern_match_from::(from, json_string) + .expect("match succeeds"); // Buffered is never padded from the start. - assert!(result); + assert_eq!(result, Some(to - 1)); } fn test_positive_is_member_match_borrowed(bytes: &[u8], from: usize, to: usize, json_string: &StringPattern) { @@ -273,9 +275,11 @@ impl InMemoryTestInput { // Need to take padding into account. let from = from + input.leading_padding_len(); let to = to + input.leading_padding_len(); - let result = input.is_member_match(from, to, json_string).expect("match succeeds"); + let result = input + .pattern_match_from::(from, json_string) + .expect("match succeeds"); - assert!(result); + assert_eq!(result, Some(to - 1)); } fn test_positive_is_member_match_owned(bytes: &[u8], from: usize, to: usize, json_string: &StringPattern) { @@ -284,9 +288,11 @@ impl InMemoryTestInput { // Need to take padding into account. 
let from = from + input.leading_padding_len(); let to = to + input.leading_padding_len(); - let result = input.is_member_match(from, to, json_string).expect("match succeeds"); + let result = input + .pattern_match_from::(from, json_string) + .expect("match succeeds"); - assert!(result); + assert_eq!(result, Some(to - 1)); } fn test_padding_buffered(bytes: &[u8]) { diff --git a/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__jsonpath_example_query.snap b/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__jsonpath_example_query.snap index f9042dd5..a1072a39 100644 --- a/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__jsonpath_example_query.snap +++ b/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__jsonpath_example_query.snap @@ -1,7 +1,6 @@ --- source: crates/rsonpath-lib/tests/engine_serialization_snapshots.rs expression: "&engine(\"$..phoneNumbers[*].number\")?" -snapshot_kind: text --- (V1, Automaton( states: [ @@ -14,9 +13,106 @@ snapshot_kind: text StateTable( attributes: StateAttributes(0), member_transitions: [ - (StringPattern(JsonString( - quoted: "\"phoneNumbers\"", - )), State(2)), + (StringPattern( + bytes: [ + 34, + 112, + 104, + 111, + 110, + 101, + 78, + 117, + 109, + 98, + 101, + 114, + 115, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(808923184, 1), + SlashUSingle(943075376, 1), + SlashUSingle(1714827312, 1), + SlashUSingle(1698050096, 1), + SlashUSingle(892743728, 1), + SlashUSingle(1697919024, 1), + SlashUSingle(892809264, 1), + SlashUSingle(1681272880, 1), + 
SlashUSingle(842412080, 1), + SlashUSingle(892743728, 1), + SlashUSingle(842477616, 1), + SlashUSingle(859254832, 1), + None, + ], + len: 14, + len_limit: 74, + ), State(2)), ], array_transitions: [], fallback_state: State(1), @@ -24,9 +120,106 @@ snapshot_kind: text StateTable( attributes: StateAttributes(0), member_transitions: [ - (StringPattern(JsonString( - quoted: "\"phoneNumbers\"", - )), State(4)), + (StringPattern( + bytes: [ + 34, + 112, + 104, + 111, + 110, + 101, + 78, + 117, + 109, + 98, + 101, + 114, + 115, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(808923184, 1), + SlashUSingle(943075376, 1), + SlashUSingle(1714827312, 1), + SlashUSingle(1698050096, 1), + SlashUSingle(892743728, 1), + SlashUSingle(1697919024, 1), + SlashUSingle(892809264, 1), + SlashUSingle(1681272880, 1), + SlashUSingle(842412080, 1), + SlashUSingle(892743728, 1), + SlashUSingle(842477616, 1), + SlashUSingle(859254832, 1), + None, + ], + len: 14, + len_limit: 74, + ), State(4)), ], array_transitions: [], fallback_state: State(3), @@ -34,12 +227,194 @@ snapshot_kind: text StateTable( attributes: StateAttributes(8), member_transitions: [ - (StringPattern(JsonString( - quoted: "\"phoneNumbers\"", - )), State(2)), - (StringPattern(JsonString( - quoted: "\"number\"", - )), State(6)), + (StringPattern( + bytes: [ + 34, + 112, + 104, + 111, + 110, + 101, + 78, + 117, + 109, + 98, + 101, + 114, + 115, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 
32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(808923184, 1), + SlashUSingle(943075376, 1), + SlashUSingle(1714827312, 1), + SlashUSingle(1698050096, 1), + SlashUSingle(892743728, 1), + SlashUSingle(1697919024, 1), + SlashUSingle(892809264, 1), + SlashUSingle(1681272880, 1), + SlashUSingle(842412080, 1), + SlashUSingle(892743728, 1), + SlashUSingle(842477616, 1), + SlashUSingle(859254832, 1), + None, + ], + len: 14, + len_limit: 74, + ), State(2)), + (StringPattern( + bytes: [ + 34, + 110, + 117, + 109, + 98, + 101, + 114, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(1698050096, 1), + SlashUSingle(892809264, 1), + SlashUSingle(1681272880, 1), + SlashUSingle(842412080, 1), + SlashUSingle(892743728, 1), + SlashUSingle(842477616, 1), + None, + ], + len: 8, + len_limit: 38, + ), State(6)), ], array_transitions: [], fallback_state: State(1), @@ -47,12 +422,194 @@ snapshot_kind: text StateTable( attributes: StateAttributes(8), member_transitions: [ - (StringPattern(JsonString( - quoted: "\"phoneNumbers\"", - )), State(4)), - (StringPattern(JsonString( - quoted: "\"number\"", - )), State(5)), + (StringPattern( + bytes: [ + 34, + 112, + 104, + 111, + 110, + 101, + 78, + 117, + 109, + 98, + 101, + 114, + 115, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 
32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(808923184, 1), + SlashUSingle(943075376, 1), + SlashUSingle(1714827312, 1), + SlashUSingle(1698050096, 1), + SlashUSingle(892743728, 1), + SlashUSingle(1697919024, 1), + SlashUSingle(892809264, 1), + SlashUSingle(1681272880, 1), + SlashUSingle(842412080, 1), + SlashUSingle(892743728, 1), + SlashUSingle(842477616, 1), + SlashUSingle(859254832, 1), + None, + ], + len: 14, + len_limit: 74, + ), State(4)), + (StringPattern( + bytes: [ + 34, + 110, + 117, + 109, + 98, + 101, + 114, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(1698050096, 1), + SlashUSingle(892809264, 1), + SlashUSingle(1681272880, 1), + SlashUSingle(842412080, 1), + SlashUSingle(892743728, 1), + SlashUSingle(842477616, 1), + None, + ], + len: 8, + len_limit: 38, + ), State(5)), ], array_transitions: [], fallback_state: State(3), @@ -60,12 +617,194 @@ snapshot_kind: text StateTable( attributes: StateAttributes(9), member_transitions: [ - (StringPattern(JsonString( - quoted: "\"phoneNumbers\"", - )), State(2)), - (StringPattern(JsonString( - quoted: "\"number\"", - )), State(6)), + (StringPattern( + bytes: [ + 34, + 112, + 104, + 111, + 110, + 101, + 78, + 117, + 109, + 98, + 101, + 114, + 115, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 
32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(808923184, 1), + SlashUSingle(943075376, 1), + SlashUSingle(1714827312, 1), + SlashUSingle(1698050096, 1), + SlashUSingle(892743728, 1), + SlashUSingle(1697919024, 1), + SlashUSingle(892809264, 1), + SlashUSingle(1681272880, 1), + SlashUSingle(842412080, 1), + SlashUSingle(892743728, 1), + SlashUSingle(842477616, 1), + SlashUSingle(859254832, 1), + None, + ], + len: 14, + len_limit: 74, + ), State(2)), + (StringPattern( + bytes: [ + 34, + 110, + 117, + 109, + 98, + 101, + 114, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(1698050096, 1), + SlashUSingle(892809264, 1), + SlashUSingle(1681272880, 1), + SlashUSingle(842412080, 1), + SlashUSingle(892743728, 1), + SlashUSingle(842477616, 1), + None, + ], + len: 8, + len_limit: 38, + ), State(6)), ], array_transitions: [], fallback_state: State(1), @@ -73,9 +812,106 @@ snapshot_kind: text StateTable( attributes: StateAttributes(1), member_transitions: [ - (StringPattern(JsonString( - quoted: "\"phoneNumbers\"", - )), State(2)), + (StringPattern( + bytes: [ + 34, + 112, + 104, + 111, + 110, + 101, + 78, + 117, + 109, + 98, + 101, + 114, + 115, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, 
+ 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(808923184, 1), + SlashUSingle(943075376, 1), + SlashUSingle(1714827312, 1), + SlashUSingle(1698050096, 1), + SlashUSingle(892743728, 1), + SlashUSingle(1697919024, 1), + SlashUSingle(892809264, 1), + SlashUSingle(1681272880, 1), + SlashUSingle(842412080, 1), + SlashUSingle(892743728, 1), + SlashUSingle(842477616, 1), + SlashUSingle(859254832, 1), + None, + ], + len: 14, + len_limit: 74, + ), State(2)), ], array_transitions: [], fallback_state: State(1), diff --git a/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__readme_query.snap b/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__readme_query.snap index e40ccafb..8c154c41 100644 --- a/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__readme_query.snap +++ b/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__readme_query.snap @@ -1,7 +1,6 @@ --- source: crates/rsonpath-lib/tests/engine_serialization_snapshots.rs expression: "&engine(\"$.jsonpath[*]\")?" 
-snapshot_kind: text --- (V1, Automaton( states: [ @@ -14,9 +13,98 @@ snapshot_kind: text StateTable( attributes: StateAttributes(4), member_transitions: [ - (StringPattern(JsonString( - quoted: "\"jsonpath\"", - )), State(2)), + (StringPattern( + bytes: [ + 34, + 106, + 115, + 111, + 110, + 112, + 97, + 116, + 104, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(1630941232, 1), + SlashUSingle(859254832, 1), + SlashUSingle(1714827312, 1), + SlashUSingle(1698050096, 1), + SlashUSingle(808923184, 1), + SlashUSingle(825634864, 1), + SlashUSingle(876032048, 1), + SlashUSingle(943075376, 1), + None, + ], + len: 10, + len_limit: 50, + ), State(2)), ], array_transitions: [], fallback_state: State(0), diff --git a/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__real_life_query.snap b/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__real_life_query.snap index 29240077..1531d68c 100644 --- a/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__real_life_query.snap +++ b/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__real_life_query.snap @@ -1,7 +1,6 @@ --- source: crates/rsonpath-lib/tests/engine_serialization_snapshots.rs expression: "&engine(\"$.personal.details.contact.information.phones.home\")?" 
-snapshot_kind: text --- (V1, Automaton( states: [ @@ -14,9 +13,98 @@ snapshot_kind: text StateTable( attributes: StateAttributes(4), member_transitions: [ - (StringPattern(JsonString( - quoted: "\"personal\"", - )), State(2)), + (StringPattern( + bytes: [ + 34, + 112, + 101, + 114, + 115, + 111, + 110, + 97, + 108, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(808923184, 1), + SlashUSingle(892743728, 1), + SlashUSingle(842477616, 1), + SlashUSingle(859254832, 1), + SlashUSingle(1714827312, 1), + SlashUSingle(1698050096, 1), + SlashUSingle(825634864, 1), + SlashUSingle(1664495664, 1), + None, + ], + len: 10, + len_limit: 50, + ), State(2)), ], array_transitions: [], fallback_state: State(0), @@ -24,9 +112,96 @@ snapshot_kind: text StateTable( attributes: StateAttributes(4), member_transitions: [ - (StringPattern(JsonString( - quoted: "\"details\"", - )), State(3)), + (StringPattern( + bytes: [ + 34, + 100, + 101, + 116, + 97, + 105, + 108, + 115, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(875966512, 1), + SlashUSingle(892743728, 1), + SlashUSingle(876032048, 1), + SlashUSingle(825634864, 1), + SlashUSingle(959852592, 1), + SlashUSingle(1664495664, 1), + SlashUSingle(859254832, 1), + None, + ], + 
len: 9, + len_limit: 44, + ), State(3)), ], array_transitions: [], fallback_state: State(0), @@ -34,9 +209,96 @@ snapshot_kind: text StateTable( attributes: StateAttributes(4), member_transitions: [ - (StringPattern(JsonString( - quoted: "\"contact\"", - )), State(4)), + (StringPattern( + bytes: [ + 34, + 99, + 111, + 110, + 116, + 97, + 99, + 116, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(859189296, 1), + SlashUSingle(1714827312, 1), + SlashUSingle(1698050096, 1), + SlashUSingle(876032048, 1), + SlashUSingle(825634864, 1), + SlashUSingle(859189296, 1), + SlashUSingle(876032048, 1), + None, + ], + len: 9, + len_limit: 44, + ), State(4)), ], array_transitions: [], fallback_state: State(0), @@ -44,9 +306,104 @@ snapshot_kind: text StateTable( attributes: StateAttributes(4), member_transitions: [ - (StringPattern(JsonString( - quoted: "\"information\"", - )), State(5)), + (StringPattern( + bytes: [ + 34, + 105, + 110, + 102, + 111, + 114, + 109, + 97, + 116, + 105, + 111, + 110, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(959852592, 1), + SlashUSingle(1698050096, 1), + SlashUSingle(909520944, 1), + SlashUSingle(1714827312, 1), + SlashUSingle(842477616, 1), + SlashUSingle(1681272880, 1), + 
SlashUSingle(825634864, 1), + SlashUSingle(876032048, 1), + SlashUSingle(959852592, 1), + SlashUSingle(1714827312, 1), + SlashUSingle(1698050096, 1), + None, + ], + len: 13, + len_limit: 68, + ), State(5)), ], array_transitions: [], fallback_state: State(0), @@ -54,9 +411,94 @@ snapshot_kind: text StateTable( attributes: StateAttributes(4), member_transitions: [ - (StringPattern(JsonString( - quoted: "\"phones\"", - )), State(6)), + (StringPattern( + bytes: [ + 34, + 112, + 104, + 111, + 110, + 101, + 115, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(808923184, 1), + SlashUSingle(943075376, 1), + SlashUSingle(1714827312, 1), + SlashUSingle(1698050096, 1), + SlashUSingle(892743728, 1), + SlashUSingle(859254832, 1), + None, + ], + len: 8, + len_limit: 38, + ), State(6)), ], array_transitions: [], fallback_state: State(0), @@ -64,9 +506,90 @@ snapshot_kind: text StateTable( attributes: StateAttributes(12), member_transitions: [ - (StringPattern(JsonString( - quoted: "\"home\"", - )), State(7)), + (StringPattern( + bytes: [ + 34, + 104, + 111, + 109, + 101, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(943075376, 1), + SlashUSingle(1714827312, 1), + SlashUSingle(1681272880, 1), + SlashUSingle(892743728, 
1), + None, + ], + len: 6, + len_limit: 26, + ), State(7)), ], array_transitions: [], fallback_state: State(0), diff --git a/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__slice.snap b/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__slice.snap index 212a92ec..e3379de7 100644 --- a/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__slice.snap +++ b/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__slice.snap @@ -1,7 +1,6 @@ --- source: crates/rsonpath-lib/tests/engine_serialization_snapshots.rs expression: "&engine(\"$..entries[3:5:7]\")?" -snapshot_kind: text --- (V1, Automaton( states: [ @@ -14,9 +13,96 @@ snapshot_kind: text StateTable( attributes: StateAttributes(0), member_transitions: [ - (StringPattern(JsonString( - quoted: "\"entries\"", - )), State(2)), + (StringPattern( + bytes: [ + 34, + 101, + 110, + 116, + 114, + 105, + 101, + 115, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(892743728, 1), + SlashUSingle(1698050096, 1), + SlashUSingle(876032048, 1), + SlashUSingle(842477616, 1), + SlashUSingle(959852592, 1), + SlashUSingle(892743728, 1), + SlashUSingle(859254832, 1), + None, + ], + len: 9, + len_limit: 44, + ), State(2)), ], array_transitions: [], fallback_state: State(1), @@ -24,9 +110,96 @@ snapshot_kind: text StateTable( attributes: StateAttributes(56), member_transitions: [ - (StringPattern(JsonString( - quoted: "\"entries\"", - )), State(2)), + (StringPattern( + bytes: [ + 34, + 101, + 110, + 116, + 114, + 105, + 101, + 115, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 
32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(892743728, 1), + SlashUSingle(1698050096, 1), + SlashUSingle(876032048, 1), + SlashUSingle(842477616, 1), + SlashUSingle(959852592, 1), + SlashUSingle(892743728, 1), + SlashUSingle(859254832, 1), + None, + ], + len: 9, + len_limit: 44, + ), State(2)), ], array_transitions: [ ArrayTransition( @@ -39,9 +212,96 @@ snapshot_kind: text StateTable( attributes: StateAttributes(1), member_transitions: [ - (StringPattern(JsonString( - quoted: "\"entries\"", - )), State(2)), + (StringPattern( + bytes: [ + 34, + 101, + 110, + 116, + 114, + 105, + 101, + 115, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(892743728, 1), + SlashUSingle(1698050096, 1), + SlashUSingle(876032048, 1), + SlashUSingle(842477616, 1), + SlashUSingle(959852592, 1), + SlashUSingle(892743728, 1), + SlashUSingle(859254832, 1), + None, + ], + len: 9, + len_limit: 44, + ), State(2)), ], array_transitions: [], fallback_state: State(1), diff --git a/crates/rsonpath-test/Cargo.toml b/crates/rsonpath-test/Cargo.toml index 081b11bd..9fe5bd61 100644 --- a/crates/rsonpath-test/Cargo.toml +++ b/crates/rsonpath-test/Cargo.toml @@ -19,7 +19,7 @@ serde_json = { workspace = true } [dev-dependencies] pretty_assertions = { workspace = true } -rsonpath-lib 
= { workspace = true } +rsonpath-lib = { workspace = true, features = ["simd"] } rsonpath-syntax = { workspace = true } [build-dependencies] diff --git a/crates/rsonpath-test/documents/toml/escapes.toml b/crates/rsonpath-test/documents/toml/escapes.toml index 972996d0..41e082ad 100644 --- a/crates/rsonpath-test/documents/toml/escapes.toml +++ b/crates/rsonpath-test/documents/toml/escapes.toml @@ -20,8 +20,6 @@ json_string = ''' [[queries]] query = '''$..a..b..['label\\']''' description = "select label with one actual backslash, which is two backslashes in the query" -disabled.reason = "we do not handle escapes correctly yet" -disabled.issue = "https://github.com/V0ldek/rsonpath/issues/117" [queries.results] count = 1 @@ -31,8 +29,6 @@ nodes = ["42"] [[queries]] query = '''$..a..b..['label\\\\']''' description = "select label with two actual backslashes (four backslashes in the query), which does not exist" -disabled.reason = "we do not handle escapes correctly yet" -disabled.issue = "https://github.com/V0ldek/rsonpath/issues/117" [queries.results] count = 0 diff --git a/crates/rsonpath-test/documents/toml/extremely_long_key.toml b/crates/rsonpath-test/documents/toml/extremely_long_key.toml index 5ffe7779..53c4d30c 100644 --- a/crates/rsonpath-test/documents/toml/extremely_long_key.toml +++ b/crates/rsonpath-test/documents/toml/extremely_long_key.toml @@ -19,8 +19,6 @@ query = "$..['This is meant to force the buffered input to have to read ahead to # Short descritpion of the query semantics. description = "select the long key" -disabled.issue = "https://github.com/V0ldek/rsonpath/issues/117" -disabled.reason = "we do not handle escapes correctly yet" [queries.results] # Number of expected matches. 
diff --git a/crates/rsonpath-test/documents/toml/memchr_trap.toml b/crates/rsonpath-test/documents/toml/memchr_trap.toml index fad36f95..6b4b0eef 100644 --- a/crates/rsonpath-test/documents/toml/memchr_trap.toml +++ b/crates/rsonpath-test/documents/toml/memchr_trap.toml @@ -24,8 +24,6 @@ nodes = ["43"] [[queries]] query = """$..['"b']""" description = "descendant search for 'b' with a leading quote" -disabled.issue = "https://github.com/V0ldek/rsonpath/issues/117" -disabled.reason = "we do not handle escapes correctly yet" [queries.results] count = 1 diff --git a/crates/rsonpath-test/documents/toml/quote_escape.toml b/crates/rsonpath-test/documents/toml/quote_escape.toml index 3c5370f9..840dd020 100644 --- a/crates/rsonpath-test/documents/toml/quote_escape.toml +++ b/crates/rsonpath-test/documents/toml/quote_escape.toml @@ -22,8 +22,6 @@ nodes = ["17"] [[queries]] query = """$['"x']""" description = "select 'x' with quote directly" -disabled.issue = "https://github.com/V0ldek/rsonpath/issues/117" -disabled.reason = "we do not handle escapes correctly yet" [queries.results] count = 1 diff --git a/crates/rsonpath-test/documents/toml/unicode_escape.toml b/crates/rsonpath-test/documents/toml/unicode_escape.toml new file mode 100644 index 00000000..1998d43e --- /dev/null +++ b/crates/rsonpath-test/documents/toml/unicode_escape.toml @@ -0,0 +1,50 @@ +# Define the JSON input for all query test cases. +[input] +# Short description of the input structure. +description = "Arrays of equivalent objects with different byte-encodings of labels" +# Set to true only if your specific test input is fully compressed (no extraneous whitespace). +is_compressed = false + +# Inline JSON document. 
+[input.source] +json_string = ''' +[ + [ + {"a":1}, + {"\u0061":2}, + {"filler":"this is filler long enough to fill a whole block of classification, so that the head-skipper has to go to the SIMD loop instead of doing just the first block"}, + {"\u0061":3} + ] +] +''' + +# Define queries to test on the input. +[[queries]] +# Valid JSONPath query string. +query = "$..['a']" +# Short description of the query semantics. +description = "select all as" + +[queries.results] +# Number of expected matches. +count = 3 +# Byte locations of spans of all matches, in order. +spans = [[15, 16], [33, 34], [227, 228]] +# Stringified values of all matches, verbatim as in the input, +# in the same order as above. +nodes = ['1', '2', '3'] + +[[queries]] +# Valid JSONPath query string. +query = "$..['\\u0061']" +# Short description of the query semantics. +description = "select all as by unicode escape" + +[queries.results] +# Number of expected matches. +count = 3 +# Byte locations of spans of all matches, in order. +spans = [[15, 16], [33, 34], [227, 228]] +# Stringified values of all matches, verbatim as in the input, +# in the same order as above. 
+nodes = ['1', '2', '3'] \ No newline at end of file From a126e863aa9be07c80fbd4fdd8968e1e2e589fc0 Mon Sep 17 00:00:00 2001 From: Mateusz Gienieczko Date: Sun, 20 Apr 2025 23:53:26 +0200 Subject: [PATCH 2/2] split snaps between 32-bit and 64-bit archs --- .../src/classification/memmem/nosimd.rs | 2 +- .../src/string_pattern/matcher/avx2_64.rs | 3 +- .../tests/engine_serialization_snapshots.rs | 40 +- ...alization_snapshots__ron__empty_query.snap | 1 - ...shots__ron__jsonpath_example_query_32.snap | 632 ++++++++++++++++++ ...hots__ron__jsonpath_example_query_64.snap} | 0 ...ation_snapshots__ron__readme_query_32.snap | 93 +++ ...tion_snapshots__ron__readme_query_64.snap} | 0 ...on_snapshots__ron__real_life_query_32.snap | 412 ++++++++++++ ...n_snapshots__ron__real_life_query_64.snap} | 0 ...erialization_snapshots__ron__slice_32.snap | 214 ++++++ ...rialization_snapshots__ron__slice_64.snap} | 0 12 files changed, 1390 insertions(+), 7 deletions(-) create mode 100644 crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__jsonpath_example_query_32.snap rename crates/rsonpath-lib/tests/snapshots/{engine_serialization_snapshots__ron__jsonpath_example_query.snap => engine_serialization_snapshots__ron__jsonpath_example_query_64.snap} (100%) create mode 100644 crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__readme_query_32.snap rename crates/rsonpath-lib/tests/snapshots/{engine_serialization_snapshots__ron__readme_query.snap => engine_serialization_snapshots__ron__readme_query_64.snap} (100%) create mode 100644 crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__real_life_query_32.snap rename crates/rsonpath-lib/tests/snapshots/{engine_serialization_snapshots__ron__real_life_query.snap => engine_serialization_snapshots__ron__real_life_query_64.snap} (100%) create mode 100644 crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__slice_32.snap rename 
crates/rsonpath-lib/tests/snapshots/{engine_serialization_snapshots__ron__slice.snap => engine_serialization_snapshots__ron__slice_64.snap} (100%) diff --git a/crates/rsonpath-lib/src/classification/memmem/nosimd.rs b/crates/rsonpath-lib/src/classification/memmem/nosimd.rs index e2b42170..11c94b72 100644 --- a/crates/rsonpath-lib/src/classification/memmem/nosimd.rs +++ b/crates/rsonpath-lib/src/classification/memmem/nosimd.rs @@ -46,7 +46,7 @@ where phantom: PhantomData, } -impl<'i, 'b, 'r, I, SM, R, const N: usize> SequentialMemmemClassifier<'i, 'b, 'r, I, SM, R, N> +impl<'i, 'r, I, SM, R, const N: usize> SequentialMemmemClassifier<'i, '_, 'r, I, SM, R, N> where I: Input, SM: StringPatternMatcher, diff --git a/crates/rsonpath-lib/src/string_pattern/matcher/avx2_64.rs b/crates/rsonpath-lib/src/string_pattern/matcher/avx2_64.rs index 8e337e09..6bcf1418 100644 --- a/crates/rsonpath-lib/src/string_pattern/matcher/avx2_64.rs +++ b/crates/rsonpath-lib/src/string_pattern/matcher/avx2_64.rs @@ -247,7 +247,8 @@ impl StringPatternMatcher for Avx2StringMatcher64 { } // Fallthrough to alt match. } - 64.. 
=> { + _ => { + // >= 64 let rem_vec = _mm256_loadu_si256(rem_ptr.cast()); let in_vec = _mm256_loadu_si256(in_ptr.cast()); let cmpeq = _mm256_cmpeq_epi8(rem_vec, in_vec); diff --git a/crates/rsonpath-lib/tests/engine_serialization_snapshots.rs b/crates/rsonpath-lib/tests/engine_serialization_snapshots.rs index 8fa71e52..76d42bea 100644 --- a/crates/rsonpath-lib/tests/engine_serialization_snapshots.rs +++ b/crates/rsonpath-lib/tests/engine_serialization_snapshots.rs @@ -17,25 +17,57 @@ mod ron { } #[test] - fn readme_query() -> Result<(), Box> { + #[cfg(target_pointer_width = "64")] + fn readme_query_64() -> Result<(), Box> { assert_ron_snapshot!(&engine("$.jsonpath[*]")?); Ok(()) } #[test] - fn jsonpath_example_query() -> Result<(), Box> { + #[cfg(target_pointer_width = "32")] + fn readme_query_32() -> Result<(), Box> { + assert_ron_snapshot!(&engine("$.jsonpath[*]")?); + Ok(()) + } + + #[test] + #[cfg(target_pointer_width = "64")] + fn jsonpath_example_query_64() -> Result<(), Box> { + assert_ron_snapshot!(&engine("$..phoneNumbers[*].number")?); + Ok(()) + } + + #[test] + #[cfg(target_pointer_width = "32")] + fn jsonpath_example_query_32() -> Result<(), Box> { assert_ron_snapshot!(&engine("$..phoneNumbers[*].number")?); Ok(()) } #[test] - fn real_life_query() -> Result<(), Box> { + #[cfg(target_pointer_width = "64")] + fn real_life_query_64() -> Result<(), Box> { assert_ron_snapshot!(&engine("$.personal.details.contact.information.phones.home")?); Ok(()) } #[test] - fn slice() -> Result<(), Box> { + #[cfg(target_pointer_width = "32")] + fn real_life_query_32() -> Result<(), Box> { + assert_ron_snapshot!(&engine("$.personal.details.contact.information.phones.home")?); + Ok(()) + } + + #[test] + #[cfg(target_pointer_width = "64")] + fn slice_64() -> Result<(), Box> { + assert_ron_snapshot!(&engine("$..entries[3:5:7]")?); + Ok(()) + } + + #[test] + #[cfg(target_pointer_width = "32")] + fn slice_32() -> Result<(), Box> { 
assert_ron_snapshot!(&engine("$..entries[3:5:7]")?); Ok(()) } diff --git a/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__empty_query.snap b/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__empty_query.snap index 7b6e463d..92548f3e 100644 --- a/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__empty_query.snap +++ b/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__empty_query.snap @@ -1,7 +1,6 @@ --- source: crates/rsonpath-lib/tests/engine_serialization_snapshots.rs expression: "&engine(\"$\")?" -snapshot_kind: text --- (V1, Automaton( states: [ diff --git a/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__jsonpath_example_query_32.snap b/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__jsonpath_example_query_32.snap new file mode 100644 index 00000000..d771530e --- /dev/null +++ b/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__jsonpath_example_query_32.snap @@ -0,0 +1,632 @@ +--- +source: crates/rsonpath-lib/tests/engine_serialization_snapshots.rs +expression: "&engine(\"$..phoneNumbers[*].number\")?" 
+--- +(V1, Automaton( + states: [ + StateTable( + attributes: StateAttributes(2), + member_transitions: [], + array_transitions: [], + fallback_state: State(0), + ), + StateTable( + attributes: StateAttributes(0), + member_transitions: [ + (StringPattern( + bytes: [ + 34, + 112, + 104, + 111, + 110, + 101, + 78, + 117, + 109, + 98, + 101, + 114, + 115, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(808923184, 1), + SlashUSingle(943075376, 1), + SlashUSingle(1714827312, 1), + SlashUSingle(1698050096, 1), + SlashUSingle(892743728, 1), + SlashUSingle(1697919024, 1), + SlashUSingle(892809264, 1), + SlashUSingle(1681272880, 1), + SlashUSingle(842412080, 1), + SlashUSingle(892743728, 1), + SlashUSingle(842477616, 1), + SlashUSingle(859254832, 1), + None, + ], + len: 14, + len_limit: 74, + ), State(2)), + ], + array_transitions: [], + fallback_state: State(1), + ), + StateTable( + attributes: StateAttributes(0), + member_transitions: [ + (StringPattern( + bytes: [ + 34, + 112, + 104, + 111, + 110, + 101, + 78, + 117, + 109, + 98, + 101, + 114, + 115, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(808923184, 1), + SlashUSingle(943075376, 1), + SlashUSingle(1714827312, 1), + SlashUSingle(1698050096, 1), + SlashUSingle(892743728, 1), + SlashUSingle(1697919024, 1), + SlashUSingle(892809264, 1), + SlashUSingle(1681272880, 1), + SlashUSingle(842412080, 1), + SlashUSingle(892743728, 1), + SlashUSingle(842477616, 1), + SlashUSingle(859254832, 1), + None, + ], + len: 14, + len_limit: 74, + ), State(4)), + ], + array_transitions: [], + fallback_state: State(3), + ), + StateTable( + attributes: 
StateAttributes(8), + member_transitions: [ + (StringPattern( + bytes: [ + 34, + 112, + 104, + 111, + 110, + 101, + 78, + 117, + 109, + 98, + 101, + 114, + 115, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(808923184, 1), + SlashUSingle(943075376, 1), + SlashUSingle(1714827312, 1), + SlashUSingle(1698050096, 1), + SlashUSingle(892743728, 1), + SlashUSingle(1697919024, 1), + SlashUSingle(892809264, 1), + SlashUSingle(1681272880, 1), + SlashUSingle(842412080, 1), + SlashUSingle(892743728, 1), + SlashUSingle(842477616, 1), + SlashUSingle(859254832, 1), + None, + ], + len: 14, + len_limit: 74, + ), State(2)), + (StringPattern( + bytes: [ + 34, + 110, + 117, + 109, + 98, + 101, + 114, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(1698050096, 1), + SlashUSingle(892809264, 1), + SlashUSingle(1681272880, 1), + SlashUSingle(842412080, 1), + SlashUSingle(892743728, 1), + SlashUSingle(842477616, 1), + None, + ], + len: 8, + len_limit: 38, + ), State(6)), + ], + array_transitions: [], + fallback_state: State(1), + ), + StateTable( + attributes: StateAttributes(8), + member_transitions: [ + (StringPattern( + bytes: [ + 34, + 112, + 104, + 111, + 110, + 101, + 78, + 117, + 109, + 98, + 101, + 114, + 115, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(808923184, 1), + SlashUSingle(943075376, 1), + SlashUSingle(1714827312, 1), + SlashUSingle(1698050096, 1), + SlashUSingle(892743728, 1), + 
SlashUSingle(1697919024, 1), + SlashUSingle(892809264, 1), + SlashUSingle(1681272880, 1), + SlashUSingle(842412080, 1), + SlashUSingle(892743728, 1), + SlashUSingle(842477616, 1), + SlashUSingle(859254832, 1), + None, + ], + len: 14, + len_limit: 74, + ), State(4)), + (StringPattern( + bytes: [ + 34, + 110, + 117, + 109, + 98, + 101, + 114, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(1698050096, 1), + SlashUSingle(892809264, 1), + SlashUSingle(1681272880, 1), + SlashUSingle(842412080, 1), + SlashUSingle(892743728, 1), + SlashUSingle(842477616, 1), + None, + ], + len: 8, + len_limit: 38, + ), State(5)), + ], + array_transitions: [], + fallback_state: State(3), + ), + StateTable( + attributes: StateAttributes(9), + member_transitions: [ + (StringPattern( + bytes: [ + 34, + 112, + 104, + 111, + 110, + 101, + 78, + 117, + 109, + 98, + 101, + 114, + 115, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(808923184, 1), + SlashUSingle(943075376, 1), + SlashUSingle(1714827312, 1), + SlashUSingle(1698050096, 1), + SlashUSingle(892743728, 1), + SlashUSingle(1697919024, 1), + SlashUSingle(892809264, 1), + SlashUSingle(1681272880, 1), + SlashUSingle(842412080, 1), + SlashUSingle(892743728, 1), + SlashUSingle(842477616, 1), + SlashUSingle(859254832, 1), + None, + ], + len: 14, + len_limit: 74, + ), State(2)), + (StringPattern( + bytes: [ + 34, + 110, + 117, + 109, + 98, + 101, + 114, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + 
alternatives: [ + None, + SlashUSingle(1698050096, 1), + SlashUSingle(892809264, 1), + SlashUSingle(1681272880, 1), + SlashUSingle(842412080, 1), + SlashUSingle(892743728, 1), + SlashUSingle(842477616, 1), + None, + ], + len: 8, + len_limit: 38, + ), State(6)), + ], + array_transitions: [], + fallback_state: State(1), + ), + StateTable( + attributes: StateAttributes(1), + member_transitions: [ + (StringPattern( + bytes: [ + 34, + 112, + 104, + 111, + 110, + 101, + 78, + 117, + 109, + 98, + 101, + 114, + 115, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(808923184, 1), + SlashUSingle(943075376, 1), + SlashUSingle(1714827312, 1), + SlashUSingle(1698050096, 1), + SlashUSingle(892743728, 1), + SlashUSingle(1697919024, 1), + SlashUSingle(892809264, 1), + SlashUSingle(1681272880, 1), + SlashUSingle(842412080, 1), + SlashUSingle(892743728, 1), + SlashUSingle(842477616, 1), + SlashUSingle(859254832, 1), + None, + ], + len: 14, + len_limit: 74, + ), State(2)), + ], + array_transitions: [], + fallback_state: State(1), + ), + ], +)) diff --git a/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__jsonpath_example_query.snap b/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__jsonpath_example_query_64.snap similarity index 100% rename from crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__jsonpath_example_query.snap rename to crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__jsonpath_example_query_64.snap diff --git a/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__readme_query_32.snap b/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__readme_query_32.snap new file mode 100644 index 00000000..c838abc7 --- /dev/null +++ 
b/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__readme_query_32.snap @@ -0,0 +1,93 @@ +--- +source: crates/rsonpath-lib/tests/engine_serialization_snapshots.rs +expression: "&engine(\"$.jsonpath[*]\")?" +--- +(V1, Automaton( + states: [ + StateTable( + attributes: StateAttributes(2), + member_transitions: [], + array_transitions: [], + fallback_state: State(0), + ), + StateTable( + attributes: StateAttributes(4), + member_transitions: [ + (StringPattern( + bytes: [ + 34, + 106, + 115, + 111, + 110, + 112, + 97, + 116, + 104, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(1630941232, 1), + SlashUSingle(859254832, 1), + SlashUSingle(1714827312, 1), + SlashUSingle(1698050096, 1), + SlashUSingle(808923184, 1), + SlashUSingle(825634864, 1), + SlashUSingle(876032048, 1), + SlashUSingle(943075376, 1), + None, + ], + len: 10, + len_limit: 50, + ), State(2)), + ], + array_transitions: [], + fallback_state: State(0), + ), + StateTable( + attributes: StateAttributes(8), + member_transitions: [], + array_transitions: [], + fallback_state: State(3), + ), + StateTable( + attributes: StateAttributes(1), + member_transitions: [], + array_transitions: [], + fallback_state: State(0), + ), + ], +)) diff --git a/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__readme_query.snap b/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__readme_query_64.snap similarity index 100% rename from crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__readme_query.snap rename to crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__readme_query_64.snap diff --git a/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__real_life_query_32.snap 
b/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__real_life_query_32.snap new file mode 100644 index 00000000..d2edbf94 --- /dev/null +++ b/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__real_life_query_32.snap @@ -0,0 +1,412 @@ +--- +source: crates/rsonpath-lib/tests/engine_serialization_snapshots.rs +expression: "&engine(\"$.personal.details.contact.information.phones.home\")?" +--- +(V1, Automaton( + states: [ + StateTable( + attributes: StateAttributes(2), + member_transitions: [], + array_transitions: [], + fallback_state: State(0), + ), + StateTable( + attributes: StateAttributes(4), + member_transitions: [ + (StringPattern( + bytes: [ + 34, + 112, + 101, + 114, + 115, + 111, + 110, + 97, + 108, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(808923184, 1), + SlashUSingle(892743728, 1), + SlashUSingle(842477616, 1), + SlashUSingle(859254832, 1), + SlashUSingle(1714827312, 1), + SlashUSingle(1698050096, 1), + SlashUSingle(825634864, 1), + SlashUSingle(1664495664, 1), + None, + ], + len: 10, + len_limit: 50, + ), State(2)), + ], + array_transitions: [], + fallback_state: State(0), + ), + StateTable( + attributes: StateAttributes(4), + member_transitions: [ + (StringPattern( + bytes: [ + 34, + 100, + 101, + 116, + 97, + 105, + 108, + 115, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(875966512, 1), + SlashUSingle(892743728, 1), + SlashUSingle(876032048, 1), + SlashUSingle(825634864, 1), + SlashUSingle(959852592, 1), + SlashUSingle(1664495664, 1), + SlashUSingle(859254832, 1), + None, + ], + len: 9, + len_limit: 44, + ), 
State(3)), + ], + array_transitions: [], + fallback_state: State(0), + ), + StateTable( + attributes: StateAttributes(4), + member_transitions: [ + (StringPattern( + bytes: [ + 34, + 99, + 111, + 110, + 116, + 97, + 99, + 116, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(859189296, 1), + SlashUSingle(1714827312, 1), + SlashUSingle(1698050096, 1), + SlashUSingle(876032048, 1), + SlashUSingle(825634864, 1), + SlashUSingle(859189296, 1), + SlashUSingle(876032048, 1), + None, + ], + len: 9, + len_limit: 44, + ), State(4)), + ], + array_transitions: [], + fallback_state: State(0), + ), + StateTable( + attributes: StateAttributes(4), + member_transitions: [ + (StringPattern( + bytes: [ + 34, + 105, + 110, + 102, + 111, + 114, + 109, + 97, + 116, + 105, + 111, + 110, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(959852592, 1), + SlashUSingle(1698050096, 1), + SlashUSingle(909520944, 1), + SlashUSingle(1714827312, 1), + SlashUSingle(842477616, 1), + SlashUSingle(1681272880, 1), + SlashUSingle(825634864, 1), + SlashUSingle(876032048, 1), + SlashUSingle(959852592, 1), + SlashUSingle(1714827312, 1), + SlashUSingle(1698050096, 1), + None, + ], + len: 13, + len_limit: 68, + ), State(5)), + ], + array_transitions: [], + fallback_state: State(0), + ), + StateTable( + attributes: StateAttributes(4), + member_transitions: [ + (StringPattern( + bytes: [ + 34, + 112, + 104, + 111, + 110, + 101, + 115, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 
32, + ], + alternatives: [ + None, + SlashUSingle(808923184, 1), + SlashUSingle(943075376, 1), + SlashUSingle(1714827312, 1), + SlashUSingle(1698050096, 1), + SlashUSingle(892743728, 1), + SlashUSingle(859254832, 1), + None, + ], + len: 8, + len_limit: 38, + ), State(6)), + ], + array_transitions: [], + fallback_state: State(0), + ), + StateTable( + attributes: StateAttributes(12), + member_transitions: [ + (StringPattern( + bytes: [ + 34, + 104, + 111, + 109, + 101, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(943075376, 1), + SlashUSingle(1714827312, 1), + SlashUSingle(1681272880, 1), + SlashUSingle(892743728, 1), + None, + ], + len: 6, + len_limit: 26, + ), State(7)), + ], + array_transitions: [], + fallback_state: State(0), + ), + StateTable( + attributes: StateAttributes(1), + member_transitions: [], + array_transitions: [], + fallback_state: State(0), + ), + ], +)) diff --git a/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__real_life_query.snap b/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__real_life_query_64.snap similarity index 100% rename from crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__real_life_query.snap rename to crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__real_life_query_64.snap diff --git a/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__slice_32.snap b/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__slice_32.snap new file mode 100644 index 00000000..acf42b9f --- /dev/null +++ b/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__slice_32.snap @@ -0,0 +1,214 @@ +--- +source: crates/rsonpath-lib/tests/engine_serialization_snapshots.rs +expression: 
"&engine(\"$..entries[3:5:7]\")?" +--- +(V1, Automaton( + states: [ + StateTable( + attributes: StateAttributes(2), + member_transitions: [], + array_transitions: [], + fallback_state: State(0), + ), + StateTable( + attributes: StateAttributes(0), + member_transitions: [ + (StringPattern( + bytes: [ + 34, + 101, + 110, + 116, + 114, + 105, + 101, + 115, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(892743728, 1), + SlashUSingle(1698050096, 1), + SlashUSingle(876032048, 1), + SlashUSingle(842477616, 1), + SlashUSingle(959852592, 1), + SlashUSingle(892743728, 1), + SlashUSingle(859254832, 1), + None, + ], + len: 9, + len_limit: 44, + ), State(2)), + ], + array_transitions: [], + fallback_state: State(1), + ), + StateTable( + attributes: StateAttributes(56), + member_transitions: [ + (StringPattern( + bytes: [ + 34, + 101, + 110, + 116, + 114, + 105, + 101, + 115, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(892743728, 1), + SlashUSingle(1698050096, 1), + SlashUSingle(876032048, 1), + SlashUSingle(842477616, 1), + SlashUSingle(959852592, 1), + SlashUSingle(892743728, 1), + SlashUSingle(859254832, 1), + None, + ], + len: 9, + len_limit: 44, + ), State(2)), + ], + array_transitions: [ + ArrayTransition( + label: Index(JsonUInt(3)), + target: State(3), + ), + ], + fallback_state: State(1), + ), + StateTable( + attributes: StateAttributes(1), + member_transitions: [ + (StringPattern( + bytes: [ + 34, + 101, + 110, + 116, + 114, + 105, + 101, + 115, + 34, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 
32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + ], + alternatives: [ + None, + SlashUSingle(892743728, 1), + SlashUSingle(1698050096, 1), + SlashUSingle(876032048, 1), + SlashUSingle(842477616, 1), + SlashUSingle(959852592, 1), + SlashUSingle(892743728, 1), + SlashUSingle(859254832, 1), + None, + ], + len: 9, + len_limit: 44, + ), State(2)), + ], + array_transitions: [], + fallback_state: State(1), + ), + ], +)) diff --git a/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__slice.snap b/crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__slice_64.snap similarity index 100% rename from crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__slice.snap rename to crates/rsonpath-lib/tests/snapshots/engine_serialization_snapshots__ron__slice_64.snap