diff --git a/Cargo.lock b/Cargo.lock index 033f40303b..1fe9e69b4d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -715,6 +715,12 @@ dependencies = [ "cc", ] +[[package]] +name = "borrow-or-share" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc0b364ead1874514c8c2855ab558056ebfeb775653e7ae45ff72f28f8f3166c" + [[package]] name = "build_common" version = "28.0.3" @@ -1960,6 +1966,17 @@ dependencies = [ "num-traits", ] +[[package]] +name = "fluent-uri" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc74ac4d8359ae70623506d512209619e5cf8f347124910440dbc221714b328e" +dependencies = [ + "borrow-or-share", + "ref-cast", + "serde", +] + [[package]] name = "fnv" version = "1.0.7" @@ -3382,6 +3399,7 @@ dependencies = [ "anyhow", "criterion", "duplicate", + "fluent-uri", "libdd-common", "libdd-trace-protobuf", "libdd-trace-utils", @@ -3390,7 +3408,6 @@ dependencies = [ "regex", "serde", "serde_json", - "url", ] [[package]] diff --git a/LICENSE-3rdparty.yml b/LICENSE-3rdparty.yml index db7415cf26..c74c198f76 100644 --- a/LICENSE-3rdparty.yml +++ b/LICENSE-3rdparty.yml @@ -5197,6 +5197,31 @@ third_party_libraries: licenses: - license: MIT text: NOT FOUND +- package_name: borrow-or-share + package_version: 0.2.4 + repository: https://github.com/yescallop/borrow-or-share + license: MIT-0 + licenses: + - license: MIT-0 + text: |- + MIT No Attribution + + Copyright 2024 Scallop Ye + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. - package_name: bumpalo package_version: 3.17.0 repository: https://github.com/fitzgen/bumpalo @@ -11607,6 +11632,34 @@ third_party_libraries: LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +- package_name: fluent-uri + package_version: 0.4.1 + repository: https://github.com/yescallop/fluent-uri-rs + license: MIT + licenses: + - license: MIT + text: |- + MIT License + + Copyright (c) 2024 Scallop Ye + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. - package_name: fnv package_version: 1.0.7 repository: https://github.com/servo/rust-fnv diff --git a/libdd-trace-obfuscation/Cargo.toml b/libdd-trace-obfuscation/Cargo.toml index 6b812ecc43..eccb0fd93d 100644 --- a/libdd-trace-obfuscation/Cargo.toml +++ b/libdd-trace-obfuscation/Cargo.toml @@ -14,12 +14,12 @@ anyhow = "1.0" regex = "1" serde = { version = "1.0.145", features = ["derive"] } serde_json = { version = "1.0", features = ["preserve_order"] } -url = "^2.5.4" percent-encoding = "2.1" log = "0.4" libdd-trace-protobuf = { version = "1.1.0", path = "../libdd-trace-protobuf" } libdd-trace-utils = { version = "2.0.0", path = "../libdd-trace-utils" } libdd-common = { version = "2.0.0", path = "../libdd-common" } +fluent-uri = "0.4.1" [dev-dependencies] duplicate = "0.4.1" diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index ff22241cc4..a6dce5bfda 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -1,48 +1,220 @@ // Copyright 2023-Present Datadog, Inc. https://www.datadoghq.com/ // SPDX-License-Identifier: Apache-2.0 +// FIXME: once obfuscation feature parity is reached with the agent, change both modules to be more +// restrictive on the accepted forms of urls so that this module can be greatly simplified. +// One idea for now is to match the url to a regex on both sides to validate it + +use fluent_uri::UriRef; use percent_encoding::percent_decode_str; -use url::Url; +use std::fmt::Write; + +/// Returns true for Go net/url's "category 1" characters: +/// ASCII bytes that always trigger escaping in URLs (plus space and quote).
+fn is_go_url_escape_cat1(c: char) -> bool { + matches!( + c, + '\\' | '^' | '{' | '}' | '|' | '<' | '>' | '`' | ' ' | '"' + ) +} + +/// Returns true for Go net/url's "category 2" characters for PATH contexts: +/// characters Go may escape in paths when Cat1 is present or non-ASCII exists. +fn is_go_url_escape_cat2_path(c: char) -> bool { + matches!(c, '!' | '\'' | '(' | ')' | '*' | '[' | ']') +} + +/// Returns true for Go net/url's "category 2" characters for FRAGMENT contexts: +/// characters Go may escape in fragments when non-ASCII exists. +fn is_go_url_escape_cat2_fragment(c: char) -> bool { + matches!(c, '\'' | '[' | ']') +} + +fn hex_val(b: u8) -> u8 { + match b { + b'0'..=b'9' => b - b'0', + b'a'..=b'f' => b - b'a' + 10, + _ => b - b'A' + 10, + } +} + +/// Decode %XX for unreserved chars (A-Za-z0-9-._~) in path, matching Go's url.Parse behavior. +fn normalize_pct_encoded_unreserved(path: &str) -> String { + let b = path.as_bytes(); + let mut out = String::with_capacity(path.len()); + let mut i = 0; + while i < b.len() { + if b[i] == b'%' + && i + 2 < b.len() + && b[i + 1].is_ascii_hexdigit() + && b[i + 2].is_ascii_hexdigit() + { + let v = (hex_val(b[i + 1]) << 4) | hex_val(b[i + 2]); + if v.is_ascii_alphanumeric() || matches!(v, b'.' | b'_' | b'~') { + out.push(v as char); + } else { + out.push_str(&path[i..i + 3]); + } + i += 3; + } else { + out.push(b[i] as char); + i += 1; + } + } + out +} + +fn encode_char(out: &mut String, c: char) { + let mut buf = [0u8; 4]; + for &b in c.encode_utf8(&mut buf).as_bytes() { + let _ = write!(out, "%{b:02X}"); + } +} + +fn redact_path_digits(path: &str) -> String { + path.split('/') + .map(|seg| { + if percent_decode_str(seg) + .decode_utf8_lossy() + .chars() + .any(|c| c.is_ascii_digit()) + { + "?" 
+ } else { + seg + } + }) + .collect::<Vec<_>>() + .join("/") +} pub fn obfuscate_url_string( url: &str, remove_query_string: bool, remove_path_digits: bool, ) -> String { - let mut parsed_url = match Url::parse(url) { - Ok(res) => res, - Err(_) => return "?".to_string(), + if url.is_empty() { + return String::new(); + } + + let frag_pos = url.find('#'); + let path_query_end = frag_pos.unwrap_or(url.len()); + let path_end = url[..path_query_end].find('?').unwrap_or(path_query_end); + + // Control chars in path/query — Go rejects these + if url[..path_query_end].bytes().any(|b| b < 0x20 || b == 0x7F) { + return if remove_query_string || remove_path_digits { + "?".to_string() + } else { + url.to_string() + }; + } + + // Determine Go's escape() trigger: Cat1 or non-ASCII in path causes Cat2 encoding too + let path = &url[..path_end]; + let needs_full_path = path.bytes().any(|b| b > 127) || path.chars().any(is_go_url_escape_cat1); + let frag_has_non_ascii = frag_pos.is_some_and(|i| url[i + 1..].bytes().any(|b| b > 127)); + + // Pre-encode chars that UriRef (strict RFC 3986) rejects. + // We encode ALL non-ASCII chars (not just Cat1/Cat2) so that characters outside + // RFC 3987 ucschar ranges (e.g. U+10EF4F, U+10FFFF) don't cause parse failures. + // Exclude the query — Go doesn't validate query percent-encoding, so we pass + // only path + fragment to UriRef and restore the original query afterward.
+ let mut pre = String::with_capacity(url.len() * 4); + for c in url[..path_end].chars() { + if !c.is_ascii() { + encode_char(&mut pre, c); + } else if is_go_url_escape_cat1(c) || (needs_full_path && is_go_url_escape_cat2_path(c)) { + let _ = write!(pre, "%{:02X}", c as u8); + } else { + pre.push(c); + } + } + if let Some(fi) = frag_pos { + pre.push('#'); + for c in url[fi + 1..].chars() { + if !c.is_ascii() + || (c as u32) < 0x20 + || c as u32 == 0x7F + || c == '#' + || is_go_url_escape_cat1(c) + || (frag_has_non_ascii && is_go_url_escape_cat2_fragment(c)) + { + encode_char(&mut pre, c); + } else { + pre.push(c); + } + } + } + + let uri = match UriRef::parse(pre.as_str()) { + Ok(u) => u, + Err(_) => { + return if remove_query_string || remove_path_digits { + "?".to_string() + } else { + url.to_string() + }; + } }; - // remove username & password - parsed_url.set_username("").unwrap_or_default(); - parsed_url.set_password(Some("")).unwrap_or_default(); + let mut out = String::new(); - if remove_query_string && parsed_url.query().is_some() { - parsed_url.set_query(Some("")); + if let Some(scheme) = uri.scheme() { + out.push_str(&scheme.as_str().to_lowercase()); + out.push(':'); } - if !remove_path_digits { - return parsed_url.to_string(); + if let Some(auth) = uri.authority() { + out.push_str("//"); + // Strip userinfo — emit only host[:port] + out.push_str(auth.host()); + if let Some(port) = auth.port() { + out.push(':'); + out.push_str(port.as_str()); + } + let path_str = normalize_pct_encoded_unreserved(uri.path().as_str()); + if remove_path_digits { + out.push_str(&redact_path_digits(&path_str)); + } else { + out.push_str(&path_str); + } + } else if let Some(scheme) = uri.scheme() { + // This is a really weird case because there is a scheme but no authority. 
+ // For example: http:# + // Length of "http:" + let scheme_end = scheme.as_str().len() + 1; + // http://example.com/?query -> //example.com/ + out.push_str(&url[scheme_end..path_end]); + } else { + // Relative reference: use pre-encoded path + let path_str = normalize_pct_encoded_unreserved(uri.path().as_str()); + if remove_path_digits { + out.push_str(&redact_path_digits(&path_str)); + } else { + out.push_str(&path_str); + } } - // remove path digits - let mut split_url: Vec<&str> = parsed_url.path().split('/').collect(); - let mut changed = false; - for segment in split_url.iter_mut() { - // we don't want to redact any HTML encodings - #[allow(clippy::unwrap_used)] - let decoded = percent_decode_str(segment).decode_utf8().unwrap(); - if decoded.chars().any(|c| char::is_ascii_digit(&c)) { - *segment = "/REDACTED/"; - changed = true; + // Use original URL positions to detect query — uri.query() is always None since we + // excluded the query from the string we passed to UriRef. + if remove_query_string { + if path_end < path_query_end { + out.push('?'); } + } else if path_end < path_query_end { + // Restore original raw query (Go's url.RawQuery is kept verbatim) + out.push_str(&url[path_end..path_query_end]); } - if changed { - parsed_url.set_path(&split_url.join("/")); + + if let Some(frag) = uri.fragment() { + if !frag.as_str().is_empty() { + out.push('#'); + out.push_str(frag.as_str()); + } } - parsed_url.to_string().replace("/REDACTED/", "?") + out } #[cfg(test)] @@ -158,11 +330,402 @@ mod tests { expected_output ["http://foo.com/?/nam%3Fe/?"]; ] [ - test_name [remove_path_digits_9] + test_name [empty_input] + remove_query_string [false] + remove_path_digits [false] + input [""] + expected_output [""]; + ] + [ + test_name [non_printable_chars] + remove_query_string [false] + remove_path_digits [false] + input ["\u{10}"] + // When both options false, Go returns original (obfuscateUserInfo passthrough) + expected_output ["\u{10}"]; + ] + [ + test_name 
[non_printable_chars_and_unicode] + remove_query_string [true] + remove_path_digits [true] + input ["\u{10}ჸ"] + expected_output ["?"]; + ] + [ + test_name [hashtag] + remove_query_string [true] + remove_path_digits [true] + input ["#"] + expected_output [""]; + ] + [ + test_name [fuzzing_1050521893] + remove_query_string [true] + remove_path_digits [true] + input ["ჸ"] + expected_output ["%E1%83%B8"]; + ] + [ + test_name [fuzzing_594901251] + remove_query_string [true] + remove_path_digits [true] + input ["%"] + expected_output ["?"]; + ] + [ + test_name [fuzzing_3638045804] + remove_query_string [true] + remove_path_digits [true] + input ["."] + expected_output ["."]; + ] + [ + test_name [fuzzing_1928485962] + remove_query_string [true] + remove_path_digits [true] + input ["0"] + expected_output ["?"]; + ] + [ + test_name [fuzzing_4273565798] + remove_query_string [true] + remove_path_digits [true] + input ["!ჸ"] + expected_output ["%21%E1%83%B8"]; + ] + [ + test_name [fuzzing_1457007156] + remove_query_string [true] + remove_path_digits [true] + input ["!"] + expected_output ["!"]; + ] + [ + test_name [fuzzing_3119724369] + remove_query_string [true] + remove_path_digits [true] + input [":"] + expected_output ["?"]; + ] + [ + test_name [fuzzing_1092426409] + remove_query_string [true] + remove_path_digits [true] + input ["#ჸ"] + expected_output ["#%E1%83%B8"]; + ] + [ + test_name [fuzzing_1323831861] + remove_query_string [true] + remove_path_digits [true] + input ["#\u{01}"] + expected_output ["#%01"]; + ] + [ + test_name [fuzzing_35626170] + remove_query_string [true] + remove_path_digits [true] + input ["#\u{01}ჸ"] + expected_output ["#%01%E1%83%B8"]; + ] + [ + test_name [fuzzing_618280270] + remove_query_string [true] + remove_path_digits [true] + input ["\\"] + expected_output ["%5C"]; + ] + [ + test_name [fuzzing_1505427946] + remove_query_string [true] + remove_path_digits [true] + input ["[ჸ"] + expected_output ["%5B%E1%83%B8"]; + ] + [ + test_name 
[fuzzing_backslash_unicode] + remove_query_string [true] + remove_path_digits [true] + input ["\\ჸ"] + expected_output ["%5C%E1%83%B8"]; + ] + [ + test_name [fuzzing_2438023093] + remove_query_string [true] + remove_path_digits [true] + input ["ჸ#"] + expected_output ["%E1%83%B8"]; + ] + [ + test_name [fuzzing_2729083127] + remove_query_string [true] + remove_path_digits [true] + input ["!#ჸ"] + expected_output ["!#%E1%83%B8"]; + ] + [ + test_name [fuzzing_slash_unicode] + remove_query_string [true] + remove_path_digits [true] + input ["/ჸ"] + expected_output ["/%E1%83%B8"]; + ] + [ + test_name [fuzzing_3710129001] + remove_query_string [true] + remove_path_digits [true] + input ["##"] + expected_output ["#%23"]; + ] + [ + test_name [fuzzing_1009954227] + remove_query_string [true] + remove_path_digits [true] + input ["ჸ#\u{10}"] + expected_output ["%E1%83%B8#%10"]; + ] + [ + test_name [fuzzing_hash_exclamation] + remove_query_string [true] + remove_path_digits [true] + input ["ჸ#!"] + expected_output ["%E1%83%B8#!"]; + ] + [ + test_name [fuzzing_578834728] + remove_query_string [true] + remove_path_digits [true] + input ["#%"] + expected_output ["?"]; + ] + [ + test_name [fuzzing_3991369296] + remove_query_string [true] + remove_path_digits [true] + input ["#'ჸ"] + expected_output ["#%27%E1%83%B8"]; + ] + [ + test_name [fuzzing_path_frag_quote] + remove_query_string [true] + remove_path_digits [true] + input ["ჸ#'ჸ"] + expected_output ["%E1%83%B8#%27%E1%83%B8"]; + ] + [ + test_name [fuzzing_hash_excl_unicode] + remove_query_string [true] + remove_path_digits [true] + input ["#!ჸ"] + expected_output ["#!%E1%83%B8"]; + ] + [ + // Cat1 char (<) triggers full escape(), which also encodes Cat2 char (!) + test_name [fuzzing_2455396347_cat1_triggers_cat2] + remove_query_string [true] + remove_path_digits [true] + input ["