From 368bac5c89065d38362d6f959cbc183d59e406b2 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Tue, 3 Mar 2026 18:26:54 +0100 Subject: [PATCH 01/63] UGLY(http): fix relative url obfuscation --- libdd-trace-obfuscation/src/http.rs | 49 ++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 5 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index ff22241cc4..c6ad1101e6 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -4,6 +4,45 @@ use percent_encoding::percent_decode_str; use url::Url; +/// Go-ish behavior: +/// - Accepts almost anything as a URL reference +/// - If it's absolute, return it as-is (normalized/encoded) +/// - If it's relative, return the encoded relative reference (no dummy base in output) +pub fn go_like_reference(input: &str) -> String { + // Dummy base just to let the parser resolve relatives + let base = Url::parse("https://example.invalid/").unwrap(); + + // Try absolute first (like "https://...", "mailto:...", etc.) + if let Ok(abs) = Url::parse(input) { + return abs.to_string(); + } + + // Otherwise parse as a relative reference against the dummy base + let resolved = base.join(input).unwrap_or_else(|_| { + // If join fails (rare, but can happen with weird inputs), fall back to putting it in the path. + let mut u = base.clone(); + u.set_path(input); + u + }); + + // Strip the dummy origin back off so you get "hello%20world", "/x%20y", "?q=a%20b", "#frag", etc. + let full = resolved.as_str(); + + // base.as_str() is "https://example.invalid/" + let base_prefix = base.as_str(); + + if let Some(rest) = full.strip_prefix(base_prefix) { + // relative path (e.g. 
"hello%20world" or "dir/hello%20world") + rest.to_string() + } else if let Some(rest) = full.strip_prefix("https://example.invalid") { + // covers cases like "/path" where the base origin remains + rest.to_string() + } else { + // shouldn't happen, but safe fallback + full.to_string() + } +} + pub fn obfuscate_url_string( url: &str, remove_query_string: bool, @@ -11,7 +50,7 @@ pub fn obfuscate_url_string( ) -> String { let mut parsed_url = match Url::parse(url) { Ok(res) => res, - Err(_) => return "?".to_string(), + Err(_) => return go_like_reference(url), }; // remove username & password @@ -158,11 +197,11 @@ mod tests { expected_output ["http://foo.com/?/nam%3Fe/?"]; ] [ - test_name [remove_path_digits_9] + test_name [empty_input] remove_query_string [false] - remove_path_digits [true] - input ["http://user:password@foo.com/1/2/3?q=james"] - expected_output ["http://foo.com/?/?/??q=james"]; + remove_path_digits [false] + input [""] + expected_output [""]; ] )] #[test] From 405bf8639d5161ad77968f0092bd52410eb6d6b7 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 15:54:29 +0100 Subject: [PATCH 02/63] fix(http): fuzzing edge cases --- libdd-trace-obfuscation/src/http.rs | 50 ++++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index c6ad1101e6..b410c52cd1 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -8,7 +8,7 @@ use url::Url; /// - Accepts almost anything as a URL reference /// - If it's absolute, return it as-is (normalized/encoded) /// - If it's relative, return the encoded relative reference (no dummy base in output) -pub fn go_like_reference(input: &str) -> String { +pub fn go_like_reference(input: &str, remove_query_string: bool) -> String { // Dummy base just to let the parser resolve relatives let base = Url::parse("https://example.invalid/").unwrap(); @@ -19,13 +19,15 @@ pub fn 
go_like_reference(input: &str) -> String { // Otherwise parse as a relative reference against the dummy base let resolved = base.join(input).unwrap_or_else(|_| { - // If join fails (rare, but can happen with weird inputs), fall back to putting it in the path. + // If join fails (rare, but can happen with weird inputs), fall back to putting it in the + // path. let mut u = base.clone(); u.set_path(input); u }); - // Strip the dummy origin back off so you get "hello%20world", "/x%20y", "?q=a%20b", "#frag", etc. + // Strip the dummy origin back off so you get "hello%20world", "/x%20y", "?q=a%20b", "#frag", + // etc. let full = resolved.as_str(); // base.as_str() is "https://example.invalid/" @@ -33,6 +35,9 @@ pub fn go_like_reference(input: &str) -> String { if let Some(rest) = full.strip_prefix(base_prefix) { // relative path (e.g. "hello%20world" or "dir/hello%20world") + if remove_query_string && !rest.is_empty() { + return "?".to_owned(); + } rest.to_string() } else if let Some(rest) = full.strip_prefix("https://example.invalid") { // covers cases like "/path" where the base origin remains @@ -50,7 +55,23 @@ pub fn obfuscate_url_string( ) -> String { let mut parsed_url = match Url::parse(url) { Ok(res) => res, - Err(_) => return go_like_reference(url), + Err(_) => { + // Fragment-only references (e.g. "#", "#frag") are valid relative URL references. + // Go's url.Parse handles them successfully: "#" → "" (empty fragment → empty string), + // "#frag" → "#frag". Handle these before the go_like_reference fallback to prevent + // the "empty result → ?" heuristic from incorrectly triggering. 
+ if let Some(fragment) = url.strip_prefix('#') { + if fragment.is_empty() { + return String::new(); + } + return format!("#{fragment}"); + } + let fixme_url_go_parsing = go_like_reference(url, remove_query_string); + if fixme_url_go_parsing.is_empty() && !url.is_empty() { + return String::from("?"); + } + return fixme_url_go_parsing; + } }; // remove username & password @@ -203,6 +224,27 @@ mod tests { input [""] expected_output [""]; ] + [ + test_name [non_printable_chars] + remove_query_string [false] + remove_path_digits [false] + input ["\u{10}"] + expected_output ["?"]; + ] + [ + test_name [non_printable_chars_and_unicode] + remove_query_string [true] + remove_path_digits [true] + input ["\u{10}ჸ"] + expected_output ["?"]; + ] + [ + test_name [hashtag] + remove_query_string [true] + remove_path_digits [true] + input ["#"] + expected_output [""]; + ] )] #[test] fn test_name() { From fcae8ab96f9153fa923bad1cd9a3ab3f747a1cc1 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 16:28:58 +0100 Subject: [PATCH 03/63] fix(http): fuzzing edge cases --- libdd-trace-obfuscation/src/http.rs | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index b410c52cd1..432db4ca5e 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -35,8 +35,10 @@ pub fn go_like_reference(input: &str, remove_query_string: bool) -> String { if let Some(rest) = full.strip_prefix(base_prefix) { // relative path (e.g. "hello%20world" or "dir/hello%20world") - if remove_query_string && !rest.is_empty() { - return "?".to_owned(); + if remove_query_string && resolved.query().is_some() { + // Strip the query string, preserving the path with a trailing "?" 
+ let path_end = rest.find('?').unwrap_or(rest.len()); + return format!("{}?", &rest[..path_end]); } rest.to_string() } else if let Some(rest) = full.strip_prefix("https://example.invalid") { @@ -66,6 +68,12 @@ pub fn obfuscate_url_string( } return format!("#{fragment}"); } + // Go's url.Parse rejects control characters (bytes < 0x20 or 0x7F) and returns an + // error, causing ObfuscateURLString to return "?". The `url` crate silently drops + // them, so we must check explicitly before calling go_like_reference. + if url.bytes().any(|b| b < 0x20 || b == 0x7F) { + return String::from("?"); + } let fixme_url_go_parsing = go_like_reference(url, remove_query_string); if fixme_url_go_parsing.is_empty() && !url.is_empty() { return String::from("?"); @@ -245,6 +253,13 @@ mod tests { input ["#"] expected_output [""]; ] + [ + test_name [fuzzing_1050521893] + remove_query_string [true] + remove_path_digits [true] + input ["ჸ"] + expected_output ["%E1%83%B8"]; + ] )] #[test] fn test_name() { From 128aaa880445206d2d6a3cdc5d9dd6257ecb0599 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 16:43:15 +0100 Subject: [PATCH 04/63] fix(http): reject invalid percent-encoding sequences like bare '%' Go's url.Parse rejects bare '%' and other invalid percent-encoding sequences, returning an error which causes obfuscateURLString to return "?". The url crate silently re-encodes them as '%25', so add an explicit pre-check matching Go's behavior. 
Fixes fuzzing testcase: http_fuzzing_594901251 --- libdd-trace-obfuscation/src/http.rs | 32 +++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 432db4ca5e..27cf32f217 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -4,6 +4,25 @@ use percent_encoding::percent_decode_str; use url::Url; +fn has_invalid_percent_encoding(s: &str) -> bool { + let bytes = s.as_bytes(); + let mut i = 0; + while i < bytes.len() { + if bytes[i] == b'%' { + if i + 2 >= bytes.len() + || !bytes[i + 1].is_ascii_hexdigit() + || !bytes[i + 2].is_ascii_hexdigit() + { + return true; + } + i += 3; + } else { + i += 1; + } + } + false +} + /// Go-ish behavior: /// - Accepts almost anything as a URL reference /// - If it's absolute, return it as-is (normalized/encoded) @@ -74,6 +93,12 @@ pub fn obfuscate_url_string( if url.bytes().any(|b| b < 0x20 || b == 0x7F) { return String::from("?"); } + // Go's url.Parse rejects invalid percent-encoding sequences (bare '%' or '%' not + // followed by exactly two hex digits). The `url` crate re-encodes them as '%25', + // so we must detect and reject them explicitly. + if has_invalid_percent_encoding(url) { + return String::from("?"); + } let fixme_url_go_parsing = go_like_reference(url, remove_query_string); if fixme_url_go_parsing.is_empty() && !url.is_empty() { return String::from("?"); @@ -260,6 +285,13 @@ mod tests { input ["ჸ"] expected_output ["%E1%83%B8"]; ] + [ + test_name [fuzzing_594901251] + remove_query_string [true] + remove_path_digits [true] + input ["%"] + expected_output ["?"]; + ] )] #[test] fn test_name() { From 54e0ee481d5cdd2fb95eb917f3ec7c3fd30a645d Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 16:53:21 +0100 Subject: [PATCH 05/63] fix(http): preserve dot path segments that url crate resolves away Go's url.Parse stores "." and ".." 
path segments literally, while the url crate's join() resolves them via RFC 3986 normalization (making them empty after stripping the base). Return the original input when go_like_reference returns empty for a non-empty input that already passed all error checks (control chars, invalid percent-encoding). Fixes fuzzing testcase: http_fuzzing_3638045804 --- libdd-trace-obfuscation/src/http.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 27cf32f217..e85684063a 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -101,7 +101,9 @@ pub fn obfuscate_url_string( } let fixme_url_go_parsing = go_like_reference(url, remove_query_string); if fixme_url_go_parsing.is_empty() && !url.is_empty() { - return String::from("?"); + // The url crate resolved away dot path segments (e.g. "." or "..") via RFC 3986 + // normalization. Go's url.Parse preserves them literally. Return the original. + return url.to_string(); } return fixme_url_go_parsing; } @@ -292,6 +294,13 @@ mod tests { input ["%"] expected_output ["?"]; ] + [ + test_name [fuzzing_3638045804] + remove_query_string [true] + remove_path_digits [true] + input ["."] + expected_output ["."]; + ] )] #[test] fn test_name() { From 3291a3268243f7e500dde8a2b425304c4b75afa8 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 16:57:01 +0100 Subject: [PATCH 06/63] fix(http): apply remove_path_digits to relative URL results Go's url.Parse succeeds for relative URLs (like "0") and applies path-digit removal to them. The Rust code was returning early from the go_like_reference path without applying digit removal. Add remove_relative_path_digits() helper and call it for relative URL results when remove_path_digits=true. 
Fixes fuzzing testcase: http_fuzzing_1928485962 --- libdd-trace-obfuscation/src/http.rs | 43 +++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index e85684063a..a0bdb171be 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -4,6 +4,31 @@ use percent_encoding::percent_decode_str; use url::Url; +/// Apply path-digit removal to a relative URL string returned by go_like_reference. +/// Operates only on the path portion (before the first '?'), matching Go's behavior of +/// splitting path by '/' and replacing segments containing digits with '?'. +fn remove_relative_path_digits(url_str: &str) -> String { + let query_start = url_str.find('?').unwrap_or(url_str.len()); + let path_part = &url_str[..query_start]; + let rest = &url_str[query_start..]; + + let mut segments: Vec<&str> = path_part.split('/').collect(); + let mut changed = false; + for segment in segments.iter_mut() { + if let Ok(decoded) = percent_decode_str(segment).decode_utf8() { + if decoded.chars().any(|c| char::is_ascii_digit(&c)) { + *segment = "?"; + changed = true; + } + } + } + if changed { + format!("{}{}", segments.join("/"), rest) + } else { + url_str.to_string() + } +} + fn has_invalid_percent_encoding(s: &str) -> bool { let bytes = s.as_bytes(); let mut i = 0; @@ -100,12 +125,17 @@ pub fn obfuscate_url_string( return String::from("?"); } let fixme_url_go_parsing = go_like_reference(url, remove_query_string); - if fixme_url_go_parsing.is_empty() && !url.is_empty() { + let result = if fixme_url_go_parsing.is_empty() && !url.is_empty() { // The url crate resolved away dot path segments (e.g. "." or "..") via RFC 3986 // normalization. Go's url.Parse preserves them literally. Return the original. 
- return url.to_string(); + url.to_string() + } else { + fixme_url_go_parsing + }; + if remove_path_digits { + return remove_relative_path_digits(&result); } - return fixme_url_go_parsing; + return result; } }; @@ -301,6 +331,13 @@ mod tests { input ["."] expected_output ["."]; ] + [ + test_name [fuzzing_1928485962] + remove_query_string [true] + remove_path_digits [true] + input ["0"] + expected_output ["?"]; + ] )] #[test] fn test_name() { From c47c680b2437230ed7003908681e7771dec2c138 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 17:15:55 +0100 Subject: [PATCH 07/63] fix(http): encode Go's extra path chars (!, ', (, ), *) for relative URLs Go's url.shouldEscape for encodePath does not allow !, ', (, ), * even though RFC 3986 considers them valid sub-delimiters in path segments. The url crate follows RFC 3986 and keeps them unencoded. Post-process go_like_reference output to encode these characters to match Go's behavior. Fixes fuzzing testcase: http_fuzzing_4273565798 --- libdd-trace-obfuscation/src/http.rs | 35 +++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index a0bdb171be..09b5aba989 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -4,6 +4,32 @@ use percent_encoding::percent_decode_str; use url::Url; +/// Encode path characters that Go's url.EscapedPath() encodes but the url crate doesn't. +/// Go's shouldEscape for encodePath does not allow !, ', (, ), * even though RFC 3986 +/// considers them valid sub-delimiters in path segments. +/// Only applied to the path portion (before the first '?'). 
+fn encode_go_path_chars(url_str: &str) -> String { + let query_start = url_str.find('?').unwrap_or(url_str.len()); + let path_part = &url_str[..query_start]; + let rest = &url_str[query_start..]; + + let mut encoded = String::with_capacity(path_part.len()); + for c in path_part.chars() { + match c { + '!' | '\'' | '(' | ')' | '*' => { + encoded.push('%'); + encoded.push_str(&format!("{:02X}", c as u8)); + } + _ => encoded.push(c), + } + } + if rest.is_empty() { + encoded + } else { + format!("{encoded}{rest}") + } +} + /// Apply path-digit removal to a relative URL string returned by go_like_reference. /// Operates only on the path portion (before the first '?'), matching Go's behavior of /// splitting path by '/' and replacing segments containing digits with '?'. @@ -132,6 +158,8 @@ pub fn obfuscate_url_string( } else { fixme_url_go_parsing }; + // Encode path chars that Go encodes but the url crate doesn't (!, ', (, ), *). + let result = encode_go_path_chars(&result); if remove_path_digits { return remove_relative_path_digits(&result); } @@ -338,6 +366,13 @@ mod tests { input ["0"] expected_output ["?"]; ] + [ + test_name [fuzzing_4273565798] + remove_query_string [true] + remove_path_digits [true] + input ["!ჸ"] + expected_output ["%21%E1%83%B8"]; + ] )] #[test] fn test_name() { From f4f40825fee2598252a34732757caadac2d5d638 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 17:22:35 +0100 Subject: [PATCH 08/63] fix(http): only encode Go path chars when input has non-ASCII bytes Go's validEncoded() has an explicit allowlist for !, ', (, ), * so these are only re-encoded when the path has non-ASCII chars (which forces Go to call escape() instead of using RawPath). For pure-ASCII inputs, Go's EscapedPath() returns the RawPath unchanged, keeping ! as-is. Only apply encode_go_path_chars() when the original input contains non-ASCII. 
Fixes fuzzing testcase: http_fuzzing_1457007156 --- libdd-trace-obfuscation/src/http.rs | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 09b5aba989..f7ce65e5d3 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -159,7 +159,14 @@ pub fn obfuscate_url_string( fixme_url_go_parsing }; // Encode path chars that Go encodes but the url crate doesn't (!, ', (, ), *). - let result = encode_go_path_chars(&result); + // Go's validEncoded allows these in RawPath (pure ASCII path → no re-encoding). + // But when the path has non-ASCII chars, Go calls escape() which also encodes them. + // Only apply when the original input contains non-ASCII bytes. + let result = if url.bytes().any(|b| b > 127) { + encode_go_path_chars(&result) + } else { + result + }; if remove_path_digits { return remove_relative_path_digits(&result); } @@ -373,6 +380,13 @@ mod tests { input ["!ჸ"] expected_output ["%21%E1%83%B8"]; ] + [ + test_name [fuzzing_1457007156] + remove_query_string [true] + remove_path_digits [true] + input ["!"] + expected_output ["!"]; + ] )] #[test] fn test_name() { From 40523dafa9b464db3b7cf98b713c7bcdc710f639 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 17:31:23 +0100 Subject: [PATCH 09/63] fix(http): properly encode non-ASCII fragment content MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Go's url.Parse percent-encodes non-ASCII chars in fragments (e.g., '#ჸ' → '#%E1%83%B8'). Our early-return fragment handler was returning the raw fragment without encoding. Delegate non-empty fragments to go_like_reference which uses the url crate's join() to correctly encode them. 
Fixes fuzzing testcase: http_fuzzing_1092426409 --- libdd-trace-obfuscation/src/http.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index f7ce65e5d3..b162057cea 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -136,7 +136,9 @@ pub fn obfuscate_url_string( if fragment.is_empty() { return String::new(); } - return format!("#{fragment}"); + // Use go_like_reference to properly encode non-ASCII chars in the fragment + // (Go's url.Parse encodes them; our simple format!("#{fragment}") doesn't). + return go_like_reference(url, remove_query_string); } // Go's url.Parse rejects control characters (bytes < 0x20 or 0x7F) and returns an // error, causing ObfuscateURLString to return "?". The `url` crate silently drops @@ -387,6 +389,13 @@ mod tests { input ["!"] expected_output ["!"]; ] + [ + test_name [fuzzing_1092426409] + remove_query_string [true] + remove_path_digits [true] + input ["#ჸ"] + expected_output ["#%E1%83%B8"]; + ] )] #[test] fn test_name() { From 10b5e13ed3a66ea09d5d388a27a041cae26d28b5 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 17:35:11 +0100 Subject: [PATCH 10/63] fix(http): reject URLs where first path segment contains ':' (Go parse error) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Go's url.Parse rejects ":" (missing protocol scheme) and "1:b" (first path segment cannot contain colon per RFC 3986 §4.2). The url crate accepts them as path characters. Add an explicit check to return "?" for these inputs. 
Fixes fuzzing testcase: http_fuzzing_3119724369 --- libdd-trace-obfuscation/src/http.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index b162057cea..c121647016 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -152,6 +152,18 @@ pub fn obfuscate_url_string( if has_invalid_percent_encoding(url) { return String::from("?"); } + // Go's url.Parse rejects URLs where the first path segment contains ':' (RFC 3986 + // §4.2): this is ambiguous with a scheme separator. E.g., ":" and "1:b" both fail + // with "missing protocol scheme" or "first path segment cannot contain colon". + // The url crate silently accepts these as path chars. + { + let segment_end = url + .find(|c| matches!(c, '/' | '?' | '#')) + .unwrap_or(url.len()); + if url[..segment_end].contains(':') { + return String::from("?"); + } + } let fixme_url_go_parsing = go_like_reference(url, remove_query_string); let result = if fixme_url_go_parsing.is_empty() && !url.is_empty() { // The url crate resolved away dot path segments (e.g. "." or "..") via RFC 3986 @@ -389,6 +401,13 @@ mod tests { input ["!"] expected_output ["!"]; ] + [ + test_name [fuzzing_3119724369] + remove_query_string [true] + remove_path_digits [true] + input [":"] + expected_output ["?"]; + ] [ test_name [fuzzing_1092426409] remove_query_string [true] From 0501cba38442f2cbe05dd9f148802ed0ceb5273e Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 18:03:16 +0100 Subject: [PATCH 11/63] fix(http): percent-encode control chars in fragment before url-joining MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Go's url.Parse percent-encodes control chars in fragments (e.g., '#\x01' → '#%01'). The url crate silently drops them from fragments, returning '#'. 
Pre-encode control bytes in the fragment manually before passing to go_like_reference via base.join(). Fixes fuzzing testcase: http_fuzzing_1323831861 --- libdd-trace-obfuscation/src/http.rs | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index c121647016..563f7257ec 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -136,9 +136,22 @@ pub fn obfuscate_url_string( if fragment.is_empty() { return String::new(); } - // Use go_like_reference to properly encode non-ASCII chars in the fragment - // (Go's url.Parse encodes them; our simple format!("#{fragment}") doesn't). - return go_like_reference(url, remove_query_string); + // Go's url.Parse percent-encodes control chars in fragments. + // The url crate silently drops them, so pre-encode them manually. + let url_for_join = if fragment.bytes().any(|b| b < 0x20 || b == 0x7F) { + let mut encoded = String::from('#'); + for b in fragment.bytes() { + if b < 0x20 || b == 0x7F { + encoded.push_str(&format!("%{b:02X}")); + } else { + encoded.push(b as char); + } + } + encoded + } else { + url.to_string() + }; + return go_like_reference(&url_for_join, remove_query_string); } // Go's url.Parse rejects control characters (bytes < 0x20 or 0x7F) and returns an // error, causing ObfuscateURLString to return "?". 
The `url` crate silently drops @@ -415,6 +428,13 @@ mod tests { input ["#ჸ"] expected_output ["#%E1%83%B8"]; ] + [ + test_name [fuzzing_1323831861] + remove_query_string [true] + remove_path_digits [true] + input ["#\u{01}"] + expected_output ["#%01"]; + ] )] #[test] fn test_name() { From 215b2b2eaee52774a23036a58665f4748da44d7d Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 18:04:56 +0100 Subject: [PATCH 12/63] fix(http): use char iteration when pre-encoding fragment control chars MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous fix iterated bytes and used 'b as char' which converts u8 to a Unicode scalar, corrupting multi-byte sequences like Georgian ჸ. Iterate over chars instead to preserve multi-byte Unicode correctly. Fixes fuzzing testcase: http_fuzzing_35626170 --- libdd-trace-obfuscation/src/http.rs | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 563f7257ec..47e67e61de 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -138,13 +138,15 @@ pub fn obfuscate_url_string( } // Go's url.Parse percent-encodes control chars in fragments. // The url crate silently drops them, so pre-encode them manually. + // Iterate over chars (not bytes) to preserve multi-byte Unicode sequences. 
let url_for_join = if fragment.bytes().any(|b| b < 0x20 || b == 0x7F) { let mut encoded = String::from('#'); - for b in fragment.bytes() { - if b < 0x20 || b == 0x7F { - encoded.push_str(&format!("%{b:02X}")); + for c in fragment.chars() { + let cp = c as u32; + if cp < 0x20 || cp == 0x7F { + encoded.push_str(&format!("%{cp:02X}")); } else { - encoded.push(b as char); + encoded.push(c); } } encoded @@ -435,6 +437,13 @@ mod tests { input ["#\u{01}"] expected_output ["#%01"]; ] + [ + test_name [fuzzing_35626170] + remove_query_string [true] + remove_path_digits [true] + input ["#\u{01}ჸ"] + expected_output ["#%01%E1%83%B8"]; + ] )] #[test] fn test_name() { From 266d9480b3b8704afe14ac5c0df004d7fa4e18f8 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 18:19:13 +0100 Subject: [PATCH 13/63] fix(http): encode backslash and other Go-always-encoded chars in path Go's shouldEscape always encodes '\', '^', '{', '}', '|', '<', '>', '`', and ' ' in paths (they're not in validEncoded's allowlist). The url crate keeps them unencoded. Separate from the '!'-etc. class which are only encoded when non-ASCII chars trigger the escape() fallback. Fixes fuzzing testcase: http_fuzzing_618280270 --- libdd-trace-obfuscation/src/http.rs | 64 +++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 8 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 47e67e61de..8aa5e14be3 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -5,9 +5,12 @@ use percent_encoding::percent_decode_str; use url::Url; /// Encode path characters that Go's url.EscapedPath() encodes but the url crate doesn't. -/// Go's shouldEscape for encodePath does not allow !, ', (, ), * even though RFC 3986 -/// considers them valid sub-delimiters in path segments. /// Only applied to the path portion (before the first '?'). +/// +/// Two categories: +/// 1. 
Always encoded: chars not in Go's validEncoded allowlist (e.g. '\', '^', '{', '}', '|') +/// 2. Encoded only when escape() fallback occurs (non-ASCII present): '!', '\'', '(', ')', '*' +/// These are in validEncoded's allowlist so RawPath is used for pure-ASCII paths. fn encode_go_path_chars(url_str: &str) -> String { let query_start = url_str.find('?').unwrap_or(url_str.len()); let path_part = &url_str[..query_start]; @@ -16,6 +19,12 @@ fn encode_go_path_chars(url_str: &str) -> String { let mut encoded = String::with_capacity(path_part.len()); for c in path_part.chars() { match c { + // Category 1: always encoded (not in validEncoded's explicit allowlist) + '\\' | '^' | '{' | '}' | '|' | '<' | '>' | '`' | ' ' => { + encoded.push('%'); + encoded.push_str(&format!("{:02X}", c as u8)); + } + // Category 2: encoded only when escape() fallback (handled by caller check) '!' | '\'' | '(' | ')' | '*' => { encoded.push('%'); encoded.push_str(&format!("{:02X}", c as u8)); @@ -187,14 +196,46 @@ pub fn obfuscate_url_string( } else { fixme_url_go_parsing }; - // Encode path chars that Go encodes but the url crate doesn't (!, ', (, ), *). - // Go's validEncoded allows these in RawPath (pure ASCII path → no re-encoding). - // But when the path has non-ASCII chars, Go calls escape() which also encodes them. - // Only apply when the original input contains non-ASCII bytes. - let result = if url.bytes().any(|b| b > 127) { + // Encode path chars that Go encodes but the url crate doesn't. + // Always apply encode_go_path_chars since it handles: + // - Category 1 (always encoded): \, ^, {, }, |, <, >, `, space + // - Category 2 (only when non-ASCII triggers escape() fallback): !, ', (, ), * + // For category 2, we still apply them here unconditionally since encode_go_path_chars + // would encode them for non-ASCII inputs; for pure-ASCII those chars were already + // handled by validEncoded allowing them in RawPath. 
But since we're post-processing + // the url crate's output (which keeps them), we must encode them only when non-ASCII. + // Simplification: apply all encodings, but for category 2 chars only when non-ASCII. + let has_non_ascii = url.bytes().any(|b| b > 127); + let result = if has_non_ascii { + // Full encoding: both category 1 and category 2 encode_go_path_chars(&result) } else { - result + // ASCII-only: only category 1 chars (\, ^, etc.) + // Category 2 (!, ', (, ), *) are left as-is for pure ASCII inputs + let query_start = result.find('?').unwrap_or(result.len()); + let path_part = &result[..query_start]; + let rest = &result[query_start..]; + let mut encoded = String::with_capacity(path_part.len()); + let mut changed = false; + for c in path_part.chars() { + match c { + '\\' | '^' | '{' | '}' | '|' | '<' | '>' | '`' | ' ' => { + encoded.push('%'); + encoded.push_str(&format!("{:02X}", c as u8)); + changed = true; + } + _ => encoded.push(c), + } + } + if changed { + if rest.is_empty() { + encoded + } else { + format!("{encoded}{rest}") + } + } else { + result + } }; if remove_path_digits { return remove_relative_path_digits(&result); @@ -444,6 +485,13 @@ mod tests { input ["#\u{01}ჸ"] expected_output ["#%01%E1%83%B8"]; ] + [ + test_name [fuzzing_618280270] + remove_query_string [true] + remove_path_digits [true] + input ["\\"] + expected_output ["%5C"]; + ] )] #[test] fn test_name() { From ea04644cbe9f0ed98a304b1dafbd72a88a580890 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 18:20:19 +0100 Subject: [PATCH 14/63] fix(http): encode '[' and ']' when non-ASCII triggers escape() fallback Like '!', '\'', '(', ')', '*', the '[' and ']' characters are in Go's validEncoded allowlist but get encoded when escape() is called due to non-ASCII chars in the path. 
Fixes fuzzing testcase: http_fuzzing_1505427946 --- libdd-trace-obfuscation/src/http.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 8aa5e14be3..7ca3d06b95 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -25,7 +25,8 @@ fn encode_go_path_chars(url_str: &str) -> String { encoded.push_str(&format!("{:02X}", c as u8)); } // Category 2: encoded only when escape() fallback (handled by caller check) - '!' | '\'' | '(' | ')' | '*' => { + // These are in Go's validEncoded allowlist but get encoded when escape() is called + '!' | '\'' | '(' | ')' | '*' | '[' | ']' => { encoded.push('%'); encoded.push_str(&format!("{:02X}", c as u8)); } @@ -492,6 +493,13 @@ mod tests { input ["\\"] expected_output ["%5C"]; ] + [ + test_name [fuzzing_1505427946] + remove_query_string [true] + remove_path_digits [true] + input ["[ჸ"] + expected_output ["%5B%E1%83%B8"]; + ] )] #[test] fn test_name() { From 2c94802bd71ce14ead3ff9b765db71752a4d9fea Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 18:21:45 +0100 Subject: [PATCH 15/63] fix(http): pre-encode backslash before go_like_reference to prevent path split The url crate treats '\' as a path separator, consuming it silently. Go treats '\' as a path character and encodes it as '%5C'. Pre-encode '\' as '%5C' before calling go_like_reference so base.join() preserves it rather than using it as a path segment separator. 
Fixes fuzzing testcase: http_fuzzing_backslash_unicode --- libdd-trace-obfuscation/src/http.rs | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 7ca3d06b95..21aa80ae78 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -189,7 +189,17 @@ pub fn obfuscate_url_string( return String::from("?"); } } - let fixme_url_go_parsing = go_like_reference(url, remove_query_string); + // The url crate treats '\' as a path separator, silently consuming it. + // Go encodes '\' as '%5C'. Pre-encode backslashes before go_like_reference + // so they are preserved through base.join() and appear as '%5C' in the output. + let url_pre_encoded; + let url_for_go_like = if url.contains('\\') { + url_pre_encoded = url.replace('\\', "%5C"); + url_pre_encoded.as_str() + } else { + url + }; + let fixme_url_go_parsing = go_like_reference(url_for_go_like, remove_query_string); let result = if fixme_url_go_parsing.is_empty() && !url.is_empty() { // The url crate resolved away dot path segments (e.g. "." or "..") via RFC 3986 // normalization. Go's url.Parse preserves them literally. Return the original. @@ -500,6 +510,13 @@ mod tests { input ["[ჸ"] expected_output ["%5B%E1%83%B8"]; ] + [ + test_name [fuzzing_backslash_unicode] + remove_query_string [true] + remove_path_digits [true] + input ["\\ჸ"] + expected_output ["%5C%E1%83%B8"]; + ] )] #[test] fn test_name() { From 73fc087251cd0c6105d4f52b27f15e32fdeac8f6 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 18:23:25 +0100 Subject: [PATCH 16/63] fix(http): strip trailing empty fragment '#' to match Go's url.URL.String() Go's url.URL.String() omits a bare '#' with no fragment content. The url crate keeps it. Strip trailing '#' from go_like_reference results. 
Fixes fuzzing testcase: http_fuzzing_2438023093 --- libdd-trace-obfuscation/src/http.rs | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 21aa80ae78..1a05cae2a4 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -199,7 +199,15 @@ pub fn obfuscate_url_string( } else { url }; - let fixme_url_go_parsing = go_like_reference(url_for_go_like, remove_query_string); + let fixme_url_go_parsing_raw = + go_like_reference(url_for_go_like, remove_query_string); + // Go's url.URL.String() omits a trailing empty fragment (bare '#'). + // The url crate keeps it. Strip it here for parity. + let fixme_url_go_parsing = if fixme_url_go_parsing_raw.ends_with('#') { + fixme_url_go_parsing_raw[..fixme_url_go_parsing_raw.len() - 1].to_string() + } else { + fixme_url_go_parsing_raw + }; let result = if fixme_url_go_parsing.is_empty() && !url.is_empty() { // The url crate resolved away dot path segments (e.g. "." or "..") via RFC 3986 // normalization. Go's url.Parse preserves them literally. Return the original. @@ -517,6 +525,13 @@ mod tests { input ["\\ჸ"] expected_output ["%5C%E1%83%B8"]; ] + [ + test_name [fuzzing_2438023093] + remove_query_string [true] + remove_path_digits [true] + input ["ჸ#"] + expected_output ["%E1%83%B8"]; + ] )] #[test] fn test_name() { From a0b9cc219dffb68b0a7019c7181949f2542d20e6 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 18:24:45 +0100 Subject: [PATCH 17/63] fix(http): only check path (not fragment) for non-ASCII when encoding ! etc. When determining whether to encode !, ', (, ), *, [, ] (Cat2 chars), only check the path portion (before '#') for non-ASCII bytes. A non-ASCII character in the fragment does not trigger Go's escape() fallback for the path, so the path chars should stay unencoded. 
Fixes fuzzing testcase: http_fuzzing_2729083127 --- libdd-trace-obfuscation/src/http.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 1a05cae2a4..f85b8f1e79 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -224,7 +224,10 @@ pub fn obfuscate_url_string( // handled by validEncoded allowing them in RawPath. But since we're post-processing // the url crate's output (which keeps them), we must encode them only when non-ASCII. // Simplification: apply all encodings, but for category 2 chars only when non-ASCII. - let has_non_ascii = url.bytes().any(|b| b > 127); + // Only check path portion (before '#') for non-ASCII; a non-ASCII fragment + // does not trigger Go's escape() fallback for the path encoding. + let path_end_for_ascii_check = url.find('#').unwrap_or(url.len()); + let has_non_ascii = url[..path_end_for_ascii_check].bytes().any(|b| b > 127); let result = if has_non_ascii { // Full encoding: both category 1 and category 2 encode_go_path_chars(&result) @@ -532,6 +535,13 @@ mod tests { input ["ჸ#"] expected_output ["%E1%83%B8"]; ] + [ + test_name [fuzzing_2729083127] + remove_query_string [true] + remove_path_digits [true] + input ["!#ჸ"] + expected_output ["!#%E1%83%B8"]; + ] )] #[test] fn test_name() { From 9f917b984796facf9eb42460c6d3cb43b5f462d0 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 18:28:13 +0100 Subject: [PATCH 18/63] fix(http): preserve leading '/' for absolute-path inputs in go_like_reference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For input '/ჸ', base.join('/ჸ') resolves to 'https://example.invalid/%E1%83%B8'. Stripping base_prefix='https://example.invalid/' (with trailing slash) drops the leading '/'. For inputs starting with '/', use the no-trailing-slash strip to preserve the leading '/' in the output. 
Fixes fuzzing testcase: http_fuzzing_slash_unicode --- libdd-trace-obfuscation/src/http.rs | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index f85b8f1e79..ca2688c048 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -113,6 +113,20 @@ pub fn go_like_reference(input: &str, remove_query_string: bool) -> String { // base.as_str() is "https://example.invalid/" let base_prefix = base.as_str(); + // For absolute-path inputs (starting with '/'), use the no-trailing-slash strip + // to preserve the leading '/' in the result. Otherwise base.join("/ჸ") resolves to + // "https://example.invalid/%E1%83%B8" and stripping the base WITH trailing slash + // drops the leading '/'. + if input.starts_with('/') { + if let Some(rest) = full.strip_prefix("https://example.invalid") { + if remove_query_string && resolved.query().is_some() { + let path_end = rest.find('?').unwrap_or(rest.len()); + return format!("{}?", &rest[..path_end]); + } + return rest.to_string(); + } + } + if let Some(rest) = full.strip_prefix(base_prefix) { // relative path (e.g. "hello%20world" or "dir/hello%20world") if remove_query_string && resolved.query().is_some() { @@ -542,6 +556,13 @@ mod tests { input ["!#ჸ"] expected_output ["!#%E1%83%B8"]; ] + [ + test_name [fuzzing_slash_unicode] + remove_query_string [true] + remove_path_digits [true] + input ["/ჸ"] + expected_output ["/%E1%83%B8"]; + ] )] #[test] fn test_name() { From b3e92d728868c33b86c4f3cc68a42b8516ebb596 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 18:31:17 +0100 Subject: [PATCH 19/63] fix(http): encode '#' in fragment content (Go encodes it as %23) Go's shouldEscape('#', encodeFragment) returns true, so '#' within a fragment is encoded as '%23'. The url crate keeps it raw. For input '##', Go returns '#%23' (second '#' encoded). Pre-encode '#' in fragment content. 
Fixes fuzzing testcase: http_fuzzing_3710129001 --- libdd-trace-obfuscation/src/http.rs | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index ca2688c048..eb1ad1e217 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -160,14 +160,15 @@ pub fn obfuscate_url_string( if fragment.is_empty() { return String::new(); } - // Go's url.Parse percent-encodes control chars in fragments. - // The url crate silently drops them, so pre-encode them manually. + // Go's url.Parse percent-encodes control chars and '#' in fragments. + // ('#' in a fragment is encoded as %23 since shouldEscape('#', encodeFragment)=true) + // The url crate keeps them raw, so pre-encode them manually. // Iterate over chars (not bytes) to preserve multi-byte Unicode sequences. - let url_for_join = if fragment.bytes().any(|b| b < 0x20 || b == 0x7F) { + let url_for_join = if fragment.bytes().any(|b| b < 0x20 || b == 0x7F || b == b'#') { let mut encoded = String::from('#'); for c in fragment.chars() { let cp = c as u32; - if cp < 0x20 || cp == 0x7F { + if cp < 0x20 || cp == 0x7F || c == '#' { encoded.push_str(&format!("%{cp:02X}")); } else { encoded.push(c); @@ -563,6 +564,13 @@ mod tests { input ["/ჸ"] expected_output ["/%E1%83%B8"]; ] + [ + test_name [fuzzing_3710129001] + remove_query_string [true] + remove_path_digits [true] + input ["##"] + expected_output ["#%23"]; + ] )] #[test] fn test_name() { From 1d7b42ea453759c19f2543d60d655e79b835d0cd Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 18:34:13 +0100 Subject: [PATCH 20/63] fix(http): allow control chars in fragment, reject only in path Go's url.Parse rejects control chars in the PATH (returning '?') but percent-encodes them in the FRAGMENT. Only check path portion (before '#') for control char rejection. Pre-encode control chars in the fragment before calling go_like_reference. 
Fixes fuzzing testcase: http_fuzzing_1009954227 --- libdd-trace-obfuscation/src/http.rs | 60 ++++++++++++++++++++++++++--- 1 file changed, 55 insertions(+), 5 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index eb1ad1e217..43dbd31218 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -180,11 +180,54 @@ pub fn obfuscate_url_string( }; return go_like_reference(&url_for_join, remove_query_string); } - // Go's url.Parse rejects control characters (bytes < 0x20 or 0x7F) and returns an - // error, causing ObfuscateURLString to return "?". The `url` crate silently drops - // them, so we must check explicitly before calling go_like_reference. - if url.bytes().any(|b| b < 0x20 || b == 0x7F) { - return String::from("?"); + // Go's url.Parse rejects control characters (bytes < 0x20 or 0x7F) in the PATH and + // returns "?". Control chars in the FRAGMENT are percent-encoded, not rejected. + // Only reject if there are control chars in the path portion (before '#'). + { + let path_end = url.find('#').unwrap_or(url.len()); + if url[..path_end].bytes().any(|b| b < 0x20 || b == 0x7F) { + return String::from("?"); + } + // Pre-encode control chars in the fragment (if any) before go_like_reference. 
+ if path_end < url.len() + && url[path_end + 1..].bytes().any(|b| b < 0x20 || b == 0x7F || b == b'#') + { + let mut pre_encoded = url[..path_end].to_string(); + pre_encoded.push('#'); + for c in url[path_end + 1..].chars() { + let cp = c as u32; + if cp < 0x20 || cp == 0x7F || c == '#' { + pre_encoded.push_str(&format!("%{cp:02X}")); + } else { + pre_encoded.push(c); + } + } + // Use the pre-encoded URL for the rest of the processing + let url = pre_encoded.as_str(); + // Continue to go_like_reference below using the pre-encoded url + // (fall through with modified url) + let url_pre_encoded_for_backslash; + let url_for_go_like = if url.contains('\\') { + url_pre_encoded_for_backslash = url.replace('\\', "%5C"); + url_pre_encoded_for_backslash.as_str() + } else { + url + }; + let raw = go_like_reference(url_for_go_like, remove_query_string); + let raw = if raw.ends_with('#') { raw[..raw.len()-1].to_string() } else { raw }; + let result = if raw.is_empty() && !url.is_empty() { url.to_string() } else { raw }; + let path_end_for_ascii = url.find('#').unwrap_or(url.len()); + let has_non_ascii = url[..path_end_for_ascii].bytes().any(|b| b > 127); + let result = if has_non_ascii { encode_go_path_chars(&result) } else { + let qs = result.find('?').unwrap_or(result.len()); + let pp = &result[..qs]; let rr = &result[qs..]; + let mut enc = String::with_capacity(pp.len()); let mut changed = false; + for c in pp.chars() { match c { '\\' | '^' | '{' | '}' | '|' | '<' | '>' | '`' | ' ' => { enc.push('%'); enc.push_str(&format!("{:02X}", c as u8)); changed = true; } _ => enc.push(c), } } + if changed { if rr.is_empty() { enc } else { format!("{enc}{rr}") } } else { result } + }; + if remove_path_digits { return remove_relative_path_digits(&result); } + return result; + } } // Go's url.Parse rejects invalid percent-encoding sequences (bare '%' or '%' not // followed by exactly two hex digits). 
The `url` crate re-encodes them as '%25', @@ -571,6 +614,13 @@ mod tests { input ["##"] expected_output ["#%23"]; ] + [ + test_name [fuzzing_1009954227] + remove_query_string [true] + remove_path_digits [true] + input ["ჸ#\u{10}"] + expected_output ["%E1%83%B8#%10"]; + ] )] #[test] fn test_name() { From 061a5e8ee79ce640e3a24af6b6a7985c8ba4dd13 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 18:35:36 +0100 Subject: [PATCH 21/63] fix(http): don't encode ! etc. in fragment portion encode_go_path_chars was operating on the whole URL string including fragment. Go's encodeFragment mode allows '!', '(', ')', '*' (shouldEscape returns false). Stop path-char encoding at '#' so the fragment portion is preserved unchanged. Fixes fuzzing testcase: http_fuzzing_hash_exclamation --- libdd-trace-obfuscation/src/http.rs | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 43dbd31218..e0cfff2de4 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -12,9 +12,13 @@ use url::Url; /// 2. Encoded only when escape() fallback occurs (non-ASCII present): '!', '\'', '(', ')', '*' /// These are in validEncoded's allowlist so RawPath is used for pure-ASCII paths. fn encode_go_path_chars(url_str: &str) -> String { - let query_start = url_str.find('?').unwrap_or(url_str.len()); - let path_part = &url_str[..query_start]; - let rest = &url_str[query_start..]; + // Only encode up to the first '?' or '#' — the fragment has different encoding rules + // (e.g., '!' is allowed in fragments per Go's shouldEscape for encodeFragment). + let path_end = url_str + .find(|c| c == '?' 
|| c == '#') + .unwrap_or(url_str.len()); + let path_part = &url_str[..path_end]; + let rest = &url_str[path_end..]; let mut encoded = String::with_capacity(path_part.len()); for c in path_part.chars() { @@ -292,9 +296,10 @@ pub fn obfuscate_url_string( } else { // ASCII-only: only category 1 chars (\, ^, etc.) // Category 2 (!, ', (, ), *) are left as-is for pure ASCII inputs - let query_start = result.find('?').unwrap_or(result.len()); - let path_part = &result[..query_start]; - let rest = &result[query_start..]; + // Also stop at '#' since fragment has different encoding rules + let path_end = result.find(|c| c == '?' || c == '#').unwrap_or(result.len()); + let path_part = &result[..path_end]; + let rest = &result[path_end..]; let mut encoded = String::with_capacity(path_part.len()); let mut changed = false; for c in path_part.chars() { @@ -621,6 +626,13 @@ mod tests { input ["ჸ#\u{10}"] expected_output ["%E1%83%B8#%10"]; ] + [ + test_name [fuzzing_hash_exclamation] + remove_query_string [true] + remove_path_digits [true] + input ["ჸ#!"] + expected_output ["%E1%83%B8#!"]; + ] )] #[test] fn test_name() { From 0620bdef52a5bd9822cdeb76e6af81e706447387 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 18:36:48 +0100 Subject: [PATCH 22/63] fix(http): reject invalid percent-encoding in fragment (Go returns '?') Go's url.Parse rejects invalid percent-encoding sequences even in the fragment portion. Add the same check to the fragment handler. Fixes fuzzing testcase: http_fuzzing_578834728 --- libdd-trace-obfuscation/src/http.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index e0cfff2de4..c017cec943 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -164,6 +164,10 @@ pub fn obfuscate_url_string( if fragment.is_empty() { return String::new(); } + // Go also rejects invalid percent-encoding in fragments. 
+ if has_invalid_percent_encoding(fragment) { + return String::from("?"); + } // Go's url.Parse percent-encodes control chars and '#' in fragments. // ('#' in a fragment is encoded as %23 since shouldEscape('#', encodeFragment)=true) // The url crate keeps them raw, so pre-encode them manually. @@ -633,6 +637,13 @@ mod tests { input ["ჸ#!"] expected_output ["%E1%83%B8#!"]; ] + [ + test_name [fuzzing_578834728] + remove_query_string [true] + remove_path_digits [true] + input ["#%"] + expected_output ["?"]; + ] )] #[test] fn test_name() { From 179cc2c574f1ed01b2ab6d24e7b45081ed38f869 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 18:39:51 +0100 Subject: [PATCH 23/63] fix(http): encode ', !, (, ), *, [, ] in fragment when non-ASCII present Go's escape() for encodeFragment encodes these chars (they're in validEncoded's allowlist but not in shouldEscape's 'return false' cases). When non-ASCII chars trigger the escape() fallback, these also get encoded. Pre-encode them in the fragment when non-ASCII is detected. Fixes fuzzing testcase: http_fuzzing_3991369296 --- libdd-trace-obfuscation/src/http.rs | 53 ++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index c017cec943..0e9054ce82 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -168,24 +168,36 @@ pub fn obfuscate_url_string( if has_invalid_percent_encoding(fragment) { return String::from("?"); } - // Go's url.Parse percent-encodes control chars and '#' in fragments. - // ('#' in a fragment is encoded as %23 since shouldEscape('#', encodeFragment)=true) - // The url crate keeps them raw, so pre-encode them manually. - // Iterate over chars (not bytes) to preserve multi-byte Unicode sequences. 
- let url_for_join = if fragment.bytes().any(|b| b < 0x20 || b == 0x7F || b == b'#') { - let mut encoded = String::from('#'); - for c in fragment.chars() { - let cp = c as u32; - if cp < 0x20 || cp == 0x7F || c == '#' { - encoded.push_str(&format!("%{cp:02X}")); - } else { - encoded.push(c); + // Go's url.Parse percent-encodes certain chars in fragments: + // - Always: control chars, '#' + // - When non-ASCII present (escape() fallback): '!', '\'', '(', ')', '*', '[', ']' + // (These are in validEncoded's allowlist so kept for pure-ASCII fragments, + // but escape() encodes them too.) + let frag_has_non_ascii = fragment.bytes().any(|b| b > 127); + let url_for_join = + if fragment.bytes().any(|b| b < 0x20 || b == 0x7F || b == b'#') + || (frag_has_non_ascii + && fragment + .chars() + .any(|c| matches!(c, '!' | '\'' | '(' | ')' | '*' | '[' | ']'))) + { + let mut encoded = String::from('#'); + for c in fragment.chars() { + let cp = c as u32; + if cp < 0x20 || cp == 0x7F || c == '#' { + encoded.push_str(&format!("%{cp:02X}")); + } else if frag_has_non_ascii + && matches!(c, '!' 
| '\'' | '(' | ')' | '*' | '[' | ']') + { + encoded.push_str(&format!("%{:02X}", c as u8)); + } else { + encoded.push(c); + } } - } - encoded - } else { - url.to_string() - }; + encoded + } else { + url.to_string() + }; return go_like_reference(&url_for_join, remove_query_string); } // Go's url.Parse rejects control characters (bytes < 0x20 or 0x7F) in the PATH and @@ -644,6 +656,13 @@ mod tests { input ["#%"] expected_output ["?"]; ] + [ + test_name [fuzzing_3991369296] + remove_query_string [true] + remove_path_digits [true] + input ["#'ჸ"] + expected_output ["#%27%E1%83%B8"]; + ] )] #[test] fn test_name() { From 39f0c19a0379bc5376fb872908bc1a250a2fdb5a Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 18:41:27 +0100 Subject: [PATCH 24/63] fix(http): encode cat2 chars in fragment when original URL has non-ASCII fragment When the URL has non-ASCII chars in the fragment, Go's escape() also encodes cat2 chars (!, ', (, ), *, [, ]) in the fragment. Apply the same encoding to the result's fragment portion when the original URL's fragment had non-ASCII. Fixes fuzzing testcase: http_fuzzing_path_frag_quote --- libdd-trace-obfuscation/src/http.rs | 39 +++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 0e9054ce82..0a0915551c 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -307,8 +307,36 @@ pub fn obfuscate_url_string( let path_end_for_ascii_check = url.find('#').unwrap_or(url.len()); let has_non_ascii = url[..path_end_for_ascii_check].bytes().any(|b| b > 127); let result = if has_non_ascii { - // Full encoding: both category 1 and category 2 - encode_go_path_chars(&result) + // Full encoding: both category 1 and category 2 in path + fragment. + // When non-ASCII is present, Go's escape() also encodes cat2 chars in fragments. 
+ let encoded = encode_go_path_chars(&result); + // Check if original URL's fragment also has non-ASCII + let url_frag_start = url.find('#').map(|i| i + 1).unwrap_or(url.len()); + let frag_has_non_ascii = url[url_frag_start..].bytes().any(|b| b > 127); + if frag_has_non_ascii { + // Also encode cat2 chars in the result's fragment + if let Some(frag_start) = encoded.find('#') { + let path_and_hash = &encoded[..=frag_start]; + let frag = &encoded[frag_start + 1..]; + if frag.chars().any(|c| matches!(c, '!' | '\'' | '(' | ')' | '*' | '[' | ']')) { + let mut out = path_and_hash.to_string(); + for c in frag.chars() { + if matches!(c, '!' | '\'' | '(' | ')' | '*' | '[' | ']') { + out.push_str(&format!("%{:02X}", c as u8)); + } else { + out.push(c); + } + } + out + } else { + encoded + } + } else { + encoded + } + } else { + encoded + } } else { // ASCII-only: only category 1 chars (\, ^, etc.) // Category 2 (!, ', (, ), *) are left as-is for pure ASCII inputs @@ -663,6 +691,13 @@ mod tests { input ["#'ჸ"] expected_output ["#%27%E1%83%B8"]; ] + [ + test_name [fuzzing_path_frag_quote] + remove_query_string [true] + remove_path_digits [true] + input ["ჸ#'ჸ"] + expected_output ["%E1%83%B8#%27%E1%83%B8"]; + ] )] #[test] fn test_name() { From b2005ca90b073350eed2e5b716b13cc6cd1a5381 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 18:45:44 +0100 Subject: [PATCH 25/63] fix(http): fragment encoding - only encode ' [ ] not ! ( ) * when non-ASCII present Go's shouldEscape for encodeFragment returns false for ! ( ) * explicitly, so those are NOT encoded even when escape() is triggered by non-ASCII. Only ' [ ] (in validEncoded allowlist but not shouldEscape's return-false) get encoded when non-ASCII chars trigger the escape() fallback.
--- libdd-trace-obfuscation/src/http.rs | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 0a0915551c..7abcfd3b53 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -179,7 +179,7 @@ pub fn obfuscate_url_string( || (frag_has_non_ascii && fragment .chars() - .any(|c| matches!(c, '!' | '\'' | '(' | ')' | '*' | '[' | ']'))) + .any(|c| matches!(c, '\'' | '[' | ']'))) { let mut encoded = String::from('#'); for c in fragment.chars() { @@ -187,7 +187,7 @@ pub fn obfuscate_url_string( if cp < 0x20 || cp == 0x7F || c == '#' { encoded.push_str(&format!("%{cp:02X}")); } else if frag_has_non_ascii - && matches!(c, '!' | '\'' | '(' | ')' | '*' | '[' | ']') + && matches!(c, '\'' | '[' | ']') { encoded.push_str(&format!("%{:02X}", c as u8)); } else { @@ -318,10 +318,12 @@ pub fn obfuscate_url_string( if let Some(frag_start) = encoded.find('#') { let path_and_hash = &encoded[..=frag_start]; let frag = &encoded[frag_start + 1..]; - if frag.chars().any(|c| matches!(c, '!' | '\'' | '(' | ')' | '*' | '[' | ']')) { + // In fragments, Go encodes ' [ ] when non-ASCII triggers escape(), + // but NOT ! ( ) * (shouldEscape returns false for those in encodeFragment) + if frag.chars().any(|c| matches!(c, '\'' | '[' | ']')) { let mut out = path_and_hash.to_string(); for c in frag.chars() { - if matches!(c, '!' 
| '\'' | '(' | ')' | '*' | '[' | ']') { + if matches!(c, '\'' | '[' | ']') { out.push_str(&format!("%{:02X}", c as u8)); } else { out.push(c); @@ -698,6 +700,13 @@ mod tests { input ["ჸ#'ჸ"] expected_output ["%E1%83%B8#%27%E1%83%B8"]; ] + [ + test_name [fuzzing_hash_excl_unicode] + remove_query_string [true] + remove_path_digits [true] + input ["#!ჸ"] + expected_output ["#!%E1%83%B8"]; + ] )] #[test] fn test_name() { From 256c77c81b7610f78e54c9f5a7a1bde0599c9525 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 18:47:48 +0100 Subject: [PATCH 26/63] fix(http): remove_relative_path_digits must stop at '#' - fragment digits not path digits --- libdd-trace-obfuscation/src/http.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 7abcfd3b53..79f8191e42 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -48,9 +48,12 @@ fn encode_go_path_chars(url_str: &str) -> String { /// Operates only on the path portion (before the first '?'), matching Go's behavior of /// splitting path by '/' and replacing segments containing digits with '?'. fn remove_relative_path_digits(url_str: &str) -> String { - let query_start = url_str.find('?').unwrap_or(url_str.len()); - let path_part = &url_str[..query_start]; - let rest = &url_str[query_start..]; + // Only apply digit removal to the path (before '?' or '#'); fragments are not paths. + let path_end = url_str + .find(|c: char| c == '?' 
|| c == '#') + .unwrap_or(url_str.len()); + let path_part = &url_str[..path_end]; + let rest = &url_str[path_end..]; let mut segments: Vec<&str> = path_part.split('/').collect(); let mut changed = false; From 736bdeee71389b13166725d44d3986f70a615dcd Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 18:49:27 +0100 Subject: [PATCH 27/63] fix(http): strip trailing '#' from dot-segment fallback (empty fragment) --- libdd-trace-obfuscation/src/http.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 79f8191e42..01e3cb9965 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -291,8 +291,10 @@ pub fn obfuscate_url_string( }; let result = if fixme_url_go_parsing.is_empty() && !url.is_empty() { // The url crate resolved away dot path segments (e.g. "." or "..") via RFC 3986 - // normalization. Go's url.Parse preserves them literally. Return the original. - url.to_string() + // normalization. Go's url.Parse preserves them literally. Return the original, + // but strip a trailing empty fragment '#' (Go omits empty fragments). + let fallback = if url.ends_with('#') { &url[..url.len()-1] } else { url }; + fallback.to_string() } else { fixme_url_go_parsing }; From af46866246bb78f7c64ee72126e2026d066b3f50 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 18:51:02 +0100 Subject: [PATCH 28/63] fix(http): prepend original path when go_like_reference resolves path away to fragment When url like '.#frag' has dot path resolved away by base.join(), the result starts with '#' and loses the path. Detect this and prepend the original path. 
--- libdd-trace-obfuscation/src/http.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 01e3cb9965..3b45e7e6db 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -295,6 +295,16 @@ pub fn obfuscate_url_string( // but strip a trailing empty fragment '#' (Go omits empty fragments). let fallback = if url.ends_with('#') { &url[..url.len()-1] } else { url }; fallback.to_string() + } else if fixme_url_go_parsing.starts_with('#') { + // go_like_reference resolved away the path (e.g. ".#frag" → "#frag"). + // Go preserves the original path. Prepend it. + let path_end = url.find('#').unwrap_or(url.len()); + let orig_path = &url[..path_end]; + if !orig_path.is_empty() { + format!("{}{}", orig_path, fixme_url_go_parsing) + } else { + fixme_url_go_parsing + } } else { fixme_url_go_parsing }; From 024425eea7ce6cf74d4fcc4ae40ed872ad874a3b Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 18:52:38 +0100 Subject: [PATCH 29/63] fix(http): preserve dot-segment prefixes (., .., ./, ../) in go_like_reference output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Go's url.Parse stores dot segments literally (e.g., '../ჸ' → path '../ჸ'), while the url crate's base.join() resolves them. Re-prepend the lost dot prefix to match Go's output. --- libdd-trace-obfuscation/src/http.rs | 40 ++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 3b45e7e6db..78bffa3210 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -295,18 +295,40 @@ pub fn obfuscate_url_string( // but strip a trailing empty fragment '#' (Go omits empty fragments). 
let fallback = if url.ends_with('#') { &url[..url.len()-1] } else { url }; fallback.to_string() - } else if fixme_url_go_parsing.starts_with('#') { - // go_like_reference resolved away the path (e.g. ".#frag" → "#frag"). - // Go preserves the original path. Prepend it. - let path_end = url.find('#').unwrap_or(url.len()); - let orig_path = &url[..path_end]; - if !orig_path.is_empty() { - format!("{}{}", orig_path, fixme_url_go_parsing) + } else { + // If the original URL had a dot-segment prefix (., .., ./, ../) that + // base.join() resolved away, Go preserves it literally. Re-prepend it. + let frag_or_end = url.find(|c| c == '#' || c == '?').unwrap_or(url.len()); + let orig_path = &url[..frag_or_end]; + let dot_prefix_len = { + let mut i = 0; + loop { + if orig_path[i..].starts_with("../") { i += 3; } + else if orig_path[i..].starts_with("./") { i += 2; } + else if &orig_path[i..] == ".." || &orig_path[i..] == "." { + i += orig_path[i..].len(); break; + } else { break; } + } + i + }; + if dot_prefix_len > 0 { + let dot_prefix = &url[..dot_prefix_len]; + // Prepend the lost dot prefix + if !fixme_url_go_parsing.starts_with(dot_prefix) { + format!("{}{}", dot_prefix, fixme_url_go_parsing) + } else { + fixme_url_go_parsing + } + } else if fixme_url_go_parsing.starts_with('#') { + // Non-dot path resolved to fragment only - prepend original path + if !orig_path.is_empty() { + format!("{}{}", orig_path, fixme_url_go_parsing) + } else { + fixme_url_go_parsing + } } else { fixme_url_go_parsing } - } else { - fixme_url_go_parsing }; // Encode path chars that Go encodes but the url crate doesn't. 
// Always apply encode_go_path_chars since it handles: From b42079807d665b0fbea741c50c7f350ad527980f Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 19:10:27 +0100 Subject: [PATCH 30/63] fix(http): both-options-false --- libdd-trace-obfuscation/src/http.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 78bffa3210..9eab74f359 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -204,11 +204,15 @@ pub fn obfuscate_url_string( return go_like_reference(&url_for_join, remove_query_string); } // Go's url.Parse rejects control characters (bytes < 0x20 or 0x7F) in the PATH and - // returns "?". Control chars in the FRAGMENT are percent-encoded, not rejected. - // Only reject if there are control chars in the path portion (before '#'). + // returns "?". BUT when both options are false, Go's obfuscateUserInfo returns + // the original URL on parse failure (no "?"). + // Control chars in the FRAGMENT are percent-encoded, not rejected. { let path_end = url.find('#').unwrap_or(url.len()); if url[..path_end].bytes().any(|b| b < 0x20 || b == 0x7F) { + if !remove_query_string && !remove_path_digits { + return url.to_string(); + } return String::from("?"); } // Pre-encode control chars in the fragment (if any) before go_like_reference. 
@@ -567,7 +571,8 @@ mod tests { remove_query_string [false] remove_path_digits [false] input ["\u{10}"] - expected_output ["?"]; + // When both options false, Go returns original (obfuscateUserInfo passthrough) + expected_output ["\u{10}"]; ] [ test_name [non_printable_chars_and_unicode] From 8872b31268d9402ac374dcb63225cd673a6b6d8b Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 19:25:26 +0100 Subject: [PATCH 31/63] fix(http): pre-parse control char check in HTTP --- libdd-trace-obfuscation/src/http.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 9eab74f359..b037c74926 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -156,6 +156,14 @@ pub fn obfuscate_url_string( remove_query_string: bool, remove_path_digits: bool, ) -> String { + // Go rejects control chars in the path (returns '?'). Check before Url::parse since + // the url crate may silently drop control chars and succeed where Go would fail. 
+ if remove_query_string || remove_path_digits { + let path_end = url.find('#').unwrap_or(url.len()); + if url[..path_end].bytes().any(|b| b < 0x20 || b == 0x7F) { + return String::from("?"); + } + } let mut parsed_url = match Url::parse(url) { Ok(res) => res, Err(_) => { From 3a4ef0b57002996f56e911b79a2fe6e26260fcd4 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 19:27:23 +0100 Subject: [PATCH 32/63] fix(http): handle opaque URIs (lowercase scheme, keep data raw) --- libdd-trace-obfuscation/src/http.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index b037c74926..18cd6cab77 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -165,7 +165,16 @@ pub fn obfuscate_url_string( } } let mut parsed_url = match Url::parse(url) { - Ok(res) => res, + Ok(res) => { + // For cannot-be-a-base (opaque) URIs like "A:ᏤᏤ", Go keeps the opaque + // path verbatim. Return the original with the scheme lowercased. + if res.cannot_be_a_base() { + let scheme_len = url.find(':').unwrap_or(0); + let lowered = url[..scheme_len].to_lowercase() + &url[scheme_len..]; + return lowered; + } + res + } Err(_) => { // Fragment-only references (e.g. "#", "#frag") are valid relative URL references. 
// Go's url.Parse handles them successfully: "#" → "" (empty fragment → empty string), From 2a4ed496000171080c3c44b549a5534466fee281 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 19:28:44 +0100 Subject: [PATCH 33/63] fix(http): return original opaque URI when it contains control chars --- libdd-trace-obfuscation/src/http.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 18cd6cab77..72aa677cdd 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -167,11 +167,16 @@ pub fn obfuscate_url_string( let mut parsed_url = match Url::parse(url) { Ok(res) => { // For cannot-be-a-base (opaque) URIs like "A:ᏤᏤ", Go keeps the opaque - // path verbatim. Return the original with the scheme lowercased. + // path verbatim. Return with lowercased scheme. + // Exception: if the opaque part has control chars, Go's url.Parse fails + // and obfuscateUserInfo returns the original URL unchanged. 
if res.cannot_be_a_base() { let scheme_len = url.find(':').unwrap_or(0); - let lowered = url[..scheme_len].to_lowercase() + &url[scheme_len..]; - return lowered; + let opaque_part = &url[scheme_len..]; + if opaque_part.bytes().any(|b| b < 0x20 || b == 0x7F) { + return url.to_string(); // Go returns original on parse error + } + return url[..scheme_len].to_lowercase() + opaque_part; } res } From 18dfa0d777eb74e2e4f86e4207a769c12ba8bf47 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 19:35:19 +0100 Subject: [PATCH 34/63] fix(http): query-only URLs (?query) return original (Go keeps query raw) --- libdd-trace-obfuscation/src/http.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 72aa677cdd..4adfd116de 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -296,6 +296,18 @@ pub fn obfuscate_url_string( return String::from("?"); } } + // For query-only references (starting with '?'), Go keeps the query raw. + // With remove_query_string=true, return "?". Otherwise return original. + if url.starts_with('?') { + if has_invalid_percent_encoding(&url[1..]) { + return String::from("?"); + } + if remove_query_string { + return String::from("?"); + } + // Return original (Go keeps query chars raw, including non-ASCII) + return url.to_string(); + } // The url crate treats '\' as a path separator, silently consuming it. // Go encodes '\' as '%5C'. Pre-encode backslashes before go_like_reference // so they are preserved through base.join() and appear as '%5C' in the output. 
From 8b41991f2201b4a8ee59db6762871674c4eb0ffa Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 19:44:01 +0100 Subject: [PATCH 35/63] fix(http): preserve original query string (Go uses RawQuery, doesn't re-encode) --- libdd-trace-obfuscation/src/http.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 4adfd116de..0279a21893 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -443,6 +443,22 @@ pub fn obfuscate_url_string( result } }; + // Go keeps the query string raw (url.RawQuery in Go's URL struct). + // The url crate encodes query chars; restore the original query from the input. + let result = if !remove_query_string { + if let Some(orig_q_start) = url.find('?') { + let orig_query = &url[orig_q_start..]; // includes '?' and up to '#' + if let Some(result_q_start) = result.find('?') { + format!("{}{}", &result[..result_q_start], orig_query) + } else { + result + } + } else { + result + } + } else { + result + }; if remove_path_digits { return remove_relative_path_digits(&result); } From abacdf7e36d5a7b308274b8a21a64aabcdf2400e Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 19:48:00 +0100 Subject: [PATCH 36/63] fix(http): encode Cat1 chars ({|}^etc.) in fragment, also add ! ( ) * to Cat2 for fragment --- libdd-trace-obfuscation/src/http.rs | 77 ++++++++++++++++------------- 1 file changed, 43 insertions(+), 34 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 0279a21893..88920530e3 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -199,21 +199,20 @@ pub fn obfuscate_url_string( // (These are in validEncoded's allowlist so kept for pure-ASCII fragments, // but escape() encodes them too.) 
let frag_has_non_ascii = fragment.bytes().any(|b| b > 127); + // Cat1: always encode in fragments; Cat2: encode when non-ASCII present let url_for_join = if fragment.bytes().any(|b| b < 0x20 || b == 0x7F || b == b'#') - || (frag_has_non_ascii - && fragment - .chars() - .any(|c| matches!(c, '\'' | '[' | ']'))) + || fragment.chars().any(|c| matches!(c, '{' | '}' | '|' | '^' | '`' | '\\' | '<' | '>' | ' ')) + || (frag_has_non_ascii && fragment.chars().any(|c| matches!(c, '\'' | '[' | ']'))) { let mut encoded = String::from('#'); for c in fragment.chars() { let cp = c as u32; if cp < 0x20 || cp == 0x7F || c == '#' { encoded.push_str(&format!("%{cp:02X}")); - } else if frag_has_non_ascii - && matches!(c, '\'' | '[' | ']') - { + } else if matches!(c, '{' | '}' | '|' | '^' | '`' | '\\' | '<' | '>' | ' ') { + encoded.push_str(&format!("%{:02X}", c as u8)); + } else if frag_has_non_ascii && matches!(c, '\'' | '[' | ']') { encoded.push_str(&format!("%{:02X}", c as u8)); } else { encoded.push(c); @@ -274,6 +273,23 @@ pub fn obfuscate_url_string( for c in pp.chars() { match c { '\\' | '^' | '{' | '}' | '|' | '<' | '>' | '`' | ' ' => { enc.push('%'); enc.push_str(&format!("{:02X}", c as u8)); changed = true; } _ => enc.push(c), } } if changed { if rr.is_empty() { enc } else { format!("{enc}{rr}") } } else { result } }; + // Also encode Cat1 and (when frag has non-ASCII) Cat2 in the result's fragment + let orig_frag_has_non_ascii = url[url.find('#').map(|i|i+1).unwrap_or(url.len())..].bytes().any(|b| b > 127); + let result = if let Some(fs) = result.find('#') { + let (ph, fr) = result.split_at(fs); + let fr_inner = &fr[1..]; + let needs = fr_inner.chars().any(|c| matches!(c, '{' | '}' | '|' | '^' | '`' | '\\' | '<' | '>' | ' ')) + || (orig_frag_has_non_ascii && fr_inner.chars().any(|c| matches!(c, '\'' | '[' | ']'))); + if needs { + let mut out = ph.to_string(); out.push('#'); + for c in fr_inner.chars() { + if matches!(c, '{' | '}' | '|' | '^' | '`' | '\\' | '<' | '>' | ' ') { 
out.push_str(&format!("%{:02X}", c as u8)); } + else if orig_frag_has_non_ascii && matches!(c, '\'' | '[' | ']') { out.push_str(&format!("%{:02X}", c as u8)); } + else { out.push(c); } + } + out + } else { result } + } else { result }; if remove_path_digits { return remove_relative_path_digits(&result); } return result; } @@ -311,9 +327,10 @@ pub fn obfuscate_url_string( // The url crate treats '\' as a path separator, silently consuming it. // Go encodes '\' as '%5C'. Pre-encode backslashes before go_like_reference // so they are preserved through base.join() and appear as '%5C' in the output. + // Also pre-encode spaces (url crate may drop them). let url_pre_encoded; - let url_for_go_like = if url.contains('\\') { - url_pre_encoded = url.replace('\\', "%5C"); + let url_for_go_like = if url.contains('\\') || url.contains(' ') { + url_pre_encoded = url.replace('\\', "%5C").replace(' ', "%20"); url_pre_encoded.as_str() } else { url @@ -388,32 +405,24 @@ pub fn obfuscate_url_string( // Check if original URL's fragment also has non-ASCII let url_frag_start = url.find('#').map(|i| i + 1).unwrap_or(url.len()); let frag_has_non_ascii = url[url_frag_start..].bytes().any(|b| b > 127); - if frag_has_non_ascii { - // Also encode cat2 chars in the result's fragment - if let Some(frag_start) = encoded.find('#') { - let path_and_hash = &encoded[..=frag_start]; - let frag = &encoded[frag_start + 1..]; - // In fragments, Go encodes ' [ ] when non-ASCII triggers escape(), - // but NOT ! 
( ) * (shouldEscape returns false for those in encodeFragment) - if frag.chars().any(|c| matches!(c, '\'' | '[' | ']')) { - let mut out = path_and_hash.to_string(); - for c in frag.chars() { - if matches!(c, '\'' | '[' | ']') { - out.push_str(&format!("%{:02X}", c as u8)); - } else { - out.push(c); - } - } - out - } else { - encoded + // Encode Cat1 and (when frag has non-ASCII) Cat2 in the result's fragment + if let Some(frag_start) = encoded.find('#') { + let path_and_hash = &encoded[..=frag_start]; + let frag = &encoded[frag_start + 1..]; + let frag_needs_enc = frag.chars().any(|c| matches!(c, '{' | '}' | '|' | '^' | '`' | '\\' | '<' | '>' | ' ')) + || (frag_has_non_ascii && frag.chars().any(|c| matches!(c, '\'' | '[' | ']' ))); + if frag_needs_enc { + let mut out = path_and_hash.to_string(); + for c in frag.chars() { + if matches!(c, '{' | '}' | '|' | '^' | '`' | '\\' | '<' | '>' | ' ') { + out.push_str(&format!("%{:02X}", c as u8)); + } else if frag_has_non_ascii && matches!(c, '\'' | '[' | ']' ) { + out.push_str(&format!("%{:02X}", c as u8)); + } else { out.push(c); } } - } else { - encoded - } - } else { - encoded - } + out + } else { encoded } + } else { encoded } } else { // ASCII-only: only category 1 chars (\, ^, etc.) 
// Category 2 (!, ', (, ), *) are left as-is for pure ASCII inputs From c2d7aa2bbbeeb0583c6269e6aee5910aa04c0547 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 20:00:56 +0100 Subject: [PATCH 37/63] fix(http): invalid percent-encoding with both options false returns original (not '?') --- libdd-trace-obfuscation/src/http.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 88920530e3..6aa14162aa 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -295,9 +295,12 @@ pub fn obfuscate_url_string( } } // Go's url.Parse rejects invalid percent-encoding sequences (bare '%' or '%' not - // followed by exactly two hex digits). The `url` crate re-encodes them as '%25', - // so we must detect and reject them explicitly. + // followed by exactly two hex digits). Returns '?' if options are active, + // or original if both options are false (obfuscateUserInfo passthrough). if has_invalid_percent_encoding(url) { + if !remove_query_string && !remove_path_digits { + return url.to_string(); + } return String::from("?"); } // Go's url.Parse rejects URLs where the first path segment contains ':' (RFC 3986 From 5dd824499b47e30ac9f08df9cae86f0deb331fe8 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 20:16:46 +0100 Subject: [PATCH 38/63] fix(http): colon-in-segment with both options false returns original (Go passthrough) --- libdd-trace-obfuscation/src/http.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 6aa14162aa..60a78854dc 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -312,6 +312,9 @@ pub fn obfuscate_url_string( .find(|c| matches!(c, '/' | '?' 
| '#')) .unwrap_or(url.len()); if url[..segment_end].contains(':') { + if !remove_query_string && !remove_path_digits { + return url.to_string(); + } return String::from("?"); } } From 2c2ec014dc795f8fd0c9fee864a5863f2f040c28 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 20:32:14 +0100 Subject: [PATCH 39/63] fix(http): encode Cat2 chars when any Cat1 char triggers Go's escape() fallback --- libdd-trace-obfuscation/src/http.rs | 30 +++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 60a78854dc..70fe7bd241 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -392,19 +392,17 @@ pub fn obfuscate_url_string( } }; // Encode path chars that Go encodes but the url crate doesn't. - // Always apply encode_go_path_chars since it handles: - // - Category 1 (always encoded): \, ^, {, }, |, <, >, `, space - // - Category 2 (only when non-ASCII triggers escape() fallback): !, ', (, ), * - // For category 2, we still apply them here unconditionally since encode_go_path_chars - // would encode them for non-ASCII inputs; for pure-ASCII those chars were already - // handled by validEncoded allowing them in RawPath. But since we're post-processing - // the url crate's output (which keeps them), we must encode them only when non-ASCII. - // Simplification: apply all encodings, but for category 2 chars only when non-ASCII. - // Only check path portion (before '#') for non-ASCII; a non-ASCII fragment - // does not trigger Go's escape() fallback for the path encoding. + // Go's EscapedPath() calls escape(path, encodePath) whenever validEncoded() returns + // false. validEncoded() fails on: non-ASCII chars OR Cat1 chars (\,^,{,},|,<,>,`,space). + // When escape() is called, it also encodes Cat2 chars (!, ', (, ), *, [, ]). 
+ // So Cat2 chars are encoded whenever any Cat1 or non-ASCII char is present in the path. + // Only check path portion (before '#'); fragment has separate encoding logic. let path_end_for_ascii_check = url.find('#').unwrap_or(url.len()); - let has_non_ascii = url[..path_end_for_ascii_check].bytes().any(|b| b > 127); - let result = if has_non_ascii { + let path_for_check = &url[..path_end_for_ascii_check]; + let has_non_ascii = path_for_check.bytes().any(|b| b > 127); + let has_cat1 = path_for_check.chars().any(|c| matches!(c, '\\' | '^' | '{' | '}' | '|' | '<' | '>' | '`' | ' ')); + let needs_full_encoding = has_non_ascii || has_cat1; + let result = if needs_full_encoding { // Full encoding: both category 1 and category 2 in path + fragment. // When non-ASCII is present, Go's escape() also encodes cat2 chars in fragments. let encoded = encode_go_path_chars(&result); @@ -814,6 +812,14 @@ mod tests { input ["#!ჸ"] expected_output ["#!%E1%83%B8"]; ] + [ + // Cat1 char (<) triggers full escape(), which also encodes Cat2 char (!) + test_name [fuzzing_2455396347_cat1_triggers_cat2] + remove_query_string [true] + remove_path_digits [true] + input [" Date: Thu, 5 Mar 2026 20:36:35 +0100 Subject: [PATCH 40/63] fix(http): invalid percent-encoding check only applies to path, not query string --- libdd-trace-obfuscation/src/http.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 70fe7bd241..dc6ede6395 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -295,9 +295,9 @@ pub fn obfuscate_url_string( } } // Go's url.Parse rejects invalid percent-encoding sequences (bare '%' or '%' not - // followed by exactly two hex digits). Returns '?' if options are active, - // or original if both options are false (obfuscateUserInfo passthrough). 
- if has_invalid_percent_encoding(url) { + // followed by exactly two hex digits) in the PATH only, not in the query string. + // Check only the portion before '?' (and before '#'). + if has_invalid_percent_encoding(&url[..url.find(|c| c == '?' || c == '#').unwrap_or(url.len())]) { if !remove_query_string && !remove_path_digits { return url.to_string(); } From 72331712d969ac2884ef513e7f5233439a0bdc5c Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 5 Mar 2026 20:45:10 +0100 Subject: [PATCH 41/63] fix(http): reject invalid percent-encoding in fragment (not just path) --- libdd-trace-obfuscation/src/http.rs | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index dc6ede6395..6d9839d246 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -295,13 +295,18 @@ pub fn obfuscate_url_string( } } // Go's url.Parse rejects invalid percent-encoding sequences (bare '%' or '%' not - // followed by exactly two hex digits) in the PATH only, not in the query string. - // Check only the portion before '?' (and before '#'). - if has_invalid_percent_encoding(&url[..url.find(|c| c == '?' || c == '#').unwrap_or(url.len())]) { - if !remove_query_string && !remove_path_digits { - return url.to_string(); + // followed by exactly two hex digits) in the PATH and FRAGMENT, but not query string. + { + let path_end = url.find(|c| c == '?' 
|| c == '#').unwrap_or(url.len()); + let frag_start = url.find('#').map(|i| i + 1); + let path_invalid = has_invalid_percent_encoding(&url[..path_end]); + let frag_invalid = frag_start.is_some_and(|i| has_invalid_percent_encoding(&url[i..])); + if path_invalid || frag_invalid { + if !remove_query_string && !remove_path_digits { + return url.to_string(); + } + return String::from("?"); } - return String::from("?"); } // Go's url.Parse rejects URLs where the first path segment contains ':' (RFC 3986 // §4.2): this is ambiguous with a scheme separator. E.g., ":" and "1:b" both fail From bd6a9d587977fb46da7633aa4e0cf83998ed9762 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Fri, 6 Mar 2026 11:46:12 +0100 Subject: [PATCH 42/63] fix(http): reject invalid %-encoding and strip empty fragment in opaque URIs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Opaque URIs ending with bare '#' (e.g. "C:#") now strip the empty fragment to match Go's url.URL.String() which omits it - When a URL has control chars in the fragment, also check the path for invalid percent-encoding before pre-encoding — previously this branch returned early and skipped the path validity check, causing inputs like "ჸ#%\u{1}" to return a percent-encoded result instead of "?" 
--- libdd-trace-obfuscation/src/http.rs | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 6d9839d246..ad3f760edb 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -176,7 +176,9 @@ pub fn obfuscate_url_string( if opaque_part.bytes().any(|b| b < 0x20 || b == 0x7F) { return url.to_string(); // Go returns original on parse error } - return url[..scheme_len].to_lowercase() + opaque_part; + let result = url[..scheme_len].to_lowercase() + opaque_part; + // Go's url.URL.String() omits empty trailing fragment (bare '#') + return if result.ends_with('#') { result[..result.len() - 1].to_string() } else { result }; } res } @@ -240,6 +242,15 @@ pub fn obfuscate_url_string( if path_end < url.len() && url[path_end + 1..].bytes().any(|b| b < 0x20 || b == 0x7F || b == b'#') { + // If the path or fragment has invalid percent-encoding, Go rejects the URL. + let path_invalid = has_invalid_percent_encoding(&url[..path_end]); + let frag_invalid = has_invalid_percent_encoding(&url[path_end + 1..]); + if path_invalid || frag_invalid { + if !remove_query_string && !remove_path_digits { + return url.to_string(); + } + return String::from("?"); + } let mut pre_encoded = url[..path_end].to_string(); pre_encoded.push('#'); for c in url[path_end + 1..].chars() { @@ -825,6 +836,14 @@ mod tests { input [" Date: Fri, 6 Mar 2026 12:14:00 +0100 Subject: [PATCH 43/63] fix(http): encode Cat2 chars (!, ', etc.) when path contains double-quote MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Go's url.EscapedPath() calls escape() on the whole path whenever validEncoded() returns false. validEncoded() returns false for any char not in its explicit allowlist — including '\"' (double-quote). When escape() is called, it also encodes Category 2 chars (!, ', (, ), *). 
Add '\"' to the has_cat1 trigger check so that inputs containing '\"' in the path also get Category 2 encoding, matching Go's behavior. --- libdd-trace-obfuscation/src/http.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index ad3f760edb..9d99c2ffcc 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -416,7 +416,7 @@ pub fn obfuscate_url_string( let path_end_for_ascii_check = url.find('#').unwrap_or(url.len()); let path_for_check = &url[..path_end_for_ascii_check]; let has_non_ascii = path_for_check.bytes().any(|b| b > 127); - let has_cat1 = path_for_check.chars().any(|c| matches!(c, '\\' | '^' | '{' | '}' | '|' | '<' | '>' | '`' | ' ')); + let has_cat1 = path_for_check.chars().any(|c| matches!(c, '\\' | '^' | '{' | '}' | '|' | '<' | '>' | '`' | ' ' | '"')); let needs_full_encoding = has_non_ascii || has_cat1; let result = if needs_full_encoding { // Full encoding: both category 1 and category 2 in path + fragment. From 5c0526d82400fba005c8a3f9aefaa68270b6b1b0 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Fri, 6 Mar 2026 12:22:08 +0100 Subject: [PATCH 44/63] fix(http): reject percent-encoded non-UTF-8 sequences like Go Go's url.unescape validates that percent-encoded bytes in path/fragment form valid UTF-8 sequences. The Rust implementation only checked for syntactically invalid percent-encoding (wrong hex digits count), missing cases like %80 (a lone UTF-8 continuation byte) which Go rejects. Fix: collect consecutive percent-encoded bytes and validate with from_utf8. 
--- libdd-trace-obfuscation/src/http.rs | 30 ++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 9d99c2ffcc..32e4c8b63b 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -77,13 +77,33 @@ fn has_invalid_percent_encoding(s: &str) -> bool { let mut i = 0; while i < bytes.len() { if bytes[i] == b'%' { - if i + 2 >= bytes.len() - || !bytes[i + 1].is_ascii_hexdigit() - || !bytes[i + 2].is_ascii_hexdigit() - { + // Collect a run of consecutive percent-encoded bytes and validate as UTF-8. + // Go's url.unescape rejects sequences whose decoded bytes are not valid UTF-8 + // in path/fragment mode (e.g. %80 alone is a lone continuation byte). + let mut buf = Vec::new(); + while i < bytes.len() && bytes[i] == b'%' { + if i + 2 >= bytes.len() + || !bytes[i + 1].is_ascii_hexdigit() + || !bytes[i + 2].is_ascii_hexdigit() + { + return true; + } + let hi = match bytes[i + 1] { + b'0'..=b'9' => bytes[i + 1] - b'0', + b'a'..=b'f' => bytes[i + 1] - b'a' + 10, + _ => bytes[i + 1] - b'A' + 10, + }; + let lo = match bytes[i + 2] { + b'0'..=b'9' => bytes[i + 2] - b'0', + b'a'..=b'f' => bytes[i + 2] - b'a' + 10, + _ => bytes[i + 2] - b'A' + 10, + }; + buf.push((hi << 4) | lo); + i += 3; + } + if std::str::from_utf8(&buf).is_err() { return true; } - i += 3; } else { i += 1; } From 9de928d95dac612df5311b5001b6466a5f5a2385 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Fri, 6 Mar 2026 12:27:36 +0100 Subject: [PATCH 45/63] fix(http): preserve fragment when removing query string for ?#frag URLs Two bugs fixed: 1. go_like_reference() dropped the fragment when stripping the query. Fix: after finding path_end (at '?'), extract the '#...' fragment and include it in the returned string. 2. obfuscate_url_string() returned '?' for '?#frag' inputs with remove_query_string=true, discarding the fragment entirely. 
Fix: when after_q starts with '#' (empty query + fragment), fall through to go_like_reference which encodes and preserves it. --- libdd-trace-obfuscation/src/http.rs | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 32e4c8b63b..4f354cd2db 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -148,7 +148,9 @@ pub fn go_like_reference(input: &str, remove_query_string: bool) -> String { if let Some(rest) = full.strip_prefix("https://example.invalid") { if remove_query_string && resolved.query().is_some() { let path_end = rest.find('?').unwrap_or(rest.len()); - return format!("{}?", &rest[..path_end]); + // Preserve fragment (Go keeps it when removing the query string) + let frag = rest.find('#').map(|i| &rest[i..]).unwrap_or(""); + return format!("{}?{}", &rest[..path_end], frag); } return rest.to_string(); } @@ -157,9 +159,11 @@ pub fn go_like_reference(input: &str, remove_query_string: bool) -> String { if let Some(rest) = full.strip_prefix(base_prefix) { // relative path (e.g. "hello%20world" or "dir/hello%20world") if remove_query_string && resolved.query().is_some() { - // Strip the query string, preserving the path with a trailing "?" + // Strip the query string, preserving the path and fragment with a trailing "?" let path_end = rest.find('?').unwrap_or(rest.len()); - return format!("{}?", &rest[..path_end]); + // Preserve fragment (Go keeps it when removing the query string) + let frag = rest.find('#').map(|i| &rest[i..]).unwrap_or(""); + return format!("{}?{}", &rest[..path_end], frag); } rest.to_string() } else if let Some(rest) = full.strip_prefix("https://example.invalid") { @@ -361,10 +365,16 @@ pub fn obfuscate_url_string( return String::from("?"); } if remove_query_string { - return String::from("?"); + // If the URL is "?#frag" (empty query + fragment), preserve the fragment. 
+ // Fall through to go_like_reference which encodes and preserves it. + // For "?query" (no fragment), remove the query and return "?". + if !after_q.starts_with('#') { + return String::from("?"); + } + } else { + // Return original (Go keeps query chars raw, including non-ASCII) + return url.to_string(); } - // Return original (Go keeps query chars raw, including non-ASCII) - return url.to_string(); } // The url crate treats '\' as a path separator, silently consuming it. // Go encodes '\' as '%5C'. Pre-encode backslashes before go_like_reference From 0d5b9eb78e688997149e79690f4cf982ff0749bc Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Fri, 6 Mar 2026 12:29:15 +0100 Subject: [PATCH 46/63] fix(http): preserve fragment for ?query#frag when removing query string MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When remove_query_string=true and the URL has both a query and a fragment (e.g. "?ჸ#ჸ"), the previous fix only handled "?#frag" (empty query). Extend the fix to any URL starting with '?' that contains a '#' fragment. Fall through to go_like_reference which strips the query and preserves the fragment with correct percent-encoding. --- libdd-trace-obfuscation/src/http.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 4f354cd2db..1107dae204 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -365,10 +365,10 @@ pub fn obfuscate_url_string( return String::from("?"); } if remove_query_string { - // If the URL is "?#frag" (empty query + fragment), preserve the fragment. + // If the URL has a fragment ("?query#frag" or "?#frag"), preserve it. // Fall through to go_like_reference which encodes and preserves it. // For "?query" (no fragment), remove the query and return "?". 
- if !after_q.starts_with('#') { + if !after_q.contains('#') { return String::from("?"); } } else { From b7e873be7355f19202ac361ebccc6a5eb3236aa0 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Fri, 6 Mar 2026 12:31:13 +0100 Subject: [PATCH 47/63] fix(http): strip empty trailing fragment for query-only URLs Go's url.URL.String() omits an empty trailing fragment (bare '#'). For query-only URL references like '?query#', the previous code returned the original string including the bare '#', while Go returns '?query'. --- libdd-trace-obfuscation/src/http.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 1107dae204..16974d6f0a 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -372,8 +372,10 @@ pub fn obfuscate_url_string( return String::from("?"); } } else { - // Return original (Go keeps query chars raw, including non-ASCII) - return url.to_string(); + // Return original (Go keeps query chars raw, including non-ASCII). + // Go's url.URL.String() omits an empty trailing fragment (bare '#'). + let s = url.to_string(); + return if s.ends_with('#') { s[..s.len() - 1].to_string() } else { s }; } } // The url crate treats '\' as a path separator, silently consuming it. From d672f04d1c53f8fb5f59eea48d7295db3c7a444d Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Fri, 6 Mar 2026 12:37:54 +0100 Subject: [PATCH 48/63] fix(http): encode non-ASCII fragment chars for query-only URLs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For URLs starting with '?' that have a fragment (e.g. '?#ჸ'), Go's url.URL.String() percent-encodes non-ASCII chars in the fragment via EscapeFragment. Also, Go omits an empty trailing fragment ('?#' → '?'). Handle these cases early before the 'restore original query' pass which would otherwise undo the encoding. 
--- libdd-trace-obfuscation/src/http.rs | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 16974d6f0a..babf672277 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -371,11 +371,32 @@ pub fn obfuscate_url_string( if !after_q.contains('#') { return String::from("?"); } + } else if let Some(hash_pos) = after_q.find('#') { + // Fragment present. Go keeps query raw and percent-encodes non-ASCII in + // the fragment (url.URL.String() calls EscapeFragment). Handle it here so + // the "restore original query" pass below doesn't undo the encoding. + let query_part = &after_q[..hash_pos]; // query content (without '?') + let frag = &after_q[hash_pos + 1..]; // fragment content + if frag.is_empty() { + // Go's url.URL.String() omits an empty trailing fragment (bare '#'). + return format!("?{query_part}"); + } + // Encode non-ASCII chars in the fragment byte-by-byte. + let mut encoded_frag = String::new(); + for c in frag.chars() { + if (c as u32) > 127 { + let mut buf = [0u8; 4]; + for &b in c.encode_utf8(&mut buf).as_bytes() { + encoded_frag.push_str(&format!("%{b:02X}")); + } + } else { + encoded_frag.push(c); + } + } + return format!("?{query_part}#{encoded_frag}"); } else { - // Return original (Go keeps query chars raw, including non-ASCII). - // Go's url.URL.String() omits an empty trailing fragment (bare '#'). - let s = url.to_string(); - return if s.ends_with('#') { s[..s.len() - 1].to_string() } else { s }; + // No fragment: Go keeps query chars raw, including non-ASCII. + return url.to_string(); } } // The url crate treats '\' as a path separator, silently consuming it. 
From 308201355065ab8c1ff388d520511c5cc2ab4057 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Fri, 6 Mar 2026 12:40:02 +0100 Subject: [PATCH 49/63] fix(http): don't restore bare '#' fragment when restoring raw query The restore-original-query pass was splicing &url[q_start..] which includes any trailing '#' (empty fragment), overriding the empty-fragment stripping done by go_like_reference. Now only restores up to '#', and appends the (already-encoded/stripped) fragment from go_like_reference. --- libdd-trace-obfuscation/src/http.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index babf672277..b500230ea7 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -527,11 +527,17 @@ pub fn obfuscate_url_string( }; // Go keeps the query string raw (url.RawQuery in Go's URL struct). // The url crate encodes query chars; restore the original query from the input. + // Only restore the query portion (up to '#'), not the fragment — the fragment + // comes from go_like_reference which already handles encoding and empty stripping. let result = if !remove_query_string { if let Some(orig_q_start) = url.find('?') { - let orig_query = &url[orig_q_start..]; // includes '?' and up to '#' + let orig_frag_start = url.find('#'); + // orig_query: from '?' to '#' (exclusive), e.g. 
"?rawquery" + let orig_query = &url[orig_q_start..orig_frag_start.unwrap_or(url.len())]; if let Some(result_q_start) = result.find('?') { - format!("{}{}", &result[..result_q_start], orig_query) + // Keep the fragment from `result` (already encoded/stripped by go_like_reference) + let result_frag = result.find('#').map_or("", |i| &result[i..]); + format!("{}{}{}", &result[..result_q_start], orig_query, result_frag) } else { result } From 9bab0bfcf96e468cc00fac2623730e62abe1bb40 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Fri, 6 Mar 2026 12:42:54 +0100 Subject: [PATCH 50/63] fix(http): normalize percent-encoded unreserved chars in path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Go's url.Parse decodes %XX sequences where the decoded byte is an unreserved char (A-Z, a-z, 0-9, -, ., _, ~) as part of path normalization. E.g. %30 → 0, %41 → A. The url crate preserves them as-is. Add normalize_pct_encoded_unreserved() and apply it in go_like_reference on the path portion of all returned values. --- libdd-trace-obfuscation/src/http.rs | 65 ++++++++++++++++++++++++++--- 1 file changed, 59 insertions(+), 6 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index b500230ea7..87c1bd0beb 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -4,6 +4,55 @@ use percent_encoding::percent_decode_str; use url::Url; +/// Go's url.Parse normalizes percent-encoded unreserved chars (A-Z, a-z, 0-9, -, ., _, ~) +/// in the path by decoding them. E.g. %30 → 0, %41 → A. The url crate does not do this. +/// Apply this normalization to the path portion (before '?' or '#') of a URL string. 
+fn normalize_pct_encoded_unreserved(s: &str) -> String { + let path_end = s.find(['?', '#']).unwrap_or(s.len()); + let path_part = &s[..path_end]; + let rest = &s[path_end..]; + + let bytes = path_part.as_bytes(); + let mut out = String::with_capacity(path_part.len()); + let mut i = 0; + while i < bytes.len() { + if bytes[i] == b'%' + && i + 2 < bytes.len() + && bytes[i + 1].is_ascii_hexdigit() + && bytes[i + 2].is_ascii_hexdigit() + { + let hi = match bytes[i + 1] { + b'0'..=b'9' => bytes[i + 1] - b'0', + b'a'..=b'f' => bytes[i + 1] - b'a' + 10, + _ => bytes[i + 1] - b'A' + 10, + }; + let lo = match bytes[i + 2] { + b'0'..=b'9' => bytes[i + 2] - b'0', + b'a'..=b'f' => bytes[i + 2] - b'a' + 10, + _ => bytes[i + 2] - b'A' + 10, + }; + let c = (hi << 4) | lo; + // Unreserved chars per RFC 3986: ALPHA / DIGIT / "-" / "." / "_" / "~" + if c.is_ascii_alphanumeric() || matches!(c, b'-' | b'.' | b'_' | b'~') { + out.push(c as char); + } else { + out.push('%'); + out.push(bytes[i + 1] as char); + out.push(bytes[i + 2] as char); + } + i += 3; + } else { + out.push(bytes[i] as char); + i += 1; + } + } + if rest.is_empty() { + out + } else { + format!("{out}{rest}") + } +} + /// Encode path characters that Go's url.EscapedPath() encodes but the url crate doesn't. /// Only applied to the path portion (before the first '?'). /// @@ -144,15 +193,19 @@ pub fn go_like_reference(input: &str, remove_query_string: bool) -> String { // to preserve the leading '/' in the result. Otherwise base.join("/ჸ") resolves to // "https://example.invalid/%E1%83%B8" and stripping the base WITH trailing slash // drops the leading '/'. + // Helper: normalize percent-encoded unreserved chars in path, then append rest unchanged. + // Go's url.Parse normalizes %XX of unreserved chars (e.g. %30 → 0) in the path. 
+ let normalize = |s: &str| normalize_pct_encoded_unreserved(s); + if input.starts_with('/') { if let Some(rest) = full.strip_prefix("https://example.invalid") { if remove_query_string && resolved.query().is_some() { let path_end = rest.find('?').unwrap_or(rest.len()); // Preserve fragment (Go keeps it when removing the query string) let frag = rest.find('#').map(|i| &rest[i..]).unwrap_or(""); - return format!("{}?{}", &rest[..path_end], frag); + return normalize(&format!("{}?{}", &rest[..path_end], frag)); } - return rest.to_string(); + return normalize(rest); } } @@ -163,15 +216,15 @@ pub fn go_like_reference(input: &str, remove_query_string: bool) -> String { let path_end = rest.find('?').unwrap_or(rest.len()); // Preserve fragment (Go keeps it when removing the query string) let frag = rest.find('#').map(|i| &rest[i..]).unwrap_or(""); - return format!("{}?{}", &rest[..path_end], frag); + return normalize(&format!("{}?{}", &rest[..path_end], frag)); } - rest.to_string() + normalize(rest) } else if let Some(rest) = full.strip_prefix("https://example.invalid") { // covers cases like "/path" where the base origin remains - rest.to_string() + normalize(rest) } else { // shouldn't happen, but safe fallback - full.to_string() + normalize(full) } } From c726aa1b3f42c389ce32edea4d29258c289ebfe3 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Fri, 6 Mar 2026 12:59:13 +0100 Subject: [PATCH 51/63] fix(http): reject URLs where pre-fragment path starts with colon Go's url.Parse first splits on '#', then parses the pre-fragment portion. If that portion starts with ':' (empty scheme), getScheme returns "missing protocol scheme" and ObfuscateURLString returns '?'. The Rust code had a check for ':' in the first path segment, but it was placed after the CTL-in-fragment pre-encode block which returned early, so inputs like ":#" bypassed the check. Move the colon check to before the CTL-in-fragment block so it fires regardless of what the fragment contains. 
Fixes parity for input ":#\u{1}" (http_fuzzing_4114246193). --- libdd-trace-obfuscation/src/http.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 87c1bd0beb..d6cb0e1154 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -315,6 +315,19 @@ pub fn obfuscate_url_string( } return String::from("?"); } + // Check if Go would reject the pre-fragment path due to ':' in first segment. + // This check must come before the CTL-in-fragment pre-encode block (which + // returns early) so that ":#\x01" is caught here rather than being pre-encoded + // and passed to go_like_reference. + if path_end < url.len() { + let segment_end = url.find(['/', '?', '#']).unwrap_or(url.len()); + if url[..segment_end].contains(':') { + if !remove_query_string && !remove_path_digits { + return url.to_string(); + } + return String::from("?"); + } + } // Pre-encode control chars in the fragment (if any) before go_like_reference. 
if path_end < url.len() && url[path_end + 1..].bytes().any(|b| b < 0x20 || b == 0x7F || b == b'#') From e7180a75007b3c50d5ab8226369ef92347166bbb Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Fri, 6 Mar 2026 16:17:58 +0100 Subject: [PATCH 52/63] fix(http): HTTP non-ASCII path check --- libdd-trace-obfuscation/src/http.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index d6cb0e1154..39dec91ced 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -365,7 +365,7 @@ pub fn obfuscate_url_string( let raw = go_like_reference(url_for_go_like, remove_query_string); let raw = if raw.ends_with('#') { raw[..raw.len()-1].to_string() } else { raw }; let result = if raw.is_empty() && !url.is_empty() { url.to_string() } else { raw }; - let path_end_for_ascii = url.find('#').unwrap_or(url.len()); + let path_end_for_ascii = url.find(['?', '#']).unwrap_or(url.len()); let has_non_ascii = url[..path_end_for_ascii].bytes().any(|b| b > 127); let result = if has_non_ascii { encode_go_path_chars(&result) } else { let qs = result.find('?').unwrap_or(result.len()); @@ -531,8 +531,9 @@ pub fn obfuscate_url_string( // false. validEncoded() fails on: non-ASCII chars OR Cat1 chars (\,^,{,},|,<,>,`,space). // When escape() is called, it also encodes Cat2 chars (!, ', (, ), *, [, ]). // So Cat2 chars are encoded whenever any Cat1 or non-ASCII char is present in the path. - // Only check path portion (before '#'); fragment has separate encoding logic. - let path_end_for_ascii_check = url.find('#').unwrap_or(url.len()); + // Only check path portion (before '?' or '#'); query string and fragment have + // separate handling. Go's EscapedPath() only runs on the path component. 
+ let path_end_for_ascii_check = url.find(['?', '#']).unwrap_or(url.len()); let path_for_check = &url[..path_end_for_ascii_check]; let has_non_ascii = path_for_check.bytes().any(|b| b > 127); let has_cat1 = path_for_check.chars().any(|c| matches!(c, '\\' | '^' | '{' | '}' | '|' | '<' | '>' | '`' | ' ' | '"')); From c5f4ceba0201d69891d46330cbf81078f49ca335 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Fri, 6 Mar 2026 17:03:23 +0100 Subject: [PATCH 53/63] fix(http): clippy warnings --- libdd-trace-obfuscation/src/http.rs | 38 +++++++++++++++-------------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 39dec91ced..d2e91d5e86 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -64,7 +64,7 @@ fn encode_go_path_chars(url_str: &str) -> String { // Only encode up to the first '?' or '#' — the fragment has different encoding rules // (e.g., '!' is allowed in fragments per Go's shouldEscape for encodeFragment). let path_end = url_str - .find(|c| c == '?' || c == '#') + .find(['?', '#']) .unwrap_or(url_str.len()); let path_part = &url_str[..path_end]; let rest = &url_str[path_end..]; @@ -99,7 +99,7 @@ fn encode_go_path_chars(url_str: &str) -> String { fn remove_relative_path_digits(url_str: &str) -> String { // Only apply digit removal to the path (before '?' or '#'); fragments are not paths. let path_end = url_str - .find(|c: char| c == '?' 
|| c == '#') + .find(['?', '#']) .unwrap_or(url_str.len()); let path_part = &url_str[..path_end]; let rest = &url_str[path_end..]; @@ -166,7 +166,8 @@ fn has_invalid_percent_encoding(s: &str) -> bool { /// - If it's relative, return the encoded relative reference (no dummy base in output) pub fn go_like_reference(input: &str, remove_query_string: bool) -> String { // Dummy base just to let the parser resolve relatives - let base = Url::parse("https://example.invalid/").unwrap(); + #[allow(clippy::expect_used)] + let base = Url::parse("https://example.invalid/").expect("known-good base URL"); // Try absolute first (like "https://...", "mailto:...", etc.) if let Ok(abs) = Url::parse(input) { @@ -289,9 +290,9 @@ pub fn obfuscate_url_string( let cp = c as u32; if cp < 0x20 || cp == 0x7F || c == '#' { encoded.push_str(&format!("%{cp:02X}")); - } else if matches!(c, '{' | '}' | '|' | '^' | '`' | '\\' | '<' | '>' | ' ') { - encoded.push_str(&format!("%{:02X}", c as u8)); - } else if frag_has_non_ascii && matches!(c, '\'' | '[' | ']') { + } else if matches!(c, '{' | '}' | '|' | '^' | '`' | '\\' | '<' | '>' | ' ') + || (frag_has_non_ascii && matches!(c, '\'' | '[' | ']')) + { encoded.push_str(&format!("%{:02X}", c as u8)); } else { encoded.push(c); @@ -384,8 +385,9 @@ pub fn obfuscate_url_string( if needs { let mut out = ph.to_string(); out.push('#'); for c in fr_inner.chars() { - if matches!(c, '{' | '}' | '|' | '^' | '`' | '\\' | '<' | '>' | ' ') { out.push_str(&format!("%{:02X}", c as u8)); } - else if orig_frag_has_non_ascii && matches!(c, '\'' | '[' | ']') { out.push_str(&format!("%{:02X}", c as u8)); } + if matches!(c, '{' | '}' | '|' | '^' | '`' | '\\' | '<' | '>' | ' ') + || (orig_frag_has_non_ascii && matches!(c, '\'' | '[' | ']')) + { out.push_str(&format!("%{:02X}", c as u8)); } else { out.push(c); } } out @@ -398,7 +400,7 @@ pub fn obfuscate_url_string( // Go's url.Parse rejects invalid percent-encoding sequences (bare '%' or '%' not // followed by exactly two 
hex digits) in the PATH and FRAGMENT, but not query string. { - let path_end = url.find(|c| c == '?' || c == '#').unwrap_or(url.len()); + let path_end = url.find(['?', '#']).unwrap_or(url.len()); let frag_start = url.find('#').map(|i| i + 1); let path_invalid = has_invalid_percent_encoding(&url[..path_end]); let frag_invalid = frag_start.is_some_and(|i| has_invalid_percent_encoding(&url[i..])); @@ -415,7 +417,7 @@ pub fn obfuscate_url_string( // The url crate silently accepts these as path chars. { let segment_end = url - .find(|c| matches!(c, '/' | '?' | '#')) + .find(['/', '?', '#']) .unwrap_or(url.len()); if url[..segment_end].contains(':') { if !remove_query_string && !remove_path_digits { @@ -426,8 +428,8 @@ pub fn obfuscate_url_string( } // For query-only references (starting with '?'), Go keeps the query raw. // With remove_query_string=true, return "?". Otherwise return original. - if url.starts_with('?') { - if has_invalid_percent_encoding(&url[1..]) { + if let Some(after_q) = url.strip_prefix('?') { + if has_invalid_percent_encoding(after_q) { return String::from("?"); } if remove_query_string { @@ -489,12 +491,12 @@ pub fn obfuscate_url_string( // The url crate resolved away dot path segments (e.g. "." or "..") via RFC 3986 // normalization. Go's url.Parse preserves them literally. Return the original, // but strip a trailing empty fragment '#' (Go omits empty fragments). - let fallback = if url.ends_with('#') { &url[..url.len()-1] } else { url }; + let fallback = url.strip_suffix('#').unwrap_or(url); fallback.to_string() } else { // If the original URL had a dot-segment prefix (., .., ./, ../) that // base.join() resolved away, Go preserves it literally. Re-prepend it. 
- let frag_or_end = url.find(|c| c == '#' || c == '?').unwrap_or(url.len()); + let frag_or_end = url.find(['#', '?']).unwrap_or(url.len()); let orig_path = &url[..frag_or_end]; let dot_prefix_len = { let mut i = 0; @@ -554,9 +556,9 @@ pub fn obfuscate_url_string( if frag_needs_enc { let mut out = path_and_hash.to_string(); for c in frag.chars() { - if matches!(c, '{' | '}' | '|' | '^' | '`' | '\\' | '<' | '>' | ' ') { - out.push_str(&format!("%{:02X}", c as u8)); - } else if frag_has_non_ascii && matches!(c, '\'' | '[' | ']' ) { + if matches!(c, '{' | '}' | '|' | '^' | '`' | '\\' | '<' | '>' | ' ') + || (frag_has_non_ascii && matches!(c, '\'' | '[' | ']')) + { out.push_str(&format!("%{:02X}", c as u8)); } else { out.push(c); } } @@ -567,7 +569,7 @@ pub fn obfuscate_url_string( // ASCII-only: only category 1 chars (\, ^, etc.) // Category 2 (!, ', (, ), *) are left as-is for pure ASCII inputs // Also stop at '#' since fragment has different encoding rules - let path_end = result.find(|c| c == '?' || c == '#').unwrap_or(result.len()); + let path_end = result.find(['?', '#']).unwrap_or(result.len()); let path_part = &result[..path_end]; let rest = &result[path_end..]; let mut encoded = String::with_capacity(path_part.len()); From 42a31e550e9999b309888c1c88aca614db57af50 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Fri, 6 Mar 2026 17:16:41 +0100 Subject: [PATCH 54/63] fix(http): cargo fmt --- libdd-trace-obfuscation/src/http.rs | 207 ++++++++++++++++++---------- 1 file changed, 137 insertions(+), 70 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index d2e91d5e86..52f89f8567 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -63,9 +63,7 @@ fn normalize_pct_encoded_unreserved(s: &str) -> String { fn encode_go_path_chars(url_str: &str) -> String { // Only encode up to the first '?' or '#' — the fragment has different encoding rules // (e.g., '!' 
is allowed in fragments per Go's shouldEscape for encodeFragment). - let path_end = url_str - .find(['?', '#']) - .unwrap_or(url_str.len()); + let path_end = url_str.find(['?', '#']).unwrap_or(url_str.len()); let path_part = &url_str[..path_end]; let rest = &url_str[path_end..]; @@ -98,9 +96,7 @@ fn encode_go_path_chars(url_str: &str) -> String { /// splitting path by '/' and replacing segments containing digits with '?'. fn remove_relative_path_digits(url_str: &str) -> String { // Only apply digit removal to the path (before '?' or '#'); fragments are not paths. - let path_end = url_str - .find(['?', '#']) - .unwrap_or(url_str.len()); + let path_end = url_str.find(['?', '#']).unwrap_or(url_str.len()); let path_part = &url_str[..path_end]; let rest = &url_str[path_end..]; @@ -256,7 +252,11 @@ pub fn obfuscate_url_string( } let result = url[..scheme_len].to_lowercase() + opaque_part; // Go's url.URL.String() omits empty trailing fragment (bare '#') - return if result.ends_with('#') { result[..result.len() - 1].to_string() } else { result }; + return if result.ends_with('#') { + result[..result.len() - 1].to_string() + } else { + result + }; } res } @@ -276,32 +276,34 @@ pub fn obfuscate_url_string( // Go's url.Parse percent-encodes certain chars in fragments: // - Always: control chars, '#' // - When non-ASCII present (escape() fallback): '!', '\'', '(', ')', '*', '[', ']' - // (These are in validEncoded's allowlist so kept for pure-ASCII fragments, - // but escape() encodes them too.) + // (These are in validEncoded's allowlist so kept for pure-ASCII fragments, but + // escape() encodes them too.) 
let frag_has_non_ascii = fragment.bytes().any(|b| b > 127); // Cat1: always encode in fragments; Cat2: encode when non-ASCII present - let url_for_join = - if fragment.bytes().any(|b| b < 0x20 || b == 0x7F || b == b'#') - || fragment.chars().any(|c| matches!(c, '{' | '}' | '|' | '^' | '`' | '\\' | '<' | '>' | ' ')) - || (frag_has_non_ascii && fragment.chars().any(|c| matches!(c, '\'' | '[' | ']'))) - { - let mut encoded = String::from('#'); - for c in fragment.chars() { - let cp = c as u32; - if cp < 0x20 || cp == 0x7F || c == '#' { - encoded.push_str(&format!("%{cp:02X}")); - } else if matches!(c, '{' | '}' | '|' | '^' | '`' | '\\' | '<' | '>' | ' ') - || (frag_has_non_ascii && matches!(c, '\'' | '[' | ']')) - { - encoded.push_str(&format!("%{:02X}", c as u8)); - } else { - encoded.push(c); - } + let url_for_join = if fragment.bytes().any(|b| b < 0x20 || b == 0x7F || b == b'#') + || fragment + .chars() + .any(|c| matches!(c, '{' | '}' | '|' | '^' | '`' | '\\' | '<' | '>' | ' ')) + || (frag_has_non_ascii + && fragment.chars().any(|c| matches!(c, '\'' | '[' | ']'))) + { + let mut encoded = String::from('#'); + for c in fragment.chars() { + let cp = c as u32; + if cp < 0x20 || cp == 0x7F || c == '#' { + encoded.push_str(&format!("%{cp:02X}")); + } else if matches!(c, '{' | '}' | '|' | '^' | '`' | '\\' | '<' | '>' | ' ') + || (frag_has_non_ascii && matches!(c, '\'' | '[' | ']')) + { + encoded.push_str(&format!("%{:02X}", c as u8)); + } else { + encoded.push(c); } - encoded - } else { - url.to_string() - }; + } + encoded + } else { + url.to_string() + }; return go_like_reference(&url_for_join, remove_query_string); } // Go's url.Parse rejects control characters (bytes < 0x20 or 0x7F) in the PATH and @@ -331,7 +333,9 @@ pub fn obfuscate_url_string( } // Pre-encode control chars in the fragment (if any) before go_like_reference. if path_end < url.len() - && url[path_end + 1..].bytes().any(|b| b < 0x20 || b == 0x7F || b == b'#') + && url[path_end + 1..] 
+ .bytes() + .any(|b| b < 0x20 || b == 0x7F || b == b'#') { // If the path or fragment has invalid percent-encoding, Go rejects the URL. let path_invalid = has_invalid_percent_encoding(&url[..path_end]); @@ -364,36 +368,80 @@ pub fn obfuscate_url_string( url }; let raw = go_like_reference(url_for_go_like, remove_query_string); - let raw = if raw.ends_with('#') { raw[..raw.len()-1].to_string() } else { raw }; - let result = if raw.is_empty() && !url.is_empty() { url.to_string() } else { raw }; + let raw = if raw.ends_with('#') { + raw[..raw.len() - 1].to_string() + } else { + raw + }; + let result = if raw.is_empty() && !url.is_empty() { + url.to_string() + } else { + raw + }; let path_end_for_ascii = url.find(['?', '#']).unwrap_or(url.len()); let has_non_ascii = url[..path_end_for_ascii].bytes().any(|b| b > 127); - let result = if has_non_ascii { encode_go_path_chars(&result) } else { + let result = if has_non_ascii { + encode_go_path_chars(&result) + } else { let qs = result.find('?').unwrap_or(result.len()); - let pp = &result[..qs]; let rr = &result[qs..]; - let mut enc = String::with_capacity(pp.len()); let mut changed = false; - for c in pp.chars() { match c { '\\' | '^' | '{' | '}' | '|' | '<' | '>' | '`' | ' ' => { enc.push('%'); enc.push_str(&format!("{:02X}", c as u8)); changed = true; } _ => enc.push(c), } } - if changed { if rr.is_empty() { enc } else { format!("{enc}{rr}") } } else { result } + let pp = &result[..qs]; + let rr = &result[qs..]; + let mut enc = String::with_capacity(pp.len()); + let mut changed = false; + for c in pp.chars() { + match c { + '\\' | '^' | '{' | '}' | '|' | '<' | '>' | '`' | ' ' => { + enc.push('%'); + enc.push_str(&format!("{:02X}", c as u8)); + changed = true; + } + _ => enc.push(c), + } + } + if changed { + if rr.is_empty() { + enc + } else { + format!("{enc}{rr}") + } + } else { + result + } }; // Also encode Cat1 and (when frag has non-ASCII) Cat2 in the result's fragment - let orig_frag_has_non_ascii = 
url[url.find('#').map(|i|i+1).unwrap_or(url.len())..].bytes().any(|b| b > 127); + let orig_frag_has_non_ascii = url + [url.find('#').map(|i| i + 1).unwrap_or(url.len())..] + .bytes() + .any(|b| b > 127); let result = if let Some(fs) = result.find('#') { let (ph, fr) = result.split_at(fs); let fr_inner = &fr[1..]; - let needs = fr_inner.chars().any(|c| matches!(c, '{' | '}' | '|' | '^' | '`' | '\\' | '<' | '>' | ' ')) - || (orig_frag_has_non_ascii && fr_inner.chars().any(|c| matches!(c, '\'' | '[' | ']'))); + let needs = fr_inner.chars().any(|c| { + matches!(c, '{' | '}' | '|' | '^' | '`' | '\\' | '<' | '>' | ' ') + }) || (orig_frag_has_non_ascii + && fr_inner.chars().any(|c| matches!(c, '\'' | '[' | ']'))); if needs { - let mut out = ph.to_string(); out.push('#'); + let mut out = ph.to_string(); + out.push('#'); for c in fr_inner.chars() { if matches!(c, '{' | '}' | '|' | '^' | '`' | '\\' | '<' | '>' | ' ') || (orig_frag_has_non_ascii && matches!(c, '\'' | '[' | ']')) - { out.push_str(&format!("%{:02X}", c as u8)); } - else { out.push(c); } + { + out.push_str(&format!("%{:02X}", c as u8)); + } else { + out.push(c); + } } out - } else { result } - } else { result }; - if remove_path_digits { return remove_relative_path_digits(&result); } + } else { + result + } + } else { + result + }; + if remove_path_digits { + return remove_relative_path_digits(&result); + } return result; } } @@ -403,7 +451,8 @@ pub fn obfuscate_url_string( let path_end = url.find(['?', '#']).unwrap_or(url.len()); let frag_start = url.find('#').map(|i| i + 1); let path_invalid = has_invalid_percent_encoding(&url[..path_end]); - let frag_invalid = frag_start.is_some_and(|i| has_invalid_percent_encoding(&url[i..])); + let frag_invalid = + frag_start.is_some_and(|i| has_invalid_percent_encoding(&url[i..])); if path_invalid || frag_invalid { if !remove_query_string && !remove_path_digits { return url.to_string(); @@ -416,9 +465,7 @@ pub fn obfuscate_url_string( // with "missing protocol scheme" or 
"first path segment cannot contain colon". // The url crate silently accepts these as path chars. { - let segment_end = url - .find(['/', '?', '#']) - .unwrap_or(url.len()); + let segment_end = url.find(['/', '?', '#']).unwrap_or(url.len()); if url[..segment_end].contains(':') { if !remove_query_string && !remove_path_digits { return url.to_string(); @@ -478,8 +525,7 @@ pub fn obfuscate_url_string( } else { url }; - let fixme_url_go_parsing_raw = - go_like_reference(url_for_go_like, remove_query_string); + let fixme_url_go_parsing_raw = go_like_reference(url_for_go_like, remove_query_string); // Go's url.URL.String() omits a trailing empty fragment (bare '#'). // The url crate keeps it. Strip it here for parity. let fixme_url_go_parsing = if fixme_url_go_parsing_raw.ends_with('#') { @@ -501,11 +547,16 @@ pub fn obfuscate_url_string( let dot_prefix_len = { let mut i = 0; loop { - if orig_path[i..].starts_with("../") { i += 3; } - else if orig_path[i..].starts_with("./") { i += 2; } - else if &orig_path[i..] == ".." || &orig_path[i..] == "." { - i += orig_path[i..].len(); break; - } else { break; } + if orig_path[i..].starts_with("../") { + i += 3; + } else if orig_path[i..].starts_with("./") { + i += 2; + } else if &orig_path[i..] == ".." || &orig_path[i..] == "." { + i += orig_path[i..].len(); + break; + } else { + break; + } } i }; @@ -530,15 +581,21 @@ pub fn obfuscate_url_string( }; // Encode path chars that Go encodes but the url crate doesn't. // Go's EscapedPath() calls escape(path, encodePath) whenever validEncoded() returns - // false. validEncoded() fails on: non-ASCII chars OR Cat1 chars (\,^,{,},|,<,>,`,space). - // When escape() is called, it also encodes Cat2 chars (!, ', (, ), *, [, ]). - // So Cat2 chars are encoded whenever any Cat1 or non-ASCII char is present in the path. - // Only check path portion (before '?' or '#'); query string and fragment have - // separate handling. Go's EscapedPath() only runs on the path component. + // false. 
validEncoded() fails on: non-ASCII chars OR Cat1 chars + // (\,^,{,},|,<,>,`,space). When escape() is called, it also encodes Cat2 + // chars (!, ', (, ), *, [, ]). So Cat2 chars are encoded whenever any Cat1 + // or non-ASCII char is present in the path. Only check path portion (before + // '?' or '#'); query string and fragment have separate handling. Go's + // EscapedPath() only runs on the path component. let path_end_for_ascii_check = url.find(['?', '#']).unwrap_or(url.len()); let path_for_check = &url[..path_end_for_ascii_check]; let has_non_ascii = path_for_check.bytes().any(|b| b > 127); - let has_cat1 = path_for_check.chars().any(|c| matches!(c, '\\' | '^' | '{' | '}' | '|' | '<' | '>' | '`' | ' ' | '"')); + let has_cat1 = path_for_check.chars().any(|c| { + matches!( + c, + '\\' | '^' | '{' | '}' | '|' | '<' | '>' | '`' | ' ' | '"' + ) + }); let needs_full_encoding = has_non_ascii || has_cat1; let result = if needs_full_encoding { // Full encoding: both category 1 and category 2 in path + fragment. 
@@ -551,8 +608,11 @@ pub fn obfuscate_url_string( if let Some(frag_start) = encoded.find('#') { let path_and_hash = &encoded[..=frag_start]; let frag = &encoded[frag_start + 1..]; - let frag_needs_enc = frag.chars().any(|c| matches!(c, '{' | '}' | '|' | '^' | '`' | '\\' | '<' | '>' | ' ')) - || (frag_has_non_ascii && frag.chars().any(|c| matches!(c, '\'' | '[' | ']' ))); + let frag_needs_enc = frag + .chars() + .any(|c| matches!(c, '{' | '}' | '|' | '^' | '`' | '\\' | '<' | '>' | ' ')) + || (frag_has_non_ascii + && frag.chars().any(|c| matches!(c, '\'' | '[' | ']'))); if frag_needs_enc { let mut out = path_and_hash.to_string(); for c in frag.chars() { @@ -560,11 +620,17 @@ pub fn obfuscate_url_string( || (frag_has_non_ascii && matches!(c, '\'' | '[' | ']')) { out.push_str(&format!("%{:02X}", c as u8)); - } else { out.push(c); } + } else { + out.push(c); + } } out - } else { encoded } - } else { encoded } + } else { + encoded + } + } else { + encoded + } } else { // ASCII-only: only category 1 chars (\, ^, etc.) // Category 2 (!, ', (, ), *) are left as-is for pure ASCII inputs @@ -604,7 +670,8 @@ pub fn obfuscate_url_string( // orig_query: from '?' to '#' (exclusive), e.g. 
"?rawquery" let orig_query = &url[orig_q_start..orig_frag_start.unwrap_or(url.len())]; if let Some(result_q_start) = result.find('?') { - // Keep the fragment from `result` (already encoded/stripped by go_like_reference) + // Keep the fragment from `result` (already encoded/stripped by + // go_like_reference) let result_frag = result.find('#').map_or("", |i| &result[i..]); format!("{}{}{}", &result[..result_q_start], orig_query, result_frag) } else { From b27a2fc4f5b6b80f46ae2e0e91a72e7388076fb8 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Mon, 9 Mar 2026 13:48:49 +0100 Subject: [PATCH 55/63] fix(http): simplify a lot --- Cargo.lock | 18 + libdd-trace-obfuscation/Cargo.toml | 1 + libdd-trace-obfuscation/src/http.rs | 1008 +++++++++------------------ 3 files changed, 357 insertions(+), 670 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 033f40303b..a715081595 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -715,6 +715,12 @@ dependencies = [ "cc", ] +[[package]] +name = "borrow-or-share" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc0b364ead1874514c8c2855ab558056ebfeb775653e7ae45ff72f28f8f3166c" + [[package]] name = "build_common" version = "28.0.3" @@ -1960,6 +1966,17 @@ dependencies = [ "num-traits", ] +[[package]] +name = "fluent-uri" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc74ac4d8359ae70623506d512209619e5cf8f347124910440dbc221714b328e" +dependencies = [ + "borrow-or-share", + "ref-cast", + "serde", +] + [[package]] name = "fnv" version = "1.0.7" @@ -3382,6 +3399,7 @@ dependencies = [ "anyhow", "criterion", "duplicate", + "fluent-uri", "libdd-common", "libdd-trace-protobuf", "libdd-trace-utils", diff --git a/libdd-trace-obfuscation/Cargo.toml b/libdd-trace-obfuscation/Cargo.toml index 6b812ecc43..99cb80a18e 100644 --- a/libdd-trace-obfuscation/Cargo.toml +++ b/libdd-trace-obfuscation/Cargo.toml @@ -20,6 +20,7 @@ log = "0.4" 
libdd-trace-protobuf = { version = "1.1.0", path = "../libdd-trace-protobuf" } libdd-trace-utils = { version = "2.0.0", path = "../libdd-trace-utils" } libdd-common = { version = "2.0.0", path = "../libdd-common" } +fluent-uri = "0.4.1" [dev-dependencies] duplicate = "0.4.1" diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 52f89f8567..bdfb611eb0 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -1,228 +1,80 @@ // Copyright 2023-Present Datadog, Inc. https://www.datadoghq.com/ // SPDX-License-Identifier: Apache-2.0 +use fluent_uri::UriRef; use percent_encoding::percent_decode_str; -use url::Url; -/// Go's url.Parse normalizes percent-encoded unreserved chars (A-Z, a-z, 0-9, -, ., _, ~) -/// in the path by decoding them. E.g. %30 → 0, %41 → A. The url crate does not do this. -/// Apply this normalization to the path portion (before '?' or '#') of a URL string. -fn normalize_pct_encoded_unreserved(s: &str) -> String { - let path_end = s.find(['?', '#']).unwrap_or(s.len()); - let path_part = &s[..path_end]; - let rest = &s[path_end..]; +fn is_cat1(c: char) -> bool { + matches!( + c, + '\\' | '^' | '{' | '}' | '|' | '<' | '>' | '`' | ' ' | '"' + ) +} + +fn hex_val(b: u8) -> u8 { + match b { + b'0'..=b'9' => b - b'0', + b'a'..=b'f' => b - b'a' + 10, + _ => b - b'A' + 10, + } +} - let bytes = path_part.as_bytes(); - let mut out = String::with_capacity(path_part.len()); +/// Decode %XX for unreserved chars (A-Za-z0-9-._~) in path, matching Go's url.Parse behavior. 
+fn normalize_pct_encoded_unreserved(path: &str) -> String { + let b = path.as_bytes(); + let mut out = String::with_capacity(path.len()); let mut i = 0; - while i < bytes.len() { - if bytes[i] == b'%' - && i + 2 < bytes.len() - && bytes[i + 1].is_ascii_hexdigit() - && bytes[i + 2].is_ascii_hexdigit() + while i < b.len() { + if b[i] == b'%' + && i + 2 < b.len() + && b[i + 1].is_ascii_hexdigit() + && b[i + 2].is_ascii_hexdigit() { - let hi = match bytes[i + 1] { - b'0'..=b'9' => bytes[i + 1] - b'0', - b'a'..=b'f' => bytes[i + 1] - b'a' + 10, - _ => bytes[i + 1] - b'A' + 10, - }; - let lo = match bytes[i + 2] { - b'0'..=b'9' => bytes[i + 2] - b'0', - b'a'..=b'f' => bytes[i + 2] - b'a' + 10, - _ => bytes[i + 2] - b'A' + 10, - }; - let c = (hi << 4) | lo; - // Unreserved chars per RFC 3986: ALPHA / DIGIT / "-" / "." / "_" / "~" - if c.is_ascii_alphanumeric() || matches!(c, b'-' | b'.' | b'_' | b'~') { - out.push(c as char); + let v = (hex_val(b[i + 1]) << 4) | hex_val(b[i + 2]); + if v.is_ascii_alphanumeric() || matches!(v, b'-' | b'.' | b'_' | b'~') { + out.push(v as char); } else { - out.push('%'); - out.push(bytes[i + 1] as char); - out.push(bytes[i + 2] as char); + out.push_str(&path[i..i + 3]); } i += 3; } else { - out.push(bytes[i] as char); + out.push(b[i] as char); i += 1; } } - if rest.is_empty() { - out - } else { - format!("{out}{rest}") - } + out } -/// Encode path characters that Go's url.EscapedPath() encodes but the url crate doesn't. -/// Only applied to the path portion (before the first '?'). -/// -/// Two categories: -/// 1. Always encoded: chars not in Go's validEncoded allowlist (e.g. '\', '^', '{', '}', '|') -/// 2. Encoded only when escape() fallback occurs (non-ASCII present): '!', '\'', '(', ')', '*' -/// These are in validEncoded's allowlist so RawPath is used for pure-ASCII paths. -fn encode_go_path_chars(url_str: &str) -> String { - // Only encode up to the first '?' or '#' — the fragment has different encoding rules - // (e.g., '!' 
is allowed in fragments per Go's shouldEscape for encodeFragment). - let path_end = url_str.find(['?', '#']).unwrap_or(url_str.len()); - let path_part = &url_str[..path_end]; - let rest = &url_str[path_end..]; - - let mut encoded = String::with_capacity(path_part.len()); - for c in path_part.chars() { - match c { - // Category 1: always encoded (not in validEncoded's explicit allowlist) - '\\' | '^' | '{' | '}' | '|' | '<' | '>' | '`' | ' ' => { - encoded.push('%'); - encoded.push_str(&format!("{:02X}", c as u8)); - } - // Category 2: encoded only when escape() fallback (handled by caller check) - // These are in Go's validEncoded allowlist but get encoded when escape() is called - '!' | '\'' | '(' | ')' | '*' | '[' | ']' => { - encoded.push('%'); - encoded.push_str(&format!("{:02X}", c as u8)); - } - _ => encoded.push(c), - } - } - if rest.is_empty() { - encoded - } else { - format!("{encoded}{rest}") - } +fn is_path_cat2(c: char) -> bool { + matches!(c, '!' | '\'' | '(' | ')' | '*' | '[' | ']') } -/// Apply path-digit removal to a relative URL string returned by go_like_reference. -/// Operates only on the path portion (before the first '?'), matching Go's behavior of -/// splitting path by '/' and replacing segments containing digits with '?'. -fn remove_relative_path_digits(url_str: &str) -> String { - // Only apply digit removal to the path (before '?' or '#'); fragments are not paths. 
- let path_end = url_str.find(['?', '#']).unwrap_or(url_str.len()); - let path_part = &url_str[..path_end]; - let rest = &url_str[path_end..]; - - let mut segments: Vec<&str> = path_part.split('/').collect(); - let mut changed = false; - for segment in segments.iter_mut() { - if let Ok(decoded) = percent_decode_str(segment).decode_utf8() { - if decoded.chars().any(|c| char::is_ascii_digit(&c)) { - *segment = "?"; - changed = true; - } - } - } - if changed { - format!("{}{}", segments.join("/"), rest) - } else { - url_str.to_string() - } +fn is_frag_cat2(c: char) -> bool { + matches!(c, '\'' | '[' | ']') } -fn has_invalid_percent_encoding(s: &str) -> bool { - let bytes = s.as_bytes(); - let mut i = 0; - while i < bytes.len() { - if bytes[i] == b'%' { - // Collect a run of consecutive percent-encoded bytes and validate as UTF-8. - // Go's url.unescape rejects sequences whose decoded bytes are not valid UTF-8 - // in path/fragment mode (e.g. %80 alone is a lone continuation byte). - let mut buf = Vec::new(); - while i < bytes.len() && bytes[i] == b'%' { - if i + 2 >= bytes.len() - || !bytes[i + 1].is_ascii_hexdigit() - || !bytes[i + 2].is_ascii_hexdigit() - { - return true; - } - let hi = match bytes[i + 1] { - b'0'..=b'9' => bytes[i + 1] - b'0', - b'a'..=b'f' => bytes[i + 1] - b'a' + 10, - _ => bytes[i + 1] - b'A' + 10, - }; - let lo = match bytes[i + 2] { - b'0'..=b'9' => bytes[i + 2] - b'0', - b'a'..=b'f' => bytes[i + 2] - b'a' + 10, - _ => bytes[i + 2] - b'A' + 10, - }; - buf.push((hi << 4) | lo); - i += 3; - } - if std::str::from_utf8(&buf).is_err() { - return true; - } - } else { - i += 1; - } +fn encode_char(out: &mut String, c: char) { + let mut buf = [0u8; 4]; + for &b in c.encode_utf8(&mut buf).as_bytes() { + out.push_str(&format!("%{b:02X}")); } - false } -/// Go-ish behavior: -/// - Accepts almost anything as a URL reference -/// - If it's absolute, return it as-is (normalized/encoded) -/// - If it's relative, return the encoded relative reference (no 
dummy base in output) -pub fn go_like_reference(input: &str, remove_query_string: bool) -> String { - // Dummy base just to let the parser resolve relatives - #[allow(clippy::expect_used)] - let base = Url::parse("https://example.invalid/").expect("known-good base URL"); - - // Try absolute first (like "https://...", "mailto:...", etc.) - if let Ok(abs) = Url::parse(input) { - return abs.to_string(); - } - - // Otherwise parse as a relative reference against the dummy base - let resolved = base.join(input).unwrap_or_else(|_| { - // If join fails (rare, but can happen with weird inputs), fall back to putting it in the - // path. - let mut u = base.clone(); - u.set_path(input); - u - }); - - // Strip the dummy origin back off so you get "hello%20world", "/x%20y", "?q=a%20b", "#frag", - // etc. - let full = resolved.as_str(); - - // base.as_str() is "https://example.invalid/" - let base_prefix = base.as_str(); - - // For absolute-path inputs (starting with '/'), use the no-trailing-slash strip - // to preserve the leading '/' in the result. Otherwise base.join("/ჸ") resolves to - // "https://example.invalid/%E1%83%B8" and stripping the base WITH trailing slash - // drops the leading '/'. - // Helper: normalize percent-encoded unreserved chars in path, then append rest unchanged. - // Go's url.Parse normalizes %XX of unreserved chars (e.g. %30 → 0) in the path. 
- let normalize = |s: &str| normalize_pct_encoded_unreserved(s); - - if input.starts_with('/') { - if let Some(rest) = full.strip_prefix("https://example.invalid") { - if remove_query_string && resolved.query().is_some() { - let path_end = rest.find('?').unwrap_or(rest.len()); - // Preserve fragment (Go keeps it when removing the query string) - let frag = rest.find('#').map(|i| &rest[i..]).unwrap_or(""); - return normalize(&format!("{}?{}", &rest[..path_end], frag)); +fn redact_path_digits(path: &str) -> String { + path.split('/') + .map(|seg| { + if percent_decode_str(seg) + .decode_utf8_lossy() + .chars() + .any(|c| c.is_ascii_digit()) + { + "?" + } else { + seg } - return normalize(rest); - } - } - - if let Some(rest) = full.strip_prefix(base_prefix) { - // relative path (e.g. "hello%20world" or "dir/hello%20world") - if remove_query_string && resolved.query().is_some() { - // Strip the query string, preserving the path and fragment with a trailing "?" - let path_end = rest.find('?').unwrap_or(rest.len()); - // Preserve fragment (Go keeps it when removing the query string) - let frag = rest.find('#').map(|i| &rest[i..]).unwrap_or(""); - return normalize(&format!("{}?{}", &rest[..path_end], frag)); - } - normalize(rest) - } else if let Some(rest) = full.strip_prefix("https://example.invalid") { - // covers cases like "/path" where the base origin remains - normalize(rest) - } else { - // shouldn't happen, but safe fallback - normalize(full) - } + }) + .collect::>() + .join("/") } pub fn obfuscate_url_string( @@ -230,495 +82,126 @@ pub fn obfuscate_url_string( remove_query_string: bool, remove_path_digits: bool, ) -> String { - // Go rejects control chars in the path (returns '?'). Check before Url::parse since - // the url crate may silently drop control chars and succeed where Go would fail. 
- if remove_query_string || remove_path_digits { - let path_end = url.find('#').unwrap_or(url.len()); - if url[..path_end].bytes().any(|b| b < 0x20 || b == 0x7F) { - return String::from("?"); + if url.is_empty() { + return String::new(); + } + + let frag_pos = url.find('#'); + let path_query_end = frag_pos.unwrap_or(url.len()); + let path_end = url[..path_query_end].find('?').unwrap_or(path_query_end); + + // Control chars in path/query — Go rejects these + if url[..path_query_end].bytes().any(|b| b < 0x20 || b == 0x7F) { + return if remove_query_string || remove_path_digits { + "?".to_string() + } else { + url.to_string() + }; + } + + // Determine Go's escape() trigger: Cat1 or non-ASCII in path causes Cat2 encoding too + let path = &url[..path_end]; + let needs_full_path = path.bytes().any(|b| b > 127) || path.chars().any(is_cat1); + let frag_has_non_ascii = frag_pos.map_or(false, |i| url[i + 1..].bytes().any(|b| b > 127)); + + // Pre-encode chars that UriRef (strict RFC 3986) rejects. + // We encode ALL non-ASCII chars (not just Cat1/Cat2) so that characters outside + // RFC 3987 ucschar ranges (e.g. U+10EF4F, U+10FFFF) don't cause parse failures. + // Exclude the query — Go doesn't validate query percent-encoding, so we pass + // only path + fragment to UriRef and restore the original query afterward. + let mut pre = String::with_capacity(url.len() * 4); + for c in url[..path_end].chars() { + if !c.is_ascii() { + encode_char(&mut pre, c); + } else if is_cat1(c) || (needs_full_path && is_path_cat2(c)) { + pre.push_str(&format!("%{:02X}", c as u8)); + } else { + pre.push(c); } } - let mut parsed_url = match Url::parse(url) { - Ok(res) => { - // For cannot-be-a-base (opaque) URIs like "A:ᏤᏤ", Go keeps the opaque - // path verbatim. Return with lowercased scheme. - // Exception: if the opaque part has control chars, Go's url.Parse fails - // and obfuscateUserInfo returns the original URL unchanged. 
- if res.cannot_be_a_base() { - let scheme_len = url.find(':').unwrap_or(0); - let opaque_part = &url[scheme_len..]; - if opaque_part.bytes().any(|b| b < 0x20 || b == 0x7F) { - return url.to_string(); // Go returns original on parse error - } - let result = url[..scheme_len].to_lowercase() + opaque_part; - // Go's url.URL.String() omits empty trailing fragment (bare '#') - return if result.ends_with('#') { - result[..result.len() - 1].to_string() - } else { - result - }; + if let Some(fi) = frag_pos { + pre.push('#'); + for c in url[fi + 1..].chars() { + if !c.is_ascii() + || (c as u32) < 0x20 + || c as u32 == 0x7F + || c == '#' + || is_cat1(c) + || (frag_has_non_ascii && is_frag_cat2(c)) + { + encode_char(&mut pre, c); + } else { + pre.push(c); } - res } + } + + let uri = match UriRef::parse(pre.as_str()) { + Ok(u) => u, Err(_) => { - // Fragment-only references (e.g. "#", "#frag") are valid relative URL references. - // Go's url.Parse handles them successfully: "#" → "" (empty fragment → empty string), - // "#frag" → "#frag". Handle these before the go_like_reference fallback to prevent - // the "empty result → ?" heuristic from incorrectly triggering. - if let Some(fragment) = url.strip_prefix('#') { - if fragment.is_empty() { - return String::new(); - } - // Go also rejects invalid percent-encoding in fragments. - if has_invalid_percent_encoding(fragment) { - return String::from("?"); - } - // Go's url.Parse percent-encodes certain chars in fragments: - // - Always: control chars, '#' - // - When non-ASCII present (escape() fallback): '!', '\'', '(', ')', '*', '[', ']' - // (These are in validEncoded's allowlist so kept for pure-ASCII fragments, but - // escape() encodes them too.) 
- let frag_has_non_ascii = fragment.bytes().any(|b| b > 127); - // Cat1: always encode in fragments; Cat2: encode when non-ASCII present - let url_for_join = if fragment.bytes().any(|b| b < 0x20 || b == 0x7F || b == b'#') - || fragment - .chars() - .any(|c| matches!(c, '{' | '}' | '|' | '^' | '`' | '\\' | '<' | '>' | ' ')) - || (frag_has_non_ascii - && fragment.chars().any(|c| matches!(c, '\'' | '[' | ']'))) - { - let mut encoded = String::from('#'); - for c in fragment.chars() { - let cp = c as u32; - if cp < 0x20 || cp == 0x7F || c == '#' { - encoded.push_str(&format!("%{cp:02X}")); - } else if matches!(c, '{' | '}' | '|' | '^' | '`' | '\\' | '<' | '>' | ' ') - || (frag_has_non_ascii && matches!(c, '\'' | '[' | ']')) - { - encoded.push_str(&format!("%{:02X}", c as u8)); - } else { - encoded.push(c); - } - } - encoded - } else { - url.to_string() - }; - return go_like_reference(&url_for_join, remove_query_string); - } - // Go's url.Parse rejects control characters (bytes < 0x20 or 0x7F) in the PATH and - // returns "?". BUT when both options are false, Go's obfuscateUserInfo returns - // the original URL on parse failure (no "?"). - // Control chars in the FRAGMENT are percent-encoded, not rejected. - { - let path_end = url.find('#').unwrap_or(url.len()); - if url[..path_end].bytes().any(|b| b < 0x20 || b == 0x7F) { - if !remove_query_string && !remove_path_digits { - return url.to_string(); - } - return String::from("?"); - } - // Check if Go would reject the pre-fragment path due to ':' in first segment. - // This check must come before the CTL-in-fragment pre-encode block (which - // returns early) so that ":#\x01" is caught here rather than being pre-encoded - // and passed to go_like_reference. 
- if path_end < url.len() { - let segment_end = url.find(['/', '?', '#']).unwrap_or(url.len()); - if url[..segment_end].contains(':') { - if !remove_query_string && !remove_path_digits { - return url.to_string(); - } - return String::from("?"); - } - } - // Pre-encode control chars in the fragment (if any) before go_like_reference. - if path_end < url.len() - && url[path_end + 1..] - .bytes() - .any(|b| b < 0x20 || b == 0x7F || b == b'#') - { - // If the path or fragment has invalid percent-encoding, Go rejects the URL. - let path_invalid = has_invalid_percent_encoding(&url[..path_end]); - let frag_invalid = has_invalid_percent_encoding(&url[path_end + 1..]); - if path_invalid || frag_invalid { - if !remove_query_string && !remove_path_digits { - return url.to_string(); - } - return String::from("?"); - } - let mut pre_encoded = url[..path_end].to_string(); - pre_encoded.push('#'); - for c in url[path_end + 1..].chars() { - let cp = c as u32; - if cp < 0x20 || cp == 0x7F || c == '#' { - pre_encoded.push_str(&format!("%{cp:02X}")); - } else { - pre_encoded.push(c); - } - } - // Use the pre-encoded URL for the rest of the processing - let url = pre_encoded.as_str(); - // Continue to go_like_reference below using the pre-encoded url - // (fall through with modified url) - let url_pre_encoded_for_backslash; - let url_for_go_like = if url.contains('\\') { - url_pre_encoded_for_backslash = url.replace('\\', "%5C"); - url_pre_encoded_for_backslash.as_str() - } else { - url - }; - let raw = go_like_reference(url_for_go_like, remove_query_string); - let raw = if raw.ends_with('#') { - raw[..raw.len() - 1].to_string() - } else { - raw - }; - let result = if raw.is_empty() && !url.is_empty() { - url.to_string() - } else { - raw - }; - let path_end_for_ascii = url.find(['?', '#']).unwrap_or(url.len()); - let has_non_ascii = url[..path_end_for_ascii].bytes().any(|b| b > 127); - let result = if has_non_ascii { - encode_go_path_chars(&result) - } else { - let qs = 
result.find('?').unwrap_or(result.len()); - let pp = &result[..qs]; - let rr = &result[qs..]; - let mut enc = String::with_capacity(pp.len()); - let mut changed = false; - for c in pp.chars() { - match c { - '\\' | '^' | '{' | '}' | '|' | '<' | '>' | '`' | ' ' => { - enc.push('%'); - enc.push_str(&format!("{:02X}", c as u8)); - changed = true; - } - _ => enc.push(c), - } - } - if changed { - if rr.is_empty() { - enc - } else { - format!("{enc}{rr}") - } - } else { - result - } - }; - // Also encode Cat1 and (when frag has non-ASCII) Cat2 in the result's fragment - let orig_frag_has_non_ascii = url - [url.find('#').map(|i| i + 1).unwrap_or(url.len())..] - .bytes() - .any(|b| b > 127); - let result = if let Some(fs) = result.find('#') { - let (ph, fr) = result.split_at(fs); - let fr_inner = &fr[1..]; - let needs = fr_inner.chars().any(|c| { - matches!(c, '{' | '}' | '|' | '^' | '`' | '\\' | '<' | '>' | ' ') - }) || (orig_frag_has_non_ascii - && fr_inner.chars().any(|c| matches!(c, '\'' | '[' | ']'))); - if needs { - let mut out = ph.to_string(); - out.push('#'); - for c in fr_inner.chars() { - if matches!(c, '{' | '}' | '|' | '^' | '`' | '\\' | '<' | '>' | ' ') - || (orig_frag_has_non_ascii && matches!(c, '\'' | '[' | ']')) - { - out.push_str(&format!("%{:02X}", c as u8)); - } else { - out.push(c); - } - } - out - } else { - result - } - } else { - result - }; - if remove_path_digits { - return remove_relative_path_digits(&result); - } - return result; - } - } - // Go's url.Parse rejects invalid percent-encoding sequences (bare '%' or '%' not - // followed by exactly two hex digits) in the PATH and FRAGMENT, but not query string. 
- { - let path_end = url.find(['?', '#']).unwrap_or(url.len()); - let frag_start = url.find('#').map(|i| i + 1); - let path_invalid = has_invalid_percent_encoding(&url[..path_end]); - let frag_invalid = - frag_start.is_some_and(|i| has_invalid_percent_encoding(&url[i..])); - if path_invalid || frag_invalid { - if !remove_query_string && !remove_path_digits { - return url.to_string(); - } - return String::from("?"); - } - } - // Go's url.Parse rejects URLs where the first path segment contains ':' (RFC 3986 - // §4.2): this is ambiguous with a scheme separator. E.g., ":" and "1:b" both fail - // with "missing protocol scheme" or "first path segment cannot contain colon". - // The url crate silently accepts these as path chars. - { - let segment_end = url.find(['/', '?', '#']).unwrap_or(url.len()); - if url[..segment_end].contains(':') { - if !remove_query_string && !remove_path_digits { - return url.to_string(); - } - return String::from("?"); - } - } - // For query-only references (starting with '?'), Go keeps the query raw. - // With remove_query_string=true, return "?". Otherwise return original. - if let Some(after_q) = url.strip_prefix('?') { - if has_invalid_percent_encoding(after_q) { - return String::from("?"); - } - if remove_query_string { - // If the URL has a fragment ("?query#frag" or "?#frag"), preserve it. - // Fall through to go_like_reference which encodes and preserves it. - // For "?query" (no fragment), remove the query and return "?". - if !after_q.contains('#') { - return String::from("?"); - } - } else if let Some(hash_pos) = after_q.find('#') { - // Fragment present. Go keeps query raw and percent-encodes non-ASCII in - // the fragment (url.URL.String() calls EscapeFragment). Handle it here so - // the "restore original query" pass below doesn't undo the encoding. 
- let query_part = &after_q[..hash_pos]; // query content (without '?') - let frag = &after_q[hash_pos + 1..]; // fragment content - if frag.is_empty() { - // Go's url.URL.String() omits an empty trailing fragment (bare '#'). - return format!("?{query_part}"); - } - // Encode non-ASCII chars in the fragment byte-by-byte. - let mut encoded_frag = String::new(); - for c in frag.chars() { - if (c as u32) > 127 { - let mut buf = [0u8; 4]; - for &b in c.encode_utf8(&mut buf).as_bytes() { - encoded_frag.push_str(&format!("%{b:02X}")); - } - } else { - encoded_frag.push(c); - } - } - return format!("?{query_part}#{encoded_frag}"); - } else { - // No fragment: Go keeps query chars raw, including non-ASCII. - return url.to_string(); - } - } - // The url crate treats '\' as a path separator, silently consuming it. - // Go encodes '\' as '%5C'. Pre-encode backslashes before go_like_reference - // so they are preserved through base.join() and appear as '%5C' in the output. - // Also pre-encode spaces (url crate may drop them). - let url_pre_encoded; - let url_for_go_like = if url.contains('\\') || url.contains(' ') { - url_pre_encoded = url.replace('\\', "%5C").replace(' ', "%20"); - url_pre_encoded.as_str() - } else { - url - }; - let fixme_url_go_parsing_raw = go_like_reference(url_for_go_like, remove_query_string); - // Go's url.URL.String() omits a trailing empty fragment (bare '#'). - // The url crate keeps it. Strip it here for parity. - let fixme_url_go_parsing = if fixme_url_go_parsing_raw.ends_with('#') { - fixme_url_go_parsing_raw[..fixme_url_go_parsing_raw.len() - 1].to_string() - } else { - fixme_url_go_parsing_raw - }; - let result = if fixme_url_go_parsing.is_empty() && !url.is_empty() { - // The url crate resolved away dot path segments (e.g. "." or "..") via RFC 3986 - // normalization. Go's url.Parse preserves them literally. Return the original, - // but strip a trailing empty fragment '#' (Go omits empty fragments). 
- let fallback = url.strip_suffix('#').unwrap_or(url); - fallback.to_string() - } else { - // If the original URL had a dot-segment prefix (., .., ./, ../) that - // base.join() resolved away, Go preserves it literally. Re-prepend it. - let frag_or_end = url.find(['#', '?']).unwrap_or(url.len()); - let orig_path = &url[..frag_or_end]; - let dot_prefix_len = { - let mut i = 0; - loop { - if orig_path[i..].starts_with("../") { - i += 3; - } else if orig_path[i..].starts_with("./") { - i += 2; - } else if &orig_path[i..] == ".." || &orig_path[i..] == "." { - i += orig_path[i..].len(); - break; - } else { - break; - } - } - i - }; - if dot_prefix_len > 0 { - let dot_prefix = &url[..dot_prefix_len]; - // Prepend the lost dot prefix - if !fixme_url_go_parsing.starts_with(dot_prefix) { - format!("{}{}", dot_prefix, fixme_url_go_parsing) - } else { - fixme_url_go_parsing - } - } else if fixme_url_go_parsing.starts_with('#') { - // Non-dot path resolved to fragment only - prepend original path - if !orig_path.is_empty() { - format!("{}{}", orig_path, fixme_url_go_parsing) - } else { - fixme_url_go_parsing - } - } else { - fixme_url_go_parsing - } - }; - // Encode path chars that Go encodes but the url crate doesn't. - // Go's EscapedPath() calls escape(path, encodePath) whenever validEncoded() returns - // false. validEncoded() fails on: non-ASCII chars OR Cat1 chars - // (\,^,{,},|,<,>,`,space). When escape() is called, it also encodes Cat2 - // chars (!, ', (, ), *, [, ]). So Cat2 chars are encoded whenever any Cat1 - // or non-ASCII char is present in the path. Only check path portion (before - // '?' or '#'); query string and fragment have separate handling. Go's - // EscapedPath() only runs on the path component. 
- let path_end_for_ascii_check = url.find(['?', '#']).unwrap_or(url.len()); - let path_for_check = &url[..path_end_for_ascii_check]; - let has_non_ascii = path_for_check.bytes().any(|b| b > 127); - let has_cat1 = path_for_check.chars().any(|c| { - matches!( - c, - '\\' | '^' | '{' | '}' | '|' | '<' | '>' | '`' | ' ' | '"' - ) - }); - let needs_full_encoding = has_non_ascii || has_cat1; - let result = if needs_full_encoding { - // Full encoding: both category 1 and category 2 in path + fragment. - // When non-ASCII is present, Go's escape() also encodes cat2 chars in fragments. - let encoded = encode_go_path_chars(&result); - // Check if original URL's fragment also has non-ASCII - let url_frag_start = url.find('#').map(|i| i + 1).unwrap_or(url.len()); - let frag_has_non_ascii = url[url_frag_start..].bytes().any(|b| b > 127); - // Encode Cat1 and (when frag has non-ASCII) Cat2 in the result's fragment - if let Some(frag_start) = encoded.find('#') { - let path_and_hash = &encoded[..=frag_start]; - let frag = &encoded[frag_start + 1..]; - let frag_needs_enc = frag - .chars() - .any(|c| matches!(c, '{' | '}' | '|' | '^' | '`' | '\\' | '<' | '>' | ' ')) - || (frag_has_non_ascii - && frag.chars().any(|c| matches!(c, '\'' | '[' | ']'))); - if frag_needs_enc { - let mut out = path_and_hash.to_string(); - for c in frag.chars() { - if matches!(c, '{' | '}' | '|' | '^' | '`' | '\\' | '<' | '>' | ' ') - || (frag_has_non_ascii && matches!(c, '\'' | '[' | ']')) - { - out.push_str(&format!("%{:02X}", c as u8)); - } else { - out.push(c); - } - } - out - } else { - encoded - } - } else { - encoded - } + return if remove_query_string || remove_path_digits { + "?".to_string() } else { - // ASCII-only: only category 1 chars (\, ^, etc.) 
- // Category 2 (!, ', (, ), *) are left as-is for pure ASCII inputs - // Also stop at '#' since fragment has different encoding rules - let path_end = result.find(['?', '#']).unwrap_or(result.len()); - let path_part = &result[..path_end]; - let rest = &result[path_end..]; - let mut encoded = String::with_capacity(path_part.len()); - let mut changed = false; - for c in path_part.chars() { - match c { - '\\' | '^' | '{' | '}' | '|' | '<' | '>' | '`' | ' ' => { - encoded.push('%'); - encoded.push_str(&format!("{:02X}", c as u8)); - changed = true; - } - _ => encoded.push(c), - } - } - if changed { - if rest.is_empty() { - encoded - } else { - format!("{encoded}{rest}") - } - } else { - result - } + url.to_string() }; - // Go keeps the query string raw (url.RawQuery in Go's URL struct). - // The url crate encodes query chars; restore the original query from the input. - // Only restore the query portion (up to '#'), not the fragment — the fragment - // comes from go_like_reference which already handles encoding and empty stripping. - let result = if !remove_query_string { - if let Some(orig_q_start) = url.find('?') { - let orig_frag_start = url.find('#'); - // orig_query: from '?' to '#' (exclusive), e.g. 
"?rawquery" - let orig_query = &url[orig_q_start..orig_frag_start.unwrap_or(url.len())]; - if let Some(result_q_start) = result.find('?') { - // Keep the fragment from `result` (already encoded/stripped by - // go_like_reference) - let result_frag = result.find('#').map_or("", |i| &result[i..]); - format!("{}{}{}", &result[..result_q_start], orig_query, result_frag) - } else { - result - } - } else { - result - } - } else { - result - }; - if remove_path_digits { - return remove_relative_path_digits(&result); - } - return result; } }; - // remove username & password - parsed_url.set_username("").unwrap_or_default(); - parsed_url.set_password(Some("")).unwrap_or_default(); + let mut out = String::new(); - if remove_query_string && parsed_url.query().is_some() { - parsed_url.set_query(Some("")); + if let Some(scheme) = uri.scheme() { + out.push_str(&scheme.as_str().to_lowercase()); + out.push(':'); } - if !remove_path_digits { - return parsed_url.to_string(); + if let Some(auth) = uri.authority() { + out.push_str("//"); + // Strip userinfo — emit only host[:port] + out.push_str(auth.host()); + if let Some(port) = auth.port() { + out.push(':'); + out.push_str(port.as_str()); + } + let path_str = normalize_pct_encoded_unreserved(uri.path().as_str()); + if remove_path_digits { + out.push_str(&redact_path_digits(&path_str)); + } else { + out.push_str(&path_str); + } + } else if uri.scheme().is_some() { + // Opaque URL (scheme but no authority): Go keeps the opaque part verbatim. + // u.Path is empty for opaque URLs in Go, so no digit redaction applies. 
+ let scheme_end = url.find(':').unwrap() + 1; + out.push_str(&url[scheme_end..path_end]); + } else { + // Relative reference: use pre-encoded path + let path_str = normalize_pct_encoded_unreserved(uri.path().as_str()); + if remove_path_digits { + out.push_str(&redact_path_digits(&path_str)); + } else { + out.push_str(&path_str); + } } - // remove path digits - let mut split_url: Vec<&str> = parsed_url.path().split('/').collect(); - let mut changed = false; - for segment in split_url.iter_mut() { - // we don't want to redact any HTML encodings - #[allow(clippy::unwrap_used)] - let decoded = percent_decode_str(segment).decode_utf8().unwrap(); - if decoded.chars().any(|c| char::is_ascii_digit(&c)) { - *segment = "/REDACTED/"; - changed = true; + // Use original URL positions to detect query — uri.query() is always None since we + // excluded the query from the string we passed to UriRef. + if remove_query_string { + if path_end < path_query_end { + out.push('?'); } + } else if path_end < path_query_end { + // Restore original raw query (Go's url.RawQuery is kept verbatim) + out.push_str(&url[path_end..path_query_end]); } - if changed { - parsed_url.set_path(&split_url.join("/")); + + if let Some(frag) = uri.fragment() { + if !frag.as_str().is_empty() { + out.push('#'); + out.push_str(frag.as_str()); + } } - parsed_url.to_string().replace("/REDACTED/", "?") + out } #[cfg(test)] @@ -1039,6 +522,191 @@ mod tests { input ["ჸ#%\u{1}"] expected_output ["?"]; ] + [ + test_name [parity_double_quote_cat1] + remove_query_string [true] + remove_path_digits [true] + input ["\"!"] + expected_output ["%22%21"]; + ] + [ + test_name [parity_dot_hash_unicode] + remove_query_string [true] + remove_path_digits [true] + input [".#ჸ"] + expected_output [".#%E1%83%B8"]; + ] + [ + test_name [parity_dot_hash] + remove_query_string [true] + remove_path_digits [true] + input [".#"] + expected_output ["."]; + ] + [ + test_name [parity_unicode_hash_digit] + remove_query_string [true] + 
remove_path_digits [true] + input ["ჸ#0"] + expected_output ["%E1%83%B8#0"]; + ] + [ + test_name [parity_scheme_empty_frag] + remove_query_string [true] + remove_path_digits [true] + input ["C:#"] + expected_output ["c:"]; + ] + [ + test_name [parity_relative_dotdot_unicode] + remove_query_string [true] + remove_path_digits [true] + input ["../ჸ"] + expected_output ["../%E1%83%B8"]; + ] + [ + test_name [parity_query_hash_unicode_both] + remove_query_string [true] + remove_path_digits [true] + input ["?#ჸ"] + expected_output ["?#%E1%83%B8"]; + ] + [ + test_name [parity_query_hash_unicode_digits] + remove_query_string [false] + remove_path_digits [true] + input ["?#ჸ"] + expected_output ["?#%E1%83%B8"]; + ] + [ + test_name [parity_excl_query_unicode] + remove_query_string [true] + remove_path_digits [true] + input ["!?ჸ"] + expected_output ["!?"]; + ] + [ + test_name [parity_query_unicode_keep] + remove_query_string [false] + remove_path_digits [true] + input ["?ჸ"] + expected_output ["?ჸ"]; + ] + [ + test_name [parity_space_unicode] + remove_query_string [true] + remove_path_digits [true] + input [" ჸ"] + expected_output ["%20%E1%83%B8"]; + ] + [ + test_name [parity_unicode_query_unicode_keep] + remove_query_string [false] + remove_path_digits [true] + input ["ჸ?ჸ"] + expected_output ["%E1%83%B8?ჸ"]; + ] + [ + test_name [parity_unicode_query_hash_both] + remove_query_string [true] + remove_path_digits [true] + input ["?ჸ#ჸ"] + expected_output ["?#%E1%83%B8"]; + ] + [ + test_name [parity_unicode_query_empty_hash] + remove_query_string [false] + remove_path_digits [true] + input ["ჸ?#"] + expected_output ["%E1%83%B8?"]; + ] + [ + test_name [parity_pct_unreserved_normalize] + remove_query_string [true] + remove_path_digits [false] + input ["%30ჸ"] + expected_output ["0%E1%83%B8"]; + ] + [ + test_name [parity_unicode_query_invalid_pct] + remove_query_string [true] + remove_path_digits [true] + input ["ჸ?%"] + expected_output ["%E1%83%B8?"]; + ] + [ + test_name 
[parity_not_a_url_both_false] + remove_query_string [false] + remove_path_digits [false] + input ["this is not a valid url"] + expected_output ["this%20is%20not%20a%20valid%20url"]; + ] + [ + test_name [parity_not_a_url_both_true] + remove_query_string [true] + remove_path_digits [true] + input ["this is not a valid url"] + expected_output ["this%20is%20not%20a%20valid%20url"]; + ] + [ + test_name [parity_disabled_userinfo] + remove_query_string [false] + remove_path_digits [false] + input ["http://user:password@foo.com/1/2/3?q=james"] + expected_output ["http://foo.com/1/2/3?q=james"]; + ] + [ + test_name [parity_colon_both_false] + remove_query_string [false] + remove_path_digits [false] + input [":"] + expected_output [":"]; + ] + [ + test_name [parity_pct_both_false] + remove_query_string [false] + remove_path_digits [false] + input ["%"] + expected_output ["%"]; + ] + [ + test_name [parity_ctrl_in_scheme_both_false] + remove_query_string [false] + remove_path_digits [false] + input ["C:\u{1}"] + expected_output ["C:\u{1}"]; + ] + [ + test_name [parity_ctrl_both_false] + remove_query_string [false] + remove_path_digits [false] + input ["\u{1}"] + expected_output ["\u{1}"]; + ] + [ + test_name [parity_frag_curly_brace] + remove_query_string [true] + remove_path_digits [true] + input ["ჸ#{ჸ"] + expected_output ["%E1%83%B8#%7B%E1%83%B8"]; + ] + [ + // Opaque URL: Go keeps the opaque part verbatim (not percent-encoded) + test_name [parity_opaque_url_unicode] + remove_query_string [true] + remove_path_digits [true] + input ["A:ჸ"] + expected_output ["a:ჸ"]; + ] + [ + // Fragment with chars outside RFC 3987 ucschar ranges (U+10EF4F, U+10FFFF, etc.) + // These must be percent-encoded, not cause a parse failure returning "?" 
+ test_name [parity_fuzzing_supp_unicode_frag] + remove_query_string [true] + remove_path_digits [true] + input ["\u{91cb8}\u{9232f}झ\u{44db0}#\u{3}\n\u{5bb50}\u{925d9}\u{925d5}\u{925d5}\u{925d5}\u{925d5}䕞\u{9a70d}\u{3d2ff}\u{10ef4f}\u{87307}\u{6}\u{10ef0a}\u{10ffff}\u{ad7e5}\u{33f}筚\u{361}➑\u{2}{\u{10de13}\u{10ffff}\u{10ffff}'"] + expected_output ["%F2%91%B2%B8%F2%92%8C%AF%E0%A4%9D%F1%84%B6%B0#%03%0A%F1%9B%AD%90%F2%92%97%99%F2%92%97%95%F2%92%97%95%F2%92%97%95%F2%92%97%95%E4%95%9E%F2%9A%9C%8D%F0%BD%8B%BF%F4%8E%BD%8F%F2%87%8C%87%06%F4%8E%BC%8A%F4%8F%BF%BF%F2%AD%9F%A5%CC%BF%E7%AD%9A%CD%A1%E2%9E%91%02%7B%F4%8D%B8%93%F4%8F%BF%BF%F4%8F%BF%BF%27"]; + ] )] #[test] fn test_name() { From a22d54a862cb3955aa46e905d1401473fa928abc Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Mon, 9 Mar 2026 17:12:23 +0100 Subject: [PATCH 56/63] fix(http): clippy --- libdd-trace-obfuscation/src/http.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index bdfb611eb0..87103ec167 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -102,7 +102,7 @@ pub fn obfuscate_url_string( // Determine Go's escape() trigger: Cat1 or non-ASCII in path causes Cat2 encoding too let path = &url[..path_end]; let needs_full_path = path.bytes().any(|b| b > 127) || path.chars().any(is_cat1); - let frag_has_non_ascii = frag_pos.map_or(false, |i| url[i + 1..].bytes().any(|b| b > 127)); + let frag_has_non_ascii = frag_pos.is_some_and(|i| url[i + 1..].bytes().any(|b| b > 127)); // Pre-encode chars that UriRef (strict RFC 3986) rejects. 
// We encode ALL non-ASCII chars (not just Cat1/Cat2) so that characters outside From 0362c872a84c718f63ddf73d5cc7699862393f15 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Mon, 9 Mar 2026 18:46:32 +0100 Subject: [PATCH 57/63] fix: update LICENSE-3rdparty.yml --- LICENSE-3rdparty.yml | 55 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/LICENSE-3rdparty.yml b/LICENSE-3rdparty.yml index db7415cf26..b34e2a7e8e 100644 --- a/LICENSE-3rdparty.yml +++ b/LICENSE-3rdparty.yml @@ -1,4 +1,4 @@ -root_name: builder, build_common, tools, libdd-alloc, libdd-crashtracker, libdd-common, libdd-telemetry, libdd-ddsketch, libdd-libunwind-sys, libdd-crashtracker-ffi, libdd-common-ffi, datadog-ffe, datadog-ffe-ffi, datadog-ipc, datadog-ipc-macros, libdd-tinybytes, tarpc, tarpc-plugins, spawn_worker, cc_utils, libdd-library-config, libdd-trace-protobuf, libdd-library-config-ffi, datadog-live-debugger, libdd-data-pipeline, libdd-dogstatsd-client, libdd-trace-stats, libdd-trace-utils, libdd-trace-normalization, libdd-log, datadog-live-debugger-ffi, libdd-profiling, libdd-profiling-protobuf, libdd-profiling-ffi, libdd-data-pipeline-ffi, libdd-ddsketch-ffi, libdd-log-ffi, libdd-telemetry-ffi, symbolizer-ffi, datadog-profiling-replayer, datadog-remote-config, datadog-sidecar, datadog-sidecar-macros, datadog-sidecar-ffi, libdd-trace-obfuscation, datadog-tracer-flare, sidecar_mockgen, test_spawn_from_lib +root_name: builder, build_common, tools, libdd-alloc, libdd-crashtracker, libdd-common, libdd-telemetry, libdd-ddsketch, libdd-libunwind-sys, libdd-crashtracker-ffi, libdd-common-ffi, datadog-ffe, datadog-ffe-ffi, datadog-ipc, datadog-ipc-macros, libdd-tinybytes, tarpc, tarpc-plugins, spawn_worker, cc_utils, libdd-library-config, libdd-trace-protobuf, libdd-library-config-ffi, datadog-live-debugger, libdd-data-pipeline, libdd-dogstatsd-client, libdd-trace-stats, libdd-trace-utils, libdd-trace-normalization, libdd-log, 
datadog-live-debugger-ffi, libdd-profiling, libdd-profiling-protobuf, libdd-profiling-ffi, libdd-data-pipeline-ffi, libdd-ddsketch-ffi, libdd-log-ffi, libdd-telemetry-ffi, symbolizer-ffi, datadog-profiling-replayer, datadog-remote-config, datadog-sidecar, datadog-sidecar-macros, datadog-sidecar-ffi, libdd-trace-obfuscation, datadog-tracer-flare, sidecar_mockgen, test_spawn_from_lib, bin_tests third_party_libraries: - package_name: addr2line package_version: 0.24.2 @@ -5197,6 +5197,31 @@ third_party_libraries: licenses: - license: MIT text: NOT FOUND +- package_name: borrow-or-share + package_version: 0.2.4 + repository: https://github.com/yescallop/borrow-or-share + license: MIT-0 + licenses: + - license: MIT-0 + text: |- + MIT No Attribution + + Copyright 2024 Scallop Ye + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. - package_name: bumpalo package_version: 3.17.0 repository: https://github.com/fitzgen/bumpalo @@ -11607,6 +11632,34 @@ third_party_libraries: LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+- package_name: fluent-uri + package_version: 0.4.1 + repository: https://github.com/yescallop/fluent-uri-rs + license: MIT + licenses: + - license: MIT + text: |- + MIT License + + Copyright (c) 2024 Scallop Ye + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
- package_name: fnv package_version: 1.0.7 repository: https://github.com/servo/rust-fnv From 18088a758e495ac58dfdcf3da4732dacee88c4f5 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Mon, 9 Mar 2026 18:48:41 +0100 Subject: [PATCH 58/63] fix(http): clippy --- libdd-trace-obfuscation/src/http.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 87103ec167..ec3f3758eb 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -168,10 +168,12 @@ pub fn obfuscate_url_string( } else { out.push_str(&path_str); } - } else if uri.scheme().is_some() { - // Opaque URL (scheme but no authority): Go keeps the opaque part verbatim. - // u.Path is empty for opaque URLs in Go, so no digit redaction applies. - let scheme_end = url.find(':').unwrap() + 1; + } else if let Some(scheme) = uri.scheme() { + // This is a really weird case because there is a scheme but no authority. + // For example: http:# + // Length of "http:" + let scheme_end = scheme.as_str().len() + 1; + // http://example.com/?query -> //example.com/ out.push_str(&url[scheme_end..path_end]); } else { // Relative reference: use pre-encoded path From 3e32c31e6c099d241e6d7f27b64827d544469365 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Tue, 10 Mar 2026 15:32:28 +0100 Subject: [PATCH 59/63] fix(http): dash decode --- libdd-trace-obfuscation/src/http.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index ec3f3758eb..57b1a9dd16 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -31,7 +31,7 @@ fn normalize_pct_encoded_unreserved(path: &str) -> String { && b[i + 2].is_ascii_hexdigit() { let v = (hex_val(b[i + 1]) << 4) | hex_val(b[i + 2]); - if v.is_ascii_alphanumeric() || matches!(v, b'-' | b'.' 
| b'_' | b'~') { + if v.is_ascii_alphanumeric() || matches!(v, b'.' | b'_' | b'~') { out.push(v as char); } else { out.push_str(&path[i..i + 3]); @@ -700,6 +700,13 @@ mod tests { input ["A:ჸ"] expected_output ["a:ჸ"]; ] + [ + test_name [no_decode_dash] + remove_query_string [false] + remove_path_digits [false] + input ["http://foo.com/foo%20bar/"] + expected_output ["http://foo.com/foo%20bar/"]; + ] [ // Fragment with chars outside RFC 3987 ucschar ranges (U+10EF4F, U+10FFFF, etc.) // These must be percent-encoded, not cause a parse failure returning "?" From 4d1aa686235a0965a1010605e365b91969e71a61 Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Tue, 10 Mar 2026 17:00:23 +0100 Subject: [PATCH 60/63] fix: use older cargo bundle license to match ci --- LICENSE-3rdparty.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE-3rdparty.yml b/LICENSE-3rdparty.yml index b34e2a7e8e..c74c198f76 100644 --- a/LICENSE-3rdparty.yml +++ b/LICENSE-3rdparty.yml @@ -1,4 +1,4 @@ -root_name: builder, build_common, tools, libdd-alloc, libdd-crashtracker, libdd-common, libdd-telemetry, libdd-ddsketch, libdd-libunwind-sys, libdd-crashtracker-ffi, libdd-common-ffi, datadog-ffe, datadog-ffe-ffi, datadog-ipc, datadog-ipc-macros, libdd-tinybytes, tarpc, tarpc-plugins, spawn_worker, cc_utils, libdd-library-config, libdd-trace-protobuf, libdd-library-config-ffi, datadog-live-debugger, libdd-data-pipeline, libdd-dogstatsd-client, libdd-trace-stats, libdd-trace-utils, libdd-trace-normalization, libdd-log, datadog-live-debugger-ffi, libdd-profiling, libdd-profiling-protobuf, libdd-profiling-ffi, libdd-data-pipeline-ffi, libdd-ddsketch-ffi, libdd-log-ffi, libdd-telemetry-ffi, symbolizer-ffi, datadog-profiling-replayer, datadog-remote-config, datadog-sidecar, datadog-sidecar-macros, datadog-sidecar-ffi, libdd-trace-obfuscation, datadog-tracer-flare, sidecar_mockgen, test_spawn_from_lib, bin_tests +root_name: builder, build_common, tools, libdd-alloc, 
libdd-crashtracker, libdd-common, libdd-telemetry, libdd-ddsketch, libdd-libunwind-sys, libdd-crashtracker-ffi, libdd-common-ffi, datadog-ffe, datadog-ffe-ffi, datadog-ipc, datadog-ipc-macros, libdd-tinybytes, tarpc, tarpc-plugins, spawn_worker, cc_utils, libdd-library-config, libdd-trace-protobuf, libdd-library-config-ffi, datadog-live-debugger, libdd-data-pipeline, libdd-dogstatsd-client, libdd-trace-stats, libdd-trace-utils, libdd-trace-normalization, libdd-log, datadog-live-debugger-ffi, libdd-profiling, libdd-profiling-protobuf, libdd-profiling-ffi, libdd-data-pipeline-ffi, libdd-ddsketch-ffi, libdd-log-ffi, libdd-telemetry-ffi, symbolizer-ffi, datadog-profiling-replayer, datadog-remote-config, datadog-sidecar, datadog-sidecar-macros, datadog-sidecar-ffi, libdd-trace-obfuscation, datadog-tracer-flare, sidecar_mockgen, test_spawn_from_lib third_party_libraries: - package_name: addr2line package_version: 0.24.2 From 54e297cec7bd2be7eae83ad978a83571bd12f12f Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Wed, 11 Mar 2026 11:47:05 +0100 Subject: [PATCH 61/63] fix(http): remove unused url crate --- Cargo.lock | 1 - libdd-trace-obfuscation/Cargo.toml | 1 - 2 files changed, 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a715081595..1fe9e69b4d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3408,7 +3408,6 @@ dependencies = [ "regex", "serde", "serde_json", - "url", ] [[package]] diff --git a/libdd-trace-obfuscation/Cargo.toml b/libdd-trace-obfuscation/Cargo.toml index 99cb80a18e..eccb0fd93d 100644 --- a/libdd-trace-obfuscation/Cargo.toml +++ b/libdd-trace-obfuscation/Cargo.toml @@ -14,7 +14,6 @@ anyhow = "1.0" regex = "1" serde = { version = "1.0.145", features = ["derive"] } serde_json = { version = "1.0", features = ["preserve_order"] } -url = "^2.5.4" percent-encoding = "2.1" log = "0.4" libdd-trace-protobuf = { version = "1.1.0", path = "../libdd-trace-protobuf" } From e4981689b7aa3be154024fe831a44dbe9fe4eac5 Mon Sep 17 00:00:00 2001 From: Oscar Le 
Dauphin Date: Thu, 12 Mar 2026 16:46:57 +0100 Subject: [PATCH 62/63] fix(http): write! instead of push_str(format), fixme comment --- libdd-trace-obfuscation/src/http.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 57b1a9dd16..55414cd9b2 100644 --- a/libdd-trace-obfuscation/src/http.rs +++ b/libdd-trace-obfuscation/src/http.rs @@ -1,8 +1,13 @@ // Copyright 2023-Present Datadog, Inc. https://www.datadoghq.com/ // SPDX-License-Identifier: Apache-2.0 +// FIXME: once obfuscation feature parity is reached with the agent, change both modules to be more +// restrictive on the accepted forms of urls so that this module can be greatly simplified. +// One idea for now is to match the url to a regex on both sides to validate it + use fluent_uri::UriRef; use percent_encoding::percent_decode_str; +use std::fmt::Write; fn is_cat1(c: char) -> bool { matches!( @@ -56,7 +61,7 @@ fn is_frag_cat2(c: char) -> bool { fn encode_char(out: &mut String, c: char) { let mut buf = [0u8; 4]; for &b in c.encode_utf8(&mut buf).as_bytes() { - out.push_str(&format!("%{b:02X}")); + let _ = write!(out, "%{b:02X}"); } } @@ -114,7 +119,7 @@ pub fn obfuscate_url_string( if !c.is_ascii() { encode_char(&mut pre, c); } else if is_cat1(c) || (needs_full_path && is_path_cat2(c)) { - pre.push_str(&format!("%{:02X}", c as u8)); + let _ = write!(pre, "%{:02X}", c as u8); } else { pre.push(c); } From 8db3fd436ab2cb3459b8edb438e42399f001a47f Mon Sep 17 00:00:00 2001 From: Oscar Le Dauphin Date: Thu, 12 Mar 2026 17:37:00 +0100 Subject: [PATCH 63/63] fix(http): rename/document confusing cat1/cat2 functions --- libdd-trace-obfuscation/src/http.rs | 32 +++++++++++++++++------------ 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/libdd-trace-obfuscation/src/http.rs b/libdd-trace-obfuscation/src/http.rs index 55414cd9b2..a6dce5bfda 100644 --- a/libdd-trace-obfuscation/src/http.rs +++
b/libdd-trace-obfuscation/src/http.rs @@ -9,13 +9,27 @@ use fluent_uri::UriRef; use percent_encoding::percent_decode_str; use std::fmt::Write; -fn is_cat1(c: char) -> bool { +/// Returns true for Go net/url's "category 1" characters: +/// ASCII bytes that always trigger escaping in URLs (plus space and quote). +fn is_go_url_escape_cat1(c: char) -> bool { matches!( c, '\\' | '^' | '{' | '}' | '|' | '<' | '>' | '`' | ' ' | '"' ) } +/// Returns true for Go net/url's "category 2" characters for PATH contexts: +/// characters Go may escape in paths when Cat1 is present or non-ASCII exists. +fn is_go_url_escape_cat2_path(c: char) -> bool { + matches!(c, '!' | '\'' | '(' | ')' | '*' | '[' | ']') +} + +/// Returns true for Go net/url's "category 2" characters for FRAGMENT contexts: +/// characters Go may escape in fragments when non-ASCII exists. +fn is_go_url_escape_cat2_fragment(c: char) -> bool { + matches!(c, '\'' | '[' | ']') +} + fn hex_val(b: u8) -> u8 { match b { b'0'..=b'9' => b - b'0', @@ -50,14 +64,6 @@ fn normalize_pct_encoded_unreserved(path: &str) -> String { out } -fn is_path_cat2(c: char) -> bool { - matches!(c, '!' | '\'' | '(' | ')' | '*' | '[' | ']') -} - -fn is_frag_cat2(c: char) -> bool { - matches!(c, '\'' | '[' | ']') -} - fn encode_char(out: &mut String, c: char) { let mut buf = [0u8; 4]; for &b in c.encode_utf8(&mut buf).as_bytes() { @@ -106,7 +112,7 @@ pub fn obfuscate_url_string( // Determine Go's escape() trigger: Cat1 or non-ASCII in path causes Cat2 encoding too let path = &url[..path_end]; - let needs_full_path = path.bytes().any(|b| b > 127) || path.chars().any(is_cat1); + let needs_full_path = path.bytes().any(|b| b > 127) || path.chars().any(is_go_url_escape_cat1); let frag_has_non_ascii = frag_pos.is_some_and(|i| url[i + 1..].bytes().any(|b| b > 127)); // Pre-encode chars that UriRef (strict RFC 3986) rejects. 
@@ -118,7 +124,7 @@ pub fn obfuscate_url_string( for c in url[..path_end].chars() { if !c.is_ascii() { encode_char(&mut pre, c); - } else if is_cat1(c) || (needs_full_path && is_path_cat2(c)) { + } else if is_go_url_escape_cat1(c) || (needs_full_path && is_go_url_escape_cat2_path(c)) { let _ = write!(pre, "%{:02X}", c as u8); } else { pre.push(c); @@ -131,8 +137,8 @@ pub fn obfuscate_url_string( || (c as u32) < 0x20 || c as u32 == 0x7F || c == '#' - || is_cat1(c) - || (frag_has_non_ascii && is_frag_cat2(c)) + || is_go_url_escape_cat1(c) + || (frag_has_non_ascii && is_go_url_escape_cat2_fragment(c)) { encode_char(&mut pre, c); } else {