Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(spans): Scrub random strings in resource spans #2614

Merged
merged 17 commits into from
Oct 19, 2023
197 changes: 162 additions & 35 deletions relay-event-normalization/src/normalize/span/description/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
//! Span description scrubbing logic.
mod sql;
use once_cell::sync::Lazy;
pub use sql::parse_query;

use std::borrow::Cow;
Expand Down Expand Up @@ -48,7 +49,7 @@ pub(crate) fn scrub_span_description(span: &Span) -> Option<String> {
sql::scrub_queries(db_system, description)
}
}
("resource", _) => scrub_resource_identifiers(description),
("resource", _) => scrub_resource(description),
("ui", "load") => {
// `ui.load` spans contain component names like `ListAppViewController`, so
// they _should_ be low-cardinality.
Expand Down Expand Up @@ -178,34 +179,89 @@ fn scrub_redis_keys(string: &str) -> Option<String> {
Some(scrubbed)
}

fn scrub_resource_identifiers(mut string: &str) -> Option<String> {
// Remove query parameters or the fragment.
if string.starts_with("data:") {
if let Some(pos) = string.find(';') {
return Some(string[..pos].into());
}
return Some("data:*/*".into());
} else if let Some(pos) = string.find('?') {
string = &string[..pos];
} else if let Some(pos) = string.find('#') {
string = &string[..pos];
}
match RESOURCE_NORMALIZER_REGEX.replace_all(string, "$pre*$post") {
Cow::Owned(scrubbed) => Some(scrubbed),
Cow::Borrowed(string) => {
// No IDs scrubbed, but we still want to set something.
// If we managed to parse the URL, we'll try to get an extension.
if let Ok(url) = Url::parse(string) {
let domain = normalize_domain(&url.host()?.to_string(), url.port())?;
// If there is an extension, we add it to the domain.
if let Some(extension) = url.path().rsplit_once('.') {
return Some(format!("{domain}/*.{}", extension.1));
}
return Some(format!("{domain}/*"));
static DUMMY_URL: Lazy<Url> = Lazy::new(|| "http://replace_me".parse().unwrap());

fn scrub_resource(string: &str) -> Option<String> {
let (url, is_relative) = match Url::parse(string) {
Ok(url) => (url, false),
Err(url::ParseError::RelativeUrlWithoutBase) => {
// Try again, with base URL
match Url::options().base_url(Some(&DUMMY_URL)).parse(string) {
jjbayer marked this conversation as resolved.
Show resolved Hide resolved
Ok(url) => (url, true),
Err(_) => return None,
}
Some(string.into())
}
Err(_) => {
return None;
}
};

let mut formatted = match url.scheme() {
"data" => match url.path().split_once(';') {
Some((ty, _data)) => format!("data:{ty}"),
None => "data:*/*".to_owned(),
},
"chrome-extension" => {
let path = scrub_resource_path(url.path());
format!("chrome-extension://*{path}")
}
scheme => {
let path = url.path();
let path = scrub_resource_path(path);
let domain = url
.domain()
.and_then(|d| normalize_domain(d, url.port()))
.unwrap_or("".into());
format!("{scheme}://{domain}{path}")
}
};

if is_relative {
formatted = formatted.replace("http://replace_me", "");
}

Some(formatted)
}

fn scrub_resource_path(string: &str) -> String {
let mut segments = vec![];
let mut extension = "";
for segment in string.split('/') {
let (base, ext) = match segment.rsplit_once('.') {
Some(p) => p,
None => (segment, ""),
};
extension = ext;
let base = RESOURCE_NORMALIZER_REGEX.replace_all(base, "$pre*$post");
segments.push(if invalid_base_name(&base) {
Cow::Borrowed("*")
} else {
base
});
}

let mut joined = segments.join("/");
if !extension.is_empty() {
joined.push('.');
joined.push_str(extension);
}

joined
}

const MAX_SEGMENT_LENGTH: usize = 25;

fn invalid_base_name(segment: &str) -> bool {
phacops marked this conversation as resolved.
Show resolved Hide resolved
if segment.len() > MAX_SEGMENT_LENGTH {
return true;
}
for char in segment.chars() {
if char.is_numeric() || "&%#=+@".contains(char) {
return true;
};
}

false
}

#[cfg(test)]
Expand Down Expand Up @@ -249,7 +305,7 @@ mod tests {
if $expected == "" {
assert!(scrubbed.is_none());
} else {
assert_eq!(scrubbed.unwrap(), $expected);
assert_eq!($expected, scrubbed.unwrap());
}
}
};
Expand Down Expand Up @@ -374,7 +430,7 @@ mod tests {
resource_script,
"https://example.com/static/chunks/vendors-node_modules_somemodule_v1.2.3_mini-dist_index_js-client_dist-6c733292-f3cd-11ed-a05b-0242ac120003-0dc369dcf3d311eda05b0242ac120003.[hash].abcd1234.chunk.js-0242ac120003.map",
"resource.script",
"https://example.com/static/chunks/vendors-node_modules_somemodule_*_mini-dist_index_js-client_dist-*-*.[hash].*.chunk.js-*.map"
"https://example.com/static/chunks/*.map"
);

span_description_test!(
Expand All @@ -395,7 +451,7 @@ mod tests {
integer_in_resource,
"https://example.com/assets/this_is-a_good_resource-123-scrub_me.js",
"resource.css",
"https://example.com/assets/this_is-a_good_resource-*-scrub_me.js"
"https://example.com/assets/*.js"
);

span_description_test!(
Expand All @@ -409,14 +465,14 @@ mod tests {
resource_query_params2,
"https://data.domain.com/data/guide123.gif?jzb=3f535634H467g5-2f256f&ct=1234567890&v=1.203.0_prod",
"resource.img",
"https://data.domain.com/data/guide*.gif"
"https://*.domain.com/data/guide*.gif"
);

span_description_test!(
resource_no_ids,
"https://data.domain.com/data/guide.gif",
"resource.img",
"*.domain.com/*.gif"
"https://*.domain.com/data/guide.gif"
);

span_description_test!(
Expand All @@ -430,14 +486,50 @@ mod tests {
resource_vite,
"webroot/assets/Profile-73f6525d.js",
"resource.js",
"webroot/assets/Profile-*.js"
"/webroot/assets/Profile-*.js"
);

span_description_test!(
resource_vite_css,
"webroot/assets/Shop-1aff80f7.css",
"resource.css",
"webroot/assets/Shop-*.css"
"/webroot/assets/Shop-*.css"
);

span_description_test!(
chrome_extension,
"chrome-extension://begnopegbbhjeeiganiajffnalhlkkjb/img/assets/icon-10k.svg",
"resource.other",
"chrome-extension://*/img/assets/icon-*k.svg"
);

span_description_test!(
urlencoded_path_segments,
"https://some.domain.com/embed/%2Fembed%2Fdashboards%2F20%3FSlug%3Dsomeone%*hide_title%3Dtrue",
"resource.iframe",
"https://*.domain.com/embed/*"
);

span_description_test!(
random_string1,
"https://static.domain.com/6gezWf_qs4Wc12Nz9rpLOx2aw2k/foo-99",
"resource.img",
"https://*.domain.com/*/foo-*"
);

span_description_test!(
random_string2,
"http://domain.com/fy2XSqBMqkEm_qZZH3RrzvBTKg4/qltdXIJWTF_cuwt3uKmcwWBc1DM/z1a--BVsUI_oyUjJR12pDBcOIn5.dom.jsonp",

jjbayer marked this conversation as resolved.
Show resolved Hide resolved
"resource.script",
"http://domain.com/*/*/*.jsonp"
);

span_description_test!(
random_string3,
"jkhdkkncnoglghljlkmcimlnlhkeamab/123.css",
"resource.link",
"/*/*.css"
);

span_description_test!(
Expand Down Expand Up @@ -472,14 +564,14 @@ mod tests {
resource_url_with_fragment,
"https://data.domain.com/data/guide123.gif#url=someotherurl",
"resource.img",
"https://data.domain.com/data/guide*.gif"
"https://*.domain.com/data/guide*.gif"
);

span_description_test!(
resource_script_with_no_extension,
"https://www.domain.com/page?id=1234567890",
"resource.script",
"*.domain.com/*"
"https://*.domain.com/page"
);

span_description_test!(
Expand Down Expand Up @@ -512,6 +604,41 @@ mod tests {

span_description_test!(db_category_with_not_sql, "{someField:someValue}", "db", "");

span_description_test!(
resource_img_semi_colon,
"http://www.foo.com/path/to/resource;param1=test;param2=ing",
"resource.img",
"http://*.foo.com/path/to/*"
);

span_description_test!(
resource_img_comma_with_extension,
"https://example.org/p/fit=cover,width=150,height=150,format=auto,quality=90/media/photosV2/weird-stuff-123-234-456.jpg",
"resource.img",
"https://example.org/p/*/media/photos*/weird-stuff-*-*-*.jpg"
);

span_description_test!(
resource_img_path_with_comma,
"/help/purchase-details/1,*,0&fmt=webp&qlt=*,1&fit=constrain,0&op_sharpen=0&resMode=sharp2&iccEmbed=0&printRes=*",
"resource.img",
"/help/purchase-details/*"
);

span_description_test!(
resource_script_random_path_only,
"/ERs-sUsu3/wd4/LyMTWg/Ot1Om4m8cu3p7a/QkJWAQ/FSYL/GBlxb3kB",
"resource.script",
"/*/*/LyMTWg/*/QkJWAQ/FSYL/*"
);

span_description_test!(
resource_script_normalize_domain,
"https://sub.sub.sub.domain.com/resource.js",
"resource.script",
"https://*.domain.com/resource.js"
);

#[test]
fn informed_sql_parser() {
let json = r#"
Expand Down