Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix escaped like wildcards in like_utf8 / nlike_utf8 kernels #2258

Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 69 additions & 6 deletions arrow/src/compute/kernels/comparison.rs
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ where
let re = if let Some(ref regex) = map.get(pat) {
regex
} else {
let re_pattern = escape(pat).replace('%', ".*").replace('_', ".");
let re_pattern = replace_like_wildcards(pat)?;
let re = op(&re_pattern)?;
map.insert(pat, re);
map.get(pat).unwrap()
Expand Down Expand Up @@ -248,7 +248,9 @@ pub fn like_utf8_scalar<OffsetSize: OffsetSizeTrait>(
bit_util::set_bit(bool_slice, i);
}
}
} else if right.ends_with('%') && !right[..right.len() - 1].contains(is_like_pattern)
} else if right.ends_with('%')
&& !right.ends_with("\\%")
&& !right[..right.len() - 1].contains(is_like_pattern)
{
// fast path, can use starts_with
let starts_with = &right[..right.len() - 1];
Expand All @@ -266,7 +268,7 @@ pub fn like_utf8_scalar<OffsetSize: OffsetSizeTrait>(
}
}
} else {
let re_pattern = escape(right).replace('%', ".*").replace('_', ".");
let re_pattern = replace_like_wildcards(right)?;
let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
ArrowError::ComputeError(format!(
"Unable to build regex from LIKE pattern: {}",
Expand Down Expand Up @@ -296,6 +298,41 @@ pub fn like_utf8_scalar<OffsetSize: OffsetSizeTrait>(
Ok(BooleanArray::from(data))
}

fn replace_like_wildcards(text: &str) -> Result<String> {
daniel-martinez-maqueda-sap marked this conversation as resolved.
Show resolved Hide resolved
let text = escape(text);
daniel-martinez-maqueda-sap marked this conversation as resolved.
Show resolved Hide resolved
let mut result = String::new();
let mut preceding_backslash_chars = String::new();
for c in text.chars() {
if c == '\\' {
preceding_backslash_chars.push(c);
} else if is_like_pattern(c) {
if preceding_backslash_chars.is_empty() {
// An unescaped like wildcard. Replaced by regex pattern
if c == '%' {
result.push_str(".*");
} else {
result.push('.');
}
} else {
// Escaped like wildcard. Remove the last two backslash
if preceding_backslash_chars.len() > 2 {
result.push_str(&preceding_backslash_chars[0..preceding_backslash_chars.len() - 2]);
}
result.push(c);
}
preceding_backslash_chars = String::new();
} else {
// No like wildcard found. Append unchanged
if !preceding_backslash_chars.is_empty() {
result.push_str(&preceding_backslash_chars);
preceding_backslash_chars = String::new();
}
result.push(c);
}
}
Ok(result)
}

/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] /
/// [`LargeStringArray`].
///
Expand Down Expand Up @@ -342,7 +379,7 @@ pub fn nlike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
result.append(!left.value(i).ends_with(&right[1..]));
}
} else {
let re_pattern = escape(right).replace('%', ".*").replace('_', ".");
let re_pattern = replace_like_wildcards(right)?;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I really like how you have refactored this code 👍

let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
ArrowError::ComputeError(format!(
"Unable to build regex from LIKE pattern: {}",
Expand Down Expand Up @@ -423,7 +460,7 @@ pub fn ilike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
);
}
} else {
let re_pattern = escape(right).replace('%', ".*").replace('_', ".");
let re_pattern = replace_like_wildcards(right)?;
let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
ArrowError::ComputeError(format!(
"Unable to build regex from ILIKE pattern: {}",
Expand Down Expand Up @@ -506,7 +543,7 @@ pub fn nilike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
);
}
} else {
let re_pattern = escape(right).replace('%', ".*").replace('_', ".");
let re_pattern = replace_like_wildcards(right)?;
let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
ArrowError::ComputeError(format!(
"Unable to build regex from ILIKE pattern: {}",
Expand Down Expand Up @@ -3740,6 +3777,32 @@ mod tests {
vec![false, true, false, false]
);

// Scalar LIKE with a backslash-escaped `%`: the pattern `a\%` is expected to match
// the literal string `a%` and to reject `a\x`.
// NOTE(review): the macro definition is outside this view; the arguments are
// presumably (test name, left values, right pattern, kernel, expected) — confirm.
test_utf8_scalar!(
daniel-martinez-maqueda-sap marked this conversation as resolved.
Show resolved Hide resolved
test_utf8_scalar_like_escape,
vec!["a%", "a\\x"],
"a\\%",
like_utf8_scalar,
vec![true, false]
);

// ILIKE with escaped wildcards: the pattern `\%_\%` (escaped `%`, one unescaped `_`,
// escaped `%`) is expected to match the three-character string `%%%`.
// NOTE(review): the test name says "scalar" but this invocation passes a vector of
// patterns to `ilike_utf8` — confirm the name is intended and does not clash with an
// existing test.
test_utf8!(
test_utf8_scalar_ilike_regex,
vec!["%%%"],
vec![r#"\%_\%"#],
ilike_utf8,
vec![true]
);

/// Checks the regex text produced by `replace_like_wildcards` for a pattern that mixes
/// unescaped wildcards, backslash-escaped wildcards, an escaped backslash and a regex
/// metacharacter (`.`).
#[test]
fn test_replace_like_wildcards() {
    let like_pattern = "\\%_%\\_\\\\%.\\.";
    let regex_text = replace_like_wildcards(like_pattern).unwrap();
    assert_eq!(regex_text, String::from("%..*_\\\\%\\.\\\\\\."));
}

test_utf8!(
test_utf8_array_eq,
vec!["arrow", "arrow", "arrow", "arrow"],
Expand Down