Skip to content

Commit

Permalink
Merge pull request #61860 from ClickHouse/backport/24.2/61749
Browse files (browse the repository at this point in the history)
Backport #61749 to 24.2: Fix crash in `multiSearchAllPositionsCaseInsensitiveUTF8` for incorrect UTF-8
robot-ch-test-poll3 authored Mar 25, 2024
2 parents d2fa4c3 + ea068aa commit 27a280f
Showing 3 changed files with 13 additions and 8 deletions.
18 changes: 10 additions & 8 deletions src/Common/Volnitsky.h
Original file line number | Diff line number | Diff line change
@@ -191,7 +191,8 @@ namespace VolnitskyTraits
if (length_l != length_r)
return false;

assert(length_l >= 2 && length_r >= 2);
if (length_l < 2 || length_r < 2)
return false; /// Some part of the given ngram contains an invalid UTF-8 sequence.

chars.c0 = seq_l[seq_ngram_offset];
chars.c1 = seq_l[seq_ngram_offset + 1];
@@ -253,7 +254,9 @@ namespace VolnitskyTraits
if (size_l != size_u)
return false;

assert(size_l >= 1 && size_u >= 1);
if (size_l == 0 || size_u == 0)
return false; /// Some part of the given ngram contains an invalid UTF-8 sequence.

chars.c1 = seq_l[0];
putNGramBase(n, offset);

@@ -276,7 +279,8 @@ namespace VolnitskyTraits
if (size_l != size_u)
return false;

assert(size_l > seq_ngram_offset && size_u > seq_ngram_offset);
if (size_l <= seq_ngram_offset || size_u <= seq_ngram_offset)
return false; /// Some part of the given ngram contains an invalid UTF-8 sequence.

chars.c0 = seq_l[seq_ngram_offset];
putNGramBase(n, offset);
@@ -302,10 +306,8 @@ namespace VolnitskyTraits
if (size_first_l != size_first_u || size_second_l != size_second_u)
return false;

assert(size_first_l > seq_ngram_offset);
assert(size_first_u > seq_ngram_offset);
assert(size_second_l > 0);
assert(size_second_u > 0);
if (size_first_l <= seq_ngram_offset || size_first_u <= seq_ngram_offset || size_second_l == 0 || size_second_u == 0)
return false;

auto c0l = first_l_seq[seq_ngram_offset];
auto c0u = first_u_seq[seq_ngram_offset];
@@ -399,7 +401,7 @@ class VolnitskyBase
if (fallback || fallback_searcher.force_fallback)
return;

hash = std::unique_ptr<VolnitskyTraits::Offset[]>(new VolnitskyTraits::Offset[VolnitskyTraits::hash_size]{});
hash = std::make_unique<VolnitskyTraits::Offset[]>(VolnitskyTraits::hash_size);

auto callback = [this](const VolnitskyTraits::Ngram ngram, const int offset) { return this->putNGramBase(ngram, offset); };
/// ssize_t is used here because an unsigned index cannot be used with a condition like `i >= 0`: an unsigned value is always >= 0.
Original file line number | Diff line number | Diff line change
@@ -12872,3 +12872,4 @@
1
1
1
1
Original file line number | Diff line number | Diff line change
@@ -223,6 +223,8 @@ select [2] = multiSearchAllPositions(materialize('abab'), materialize(['ba']));
select [1] = multiSearchAllPositionsCaseInsensitive(materialize('aBaB'), materialize(['abab']));
select [3] = multiSearchAllPositionsUTF8(materialize('ab€ab'), materialize(['']));
select [3] = multiSearchAllPositionsCaseInsensitiveUTF8(materialize('ab€AB'), materialize(['€ab']));
-- Checks that a broken UTF-8 sequence is handled correctly.
select [0] = multiSearchAllPositionsCaseInsensitiveUTF8(materialize(''), materialize(['a\x90\x90\x90\x90\x90\x90']));

select 1 = multiSearchAny(materialize('abcdefgh'), ['b']);
select 1 = multiSearchAny(materialize('abcdefgh'), ['bc']);

0 comments on commit 27a280f

Please sign in to comment.