diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index 0d4c7ecf4a04f2..e998146c190df4 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -341,6 +341,42 @@ def reference_find(p, s): self.checkequal(reference_find(p, text), text, 'find', p) + def test_find_many_lengths(self): + haystack_repeats = [a * 10**e for e in range(6) for a in (1,2,5)] + haystacks = [(n, self.fixtype("abcab"*n + "da")) for n in haystack_repeats] + + needle_repeats = [a * 10**e for e in range(6) for a in (1, 3)] + needles = [(m, self.fixtype("abcab"*m + "da")) for m in needle_repeats] + + for n, haystack1 in haystacks: + haystack2 = haystack1[:-1] + for m, needle in needles: + answer1 = 5 * (n - m) if m <= n else -1 + self.assertEqual(haystack1.find(needle), answer1, msg=(n,m)) + self.assertEqual(haystack2.find(needle), -1, msg=(n,m)) + + def test_adaptive_find(self): + # This would be very slow for the naive algorithm, + # but str.find() should be O(n + m). + for N in 1000, 10_000, 100_000, 1_000_000: + A, B = 'a' * N, 'b' * N + haystack = A + A + B + A + A + needle = A + B + B + A + self.checkequal(-1, haystack, 'find', needle) + self.checkequal(0, haystack, 'count', needle) + self.checkequal(len(haystack), haystack + needle, 'find', needle) + self.checkequal(1, haystack + needle, 'count', needle) + + def test_find_with_memory(self): + # Test the "Skip with memory" path in the two-way algorithm. + for N in 1000, 3000, 10_000, 30_000: + needle = 'ab' * N + haystack = ('ab'*(N-1) + 'b') * 2 + self.checkequal(-1, haystack, 'find', needle) + self.checkequal(0, haystack, 'count', needle) + self.checkequal(len(haystack), haystack + needle, 'find', needle) + self.checkequal(1, haystack + needle, 'count', needle) + def test_find_shift_table_overflow(self): """When the table of 8-bit shifts overflows.""" N = 2**8 + 100 @@ -715,6 +751,18 @@ def test_replace(self): self.checkraises(TypeError, 'hello', 'replace', 42, 'h') self.checkraises(TypeError, 'hello', 'replace', 'h', 42) + def test_replace_uses_two_way_maxcount(self): + # Test that maxcount works in _two_way_count in fastsearch.h + A, B = "A"*1000, "B"*1000 + AABAA = A + A + B + A + A + ABBA = A + B + B + A + self.checkequal(AABAA + ABBA, + AABAA + ABBA, 'replace', ABBA, "ccc", 0) + self.checkequal(AABAA + "ccc", + AABAA + ABBA, 'replace', ABBA, "ccc", 1) + self.checkequal(AABAA + "ccc", + AABAA + ABBA, 'replace', ABBA, "ccc", 2) + @unittest.skipIf(sys.maxsize > (1 << 32) or struct.calcsize('P') != 4, 'only applies to 32-bit platforms') def test_replace_overflow(self): diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 2949d00ad223ee..257b7bd6788ad2 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -18,7 +18,8 @@ algorithm, which has worst-case O(n) runtime and best-case O(n/k). Also compute a table of shifts to achieve O(n/k) in more cases, and often (data dependent) deduce larger shifts than pure C&P can - deduce. */ + deduce. See stringlib_find_two_way_notes.txt in this folder for a + detailed explanation. */ #define FAST_COUNT 0 #define FAST_SEARCH 1 @@ -398,7 +399,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, if (window_last >= haystack_end) { return -1; } - LOG("Horspool skip"); + LOG("Horspool skip\n"); } no_shift: window = window_last - len_needle + 1; @@ -457,7 +458,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, if (window_last >= haystack_end) { return -1; } - LOG("Horspool skip"); + LOG("Horspool skip\n"); } window = window_last - len_needle + 1; assert((window[len_needle - 1] & TABLE_MASK) == diff --git a/Objects/stringlib/stringlib_find_two_way_notes.txt b/Objects/stringlib/stringlib_find_two_way_notes.txt index afe45431a75ac4..f97615b42fff4b 100644 --- a/Objects/stringlib/stringlib_find_two_way_notes.txt +++ b/Objects/stringlib/stringlib_find_two_way_notes.txt @@ -239,7 +239,7 @@ We cut as AA + bAAbAAbA, and then the algorithm runs as follows: ~~ AA != bA at the cut bbbAbbAAbAAbAAbbbAAbAAbAAbAA AAbAAbAAbA - ^^^^X 7-3=4 match, and the 5th misses. + ^^^^X 7-3=4 match, and the 5th misses. bbbAbbAAbAAbAAbbbAAbAAbAAbAA AAbAAbAAbA ~ A != b at the cut @@ -395,7 +395,7 @@ of their proof goes something like this (this is far from complete): needle == (a + w) + (w + b), meaning there's a bad equality w == w, it's impossible for w + b to be bigger than both b and w + w + b, so this can't happen. We thus have all of - the ineuqalities with no question marks. + the inequalities with no question marks. * By maximality, the right part is not a substring of the left part. Thus, we have all of the inequalities involving no left-side question marks.