Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

implement replace on String for multiple patterns #40484

Merged
merged 3 commits into from
Jun 7, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,10 @@ Standard library changes
* Some degree trigonometric functions, `sind`, `cosd`, `tand`, `asind`, `acosd`, `asecd`, `acscd`, `acotd`, `atand` now accept a square matrix ([#39758]).
* A backslash before a newline in command literals now always removes the newline, similar to standard string
literals, whereas the result was not well-defined before. ([#40753])
* `replace(::String)` now allows multiple patterns to be specified, and they
will be applied left-to-right simultaneously, so only one pattern will be
applied to any character, and the patterns will only be applied to the input
text, not the replacements. ([#40484])

#### Package Manager

Expand Down
20 changes: 13 additions & 7 deletions base/regex.jl
Original file line number Diff line number Diff line change
Expand Up @@ -589,7 +589,7 @@ _free_pat_replacer(r::RegexAndMatchData) = PCRE.free_match_data(r.match_data)

# Abort with an `ErrorException` whose message names the offending replacement
# (or describes why the replacement could not be applied).
function replace_err(repl)
    return error("Bad replacement string: $repl")
end

function _write_capture(io, re::RegexAndMatchData, group)
function _write_capture(io::IO, group::Int, str, r, re::RegexAndMatchData)
len = PCRE.substring_length_bynumber(re.match_data, group)
# in the case of an optional group that doesn't match, len == 0
len == 0 && return
Expand All @@ -598,14 +598,19 @@ function _write_capture(io, re::RegexAndMatchData, group)
pointer(io.data, io.ptr), len+1)
io.ptr += len
io.size = max(io.size, io.ptr - 1)
nothing
end
# Fallback used when the pattern is not a compiled regex: there are no capture
# groups, so only group 0 (the whole match, i.e. `str[r]`) can be written.
# Any other group number is reported through `replace_err`.
function _write_capture(io::IO, group::Int, str, r, re)
    if group != 0
        replace_err("pattern is not a Regex")
    end
    matched = SubString(str, r)
    return print(io, matched)
end


# Backslash: the character that introduces a reference inside a substitution string.
const SUB_CHAR = '\\'
# Letter that, right after SUB_CHAR, starts a bracketed group reference (`\g<...>`).
const GROUP_CHAR = 'g'
# Characters handed to `unescape_string` as its `keep` argument in `_replace`, so
# escapes of the backslash, `g`, and the digits 0-9 are left untouched when the
# substitution string is unescaped.
const KEEP_ESC = [SUB_CHAR, GROUP_CHAR, '0':'9'...]

function _replace(io, repl_s::SubstitutionString, str, r, re::RegexAndMatchData)
function _replace(io, repl_s::SubstitutionString, str, r, re)
LBRACKET = '<'
RBRACKET = '>'
repl = unescape_string(repl_s.string, KEEP_ESC)
Expand All @@ -629,7 +634,7 @@ function _replace(io, repl_s::SubstitutionString, str, r, re::RegexAndMatchData)
break
end
end
_write_capture(io, re, group)
_write_capture(io, group, str, r, re)
elseif repl[next_i] == GROUP_CHAR
i = nextind(repl, next_i)
if i > e || repl[i] != LBRACKET
Expand All @@ -642,15 +647,16 @@ function _replace(io, repl_s::SubstitutionString, str, r, re::RegexAndMatchData)
i = nextind(repl, i)
i > e && replace_err(repl)
end
# TODO: avoid this allocation
groupname = SubString(repl, groupstart, prevind(repl, i))
if all(isdigit, groupname)
_write_capture(io, re, parse(Int, groupname))
else
group = parse(Int, groupname)
elseif re isa RegexAndMatchData
group = PCRE.substring_number_from_name(re.re.regex, groupname)
group < 0 && replace_err("Group $groupname not found in regex $(re.re)")
_write_capture(io, re, group)
else
group = -1
end
_write_capture(io, group, str, r, re)
i = nextind(repl, i)
else
replace_err(repl)
Expand Down
1 change: 0 additions & 1 deletion base/set.jl
Original file line number Diff line number Diff line change
Expand Up @@ -621,7 +621,6 @@ replace!(a::Callable, b::Pair; count::Integer=-1) = throw(MethodError(replace!,
replace!(a::Callable, b::Pair, c::Pair; count::Integer=-1) = throw(MethodError(replace!, (a, b, c)))
replace(a::Callable, b::Pair; count::Integer=-1) = throw(MethodError(replace, (a, b)))
replace(a::Callable, b::Pair, c::Pair; count::Integer=-1) = throw(MethodError(replace, (a, b, c)))
replace(a::AbstractString, b::Pair, c::Pair) = throw(MethodError(replace, (a, b, c)))

### replace! for AbstractDict/AbstractSet

Expand Down
84 changes: 56 additions & 28 deletions base/strings/util.jl
Original file line number Diff line number Diff line change
Expand Up @@ -522,56 +522,74 @@ _replace(io, repl::Function, str, r, pattern) =
# Function replacement paired with a predicate pattern: the replacement
# function receives the element of `str` at the first index of the match
# range `r`, and its result is printed to `io`.
function _replace(io, repl::Function, str, r, pattern::Function)
    matched = str[first(r)]
    return print(io, repl(matched))
end

replace(str::String, pat_repl::Pair{<:AbstractChar}; count::Integer=typemax(Int)) =
replace(str, isequal(first(pat_repl)) => last(pat_repl); count=count)

replace(str::String, pat_repl::Pair{<:Union{Tuple{Vararg{AbstractChar}},
AbstractVector{<:AbstractChar},Set{<:AbstractChar}}};
count::Integer=typemax(Int)) =
replace(str, in(first(pat_repl)) => last(pat_repl), count=count)

# Default pattern preparation: the pattern is used as given.
function _pat_replacer(x)
    return x
end

# Default cleanup for a prepared pattern: nothing to release.
function _free_pat_replacer(x)
    return nothing
end

function replace(str::String, pat_repl::Pair; count::Integer=typemax(Int))
pattern, repl = pat_repl
# A single character pattern becomes an equality predicate on characters.
function _pat_replacer(x::AbstractChar)
    return isequal(x)
end

# A tuple, vector, or set of characters becomes a membership predicate.
function _pat_replacer(
    x::Union{Tuple{Vararg{AbstractChar}},AbstractVector{<:AbstractChar},Set{<:AbstractChar}},
)
    return in(x)
end

function replace(str::String, pat_repl::Vararg{Pair,N}; count::Integer=typemax(Int)) where N
count == 0 && return str
count < 0 && throw(DomainError(count, "`count` must be non-negative."))
n = 1
e = lastindex(str)
e1 = nextind(str, lastindex(str)) # sizeof(str)
i = a = firstindex(str)
pattern = _pat_replacer(pattern)
r = something(findnext(pattern,str,i), 0)
j, k = first(r), last(r)
if j == 0
_free_pat_replacer(pattern)
patterns = map(p -> _pat_replacer(first(p)), pat_repl)
replaces = map(last, pat_repl)
rs = map(patterns) do p
r = findnext(p, str, a)
if r === nothing || first(r) == 0
stevengj marked this conversation as resolved.
Show resolved Hide resolved
return e1+1:0
end
r isa Int && (r = r:r) # findnext / performance fix
return r
end
if all(>(e1), map(first, rs))
foreach(_free_pat_replacer, patterns)
return str
end
out = IOBuffer(sizehint=floor(Int, 1.2sizeof(str)))
while j != 0
while true
p = argmin(map(first, rs)) # TODO: or argmin(rs), to pick the shortest first match ?
r = rs[p]
j, k = first(r), last(r)
j > e1 && break
if i == a || i <= k
# copy out preserved portion
GC.@preserve str unsafe_write(out, pointer(str, i), UInt(j-i))
_replace(out, repl, str, r, pattern)
# copy out replacement string
_replace(out, replaces[p], str, r, patterns[p])
end
if k < j
i = j
j > e && break
j == e1 && break
k = nextind(str, j)
else
i = k = nextind(str, k)
end
r = something(findnext(pattern,str,k), 0)
r === 0:-1 || n == count && break
j, k = first(r), last(r)
n == count && break
let k = k
rs = map(patterns, rs) do p, r
if first(r) < k
r = findnext(p, str, k)
if r === nothing || first(r) == 0
return e1+1:0
end
r isa Int && (r = r:r) # findnext / performance fix
end
return r
end
end
n += 1
end
_free_pat_replacer(pattern)
write(out, SubString(str,i))
String(take!(out))
foreach(_free_pat_replacer, patterns)
write(out, SubString(str, i))
return String(take!(out))
end


"""
replace(s::AbstractString, pat=>r; [count::Integer])
replace(s::AbstractString, pat=>r, [pat2=>r2, ...]; [count::Integer])

Search for the given pattern `pat` in `s`, and replace each occurrence with `r`.
If `count` is provided, replace at most `count` occurrences.
Expand All @@ -584,6 +602,13 @@ If `pat` is a regular expression and `r` is a [`SubstitutionString`](@ref), then
references in `r` are replaced with the corresponding matched text.
To remove instances of `pat` from `string`, set `r` to the empty `String` (`""`).

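Multiple patterns can be specified, and they will be applied left-to-right
simultaneously, so only one pattern will be applied to any character, and the
patterns will only be applied to the input text, not the replacements.
When more than one pattern matches at the same position, the pattern given
earliest in the argument list is used.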
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this section is misleading - the text seems to suggest only patterns of the form 'x' => "replacement" or 'x' => 'a' are allowed, but the tests further down seem to suggest "original" => "replacement" and "original" => 'a' should also work.

Additionally, what should the result of replace("atest", "atest" => 'a', "at" => 'b') be? Simply "a", since the patterns are applied left to right, even though "simultaneously" applying them leads to inconsistency (and imo both are reasonable choices)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The patterns are "not applied to the replacements". Each character is only matched by at most one pattern, though a pattern may match multiple characters (or even none).


!!! compat "Julia 1.7"
Support for multiple patterns requires version 1.7.

# Examples
```jldoctest
julia> replace("Python is a programming language.", "Python" => "Julia")
Expand All @@ -597,10 +622,13 @@ julia> replace("The quick foxes run quickly.", "quick" => "", count=1)

julia> replace("The quick foxes run quickly.", r"fox(es)?" => s"bus\\1")
"The quick buses run quickly."

julia> replace("abcabc", "a" => "b", "b" => "c", r".+" => "a")
"bca"
```
"""
replace(s::AbstractString, pat_f::Pair; count=typemax(Int)) =
replace(String(s), pat_f, count=count)
# Generic entry point: convert any `AbstractString` to `String` and forward all
# pattern => replacement pairs, plus `count`, to the `String` method.
function replace(s::AbstractString, pat_f::Pair...; count=typemax(Int))
    converted = String(s)
    return replace(converted, pat_f...; count=count)
end

# TODO: allow transform as the first argument to replace?

Expand Down
172 changes: 172 additions & 0 deletions test/strings/util.jl
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,178 @@ end

end

@testset "replace many" begin
# PR 35414 Francesco Alemanno <[email protected]>
@test replace("foobarbaz", "oo" => "zz", "ar" => "zz", "z" => "m") == "fzzbzzbam"
substmp=["z" => "m", "oo" => "zz", "ar" => "zz"]
for perm in [[1, 2, 3], [2, 1, 3], [3, 2, 1], [2, 3, 1], [1, 3, 2], [3, 1, 2]]
@test replace("foobarbaz", substmp[perm]...) == "fzzbzzbam"
@test replace("foobarbaz", substmp[perm]..., count=2) == "fzzbzzbaz"
@test replace("foobarbaz", substmp[perm]..., count=1) == "fzzbarbaz"
end
@test replace("foobarbaz", "z" => "m", r"a.*a" => uppercase) == "foobARBAm"
@test replace("foobarbaz", 'o' => 'z', 'a' => 'q', 'z' => 'm') == "fzzbqrbqm"


# PR #25732 Klaus Crusius <[email protected]>
@test replace("\u2202", '*' => '\0', "" => "") == "\u2202"

@test replace("foobar", 'o' => '0', "" => "") == "f00bar"
@test replace("foobar", 'o' => '0', count=1, "" => "") == "foobar"
@test replace("foobar", 'o' => '0', count=2, "" => "") == "f0obar"
@test replace("foobar", 'o' => "", "" => "") == "fbar"
@test replace("foobar", 'o' => "", count=1, "" => "") == "foobar"
@test replace("foobar", 'o' => "", count=2, "" => "") == "fobar"
@test replace("foobar", 'f' => 'F', "" => "") == "Foobar"
@test replace("foobar", 'r' => 'R', "" => "") == "foobaR"

@test replace("foofoofoo", "foo" => "bar", "" => "") == "barbarbar"
@test replace("foobarfoo", "foo" => "baz", "" => "") == "bazbarbaz"
@test replace("barfoofoo", "foo" => "baz", "" => "") == "barbazbaz"

@test replace("", "" => "", "" => "") == ""
@test replace("", "" => "x", "" => "") == "x"
@test replace("", "x" => "y", "" => "") == ""

@test replace("abcd", "" => "^", "" => "") == "^a^b^c^d^"
@test replace("abcd", "b" => "^", "" => "") == "a^cd"
@test replace("abcd", r"b?" => "^", "" => "") == "^a^c^d^"
@test replace("abcd", r"b+" => "^", "" => "") == "a^cd"
@test replace("abcd", r"b?c?" => "^", "" => "") == "^a^d^"
@test replace("abcd", r"[bc]?" => "^", "" => "") == "^a^^d^"

@test replace("foobarfoo", r"(fo|ba)" => "xx", "" => "") == "xxoxxrxxo"
@test replace("foobarfoo", r"(foo|ba)" => "bar", "" => "") == "barbarrbar"

@test replace("foobar", 'o' => 'ø', "" => "") == "føøbar"
@test replace("foobar", 'o' => 'ø', count=2, "" => "") == "føobar"
@test replace("føøbar", 'ø' => 'o', "" => "") == "foobar"
@test replace("føøbar", 'ø' => 'o', count=2, "" => "") == "foøbar"
@test replace("føøbar", 'ø' => 'ö', "" => "") == "fööbar"
@test replace("føøbar", 'ø' => 'ö', count=2, "" => "") == "föøbar"
@test replace("føøbar", 'ø' => "", "" => "") == "fbar"
@test replace("føøbar", 'ø' => "", count=2, "" => "") == "føbar"
@test replace("føøbar", 'f' => 'F', "" => "") == "Føøbar"
@test replace("ḟøøbar", 'ḟ' => 'F', "" => "") == "Føøbar"
@test replace("føøbar", 'f' => 'Ḟ', "" => "") == "Ḟøøbar"
@test replace("ḟøøbar", 'ḟ' => 'Ḟ', "" => "") == "Ḟøøbar"
@test replace("føøbar", 'r' => 'R', "" => "") == "føøbaR"
@test replace("føøbaṙ", 'ṙ' => 'R', "" => "") == "føøbaR"
@test replace("føøbar", 'r' => 'Ṙ', "" => "") == "føøbaṘ"
@test replace("føøbaṙ", 'ṙ' => 'Ṙ', "" => "") == "føøbaṘ"

@test replace("ḟøøḟøøḟøø", "ḟøø" => "bar", "" => "") == "barbarbar"
@test replace("ḟøøbarḟøø", "ḟøø" => "baz", "" => "") == "bazbarbaz"
@test replace("barḟøøḟøø", "ḟøø" => "baz", "" => "") == "barbazbaz"

@test replace("foofoofoo", "foo" => "ƀäṙ", "" => "") == "ƀäṙƀäṙƀäṙ"
@test replace("fooƀäṙfoo", "foo" => "baz", "" => "") == "bazƀäṙbaz"
@test replace("ƀäṙfoofoo", "foo" => "baz", "" => "") == "ƀäṙbazbaz"

@test replace("foofoofoo", "foo" => "bar", "" => "") == "barbarbar"
@test replace("foobarfoo", "foo" => "ƀäż", "" => "") == "ƀäżbarƀäż"
@test replace("barfoofoo", "foo" => "ƀäż", "" => "") == "barƀäżƀäż"

@test replace("ḟøøḟøøḟøø", "ḟøø" => "ƀäṙ", "" => "") == "ƀäṙƀäṙƀäṙ"
@test replace("ḟøøƀäṙḟøø", "ḟøø" => "baz", "" => "") == "bazƀäṙbaz"
@test replace("ƀäṙḟøøḟøø", "ḟøø" => "baz", "" => "") == "ƀäṙbazbaz"

@test replace("ḟøøḟøøḟøø", "ḟøø" => "bar", "" => "") == "barbarbar"
@test replace("ḟøøbarḟøø", "ḟøø" => "ƀäż", "" => "") == "ƀäżbarƀäż"
@test replace("barḟøøḟøø", "ḟøø" => "ƀäż", "" => "") == "barƀäżƀäż"

@test replace("ḟøøḟøøḟøø", "ḟøø" => "ƀäṙ", "" => "") == "ƀäṙƀäṙƀäṙ"
@test replace("ḟøøƀäṙḟøø", "ḟøø" => "ƀäż", "" => "") == "ƀäżƀäṙƀäż"
@test replace("ƀäṙḟøøḟøø", "ḟøø" => "ƀäż", "" => "") == "ƀäṙƀäżƀäż"

@test replace("", "" => "ẍ", "" => "") == "ẍ"
@test replace("", "ẍ" => "ÿ", "" => "") == ""

@test replace("äƀçđ", "" => "π", "" => "") == "πäπƀπçπđπ"
@test replace("äƀçđ", "ƀ" => "π", "" => "") == "äπçđ"
@test replace("äƀçđ", r"ƀ?" => "π", "" => "") == "πäπçπđπ"
@test replace("äƀçđ", r"ƀ+" => "π", "" => "") == "äπçđ"
@test replace("äƀçđ", r"ƀ?ç?" => "π", "" => "") == "πäπđπ"
@test replace("äƀçđ", r"[ƀç]?" => "π", "" => "") == "πäππđπ"

@test replace("foobarfoo", r"(fo|ba)" => "ẍẍ", "" => "") == "ẍẍoẍẍrẍẍo"

@test replace("ḟøøbarḟøø", r"(ḟø|ba)" => "xx", "" => "") == "xxøxxrxxø"
@test replace("ḟøøbarḟøø", r"(ḟøø|ba)" => "bar", "" => "") == "barbarrbar"

@test replace("fooƀäṙfoo", r"(fo|ƀä)" => "xx", "" => "") == "xxoxxṙxxo"
@test replace("fooƀäṙfoo", r"(foo|ƀä)" => "ƀäṙ", "" => "") == "ƀäṙƀäṙṙƀäṙ"

@test replace("ḟøøƀäṙḟøø", r"(ḟø|ƀä)" => "xx", "" => "") == "xxøxxṙxxø"
@test replace("ḟøøƀäṙḟøø", r"(ḟøø|ƀä)" => "ƀäṙ", "" => "") == "ƀäṙƀäṙṙƀäṙ"

@test replace("foo", "oo" => uppercase, "" => "") == "fOO"

# Issue 13332
@test replace("abc", 'b' => 2.1, "" => "") == "a2.1c"

# test replace with a count for String and GenericString
# check that replace is a no-op if count==0
for s in ["aaa", Test.GenericString("aaa")]
@test_throws DomainError replace(s, 'a' => "", count = -1, "" => "")
@test replace(s, 'a' => 'z', count=0, "" => "")::String == s
@test replace(s, 'a' => 'z', count=1, "" => "") == "zaa"
@test replace(s, 'a' => 'z', count=2, "" => "") == "zza"
@test replace(s, 'a' => 'z', count=3, "" => "") == "zzz"
@test replace(s, 'a' => 'z', count=4, "" => "") == "zzz"
@test replace(s, 'a' => 'z', count=typemax(Int), "" => "") == "zzz"
@test replace(s, 'a' => 'z', "" => "") == "zzz"
end

let s = "abc"
@test replace(s) === s
@test replace(s, 'a' => 'z', "" => "") === "zbc"
@test replace(s, 'a' => 'z', 'b' => 'y') == "zyc"
@test replace(s, 'a' => 'z', 'c' => 'x', "b" => 'y') == "zyx"
@test replace(s, '1' => 'z', "" => "") == s
@test replace(s, 'b' => "BbB", "" => "", count=2) == "aBbBc"
end

let s = "quick quicker quickest"
@test replace(s) === s
@test replace(s, "quickest" => 'z', "quicker" => uppercase, "quick" => 'a') == "a QUICKER z"
@test replace(s, "quick" => 'a', "quicker" => uppercase, "quickest" => 'z') == "a aer aest"
@test replace(s, "quickest" => "lame", "quicker" => "is", "quick" => "Duck", count=2) == "Duck is quickest"
@test "1q1u1i1c1k1 1q1u1i1c1k1e1r1 1q1u1i1c1k1e1s1t1" ==
replace(s, "" => '1', "" => "") ==
replace(s, "" => '1', "" => '2')
@test replace(s, "qu" => "QU", "qu" => "never happens", "ick" => "") == "QU QUer QUest"
@test replace(s, " " => '_', "r " => "r-") == "quick_quicker-quickest"
@test replace(s, r"[aeiou]" => "ä", "ui" => "ki", "i" => "I") == "qääck qääckär qääckäst"
@test replace(s, "i" => "I", "ui" => "ki", r"[aeiou]" => "ä") == "qkick qkickär qkickäst"
@test replace(s, r"[^ ]+" => "word", "quicker " => "X", count=big"99") == "word word word"
@test replace(s, "quicker " => "X", r"[^ ]+" => "word", count=big"99") == "word Xword"

@test replace(s, r"(quick)(e)" => s"\2-\1", "x" => "X") == "quick e-quickr e-quickst"

@test replace(s, 'q' => 'Q', 'u' => 'U') == "QUick QUicker QUickest"
@test replace(s, 'q' => 'Q', r"u" => 'U') == "QUick QUicker QUickest"
@test replace(s, 'q' => 'Q', ==('u') => uppercase) == "QUick QUicker QUickest"
@test replace(s, 'q' => 'Q', islowercase => '-') == "Q---- Q------ Q-------"
@test replace(s, ['q', 'u'] => 'K') == "KKick KKicker KKickest"
@test replace(s, occursin("uq") => 'K') == "KKick KKicker KKickest"
@test replace(s, ==('q') => "B") == "Buick Buicker Buickest"

@test replace(s, "qui" => "A", 'r' => 'R') == "Ack AckeR Ackest"
@test replace(s, 'r' => 'x', islowercase => uppercase) == "QUICK QUICKEx QUICKEST"
@test replace(s, islowercase => uppercase, 'r' => 'x') == "QUICK QUICKER QUICKEST"
@test replace(s, "q" => "z", islowercase => uppercase, 'r' => 'x') == "zUICK zUICKER zUICKEST"
@test replace(s, "qui" => "A", 'r' => 'x', islowercase => uppercase) == "ACK ACKEx ACKEST"
@test replace(s, "qui" => "A", 'r' => 'x', islowercase => uppercase) == "ACK ACKEx ACKEST"
@test replace(s, r"q" => "z", islowercase => uppercase, 'r' => 'x') == "zUICK zUICKER zUICKEST"

@test replace(s, "q" => s"a\0b") == "aqbuick aqbuicker aqbuickest"
@test replace(s, "q" => s"a\0b\n\\\g<0>") == "aqb\n\\quick aqb\n\\quicker aqb\n\\quickest"
@test_throws ErrorException("PCRE error: unknown substring") replace(s, r"q" => s"a\1b")
@test_throws ErrorException("Bad replacement string: pattern is not a Regex") replace(s, "q" => s"a\1b")
end
end

@testset "chomp/chop" begin
@test chomp("foo\n") == "foo"
@test chomp("fo∀\n") == "fo∀"
Expand Down