-
-
Notifications
You must be signed in to change notification settings - Fork 5.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
SubString{UTF8String} and UTF8String have different slicing behaviour #7811
Comments
|
I know the index 2 is invalid. The problem is that the behavior is different for the erroneous index, and I can't see why they should be. |
Range indexing for |
Changing range indexing for |
I suspect that most usages that rely on the auto-adjustment are actually bugs (people thinking that character index == string index). So I'm fine with |
+1 for throwing the error. The auto-adjustment makes it harder to figure out how UTF8String actually works. |
I've been working on the following patch. any thoughts on concerns on this change? I've been focusing on validation of the first index only. it may be worth validating the second index too, however. commit 215e3e02023e6400e5a8ab2a62cd0c8cdb359f09
Author: Jameson Nash <[email protected]>
Date: Thu Apr 16 21:15:12 2015 -0400
throw ArgumentError when accessing an invalid UTF-8 indice, rather than attempting to adjust it. fix #7811
diff --git a/base/ascii.jl b/base/ascii.jl
index a9c905f..3622e3c 100644
--- a/base/ascii.jl
+++ b/base/ascii.jl
@@ -17,7 +17,11 @@ sizeof(s::ASCIIString) = sizeof(s.data)
getindex(s::ASCIIString, r::Vector) = ASCIIString(getindex(s.data,r))
getindex(s::ASCIIString, r::UnitRange{Int}) = ASCIIString(getindex(s.data,r))
getindex(s::ASCIIString, indx::AbstractVector{Int}) = ASCIIString(s.data[indx])
-search(s::ASCIIString, c::Char, i::Integer) = c < Char(0x80) ? search(s.data,c%UInt8,i) : 0
+function search(s::ASCIIString, c::Char, i::Integer)
+ i == sizeof(s) + 1 && return 0
+ (i < 1 || i > sizeof(s)) && throw(BoundsError(s, i))
+ return c < Char(0x80) ? search(s.data,c%UInt8,i) : 0
+end
rsearch(s::ASCIIString, c::Char, i::Integer) = c < Char(0x80) ? rsearch(s.data,c%UInt8,i) : 0
function string(c::ASCIIString...)
diff --git a/base/string.jl b/base/string.jl
index 61b6567..7909821 100644
--- a/base/string.jl
+++ b/base/string.jl
@@ -185,10 +185,9 @@ function search(s::AbstractString, c::Chars, i::Integer)
return 1 <= i <= nextind(s,endof(s)) ? i :
throw(BoundsError(s, i))
end
- if i < 1
+ if i < 1 || i > nextind(s,endof(s))
throw(BoundsError(s, i))
end
- i = nextind(s,i-1)
while !done(s,i)
d, j = next(s,i)
if d in c
@@ -632,7 +631,7 @@ prevind(s::SubString, i::Integer) = prevind(s.string, i+s.offset)-s.offset
convert{T<:AbstractString}(::Type{SubString{T}}, s::T) = SubString(s, 1, endof(s))
-bytestring{T <: ByteString}(p::SubString{T}) = bytestring(pointer(p.string.data)+p.offset, nextind(p, p.endof)-1)
+bytestring{T <: ByteString}(p::SubString{T}) = bytestring(p.string.data[1+p.offset:p.offset+nextind(p, p.endof)-1])
function serialize{T}(s, ss::SubString{T})
# avoid saving a copy of the parent string, keeping the type of ss
diff --git a/base/utf8.jl b/base/utf8.jl
index c254a50..8eea581 100644
--- a/base/utf8.jl
+++ b/base/utf8.jl
@@ -54,16 +54,7 @@ function next(s::UTF8String, i::Int)
d = s.data
b = d[i]
if !is_utf8_start(b)
- j = i-1
- while 0 < j && !is_utf8_start(d[j])
- j -= 1
- end
- if 0 < j && i <= j+utf8_trailing[d[j]+1] <= length(d)
- # b is a continuation byte of a valid UTF-8 character
- throw(ArgumentError("invalid UTF-8 character index"))
- end
- # move past 1 byte in case the data is actually Latin-1
- return '\ufffd', i+1
+ throw(ArgumentError("invalid UTF-8 character index"))
end
trailing = utf8_trailing[b+1]
if length(d) < i + trailing
@@ -111,8 +102,11 @@ function getindex(s::UTF8String, r::UnitRange{Int})
isempty(r) && return empty_utf8
i, j = first(r), last(r)
d = s.data
+ if i < 1 || i > length(s.data)
+ throw(BoundsError(s, i))
+ end
if !is_utf8_start(d[i])
- i = nextind(s,i)
+ throw(ArgumentError("invalid UTF-8 character index"))
end
if j > length(d)
throw(BoundsError())
@@ -122,6 +116,13 @@ function getindex(s::UTF8String, r::UnitRange{Int})
end
function search(s::UTF8String, c::Char, i::Integer)
+ if i < 1 || i > sizeof(s)
+ i == sizeof(s) + 1 && return 0
+ throw(BoundsError(s, i))
+ end
+ if !is_utf8_start(s.data[i])
+ throw(ArgumentError("invalid UTF-8 character index"))
+ end
c < Char(0x80) && return search(s.data, c%UInt8, i)
while true
i = search(s.data, first_utf8_byte(c), i)
diff --git a/test/strings.jl b/test/strings.jl
index 3734a2e..f768def 100644
--- a/test/strings.jl
+++ b/test/strings.jl
@@ -256,6 +256,8 @@ u8str = "∀ ε > 0, ∃ δ > 0: |x-y| < δ ⇒ |f(x)-f(y)| < ε"
# ascii search
for str in [astr, Base.GenericString(astr)]
+ @test_throws BoundsError search(str, 'z', 0)
+ @test_throws BoundsError search(str, '∀', 0)
@test search(str, 'x') == 0
@test search(str, '\0') == 0
@test search(str, '\u80') == 0
@@ -269,6 +271,8 @@ for str in [astr, Base.GenericString(astr)]
@test search(str, ',', 7) == 0
@test search(str, '\n') == 14
@test search(str, '\n', 15) == 0
+ @test_throws BoundsError search(str, 'ε', nextind(str,endof(str))+1)
+ @test_throws BoundsError search(str, 'a', nextind(str,endof(str))+1)
end
# ascii rsearch
@@ -291,23 +295,32 @@ end
# utf-8 search
for str in (u8str, Base.GenericString(u8str))
+ @test_throws BoundsError search(str, 'z', 0)
+ @test_throws BoundsError search(str, '∀', 0)
@test search(str, 'z') == 0
@test search(str, '\0') == 0
@test search(str, '\u80') == 0
@test search(str, '∄') == 0
@test search(str, '∀') == 1
- @test search(str, '∀', 2) == 0
+ @test_throws ArgumentError search(str, '∀', 2)
+ @test search(str, '∀', 4) == 0
@test search(str, '∃') == 13
- @test search(str, '∃', 14) == 0
+ @test_throws ArgumentError search(str, '∃', 15)
+ @test search(str, '∃', 16) == 0
@test search(str, 'x') == 26
@test search(str, 'x', 27) == 43
@test search(str, 'x', 44) == 0
@test search(str, 'δ') == 17
- @test search(str, 'δ', 18) == 33
- @test search(str, 'δ', 34) == 0
+ @test_throws ArgumentError search(str, 'δ', 18)
+ @test search(str, 'δ', nextind(str,17)) == 33
+ @test search(str, 'δ', nextind(str,33)) == 0
@test search(str, 'ε') == 5
- @test search(str, 'ε', 6) == 54
- @test search(str, 'ε', 55) == 0
+ @test search(str, 'ε', nextind(str,5)) == 54
+ @test search(str, 'ε', nextind(str,54)) == 0
+ @test search(str, 'ε', nextind(str,endof(str))) == 0
+ @test search(str, 'a', nextind(str,endof(str))) == 0
+ @test_throws BoundsError search(str, 'ε', nextind(str,endof(str))+1)
+ @test_throws BoundsError search(str, 'a', nextind(str,endof(str))+1)
end
# utf-8 rsearch there were a number of other inconsistencies I noted and attempted fixed in this patch, for example, with
I tried to fix many of these also, making this patch much larger than I originally expected, so it may be time to turn this into a PR... |
cf #9297 cc @nalimilan? |
…an attempting to adjust it. fix #7811
i turned this into a branch. any concerns before I go ahead and merge this? |
For what it's worth, I think this should definitely go into 0.4 |
…an attempting to adjust it (in getindex, next, search). fix #7811
…n attempting to adjust it (in getindex, next, search). fix #7811
Found this on stackoverflow
I'm not sure what the correct behaviour should be, but having them different seems wrong. Some relevant discussion in #5446
The text was updated successfully, but these errors were encountered: