Skip to content

Commit

Permalink
Merge pull request #9261 from stevengj/graphemes
Browse files Browse the repository at this point in the history
add graphemes(s) function to iterate over string graphemes
  • Loading branch information
stevengj committed Dec 17, 2014
2 parents 6e4edf3 + 3c1b839 commit 2364748
Show file tree
Hide file tree
Showing 9 changed files with 106 additions and 9 deletions.
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ Library improvements

* Efficient `mean` and `median` for ranges ([#8089]).

* `graphemes(s)` returns an iterator over grapheme substrings of `s` ([#9261]).

* Character predicates such as `islower()`, `isspace()`, etc. use utf8proc/libmojibake
to provide uniform cross-platform behavior and up-to-date, locale-independent support
for Unicode standards ([#5939]).
Expand Down Expand Up @@ -1132,4 +1134,6 @@ Too numerous to mention.
[#9133]: https://github.com/JuliaLang/julia/issues/9133
[#9144]: https://github.com/JuliaLang/julia/issues/9144
[#9249]: https://github.com/JuliaLang/julia/issues/9249
[#9261]: https://github.com/JuliaLang/julia/issues/9261
[#9271]: https://github.com/JuliaLang/julia/issues/9271
[#9294]: https://github.com/JuliaLang/julia/issues/9294
1 change: 1 addition & 0 deletions base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -822,6 +822,7 @@ export
escape_string,
float32_isvalid,
float64_isvalid,
graphemes,
hex,
hex2bytes,
ind2chr,
Expand Down
1 change: 0 additions & 1 deletion base/string.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1729,4 +1729,3 @@ pointer{T<:ByteString}(x::SubString{T}, i::Integer) = pointer(x.string.data) + x
pointer(x::Union(UTF16String,UTF32String), i::Integer) = pointer(x)+(i-1)*sizeof(eltype(x.data))
pointer{T<:Union(UTF16String,UTF32String)}(x::SubString{T}) = pointer(x.string.data) + x.offset*sizeof(eltype(x.data))
pointer{T<:Union(UTF16String,UTF32String)}(x::SubString{T}, i::Integer) = pointer(x.string.data) + (x.offset + (i-1))*sizeof(eltype(x.data))

2 changes: 1 addition & 1 deletion base/utf8.jl
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ function getindex(s::UTF8String, r::UnitRange{Int})
if !is_utf8_start(d[i])
i = nextind(s,i)
end
if j > endof(s)
if j > length(d)
throw(BoundsError())
end
j = nextind(s,j)-1
Expand Down
62 changes: 58 additions & 4 deletions base/utf8proc.jl
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
# Various Unicode functionality from the utf8proc library
module UTF8proc

import Base: show, showcompact, ==, string, symbol, isless
import Base: show, showcompact, ==, hash, string, symbol, isless, length, eltype, start, next, done, convert

export isgraphemebreak

# also exported by Base:
export normalize_string, is_valid_char, is_assigned_char,
export normalize_string, graphemes, is_valid_char, is_assigned_char,
islower, isupper, isalpha, isdigit, isnumber, isalnum,
iscntrl, ispunct, isspace, isprint, isgraph, isblank

Expand Down Expand Up @@ -60,6 +62,8 @@ const UTF8PROC_CHARBOUND = (1<<11)
const UTF8PROC_LUMP = (1<<12)
const UTF8PROC_STRIPMARK = (1<<13)

############################################################################

let
const p = Array(Ptr{UInt8}, 1)
global utf8proc_map
Expand Down Expand Up @@ -110,6 +114,8 @@ function normalize_string(s::AbstractString, nf::Symbol)
throw(ArgumentError(":$nf is not one of :NFC, :NFD, :NFKC, :NFKD")))
end

############################################################################

# returns UTF8PROC_CATEGORY code in 1:30 giving Unicode category
function category_code(c)
uint32(c) > 0x10FFFF && return 0x0000 # see utf8proc_get_property docs
Expand All @@ -118,8 +124,6 @@ end

# A character is reported as assigned unless its category code equals
# UTF8PROC_CATEGORY_CN.
function is_assigned_char(c)
    return !(category_code(c) == UTF8PROC_CATEGORY_CN)
end

# TODO: use UTF8PROC_CHARBOUND to extract graphemes from a string, e.g. to iterate over graphemes?

## libc character class predicates ##

# True when the category code of `c` equals UTF8PROC_CATEGORY_LL.
function islower(c::Char)
    return category_code(c) == UTF8PROC_CATEGORY_LL
end
Expand Down Expand Up @@ -168,4 +172,54 @@ for name = ("alnum", "alpha", "cntrl", "digit", "number", "graph",
end
end

############################################################################
# iterators for grapheme segmentation

# Thin wrapper over utf8proc's `utf8proc_grapheme_break`: returns the Bool that
# the library reports for the ordered pair of adjacent characters (`c1`, `c2`).
function isgraphemebreak(c1::Char, c2::Char)
    return ccall(:utf8proc_grapheme_break, Bool, (Char, Char), c1, c2)
end

# Iterator object for grapheme segmentation; it only wraps the string it
# iterates over.
immutable GraphemeIterator{S<:AbstractString}
    s::S # original string (for generation of SubStrings)
end

# Wrap `s` in a GraphemeIterator parameterized on the concrete type of `s`.
function graphemes(s::AbstractString)
    T = typeof(s)
    return GraphemeIterator{T}(s)
end

# Declared element type of a grapheme iterator over a string of type `S`.
function eltype{S}(::GraphemeIterator{S})
    return SubString{S}
end

# Count the graphemes of the wrapped string by walking its characters and
# counting each position where a break is reported between a character and
# the one before it.
function length(g::GraphemeIterator)
    # Seed the "previous character" with a soft hyphen: a grapheme break is
    # always allowed after it, so the first character is counted.
    prev = Char(0x00ad)
    total = 0
    for ch in g.s
        if isgraphemebreak(prev, ch)
            total += 1
        end
        prev = ch
    end
    return total
end

# The iteration state of a GraphemeIterator is the iteration state of the
# string it wraps, so both queries are forwarded to that string.
function start(g::GraphemeIterator)
    return start(g.s)
end

function done(g::GraphemeIterator, i)
    return done(g.s, i)
end

# Produce the next grapheme of the wrapped string.
#
# `i` is the iteration state: the index in `g.s` at which this grapheme starts.
# Returns `(grapheme, state)`, where `grapheme` is `s[i:j]` (`j` being the index
# of the last character absorbed into the grapheme) and `state` is the index
# of the first character after it.
#
# NOTE(review): `eltype` for GraphemeIterator{S} is declared as SubString{S};
# whether `s[i:j]` actually yields a SubString depends on the string type's
# `getindex` -- confirm.
function next(g::GraphemeIterator, i)
    s = g.s
    j = i               # index of the last character known to be in the grapheme
    c0, k = next(s, i)  # c0: previous character, k: index of the next candidate
    while !done(s, k) # loop until next grapheme is s[i:j]
        c, knext = next(s, k)
        isgraphemebreak(c0, c) && break
        j = k
        # Advance the state to the index after `c`. This must be an index, not
        # the character: the state is handed back to done/next on the string.
        k = knext
        c0 = c
    end
    return (s[i:j], k)
end

# Equality, hashing and ordering of grapheme iterators all defer to the
# strings they wrap.
function ==(g1::GraphemeIterator, g2::GraphemeIterator)
    return g1.s == g2.s
end

function hash(g::GraphemeIterator, h::UInt)
    return hash(g.s, h)
end

function isless(g1::GraphemeIterator, g2::GraphemeIterator)
    return isless(g1.s, g2.s)
end

# Convert a grapheme iterator to the string type `S` by converting the string
# it wraps.
function convert{S<:AbstractString}(::Type{S}, g::GraphemeIterator)
    return convert(S, g.s)
end

# Print a one-line description: the grapheme count, the iterator's type
# parameter and the wrapped string in double quotes.
function show{S}(io::IO, g::GraphemeIterator{S})
    desc = "length-$(length(g)) GraphemeIterator{$S} for \"$(g.s)\""
    return print(io, desc)
end

############################################################################

end # module
2 changes: 1 addition & 1 deletion deps/libmojibake
Submodule libmojibake updated from df71da to 86447a
8 changes: 8 additions & 0 deletions doc/stdlib/base.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1417,6 +1417,14 @@ Strings

For example, NFKC corresponds to the options ``compose=true, compat=true, stable=true``.

.. function:: graphemes(s) -> iterator over substrings of s

Returns an iterator over substrings of ``s`` that correspond to
the extended graphemes in the string, as defined by Unicode UAX #29.
(Roughly, these are what users would perceive as single characters,
even though they may contain more than one codepoint; for example
a letter combined with an accent mark is a single grapheme.)

.. function:: is_valid_ascii(s) -> Bool

Returns true if the argument (``ASCIIString``, ``UTF8String``, or byte vector) is valid ASCII, false otherwise.
Expand Down
7 changes: 6 additions & 1 deletion test/strings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1267,6 +1267,11 @@ Base.done(jt::i9178, n) = (jt.ndone += 1 ; n > 3)
Base.next(jt::i9178, n) = (jt.nnext += 1 ; ("$(jt.nnext),$(jt.ndone)", n+1))
@test join(i9178(0,0), ";") == "1,1;2,2;3,3;4,4"

# a range whose last index is a code unit that does not start a codepoint
# must still be handled: here the full-range substring equals the string
let str = "x\u0302"
    @test str[1:3] == str
end

# reverseind
for T in (ASCIIString, UTF8String, UTF16String, UTF32String)
for prefix in ("", "abcd", "\U0001d6a4\U0001d4c1", "\U0001d6a4\U0001d4c1c", " \U0001d6a4\U0001d4c1")
Expand All @@ -1288,4 +1293,4 @@ for T in (ASCIIString, UTF8String, UTF16String, UTF32String)
end
end
end
end
end
28 changes: 27 additions & 1 deletion test/unicode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,35 @@ else
end

# check utf8proc handling of CN category constants

let lower = 'β', unassigned = '\u038B'
    U = Base.UTF8proc
    # check codepoint with category code LL
    @test U.category_code(lower) == U.UTF8PROC_CATEGORY_LL
    # check codepoint with category code CN
    @test U.category_code(unassigned) == U.UTF8PROC_CATEGORY_CN
end

# graphemes
# Each entry pairs an input string with the graphemes it is expected to
# split into.
let grphtest = (("b\u0300lahβlahb\u0302láh", ["b\u0300","l","a","h",
                                              "β","l","a","h",
                                              "b\u0302","l","á","h"]),
                ("", UTF8String[]),
                ("x\u0302", ["x\u0302"]),
                ("\U1d4c1\u0302", ["\U1d4c1\u0302"]),
                ("\U1d4c1\u0302\U1d4c1\u0300", ["\U1d4c1\u0302",
                                                "\U1d4c1\u0300"]),
                ("x",["x"]),
                ("abc",["a","b","c"]))
    for T in (utf8,utf16,utf32)
        for nf in (:NFC, :NFD)
            for (str, expected) in grphtest
                # normalize both the input and the expected pieces the same way
                converted = T(normalize_string(str, nf))
                wanted = map(piece -> normalize_string(piece, nf), expected)
                found = collect(graphemes(converted))
                @test found == wanted
                @test length(graphemes(converted)) == length(found)
            end
            # sorting the strings and sorting their iterators must agree
            # NOTE(review): this check does not use `nf`, so it runs identically
            # on every pass of the `nf` loop -- confirm whether
            # `normalize_string(s, nf)` was intended.
            strs = [T(normalize_string(s)) for (s,g) in grphtest]
            iters = map(graphemes, strs)
            @test map(graphemes, sort!(strs)) == sort!(iters)
        end
    end
end

0 comments on commit 2364748

Please sign in to comment.