-
Notifications
You must be signed in to change notification settings - Fork 141
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
11 changed files
with
10,618 additions
and
10,018 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
# Unicode data generation rules. Except for the test data files, most | ||
# users will not use these Makefile rules, which are primarily to re-generate | ||
# utf8proc_data.c when we get a new Unicode version or charwidth data; they | ||
# require ruby, fontforge, and julia to be installed. | ||
|
||
# programs (each can be overridden on the command line, e.g. `make RUBY=...`)
CURL=curl
RUBY=ruby
PERL=perl
JULIA=julia
# MAKE is deliberately not assigned here: make sets $(MAKE) itself to the
# name it was invoked with, and a hard-coded `make` would replace that value.
CURLFLAGS = --retry 5 --location

# use JuliaLang caching (https://github.com/staticfloat/cache.julialang.org)
# so that Travis builds do not depend on anyone's flaky servers but our own
URLCACHE=https://cache.e.ip.saba.us/
|
||
# `clean` names an action, not a file.
.PHONY: clean

# Delete a target whose recipe exited with an error, so that a partially
# written file is not kept.
.DELETE_ON_ERROR:

# Regenerate the data tables.  The generator script is fed UnicodeData.txt on
# stdin; the other data files are listed as prerequisites so that they are
# brought up to date first and a change to any of them triggers a rebuild.
utf8proc_data.c.new: data_generator.rb UnicodeData.txt \
                     GraphemeBreakProperty.txt DerivedCoreProperties.txt \
                     CompositionExclusions.txt CaseFolding.txt CharWidths.txt
	$(RUBY) $< < UnicodeData.txt > $@
|
||
# GNU Unifont version for font-metric calculations:
UNIFONT_VERSION=7.0.06

# Both font files sit in the same remote directory under their own names,
# so a single recipe serves either target through $@.
unifont-$(UNIFONT_VERSION).ttf unifont_upper-$(UNIFONT_VERSION).ttf:
	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)http://unifoundry.com/pub/unifont-$(UNIFONT_VERSION)/font-builds/$@

# charwidths.jl writes the width table to stdout; the Unifont version is
# passed to it through the environment.
CharWidths.txt: charwidths.jl unifont-$(UNIFONT_VERSION).ttf unifont_upper-$(UNIFONT_VERSION).ttf
	UNIFONT_VERSION=$(UNIFONT_VERSION) $(JULIA) $< > $@
|
||
# Unicode Character Database files.  Each one is stored on the server under
# the same name as the local target, so $@ supplies both file names.
# Only -o is passed to curl: combining it with -O gave curl two output
# options for a single URL.
UnicodeData.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt:
	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)http://www.unicode.org/Public/UNIDATA/$@

GraphemeBreakProperty.txt:
	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/$@

# The test file is rewritten while it is downloaded: every division sign
# becomes "/" and every multiplication sign becomes "+".
# NOTE: the shell reports only perl's exit status for this pipeline, so a
# failed download is not noticed by make here.
GraphemeBreakTest.txt:
	$(CURL) $(CURLFLAGS) $(URLCACHE)http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt | $(PERL) -pe 's,÷,/,g;s,×,+,g' > $@
|
||
# Remove the downloaded and generated files: the targets of the rules in this
# Makefile, plus EastAsianWidth.txt and the unifont*.sfd files, which are
# created by charwidths.jl rather than by a rule here.
clean:
	rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt unifont*.ttf unifont*.sfd
	rm -f utf8proc_data.c.new
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,158 @@ | ||
# Following work by @jiahao, we compute character widths using a combination of | ||
# * advance widths from GNU Unifont (advance width 512 = 1 en) | ||
# * UAX 11: East Asian Width | ||
# * a few exceptions as needed | ||
# Adapted from http://nbviewer.ipython.org/gist/jiahao/07e8b08bf6d8671e9734 | ||
# | ||
# Requires Julia (obviously) and FontForge. | ||
|
||
#############################################################################
# Julia 0.3/0.4 compatibility (taken from Compat package)
if VERSION < v"0.4.0-dev+1419"
    const UInt16 = Uint16
end

# Code point => display width; filled in by the statements that follow.
CharWidths = Dict{Int,Int}()

#############################################################################
# Widths from UAX #11: East Asian Width

isfile("EastAsianWidth.txt") || download("http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt", "EastAsianWidth.txt")
# open(readlines, ...) closes the file handle once the lines have been read
for line in open(readlines, "EastAsianWidth.txt")
    # Skip empty lines and whole-line comments, then drop any trailing comment
    (isempty(line) || line[1] == '#') && continue
    precomment = split(line, '#')[1]
    # Parse code point range and width code
    tokens = split(precomment, ';')
    length(tokens) >= 2 || continue
    charrange = tokens[1]
    width = strip(tokens[2])
    # Parse code point range ("XXXX" or "XXXX..YYYY") into its two end points
    rangetokens = split(charrange, "..")
    charstart = uint32("0x"*rangetokens[1])
    charend = uint32("0x"*rangetokens[length(rangetokens)>1 ? 2 : 1])

    width == "N" && continue # Ignore neutral characters

    # The width code applies to the whole range, so translate it only once
    if width == "W" || width == "F"
        columns = 2 # Wide or full
    elseif width == "Na" || width == "H" || width == "A"
        # Narrow or half or ambiguous (default to narrow in non-East-Asian
        # contexts, which we can assume to be the default)
        columns = 1
    else
        error("Unknown East Asian width code: $width for code point: $charstart")
    end

    # Assign widths
    for c in charstart:charend
        CharWidths[c] = columns
    end
end
|
||
#############################################################################
# Widths from GNU Unifont

# Unifont release to use; the UNIFONT_VERSION environment variable, when set,
# takes precedence over the built-in default.
universion=get(ENV, "UNIFONT_VERSION", "7.0.06")
for fontfile in ["unifont-$universion", "unifont_upper-$universion"]
    # Fetch the TrueType font unless it is already present
    if !isfile("$fontfile.ttf")
        download("http://unifoundry.com/pub/unifont-$universion/font-builds/$fontfile.ttf", "$fontfile.ttf")
    end
    # Have FontForge save the font as an .sfd file unless one already exists
    if !isfile("$fontfile.sfd")
        run(`fontforge -lang=ff -c "Open(\"$fontfile.ttf\");Save(\"$fontfile.sfd\");Quit(0);"`)
    end
end
|
||
# Read an .sfd file and record the width of every glyph that has a
# non-negative code point.
#
# Arguments:
#   filename   - path of the .sfd file to scan
#   CharWidths - dictionary that is updated in place (a new one if omitted)
# Returns CharWidths, where CharWidths[codepoint] = div(width, 512).  codepoint
# is the third whitespace-separated token of a glyph's "Encoding:" line and
# width is the second token of its "Width:" line.
function parsesfd(filename::String, CharWidths::Dict{Int,Int}=Dict{Int,Int}())
    # Declared before the loop so that the values carry over from one line to
    # the next: a variable first assigned inside a loop body is local to the
    # loop and is not guaranteed to keep its value across iterations.
    codepoint = nothing
    width = nothing
    state = :seekchar
    # open(readlines, ...) closes the file once its lines have been read
    for line in open(readlines, filename)
        if contains(line, "StartChar: ") # e.g. "StartChar: nonmarkingreturn"
            # A new glyph record begins.  Resetting here in every state keeps
            # a glyph without a usable code point from passing its width on
            # to the glyph that follows it.
            codepoint = nothing
            width = nothing
            state = :readdata
        elseif state == :readdata # e.g. "Encoding: 65538 -1 2", "Width: 1024"
            contains(line, "Encoding:") && (codepoint = int(split(line)[3]))
            contains(line, "Width:") && (width = int(split(line)[2]))
            if codepoint != nothing && width != nothing && codepoint >= 0
                CharWidths[codepoint] = div(width, 512) # 512 units to the en
                state = :seekchar # ignore the rest of this glyph record
            end
        end
    end
    CharWidths
end
# Merge the widths from both Unifont .sfd files into CharWidths
# (parsesfd updates the dictionary it is given and returns that same object).
for sfdfile in ["unifont-$universion.sfd", "unifont_upper-$universion.sfd"]
    parsesfd(sfdfile, CharWidths)
end
|
||
############################################################################# | ||
# A few exceptions to the above cases, found by manual comparison | ||
# to other wcwidth functions and similar checks. | ||
|
||
# Use ../libutf8proc for category codes, rather than the one in Julia,
# to minimize bootstrapping complexity when a new version of Unicode comes out.
#
# Returns the first UInt16 of the record that utf8proc_get_property points to
# for c, or 0x0000 without calling the library when c is above 0x10FFFF.
function catcode(c)
    if uint(c) > 0x10FFFF
        return 0x0000 # see utf8proc_get_property docs
    end
    property = ccall((:utf8proc_get_property,"../libutf8proc"), Ptr{UInt16}, (Int32,), c)
    return unsafe_load(property)
end
|
||
# use Base.UTF8proc module to get category code constants, since | ||
# we aren't going to change these in utf8proc. | ||
import Base.UTF8proc | ||
|
||
# Force a width of zero for code points in certain general categories.
for c in keys(CharWidths)
    category = catcode(c)

    # Format control characters (category Cf) must have width 0,
    # except for the Arabic characters 0x06xx (see unicode std 6.2, sec. 8.2)
    formatcontrol = category==UTF8proc.UTF8PROC_CATEGORY_CF && c ∉ [0x0601,0x0602,0x0603,0x06dd]

    # Unifont has nonzero width for a number of non-spacing combining
    # characters, e.g. (in 7.0.06): f84,17b4,17b5,180b,180d,2d7f, and
    # the variation selectors
    nonspacing = category==UTF8proc.UTF8PROC_CATEGORY_MN

    # Unassigned and private-use codepoints also get width zero
    # (Unifont includes ConScript Unicode Registry PUA fonts, but since
    # these are nonstandard it seems questionable to recognize them).
    unrecognized = category==UTF8proc.UTF8PROC_CATEGORY_CO || category==UTF8proc.UTF8PROC_CATEGORY_CN

    # for some reason, Unifont has width-2 glyphs for ASCII control chars
    control = category==UTF8proc.UTF8PROC_CATEGORY_CC

    if formatcontrol || nonspacing || unrecognized || control
        CharWidths[c]=0
    end
end
|
||
# Hand-assigned widths, as (code point, width) pairs:
#   0x2028 LINE SEPARATOR (Zl) and 0x2029 PARAGRAPH SEPARATOR (Zp):
#       by definition, should have zero width (on the same line)
#   0x202f NARROW NO-BREAK SPACE (Zs):
#       by definition, should be narrow = width of 1 en space
#   0x2001 EM QUAD (Zs) and 0x2003 EM SPACE (Zs):
#       by definition, should be wide = width of 1 em space
for (codepoint, w) in [(0x2028, 0), (0x2029, 0), (0x202f, 1), (0x2001, 2), (0x2003, 2)]
    CharWidths[codepoint] = w
end
|
||
#############################################################################
# Output (to a file or pipe) for processing by data_generator.rb
# Consecutive code points that share a width are merged into a single line,
# "FIRST..LAST; width" (or "FIRST; width" for a run of one).  Code points
# missing from CharWidths count as width 0.

firstc = 0x000000   # first code point of the run being collected
lastv = 0           # width shared by that run
uhex(c) = uppercase(hex(c,4))
for c in 0x0000:0x110000
    v = get(CharWidths, c, 0)
    # Keep extending the run until the width changes; 0x110000 is one past
    # the last code point and always flushes the final run.
    (v == lastv && c != 0x110000) && continue
    v < 4 || error("invalid charwidth $v for $c")
    label = firstc+1 < c ? string(uhex(firstc), "..", uhex(c-1)) : uhex(firstc)
    println(label, "; ", lastv)
    firstc = c
    lastv = v
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.