Skip to content

Commit

Permalink
Merge pull request #8 from clarcharr/master
Browse files Browse the repository at this point in the history
Update CaseFolding.txt
  • Loading branch information
SimonSapin authored Oct 3, 2017
2 parents 281535c + 237fb8c commit 67eb850
Show file tree
Hide file tree
Showing 3 changed files with 238 additions and 12 deletions.
3 changes: 1 addition & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
[package]

name = "caseless"
version = "0.1.3"
version = "0.2.0"
authors = ["Simon Sapin <[email protected]>"]
description = "Unicode caseless matching"
repository = "https://github.com/SimonSapin/rust-caseless"
Expand Down
234 changes: 228 additions & 6 deletions CaseFolding.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
# CaseFolding-7.0.0.txt
# Date: 2014-04-09, 20:00:56 GMT [MD]
# CaseFolding-10.0.0.txt
# Date: 2017-04-14, 05:40:18 GMT
# © 2017 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
# Unicode Character Database
# Copyright (c) 1991-2014 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
# For documentation, see http://www.unicode.org/reports/tr44/
#
# Case Folding Properties
#
Expand All @@ -23,7 +24,7 @@
#
# NOTE: case folding does not preserve normalization formats!
#
# For information on case folding, including how to have case folding
# For information on case folding, including how to have case folding
# preserve normalization formats, see Section 3.13 Default Case Algorithms in
# The Unicode Standard.
#
Expand Down Expand Up @@ -587,6 +588,21 @@
10C5; C; 2D25; # GEORGIAN CAPITAL LETTER HOE
10C7; C; 2D27; # GEORGIAN CAPITAL LETTER YN
10CD; C; 2D2D; # GEORGIAN CAPITAL LETTER AEN
13F8; C; 13F0; # CHEROKEE SMALL LETTER YE
13F9; C; 13F1; # CHEROKEE SMALL LETTER YI
13FA; C; 13F2; # CHEROKEE SMALL LETTER YO
13FB; C; 13F3; # CHEROKEE SMALL LETTER YU
13FC; C; 13F4; # CHEROKEE SMALL LETTER YV
13FD; C; 13F5; # CHEROKEE SMALL LETTER MV
1C80; C; 0432; # CYRILLIC SMALL LETTER ROUNDED VE
1C81; C; 0434; # CYRILLIC SMALL LETTER LONG-LEGGED DE
1C82; C; 043E; # CYRILLIC SMALL LETTER NARROW O
1C83; C; 0441; # CYRILLIC SMALL LETTER WIDE ES
1C84; C; 0442; # CYRILLIC SMALL LETTER TALL TE
1C85; C; 0442; # CYRILLIC SMALL LETTER THREE-LEGGED TE
1C86; C; 044A; # CYRILLIC SMALL LETTER TALL HARD SIGN
1C87; C; 0463; # CYRILLIC SMALL LETTER TALL YAT
1C88; C; A64B; # CYRILLIC SMALL LETTER UNBLENDED UK
1E00; C; 1E01; # LATIN CAPITAL LETTER A WITH RING BELOW
1E02; C; 1E03; # LATIN CAPITAL LETTER B WITH DOT ABOVE
1E04; C; 1E05; # LATIN CAPITAL LETTER B WITH DOT BELOW
Expand Down Expand Up @@ -1157,8 +1173,93 @@ A7AA; C; 0266; # LATIN CAPITAL LETTER H WITH HOOK
A7AB; C; 025C; # LATIN CAPITAL LETTER REVERSED OPEN E
A7AC; C; 0261; # LATIN CAPITAL LETTER SCRIPT G
A7AD; C; 026C; # LATIN CAPITAL LETTER L WITH BELT
A7AE; C; 026A; # LATIN CAPITAL LETTER SMALL CAPITAL I
A7B0; C; 029E; # LATIN CAPITAL LETTER TURNED K
A7B1; C; 0287; # LATIN CAPITAL LETTER TURNED T
A7B2; C; 029D; # LATIN CAPITAL LETTER J WITH CROSSED-TAIL
A7B3; C; AB53; # LATIN CAPITAL LETTER CHI
A7B4; C; A7B5; # LATIN CAPITAL LETTER BETA
A7B6; C; A7B7; # LATIN CAPITAL LETTER OMEGA
AB70; C; 13A0; # CHEROKEE SMALL LETTER A
AB71; C; 13A1; # CHEROKEE SMALL LETTER E
AB72; C; 13A2; # CHEROKEE SMALL LETTER I
AB73; C; 13A3; # CHEROKEE SMALL LETTER O
AB74; C; 13A4; # CHEROKEE SMALL LETTER U
AB75; C; 13A5; # CHEROKEE SMALL LETTER V
AB76; C; 13A6; # CHEROKEE SMALL LETTER GA
AB77; C; 13A7; # CHEROKEE SMALL LETTER KA
AB78; C; 13A8; # CHEROKEE SMALL LETTER GE
AB79; C; 13A9; # CHEROKEE SMALL LETTER GI
AB7A; C; 13AA; # CHEROKEE SMALL LETTER GO
AB7B; C; 13AB; # CHEROKEE SMALL LETTER GU
AB7C; C; 13AC; # CHEROKEE SMALL LETTER GV
AB7D; C; 13AD; # CHEROKEE SMALL LETTER HA
AB7E; C; 13AE; # CHEROKEE SMALL LETTER HE
AB7F; C; 13AF; # CHEROKEE SMALL LETTER HI
AB80; C; 13B0; # CHEROKEE SMALL LETTER HO
AB81; C; 13B1; # CHEROKEE SMALL LETTER HU
AB82; C; 13B2; # CHEROKEE SMALL LETTER HV
AB83; C; 13B3; # CHEROKEE SMALL LETTER LA
AB84; C; 13B4; # CHEROKEE SMALL LETTER LE
AB85; C; 13B5; # CHEROKEE SMALL LETTER LI
AB86; C; 13B6; # CHEROKEE SMALL LETTER LO
AB87; C; 13B7; # CHEROKEE SMALL LETTER LU
AB88; C; 13B8; # CHEROKEE SMALL LETTER LV
AB89; C; 13B9; # CHEROKEE SMALL LETTER MA
AB8A; C; 13BA; # CHEROKEE SMALL LETTER ME
AB8B; C; 13BB; # CHEROKEE SMALL LETTER MI
AB8C; C; 13BC; # CHEROKEE SMALL LETTER MO
AB8D; C; 13BD; # CHEROKEE SMALL LETTER MU
AB8E; C; 13BE; # CHEROKEE SMALL LETTER NA
AB8F; C; 13BF; # CHEROKEE SMALL LETTER HNA
AB90; C; 13C0; # CHEROKEE SMALL LETTER NAH
AB91; C; 13C1; # CHEROKEE SMALL LETTER NE
AB92; C; 13C2; # CHEROKEE SMALL LETTER NI
AB93; C; 13C3; # CHEROKEE SMALL LETTER NO
AB94; C; 13C4; # CHEROKEE SMALL LETTER NU
AB95; C; 13C5; # CHEROKEE SMALL LETTER NV
AB96; C; 13C6; # CHEROKEE SMALL LETTER QUA
AB97; C; 13C7; # CHEROKEE SMALL LETTER QUE
AB98; C; 13C8; # CHEROKEE SMALL LETTER QUI
AB99; C; 13C9; # CHEROKEE SMALL LETTER QUO
AB9A; C; 13CA; # CHEROKEE SMALL LETTER QUU
AB9B; C; 13CB; # CHEROKEE SMALL LETTER QUV
AB9C; C; 13CC; # CHEROKEE SMALL LETTER SA
AB9D; C; 13CD; # CHEROKEE SMALL LETTER S
AB9E; C; 13CE; # CHEROKEE SMALL LETTER SE
AB9F; C; 13CF; # CHEROKEE SMALL LETTER SI
ABA0; C; 13D0; # CHEROKEE SMALL LETTER SO
ABA1; C; 13D1; # CHEROKEE SMALL LETTER SU
ABA2; C; 13D2; # CHEROKEE SMALL LETTER SV
ABA3; C; 13D3; # CHEROKEE SMALL LETTER DA
ABA4; C; 13D4; # CHEROKEE SMALL LETTER TA
ABA5; C; 13D5; # CHEROKEE SMALL LETTER DE
ABA6; C; 13D6; # CHEROKEE SMALL LETTER TE
ABA7; C; 13D7; # CHEROKEE SMALL LETTER DI
ABA8; C; 13D8; # CHEROKEE SMALL LETTER TI
ABA9; C; 13D9; # CHEROKEE SMALL LETTER DO
ABAA; C; 13DA; # CHEROKEE SMALL LETTER DU
ABAB; C; 13DB; # CHEROKEE SMALL LETTER DV
ABAC; C; 13DC; # CHEROKEE SMALL LETTER DLA
ABAD; C; 13DD; # CHEROKEE SMALL LETTER TLA
ABAE; C; 13DE; # CHEROKEE SMALL LETTER TLE
ABAF; C; 13DF; # CHEROKEE SMALL LETTER TLI
ABB0; C; 13E0; # CHEROKEE SMALL LETTER TLO
ABB1; C; 13E1; # CHEROKEE SMALL LETTER TLU
ABB2; C; 13E2; # CHEROKEE SMALL LETTER TLV
ABB3; C; 13E3; # CHEROKEE SMALL LETTER TSA
ABB4; C; 13E4; # CHEROKEE SMALL LETTER TSE
ABB5; C; 13E5; # CHEROKEE SMALL LETTER TSI
ABB6; C; 13E6; # CHEROKEE SMALL LETTER TSO
ABB7; C; 13E7; # CHEROKEE SMALL LETTER TSU
ABB8; C; 13E8; # CHEROKEE SMALL LETTER TSV
ABB9; C; 13E9; # CHEROKEE SMALL LETTER WA
ABBA; C; 13EA; # CHEROKEE SMALL LETTER WE
ABBB; C; 13EB; # CHEROKEE SMALL LETTER WI
ABBC; C; 13EC; # CHEROKEE SMALL LETTER WO
ABBD; C; 13ED; # CHEROKEE SMALL LETTER WU
ABBE; C; 13EE; # CHEROKEE SMALL LETTER WV
ABBF; C; 13EF; # CHEROKEE SMALL LETTER YA
FB00; F; 0066 0066; # LATIN SMALL LIGATURE FF
FB01; F; 0066 0069; # LATIN SMALL LIGATURE FI
FB02; F; 0066 006C; # LATIN SMALL LIGATURE FL
Expand Down Expand Up @@ -1237,6 +1338,93 @@ FF3A; C; FF5A; # FULLWIDTH LATIN CAPITAL LETTER Z
10425; C; 1044D; # DESERET CAPITAL LETTER ENG
10426; C; 1044E; # DESERET CAPITAL LETTER OI
10427; C; 1044F; # DESERET CAPITAL LETTER EW
104B0; C; 104D8; # OSAGE CAPITAL LETTER A
104B1; C; 104D9; # OSAGE CAPITAL LETTER AI
104B2; C; 104DA; # OSAGE CAPITAL LETTER AIN
104B3; C; 104DB; # OSAGE CAPITAL LETTER AH
104B4; C; 104DC; # OSAGE CAPITAL LETTER BRA
104B5; C; 104DD; # OSAGE CAPITAL LETTER CHA
104B6; C; 104DE; # OSAGE CAPITAL LETTER EHCHA
104B7; C; 104DF; # OSAGE CAPITAL LETTER E
104B8; C; 104E0; # OSAGE CAPITAL LETTER EIN
104B9; C; 104E1; # OSAGE CAPITAL LETTER HA
104BA; C; 104E2; # OSAGE CAPITAL LETTER HYA
104BB; C; 104E3; # OSAGE CAPITAL LETTER I
104BC; C; 104E4; # OSAGE CAPITAL LETTER KA
104BD; C; 104E5; # OSAGE CAPITAL LETTER EHKA
104BE; C; 104E6; # OSAGE CAPITAL LETTER KYA
104BF; C; 104E7; # OSAGE CAPITAL LETTER LA
104C0; C; 104E8; # OSAGE CAPITAL LETTER MA
104C1; C; 104E9; # OSAGE CAPITAL LETTER NA
104C2; C; 104EA; # OSAGE CAPITAL LETTER O
104C3; C; 104EB; # OSAGE CAPITAL LETTER OIN
104C4; C; 104EC; # OSAGE CAPITAL LETTER PA
104C5; C; 104ED; # OSAGE CAPITAL LETTER EHPA
104C6; C; 104EE; # OSAGE CAPITAL LETTER SA
104C7; C; 104EF; # OSAGE CAPITAL LETTER SHA
104C8; C; 104F0; # OSAGE CAPITAL LETTER TA
104C9; C; 104F1; # OSAGE CAPITAL LETTER EHTA
104CA; C; 104F2; # OSAGE CAPITAL LETTER TSA
104CB; C; 104F3; # OSAGE CAPITAL LETTER EHTSA
104CC; C; 104F4; # OSAGE CAPITAL LETTER TSHA
104CD; C; 104F5; # OSAGE CAPITAL LETTER DHA
104CE; C; 104F6; # OSAGE CAPITAL LETTER U
104CF; C; 104F7; # OSAGE CAPITAL LETTER WA
104D0; C; 104F8; # OSAGE CAPITAL LETTER KHA
104D1; C; 104F9; # OSAGE CAPITAL LETTER GHA
104D2; C; 104FA; # OSAGE CAPITAL LETTER ZA
104D3; C; 104FB; # OSAGE CAPITAL LETTER ZHA
10C80; C; 10CC0; # OLD HUNGARIAN CAPITAL LETTER A
10C81; C; 10CC1; # OLD HUNGARIAN CAPITAL LETTER AA
10C82; C; 10CC2; # OLD HUNGARIAN CAPITAL LETTER EB
10C83; C; 10CC3; # OLD HUNGARIAN CAPITAL LETTER AMB
10C84; C; 10CC4; # OLD HUNGARIAN CAPITAL LETTER EC
10C85; C; 10CC5; # OLD HUNGARIAN CAPITAL LETTER ENC
10C86; C; 10CC6; # OLD HUNGARIAN CAPITAL LETTER ECS
10C87; C; 10CC7; # OLD HUNGARIAN CAPITAL LETTER ED
10C88; C; 10CC8; # OLD HUNGARIAN CAPITAL LETTER AND
10C89; C; 10CC9; # OLD HUNGARIAN CAPITAL LETTER E
10C8A; C; 10CCA; # OLD HUNGARIAN CAPITAL LETTER CLOSE E
10C8B; C; 10CCB; # OLD HUNGARIAN CAPITAL LETTER EE
10C8C; C; 10CCC; # OLD HUNGARIAN CAPITAL LETTER EF
10C8D; C; 10CCD; # OLD HUNGARIAN CAPITAL LETTER EG
10C8E; C; 10CCE; # OLD HUNGARIAN CAPITAL LETTER EGY
10C8F; C; 10CCF; # OLD HUNGARIAN CAPITAL LETTER EH
10C90; C; 10CD0; # OLD HUNGARIAN CAPITAL LETTER I
10C91; C; 10CD1; # OLD HUNGARIAN CAPITAL LETTER II
10C92; C; 10CD2; # OLD HUNGARIAN CAPITAL LETTER EJ
10C93; C; 10CD3; # OLD HUNGARIAN CAPITAL LETTER EK
10C94; C; 10CD4; # OLD HUNGARIAN CAPITAL LETTER AK
10C95; C; 10CD5; # OLD HUNGARIAN CAPITAL LETTER UNK
10C96; C; 10CD6; # OLD HUNGARIAN CAPITAL LETTER EL
10C97; C; 10CD7; # OLD HUNGARIAN CAPITAL LETTER ELY
10C98; C; 10CD8; # OLD HUNGARIAN CAPITAL LETTER EM
10C99; C; 10CD9; # OLD HUNGARIAN CAPITAL LETTER EN
10C9A; C; 10CDA; # OLD HUNGARIAN CAPITAL LETTER ENY
10C9B; C; 10CDB; # OLD HUNGARIAN CAPITAL LETTER O
10C9C; C; 10CDC; # OLD HUNGARIAN CAPITAL LETTER OO
10C9D; C; 10CDD; # OLD HUNGARIAN CAPITAL LETTER NIKOLSBURG OE
10C9E; C; 10CDE; # OLD HUNGARIAN CAPITAL LETTER RUDIMENTA OE
10C9F; C; 10CDF; # OLD HUNGARIAN CAPITAL LETTER OEE
10CA0; C; 10CE0; # OLD HUNGARIAN CAPITAL LETTER EP
10CA1; C; 10CE1; # OLD HUNGARIAN CAPITAL LETTER EMP
10CA2; C; 10CE2; # OLD HUNGARIAN CAPITAL LETTER ER
10CA3; C; 10CE3; # OLD HUNGARIAN CAPITAL LETTER SHORT ER
10CA4; C; 10CE4; # OLD HUNGARIAN CAPITAL LETTER ES
10CA5; C; 10CE5; # OLD HUNGARIAN CAPITAL LETTER ESZ
10CA6; C; 10CE6; # OLD HUNGARIAN CAPITAL LETTER ET
10CA7; C; 10CE7; # OLD HUNGARIAN CAPITAL LETTER ENT
10CA8; C; 10CE8; # OLD HUNGARIAN CAPITAL LETTER ETY
10CA9; C; 10CE9; # OLD HUNGARIAN CAPITAL LETTER ECH
10CAA; C; 10CEA; # OLD HUNGARIAN CAPITAL LETTER U
10CAB; C; 10CEB; # OLD HUNGARIAN CAPITAL LETTER UU
10CAC; C; 10CEC; # OLD HUNGARIAN CAPITAL LETTER NIKOLSBURG UE
10CAD; C; 10CED; # OLD HUNGARIAN CAPITAL LETTER RUDIMENTA UE
10CAE; C; 10CEE; # OLD HUNGARIAN CAPITAL LETTER EV
10CAF; C; 10CEF; # OLD HUNGARIAN CAPITAL LETTER EZ
10CB0; C; 10CF0; # OLD HUNGARIAN CAPITAL LETTER EZS
10CB1; C; 10CF1; # OLD HUNGARIAN CAPITAL LETTER ENT-SHAPED SIGN
10CB2; C; 10CF2; # OLD HUNGARIAN CAPITAL LETTER US
118A0; C; 118C0; # WARANG CITI CAPITAL LETTER NGAA
118A1; C; 118C1; # WARANG CITI CAPITAL LETTER A
118A2; C; 118C2; # WARANG CITI CAPITAL LETTER WI
Expand Down Expand Up @@ -1269,5 +1457,39 @@ FF3A; C; FF5A; # FULLWIDTH LATIN CAPITAL LETTER Z
118BD; C; 118DD; # WARANG CITI CAPITAL LETTER SSUU
118BE; C; 118DE; # WARANG CITI CAPITAL LETTER SII
118BF; C; 118DF; # WARANG CITI CAPITAL LETTER VIYO
1E900; C; 1E922; # ADLAM CAPITAL LETTER ALIF
1E901; C; 1E923; # ADLAM CAPITAL LETTER DAALI
1E902; C; 1E924; # ADLAM CAPITAL LETTER LAAM
1E903; C; 1E925; # ADLAM CAPITAL LETTER MIIM
1E904; C; 1E926; # ADLAM CAPITAL LETTER BA
1E905; C; 1E927; # ADLAM CAPITAL LETTER SINNYIIYHE
1E906; C; 1E928; # ADLAM CAPITAL LETTER PE
1E907; C; 1E929; # ADLAM CAPITAL LETTER BHE
1E908; C; 1E92A; # ADLAM CAPITAL LETTER RA
1E909; C; 1E92B; # ADLAM CAPITAL LETTER E
1E90A; C; 1E92C; # ADLAM CAPITAL LETTER FA
1E90B; C; 1E92D; # ADLAM CAPITAL LETTER I
1E90C; C; 1E92E; # ADLAM CAPITAL LETTER O
1E90D; C; 1E92F; # ADLAM CAPITAL LETTER DHA
1E90E; C; 1E930; # ADLAM CAPITAL LETTER YHE
1E90F; C; 1E931; # ADLAM CAPITAL LETTER WAW
1E910; C; 1E932; # ADLAM CAPITAL LETTER NUN
1E911; C; 1E933; # ADLAM CAPITAL LETTER KAF
1E912; C; 1E934; # ADLAM CAPITAL LETTER YA
1E913; C; 1E935; # ADLAM CAPITAL LETTER U
1E914; C; 1E936; # ADLAM CAPITAL LETTER JIIM
1E915; C; 1E937; # ADLAM CAPITAL LETTER CHI
1E916; C; 1E938; # ADLAM CAPITAL LETTER HA
1E917; C; 1E939; # ADLAM CAPITAL LETTER QAAF
1E918; C; 1E93A; # ADLAM CAPITAL LETTER GA
1E919; C; 1E93B; # ADLAM CAPITAL LETTER NYA
1E91A; C; 1E93C; # ADLAM CAPITAL LETTER TU
1E91B; C; 1E93D; # ADLAM CAPITAL LETTER NHA
1E91C; C; 1E93E; # ADLAM CAPITAL LETTER VA
1E91D; C; 1E93F; # ADLAM CAPITAL LETTER KHA
1E91E; C; 1E940; # ADLAM CAPITAL LETTER GBE
1E91F; C; 1E941; # ADLAM CAPITAL LETTER ZAL
1E920; C; 1E942; # ADLAM CAPITAL LETTER KPO
1E921; C; 1E943; # ADLAM CAPITAL LETTER SHA
#
# EOF
13 changes: 9 additions & 4 deletions src/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,22 @@ const MAX_FOLDED_CODE_POINTS: usize = 3;
fn main() {
let mut lines = include_str!("../CaseFolding.txt").lines();
let first_line = lines.next().unwrap();
let version_regex = Regex::new(r"^# CaseFolding-(\d.\d.\d).txt$").unwrap();
let unicode_version = &version_regex.captures(first_line).unwrap()[1];
let version_regex = Regex::new(r"^# CaseFolding-(\d+)\.(\d+)\.(\d+).txt$").unwrap();
let unicode_version = &version_regex.captures(first_line).unwrap();
let (major, minor, patch): (u64, u64, u64) = (
unicode_version[1].parse().unwrap(),
unicode_version[2].parse().unwrap(),
unicode_version[3].parse().unwrap(),
);

let dst = Path::new(&env::var("OUT_DIR").unwrap()).join("case_folding_data.rs");
let mut f = &mut File::create(&dst).unwrap();
let f = &mut File::create(&dst).unwrap();

macro_rules! w {
($($args: tt)+) => { (write!(f, $($args)+)).unwrap(); }
};

w!("pub const UNICODE_VERSION: &'static str = \"{}\";\n", unicode_version);
w!("pub const UNICODE_VERSION: (u64, u64, u64) = ({}, {}, {});\n", major, minor, patch);
w!("const CASE_FOLDING_TABLE: &'static [(char, [char; 3])] = &[\n");

// Entry with C (common case folding) or F (full case folding) status
Expand Down

0 comments on commit 67eb850

Please sign in to comment.