diff --git a/Cargo.toml b/Cargo.toml index 99ac27c..6a7786a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,6 @@ [package] - name = "caseless" -version = "0.1.3" +version = "0.2.0" authors = ["Simon Sapin "] description = "Unicode caseless matching" repository = "https://github.com/SimonSapin/rust-caseless" diff --git a/CaseFolding.txt b/CaseFolding.txt index 13c10a6..efdf18e 100644 --- a/CaseFolding.txt +++ b/CaseFolding.txt @@ -1,10 +1,11 @@ -# CaseFolding-7.0.0.txt -# Date: 2014-04-09, 20:00:56 GMT [MD] +# CaseFolding-10.0.0.txt +# Date: 2017-04-14, 05:40:18 GMT +# © 2017 Unicode®, Inc. +# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. +# For terms of use, see http://www.unicode.org/terms_of_use.html # # Unicode Character Database -# Copyright (c) 1991-2014 Unicode, Inc. -# For terms of use, see http://www.unicode.org/terms_of_use.html -# For documentation, see http://www.unicode.org/reports/tr44/ +# For documentation, see http://www.unicode.org/reports/tr44/ # # Case Folding Properties # @@ -23,7 +24,7 @@ # # NOTE: case folding does not preserve normalization formats! # -# For information on case folding, including how to have case folding +# For information on case folding, including how to have case folding # preserve normalization formats, see Section 3.13 Default Case Algorithms in # The Unicode Standard. # @@ -587,6 +588,21 @@ 10C5; C; 2D25; # GEORGIAN CAPITAL LETTER HOE 10C7; C; 2D27; # GEORGIAN CAPITAL LETTER YN 10CD; C; 2D2D; # GEORGIAN CAPITAL LETTER AEN +13F8; C; 13F0; # CHEROKEE SMALL LETTER YE +13F9; C; 13F1; # CHEROKEE SMALL LETTER YI +13FA; C; 13F2; # CHEROKEE SMALL LETTER YO +13FB; C; 13F3; # CHEROKEE SMALL LETTER YU +13FC; C; 13F4; # CHEROKEE SMALL LETTER YV +13FD; C; 13F5; # CHEROKEE SMALL LETTER MV +1C80; C; 0432; # CYRILLIC SMALL LETTER ROUNDED VE +1C81; C; 0434; # CYRILLIC SMALL LETTER LONG-LEGGED DE +1C82; C; 043E; # CYRILLIC SMALL LETTER NARROW O +1C83; C; 0441; # CYRILLIC SMALL LETTER WIDE ES +1C84; C; 0442; # CYRILLIC SMALL LETTER TALL TE +1C85; C; 0442; # CYRILLIC SMALL LETTER THREE-LEGGED TE +1C86; C; 044A; # CYRILLIC SMALL LETTER TALL HARD SIGN +1C87; C; 0463; # CYRILLIC SMALL LETTER TALL YAT +1C88; C; A64B; # CYRILLIC SMALL LETTER UNBLENDED UK 1E00; C; 1E01; # LATIN CAPITAL LETTER A WITH RING BELOW 1E02; C; 1E03; # LATIN CAPITAL LETTER B WITH DOT ABOVE 1E04; C; 1E05; # LATIN CAPITAL LETTER B WITH DOT BELOW @@ -1157,8 +1173,93 @@ A7AA; C; 0266; # LATIN CAPITAL LETTER H WITH HOOK A7AB; C; 025C; # LATIN CAPITAL LETTER REVERSED OPEN E A7AC; C; 0261; # LATIN CAPITAL LETTER SCRIPT G A7AD; C; 026C; # LATIN CAPITAL LETTER L WITH BELT +A7AE; C; 026A; # LATIN CAPITAL LETTER SMALL CAPITAL I A7B0; C; 029E; # LATIN CAPITAL LETTER TURNED K A7B1; C; 0287; # LATIN CAPITAL LETTER TURNED T +A7B2; C; 029D; # LATIN CAPITAL LETTER J WITH CROSSED-TAIL +A7B3; C; AB53; # LATIN CAPITAL LETTER CHI +A7B4; C; A7B5; # LATIN CAPITAL LETTER BETA +A7B6; C; A7B7; # LATIN CAPITAL LETTER OMEGA +AB70; C; 13A0; # CHEROKEE SMALL LETTER A +AB71; C; 13A1; # CHEROKEE SMALL LETTER E +AB72; C; 13A2; # CHEROKEE SMALL LETTER I +AB73; C; 13A3; # CHEROKEE SMALL LETTER O +AB74; C; 13A4; # CHEROKEE SMALL LETTER U +AB75; C; 13A5; # CHEROKEE SMALL LETTER V +AB76; C; 13A6; # CHEROKEE SMALL LETTER GA +AB77; C; 13A7; # CHEROKEE SMALL LETTER KA +AB78; C; 13A8; # CHEROKEE SMALL LETTER GE +AB79; C; 13A9; # CHEROKEE SMALL LETTER GI +AB7A; C; 13AA; # CHEROKEE SMALL LETTER GO +AB7B; C; 13AB; # CHEROKEE SMALL LETTER GU +AB7C; C; 13AC; # CHEROKEE SMALL LETTER GV +AB7D; C; 13AD; # CHEROKEE SMALL LETTER HA +AB7E; C; 13AE; # CHEROKEE SMALL LETTER HE +AB7F; C; 13AF; # CHEROKEE SMALL LETTER HI +AB80; C; 13B0; # CHEROKEE SMALL LETTER HO +AB81; C; 13B1; # CHEROKEE SMALL LETTER HU +AB82; C; 13B2; # CHEROKEE SMALL LETTER HV +AB83; C; 13B3; # CHEROKEE SMALL LETTER LA +AB84; C; 13B4; # CHEROKEE SMALL LETTER LE +AB85; C; 13B5; # CHEROKEE SMALL LETTER LI +AB86; C; 13B6; # CHEROKEE SMALL LETTER LO +AB87; C; 13B7; # CHEROKEE SMALL LETTER LU +AB88; C; 13B8; # CHEROKEE SMALL LETTER LV +AB89; C; 13B9; # CHEROKEE SMALL LETTER MA +AB8A; C; 13BA; # CHEROKEE SMALL LETTER ME +AB8B; C; 13BB; # CHEROKEE SMALL LETTER MI +AB8C; C; 13BC; # CHEROKEE SMALL LETTER MO +AB8D; C; 13BD; # CHEROKEE SMALL LETTER MU +AB8E; C; 13BE; # CHEROKEE SMALL LETTER NA +AB8F; C; 13BF; # CHEROKEE SMALL LETTER HNA +AB90; C; 13C0; # CHEROKEE SMALL LETTER NAH +AB91; C; 13C1; # CHEROKEE SMALL LETTER NE +AB92; C; 13C2; # CHEROKEE SMALL LETTER NI +AB93; C; 13C3; # CHEROKEE SMALL LETTER NO +AB94; C; 13C4; # CHEROKEE SMALL LETTER NU +AB95; C; 13C5; # CHEROKEE SMALL LETTER NV +AB96; C; 13C6; # CHEROKEE SMALL LETTER QUA +AB97; C; 13C7; # CHEROKEE SMALL LETTER QUE +AB98; C; 13C8; # CHEROKEE SMALL LETTER QUI +AB99; C; 13C9; # CHEROKEE SMALL LETTER QUO +AB9A; C; 13CA; # CHEROKEE SMALL LETTER QUU +AB9B; C; 13CB; # CHEROKEE SMALL LETTER QUV +AB9C; C; 13CC; # CHEROKEE SMALL LETTER SA +AB9D; C; 13CD; # CHEROKEE SMALL LETTER S +AB9E; C; 13CE; # CHEROKEE SMALL LETTER SE +AB9F; C; 13CF; # CHEROKEE SMALL LETTER SI +ABA0; C; 13D0; # CHEROKEE SMALL LETTER SO +ABA1; C; 13D1; # CHEROKEE SMALL LETTER SU +ABA2; C; 13D2; # CHEROKEE SMALL LETTER SV +ABA3; C; 13D3; # CHEROKEE SMALL LETTER DA +ABA4; C; 13D4; # CHEROKEE SMALL LETTER TA +ABA5; C; 13D5; # CHEROKEE SMALL LETTER DE +ABA6; C; 13D6; # CHEROKEE SMALL LETTER TE +ABA7; C; 13D7; # CHEROKEE SMALL LETTER DI +ABA8; C; 13D8; # CHEROKEE SMALL LETTER TI +ABA9; C; 13D9; # CHEROKEE SMALL LETTER DO +ABAA; C; 13DA; # CHEROKEE SMALL LETTER DU +ABAB; C; 13DB; # CHEROKEE SMALL LETTER DV +ABAC; C; 13DC; # CHEROKEE SMALL LETTER DLA +ABAD; C; 13DD; # CHEROKEE SMALL LETTER TLA +ABAE; C; 13DE; # CHEROKEE SMALL LETTER TLE +ABAF; C; 13DF; # CHEROKEE SMALL LETTER TLI +ABB0; C; 13E0; # CHEROKEE SMALL LETTER TLO +ABB1; C; 13E1; # CHEROKEE SMALL LETTER TLU +ABB2; C; 13E2; # CHEROKEE SMALL LETTER TLV +ABB3; C; 13E3; # CHEROKEE SMALL LETTER TSA +ABB4; C; 13E4; # CHEROKEE SMALL LETTER TSE +ABB5; C; 13E5; # CHEROKEE SMALL LETTER TSI +ABB6; C; 13E6; # CHEROKEE SMALL LETTER TSO +ABB7; C; 13E7; # CHEROKEE SMALL LETTER TSU +ABB8; C; 13E8; # CHEROKEE SMALL LETTER TSV +ABB9; C; 13E9; # CHEROKEE SMALL LETTER WA +ABBA; C; 13EA; # CHEROKEE SMALL LETTER WE +ABBB; C; 13EB; # CHEROKEE SMALL LETTER WI +ABBC; C; 13EC; # CHEROKEE SMALL LETTER WO +ABBD; C; 13ED; # CHEROKEE SMALL LETTER WU +ABBE; C; 13EE; # CHEROKEE SMALL LETTER WV +ABBF; C; 13EF; # CHEROKEE SMALL LETTER YA FB00; F; 0066 0066; # LATIN SMALL LIGATURE FF FB01; F; 0066 0069; # LATIN SMALL LIGATURE FI FB02; F; 0066 006C; # LATIN SMALL LIGATURE FL @@ -1237,6 +1338,93 @@ FF3A; C; FF5A; # FULLWIDTH LATIN CAPITAL LETTER Z 10425; C; 1044D; # DESERET CAPITAL LETTER ENG 10426; C; 1044E; # DESERET CAPITAL LETTER OI 10427; C; 1044F; # DESERET CAPITAL LETTER EW +104B0; C; 104D8; # OSAGE CAPITAL LETTER A +104B1; C; 104D9; # OSAGE CAPITAL LETTER AI +104B2; C; 104DA; # OSAGE CAPITAL LETTER AIN +104B3; C; 104DB; # OSAGE CAPITAL LETTER AH +104B4; C; 104DC; # OSAGE CAPITAL LETTER BRA +104B5; C; 104DD; # OSAGE CAPITAL LETTER CHA +104B6; C; 104DE; # OSAGE CAPITAL LETTER EHCHA +104B7; C; 104DF; # OSAGE CAPITAL LETTER E +104B8; C; 104E0; # OSAGE CAPITAL LETTER EIN +104B9; C; 104E1; # OSAGE CAPITAL LETTER HA +104BA; C; 104E2; # OSAGE CAPITAL LETTER HYA +104BB; C; 104E3; # OSAGE CAPITAL LETTER I +104BC; C; 104E4; # OSAGE CAPITAL LETTER KA +104BD; C; 104E5; # OSAGE CAPITAL LETTER EHKA +104BE; C; 104E6; # OSAGE CAPITAL LETTER KYA +104BF; C; 104E7; # OSAGE CAPITAL LETTER LA +104C0; C; 104E8; # OSAGE CAPITAL LETTER MA +104C1; C; 104E9; # OSAGE CAPITAL LETTER NA +104C2; C; 104EA; # OSAGE CAPITAL LETTER O +104C3; C; 104EB; # OSAGE CAPITAL LETTER OIN +104C4; C; 104EC; # OSAGE CAPITAL LETTER PA +104C5; C; 104ED; # OSAGE CAPITAL LETTER EHPA +104C6; C; 104EE; # OSAGE CAPITAL LETTER SA +104C7; C; 104EF; # OSAGE CAPITAL LETTER SHA +104C8; C; 104F0; # OSAGE CAPITAL LETTER TA +104C9; C; 104F1; # OSAGE CAPITAL LETTER EHTA +104CA; C; 104F2; # OSAGE CAPITAL LETTER TSA +104CB; C; 104F3; # OSAGE CAPITAL LETTER EHTSA +104CC; C; 104F4; # OSAGE CAPITAL LETTER TSHA +104CD; C; 104F5; # OSAGE CAPITAL LETTER DHA +104CE; C; 104F6; # OSAGE CAPITAL LETTER U +104CF; C; 104F7; # OSAGE CAPITAL LETTER WA +104D0; C; 104F8; # OSAGE CAPITAL LETTER KHA +104D1; C; 104F9; # OSAGE CAPITAL LETTER GHA +104D2; C; 104FA; # OSAGE CAPITAL LETTER ZA +104D3; C; 104FB; # OSAGE CAPITAL LETTER ZHA +10C80; C; 10CC0; # OLD HUNGARIAN CAPITAL LETTER A +10C81; C; 10CC1; # OLD HUNGARIAN CAPITAL LETTER AA +10C82; C; 10CC2; # OLD HUNGARIAN CAPITAL LETTER EB +10C83; C; 10CC3; # OLD HUNGARIAN CAPITAL LETTER AMB +10C84; C; 10CC4; # OLD HUNGARIAN CAPITAL LETTER EC +10C85; C; 10CC5; # OLD HUNGARIAN CAPITAL LETTER ENC +10C86; C; 10CC6; # OLD HUNGARIAN CAPITAL LETTER ECS +10C87; C; 10CC7; # OLD HUNGARIAN CAPITAL LETTER ED +10C88; C; 10CC8; # OLD HUNGARIAN CAPITAL LETTER AND +10C89; C; 10CC9; # OLD HUNGARIAN CAPITAL LETTER E +10C8A; C; 10CCA; # OLD HUNGARIAN CAPITAL LETTER CLOSE E +10C8B; C; 10CCB; # OLD HUNGARIAN CAPITAL LETTER EE +10C8C; C; 10CCC; # OLD HUNGARIAN CAPITAL LETTER EF +10C8D; C; 10CCD; # OLD HUNGARIAN CAPITAL LETTER EG +10C8E; C; 10CCE; # OLD HUNGARIAN CAPITAL LETTER EGY +10C8F; C; 10CCF; # OLD HUNGARIAN CAPITAL LETTER EH +10C90; C; 10CD0; # OLD HUNGARIAN CAPITAL LETTER I +10C91; C; 10CD1; # OLD HUNGARIAN CAPITAL LETTER II +10C92; C; 10CD2; # OLD HUNGARIAN CAPITAL LETTER EJ +10C93; C; 10CD3; # OLD HUNGARIAN CAPITAL LETTER EK +10C94; C; 10CD4; # OLD HUNGARIAN CAPITAL LETTER AK +10C95; C; 10CD5; # OLD HUNGARIAN CAPITAL LETTER UNK +10C96; C; 10CD6; # OLD HUNGARIAN CAPITAL LETTER EL +10C97; C; 10CD7; # OLD HUNGARIAN CAPITAL LETTER ELY +10C98; C; 10CD8; # OLD HUNGARIAN CAPITAL LETTER EM +10C99; C; 10CD9; # OLD HUNGARIAN CAPITAL LETTER EN +10C9A; C; 10CDA; # OLD HUNGARIAN CAPITAL LETTER ENY +10C9B; C; 10CDB; # OLD HUNGARIAN CAPITAL LETTER O +10C9C; C; 10CDC; # OLD HUNGARIAN CAPITAL LETTER OO +10C9D; C; 10CDD; # OLD HUNGARIAN CAPITAL LETTER NIKOLSBURG OE +10C9E; C; 10CDE; # OLD HUNGARIAN CAPITAL LETTER RUDIMENTA OE +10C9F; C; 10CDF; # OLD HUNGARIAN CAPITAL LETTER OEE +10CA0; C; 10CE0; # OLD HUNGARIAN CAPITAL LETTER EP +10CA1; C; 10CE1; # OLD HUNGARIAN CAPITAL LETTER EMP +10CA2; C; 10CE2; # OLD HUNGARIAN CAPITAL LETTER ER +10CA3; C; 10CE3; # OLD HUNGARIAN CAPITAL LETTER SHORT ER +10CA4; C; 10CE4; # OLD HUNGARIAN CAPITAL LETTER ES +10CA5; C; 10CE5; # OLD HUNGARIAN CAPITAL LETTER ESZ +10CA6; C; 10CE6; # OLD HUNGARIAN CAPITAL LETTER ET +10CA7; C; 10CE7; # OLD HUNGARIAN CAPITAL LETTER ENT +10CA8; C; 10CE8; # OLD HUNGARIAN CAPITAL LETTER ETY +10CA9; C; 10CE9; # OLD HUNGARIAN CAPITAL LETTER ECH +10CAA; C; 10CEA; # OLD HUNGARIAN CAPITAL LETTER U +10CAB; C; 10CEB; # OLD HUNGARIAN CAPITAL LETTER UU +10CAC; C; 10CEC; # OLD HUNGARIAN CAPITAL LETTER NIKOLSBURG UE +10CAD; C; 10CED; # OLD HUNGARIAN CAPITAL LETTER RUDIMENTA UE +10CAE; C; 10CEE; # OLD HUNGARIAN CAPITAL LETTER EV +10CAF; C; 10CEF; # OLD HUNGARIAN CAPITAL LETTER EZ +10CB0; C; 10CF0; # OLD HUNGARIAN CAPITAL LETTER EZS +10CB1; C; 10CF1; # OLD HUNGARIAN CAPITAL LETTER ENT-SHAPED SIGN +10CB2; C; 10CF2; # OLD HUNGARIAN CAPITAL LETTER US 118A0; C; 118C0; # WARANG CITI CAPITAL LETTER NGAA 118A1; C; 118C1; # WARANG CITI CAPITAL LETTER A 118A2; C; 118C2; # WARANG CITI CAPITAL LETTER WI @@ -1269,5 +1457,39 @@ FF3A; C; FF5A; # FULLWIDTH LATIN CAPITAL LETTER Z 118BD; C; 118DD; # WARANG CITI CAPITAL LETTER SSUU 118BE; C; 118DE; # WARANG CITI CAPITAL LETTER SII 118BF; C; 118DF; # WARANG CITI CAPITAL LETTER VIYO +1E900; C; 1E922; # ADLAM CAPITAL LETTER ALIF +1E901; C; 1E923; # ADLAM CAPITAL LETTER DAALI +1E902; C; 1E924; # ADLAM CAPITAL LETTER LAAM +1E903; C; 1E925; # ADLAM CAPITAL LETTER MIIM +1E904; C; 1E926; # ADLAM CAPITAL LETTER BA +1E905; C; 1E927; # ADLAM CAPITAL LETTER SINNYIIYHE +1E906; C; 1E928; # ADLAM CAPITAL LETTER PE +1E907; C; 1E929; # ADLAM CAPITAL LETTER BHE +1E908; C; 1E92A; # ADLAM CAPITAL LETTER RA +1E909; C; 1E92B; # ADLAM CAPITAL LETTER E +1E90A; C; 1E92C; # ADLAM CAPITAL LETTER FA +1E90B; C; 1E92D; # ADLAM CAPITAL LETTER I +1E90C; C; 1E92E; # ADLAM CAPITAL LETTER O +1E90D; C; 1E92F; # ADLAM CAPITAL LETTER DHA +1E90E; C; 1E930; # ADLAM CAPITAL LETTER YHE +1E90F; C; 1E931; # ADLAM CAPITAL LETTER WAW +1E910; C; 1E932; # ADLAM CAPITAL LETTER NUN +1E911; C; 1E933; # ADLAM CAPITAL LETTER KAF +1E912; C; 1E934; # ADLAM CAPITAL LETTER YA +1E913; C; 1E935; # ADLAM CAPITAL LETTER U +1E914; C; 1E936; # ADLAM CAPITAL LETTER JIIM +1E915; C; 1E937; # ADLAM CAPITAL LETTER CHI +1E916; C; 1E938; # ADLAM CAPITAL LETTER HA +1E917; C; 1E939; # ADLAM CAPITAL LETTER QAAF +1E918; C; 1E93A; # ADLAM CAPITAL LETTER GA +1E919; C; 1E93B; # ADLAM CAPITAL LETTER NYA +1E91A; C; 1E93C; # ADLAM CAPITAL LETTER TU +1E91B; C; 1E93D; # ADLAM CAPITAL LETTER NHA +1E91C; C; 1E93E; # ADLAM CAPITAL LETTER VA +1E91D; C; 1E93F; # ADLAM CAPITAL LETTER KHA +1E91E; C; 1E940; # ADLAM CAPITAL LETTER GBE +1E91F; C; 1E941; # ADLAM CAPITAL LETTER ZAL +1E920; C; 1E942; # ADLAM CAPITAL LETTER KPO +1E921; C; 1E943; # ADLAM CAPITAL LETTER SHA # # EOF diff --git a/src/build.rs b/src/build.rs index ec8f19f..80f619a 100644 --- a/src/build.rs +++ b/src/build.rs @@ -13,17 +13,22 @@ const MAX_FOLDED_CODE_POINTS: usize = 3; fn main() { let mut lines = include_str!("../CaseFolding.txt").lines(); let first_line = lines.next().unwrap(); - let version_regex = Regex::new(r"^# CaseFolding-(\d.\d.\d).txt$").unwrap(); - let unicode_version = &version_regex.captures(first_line).unwrap()[1]; + let version_regex = Regex::new(r"^# CaseFolding-(\d+)\.(\d+)\.(\d+).txt$").unwrap(); + let unicode_version = &version_regex.captures(first_line).unwrap(); + let (major, minor, patch): (u64, u64, u64) = ( + unicode_version[1].parse().unwrap(), + unicode_version[2].parse().unwrap(), + unicode_version[3].parse().unwrap(), + ); let dst = Path::new(&env::var("OUT_DIR").unwrap()).join("case_folding_data.rs"); - let mut f = &mut File::create(&dst).unwrap(); + let f = &mut File::create(&dst).unwrap(); macro_rules! w { ($($args: tt)+) => { (write!(f, $($args)+)).unwrap(); } }; - w!("pub const UNICODE_VERSION: &'static str = \"{}\";\n", unicode_version); + w!("pub const UNICODE_VERSION: (u64, u64, u64) = ({}, {}, {});\n", major, minor, patch); w!("const CASE_FOLDING_TABLE: &'static [(char, [char; 3])] = &[\n"); // Entry with C (common case folding) or F (full case folding) status