Skip to content

Commit

Permalink
Rollup merge of rust-lang#62848 - matklad:xid-unicode, r=petrochenkov
Browse files Browse the repository at this point in the history
Use unicode-xid crate instead of libcore

This PR proposes to remove `char::is_xid_start` and `char::is_xid_continue` functions from `libcore` and use `unicode_xid` crate from crates.io (note that this crate is already present in rust-lang/rust's Cargo.lock).

Reasons to do this:

* removing rustc-binary-specific stuff from libcore
* making sure that, across the ecosystem, there's a single definition of what rust identifier is (`unicode-xid` has almost 10 million downs, as a `proc_macro2` dependency)
* making it easier to share `rustc_lexer` crate with rust-analyzer: no need to `#[cfg]` if we are building as a part of the compiler

Reasons not to do this:

* increased maintenance burden: we'll need to upgrade unicode version both in libcore and in unicode-xid. However, this shouldn't be a too heavy burden: just running `./unicode.py` after new unicode version. I (@matklad) am ready to be a t-compiler side maintainer of unicode-xid. Moreover, given that xid-unicode is an important dependency of syn, *someone* needs to maintain it anyway.
* xid-unicode implementation is significantly slower. It uses a more compact table with binary search, instead of a trie. However, this shouldn't matter in practice, because we have fast-path for ascii anyway, and code size savings is a plus. Moreover, in rust-lang#59706 not using libcore turned out to be *faster*, presumably beacause checking for whitespace with match is even faster.

<details>

<summary>old description</summary>

Followup to rust-lang#59706

r? @eddyb

Note that this doesn't actually remove tables from libcore, to avoid conflict with rust-lang#62641.

cc unicode-rs/unicode-xid#11

</details>
  • Loading branch information
Centril authored Sep 5, 2019
2 parents a24f636 + 206fe8e commit e0630ab
Show file tree
Hide file tree
Showing 17 changed files with 111 additions and 510 deletions.
17 changes: 13 additions & 4 deletions Cargo.lock
Original file line number Diff line number Diff line change
Expand Up @@ -1011,6 +1011,7 @@ dependencies = [
name = "fmt_macros"
version = "0.0.0"
dependencies = [
"rustc_lexer",
"syntax_pos",
]

Expand Down Expand Up @@ -2372,7 +2373,7 @@ version = "0.4.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf3d2011ab5c909338f7887f4fc896d35932e29146c12c8d01da6b22a80ba759"
dependencies = [
"unicode-xid",
"unicode-xid 0.1.0",
]

[[package]]
Expand Down Expand Up @@ -3290,7 +3291,7 @@ dependencies = [
name = "rustc_lexer"
version = "0.1.0"
dependencies = [
"unicode-xid",
"unicode-xid 0.2.0",
]

[[package]]
Expand Down Expand Up @@ -3368,6 +3369,7 @@ dependencies = [
"rustc_apfloat",
"rustc_data_structures",
"rustc_errors",
"rustc_lexer",
"rustc_target",
"serialize",
"smallvec",
Expand Down Expand Up @@ -3976,7 +3978,7 @@ checksum = "641e117d55514d6d918490e47102f7e08d096fdde360247e4a10f7a91a8478d3"
dependencies = [
"proc-macro2",
"quote",
"unicode-xid",
"unicode-xid 0.1.0",
]

[[package]]
Expand All @@ -3988,7 +3990,7 @@ dependencies = [
"proc-macro2",
"quote",
"syn",
"unicode-xid",
"unicode-xid 0.1.0",
]

[[package]]
Expand Down Expand Up @@ -4017,6 +4019,7 @@ dependencies = [
"log",
"rustc_data_structures",
"rustc_errors",
"rustc_lexer",
"rustc_target",
"smallvec",
"syntax",
Expand Down Expand Up @@ -4532,6 +4535,12 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"

[[package]]
name = "unicode-xid"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c"

[[package]]
name = "unicode_categories"
version = "0.1.1"
Expand Down
23 changes: 0 additions & 23 deletions src/libcore/char/methods.rs
Original file line number Diff line number Diff line change
Expand Up @@ -547,29 +547,6 @@ impl char {
}
}

/// Returns `true` if this `char` satisfies the `XID_Start` Unicode property, and false
/// otherwise.
///
/// `XID_Start` is a Unicode Derived Property specified in
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
/// mostly similar to `ID_Start` but modified for closure under `NFKx`.
#[unstable(feature = "unicode_internals", issue = "0")]
pub fn is_xid_start(self) -> bool {
derived_property::XID_Start(self)
}

/// Returns `true` if this `char` satisfies the `XID_Continue` Unicode property, and false
/// otherwise.
///
/// `XID_Continue` is a Unicode Derived Property specified in
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
/// mostly similar to `ID_Continue` but modified for closure under NFKx.
#[unstable(feature = "unicode_internals", issue = "0")]
#[inline]
pub fn is_xid_continue(self) -> bool {
derived_property::XID_Continue(self)
}

/// Returns `true` if this `char` is lowercase.
///
/// 'Lowercase' is defined according to the terms of the Unicode Derived Core
Expand Down
5 changes: 0 additions & 5 deletions src/libcore/unicode/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,3 @@ pub mod derived_property {
pub mod conversions {
pub use crate::unicode::tables::conversions::{to_lower, to_upper};
}

// For use in libsyntax
pub mod property {
pub use crate::unicode::tables::property::Pattern_White_Space;
}
375 changes: 0 additions & 375 deletions src/libcore/unicode/tables.rs

Large diffs are not rendered by default.

9 changes: 4 additions & 5 deletions src/libcore/unicode/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -728,7 +728,7 @@ def generate_property_module(mod, grouped_categories, category_subset):

yield "pub(crate) mod %s {\n" % mod
for cat in sorted(category_subset):
if cat in ("Cc", "White_Space", "Pattern_White_Space"):
if cat in ("Cc", "White_Space"):
generator = generate_small_bool_trie("%s_table" % cat, grouped_categories[cat])
else:
generator = generate_bool_trie("%s_table" % cat, grouped_categories[cat])
Expand Down Expand Up @@ -841,19 +841,18 @@ def main():
unicode_data = load_unicode_data(get_path(UnicodeFiles.UNICODE_DATA))
load_special_casing(get_path(UnicodeFiles.SPECIAL_CASING), unicode_data)

want_derived = {"XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase",
want_derived = {"Alphabetic", "Lowercase", "Uppercase",
"Cased", "Case_Ignorable", "Grapheme_Extend"}
derived = load_properties(get_path(UnicodeFiles.DERIVED_CORE_PROPERTIES), want_derived)

props = load_properties(get_path(UnicodeFiles.PROPS),
{"White_Space", "Join_Control", "Noncharacter_Code_Point",
"Pattern_White_Space"})
{"White_Space", "Join_Control", "Noncharacter_Code_Point"})

# Category tables
for (name, categories, category_subset) in (
("general_category", unicode_data.general_categories, ["N", "Cc"]),
("derived_property", derived, want_derived),
("property", props, ["White_Space", "Pattern_White_Space"])
("property", props, ["White_Space"])
):
for fragment in generate_property_module(name, categories, category_subset):
buf.write(fragment)
Expand Down
2 changes: 1 addition & 1 deletion src/libfmt_macros/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,4 @@ path = "lib.rs"

[dependencies]
syntax_pos = { path = "../libsyntax_pos" }

rustc_lexer = { path = "../librustc_lexer" }
9 changes: 4 additions & 5 deletions src/libfmt_macros/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -597,12 +597,11 @@ impl<'a> Parser<'a> {
}
}

/// Parses a word starting at the current position. A word is considered to
/// be an alphabetic character followed by any number of alphanumeric
/// characters.
/// Parses a word starting at the current position. A word is the same as
/// Rust identifier, except that it can't start with `_` character.
fn word(&mut self) -> &'a str {
let start = match self.cur.peek() {
Some(&(pos, c)) if c.is_xid_start() => {
Some(&(pos, c)) if c != '_' && rustc_lexer::is_id_start(c) => {
self.cur.next();
pos
}
Expand All @@ -611,7 +610,7 @@ impl<'a> Parser<'a> {
}
};
while let Some(&(pos, c)) = self.cur.peek() {
if c.is_xid_continue() {
if rustc_lexer::is_id_continue(c) {
self.cur.next();
} else {
return &self.input[start..pos];
Expand Down
8 changes: 4 additions & 4 deletions src/librustc_lexer/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@ name = "rustc_lexer"
version = "0.1.0"
edition = "2018"

# Note that this crate purposefully does not depend on other rustc crates
[dependencies]
unicode-xid = { version = "0.1.0", optional = true }

# Note: do not remove this blank `[lib]` section.
# This will be used when publishing this crate as `rustc-ap-rustc_lexer`.
[lib]
doctest = false
name = "rustc_lexer"

# Note that this crate purposefully does not depend on other rustc crates
[dependencies]
unicode-xid = "0.2.0"
Loading

0 comments on commit e0630ab

Please sign in to comment.