From 09e619d62eedee6fb67ae677923c53136a9bf51c Mon Sep 17 00:00:00 2001 From: Mark Rousskov Date: Wed, 27 Dec 2023 20:55:55 -0500 Subject: [PATCH] Shrink span encoding further Spans are now stored in a more compact form which cuts down on at least 1 byte per span (indirect/direct encoding) and at most 3 bytes per span (indirect/direct encoding, context byte, length byte). As a result, libcore metadata shrinks by 1.5MB. --- compiler/rustc_metadata/src/rmeta/decoder.rs | 40 ++++----- compiler/rustc_metadata/src/rmeta/encoder.rs | 44 +++++---- compiler/rustc_metadata/src/rmeta/mod.rs | 93 +++++++++++++++++--- 3 files changed, 130 insertions(+), 47 deletions(-) diff --git a/compiler/rustc_metadata/src/rmeta/decoder.rs b/compiler/rustc_metadata/src/rmeta/decoder.rs index ae4a0e15fab68..2de29db9e5c84 100644 --- a/compiler/rustc_metadata/src/rmeta/decoder.rs +++ b/compiler/rustc_metadata/src/rmeta/decoder.rs @@ -508,21 +508,19 @@ impl<'a, 'tcx> Decodable<DecodeContext<'a, 'tcx>> for ExpnId { impl<'a, 'tcx> Decodable<DecodeContext<'a, 'tcx>> for Span { fn decode(decoder: &mut DecodeContext<'a, 'tcx>) -> Span { let start = decoder.position(); - let mode = SpanEncodingMode::decode(decoder); - let data = match mode { - SpanEncodingMode::Direct => SpanData::decode(decoder), - SpanEncodingMode::RelativeOffset(offset) => { - decoder.with_position(start - offset, |decoder| { - let mode = SpanEncodingMode::decode(decoder); - debug_assert!(matches!(mode, SpanEncodingMode::Direct)); - SpanData::decode(decoder) - }) - } - SpanEncodingMode::AbsoluteOffset(addr) => decoder.with_position(addr, |decoder| { - let mode = SpanEncodingMode::decode(decoder); - debug_assert!(matches!(mode, SpanEncodingMode::Direct)); - SpanData::decode(decoder) - }), + let tag = SpanTag(decoder.peek_byte()); + let data = if tag.kind() == SpanKind::Indirect { + // Skip past the tag we just peek'd.
+ decoder.read_u8(); + let offset_or_position = decoder.read_usize(); + let position = if tag.is_relative_offset() { + start - offset_or_position + } else { + offset_or_position + }; + decoder.with_position(position, SpanData::decode) + } else { + SpanData::decode(decoder) }; Span::new(data.lo, data.hi, data.ctxt, data.parent) } @@ -530,17 +528,17 @@ impl<'a, 'tcx> Decodable<DecodeContext<'a, 'tcx>> for Span { impl<'a, 'tcx> Decodable<DecodeContext<'a, 'tcx>> for SpanData { fn decode(decoder: &mut DecodeContext<'a, 'tcx>) -> SpanData { - let ctxt = SyntaxContext::decode(decoder); - let tag = u8::decode(decoder); + let tag = SpanTag::decode(decoder); + let ctxt = tag.context().unwrap_or_else(|| SyntaxContext::decode(decoder)); - if tag == TAG_PARTIAL_SPAN { + if tag.kind() == SpanKind::Partial { return DUMMY_SP.with_ctxt(ctxt).data(); } - debug_assert!(tag == TAG_VALID_SPAN_LOCAL || tag == TAG_VALID_SPAN_FOREIGN); + debug_assert!(tag.kind() == SpanKind::Local || tag.kind() == SpanKind::Foreign); let lo = BytePos::decode(decoder); - let len = BytePos::decode(decoder); + let len = tag.length().unwrap_or_else(|| BytePos::decode(decoder)); let hi = lo + len; let Some(sess) = decoder.sess else { @@ -581,7 +579,7 @@ impl<'a, 'tcx> Decodable<DecodeContext<'a, 'tcx>> for SpanData { // treat the 'local' and 'foreign' cases almost identically during deserialization: // we can call `imported_source_file` for the proper crate, and binary search // through the returned slice using our span.
- let source_file = if tag == TAG_VALID_SPAN_LOCAL { + let source_file = if tag.kind() == SpanKind::Local { decoder.cdata().imported_source_file(metadata_index, sess) } else { // When we encode a proc-macro crate, all `Span`s should be encoded diff --git a/compiler/rustc_metadata/src/rmeta/encoder.rs b/compiler/rustc_metadata/src/rmeta/encoder.rs index 3e7297a74f983..aca7a66596e64 100644 --- a/compiler/rustc_metadata/src/rmeta/encoder.rs +++ b/compiler/rustc_metadata/src/rmeta/encoder.rs @@ -177,15 +177,17 @@ impl<'a, 'tcx> Encodable> for Span { // previously saved offset must be smaller than the current position. let offset = s.opaque.position() - last_location; if offset < last_location { - SpanEncodingMode::RelativeOffset(offset).encode(s) + SpanTag::indirect(true).encode(s); + offset.encode(s); } else { - SpanEncodingMode::AbsoluteOffset(last_location).encode(s) + SpanTag::indirect(false).encode(s); + last_location.encode(s); } } Entry::Vacant(v) => { let position = s.opaque.position(); v.insert(position); - SpanEncodingMode::Direct.encode(s); + // Data is encoded with a SpanTag prefix (see below). self.data().encode(s); } } @@ -225,14 +227,15 @@ impl<'a, 'tcx> Encodable> for SpanData { // IMPORTANT: If this is ever changed, be sure to update // `rustc_span::hygiene::raw_encode_expn_id` to handle // encoding `ExpnData` for proc-macro crates. 
- if s.is_proc_macro { - SyntaxContext::root().encode(s); - } else { - self.ctxt.encode(s); - } + let ctxt = if s.is_proc_macro { SyntaxContext::root() } else { self.ctxt }; if self.is_dummy() { - return TAG_PARTIAL_SPAN.encode(s); + let tag = SpanTag::new(SpanKind::Partial, ctxt, 0); + tag.encode(s); + if tag.context().is_none() { + ctxt.encode(s); + } + return; } // The Span infrastructure should make sure that this invariant holds: @@ -250,7 +253,12 @@ impl<'a, 'tcx> Encodable> for SpanData { if !source_file.contains(self.hi) { // Unfortunately, macro expansion still sometimes generates Spans // that malformed in this way. - return TAG_PARTIAL_SPAN.encode(s); + let tag = SpanTag::new(SpanKind::Partial, ctxt, 0); + tag.encode(s); + if tag.context().is_none() { + ctxt.encode(s); + } + return; } // There are two possible cases here: @@ -269,7 +277,7 @@ impl<'a, 'tcx> Encodable> for SpanData { // if we're a proc-macro crate. // This allows us to avoid loading the dependencies of proc-macro crates: all of // the information we need to decode `Span`s is stored in the proc-macro crate. - let (tag, metadata_index) = if source_file.is_imported() && !s.is_proc_macro { + let (kind, metadata_index) = if source_file.is_imported() && !s.is_proc_macro { // To simplify deserialization, we 'rebase' this span onto the crate it originally came // from (the crate that 'owns' the file it references. 
These rebased 'lo' and 'hi' // values are relative to the source map information for the 'foreign' crate whose @@ -287,7 +295,7 @@ impl<'a, 'tcx> Encodable> for SpanData { } }; - (TAG_VALID_SPAN_FOREIGN, metadata_index) + (SpanKind::Foreign, metadata_index) } else { // Record the fact that we need to encode the data for this `SourceFile` let source_files = @@ -296,7 +304,7 @@ impl<'a, 'tcx> Encodable> for SpanData { let metadata_index: u32 = metadata_index.try_into().expect("cannot export more than U32_MAX files"); - (TAG_VALID_SPAN_LOCAL, metadata_index) + (SpanKind::Local, metadata_index) }; // Encode the start position relative to the file start, so we profit more from the @@ -307,14 +315,20 @@ impl<'a, 'tcx> Encodable> for SpanData { // from the variable-length integer encoding that we use. let len = self.hi - self.lo; + let tag = SpanTag::new(kind, ctxt, len.0 as usize); tag.encode(s); + if tag.context().is_none() { + ctxt.encode(s); + } lo.encode(s); - len.encode(s); + if tag.length().is_none() { + len.encode(s); + } // Encode the index of the `SourceFile` for the span, in order to make decoding faster. metadata_index.encode(s); - if tag == TAG_VALID_SPAN_FOREIGN { + if kind == SpanKind::Foreign { // This needs to be two lines to avoid holding the `s.source_file_cache` // while calling `cnum.encode(s)` let cnum = s.source_file_cache.0.cnum; diff --git a/compiler/rustc_metadata/src/rmeta/mod.rs b/compiler/rustc_metadata/src/rmeta/mod.rs index bafd3f0b84d29..54ee50c235860 100644 --- a/compiler/rustc_metadata/src/rmeta/mod.rs +++ b/compiler/rustc_metadata/src/rmeta/mod.rs @@ -66,13 +66,6 @@ const METADATA_VERSION: u8 = 9; /// unsigned integer, and further followed by the rustc version string. 
pub const METADATA_HEADER: &[u8] = &[b'r', b'u', b's', b't', 0, 0, 0, METADATA_VERSION]; -#[derive(Encodable, Decodable)] -enum SpanEncodingMode { - RelativeOffset(usize), - AbsoluteOffset(usize), - Direct, -} - /// A value of type T referred to by its absolute position /// in the metadata, and which can be decoded lazily. /// @@ -488,10 +481,88 @@ bitflags::bitflags! { } } -// Tags used for encoding Spans: -const TAG_VALID_SPAN_LOCAL: u8 = 0; -const TAG_VALID_SPAN_FOREIGN: u8 = 1; -const TAG_PARTIAL_SPAN: u8 = 2; +/// A span tag byte encodes a bunch of data, so that we can cut out a few extra bytes from span +/// encodings (which are very common, for example, libcore has ~650,000 unique spans and over 1.1 +/// million references to prior-written spans). +/// +/// The byte format is split into several parts: +/// +/// [ a a a a a c d d ] +/// +/// `a` bits represent the span length. We have 5 bits, so we can store lengths up to 30 inline, with +/// an all-1s pattern representing that the length is stored separately. +/// +/// `c` represents whether the span context is zero (and then it is not stored as a separate varint) +/// for direct span encodings, and whether the offset is absolute or relative otherwise (zero for +/// absolute). +/// +/// d bits represent the kind of span we are storing (local, foreign, partial, indirect). +#[derive(Encodable, Decodable, Copy, Clone)] +struct SpanTag(u8); + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +enum SpanKind { + Local = 0b00, + Foreign = 0b01, + Partial = 0b10, + // Indicates the actual span contents are elsewhere. + // If this is the kind, then the span context bit represents whether it is a relative or + // absolute offset. 
+ Indirect = 0b11, +} + +impl SpanTag { + fn new(kind: SpanKind, context: rustc_span::SyntaxContext, length: usize) -> SpanTag { + let mut data = 0u8; + data |= kind as u8; + if context.is_root() { + data |= 0b100; + } + let all_1s_len = (0xffu8 << 3) >> 3; + // strictly less than - all 1s pattern is a sentinel for storage being out of band. + if length < all_1s_len as usize { + data |= (length as u8) << 3; + } else { + data |= all_1s_len << 3; + } + + SpanTag(data) + } + + fn indirect(relative: bool) -> SpanTag { + let mut tag = SpanTag(SpanKind::Indirect as u8); + if relative { + tag.0 |= 0b100; + } + tag + } + + fn kind(self) -> SpanKind { + let masked = self.0 & 0b11; + match masked { + 0b00 => SpanKind::Local, + 0b01 => SpanKind::Foreign, + 0b10 => SpanKind::Partial, + 0b11 => SpanKind::Indirect, + _ => unreachable!(), + } + } + + fn is_relative_offset(self) -> bool { + debug_assert_eq!(self.kind(), SpanKind::Indirect); + self.0 & 0b100 != 0 + } + + fn context(self) -> Option<rustc_span::SyntaxContext> { + if self.0 & 0b100 != 0 { Some(rustc_span::SyntaxContext::root()) } else { None } + } + + fn length(self) -> Option<rustc_span::BytePos> { + let all_1s_len = (0xffu8 << 3) >> 3; + let len = self.0 >> 3; + if len != all_1s_len { Some(rustc_span::BytePos(u32::from(len))) } else { None } + } +} // Tags for encoding Symbol's const SYMBOL_STR: u8 = 0;