Diffstat (limited to 'compiler/rustc_data_structures/src')
-rw-r--r--  compiler/rustc_data_structures/src/aligned.rs   |   3
-rw-r--r--  compiler/rustc_data_structures/src/flock.rs     |  13
-rw-r--r--  compiler/rustc_data_structures/src/lib.rs       |   8
-rw-r--r--  compiler/rustc_data_structures/src/marker.rs    |   3
-rw-r--r--  compiler/rustc_data_structures/src/profiling.rs | 113
-rw-r--r--  compiler/rustc_data_structures/src/vec_cache.rs |   2
6 files changed, 100 insertions, 42 deletions
diff --git a/compiler/rustc_data_structures/src/aligned.rs b/compiler/rustc_data_structures/src/aligned.rs
index 111740e5509..bfc7556faf6 100644
--- a/compiler/rustc_data_structures/src/aligned.rs
+++ b/compiler/rustc_data_structures/src/aligned.rs
@@ -1,7 +1,6 @@
+use std::marker::PointeeSized;
 use std::ptr::Alignment;
 
-use rustc_serialize::PointeeSized;
-
 /// Returns the ABI-required minimum alignment of a type in bytes.
 ///
 /// This is equivalent to [`align_of`], but also works for some unsized
diff --git a/compiler/rustc_data_structures/src/flock.rs b/compiler/rustc_data_structures/src/flock.rs
index 60ae7ad115a..f33f6b7cac1 100644
--- a/compiler/rustc_data_structures/src/flock.rs
+++ b/compiler/rustc_data_structures/src/flock.rs
@@ -4,18 +4,7 @@
 //! green/native threading. This is just a bare-bones enough solution for
 //! librustdoc, it is not production quality at all.
 
-// cfg(bootstrap)
-macro_rules! cfg_select_dispatch {
-    ($($tokens:tt)*) => {
-        #[cfg(bootstrap)]
-        cfg_match! { $($tokens)* }
-
-        #[cfg(not(bootstrap))]
-        cfg_select! { $($tokens)* }
-    };
-}
-
-cfg_select_dispatch! {
+cfg_select! {
     target_os = "linux" => {
         mod linux;
         use linux as imp;
diff --git a/compiler/rustc_data_structures/src/lib.rs b/compiler/rustc_data_structures/src/lib.rs
index 0431182e9e2..53178d09348 100644
--- a/compiler/rustc_data_structures/src/lib.rs
+++ b/compiler/rustc_data_structures/src/lib.rs
@@ -10,9 +10,6 @@
 #![allow(internal_features)]
 #![allow(rustc::default_hash_types)]
 #![allow(rustc::potential_query_instability)]
-#![cfg_attr(bootstrap, feature(cfg_match))]
-#![cfg_attr(not(bootstrap), feature(cfg_select))]
-#![cfg_attr(not(bootstrap), feature(sized_hierarchy))]
 #![deny(unsafe_op_in_unsafe_fn)]
 #![doc(html_root_url = "https://doc.rust-lang.org/nightly/nightly-rustc/")]
 #![doc(rust_logo)]
@@ -22,6 +19,7 @@
 #![feature(ascii_char_variants)]
 #![feature(assert_matches)]
 #![feature(auto_traits)]
+#![feature(cfg_select)]
 #![feature(core_intrinsics)]
 #![feature(dropck_eyepatch)]
 #![feature(extend_one)]
@@ -33,6 +31,7 @@
 #![feature(ptr_alignment_type)]
 #![feature(rustc_attrs)]
 #![feature(rustdoc_internals)]
+#![feature(sized_hierarchy)]
 #![feature(test)]
 #![feature(thread_id_value)]
 #![feature(type_alias_impl_trait)]
@@ -44,9 +43,6 @@ use std::fmt;
 pub use atomic_ref::AtomicRef;
 pub use ena::{snapshot_vec, undo_log, unify};
 pub use rustc_index::static_assert_size;
-// re-exported for `rustc_smir`
-// FIXME(sized_hierarchy): remove with `cfg(bootstrap)`, see `rustc_serialize/src/lib.rs`
-pub use rustc_serialize::PointeeSized;
 
 pub mod aligned;
 pub mod base_n;
diff --git a/compiler/rustc_data_structures/src/marker.rs b/compiler/rustc_data_structures/src/marker.rs
index 4846bc997f1..2be9ba292f9 100644
--- a/compiler/rustc_data_structures/src/marker.rs
+++ b/compiler/rustc_data_structures/src/marker.rs
@@ -1,6 +1,5 @@
 use std::alloc::Allocator;
-
-use rustc_serialize::PointeeSized;
+use std::marker::PointeeSized;
 
 #[diagnostic::on_unimplemented(message = "`{Self}` doesn't implement `DynSend`. \
 Add it to `rustc_data_structures::marker` or use `IntoDynSyncSend` if it's already `Send`")]
diff --git a/compiler/rustc_data_structures/src/profiling.rs b/compiler/rustc_data_structures/src/profiling.rs
index e3a01e4035c..1b4db7adc27 100644
--- a/compiler/rustc_data_structures/src/profiling.rs
+++ b/compiler/rustc_data_structures/src/profiling.rs
@@ -88,6 +88,7 @@ use std::fmt::Display;
 use std::intrinsics::unlikely;
 use std::path::Path;
 use std::sync::Arc;
+use std::sync::atomic::Ordering;
 use std::time::{Duration, Instant};
 use std::{fs, process};
 
@@ -99,12 +100,15 @@ use tracing::warn;
 
 use crate::fx::FxHashMap;
 use crate::outline;
+use crate::sync::AtomicU64;
 
 bitflags::bitflags! {
     #[derive(Clone, Copy)]
     struct EventFilter: u16 {
         const GENERIC_ACTIVITIES = 1 << 0;
         const QUERY_PROVIDERS = 1 << 1;
+        /// Store detailed instant events, including timestamp and thread ID,
+        /// per each query cache hit. Note that this is quite expensive.
         const QUERY_CACHE_HITS = 1 << 2;
         const QUERY_BLOCKED = 1 << 3;
         const INCR_CACHE_LOADS = 1 << 4;
@@ -113,16 +117,20 @@ bitflags::bitflags! {
         const FUNCTION_ARGS = 1 << 6;
         const LLVM = 1 << 7;
         const INCR_RESULT_HASHING = 1 << 8;
-        const ARTIFACT_SIZES = 1 << 9;
+        const ARTIFACT_SIZES = 1 << 9;
+        /// Store aggregated counts of cache hits per query invocation.
+        const QUERY_CACHE_HIT_COUNTS = 1 << 10;
 
         const DEFAULT = Self::GENERIC_ACTIVITIES.bits() |
             Self::QUERY_PROVIDERS.bits() |
             Self::QUERY_BLOCKED.bits() |
             Self::INCR_CACHE_LOADS.bits() |
             Self::INCR_RESULT_HASHING.bits() |
-            Self::ARTIFACT_SIZES.bits();
+            Self::ARTIFACT_SIZES.bits() |
+            Self::QUERY_CACHE_HIT_COUNTS.bits();
 
         const ARGS = Self::QUERY_KEYS.bits() | Self::FUNCTION_ARGS.bits();
+        const QUERY_CACHE_HIT_COMBINED = Self::QUERY_CACHE_HITS.bits() | Self::QUERY_CACHE_HIT_COUNTS.bits();
     }
 }
 
@@ -134,6 +142,7 @@ const EVENT_FILTERS_BY_NAME: &[(&str, EventFilter)] = &[
     ("generic-activity", EventFilter::GENERIC_ACTIVITIES),
     ("query-provider", EventFilter::QUERY_PROVIDERS),
     ("query-cache-hit", EventFilter::QUERY_CACHE_HITS),
+    ("query-cache-hit-count", EventFilter::QUERY_CACHE_HIT_COUNTS),
    ("query-blocked", EventFilter::QUERY_BLOCKED),
     ("incr-cache-load", EventFilter::INCR_CACHE_LOADS),
     ("query-keys", EventFilter::QUERY_KEYS),
@@ -411,13 +420,24 @@ impl SelfProfilerRef {
         #[inline(never)]
         #[cold]
         fn cold_call(profiler_ref: &SelfProfilerRef, query_invocation_id: QueryInvocationId) {
-            profiler_ref.instant_query_event(
-                |profiler| profiler.query_cache_hit_event_kind,
-                query_invocation_id,
-            );
+            if profiler_ref.event_filter_mask.contains(EventFilter::QUERY_CACHE_HIT_COUNTS) {
+                profiler_ref
+                    .profiler
+                    .as_ref()
+                    .unwrap()
+                    .increment_query_cache_hit_counters(QueryInvocationId(query_invocation_id.0));
+            }
+            if unlikely(profiler_ref.event_filter_mask.contains(EventFilter::QUERY_CACHE_HITS)) {
+                profiler_ref.instant_query_event(
+                    |profiler| profiler.query_cache_hit_event_kind,
+                    query_invocation_id,
+                );
+            }
         }
 
-        if unlikely(self.event_filter_mask.contains(EventFilter::QUERY_CACHE_HITS)) {
+        // We check both kinds of query cache hit events at once, to reduce overhead in the
+        // common case (with self-profile disabled).
+        if unlikely(self.event_filter_mask.intersects(EventFilter::QUERY_CACHE_HIT_COMBINED)) {
             cold_call(self, query_invocation_id);
         }
     }
@@ -489,6 +509,35 @@ impl SelfProfilerRef {
         self.profiler.as_ref().map(|p| p.get_or_alloc_cached_string(s))
     }
 
+    /// Store query cache hits to the self-profile log.
+    /// Should be called once at the end of the compilation session.
+    ///
+    /// The cache hits are stored per **query invocation**, not **per query kind/type**.
+    /// `analyzeme` can later deduplicate individual query labels from the QueryInvocationId event
+    /// IDs.
+    pub fn store_query_cache_hits(&self) {
+        if self.event_filter_mask.contains(EventFilter::QUERY_CACHE_HIT_COUNTS) {
+            let profiler = self.profiler.as_ref().unwrap();
+            let query_hits = profiler.query_hits.read();
+            let builder = EventIdBuilder::new(&profiler.profiler);
+            let thread_id = get_thread_id();
+            for (query_invocation, hit_count) in query_hits.iter().enumerate() {
+                let hit_count = hit_count.load(Ordering::Relaxed);
+                // No need to record empty cache hit counts
+                if hit_count > 0 {
+                    let event_id =
+                        builder.from_label(StringId::new_virtual(query_invocation as u64));
+                    profiler.profiler.record_integer_event(
+                        profiler.query_cache_hit_count_event_kind,
+                        event_id,
+                        thread_id,
+                        hit_count,
+                    );
+                }
+            }
+        }
+    }
+
     #[inline]
     pub fn enabled(&self) -> bool {
         self.profiler.is_some()
@@ -537,6 +586,19 @@ pub struct SelfProfiler {
 
     string_cache: RwLock<FxHashMap<String, StringId>>,
 
+    /// Recording individual query cache hits as "instant" measureme events
+    /// is incredibly expensive. Instead of doing that, we simply aggregate
+    /// cache hit *counts* per query invocation, and then store the final count
+    /// of cache hits per invocation at the end of the compilation session.
+    ///
+    /// With this approach, we don't know the individual thread IDs and timestamps
+    /// of cache hits, but it has very little overhead on top of `-Zself-profile`.
+    /// Recording the cache hits as individual events made compilation 3-5x slower.
+    ///
+    /// Query invocation IDs should be monotonic integers, so we can store them in a vec,
+    /// rather than using a hashmap.
+    query_hits: RwLock<Vec<AtomicU64>>,
+
     query_event_kind: StringId,
     generic_activity_event_kind: StringId,
     incremental_load_result_event_kind: StringId,
@@ -544,6 +606,8 @@ pub struct SelfProfiler {
     query_blocked_event_kind: StringId,
     query_cache_hit_event_kind: StringId,
     artifact_size_event_kind: StringId,
+    /// Total cache hits per query invocation
+    query_cache_hit_count_event_kind: StringId,
 }
 
 impl SelfProfiler {
@@ -573,6 +637,7 @@ impl SelfProfiler {
         let query_blocked_event_kind = profiler.alloc_string("QueryBlocked");
         let query_cache_hit_event_kind = profiler.alloc_string("QueryCacheHit");
         let artifact_size_event_kind = profiler.alloc_string("ArtifactSize");
+        let query_cache_hit_count_event_kind = profiler.alloc_string("QueryCacheHitCount");
 
         let mut event_filter_mask = EventFilter::empty();
 
@@ -618,6 +683,8 @@ impl SelfProfiler {
             query_blocked_event_kind,
             query_cache_hit_event_kind,
             artifact_size_event_kind,
+            query_cache_hit_count_event_kind,
+            query_hits: Default::default(),
         })
     }
 
@@ -627,6 +694,25 @@ impl SelfProfiler {
         self.profiler.alloc_string(s)
     }
 
+    /// Store a cache hit of a query invocation
+    pub fn increment_query_cache_hit_counters(&self, id: QueryInvocationId) {
+        // Fast path: assume that the query was already encountered before, and just record
+        // a cache hit.
+        let mut guard = self.query_hits.upgradable_read();
+        let query_hits = &guard;
+        let index = id.0 as usize;
+        if index < query_hits.len() {
+            // We only want to increment the count, no other synchronization is required
+            query_hits[index].fetch_add(1, Ordering::Relaxed);
+        } else {
+            // If not, we need to extend the query hit map to the highest observed ID
+            guard.with_upgraded(|vec| {
+                vec.resize_with(index + 1, || AtomicU64::new(0));
+                vec[index] = AtomicU64::from(1);
+            });
+        }
+    }
+
     /// Gets a `StringId` for the given string. This method makes sure that
     /// any strings going through it will only be allocated once in the
     /// profiling data.
@@ -859,19 +945,8 @@ fn get_thread_id() -> u32 {
     std::thread::current().id().as_u64().get() as u32
 }
 
-// cfg(bootstrap)
-macro_rules! cfg_select_dispatch {
-    ($($tokens:tt)*) => {
-        #[cfg(bootstrap)]
-        cfg_match! { $($tokens)* }
-
-        #[cfg(not(bootstrap))]
-        cfg_select! { $($tokens)* }
-    };
-}
-
 // Memory reporting
-cfg_select_dispatch! {
+cfg_select! {
     windows => {
         pub fn get_resident_set_size() -> Option<usize> {
             use windows::{
diff --git a/compiler/rustc_data_structures/src/vec_cache.rs b/compiler/rustc_data_structures/src/vec_cache.rs
index df83d15b5f9..599970663db 100644
--- a/compiler/rustc_data_structures/src/vec_cache.rs
+++ b/compiler/rustc_data_structures/src/vec_cache.rs
@@ -257,7 +257,7 @@ unsafe impl<K: Idx, #[may_dangle] V, I> Drop for VecCache<K, V, I> {
         // we are also guaranteed to just need to deallocate any large arrays (not iterate over
         // contents).
         //
-        // Confirm no need to deallocate invidual entries. Note that `V: Copy` is asserted on
+        // Confirm no need to deallocate individual entries. Note that `V: Copy` is asserted on
         // insert/lookup but not necessarily construction, primarily to avoid annoyingly propagating
        // the bounds into struct definitions everywhere.
         assert!(!std::mem::needs_drop::<K>());
