microsoft · wuw92 · Jun 4, 2026 · Jun 5, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/diskann-quantization/Cargo.toml b/diskann-quantization/Cargo.toml
@@ -20,6 +20,13 @@ flatbuffers = { version = "25.2.10", optional = true }
 half = { version = "2.6.0", features = ["bytemuck"] }
 diskann-utils = { workspace = true }
 
+# Cache size detection: x86_64 uses CPUID, aarch64 macOS uses sysctl; aarch64 Linux reads sysfs via std.
+[target.'cfg(target_arch = "x86_64")'.dependencies]
+raw-cpuid = "11.5"
+
+[target.'cfg(all(target_arch = "aarch64", target_os = "macos"))'.dependencies]
+libc = "0.2.148"
+
 [lints.clippy]
 undocumented_unsafe_blocks = "warn"
 unwrap_used = "warn"

diff --git a/diskann-quantization/src/multi_vector/distance/cache/cpuid.rs b/diskann-quantization/src/multi_vector/distance/cache/cpuid.rs
@@ -0,0 +1,48 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+//! x86_64 CPUID cache probe via deterministic cache parameters — `raw-cpuid`'s
+//! cache-parameter enumeration (CPUID `0x4` on Intel, `0x8000001D` on AMD).
+//! Returns `None` when those parameters are unavailable; we don't fall back to
+//! AMD's legacy `0x80000005/06` leaves.
+
+use raw_cpuid::{CacheType, CpuId};
+
+use super::CacheInfo;
+
+/// Probes L1d and L2 sizes through CPUID's cache-parameter enumeration.
+///
+/// The first matching entry per level wins. Returns `None` when the
+/// enumeration is unavailable or either level is never reported.
+pub(super) fn detect() -> Option<CacheInfo> {
+    let cpuid = CpuId::new();
+    let mut l1d = None;
+    let mut l2 = None;
+
+    for entry in cpuid.get_cache_parameters()? {
+        let bytes = cache_size_bytes(&entry);
+        match (entry.level(), entry.cache_type()) {
+            (1, CacheType::Data) if l1d.is_none() => l1d = Some(bytes),
+            // Unified is the norm for L2; a Data-typed L2 is tolerated too.
+            (2, CacheType::Unified | CacheType::Data) if l2.is_none() => l2 = Some(bytes),
+            _ => {}
+        }
+
+        // Both sizes known: skip whatever levels remain.
+        if let (Some(l1d_bytes), Some(l2_bytes)) = (l1d, l2) {
+            return Some(CacheInfo {
+                l1d_bytes,
+                l2_bytes,
+            });
+        }
+    }
+
+    None
+}
+
+/// Total cache size in bytes: ways × line partitions × line size × sets.
+fn cache_size_bytes(cache: &raw_cpuid::CacheParameter) -> usize {
+    let ways = cache.associativity();
+    let bytes_per_set = ways * cache.physical_line_partitions() * cache.coherency_line_size();
+    bytes_per_set * cache.sets()
+}
diff --git a/diskann-quantization/src/multi_vector/distance/cache/linux.rs b/diskann-quantization/src/multi_vector/distance/cache/linux.rs
@@ -0,0 +1,84 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+//! aarch64 Linux cache probe via sysfs (`/sys/devices/system/cpu/cpu0/cache/`).
+//! Returns `None` when the cache sysfs entries are absent, as in some
+//! stripped-down containers.
+
+use std::path::Path;
+
+use super::CacheInfo;
+
+const SYSFS_CACHE_DIR: &str = "/sys/devices/system/cpu/cpu0/cache";
+
+/// Probes L1d and L2 sizes from cpu0's sysfs cache directory.
+///
+/// Entries without a readable `level`, `type`, or `size` are skipped and the
+/// first match per level wins. Returns `None` when the directory cannot be
+/// listed or either level is never found.
+pub(super) fn detect() -> Option<CacheInfo> {
+    // One `index*` subdirectory per cache: (level, type, size in bytes).
+    let describe = |dir: &Path| -> Option<(u32, String, usize)> {
+        let level = read_trim(dir.join("level"))?.parse::<u32>().ok()?;
+        let kind = read_trim(dir.join("type"))?;
+        let bytes = parse_size(&read_trim(dir.join("size"))?)?;
+        Some((level, kind, bytes))
+    };
+
+    let mut l1d_bytes = None;
+    let mut l2_bytes = None;
+    let entries = std::fs::read_dir(SYSFS_CACHE_DIR).ok()?.flatten();
+    for (level, kind, bytes) in entries.filter_map(|entry| describe(&entry.path())) {
+        match (level, kind.as_str()) {
+            (1, "Data") if l1d_bytes.is_none() => l1d_bytes = Some(bytes),
+            // Real hardware reports Unified for L2; Data is tolerated as well.
+            (2, "Unified" | "Data") if l2_bytes.is_none() => l2_bytes = Some(bytes),
+            _ => {}
+        }
+    }
+
+    Some(CacheInfo {
+        l1d_bytes: l1d_bytes?,
+        l2_bytes: l2_bytes?,
+    })
+}
+
+/// Reads the file at `path` as UTF-8 and trims it; `None` on any read error.
+fn read_trim(path: impl AsRef<Path>) -> Option<String> {
+    let contents = std::fs::read_to_string(path).ok()?;
+    Some(contents.trim().to_owned())
+}
+
+/// Parses a sysfs size such as `32K` into bytes; `None` if malformed or overflowing.
+fn parse_size(s: &str) -> Option<usize> {
+    let text = s.trim();
+    let (digits, unit) = text.split_at(text.bytes().take_while(u8::is_ascii_digit).count());
+    let shift = match unit.trim() {
+        "" => 0,
+        "K" | "KB" | "KiB" => 10,
+        "M" | "MB" | "MiB" => 20,
+        "G" | "GB" | "GiB" => 30,
+        _ => return None,
+    };
+    digits.parse::<usize>().ok()?.checked_mul(1 << shift)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Plain, suffixed, padded, malformed and overflowing inputs to `parse_size`.
+    #[test]
+    fn parse_size_handles_common_formats() {
+        assert_eq!(parse_size("32K"), Some(32 * 1024));
+        assert_eq!(parse_size("1024K"), Some(1024 * 1024));
+        assert_eq!(parse_size("8M"), Some(8 * 1024 * 1024));
+        assert_eq!(parse_size("1G"), Some(1024 * 1024 * 1024));
+        assert_eq!(parse_size("4096"), Some(4096));
+        assert_eq!(parse_size(" 32K\n"), Some(32 * 1024));
+        assert_eq!(parse_size("garbage"), None);
+        assert_eq!(parse_size(""), None);
+        // Overflow on the suffix multiply must give `None`, never a wrapped size.
+        assert_eq!(parse_size(&format!("{}K", usize::MAX)), None);
+    }
+}
diff --git a/diskann-quantization/src/multi_vector/distance/cache/macos.rs b/diskann-quantization/src/multi_vector/distance/cache/macos.rs
@@ -0,0 +1,55 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+//! aarch64 macOS sysctl cache probe. Reads `hw.perflevel0.*`
+//! and divides the cluster L2 by `cpusperl2` for a per-core budget.
+
+use core::ffi::{CStr, c_void};
+
+use super::CacheInfo;
+
+/// Probes the `hw.perflevel0.*` sysctls; L2 is normalized to a per-core share.
+pub(super) fn detect() -> Option<CacheInfo> {
+    // L1d is private to each core, so it is used as reported.
+    let l1d_bytes = sysctl_uint(c"hw.perflevel0.l1dcachesize")? as usize;
+    let l2_bytes = perflevel0_l2_per_core()? as usize;
+    Some(CacheInfo {
+        l1d_bytes,
+        l2_bytes,
+    })
+}
+
+/// Per-core share of the cluster L2: `hw.perflevel0.l2cachesize` divided by
+/// `hw.perflevel0.cpusperl2`. `None` if a read fails or the divisor is zero.
+fn perflevel0_l2_per_core() -> Option<u64> {
+    let cluster_bytes = sysctl_uint(c"hw.perflevel0.l2cachesize")?;
+    let sharing_cpus = sysctl_uint(c"hw.perflevel0.cpusperl2")?;
+    cluster_bytes.checked_div(sharing_cpus)
+}
+
+/// Reads an integer sysctl reported as 4 or 8 bytes (cache sizes are 64-bit,
+/// topology counts like `cpusperl2` are 32-bit); `None` on failure or other widths.
+fn sysctl_uint(name: &CStr) -> Option<u64> {
+    let mut raw = [0u8; 8];
+    let mut written = raw.len();
+    // SAFETY: `name` is a valid NUL-terminated C string; `raw` / `written`
+    // are valid and writable, with `written` starting at `raw`'s capacity;
+    // the new* parameters are null/0 because we are only reading.
+    let status = unsafe {
+        libc::sysctlbyname(
+            name.as_ptr(),
+            raw.as_mut_ptr().cast::<c_void>(),
+            &mut written,
+            core::ptr::null_mut(),
+            0,
+        )
+    };
+    match (status, written) {
+        (0, 4) => {
+            let [a, b, c, d, ..] = raw;
+            Some(u64::from(u32::from_ne_bytes([a, b, c, d])))
+        }
+        (0, 8) => Some(u64::from_ne_bytes(raw)),
+        _ => None,
+    }
+}
diff --git a/diskann-quantization/src/multi_vector/distance/cache/mod.rs b/diskann-quantization/src/multi_vector/distance/cache/mod.rs
@@ -0,0 +1,88 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+//! L1d / L2 cache size probe used by the multi-vector tile planner.
+//! Detected once and memoized; returns [`CacheInfo::FALLBACK`] when no
+//! per-platform probe applies or the probe itself fails.
+
+use std::sync::OnceLock;
+
+#[cfg(target_arch = "x86_64")]
+mod cpuid;
+#[cfg(all(target_arch = "aarch64", target_os = "linux"))]
+mod linux;
+#[cfg(all(target_arch = "aarch64", target_os = "macos"))]
+mod macos;
+
+/// Cache sizes in bytes, as probed per platform or taken from the fallback.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub(super) struct CacheInfo {
+    /// L1 **d**ata cache size. L1i is not read: tile budgets only constrain data.
+    pub l1d_bytes: usize,
+    /// L2 cache size (a per-core share where the probe normalizes it).
+    pub l2_bytes: usize,
+}
+
+impl CacheInfo {
+    /// Defaults (32 KiB L1d, 256 KiB L2) used when no probe applies or the probe fails.
+    pub(super) const FALLBACK: Self = Self {
+        l1d_bytes: 32 * 1024,
+        l2_bytes: 256 * 1024,
+    };
+}
+
+pub(super) fn cache_info() -> CacheInfo {
+    static PROBED: OnceLock<CacheInfo> = OnceLock::new(); // filled on the first call
+    *PROBED.get_or_init(detect_uncached)
+}
+
+/// Runs the target's probe; uses [`CacheInfo::FALLBACK`] when there is none,
+/// it fails, or it reports a zero size (which would zero the tile budgets).
+fn detect_uncached() -> CacheInfo {
+    #[cfg(target_arch = "x86_64")]
+    let detected = cpuid::detect();
+    #[cfg(all(target_arch = "aarch64", target_os = "linux"))]
+    let detected = linux::detect();
+    #[cfg(all(target_arch = "aarch64", target_os = "macos"))]
+    let detected = macos::detect();
+    #[cfg(not(any(
+        target_arch = "x86_64",
+        all(target_arch = "aarch64", target_os = "linux"),
+        all(target_arch = "aarch64", target_os = "macos"),
+    )))]
+    let detected: Option<CacheInfo> = None;
+    detected
+        .filter(|info| info.l1d_bytes > 0 && info.l2_bytes > 0)
+        .unwrap_or(CacheInfo::FALLBACK)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn cache_info_returns_plausible_values() {
+        let info = cache_info();
+
+        // Detected or fallback, the sizes must be plausible for any CPU we care
+        // about: 4 KiB to 1 MiB for L1d, 64 KiB to 128 MiB for L2.
+        assert!(
+            (4 * 1024..=1024 * 1024).contains(&info.l1d_bytes),
+            "L1d out of plausible range: {} bytes",
+            info.l1d_bytes
+        );
+        assert!(
+            (64 * 1024..=128 * 1024 * 1024).contains(&info.l2_bytes),
+            "L2 out of plausible range: {} bytes",
+            info.l2_bytes
+        );
+    }
+
+    #[test]
+    fn cache_info_is_memoized() {
+        // Repeated calls must agree; equality alone cannot prove the probe ran once.
+        let first = cache_info();
+        let second = cache_info();
+        assert_eq!(first, second);
+    }
+}
diff --git a/diskann-quantization/src/multi_vector/distance/kernels/mod.rs b/diskann-quantization/src/multi_vector/distance/kernels/mod.rs
@@ -22,8 +22,8 @@ mod tiled_reduce;
 
 /// Cache budgets fed to the tile planner.
 ///
-/// `Default` returns the production budgets derived from hardcoded L1/L2
-/// cache-size estimates and fixed fractions.
+/// `Default` derives the budgets from runtime-detected L1d/L2 cache sizes
+/// (see [`cache::cache_info`](super::cache::cache_info)).
 #[derive(Debug, Clone, Copy)]
 struct TileBudget {
     /// L2 budget in bytes reserved for A tiles.
@@ -33,17 +33,13 @@ struct TileBudget {
 }
 
 impl Default for TileBudget {
-    // TODO: Replace hardcoded fallbacks with detected cache sizes
-    // (e.g. via `diskann_platform`, env-var override, or runtime query).
     fn default() -> Self {
-        const L2_CACHE: usize = 1_250_000; // 1.25 MB fallback
-        const L1_CACHE: usize = 48_000; // 48 KB fallback
-
+        let cache = super::cache::cache_info(); // memoized probe result, or the fallback
         Self {
             // 50% of L2 for A tiles; remainder for B streaming + pollution.
-            l2_a: L2_CACHE / 2,
+            l2_a: cache.l2_bytes / 2,
             // 75% of L1 for B tiles; A micro-panel subtracted at runtime.
-            l1_b: L1_CACHE * 3 / 4,
+            l1_b: cache.l1d_bytes * 3 / 4,
         }
     }
 }

diff --git a/diskann-quantization/src/multi_vector/distance/mod.rs b/diskann-quantization/src/multi_vector/distance/mod.rs
@@ -41,6 +41,7 @@
 //! // scores[1] =  0.0 (query[1] has no good match: max IP was 0)
 //! ```
 
+mod cache;
 mod factory;
 mod fallback;
 mod isa;