From 00f14f7ce3d6c46cc1944591078658f8224c1666 Mon Sep 17 00:00:00 2001
From: Wei Wu <wuw1992@qq.com>
Date: Thu, 4 Jun 2026 14:39:18 +0800
Subject: [PATCH 1/2] [diskann-quantization] Add CPU cache size detection for
 multi-vector tile budget

Wires runtime L1d/L2 detection into TileBudget::default() so the
multi-vector tile planner sizes A/B tiles against the host's actual
cache geometry instead of hardcoded Skylake-X estimates (1.25 MB L2,
48 KB L1d from PR #863).

Detection lives in diskann-quantization, alongside the existing
ISA-capability probe in isa.rs. Cache size is fundamentally a CPU/arch
property: the OS API is a discovery mechanism, not the concept being
captured. Putting the module here mirrors the existing
diskann-wide / diskann-vector / diskann-quantization stack, which
handles all arch dispatch internally without depending on
diskann-platform.

Detection strategy follows what gemm-common / faer / OpenBLAS do:
CPUID where available, OS API where required.

- x86_64 (any OS):     CPUID via the `raw-cpuid` crate (one path)
- aarch64 Linux:       sysfs (/sys/devices/system/cpu/cpu0/cache/...)
- aarch64 macOS:       sysctl (hw.perflevel0.*, P-core L2 / cpusperl2)
- Anything else:       CacheInfo::FALLBACK (32 KB L1d, 256 KB L2)

On Apple Silicon the per-cluster L2 is divided by cpusperl2 to give a
per-core budget. Windows-on-ARM falls back to the conservative
defaults: CI doesn't cover that target and DiskANN production doesn't
deploy there, and not probing it removes a Win32 code path and lets the
crate avoid pulling in windows-sys.

Agreement between CPUID and Win32 GetLogicalProcessorInformationEx was
verified on Windows x86_64 (32 KB L1d / 512 KB L2 on the test host)
during development. The final commit drops that side-by-side test along
with the temporary dependency on diskann-platform.

Closes #1062.
---
 Cargo.lock                                    |  2 +
 diskann-quantization/Cargo.toml               |  7 ++
 .../src/multi_vector/distance/cache/cpuid.rs  | 48 ++++++++++
 .../src/multi_vector/distance/cache/linux.rs  | 81 +++++++++++++++++
 .../src/multi_vector/distance/cache/macos.rs  | 55 ++++++++++++
 .../src/multi_vector/distance/cache/mod.rs    | 88 +++++++++++++++++++
 .../src/multi_vector/distance/kernels/mod.rs  | 14 ++-
 .../src/multi_vector/distance/mod.rs          |  1 +
 8 files changed, 287 insertions(+), 9 deletions(-)
 create mode 100644 diskann-quantization/src/multi_vector/distance/cache/cpuid.rs
 create mode 100644 diskann-quantization/src/multi_vector/distance/cache/linux.rs
 create mode 100644 diskann-quantization/src/multi_vector/distance/cache/macos.rs
 create mode 100644 diskann-quantization/src/multi_vector/distance/cache/mod.rs

diff --git a/Cargo.lock b/Cargo.lock
index ca3afe030..2fed74a22 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -902,8 +902,10 @@ dependencies = [
  "diskann-wide",
  "flatbuffers",
  "half",
+ "libc",
  "rand 0.9.4",
  "rand_distr",
+ "raw-cpuid",
  "rayon",
  "serde",
  "serde_json",
diff --git a/diskann-quantization/Cargo.toml b/diskann-quantization/Cargo.toml
index cb2ddf1c2..a43917d2d 100644
--- a/diskann-quantization/Cargo.toml
+++ b/diskann-quantization/Cargo.toml
@@ -20,6 +20,13 @@ flatbuffers = { version = "25.2.10", optional = true }
 half = { version = "2.6.0", features = ["bytemuck"] }
 diskann-utils = { workspace = true }
 
+# Cache size detection. x86_64 uses CPUID; aarch64 Linux/macOS use OS APIs.
+[target.'cfg(target_arch = "x86_64")'.dependencies]
+raw-cpuid = "11.5"
+
+[target.'cfg(all(target_arch = "aarch64", target_os = "macos"))'.dependencies]
+libc = "0.2.148"
+
 [lints.clippy]
 undocumented_unsafe_blocks = "warn"
 unwrap_used = "warn"
diff --git a/diskann-quantization/src/multi_vector/distance/cache/cpuid.rs b/diskann-quantization/src/multi_vector/distance/cache/cpuid.rs
new file mode 100644
index 000000000..d94f2160f
--- /dev/null
+++ b/diskann-quantization/src/multi_vector/distance/cache/cpuid.rs
@@ -0,0 +1,48 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+//! x86_64 CPUID cache probe via the deterministic cache parameter leaf
+//! (CPUID `0x4` on Intel, `0x8000001D` on AMD). Returns `None` on CPUs exposing
+//! neither, i.e. old AMD without the `TopologyExtensions` feature. We don't read
+//! AMD's legacy `0x80000005/06` leaves; no deployment target is that old.
+
+use raw_cpuid::{CacheType, CpuId};
+
+use super::CacheInfo;
+
+pub(super) fn detect() -> Option<CacheInfo> {
+    let cpuid = CpuId::new();
+    let params = cpuid.get_cache_parameters()?;
+
+    let mut l1d = None;
+    let mut l2 = None;
+
+    for cache in params {
+        let level = cache.level();
+        let ty = cache.cache_type();
+        let size = cache_size_bytes(&cache);
+
+        match (level, ty) {
+            (1, CacheType::Data) if l1d.is_none() => l1d = Some(size),
+            // L2 is usually Unified; accept Data as a defensive fallback.
+            (2, CacheType::Unified | CacheType::Data) if l2.is_none() => l2 = Some(size),
+            _ => {}
+        }
+
+        if l1d.is_some() && l2.is_some() {
+            break;
+        }
+    }
+
+    Some(CacheInfo {
+        l1d_bytes: l1d?,
+        l2_bytes: l2?,
+    })
+}
+
+fn cache_size_bytes(cache: &raw_cpuid::CacheParameter) -> usize {
+    cache.associativity()
+        * cache.physical_line_partitions()
+        * cache.coherency_line_size()
+        * cache.sets()
+}
diff --git a/diskann-quantization/src/multi_vector/distance/cache/linux.rs b/diskann-quantization/src/multi_vector/distance/cache/linux.rs
new file mode 100644
index 000000000..8f5d3420b
--- /dev/null
+++ b/diskann-quantization/src/multi_vector/distance/cache/linux.rs
@@ -0,0 +1,81 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+//! aarch64 Linux cache probe via sysfs (`/sys/devices/system/cpu/cpu0/cache/`).
+//! Returns `None` when the cache sysfs entries are absent, as in some
+//! stripped-down containers.
+
+use std::path::Path;
+
+use super::CacheInfo;
+
+const SYSFS_CACHE_DIR: &str = "/sys/devices/system/cpu/cpu0/cache";
+
+pub(super) fn detect() -> Option<CacheInfo> {
+    let mut l1d = None;
+    let mut l2 = None;
+
+    // Each `index*` subdirectory describes one cache (level + type + size).
+    for entry in std::fs::read_dir(SYSFS_CACHE_DIR).ok()?.flatten() {
+        let dir = entry.path();
+
+        let Some(level) = read_trim(dir.join("level")).and_then(|s| s.parse::<u32>().ok()) else {
+            continue;
+        };
+        let Some(cache_type) = read_trim(dir.join("type")) else {
+            continue;
+        };
+        let Some(size) = read_trim(dir.join("size")).and_then(|s| parse_size(&s)) else {
+            continue;
+        };
+
+        match (level, cache_type.as_str()) {
+            (1, "Data") if l1d.is_none() => l1d = Some(size),
+            // L2 is Unified on real hardware; accept Data defensively.
+            (2, "Unified" | "Data") if l2.is_none() => l2 = Some(size),
+            _ => {}
+        }
+    }
+
+    Some(CacheInfo {
+        l1d_bytes: l1d?,
+        l2_bytes: l2?,
+    })
+}
+
+fn read_trim(path: impl AsRef<Path>) -> Option<String> {
+    std::fs::read_to_string(path)
+        .ok()
+        .map(|s| s.trim().to_string())
+}
+
+fn parse_size(s: &str) -> Option<usize> {
+    let s = s.trim();
+    let split = s.find(|c: char| !c.is_ascii_digit()).unwrap_or(s.len());
+    let (num, suffix) = s.split_at(split);
+    let n: usize = num.parse().ok()?;
+    match suffix.trim() {
+        "" => Some(n),
+        "K" | "KB" | "KiB" => Some(n * 1024),
+        "M" | "MB" | "MiB" => Some(n * 1024 * 1024),
+        "G" | "GB" | "GiB" => Some(n * 1024 * 1024 * 1024),
+        _ => None,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn parse_size_handles_common_formats() {
+        assert_eq!(parse_size("32K"), Some(32 * 1024));
+        assert_eq!(parse_size("1024K"), Some(1024 * 1024));
+        assert_eq!(parse_size("8M"), Some(8 * 1024 * 1024));
+        assert_eq!(parse_size("1G"), Some(1024 * 1024 * 1024));
+        assert_eq!(parse_size("4096"), Some(4096));
+        assert_eq!(parse_size(" 32K\n"), Some(32 * 1024));
+        assert_eq!(parse_size("garbage"), None);
+        assert_eq!(parse_size(""), None);
+    }
+}
diff --git a/diskann-quantization/src/multi_vector/distance/cache/macos.rs b/diskann-quantization/src/multi_vector/distance/cache/macos.rs
new file mode 100644
index 000000000..2b0a61847
--- /dev/null
+++ b/diskann-quantization/src/multi_vector/distance/cache/macos.rs
@@ -0,0 +1,55 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+//! aarch64 macOS sysctl cache probe. Reads `hw.perflevel0.*`
+//! and divides the cluster L2 by `cpusperl2` for a per-core budget.
+
+use core::ffi::{CStr, c_void};
+
+use super::CacheInfo;
+
+pub(super) fn detect() -> Option<CacheInfo> {
+    // L1d is private per core — no normalization.
+    let l1d = sysctl_uint(c"hw.perflevel0.l1dcachesize")?;
+    let l2 = perflevel0_l2_per_core()?;
+
+    Some(CacheInfo {
+        l1d_bytes: l1d as usize,
+        l2_bytes: l2 as usize,
+    })
+}
+
+/// sysctl reports the full per-cluster L2 via `hw.perflevel0.l2cachesize`;
+/// divide by `hw.perflevel0.cpusperl2` for the per-core share.
+fn perflevel0_l2_per_core() -> Option<u64> {
+    let total = sysctl_uint(c"hw.perflevel0.l2cachesize")?;
+    let cpus = sysctl_uint(c"hw.perflevel0.cpusperl2")?;
+    (cpus > 0).then(|| total / cpus)
+}
+
+/// Reads an integer sysctl reported as either 4- or 8-byte: cache sizes are
+/// 64-bit, but topology counts like `cpusperl2` are 32-bit.
+fn sysctl_uint(name: &CStr) -> Option<u64> {
+    let mut buf = [0u8; 8];
+    let mut size = buf.len();
+    // SAFETY: `name` is a valid NUL-terminated C string; `buf` is writable and
+    // `size` starts at `buf.len()`, so the kernel writes at most 8 bytes; the
+    // new-value pointer is null (length 0) because we are only reading.
+    let ret = unsafe {
+        libc::sysctlbyname(
+            name.as_ptr(),
+            buf.as_mut_ptr() as *mut c_void,
+            &mut size,
+            core::ptr::null_mut(),
+            0,
+        )
+    };
+    if ret != 0 {
+        return None;
+    }
+    match size {
+        4 => Some(u32::from_ne_bytes(buf[..4].try_into().ok()?) as u64),
+        8 => Some(u64::from_ne_bytes(buf)),
+        _ => None,
+    }
+}
diff --git a/diskann-quantization/src/multi_vector/distance/cache/mod.rs b/diskann-quantization/src/multi_vector/distance/cache/mod.rs
new file mode 100644
index 000000000..62b11f98c
--- /dev/null
+++ b/diskann-quantization/src/multi_vector/distance/cache/mod.rs
@@ -0,0 +1,88 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+//! L1d / L2 cache size probe used by the multi-vector tile planner.
+//! Detected once and memoized; returns [`CacheInfo::FALLBACK`] when no
+//! per-platform probe applies or the applicable probe returns `None`.
+
+use std::sync::OnceLock;
+
+#[cfg(target_arch = "x86_64")]
+mod cpuid;
+#[cfg(all(target_arch = "aarch64", target_os = "linux"))]
+mod linux;
+#[cfg(all(target_arch = "aarch64", target_os = "macos"))]
+mod macos;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub(super) struct CacheInfo {
+    /// L1 **d**ata cache size in bytes. L1i (instruction cache) is not read —
+    /// tile budgets only constrain data residency.
+    pub l1d_bytes: usize,
+    /// L2 cache size in bytes.
+    pub l2_bytes: usize,
+}
+
+impl CacheInfo {
+    /// Used when no per-platform probe applies or the probe returns `None`.
+    pub(super) const FALLBACK: Self = Self {
+        l1d_bytes: 32 * 1024,
+        l2_bytes: 256 * 1024,
+    };
+}
+
+pub(super) fn cache_info() -> CacheInfo {
+    static CACHED: OnceLock<CacheInfo> = OnceLock::new();
+    *CACHED.get_or_init(detect_uncached)
+}
+
+fn detect_uncached() -> CacheInfo {
+    #[cfg(target_arch = "x86_64")]
+    let detected = cpuid::detect();
+
+    #[cfg(all(target_arch = "aarch64", target_os = "linux"))]
+    let detected = linux::detect();
+
+    #[cfg(all(target_arch = "aarch64", target_os = "macos"))]
+    let detected = macos::detect();
+
+    #[cfg(not(any(
+        target_arch = "x86_64",
+        all(target_arch = "aarch64", target_os = "linux"),
+        all(target_arch = "aarch64", target_os = "macos"),
+    )))]
+    let detected: Option<CacheInfo> = None;
+
+    detected.unwrap_or(CacheInfo::FALLBACK)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn cache_info_returns_plausible_values() {
+        let info = cache_info();
+
+        // Either we detected real values or we fell back. In both cases the
+        // values must be within plausible bounds for any CPU we care about:
+        // 4 KB to 1 MB for L1d, 64 KB to 128 MB for L2.
+        assert!(
+            (4 * 1024..=1024 * 1024).contains(&info.l1d_bytes),
+            "L1d out of plausible range: {} bytes",
+            info.l1d_bytes
+        );
+        assert!(
+            (64 * 1024..=128 * 1024 * 1024).contains(&info.l2_bytes),
+            "L2 out of plausible range: {} bytes",
+            info.l2_bytes
+        );
+    }
+
+    #[test]
+    fn cache_info_is_memoized() {
+        let first = cache_info();
+        let second = cache_info();
+        assert_eq!(first, second);
+    }
+}
diff --git a/diskann-quantization/src/multi_vector/distance/kernels/mod.rs b/diskann-quantization/src/multi_vector/distance/kernels/mod.rs
index 55108698d..95c3a2dd5 100644
--- a/diskann-quantization/src/multi_vector/distance/kernels/mod.rs
+++ b/diskann-quantization/src/multi_vector/distance/kernels/mod.rs
@@ -22,8 +22,8 @@ mod tiled_reduce;
 
 /// Cache budgets fed to the tile planner.
 ///
-/// `Default` returns the production budgets derived from hardcoded L1/L2
-/// cache-size estimates and fixed fractions.
+/// `Default` derives the budgets from runtime-detected L1d/L2 cache sizes
+/// (see [`cache::cache_info`](super::cache::cache_info)).
 #[derive(Debug, Clone, Copy)]
 struct TileBudget {
     /// L2 budget in bytes reserved for A tiles.
@@ -33,17 +33,13 @@ struct TileBudget {
 }
 
 impl Default for TileBudget {
-    // TODO: Replace hardcoded fallbacks with detected cache sizes
-    // (e.g. via `diskann_platform`, env-var override, or runtime query).
     fn default() -> Self {
-        const L2_CACHE: usize = 1_250_000; // 1.25 MB fallback
-        const L1_CACHE: usize = 48_000; // 48 KB fallback
-
+        let cache = super::cache::cache_info();
         Self {
             // 50% of L2 for A tiles; remainder for B streaming + pollution.
-            l2_a: L2_CACHE / 2,
+            l2_a: cache.l2_bytes / 2,
             // 75% of L1 for B tiles; A micro-panel subtracted at runtime.
-            l1_b: L1_CACHE * 3 / 4,
+            l1_b: cache.l1d_bytes * 3 / 4,
         }
     }
 }
diff --git a/diskann-quantization/src/multi_vector/distance/mod.rs b/diskann-quantization/src/multi_vector/distance/mod.rs
index ef336161c..61d1563f5 100644
--- a/diskann-quantization/src/multi_vector/distance/mod.rs
+++ b/diskann-quantization/src/multi_vector/distance/mod.rs
@@ -41,6 +41,7 @@
 //! // scores[1] =  0.0 (query[1] has no good match: max IP was 0)
 //! ```
 
+mod cache;
 mod factory;
 mod fallback;
 mod isa;

From 3be03545ca583ebaeee080607f887b4c00a93492 Mon Sep 17 00:00:00 2001
From: Wei Wu <wuw1992@qq.com>
Date: Fri, 5 Jun 2026 12:11:41 +0800
Subject: [PATCH 2/2] Address review comments on cache detection

- cpuid.rs: reword the module doc so the CPUID 0x4 / 0x8000001D leaf
  selection is attributed to raw-cpuid's cache-parameter enumeration.
  The vendor dispatch is internal to the crate, not visible at our call
  site, which the original wording obscured.
- linux.rs: parse_size uses checked_mul for the K/M/G suffixes so an
  oversized sysfs value returns None instead of silently wrapping in
  release builds, where overflow checks are off. Add a regression test.
---
 .../src/multi_vector/distance/cache/cpuid.rs             | 8 ++++----
 .../src/multi_vector/distance/cache/linux.rs             | 9 ++++++---
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/diskann-quantization/src/multi_vector/distance/cache/cpuid.rs b/diskann-quantization/src/multi_vector/distance/cache/cpuid.rs
index d94f2160f..a8cfcc7c3 100644
--- a/diskann-quantization/src/multi_vector/distance/cache/cpuid.rs
+++ b/diskann-quantization/src/multi_vector/distance/cache/cpuid.rs
@@ -1,10 +1,10 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT license.
 
-//! x86_64 CPUID cache probe via the deterministic cache parameter leaf
-//! (CPUID `0x4` on Intel, `0x8000001D` on AMD). Returns `None` on CPUs exposing
-//! neither, i.e. old AMD without the `TopologyExtensions` feature. We don't read
-//! AMD's legacy `0x80000005/06` leaves; no deployment target is that old.
+//! x86_64 CPUID cache probe via deterministic cache parameters — `raw-cpuid`'s
+//! cache-parameter enumeration (CPUID `0x4` on Intel, `0x8000001D` on AMD).
+//! Returns `None` when those parameters are unavailable; we don't fall back to
+//! AMD's legacy `0x80000005/06` leaves.
 
 use raw_cpuid::{CacheType, CpuId};
 
diff --git a/diskann-quantization/src/multi_vector/distance/cache/linux.rs b/diskann-quantization/src/multi_vector/distance/cache/linux.rs
index 8f5d3420b..b53b3b24d 100644
--- a/diskann-quantization/src/multi_vector/distance/cache/linux.rs
+++ b/diskann-quantization/src/multi_vector/distance/cache/linux.rs
@@ -56,9 +56,9 @@ fn parse_size(s: &str) -> Option<usize> {
     let n: usize = num.parse().ok()?;
     match suffix.trim() {
         "" => Some(n),
-        "K" | "KB" | "KiB" => Some(n * 1024),
-        "M" | "MB" | "MiB" => Some(n * 1024 * 1024),
-        "G" | "GB" | "GiB" => Some(n * 1024 * 1024 * 1024),
+        "K" | "KB" | "KiB" => n.checked_mul(1024),
+        "M" | "MB" | "MiB" => n.checked_mul(1024 * 1024),
+        "G" | "GB" | "GiB" => n.checked_mul(1024 * 1024 * 1024),
         _ => None,
     }
 }
@@ -77,5 +77,8 @@ mod tests {
         assert_eq!(parse_size(" 32K\n"), Some(32 * 1024));
         assert_eq!(parse_size("garbage"), None);
         assert_eq!(parse_size(""), None);
+        // A value that parses but overflows on the suffix multiply must yield
+        // None, not a silently wrapped size (release builds skip overflow checks).
+        assert_eq!(parse_size(&format!("{}K", usize::MAX)), None);
     }
 }