From 00f14f7ce3d6c46cc1944591078658f8224c1666 Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Thu, 4 Jun 2026 14:39:18 +0800 Subject: [PATCH 1/2] [diskann-quantization] Add CPU cache size detection for multi-vector tile budget Wires runtime L1d/L2 detection into TileBudget::default() so the multi-vector tile planner sizes A/B tiles against the host's actual cache geometry instead of hardcoded Skylake-X estimates (1.25 MB L2, 48 KB L1d from PR #863). Detection lives in diskann-quantization, alongside the existing ISA-capability probe in isa.rs. Cache size is fundamentally a CPU/arch property: the OS-API is a discovery mechanism, not the concept being captured. Putting the module here mirrors the existing diskann-wide / diskann-vector / diskann-quantization stack, which handles all arch dispatch internally without depending on diskann-platform. Detection strategy follows what gemm-common / faer / OpenBLAS do: CPUID where available, OS API where required. - x86_64 (any OS): CPUID via the `raw-cpuid` crate (one path) - aarch64 Linux: sysfs (/sys/devices/system/cpu/cpu0/cache/...) - aarch64 macOS: sysctl (hw.perflevel0.*, P-core L2 / cpusperl2) - Anything else: CacheInfo::FALLBACK (32 KB L1d, 256 KB L2) On Apple Silicon the per-cluster L2 is divided by cpusperl2 to give a per-core budget. Windows-on-ARM falls back to the conservative defaults: CI doesn't cover that target and DiskANN production doesn't deploy there; dropping it removes a Win32 codepath and lets the crate avoid pulling windows-sys. Equality between CPUID and Win32 GetLogicalProcessorInformationEx was verified on Windows x86_64 (32 KB L1d / 512 KB L2 on the test host) during development. Final commit removes the side-by-side test along with the temporary dependency on diskann-platform. Closes #1062. 
--- Cargo.lock | 2 + diskann-quantization/Cargo.toml | 7 ++ .../src/multi_vector/distance/cache/cpuid.rs | 48 ++++++++++ .../src/multi_vector/distance/cache/linux.rs | 81 +++++++++++++++++ .../src/multi_vector/distance/cache/macos.rs | 55 ++++++++++++ .../src/multi_vector/distance/cache/mod.rs | 88 +++++++++++++++++++ .../src/multi_vector/distance/kernels/mod.rs | 14 ++- .../src/multi_vector/distance/mod.rs | 1 + 8 files changed, 287 insertions(+), 9 deletions(-) create mode 100644 diskann-quantization/src/multi_vector/distance/cache/cpuid.rs create mode 100644 diskann-quantization/src/multi_vector/distance/cache/linux.rs create mode 100644 diskann-quantization/src/multi_vector/distance/cache/macos.rs create mode 100644 diskann-quantization/src/multi_vector/distance/cache/mod.rs diff --git a/Cargo.lock b/Cargo.lock index ca3afe030..2fed74a22 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -902,8 +902,10 @@ dependencies = [ "diskann-wide", "flatbuffers", "half", + "libc", "rand 0.9.4", "rand_distr", + "raw-cpuid", "rayon", "serde", "serde_json", diff --git a/diskann-quantization/Cargo.toml b/diskann-quantization/Cargo.toml index cb2ddf1c2..a43917d2d 100644 --- a/diskann-quantization/Cargo.toml +++ b/diskann-quantization/Cargo.toml @@ -20,6 +20,13 @@ flatbuffers = { version = "25.2.10", optional = true } half = { version = "2.6.0", features = ["bytemuck"] } diskann-utils = { workspace = true } +# Cache size detection. x86_64 uses CPUID; aarch64 Linux/macOS use OS APIs. 
+[target.'cfg(target_arch = "x86_64")'.dependencies] +raw-cpuid = "11.5" + +[target.'cfg(all(target_arch = "aarch64", target_os = "macos"))'.dependencies] +libc = "0.2.148" + [lints.clippy] undocumented_unsafe_blocks = "warn" unwrap_used = "warn" diff --git a/diskann-quantization/src/multi_vector/distance/cache/cpuid.rs b/diskann-quantization/src/multi_vector/distance/cache/cpuid.rs new file mode 100644 index 000000000..d94f2160f --- /dev/null +++ b/diskann-quantization/src/multi_vector/distance/cache/cpuid.rs @@ -0,0 +1,48 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +//! x86_64 CPUID cache probe via the deterministic cache parameter leaf +//! (CPUID `0x4` on Intel, `0x8000001D` on AMD). Returns `None` on CPUs exposing +//! neither, i.e. old AMD without the `TopologyExtensions` feature. We don't read +//! AMD's legacy `0x80000005/06` leaves; no deployment target is that old. + +use raw_cpuid::{CacheType, CpuId}; + +use super::CacheInfo; + +pub(super) fn detect() -> Option { + let cpuid = CpuId::new(); + let params = cpuid.get_cache_parameters()?; + + let mut l1d = None; + let mut l2 = None; + + for cache in params { + let level = cache.level(); + let ty = cache.cache_type(); + let size = cache_size_bytes(&cache); + + match (level, ty) { + (1, CacheType::Data) if l1d.is_none() => l1d = Some(size), + // L2 is usually Unified; accept Data as a defensive fallback. 
+ (2, CacheType::Unified | CacheType::Data) if l2.is_none() => l2 = Some(size), + _ => {} + } + + if l1d.is_some() && l2.is_some() { + break; + } + } + + Some(CacheInfo { + l1d_bytes: l1d?, + l2_bytes: l2?, + }) +} + +fn cache_size_bytes(cache: &raw_cpuid::CacheParameter) -> usize { + cache.associativity() + * cache.physical_line_partitions() + * cache.coherency_line_size() + * cache.sets() +} diff --git a/diskann-quantization/src/multi_vector/distance/cache/linux.rs b/diskann-quantization/src/multi_vector/distance/cache/linux.rs new file mode 100644 index 000000000..8f5d3420b --- /dev/null +++ b/diskann-quantization/src/multi_vector/distance/cache/linux.rs @@ -0,0 +1,81 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +//! aarch64 Linux cache probe via sysfs (`/sys/devices/system/cpu/cpu0/cache/`). +//! Returns `None` when the cache sysfs entries are absent, as in some +//! stripped-down containers. + +use std::path::Path; + +use super::CacheInfo; + +const SYSFS_CACHE_DIR: &str = "/sys/devices/system/cpu/cpu0/cache"; + +pub(super) fn detect() -> Option { + let mut l1d = None; + let mut l2 = None; + + // Each `index*` subdirectory describes one cache (level + type + size). + for entry in std::fs::read_dir(SYSFS_CACHE_DIR).ok()?.flatten() { + let dir = entry.path(); + + let Some(level) = read_trim(dir.join("level")).and_then(|s| s.parse::().ok()) else { + continue; + }; + let Some(cache_type) = read_trim(dir.join("type")) else { + continue; + }; + let Some(size) = read_trim(dir.join("size")).and_then(|s| parse_size(&s)) else { + continue; + }; + + match (level, cache_type.as_str()) { + (1, "Data") if l1d.is_none() => l1d = Some(size), + // L2 is Unified on real hardware; accept Data defensively. 
+ (2, "Unified" | "Data") if l2.is_none() => l2 = Some(size), + _ => {} + } + } + + Some(CacheInfo { + l1d_bytes: l1d?, + l2_bytes: l2?, + }) +} + +fn read_trim(path: impl AsRef) -> Option { + std::fs::read_to_string(path) + .ok() + .map(|s| s.trim().to_string()) +} + +fn parse_size(s: &str) -> Option { + let s = s.trim(); + let split = s.find(|c: char| !c.is_ascii_digit()).unwrap_or(s.len()); + let (num, suffix) = s.split_at(split); + let n: usize = num.parse().ok()?; + match suffix.trim() { + "" => Some(n), + "K" | "KB" | "KiB" => Some(n * 1024), + "M" | "MB" | "MiB" => Some(n * 1024 * 1024), + "G" | "GB" | "GiB" => Some(n * 1024 * 1024 * 1024), + _ => None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_size_handles_common_formats() { + assert_eq!(parse_size("32K"), Some(32 * 1024)); + assert_eq!(parse_size("1024K"), Some(1024 * 1024)); + assert_eq!(parse_size("8M"), Some(8 * 1024 * 1024)); + assert_eq!(parse_size("1G"), Some(1024 * 1024 * 1024)); + assert_eq!(parse_size("4096"), Some(4096)); + assert_eq!(parse_size(" 32K\n"), Some(32 * 1024)); + assert_eq!(parse_size("garbage"), None); + assert_eq!(parse_size(""), None); + } +} diff --git a/diskann-quantization/src/multi_vector/distance/cache/macos.rs b/diskann-quantization/src/multi_vector/distance/cache/macos.rs new file mode 100644 index 000000000..2b0a61847 --- /dev/null +++ b/diskann-quantization/src/multi_vector/distance/cache/macos.rs @@ -0,0 +1,55 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +//! aarch64 macOS sysctl cache probe. Reads `hw.perflevel0.*` +//! and divides the cluster L2 by `cpusperl2` for a per-core budget. + +use core::ffi::{CStr, c_void}; + +use super::CacheInfo; + +pub(super) fn detect() -> Option { + // L1d is private per core — no normalization. 
+ let l1d = sysctl_uint(c"hw.perflevel0.l1dcachesize")?; + let l2 = perflevel0_l2_per_core()?; + + Some(CacheInfo { + l1d_bytes: l1d as usize, + l2_bytes: l2 as usize, + }) +} + +/// macOS reports the full per-cluster L2 via `hw.perflevel0.l2cachesize`; +/// divide by `hw.perflevel0.cpusperl2` for the per-core share. +fn perflevel0_l2_per_core() -> Option { + let total = sysctl_uint(c"hw.perflevel0.l2cachesize")?; + let cpus = sysctl_uint(c"hw.perflevel0.cpusperl2")?; + (cpus > 0).then(|| total / cpus) +} + +/// Reads an integer sysctl reported as either 4- or 8-byte: cache sizes are +/// 64-bit, but topology counts like `cpusperl2` are 32-bit. +fn sysctl_uint(name: &CStr) -> Option { + let mut buf = [0u8; 8]; + let mut size = buf.len(); + // SAFETY: `name` is a valid NUL-terminated C string; `buf` / `size` are + // valid and writable; the new* parameters are null because we are only + // reading. + let ret = unsafe { + libc::sysctlbyname( + name.as_ptr(), + buf.as_mut_ptr() as *mut c_void, + &mut size, + core::ptr::null_mut(), + 0, + ) + }; + if ret != 0 { + return None; + } + match size { + 4 => Some(u32::from_ne_bytes(buf[..4].try_into().ok()?) as u64), + 8 => Some(u64::from_ne_bytes(buf)), + _ => None, + } +} diff --git a/diskann-quantization/src/multi_vector/distance/cache/mod.rs b/diskann-quantization/src/multi_vector/distance/cache/mod.rs new file mode 100644 index 000000000..62b11f98c --- /dev/null +++ b/diskann-quantization/src/multi_vector/distance/cache/mod.rs @@ -0,0 +1,88 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +//! L1d / L2 cache size probe used by the multi-vector tile planner. +//! Detected once and memoized; returns [`CacheInfo::FALLBACK`] when no +//! per-platform probe applies or the probe fails. 
+ +use std::sync::OnceLock; + +#[cfg(target_arch = "x86_64")] +mod cpuid; +#[cfg(all(target_arch = "aarch64", target_os = "linux"))] +mod linux; +#[cfg(all(target_arch = "aarch64", target_os = "macos"))] +mod macos; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(super) struct CacheInfo { + /// L1 **d**ata cache size in bytes. L1i (instruction cache) is not read — + /// tile budgets only constrain data residency. + pub l1d_bytes: usize, + /// L2 cache size in bytes. + pub l2_bytes: usize, +} + +impl CacheInfo { + /// Used when no per-platform probe applies or the probe returns `None`. + pub(super) const FALLBACK: Self = Self { + l1d_bytes: 32 * 1024, + l2_bytes: 256 * 1024, + }; +} + +pub(super) fn cache_info() -> CacheInfo { + static CACHED: OnceLock = OnceLock::new(); + *CACHED.get_or_init(detect_uncached) +} + +fn detect_uncached() -> CacheInfo { + #[cfg(target_arch = "x86_64")] + let detected = cpuid::detect(); + + #[cfg(all(target_arch = "aarch64", target_os = "linux"))] + let detected = linux::detect(); + + #[cfg(all(target_arch = "aarch64", target_os = "macos"))] + let detected = macos::detect(); + + #[cfg(not(any( + target_arch = "x86_64", + all(target_arch = "aarch64", target_os = "linux"), + all(target_arch = "aarch64", target_os = "macos"), + )))] + let detected: Option = None; + + detected.unwrap_or(CacheInfo::FALLBACK) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn cache_info_returns_plausible_values() { + let info = cache_info(); + + // Either we detected real values or we fell back. In both cases the + // values must be within plausible bounds for any CPU we care about: + // 4 KB to 1 MB for L1d, 64 KB to 128 MB for L2. 
+ assert!( + (4 * 1024..=1024 * 1024).contains(&info.l1d_bytes), + "L1d out of plausible range: {} bytes", + info.l1d_bytes + ); + assert!( + (64 * 1024..=128 * 1024 * 1024).contains(&info.l2_bytes), + "L2 out of plausible range: {} bytes", + info.l2_bytes + ); + } + + #[test] + fn cache_info_is_memoized() { + let first = cache_info(); + let second = cache_info(); + assert_eq!(first, second); + } +} diff --git a/diskann-quantization/src/multi_vector/distance/kernels/mod.rs b/diskann-quantization/src/multi_vector/distance/kernels/mod.rs index 55108698d..95c3a2dd5 100644 --- a/diskann-quantization/src/multi_vector/distance/kernels/mod.rs +++ b/diskann-quantization/src/multi_vector/distance/kernels/mod.rs @@ -22,8 +22,8 @@ mod tiled_reduce; /// Cache budgets fed to the tile planner. /// -/// `Default` returns the production budgets derived from hardcoded L1/L2 -/// cache-size estimates and fixed fractions. +/// `Default` derives the budgets from runtime-detected L1d/L2 cache sizes +/// (see [`cache::cache_info`](super::cache::cache_info)). #[derive(Debug, Clone, Copy)] struct TileBudget { /// L2 budget in bytes reserved for A tiles. @@ -33,17 +33,13 @@ struct TileBudget { } impl Default for TileBudget { - // TODO: Replace hardcoded fallbacks with detected cache sizes - // (e.g. via `diskann_platform`, env-var override, or runtime query). fn default() -> Self { - const L2_CACHE: usize = 1_250_000; // 1.25 MB fallback - const L1_CACHE: usize = 48_000; // 48 KB fallback - + let cache = super::cache::cache_info(); Self { // 50% of L2 for A tiles; remainder for B streaming + pollution. - l2_a: L2_CACHE / 2, + l2_a: cache.l2_bytes / 2, // 75% of L1 for B tiles; A micro-panel subtracted at runtime. 
- l1_b: L1_CACHE * 3 / 4, + l1_b: cache.l1d_bytes * 3 / 4, } } } diff --git a/diskann-quantization/src/multi_vector/distance/mod.rs b/diskann-quantization/src/multi_vector/distance/mod.rs index ef336161c..61d1563f5 100644 --- a/diskann-quantization/src/multi_vector/distance/mod.rs +++ b/diskann-quantization/src/multi_vector/distance/mod.rs @@ -41,6 +41,7 @@ //! // scores[1] = 0.0 (query[1] has no good match: max IP was 0) //! ``` +mod cache; mod factory; mod fallback; mod isa; From 3be03545ca583ebaeee080607f887b4c00a93492 Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Fri, 5 Jun 2026 12:11:41 +0800 Subject: [PATCH 2/2] Address review comments on cache detection - cpuid.rs: reword the module doc so the CPUID 0x4 / 0x8000001D leaf selection is attributed to raw-cpuid's cache-parameter enumeration. The vendor dispatch is internal to the crate, not visible at our call site, which the original wording obscured. - linux.rs: parse_size uses checked_mul for the K/M/G suffixes so an oversized sysfs value returns None instead of silently wrapping in release builds, where overflow checks are off. Add a regression test. --- .../src/multi_vector/distance/cache/cpuid.rs | 8 ++++---- .../src/multi_vector/distance/cache/linux.rs | 9 ++++++--- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/diskann-quantization/src/multi_vector/distance/cache/cpuid.rs b/diskann-quantization/src/multi_vector/distance/cache/cpuid.rs index d94f2160f..a8cfcc7c3 100644 --- a/diskann-quantization/src/multi_vector/distance/cache/cpuid.rs +++ b/diskann-quantization/src/multi_vector/distance/cache/cpuid.rs @@ -1,10 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT license. -//! x86_64 CPUID cache probe via the deterministic cache parameter leaf -//! (CPUID `0x4` on Intel, `0x8000001D` on AMD). Returns `None` on CPUs exposing -//! neither, i.e. old AMD without the `TopologyExtensions` feature. We don't read -//! 
AMD's legacy `0x80000005/06` leaves; no deployment target is that old. +//! x86_64 CPUID cache probe via deterministic cache parameters — `raw-cpuid`'s +//! cache-parameter enumeration (CPUID `0x4` on Intel, `0x8000001D` on AMD). +//! Returns `None` when those parameters are unavailable; we don't fall back to +//! AMD's legacy `0x80000005/06` leaves. use raw_cpuid::{CacheType, CpuId}; diff --git a/diskann-quantization/src/multi_vector/distance/cache/linux.rs b/diskann-quantization/src/multi_vector/distance/cache/linux.rs index 8f5d3420b..b53b3b24d 100644 --- a/diskann-quantization/src/multi_vector/distance/cache/linux.rs +++ b/diskann-quantization/src/multi_vector/distance/cache/linux.rs @@ -56,9 +56,9 @@ fn parse_size(s: &str) -> Option { let n: usize = num.parse().ok()?; match suffix.trim() { "" => Some(n), - "K" | "KB" | "KiB" => Some(n * 1024), - "M" | "MB" | "MiB" => Some(n * 1024 * 1024), - "G" | "GB" | "GiB" => Some(n * 1024 * 1024 * 1024), + "K" | "KB" | "KiB" => n.checked_mul(1024), + "M" | "MB" | "MiB" => n.checked_mul(1024 * 1024), + "G" | "GB" | "GiB" => n.checked_mul(1024 * 1024 * 1024), _ => None, } } @@ -77,5 +77,8 @@ mod tests { assert_eq!(parse_size(" 32K\n"), Some(32 * 1024)); assert_eq!(parse_size("garbage"), None); assert_eq!(parse_size(""), None); + // A value that parses but overflows on the suffix multiply must yield + // None, not a silently wrapped size (release builds skip overflow checks). + assert_eq!(parse_size(&format!("{}K", usize::MAX)), None); } }