using System;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

namespace PersistentOrderedMap;

/// <summary>
/// SIMD-accelerated forward scans over a span of 32-bit integers.
/// Each scan returns the index of the first element (in span order) that
/// satisfies the comparison, or <c>keys.Length</c> when no element does.
/// </summary>
public static class IntScanner
{
    // Number of 32-bit lanes in a 256-bit / 512-bit vector.
    private const int Lanes256 = 8;
    private const int Lanes512 = 16;

    /// <summary>
    /// Finds the index of the first element that is greater than or equal to <paramref name="target"/>.
    /// </summary>
    /// <param name="keys">The values to scan, front to back.</param>
    /// <param name="target">The inclusive lower bound to compare each element against.</param>
    /// <returns>The index of the first element <c>&gt;= target</c>, or <c>keys.Length</c> if there is none.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static int FindFirstGreaterOrEqual(ReadOnlySpan<int> keys, int target)
    {
        // Scalar fallback for short inputs or hardware without AVX2.
        if (!Avx2.IsSupported || keys.Length < Lanes256)
        {
            return LinearScan(keys, target);
        }

        return Avx512F.IsSupported ? ScanAvx512(keys, target) : ScanAvx2(keys, target);
    }

    /// <summary>
    /// Finds the index of the first element that is strictly greater than <paramref name="target"/>.
    /// </summary>
    /// <param name="keys">The values to scan, front to back.</param>
    /// <param name="target">The exclusive lower bound to compare each element against.</param>
    /// <returns>The index of the first element <c>&gt; target</c>, or <c>keys.Length</c> if there is none.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static int FindFirstGreater(ReadOnlySpan<int> keys, int target)
    {
        if (!Avx2.IsSupported || keys.Length < Lanes256)
        {
            return LinearScanGreater(keys, target);
        }

        return Avx512F.IsSupported ? ScanAvx512Greater(keys, target) : ScanAvx2Greater(keys, target);
    }

    // Scalar scan for the first element >= target.
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static int LinearScan(ReadOnlySpan<int> keys, int target)
    {
        for (var i = 0; i < keys.Length; i++)
        {
            if (keys[i] >= target)
            {
                return i;
            }
        }

        return keys.Length;
    }

    // Scalar scan for the first element > target.
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static int LinearScanGreater(ReadOnlySpan<int> keys, int target)
    {
        for (var i = 0; i < keys.Length; i++)
        {
            if (keys[i] > target)
            {
                return i;
            }
        }

        return keys.Length;
    }

    // 256-bit scan for the first element >= target; the remainder is scanned linearly.
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static int ScanAvx2(ReadOnlySpan<int> keys, int target)
    {
        // AVX2 only offers a signed "greater than" for 32-bit lanes. Instead of
        // rewriting (data >= target) as (data > target - 1), which wraps around
        // when target == int.MinValue, compute (target > data) and invert it.
        var vTarget = Vector256.Create(target);
        ref int start = ref MemoryMarshal.GetReference(keys);
        var len = keys.Length;
        var i = 0;

        for (; i <= len - Lanes256; i += Lanes256)
        {
            var vData = Vector256.LoadUnsafe(ref start, (nuint)i);
            var vLess = Avx2.CompareGreaterThan(vTarget, vData);

            // MoveMask gathers the top bit of every byte, i.e. 4 bits per int lane.
            // After inversion, a set bit marks a lane that is NOT less than target.
            var notLess = ~(uint)Avx2.MoveMask(vLess.AsByte());
            if (notLess != 0)
            {
                // 4 mask bits per lane: divide the bit position by 4 to get the lane index.
                return i + (BitOperations.TrailingZeroCount(notLess) / 4);
            }
        }

        return LinearScan(keys.Slice(i), target) + i;
    }

    // 256-bit scan for the first element > target; the remainder is scanned linearly.
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static int ScanAvx2Greater(ReadOnlySpan<int> keys, int target)
    {
        var vTarget = Vector256.Create(target);
        ref int start = ref MemoryMarshal.GetReference(keys);
        var len = keys.Length;
        var i = 0;

        for (; i <= len - Lanes256; i += Lanes256)
        {
            var vData = Vector256.LoadUnsafe(ref start, (nuint)i);
            var vGreater = Avx2.CompareGreaterThan(vData, vTarget);

            // 4 mask bits per matching int lane (one per byte).
            var mask = (uint)Avx2.MoveMask(vGreater.AsByte());
            if (mask != 0)
            {
                return i + (BitOperations.TrailingZeroCount(mask) / 4);
            }
        }

        return LinearScanGreater(keys.Slice(i), target) + i;
    }

    // 512-bit scan for the first element >= target.
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static int ScanAvx512(ReadOnlySpan<int> keys, int target)
    {
        var vTarget = Vector512.Create(target);
        ref int start = ref MemoryMarshal.GetReference(keys);
        var len = keys.Length;
        var i = 0;

        for (; i <= len - Lanes512; i += Lanes512)
        {
            var vData = Vector512.LoadUnsafe(ref start, (nuint)i);

            // One mask bit per int lane.
            var bits = Vector512.GreaterThanOrEqual(vData, vTarget).ExtractMostSignificantBits();
            if (bits != 0)
            {
                return i + BitOperations.TrailingZeroCount(bits);
            }
        }

        // Fewer than 16 elements remain: let the 256-bit path take one more
        // vector step if it can before dropping to the scalar loop.
        return ScanAvx2(keys.Slice(i), target) + i;
    }

    // 512-bit scan for the first element > target.
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static int ScanAvx512Greater(ReadOnlySpan<int> keys, int target)
    {
        var vTarget = Vector512.Create(target);
        ref int start = ref MemoryMarshal.GetReference(keys);
        var len = keys.Length;
        var i = 0;

        for (; i <= len - Lanes512; i += Lanes512)
        {
            var vData = Vector512.LoadUnsafe(ref start, (nuint)i);

            // One mask bit per int lane.
            var bits = Vector512.GreaterThan(vData, vTarget).ExtractMostSignificantBits();
            if (bits != 0)
            {
                return i + BitOperations.TrailingZeroCount(bits);
            }
        }

        // Fewer than 16 elements remain: hand the tail to the 256-bit path.
        return ScanAvx2Greater(keys.Slice(i), target) + i;
    }
}