src/simd/isa.zig


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170

const std = @import("std");
const builtin = @import("builtin");
const assert = std.debug.assert;
const x86_64 = @import("x86_64.zig");

/// One row of the comptime ISA table: the name of a possible ISA plus the
/// CPU architectures on which that ISA can exist at all (e.g. neon is only
/// possible on aarch64). Being listed here says nothing about whether the
/// ISA is actually available at runtime.
const Entry = struct {
    const Arch = std.Target.Cpu.Arch;

    /// Name of the ISA. Used as the field name of the generated `ISA` enum.
    name: [:0]const u8,

    /// Architectures the ISA is possible on. An empty list means the ISA
    /// is not restricted to any architecture.
    arch: []const Arch = &.{},
};

/// Every ISA we know about. The index of an entry in this table is also
/// the integer value of the corresponding `ISA` enum field.
///
/// NOTE(review): avx2 is listed as possible on 32-bit x86, but `detect`
/// only probes x86_64, so avx2 is never reported at runtime on x86.
/// Confirm whether that is intentional.
const entries: []const Entry = &[_]Entry{
    .{ .name = "scalar", .arch = &.{} },
    .{ .name = "neon", .arch = &.{.aarch64} },
    .{ .name = "avx2", .arch = &.{ .x86, .x86_64 } },
};

/// The ISAs our SIMD operations can be implemented with, generated at
/// comptime from `entries`: one field per entry, named after the entry and
/// valued by its index. The set is deliberately coarse-grained: each tag
/// stands for a possible implementation, not for an individual CPU
/// feature flag.
pub const ISA = isa: {
    var fields: [entries.len]std.builtin.Type.EnumField = undefined;
    for (&fields, entries, 0..) |*field, entry, index| {
        field.* = .{ .name = entry.name, .value = index };
    }

    // Smallest unsigned integer able to hold every entry index.
    const Tag = std.math.IntFittingRange(0, entries.len - 1);

    break :isa @Type(.{ .Enum = .{
        .tag_type = Tag,
        .fields = &fields,
        .decls = &.{},
        .is_exhaustive = true,
    } });
};

/// A set of ISAs, implemented as a `std.EnumSet` keyed by `ISA`. This is
/// the type returned by `detect` and consumed by `preferred`.
pub const Set = std.EnumSet(ISA);

/// Report whether `isa` can exist on the architecture we are compiling
/// for. The answer depends only on the build target, so this can be
/// evaluated at comptime to keep implementations for foreign
/// architectures from being used.
///
/// An ISA whose table entry lists no architectures is possible everywhere.
pub fn possible(comptime isa: ISA) bool {
    const wanted = @tagName(isa);

    inline for (entries) |candidate| {
        if (std.mem.eql(u8, candidate.name, wanted)) {
            // No architecture restriction at all: always valid.
            if (candidate.arch.len == 0) return true;

            return std.mem.indexOfScalar(
                std.Target.Cpu.Arch,
                candidate.arch,
                builtin.cpu.arch,
            ) != null;
        }
    }

    // Every ISA tag is generated from `entries`, so a match always exists.
    unreachable;
}

/// Build the set of ISAs usable on the machine we are running on.
///
/// `.scalar` is always included. On aarch64 `.neon` is added without any
/// probing; on x86_64 the CPU is queried through `detectX86`. Every other
/// architecture reports only `.scalar`.
pub fn detect() Set {
    var available = Set.initOne(.scalar);

    switch (builtin.cpu.arch) {
        .x86_64 => detectX86(&available),
        // Neon is mandatory on aarch64. No runtime checks necessary.
        .aarch64 => available.insert(.neon),
        else => {},
    }

    return available;
}

/// Pick the best ISA out of `set`.
///
/// Candidates are tried in a fixed priority order (avx2, then neon, then
/// scalar). A candidate is chosen only if it is possible on the build
/// target and present in `set`. If nothing matches, `.scalar` is returned.
pub fn preferred(set: Set) ISA {
    const priority: []const ISA = &.{ .avx2, .neon, .scalar };

    // Compile-time guard: every ISA must have a place in the priority
    // list, so adding a new ISA without ranking it fails the build.
    comptime {
        for (@typeInfo(ISA).Enum.fields) |field| {
            const tag: ISA = @enumFromInt(field.value);
            assert(std.mem.indexOfScalar(ISA, priority, tag) != null);
        }
    }

    inline for (priority) |candidate| {
        const usable = comptime possible(candidate);
        if (usable and set.contains(candidate)) return candidate;
    }

    return .scalar;
}

/// Probe the CPU with CPUID/XCR0 and insert `.avx2` into `set` if it is
/// usable. Leaves `set` untouched otherwise.
///
/// NOTE: this only detects AVX2 for now. Earlier forms of SIMD such as
/// plain SSE could probably be supported, and later forms could
/// definitely be taken advantage of.
fn detectX86(set: *Set) void {
    // Leaf 0 reports the maximum supported leaf in EAX. AVX2 is reported
    // in leaf 7, so anything lower means no AVX2.
    const max_leaf = x86_64.cpuid(0, 0).eax;
    if (max_leaf < 7) return;

    // Leaf 1, ECX: bit 27 (OSXSAVE, i.e. the OS has enabled XSAVE) and
    // bit 28 (AVX). Both are required before XCR0 may be read.
    const features = x86_64.cpuid(1, 0);
    const os_xsave = hasBit(features.ecx, 27);
    const avx = hasBit(features.ecx, 28);
    if (!(os_xsave and avx)) return;

    // AVX instructions are only usable when the XMM and YMM register
    // state is being saved, which XCR0 tells us.
    const xcr0 = x86_64.getXCR0();
    const avx_state = x86_64.XCR0_XMM | x86_64.XCR0_YMM;
    if (!hasMask(xcr0, avx_state)) return;

    // Leaf 7 (subleaf 0), EBX bit 5: AVX2.
    const extended = x86_64.cpuid(7, 0);
    if (hasBit(extended.ebx, 5)) set.insert(.avx2);
}

/// Return true if the bit at position `offset` (0 = least significant)
/// is set in `input`.
pub inline fn hasBit(input: u32, offset: u5) bool {
    const bit = @as(u32, 1) << offset;
    return (input & bit) != 0;
}

/// Return true if every bit set in `mask` is also set in `input`. Extra
/// bits in `input` are ignored, and an empty mask always matches.
pub inline fn hasMask(input: u32, mask: u32) bool {
    const missing = mask & ~input;
    return missing == 0;
}

test "detect" {
    const available = detect();

    // Scalar is reported on every target.
    try std.testing.expect(available.contains(.scalar));

    if (builtin.cpu.arch == .aarch64) {
        // Neon is always available on aarch64, and avx2 never is.
        try std.testing.expect(available.contains(.neon));
        try std.testing.expect(!available.contains(.avx2));
    }
}

test "preferred" {
    const testing = std.testing;
    const set = detect();
    const isa = preferred(set);

    // The chosen ISA must be one that detection actually reported.
    // `detect` always includes `.scalar`, so this holds even when
    // `preferred` takes its scalar fallback.
    try testing.expect(set.contains(isa));
}

test "possible" {
    const expect = std.testing.expect;
    const arch = builtin.cpu.arch;

    // Scalar has no architecture restriction, so it is always possible.
    try expect(possible(.scalar));

    // Hardcode some other common realities.
    if (arch == .aarch64) {
        try expect(possible(.neon));
        try expect(!possible(.avx2));
    } else if (arch == .x86 or arch == .x86_64) {
        try expect(!possible(.neon));
        try expect(possible(.avx2));
    }
}