summaryrefslogtreecommitdiff
path: root/src/os/cgroup.zig
blob: 4b5ccc4d3617bb03dd459923dc86382e54482684 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
const std = @import("std");
const assert = std.debug.assert;
const linux = std.os.linux;
const posix = std.posix;
const Allocator = std.mem.Allocator;

const log = std.log.scoped(.@"linux-cgroup");

/// Returns the path to the cgroup for the given pid.
///
/// The path is read from `/proc/<pid>/cgroup`. Each line of that file has
/// the form `hierarchy-ID:controller-list:cgroup-path`, for example:
///
///     0::/user.slice/user-1000.slice/session-1.scope
///
/// The entry for the unified (v2) hierarchy, which always starts with
/// `0::`, is preferred. If there is no such entry the last well-formed
/// line is used instead. Returns null if no line has all three fields.
///
/// The returned slice is allocated with `alloc` and owned by the caller.
/// Errors from formatting the path, opening or reading the file, and
/// allocation are propagated.
pub fn current(alloc: Allocator, pid: std.os.linux.pid_t) !?[]const u8 {
    var buf: [std.fs.max_path_bytes]u8 = undefined;

    const path = try std.fmt.bufPrint(&buf, "/proc/{}/cgroup", .{pid});
    const file = try std.fs.cwd().openFile(path, .{});
    defer file.close();

    // Read it all into memory -- we don't expect this file to ever be that large.
    const contents = try file.readToEndAlloc(
        alloc,
        1 * 1024 * 1024, // 1MB
    );
    defer alloc.free(contents);

    // The cgroup path is everything after the SECOND ':' of a line. We must
    // not search for the last ':' because cgroup directory names may
    // themselves contain ':' (e.g. systemd units such as
    // "dbus-:1.2-org.example@0.service"), which would truncate the path.
    var fallback: ?[]const u8 = null;
    var lines = std.mem.tokenizeAny(u8, contents, "\r\n");
    const found: []const u8 = while (lines.next()) |line| {
        const first = std.mem.indexOfScalar(u8, line, ':') orelse continue;
        const second = std.mem.indexOfScalarPos(u8, line, first + 1, ':') orelse continue;
        const cgroup_path = std.mem.trimRight(u8, line[second + 1 ..], " ");

        // Hierarchy ID 0 with an empty controller list is the v2 entry.
        if (std.mem.eql(u8, line[0 .. second + 1], "0::")) break cgroup_path;

        // Otherwise remember the most recent well-formed line.
        fallback = cgroup_path;
    } else (fallback orelse return null);

    return try alloc.dupe(u8, found);
}

/// Create the cgroup `child` beneath `cgroup` in the cgroup filesystem,
/// creating any missing parent directories along the way.
///
/// No process is moved into the new cgroup unless `move` is non-null, in
/// which case that pid is written to the new cgroup's `cgroup.procs` file.
pub fn create(
    cgroup: []const u8,
    child: []const u8,
    move: ?std.os.linux.pid_t,
) !void {
    var path_buf: [std.fs.max_path_bytes]u8 = undefined;

    const dir_path = try std.fmt.bufPrint(
        &path_buf,
        "/sys/fs/cgroup{s}/{s}",
        .{ cgroup, child },
    );
    try std.fs.cwd().makePath(dir_path);

    // Nothing more to do unless the caller asked us to move a process.
    const pid = move orelse return;

    // `dir_path` is no longer needed, so the path buffer can be reused.
    const procs_path = try std.fmt.bufPrint(
        &path_buf,
        "/sys/fs/cgroup{s}/{s}/cgroup.procs",
        .{ cgroup, child },
    );
    const procs = try std.fs.cwd().openFile(procs_path, .{ .mode = .write_only });
    defer procs.close();

    var scratch: [64]u8 = undefined;
    var procs_writer = procs.writer(&scratch);
    const out = &procs_writer.interface;
    try out.print("{}", .{pid});
    try out.flush();
}

/// Delete the directory for `cgroup` from the cgroup filesystem.
///
/// `cgroup` must be non-empty and start with '/', i.e. it is relative to
/// the cgroup root (e.g. "/user.slice/surfaces/abc123.scope"). Removal
/// only succeeds if the cgroup is empty (has no processes).
///
/// A cgroup that does not exist is not treated as an error; every other
/// failure is returned to the caller.
pub fn remove(cgroup: []const u8) !void {
    assert(cgroup.len > 0);
    assert(cgroup[0] == '/');

    var path_buf: [std.fs.max_path_bytes]u8 = undefined;
    const dir_path = try std.fmt.bufPrint(&path_buf, "/sys/fs/cgroup{s}", .{cgroup});

    std.fs.cwd().deleteDir(dir_path) catch |err| {
        // Already gone (perhaps cleaned up earlier) is fine; anything else
        // means we failed to delete it and the caller should know.
        if (err != error.FileNotFound) return err;
    };
}

/// Move the given PID into the given cgroup.
///
/// This writes the decimal pid to `/sys/fs/cgroup<cgroup>/cgroup.procs`.
/// Errors from formatting the path, opening the file, or writing to it
/// are returned to the caller.
pub fn moveInto(
    cgroup: []const u8,
    pid: std.os.linux.pid_t,
) !void {
    var buf: [std.fs.max_path_bytes]u8 = undefined;
    const path = try std.fmt.bufPrint(&buf, "/sys/fs/cgroup{s}/cgroup.procs", .{cgroup});
    const file = try std.fs.cwd().openFile(path, .{ .mode = .write_only });
    defer file.close();

    // File writers are buffered and must be given their buffer up front;
    // the data only reaches the file once the interface is flushed.
    var file_buf: [64]u8 = undefined;
    var writer = file.writer(&file_buf);
    try writer.interface.print("{}", .{pid});
    try writer.interface.flush();
}

/// Use clone3 to have the kernel create a new process with the correct cgroup
/// rather than moving the process to the correct cgroup later.
///
/// `cgroup` is appended to "/sys/fs/cgroup" to locate the cgroup directory.
/// On success the raw clone3 result is returned as a pid (per clone3
/// semantics this is the child's pid in the parent and 0 in the child).
/// Failure to open the cgroup directory or to clone is logged and
/// reported as `error.CloneError`.
pub fn cloneInto(cgroup: []const u8) !posix.pid_t {
    var buf: [std.fs.max_path_bytes]u8 = undefined;
    const path = try std.fmt.bufPrintZ(&buf, "/sys/fs/cgroup{s}", .{cgroup});

    // Get a file descriptor that refers to the cgroup directory in the cgroup
    // sysfs to pass to the kernel in clone3.
    const fd: linux.fd_t = fd: {
        const rc = linux.open(
            path,
            .{
                // Self-explanatory: we expect to open a directory, and
                // we only need the path-level permissions.
                .PATH = true,
                .DIRECTORY = true,

                // We don't want to leak this fd to the child process
                // when we clone below since we're using this fd for
                // a cgroup clone.
                .CLOEXEC = true,
            },
            0,
        );

        // linux.open is a direct syscall, so the error is encoded in the
        // return value. Do not use posix.errno here: when linking libc it
        // reads the libc errno, which a direct syscall never sets, so a
        // failed open would be mistaken for success.
        switch (linux.E.init(rc)) {
            .SUCCESS => break :fd @as(linux.fd_t, @intCast(rc)),
            else => |errno| {
                log.err("unable to open cgroup dir {s}: {}", .{ path, errno });
                return error.CloneError;
            },
        }
    };
    assert(fd >= 0);
    defer _ = linux.close(fd);

    // Argument block handed to clone3; every field is 64 bits wide.
    // Everything except the flags, exit signal and target cgroup is zero.
    const args: extern struct {
        flags: u64,
        pidfd: u64,
        child_tid: u64,
        parent_tid: u64,
        exit_signal: u64,
        stack: u64,
        stack_size: u64,
        tls: u64,
        set_tid: u64,
        set_tid_size: u64,
        cgroup: u64,
    } = .{
        .flags = linux.CLONE.INTO_CGROUP,
        .pidfd = 0,
        .child_tid = 0,
        .parent_tid = 0,
        .exit_signal = linux.SIG.CHLD,
        .stack = 0,
        .stack_size = 0,
        .tls = 0,
        .set_tid = 0,
        .set_tid_size = 0,
        .cgroup = @intCast(fd),
    };

    const rc = linux.syscall2(linux.SYS.clone3, @intFromPtr(&args), @sizeOf(@TypeOf(args)));
    // do not use posix.errno, when linking libc it will use the libc errno which will not be set when making the syscall directly
    return switch (std.os.linux.E.init(rc)) {
        .SUCCESS => @as(posix.pid_t, @intCast(rc)),
        else => |errno| err: {
            log.err("unable to clone: {}", .{errno});
            break :err error.CloneError;
        },
    };
}

/// Returns the controllers available to the given cgroup, as listed in
/// its `cgroup.controllers` file. The cgroup must have a '/'-prefix.
///
/// The result is the raw space-separated list read from the /sys/fs
/// directory with trailing whitespace removed. Returning the raw list
/// avoids extra work: iterating over it is easy and much cheaper than
/// allocating a copy of every entry for an array.
///
/// The returned slice is allocated with `alloc` and owned by the caller.
pub fn controllers(alloc: Allocator, cgroup: []const u8) ![]const u8 {
    assert(cgroup[0] == '/');

    var path_buf: [std.fs.max_path_bytes]u8 = undefined;
    const controllers_path = try std.fmt.bufPrint(
        &path_buf,
        "/sys/fs/cgroup{s}/cgroup.controllers",
        .{cgroup},
    );

    const file = try std.fs.cwd().openFile(controllers_path, .{});
    defer file.close();

    // The file is expected to be tiny; 1MB is a generous upper bound.
    const max_bytes = 1 * 1024 * 1024;
    const raw = try file.readToEndAlloc(alloc, max_bytes);
    defer alloc.free(raw);

    // Hand back a right-trimmed copy so `raw` can be freed on exit.
    return try alloc.dupe(u8, std.mem.trimRight(u8, raw, " \r\n"));
}

/// Set which controllers are enabled for the children of the given
/// cgroup by writing `v` to its `cgroup.subtree_control` file. `v` must
/// already be in a format valid for that file; it is written verbatim.
/// The cgroup must have a '/'-prefix.
pub fn configureControllers(
    cgroup: []const u8,
    v: []const u8,
) !void {
    assert(cgroup[0] == '/');

    // Locate the subtree control file for this cgroup.
    var path_buf: [std.fs.max_path_bytes]u8 = undefined;
    const control_path = try std.fmt.bufPrint(
        &path_buf,
        "/sys/fs/cgroup{s}/cgroup.subtree_control",
        .{cgroup},
    );

    const control = try std.fs.cwd().openFile(control_path, .{ .mode = .write_only });
    defer control.close();

    // Push the requested configuration through a buffered writer and
    // flush so it actually reaches the file.
    var scratch: [4096]u8 = undefined;
    var control_writer = control.writer(&scratch);
    const out = &control_writer.interface;
    try out.writeAll(v);
    try out.flush();
}

/// A single resource limit that can be applied to a cgroup with
/// `configureLimit`. The active field selects which control file is
/// written and its payload is the value written to that file.
pub const Limit = union(enum) {
    /// Value written to the cgroup's "memory.high" file.
    memory_high: usize,
    /// Value written to the cgroup's "pids.max" file.
    pids_max: usize,
};

/// Apply a single limit to the given cgroup. The active field of `limit`
/// selects the control file ("memory.high" or "pids.max") and carries the
/// value, which is written to that file as a decimal number.
/// The cgroup must have a '/'-prefix.
pub fn configureLimit(cgroup: []const u8, limit: Limit) !void {
    assert(cgroup[0] == '/');

    // Map the limit kind onto its control file and pull out the value.
    const filename: []const u8 = switch (limit) {
        .memory_high => "memory.high",
        .pids_max => "pids.max",
    };
    const value: usize = switch (limit) {
        .memory_high, .pids_max => |v| v,
    };

    var path_buf: [std.fs.max_path_bytes]u8 = undefined;
    const limit_path = try std.fmt.bufPrint(
        &path_buf,
        "/sys/fs/cgroup{s}/{s}",
        .{ cgroup, filename },
    );

    const limit_file = try std.fs.cwd().openFile(limit_path, .{ .mode = .write_only });
    defer limit_file.close();

    // Write the value (bytes for memory.high, a count for pids.max) and
    // flush so it actually reaches the file.
    var scratch: [4096]u8 = undefined;
    var limit_writer = limit_file.writer(&scratch);
    const out = &limit_writer.interface;
    try out.print("{}", .{value});
    try out.flush();
}