summaryrefslogtreecommitdiffstats
path: root/io_uring/napi.c
blob: 4fd6bb331e1ed9a3a607ef72b108cd56ae099cfc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
// SPDX-License-Identifier: GPL-2.0

#include "io_uring.h"
#include "napi.h"

#ifdef CONFIG_NET_RX_BUSY_POLL

/* Timeout for cleanout of stale entries. */
#define NAPI_TIMEOUT		(60 * SEC_CONVERSION)

struct io_napi_entry {
	unsigned int		napi_id;
	struct list_head	list;

	unsigned long		timeout;
	struct hlist_node	node;

	struct rcu_head		rcu;
};

static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list,
					       unsigned int napi_id)
{
	struct io_napi_entry *e;

	hlist_for_each_entry_rcu(e, hash_list, node) {
		if (e->napi_id != napi_id)
			continue;
		e->timeout = jiffies + NAPI_TIMEOUT;
		return e;
	}

	return NULL;
}

static inline ktime_t net_to_ktime(unsigned long t)
{
	/* napi approximating usecs, reverse busy_loop_current_time */
	return ns_to_ktime(t << 10);
}

void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock)
{
	struct hlist_head *hash_list;
	unsigned int napi_id;
	struct sock *sk;
	struct io_napi_entry *e;

	sk = sock->sk;
	if (!sk)
		return;

	napi_id = READ_ONCE(sk->sk_napi_id);

	/* Non-NAPI IDs can be rejected. */
	if (napi_id < MIN_NAPI_ID)
		return;

	hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];

	rcu_read_lock();
	e = io_napi_hash_find(hash_list, napi_id);
	if (e) {
		e->timeout = jiffies + NAPI_TIMEOUT;
		rcu_read_unlock();
		return;
	}
	rcu_read_unlock();

	e = kmalloc(sizeof(*e), GFP_NOWAIT);
	if (!e)
		return;

	e->napi_id = napi_id;
	e->timeout = jiffies + NAPI_TIMEOUT;

	spin_lock(&ctx->napi_lock);
	if (unlikely(io_napi_hash_find(hash_list, napi_id))) {
		spin_unlock(&ctx->napi_lock);
		kfree(e);
		return;
	}

	hlist_add_tail_rcu(&e->node, hash_list);
	list_add_tail(&e->list, &ctx->napi_list);
	spin_unlock(&ctx->napi_lock);
}

static void __io_napi_remove_stale(struct io_ring_ctx *ctx)
{
	struct io_napi_entry *e;
	unsigned int i;

	spin_lock(&ctx->napi_lock);
	hash_for_each(ctx->napi_ht, i, e, node) {
		if (time_after(jiffies, e->timeout)) {
			list_del(&e->list);
			hash_del_rcu(&e->node);
			kfree_rcu(e, rcu);
		}
	}
	spin_unlock(&ctx->napi_lock);
}

static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale)
{
	if (is_stale)
		__io_napi_remove_stale(ctx);
}

static inline bool io_napi_busy_loop_timeout(ktime_t start_time,
					     ktime_t bp)
{
	if (bp) {
		ktime_t end_time = ktime_add(start_time, bp);
		ktime_t now = net_to_ktime(busy_loop_current_time());

		return ktime_after(now, end_time);
	}

	return true;
}

static bool io_napi_busy_loop_should_end(void *data,
					 unsigned long start_time)
{
	struct io_wait_queue *iowq = data;

	if (signal_pending(current))
		return true;
	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
		return true;
	if (io_napi_busy_loop_timeout(net_to_ktime(start_time),
				      iowq->napi_busy_poll_dt))
		return true;

	return false;
}

static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx,
				   void *loop_end_arg)
{
	struct io_napi_entry *e;
	bool (*loop_end)(void *, unsigned long) = NULL;
	bool is_stale = false;

	if (loop_end_arg)
		loop_end = io_napi_busy_loop_should_end;

	list_for_each_entry_rcu(e, &ctx->napi_list, list) {
		napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
				   ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);

		if (time_after(jiffies, e->timeout))
			is_stale = true;
	}

	return is_stale;
}

static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
				       struct io_wait_queue *iowq)
{
	unsigned long start_time = busy_loop_current_time();
	void *loop_end_arg = NULL;
	bool is_stale = false;

	/* Singular lists use a different napi loop end check function and are
	 * only executed once.
	 */
	if (list_is_singular(&ctx->napi_list))
		loop_end_arg = iowq;

	rcu_read_lock();
	do {
		is_stale = __io_napi_do_busy_loop(ctx, loop_end_arg);
	} while (!io_napi_busy_loop_should_end(iowq, start_time) && !loop_end_arg);
	rcu_read_unlock();

	io_napi_remove_stale(ctx, is_stale);
}

/*
 * io_napi_init() - Init napi settings
 * @ctx: pointer to io-uring context structure
 *
 * Init napi settings in the io-uring context.
 */
void io_napi_init(struct io_ring_ctx *ctx)
{
	u64 sys_dt = READ_ONCE(sysctl_net_busy_poll) * NSEC_PER_USEC;

	INIT_LIST_HEAD(&ctx->napi_list);
	spin_lock_init(&ctx->napi_lock);
	ctx->napi_prefer_busy_poll = false;
	ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt);
}

/*
 * io_napi_free() - Deallocate napi
 * @ctx: pointer to io-uring context structure
 *
 * Free the napi list and the hash table in the io-uring context.
 */
void io_napi_free(struct io_ring_ctx *ctx)
{
	struct io_napi_entry *e;
	LIST_HEAD(napi_list);
	unsigned int i;

	spin_lock(&ctx->napi_lock);
	hash_for_each(ctx->napi_ht, i, e, node) {
		hash_del_rcu(&e->node);
		kfree_rcu(e, rcu);
	}
	spin_unlock(&ctx->napi_lock);
}

/*
 * io_napi_register() - Register napi with io-uring
 * @ctx: pointer to io-uring context structure
 * @arg: pointer to io_uring_napi structure
 *
 * Register napi in the io-uring context.
 */
int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
{
	const struct io_uring_napi curr = {
		.busy_poll_to 	  = ktime_to_us(ctx->napi_busy_poll_dt),
		.prefer_busy_poll = ctx->napi_prefer_busy_poll
	};
	struct io_uring_napi napi;

	if (ctx->flags & IORING_SETUP_IOPOLL)
		return -EINVAL;
	if (copy_from_user(&napi, arg, sizeof(napi)))
		return -EFAULT;
	if (napi.pad[0] || napi.pad[1] || napi.pad[2] || napi.resv)
		return -EINVAL;

	if (copy_to_user(arg, &curr, sizeof(curr)))
		return -EFAULT;

	WRITE_ONCE(ctx->napi_busy_poll_dt, napi.busy_poll_to * NSEC_PER_USEC);
	WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi.prefer_busy_poll);
	WRITE_ONCE(ctx->napi_enabled, true);
	return 0;
}

/*
 * io_napi_unregister() - Unregister napi with io-uring
 * @ctx: pointer to io-uring context structure
 * @arg: pointer to io_uring_napi structure
 *
 * Unregister napi. If arg has been specified copy the busy poll timeout and
 * prefer busy poll setting to the passed in structure.
 */
int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
{
	const struct io_uring_napi curr = {
		.busy_poll_to 	  = ktime_to_us(ctx->napi_busy_poll_dt),
		.prefer_busy_poll = ctx->napi_prefer_busy_poll
	};

	if (arg && copy_to_user(arg, &curr, sizeof(curr)))
		return -EFAULT;

	WRITE_ONCE(ctx->napi_busy_poll_dt, 0);
	WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
	WRITE_ONCE(ctx->napi_enabled, false);
	return 0;
}

/*
 * __io_napi_adjust_timeout() - adjust busy loop timeout
 * @ctx: pointer to io-uring context structure
 * @iowq: pointer to io wait queue
 * @ts: pointer to timespec or NULL
 *
 * Adjust the busy loop timeout according to timespec and busy poll timeout.
 * If the specified NAPI timeout is bigger than the wait timeout, then adjust
 * the NAPI timeout accordingly.
 */
void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq,
			      ktime_t to_wait)
{
	ktime_t poll_dt = READ_ONCE(ctx->napi_busy_poll_dt);

	if (to_wait)
		poll_dt = min(poll_dt, to_wait);

	iowq->napi_busy_poll_dt = poll_dt;
}

/*
 * __io_napi_busy_loop() - execute busy poll loop
 * @ctx: pointer to io-uring context structure
 * @iowq: pointer to io wait queue
 *
 * Execute the busy poll loop and merge the spliced off list.
 */
void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq)
{
	iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll);

	if (!(ctx->flags & IORING_SETUP_SQPOLL) && ctx->napi_enabled)
		io_napi_blocking_busy_loop(ctx, iowq);
}

/*
 * io_napi_sqpoll_busy_poll() - busy poll loop for sqpoll
 * @ctx: pointer to io-uring context structure
 *
 * Splice of the napi list and execute the napi busy poll loop.
 */
int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx)
{
	LIST_HEAD(napi_list);
	bool is_stale = false;

	if (!READ_ONCE(ctx->napi_busy_poll_dt))
		return 0;
	if (list_empty_careful(&ctx->napi_list))
		return 0;

	rcu_read_lock();
	is_stale = __io_napi_do_busy_loop(ctx, NULL);
	rcu_read_unlock();

	io_napi_remove_stale(ctx, is_stale);
	return 1;
}

#endif