
Commit 6bcb772

Merge branch 'inet-frags-flush-pending-skbs-in-fqdir_pre_exit'
Jakub Kicinski says:

====================
inet: frags: flush pending skbs in fqdir_pre_exit()

Fix the issue reported by NIPA starting on Sep 18th [1], where pernet_ops_rwsem is constantly held by a reader, preventing writers from grabbing it (specifically, preventing driver modules from loading). The fact that the reports started around that time seems coincidental. The issue seems to be skbs queued for defrag preventing conntrack from exiting.

The first patch fixes another, theoretical issue; it is mostly a leftover from an attempt to get rid of the inet_frag_queue refcnt, which I gave up on (I still think it is doable, but it is a bit of a time sink). The second patch is a minor refactor. The real fix is in the third patch; it is the simplest fix I can think of, which is to flush the frag queues. Perhaps someone has a better suggestion?

The last patch adds an explicit warning for conntrack getting stuck, as this seems like something that can easily happen if bugs sneak in. The warning will hopefully save us the first 20% of the investigation effort.

Link: https://lore.kernel.org/20251001082036.0fc51440@kernel.org # [1]
====================

Link: https://patch.msgid.link/20251207010942.1672972-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2 parents 2f6e056 + 92df4c5 commit 6bcb772
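
In outline (per the cover letter above), skbs still sitting on IP defrag queues keep conntrack from exiting, presumably via the conntrack references those skbs carry, so a dying netns could spin in cleanup while pernet_ops_rwsem was held. The fix makes fqdir_pre_exit() walk the per-netns frag hash table and drop whatever is still queued. Below is a condensed sketch, boiled down from the net/ipv4/inet_fragment.c diff further down, not a literal copy of the patch:

/* Condensed sketch of the new fqdir_pre_exit(); see the full diff below. */
void fqdir_pre_exit(struct fqdir *fqdir)
{
	struct inet_frag_queue *fq;
	struct rhashtable_iter hti;

	/* Stop new frag queues from being created... */
	WRITE_ONCE(fqdir->high_thresh, 0);
	/* ...and tell the timer/expire paths that the fqdir is going away. */
	WRITE_ONCE(fqdir->dead, true);

	/* Walk every existing queue and free the skbs it still holds.
	 * ERR_PTR(-EAGAIN) only means a concurrent resize restarted the
	 * walk, so some queues may be visited twice; that is harmless here.
	 */
	rhashtable_walk_enter(&fqdir->rhashtable, &hti);
	rhashtable_walk_start(&hti);
	while ((fq = rhashtable_walk_next(&hti))) {
		if (IS_ERR(fq)) {
			if (PTR_ERR(fq) != -EAGAIN)
				break;
			continue;
		}
		spin_lock_bh(&fq->lock);
		if (!(fq->flags & INET_FRAG_COMPLETE))
			inet_frag_queue_flush(fq, 0);
		spin_unlock_bh(&fq->lock);
	}
	rhashtable_walk_stop(&hti);
	rhashtable_walk_exit(&hti);
}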

5 files changed: 72 additions, 35 deletions

include/net/inet_frag.h

Lines changed: 3 additions & 15 deletions
@@ -123,27 +123,15 @@ void inet_frags_fini(struct inet_frags *);
 
 int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net);
 
-static inline void fqdir_pre_exit(struct fqdir *fqdir)
-{
-	/* Prevent creation of new frags.
-	 * Pairs with READ_ONCE() in inet_frag_find().
-	 */
-	WRITE_ONCE(fqdir->high_thresh, 0);
-
-	/* Pairs with READ_ONCE() in inet_frag_kill(), ip_expire()
-	 * and ip6frag_expire_frag_queue().
-	 */
-	WRITE_ONCE(fqdir->dead, true);
-}
+void fqdir_pre_exit(struct fqdir *fqdir);
 void fqdir_exit(struct fqdir *fqdir);
 
 void inet_frag_kill(struct inet_frag_queue *q, int *refs);
 void inet_frag_destroy(struct inet_frag_queue *q);
 struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key);
 
-/* Free all skbs in the queue; return the sum of their truesizes. */
-unsigned int inet_frag_rbtree_purge(struct rb_root *root,
-				    enum skb_drop_reason reason);
+void inet_frag_queue_flush(struct inet_frag_queue *q,
+			   enum skb_drop_reason reason);
 
 static inline void inet_frag_putn(struct inet_frag_queue *q, int refs)
 {

include/net/ipv6_frag.h

Lines changed: 6 additions & 3 deletions
@@ -69,9 +69,6 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq)
 	int refs = 1;
 
 	rcu_read_lock();
-	/* Paired with the WRITE_ONCE() in fqdir_pre_exit(). */
-	if (READ_ONCE(fq->q.fqdir->dead))
-		goto out_rcu_unlock;
 	spin_lock(&fq->q.lock);
 
 	if (fq->q.flags & INET_FRAG_COMPLETE)
@@ -80,6 +77,12 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq)
 	fq->q.flags |= INET_FRAG_DROP;
 	inet_frag_kill(&fq->q, &refs);
 
+	/* Paired with the WRITE_ONCE() in fqdir_pre_exit(). */
+	if (READ_ONCE(fq->q.fqdir->dead)) {
+		inet_frag_queue_flush(&fq->q, 0);
+		goto out;
+	}
+
 	dev = dev_get_by_index_rcu(net, fq->iif);
 	if (!dev)
 		goto out;

net/ipv4/inet_fragment.c

Lines changed: 51 additions & 4 deletions
@@ -218,6 +218,41 @@ static int __init inet_frag_wq_init(void)
 
 pure_initcall(inet_frag_wq_init);
 
+void fqdir_pre_exit(struct fqdir *fqdir)
+{
+	struct inet_frag_queue *fq;
+	struct rhashtable_iter hti;
+
+	/* Prevent creation of new frags.
+	 * Pairs with READ_ONCE() in inet_frag_find().
+	 */
+	WRITE_ONCE(fqdir->high_thresh, 0);
+
+	/* Pairs with READ_ONCE() in inet_frag_kill(), ip_expire()
+	 * and ip6frag_expire_frag_queue().
+	 */
+	WRITE_ONCE(fqdir->dead, true);
+
+	rhashtable_walk_enter(&fqdir->rhashtable, &hti);
+	rhashtable_walk_start(&hti);
+
+	while ((fq = rhashtable_walk_next(&hti))) {
+		if (IS_ERR(fq)) {
+			if (PTR_ERR(fq) != -EAGAIN)
+				break;
+			continue;
+		}
+		spin_lock_bh(&fq->lock);
+		if (!(fq->flags & INET_FRAG_COMPLETE))
+			inet_frag_queue_flush(fq, 0);
+		spin_unlock_bh(&fq->lock);
+	}
+
+	rhashtable_walk_stop(&hti);
+	rhashtable_walk_exit(&hti);
+}
+EXPORT_SYMBOL(fqdir_pre_exit);
+
 void fqdir_exit(struct fqdir *fqdir)
 {
 	INIT_WORK(&fqdir->destroy_work, fqdir_work_fn);
@@ -263,8 +298,8 @@ static void inet_frag_destroy_rcu(struct rcu_head *head)
 	kmem_cache_free(f->frags_cachep, q);
 }
 
-unsigned int inet_frag_rbtree_purge(struct rb_root *root,
-				    enum skb_drop_reason reason)
+static unsigned int
+inet_frag_rbtree_purge(struct rb_root *root, enum skb_drop_reason reason)
 {
 	struct rb_node *p = rb_first(root);
 	unsigned int sum = 0;
@@ -284,7 +319,17 @@ unsigned int inet_frag_rbtree_purge(struct rb_root *root,
 	}
 	return sum;
 }
-EXPORT_SYMBOL(inet_frag_rbtree_purge);
+
+void inet_frag_queue_flush(struct inet_frag_queue *q,
+			   enum skb_drop_reason reason)
+{
+	unsigned int sum;
+
+	reason = reason ?: SKB_DROP_REASON_FRAG_REASM_TIMEOUT;
+	sum = inet_frag_rbtree_purge(&q->rb_fragments, reason);
+	sub_frag_mem_limit(q->fqdir, sum);
+}
+EXPORT_SYMBOL(inet_frag_queue_flush);
 
 void inet_frag_destroy(struct inet_frag_queue *q)
 {
@@ -327,7 +372,9 @@ static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir,
 
 	timer_setup(&q->timer, f->frag_expire, 0);
 	spin_lock_init(&q->lock);
-	/* One reference for the timer, one for the hash table. */
+	/* One reference for the timer, one for the hash table.
+	 * We never take any extra references, only decrement this field.
+	 */
 	refcount_set(&q->refcnt, 2);
 
 	return q;
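
For reference, the helper added above takes an skb drop reason, with 0 meaning "use the default": the GNU ?: shorthand (reason = reason ?: SKB_DROP_REASON_FRAG_REASM_TIMEOUT) behaves like "if (!reason) reason = SKB_DROP_REASON_FRAG_REASM_TIMEOUT". Condensed from the other hunks in this commit, the two calling styles are:

/* Teardown/expire paths let the helper pick the default drop reason... */
inet_frag_queue_flush(&qp->q, 0);
/* ...while ip_frag_reinit() passes an explicit one. */
inet_frag_queue_flush(&qp->q, SKB_DROP_REASON_FRAG_TOO_FAR);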

net/ipv4/ip_fragment.c

Lines changed: 9 additions & 13 deletions
@@ -134,18 +134,20 @@ static void ip_expire(struct timer_list *t)
 	net = qp->q.fqdir->net;
 
 	rcu_read_lock();
-
-	/* Paired with WRITE_ONCE() in fqdir_pre_exit(). */
-	if (READ_ONCE(qp->q.fqdir->dead))
-		goto out_rcu_unlock;
-
 	spin_lock(&qp->q.lock);
 
 	if (qp->q.flags & INET_FRAG_COMPLETE)
 		goto out;
 
 	qp->q.flags |= INET_FRAG_DROP;
 	inet_frag_kill(&qp->q, &refs);
+
+	/* Paired with WRITE_ONCE() in fqdir_pre_exit(). */
+	if (READ_ONCE(qp->q.fqdir->dead)) {
+		inet_frag_queue_flush(&qp->q, 0);
+		goto out;
+	}
+
 	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
 	__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
 
@@ -240,16 +242,10 @@ static int ip_frag_too_far(struct ipq *qp)
 
 static int ip_frag_reinit(struct ipq *qp)
 {
-	unsigned int sum_truesize = 0;
-
-	if (!mod_timer(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) {
-		refcount_inc(&qp->q.refcnt);
+	if (!mod_timer_pending(&qp->q.timer, jiffies + qp->q.fqdir->timeout))
 		return -ETIMEDOUT;
-	}
 
-	sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments,
-					      SKB_DROP_REASON_FRAG_TOO_FAR);
-	sub_frag_mem_limit(qp->q.fqdir, sum_truesize);
+	inet_frag_queue_flush(&qp->q, SKB_DROP_REASON_FRAG_TOO_FAR);
 
 	qp->q.flags = 0;
 	qp->q.len = 0;
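
A note on the ip_frag_reinit() hunk above, going by the usual timer API semantics: mod_timer() (re)arms the timer even when it has already expired, which is why the old code had to take an extra reference for the timer it was effectively re-creating, while mod_timer_pending() only extends a timer that is still pending and returns 0 otherwise. Read as a sketch, the new path is:

/* Illustrative reading of the change above, not extra code in the patch. */
if (!mod_timer_pending(&qp->q.timer, jiffies + qp->q.fqdir->timeout))
	return -ETIMEDOUT;	/* timer already fired; don't re-arm, report timeout */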

net/netfilter/nf_conntrack_core.c

Lines changed: 3 additions & 0 deletions
@@ -2487,6 +2487,7 @@ void nf_conntrack_cleanup_net(struct net *net)
 void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
 {
 	struct nf_ct_iter_data iter_data = {};
+	unsigned long start = jiffies;
 	struct net *net;
 	int busy;
 
@@ -2507,6 +2508,8 @@ void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
 			busy = 1;
 	}
 	if (busy) {
+		DEBUG_NET_WARN_ONCE(time_after(jiffies, start + 60 * HZ),
+				    "conntrack cleanup blocked for 60s");
 		schedule();
 		goto i_see_dead_people;
 	}
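
The check added above warns at most once, and only if the cleanup loop has been spinning for more than a minute. Open-coded, the pattern looks roughly like the sketch below; the DEBUG_NET_WARN_ONCE() wrapper is presumably compiled out unless CONFIG_DEBUG_NET is set, like the other DEBUG_NET_* helpers, so production builds are unaffected.

/* Rough open-coded equivalent of the warning above (sketch only). */
unsigned long start = jiffies;

i_see_dead_people:
	/* ...try to evict every conntrack entry of the dying netns... */
	if (busy) {
		if (time_after(jiffies, start + 60 * HZ))
			pr_warn_once("conntrack cleanup blocked for 60s\n");
		schedule();
		goto i_see_dead_people;
	}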
