From b64d2730b2cb41b4c80d86b4f2cf3188d404972f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 23 Jul 2022 17:04:24 +0200 Subject: [PATCH] 5.10-stable patches added patches: block-fix-bounce_clone_bio-for-passthrough-bios.patch block-split-bio_kmalloc-from-bio_alloc_bioset.patch docs-net-explain-struct-net_device-lifetime.patch net-inline-rollback_registered.patch net-inline-rollback_registered_many.patch net-make-free_netdev-more-lenient-with-unregistering-devices.patch net-make-sure-devices-go-through-netdev_wait_all_refs.patch net-move-net_set_todo-inside-rollback_registered.patch net-move-rollback_registered_many.patch --- ...ounce_clone_bio-for-passthrough-bios.patch | 77 +++++ ...it-bio_kmalloc-from-bio_alloc_bioset.patch | 267 ++++++++++++++++++ ...t-explain-struct-net_device-lifetime.patch | 221 +++++++++++++++ .../net-inline-rollback_registered.patch | 67 +++++ .../net-inline-rollback_registered_many.patch | 99 +++++++ ...e-lenient-with-unregistering-devices.patch | 117 ++++++++ ...ices-go-through-netdev_wait_all_refs.patch | 68 +++++ ..._set_todo-inside-rollback_registered.patch | 80 ++++++ .../net-move-rollback_registered_many.patch | 239 ++++++++++++++++ queue-5.10/series | 9 + 10 files changed, 1244 insertions(+) create mode 100644 queue-5.10/block-fix-bounce_clone_bio-for-passthrough-bios.patch create mode 100644 queue-5.10/block-split-bio_kmalloc-from-bio_alloc_bioset.patch create mode 100644 queue-5.10/docs-net-explain-struct-net_device-lifetime.patch create mode 100644 queue-5.10/net-inline-rollback_registered.patch create mode 100644 queue-5.10/net-inline-rollback_registered_many.patch create mode 100644 queue-5.10/net-make-free_netdev-more-lenient-with-unregistering-devices.patch create mode 100644 queue-5.10/net-make-sure-devices-go-through-netdev_wait_all_refs.patch create mode 100644 queue-5.10/net-move-net_set_todo-inside-rollback_registered.patch create mode 100644 queue-5.10/net-move-rollback_registered_many.patch diff --git 
a/queue-5.10/block-fix-bounce_clone_bio-for-passthrough-bios.patch b/queue-5.10/block-fix-bounce_clone_bio-for-passthrough-bios.patch new file mode 100644 index 00000000000..48d8be1369d --- /dev/null +++ b/queue-5.10/block-fix-bounce_clone_bio-for-passthrough-bios.patch @@ -0,0 +1,77 @@ +From b90994c6ab623baf9268df9710692f14920ce9d2 Mon Sep 17 00:00:00 2001 +From: Christoph Hellwig +Date: Wed, 24 Feb 2021 08:24:05 +0100 +Subject: block: fix bounce_clone_bio for passthrough bios + +From: Christoph Hellwig + +commit b90994c6ab623baf9268df9710692f14920ce9d2 upstream. + +Now that bio_alloc_bioset does not fall back to kmalloc for a NULL +bio_set, handle that case explicitly and simplify the calling +conventions. + +Based on an earlier patch from Chaitanya Kulkarni. + +Fixes: 3175199ab0ac ("block: split bio_kmalloc from bio_alloc_bioset") +Reported-by: Chaitanya Kulkarni +Signed-off-by: Christoph Hellwig +Signed-off-by: Jens Axboe +Signed-off-by: Tadeusz Struk +Signed-off-by: Greg Kroah-Hartman +--- + block/bounce.c | 17 +++++++++-------- + 1 file changed, 9 insertions(+), 8 deletions(-) + +--- a/block/bounce.c ++++ b/block/bounce.c +@@ -214,8 +214,7 @@ static void bounce_end_io_read_isa(struc + __bounce_end_io_read(bio, &isa_page_pool); + } + +-static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, +- struct bio_set *bs) ++static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask) + { + struct bvec_iter iter; + struct bio_vec bv; +@@ -242,8 +241,11 @@ static struct bio *bounce_clone_bio(stru + * asking for trouble and would force extra work on + * __bio_clone_fast() anyways. 
+ */ +- +- bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); ++ if (bio_is_passthrough(bio_src)) ++ bio = bio_kmalloc(gfp_mask, bio_segments(bio_src)); ++ else ++ bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), ++ &bounce_bio_set); + if (!bio) + return NULL; + bio->bi_disk = bio_src->bi_disk; +@@ -294,7 +296,6 @@ static void __blk_queue_bounce(struct re + unsigned i = 0; + bool bounce = false; + int sectors = 0; +- bool passthrough = bio_is_passthrough(*bio_orig); + + bio_for_each_segment(from, *bio_orig, iter) { + if (i++ < BIO_MAX_PAGES) +@@ -305,14 +306,14 @@ static void __blk_queue_bounce(struct re + if (!bounce) + return; + +- if (!passthrough && sectors < bio_sectors(*bio_orig)) { ++ if (!bio_is_passthrough(*bio_orig) && ++ sectors < bio_sectors(*bio_orig)) { + bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split); + bio_chain(bio, *bio_orig); + submit_bio_noacct(*bio_orig); + *bio_orig = bio; + } +- bio = bounce_clone_bio(*bio_orig, GFP_NOIO, passthrough ? NULL : +- &bounce_bio_set); ++ bio = bounce_clone_bio(*bio_orig, GFP_NOIO); + + /* + * Bvec table can't be updated by bio_for_each_segment_all(), diff --git a/queue-5.10/block-split-bio_kmalloc-from-bio_alloc_bioset.patch b/queue-5.10/block-split-bio_kmalloc-from-bio_alloc_bioset.patch new file mode 100644 index 00000000000..fc26efba892 --- /dev/null +++ b/queue-5.10/block-split-bio_kmalloc-from-bio_alloc_bioset.patch @@ -0,0 +1,267 @@ +From 3175199ab0ac8c874ec25c6bf169f74888917435 Mon Sep 17 00:00:00 2001 +From: Christoph Hellwig +Date: Tue, 26 Jan 2021 15:52:34 +0100 +Subject: block: split bio_kmalloc from bio_alloc_bioset + +From: Christoph Hellwig + +commit 3175199ab0ac8c874ec25c6bf169f74888917435 upstream. + +bio_kmalloc shares almost no logic with the bio_set based fast path +in bio_alloc_bioset. Split it into an entirely separate implementation. 
+ +Signed-off-by: Christoph Hellwig +Reviewed-by: Johannes Thumshirn +Reviewed-by: Chaitanya Kulkarni +Acked-by: Damien Le Moal +Signed-off-by: Jens Axboe +Signed-off-by: Tadeusz Struk +Signed-off-by: Greg Kroah-Hartman +--- + block/bio.c | 174 ++++++++++++++++++++++++++-------------------------- + include/linux/bio.h | 6 - + 2 files changed, 90 insertions(+), 90 deletions(-) + +--- a/block/bio.c ++++ b/block/bio.c +@@ -405,122 +405,101 @@ static void punt_bios_to_rescuer(struct + * @nr_iovecs: number of iovecs to pre-allocate + * @bs: the bio_set to allocate from. + * +- * Description: +- * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is +- * backed by the @bs's mempool. +- * +- * When @bs is not NULL, if %__GFP_DIRECT_RECLAIM is set then bio_alloc will +- * always be able to allocate a bio. This is due to the mempool guarantees. +- * To make this work, callers must never allocate more than 1 bio at a time +- * from this pool. Callers that need to allocate more than 1 bio must always +- * submit the previously allocated bio for IO before attempting to allocate +- * a new one. Failure to do so can cause deadlocks under memory pressure. +- * +- * Note that when running under submit_bio_noacct() (i.e. any block +- * driver), bios are not submitted until after you return - see the code in +- * submit_bio_noacct() that converts recursion into iteration, to prevent +- * stack overflows. +- * +- * This would normally mean allocating multiple bios under +- * submit_bio_noacct() would be susceptible to deadlocks, but we have +- * deadlock avoidance code that resubmits any blocked bios from a rescuer +- * thread. +- * +- * However, we do not guarantee forward progress for allocations from other +- * mempools. Doing multiple allocations from the same mempool under +- * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad +- * for per bio allocations. ++ * Allocate a bio from the mempools in @bs. 
+ * +- * RETURNS: +- * Pointer to new bio on success, NULL on failure. ++ * If %__GFP_DIRECT_RECLAIM is set then bio_alloc will always be able to ++ * allocate a bio. This is due to the mempool guarantees. To make this work, ++ * callers must never allocate more than 1 bio at a time from the general pool. ++ * Callers that need to allocate more than 1 bio must always submit the ++ * previously allocated bio for IO before attempting to allocate a new one. ++ * Failure to do so can cause deadlocks under memory pressure. ++ * ++ * Note that when running under submit_bio_noacct() (i.e. any block driver), ++ * bios are not submitted until after you return - see the code in ++ * submit_bio_noacct() that converts recursion into iteration, to prevent ++ * stack overflows. ++ * ++ * This would normally mean allocating multiple bios under submit_bio_noacct() ++ * would be susceptible to deadlocks, but we have ++ * deadlock avoidance code that resubmits any blocked bios from a rescuer ++ * thread. ++ * ++ * However, we do not guarantee forward progress for allocations from other ++ * mempools. Doing multiple allocations from the same mempool under ++ * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad ++ * for per bio allocations. ++ * ++ * Returns: Pointer to new bio on success, NULL on failure. 
+ */ + struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, + struct bio_set *bs) + { + gfp_t saved_gfp = gfp_mask; +- unsigned front_pad; +- unsigned inline_vecs; +- struct bio_vec *bvl = NULL; + struct bio *bio; + void *p; + +- if (!bs) { +- if (nr_iovecs > UIO_MAXIOV) +- return NULL; +- +- p = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask); +- front_pad = 0; +- inline_vecs = nr_iovecs; +- } else { +- /* should not use nobvec bioset for nr_iovecs > 0 */ +- if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && +- nr_iovecs > 0)) +- return NULL; +- /* +- * submit_bio_noacct() converts recursion to iteration; this +- * means if we're running beneath it, any bios we allocate and +- * submit will not be submitted (and thus freed) until after we +- * return. +- * +- * This exposes us to a potential deadlock if we allocate +- * multiple bios from the same bio_set() while running +- * underneath submit_bio_noacct(). If we were to allocate +- * multiple bios (say a stacking block driver that was splitting +- * bios), we would deadlock if we exhausted the mempool's +- * reserve. +- * +- * We solve this, and guarantee forward progress, with a rescuer +- * workqueue per bio_set. If we go to allocate and there are +- * bios on current->bio_list, we first try the allocation +- * without __GFP_DIRECT_RECLAIM; if that fails, we punt those +- * bios we would be blocking to the rescuer workqueue before +- * we retry with the original gfp_flags. 
+- */ +- +- if (current->bio_list && +- (!bio_list_empty(&current->bio_list[0]) || +- !bio_list_empty(&current->bio_list[1])) && +- bs->rescue_workqueue) +- gfp_mask &= ~__GFP_DIRECT_RECLAIM; ++ /* should not use nobvec bioset for nr_iovecs > 0 */ ++ if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_iovecs > 0)) ++ return NULL; + ++ /* ++ * submit_bio_noacct() converts recursion to iteration; this means if ++ * we're running beneath it, any bios we allocate and submit will not be ++ * submitted (and thus freed) until after we return. ++ * ++ * This exposes us to a potential deadlock if we allocate multiple bios ++ * from the same bio_set() while running underneath submit_bio_noacct(). ++ * If we were to allocate multiple bios (say a stacking block driver ++ * that was splitting bios), we would deadlock if we exhausted the ++ * mempool's reserve. ++ * ++ * We solve this, and guarantee forward progress, with a rescuer ++ * workqueue per bio_set. If we go to allocate and there are bios on ++ * current->bio_list, we first try the allocation without ++ * __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be ++ * blocking to the rescuer workqueue before we retry with the original ++ * gfp_flags.
++ */ ++ if (current->bio_list && ++ (!bio_list_empty(&current->bio_list[0]) || ++ !bio_list_empty(&current->bio_list[1])) && ++ bs->rescue_workqueue) ++ gfp_mask &= ~__GFP_DIRECT_RECLAIM; ++ ++ p = mempool_alloc(&bs->bio_pool, gfp_mask); ++ if (!p && gfp_mask != saved_gfp) { ++ punt_bios_to_rescuer(bs); ++ gfp_mask = saved_gfp; + p = mempool_alloc(&bs->bio_pool, gfp_mask); +- if (!p && gfp_mask != saved_gfp) { +- punt_bios_to_rescuer(bs); +- gfp_mask = saved_gfp; +- p = mempool_alloc(&bs->bio_pool, gfp_mask); +- } +- +- front_pad = bs->front_pad; +- inline_vecs = BIO_INLINE_VECS; + } +- + if (unlikely(!p)) + return NULL; + +- bio = p + front_pad; +- bio_init(bio, NULL, 0); +- +- if (nr_iovecs > inline_vecs) { ++ bio = p + bs->front_pad; ++ if (nr_iovecs > BIO_INLINE_VECS) { + unsigned long idx = 0; ++ struct bio_vec *bvl = NULL; + + bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool); + if (!bvl && gfp_mask != saved_gfp) { + punt_bios_to_rescuer(bs); + gfp_mask = saved_gfp; +- bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool); ++ bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, ++ &bs->bvec_pool); + } + + if (unlikely(!bvl)) + goto err_free; + + bio->bi_flags |= idx << BVEC_POOL_OFFSET; ++ bio_init(bio, bvl, bvec_nr_vecs(idx)); + } else if (nr_iovecs) { +- bvl = bio->bi_inline_vecs; ++ bio_init(bio, bio->bi_inline_vecs, BIO_INLINE_VECS); ++ } else { ++ bio_init(bio, NULL, 0); + } + + bio->bi_pool = bs; +- bio->bi_max_vecs = nr_iovecs; +- bio->bi_io_vec = bvl; + return bio; + + err_free: +@@ -529,6 +508,31 @@ err_free: + } + EXPORT_SYMBOL(bio_alloc_bioset); + ++/** ++ * bio_kmalloc - kmalloc a bio for I/O ++ * @gfp_mask: the GFP_* mask given to the slab allocator ++ * @nr_iovecs: number of iovecs to pre-allocate ++ * ++ * Use kmalloc to allocate and initialize a bio. ++ * ++ * Returns: Pointer to new bio on success, NULL on failure.
++ */ ++struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) ++{ ++ struct bio *bio; ++ ++ if (nr_iovecs > UIO_MAXIOV) ++ return NULL; ++ ++ bio = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask); ++ if (unlikely(!bio)) ++ return NULL; ++ bio_init(bio, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs); ++ bio->bi_pool = NULL; ++ return bio; ++} ++EXPORT_SYMBOL(bio_kmalloc); ++ + void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) + { + unsigned long flags; +--- a/include/linux/bio.h ++++ b/include/linux/bio.h +@@ -390,6 +390,7 @@ extern int biovec_init_pool(mempool_t *p + extern int bioset_init_from_src(struct bio_set *bs, struct bio_set *src); + + extern struct bio *bio_alloc_bioset(gfp_t, unsigned int, struct bio_set *); ++struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs); + extern void bio_put(struct bio *); + + extern void __bio_clone_fast(struct bio *, struct bio *); +@@ -402,11 +403,6 @@ static inline struct bio *bio_alloc(gfp_ + return bio_alloc_bioset(gfp_mask, nr_iovecs, &fs_bio_set); + } + +-static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) +-{ +- return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL); +-} +- + extern blk_qc_t submit_bio(struct bio *); + + extern void bio_endio(struct bio *); diff --git a/queue-5.10/docs-net-explain-struct-net_device-lifetime.patch b/queue-5.10/docs-net-explain-struct-net_device-lifetime.patch new file mode 100644 index 00000000000..36c6a4cf72c --- /dev/null +++ b/queue-5.10/docs-net-explain-struct-net_device-lifetime.patch @@ -0,0 +1,221 @@ +From foo@baz Sat Jul 23 05:03:39 PM CEST 2022 +From: Fedor Pchelkin +Date: Fri, 15 Jul 2022 19:26:26 +0300 +Subject: docs: net: explain struct net_device lifetime +To: stable@vger.kernel.org, Greg Kroah-Hartman +Cc: Fedor Pchelkin , Jakub Kicinski , Alexey Khoroshilov +Message-ID: <20220715162632.332718-2-pchelkin@ispras.ru> + +From: Fedor Pchelkin + +From: Jakub Kicinski + +commit 
2b446e650b418f9a9e75f99852e2f2560cabfa17 upstream. + +Explain the two basic flows of struct net_device's operation. + +Signed-off-by: Jakub Kicinski +Signed-off-by: Fedor Pchelkin +Signed-off-by: Greg Kroah-Hartman +--- + Documentation/networking/netdevices.rst | 171 ++++++++++++++++++++++++++++++-- + net/core/rtnetlink.c | 2 + 2 files changed, 166 insertions(+), 7 deletions(-) + +--- a/Documentation/networking/netdevices.rst ++++ b/Documentation/networking/netdevices.rst +@@ -10,18 +10,177 @@ Introduction + The following is a random collection of documentation regarding + network devices. + +-struct net_device allocation rules +-================================== ++struct net_device lifetime rules ++================================ + Network device structures need to persist even after module is unloaded and + must be allocated with alloc_netdev_mqs() and friends. + If device has registered successfully, it will be freed on last use +-by free_netdev(). This is required to handle the pathologic case cleanly +-(example: rmmod mydriver needs_free_netdev = true; ++ } ++ ++ static void my_destructor(struct net_device *dev) ++ { ++ some_obj_destroy(priv->obj); ++ some_uninit(priv); ++ } ++ ++ int create_link() ++ { ++ struct my_device_priv *priv; ++ int err; ++ ++ ASSERT_RTNL(); ++ ++ dev = alloc_netdev(sizeof(*priv), "net%d", NET_NAME_UNKNOWN, my_setup); ++ if (!dev) ++ return -ENOMEM; ++ priv = netdev_priv(dev); ++ ++ /* Implicit constructor */ ++ err = some_init(priv); ++ if (err) ++ goto err_free_dev; ++ ++ priv->obj = some_obj_create(); ++ if (!priv->obj) { ++ err = -ENOMEM; ++ goto err_some_uninit; ++ } ++ /* End of constructor, set the destructor: */ ++ dev->priv_destructor = my_destructor; ++ ++ err = register_netdevice(dev); ++ if (err) ++ /* register_netdevice() calls destructor on failure */ ++ goto err_free_dev; ++ ++ /* If anything fails now unregister_netdevice() (or unregister_netdev()) ++ * will take care of calling my_destructor and free_netdev(). 
++ */ ++ ++ return 0; ++ ++ err_some_uninit: ++ some_uninit(priv); ++ err_free_dev: ++ free_netdev(dev); ++ return err; ++ } ++ ++If struct net_device.priv_destructor is set it will be called by the core ++some time after unregister_netdevice(), it will also be called if ++register_netdevice() fails. The callback may be invoked with or without ++``rtnl_lock`` held. ++ ++There is no explicit constructor callback, driver "constructs" the private ++netdev state after allocating it and before registration. ++ ++Setting struct net_device.needs_free_netdev makes core call free_netdevice() ++automatically after unregister_netdevice() when all references to the device ++are gone. It only takes effect after a successful call to register_netdevice() ++so if register_netdevice() fails driver is responsible for calling ++free_netdev(). ++ ++free_netdev() is safe to call on error paths right after unregister_netdevice() ++or when register_netdevice() fails. Parts of netdev (de)registration process ++happen after ``rtnl_lock`` is released, therefore in those cases free_netdev() ++will defer some of the processing until ``rtnl_lock`` is released. ++ ++Devices spawned from struct rtnl_link_ops should never free the ++struct net_device directly. ++ ++.ndo_init and .ndo_uninit ++~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++``.ndo_init`` and ``.ndo_uninit`` callbacks are called during net_device ++registration and de-registration, under ``rtnl_lock``. Drivers can use ++those e.g. when parts of their init process need to run under ``rtnl_lock``. ++ ++``.ndo_init`` runs before device is visible in the system, ``.ndo_uninit`` ++runs during de-registering after device is closed but other subsystems ++may still have outstanding references to the netdevice. + + MTU + === +--- a/net/core/rtnetlink.c ++++ b/net/core/rtnetlink.c +@@ -3444,7 +3444,7 @@ replay: + + if (ops->newlink) { + err = ops->newlink(link_net ? 
: net, dev, tb, data, extack); +- /* Drivers should call free_netdev() in ->destructor ++ /* Drivers should set dev->needs_free_netdev + * and unregister it on failure after registration + * so that device could be finally freed in rtnl_unlock. + */ diff --git a/queue-5.10/net-inline-rollback_registered.patch b/queue-5.10/net-inline-rollback_registered.patch new file mode 100644 index 00000000000..c912663ab8d --- /dev/null +++ b/queue-5.10/net-inline-rollback_registered.patch @@ -0,0 +1,67 @@ +From foo@baz Sat Jul 23 05:03:39 PM CEST 2022 +From: Fedor Pchelkin +Date: Fri, 15 Jul 2022 19:26:30 +0300 +Subject: net: inline rollback_registered() +To: stable@vger.kernel.org, Greg Kroah-Hartman +Cc: Fedor Pchelkin , Jakub Kicinski , Alexey Khoroshilov , Edwin Peer +Message-ID: <20220715162632.332718-6-pchelkin@ispras.ru> + +From: Fedor Pchelkin + +From: Jakub Kicinski + +commit 037e56bd965e1bc72c2fa9684ac25b56839a338e upstream. + +rollback_registered() is a local helper, it's common for driver +code to call unregister_netdevice_queue(dev, NULL) when they +want to unregister netdevices under rtnl_lock. Inline +rollback_registered() and adjust the only remaining caller. 
+ +Reviewed-by: Edwin Peer +Signed-off-by: Jakub Kicinski +Signed-off-by: Fedor Pchelkin +Signed-off-by: Greg Kroah-Hartman +--- + net/core/dev.c | 17 ++++++----------- + 1 file changed, 6 insertions(+), 11 deletions(-) + +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -9601,15 +9601,6 @@ static void rollback_registered_many(str + } + } + +-static void rollback_registered(struct net_device *dev) +-{ +- LIST_HEAD(single); +- +- list_add(&dev->unreg_list, &single); +- rollback_registered_many(&single); +- list_del(&single); +-} +- + static netdev_features_t netdev_sync_upper_features(struct net_device *lower, + struct net_device *upper, netdev_features_t features) + { +@@ -10148,7 +10139,7 @@ int register_netdevice(struct net_device + if (ret) { + /* Expect explicit free_netdev() on failure */ + dev->needs_free_netdev = false; +- rollback_registered(dev); ++ unregister_netdevice_queue(dev, NULL); + goto out; + } + /* +@@ -10755,7 +10746,11 @@ void unregister_netdevice_queue(struct n + if (head) { + list_move_tail(&dev->unreg_list, head); + } else { +- rollback_registered(dev); ++ LIST_HEAD(single); ++ ++ list_add(&dev->unreg_list, &single); ++ rollback_registered_many(&single); ++ list_del(&single); + } + } + EXPORT_SYMBOL(unregister_netdevice_queue); diff --git a/queue-5.10/net-inline-rollback_registered_many.patch b/queue-5.10/net-inline-rollback_registered_many.patch new file mode 100644 index 00000000000..bed90cbc545 --- /dev/null +++ b/queue-5.10/net-inline-rollback_registered_many.patch @@ -0,0 +1,99 @@ +From foo@baz Sat Jul 23 05:03:39 PM CEST 2022 +From: Fedor Pchelkin +Date: Fri, 15 Jul 2022 19:26:32 +0300 +Subject: net: inline rollback_registered_many() +To: stable@vger.kernel.org, Greg Kroah-Hartman +Cc: Fedor Pchelkin , Jakub Kicinski , Alexey Khoroshilov , Edwin Peer +Message-ID: <20220715162632.332718-8-pchelkin@ispras.ru> + +From: Fedor Pchelkin + +From: Jakub Kicinski + +commit 0cbe1e57a7b93517100b0eb63d8e445cfbeb630c upstream. 
+ +Similar to the change for rollback_registered() - +rollback_registered_many() was a part of unregister_netdevice_many() +minus the net_set_todo(), which is no longer needed. + +Functionally this patch moves the list_empty() check back after: + + BUG_ON(dev_boot_phase); + ASSERT_RTNL(); + +but I can't find any reason why that would be an issue. + +Reviewed-by: Edwin Peer +Signed-off-by: Jakub Kicinski +Signed-off-by: Fedor Pchelkin +Signed-off-by: Greg Kroah-Hartman +--- + net/core/dev.c | 22 ++++++++-------------- + 1 file changed, 8 insertions(+), 14 deletions(-) + +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -5750,7 +5750,7 @@ static void flush_all_backlogs(void) + } + + /* we can have in flight packet[s] on the cpus we are not flushing, +- * synchronize_net() in rollback_registered_many() will take care of ++ * synchronize_net() in unregister_netdevice_many() will take care of + * them + */ + for_each_cpu(cpu, &flush_cpus) +@@ -10633,8 +10633,6 @@ void synchronize_net(void) + } + EXPORT_SYMBOL(synchronize_net); + +-static void rollback_registered_many(struct list_head *head); +- + /** + * unregister_netdevice_queue - remove device from the kernel + * @dev: device +@@ -10658,8 +10656,7 @@ void unregister_netdevice_queue(struct n + LIST_HEAD(single); + + list_add(&dev->unreg_list, &single); +- rollback_registered_many(&single); +- list_del(&single); ++ unregister_netdevice_many(&single); + } + } + EXPORT_SYMBOL(unregister_netdevice_queue); +@@ -10673,21 +10670,15 @@ EXPORT_SYMBOL(unregister_netdevice_queue + */ + void unregister_netdevice_many(struct list_head *head) + { +- if (!list_empty(head)) { +- rollback_registered_many(head); +- list_del(head); +- } +-} +-EXPORT_SYMBOL(unregister_netdevice_many); +- +-static void rollback_registered_many(struct list_head *head) +-{ + struct net_device *dev, *tmp; + LIST_HEAD(close_head); + + BUG_ON(dev_boot_phase); + ASSERT_RTNL(); + ++ if (list_empty(head)) ++ return; ++ + list_for_each_entry_safe(dev, tmp, head, 
unreg_list) { + /* Some devices call without registering + * for initialization unwind. Remove those +@@ -10771,7 +10762,10 @@ static void rollback_registered_many(str + dev_put(dev); + net_set_todo(dev); + } ++ ++ list_del(head); + } ++EXPORT_SYMBOL(unregister_netdevice_many); + + /** + * unregister_netdev - remove device from the kernel diff --git a/queue-5.10/net-make-free_netdev-more-lenient-with-unregistering-devices.patch b/queue-5.10/net-make-free_netdev-more-lenient-with-unregistering-devices.patch new file mode 100644 index 00000000000..84e5f1a2020 --- /dev/null +++ b/queue-5.10/net-make-free_netdev-more-lenient-with-unregistering-devices.patch @@ -0,0 +1,117 @@ +From foo@baz Sat Jul 23 05:03:39 PM CEST 2022 +From: Fedor Pchelkin +Date: Fri, 15 Jul 2022 19:26:27 +0300 +Subject: net: make free_netdev() more lenient with unregistering devices +To: stable@vger.kernel.org, Greg Kroah-Hartman +Cc: Fedor Pchelkin , Jakub Kicinski , Alexey Khoroshilov +Message-ID: <20220715162632.332718-3-pchelkin@ispras.ru> + +From: Fedor Pchelkin + +From: Jakub Kicinski + +commit c269a24ce057abfc31130960e96ab197ef6ab196 upstream. + +There are two flavors of handling netdev registration: + - ones called without holding rtnl_lock: register_netdev() and + unregister_netdev(); and + - those called with rtnl_lock held: register_netdevice() and + unregister_netdevice(). + +While the semantics of the former are pretty clear, the same can't +be said about the latter. The netdev_todo mechanism is utilized to +perform some of the device unregistering tasks and it hooks into +rtnl_unlock() so the locked variants can't actually finish the work. +In general free_netdev() does not mix well with locked calls. Most +drivers operating under rtnl_lock set dev->needs_free_netdev to true +and expect core to make the free_netdev() call some time later. + +The part where this becomes most problematic is error paths. 
There is +no way to unwind the state cleanly after a call to register_netdevice(), +since unreg can't be performed fully without dropping locks. + +Make free_netdev() more lenient, and defer the freeing if device +is being unregistered. This allows error paths to simply call +free_netdev() both after register_netdevice() failed, and after +a call to unregister_netdevice() but before dropping rtnl_lock. + +Simplify the error paths which are currently doing gymnastics +around free_netdev() handling. + +Signed-off-by: Jakub Kicinski +Signed-off-by: Fedor Pchelkin +Signed-off-by: Greg Kroah-Hartman +--- + net/8021q/vlan.c | 4 +--- + net/core/dev.c | 11 +++++++++++ + net/core/rtnetlink.c | 23 ++++++----------------- + 3 files changed, 18 insertions(+), 20 deletions(-) + +--- a/net/8021q/vlan.c ++++ b/net/8021q/vlan.c +@@ -278,9 +278,7 @@ static int register_vlan_device(struct n + return 0; + + out_free_newdev: +- if (new_dev->reg_state == NETREG_UNINITIALIZED || +- new_dev->reg_state == NETREG_UNREGISTERED) +- free_netdev(new_dev); ++ free_netdev(new_dev); + return err; + } + +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -10683,6 +10683,17 @@ void free_netdev(struct net_device *dev) + struct napi_struct *p, *n; + + might_sleep(); ++ ++ /* When called immediately after register_netdevice() failed the unwind ++ * handling may still be dismantling the device. Handle that case by ++ * deferring the free. ++ */ ++ if (dev->reg_state == NETREG_UNREGISTERING) { ++ ASSERT_RTNL(); ++ dev->needs_free_netdev = true; ++ return; ++ } ++ + netif_free_tx_queues(dev); + netif_free_rx_queues(dev); + +--- a/net/core/rtnetlink.c ++++ b/net/core/rtnetlink.c +@@ -3442,26 +3442,15 @@ replay: + + dev->ifindex = ifm->ifi_index; + +- if (ops->newlink) { ++ if (ops->newlink) + err = ops->newlink(link_net ? : net, dev, tb, data, extack); +- /* Drivers should set dev->needs_free_netdev +- * and unregister it on failure after registration +- * so that device could be finally freed in rtnl_unlock. 
+- */ +- if (err < 0) { +- /* If device is not registered at all, free it now */ +- if (dev->reg_state == NETREG_UNINITIALIZED || +- dev->reg_state == NETREG_UNREGISTERED) +- free_netdev(dev); +- goto out; +- } +- } else { ++ else + err = register_netdevice(dev); +- if (err < 0) { +- free_netdev(dev); +- goto out; +- } ++ if (err < 0) { ++ free_netdev(dev); ++ goto out; + } ++ + err = rtnl_configure_link(dev, ifm); + if (err < 0) + goto out_unregister; diff --git a/queue-5.10/net-make-sure-devices-go-through-netdev_wait_all_refs.patch b/queue-5.10/net-make-sure-devices-go-through-netdev_wait_all_refs.patch new file mode 100644 index 00000000000..41e0ecd97f1 --- /dev/null +++ b/queue-5.10/net-make-sure-devices-go-through-netdev_wait_all_refs.patch @@ -0,0 +1,68 @@ +From foo@baz Sat Jul 23 05:03:39 PM CEST 2022 +From: Fedor Pchelkin +Date: Fri, 15 Jul 2022 19:26:28 +0300 +Subject: net: make sure devices go through netdev_wait_all_refs +To: stable@vger.kernel.org, Greg Kroah-Hartman +Cc: Fedor Pchelkin , Jakub Kicinski , Alexey Khoroshilov , Hulk Robot , Yang Yingliang +Message-ID: <20220715162632.332718-4-pchelkin@ispras.ru> + +From: Fedor Pchelkin + +From: Jakub Kicinski + +commit 766b0515d5bec4b780750773ed3009b148df8c0a upstream. + +If register_netdevice() fails at the very last stage - the +notifier call - some subsystems may have already seen it and +grabbed a reference. struct net_device can't be freed right +away without calling netdev_wait_all_refs(). + +Now that we have a clean interface in form of dev->needs_free_netdev +and lenient free_netdev() we can undo what commit 93ee31f14f6f ("[NET]: +Fix free_netdev on register_netdev failure.") has done and complete +the unregistration path by bringing the net_set_todo() call back. 
+ +After registration fails user is still expected to explicitly +free the net_device, so make sure ->needs_free_netdev is cleared, +otherwise rolling back the registration will cause the old double +free for callers who release rtnl_lock before the free. + +This also solves the problem of priv_destructor not being called +on notifier error. + +net_set_todo() will be moved back into unregister_netdevice_queue() +in a follow up. + +Reported-by: Hulk Robot +Reported-by: Yang Yingliang +Signed-off-by: Jakub Kicinski +Signed-off-by: Fedor Pchelkin +Signed-off-by: Greg Kroah-Hartman +--- + net/core/dev.c | 14 ++++---------- + 1 file changed, 4 insertions(+), 10 deletions(-) + +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -10144,17 +10144,11 @@ int register_netdevice(struct net_device + ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); + ret = notifier_to_errno(ret); + if (ret) { ++ /* Expect explicit free_netdev() on failure */ ++ dev->needs_free_netdev = false; + rollback_registered(dev); +- rcu_barrier(); +- +- dev->reg_state = NETREG_UNREGISTERED; +- /* We should put the kobject that hold in +- * netdev_unregister_kobject(), otherwise +- * the net device cannot be freed when +- * driver calls free_netdev(), because the +- * kobject is being hold. 
+- */ +- kobject_put(&dev->dev.kobj); ++ net_set_todo(dev); ++ goto out; + } + /* + * Prevent userspace races by waiting until the network diff --git a/queue-5.10/net-move-net_set_todo-inside-rollback_registered.patch b/queue-5.10/net-move-net_set_todo-inside-rollback_registered.patch new file mode 100644 index 00000000000..050500178f8 --- /dev/null +++ b/queue-5.10/net-move-net_set_todo-inside-rollback_registered.patch @@ -0,0 +1,80 @@ +From foo@baz Sat Jul 23 05:03:39 PM CEST 2022 +From: Fedor Pchelkin +Date: Fri, 15 Jul 2022 19:26:29 +0300 +Subject: net: move net_set_todo inside rollback_registered() +To: stable@vger.kernel.org, Greg Kroah-Hartman +Cc: Fedor Pchelkin , Jakub Kicinski , Alexey Khoroshilov , Edwin Peer +Message-ID: <20220715162632.332718-5-pchelkin@ispras.ru> + +From: Fedor Pchelkin + +From: Jakub Kicinski + +commit 2014beea7eb165c745706b13659a0f1d0a9a2a61 upstream. + +Commit 93ee31f14f6f ("[NET]: Fix free_netdev on register_netdev +failure.") moved net_set_todo() outside of rollback_registered() +so that rollback_registered() can be used in the failure path of +register_netdevice() but without risking a double free. + +Since commit cf124db566e6 ("net: Fix inconsistent teardown and +release of private netdev state."), however, we have a better +way of handling that condition, since destructors don't call +free_netdev() directly. + +After the change in commit c269a24ce057 ("net: make free_netdev() +more lenient with unregistering devices") we can now move +net_set_todo() back. 
+ +Reviewed-by: Edwin Peer +Signed-off-by: Jakub Kicinski +Signed-off-by: Fedor Pchelkin +Signed-off-by: Greg Kroah-Hartman +--- + net/core/dev.c | 11 +++-------- + 1 file changed, 3 insertions(+), 8 deletions(-) + +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -9595,8 +9595,10 @@ static void rollback_registered_many(str + + synchronize_net(); + +- list_for_each_entry(dev, head, unreg_list) ++ list_for_each_entry(dev, head, unreg_list) { + dev_put(dev); ++ net_set_todo(dev); ++ } + } + + static void rollback_registered(struct net_device *dev) +@@ -10147,7 +10149,6 @@ int register_netdevice(struct net_device + /* Expect explicit free_netdev() on failure */ + dev->needs_free_netdev = false; + rollback_registered(dev); +- net_set_todo(dev); + goto out; + } + /* +@@ -10755,8 +10756,6 @@ void unregister_netdevice_queue(struct n + list_move_tail(&dev->unreg_list, head); + } else { + rollback_registered(dev); +- /* Finish processing unregister after unlock */ +- net_set_todo(dev); + } + } + EXPORT_SYMBOL(unregister_netdevice_queue); +@@ -10770,12 +10769,8 @@ EXPORT_SYMBOL(unregister_netdevice_queue + */ + void unregister_netdevice_many(struct list_head *head) + { +- struct net_device *dev; +- + if (!list_empty(head)) { + rollback_registered_many(head); +- list_for_each_entry(dev, head, unreg_list) +- net_set_todo(dev); + list_del(head); + } + } diff --git a/queue-5.10/net-move-rollback_registered_many.patch b/queue-5.10/net-move-rollback_registered_many.patch new file mode 100644 index 00000000000..29d9e5bd836 --- /dev/null +++ b/queue-5.10/net-move-rollback_registered_many.patch @@ -0,0 +1,239 @@ +From foo@baz Sat Jul 23 05:03:39 PM CEST 2022 +From: Fedor Pchelkin +Date: Fri, 15 Jul 2022 19:26:31 +0300 +Subject: net: move rollback_registered_many() +To: stable@vger.kernel.org, Greg Kroah-Hartman +Cc: Fedor Pchelkin , Jakub Kicinski , Alexey Khoroshilov , Edwin Peer +Message-ID: <20220715162632.332718-7-pchelkin@ispras.ru> + +From: Fedor Pchelkin + +From: Jakub Kicinski 
+ +commit bcfe2f1a3818d9dca945b6aca4ae741cb1f75329 upstream. + +Move rollback_registered_many() and add a temporary +forward declaration to make merging the code into +unregister_netdevice_many() easier to review. + +No functional changes. + +Reviewed-by: Edwin Peer +Signed-off-by: Jakub Kicinski +Signed-off-by: Fedor Pchelkin +Signed-off-by: Greg Kroah-Hartman +--- + net/core/dev.c | 188 ++++++++++++++++++++++++++++----------------------------- + 1 file changed, 95 insertions(+), 93 deletions(-) + +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -9508,99 +9508,6 @@ static void net_set_todo(struct net_devi + dev_net(dev)->dev_unreg_count++; + } + +-static void rollback_registered_many(struct list_head *head) +-{ +- struct net_device *dev, *tmp; +- LIST_HEAD(close_head); +- +- BUG_ON(dev_boot_phase); +- ASSERT_RTNL(); +- +- list_for_each_entry_safe(dev, tmp, head, unreg_list) { +- /* Some devices call without registering +- * for initialization unwind. Remove those +- * devices and proceed with the remaining. +- */ +- if (dev->reg_state == NETREG_UNINITIALIZED) { +- pr_debug("unregister_netdevice: device %s/%p never was registered\n", +- dev->name, dev); +- +- WARN_ON(1); +- list_del(&dev->unreg_list); +- continue; +- } +- dev->dismantle = true; +- BUG_ON(dev->reg_state != NETREG_REGISTERED); +- } +- +- /* If device is running, close it first. */ +- list_for_each_entry(dev, head, unreg_list) +- list_add_tail(&dev->close_list, &close_head); +- dev_close_many(&close_head, true); +- +- list_for_each_entry(dev, head, unreg_list) { +- /* And unlink it from device chain. */ +- unlist_netdevice(dev); +- +- dev->reg_state = NETREG_UNREGISTERING; +- } +- flush_all_backlogs(); +- +- synchronize_net(); +- +- list_for_each_entry(dev, head, unreg_list) { +- struct sk_buff *skb = NULL; +- +- /* Shutdown queueing discipline. */ +- dev_shutdown(dev); +- +- dev_xdp_uninstall(dev); +- +- /* Notify protocols, that we are about to destroy +- * this device. 
They should clean all the things. +- */ +- call_netdevice_notifiers(NETDEV_UNREGISTER, dev); +- +- if (!dev->rtnl_link_ops || +- dev->rtnl_link_state == RTNL_LINK_INITIALIZED) +- skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, +- GFP_KERNEL, NULL, 0); +- +- /* +- * Flush the unicast and multicast chains +- */ +- dev_uc_flush(dev); +- dev_mc_flush(dev); +- +- netdev_name_node_alt_flush(dev); +- netdev_name_node_free(dev->name_node); +- +- if (dev->netdev_ops->ndo_uninit) +- dev->netdev_ops->ndo_uninit(dev); +- +- if (skb) +- rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); +- +- /* Notifier chain MUST detach us all upper devices. */ +- WARN_ON(netdev_has_any_upper_dev(dev)); +- WARN_ON(netdev_has_any_lower_dev(dev)); +- +- /* Remove entries from kobject tree */ +- netdev_unregister_kobject(dev); +-#ifdef CONFIG_XPS +- /* Remove XPS queueing entries */ +- netif_reset_xps_queues_gt(dev, 0); +-#endif +- } +- +- synchronize_net(); +- +- list_for_each_entry(dev, head, unreg_list) { +- dev_put(dev); +- net_set_todo(dev); +- } +-} +- + static netdev_features_t netdev_sync_upper_features(struct net_device *lower, + struct net_device *upper, netdev_features_t features) + { +@@ -10726,6 +10633,8 @@ void synchronize_net(void) + } + EXPORT_SYMBOL(synchronize_net); + ++static void rollback_registered_many(struct list_head *head); ++ + /** + * unregister_netdevice_queue - remove device from the kernel + * @dev: device +@@ -10771,6 +10680,99 @@ void unregister_netdevice_many(struct li + } + EXPORT_SYMBOL(unregister_netdevice_many); + ++static void rollback_registered_many(struct list_head *head) ++{ ++ struct net_device *dev, *tmp; ++ LIST_HEAD(close_head); ++ ++ BUG_ON(dev_boot_phase); ++ ASSERT_RTNL(); ++ ++ list_for_each_entry_safe(dev, tmp, head, unreg_list) { ++ /* Some devices call without registering ++ * for initialization unwind. Remove those ++ * devices and proceed with the remaining. 
++ */ ++ if (dev->reg_state == NETREG_UNINITIALIZED) { ++ pr_debug("unregister_netdevice: device %s/%p never was registered\n", ++ dev->name, dev); ++ ++ WARN_ON(1); ++ list_del(&dev->unreg_list); ++ continue; ++ } ++ dev->dismantle = true; ++ BUG_ON(dev->reg_state != NETREG_REGISTERED); ++ } ++ ++ /* If device is running, close it first. */ ++ list_for_each_entry(dev, head, unreg_list) ++ list_add_tail(&dev->close_list, &close_head); ++ dev_close_many(&close_head, true); ++ ++ list_for_each_entry(dev, head, unreg_list) { ++ /* And unlink it from device chain. */ ++ unlist_netdevice(dev); ++ ++ dev->reg_state = NETREG_UNREGISTERING; ++ } ++ flush_all_backlogs(); ++ ++ synchronize_net(); ++ ++ list_for_each_entry(dev, head, unreg_list) { ++ struct sk_buff *skb = NULL; ++ ++ /* Shutdown queueing discipline. */ ++ dev_shutdown(dev); ++ ++ dev_xdp_uninstall(dev); ++ ++ /* Notify protocols, that we are about to destroy ++ * this device. They should clean all the things. ++ */ ++ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); ++ ++ if (!dev->rtnl_link_ops || ++ dev->rtnl_link_state == RTNL_LINK_INITIALIZED) ++ skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, ++ GFP_KERNEL, NULL, 0); ++ ++ /* ++ * Flush the unicast and multicast chains ++ */ ++ dev_uc_flush(dev); ++ dev_mc_flush(dev); ++ ++ netdev_name_node_alt_flush(dev); ++ netdev_name_node_free(dev->name_node); ++ ++ if (dev->netdev_ops->ndo_uninit) ++ dev->netdev_ops->ndo_uninit(dev); ++ ++ if (skb) ++ rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); ++ ++ /* Notifier chain MUST detach us all upper devices. 
*/ ++ WARN_ON(netdev_has_any_upper_dev(dev)); ++ WARN_ON(netdev_has_any_lower_dev(dev)); ++ ++ /* Remove entries from kobject tree */ ++ netdev_unregister_kobject(dev); ++#ifdef CONFIG_XPS ++ /* Remove XPS queueing entries */ ++ netif_reset_xps_queues_gt(dev, 0); ++#endif ++ } ++ ++ synchronize_net(); ++ ++ list_for_each_entry(dev, head, unreg_list) { ++ dev_put(dev); ++ net_set_todo(dev); ++ } ++} ++ + /** + * unregister_netdev - remove device from the kernel + * @dev: device diff --git a/queue-5.10/series b/queue-5.10/series index a180deda5c8..2f7c45e97e8 100644 --- a/queue-5.10/series +++ b/queue-5.10/series @@ -4,3 +4,12 @@ mlxsw-spectrum_router-fix-ipv4-nexthop-gateway-indication.patch lockdown-fix-kexec-lockdown-bypass-with-ima-policy.patch io_uring-use-original-task-for-req-identity-in-io_identity_cow.patch xen-gntdev-ignore-failure-to-unmap-invalid_grant_handle.patch +block-split-bio_kmalloc-from-bio_alloc_bioset.patch +block-fix-bounce_clone_bio-for-passthrough-bios.patch +docs-net-explain-struct-net_device-lifetime.patch +net-make-free_netdev-more-lenient-with-unregistering-devices.patch +net-make-sure-devices-go-through-netdev_wait_all_refs.patch +net-move-net_set_todo-inside-rollback_registered.patch +net-inline-rollback_registered.patch +net-move-rollback_registered_many.patch +net-inline-rollback_registered_many.patch -- 2.47.3