From aed9a1a4f7106ff99a882ad06318cebfa71016a2 Mon Sep 17 00:00:00 2001 From: Mohamed Ahmed Date: Thu, 9 May 2024 23:43:52 +0300 Subject: drm/nouveau: use tile_mode and pte_kind for VM_BIND bo allocations Allow PTE kind and tile mode on BO create with VM_BIND, and add a GETPARAM to indicate this change. This is needed to support modifiers in NVK and ensure correctness when dealing with the nouveau GL driver. The userspace modifiers implementation this is for can be found here: https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24795 Fixes: b88baab82871 ("drm/nouveau: implement new VM_BIND uAPI") Signed-off-by: Mohamed Ahmed Reviewed-by: Faith Ekstrand Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240509204352.7597-1-mohamedahmedegypt2001@gmail.com --- include/uapi/drm/nouveau_drm.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/uapi/drm/nouveau_drm.h b/include/uapi/drm/nouveau_drm.h index cd84227f1b42..5402f77ee859 100644 --- a/include/uapi/drm/nouveau_drm.h +++ b/include/uapi/drm/nouveau_drm.h @@ -68,6 +68,13 @@ extern "C" { */ #define NOUVEAU_GETPARAM_VRAM_USED 19 +/* + * NOUVEAU_GETPARAM_HAS_VMA_TILEMODE + * + * Query whether tile mode and PTE kind are accepted with VM allocs or not. + */ +#define NOUVEAU_GETPARAM_HAS_VMA_TILEMODE 20 + struct drm_nouveau_getparam { __u64 param; __u64 value; -- cgit v1.2.3 From 117bbc0e43adc6f76a3fc39a98f75a811a853459 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Thu, 29 Feb 2024 10:51:13 +0000 Subject: drm/buddy: stop using PAGE_SIZE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The drm_buddy minimum page-size requirements should be distinct from the CPU PAGE_SIZE. Only restriction is that the minimum page-size is at least 4K. Signed-off-by: Matthew Auld Cc: Arunpravin Paneer Selvam Cc: Christian König Cc: Arnd Bergmann Reviewed-by: Arunpravin Paneer Selvam Acked-by: Arnd Bergmann Link: https://patchwork.freedesktop.org/patch/msgid/20240229105112.250077-3-matthew.auld@intel.com Signed-off-by: Christian König --- drivers/gpu/drm/drm_buddy.c | 2 +- include/drm/drm_buddy.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 5ebdd6f8f36e..f999568d69c1 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -102,7 +102,7 @@ int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 chunk_size) if (size < chunk_size) return -EINVAL; - if (chunk_size < PAGE_SIZE) + if (chunk_size < SZ_4K) return -EINVAL; if (!is_power_of_2(chunk_size)) diff --git a/include/drm/drm_buddy.h b/include/drm/drm_buddy.h index a5b39fc01003..19ed661a32f3 100644 --- a/include/drm/drm_buddy.h +++ b/include/drm/drm_buddy.h @@ -53,8 +53,8 @@ struct drm_buddy_block { struct list_head tmp_link; }; -/* Order-zero must be at least PAGE_SIZE */ -#define DRM_BUDDY_MAX_ORDER (63 - PAGE_SHIFT) +/* Order-zero must be at least SZ_4K */ +#define DRM_BUDDY_MAX_ORDER (63 - 12) /* * Binary Buddy System. @@ -82,7 +82,7 @@ struct drm_buddy { unsigned int n_roots; unsigned int max_order; - /* Must be at least PAGE_SIZE */ + /* Must be at least SZ_4K */ u64 chunk_size; u64 size; u64 avail; -- cgit v1.2.3 From 06e785aeb9ea8a43d0a3967c1ba6e69d758e82d4 Mon Sep 17 00:00:00 2001 From: Matt Jan Date: Tue, 14 May 2024 12:10:46 +0800 Subject: connector: Fix invalid conversion in cn_proc.h The implicit conversion from unsigned int to enum proc_cn_event is invalid, so explicitly cast it for compilation in a C++ compiler. /usr/include/linux/cn_proc.h: In function 'proc_cn_event valid_event(proc_cn_event)': /usr/include/linux/cn_proc.h:72:17: error: invalid conversion from 'unsigned int' to 'proc_cn_event' [-fpermissive] 72 | ev_type &= PROC_EVENT_ALL; | ^ | | | unsigned int Signed-off-by: Matt Jan Signed-off-by: David S. Miller --- include/uapi/linux/cn_proc.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/cn_proc.h b/include/uapi/linux/cn_proc.h index f2afb7cc4926..18e3745b86cd 100644 --- a/include/uapi/linux/cn_proc.h +++ b/include/uapi/linux/cn_proc.h @@ -69,8 +69,7 @@ struct proc_input { static inline enum proc_cn_event valid_event(enum proc_cn_event ev_type) { - ev_type &= PROC_EVENT_ALL; - return ev_type; + return (enum proc_cn_event)(ev_type & PROC_EVENT_ALL); } /* -- cgit v1.2.3 From 79c137454815ba5554caa8eeb4ad5c94e96e45ce Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Tue, 21 May 2024 19:49:38 +0800 Subject: filemap: add helper mapping_max_folio_size() Add mapping_max_folio_size() to get the maximum folio size for this pagecache mapping. Fixes: 5d8edfb900d5 ("iomap: Copy larger chunks from userspace") Cc: stable@vger.kernel.org Reviewed-by: Darrick J. Wong Signed-off-by: Xu Yang Link: https://lore.kernel.org/r/20240521114939.2541461-1-xu.yang_2@nxp.com Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Christoph Hellwig Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- include/linux/pagemap.h | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 3d69589c00a4..ee633712bba0 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -346,6 +346,19 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) m->gfp_mask = mask; } +/* + * There are some parts of the kernel which assume that PMD entries + * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then, + * limit the maximum allocation order to PMD size. I'm not aware of any + * assumptions about maximum order if THP are disabled, but 8 seems like + * a good order (that's 1MB if you're using 4kB pages) + */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER +#else +#define MAX_PAGECACHE_ORDER 8 +#endif + /** * mapping_set_large_folios() - Indicate the file supports large folios. * @mapping: The file. @@ -372,6 +385,14 @@ static inline bool mapping_large_folio_support(struct address_space *mapping) test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); } +/* Return the maximum folio size for this pagecache mapping, in bytes. */ +static inline size_t mapping_max_folio_size(struct address_space *mapping) +{ + if (mapping_large_folio_support(mapping)) + return PAGE_SIZE << MAX_PAGECACHE_ORDER; + return PAGE_SIZE; +} + static inline int filemap_nr_thps(struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS @@ -530,19 +551,6 @@ static inline void *detach_page_private(struct page *page) return folio_detach_private(page_folio(page)); } -/* - * There are some parts of the kernel which assume that PMD entries - * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then, - * limit the maximum allocation order to PMD size. I'm not aware of any - * assumptions about maximum order if THP are disabled, but 8 seems like - * a good order (that's 1MB if you're using 4kB pages) - */ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER -#else -#define MAX_PAGECACHE_ORDER 8 -#endif - #ifdef CONFIG_NUMA struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order); #else -- cgit v1.2.3 From 1b9f86c6d53245dab087f1b2c05727b5982142ff Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 22 May 2024 22:26:54 +0300 Subject: net/mlx5: Fix MTMP register capability offset in MCAM register The MTMP register (0x900a) capability offset is off-by-one, move it to the right place. Fixes: 1f507e80c700 ("net/mlx5: Expose NIC temperature via hardware monitoring kernel API") Signed-off-by: Gal Pressman Reviewed-by: Cosmin Ratiu Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/mlx5/mlx5_ifc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index f468763478ae..5df52e15f7d6 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -10308,9 +10308,9 @@ struct mlx5_ifc_mcam_access_reg_bits { u8 mfrl[0x1]; u8 regs_39_to_32[0x8]; - u8 regs_31_to_10[0x16]; + u8 regs_31_to_11[0x15]; u8 mtmp[0x1]; - u8 regs_8_to_0[0x9]; + u8 regs_9_to_0[0xa]; }; struct mlx5_ifc_mcam_access_reg_bits1 { -- cgit v1.2.3 From 3998d184267dfcff858aaa84d3de17429253629d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 24 May 2024 18:36:17 +0200 Subject: netkit: Fix pkt_type override upon netkit pass verdict When running Cilium connectivity test suite with netkit in L2 mode, we found that compared to tcx a few tests were failing which pushed traffic into an L7 proxy sitting in host namespace. The problem in particular is around the invocation of eth_type_trans() in netkit. In case of tcx, this is run before the tcx ingress is triggered inside host namespace and thus if the BPF program uses the bpf_skb_change_type() helper the newly set type is retained. However, in case of netkit, the late eth_type_trans() invocation overrides the earlier decision from the BPF program which eventually leads to the test failure. Instead of eth_type_trans(), split out the relevant parts, meaning, reset of mac header and call to eth_skb_pkt_type() before the BPF program is run in order to have the same behavior as with tcx, and refactor a small helper called eth_skb_pull_mac() which is run in case it's passed up the stack where the mac header must be pulled. With this all connectivity tests pass. Fixes: 35dfaad7188c ("netkit, bpf: Add bpf programmable net device") Signed-off-by: Daniel Borkmann Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20240524163619.26001-2-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- drivers/net/netkit.c | 4 +++- include/linux/etherdevice.h | 8 ++++++++ net/ethernet/eth.c | 4 +--- 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c index 272894053e2c..16789cd446e9 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -55,6 +55,7 @@ static void netkit_prep_forward(struct sk_buff *skb, bool xnet) skb_scrub_packet(skb, xnet); skb->priority = 0; nf_skip_egress(skb, true); + skb_reset_mac_header(skb); } static struct netkit *netkit_priv(const struct net_device *dev) @@ -78,6 +79,7 @@ static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev) skb_orphan_frags(skb, GFP_ATOMIC))) goto drop; netkit_prep_forward(skb, !net_eq(dev_net(dev), dev_net(peer))); + eth_skb_pkt_type(skb, peer); skb->dev = peer; entry = rcu_dereference(nk->active); if (entry) @@ -85,7 +87,7 @@ static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev) switch (ret) { case NETKIT_NEXT: case NETKIT_PASS: - skb->protocol = eth_type_trans(skb, skb->dev); + eth_skb_pull_mac(skb); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) { dev_sw_netstats_tx_add(dev, 1, len); diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 2ad1ffa4ccb9..0ed47d00549b 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -636,6 +636,14 @@ static inline void eth_skb_pkt_type(struct sk_buff *skb, } } +static inline struct ethhdr *eth_skb_pull_mac(struct sk_buff *skb) +{ + struct ethhdr *eth = (struct ethhdr *)skb->data; + + skb_pull_inline(skb, ETH_HLEN); + return eth; +} + /** * eth_skb_pad - Pad buffer to mininum number of octets for Ethernet frame * @skb: Buffer to pad diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 049c3adeb850..4e3651101b86 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -161,9 +161,7 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) skb->dev = dev; skb_reset_mac_header(skb); - eth = (struct ethhdr *)skb->data; - skb_pull_inline(skb, ETH_HLEN); - + eth = eth_skb_pull_mac(skb); eth_skb_pkt_type(skb, dev); /* -- cgit v1.2.3 From f89ea63f1c65d3e93b255f14f9d9e05df87955fa Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 24 May 2024 15:26:11 +0100 Subject: netfs, 9p: Fix race between umount and async request completion There's a problem in 9p's interaction with netfslib whereby a crash occurs because the 9p_fid structs get forcibly destroyed during client teardown (without paying attention to their refcounts) before netfslib has finished with them. However, it's not a simple case of deferring the clunking that p9_fid_put() does as that requires the p9_client record to still be present. The problem is that netfslib has to unlock pages and clear the IN_PROGRESS flag before destroying the objects involved - including the fid - and, in any case, nothing checks to see if writeback completed barring looking at the page flags. Fix this by keeping a count of outstanding I/O requests (of any type) and waiting for it to quiesce during inode eviction. Reported-by: syzbot+df038d463cca332e8414@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/0000000000005be0aa061846f8d6@google.com/ Reported-by: syzbot+d7c7a495a5e466c031b6@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/000000000000b86c5e06130da9c6@google.com/ Reported-by: syzbot+1527696d41a634cc1819@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/000000000000041f960618206d7e@google.com/ Signed-off-by: David Howells Link: https://lore.kernel.org/r/755891.1716560771@warthog.procyon.org.uk Tested-by: syzbot+d7c7a495a5e466c031b6@syzkaller.appspotmail.com Reviewed-by: Dominique Martinet cc: Eric Van Hensbergen cc: Latchesar Ionkov cc: Christian Schoenebeck cc: Jeff Layton cc: Steve French cc: Hillf Danton cc: v9fs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Reported-and-tested-by: syzbot+d7c7a495a5e466c031b6@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- fs/9p/vfs_inode.c | 1 + fs/afs/inode.c | 1 + fs/netfs/objects.c | 5 +++++ fs/smb/client/cifsfs.c | 1 + include/linux/netfs.h | 18 ++++++++++++++++++ 5 files changed, 26 insertions(+) (limited to 'include') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 7a3308d77606..fd72fc38c8f5 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -348,6 +348,7 @@ void v9fs_evict_inode(struct inode *inode) __le32 __maybe_unused version; if (!is_bad_inode(inode)) { + netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); version = cpu_to_le32(v9inode->qid.version); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 94fc049aff58..15bb7989c387 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -648,6 +648,7 @@ void afs_evict_inode(struct inode *inode) ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode); + netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); afs_set_cache_aux(vnode, &aux); diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index c90d482b1650..f4a642727479 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -72,6 +72,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, } } + atomic_inc(&ctx->io_count); trace_netfs_rreq_ref(rreq->debug_id, 1, netfs_rreq_trace_new); netfs_proc_add_rreq(rreq); netfs_stat(&netfs_n_rh_rreq); @@ -124,6 +125,7 @@ static void netfs_free_request(struct work_struct *work) { struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work); + struct netfs_inode *ictx = netfs_inode(rreq->inode); unsigned int i; trace_netfs_rreq(rreq, netfs_rreq_trace_free); @@ -142,6 +144,9 @@ static void netfs_free_request(struct work_struct *work) } kvfree(rreq->direct_bv); } + + if (atomic_dec_and_test(&ictx->io_count)) + wake_up_var(&ictx->io_count); call_rcu(&rreq->rcu, netfs_free_request_rcu); } diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index ec5b639f421a..14810ffd15c8 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -431,6 +431,7 @@ cifs_free_inode(struct inode *inode) static void cifs_evict_inode(struct inode *inode) { + netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); if (inode->i_state & I_PINNING_NETFS_WB) cifs_fscache_unuse_inode_cookie(inode, true); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index ca56a4428043..3b22ce0d064c 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -68,6 +68,7 @@ struct netfs_inode { loff_t remote_i_size; /* Size of the remote file */ loff_t zero_point; /* Size after which we assume there's no data * on the server */ + atomic_t io_count; /* Number of outstanding reqs */ unsigned long flags; #define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ #define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ @@ -472,6 +473,7 @@ static inline void netfs_inode_init(struct netfs_inode *ctx, ctx->remote_i_size = i_size_read(&ctx->inode); ctx->zero_point = LLONG_MAX; ctx->flags = 0; + atomic_set(&ctx->io_count, 0); #if IS_ENABLED(CONFIG_FSCACHE) ctx->cache = NULL; #endif @@ -515,4 +517,20 @@ static inline struct fscache_cookie *netfs_i_cookie(struct netfs_inode *ctx) #endif } +/** + * netfs_wait_for_outstanding_io - Wait for outstanding I/O to complete + * @ctx: The netfs inode to wait on + * + * Wait for outstanding I/O requests of any type to complete. This is intended + * to be called from inode eviction routines. This makes sure that any + * resources held by those requests are cleaned up before we let the inode get + * cleaned up. + */ +static inline void netfs_wait_for_outstanding_io(struct inode *inode) +{ + struct netfs_inode *ictx = netfs_inode(inode); + + wait_var_event(&ictx->io_count, atomic_read(&ictx->io_count) == 0); +} + #endif /* _LINUX_NETFS_H */ -- cgit v1.2.3 From 80e4e17ac9e05adba3f1f0e1793398086e6d4007 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 May 2024 15:16:06 -0700 Subject: block: remove blk_queue_max_integrity_segments This is unused now that all the atomic queue limit conversions are merged. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240521221606.393040-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-integrity.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index e253e7bd0d17..7428cb43952d 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -66,12 +66,6 @@ blk_integrity_queue_supports_integrity(struct request_queue *q) return q->integrity.profile; } -static inline void blk_queue_max_integrity_segments(struct request_queue *q, - unsigned int segs) -{ - q->limits.max_integrity_segments = segs; -} - static inline unsigned short queue_max_integrity_segments(const struct request_queue *q) { @@ -151,10 +145,6 @@ static inline void blk_integrity_register(struct gendisk *d, static inline void blk_integrity_unregister(struct gendisk *d) { } -static inline void blk_queue_max_integrity_segments(struct request_queue *q, - unsigned int segs) -{ -} static inline unsigned short queue_max_integrity_segments(const struct request_queue *q) { -- cgit v1.2.3 From f4dca95fc0f6350918f2e6727e35b41f7f86fcce Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 23 May 2024 13:05:27 +0000 Subject: tcp: reduce accepted window in NEW_SYN_RECV state Jason commit made checks against ACK sequence less strict and can be exploited by attackers to establish spoofed flows with less probes. Innocent users might use tcp_rmem[1] == 1,000,000,000, or something more reasonable. An attacker can use a regular TCP connection to learn the server initial tp->rcv_wnd, and use it to optimize the attack. If we make sure that only the announced window (smaller than 65535) is used for ACK validation, we force an attacker to use 65537 packets to complete the 3WHS (assuming server ISN is unknown) Fixes: 378979e94e95 ("tcp: remove 64 KByte limit for initial tp->rcv_wnd value") Link: https://datatracker.ietf.org/meeting/119/materials/slides-119-tcpm-ghost-acks-00 Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Reviewed-by: Jason Xing Link: https://lore.kernel.org/r/20240523130528.60376-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/request_sock.h | 12 ++++++++++++ net/ipv4/tcp_ipv4.c | 7 +------ net/ipv4/tcp_minisocks.c | 7 +++++-- net/ipv6/tcp_ipv6.c | 7 +------ 4 files changed, 19 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/request_sock.h b/include/net/request_sock.h index d88c0dfc2d46..ebcb8896bffc 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -285,4 +285,16 @@ static inline int reqsk_queue_len_young(const struct request_sock_queue *queue) return atomic_read(&queue->young); } +/* RFC 7323 2.3 Using the Window Scale Option + * The window field (SEG.WND) of every outgoing segment, with the + * exception of segments, MUST be right-shifted by + * Rcv.Wind.Shift bits. + * + * This means the SEG.WND carried in SYNACK can not exceed 65535. + * We use this property to harden TCP stack while in NEW_SYN_RECV state. + */ +static inline u32 tcp_synack_window(const struct request_sock *req) +{ + return min(req->rsk_rcv_wnd, 65535U); +} #endif /* _REQUEST_SOCK_H */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 30ef0c8f5e92..b710958393e6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1144,14 +1144,9 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, #endif } - /* RFC 7323 2.3 - * The window field (SEG.WND) of every outgoing segment, with the - * exception of segments, MUST be right-shifted by - * Rcv.Wind.Shift bits: - */ tcp_v4_send_ack(sk, skb, seq, tcp_rsk(req)->rcv_nxt, - req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, + tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, tcp_rsk_tsval(tcp_rsk(req)), READ_ONCE(req->ts_recent), 0, &key, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b93619b2384b..538c06f95918 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -783,8 +783,11 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, /* RFC793: "first check sequence number". */ - if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, - tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) { + if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq, + tcp_rsk(req)->rcv_nxt, + tcp_rsk(req)->rcv_nxt + + tcp_synack_window(req))) { /* Out of window: send ACK and drop. */ if (!(flg & TCP_FLAG_RST) && !tcp_oow_rate_limited(sock_net(sk), skb, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4c3605485b68..8c577b651bfc 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1272,15 +1272,10 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. */ - /* RFC 7323 2.3 - * The window field (SEG.WND) of every outgoing segment, with the - * exception of segments, MUST be right-shifted by - * Rcv.Wind.Shift bits: - */ tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, tcp_rsk(req)->rcv_nxt, - req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, + tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, tcp_rsk_tsval(tcp_rsk(req)), READ_ONCE(req->ts_recent), sk->sk_bound_dev_if, &key, ipv6_get_dsfield(ipv6_hdr(skb)), 0, -- cgit v1.2.3 From 266aa3b4812e97942a8ce5c7aafa7da059f7b5b8 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 24 May 2024 13:28:59 +0200 Subject: page_pool: fix &page_pool_params kdoc issues After the tagged commit, @netdev got documented twice and the kdoc script didn't notice that. Remove the second description added later and move the initial one according to the field position. After merging commit 5f8e4007c10d ("kernel-doc: fix struct_group_tagged() parsing"), kdoc requires to describe struct groups as well. &page_pool_params has 2 struct groups which generated new warnings, describe them to resolve this. Fixes: 403f11ac9ab7 ("page_pool: don't use driver-set flags field directly") Signed-off-by: Alexander Lobakin Link: https://lore.kernel.org/r/20240524112859.2757403-1-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/net/page_pool/types.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index b088d131aeb0..7e8477057f3d 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -45,16 +45,17 @@ struct pp_alloc_cache { /** * struct page_pool_params - page pool parameters + * @fast: params accessed frequently on hotpath * @order: 2^order pages on allocation * @pool_size: size of the ptr_ring * @nid: NUMA node id to allocate from pages from * @dev: device, for DMA pre-mapping purposes - * @netdev: netdev this pool will serve (leave as NULL if none or multiple) * @napi: NAPI which is the sole consumer of pages, otherwise NULL * @dma_dir: DMA mapping direction * @max_len: max DMA sync memory size for PP_FLAG_DMA_SYNC_DEV * @offset: DMA sync address offset for PP_FLAG_DMA_SYNC_DEV - * @netdev: corresponding &net_device for Netlink introspection + * @slow: params with slowpath access only (initialization and Netlink) + * @netdev: netdev this pool will serve (leave as NULL if none or multiple) * @flags: PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV, PP_FLAG_SYSTEM_POOL */ struct page_pool_params { -- cgit v1.2.3 From f3d7ba9e1bc0c9080834f263d4887bd9c9ea491f Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Mon, 27 May 2024 13:56:27 +0300 Subject: tpm: Open code tpm_buf_parameters() With only single call site, this makes no sense (slipped out of the radar during the review). Open code and document the action directly to the site, to make it more readable. Fixes: 1b6d7f9eb150 ("tpm: add session encryption protection to tpm2_get_random()") Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm-buf.c | 26 -------------------------- drivers/char/tpm/tpm2-cmd.c | 10 +++++++++- include/linux/tpm.h | 2 -- 3 files changed, 9 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/drivers/char/tpm/tpm-buf.c b/drivers/char/tpm/tpm-buf.c index 647c6ca92ac3..cad0048bcc3c 100644 --- a/drivers/char/tpm/tpm-buf.c +++ b/drivers/char/tpm/tpm-buf.c @@ -223,30 +223,4 @@ u32 tpm_buf_read_u32(struct tpm_buf *buf, off_t *offset) } EXPORT_SYMBOL_GPL(tpm_buf_read_u32); -static u16 tpm_buf_tag(struct tpm_buf *buf) -{ - struct tpm_header *head = (struct tpm_header *)buf->data; - - return be16_to_cpu(head->tag); -} - -/** - * tpm_buf_parameters - return the TPM response parameters area of the tpm_buf - * @buf: tpm_buf to use - * - * Where the parameters are located depends on the tag of a TPM - * command (it's immediately after the header for TPM_ST_NO_SESSIONS - * or 4 bytes after for TPM_ST_SESSIONS). Evaluate this and return a - * pointer to the first byte of the parameters area. - * - * @return: pointer to parameters area - */ -u8 *tpm_buf_parameters(struct tpm_buf *buf) -{ - int offset = TPM_HEADER_SIZE; - - if (tpm_buf_tag(buf) == TPM2_ST_SESSIONS) - offset += 4; - return &buf->data[offset]; -} diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c index 0cdf892ec2a7..1e856259219e 100644 --- a/drivers/char/tpm/tpm2-cmd.c +++ b/drivers/char/tpm/tpm2-cmd.c @@ -281,6 +281,7 @@ struct tpm2_get_random_out { int tpm2_get_random(struct tpm_chip *chip, u8 *dest, size_t max) { struct tpm2_get_random_out *out; + struct tpm_header *head; struct tpm_buf buf; u32 recd; u32 num_bytes = max; @@ -288,6 +289,7 @@ int tpm2_get_random(struct tpm_chip *chip, u8 *dest, size_t max) int total = 0; int retries = 5; u8 *dest_ptr = dest; + off_t offset; if (!num_bytes || max > TPM_MAX_RNG_DATA) return -EINVAL; @@ -320,7 +322,13 @@ int tpm2_get_random(struct tpm_chip *chip, u8 *dest, size_t max) goto out; } - out = (struct tpm2_get_random_out *)tpm_buf_parameters(&buf); + head = (struct tpm_header *)buf.data; + offset = TPM_HEADER_SIZE; + /* Skip the parameter size field: */ + if (be16_to_cpu(head->tag) == TPM2_ST_SESSIONS) + offset += 4; + + out = (struct tpm2_get_random_out *)&buf.data[offset]; recd = min_t(u32, be16_to_cpu(out->size), num_bytes); if (tpm_buf_length(&buf) < TPM_HEADER_SIZE + diff --git a/include/linux/tpm.h b/include/linux/tpm.h index c17e4efbb2e5..b3217200df28 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -437,8 +437,6 @@ u8 tpm_buf_read_u8(struct tpm_buf *buf, off_t *offset); u16 tpm_buf_read_u16(struct tpm_buf *buf, off_t *offset); u32 tpm_buf_read_u32(struct tpm_buf *buf, off_t *offset); -u8 *tpm_buf_parameters(struct tpm_buf *buf); - /* * Check if TPM device is in the firmware upgrade mode. */ -- cgit v1.2.3 From f09fc6cee0dcfc38148ee6b6dd04f93e353d22f2 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 28 May 2024 12:52:21 +0300 Subject: tpm: Rename TPM2_OA_TMPL to TPM2_OA_NULL_KEY and make it local Rename and document TPM2_OA_TMPL, as originally requested in the patch set review, but left unaddressed without any appropriate reasoning. The new name is TPM2_OA_NULL_KEY, has a documentation and is local only to tpm2-sessions.c. Link: https://lore.kernel.org/linux-integrity/ddbeb8111f48a8ddb0b8fca248dff6cc9d7079b2.camel@HansenPartnership.com/ Link: https://lore.kernel.org/linux-integrity/CZCKTWU6ZCC9.2UTEQPEVICYHL@suppilovahvero/ Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm2-sessions.c | 21 +++++++++++++++++++-- include/linux/tpm.h | 15 --------------- 2 files changed, 19 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/drivers/char/tpm/tpm2-sessions.c b/drivers/char/tpm/tpm2-sessions.c index ea8860661876..907ac9956a78 100644 --- a/drivers/char/tpm/tpm2-sessions.c +++ b/drivers/char/tpm/tpm2-sessions.c @@ -80,6 +80,9 @@ /* maximum number of names the TPM must remember for authorization */ #define AUTH_MAX_NAMES 3 +#define AES_KEY_BYTES AES_KEYSIZE_128 +#define AES_KEY_BITS (AES_KEY_BYTES*8) + static int tpm2_create_primary(struct tpm_chip *chip, u32 hierarchy, u32 *handle, u8 *name); @@ -954,6 +957,20 @@ int tpm2_start_auth_session(struct tpm_chip *chip) } EXPORT_SYMBOL(tpm2_start_auth_session); +/* + * A mask containing the object attributes for the kernel held null primary key + * used in HMAC encryption. For more information on specific attributes look up + * to "8.3 TPMA_OBJECT (Object Attributes)". + */ +#define TPM2_OA_NULL_KEY ( \ + TPM2_OA_NO_DA | \ + TPM2_OA_FIXED_TPM | \ + TPM2_OA_FIXED_PARENT | \ + TPM2_OA_SENSITIVE_DATA_ORIGIN | \ + TPM2_OA_USER_WITH_AUTH | \ + TPM2_OA_DECRYPT | \ + TPM2_OA_RESTRICTED) + /** * tpm2_parse_create_primary() - parse the data returned from TPM_CC_CREATE_PRIMARY * @@ -1018,7 +1035,7 @@ static int tpm2_parse_create_primary(struct tpm_chip *chip, struct tpm_buf *buf, val = tpm_buf_read_u32(buf, &offset_t); /* object properties */ - if (val != TPM2_OA_TMPL) + if (val != TPM2_OA_NULL_KEY) return -EINVAL; /* auth policy (empty) */ @@ -1178,7 +1195,7 @@ static int tpm2_create_primary(struct tpm_chip *chip, u32 hierarchy, tpm_buf_append_u16(&template, TPM_ALG_SHA256); /* object properties */ - tpm_buf_append_u32(&template, TPM2_OA_TMPL); + tpm_buf_append_u32(&template, TPM2_OA_NULL_KEY); /* sauth policy (empty) */ tpm_buf_append_u16(&template, 0); diff --git a/include/linux/tpm.h b/include/linux/tpm.h index b3217200df28..21a67dc9efe8 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -394,21 +394,6 @@ enum tpm2_object_attributes { TPM2_OA_SIGN = BIT(18), }; -/* - * definitions for the canonical template. These are mandated - * by the TCG key template documents - */ - -#define AES_KEY_BYTES AES_KEYSIZE_128 -#define AES_KEY_BITS (AES_KEY_BYTES*8) -#define TPM2_OA_TMPL (TPM2_OA_NO_DA | \ - TPM2_OA_FIXED_TPM | \ - TPM2_OA_FIXED_PARENT | \ - TPM2_OA_SENSITIVE_DATA_ORIGIN | \ - TPM2_OA_USER_WITH_AUTH | \ - TPM2_OA_DECRYPT | \ - TPM2_OA_RESTRICTED) - enum tpm2_session_attributes { TPM2_SA_CONTINUE_SESSION = BIT(0), TPM2_SA_AUDIT_EXCLUSIVE = BIT(1), -- cgit v1.2.3 From 6d40dbc75877110c5e0d661dd77f6cfce916765e Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Tue, 28 May 2024 21:18:50 +0200 Subject: ALSA: pcm: fix typo in comment Fix the typo in the comment for SNDRV_PCM_RATE_KNOT Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20240528191850.63314-1-alexandre.belloni@bootlin.com Signed-off-by: Takashi Iwai --- include/sound/pcm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/sound/pcm.h b/include/sound/pcm.h index 61c6054618c8..3edd7a7346da 100644 --- a/include/sound/pcm.h +++ b/include/sound/pcm.h @@ -124,7 +124,7 @@ struct snd_pcm_ops { #define SNDRV_PCM_RATE_768000 (1U<<16) /* 768000Hz */ #define SNDRV_PCM_RATE_CONTINUOUS (1U<<30) /* continuous range */ -#define SNDRV_PCM_RATE_KNOT (1U<<31) /* supports more non-continuos rates */ +#define SNDRV_PCM_RATE_KNOT (1U<<31) /* supports more non-continuous rates */ #define SNDRV_PCM_RATE_8000_44100 (SNDRV_PCM_RATE_8000|SNDRV_PCM_RATE_11025|\ SNDRV_PCM_RATE_16000|SNDRV_PCM_RATE_22050|\ -- cgit v1.2.3 From 92f1655aa2b2294d0b49925f3b875a634bd3b59e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 May 2024 11:43:53 +0000 Subject: net: fix __dst_negative_advice() race __dst_negative_advice() does not enforce proper RCU rules when sk->dst_cache must be cleared, leading to possible UAF. RCU rules are that we must first clear sk->sk_dst_cache, then call dst_release(old_dst). Note that sk_dst_reset(sk) is implementing this protocol correctly, while __dst_negative_advice() uses the wrong order. Given that ip6_negative_advice() has special logic against RTF_CACHE, this means each of the three ->negative_advice() existing methods must perform the sk_dst_reset() themselves. Note the check against NULL dst is centralized in __dst_negative_advice(), there is no need to duplicate it in various callbacks. Many thanks to Clement Lecigne for tracking this issue. This old bug became visible after the blamed commit, using UDP sockets. Fixes: a87cb3e48ee8 ("net: Facility to report route quality of connected sockets") Reported-by: Clement Lecigne Diagnosed-by: Clement Lecigne Signed-off-by: Eric Dumazet Cc: Tom Herbert Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240528114353.1794151-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/dst_ops.h | 2 +- include/net/sock.h | 13 +++---------- net/ipv4/route.c | 22 ++++++++-------------- net/ipv6/route.c | 29 +++++++++++++++-------------- net/xfrm/xfrm_policy.c | 11 +++-------- 5 files changed, 30 insertions(+), 47 deletions(-) (limited to 'include') diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h index 6d1c8541183d..3a9001a042a5 100644 --- a/include/net/dst_ops.h +++ b/include/net/dst_ops.h @@ -24,7 +24,7 @@ struct dst_ops { void (*destroy)(struct dst_entry *); void (*ifdown)(struct dst_entry *, struct net_device *dev); - struct dst_entry * (*negative_advice)(struct dst_entry *); + void (*negative_advice)(struct sock *sk, struct dst_entry *); void (*link_failure)(struct sk_buff *); void (*update_pmtu)(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, diff --git a/include/net/sock.h b/include/net/sock.h index 5f4d0629348f..953c8dc4e259 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2063,17 +2063,10 @@ sk_dst_get(const struct sock *sk) static inline void __dst_negative_advice(struct sock *sk) { - struct dst_entry *ndst, *dst = __sk_dst_get(sk); + struct dst_entry *dst = __sk_dst_get(sk); - if (dst && dst->ops->negative_advice) { - ndst = dst->ops->negative_advice(dst); - - if (ndst != dst) { - rcu_assign_pointer(sk->sk_dst_cache, ndst); - sk_tx_queue_clear(sk); - WRITE_ONCE(sk->sk_dst_pending_confirm, 0); - } - } + if (dst && dst->ops->negative_advice) + dst->ops->negative_advice(sk, dst); } static inline void dst_negative_advice(struct sock *sk) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 5fd54103174f..b3073d1c8f8f 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -129,7 +129,8 @@ struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ipv4_default_advmss(const struct dst_entry *dst); INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst); -static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); +static void ipv4_negative_advice(struct sock *sk, + struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, @@ -825,22 +826,15 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf __ip_do_redirect(rt, skb, &fl4, true); } -static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) +static void ipv4_negative_advice(struct sock *sk, + struct dst_entry *dst) { struct rtable *rt = dst_rtable(dst); - struct dst_entry *ret = dst; - if (rt) { - if (dst->obsolete > 0) { - ip_rt_put(rt); - ret = NULL; - } else if ((rt->rt_flags & RTCF_REDIRECTED) || - rt->dst.expires) { - ip_rt_put(rt); - ret = NULL; - } - } - return ret; + if ((dst->obsolete > 0) || + (rt->rt_flags & RTCF_REDIRECTED) || + rt->dst.expires) + sk_dst_reset(sk); } /* diff --git a/net/ipv6/route.c b/net/ipv6/route.c index bbc2a0dd9314..a504b88ec06b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -87,7 +87,8 @@ struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ip6_default_advmss(const struct dst_entry *dst); INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst); -static struct dst_entry *ip6_negative_advice(struct dst_entry *); +static void ip6_negative_advice(struct sock *sk, + struct dst_entry *dst); static void ip6_dst_destroy(struct dst_entry *); static void ip6_dst_ifdown(struct dst_entry *, struct net_device *dev); @@ -2770,24 +2771,24 @@ INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst, } EXPORT_INDIRECT_CALLABLE(ip6_dst_check); -static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) +static void ip6_negative_advice(struct sock *sk, + struct dst_entry *dst) { struct rt6_info *rt = dst_rt6_info(dst); - if (rt) { - if (rt->rt6i_flags & RTF_CACHE) { - rcu_read_lock(); - if (rt6_check_expired(rt)) { - rt6_remove_exception_rt(rt); - dst = NULL; - } - rcu_read_unlock(); - } else { - dst_release(dst); - dst = NULL; + if (rt->rt6i_flags & RTF_CACHE) { + rcu_read_lock(); + if (rt6_check_expired(rt)) { + /* counteract the dst_release() in sk_dst_reset() */ + dst_hold(dst); + sk_dst_reset(sk); + + rt6_remove_exception_rt(rt); } + rcu_read_unlock(); + return; } - return dst; + sk_dst_reset(sk); } static void ip6_link_failure(struct sk_buff *skb) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 475b904fe68b..66e07de2de35 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3910,15 +3910,10 @@ static void xfrm_link_failure(struct sk_buff *skb) /* Impossible. Such dst must be popped before reaches point of failure. */ } -static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst) +static void xfrm_negative_advice(struct sock *sk, struct dst_entry *dst) { - if (dst) { - if (dst->obsolete) { - dst_release(dst); - dst = NULL; - } - } - return dst; + if (dst->obsolete) + sk_dst_reset(sk); } static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr) -- cgit v1.2.3 From 13c7c941e72908b8cce5a84b45a7b5e485ca12ed Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 29 May 2024 09:35:47 -0700 Subject: netdev: add qstat for csum complete Recent commit 0cfe71f45f42 ("netdev: add queue stats") added a lot of useful stats, but only those immediately needed by virtio. Presumably virtio does not support CHECKSUM_COMPLETE, so statistic for that form of checksumming wasn't included. Other drivers will definitely need it, in fact we expect it to be needed in net-next soon (mlx5). So let's add the definition of the counter for CHECKSUM_COMPLETE to uAPI in net already, so that the counters are in a more natural order (all subsequent counters have not been present in any released kernel, yet). Signed-off-by: Jakub Kicinski Reviewed-by: Joe Damato Fixes: 0cfe71f45f42 ("netdev: add queue stats") Link: https://lore.kernel.org/r/20240529163547.3693194-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/netdev.yaml | 4 ++++ include/uapi/linux/netdev.h | 1 + tools/include/uapi/linux/netdev.h | 1 + 3 files changed, 6 insertions(+) (limited to 'include') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 11a32373365a..959755be4d7f 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -349,6 +349,10 @@ attribute-sets: Number of packets dropped due to transient lack of resources, such as buffer space, host descriptors etc. type: uint + - + name: rx-csum-complete + doc: Number of packets that were marked as CHECKSUM_COMPLETE. + type: uint - name: rx-csum-unnecessary doc: Number of packets that were marked as CHECKSUM_UNNECESSARY. diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index a8188202413e..43742ac5b00d 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -148,6 +148,7 @@ enum { NETDEV_A_QSTATS_RX_ALLOC_FAIL, NETDEV_A_QSTATS_RX_HW_DROPS, NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, + NETDEV_A_QSTATS_RX_CSUM_COMPLETE, NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, NETDEV_A_QSTATS_RX_CSUM_NONE, NETDEV_A_QSTATS_RX_CSUM_BAD, diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index a8188202413e..43742ac5b00d 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -148,6 +148,7 @@ enum { NETDEV_A_QSTATS_RX_ALLOC_FAIL, NETDEV_A_QSTATS_RX_HW_DROPS, NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, + NETDEV_A_QSTATS_RX_CSUM_COMPLETE, NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, NETDEV_A_QSTATS_RX_CSUM_NONE, NETDEV_A_QSTATS_RX_CSUM_BAD, -- cgit v1.2.3 From 29459c3eaa5c6261fbe0dea7bdeb9b48d35d862a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 30 May 2024 14:40:34 +0900 Subject: block: Fix zone write plugging handling of devices with a runt zone A zoned device may have a last sequential write required zone that is smaller than other zones. However, all tests to check if a zone write plug write offset exceeds the zone capacity use the same capacity value stored in the gendisk zone_capacity field. This is incorrect for a zoned device with a last runt (smaller) zone. Add the new field last_zone_capacity to struct gendisk to store the capacity of the last zone of the device. blk_revalidate_seq_zone() and blk_revalidate_conv_zone() are both modified to get this value when disk_zone_is_last() returns true. Similarly to zone_capacity, the value is first stored using the last_zone_capacity field of struct blk_revalidate_zone_args. Once zone revalidation of all zones is done, this is used to set the gendisk last_zone_capacity field. The checks to determine if a zone is full or if a sector offset in a zone exceeds the zone capacity in disk_should_remove_zone_wplug(), disk_zone_wplug_abort_unaligned(), blk_zone_write_plug_init_request(), and blk_zone_wplug_prepare_bio() are modified to use the new helper functions disk_zone_is_full() and disk_zone_wplug_is_full(). disk_zone_is_full() uses the zone index to determine if the zone being tested is the last one of the disk and uses the either the disk zone_capacity or last_zone_capacity accordingly. Fixes: dd291d77cc90 ("block: Introduce zone write plugging") Signed-off-by: Damien Le Moal Reviewed-by: Bart Van Assche Reviewed-by: Niklas Cassel Link: https://lore.kernel.org/r/20240530054035.491497-4-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 35 +++++++++++++++++++++++++++-------- include/linux/blkdev.h | 1 + 2 files changed, 28 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 402a50a1ac4d..52abebf56027 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -455,6 +455,20 @@ static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone) return zone->start + zone->len >= get_capacity(disk); } +static bool disk_zone_is_full(struct gendisk *disk, + unsigned int zno, unsigned int offset_in_zone) +{ + if (zno < disk->nr_zones - 1) + return offset_in_zone >= disk->zone_capacity; + return offset_in_zone >= disk->last_zone_capacity; +} + +static bool disk_zone_wplug_is_full(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + return disk_zone_is_full(disk, zwplug->zone_no, zwplug->wp_offset); +} + static bool disk_insert_zone_wplug(struct gendisk *disk, struct blk_zone_wplug *zwplug) { @@ -548,7 +562,7 @@ static inline bool disk_should_remove_zone_wplug(struct gendisk *disk, return false; /* We can remove zone write plugs for zones that are empty or full. */ - return !zwplug->wp_offset || zwplug->wp_offset >= disk->zone_capacity; + return !zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug); } static void disk_remove_zone_wplug(struct gendisk *disk, @@ -669,13 +683,12 @@ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) static void disk_zone_wplug_abort_unaligned(struct gendisk *disk, struct blk_zone_wplug *zwplug) { - unsigned int zone_capacity = disk->zone_capacity; unsigned int wp_offset = zwplug->wp_offset; struct bio_list bl = BIO_EMPTY_LIST; struct bio *bio; while ((bio = bio_list_pop(&zwplug->bio_list))) { - if (wp_offset >= zone_capacity || + if (disk_zone_is_full(disk, zwplug->zone_no, wp_offset) || (bio_op(bio) != REQ_OP_ZONE_APPEND && bio_offset_from_zone_start(bio) != wp_offset)) { blk_zone_wplug_bio_io_error(zwplug, bio); @@ -914,7 +927,6 @@ void blk_zone_write_plug_init_request(struct request *req) sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req); struct request_queue *q = req->q; struct gendisk *disk = q->disk; - unsigned int zone_capacity = disk->zone_capacity; struct blk_zone_wplug *zwplug = disk_get_zone_wplug(disk, blk_rq_pos(req)); unsigned long flags; @@ -938,7 +950,7 @@ void blk_zone_write_plug_init_request(struct request *req) * into the back of the request. */ spin_lock_irqsave(&zwplug->lock, flags); - while (zwplug->wp_offset < zone_capacity) { + while (!disk_zone_wplug_is_full(disk, zwplug)) { bio = bio_list_peek(&zwplug->bio_list); if (!bio) break; @@ -984,7 +996,7 @@ static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug, * We know such BIO will fail, and that would potentially overflow our * write pointer offset beyond the end of the zone. */ - if (zwplug->wp_offset >= disk->zone_capacity) + if (disk_zone_wplug_is_full(disk, zwplug)) goto err; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { @@ -1561,6 +1573,7 @@ void disk_free_zone_resources(struct gendisk *disk) kfree(disk->conv_zones_bitmap); disk->conv_zones_bitmap = NULL; disk->zone_capacity = 0; + disk->last_zone_capacity = 0; disk->nr_zones = 0; } @@ -1605,6 +1618,7 @@ struct blk_revalidate_zone_args { unsigned long *conv_zones_bitmap; unsigned int nr_zones; unsigned int zone_capacity; + unsigned int last_zone_capacity; sector_t sector; }; @@ -1622,6 +1636,7 @@ static int disk_update_zone_resources(struct gendisk *disk, disk->nr_zones = args->nr_zones; disk->zone_capacity = args->zone_capacity; + disk->last_zone_capacity = args->last_zone_capacity; swap(disk->conv_zones_bitmap, args->conv_zones_bitmap); if (disk->conv_zones_bitmap) nr_conv_zones = bitmap_weight(disk->conv_zones_bitmap, @@ -1673,6 +1688,9 @@ static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, return -ENODEV; } + if (disk_zone_is_last(disk, zone)) + args->last_zone_capacity = zone->capacity; + if (!disk_need_zone_resources(disk)) return 0; @@ -1703,8 +1721,9 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, */ if (!args->zone_capacity) args->zone_capacity = zone->capacity; - if (!disk_zone_is_last(disk, zone) && - zone->capacity != args->zone_capacity) { + if (disk_zone_is_last(disk, zone)) { + args->last_zone_capacity = zone->capacity; + } else if (zone->capacity != args->zone_capacity) { pr_warn("%s: Invalid variable zone capacity\n", disk->disk_name); return -ENODEV; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index aefdda9f4ec7..24c36929920b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -186,6 +186,7 @@ struct gendisk { */ unsigned int nr_zones; unsigned int zone_capacity; + unsigned int last_zone_capacity; unsigned long *conv_zones_bitmap; unsigned int zone_wplugs_hash_bits; spinlock_t zone_wplugs_lock; -- cgit v1.2.3 From 7bc4244c882a7d7d79f4afefc50893244eb11d07 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 1 Jun 2024 07:28:21 +0200 Subject: Revert "VT: Use macros to define ioctls" This reverts commit 8c467f3300591a206fa8dcc6988d768910799872. Turns out this breaks many architectures as the vt ioctls do not all match up everywhere due to historical reasons, so the original commit is invalid for many values. Reported-by: Nick Bowler Reported-by: Arnd Bergmann Reported-by: Jiri Slaby Reported-by: Christian Zigotzky Reported-by: Michael Ellerman Cc: Al Viro Cc: Alexey Gladkov Link: https://lore.kernel.org/r/ad4e561c-1d49-4f25-882c-7a36c6b1b5c0@draconx.ca Link: https://lore.kernel.org/r/0da9785e-ba44-4718-9d08-4e96c1ba7ab2@kernel.org Link: https://lore.kernel.org/all/34d848f4-670b-4493-bf21-130ef862521b@xenosoft.de/ Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/kd.h | 96 ++++++++++++++++++++++++------------------------- 1 file changed, 47 insertions(+), 49 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/kd.h b/include/uapi/linux/kd.h index 8ddb2219a84b..6b384065c013 100644 --- a/include/uapi/linux/kd.h +++ b/include/uapi/linux/kd.h @@ -5,61 +5,60 @@ #include /* 0x4B is 'K', to avoid collision with termios and vt */ -#define KD_IOCTL_BASE 'K' -#define GIO_FONT _IO(KD_IOCTL_BASE, 0x60) /* gets font in expanded form */ -#define PIO_FONT _IO(KD_IOCTL_BASE, 0x61) /* use font in expanded form */ +#define GIO_FONT 0x4B60 /* gets font in expanded form */ +#define PIO_FONT 0x4B61 /* use font in expanded form */ -#define GIO_FONTX _IO(KD_IOCTL_BASE, 0x6B) /* get font using struct consolefontdesc */ -#define PIO_FONTX _IO(KD_IOCTL_BASE, 0x6C) /* set font using struct consolefontdesc */ +#define GIO_FONTX 0x4B6B /* get font using struct consolefontdesc */ +#define PIO_FONTX 0x4B6C /* set font using struct consolefontdesc */ struct consolefontdesc { unsigned short charcount; /* characters in font (256 or 512) */ unsigned short charheight; /* scan lines per character (1-32) */ char __user *chardata; /* font data in expanded form */ }; -#define PIO_FONTRESET _IO(KD_IOCTL_BASE, 0x6D) /* reset to default font */ +#define PIO_FONTRESET 0x4B6D /* reset to default font */ -#define GIO_CMAP _IO(KD_IOCTL_BASE, 0x70) /* gets colour palette on VGA+ */ -#define PIO_CMAP _IO(KD_IOCTL_BASE, 0x71) /* sets colour palette on VGA+ */ +#define GIO_CMAP 0x4B70 /* gets colour palette on VGA+ */ +#define PIO_CMAP 0x4B71 /* sets colour palette on VGA+ */ -#define KIOCSOUND _IO(KD_IOCTL_BASE, 0x2F) /* start sound generation (0 for off) */ -#define KDMKTONE _IO(KD_IOCTL_BASE, 0x30) /* generate tone */ +#define KIOCSOUND 0x4B2F /* start sound generation (0 for off) */ +#define KDMKTONE 0x4B30 /* generate tone */ -#define KDGETLED _IO(KD_IOCTL_BASE, 0x31) /* return current led state */ -#define KDSETLED _IO(KD_IOCTL_BASE, 0x32) /* set led state [lights, not flags] */ +#define KDGETLED 0x4B31 /* return current led state */ +#define KDSETLED 0x4B32 /* set led state [lights, not flags] */ #define LED_SCR 0x01 /* scroll lock led */ #define LED_NUM 0x02 /* num lock led */ #define LED_CAP 0x04 /* caps lock led */ -#define KDGKBTYPE _IO(KD_IOCTL_BASE, 0x33) /* get keyboard type */ +#define KDGKBTYPE 0x4B33 /* get keyboard type */ #define KB_84 0x01 #define KB_101 0x02 /* this is what we always answer */ #define KB_OTHER 0x03 -#define KDADDIO _IO(KD_IOCTL_BASE, 0x34) /* add i/o port as valid */ -#define KDDELIO _IO(KD_IOCTL_BASE, 0x35) /* del i/o port as valid */ -#define KDENABIO _IO(KD_IOCTL_BASE, 0x36) /* enable i/o to video board */ -#define KDDISABIO _IO(KD_IOCTL_BASE, 0x37) /* disable i/o to video board */ +#define KDADDIO 0x4B34 /* add i/o port as valid */ +#define KDDELIO 0x4B35 /* del i/o port as valid */ +#define KDENABIO 0x4B36 /* enable i/o to video board */ +#define KDDISABIO 0x4B37 /* disable i/o to video board */ -#define KDSETMODE _IO(KD_IOCTL_BASE, 0x3A) /* set text/graphics mode */ +#define KDSETMODE 0x4B3A /* set text/graphics mode */ #define KD_TEXT 0x00 #define KD_GRAPHICS 0x01 #define KD_TEXT0 0x02 /* obsolete */ #define KD_TEXT1 0x03 /* obsolete */ -#define KDGETMODE _IO(KD_IOCTL_BASE, 0x3B) /* get current mode */ +#define KDGETMODE 0x4B3B /* get current mode */ -#define KDMAPDISP _IO(KD_IOCTL_BASE, 0x3C) /* map display into address space */ -#define KDUNMAPDISP _IO(KD_IOCTL_BASE, 0x3D) /* unmap display from address space */ +#define KDMAPDISP 0x4B3C /* map display into address space */ +#define KDUNMAPDISP 0x4B3D /* unmap display from address space */ typedef char scrnmap_t; #define E_TABSZ 256 -#define GIO_SCRNMAP _IO(KD_IOCTL_BASE, 0x40) /* get screen mapping from kernel */ -#define PIO_SCRNMAP _IO(KD_IOCTL_BASE, 0x41) /* put screen mapping table in kernel */ -#define GIO_UNISCRNMAP _IO(KD_IOCTL_BASE, 0x69) /* get full Unicode screen mapping */ -#define PIO_UNISCRNMAP _IO(KD_IOCTL_BASE, 0x6A) /* set full Unicode screen mapping */ +#define GIO_SCRNMAP 0x4B40 /* get screen mapping from kernel */ +#define PIO_SCRNMAP 0x4B41 /* put screen mapping table in kernel */ +#define GIO_UNISCRNMAP 0x4B69 /* get full Unicode screen mapping */ +#define PIO_UNISCRNMAP 0x4B6A /* set full Unicode screen mapping */ -#define GIO_UNIMAP _IO(KD_IOCTL_BASE, 0x66) /* get unicode-to-font mapping from kernel */ +#define GIO_UNIMAP 0x4B66 /* get unicode-to-font mapping from kernel */ struct unipair { unsigned short unicode; unsigned short fontpos; @@ -68,8 +67,8 @@ struct unimapdesc { unsigned short entry_ct; struct unipair __user *entries; }; -#define PIO_UNIMAP _IO(KD_IOCTL_BASE, 0x67) /* put unicode-to-font mapping in kernel */ -#define PIO_UNIMAPCLR _IO(KD_IOCTL_BASE, 0x68) /* clear table, possibly advise hash algorithm */ +#define PIO_UNIMAP 0x4B67 /* put unicode-to-font mapping in kernel */ +#define PIO_UNIMAPCLR 0x4B68 /* clear table, possibly advise hash algorithm */ struct unimapinit { unsigned short advised_hashsize; /* 0 if no opinion */ unsigned short advised_hashstep; /* 0 if no opinion */ @@ -84,19 +83,19 @@ struct unimapinit { #define K_MEDIUMRAW 0x02 #define K_UNICODE 0x03 #define K_OFF 0x04 -#define KDGKBMODE _IO(KD_IOCTL_BASE, 0x44) /* gets current keyboard mode */ -#define KDSKBMODE _IO(KD_IOCTL_BASE, 0x45) /* sets current keyboard mode */ +#define KDGKBMODE 0x4B44 /* gets current keyboard mode */ +#define KDSKBMODE 0x4B45 /* sets current keyboard mode */ #define K_METABIT 0x03 #define K_ESCPREFIX 0x04 -#define KDGKBMETA _IO(KD_IOCTL_BASE, 0x62) /* gets meta key handling mode */ -#define KDSKBMETA _IO(KD_IOCTL_BASE, 0x63) /* sets meta key handling mode */ +#define KDGKBMETA 0x4B62 /* gets meta key handling mode */ +#define KDSKBMETA 0x4B63 /* sets meta key handling mode */ #define K_SCROLLLOCK 0x01 #define K_NUMLOCK 0x02 #define K_CAPSLOCK 0x04 -#define KDGKBLED _IO(KD_IOCTL_BASE, 0x64) /* get led flags (not lights) */ -#define KDSKBLED _IO(KD_IOCTL_BASE, 0x65) /* set led flags (not lights) */ +#define KDGKBLED 0x4B64 /* get led flags (not lights) */ +#define KDSKBLED 0x4B65 /* set led flags (not lights) */ struct kbentry { unsigned char kb_table; @@ -108,15 +107,15 @@ struct kbentry { #define K_ALTTAB 0x02 #define K_ALTSHIFTTAB 0x03 -#define KDGKBENT _IO(KD_IOCTL_BASE, 0x46) /* gets one entry in translation table */ -#define KDSKBENT _IO(KD_IOCTL_BASE, 0x47) /* sets one entry in translation table */ +#define KDGKBENT 0x4B46 /* gets one entry in translation table */ +#define KDSKBENT 0x4B47 /* sets one entry in translation table */ struct kbsentry { unsigned char kb_func; unsigned char kb_string[512]; }; -#define KDGKBSENT _IO(KD_IOCTL_BASE, 0x48) /* gets one function key string entry */ -#define KDSKBSENT _IO(KD_IOCTL_BASE, 0x49) /* sets one function key string entry */ +#define KDGKBSENT 0x4B48 /* gets one function key string entry */ +#define KDSKBSENT 0x4B49 /* sets one function key string entry */ struct kbdiacr { unsigned char diacr, base, result; @@ -125,8 +124,8 @@ struct kbdiacrs { unsigned int kb_cnt; /* number of entries in following array */ struct kbdiacr kbdiacr[256]; /* MAX_DIACR from keyboard.h */ }; -#define KDGKBDIACR _IO(KD_IOCTL_BASE, 0x4A) /* read kernel accent table */ -#define KDSKBDIACR _IO(KD_IOCTL_BASE, 0x4B) /* write kernel accent table */ +#define KDGKBDIACR 0x4B4A /* read kernel accent table */ +#define KDSKBDIACR 0x4B4B /* write kernel accent table */ struct kbdiacruc { unsigned int diacr, base, result; @@ -135,16 +134,16 @@ struct kbdiacrsuc { unsigned int kb_cnt; /* number of entries in following array */ struct kbdiacruc kbdiacruc[256]; /* MAX_DIACR from keyboard.h */ }; -#define KDGKBDIACRUC _IO(KD_IOCTL_BASE, 0xFA) /* read kernel accent table - UCS */ -#define KDSKBDIACRUC _IO(KD_IOCTL_BASE, 0xFB) /* write kernel accent table - UCS */ +#define KDGKBDIACRUC 0x4BFA /* read kernel accent table - UCS */ +#define KDSKBDIACRUC 0x4BFB /* write kernel accent table - UCS */ struct kbkeycode { unsigned int scancode, keycode; }; -#define KDGETKEYCODE _IO(KD_IOCTL_BASE, 0x4C) /* read kernel keycode table entry */ -#define KDSETKEYCODE _IO(KD_IOCTL_BASE, 0x4D) /* write kernel keycode table entry */ +#define KDGETKEYCODE 0x4B4C /* read kernel keycode table entry */ +#define KDSETKEYCODE 0x4B4D /* write kernel keycode table entry */ -#define KDSIGACCEPT _IO(KD_IOCTL_BASE, 0x4E) /* accept kbd generated signals */ +#define KDSIGACCEPT 0x4B4E /* accept kbd generated signals */ struct kbd_repeat { int delay; /* in msec; <= 0: don't change */ @@ -152,11 +151,10 @@ struct kbd_repeat { /* earlier this field was misnamed "rate" */ }; -#define KDKBDREP _IO(KD_IOCTL_BASE, 0x52) /* set keyboard delay/repeat rate; - * actually used values are returned - */ +#define KDKBDREP 0x4B52 /* set keyboard delay/repeat rate; + * actually used values are returned */ -#define KDFONTOP _IO(KD_IOCTL_BASE, 0x72) /* font operations */ +#define KDFONTOP 0x4B72 /* font operations */ struct console_font_op { unsigned int op; /* operation code KD_FONT_OP_* */ -- cgit v1.2.3