Diffstat (limited to 'net')
162 files changed, 2413 insertions, 2169 deletions
diff --git a/net/ax25/Kconfig b/net/ax25/Kconfig index d3a9843a043d..fdb666607f10 100644 --- a/net/ax25/Kconfig +++ b/net/ax25/Kconfig @@ -10,7 +10,7 @@ menuconfig HAMRADIO If you want to connect your Linux box to an amateur radio, answer Y here. You want to read <https://www.tapr.org/> and more specifically about AX.25 on Linux - <http://www.linux-ax25.org/>. + <https://linux-ax25.in-berlin.de>. Note that the answer to this question won't directly affect the kernel: saying N will just cause the configurator to skip all @@ -61,7 +61,7 @@ config AX25_DAMA_SLAVE configuration. Linux cannot yet act as a DAMA server. This option only compiles DAMA slave support into the kernel. It still needs to be enabled at runtime. For more about DAMA see - <http://www.linux-ax25.org>. If unsure, say Y. + <https://linux-ax25.in-berlin.de>. If unsure, say Y. # placeholder until implemented config AX25_DAMA_MASTER @@ -87,9 +87,9 @@ config NETROM A comprehensive listing of all the software for Linux amateur radio users as well as information about how to configure an AX.25 port is contained in the Linux Ham Wiki, available from - <http://www.linux-ax25.org>. You also might want to check out the - file <file:Documentation/networking/ax25.rst>. More information about - digital amateur radio in general is on the WWW at + <https://linux-ax25.in-berlin.de>. You also might want to check out + the file <file:Documentation/networking/ax25.rst>. More information + about digital amateur radio in general is on the WWW at <https://www.tapr.org/>. To compile this driver as a module, choose M here: the @@ -106,9 +106,9 @@ config ROSE A comprehensive listing of all the software for Linux amateur radio users as well as information about how to configure an AX.25 port is contained in the Linux Ham Wiki, available from - <http://www.linux-ax25.org>. You also might want to check out the - file <file:Documentation/networking/ax25.rst>. More information about - digital amateur radio in general is on the WWW at + <https://linux-ax25.in-berlin.de>. You also might want to check out + the file <file:Documentation/networking/ax25.rst>. More information + about digital amateur radio in general is on the WWW at <https://www.tapr.org/>. 
To compile this driver as a module, choose M here: the diff --git a/net/ax25/sysctl_net_ax25.c b/net/ax25/sysctl_net_ax25.c index 2154d004d3dc..db66e11e7fe8 100644 --- a/net/ax25/sysctl_net_ax25.c +++ b/net/ax25/sysctl_net_ax25.c @@ -159,7 +159,8 @@ int ax25_register_dev_sysctl(ax25_dev *ax25_dev) table[k].data = &ax25_dev->values[k]; snprintf(path, sizeof(path), "net/ax25/%s", ax25_dev->dev->name); - ax25_dev->sysheader = register_net_sysctl(&init_net, path, table); + ax25_dev->sysheader = register_net_sysctl_sz(&init_net, path, table, + ARRAY_SIZE(ax25_param_table)); if (!ax25_dev->sysheader) { kfree(table); return -ENOMEM; diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 9d5057cef30a..73470cc3518a 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1627,6 +1627,15 @@ struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, return ERR_PTR(-EOPNOTSUPP); } + /* Reject outgoing connection to device with same BD ADDR against + * CVE-2020-26555 + */ + if (!bacmp(&hdev->bdaddr, dst)) { + bt_dev_dbg(hdev, "Reject connection with same BD_ADDR %pMR\n", + dst); + return ERR_PTR(-ECONNREFUSED); + } + acl = hci_conn_hash_lookup_ba(hdev, ACL_LINK, dst); if (!acl) { acl = hci_conn_add(hdev, ACL_LINK, dst, HCI_ROLE_MASTER); @@ -2413,34 +2422,41 @@ int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type, if (!test_bit(HCI_CONN_AUTH, &conn->flags)) goto auth; - /* An authenticated FIPS approved combination key has sufficient - * security for security level 4. */ - if (conn->key_type == HCI_LK_AUTH_COMBINATION_P256 && - sec_level == BT_SECURITY_FIPS) - goto encrypt; - - /* An authenticated combination key has sufficient security for - security level 3. */ - if ((conn->key_type == HCI_LK_AUTH_COMBINATION_P192 || - conn->key_type == HCI_LK_AUTH_COMBINATION_P256) && - sec_level == BT_SECURITY_HIGH) - goto encrypt; - - /* An unauthenticated combination key has sufficient security for - security level 1 and 2. */ - if ((conn->key_type == HCI_LK_UNAUTH_COMBINATION_P192 || - conn->key_type == HCI_LK_UNAUTH_COMBINATION_P256) && - (sec_level == BT_SECURITY_MEDIUM || sec_level == BT_SECURITY_LOW)) - goto encrypt; - - /* A combination key has always sufficient security for the security - levels 1 or 2. High security level requires the combination key - is generated using maximum PIN code length (16). - For pre 2.1 units. */ - if (conn->key_type == HCI_LK_COMBINATION && - (sec_level == BT_SECURITY_MEDIUM || sec_level == BT_SECURITY_LOW || - conn->pin_length == 16)) - goto encrypt; + switch (conn->key_type) { + case HCI_LK_AUTH_COMBINATION_P256: + /* An authenticated FIPS approved combination key has + * sufficient security for security level 4 or lower. + */ + if (sec_level <= BT_SECURITY_FIPS) + goto encrypt; + break; + case HCI_LK_AUTH_COMBINATION_P192: + /* An authenticated combination key has sufficient security for + * security level 3 or lower. + */ + if (sec_level <= BT_SECURITY_HIGH) + goto encrypt; + break; + case HCI_LK_UNAUTH_COMBINATION_P192: + case HCI_LK_UNAUTH_COMBINATION_P256: + /* An unauthenticated combination key has sufficient security + * for security level 2 or lower. + */ + if (sec_level <= BT_SECURITY_MEDIUM) + goto encrypt; + break; + case HCI_LK_COMBINATION: + /* A combination key has always sufficient security for the + * security levels 2 or lower. High security level requires the + * combination key is generated using maximum PIN code length + * (16). For pre 2.1 units. 
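
The key-type switch introduced here works because the BT_SECURITY_* levels form an ascending scale, so each key type's list of acceptable levels collapses into a single upper-bound comparison. A minimal sketch of that invariant (constants as defined in <net/bluetooth/bluetooth.h>; link_key_sufficient() is a hypothetical helper, not kernel API):

/* BT_SECURITY_SDP(0) < LOW(1) < MEDIUM(2) < HIGH(3) < FIPS(4), so
 * "key is sufficient for sec_level N or lower" is one comparison.
 */
static bool link_key_sufficient(u8 key_max_level, u8 sec_level)
{
	return sec_level <= key_max_level; /* e.g. auth P-256 key => FIPS */
}
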
+ */ + if (sec_level <= BT_SECURITY_MEDIUM || conn->pin_length == 16) + goto encrypt; + break; + default: + break; + } auth: if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags)) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index a5992f1b3c9b..195aea2198a9 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2617,7 +2617,11 @@ int hci_register_dev(struct hci_dev *hdev) if (id < 0) return id; - snprintf(hdev->name, sizeof(hdev->name), "hci%d", id); + error = dev_set_name(&hdev->dev, "hci%u", id); + if (error) + return error; + + hdev->name = dev_name(&hdev->dev); hdev->id = id; BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus); @@ -2639,8 +2643,6 @@ int hci_register_dev(struct hci_dev *hdev) if (!IS_ERR_OR_NULL(bt_debugfs)) hdev->debugfs = debugfs_create_dir(hdev->name, bt_debugfs); - dev_set_name(&hdev->dev, "%s", hdev->name); - error = device_add(&hdev->dev); if (error < 0) goto err_wqueue; @@ -2784,6 +2786,7 @@ void hci_release_dev(struct hci_dev *hdev) hci_conn_params_clear_all(hdev); hci_discovery_filter_clear(hdev); hci_blocked_keys_clear(hdev); + hci_codec_list_clear(&hdev->local_codecs); hci_dev_unlock(hdev); ida_simple_remove(&hci_index_ida, hdev->id); @@ -3418,7 +3421,12 @@ static void hci_link_tx_to(struct hci_dev *hdev, __u8 type) if (c->type == type && c->sent) { bt_dev_err(hdev, "killing stalled connection %pMR", &c->dst); + /* hci_disconnect might sleep, so, we have to release + * the RCU read lock before calling it. + */ + rcu_read_unlock(); hci_disconnect(c, HCI_ERROR_REMOTE_USER_TERM); + rcu_read_lock(); } } diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 35f251041eeb..1e1c9147356c 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -26,6 +26,8 @@ /* Bluetooth HCI event handling. */ #include <asm/unaligned.h> +#include <linux/crypto.h> +#include <crypto/algapi.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> @@ -33,6 +35,7 @@ #include "hci_request.h" #include "hci_debugfs.h" +#include "hci_codec.h" #include "a2mp.h" #include "amp.h" #include "smp.h" @@ -3267,6 +3270,16 @@ static void hci_conn_request_evt(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "bdaddr %pMR type 0x%x", &ev->bdaddr, ev->link_type); + /* Reject incoming connection from device with same BD ADDR against + * CVE-2020-26555 + */ + if (hdev && !bacmp(&hdev->bdaddr, &ev->bdaddr)) { + bt_dev_dbg(hdev, "Reject connection with same BD_ADDR %pMR\n", + &ev->bdaddr); + hci_reject_conn(hdev, &ev->bdaddr); + return; + } + mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, ev->link_type, &flags); @@ -4741,6 +4754,15 @@ static void hci_link_key_notify_evt(struct hci_dev *hdev, void *data, if (!conn) goto unlock; + /* Ignore NULL link key against CVE-2020-26555 */ + if (!crypto_memneq(ev->link_key, ZERO_KEY, HCI_LINK_KEY_SIZE)) { + bt_dev_dbg(hdev, "Ignore NULL link key (ZERO KEY) for %pMR", + &ev->bdaddr); + hci_disconnect(conn, HCI_ERROR_AUTH_FAILURE); + hci_conn_drop(conn); + goto unlock; + } + hci_conn_hold(conn); conn->disc_timeout = HCI_DISCONN_TIMEOUT; hci_conn_drop(conn); @@ -5273,8 +5295,8 @@ static u8 bredr_oob_data_present(struct hci_conn *conn) * available, then do not declare that OOB data is * present. 
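
crypto_memneq() compares buffers in constant time, so using it for the NULL link-key check here (and the OOB checks that follow) stops an attacker from timing how many leading bytes of a candidate value match; like memcmp() it returns zero when the buffers are equal, but without an early exit. A minimal sketch of the zero-key test:

#include <crypto/algapi.h>	/* crypto_memneq() */

/* Constant-time "is this 16-byte key all zeroes?" */
static bool is_zero_key(const u8 key[16])
{
	static const u8 zero[16];

	return !crypto_memneq(key, zero, sizeof(zero));
}
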
*/ - if (!memcmp(data->rand256, ZERO_KEY, 16) || - !memcmp(data->hash256, ZERO_KEY, 16)) + if (!crypto_memneq(data->rand256, ZERO_KEY, 16) || + !crypto_memneq(data->hash256, ZERO_KEY, 16)) return 0x00; return 0x02; @@ -5284,8 +5306,8 @@ static u8 bredr_oob_data_present(struct hci_conn *conn) * not supported by the hardware, then check that if * P-192 data values are present. */ - if (!memcmp(data->rand192, ZERO_KEY, 16) || - !memcmp(data->hash192, ZERO_KEY, 16)) + if (!crypto_memneq(data->rand192, ZERO_KEY, 16) || + !crypto_memneq(data->hash192, ZERO_KEY, 16)) return 0x00; return 0x01; @@ -5302,7 +5324,7 @@ static void hci_io_capa_request_evt(struct hci_dev *hdev, void *data, hci_dev_lock(hdev); conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); - if (!conn) + if (!conn || !hci_conn_ssp_enabled(conn)) goto unlock; hci_conn_hold(conn); @@ -5549,7 +5571,7 @@ static void hci_simple_pair_complete_evt(struct hci_dev *hdev, void *data, hci_dev_lock(hdev); conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); - if (!conn) + if (!conn || !hci_conn_ssp_enabled(conn)) goto unlock; /* Reset the authentication requirement to unknown */ @@ -7020,6 +7042,14 @@ unlock: hci_dev_unlock(hdev); } +static int hci_iso_term_big_sync(struct hci_dev *hdev, void *data) +{ + u8 handle = PTR_UINT(data); + + return hci_le_terminate_big_sync(hdev, handle, + HCI_ERROR_LOCAL_HOST_TERM); +} + static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { @@ -7064,16 +7094,17 @@ static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data, rcu_read_lock(); } + rcu_read_unlock(); + if (!ev->status && !i) /* If no BISes have been connected for the BIG, * terminate. This is in case all bound connections * have been closed before the BIG creation * has completed. 
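
hci_le_terminate_big_sync() issues HCI commands and can sleep, so the BIG-complete handler below no longer calls it inline; it queues hci_iso_term_big_sync() (defined above) through hci_cmd_sync_queue(), with the BIG handle riding in the opaque data pointer via the UINT_PTR()/PTR_UINT() helpers. The deferral pattern, reduced to its shape:

/* event path (must not sleep): queue the work and return */
hci_cmd_sync_queue(hdev, hci_iso_term_big_sync,
		   UINT_PTR(ev->handle), NULL);

/* cmd_sync context (may sleep): unpack and do the real work */
static int hci_iso_term_big_sync(struct hci_dev *hdev, void *data)
{
	u8 handle = PTR_UINT(data);

	return hci_le_terminate_big_sync(hdev, handle,
					 HCI_ERROR_LOCAL_HOST_TERM);
}
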
*/ - hci_le_terminate_big_sync(hdev, ev->handle, - HCI_ERROR_LOCAL_HOST_TERM); + hci_cmd_sync_queue(hdev, hci_iso_term_big_sync, + UINT_PTR(ev->handle), NULL); - rcu_read_unlock(); hci_dev_unlock(hdev); } diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h index b9c5a9823837..0be75cf0efed 100644 --- a/net/bluetooth/hci_request.h +++ b/net/bluetooth/hci_request.h @@ -71,7 +71,5 @@ struct sk_buff *hci_prepare_cmd(struct hci_dev *hdev, u16 opcode, u32 plen, void hci_req_add_le_scan_disable(struct hci_request *req, bool rpa_le_conn); void hci_req_add_le_passive_scan(struct hci_request *req); -void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next); - void hci_request_setup(struct hci_dev *hdev); void hci_request_cancel_all(struct hci_dev *hdev); diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 5e4f718073b7..3e7cd330d731 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -488,7 +488,8 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) ni->type = hdev->dev_type; ni->bus = hdev->bus; bacpy(&ni->bdaddr, &hdev->bdaddr); - memcpy(ni->name, hdev->name, 8); + memcpy_and_pad(ni->name, sizeof(ni->name), hdev->name, + strnlen(hdev->name, sizeof(ni->name)), '\0'); opcode = cpu_to_le16(HCI_MON_NEW_INDEX); break; diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 9b93653c6197..a15ab0b874a9 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -413,11 +413,6 @@ static int hci_le_scan_restart_sync(struct hci_dev *hdev) LE_SCAN_FILTER_DUP_ENABLE); } -static int le_scan_restart_sync(struct hci_dev *hdev, void *data) -{ - return hci_le_scan_restart_sync(hdev); -} - static void le_scan_restart(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, @@ -427,15 +422,15 @@ static void le_scan_restart(struct work_struct *work) bt_dev_dbg(hdev, ""); - hci_dev_lock(hdev); - - status = hci_cmd_sync_queue(hdev, le_scan_restart_sync, NULL, NULL); + status = hci_le_scan_restart_sync(hdev); if (status) { bt_dev_err(hdev, "failed to restart LE scan: status %d", status); - goto unlock; + return; } + hci_dev_lock(hdev); + if (!test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks) || !hdev->discovery.scan_start) goto unlock; @@ -5079,6 +5074,7 @@ int hci_dev_close_sync(struct hci_dev *hdev) memset(hdev->eir, 0, sizeof(hdev->eir)); memset(hdev->dev_class, 0, sizeof(hdev->dev_class)); bacpy(&hdev->random_addr, BDADDR_ANY); + hci_codec_list_clear(&hdev->local_codecs); hci_dev_put(hdev); return err; @@ -5373,6 +5369,7 @@ int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { int err = 0; u16 handle = conn->handle; + bool disconnect = false; struct hci_conn *c; switch (conn->state) { @@ -5403,24 +5400,15 @@ int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) hci_dev_unlock(hdev); return 0; case BT_BOUND: - hci_dev_lock(hdev); - hci_conn_failed(conn, reason); - hci_dev_unlock(hdev); - return 0; + break; default: - hci_dev_lock(hdev); - conn->state = BT_CLOSED; - hci_disconn_cfm(conn, reason); - hci_conn_del(conn); - hci_dev_unlock(hdev); - return 0; + disconnect = true; + break; } hci_dev_lock(hdev); - /* Check if the connection hasn't been cleanup while waiting - * commands to complete. 
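
The monitor-event fix above matters because hdev->name is now a pointer to the device name rather than an embedded 8-byte array (see the hci_register_dev change), so the old fixed memcpy of 8 bytes could read past the end of a short name; memcpy_and_pad() copies only the string and zero-fills the rest of the wire field. A minimal sketch, assuming an 8-byte on-wire name field:

#include <linux/string.h>

char wire_name[8];
const char *name = "hci0";	/* may be shorter than the field */

/* copy at most 8 bytes of the string, NUL-pad the remainder */
memcpy_and_pad(wire_name, sizeof(wire_name), name,
	       strnlen(name, sizeof(wire_name)), '\0');
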
- */ + /* Check if the connection has been cleaned up concurrently */ c = hci_conn_hash_lookup_handle(hdev, handle); if (!c || c != conn) { err = 0; @@ -5432,7 +5420,13 @@ int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) * or in case of LE it was still scanning so it can be cleanup * safely. */ - hci_conn_failed(conn, reason); + if (disconnect) { + conn->state = BT_CLOSED; + hci_disconn_cfm(conn, reason); + hci_conn_del(conn); + } else { + hci_conn_failed(conn, reason); + } unlock: hci_dev_unlock(hdev); diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 16da946f5881..71248163ce9a 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -502,7 +502,7 @@ drop: } /* -------- Socket interface ---------- */ -static struct sock *__iso_get_sock_listen_by_addr(bdaddr_t *ba) +static struct sock *__iso_get_sock_listen_by_addr(bdaddr_t *src, bdaddr_t *dst) { struct sock *sk; @@ -510,7 +510,10 @@ static struct sock *__iso_get_sock_listen_by_addr(bdaddr_t *ba) if (sk->sk_state != BT_LISTEN) continue; - if (!bacmp(&iso_pi(sk)->src, ba)) + if (bacmp(&iso_pi(sk)->dst, dst)) + continue; + + if (!bacmp(&iso_pi(sk)->src, src)) return sk; } @@ -952,7 +955,7 @@ static int iso_listen_cis(struct sock *sk) write_lock(&iso_sk_list.lock); - if (__iso_get_sock_listen_by_addr(&iso_pi(sk)->src)) + if (__iso_get_sock_listen_by_addr(&iso_pi(sk)->src, &iso_pi(sk)->dst)) err = -EADDRINUSE; write_unlock(&iso_sk_list.lock); diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index 5697df9d4394..94ec913dfb76 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -771,7 +771,7 @@ static int rfcomm_tty_open(struct tty_struct *tty, struct file *filp) static void rfcomm_tty_close(struct tty_struct *tty, struct file *filp) { - struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + struct rfcomm_dev *dev = tty->driver_data; BT_DBG("tty %p dev %p dlc %p opened %d", tty, dev, dev->dlc, dev->port.count); @@ -779,17 +779,18 @@ static void rfcomm_tty_close(struct tty_struct *tty, struct file *filp) tty_port_close(&dev->port, tty, filp); } -static int rfcomm_tty_write(struct tty_struct *tty, const unsigned char *buf, int count) +static ssize_t rfcomm_tty_write(struct tty_struct *tty, const u8 *buf, + size_t count) { - struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + struct rfcomm_dev *dev = tty->driver_data; struct rfcomm_dlc *dlc = dev->dlc; struct sk_buff *skb; - int sent = 0, size; + size_t sent = 0, size; - BT_DBG("tty %p count %d", tty, count); + BT_DBG("tty %p count %zu", tty, count); while (count) { - size = min_t(uint, count, dlc->mtu); + size = min_t(size_t, count, dlc->mtu); skb = rfcomm_wmalloc(dev, size + RFCOMM_SKB_RESERVE, GFP_ATOMIC); if (!skb) @@ -810,7 +811,7 @@ static int rfcomm_tty_write(struct tty_struct *tty, const unsigned char *buf, in static unsigned int rfcomm_tty_write_room(struct tty_struct *tty) { - struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + struct rfcomm_dev *dev = tty->driver_data; int room = 0; if (dev && dev->dlc) @@ -864,7 +865,7 @@ static void rfcomm_tty_set_termios(struct tty_struct *tty, u8 baud, data_bits, stop_bits, parity, x_on, x_off; u16 changes = 0; - struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + struct rfcomm_dev *dev = tty->driver_data; BT_DBG("tty %p termios %p", tty, old); @@ -996,7 +997,7 @@ static void rfcomm_tty_set_termios(struct tty_struct *tty, static void rfcomm_tty_throttle(struct tty_struct *tty) { - struct rfcomm_dev *dev = 
(struct rfcomm_dev *) tty->driver_data; + struct rfcomm_dev *dev = tty->driver_data; BT_DBG("tty %p dev %p", tty, dev); @@ -1005,7 +1006,7 @@ static void rfcomm_tty_throttle(struct tty_struct *tty) static void rfcomm_tty_unthrottle(struct tty_struct *tty) { - struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + struct rfcomm_dev *dev = tty->driver_data; BT_DBG("tty %p dev %p", tty, dev); @@ -1014,7 +1015,7 @@ static void rfcomm_tty_unthrottle(struct tty_struct *tty) static unsigned int rfcomm_tty_chars_in_buffer(struct tty_struct *tty) { - struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + struct rfcomm_dev *dev = tty->driver_data; BT_DBG("tty %p dev %p", tty, dev); @@ -1029,7 +1030,7 @@ static unsigned int rfcomm_tty_chars_in_buffer(struct tty_struct *tty) static void rfcomm_tty_flush_buffer(struct tty_struct *tty) { - struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + struct rfcomm_dev *dev = tty->driver_data; BT_DBG("tty %p dev %p", tty, dev); @@ -1052,7 +1053,7 @@ static void rfcomm_tty_wait_until_sent(struct tty_struct *tty, int timeout) static void rfcomm_tty_hangup(struct tty_struct *tty) { - struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + struct rfcomm_dev *dev = tty->driver_data; BT_DBG("tty %p dev %p", tty, dev); @@ -1061,7 +1062,7 @@ static void rfcomm_tty_hangup(struct tty_struct *tty) static int rfcomm_tty_tiocmget(struct tty_struct *tty) { - struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + struct rfcomm_dev *dev = tty->driver_data; BT_DBG("tty %p dev %p", tty, dev); @@ -1070,7 +1071,7 @@ static int rfcomm_tty_tiocmget(struct tty_struct *tty) static int rfcomm_tty_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear) { - struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + struct rfcomm_dev *dev = tty->driver_data; struct rfcomm_dlc *dlc = dev->dlc; u8 v24_sig; diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 9d7bc8b96b53..7431f89e897b 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -124,7 +124,7 @@ static int deliver_clone(const struct net_bridge_port *prev, skb = skb_clone(skb, GFP_ATOMIC); if (!skb) { - dev->stats.tx_dropped++; + DEV_STATS_INC(dev, tx_dropped); return -ENOMEM; } @@ -268,7 +268,7 @@ static void maybe_deliver_addr(struct net_bridge_port *p, struct sk_buff *skb, skb = skb_copy(skb, GFP_ATOMIC); if (!skb) { - dev->stats.tx_dropped++; + DEV_STATS_INC(dev, tx_dropped); return; } diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index c34a0b0901b0..c729528b5e85 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -181,12 +181,12 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if ((mdst && mdst->host_joined) || br_multicast_is_router(brmctx, skb)) { local_rcv = true; - br->dev->stats.multicast++; + DEV_STATS_INC(br->dev, multicast); } mcast_hit = true; } else { local_rcv = true; - br->dev->stats.multicast++; + DEV_STATS_INC(br->dev, multicast); } break; case BR_PKT_UNICAST: diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 1a801fab9543..033034d68f1f 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -294,7 +294,7 @@ int br_nf_pre_routing_finish_bridge(struct net *net, struct sock *sk, struct sk_ /* tell br_dev_xmit to continue with forwarding */ nf_bridge->bridged_dnat = 1; /* FIXME Need to refragment */ - ret = neigh->output(neigh, skb); + ret = READ_ONCE(neigh->output)(neigh, skb); } 
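
neigh->output is flipped at runtime by neigh_connect()/neigh_suspect() while other CPUs call through it locklessly, so both sides are now annotated: writers use WRITE_ONCE() (see the net/core/neighbour.c hunks below) and readers such as br_nf_pre_routing_finish_bridge() load the pointer once with READ_ONCE() before the indirect call. The reader/writer pair, sketched:

/* writer (neighbour state machine, under the neighbour lock): */
WRITE_ONCE(neigh->output, neigh->ops->connected_output);

/* lockless reader: one marked load, then the indirect call */
int (*output)(struct neighbour *, struct sk_buff *) =
	READ_ONCE(neigh->output);

err = output(neigh, skb);
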
neigh_release(neigh); return ret; @@ -1135,7 +1135,8 @@ static int br_netfilter_sysctl_init_net(struct net *net) br_netfilter_sysctl_default(brnet); - brnet->ctl_hdr = register_net_sysctl(net, "net/bridge", table); + brnet->ctl_hdr = register_net_sysctl_sz(net, "net/bridge", table, + ARRAY_SIZE(brnf_table)); if (!brnet->ctl_hdr) { if (!net_eq(net, &init_net)) kfree(table); diff --git a/net/can/isotp.c b/net/can/isotp.c index f02b5d3e4733..d1c6f206f429 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -948,21 +948,18 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if (!so->bound || so->tx.state == ISOTP_SHUTDOWN) return -EADDRNOTAVAIL; -wait_free_buffer: - /* we do not support multiple buffers - for now */ - if (wq_has_sleeper(&so->wait) && (msg->msg_flags & MSG_DONTWAIT)) - return -EAGAIN; + while (cmpxchg(&so->tx.state, ISOTP_IDLE, ISOTP_SENDING) != ISOTP_IDLE) { + /* we do not support multiple buffers - for now */ + if (msg->msg_flags & MSG_DONTWAIT) + return -EAGAIN; - /* wait for complete transmission of current pdu */ - err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); - if (err) - goto err_event_drop; - - if (cmpxchg(&so->tx.state, ISOTP_IDLE, ISOTP_SENDING) != ISOTP_IDLE) { if (so->tx.state == ISOTP_SHUTDOWN) return -EADDRNOTAVAIL; - goto wait_free_buffer; + /* wait for complete transmission of current pdu */ + err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); + if (err) + goto err_event_drop; } /* PDU size > default => try max_pdu_size */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 5eb4898cccd4..10a41cd9c523 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -969,6 +969,62 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor, return true; } +static void ceph_msg_data_iter_cursor_init(struct ceph_msg_data_cursor *cursor, + size_t length) +{ + struct ceph_msg_data *data = cursor->data; + + cursor->iov_iter = data->iter; + cursor->lastlen = 0; + iov_iter_truncate(&cursor->iov_iter, length); + cursor->resid = iov_iter_count(&cursor->iov_iter); +} + +static struct page *ceph_msg_data_iter_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, size_t *length) +{ + struct page *page; + ssize_t len; + + if (cursor->lastlen) + iov_iter_revert(&cursor->iov_iter, cursor->lastlen); + + len = iov_iter_get_pages2(&cursor->iov_iter, &page, PAGE_SIZE, + 1, page_offset); + BUG_ON(len < 0); + + cursor->lastlen = len; + + /* + * FIXME: The assumption is that the pages represented by the iov_iter + * are pinned, with the references held by the upper-level + * callers, or by virtue of being under writeback. Eventually, + * we'll get an iov_iter_get_pages2 variant that doesn't take + * page refs. Until then, just put the page ref. + */ + VM_BUG_ON_PAGE(!PageWriteback(page) && page_count(page) < 2, page); + put_page(page); + + *length = min_t(size_t, len, cursor->resid); + return page; +} + +static bool ceph_msg_data_iter_advance(struct ceph_msg_data_cursor *cursor, + size_t bytes) +{ + BUG_ON(bytes > cursor->resid); + cursor->resid -= bytes; + + if (bytes < cursor->lastlen) { + cursor->lastlen -= bytes; + } else { + iov_iter_advance(&cursor->iov_iter, bytes - cursor->lastlen); + cursor->lastlen = 0; + } + + return cursor->resid; +} + /* * Message data is handled (sent or received) in pieces, where each * piece resides on a single page. 
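
The isotp_sendmsg() rework folds the wait into the claim loop itself: cmpxchg() moves tx.state from ISOTP_IDLE to ISOTP_SENDING atomically, so exactly one sender wins, and losers either return -EAGAIN (MSG_DONTWAIT) or sleep until the state drops back to idle and then retry the claim. The shape of the gate, with state/waitq/nonblock standing in for the socket fields:

/* single-owner gate: claim the transmitter or wait for it */
while (cmpxchg(&state, IDLE, SENDING) != IDLE) {
	if (nonblock)
		return -EAGAIN;

	err = wait_event_interruptible(waitq, state == IDLE);
	if (err)
		return err;	/* interrupted by a signal */
}
/* owner: state stays SENDING until we release it back to IDLE */
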
The network layer might not @@ -996,6 +1052,9 @@ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor) case CEPH_MSG_DATA_BVECS: ceph_msg_data_bvecs_cursor_init(cursor, length); break; + case CEPH_MSG_DATA_ITER: + ceph_msg_data_iter_cursor_init(cursor, length); + break; case CEPH_MSG_DATA_NONE: default: /* BUG(); */ @@ -1013,6 +1072,7 @@ void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor, cursor->total_resid = length; cursor->data = msg->data; + cursor->sr_resid = 0; __ceph_msg_data_cursor_init(cursor); } @@ -1042,6 +1102,9 @@ struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, case CEPH_MSG_DATA_BVECS: page = ceph_msg_data_bvecs_next(cursor, page_offset, length); break; + case CEPH_MSG_DATA_ITER: + page = ceph_msg_data_iter_next(cursor, page_offset, length); + break; case CEPH_MSG_DATA_NONE: default: page = NULL; @@ -1080,6 +1143,9 @@ void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) case CEPH_MSG_DATA_BVECS: new_piece = ceph_msg_data_bvecs_advance(cursor, bytes); break; + case CEPH_MSG_DATA_ITER: + new_piece = ceph_msg_data_iter_advance(cursor, bytes); + break; case CEPH_MSG_DATA_NONE: default: BUG(); @@ -1879,6 +1945,18 @@ void ceph_msg_data_add_bvecs(struct ceph_msg *msg, } EXPORT_SYMBOL(ceph_msg_data_add_bvecs); +void ceph_msg_data_add_iter(struct ceph_msg *msg, + struct iov_iter *iter) +{ + struct ceph_msg_data *data; + + data = ceph_msg_data_add(msg); + data->type = CEPH_MSG_DATA_ITER; + data->iter = *iter; + + msg->data_length += iov_iter_count(&data->iter); +} + /* * construct a new message with given type, size * the new msg has a ref count of 1. diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c index 3d57bb48a2b4..f9a50d7f0d20 100644 --- a/net/ceph/messenger_v1.c +++ b/net/ceph/messenger_v1.c @@ -159,9 +159,9 @@ static size_t sizeof_footer(struct ceph_connection *con) static void prepare_message_data(struct ceph_msg *msg, u32 data_len) { - /* Initialize data cursor */ - - ceph_msg_data_cursor_init(&msg->cursor, msg, data_len); + /* Initialize data cursor if it's not a sparse read */ + if (!msg->sparse_read) + ceph_msg_data_cursor_init(&msg->cursor, msg, data_len); } /* @@ -960,9 +960,9 @@ static void process_ack(struct ceph_connection *con) prepare_read_tag(con); } -static int read_partial_message_section(struct ceph_connection *con, - struct kvec *section, - unsigned int sec_len, u32 *crc) +static int read_partial_message_chunk(struct ceph_connection *con, + struct kvec *section, + unsigned int sec_len, u32 *crc) { int ret, left; @@ -978,11 +978,91 @@ static int read_partial_message_section(struct ceph_connection *con, section->iov_len += ret; } if (section->iov_len == sec_len) - *crc = crc32c(0, section->iov_base, section->iov_len); + *crc = crc32c(*crc, section->iov_base, section->iov_len); return 1; } +static inline int read_partial_message_section(struct ceph_connection *con, + struct kvec *section, + unsigned int sec_len, u32 *crc) +{ + *crc = 0; + return read_partial_message_chunk(con, section, sec_len, crc); +} + +static int read_sparse_msg_extent(struct ceph_connection *con, u32 *crc) +{ + struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor; + bool do_bounce = ceph_test_opt(from_msgr(con->msgr), RXBOUNCE); + + if (do_bounce && unlikely(!con->bounce_page)) { + con->bounce_page = alloc_page(GFP_NOIO); + if (!con->bounce_page) { + pr_err("failed to allocate bounce page\n"); + return -ENOMEM; + } + } + + while (cursor->sr_resid > 0) { + struct page *page, *rpage; + size_t 
off, len; + int ret; + + page = ceph_msg_data_next(cursor, &off, &len); + rpage = do_bounce ? con->bounce_page : page; + + /* clamp to what remains in extent */ + len = min_t(int, len, cursor->sr_resid); + ret = ceph_tcp_recvpage(con->sock, rpage, (int)off, len); + if (ret <= 0) + return ret; + *crc = ceph_crc32c_page(*crc, rpage, off, ret); + ceph_msg_data_advance(cursor, (size_t)ret); + cursor->sr_resid -= ret; + if (do_bounce) + memcpy_page(page, off, rpage, off, ret); + } + return 1; +} + +static int read_sparse_msg_data(struct ceph_connection *con) +{ + struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor; + bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); + u32 crc = 0; + int ret = 1; + + if (do_datacrc) + crc = con->in_data_crc; + + do { + if (con->v1.in_sr_kvec.iov_base) + ret = read_partial_message_chunk(con, + &con->v1.in_sr_kvec, + con->v1.in_sr_len, + &crc); + else if (cursor->sr_resid > 0) + ret = read_sparse_msg_extent(con, &crc); + + if (ret <= 0) { + if (do_datacrc) + con->in_data_crc = crc; + return ret; + } + + memset(&con->v1.in_sr_kvec, 0, sizeof(con->v1.in_sr_kvec)); + ret = con->ops->sparse_read(con, cursor, + (char **)&con->v1.in_sr_kvec.iov_base); + con->v1.in_sr_len = ret; + } while (ret > 0); + + if (do_datacrc) + con->in_data_crc = crc; + + return ret < 0 ? ret : 1; /* must return > 0 to indicate success */ +} + static int read_partial_msg_data(struct ceph_connection *con) { struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor; @@ -1173,7 +1253,9 @@ static int read_partial_message(struct ceph_connection *con) if (!m->num_data_items) return -EIO; - if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) + if (m->sparse_read) + ret = read_sparse_msg_data(con); + else if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) ret = read_partial_msg_data_bounce(con); else ret = read_partial_msg_data(con); diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index 1df1d29dee92..d09a39ff2cf0 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -8,9 +8,9 @@ #include <linux/ceph/ceph_debug.h> #include <crypto/aead.h> -#include <crypto/algapi.h> /* for crypto_memneq() */ #include <crypto/hash.h> #include <crypto/sha2.h> +#include <crypto/utils.h> #include <linux/bvec.h> #include <linux/crc32c.h> #include <linux/net.h> @@ -52,14 +52,16 @@ #define FRAME_LATE_STATUS_COMPLETE 0xe #define FRAME_LATE_STATUS_ABORTED_MASK 0xf -#define IN_S_HANDLE_PREAMBLE 1 -#define IN_S_HANDLE_CONTROL 2 -#define IN_S_HANDLE_CONTROL_REMAINDER 3 -#define IN_S_PREPARE_READ_DATA 4 -#define IN_S_PREPARE_READ_DATA_CONT 5 -#define IN_S_PREPARE_READ_ENC_PAGE 6 -#define IN_S_HANDLE_EPILOGUE 7 -#define IN_S_FINISH_SKIP 8 +#define IN_S_HANDLE_PREAMBLE 1 +#define IN_S_HANDLE_CONTROL 2 +#define IN_S_HANDLE_CONTROL_REMAINDER 3 +#define IN_S_PREPARE_READ_DATA 4 +#define IN_S_PREPARE_READ_DATA_CONT 5 +#define IN_S_PREPARE_READ_ENC_PAGE 6 +#define IN_S_PREPARE_SPARSE_DATA 7 +#define IN_S_PREPARE_SPARSE_DATA_CONT 8 +#define IN_S_HANDLE_EPILOGUE 9 +#define IN_S_FINISH_SKIP 10 #define OUT_S_QUEUE_DATA 1 #define OUT_S_QUEUE_DATA_CONT 2 @@ -967,12 +969,48 @@ static void init_sgs_cursor(struct scatterlist **sg, } } +/** + * init_sgs_pages: set up scatterlist on an array of page pointers + * @sg: scatterlist to populate + * @pages: pointer to page array + * @dpos: position in the array to start (bytes) + * @dlen: len to add to sg (bytes) + * @pad: pointer to pad destination (if any) + * + * Populate the scatterlist from the page array, starting at an arbitrary + * byte in the 
array and running for a specified length. + */ +static void init_sgs_pages(struct scatterlist **sg, struct page **pages, + int dpos, int dlen, u8 *pad) +{ + int idx = dpos >> PAGE_SHIFT; + int off = offset_in_page(dpos); + int resid = dlen; + + do { + int len = min(resid, (int)PAGE_SIZE - off); + + sg_set_page(*sg, pages[idx], len, off); + *sg = sg_next(*sg); + off = 0; + ++idx; + resid -= len; + } while (resid); + + if (need_padding(dlen)) { + sg_set_buf(*sg, pad, padding_len(dlen)); + *sg = sg_next(*sg); + } +} + static int setup_message_sgs(struct sg_table *sgt, struct ceph_msg *msg, u8 *front_pad, u8 *middle_pad, u8 *data_pad, - void *epilogue, bool add_tag) + void *epilogue, struct page **pages, int dpos, + bool add_tag) { struct ceph_msg_data_cursor cursor; struct scatterlist *cur_sg; + int dlen = data_len(msg); int sg_cnt; int ret; @@ -986,9 +1024,15 @@ static int setup_message_sgs(struct sg_table *sgt, struct ceph_msg *msg, if (middle_len(msg)) sg_cnt += calc_sg_cnt(msg->middle->vec.iov_base, middle_len(msg)); - if (data_len(msg)) { - ceph_msg_data_cursor_init(&cursor, msg, data_len(msg)); - sg_cnt += calc_sg_cnt_cursor(&cursor); + if (dlen) { + if (pages) { + sg_cnt += calc_pages_for(dpos, dlen); + if (need_padding(dlen)) + sg_cnt++; + } else { + ceph_msg_data_cursor_init(&cursor, msg, dlen); + sg_cnt += calc_sg_cnt_cursor(&cursor); + } } ret = sg_alloc_table(sgt, sg_cnt, GFP_NOIO); @@ -1002,9 +1046,13 @@ static int setup_message_sgs(struct sg_table *sgt, struct ceph_msg *msg, if (middle_len(msg)) init_sgs(&cur_sg, msg->middle->vec.iov_base, middle_len(msg), middle_pad); - if (data_len(msg)) { - ceph_msg_data_cursor_init(&cursor, msg, data_len(msg)); - init_sgs_cursor(&cur_sg, &cursor, data_pad); + if (dlen) { + if (pages) { + init_sgs_pages(&cur_sg, pages, dpos, dlen, data_pad); + } else { + ceph_msg_data_cursor_init(&cursor, msg, dlen); + init_sgs_cursor(&cur_sg, &cursor, data_pad); + } } WARN_ON(!sg_is_last(cur_sg)); @@ -1039,10 +1087,53 @@ static int decrypt_control_remainder(struct ceph_connection *con) padded_len(rem_len) + CEPH_GCM_TAG_LEN); } +/* Process sparse read data that lives in a buffer */ +static int process_v2_sparse_read(struct ceph_connection *con, + struct page **pages, int spos) +{ + struct ceph_msg_data_cursor *cursor = &con->v2.in_cursor; + int ret; + + for (;;) { + char *buf = NULL; + + ret = con->ops->sparse_read(con, cursor, &buf); + if (ret <= 0) + return ret; + + dout("%s: sparse_read return %x buf %p\n", __func__, ret, buf); + + do { + int idx = spos >> PAGE_SHIFT; + int soff = offset_in_page(spos); + struct page *spage = con->v2.in_enc_pages[idx]; + int len = min_t(int, ret, PAGE_SIZE - soff); + + if (buf) { + memcpy_from_page(buf, spage, soff, len); + buf += len; + } else { + struct bio_vec bv; + + get_bvec_at(cursor, &bv); + len = min_t(int, len, bv.bv_len); + memcpy_page(bv.bv_page, bv.bv_offset, + spage, soff, len); + ceph_msg_data_advance(cursor, len); + } + spos += len; + ret -= len; + } while (ret); + } +} + static int decrypt_tail(struct ceph_connection *con) { struct sg_table enc_sgt = {}; struct sg_table sgt = {}; + struct page **pages = NULL; + bool sparse = con->in_msg->sparse_read; + int dpos = 0; int tail_len; int ret; @@ -1053,9 +1144,14 @@ static int decrypt_tail(struct ceph_connection *con) if (ret) goto out; + if (sparse) { + dpos = padded_len(front_len(con->in_msg) + padded_len(middle_len(con->in_msg))); + pages = con->v2.in_enc_pages; + } + ret = setup_message_sgs(&sgt, con->in_msg, FRONT_PAD(con->v2.in_buf), - 
MIDDLE_PAD(con->v2.in_buf), DATA_PAD(con->v2.in_buf), - con->v2.in_buf, true); + MIDDLE_PAD(con->v2.in_buf), DATA_PAD(con->v2.in_buf), + con->v2.in_buf, pages, dpos, true); if (ret) goto out; @@ -1065,6 +1161,12 @@ static int decrypt_tail(struct ceph_connection *con) if (ret) goto out; + if (sparse && data_len(con->in_msg)) { + ret = process_v2_sparse_read(con, con->v2.in_enc_pages, dpos); + if (ret) + goto out; + } + WARN_ON(!con->v2.in_enc_page_cnt); ceph_release_page_vector(con->v2.in_enc_pages, con->v2.in_enc_page_cnt); @@ -1588,7 +1690,7 @@ static int prepare_message_secure(struct ceph_connection *con) encode_epilogue_secure(con, false); ret = setup_message_sgs(&sgt, con->out_msg, zerop, zerop, zerop, - &con->v2.out_epil, false); + &con->v2.out_epil, NULL, 0, false); if (ret) goto out; @@ -1825,6 +1927,123 @@ static void prepare_read_data_cont(struct ceph_connection *con) con->v2.in_state = IN_S_HANDLE_EPILOGUE; } +static int prepare_sparse_read_cont(struct ceph_connection *con) +{ + int ret; + struct bio_vec bv; + char *buf = NULL; + struct ceph_msg_data_cursor *cursor = &con->v2.in_cursor; + + WARN_ON(con->v2.in_state != IN_S_PREPARE_SPARSE_DATA_CONT); + + if (iov_iter_is_bvec(&con->v2.in_iter)) { + if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) { + con->in_data_crc = crc32c(con->in_data_crc, + page_address(con->bounce_page), + con->v2.in_bvec.bv_len); + get_bvec_at(cursor, &bv); + memcpy_to_page(bv.bv_page, bv.bv_offset, + page_address(con->bounce_page), + con->v2.in_bvec.bv_len); + } else { + con->in_data_crc = ceph_crc32c_page(con->in_data_crc, + con->v2.in_bvec.bv_page, + con->v2.in_bvec.bv_offset, + con->v2.in_bvec.bv_len); + } + + ceph_msg_data_advance(cursor, con->v2.in_bvec.bv_len); + cursor->sr_resid -= con->v2.in_bvec.bv_len; + dout("%s: advance by 0x%x sr_resid 0x%x\n", __func__, + con->v2.in_bvec.bv_len, cursor->sr_resid); + WARN_ON_ONCE(cursor->sr_resid > cursor->total_resid); + if (cursor->sr_resid) { + get_bvec_at(cursor, &bv); + if (bv.bv_len > cursor->sr_resid) + bv.bv_len = cursor->sr_resid; + if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) { + bv.bv_page = con->bounce_page; + bv.bv_offset = 0; + } + set_in_bvec(con, &bv); + con->v2.data_len_remain -= bv.bv_len; + return 0; + } + } else if (iov_iter_is_kvec(&con->v2.in_iter)) { + /* On first call, we have no kvec so don't compute crc */ + if (con->v2.in_kvec_cnt) { + WARN_ON_ONCE(con->v2.in_kvec_cnt > 1); + con->in_data_crc = crc32c(con->in_data_crc, + con->v2.in_kvecs[0].iov_base, + con->v2.in_kvecs[0].iov_len); + } + } else { + return -EIO; + } + + /* get next extent */ + ret = con->ops->sparse_read(con, cursor, &buf); + if (ret <= 0) { + if (ret < 0) + return ret; + + reset_in_kvecs(con); + add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN); + con->v2.in_state = IN_S_HANDLE_EPILOGUE; + return 0; + } + + if (buf) { + /* receive into buffer */ + reset_in_kvecs(con); + add_in_kvec(con, buf, ret); + con->v2.data_len_remain -= ret; + return 0; + } + + if (ret > cursor->total_resid) { + pr_warn("%s: ret 0x%x total_resid 0x%zx resid 0x%zx\n", + __func__, ret, cursor->total_resid, cursor->resid); + return -EIO; + } + get_bvec_at(cursor, &bv); + if (bv.bv_len > cursor->sr_resid) + bv.bv_len = cursor->sr_resid; + if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) { + if (unlikely(!con->bounce_page)) { + con->bounce_page = alloc_page(GFP_NOIO); + if (!con->bounce_page) { + pr_err("failed to allocate bounce page\n"); + return -ENOMEM; + } + } + + bv.bv_page = con->bounce_page; + bv.bv_offset = 0; + } + 
set_in_bvec(con, &bv); + con->v2.data_len_remain -= ret; + return ret; +} + +static int prepare_sparse_read_data(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->in_msg; + + dout("%s: starting sparse read\n", __func__); + + if (WARN_ON_ONCE(!con->ops->sparse_read)) + return -EOPNOTSUPP; + + if (!con_secure(con)) + con->in_data_crc = -1; + + reset_in_kvecs(con); + con->v2.in_state = IN_S_PREPARE_SPARSE_DATA_CONT; + con->v2.data_len_remain = data_len(msg); + return prepare_sparse_read_cont(con); +} + static int prepare_read_tail_plain(struct ceph_connection *con) { struct ceph_msg *msg = con->in_msg; @@ -1845,7 +2064,10 @@ static int prepare_read_tail_plain(struct ceph_connection *con) } if (data_len(msg)) { - con->v2.in_state = IN_S_PREPARE_READ_DATA; + if (msg->sparse_read) + con->v2.in_state = IN_S_PREPARE_SPARSE_DATA; + else + con->v2.in_state = IN_S_PREPARE_READ_DATA; } else { add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN); con->v2.in_state = IN_S_HANDLE_EPILOGUE; @@ -2898,6 +3120,12 @@ static int populate_in_iter(struct ceph_connection *con) prepare_read_enc_page(con); ret = 0; break; + case IN_S_PREPARE_SPARSE_DATA: + ret = prepare_sparse_read_data(con); + break; + case IN_S_PREPARE_SPARSE_DATA_CONT: + ret = prepare_sparse_read_cont(con); + break; case IN_S_HANDLE_EPILOGUE: ret = handle_epilogue(con); break; @@ -3489,6 +3717,23 @@ static void revoke_at_prepare_read_enc_page(struct ceph_connection *con) con->v2.in_state = IN_S_FINISH_SKIP; } +static void revoke_at_prepare_sparse_data(struct ceph_connection *con) +{ + int resid; /* current piece of data */ + int remaining; + + WARN_ON(con_secure(con)); + WARN_ON(!data_len(con->in_msg)); + WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter)); + resid = iov_iter_count(&con->v2.in_iter); + dout("%s con %p resid %d\n", __func__, con, resid); + + remaining = CEPH_EPILOGUE_PLAIN_LEN + con->v2.data_len_remain; + con->v2.in_iter.count -= resid; + set_in_skip(con, resid + remaining); + con->v2.in_state = IN_S_FINISH_SKIP; +} + static void revoke_at_handle_epilogue(struct ceph_connection *con) { int resid; @@ -3505,6 +3750,7 @@ static void revoke_at_handle_epilogue(struct ceph_connection *con) void ceph_con_v2_revoke_incoming(struct ceph_connection *con) { switch (con->v2.in_state) { + case IN_S_PREPARE_SPARSE_DATA: case IN_S_PREPARE_READ_DATA: revoke_at_prepare_read_data(con); break; @@ -3514,6 +3760,9 @@ void ceph_con_v2_revoke_incoming(struct ceph_connection *con) case IN_S_PREPARE_READ_ENC_PAGE: revoke_at_prepare_read_enc_page(con); break; + case IN_S_PREPARE_SPARSE_DATA_CONT: + revoke_at_prepare_sparse_data(con); + break; case IN_S_HANDLE_EPILOGUE: revoke_at_handle_epilogue(con); break; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 658a6f2320cf..d3a759e052c8 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -171,6 +171,13 @@ static void ceph_osd_data_bvecs_init(struct ceph_osd_data *osd_data, osd_data->num_bvecs = num_bvecs; } +static void ceph_osd_iter_init(struct ceph_osd_data *osd_data, + struct iov_iter *iter) +{ + osd_data->type = CEPH_OSD_DATA_TYPE_ITER; + osd_data->iter = *iter; +} + static struct ceph_osd_data * osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which) { @@ -264,6 +271,22 @@ void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req, } EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvec_pos); +/** + * osd_req_op_extent_osd_iter - Set up an operation with an iterator buffer + * @osd_req: The request to set up + * @which: Index of the 
operation in which to set the iter + * @iter: The buffer iterator + */ +void osd_req_op_extent_osd_iter(struct ceph_osd_request *osd_req, + unsigned int which, struct iov_iter *iter) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, extent, osd_data); + ceph_osd_iter_init(osd_data, iter); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_iter); + static void osd_req_op_cls_request_info_pagelist( struct ceph_osd_request *osd_req, unsigned int which, struct ceph_pagelist *pagelist) @@ -346,6 +369,8 @@ static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data) #endif /* CONFIG_BLOCK */ case CEPH_OSD_DATA_TYPE_BVECS: return osd_data->bvec_pos.iter.bi_size; + case CEPH_OSD_DATA_TYPE_ITER: + return iov_iter_count(&osd_data->iter); default: WARN(true, "unrecognized data type %d\n", (int)osd_data->type); return 0; @@ -376,8 +401,10 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req, switch (op->op) { case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_SPARSE_READ: case CEPH_OSD_OP_WRITE: case CEPH_OSD_OP_WRITEFULL: + kfree(op->extent.sparse_ext); ceph_osd_data_release(&op->extent.osd_data); break; case CEPH_OSD_OP_CALL: @@ -669,6 +696,7 @@ static void get_num_data_items(struct ceph_osd_request *req, /* reply */ case CEPH_OSD_OP_STAT: case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_SPARSE_READ: case CEPH_OSD_OP_LIST_WATCHERS: *num_reply_data_items += 1; break; @@ -738,7 +766,7 @@ void osd_req_op_extent_init(struct ceph_osd_request *osd_req, BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE && opcode != CEPH_OSD_OP_WRITEFULL && opcode != CEPH_OSD_OP_ZERO && - opcode != CEPH_OSD_OP_TRUNCATE); + opcode != CEPH_OSD_OP_TRUNCATE && opcode != CEPH_OSD_OP_SPARSE_READ); op->extent.offset = offset; op->extent.length = length; @@ -951,6 +979,8 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg, #endif } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BVECS) { ceph_msg_data_add_bvecs(msg, &osd_data->bvec_pos); + } else if (osd_data->type == CEPH_OSD_DATA_TYPE_ITER) { + ceph_msg_data_add_iter(msg, &osd_data->iter); } else { BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE); } @@ -963,6 +993,7 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst, case CEPH_OSD_OP_STAT: break; case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_SPARSE_READ: case CEPH_OSD_OP_WRITE: case CEPH_OSD_OP_WRITEFULL: case CEPH_OSD_OP_ZERO: @@ -1017,6 +1048,10 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst, dst->copy_from.src_fadvise_flags = cpu_to_le32(src->copy_from.src_fadvise_flags); break; + case CEPH_OSD_OP_ASSERT_VER: + dst->assert_ver.unused = cpu_to_le64(0); + dst->assert_ver.ver = cpu_to_le64(src->assert_ver.ver); + break; default: pr_err("unsupported osd opcode %s\n", ceph_osd_op_name(src->op)); @@ -1059,7 +1094,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE && opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE && - opcode != CEPH_OSD_OP_CREATE && opcode != CEPH_OSD_OP_DELETE); + opcode != CEPH_OSD_OP_CREATE && opcode != CEPH_OSD_OP_DELETE && + opcode != CEPH_OSD_OP_SPARSE_READ); req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool, GFP_NOFS); @@ -1100,15 +1136,30 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, if (flags & CEPH_OSD_FLAG_WRITE) req->r_data_offset = off; - if (num_ops > 1) + if (num_ops > 1) { + int num_req_ops, num_rep_ops; + /* - * This is a special case for ceph_writepages_start(), but it - * also covers 
ceph_uninline_data(). If more multi-op request - * use cases emerge, we will need a separate helper. + * If this is a multi-op write request, assume that we'll need + * request ops. If it's a multi-op read then assume we'll need + * reply ops. Anything else and call it -EINVAL. */ - r = __ceph_osdc_alloc_messages(req, GFP_NOFS, num_ops, 0); - else + if (flags & CEPH_OSD_FLAG_WRITE) { + num_req_ops = num_ops; + num_rep_ops = 0; + } else if (flags & CEPH_OSD_FLAG_READ) { + num_req_ops = 0; + num_rep_ops = num_ops; + } else { + r = -EINVAL; + goto fail; + } + + r = __ceph_osdc_alloc_messages(req, GFP_NOFS, num_req_ops, + num_rep_ops); + } else { r = ceph_osdc_alloc_messages(req, GFP_NOFS); + } if (r) goto fail; @@ -1120,6 +1171,18 @@ fail: } EXPORT_SYMBOL(ceph_osdc_new_request); +int __ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt) +{ + op->extent.sparse_ext_cnt = cnt; + op->extent.sparse_ext = kmalloc_array(cnt, + sizeof(*op->extent.sparse_ext), + GFP_NOFS); + if (!op->extent.sparse_ext) + return -ENOMEM; + return 0; +} +EXPORT_SYMBOL(__ceph_alloc_sparse_ext_map); + /* * We keep osd requests in an rbtree, sorted by ->r_tid. */ @@ -1177,6 +1240,7 @@ static void osd_init(struct ceph_osd *osd) { refcount_set(&osd->o_ref, 1); RB_CLEAR_NODE(&osd->o_node); + spin_lock_init(&osd->o_requests_lock); osd->o_requests = RB_ROOT; osd->o_linger_requests = RB_ROOT; osd->o_backoff_mappings = RB_ROOT; @@ -1187,6 +1251,13 @@ static void osd_init(struct ceph_osd *osd) mutex_init(&osd->lock); } +static void ceph_init_sparse_read(struct ceph_sparse_read *sr) +{ + kfree(sr->sr_extent); + memset(sr, '\0', sizeof(*sr)); + sr->sr_state = CEPH_SPARSE_READ_HDR; +} + static void osd_cleanup(struct ceph_osd *osd) { WARN_ON(!RB_EMPTY_NODE(&osd->o_node)); @@ -1197,6 +1268,8 @@ static void osd_cleanup(struct ceph_osd *osd) WARN_ON(!list_empty(&osd->o_osd_lru)); WARN_ON(!list_empty(&osd->o_keepalive_item)); + ceph_init_sparse_read(&osd->o_sparse_read); + if (osd->o_auth.authorizer) { WARN_ON(osd_homeless(osd)); ceph_auth_destroy_authorizer(osd->o_auth.authorizer); @@ -1216,6 +1289,9 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum) osd_init(osd); osd->o_osdc = osdc; osd->o_osd = onum; + osd->o_sparse_op_idx = -1; + + ceph_init_sparse_read(&osd->o_sparse_read); ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr); @@ -1406,7 +1482,9 @@ static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req) atomic_inc(&osd->o_osdc->num_homeless); get_osd(osd); + spin_lock(&osd->o_requests_lock); insert_request(&osd->o_requests, req); + spin_unlock(&osd->o_requests_lock); req->r_osd = osd; } @@ -1418,7 +1496,9 @@ static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req) req, req->r_tid); req->r_osd = NULL; + spin_lock(&osd->o_requests_lock); erase_request(&osd->o_requests, req); + spin_unlock(&osd->o_requests_lock); put_osd(osd); if (!osd_homeless(osd)) @@ -2016,6 +2096,7 @@ static void setup_request_data(struct ceph_osd_request *req) &op->raw_data_in); break; case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_SPARSE_READ: ceph_osdc_msg_data_add(reply_msg, &op->extent.osd_data); break; @@ -2435,8 +2516,10 @@ static void finish_request(struct ceph_osd_request *req) req->r_end_latency = ktime_get(); - if (req->r_osd) + if (req->r_osd) { + ceph_init_sparse_read(&req->r_osd->o_sparse_read); unlink_request(req->r_osd, req); + } atomic_dec(&osdc->num_requests); /* @@ -3795,6 +3878,7 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) * one 
(type of) reply back. */ WARN_ON(!(m.flags & CEPH_OSD_FLAG_ONDISK)); + req->r_version = m.user_version; req->r_result = m.result ?: data_len; finish_request(req); mutex_unlock(&osd->lock); @@ -5348,6 +5432,24 @@ static void osd_dispatch(struct ceph_connection *con, struct ceph_msg *msg) ceph_msg_put(msg); } +/* How much sparse data was requested? */ +static u64 sparse_data_requested(struct ceph_osd_request *req) +{ + u64 len = 0; + + if (req->r_flags & CEPH_OSD_FLAG_READ) { + int i; + + for (i = 0; i < req->r_num_ops; ++i) { + struct ceph_osd_req_op *op = &req->r_ops[i]; + + if (op->op == CEPH_OSD_OP_SPARSE_READ) + len += op->extent.length; + } + } + return len; +} + /* * Lookup and return message for incoming reply. Don't try to do * anything about a larger than preallocated data portion of the @@ -5364,6 +5466,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, int front_len = le32_to_cpu(hdr->front_len); int data_len = le32_to_cpu(hdr->data_len); u64 tid = le64_to_cpu(hdr->tid); + u64 srlen; down_read(&osdc->lock); if (!osd_registered(osd)) { @@ -5396,7 +5499,8 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, req->r_reply = m; } - if (data_len > req->r_reply->data_length) { + srlen = sparse_data_requested(req); + if (!srlen && data_len > req->r_reply->data_length) { pr_warn("%s osd%d tid %llu data %d > preallocated %zu, skipping\n", __func__, osd->o_osd, req->r_tid, data_len, req->r_reply->data_length); @@ -5406,6 +5510,8 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, } m = ceph_msg_get(req->r_reply); + m->sparse_read = (bool)srlen; + dout("get_reply tid %lld %p\n", tid, m); out_unlock_session: @@ -5638,9 +5744,217 @@ static int osd_check_message_signature(struct ceph_msg *msg) return ceph_auth_check_message_signature(auth, msg); } +static void advance_cursor(struct ceph_msg_data_cursor *cursor, size_t len, + bool zero) +{ + while (len) { + struct page *page; + size_t poff, plen; + + page = ceph_msg_data_next(cursor, &poff, &plen); + if (plen > len) + plen = len; + if (zero) + zero_user_segment(page, poff, poff + plen); + len -= plen; + ceph_msg_data_advance(cursor, plen); + } +} + +static int prep_next_sparse_read(struct ceph_connection *con, + struct ceph_msg_data_cursor *cursor) +{ + struct ceph_osd *o = con->private; + struct ceph_sparse_read *sr = &o->o_sparse_read; + struct ceph_osd_request *req; + struct ceph_osd_req_op *op; + + spin_lock(&o->o_requests_lock); + req = lookup_request(&o->o_requests, le64_to_cpu(con->in_msg->hdr.tid)); + if (!req) { + spin_unlock(&o->o_requests_lock); + return -EBADR; + } + + if (o->o_sparse_op_idx < 0) { + u64 srlen = sparse_data_requested(req); + + dout("%s: [%d] starting new sparse read req. 
srlen=0x%llx\n", + __func__, o->o_osd, srlen); + ceph_msg_data_cursor_init(cursor, con->in_msg, srlen); + } else { + u64 end; + + op = &req->r_ops[o->o_sparse_op_idx]; + + WARN_ON_ONCE(op->extent.sparse_ext); + + /* hand back buffer we took earlier */ + op->extent.sparse_ext = sr->sr_extent; + sr->sr_extent = NULL; + op->extent.sparse_ext_cnt = sr->sr_count; + sr->sr_ext_len = 0; + dout("%s: [%d] completed extent array len %d cursor->resid %zd\n", + __func__, o->o_osd, op->extent.sparse_ext_cnt, cursor->resid); + /* Advance to end of data for this operation */ + end = ceph_sparse_ext_map_end(op); + if (end < sr->sr_req_len) + advance_cursor(cursor, sr->sr_req_len - end, false); + } + + ceph_init_sparse_read(sr); + + /* find next op in this request (if any) */ + while (++o->o_sparse_op_idx < req->r_num_ops) { + op = &req->r_ops[o->o_sparse_op_idx]; + if (op->op == CEPH_OSD_OP_SPARSE_READ) + goto found; + } + + /* reset for next sparse read request */ + spin_unlock(&o->o_requests_lock); + o->o_sparse_op_idx = -1; + return 0; +found: + sr->sr_req_off = op->extent.offset; + sr->sr_req_len = op->extent.length; + sr->sr_pos = sr->sr_req_off; + dout("%s: [%d] new sparse read op at idx %d 0x%llx~0x%llx\n", __func__, + o->o_osd, o->o_sparse_op_idx, sr->sr_req_off, sr->sr_req_len); + + /* hand off request's sparse extent map buffer */ + sr->sr_ext_len = op->extent.sparse_ext_cnt; + op->extent.sparse_ext_cnt = 0; + sr->sr_extent = op->extent.sparse_ext; + op->extent.sparse_ext = NULL; + + spin_unlock(&o->o_requests_lock); + return 1; +} + +#ifdef __BIG_ENDIAN +static inline void convert_extent_map(struct ceph_sparse_read *sr) +{ + int i; + + for (i = 0; i < sr->sr_count; i++) { + struct ceph_sparse_extent *ext = &sr->sr_extent[i]; + + ext->off = le64_to_cpu((__force __le64)ext->off); + ext->len = le64_to_cpu((__force __le64)ext->len); + } +} +#else +static inline void convert_extent_map(struct ceph_sparse_read *sr) +{ +} +#endif + +#define MAX_EXTENTS 4096 + +static int osd_sparse_read(struct ceph_connection *con, + struct ceph_msg_data_cursor *cursor, + char **pbuf) +{ + struct ceph_osd *o = con->private; + struct ceph_sparse_read *sr = &o->o_sparse_read; + u32 count = sr->sr_count; + u64 eoff, elen; + int ret; + + switch (sr->sr_state) { + case CEPH_SPARSE_READ_HDR: +next_op: + ret = prep_next_sparse_read(con, cursor); + if (ret <= 0) + return ret; + + /* number of extents */ + ret = sizeof(sr->sr_count); + *pbuf = (char *)&sr->sr_count; + sr->sr_state = CEPH_SPARSE_READ_EXTENTS; + break; + case CEPH_SPARSE_READ_EXTENTS: + /* Convert sr_count to host-endian */ + count = le32_to_cpu((__force __le32)sr->sr_count); + sr->sr_count = count; + dout("[%d] got %u extents\n", o->o_osd, count); + + if (count > 0) { + if (!sr->sr_extent || count > sr->sr_ext_len) { + /* + * Apply a hard cap to the number of extents. + * If we have more, assume something is wrong. + */ + if (count > MAX_EXTENTS) { + dout("%s: OSD returned 0x%x extents in a single reply!\n", + __func__, count); + return -EREMOTEIO; + } + + /* no extent array provided, or too short */ + kfree(sr->sr_extent); + sr->sr_extent = kmalloc_array(count, + sizeof(*sr->sr_extent), + GFP_NOIO); + if (!sr->sr_extent) + return -ENOMEM; + sr->sr_ext_len = count; + } + ret = count * sizeof(*sr->sr_extent); + *pbuf = (char *)sr->sr_extent; + sr->sr_state = CEPH_SPARSE_READ_DATA_LEN; + break; + } + /* No extents? 
Read data len */ + fallthrough; + case CEPH_SPARSE_READ_DATA_LEN: + convert_extent_map(sr); + ret = sizeof(sr->sr_datalen); + *pbuf = (char *)&sr->sr_datalen; + sr->sr_state = CEPH_SPARSE_READ_DATA; + break; + case CEPH_SPARSE_READ_DATA: + if (sr->sr_index >= count) { + sr->sr_state = CEPH_SPARSE_READ_HDR; + goto next_op; + } + + eoff = sr->sr_extent[sr->sr_index].off; + elen = sr->sr_extent[sr->sr_index].len; + + dout("[%d] ext %d off 0x%llx len 0x%llx\n", + o->o_osd, sr->sr_index, eoff, elen); + + if (elen > INT_MAX) { + dout("Sparse read extent length too long (0x%llx)\n", + elen); + return -EREMOTEIO; + } + + /* zero out anything from sr_pos to start of extent */ + if (sr->sr_pos < eoff) + advance_cursor(cursor, eoff - sr->sr_pos, true); + + /* Set position to end of extent */ + sr->sr_pos = eoff + elen; + + /* send back the new length and nullify the ptr */ + cursor->sr_resid = elen; + ret = elen; + *pbuf = NULL; + + /* Bump the array index */ + ++sr->sr_index; + break; + } + return ret; +} + static const struct ceph_connection_operations osd_con_ops = { .get = osd_get_con, .put = osd_put_con, + .sparse_read = osd_sparse_read, .alloc_msg = osd_alloc_msg, .dispatch = osd_dispatch, .fault = osd_fault, diff --git a/net/core/dev.c b/net/core/dev.c index ccff2b6ef958..5aaf5753d4e4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -69,7 +69,7 @@ */ #include <linux/uaccess.h> -#include <linux/bitops.h> +#include <linux/bitmap.h> #include <linux/capability.h> #include <linux/cpu.h> #include <linux/types.h> @@ -1080,7 +1080,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) return -EINVAL; /* Use one page as a bit array of possible slots */ - inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC); + inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC); if (!inuse) return -ENOMEM; @@ -1109,7 +1109,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) } i = find_first_zero_bit(inuse, max_netdevices); - free_page((unsigned long) inuse); + bitmap_free(inuse); } snprintf(buf, IFNAMSIZ, name, i); @@ -3292,15 +3292,19 @@ int skb_checksum_help(struct sk_buff *skb) offset = skb_checksum_start_offset(skb); ret = -EINVAL; - if (WARN_ON_ONCE(offset >= skb_headlen(skb))) { + if (unlikely(offset >= skb_headlen(skb))) { DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); + WARN_ONCE(true, "offset (%d) >= skb_headlen() (%u)\n", + offset, skb_headlen(skb)); goto out; } csum = skb_checksum(skb, offset, skb->len - offset, 0); offset += skb->csum_offset; - if (WARN_ON_ONCE(offset + sizeof(__sum16) > skb_headlen(skb))) { + if (unlikely(offset + sizeof(__sum16) > skb_headlen(skb))) { DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); + WARN_ONCE(true, "offset+2 (%zu) > skb_headlen() (%u)\n", + offset + sizeof(__sum16), skb_headlen(skb)); goto out; } ret = skb_ensure_writable(skb, offset + sizeof(__sum16)); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 89d15ceaf9af..272f09251343 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1446,7 +1446,7 @@ proto_again: break; } - nhoff += ntohs(hdr->message_length); + nhoff += sizeof(struct ptp_header); fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } @@ -1831,8 +1831,7 @@ u32 __skb_get_hash_symmetric(const struct sk_buff *skb) memset(&keys, 0, sizeof(keys)); __skb_flow_dissect(NULL, skb, &flow_keys_dissector_symmetric, - &keys, NULL, 0, 0, 0, - FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); + &keys, NULL, 0, 0, 0, 0); return __flow_hash_from_keys(&keys, &hashrnd); } diff --git a/net/core/neighbour.c 
b/net/core/neighbour.c index ddd0f32de20e..9c09f091cbff 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -410,7 +410,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, */ __skb_queue_purge(&n->arp_queue); n->arp_queue_len_bytes = 0; - n->output = neigh_blackhole; + WRITE_ONCE(n->output, neigh_blackhole); if (n->nud_state & NUD_VALID) n->nud_state = NUD_NOARP; else @@ -920,7 +920,7 @@ static void neigh_suspect(struct neighbour *neigh) { neigh_dbg(2, "neigh %p is suspected\n", neigh); - neigh->output = neigh->ops->output; + WRITE_ONCE(neigh->output, neigh->ops->output); } /* Neighbour state is OK; @@ -932,7 +932,7 @@ static void neigh_connect(struct neighbour *neigh) { neigh_dbg(2, "neigh %p is connected\n", neigh); - neigh->output = neigh->ops->connected_output; + WRITE_ONCE(neigh->output, neigh->ops->connected_output); } static void neigh_periodic_work(struct work_struct *work) @@ -988,7 +988,9 @@ static void neigh_periodic_work(struct work_struct *work) (state == NUD_FAILED || !time_in_range_open(jiffies, n->used, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { - *np = n->next; + rcu_assign_pointer(*np, + rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock))); neigh_mark_dead(n); write_unlock(&n->lock); neigh_cleanup_and_release(n); @@ -1447,7 +1449,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, if (n2) n1 = n2; } - n1->output(n1, skb); + READ_ONCE(n1->output)(n1, skb); if (n2) neigh_release(n2); rcu_read_unlock(); @@ -3153,7 +3155,7 @@ int neigh_xmit(int index, struct net_device *dev, rcu_read_unlock(); goto out_kfree_skb; } - err = neigh->output(neigh, skb); + err = READ_ONCE(neigh->output)(neigh, skb); rcu_read_unlock(); } else if (index == NEIGH_LINK_TABLE) { @@ -3779,6 +3781,7 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, const char *dev_name_source; char neigh_path[ sizeof("net//neigh/") + IFNAMSIZ + IFNAMSIZ ]; char *p_name; + size_t neigh_vars_size; t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL_ACCOUNT); if (!t) @@ -3790,11 +3793,13 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, t->neigh_vars[i].extra2 = p; } + neigh_vars_size = ARRAY_SIZE(t->neigh_vars); if (dev) { dev_name_source = dev->name; /* Terminate the table early */ memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0, sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL])); + neigh_vars_size = NEIGH_VAR_BASE_REACHABLE_TIME_MS + 1; } else { struct neigh_table *tbl = p->tbl; dev_name_source = "default"; @@ -3841,8 +3846,9 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, snprintf(neigh_path, sizeof(neigh_path), "net/%s/neigh/%s", p_name, dev_name_source); - t->sysctl_header = - register_net_sysctl(neigh_parms_net(p), neigh_path, t->neigh_vars); + t->sysctl_header = register_net_sysctl_sz(neigh_parms_net(p), + neigh_path, t->neigh_vars, + neigh_vars_size); if (!t->sysctl_header) goto free; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 4a2ec33bfb51..53c377d054f0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -5503,13 +5503,11 @@ static unsigned int rtnl_offload_xstats_get_size_hw_s_info_one(const struct net_device *dev, enum netdev_offload_xstats_type type) { - bool enabled = netdev_offload_xstats_enabled(dev, type); - return nla_total_size(0) + /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST */ nla_total_size(sizeof(u8)) + /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED */ - (enabled ? 
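
The neighbour.c hunks above wrap every update and invocation of n->output in WRITE_ONCE()/READ_ONCE(), because the pointer is rewritten under the table lock while lockless readers call through it. A userspace C11 analogue of that pattern; relaxed atomics stand in for the kernel macros, which are not used here:

#include <stdatomic.h>
#include <stdio.h>

typedef int (*output_fn)(const char *msg);

static int real_output(const char *msg) { return printf("tx: %s\n", msg); }
static int blackhole(const char *msg)   { (void)msg; return 0; }

struct neigh {
	_Atomic output_fn output;
};

static void neigh_make_dead(struct neigh *n)
{
	/* writer side, normally under a lock: publish the new handler
	 * (WRITE_ONCE analogue) */
	atomic_store_explicit(&n->output, blackhole, memory_order_relaxed);
}

static int neigh_xmit(struct neigh *n, const char *msg)
{
	/* reader side: one marked load, then call (READ_ONCE analogue) */
	output_fn fn = atomic_load_explicit(&n->output, memory_order_relaxed);

	return fn(msg);
}

int main(void)
{
	struct neigh n = { .output = real_output };

	neigh_xmit(&n, "hello");
	neigh_make_dead(&n);
	neigh_xmit(&n, "dropped");
	return 0;
}

Loading the pointer exactly once before the call is the point: it rules out torn reads and a second racy re-read between the check and the call.
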
nla_total_size(sizeof(u8)) : 0) + + nla_total_size(sizeof(u8)) + 0; } diff --git a/net/core/skmsg.c b/net/core/skmsg.c index a0659fc29bcc..6c31eefbd777 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -612,12 +612,18 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len, bool ingress) { + int err = 0; + if (!ingress) { if (!sock_writeable(psock->sk)) return -EAGAIN; return skb_send_sock(psock->sk, skb, off, len); } - return sk_psock_skb_ingress(psock, skb, off, len); + skb_get(skb); + err = sk_psock_skb_ingress(psock, skb, off, len); + if (err < 0) + kfree_skb(skb); + return err; } static void sk_psock_skb_state(struct sk_psock *psock, @@ -685,9 +691,7 @@ static void sk_psock_backlog(struct work_struct *work) } while (len); skb = skb_dequeue(&psock->ingress_skb); - if (!ingress) { - kfree_skb(skb); - } + kfree_skb(skb); } end: mutex_unlock(&psock->work_mutex); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index cb11750b1df5..4292c2ed1828 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -668,6 +668,8 @@ BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg, sk = __sock_map_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; + if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) + return SK_DROP; msg->flags = flags; msg->sk_redir = sk; @@ -1267,6 +1269,8 @@ BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg, sk = __sock_hash_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; + if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) + return SK_DROP; msg->flags = flags; msg->sk_redir = sk; diff --git a/net/core/stream.c b/net/core/stream.c index f5c4e47df165..96fbcb9bbb30 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -117,7 +117,7 @@ EXPORT_SYMBOL(sk_stream_wait_close); */ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) { - int err = 0; + int ret, err = 0; long vm_wait = 0; long current_timeo = *timeo_p; DEFINE_WAIT_FUNC(wait, woken_wake_function); @@ -142,11 +142,13 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_pending++; - sk_wait_event(sk, ¤t_timeo, READ_ONCE(sk->sk_err) || - (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) || - (sk_stream_memory_free(sk) && - !vm_wait), &wait); + ret = sk_wait_event(sk, ¤t_timeo, READ_ONCE(sk->sk_err) || + (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) || + (sk_stream_memory_free(sk) && !vm_wait), + &wait); sk->sk_write_pending--; + if (ret < 0) + goto do_error; if (vm_wait) { vm_wait -= current_timeo; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 782273bb93c2..03f1edb948d7 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -712,7 +712,8 @@ static __net_init int sysctl_core_net_init(struct net *net) tmp->data += (char *)net - (char *)&init_net; } - net->core.sysctl_hdr = register_net_sysctl(net, "net/core", tbl); + net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl, + ARRAY_SIZE(netns_core_table)); if (net->core.sysctl_hdr == NULL) goto err_reg; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 8f56e8723c73..69453b936bd5 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -254,13 +254,8 @@ static int dccp_v4_err(struct sk_buff *skb, u32 info) int err; struct net *net = dev_net(skb->dev); - /* For the first __dccp_basic_hdr_len() check, we only need dh->dccph_x, - * which is 
in byte 7 of the dccp header. - * Our caller (icmp_socket_deliver()) already pulled 8 bytes for us. - * - * Later on, we want to access the sequence number fields, which are - * beyond 8 bytes, so we have to pskb_may_pull() ourselves. - */ + if (!pskb_may_pull(skb, offset + sizeof(*dh))) + return -EINVAL; dh = (struct dccp_hdr *)(skb->data + offset); if (!pskb_may_pull(skb, offset + __dccp_basic_hdr_len(dh))) return -EINVAL; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 33f6ccf6ba77..c693a570682f 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -83,13 +83,8 @@ static int dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, __u64 seq; struct net *net = dev_net(skb->dev); - /* For the first __dccp_basic_hdr_len() check, we only need dh->dccph_x, - * which is in byte 7 of the dccp header. - * Our caller (icmpv6_notify()) already pulled 8 bytes for us. - * - * Later on, we want to access the sequence number fields, which are - * beyond 8 bytes, so we have to pskb_may_pull() ourselves. - */ + if (!pskb_may_pull(skb, offset + sizeof(*dh))) + return -EINVAL; dh = (struct dccp_hdr *)(skb->data + offset); if (!pskb_may_pull(skb, offset + __dccp_basic_hdr_len(dh))) return -EINVAL; diff --git a/net/devlink/health.c b/net/devlink/health.c index 638cad8d5c65..51e6e81e31bb 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -58,7 +58,6 @@ struct devlink_health_reporter { struct devlink *devlink; struct devlink_port *devlink_port; struct devlink_fmsg *dump_fmsg; - struct mutex dump_lock; /* lock parallel read/write from dump buffers */ u64 graceful_period; bool auto_recover; bool auto_dump; @@ -125,7 +124,6 @@ __devlink_health_reporter_create(struct devlink *devlink, reporter->graceful_period = graceful_period; reporter->auto_recover = !!ops->recover; reporter->auto_dump = !!ops->dump; - mutex_init(&reporter->dump_lock); return reporter; } @@ -226,7 +224,6 @@ EXPORT_SYMBOL_GPL(devlink_health_reporter_create); static void devlink_health_reporter_free(struct devlink_health_reporter *reporter) { - mutex_destroy(&reporter->dump_lock); if (reporter->dump_fmsg) devlink_fmsg_free(reporter->dump_fmsg); kfree(reporter); @@ -625,10 +622,10 @@ int devlink_health_report(struct devlink_health_reporter *reporter, } if (reporter->auto_dump) { - mutex_lock(&reporter->dump_lock); + devl_lock(devlink); /* store current dump of current error, for later analysis */ devlink_health_do_dump(reporter, priv_ctx, NULL); - mutex_unlock(&reporter->dump_lock); + devl_unlock(devlink); } if (!reporter->auto_recover) @@ -1262,7 +1259,7 @@ out: } static struct devlink_health_reporter * -devlink_health_reporter_get_from_cb(struct netlink_callback *cb) +devlink_health_reporter_get_from_cb_lock(struct netlink_callback *cb) { const struct genl_info *info = genl_info_dump(cb); struct devlink_health_reporter *reporter; @@ -1272,10 +1269,12 @@ devlink_health_reporter_get_from_cb(struct netlink_callback *cb) devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs); if (IS_ERR(devlink)) return NULL; - devl_unlock(devlink); reporter = devlink_health_reporter_get_from_attrs(devlink, attrs); - devlink_put(devlink); + if (!reporter) { + devl_unlock(devlink); + devlink_put(devlink); + } return reporter; } @@ -1284,16 +1283,20 @@ int devlink_nl_cmd_health_reporter_dump_get_dumpit(struct sk_buff *skb, { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_health_reporter *reporter; + struct devlink *devlink; int err; - reporter = devlink_health_reporter_get_from_cb(cb); + reporter = 
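
The dccp_v4_err()/dccp_v6_err() hunks above stop relying on the ICMP caller having pulled eight bytes and instead pskb_may_pull() the full basic header before dereferencing it. The same validate-before-cast rule on a plain buffer; the header layout below is hypothetical, chosen only to make the sketch self-contained:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct basic_hdr {
	uint16_t sport;
	uint16_t dport;
	uint8_t  doff;
	uint8_t  flags;
	uint16_t checksum;
};

/* returns 0 on success, -1 if the packet is too short for the header */
static int parse_at_offset(const uint8_t *pkt, size_t pkt_len, size_t offset,
			   struct basic_hdr *out)
{
	/* the pskb_may_pull() analogue: never read past pkt_len */
	if (pkt_len < offset || pkt_len - offset < sizeof(*out))
		return -1;

	memcpy(out, pkt + offset, sizeof(*out));	/* safe now */
	return 0;
}

int main(void)
{
	uint8_t pkt[64] = { 0 };
	struct basic_hdr h;

	printf("offset 20: %s\n",
	       parse_at_offset(pkt, sizeof(pkt), 20, &h) ? "short" : "ok");
	printf("offset 60: %s\n",
	       parse_at_offset(pkt, sizeof(pkt), 60, &h) ? "short" : "ok");
	return 0;
}

Checking against the real buffer length, rather than trusting what a caller promised to have pulled, is exactly what the removed comments had been used to justify skipping.
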
devlink_health_reporter_get_from_cb_lock(cb); if (!reporter) return -EINVAL; - if (!reporter->ops->dump) + devlink = reporter->devlink; + if (!reporter->ops->dump) { + devl_unlock(devlink); + devlink_put(devlink); return -EOPNOTSUPP; + } - mutex_lock(&reporter->dump_lock); if (!state->idx) { err = devlink_health_do_dump(reporter, NULL, cb->extack); if (err) @@ -1309,7 +1312,8 @@ int devlink_nl_cmd_health_reporter_dump_get_dumpit(struct sk_buff *skb, err = devlink_fmsg_dumpit(reporter->dump_fmsg, skb, cb, DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET); unlock: - mutex_unlock(&reporter->dump_lock); + devl_unlock(devlink); + devlink_put(devlink); return err; } @@ -1326,9 +1330,7 @@ int devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb, if (!reporter->ops->dump) return -EOPNOTSUPP; - mutex_lock(&reporter->dump_lock); devlink_health_dump_clear(reporter); - mutex_unlock(&reporter->dump_lock); return 0; } diff --git a/net/ethtool/bitset.c b/net/ethtool/bitset.c index 0515d6604b3b..883ed9be81f9 100644 --- a/net/ethtool/bitset.c +++ b/net/ethtool/bitset.c @@ -431,8 +431,10 @@ ethnl_update_bitset32_verbose(u32 *bitmap, unsigned int nbits, ethnl_string_array_t names, struct netlink_ext_ack *extack, bool *mod) { + u32 *orig_bitmap, *saved_bitmap = NULL; struct nlattr *bit_attr; bool no_mask; + bool dummy; int rem; int ret; @@ -448,8 +450,22 @@ ethnl_update_bitset32_verbose(u32 *bitmap, unsigned int nbits, } no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; - if (no_mask) - ethnl_bitmap32_clear(bitmap, 0, nbits, mod); + if (no_mask) { + unsigned int nwords = DIV_ROUND_UP(nbits, 32); + unsigned int nbytes = nwords * sizeof(u32); + + /* The bitmap size is only the size of the map part without + * its mask part. + */ + saved_bitmap = kcalloc(nwords, sizeof(u32), GFP_KERNEL); + if (!saved_bitmap) + return -ENOMEM; + memcpy(saved_bitmap, bitmap, nbytes); + ethnl_bitmap32_clear(bitmap, 0, nbits, &dummy); + orig_bitmap = saved_bitmap; + } else { + orig_bitmap = bitmap; + } nla_for_each_nested(bit_attr, tb[ETHTOOL_A_BITSET_BITS], rem) { bool old_val, new_val; @@ -458,13 +474,14 @@ ethnl_update_bitset32_verbose(u32 *bitmap, unsigned int nbits, if (nla_type(bit_attr) != ETHTOOL_A_BITSET_BITS_BIT) { NL_SET_ERR_MSG_ATTR(extack, bit_attr, "only ETHTOOL_A_BITSET_BITS_BIT allowed in ETHTOOL_A_BITSET_BITS"); - return -EINVAL; + ret = -EINVAL; + goto out; } ret = ethnl_parse_bit(&idx, &new_val, nbits, bit_attr, no_mask, names, extack); if (ret < 0) - return ret; - old_val = bitmap[idx / 32] & ((u32)1 << (idx % 32)); + goto out; + old_val = orig_bitmap[idx / 32] & ((u32)1 << (idx % 32)); if (new_val != old_val) { if (new_val) bitmap[idx / 32] |= ((u32)1 << (idx % 32)); @@ -474,7 +491,10 @@ ethnl_update_bitset32_verbose(u32 *bitmap, unsigned int nbits, } } - return 0; + ret = 0; +out: + kfree(saved_bitmap); + return ret; } static int ethnl_compact_sanity_checks(unsigned int nbits, diff --git a/net/ethtool/plca.c b/net/ethtool/plca.c index b238a1afe9ae..b1e2e3b5027f 100644 --- a/net/ethtool/plca.c +++ b/net/ethtool/plca.c @@ -21,16 +21,6 @@ struct plca_reply_data { #define PLCA_REPDATA(__reply_base) \ container_of(__reply_base, struct plca_reply_data, base) -static void plca_update_sint(int *dst, const struct nlattr *attr, - bool *mod) -{ - if (!attr) - return; - - *dst = nla_get_u32(attr); - *mod = true; -} - // PLCA get configuration message ------------------------------------------- // const struct nla_policy ethnl_plca_get_cfg_policy[] = { @@ -38,6 +28,29 @@ const struct nla_policy ethnl_plca_get_cfg_policy[] = { 
NLA_POLICY_NESTED(ethnl_header_policy), }; +static void plca_update_sint(int *dst, struct nlattr **tb, u32 attrid, + bool *mod) +{ + const struct nlattr *attr = tb[attrid]; + + if (!attr || + WARN_ON_ONCE(attrid >= ARRAY_SIZE(ethnl_plca_set_cfg_policy))) + return; + + switch (ethnl_plca_set_cfg_policy[attrid].type) { + case NLA_U8: + *dst = nla_get_u8(attr); + break; + case NLA_U32: + *dst = nla_get_u32(attr); + break; + default: + WARN_ON_ONCE(1); + } + + *mod = true; +} + static int plca_get_cfg_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) @@ -144,13 +157,13 @@ ethnl_set_plca(struct ethnl_req_info *req_info, struct genl_info *info) return -EOPNOTSUPP; memset(&plca_cfg, 0xff, sizeof(plca_cfg)); - plca_update_sint(&plca_cfg.enabled, tb[ETHTOOL_A_PLCA_ENABLED], &mod); - plca_update_sint(&plca_cfg.node_id, tb[ETHTOOL_A_PLCA_NODE_ID], &mod); - plca_update_sint(&plca_cfg.node_cnt, tb[ETHTOOL_A_PLCA_NODE_CNT], &mod); - plca_update_sint(&plca_cfg.to_tmr, tb[ETHTOOL_A_PLCA_TO_TMR], &mod); - plca_update_sint(&plca_cfg.burst_cnt, tb[ETHTOOL_A_PLCA_BURST_CNT], + plca_update_sint(&plca_cfg.enabled, tb, ETHTOOL_A_PLCA_ENABLED, &mod); + plca_update_sint(&plca_cfg.node_id, tb, ETHTOOL_A_PLCA_NODE_ID, &mod); + plca_update_sint(&plca_cfg.node_cnt, tb, ETHTOOL_A_PLCA_NODE_CNT, &mod); + plca_update_sint(&plca_cfg.to_tmr, tb, ETHTOOL_A_PLCA_TO_TMR, &mod); + plca_update_sint(&plca_cfg.burst_cnt, tb, ETHTOOL_A_PLCA_BURST_CNT, &mod); - plca_update_sint(&plca_cfg.burst_tmr, tb[ETHTOOL_A_PLCA_BURST_TMR], + plca_update_sint(&plca_cfg.burst_tmr, tb, ETHTOOL_A_PLCA_BURST_TMR, &mod); if (!mod) return 0; diff --git a/net/handshake/handshake-test.c b/net/handshake/handshake-test.c index 6d37bab35c8f..16ed7bfd29e4 100644 --- a/net/handshake/handshake-test.c +++ b/net/handshake/handshake-test.c @@ -235,7 +235,7 @@ static void handshake_req_submit_test4(struct kunit *test) KUNIT_EXPECT_PTR_EQ(test, req, result); handshake_req_cancel(sock->sk); - sock_release(sock); + fput(filp); } static void handshake_req_submit_test5(struct kunit *test) @@ -272,7 +272,7 @@ static void handshake_req_submit_test5(struct kunit *test) /* Assert */ KUNIT_EXPECT_EQ(test, err, -EAGAIN); - sock_release(sock); + fput(filp); hn->hn_pending = saved; } @@ -306,7 +306,7 @@ static void handshake_req_submit_test6(struct kunit *test) KUNIT_EXPECT_EQ(test, err, -EBUSY); handshake_req_cancel(sock->sk); - sock_release(sock); + fput(filp); } static void handshake_req_cancel_test1(struct kunit *test) @@ -340,7 +340,7 @@ static void handshake_req_cancel_test1(struct kunit *test) /* Assert */ KUNIT_EXPECT_TRUE(test, result); - sock_release(sock); + fput(filp); } static void handshake_req_cancel_test2(struct kunit *test) @@ -382,7 +382,7 @@ static void handshake_req_cancel_test2(struct kunit *test) /* Assert */ KUNIT_EXPECT_TRUE(test, result); - sock_release(sock); + fput(filp); } static void handshake_req_cancel_test3(struct kunit *test) @@ -427,7 +427,7 @@ static void handshake_req_cancel_test3(struct kunit *test) /* Assert */ KUNIT_EXPECT_FALSE(test, result); - sock_release(sock); + fput(filp); } static struct handshake_req *handshake_req_destroy_test; @@ -471,7 +471,7 @@ static void handshake_req_destroy_test1(struct kunit *test) handshake_req_cancel(sock->sk); /* Act */ - sock_release(sock); + fput(filp); /* Assert */ KUNIT_EXPECT_PTR_EQ(test, handshake_req_destroy_test, req); diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index 629daacc9607..b71dab630a87 100644 --- 
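
The reworked plca_update_sint() above looks up the attribute's declared policy type instead of hard-coding nla_get_u32(), so a u8 attribute can no longer be read as four bytes. A table-driven sketch of that dispatch; the enum, struct, and names are illustrative, not the netlink API:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

enum attr_type { ATTR_U8, ATTR_U32 };

struct attr {
	const void *data;	/* NULL means "attribute absent" */
};

static const enum attr_type policy[] = {
	[0] = ATTR_U8,		/* e.g. an enable flag */
	[1] = ATTR_U32,		/* e.g. a node id */
};

static void update_sint(int *dst, const struct attr *tb, unsigned int id,
			int *mod)
{
	const struct attr *a = &tb[id];

	if (!a->data || id >= sizeof(policy) / sizeof(policy[0]))
		return;

	/* read exactly as many bytes as the policy declares */
	switch (policy[id]) {
	case ATTR_U8: {
		uint8_t v;

		memcpy(&v, a->data, sizeof(v));
		*dst = v;
		break;
	}
	case ATTR_U32: {
		uint32_t v;

		memcpy(&v, a->data, sizeof(v));
		*dst = (int)v;
		break;
	}
	}
	*mod = 1;
}

int main(void)
{
	uint8_t en = 1;
	uint32_t node = 7;
	struct attr tb[2] = { { &en }, { &node } };
	int enabled = -1, node_id = -1, mod = 0;

	update_sint(&enabled, tb, 0, &mod);
	update_sint(&node_id, tb, 1, &mod);
	printf("enabled=%d node_id=%d mod=%d\n", enabled, node_id, mod);
	return 0;
}

Keeping the width in one table means adding an attribute only touches the policy, not every call site.
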
a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -594,6 +594,7 @@ static int fill_frame_info(struct hsr_frame_info *frame, proto = vlan_hdr->vlanhdr.h_vlan_encapsulated_proto; /* FIXME: */ netdev_warn_once(skb->dev, "VLAN not yet supported"); + return -EINVAL; } frame->is_from_san = false; diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index b77f1189d19d..6d14d935ee82 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -288,13 +288,13 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame) /* And leave the HSR tag. */ if (ethhdr->h_proto == htons(ETH_P_HSR)) { - pull_size = sizeof(struct ethhdr); + pull_size = sizeof(struct hsr_tag); skb_pull(skb, pull_size); total_pull_size += pull_size; } /* And leave the HSR sup tag. */ - pull_size = sizeof(struct hsr_tag); + pull_size = sizeof(struct hsr_sup_tag); skb_pull(skb, pull_size); total_pull_size += pull_size; diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 6851e33df7d1..18e01791ad79 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -83,7 +83,7 @@ struct hsr_vlan_ethhdr { struct hsr_sup_tlv { u8 HSR_TLV_type; u8 HSR_TLV_length; -}; +} __packed; /* HSR/PRP Supervision Frame data types. * Field names as defined in the IEC:2010 standard for HSR. diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index a91283d1e5bf..6dd960ec558c 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -360,6 +360,7 @@ static int __net_init lowpan_frags_ns_sysctl_register(struct net *net) struct ctl_table_header *hdr; struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); + size_t table_size = ARRAY_SIZE(lowpan_frags_ns_ctl_table); table = lowpan_frags_ns_ctl_table; if (!net_eq(net, &init_net)) { @@ -369,8 +370,10 @@ static int __net_init lowpan_frags_ns_sysctl_register(struct net *net) goto err_alloc; /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) + if (net->user_ns != &init_user_ns) { table[0].procname = NULL; + table_size = 0; + } } table[0].data = &ieee802154_lowpan->fqdir->high_thresh; @@ -379,7 +382,8 @@ static int __net_init lowpan_frags_ns_sysctl_register(struct net *net) table[1].extra2 = &ieee802154_lowpan->fqdir->high_thresh; table[2].data = &ieee802154_lowpan->fqdir->timeout; - hdr = register_net_sysctl(net, "net/ieee802154/6lowpan", table); + hdr = register_net_sysctl_sz(net, "net/ieee802154/6lowpan", table, + table_size); if (hdr == NULL) goto err_reg; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 3d2e30e20473..2713c9b06c4c 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -597,7 +597,6 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) add_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending += writebias; - sk->sk_wait_pending++; /* Basic assumption: if someone sets sk->sk_err, he _must_ * change state of the socket from TCP_SYN_*. @@ -613,7 +612,6 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) } remove_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending -= writebias; - sk->sk_wait_pending--; return timeo; } @@ -642,6 +640,7 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, return -EINVAL; if (uaddr->sa_family == AF_UNSPEC) { + sk->sk_disconnects++; err = sk->sk_prot->disconnect(sk, flags); sock->state = err ? 
SS_DISCONNECTING : SS_UNCONNECTED; goto out; @@ -696,6 +695,7 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int writebias = (sk->sk_protocol == IPPROTO_TCP) && tcp_sk(sk)->fastopen_req && tcp_sk(sk)->fastopen_req->data ? 1 : 0; + int dis = sk->sk_disconnects; /* Error code is set above */ if (!timeo || !inet_wait_for_connect(sk, timeo, writebias)) @@ -704,6 +704,11 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, err = sock_intr_errno(timeo); if (signal_pending(current)) goto out; + + if (dis != sk->sk_disconnects) { + err = -EPIPE; + goto out; + } } /* Connection was closed by RST, timeout, ICMP error @@ -725,6 +730,7 @@ out: sock_error: err = sock_error(sk) ? : -ECONNABORTED; sock->state = SS_UNCONNECTED; + sk->sk_disconnects++; if (sk->sk_prot->disconnect(sk, flags)) sock->state = SS_DISCONNECTING; goto out; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index c3658b8755bc..ca0ff15dc8fa 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -355,14 +355,14 @@ static void __inet_del_ifa(struct in_device *in_dev, { struct in_ifaddr *promote = NULL; struct in_ifaddr *ifa, *ifa1; - struct in_ifaddr *last_prim; + struct in_ifaddr __rcu **last_prim; struct in_ifaddr *prev_prom = NULL; int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev); ASSERT_RTNL(); ifa1 = rtnl_dereference(*ifap); - last_prim = rtnl_dereference(in_dev->ifa_list); + last_prim = ifap; if (in_dev->dead) goto no_promotions; @@ -376,7 +376,7 @@ static void __inet_del_ifa(struct in_device *in_dev, while ((ifa = rtnl_dereference(*ifap1)) != NULL) { if (!(ifa->ifa_flags & IFA_F_SECONDARY) && ifa1->ifa_scope <= ifa->ifa_scope) - last_prim = ifa; + last_prim = &ifa->ifa_next; if (!(ifa->ifa_flags & IFA_F_SECONDARY) || ifa1->ifa_mask != ifa->ifa_mask || @@ -440,9 +440,9 @@ no_promotions: rcu_assign_pointer(prev_prom->ifa_next, next_sec); - last_sec = rtnl_dereference(last_prim->ifa_next); + last_sec = rtnl_dereference(*last_prim); rcu_assign_pointer(promote->ifa_next, last_sec); - rcu_assign_pointer(last_prim->ifa_next, promote); + rcu_assign_pointer(*last_prim, promote); } promote->ifa_flags &= ~IFA_F_SECONDARY; @@ -2737,7 +2737,8 @@ static __net_init int devinet_init_net(struct net *net) goto err_reg_dflt; err = -ENOMEM; - forw_hdr = register_net_sysctl(net, "net/ipv4", tbl); + forw_hdr = register_net_sysctl_sz(net, "net/ipv4", tbl, + ARRAY_SIZE(ctl_forward_entry)); if (!forw_hdr) goto err_reg_ctl; net->ipv4.forw_hdr = forw_hdr; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index eafa4a033515..1ea82bc33ef1 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1887,6 +1887,7 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local) continue; if (fi->fib_prefsrc == local) { fi->fib_flags |= RTNH_F_DEAD; + fi->pfsrc_removed = true; ret++; } } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index d13fb9e76b97..9bdfdab906fe 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2027,6 +2027,7 @@ void fib_table_flush_external(struct fib_table *tb) int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all) { struct trie *t = (struct trie *)tb->tb_data; + struct nl_info info = { .nl_net = net }; struct key_vector *pn = t->kv; unsigned long cindex = 1; struct hlist_node *tmp; @@ -2089,6 +2090,9 @@ int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all) fib_notify_alias_delete(net, n->key, &n->leaf, fa, NULL); + if (fi->pfsrc_removed) + rtmsg_fib(RTM_DELROUTE, htonl(n->key), fa, + 
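
The __inet_del_ifa() hunk above retypes last_prim from a pointer to a node into a pointer to the link itself (struct in_ifaddr __rcu **), so the later promotion splice goes through a link that stays valid even when the node the old code remembered has been freed. The underlying "pointer to the link" idiom, in a plain non-RCU sketch:

#include <stdio.h>
#include <stdlib.h>

struct node {
	int val;
	struct node *next;
};

/* unlink the first node matching val; returns it or NULL */
static struct node *unlink_val(struct node **head, int val)
{
	struct node **pp;

	for (pp = head; *pp; pp = &(*pp)->next) {
		if ((*pp)->val == val) {
			struct node *n = *pp;

			*pp = n->next;	/* one store, no prev pointer needed */
			return n;
		}
	}
	return NULL;
}

/* insert n through any remembered link, head or interior */
static void insert_at(struct node **link, struct node *n)
{
	n->next = *link;
	*link = n;
}

int main(void)
{
	struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct node *head = &a, *n;

	n = unlink_val(&head, 2);
	insert_at(&head, n);		/* splice it back at the front */
	for (n = head; n; n = n->next)
		printf("%d ", n->val);
	printf("\n");			/* prints: 2 1 3 */
	return 0;
}

Remembering the link instead of the node also removes the head-vs-interior special case, which is why the kernel change can initialize last_prim to ifap unconditionally.
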
KEYLENGTH - fa->fa_slen, tb->tb_id, &info, 0); hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 0c9e768e5628..418e5fb58fd3 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -353,8 +353,9 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) struct flowi4 fl4; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; - unsigned int size = mtu; + unsigned int size; + size = min(mtu, IP_MAX_MTU); while (1) { skb = alloc_skb(size + hlen + tlen, GFP_ATOMIC | __GFP_NOWARN); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index aeebe8816689..394a498c2823 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1145,7 +1145,6 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, if (newsk) { struct inet_connection_sock *newicsk = inet_csk(newsk); - newsk->sk_wait_pending = 0; inet_sk_set_state(newsk, TCP_SYN_RECV); newicsk->icsk_bind_hash = NULL; newicsk->icsk_bind2_hash = NULL; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 7876b7d703cb..598c1b114d2c 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -149,8 +149,14 @@ static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2, const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family != tb2->family) - return false; + if (sk->sk_family != tb2->family) { + if (sk->sk_family == AF_INET) + return ipv6_addr_v4mapped(&tb2->v6_rcv_saddr) && + tb2->v6_rcv_saddr.s6_addr32[3] == sk->sk_rcv_saddr; + + return ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr) && + sk->sk_v6_rcv_saddr.s6_addr32[3] == tb2->rcv_saddr; + } if (sk->sk_family == AF_INET6) return ipv6_addr_equal(&tb2->v6_rcv_saddr, @@ -815,41 +821,33 @@ static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb, const struct net *net, unsigned short port, int l3mdev, const struct sock *sk) { -#if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family != tb->family) + if (!net_eq(ib2_net(tb), net) || tb->port != port || + tb->l3mdev != l3mdev) return false; - if (sk->sk_family == AF_INET6) - return net_eq(ib2_net(tb), net) && tb->port == port && - tb->l3mdev == l3mdev && - ipv6_addr_equal(&tb->v6_rcv_saddr, &sk->sk_v6_rcv_saddr); - else -#endif - return net_eq(ib2_net(tb), net) && tb->port == port && - tb->l3mdev == l3mdev && tb->rcv_saddr == sk->sk_rcv_saddr; + return inet_bind2_bucket_addr_match(tb, sk); } bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net, unsigned short port, int l3mdev, const struct sock *sk) { + if (!net_eq(ib2_net(tb), net) || tb->port != port || + tb->l3mdev != l3mdev) + return false; + #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family != tb->family) { if (sk->sk_family == AF_INET) - return net_eq(ib2_net(tb), net) && tb->port == port && - tb->l3mdev == l3mdev && - ipv6_addr_any(&tb->v6_rcv_saddr); + return ipv6_addr_any(&tb->v6_rcv_saddr) || + ipv6_addr_v4mapped_any(&tb->v6_rcv_saddr); return false; } if (sk->sk_family == AF_INET6) - return net_eq(ib2_net(tb), net) && tb->port == port && - tb->l3mdev == l3mdev && - ipv6_addr_any(&tb->v6_rcv_saddr); - else + return ipv6_addr_any(&tb->v6_rcv_saddr); #endif - return net_eq(ib2_net(tb), net) && tb->port == port && - tb->l3mdev == l3mdev && tb->rcv_saddr == 0; + return tb->rcv_saddr == 0; } /* The socket's bhash2 hashbucket spinlock must be held when this is called */ diff --git a/net/ipv4/ip_fragment.c 
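
The inet_bind2_bucket_addr_match() hunk above teaches the bucket comparison that a v4-mapped IPv6 address (::ffff:a.b.c.d) and the plain IPv4 address name the same endpoint. At the byte level the mapping is ten zero bytes, two 0xff bytes, then the IPv4 address; a self-contained sketch of that check, re-deriving what the kernel's ipv6_addr_v4mapped() helper encapsulates:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int is_v4mapped(const uint8_t v6[16])
{
	static const uint8_t prefix[12] =
		{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff };

	return memcmp(v6, prefix, sizeof(prefix)) == 0;
}

static int v6_matches_v4(const uint8_t v6[16], uint32_t v4_be)
{
	/* the IPv4 address sits in the last four bytes of the mapping */
	return is_v4mapped(v6) && memcmp(v6 + 12, &v4_be, 4) == 0;
}

int main(void)
{
	/* ::ffff:192.0.2.1 */
	uint8_t addr[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff,
			     192, 0, 2, 1 };
	uint8_t quad[4] = { 192, 0, 2, 1 };
	uint32_t v4_be;

	memcpy(&v4_be, quad, 4);	/* network byte order on any host */
	printf("match: %d\n", v6_matches_v4(addr, v4_be));
	return 0;
}

Without this equivalence, an AF_INET socket and an AF_INET6 socket bound to the mapped form would land in different-looking buckets for what is the same address and port.
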
b/net/ipv4/ip_fragment.c index 69c00ffdcf3e..a4941f53b523 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -615,7 +615,8 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net) table[2].data = &net->ipv4.fqdir->timeout; table[3].data = &net->ipv4.fqdir->max_dist; - hdr = register_net_sysctl(net, "net/ipv4", table); + hdr = register_net_sysctl_sz(net, "net/ipv4", table, + ARRAY_SIZE(ip4_frags_ns_ctl_table)); if (!hdr) goto err_reg; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6a3f57a3fa41..b214b5a2e045 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1213,6 +1213,7 @@ EXPORT_INDIRECT_CALLABLE(ipv4_dst_check); static void ipv4_send_dest_unreach(struct sk_buff *skb) { + struct net_device *dev; struct ip_options opt; int res; @@ -1230,7 +1231,8 @@ static void ipv4_send_dest_unreach(struct sk_buff *skb) opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); rcu_read_lock(); - res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL); + dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev; + res = __ip_options_compile(dev_net(dev), &opt, skb, NULL); rcu_read_unlock(); if (res) @@ -3415,6 +3417,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fa->fa_type == fri.type) { fri.offload = READ_ONCE(fa->offload); fri.trap = READ_ONCE(fa->trap); + fri.offload_failed = + READ_ONCE(fa->offload_failed); break; } } @@ -3593,6 +3597,7 @@ static struct ctl_table ipv4_route_netns_table[] = { static __net_init int sysctl_route_net_init(struct net *net) { struct ctl_table *tbl; + size_t table_size = ARRAY_SIZE(ipv4_route_netns_table); tbl = ipv4_route_netns_table; if (!net_eq(net, &init_net)) { @@ -3604,8 +3609,10 @@ static __net_init int sysctl_route_net_init(struct net *net) /* Don't export non-whitelisted sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) { - if (tbl[0].procname != ipv4_route_flush_procname) + if (tbl[0].procname != ipv4_route_flush_procname) { tbl[0].procname = NULL; + table_size = 0; + } } /* Update the variables to point into the current struct net @@ -3616,7 +3623,8 @@ static __net_init int sysctl_route_net_init(struct net *net) } tbl[0].extra1 = net; - net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl); + net->ipv4.route_hdr = register_net_sysctl_sz(net, "net/ipv4/route", + tbl, table_size); if (!net->ipv4.route_hdr) goto err_reg; return 0; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 2afb0870648b..6ac890b4073f 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1519,7 +1519,8 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) } } - net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table); + net->ipv4.ipv4_hdr = register_net_sysctl_sz(net, "net/ipv4", table, + ARRAY_SIZE(ipv4_net_table)); if (!net->ipv4.ipv4_hdr) goto err_reg; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index cc4b250262c1..d3456cf840de 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -831,7 +831,9 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, */ if (!skb_queue_empty(&sk->sk_receive_queue)) break; - sk_wait_data(sk, &timeo, NULL); + ret = sk_wait_data(sk, &timeo, NULL); + if (ret < 0) + break; if (signal_pending(current)) { ret = sock_intr_errno(timeo); break; @@ -1621,16 +1623,13 @@ EXPORT_SYMBOL(tcp_read_sock); int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { - struct tcp_sock *tp = tcp_sk(sk); - u32 seq = tp->copied_seq; struct sk_buff *skb; int copied = 0; - u32 offset; if (sk->sk_state 
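
A recurring change in this series is converting register_net_sysctl() calls to register_net_sysctl_sz() with an explicit element count, sometimes a truncated one (table_size = 0 when the sysctls are hidden from unprivileged namespaces). A sketch of why an explicit size composes better than relying on a NULL-procname sentinel alone; register_table() below is a stand-in, not the kernel helper:

#include <stdio.h>

struct ctl_entry {
	const char *procname;	/* NULL historically served as terminator */
};

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

static void register_table(const struct ctl_entry *tbl, size_t size)
{
	size_t i;

	/* with an explicit size we stop at 'size' even if no sentinel
	 * follows, so callers can shrink the visible table (down to
	 * zero entries) without rewriting it in place */
	for (i = 0; i < size && tbl[i].procname; i++)
		printf("registered %s\n", tbl[i].procname);
}

int main(void)
{
	struct ctl_entry tbl[] = {
		{ "flush" }, { "gc_timeout" }, { "mtu_expires" },
	};

	register_table(tbl, ARRAY_SIZE(tbl));	/* full table */
	printf("--\n");
	register_table(tbl, 0);			/* hidden table */
	return 0;
}

Passing the size also lets the registration code bound its walk even when a caller forgot the sentinel, which appears to be the defensive motivation behind the kernel API.
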
== TCP_LISTEN) return -ENOTCONN; - while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { + while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { u8 tcp_flags; int used; @@ -1643,13 +1642,10 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) copied = used; break; } - seq += used; copied += used; - if (tcp_flags & TCPHDR_FIN) { - ++seq; + if (tcp_flags & TCPHDR_FIN) break; - } } return copied; } @@ -1742,7 +1738,7 @@ void tcp_update_recv_tstamps(struct sk_buff *skb, } #ifdef CONFIG_MMU -const struct vm_operations_struct tcp_vm_ops = { +static const struct vm_operations_struct tcp_vm_ops = { }; int tcp_mmap(struct file *file, struct socket *sock, @@ -2045,13 +2041,10 @@ static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm, unsigned long address, bool *mmap_locked) { - struct vm_area_struct *vma = NULL; + struct vm_area_struct *vma = lock_vma_under_rcu(mm, address); -#ifdef CONFIG_PER_VMA_LOCK - vma = lock_vma_under_rcu(mm, address); -#endif if (vma) { - if (!vma_is_tcp(vma)) { + if (vma->vm_ops != &tcp_vm_ops) { vma_end_read(vma); return NULL; } @@ -2061,7 +2054,7 @@ static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm, mmap_read_lock(mm); vma = vma_lookup(mm, address); - if (!vma || !vma_is_tcp(vma)) { + if (!vma || vma->vm_ops != &tcp_vm_ops) { mmap_read_unlock(mm); return NULL; } @@ -2451,7 +2444,11 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, __sk_flush_backlog(sk); } else { tcp_cleanup_rbuf(sk, copied); - sk_wait_data(sk, &timeo, last); + err = sk_wait_data(sk, &timeo, last); + if (err < 0) { + err = copied ? : err; + goto out; + } } if ((flags & MSG_PEEK) && @@ -2975,12 +2972,6 @@ int tcp_disconnect(struct sock *sk, int flags) int old_state = sk->sk_state; u32 seq; - /* Deny disconnect if other threads are blocked in sk_wait_event() - * or inet_wait_for_connect(). - */ - if (sk->sk_wait_pending) - return -EBUSY; - if (old_state != TCP_CLOSE) tcp_set_state(sk, TCP_CLOSE); diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 81f0dff69e0b..ba2e92188124 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -222,6 +222,7 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk, int *addr_len) { struct tcp_sock *tcp = tcp_sk(sk); + int peek = flags & MSG_PEEK; u32 seq = tcp->copied_seq; struct sk_psock *psock; int copied = 0; @@ -306,12 +307,15 @@ msg_bytes_ready: } data = tcp_msg_wait_data(sk, psock, timeo); + if (data < 0) + return data; if (data && !sk_psock_queue_empty(psock)) goto msg_bytes_ready; copied = -EAGAIN; } out: - WRITE_ONCE(tcp->copied_seq, seq); + if (!peek) + WRITE_ONCE(tcp->copied_seq, seq); tcp_rcv_space_adjust(sk); if (copied > 0) __tcp_cleanup_rbuf(sk, copied); @@ -349,6 +353,8 @@ msg_bytes_ready: timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); data = tcp_msg_wait_data(sk, psock, timeo); + if (data < 0) + return data; if (data) { if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 06fe1cf645d5..8afb0950a697 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -253,6 +253,19 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) if (unlikely(len > icsk->icsk_ack.rcv_mss + MAX_TCP_OPTION_SPACE)) tcp_gro_dev_warn(sk, skb, len); + /* If the skb has a len of exactly 1*MSS and has the PSH bit + * set then it is likely the end of an application write. So + * more data may not be arriving soon, and yet the data sender + * may be waiting for an ACK if cwnd-bound or using TX zero + * copy. 
So we set ICSK_ACK_PUSHED here so that + * tcp_cleanup_rbuf() will send an ACK immediately if the app + * reads all of the data and is not ping-pong. If len > MSS + * then this logic does not matter (and does not hurt) because + * tcp_cleanup_rbuf() will always ACK immediately if the app + * reads data and there is more than an MSS of unACKed data. + */ + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_PSH) + icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; } else { /* Otherwise, we make more careful check taking into account, * that SACKs block is variable. diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ccfc8bbf7455..bbd85672fda7 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -177,8 +177,7 @@ static void tcp_event_data_sent(struct tcp_sock *tp, } /* Account for an ACK we sent. */ -static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts, - u32 rcv_nxt) +static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt) { struct tcp_sock *tp = tcp_sk(sk); @@ -192,7 +191,7 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts, if (unlikely(rcv_nxt != tp->rcv_nxt)) return; /* Special ACK sent by DCTCP to reflect ECN */ - tcp_dec_quickack_mode(sk, pkts); + tcp_dec_quickack_mode(sk); inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } @@ -1387,7 +1386,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, sk, skb); if (likely(tcb->tcp_flags & TCPHDR_ACK)) - tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt); + tcp_event_ack_sent(sk, rcv_nxt); if (skb->len != tcp_header_size) { tcp_event_data_sent(tp, sk); @@ -2457,6 +2456,7 @@ static int tcp_mtu_probe(struct sock *sk) /* build the payload, and be prepared to abort if this fails. */ if (tcp_clone_payload(sk, nskb, probe_size)) { + tcp_skb_tsorted_anchor_cleanup(nskb); consume_skb(nskb); return -1; } @@ -2788,7 +2788,7 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - u32 timeout, rto_delta_us; + u32 timeout, timeout_us, rto_delta_us; int early_retrans; /* Don't do any loss probe on a Fast Open connection before 3WHS @@ -2812,11 +2812,12 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) * sample is available then probe after TCP_TIMEOUT_INIT. 
*/ if (tp->srtt_us) { - timeout = usecs_to_jiffies(tp->srtt_us >> 2); + timeout_us = tp->srtt_us >> 2; if (tp->packets_out == 1) - timeout += TCP_RTO_MIN; + timeout_us += tcp_rto_min_us(sk); else - timeout += TCP_TIMEOUT_MIN; + timeout_us += TCP_TIMEOUT_MIN_US; + timeout = usecs_to_jiffies(timeout_us); } else { timeout = TCP_TIMEOUT_INIT; } diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index acf4869c5d3b..bba10110fbbc 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -104,7 +104,7 @@ bool tcp_rack_mark_lost(struct sock *sk) tp->rack.advanced = 0; tcp_rack_detect_loss(sk, &timeout); if (timeout) { - timeout = usecs_to_jiffies(timeout) + TCP_TIMEOUT_MIN; + timeout = usecs_to_jiffies(timeout + TCP_TIMEOUT_MIN_US); inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT, timeout, inet_csk(sk)->icsk_rto); } diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index cdcc0f6b4f0a..c33bca2c3841 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -169,7 +169,8 @@ static __net_init int xfrm4_net_sysctl_init(struct net *net) table[0].data = &net->xfrm.xfrm4_dst_ops.gc_thresh; } - hdr = register_net_sysctl(net, "net/ipv4", table); + hdr = register_net_sysctl_sz(net, "net/ipv4", table, + ARRAY_SIZE(xfrm4_policy_table)); if (!hdr) goto err_reg; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 85cdbc252654..0b6ee962c84e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -7135,7 +7135,8 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name, snprintf(path, sizeof(path), "net/ipv6/conf/%s", dev_name); - p->sysctl_header = register_net_sysctl(net, path, table); + p->sysctl_header = register_net_sysctl_sz(net, path, table, + ARRAY_SIZE(addrconf_sysctl)); if (!p->sysctl_header) goto free; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 6d88f5248c1f..93a594a901d1 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -1227,4 +1227,9 @@ struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net) } return table; } + +size_t ipv6_icmp_sysctl_table_size(void) +{ + return ARRAY_SIZE(ipv6_icmp_table_template); +} #endif diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index d13240f13607..b2dd48911c8d 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -87,7 +87,8 @@ static int nf_ct_frag6_sysctl_register(struct net *net) table[2].data = &nf_frag->fqdir->high_thresh; table[2].extra1 = &nf_frag->fqdir->low_thresh; - hdr = register_net_sysctl(net, "net/netfilter", table); + hdr = register_net_sysctl_sz(net, "net/netfilter", table, + ARRAY_SIZE(nf_ct_frag6_sysctl_table)); if (hdr == NULL) goto err_reg; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 5bc8a28e67f9..5ebc47da1000 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -470,7 +470,8 @@ static int __net_init ip6_frags_ns_sysctl_register(struct net *net) table[1].extra2 = &net->ipv6.fqdir->high_thresh; table[2].data = &net->ipv6.fqdir->timeout; - hdr = register_net_sysctl(net, "net/ipv6", table); + hdr = register_net_sysctl_sz(net, "net/ipv6", table, + ARRAY_SIZE(ip6_frags_ns_ctl_table)); if (!hdr) goto err_reg; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 01d6d352850a..9c687b357e6a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6456,6 +6456,15 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) return table; } + +size_t ipv6_route_sysctl_table_size(struct net *net) +{ + /* Don't export 
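
The tcp_schedule_loss_probe() hunk above keeps the whole TLP timeout in microseconds and converts to jiffies once at the end, instead of converting the srtt term first and then adding jiffies-granularity constants. On my reading, tp->srtt_us stores SRTT scaled by 8, so srtt_us >> 2 is 2 * SRTT. A standalone arithmetic sketch under that reading; HZ and the constants below are illustrative, and usecs_to_jiffies() is approximated:

#include <stdint.h>
#include <stdio.h>

#define HZ			250
#define US_PER_JIFFY		(1000000 / HZ)
#define TCP_TIMEOUT_MIN_US	2000	/* 2 ms floor, per the patch naming */

static uint64_t usecs_to_jiffies(uint64_t us)
{
	return (us + US_PER_JIFFY - 1) / US_PER_JIFFY;	/* round up */
}

static uint64_t tlp_timeout_jiffies(uint64_t srtt8_us, int packets_out,
				    uint64_t rto_min_us)
{
	/* srtt8_us holds SRTT scaled by 8, so >> 2 yields 2 * SRTT */
	uint64_t timeout_us = srtt8_us >> 2;

	/* lone in-flight packet: pad with the RTO minimum instead of
	 * the generic timer floor */
	timeout_us += (packets_out == 1) ? rto_min_us : TCP_TIMEOUT_MIN_US;
	return usecs_to_jiffies(timeout_us);
}

int main(void)
{
	/* SRTT = 500 us -> srtt8 = 4000; one packet in flight */
	printf("%llu jiffies\n",
	       (unsigned long long)tlp_timeout_jiffies(4000, 1, 40000));
	return 0;
}

Converting once avoids rounding a sub-jiffy srtt term up to a full jiffy before the minimum is even added, which is the distortion the patch removes.
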
sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + return 1; + + return ARRAY_SIZE(ipv6_route_table_template); +} #endif static int __net_init ip6_route_net_init(struct net *net) diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 94a0a294c6a1..888676163e90 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -275,17 +275,23 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) if (!ipv6_icmp_table) goto out_ipv6_route_table; - net->ipv6.sysctl.hdr = register_net_sysctl(net, "net/ipv6", ipv6_table); + net->ipv6.sysctl.hdr = register_net_sysctl_sz(net, "net/ipv6", + ipv6_table, + ARRAY_SIZE(ipv6_table_template)); if (!net->ipv6.sysctl.hdr) goto out_ipv6_icmp_table; - net->ipv6.sysctl.route_hdr = - register_net_sysctl(net, "net/ipv6/route", ipv6_route_table); + net->ipv6.sysctl.route_hdr = register_net_sysctl_sz(net, + "net/ipv6/route", + ipv6_route_table, + ipv6_route_sysctl_table_size(net)); if (!net->ipv6.sysctl.route_hdr) goto out_unregister_ipv6_table; - net->ipv6.sysctl.icmp_hdr = - register_net_sysctl(net, "net/ipv6/icmp", ipv6_icmp_table); + net->ipv6.sysctl.icmp_hdr = register_net_sysctl_sz(net, + "net/ipv6/icmp", + ipv6_icmp_table, + ipv6_icmp_sysctl_table_size()); if (!net->ipv6.sysctl.icmp_hdr) goto out_unregister_route_table; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 3a88545a265d..44b6949d72b2 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1640,9 +1640,12 @@ process: struct sock *nsk; sk = req->rsk_listener; - drop_reason = tcp_inbound_md5_hash(sk, skb, - &hdr->saddr, &hdr->daddr, - AF_INET6, dif, sdif); + if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) + drop_reason = SKB_DROP_REASON_XFRM_POLICY; + else + drop_reason = tcp_inbound_md5_hash(sk, skb, + &hdr->saddr, &hdr->daddr, + AF_INET6, dif, sdif); if (drop_reason) { sk_drops_add(sk, skb); reqsk_put(req); @@ -1689,6 +1692,7 @@ process: } goto discard_and_relse; } + nf_reset_ct(skb); if (nsk == sk) { reqsk_put(req); tcp_v6_restore_cb(skb); diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 45d0f9a8b28c..42fb6996b077 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -201,7 +201,8 @@ static int __net_init xfrm6_net_sysctl_init(struct net *net) table[0].data = &net->xfrm.xfrm6_dst_ops.gc_thresh; } - hdr = register_net_sysctl(net, "net/ipv6", table); + hdr = register_net_sysctl_sz(net, "net/ipv6", table, + ARRAY_SIZE(xfrm6_policy_table)); if (!hdr) goto err_reg; diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 4580f61426bb..dd1d8ffd5f59 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -930,15 +930,18 @@ partial_message: out_error: kcm_push(kcm); - if (copied && sock->type == SOCK_SEQPACKET) { + if (sock->type == SOCK_SEQPACKET) { /* Wrote some bytes before encountering an * error, return partial success. 
*/ - goto partial_message; - } - - if (head != kcm->seq_skb) + if (copied) + goto partial_message; + if (head != kcm->seq_skb) + kfree_skb(head); + } else { kfree_skb(head); + kcm->seq_skb = NULL; + } err = sk_stream_error(sk, msg->msg_flags, err); diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index ed8ebb6f5909..11f3d375cec0 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -507,7 +507,6 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) */ if (len > INT_MAX - transhdrlen) return -EMSGSIZE; - ulen = len + transhdrlen; /* Mirror BSD error message compatibility */ if (msg->msg_flags & MSG_OOB) @@ -628,6 +627,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) back_from_confirm: lock_sock(sk); + ulen = len + skb_queue_empty(&sk->sk_write_queue) ? transhdrlen : 0; err = ip6_append_data(sk, ip_generic_getfrag, msg, ulen, transhdrlen, &ipc6, &fl6, (struct rt6_info *)dst, diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 45e7a5d9c7d9..0e3a1753a51c 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -566,6 +566,9 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, } err = ieee80211_key_link(key, link, sta); + /* KRACK protection, shouldn't happen but just silently accept key */ + if (err == -EALREADY) + err = 0; out_unlock: mutex_unlock(&local->sta_mtx); @@ -1857,7 +1860,8 @@ static int sta_link_apply_parameters(struct ieee80211_local *local, /* VHT can override some HT caps such as the A-MSDU max length */ if (params->vht_capa) ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, - params->vht_capa, link_sta); + params->vht_capa, NULL, + link_sta); if (params->he_capa) ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index e1900077bc4b..5542c93edfba 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -1072,7 +1072,7 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, &chandef); memcpy(&cap_ie, elems->vht_cap_elem, sizeof(cap_ie)); ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, - &cap_ie, + &cap_ie, NULL, &sta->deflink); if (memcmp(&cap, &sta->sta.deflink.vht_cap, sizeof(cap))) rates_updated |= true; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 06bd406846d2..98ef1fe1226e 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -676,7 +676,7 @@ struct ieee80211_if_mesh { struct timer_list mesh_path_root_timer; unsigned long wrkq_flags; - unsigned long mbss_changed; + unsigned long mbss_changed[64 / BITS_PER_LONG]; bool userspace_handles_dfs; @@ -2141,6 +2141,7 @@ void ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const struct ieee80211_vht_cap *vht_cap_ie, + const struct ieee80211_vht_cap *vht_cap_ie2, struct link_sta_info *link_sta); enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct link_sta_info *link_sta); diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 13050dc9321f..0665ff5e456e 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -802,6 +802,9 @@ static void ieee80211_key_destroy(struct ieee80211_key *key, void ieee80211_key_free_unused(struct ieee80211_key *key) { + if (!key) + return; + WARN_ON(key->sdata || key->local); ieee80211_key_free_common(key); } @@ -854,7 +857,7 @@ int ieee80211_key_link(struct ieee80211_key *key, * can cause warnings to appear. 
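
One detail worth flagging in the l2tp_ip6_sendmsg hunk above: the new ulen computation mixes '+' with '?:' without parentheses. In C the conditional operator binds more loosely than addition, so the expression as written evaluates the sum as the condition. Whether that matches the stated intent (count transhdrlen only when the write queue is empty) is worth verifying against the final tree; a minimal demonstration of the parse:

#include <stdio.h>

int main(void)
{
	int len = 100, first_fragment = 1, transhdrlen = 8;

	/* a + b ? x : y parses as (a + b) ? x : y */
	int v1 = len + first_fragment ? transhdrlen : 0;	/* == 8 */

	/* "a plus (b ? x : y)" needs explicit parentheses */
	int v2 = len + (first_fragment ? transhdrlen : 0);	/* == 108 */

	printf("unparenthesized: %d, parenthesized: %d\n", v1, v2);
	return 0;
}

Since len is nonzero on any real send, the unparenthesized form collapses to transhdrlen alone rather than len plus the conditional term.
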
*/ bool delay_tailroom = sdata->vif.type == NL80211_IFTYPE_STATION; - int ret = -EOPNOTSUPP; + int ret; mutex_lock(&sdata->local->key_mtx); @@ -868,8 +871,10 @@ int ieee80211_key_link(struct ieee80211_key *key, * the same cipher. Enforce the assumption for pairwise keys. */ if ((alt_key && alt_key->conf.cipher != key->conf.cipher) || - (old_key && old_key->conf.cipher != key->conf.cipher)) + (old_key && old_key->conf.cipher != key->conf.cipher)) { + ret = -EOPNOTSUPP; goto out; + } } else if (sta) { struct link_sta_info *link_sta = &sta->deflink; int link_id = key->conf.link_id; @@ -895,8 +900,10 @@ int ieee80211_key_link(struct ieee80211_key *key, /* Non-pairwise keys must also not switch the cipher on rekey */ if (!pairwise) { - if (old_key && old_key->conf.cipher != key->conf.cipher) + if (old_key && old_key->conf.cipher != key->conf.cipher) { + ret = -EOPNOTSUPP; goto out; + } } /* @@ -904,9 +911,8 @@ int ieee80211_key_link(struct ieee80211_key *key, * new version of the key to avoid nonce reuse or replay issues. */ if (ieee80211_key_identical(sdata, old_key, key)) { - ieee80211_key_free_unused(key); - ret = 0; - goto out; + ret = -EALREADY; + goto unlock; } key->local = sdata->local; @@ -930,7 +936,11 @@ int ieee80211_key_link(struct ieee80211_key *key, ieee80211_key_free(key, delay_tailroom); } + key = NULL; + out: + ieee80211_key_free_unused(key); + unlock: mutex_unlock(&sdata->local->key_mtx); return ret; diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index af8c5fc2db14..e31c312c124a 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -1175,7 +1175,7 @@ void ieee80211_mbss_info_change_notify(struct ieee80211_sub_if_data *sdata, /* if we race with running work, worst case this work becomes a noop */ for_each_set_bit(bit, &bits, sizeof(changed) * BITS_PER_BYTE) - set_bit(bit, &ifmsh->mbss_changed); + set_bit(bit, ifmsh->mbss_changed); set_bit(MESH_WORK_MBSS_CHANGED, &ifmsh->wrkq_flags); wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); } @@ -1257,7 +1257,7 @@ void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata) /* clear any mesh work (for next join) we may have accrued */ ifmsh->wrkq_flags = 0; - ifmsh->mbss_changed = 0; + memset(ifmsh->mbss_changed, 0, sizeof(ifmsh->mbss_changed)); local->fif_other_bss--; atomic_dec(&local->iff_allmultis); @@ -1724,9 +1724,9 @@ static void mesh_bss_info_changed(struct ieee80211_sub_if_data *sdata) u32 bit; u64 changed = 0; - for_each_set_bit(bit, &ifmsh->mbss_changed, + for_each_set_bit(bit, ifmsh->mbss_changed, sizeof(changed) * BITS_PER_BYTE) { - clear_bit(bit, &ifmsh->mbss_changed); + clear_bit(bit, ifmsh->mbss_changed); changed |= BIT(bit); } diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index f3d5bb0a59f1..a1e526419e9d 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -451,7 +451,7 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, changed |= IEEE80211_RC_BW_CHANGED; ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, - elems->vht_cap_elem, + elems->vht_cap_elem, NULL, &sta->deflink); ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, elems->he_cap, diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index f93eb38ae0b8..0c9198997482 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -4202,10 +4202,33 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, elems->ht_cap_elem, link_sta); - if (elems->vht_cap_elem && !(link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_VHT)) + if (elems->vht_cap_elem && + 
!(link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_VHT)) { + const struct ieee80211_vht_cap *bss_vht_cap = NULL; + const struct cfg80211_bss_ies *ies; + + /* + * Cisco AP module 9115 with FW 17.3 has a bug and sends a + * too large maximum MPDU length in the association response + * (indicating 12k) that it cannot actually process ... + * Work around that. + */ + rcu_read_lock(); + ies = rcu_dereference(cbss->ies); + if (ies) { + const struct element *elem; + + elem = cfg80211_find_elem(WLAN_EID_VHT_CAPABILITY, + ies->data, ies->len); + if (elem && elem->datalen >= sizeof(*bss_vht_cap)) + bss_vht_cap = (const void *)elem->data; + } + ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, elems->vht_cap_elem, - link_sta); + bss_vht_cap, link_sta); + rcu_read_unlock(); + } if (elems->he_operation && !(link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_HE) && elems->he_cap) { @@ -5107,9 +5130,10 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, continue; valid_links |= BIT(link_id); - if (assoc_data->link[link_id].disabled) { + if (assoc_data->link[link_id].disabled) dormant_links |= BIT(link_id); - } else if (link_id != assoc_data->assoc_link_id) { + + if (link_id != assoc_data->assoc_link_id) { err = ieee80211_sta_allocate_link(sta, link_id); if (err) goto out_err; @@ -5124,7 +5148,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link; struct link_sta_info *link_sta; - if (!cbss || assoc_data->link[link_id].disabled) + if (!cbss) continue; link = sdata_dereference(sdata->link[link_id], sdata); @@ -5429,17 +5453,18 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct ieee80211_link_data *link; - link = sdata_dereference(sdata->link[link_id], sdata); - if (!link) - continue; - if (!assoc_data->link[link_id].bss) continue; resp.links[link_id].bss = assoc_data->link[link_id].bss; - resp.links[link_id].addr = link->conf->addr; + ether_addr_copy(resp.links[link_id].addr, + assoc_data->link[link_id].addr); resp.links[link_id].status = assoc_data->link[link_id].status; + link = sdata_dereference(sdata->link[link_id], sdata); + if (!link) + continue; + /* get uapsd queues configuration - same for all links */ resp.uapsd_queues = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 7fe7280e8437..d45d4be63dd8 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -665,7 +665,8 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) } if (unlikely(tx->key && tx->key->flags & KEY_FLAG_TAINTED && - !ieee80211_is_deauth(hdr->frame_control))) + !ieee80211_is_deauth(hdr->frame_control)) && + tx->skb->protocol != tx->sdata->control_port_protocol) return TX_DROP; if (!skip_hw && tx->key && diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index c1250aa47808..b3a5c3e96a72 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -4,7 +4,7 @@ * * Portions of this file * Copyright(c) 2015 - 2016 Intel Deutschland GmbH - * Copyright (C) 2018 - 2022 Intel Corporation + * Copyright (C) 2018 - 2023 Intel Corporation */ #include <linux/ieee80211.h> @@ -116,12 +116,14 @@ void ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const struct ieee80211_vht_cap *vht_cap_ie, + const struct ieee80211_vht_cap *vht_cap_ie2, struct link_sta_info *link_sta) { struct ieee80211_sta_vht_cap *vht_cap = &link_sta->pub->vht_cap; struct 
ieee80211_sta_vht_cap own_cap; u32 cap_info, i; bool have_80mhz; + u32 mpdu_len; memset(vht_cap, 0, sizeof(*vht_cap)); @@ -318,10 +320,20 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, link_sta->pub->bandwidth = ieee80211_sta_cur_vht_bw(link_sta); /* + * Work around the Cisco 9115 FW 17.3 bug by taking the min of + * both reported MPDU lengths. + */ + mpdu_len = vht_cap->cap & IEEE80211_VHT_CAP_MAX_MPDU_MASK; + if (vht_cap_ie2) + mpdu_len = min_t(u32, mpdu_len, + le32_get_bits(vht_cap_ie2->vht_cap_info, + IEEE80211_VHT_CAP_MAX_MPDU_MASK)); + + /* * FIXME - should the amsdu len be per link? store per link * and maintain a minimum? */ - switch (vht_cap->cap & IEEE80211_VHT_CAP_MAX_MPDU_MASK) { + switch (mpdu_len) { case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454: link_sta->pub->agg.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_11454; break; diff --git a/net/mctp/route.c b/net/mctp/route.c index ab62fe447038..7a47a58aa54b 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -737,6 +737,8 @@ struct mctp_route *mctp_route_lookup(struct net *net, unsigned int dnet, { struct mctp_route *tmp, *rt = NULL; + rcu_read_lock(); + list_for_each_entry_rcu(tmp, &net->mctp.routes, list) { /* TODO: add metrics */ if (mctp_rt_match_eid(tmp, dnet, daddr)) { @@ -747,21 +749,29 @@ struct mctp_route *mctp_route_lookup(struct net *net, unsigned int dnet, } } + rcu_read_unlock(); + return rt; } static struct mctp_route *mctp_route_lookup_null(struct net *net, struct net_device *dev) { - struct mctp_route *rt; + struct mctp_route *tmp, *rt = NULL; - list_for_each_entry_rcu(rt, &net->mctp.routes, list) { - if (rt->dev->dev == dev && rt->type == RTN_LOCAL && - refcount_inc_not_zero(&rt->refs)) - return rt; + rcu_read_lock(); + + list_for_each_entry_rcu(tmp, &net->mctp.routes, list) { + if (tmp->dev->dev == dev && tmp->type == RTN_LOCAL && + refcount_inc_not_zero(&tmp->refs)) { + rt = tmp; + break; + } } - return NULL; + rcu_read_unlock(); + + return rt; } static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb, diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index bf6e81d56263..1af29af65388 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1419,7 +1419,8 @@ static int mpls_dev_sysctl_register(struct net_device *dev, snprintf(path, sizeof(path), "net/mpls/conf/%s", dev->name); - mdev->sysctl = register_net_sysctl(net, path, table); + mdev->sysctl = register_net_sysctl_sz(net, path, table, + ARRAY_SIZE(mpls_dev_table)); if (!mdev->sysctl) goto free; @@ -2689,7 +2690,8 @@ static int mpls_net_init(struct net *net) for (i = 0; i < ARRAY_SIZE(mpls_table) - 1; i++) table[i].data = (char *)net + (uintptr_t)table[i].data; - net->mpls.ctl = register_net_sysctl(net, "net/mpls", table); + net->mpls.ctl = register_net_sysctl_sz(net, "net/mpls", table, + ARRAY_SIZE(mpls_table)); if (net->mpls.ctl == NULL) { kfree(table); return -ENOMEM; diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index c46c22a84d23..e72b518c5d02 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -164,7 +164,8 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) table[5].data = &pernet->pm_type; table[6].data = &pernet->scheduler; - hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table); + hdr = register_net_sysctl_sz(net, MPTCP_SYSCTL_PATH, table, + ARRAY_SIZE(mptcp_sysctl_table)); if (!hdr) goto err_reg; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index c254accb14de..cd15ec73073e 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -1269,12 
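
The vht.c hunk above clamps the advertised maximum MPDU length to the minimum of the association-response IE and a second capability IE (the Cisco 9115 workaround). A sketch of that extract-and-min step; get_field() stands in for the kernel's le32_get_bits()/FIELD_GET(), and the mask value is illustrative:

#include <stdint.h>
#include <stdio.h>

#define MAX_MPDU_MASK 0x00000003u	/* low two bits, as an example */

/* extract a bitfield: shift down by the mask's trailing zero count;
 * mask must be nonzero */
static uint32_t get_field(uint32_t word, uint32_t mask)
{
	unsigned int shift = 0;

	while (!((mask >> shift) & 1))
		shift++;
	return (word & mask) >> shift;
}

static uint32_t min_u32(uint32_t a, uint32_t b)
{
	return a < b ? a : b;
}

int main(void)
{
	uint32_t assoc_cap = 0x2;	/* claims the largest MPDU class */
	uint32_t beacon_cap = 0x1;	/* second IE advertises a smaller one */
	uint32_t mpdu_len;

	mpdu_len = min_u32(get_field(assoc_cap, MAX_MPDU_MASK),
			   get_field(beacon_cap, MAX_MPDU_MASK));
	printf("effective mpdu class: %u\n", mpdu_len);
	return 0;
}

Taking the minimum of the two advertisements is the conservative choice: a peer that can really handle the larger size only loses some efficiency, while a peer that cannot is no longer sent frames it would drop.
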
+1269,13 @@ static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th) if (rcv_wnd == rcv_wnd_old) break; - if (before64(rcv_wnd_new, rcv_wnd)) { + + rcv_wnd_old = rcv_wnd; + if (before64(rcv_wnd_new, rcv_wnd_old)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICTUPDATE); goto raise_win; } MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICT); - rcv_wnd_old = rcv_wnd; } return; } diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index b5a8aa4c1ebd..d042d32beb4d 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -307,12 +307,6 @@ int mptcp_nl_cmd_sf_create(struct sk_buff *skb, struct genl_info *info) goto create_err; } - if (addr_l.id == 0) { - NL_SET_ERR_MSG_ATTR(info->extack, laddr, "missing local addr id"); - err = -EINVAL; - goto create_err; - } - err = mptcp_pm_parse_addr(raddr, info, &addr_r); if (err < 0) { NL_SET_ERR_MSG_ATTR(info->extack, raddr, "error parsing remote addr"); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index a7fc16f5175d..d1902373c974 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -405,7 +405,7 @@ drop: return false; } -static void mptcp_stop_timer(struct sock *sk) +static void mptcp_stop_rtx_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -770,6 +770,46 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk) return moved; } +static bool __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk) +{ + int err = sock_error(ssk); + int ssk_state; + + if (!err) + return false; + + /* only propagate errors on fallen-back sockets or + * on MPC connect + */ + if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(mptcp_sk(sk))) + return false; + + /* We need to propagate only transition to CLOSE state. + * Orphaned socket will see such state change via + * subflow_sched_work_if_closed() and that path will properly + * destroy the msk as needed. + */ + ssk_state = inet_sk_state_load(ssk); + if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD)) + inet_sk_state_store(sk, ssk_state); + WRITE_ONCE(sk->sk_err, -err); + + /* This barrier is coupled with smp_rmb() in mptcp_poll() */ + smp_wmb(); + sk_error_report(sk); + return true; +} + +void __mptcp_error_report(struct sock *sk) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk = mptcp_sk(sk); + + mptcp_for_each_subflow(msk, subflow) + if (__mptcp_subflow_error_report(sk, mptcp_subflow_tcp_sock(subflow))) + break; +} + /* In most cases we will be able to lock the mptcp socket. If its already * owned, we need to defer to the work queue to avoid ABBA deadlock. 
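[Editor's note] The __mptcp_subflow_error_report() helper added above publishes the subflow error with WRITE_ONCE() and then issues smp_wmb() before sk_error_report(); the patch comment notes the barrier pairs with smp_rmb() in mptcp_poll(). Below is a minimal userspace sketch of that publish-then-flag ordering, with C11 release/acquire atomics standing in for the kernel barriers; the structure and function names are illustrative, not kernel API.

#include <stdatomic.h>
#include <stdio.h>

struct fake_sock {
	int sk_err;          /* plain store, ordered by the release below */
	atomic_int reported; /* stands in for the wakeup/flag side */
};

static void publish_error(struct fake_sock *sk, int err)
{
	sk->sk_err = err; /* like WRITE_ONCE(sk->sk_err, -err) */
	/* release pairing plays the role of smp_wmb() + sk_error_report() */
	atomic_store_explicit(&sk->reported, 1, memory_order_release);
}

static int observe_error(struct fake_sock *sk)
{
	/* acquire plays the role of the smp_rmb() in mptcp_poll() */
	if (atomic_load_explicit(&sk->reported, memory_order_acquire))
		return sk->sk_err; /* guaranteed to see the store above */
	return 0;
}

int main(void)
{
	struct fake_sock sk = { 0, 0 };

	publish_error(&sk, 104 /* ECONNRESET */);
	printf("err=%d\n", observe_error(&sk));
	return 0;
}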
*/ @@ -852,6 +892,7 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk) mptcp_subflow_ctx(ssk)->subflow_id = msk->subflow_id++; mptcp_sockopt_sync_locked(msk, ssk); mptcp_subflow_joined(msk, ssk); + mptcp_stop_tout_timer(sk); return true; } @@ -871,12 +912,12 @@ static void __mptcp_flush_join_list(struct sock *sk, struct list_head *join_list } } -static bool mptcp_timer_pending(struct sock *sk) +static bool mptcp_rtx_timer_pending(struct sock *sk) { return timer_pending(&inet_csk(sk)->icsk_retransmit_timer); } -static void mptcp_reset_timer(struct sock *sk) +static void mptcp_reset_rtx_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); unsigned long tout; @@ -1010,10 +1051,10 @@ static void __mptcp_clean_una(struct sock *sk) out: if (snd_una == READ_ONCE(msk->snd_nxt) && snd_una == READ_ONCE(msk->write_seq)) { - if (mptcp_timer_pending(sk) && !mptcp_data_fin_enabled(msk)) - mptcp_stop_timer(sk); + if (mptcp_rtx_timer_pending(sk) && !mptcp_data_fin_enabled(msk)) + mptcp_stop_rtx_timer(sk); } else { - mptcp_reset_timer(sk); + mptcp_reset_rtx_timer(sk); } } @@ -1586,8 +1627,8 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags) mptcp_push_release(ssk, &info); /* ensure the rtx timer is running */ - if (!mptcp_timer_pending(sk)) - mptcp_reset_timer(sk); + if (!mptcp_rtx_timer_pending(sk)) + mptcp_reset_rtx_timer(sk); if (do_check_data_fin) mptcp_check_send_data_fin(sk); } @@ -1650,8 +1691,8 @@ out: if (copied) { tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); - if (!mptcp_timer_pending(sk)) - mptcp_reset_timer(sk); + if (!mptcp_rtx_timer_pending(sk)) + mptcp_reset_rtx_timer(sk); if (msk->snd_data_fin_enable && msk->snd_nxt + 1 == msk->write_seq) @@ -2220,7 +2261,7 @@ static void mptcp_retransmit_timer(struct timer_list *t) sock_put(sk); } -static void mptcp_timeout_timer(struct timer_list *t) +static void mptcp_tout_timer(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); @@ -2329,18 +2370,14 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, bool dispose_it, need_push = false; /* If the first subflow moved to a close state before accept, e.g. due - * to an incoming reset, mptcp either: - * - if either the subflow or the msk are dead, destroy the context - * (the subflow socket is deleted by inet_child_forget) and the msk - * - otherwise do nothing at the moment and take action at accept and/or - * listener shutdown - user-space must be able to accept() the closed - * socket. + * to an incoming reset or listener shutdown, the subflow socket is + * already deleted by inet_child_forget() and the mptcp socket can't + * survive too. */ - if (msk->in_accept_queue && msk->first == ssk) { - if (!sock_flag(sk, SOCK_DEAD) && !sock_flag(ssk, SOCK_DEAD)) - return; - + if (msk->in_accept_queue && msk->first == ssk && + (sock_flag(sk, SOCK_DEAD) || sock_flag(ssk, SOCK_DEAD))) { /* ensure later check in mptcp_worker() will dispose the msk */ + mptcp_set_close_tout(sk, tcp_jiffies32 - (TCP_TIMEWAIT_LEN + 1)); sock_set_flag(sk, SOCK_DEAD); lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); mptcp_subflow_drop_ctx(ssk); @@ -2392,6 +2429,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, } out_release: + __mptcp_subflow_error_report(sk, ssk); release_sock(ssk); sock_put(ssk); @@ -2402,6 +2440,22 @@ out_release: out: if (need_push) __mptcp_push_pending(sk, 0); + + /* Catch every 'all subflows closed' scenario, including peers silently + * closing them, e.g. due to timeout. 
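[Editor's note] mptcp_tout_timer(), the renamed timeout callback above, recovers the socket from the expired timer with from_timer(), which is container_of() underneath: subtract the member's offset from the member pointer to get back the enclosing object. A self-contained userspace sketch of that recovery with simplified stand-in structures; unlike the kernel macro, this version does no type checking.

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct timer {
	long expires;
};

struct msock {
	int state;
	struct timer sk_timer; /* embedded, like sk->sk_timer */
};

static void tout_timer_cb(struct timer *t)
{
	/* the moral equivalent of: sk = from_timer(sk, t, sk_timer) */
	struct msock *sk = container_of(t, struct msock, sk_timer);

	printf("timeout fired, state=%d\n", sk->state);
}

int main(void)
{
	struct msock sk = { .state = 1, .sk_timer = { 0 } };

	tout_timer_cb(&sk.sk_timer);
	return 0;
}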
+ * For established sockets, allow an additional timeout before closing, + * as the protocol can still create more subflows. + */ + if (list_is_singular(&msk->conn_list) && msk->first && + inet_sk_state_load(msk->first) == TCP_CLOSE) { + if (sk->sk_state != TCP_ESTABLISHED || + msk->in_accept_queue || sock_flag(sk, SOCK_DEAD)) { + inet_sk_state_store(sk, TCP_CLOSE); + mptcp_close_wake_up(sk); + } else { + mptcp_start_tout_timer(sk); + } + } } void mptcp_close_ssk(struct sock *sk, struct sock *ssk, @@ -2445,23 +2499,14 @@ static void __mptcp_close_subflow(struct sock *sk) } -static bool mptcp_should_close(const struct sock *sk) +static bool mptcp_close_tout_expired(const struct sock *sk) { - s32 delta = tcp_jiffies32 - inet_csk(sk)->icsk_mtup.probe_timestamp; - struct mptcp_subflow_context *subflow; - - if (delta >= TCP_TIMEWAIT_LEN || mptcp_sk(sk)->in_accept_queue) - return true; + if (!inet_csk(sk)->icsk_mtup.probe_timestamp || + sk->sk_state == TCP_CLOSE) + return false; - /* if all subflows are in closed status don't bother with additional - * timeout - */ - mptcp_for_each_subflow(mptcp_sk(sk), subflow) { - if (inet_sk_state_load(mptcp_subflow_tcp_sock(subflow)) != - TCP_CLOSE) - return false; - } - return true; + return time_after32(tcp_jiffies32, + inet_csk(sk)->icsk_mtup.probe_timestamp + TCP_TIMEWAIT_LEN); } static void mptcp_check_fastclose(struct mptcp_sock *msk) @@ -2588,27 +2633,28 @@ static void __mptcp_retrans(struct sock *sk) reset_timer: mptcp_check_and_set_pending(sk); - if (!mptcp_timer_pending(sk)) - mptcp_reset_timer(sk); + if (!mptcp_rtx_timer_pending(sk)) + mptcp_reset_rtx_timer(sk); } /* schedule the timeout timer for the relevant event: either close timeout * or mp_fail timeout. The close timeout takes precedence on the mp_fail one */ -void mptcp_reset_timeout(struct mptcp_sock *msk, unsigned long fail_tout) +void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout) { struct sock *sk = (struct sock *)msk; unsigned long timeout, close_timeout; - if (!fail_tout && !sock_flag(sk, SOCK_DEAD)) + if (!fail_tout && !inet_csk(sk)->icsk_mtup.probe_timestamp) return; - close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies + TCP_TIMEWAIT_LEN; + close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies + + TCP_TIMEWAIT_LEN; /* the close timeout takes precedence on the fail one, and here at least one of * them is active */ - timeout = sock_flag(sk, SOCK_DEAD) ? close_timeout : fail_tout; + timeout = inet_csk(sk)->icsk_mtup.probe_timestamp ? 
close_timeout : fail_tout; sk_reset_timer(sk, &sk->sk_timer, timeout); } @@ -2627,8 +2673,6 @@ static void mptcp_mp_fail_no_response(struct mptcp_sock *msk) mptcp_subflow_reset(ssk); WRITE_ONCE(mptcp_subflow_ctx(ssk)->fail_tout, 0); unlock_sock_fast(ssk, slow); - - mptcp_reset_timeout(msk, 0); } static void mptcp_do_fastclose(struct sock *sk) @@ -2665,18 +2709,14 @@ static void mptcp_worker(struct work_struct *work) if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) __mptcp_close_subflow(sk); - /* There is no point in keeping around an orphaned sk timedout or - * closed, but we need the msk around to reply to incoming DATA_FIN, - * even if it is orphaned and in FIN_WAIT2 state - */ - if (sock_flag(sk, SOCK_DEAD)) { - if (mptcp_should_close(sk)) - mptcp_do_fastclose(sk); + if (mptcp_close_tout_expired(sk)) { + mptcp_do_fastclose(sk); + mptcp_close_wake_up(sk); + } - if (sk->sk_state == TCP_CLOSE) { - __mptcp_destroy_sock(sk); - goto unlock; - } + if (sock_flag(sk, SOCK_DEAD) && sk->sk_state == TCP_CLOSE) { + __mptcp_destroy_sock(sk); + goto unlock; } if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) @@ -2717,7 +2757,7 @@ static void __mptcp_init_sock(struct sock *sk) /* re-use the csk retrans timer for MPTCP-level retrans */ timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0); - timer_setup(&sk->sk_timer, mptcp_timeout_timer, 0); + timer_setup(&sk->sk_timer, mptcp_tout_timer, 0); } static void mptcp_ca_reset(struct sock *sk) @@ -2808,8 +2848,8 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) } else { pr_debug("Sending DATA_FIN on subflow %p", ssk); tcp_send_ack(ssk); - if (!mptcp_timer_pending(sk)) - mptcp_reset_timer(sk); + if (!mptcp_rtx_timer_pending(sk)) + mptcp_reset_rtx_timer(sk); } break; } @@ -2892,7 +2932,7 @@ static void __mptcp_destroy_sock(struct sock *sk) might_sleep(); - mptcp_stop_timer(sk); + mptcp_stop_rtx_timer(sk); sk_stop_timer(sk, &sk->sk_timer); msk->pm.status = 0; mptcp_release_sched(msk); @@ -2975,7 +3015,6 @@ bool __mptcp_close(struct sock *sk, long timeout) cleanup: /* orphan all the subflows */ - inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast_nested(ssk); @@ -3012,7 +3051,7 @@ cleanup: __mptcp_destroy_sock(sk); do_cancel_work = true; } else { - mptcp_reset_timeout(msk, 0); + mptcp_start_tout_timer(sk); } return do_cancel_work; @@ -3059,12 +3098,6 @@ static int mptcp_disconnect(struct sock *sk, int flags) { struct mptcp_sock *msk = mptcp_sk(sk); - /* Deny disconnect if other threads are blocked in sk_wait_event() - * or inet_wait_for_connect(). - */ - if (sk->sk_wait_pending) - return -EBUSY; - /* We are on the fastopen error path. We can't call straight into the * subflows cleanup code due to lock nesting (we are already under * msk->firstsocket lock). 
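[Editor's note] mptcp_close_tout_expired() above compares tcp_jiffies32 against the stored close timestamp with time_after32(), whose signed-difference trick stays correct across 32-bit wraparound as long as the two stamps are within 2^31 ticks of each other. A standalone demonstration; the macro is reimplemented here from its usual kernel definition.

#include <stdint.h>
#include <stdio.h>

/* kernel: time_after32(a, b) is ((s32)((u32)(b) - (u32)(a)) < 0) */
static int time_after32(uint32_t a, uint32_t b)
{
	return (int32_t)(b - a) < 0; /* true if a is later than b */
}

int main(void)
{
	uint32_t before_wrap = 0xfffffff0u; /* just before the counter wraps */
	uint32_t after_wrap = 0x00000010u;  /* 0x20 ticks later, post-wrap */

	/* the naive comparison gets this wrong, the signed delta does not */
	printf("naive: %d, time_after32: %d\n",
	       after_wrap > before_wrap,
	       time_after32(after_wrap, before_wrap));
	return 0;
}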
@@ -3075,8 +3108,8 @@ static int mptcp_disconnect(struct sock *sk, int flags) mptcp_check_listen_stop(sk); inet_sk_state_store(sk, TCP_CLOSE); - mptcp_stop_timer(sk); - sk_stop_timer(sk, &sk->sk_timer); + mptcp_stop_rtx_timer(sk); + mptcp_stop_tout_timer(sk); if (msk->token) mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL); @@ -3134,7 +3167,6 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk, inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk); #endif - nsk->sk_wait_pending = 0; __mptcp_init_sock(nsk); msk = mptcp_sk(nsk); @@ -3386,24 +3418,21 @@ static void schedule_3rdack_retransmission(struct sock *ssk) sk_reset_timer(ssk, &icsk->icsk_delack_timer, timeout); } -void mptcp_subflow_process_delegated(struct sock *ssk) +void mptcp_subflow_process_delegated(struct sock *ssk, long status) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = subflow->conn; - if (test_bit(MPTCP_DELEGATE_SEND, &subflow->delegated_status)) { + if (status & BIT(MPTCP_DELEGATE_SEND)) { mptcp_data_lock(sk); if (!sock_owned_by_user(sk)) __mptcp_subflow_push_pending(sk, ssk, true); else __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags); mptcp_data_unlock(sk); - mptcp_subflow_delegated_done(subflow, MPTCP_DELEGATE_SEND); } - if (test_bit(MPTCP_DELEGATE_ACK, &subflow->delegated_status)) { + if (status & BIT(MPTCP_DELEGATE_ACK)) schedule_3rdack_retransmission(ssk); - mptcp_subflow_delegated_done(subflow, MPTCP_DELEGATE_ACK); - } } static int mptcp_hash(struct sock *sk) @@ -3929,14 +3958,17 @@ static int mptcp_napi_poll(struct napi_struct *napi, int budget) struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bh_lock_sock_nested(ssk); - if (!sock_owned_by_user(ssk) && - mptcp_subflow_has_delegated_action(subflow)) - mptcp_subflow_process_delegated(ssk); - /* ... elsewhere tcp_release_cb_override already processed - * the action or will do at next release_sock(). - * In both case must dequeue the subflow here - on the same - * CPU that scheduled it. - */ + if (!sock_owned_by_user(ssk)) { + mptcp_subflow_process_delegated(ssk, xchg(&subflow->delegated_status, 0)); + } else { + /* tcp_release_cb_override already processed + * the action or will do at next release_sock(). + * In both case must dequeue the subflow here - on the same + * CPU that scheduled it. 
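[Editor's note] In the mptcp_napi_poll() hunk above, the poller consumes every queued delegated action in one step with xchg(&subflow->delegated_status, 0) and hands the old bits to mptcp_subflow_process_delegated(). A userspace analog of that grab-and-clear idiom built on atomic_exchange(); the bit values mirror the renumbered MPTCP_DELEGATE_* constants, the rest is illustrative.

#include <stdatomic.h>
#include <stdio.h>

#define ACT_SEND (1L << 1) /* cf. MPTCP_DELEGATE_SEND */
#define ACT_ACK  (1L << 2) /* cf. MPTCP_DELEGATE_ACK */

static atomic_long pending;

static void process(long status)
{
	if (status & ACT_SEND)
		puts("push pending data");
	if (status & ACT_ACK)
		puts("schedule the 3rd ACK retransmission");
}

int main(void)
{
	atomic_fetch_or(&pending, ACT_SEND | ACT_ACK);

	/* grab all queued actions and clear the word in one atomic step,
	 * so a concurrent scheduler can immediately queue new work */
	long status = atomic_exchange(&pending, 0);

	process(status);
	return 0;
}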
+ */ + smp_wmb(); + clear_bit(MPTCP_DELEGATE_SCHEDULED, &subflow->delegated_status); + } bh_unlock_sock(ssk); sock_put(ssk); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 7254b3562575..3612545fa62e 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -444,9 +444,11 @@ struct mptcp_delegated_action { DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); -#define MPTCP_DELEGATE_SEND 0 -#define MPTCP_DELEGATE_ACK 1 +#define MPTCP_DELEGATE_SCHEDULED 0 +#define MPTCP_DELEGATE_SEND 1 +#define MPTCP_DELEGATE_ACK 2 +#define MPTCP_DELEGATE_ACTIONS_MASK (~BIT(MPTCP_DELEGATE_SCHEDULED)) /* MPTCP subflow context */ struct mptcp_subflow_context { struct list_head node;/* conn_list of subflows */ @@ -564,23 +566,24 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow) return subflow->map_seq + mptcp_subflow_get_map_offset(subflow); } -void mptcp_subflow_process_delegated(struct sock *ssk); +void mptcp_subflow_process_delegated(struct sock *ssk, long actions); static inline void mptcp_subflow_delegate(struct mptcp_subflow_context *subflow, int action) { + long old, set_bits = BIT(MPTCP_DELEGATE_SCHEDULED) | BIT(action); struct mptcp_delegated_action *delegated; bool schedule; /* the caller held the subflow bh socket lock */ lockdep_assert_in_softirq(); - /* The implied barrier pairs with mptcp_subflow_delegated_done(), and - * ensures the below list check sees list updates done prior to status - * bit changes + /* The implied barrier pairs with tcp_release_cb_override() + * mptcp_napi_poll(), and ensures the below list check sees list + * updates done prior to delegated status bits changes */ - if (!test_and_set_bit(action, &subflow->delegated_status)) { - /* still on delegated list from previous scheduling */ - if (!list_empty(&subflow->delegated_node)) + old = set_mask_bits(&subflow->delegated_status, 0, set_bits); + if (!(old & BIT(MPTCP_DELEGATE_SCHEDULED))) { + if (WARN_ON_ONCE(!list_empty(&subflow->delegated_node))) return; delegated = this_cpu_ptr(&mptcp_delegated_actions); @@ -605,20 +608,6 @@ mptcp_subflow_delegated_next(struct mptcp_delegated_action *delegated) return ret; } -static inline bool mptcp_subflow_has_delegated_action(const struct mptcp_subflow_context *subflow) -{ - return !!READ_ONCE(subflow->delegated_status); -} - -static inline void mptcp_subflow_delegated_done(struct mptcp_subflow_context *subflow, int action) -{ - /* pairs with mptcp_subflow_delegate, ensures delegate_node is updated before - * touching the status bit - */ - smp_wmb(); - clear_bit(action, &subflow->delegated_status); -} - int mptcp_is_enabled(const struct net *net); unsigned int mptcp_get_add_addr_timeout(const struct net *net); int mptcp_is_checksum_enabled(const struct net *net); @@ -718,7 +707,29 @@ void mptcp_get_options(const struct sk_buff *skb, void mptcp_finish_connect(struct sock *sk); void __mptcp_set_connected(struct sock *sk); -void mptcp_reset_timeout(struct mptcp_sock *msk, unsigned long fail_tout); +void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout); + +static inline void mptcp_stop_tout_timer(struct sock *sk) +{ + if (!inet_csk(sk)->icsk_mtup.probe_timestamp) + return; + + sk_stop_timer(sk, &sk->sk_timer); + inet_csk(sk)->icsk_mtup.probe_timestamp = 0; +} + +static inline void mptcp_set_close_tout(struct sock *sk, unsigned long tout) +{ + /* avoid 0 timestamp, as that means no close timeout */ + inet_csk(sk)->icsk_mtup.probe_timestamp = tout ? 
: 1; +} + +static inline void mptcp_start_tout_timer(struct sock *sk) +{ + mptcp_set_close_tout(sk, tcp_jiffies32); + mptcp_reset_tout_timer(mptcp_sk(sk), 0); +} + static inline bool mptcp_is_fully_established(struct sock *sk) { return inet_sk_state_load(sk) == TCP_ESTABLISHED && diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 9bf3c7bc1762..9c1f8d1d63d2 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1226,7 +1226,7 @@ static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk) WRITE_ONCE(subflow->fail_tout, fail_tout); tcp_send_ack(ssk); - mptcp_reset_timeout(msk, subflow->fail_tout); + mptcp_reset_tout_timer(msk, subflow->fail_tout); } static bool subflow_check_data_avail(struct sock *ssk) @@ -1362,42 +1362,6 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space) *full_space = mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf)); } -void __mptcp_error_report(struct sock *sk) -{ - struct mptcp_subflow_context *subflow; - struct mptcp_sock *msk = mptcp_sk(sk); - - mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - int err = sock_error(ssk); - int ssk_state; - - if (!err) - continue; - - /* only propagate errors on fallen-back sockets or - * on MPC connect - */ - if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(msk)) - continue; - - /* We need to propagate only transition to CLOSE state. - * Orphaned socket will see such state change via - * subflow_sched_work_if_closed() and that path will properly - * destroy the msk as needed. - */ - ssk_state = inet_sk_state_load(ssk); - if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD)) - inet_sk_state_store(sk, ssk_state); - WRITE_ONCE(sk->sk_err, -err); - - /* This barrier is coupled with smp_rmb() in mptcp_poll() */ - smp_wmb(); - sk_error_report(sk); - break; - } -} - static void subflow_error_report(struct sock *ssk) { struct sock *sk = mptcp_subflow_ctx(ssk)->conn; @@ -1588,6 +1552,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, mptcp_sock_graft(ssk, sk->sk_socket); iput(SOCK_INODE(sf)); WRITE_ONCE(msk->allow_infinite_fallback, false); + mptcp_stop_tout_timer(sk); return 0; failed_unlink: @@ -1991,9 +1956,15 @@ static void subflow_ulp_clone(const struct request_sock *req, static void tcp_release_cb_override(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + long status; - if (mptcp_subflow_has_delegated_action(subflow)) - mptcp_subflow_process_delegated(ssk); + /* process and clear all the pending actions, but leave the subflow into + * the napi queue. To respect locking, only the same CPU that originated + * the action can touch the list. mptcp_napi_poll will take care of it. 
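[Editor's note] set_mask_bits(), used by mptcp_subflow_delegate() above and by tcp_release_cb_override() just below, atomically computes (old & ~clear) | set and hands back the previous value, which is what lets those paths flip the action bits while leaving MPTCP_DELEGATE_SCHEDULED untouched. A compare-exchange reimplementation in userspace C, assuming the usual kernel semantics of the macro.

#include <stdatomic.h>
#include <stdio.h>

static long set_mask_bits(atomic_long *ptr, long clear, long set)
{
	long old = atomic_load(ptr);

	/* 'old' is refreshed automatically on every failed attempt */
	while (!atomic_compare_exchange_weak(ptr, &old, (old & ~clear) | set))
		;
	return old;
}

int main(void)
{
	atomic_long status = 0x7; /* say SCHEDULED | SEND | ACK */
	long actions = ~0x1L;     /* clear everything but bit 0 */
	long old = set_mask_bits(&status, actions, 0);

	printf("old=%#lx new=%#lx\n", old, atomic_load(&status));
	return 0;
}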
+ */ + status = set_mask_bits(&subflow->delegated_status, MPTCP_DELEGATE_ACTIONS_MASK, 0); + if (status) + mptcp_subflow_process_delegated(ssk, status); tcp_release_cb(ssk); } diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c index 62fb1031763d..f8854bff286c 100644 --- a/net/ncsi/ncsi-aen.c +++ b/net/ncsi/ncsi-aen.c @@ -89,6 +89,11 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp, if ((had_link == has_link) || chained) return 0; + if (had_link) + netif_carrier_off(ndp->ndev.dev); + else + netif_carrier_on(ndp->ndev.dev); + if (!ndp->multi_package && !nc->package->multi_channel) { if (had_link) { ndp->flags |= NCSI_DEV_RESHUFFLE; diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index e564b5174261..35d2f9c9ada0 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -683,6 +683,14 @@ __ip_set_put(struct ip_set *set) * a separate reference counter */ static void +__ip_set_get_netlink(struct ip_set *set) +{ + write_lock_bh(&ip_set_ref_lock); + set->ref_netlink++; + write_unlock_bh(&ip_set_ref_lock); +} + +static void __ip_set_put_netlink(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); @@ -1693,11 +1701,11 @@ call_ad(struct net *net, struct sock *ctnl, struct sk_buff *skb, do { if (retried) { - __ip_set_get(set); + __ip_set_get_netlink(set); nfnl_unlock(NFNL_SUBSYS_IPSET); cond_resched(); nfnl_lock(NFNL_SUBSYS_IPSET); - __ip_set_put(set); + __ip_set_put_netlink(set); } ip_set_lock(set); diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 005a7ce87217..bf4f91b78e1d 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -36,6 +36,7 @@ MODULE_ALIAS("ip_set_hash:net,port,net"); #define IP_SET_HASH_WITH_PROTO #define IP_SET_HASH_WITH_NETS #define IPSET_NET_COUNT 2 +#define IP_SET_HASH_WITH_NET0 /* IPv4 variant */ diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 4bb0d90eca1c..143a341bbc0a 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -4269,6 +4269,7 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) struct net *net = ipvs->net; struct ctl_table *tbl; int idx, ret; + size_t ctl_table_size = ARRAY_SIZE(vs_vars); atomic_set(&ipvs->dropentry, 0); spin_lock_init(&ipvs->dropentry_lock); @@ -4285,8 +4286,10 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) return -ENOMEM; /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) + if (net->user_ns != &init_user_ns) { tbl[0].procname = NULL; + ctl_table_size = 0; + } } else tbl = vs_vars; /* Initialize sysctl defaults */ @@ -4357,7 +4360,8 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) #endif ret = -ENOMEM; - ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); + ipvs->sysctl_hdr = register_net_sysctl_sz(net, "net/ipv4/vs", tbl, + ctl_table_size); if (!ipvs->sysctl_hdr) goto err; ipvs->sysctl_tbl = tbl; diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 1b87214d385e..cf78ba4ce5ff 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -550,6 +550,7 @@ static struct ip_vs_scheduler ip_vs_lblc_scheduler = { static int __net_init __ip_vs_lblc_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); + size_t vars_table_size = ARRAY_SIZE(vs_vars_table); if (!ipvs) return -ENOENT; @@ 
-562,16 +563,19 @@ static int __net_init __ip_vs_lblc_init(struct net *net) return -ENOMEM; /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) + if (net->user_ns != &init_user_ns) { ipvs->lblc_ctl_table[0].procname = NULL; + vars_table_size = 0; + } } else ipvs->lblc_ctl_table = vs_vars_table; ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION; ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration; - ipvs->lblc_ctl_header = - register_net_sysctl(net, "net/ipv4/vs", ipvs->lblc_ctl_table); + ipvs->lblc_ctl_header = register_net_sysctl_sz(net, "net/ipv4/vs", + ipvs->lblc_ctl_table, + vars_table_size); if (!ipvs->lblc_ctl_header) { if (!net_eq(net, &init_net)) kfree(ipvs->lblc_ctl_table); diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index ad8f5fea6d3a..9eddf118b40e 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -736,6 +736,7 @@ static struct ip_vs_scheduler ip_vs_lblcr_scheduler = static int __net_init __ip_vs_lblcr_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); + size_t vars_table_size = ARRAY_SIZE(vs_vars_table); if (!ipvs) return -ENOENT; @@ -748,15 +749,18 @@ static int __net_init __ip_vs_lblcr_init(struct net *net) return -ENOMEM; /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) + if (net->user_ns != &init_user_ns) { ipvs->lblcr_ctl_table[0].procname = NULL; + vars_table_size = 0; + } } else ipvs->lblcr_ctl_table = vs_vars_table; ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION; ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration; - ipvs->lblcr_ctl_header = - register_net_sysctl(net, "net/ipv4/vs", ipvs->lblcr_ctl_table); + ipvs->lblcr_ctl_header = register_net_sysctl_sz(net, "net/ipv4/vs", + ipvs->lblcr_ctl_table, + vars_table_size); if (!ipvs->lblcr_ctl_header) { if (!net_eq(net, &init_net)) kfree(ipvs->lblcr_ctl_table); diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index da5af28ff57b..4174076c66fa 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1439,7 +1439,7 @@ static int bind_mcastif_addr(struct socket *sock, struct net_device *dev) sin.sin_addr.s_addr = addr; sin.sin_port = 0; - return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); + return kernel_bind(sock, (struct sockaddr *)&sin, sizeof(sin)); } static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen, @@ -1505,8 +1505,8 @@ static int make_send_sock(struct netns_ipvs *ipvs, int id, } get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id); - result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, - salen, 0); + result = kernel_connect(sock, (struct sockaddr *)&mcast_addr, + salen, 0); if (result < 0) { pr_err("Error connecting to the multicast addr\n"); goto error; @@ -1546,7 +1546,7 @@ static int make_receive_sock(struct netns_ipvs *ipvs, int id, get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id); sock->sk->sk_bound_dev_if = dev->ifindex; - result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen); + result = kernel_bind(sock, (struct sockaddr *)&mcast_addr, salen); if (result < 0) { pr_err("Error binding to the multicast addr\n"); goto error; diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index c7a6114091ae..b21799d468d2 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -381,6 +381,8 @@ __bpf_kfunc struct nf_conn *bpf_ct_insert_entry(struct 
nf_conn___init *nfct_i) struct nf_conn *nfct = (struct nf_conn *)nfct_i; int err; + if (!nf_ct_is_confirmed(nfct)) + nfct->timeout += nfct_time_stamp; nfct->status |= IPS_CONFIRMED; err = nf_conntrack_hash_check_insert(nfct); if (err < 0) { diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c index 0b513f7bf9f3..dd62cc12e775 100644 --- a/net/netfilter/nf_conntrack_extend.c +++ b/net/netfilter/nf_conntrack_extend.c @@ -40,10 +40,10 @@ static const u8 nf_ct_ext_type_len[NF_CT_EXT_NUM] = { [NF_CT_EXT_ECACHE] = sizeof(struct nf_conntrack_ecache), #endif #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP - [NF_CT_EXT_TSTAMP] = sizeof(struct nf_conn_acct), + [NF_CT_EXT_TSTAMP] = sizeof(struct nf_conn_tstamp), #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT - [NF_CT_EXT_TIMEOUT] = sizeof(struct nf_conn_tstamp), + [NF_CT_EXT_TIMEOUT] = sizeof(struct nf_conn_timeout), #endif #ifdef CONFIG_NF_CONNTRACK_LABELS [NF_CT_EXT_LABELS] = sizeof(struct nf_conn_labels), diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index b6bcc8f2f46b..c6bd533983c1 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -112,7 +112,7 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { /* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA}, /* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't have Stale cookie*/ /* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL},/* 5.2.4 - Big TODO */ -/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't come in orig dir */ +/* cookie_ack */ {sCL, sCL, sCW, sES, sES, sSS, sSR, sSA, sCL},/* Can't come in orig dir */ /* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL}, /* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, /* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, @@ -126,7 +126,7 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { /* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV}, /* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV}, /* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV}, -/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV},/* Can't come in reply dir */ +/* cookie_echo */ {sIV, sCL, sCE, sCE, sES, sSS, sSR, sSA, sIV},/* Can't come in reply dir */ /* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV}, /* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV}, /* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, @@ -412,6 +412,9 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, /* (D) vtag must be same as init_vtag as found in INIT_ACK */ if (sh->vtag != ct->proto.sctp.vtag[dir]) goto out_unlock; + } else if (sch->type == SCTP_CID_COOKIE_ACK) { + ct->proto.sctp.init[dir] = 0; + ct->proto.sctp.init[!dir] = 0; } else if (sch->type == SCTP_CID_HEARTBEAT) { if (ct->proto.sctp.vtag[dir] == 0) { pr_debug("Setting %d vtag %x for dir %d\n", sch->type, sh->vtag, dir); @@ -461,16 +464,18 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, } /* If it is an INIT or an INIT ACK note down the vtag */ - if (sch->type == SCTP_CID_INIT || - sch->type == SCTP_CID_INIT_ACK) { - struct sctp_inithdr _inithdr, *ih; + if (sch->type == SCTP_CID_INIT) { + struct sctp_inithdr _ih, *ih; - ih = skb_header_pointer(skb, offset + sizeof(_sch), - sizeof(_inithdr), &_inithdr); - if (ih == NULL) + ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih); + if (!ih) goto out_unlock; - pr_debug("Setting 
vtag %x for dir %d\n", - ih->init_tag, !dir); + + if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir]) + ct->proto.sctp.init[!dir] = 0; + ct->proto.sctp.init[dir] = 1; + + pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir); ct->proto.sctp.vtag[!dir] = ih->init_tag; /* don't renew timeout on init retransmit so @@ -481,6 +486,24 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, old_state == SCTP_CONNTRACK_CLOSED && nf_ct_is_confirmed(ct)) ignore = true; + } else if (sch->type == SCTP_CID_INIT_ACK) { + struct sctp_inithdr _ih, *ih; + __be32 vtag; + + ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih); + if (!ih) + goto out_unlock; + + vtag = ct->proto.sctp.vtag[!dir]; + if (!ct->proto.sctp.init[!dir] && vtag && vtag != ih->init_tag) + goto out_unlock; + /* collision */ + if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir] && + vtag != ih->init_tag) + goto out_unlock; + + pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir); + ct->proto.sctp.vtag[!dir] = ih->init_tag; } ct->proto.sctp.state = new_state; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 169e16fc2bce..0ee98ce5b816 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -1106,7 +1106,9 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) table[NF_SYSCTL_CT_BUCKETS].mode = 0444; } - cnet->sysctl_header = register_net_sysctl(net, "net/netfilter", table); + cnet->sysctl_header = register_net_sysctl_sz(net, "net/netfilter", + table, + ARRAY_SIZE(nf_ct_sysctl_table)); if (!cnet->sysctl_header) goto out_unregister_netfilter; diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 8a29290149bd..8cc52d2bd31b 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -487,9 +487,10 @@ static int netfilter_log_sysctl_init(struct net *net) for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) table[i].extra2 = net; - net->nf.nf_log_dir_header = register_net_sysctl(net, - "net/netfilter/nf_log", - table); + net->nf.nf_log_dir_header = register_net_sysctl_sz(net, + "net/netfilter/nf_log", + table, + ARRAY_SIZE(nf_log_sysctl_table)); if (!net->nf.nf_log_dir_header) goto err_reg; diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index fadbd4ed3dc0..c4e0516a8dfa 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -327,7 +327,7 @@ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple, /* If we source map this tuple so reply looks like reply_tuple, will * that meet the constraints of range. 
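[Editor's note] The SCTP conntrack fix above records, per direction, whether an INIT is outstanding (ct->proto.sctp.init[dir]) and only lets an INIT_ACK overwrite the stored verification tag when that is consistent; a simultaneous-open collision carrying a different tag is dropped. Below is a condensed standalone model of the acceptance rule as I read the hunk — the two array fields mirror the patch, everything else is scaffolding.

#include <stdint.h>
#include <stdio.h>

struct sctp_ct {
	uint32_t vtag[2];
	uint8_t init[2];
};

/* may an INIT_ACK seen in direction 'dir' set vtag[!dir] to 'tag'? */
static int init_ack_acceptable(const struct sctp_ct *ct, int dir, uint32_t tag)
{
	uint32_t vtag = ct->vtag[!dir];

	/* no INIT pending from the peer, and a different tag is stored */
	if (!ct->init[!dir] && vtag && vtag != tag)
		return 0;
	/* collision: both sides sent INIT, the tags must agree */
	if (ct->init[dir] && ct->init[!dir] && vtag != tag)
		return 0;
	return 1;
}

int main(void)
{
	struct sctp_ct ct = { .vtag = { 0, 0xabcd }, .init = { 0, 0 } };

	printf("%d\n", init_ack_acceptable(&ct, 0, 0xabcd)); /* 1: matches */
	printf("%d\n", init_ack_acceptable(&ct, 0, 0x1234)); /* 0: conflict */
	return 0;
}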
*/ -static int in_range(const struct nf_conntrack_tuple *tuple, +static int nf_in_range(const struct nf_conntrack_tuple *tuple, const struct nf_nat_range2 *range) { /* If we are supposed to map IPs, then we must be in the @@ -376,7 +376,7 @@ find_appropriate_src(struct net *net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); result->dst = tuple->dst; - if (in_range(result, range)) + if (nf_in_range(result, range)) return 1; } } @@ -607,7 +607,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, if (maniptype == NF_NAT_MANIP_SRC && !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { /* try the original tuple first */ - if (in_range(orig_tuple, range)) { + if (nf_in_range(orig_tuple, range)) { if (!nf_nat_used_tuple(orig_tuple, ct)) { *tuple = *orig_tuple; return; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2c81cee858d6..a623d31b6518 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1219,6 +1219,10 @@ static int nf_tables_updtable(struct nft_ctx *ctx) flags & NFT_TABLE_F_OWNER)) return -EOPNOTSUPP; + /* No dormant off/on/off/on games in single transaction */ + if (ctx->table->flags & __NFT_TABLE_F_UPDATE) + return -EINVAL; + trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE, sizeof(struct nft_trans_table)); if (trans == NULL) @@ -1432,7 +1436,7 @@ static int nft_flush_table(struct nft_ctx *ctx) if (!nft_is_active_next(ctx->net, chain)) continue; - if (nft_chain_is_bound(chain)) + if (nft_chain_binding(chain)) continue; ctx->chain = chain; @@ -1446,8 +1450,7 @@ static int nft_flush_table(struct nft_ctx *ctx) if (!nft_is_active_next(ctx->net, set)) continue; - if (nft_set_is_anonymous(set) && - !list_empty(&set->bindings)) + if (nft_set_is_anonymous(set)) continue; err = nft_delset(ctx, set); @@ -1477,7 +1480,7 @@ static int nft_flush_table(struct nft_ctx *ctx) if (!nft_is_active_next(ctx->net, chain)) continue; - if (nft_chain_is_bound(chain)) + if (nft_chain_binding(chain)) continue; ctx->chain = chain; @@ -2910,6 +2913,9 @@ static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info, return PTR_ERR(chain); } + if (nft_chain_binding(chain)) + return -EOPNOTSUPP; + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla); if (nla[NFTA_CHAIN_HOOK]) { @@ -3160,7 +3166,7 @@ int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, if (err < 0) return err; - if (!tb[NFTA_EXPR_DATA]) + if (!tb[NFTA_EXPR_DATA] || !tb[NFTA_EXPR_NAME]) return -EINVAL; type = __nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]); @@ -3449,6 +3455,8 @@ static int __nf_tables_dump_rules(struct sk_buff *skb, struct net *net = sock_net(skb->sk); const struct nft_rule *rule, *prule; unsigned int s_idx = cb->args[0]; + unsigned int entries = 0; + int ret = 0; u64 handle; prule = NULL; @@ -3471,16 +3479,22 @@ static int __nf_tables_dump_rules(struct sk_buff *skb, NFT_MSG_NEWRULE, NLM_F_MULTI | NLM_F_APPEND, table->family, - table, chain, rule, handle, reset) < 0) - return 1; - + table, chain, rule, handle, reset) < 0) { + ret = 1; + break; + } + entries++; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: prule = rule; cont_skip: (*idx)++; } - return 0; + + if (reset && entries) + audit_log_rule_reset(table, cb->seq, entries); + + return ret; } static int nf_tables_dump_rules(struct sk_buff *skb, @@ -3540,9 +3554,6 @@ static int nf_tables_dump_rules(struct sk_buff *skb, done: rcu_read_unlock(); - if (reset && idx > cb->args[0]) - audit_log_rule_reset(table, cb->seq, idx - cb->args[0]); - cb->args[0] = idx; return 
skb->len; } @@ -3970,6 +3981,11 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, } if (info->nlh->nlmsg_flags & NLM_F_REPLACE) { + if (nft_chain_binding(chain)) { + err = -EOPNOTSUPP; + goto err_destroy_flow_rule; + } + err = nft_delrule(&ctx, old_rule); if (err < 0) goto err_destroy_flow_rule; @@ -4077,7 +4093,7 @@ static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info, NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); } - if (nft_chain_is_bound(chain)) + if (nft_chain_binding(chain)) return -EOPNOTSUPP; } @@ -4111,7 +4127,7 @@ static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info, list_for_each_entry(chain, &table->chains, list) { if (!nft_is_active_next(net, chain)) continue; - if (nft_chain_is_bound(chain)) + if (nft_chain_binding(chain)) continue; ctx.chain = chain; @@ -5540,7 +5556,6 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; - u64 timeout = 0; nest = nla_nest_start_noflag(skb, NFTA_LIST_ELEM); if (nest == NULL) @@ -5576,15 +5591,11 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, htonl(*nft_set_ext_flags(ext)))) goto nla_put_failure; - if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) { - timeout = *nft_set_ext_timeout(ext); - if (nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, - nf_jiffies64_to_msecs(timeout), - NFTA_SET_ELEM_PAD)) - goto nla_put_failure; - } else if (set->flags & NFT_SET_TIMEOUT) { - timeout = READ_ONCE(set->timeout); - } + if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) && + nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, + nf_jiffies64_to_msecs(*nft_set_ext_timeout(ext)), + NFTA_SET_ELEM_PAD)) + goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { u64 expires, now = get_jiffies_64(); @@ -5599,9 +5610,6 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, nf_jiffies64_to_msecs(expires), NFTA_SET_ELEM_PAD)) goto nla_put_failure; - - if (reset) - *nft_set_ext_expiration(ext) = now + timeout; } if (nft_set_ext_exists(ext, NFT_SET_EXT_USERDATA)) { @@ -5760,8 +5768,6 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) if (!args.iter.err && args.iter.count == cb->args[0]) args.iter.err = nft_set_catchall_dump(net, skb, set, reset, cb->seq); - rcu_read_unlock(); - nla_nest_end(skb, nest); nlmsg_end(skb, nlh); @@ -5769,6 +5775,8 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) audit_log_nft_set_reset(table, cb->seq, args.iter.count - args.iter.skip); + rcu_read_unlock(); + if (args.iter.err && args.iter.err != -EMSGSIZE) return args.iter.err; if (args.iter.count == cb->args[0]) @@ -7182,8 +7190,10 @@ static int nf_tables_delsetelem(struct sk_buff *skb, if (IS_ERR(set)) return PTR_ERR(set); - if (!list_empty(&set->bindings) && - (set->flags & (NFT_SET_CONSTANT | NFT_SET_ANONYMOUS))) + if (nft_set_is_anonymous(set)) + return -EOPNOTSUPP; + + if (!list_empty(&set->bindings) && (set->flags & NFT_SET_CONSTANT)) return -EBUSY; nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); @@ -7853,24 +7863,14 @@ static int nf_tables_delobj(struct sk_buff *skb, const struct nfnl_info *info, return nft_delobj(&ctx, obj); } -void nft_obj_notify(struct net *net, const struct nft_table *table, - struct nft_object *obj, u32 portid, u32 seq, int event, - u16 flags, int family, int report, gfp_t gfp) +static void +__nft_obj_notify(struct net *net, 
const struct nft_table *table, + struct nft_object *obj, u32 portid, u32 seq, int event, + u16 flags, int family, int report, gfp_t gfp) { struct nftables_pernet *nft_net = nft_pernet(net); struct sk_buff *skb; int err; - char *buf = kasprintf(gfp, "%s:%u", - table->name, nft_net->base_seq); - - audit_log_nfcfg(buf, - family, - obj->handle, - event == NFT_MSG_NEWOBJ ? - AUDIT_NFT_OP_OBJ_REGISTER : - AUDIT_NFT_OP_OBJ_UNREGISTER, - gfp); - kfree(buf); if (!report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) @@ -7893,13 +7893,35 @@ void nft_obj_notify(struct net *net, const struct nft_table *table, err: nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS); } + +void nft_obj_notify(struct net *net, const struct nft_table *table, + struct nft_object *obj, u32 portid, u32 seq, int event, + u16 flags, int family, int report, gfp_t gfp) +{ + struct nftables_pernet *nft_net = nft_pernet(net); + char *buf = kasprintf(gfp, "%s:%u", + table->name, nft_net->base_seq); + + audit_log_nfcfg(buf, + family, + obj->handle, + event == NFT_MSG_NEWOBJ ? + AUDIT_NFT_OP_OBJ_REGISTER : + AUDIT_NFT_OP_OBJ_UNREGISTER, + gfp); + kfree(buf); + + __nft_obj_notify(net, table, obj, portid, seq, event, + flags, family, report, gfp); +} EXPORT_SYMBOL_GPL(nft_obj_notify); static void nf_tables_obj_notify(const struct nft_ctx *ctx, struct nft_object *obj, int event) { - nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, ctx->seq, event, - ctx->flags, ctx->family, ctx->report, GFP_KERNEL); + __nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, + ctx->seq, event, ctx->flags, ctx->family, + ctx->report, GFP_KERNEL); } /* @@ -9561,12 +9583,15 @@ static int nft_trans_gc_space(struct nft_trans_gc *trans) struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, unsigned int gc_seq, gfp_t gfp) { + struct nft_set *set; + if (nft_trans_gc_space(gc)) return gc; + set = gc->set; nft_trans_gc_queue_work(gc); - return nft_trans_gc_alloc(gc->set, gc_seq, gfp); + return nft_trans_gc_alloc(set, gc_seq, gfp); } void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans) @@ -9581,15 +9606,18 @@ void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans) struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp) { + struct nft_set *set; + if (WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net))) return NULL; if (nft_trans_gc_space(gc)) return gc; + set = gc->set; call_rcu(&gc->rcu, nft_trans_gc_trans_free); - return nft_trans_gc_alloc(gc->set, 0, gfp); + return nft_trans_gc_alloc(set, 0, gfp); } void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans) @@ -9604,8 +9632,9 @@ void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans) call_rcu(&trans->rcu, nft_trans_gc_trans_free); } -struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, - unsigned int gc_seq) +static struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, + unsigned int gc_seq, + bool sync) { struct nft_set_elem_catchall *catchall; const struct nft_set *set = gc->set; @@ -9621,7 +9650,11 @@ struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, nft_set_elem_dead(ext); dead_elem: - gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (sync) + gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); + else + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) return NULL; @@ -9631,6 +9664,17 @@ dead_elem: return gc; } +struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc, + unsigned int gc_seq) +{ + return nft_trans_gc_catchall(gc, gc_seq, 
false); +} + +struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc) +{ + return nft_trans_gc_catchall(gc, 0, true); +} + static void nf_tables_module_autoload_cleanup(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); @@ -10295,7 +10339,10 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) break; } te = (struct nft_trans_elem *)trans->data; - nft_setelem_remove(net, te->set, &te->elem); + if (!te->set->ops->abort || + nft_setelem_is_catchall(te->set, &te->elem)) + nft_setelem_remove(net, te->set, &te->elem); + if (!nft_setelem_is_catchall(te->set, &te->elem)) atomic_dec(&te->set->nelems); @@ -11053,7 +11100,7 @@ static void __nft_release_table(struct net *net, struct nft_table *table) ctx.family = table->family; ctx.table = table; list_for_each_entry(chain, &table->chains, list) { - if (nft_chain_is_bound(chain)) + if (nft_chain_binding(chain)) continue; ctx.chain = chain; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 53c9e76473ba..f03f4d4d7d88 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -698,8 +698,8 @@ nfulnl_log_packet(struct net *net, unsigned int plen = 0; struct nfnl_log_net *log = nfnl_log_pernet(net); const struct nfnl_ct_hook *nfnl_ct = NULL; + enum ip_conntrack_info ctinfo = 0; struct nf_conn *ct = NULL; - enum ip_conntrack_info ctinfo; if (li_user && li_user->type == NF_LOG_TYPE_ULOG) li = li_user; diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index 8f1bfa6ccc2d..50723ba08289 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -315,6 +315,14 @@ static int nfnl_osf_add_callback(struct sk_buff *skb, f = nla_data(osf_attrs[OSF_ATTR_FINGER]); + if (f->opt_num > ARRAY_SIZE(f->opt)) + return -EINVAL; + + if (!memchr(f->genre, 0, MAXGENRELEN) || + !memchr(f->subtype, 0, MAXGENRELEN) || + !memchr(f->version, 0, MAXGENRELEN)) + return -EINVAL; + kf = kmalloc(sizeof(struct nf_osf_finger), GFP_KERNEL); if (!kf) return -ENOMEM; diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index a9844eefedeb..3fbaa7bf41f9 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -35,6 +35,14 @@ static unsigned int optlen(const u8 *opt, unsigned int offset) return opt[offset + 1]; } +static int nft_skb_copy_to_reg(const struct sk_buff *skb, int offset, u32 *dest, unsigned int len) +{ + if (len % NFT_REG32_SIZE) + dest[len / NFT_REG32_SIZE] = 0; + + return skb_copy_bits(skb, offset, dest, len); +} + static void nft_exthdr_ipv6_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -56,8 +64,7 @@ static void nft_exthdr_ipv6_eval(const struct nft_expr *expr, } offset += priv->offset; - dest[priv->len / NFT_REG32_SIZE] = 0; - if (skb_copy_bits(pkt->skb, offset, dest, priv->len) < 0) + if (nft_skb_copy_to_reg(pkt->skb, offset, dest, priv->len) < 0) goto err; return; err: @@ -153,8 +160,7 @@ static void nft_exthdr_ipv4_eval(const struct nft_expr *expr, } offset += priv->offset; - dest[priv->len / NFT_REG32_SIZE] = 0; - if (skb_copy_bits(pkt->skb, offset, dest, priv->len) < 0) + if (nft_skb_copy_to_reg(pkt->skb, offset, dest, priv->len) < 0) goto err; return; err: @@ -210,7 +216,8 @@ static void nft_exthdr_tcp_eval(const struct nft_expr *expr, if (priv->flags & NFT_EXTHDR_F_PRESENT) { *dest = 1; } else { - dest[priv->len / NFT_REG32_SIZE] = 0; + if (priv->len % NFT_REG32_SIZE) + dest[priv->len / NFT_REG32_SIZE] = 0; memcpy(dest, opt + offset, 
priv->len); } @@ -388,9 +395,8 @@ static void nft_exthdr_sctp_eval(const struct nft_expr *expr, offset + ntohs(sch->length) > pkt->skb->len) break; - dest[priv->len / NFT_REG32_SIZE] = 0; - if (skb_copy_bits(pkt->skb, offset + priv->offset, - dest, priv->len) < 0) + if (nft_skb_copy_to_reg(pkt->skb, offset + priv->offset, + dest, priv->len) < 0) break; return; } diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c index 28e2873ba24e..928312d01eb1 100644 --- a/net/netfilter/nft_inner.c +++ b/net/netfilter/nft_inner.c @@ -298,6 +298,7 @@ static int nft_inner_init(const struct nft_ctx *ctx, int err; if (!tb[NFTA_INNER_FLAGS] || + !tb[NFTA_INNER_NUM] || !tb[NFTA_INNER_HDRSIZE] || !tb[NFTA_INNER_TYPE] || !tb[NFTA_INNER_EXPR]) diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 8cb800989947..0a689c8e0295 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -154,6 +154,17 @@ int nft_payload_inner_offset(const struct nft_pktinfo *pkt) return pkt->inneroff; } +static bool nft_payload_need_vlan_copy(const struct nft_payload *priv) +{ + unsigned int len = priv->offset + priv->len; + + /* data past ether src/dst requested, copy needed */ + if (len > offsetof(struct ethhdr, h_proto)) + return true; + + return false; +} + void nft_payload_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -168,11 +179,11 @@ void nft_payload_eval(const struct nft_expr *expr, switch (priv->base) { case NFT_PAYLOAD_LL_HEADER: - if (!skb_mac_header_was_set(skb)) + if (!skb_mac_header_was_set(skb) || skb_mac_header_len(skb) == 0) goto err; if (skb_vlan_tag_present(skb) && - priv->offset >= offsetof(struct ethhdr, h_proto)) { + nft_payload_need_vlan_copy(priv)) { if (!nft_payload_copy_vlan(dest, skb, priv->offset, priv->len)) goto err; diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 524763659f25..2013de934cef 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -338,12 +338,9 @@ static void nft_rhash_gc(struct work_struct *work) while ((he = rhashtable_walk_next(&hti))) { if (IS_ERR(he)) { - if (PTR_ERR(he) != -EAGAIN) { - nft_trans_gc_destroy(gc); - gc = NULL; - goto try_later; - } - continue; + nft_trans_gc_destroy(gc); + gc = NULL; + goto try_later; } /* Ruleset has been updated, try later. */ @@ -372,7 +369,7 @@ dead_elem: nft_trans_gc_elem_add(gc, he); } - gc = nft_trans_gc_catchall(gc, gc_seq); + gc = nft_trans_gc_catchall_async(gc, gc_seq); try_later: /* catchall list iteration requires rcu read side lock. 
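[Editor's note] nft_skb_copy_to_reg(), factored out above, pre-zeroes the final 32-bit register only when the requested length is not register-aligned, so the bytes past the copy in that last word are deterministic instead of stale register contents. A userspace rendering of the same padding rule; the helper name and the REG32_SIZE constant are stand-ins here.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define REG32_SIZE 4

static void copy_to_regs(uint32_t *dest, const void *src, unsigned int len)
{
	if (len % REG32_SIZE)
		dest[len / REG32_SIZE] = 0; /* pre-clear the partial word */
	memcpy(dest, src, len);
}

int main(void)
{
	uint32_t regs[2] = { 0xdeadbeef, 0xdeadbeef }; /* stale contents */
	uint8_t opt[5] = { 1, 2, 3, 4, 5 }; /* 5 bytes: one word plus one */

	copy_to_regs(regs, opt, sizeof(opt));
	printf("%08x %08x\n", regs[0], regs[1]); /* tail of regs[1] is 0 */
	return 0;
}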
*/ diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 6af9c9ed4b5c..c0dcc40de358 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1596,7 +1596,7 @@ static void pipapo_gc(const struct nft_set *_set, struct nft_pipapo_match *m) gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); if (!gc) - break; + return; nft_pipapo_gc_deactivate(net, set, e); pipapo_drop(m, rulemap); @@ -1610,7 +1610,7 @@ static void pipapo_gc(const struct nft_set *_set, struct nft_pipapo_match *m) } } - gc = nft_trans_gc_catchall(gc, 0); + gc = nft_trans_gc_catchall_sync(gc); if (gc) { nft_trans_gc_queue_sync_done(gc); priv->last_gc = jiffies; diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h index 25a75591583e..2e164a319945 100644 --- a/net/netfilter/nft_set_pipapo.h +++ b/net/netfilter/nft_set_pipapo.h @@ -147,7 +147,7 @@ struct nft_pipapo_match { unsigned long * __percpu *scratch; size_t bsize_max; struct rcu_head rcu; - struct nft_pipapo_field f[]; + struct nft_pipapo_field f[] __counted_by(field_count); }; /** diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index c6435e709231..2660ceab3759 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -233,10 +233,9 @@ static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, rb_erase(&rbe->node, &priv->root); } -static int nft_rbtree_gc_elem(const struct nft_set *__set, - struct nft_rbtree *priv, - struct nft_rbtree_elem *rbe, - u8 genmask) +static const struct nft_rbtree_elem * +nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe, u8 genmask) { struct nft_set *set = (struct nft_set *)__set; struct rb_node *prev = rb_prev(&rbe->node); @@ -246,7 +245,7 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC); if (!gc) - return -ENOMEM; + return ERR_PTR(-ENOMEM); /* search for end interval coming before this element. * end intervals don't carry a timeout extension, they @@ -261,6 +260,7 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, prev = rb_prev(prev); } + rbe_prev = NULL; if (prev) { rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); nft_rbtree_gc_remove(net, set, priv, rbe_prev); @@ -272,7 +272,7 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, */ gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); if (WARN_ON_ONCE(!gc)) - return -ENOMEM; + return ERR_PTR(-ENOMEM); nft_trans_gc_elem_add(gc, rbe_prev); } @@ -280,13 +280,13 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, nft_rbtree_gc_remove(net, set, priv, rbe); gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); if (WARN_ON_ONCE(!gc)) - return -ENOMEM; + return ERR_PTR(-ENOMEM); nft_trans_gc_elem_add(gc, rbe); nft_trans_gc_queue_sync_done(gc); - return 0; + return rbe_prev; } static bool nft_rbtree_update_first(const struct nft_set *set, @@ -312,8 +312,9 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL; struct rb_node *node, *next, *parent, **p, *first = NULL; struct nft_rbtree *priv = nft_set_priv(set); + u8 cur_genmask = nft_genmask_cur(net); u8 genmask = nft_genmask_next(net); - int d, err; + int d; /* Descend the tree to search for an existing element greater than the * key value to insert that is greater than the new element. 
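[Editor's note] nft_rbtree_gc_elem() above now returns the removed end element, NULL, or ERR_PTR(-ENOMEM), packing three outcomes into a single pointer so the caller can tell "nothing removed" from an allocation failure. A minimal userspace copy of the ERR_PTR/PTR_ERR/IS_ERR encoding — an errno value stored in the top page of the address space — matching the usual kernel definitions.

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static void *gc_elem(int fail)
{
	static int elem = 42;

	return fail ? ERR_PTR(-ENOMEM) : (void *)&elem;
}

int main(void)
{
	void *p = gc_elem(1);

	if (IS_ERR(p))
		printf("error: %ld\n", PTR_ERR(p)); /* error: -12 */
	else
		printf("elem: %d\n", *(int *)p);
	return 0;
}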
This is the @@ -357,11 +358,19 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, if (!nft_set_elem_active(&rbe->ext, genmask)) continue; - /* perform garbage collection to avoid bogus overlap reports. */ - if (nft_set_elem_expired(&rbe->ext)) { - err = nft_rbtree_gc_elem(set, priv, rbe, genmask); - if (err < 0) - return err; + /* perform garbage collection to avoid bogus overlap reports + * but skip new elements in this transaction. + */ + if (nft_set_elem_expired(&rbe->ext) && + nft_set_elem_active(&rbe->ext, cur_genmask)) { + const struct nft_rbtree_elem *removed_end; + + removed_end = nft_rbtree_gc_elem(set, priv, rbe, genmask); + if (IS_ERR(removed_end)) + return PTR_ERR(removed_end); + + if (removed_end == rbe_le || removed_end == rbe_ge) + return -EAGAIN; continue; } @@ -482,11 +491,18 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *rbe = elem->priv; int err; - write_lock_bh(&priv->lock); - write_seqcount_begin(&priv->count); - err = __nft_rbtree_insert(net, set, rbe, ext); - write_seqcount_end(&priv->count); - write_unlock_bh(&priv->lock); + do { + if (fatal_signal_pending(current)) + return -EINTR; + + cond_resched(); + + write_lock_bh(&priv->lock); + write_seqcount_begin(&priv->count); + err = __nft_rbtree_insert(net, set, rbe, ext); + write_seqcount_end(&priv->count); + write_unlock_bh(&priv->lock); + } while (err == -EAGAIN); return err; } @@ -618,8 +634,7 @@ static void nft_rbtree_gc(struct work_struct *work) if (!gc) goto done; - write_lock_bh(&priv->lock); - write_seqcount_begin(&priv->count); + read_lock_bh(&priv->lock); for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { /* Ruleset has been updated, try later. */ @@ -666,11 +681,10 @@ dead_elem: nft_trans_gc_elem_add(gc, rbe); } - gc = nft_trans_gc_catchall(gc, gc_seq); + gc = nft_trans_gc_catchall_async(gc, gc_seq); try_later: - write_seqcount_end(&priv->count); - write_unlock_bh(&priv->lock); + read_unlock_bh(&priv->lock); if (gc) nft_trans_gc_queue_async_done(gc); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 642b9d382fb4..eb086b06d60d 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -352,7 +352,7 @@ static void netlink_overrun(struct sock *sk) if (!nlk_test_bit(RECV_NO_ENOBUFS, sk)) { if (!test_and_set_bit(NETLINK_S_CONGESTED, &nlk_sk(sk)->state)) { - sk->sk_err = ENOBUFS; + WRITE_ONCE(sk->sk_err, ENOBUFS); sk_error_report(sk); } } @@ -1605,7 +1605,7 @@ static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p) goto out; } - sk->sk_err = p->code; + WRITE_ONCE(sk->sk_err, p->code); sk_error_report(sk); out: return ret; @@ -1991,7 +1991,7 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) { ret = netlink_dump(sk); if (ret) { - sk->sk_err = -ret; + WRITE_ONCE(sk->sk_err, -ret); sk_error_report(sk); } } @@ -2511,7 +2511,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, err_bad_put: nlmsg_free(skb); err_skb: - NETLINK_CB(in_skb).sk->sk_err = ENOBUFS; + WRITE_ONCE(NETLINK_CB(in_skb).sk->sk_err, ENOBUFS); sk_error_report(NETLINK_CB(in_skb).sk); } EXPORT_SYMBOL(netlink_ack); diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index f60e424e0607..1dac28136e6a 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -203,17 +203,13 @@ static struct nfc_llcp_sock *nfc_llcp_sock_get(struct nfc_llcp_local *local, if (tmp_sock->ssap == ssap 
&& tmp_sock->dsap == dsap) { llcp_sock = tmp_sock; + sock_hold(&llcp_sock->sk); break; } } read_unlock(&local->sockets.lock); - if (llcp_sock == NULL) - return NULL; - - sock_hold(&llcp_sock->sk); - return llcp_sock; } @@ -346,7 +342,8 @@ static int nfc_llcp_wks_sap(const char *service_name, size_t service_name_len) static struct nfc_llcp_sock *nfc_llcp_sock_from_sn(struct nfc_llcp_local *local, - const u8 *sn, size_t sn_len) + const u8 *sn, size_t sn_len, + bool needref) { struct sock *sk; struct nfc_llcp_sock *llcp_sock, *tmp_sock; @@ -382,6 +379,8 @@ struct nfc_llcp_sock *nfc_llcp_sock_from_sn(struct nfc_llcp_local *local, if (memcmp(sn, tmp_sock->service_name, sn_len) == 0) { llcp_sock = tmp_sock; + if (needref) + sock_hold(&llcp_sock->sk); break; } } @@ -423,7 +422,8 @@ u8 nfc_llcp_get_sdp_ssap(struct nfc_llcp_local *local, * to this service name. */ if (nfc_llcp_sock_from_sn(local, sock->service_name, - sock->service_name_len) != NULL) { + sock->service_name_len, + false) != NULL) { mutex_unlock(&local->sdp_lock); return LLCP_SAP_MAX; @@ -824,16 +824,7 @@ out: static struct nfc_llcp_sock *nfc_llcp_sock_get_sn(struct nfc_llcp_local *local, const u8 *sn, size_t sn_len) { - struct nfc_llcp_sock *llcp_sock; - - llcp_sock = nfc_llcp_sock_from_sn(local, sn, sn_len); - - if (llcp_sock == NULL) - return NULL; - - sock_hold(&llcp_sock->sk); - - return llcp_sock; + return nfc_llcp_sock_from_sn(local, sn, sn_len, true); } static const u8 *nfc_llcp_connect_sn(const struct sk_buff *skb, size_t *sn_len) @@ -1298,7 +1289,8 @@ static void nfc_llcp_recv_snl(struct nfc_llcp_local *local, } llcp_sock = nfc_llcp_sock_from_sn(local, service_name, - service_name_len); + service_name_len, + true); if (!llcp_sock) { sap = 0; goto add_snl; @@ -1318,6 +1310,7 @@ static void nfc_llcp_recv_snl(struct nfc_llcp_local *local, if (sap == LLCP_SAP_MAX) { sap = 0; + nfc_llcp_sock_put(llcp_sock); goto add_snl; } @@ -1335,6 +1328,7 @@ static void nfc_llcp_recv_snl(struct nfc_llcp_local *local, pr_debug("%p %d\n", llcp_sock, sap); + nfc_llcp_sock_put(llcp_sock); add_snl: sdp = nfc_llcp_build_sdres_tlv(tid, sap); if (sdp == NULL) @@ -1636,7 +1630,9 @@ int nfc_llcp_register_device(struct nfc_dev *ndev) timer_setup(&local->sdreq_timer, nfc_llcp_sdreq_timer, 0); INIT_WORK(&local->sdreq_timeout_work, nfc_llcp_sdreq_timeout_work); + spin_lock(&llcp_devices_lock); list_add(&local->list, &llcp_devices); + spin_unlock(&llcp_devices_lock); return 0; } diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index fff755dde30d..6c9592d05120 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -909,6 +909,11 @@ static int nci_activate_target(struct nfc_dev *nfc_dev, return -EINVAL; } + if (protocol >= NFC_PROTO_MAX) { + pr_err("the requested nfc protocol is invalid\n"); + return -EINVAL; + } + if (!(nci_target->supported_protocols & (1 << protocol))) { pr_err("target does not support the requested protocol 0x%x\n", protocol); diff --git a/net/nfc/nci/spi.c b/net/nfc/nci/spi.c index 0935527d1d12..b68150c971d0 100644 --- a/net/nfc/nci/spi.c +++ b/net/nfc/nci/spi.c @@ -151,6 +151,8 @@ static int send_acknowledge(struct nci_spi *nspi, u8 acknowledge) int ret; skb = nci_skb_alloc(nspi->ndev, 0, GFP_KERNEL); + if (!skb) + return -ENOMEM; /* add the NCI SPI header to the start of the buffer */ hdr = skb_push(skb, NCI_SPI_HDR_LEN); diff --git a/net/nfc/nci/uart.c b/net/nfc/nci/uart.c index cc8fa9e36159..ed1508a9e093 100644 --- a/net/nfc/nci/uart.c +++ b/net/nfc/nci/uart.c @@ -172,7 +172,7 @@ static int nci_uart_tty_open(struct 
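Both nfc_llcp_sock_get() and the new needref parameter of nfc_llcp_sock_from_sn() apply the same rule: the reference must be taken while the list lock is still held, otherwise the socket can be freed between the lookup and the sock_hold(). A sketch of the safe shape, with a hypothetical table type:

#include <linux/spinlock.h>
#include <net/sock.h>

struct my_table {
        rwlock_t lock;
        struct hlist_head head;
};

/* Returns a referenced socket or NULL. The reference is taken before
 * the lock is dropped, so the caller can never observe a freed socket;
 * the caller must sock_put() the result when done. */
struct sock *lookup_and_hold(struct my_table *tbl, unsigned int key)
{
        struct sock *found = NULL, *sk;

        read_lock(&tbl->lock);
        sk_for_each(sk, &tbl->head) {
                if (sk->sk_hash == key) {
                        found = sk;
                        sock_hold(found);
                        break;
                }
        }
        read_unlock(&tbl->lock);

        return found;
}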
tty_struct *tty) */ static void nci_uart_tty_close(struct tty_struct *tty) { - struct nci_uart *nu = (void *)tty->disc_data; + struct nci_uart *nu = tty->disc_data; /* Detach from the tty */ tty->disc_data = NULL; @@ -204,7 +204,7 @@ static void nci_uart_tty_close(struct tty_struct *tty) */ static void nci_uart_tty_wakeup(struct tty_struct *tty) { - struct nci_uart *nu = (void *)tty->disc_data; + struct nci_uart *nu = tty->disc_data; if (!nu) return; @@ -296,9 +296,9 @@ static int nci_uart_default_recv_buf(struct nci_uart *nu, const u8 *data, * Return Value: None */ static void nci_uart_tty_receive(struct tty_struct *tty, const u8 *data, - const char *flags, int count) + const u8 *flags, size_t count) { - struct nci_uart *nu = (void *)tty->disc_data; + struct nci_uart *nu = tty->disc_data; if (!nu || tty != nu->tty) return; @@ -325,7 +325,7 @@ static void nci_uart_tty_receive(struct tty_struct *tty, const u8 *data, static int nci_uart_tty_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { - struct nci_uart *nu = (void *)tty->disc_data; + struct nci_uart *nu = tty->disc_data; int err = 0; switch (cmd) { @@ -345,20 +345,14 @@ static int nci_uart_tty_ioctl(struct tty_struct *tty, unsigned int cmd, /* We don't provide read/write/poll interface for user space. */ static ssize_t nci_uart_tty_read(struct tty_struct *tty, struct file *file, - unsigned char *buf, size_t nr, - void **cookie, unsigned long offset) + u8 *buf, size_t nr, void **cookie, + unsigned long offset) { return 0; } static ssize_t nci_uart_tty_write(struct tty_struct *tty, struct file *file, - const unsigned char *data, size_t count) -{ - return 0; -} - -static __poll_t nci_uart_tty_poll(struct tty_struct *tty, - struct file *filp, poll_table *wait) + const u8 *data, size_t count) { return 0; } @@ -435,7 +429,6 @@ static struct tty_ldisc_ops nci_uart_ldisc = { .close = nci_uart_tty_close, .read = nci_uart_tty_read, .write = nci_uart_tty_write, - .poll = nci_uart_tty_poll, .receive_buf = nci_uart_tty_receive, .write_wakeup = nci_uart_tty_wakeup, .ioctl = nci_uart_tty_ioctl, diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 8f97648d652f..a84e00b5904b 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3607,7 +3607,12 @@ static int packet_getname(struct socket *sock, struct sockaddr *uaddr, if (dev) { sll->sll_hatype = dev->type; sll->sll_halen = dev->addr_len; - memcpy(sll->sll_addr_flex, dev->dev_addr, dev->addr_len); + + /* Let __fortify_memcpy_chk() know the actual buffer size. */ + memcpy(((struct sockaddr_storage *)sll)->__data + + offsetof(struct sockaddr_ll, sll_addr) - + offsetofend(struct sockaddr_ll, sll_family), + dev->dev_addr, dev->addr_len); } else { sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */ sll->sll_halen = 0; diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index d36f3f6b4351..b15cf316b23a 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -86,11 +86,13 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id, break; case RDMA_CM_EVENT_ADDR_RESOLVED: - rdma_set_service_type(cm_id, conn->c_tos); - rdma_set_min_rnr_timer(cm_id, IB_RNR_TIMER_000_32); - /* XXX do we need to clean up if this fails? */ - ret = rdma_resolve_route(cm_id, - RDS_RDMA_RESOLVE_TIMEOUT_MS); + if (conn) { + rdma_set_service_type(cm_id, conn->c_tos); + rdma_set_min_rnr_timer(cm_id, IB_RNR_TIMER_000_32); + /* XXX do we need to clean up if this fails? 
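The packet_getname() change is purely about bounds visibility: sll_addr is an 8-byte array, but the buffer actually backing it is a struct sockaddr_storage, and hardware addresses can be longer than 8 bytes. Recomputing the destination from __data lets __fortify_memcpy_chk() see the real bounds. A sketch of the same addressing trick (fill_hw_addr is a made-up helper; the caller is assumed to pass hw_len <= MAX_ADDR_LEN):

#include <linux/if_packet.h>
#include <linux/socket.h>
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/types.h>

void fill_hw_addr(struct sockaddr_storage *ss, const u8 *hw, size_t hw_len)
{
        struct sockaddr_ll *sll = (struct sockaddr_ll *)ss;

        sll->sll_halen = hw_len;

        /* Address the bytes through the full-size storage object so the
         * fortified memcpy() checks against more than the 8-byte
         * sll_addr field. */
        memcpy(ss->__data + offsetof(struct sockaddr_ll, sll_addr) -
                       offsetofend(struct sockaddr_ll, sll_family),
               hw, hw_len);
}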
*/ + ret = rdma_resolve_route(cm_id, + RDS_RDMA_RESOLVE_TIMEOUT_MS); + } break; case RDMA_CM_EVENT_ROUTE_RESOLVED: diff --git a/net/rds/tcp.c b/net/rds/tcp.c index c5b86066ff66..2dba7505b414 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -565,7 +565,8 @@ static __net_init int rds_tcp_init_net(struct net *net) } tbl[RDS_TCP_SNDBUF].data = &rtn->sndbuf_size; tbl[RDS_TCP_RCVBUF].data = &rtn->rcvbuf_size; - rtn->rds_tcp_sysctl = register_net_sysctl(net, "net/rds/tcp", tbl); + rtn->rds_tcp_sysctl = register_net_sysctl_sz(net, "net/rds/tcp", tbl, + ARRAY_SIZE(rds_tcp_sysctl_table)); if (!rtn->rds_tcp_sysctl) { pr_warn("could not register sysctl\n"); err = -ENOMEM; diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index f0c477c5d1db..a0046e99d6df 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -145,7 +145,7 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) addrlen = sizeof(sin); } - ret = sock->ops->bind(sock, addr, addrlen); + ret = kernel_bind(sock, addr, addrlen); if (ret) { rdsdebug("bind failed with %d at address %pI6c\n", ret, &conn->c_laddr); @@ -173,7 +173,7 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) * own the socket */ rds_tcp_set_callbacks(sock, cp); - ret = sock->ops->connect(sock, addr, addrlen, O_NONBLOCK); + ret = kernel_connect(sock, addr, addrlen, O_NONBLOCK); rdsdebug("connect to address %pI6c returned %d\n", &conn->c_faddr, ret); if (ret == -EINPROGRESS) diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 014fa24418c1..53b3535a1e4a 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -306,7 +306,7 @@ struct socket *rds_tcp_listen_init(struct net *net, bool isv6) addr_len = sizeof(*sin); } - ret = sock->ops->bind(sock, (struct sockaddr *)&ss, addr_len); + ret = kernel_bind(sock, (struct sockaddr *)&ss, addr_len); if (ret < 0) { rdsdebug("could not bind %s listener socket: %d\n", isv6 ? "IPv6" : "IPv4", ret); diff --git a/net/rfkill/core.c b/net/rfkill/core.c index 01fca7a10b4b..08630896b6c8 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -48,6 +48,7 @@ struct rfkill { bool persistent; bool polling_paused; bool suspended; + bool need_sync; const struct rfkill_ops *ops; void *data; @@ -368,6 +369,17 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked) rfkill_event(rfkill); } +static void rfkill_sync(struct rfkill *rfkill) +{ + lockdep_assert_held(&rfkill_global_mutex); + + if (!rfkill->need_sync) + return; + + rfkill_set_block(rfkill, rfkill_global_states[rfkill->type].cur); + rfkill->need_sync = false; +} + static void rfkill_update_global_state(enum rfkill_type type, bool blocked) { int i; @@ -730,6 +742,10 @@ static ssize_t soft_show(struct device *dev, struct device_attribute *attr, { struct rfkill *rfkill = to_rfkill(dev); + mutex_lock(&rfkill_global_mutex); + rfkill_sync(rfkill); + mutex_unlock(&rfkill_global_mutex); + return sysfs_emit(buf, "%d\n", (rfkill->state & RFKILL_BLOCK_SW) ? 
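rfkill's new need_sync flag implements a lazy one-shot synchronization: registration only marks the device as out of date, and the first reader or writer under rfkill_global_mutex pays for the actual state push. Reduced to its essentials (my_dev and the globals are illustrative):

#include <linux/lockdep.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(state_mutex);
static bool global_blocked;

struct my_dev {
        bool need_sync;         /* set at registration time */
        bool blocked;
};

/* Apply the global state exactly once, on first access. Every sysfs
 * handler and the deferred worker calls this before reading or writing
 * dev->blocked, so all of them observe a synchronized device. */
static void my_dev_sync(struct my_dev *dev)
{
        lockdep_assert_held(&state_mutex);

        if (!dev->need_sync)
                return;

        dev->blocked = global_blocked;
        dev->need_sync = false;
}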
1 : 0); } @@ -751,6 +767,7 @@ static ssize_t soft_store(struct device *dev, struct device_attribute *attr, return -EINVAL; mutex_lock(&rfkill_global_mutex); + rfkill_sync(rfkill); rfkill_set_block(rfkill, state); mutex_unlock(&rfkill_global_mutex); @@ -783,6 +800,10 @@ static ssize_t state_show(struct device *dev, struct device_attribute *attr, { struct rfkill *rfkill = to_rfkill(dev); + mutex_lock(&rfkill_global_mutex); + rfkill_sync(rfkill); + mutex_unlock(&rfkill_global_mutex); + return sysfs_emit(buf, "%d\n", user_state_from_blocked(rfkill->state)); } @@ -805,6 +826,7 @@ static ssize_t state_store(struct device *dev, struct device_attribute *attr, return -EINVAL; mutex_lock(&rfkill_global_mutex); + rfkill_sync(rfkill); rfkill_set_block(rfkill, state == RFKILL_USER_STATE_SOFT_BLOCKED); mutex_unlock(&rfkill_global_mutex); @@ -1032,14 +1054,10 @@ static void rfkill_uevent_work(struct work_struct *work) static void rfkill_sync_work(struct work_struct *work) { - struct rfkill *rfkill; - bool cur; - - rfkill = container_of(work, struct rfkill, sync_work); + struct rfkill *rfkill = container_of(work, struct rfkill, sync_work); mutex_lock(&rfkill_global_mutex); - cur = rfkill_global_states[rfkill->type].cur; - rfkill_set_block(rfkill, cur); + rfkill_sync(rfkill); mutex_unlock(&rfkill_global_mutex); } @@ -1087,6 +1105,7 @@ int __must_check rfkill_register(struct rfkill *rfkill) round_jiffies_relative(POLL_INTERVAL)); if (!rfkill->persistent || rfkill_epo_lock_active) { + rfkill->need_sync = true; schedule_work(&rfkill->sync_work); } else { #ifdef CONFIG_RFKILL_INPUT @@ -1171,6 +1190,7 @@ static int rfkill_fop_open(struct inode *inode, struct file *file) ev = kzalloc(sizeof(*ev), GFP_KERNEL); if (!ev) goto free; + rfkill_sync(rfkill); rfkill_fill_event(&ev->ev, rfkill, RFKILL_OP_ADD); list_add_tail(&ev->list, &data->events); } diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index da4c179a4d41..6663e971a13e 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -366,7 +366,7 @@ static int u32_init(struct tcf_proto *tp) idr_init(&root_ht->handle_idr); if (tp_c == NULL) { - tp_c = kzalloc(struct_size(tp_c, hlist->ht, 1), GFP_KERNEL); + tp_c = kzalloc(sizeof(*tp_c), GFP_KERNEL); if (tp_c == NULL) { kfree(root_ht); return -ENOBUFS; diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 796529167e8d..c45c192b7878 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1159,8 +1159,7 @@ int sctp_assoc_update(struct sctp_association *asoc, /* Add any peer addresses from the new association. 
*/ list_for_each_entry(trans, &new->peer.transport_addr_list, transports) - if (!sctp_assoc_lookup_paddr(asoc, &trans->ipaddr) && - !sctp_assoc_add_peer(asoc, &trans->ipaddr, + if (!sctp_assoc_add_peer(asoc, &trans->ipaddr, GFP_ATOMIC, trans->state)) return -ENOMEM; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index ab943e8fb1db..7f89e43154c0 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2450,6 +2450,7 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, if (trans) { trans->hbinterval = msecs_to_jiffies(params->spp_hbinterval); + sctp_transport_reset_hb_timer(trans); } else if (asoc) { asoc->hbinterval = msecs_to_jiffies(params->spp_hbinterval); diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index a7a9136198fd..f65d6f92afcb 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -612,7 +612,9 @@ int sctp_sysctl_net_register(struct net *net) table[SCTP_PF_RETRANS_IDX].extra2 = &net->sctp.ps_retrans; table[SCTP_PS_RETRANS_IDX].extra1 = &net->sctp.pf_retrans; - net->sctp.sysctl_header = register_net_sysctl(net, "net/sctp", table); + net->sctp.sysctl_header = register_net_sysctl_sz(net, "net/sctp", + table, + ARRAY_SIZE(sctp_net_table)); if (net->sctp.sysctl_header == NULL) { kfree(table); return -ENOMEM; diff --git a/net/smc/Kconfig b/net/smc/Kconfig index 1ab3c5a2c5ad..746be3996768 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -2,6 +2,7 @@ config SMC tristate "SMC socket protocol family" depends on INET && INFINIBAND + depends on m || ISM != m help SMC-R provides a "sockets over RDMA" solution making use of RDMA over Converged Ethernet (RoCE) technology to upgrade diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index bacdd971615e..35ddebae8894 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1201,6 +1201,7 @@ static int smc_connect_rdma_v2_prepare(struct smc_sock *smc, (struct smc_clc_msg_accept_confirm_v2 *)aclc; struct smc_clc_first_contact_ext *fce = smc_get_clc_first_contact_ext(clc_v2, false); + struct net *net = sock_net(&smc->sk); int rc; if (!ini->first_contact_peer || aclc->hdr.version == SMC_V1) @@ -1210,7 +1211,7 @@ static int smc_connect_rdma_v2_prepare(struct smc_sock *smc, memcpy(ini->smcrv2.nexthop_mac, &aclc->r0.lcl.mac, ETH_ALEN); ini->smcrv2.uses_gateway = false; } else { - if (smc_ib_find_route(smc->clcsock->sk->sk_rcv_saddr, + if (smc_ib_find_route(net, smc->clcsock->sk->sk_rcv_saddr, smc_ib_gid_to_ipv4(aclc->r0.lcl.gid), ini->smcrv2.nexthop_mac, &ini->smcrv2.uses_gateway)) @@ -2361,7 +2362,7 @@ static int smc_listen_find_device(struct smc_sock *new_smc, smc_find_ism_store_rc(rc, ini); return (!rc) ? 
0 : ini->rc; } - return SMC_CLC_DECL_NOSMCDEV; + return prfx_rc; } /* listen worker: finish RDMA setup */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index bd01dd31e4bd..d520ee62c8ec 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1662,6 +1662,7 @@ void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport) { struct smc_link_group *lgr, *n; + spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) { struct smc_link *link; @@ -1680,6 +1681,7 @@ void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport) if (link) smc_llc_add_link_local(link); } + spin_unlock_bh(&smc_lgr_list.lock); } /* link is down - switch connections to alternate link, diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 9b66d6aeeb1a..89981dbe46c9 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -193,7 +193,7 @@ bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport) return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE; } -int smc_ib_find_route(__be32 saddr, __be32 daddr, +int smc_ib_find_route(struct net *net, __be32 saddr, __be32 daddr, u8 nexthop_mac[], u8 *uses_gateway) { struct neighbour *neigh = NULL; @@ -205,7 +205,7 @@ int smc_ib_find_route(__be32 saddr, __be32 daddr, if (daddr == cpu_to_be32(INADDR_NONE)) goto out; - rt = ip_route_output_flow(&init_net, &fl4, NULL); + rt = ip_route_output_flow(net, &fl4, NULL); if (IS_ERR(rt)) goto out; if (rt->rt_uses_gateway && rt->rt_gw_family != AF_INET) @@ -235,6 +235,7 @@ static int smc_ib_determine_gid_rcu(const struct net_device *ndev, if (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP && smc_ib_gid_to_ipv4((u8 *)&attr->gid) != cpu_to_be32(INADDR_NONE)) { struct in_device *in_dev = __in_dev_get_rcu(ndev); + struct net *net = dev_net(ndev); const struct in_ifaddr *ifa; bool subnet_match = false; @@ -248,7 +249,7 @@ static int smc_ib_determine_gid_rcu(const struct net_device *ndev, } if (!subnet_match) goto out; - if (smcrv2->daddr && smc_ib_find_route(smcrv2->saddr, + if (smcrv2->daddr && smc_ib_find_route(net, smcrv2->saddr, smcrv2->daddr, smcrv2->nexthop_mac, &smcrv2->uses_gateway)) diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 4df5f8c8a0a1..ef8ac2b7546d 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -112,7 +112,7 @@ void smc_ib_sync_sg_for_device(struct smc_link *lnk, int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, unsigned short vlan_id, u8 gid[], u8 *sgid_index, struct smc_init_info_smcrv2 *smcrv2); -int smc_ib_find_route(__be32 saddr, __be32 daddr, +int smc_ib_find_route(struct net *net, __be32 saddr, __be32 daddr, u8 nexthop_mac[], u8 *uses_gateway); bool smc_ib_is_valid_local_systemid(void); int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb); diff --git a/net/smc/smc_stats.h b/net/smc/smc_stats.h index b60fe1eb37ab..9d32058db2b5 100644 --- a/net/smc/smc_stats.h +++ b/net/smc/smc_stats.h @@ -92,13 +92,14 @@ do { \ typeof(_smc_stats) stats = (_smc_stats); \ typeof(_tech) t = (_tech); \ typeof(_len) l = (_len); \ - int _pos = fls64((l) >> 13); \ + int _pos; \ typeof(_rc) r = (_rc); \ int m = SMC_BUF_MAX - 1; \ this_cpu_inc((*stats).smc[t].key ## _cnt); \ - if (r <= 0) \ + if (r <= 0 || l <= 0) \ break; \ - _pos = (_pos < m) ? ((l == 1 << (_pos + 12)) ? _pos - 1 : _pos) : m; \ + _pos = fls64((l - 1) >> 13); \ + _pos = (_pos <= m) ? 
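The smc_ib_find_route() signature change fixes a classic namespace bug: the route lookup previously ran in init_net no matter which namespace the socket lived in. The general rule, sketched below with an illustrative helper, is to derive the namespace from the socket or device at hand rather than hard-coding init_net:

#include <net/route.h>
#include <net/sock.h>

/* Resolve an IPv4 route in the namespace the socket belongs to.
 * Returns the rtable or an ERR_PTR() on failure. */
struct rtable *route_for_sock(struct sock *sk, __be32 saddr, __be32 daddr)
{
        struct flowi4 fl4 = {
                .saddr = saddr,
                .daddr = daddr,
        };

        return ip_route_output_flow(sock_net(sk), &fl4, sk);
}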
_pos : m; \ this_cpu_inc((*stats).smc[t].key ## _pd.buf[_pos]); \ this_cpu_add((*stats).smc[t].key ## _bytes, r); \ } \ @@ -138,9 +139,12 @@ while (0) do { \ typeof(_len) _l = (_len); \ typeof(_tech) t = (_tech); \ - int _pos = fls((_l) >> 13); \ + int _pos; \ int m = SMC_BUF_MAX - 1; \ - _pos = (_pos < m) ? ((_l == 1 << (_pos + 12)) ? _pos - 1 : _pos) : m; \ + if (_l <= 0) \ + break; \ + _pos = fls((_l - 1) >> 13); \ + _pos = (_pos <= m) ? _pos : m; \ this_cpu_inc((*(_smc_stats)).smc[t].k ## _rmbsize.buf[_pos]); \ } \ while (0) @@ -243,8 +247,9 @@ while (0) #define SMC_STAT_SERV_SUCC_INC(net, _ini) \ do { \ typeof(_ini) i = (_ini); \ - bool is_v2 = (i->smcd_version & SMC_V2); \ bool is_smcd = (i->is_smcd); \ + u8 version = is_smcd ? i->smcd_version : i->smcr_version; \ + bool is_v2 = (version & SMC_V2); \ typeof(net->smc.smc_stats) smc_stats = (net)->smc.smc_stats; \ if (is_v2 && is_smcd) \ this_cpu_inc(smc_stats->smc[SMC_TYPE_D].srv_v2_succ_cnt); \ diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 0b2a957ca5f5..5cbc18c6e62b 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -87,7 +87,8 @@ int __net_init smc_sysctl_net_init(struct net *net) table[i].data += (void *)net - (void *)&init_net; } - net->smc.smc_hdr = register_net_sysctl(net, "net/smc", table); + net->smc.smc_hdr = register_net_sysctl_sz(net, "net/smc", table, + ARRAY_SIZE(smc_table)); if (!net->smc.smc_hdr) goto err_reg; diff --git a/net/socket.c b/net/socket.c index 928b05811cfd..c4a6f5532955 100644 --- a/net/socket.c +++ b/net/socket.c @@ -88,6 +88,7 @@ #include <linux/xattr.h> #include <linux/nospec.h> #include <linux/indirect_call_wrapper.h> +#include <linux/io_uring.h> #include <linux/uaccess.h> #include <asm/unistd.h> @@ -160,6 +161,7 @@ static const struct file_operations socket_file_ops = { #ifdef CONFIG_COMPAT .compat_ioctl = compat_sock_ioctl, #endif + .uring_cmd = io_uring_cmd_sock, .mmap = sock_mmap, .release = sock_close, .fasync = sock_fasync, @@ -735,6 +737,14 @@ static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) return ret; } +static int __sock_sendmsg(struct socket *sock, struct msghdr *msg) +{ + int err = security_socket_sendmsg(sock, msg, + msg_data_left(msg)); + + return err ?: sock_sendmsg_nosec(sock, msg); +} + /** * sock_sendmsg - send a message through @sock * @sock: socket @@ -745,10 +755,19 @@ static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) */ int sock_sendmsg(struct socket *sock, struct msghdr *msg) { - int err = security_socket_sendmsg(sock, msg, - msg_data_left(msg)); + struct sockaddr_storage *save_addr = (struct sockaddr_storage *)msg->msg_name; + struct sockaddr_storage address; + int ret; - return err ?: sock_sendmsg_nosec(sock, msg); + if (msg->msg_name) { + memcpy(&address, msg->msg_name, msg->msg_namelen); + msg->msg_name = &address; + } + + ret = __sock_sendmsg(sock, msg); + msg->msg_name = save_addr; + + return ret; } EXPORT_SYMBOL(sock_sendmsg); @@ -1136,7 +1155,7 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) if (sock->type == SOCK_SEQPACKET) msg.msg_flags |= MSG_EOR; - res = sock_sendmsg(sock, &msg); + res = __sock_sendmsg(sock, &msg); *from = msg.msg_iter; return res; } @@ -2172,7 +2191,7 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; msg.msg_flags = flags; - err = sock_sendmsg(sock, &msg); + err = __sock_sendmsg(sock, &msg); out_put: fput_light(sock->file, fput_needed); @@ -2536,7 
+2555,7 @@ static int ____sys_sendmsg(struct socket *sock, struct msghdr *msg_sys, err = sock_sendmsg_nosec(sock, msg_sys); goto out_freectl; } - err = sock_sendmsg(sock, msg_sys); + err = __sock_sendmsg(sock, msg_sys); /* * If this is sendmmsg() and sending to current destination address was * successful, remember it. @@ -3497,7 +3516,12 @@ static long compat_sock_ioctl(struct file *file, unsigned int cmd, int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen) { - return READ_ONCE(sock->ops)->bind(sock, addr, addrlen); + struct sockaddr_storage address; + + memcpy(&address, addr, addrlen); + + return READ_ONCE(sock->ops)->bind(sock, (struct sockaddr *)&address, + addrlen); } EXPORT_SYMBOL(kernel_bind); diff --git a/net/sunrpc/.kunitconfig b/net/sunrpc/.kunitconfig index a55a00fa649b..eb02b906c295 100644 --- a/net/sunrpc/.kunitconfig +++ b/net/sunrpc/.kunitconfig @@ -23,7 +23,6 @@ CONFIG_NFS_FS=y CONFIG_SUNRPC=y CONFIG_SUNRPC_GSS=y CONFIG_RPCSEC_GSS_KRB5=y -CONFIG_RPCSEC_GSS_KRB5_ENCTYPES_DES=y CONFIG_RPCSEC_GSS_KRB5_ENCTYPES_AES_SHA1=y CONFIG_RPCSEC_GSS_KRB5_ENCTYPES_CAMELLIA=y CONFIG_RPCSEC_GSS_KRB5_ENCTYPES_AES_SHA2=y diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index 4afc5fd71d44..2d8b67dac7b5 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -34,38 +34,6 @@ config RPCSEC_GSS_KRB5 If unsure, say Y. -config RPCSEC_GSS_KRB5_SIMPLIFIED - bool - depends on RPCSEC_GSS_KRB5 - -config RPCSEC_GSS_KRB5_CRYPTOSYSTEM - bool - depends on RPCSEC_GSS_KRB5 - -config RPCSEC_GSS_KRB5_ENCTYPES_DES - bool "Enable Kerberos enctypes based on DES (deprecated)" - depends on RPCSEC_GSS_KRB5 - depends on CRYPTO_CBC && CRYPTO_CTS && CRYPTO_ECB - depends on CRYPTO_HMAC && CRYPTO_MD5 && CRYPTO_SHA1 - depends on CRYPTO_DES - default n - select RPCSEC_GSS_KRB5_SIMPLIFIED - help - Choose Y to enable the use of deprecated Kerberos 5 - encryption types that utilize Data Encryption Standard - (DES) based ciphers. These include des-cbc-md5, - des-cbc-crc, and des-cbc-md4, which were deprecated by - RFC 6649, and des3-cbc-sha1, which was deprecated by RFC - 8429. - - These encryption types are known to be insecure, therefore - the default setting of this option is N. Support for these - encryption types is available only for compatibility with - legacy NFS client and server implementations. - - Removal of support is planned for a subsequent kernel - release. 
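The kernel_bind() change, like the sock_sendmsg() msg_name handling earlier in the same file, rests on one defensive idea: copy the caller's address into kernel-private storage before handing it down, so nothing in the lower layers (an attached hook rewriting the address, for instance) can scribble on the caller's buffer. A sketch under a hypothetical name, with an explicit length check that the real callers' contract leaves implicit:

#include <linux/errno.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/string.h>

int my_kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen)
{
        struct sockaddr_storage address;

        if (addrlen < 0 || addrlen > sizeof(address))
                return -EINVAL;

        /* Work on a stack copy; the caller's buffer stays untouched. */
        memcpy(&address, addr, addrlen);

        return READ_ONCE(sock->ops)->bind(sock, (struct sockaddr *)&address,
                                          addrlen);
}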
- config RPCSEC_GSS_KRB5_ENCTYPES_AES_SHA1 bool "Enable Kerberos enctypes based on AES and SHA-1" depends on RPCSEC_GSS_KRB5 @@ -73,7 +41,6 @@ config RPCSEC_GSS_KRB5_ENCTYPES_AES_SHA1 depends on CRYPTO_HMAC && CRYPTO_SHA1 depends on CRYPTO_AES default y - select RPCSEC_GSS_KRB5_CRYPTOSYSTEM help Choose Y to enable the use of Kerberos 5 encryption types that utilize Advanced Encryption Standard (AES) ciphers and @@ -86,7 +53,6 @@ config RPCSEC_GSS_KRB5_ENCTYPES_CAMELLIA depends on CRYPTO_CBC && CRYPTO_CTS && CRYPTO_CAMELLIA depends on CRYPTO_CMAC default n - select RPCSEC_GSS_KRB5_CRYPTOSYSTEM help Choose Y to enable the use of Kerberos 5 encryption types that utilize Camellia ciphers (RFC 3713) and CMAC digests @@ -100,7 +66,6 @@ config RPCSEC_GSS_KRB5_ENCTYPES_AES_SHA2 depends on CRYPTO_HMAC && CRYPTO_SHA256 && CRYPTO_SHA512 depends on CRYPTO_AES default n - select RPCSEC_GSS_KRB5_CRYPTOSYSTEM help Choose Y to enable the use of Kerberos 5 encryption types that utilize Advanced Encryption Standard (AES) ciphers and diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 2f16f9d17966..814b0169f972 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -769,9 +769,14 @@ int rpcauth_wrap_req(struct rpc_task *task, struct xdr_stream *xdr) * @task: controlling RPC task * @xdr: xdr_stream containing RPC Reply header * - * On success, @xdr is updated to point past the verifier and - * zero is returned. Otherwise, @xdr is in an undefined state - * and a negative errno is returned. + * Return values: + * %0: Verifier is valid. @xdr now points past the verifier. + * %-EIO: Verifier is corrupted or message ended early. + * %-EACCES: Verifier is intact but not valid. + * %-EPROTONOSUPPORT: Server does not support the requested auth type. + * + * When a negative errno is returned, @xdr is left in an undefined + * state. */ int rpcauth_checkverf(struct rpc_task *task, struct xdr_stream *xdr) diff --git a/net/sunrpc/auth_gss/Makefile b/net/sunrpc/auth_gss/Makefile index 012ae1720689..ad1736d93b76 100644 --- a/net/sunrpc/auth_gss/Makefile +++ b/net/sunrpc/auth_gss/Makefile @@ -12,6 +12,6 @@ auth_rpcgss-y := auth_gss.o gss_generic_token.o \ obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o rpcsec_gss_krb5-y := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \ - gss_krb5_seqnum.o gss_krb5_wrap.o gss_krb5_crypto.o gss_krb5_keys.o + gss_krb5_wrap.o gss_krb5_crypto.o gss_krb5_keys.o obj-$(CONFIG_RPCSEC_GSS_KRB5_KUNIT_TEST) += gss_krb5_test.o diff --git a/net/sunrpc/auth_gss/gss_krb5_internal.h b/net/sunrpc/auth_gss/gss_krb5_internal.h index b673e2626acb..3afd4065bf3d 100644 --- a/net/sunrpc/auth_gss/gss_krb5_internal.h +++ b/net/sunrpc/auth_gss/gss_krb5_internal.h @@ -33,7 +33,6 @@ struct gss_krb5_enctype { const u32 Ke_length; /* encryption subkey length, in octets */ const u32 Ki_length; /* integrity subkey length, in octets */ - int (*import_ctx)(struct krb5_ctx *ctx, gfp_t gfp_mask); int (*derive_key)(const struct gss_krb5_enctype *gk5e, const struct xdr_netobj *in, struct xdr_netobj *out, @@ -85,24 +84,15 @@ struct krb5_ctx { * GSS Kerberos 5 mechanism Per-Message calls. 
*/ -u32 gss_krb5_get_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *text, - struct xdr_netobj *token); u32 gss_krb5_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text, struct xdr_netobj *token); -u32 gss_krb5_verify_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *message_buffer, - struct xdr_netobj *read_token); u32 gss_krb5_verify_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *message_buffer, struct xdr_netobj *read_token); -u32 gss_krb5_wrap_v1(struct krb5_ctx *kctx, int offset, - struct xdr_buf *buf, struct page **pages); u32 gss_krb5_wrap_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf, struct page **pages); -u32 gss_krb5_unwrap_v1(struct krb5_ctx *kctx, int offset, int len, - struct xdr_buf *buf, unsigned int *slack, - unsigned int *align); u32 gss_krb5_unwrap_v2(struct krb5_ctx *kctx, int offset, int len, struct xdr_buf *buf, unsigned int *slack, unsigned int *align); @@ -113,12 +103,6 @@ u32 gss_krb5_unwrap_v2(struct krb5_ctx *kctx, int offset, int len, /* Key Derivation Functions */ -int krb5_derive_key_v1(const struct gss_krb5_enctype *gk5e, - const struct xdr_netobj *inkey, - struct xdr_netobj *outkey, - const struct xdr_netobj *label, - gfp_t gfp_mask); - int krb5_derive_key_v2(const struct gss_krb5_enctype *gk5e, const struct xdr_netobj *inkey, struct xdr_netobj *outkey, @@ -169,13 +153,6 @@ static inline int krb5_derive_key(struct krb5_ctx *kctx, return gk5e->derive_key(gk5e, inkey, outkey, &label, gfp_mask); } -s32 krb5_make_seq_num(struct krb5_ctx *kctx, struct crypto_sync_skcipher *key, - int direction, u32 seqnum, unsigned char *cksum, - unsigned char *buf); - -s32 krb5_get_seq_num(struct krb5_ctx *kctx, unsigned char *cksum, - unsigned char *buf, int *direction, u32 *seqnum); - void krb5_make_confounder(u8 *p, int conflen); u32 make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen, diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c index 5347fe1cc93f..06d8ee0db000 100644 --- a/net/sunrpc/auth_gss/gss_krb5_keys.c +++ b/net/sunrpc/auth_gss/gss_krb5_keys.c @@ -222,90 +222,6 @@ err_return: return ret; } -#define smask(step) ((1<<step)-1) -#define pstep(x, step) (((x)&smask(step))^(((x)>>step)&smask(step))) -#define parity_char(x) pstep(pstep(pstep((x), 4), 2), 1) - -static void mit_des_fixup_key_parity(u8 key[8]) -{ - int i; - for (i = 0; i < 8; i++) { - key[i] &= 0xfe; - key[i] |= 1^parity_char(key[i]); - } -} - -static int krb5_random_to_key_v1(const struct gss_krb5_enctype *gk5e, - struct xdr_netobj *randombits, - struct xdr_netobj *key) -{ - int i, ret = -EINVAL; - - if (key->len != 24) { - dprintk("%s: key->len is %d\n", __func__, key->len); - goto err_out; - } - if (randombits->len != 21) { - dprintk("%s: randombits->len is %d\n", - __func__, randombits->len); - goto err_out; - } - - /* take the seven bytes, move them around into the top 7 bits of the - 8 key bytes, then compute the parity bits. Do this three times. 
*/ - - for (i = 0; i < 3; i++) { - memcpy(key->data + i*8, randombits->data + i*7, 7); - key->data[i*8+7] = (((key->data[i*8]&1)<<1) | - ((key->data[i*8+1]&1)<<2) | - ((key->data[i*8+2]&1)<<3) | - ((key->data[i*8+3]&1)<<4) | - ((key->data[i*8+4]&1)<<5) | - ((key->data[i*8+5]&1)<<6) | - ((key->data[i*8+6]&1)<<7)); - - mit_des_fixup_key_parity(key->data + i*8); - } - ret = 0; -err_out: - return ret; -} - -/** - * krb5_derive_key_v1 - Derive a subkey for an RFC 3961 enctype - * @gk5e: Kerberos 5 enctype profile - * @inkey: base protocol key - * @outkey: OUT: derived key - * @label: subkey usage label - * @gfp_mask: memory allocation control flags - * - * Caller sets @outkey->len to the desired length of the derived key. - * - * On success, returns 0 and fills in @outkey. A negative errno value - * is returned on failure. - */ -int krb5_derive_key_v1(const struct gss_krb5_enctype *gk5e, - const struct xdr_netobj *inkey, - struct xdr_netobj *outkey, - const struct xdr_netobj *label, - gfp_t gfp_mask) -{ - struct xdr_netobj inblock; - int ret; - - inblock.len = gk5e->keybytes; - inblock.data = kmalloc(inblock.len, gfp_mask); - if (!inblock.data) - return -ENOMEM; - - ret = krb5_DK(gk5e, inkey, inblock.data, label, gfp_mask); - if (!ret) - ret = krb5_random_to_key_v1(gk5e, &inblock, outkey); - - kfree_sensitive(inblock.data); - return ret; -} - /* * This is the identity function, with some sanity checking. */ diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 20e21d08badb..e31cfdf7eadc 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -30,61 +30,7 @@ static struct gss_api_mech gss_kerberos_mech; -#if defined(CONFIG_RPCSEC_GSS_KRB5_SIMPLIFIED) -static int gss_krb5_import_ctx_des(struct krb5_ctx *ctx, gfp_t gfp_mask); -static int gss_krb5_import_ctx_v1(struct krb5_ctx *ctx, gfp_t gfp_mask); -#endif -#if defined(CONFIG_RPCSEC_GSS_KRB5_CRYPTOSYSTEM) -static int gss_krb5_import_ctx_v2(struct krb5_ctx *ctx, gfp_t gfp_mask); -#endif - static const struct gss_krb5_enctype supported_gss_krb5_enctypes[] = { -#if defined(CONFIG_RPCSEC_GSS_KRB5_ENCTYPES_DES) - /* - * DES (All DES enctypes are mapped to the same gss functionality) - */ - { - .etype = ENCTYPE_DES_CBC_RAW, - .ctype = CKSUMTYPE_RSA_MD5, - .name = "des-cbc-crc", - .encrypt_name = "cbc(des)", - .cksum_name = "md5", - .import_ctx = gss_krb5_import_ctx_des, - .get_mic = gss_krb5_get_mic_v1, - .verify_mic = gss_krb5_verify_mic_v1, - .wrap = gss_krb5_wrap_v1, - .unwrap = gss_krb5_unwrap_v1, - .signalg = SGN_ALG_DES_MAC_MD5, - .sealalg = SEAL_ALG_DES, - .keybytes = 7, - .keylength = 8, - .cksumlength = 8, - .keyed_cksum = 0, - }, - /* - * 3DES - */ - { - .etype = ENCTYPE_DES3_CBC_RAW, - .ctype = CKSUMTYPE_HMAC_SHA1_DES3, - .name = "des3-hmac-sha1", - .encrypt_name = "cbc(des3_ede)", - .cksum_name = "hmac(sha1)", - .import_ctx = gss_krb5_import_ctx_v1, - .derive_key = krb5_derive_key_v1, - .get_mic = gss_krb5_get_mic_v1, - .verify_mic = gss_krb5_verify_mic_v1, - .wrap = gss_krb5_wrap_v1, - .unwrap = gss_krb5_unwrap_v1, - .signalg = SGN_ALG_HMAC_SHA1_DES3_KD, - .sealalg = SEAL_ALG_DES3KD, - .keybytes = 21, - .keylength = 24, - .cksumlength = 20, - .keyed_cksum = 1, - }, -#endif - #if defined(CONFIG_RPCSEC_GSS_KRB5_ENCTYPES_AES_SHA1) /* * AES-128 with SHA-1 (RFC 3962) @@ -96,7 +42,6 @@ static const struct gss_krb5_enctype supported_gss_krb5_enctypes[] = { .encrypt_name = "cts(cbc(aes))", .aux_cipher = "cbc(aes)", .cksum_name = "hmac(sha1)", - .import_ctx = 
gss_krb5_import_ctx_v2, .derive_key = krb5_derive_key_v2, .encrypt = gss_krb5_aes_encrypt, .decrypt = gss_krb5_aes_decrypt, @@ -126,7 +71,6 @@ static const struct gss_krb5_enctype supported_gss_krb5_enctypes[] = { .encrypt_name = "cts(cbc(aes))", .aux_cipher = "cbc(aes)", .cksum_name = "hmac(sha1)", - .import_ctx = gss_krb5_import_ctx_v2, .derive_key = krb5_derive_key_v2, .encrypt = gss_krb5_aes_encrypt, .decrypt = gss_krb5_aes_decrypt, @@ -166,7 +110,6 @@ static const struct gss_krb5_enctype supported_gss_krb5_enctypes[] = { .Ke_length = BITS2OCTETS(128), .Ki_length = BITS2OCTETS(128), - .import_ctx = gss_krb5_import_ctx_v2, .derive_key = krb5_kdf_feedback_cmac, .encrypt = gss_krb5_aes_encrypt, .decrypt = gss_krb5_aes_decrypt, @@ -193,7 +136,6 @@ static const struct gss_krb5_enctype supported_gss_krb5_enctypes[] = { .Ke_length = BITS2OCTETS(256), .Ki_length = BITS2OCTETS(256), - .import_ctx = gss_krb5_import_ctx_v2, .derive_key = krb5_kdf_feedback_cmac, .encrypt = gss_krb5_aes_encrypt, .decrypt = gss_krb5_aes_decrypt, @@ -223,7 +165,6 @@ static const struct gss_krb5_enctype supported_gss_krb5_enctypes[] = { .Ke_length = BITS2OCTETS(128), .Ki_length = BITS2OCTETS(128), - .import_ctx = gss_krb5_import_ctx_v2, .derive_key = krb5_kdf_hmac_sha2, .encrypt = krb5_etm_encrypt, .decrypt = krb5_etm_decrypt, @@ -250,7 +191,6 @@ static const struct gss_krb5_enctype supported_gss_krb5_enctypes[] = { .Ke_length = BITS2OCTETS(256), .Ki_length = BITS2OCTETS(192), - .import_ctx = gss_krb5_import_ctx_v2, .derive_key = krb5_kdf_hmac_sha2, .encrypt = krb5_etm_encrypt, .decrypt = krb5_etm_decrypt, @@ -284,12 +224,6 @@ static void gss_krb5_prepare_enctype_priority_list(void) ENCTYPE_AES256_CTS_HMAC_SHA1_96, ENCTYPE_AES128_CTS_HMAC_SHA1_96, #endif -#if defined(CONFIG_RPCSEC_GSS_KRB5_ENCTYPES_DES) - ENCTYPE_DES3_CBC_SHA1, - ENCTYPE_DES_CBC_MD5, - ENCTYPE_DES_CBC_CRC, - ENCTYPE_DES_CBC_MD4, -#endif }; size_t total, i; char buf[16]; @@ -330,185 +264,6 @@ const struct gss_krb5_enctype *gss_krb5_lookup_enctype(u32 etype) EXPORT_SYMBOL_IF_KUNIT(gss_krb5_lookup_enctype); static struct crypto_sync_skcipher * -gss_krb5_alloc_cipher_v1(struct krb5_ctx *ctx, struct xdr_netobj *key) -{ - struct crypto_sync_skcipher *tfm; - - tfm = crypto_alloc_sync_skcipher(ctx->gk5e->encrypt_name, 0, 0); - if (IS_ERR(tfm)) - return NULL; - if (crypto_sync_skcipher_setkey(tfm, key->data, key->len)) { - crypto_free_sync_skcipher(tfm); - return NULL; - } - return tfm; -} - -static inline const void * -get_key(const void *p, const void *end, - struct krb5_ctx *ctx, struct crypto_sync_skcipher **res) -{ - struct crypto_sync_skcipher *tfm; - struct xdr_netobj key; - int alg; - - p = simple_get_bytes(p, end, &alg, sizeof(alg)); - if (IS_ERR(p)) - goto out_err; - switch (alg) { - case ENCTYPE_DES_CBC_CRC: - case ENCTYPE_DES_CBC_MD4: - case ENCTYPE_DES_CBC_MD5: - /* Map all these key types to ENCTYPE_DES_CBC_RAW */ - alg = ENCTYPE_DES_CBC_RAW; - break; - } - if (!gss_krb5_lookup_enctype(alg)) { - pr_warn("gss_krb5: unsupported enctype: %d\n", alg); - goto out_err_inval; - } - - p = simple_get_netobj(p, end, &key); - if (IS_ERR(p)) - goto out_err; - tfm = gss_krb5_alloc_cipher_v1(ctx, &key); - kfree(key.data); - if (!tfm) { - pr_warn("gss_krb5: failed to initialize cipher '%s'\n", - ctx->gk5e->encrypt_name); - goto out_err_inval; - } - *res = tfm; - - return p; - -out_err_inval: - p = ERR_PTR(-EINVAL); -out_err: - return p; -} - -static int -gss_import_v1_context(const void *p, const void *end, struct krb5_ctx *ctx) -{ - u32 seq_send; - int tmp; - 
u32 time32; - - p = simple_get_bytes(p, end, &ctx->initiate, sizeof(ctx->initiate)); - if (IS_ERR(p)) - goto out_err; - - /* Old format supports only DES! Any other enctype uses new format */ - ctx->enctype = ENCTYPE_DES_CBC_RAW; - - ctx->gk5e = gss_krb5_lookup_enctype(ctx->enctype); - if (ctx->gk5e == NULL) { - p = ERR_PTR(-EINVAL); - goto out_err; - } - - /* The downcall format was designed before we completely understood - * the uses of the context fields; so it includes some stuff we - * just give some minimal sanity-checking, and some we ignore - * completely (like the next twenty bytes): */ - if (unlikely(p + 20 > end || p + 20 < p)) { - p = ERR_PTR(-EFAULT); - goto out_err; - } - p += 20; - p = simple_get_bytes(p, end, &tmp, sizeof(tmp)); - if (IS_ERR(p)) - goto out_err; - if (tmp != SGN_ALG_DES_MAC_MD5) { - p = ERR_PTR(-ENOSYS); - goto out_err; - } - p = simple_get_bytes(p, end, &tmp, sizeof(tmp)); - if (IS_ERR(p)) - goto out_err; - if (tmp != SEAL_ALG_DES) { - p = ERR_PTR(-ENOSYS); - goto out_err; - } - p = simple_get_bytes(p, end, &time32, sizeof(time32)); - if (IS_ERR(p)) - goto out_err; - /* unsigned 32-bit time overflows in year 2106 */ - ctx->endtime = (time64_t)time32; - p = simple_get_bytes(p, end, &seq_send, sizeof(seq_send)); - if (IS_ERR(p)) - goto out_err; - atomic_set(&ctx->seq_send, seq_send); - p = simple_get_netobj(p, end, &ctx->mech_used); - if (IS_ERR(p)) - goto out_err; - p = get_key(p, end, ctx, &ctx->enc); - if (IS_ERR(p)) - goto out_err_free_mech; - p = get_key(p, end, ctx, &ctx->seq); - if (IS_ERR(p)) - goto out_err_free_key1; - if (p != end) { - p = ERR_PTR(-EFAULT); - goto out_err_free_key2; - } - - return 0; - -out_err_free_key2: - crypto_free_sync_skcipher(ctx->seq); -out_err_free_key1: - crypto_free_sync_skcipher(ctx->enc); -out_err_free_mech: - kfree(ctx->mech_used.data); -out_err: - return PTR_ERR(p); -} - -#if defined(CONFIG_RPCSEC_GSS_KRB5_SIMPLIFIED) -static int -gss_krb5_import_ctx_des(struct krb5_ctx *ctx, gfp_t gfp_mask) -{ - return -EINVAL; -} - -static int -gss_krb5_import_ctx_v1(struct krb5_ctx *ctx, gfp_t gfp_mask) -{ - struct xdr_netobj keyin, keyout; - - keyin.data = ctx->Ksess; - keyin.len = ctx->gk5e->keylength; - - ctx->seq = gss_krb5_alloc_cipher_v1(ctx, &keyin); - if (ctx->seq == NULL) - goto out_err; - ctx->enc = gss_krb5_alloc_cipher_v1(ctx, &keyin); - if (ctx->enc == NULL) - goto out_free_seq; - - /* derive cksum */ - keyout.data = ctx->cksum; - keyout.len = ctx->gk5e->keylength; - if (krb5_derive_key(ctx, &keyin, &keyout, KG_USAGE_SIGN, - KEY_USAGE_SEED_CHECKSUM, gfp_mask)) - goto out_free_enc; - - return 0; - -out_free_enc: - crypto_free_sync_skcipher(ctx->enc); -out_free_seq: - crypto_free_sync_skcipher(ctx->seq); -out_err: - return -EINVAL; -} -#endif - -#if defined(CONFIG_RPCSEC_GSS_KRB5_CRYPTOSYSTEM) - -static struct crypto_sync_skcipher * gss_krb5_alloc_cipher_v2(const char *cname, const struct xdr_netobj *key) { struct crypto_sync_skcipher *tfm; @@ -636,8 +391,6 @@ out_free: goto out; } -#endif - static int gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx, gfp_t gfp_mask) @@ -671,9 +424,6 @@ gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx, p = simple_get_bytes(p, end, &ctx->enctype, sizeof(ctx->enctype)); if (IS_ERR(p)) goto out_err; - /* Map ENCTYPE_DES3_CBC_SHA1 to ENCTYPE_DES3_CBC_RAW */ - if (ctx->enctype == ENCTYPE_DES3_CBC_SHA1) - ctx->enctype = ENCTYPE_DES3_CBC_RAW; ctx->gk5e = gss_krb5_lookup_enctype(ctx->enctype); if (ctx->gk5e == NULL) { 
dprintk("gss_kerberos_mech: unsupported krb5 enctype %u\n", @@ -700,7 +450,7 @@ gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx, } ctx->mech_used.len = gss_kerberos_mech.gm_oid.len; - return ctx->gk5e->import_ctx(ctx, gfp_mask); + return gss_krb5_import_ctx_v2(ctx, gfp_mask); out_err: return PTR_ERR(p); @@ -718,10 +468,7 @@ gss_krb5_import_sec_context(const void *p, size_t len, struct gss_ctx *ctx_id, if (ctx == NULL) return -ENOMEM; - if (len == 85) - ret = gss_import_v1_context(p, end, ctx); - else - ret = gss_import_v2_context(p, end, ctx, gfp_mask); + ret = gss_import_v2_context(p, end, ctx, gfp_mask); memzero_explicit(&ctx->Ksess, sizeof(ctx->Ksess)); if (ret) { kfree(ctx); diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c index 146aa755f07d..ce540df9bce4 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seal.c +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -71,75 +71,6 @@ # define RPCDBG_FACILITY RPCDBG_AUTH #endif -#if defined(CONFIG_RPCSEC_GSS_KRB5_SIMPLIFIED) - -static void * -setup_token(struct krb5_ctx *ctx, struct xdr_netobj *token) -{ - u16 *ptr; - void *krb5_hdr; - int body_size = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength; - - token->len = g_token_size(&ctx->mech_used, body_size); - - ptr = (u16 *)token->data; - g_make_token_header(&ctx->mech_used, body_size, (unsigned char **)&ptr); - - /* ptr now at start of header described in rfc 1964, section 1.2.1: */ - krb5_hdr = ptr; - *ptr++ = KG_TOK_MIC_MSG; - /* - * signalg is stored as if it were converted from LE to host endian, even - * though it's an opaque pair of bytes according to the RFC. - */ - *ptr++ = (__force u16)cpu_to_le16(ctx->gk5e->signalg); - *ptr++ = SEAL_ALG_NONE; - *ptr = 0xffff; - - return krb5_hdr; -} - -u32 -gss_krb5_get_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *text, - struct xdr_netobj *token) -{ - char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; - struct xdr_netobj md5cksum = {.len = sizeof(cksumdata), - .data = cksumdata}; - void *ptr; - time64_t now; - u32 seq_send; - u8 *cksumkey; - - dprintk("RPC: %s\n", __func__); - BUG_ON(ctx == NULL); - - now = ktime_get_real_seconds(); - - ptr = setup_token(ctx, token); - - if (ctx->gk5e->keyed_cksum) - cksumkey = ctx->cksum; - else - cksumkey = NULL; - - if (make_checksum(ctx, ptr, 8, text, 0, cksumkey, - KG_USAGE_SIGN, &md5cksum)) - return GSS_S_FAILURE; - - memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len); - - seq_send = atomic_fetch_inc(&ctx->seq_send); - - if (krb5_make_seq_num(ctx, ctx->seq, ctx->initiate ? 0 : 0xff, - seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8)) - return GSS_S_FAILURE; - - return (ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; -} - -#endif - static void * setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token) { diff --git a/net/sunrpc/auth_gss/gss_krb5_seqnum.c b/net/sunrpc/auth_gss/gss_krb5_seqnum.c deleted file mode 100644 index 1babc3474e10..000000000000 --- a/net/sunrpc/auth_gss/gss_krb5_seqnum.c +++ /dev/null @@ -1,106 +0,0 @@ -/* - * linux/net/sunrpc/gss_krb5_seqnum.c - * - * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/util_seqnum.c - * - * Copyright (c) 2000 The Regents of the University of Michigan. - * All rights reserved. - * - * Andy Adamson <andros@umich.edu> - */ - -/* - * Copyright 1993 by OpenVision Technologies, Inc. 
- * - * Permission to use, copy, modify, distribute, and sell this software - * and its documentation for any purpose is hereby granted without fee, - * provided that the above copyright notice appears in all copies and - * that both that copyright notice and this permission notice appear in - * supporting documentation, and that the name of OpenVision not be used - * in advertising or publicity pertaining to distribution of the software - * without specific, written prior permission. OpenVision makes no - * representations about the suitability of this software for any - * purpose. It is provided "as is" without express or implied warranty. - * - * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, - * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO - * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR - * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF - * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR - * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR - * PERFORMANCE OF THIS SOFTWARE. - */ - -#include <crypto/skcipher.h> -#include <linux/types.h> -#include <linux/sunrpc/gss_krb5.h> - -#include "gss_krb5_internal.h" - -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_AUTH -#endif - -s32 -krb5_make_seq_num(struct krb5_ctx *kctx, - struct crypto_sync_skcipher *key, - int direction, - u32 seqnum, - unsigned char *cksum, unsigned char *buf) -{ - unsigned char *plain; - s32 code; - - plain = kmalloc(8, GFP_KERNEL); - if (!plain) - return -ENOMEM; - - plain[0] = (unsigned char) (seqnum & 0xff); - plain[1] = (unsigned char) ((seqnum >> 8) & 0xff); - plain[2] = (unsigned char) ((seqnum >> 16) & 0xff); - plain[3] = (unsigned char) ((seqnum >> 24) & 0xff); - - plain[4] = direction; - plain[5] = direction; - plain[6] = direction; - plain[7] = direction; - - code = krb5_encrypt(key, cksum, plain, buf, 8); - kfree(plain); - return code; -} - -s32 -krb5_get_seq_num(struct krb5_ctx *kctx, - unsigned char *cksum, - unsigned char *buf, - int *direction, u32 *seqnum) -{ - s32 code; - unsigned char *plain; - struct crypto_sync_skcipher *key = kctx->seq; - - dprintk("RPC: krb5_get_seq_num:\n"); - - plain = kmalloc(8, GFP_KERNEL); - if (!plain) - return -ENOMEM; - - if ((code = krb5_decrypt(key, cksum, buf, plain, 8))) - goto out; - - if ((plain[4] != plain[5]) || (plain[4] != plain[6]) || - (plain[4] != plain[7])) { - code = (s32)KG_BAD_SEQ; - goto out; - } - - *direction = plain[4]; - - *seqnum = ((plain[0]) | - (plain[1] << 8) | (plain[2] << 16) | (plain[3] << 24)); - -out: - kfree(plain); - return code; -} diff --git a/net/sunrpc/auth_gss/gss_krb5_test.c b/net/sunrpc/auth_gss/gss_krb5_test.c index 95ca783795c5..85625e3f3814 100644 --- a/net/sunrpc/auth_gss/gss_krb5_test.c +++ b/net/sunrpc/auth_gss/gss_krb5_test.c @@ -320,208 +320,12 @@ static void rfc3961_nfold_case(struct kunit *test) "result mismatch"); } -/* - * RFC 3961 Appendix A.3. DES3 DR and DK - * - * These tests show the derived-random and derived-key values for the - * des3-hmac-sha1-kd encryption scheme, using the DR and DK functions - * defined in section 6.3.1. The input keys were randomly generated; - * the usage values are from this specification. - * - * This test material is copyright (C) The Internet Society (2005). 
- */ - -DEFINE_HEX_XDR_NETOBJ(des3_dk_usage_155, - 0x00, 0x00, 0x00, 0x01, 0x55 -); - -DEFINE_HEX_XDR_NETOBJ(des3_dk_usage_1aa, - 0x00, 0x00, 0x00, 0x01, 0xaa -); - -DEFINE_HEX_XDR_NETOBJ(des3_dk_usage_kerberos, - 0x6b, 0x65, 0x72, 0x62, 0x65, 0x72, 0x6f, 0x73 -); - -DEFINE_HEX_XDR_NETOBJ(des3_dk_test1_base_key, - 0xdc, 0xe0, 0x6b, 0x1f, 0x64, 0xc8, 0x57, 0xa1, - 0x1c, 0x3d, 0xb5, 0x7c, 0x51, 0x89, 0x9b, 0x2c, - 0xc1, 0x79, 0x10, 0x08, 0xce, 0x97, 0x3b, 0x92 -); -DEFINE_HEX_XDR_NETOBJ(des3_dk_test1_derived_key, - 0x92, 0x51, 0x79, 0xd0, 0x45, 0x91, 0xa7, 0x9b, - 0x5d, 0x31, 0x92, 0xc4, 0xa7, 0xe9, 0xc2, 0x89, - 0xb0, 0x49, 0xc7, 0x1f, 0x6e, 0xe6, 0x04, 0xcd -); - -DEFINE_HEX_XDR_NETOBJ(des3_dk_test2_base_key, - 0x5e, 0x13, 0xd3, 0x1c, 0x70, 0xef, 0x76, 0x57, - 0x46, 0x57, 0x85, 0x31, 0xcb, 0x51, 0xc1, 0x5b, - 0xf1, 0x1c, 0xa8, 0x2c, 0x97, 0xce, 0xe9, 0xf2 -); -DEFINE_HEX_XDR_NETOBJ(des3_dk_test2_derived_key, - 0x9e, 0x58, 0xe5, 0xa1, 0x46, 0xd9, 0x94, 0x2a, - 0x10, 0x1c, 0x46, 0x98, 0x45, 0xd6, 0x7a, 0x20, - 0xe3, 0xc4, 0x25, 0x9e, 0xd9, 0x13, 0xf2, 0x07 -); - -DEFINE_HEX_XDR_NETOBJ(des3_dk_test3_base_key, - 0x98, 0xe6, 0xfd, 0x8a, 0x04, 0xa4, 0xb6, 0x85, - 0x9b, 0x75, 0xa1, 0x76, 0x54, 0x0b, 0x97, 0x52, - 0xba, 0xd3, 0xec, 0xd6, 0x10, 0xa2, 0x52, 0xbc -); -DEFINE_HEX_XDR_NETOBJ(des3_dk_test3_derived_key, - 0x13, 0xfe, 0xf8, 0x0d, 0x76, 0x3e, 0x94, 0xec, - 0x6d, 0x13, 0xfd, 0x2c, 0xa1, 0xd0, 0x85, 0x07, - 0x02, 0x49, 0xda, 0xd3, 0x98, 0x08, 0xea, 0xbf -); - -DEFINE_HEX_XDR_NETOBJ(des3_dk_test4_base_key, - 0x62, 0x2a, 0xec, 0x25, 0xa2, 0xfe, 0x2c, 0xad, - 0x70, 0x94, 0x68, 0x0b, 0x7c, 0x64, 0x94, 0x02, - 0x80, 0x08, 0x4c, 0x1a, 0x7c, 0xec, 0x92, 0xb5 -); -DEFINE_HEX_XDR_NETOBJ(des3_dk_test4_derived_key, - 0xf8, 0xdf, 0xbf, 0x04, 0xb0, 0x97, 0xe6, 0xd9, - 0xdc, 0x07, 0x02, 0x68, 0x6b, 0xcb, 0x34, 0x89, - 0xd9, 0x1f, 0xd9, 0xa4, 0x51, 0x6b, 0x70, 0x3e -); - -DEFINE_HEX_XDR_NETOBJ(des3_dk_test5_base_key, - 0xd3, 0xf8, 0x29, 0x8c, 0xcb, 0x16, 0x64, 0x38, - 0xdc, 0xb9, 0xb9, 0x3e, 0xe5, 0xa7, 0x62, 0x92, - 0x86, 0xa4, 0x91, 0xf8, 0x38, 0xf8, 0x02, 0xfb -); -DEFINE_HEX_XDR_NETOBJ(des3_dk_test5_derived_key, - 0x23, 0x70, 0xda, 0x57, 0x5d, 0x2a, 0x3d, 0xa8, - 0x64, 0xce, 0xbf, 0xdc, 0x52, 0x04, 0xd5, 0x6d, - 0xf7, 0x79, 0xa7, 0xdf, 0x43, 0xd9, 0xda, 0x43 -); - -DEFINE_HEX_XDR_NETOBJ(des3_dk_test6_base_key, - 0xc1, 0x08, 0x16, 0x49, 0xad, 0xa7, 0x43, 0x62, - 0xe6, 0xa1, 0x45, 0x9d, 0x01, 0xdf, 0xd3, 0x0d, - 0x67, 0xc2, 0x23, 0x4c, 0x94, 0x07, 0x04, 0xda -); -DEFINE_HEX_XDR_NETOBJ(des3_dk_test6_derived_key, - 0x34, 0x80, 0x57, 0xec, 0x98, 0xfd, 0xc4, 0x80, - 0x16, 0x16, 0x1c, 0x2a, 0x4c, 0x7a, 0x94, 0x3e, - 0x92, 0xae, 0x49, 0x2c, 0x98, 0x91, 0x75, 0xf7 -); - -DEFINE_HEX_XDR_NETOBJ(des3_dk_test7_base_key, - 0x5d, 0x15, 0x4a, 0xf2, 0x38, 0xf4, 0x67, 0x13, - 0x15, 0x57, 0x19, 0xd5, 0x5e, 0x2f, 0x1f, 0x79, - 0x0d, 0xd6, 0x61, 0xf2, 0x79, 0xa7, 0x91, 0x7c -); -DEFINE_HEX_XDR_NETOBJ(des3_dk_test7_derived_key, - 0xa8, 0x80, 0x8a, 0xc2, 0x67, 0xda, 0xda, 0x3d, - 0xcb, 0xe9, 0xa7, 0xc8, 0x46, 0x26, 0xfb, 0xc7, - 0x61, 0xc2, 0x94, 0xb0, 0x13, 0x15, 0xe5, 0xc1 -); - -DEFINE_HEX_XDR_NETOBJ(des3_dk_test8_base_key, - 0x79, 0x85, 0x62, 0xe0, 0x49, 0x85, 0x2f, 0x57, - 0xdc, 0x8c, 0x34, 0x3b, 0xa1, 0x7f, 0x2c, 0xa1, - 0xd9, 0x73, 0x94, 0xef, 0xc8, 0xad, 0xc4, 0x43 -); -DEFINE_HEX_XDR_NETOBJ(des3_dk_test8_derived_key, - 0xc8, 0x13, 0xf8, 0x8a, 0x3b, 0xe3, 0xb3, 0x34, - 0xf7, 0x54, 0x25, 0xce, 0x91, 0x75, 0xfb, 0xe3, - 0xc8, 0x49, 0x3b, 0x89, 0xc8, 0x70, 0x3b, 0x49 -); - -DEFINE_HEX_XDR_NETOBJ(des3_dk_test9_base_key, 
- 0x26, 0xdc, 0xe3, 0x34, 0xb5, 0x45, 0x29, 0x2f, - 0x2f, 0xea, 0xb9, 0xa8, 0x70, 0x1a, 0x89, 0xa4, - 0xb9, 0x9e, 0xb9, 0x94, 0x2c, 0xec, 0xd0, 0x16 -); -DEFINE_HEX_XDR_NETOBJ(des3_dk_test9_derived_key, - 0xf4, 0x8f, 0xfd, 0x6e, 0x83, 0xf8, 0x3e, 0x73, - 0x54, 0xe6, 0x94, 0xfd, 0x25, 0x2c, 0xf8, 0x3b, - 0xfe, 0x58, 0xf7, 0xd5, 0xba, 0x37, 0xec, 0x5d -); - -static const struct gss_krb5_test_param rfc3961_kdf_test_params[] = { - { - .desc = "des3-hmac-sha1 key derivation case 1", - .enctype = ENCTYPE_DES3_CBC_RAW, - .base_key = &des3_dk_test1_base_key, - .usage = &des3_dk_usage_155, - .expected_result = &des3_dk_test1_derived_key, - }, - { - .desc = "des3-hmac-sha1 key derivation case 2", - .enctype = ENCTYPE_DES3_CBC_RAW, - .base_key = &des3_dk_test2_base_key, - .usage = &des3_dk_usage_1aa, - .expected_result = &des3_dk_test2_derived_key, - }, - { - .desc = "des3-hmac-sha1 key derivation case 3", - .enctype = ENCTYPE_DES3_CBC_RAW, - .base_key = &des3_dk_test3_base_key, - .usage = &des3_dk_usage_155, - .expected_result = &des3_dk_test3_derived_key, - }, - { - .desc = "des3-hmac-sha1 key derivation case 4", - .enctype = ENCTYPE_DES3_CBC_RAW, - .base_key = &des3_dk_test4_base_key, - .usage = &des3_dk_usage_1aa, - .expected_result = &des3_dk_test4_derived_key, - }, - { - .desc = "des3-hmac-sha1 key derivation case 5", - .enctype = ENCTYPE_DES3_CBC_RAW, - .base_key = &des3_dk_test5_base_key, - .usage = &des3_dk_usage_kerberos, - .expected_result = &des3_dk_test5_derived_key, - }, - { - .desc = "des3-hmac-sha1 key derivation case 6", - .enctype = ENCTYPE_DES3_CBC_RAW, - .base_key = &des3_dk_test6_base_key, - .usage = &des3_dk_usage_155, - .expected_result = &des3_dk_test6_derived_key, - }, - { - .desc = "des3-hmac-sha1 key derivation case 7", - .enctype = ENCTYPE_DES3_CBC_RAW, - .base_key = &des3_dk_test7_base_key, - .usage = &des3_dk_usage_1aa, - .expected_result = &des3_dk_test7_derived_key, - }, - { - .desc = "des3-hmac-sha1 key derivation case 8", - .enctype = ENCTYPE_DES3_CBC_RAW, - .base_key = &des3_dk_test8_base_key, - .usage = &des3_dk_usage_155, - .expected_result = &des3_dk_test8_derived_key, - }, - { - .desc = "des3-hmac-sha1 key derivation case 9", - .enctype = ENCTYPE_DES3_CBC_RAW, - .base_key = &des3_dk_test9_base_key, - .usage = &des3_dk_usage_1aa, - .expected_result = &des3_dk_test9_derived_key, - }, -}; - -/* Creates the function rfc3961_kdf_gen_params */ -KUNIT_ARRAY_PARAM(rfc3961_kdf, rfc3961_kdf_test_params, gss_krb5_get_desc); - static struct kunit_case rfc3961_test_cases[] = { { .name = "RFC 3961 n-fold", .run_case = rfc3961_nfold_case, .generate_params = rfc3961_nfold_gen_params, }, - { - .name = "RFC 3961 key derivation", - .run_case = kdf_case, - .generate_params = rfc3961_kdf_gen_params, - }, {} }; diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c index 7d6d4ae4a3c9..4fbc50a0a2c4 100644 --- a/net/sunrpc/auth_gss/gss_krb5_unseal.c +++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c @@ -69,83 +69,6 @@ # define RPCDBG_FACILITY RPCDBG_AUTH #endif - -#if defined(CONFIG_RPCSEC_GSS_KRB5_SIMPLIFIED) -/* read_token is a mic token, and message_buffer is the data that the mic was - * supposedly taken over. 
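With the DES3 vectors gone the file keeps only the n-fold cases, but the removed table is still a good illustration of KUnit's parameterized-test machinery: a const parameter array, a KUNIT_ARRAY_PARAM() invocation that generates the .generate_params hook, and a case body reading test->param_value. A toy version under assumed names:

#include <kunit/test.h>

struct my_param {
        int in;
        int want;
};

static const struct my_param my_params[] = {
        { .in = 1, .want = 2 },
        { .in = 3, .want = 6 },
};

static void my_param_desc(const struct my_param *p, char *desc)
{
        snprintf(desc, KUNIT_PARAM_DESC_SIZE, "in=%d", p->in);
}

/* Expands to my_gen_params(), used as .generate_params below. */
KUNIT_ARRAY_PARAM(my, my_params, my_param_desc);

static void my_case(struct kunit *test)
{
        const struct my_param *p = test->param_value;

        KUNIT_EXPECT_EQ(test, p->in * 2, p->want);
}

static struct kunit_case my_cases[] = {
        {
                .name = "toy doubling",
                .run_case = my_case,
                .generate_params = my_gen_params,
        },
        {}
};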
*/ -u32 -gss_krb5_verify_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *message_buffer, - struct xdr_netobj *read_token) -{ - int signalg; - int sealalg; - char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; - struct xdr_netobj md5cksum = {.len = sizeof(cksumdata), - .data = cksumdata}; - s32 now; - int direction; - u32 seqnum; - unsigned char *ptr = (unsigned char *)read_token->data; - int bodysize; - u8 *cksumkey; - - dprintk("RPC: krb5_read_token\n"); - - if (g_verify_token_header(&ctx->mech_used, &bodysize, &ptr, - read_token->len)) - return GSS_S_DEFECTIVE_TOKEN; - - if ((ptr[0] != ((KG_TOK_MIC_MSG >> 8) & 0xff)) || - (ptr[1] != (KG_TOK_MIC_MSG & 0xff))) - return GSS_S_DEFECTIVE_TOKEN; - - /* XXX sanity-check bodysize?? */ - - signalg = ptr[2] + (ptr[3] << 8); - if (signalg != ctx->gk5e->signalg) - return GSS_S_DEFECTIVE_TOKEN; - - sealalg = ptr[4] + (ptr[5] << 8); - if (sealalg != SEAL_ALG_NONE) - return GSS_S_DEFECTIVE_TOKEN; - - if ((ptr[6] != 0xff) || (ptr[7] != 0xff)) - return GSS_S_DEFECTIVE_TOKEN; - - if (ctx->gk5e->keyed_cksum) - cksumkey = ctx->cksum; - else - cksumkey = NULL; - - if (make_checksum(ctx, ptr, 8, message_buffer, 0, - cksumkey, KG_USAGE_SIGN, &md5cksum)) - return GSS_S_FAILURE; - - if (memcmp(md5cksum.data, ptr + GSS_KRB5_TOK_HDR_LEN, - ctx->gk5e->cksumlength)) - return GSS_S_BAD_SIG; - - /* it got through unscathed. Make sure the context is unexpired */ - - now = ktime_get_real_seconds(); - - if (now > ctx->endtime) - return GSS_S_CONTEXT_EXPIRED; - - /* do sequencing checks */ - - if (krb5_get_seq_num(ctx, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8, - &direction, &seqnum)) - return GSS_S_FAILURE; - - if ((ctx->initiate && direction != 0xff) || - (!ctx->initiate && direction != 0)) - return GSS_S_BAD_SIG; - - return GSS_S_COMPLETE; -} -#endif - u32 gss_krb5_verify_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *message_buffer, struct xdr_netobj *read_token) diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c index 6d6b082380b2..b3e1738ff6bf 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -40,293 +40,6 @@ # define RPCDBG_FACILITY RPCDBG_AUTH #endif -#if defined(CONFIG_RPCSEC_GSS_KRB5_SIMPLIFIED) - -static inline int -gss_krb5_padding(int blocksize, int length) -{ - return blocksize - (length % blocksize); -} - -static inline void -gss_krb5_add_padding(struct xdr_buf *buf, int offset, int blocksize) -{ - int padding = gss_krb5_padding(blocksize, buf->len - offset); - char *p; - struct kvec *iov; - - if (buf->page_len || buf->tail[0].iov_len) - iov = &buf->tail[0]; - else - iov = &buf->head[0]; - p = iov->iov_base + iov->iov_len; - iov->iov_len += padding; - buf->len += padding; - memset(p, padding, padding); -} - -static inline int -gss_krb5_remove_padding(struct xdr_buf *buf, int blocksize) -{ - u8 *ptr; - u8 pad; - size_t len = buf->len; - - if (len <= buf->head[0].iov_len) { - pad = *(u8 *)(buf->head[0].iov_base + len - 1); - if (pad > buf->head[0].iov_len) - return -EINVAL; - buf->head[0].iov_len -= pad; - goto out; - } else - len -= buf->head[0].iov_len; - if (len <= buf->page_len) { - unsigned int last = (buf->page_base + len - 1) - >>PAGE_SHIFT; - unsigned int offset = (buf->page_base + len - 1) - & (PAGE_SIZE - 1); - ptr = kmap_atomic(buf->pages[last]); - pad = *(ptr + offset); - kunmap_atomic(ptr); - goto out; - } else - len -= buf->page_len; - BUG_ON(len > buf->tail[0].iov_len); - pad = *(u8 *)(buf->tail[0].iov_base + len - 1); -out: - /* XXX: NOTE: we do not adjust the page lengths--they 
represent - * a range of data in the real filesystem page cache, and we need - * to know that range so the xdr code can properly place read data. - * However adjusting the head length, as we do above, is harmless. - * In the case of a request that fits into a single page, the server - * also uses length and head length together to determine the original - * start of the request to copy the request for deferal; so it's - * easier on the server if we adjust head and tail length in tandem. - * It's not really a problem that we don't fool with the page and - * tail lengths, though--at worst badly formed xdr might lead the - * server to attempt to parse the padding. - * XXX: Document all these weird requirements for gss mechanism - * wrap/unwrap functions. */ - if (pad > blocksize) - return -EINVAL; - if (buf->len > pad) - buf->len -= pad; - else - return -EINVAL; - return 0; -} - -/* Assumptions: the head and tail of inbuf are ours to play with. - * The pages, however, may be real pages in the page cache and we replace - * them with scratch pages from **pages before writing to them. */ -/* XXX: obviously the above should be documentation of wrap interface, - * and shouldn't be in this kerberos-specific file. */ - -/* XXX factor out common code with seal/unseal. */ - -u32 -gss_krb5_wrap_v1(struct krb5_ctx *kctx, int offset, - struct xdr_buf *buf, struct page **pages) -{ - char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; - struct xdr_netobj md5cksum = {.len = sizeof(cksumdata), - .data = cksumdata}; - int blocksize = 0, plainlen; - unsigned char *ptr, *msg_start; - time64_t now; - int headlen; - struct page **tmp_pages; - u32 seq_send; - u8 *cksumkey; - u32 conflen = crypto_sync_skcipher_blocksize(kctx->enc); - - dprintk("RPC: %s\n", __func__); - - now = ktime_get_real_seconds(); - - blocksize = crypto_sync_skcipher_blocksize(kctx->enc); - gss_krb5_add_padding(buf, offset, blocksize); - BUG_ON((buf->len - offset) % blocksize); - plainlen = conflen + buf->len - offset; - - headlen = g_token_size(&kctx->mech_used, - GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength + plainlen) - - (buf->len - offset); - - ptr = buf->head[0].iov_base + offset; - /* shift data to make room for header. */ - xdr_extend_head(buf, offset, headlen); - - /* XXX Would be cleverer to encrypt while copying. */ - BUG_ON((buf->len - offset - headlen) % blocksize); - - g_make_token_header(&kctx->mech_used, - GSS_KRB5_TOK_HDR_LEN + - kctx->gk5e->cksumlength + plainlen, &ptr); - - - /* ptr now at header described in rfc 1964, section 1.2.1: */ - ptr[0] = (unsigned char) ((KG_TOK_WRAP_MSG >> 8) & 0xff); - ptr[1] = (unsigned char) (KG_TOK_WRAP_MSG & 0xff); - - msg_start = ptr + GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength; - - /* - * signalg and sealalg are stored as if they were converted from LE - * to host endian, even though they're opaque pairs of bytes according - * to the RFC. 
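Put differently, the cpu_to_le16() stores that follow are just a portable spelling of "write the two bytes in wire order". A minimal sketch of the equivalence, using an invented signalg value:

	u16 signalg = 0x0004;		/* example value only */

	*(__le16 *)(ptr + 2) = cpu_to_le16(signalg);

	/* ...stores exactly the same bytes as: */
	ptr[2] = signalg & 0xff;	/* 0x04, low byte first */
	ptr[3] = signalg >> 8;		/* 0x00 */

	/* ...and the verifier reassembles them the same way:
	 *   signalg = ptr[2] + (ptr[3] << 8);
	 */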
- */ - *(__le16 *)(ptr + 2) = cpu_to_le16(kctx->gk5e->signalg); - *(__le16 *)(ptr + 4) = cpu_to_le16(kctx->gk5e->sealalg); - ptr[6] = 0xff; - ptr[7] = 0xff; - - krb5_make_confounder(msg_start, conflen); - - if (kctx->gk5e->keyed_cksum) - cksumkey = kctx->cksum; - else - cksumkey = NULL; - - /* XXXJBF: UGH!: */ - tmp_pages = buf->pages; - buf->pages = pages; - if (make_checksum(kctx, ptr, 8, buf, offset + headlen - conflen, - cksumkey, KG_USAGE_SEAL, &md5cksum)) - return GSS_S_FAILURE; - buf->pages = tmp_pages; - - memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len); - - seq_send = atomic_fetch_inc(&kctx->seq_send); - - /* XXX would probably be more efficient to compute checksum - * and encrypt at the same time: */ - if ((krb5_make_seq_num(kctx, kctx->seq, kctx->initiate ? 0 : 0xff, - seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8))) - return GSS_S_FAILURE; - - if (gss_encrypt_xdr_buf(kctx->enc, buf, - offset + headlen - conflen, pages)) - return GSS_S_FAILURE; - - return (kctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; -} - -u32 -gss_krb5_unwrap_v1(struct krb5_ctx *kctx, int offset, int len, - struct xdr_buf *buf, unsigned int *slack, - unsigned int *align) -{ - int signalg; - int sealalg; - char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; - struct xdr_netobj md5cksum = {.len = sizeof(cksumdata), - .data = cksumdata}; - time64_t now; - int direction; - s32 seqnum; - unsigned char *ptr; - int bodysize; - void *data_start, *orig_start; - int data_len; - int blocksize; - u32 conflen = crypto_sync_skcipher_blocksize(kctx->enc); - int crypt_offset; - u8 *cksumkey; - unsigned int saved_len = buf->len; - - dprintk("RPC: gss_unwrap_kerberos\n"); - - ptr = (u8 *)buf->head[0].iov_base + offset; - if (g_verify_token_header(&kctx->mech_used, &bodysize, &ptr, - len - offset)) - return GSS_S_DEFECTIVE_TOKEN; - - if ((ptr[0] != ((KG_TOK_WRAP_MSG >> 8) & 0xff)) || - (ptr[1] != (KG_TOK_WRAP_MSG & 0xff))) - return GSS_S_DEFECTIVE_TOKEN; - - /* XXX sanity-check bodysize?? */ - - /* get the sign and seal algorithms */ - - signalg = ptr[2] + (ptr[3] << 8); - if (signalg != kctx->gk5e->signalg) - return GSS_S_DEFECTIVE_TOKEN; - - sealalg = ptr[4] + (ptr[5] << 8); - if (sealalg != kctx->gk5e->sealalg) - return GSS_S_DEFECTIVE_TOKEN; - - if ((ptr[6] != 0xff) || (ptr[7] != 0xff)) - return GSS_S_DEFECTIVE_TOKEN; - - /* - * Data starts after token header and checksum. ptr points - * to the beginning of the token header - */ - crypt_offset = ptr + (GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength) - - (unsigned char *)buf->head[0].iov_base; - - buf->len = len; - if (gss_decrypt_xdr_buf(kctx->enc, buf, crypt_offset)) - return GSS_S_DEFECTIVE_TOKEN; - - if (kctx->gk5e->keyed_cksum) - cksumkey = kctx->cksum; - else - cksumkey = NULL; - - if (make_checksum(kctx, ptr, 8, buf, crypt_offset, - cksumkey, KG_USAGE_SEAL, &md5cksum)) - return GSS_S_FAILURE; - - if (memcmp(md5cksum.data, ptr + GSS_KRB5_TOK_HDR_LEN, - kctx->gk5e->cksumlength)) - return GSS_S_BAD_SIG; - - /* it got through unscathed. Make sure the context is unexpired */ - - now = ktime_get_real_seconds(); - - if (now > kctx->endtime) - return GSS_S_CONTEXT_EXPIRED; - - /* do sequencing checks */ - - if (krb5_get_seq_num(kctx, ptr + GSS_KRB5_TOK_HDR_LEN, - ptr + 8, &direction, &seqnum)) - return GSS_S_BAD_SIG; - - if ((kctx->initiate && direction != 0xff) || - (!kctx->initiate && direction != 0)) - return GSS_S_BAD_SIG; - - /* Copy the data back to the right position. XXX: Would probably be - * better to copy and encrypt at the same time. 
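The padding stripped during unwrap is the scheme gss_krb5_add_padding() sets up earlier in this file: every pad byte holds the pad count, and an already-aligned message still gains a full block. A worked example, assuming an 8-byte cipher blocksize:

	/* gss_krb5_padding(8, 20) == 8 - (20 % 8) == 4, so four bytes
	 * of 0x04 are appended:   ... dd dd dd dd 04 04 04 04
	 *
	 * gss_krb5_padding(8, 24) == 8, so an aligned message still
	 * grows by a whole block of 0x08 bytes. Padding unconditionally
	 * is what lets gss_krb5_remove_padding() recover the pad count
	 * from the last byte alone.
	 */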
*/ - - blocksize = crypto_sync_skcipher_blocksize(kctx->enc); - data_start = ptr + (GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength) + - conflen; - orig_start = buf->head[0].iov_base + offset; - data_len = (buf->head[0].iov_base + buf->head[0].iov_len) - data_start; - memmove(orig_start, data_start, data_len); - buf->head[0].iov_len -= (data_start - orig_start); - buf->len = len - (data_start - orig_start); - - if (gss_krb5_remove_padding(buf, blocksize)) - return GSS_S_DEFECTIVE_TOKEN; - - /* slack must include room for krb5 padding */ - *slack = XDR_QUADLEN(saved_len - buf->len); - /* The GSS blob always precedes the RPC message payload */ - *align = *slack; - return GSS_S_COMPLETE; -} - -#endif - /* * We can shift data by up to LOCAL_BUF_LEN bytes in a pass. If we need * to do more than that, we shift repeatedly. Kevin Coffman reports diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index c4a566737085..18734e70c5dd 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -986,7 +986,7 @@ bad_unwrap: return -EINVAL; } -static int +static enum svc_auth_status svcauth_gss_set_client(struct svc_rqst *rqstp) { struct gss_svc_data *svcdata = rqstp->rq_auth_data; @@ -1634,7 +1634,7 @@ svcauth_gss_decode_credbody(struct xdr_stream *xdr, * * The rqstp->rq_auth_stat field is also set (see RFCs 2203 and 5531). */ -static int +static enum svc_auth_status svcauth_gss_accept(struct svc_rqst *rqstp) { struct gss_svc_data *svcdata = rqstp->rq_auth_data; @@ -1945,9 +1945,6 @@ bad_wrap: * %0: the Reply is ready to be sent * %-ENOMEM: failed to allocate memory * %-EINVAL: encoding error - * - * XXX: These return values do not match the return values documented - * for the auth_ops ->release method in linux/sunrpc/svcauth.h. 
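The svcauth_gss conversions above are one piece of a series that gives the server-side auth entry points a real return type. A sketch of the affected struct auth_ops members, written from memory of linux/sunrpc/svcauth.h (member order and the full member list may differ):

	struct auth_ops {
		char			*name;
		struct module		*owner;
		int			flavour;
		enum svc_auth_status	(*accept)(struct svc_rqst *rqstp);
		int			(*release)(struct svc_rqst *rqstp);
		void			(*domain_release)(struct auth_domain *dom);
		enum svc_auth_status	(*set_client)(struct svc_rqst *rqstp);
	};

Returning an enum instead of a bare int lets the compiler flag switch statements that miss an SVC_* value, which the new default arm added to svc_process_common() further backstops at runtime.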
*/ static int svcauth_gss_release(struct svc_rqst *rqstp) diff --git a/net/sunrpc/auth_tls.c b/net/sunrpc/auth_tls.c index de7678f8a23d..87f570fd3b00 100644 --- a/net/sunrpc/auth_tls.c +++ b/net/sunrpc/auth_tls.c @@ -129,9 +129,9 @@ static int tls_validate(struct rpc_task *task, struct xdr_stream *xdr) if (*p != rpc_auth_null) return -EIO; if (xdr_stream_decode_opaque_inline(xdr, &str, starttls_len) != starttls_len) - return -EIO; + return -EPROTONOSUPPORT; if (memcmp(str, starttls_token, starttls_len)) - return -EIO; + return -EPROTONOSUPPORT; return 0; } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index d7c697af3762..9c210273d06b 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -534,6 +534,8 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args) .servername = args->servername, .bc_xprt = args->bc_xprt, .xprtsec = args->xprtsec, + .connect_timeout = args->connect_timeout, + .reconnect_timeout = args->reconnect_timeout, }; char servername[48]; struct rpc_clnt *clnt; @@ -2474,8 +2476,7 @@ call_status(struct rpc_task *task) goto out_exit; } task->tk_action = call_encode; - if (status != -ECONNRESET && status != -ECONNABORTED) - rpc_check_timeout(task); + rpc_check_timeout(task); return; out_exit: rpc_call_rpcerror(task, status); @@ -2602,6 +2603,7 @@ out: case 0: task->tk_action = rpc_exit_task; task->tk_status = rpcauth_unwrap_resp(task, &xdr); + xdr_finish_decode(&xdr); return; case -EAGAIN: task->tk_status = 0; @@ -2722,7 +2724,15 @@ out_unparsable: out_verifier: trace_rpc_bad_verifier(task); - goto out_err; + switch (error) { + case -EPROTONOSUPPORT: + goto out_err; + case -EACCES: + /* Re-encode with a fresh cred */ + fallthrough; + default: + goto out_garbage; + } out_msg_denied: error = -EACCES; @@ -2748,6 +2758,7 @@ out_msg_denied: case rpc_autherr_rejectedverf: case rpcsec_gsserr_credproblem: case rpcsec_gsserr_ctxproblem: + rpcauth_invalcred(task); if (!task->tk_cred_retry) break; task->tk_cred_retry--; @@ -2904,19 +2915,22 @@ static const struct rpc_call_ops rpc_cb_add_xprt_call_ops = { * @clnt: pointer to struct rpc_clnt * @xps: pointer to struct rpc_xprt_switch, * @xprt: pointer struct rpc_xprt - * @dummy: unused + * @in_max_connect: pointer to the max_connect value for the passed in xprt transport */ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt_switch *xps, struct rpc_xprt *xprt, - void *dummy) + void *in_max_connect) { struct rpc_cb_add_xprt_calldata *data; struct rpc_task *task; + int max_connect = clnt->cl_max_connect; - if (xps->xps_nunique_destaddr_xprts + 1 > clnt->cl_max_connect) { + if (in_max_connect) + max_connect = *(int *)in_max_connect; + if (xps->xps_nunique_destaddr_xprts + 1 > max_connect) { rcu_read_lock(); pr_warn("SUNRPC: reached max allowed number (%d) did not add " - "transport to server: %s\n", clnt->cl_max_connect, + "transport to server: %s\n", max_connect, rpc_peeraddr2str(clnt, RPC_DISPLAY_ADDR)); rcu_read_unlock(); return -EINVAL; @@ -3069,6 +3083,11 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt, } xprt->resvport = resvport; xprt->reuseport = reuseport; + + if (xprtargs->connect_timeout) + connect_timeout = xprtargs->connect_timeout; + if (xprtargs->reconnect_timeout) + reconnect_timeout = xprtargs->reconnect_timeout; if (xprt->ops->set_connect_timeout != NULL) xprt->ops->set_connect_timeout(xprt, connect_timeout, diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 587811a002c9..812fda9d45dd 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -513,9 +513,9 @@ __svc_create(struct svc_program *prog, 
unsigned int bufsize, int npools, INIT_LIST_HEAD(&pool->sp_all_threads); spin_lock_init(&pool->sp_lock); + percpu_counter_init(&pool->sp_messages_arrived, 0, GFP_KERNEL); percpu_counter_init(&pool->sp_sockets_queued, 0, GFP_KERNEL); percpu_counter_init(&pool->sp_threads_woken, 0, GFP_KERNEL); - percpu_counter_init(&pool->sp_threads_timedout, 0, GFP_KERNEL); } return serv; @@ -588,9 +588,9 @@ svc_destroy(struct kref *ref) for (i = 0; i < serv->sv_nrpools; i++) { struct svc_pool *pool = &serv->sv_pools[i]; + percpu_counter_destroy(&pool->sp_messages_arrived); percpu_counter_destroy(&pool->sp_sockets_queued); percpu_counter_destroy(&pool->sp_threads_woken); - percpu_counter_destroy(&pool->sp_threads_timedout); } kfree(serv->sv_pools); kfree(serv); @@ -689,23 +689,44 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) return rqstp; } -/* - * Choose a pool in which to create a new thread, for svc_set_num_threads +/** + * svc_pool_wake_idle_thread - Awaken an idle thread in @pool + * @pool: service thread pool + * + * Can be called from soft IRQ or process context. Finding an idle + * service thread and marking it BUSY is atomic with respect to + * other calls to svc_pool_wake_idle_thread(). + * */ -static inline struct svc_pool * -choose_pool(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state) +void svc_pool_wake_idle_thread(struct svc_pool *pool) { - if (pool != NULL) - return pool; + struct svc_rqst *rqstp; + + rcu_read_lock(); + list_for_each_entry_rcu(rqstp, &pool->sp_all_threads, rq_all) { + if (test_and_set_bit(RQ_BUSY, &rqstp->rq_flags)) + continue; + + WRITE_ONCE(rqstp->rq_qtime, ktime_get()); + wake_up_process(rqstp->rq_task); + rcu_read_unlock(); + percpu_counter_inc(&pool->sp_threads_woken); + trace_svc_wake_up(rqstp->rq_task->pid); + return; + } + rcu_read_unlock(); - return &serv->sv_pools[(*state)++ % serv->sv_nrpools]; + set_bit(SP_CONGESTED, &pool->sp_flags); } -/* - * Choose a thread to kill, for svc_set_num_threads - */ -static inline struct task_struct * -choose_victim(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state) +static struct svc_pool * +svc_pool_next(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state) +{ + return pool ? pool : &serv->sv_pools[(*state)++ % serv->sv_nrpools]; +} + +static struct task_struct * +svc_pool_victim(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state) { unsigned int i; struct task_struct *task = NULL; @@ -713,7 +734,6 @@ choose_victim(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state) if (pool != NULL) { spin_lock_bh(&pool->sp_lock); } else { - /* choose a pool in round-robin fashion */ for (i = 0; i < serv->sv_nrpools; i++) { pool = &serv->sv_pools[--(*state) % serv->sv_nrpools]; spin_lock_bh(&pool->sp_lock); @@ -728,21 +748,15 @@ found_pool: if (!list_empty(&pool->sp_all_threads)) { struct svc_rqst *rqstp; - /* - * Remove from the pool->sp_all_threads list - * so we don't try to kill it again. 
- */ rqstp = list_entry(pool->sp_all_threads.next, struct svc_rqst, rq_all); set_bit(RQ_VICTIM, &rqstp->rq_flags); list_del_rcu(&rqstp->rq_all); task = rqstp->rq_task; } spin_unlock_bh(&pool->sp_lock); - return task; } -/* create new threads */ static int svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) { @@ -754,13 +768,12 @@ svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) do { nrservs--; - chosen_pool = choose_pool(serv, pool, &state); - + chosen_pool = svc_pool_next(serv, pool, &state); node = svc_pool_map_get_node(chosen_pool->sp_id); + rqstp = svc_prepare_thread(serv, chosen_pool, node); if (IS_ERR(rqstp)) return PTR_ERR(rqstp); - task = kthread_create_on_node(serv->sv_threadfn, rqstp, node, "%s", serv->sv_name); if (IS_ERR(task)) { @@ -779,15 +792,6 @@ svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) return 0; } -/* - * Create or destroy enough new threads to make the number - * of threads the given number. If `pool' is non-NULL, applies - * only to threads in that pool, otherwise round-robins between - * all pools. Caller must ensure that mutual exclusion between this and - * server startup or shutdown. - */ - -/* destroy old threads */ static int svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) { @@ -795,9 +799,8 @@ svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) struct task_struct *task; unsigned int state = serv->sv_nrthreads-1; - /* destroy old threads */ do { - task = choose_victim(serv, pool, &state); + task = svc_pool_victim(serv, pool, &state); if (task == NULL) break; rqstp = kthread_data(task); @@ -809,6 +812,23 @@ svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) return 0; } +/** + * svc_set_num_threads - adjust number of threads per RPC service + * @serv: RPC service to adjust + * @pool: Specific pool from which to choose threads, or NULL + * @nrservs: New number of threads for @serv (0 or less means kill all threads) + * + * Create or destroy threads to make the number of threads for @serv the + * given number. If @pool is non-NULL, change only threads in that pool; + * otherwise, round-robin between all pools for @serv. @serv's + * sv_nrthreads is adjusted for each thread created or destroyed. + * + * Caller must ensure mutual exclusion between this and server startup or + * shutdown. + * + * Returns zero on success or a negative errno if an error occurred while + * starting a thread. 
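A hedged usage sketch for this contract, modeled on an nfsd-style caller; the mutex name and variables are illustrative, not taken from this patch:

	/* Serialize against server startup/shutdown, per the kerneldoc. */
	mutex_lock(&nfsd_mutex);
	/* pool == NULL: spread new threads round-robin over all pools. */
	error = svc_set_num_threads(serv, NULL, nrservs);
	mutex_unlock(&nfsd_mutex);
	if (error)
		return error;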
+ */ int svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) { @@ -1277,8 +1297,9 @@ svc_process_common(struct svc_rqst *rqstp) const struct svc_procedure *procp = NULL; struct svc_serv *serv = rqstp->rq_server; struct svc_process_info process; - int auth_res, rc; + enum svc_auth_status auth_res; unsigned int aoffset; + int rc; __be32 *p; /* Will be turned off by GSS integrity and privacy services */ @@ -1333,6 +1354,9 @@ svc_process_common(struct svc_rqst *rqstp) goto dropit; case SVC_COMPLETE: goto sendit; + default: + pr_warn_once("Unexpected svc_auth_status (%d)\n", auth_res); + goto err_system_err; } if (progp == NULL) @@ -1370,6 +1394,8 @@ svc_process_common(struct svc_rqst *rqstp) rc = process.dispatch(rqstp); if (procp->pc_release) procp->pc_release(rqstp); + xdr_finish_decode(xdr); + if (!rc) goto dropit; if (rqstp->rq_auth_stat != rpc_auth_ok) @@ -1516,7 +1542,6 @@ out_baddir: out_drop: svc_drop(rqstp); } -EXPORT_SYMBOL_GPL(svc_process); #if defined(CONFIG_SUNRPC_BACKCHANNEL) /* diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 62c7919ea610..4cfe9640df48 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -434,6 +434,7 @@ static bool svc_xprt_ready(struct svc_xprt *xprt) smp_rmb(); xpt_flags = READ_ONCE(xprt->xpt_flags); + trace_svc_xprt_enqueue(xprt, xpt_flags); if (xpt_flags & BIT(XPT_BUSY)) return false; if (xpt_flags & (BIT(XPT_CONN) | BIT(XPT_CLOSE) | BIT(XPT_HANDSHAKE))) @@ -456,7 +457,6 @@ static bool svc_xprt_ready(struct svc_xprt *xprt) void svc_xprt_enqueue(struct svc_xprt *xprt) { struct svc_pool *pool; - struct svc_rqst *rqstp = NULL; if (!svc_xprt_ready(xprt)) return; @@ -476,21 +476,7 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) list_add_tail(&xprt->xpt_ready, &pool->sp_sockets); spin_unlock_bh(&pool->sp_lock); - /* find a thread for this xprt */ - rcu_read_lock(); - list_for_each_entry_rcu(rqstp, &pool->sp_all_threads, rq_all) { - if (test_and_set_bit(RQ_BUSY, &rqstp->rq_flags)) - continue; - percpu_counter_inc(&pool->sp_threads_woken); - rqstp->rq_qtime = ktime_get(); - wake_up_process(rqstp->rq_task); - goto out_unlock; - } - set_bit(SP_CONGESTED, &pool->sp_flags); - rqstp = NULL; -out_unlock: - rcu_read_unlock(); - trace_svc_xprt_enqueue(xprt, rqstp); + svc_pool_wake_idle_thread(pool); } EXPORT_SYMBOL_GPL(svc_xprt_enqueue); @@ -581,7 +567,10 @@ static void svc_xprt_release(struct svc_rqst *rqstp) svc_xprt_put(xprt); } -/* +/** + * svc_wake_up - Wake up a service thread for non-transport work + * @serv: RPC service + * * Some svc_serv's will have occasional work to do, even when a xprt is not * waiting to be serviced. This function is there to "kick" a task in one of * those services so that it can wake up and do that work. 
Note that we only @@ -590,27 +579,10 @@ static void svc_xprt_release(struct svc_rqst *rqstp) */ void svc_wake_up(struct svc_serv *serv) { - struct svc_rqst *rqstp; - struct svc_pool *pool; - - pool = &serv->sv_pools[0]; + struct svc_pool *pool = &serv->sv_pools[0]; - rcu_read_lock(); - list_for_each_entry_rcu(rqstp, &pool->sp_all_threads, rq_all) { - /* skip any that aren't queued */ - if (test_bit(RQ_BUSY, &rqstp->rq_flags)) - continue; - rcu_read_unlock(); - wake_up_process(rqstp->rq_task); - trace_svc_wake_up(rqstp->rq_task->pid); - return; - } - rcu_read_unlock(); - - /* No free entries available */ set_bit(SP_TASK_PENDING, &pool->sp_flags); - smp_wmb(); - trace_svc_wake_up(0); + svc_pool_wake_idle_thread(pool); } EXPORT_SYMBOL_GPL(svc_wake_up); @@ -679,7 +651,7 @@ static void svc_check_conn_limits(struct svc_serv *serv) } } -static int svc_alloc_arg(struct svc_rqst *rqstp) +static bool svc_alloc_arg(struct svc_rqst *rqstp) { struct svc_serv *serv = rqstp->rq_server; struct xdr_buf *arg = &rqstp->rq_arg; @@ -701,10 +673,10 @@ static int svc_alloc_arg(struct svc_rqst *rqstp) /* Made progress, don't sleep yet */ continue; - set_current_state(TASK_INTERRUPTIBLE); - if (signalled() || kthread_should_stop()) { + set_current_state(TASK_IDLE); + if (kthread_should_stop()) { set_current_state(TASK_RUNNING); - return -EINTR; + return false; } trace_svc_alloc_arg_err(pages, ret); memalloc_retry_wait(GFP_KERNEL); @@ -723,7 +695,7 @@ static int svc_alloc_arg(struct svc_rqst *rqstp) arg->tail[0].iov_len = 0; rqstp->rq_xid = xdr_zero; - return 0; + return true; } static bool @@ -732,7 +704,7 @@ rqst_should_sleep(struct svc_rqst *rqstp) struct svc_pool *pool = rqstp->rq_pool; /* did someone call svc_wake_up? */ - if (test_and_clear_bit(SP_TASK_PENDING, &pool->sp_flags)) + if (test_bit(SP_TASK_PENDING, &pool->sp_flags)) return false; /* was a socket queued? */ @@ -740,7 +712,7 @@ rqst_should_sleep(struct svc_rqst *rqstp) return false; /* are we shutting down? */ - if (signalled() || kthread_should_stop()) + if (kthread_should_stop()) return false; /* are we freezing? */ @@ -750,10 +722,9 @@ rqst_should_sleep(struct svc_rqst *rqstp) return true; } -static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) +static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp) { struct svc_pool *pool = rqstp->rq_pool; - long time_left = 0; /* rq_xprt should be clear on entry */ WARN_ON_ONCE(rqstp->rq_xprt); @@ -762,18 +733,14 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) if (rqstp->rq_xprt) goto out_found; - /* - * We have to be able to interrupt this wait - * to bring down the daemons ... 
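That comment is being deleted because its premise no longer holds: svc threads are no longer brought down by signals. The replacement wait pattern, condensed into a sketch:

	/* TASK_IDLE is TASK_UNINTERRUPTIBLE | TASK_NOLOAD: the thread
	 * parks without inflating the load average and cannot be woken
	 * by a signal. kthread_stop() is now the only shutdown path,
	 * which is why the signalled() checks vanish throughout this
	 * series.
	 */
	set_current_state(TASK_IDLE);
	if (!kthread_should_stop())
		schedule();
	__set_current_state(TASK_RUNNING);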
- */ - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_IDLE); smp_mb__before_atomic(); clear_bit(SP_CONGESTED, &pool->sp_flags); clear_bit(RQ_BUSY, &rqstp->rq_flags); smp_mb__after_atomic(); if (likely(rqst_should_sleep(rqstp))) - time_left = schedule_timeout(timeout); + schedule(); else __set_current_state(TASK_RUNNING); @@ -781,17 +748,16 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) set_bit(RQ_BUSY, &rqstp->rq_flags); smp_mb__after_atomic(); + clear_bit(SP_TASK_PENDING, &pool->sp_flags); rqstp->rq_xprt = svc_xprt_dequeue(pool); if (rqstp->rq_xprt) goto out_found; - if (!time_left) - percpu_counter_inc(&pool->sp_threads_timedout); - - if (signalled() || kthread_should_stop()) - return ERR_PTR(-EINTR); - return ERR_PTR(-EAGAIN); + if (kthread_should_stop()) + return NULL; + return NULL; out_found: + clear_bit(SP_TASK_PENDING, &pool->sp_flags); /* Normally we will wait up to 5 seconds for any required * cache information to be provided. */ @@ -867,37 +833,35 @@ out: return len; } -/* - * Receive the next request on any transport. This code is carefully - * organised not to touch any cachelines in the shared svc_serv - * structure, only cachelines in the local svc_pool. +/** + * svc_recv - Receive and process the next request on any transport + * @rqstp: an idle RPC service thread + * + * This code is carefully organised not to touch any cachelines in + * the shared svc_serv structure, only cachelines in the local + * svc_pool. */ -int svc_recv(struct svc_rqst *rqstp, long timeout) +void svc_recv(struct svc_rqst *rqstp) { struct svc_xprt *xprt = NULL; struct svc_serv *serv = rqstp->rq_server; - int len, err; + int len; - err = svc_alloc_arg(rqstp); - if (err) + if (!svc_alloc_arg(rqstp)) goto out; try_to_freeze(); cond_resched(); - err = -EINTR; - if (signalled() || kthread_should_stop()) + if (kthread_should_stop()) goto out; - xprt = svc_get_next_xprt(rqstp, timeout); - if (IS_ERR(xprt)) { - err = PTR_ERR(xprt); + xprt = svc_get_next_xprt(rqstp); + if (!xprt) goto out; - } len = svc_handle_xprt(rqstp, xprt); /* No data, incomplete (TCP) read, or accept() */ - err = -EAGAIN; if (len <= 0) goto out_release; @@ -909,13 +873,14 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) if (serv->sv_stats) serv->sv_stats->netcnt++; + percpu_counter_inc(&rqstp->rq_pool->sp_messages_arrived); rqstp->rq_stime = ktime_get(); - return len; + svc_process(rqstp); +out: + return; out_release: rqstp->rq_res.len = 0; svc_xprt_release(rqstp); -out: - return err; } EXPORT_SYMBOL_GPL(svc_recv); @@ -1456,12 +1421,11 @@ static int svc_pool_stats_show(struct seq_file *m, void *p) return 0; } - seq_printf(m, "%u %llu %llu %llu %llu\n", - pool->sp_id, - percpu_counter_sum_positive(&pool->sp_sockets_queued), - percpu_counter_sum_positive(&pool->sp_sockets_queued), - percpu_counter_sum_positive(&pool->sp_threads_woken), - percpu_counter_sum_positive(&pool->sp_threads_timedout)); + seq_printf(m, "%u %llu %llu %llu 0\n", + pool->sp_id, + percpu_counter_sum_positive(&pool->sp_messages_arrived), + percpu_counter_sum_positive(&pool->sp_sockets_queued), + percpu_counter_sum_positive(&pool->sp_threads_woken)); return 0; } diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index 67d8245a08af..aa4429d0b810 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -60,8 +60,19 @@ svc_put_auth_ops(struct auth_ops *aops) module_put(aops->owner); } -int -svc_authenticate(struct svc_rqst *rqstp) +/** + * svc_authenticate - Initialize an outgoing credential + * @rqstp: 
RPC execution context + * + * Return values: + * %SVC_OK: XDR encoding of the result can begin + * %SVC_DENIED: Credential or verifier is not valid + * %SVC_GARBAGE: Failed to decode credential or verifier + * %SVC_COMPLETE: GSS context lifetime event; no further action + * %SVC_DROP: Drop this request; no further action + * %SVC_CLOSE: Like drop, but also close transport connection + */ +enum svc_auth_status svc_authenticate(struct svc_rqst *rqstp) { struct auth_ops *aops; u32 flavor; @@ -89,16 +100,28 @@ svc_authenticate(struct svc_rqst *rqstp) } EXPORT_SYMBOL_GPL(svc_authenticate); -int svc_set_client(struct svc_rqst *rqstp) +/** + * svc_set_client - Assign an appropriate 'auth_domain' as the client + * @rqstp: RPC execution context + * + * Return values: + * %SVC_OK: Client was found and assigned + * %SVC_DENY: Client was explicitly denied + * %SVC_DROP: Ignore this request + * %SVC_CLOSE: Ignore this request and close the connection + */ +enum svc_auth_status svc_set_client(struct svc_rqst *rqstp) { rqstp->rq_client = NULL; return rqstp->rq_authop->set_client(rqstp); } EXPORT_SYMBOL_GPL(svc_set_client); -/* A request, which was authenticated, has now executed. - * Time to finalise the credentials and verifier - * and release and resources +/** + * svc_authorise - Finalize credentials/verifier and release resources + * @rqstp: RPC execution context + * + * Returns zero on success, or a negative errno. */ int svc_authorise(struct svc_rqst *rqstp) { diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 174783f804fa..04b45588ae6f 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -665,7 +665,7 @@ static struct group_info *unix_gid_find(kuid_t uid, struct svc_rqst *rqstp) } } -int +enum svc_auth_status svcauth_unix_set_client(struct svc_rqst *rqstp) { struct sockaddr_in *sin; @@ -736,7 +736,6 @@ out: rqstp->rq_auth_stat = rpc_auth_ok; return SVC_OK; } - EXPORT_SYMBOL_GPL(svcauth_unix_set_client); /** @@ -751,7 +750,7 @@ EXPORT_SYMBOL_GPL(svcauth_unix_set_client); * * rqstp->rq_auth_stat is set as mandated by RFC 5531. */ -static int +static enum svc_auth_status svcauth_null_accept(struct svc_rqst *rqstp) { struct xdr_stream *xdr = &rqstp->rq_arg_stream; @@ -828,7 +827,7 @@ struct auth_ops svcauth_null = { * * rqstp->rq_auth_stat is set as mandated by RFC 5531. */ -static int +static enum svc_auth_status svcauth_tls_accept(struct svc_rqst *rqstp) { struct xdr_stream *xdr = &rqstp->rq_arg_stream; @@ -913,7 +912,7 @@ struct auth_ops svcauth_tls = { * * rqstp->rq_auth_stat is set as mandated by RFC 5531. 
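As a concrete illustration of that contract, a typical deny path in an ->accept method now pairs the RFC 5531 status with the typed return code (a minimal sketch, not taken from this patch):

	rqstp->rq_auth_stat = rpc_autherr_badcred;	/* reported to the client */
	return SVC_DENIED;				/* steers svc_process_common() */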
*/ -static int +static enum svc_auth_status svcauth_unix_accept(struct svc_rqst *rqstp) { struct xdr_stream *xdr = &rqstp->rq_arg_stream; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 8c9a8ee76aa0..998687421fa6 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -36,6 +36,8 @@ #include <linux/skbuff.h> #include <linux/file.h> #include <linux/freezer.h> +#include <linux/bvec.h> + #include <net/sock.h> #include <net/checksum.h> #include <net/ip.h> @@ -695,9 +697,10 @@ static int svc_udp_sendto(struct svc_rqst *rqstp) .msg_name = &rqstp->rq_addr, .msg_namelen = rqstp->rq_addrlen, .msg_control = cmh, + .msg_flags = MSG_SPLICE_PAGES, .msg_controllen = sizeof(buffer), }; - unsigned int sent; + unsigned int count; int err; svc_udp_release_ctxt(xprt, rqstp->rq_xprt_ctxt); @@ -710,22 +713,23 @@ static int svc_udp_sendto(struct svc_rqst *rqstp) if (svc_xprt_is_dead(xprt)) goto out_notconn; - err = xdr_alloc_bvec(xdr, GFP_KERNEL); - if (err < 0) - goto out_unlock; + count = xdr_buf_to_bvec(rqstp->rq_bvec, + ARRAY_SIZE(rqstp->rq_bvec), xdr); - err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, 0, &sent); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, + count, 0); + err = sock_sendmsg(svsk->sk_sock, &msg); if (err == -ECONNREFUSED) { /* ICMP error on earlier request. */ - err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, 0, &sent); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, + count, 0); + err = sock_sendmsg(svsk->sk_sock, &msg); } - xdr_free_bvec(xdr); + trace_svcsock_udp_send(xprt, err); -out_unlock: + mutex_unlock(&xprt->xpt_mutex); - if (err < 0) - return err; - return sent; + return err; out_notconn: mutex_unlock(&xprt->xpt_mutex); @@ -1089,6 +1093,9 @@ static void svc_tcp_fragment_received(struct svc_sock *svsk) /* If we have more data, signal svc_xprt_enqueue() to try again */ svsk->sk_tcplen = 0; svsk->sk_marker = xdr_zero; + + smp_wmb(); + tcp_set_rcvlowat(svsk->sk_sk, 1); } /** @@ -1178,10 +1185,17 @@ err_incomplete: goto err_delete; if (len == want) svc_tcp_fragment_received(svsk); - else + else { + /* Avoid more ->sk_data_ready() calls until the rest + * of the message has arrived. This reduces service + * thread wake-ups on large incoming messages. */ + tcp_set_rcvlowat(svsk->sk_sk, + svc_sock_reclen(svsk) - svsk->sk_tcplen); + trace_svcsock_tcp_recv_short(&svsk->sk_xprt, svc_sock_reclen(svsk), svsk->sk_tcplen - sizeof(rpc_fraghdr)); + } goto err_noclose; error: if (len != -EAGAIN) @@ -1198,75 +1212,51 @@ err_noclose: return 0; /* record not complete */ } -static int svc_tcp_send_kvec(struct socket *sock, const struct kvec *vec, - int flags) -{ - struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES | flags, }; - - iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, vec, 1, vec->iov_len); - return sock_sendmsg(sock, &msg); -} - /* * MSG_SPLICE_PAGES is used exclusively to reduce the number of * copy operations in this path. Therefore the caller must ensure * that the pages backing @xdr are unchanging. * - * In addition, the logic assumes that * .bv_len is never larger - * than PAGE_SIZE. + * Note that the send is non-blocking. The caller has incremented + * the reference count on each page backing the RPC message, and + * the network layer will "put" these pages when transmission is + * complete. + * + * This is safe for our RPC services because the memory backing + * the head and tail components is never kmalloc'd. These always + * come from pages in the svc_rqst::rq_pages array. 
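The heart of the new send path, reduced to a sketch with illustrative local names:

	struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES };
	int ret;

	/* Describe the whole reply as bio_vecs instead of copying it. */
	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, nr_bvecs, total_len);

	/* The transport takes its own page references and splices the
	 * pages into the socket; the caller must leave the pages
	 * untouched until those references are dropped on completion.
	 */
	ret = sock_sendmsg(sock, &msg);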
*/ -static int svc_tcp_sendmsg(struct socket *sock, struct xdr_buf *xdr, +static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp, rpc_fraghdr marker, unsigned int *sentp) { - const struct kvec *head = xdr->head; - const struct kvec *tail = xdr->tail; - struct kvec rm = { - .iov_base = &marker, - .iov_len = sizeof(marker), - }; struct msghdr msg = { - .msg_flags = 0, + .msg_flags = MSG_SPLICE_PAGES, }; + unsigned int count; + void *buf; int ret; *sentp = 0; - ret = xdr_alloc_bvec(xdr, GFP_KERNEL); - if (ret < 0) - return ret; - - ret = kernel_sendmsg(sock, &msg, &rm, 1, rm.iov_len); - if (ret < 0) - return ret; - *sentp += ret; - if (ret != rm.iov_len) - return -EAGAIN; - - ret = svc_tcp_send_kvec(sock, head, 0); - if (ret < 0) - return ret; - *sentp += ret; - if (ret != head->iov_len) - goto out; - if (xdr_buf_pagecount(xdr)) - xdr->bvec[0].bv_offset = offset_in_page(xdr->page_base); - - msg.msg_flags = MSG_SPLICE_PAGES; - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, xdr->bvec, - xdr_buf_pagecount(xdr), xdr->page_len); - ret = sock_sendmsg(sock, &msg); + /* The stream record marker is copied into a temporary page + * fragment buffer so that it can be included in rq_bvec. + */ + buf = page_frag_alloc(&svsk->sk_frag_cache, sizeof(marker), + GFP_KERNEL); + if (!buf) + return -ENOMEM; + memcpy(buf, &marker, sizeof(marker)); + bvec_set_virt(rqstp->rq_bvec, buf, sizeof(marker)); + + count = xdr_buf_to_bvec(rqstp->rq_bvec + 1, + ARRAY_SIZE(rqstp->rq_bvec) - 1, &rqstp->rq_res); + + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, + 1 + count, sizeof(marker) + rqstp->rq_res.len); + ret = sock_sendmsg(svsk->sk_sock, &msg); if (ret < 0) return ret; *sentp += ret; - - if (tail->iov_len) { - ret = svc_tcp_send_kvec(sock, tail, 0); - if (ret < 0) - return ret; - *sentp += ret; - } - -out: return 0; } @@ -1292,23 +1282,17 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp) svc_tcp_release_ctxt(xprt, rqstp->rq_xprt_ctxt); rqstp->rq_xprt_ctxt = NULL; - atomic_inc(&svsk->sk_sendqlen); mutex_lock(&xprt->xpt_mutex); if (svc_xprt_is_dead(xprt)) goto out_notconn; - tcp_sock_set_cork(svsk->sk_sk, true); - err = svc_tcp_sendmsg(svsk->sk_sock, xdr, marker, &sent); - xdr_free_bvec(xdr); + err = svc_tcp_sendmsg(svsk, rqstp, marker, &sent); trace_svcsock_tcp_send(xprt, err < 0 ? (long)err : sent); if (err < 0 || sent != (xdr->len + sizeof(marker))) goto out_close; - if (atomic_dec_and_test(&svsk->sk_sendqlen)) - tcp_sock_set_cork(svsk->sk_sk, false); mutex_unlock(&xprt->xpt_mutex); return sent; out_notconn: - atomic_dec(&svsk->sk_sendqlen); mutex_unlock(&xprt->xpt_mutex); return -ENOTCONN; out_close: @@ -1317,7 +1301,6 @@ out_close: (err < 0) ? "got error" : "sent", (err < 0) ? 
err : sent, xdr->len); svc_xprt_deferred_close(xprt); - atomic_dec(&svsk->sk_sendqlen); mutex_unlock(&xprt->xpt_mutex); return -EAGAIN; } @@ -1644,6 +1627,7 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt) static void svc_sock_free(struct svc_xprt *xprt) { struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct page_frag_cache *pfc = &svsk->sk_frag_cache; struct socket *sock = svsk->sk_sock; trace_svcsock_free(svsk, sock); @@ -1653,5 +1637,8 @@ static void svc_sock_free(struct svc_xprt *xprt) sockfd_put(sock); else sock_release(sock); + if (pfc->va) + __page_frag_cache_drain(virt_to_head_page(pfc->va), + pfc->pagecnt_bias); kfree(svsk); } diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 2a22e78af116..62e07c330a66 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -165,6 +165,56 @@ xdr_free_bvec(struct xdr_buf *buf) } /** + * xdr_buf_to_bvec - Copy components of an xdr_buf into a bio_vec array + * @bvec: bio_vec array to populate + * @bvec_size: element count of @bio_vec + * @xdr: xdr_buf to be copied + * + * Returns the number of entries consumed in @bvec. + */ +unsigned int xdr_buf_to_bvec(struct bio_vec *bvec, unsigned int bvec_size, + const struct xdr_buf *xdr) +{ + const struct kvec *head = xdr->head; + const struct kvec *tail = xdr->tail; + unsigned int count = 0; + + if (head->iov_len) { + bvec_set_virt(bvec++, head->iov_base, head->iov_len); + ++count; + } + + if (xdr->page_len) { + unsigned int offset, len, remaining; + struct page **pages = xdr->pages; + + offset = offset_in_page(xdr->page_base); + remaining = xdr->page_len; + while (remaining > 0) { + len = min_t(unsigned int, remaining, + PAGE_SIZE - offset); + bvec_set_page(bvec++, *pages++, len, offset); + remaining -= len; + offset = 0; + if (unlikely(++count > bvec_size)) + goto bvec_overflow; + } + } + + if (tail->iov_len) { + bvec_set_virt(bvec, tail->iov_base, tail->iov_len); + if (unlikely(++count > bvec_size)) + goto bvec_overflow; + } + + return count; + +bvec_overflow: + pr_warn_once("%s: bio_vec array overflow\n", __func__); + return count - 1; +} + +/** * xdr_inline_pages - Prepare receive buffer for a large reply * @xdr: xdr_buf into which reply will be placed * @offset: expected offset where data payload will start, in bytes @@ -1288,6 +1338,14 @@ static unsigned int xdr_set_tail_base(struct xdr_stream *xdr, return xdr_set_iov(xdr, buf->tail, base, len); } +static void xdr_stream_unmap_current_page(struct xdr_stream *xdr) +{ + if (xdr->page_kaddr) { + kunmap_local(xdr->page_kaddr); + xdr->page_kaddr = NULL; + } +} + static unsigned int xdr_set_page_base(struct xdr_stream *xdr, unsigned int base, unsigned int len) { @@ -1305,12 +1363,18 @@ static unsigned int xdr_set_page_base(struct xdr_stream *xdr, if (len > maxlen) len = maxlen; + xdr_stream_unmap_current_page(xdr); xdr_stream_page_set_pos(xdr, base); base += xdr->buf->page_base; pgnr = base >> PAGE_SHIFT; xdr->page_ptr = &xdr->buf->pages[pgnr]; - kaddr = page_address(*xdr->page_ptr); + + if (PageHighMem(*xdr->page_ptr)) { + xdr->page_kaddr = kmap_local_page(*xdr->page_ptr); + kaddr = xdr->page_kaddr; + } else + kaddr = page_address(*xdr->page_ptr); pgoff = base & ~PAGE_MASK; xdr->p = (__be32*)(kaddr + pgoff); @@ -1364,6 +1428,7 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, struct rpc_rqst *rqst) { xdr->buf = buf; + xdr->page_kaddr = NULL; xdr_reset_scratch_buffer(xdr); xdr->nwords = XDR_QUADLEN(buf->len); if (xdr_set_iov(xdr, buf->head, 0, buf->len) == 0 && @@ -1396,6 +1461,16 @@ void 
xdr_init_decode_pages(struct xdr_stream *xdr, struct xdr_buf *buf, } EXPORT_SYMBOL_GPL(xdr_init_decode_pages); +/** + * xdr_finish_decode - Clean up the xdr_stream after decoding data. + * @xdr: pointer to xdr_stream struct + */ +void xdr_finish_decode(struct xdr_stream *xdr) +{ + xdr_stream_unmap_current_page(xdr); +} +EXPORT_SYMBOL(xdr_finish_decode); + static __be32 * __xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes) { unsigned int nwords = XDR_QUADLEN(nbytes); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 5e5ff6784ef5..da409450dfc0 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -593,7 +593,6 @@ void xprt_rdma_cleanup(void); int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int); size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *); unsigned int xprt_rdma_bc_max_slots(struct rpc_xprt *); -int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int); void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *); int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst); void xprt_rdma_bc_free_rqst(struct rpc_rqst *); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 268a2cc61acd..a15bf2ede89b 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2237,9 +2237,13 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, struct socket *sock) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + struct net *net = sock_net(sock->sk); + unsigned long connect_timeout; + unsigned long syn_retries; unsigned int keepidle; unsigned int keepcnt; unsigned int timeo; + unsigned long t; spin_lock(&xprt->transport_lock); keepidle = DIV_ROUND_UP(xprt->timeout->to_initval, HZ); @@ -2257,6 +2261,35 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, /* TCP user timeout (see RFC5482) */ tcp_sock_set_user_timeout(sock->sk, timeo); + + /* Connect timeout */ + connect_timeout = max_t(unsigned long, + DIV_ROUND_UP(xprt->connect_timeout, HZ), 1); + syn_retries = max_t(unsigned long, + READ_ONCE(net->ipv4.sysctl_tcp_syn_retries), 1); + for (t = 0; t <= syn_retries && (1UL << t) < connect_timeout; t++) + ; + if (t <= syn_retries) + tcp_sock_set_syncnt(sock->sk, t - 1); +} + +static void xs_tcp_do_set_connect_timeout(struct rpc_xprt *xprt, + unsigned long connect_timeout) +{ + struct sock_xprt *transport = + container_of(xprt, struct sock_xprt, xprt); + struct rpc_timeout to; + unsigned long initval; + + memcpy(&to, xprt->timeout, sizeof(to)); + /* Arbitrary lower limit */ + initval = max_t(unsigned long, connect_timeout, XS_TCP_INIT_REEST_TO); + to.to_initval = initval; + to.to_maxval = initval; + to.to_retries = 0; + memcpy(&transport->tcp_timeout, &to, sizeof(transport->tcp_timeout)); + xprt->timeout = &transport->tcp_timeout; + xprt->connect_timeout = connect_timeout; } static void xs_tcp_set_connect_timeout(struct rpc_xprt *xprt, @@ -2264,25 +2297,12 @@ static void xs_tcp_set_connect_timeout(struct rpc_xprt *xprt, unsigned long reconnect_timeout) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); - struct rpc_timeout to; - unsigned long initval; spin_lock(&xprt->transport_lock); if (reconnect_timeout < xprt->max_reconnect_timeout) xprt->max_reconnect_timeout = reconnect_timeout; - if (connect_timeout < xprt->connect_timeout) { - memcpy(&to, xprt->timeout, sizeof(to)); - initval = DIV_ROUND_UP(connect_timeout, to.to_retries + 1); - /* Arbitrary lower limit */ - if (initval < XS_TCP_INIT_REEST_TO << 1) - initval = 
XS_TCP_INIT_REEST_TO << 1; - to.to_initval = initval; - to.to_maxval = initval; - memcpy(&transport->tcp_timeout, &to, - sizeof(transport->tcp_timeout)); - xprt->timeout = &transport->tcp_timeout; - xprt->connect_timeout = connect_timeout; - } + if (connect_timeout < xprt->connect_timeout) + xs_tcp_do_set_connect_timeout(xprt, connect_timeout); set_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state); spin_unlock(&xprt->transport_lock); } @@ -2652,6 +2672,10 @@ static void xs_tcp_tls_setup_socket(struct work_struct *work) rcu_read_lock(); lower_xprt = rcu_dereference(lower_clnt->cl_xprt); rcu_read_unlock(); + + if (wait_on_bit_lock(&lower_xprt->state, XPRT_LOCKED, TASK_KILLABLE)) + goto out_unlock; + status = xs_tls_handshake_sync(lower_xprt, &upper_xprt->xprtsec); if (status) { trace_rpc_tls_not_started(upper_clnt, upper_xprt); @@ -2661,6 +2685,7 @@ static void xs_tcp_tls_setup_socket(struct work_struct *work) status = xs_tcp_tls_finish_connecting(lower_xprt, upper_transport); if (status) goto out_close; + xprt_release_write(lower_xprt, NULL); trace_rpc_socket_connect(upper_xprt, upper_transport->sock, 0); if (!xprt_test_and_set_connected(upper_xprt)) { @@ -2682,6 +2707,7 @@ out_unlock: return; out_close: + xprt_release_write(lower_xprt, NULL); rpc_shutdown_client(lower_clnt); /* xprt_force_disconnect() wakes tasks with a fixed tk_status code. @@ -3335,8 +3361,13 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args) xprt->timeout = &xs_tcp_default_timeout; xprt->max_reconnect_timeout = xprt->timeout->to_maxval; + if (args->reconnect_timeout) + xprt->max_reconnect_timeout = args->reconnect_timeout; + xprt->connect_timeout = xprt->timeout->to_initval * (xprt->timeout->to_retries + 1); + if (args->connect_timeout) + xs_tcp_do_set_connect_timeout(xprt, args->connect_timeout); INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn); INIT_WORK(&transport->error_worker, xs_error_handle); diff --git a/net/sysctl_net.c b/net/sysctl_net.c index 4b45ed631eb8..051ed5f6fc93 100644 --- a/net/sysctl_net.c +++ b/net/sysctl_net.c @@ -101,7 +101,7 @@ __init int net_sysctl_init(void) * registering "/proc/sys/net" as an empty directory not in a * network namespace. */ - net_header = register_sysctl("net", empty); + net_header = register_sysctl_sz("net", empty, 0); if (!net_header) goto out; ret = register_pernet_subsys(&sysctl_pernet_ops); @@ -122,12 +122,13 @@ out1: * allocated. 
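The conversion pattern used by the callers in this series, sketched with a hypothetical table:

	static int example_value;

	static struct ctl_table example_table[] = {
		{
			.procname	= "example_knob",
			.data		= &example_value,
			.maxlen		= sizeof(int),
			.mode		= 0644,
			.proc_handler	= proc_dointvec,
		},
		{ }	/* sentinel; still present during the transition */
	};

	hdr = register_net_sysctl_sz(net, "net/example", example_table,
				     ARRAY_SIZE(example_table));

Passing the element count explicitly is what will eventually allow dropping the empty sentinel entry; until then, registration simply stops at whichever limit it reaches first.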
*/ static void ensure_safe_net_sysctl(struct net *net, const char *path, - struct ctl_table *table) + struct ctl_table *table, size_t table_size) { struct ctl_table *ent; pr_debug("Registering net sysctl (net %p): %s\n", net, path); - for (ent = table; ent->procname; ent++) { + ent = table; + for (size_t i = 0; i < table_size && ent->procname; ent++, i++) { unsigned long addr; const char *where; @@ -160,15 +161,24 @@ static void ensure_safe_net_sysctl(struct net *net, const char *path, } } -struct ctl_table_header *register_net_sysctl(struct net *net, - const char *path, struct ctl_table *table) +struct ctl_table_header *register_net_sysctl_sz(struct net *net, + const char *path, + struct ctl_table *table, + size_t table_size) { + int count; + struct ctl_table *entry; + if (!net_eq(net, &init_net)) - ensure_safe_net_sysctl(net, path, table); + ensure_safe_net_sysctl(net, path, table, table_size); + + entry = table; + for (count = 0 ; count < table_size && entry->procname; entry++, count++) + ; - return __register_sysctl_table(&net->sysctls, path, table); + return __register_sysctl_table(&net->sysctls, path, table, count); } -EXPORT_SYMBOL_GPL(register_net_sysctl); +EXPORT_SYMBOL_GPL(register_net_sysctl_sz); void unregister_net_sysctl_table(struct ctl_table_header *header) { diff --git a/net/tipc/core.h b/net/tipc/core.h index 0a3f7a70a50a..7eccd97e0609 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -197,7 +197,7 @@ static inline int less(u16 left, u16 right) return less_eq(left, right) && (mod(right) != mod(left)); } -static inline int in_range(u16 val, u16 min, u16 max) +static inline int tipc_in_range(u16 val, u16 min, u16 max) { return !less(val, min) && !more(val, max); } diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index 302fd749c424..43c3f1c971b8 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -1441,14 +1441,14 @@ static int tipc_crypto_key_revoke(struct net *net, u8 tx_key) struct tipc_crypto *tx = tipc_net(net)->crypto_tx; struct tipc_key key; - spin_lock(&tx->lock); + spin_lock_bh(&tx->lock); key = tx->key; WARN_ON(!key.active || tx_key != key.active); /* Free the active key */ tipc_crypto_key_set_state(tx, key.passive, 0, key.pending); tipc_crypto_key_detach(tx->aead[key.active], &tx->lock); - spin_unlock(&tx->lock); + spin_unlock_bh(&tx->lock); pr_warn("%s: key is revoked\n", tx->name); return -EKEYREVOKED; diff --git a/net/tipc/link.c b/net/tipc/link.c index 2eff1c7949cb..e33b4f29f77c 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1623,7 +1623,7 @@ next_gap_ack: last_ga->bgack_cnt); } /* Check against the last Gap ACK block */ - if (in_range(seqno, start, end)) + if (tipc_in_range(seqno, start, end)) continue; /* Update/release the packet peer is acking */ bc_has_acked = true; @@ -2251,12 +2251,12 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, strncpy(if_name, data, TIPC_MAX_IF_NAME); /* Update own tolerance if peer indicates a non-zero value */ - if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) { + if (tipc_in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) { l->tolerance = peers_tol; l->bc_rcvlink->tolerance = peers_tol; } /* Update own priority if peer's priority is higher */ - if (in_range(peers_prio, l->priority + 1, TIPC_MAX_LINK_PRI)) + if (tipc_in_range(peers_prio, l->priority + 1, TIPC_MAX_LINK_PRI)) l->priority = peers_prio; /* If peer is going down we want full re-establish cycle */ @@ -2299,13 +2299,13 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, 
l->rcv_nxt_state = msg_seqno(hdr) + 1; /* Update own tolerance if peer indicates a non-zero value */ - if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) { + if (tipc_in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) { l->tolerance = peers_tol; l->bc_rcvlink->tolerance = peers_tol; } /* Update own prio if peer indicates a different value */ if ((peers_prio != l->priority) && - in_range(peers_prio, 1, TIPC_MAX_LINK_PRI)) { + tipc_in_range(peers_prio, 1, TIPC_MAX_LINK_PRI)) { l->priority = peers_prio; rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT); } diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 02f583ff9239..002483e60c19 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -139,8 +139,8 @@ void update_sk_prot(struct sock *sk, struct tls_context *ctx) int wait_on_pending_writer(struct sock *sk, long *timeo) { - int rc = 0; DEFINE_WAIT_FUNC(wait, woken_wake_function); + int ret, rc = 0; add_wait_queue(sk_sleep(sk), &wait); while (1) { @@ -154,9 +154,13 @@ int wait_on_pending_writer(struct sock *sk, long *timeo) break; } - if (sk_wait_event(sk, timeo, - !READ_ONCE(sk->sk_write_pending), &wait)) + ret = sk_wait_event(sk, timeo, + !READ_ONCE(sk->sk_write_pending), &wait); + if (ret) { + if (ret < 0) + rc = ret; break; + } } remove_wait_queue(sk_sleep(sk), &wait); return rc; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 1ed4a611631f..e9d1e83a859d 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -817,7 +817,7 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, psock = sk_psock_get(sk); if (!psock || !policy) { err = tls_push_record(sk, flags, record_type); - if (err && sk->sk_err == EBADMSG) { + if (err && err != -EINPROGRESS && sk->sk_err == EBADMSG) { *copied -= sk_msg_free(sk, msg); tls_free_open_rec(sk); err = -sk->sk_err; @@ -846,7 +846,7 @@ more_data: switch (psock->eval) { case __SK_PASS: err = tls_push_record(sk, flags, record_type); - if (err && sk->sk_err == EBADMSG) { + if (err && err != -EINPROGRESS && sk->sk_err == EBADMSG) { *copied -= sk_msg_free(sk, msg); tls_free_open_rec(sk); err = -sk->sk_err; @@ -1291,6 +1291,7 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock, struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); DEFINE_WAIT_FUNC(wait, woken_wake_function); + int ret = 0; long timeo; timeo = sock_rcvtimeo(sk, nonblock); @@ -1302,6 +1303,9 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock, if (sk->sk_err) return sock_error(sk); + if (ret < 0) + return ret; + if (!skb_queue_empty(&sk->sk_receive_queue)) { tls_strp_check_rcv(&ctx->strp); if (tls_strp_msg_ready(ctx)) @@ -1320,10 +1324,10 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock, released = true; add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - sk_wait_event(sk, &timeo, - tls_strp_msg_ready(ctx) || - !sk_psock_queue_empty(psock), - &wait); + ret = sk_wait_event(sk, &timeo, + tls_strp_msg_ready(ctx) || + !sk_psock_queue_empty(psock), + &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); @@ -1852,6 +1856,7 @@ static int tls_rx_reader_acquire(struct sock *sk, struct tls_sw_context_rx *ctx, bool nonblock) { long timeo; + int ret; timeo = sock_rcvtimeo(sk, nonblock); @@ -1861,14 +1866,16 @@ static int tls_rx_reader_acquire(struct sock *sk, struct tls_sw_context_rx *ctx, ctx->reader_contended = 1; add_wait_queue(&ctx->wq, &wait); - sk_wait_event(sk, &timeo, - 
!READ_ONCE(ctx->reader_present), &wait); + ret = sk_wait_event(sk, &timeo, + !READ_ONCE(ctx->reader_present), &wait); remove_wait_queue(&ctx->wq, &wait); if (timeo <= 0) return -EAGAIN; if (signal_pending(current)) return sock_intr_errno(timeo); + if (ret < 0) + return ret; } WRITE_ONCE(ctx->reader_present, 1); diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c index 500129aa710c..3e84b31c355a 100644 --- a/net/unix/sysctl_net_unix.c +++ b/net/unix/sysctl_net_unix.c @@ -36,7 +36,8 @@ int __net_init unix_sysctl_register(struct net *net) table[0].data = &net->unx.sysctl_max_dgram_qlen; } - net->unx.ctl = register_net_sysctl(net, "net/unix", table); + net->unx.ctl = register_net_sysctl_sz(net, "net/unix", table, + ARRAY_SIZE(unix_table)); if (net->unx.ctl == NULL) goto err_reg; diff --git a/net/wireless/core.c b/net/wireless/core.c index 25bc2e50a061..64e861617110 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1181,16 +1181,11 @@ void wiphy_rfkill_set_hw_state_reason(struct wiphy *wiphy, bool blocked, } EXPORT_SYMBOL(wiphy_rfkill_set_hw_state_reason); -void cfg80211_cqm_config_free(struct wireless_dev *wdev) -{ - kfree(wdev->cqm_config); - wdev->cqm_config = NULL; -} - static void _cfg80211_unregister_wdev(struct wireless_dev *wdev, bool unregister_netdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + struct cfg80211_cqm_config *cqm_config; unsigned int link_id; ASSERT_RTNL(); @@ -1227,7 +1222,10 @@ static void _cfg80211_unregister_wdev(struct wireless_dev *wdev, kfree_sensitive(wdev->wext.keys); wdev->wext.keys = NULL; #endif - cfg80211_cqm_config_free(wdev); + wiphy_work_cancel(wdev->wiphy, &wdev->cqm_rssi_work); + /* deleted from the list, so can't be found from nl80211 any more */ + cqm_config = rcu_access_pointer(wdev->cqm_config); + kfree_rcu(cqm_config, rcu_head); /* * Ensure that all events have been processed and @@ -1379,6 +1377,8 @@ void cfg80211_init_wdev(struct wireless_dev *wdev) wdev->wext.connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC; #endif + wiphy_work_init(&wdev->cqm_rssi_work, cfg80211_cqm_rssi_notify_work); + if (wdev->wiphy->flags & WIPHY_FLAG_PS_ON_BY_DEFAULT) wdev->ps = true; else diff --git a/net/wireless/core.h b/net/wireless/core.h index 507d184b8b40..ba9c7170afa4 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -295,12 +295,17 @@ struct cfg80211_beacon_registration { }; struct cfg80211_cqm_config { + struct rcu_head rcu_head; u32 rssi_hyst; s32 last_rssi_event_value; + enum nl80211_cqm_rssi_threshold_event last_rssi_event_type; int n_rssi_thresholds; s32 rssi_thresholds[] __counted_by(n_rssi_thresholds); }; +void cfg80211_cqm_rssi_notify_work(struct wiphy *wiphy, + struct wiphy_work *work); + void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev); /* free object */ @@ -566,8 +571,6 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev, #define CFG80211_DEV_WARN_ON(cond) ({bool __r = (cond); __r; }) #endif -void cfg80211_cqm_config_free(struct wireless_dev *wdev); - void cfg80211_release_pmsr(struct wireless_dev *wdev, u32 portid); void cfg80211_pmsr_wdev_down(struct wireless_dev *wdev); void cfg80211_pmsr_free_wk(struct work_struct *work); diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 775cac4d6100..3e2c398abddc 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -52,7 +52,8 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, cr.links[link_id].bssid = data->links[link_id].bss->bssid; cr.links[link_id].addr = data->links[link_id].addr; /* 
need to have local link addresses for MLO connections */ - WARN_ON(cr.ap_mld_addr && !cr.links[link_id].addr); + WARN_ON(cr.ap_mld_addr && + !is_valid_ether_addr(cr.links[link_id].addr)); BUG_ON(!cr.links[link_id].bss->channel); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index de47838aca4f..931a03f4549c 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5909,6 +5909,21 @@ out: nlmsg_free(msg); } +static int nl80211_validate_ap_phy_operation(struct cfg80211_ap_settings *params) +{ + struct ieee80211_channel *channel = params->chandef.chan; + + if ((params->he_cap || params->he_oper) && + (channel->flags & IEEE80211_CHAN_NO_HE)) + return -EOPNOTSUPP; + + if ((params->eht_cap || params->eht_oper) && + (channel->flags & IEEE80211_CHAN_NO_EHT)) + return -EOPNOTSUPP; + + return 0; +} + static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -6178,6 +6193,10 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) if (err) goto out_unlock; + err = nl80211_validate_ap_phy_operation(params); + if (err) + goto out_unlock; + if (info->attrs[NL80211_ATTR_AP_SETTINGS_FLAGS]) params->flags = nla_get_u32( info->attrs[NL80211_ATTR_AP_SETTINGS_FLAGS]); @@ -8482,7 +8501,7 @@ static int nl80211_update_mesh_config(struct sk_buff *skb, struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; - struct mesh_config cfg; + struct mesh_config cfg = {}; u32 mask; int err; @@ -12796,7 +12815,8 @@ static int nl80211_set_cqm_txe(struct genl_info *info, } static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev, - struct net_device *dev) + struct net_device *dev, + struct cfg80211_cqm_config *cqm_config) { struct wireless_dev *wdev = dev->ieee80211_ptr; s32 last, low, high; @@ -12805,7 +12825,7 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev, int err; /* RSSI reporting disabled? */ - if (!wdev->cqm_config) + if (!cqm_config) return rdev_set_cqm_rssi_range_config(rdev, dev, 0, 0); /* @@ -12814,7 +12834,7 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev, * connection is established and enough beacons received to calculate * the average. 
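A worked example of the range computation performed just below, with invented numbers: rssi_thresholds = {-80, -70}, hyst = 3, last = -75.

	/* scan: -75 < -80? no; -75 < -70? yes  =>  i = 1, low_index = 0
	 *
	 *   low  = thresholds[0] - hyst     = -83
	 *   high = thresholds[1] + hyst - 1 = -68
	 */
	rdev_set_cqm_rssi_range_config(rdev, dev, -83, -68);

The device then stays quiet while the signal wanders inside [-83, -68] and fires an event only when it leaves the band around the last reported value.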
@@ -12796,7 +12815,8 @@ static int nl80211_set_cqm_txe(struct genl_info *info,
 }
 
 static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev,
-				    struct net_device *dev)
+				    struct net_device *dev,
+				    struct cfg80211_cqm_config *cqm_config)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	s32 last, low, high;
@@ -12805,7 +12825,7 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev,
 	int err;
 
 	/* RSSI reporting disabled? */
-	if (!wdev->cqm_config)
+	if (!cqm_config)
 		return rdev_set_cqm_rssi_range_config(rdev, dev, 0, 0);
 
 	/*
@@ -12814,7 +12834,7 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev,
 	 * connection is established and enough beacons received to calculate
 	 * the average.
 	 */
-	if (!wdev->cqm_config->last_rssi_event_value &&
+	if (!cqm_config->last_rssi_event_value &&
 	    wdev->links[0].client.current_bss &&
 	    rdev->ops->get_station) {
 		struct station_info sinfo = {};
@@ -12828,30 +12848,30 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev,
 
 		cfg80211_sinfo_release_content(&sinfo);
 		if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG))
-			wdev->cqm_config->last_rssi_event_value =
+			cqm_config->last_rssi_event_value =
 				(s8) sinfo.rx_beacon_signal_avg;
 	}
 
-	last = wdev->cqm_config->last_rssi_event_value;
-	hyst = wdev->cqm_config->rssi_hyst;
-	n = wdev->cqm_config->n_rssi_thresholds;
+	last = cqm_config->last_rssi_event_value;
+	hyst = cqm_config->rssi_hyst;
+	n = cqm_config->n_rssi_thresholds;
 
 	for (i = 0; i < n; i++) {
 		i = array_index_nospec(i, n);
-		if (last < wdev->cqm_config->rssi_thresholds[i])
+		if (last < cqm_config->rssi_thresholds[i])
 			break;
 	}
 
 	low_index = i - 1;
 	if (low_index >= 0) {
 		low_index = array_index_nospec(low_index, n);
-		low = wdev->cqm_config->rssi_thresholds[low_index] - hyst;
+		low = cqm_config->rssi_thresholds[low_index] - hyst;
 	} else {
 		low = S32_MIN;
 	}
 	if (i < n) {
 		i = array_index_nospec(i, n);
-		high = wdev->cqm_config->rssi_thresholds[i] + hyst - 1;
+		high = cqm_config->rssi_thresholds[i] + hyst - 1;
 	} else {
 		high = S32_MAX;
 	}
@@ -12864,6 +12884,7 @@ static int nl80211_set_cqm_rssi(struct genl_info *info,
 			       u32 hysteresis)
 {
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct cfg80211_cqm_config *cqm_config = NULL, *old;
 	struct net_device *dev = info->user_ptr[1];
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	int i, err;
@@ -12881,10 +12902,6 @@ static int nl80211_set_cqm_rssi(struct genl_info *info,
 	    wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)
 		return -EOPNOTSUPP;
 
-	wdev_lock(wdev);
-	cfg80211_cqm_config_free(wdev);
-	wdev_unlock(wdev);
-
 	if (n_thresholds <= 1 && rdev->ops->set_cqm_rssi_config) {
 		if (n_thresholds == 0 || thresholds[0] == 0) /* Disabling */
 			return rdev_set_cqm_rssi_config(rdev, dev, 0, 0);
@@ -12901,9 +12918,10 @@ static int nl80211_set_cqm_rssi(struct genl_info *info,
 		n_thresholds = 0;
 
 	wdev_lock(wdev);
-	if (n_thresholds) {
-		struct cfg80211_cqm_config *cqm_config;
+	old = rcu_dereference_protected(wdev->cqm_config,
+					lockdep_is_held(&wdev->mtx));
 
+	if (n_thresholds) {
 		cqm_config = kzalloc(struct_size(cqm_config, rssi_thresholds,
 						 n_thresholds),
 				     GFP_KERNEL);
@@ -12918,11 +12936,18 @@ static int nl80211_set_cqm_rssi(struct genl_info *info,
 		       flex_array_size(cqm_config, rssi_thresholds,
 				       n_thresholds));
 
-		wdev->cqm_config = cqm_config;
+		rcu_assign_pointer(wdev->cqm_config, cqm_config);
+	} else {
+		RCU_INIT_POINTER(wdev->cqm_config, NULL);
 	}
 
-	err = cfg80211_cqm_rssi_update(rdev, dev);
-
+	err = cfg80211_cqm_rssi_update(rdev, dev, cqm_config);
+	if (err) {
+		rcu_assign_pointer(wdev->cqm_config, old);
+		kfree_rcu(cqm_config, rcu_head);
+	} else {
+		kfree_rcu(old, rcu_head);
+	}
 unlock:
 	wdev_unlock(wdev);
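The reworked nl80211_set_cqm_rssi() above amounts to a publish-or-rollback pattern: grab the old pointer under the update-side lock, publish the replacement with rcu_assign_pointer(), and if applying it fails, restore the old pointer and kfree_rcu() the rejected object instead. A condensed sketch of that flow with hypothetical types and a stand-in driver call (demo_apply), not the actual nl80211 code:

#include <linux/errno.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_cfg {
	struct rcu_head rcu_head;
	int value;
};

struct demo_dev {
	struct mutex mtx;		/* update-side lock */
	struct demo_cfg __rcu *cfg;
};

/* Stand-in for the hardware/driver call that may reject the config. */
static int demo_apply(struct demo_dev *dev, struct demo_cfg *cfg)
{
	return (cfg && cfg->value < 0) ? -EINVAL : 0;
}

/* Swap in @new; on failure, roll back so readers never keep seeing a
 * rejected config.  Whichever object loses is freed after a grace
 * period.  Caller holds dev->mtx.
 */
static int demo_replace(struct demo_dev *dev, struct demo_cfg *new)
{
	struct demo_cfg *old;
	int err;

	old = rcu_dereference_protected(dev->cfg,
					lockdep_is_held(&dev->mtx));
	rcu_assign_pointer(dev->cfg, new);

	err = demo_apply(dev, new);
	if (err) {
		rcu_assign_pointer(dev->cfg, old);
		kfree_rcu(new, rcu_head);
	} else {
		kfree_rcu(old, rcu_head);
	}
	return err;
}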
@@ -19073,9 +19098,8 @@ void cfg80211_cqm_rssi_notify(struct net_device *dev,
 			      enum nl80211_cqm_rssi_threshold_event rssi_event,
 			      s32 rssi_level, gfp_t gfp)
 {
-	struct sk_buff *msg;
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
-	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+	struct cfg80211_cqm_config *cqm_config;
 
 	trace_cfg80211_cqm_rssi_notify(dev, rssi_event, rssi_level);
 
@@ -19083,18 +19107,41 @@ void cfg80211_cqm_rssi_notify(struct net_device *dev,
 	if (WARN_ON(rssi_event != NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW &&
 		    rssi_event != NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH))
 		return;
 
-	if (wdev->cqm_config) {
-		wdev->cqm_config->last_rssi_event_value = rssi_level;
+	rcu_read_lock();
+	cqm_config = rcu_dereference(wdev->cqm_config);
+	if (cqm_config) {
+		cqm_config->last_rssi_event_value = rssi_level;
+		cqm_config->last_rssi_event_type = rssi_event;
+		wiphy_work_queue(wdev->wiphy, &wdev->cqm_rssi_work);
+	}
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL(cfg80211_cqm_rssi_notify);
 
-		cfg80211_cqm_rssi_update(rdev, dev);
+void cfg80211_cqm_rssi_notify_work(struct wiphy *wiphy, struct wiphy_work *work)
+{
+	struct wireless_dev *wdev = container_of(work, struct wireless_dev,
+						 cqm_rssi_work);
+	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+	enum nl80211_cqm_rssi_threshold_event rssi_event;
+	struct cfg80211_cqm_config *cqm_config;
+	struct sk_buff *msg;
+	s32 rssi_level;
 
-		if (rssi_level == 0)
-			rssi_level = wdev->cqm_config->last_rssi_event_value;
-	}
+	wdev_lock(wdev);
+	cqm_config = rcu_dereference_protected(wdev->cqm_config,
+					       lockdep_is_held(&wdev->mtx));
+	if (!wdev->cqm_config)
+		goto unlock;
 
-	msg = cfg80211_prepare_cqm(dev, NULL, gfp);
+	cfg80211_cqm_rssi_update(rdev, wdev->netdev, cqm_config);
+
+	rssi_level = cqm_config->last_rssi_event_value;
+	rssi_event = cqm_config->last_rssi_event_type;
+
+	msg = cfg80211_prepare_cqm(wdev->netdev, NULL, GFP_KERNEL);
 	if (!msg)
-		return;
+		goto unlock;
 
 	if (nla_put_u32(msg, NL80211_ATTR_CQM_RSSI_THRESHOLD_EVENT,
 			rssi_event))
@@ -19104,14 +19151,15 @@ void cfg80211_cqm_rssi_notify(struct net_device *dev,
 			rssi_level))
 		goto nla_put_failure;
 
-	cfg80211_send_cqm(msg, gfp);
+	cfg80211_send_cqm(msg, GFP_KERNEL);
 
-	return;
+	goto unlock;
 
  nla_put_failure:
 	nlmsg_free(msg);
+ unlock:
+	wdev_unlock(wdev);
 }
-EXPORT_SYMBOL(cfg80211_cqm_rssi_notify);
 
 void cfg80211_cqm_txe_notify(struct net_device *dev, const u8 *peer,
 			     u32 num_packets,
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 0cf1ce7b6934..939deecf0bbe 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -908,6 +908,10 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev)
 		    !cfg80211_find_ssid_match(ap, request))
 			continue;
 
+		if (!is_broadcast_ether_addr(request->bssid) &&
+		    !ether_addr_equal(request->bssid, ap->bssid))
+			continue;
+
 		if (!request->n_ssids && ap->multi_bss && !ap->transmitted_bssid)
 			continue;
 
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
index f8905400ee07..d2c264030017 100644
--- a/net/xdp/xsk_queue.c
+++ b/net/xdp/xsk_queue.c
@@ -34,6 +34,16 @@ struct xsk_queue *xskq_create(u32 nentries, bool umem_queue)
 	q->ring_mask = nentries - 1;
 
 	size = xskq_get_ring_size(q, umem_queue);
+
+	/* size which is overflowing or close to SIZE_MAX will become 0 in
+	 * PAGE_ALIGN(), checking SIZE_MAX is enough due to the previous
+	 * is_power_of_2(), the rest will be handled by vmalloc_user()
+	 */
+	if (unlikely(size == SIZE_MAX)) {
+		kfree(q);
+		return NULL;
+	}
+
 	size = PAGE_ALIGN(size);
 
 	q->ring = vmalloc_user(size);
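The xskq_create() guard above works because PAGE_ALIGN(x) is (x + PAGE_SIZE - 1) & PAGE_MASK: when x is SIZE_MAX (the value the kernel's saturating size helpers return on overflow), the addition wraps and the result is 0, so the subsequent vmalloc_user() would allocate nothing while ring_mask still promises nentries descriptors. A stand-alone arithmetic demo, re-defining the two macros in user space and assuming 4 KiB pages:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))
/* same shape as the kernel's PAGE_ALIGN() */
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
	size_t ok = 10000;
	size_t sat = SIZE_MAX;	/* what an overflowed ring size saturates to */

	/* a sane size rounds up to the next page boundary: 12288 */
	printf("PAGE_ALIGN(%zu) = %zu\n", ok, PAGE_ALIGN(ok));
	/* SIZE_MAX + PAGE_SIZE - 1 wraps around, so the result is 0 */
	printf("PAGE_ALIGN(SIZE_MAX) = %zu\n", PAGE_ALIGN(sat));
	return 0;
}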
diff --git a/net/xfrm/xfrm_sysctl.c b/net/xfrm/xfrm_sysctl.c
index 0c6c5ef65f9d..7fdeafc838a7 100644
--- a/net/xfrm/xfrm_sysctl.c
+++ b/net/xfrm/xfrm_sysctl.c
@@ -44,6 +44,7 @@ static struct ctl_table xfrm_table[] = {
 int __net_init xfrm_sysctl_init(struct net *net)
 {
 	struct ctl_table *table;
+	size_t table_size = ARRAY_SIZE(xfrm_table);
 
 	__xfrm_sysctl_init(net);
 
@@ -56,10 +57,13 @@ int __net_init xfrm_sysctl_init(struct net *net)
 	table[3].data = &net->xfrm.sysctl_acq_expires;
 
 	/* Don't export sysctls to unprivileged users */
-	if (net->user_ns != &init_user_ns)
+	if (net->user_ns != &init_user_ns) {
 		table[0].procname = NULL;
+		table_size = 0;
+	}
 
-	net->xfrm.sysctl_hdr = register_net_sysctl(net, "net/core", table);
+	net->xfrm.sysctl_hdr = register_net_sysctl_sz(net, "net/core", table,
+						      table_size);
 	if (!net->xfrm.sysctl_hdr)
 		goto out_register;
 	return 0;
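Every register_net_sysctl() to register_net_sysctl_sz() conversion in this diff (ax25, unix, xfrm) has the same shape: the caller states the table length explicitly instead of leaving the registration code to count entries up to the empty sentinel. A schematic of the call pattern with an invented table and path ("net/demo"); the real converters also kmemdup() the table per namespace, which this sketch omits:

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/sysctl.h>
#include <linux/user_namespace.h>
#include <net/net_namespace.h>

static int demo_value;

static struct ctl_table demo_table[] = {
	{
		.procname	= "demo_value",
		.data		= &demo_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }	/* sentinel, still present during the migration */
};

static struct ctl_table_header *demo_hdr;

static int __net_init demo_sysctl_register(struct net *net)
{
	size_t table_size = ARRAY_SIZE(demo_table);

	/* hide the table from unprivileged namespaces, as the xfrm
	 * hunk above does by forcing the visible size to zero
	 */
	if (net->user_ns != &init_user_ns)
		table_size = 0;

	demo_hdr = register_net_sysctl_sz(net, "net/demo", demo_table,
					  table_size);
	return demo_hdr ? 0 : -ENOMEM;
}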