diff options
Diffstat (limited to 'net')
626 files changed, 20666 insertions, 15849 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 8dfdd94e430f..bad01b14a4ad 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -111,12 +111,7 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head) vlan_gvrp_uninit_applicant(real_dev); } - /* Take it out of our own structures, but be sure to interlock with - * HW accelerating devices or SW vlan input packet processing if - * VLAN is not 0 (leave it there for 802.1p). - */ - if (vlan_id) - vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); + vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); /* Get rid of the vlan's reference to real_dev */ dev_put(real_dev); diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c index 5f1446c9f098..a662ccc166df 100644 --- a/net/8021q/vlanproc.c +++ b/net/8021q/vlanproc.c @@ -80,7 +80,6 @@ static int vlan_seq_open(struct inode *inode, struct file *file) } static const struct file_operations vlan_fops = { - .owner = THIS_MODULE, .open = vlan_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -97,7 +96,6 @@ static int vlandev_seq_open(struct inode *inode, struct file *file) } static const struct file_operations vlandev_fops = { - .owner = THIS_MODULE, .open = vlandev_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 985046ae4231..d6f7f7cb79c4 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -228,32 +228,31 @@ static void p9_conn_cancel(struct p9_conn *m, int err) } } -static int -p9_fd_poll(struct p9_client *client, struct poll_table_struct *pt) +static __poll_t +p9_fd_poll(struct p9_client *client, struct poll_table_struct *pt, int *err) { - int ret, n; + __poll_t ret, n; struct p9_trans_fd *ts = NULL; if (client && client->status == Connected) ts = client->trans; - if (!ts) - return -EREMOTEIO; + if (!ts) { + if (err) + *err = -EREMOTEIO; + return POLLERR; + } if (!ts->rd->f_op->poll) - return -EIO; - - if (!ts->wr->f_op->poll) - return -EIO; - - ret = ts->rd->f_op->poll(ts->rd, pt); - if (ret < 0) - return ret; + ret = DEFAULT_POLLMASK; + else + ret = ts->rd->f_op->poll(ts->rd, pt); if (ts->rd != ts->wr) { - n = ts->wr->f_op->poll(ts->wr, pt); - if (n < 0) - return n; + if (!ts->wr->f_op->poll) + n = DEFAULT_POLLMASK; + else + n = ts->wr->f_op->poll(ts->wr, pt); ret = (ret & ~POLLOUT) | (n & ~POLLIN); } @@ -298,7 +297,8 @@ static int p9_fd_read(struct p9_client *client, void *v, int len) static void p9_read_work(struct work_struct *work) { - int n, err; + __poll_t n; + int err; struct p9_conn *m; int status = REQ_STATUS_ERROR; @@ -398,7 +398,7 @@ end_clear: if (test_and_clear_bit(Rpending, &m->wsched)) n = POLLIN; else - n = p9_fd_poll(m->client, NULL); + n = p9_fd_poll(m->client, NULL, NULL); if ((n & POLLIN) && !test_and_set_bit(Rworksched, &m->wsched)) { p9_debug(P9_DEBUG_TRANS, "sched read work %p\n", m); @@ -448,7 +448,8 @@ static int p9_fd_write(struct p9_client *client, void *v, int len) static void p9_write_work(struct work_struct *work) { - int n, err; + __poll_t n; + int err; struct p9_conn *m; struct p9_req_t *req; @@ -506,7 +507,7 @@ end_clear: if (test_and_clear_bit(Wpending, &m->wsched)) n = POLLOUT; else - n = p9_fd_poll(m->client, NULL); + n = p9_fd_poll(m->client, NULL, NULL); if ((n & POLLOUT) && !test_and_set_bit(Wworksched, &m->wsched)) { @@ -581,7 +582,7 @@ p9_pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) static void p9_conn_create(struct p9_client *client) { - int n; + __poll_t n; struct p9_trans_fd *ts = client->trans; struct p9_conn *m = &ts->conn; @@ -597,7 +598,7 @@ static void p9_conn_create(struct p9_client *client) INIT_LIST_HEAD(&m->poll_pending_link); init_poll_funcptr(&m->pt, p9_pollwait); - n = p9_fd_poll(client, &m->pt); + n = p9_fd_poll(client, &m->pt, NULL); if (n & POLLIN) { p9_debug(P9_DEBUG_TRANS, "mux %p can read\n", m); set_bit(Rpending, &m->wsched); @@ -617,17 +618,16 @@ static void p9_conn_create(struct p9_client *client) static void p9_poll_mux(struct p9_conn *m) { - int n; + __poll_t n; + int err = -ECONNRESET; if (m->err < 0) return; - n = p9_fd_poll(m->client, NULL); - if (n < 0 || n & (POLLERR | POLLHUP | POLLNVAL)) { + n = p9_fd_poll(m->client, NULL, &err); + if (n & (POLLERR | POLLHUP | POLLNVAL)) { p9_debug(P9_DEBUG_TRANS, "error mux %p err %d\n", m, n); - if (n >= 0) - n = -ECONNRESET; - p9_conn_cancel(m, n); + p9_conn_cancel(m, err); } if (n & POLLIN) { @@ -663,7 +663,7 @@ static void p9_poll_mux(struct p9_conn *m) static int p9_fd_request(struct p9_client *client, struct p9_req_t *req) { - int n; + __poll_t n; struct p9_trans_fd *ts = client->trans; struct p9_conn *m = &ts->conn; @@ -680,7 +680,7 @@ static int p9_fd_request(struct p9_client *client, struct p9_req_t *req) if (test_and_clear_bit(Wpending, &m->wsched)) n = POLLOUT; else - n = p9_fd_poll(m->client, NULL); + n = p9_fd_poll(m->client, NULL, NULL); if (n & POLLOUT && !test_and_set_bit(Wworksched, &m->wsched)) schedule_work(&m->wq); @@ -839,7 +839,6 @@ static int p9_socket_open(struct p9_client *client, struct socket *csocket) if (IS_ERR(file)) { pr_err("%s (%d): failed to map fd\n", __func__, task_pid_nr(current)); - sock_release(csocket); kfree(p); return PTR_ERR(file); } diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index 325c56043007..086a4abdfa7c 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -543,3 +543,7 @@ static void p9_trans_xen_exit(void) return xenbus_unregister_driver(&xen_9pfs_front_driver); } module_exit(p9_trans_xen_exit); + +MODULE_AUTHOR("Stefano Stabellini <stefano@aporeto.com>"); +MODULE_DESCRIPTION("Xen Transport for 9P"); +MODULE_LICENSE("GPL"); diff --git a/net/Kconfig b/net/Kconfig index 9dba2715919d..0428f12c25c2 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -182,6 +182,7 @@ config BRIDGE_NETFILTER depends on BRIDGE depends on NETFILTER && INET depends on NETFILTER_ADVANCED + select NETFILTER_FAMILY_BRIDGE default m ---help--- Enabling this option will let arptables resp. iptables see bridged @@ -212,7 +213,6 @@ source "net/dsa/Kconfig" source "net/8021q/Kconfig" source "net/decnet/Kconfig" source "net/llc/Kconfig" -source "net/ipx/Kconfig" source "drivers/net/appletalk/Kconfig" source "net/x25/Kconfig" source "net/lapb/Kconfig" @@ -336,23 +336,6 @@ config NET_PKTGEN To compile this code as a module, choose M here: the module will be called pktgen. -config NET_TCPPROBE - tristate "TCP connection probing" - depends on INET && PROC_FS && KPROBES - ---help--- - This module allows for capturing the changes to TCP connection - state in response to incoming packets. It is used for debugging - TCP congestion avoidance modules. If you don't understand - what was just said, you don't need it: say N. - - Documentation on how to use TCP connection probing can be found - at: - - http://www.linuxfoundation.org/collaborate/workgroups/networking/tcpprobe - - To compile this code as a module, choose M here: the - module will be called tcp_probe. - config NET_DROP_MONITOR tristate "Network packet drop alerting service" depends on INET && TRACEPOINTS diff --git a/net/Makefile b/net/Makefile index 14fede520840..a6147c61b174 100644 --- a/net/Makefile +++ b/net/Makefile @@ -24,7 +24,6 @@ obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ obj-$(CONFIG_BRIDGE) += bridge/ obj-$(CONFIG_NET_DSA) += dsa/ -obj-$(CONFIG_IPX) += ipx/ obj-$(CONFIG_ATALK) += appletalk/ obj-$(CONFIG_X25) += x25/ obj-$(CONFIG_LAPB) += lapb/ diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c index 309d7dbb36e8..d4c1021e74e1 100644 --- a/net/appletalk/aarp.c +++ b/net/appletalk/aarp.c @@ -1047,7 +1047,6 @@ static int aarp_seq_open(struct inode *inode, struct file *file) } const struct file_operations atalk_seq_arp_fops = { - .owner = THIS_MODULE, .open = aarp_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/appletalk/atalk_proc.c b/net/appletalk/atalk_proc.c index af46bc49e1e9..a3bf9d519193 100644 --- a/net/appletalk/atalk_proc.c +++ b/net/appletalk/atalk_proc.c @@ -226,7 +226,6 @@ static int atalk_seq_socket_open(struct inode *inode, struct file *file) } static const struct file_operations atalk_seq_interface_fops = { - .owner = THIS_MODULE, .open = atalk_seq_interface_open, .read = seq_read, .llseek = seq_lseek, @@ -234,7 +233,6 @@ static const struct file_operations atalk_seq_interface_fops = { }; static const struct file_operations atalk_seq_route_fops = { - .owner = THIS_MODULE, .open = atalk_seq_route_open, .read = seq_read, .llseek = seq_lseek, @@ -242,7 +240,6 @@ static const struct file_operations atalk_seq_route_fops = { }; static const struct file_operations atalk_seq_socket_fops = { - .owner = THIS_MODULE, .open = atalk_seq_socket_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/atm/br2684.c b/net/atm/br2684.c index 4e111196f902..fd94bea36ee8 100644 --- a/net/atm/br2684.c +++ b/net/atm/br2684.c @@ -824,7 +824,6 @@ static int br2684_proc_open(struct inode *inode, struct file *file) } static const struct file_operations br2684_proc_ops = { - .owner = THIS_MODULE, .open = br2684_proc_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/atm/common.c b/net/atm/common.c index 8a4f99114cd2..6523f38c4957 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -14,7 +14,7 @@ #include <linux/capability.h> #include <linux/mm.h> #include <linux/sched/signal.h> -#include <linux/time.h> /* struct timeval */ +#include <linux/time64.h> /* 64-bit time for seconds */ #include <linux/skbuff.h> #include <linux/bitops.h> #include <linux/init.h> @@ -648,11 +648,11 @@ out: return error; } -unsigned int vcc_poll(struct file *file, struct socket *sock, poll_table *wait) +__poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct atm_vcc *vcc; - unsigned int mask; + __poll_t mask; sock_poll_wait(file, sk_sleep(sk), wait); mask = 0; diff --git a/net/atm/common.h b/net/atm/common.h index d9d583712a91..5850649068bb 100644 --- a/net/atm/common.h +++ b/net/atm/common.h @@ -17,7 +17,7 @@ int vcc_connect(struct socket *sock, int itf, short vpi, int vci); int vcc_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len); -unsigned int vcc_poll(struct file *file, struct socket *sock, poll_table *wait); +__poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait); int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int vcc_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int vcc_setsockopt(struct socket *sock, int level, int optname, diff --git a/net/atm/lec.c b/net/atm/lec.c index 6676e3433261..09a1f056712a 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -992,7 +992,6 @@ static int lec_seq_open(struct inode *inode, struct file *file) } static const struct file_operations lec_seq_fops = { - .owner = THIS_MODULE, .open = lec_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/atm/mpc.c b/net/atm/mpc.c index 7c6a1cc760a2..31e0dcb970f8 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -1089,7 +1089,7 @@ static void MPOA_trigger_rcvd(struct k_message *msg, struct mpoa_client *mpc) msg->type = SND_MPOA_RES_RQST; msg->content.in_info = entry->ctrl_info; msg_to_mpoad(msg, mpc); - do_gettimeofday(&(entry->reply_wait)); + entry->reply_wait = ktime_get_seconds(); mpc->in_ops->put(entry); return; } @@ -1099,7 +1099,7 @@ static void MPOA_trigger_rcvd(struct k_message *msg, struct mpoa_client *mpc) msg->type = SND_MPOA_RES_RQST; msg->content.in_info = entry->ctrl_info; msg_to_mpoad(msg, mpc); - do_gettimeofday(&(entry->reply_wait)); + entry->reply_wait = ktime_get_seconds(); mpc->in_ops->put(entry); return; } @@ -1175,8 +1175,9 @@ static void MPOA_res_reply_rcvd(struct k_message *msg, struct mpoa_client *mpc) } entry->ctrl_info = msg->content.in_info; - do_gettimeofday(&(entry->tv)); - do_gettimeofday(&(entry->reply_wait)); /* Used in refreshing func from now on */ + entry->time = ktime_get_seconds(); + /* Used in refreshing func from now on */ + entry->reply_wait = ktime_get_seconds(); entry->refresh_time = 0; ddprintk_cont("entry->shortcut = %p\n", entry->shortcut); diff --git a/net/atm/mpoa_caches.c b/net/atm/mpoa_caches.c index e01450bb32d6..4bb418313720 100644 --- a/net/atm/mpoa_caches.c +++ b/net/atm/mpoa_caches.c @@ -117,7 +117,7 @@ static in_cache_entry *in_cache_add_entry(__be32 dst_ip, memcpy(entry->MPS_ctrl_ATM_addr, client->mps_ctrl_addr, ATM_ESA_LEN); entry->ctrl_info.in_dst_ip = dst_ip; - do_gettimeofday(&(entry->tv)); + entry->time = ktime_get_seconds(); entry->retry_time = client->parameters.mpc_p4; entry->count = 1; entry->entry_state = INGRESS_INVALID; @@ -148,7 +148,7 @@ static int cache_hit(in_cache_entry *entry, struct mpoa_client *mpc) if (qos != NULL) msg.qos = qos->qos; msg_to_mpoad(&msg, mpc); - do_gettimeofday(&(entry->reply_wait)); + entry->reply_wait = ktime_get_seconds(); entry->entry_state = INGRESS_RESOLVING; } if (entry->shortcut != NULL) @@ -171,7 +171,7 @@ static int cache_hit(in_cache_entry *entry, struct mpoa_client *mpc) if (qos != NULL) msg.qos = qos->qos; msg_to_mpoad(&msg, mpc); - do_gettimeofday(&(entry->reply_wait)); + entry->reply_wait = ktime_get_seconds(); } return CLOSED; @@ -227,17 +227,16 @@ static void in_cache_remove_entry(in_cache_entry *entry, static void clear_count_and_expired(struct mpoa_client *client) { in_cache_entry *entry, *next_entry; - struct timeval now; + time64_t now; - do_gettimeofday(&now); + now = ktime_get_seconds(); write_lock_bh(&client->ingress_lock); entry = client->in_cache; while (entry != NULL) { entry->count = 0; next_entry = entry->next; - if ((now.tv_sec - entry->tv.tv_sec) - > entry->ctrl_info.holding_time) { + if ((now - entry->time) > entry->ctrl_info.holding_time) { dprintk("holding time expired, ip = %pI4\n", &entry->ctrl_info.in_dst_ip); client->in_ops->remove_entry(entry, client); @@ -253,35 +252,35 @@ static void check_resolving_entries(struct mpoa_client *client) struct atm_mpoa_qos *qos; in_cache_entry *entry; - struct timeval now; + time64_t now; struct k_message msg; - do_gettimeofday(&now); + now = ktime_get_seconds(); read_lock_bh(&client->ingress_lock); entry = client->in_cache; while (entry != NULL) { if (entry->entry_state == INGRESS_RESOLVING) { - if ((now.tv_sec - entry->hold_down.tv_sec) < - client->parameters.mpc_p6) { + + if ((now - entry->hold_down) + < client->parameters.mpc_p6) { entry = entry->next; /* Entry in hold down */ continue; } - if ((now.tv_sec - entry->reply_wait.tv_sec) > - entry->retry_time) { + if ((now - entry->reply_wait) > entry->retry_time) { entry->retry_time = MPC_C1 * (entry->retry_time); /* * Retry time maximum exceeded, * put entry in hold down. */ if (entry->retry_time > client->parameters.mpc_p5) { - do_gettimeofday(&(entry->hold_down)); + entry->hold_down = ktime_get_seconds(); entry->retry_time = client->parameters.mpc_p4; entry = entry->next; continue; } /* Ask daemon to send a resolution request. */ - memset(&(entry->hold_down), 0, sizeof(struct timeval)); + memset(&entry->hold_down, 0, sizeof(time64_t)); msg.type = SND_MPOA_RES_RTRY; memcpy(msg.MPS_ctrl, client->mps_ctrl_addr, ATM_ESA_LEN); msg.content.in_info = entry->ctrl_info; @@ -289,7 +288,7 @@ static void check_resolving_entries(struct mpoa_client *client) if (qos != NULL) msg.qos = qos->qos; msg_to_mpoad(&msg, client); - do_gettimeofday(&(entry->reply_wait)); + entry->reply_wait = ktime_get_seconds(); } } entry = entry->next; @@ -300,18 +299,18 @@ static void check_resolving_entries(struct mpoa_client *client) /* Call this every MPC-p5 seconds. */ static void refresh_entries(struct mpoa_client *client) { - struct timeval now; + time64_t now; struct in_cache_entry *entry = client->in_cache; ddprintk("refresh_entries\n"); - do_gettimeofday(&now); + now = ktime_get_seconds(); read_lock_bh(&client->ingress_lock); while (entry != NULL) { if (entry->entry_state == INGRESS_RESOLVED) { if (!(entry->refresh_time)) entry->refresh_time = (2 * (entry->ctrl_info.holding_time))/3; - if ((now.tv_sec - entry->reply_wait.tv_sec) > + if ((now - entry->reply_wait) > entry->refresh_time) { dprintk("refreshing an entry.\n"); entry->entry_state = INGRESS_REFRESHING; @@ -480,7 +479,7 @@ static eg_cache_entry *eg_cache_add_entry(struct k_message *msg, memcpy(entry->MPS_ctrl_ATM_addr, client->mps_ctrl_addr, ATM_ESA_LEN); entry->ctrl_info = msg->content.eg_info; - do_gettimeofday(&(entry->tv)); + entry->time = ktime_get_seconds(); entry->entry_state = EGRESS_RESOLVED; dprintk("new_eg_cache_entry cache_id %u\n", ntohl(entry->ctrl_info.cache_id)); @@ -495,7 +494,7 @@ static eg_cache_entry *eg_cache_add_entry(struct k_message *msg, static void update_eg_cache_entry(eg_cache_entry *entry, uint16_t holding_time) { - do_gettimeofday(&(entry->tv)); + entry->time = ktime_get_seconds(); entry->entry_state = EGRESS_RESOLVED; entry->ctrl_info.holding_time = holding_time; } @@ -503,17 +502,16 @@ static void update_eg_cache_entry(eg_cache_entry *entry, uint16_t holding_time) static void clear_expired(struct mpoa_client *client) { eg_cache_entry *entry, *next_entry; - struct timeval now; + time64_t now; struct k_message msg; - do_gettimeofday(&now); + now = ktime_get_seconds(); write_lock_irq(&client->egress_lock); entry = client->eg_cache; while (entry != NULL) { next_entry = entry->next; - if ((now.tv_sec - entry->tv.tv_sec) - > entry->ctrl_info.holding_time) { + if ((now - entry->time) > entry->ctrl_info.holding_time) { msg.type = SND_EGRESS_PURGE; msg.content.eg_info = entry->ctrl_info; dprintk("egress_cache: holding time expired, cache_id = %u.\n", diff --git a/net/atm/mpoa_caches.h b/net/atm/mpoa_caches.h index 6a266669ebf4..464c4c7f8d1f 100644 --- a/net/atm/mpoa_caches.h +++ b/net/atm/mpoa_caches.h @@ -2,6 +2,7 @@ #ifndef MPOA_CACHES_H #define MPOA_CACHES_H +#include <linux/time64.h> #include <linux/netdevice.h> #include <linux/types.h> #include <linux/atm.h> @@ -16,9 +17,9 @@ void atm_mpoa_init_cache(struct mpoa_client *mpc); typedef struct in_cache_entry { struct in_cache_entry *next; struct in_cache_entry *prev; - struct timeval tv; - struct timeval reply_wait; - struct timeval hold_down; + time64_t time; + time64_t reply_wait; + time64_t hold_down; uint32_t packets_fwded; uint16_t entry_state; uint32_t retry_time; @@ -53,7 +54,7 @@ struct in_cache_ops{ typedef struct eg_cache_entry{ struct eg_cache_entry *next; struct eg_cache_entry *prev; - struct timeval tv; + time64_t time; uint8_t MPS_ctrl_ATM_addr[ATM_ESA_LEN]; struct atm_vcc *shortcut; uint32_t packets_rcvd; diff --git a/net/atm/mpoa_proc.c b/net/atm/mpoa_proc.c index 8a0c17e1c203..b93cc0f18292 100644 --- a/net/atm/mpoa_proc.c +++ b/net/atm/mpoa_proc.c @@ -8,7 +8,7 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/proc_fs.h> -#include <linux/time.h> +#include <linux/ktime.h> #include <linux/seq_file.h> #include <linux/uaccess.h> #include <linux/atmmpc.h> @@ -57,7 +57,6 @@ static int parse_qos(const char *buff); * Define allowed FILE OPERATIONS */ static const struct file_operations mpc_file_operations = { - .owner = THIS_MODULE, .open = proc_mpc_open, .read = seq_read, .llseek = seq_lseek, @@ -138,7 +137,7 @@ static int mpc_show(struct seq_file *m, void *v) int i; in_cache_entry *in_entry; eg_cache_entry *eg_entry; - struct timeval now; + time64_t now; unsigned char ip_string[16]; if (v == SEQ_START_TOKEN) { @@ -148,15 +147,17 @@ static int mpc_show(struct seq_file *m, void *v) seq_printf(m, "\nInterface %d:\n\n", mpc->dev_num); seq_printf(m, "Ingress Entries:\nIP address State Holding time Packets fwded VPI VCI\n"); - do_gettimeofday(&now); + now = ktime_get_seconds(); for (in_entry = mpc->in_cache; in_entry; in_entry = in_entry->next) { + unsigned long seconds_delta = now - in_entry->time; + sprintf(ip_string, "%pI4", &in_entry->ctrl_info.in_dst_ip); seq_printf(m, "%-16s%s%-14lu%-12u", ip_string, ingress_state_string(in_entry->entry_state), in_entry->ctrl_info.holding_time - - (now.tv_sec-in_entry->tv.tv_sec), + seconds_delta, in_entry->packets_fwded); if (in_entry->shortcut) seq_printf(m, " %-3d %-3d", @@ -169,13 +170,14 @@ static int mpc_show(struct seq_file *m, void *v) seq_printf(m, "Egress Entries:\nIngress MPC ATM addr\nCache-id State Holding time Packets recvd Latest IP addr VPI VCI\n"); for (eg_entry = mpc->eg_cache; eg_entry; eg_entry = eg_entry->next) { unsigned char *p = eg_entry->ctrl_info.in_MPC_data_ATM_addr; + unsigned long seconds_delta = now - eg_entry->time; + for (i = 0; i < ATM_ESA_LEN; i++) seq_printf(m, "%02x", p[i]); seq_printf(m, "\n%-16lu%s%-14lu%-15u", (unsigned long)ntohl(eg_entry->ctrl_info.cache_id), egress_state_string(eg_entry->entry_state), - (eg_entry->ctrl_info.holding_time - - (now.tv_sec-eg_entry->tv.tv_sec)), + (eg_entry->ctrl_info.holding_time - seconds_delta), eg_entry->packets_rcvd); /* latest IP address */ diff --git a/net/atm/proc.c b/net/atm/proc.c index 642f9272ab95..edc48edc95c1 100644 --- a/net/atm/proc.c +++ b/net/atm/proc.c @@ -37,7 +37,6 @@ static ssize_t proc_dev_atm_read(struct file *file, char __user *buf, size_t count, loff_t *pos); static const struct file_operations proc_atm_dev_ops = { - .owner = THIS_MODULE, .read = proc_dev_atm_read, .llseek = noop_llseek, }; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 06eac1f50c5e..47fdd399626b 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1931,7 +1931,6 @@ static int ax25_info_open(struct inode *inode, struct file *file) } static const struct file_operations ax25_info_fops = { - .owner = THIS_MODULE, .open = ax25_info_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index 0446b892618a..525558972fd9 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -336,7 +336,6 @@ static int ax25_rt_info_open(struct inode *inode, struct file *file) } const struct file_operations ax25_route_fops = { - .owner = THIS_MODULE, .open = ax25_rt_info_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ax25/ax25_uid.c b/net/ax25/ax25_uid.c index 83b035f56202..4ebe91ba317a 100644 --- a/net/ax25/ax25_uid.c +++ b/net/ax25/ax25_uid.c @@ -194,7 +194,6 @@ static int ax25_uid_info_open(struct inode *inode, struct file *file) } const struct file_operations ax25_uid_fops = { - .owner = THIS_MODULE, .open = ax25_uid_info_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index b73b96a2854b..c44f6515be5e 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -1,3 +1,20 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +# +# Marek Lindner, Simon Wunderlich +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of version 2 of the GNU General Public +# License as published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see <http://www.gnu.org/licenses/>. + # # B.A.T.M.A.N meshing protocol # diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index 915987bc6d29..022f6e77307b 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -1,4 +1,4 @@ -# +# SPDX-License-Identifier: GPL-2.0 # Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: # # Marek Lindner, Simon Wunderlich diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c index 44fd073b7546..80c72c7d3cad 100644 --- a/net/batman-adv/bat_algo.c +++ b/net/batman-adv/bat_algo.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich @@ -37,7 +38,8 @@ char batadv_routing_algo[20] = "BATMAN_IV"; static struct hlist_head batadv_algo_list; /** - * batadv_algo_init - Initialize batman-adv algorithm management data structures + * batadv_algo_init() - Initialize batman-adv algorithm management data + * structures */ void batadv_algo_init(void) { @@ -59,6 +61,12 @@ static struct batadv_algo_ops *batadv_algo_get(char *name) return bat_algo_ops; } +/** + * batadv_algo_register() - Register callbacks for a mesh algorithm + * @bat_algo_ops: mesh algorithm callbacks to add + * + * Return: 0 on success or negative error number in case of failure + */ int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops) { struct batadv_algo_ops *bat_algo_ops_tmp; @@ -88,6 +96,19 @@ int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops) return 0; } +/** + * batadv_algo_select() - Select algorithm of soft interface + * @bat_priv: the bat priv with all the soft interface information + * @name: name of the algorithm to select + * + * The algorithm callbacks for the soft interface will be set when the algorithm + * with the correct name was found. Any previous selected algorithm will not be + * deinitialized and the new selected algorithm will also not be initialized. + * It is therefore not allowed to call batadv_algo_select outside the creation + * function of the soft interface. + * + * Return: 0 on success or negative error number in case of failure + */ int batadv_algo_select(struct batadv_priv *bat_priv, char *name) { struct batadv_algo_ops *bat_algo_ops; @@ -102,6 +123,14 @@ int batadv_algo_select(struct batadv_priv *bat_priv, char *name) } #ifdef CONFIG_BATMAN_ADV_DEBUGFS + +/** + * batadv_algo_seq_print_text() - Print the supported algorithms in a seq file + * @seq: seq file to print on + * @offset: not used + * + * Return: always 0 + */ int batadv_algo_seq_print_text(struct seq_file *seq, void *offset) { struct batadv_algo_ops *bat_algo_ops; @@ -148,7 +177,7 @@ module_param_cb(routing_algo, &batadv_param_ops_ra, &batadv_param_string_ra, 0644); /** - * batadv_algo_dump_entry - fill in information about one supported routing + * batadv_algo_dump_entry() - fill in information about one supported routing * algorithm * @msg: netlink message to be sent back * @portid: Port to reply to @@ -179,7 +208,7 @@ static int batadv_algo_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, } /** - * batadv_algo_dump - fill in information about supported routing + * batadv_algo_dump() - fill in information about supported routing * algorithms * @msg: netlink message to be sent back * @cb: Parameters to the netlink request diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index 29f6312f9bf1..029221615ba3 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Linus Lüssing diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 1b659ab652fb..79e326383726 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich @@ -26,7 +27,7 @@ #include <linux/cache.h> #include <linux/errno.h> #include <linux/etherdevice.h> -#include <linux/fs.h> +#include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/init.h> #include <linux/jiffies.h> @@ -51,6 +52,7 @@ #include <linux/workqueue.h> #include <net/genetlink.h> #include <net/netlink.h> +#include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" @@ -62,7 +64,6 @@ #include "netlink.h" #include "network-coding.h" #include "originator.h" -#include "packet.h" #include "routing.h" #include "send.h" #include "translation-table.h" @@ -72,21 +73,28 @@ static void batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work); /** * enum batadv_dup_status - duplicate status - * @BATADV_NO_DUP: the packet is no duplicate - * @BATADV_ORIG_DUP: OGM is a duplicate in the originator (but not for the - * neighbor) - * @BATADV_NEIGH_DUP: OGM is a duplicate for the neighbor - * @BATADV_PROTECTED: originator is currently protected (after reboot) */ enum batadv_dup_status { + /** @BATADV_NO_DUP: the packet is no duplicate */ BATADV_NO_DUP = 0, + + /** + * @BATADV_ORIG_DUP: OGM is a duplicate in the originator (but not for + * the neighbor) + */ BATADV_ORIG_DUP, + + /** @BATADV_NEIGH_DUP: OGM is a duplicate for the neighbor */ BATADV_NEIGH_DUP, + + /** + * @BATADV_PROTECTED: originator is currently protected (after reboot) + */ BATADV_PROTECTED, }; /** - * batadv_ring_buffer_set - update the ring buffer with the given value + * batadv_ring_buffer_set() - update the ring buffer with the given value * @lq_recv: pointer to the ring buffer * @lq_index: index to store the value at * @value: value to store in the ring buffer @@ -98,7 +106,7 @@ static void batadv_ring_buffer_set(u8 lq_recv[], u8 *lq_index, u8 value) } /** - * batadv_ring_buffer_avg - compute the average of all non-zero values stored + * batadv_ring_buffer_avg() - compute the average of all non-zero values stored * in the given ring buffer * @lq_recv: pointer to the ring buffer * @@ -130,7 +138,7 @@ static u8 batadv_ring_buffer_avg(const u8 lq_recv[]) } /** - * batadv_iv_ogm_orig_free - free the private resources allocated for this + * batadv_iv_ogm_orig_free() - free the private resources allocated for this * orig_node * @orig_node: the orig_node for which the resources have to be free'd */ @@ -141,8 +149,8 @@ static void batadv_iv_ogm_orig_free(struct batadv_orig_node *orig_node) } /** - * batadv_iv_ogm_orig_add_if - change the private structures of the orig_node to - * include the new hard-interface + * batadv_iv_ogm_orig_add_if() - change the private structures of the orig_node + * to include the new hard-interface * @orig_node: the orig_node that has to be changed * @max_if_num: the current amount of interfaces * @@ -186,7 +194,7 @@ unlock: } /** - * batadv_iv_ogm_drop_bcast_own_entry - drop section of bcast_own + * batadv_iv_ogm_drop_bcast_own_entry() - drop section of bcast_own * @orig_node: the orig_node that has to be changed * @max_if_num: the current amount of interfaces * @del_if_num: the index of the interface being removed @@ -224,7 +232,7 @@ batadv_iv_ogm_drop_bcast_own_entry(struct batadv_orig_node *orig_node, } /** - * batadv_iv_ogm_drop_bcast_own_sum_entry - drop section of bcast_own_sum + * batadv_iv_ogm_drop_bcast_own_sum_entry() - drop section of bcast_own_sum * @orig_node: the orig_node that has to be changed * @max_if_num: the current amount of interfaces * @del_if_num: the index of the interface being removed @@ -259,8 +267,8 @@ batadv_iv_ogm_drop_bcast_own_sum_entry(struct batadv_orig_node *orig_node, } /** - * batadv_iv_ogm_orig_del_if - change the private structures of the orig_node to - * exclude the removed interface + * batadv_iv_ogm_orig_del_if() - change the private structures of the orig_node + * to exclude the removed interface * @orig_node: the orig_node that has to be changed * @max_if_num: the current amount of interfaces * @del_if_num: the index of the interface being removed @@ -290,7 +298,8 @@ static int batadv_iv_ogm_orig_del_if(struct batadv_orig_node *orig_node, } /** - * batadv_iv_ogm_orig_get - retrieve or create (if does not exist) an originator + * batadv_iv_ogm_orig_get() - retrieve or create (if does not exist) an + * originator * @bat_priv: the bat priv with all the soft interface information * @addr: mac address of the originator * @@ -447,7 +456,7 @@ static u8 batadv_hop_penalty(u8 tq, const struct batadv_priv *bat_priv) } /** - * batadv_iv_ogm_aggr_packet - checks if there is another OGM attached + * batadv_iv_ogm_aggr_packet() - checks if there is another OGM attached * @buff_pos: current position in the skb * @packet_len: total length of the skb * @tvlv_len: tvlv length of the previously considered OGM @@ -557,7 +566,7 @@ static void batadv_iv_ogm_emit(struct batadv_forw_packet *forw_packet) } /** - * batadv_iv_ogm_can_aggregate - find out if an OGM can be aggregated on an + * batadv_iv_ogm_can_aggregate() - find out if an OGM can be aggregated on an * existing forward packet * @new_bat_ogm_packet: OGM packet to be aggregated * @bat_priv: the bat priv with all the soft interface information @@ -660,7 +669,7 @@ out: } /** - * batadv_iv_ogm_aggregate_new - create a new aggregated packet and add this + * batadv_iv_ogm_aggregate_new() - create a new aggregated packet and add this * packet to it. * @packet_buff: pointer to the OGM * @packet_len: (total) length of the OGM @@ -743,7 +752,7 @@ static void batadv_iv_ogm_aggregate(struct batadv_forw_packet *forw_packet_aggr, } /** - * batadv_iv_ogm_queue_add - queue up an OGM for transmission + * batadv_iv_ogm_queue_add() - queue up an OGM for transmission * @bat_priv: the bat priv with all the soft interface information * @packet_buff: pointer to the OGM * @packet_len: (total) length of the OGM @@ -869,8 +878,8 @@ static void batadv_iv_ogm_forward(struct batadv_orig_node *orig_node, } /** - * batadv_iv_ogm_slide_own_bcast_window - bitshift own OGM broadcast windows for - * the given interface + * batadv_iv_ogm_slide_own_bcast_window() - bitshift own OGM broadcast windows + * for the given interface * @hard_iface: the interface for which the windows have to be shifted */ static void @@ -987,7 +996,7 @@ out: } /** - * batadv_iv_ogm_orig_update - use OGM to update corresponding data in an + * batadv_iv_ogm_orig_update() - use OGM to update corresponding data in an * originator * @bat_priv: the bat priv with all the soft interface information * @orig_node: the orig node who originally emitted the ogm packet @@ -1152,7 +1161,7 @@ out: } /** - * batadv_iv_ogm_calc_tq - calculate tq for current received ogm packet + * batadv_iv_ogm_calc_tq() - calculate tq for current received ogm packet * @orig_node: the orig node who originally emitted the ogm packet * @orig_neigh_node: the orig node struct of the neighbor who sent the packet * @batadv_ogm_packet: the ogm packet @@ -1214,7 +1223,7 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, orig_node->last_seen = jiffies; /* find packet count of corresponding one hop neighbor */ - spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); + spin_lock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock); if_num = if_incoming->if_num; orig_eq_count = orig_neigh_node->bat_iv.bcast_own_sum[if_num]; neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); @@ -1224,7 +1233,7 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, } else { neigh_rq_count = 0; } - spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); + spin_unlock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock); /* pay attention to not get a value bigger than 100 % */ if (orig_eq_count > neigh_rq_count) @@ -1298,7 +1307,7 @@ out: } /** - * batadv_iv_ogm_update_seqnos - process a batman packet for all interfaces, + * batadv_iv_ogm_update_seqnos() - process a batman packet for all interfaces, * adjust the sequence number and find out whether it is a duplicate * @ethhdr: ethernet header of the packet * @batadv_ogm_packet: OGM packet to be considered @@ -1401,7 +1410,8 @@ out: } /** - * batadv_iv_ogm_process_per_outif - process a batman iv OGM for an outgoing if + * batadv_iv_ogm_process_per_outif() - process a batman iv OGM for an outgoing + * interface * @skb: the skb containing the OGM * @ogm_offset: offset from skb->data to start of ogm header * @orig_node: the (cached) orig node for the originator of this OGM @@ -1608,7 +1618,7 @@ out: } /** - * batadv_iv_ogm_process - process an incoming batman iv OGM + * batadv_iv_ogm_process() - process an incoming batman iv OGM * @skb: the skb containing the OGM * @ogm_offset: offset to the OGM which should be processed (for aggregates) * @if_incoming: the interface where this packet was receved @@ -1861,7 +1871,7 @@ free_skb: #ifdef CONFIG_BATMAN_ADV_DEBUGFS /** - * batadv_iv_ogm_orig_print_neigh - print neighbors for the originator table + * batadv_iv_ogm_orig_print_neigh() - print neighbors for the originator table * @orig_node: the orig_node for which the neighbors are printed * @if_outgoing: outgoing interface for these entries * @seq: debugfs table seq_file struct @@ -1890,7 +1900,7 @@ batadv_iv_ogm_orig_print_neigh(struct batadv_orig_node *orig_node, } /** - * batadv_iv_ogm_orig_print - print the originator table + * batadv_iv_ogm_orig_print() - print the originator table * @bat_priv: the bat priv with all the soft interface information * @seq: debugfs table seq_file struct * @if_outgoing: the outgoing interface for which this should be printed @@ -1960,7 +1970,7 @@ next: #endif /** - * batadv_iv_ogm_neigh_get_tq_avg - Get the TQ average for a neighbour on a + * batadv_iv_ogm_neigh_get_tq_avg() - Get the TQ average for a neighbour on a * given outgoing interface. * @neigh_node: Neighbour of interest * @if_outgoing: Outgoing interface of interest @@ -1986,7 +1996,7 @@ batadv_iv_ogm_neigh_get_tq_avg(struct batadv_neigh_node *neigh_node, } /** - * batadv_iv_ogm_orig_dump_subentry - Dump an originator subentry into a + * batadv_iv_ogm_orig_dump_subentry() - Dump an originator subentry into a * message * @msg: Netlink message to dump into * @portid: Port making netlink request @@ -2048,7 +2058,7 @@ batadv_iv_ogm_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, } /** - * batadv_iv_ogm_orig_dump_entry - Dump an originator entry into a message + * batadv_iv_ogm_orig_dump_entry() - Dump an originator entry into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message @@ -2110,7 +2120,7 @@ batadv_iv_ogm_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, } /** - * batadv_iv_ogm_orig_dump_bucket - Dump an originator bucket into a + * batadv_iv_ogm_orig_dump_bucket() - Dump an originator bucket into a * message * @msg: Netlink message to dump into * @portid: Port making netlink request @@ -2153,7 +2163,7 @@ batadv_iv_ogm_orig_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, } /** - * batadv_iv_ogm_orig_dump - Dump the originators into a message + * batadv_iv_ogm_orig_dump() - Dump the originators into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information @@ -2190,7 +2200,7 @@ batadv_iv_ogm_orig_dump(struct sk_buff *msg, struct netlink_callback *cb, #ifdef CONFIG_BATMAN_ADV_DEBUGFS /** - * batadv_iv_hardif_neigh_print - print a single hop neighbour node + * batadv_iv_hardif_neigh_print() - print a single hop neighbour node * @seq: neighbour table seq_file struct * @hardif_neigh: hardif neighbour information */ @@ -2209,7 +2219,7 @@ batadv_iv_hardif_neigh_print(struct seq_file *seq, } /** - * batadv_iv_ogm_neigh_print - print the single hop neighbour list + * batadv_iv_ogm_neigh_print() - print the single hop neighbour list * @bat_priv: the bat priv with all the soft interface information * @seq: neighbour table seq_file struct */ @@ -2242,7 +2252,7 @@ static void batadv_iv_neigh_print(struct batadv_priv *bat_priv, #endif /** - * batadv_iv_ogm_neigh_diff - calculate tq difference of two neighbors + * batadv_iv_ogm_neigh_diff() - calculate tq difference of two neighbors * @neigh1: the first neighbor object of the comparison * @if_outgoing1: outgoing interface for the first neighbor * @neigh2: the second neighbor object of the comparison @@ -2287,7 +2297,7 @@ out: } /** - * batadv_iv_ogm_neigh_dump_neigh - Dump a neighbour into a netlink message + * batadv_iv_ogm_neigh_dump_neigh() - Dump a neighbour into a netlink message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message @@ -2326,7 +2336,7 @@ batadv_iv_ogm_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq, } /** - * batadv_iv_ogm_neigh_dump_hardif - Dump the neighbours of a hard interface + * batadv_iv_ogm_neigh_dump_hardif() - Dump the neighbours of a hard interface * into a message * @msg: Netlink message to dump into * @portid: Port making netlink request @@ -2365,7 +2375,7 @@ batadv_iv_ogm_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq, } /** - * batadv_iv_ogm_neigh_dump - Dump the neighbours into a message + * batadv_iv_ogm_neigh_dump() - Dump the neighbours into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information @@ -2417,7 +2427,7 @@ batadv_iv_ogm_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb, } /** - * batadv_iv_ogm_neigh_cmp - compare the metrics of two neighbors + * batadv_iv_ogm_neigh_cmp() - compare the metrics of two neighbors * @neigh1: the first neighbor object of the comparison * @if_outgoing1: outgoing interface for the first neighbor * @neigh2: the second neighbor object of the comparison @@ -2443,7 +2453,7 @@ static int batadv_iv_ogm_neigh_cmp(struct batadv_neigh_node *neigh1, } /** - * batadv_iv_ogm_neigh_is_sob - check if neigh1 is similarly good or better + * batadv_iv_ogm_neigh_is_sob() - check if neigh1 is similarly good or better * than neigh2 from the metric prospective * @neigh1: the first neighbor object of the comparison * @if_outgoing1: outgoing interface for the first neighbor @@ -2478,7 +2488,7 @@ static void batadv_iv_iface_activate(struct batadv_hard_iface *hard_iface) } /** - * batadv_iv_init_sel_class - initialize GW selection class + * batadv_iv_init_sel_class() - initialize GW selection class * @bat_priv: the bat priv with all the soft interface information */ static void batadv_iv_init_sel_class(struct batadv_priv *bat_priv) @@ -2703,7 +2713,7 @@ static void batadv_iv_gw_print(struct batadv_priv *bat_priv, #endif /** - * batadv_iv_gw_dump_entry - Dump a gateway into a message + * batadv_iv_gw_dump_entry() - Dump a gateway into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message @@ -2774,7 +2784,7 @@ out: } /** - * batadv_iv_gw_dump - Dump gateways into a message + * batadv_iv_gw_dump() - Dump gateways into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information @@ -2843,6 +2853,11 @@ static struct batadv_algo_ops batadv_batman_iv __read_mostly = { }, }; +/** + * batadv_iv_init() - B.A.T.M.A.N. IV initialization function + * + * Return: 0 on success or negative error number in case of failure + */ int __init batadv_iv_init(void) { int ret; diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h index ae2ab526bdb1..9dc0dd5c83df 100644 --- a/net/batman-adv/bat_iv_ogm.h +++ b/net/batman-adv/bat_iv_ogm.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 341ceab8338d..27e165ac9302 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner @@ -36,6 +37,7 @@ #include <linux/workqueue.h> #include <net/genetlink.h> #include <net/netlink.h> +#include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" @@ -48,7 +50,6 @@ #include "log.h" #include "netlink.h" #include "originator.h" -#include "packet.h" struct sk_buff; @@ -99,7 +100,7 @@ static void batadv_v_primary_iface_set(struct batadv_hard_iface *hard_iface) } /** - * batadv_v_iface_update_mac - react to hard-interface MAC address change + * batadv_v_iface_update_mac() - react to hard-interface MAC address change * @hard_iface: the modified interface * * If the modified interface is the primary one, update the originator @@ -130,7 +131,7 @@ batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh) #ifdef CONFIG_BATMAN_ADV_DEBUGFS /** - * batadv_v_orig_print_neigh - print neighbors for the originator table + * batadv_v_orig_print_neigh() - print neighbors for the originator table * @orig_node: the orig_node for which the neighbors are printed * @if_outgoing: outgoing interface for these entries * @seq: debugfs table seq_file struct @@ -160,7 +161,7 @@ batadv_v_orig_print_neigh(struct batadv_orig_node *orig_node, } /** - * batadv_v_hardif_neigh_print - print a single ELP neighbour node + * batadv_v_hardif_neigh_print() - print a single ELP neighbour node * @seq: neighbour table seq_file struct * @hardif_neigh: hardif neighbour information */ @@ -181,7 +182,7 @@ batadv_v_hardif_neigh_print(struct seq_file *seq, } /** - * batadv_v_neigh_print - print the single hop neighbour list + * batadv_v_neigh_print() - print the single hop neighbour list * @bat_priv: the bat priv with all the soft interface information * @seq: neighbour table seq_file struct */ @@ -215,7 +216,7 @@ static void batadv_v_neigh_print(struct batadv_priv *bat_priv, #endif /** - * batadv_v_neigh_dump_neigh - Dump a neighbour into a message + * batadv_v_neigh_dump_neigh() - Dump a neighbour into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message @@ -258,7 +259,7 @@ batadv_v_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq, } /** - * batadv_v_neigh_dump_hardif - Dump the neighbours of a hard interface into + * batadv_v_neigh_dump_hardif() - Dump the neighbours of a hard interface into * a message * @msg: Netlink message to dump into * @portid: Port making netlink request @@ -296,7 +297,7 @@ batadv_v_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq, } /** - * batadv_v_neigh_dump - Dump the neighbours of a hard interface into a + * batadv_v_neigh_dump() - Dump the neighbours of a hard interface into a * message * @msg: Netlink message to dump into * @cb: Control block containing additional options @@ -348,7 +349,7 @@ batadv_v_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb, #ifdef CONFIG_BATMAN_ADV_DEBUGFS /** - * batadv_v_orig_print - print the originator table + * batadv_v_orig_print() - print the originator table * @bat_priv: the bat priv with all the soft interface information * @seq: debugfs table seq_file struct * @if_outgoing: the outgoing interface for which this should be printed @@ -416,8 +417,7 @@ next: #endif /** - * batadv_v_orig_dump_subentry - Dump an originator subentry into a - * message + * batadv_v_orig_dump_subentry() - Dump an originator subentry into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message @@ -483,7 +483,7 @@ batadv_v_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, } /** - * batadv_v_orig_dump_entry - Dump an originator entry into a message + * batadv_v_orig_dump_entry() - Dump an originator entry into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message @@ -536,8 +536,7 @@ batadv_v_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, } /** - * batadv_v_orig_dump_bucket - Dump an originator bucket into a - * message + * batadv_v_orig_dump_bucket() - Dump an originator bucket into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message @@ -578,7 +577,7 @@ batadv_v_orig_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, } /** - * batadv_v_orig_dump - Dump the originators into a message + * batadv_v_orig_dump() - Dump the originators into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information @@ -668,7 +667,7 @@ err_ifinfo1: } /** - * batadv_v_init_sel_class - initialize GW selection class + * batadv_v_init_sel_class() - initialize GW selection class * @bat_priv: the bat priv with all the soft interface information */ static void batadv_v_init_sel_class(struct batadv_priv *bat_priv) @@ -704,7 +703,7 @@ static ssize_t batadv_v_show_sel_class(struct batadv_priv *bat_priv, char *buff) } /** - * batadv_v_gw_throughput_get - retrieve the GW-bandwidth for a given GW + * batadv_v_gw_throughput_get() - retrieve the GW-bandwidth for a given GW * @gw_node: the GW to retrieve the metric for * @bw: the pointer where the metric will be stored. The metric is computed as * the minimum between the GW advertised throughput and the path throughput to @@ -747,7 +746,7 @@ out: } /** - * batadv_v_gw_get_best_gw_node - retrieve the best GW node + * batadv_v_gw_get_best_gw_node() - retrieve the best GW node * @bat_priv: the bat priv with all the soft interface information * * Return: the GW node having the best GW-metric, NULL if no GW is known @@ -785,7 +784,7 @@ next: } /** - * batadv_v_gw_is_eligible - check if a originator would be selected as GW + * batadv_v_gw_is_eligible() - check if a originator would be selected as GW * @bat_priv: the bat priv with all the soft interface information * @curr_gw_orig: originator representing the currently selected GW * @orig_node: the originator representing the new candidate @@ -814,7 +813,7 @@ static bool batadv_v_gw_is_eligible(struct batadv_priv *bat_priv, } orig_gw = batadv_gw_node_get(bat_priv, orig_node); - if (!orig_node) + if (!orig_gw) goto out; if (batadv_v_gw_throughput_get(orig_gw, &orig_throughput) < 0) @@ -884,7 +883,7 @@ out: } /** - * batadv_v_gw_print - print the gateway list + * batadv_v_gw_print() - print the gateway list * @bat_priv: the bat priv with all the soft interface information * @seq: gateway table seq_file struct */ @@ -913,7 +912,7 @@ static void batadv_v_gw_print(struct batadv_priv *bat_priv, #endif /** - * batadv_v_gw_dump_entry - Dump a gateway into a message + * batadv_v_gw_dump_entry() - Dump a gateway into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message @@ -1004,7 +1003,7 @@ out: } /** - * batadv_v_gw_dump - Dump gateways into a message + * batadv_v_gw_dump() - Dump gateways into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information @@ -1074,7 +1073,7 @@ static struct batadv_algo_ops batadv_batman_v __read_mostly = { }; /** - * batadv_v_hardif_init - initialize the algorithm specific fields in the + * batadv_v_hardif_init() - initialize the algorithm specific fields in the * hard-interface object * @hard_iface: the hard-interface to initialize */ @@ -1088,7 +1087,7 @@ void batadv_v_hardif_init(struct batadv_hard_iface *hard_iface) } /** - * batadv_v_mesh_init - initialize the B.A.T.M.A.N. V private resources for a + * batadv_v_mesh_init() - initialize the B.A.T.M.A.N. V private resources for a * mesh * @bat_priv: the object representing the mesh interface to initialise * @@ -1106,7 +1105,7 @@ int batadv_v_mesh_init(struct batadv_priv *bat_priv) } /** - * batadv_v_mesh_free - free the B.A.T.M.A.N. V private resources for a mesh + * batadv_v_mesh_free() - free the B.A.T.M.A.N. V private resources for a mesh * @bat_priv: the object representing the mesh interface to free */ void batadv_v_mesh_free(struct batadv_priv *bat_priv) @@ -1115,7 +1114,7 @@ void batadv_v_mesh_free(struct batadv_priv *bat_priv) } /** - * batadv_v_init - B.A.T.M.A.N. V initialization function + * batadv_v_init() - B.A.T.M.A.N. V initialization function * * Description: Takes care of initializing all the subcomponents. * It is invoked upon module load only. diff --git a/net/batman-adv/bat_v.h b/net/batman-adv/bat_v.h index dd7c4b647e6b..a17ab68bbce8 100644 --- a/net/batman-adv/bat_v.h +++ b/net/batman-adv/bat_v.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Linus Lüssing diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 1de992c58b35..a83478c46597 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner @@ -24,7 +25,7 @@ #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> -#include <linux/fs.h> +#include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/jiffies.h> #include <linux/kernel.h> @@ -41,18 +42,18 @@ #include <linux/types.h> #include <linux/workqueue.h> #include <net/cfg80211.h> +#include <uapi/linux/batadv_packet.h> #include "bat_algo.h" #include "bat_v_ogm.h" #include "hard-interface.h" #include "log.h" #include "originator.h" -#include "packet.h" #include "routing.h" #include "send.h" /** - * batadv_v_elp_start_timer - restart timer for ELP periodic work + * batadv_v_elp_start_timer() - restart timer for ELP periodic work * @hard_iface: the interface for which the timer has to be reset */ static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface) @@ -67,7 +68,7 @@ static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface) } /** - * batadv_v_elp_get_throughput - get the throughput towards a neighbour + * batadv_v_elp_get_throughput() - get the throughput towards a neighbour * @neigh: the neighbour for which the throughput has to be obtained * * Return: The throughput towards the given neighbour in multiples of 100kpbs @@ -153,8 +154,8 @@ default_throughput: } /** - * batadv_v_elp_throughput_metric_update - worker updating the throughput metric - * of a single hop neighbour + * batadv_v_elp_throughput_metric_update() - worker updating the throughput + * metric of a single hop neighbour * @work: the work queue item */ void batadv_v_elp_throughput_metric_update(struct work_struct *work) @@ -177,7 +178,7 @@ void batadv_v_elp_throughput_metric_update(struct work_struct *work) } /** - * batadv_v_elp_wifi_neigh_probe - send link probing packets to a neighbour + * batadv_v_elp_wifi_neigh_probe() - send link probing packets to a neighbour * @neigh: the neighbour to probe * * Sends a predefined number of unicast wifi packets to a given neighbour in @@ -240,7 +241,7 @@ batadv_v_elp_wifi_neigh_probe(struct batadv_hardif_neigh_node *neigh) } /** - * batadv_v_elp_periodic_work - ELP periodic task per interface + * batadv_v_elp_periodic_work() - ELP periodic task per interface * @work: work queue item * * Emits broadcast ELP message in regular intervals. @@ -327,7 +328,7 @@ out: } /** - * batadv_v_elp_iface_enable - setup the ELP interface private resources + * batadv_v_elp_iface_enable() - setup the ELP interface private resources * @hard_iface: interface for which the data has to be prepared * * Return: 0 on success or a -ENOMEM in case of failure. @@ -375,7 +376,7 @@ out: } /** - * batadv_v_elp_iface_disable - release ELP interface private resources + * batadv_v_elp_iface_disable() - release ELP interface private resources * @hard_iface: interface for which the resources have to be released */ void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface) @@ -387,7 +388,7 @@ void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface) } /** - * batadv_v_elp_iface_activate - update the ELP buffer belonging to the given + * batadv_v_elp_iface_activate() - update the ELP buffer belonging to the given * hard-interface * @primary_iface: the new primary interface * @hard_iface: interface holding the to-be-updated buffer @@ -408,7 +409,7 @@ void batadv_v_elp_iface_activate(struct batadv_hard_iface *primary_iface, } /** - * batadv_v_elp_primary_iface_set - change internal data to reflect the new + * batadv_v_elp_primary_iface_set() - change internal data to reflect the new * primary interface * @primary_iface: the new primary interface */ @@ -428,7 +429,7 @@ void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface) } /** - * batadv_v_elp_neigh_update - update an ELP neighbour node + * batadv_v_elp_neigh_update() - update an ELP neighbour node * @bat_priv: the bat priv with all the soft interface information * @neigh_addr: the neighbour interface address * @if_incoming: the interface the packet was received through @@ -488,7 +489,7 @@ orig_free: } /** - * batadv_v_elp_packet_recv - main ELP packet handler + * batadv_v_elp_packet_recv() - main ELP packet handler * @skb: the received packet * @if_incoming: the interface this packet was received through * diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h index 376ead280ab9..5e39d0588a48 100644 --- a/net/batman-adv/bat_v_elp.h +++ b/net/batman-adv/bat_v_elp.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index c251445a42a0..ba59b77c605d 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: * * Antonio Quartulli @@ -22,7 +23,7 @@ #include <linux/byteorder/generic.h> #include <linux/errno.h> #include <linux/etherdevice.h> -#include <linux/fs.h> +#include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/jiffies.h> #include <linux/kernel.h> @@ -38,20 +39,20 @@ #include <linux/string.h> #include <linux/types.h> #include <linux/workqueue.h> +#include <uapi/linux/batadv_packet.h> #include "bat_algo.h" #include "hard-interface.h" #include "hash.h" #include "log.h" #include "originator.h" -#include "packet.h" #include "routing.h" #include "send.h" #include "translation-table.h" #include "tvlv.h" /** - * batadv_v_ogm_orig_get - retrieve and possibly create an originator node + * batadv_v_ogm_orig_get() - retrieve and possibly create an originator node * @bat_priv: the bat priv with all the soft interface information * @addr: the address of the originator * @@ -88,7 +89,7 @@ struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv, } /** - * batadv_v_ogm_start_timer - restart the OGM sending timer + * batadv_v_ogm_start_timer() - restart the OGM sending timer * @bat_priv: the bat priv with all the soft interface information */ static void batadv_v_ogm_start_timer(struct batadv_priv *bat_priv) @@ -107,7 +108,7 @@ static void batadv_v_ogm_start_timer(struct batadv_priv *bat_priv) } /** - * batadv_v_ogm_send_to_if - send a batman ogm using a given interface + * batadv_v_ogm_send_to_if() - send a batman ogm using a given interface * @skb: the OGM to send * @hard_iface: the interface to use to send the OGM */ @@ -127,7 +128,7 @@ static void batadv_v_ogm_send_to_if(struct sk_buff *skb, } /** - * batadv_v_ogm_send - periodic worker broadcasting the own OGM + * batadv_v_ogm_send() - periodic worker broadcasting the own OGM * @work: work queue item */ static void batadv_v_ogm_send(struct work_struct *work) @@ -235,7 +236,7 @@ out: } /** - * batadv_v_ogm_iface_enable - prepare an interface for B.A.T.M.A.N. V + * batadv_v_ogm_iface_enable() - prepare an interface for B.A.T.M.A.N. V * @hard_iface: the interface to prepare * * Takes care of scheduling own OGM sending routine for this interface. @@ -252,7 +253,7 @@ int batadv_v_ogm_iface_enable(struct batadv_hard_iface *hard_iface) } /** - * batadv_v_ogm_primary_iface_set - set a new primary interface + * batadv_v_ogm_primary_iface_set() - set a new primary interface * @primary_iface: the new primary interface */ void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface) @@ -268,8 +269,8 @@ void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface) } /** - * batadv_v_forward_penalty - apply a penalty to the throughput metric forwarded - * with B.A.T.M.A.N. V OGMs + * batadv_v_forward_penalty() - apply a penalty to the throughput metric + * forwarded with B.A.T.M.A.N. V OGMs * @bat_priv: the bat priv with all the soft interface information * @if_incoming: the interface where the OGM has been received * @if_outgoing: the interface where the OGM has to be forwarded to @@ -314,7 +315,7 @@ static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv, } /** - * batadv_v_ogm_forward - check conditions and forward an OGM to the given + * batadv_v_ogm_forward() - check conditions and forward an OGM to the given * outgoing interface * @bat_priv: the bat priv with all the soft interface information * @ogm_received: previously received OGM to be forwarded @@ -405,7 +406,7 @@ out: } /** - * batadv_v_ogm_metric_update - update route metric based on OGM + * batadv_v_ogm_metric_update() - update route metric based on OGM * @bat_priv: the bat priv with all the soft interface information * @ogm2: OGM2 structure * @orig_node: Originator structure for which the OGM has been received @@ -490,7 +491,7 @@ out: } /** - * batadv_v_ogm_route_update - update routes based on OGM + * batadv_v_ogm_route_update() - update routes based on OGM * @bat_priv: the bat priv with all the soft interface information * @ethhdr: the Ethernet header of the OGM2 * @ogm2: OGM2 structure @@ -590,7 +591,7 @@ out: } /** - * batadv_v_ogm_process_per_outif - process a batman v OGM for an outgoing if + * batadv_v_ogm_process_per_outif() - process a batman v OGM for an outgoing if * @bat_priv: the bat priv with all the soft interface information * @ethhdr: the Ethernet header of the OGM2 * @ogm2: OGM2 structure @@ -639,7 +640,7 @@ batadv_v_ogm_process_per_outif(struct batadv_priv *bat_priv, } /** - * batadv_v_ogm_aggr_packet - checks if there is another OGM aggregated + * batadv_v_ogm_aggr_packet() - checks if there is another OGM aggregated * @buff_pos: current position in the skb * @packet_len: total length of the skb * @tvlv_len: tvlv length of the previously considered OGM @@ -659,7 +660,7 @@ static bool batadv_v_ogm_aggr_packet(int buff_pos, int packet_len, } /** - * batadv_v_ogm_process - process an incoming batman v OGM + * batadv_v_ogm_process() - process an incoming batman v OGM * @skb: the skb containing the OGM * @ogm_offset: offset to the OGM which should be processed (for aggregates) * @if_incoming: the interface where this packet was receved @@ -787,7 +788,7 @@ out: } /** - * batadv_v_ogm_packet_recv - OGM2 receiving handler + * batadv_v_ogm_packet_recv() - OGM2 receiving handler * @skb: the received OGM * @if_incoming: the interface where this OGM has been received * @@ -851,7 +852,7 @@ free_skb: } /** - * batadv_v_ogm_init - initialise the OGM2 engine + * batadv_v_ogm_init() - initialise the OGM2 engine * @bat_priv: the bat priv with all the soft interface information * * Return: 0 on success or a negative error code in case of failure @@ -884,7 +885,7 @@ int batadv_v_ogm_init(struct batadv_priv *bat_priv) } /** - * batadv_v_ogm_free - free OGM private resources + * batadv_v_ogm_free() - free OGM private resources * @bat_priv: the bat priv with all the soft interface information */ void batadv_v_ogm_free(struct batadv_priv *bat_priv) diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h index 2068770b542d..6a4c14ccc3c6 100644 --- a/net/batman-adv/bat_v_ogm.h +++ b/net/batman-adv/bat_v_ogm.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: * * Antonio Quartulli diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c index 2b070c7e31da..bdc1ef06e05b 100644 --- a/net/batman-adv/bitarray.c +++ b/net/batman-adv/bitarray.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner @@ -32,7 +33,7 @@ static void batadv_bitmap_shift_left(unsigned long *seq_bits, s32 n) } /** - * batadv_bit_get_packet - receive and process one packet within the sequence + * batadv_bit_get_packet() - receive and process one packet within the sequence * number window * @priv: the bat priv with all the soft interface information * @seq_bits: pointer to the sequence number receive packet diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h index cc262c9d97e0..ca9d0753dd6b 100644 --- a/net/batman-adv/bitarray.h +++ b/net/batman-adv/bitarray.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner @@ -26,7 +27,7 @@ #include <linux/types.h> /** - * batadv_test_bit - check if bit is set in the current window + * batadv_test_bit() - check if bit is set in the current window * * @seq_bits: pointer to the sequence number receive packet * @last_seqno: latest sequence number in seq_bits @@ -46,7 +47,12 @@ static inline bool batadv_test_bit(const unsigned long *seq_bits, return test_bit(diff, seq_bits) != 0; } -/* turn corresponding bit on, so we can remember that we got the packet */ +/** + * batadv_set_bit() - Turn corresponding bit on, so we can remember that we got + * the packet + * @seq_bits: bitmap of the packet receive window + * @n: relative sequence number of newly received packet + */ static inline void batadv_set_bit(unsigned long *seq_bits, s32 n) { /* if too old, just drop it */ diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index cdd8e8e4df0b..fad47853ad3c 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: * * Simon Wunderlich @@ -24,7 +25,7 @@ #include <linux/crc16.h> #include <linux/errno.h> #include <linux/etherdevice.h> -#include <linux/fs.h> +#include <linux/gfp.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> @@ -49,6 +50,7 @@ #include <net/genetlink.h> #include <net/netlink.h> #include <net/sock.h> +#include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "hard-interface.h" @@ -56,7 +58,6 @@ #include "log.h" #include "netlink.h" #include "originator.h" -#include "packet.h" #include "soft-interface.h" #include "sysfs.h" #include "translation-table.h" @@ -69,7 +70,7 @@ batadv_bla_send_announce(struct batadv_priv *bat_priv, struct batadv_bla_backbone_gw *backbone_gw); /** - * batadv_choose_claim - choose the right bucket for a claim. + * batadv_choose_claim() - choose the right bucket for a claim. * @data: data to hash * @size: size of the hash table * @@ -87,7 +88,7 @@ static inline u32 batadv_choose_claim(const void *data, u32 size) } /** - * batadv_choose_backbone_gw - choose the right bucket for a backbone gateway. + * batadv_choose_backbone_gw() - choose the right bucket for a backbone gateway. * @data: data to hash * @size: size of the hash table * @@ -105,7 +106,7 @@ static inline u32 batadv_choose_backbone_gw(const void *data, u32 size) } /** - * batadv_compare_backbone_gw - compare address and vid of two backbone gws + * batadv_compare_backbone_gw() - compare address and vid of two backbone gws * @node: list node of the first entry to compare * @data2: pointer to the second backbone gateway * @@ -129,7 +130,7 @@ static bool batadv_compare_backbone_gw(const struct hlist_node *node, } /** - * batadv_compare_claim - compare address and vid of two claims + * batadv_compare_claim() - compare address and vid of two claims * @node: list node of the first entry to compare * @data2: pointer to the second claims * @@ -153,7 +154,7 @@ static bool batadv_compare_claim(const struct hlist_node *node, } /** - * batadv_backbone_gw_release - release backbone gw from lists and queue for + * batadv_backbone_gw_release() - release backbone gw from lists and queue for * free after rcu grace period * @ref: kref pointer of the backbone gw */ @@ -168,7 +169,7 @@ static void batadv_backbone_gw_release(struct kref *ref) } /** - * batadv_backbone_gw_put - decrement the backbone gw refcounter and possibly + * batadv_backbone_gw_put() - decrement the backbone gw refcounter and possibly * release it * @backbone_gw: backbone gateway to be free'd */ @@ -178,8 +179,8 @@ static void batadv_backbone_gw_put(struct batadv_bla_backbone_gw *backbone_gw) } /** - * batadv_claim_release - release claim from lists and queue for free after rcu - * grace period + * batadv_claim_release() - release claim from lists and queue for free after + * rcu grace period * @ref: kref pointer of the claim */ static void batadv_claim_release(struct kref *ref) @@ -204,8 +205,7 @@ static void batadv_claim_release(struct kref *ref) } /** - * batadv_claim_put - decrement the claim refcounter and possibly - * release it + * batadv_claim_put() - decrement the claim refcounter and possibly release it * @claim: claim to be free'd */ static void batadv_claim_put(struct batadv_bla_claim *claim) @@ -214,7 +214,7 @@ static void batadv_claim_put(struct batadv_bla_claim *claim) } /** - * batadv_claim_hash_find - looks for a claim in the claim hash + * batadv_claim_hash_find() - looks for a claim in the claim hash * @bat_priv: the bat priv with all the soft interface information * @data: search data (may be local/static data) * @@ -253,7 +253,7 @@ batadv_claim_hash_find(struct batadv_priv *bat_priv, } /** - * batadv_backbone_hash_find - looks for a backbone gateway in the hash + * batadv_backbone_hash_find() - looks for a backbone gateway in the hash * @bat_priv: the bat priv with all the soft interface information * @addr: the address of the originator * @vid: the VLAN ID @@ -297,7 +297,7 @@ batadv_backbone_hash_find(struct batadv_priv *bat_priv, u8 *addr, } /** - * batadv_bla_del_backbone_claims - delete all claims for a backbone + * batadv_bla_del_backbone_claims() - delete all claims for a backbone * @backbone_gw: backbone gateway where the claims should be removed */ static void @@ -337,7 +337,7 @@ batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw) } /** - * batadv_bla_send_claim - sends a claim frame according to the provided info + * batadv_bla_send_claim() - sends a claim frame according to the provided info * @bat_priv: the bat priv with all the soft interface information * @mac: the mac address to be announced within the claim * @vid: the VLAN ID @@ -457,7 +457,7 @@ out: } /** - * batadv_bla_loopdetect_report - worker for reporting the loop + * batadv_bla_loopdetect_report() - worker for reporting the loop * @work: work queue item * * Throws an uevent, as the loopdetect check function can't do that itself @@ -487,7 +487,7 @@ static void batadv_bla_loopdetect_report(struct work_struct *work) } /** - * batadv_bla_get_backbone_gw - finds or creates a backbone gateway + * batadv_bla_get_backbone_gw() - finds or creates a backbone gateway * @bat_priv: the bat priv with all the soft interface information * @orig: the mac address of the originator * @vid: the VLAN ID @@ -560,7 +560,7 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig, } /** - * batadv_bla_update_own_backbone_gw - updates the own backbone gw for a VLAN + * batadv_bla_update_own_backbone_gw() - updates the own backbone gw for a VLAN * @bat_priv: the bat priv with all the soft interface information * @primary_if: the selected primary interface * @vid: VLAN identifier @@ -586,7 +586,7 @@ batadv_bla_update_own_backbone_gw(struct batadv_priv *bat_priv, } /** - * batadv_bla_answer_request - answer a bla request by sending own claims + * batadv_bla_answer_request() - answer a bla request by sending own claims * @bat_priv: the bat priv with all the soft interface information * @primary_if: interface where the request came on * @vid: the vid where the request came on @@ -636,7 +636,7 @@ static void batadv_bla_answer_request(struct batadv_priv *bat_priv, } /** - * batadv_bla_send_request - send a request to repeat claims + * batadv_bla_send_request() - send a request to repeat claims * @backbone_gw: the backbone gateway from whom we are out of sync * * When the crc is wrong, ask the backbone gateway for a full table update. @@ -663,7 +663,7 @@ static void batadv_bla_send_request(struct batadv_bla_backbone_gw *backbone_gw) } /** - * batadv_bla_send_announce - Send an announcement frame + * batadv_bla_send_announce() - Send an announcement frame * @bat_priv: the bat priv with all the soft interface information * @backbone_gw: our backbone gateway which should be announced */ @@ -684,7 +684,7 @@ static void batadv_bla_send_announce(struct batadv_priv *bat_priv, } /** - * batadv_bla_add_claim - Adds a claim in the claim hash + * batadv_bla_add_claim() - Adds a claim in the claim hash * @bat_priv: the bat priv with all the soft interface information * @mac: the mac address of the claim * @vid: the VLAN ID of the frame @@ -774,7 +774,7 @@ claim_free_ref: } /** - * batadv_bla_claim_get_backbone_gw - Get valid reference for backbone_gw of + * batadv_bla_claim_get_backbone_gw() - Get valid reference for backbone_gw of * claim * @claim: claim whose backbone_gw should be returned * @@ -794,7 +794,7 @@ batadv_bla_claim_get_backbone_gw(struct batadv_bla_claim *claim) } /** - * batadv_bla_del_claim - delete a claim from the claim hash + * batadv_bla_del_claim() - delete a claim from the claim hash * @bat_priv: the bat priv with all the soft interface information * @mac: mac address of the claim to be removed * @vid: VLAN id for the claim to be removed @@ -822,7 +822,7 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv, } /** - * batadv_handle_announce - check for ANNOUNCE frame + * batadv_handle_announce() - check for ANNOUNCE frame * @bat_priv: the bat priv with all the soft interface information * @an_addr: announcement mac address (ARP Sender HW address) * @backbone_addr: originator address of the sender (Ethernet source MAC) @@ -880,7 +880,7 @@ static bool batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, } /** - * batadv_handle_request - check for REQUEST frame + * batadv_handle_request() - check for REQUEST frame * @bat_priv: the bat priv with all the soft interface information * @primary_if: the primary hard interface of this batman soft interface * @backbone_addr: backbone address to be requested (ARP sender HW MAC) @@ -913,7 +913,7 @@ static bool batadv_handle_request(struct batadv_priv *bat_priv, } /** - * batadv_handle_unclaim - check for UNCLAIM frame + * batadv_handle_unclaim() - check for UNCLAIM frame * @bat_priv: the bat priv with all the soft interface information * @primary_if: the primary hard interface of this batman soft interface * @backbone_addr: originator address of the backbone (Ethernet source) @@ -951,7 +951,7 @@ static bool batadv_handle_unclaim(struct batadv_priv *bat_priv, } /** - * batadv_handle_claim - check for CLAIM frame + * batadv_handle_claim() - check for CLAIM frame * @bat_priv: the bat priv with all the soft interface information * @primary_if: the primary hard interface of this batman soft interface * @backbone_addr: originator address of the backbone (Ethernet Source) @@ -988,7 +988,7 @@ static bool batadv_handle_claim(struct batadv_priv *bat_priv, } /** - * batadv_check_claim_group - check for claim group membership + * batadv_check_claim_group() - check for claim group membership * @bat_priv: the bat priv with all the soft interface information * @primary_if: the primary interface of this batman interface * @hw_src: the Hardware source in the ARP Header @@ -1063,7 +1063,7 @@ static int batadv_check_claim_group(struct batadv_priv *bat_priv, } /** - * batadv_bla_process_claim - Check if this is a claim frame, and process it + * batadv_bla_process_claim() - Check if this is a claim frame, and process it * @bat_priv: the bat priv with all the soft interface information * @primary_if: the primary hard interface of this batman soft interface * @skb: the frame to be checked @@ -1205,7 +1205,7 @@ static bool batadv_bla_process_claim(struct batadv_priv *bat_priv, } /** - * batadv_bla_purge_backbone_gw - Remove backbone gateways after a timeout or + * batadv_bla_purge_backbone_gw() - Remove backbone gateways after a timeout or * immediately * @bat_priv: the bat priv with all the soft interface information * @now: whether the whole hash shall be wiped now @@ -1258,7 +1258,7 @@ purge_now: } /** - * batadv_bla_purge_claims - Remove claims after a timeout or immediately + * batadv_bla_purge_claims() - Remove claims after a timeout or immediately * @bat_priv: the bat priv with all the soft interface information * @primary_if: the selected primary interface, may be NULL if now is set * @now: whether the whole hash shall be wiped now @@ -1316,7 +1316,7 @@ skip: } /** - * batadv_bla_update_orig_address - Update the backbone gateways when the own + * batadv_bla_update_orig_address() - Update the backbone gateways when the own * originator address changes * @bat_priv: the bat priv with all the soft interface information * @primary_if: the new selected primary_if @@ -1372,7 +1372,7 @@ void batadv_bla_update_orig_address(struct batadv_priv *bat_priv, } /** - * batadv_bla_send_loopdetect - send a loopdetect frame + * batadv_bla_send_loopdetect() - send a loopdetect frame * @bat_priv: the bat priv with all the soft interface information * @backbone_gw: the backbone gateway for which a loop should be detected * @@ -1392,7 +1392,7 @@ batadv_bla_send_loopdetect(struct batadv_priv *bat_priv, } /** - * batadv_bla_status_update - purge bla interfaces if necessary + * batadv_bla_status_update() - purge bla interfaces if necessary * @net_dev: the soft interface net device */ void batadv_bla_status_update(struct net_device *net_dev) @@ -1412,7 +1412,7 @@ void batadv_bla_status_update(struct net_device *net_dev) } /** - * batadv_bla_periodic_work - performs periodic bla work + * batadv_bla_periodic_work() - performs periodic bla work * @work: kernel work struct * * periodic work to do: @@ -1517,7 +1517,7 @@ static struct lock_class_key batadv_claim_hash_lock_class_key; static struct lock_class_key batadv_backbone_hash_lock_class_key; /** - * batadv_bla_init - initialize all bla structures + * batadv_bla_init() - initialize all bla structures * @bat_priv: the bat priv with all the soft interface information * * Return: 0 on success, < 0 on error. @@ -1579,7 +1579,7 @@ int batadv_bla_init(struct batadv_priv *bat_priv) } /** - * batadv_bla_check_bcast_duplist - Check if a frame is in the broadcast dup. + * batadv_bla_check_bcast_duplist() - Check if a frame is in the broadcast dup. * @bat_priv: the bat priv with all the soft interface information * @skb: contains the bcast_packet to be checked * @@ -1652,7 +1652,7 @@ out: } /** - * batadv_bla_is_backbone_gw_orig - Check if the originator is a gateway for + * batadv_bla_is_backbone_gw_orig() - Check if the originator is a gateway for * the VLAN identified by vid. * @bat_priv: the bat priv with all the soft interface information * @orig: originator mac address @@ -1692,7 +1692,7 @@ bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig, } /** - * batadv_bla_is_backbone_gw - check if originator is a backbone gw for a VLAN. + * batadv_bla_is_backbone_gw() - check if originator is a backbone gw for a VLAN * @skb: the frame to be checked * @orig_node: the orig_node of the frame * @hdr_size: maximum length of the frame @@ -1726,7 +1726,7 @@ bool batadv_bla_is_backbone_gw(struct sk_buff *skb, } /** - * batadv_bla_free - free all bla structures + * batadv_bla_free() - free all bla structures * @bat_priv: the bat priv with all the soft interface information * * for softinterface free or module unload @@ -1753,7 +1753,7 @@ void batadv_bla_free(struct batadv_priv *bat_priv) } /** - * batadv_bla_loopdetect_check - check and handle a detected loop + * batadv_bla_loopdetect_check() - check and handle a detected loop * @bat_priv: the bat priv with all the soft interface information * @skb: the packet to check * @primary_if: interface where the request came on @@ -1802,7 +1802,7 @@ batadv_bla_loopdetect_check(struct batadv_priv *bat_priv, struct sk_buff *skb, } /** - * batadv_bla_rx - check packets coming from the mesh. + * batadv_bla_rx() - check packets coming from the mesh. * @bat_priv: the bat priv with all the soft interface information * @skb: the frame to be checked * @vid: the VLAN ID of the frame @@ -1914,7 +1914,7 @@ out: } /** - * batadv_bla_tx - check packets going into the mesh + * batadv_bla_tx() - check packets going into the mesh * @bat_priv: the bat priv with all the soft interface information * @skb: the frame to be checked * @vid: the VLAN ID of the frame @@ -2022,7 +2022,7 @@ out: #ifdef CONFIG_BATMAN_ADV_DEBUGFS /** - * batadv_bla_claim_table_seq_print_text - print the claim table in a seq file + * batadv_bla_claim_table_seq_print_text() - print the claim table in a seq file * @seq: seq file to print on * @offset: not used * @@ -2084,7 +2084,7 @@ out: #endif /** - * batadv_bla_claim_dump_entry - dump one entry of the claim table + * batadv_bla_claim_dump_entry() - dump one entry of the claim table * to a netlink socket * @msg: buffer for the message * @portid: netlink port @@ -2143,7 +2143,7 @@ out: } /** - * batadv_bla_claim_dump_bucket - dump one bucket of the claim table + * batadv_bla_claim_dump_bucket() - dump one bucket of the claim table * to a netlink socket * @msg: buffer for the message * @portid: netlink port @@ -2180,7 +2180,7 @@ unlock: } /** - * batadv_bla_claim_dump - dump claim table to a netlink socket + * batadv_bla_claim_dump() - dump claim table to a netlink socket * @msg: buffer for the message * @cb: callback structure containing arguments * @@ -2247,8 +2247,8 @@ out: #ifdef CONFIG_BATMAN_ADV_DEBUGFS /** - * batadv_bla_backbone_table_seq_print_text - print the backbone table in a seq - * file + * batadv_bla_backbone_table_seq_print_text() - print the backbone table in a + * seq file * @seq: seq file to print on * @offset: not used * @@ -2312,8 +2312,8 @@ out: #endif /** - * batadv_bla_backbone_dump_entry - dump one entry of the backbone table - * to a netlink socket + * batadv_bla_backbone_dump_entry() - dump one entry of the backbone table to a + * netlink socket * @msg: buffer for the message * @portid: netlink port * @seq: Sequence number of netlink message @@ -2373,8 +2373,8 @@ out: } /** - * batadv_bla_backbone_dump_bucket - dump one bucket of the backbone table - * to a netlink socket + * batadv_bla_backbone_dump_bucket() - dump one bucket of the backbone table to + * a netlink socket * @msg: buffer for the message * @portid: netlink port * @seq: Sequence number of netlink message @@ -2410,7 +2410,7 @@ unlock: } /** - * batadv_bla_backbone_dump - dump backbone table to a netlink socket + * batadv_bla_backbone_dump() - dump backbone table to a netlink socket * @msg: buffer for the message * @cb: callback structure containing arguments * @@ -2477,7 +2477,7 @@ out: #ifdef CONFIG_BATMAN_ADV_DAT /** - * batadv_bla_check_claim - check if address is claimed + * batadv_bla_check_claim() - check if address is claimed * * @bat_priv: the bat priv with all the soft interface information * @addr: mac address of which the claim status is checked diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 234775748b8e..b27571abcd2f 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: * * Simon Wunderlich @@ -30,8 +31,8 @@ struct seq_file; struct sk_buff; /** - * batadv_bla_is_loopdetect_mac - check if the mac address is from a loop detect - * frame sent by bridge loop avoidance + * batadv_bla_is_loopdetect_mac() - check if the mac address is from a loop + * detect frame sent by bridge loop avoidance * @mac: mac address to check * * Return: true if the it looks like a loop detect frame diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index e32ad47c6efd..21d1189957a7 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: * * Marek Lindner @@ -25,7 +26,6 @@ #include <linux/fs.h> #include <linux/netdevice.h> #include <linux/printk.h> -#include <linux/sched.h> /* for linux/wait.h */ #include <linux/seq_file.h> #include <linux/stat.h> #include <linux/stddef.h> @@ -66,8 +66,8 @@ static int batadv_originators_open(struct inode *inode, struct file *file) } /** - * batadv_originators_hardif_open - handles debugfs output for the - * originator table of an hard interface + * batadv_originators_hardif_open() - handles debugfs output for the originator + * table of an hard interface * @inode: inode pointer to debugfs file * @file: pointer to the seq_file * @@ -117,7 +117,7 @@ static int batadv_bla_backbone_table_open(struct inode *inode, #ifdef CONFIG_BATMAN_ADV_DAT /** - * batadv_dat_cache_open - Prepare file handler for reads from dat_chache + * batadv_dat_cache_open() - Prepare file handler for reads from dat_chache * @inode: inode which was opened * @file: file handle to be initialized * @@ -154,7 +154,7 @@ static int batadv_nc_nodes_open(struct inode *inode, struct file *file) #ifdef CONFIG_BATMAN_ADV_MCAST /** - * batadv_mcast_flags_open - prepare file handler for reads from mcast_flags + * batadv_mcast_flags_open() - prepare file handler for reads from mcast_flags * @inode: inode which was opened * @file: file handle to be initialized * @@ -259,6 +259,9 @@ static struct batadv_debuginfo *batadv_hardif_debuginfos[] = { NULL, }; +/** + * batadv_debugfs_init() - Initialize soft interface independent debugfs entries + */ void batadv_debugfs_init(void) { struct batadv_debuginfo **bat_debug; @@ -289,6 +292,9 @@ err: batadv_debugfs = NULL; } +/** + * batadv_debugfs_destroy() - Remove all debugfs entries + */ void batadv_debugfs_destroy(void) { debugfs_remove_recursive(batadv_debugfs); @@ -296,7 +302,7 @@ void batadv_debugfs_destroy(void) } /** - * batadv_debugfs_add_hardif - creates the base directory for a hard interface + * batadv_debugfs_add_hardif() - creates the base directory for a hard interface * in debugfs. * @hard_iface: hard interface which should be added. * @@ -338,7 +344,7 @@ out: } /** - * batadv_debugfs_del_hardif - delete the base directory for a hard interface + * batadv_debugfs_del_hardif() - delete the base directory for a hard interface * in debugfs. * @hard_iface: hard interface which is deleted. */ @@ -355,6 +361,12 @@ void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface) } } +/** + * batadv_debugfs_add_meshif() - Initialize interface dependent debugfs entries + * @dev: netdev struct of the soft interface + * + * Return: 0 on success or negative error number in case of failure + */ int batadv_debugfs_add_meshif(struct net_device *dev) { struct batadv_priv *bat_priv = netdev_priv(dev); @@ -401,6 +413,10 @@ out: return -ENOMEM; } +/** + * batadv_debugfs_del_meshif() - Remove interface dependent debugfs entries + * @dev: netdev struct of the soft interface + */ void batadv_debugfs_del_meshif(struct net_device *dev) { struct batadv_priv *bat_priv = netdev_priv(dev); diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index 9c5d4a65b98c..90a08d35c501 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: * * Marek Lindner diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 760c0de72582..9703c791ffc5 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: * * Antonio Quartulli @@ -23,7 +24,7 @@ #include <linux/byteorder/generic.h> #include <linux/errno.h> #include <linux/etherdevice.h> -#include <linux/fs.h> +#include <linux/gfp.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> @@ -55,7 +56,7 @@ static void batadv_dat_purge(struct work_struct *work); /** - * batadv_dat_start_timer - initialise the DAT periodic worker + * batadv_dat_start_timer() - initialise the DAT periodic worker * @bat_priv: the bat priv with all the soft interface information */ static void batadv_dat_start_timer(struct batadv_priv *bat_priv) @@ -66,7 +67,7 @@ static void batadv_dat_start_timer(struct batadv_priv *bat_priv) } /** - * batadv_dat_entry_release - release dat_entry from lists and queue for free + * batadv_dat_entry_release() - release dat_entry from lists and queue for free * after rcu grace period * @ref: kref pointer of the dat_entry */ @@ -80,7 +81,7 @@ static void batadv_dat_entry_release(struct kref *ref) } /** - * batadv_dat_entry_put - decrement the dat_entry refcounter and possibly + * batadv_dat_entry_put() - decrement the dat_entry refcounter and possibly * release it * @dat_entry: dat_entry to be free'd */ @@ -90,7 +91,7 @@ static void batadv_dat_entry_put(struct batadv_dat_entry *dat_entry) } /** - * batadv_dat_to_purge - check whether a dat_entry has to be purged or not + * batadv_dat_to_purge() - check whether a dat_entry has to be purged or not * @dat_entry: the entry to check * * Return: true if the entry has to be purged now, false otherwise. @@ -102,7 +103,7 @@ static bool batadv_dat_to_purge(struct batadv_dat_entry *dat_entry) } /** - * __batadv_dat_purge - delete entries from the DAT local storage + * __batadv_dat_purge() - delete entries from the DAT local storage * @bat_priv: the bat priv with all the soft interface information * @to_purge: function in charge to decide whether an entry has to be purged or * not. This function takes the dat_entry as argument and has to @@ -145,8 +146,8 @@ static void __batadv_dat_purge(struct batadv_priv *bat_priv, } /** - * batadv_dat_purge - periodic task that deletes old entries from the local DAT - * hash table + * batadv_dat_purge() - periodic task that deletes old entries from the local + * DAT hash table * @work: kernel work struct */ static void batadv_dat_purge(struct work_struct *work) @@ -164,7 +165,7 @@ static void batadv_dat_purge(struct work_struct *work) } /** - * batadv_compare_dat - comparing function used in the local DAT hash table + * batadv_compare_dat() - comparing function used in the local DAT hash table * @node: node in the local table * @data2: second object to compare the node to * @@ -179,7 +180,7 @@ static bool batadv_compare_dat(const struct hlist_node *node, const void *data2) } /** - * batadv_arp_hw_src - extract the hw_src field from an ARP packet + * batadv_arp_hw_src() - extract the hw_src field from an ARP packet * @skb: ARP packet * @hdr_size: size of the possible header before the ARP packet * @@ -196,7 +197,7 @@ static u8 *batadv_arp_hw_src(struct sk_buff *skb, int hdr_size) } /** - * batadv_arp_ip_src - extract the ip_src field from an ARP packet + * batadv_arp_ip_src() - extract the ip_src field from an ARP packet * @skb: ARP packet * @hdr_size: size of the possible header before the ARP packet * @@ -208,7 +209,7 @@ static __be32 batadv_arp_ip_src(struct sk_buff *skb, int hdr_size) } /** - * batadv_arp_hw_dst - extract the hw_dst field from an ARP packet + * batadv_arp_hw_dst() - extract the hw_dst field from an ARP packet * @skb: ARP packet * @hdr_size: size of the possible header before the ARP packet * @@ -220,7 +221,7 @@ static u8 *batadv_arp_hw_dst(struct sk_buff *skb, int hdr_size) } /** - * batadv_arp_ip_dst - extract the ip_dst field from an ARP packet + * batadv_arp_ip_dst() - extract the ip_dst field from an ARP packet * @skb: ARP packet * @hdr_size: size of the possible header before the ARP packet * @@ -232,7 +233,7 @@ static __be32 batadv_arp_ip_dst(struct sk_buff *skb, int hdr_size) } /** - * batadv_hash_dat - compute the hash value for an IP address + * batadv_hash_dat() - compute the hash value for an IP address * @data: data to hash * @size: size of the hash table * @@ -267,7 +268,7 @@ static u32 batadv_hash_dat(const void *data, u32 size) } /** - * batadv_dat_entry_hash_find - look for a given dat_entry in the local hash + * batadv_dat_entry_hash_find() - look for a given dat_entry in the local hash * table * @bat_priv: the bat priv with all the soft interface information * @ip: search key @@ -310,7 +311,7 @@ batadv_dat_entry_hash_find(struct batadv_priv *bat_priv, __be32 ip, } /** - * batadv_dat_entry_add - add a new dat entry or update it if already exists + * batadv_dat_entry_add() - add a new dat entry or update it if already exists * @bat_priv: the bat priv with all the soft interface information * @ip: ipv4 to add/edit * @mac_addr: mac address to assign to the given ipv4 @@ -367,7 +368,8 @@ out: #ifdef CONFIG_BATMAN_ADV_DEBUG /** - * batadv_dbg_arp - print a debug message containing all the ARP packet details + * batadv_dbg_arp() - print a debug message containing all the ARP packet + * details * @bat_priv: the bat priv with all the soft interface information * @skb: ARP packet * @hdr_size: size of the possible header before the ARP packet @@ -448,7 +450,7 @@ static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb, #endif /* CONFIG_BATMAN_ADV_DEBUG */ /** - * batadv_is_orig_node_eligible - check whether a node can be a DHT candidate + * batadv_is_orig_node_eligible() - check whether a node can be a DHT candidate * @res: the array with the already selected candidates * @select: number of already selected candidates * @tmp_max: address of the currently evaluated node @@ -502,7 +504,7 @@ out: } /** - * batadv_choose_next_candidate - select the next DHT candidate + * batadv_choose_next_candidate() - select the next DHT candidate * @bat_priv: the bat priv with all the soft interface information * @cands: candidates array * @select: number of candidates already present in the array @@ -566,8 +568,8 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv, } /** - * batadv_dat_select_candidates - select the nodes which the DHT message has to - * be sent to + * batadv_dat_select_candidates() - select the nodes which the DHT message has + * to be sent to * @bat_priv: the bat priv with all the soft interface information * @ip_dst: ipv4 to look up in the DHT * @vid: VLAN identifier @@ -612,7 +614,7 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst, } /** - * batadv_dat_send_data - send a payload to the selected candidates + * batadv_dat_send_data() - send a payload to the selected candidates * @bat_priv: the bat priv with all the soft interface information * @skb: payload to send * @ip: the DHT key @@ -688,7 +690,7 @@ out: } /** - * batadv_dat_tvlv_container_update - update the dat tvlv container after dat + * batadv_dat_tvlv_container_update() - update the dat tvlv container after dat * setting change * @bat_priv: the bat priv with all the soft interface information */ @@ -710,7 +712,7 @@ static void batadv_dat_tvlv_container_update(struct batadv_priv *bat_priv) } /** - * batadv_dat_status_update - update the dat tvlv container after dat + * batadv_dat_status_update() - update the dat tvlv container after dat * setting change * @net_dev: the soft interface net device */ @@ -722,7 +724,7 @@ void batadv_dat_status_update(struct net_device *net_dev) } /** - * batadv_dat_tvlv_ogm_handler_v1 - process incoming dat tvlv container + * batadv_dat_tvlv_ogm_handler_v1() - process incoming dat tvlv container * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node of the ogm * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) @@ -741,7 +743,7 @@ static void batadv_dat_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, } /** - * batadv_dat_hash_free - free the local DAT hash table + * batadv_dat_hash_free() - free the local DAT hash table * @bat_priv: the bat priv with all the soft interface information */ static void batadv_dat_hash_free(struct batadv_priv *bat_priv) @@ -757,7 +759,7 @@ static void batadv_dat_hash_free(struct batadv_priv *bat_priv) } /** - * batadv_dat_init - initialise the DAT internals + * batadv_dat_init() - initialise the DAT internals * @bat_priv: the bat priv with all the soft interface information * * Return: 0 in case of success, a negative error code otherwise @@ -782,7 +784,7 @@ int batadv_dat_init(struct batadv_priv *bat_priv) } /** - * batadv_dat_free - free the DAT internals + * batadv_dat_free() - free the DAT internals * @bat_priv: the bat priv with all the soft interface information */ void batadv_dat_free(struct batadv_priv *bat_priv) @@ -797,7 +799,7 @@ void batadv_dat_free(struct batadv_priv *bat_priv) #ifdef CONFIG_BATMAN_ADV_DEBUGFS /** - * batadv_dat_cache_seq_print_text - print the local DAT hash table + * batadv_dat_cache_seq_print_text() - print the local DAT hash table * @seq: seq file to print on * @offset: not used * @@ -850,7 +852,7 @@ out: #endif /** - * batadv_arp_get_type - parse an ARP packet and gets the type + * batadv_arp_get_type() - parse an ARP packet and gets the type * @bat_priv: the bat priv with all the soft interface information * @skb: packet to analyse * @hdr_size: size of the possible header before the ARP packet in the skb @@ -924,7 +926,7 @@ out: } /** - * batadv_dat_get_vid - extract the VLAN identifier from skb if any + * batadv_dat_get_vid() - extract the VLAN identifier from skb if any * @skb: the buffer containing the packet to extract the VID from * @hdr_size: the size of the batman-adv header encapsulating the packet * @@ -950,7 +952,7 @@ static unsigned short batadv_dat_get_vid(struct sk_buff *skb, int *hdr_size) } /** - * batadv_dat_arp_create_reply - create an ARP Reply + * batadv_dat_arp_create_reply() - create an ARP Reply * @bat_priv: the bat priv with all the soft interface information * @ip_src: ARP sender IP * @ip_dst: ARP target IP @@ -985,7 +987,7 @@ batadv_dat_arp_create_reply(struct batadv_priv *bat_priv, __be32 ip_src, } /** - * batadv_dat_snoop_outgoing_arp_request - snoop the ARP request and try to + * batadv_dat_snoop_outgoing_arp_request() - snoop the ARP request and try to * answer using DAT * @bat_priv: the bat priv with all the soft interface information * @skb: packet to check @@ -1083,7 +1085,7 @@ out: } /** - * batadv_dat_snoop_incoming_arp_request - snoop the ARP request and try to + * batadv_dat_snoop_incoming_arp_request() - snoop the ARP request and try to * answer using the local DAT storage * @bat_priv: the bat priv with all the soft interface information * @skb: packet to check @@ -1153,7 +1155,7 @@ out: } /** - * batadv_dat_snoop_outgoing_arp_reply - snoop the ARP reply and fill the DHT + * batadv_dat_snoop_outgoing_arp_reply() - snoop the ARP reply and fill the DHT * @bat_priv: the bat priv with all the soft interface information * @skb: packet to check */ @@ -1193,8 +1195,8 @@ void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv, } /** - * batadv_dat_snoop_incoming_arp_reply - snoop the ARP reply and fill the local - * DAT storage only + * batadv_dat_snoop_incoming_arp_reply() - snoop the ARP reply and fill the + * local DAT storage only * @bat_priv: the bat priv with all the soft interface information * @skb: packet to check * @hdr_size: size of the encapsulation header @@ -1282,8 +1284,8 @@ out: } /** - * batadv_dat_drop_broadcast_packet - check if an ARP request has to be dropped - * (because the node has already obtained the reply via DAT) or not + * batadv_dat_drop_broadcast_packet() - check if an ARP request has to be + * dropped (because the node has already obtained the reply via DAT) or not * @bat_priv: the bat priv with all the soft interface information * @forw_packet: the broadcast packet * diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index ec364a3c1c66..12897eb46268 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: * * Antonio Quartulli @@ -23,9 +24,9 @@ #include <linux/compiler.h> #include <linux/netdevice.h> #include <linux/types.h> +#include <uapi/linux/batadv_packet.h> #include "originator.h" -#include "packet.h" struct seq_file; struct sk_buff; @@ -48,7 +49,7 @@ bool batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv, struct batadv_forw_packet *forw_packet); /** - * batadv_dat_init_orig_node_addr - assign a DAT address to the orig_node + * batadv_dat_init_orig_node_addr() - assign a DAT address to the orig_node * @orig_node: the node to assign the DAT address to */ static inline void @@ -61,7 +62,7 @@ batadv_dat_init_orig_node_addr(struct batadv_orig_node *orig_node) } /** - * batadv_dat_init_own_addr - assign a DAT address to the node itself + * batadv_dat_init_own_addr() - assign a DAT address to the node itself * @bat_priv: the bat priv with all the soft interface information * @primary_if: a pointer to the primary interface */ @@ -82,7 +83,7 @@ void batadv_dat_free(struct batadv_priv *bat_priv); int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset); /** - * batadv_dat_inc_counter - increment the correct DAT packet counter + * batadv_dat_inc_counter() - increment the correct DAT packet counter * @bat_priv: the bat priv with all the soft interface information * @subtype: the 4addr subtype of the packet to be counted * diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index a98cf1104a30..22dde42fd80e 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: * * Martin Hundebøll <martin@hundeboll.net> @@ -22,7 +23,7 @@ #include <linux/byteorder/generic.h> #include <linux/errno.h> #include <linux/etherdevice.h> -#include <linux/fs.h> +#include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/jiffies.h> #include <linux/kernel.h> @@ -32,16 +33,16 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/string.h> +#include <uapi/linux/batadv_packet.h> #include "hard-interface.h" #include "originator.h" -#include "packet.h" #include "routing.h" #include "send.h" #include "soft-interface.h" /** - * batadv_frag_clear_chain - delete entries in the fragment buffer chain + * batadv_frag_clear_chain() - delete entries in the fragment buffer chain * @head: head of chain with entries. * @dropped: whether the chain is cleared because all fragments are dropped * @@ -65,7 +66,7 @@ static void batadv_frag_clear_chain(struct hlist_head *head, bool dropped) } /** - * batadv_frag_purge_orig - free fragments associated to an orig + * batadv_frag_purge_orig() - free fragments associated to an orig * @orig_node: originator to free fragments from * @check_cb: optional function to tell if an entry should be purged */ @@ -89,7 +90,7 @@ void batadv_frag_purge_orig(struct batadv_orig_node *orig_node, } /** - * batadv_frag_size_limit - maximum possible size of packet to be fragmented + * batadv_frag_size_limit() - maximum possible size of packet to be fragmented * * Return: the maximum size of payload that can be fragmented. */ @@ -104,7 +105,7 @@ static int batadv_frag_size_limit(void) } /** - * batadv_frag_init_chain - check and prepare fragment chain for new fragment + * batadv_frag_init_chain() - check and prepare fragment chain for new fragment * @chain: chain in fragments table to init * @seqno: sequence number of the received fragment * @@ -134,7 +135,7 @@ static bool batadv_frag_init_chain(struct batadv_frag_table_entry *chain, } /** - * batadv_frag_insert_packet - insert a fragment into a fragment chain + * batadv_frag_insert_packet() - insert a fragment into a fragment chain * @orig_node: originator that the fragment was received from * @skb: skb to insert * @chain_out: list head to attach complete chains of fragments to @@ -248,7 +249,7 @@ err: } /** - * batadv_frag_merge_packets - merge a chain of fragments + * batadv_frag_merge_packets() - merge a chain of fragments * @chain: head of chain with fragments * * Expand the first skb in the chain and copy the content of the remaining @@ -306,7 +307,7 @@ free: } /** - * batadv_frag_skb_buffer - buffer fragment for later merge + * batadv_frag_skb_buffer() - buffer fragment for later merge * @skb: skb to buffer * @orig_node_src: originator that the skb is received from * @@ -346,7 +347,7 @@ out_err: } /** - * batadv_frag_skb_fwd - forward fragments that would exceed MTU when merged + * batadv_frag_skb_fwd() - forward fragments that would exceed MTU when merged * @skb: skb to forward * @recv_if: interface that the skb is received on * @orig_node_src: originator that the skb is received from @@ -400,7 +401,7 @@ out: } /** - * batadv_frag_create - create a fragment from skb + * batadv_frag_create() - create a fragment from skb * @skb: skb to create fragment from * @frag_head: header to use in new fragment * @fragment_size: size of new fragment @@ -438,7 +439,7 @@ err: } /** - * batadv_frag_send_packet - create up to 16 fragments from the passed skb + * batadv_frag_send_packet() - create up to 16 fragments from the passed skb * @skb: skb to create fragments from * @orig_node: final destination of the created fragments * @neigh_node: next-hop of the created fragments @@ -499,6 +500,8 @@ int batadv_frag_send_packet(struct sk_buff *skb, */ if (skb->priority >= 256 && skb->priority <= 263) frag_header.priority = skb->priority - 256; + else + frag_header.priority = 0; ether_addr_copy(frag_header.orig, primary_if->net_dev->dev_addr); ether_addr_copy(frag_header.dest, orig_node->orig); diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h index 1a2d6c308745..138b22a1836a 100644 --- a/net/batman-adv/fragmentation.h +++ b/net/batman-adv/fragmentation.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: * * Martin Hundebøll <martin@hundeboll.net> @@ -39,7 +40,7 @@ int batadv_frag_send_packet(struct sk_buff *skb, struct batadv_neigh_node *neigh_node); /** - * batadv_frag_check_entry - check if a list of fragments has timed out + * batadv_frag_check_entry() - check if a list of fragments has timed out * @frags_entry: table entry to check * * Return: true if the frags entry has timed out, false otherwise. diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 10d521f0b17f..37fe9a644f22 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: * * Marek Lindner @@ -22,7 +23,7 @@ #include <linux/byteorder/generic.h> #include <linux/errno.h> #include <linux/etherdevice.h> -#include <linux/fs.h> +#include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/in.h> @@ -42,6 +43,7 @@ #include <linux/stddef.h> #include <linux/udp.h> #include <net/sock.h> +#include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "gateway_common.h" @@ -49,7 +51,6 @@ #include "log.h" #include "netlink.h" #include "originator.h" -#include "packet.h" #include "routing.h" #include "soft-interface.h" #include "sysfs.h" @@ -68,8 +69,8 @@ #define BATADV_DHCP_CHADDR_OFFSET 28 /** - * batadv_gw_node_release - release gw_node from lists and queue for free after - * rcu grace period + * batadv_gw_node_release() - release gw_node from lists and queue for free + * after rcu grace period * @ref: kref pointer of the gw_node */ static void batadv_gw_node_release(struct kref *ref) @@ -83,7 +84,8 @@ static void batadv_gw_node_release(struct kref *ref) } /** - * batadv_gw_node_put - decrement the gw_node refcounter and possibly release it + * batadv_gw_node_put() - decrement the gw_node refcounter and possibly release + * it * @gw_node: gateway node to free */ void batadv_gw_node_put(struct batadv_gw_node *gw_node) @@ -91,6 +93,12 @@ void batadv_gw_node_put(struct batadv_gw_node *gw_node) kref_put(&gw_node->refcount, batadv_gw_node_release); } +/** + * batadv_gw_get_selected_gw_node() - Get currently selected gateway + * @bat_priv: the bat priv with all the soft interface information + * + * Return: selected gateway (with increased refcnt), NULL on errors + */ struct batadv_gw_node * batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv) { @@ -109,6 +117,12 @@ out: return gw_node; } +/** + * batadv_gw_get_selected_orig() - Get originator of currently selected gateway + * @bat_priv: the bat priv with all the soft interface information + * + * Return: orig_node of selected gateway (with increased refcnt), NULL on errors + */ struct batadv_orig_node * batadv_gw_get_selected_orig(struct batadv_priv *bat_priv) { @@ -155,7 +169,7 @@ static void batadv_gw_select(struct batadv_priv *bat_priv, } /** - * batadv_gw_reselect - force a gateway reselection + * batadv_gw_reselect() - force a gateway reselection * @bat_priv: the bat priv with all the soft interface information * * Set a flag to remind the GW component to perform a new gateway reselection. @@ -171,7 +185,7 @@ void batadv_gw_reselect(struct batadv_priv *bat_priv) } /** - * batadv_gw_check_client_stop - check if client mode has been switched off + * batadv_gw_check_client_stop() - check if client mode has been switched off * @bat_priv: the bat priv with all the soft interface information * * This function assumes the caller has checked that the gw state *is actually @@ -202,6 +216,10 @@ void batadv_gw_check_client_stop(struct batadv_priv *bat_priv) batadv_gw_node_put(curr_gw); } +/** + * batadv_gw_election() - Elect the best gateway + * @bat_priv: the bat priv with all the soft interface information + */ void batadv_gw_election(struct batadv_priv *bat_priv) { struct batadv_gw_node *curr_gw = NULL; @@ -290,6 +308,11 @@ out: batadv_neigh_ifinfo_put(router_ifinfo); } +/** + * batadv_gw_check_election() - Elect orig node as best gateway when eligible + * @bat_priv: the bat priv with all the soft interface information + * @orig_node: orig node which is to be checked + */ void batadv_gw_check_election(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) { @@ -321,7 +344,7 @@ out: } /** - * batadv_gw_node_add - add gateway node to list of available gateways + * batadv_gw_node_add() - add gateway node to list of available gateways * @bat_priv: the bat priv with all the soft interface information * @orig_node: originator announcing gateway capabilities * @gateway: announced bandwidth information @@ -364,7 +387,7 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, } /** - * batadv_gw_node_get - retrieve gateway node from list of available gateways + * batadv_gw_node_get() - retrieve gateway node from list of available gateways * @bat_priv: the bat priv with all the soft interface information * @orig_node: originator announcing gateway capabilities * @@ -393,7 +416,7 @@ struct batadv_gw_node *batadv_gw_node_get(struct batadv_priv *bat_priv, } /** - * batadv_gw_node_update - update list of available gateways with changed + * batadv_gw_node_update() - update list of available gateways with changed * bandwidth information * @bat_priv: the bat priv with all the soft interface information * @orig_node: originator announcing gateway capabilities @@ -458,6 +481,11 @@ out: batadv_gw_node_put(gw_node); } +/** + * batadv_gw_node_delete() - Remove orig_node from gateway list + * @bat_priv: the bat priv with all the soft interface information + * @orig_node: orig node which is currently in process of being removed + */ void batadv_gw_node_delete(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) { @@ -469,6 +497,10 @@ void batadv_gw_node_delete(struct batadv_priv *bat_priv, batadv_gw_node_update(bat_priv, orig_node, &gateway); } +/** + * batadv_gw_node_free() - Free gateway information from soft interface + * @bat_priv: the bat priv with all the soft interface information + */ void batadv_gw_node_free(struct batadv_priv *bat_priv) { struct batadv_gw_node *gw_node; @@ -484,6 +516,14 @@ void batadv_gw_node_free(struct batadv_priv *bat_priv) } #ifdef CONFIG_BATMAN_ADV_DEBUGFS + +/** + * batadv_gw_client_seq_print_text() - Print the gateway table in a seq file + * @seq: seq file to print on + * @offset: not used + * + * Return: always 0 + */ int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset) { struct net_device *net_dev = (struct net_device *)seq->private; @@ -514,7 +554,7 @@ int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset) #endif /** - * batadv_gw_dump - Dump gateways into a message + * batadv_gw_dump() - Dump gateways into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options * @@ -567,7 +607,7 @@ out: } /** - * batadv_gw_dhcp_recipient_get - check if a packet is a DHCP message + * batadv_gw_dhcp_recipient_get() - check if a packet is a DHCP message * @skb: the packet to check * @header_len: a pointer to the batman-adv header size * @chaddr: buffer where the client address will be stored. Valid @@ -686,7 +726,8 @@ batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, } /** - * batadv_gw_out_of_range - check if the dhcp request destination is the best gw + * batadv_gw_out_of_range() - check if the dhcp request destination is the best + * gateway * @bat_priv: the bat priv with all the soft interface information * @skb: the outgoing packet * diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index 3baa3d466e5e..981f58421a32 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: * * Marek Lindner diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 2c26039c23fc..b3e156af2256 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: * * Marek Lindner @@ -26,15 +27,15 @@ #include <linux/netdevice.h> #include <linux/stddef.h> #include <linux/string.h> +#include <uapi/linux/batadv_packet.h> #include "gateway_client.h" #include "log.h" -#include "packet.h" #include "tvlv.h" /** - * batadv_parse_throughput - parse supplied string buffer to extract throughput - * information + * batadv_parse_throughput() - parse supplied string buffer to extract + * throughput information * @net_dev: the soft interface net device * @buff: string buffer to parse * @description: text shown when throughput string cannot be parsed @@ -100,8 +101,8 @@ bool batadv_parse_throughput(struct net_device *net_dev, char *buff, } /** - * batadv_parse_gw_bandwidth - parse supplied string buffer to extract download - * and upload bandwidth information + * batadv_parse_gw_bandwidth() - parse supplied string buffer to extract + * download and upload bandwidth information * @net_dev: the soft interface net device * @buff: string buffer to parse * @down: pointer holding the returned download bandwidth information @@ -136,8 +137,8 @@ static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff, } /** - * batadv_gw_tvlv_container_update - update the gw tvlv container after gateway - * setting change + * batadv_gw_tvlv_container_update() - update the gw tvlv container after + * gateway setting change * @bat_priv: the bat priv with all the soft interface information */ void batadv_gw_tvlv_container_update(struct batadv_priv *bat_priv) @@ -164,6 +165,15 @@ void batadv_gw_tvlv_container_update(struct batadv_priv *bat_priv) } } +/** + * batadv_gw_bandwidth_set() - Parse and set download/upload gateway bandwidth + * from supplied string buffer + * @net_dev: netdev struct of the soft interface + * @buff: the buffer containing the user data + * @count: number of bytes in the buffer + * + * Return: 'count' on success or a negative error code in case of failure + */ ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff, size_t count) { @@ -207,7 +217,7 @@ ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff, } /** - * batadv_gw_tvlv_ogm_handler_v1 - process incoming gateway tvlv container + * batadv_gw_tvlv_ogm_handler_v1() - process incoming gateway tvlv container * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node of the ogm * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) @@ -248,7 +258,7 @@ static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, } /** - * batadv_gw_init - initialise the gateway handling internals + * batadv_gw_init() - initialise the gateway handling internals * @bat_priv: the bat priv with all the soft interface information */ void batadv_gw_init(struct batadv_priv *bat_priv) @@ -264,7 +274,7 @@ void batadv_gw_init(struct batadv_priv *bat_priv) } /** - * batadv_gw_free - free the gateway handling internals + * batadv_gw_free() - free the gateway handling internals * @bat_priv: the bat priv with all the soft interface information */ void batadv_gw_free(struct batadv_priv *bat_priv) diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h index 0a6a97d201f2..afebd9c7edf4 100644 --- a/net/batman-adv/gateway_common.h +++ b/net/batman-adv/gateway_common.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: * * Marek Lindner @@ -32,11 +33,12 @@ enum batadv_gw_modes { /** * enum batadv_bandwidth_units - bandwidth unit types - * @BATADV_BW_UNIT_KBIT: unit type kbit - * @BATADV_BW_UNIT_MBIT: unit type mbit */ enum batadv_bandwidth_units { + /** @BATADV_BW_UNIT_KBIT: unit type kbit */ BATADV_BW_UNIT_KBIT, + + /** @BATADV_BW_UNIT_MBIT: unit type mbit */ BATADV_BW_UNIT_MBIT, }; diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 4e3d5340ad96..5f186bff284a 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich @@ -22,7 +23,7 @@ #include <linux/bug.h> #include <linux/byteorder/generic.h> #include <linux/errno.h> -#include <linux/fs.h> +#include <linux/gfp.h> #include <linux/if.h> #include <linux/if_arp.h> #include <linux/if_ether.h> @@ -37,6 +38,7 @@ #include <linux/spinlock.h> #include <net/net_namespace.h> #include <net/rtnetlink.h> +#include <uapi/linux/batadv_packet.h> #include "bat_v.h" #include "bridge_loop_avoidance.h" @@ -45,14 +47,13 @@ #include "gateway_client.h" #include "log.h" #include "originator.h" -#include "packet.h" #include "send.h" #include "soft-interface.h" #include "sysfs.h" #include "translation-table.h" /** - * batadv_hardif_release - release hard interface from lists and queue for + * batadv_hardif_release() - release hard interface from lists and queue for * free after rcu grace period * @ref: kref pointer of the hard interface */ @@ -66,6 +67,12 @@ void batadv_hardif_release(struct kref *ref) kfree_rcu(hard_iface, rcu); } +/** + * batadv_hardif_get_by_netdev() - Get hard interface object of a net_device + * @net_dev: net_device to search for + * + * Return: batadv_hard_iface of net_dev (with increased refcnt), NULL on errors + */ struct batadv_hard_iface * batadv_hardif_get_by_netdev(const struct net_device *net_dev) { @@ -86,7 +93,7 @@ out: } /** - * batadv_getlink_net - return link net namespace (of use fallback) + * batadv_getlink_net() - return link net namespace (of use fallback) * @netdev: net_device to check * @fallback_net: return in case get_link_net is not available for @netdev * @@ -105,7 +112,7 @@ static struct net *batadv_getlink_net(const struct net_device *netdev, } /** - * batadv_mutual_parents - check if two devices are each others parent + * batadv_mutual_parents() - check if two devices are each others parent * @dev1: 1st net dev * @net1: 1st devices netns * @dev2: 2nd net dev @@ -138,7 +145,7 @@ static bool batadv_mutual_parents(const struct net_device *dev1, } /** - * batadv_is_on_batman_iface - check if a device is a batman iface descendant + * batadv_is_on_batman_iface() - check if a device is a batman iface descendant * @net_dev: the device to check * * If the user creates any virtual device on top of a batman-adv interface, it @@ -202,7 +209,7 @@ static bool batadv_is_valid_iface(const struct net_device *net_dev) } /** - * batadv_get_real_netdevice - check if the given netdev struct is a virtual + * batadv_get_real_netdevice() - check if the given netdev struct is a virtual * interface on top of another 'real' interface * @netdev: the device to check * @@ -246,7 +253,7 @@ out: } /** - * batadv_get_real_netdev - check if the given net_device struct is a virtual + * batadv_get_real_netdev() - check if the given net_device struct is a virtual * interface on top of another 'real' interface * @net_device: the device to check * @@ -265,7 +272,7 @@ struct net_device *batadv_get_real_netdev(struct net_device *net_device) } /** - * batadv_is_wext_netdev - check if the given net_device struct is a + * batadv_is_wext_netdev() - check if the given net_device struct is a * wext wifi interface * @net_device: the device to check * @@ -289,7 +296,7 @@ static bool batadv_is_wext_netdev(struct net_device *net_device) } /** - * batadv_is_cfg80211_netdev - check if the given net_device struct is a + * batadv_is_cfg80211_netdev() - check if the given net_device struct is a * cfg80211 wifi interface * @net_device: the device to check * @@ -309,7 +316,7 @@ static bool batadv_is_cfg80211_netdev(struct net_device *net_device) } /** - * batadv_wifi_flags_evaluate - calculate wifi flags for net_device + * batadv_wifi_flags_evaluate() - calculate wifi flags for net_device * @net_device: the device to check * * Return: batadv_hard_iface_wifi_flags flags of the device @@ -344,7 +351,7 @@ out: } /** - * batadv_is_cfg80211_hardif - check if the given hardif is a cfg80211 wifi + * batadv_is_cfg80211_hardif() - check if the given hardif is a cfg80211 wifi * interface * @hard_iface: the device to check * @@ -362,7 +369,7 @@ bool batadv_is_cfg80211_hardif(struct batadv_hard_iface *hard_iface) } /** - * batadv_is_wifi_hardif - check if the given hardif is a wifi interface + * batadv_is_wifi_hardif() - check if the given hardif is a wifi interface * @hard_iface: the device to check * * Return: true if the net device is a 802.11 wireless device, false otherwise. @@ -376,7 +383,7 @@ bool batadv_is_wifi_hardif(struct batadv_hard_iface *hard_iface) } /** - * batadv_hardif_no_broadcast - check whether (re)broadcast is necessary + * batadv_hardif_no_broadcast() - check whether (re)broadcast is necessary * @if_outgoing: the outgoing interface checked and considered for (re)broadcast * @orig_addr: the originator of this packet * @orig_neigh: originator address of the forwarder we just got the packet from @@ -560,6 +567,13 @@ static void batadv_hardif_recalc_extra_skbroom(struct net_device *soft_iface) soft_iface->needed_tailroom = lower_tailroom; } +/** + * batadv_hardif_min_mtu() - Calculate maximum MTU for soft interface + * @soft_iface: netdev struct of the soft interface + * + * Return: MTU for the soft-interface (limited by the minimal MTU of all active + * slave interfaces) + */ int batadv_hardif_min_mtu(struct net_device *soft_iface) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); @@ -606,7 +620,11 @@ out: return min_t(int, min_mtu - batadv_max_header_len(), ETH_DATA_LEN); } -/* adjusts the MTU if a new interface with a smaller MTU appeared. */ +/** + * batadv_update_min_mtu() - Adjusts the MTU if a new interface with a smaller + * MTU appeared + * @soft_iface: netdev struct of the soft interface + */ void batadv_update_min_mtu(struct net_device *soft_iface) { soft_iface->mtu = batadv_hardif_min_mtu(soft_iface); @@ -667,7 +685,7 @@ batadv_hardif_deactivate_interface(struct batadv_hard_iface *hard_iface) } /** - * batadv_master_del_slave - remove hard_iface from the current master interface + * batadv_master_del_slave() - remove hard_iface from the current master iface * @slave: the interface enslaved in another master * @master: the master from which slave has to be removed * @@ -691,6 +709,14 @@ static int batadv_master_del_slave(struct batadv_hard_iface *slave, return ret; } +/** + * batadv_hardif_enable_interface() - Enslave hard interface to soft interface + * @hard_iface: hard interface to add to soft interface + * @net: the applicable net namespace + * @iface_name: name of the soft interface + * + * Return: 0 on success or negative error number in case of failure + */ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, struct net *net, const char *iface_name) { @@ -802,6 +828,12 @@ err: return ret; } +/** + * batadv_hardif_disable_interface() - Remove hard interface from soft interface + * @hard_iface: hard interface to be removed + * @autodel: whether to delete soft interface when it doesn't contain any other + * slave interfaces + */ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, enum batadv_hard_if_cleanup autodel) { @@ -936,6 +968,9 @@ static void batadv_hardif_remove_interface(struct batadv_hard_iface *hard_iface) batadv_hardif_put(hard_iface); } +/** + * batadv_hardif_remove_interfaces() - Remove all hard interfaces + */ void batadv_hardif_remove_interfaces(void) { struct batadv_hard_iface *hard_iface, *hard_iface_tmp; diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index 9f9890ff7a22..de5e9a374ece 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich @@ -30,36 +31,74 @@ struct net_device; struct net; +/** + * enum batadv_hard_if_state - State of a hard interface + */ enum batadv_hard_if_state { + /** + * @BATADV_IF_NOT_IN_USE: interface is not used as slave interface of a + * batman-adv soft interface + */ BATADV_IF_NOT_IN_USE, + + /** + * @BATADV_IF_TO_BE_REMOVED: interface will be removed from soft + * interface + */ BATADV_IF_TO_BE_REMOVED, + + /** @BATADV_IF_INACTIVE: interface is deactivated */ BATADV_IF_INACTIVE, + + /** @BATADV_IF_ACTIVE: interface is used */ BATADV_IF_ACTIVE, + + /** @BATADV_IF_TO_BE_ACTIVATED: interface is getting activated */ BATADV_IF_TO_BE_ACTIVATED, + + /** + * @BATADV_IF_I_WANT_YOU: interface is queued up (using sysfs) for being + * added as slave interface of a batman-adv soft interface + */ BATADV_IF_I_WANT_YOU, }; /** * enum batadv_hard_if_bcast - broadcast avoidance options - * @BATADV_HARDIF_BCAST_OK: Do broadcast on according hard interface - * @BATADV_HARDIF_BCAST_NORECIPIENT: Broadcast not needed, there is no recipient - * @BATADV_HARDIF_BCAST_DUPFWD: There is just the neighbor we got it from - * @BATADV_HARDIF_BCAST_DUPORIG: There is just the originator */ enum batadv_hard_if_bcast { + /** @BATADV_HARDIF_BCAST_OK: Do broadcast on according hard interface */ BATADV_HARDIF_BCAST_OK = 0, + + /** + * @BATADV_HARDIF_BCAST_NORECIPIENT: Broadcast not needed, there is no + * recipient + */ BATADV_HARDIF_BCAST_NORECIPIENT, + + /** + * @BATADV_HARDIF_BCAST_DUPFWD: There is just the neighbor we got it + * from + */ BATADV_HARDIF_BCAST_DUPFWD, + + /** @BATADV_HARDIF_BCAST_DUPORIG: There is just the originator */ BATADV_HARDIF_BCAST_DUPORIG, }; /** * enum batadv_hard_if_cleanup - Cleanup modi for soft_iface after slave removal - * @BATADV_IF_CLEANUP_KEEP: Don't automatically delete soft-interface - * @BATADV_IF_CLEANUP_AUTO: Delete soft-interface after last slave was removed */ enum batadv_hard_if_cleanup { + /** + * @BATADV_IF_CLEANUP_KEEP: Don't automatically delete soft-interface + */ BATADV_IF_CLEANUP_KEEP, + + /** + * @BATADV_IF_CLEANUP_AUTO: Delete soft-interface after last slave was + * removed + */ BATADV_IF_CLEANUP_AUTO, }; @@ -82,7 +121,7 @@ int batadv_hardif_no_broadcast(struct batadv_hard_iface *if_outgoing, u8 *orig_addr, u8 *orig_neigh); /** - * batadv_hardif_put - decrement the hard interface refcounter and possibly + * batadv_hardif_put() - decrement the hard interface refcounter and possibly * release it * @hard_iface: the hard interface to free */ @@ -91,6 +130,12 @@ static inline void batadv_hardif_put(struct batadv_hard_iface *hard_iface) kref_put(&hard_iface->refcount, batadv_hardif_release); } +/** + * batadv_primary_if_get_selected() - Get reference to primary interface + * @bat_priv: the bat priv with all the soft interface information + * + * Return: primary interface (with increased refcnt), otherwise NULL + */ static inline struct batadv_hard_iface * batadv_primary_if_get_selected(struct batadv_priv *bat_priv) { diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c index b5f7e13918ac..04d964358c98 100644 --- a/net/batman-adv/hash.c +++ b/net/batman-adv/hash.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner @@ -18,7 +19,7 @@ #include "hash.h" #include "main.h" -#include <linux/fs.h> +#include <linux/gfp.h> #include <linux/lockdep.h> #include <linux/slab.h> @@ -33,7 +34,10 @@ static void batadv_hash_init(struct batadv_hashtable *hash) } } -/* free only the hashtable and the hash itself. */ +/** + * batadv_hash_destroy() - Free only the hashtable and the hash itself + * @hash: hash object to destroy + */ void batadv_hash_destroy(struct batadv_hashtable *hash) { kfree(hash->list_locks); @@ -41,7 +45,12 @@ void batadv_hash_destroy(struct batadv_hashtable *hash) kfree(hash); } -/* allocates and clears the hash */ +/** + * batadv_hash_new() - Allocates and clears the hashtable + * @size: number of hash buckets to allocate + * + * Return: newly allocated hashtable, NULL on errors + */ struct batadv_hashtable *batadv_hash_new(u32 size) { struct batadv_hashtable *hash; @@ -70,6 +79,11 @@ free_hash: return NULL; } +/** + * batadv_hash_set_lock_class() - Set specific lockdep class for hash spinlocks + * @hash: hash object to modify + * @key: lockdep class key address + */ void batadv_hash_set_lock_class(struct batadv_hashtable *hash, struct lock_class_key *key) { diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 0c905e91c5e2..4ce1b6d3ad5c 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner @@ -45,10 +46,18 @@ typedef bool (*batadv_hashdata_compare_cb)(const struct hlist_node *, typedef u32 (*batadv_hashdata_choose_cb)(const void *, u32); typedef void (*batadv_hashdata_free_cb)(struct hlist_node *, void *); +/** + * struct batadv_hashtable - Wrapper of simple hlist based hashtable + */ struct batadv_hashtable { - struct hlist_head *table; /* the hashtable itself with the buckets */ - spinlock_t *list_locks; /* spinlock for each hash list entry */ - u32 size; /* size of hashtable */ + /** @table: the hashtable itself with the buckets */ + struct hlist_head *table; + + /** @list_locks: spinlock for each hash list entry */ + spinlock_t *list_locks; + + /** @size: size of hashtable */ + u32 size; }; /* allocates and clears the hash */ @@ -62,7 +71,7 @@ void batadv_hash_set_lock_class(struct batadv_hashtable *hash, void batadv_hash_destroy(struct batadv_hashtable *hash); /** - * batadv_hash_add - adds data to the hashtable + * batadv_hash_add() - adds data to the hashtable * @hash: storage hash table * @compare: callback to determine if 2 hash elements are identical * @choose: callback calculating the hash index @@ -112,8 +121,15 @@ out: return ret; } -/* removes data from hash, if found. data could be the structure you use with - * just the key filled, we just need the key for comparing. +/** + * batadv_hash_remove() - Removes data from hash, if found + * @hash: hash table + * @compare: callback to determine if 2 hash elements are identical + * @choose: callback calculating the hash index + * @data: data passed to the aforementioned callbacks as argument + * + * ata could be the structure you use with just the key filled, we just need + * the key for comparing. * * Return: returns pointer do data on success, so you can remove the used * structure yourself, or NULL on error diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index bded31121d12..581375d0eed2 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner @@ -26,6 +27,7 @@ #include <linux/export.h> #include <linux/fcntl.h> #include <linux/fs.h> +#include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/kernel.h> #include <linux/list.h> @@ -42,11 +44,11 @@ #include <linux/string.h> #include <linux/uaccess.h> #include <linux/wait.h> +#include <uapi/linux/batadv_packet.h> #include "hard-interface.h" #include "log.h" #include "originator.h" -#include "packet.h" #include "send.h" static struct batadv_socket_client *batadv_socket_client_hash[256]; @@ -55,6 +57,9 @@ static void batadv_socket_add_packet(struct batadv_socket_client *socket_client, struct batadv_icmp_header *icmph, size_t icmp_len); +/** + * batadv_socket_init() - Initialize soft interface independent socket data + */ void batadv_socket_init(void) { memset(batadv_socket_client_hash, 0, sizeof(batadv_socket_client_hash)); @@ -292,7 +297,7 @@ out: return len; } -static unsigned int batadv_socket_poll(struct file *file, poll_table *wait) +static __poll_t batadv_socket_poll(struct file *file, poll_table *wait) { struct batadv_socket_client *socket_client = file->private_data; @@ -314,6 +319,12 @@ static const struct file_operations batadv_fops = { .llseek = no_llseek, }; +/** + * batadv_socket_setup() - Create debugfs "socket" file + * @bat_priv: the bat priv with all the soft interface information + * + * Return: 0 on success or negative error number in case of failure + */ int batadv_socket_setup(struct batadv_priv *bat_priv) { struct dentry *d; @@ -333,7 +344,7 @@ err: } /** - * batadv_socket_add_packet - schedule an icmp packet to be sent to + * batadv_socket_add_packet() - schedule an icmp packet to be sent to * userspace on an icmp socket. * @socket_client: the socket this packet belongs to * @icmph: pointer to the header of the icmp packet @@ -390,7 +401,7 @@ static void batadv_socket_add_packet(struct batadv_socket_client *socket_client, } /** - * batadv_socket_receive_packet - schedule an icmp packet to be received + * batadv_socket_receive_packet() - schedule an icmp packet to be received * locally and sent to userspace. * @icmph: pointer to the header of the icmp packet * @icmp_len: total length of the icmp packet diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h index f3fec40aae86..84cddd01eeab 100644 --- a/net/batman-adv/icmp_socket.h +++ b/net/batman-adv/icmp_socket.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c index 4ef4bde2cc2d..9be74a44e99d 100644 --- a/net/batman-adv/log.c +++ b/net/batman-adv/log.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: * * Marek Lindner @@ -24,6 +25,7 @@ #include <linux/export.h> #include <linux/fcntl.h> #include <linux/fs.h> +#include <linux/gfp.h> #include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/module.h> @@ -86,6 +88,13 @@ static int batadv_fdebug_log(struct batadv_priv_debug_log *debug_log, return 0; } +/** + * batadv_debug_log() - Add debug log entry + * @bat_priv: the bat priv with all the soft interface information + * @fmt: format string + * + * Return: 0 on success or negative error number in case of failure + */ int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...) { va_list args; @@ -176,7 +185,7 @@ static ssize_t batadv_log_read(struct file *file, char __user *buf, return error; } -static unsigned int batadv_log_poll(struct file *file, poll_table *wait) +static __poll_t batadv_log_poll(struct file *file, poll_table *wait) { struct batadv_priv *bat_priv = file->private_data; struct batadv_priv_debug_log *debug_log = bat_priv->debug_log; @@ -197,6 +206,12 @@ static const struct file_operations batadv_log_fops = { .llseek = no_llseek, }; +/** + * batadv_debug_log_setup() - Initialize debug log + * @bat_priv: the bat priv with all the soft interface information + * + * Return: 0 on success or negative error number in case of failure + */ int batadv_debug_log_setup(struct batadv_priv *bat_priv) { struct dentry *d; @@ -222,6 +237,10 @@ err: return -ENOMEM; } +/** + * batadv_debug_log_cleanup() - Destroy debug log + * @bat_priv: the bat priv with all the soft interface information + */ void batadv_debug_log_cleanup(struct batadv_priv *bat_priv) { kfree(bat_priv->debug_log); diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h index 65ce97efa6b5..35e02b2b9e72 100644 --- a/net/batman-adv/log.h +++ b/net/batman-adv/log.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich @@ -44,25 +45,33 @@ static inline void batadv_debug_log_cleanup(struct batadv_priv *bat_priv) /** * enum batadv_dbg_level - available log levels - * @BATADV_DBG_BATMAN: OGM and TQ computations related messages - * @BATADV_DBG_ROUTES: route added / changed / deleted - * @BATADV_DBG_TT: translation table messages - * @BATADV_DBG_BLA: bridge loop avoidance messages - * @BATADV_DBG_DAT: ARP snooping and DAT related messages - * @BATADV_DBG_NC: network coding related messages - * @BATADV_DBG_MCAST: multicast related messages - * @BATADV_DBG_TP_METER: throughput meter messages - * @BATADV_DBG_ALL: the union of all the above log levels */ enum batadv_dbg_level { + /** @BATADV_DBG_BATMAN: OGM and TQ computations related messages */ BATADV_DBG_BATMAN = BIT(0), + + /** @BATADV_DBG_ROUTES: route added / changed / deleted */ BATADV_DBG_ROUTES = BIT(1), + + /** @BATADV_DBG_TT: translation table messages */ BATADV_DBG_TT = BIT(2), + + /** @BATADV_DBG_BLA: bridge loop avoidance messages */ BATADV_DBG_BLA = BIT(3), + + /** @BATADV_DBG_DAT: ARP snooping and DAT related messages */ BATADV_DBG_DAT = BIT(4), + + /** @BATADV_DBG_NC: network coding related messages */ BATADV_DBG_NC = BIT(5), + + /** @BATADV_DBG_MCAST: multicast related messages */ BATADV_DBG_MCAST = BIT(6), + + /** @BATADV_DBG_TP_METER: throughput meter messages */ BATADV_DBG_TP_METER = BIT(7), + + /** @BATADV_DBG_ALL: the union of all the above log levels */ BATADV_DBG_ALL = 255, }; @@ -70,7 +79,14 @@ enum batadv_dbg_level { int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...) __printf(2, 3); -/* possibly ratelimited debug output */ +/** + * _batadv_dbg() - Store debug output with(out) ratelimiting + * @type: type of debug message + * @bat_priv: the bat priv with all the soft interface information + * @ratelimited: whether output should be rate limited + * @fmt: format string + * @arg...: variable arguments + */ #define _batadv_dbg(type, bat_priv, ratelimited, fmt, arg...) \ do { \ struct batadv_priv *__batpriv = (bat_priv); \ @@ -89,11 +105,30 @@ static inline void _batadv_dbg(int type __always_unused, } #endif +/** + * batadv_dbg() - Store debug output without ratelimiting + * @type: type of debug message + * @bat_priv: the bat priv with all the soft interface information + * @arg...: format string and variable arguments + */ #define batadv_dbg(type, bat_priv, arg...) \ _batadv_dbg(type, bat_priv, 0, ## arg) + +/** + * batadv_dbg_ratelimited() - Store debug output with ratelimiting + * @type: type of debug message + * @bat_priv: the bat priv with all the soft interface information + * @arg...: format string and variable arguments + */ #define batadv_dbg_ratelimited(type, bat_priv, arg...) \ _batadv_dbg(type, bat_priv, 1, ## arg) +/** + * batadv_info() - Store message in debug buffer and print it to kmsg buffer + * @net_dev: the soft interface net device + * @fmt: format string + * @arg...: variable arguments + */ #define batadv_info(net_dev, fmt, arg...) \ do { \ struct net_device *_netdev = (net_dev); \ @@ -101,6 +136,13 @@ static inline void _batadv_dbg(int type __always_unused, batadv_dbg(BATADV_DBG_ALL, _batpriv, fmt, ## arg); \ pr_info("%s: " fmt, _netdev->name, ## arg); \ } while (0) + +/** + * batadv_err() - Store error in debug buffer and print it to kmsg buffer + * @net_dev: the soft interface net device + * @fmt: format string + * @arg...: variable arguments + */ #define batadv_err(net_dev, fmt, arg...) \ do { \ struct net_device *_netdev = (net_dev); \ diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 4daed7ad46f2..d31c8266e244 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich @@ -18,12 +19,12 @@ #include "main.h" #include <linux/atomic.h> -#include <linux/bug.h> +#include <linux/build_bug.h> #include <linux/byteorder/generic.h> #include <linux/crc32c.h> #include <linux/errno.h> -#include <linux/fs.h> #include <linux/genetlink.h> +#include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/init.h> @@ -45,6 +46,7 @@ #include <linux/workqueue.h> #include <net/dsfield.h> #include <net/rtnetlink.h> +#include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" @@ -62,7 +64,6 @@ #include "netlink.h" #include "network-coding.h" #include "originator.h" -#include "packet.h" #include "routing.h" #include "send.h" #include "soft-interface.h" @@ -139,6 +140,12 @@ static void __exit batadv_exit(void) batadv_tt_cache_destroy(); } +/** + * batadv_mesh_init() - Initialize soft interface + * @soft_iface: netdev struct of the soft interface + * + * Return: 0 on success or negative error number in case of failure + */ int batadv_mesh_init(struct net_device *soft_iface) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); @@ -216,6 +223,10 @@ err: return ret; } +/** + * batadv_mesh_free() - Deinitialize soft interface + * @soft_iface: netdev struct of the soft interface + */ void batadv_mesh_free(struct net_device *soft_iface) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); @@ -255,8 +266,8 @@ void batadv_mesh_free(struct net_device *soft_iface) } /** - * batadv_is_my_mac - check if the given mac address belongs to any of the real - * interfaces in the current mesh + * batadv_is_my_mac() - check if the given mac address belongs to any of the + * real interfaces in the current mesh * @bat_priv: the bat priv with all the soft interface information * @addr: the address to check * @@ -286,7 +297,7 @@ bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr) #ifdef CONFIG_BATMAN_ADV_DEBUGFS /** - * batadv_seq_print_text_primary_if_get - called from debugfs table printing + * batadv_seq_print_text_primary_if_get() - called from debugfs table printing * function that requires the primary interface * @seq: debugfs table seq_file struct * @@ -323,7 +334,7 @@ out: #endif /** - * batadv_max_header_len - calculate maximum encapsulation overhead for a + * batadv_max_header_len() - calculate maximum encapsulation overhead for a * payload packet * * Return: the maximum encapsulation overhead in bytes. @@ -348,7 +359,7 @@ int batadv_max_header_len(void) } /** - * batadv_skb_set_priority - sets skb priority according to packet content + * batadv_skb_set_priority() - sets skb priority according to packet content * @skb: the packet to be sent * @offset: offset to the packet content * @@ -412,6 +423,16 @@ static int batadv_recv_unhandled_packet(struct sk_buff *skb, /* incoming packets with the batman ethertype received on any active hard * interface */ + +/** + * batadv_batman_skb_recv() - Handle incoming message from an hard interface + * @skb: the received packet + * @dev: the net device that the packet was received on + * @ptype: packet type of incoming packet (ETH_P_BATMAN) + * @orig_dev: the original receive net device (e.g. bonded device) + * + * Return: NET_RX_SUCCESS on success or NET_RX_DROP in case of failure + */ int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype, struct net_device *orig_dev) @@ -535,6 +556,13 @@ static void batadv_recv_handler_init(void) batadv_rx_handler[BATADV_UNICAST_FRAG] = batadv_recv_frag_packet; } +/** + * batadv_recv_handler_register() - Register handler for batman-adv packet type + * @packet_type: batadv_packettype which should be handled + * @recv_handler: receive handler for the packet type + * + * Return: 0 on success or negative error number in case of failure + */ int batadv_recv_handler_register(u8 packet_type, int (*recv_handler)(struct sk_buff *, @@ -552,13 +580,17 @@ batadv_recv_handler_register(u8 packet_type, return 0; } +/** + * batadv_recv_handler_unregister() - Unregister handler for packet type + * @packet_type: batadv_packettype which should no longer be handled + */ void batadv_recv_handler_unregister(u8 packet_type) { batadv_rx_handler[packet_type] = batadv_recv_unhandled_packet; } /** - * batadv_skb_crc32 - calculate CRC32 of the whole packet and skip bytes in + * batadv_skb_crc32() - calculate CRC32 of the whole packet and skip bytes in * the header * @skb: skb pointing to fragmented socket buffers * @payload_ptr: Pointer to position inside the head buffer of the skb @@ -591,7 +623,7 @@ __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr) } /** - * batadv_get_vid - extract the VLAN identifier from skb if any + * batadv_get_vid() - extract the VLAN identifier from skb if any * @skb: the buffer containing the packet * @header_len: length of the batman header preceding the ethernet header * @@ -618,7 +650,7 @@ unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len) } /** - * batadv_vlan_ap_isola_get - return the AP isolation status for the given vlan + * batadv_vlan_ap_isola_get() - return AP isolation status for the given vlan * @bat_priv: the bat priv with all the soft interface information * @vid: the VLAN identifier for which the AP isolation attributed as to be * looked up diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index edb2f239d04d..f7ba3f96d8f3 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich @@ -24,7 +25,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2017.4" +#define BATADV_SOURCE_VERSION "2018.0" #endif /* B.A.T.M.A.N. parameters */ @@ -140,24 +141,56 @@ */ #define BATADV_TP_MAX_NUM 5 +/** + * enum batadv_mesh_state - State of a soft interface + */ enum batadv_mesh_state { + /** @BATADV_MESH_INACTIVE: soft interface is not yet running */ BATADV_MESH_INACTIVE, + + /** @BATADV_MESH_ACTIVE: interface is up and running */ BATADV_MESH_ACTIVE, + + /** @BATADV_MESH_DEACTIVATING: interface is getting shut down */ BATADV_MESH_DEACTIVATING, }; #define BATADV_BCAST_QUEUE_LEN 256 #define BATADV_BATMAN_QUEUE_LEN 256 +/** + * enum batadv_uev_action - action type of uevent + */ enum batadv_uev_action { + /** @BATADV_UEV_ADD: gateway was selected (after none was selected) */ BATADV_UEV_ADD = 0, + + /** + * @BATADV_UEV_DEL: selected gateway was removed and none is selected + * anymore + */ BATADV_UEV_DEL, + + /** + * @BATADV_UEV_CHANGE: a different gateway was selected as based gateway + */ BATADV_UEV_CHANGE, + + /** + * @BATADV_UEV_LOOPDETECT: loop was detected which cannot be handled by + * bridge loop avoidance + */ BATADV_UEV_LOOPDETECT, }; +/** + * enum batadv_uev_type - Type of uevent + */ enum batadv_uev_type { + /** @BATADV_UEV_GW: selected gateway was modified */ BATADV_UEV_GW = 0, + + /** @BATADV_UEV_BLA: bridge loop avoidance event */ BATADV_UEV_BLA, }; @@ -184,16 +217,14 @@ enum batadv_uev_type { /* Kernel headers */ -#include <linux/bitops.h> /* for packet.h */ #include <linux/compiler.h> #include <linux/etherdevice.h> -#include <linux/if_ether.h> /* for packet.h */ #include <linux/if_vlan.h> #include <linux/jiffies.h> #include <linux/percpu.h> #include <linux/types.h> +#include <uapi/linux/batadv_packet.h> -#include "packet.h" #include "types.h" struct net_device; @@ -202,7 +233,7 @@ struct seq_file; struct sk_buff; /** - * batadv_print_vid - return printable version of vid information + * batadv_print_vid() - return printable version of vid information * @vid: the VLAN identifier * * Return: -1 when no VLAN is used, VLAN id otherwise @@ -238,7 +269,7 @@ void batadv_recv_handler_unregister(u8 packet_type); __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr); /** - * batadv_compare_eth - Compare two not u16 aligned Ethernet addresses + * batadv_compare_eth() - Compare two not u16 aligned Ethernet addresses * @data1: Pointer to a six-byte array containing the Ethernet address * @data2: Pointer other six-byte array containing the Ethernet address * @@ -252,7 +283,7 @@ static inline bool batadv_compare_eth(const void *data1, const void *data2) } /** - * batadv_has_timed_out - compares current time (jiffies) and timestamp + + * batadv_has_timed_out() - compares current time (jiffies) and timestamp + * timeout * @timestamp: base value to compare with (in jiffies) * @timeout: added to base value before comparing (in milliseconds) @@ -265,40 +296,96 @@ static inline bool batadv_has_timed_out(unsigned long timestamp, return time_is_before_jiffies(timestamp + msecs_to_jiffies(timeout)); } +/** + * batadv_atomic_dec_not_zero() - Decrease unless the number is 0 + * @v: pointer of type atomic_t + * + * Return: non-zero if v was not 0, and zero otherwise. + */ #define batadv_atomic_dec_not_zero(v) atomic_add_unless((v), -1, 0) -/* Returns the smallest signed integer in two's complement with the sizeof x */ +/** + * batadv_smallest_signed_int() - Returns the smallest signed integer in two's + * complement with the sizeof x + * @x: type of integer + * + * Return: smallest signed integer of type + */ #define batadv_smallest_signed_int(x) (1u << (7u + 8u * (sizeof(x) - 1u))) -/* Checks if a sequence number x is a predecessor/successor of y. - * they handle overflows/underflows and can correctly check for a - * predecessor/successor unless the variable sequence number has grown by - * more then 2**(bitwidth(x)-1)-1. +/** + * batadv_seq_before() - Checks if a sequence number x is a predecessor of y + * @x: potential predecessor of @y + * @y: value to compare @x against + * + * It handles overflows/underflows and can correctly check for a predecessor + * unless the variable sequence number has grown by more then + * 2**(bitwidth(x)-1)-1. + * * This means that for a u8 with the maximum value 255, it would think: - * - when adding nothing - it is neither a predecessor nor a successor - * - before adding more than 127 to the starting value - it is a predecessor, - * - when adding 128 - it is neither a predecessor nor a successor, - * - after adding more than 127 to the starting value - it is a successor + * + * * when adding nothing - it is neither a predecessor nor a successor + * * before adding more than 127 to the starting value - it is a predecessor, + * * when adding 128 - it is neither a predecessor nor a successor, + * * after adding more than 127 to the starting value - it is a successor + * + * Return: true when x is a predecessor of y, false otherwise */ #define batadv_seq_before(x, y) ({typeof(x)_d1 = (x); \ typeof(y)_d2 = (y); \ typeof(x)_dummy = (_d1 - _d2); \ (void)(&_d1 == &_d2); \ _dummy > batadv_smallest_signed_int(_dummy); }) + +/** + * batadv_seq_after() - Checks if a sequence number x is a successor of y + * @x: potential sucessor of @y + * @y: value to compare @x against + * + * It handles overflows/underflows and can correctly check for a successor + * unless the variable sequence number has grown by more then + * 2**(bitwidth(x)-1)-1. + * + * This means that for a u8 with the maximum value 255, it would think: + * + * * when adding nothing - it is neither a predecessor nor a successor + * * before adding more than 127 to the starting value - it is a predecessor, + * * when adding 128 - it is neither a predecessor nor a successor, + * * after adding more than 127 to the starting value - it is a successor + * + * Return: true when x is a successor of y, false otherwise + */ #define batadv_seq_after(x, y) batadv_seq_before(y, x) -/* Stop preemption on local cpu while incrementing the counter */ +/** + * batadv_add_counter() - Add to per cpu statistics counter of soft interface + * @bat_priv: the bat priv with all the soft interface information + * @idx: counter index which should be modified + * @count: value to increase counter by + * + * Stop preemption on local cpu while incrementing the counter + */ static inline void batadv_add_counter(struct batadv_priv *bat_priv, size_t idx, size_t count) { this_cpu_add(bat_priv->bat_counters[idx], count); } +/** + * batadv_inc_counter() - Increase per cpu statistics counter of soft interface + * @b: the bat priv with all the soft interface information + * @i: counter index which should be modified + */ #define batadv_inc_counter(b, i) batadv_add_counter(b, i, 1) -/* Define a macro to reach the control buffer of the skb. The members of the - * control buffer are defined in struct batadv_skb_cb in types.h. - * The macro is inspired by the similar macro TCP_SKB_CB() in tcp.h. +/** + * BATADV_SKB_CB() - Get batadv_skb_cb from skb control buffer + * @__skb: skb holding the control buffer + * + * The members of the control buffer are defined in struct batadv_skb_cb in + * types.h. The macro is inspired by the similar macro TCP_SKB_CB() in tcp.h. + * + * Return: pointer to the batadv_skb_cb of the skb */ #define BATADV_SKB_CB(__skb) ((struct batadv_skb_cb *)&((__skb)->cb[0])) diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index e553a8770a89..cbdeb47ec3f6 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2014-2017 B.A.T.M.A.N. contributors: * * Linus Lüssing @@ -24,7 +25,7 @@ #include <linux/byteorder/generic.h> #include <linux/errno.h> #include <linux/etherdevice.h> -#include <linux/fs.h> +#include <linux/gfp.h> #include <linux/icmpv6.h> #include <linux/if_bridge.h> #include <linux/if_ether.h> @@ -54,18 +55,18 @@ #include <net/if_inet6.h> #include <net/ip.h> #include <net/ipv6.h> +#include <uapi/linux/batadv_packet.h> #include "hard-interface.h" #include "hash.h" #include "log.h" -#include "packet.h" #include "translation-table.h" #include "tvlv.h" static void batadv_mcast_mla_update(struct work_struct *work); /** - * batadv_mcast_start_timer - schedule the multicast periodic worker + * batadv_mcast_start_timer() - schedule the multicast periodic worker * @bat_priv: the bat priv with all the soft interface information */ static void batadv_mcast_start_timer(struct batadv_priv *bat_priv) @@ -75,7 +76,7 @@ static void batadv_mcast_start_timer(struct batadv_priv *bat_priv) } /** - * batadv_mcast_get_bridge - get the bridge on top of the softif if it exists + * batadv_mcast_get_bridge() - get the bridge on top of the softif if it exists * @soft_iface: netdev struct of the mesh interface * * If the given soft interface has a bridge on top then the refcount @@ -101,7 +102,7 @@ static struct net_device *batadv_mcast_get_bridge(struct net_device *soft_iface) } /** - * batadv_mcast_mla_softif_get - get softif multicast listeners + * batadv_mcast_mla_softif_get() - get softif multicast listeners * @dev: the device to collect multicast addresses from * @mcast_list: a list to put found addresses into * @@ -147,7 +148,7 @@ static int batadv_mcast_mla_softif_get(struct net_device *dev, } /** - * batadv_mcast_mla_is_duplicate - check whether an address is in a list + * batadv_mcast_mla_is_duplicate() - check whether an address is in a list * @mcast_addr: the multicast address to check * @mcast_list: the list with multicast addresses to search in * @@ -167,7 +168,7 @@ static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr, } /** - * batadv_mcast_mla_br_addr_cpy - copy a bridge multicast address + * batadv_mcast_mla_br_addr_cpy() - copy a bridge multicast address * @dst: destination to write to - a multicast MAC address * @src: source to read from - a multicast IP address * @@ -191,7 +192,7 @@ static void batadv_mcast_mla_br_addr_cpy(char *dst, const struct br_ip *src) } /** - * batadv_mcast_mla_bridge_get - get bridged-in multicast listeners + * batadv_mcast_mla_bridge_get() - get bridged-in multicast listeners * @dev: a bridge slave whose bridge to collect multicast addresses from * @mcast_list: a list to put found addresses into * @@ -244,7 +245,7 @@ out: } /** - * batadv_mcast_mla_list_free - free a list of multicast addresses + * batadv_mcast_mla_list_free() - free a list of multicast addresses * @mcast_list: the list to free * * Removes and frees all items in the given mcast_list. @@ -261,7 +262,7 @@ static void batadv_mcast_mla_list_free(struct hlist_head *mcast_list) } /** - * batadv_mcast_mla_tt_retract - clean up multicast listener announcements + * batadv_mcast_mla_tt_retract() - clean up multicast listener announcements * @bat_priv: the bat priv with all the soft interface information * @mcast_list: a list of addresses which should _not_ be removed * @@ -297,7 +298,7 @@ static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv, } /** - * batadv_mcast_mla_tt_add - add multicast listener announcements + * batadv_mcast_mla_tt_add() - add multicast listener announcements * @bat_priv: the bat priv with all the soft interface information * @mcast_list: a list of addresses which are going to get added * @@ -333,7 +334,7 @@ static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv, } /** - * batadv_mcast_has_bridge - check whether the soft-iface is bridged + * batadv_mcast_has_bridge() - check whether the soft-iface is bridged * @bat_priv: the bat priv with all the soft interface information * * Checks whether there is a bridge on top of our soft interface. @@ -354,7 +355,8 @@ static bool batadv_mcast_has_bridge(struct batadv_priv *bat_priv) } /** - * batadv_mcast_querier_log - debug output regarding the querier status on link + * batadv_mcast_querier_log() - debug output regarding the querier status on + * link * @bat_priv: the bat priv with all the soft interface information * @str_proto: a string for the querier protocol (e.g. "IGMP" or "MLD") * @old_state: the previous querier state on our link @@ -405,7 +407,8 @@ batadv_mcast_querier_log(struct batadv_priv *bat_priv, char *str_proto, } /** - * batadv_mcast_bridge_log - debug output for topology changes in bridged setups + * batadv_mcast_bridge_log() - debug output for topology changes in bridged + * setups * @bat_priv: the bat priv with all the soft interface information * @bridged: a flag about whether the soft interface is currently bridged or not * @querier_ipv4: (maybe) new status of a potential, selected IGMP querier @@ -444,7 +447,7 @@ batadv_mcast_bridge_log(struct batadv_priv *bat_priv, bool bridged, } /** - * batadv_mcast_flags_logs - output debug information about mcast flag changes + * batadv_mcast_flags_logs() - output debug information about mcast flag changes * @bat_priv: the bat priv with all the soft interface information * @flags: flags indicating the new multicast state * @@ -470,7 +473,7 @@ static void batadv_mcast_flags_log(struct batadv_priv *bat_priv, u8 flags) } /** - * batadv_mcast_mla_tvlv_update - update multicast tvlv + * batadv_mcast_mla_tvlv_update() - update multicast tvlv * @bat_priv: the bat priv with all the soft interface information * * Updates the own multicast tvlv with our current multicast related settings, @@ -545,7 +548,7 @@ update: } /** - * __batadv_mcast_mla_update - update the own MLAs + * __batadv_mcast_mla_update() - update the own MLAs * @bat_priv: the bat priv with all the soft interface information * * Updates the own multicast listener announcements in the translation @@ -582,7 +585,7 @@ out: } /** - * batadv_mcast_mla_update - update the own MLAs + * batadv_mcast_mla_update() - update the own MLAs * @work: kernel work struct * * Updates the own multicast listener announcements in the translation @@ -605,7 +608,7 @@ static void batadv_mcast_mla_update(struct work_struct *work) } /** - * batadv_mcast_is_report_ipv4 - check for IGMP reports + * batadv_mcast_is_report_ipv4() - check for IGMP reports * @skb: the ethernet frame destined for the mesh * * This call might reallocate skb data. @@ -630,7 +633,8 @@ static bool batadv_mcast_is_report_ipv4(struct sk_buff *skb) } /** - * batadv_mcast_forw_mode_check_ipv4 - check for optimized forwarding potential + * batadv_mcast_forw_mode_check_ipv4() - check for optimized forwarding + * potential * @bat_priv: the bat priv with all the soft interface information * @skb: the IPv4 packet to check * @is_unsnoopable: stores whether the destination is snoopable @@ -671,7 +675,7 @@ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv, } /** - * batadv_mcast_is_report_ipv6 - check for MLD reports + * batadv_mcast_is_report_ipv6() - check for MLD reports * @skb: the ethernet frame destined for the mesh * * This call might reallocate skb data. @@ -695,7 +699,8 @@ static bool batadv_mcast_is_report_ipv6(struct sk_buff *skb) } /** - * batadv_mcast_forw_mode_check_ipv6 - check for optimized forwarding potential + * batadv_mcast_forw_mode_check_ipv6() - check for optimized forwarding + * potential * @bat_priv: the bat priv with all the soft interface information * @skb: the IPv6 packet to check * @is_unsnoopable: stores whether the destination is snoopable @@ -736,7 +741,7 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, } /** - * batadv_mcast_forw_mode_check - check for optimized forwarding potential + * batadv_mcast_forw_mode_check() - check for optimized forwarding potential * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast frame to check * @is_unsnoopable: stores whether the destination is snoopable @@ -774,7 +779,7 @@ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv, } /** - * batadv_mcast_forw_want_all_ip_count - count nodes with unspecific mcast + * batadv_mcast_forw_want_all_ip_count() - count nodes with unspecific mcast * interest * @bat_priv: the bat priv with all the soft interface information * @ethhdr: ethernet header of a packet @@ -798,7 +803,7 @@ static int batadv_mcast_forw_want_all_ip_count(struct batadv_priv *bat_priv, } /** - * batadv_mcast_forw_tt_node_get - get a multicast tt node + * batadv_mcast_forw_tt_node_get() - get a multicast tt node * @bat_priv: the bat priv with all the soft interface information * @ethhdr: the ether header containing the multicast destination * @@ -814,7 +819,7 @@ batadv_mcast_forw_tt_node_get(struct batadv_priv *bat_priv, } /** - * batadv_mcast_forw_ipv4_node_get - get a node with an ipv4 flag + * batadv_mcast_forw_ipv4_node_get() - get a node with an ipv4 flag * @bat_priv: the bat priv with all the soft interface information * * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 flag set and @@ -841,7 +846,7 @@ batadv_mcast_forw_ipv4_node_get(struct batadv_priv *bat_priv) } /** - * batadv_mcast_forw_ipv6_node_get - get a node with an ipv6 flag + * batadv_mcast_forw_ipv6_node_get() - get a node with an ipv6 flag * @bat_priv: the bat priv with all the soft interface information * * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV6 flag set @@ -868,7 +873,7 @@ batadv_mcast_forw_ipv6_node_get(struct batadv_priv *bat_priv) } /** - * batadv_mcast_forw_ip_node_get - get a node with an ipv4/ipv6 flag + * batadv_mcast_forw_ip_node_get() - get a node with an ipv4/ipv6 flag * @bat_priv: the bat priv with all the soft interface information * @ethhdr: an ethernet header to determine the protocol family from * @@ -892,7 +897,7 @@ batadv_mcast_forw_ip_node_get(struct batadv_priv *bat_priv, } /** - * batadv_mcast_forw_unsnoop_node_get - get a node with an unsnoopable flag + * batadv_mcast_forw_unsnoop_node_get() - get a node with an unsnoopable flag * @bat_priv: the bat priv with all the soft interface information * * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag @@ -919,7 +924,7 @@ batadv_mcast_forw_unsnoop_node_get(struct batadv_priv *bat_priv) } /** - * batadv_mcast_forw_mode - check on how to forward a multicast packet + * batadv_mcast_forw_mode() - check on how to forward a multicast packet * @bat_priv: the bat priv with all the soft interface information * @skb: The multicast packet to check * @orig: an originator to be set to forward the skb to @@ -973,7 +978,7 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, } /** - * batadv_mcast_want_unsnoop_update - update unsnoop counter and list + * batadv_mcast_want_unsnoop_update() - update unsnoop counter and list * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state @@ -1018,7 +1023,7 @@ static void batadv_mcast_want_unsnoop_update(struct batadv_priv *bat_priv, } /** - * batadv_mcast_want_ipv4_update - update want-all-ipv4 counter and list + * batadv_mcast_want_ipv4_update() - update want-all-ipv4 counter and list * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state @@ -1063,7 +1068,7 @@ static void batadv_mcast_want_ipv4_update(struct batadv_priv *bat_priv, } /** - * batadv_mcast_want_ipv6_update - update want-all-ipv6 counter and list + * batadv_mcast_want_ipv6_update() - update want-all-ipv6 counter and list * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state @@ -1108,7 +1113,7 @@ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv, } /** - * batadv_mcast_tvlv_ogm_handler - process incoming multicast tvlv container + * batadv_mcast_tvlv_ogm_handler() - process incoming multicast tvlv container * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node of the ogm * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) @@ -1164,7 +1169,7 @@ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv, } /** - * batadv_mcast_init - initialize the multicast optimizations structures + * batadv_mcast_init() - initialize the multicast optimizations structures * @bat_priv: the bat priv with all the soft interface information */ void batadv_mcast_init(struct batadv_priv *bat_priv) @@ -1179,7 +1184,7 @@ void batadv_mcast_init(struct batadv_priv *bat_priv) #ifdef CONFIG_BATMAN_ADV_DEBUGFS /** - * batadv_mcast_flags_print_header - print own mcast flags to debugfs table + * batadv_mcast_flags_print_header() - print own mcast flags to debugfs table * @bat_priv: the bat priv with all the soft interface information * @seq: debugfs table seq_file struct * @@ -1220,7 +1225,7 @@ static void batadv_mcast_flags_print_header(struct batadv_priv *bat_priv, } /** - * batadv_mcast_flags_seq_print_text - print the mcast flags of other nodes + * batadv_mcast_flags_seq_print_text() - print the mcast flags of other nodes * @seq: seq file to print on * @offset: not used * @@ -1281,7 +1286,7 @@ int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset) #endif /** - * batadv_mcast_free - free the multicast optimizations structures + * batadv_mcast_free() - free the multicast optimizations structures * @bat_priv: the bat priv with all the soft interface information */ void batadv_mcast_free(struct batadv_priv *bat_priv) @@ -1296,7 +1301,7 @@ void batadv_mcast_free(struct batadv_priv *bat_priv) } /** - * batadv_mcast_purge_orig - reset originator global mcast state modifications + * batadv_mcast_purge_orig() - reset originator global mcast state modifications * @orig: the originator which is going to get purged */ void batadv_mcast_purge_orig(struct batadv_orig_node *orig) diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 2a78cddab0e9..3ac06337ab71 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2014-2017 B.A.T.M.A.N. contributors: * * Linus Lüssing @@ -25,15 +26,21 @@ struct sk_buff; /** * enum batadv_forw_mode - the way a packet should be forwarded as - * @BATADV_FORW_ALL: forward the packet to all nodes (currently via classic - * flooding) - * @BATADV_FORW_SINGLE: forward the packet to a single node (currently via the - * BATMAN unicast routing protocol) - * @BATADV_FORW_NONE: don't forward, drop it */ enum batadv_forw_mode { + /** + * @BATADV_FORW_ALL: forward the packet to all nodes (currently via + * classic flooding) + */ BATADV_FORW_ALL, + + /** + * @BATADV_FORW_SINGLE: forward the packet to a single node (currently + * via the BATMAN unicast routing protocol) + */ BATADV_FORW_SINGLE, + + /** @BATADV_FORW_NONE: don't forward, drop it */ BATADV_FORW_NONE, }; diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index ab13b4d58733..a823d3899bad 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2016-2017 B.A.T.M.A.N. contributors: * * Matthias Schiffer @@ -23,8 +24,8 @@ #include <linux/cache.h> #include <linux/errno.h> #include <linux/export.h> -#include <linux/fs.h> #include <linux/genetlink.h> +#include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/init.h> #include <linux/kernel.h> @@ -39,6 +40,7 @@ #include <net/genetlink.h> #include <net/netlink.h> #include <net/sock.h> +#include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" @@ -46,7 +48,6 @@ #include "gateway_client.h" #include "hard-interface.h" #include "originator.h" -#include "packet.h" #include "soft-interface.h" #include "tp_meter.h" #include "translation-table.h" @@ -99,7 +100,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { }; /** - * batadv_netlink_get_ifindex - Extract an interface index from a message + * batadv_netlink_get_ifindex() - Extract an interface index from a message * @nlh: Message header * @attrtype: Attribute which holds an interface index * @@ -114,7 +115,7 @@ batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype) } /** - * batadv_netlink_mesh_info_put - fill in generic information about mesh + * batadv_netlink_mesh_info_put() - fill in generic information about mesh * interface * @msg: netlink message to be sent back * @soft_iface: interface for which the data should be taken @@ -169,7 +170,7 @@ batadv_netlink_mesh_info_put(struct sk_buff *msg, struct net_device *soft_iface) } /** - * batadv_netlink_get_mesh_info - handle incoming BATADV_CMD_GET_MESH_INFO + * batadv_netlink_get_mesh_info() - handle incoming BATADV_CMD_GET_MESH_INFO * netlink request * @skb: received netlink message * @info: receiver information @@ -230,7 +231,7 @@ batadv_netlink_get_mesh_info(struct sk_buff *skb, struct genl_info *info) } /** - * batadv_netlink_tp_meter_put - Fill information of started tp_meter session + * batadv_netlink_tp_meter_put() - Fill information of started tp_meter session * @msg: netlink message to be sent back * @cookie: tp meter session cookie * @@ -246,7 +247,7 @@ batadv_netlink_tp_meter_put(struct sk_buff *msg, u32 cookie) } /** - * batadv_netlink_tpmeter_notify - send tp_meter result via netlink to client + * batadv_netlink_tpmeter_notify() - send tp_meter result via netlink to client * @bat_priv: the bat priv with all the soft interface information * @dst: destination of tp_meter session * @result: reason for tp meter session stop @@ -309,7 +310,7 @@ err_genlmsg: } /** - * batadv_netlink_tp_meter_start - Start a new tp_meter session + * batadv_netlink_tp_meter_start() - Start a new tp_meter session * @skb: received netlink message * @info: receiver information * @@ -386,7 +387,7 @@ batadv_netlink_tp_meter_start(struct sk_buff *skb, struct genl_info *info) } /** - * batadv_netlink_tp_meter_start - Cancel a running tp_meter session + * batadv_netlink_tp_meter_start() - Cancel a running tp_meter session * @skb: received netlink message * @info: receiver information * @@ -431,7 +432,7 @@ out: } /** - * batadv_netlink_dump_hardif_entry - Dump one hard interface into a message + * batadv_netlink_dump_hardif_entry() - Dump one hard interface into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message @@ -473,7 +474,7 @@ batadv_netlink_dump_hardif_entry(struct sk_buff *msg, u32 portid, u32 seq, } /** - * batadv_netlink_dump_hardifs - Dump all hard interface into a messages + * batadv_netlink_dump_hardifs() - Dump all hard interface into a messages * @msg: Netlink message to dump into * @cb: Parameters from query * @@ -620,7 +621,7 @@ struct genl_family batadv_netlink_family __ro_after_init = { }; /** - * batadv_netlink_register - register batadv genl netlink family + * batadv_netlink_register() - register batadv genl netlink family */ void __init batadv_netlink_register(void) { @@ -632,7 +633,7 @@ void __init batadv_netlink_register(void) } /** - * batadv_netlink_unregister - unregister batadv genl netlink family + * batadv_netlink_unregister() - unregister batadv genl netlink family */ void batadv_netlink_unregister(void) { diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h index f1cd8c5da966..0e7e57b69b54 100644 --- a/net/batman-adv/netlink.h +++ b/net/batman-adv/netlink.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2016-2017 B.A.T.M.A.N. contributors: * * Matthias Schiffer diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 3604d7899e2c..b48116bb24ef 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen @@ -25,7 +26,7 @@ #include <linux/debugfs.h> #include <linux/errno.h> #include <linux/etherdevice.h> -#include <linux/fs.h> +#include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/if_packet.h> #include <linux/init.h> @@ -35,6 +36,7 @@ #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> +#include <linux/net.h> #include <linux/netdevice.h> #include <linux/printk.h> #include <linux/random.h> @@ -47,12 +49,12 @@ #include <linux/stddef.h> #include <linux/string.h> #include <linux/workqueue.h> +#include <uapi/linux/batadv_packet.h> #include "hard-interface.h" #include "hash.h" #include "log.h" #include "originator.h" -#include "packet.h" #include "routing.h" #include "send.h" #include "tvlv.h" @@ -65,7 +67,7 @@ static int batadv_nc_recv_coded_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if); /** - * batadv_nc_init - one-time initialization for network coding + * batadv_nc_init() - one-time initialization for network coding * * Return: 0 on success or negative error number in case of failure */ @@ -81,7 +83,7 @@ int __init batadv_nc_init(void) } /** - * batadv_nc_start_timer - initialise the nc periodic worker + * batadv_nc_start_timer() - initialise the nc periodic worker * @bat_priv: the bat priv with all the soft interface information */ static void batadv_nc_start_timer(struct batadv_priv *bat_priv) @@ -91,7 +93,7 @@ static void batadv_nc_start_timer(struct batadv_priv *bat_priv) } /** - * batadv_nc_tvlv_container_update - update the network coding tvlv container + * batadv_nc_tvlv_container_update() - update the network coding tvlv container * after network coding setting change * @bat_priv: the bat priv with all the soft interface information */ @@ -113,7 +115,7 @@ static void batadv_nc_tvlv_container_update(struct batadv_priv *bat_priv) } /** - * batadv_nc_status_update - update the network coding tvlv container after + * batadv_nc_status_update() - update the network coding tvlv container after * network coding setting change * @net_dev: the soft interface net device */ @@ -125,7 +127,7 @@ void batadv_nc_status_update(struct net_device *net_dev) } /** - * batadv_nc_tvlv_ogm_handler_v1 - process incoming nc tvlv container + * batadv_nc_tvlv_ogm_handler_v1() - process incoming nc tvlv container * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node of the ogm * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) @@ -144,7 +146,7 @@ static void batadv_nc_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, } /** - * batadv_nc_mesh_init - initialise coding hash table and start house keeping + * batadv_nc_mesh_init() - initialise coding hash table and start house keeping * @bat_priv: the bat priv with all the soft interface information * * Return: 0 on success or negative error number in case of failure @@ -185,7 +187,7 @@ err: } /** - * batadv_nc_init_bat_priv - initialise the nc specific bat_priv variables + * batadv_nc_init_bat_priv() - initialise the nc specific bat_priv variables * @bat_priv: the bat priv with all the soft interface information */ void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv) @@ -197,7 +199,7 @@ void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv) } /** - * batadv_nc_init_orig - initialise the nc fields of an orig_node + * batadv_nc_init_orig() - initialise the nc fields of an orig_node * @orig_node: the orig_node which is going to be initialised */ void batadv_nc_init_orig(struct batadv_orig_node *orig_node) @@ -209,8 +211,8 @@ void batadv_nc_init_orig(struct batadv_orig_node *orig_node) } /** - * batadv_nc_node_release - release nc_node from lists and queue for free after - * rcu grace period + * batadv_nc_node_release() - release nc_node from lists and queue for free + * after rcu grace period * @ref: kref pointer of the nc_node */ static void batadv_nc_node_release(struct kref *ref) @@ -224,7 +226,7 @@ static void batadv_nc_node_release(struct kref *ref) } /** - * batadv_nc_node_put - decrement the nc_node refcounter and possibly + * batadv_nc_node_put() - decrement the nc_node refcounter and possibly * release it * @nc_node: nc_node to be free'd */ @@ -234,8 +236,8 @@ static void batadv_nc_node_put(struct batadv_nc_node *nc_node) } /** - * batadv_nc_path_release - release nc_path from lists and queue for free after - * rcu grace period + * batadv_nc_path_release() - release nc_path from lists and queue for free + * after rcu grace period * @ref: kref pointer of the nc_path */ static void batadv_nc_path_release(struct kref *ref) @@ -248,7 +250,7 @@ static void batadv_nc_path_release(struct kref *ref) } /** - * batadv_nc_path_put - decrement the nc_path refcounter and possibly + * batadv_nc_path_put() - decrement the nc_path refcounter and possibly * release it * @nc_path: nc_path to be free'd */ @@ -258,7 +260,7 @@ static void batadv_nc_path_put(struct batadv_nc_path *nc_path) } /** - * batadv_nc_packet_free - frees nc packet + * batadv_nc_packet_free() - frees nc packet * @nc_packet: the nc packet to free * @dropped: whether the packet is freed because is is dropped */ @@ -275,7 +277,7 @@ static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet, } /** - * batadv_nc_to_purge_nc_node - checks whether an nc node has to be purged + * batadv_nc_to_purge_nc_node() - checks whether an nc node has to be purged * @bat_priv: the bat priv with all the soft interface information * @nc_node: the nc node to check * @@ -291,7 +293,7 @@ static bool batadv_nc_to_purge_nc_node(struct batadv_priv *bat_priv, } /** - * batadv_nc_to_purge_nc_path_coding - checks whether an nc path has timed out + * batadv_nc_to_purge_nc_path_coding() - checks whether an nc path has timed out * @bat_priv: the bat priv with all the soft interface information * @nc_path: the nc path to check * @@ -311,7 +313,8 @@ static bool batadv_nc_to_purge_nc_path_coding(struct batadv_priv *bat_priv, } /** - * batadv_nc_to_purge_nc_path_decoding - checks whether an nc path has timed out + * batadv_nc_to_purge_nc_path_decoding() - checks whether an nc path has timed + * out * @bat_priv: the bat priv with all the soft interface information * @nc_path: the nc path to check * @@ -331,7 +334,7 @@ static bool batadv_nc_to_purge_nc_path_decoding(struct batadv_priv *bat_priv, } /** - * batadv_nc_purge_orig_nc_nodes - go through list of nc nodes and purge stale + * batadv_nc_purge_orig_nc_nodes() - go through list of nc nodes and purge stale * entries * @bat_priv: the bat priv with all the soft interface information * @list: list of nc nodes @@ -369,7 +372,7 @@ batadv_nc_purge_orig_nc_nodes(struct batadv_priv *bat_priv, } /** - * batadv_nc_purge_orig - purges all nc node data attached of the given + * batadv_nc_purge_orig() - purges all nc node data attached of the given * originator * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig_node with the nc node entries to be purged @@ -395,8 +398,8 @@ void batadv_nc_purge_orig(struct batadv_priv *bat_priv, } /** - * batadv_nc_purge_orig_hash - traverse entire originator hash to check if they - * have timed out nc nodes + * batadv_nc_purge_orig_hash() - traverse entire originator hash to check if + * they have timed out nc nodes * @bat_priv: the bat priv with all the soft interface information */ static void batadv_nc_purge_orig_hash(struct batadv_priv *bat_priv) @@ -422,7 +425,7 @@ static void batadv_nc_purge_orig_hash(struct batadv_priv *bat_priv) } /** - * batadv_nc_purge_paths - traverse all nc paths part of the hash and remove + * batadv_nc_purge_paths() - traverse all nc paths part of the hash and remove * unused ones * @bat_priv: the bat priv with all the soft interface information * @hash: hash table containing the nc paths to check @@ -481,7 +484,7 @@ static void batadv_nc_purge_paths(struct batadv_priv *bat_priv, } /** - * batadv_nc_hash_key_gen - computes the nc_path hash key + * batadv_nc_hash_key_gen() - computes the nc_path hash key * @key: buffer to hold the final hash key * @src: source ethernet mac address going into the hash key * @dst: destination ethernet mac address going into the hash key @@ -494,7 +497,7 @@ static void batadv_nc_hash_key_gen(struct batadv_nc_path *key, const char *src, } /** - * batadv_nc_hash_choose - compute the hash value for an nc path + * batadv_nc_hash_choose() - compute the hash value for an nc path * @data: data to hash * @size: size of the hash table * @@ -512,7 +515,7 @@ static u32 batadv_nc_hash_choose(const void *data, u32 size) } /** - * batadv_nc_hash_compare - comparing function used in the network coding hash + * batadv_nc_hash_compare() - comparing function used in the network coding hash * tables * @node: node in the local table * @data2: second object to compare the node to @@ -538,7 +541,7 @@ static bool batadv_nc_hash_compare(const struct hlist_node *node, } /** - * batadv_nc_hash_find - search for an existing nc path and return it + * batadv_nc_hash_find() - search for an existing nc path and return it * @hash: hash table containing the nc path * @data: search key * @@ -575,7 +578,7 @@ batadv_nc_hash_find(struct batadv_hashtable *hash, } /** - * batadv_nc_send_packet - send non-coded packet and free nc_packet struct + * batadv_nc_send_packet() - send non-coded packet and free nc_packet struct * @nc_packet: the nc packet to send */ static void batadv_nc_send_packet(struct batadv_nc_packet *nc_packet) @@ -586,7 +589,7 @@ static void batadv_nc_send_packet(struct batadv_nc_packet *nc_packet) } /** - * batadv_nc_sniffed_purge - Checks timestamp of given sniffed nc_packet. + * batadv_nc_sniffed_purge() - Checks timestamp of given sniffed nc_packet. * @bat_priv: the bat priv with all the soft interface information * @nc_path: the nc path the packet belongs to * @nc_packet: the nc packet to be checked @@ -625,7 +628,7 @@ out: } /** - * batadv_nc_fwd_flush - Checks the timestamp of the given nc packet. + * batadv_nc_fwd_flush() - Checks the timestamp of the given nc packet. * @bat_priv: the bat priv with all the soft interface information * @nc_path: the nc path the packet belongs to * @nc_packet: the nc packet to be checked @@ -663,8 +666,8 @@ static bool batadv_nc_fwd_flush(struct batadv_priv *bat_priv, } /** - * batadv_nc_process_nc_paths - traverse given nc packet pool and free timed out - * nc packets + * batadv_nc_process_nc_paths() - traverse given nc packet pool and free timed + * out nc packets * @bat_priv: the bat priv with all the soft interface information * @hash: to be processed hash table * @process_fn: Function called to process given nc packet. Should return true @@ -709,7 +712,8 @@ batadv_nc_process_nc_paths(struct batadv_priv *bat_priv, } /** - * batadv_nc_worker - periodic task for house keeping related to network coding + * batadv_nc_worker() - periodic task for house keeping related to network + * coding * @work: kernel work struct */ static void batadv_nc_worker(struct work_struct *work) @@ -749,8 +753,8 @@ static void batadv_nc_worker(struct work_struct *work) } /** - * batadv_can_nc_with_orig - checks whether the given orig node is suitable for - * coding or not + * batadv_can_nc_with_orig() - checks whether the given orig node is suitable + * for coding or not * @bat_priv: the bat priv with all the soft interface information * @orig_node: neighboring orig node which may be used as nc candidate * @ogm_packet: incoming ogm packet also used for the checks @@ -790,7 +794,7 @@ static bool batadv_can_nc_with_orig(struct batadv_priv *bat_priv, } /** - * batadv_nc_find_nc_node - search for an existing nc node and return it + * batadv_nc_find_nc_node() - search for an existing nc node and return it * @orig_node: orig node originating the ogm packet * @orig_neigh_node: neighboring orig node from which we received the ogm packet * (can be equal to orig_node) @@ -830,7 +834,7 @@ batadv_nc_find_nc_node(struct batadv_orig_node *orig_node, } /** - * batadv_nc_get_nc_node - retrieves an nc node or creates the entry if it was + * batadv_nc_get_nc_node() - retrieves an nc node or creates the entry if it was * not found * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node originating the ogm packet @@ -890,7 +894,7 @@ batadv_nc_get_nc_node(struct batadv_priv *bat_priv, } /** - * batadv_nc_update_nc_node - updates stored incoming and outgoing nc node + * batadv_nc_update_nc_node() - updates stored incoming and outgoing nc node * structs (best called on incoming OGMs) * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node originating the ogm packet @@ -945,7 +949,7 @@ out: } /** - * batadv_nc_get_path - get existing nc_path or allocate a new one + * batadv_nc_get_path() - get existing nc_path or allocate a new one * @bat_priv: the bat priv with all the soft interface information * @hash: hash table containing the nc path * @src: ethernet source address - first half of the nc path search key @@ -1006,7 +1010,7 @@ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv, } /** - * batadv_nc_random_weight_tq - scale the receivers TQ-value to avoid unfair + * batadv_nc_random_weight_tq() - scale the receivers TQ-value to avoid unfair * selection of a receiver with slightly lower TQ than the other * @tq: to be weighted tq value * @@ -1029,7 +1033,7 @@ static u8 batadv_nc_random_weight_tq(u8 tq) } /** - * batadv_nc_memxor - XOR destination with source + * batadv_nc_memxor() - XOR destination with source * @dst: byte array to XOR into * @src: byte array to XOR from * @len: length of destination array @@ -1043,7 +1047,7 @@ static void batadv_nc_memxor(char *dst, const char *src, unsigned int len) } /** - * batadv_nc_code_packets - code a received unicast_packet with an nc packet + * batadv_nc_code_packets() - code a received unicast_packet with an nc packet * into a coded_packet and send it * @bat_priv: the bat priv with all the soft interface information * @skb: data skb to forward @@ -1236,7 +1240,7 @@ out: } /** - * batadv_nc_skb_coding_possible - true if a decoded skb is available at dst. + * batadv_nc_skb_coding_possible() - true if a decoded skb is available at dst. * @skb: data skb to forward * @dst: destination mac address of the other skb to code with * @src: source mac address of skb @@ -1260,7 +1264,7 @@ static bool batadv_nc_skb_coding_possible(struct sk_buff *skb, u8 *dst, u8 *src) } /** - * batadv_nc_path_search - Find the coding path matching in_nc_node and + * batadv_nc_path_search() - Find the coding path matching in_nc_node and * out_nc_node to retrieve a buffered packet that can be used for coding. * @bat_priv: the bat priv with all the soft interface information * @in_nc_node: pointer to skb next hop's neighbor nc node @@ -1328,8 +1332,8 @@ batadv_nc_path_search(struct batadv_priv *bat_priv, } /** - * batadv_nc_skb_src_search - Loops through the list of neighoring nodes of the - * skb's sender (may be equal to the originator). + * batadv_nc_skb_src_search() - Loops through the list of neighoring nodes of + * the skb's sender (may be equal to the originator). * @bat_priv: the bat priv with all the soft interface information * @skb: data skb to forward * @eth_dst: next hop mac address of skb @@ -1374,7 +1378,7 @@ batadv_nc_skb_src_search(struct batadv_priv *bat_priv, } /** - * batadv_nc_skb_store_before_coding - set the ethernet src and dst of the + * batadv_nc_skb_store_before_coding() - set the ethernet src and dst of the * unicast skb before it is stored for use in later decoding * @bat_priv: the bat priv with all the soft interface information * @skb: data skb to store @@ -1409,7 +1413,7 @@ static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv, } /** - * batadv_nc_skb_dst_search - Loops through list of neighboring nodes to dst. + * batadv_nc_skb_dst_search() - Loops through list of neighboring nodes to dst. * @skb: data skb to forward * @neigh_node: next hop to forward packet to * @ethhdr: pointer to the ethernet header inside the skb @@ -1467,7 +1471,7 @@ static bool batadv_nc_skb_dst_search(struct sk_buff *skb, } /** - * batadv_nc_skb_add_to_path - buffer skb for later encoding / decoding + * batadv_nc_skb_add_to_path() - buffer skb for later encoding / decoding * @skb: skb to add to path * @nc_path: path to add skb to * @neigh_node: next hop to forward packet to @@ -1502,7 +1506,7 @@ static bool batadv_nc_skb_add_to_path(struct sk_buff *skb, } /** - * batadv_nc_skb_forward - try to code a packet or add it to the coding packet + * batadv_nc_skb_forward() - try to code a packet or add it to the coding packet * buffer * @skb: data skb to forward * @neigh_node: next hop to forward packet to @@ -1559,8 +1563,8 @@ out: } /** - * batadv_nc_skb_store_for_decoding - save a clone of the skb which can be used - * when decoding coded packets + * batadv_nc_skb_store_for_decoding() - save a clone of the skb which can be + * used when decoding coded packets * @bat_priv: the bat priv with all the soft interface information * @skb: data skb to store */ @@ -1620,7 +1624,7 @@ out: } /** - * batadv_nc_skb_store_sniffed_unicast - check if a received unicast packet + * batadv_nc_skb_store_sniffed_unicast() - check if a received unicast packet * should be saved in the decoding buffer and, if so, store it there * @bat_priv: the bat priv with all the soft interface information * @skb: unicast skb to store @@ -1640,7 +1644,7 @@ void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv, } /** - * batadv_nc_skb_decode_packet - decode given skb using the decode data stored + * batadv_nc_skb_decode_packet() - decode given skb using the decode data stored * in nc_packet * @bat_priv: the bat priv with all the soft interface information * @skb: unicast skb to decode @@ -1734,7 +1738,7 @@ batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, } /** - * batadv_nc_find_decoding_packet - search through buffered decoding data to + * batadv_nc_find_decoding_packet() - search through buffered decoding data to * find the data needed to decode the coded packet * @bat_priv: the bat priv with all the soft interface information * @ethhdr: pointer to the ethernet header inside the coded packet @@ -1799,7 +1803,7 @@ batadv_nc_find_decoding_packet(struct batadv_priv *bat_priv, } /** - * batadv_nc_recv_coded_packet - try to decode coded packet and enqueue the + * batadv_nc_recv_coded_packet() - try to decode coded packet and enqueue the * resulting unicast packet * @skb: incoming coded packet * @recv_if: pointer to interface this packet was received on @@ -1874,7 +1878,7 @@ free_skb: } /** - * batadv_nc_mesh_free - clean up network coding memory + * batadv_nc_mesh_free() - clean up network coding memory * @bat_priv: the bat priv with all the soft interface information */ void batadv_nc_mesh_free(struct batadv_priv *bat_priv) @@ -1891,7 +1895,7 @@ void batadv_nc_mesh_free(struct batadv_priv *bat_priv) #ifdef CONFIG_BATMAN_ADV_DEBUGFS /** - * batadv_nc_nodes_seq_print_text - print the nc node information + * batadv_nc_nodes_seq_print_text() - print the nc node information * @seq: seq file to print on * @offset: not used * @@ -1954,7 +1958,7 @@ out: } /** - * batadv_nc_init_debugfs - create nc folder and related files in debugfs + * batadv_nc_init_debugfs() - create nc folder and related files in debugfs * @bat_priv: the bat priv with all the soft interface information * * Return: 0 on success or negative error number in case of failure diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h index c66efb81d2f4..adaeafa4f71e 100644 --- a/net/batman-adv/network-coding.h +++ b/net/batman-adv/network-coding.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 2967b86c13da..58a7d9274435 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich @@ -21,7 +22,7 @@ #include <linux/atomic.h> #include <linux/errno.h> #include <linux/etherdevice.h> -#include <linux/fs.h> +#include <linux/gfp.h> #include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/kref.h> @@ -30,10 +31,12 @@ #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/rculist.h> +#include <linux/rcupdate.h> #include <linux/seq_file.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> +#include <linux/stddef.h> #include <linux/workqueue.h> #include <net/sock.h> #include <uapi/linux/batman_adv.h> @@ -55,10 +58,47 @@ /* hash class keys */ static struct lock_class_key batadv_orig_hash_lock_class_key; +/** + * batadv_orig_hash_find() - Find and return originator from orig_hash + * @bat_priv: the bat priv with all the soft interface information + * @data: mac address of the originator + * + * Return: orig_node (with increased refcnt), NULL on errors + */ +struct batadv_orig_node * +batadv_orig_hash_find(struct batadv_priv *bat_priv, const void *data) +{ + struct batadv_hashtable *hash = bat_priv->orig_hash; + struct hlist_head *head; + struct batadv_orig_node *orig_node, *orig_node_tmp = NULL; + int index; + + if (!hash) + return NULL; + + index = batadv_choose_orig(data, hash->size); + head = &hash->table[index]; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, head, hash_entry) { + if (!batadv_compare_eth(orig_node, data)) + continue; + + if (!kref_get_unless_zero(&orig_node->refcount)) + continue; + + orig_node_tmp = orig_node; + break; + } + rcu_read_unlock(); + + return orig_node_tmp; +} + static void batadv_purge_orig(struct work_struct *work); /** - * batadv_compare_orig - comparing function used in the originator hash table + * batadv_compare_orig() - comparing function used in the originator hash table * @node: node in the local table * @data2: second object to compare the node to * @@ -73,7 +113,7 @@ bool batadv_compare_orig(const struct hlist_node *node, const void *data2) } /** - * batadv_orig_node_vlan_get - get an orig_node_vlan object + * batadv_orig_node_vlan_get() - get an orig_node_vlan object * @orig_node: the originator serving the VLAN * @vid: the VLAN identifier * @@ -104,7 +144,7 @@ batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node, } /** - * batadv_orig_node_vlan_new - search and possibly create an orig_node_vlan + * batadv_orig_node_vlan_new() - search and possibly create an orig_node_vlan * object * @orig_node: the originator serving the VLAN * @vid: the VLAN identifier @@ -145,7 +185,7 @@ out: } /** - * batadv_orig_node_vlan_release - release originator-vlan object from lists + * batadv_orig_node_vlan_release() - release originator-vlan object from lists * and queue for free after rcu grace period * @ref: kref pointer of the originator-vlan object */ @@ -159,7 +199,7 @@ static void batadv_orig_node_vlan_release(struct kref *ref) } /** - * batadv_orig_node_vlan_put - decrement the refcounter and possibly release + * batadv_orig_node_vlan_put() - decrement the refcounter and possibly release * the originator-vlan object * @orig_vlan: the originator-vlan object to release */ @@ -168,6 +208,12 @@ void batadv_orig_node_vlan_put(struct batadv_orig_node_vlan *orig_vlan) kref_put(&orig_vlan->refcount, batadv_orig_node_vlan_release); } +/** + * batadv_originator_init() - Initialize all originator structures + * @bat_priv: the bat priv with all the soft interface information + * + * Return: 0 on success or negative error number in case of failure + */ int batadv_originator_init(struct batadv_priv *bat_priv) { if (bat_priv->orig_hash) @@ -193,7 +239,7 @@ err: } /** - * batadv_neigh_ifinfo_release - release neigh_ifinfo from lists and queue for + * batadv_neigh_ifinfo_release() - release neigh_ifinfo from lists and queue for * free after rcu grace period * @ref: kref pointer of the neigh_ifinfo */ @@ -210,7 +256,7 @@ static void batadv_neigh_ifinfo_release(struct kref *ref) } /** - * batadv_neigh_ifinfo_put - decrement the refcounter and possibly release + * batadv_neigh_ifinfo_put() - decrement the refcounter and possibly release * the neigh_ifinfo * @neigh_ifinfo: the neigh_ifinfo object to release */ @@ -220,7 +266,7 @@ void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo) } /** - * batadv_hardif_neigh_release - release hardif neigh node from lists and + * batadv_hardif_neigh_release() - release hardif neigh node from lists and * queue for free after rcu grace period * @ref: kref pointer of the neigh_node */ @@ -240,7 +286,7 @@ static void batadv_hardif_neigh_release(struct kref *ref) } /** - * batadv_hardif_neigh_put - decrement the hardif neighbors refcounter + * batadv_hardif_neigh_put() - decrement the hardif neighbors refcounter * and possibly release it * @hardif_neigh: hardif neigh neighbor to free */ @@ -250,7 +296,7 @@ void batadv_hardif_neigh_put(struct batadv_hardif_neigh_node *hardif_neigh) } /** - * batadv_neigh_node_release - release neigh_node from lists and queue for + * batadv_neigh_node_release() - release neigh_node from lists and queue for * free after rcu grace period * @ref: kref pointer of the neigh_node */ @@ -275,7 +321,7 @@ static void batadv_neigh_node_release(struct kref *ref) } /** - * batadv_neigh_node_put - decrement the neighbors refcounter and possibly + * batadv_neigh_node_put() - decrement the neighbors refcounter and possibly * release it * @neigh_node: neigh neighbor to free */ @@ -285,7 +331,7 @@ void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node) } /** - * batadv_orig_router_get - router to the originator depending on iface + * batadv_orig_router_get() - router to the originator depending on iface * @orig_node: the orig node for the router * @if_outgoing: the interface where the payload packet has been received or * the OGM should be sent to @@ -318,7 +364,7 @@ batadv_orig_router_get(struct batadv_orig_node *orig_node, } /** - * batadv_orig_ifinfo_get - find the ifinfo from an orig_node + * batadv_orig_ifinfo_get() - find the ifinfo from an orig_node * @orig_node: the orig node to be queried * @if_outgoing: the interface for which the ifinfo should be acquired * @@ -350,7 +396,7 @@ batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node, } /** - * batadv_orig_ifinfo_new - search and possibly create an orig_ifinfo object + * batadv_orig_ifinfo_new() - search and possibly create an orig_ifinfo object * @orig_node: the orig node to be queried * @if_outgoing: the interface for which the ifinfo should be acquired * @@ -396,7 +442,7 @@ out: } /** - * batadv_neigh_ifinfo_get - find the ifinfo from an neigh_node + * batadv_neigh_ifinfo_get() - find the ifinfo from an neigh_node * @neigh: the neigh node to be queried * @if_outgoing: the interface for which the ifinfo should be acquired * @@ -429,7 +475,7 @@ batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh, } /** - * batadv_neigh_ifinfo_new - search and possibly create an neigh_ifinfo object + * batadv_neigh_ifinfo_new() - search and possibly create an neigh_ifinfo object * @neigh: the neigh node to be queried * @if_outgoing: the interface for which the ifinfo should be acquired * @@ -472,7 +518,7 @@ out: } /** - * batadv_neigh_node_get - retrieve a neighbour from the list + * batadv_neigh_node_get() - retrieve a neighbour from the list * @orig_node: originator which the neighbour belongs to * @hard_iface: the interface where this neighbour is connected to * @addr: the address of the neighbour @@ -509,7 +555,7 @@ batadv_neigh_node_get(const struct batadv_orig_node *orig_node, } /** - * batadv_hardif_neigh_create - create a hardif neighbour node + * batadv_hardif_neigh_create() - create a hardif neighbour node * @hard_iface: the interface this neighbour is connected to * @neigh_addr: the interface address of the neighbour to retrieve * @orig_node: originator object representing the neighbour @@ -555,7 +601,7 @@ out: } /** - * batadv_hardif_neigh_get_or_create - retrieve or create a hardif neighbour + * batadv_hardif_neigh_get_or_create() - retrieve or create a hardif neighbour * node * @hard_iface: the interface this neighbour is connected to * @neigh_addr: the interface address of the neighbour to retrieve @@ -579,7 +625,7 @@ batadv_hardif_neigh_get_or_create(struct batadv_hard_iface *hard_iface, } /** - * batadv_hardif_neigh_get - retrieve a hardif neighbour from the list + * batadv_hardif_neigh_get() - retrieve a hardif neighbour from the list * @hard_iface: the interface where this neighbour is connected to * @neigh_addr: the address of the neighbour * @@ -611,7 +657,7 @@ batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, } /** - * batadv_neigh_node_create - create a neigh node object + * batadv_neigh_node_create() - create a neigh node object * @orig_node: originator object representing the neighbour * @hard_iface: the interface where the neighbour is connected to * @neigh_addr: the mac address of the neighbour interface @@ -676,7 +722,7 @@ out: } /** - * batadv_neigh_node_get_or_create - retrieve or create a neigh node object + * batadv_neigh_node_get_or_create() - retrieve or create a neigh node object * @orig_node: originator object representing the neighbour * @hard_iface: the interface where the neighbour is connected to * @neigh_addr: the mac address of the neighbour interface @@ -700,7 +746,7 @@ batadv_neigh_node_get_or_create(struct batadv_orig_node *orig_node, #ifdef CONFIG_BATMAN_ADV_DEBUGFS /** - * batadv_hardif_neigh_seq_print_text - print the single hop neighbour list + * batadv_hardif_neigh_seq_print_text() - print the single hop neighbour list * @seq: neighbour table seq_file struct * @offset: not used * @@ -735,8 +781,8 @@ int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset) #endif /** - * batadv_hardif_neigh_dump - Dump to netlink the neighbor infos for a specific - * outgoing interface + * batadv_hardif_neigh_dump() - Dump to netlink the neighbor infos for a + * specific outgoing interface * @msg: message to dump into * @cb: parameters for the dump * @@ -812,7 +858,7 @@ int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb) } /** - * batadv_orig_ifinfo_release - release orig_ifinfo from lists and queue for + * batadv_orig_ifinfo_release() - release orig_ifinfo from lists and queue for * free after rcu grace period * @ref: kref pointer of the orig_ifinfo */ @@ -835,7 +881,7 @@ static void batadv_orig_ifinfo_release(struct kref *ref) } /** - * batadv_orig_ifinfo_put - decrement the refcounter and possibly release + * batadv_orig_ifinfo_put() - decrement the refcounter and possibly release * the orig_ifinfo * @orig_ifinfo: the orig_ifinfo object to release */ @@ -845,7 +891,7 @@ void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo) } /** - * batadv_orig_node_free_rcu - free the orig_node + * batadv_orig_node_free_rcu() - free the orig_node * @rcu: rcu pointer of the orig_node */ static void batadv_orig_node_free_rcu(struct rcu_head *rcu) @@ -866,7 +912,7 @@ static void batadv_orig_node_free_rcu(struct rcu_head *rcu) } /** - * batadv_orig_node_release - release orig_node from lists and queue for + * batadv_orig_node_release() - release orig_node from lists and queue for * free after rcu grace period * @ref: kref pointer of the orig_node */ @@ -917,7 +963,7 @@ static void batadv_orig_node_release(struct kref *ref) } /** - * batadv_orig_node_put - decrement the orig node refcounter and possibly + * batadv_orig_node_put() - decrement the orig node refcounter and possibly * release it * @orig_node: the orig node to free */ @@ -926,6 +972,10 @@ void batadv_orig_node_put(struct batadv_orig_node *orig_node) kref_put(&orig_node->refcount, batadv_orig_node_release); } +/** + * batadv_originator_free() - Free all originator structures + * @bat_priv: the bat priv with all the soft interface information + */ void batadv_originator_free(struct batadv_priv *bat_priv) { struct batadv_hashtable *hash = bat_priv->orig_hash; @@ -959,7 +1009,7 @@ void batadv_originator_free(struct batadv_priv *bat_priv) } /** - * batadv_orig_node_new - creates a new orig_node + * batadv_orig_node_new() - creates a new orig_node * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the originator * @@ -1038,7 +1088,7 @@ free_orig_node: } /** - * batadv_purge_neigh_ifinfo - purge obsolete ifinfo entries from neighbor + * batadv_purge_neigh_ifinfo() - purge obsolete ifinfo entries from neighbor * @bat_priv: the bat priv with all the soft interface information * @neigh: orig node which is to be checked */ @@ -1079,7 +1129,7 @@ batadv_purge_neigh_ifinfo(struct batadv_priv *bat_priv, } /** - * batadv_purge_orig_ifinfo - purge obsolete ifinfo entries from originator + * batadv_purge_orig_ifinfo() - purge obsolete ifinfo entries from originator * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node which is to be checked * @@ -1131,7 +1181,7 @@ batadv_purge_orig_ifinfo(struct batadv_priv *bat_priv, } /** - * batadv_purge_orig_neighbors - purges neighbors from originator + * batadv_purge_orig_neighbors() - purges neighbors from originator * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node which is to be checked * @@ -1189,7 +1239,7 @@ batadv_purge_orig_neighbors(struct batadv_priv *bat_priv, } /** - * batadv_find_best_neighbor - finds the best neighbor after purging + * batadv_find_best_neighbor() - finds the best neighbor after purging * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node which is to be checked * @if_outgoing: the interface for which the metric should be compared @@ -1224,7 +1274,7 @@ batadv_find_best_neighbor(struct batadv_priv *bat_priv, } /** - * batadv_purge_orig_node - purges obsolete information from an orig_node + * batadv_purge_orig_node() - purges obsolete information from an orig_node * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node which is to be checked * @@ -1341,12 +1391,24 @@ static void batadv_purge_orig(struct work_struct *work) msecs_to_jiffies(BATADV_ORIG_WORK_PERIOD)); } +/** + * batadv_purge_orig_ref() - Purge all outdated originators + * @bat_priv: the bat priv with all the soft interface information + */ void batadv_purge_orig_ref(struct batadv_priv *bat_priv) { _batadv_purge_orig(bat_priv); } #ifdef CONFIG_BATMAN_ADV_DEBUGFS + +/** + * batadv_orig_seq_print_text() - Print the originator table in a seq file + * @seq: seq file to print on + * @offset: not used + * + * Return: always 0 + */ int batadv_orig_seq_print_text(struct seq_file *seq, void *offset) { struct net_device *net_dev = (struct net_device *)seq->private; @@ -1376,7 +1438,7 @@ int batadv_orig_seq_print_text(struct seq_file *seq, void *offset) } /** - * batadv_orig_hardif_seq_print_text - writes originator infos for a specific + * batadv_orig_hardif_seq_print_text() - writes originator infos for a specific * outgoing interface * @seq: debugfs table seq_file struct * @offset: not used @@ -1423,7 +1485,7 @@ out: #endif /** - * batadv_orig_dump - Dump to netlink the originator infos for a specific + * batadv_orig_dump() - Dump to netlink the originator infos for a specific * outgoing interface * @msg: message to dump into * @cb: parameters for the dump @@ -1499,6 +1561,13 @@ int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb) return ret; } +/** + * batadv_orig_hash_add_if() - Add interface to originators in orig_hash + * @hard_iface: hard interface to add (already slave of the soft interface) + * @max_if_num: new number of interfaces + * + * Return: 0 on success or negative error number in case of failure + */ int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface, int max_if_num) { @@ -1534,6 +1603,13 @@ err: return -ENOMEM; } +/** + * batadv_orig_hash_del_if() - Remove interface from originators in orig_hash + * @hard_iface: hard interface to remove (still slave of the soft interface) + * @max_if_num: new number of interfaces + * + * Return: 0 on success or negative error number in case of failure + */ int batadv_orig_hash_del_if(struct batadv_hard_iface *hard_iface, int max_if_num) { diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index d94220a6d21a..8e543a3cdc6c 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich @@ -23,14 +24,8 @@ #include <linux/compiler.h> #include <linux/if_ether.h> #include <linux/jhash.h> -#include <linux/kref.h> -#include <linux/rculist.h> -#include <linux/rcupdate.h> -#include <linux/stddef.h> #include <linux/types.h> -#include "hash.h" - struct netlink_callback; struct seq_file; struct sk_buff; @@ -89,8 +84,13 @@ batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node, unsigned short vid); void batadv_orig_node_vlan_put(struct batadv_orig_node_vlan *orig_vlan); -/* hashfunction to choose an entry in a hash table of given size - * hash algorithm from http://en.wikipedia.org/wiki/Hash_table +/** + * batadv_choose_orig() - Return the index of the orig entry in the hash table + * @data: mac address of the originator node + * @size: the size of the hash table + * + * Return: the hash index where the object represented by @data should be + * stored at. */ static inline u32 batadv_choose_orig(const void *data, u32 size) { @@ -100,34 +100,7 @@ static inline u32 batadv_choose_orig(const void *data, u32 size) return hash % size; } -static inline struct batadv_orig_node * -batadv_orig_hash_find(struct batadv_priv *bat_priv, const void *data) -{ - struct batadv_hashtable *hash = bat_priv->orig_hash; - struct hlist_head *head; - struct batadv_orig_node *orig_node, *orig_node_tmp = NULL; - int index; - - if (!hash) - return NULL; - - index = batadv_choose_orig(data, hash->size); - head = &hash->table[index]; - - rcu_read_lock(); - hlist_for_each_entry_rcu(orig_node, head, hash_entry) { - if (!batadv_compare_eth(orig_node, data)) - continue; - - if (!kref_get_unless_zero(&orig_node->refcount)) - continue; - - orig_node_tmp = orig_node; - break; - } - rcu_read_unlock(); - - return orig_node_tmp; -} +struct batadv_orig_node * +batadv_orig_hash_find(struct batadv_priv *bat_priv, const void *data); #endif /* _NET_BATMAN_ADV_ORIGINATOR_H_ */ diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h deleted file mode 100644 index 8e8a5db197cb..000000000000 --- a/net/batman-adv/packet.h +++ /dev/null @@ -1,621 +0,0 @@ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: - * - * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. - */ - -#ifndef _NET_BATMAN_ADV_PACKET_H_ -#define _NET_BATMAN_ADV_PACKET_H_ - -#include <asm/byteorder.h> -#include <linux/types.h> - -#define batadv_tp_is_error(n) ((u8)(n) > 127 ? 1 : 0) - -/** - * enum batadv_packettype - types for batman-adv encapsulated packets - * @BATADV_IV_OGM: originator messages for B.A.T.M.A.N. IV - * @BATADV_BCAST: broadcast packets carrying broadcast payload - * @BATADV_CODED: network coded packets - * @BATADV_ELP: echo location packets for B.A.T.M.A.N. V - * @BATADV_OGM2: originator messages for B.A.T.M.A.N. V - * - * @BATADV_UNICAST: unicast packets carrying unicast payload traffic - * @BATADV_UNICAST_FRAG: unicast packets carrying a fragment of the original - * payload packet - * @BATADV_UNICAST_4ADDR: unicast packet including the originator address of - * the sender - * @BATADV_ICMP: unicast packet like IP ICMP used for ping or traceroute - * @BATADV_UNICAST_TVLV: unicast packet carrying TVLV containers - */ -enum batadv_packettype { - /* 0x00 - 0x3f: local packets or special rules for handling */ - BATADV_IV_OGM = 0x00, - BATADV_BCAST = 0x01, - BATADV_CODED = 0x02, - BATADV_ELP = 0x03, - BATADV_OGM2 = 0x04, - /* 0x40 - 0x7f: unicast */ -#define BATADV_UNICAST_MIN 0x40 - BATADV_UNICAST = 0x40, - BATADV_UNICAST_FRAG = 0x41, - BATADV_UNICAST_4ADDR = 0x42, - BATADV_ICMP = 0x43, - BATADV_UNICAST_TVLV = 0x44, -#define BATADV_UNICAST_MAX 0x7f - /* 0x80 - 0xff: reserved */ -}; - -/** - * enum batadv_subtype - packet subtype for unicast4addr - * @BATADV_P_DATA: user payload - * @BATADV_P_DAT_DHT_GET: DHT request message - * @BATADV_P_DAT_DHT_PUT: DHT store message - * @BATADV_P_DAT_CACHE_REPLY: ARP reply generated by DAT - */ -enum batadv_subtype { - BATADV_P_DATA = 0x01, - BATADV_P_DAT_DHT_GET = 0x02, - BATADV_P_DAT_DHT_PUT = 0x03, - BATADV_P_DAT_CACHE_REPLY = 0x04, -}; - -/* this file is included by batctl which needs these defines */ -#define BATADV_COMPAT_VERSION 15 - -/** - * enum batadv_iv_flags - flags used in B.A.T.M.A.N. IV OGM packets - * @BATADV_NOT_BEST_NEXT_HOP: flag is set when ogm packet is forwarded and was - * previously received from someone else than the best neighbor. - * @BATADV_PRIMARIES_FIRST_HOP: flag unused. - * @BATADV_DIRECTLINK: flag is for the first hop or if rebroadcasted from a - * one hop neighbor on the interface where it was originally received. - */ -enum batadv_iv_flags { - BATADV_NOT_BEST_NEXT_HOP = BIT(0), - BATADV_PRIMARIES_FIRST_HOP = BIT(1), - BATADV_DIRECTLINK = BIT(2), -}; - -/* ICMP message types */ -enum batadv_icmp_packettype { - BATADV_ECHO_REPLY = 0, - BATADV_DESTINATION_UNREACHABLE = 3, - BATADV_ECHO_REQUEST = 8, - BATADV_TTL_EXCEEDED = 11, - BATADV_PARAMETER_PROBLEM = 12, - BATADV_TP = 15, -}; - -/** - * enum batadv_mcast_flags - flags for multicast capabilities and settings - * @BATADV_MCAST_WANT_ALL_UNSNOOPABLES: we want all packets destined for - * 224.0.0.0/24 or ff02::1 - * @BATADV_MCAST_WANT_ALL_IPV4: we want all IPv4 multicast packets - * @BATADV_MCAST_WANT_ALL_IPV6: we want all IPv6 multicast packets - */ -enum batadv_mcast_flags { - BATADV_MCAST_WANT_ALL_UNSNOOPABLES = BIT(0), - BATADV_MCAST_WANT_ALL_IPV4 = BIT(1), - BATADV_MCAST_WANT_ALL_IPV6 = BIT(2), -}; - -/* tt data subtypes */ -#define BATADV_TT_DATA_TYPE_MASK 0x0F - -/** - * enum batadv_tt_data_flags - flags for tt data tvlv - * @BATADV_TT_OGM_DIFF: TT diff propagated through OGM - * @BATADV_TT_REQUEST: TT request message - * @BATADV_TT_RESPONSE: TT response message - * @BATADV_TT_FULL_TABLE: contains full table to replace existing table - */ -enum batadv_tt_data_flags { - BATADV_TT_OGM_DIFF = BIT(0), - BATADV_TT_REQUEST = BIT(1), - BATADV_TT_RESPONSE = BIT(2), - BATADV_TT_FULL_TABLE = BIT(4), -}; - -/** - * enum batadv_vlan_flags - flags for the four MSB of any vlan ID field - * @BATADV_VLAN_HAS_TAG: whether the field contains a valid vlan tag or not - */ -enum batadv_vlan_flags { - BATADV_VLAN_HAS_TAG = BIT(15), -}; - -/* claim frame types for the bridge loop avoidance */ -enum batadv_bla_claimframe { - BATADV_CLAIM_TYPE_CLAIM = 0x00, - BATADV_CLAIM_TYPE_UNCLAIM = 0x01, - BATADV_CLAIM_TYPE_ANNOUNCE = 0x02, - BATADV_CLAIM_TYPE_REQUEST = 0x03, - BATADV_CLAIM_TYPE_LOOPDETECT = 0x04, -}; - -/** - * enum batadv_tvlv_type - tvlv type definitions - * @BATADV_TVLV_GW: gateway tvlv - * @BATADV_TVLV_DAT: distributed arp table tvlv - * @BATADV_TVLV_NC: network coding tvlv - * @BATADV_TVLV_TT: translation table tvlv - * @BATADV_TVLV_ROAM: roaming advertisement tvlv - * @BATADV_TVLV_MCAST: multicast capability tvlv - */ -enum batadv_tvlv_type { - BATADV_TVLV_GW = 0x01, - BATADV_TVLV_DAT = 0x02, - BATADV_TVLV_NC = 0x03, - BATADV_TVLV_TT = 0x04, - BATADV_TVLV_ROAM = 0x05, - BATADV_TVLV_MCAST = 0x06, -}; - -#pragma pack(2) -/* the destination hardware field in the ARP frame is used to - * transport the claim type and the group id - */ -struct batadv_bla_claim_dst { - u8 magic[3]; /* FF:43:05 */ - u8 type; /* bla_claimframe */ - __be16 group; /* group id */ -}; - -#pragma pack() - -/** - * struct batadv_ogm_packet - ogm (routing protocol) packet - * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header - * @flags: contains routing relevant flags - see enum batadv_iv_flags - * @seqno: sequence identification - * @orig: address of the source node - * @prev_sender: address of the previous sender - * @reserved: reserved byte for alignment - * @tq: transmission quality - * @tvlv_len: length of tvlv data following the ogm header - */ -struct batadv_ogm_packet { - u8 packet_type; - u8 version; - u8 ttl; - u8 flags; - __be32 seqno; - u8 orig[ETH_ALEN]; - u8 prev_sender[ETH_ALEN]; - u8 reserved; - u8 tq; - __be16 tvlv_len; - /* __packed is not needed as the struct size is divisible by 4, - * and the largest data type in this struct has a size of 4. - */ -}; - -#define BATADV_OGM_HLEN sizeof(struct batadv_ogm_packet) - -/** - * struct batadv_ogm2_packet - ogm2 (routing protocol) packet - * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the general header - * @ttl: time to live for this packet, part of the general header - * @flags: reseved for routing relevant flags - currently always 0 - * @seqno: sequence number - * @orig: originator mac address - * @tvlv_len: length of the appended tvlv buffer (in bytes) - * @throughput: the currently flooded path throughput - */ -struct batadv_ogm2_packet { - u8 packet_type; - u8 version; - u8 ttl; - u8 flags; - __be32 seqno; - u8 orig[ETH_ALEN]; - __be16 tvlv_len; - __be32 throughput; - /* __packed is not needed as the struct size is divisible by 4, - * and the largest data type in this struct has a size of 4. - */ -}; - -#define BATADV_OGM2_HLEN sizeof(struct batadv_ogm2_packet) - -/** - * struct batadv_elp_packet - elp (neighbor discovery) packet - * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @orig: originator mac address - * @seqno: sequence number - * @elp_interval: currently used ELP sending interval in ms - */ -struct batadv_elp_packet { - u8 packet_type; - u8 version; - u8 orig[ETH_ALEN]; - __be32 seqno; - __be32 elp_interval; -}; - -#define BATADV_ELP_HLEN sizeof(struct batadv_elp_packet) - -/** - * struct batadv_icmp_header - common members among all the ICMP packets - * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header - * @msg_type: ICMP packet type - * @dst: address of the destination node - * @orig: address of the source node - * @uid: local ICMP socket identifier - * @align: not used - useful for alignment purposes only - * - * This structure is used for ICMP packets parsing only and it is never sent - * over the wire. The alignment field at the end is there to ensure that - * members are padded the same way as they are in real packets. - */ -struct batadv_icmp_header { - u8 packet_type; - u8 version; - u8 ttl; - u8 msg_type; /* see ICMP message types above */ - u8 dst[ETH_ALEN]; - u8 orig[ETH_ALEN]; - u8 uid; - u8 align[3]; -}; - -/** - * struct batadv_icmp_packet - ICMP packet - * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header - * @msg_type: ICMP packet type - * @dst: address of the destination node - * @orig: address of the source node - * @uid: local ICMP socket identifier - * @reserved: not used - useful for alignment - * @seqno: ICMP sequence number - */ -struct batadv_icmp_packet { - u8 packet_type; - u8 version; - u8 ttl; - u8 msg_type; /* see ICMP message types above */ - u8 dst[ETH_ALEN]; - u8 orig[ETH_ALEN]; - u8 uid; - u8 reserved; - __be16 seqno; -}; - -/** - * struct batadv_icmp_tp_packet - ICMP TP Meter packet - * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header - * @msg_type: ICMP packet type - * @dst: address of the destination node - * @orig: address of the source node - * @uid: local ICMP socket identifier - * @subtype: TP packet subtype (see batadv_icmp_tp_subtype) - * @session: TP session identifier - * @seqno: the TP sequence number - * @timestamp: time when the packet has been sent. This value is filled in a - * TP_MSG and echoed back in the next TP_ACK so that the sender can compute the - * RTT. Since it is read only by the host which wrote it, there is no need to - * store it using network order - */ -struct batadv_icmp_tp_packet { - u8 packet_type; - u8 version; - u8 ttl; - u8 msg_type; /* see ICMP message types above */ - u8 dst[ETH_ALEN]; - u8 orig[ETH_ALEN]; - u8 uid; - u8 subtype; - u8 session[2]; - __be32 seqno; - __be32 timestamp; -}; - -/** - * enum batadv_icmp_tp_subtype - ICMP TP Meter packet subtypes - * @BATADV_TP_MSG: Msg from sender to receiver - * @BATADV_TP_ACK: acknowledgment from receiver to sender - */ -enum batadv_icmp_tp_subtype { - BATADV_TP_MSG = 0, - BATADV_TP_ACK, -}; - -#define BATADV_RR_LEN 16 - -/** - * struct batadv_icmp_packet_rr - ICMP RouteRecord packet - * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header - * @msg_type: ICMP packet type - * @dst: address of the destination node - * @orig: address of the source node - * @uid: local ICMP socket identifier - * @rr_cur: number of entries the rr array - * @seqno: ICMP sequence number - * @rr: route record array - */ -struct batadv_icmp_packet_rr { - u8 packet_type; - u8 version; - u8 ttl; - u8 msg_type; /* see ICMP message types above */ - u8 dst[ETH_ALEN]; - u8 orig[ETH_ALEN]; - u8 uid; - u8 rr_cur; - __be16 seqno; - u8 rr[BATADV_RR_LEN][ETH_ALEN]; -}; - -#define BATADV_ICMP_MAX_PACKET_SIZE sizeof(struct batadv_icmp_packet_rr) - -/* All packet headers in front of an ethernet header have to be completely - * divisible by 2 but not by 4 to make the payload after the ethernet - * header again 4 bytes boundary aligned. - * - * A packing of 2 is necessary to avoid extra padding at the end of the struct - * caused by a structure member which is larger than two bytes. Otherwise - * the structure would not fulfill the previously mentioned rule to avoid the - * misalignment of the payload after the ethernet header. It may also lead to - * leakage of information when the padding it not initialized before sending. - */ -#pragma pack(2) - -/** - * struct batadv_unicast_packet - unicast packet for network payload - * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header - * @ttvn: translation table version number - * @dest: originator destination of the unicast packet - */ -struct batadv_unicast_packet { - u8 packet_type; - u8 version; - u8 ttl; - u8 ttvn; /* destination translation table version number */ - u8 dest[ETH_ALEN]; - /* "4 bytes boundary + 2 bytes" long to make the payload after the - * following ethernet header again 4 bytes boundary aligned - */ -}; - -/** - * struct batadv_unicast_4addr_packet - extended unicast packet - * @u: common unicast packet header - * @src: address of the source - * @subtype: packet subtype - * @reserved: reserved byte for alignment - */ -struct batadv_unicast_4addr_packet { - struct batadv_unicast_packet u; - u8 src[ETH_ALEN]; - u8 subtype; - u8 reserved; - /* "4 bytes boundary + 2 bytes" long to make the payload after the - * following ethernet header again 4 bytes boundary aligned - */ -}; - -/** - * struct batadv_frag_packet - fragmented packet - * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header - * @dest: final destination used when routing fragments - * @orig: originator of the fragment used when merging the packet - * @no: fragment number within this sequence - * @priority: priority of frame, from ToS IP precedence or 802.1p - * @reserved: reserved byte for alignment - * @seqno: sequence identification - * @total_size: size of the merged packet - */ -struct batadv_frag_packet { - u8 packet_type; - u8 version; /* batman version field */ - u8 ttl; -#if defined(__BIG_ENDIAN_BITFIELD) - u8 no:4; - u8 priority:3; - u8 reserved:1; -#elif defined(__LITTLE_ENDIAN_BITFIELD) - u8 reserved:1; - u8 priority:3; - u8 no:4; -#else -#error "unknown bitfield endianness" -#endif - u8 dest[ETH_ALEN]; - u8 orig[ETH_ALEN]; - __be16 seqno; - __be16 total_size; -}; - -/** - * struct batadv_bcast_packet - broadcast packet for network payload - * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header - * @reserved: reserved byte for alignment - * @seqno: sequence identification - * @orig: originator of the broadcast packet - */ -struct batadv_bcast_packet { - u8 packet_type; - u8 version; /* batman version field */ - u8 ttl; - u8 reserved; - __be32 seqno; - u8 orig[ETH_ALEN]; - /* "4 bytes boundary + 2 bytes" long to make the payload after the - * following ethernet header again 4 bytes boundary aligned - */ -}; - -/** - * struct batadv_coded_packet - network coded packet - * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header - * @first_source: original source of first included packet - * @first_orig_dest: original destinal of first included packet - * @first_crc: checksum of first included packet - * @first_ttvn: tt-version number of first included packet - * @second_ttl: ttl of second packet - * @second_dest: second receiver of this coded packet - * @second_source: original source of second included packet - * @second_orig_dest: original destination of second included packet - * @second_crc: checksum of second included packet - * @second_ttvn: tt version number of second included packet - * @coded_len: length of network coded part of the payload - */ -struct batadv_coded_packet { - u8 packet_type; - u8 version; /* batman version field */ - u8 ttl; - u8 first_ttvn; - /* u8 first_dest[ETH_ALEN]; - saved in mac header destination */ - u8 first_source[ETH_ALEN]; - u8 first_orig_dest[ETH_ALEN]; - __be32 first_crc; - u8 second_ttl; - u8 second_ttvn; - u8 second_dest[ETH_ALEN]; - u8 second_source[ETH_ALEN]; - u8 second_orig_dest[ETH_ALEN]; - __be32 second_crc; - __be16 coded_len; -}; - -#pragma pack() - -/** - * struct batadv_unicast_tvlv_packet - generic unicast packet with tvlv payload - * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header - * @reserved: reserved field (for packet alignment) - * @src: address of the source - * @dst: address of the destination - * @tvlv_len: length of tvlv data following the unicast tvlv header - * @align: 2 bytes to align the header to a 4 byte boundary - */ -struct batadv_unicast_tvlv_packet { - u8 packet_type; - u8 version; /* batman version field */ - u8 ttl; - u8 reserved; - u8 dst[ETH_ALEN]; - u8 src[ETH_ALEN]; - __be16 tvlv_len; - u16 align; -}; - -/** - * struct batadv_tvlv_hdr - base tvlv header struct - * @type: tvlv container type (see batadv_tvlv_type) - * @version: tvlv container version - * @len: tvlv container length - */ -struct batadv_tvlv_hdr { - u8 type; - u8 version; - __be16 len; -}; - -/** - * struct batadv_tvlv_gateway_data - gateway data propagated through gw tvlv - * container - * @bandwidth_down: advertised uplink download bandwidth - * @bandwidth_up: advertised uplink upload bandwidth - */ -struct batadv_tvlv_gateway_data { - __be32 bandwidth_down; - __be32 bandwidth_up; -}; - -/** - * struct batadv_tvlv_tt_data - tt data propagated through the tt tvlv container - * @flags: translation table flags (see batadv_tt_data_flags) - * @ttvn: translation table version number - * @num_vlan: number of announced VLANs. In the TVLV this struct is followed by - * one batadv_tvlv_tt_vlan_data object per announced vlan - */ -struct batadv_tvlv_tt_data { - u8 flags; - u8 ttvn; - __be16 num_vlan; -}; - -/** - * struct batadv_tvlv_tt_vlan_data - vlan specific tt data propagated through - * the tt tvlv container - * @crc: crc32 checksum of the entries belonging to this vlan - * @vid: vlan identifier - * @reserved: unused, useful for alignment purposes - */ -struct batadv_tvlv_tt_vlan_data { - __be32 crc; - __be16 vid; - u16 reserved; -}; - -/** - * struct batadv_tvlv_tt_change - translation table diff data - * @flags: status indicators concerning the non-mesh client (see - * batadv_tt_client_flags) - * @reserved: reserved field - useful for alignment purposes only - * @addr: mac address of non-mesh client that triggered this tt change - * @vid: VLAN identifier - */ -struct batadv_tvlv_tt_change { - u8 flags; - u8 reserved[3]; - u8 addr[ETH_ALEN]; - __be16 vid; -}; - -/** - * struct batadv_tvlv_roam_adv - roaming advertisement - * @client: mac address of roaming client - * @vid: VLAN identifier - */ -struct batadv_tvlv_roam_adv { - u8 client[ETH_ALEN]; - __be16 vid; -}; - -/** - * struct batadv_tvlv_mcast_data - payload of a multicast tvlv - * @flags: multicast flags announced by the orig node - * @reserved: reserved field - */ -struct batadv_tvlv_mcast_data { - u8 flags; - u8 reserved[3]; -}; - -#endif /* _NET_BATMAN_ADV_PACKET_H_ */ diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 40d9bf3e5bfe..b6891e8b741c 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich @@ -33,6 +34,7 @@ #include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/stddef.h> +#include <uapi/linux/batadv_packet.h> #include "bitarray.h" #include "bridge_loop_avoidance.h" @@ -43,7 +45,6 @@ #include "log.h" #include "network-coding.h" #include "originator.h" -#include "packet.h" #include "send.h" #include "soft-interface.h" #include "tp_meter.h" @@ -54,7 +55,7 @@ static int batadv_route_unicast_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if); /** - * _batadv_update_route - set the router for this originator + * _batadv_update_route() - set the router for this originator * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node which is to be configured * @recv_if: the receive interface for which this route is set @@ -118,7 +119,7 @@ static void _batadv_update_route(struct batadv_priv *bat_priv, } /** - * batadv_update_route - set the router for this originator + * batadv_update_route() - set the router for this originator * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node which is to be configured * @recv_if: the receive interface for which this route is set @@ -145,7 +146,7 @@ out: } /** - * batadv_window_protected - checks whether the host restarted and is in the + * batadv_window_protected() - checks whether the host restarted and is in the * protection time. * @bat_priv: the bat priv with all the soft interface information * @seq_num_diff: difference between the current/received sequence number and @@ -180,6 +181,14 @@ bool batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff, return false; } +/** + * batadv_check_management_packet() - Check preconditions for management packets + * @skb: incoming packet buffer + * @hard_iface: incoming hard interface + * @header_len: minimal header length of packet type + * + * Return: true when management preconditions are met, false otherwise + */ bool batadv_check_management_packet(struct sk_buff *skb, struct batadv_hard_iface *hard_iface, int header_len) @@ -212,7 +221,7 @@ bool batadv_check_management_packet(struct sk_buff *skb, } /** - * batadv_recv_my_icmp_packet - receive an icmp packet locally + * batadv_recv_my_icmp_packet() - receive an icmp packet locally * @bat_priv: the bat priv with all the soft interface information * @skb: icmp packet to process * @@ -347,6 +356,13 @@ out: return ret; } +/** + * batadv_recv_icmp_packet() - Process incoming icmp packet + * @skb: incoming packet buffer + * @recv_if: incoming hard interface + * + * Return: NET_RX_SUCCESS on success or NET_RX_DROP in case of failure + */ int batadv_recv_icmp_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if) { @@ -440,7 +456,7 @@ free_skb: } /** - * batadv_check_unicast_packet - Check for malformed unicast packets + * batadv_check_unicast_packet() - Check for malformed unicast packets * @bat_priv: the bat priv with all the soft interface information * @skb: packet to check * @hdr_size: size of header to pull @@ -478,7 +494,7 @@ static int batadv_check_unicast_packet(struct batadv_priv *bat_priv, } /** - * batadv_last_bonding_get - Get last_bonding_candidate of orig_node + * batadv_last_bonding_get() - Get last_bonding_candidate of orig_node * @orig_node: originator node whose last bonding candidate should be retrieved * * Return: last bonding candidate of router or NULL if not found @@ -501,7 +517,7 @@ batadv_last_bonding_get(struct batadv_orig_node *orig_node) } /** - * batadv_last_bonding_replace - Replace last_bonding_candidate of orig_node + * batadv_last_bonding_replace() - Replace last_bonding_candidate of orig_node * @orig_node: originator node whose bonding candidates should be replaced * @new_candidate: new bonding candidate or NULL */ @@ -524,7 +540,7 @@ batadv_last_bonding_replace(struct batadv_orig_node *orig_node, } /** - * batadv_find_router - find a suitable router for this originator + * batadv_find_router() - find a suitable router for this originator * @bat_priv: the bat priv with all the soft interface information * @orig_node: the destination node * @recv_if: pointer to interface this packet was received on @@ -741,7 +757,7 @@ free_skb: } /** - * batadv_reroute_unicast_packet - update the unicast header for re-routing + * batadv_reroute_unicast_packet() - update the unicast header for re-routing * @bat_priv: the bat priv with all the soft interface information * @unicast_packet: the unicast header to be updated * @dst_addr: the payload destination @@ -904,7 +920,7 @@ static bool batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, } /** - * batadv_recv_unhandled_unicast_packet - receive and process packets which + * batadv_recv_unhandled_unicast_packet() - receive and process packets which * are in the unicast number space but not yet known to the implementation * @skb: unicast tvlv packet to process * @recv_if: pointer to interface this packet was received on @@ -935,6 +951,13 @@ free_skb: return NET_RX_DROP; } +/** + * batadv_recv_unicast_packet() - Process incoming unicast packet + * @skb: incoming packet buffer + * @recv_if: incoming hard interface + * + * Return: NET_RX_SUCCESS on success or NET_RX_DROP in case of failure + */ int batadv_recv_unicast_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if) { @@ -1036,7 +1059,7 @@ free_skb: } /** - * batadv_recv_unicast_tvlv - receive and process unicast tvlv packets + * batadv_recv_unicast_tvlv() - receive and process unicast tvlv packets * @skb: unicast tvlv packet to process * @recv_if: pointer to interface this packet was received on * @@ -1090,7 +1113,7 @@ free_skb: } /** - * batadv_recv_frag_packet - process received fragment + * batadv_recv_frag_packet() - process received fragment * @skb: the received fragment * @recv_if: interface that the skb is received on * @@ -1155,6 +1178,13 @@ free_skb: return ret; } +/** + * batadv_recv_bcast_packet() - Process incoming broadcast packet + * @skb: incoming packet buffer + * @recv_if: incoming hard interface + * + * Return: NET_RX_SUCCESS on success or NET_RX_DROP in case of failure + */ int batadv_recv_bcast_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if) { diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index 5ede16c32f15..a1289bc5f115 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 7895323fd2a7..2a5ab6f1076d 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich @@ -23,7 +24,7 @@ #include <linux/byteorder/generic.h> #include <linux/errno.h> #include <linux/etherdevice.h> -#include <linux/fs.h> +#include <linux/gfp.h> #include <linux/if.h> #include <linux/if_ether.h> #include <linux/jiffies.h> @@ -54,7 +55,7 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work); /** - * batadv_send_skb_packet - send an already prepared packet + * batadv_send_skb_packet() - send an already prepared packet * @skb: the packet to send * @hard_iface: the interface to use to send the broadcast packet * @dst_addr: the payload destination @@ -123,12 +124,30 @@ send_skb_err: return NET_XMIT_DROP; } +/** + * batadv_send_broadcast_skb() - Send broadcast packet via hard interface + * @skb: packet to be transmitted (with batadv header and no outer eth header) + * @hard_iface: outgoing interface + * + * Return: A negative errno code is returned on a failure. A success does not + * guarantee the frame will be transmitted as it may be dropped due + * to congestion or traffic shaping. + */ int batadv_send_broadcast_skb(struct sk_buff *skb, struct batadv_hard_iface *hard_iface) { return batadv_send_skb_packet(skb, hard_iface, batadv_broadcast_addr); } +/** + * batadv_send_unicast_skb() - Send unicast packet to neighbor + * @skb: packet to be transmitted (with batadv header and no outer eth header) + * @neigh: neighbor which is used as next hop to destination + * + * Return: A negative errno code is returned on a failure. A success does not + * guarantee the frame will be transmitted as it may be dropped due + * to congestion or traffic shaping. + */ int batadv_send_unicast_skb(struct sk_buff *skb, struct batadv_neigh_node *neigh) { @@ -153,7 +172,7 @@ int batadv_send_unicast_skb(struct sk_buff *skb, } /** - * batadv_send_skb_to_orig - Lookup next-hop and transmit skb. + * batadv_send_skb_to_orig() - Lookup next-hop and transmit skb. * @skb: Packet to be transmitted. * @orig_node: Final destination of the packet. * @recv_if: Interface used when receiving the packet (can be NULL). @@ -216,7 +235,7 @@ free_skb: } /** - * batadv_send_skb_push_fill_unicast - extend the buffer and initialize the + * batadv_send_skb_push_fill_unicast() - extend the buffer and initialize the * common fields for unicast packets * @skb: the skb carrying the unicast header to initialize * @hdr_size: amount of bytes to push at the beginning of the skb @@ -249,7 +268,7 @@ batadv_send_skb_push_fill_unicast(struct sk_buff *skb, int hdr_size, } /** - * batadv_send_skb_prepare_unicast - encapsulate an skb with a unicast header + * batadv_send_skb_prepare_unicast() - encapsulate an skb with a unicast header * @skb: the skb containing the payload to encapsulate * @orig_node: the destination node * @@ -264,7 +283,7 @@ static bool batadv_send_skb_prepare_unicast(struct sk_buff *skb, } /** - * batadv_send_skb_prepare_unicast_4addr - encapsulate an skb with a + * batadv_send_skb_prepare_unicast_4addr() - encapsulate an skb with a * unicast 4addr header * @bat_priv: the bat priv with all the soft interface information * @skb: the skb containing the payload to encapsulate @@ -308,7 +327,7 @@ out: } /** - * batadv_send_skb_unicast - encapsulate and send an skb via unicast + * batadv_send_skb_unicast() - encapsulate and send an skb via unicast * @bat_priv: the bat priv with all the soft interface information * @skb: payload to send * @packet_type: the batman unicast packet type to use @@ -378,7 +397,7 @@ out: } /** - * batadv_send_skb_via_tt_generic - send an skb via TT lookup + * batadv_send_skb_via_tt_generic() - send an skb via TT lookup * @bat_priv: the bat priv with all the soft interface information * @skb: payload to send * @packet_type: the batman unicast packet type to use @@ -425,7 +444,7 @@ int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv, } /** - * batadv_send_skb_via_gw - send an skb via gateway lookup + * batadv_send_skb_via_gw() - send an skb via gateway lookup * @bat_priv: the bat priv with all the soft interface information * @skb: payload to send * @vid: the vid to be used to search the translation table @@ -452,7 +471,7 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb, } /** - * batadv_forw_packet_free - free a forwarding packet + * batadv_forw_packet_free() - free a forwarding packet * @forw_packet: The packet to free * @dropped: whether the packet is freed because is is dropped * @@ -477,7 +496,7 @@ void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet, } /** - * batadv_forw_packet_alloc - allocate a forwarding packet + * batadv_forw_packet_alloc() - allocate a forwarding packet * @if_incoming: The (optional) if_incoming to be grabbed * @if_outgoing: The (optional) if_outgoing to be grabbed * @queue_left: The (optional) queue counter to decrease @@ -543,7 +562,7 @@ err: } /** - * batadv_forw_packet_was_stolen - check whether someone stole this packet + * batadv_forw_packet_was_stolen() - check whether someone stole this packet * @forw_packet: the forwarding packet to check * * This function checks whether the given forwarding packet was claimed by @@ -558,7 +577,7 @@ batadv_forw_packet_was_stolen(struct batadv_forw_packet *forw_packet) } /** - * batadv_forw_packet_steal - claim a forw_packet for free() + * batadv_forw_packet_steal() - claim a forw_packet for free() * @forw_packet: the forwarding packet to steal * @lock: a key to the store to steal from (e.g. forw_{bat,bcast}_list_lock) * @@ -589,7 +608,7 @@ bool batadv_forw_packet_steal(struct batadv_forw_packet *forw_packet, } /** - * batadv_forw_packet_list_steal - claim a list of forward packets for free() + * batadv_forw_packet_list_steal() - claim a list of forward packets for free() * @forw_list: the to be stolen forward packets * @cleanup_list: a backup pointer, to be able to dispose the packet later * @hard_iface: the interface to steal forward packets from @@ -625,7 +644,7 @@ batadv_forw_packet_list_steal(struct hlist_head *forw_list, } /** - * batadv_forw_packet_list_free - free a list of forward packets + * batadv_forw_packet_list_free() - free a list of forward packets * @head: a list of to be freed forw_packets * * This function cancels the scheduling of any packet in the provided list, @@ -649,7 +668,7 @@ static void batadv_forw_packet_list_free(struct hlist_head *head) } /** - * batadv_forw_packet_queue - try to queue a forwarding packet + * batadv_forw_packet_queue() - try to queue a forwarding packet * @forw_packet: the forwarding packet to queue * @lock: a key to the store (e.g. forw_{bat,bcast}_list_lock) * @head: the shelve to queue it on (e.g. forw_{bat,bcast}_list) @@ -693,7 +712,7 @@ static void batadv_forw_packet_queue(struct batadv_forw_packet *forw_packet, } /** - * batadv_forw_packet_bcast_queue - try to queue a broadcast packet + * batadv_forw_packet_bcast_queue() - try to queue a broadcast packet * @bat_priv: the bat priv with all the soft interface information * @forw_packet: the forwarding packet to queue * @send_time: timestamp (jiffies) when the packet is to be sent @@ -712,7 +731,7 @@ batadv_forw_packet_bcast_queue(struct batadv_priv *bat_priv, } /** - * batadv_forw_packet_ogmv1_queue - try to queue an OGMv1 packet + * batadv_forw_packet_ogmv1_queue() - try to queue an OGMv1 packet * @bat_priv: the bat priv with all the soft interface information * @forw_packet: the forwarding packet to queue * @send_time: timestamp (jiffies) when the packet is to be sent @@ -730,7 +749,7 @@ void batadv_forw_packet_ogmv1_queue(struct batadv_priv *bat_priv, } /** - * batadv_add_bcast_packet_to_list - queue broadcast packet for multiple sends + * batadv_add_bcast_packet_to_list() - queue broadcast packet for multiple sends * @bat_priv: the bat priv with all the soft interface information * @skb: broadcast packet to add * @delay: number of jiffies to wait before sending @@ -790,7 +809,7 @@ err: } /** - * batadv_forw_packet_bcasts_left - check if a retransmission is necessary + * batadv_forw_packet_bcasts_left() - check if a retransmission is necessary * @forw_packet: the forwarding packet to check * @hard_iface: the interface to check on * @@ -818,7 +837,8 @@ batadv_forw_packet_bcasts_left(struct batadv_forw_packet *forw_packet, } /** - * batadv_forw_packet_bcasts_inc - increment retransmission counter of a packet + * batadv_forw_packet_bcasts_inc() - increment retransmission counter of a + * packet * @forw_packet: the packet to increase the counter for */ static void @@ -828,7 +848,7 @@ batadv_forw_packet_bcasts_inc(struct batadv_forw_packet *forw_packet) } /** - * batadv_forw_packet_is_rebroadcast - check packet for previous transmissions + * batadv_forw_packet_is_rebroadcast() - check packet for previous transmissions * @forw_packet: the packet to check * * Return: True if this packet was transmitted before, false otherwise. @@ -953,7 +973,7 @@ out: } /** - * batadv_purge_outstanding_packets - stop/purge scheduled bcast/OGMv1 packets + * batadv_purge_outstanding_packets() - stop/purge scheduled bcast/OGMv1 packets * @bat_priv: the bat priv with all the soft interface information * @hard_iface: the hard interface to cancel and purge bcast/ogm packets on * diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index a16b34f473ef..1e8c79093623 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich @@ -23,8 +24,7 @@ #include <linux/compiler.h> #include <linux/spinlock.h> #include <linux/types.h> - -#include "packet.h" +#include <uapi/linux/batadv_packet.h> struct sk_buff; @@ -76,7 +76,7 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid); /** - * batadv_send_skb_via_tt - send an skb via TT lookup + * batadv_send_skb_via_tt() - send an skb via TT lookup * @bat_priv: the bat priv with all the soft interface information * @skb: the payload to send * @dst_hint: can be used to override the destination contained in the skb @@ -97,7 +97,7 @@ static inline int batadv_send_skb_via_tt(struct batadv_priv *bat_priv, } /** - * batadv_send_skb_via_tt_4addr - send an skb via TT lookup + * batadv_send_skb_via_tt_4addr() - send an skb via TT lookup * @bat_priv: the bat priv with all the soft interface information * @skb: the payload to send * @packet_subtype: the unicast 4addr packet subtype to use diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 9f673cdfecf8..900c5ce21cd4 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich @@ -26,7 +27,7 @@ #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> -#include <linux/fs.h> +#include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/jiffies.h> @@ -48,6 +49,7 @@ #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> +#include <uapi/linux/batadv_packet.h> #include "bat_algo.h" #include "bridge_loop_avoidance.h" @@ -59,11 +61,17 @@ #include "multicast.h" #include "network-coding.h" #include "originator.h" -#include "packet.h" #include "send.h" #include "sysfs.h" #include "translation-table.h" +/** + * batadv_skb_head_push() - Increase header size and move (push) head pointer + * @skb: packet buffer which should be modified + * @len: number of bytes to add + * + * Return: 0 on success or negative error number in case of failure + */ int batadv_skb_head_push(struct sk_buff *skb, unsigned int len) { int result; @@ -96,7 +104,7 @@ static int batadv_interface_release(struct net_device *dev) } /** - * batadv_sum_counter - Sum the cpu-local counters for index 'idx' + * batadv_sum_counter() - Sum the cpu-local counters for index 'idx' * @bat_priv: the bat priv with all the soft interface information * @idx: index of counter to sum up * @@ -169,7 +177,7 @@ static int batadv_interface_change_mtu(struct net_device *dev, int new_mtu) } /** - * batadv_interface_set_rx_mode - set the rx mode of a device + * batadv_interface_set_rx_mode() - set the rx mode of a device * @dev: registered network device to modify * * We do not actually need to set any rx filters for the virtual batman @@ -389,7 +397,7 @@ end: } /** - * batadv_interface_rx - receive ethernet frame on local batman-adv interface + * batadv_interface_rx() - receive ethernet frame on local batman-adv interface * @soft_iface: local interface which will receive the ethernet frame * @skb: ethernet frame for @soft_iface * @hdr_size: size of already parsed batman-adv header @@ -501,8 +509,8 @@ out: } /** - * batadv_softif_vlan_release - release vlan from lists and queue for free after - * rcu grace period + * batadv_softif_vlan_release() - release vlan from lists and queue for free + * after rcu grace period * @ref: kref pointer of the vlan object */ static void batadv_softif_vlan_release(struct kref *ref) @@ -519,7 +527,7 @@ static void batadv_softif_vlan_release(struct kref *ref) } /** - * batadv_softif_vlan_put - decrease the vlan object refcounter and + * batadv_softif_vlan_put() - decrease the vlan object refcounter and * possibly release it * @vlan: the vlan object to release */ @@ -532,7 +540,7 @@ void batadv_softif_vlan_put(struct batadv_softif_vlan *vlan) } /** - * batadv_softif_vlan_get - get the vlan object for a specific vid + * batadv_softif_vlan_get() - get the vlan object for a specific vid * @bat_priv: the bat priv with all the soft interface information * @vid: the identifier of the vlan object to retrieve * @@ -561,7 +569,7 @@ struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, } /** - * batadv_softif_create_vlan - allocate the needed resources for a new vlan + * batadv_softif_create_vlan() - allocate the needed resources for a new vlan * @bat_priv: the bat priv with all the soft interface information * @vid: the VLAN identifier * @@ -613,7 +621,7 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) } /** - * batadv_softif_destroy_vlan - remove and destroy a softif_vlan object + * batadv_softif_destroy_vlan() - remove and destroy a softif_vlan object * @bat_priv: the bat priv with all the soft interface information * @vlan: the object to remove */ @@ -631,7 +639,7 @@ static void batadv_softif_destroy_vlan(struct batadv_priv *bat_priv, } /** - * batadv_interface_add_vid - ndo_add_vid API implementation + * batadv_interface_add_vid() - ndo_add_vid API implementation * @dev: the netdev of the mesh interface * @proto: protocol of the the vlan id * @vid: identifier of the new vlan @@ -689,7 +697,7 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto, } /** - * batadv_interface_kill_vid - ndo_kill_vid API implementation + * batadv_interface_kill_vid() - ndo_kill_vid API implementation * @dev: the netdev of the mesh interface * @proto: protocol of the the vlan id * @vid: identifier of the deleted vlan @@ -732,7 +740,7 @@ static struct lock_class_key batadv_netdev_xmit_lock_key; static struct lock_class_key batadv_netdev_addr_lock_key; /** - * batadv_set_lockdep_class_one - Set lockdep class for a single tx queue + * batadv_set_lockdep_class_one() - Set lockdep class for a single tx queue * @dev: device which owns the tx queue * @txq: tx queue to modify * @_unused: always NULL @@ -745,7 +753,7 @@ static void batadv_set_lockdep_class_one(struct net_device *dev, } /** - * batadv_set_lockdep_class - Set txq and addr_list lockdep class + * batadv_set_lockdep_class() - Set txq and addr_list lockdep class * @dev: network device to modify */ static void batadv_set_lockdep_class(struct net_device *dev) @@ -755,7 +763,7 @@ static void batadv_set_lockdep_class(struct net_device *dev) } /** - * batadv_softif_init_late - late stage initialization of soft interface + * batadv_softif_init_late() - late stage initialization of soft interface * @dev: registered network device to modify * * Return: error code on failures @@ -860,7 +868,7 @@ free_bat_counters: } /** - * batadv_softif_slave_add - Add a slave interface to a batadv_soft_interface + * batadv_softif_slave_add() - Add a slave interface to a batadv_soft_interface * @dev: batadv_soft_interface used as master interface * @slave_dev: net_device which should become the slave interface * @extack: extended ACK report struct @@ -888,7 +896,7 @@ out: } /** - * batadv_softif_slave_del - Delete a slave iface from a batadv_soft_interface + * batadv_softif_slave_del() - Delete a slave iface from a batadv_soft_interface * @dev: batadv_soft_interface used as master interface * @slave_dev: net_device which should be removed from the master interface * @@ -1023,7 +1031,7 @@ static const struct ethtool_ops batadv_ethtool_ops = { }; /** - * batadv_softif_free - Deconstructor of batadv_soft_interface + * batadv_softif_free() - Deconstructor of batadv_soft_interface * @dev: Device to cleanup and remove */ static void batadv_softif_free(struct net_device *dev) @@ -1039,7 +1047,7 @@ static void batadv_softif_free(struct net_device *dev) } /** - * batadv_softif_init_early - early stage initialization of soft interface + * batadv_softif_init_early() - early stage initialization of soft interface * @dev: registered network device to modify */ static void batadv_softif_init_early(struct net_device *dev) @@ -1063,6 +1071,13 @@ static void batadv_softif_init_early(struct net_device *dev) dev->ethtool_ops = &batadv_ethtool_ops; } +/** + * batadv_softif_create() - Create and register soft interface + * @net: the applicable net namespace + * @name: name of the new soft interface + * + * Return: newly allocated soft_interface, NULL on errors + */ struct net_device *batadv_softif_create(struct net *net, const char *name) { struct net_device *soft_iface; @@ -1089,7 +1104,7 @@ struct net_device *batadv_softif_create(struct net *net, const char *name) } /** - * batadv_softif_destroy_sysfs - deletion of batadv_soft_interface via sysfs + * batadv_softif_destroy_sysfs() - deletion of batadv_soft_interface via sysfs * @soft_iface: the to-be-removed batman-adv interface */ void batadv_softif_destroy_sysfs(struct net_device *soft_iface) @@ -1111,7 +1126,8 @@ void batadv_softif_destroy_sysfs(struct net_device *soft_iface) } /** - * batadv_softif_destroy_netlink - deletion of batadv_soft_interface via netlink + * batadv_softif_destroy_netlink() - deletion of batadv_soft_interface via + * netlink * @soft_iface: the to-be-removed batman-adv interface * @head: list pointer */ @@ -1139,6 +1155,12 @@ static void batadv_softif_destroy_netlink(struct net_device *soft_iface, unregister_netdevice_queue(soft_iface, head); } +/** + * batadv_softif_is_valid() - Check whether device is a batadv soft interface + * @net_dev: device which should be checked + * + * Return: true when net_dev is a batman-adv interface, false otherwise + */ bool batadv_softif_is_valid(const struct net_device *net_dev) { if (net_dev->netdev_ops->ndo_start_xmit == batadv_interface_tx) diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index 639c3abb214a..075c5b5b2ce1 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index aa187fd42475..c1578fa0b952 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: * * Marek Lindner @@ -22,10 +23,11 @@ #include <linux/compiler.h> #include <linux/device.h> #include <linux/errno.h> -#include <linux/fs.h> +#include <linux/gfp.h> #include <linux/if.h> #include <linux/if_vlan.h> #include <linux/kernel.h> +#include <linux/kobject.h> #include <linux/kref.h> #include <linux/netdevice.h> #include <linux/printk.h> @@ -37,6 +39,7 @@ #include <linux/string.h> #include <linux/stringify.h> #include <linux/workqueue.h> +#include <uapi/linux/batadv_packet.h> #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" @@ -45,7 +48,6 @@ #include "hard-interface.h" #include "log.h" #include "network-coding.h" -#include "packet.h" #include "soft-interface.h" static struct net_device *batadv_kobj_to_netdev(struct kobject *obj) @@ -63,7 +65,7 @@ static struct batadv_priv *batadv_kobj_to_batpriv(struct kobject *obj) } /** - * batadv_vlan_kobj_to_batpriv - convert a vlan kobj in the associated batpriv + * batadv_vlan_kobj_to_batpriv() - convert a vlan kobj in the associated batpriv * @obj: kobject to covert * * Return: the associated batadv_priv struct. @@ -83,7 +85,7 @@ static struct batadv_priv *batadv_vlan_kobj_to_batpriv(struct kobject *obj) } /** - * batadv_kobj_to_vlan - convert a kobj in the associated softif_vlan struct + * batadv_kobj_to_vlan() - convert a kobj in the associated softif_vlan struct * @bat_priv: the bat priv with all the soft interface information * @obj: kobject to covert * @@ -598,7 +600,7 @@ static ssize_t batadv_store_gw_bwidth(struct kobject *kobj, } /** - * batadv_show_isolation_mark - print the current isolation mark/mask + * batadv_show_isolation_mark() - print the current isolation mark/mask * @kobj: kobject representing the private mesh sysfs directory * @attr: the batman-adv attribute the user is interacting with * @buff: the buffer that will contain the data to send back to the user @@ -616,8 +618,8 @@ static ssize_t batadv_show_isolation_mark(struct kobject *kobj, } /** - * batadv_store_isolation_mark - parse and store the isolation mark/mask entered - * by the user + * batadv_store_isolation_mark() - parse and store the isolation mark/mask + * entered by the user * @kobj: kobject representing the private mesh sysfs directory * @attr: the batman-adv attribute the user is interacting with * @buff: the buffer containing the user data @@ -733,6 +735,12 @@ static struct batadv_attribute *batadv_vlan_attrs[] = { NULL, }; +/** + * batadv_sysfs_add_meshif() - Add soft interface specific sysfs entries + * @dev: netdev struct of the soft interface + * + * Return: 0 on success or negative error number in case of failure + */ int batadv_sysfs_add_meshif(struct net_device *dev) { struct kobject *batif_kobject = &dev->dev.kobj; @@ -773,6 +781,10 @@ out: return -ENOMEM; } +/** + * batadv_sysfs_del_meshif() - Remove soft interface specific sysfs entries + * @dev: netdev struct of the soft interface + */ void batadv_sysfs_del_meshif(struct net_device *dev) { struct batadv_priv *bat_priv = netdev_priv(dev); @@ -788,7 +800,7 @@ void batadv_sysfs_del_meshif(struct net_device *dev) } /** - * batadv_sysfs_add_vlan - add all the needed sysfs objects for the new vlan + * batadv_sysfs_add_vlan() - add all the needed sysfs objects for the new vlan * @dev: netdev of the mesh interface * @vlan: private data of the newly added VLAN interface * @@ -849,7 +861,7 @@ out: } /** - * batadv_sysfs_del_vlan - remove all the sysfs objects for a given VLAN + * batadv_sysfs_del_vlan() - remove all the sysfs objects for a given VLAN * @bat_priv: the bat priv with all the soft interface information * @vlan: the private data of the VLAN to destroy */ @@ -894,7 +906,7 @@ static ssize_t batadv_show_mesh_iface(struct kobject *kobj, } /** - * batadv_store_mesh_iface_finish - store new hardif mesh_iface state + * batadv_store_mesh_iface_finish() - store new hardif mesh_iface state * @net_dev: netdevice to add/remove to/from batman-adv soft-interface * @ifname: name of soft-interface to modify * @@ -947,7 +959,7 @@ out: } /** - * batadv_store_mesh_iface_work - store new hardif mesh_iface state + * batadv_store_mesh_iface_work() - store new hardif mesh_iface state * @work: work queue item * * Changes the parts of the hard+soft interface which can not be modified under @@ -1043,7 +1055,7 @@ static ssize_t batadv_show_iface_status(struct kobject *kobj, #ifdef CONFIG_BATMAN_ADV_BATMAN_V /** - * batadv_store_throughput_override - parse and store throughput override + * batadv_store_throughput_override() - parse and store throughput override * entered by the user * @kobj: kobject representing the private mesh sysfs directory * @attr: the batman-adv attribute the user is interacting with @@ -1130,6 +1142,13 @@ static struct batadv_attribute *batadv_batman_attrs[] = { NULL, }; +/** + * batadv_sysfs_add_hardif() - Add hard interface specific sysfs entries + * @hardif_obj: address where to store the pointer to new sysfs folder + * @dev: netdev struct of the hard interface + * + * Return: 0 on success or negative error number in case of failure + */ int batadv_sysfs_add_hardif(struct kobject **hardif_obj, struct net_device *dev) { struct kobject *hardif_kobject = &dev->dev.kobj; @@ -1164,6 +1183,11 @@ out: return -ENOMEM; } +/** + * batadv_sysfs_del_hardif() - Remove hard interface specific sysfs entries + * @hardif_obj: address to the pointer to which stores batman-adv sysfs folder + * of the hard interface + */ void batadv_sysfs_del_hardif(struct kobject **hardif_obj) { kobject_uevent(*hardif_obj, KOBJ_REMOVE); @@ -1172,6 +1196,16 @@ void batadv_sysfs_del_hardif(struct kobject **hardif_obj) *hardif_obj = NULL; } +/** + * batadv_throw_uevent() - Send an uevent with batman-adv specific env data + * @bat_priv: the bat priv with all the soft interface information + * @type: subsystem type of event. Stored in uevent's BATTYPE + * @action: action type of event. Stored in uevent's BATACTION + * @data: string with additional information to the event (ignored for + * BATADV_UEV_DEL). Stored in uevent's BATDATA + * + * Return: 0 on success or negative error number in case of failure + */ int batadv_throw_uevent(struct batadv_priv *bat_priv, enum batadv_uev_type type, enum batadv_uev_action action, const char *data) { diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h index e487412e256b..bbeee61221fa 100644 --- a/net/batman-adv/sysfs.h +++ b/net/batman-adv/sysfs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: * * Marek Lindner @@ -35,10 +36,23 @@ struct net_device; */ #define BATADV_SYSFS_VLAN_SUBDIR_PREFIX "vlan" +/** + * struct batadv_attribute - sysfs export helper for batman-adv attributes + */ struct batadv_attribute { + /** @attr: sysfs attribute file */ struct attribute attr; + + /** + * @show: function to export the current attribute's content to sysfs + */ ssize_t (*show)(struct kobject *kobj, struct attribute *attr, char *buf); + + /** + * @store: function to load new value from character buffer and save it + * in batman-adv attribute + */ ssize_t (*store)(struct kobject *kobj, struct attribute *attr, char *buf, size_t count); }; diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index 15cd2139381e..8b576712d0c1 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors: * * Edo Monticelli, Antonio Quartulli @@ -19,13 +20,13 @@ #include "main.h" #include <linux/atomic.h> -#include <linux/bug.h> +#include <linux/build_bug.h> #include <linux/byteorder/generic.h> #include <linux/cache.h> #include <linux/compiler.h> #include <linux/err.h> #include <linux/etherdevice.h> -#include <linux/fs.h> +#include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/init.h> #include <linux/jiffies.h> @@ -48,13 +49,13 @@ #include <linux/timer.h> #include <linux/wait.h> #include <linux/workqueue.h> +#include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "hard-interface.h" #include "log.h" #include "netlink.h" #include "originator.h" -#include "packet.h" #include "send.h" /** @@ -97,7 +98,7 @@ static u8 batadv_tp_prerandom[4096] __read_mostly; /** - * batadv_tp_session_cookie - generate session cookie based on session ids + * batadv_tp_session_cookie() - generate session cookie based on session ids * @session: TP session identifier * @icmp_uid: icmp pseudo uid of the tp session * @@ -115,7 +116,7 @@ static u32 batadv_tp_session_cookie(const u8 session[2], u8 icmp_uid) } /** - * batadv_tp_cwnd - compute the new cwnd size + * batadv_tp_cwnd() - compute the new cwnd size * @base: base cwnd size value * @increment: the value to add to base to get the new size * @min: minumim cwnd value (usually MSS) @@ -140,7 +141,7 @@ static u32 batadv_tp_cwnd(u32 base, u32 increment, u32 min) } /** - * batadv_tp_updated_cwnd - update the Congestion Windows + * batadv_tp_updated_cwnd() - update the Congestion Windows * @tp_vars: the private data of the current TP meter session * @mss: maximum segment size of transmission * @@ -176,7 +177,7 @@ static void batadv_tp_update_cwnd(struct batadv_tp_vars *tp_vars, u32 mss) } /** - * batadv_tp_update_rto - calculate new retransmission timeout + * batadv_tp_update_rto() - calculate new retransmission timeout * @tp_vars: the private data of the current TP meter session * @new_rtt: new roundtrip time in msec */ @@ -212,7 +213,7 @@ static void batadv_tp_update_rto(struct batadv_tp_vars *tp_vars, } /** - * batadv_tp_batctl_notify - send client status result to client + * batadv_tp_batctl_notify() - send client status result to client * @reason: reason for tp meter session stop * @dst: destination of tp_meter session * @bat_priv: the bat priv with all the soft interface information @@ -244,7 +245,7 @@ static void batadv_tp_batctl_notify(enum batadv_tp_meter_reason reason, } /** - * batadv_tp_batctl_error_notify - send client error result to client + * batadv_tp_batctl_error_notify() - send client error result to client * @reason: reason for tp meter session stop * @dst: destination of tp_meter session * @bat_priv: the bat priv with all the soft interface information @@ -259,7 +260,7 @@ static void batadv_tp_batctl_error_notify(enum batadv_tp_meter_reason reason, } /** - * batadv_tp_list_find - find a tp_vars object in the global list + * batadv_tp_list_find() - find a tp_vars object in the global list * @bat_priv: the bat priv with all the soft interface information * @dst: the other endpoint MAC address to look for * @@ -294,7 +295,8 @@ static struct batadv_tp_vars *batadv_tp_list_find(struct batadv_priv *bat_priv, } /** - * batadv_tp_list_find_session - find tp_vars session object in the global list + * batadv_tp_list_find_session() - find tp_vars session object in the global + * list * @bat_priv: the bat priv with all the soft interface information * @dst: the other endpoint MAC address to look for * @session: session identifier @@ -335,7 +337,7 @@ batadv_tp_list_find_session(struct batadv_priv *bat_priv, const u8 *dst, } /** - * batadv_tp_vars_release - release batadv_tp_vars from lists and queue for + * batadv_tp_vars_release() - release batadv_tp_vars from lists and queue for * free after rcu grace period * @ref: kref pointer of the batadv_tp_vars */ @@ -360,7 +362,7 @@ static void batadv_tp_vars_release(struct kref *ref) } /** - * batadv_tp_vars_put - decrement the batadv_tp_vars refcounter and possibly + * batadv_tp_vars_put() - decrement the batadv_tp_vars refcounter and possibly * release it * @tp_vars: the private data of the current TP meter session to be free'd */ @@ -370,7 +372,7 @@ static void batadv_tp_vars_put(struct batadv_tp_vars *tp_vars) } /** - * batadv_tp_sender_cleanup - cleanup sender data and drop and timer + * batadv_tp_sender_cleanup() - cleanup sender data and drop and timer * @bat_priv: the bat priv with all the soft interface information * @tp_vars: the private data of the current TP meter session to cleanup */ @@ -400,7 +402,7 @@ static void batadv_tp_sender_cleanup(struct batadv_priv *bat_priv, } /** - * batadv_tp_sender_end - print info about ended session and inform client + * batadv_tp_sender_end() - print info about ended session and inform client * @bat_priv: the bat priv with all the soft interface information * @tp_vars: the private data of the current TP meter session */ @@ -433,7 +435,7 @@ static void batadv_tp_sender_end(struct batadv_priv *bat_priv, } /** - * batadv_tp_sender_shutdown - let sender thread/timer stop gracefully + * batadv_tp_sender_shutdown() - let sender thread/timer stop gracefully * @tp_vars: the private data of the current TP meter session * @reason: reason for tp meter session stop */ @@ -447,7 +449,7 @@ static void batadv_tp_sender_shutdown(struct batadv_tp_vars *tp_vars, } /** - * batadv_tp_sender_finish - stop sender session after test_length was reached + * batadv_tp_sender_finish() - stop sender session after test_length was reached * @work: delayed work reference of the related tp_vars */ static void batadv_tp_sender_finish(struct work_struct *work) @@ -463,7 +465,7 @@ static void batadv_tp_sender_finish(struct work_struct *work) } /** - * batadv_tp_reset_sender_timer - reschedule the sender timer + * batadv_tp_reset_sender_timer() - reschedule the sender timer * @tp_vars: the private TP meter data for this session * * Reschedule the timer using tp_vars->rto as delay @@ -481,8 +483,8 @@ static void batadv_tp_reset_sender_timer(struct batadv_tp_vars *tp_vars) } /** - * batadv_tp_sender_timeout - timer that fires in case of packet loss - * @arg: address of the related tp_vars + * batadv_tp_sender_timeout() - timer that fires in case of packet loss + * @t: address to timer_list inside tp_vars * * If fired it means that there was packet loss. * Switch to Slow Start, set the ss_threshold to half of the current cwnd and @@ -531,7 +533,7 @@ static void batadv_tp_sender_timeout(struct timer_list *t) } /** - * batadv_tp_fill_prerandom - Fill buffer with prefetched random bytes + * batadv_tp_fill_prerandom() - Fill buffer with prefetched random bytes * @tp_vars: the private TP meter data for this session * @buf: Buffer to fill with bytes * @nbytes: amount of pseudorandom bytes @@ -563,7 +565,7 @@ static void batadv_tp_fill_prerandom(struct batadv_tp_vars *tp_vars, } /** - * batadv_tp_send_msg - send a single message + * batadv_tp_send_msg() - send a single message * @tp_vars: the private TP meter data for this session * @src: source mac address * @orig_node: the originator of the destination @@ -623,7 +625,7 @@ static int batadv_tp_send_msg(struct batadv_tp_vars *tp_vars, const u8 *src, } /** - * batadv_tp_recv_ack - ACK receiving function + * batadv_tp_recv_ack() - ACK receiving function * @bat_priv: the bat priv with all the soft interface information * @skb: the buffer containing the received packet * @@ -765,7 +767,7 @@ out: } /** - * batadv_tp_avail - check if congestion window is not full + * batadv_tp_avail() - check if congestion window is not full * @tp_vars: the private data of the current TP meter session * @payload_len: size of the payload of a single message * @@ -783,7 +785,7 @@ static bool batadv_tp_avail(struct batadv_tp_vars *tp_vars, } /** - * batadv_tp_wait_available - wait until congestion window becomes free or + * batadv_tp_wait_available() - wait until congestion window becomes free or * timeout is reached * @tp_vars: the private data of the current TP meter session * @plen: size of the payload of a single message @@ -805,7 +807,7 @@ static int batadv_tp_wait_available(struct batadv_tp_vars *tp_vars, size_t plen) } /** - * batadv_tp_send - main sending thread of a tp meter session + * batadv_tp_send() - main sending thread of a tp meter session * @arg: address of the related tp_vars * * Return: nothing, this function never returns @@ -904,7 +906,8 @@ out: } /** - * batadv_tp_start_kthread - start new thread which manages the tp meter sender + * batadv_tp_start_kthread() - start new thread which manages the tp meter + * sender * @tp_vars: the private data of the current TP meter session */ static void batadv_tp_start_kthread(struct batadv_tp_vars *tp_vars) @@ -935,7 +938,7 @@ static void batadv_tp_start_kthread(struct batadv_tp_vars *tp_vars) } /** - * batadv_tp_start - start a new tp meter session + * batadv_tp_start() - start a new tp meter session * @bat_priv: the bat priv with all the soft interface information * @dst: the receiver MAC address * @test_length: test length in milliseconds @@ -1060,7 +1063,7 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, } /** - * batadv_tp_stop - stop currently running tp meter session + * batadv_tp_stop() - stop currently running tp meter session * @bat_priv: the bat priv with all the soft interface information * @dst: the receiver MAC address * @return_value: reason for tp meter session stop @@ -1092,7 +1095,7 @@ out: } /** - * batadv_tp_reset_receiver_timer - reset the receiver shutdown timer + * batadv_tp_reset_receiver_timer() - reset the receiver shutdown timer * @tp_vars: the private data of the current TP meter session * * start the receiver shutdown timer or reset it if already started @@ -1104,9 +1107,9 @@ static void batadv_tp_reset_receiver_timer(struct batadv_tp_vars *tp_vars) } /** - * batadv_tp_receiver_shutdown - stop a tp meter receiver when timeout is + * batadv_tp_receiver_shutdown() - stop a tp meter receiver when timeout is * reached without received ack - * @arg: address of the related tp_vars + * @t: address to timer_list inside tp_vars */ static void batadv_tp_receiver_shutdown(struct timer_list *t) { @@ -1149,7 +1152,7 @@ static void batadv_tp_receiver_shutdown(struct timer_list *t) } /** - * batadv_tp_send_ack - send an ACK packet + * batadv_tp_send_ack() - send an ACK packet * @bat_priv: the bat priv with all the soft interface information * @dst: the mac address of the destination originator * @seq: the sequence number to ACK @@ -1221,7 +1224,7 @@ out: } /** - * batadv_tp_handle_out_of_order - store an out of order packet + * batadv_tp_handle_out_of_order() - store an out of order packet * @tp_vars: the private data of the current TP meter session * @skb: the buffer containing the received packet * @@ -1297,7 +1300,7 @@ out: } /** - * batadv_tp_ack_unordered - update number received bytes in current stream + * batadv_tp_ack_unordered() - update number received bytes in current stream * without gaps * @tp_vars: the private data of the current TP meter session */ @@ -1330,7 +1333,7 @@ static void batadv_tp_ack_unordered(struct batadv_tp_vars *tp_vars) } /** - * batadv_tp_init_recv - return matching or create new receiver tp_vars + * batadv_tp_init_recv() - return matching or create new receiver tp_vars * @bat_priv: the bat priv with all the soft interface information * @icmp: received icmp tp msg * @@ -1383,7 +1386,7 @@ out_unlock: } /** - * batadv_tp_recv_msg - process a single data message + * batadv_tp_recv_msg() - process a single data message * @bat_priv: the bat priv with all the soft interface information * @skb: the buffer containing the received packet * @@ -1468,7 +1471,7 @@ out: } /** - * batadv_tp_meter_recv - main TP Meter receiving function + * batadv_tp_meter_recv() - main TP Meter receiving function * @bat_priv: the bat priv with all the soft interface information * @skb: the buffer containing the received packet */ @@ -1494,7 +1497,7 @@ void batadv_tp_meter_recv(struct batadv_priv *bat_priv, struct sk_buff *skb) } /** - * batadv_tp_meter_init - initialize global tp_meter structures + * batadv_tp_meter_init() - initialize global tp_meter structures */ void __init batadv_tp_meter_init(void) { diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h index a8ada5c123bd..c8b8f2cb2c2b 100644 --- a/net/batman-adv/tp_meter.h +++ b/net/batman-adv/tp_meter.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors: * * Edo Monticelli, Antonio Quartulli diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 8a3ce79b1307..7550a9ccd695 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli @@ -20,14 +21,14 @@ #include <linux/atomic.h> #include <linux/bitops.h> -#include <linux/bug.h> +#include <linux/build_bug.h> #include <linux/byteorder/generic.h> #include <linux/cache.h> #include <linux/compiler.h> #include <linux/crc32c.h> #include <linux/errno.h> #include <linux/etherdevice.h> -#include <linux/fs.h> +#include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/init.h> #include <linux/jhash.h> @@ -36,6 +37,7 @@ #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> +#include <linux/net.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/rculist.h> @@ -50,6 +52,7 @@ #include <net/genetlink.h> #include <net/netlink.h> #include <net/sock.h> +#include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bridge_loop_avoidance.h" @@ -58,7 +61,6 @@ #include "log.h" #include "netlink.h" #include "originator.h" -#include "packet.h" #include "soft-interface.h" #include "tvlv.h" @@ -86,7 +88,7 @@ static void batadv_tt_global_del(struct batadv_priv *bat_priv, bool roaming); /** - * batadv_compare_tt - check if two TT entries are the same + * batadv_compare_tt() - check if two TT entries are the same * @node: the list element pointer of the first TT entry * @data2: pointer to the tt_common_entry of the second TT entry * @@ -105,7 +107,7 @@ static bool batadv_compare_tt(const struct hlist_node *node, const void *data2) } /** - * batadv_choose_tt - return the index of the tt entry in the hash table + * batadv_choose_tt() - return the index of the tt entry in the hash table * @data: pointer to the tt_common_entry object to map * @size: the size of the hash table * @@ -125,7 +127,7 @@ static inline u32 batadv_choose_tt(const void *data, u32 size) } /** - * batadv_tt_hash_find - look for a client in the given hash table + * batadv_tt_hash_find() - look for a client in the given hash table * @hash: the hash table to search * @addr: the mac address of the client to look for * @vid: VLAN identifier @@ -170,7 +172,7 @@ batadv_tt_hash_find(struct batadv_hashtable *hash, const u8 *addr, } /** - * batadv_tt_local_hash_find - search the local table for a given client + * batadv_tt_local_hash_find() - search the local table for a given client * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the client to look for * @vid: VLAN identifier @@ -195,7 +197,7 @@ batadv_tt_local_hash_find(struct batadv_priv *bat_priv, const u8 *addr, } /** - * batadv_tt_global_hash_find - search the global table for a given client + * batadv_tt_global_hash_find() - search the global table for a given client * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the client to look for * @vid: VLAN identifier @@ -220,7 +222,7 @@ batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr, } /** - * batadv_tt_local_entry_free_rcu - free the tt_local_entry + * batadv_tt_local_entry_free_rcu() - free the tt_local_entry * @rcu: rcu pointer of the tt_local_entry */ static void batadv_tt_local_entry_free_rcu(struct rcu_head *rcu) @@ -234,7 +236,7 @@ static void batadv_tt_local_entry_free_rcu(struct rcu_head *rcu) } /** - * batadv_tt_local_entry_release - release tt_local_entry from lists and queue + * batadv_tt_local_entry_release() - release tt_local_entry from lists and queue * for free after rcu grace period * @ref: kref pointer of the nc_node */ @@ -251,7 +253,7 @@ static void batadv_tt_local_entry_release(struct kref *ref) } /** - * batadv_tt_local_entry_put - decrement the tt_local_entry refcounter and + * batadv_tt_local_entry_put() - decrement the tt_local_entry refcounter and * possibly release it * @tt_local_entry: tt_local_entry to be free'd */ @@ -263,7 +265,7 @@ batadv_tt_local_entry_put(struct batadv_tt_local_entry *tt_local_entry) } /** - * batadv_tt_global_entry_free_rcu - free the tt_global_entry + * batadv_tt_global_entry_free_rcu() - free the tt_global_entry * @rcu: rcu pointer of the tt_global_entry */ static void batadv_tt_global_entry_free_rcu(struct rcu_head *rcu) @@ -277,8 +279,8 @@ static void batadv_tt_global_entry_free_rcu(struct rcu_head *rcu) } /** - * batadv_tt_global_entry_release - release tt_global_entry from lists and queue - * for free after rcu grace period + * batadv_tt_global_entry_release() - release tt_global_entry from lists and + * queue for free after rcu grace period * @ref: kref pointer of the nc_node */ static void batadv_tt_global_entry_release(struct kref *ref) @@ -294,7 +296,7 @@ static void batadv_tt_global_entry_release(struct kref *ref) } /** - * batadv_tt_global_entry_put - decrement the tt_global_entry refcounter and + * batadv_tt_global_entry_put() - decrement the tt_global_entry refcounter and * possibly release it * @tt_global_entry: tt_global_entry to be free'd */ @@ -306,7 +308,7 @@ batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry) } /** - * batadv_tt_global_hash_count - count the number of orig entries + * batadv_tt_global_hash_count() - count the number of orig entries * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the client to count entries for * @vid: VLAN identifier @@ -331,8 +333,8 @@ int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, } /** - * batadv_tt_local_size_mod - change the size by v of the local table identified - * by vid + * batadv_tt_local_size_mod() - change the size by v of the local table + * identified by vid * @bat_priv: the bat priv with all the soft interface information * @vid: the VLAN identifier of the sub-table to change * @v: the amount to sum to the local table size @@ -352,8 +354,8 @@ static void batadv_tt_local_size_mod(struct batadv_priv *bat_priv, } /** - * batadv_tt_local_size_inc - increase by one the local table size for the given - * vid + * batadv_tt_local_size_inc() - increase by one the local table size for the + * given vid * @bat_priv: the bat priv with all the soft interface information * @vid: the VLAN identifier */ @@ -364,8 +366,8 @@ static void batadv_tt_local_size_inc(struct batadv_priv *bat_priv, } /** - * batadv_tt_local_size_dec - decrease by one the local table size for the given - * vid + * batadv_tt_local_size_dec() - decrease by one the local table size for the + * given vid * @bat_priv: the bat priv with all the soft interface information * @vid: the VLAN identifier */ @@ -376,7 +378,7 @@ static void batadv_tt_local_size_dec(struct batadv_priv *bat_priv, } /** - * batadv_tt_global_size_mod - change the size by v of the global table + * batadv_tt_global_size_mod() - change the size by v of the global table * for orig_node identified by vid * @orig_node: the originator for which the table has to be modified * @vid: the VLAN identifier @@ -404,7 +406,7 @@ static void batadv_tt_global_size_mod(struct batadv_orig_node *orig_node, } /** - * batadv_tt_global_size_inc - increase by one the global table size for the + * batadv_tt_global_size_inc() - increase by one the global table size for the * given vid * @orig_node: the originator which global table size has to be decreased * @vid: the vlan identifier @@ -416,7 +418,7 @@ static void batadv_tt_global_size_inc(struct batadv_orig_node *orig_node, } /** - * batadv_tt_global_size_dec - decrease by one the global table size for the + * batadv_tt_global_size_dec() - decrease by one the global table size for the * given vid * @orig_node: the originator which global table size has to be decreased * @vid: the vlan identifier @@ -428,7 +430,7 @@ static void batadv_tt_global_size_dec(struct batadv_orig_node *orig_node, } /** - * batadv_tt_orig_list_entry_free_rcu - free the orig_entry + * batadv_tt_orig_list_entry_free_rcu() - free the orig_entry * @rcu: rcu pointer of the orig_entry */ static void batadv_tt_orig_list_entry_free_rcu(struct rcu_head *rcu) @@ -441,7 +443,7 @@ static void batadv_tt_orig_list_entry_free_rcu(struct rcu_head *rcu) } /** - * batadv_tt_orig_list_entry_release - release tt orig entry from lists and + * batadv_tt_orig_list_entry_release() - release tt orig entry from lists and * queue for free after rcu grace period * @ref: kref pointer of the tt orig entry */ @@ -457,7 +459,7 @@ static void batadv_tt_orig_list_entry_release(struct kref *ref) } /** - * batadv_tt_orig_list_entry_put - decrement the tt orig entry refcounter and + * batadv_tt_orig_list_entry_put() - decrement the tt orig entry refcounter and * possibly release it * @orig_entry: tt orig entry to be free'd */ @@ -468,7 +470,7 @@ batadv_tt_orig_list_entry_put(struct batadv_tt_orig_list_entry *orig_entry) } /** - * batadv_tt_local_event - store a local TT event (ADD/DEL) + * batadv_tt_local_event() - store a local TT event (ADD/DEL) * @bat_priv: the bat priv with all the soft interface information * @tt_local_entry: the TT entry involved in the event * @event_flags: flags to store in the event structure @@ -543,7 +545,7 @@ unlock: } /** - * batadv_tt_len - compute length in bytes of given number of tt changes + * batadv_tt_len() - compute length in bytes of given number of tt changes * @changes_num: number of tt changes * * Return: computed length in bytes. @@ -554,7 +556,7 @@ static int batadv_tt_len(int changes_num) } /** - * batadv_tt_entries - compute the number of entries fitting in tt_len bytes + * batadv_tt_entries() - compute the number of entries fitting in tt_len bytes * @tt_len: available space * * Return: the number of entries. @@ -565,8 +567,8 @@ static u16 batadv_tt_entries(u16 tt_len) } /** - * batadv_tt_local_table_transmit_size - calculates the local translation table - * size when transmitted over the air + * batadv_tt_local_table_transmit_size() - calculates the local translation + * table size when transmitted over the air * @bat_priv: the bat priv with all the soft interface information * * Return: local translation table size in bytes. @@ -625,7 +627,7 @@ static void batadv_tt_global_free(struct batadv_priv *bat_priv, } /** - * batadv_tt_local_add - add a new client to the local table or update an + * batadv_tt_local_add() - add a new client to the local table or update an * existing client * @soft_iface: netdev struct of the mesh interface * @addr: the mac address of the client to add @@ -830,7 +832,7 @@ out: } /** - * batadv_tt_prepare_tvlv_global_data - prepare the TVLV TT header to send + * batadv_tt_prepare_tvlv_global_data() - prepare the TVLV TT header to send * within a TT Response directed to another node * @orig_node: originator for which the TT data has to be prepared * @tt_data: uninitialised pointer to the address of the TVLV buffer @@ -903,8 +905,8 @@ out: } /** - * batadv_tt_prepare_tvlv_local_data - allocate and prepare the TT TVLV for this - * node + * batadv_tt_prepare_tvlv_local_data() - allocate and prepare the TT TVLV for + * this node * @bat_priv: the bat priv with all the soft interface information * @tt_data: uninitialised pointer to the address of the TVLV buffer * @tt_change: uninitialised pointer to the address of the area where the TT @@ -977,8 +979,8 @@ out: } /** - * batadv_tt_tvlv_container_update - update the translation table tvlv container - * after local tt changes have been committed + * batadv_tt_tvlv_container_update() - update the translation table tvlv + * container after local tt changes have been committed * @bat_priv: the bat priv with all the soft interface information */ static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv) @@ -1053,6 +1055,14 @@ container_register: } #ifdef CONFIG_BATMAN_ADV_DEBUGFS + +/** + * batadv_tt_local_seq_print_text() - Print the local tt table in a seq file + * @seq: seq file to print on + * @offset: not used + * + * Return: always 0 + */ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) { struct net_device *net_dev = (struct net_device *)seq->private; @@ -1123,7 +1133,7 @@ out: #endif /** - * batadv_tt_local_dump_entry - Dump one TT local entry into a message + * batadv_tt_local_dump_entry() - Dump one TT local entry into a message * @msg :Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message @@ -1179,7 +1189,7 @@ batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, } /** - * batadv_tt_local_dump_bucket - Dump one TT local bucket into a message + * batadv_tt_local_dump_bucket() - Dump one TT local bucket into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message @@ -1216,7 +1226,7 @@ batadv_tt_local_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, } /** - * batadv_tt_local_dump - Dump TT local entries into a message + * batadv_tt_local_dump() - Dump TT local entries into a message * @msg: Netlink message to dump into * @cb: Parameters from query * @@ -1300,7 +1310,7 @@ batadv_tt_local_set_pending(struct batadv_priv *bat_priv, } /** - * batadv_tt_local_remove - logically remove an entry from the local table + * batadv_tt_local_remove() - logically remove an entry from the local table * @bat_priv: the bat priv with all the soft interface information * @addr: the MAC address of the client to remove * @vid: VLAN identifier @@ -1362,7 +1372,7 @@ out: } /** - * batadv_tt_local_purge_list - purge inactive tt local entries + * batadv_tt_local_purge_list() - purge inactive tt local entries * @bat_priv: the bat priv with all the soft interface information * @head: pointer to the list containing the local tt entries * @timeout: parameter deciding whether a given tt local entry is considered @@ -1397,7 +1407,7 @@ static void batadv_tt_local_purge_list(struct batadv_priv *bat_priv, } /** - * batadv_tt_local_purge - purge inactive tt local entries + * batadv_tt_local_purge() - purge inactive tt local entries * @bat_priv: the bat priv with all the soft interface information * @timeout: parameter deciding whether a given tt local entry is considered * inactive or not @@ -1490,7 +1500,7 @@ static void batadv_tt_changes_list_free(struct batadv_priv *bat_priv) } /** - * batadv_tt_global_orig_entry_find - find a TT orig_list_entry + * batadv_tt_global_orig_entry_find() - find a TT orig_list_entry * @entry: the TT global entry where the orig_list_entry has to be * extracted from * @orig_node: the originator for which the orig_list_entry has to be found @@ -1524,8 +1534,8 @@ batadv_tt_global_orig_entry_find(const struct batadv_tt_global_entry *entry, } /** - * batadv_tt_global_entry_has_orig - check if a TT global entry is also handled - * by a given originator + * batadv_tt_global_entry_has_orig() - check if a TT global entry is also + * handled by a given originator * @entry: the TT global entry to check * @orig_node: the originator to search in the list * @@ -1550,7 +1560,7 @@ batadv_tt_global_entry_has_orig(const struct batadv_tt_global_entry *entry, } /** - * batadv_tt_global_sync_flags - update TT sync flags + * batadv_tt_global_sync_flags() - update TT sync flags * @tt_global: the TT global entry to update sync flags in * * Updates the sync flag bits in the tt_global flag attribute with a logical @@ -1574,7 +1584,7 @@ batadv_tt_global_sync_flags(struct batadv_tt_global_entry *tt_global) } /** - * batadv_tt_global_orig_entry_add - add or update a TT orig entry + * batadv_tt_global_orig_entry_add() - add or update a TT orig entry * @tt_global: the TT global entry to add an orig entry in * @orig_node: the originator to add an orig entry for * @ttvn: translation table version number of this changeset @@ -1624,7 +1634,7 @@ out: } /** - * batadv_tt_global_add - add a new TT global entry or update an existing one + * batadv_tt_global_add() - add a new TT global entry or update an existing one * @bat_priv: the bat priv with all the soft interface information * @orig_node: the originator announcing the client * @tt_addr: the mac address of the non-mesh client @@ -1796,7 +1806,7 @@ out: } /** - * batadv_transtable_best_orig - Get best originator list entry from tt entry + * batadv_transtable_best_orig() - Get best originator list entry from tt entry * @bat_priv: the bat priv with all the soft interface information * @tt_global_entry: global translation table entry to be analyzed * @@ -1842,8 +1852,8 @@ batadv_transtable_best_orig(struct batadv_priv *bat_priv, #ifdef CONFIG_BATMAN_ADV_DEBUGFS /** - * batadv_tt_global_print_entry - print all orig nodes who announce the address - * for this global entry + * batadv_tt_global_print_entry() - print all orig nodes who announce the + * address for this global entry * @bat_priv: the bat priv with all the soft interface information * @tt_global_entry: global translation table entry to be printed * @seq: debugfs table seq_file struct @@ -1925,6 +1935,13 @@ print_list: } } +/** + * batadv_tt_global_seq_print_text() - Print the global tt table in a seq file + * @seq: seq file to print on + * @offset: not used + * + * Return: always 0 + */ int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset) { struct net_device *net_dev = (struct net_device *)seq->private; @@ -1967,7 +1984,7 @@ out: #endif /** - * batadv_tt_global_dump_subentry - Dump all TT local entries into a message + * batadv_tt_global_dump_subentry() - Dump all TT local entries into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message @@ -2028,7 +2045,7 @@ batadv_tt_global_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, } /** - * batadv_tt_global_dump_entry - Dump one TT global entry into a message + * batadv_tt_global_dump_entry() - Dump one TT global entry into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message @@ -2073,7 +2090,7 @@ batadv_tt_global_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, } /** - * batadv_tt_global_dump_bucket - Dump one TT local bucket into a message + * batadv_tt_global_dump_bucket() - Dump one TT local bucket into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message @@ -2112,7 +2129,7 @@ batadv_tt_global_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, } /** - * batadv_tt_global_dump - Dump TT global entries into a message + * batadv_tt_global_dump() - Dump TT global entries into a message * @msg: Netlink message to dump into * @cb: Parameters from query * @@ -2180,7 +2197,7 @@ int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb) } /** - * _batadv_tt_global_del_orig_entry - remove and free an orig_entry + * _batadv_tt_global_del_orig_entry() - remove and free an orig_entry * @tt_global_entry: the global entry to remove the orig_entry from * @orig_entry: the orig entry to remove and free * @@ -2222,7 +2239,7 @@ batadv_tt_global_del_orig_list(struct batadv_tt_global_entry *tt_global_entry) } /** - * batadv_tt_global_del_orig_node - remove orig_node from a global tt entry + * batadv_tt_global_del_orig_node() - remove orig_node from a global tt entry * @bat_priv: the bat priv with all the soft interface information * @tt_global_entry: the global entry to remove the orig_node from * @orig_node: the originator announcing the client @@ -2301,7 +2318,7 @@ batadv_tt_global_del_roaming(struct batadv_priv *bat_priv, } /** - * batadv_tt_global_del - remove a client from the global table + * batadv_tt_global_del() - remove a client from the global table * @bat_priv: the bat priv with all the soft interface information * @orig_node: an originator serving this client * @addr: the mac address of the client @@ -2367,8 +2384,8 @@ out: } /** - * batadv_tt_global_del_orig - remove all the TT global entries belonging to the - * given originator matching the provided vid + * batadv_tt_global_del_orig() - remove all the TT global entries belonging to + * the given originator matching the provided vid * @bat_priv: the bat priv with all the soft interface information * @orig_node: the originator owning the entries to remove * @match_vid: the VLAN identifier to match. If negative all the entries will be @@ -2539,7 +2556,7 @@ _batadv_is_ap_isolated(struct batadv_tt_local_entry *tt_local_entry, } /** - * batadv_transtable_search - get the mesh destination for a given client + * batadv_transtable_search() - get the mesh destination for a given client * @bat_priv: the bat priv with all the soft interface information * @src: mac address of the source client * @addr: mac address of the destination client @@ -2599,7 +2616,7 @@ out: } /** - * batadv_tt_global_crc - calculates the checksum of the local table belonging + * batadv_tt_global_crc() - calculates the checksum of the local table belonging * to the given orig_node * @bat_priv: the bat priv with all the soft interface information * @orig_node: originator for which the CRC should be computed @@ -2694,7 +2711,7 @@ static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv, } /** - * batadv_tt_local_crc - calculates the checksum of the local table + * batadv_tt_local_crc() - calculates the checksum of the local table * @bat_priv: the bat priv with all the soft interface information * @vid: VLAN identifier for which the CRC32 has to be computed * @@ -2751,7 +2768,7 @@ static u32 batadv_tt_local_crc(struct batadv_priv *bat_priv, } /** - * batadv_tt_req_node_release - free tt_req node entry + * batadv_tt_req_node_release() - free tt_req node entry * @ref: kref pointer of the tt req_node entry */ static void batadv_tt_req_node_release(struct kref *ref) @@ -2764,7 +2781,7 @@ static void batadv_tt_req_node_release(struct kref *ref) } /** - * batadv_tt_req_node_put - decrement the tt_req_node refcounter and + * batadv_tt_req_node_put() - decrement the tt_req_node refcounter and * possibly release it * @tt_req_node: tt_req_node to be free'd */ @@ -2826,7 +2843,7 @@ static void batadv_tt_req_purge(struct batadv_priv *bat_priv) } /** - * batadv_tt_req_node_new - search and possibly create a tt_req_node object + * batadv_tt_req_node_new() - search and possibly create a tt_req_node object * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node this request is being issued for * @@ -2863,7 +2880,7 @@ unlock: } /** - * batadv_tt_local_valid - verify that given tt entry is a valid one + * batadv_tt_local_valid() - verify that given tt entry is a valid one * @entry_ptr: to be checked local tt entry * @data_ptr: not used but definition required to satisfy the callback prototype * @@ -2897,7 +2914,7 @@ static bool batadv_tt_global_valid(const void *entry_ptr, } /** - * batadv_tt_tvlv_generate - fill the tvlv buff with the tt entries from the + * batadv_tt_tvlv_generate() - fill the tvlv buff with the tt entries from the * specified tt hash * @bat_priv: the bat priv with all the soft interface information * @hash: hash table containing the tt entries @@ -2948,7 +2965,7 @@ static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv, } /** - * batadv_tt_global_check_crc - check if all the CRCs are correct + * batadv_tt_global_check_crc() - check if all the CRCs are correct * @orig_node: originator for which the CRCs have to be checked * @tt_vlan: pointer to the first tvlv VLAN entry * @num_vlan: number of tvlv VLAN entries @@ -3005,7 +3022,7 @@ static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node, } /** - * batadv_tt_local_update_crc - update all the local CRCs + * batadv_tt_local_update_crc() - update all the local CRCs * @bat_priv: the bat priv with all the soft interface information */ static void batadv_tt_local_update_crc(struct batadv_priv *bat_priv) @@ -3021,7 +3038,7 @@ static void batadv_tt_local_update_crc(struct batadv_priv *bat_priv) } /** - * batadv_tt_global_update_crc - update all the global CRCs for this orig_node + * batadv_tt_global_update_crc() - update all the global CRCs for this orig_node * @bat_priv: the bat priv with all the soft interface information * @orig_node: the orig_node for which the CRCs have to be updated */ @@ -3048,7 +3065,7 @@ static void batadv_tt_global_update_crc(struct batadv_priv *bat_priv, } /** - * batadv_send_tt_request - send a TT Request message to a given node + * batadv_send_tt_request() - send a TT Request message to a given node * @bat_priv: the bat priv with all the soft interface information * @dst_orig_node: the destination of the message * @ttvn: the version number that the source of the message is looking for @@ -3137,7 +3154,7 @@ out: } /** - * batadv_send_other_tt_response - send reply to tt request concerning another + * batadv_send_other_tt_response() - send reply to tt request concerning another * node's translation table * @bat_priv: the bat priv with all the soft interface information * @tt_data: tt data containing the tt request information @@ -3270,8 +3287,8 @@ out: } /** - * batadv_send_my_tt_response - send reply to tt request concerning this node's - * translation table + * batadv_send_my_tt_response() - send reply to tt request concerning this + * node's translation table * @bat_priv: the bat priv with all the soft interface information * @tt_data: tt data containing the tt request information * @req_src: mac address of tt request sender @@ -3388,7 +3405,7 @@ out: } /** - * batadv_send_tt_response - send reply to tt request + * batadv_send_tt_response() - send reply to tt request * @bat_priv: the bat priv with all the soft interface information * @tt_data: tt data containing the tt request information * @req_src: mac address of tt request sender @@ -3484,7 +3501,7 @@ static void batadv_tt_update_changes(struct batadv_priv *bat_priv, } /** - * batadv_is_my_client - check if a client is served by the local node + * batadv_is_my_client() - check if a client is served by the local node * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the client to check * @vid: VLAN identifier @@ -3514,7 +3531,7 @@ out: } /** - * batadv_handle_tt_response - process incoming tt reply + * batadv_handle_tt_response() - process incoming tt reply * @bat_priv: the bat priv with all the soft interface information * @tt_data: tt data containing the tt request information * @resp_src: mac address of tt reply sender @@ -3607,7 +3624,7 @@ static void batadv_tt_roam_purge(struct batadv_priv *bat_priv) } /** - * batadv_tt_check_roam_count - check if a client has roamed too frequently + * batadv_tt_check_roam_count() - check if a client has roamed too frequently * @bat_priv: the bat priv with all the soft interface information * @client: mac address of the roaming client * @@ -3662,7 +3679,7 @@ unlock: } /** - * batadv_send_roam_adv - send a roaming advertisement message + * batadv_send_roam_adv() - send a roaming advertisement message * @bat_priv: the bat priv with all the soft interface information * @client: mac address of the roaming client * @vid: VLAN identifier @@ -3727,6 +3744,10 @@ static void batadv_tt_purge(struct work_struct *work) msecs_to_jiffies(BATADV_TT_WORK_PERIOD)); } +/** + * batadv_tt_free() - Free translation table of soft interface + * @bat_priv: the bat priv with all the soft interface information + */ void batadv_tt_free(struct batadv_priv *bat_priv) { batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_TT, 1); @@ -3744,7 +3765,7 @@ void batadv_tt_free(struct batadv_priv *bat_priv) } /** - * batadv_tt_local_set_flags - set or unset the specified flags on the local + * batadv_tt_local_set_flags() - set or unset the specified flags on the local * table and possibly count them in the TT size * @bat_priv: the bat priv with all the soft interface information * @flags: the flag to switch @@ -3830,7 +3851,7 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) } /** - * batadv_tt_local_commit_changes_nolock - commit all pending local tt changes + * batadv_tt_local_commit_changes_nolock() - commit all pending local tt changes * which have been queued in the time since the last commit * @bat_priv: the bat priv with all the soft interface information * @@ -3863,7 +3884,7 @@ static void batadv_tt_local_commit_changes_nolock(struct batadv_priv *bat_priv) } /** - * batadv_tt_local_commit_changes - commit all pending local tt changes which + * batadv_tt_local_commit_changes() - commit all pending local tt changes which * have been queued in the time since the last commit * @bat_priv: the bat priv with all the soft interface information */ @@ -3874,6 +3895,15 @@ void batadv_tt_local_commit_changes(struct batadv_priv *bat_priv) spin_unlock_bh(&bat_priv->tt.commit_lock); } +/** + * batadv_is_ap_isolated() - Check if packet from upper layer should be dropped + * @bat_priv: the bat priv with all the soft interface information + * @src: source mac address of packet + * @dst: destination mac address of packet + * @vid: vlan id of packet + * + * Return: true when src+dst(+vid) pair should be isolated, false otherwise + */ bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst, unsigned short vid) { @@ -3909,7 +3939,7 @@ vlan_put: } /** - * batadv_tt_update_orig - update global translation table with new tt + * batadv_tt_update_orig() - update global translation table with new tt * information received via ogms * @bat_priv: the bat priv with all the soft interface information * @orig_node: the orig_node of the ogm @@ -3994,7 +4024,7 @@ request_table: } /** - * batadv_tt_global_client_is_roaming - check if a client is marked as roaming + * batadv_tt_global_client_is_roaming() - check if a client is marked as roaming * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the client to check * @vid: VLAN identifier @@ -4020,7 +4050,7 @@ out: } /** - * batadv_tt_local_client_is_roaming - tells whether the client is roaming + * batadv_tt_local_client_is_roaming() - tells whether the client is roaming * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the local client to query * @vid: VLAN identifier @@ -4045,6 +4075,15 @@ out: return ret; } +/** + * batadv_tt_add_temporary_global_entry() - Add temporary entry to global TT + * @bat_priv: the bat priv with all the soft interface information + * @orig_node: orig node which the temporary entry should be associated with + * @addr: mac address of the client + * @vid: VLAN id of the new temporary global translation table + * + * Return: true when temporary tt entry could be added, false otherwise + */ bool batadv_tt_add_temporary_global_entry(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const unsigned char *addr, @@ -4069,7 +4108,7 @@ bool batadv_tt_add_temporary_global_entry(struct batadv_priv *bat_priv, } /** - * batadv_tt_local_resize_to_mtu - resize the local translation table fit the + * batadv_tt_local_resize_to_mtu() - resize the local translation table fit the * maximum packet size that can be transported through the mesh * @soft_iface: netdev struct of the mesh interface * @@ -4110,7 +4149,7 @@ void batadv_tt_local_resize_to_mtu(struct net_device *soft_iface) } /** - * batadv_tt_tvlv_ogm_handler_v1 - process incoming tt tvlv container + * batadv_tt_tvlv_ogm_handler_v1() - process incoming tt tvlv container * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node of the ogm * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) @@ -4149,7 +4188,7 @@ static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, } /** - * batadv_tt_tvlv_unicast_handler_v1 - process incoming (unicast) tt tvlv + * batadv_tt_tvlv_unicast_handler_v1() - process incoming (unicast) tt tvlv * container * @bat_priv: the bat priv with all the soft interface information * @src: mac address of tt tvlv sender @@ -4231,7 +4270,8 @@ static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, } /** - * batadv_roam_tvlv_unicast_handler_v1 - process incoming tt roam tvlv container + * batadv_roam_tvlv_unicast_handler_v1() - process incoming tt roam tvlv + * container * @bat_priv: the bat priv with all the soft interface information * @src: mac address of tt tvlv sender * @dst: mac address of tt tvlv recipient @@ -4281,7 +4321,7 @@ out: } /** - * batadv_tt_init - initialise the translation table internals + * batadv_tt_init() - initialise the translation table internals * @bat_priv: the bat priv with all the soft interface information * * Return: 0 on success or negative error number in case of failure. @@ -4317,7 +4357,7 @@ int batadv_tt_init(struct batadv_priv *bat_priv) } /** - * batadv_tt_global_is_isolated - check if a client is marked as isolated + * batadv_tt_global_is_isolated() - check if a client is marked as isolated * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the client * @vid: the identifier of the VLAN where this client is connected @@ -4343,7 +4383,7 @@ bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv, } /** - * batadv_tt_cache_init - Initialize tt memory object cache + * batadv_tt_cache_init() - Initialize tt memory object cache * * Return: 0 on success or negative error number in case of failure. */ @@ -4412,7 +4452,7 @@ err_tt_tl_destroy: } /** - * batadv_tt_cache_destroy - Destroy tt memory object cache + * batadv_tt_cache_destroy() - Destroy tt memory object cache */ void batadv_tt_cache_destroy(void) { diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index 411d586191da..8d9e3abec2c8 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index 1d9e267caec9..5ffcb45ac6ff 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich @@ -19,7 +20,7 @@ #include <linux/byteorder/generic.h> #include <linux/etherdevice.h> -#include <linux/fs.h> +#include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/kernel.h> #include <linux/kref.h> @@ -35,14 +36,14 @@ #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> +#include <uapi/linux/batadv_packet.h> #include "originator.h" -#include "packet.h" #include "send.h" #include "tvlv.h" /** - * batadv_tvlv_handler_release - release tvlv handler from lists and queue for + * batadv_tvlv_handler_release() - release tvlv handler from lists and queue for * free after rcu grace period * @ref: kref pointer of the tvlv */ @@ -55,7 +56,7 @@ static void batadv_tvlv_handler_release(struct kref *ref) } /** - * batadv_tvlv_handler_put - decrement the tvlv container refcounter and + * batadv_tvlv_handler_put() - decrement the tvlv container refcounter and * possibly release it * @tvlv_handler: the tvlv handler to free */ @@ -65,7 +66,7 @@ static void batadv_tvlv_handler_put(struct batadv_tvlv_handler *tvlv_handler) } /** - * batadv_tvlv_handler_get - retrieve tvlv handler from the tvlv handler list + * batadv_tvlv_handler_get() - retrieve tvlv handler from the tvlv handler list * based on the provided type and version (both need to match) * @bat_priv: the bat priv with all the soft interface information * @type: tvlv handler type to look for @@ -99,7 +100,7 @@ batadv_tvlv_handler_get(struct batadv_priv *bat_priv, u8 type, u8 version) } /** - * batadv_tvlv_container_release - release tvlv from lists and free + * batadv_tvlv_container_release() - release tvlv from lists and free * @ref: kref pointer of the tvlv */ static void batadv_tvlv_container_release(struct kref *ref) @@ -111,7 +112,7 @@ static void batadv_tvlv_container_release(struct kref *ref) } /** - * batadv_tvlv_container_put - decrement the tvlv container refcounter and + * batadv_tvlv_container_put() - decrement the tvlv container refcounter and * possibly release it * @tvlv: the tvlv container to free */ @@ -121,7 +122,7 @@ static void batadv_tvlv_container_put(struct batadv_tvlv_container *tvlv) } /** - * batadv_tvlv_container_get - retrieve tvlv container from the tvlv container + * batadv_tvlv_container_get() - retrieve tvlv container from the tvlv container * list based on the provided type and version (both need to match) * @bat_priv: the bat priv with all the soft interface information * @type: tvlv container type to look for @@ -155,7 +156,7 @@ batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version) } /** - * batadv_tvlv_container_list_size - calculate the size of the tvlv container + * batadv_tvlv_container_list_size() - calculate the size of the tvlv container * list entries * @bat_priv: the bat priv with all the soft interface information * @@ -180,8 +181,8 @@ static u16 batadv_tvlv_container_list_size(struct batadv_priv *bat_priv) } /** - * batadv_tvlv_container_remove - remove tvlv container from the tvlv container - * list + * batadv_tvlv_container_remove() - remove tvlv container from the tvlv + * container list * @bat_priv: the bat priv with all the soft interface information * @tvlv: the to be removed tvlv container * @@ -204,7 +205,7 @@ static void batadv_tvlv_container_remove(struct batadv_priv *bat_priv, } /** - * batadv_tvlv_container_unregister - unregister tvlv container based on the + * batadv_tvlv_container_unregister() - unregister tvlv container based on the * provided type and version (both need to match) * @bat_priv: the bat priv with all the soft interface information * @type: tvlv container type to unregister @@ -222,7 +223,7 @@ void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv, } /** - * batadv_tvlv_container_register - register tvlv type, version and content + * batadv_tvlv_container_register() - register tvlv type, version and content * to be propagated with each (primary interface) OGM * @bat_priv: the bat priv with all the soft interface information * @type: tvlv container type @@ -267,7 +268,7 @@ void batadv_tvlv_container_register(struct batadv_priv *bat_priv, } /** - * batadv_tvlv_realloc_packet_buff - reallocate packet buffer to accommodate + * batadv_tvlv_realloc_packet_buff() - reallocate packet buffer to accommodate * requested packet size * @packet_buff: packet buffer * @packet_buff_len: packet buffer size @@ -300,7 +301,7 @@ static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff, } /** - * batadv_tvlv_container_ogm_append - append tvlv container content to given + * batadv_tvlv_container_ogm_append() - append tvlv container content to given * OGM packet buffer * @bat_priv: the bat priv with all the soft interface information * @packet_buff: ogm packet buffer @@ -353,7 +354,7 @@ end: } /** - * batadv_tvlv_call_handler - parse the given tvlv buffer to call the + * batadv_tvlv_call_handler() - parse the given tvlv buffer to call the * appropriate handlers * @bat_priv: the bat priv with all the soft interface information * @tvlv_handler: tvlv callback function handling the tvlv content @@ -407,7 +408,7 @@ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, } /** - * batadv_tvlv_containers_process - parse the given tvlv buffer to call the + * batadv_tvlv_containers_process() - parse the given tvlv buffer to call the * appropriate handlers * @bat_priv: the bat priv with all the soft interface information * @ogm_source: flag indicating whether the tvlv is an ogm or a unicast packet @@ -474,7 +475,7 @@ int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, } /** - * batadv_tvlv_ogm_receive - process an incoming ogm and call the appropriate + * batadv_tvlv_ogm_receive() - process an incoming ogm and call the appropriate * handlers * @bat_priv: the bat priv with all the soft interface information * @batadv_ogm_packet: ogm packet containing the tvlv containers @@ -501,7 +502,7 @@ void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv, } /** - * batadv_tvlv_handler_register - register tvlv handler based on the provided + * batadv_tvlv_handler_register() - register tvlv handler based on the provided * type and version (both need to match) for ogm tvlv payload and/or unicast * payload * @bat_priv: the bat priv with all the soft interface information @@ -556,7 +557,7 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, } /** - * batadv_tvlv_handler_unregister - unregister tvlv handler based on the + * batadv_tvlv_handler_unregister() - unregister tvlv handler based on the * provided type and version (both need to match) * @bat_priv: the bat priv with all the soft interface information * @type: tvlv handler type to be unregistered @@ -579,7 +580,7 @@ void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv, } /** - * batadv_tvlv_unicast_send - send a unicast packet with tvlv payload to the + * batadv_tvlv_unicast_send() - send a unicast packet with tvlv payload to the * specified host * @bat_priv: the bat priv with all the soft interface information * @src: source mac address of the unicast packet diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h index 4d01400ada30..a74df33f446d 100644 --- a/net/batman-adv/tvlv.h +++ b/net/batman-adv/tvlv.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index a62795868794..bb1578410e0c 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich @@ -34,10 +35,9 @@ #include <linux/types.h> #include <linux/wait.h> #include <linux/workqueue.h> +#include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> -#include "packet.h" - struct seq_file; #ifdef CONFIG_BATMAN_ADV_DAT @@ -54,13 +54,15 @@ struct seq_file; /** * enum batadv_dhcp_recipient - dhcp destination - * @BATADV_DHCP_NO: packet is not a dhcp message - * @BATADV_DHCP_TO_SERVER: dhcp message is directed to a server - * @BATADV_DHCP_TO_CLIENT: dhcp message is directed to a client */ enum batadv_dhcp_recipient { + /** @BATADV_DHCP_NO: packet is not a dhcp message */ BATADV_DHCP_NO = 0, + + /** @BATADV_DHCP_TO_SERVER: dhcp message is directed to a server */ BATADV_DHCP_TO_SERVER, + + /** @BATADV_DHCP_TO_CLIENT: dhcp message is directed to a client */ BATADV_DHCP_TO_CLIENT, }; @@ -78,196 +80,274 @@ enum batadv_dhcp_recipient { /** * struct batadv_hard_iface_bat_iv - per hard-interface B.A.T.M.A.N. IV data - * @ogm_buff: buffer holding the OGM packet - * @ogm_buff_len: length of the OGM packet buffer - * @ogm_seqno: OGM sequence number - used to identify each OGM */ struct batadv_hard_iface_bat_iv { + /** @ogm_buff: buffer holding the OGM packet */ unsigned char *ogm_buff; + + /** @ogm_buff_len: length of the OGM packet buffer */ int ogm_buff_len; + + /** @ogm_seqno: OGM sequence number - used to identify each OGM */ atomic_t ogm_seqno; }; /** * enum batadv_v_hard_iface_flags - interface flags useful to B.A.T.M.A.N. V - * @BATADV_FULL_DUPLEX: tells if the connection over this link is full-duplex - * @BATADV_WARNING_DEFAULT: tells whether we have warned the user that no - * throughput data is available for this interface and that default values are - * assumed. */ enum batadv_v_hard_iface_flags { + /** + * @BATADV_FULL_DUPLEX: tells if the connection over this link is + * full-duplex + */ BATADV_FULL_DUPLEX = BIT(0), + + /** + * @BATADV_WARNING_DEFAULT: tells whether we have warned the user that + * no throughput data is available for this interface and that default + * values are assumed. + */ BATADV_WARNING_DEFAULT = BIT(1), }; /** * struct batadv_hard_iface_bat_v - per hard-interface B.A.T.M.A.N. V data - * @elp_interval: time interval between two ELP transmissions - * @elp_seqno: current ELP sequence number - * @elp_skb: base skb containing the ELP message to send - * @elp_wq: workqueue used to schedule ELP transmissions - * @throughput_override: throughput override to disable link auto-detection - * @flags: interface specific flags */ struct batadv_hard_iface_bat_v { + /** @elp_interval: time interval between two ELP transmissions */ atomic_t elp_interval; + + /** @elp_seqno: current ELP sequence number */ atomic_t elp_seqno; + + /** @elp_skb: base skb containing the ELP message to send */ struct sk_buff *elp_skb; + + /** @elp_wq: workqueue used to schedule ELP transmissions */ struct delayed_work elp_wq; + + /** + * @throughput_override: throughput override to disable link + * auto-detection + */ atomic_t throughput_override; + + /** @flags: interface specific flags */ u8 flags; }; /** * enum batadv_hard_iface_wifi_flags - Flags describing the wifi configuration * of a batadv_hard_iface - * @BATADV_HARDIF_WIFI_WEXT_DIRECT: it is a wext wifi device - * @BATADV_HARDIF_WIFI_CFG80211_DIRECT: it is a cfg80211 wifi device - * @BATADV_HARDIF_WIFI_WEXT_INDIRECT: link device is a wext wifi device - * @BATADV_HARDIF_WIFI_CFG80211_INDIRECT: link device is a cfg80211 wifi device */ enum batadv_hard_iface_wifi_flags { + /** @BATADV_HARDIF_WIFI_WEXT_DIRECT: it is a wext wifi device */ BATADV_HARDIF_WIFI_WEXT_DIRECT = BIT(0), + + /** @BATADV_HARDIF_WIFI_CFG80211_DIRECT: it is a cfg80211 wifi device */ BATADV_HARDIF_WIFI_CFG80211_DIRECT = BIT(1), + + /** + * @BATADV_HARDIF_WIFI_WEXT_INDIRECT: link device is a wext wifi device + */ BATADV_HARDIF_WIFI_WEXT_INDIRECT = BIT(2), + + /** + * @BATADV_HARDIF_WIFI_CFG80211_INDIRECT: link device is a cfg80211 wifi + * device + */ BATADV_HARDIF_WIFI_CFG80211_INDIRECT = BIT(3), }; /** * struct batadv_hard_iface - network device known to batman-adv - * @list: list node for batadv_hardif_list - * @if_num: identificator of the interface - * @if_status: status of the interface for batman-adv - * @num_bcasts: number of payload re-broadcasts on this interface (ARQ) - * @wifi_flags: flags whether this is (directly or indirectly) a wifi interface - * @net_dev: pointer to the net_device - * @hardif_obj: kobject of the per interface sysfs "mesh" directory - * @refcount: number of contexts the object is used - * @batman_adv_ptype: packet type describing packets that should be processed by - * batman-adv for this interface - * @soft_iface: the batman-adv interface which uses this network interface - * @rcu: struct used for freeing in an RCU-safe manner - * @bat_iv: per hard-interface B.A.T.M.A.N. IV data - * @bat_v: per hard-interface B.A.T.M.A.N. V data - * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs - * @neigh_list: list of unique single hop neighbors via this interface - * @neigh_list_lock: lock protecting neigh_list */ struct batadv_hard_iface { + /** @list: list node for batadv_hardif_list */ struct list_head list; + + /** @if_num: identificator of the interface */ s16 if_num; + + /** @if_status: status of the interface for batman-adv */ char if_status; + + /** + * @num_bcasts: number of payload re-broadcasts on this interface (ARQ) + */ u8 num_bcasts; + + /** + * @wifi_flags: flags whether this is (directly or indirectly) a wifi + * interface + */ u32 wifi_flags; + + /** @net_dev: pointer to the net_device */ struct net_device *net_dev; + + /** @hardif_obj: kobject of the per interface sysfs "mesh" directory */ struct kobject *hardif_obj; + + /** @refcount: number of contexts the object is used */ struct kref refcount; + + /** + * @batman_adv_ptype: packet type describing packets that should be + * processed by batman-adv for this interface + */ struct packet_type batman_adv_ptype; + + /** + * @soft_iface: the batman-adv interface which uses this network + * interface + */ struct net_device *soft_iface; + + /** @rcu: struct used for freeing in an RCU-safe manner */ struct rcu_head rcu; + + /** @bat_iv: per hard-interface B.A.T.M.A.N. IV data */ struct batadv_hard_iface_bat_iv bat_iv; + #ifdef CONFIG_BATMAN_ADV_BATMAN_V + /** @bat_v: per hard-interface B.A.T.M.A.N. V data */ struct batadv_hard_iface_bat_v bat_v; #endif + + /** + * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs + */ struct dentry *debug_dir; + + /** + * @neigh_list: list of unique single hop neighbors via this interface + */ struct hlist_head neigh_list; - /* neigh_list_lock protects: neigh_list */ + + /** @neigh_list_lock: lock protecting neigh_list */ spinlock_t neigh_list_lock; }; /** * struct batadv_orig_ifinfo - originator info per outgoing interface - * @list: list node for orig_node::ifinfo_list - * @if_outgoing: pointer to outgoing hard-interface - * @router: router that should be used to reach this originator - * @last_real_seqno: last and best known sequence number - * @last_ttl: ttl of last received packet - * @last_seqno_forwarded: seqno of the OGM which was forwarded last - * @batman_seqno_reset: time when the batman seqno window was reset - * @refcount: number of contexts the object is used - * @rcu: struct used for freeing in an RCU-safe manner */ struct batadv_orig_ifinfo { + /** @list: list node for &batadv_orig_node.ifinfo_list */ struct hlist_node list; + + /** @if_outgoing: pointer to outgoing hard-interface */ struct batadv_hard_iface *if_outgoing; - struct batadv_neigh_node __rcu *router; /* rcu protected pointer */ + + /** @router: router that should be used to reach this originator */ + struct batadv_neigh_node __rcu *router; + + /** @last_real_seqno: last and best known sequence number */ u32 last_real_seqno; + + /** @last_ttl: ttl of last received packet */ u8 last_ttl; + + /** @last_seqno_forwarded: seqno of the OGM which was forwarded last */ u32 last_seqno_forwarded; + + /** @batman_seqno_reset: time when the batman seqno window was reset */ unsigned long batman_seqno_reset; + + /** @refcount: number of contexts the object is used */ struct kref refcount; + + /** @rcu: struct used for freeing in an RCU-safe manner */ struct rcu_head rcu; }; /** * struct batadv_frag_table_entry - head in the fragment buffer table - * @fragment_list: head of list with fragments - * @lock: lock to protect the list of fragments - * @timestamp: time (jiffie) of last received fragment - * @seqno: sequence number of the fragments in the list - * @size: accumulated size of packets in list - * @total_size: expected size of the assembled packet */ struct batadv_frag_table_entry { + /** @fragment_list: head of list with fragments */ struct hlist_head fragment_list; - spinlock_t lock; /* protects fragment_list */ + + /** @lock: lock to protect the list of fragments */ + spinlock_t lock; + + /** @timestamp: time (jiffie) of last received fragment */ unsigned long timestamp; + + /** @seqno: sequence number of the fragments in the list */ u16 seqno; + + /** @size: accumulated size of packets in list */ u16 size; + + /** @total_size: expected size of the assembled packet */ u16 total_size; }; /** * struct batadv_frag_list_entry - entry in a list of fragments - * @list: list node information - * @skb: fragment - * @no: fragment number in the set */ struct batadv_frag_list_entry { + /** @list: list node information */ struct hlist_node list; + + /** @skb: fragment */ struct sk_buff *skb; + + /** @no: fragment number in the set */ u8 no; }; /** * struct batadv_vlan_tt - VLAN specific TT attributes - * @crc: CRC32 checksum of the entries belonging to this vlan - * @num_entries: number of TT entries for this VLAN */ struct batadv_vlan_tt { + /** @crc: CRC32 checksum of the entries belonging to this vlan */ u32 crc; + + /** @num_entries: number of TT entries for this VLAN */ atomic_t num_entries; }; /** * struct batadv_orig_node_vlan - VLAN specific data per orig_node - * @vid: the VLAN identifier - * @tt: VLAN specific TT attributes - * @list: list node for orig_node::vlan_list - * @refcount: number of context where this object is currently in use - * @rcu: struct used for freeing in a RCU-safe manner */ struct batadv_orig_node_vlan { + /** @vid: the VLAN identifier */ unsigned short vid; + + /** @tt: VLAN specific TT attributes */ struct batadv_vlan_tt tt; + + /** @list: list node for &batadv_orig_node.vlan_list */ struct hlist_node list; + + /** + * @refcount: number of context where this object is currently in use + */ struct kref refcount; + + /** @rcu: struct used for freeing in a RCU-safe manner */ struct rcu_head rcu; }; /** * struct batadv_orig_bat_iv - B.A.T.M.A.N. IV private orig_node members - * @bcast_own: set of bitfields (one per hard-interface) where each one counts - * the number of our OGMs this orig_node rebroadcasted "back" to us (relative - * to last_real_seqno). Every bitfield is BATADV_TQ_LOCAL_WINDOW_SIZE bits long. - * @bcast_own_sum: sum of bcast_own - * @ogm_cnt_lock: lock protecting bcast_own, bcast_own_sum, - * neigh_node->bat_iv.real_bits & neigh_node->bat_iv.real_packet_count */ struct batadv_orig_bat_iv { + /** + * @bcast_own: set of bitfields (one per hard-interface) where each one + * counts the number of our OGMs this orig_node rebroadcasted "back" to + * us (relative to last_real_seqno). Every bitfield is + * BATADV_TQ_LOCAL_WINDOW_SIZE bits long. + */ unsigned long *bcast_own; + + /** @bcast_own_sum: sum of bcast_own */ u8 *bcast_own_sum; - /* ogm_cnt_lock protects: bcast_own, bcast_own_sum, + + /** + * @ogm_cnt_lock: lock protecting bcast_own, bcast_own_sum, * neigh_node->bat_iv.real_bits & neigh_node->bat_iv.real_packet_count */ spinlock_t ogm_cnt_lock; @@ -275,130 +355,205 @@ struct batadv_orig_bat_iv { /** * struct batadv_orig_node - structure for orig_list maintaining nodes of mesh - * @orig: originator ethernet address - * @ifinfo_list: list for routers per outgoing interface - * @last_bonding_candidate: pointer to last ifinfo of last used router - * @dat_addr: address of the orig node in the distributed hash - * @last_seen: time when last packet from this node was received - * @bcast_seqno_reset: time when the broadcast seqno window was reset - * @mcast_handler_lock: synchronizes mcast-capability and -flag changes - * @mcast_flags: multicast flags announced by the orig node - * @mcast_want_all_unsnoopables_node: a list node for the - * mcast.want_all_unsnoopables list - * @mcast_want_all_ipv4_node: a list node for the mcast.want_all_ipv4 list - * @mcast_want_all_ipv6_node: a list node for the mcast.want_all_ipv6 list - * @capabilities: announced capabilities of this originator - * @capa_initialized: bitfield to remember whether a capability was initialized - * @last_ttvn: last seen translation table version number - * @tt_buff: last tt changeset this node received from the orig node - * @tt_buff_len: length of the last tt changeset this node received from the - * orig node - * @tt_buff_lock: lock that protects tt_buff and tt_buff_len - * @tt_lock: prevents from updating the table while reading it. Table update is - * made up by two operations (data structure update and metdata -CRC/TTVN- - * recalculation) and they have to be executed atomically in order to avoid - * another thread to read the table/metadata between those. - * @bcast_bits: bitfield containing the info which payload broadcast originated - * from this orig node this host already has seen (relative to - * last_bcast_seqno) - * @last_bcast_seqno: last broadcast sequence number received by this host - * @neigh_list: list of potential next hop neighbor towards this orig node - * @neigh_list_lock: lock protecting neigh_list and router - * @hash_entry: hlist node for batadv_priv::orig_hash - * @bat_priv: pointer to soft_iface this orig node belongs to - * @bcast_seqno_lock: lock protecting bcast_bits & last_bcast_seqno - * @refcount: number of contexts the object is used - * @rcu: struct used for freeing in an RCU-safe manner - * @in_coding_list: list of nodes this orig can hear - * @out_coding_list: list of nodes that can hear this orig - * @in_coding_list_lock: protects in_coding_list - * @out_coding_list_lock: protects out_coding_list - * @fragments: array with heads for fragment chains - * @vlan_list: a list of orig_node_vlan structs, one per VLAN served by the - * originator represented by this object - * @vlan_list_lock: lock protecting vlan_list - * @bat_iv: B.A.T.M.A.N. IV private structure */ struct batadv_orig_node { + /** @orig: originator ethernet address */ u8 orig[ETH_ALEN]; + + /** @ifinfo_list: list for routers per outgoing interface */ struct hlist_head ifinfo_list; + + /** + * @last_bonding_candidate: pointer to last ifinfo of last used router + */ struct batadv_orig_ifinfo *last_bonding_candidate; + #ifdef CONFIG_BATMAN_ADV_DAT + /** @dat_addr: address of the orig node in the distributed hash */ batadv_dat_addr_t dat_addr; #endif + + /** @last_seen: time when last packet from this node was received */ unsigned long last_seen; + + /** + * @bcast_seqno_reset: time when the broadcast seqno window was reset + */ unsigned long bcast_seqno_reset; + #ifdef CONFIG_BATMAN_ADV_MCAST - /* synchronizes mcast tvlv specific orig changes */ + /** + * @mcast_handler_lock: synchronizes mcast-capability and -flag changes + */ spinlock_t mcast_handler_lock; + + /** @mcast_flags: multicast flags announced by the orig node */ u8 mcast_flags; + + /** + * @mcast_want_all_unsnoopables_node: a list node for the + * mcast.want_all_unsnoopables list + */ struct hlist_node mcast_want_all_unsnoopables_node; + + /** + * @mcast_want_all_ipv4_node: a list node for the mcast.want_all_ipv4 + * list + */ struct hlist_node mcast_want_all_ipv4_node; + /** + * @mcast_want_all_ipv6_node: a list node for the mcast.want_all_ipv6 + * list + */ struct hlist_node mcast_want_all_ipv6_node; #endif + + /** @capabilities: announced capabilities of this originator */ unsigned long capabilities; + + /** + * @capa_initialized: bitfield to remember whether a capability was + * initialized + */ unsigned long capa_initialized; + + /** @last_ttvn: last seen translation table version number */ atomic_t last_ttvn; + + /** @tt_buff: last tt changeset this node received from the orig node */ unsigned char *tt_buff; + + /** + * @tt_buff_len: length of the last tt changeset this node received + * from the orig node + */ s16 tt_buff_len; - spinlock_t tt_buff_lock; /* protects tt_buff & tt_buff_len */ - /* prevents from changing the table while reading it */ + + /** @tt_buff_lock: lock that protects tt_buff and tt_buff_len */ + spinlock_t tt_buff_lock; + + /** + * @tt_lock: prevents from updating the table while reading it. Table + * update is made up by two operations (data structure update and + * metdata -CRC/TTVN-recalculation) and they have to be executed + * atomically in order to avoid another thread to read the + * table/metadata between those. + */ spinlock_t tt_lock; + + /** + * @bcast_bits: bitfield containing the info which payload broadcast + * originated from this orig node this host already has seen (relative + * to last_bcast_seqno) + */ DECLARE_BITMAP(bcast_bits, BATADV_TQ_LOCAL_WINDOW_SIZE); + + /** + * @last_bcast_seqno: last broadcast sequence number received by this + * host + */ u32 last_bcast_seqno; + + /** + * @neigh_list: list of potential next hop neighbor towards this orig + * node + */ struct hlist_head neigh_list; - /* neigh_list_lock protects: neigh_list, ifinfo_list, - * last_bonding_candidate and router + + /** + * @neigh_list_lock: lock protecting neigh_list, ifinfo_list, + * last_bonding_candidate and router */ spinlock_t neigh_list_lock; + + /** @hash_entry: hlist node for &batadv_priv.orig_hash */ struct hlist_node hash_entry; + + /** @bat_priv: pointer to soft_iface this orig node belongs to */ struct batadv_priv *bat_priv; - /* bcast_seqno_lock protects: bcast_bits & last_bcast_seqno */ + + /** @bcast_seqno_lock: lock protecting bcast_bits & last_bcast_seqno */ spinlock_t bcast_seqno_lock; + + /** @refcount: number of contexts the object is used */ struct kref refcount; + + /** @rcu: struct used for freeing in an RCU-safe manner */ struct rcu_head rcu; + #ifdef CONFIG_BATMAN_ADV_NC + /** @in_coding_list: list of nodes this orig can hear */ struct list_head in_coding_list; + + /** @out_coding_list: list of nodes that can hear this orig */ struct list_head out_coding_list; - spinlock_t in_coding_list_lock; /* Protects in_coding_list */ - spinlock_t out_coding_list_lock; /* Protects out_coding_list */ + + /** @in_coding_list_lock: protects in_coding_list */ + spinlock_t in_coding_list_lock; + + /** @out_coding_list_lock: protects out_coding_list */ + spinlock_t out_coding_list_lock; #endif + + /** @fragments: array with heads for fragment chains */ struct batadv_frag_table_entry fragments[BATADV_FRAG_BUFFER_COUNT]; + + /** + * @vlan_list: a list of orig_node_vlan structs, one per VLAN served by + * the originator represented by this object + */ struct hlist_head vlan_list; - spinlock_t vlan_list_lock; /* protects vlan_list */ + + /** @vlan_list_lock: lock protecting vlan_list */ + spinlock_t vlan_list_lock; + + /** @bat_iv: B.A.T.M.A.N. IV private structure */ struct batadv_orig_bat_iv bat_iv; }; /** * enum batadv_orig_capabilities - orig node capabilities - * @BATADV_ORIG_CAPA_HAS_DAT: orig node has distributed arp table enabled - * @BATADV_ORIG_CAPA_HAS_NC: orig node has network coding enabled - * @BATADV_ORIG_CAPA_HAS_TT: orig node has tt capability - * @BATADV_ORIG_CAPA_HAS_MCAST: orig node has some multicast capability - * (= orig node announces a tvlv of type BATADV_TVLV_MCAST) */ enum batadv_orig_capabilities { + /** + * @BATADV_ORIG_CAPA_HAS_DAT: orig node has distributed arp table + * enabled + */ BATADV_ORIG_CAPA_HAS_DAT, + + /** @BATADV_ORIG_CAPA_HAS_NC: orig node has network coding enabled */ BATADV_ORIG_CAPA_HAS_NC, + + /** @BATADV_ORIG_CAPA_HAS_TT: orig node has tt capability */ BATADV_ORIG_CAPA_HAS_TT, + + /** + * @BATADV_ORIG_CAPA_HAS_MCAST: orig node has some multicast capability + * (= orig node announces a tvlv of type BATADV_TVLV_MCAST) + */ BATADV_ORIG_CAPA_HAS_MCAST, }; /** * struct batadv_gw_node - structure for orig nodes announcing gw capabilities - * @list: list node for batadv_priv_gw::list - * @orig_node: pointer to corresponding orig node - * @bandwidth_down: advertised uplink download bandwidth - * @bandwidth_up: advertised uplink upload bandwidth - * @refcount: number of contexts the object is used - * @rcu: struct used for freeing in an RCU-safe manner */ struct batadv_gw_node { + /** @list: list node for &batadv_priv_gw.list */ struct hlist_node list; + + /** @orig_node: pointer to corresponding orig node */ struct batadv_orig_node *orig_node; + + /** @bandwidth_down: advertised uplink download bandwidth */ u32 bandwidth_down; + + /** @bandwidth_up: advertised uplink upload bandwidth */ u32 bandwidth_up; + + /** @refcount: number of contexts the object is used */ struct kref refcount; + + /** @rcu: struct used for freeing in an RCU-safe manner */ struct rcu_head rcu; }; @@ -407,118 +562,161 @@ DECLARE_EWMA(throughput, 10, 8) /** * struct batadv_hardif_neigh_node_bat_v - B.A.T.M.A.N. V private neighbor * information - * @throughput: ewma link throughput towards this neighbor - * @elp_interval: time interval between two ELP transmissions - * @elp_latest_seqno: latest and best known ELP sequence number - * @last_unicast_tx: when the last unicast packet has been sent to this neighbor - * @metric_work: work queue callback item for metric update */ struct batadv_hardif_neigh_node_bat_v { + /** @throughput: ewma link throughput towards this neighbor */ struct ewma_throughput throughput; + + /** @elp_interval: time interval between two ELP transmissions */ u32 elp_interval; + + /** @elp_latest_seqno: latest and best known ELP sequence number */ u32 elp_latest_seqno; + + /** + * @last_unicast_tx: when the last unicast packet has been sent to this + * neighbor + */ unsigned long last_unicast_tx; + + /** @metric_work: work queue callback item for metric update */ struct work_struct metric_work; }; /** * struct batadv_hardif_neigh_node - unique neighbor per hard-interface - * @list: list node for batadv_hard_iface::neigh_list - * @addr: the MAC address of the neighboring interface - * @orig: the address of the originator this neighbor node belongs to - * @if_incoming: pointer to incoming hard-interface - * @last_seen: when last packet via this neighbor was received - * @bat_v: B.A.T.M.A.N. V private data - * @refcount: number of contexts the object is used - * @rcu: struct used for freeing in a RCU-safe manner */ struct batadv_hardif_neigh_node { + /** @list: list node for &batadv_hard_iface.neigh_list */ struct hlist_node list; + + /** @addr: the MAC address of the neighboring interface */ u8 addr[ETH_ALEN]; + + /** + * @orig: the address of the originator this neighbor node belongs to + */ u8 orig[ETH_ALEN]; + + /** @if_incoming: pointer to incoming hard-interface */ struct batadv_hard_iface *if_incoming; + + /** @last_seen: when last packet via this neighbor was received */ unsigned long last_seen; + #ifdef CONFIG_BATMAN_ADV_BATMAN_V + /** @bat_v: B.A.T.M.A.N. V private data */ struct batadv_hardif_neigh_node_bat_v bat_v; #endif + + /** @refcount: number of contexts the object is used */ struct kref refcount; + + /** @rcu: struct used for freeing in a RCU-safe manner */ struct rcu_head rcu; }; /** * struct batadv_neigh_node - structure for single hops neighbors - * @list: list node for batadv_orig_node::neigh_list - * @orig_node: pointer to corresponding orig_node - * @addr: the MAC address of the neighboring interface - * @ifinfo_list: list for routing metrics per outgoing interface - * @ifinfo_lock: lock protecting private ifinfo members and list - * @if_incoming: pointer to incoming hard-interface - * @last_seen: when last packet via this neighbor was received - * @hardif_neigh: hardif_neigh of this neighbor - * @refcount: number of contexts the object is used - * @rcu: struct used for freeing in an RCU-safe manner */ struct batadv_neigh_node { + /** @list: list node for &batadv_orig_node.neigh_list */ struct hlist_node list; + + /** @orig_node: pointer to corresponding orig_node */ struct batadv_orig_node *orig_node; + + /** @addr: the MAC address of the neighboring interface */ u8 addr[ETH_ALEN]; + + /** @ifinfo_list: list for routing metrics per outgoing interface */ struct hlist_head ifinfo_list; - spinlock_t ifinfo_lock; /* protects ifinfo_list and its members */ + + /** @ifinfo_lock: lock protecting ifinfo_list and its members */ + spinlock_t ifinfo_lock; + + /** @if_incoming: pointer to incoming hard-interface */ struct batadv_hard_iface *if_incoming; + + /** @last_seen: when last packet via this neighbor was received */ unsigned long last_seen; + + /** @hardif_neigh: hardif_neigh of this neighbor */ struct batadv_hardif_neigh_node *hardif_neigh; + + /** @refcount: number of contexts the object is used */ struct kref refcount; + + /** @rcu: struct used for freeing in an RCU-safe manner */ struct rcu_head rcu; }; /** * struct batadv_neigh_ifinfo_bat_iv - neighbor information per outgoing * interface for B.A.T.M.A.N. IV - * @tq_recv: ring buffer of received TQ values from this neigh node - * @tq_index: ring buffer index - * @tq_avg: averaged tq of all tq values in the ring buffer (tq_recv) - * @real_bits: bitfield containing the number of OGMs received from this neigh - * node (relative to orig_node->last_real_seqno) - * @real_packet_count: counted result of real_bits */ struct batadv_neigh_ifinfo_bat_iv { + /** @tq_recv: ring buffer of received TQ values from this neigh node */ u8 tq_recv[BATADV_TQ_GLOBAL_WINDOW_SIZE]; + + /** @tq_index: ring buffer index */ u8 tq_index; + + /** + * @tq_avg: averaged tq of all tq values in the ring buffer (tq_recv) + */ u8 tq_avg; + + /** + * @real_bits: bitfield containing the number of OGMs received from this + * neigh node (relative to orig_node->last_real_seqno) + */ DECLARE_BITMAP(real_bits, BATADV_TQ_LOCAL_WINDOW_SIZE); + + /** @real_packet_count: counted result of real_bits */ u8 real_packet_count; }; /** * struct batadv_neigh_ifinfo_bat_v - neighbor information per outgoing * interface for B.A.T.M.A.N. V - * @throughput: last throughput metric received from originator via this neigh - * @last_seqno: last sequence number known for this neighbor */ struct batadv_neigh_ifinfo_bat_v { + /** + * @throughput: last throughput metric received from originator via this + * neigh + */ u32 throughput; + + /** @last_seqno: last sequence number known for this neighbor */ u32 last_seqno; }; /** * struct batadv_neigh_ifinfo - neighbor information per outgoing interface - * @list: list node for batadv_neigh_node::ifinfo_list - * @if_outgoing: pointer to outgoing hard-interface - * @bat_iv: B.A.T.M.A.N. IV private structure - * @bat_v: B.A.T.M.A.N. V private data - * @last_ttl: last received ttl from this neigh node - * @refcount: number of contexts the object is used - * @rcu: struct used for freeing in a RCU-safe manner */ struct batadv_neigh_ifinfo { + /** @list: list node for &batadv_neigh_node.ifinfo_list */ struct hlist_node list; + + /** @if_outgoing: pointer to outgoing hard-interface */ struct batadv_hard_iface *if_outgoing; + + /** @bat_iv: B.A.T.M.A.N. IV private structure */ struct batadv_neigh_ifinfo_bat_iv bat_iv; + #ifdef CONFIG_BATMAN_ADV_BATMAN_V + /** @bat_v: B.A.T.M.A.N. V private data */ struct batadv_neigh_ifinfo_bat_v bat_v; #endif + + /** @last_ttl: last received ttl from this neigh node */ u8 last_ttl; + + /** @refcount: number of contexts the object is used */ struct kref refcount; + + /** @rcu: struct used for freeing in a RCU-safe manner */ struct rcu_head rcu; }; @@ -526,148 +724,278 @@ struct batadv_neigh_ifinfo { /** * struct batadv_bcast_duplist_entry - structure for LAN broadcast suppression - * @orig: mac address of orig node orginating the broadcast - * @crc: crc32 checksum of broadcast payload - * @entrytime: time when the broadcast packet was received */ struct batadv_bcast_duplist_entry { + /** @orig: mac address of orig node orginating the broadcast */ u8 orig[ETH_ALEN]; + + /** @crc: crc32 checksum of broadcast payload */ __be32 crc; + + /** @entrytime: time when the broadcast packet was received */ unsigned long entrytime; }; #endif /** * enum batadv_counters - indices for traffic counters - * @BATADV_CNT_TX: transmitted payload traffic packet counter - * @BATADV_CNT_TX_BYTES: transmitted payload traffic bytes counter - * @BATADV_CNT_TX_DROPPED: dropped transmission payload traffic packet counter - * @BATADV_CNT_RX: received payload traffic packet counter - * @BATADV_CNT_RX_BYTES: received payload traffic bytes counter - * @BATADV_CNT_FORWARD: forwarded payload traffic packet counter - * @BATADV_CNT_FORWARD_BYTES: forwarded payload traffic bytes counter - * @BATADV_CNT_MGMT_TX: transmitted routing protocol traffic packet counter - * @BATADV_CNT_MGMT_TX_BYTES: transmitted routing protocol traffic bytes counter - * @BATADV_CNT_MGMT_RX: received routing protocol traffic packet counter - * @BATADV_CNT_MGMT_RX_BYTES: received routing protocol traffic bytes counter - * @BATADV_CNT_FRAG_TX: transmitted fragment traffic packet counter - * @BATADV_CNT_FRAG_TX_BYTES: transmitted fragment traffic bytes counter - * @BATADV_CNT_FRAG_RX: received fragment traffic packet counter - * @BATADV_CNT_FRAG_RX_BYTES: received fragment traffic bytes counter - * @BATADV_CNT_FRAG_FWD: forwarded fragment traffic packet counter - * @BATADV_CNT_FRAG_FWD_BYTES: forwarded fragment traffic bytes counter - * @BATADV_CNT_TT_REQUEST_TX: transmitted tt req traffic packet counter - * @BATADV_CNT_TT_REQUEST_RX: received tt req traffic packet counter - * @BATADV_CNT_TT_RESPONSE_TX: transmitted tt resp traffic packet counter - * @BATADV_CNT_TT_RESPONSE_RX: received tt resp traffic packet counter - * @BATADV_CNT_TT_ROAM_ADV_TX: transmitted tt roam traffic packet counter - * @BATADV_CNT_TT_ROAM_ADV_RX: received tt roam traffic packet counter - * @BATADV_CNT_DAT_GET_TX: transmitted dht GET traffic packet counter - * @BATADV_CNT_DAT_GET_RX: received dht GET traffic packet counter - * @BATADV_CNT_DAT_PUT_TX: transmitted dht PUT traffic packet counter - * @BATADV_CNT_DAT_PUT_RX: received dht PUT traffic packet counter - * @BATADV_CNT_DAT_CACHED_REPLY_TX: transmitted dat cache reply traffic packet - * counter - * @BATADV_CNT_NC_CODE: transmitted nc-combined traffic packet counter - * @BATADV_CNT_NC_CODE_BYTES: transmitted nc-combined traffic bytes counter - * @BATADV_CNT_NC_RECODE: transmitted nc-recombined traffic packet counter - * @BATADV_CNT_NC_RECODE_BYTES: transmitted nc-recombined traffic bytes counter - * @BATADV_CNT_NC_BUFFER: counter for packets buffered for later nc decoding - * @BATADV_CNT_NC_DECODE: received and nc-decoded traffic packet counter - * @BATADV_CNT_NC_DECODE_BYTES: received and nc-decoded traffic bytes counter - * @BATADV_CNT_NC_DECODE_FAILED: received and decode-failed traffic packet - * counter - * @BATADV_CNT_NC_SNIFFED: counter for nc-decoded packets received in promisc - * mode. - * @BATADV_CNT_NUM: number of traffic counters */ enum batadv_counters { + /** @BATADV_CNT_TX: transmitted payload traffic packet counter */ BATADV_CNT_TX, + + /** @BATADV_CNT_TX_BYTES: transmitted payload traffic bytes counter */ BATADV_CNT_TX_BYTES, + + /** + * @BATADV_CNT_TX_DROPPED: dropped transmission payload traffic packet + * counter + */ BATADV_CNT_TX_DROPPED, + + /** @BATADV_CNT_RX: received payload traffic packet counter */ BATADV_CNT_RX, + + /** @BATADV_CNT_RX_BYTES: received payload traffic bytes counter */ BATADV_CNT_RX_BYTES, + + /** @BATADV_CNT_FORWARD: forwarded payload traffic packet counter */ BATADV_CNT_FORWARD, + + /** + * @BATADV_CNT_FORWARD_BYTES: forwarded payload traffic bytes counter + */ BATADV_CNT_FORWARD_BYTES, + + /** + * @BATADV_CNT_MGMT_TX: transmitted routing protocol traffic packet + * counter + */ BATADV_CNT_MGMT_TX, + + /** + * @BATADV_CNT_MGMT_TX_BYTES: transmitted routing protocol traffic bytes + * counter + */ BATADV_CNT_MGMT_TX_BYTES, + + /** + * @BATADV_CNT_MGMT_RX: received routing protocol traffic packet counter + */ BATADV_CNT_MGMT_RX, + + /** + * @BATADV_CNT_MGMT_RX_BYTES: received routing protocol traffic bytes + * counter + */ BATADV_CNT_MGMT_RX_BYTES, + + /** @BATADV_CNT_FRAG_TX: transmitted fragment traffic packet counter */ BATADV_CNT_FRAG_TX, + + /** + * @BATADV_CNT_FRAG_TX_BYTES: transmitted fragment traffic bytes counter + */ BATADV_CNT_FRAG_TX_BYTES, + + /** @BATADV_CNT_FRAG_RX: received fragment traffic packet counter */ BATADV_CNT_FRAG_RX, + + /** + * @BATADV_CNT_FRAG_RX_BYTES: received fragment traffic bytes counter + */ BATADV_CNT_FRAG_RX_BYTES, + + /** @BATADV_CNT_FRAG_FWD: forwarded fragment traffic packet counter */ BATADV_CNT_FRAG_FWD, + + /** + * @BATADV_CNT_FRAG_FWD_BYTES: forwarded fragment traffic bytes counter + */ BATADV_CNT_FRAG_FWD_BYTES, + + /** + * @BATADV_CNT_TT_REQUEST_TX: transmitted tt req traffic packet counter + */ BATADV_CNT_TT_REQUEST_TX, + + /** @BATADV_CNT_TT_REQUEST_RX: received tt req traffic packet counter */ BATADV_CNT_TT_REQUEST_RX, + + /** + * @BATADV_CNT_TT_RESPONSE_TX: transmitted tt resp traffic packet + * counter + */ BATADV_CNT_TT_RESPONSE_TX, + + /** + * @BATADV_CNT_TT_RESPONSE_RX: received tt resp traffic packet counter + */ BATADV_CNT_TT_RESPONSE_RX, + + /** + * @BATADV_CNT_TT_ROAM_ADV_TX: transmitted tt roam traffic packet + * counter + */ BATADV_CNT_TT_ROAM_ADV_TX, + + /** + * @BATADV_CNT_TT_ROAM_ADV_RX: received tt roam traffic packet counter + */ BATADV_CNT_TT_ROAM_ADV_RX, + #ifdef CONFIG_BATMAN_ADV_DAT + /** + * @BATADV_CNT_DAT_GET_TX: transmitted dht GET traffic packet counter + */ BATADV_CNT_DAT_GET_TX, + + /** @BATADV_CNT_DAT_GET_RX: received dht GET traffic packet counter */ BATADV_CNT_DAT_GET_RX, + + /** + * @BATADV_CNT_DAT_PUT_TX: transmitted dht PUT traffic packet counter + */ BATADV_CNT_DAT_PUT_TX, + + /** @BATADV_CNT_DAT_PUT_RX: received dht PUT traffic packet counter */ BATADV_CNT_DAT_PUT_RX, + + /** + * @BATADV_CNT_DAT_CACHED_REPLY_TX: transmitted dat cache reply traffic + * packet counter + */ BATADV_CNT_DAT_CACHED_REPLY_TX, #endif + #ifdef CONFIG_BATMAN_ADV_NC + /** + * @BATADV_CNT_NC_CODE: transmitted nc-combined traffic packet counter + */ BATADV_CNT_NC_CODE, + + /** + * @BATADV_CNT_NC_CODE_BYTES: transmitted nc-combined traffic bytes + * counter + */ BATADV_CNT_NC_CODE_BYTES, + + /** + * @BATADV_CNT_NC_RECODE: transmitted nc-recombined traffic packet + * counter + */ BATADV_CNT_NC_RECODE, + + /** + * @BATADV_CNT_NC_RECODE_BYTES: transmitted nc-recombined traffic bytes + * counter + */ BATADV_CNT_NC_RECODE_BYTES, + + /** + * @BATADV_CNT_NC_BUFFER: counter for packets buffered for later nc + * decoding + */ BATADV_CNT_NC_BUFFER, + + /** + * @BATADV_CNT_NC_DECODE: received and nc-decoded traffic packet counter + */ BATADV_CNT_NC_DECODE, + + /** + * @BATADV_CNT_NC_DECODE_BYTES: received and nc-decoded traffic bytes + * counter + */ BATADV_CNT_NC_DECODE_BYTES, + + /** + * @BATADV_CNT_NC_DECODE_FAILED: received and decode-failed traffic + * packet counter + */ BATADV_CNT_NC_DECODE_FAILED, + + /** + * @BATADV_CNT_NC_SNIFFED: counter for nc-decoded packets received in + * promisc mode. + */ BATADV_CNT_NC_SNIFFED, #endif + + /** @BATADV_CNT_NUM: number of traffic counters */ BATADV_CNT_NUM, }; /** * struct batadv_priv_tt - per mesh interface translation table data - * @vn: translation table version number - * @ogm_append_cnt: counter of number of OGMs containing the local tt diff - * @local_changes: changes registered in an originator interval - * @changes_list: tracks tt local changes within an originator interval - * @local_hash: local translation table hash table - * @global_hash: global translation table hash table - * @req_list: list of pending & unanswered tt_requests - * @roam_list: list of the last roaming events of each client limiting the - * number of roaming events to avoid route flapping - * @changes_list_lock: lock protecting changes_list - * @req_list_lock: lock protecting req_list - * @roam_list_lock: lock protecting roam_list - * @last_changeset: last tt changeset this host has generated - * @last_changeset_len: length of last tt changeset this host has generated - * @last_changeset_lock: lock protecting last_changeset & last_changeset_len - * @commit_lock: prevents from executing a local TT commit while reading the - * local table. The local TT commit is made up by two operations (data - * structure update and metdata -CRC/TTVN- recalculation) and they have to be - * executed atomically in order to avoid another thread to read the - * table/metadata between those. - * @work: work queue callback item for translation table purging */ struct batadv_priv_tt { + /** @vn: translation table version number */ atomic_t vn; + + /** + * @ogm_append_cnt: counter of number of OGMs containing the local tt + * diff + */ atomic_t ogm_append_cnt; + + /** @local_changes: changes registered in an originator interval */ atomic_t local_changes; + + /** + * @changes_list: tracks tt local changes within an originator interval + */ struct list_head changes_list; + + /** @local_hash: local translation table hash table */ struct batadv_hashtable *local_hash; + + /** @global_hash: global translation table hash table */ struct batadv_hashtable *global_hash; + + /** @req_list: list of pending & unanswered tt_requests */ struct hlist_head req_list; + + /** + * @roam_list: list of the last roaming events of each client limiting + * the number of roaming events to avoid route flapping + */ struct list_head roam_list; - spinlock_t changes_list_lock; /* protects changes */ - spinlock_t req_list_lock; /* protects req_list */ - spinlock_t roam_list_lock; /* protects roam_list */ + + /** @changes_list_lock: lock protecting changes_list */ + spinlock_t changes_list_lock; + + /** @req_list_lock: lock protecting req_list */ + spinlock_t req_list_lock; + + /** @roam_list_lock: lock protecting roam_list */ + spinlock_t roam_list_lock; + + /** @last_changeset: last tt changeset this host has generated */ unsigned char *last_changeset; + + /** + * @last_changeset_len: length of last tt changeset this host has + * generated + */ s16 last_changeset_len; - /* protects last_changeset & last_changeset_len */ + + /** + * @last_changeset_lock: lock protecting last_changeset & + * last_changeset_len + */ spinlock_t last_changeset_lock; - /* prevents from executing a commit while reading the table */ + + /** + * @commit_lock: prevents from executing a local TT commit while reading + * the local table. The local TT commit is made up by two operations + * (data structure update and metdata -CRC/TTVN- recalculation) and + * they have to be executed atomically in order to avoid another thread + * to read the table/metadata between those. + */ spinlock_t commit_lock; + + /** @work: work queue callback item for translation table purging */ struct delayed_work work; }; @@ -675,31 +1003,57 @@ struct batadv_priv_tt { /** * struct batadv_priv_bla - per mesh interface bridge loope avoidance data - * @num_requests: number of bla requests in flight - * @claim_hash: hash table containing mesh nodes this host has claimed - * @backbone_hash: hash table containing all detected backbone gateways - * @loopdetect_addr: MAC address used for own loopdetection frames - * @loopdetect_lasttime: time when the loopdetection frames were sent - * @loopdetect_next: how many periods to wait for the next loopdetect process - * @bcast_duplist: recently received broadcast packets array (for broadcast - * duplicate suppression) - * @bcast_duplist_curr: index of last broadcast packet added to bcast_duplist - * @bcast_duplist_lock: lock protecting bcast_duplist & bcast_duplist_curr - * @claim_dest: local claim data (e.g. claim group) - * @work: work queue callback item for cleanups & bla announcements */ struct batadv_priv_bla { + /** @num_requests: number of bla requests in flight */ atomic_t num_requests; + + /** + * @claim_hash: hash table containing mesh nodes this host has claimed + */ struct batadv_hashtable *claim_hash; + + /** + * @backbone_hash: hash table containing all detected backbone gateways + */ struct batadv_hashtable *backbone_hash; + + /** @loopdetect_addr: MAC address used for own loopdetection frames */ u8 loopdetect_addr[ETH_ALEN]; + + /** + * @loopdetect_lasttime: time when the loopdetection frames were sent + */ unsigned long loopdetect_lasttime; + + /** + * @loopdetect_next: how many periods to wait for the next loopdetect + * process + */ atomic_t loopdetect_next; + + /** + * @bcast_duplist: recently received broadcast packets array (for + * broadcast duplicate suppression) + */ struct batadv_bcast_duplist_entry bcast_duplist[BATADV_DUPLIST_SIZE]; + + /** + * @bcast_duplist_curr: index of last broadcast packet added to + * bcast_duplist + */ int bcast_duplist_curr; - /* protects bcast_duplist & bcast_duplist_curr */ + + /** + * @bcast_duplist_lock: lock protecting bcast_duplist & + * bcast_duplist_curr + */ spinlock_t bcast_duplist_lock; + + /** @claim_dest: local claim data (e.g. claim group) */ struct batadv_bla_claim_dst claim_dest; + + /** @work: work queue callback item for cleanups & bla announcements */ struct delayed_work work; }; #endif @@ -708,68 +1062,94 @@ struct batadv_priv_bla { /** * struct batadv_priv_debug_log - debug logging data - * @log_buff: buffer holding the logs (ring bufer) - * @log_start: index of next character to read - * @log_end: index of next character to write - * @lock: lock protecting log_buff, log_start & log_end - * @queue_wait: log reader's wait queue */ struct batadv_priv_debug_log { + /** @log_buff: buffer holding the logs (ring bufer) */ char log_buff[BATADV_LOG_BUF_LEN]; + + /** @log_start: index of next character to read */ unsigned long log_start; + + /** @log_end: index of next character to write */ unsigned long log_end; - spinlock_t lock; /* protects log_buff, log_start and log_end */ + + /** @lock: lock protecting log_buff, log_start & log_end */ + spinlock_t lock; + + /** @queue_wait: log reader's wait queue */ wait_queue_head_t queue_wait; }; #endif /** * struct batadv_priv_gw - per mesh interface gateway data - * @gateway_list: list of available gateway nodes - * @list_lock: lock protecting gateway_list & curr_gw - * @curr_gw: pointer to currently selected gateway node - * @mode: gateway operation: off, client or server (see batadv_gw_modes) - * @sel_class: gateway selection class (applies if gw_mode client) - * @bandwidth_down: advertised uplink download bandwidth (if gw_mode server) - * @bandwidth_up: advertised uplink upload bandwidth (if gw_mode server) - * @reselect: bool indicating a gateway re-selection is in progress */ struct batadv_priv_gw { + /** @gateway_list: list of available gateway nodes */ struct hlist_head gateway_list; - spinlock_t list_lock; /* protects gateway_list & curr_gw */ - struct batadv_gw_node __rcu *curr_gw; /* rcu protected pointer */ + + /** @list_lock: lock protecting gateway_list & curr_gw */ + spinlock_t list_lock; + + /** @curr_gw: pointer to currently selected gateway node */ + struct batadv_gw_node __rcu *curr_gw; + + /** + * @mode: gateway operation: off, client or server (see batadv_gw_modes) + */ atomic_t mode; + + /** @sel_class: gateway selection class (applies if gw_mode client) */ atomic_t sel_class; + + /** + * @bandwidth_down: advertised uplink download bandwidth (if gw_mode + * server) + */ atomic_t bandwidth_down; + + /** + * @bandwidth_up: advertised uplink upload bandwidth (if gw_mode server) + */ atomic_t bandwidth_up; + + /** @reselect: bool indicating a gateway re-selection is in progress */ atomic_t reselect; }; /** * struct batadv_priv_tvlv - per mesh interface tvlv data - * @container_list: list of registered tvlv containers to be sent with each OGM - * @handler_list: list of the various tvlv content handlers - * @container_list_lock: protects tvlv container list access - * @handler_list_lock: protects handler list access */ struct batadv_priv_tvlv { + /** + * @container_list: list of registered tvlv containers to be sent with + * each OGM + */ struct hlist_head container_list; + + /** @handler_list: list of the various tvlv content handlers */ struct hlist_head handler_list; - spinlock_t container_list_lock; /* protects container_list */ - spinlock_t handler_list_lock; /* protects handler_list */ + + /** @container_list_lock: protects tvlv container list access */ + spinlock_t container_list_lock; + + /** @handler_list_lock: protects handler list access */ + spinlock_t handler_list_lock; }; #ifdef CONFIG_BATMAN_ADV_DAT /** * struct batadv_priv_dat - per mesh interface DAT private data - * @addr: node DAT address - * @hash: hashtable representing the local ARP cache - * @work: work queue callback item for cache purging */ struct batadv_priv_dat { + /** @addr: node DAT address */ batadv_dat_addr_t addr; + + /** @hash: hashtable representing the local ARP cache */ struct batadv_hashtable *hash; + + /** @work: work queue callback item for cache purging */ struct delayed_work work; }; #endif @@ -777,375 +1157,582 @@ struct batadv_priv_dat { #ifdef CONFIG_BATMAN_ADV_MCAST /** * struct batadv_mcast_querier_state - IGMP/MLD querier state when bridged - * @exists: whether a querier exists in the mesh - * @shadowing: if a querier exists, whether it is potentially shadowing - * multicast listeners (i.e. querier is behind our own bridge segment) */ struct batadv_mcast_querier_state { + /** @exists: whether a querier exists in the mesh */ bool exists; + + /** + * @shadowing: if a querier exists, whether it is potentially shadowing + * multicast listeners (i.e. querier is behind our own bridge segment) + */ bool shadowing; }; /** * struct batadv_priv_mcast - per mesh interface mcast data - * @mla_list: list of multicast addresses we are currently announcing via TT - * @want_all_unsnoopables_list: a list of orig_nodes wanting all unsnoopable - * multicast traffic - * @want_all_ipv4_list: a list of orig_nodes wanting all IPv4 multicast traffic - * @want_all_ipv6_list: a list of orig_nodes wanting all IPv6 multicast traffic - * @querier_ipv4: the current state of an IGMP querier in the mesh - * @querier_ipv6: the current state of an MLD querier in the mesh - * @flags: the flags we have last sent in our mcast tvlv - * @enabled: whether the multicast tvlv is currently enabled - * @bridged: whether the soft interface has a bridge on top - * @num_disabled: number of nodes that have no mcast tvlv - * @num_want_all_unsnoopables: number of nodes wanting unsnoopable IP traffic - * @num_want_all_ipv4: counter for items in want_all_ipv4_list - * @num_want_all_ipv6: counter for items in want_all_ipv6_list - * @want_lists_lock: lock for protecting modifications to mcast want lists - * (traversals are rcu-locked) - * @work: work queue callback item for multicast TT and TVLV updates */ struct batadv_priv_mcast { + /** + * @mla_list: list of multicast addresses we are currently announcing + * via TT + */ struct hlist_head mla_list; /* see __batadv_mcast_mla_update() */ + + /** + * @want_all_unsnoopables_list: a list of orig_nodes wanting all + * unsnoopable multicast traffic + */ struct hlist_head want_all_unsnoopables_list; + + /** + * @want_all_ipv4_list: a list of orig_nodes wanting all IPv4 multicast + * traffic + */ struct hlist_head want_all_ipv4_list; + + /** + * @want_all_ipv6_list: a list of orig_nodes wanting all IPv6 multicast + * traffic + */ struct hlist_head want_all_ipv6_list; + + /** @querier_ipv4: the current state of an IGMP querier in the mesh */ struct batadv_mcast_querier_state querier_ipv4; + + /** @querier_ipv6: the current state of an MLD querier in the mesh */ struct batadv_mcast_querier_state querier_ipv6; + + /** @flags: the flags we have last sent in our mcast tvlv */ u8 flags; + + /** @enabled: whether the multicast tvlv is currently enabled */ bool enabled; + + /** @bridged: whether the soft interface has a bridge on top */ bool bridged; + + /** @num_disabled: number of nodes that have no mcast tvlv */ atomic_t num_disabled; + + /** + * @num_want_all_unsnoopables: number of nodes wanting unsnoopable IP + * traffic + */ atomic_t num_want_all_unsnoopables; + + /** @num_want_all_ipv4: counter for items in want_all_ipv4_list */ atomic_t num_want_all_ipv4; + + /** @num_want_all_ipv6: counter for items in want_all_ipv6_list */ atomic_t num_want_all_ipv6; - /* protects want_all_{unsnoopables,ipv4,ipv6}_list */ + + /** + * @want_lists_lock: lock for protecting modifications to mcasts + * want_all_{unsnoopables,ipv4,ipv6}_list (traversals are rcu-locked) + */ spinlock_t want_lists_lock; + + /** @work: work queue callback item for multicast TT and TVLV updates */ struct delayed_work work; }; #endif /** * struct batadv_priv_nc - per mesh interface network coding private data - * @work: work queue callback item for cleanup - * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs - * @min_tq: only consider neighbors for encoding if neigh_tq > min_tq - * @max_fwd_delay: maximum packet forward delay to allow coding of packets - * @max_buffer_time: buffer time for sniffed packets used to decoding - * @timestamp_fwd_flush: timestamp of last forward packet queue flush - * @timestamp_sniffed_purge: timestamp of last sniffed packet queue purge - * @coding_hash: Hash table used to buffer skbs while waiting for another - * incoming skb to code it with. Skbs are added to the buffer just before being - * forwarded in routing.c - * @decoding_hash: Hash table used to buffer skbs that might be needed to decode - * a received coded skb. The buffer is used for 1) skbs arriving on the - * soft-interface; 2) skbs overheard on the hard-interface; and 3) skbs - * forwarded by batman-adv. */ struct batadv_priv_nc { + /** @work: work queue callback item for cleanup */ struct delayed_work work; + + /** + * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs + */ struct dentry *debug_dir; + + /** + * @min_tq: only consider neighbors for encoding if neigh_tq > min_tq + */ u8 min_tq; + + /** + * @max_fwd_delay: maximum packet forward delay to allow coding of + * packets + */ u32 max_fwd_delay; + + /** + * @max_buffer_time: buffer time for sniffed packets used to decoding + */ u32 max_buffer_time; + + /** + * @timestamp_fwd_flush: timestamp of last forward packet queue flush + */ unsigned long timestamp_fwd_flush; + + /** + * @timestamp_sniffed_purge: timestamp of last sniffed packet queue + * purge + */ unsigned long timestamp_sniffed_purge; + + /** + * @coding_hash: Hash table used to buffer skbs while waiting for + * another incoming skb to code it with. Skbs are added to the buffer + * just before being forwarded in routing.c + */ struct batadv_hashtable *coding_hash; + + /** + * @decoding_hash: Hash table used to buffer skbs that might be needed + * to decode a received coded skb. The buffer is used for 1) skbs + * arriving on the soft-interface; 2) skbs overheard on the + * hard-interface; and 3) skbs forwarded by batman-adv. + */ struct batadv_hashtable *decoding_hash; }; /** * struct batadv_tp_unacked - unacked packet meta-information - * @seqno: seqno of the unacked packet - * @len: length of the packet - * @list: list node for batadv_tp_vars::unacked_list * * This struct is supposed to represent a buffer unacked packet. However, since * the purpose of the TP meter is to count the traffic only, there is no need to * store the entire sk_buff, the starting offset and the length are enough */ struct batadv_tp_unacked { + /** @seqno: seqno of the unacked packet */ u32 seqno; + + /** @len: length of the packet */ u16 len; + + /** @list: list node for &batadv_tp_vars.unacked_list */ struct list_head list; }; /** * enum batadv_tp_meter_role - Modus in tp meter session - * @BATADV_TP_RECEIVER: Initialized as receiver - * @BATADV_TP_SENDER: Initialized as sender */ enum batadv_tp_meter_role { + /** @BATADV_TP_RECEIVER: Initialized as receiver */ BATADV_TP_RECEIVER, + + /** @BATADV_TP_SENDER: Initialized as sender */ BATADV_TP_SENDER }; /** * struct batadv_tp_vars - tp meter private variables per session - * @list: list node for bat_priv::tp_list - * @timer: timer for ack (receiver) and retry (sender) - * @bat_priv: pointer to the mesh object - * @start_time: start time in jiffies - * @other_end: mac address of remote - * @role: receiver/sender modi - * @sending: sending binary semaphore: 1 if sending, 0 is not - * @reason: reason for a stopped session - * @finish_work: work item for the finishing procedure - * @test_length: test length in milliseconds - * @session: TP session identifier - * @icmp_uid: local ICMP "socket" index - * @dec_cwnd: decimal part of the cwnd used during linear growth - * @cwnd: current size of the congestion window - * @cwnd_lock: lock do protect @cwnd & @dec_cwnd - * @ss_threshold: Slow Start threshold. Once cwnd exceeds this value the - * connection switches to the Congestion Avoidance state - * @last_acked: last acked byte - * @last_sent: last sent byte, not yet acked - * @tot_sent: amount of data sent/ACKed so far - * @dup_acks: duplicate ACKs counter - * @fast_recovery: true if in Fast Recovery mode - * @recover: last sent seqno when entering Fast Recovery - * @rto: sender timeout - * @srtt: smoothed RTT scaled by 2^3 - * @rttvar: RTT variation scaled by 2^2 - * @more_bytes: waiting queue anchor when waiting for more ack/retry timeout - * @prerandom_offset: offset inside the prerandom buffer - * @prerandom_lock: spinlock protecting access to prerandom_offset - * @last_recv: last in-order received packet - * @unacked_list: list of unacked packets (meta-info only) - * @unacked_lock: protect unacked_list - * @last_recv_time: time time (jiffies) a msg was received - * @refcount: number of context where the object is used - * @rcu: struct used for freeing in an RCU-safe manner */ struct batadv_tp_vars { + /** @list: list node for &bat_priv.tp_list */ struct hlist_node list; + + /** @timer: timer for ack (receiver) and retry (sender) */ struct timer_list timer; + + /** @bat_priv: pointer to the mesh object */ struct batadv_priv *bat_priv; + + /** @start_time: start time in jiffies */ unsigned long start_time; + + /** @other_end: mac address of remote */ u8 other_end[ETH_ALEN]; + + /** @role: receiver/sender modi */ enum batadv_tp_meter_role role; + + /** @sending: sending binary semaphore: 1 if sending, 0 is not */ atomic_t sending; + + /** @reason: reason for a stopped session */ enum batadv_tp_meter_reason reason; + + /** @finish_work: work item for the finishing procedure */ struct delayed_work finish_work; + + /** @test_length: test length in milliseconds */ u32 test_length; + + /** @session: TP session identifier */ u8 session[2]; + + /** @icmp_uid: local ICMP "socket" index */ u8 icmp_uid; /* sender variables */ + + /** @dec_cwnd: decimal part of the cwnd used during linear growth */ u16 dec_cwnd; + + /** @cwnd: current size of the congestion window */ u32 cwnd; - spinlock_t cwnd_lock; /* Protects cwnd & dec_cwnd */ + + /** @cwnd_lock: lock do protect @cwnd & @dec_cwnd */ + spinlock_t cwnd_lock; + + /** + * @ss_threshold: Slow Start threshold. Once cwnd exceeds this value the + * connection switches to the Congestion Avoidance state + */ u32 ss_threshold; + + /** @last_acked: last acked byte */ atomic_t last_acked; + + /** @last_sent: last sent byte, not yet acked */ u32 last_sent; + + /** @tot_sent: amount of data sent/ACKed so far */ atomic64_t tot_sent; + + /** @dup_acks: duplicate ACKs counter */ atomic_t dup_acks; + + /** @fast_recovery: true if in Fast Recovery mode */ bool fast_recovery; + + /** @recover: last sent seqno when entering Fast Recovery */ u32 recover; + + /** @rto: sender timeout */ u32 rto; + + /** @srtt: smoothed RTT scaled by 2^3 */ u32 srtt; + + /** @rttvar: RTT variation scaled by 2^2 */ u32 rttvar; + + /** + * @more_bytes: waiting queue anchor when waiting for more ack/retry + * timeout + */ wait_queue_head_t more_bytes; + + /** @prerandom_offset: offset inside the prerandom buffer */ u32 prerandom_offset; - spinlock_t prerandom_lock; /* Protects prerandom_offset */ + + /** @prerandom_lock: spinlock protecting access to prerandom_offset */ + spinlock_t prerandom_lock; /* receiver variables */ + + /** @last_recv: last in-order received packet */ u32 last_recv; + + /** @unacked_list: list of unacked packets (meta-info only) */ struct list_head unacked_list; - spinlock_t unacked_lock; /* Protects unacked_list */ + + /** @unacked_lock: protect unacked_list */ + spinlock_t unacked_lock; + + /** @last_recv_time: time time (jiffies) a msg was received */ unsigned long last_recv_time; + + /** @refcount: number of context where the object is used */ struct kref refcount; + + /** @rcu: struct used for freeing in an RCU-safe manner */ struct rcu_head rcu; }; /** * struct batadv_softif_vlan - per VLAN attributes set - * @bat_priv: pointer to the mesh object - * @vid: VLAN identifier - * @kobj: kobject for sysfs vlan subdirectory - * @ap_isolation: AP isolation state - * @tt: TT private attributes (VLAN specific) - * @list: list node for bat_priv::softif_vlan_list - * @refcount: number of context where this object is currently in use - * @rcu: struct used for freeing in a RCU-safe manner */ struct batadv_softif_vlan { + /** @bat_priv: pointer to the mesh object */ struct batadv_priv *bat_priv; + + /** @vid: VLAN identifier */ unsigned short vid; + + /** @kobj: kobject for sysfs vlan subdirectory */ struct kobject *kobj; + + /** @ap_isolation: AP isolation state */ atomic_t ap_isolation; /* boolean */ + + /** @tt: TT private attributes (VLAN specific) */ struct batadv_vlan_tt tt; + + /** @list: list node for &bat_priv.softif_vlan_list */ struct hlist_node list; + + /** + * @refcount: number of context where this object is currently in use + */ struct kref refcount; + + /** @rcu: struct used for freeing in a RCU-safe manner */ struct rcu_head rcu; }; /** * struct batadv_priv_bat_v - B.A.T.M.A.N. V per soft-interface private data - * @ogm_buff: buffer holding the OGM packet - * @ogm_buff_len: length of the OGM packet buffer - * @ogm_seqno: OGM sequence number - used to identify each OGM - * @ogm_wq: workqueue used to schedule OGM transmissions */ struct batadv_priv_bat_v { + /** @ogm_buff: buffer holding the OGM packet */ unsigned char *ogm_buff; + + /** @ogm_buff_len: length of the OGM packet buffer */ int ogm_buff_len; + + /** @ogm_seqno: OGM sequence number - used to identify each OGM */ atomic_t ogm_seqno; + + /** @ogm_wq: workqueue used to schedule OGM transmissions */ struct delayed_work ogm_wq; }; /** * struct batadv_priv - per mesh interface data - * @mesh_state: current status of the mesh (inactive/active/deactivating) - * @soft_iface: net device which holds this struct as private data - * @bat_counters: mesh internal traffic statistic counters (see batadv_counters) - * @aggregated_ogms: bool indicating whether OGM aggregation is enabled - * @bonding: bool indicating whether traffic bonding is enabled - * @fragmentation: bool indicating whether traffic fragmentation is enabled - * @packet_size_max: max packet size that can be transmitted via - * multiple fragmented skbs or a single frame if fragmentation is disabled - * @frag_seqno: incremental counter to identify chains of egress fragments - * @bridge_loop_avoidance: bool indicating whether bridge loop avoidance is - * enabled - * @distributed_arp_table: bool indicating whether distributed ARP table is - * enabled - * @multicast_mode: Enable or disable multicast optimizations on this node's - * sender/originating side - * @orig_interval: OGM broadcast interval in milliseconds - * @hop_penalty: penalty which will be applied to an OGM's tq-field on every hop - * @log_level: configured log level (see batadv_dbg_level) - * @isolation_mark: the skb->mark value used to match packets for AP isolation - * @isolation_mark_mask: bitmask identifying the bits in skb->mark to be used - * for the isolation mark - * @bcast_seqno: last sent broadcast packet sequence number - * @bcast_queue_left: number of remaining buffered broadcast packet slots - * @batman_queue_left: number of remaining OGM packet slots - * @num_ifaces: number of interfaces assigned to this mesh interface - * @mesh_obj: kobject for sysfs mesh subdirectory - * @debug_dir: dentry for debugfs batman-adv subdirectory - * @forw_bat_list: list of aggregated OGMs that will be forwarded - * @forw_bcast_list: list of broadcast packets that will be rebroadcasted - * @tp_list: list of tp sessions - * @tp_num: number of currently active tp sessions - * @orig_hash: hash table containing mesh participants (orig nodes) - * @forw_bat_list_lock: lock protecting forw_bat_list - * @forw_bcast_list_lock: lock protecting forw_bcast_list - * @tp_list_lock: spinlock protecting @tp_list - * @orig_work: work queue callback item for orig node purging - * @primary_if: one of the hard-interfaces assigned to this mesh interface - * becomes the primary interface - * @algo_ops: routing algorithm used by this mesh interface - * @softif_vlan_list: a list of softif_vlan structs, one per VLAN created on top - * of the mesh interface represented by this object - * @softif_vlan_list_lock: lock protecting softif_vlan_list - * @bla: bridge loope avoidance data - * @debug_log: holding debug logging relevant data - * @gw: gateway data - * @tt: translation table data - * @tvlv: type-version-length-value data - * @dat: distributed arp table data - * @mcast: multicast data - * @network_coding: bool indicating whether network coding is enabled - * @nc: network coding data - * @bat_v: B.A.T.M.A.N. V per soft-interface private data */ struct batadv_priv { + /** + * @mesh_state: current status of the mesh + * (inactive/active/deactivating) + */ atomic_t mesh_state; + + /** @soft_iface: net device which holds this struct as private data */ struct net_device *soft_iface; + + /** + * @bat_counters: mesh internal traffic statistic counters (see + * batadv_counters) + */ u64 __percpu *bat_counters; /* Per cpu counters */ + + /** + * @aggregated_ogms: bool indicating whether OGM aggregation is enabled + */ atomic_t aggregated_ogms; + + /** @bonding: bool indicating whether traffic bonding is enabled */ atomic_t bonding; + + /** + * @fragmentation: bool indicating whether traffic fragmentation is + * enabled + */ atomic_t fragmentation; + + /** + * @packet_size_max: max packet size that can be transmitted via + * multiple fragmented skbs or a single frame if fragmentation is + * disabled + */ atomic_t packet_size_max; + + /** + * @frag_seqno: incremental counter to identify chains of egress + * fragments + */ atomic_t frag_seqno; + #ifdef CONFIG_BATMAN_ADV_BLA + /** + * @bridge_loop_avoidance: bool indicating whether bridge loop + * avoidance is enabled + */ atomic_t bridge_loop_avoidance; #endif + #ifdef CONFIG_BATMAN_ADV_DAT + /** + * @distributed_arp_table: bool indicating whether distributed ARP table + * is enabled + */ atomic_t distributed_arp_table; #endif + #ifdef CONFIG_BATMAN_ADV_MCAST + /** + * @multicast_mode: Enable or disable multicast optimizations on this + * node's sender/originating side + */ atomic_t multicast_mode; #endif + + /** @orig_interval: OGM broadcast interval in milliseconds */ atomic_t orig_interval; + + /** + * @hop_penalty: penalty which will be applied to an OGM's tq-field on + * every hop + */ atomic_t hop_penalty; + #ifdef CONFIG_BATMAN_ADV_DEBUG + /** @log_level: configured log level (see batadv_dbg_level) */ atomic_t log_level; #endif + + /** + * @isolation_mark: the skb->mark value used to match packets for AP + * isolation + */ u32 isolation_mark; + + /** + * @isolation_mark_mask: bitmask identifying the bits in skb->mark to be + * used for the isolation mark + */ u32 isolation_mark_mask; + + /** @bcast_seqno: last sent broadcast packet sequence number */ atomic_t bcast_seqno; + + /** + * @bcast_queue_left: number of remaining buffered broadcast packet + * slots + */ atomic_t bcast_queue_left; + + /** @batman_queue_left: number of remaining OGM packet slots */ atomic_t batman_queue_left; + + /** @num_ifaces: number of interfaces assigned to this mesh interface */ char num_ifaces; + + /** @mesh_obj: kobject for sysfs mesh subdirectory */ struct kobject *mesh_obj; + + /** @debug_dir: dentry for debugfs batman-adv subdirectory */ struct dentry *debug_dir; + + /** @forw_bat_list: list of aggregated OGMs that will be forwarded */ struct hlist_head forw_bat_list; + + /** + * @forw_bcast_list: list of broadcast packets that will be + * rebroadcasted + */ struct hlist_head forw_bcast_list; + + /** @tp_list: list of tp sessions */ struct hlist_head tp_list; + + /** @tp_num: number of currently active tp sessions */ struct batadv_hashtable *orig_hash; - spinlock_t forw_bat_list_lock; /* protects forw_bat_list */ - spinlock_t forw_bcast_list_lock; /* protects forw_bcast_list */ - spinlock_t tp_list_lock; /* protects tp_list */ + + /** @orig_hash: hash table containing mesh participants (orig nodes) */ + spinlock_t forw_bat_list_lock; + + /** @forw_bat_list_lock: lock protecting forw_bat_list */ + spinlock_t forw_bcast_list_lock; + + /** @forw_bcast_list_lock: lock protecting forw_bcast_list */ + spinlock_t tp_list_lock; + + /** @tp_list_lock: spinlock protecting @tp_list */ atomic_t tp_num; + + /** @orig_work: work queue callback item for orig node purging */ struct delayed_work orig_work; + + /** + * @primary_if: one of the hard-interfaces assigned to this mesh + * interface becomes the primary interface + */ struct batadv_hard_iface __rcu *primary_if; /* rcu protected pointer */ + + /** @algo_ops: routing algorithm used by this mesh interface */ struct batadv_algo_ops *algo_ops; + + /** + * @softif_vlan_list: a list of softif_vlan structs, one per VLAN + * created on top of the mesh interface represented by this object + */ struct hlist_head softif_vlan_list; - spinlock_t softif_vlan_list_lock; /* protects softif_vlan_list */ + + /** @softif_vlan_list_lock: lock protecting softif_vlan_list */ + spinlock_t softif_vlan_list_lock; + #ifdef CONFIG_BATMAN_ADV_BLA + /** @bla: bridge loope avoidance data */ struct batadv_priv_bla bla; #endif + #ifdef CONFIG_BATMAN_ADV_DEBUG + /** @debug_log: holding debug logging relevant data */ struct batadv_priv_debug_log *debug_log; #endif + + /** @gw: gateway data */ struct batadv_priv_gw gw; + + /** @tt: translation table data */ struct batadv_priv_tt tt; + + /** @tvlv: type-version-length-value data */ struct batadv_priv_tvlv tvlv; + #ifdef CONFIG_BATMAN_ADV_DAT + /** @dat: distributed arp table data */ struct batadv_priv_dat dat; #endif + #ifdef CONFIG_BATMAN_ADV_MCAST + /** @mcast: multicast data */ struct batadv_priv_mcast mcast; #endif + #ifdef CONFIG_BATMAN_ADV_NC + /** + * @network_coding: bool indicating whether network coding is enabled + */ atomic_t network_coding; + + /** @nc: network coding data */ struct batadv_priv_nc nc; #endif /* CONFIG_BATMAN_ADV_NC */ + #ifdef CONFIG_BATMAN_ADV_BATMAN_V + /** @bat_v: B.A.T.M.A.N. V per soft-interface private data */ struct batadv_priv_bat_v bat_v; #endif }; /** * struct batadv_socket_client - layer2 icmp socket client data - * @queue_list: packet queue for packets destined for this socket client - * @queue_len: number of packets in the packet queue (queue_list) - * @index: socket client's index in the batadv_socket_client_hash - * @lock: lock protecting queue_list, queue_len & index - * @queue_wait: socket client's wait queue - * @bat_priv: pointer to soft_iface this client belongs to */ struct batadv_socket_client { + /** + * @queue_list: packet queue for packets destined for this socket client + */ struct list_head queue_list; + + /** @queue_len: number of packets in the packet queue (queue_list) */ unsigned int queue_len; + + /** @index: socket client's index in the batadv_socket_client_hash */ unsigned char index; - spinlock_t lock; /* protects queue_list, queue_len & index */ + + /** @lock: lock protecting queue_list, queue_len & index */ + spinlock_t lock; + + /** @queue_wait: socket client's wait queue */ wait_queue_head_t queue_wait; + + /** @bat_priv: pointer to soft_iface this client belongs to */ struct batadv_priv *bat_priv; }; /** * struct batadv_socket_packet - layer2 icmp packet for socket client - * @list: list node for batadv_socket_client::queue_list - * @icmp_len: size of the layer2 icmp packet - * @icmp_packet: layer2 icmp packet */ struct batadv_socket_packet { + /** @list: list node for &batadv_socket_client.queue_list */ struct list_head list; + + /** @icmp_len: size of the layer2 icmp packet */ size_t icmp_len; + + /** @icmp_packet: layer2 icmp packet */ u8 icmp_packet[BATADV_ICMP_MAX_PACKET_SIZE]; }; @@ -1153,312 +1740,432 @@ struct batadv_socket_packet { /** * struct batadv_bla_backbone_gw - batman-adv gateway bridged into the LAN - * @orig: originator address of backbone node (mac address of primary iface) - * @vid: vlan id this gateway was detected on - * @hash_entry: hlist node for batadv_priv_bla::backbone_hash - * @bat_priv: pointer to soft_iface this backbone gateway belongs to - * @lasttime: last time we heard of this backbone gw - * @wait_periods: grace time for bridge forward delays and bla group forming at - * bootup phase - no bcast traffic is formwared until it has elapsed - * @request_sent: if this bool is set to true we are out of sync with this - * backbone gateway - no bcast traffic is formwared until the situation was - * resolved - * @crc: crc16 checksum over all claims - * @crc_lock: lock protecting crc - * @report_work: work struct for reporting detected loops - * @refcount: number of contexts the object is used - * @rcu: struct used for freeing in an RCU-safe manner */ struct batadv_bla_backbone_gw { + /** + * @orig: originator address of backbone node (mac address of primary + * iface) + */ u8 orig[ETH_ALEN]; + + /** @vid: vlan id this gateway was detected on */ unsigned short vid; + + /** @hash_entry: hlist node for &batadv_priv_bla.backbone_hash */ struct hlist_node hash_entry; + + /** @bat_priv: pointer to soft_iface this backbone gateway belongs to */ struct batadv_priv *bat_priv; + + /** @lasttime: last time we heard of this backbone gw */ unsigned long lasttime; + + /** + * @wait_periods: grace time for bridge forward delays and bla group + * forming at bootup phase - no bcast traffic is formwared until it has + * elapsed + */ atomic_t wait_periods; + + /** + * @request_sent: if this bool is set to true we are out of sync with + * this backbone gateway - no bcast traffic is formwared until the + * situation was resolved + */ atomic_t request_sent; + + /** @crc: crc16 checksum over all claims */ u16 crc; - spinlock_t crc_lock; /* protects crc */ + + /** @crc_lock: lock protecting crc */ + spinlock_t crc_lock; + + /** @report_work: work struct for reporting detected loops */ struct work_struct report_work; + + /** @refcount: number of contexts the object is used */ struct kref refcount; + + /** @rcu: struct used for freeing in an RCU-safe manner */ struct rcu_head rcu; }; /** * struct batadv_bla_claim - claimed non-mesh client structure - * @addr: mac address of claimed non-mesh client - * @vid: vlan id this client was detected on - * @backbone_gw: pointer to backbone gw claiming this client - * @backbone_lock: lock protecting backbone_gw pointer - * @lasttime: last time we heard of claim (locals only) - * @hash_entry: hlist node for batadv_priv_bla::claim_hash - * @refcount: number of contexts the object is used - * @rcu: struct used for freeing in an RCU-safe manner */ struct batadv_bla_claim { + /** @addr: mac address of claimed non-mesh client */ u8 addr[ETH_ALEN]; + + /** @vid: vlan id this client was detected on */ unsigned short vid; + + /** @backbone_gw: pointer to backbone gw claiming this client */ struct batadv_bla_backbone_gw *backbone_gw; - spinlock_t backbone_lock; /* protects backbone_gw */ + + /** @backbone_lock: lock protecting backbone_gw pointer */ + spinlock_t backbone_lock; + + /** @lasttime: last time we heard of claim (locals only) */ unsigned long lasttime; + + /** @hash_entry: hlist node for &batadv_priv_bla.claim_hash */ struct hlist_node hash_entry; + + /** @refcount: number of contexts the object is used */ struct rcu_head rcu; + + /** @rcu: struct used for freeing in an RCU-safe manner */ struct kref refcount; }; #endif /** * struct batadv_tt_common_entry - tt local & tt global common data - * @addr: mac address of non-mesh client - * @vid: VLAN identifier - * @hash_entry: hlist node for batadv_priv_tt::local_hash or for - * batadv_priv_tt::global_hash - * @flags: various state handling flags (see batadv_tt_client_flags) - * @added_at: timestamp used for purging stale tt common entries - * @refcount: number of contexts the object is used - * @rcu: struct used for freeing in an RCU-safe manner */ struct batadv_tt_common_entry { + /** @addr: mac address of non-mesh client */ u8 addr[ETH_ALEN]; + + /** @vid: VLAN identifier */ unsigned short vid; + + /** + * @hash_entry: hlist node for &batadv_priv_tt.local_hash or for + * &batadv_priv_tt.global_hash + */ struct hlist_node hash_entry; + + /** @flags: various state handling flags (see batadv_tt_client_flags) */ u16 flags; + + /** @added_at: timestamp used for purging stale tt common entries */ unsigned long added_at; + + /** @refcount: number of contexts the object is used */ struct kref refcount; + + /** @rcu: struct used for freeing in an RCU-safe manner */ struct rcu_head rcu; }; /** * struct batadv_tt_local_entry - translation table local entry data - * @common: general translation table data - * @last_seen: timestamp used for purging stale tt local entries - * @vlan: soft-interface vlan of the entry */ struct batadv_tt_local_entry { + /** @common: general translation table data */ struct batadv_tt_common_entry common; + + /** @last_seen: timestamp used for purging stale tt local entries */ unsigned long last_seen; + + /** @vlan: soft-interface vlan of the entry */ struct batadv_softif_vlan *vlan; }; /** * struct batadv_tt_global_entry - translation table global entry data - * @common: general translation table data - * @orig_list: list of orig nodes announcing this non-mesh client - * @orig_list_count: number of items in the orig_list - * @list_lock: lock protecting orig_list - * @roam_at: time at which TT_GLOBAL_ROAM was set */ struct batadv_tt_global_entry { + /** @common: general translation table data */ struct batadv_tt_common_entry common; + + /** @orig_list: list of orig nodes announcing this non-mesh client */ struct hlist_head orig_list; + + /** @orig_list_count: number of items in the orig_list */ atomic_t orig_list_count; - spinlock_t list_lock; /* protects orig_list */ + + /** @list_lock: lock protecting orig_list */ + spinlock_t list_lock; + + /** @roam_at: time at which TT_GLOBAL_ROAM was set */ unsigned long roam_at; }; /** * struct batadv_tt_orig_list_entry - orig node announcing a non-mesh client - * @orig_node: pointer to orig node announcing this non-mesh client - * @ttvn: translation table version number which added the non-mesh client - * @flags: per orig entry TT sync flags - * @list: list node for batadv_tt_global_entry::orig_list - * @refcount: number of contexts the object is used - * @rcu: struct used for freeing in an RCU-safe manner */ struct batadv_tt_orig_list_entry { + /** @orig_node: pointer to orig node announcing this non-mesh client */ struct batadv_orig_node *orig_node; + + /** + * @ttvn: translation table version number which added the non-mesh + * client + */ u8 ttvn; + + /** @flags: per orig entry TT sync flags */ u8 flags; + + /** @list: list node for &batadv_tt_global_entry.orig_list */ struct hlist_node list; + + /** @refcount: number of contexts the object is used */ struct kref refcount; + + /** @rcu: struct used for freeing in an RCU-safe manner */ struct rcu_head rcu; }; /** * struct batadv_tt_change_node - structure for tt changes occurred - * @list: list node for batadv_priv_tt::changes_list - * @change: holds the actual translation table diff data */ struct batadv_tt_change_node { + /** @list: list node for &batadv_priv_tt.changes_list */ struct list_head list; + + /** @change: holds the actual translation table diff data */ struct batadv_tvlv_tt_change change; }; /** * struct batadv_tt_req_node - data to keep track of the tt requests in flight - * @addr: mac address address of the originator this request was sent to - * @issued_at: timestamp used for purging stale tt requests - * @refcount: number of contexts the object is used by - * @list: list node for batadv_priv_tt::req_list */ struct batadv_tt_req_node { + /** + * @addr: mac address address of the originator this request was sent to + */ u8 addr[ETH_ALEN]; + + /** @issued_at: timestamp used for purging stale tt requests */ unsigned long issued_at; + + /** @refcount: number of contexts the object is used by */ struct kref refcount; + + /** @list: list node for &batadv_priv_tt.req_list */ struct hlist_node list; }; /** * struct batadv_tt_roam_node - roaming client data - * @addr: mac address of the client in the roaming phase - * @counter: number of allowed roaming events per client within a single - * OGM interval (changes are committed with each OGM) - * @first_time: timestamp used for purging stale roaming node entries - * @list: list node for batadv_priv_tt::roam_list */ struct batadv_tt_roam_node { + /** @addr: mac address of the client in the roaming phase */ u8 addr[ETH_ALEN]; + + /** + * @counter: number of allowed roaming events per client within a single + * OGM interval (changes are committed with each OGM) + */ atomic_t counter; + + /** + * @first_time: timestamp used for purging stale roaming node entries + */ unsigned long first_time; + + /** @list: list node for &batadv_priv_tt.roam_list */ struct list_head list; }; /** * struct batadv_nc_node - network coding node - * @list: next and prev pointer for the list handling - * @addr: the node's mac address - * @refcount: number of contexts the object is used by - * @rcu: struct used for freeing in an RCU-safe manner - * @orig_node: pointer to corresponding orig node struct - * @last_seen: timestamp of last ogm received from this node */ struct batadv_nc_node { + /** @list: next and prev pointer for the list handling */ struct list_head list; + + /** @addr: the node's mac address */ u8 addr[ETH_ALEN]; + + /** @refcount: number of contexts the object is used by */ struct kref refcount; + + /** @rcu: struct used for freeing in an RCU-safe manner */ struct rcu_head rcu; + + /** @orig_node: pointer to corresponding orig node struct */ struct batadv_orig_node *orig_node; + + /** @last_seen: timestamp of last ogm received from this node */ unsigned long last_seen; }; /** * struct batadv_nc_path - network coding path - * @hash_entry: next and prev pointer for the list handling - * @rcu: struct used for freeing in an RCU-safe manner - * @refcount: number of contexts the object is used by - * @packet_list: list of buffered packets for this path - * @packet_list_lock: access lock for packet list - * @next_hop: next hop (destination) of path - * @prev_hop: previous hop (source) of path - * @last_valid: timestamp for last validation of path */ struct batadv_nc_path { + /** @hash_entry: next and prev pointer for the list handling */ struct hlist_node hash_entry; + + /** @rcu: struct used for freeing in an RCU-safe manner */ struct rcu_head rcu; + + /** @refcount: number of contexts the object is used by */ struct kref refcount; + + /** @packet_list: list of buffered packets for this path */ struct list_head packet_list; - spinlock_t packet_list_lock; /* Protects packet_list */ + + /** @packet_list_lock: access lock for packet list */ + spinlock_t packet_list_lock; + + /** @next_hop: next hop (destination) of path */ u8 next_hop[ETH_ALEN]; + + /** @prev_hop: previous hop (source) of path */ u8 prev_hop[ETH_ALEN]; + + /** @last_valid: timestamp for last validation of path */ unsigned long last_valid; }; /** * struct batadv_nc_packet - network coding packet used when coding and * decoding packets - * @list: next and prev pointer for the list handling - * @packet_id: crc32 checksum of skb data - * @timestamp: field containing the info when the packet was added to path - * @neigh_node: pointer to original next hop neighbor of skb - * @skb: skb which can be encoded or used for decoding - * @nc_path: pointer to path this nc packet is attached to */ struct batadv_nc_packet { + /** @list: next and prev pointer for the list handling */ struct list_head list; + + /** @packet_id: crc32 checksum of skb data */ __be32 packet_id; + + /** + * @timestamp: field containing the info when the packet was added to + * path + */ unsigned long timestamp; + + /** @neigh_node: pointer to original next hop neighbor of skb */ struct batadv_neigh_node *neigh_node; + + /** @skb: skb which can be encoded or used for decoding */ struct sk_buff *skb; + + /** @nc_path: pointer to path this nc packet is attached to */ struct batadv_nc_path *nc_path; }; /** * struct batadv_skb_cb - control buffer structure used to store private data * relevant to batman-adv in the skb->cb buffer in skbs. - * @decoded: Marks a skb as decoded, which is checked when searching for coding - * opportunities in network-coding.c - * @num_bcasts: Counter for broadcast packet retransmissions */ struct batadv_skb_cb { + /** + * @decoded: Marks a skb as decoded, which is checked when searching for + * coding opportunities in network-coding.c + */ bool decoded; + + /** @num_bcasts: Counter for broadcast packet retransmissions */ unsigned int num_bcasts; }; /** * struct batadv_forw_packet - structure for bcast packets to be sent/forwarded - * @list: list node for batadv_priv::forw_{bat,bcast}_list - * @cleanup_list: list node for purging functions - * @send_time: execution time for delayed_work (packet sending) - * @own: bool for locally generated packets (local OGMs are re-scheduled after - * sending) - * @skb: bcast packet's skb buffer - * @packet_len: size of aggregated OGM packet inside the skb buffer - * @direct_link_flags: direct link flags for aggregated OGM packets - * @num_packets: counter for aggregated OGMv1 packets - * @delayed_work: work queue callback item for packet sending - * @if_incoming: pointer to incoming hard-iface or primary iface if - * locally generated packet - * @if_outgoing: packet where the packet should be sent to, or NULL if - * unspecified - * @queue_left: The queue (counter) this packet was applied to */ struct batadv_forw_packet { + /** + * @list: list node for &batadv_priv.forw.bcast_list and + * &batadv_priv.forw.bat_list + */ struct hlist_node list; + + /** @cleanup_list: list node for purging functions */ struct hlist_node cleanup_list; + + /** @send_time: execution time for delayed_work (packet sending) */ unsigned long send_time; + + /** + * @own: bool for locally generated packets (local OGMs are re-scheduled + * after sending) + */ u8 own; + + /** @skb: bcast packet's skb buffer */ struct sk_buff *skb; + + /** @packet_len: size of aggregated OGM packet inside the skb buffer */ u16 packet_len; + + /** @direct_link_flags: direct link flags for aggregated OGM packets */ u32 direct_link_flags; + + /** @num_packets: counter for aggregated OGMv1 packets */ u8 num_packets; + + /** @delayed_work: work queue callback item for packet sending */ struct delayed_work delayed_work; + + /** + * @if_incoming: pointer to incoming hard-iface or primary iface if + * locally generated packet + */ struct batadv_hard_iface *if_incoming; + + /** + * @if_outgoing: packet where the packet should be sent to, or NULL if + * unspecified + */ struct batadv_hard_iface *if_outgoing; + + /** @queue_left: The queue (counter) this packet was applied to */ atomic_t *queue_left; }; /** * struct batadv_algo_iface_ops - mesh algorithm callbacks (interface specific) - * @activate: start routing mechanisms when hard-interface is brought up - * (optional) - * @enable: init routing info when hard-interface is enabled - * @disable: de-init routing info when hard-interface is disabled - * @update_mac: (re-)init mac addresses of the protocol information - * belonging to this hard-interface - * @primary_set: called when primary interface is selected / changed */ struct batadv_algo_iface_ops { + /** + * @activate: start routing mechanisms when hard-interface is brought up + * (optional) + */ void (*activate)(struct batadv_hard_iface *hard_iface); + + /** @enable: init routing info when hard-interface is enabled */ int (*enable)(struct batadv_hard_iface *hard_iface); + + /** @disable: de-init routing info when hard-interface is disabled */ void (*disable)(struct batadv_hard_iface *hard_iface); + + /** + * @update_mac: (re-)init mac addresses of the protocol information + * belonging to this hard-interface + */ void (*update_mac)(struct batadv_hard_iface *hard_iface); + + /** @primary_set: called when primary interface is selected / changed */ void (*primary_set)(struct batadv_hard_iface *hard_iface); }; /** * struct batadv_algo_neigh_ops - mesh algorithm callbacks (neighbour specific) - * @hardif_init: called on creation of single hop entry - * (optional) - * @cmp: compare the metrics of two neighbors for their respective outgoing - * interfaces - * @is_similar_or_better: check if neigh1 is equally similar or better than - * neigh2 for their respective outgoing interface from the metric prospective - * @print: print the single hop neighbor list (optional) - * @dump: dump neighbors to a netlink socket (optional) */ struct batadv_algo_neigh_ops { + /** @hardif_init: called on creation of single hop entry (optional) */ void (*hardif_init)(struct batadv_hardif_neigh_node *neigh); + + /** + * @cmp: compare the metrics of two neighbors for their respective + * outgoing interfaces + */ int (*cmp)(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2); + + /** + * @is_similar_or_better: check if neigh1 is equally similar or better + * than neigh2 for their respective outgoing interface from the metric + * prospective + */ bool (*is_similar_or_better)(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2); + #ifdef CONFIG_BATMAN_ADV_DEBUGFS + /** @print: print the single hop neighbor list (optional) */ void (*print)(struct batadv_priv *priv, struct seq_file *seq); #endif + + /** @dump: dump neighbors to a netlink socket (optional) */ void (*dump)(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *priv, struct batadv_hard_iface *hard_iface); @@ -1466,24 +2173,36 @@ struct batadv_algo_neigh_ops { /** * struct batadv_algo_orig_ops - mesh algorithm callbacks (originator specific) - * @free: free the resources allocated by the routing algorithm for an orig_node - * object (optional) - * @add_if: ask the routing algorithm to apply the needed changes to the - * orig_node due to a new hard-interface being added into the mesh (optional) - * @del_if: ask the routing algorithm to apply the needed changes to the - * orig_node due to an hard-interface being removed from the mesh (optional) - * @print: print the originator table (optional) - * @dump: dump originators to a netlink socket (optional) */ struct batadv_algo_orig_ops { + /** + * @free: free the resources allocated by the routing algorithm for an + * orig_node object (optional) + */ void (*free)(struct batadv_orig_node *orig_node); + + /** + * @add_if: ask the routing algorithm to apply the needed changes to the + * orig_node due to a new hard-interface being added into the mesh + * (optional) + */ int (*add_if)(struct batadv_orig_node *orig_node, int max_if_num); + + /** + * @del_if: ask the routing algorithm to apply the needed changes to the + * orig_node due to an hard-interface being removed from the mesh + * (optional) + */ int (*del_if)(struct batadv_orig_node *orig_node, int max_if_num, int del_if_num); + #ifdef CONFIG_BATMAN_ADV_DEBUGFS + /** @print: print the originator table (optional) */ void (*print)(struct batadv_priv *priv, struct seq_file *seq, struct batadv_hard_iface *hard_iface); #endif + + /** @dump: dump originators to a netlink socket (optional) */ void (*dump)(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *priv, struct batadv_hard_iface *hard_iface); @@ -1491,158 +2210,213 @@ struct batadv_algo_orig_ops { /** * struct batadv_algo_gw_ops - mesh algorithm callbacks (GW specific) - * @init_sel_class: initialize GW selection class (optional) - * @store_sel_class: parse and stores a new GW selection class (optional) - * @show_sel_class: prints the current GW selection class (optional) - * @get_best_gw_node: select the best GW from the list of available nodes - * (optional) - * @is_eligible: check if a newly discovered GW is a potential candidate for - * the election as best GW (optional) - * @print: print the gateway table (optional) - * @dump: dump gateways to a netlink socket (optional) */ struct batadv_algo_gw_ops { + /** @init_sel_class: initialize GW selection class (optional) */ void (*init_sel_class)(struct batadv_priv *bat_priv); + + /** + * @store_sel_class: parse and stores a new GW selection class + * (optional) + */ ssize_t (*store_sel_class)(struct batadv_priv *bat_priv, char *buff, size_t count); + + /** @show_sel_class: prints the current GW selection class (optional) */ ssize_t (*show_sel_class)(struct batadv_priv *bat_priv, char *buff); + + /** + * @get_best_gw_node: select the best GW from the list of available + * nodes (optional) + */ struct batadv_gw_node *(*get_best_gw_node) (struct batadv_priv *bat_priv); + + /** + * @is_eligible: check if a newly discovered GW is a potential candidate + * for the election as best GW (optional) + */ bool (*is_eligible)(struct batadv_priv *bat_priv, struct batadv_orig_node *curr_gw_orig, struct batadv_orig_node *orig_node); + #ifdef CONFIG_BATMAN_ADV_DEBUGFS + /** @print: print the gateway table (optional) */ void (*print)(struct batadv_priv *bat_priv, struct seq_file *seq); #endif + + /** @dump: dump gateways to a netlink socket (optional) */ void (*dump)(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *priv); }; /** * struct batadv_algo_ops - mesh algorithm callbacks - * @list: list node for the batadv_algo_list - * @name: name of the algorithm - * @iface: callbacks related to interface handling - * @neigh: callbacks related to neighbors handling - * @orig: callbacks related to originators handling - * @gw: callbacks related to GW mode */ struct batadv_algo_ops { + /** @list: list node for the batadv_algo_list */ struct hlist_node list; + + /** @name: name of the algorithm */ char *name; + + /** @iface: callbacks related to interface handling */ struct batadv_algo_iface_ops iface; + + /** @neigh: callbacks related to neighbors handling */ struct batadv_algo_neigh_ops neigh; + + /** @orig: callbacks related to originators handling */ struct batadv_algo_orig_ops orig; + + /** @gw: callbacks related to GW mode */ struct batadv_algo_gw_ops gw; }; /** * struct batadv_dat_entry - it is a single entry of batman-adv ARP backend. It * is used to stored ARP entries needed for the global DAT cache - * @ip: the IPv4 corresponding to this DAT/ARP entry - * @mac_addr: the MAC address associated to the stored IPv4 - * @vid: the vlan ID associated to this entry - * @last_update: time in jiffies when this entry was refreshed last time - * @hash_entry: hlist node for batadv_priv_dat::hash - * @refcount: number of contexts the object is used - * @rcu: struct used for freeing in an RCU-safe manner */ struct batadv_dat_entry { + /** @ip: the IPv4 corresponding to this DAT/ARP entry */ __be32 ip; + + /** @mac_addr: the MAC address associated to the stored IPv4 */ u8 mac_addr[ETH_ALEN]; + + /** @vid: the vlan ID associated to this entry */ unsigned short vid; + + /** + * @last_update: time in jiffies when this entry was refreshed last time + */ unsigned long last_update; + + /** @hash_entry: hlist node for &batadv_priv_dat.hash */ struct hlist_node hash_entry; + + /** @refcount: number of contexts the object is used */ struct kref refcount; + + /** @rcu: struct used for freeing in an RCU-safe manner */ struct rcu_head rcu; }; /** * struct batadv_hw_addr - a list entry for a MAC address - * @list: list node for the linking of entries - * @addr: the MAC address of this list entry */ struct batadv_hw_addr { + /** @list: list node for the linking of entries */ struct hlist_node list; + + /** @addr: the MAC address of this list entry */ unsigned char addr[ETH_ALEN]; }; /** * struct batadv_dat_candidate - candidate destination for DAT operations - * @type: the type of the selected candidate. It can one of the following: - * - BATADV_DAT_CANDIDATE_NOT_FOUND - * - BATADV_DAT_CANDIDATE_ORIG - * @orig_node: if type is BATADV_DAT_CANDIDATE_ORIG this field points to the - * corresponding originator node structure */ struct batadv_dat_candidate { + /** + * @type: the type of the selected candidate. It can one of the + * following: + * - BATADV_DAT_CANDIDATE_NOT_FOUND + * - BATADV_DAT_CANDIDATE_ORIG + */ int type; + + /** + * @orig_node: if type is BATADV_DAT_CANDIDATE_ORIG this field points to + * the corresponding originator node structure + */ struct batadv_orig_node *orig_node; }; /** * struct batadv_tvlv_container - container for tvlv appended to OGMs - * @list: hlist node for batadv_priv_tvlv::container_list - * @tvlv_hdr: tvlv header information needed to construct the tvlv - * @refcount: number of contexts the object is used */ struct batadv_tvlv_container { + /** @list: hlist node for &batadv_priv_tvlv.container_list */ struct hlist_node list; + + /** @tvlv_hdr: tvlv header information needed to construct the tvlv */ struct batadv_tvlv_hdr tvlv_hdr; + + /** @refcount: number of contexts the object is used */ struct kref refcount; }; /** * struct batadv_tvlv_handler - handler for specific tvlv type and version - * @list: hlist node for batadv_priv_tvlv::handler_list - * @ogm_handler: handler callback which is given the tvlv payload to process on - * incoming OGM packets - * @unicast_handler: handler callback which is given the tvlv payload to process - * on incoming unicast tvlv packets - * @type: tvlv type this handler feels responsible for - * @version: tvlv version this handler feels responsible for - * @flags: tvlv handler flags - * @refcount: number of contexts the object is used - * @rcu: struct used for freeing in an RCU-safe manner */ struct batadv_tvlv_handler { + /** @list: hlist node for &batadv_priv_tvlv.handler_list */ struct hlist_node list; + + /** + * @ogm_handler: handler callback which is given the tvlv payload to + * process on incoming OGM packets + */ void (*ogm_handler)(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 flags, void *tvlv_value, u16 tvlv_value_len); + + /** + * @unicast_handler: handler callback which is given the tvlv payload to + * process on incoming unicast tvlv packets + */ int (*unicast_handler)(struct batadv_priv *bat_priv, u8 *src, u8 *dst, void *tvlv_value, u16 tvlv_value_len); + + /** @type: tvlv type this handler feels responsible for */ u8 type; + + /** @version: tvlv version this handler feels responsible for */ u8 version; + + /** @flags: tvlv handler flags */ u8 flags; + + /** @refcount: number of contexts the object is used */ struct kref refcount; + + /** @rcu: struct used for freeing in an RCU-safe manner */ struct rcu_head rcu; }; /** * enum batadv_tvlv_handler_flags - tvlv handler flags definitions - * @BATADV_TVLV_HANDLER_OGM_CIFNOTFND: tvlv ogm processing function will call - * this handler even if its type was not found (with no data) - * @BATADV_TVLV_HANDLER_OGM_CALLED: interval tvlv handling flag - the API marks - * a handler as being called, so it won't be called if the - * BATADV_TVLV_HANDLER_OGM_CIFNOTFND flag was set */ enum batadv_tvlv_handler_flags { + /** + * @BATADV_TVLV_HANDLER_OGM_CIFNOTFND: tvlv ogm processing function + * will call this handler even if its type was not found (with no data) + */ BATADV_TVLV_HANDLER_OGM_CIFNOTFND = BIT(1), + + /** + * @BATADV_TVLV_HANDLER_OGM_CALLED: interval tvlv handling flag - the + * API marks a handler as being called, so it won't be called if the + * BATADV_TVLV_HANDLER_OGM_CIFNOTFND flag was set + */ BATADV_TVLV_HANDLER_OGM_CALLED = BIT(2), }; /** * struct batadv_store_mesh_work - Work queue item to detach add/del interface * from sysfs locks - * @net_dev: netdevice to add/remove to/from batman-adv soft-interface - * @soft_iface_name: name of soft-interface to modify - * @work: work queue item */ struct batadv_store_mesh_work { + /** + * @net_dev: netdevice to add/remove to/from batman-adv soft-interface + */ struct net_device *net_dev; + + /** @soft_iface_name: name of soft-interface to modify */ char soft_iface_name[IFNAMSIZ]; + + /** @work: work queue item */ struct work_struct work; }; diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 91e3ba280706..f897681780db 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -421,7 +421,7 @@ out: } EXPORT_SYMBOL(bt_sock_stream_recvmsg); -static inline unsigned int bt_accept_poll(struct sock *parent) +static inline __poll_t bt_accept_poll(struct sock *parent) { struct bt_sock *s, *n; struct sock *sk; @@ -437,11 +437,11 @@ static inline unsigned int bt_accept_poll(struct sock *parent) return 0; } -unsigned int bt_sock_poll(struct file *file, struct socket *sock, +__poll_t bt_sock_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; - unsigned int mask = 0; + __poll_t mask = 0; BT_DBG("sock %p, sk %p", sock, sk); @@ -766,43 +766,39 @@ static int __init bt_init(void) return err; err = sock_register(&bt_sock_family_ops); - if (err < 0) { - bt_sysfs_cleanup(); - return err; - } + if (err) + goto cleanup_sysfs; BT_INFO("HCI device and connection manager initialized"); err = hci_sock_init(); - if (err < 0) - goto error; + if (err) + goto unregister_socket; err = l2cap_init(); - if (err < 0) - goto sock_err; + if (err) + goto cleanup_socket; err = sco_init(); - if (err < 0) { - l2cap_exit(); - goto sock_err; - } + if (err) + goto cleanup_cap; err = mgmt_init(); - if (err < 0) { - sco_exit(); - l2cap_exit(); - goto sock_err; - } + if (err) + goto cleanup_sco; return 0; -sock_err: +cleanup_sco: + sco_exit(); +cleanup_cap: + l2cap_exit(); +cleanup_socket: hci_sock_cleanup(); - -error: +unregister_socket: sock_unregister(PF_BLUETOOTH); +cleanup_sysfs: bt_sysfs_cleanup(); - return err; } diff --git a/net/bluetooth/cmtp/capi.c b/net/bluetooth/cmtp/capi.c index bb308224099c..426a92f02db4 100644 --- a/net/bluetooth/cmtp/capi.c +++ b/net/bluetooth/cmtp/capi.c @@ -527,7 +527,6 @@ static int cmtp_proc_open(struct inode *inode, struct file *file) } static const struct file_operations cmtp_proc_fops = { - .owner = THIS_MODULE, .open = cmtp_proc_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index 63df63ebfb24..57403bd567d0 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -88,6 +88,9 @@ static int __name ## _show(struct seq_file *f, void *ptr) \ return 0; \ } \ \ +DEFINE_SHOW_ATTRIBUTE(__name) + +#define DEFINE_SHOW_ATTRIBUTE(__name) \ static int __name ## _open(struct inode *inode, struct file *file) \ { \ return single_open(file, __name ## _show, inode->i_private); \ @@ -106,37 +109,16 @@ static int features_show(struct seq_file *f, void *ptr) u8 p; hci_dev_lock(hdev); - for (p = 0; p < HCI_MAX_PAGES && p <= hdev->max_page; p++) { - seq_printf(f, "%2u: 0x%2.2x 0x%2.2x 0x%2.2x 0x%2.2x " - "0x%2.2x 0x%2.2x 0x%2.2x 0x%2.2x\n", p, - hdev->features[p][0], hdev->features[p][1], - hdev->features[p][2], hdev->features[p][3], - hdev->features[p][4], hdev->features[p][5], - hdev->features[p][6], hdev->features[p][7]); - } + for (p = 0; p < HCI_MAX_PAGES && p <= hdev->max_page; p++) + seq_printf(f, "%2u: %8ph\n", p, hdev->features[p]); if (lmp_le_capable(hdev)) - seq_printf(f, "LE: 0x%2.2x 0x%2.2x 0x%2.2x 0x%2.2x " - "0x%2.2x 0x%2.2x 0x%2.2x 0x%2.2x\n", - hdev->le_features[0], hdev->le_features[1], - hdev->le_features[2], hdev->le_features[3], - hdev->le_features[4], hdev->le_features[5], - hdev->le_features[6], hdev->le_features[7]); + seq_printf(f, "LE: %8ph\n", hdev->le_features); hci_dev_unlock(hdev); return 0; } -static int features_open(struct inode *inode, struct file *file) -{ - return single_open(file, features_show, inode->i_private); -} - -static const struct file_operations features_fops = { - .open = features_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(features); static int device_id_show(struct seq_file *f, void *ptr) { @@ -150,17 +132,7 @@ static int device_id_show(struct seq_file *f, void *ptr) return 0; } -static int device_id_open(struct inode *inode, struct file *file) -{ - return single_open(file, device_id_show, inode->i_private); -} - -static const struct file_operations device_id_fops = { - .open = device_id_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(device_id); static int device_list_show(struct seq_file *f, void *ptr) { @@ -180,17 +152,7 @@ static int device_list_show(struct seq_file *f, void *ptr) return 0; } -static int device_list_open(struct inode *inode, struct file *file) -{ - return single_open(file, device_list_show, inode->i_private); -} - -static const struct file_operations device_list_fops = { - .open = device_list_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(device_list); static int blacklist_show(struct seq_file *f, void *p) { @@ -205,17 +167,7 @@ static int blacklist_show(struct seq_file *f, void *p) return 0; } -static int blacklist_open(struct inode *inode, struct file *file) -{ - return single_open(file, blacklist_show, inode->i_private); -} - -static const struct file_operations blacklist_fops = { - .open = blacklist_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(blacklist); static int uuids_show(struct seq_file *f, void *p) { @@ -240,17 +192,7 @@ static int uuids_show(struct seq_file *f, void *p) return 0; } -static int uuids_open(struct inode *inode, struct file *file) -{ - return single_open(file, uuids_show, inode->i_private); -} - -static const struct file_operations uuids_fops = { - .open = uuids_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(uuids); static int remote_oob_show(struct seq_file *f, void *ptr) { @@ -269,17 +211,7 @@ static int remote_oob_show(struct seq_file *f, void *ptr) return 0; } -static int remote_oob_open(struct inode *inode, struct file *file) -{ - return single_open(file, remote_oob_show, inode->i_private); -} - -static const struct file_operations remote_oob_fops = { - .open = remote_oob_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(remote_oob); static int conn_info_min_age_set(void *data, u64 val) { @@ -443,17 +375,7 @@ static int inquiry_cache_show(struct seq_file *f, void *p) return 0; } -static int inquiry_cache_open(struct inode *inode, struct file *file) -{ - return single_open(file, inquiry_cache_show, inode->i_private); -} - -static const struct file_operations inquiry_cache_fops = { - .open = inquiry_cache_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(inquiry_cache); static int link_keys_show(struct seq_file *f, void *ptr) { @@ -469,17 +391,7 @@ static int link_keys_show(struct seq_file *f, void *ptr) return 0; } -static int link_keys_open(struct inode *inode, struct file *file) -{ - return single_open(file, link_keys_show, inode->i_private); -} - -static const struct file_operations link_keys_fops = { - .open = link_keys_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(link_keys); static int dev_class_show(struct seq_file *f, void *ptr) { @@ -493,17 +405,7 @@ static int dev_class_show(struct seq_file *f, void *ptr) return 0; } -static int dev_class_open(struct inode *inode, struct file *file) -{ - return single_open(file, dev_class_show, inode->i_private); -} - -static const struct file_operations dev_class_fops = { - .open = dev_class_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(dev_class); static int voice_setting_get(void *data, u64 *val) { @@ -692,17 +594,7 @@ static int identity_show(struct seq_file *f, void *p) return 0; } -static int identity_open(struct inode *inode, struct file *file) -{ - return single_open(file, identity_show, inode->i_private); -} - -static const struct file_operations identity_fops = { - .open = identity_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(identity); static int rpa_timeout_set(void *data, u64 val) { @@ -746,17 +638,7 @@ static int random_address_show(struct seq_file *f, void *p) return 0; } -static int random_address_open(struct inode *inode, struct file *file) -{ - return single_open(file, random_address_show, inode->i_private); -} - -static const struct file_operations random_address_fops = { - .open = random_address_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(random_address); static int static_address_show(struct seq_file *f, void *p) { @@ -769,17 +651,7 @@ static int static_address_show(struct seq_file *f, void *p) return 0; } -static int static_address_open(struct inode *inode, struct file *file) -{ - return single_open(file, static_address_show, inode->i_private); -} - -static const struct file_operations static_address_fops = { - .open = static_address_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(static_address); static ssize_t force_static_address_read(struct file *file, char __user *user_buf, @@ -841,17 +713,7 @@ static int white_list_show(struct seq_file *f, void *ptr) return 0; } -static int white_list_open(struct inode *inode, struct file *file) -{ - return single_open(file, white_list_show, inode->i_private); -} - -static const struct file_operations white_list_fops = { - .open = white_list_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(white_list); static int identity_resolving_keys_show(struct seq_file *f, void *ptr) { @@ -869,18 +731,7 @@ static int identity_resolving_keys_show(struct seq_file *f, void *ptr) return 0; } -static int identity_resolving_keys_open(struct inode *inode, struct file *file) -{ - return single_open(file, identity_resolving_keys_show, - inode->i_private); -} - -static const struct file_operations identity_resolving_keys_fops = { - .open = identity_resolving_keys_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(identity_resolving_keys); static int long_term_keys_show(struct seq_file *f, void *ptr) { @@ -898,17 +749,7 @@ static int long_term_keys_show(struct seq_file *f, void *ptr) return 0; } -static int long_term_keys_open(struct inode *inode, struct file *file) -{ - return single_open(file, long_term_keys_show, inode->i_private); -} - -static const struct file_operations long_term_keys_fops = { - .open = long_term_keys_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(long_term_keys); static int conn_min_interval_set(void *data, u64 val) { diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index abc0f3224dd1..3394e6791673 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -919,6 +919,43 @@ static bool adv_use_rpa(struct hci_dev *hdev, uint32_t flags) return true; } +static bool is_advertising_allowed(struct hci_dev *hdev, bool connectable) +{ + /* If there is no connection we are OK to advertise. */ + if (hci_conn_num(hdev, LE_LINK) == 0) + return true; + + /* Check le_states if there is any connection in slave role. */ + if (hdev->conn_hash.le_num_slave > 0) { + /* Slave connection state and non connectable mode bit 20. */ + if (!connectable && !(hdev->le_states[2] & 0x10)) + return false; + + /* Slave connection state and connectable mode bit 38 + * and scannable bit 21. + */ + if (connectable && (!(hdev->le_states[4] & 0x01) || + !(hdev->le_states[2] & 0x40))) + return false; + } + + /* Check le_states if there is any connection in master role. */ + if (hci_conn_num(hdev, LE_LINK) != hdev->conn_hash.le_num_slave) { + /* Master connection state and non connectable mode bit 18. */ + if (!connectable && !(hdev->le_states[2] & 0x02)) + return false; + + /* Master connection state and connectable mode bit 35 and + * scannable 19. + */ + if (connectable && (!(hdev->le_states[4] & 0x10) || + !(hdev->le_states[2] & 0x08))) + return false; + } + + return true; +} + void __hci_req_enable_advertising(struct hci_request *req) { struct hci_dev *hdev = req->hdev; @@ -927,7 +964,15 @@ void __hci_req_enable_advertising(struct hci_request *req) bool connectable; u32 flags; - if (hci_conn_num(hdev, LE_LINK) > 0) + flags = get_adv_instance_flags(hdev, hdev->cur_adv_instance); + + /* If the "connectable" instance flag was not set, then choose between + * ADV_IND and ADV_NONCONN_IND based on the global connectable setting. + */ + connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) || + mgmt_get_connectable(hdev); + + if (!is_advertising_allowed(hdev, connectable)) return; if (hci_dev_test_flag(hdev, HCI_LE_ADV)) @@ -940,14 +985,6 @@ void __hci_req_enable_advertising(struct hci_request *req) */ hci_dev_clear_flag(hdev, HCI_LE_ADV); - flags = get_adv_instance_flags(hdev, hdev->cur_adv_instance); - - /* If the "connectable" instance flag was not set, then choose between - * ADV_IND and ADV_NONCONN_IND based on the global connectable setting. - */ - connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) || - mgmt_get_connectable(hdev); - /* Set require_privacy to true only when non-connectable * advertising is used. In that case it is fine to use a * non-resolvable private address. @@ -1985,13 +2022,6 @@ unlock: hci_dev_unlock(hdev); } -static void disable_advertising(struct hci_request *req) -{ - u8 enable = 0x00; - - hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); -} - static int active_scan(struct hci_request *req, unsigned long opt) { uint16_t interval = opt; @@ -2017,7 +2047,7 @@ static int active_scan(struct hci_request *req, unsigned long opt) cancel_adv_timeout(hdev); hci_dev_unlock(hdev); - disable_advertising(req); + __hci_req_disable_advertising(req); } /* If controller is scanning, it means the background scanning is diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index f2cec70d520c..1036e4fa1ea2 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -789,7 +789,7 @@ static int hidp_setup_hid(struct hidp_session *session, hid->dev.parent = &session->conn->hcon->dev; hid->ll_driver = &hidp_hid_driver; - /* True if device is blacklisted in drivers/hid/hid-core.c */ + /* True if device is blacklisted in drivers/hid/hid-quirks.c */ if (hid_ignore(hid)) { hid_destroy_device(session->hid); session->hid = NULL; diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 43ba91c440bc..fc6615d59165 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -3363,9 +3363,10 @@ static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data, size_t data break; case L2CAP_CONF_EFS: - remote_efs = 1; - if (olen == sizeof(efs)) + if (olen == sizeof(efs)) { + remote_efs = 1; memcpy(&efs, (void *) val, olen); + } break; case L2CAP_CONF_EWS: @@ -3584,16 +3585,17 @@ static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len, break; case L2CAP_CONF_EFS: - if (olen == sizeof(efs)) + if (olen == sizeof(efs)) { memcpy(&efs, (void *)val, olen); - if (chan->local_stype != L2CAP_SERV_NOTRAFIC && - efs.stype != L2CAP_SERV_NOTRAFIC && - efs.stype != chan->local_stype) - return -ECONNREFUSED; + if (chan->local_stype != L2CAP_SERV_NOTRAFIC && + efs.stype != L2CAP_SERV_NOTRAFIC && + efs.stype != chan->local_stype) + return -ECONNREFUSED; - l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs), - (unsigned long) &efs, endptr - ptr); + l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs), + (unsigned long) &efs, endptr - ptr); + } break; case L2CAP_CONF_FCS: diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index af5b8c87f590..1285ca30ab0a 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -125,9 +125,16 @@ static int br_dev_init(struct net_device *dev) if (!br->stats) return -ENOMEM; + err = br_fdb_hash_init(br); + if (err) { + free_percpu(br->stats); + return err; + } + err = br_vlan_init(br); if (err) { free_percpu(br->stats); + br_fdb_hash_fini(br); return err; } @@ -135,6 +142,7 @@ static int br_dev_init(struct net_device *dev) if (err) { free_percpu(br->stats); br_vlan_flush(br); + br_fdb_hash_fini(br); } br_set_lockdep_class(dev); @@ -148,6 +156,7 @@ static void br_dev_uninit(struct net_device *dev) br_multicast_dev_del(br); br_multicast_uninit_stats(br); br_vlan_flush(br); + br_fdb_hash_fini(br); free_percpu(br->stats); } @@ -416,6 +425,7 @@ void br_dev_setup(struct net_device *dev) br->dev = dev; spin_lock_init(&br->lock); INIT_LIST_HEAD(&br->port_list); + INIT_HLIST_HEAD(&br->fdb_list); spin_lock_init(&br->hash_lock); br->bridge_id.prio[0] = 0x80; diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 4ea5c8bbe286..d9e69e4514be 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -28,14 +28,20 @@ #include <trace/events/bridge.h> #include "br_private.h" +static const struct rhashtable_params br_fdb_rht_params = { + .head_offset = offsetof(struct net_bridge_fdb_entry, rhnode), + .key_offset = offsetof(struct net_bridge_fdb_entry, key), + .key_len = sizeof(struct net_bridge_fdb_key), + .automatic_shrinking = true, + .locks_mul = 1, +}; + static struct kmem_cache *br_fdb_cache __read_mostly; static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid); static void fdb_notify(struct net_bridge *br, const struct net_bridge_fdb_entry *, int); -static u32 fdb_salt __read_mostly; - int __init br_fdb_init(void) { br_fdb_cache = kmem_cache_create("bridge_fdb_cache", @@ -45,7 +51,6 @@ int __init br_fdb_init(void) if (!br_fdb_cache) return -ENOMEM; - get_random_bytes(&fdb_salt, sizeof(fdb_salt)); return 0; } @@ -54,6 +59,15 @@ void br_fdb_fini(void) kmem_cache_destroy(br_fdb_cache); } +int br_fdb_hash_init(struct net_bridge *br) +{ + return rhashtable_init(&br->fdb_hash_tbl, &br_fdb_rht_params); +} + +void br_fdb_hash_fini(struct net_bridge *br) +{ + rhashtable_destroy(&br->fdb_hash_tbl); +} /* if topology_changing then use forward_delay (default 15 sec) * otherwise keep longer (default 5 minutes) @@ -70,13 +84,6 @@ static inline int has_expired(const struct net_bridge *br, time_before_eq(fdb->updated + hold_time(br), jiffies); } -static inline int br_mac_hash(const unsigned char *mac, __u16 vid) -{ - /* use 1 byte of OUI and 3 bytes of NIC */ - u32 key = get_unaligned((u32 *)(mac + 2)); - return jhash_2words(key, vid, fdb_salt) & (BR_HASH_SIZE - 1); -} - static void fdb_rcu_free(struct rcu_head *head) { struct net_bridge_fdb_entry *ent @@ -84,19 +91,18 @@ static void fdb_rcu_free(struct rcu_head *head) kmem_cache_free(br_fdb_cache, ent); } -static struct net_bridge_fdb_entry *fdb_find_rcu(struct hlist_head *head, +static struct net_bridge_fdb_entry *fdb_find_rcu(struct rhashtable *tbl, const unsigned char *addr, __u16 vid) { - struct net_bridge_fdb_entry *f; + struct net_bridge_fdb_key key; WARN_ON_ONCE(!rcu_read_lock_held()); - hlist_for_each_entry_rcu(f, head, hlist) - if (ether_addr_equal(f->addr.addr, addr) && f->vlan_id == vid) - break; + key.vlan_id = vid; + memcpy(key.addr.addr, addr, sizeof(key.addr.addr)); - return f; + return rhashtable_lookup(tbl, &key, br_fdb_rht_params); } /* requires bridge hash_lock */ @@ -104,13 +110,12 @@ static struct net_bridge_fdb_entry *br_fdb_find(struct net_bridge *br, const unsigned char *addr, __u16 vid) { - struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; struct net_bridge_fdb_entry *fdb; lockdep_assert_held_once(&br->hash_lock); rcu_read_lock(); - fdb = fdb_find_rcu(head, addr, vid); + fdb = fdb_find_rcu(&br->fdb_hash_tbl, addr, vid); rcu_read_unlock(); return fdb; @@ -120,9 +125,7 @@ struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br, const unsigned char *addr, __u16 vid) { - struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; - - return fdb_find_rcu(head, addr, vid); + return fdb_find_rcu(&br->fdb_hash_tbl, addr, vid); } /* When a static FDB entry is added, the mac address from the entry is @@ -175,9 +178,11 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f) trace_fdb_delete(br, f); if (f->is_static) - fdb_del_hw_addr(br, f->addr.addr); + fdb_del_hw_addr(br, f->key.addr.addr); - hlist_del_init_rcu(&f->hlist); + hlist_del_init_rcu(&f->fdb_node); + rhashtable_remove_fast(&br->fdb_hash_tbl, &f->rhnode, + br_fdb_rht_params); fdb_notify(br, f, RTM_DELNEIGH); call_rcu(&f->rcu, fdb_rcu_free); } @@ -187,11 +192,11 @@ static void fdb_delete_local(struct net_bridge *br, const struct net_bridge_port *p, struct net_bridge_fdb_entry *f) { - const unsigned char *addr = f->addr.addr; + const unsigned char *addr = f->key.addr.addr; struct net_bridge_vlan_group *vg; const struct net_bridge_vlan *v; struct net_bridge_port *op; - u16 vid = f->vlan_id; + u16 vid = f->key.vlan_id; /* Maybe another port has same hw addr? */ list_for_each_entry(op, &br->port_list, list) { @@ -233,31 +238,23 @@ void br_fdb_find_delete_local(struct net_bridge *br, void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) { struct net_bridge_vlan_group *vg; + struct net_bridge_fdb_entry *f; struct net_bridge *br = p->br; struct net_bridge_vlan *v; - int i; spin_lock_bh(&br->hash_lock); - vg = nbp_vlan_group(p); - /* Search all chains since old address/hash is unknown */ - for (i = 0; i < BR_HASH_SIZE; i++) { - struct hlist_node *h; - hlist_for_each(h, &br->hash[i]) { - struct net_bridge_fdb_entry *f; - - f = hlist_entry(h, struct net_bridge_fdb_entry, hlist); - if (f->dst == p && f->is_local && !f->added_by_user) { - /* delete old one */ - fdb_delete_local(br, p, f); - - /* if this port has no vlan information - * configured, we can safely be done at - * this point. - */ - if (!vg || !vg->num_vlans) - goto insert; - } + hlist_for_each_entry(f, &br->fdb_list, fdb_node) { + if (f->dst == p && f->is_local && !f->added_by_user) { + /* delete old one */ + fdb_delete_local(br, p, f); + + /* if this port has no vlan information + * configured, we can safely be done at + * this point. + */ + if (!vg || !vg->num_vlans) + goto insert; } } @@ -316,35 +313,32 @@ void br_fdb_cleanup(struct work_struct *work) { struct net_bridge *br = container_of(work, struct net_bridge, gc_work.work); + struct net_bridge_fdb_entry *f = NULL; unsigned long delay = hold_time(br); unsigned long work_delay = delay; unsigned long now = jiffies; - int i; - for (i = 0; i < BR_HASH_SIZE; i++) { - struct net_bridge_fdb_entry *f; - struct hlist_node *n; + /* this part is tricky, in order to avoid blocking learning and + * consequently forwarding, we rely on rcu to delete objects with + * delayed freeing allowing us to continue traversing + */ + rcu_read_lock(); + hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { + unsigned long this_timer; - if (!br->hash[i].first) + if (f->is_static || f->added_by_external_learn) continue; - - spin_lock_bh(&br->hash_lock); - hlist_for_each_entry_safe(f, n, &br->hash[i], hlist) { - unsigned long this_timer; - - if (f->is_static) - continue; - if (f->added_by_external_learn) - continue; - this_timer = f->updated + delay; - if (time_after(this_timer, now)) - work_delay = min(work_delay, this_timer - now); - else + this_timer = f->updated + delay; + if (time_after(this_timer, now)) { + work_delay = min(work_delay, this_timer - now); + } else { + spin_lock_bh(&br->hash_lock); + if (!hlist_unhashed(&f->fdb_node)) fdb_delete(br, f); + spin_unlock_bh(&br->hash_lock); } - spin_unlock_bh(&br->hash_lock); - cond_resched(); } + rcu_read_unlock(); /* Cleanup minimum 10 milliseconds apart */ work_delay = max_t(unsigned long, work_delay, msecs_to_jiffies(10)); @@ -354,16 +348,13 @@ void br_fdb_cleanup(struct work_struct *work) /* Completely flush all dynamic entries in forwarding database.*/ void br_fdb_flush(struct net_bridge *br) { - int i; + struct net_bridge_fdb_entry *f; + struct hlist_node *tmp; spin_lock_bh(&br->hash_lock); - for (i = 0; i < BR_HASH_SIZE; i++) { - struct net_bridge_fdb_entry *f; - struct hlist_node *n; - hlist_for_each_entry_safe(f, n, &br->hash[i], hlist) { - if (!f->is_static) - fdb_delete(br, f); - } + hlist_for_each_entry_safe(f, tmp, &br->fdb_list, fdb_node) { + if (!f->is_static) + fdb_delete(br, f); } spin_unlock_bh(&br->hash_lock); } @@ -377,27 +368,22 @@ void br_fdb_delete_by_port(struct net_bridge *br, u16 vid, int do_all) { - int i; + struct net_bridge_fdb_entry *f; + struct hlist_node *tmp; spin_lock_bh(&br->hash_lock); - for (i = 0; i < BR_HASH_SIZE; i++) { - struct hlist_node *h, *g; + hlist_for_each_entry_safe(f, tmp, &br->fdb_list, fdb_node) { + if (f->dst != p) + continue; - hlist_for_each_safe(h, g, &br->hash[i]) { - struct net_bridge_fdb_entry *f - = hlist_entry(h, struct net_bridge_fdb_entry, hlist); - if (f->dst != p) + if (!do_all) + if (f->is_static || (vid && f->key.vlan_id != vid)) continue; - if (!do_all) - if (f->is_static || (vid && f->vlan_id != vid)) - continue; - - if (f->is_local) - fdb_delete_local(br, p, f); - else - fdb_delete(br, f); - } + if (f->is_local) + fdb_delete_local(br, p, f); + else + fdb_delete(br, f); } spin_unlock_bh(&br->hash_lock); } @@ -433,52 +419,48 @@ int br_fdb_test_addr(struct net_device *dev, unsigned char *addr) int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long maxnum, unsigned long skip) { - struct __fdb_entry *fe = buf; - int i, num = 0; struct net_bridge_fdb_entry *f; + struct __fdb_entry *fe = buf; + int num = 0; memset(buf, 0, maxnum*sizeof(struct __fdb_entry)); rcu_read_lock(); - for (i = 0; i < BR_HASH_SIZE; i++) { - hlist_for_each_entry_rcu(f, &br->hash[i], hlist) { - if (num >= maxnum) - goto out; + hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { + if (num >= maxnum) + break; - if (has_expired(br, f)) - continue; + if (has_expired(br, f)) + continue; - /* ignore pseudo entry for local MAC address */ - if (!f->dst) - continue; + /* ignore pseudo entry for local MAC address */ + if (!f->dst) + continue; - if (skip) { - --skip; - continue; - } + if (skip) { + --skip; + continue; + } - /* convert from internal format to API */ - memcpy(fe->mac_addr, f->addr.addr, ETH_ALEN); + /* convert from internal format to API */ + memcpy(fe->mac_addr, f->key.addr.addr, ETH_ALEN); - /* due to ABI compat need to split into hi/lo */ - fe->port_no = f->dst->port_no; - fe->port_hi = f->dst->port_no >> 8; + /* due to ABI compat need to split into hi/lo */ + fe->port_no = f->dst->port_no; + fe->port_hi = f->dst->port_no >> 8; - fe->is_local = f->is_local; - if (!f->is_static) - fe->ageing_timer_value = jiffies_delta_to_clock_t(jiffies - f->updated); - ++fe; - ++num; - } + fe->is_local = f->is_local; + if (!f->is_static) + fe->ageing_timer_value = jiffies_delta_to_clock_t(jiffies - f->updated); + ++fe; + ++num; } - - out: rcu_read_unlock(); return num; } -static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head, +static struct net_bridge_fdb_entry *fdb_create(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, __u16 vid, @@ -489,16 +471,23 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head, fdb = kmem_cache_alloc(br_fdb_cache, GFP_ATOMIC); if (fdb) { - memcpy(fdb->addr.addr, addr, ETH_ALEN); + memcpy(fdb->key.addr.addr, addr, ETH_ALEN); fdb->dst = source; - fdb->vlan_id = vid; + fdb->key.vlan_id = vid; fdb->is_local = is_local; fdb->is_static = is_static; fdb->added_by_user = 0; fdb->added_by_external_learn = 0; fdb->offloaded = 0; fdb->updated = fdb->used = jiffies; - hlist_add_head_rcu(&fdb->hlist, head); + if (rhashtable_lookup_insert_fast(&br->fdb_hash_tbl, + &fdb->rhnode, + br_fdb_rht_params)) { + kmem_cache_free(br_fdb_cache, fdb); + fdb = NULL; + } else { + hlist_add_head_rcu(&fdb->fdb_node, &br->fdb_list); + } } return fdb; } @@ -506,7 +495,6 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head, static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid) { - struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; struct net_bridge_fdb_entry *fdb; if (!is_valid_ether_addr(addr)) @@ -524,7 +512,7 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, fdb_delete(br, fdb); } - fdb = fdb_create(head, source, addr, vid, 1, 1); + fdb = fdb_create(br, source, addr, vid, 1, 1); if (!fdb) return -ENOMEM; @@ -548,7 +536,6 @@ int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid, bool added_by_user) { - struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; struct net_bridge_fdb_entry *fdb; bool fdb_modified = false; @@ -561,7 +548,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, source->state == BR_STATE_FORWARDING)) return; - fdb = fdb_find_rcu(head, addr, vid); + fdb = fdb_find_rcu(&br->fdb_hash_tbl, addr, vid); if (likely(fdb)) { /* attempt to update an entry for a local interface */ if (unlikely(fdb->is_local)) { @@ -590,14 +577,13 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, } } else { spin_lock(&br->hash_lock); - if (likely(!fdb_find_rcu(head, addr, vid))) { - fdb = fdb_create(head, source, addr, vid, 0, 0); - if (fdb) { - if (unlikely(added_by_user)) - fdb->added_by_user = 1; - trace_br_fdb_update(br, source, addr, vid, added_by_user); - fdb_notify(br, fdb, RTM_NEWNEIGH); - } + fdb = fdb_create(br, source, addr, vid, 0, 0); + if (fdb) { + if (unlikely(added_by_user)) + fdb->added_by_user = 1; + trace_br_fdb_update(br, source, addr, vid, + added_by_user); + fdb_notify(br, fdb, RTM_NEWNEIGH); } /* else we lose race and someone else inserts * it first, don't bother updating @@ -646,7 +632,7 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, if (fdb->added_by_external_learn) ndm->ndm_flags |= NTF_EXT_LEARNED; - if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->addr)) + if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr)) goto nla_put_failure; if (nla_put_u32(skb, NDA_MASTER, br->dev->ifindex)) goto nla_put_failure; @@ -657,7 +643,8 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; - if (fdb->vlan_id && nla_put(skb, NDA_VLAN, sizeof(u16), &fdb->vlan_id)) + if (fdb->key.vlan_id && nla_put(skb, NDA_VLAN, sizeof(u16), + &fdb->key.vlan_id)) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -711,54 +698,48 @@ int br_fdb_dump(struct sk_buff *skb, int *idx) { struct net_bridge *br = netdev_priv(dev); + struct net_bridge_fdb_entry *f; int err = 0; - int i; if (!(dev->priv_flags & IFF_EBRIDGE)) - goto out; + return err; if (!filter_dev) { err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx); if (err < 0) - goto out; + return err; } - for (i = 0; i < BR_HASH_SIZE; i++) { - struct net_bridge_fdb_entry *f; - - hlist_for_each_entry_rcu(f, &br->hash[i], hlist) { - - if (*idx < cb->args[2]) + rcu_read_lock(); + hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { + if (*idx < cb->args[2]) + goto skip; + if (filter_dev && (!f->dst || f->dst->dev != filter_dev)) { + if (filter_dev != dev) goto skip; - - if (filter_dev && - (!f->dst || f->dst->dev != filter_dev)) { - if (filter_dev != dev) - goto skip; - /* !f->dst is a special case for bridge - * It means the MAC belongs to the bridge - * Therefore need a little more filtering - * we only want to dump the !f->dst case - */ - if (f->dst) - goto skip; - } - if (!filter_dev && f->dst) + /* !f->dst is a special case for bridge + * It means the MAC belongs to the bridge + * Therefore need a little more filtering + * we only want to dump the !f->dst case + */ + if (f->dst) goto skip; - - err = fdb_fill_info(skb, br, f, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, - NLM_F_MULTI); - if (err < 0) - goto out; -skip: - *idx += 1; } + if (!filter_dev && f->dst) + goto skip; + + err = fdb_fill_info(skb, br, f, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, + NLM_F_MULTI); + if (err < 0) + break; +skip: + *idx += 1; } + rcu_read_unlock(); -out: return err; } @@ -766,7 +747,6 @@ out: static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, const __u8 *addr, __u16 state, __u16 flags, __u16 vid) { - struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; struct net_bridge_fdb_entry *fdb; bool modified = false; @@ -787,7 +767,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, if (!(flags & NLM_F_CREATE)) return -ENOENT; - fdb = fdb_create(head, source, addr, vid, 0, 0); + fdb = fdb_create(br, source, addr, vid, 0, 0); if (!fdb) return -ENOMEM; @@ -1012,65 +992,60 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p) { - struct net_bridge_fdb_entry *fdb, *tmp; - int i; - int err; + struct net_bridge_fdb_entry *f, *tmp; + int err = 0; ASSERT_RTNL(); - for (i = 0; i < BR_HASH_SIZE; i++) { - hlist_for_each_entry(fdb, &br->hash[i], hlist) { - /* We only care for static entries */ - if (!fdb->is_static) - continue; - - err = dev_uc_add(p->dev, fdb->addr.addr); - if (err) - goto rollback; - } + /* the key here is that static entries change only under rtnl */ + rcu_read_lock(); + hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { + /* We only care for static entries */ + if (!f->is_static) + continue; + err = dev_uc_add(p->dev, f->key.addr.addr); + if (err) + goto rollback; } - return 0; +done: + rcu_read_unlock(); -rollback: - for (i = 0; i < BR_HASH_SIZE; i++) { - hlist_for_each_entry(tmp, &br->hash[i], hlist) { - /* If we reached the fdb that failed, we can stop */ - if (tmp == fdb) - break; - - /* We only care for static entries */ - if (!tmp->is_static) - continue; + return err; - dev_uc_del(p->dev, tmp->addr.addr); - } +rollback: + hlist_for_each_entry_rcu(tmp, &br->fdb_list, fdb_node) { + /* We only care for static entries */ + if (!tmp->is_static) + continue; + if (tmp == f) + break; + dev_uc_del(p->dev, tmp->key.addr.addr); } - return err; + + goto done; } void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p) { - struct net_bridge_fdb_entry *fdb; - int i; + struct net_bridge_fdb_entry *f; ASSERT_RTNL(); - for (i = 0; i < BR_HASH_SIZE; i++) { - hlist_for_each_entry_rcu(fdb, &br->hash[i], hlist) { - /* We only care for static entries */ - if (!fdb->is_static) - continue; + rcu_read_lock(); + hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { + /* We only care for static entries */ + if (!f->is_static) + continue; - dev_uc_del(p->dev, fdb->addr.addr); - } + dev_uc_del(p->dev, f->key.addr.addr); } + rcu_read_unlock(); } int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid) { struct net_bridge_fdb_entry *fdb; - struct hlist_head *head; bool modified = false; int err = 0; @@ -1078,10 +1053,9 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, spin_lock_bh(&br->hash_lock); - head = &br->hash[br_mac_hash(addr, vid)]; fdb = br_fdb_find(br, addr, vid); if (!fdb) { - fdb = fdb_create(head, p, addr, vid, 0, 0); + fdb = fdb_create(br, p, addr, vid, 0, 0); if (!fdb) { err = -ENOMEM; goto err_unlock; diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index b0f4c734900b..6d9f48bd374a 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -760,9 +760,9 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, void br_mdb_init(void) { - rtnl_register(PF_BRIDGE, RTM_GETMDB, NULL, br_mdb_dump, 0); - rtnl_register(PF_BRIDGE, RTM_NEWMDB, br_mdb_add, NULL, 0); - rtnl_register(PF_BRIDGE, RTM_DELMDB, br_mdb_del, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_GETMDB, NULL, br_mdb_dump, 0); + rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_NEWMDB, br_mdb_add, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_DELMDB, br_mdb_del, NULL, 0); } void br_mdb_uninit(void) diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index c2eea1b8737a..27f1d4f2114a 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -991,7 +991,7 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net, unsigned int i; int ret; - e = rcu_dereference(net->nf.hooks[NFPROTO_BRIDGE][hook]); + e = rcu_dereference(net->nf.hooks_bridge[hook]); if (!e) return okfn(net, sk, skb); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index d0ef0a8e8831..015f465c514b 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1262,19 +1262,20 @@ static int br_dev_newlink(struct net *src_net, struct net_device *dev, struct net_bridge *br = netdev_priv(dev); int err; + err = register_netdevice(dev); + if (err) + return err; + if (tb[IFLA_ADDRESS]) { spin_lock_bh(&br->lock); br_stp_change_bridge_id(br, nla_data(tb[IFLA_ADDRESS])); spin_unlock_bh(&br->lock); } - err = register_netdevice(dev); - if (err) - return err; - err = br_changelink(dev, tb, data, extack); if (err) - unregister_netdevice(dev); + br_dev_delete(dev, NULL); + return err; } diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c index 20cbb727df4d..8e2d7cfa4e16 100644 --- a/net/bridge/br_nf_core.c +++ b/net/bridge/br_nf_core.c @@ -78,7 +78,6 @@ void br_netfilter_rtable_init(struct net_bridge *br) atomic_set(&rt->dst.__refcnt, 1); rt->dst.dev = br->dev; - rt->dst.path = &rt->dst; dst_init_metrics(&rt->dst, br_dst_default_metrics, true); rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE; rt->dst.ops = &fake_dst_ops; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 1312b8d20ec3..8e13a64d8c99 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -168,12 +168,17 @@ struct net_bridge_vlan_group { u16 pvid; }; +struct net_bridge_fdb_key { + mac_addr addr; + u16 vlan_id; +}; + struct net_bridge_fdb_entry { - struct hlist_node hlist; + struct rhash_head rhnode; struct net_bridge_port *dst; - mac_addr addr; - __u16 vlan_id; + struct net_bridge_fdb_key key; + struct hlist_node fdb_node; unsigned char is_local:1, is_static:1, added_by_user:1, @@ -315,7 +320,7 @@ struct net_bridge { struct net_bridge_vlan_group __rcu *vlgrp; #endif - struct hlist_head hash[BR_HASH_SIZE]; + struct rhashtable fdb_hash_tbl; #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) union { struct rtable fake_rtable; @@ -405,6 +410,7 @@ struct net_bridge { int offload_fwd_mark; #endif bool neigh_suppress_enabled; + struct hlist_head fdb_list; }; struct br_input_skb_cb { @@ -515,6 +521,8 @@ static inline void br_netpoll_disable(struct net_bridge_port *p) /* br_fdb.c */ int br_fdb_init(void); void br_fdb_fini(void); +int br_fdb_hash_init(struct net_bridge *br); +void br_fdb_hash_fini(struct net_bridge *br); void br_fdb_flush(struct net_bridge *br); void br_fdb_find_delete_local(struct net_bridge *br, const struct net_bridge_port *p, @@ -752,7 +760,7 @@ static inline void br_multicast_flood(struct net_bridge_mdb_entry *mdst, static inline bool br_multicast_is_router(struct net_bridge *br) { - return 0; + return false; } static inline bool br_multicast_querier_exists(struct net_bridge *br, diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 9700e0f3307b..ee775f4ff76c 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -121,13 +121,13 @@ br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type) switch (type) { case RTM_DELNEIGH: - br_switchdev_fdb_call_notifiers(false, fdb->addr.addr, - fdb->vlan_id, + br_switchdev_fdb_call_notifiers(false, fdb->key.addr.addr, + fdb->key.vlan_id, fdb->dst->dev); break; case RTM_NEWNEIGH: - br_switchdev_fdb_call_notifiers(true, fdb->addr.addr, - fdb->vlan_id, + br_switchdev_fdb_call_notifiers(true, fdb->key.addr.addr, + fdb->key.vlan_id, fdb->dst->dev); break; } diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 723f25eed8ea..b1be0dcfba6b 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -272,10 +272,7 @@ static ssize_t group_addr_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%x:%x:%x:%x:%x:%x\n", - br->group_addr[0], br->group_addr[1], - br->group_addr[2], br->group_addr[3], - br->group_addr[4], br->group_addr[5]); + return sprintf(buf, "%pM\n", br->group_addr); } static ssize_t group_addr_store(struct device *d, @@ -284,14 +281,11 @@ static ssize_t group_addr_store(struct device *d, { struct net_bridge *br = to_bridge(d); u8 new_addr[6]; - int i; if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; - if (sscanf(buf, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", - &new_addr[0], &new_addr[1], &new_addr[2], - &new_addr[3], &new_addr[4], &new_addr[5]) != 6) + if (!mac_pton(buf, new_addr)) return -EINVAL; if (!is_link_local_ether_addr(new_addr)) @@ -306,8 +300,7 @@ static ssize_t group_addr_store(struct device *d, return restart_syscall(); spin_lock_bh(&br->lock); - for (i = 0; i < 6; i++) - br->group_addr[i] = new_addr[i]; + ether_addr_copy(br->group_addr, new_addr); spin_unlock_bh(&br->lock); br->group_addr_set = true; diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index e7ef1a1ef3a6..225d1668dfdd 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -4,6 +4,7 @@ # menuconfig NF_TABLES_BRIDGE depends on BRIDGE && NETFILTER && NF_TABLES + select NETFILTER_FAMILY_BRIDGE tristate "Ethernet Bridge nf_tables support" if NF_TABLES_BRIDGE @@ -29,6 +30,7 @@ endif # NF_TABLES_BRIDGE menuconfig BRIDGE_NF_EBTABLES tristate "Ethernet Bridge tables (ebtables) support" depends on BRIDGE && NETFILTER && NETFILTER_XTABLES + select NETFILTER_FAMILY_BRIDGE help ebtables is a general, extensible frame/packet identification framework. Say 'Y' or 'M' here if you want to do Ethernet diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 37817d25b63d..02c4b409d317 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -2445,7 +2445,6 @@ static int __init ebtables_init(void) return ret; } - printk(KERN_INFO "Ebtables v2.0 registered\n"); return 0; } @@ -2453,7 +2452,6 @@ static void __exit ebtables_fini(void) { nf_unregister_sockopt(&ebt_sockopts); xt_unregister_target(&ebt_standard_target); - printk(KERN_INFO "Ebtables v2.0 unregistered\n"); } EXPORT_SYMBOL(ebt_register_table); diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c index 97afdc0744e6..5160cf614176 100644 --- a/net/bridge/netfilter/nf_tables_bridge.c +++ b/net/bridge/netfilter/nf_tables_bridge.c @@ -25,63 +25,23 @@ nft_do_chain_bridge(void *priv, { struct nft_pktinfo pkt; + nft_set_pktinfo(&pkt, skb, state); + switch (eth_hdr(skb)->h_proto) { case htons(ETH_P_IP): - nft_set_pktinfo_ipv4_validate(&pkt, skb, state); + nft_set_pktinfo_ipv4_validate(&pkt, skb); break; case htons(ETH_P_IPV6): - nft_set_pktinfo_ipv6_validate(&pkt, skb, state); + nft_set_pktinfo_ipv6_validate(&pkt, skb); break; default: - nft_set_pktinfo_unspec(&pkt, skb, state); + nft_set_pktinfo_unspec(&pkt, skb); break; } return nft_do_chain(&pkt, priv); } -static struct nft_af_info nft_af_bridge __read_mostly = { - .family = NFPROTO_BRIDGE, - .nhooks = NF_BR_NUMHOOKS, - .owner = THIS_MODULE, - .nops = 1, - .hooks = { - [NF_BR_PRE_ROUTING] = nft_do_chain_bridge, - [NF_BR_LOCAL_IN] = nft_do_chain_bridge, - [NF_BR_FORWARD] = nft_do_chain_bridge, - [NF_BR_LOCAL_OUT] = nft_do_chain_bridge, - [NF_BR_POST_ROUTING] = nft_do_chain_bridge, - }, -}; - -static int nf_tables_bridge_init_net(struct net *net) -{ - net->nft.bridge = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); - if (net->nft.bridge == NULL) - return -ENOMEM; - - memcpy(net->nft.bridge, &nft_af_bridge, sizeof(nft_af_bridge)); - - if (nft_register_afinfo(net, net->nft.bridge) < 0) - goto err; - - return 0; -err: - kfree(net->nft.bridge); - return -ENOMEM; -} - -static void nf_tables_bridge_exit_net(struct net *net) -{ - nft_unregister_afinfo(net, net->nft.bridge); - kfree(net->nft.bridge); -} - -static struct pernet_operations nf_tables_bridge_net_ops = { - .init = nf_tables_bridge_init_net, - .exit = nf_tables_bridge_exit_net, -}; - static const struct nf_chain_type filter_bridge = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, @@ -92,75 +52,23 @@ static const struct nf_chain_type filter_bridge = { (1 << NF_BR_FORWARD) | (1 << NF_BR_LOCAL_OUT) | (1 << NF_BR_POST_ROUTING), -}; - -static void nf_br_saveroute(const struct sk_buff *skb, - struct nf_queue_entry *entry) -{ -} - -static int nf_br_reroute(struct net *net, struct sk_buff *skb, - const struct nf_queue_entry *entry) -{ - return 0; -} - -static __sum16 nf_br_checksum(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, u_int8_t protocol) -{ - return 0; -} - -static __sum16 nf_br_checksum_partial(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, unsigned int len, - u_int8_t protocol) -{ - return 0; -} - -static int nf_br_route(struct net *net, struct dst_entry **dst, - struct flowi *fl, bool strict __always_unused) -{ - return 0; -} - -static const struct nf_afinfo nf_br_afinfo = { - .family = AF_BRIDGE, - .checksum = nf_br_checksum, - .checksum_partial = nf_br_checksum_partial, - .route = nf_br_route, - .saveroute = nf_br_saveroute, - .reroute = nf_br_reroute, - .route_key_size = 0, + .hooks = { + [NF_BR_PRE_ROUTING] = nft_do_chain_bridge, + [NF_BR_LOCAL_IN] = nft_do_chain_bridge, + [NF_BR_FORWARD] = nft_do_chain_bridge, + [NF_BR_LOCAL_OUT] = nft_do_chain_bridge, + [NF_BR_POST_ROUTING] = nft_do_chain_bridge, + }, }; static int __init nf_tables_bridge_init(void) { - int ret; - - nf_register_afinfo(&nf_br_afinfo); - ret = nft_register_chain_type(&filter_bridge); - if (ret < 0) - goto err1; - - ret = register_pernet_subsys(&nf_tables_bridge_net_ops); - if (ret < 0) - goto err2; - - return ret; - -err2: - nft_unregister_chain_type(&filter_bridge); -err1: - nf_unregister_afinfo(&nf_br_afinfo); - return ret; + return nft_register_chain_type(&filter_bridge); } static void __exit nf_tables_bridge_exit(void) { - unregister_pernet_subsys(&nf_tables_bridge_net_ops); nft_unregister_chain_type(&filter_bridge); - nf_unregister_afinfo(&nf_br_afinfo); } module_init(nf_tables_bridge_init); @@ -168,4 +76,4 @@ module_exit(nf_tables_bridge_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_FAMILY(AF_BRIDGE); +MODULE_ALIAS_NFT_CHAIN(AF_BRIDGE, "filter"); diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index 2d38b6e34203..e0adcd123f48 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -334,9 +334,8 @@ void caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev, mutex_lock(&caifdevs->lock); list_add_rcu(&caifd->list, &caifdevs->list); - strncpy(caifd->layer.name, dev->name, - sizeof(caifd->layer.name) - 1); - caifd->layer.name[sizeof(caifd->layer.name) - 1] = 0; + strlcpy(caifd->layer.name, dev->name, + sizeof(caifd->layer.name)); caifd->layer.transmit = transmit; cfcnfg_add_phy_layer(cfg, dev, diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 632d5a416d97..64048cec41e0 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -934,11 +934,11 @@ static int caif_release(struct socket *sock) } /* Copied from af_unix.c:unix_poll(), added CAIF tx_flow handling */ -static unsigned int caif_poll(struct file *file, +static __poll_t caif_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; - unsigned int mask; + __poll_t mask; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); sock_poll_wait(file, sk_sleep(sk), wait); diff --git a/net/caif/caif_usb.c b/net/caif/caif_usb.c index 5cd44f001f64..1a082a946045 100644 --- a/net/caif/caif_usb.c +++ b/net/caif/caif_usb.c @@ -176,9 +176,7 @@ static int cfusbl_device_notify(struct notifier_block *me, unsigned long what, dev_add_pack(&caif_usb_type); pack_added = true; - strncpy(layer->name, dev->name, - sizeof(layer->name) - 1); - layer->name[sizeof(layer->name) - 1] = 0; + strlcpy(layer->name, dev->name, sizeof(layer->name)); return 0; } diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c index 273cb07f57d8..8f00bea093b9 100644 --- a/net/caif/cfcnfg.c +++ b/net/caif/cfcnfg.c @@ -268,17 +268,15 @@ static int caif_connect_req_to_link_param(struct cfcnfg *cnfg, case CAIFPROTO_RFM: l->linktype = CFCTRL_SRV_RFM; l->u.datagram.connid = s->sockaddr.u.rfm.connection_id; - strncpy(l->u.rfm.volume, s->sockaddr.u.rfm.volume, - sizeof(l->u.rfm.volume)-1); - l->u.rfm.volume[sizeof(l->u.rfm.volume)-1] = 0; + strlcpy(l->u.rfm.volume, s->sockaddr.u.rfm.volume, + sizeof(l->u.rfm.volume)); break; case CAIFPROTO_UTIL: l->linktype = CFCTRL_SRV_UTIL; l->endpoint = 0x00; l->chtype = 0x00; - strncpy(l->u.utility.name, s->sockaddr.u.util.service, - sizeof(l->u.utility.name)-1); - l->u.utility.name[sizeof(l->u.utility.name)-1] = 0; + strlcpy(l->u.utility.name, s->sockaddr.u.util.service, + sizeof(l->u.utility.name)); caif_assert(sizeof(l->u.utility.name) > 10); l->u.utility.paramlen = s->param.size; if (l->u.utility.paramlen > sizeof(l->u.utility.params)) diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c index f5afda1abc76..a1e85f032108 100644 --- a/net/caif/cfctrl.c +++ b/net/caif/cfctrl.c @@ -258,8 +258,8 @@ int cfctrl_linkup_request(struct cflayer *layer, tmp16 = cpu_to_le16(param->u.utility.fifosize_bufs); cfpkt_add_body(pkt, &tmp16, 2); memset(utility_name, 0, sizeof(utility_name)); - strncpy(utility_name, param->u.utility.name, - UTILITY_NAME_LENGTH - 1); + strlcpy(utility_name, param->u.utility.name, + UTILITY_NAME_LENGTH); cfpkt_add_body(pkt, utility_name, UTILITY_NAME_LENGTH); tmp8 = param->u.utility.paramlen; cfpkt_add_body(pkt, &tmp8, 1); @@ -352,15 +352,14 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt) u8 cmdrsp; u8 cmd; int ret = -1; - u16 tmp16; u8 len; u8 param[255]; - u8 linkid; + u8 linkid = 0; struct cfctrl *cfctrl = container_obj(layer); struct cfctrl_request_info rsp, *req; - cfpkt_extr_head(pkt, &cmdrsp, 1); + cmdrsp = cfpkt_extr_head_u8(pkt); cmd = cmdrsp & CFCTRL_CMD_MASK; if (cmd != CFCTRL_CMD_LINK_ERR && CFCTRL_RSP_BIT != (CFCTRL_RSP_BIT & cmdrsp) @@ -378,13 +377,12 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt) u8 physlinkid; u8 prio; u8 tmp; - u32 tmp32; u8 *cp; int i; struct cfctrl_link_param linkparam; memset(&linkparam, 0, sizeof(linkparam)); - cfpkt_extr_head(pkt, &tmp, 1); + tmp = cfpkt_extr_head_u8(pkt); serv = tmp & CFCTRL_SRV_MASK; linkparam.linktype = serv; @@ -392,13 +390,13 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt) servtype = tmp >> 4; linkparam.chtype = servtype; - cfpkt_extr_head(pkt, &tmp, 1); + tmp = cfpkt_extr_head_u8(pkt); physlinkid = tmp & 0x07; prio = tmp >> 3; linkparam.priority = prio; linkparam.phyid = physlinkid; - cfpkt_extr_head(pkt, &endpoint, 1); + endpoint = cfpkt_extr_head_u8(pkt); linkparam.endpoint = endpoint & 0x03; switch (serv) { @@ -407,45 +405,43 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt) if (CFCTRL_ERR_BIT & cmdrsp) break; /* Link ID */ - cfpkt_extr_head(pkt, &linkid, 1); + linkid = cfpkt_extr_head_u8(pkt); break; case CFCTRL_SRV_VIDEO: - cfpkt_extr_head(pkt, &tmp, 1); + tmp = cfpkt_extr_head_u8(pkt); linkparam.u.video.connid = tmp; if (CFCTRL_ERR_BIT & cmdrsp) break; /* Link ID */ - cfpkt_extr_head(pkt, &linkid, 1); + linkid = cfpkt_extr_head_u8(pkt); break; case CFCTRL_SRV_DATAGRAM: - cfpkt_extr_head(pkt, &tmp32, 4); linkparam.u.datagram.connid = - le32_to_cpu(tmp32); + cfpkt_extr_head_u32(pkt); if (CFCTRL_ERR_BIT & cmdrsp) break; /* Link ID */ - cfpkt_extr_head(pkt, &linkid, 1); + linkid = cfpkt_extr_head_u8(pkt); break; case CFCTRL_SRV_RFM: /* Construct a frame, convert * DatagramConnectionID * to network format long and copy it out... */ - cfpkt_extr_head(pkt, &tmp32, 4); linkparam.u.rfm.connid = - le32_to_cpu(tmp32); + cfpkt_extr_head_u32(pkt); cp = (u8 *) linkparam.u.rfm.volume; - for (cfpkt_extr_head(pkt, &tmp, 1); + for (tmp = cfpkt_extr_head_u8(pkt); cfpkt_more(pkt) && tmp != '\0'; - cfpkt_extr_head(pkt, &tmp, 1)) + tmp = cfpkt_extr_head_u8(pkt)) *cp++ = tmp; *cp = '\0'; if (CFCTRL_ERR_BIT & cmdrsp) break; /* Link ID */ - cfpkt_extr_head(pkt, &linkid, 1); + linkid = cfpkt_extr_head_u8(pkt); break; case CFCTRL_SRV_UTIL: @@ -454,13 +450,11 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt) * to network format long and copy it out... */ /* Fifosize KB */ - cfpkt_extr_head(pkt, &tmp16, 2); linkparam.u.utility.fifosize_kb = - le16_to_cpu(tmp16); + cfpkt_extr_head_u16(pkt); /* Fifosize bufs */ - cfpkt_extr_head(pkt, &tmp16, 2); linkparam.u.utility.fifosize_bufs = - le16_to_cpu(tmp16); + cfpkt_extr_head_u16(pkt); /* name */ cp = (u8 *) linkparam.u.utility.name; caif_assert(sizeof(linkparam.u.utility.name) @@ -468,24 +462,24 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt) for (i = 0; i < UTILITY_NAME_LENGTH && cfpkt_more(pkt); i++) { - cfpkt_extr_head(pkt, &tmp, 1); + tmp = cfpkt_extr_head_u8(pkt); *cp++ = tmp; } /* Length */ - cfpkt_extr_head(pkt, &len, 1); + len = cfpkt_extr_head_u8(pkt); linkparam.u.utility.paramlen = len; /* Param Data */ cp = linkparam.u.utility.params; while (cfpkt_more(pkt) && len--) { - cfpkt_extr_head(pkt, &tmp, 1); + tmp = cfpkt_extr_head_u8(pkt); *cp++ = tmp; } if (CFCTRL_ERR_BIT & cmdrsp) break; /* Link ID */ - cfpkt_extr_head(pkt, &linkid, 1); + linkid = cfpkt_extr_head_u8(pkt); /* Length */ - cfpkt_extr_head(pkt, &len, 1); + len = cfpkt_extr_head_u8(pkt); /* Param Data */ cfpkt_extr_head(pkt, ¶m, len); break; @@ -522,7 +516,7 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt) } break; case CFCTRL_CMD_LINK_DESTROY: - cfpkt_extr_head(pkt, &linkid, 1); + linkid = cfpkt_extr_head_u8(pkt); cfctrl->res.linkdestroy_rsp(cfctrl->serv.layer.up, linkid); break; case CFCTRL_CMD_LINK_ERR: diff --git a/net/caif/cfpkt_skbuff.c b/net/caif/cfpkt_skbuff.c index 71b6ab240dea..38c2b7a890dd 100644 --- a/net/caif/cfpkt_skbuff.c +++ b/net/caif/cfpkt_skbuff.c @@ -8,7 +8,6 @@ #include <linux/string.h> #include <linux/skbuff.h> -#include <linux/hardirq.h> #include <linux/export.h> #include <net/caif/cfpkt.h> diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index 922ac1d605b3..53ecda10b790 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -8,7 +8,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/fs.h> -#include <linux/hardirq.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netdevice.h> diff --git a/net/can/Kconfig b/net/can/Kconfig index a15c0e0d1fc7..a4399be54ff4 100644 --- a/net/can/Kconfig +++ b/net/can/Kconfig @@ -11,7 +11,7 @@ menuconfig CAN 1991, mainly for automotive, but now widely used in marine (NMEA2000), industrial, and medical applications. More information on the CAN network protocol family PF_CAN - is contained in <Documentation/networking/can.txt>. + is contained in <Documentation/networking/can.rst>. If you want CAN support you should say Y here and also to the specific driver for your controller(s) below. diff --git a/net/can/af_can.c b/net/can/af_can.c index 003b2d6d655f..6da324550eec 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -321,13 +321,13 @@ EXPORT_SYMBOL(can_send); * af_can rx path */ -static struct dev_rcv_lists *find_dev_rcv_lists(struct net *net, +static struct can_dev_rcv_lists *find_dev_rcv_lists(struct net *net, struct net_device *dev) { if (!dev) return net->can.can_rx_alldev_list; else - return (struct dev_rcv_lists *)dev->ml_priv; + return (struct can_dev_rcv_lists *)dev->ml_priv; } /** @@ -381,7 +381,7 @@ static unsigned int effhash(canid_t can_id) * Reduced can_id to have a preprocessed filter compare value. */ static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask, - struct dev_rcv_lists *d) + struct can_dev_rcv_lists *d) { canid_t inv = *can_id & CAN_INV_FILTER; /* save flag before masking */ @@ -464,7 +464,7 @@ int can_rx_register(struct net *net, struct net_device *dev, canid_t can_id, { struct receiver *r; struct hlist_head *rl; - struct dev_rcv_lists *d; + struct can_dev_rcv_lists *d; struct s_pstats *can_pstats = net->can.can_pstats; int err = 0; @@ -542,7 +542,7 @@ void can_rx_unregister(struct net *net, struct net_device *dev, canid_t can_id, struct receiver *r = NULL; struct hlist_head *rl; struct s_pstats *can_pstats = net->can.can_pstats; - struct dev_rcv_lists *d; + struct can_dev_rcv_lists *d; if (dev && dev->type != ARPHRD_CAN) return; @@ -615,7 +615,7 @@ static inline void deliver(struct sk_buff *skb, struct receiver *r) r->matches++; } -static int can_rcv_filter(struct dev_rcv_lists *d, struct sk_buff *skb) +static int can_rcv_filter(struct can_dev_rcv_lists *d, struct sk_buff *skb) { struct receiver *r; int matches = 0; @@ -682,7 +682,7 @@ static int can_rcv_filter(struct dev_rcv_lists *d, struct sk_buff *skb) static void can_receive(struct sk_buff *skb, struct net_device *dev) { - struct dev_rcv_lists *d; + struct can_dev_rcv_lists *d; struct net *net = dev_net(dev); struct s_stats *can_stats = net->can.can_stats; int matches; @@ -721,20 +721,16 @@ static int can_rcv(struct sk_buff *skb, struct net_device *dev, { struct canfd_frame *cfd = (struct canfd_frame *)skb->data; - if (WARN_ONCE(dev->type != ARPHRD_CAN || - skb->len != CAN_MTU || - cfd->len > CAN_MAX_DLEN, - "PF_CAN: dropped non conform CAN skbuf: " - "dev type %d, len %d, datalen %d\n", - dev->type, skb->len, cfd->len)) - goto drop; + if (unlikely(dev->type != ARPHRD_CAN || skb->len != CAN_MTU || + cfd->len > CAN_MAX_DLEN)) { + pr_warn_once("PF_CAN: dropped non conform CAN skbuf: dev type %d, len %d, datalen %d\n", + dev->type, skb->len, cfd->len); + kfree_skb(skb); + return NET_RX_DROP; + } can_receive(skb, dev); return NET_RX_SUCCESS; - -drop: - kfree_skb(skb); - return NET_RX_DROP; } static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, @@ -742,20 +738,16 @@ static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, { struct canfd_frame *cfd = (struct canfd_frame *)skb->data; - if (WARN_ONCE(dev->type != ARPHRD_CAN || - skb->len != CANFD_MTU || - cfd->len > CANFD_MAX_DLEN, - "PF_CAN: dropped non conform CAN FD skbuf: " - "dev type %d, len %d, datalen %d\n", - dev->type, skb->len, cfd->len)) - goto drop; + if (unlikely(dev->type != ARPHRD_CAN || skb->len != CANFD_MTU || + cfd->len > CANFD_MAX_DLEN)) { + pr_warn_once("PF_CAN: dropped non conform CAN FD skbuf: dev type %d, len %d, datalen %d\n", + dev->type, skb->len, cfd->len); + kfree_skb(skb); + return NET_RX_DROP; + } can_receive(skb, dev); return NET_RX_SUCCESS; - -drop: - kfree_skb(skb); - return NET_RX_DROP; } /* @@ -829,7 +821,7 @@ static int can_notifier(struct notifier_block *nb, unsigned long msg, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct dev_rcv_lists *d; + struct can_dev_rcv_lists *d; if (dev->type != ARPHRD_CAN) return NOTIFY_DONE; @@ -874,7 +866,7 @@ static int can_pernet_init(struct net *net) { spin_lock_init(&net->can.can_rcvlists_lock); net->can.can_rx_alldev_list = - kzalloc(sizeof(struct dev_rcv_lists), GFP_KERNEL); + kzalloc(sizeof(struct can_dev_rcv_lists), GFP_KERNEL); if (!net->can.can_rx_alldev_list) goto out; net->can.can_stats = kzalloc(sizeof(struct s_stats), GFP_KERNEL); @@ -920,7 +912,7 @@ static void can_pernet_exit(struct net *net) rcu_read_lock(); for_each_netdev_rcu(net, dev) { if (dev->type == ARPHRD_CAN && dev->ml_priv) { - struct dev_rcv_lists *d = dev->ml_priv; + struct can_dev_rcv_lists *d = dev->ml_priv; BUG_ON(d->entries); kfree(d); diff --git a/net/can/af_can.h b/net/can/af_can.h index eca6463c6213..9cb3719632bd 100644 --- a/net/can/af_can.h +++ b/net/can/af_can.h @@ -67,7 +67,7 @@ struct receiver { enum { RX_ERR, RX_ALL, RX_FIL, RX_INV, RX_MAX }; /* per device receive filters linked at dev->ml_priv */ -struct dev_rcv_lists { +struct can_dev_rcv_lists { struct hlist_head rx[RX_MAX]; struct hlist_head rx_sff[CAN_SFF_RCV_ARRAY_SZ]; struct hlist_head rx_eff[CAN_EFF_RCV_ARRAY_SZ]; diff --git a/net/can/bcm.c b/net/can/bcm.c index 13690334efa3..ac5e5e34fee3 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -246,7 +246,6 @@ static int bcm_proc_open(struct inode *inode, struct file *file) } static const struct file_operations bcm_proc_fops = { - .owner = THIS_MODULE, .open = bcm_proc_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/can/gw.c b/net/can/gw.c index 73a02af4b5d7..398dd0395ad9 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -1014,6 +1014,8 @@ static struct pernet_operations cangw_pernet_ops = { static __init int cgw_module_init(void) { + int ret; + /* sanitize given module parameter */ max_hops = clamp_t(unsigned int, max_hops, CGW_MIN_HOPS, CGW_MAX_HOPS); @@ -1031,15 +1033,19 @@ static __init int cgw_module_init(void) notifier.notifier_call = cgw_notifier; register_netdevice_notifier(¬ifier); - if (__rtnl_register(PF_CAN, RTM_GETROUTE, NULL, cgw_dump_jobs, 0)) { + ret = rtnl_register_module(THIS_MODULE, PF_CAN, RTM_GETROUTE, + NULL, cgw_dump_jobs, 0); + if (ret) { unregister_netdevice_notifier(¬ifier); kmem_cache_destroy(cgw_cache); return -ENOBUFS; } - /* Only the first call to __rtnl_register can fail */ - __rtnl_register(PF_CAN, RTM_NEWROUTE, cgw_create_job, NULL, 0); - __rtnl_register(PF_CAN, RTM_DELROUTE, cgw_remove_job, NULL, 0); + /* Only the first call to rtnl_register_module can fail */ + rtnl_register_module(THIS_MODULE, PF_CAN, RTM_NEWROUTE, + cgw_create_job, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_CAN, RTM_DELROUTE, + cgw_remove_job, NULL, 0); return 0; } diff --git a/net/can/proc.c b/net/can/proc.c index 0c59f876fe6f..fdf704e9bb8c 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -276,7 +276,6 @@ static int can_stats_proc_open(struct inode *inode, struct file *file) } static const struct file_operations can_stats_proc_fops = { - .owner = THIS_MODULE, .open = can_stats_proc_open, .read = seq_read, .llseek = seq_lseek, @@ -310,7 +309,6 @@ static int can_reset_stats_proc_open(struct inode *inode, struct file *file) } static const struct file_operations can_reset_stats_proc_fops = { - .owner = THIS_MODULE, .open = can_reset_stats_proc_open, .read = seq_read, .llseek = seq_lseek, @@ -329,7 +327,6 @@ static int can_version_proc_open(struct inode *inode, struct file *file) } static const struct file_operations can_version_proc_fops = { - .owner = THIS_MODULE, .open = can_version_proc_open, .read = seq_read, .llseek = seq_lseek, @@ -338,7 +335,7 @@ static const struct file_operations can_version_proc_fops = { static inline void can_rcvlist_proc_show_one(struct seq_file *m, int idx, struct net_device *dev, - struct dev_rcv_lists *d) + struct can_dev_rcv_lists *d) { if (!hlist_empty(&d->rx[idx])) { can_print_recv_banner(m); @@ -353,7 +350,7 @@ static int can_rcvlist_proc_show(struct seq_file *m, void *v) /* double cast to prevent GCC warning */ int idx = (int)(long)PDE_DATA(m->file->f_inode); struct net_device *dev; - struct dev_rcv_lists *d; + struct can_dev_rcv_lists *d; struct net *net = m->private; seq_printf(m, "\nreceive list '%s':\n", rx_list_name[idx]); @@ -382,7 +379,6 @@ static int can_rcvlist_proc_open(struct inode *inode, struct file *file) } static const struct file_operations can_rcvlist_proc_fops = { - .owner = THIS_MODULE, .open = can_rcvlist_proc_open, .read = seq_read, .llseek = seq_lseek, @@ -417,7 +413,7 @@ static inline void can_rcvlist_proc_show_array(struct seq_file *m, static int can_rcvlist_sff_proc_show(struct seq_file *m, void *v) { struct net_device *dev; - struct dev_rcv_lists *d; + struct can_dev_rcv_lists *d; struct net *net = m->private; /* RX_SFF */ @@ -450,7 +446,6 @@ static int can_rcvlist_sff_proc_open(struct inode *inode, struct file *file) } static const struct file_operations can_rcvlist_sff_proc_fops = { - .owner = THIS_MODULE, .open = can_rcvlist_sff_proc_open, .read = seq_read, .llseek = seq_lseek, @@ -461,7 +456,7 @@ static const struct file_operations can_rcvlist_sff_proc_fops = { static int can_rcvlist_eff_proc_show(struct seq_file *m, void *v) { struct net_device *dev; - struct dev_rcv_lists *d; + struct can_dev_rcv_lists *d; struct net *net = m->private; /* RX_EFF */ @@ -494,7 +489,6 @@ static int can_rcvlist_eff_proc_open(struct inode *inode, struct file *file) } static const struct file_operations can_rcvlist_eff_proc_fops = { - .owner = THIS_MODULE, .open = can_rcvlist_eff_proc_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/can/raw.c b/net/can/raw.c index 864c80dbdb72..f2ecc43376a1 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -401,6 +401,8 @@ static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len) if (len < sizeof(*addr)) return -EINVAL; + if (addr->can_family != AF_CAN) + return -EINVAL; lock_sock(sk); diff --git a/net/core/Makefile b/net/core/Makefile index 1fd0a9c88b1b..6dbbba8c57ae 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -11,7 +11,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \ - fib_notifier.o + fib_notifier.o xdp.o obj-y += net-sysfs.o obj-$(CONFIG_PROC_FS) += net-procfs.o diff --git a/net/core/datagram.c b/net/core/datagram.c index 522873ed120b..b7d9293940b5 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -72,12 +72,10 @@ static inline int connection_based(struct sock *sk) static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) { - unsigned long bits = (unsigned long)key; - /* * Avoid a wakeup if event not interesting for us */ - if (bits && !(bits & (POLLIN | POLLERR))) + if (key && !(key_to_poll(key) & (POLLIN | POLLERR))) return 0; return autoremove_wake_function(wait, mode, sync, key); } @@ -833,11 +831,11 @@ EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg); * and you use a different write policy from sock_writeable() * then please supply your own write_space callback. */ -unsigned int datagram_poll(struct file *file, struct socket *sock, +__poll_t datagram_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; - unsigned int mask; + __poll_t mask; sock_poll_wait(file, sk_sleep(sk), wait); mask = 0; diff --git a/net/core/dev.c b/net/core/dev.c index 07ed21d64f92..dda9d7b9a840 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1106,7 +1106,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) * when the name is long and there isn't enough space left * for the digits, or if all bits are used. */ - return p ? -ENFILE : -EEXIST; + return -ENFILE; } static int dev_alloc_name_ns(struct net *net, @@ -1146,7 +1146,19 @@ EXPORT_SYMBOL(dev_alloc_name); int dev_get_valid_name(struct net *net, struct net_device *dev, const char *name) { - return dev_alloc_name_ns(net, dev, name); + BUG_ON(!net); + + if (!dev_valid_name(name)) + return -EINVAL; + + if (strchr(name, '%')) + return dev_alloc_name_ns(net, dev, name); + else if (__dev_get_by_name(net, name)) + return -EEXIST; + else if (dev->name != name) + strlcpy(dev->name, name, IFNAMSIZ); + + return 0; } EXPORT_SYMBOL(dev_get_valid_name); @@ -1542,6 +1554,23 @@ void dev_disable_lro(struct net_device *dev) } EXPORT_SYMBOL(dev_disable_lro); +/** + * dev_disable_gro_hw - disable HW Generic Receive Offload on a device + * @dev: device + * + * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be + * called under RTNL. This is needed if Generic XDP is installed on + * the device. + */ +static void dev_disable_gro_hw(struct net_device *dev) +{ + dev->wanted_features &= ~NETIF_F_GRO_HW; + netdev_update_features(dev); + + if (unlikely(dev->features & NETIF_F_GRO_HW)) + netdev_WARN(dev, "failed to disable GRO_HW!\n"); +} + static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, struct net_device *dev) { @@ -1665,7 +1694,6 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); /** * call_netdevice_notifiers_info - call all network notifier blocks * @val: value passed unmodified to notifier function - * @dev: net_device pointer passed unmodified to notifier function * @info: notifier information data * * Call all network notifier blocks. Parameters and return value @@ -2803,7 +2831,7 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, segs = skb_mac_gso_segment(skb, features); - if (unlikely(skb_needs_check(skb, tx_path))) + if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) skb_warn_bad_offload(skb); return segs; @@ -3042,7 +3070,7 @@ int skb_csum_hwoffload_help(struct sk_buff *skb, } EXPORT_SYMBOL(skb_csum_hwoffload_help); -static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) +static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again) { netdev_features_t features; @@ -3066,9 +3094,6 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device __skb_linearize(skb)) goto out_kfree_skb; - if (validate_xmit_xfrm(skb, features)) - goto out_kfree_skb; - /* If packet is not checksummed and device does not * support checksumming for this protocol, complete * checksumming here. @@ -3085,6 +3110,8 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device } } + skb = validate_xmit_xfrm(skb, features, again); + return skb; out_kfree_skb: @@ -3094,7 +3121,7 @@ out_null: return NULL; } -struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev) +struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again) { struct sk_buff *next, *head = NULL, *tail; @@ -3105,7 +3132,7 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d /* in case skb wont be segmented, point to itself */ skb->prev = skb; - skb = validate_xmit_skb(skb, dev); + skb = validate_xmit_skb(skb, dev, again); if (!skb) continue; @@ -3139,10 +3166,21 @@ static void qdisc_pkt_len_init(struct sk_buff *skb) hdr_len = skb_transport_header(skb) - skb_mac_header(skb); /* + transport layer */ - if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) - hdr_len += tcp_hdrlen(skb); - else - hdr_len += sizeof(struct udphdr); + if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { + const struct tcphdr *th; + struct tcphdr _tcphdr; + + th = skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_tcphdr), &_tcphdr); + if (likely(th)) + hdr_len += __tcp_hdrlen(th); + } else { + struct udphdr _udphdr; + + if (skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_udphdr), &_udphdr)) + hdr_len += sizeof(struct udphdr); + } if (shinfo->gso_type & SKB_GSO_DODGY) gso_segs = DIV_ROUND_UP(skb->len - hdr_len, @@ -3162,6 +3200,21 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, int rc; qdisc_calculate_pkt_len(skb, q); + + if (q->flags & TCQ_F_NOLOCK) { + if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { + __qdisc_drop(skb, &to_free); + rc = NET_XMIT_DROP; + } else { + rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; + __qdisc_run(q); + } + + if (unlikely(to_free)) + kfree_skb_list(to_free); + return rc; + } + /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. @@ -3192,9 +3245,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, contended = false; } __qdisc_run(q); - } else - qdisc_run_end(q); + } + qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; @@ -3204,6 +3257,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, contended = false; } __qdisc_run(q); + qdisc_run_end(q); } } spin_unlock(root_lock); @@ -3376,8 +3430,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, else queue_index = __netdev_pick_tx(dev, skb); - if (!accel_priv) - queue_index = netdev_cap_txqueue(dev, queue_index); + queue_index = netdev_cap_txqueue(dev, queue_index); } skb_set_queue_mapping(skb, queue_index); @@ -3416,6 +3469,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) struct netdev_queue *txq; struct Qdisc *q; int rc = -ENOMEM; + bool again = false; skb_reset_mac_header(skb); @@ -3477,7 +3531,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) XMIT_RECURSION_LIMIT)) goto recursion_alert; - skb = validate_xmit_skb(skb, dev); + skb = validate_xmit_skb(skb, dev, &again); if (!skb) goto out; @@ -3873,9 +3927,33 @@ drop: return NET_RX_DROP; } +static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct netdev_rx_queue *rxqueue; + + rxqueue = dev->_rx; + + if (skb_rx_queue_recorded(skb)) { + u16 index = skb_get_rx_queue(skb); + + if (unlikely(index >= dev->real_num_rx_queues)) { + WARN_ONCE(dev->real_num_rx_queues > 1, + "%s received packet on queue %u, but number " + "of RX queues is %u\n", + dev->name, index, dev->real_num_rx_queues); + + return rxqueue; /* Return first rxqueue */ + } + rxqueue += index; + } + return rxqueue; +} + static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct bpf_prog *xdp_prog) { + struct netdev_rx_queue *rxqueue; u32 metalen, act = XDP_DROP; struct xdp_buff xdp; void *orig_data; @@ -3904,7 +3982,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) goto do_drop; - if (troom > 0 && __skb_linearize(skb)) + if (skb_linearize(skb)) goto do_drop; } @@ -3919,6 +3997,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, xdp.data_hard_start = skb->data - skb_headroom(skb); orig_data = xdp.data; + rxqueue = netif_get_rxqueue(skb); + xdp.rxq = &rxqueue->xdp_rxq; + act = bpf_prog_run_xdp(xdp_prog, &xdp); off = xdp.data - orig_data; @@ -4143,21 +4224,26 @@ static __latent_entropy void net_tx_action(struct softirq_action *h) while (head) { struct Qdisc *q = head; - spinlock_t *root_lock; + spinlock_t *root_lock = NULL; head = head->next_sched; - root_lock = qdisc_lock(q); - spin_lock(root_lock); + if (!(q->flags & TCQ_F_NOLOCK)) { + root_lock = qdisc_lock(q); + spin_lock(root_lock); + } /* We need to make sure head->next_sched is read * before clearing __QDISC_STATE_SCHED */ smp_mb__before_atomic(); clear_bit(__QDISC_STATE_SCHED, &q->state); qdisc_run(q); - spin_unlock(root_lock); + if (root_lock) + spin_unlock(root_lock); } } + + xfrm_dev_backlog(sd); } #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE) @@ -4545,6 +4631,7 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) } else if (new && !old) { static_key_slow_inc(&generic_xdp_needed); dev_disable_lro(dev); + dev_disable_gro_hw(dev); } break; @@ -6348,6 +6435,7 @@ rollback: * netdev_upper_dev_link - Add a link to the upper device * @dev: device * @upper_dev: new upper device + * @extack: netlink extended ack * * Adds a link to device which is upper to this one. The caller must hold * the RTNL lock. On a failure a negative errno code is returned. @@ -6369,6 +6457,7 @@ EXPORT_SYMBOL(netdev_upper_dev_link); * @upper_dev: new upper device * @upper_priv: upper device private * @upper_info: upper info to be passed down via notifier + * @extack: netlink extended ack * * Adds a link to device which is upper to this one. In this case, only * one master upper device can be linked, although other non-master devices @@ -6959,6 +7048,35 @@ int dev_set_mtu(struct net_device *dev, int new_mtu) EXPORT_SYMBOL(dev_set_mtu); /** + * dev_change_tx_queue_len - Change TX queue length of a netdevice + * @dev: device + * @new_len: new tx queue length + */ +int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) +{ + unsigned int orig_len = dev->tx_queue_len; + int res; + + if (new_len != (unsigned int)new_len) + return -ERANGE; + + if (new_len != orig_len) { + dev->tx_queue_len = new_len; + res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev); + res = notifier_to_errno(res); + if (res) { + netdev_err(dev, + "refused to change device tx_queue_len\n"); + dev->tx_queue_len = orig_len; + return res; + } + return dev_qdisc_change_tx_queue_len(dev); + } + + return 0; +} + +/** * dev_set_group - Change group this device belongs to * @dev: device * @new_group: group this device should belong to @@ -7073,17 +7191,21 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down); -u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op, u32 *prog_id) +void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, + struct netdev_bpf *xdp) { - struct netdev_bpf xdp; - - memset(&xdp, 0, sizeof(xdp)); - xdp.command = XDP_QUERY_PROG; + memset(xdp, 0, sizeof(*xdp)); + xdp->command = XDP_QUERY_PROG; /* Query must always succeed. */ - WARN_ON(bpf_op(dev, &xdp) < 0); - if (prog_id) - *prog_id = xdp.prog_id; + WARN_ON(bpf_op(dev, xdp) < 0); +} + +static u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op) +{ + struct netdev_bpf xdp; + + __dev_xdp_query(dev, bpf_op, &xdp); return xdp.prog_attached; } @@ -7106,6 +7228,27 @@ static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, return bpf_op(dev, &xdp); } +static void dev_xdp_uninstall(struct net_device *dev) +{ + struct netdev_bpf xdp; + bpf_op_t ndo_bpf; + + /* Remove generic XDP */ + WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL)); + + /* Remove from the driver */ + ndo_bpf = dev->netdev_ops->ndo_bpf; + if (!ndo_bpf) + return; + + __dev_xdp_query(dev, ndo_bpf, &xdp); + if (xdp.prog_attached == XDP_ATTACHED_NONE) + return; + + /* Program removal should always succeed */ + WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, NULL)); +} + /** * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device @@ -7134,10 +7277,10 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, bpf_chk = generic_xdp_install; if (fd >= 0) { - if (bpf_chk && __dev_xdp_attached(dev, bpf_chk, NULL)) + if (bpf_chk && __dev_xdp_attached(dev, bpf_chk)) return -EEXIST; if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && - __dev_xdp_attached(dev, bpf_op, NULL)) + __dev_xdp_attached(dev, bpf_op)) return -EBUSY; prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, @@ -7236,6 +7379,7 @@ static void rollback_registered_many(struct list_head *head) /* Shutdown queueing discipline. */ dev_shutdown(dev); + dev_xdp_uninstall(dev); /* Notify protocols, that we are about to destroy * this device. They should clean all the things. @@ -7245,7 +7389,7 @@ static void rollback_registered_many(struct list_head *head) if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, - GFP_KERNEL, NULL); + GFP_KERNEL, NULL, 0); /* * Flush the unicast and multicast chains @@ -7379,6 +7523,18 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, features &= ~dev->gso_partial_features; } + if (!(features & NETIF_F_RXCSUM)) { + /* NETIF_F_GRO_HW implies doing RXCSUM since every packet + * successfully merged by hardware must also have the + * checksum verified by hardware. If the user does not + * want to enable RXCSUM, logically, we should disable GRO_HW. + */ + if (features & NETIF_F_GRO_HW) { + netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n"); + features &= ~NETIF_F_GRO_HW; + } + } + return features; } @@ -7512,12 +7668,12 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev, } EXPORT_SYMBOL(netif_stacked_transfer_operstate); -#ifdef CONFIG_SYSFS static int netif_alloc_rx_queues(struct net_device *dev) { unsigned int i, count = dev->num_rx_queues; struct netdev_rx_queue *rx; size_t sz = count * sizeof(*rx); + int err = 0; BUG_ON(count < 1); @@ -7527,11 +7683,38 @@ static int netif_alloc_rx_queues(struct net_device *dev) dev->_rx = rx; - for (i = 0; i < count; i++) + for (i = 0; i < count; i++) { rx[i].dev = dev; + + /* XDP RX-queue setup */ + err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i); + if (err < 0) + goto err_rxq_info; + } return 0; + +err_rxq_info: + /* Rollback successful reg's and free other resources */ + while (i--) + xdp_rxq_info_unreg(&rx[i].xdp_rxq); + kvfree(dev->_rx); + dev->_rx = NULL; + return err; +} + +static void netif_free_rx_queues(struct net_device *dev) +{ + unsigned int i, count = dev->num_rx_queues; + + /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */ + if (!dev->_rx) + return; + + for (i = 0; i < count; i++) + xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq); + + kvfree(dev->_rx); } -#endif static void netdev_init_one_queue(struct net_device *dev, struct netdev_queue *queue, void *_unused) @@ -8092,12 +8275,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, return NULL; } -#ifdef CONFIG_SYSFS if (rxqs < 1) { pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); return NULL; } -#endif alloc_size = sizeof(struct net_device); if (sizeof_priv) { @@ -8154,12 +8335,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, if (netif_alloc_netdev_queues(dev)) goto free_all; -#ifdef CONFIG_SYSFS dev->num_rx_queues = rxqs; dev->real_num_rx_queues = rxqs; if (netif_alloc_rx_queues(dev)) goto free_all; -#endif strcpy(dev->name, name); dev->name_assign_type = name_assign_type; @@ -8195,13 +8374,10 @@ EXPORT_SYMBOL(alloc_netdev_mqs); void free_netdev(struct net_device *dev) { struct napi_struct *p, *n; - struct bpf_prog *prog; might_sleep(); netif_free_tx_queues(dev); -#ifdef CONFIG_SYSFS - kvfree(dev->_rx); -#endif + netif_free_rx_queues(dev); kfree(rcu_dereference_protected(dev->ingress_queue, 1)); @@ -8214,12 +8390,6 @@ void free_netdev(struct net_device *dev) free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; - prog = rcu_dereference_protected(dev->xdp_prog, 1); - if (prog) { - bpf_prog_put(prog); - static_key_slow_dec(&generic_xdp_needed); - } - /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { netdev_freemem(dev); @@ -8332,7 +8502,7 @@ EXPORT_SYMBOL(unregister_netdev); int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) { - int err, new_nsid; + int err, new_nsid, new_ifindex; ASSERT_RTNL(); @@ -8388,11 +8558,16 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); - if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) - new_nsid = peernet2id_alloc(dev_net(dev), net); + + new_nsid = peernet2id_alloc(dev_net(dev), net); + /* If there is an ifindex conflict assign a new one */ + if (__dev_get_by_index(net, dev->ifindex)) + new_ifindex = dev_new_index(net); else - new_nsid = peernet2id(dev_net(dev), net); - rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid); + new_ifindex = dev->ifindex; + + rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid, + new_ifindex); /* * Flush the unicast and multicast chains @@ -8406,10 +8581,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* Actually switch the network namespace */ dev_net_set(dev, net); - - /* If there is an ifindex conflict assign a new one */ - if (__dev_get_by_index(net, dev->ifindex)) - dev->ifindex = dev_new_index(net); + dev->ifindex = new_ifindex; /* Send a netdev-add uevent to the new namespace */ kobject_uevent(&dev->dev.kobj, KOBJ_ADD); @@ -8807,6 +8979,9 @@ static int __init net_dev_init(void) skb_queue_head_init(&sd->input_pkt_queue); skb_queue_head_init(&sd->process_queue); +#ifdef CONFIG_XFRM_OFFLOAD + skb_queue_head_init(&sd->xfrm_backlog); +#endif INIT_LIST_HEAD(&sd->poll_list); sd->output_queue_tailp = &sd->output_queue; #ifdef CONFIG_RPS diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 7e690d0ccd05..0ab1af04296c 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -18,26 +18,10 @@ * match. --pb */ -static int dev_ifname(struct net *net, struct ifreq __user *arg) +static int dev_ifname(struct net *net, struct ifreq *ifr) { - struct ifreq ifr; - int error; - - /* - * Fetch the caller's info block. - */ - - if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) - return -EFAULT; - ifr.ifr_name[IFNAMSIZ-1] = 0; - - error = netdev_get_name(net, ifr.ifr_name, ifr.ifr_ifindex); - if (error) - return error; - - if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) - return -EFAULT; - return 0; + ifr->ifr_name[IFNAMSIZ-1] = 0; + return netdev_get_name(net, ifr->ifr_name, ifr->ifr_ifindex); } static gifconf_func_t *gifconf_list[NPROTO]; @@ -66,9 +50,8 @@ EXPORT_SYMBOL(register_gifconf); * Thus we will need a 'compatibility mode'. */ -static int dev_ifconf(struct net *net, char __user *arg) +int dev_ifconf(struct net *net, struct ifconf *ifc, int size) { - struct ifconf ifc; struct net_device *dev; char __user *pos; int len; @@ -79,11 +62,8 @@ static int dev_ifconf(struct net *net, char __user *arg) * Fetch the caller's info block. */ - if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) - return -EFAULT; - - pos = ifc.ifc_buf; - len = ifc.ifc_len; + pos = ifc->ifc_buf; + len = ifc->ifc_len; /* * Loop over the interfaces, and write an info block for each. @@ -95,10 +75,10 @@ static int dev_ifconf(struct net *net, char __user *arg) if (gifconf_list[i]) { int done; if (!pos) - done = gifconf_list[i](dev, NULL, 0); + done = gifconf_list[i](dev, NULL, 0, size); else done = gifconf_list[i](dev, pos + total, - len - total); + len - total, size); if (done < 0) return -EFAULT; total += done; @@ -109,12 +89,12 @@ static int dev_ifconf(struct net *net, char __user *arg) /* * All done. Write the updated control block back to the caller. */ - ifc.ifc_len = total; + ifc->ifc_len = total; /* * Both BSD and Solaris return 0 here, so we do too. */ - return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0; + return 0; } /* @@ -406,53 +386,24 @@ EXPORT_SYMBOL(dev_load); * positive or a negative errno code on error. */ -int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) +int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_copyout) { - struct ifreq ifr; int ret; char *colon; - /* One special case: SIOCGIFCONF takes ifconf argument - and requires shared lock, because it sleeps writing - to user space. - */ - - if (cmd == SIOCGIFCONF) { - rtnl_lock(); - ret = dev_ifconf(net, (char __user *) arg); - rtnl_unlock(); - return ret; - } + if (need_copyout) + *need_copyout = true; if (cmd == SIOCGIFNAME) - return dev_ifname(net, (struct ifreq __user *)arg); - - /* - * Take care of Wireless Extensions. Unfortunately struct iwreq - * isn't a proper subset of struct ifreq (it's 8 byte shorter) - * so we need to treat it specially, otherwise applications may - * fault if the struct they're passing happens to land at the - * end of a mapped page. - */ - if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { - struct iwreq iwr; - - if (copy_from_user(&iwr, arg, sizeof(iwr))) - return -EFAULT; - - iwr.ifr_name[sizeof(iwr.ifr_name) - 1] = 0; + return dev_ifname(net, ifr); - return wext_handle_ioctl(net, &iwr, cmd, arg); - } - - if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) - return -EFAULT; - - ifr.ifr_name[IFNAMSIZ-1] = 0; + ifr->ifr_name[IFNAMSIZ-1] = 0; - colon = strchr(ifr.ifr_name, ':'); + colon = strchr(ifr->ifr_name, ':'); if (colon) *colon = 0; + dev_load(net, ifr->ifr_name); + /* * See which interface the caller is talking about. */ @@ -472,31 +423,19 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCGIFMAP: case SIOCGIFINDEX: case SIOCGIFTXQLEN: - dev_load(net, ifr.ifr_name); rcu_read_lock(); - ret = dev_ifsioc_locked(net, &ifr, cmd); + ret = dev_ifsioc_locked(net, ifr, cmd); rcu_read_unlock(); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } + if (colon) + *colon = ':'; return ret; case SIOCETHTOOL: - dev_load(net, ifr.ifr_name); rtnl_lock(); - ret = dev_ethtool(net, &ifr); + ret = dev_ethtool(net, ifr); rtnl_unlock(); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } + if (colon) + *colon = ':'; return ret; /* @@ -510,17 +449,11 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCSIFNAME: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; - dev_load(net, ifr.ifr_name); rtnl_lock(); - ret = dev_ifsioc(net, &ifr, cmd); + ret = dev_ifsioc(net, ifr, cmd); rtnl_unlock(); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } + if (colon) + *colon = ':'; return ret; /* @@ -561,10 +494,11 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) /* fall through */ case SIOCBONDSLAVEINFOQUERY: case SIOCBONDINFOQUERY: - dev_load(net, ifr.ifr_name); rtnl_lock(); - ret = dev_ifsioc(net, &ifr, cmd); + ret = dev_ifsioc(net, ifr, cmd); rtnl_unlock(); + if (need_copyout) + *need_copyout = false; return ret; case SIOCGIFMEM: @@ -584,13 +518,9 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) cmd == SIOCGHWTSTAMP || (cmd >= SIOCDEVPRIVATE && cmd <= SIOCDEVPRIVATE + 15)) { - dev_load(net, ifr.ifr_name); rtnl_lock(); - ret = dev_ifsioc(net, &ifr, cmd); + ret = dev_ifsioc(net, ifr, cmd); rtnl_unlock(); - if (!ret && copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; return ret; } return -ENOTTY; diff --git a/net/core/devlink.c b/net/core/devlink.c index 7d430c1d9c3e..18d385ed8237 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -92,12 +92,6 @@ static LIST_HEAD(devlink_list); */ static DEFINE_MUTEX(devlink_mutex); -/* devlink_port_mutex - * - * Shared lock to guard lists of ports in all devlink devices. - */ -static DEFINE_MUTEX(devlink_port_mutex); - static struct net *devlink_net(const struct devlink *devlink) { return read_pnet(&devlink->_net); @@ -335,15 +329,18 @@ devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb, #define DEVLINK_NL_FLAG_NEED_DEVLINK BIT(0) #define DEVLINK_NL_FLAG_NEED_PORT BIT(1) #define DEVLINK_NL_FLAG_NEED_SB BIT(2) -#define DEVLINK_NL_FLAG_LOCK_PORTS BIT(3) - /* port is not needed but we need to ensure they don't - * change in the middle of command - */ + +/* The per devlink instance lock is taken by default in the pre-doit + * operation, yet several commands do not require this. The global + * devlink lock is taken and protects from disruption by user-calls. + */ +#define DEVLINK_NL_FLAG_NO_LOCK BIT(3) static int devlink_nl_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink; + int err; mutex_lock(&devlink_mutex); devlink = devlink_get_from_info(info); @@ -351,44 +348,47 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops, mutex_unlock(&devlink_mutex); return PTR_ERR(devlink); } + if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) + mutex_lock(&devlink->lock); if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK) { info->user_ptr[0] = devlink; } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) { struct devlink_port *devlink_port; - mutex_lock(&devlink_port_mutex); devlink_port = devlink_port_get_from_info(devlink, info); if (IS_ERR(devlink_port)) { - mutex_unlock(&devlink_port_mutex); - mutex_unlock(&devlink_mutex); - return PTR_ERR(devlink_port); + err = PTR_ERR(devlink_port); + goto unlock; } info->user_ptr[0] = devlink_port; } - if (ops->internal_flags & DEVLINK_NL_FLAG_LOCK_PORTS) { - mutex_lock(&devlink_port_mutex); - } if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_SB) { struct devlink_sb *devlink_sb; devlink_sb = devlink_sb_get_from_info(devlink, info); if (IS_ERR(devlink_sb)) { - if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) - mutex_unlock(&devlink_port_mutex); - mutex_unlock(&devlink_mutex); - return PTR_ERR(devlink_sb); + err = PTR_ERR(devlink_sb); + goto unlock; } info->user_ptr[1] = devlink_sb; } return 0; + +unlock: + if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) + mutex_unlock(&devlink->lock); + mutex_unlock(&devlink_mutex); + return err; } static void devlink_nl_post_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) { - if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT || - ops->internal_flags & DEVLINK_NL_FLAG_LOCK_PORTS) - mutex_unlock(&devlink_port_mutex); + struct devlink *devlink; + + devlink = devlink_get_from_info(info); + if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) + mutex_unlock(&devlink->lock); mutex_unlock(&devlink_mutex); } @@ -614,10 +614,10 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg, int err; mutex_lock(&devlink_mutex); - mutex_lock(&devlink_port_mutex); list_for_each_entry(devlink, &devlink_list, list) { if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) continue; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_port, &devlink->port_list, list) { if (idx < start) { idx++; @@ -628,13 +628,15 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); - if (err) + if (err) { + mutex_unlock(&devlink->lock); goto out; + } idx++; } + mutex_unlock(&devlink->lock); } out: - mutex_unlock(&devlink_port_mutex); mutex_unlock(&devlink_mutex); cb->args[0] = idx; @@ -801,6 +803,7 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg, list_for_each_entry(devlink, &devlink_list, list) { if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) continue; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { if (idx < start) { idx++; @@ -811,10 +814,13 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); - if (err) + if (err) { + mutex_unlock(&devlink->lock); goto out; + } idx++; } + mutex_unlock(&devlink->lock); } out: mutex_unlock(&devlink_mutex); @@ -935,14 +941,18 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg, if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || !devlink->ops || !devlink->ops->sb_pool_get) continue; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { err = __sb_pool_get_dumpit(msg, start, &idx, devlink, devlink_sb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq); - if (err && err != -EOPNOTSUPP) + if (err && err != -EOPNOTSUPP) { + mutex_unlock(&devlink->lock); goto out; + } } + mutex_unlock(&devlink->lock); } out: mutex_unlock(&devlink_mutex); @@ -1123,22 +1133,24 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, int err; mutex_lock(&devlink_mutex); - mutex_lock(&devlink_port_mutex); list_for_each_entry(devlink, &devlink_list, list) { if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || !devlink->ops || !devlink->ops->sb_port_pool_get) continue; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { err = __sb_port_pool_get_dumpit(msg, start, &idx, devlink, devlink_sb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq); - if (err && err != -EOPNOTSUPP) + if (err && err != -EOPNOTSUPP) { + mutex_unlock(&devlink->lock); goto out; + } } + mutex_unlock(&devlink->lock); } out: - mutex_unlock(&devlink_port_mutex); mutex_unlock(&devlink_mutex); cb->args[0] = idx; @@ -1347,23 +1359,26 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, int err; mutex_lock(&devlink_mutex); - mutex_lock(&devlink_port_mutex); list_for_each_entry(devlink, &devlink_list, list) { if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || !devlink->ops || !devlink->ops->sb_tc_pool_bind_get) continue; + + mutex_lock(&devlink->lock); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { err = __sb_tc_pool_bind_get_dumpit(msg, start, &idx, devlink, devlink_sb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq); - if (err && err != -EOPNOTSUPP) + if (err && err != -EOPNOTSUPP) { + mutex_unlock(&devlink->lock); goto out; + } } + mutex_unlock(&devlink->lock); } out: - mutex_unlock(&devlink_port_mutex); mutex_unlock(&devlink_mutex); cb->args[0] = idx; @@ -1679,6 +1694,12 @@ static int devlink_dpipe_table_put(struct sk_buff *skb, table->counters_enabled)) goto nla_put_failure; + if (table->resource_valid) { + nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID, + table->resource_id, DEVLINK_ATTR_PAD); + nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS, + table->resource_units, DEVLINK_ATTR_PAD); + } if (devlink_dpipe_matches_put(table, skb)) goto nla_put_failure; @@ -2273,6 +2294,273 @@ static int devlink_nl_cmd_dpipe_table_counters_set(struct sk_buff *skb, counters_enable); } +static struct devlink_resource * +devlink_resource_find(struct devlink *devlink, + struct devlink_resource *resource, u64 resource_id) +{ + struct list_head *resource_list; + + if (resource) + resource_list = &resource->resource_list; + else + resource_list = &devlink->resource_list; + + list_for_each_entry(resource, resource_list, list) { + struct devlink_resource *child_resource; + + if (resource->id == resource_id) + return resource; + + child_resource = devlink_resource_find(devlink, resource, + resource_id); + if (child_resource) + return child_resource; + } + return NULL; +} + +static void +devlink_resource_validate_children(struct devlink_resource *resource) +{ + struct devlink_resource *child_resource; + bool size_valid = true; + u64 parts_size = 0; + + if (list_empty(&resource->resource_list)) + goto out; + + list_for_each_entry(child_resource, &resource->resource_list, list) + parts_size += child_resource->size_new; + + if (parts_size > resource->size) + size_valid = false; +out: + resource->size_valid = size_valid; +} + +static int devlink_nl_cmd_resource_set(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_resource *resource; + u64 resource_id; + u64 size; + int err; + + if (!info->attrs[DEVLINK_ATTR_RESOURCE_ID] || + !info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]) + return -EINVAL; + resource_id = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_ID]); + + resource = devlink_resource_find(devlink, NULL, resource_id); + if (!resource) + return -EINVAL; + + if (!resource->resource_ops->size_validate) + return -EINVAL; + + size = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]); + err = resource->resource_ops->size_validate(devlink, size, + info->extack); + if (err) + return err; + + resource->size_new = size; + devlink_resource_validate_children(resource); + if (resource->parent) + devlink_resource_validate_children(resource->parent); + return 0; +} + +static void +devlink_resource_size_params_put(struct devlink_resource *resource, + struct sk_buff *skb) +{ + struct devlink_resource_size_params *size_params; + + size_params = resource->size_params; + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_GRAN, + size_params->size_granularity, DEVLINK_ATTR_PAD); + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MAX, + size_params->size_max, DEVLINK_ATTR_PAD); + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MIN, + size_params->size_min, DEVLINK_ATTR_PAD); + nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_UNIT, size_params->unit); +} + +static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb, + struct devlink_resource *resource) +{ + struct devlink_resource *child_resource; + struct nlattr *child_resource_attr; + struct nlattr *resource_attr; + + resource_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE); + if (!resource_attr) + return -EMSGSIZE; + + if (nla_put_string(skb, DEVLINK_ATTR_RESOURCE_NAME, resource->name) || + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE, resource->size, + DEVLINK_ATTR_PAD) || + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_ID, resource->id, + DEVLINK_ATTR_PAD)) + goto nla_put_failure; + if (resource->size != resource->size_new) + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW, + resource->size_new, DEVLINK_ATTR_PAD); + if (resource->resource_ops && resource->resource_ops->occ_get) + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_OCC, + resource->resource_ops->occ_get(devlink), + DEVLINK_ATTR_PAD); + devlink_resource_size_params_put(resource, skb); + if (list_empty(&resource->resource_list)) + goto out; + + if (nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_SIZE_VALID, + resource->size_valid)) + goto nla_put_failure; + + child_resource_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE_LIST); + if (!child_resource_attr) + goto nla_put_failure; + + list_for_each_entry(child_resource, &resource->resource_list, list) { + if (devlink_resource_put(devlink, skb, child_resource)) + goto resource_put_failure; + } + + nla_nest_end(skb, child_resource_attr); +out: + nla_nest_end(skb, resource_attr); + return 0; + +resource_put_failure: + nla_nest_cancel(skb, child_resource_attr); +nla_put_failure: + nla_nest_cancel(skb, resource_attr); + return -EMSGSIZE; +} + +static int devlink_resource_fill(struct genl_info *info, + enum devlink_command cmd, int flags) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_resource *resource; + struct nlattr *resources_attr; + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + bool incomplete; + void *hdr; + int i; + int err; + + resource = list_first_entry(&devlink->resource_list, + struct devlink_resource, list); +start_again: + err = devlink_dpipe_send_and_alloc_skb(&skb, info); + if (err) + return err; + + hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, + &devlink_nl_family, NLM_F_MULTI, cmd); + if (!hdr) { + nlmsg_free(skb); + return -EMSGSIZE; + } + + if (devlink_nl_put_handle(skb, devlink)) + goto nla_put_failure; + + resources_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE_LIST); + if (!resources_attr) + goto nla_put_failure; + + incomplete = false; + i = 0; + list_for_each_entry_from(resource, &devlink->resource_list, list) { + err = devlink_resource_put(devlink, skb, resource); + if (err) { + if (!i) + goto err_resource_put; + incomplete = true; + break; + } + i++; + } + nla_nest_end(skb, resources_attr); + genlmsg_end(skb, hdr); + if (incomplete) + goto start_again; +send_done: + nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, + NLMSG_DONE, 0, flags | NLM_F_MULTI); + if (!nlh) { + err = devlink_dpipe_send_and_alloc_skb(&skb, info); + if (err) + goto err_skb_send_alloc; + goto send_done; + } + return genlmsg_reply(skb, info); + +nla_put_failure: + err = -EMSGSIZE; +err_resource_put: +err_skb_send_alloc: + genlmsg_cancel(skb, hdr); + nlmsg_free(skb); + return err; +} + +static int devlink_nl_cmd_resource_dump(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + + if (list_empty(&devlink->resource_list)) + return -EOPNOTSUPP; + + return devlink_resource_fill(info, DEVLINK_CMD_RESOURCE_DUMP, 0); +} + +static int +devlink_resources_validate(struct devlink *devlink, + struct devlink_resource *resource, + struct genl_info *info) +{ + struct list_head *resource_list; + int err = 0; + + if (resource) + resource_list = &resource->resource_list; + else + resource_list = &devlink->resource_list; + + list_for_each_entry(resource, resource_list, list) { + if (!resource->size_valid) + return -EINVAL; + err = devlink_resources_validate(devlink, resource, info); + if (err) + return err; + } + return err; +} + +static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + int err; + + if (!devlink->ops->reload) + return -EOPNOTSUPP; + + err = devlink_resources_validate(devlink, NULL, info); + if (err) { + NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed"); + return err; + } + return devlink->ops->reload(devlink); +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -2291,6 +2579,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = { .type = NLA_U8 }, [DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 }, + [DEVLINK_ATTR_RESOURCE_ID] = { .type = NLA_U64}, + [DEVLINK_ATTR_RESOURCE_SIZE] = { .type = NLA_U64}, }; static const struct genl_ops devlink_nl_ops[] = { @@ -2322,14 +2612,16 @@ static const struct genl_ops devlink_nl_ops[] = { .doit = devlink_nl_cmd_port_split_doit, .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_PORT_UNSPLIT, .doit = devlink_nl_cmd_port_unsplit_doit, .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_SB_GET, @@ -2397,8 +2689,7 @@ static const struct genl_ops devlink_nl_ops[] = { .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NEED_SB | - DEVLINK_NL_FLAG_LOCK_PORTS, + DEVLINK_NL_FLAG_NEED_SB, }, { .cmd = DEVLINK_CMD_SB_OCC_MAX_CLEAR, @@ -2406,8 +2697,7 @@ static const struct genl_ops devlink_nl_ops[] = { .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NEED_SB | - DEVLINK_NL_FLAG_LOCK_PORTS, + DEVLINK_NL_FLAG_NEED_SB, }, { .cmd = DEVLINK_CMD_ESWITCH_GET, @@ -2451,6 +2741,28 @@ static const struct genl_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, + { + .cmd = DEVLINK_CMD_RESOURCE_SET, + .doit = devlink_nl_cmd_resource_set, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_RESOURCE_DUMP, + .doit = devlink_nl_cmd_resource_dump, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_RELOAD, + .doit = devlink_nl_cmd_reload, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NO_LOCK, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { @@ -2488,6 +2800,8 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) INIT_LIST_HEAD(&devlink->port_list); INIT_LIST_HEAD(&devlink->sb_list); INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); + INIT_LIST_HEAD(&devlink->resource_list); + mutex_init(&devlink->lock); return devlink; } EXPORT_SYMBOL_GPL(devlink_alloc); @@ -2550,16 +2864,16 @@ int devlink_port_register(struct devlink *devlink, struct devlink_port *devlink_port, unsigned int port_index) { - mutex_lock(&devlink_port_mutex); + mutex_lock(&devlink->lock); if (devlink_port_index_exists(devlink, port_index)) { - mutex_unlock(&devlink_port_mutex); + mutex_unlock(&devlink->lock); return -EEXIST; } devlink_port->devlink = devlink; devlink_port->index = port_index; devlink_port->registered = true; list_add_tail(&devlink_port->list, &devlink->port_list); - mutex_unlock(&devlink_port_mutex); + mutex_unlock(&devlink->lock); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); return 0; } @@ -2572,10 +2886,12 @@ EXPORT_SYMBOL_GPL(devlink_port_register); */ void devlink_port_unregister(struct devlink_port *devlink_port) { + struct devlink *devlink = devlink_port->devlink; + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); - mutex_lock(&devlink_port_mutex); + mutex_lock(&devlink->lock); list_del(&devlink_port->list); - mutex_unlock(&devlink_port_mutex); + mutex_unlock(&devlink->lock); } EXPORT_SYMBOL_GPL(devlink_port_unregister); @@ -2651,7 +2967,7 @@ int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, struct devlink_sb *devlink_sb; int err = 0; - mutex_lock(&devlink_mutex); + mutex_lock(&devlink->lock); if (devlink_sb_index_exists(devlink, sb_index)) { err = -EEXIST; goto unlock; @@ -2670,7 +2986,7 @@ int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, devlink_sb->egress_tc_count = egress_tc_count; list_add_tail(&devlink_sb->list, &devlink->sb_list); unlock: - mutex_unlock(&devlink_mutex); + mutex_unlock(&devlink->lock); return err; } EXPORT_SYMBOL_GPL(devlink_sb_register); @@ -2679,11 +2995,11 @@ void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index) { struct devlink_sb *devlink_sb; - mutex_lock(&devlink_mutex); + mutex_lock(&devlink->lock); devlink_sb = devlink_sb_get_by_index(devlink, sb_index); WARN_ON(!devlink_sb); list_del(&devlink_sb->list); - mutex_unlock(&devlink_mutex); + mutex_unlock(&devlink->lock); kfree(devlink_sb); } EXPORT_SYMBOL_GPL(devlink_sb_unregister); @@ -2699,9 +3015,9 @@ EXPORT_SYMBOL_GPL(devlink_sb_unregister); int devlink_dpipe_headers_register(struct devlink *devlink, struct devlink_dpipe_headers *dpipe_headers) { - mutex_lock(&devlink_mutex); + mutex_lock(&devlink->lock); devlink->dpipe_headers = dpipe_headers; - mutex_unlock(&devlink_mutex); + mutex_unlock(&devlink->lock); return 0; } EXPORT_SYMBOL_GPL(devlink_dpipe_headers_register); @@ -2715,9 +3031,9 @@ EXPORT_SYMBOL_GPL(devlink_dpipe_headers_register); */ void devlink_dpipe_headers_unregister(struct devlink *devlink) { - mutex_lock(&devlink_mutex); + mutex_lock(&devlink->lock); devlink->dpipe_headers = NULL; - mutex_unlock(&devlink_mutex); + mutex_unlock(&devlink->lock); } EXPORT_SYMBOL_GPL(devlink_dpipe_headers_unregister); @@ -2783,9 +3099,9 @@ int devlink_dpipe_table_register(struct devlink *devlink, table->priv = priv; table->counter_control_extern = counter_control_extern; - mutex_lock(&devlink_mutex); + mutex_lock(&devlink->lock); list_add_tail_rcu(&table->list, &devlink->dpipe_table_list); - mutex_unlock(&devlink_mutex); + mutex_unlock(&devlink->lock); return 0; } EXPORT_SYMBOL_GPL(devlink_dpipe_table_register); @@ -2801,20 +3117,182 @@ void devlink_dpipe_table_unregister(struct devlink *devlink, { struct devlink_dpipe_table *table; - mutex_lock(&devlink_mutex); + mutex_lock(&devlink->lock); table = devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name); if (!table) goto unlock; list_del_rcu(&table->list); - mutex_unlock(&devlink_mutex); + mutex_unlock(&devlink->lock); kfree_rcu(table, rcu); return; unlock: - mutex_unlock(&devlink_mutex); + mutex_unlock(&devlink->lock); } EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister); +/** + * devlink_resource_register - devlink resource register + * + * @devlink: devlink + * @resource_name: resource's name + * @top_hierarchy: top hierarchy + * @reload_required: reload is required for new configuration to + * apply + * @resource_size: resource's size + * @resource_id: resource's id + * @parent_reosurce_id: resource's parent id + * @size params: size parameters + * @resource_ops: resource ops + */ +int devlink_resource_register(struct devlink *devlink, + const char *resource_name, + bool top_hierarchy, + u64 resource_size, + u64 resource_id, + u64 parent_resource_id, + struct devlink_resource_size_params *size_params, + const struct devlink_resource_ops *resource_ops) +{ + struct devlink_resource *resource; + struct list_head *resource_list; + int err = 0; + + mutex_lock(&devlink->lock); + resource = devlink_resource_find(devlink, NULL, resource_id); + if (resource) { + err = -EINVAL; + goto out; + } + + resource = kzalloc(sizeof(*resource), GFP_KERNEL); + if (!resource) { + err = -ENOMEM; + goto out; + } + + if (top_hierarchy) { + resource_list = &devlink->resource_list; + } else { + struct devlink_resource *parent_resource; + + parent_resource = devlink_resource_find(devlink, NULL, + parent_resource_id); + if (parent_resource) { + resource_list = &parent_resource->resource_list; + resource->parent = parent_resource; + } else { + kfree(resource); + err = -EINVAL; + goto out; + } + } + + resource->name = resource_name; + resource->size = resource_size; + resource->size_new = resource_size; + resource->id = resource_id; + resource->resource_ops = resource_ops; + resource->size_valid = true; + resource->size_params = size_params; + INIT_LIST_HEAD(&resource->resource_list); + list_add_tail(&resource->list, resource_list); +out: + mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_resource_register); + +/** + * devlink_resources_unregister - free all resources + * + * @devlink: devlink + * @resource: resource + */ +void devlink_resources_unregister(struct devlink *devlink, + struct devlink_resource *resource) +{ + struct devlink_resource *tmp, *child_resource; + struct list_head *resource_list; + + if (resource) + resource_list = &resource->resource_list; + else + resource_list = &devlink->resource_list; + + if (!resource) + mutex_lock(&devlink->lock); + + list_for_each_entry_safe(child_resource, tmp, resource_list, list) { + devlink_resources_unregister(devlink, child_resource); + list_del(&child_resource->list); + kfree(child_resource); + } + + if (!resource) + mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devlink_resources_unregister); + +/** + * devlink_resource_size_get - get and update size + * + * @devlink: devlink + * @resource_id: the requested resource id + * @p_resource_size: ptr to update + */ +int devlink_resource_size_get(struct devlink *devlink, + u64 resource_id, + u64 *p_resource_size) +{ + struct devlink_resource *resource; + int err = 0; + + mutex_lock(&devlink->lock); + resource = devlink_resource_find(devlink, NULL, resource_id); + if (!resource) { + err = -EINVAL; + goto out; + } + *p_resource_size = resource->size_new; + resource->size = resource->size_new; +out: + mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_resource_size_get); + +/** + * devlink_dpipe_table_resource_set - set the resource id + * + * @devlink: devlink + * @table_name: table name + * @resource_id: resource id + * @resource_units: number of resource's units consumed per table's entry + */ +int devlink_dpipe_table_resource_set(struct devlink *devlink, + const char *table_name, u64 resource_id, + u64 resource_units) +{ + struct devlink_dpipe_table *table; + int err = 0; + + mutex_lock(&devlink->lock); + table = devlink_dpipe_table_find(&devlink->dpipe_table_list, + table_name); + if (!table) { + err = -EINVAL; + goto out; + } + table->resource_id = resource_id; + table->resource_units = resource_units; + table->resource_valid = true; +out: + mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_dpipe_table_resource_set); + static int __init devlink_module_init(void) { return genl_register_family(&devlink_nl_family); diff --git a/net/core/dst.c b/net/core/dst.c index 662a2d4a3d19..007aa0b08291 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -21,6 +21,7 @@ #include <linux/sched.h> #include <linux/prefetch.h> #include <net/lwtunnel.h> +#include <net/xfrm.h> #include <net/dst.h> #include <net/dst_metadata.h> @@ -62,15 +63,12 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, struct net_device *dev, int initial_ref, int initial_obsolete, unsigned short flags) { - dst->child = NULL; dst->dev = dev; if (dev) dev_hold(dev); dst->ops = ops; dst_init_metrics(dst, dst_default_metrics.metrics, true); dst->expires = 0UL; - dst->path = dst; - dst->from = NULL; #ifdef CONFIG_XFRM dst->xfrm = NULL; #endif @@ -88,7 +86,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, dst->__use = 0; dst->lastuse = jiffies; dst->flags = flags; - dst->next = NULL; if (!(flags & DST_NOCOUNT)) dst_entries_add(ops, 1); } @@ -116,12 +113,17 @@ EXPORT_SYMBOL(dst_alloc); struct dst_entry *dst_destroy(struct dst_entry * dst) { - struct dst_entry *child; + struct dst_entry *child = NULL; smp_rmb(); - child = dst->child; +#ifdef CONFIG_XFRM + if (dst->xfrm) { + struct xfrm_dst *xdst = (struct xfrm_dst *) dst; + child = xdst->child; + } +#endif if (!(dst->flags & DST_NOCOUNT)) dst_entries_add(dst->ops, -1); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index f8fcf450a36e..107b122c8969 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -73,6 +73,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_LLTX_BIT] = "tx-lockless", [NETIF_F_NETNS_LOCAL_BIT] = "netns-local", [NETIF_F_GRO_BIT] = "rx-gro", + [NETIF_F_GRO_HW_BIT] = "rx-gro-hw", [NETIF_F_LRO_BIT] = "rx-lro", [NETIF_F_TSO_BIT] = "tx-tcp-segmentation", @@ -770,15 +771,6 @@ static int ethtool_set_link_ksettings(struct net_device *dev, return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); } -static void -warn_incomplete_ethtool_legacy_settings_conversion(const char *details) -{ - char name[sizeof(current->comm)]; - - pr_info_once("warning: `%s' uses legacy ethtool link settings API, %s\n", - get_task_comm(name, current), details); -} - /* Query device for its ethtool_cmd settings. * * Backward compatibility note: for compatibility with legacy ethtool, @@ -805,10 +797,8 @@ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) &link_ksettings); if (err < 0) return err; - if (!convert_link_ksettings_to_legacy_settings(&cmd, - &link_ksettings)) - warn_incomplete_ethtool_legacy_settings_conversion( - "link modes are only partially reported"); + convert_link_ksettings_to_legacy_settings(&cmd, + &link_ksettings); /* send a sensible cmd tag back to user */ cmd.cmd = ETHTOOL_GSET; @@ -1703,14 +1693,23 @@ static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr) static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) { - struct ethtool_ringparam ringparam; + struct ethtool_ringparam ringparam, max = { .cmd = ETHTOOL_GRINGPARAM }; - if (!dev->ethtool_ops->set_ringparam) + if (!dev->ethtool_ops->set_ringparam || !dev->ethtool_ops->get_ringparam) return -EOPNOTSUPP; if (copy_from_user(&ringparam, useraddr, sizeof(ringparam))) return -EFAULT; + dev->ethtool_ops->get_ringparam(dev, &max); + + /* ensure new ring parameters are within the maximums */ + if (ringparam.rx_pending > max.rx_max_pending || + ringparam.rx_mini_pending > max.rx_mini_max_pending || + ringparam.rx_jumbo_pending > max.rx_jumbo_max_pending || + ringparam.tx_pending > max.tx_max_pending) + return -EINVAL; + return dev->ethtool_ops->set_ringparam(dev, &ringparam); } diff --git a/net/core/filter.c b/net/core/filter.c index 6a85e67fafce..08ab4c65a998 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -401,8 +401,8 @@ do_pass: /* Classic BPF expects A and X to be reset first. These need * to be guaranteed to be the first two instructions. */ - *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); - *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); + *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); /* All programs must keep CTX in callee saved BPF_REG_CTX. * In eBPF case it's done by the compiler, here we need to @@ -458,6 +458,17 @@ do_pass: convert_bpf_extensions(fp, &insn)) break; + if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || + fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { + *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X); + /* Error with exception code on div/mod by 0. + * For cBPF programs, this was always return 0. + */ + *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2); + *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *insn++ = BPF_EXIT_INSN(); + } + *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); break; @@ -1054,11 +1065,9 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) */ goto out_err_free; - /* We are guaranteed to never error here with cBPF to eBPF - * transitions, since there's no issue with type compatibility - * checks on program arrays. - */ fp = bpf_prog_select_runtime(fp, &err); + if (err) + goto out_err_free; kfree(old_prog); return fp; @@ -2684,8 +2693,9 @@ static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd) return 0; } -int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog) +static int xdp_do_generic_redirect_map(struct net_device *dev, + struct sk_buff *skb, + struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); unsigned long map_owner = ri->map_owner; @@ -2862,7 +2872,7 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM, - .arg5_type = ARG_CONST_SIZE, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; static unsigned short bpf_tunnel_key_af(u64 flags) @@ -3013,6 +3023,8 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; if (flags & BPF_F_DONT_FRAGMENT) info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; + if (flags & BPF_F_ZERO_CSUM_TX) + info->key.tun_flags &= ~TUNNEL_CSUM; info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; @@ -3026,8 +3038,6 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, IPV6_FLOWLABEL_MASK; } else { info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); - if (flags & BPF_F_ZERO_CSUM_TX) - info->key.tun_flags &= ~TUNNEL_CSUM; } return 0; @@ -3151,7 +3161,7 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM, - .arg5_type = ARG_CONST_SIZE, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) @@ -3229,6 +3239,29 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, ret = -EINVAL; } #ifdef CONFIG_INET +#if IS_ENABLED(CONFIG_IPV6) + } else if (level == SOL_IPV6) { + if (optlen != sizeof(int) || sk->sk_family != AF_INET6) + return -EINVAL; + + val = *((int *)optval); + /* Only some options are supported */ + switch (optname) { + case IPV6_TCLASS: + if (val < -1 || val > 0xff) { + ret = -EINVAL; + } else { + struct ipv6_pinfo *np = inet6_sk(sk); + + if (val == -1) + val = 0; + np->tclass = val; + } + break; + default: + ret = -EINVAL; + } +#endif } else if (level == SOL_TCP && sk->sk_prot->setsockopt == tcp_setsockopt) { if (optname == TCP_CONGESTION) { @@ -3238,7 +3271,8 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, strncpy(name, optval, min_t(long, optlen, TCP_CA_NAME_MAX-1)); name[TCP_CA_NAME_MAX-1] = 0; - ret = tcp_set_congestion_control(sk, name, false, reinit); + ret = tcp_set_congestion_control(sk, name, false, + reinit); } else { struct tcp_sock *tp = tcp_sk(sk); @@ -3304,6 +3338,22 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, } else { goto err_clear; } +#if IS_ENABLED(CONFIG_IPV6) + } else if (level == SOL_IPV6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + if (optlen != sizeof(int) || sk->sk_family != AF_INET6) + goto err_clear; + + /* Only some options are supported */ + switch (optname) { + case IPV6_TCLASS: + *((int *)optval) = (int)np->tclass; + break; + default: + goto err_clear; + } +#endif } else { goto err_clear; } @@ -3325,6 +3375,33 @@ static const struct bpf_func_proto bpf_getsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; +BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, + int, argval) +{ + struct sock *sk = bpf_sock->sk; + int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS; + + if (!sk_fullsock(sk)) + return -EINVAL; + +#ifdef CONFIG_INET + if (val) + tcp_sk(sk)->bpf_sock_ops_cb_flags = val; + + return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS); +#else + return -EINVAL; +#endif +} + +static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = { + .func = bpf_sock_ops_cb_flags_set, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -3457,6 +3534,8 @@ xdp_func_proto(enum bpf_func_id func_id) return &bpf_xdp_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_csum_diff: + return &bpf_csum_diff_proto; case BPF_FUNC_xdp_adjust_head: return &bpf_xdp_adjust_head_proto; case BPF_FUNC_xdp_adjust_meta: @@ -3505,6 +3584,8 @@ static const struct bpf_func_proto * return &bpf_setsockopt_proto; case BPF_FUNC_getsockopt: return &bpf_getsockopt_proto; + case BPF_FUNC_sock_ops_cb_flags_set: + return &bpf_sock_ops_cb_flags_set_proto; case BPF_FUNC_sock_map_update: return &bpf_sock_map_update_proto; default: @@ -3821,34 +3902,44 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); -static bool __is_valid_sock_ops_access(int off, int size) +static bool sock_ops_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) { + const int size_default = sizeof(__u32); + if (off < 0 || off >= sizeof(struct bpf_sock_ops)) return false; + /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; - if (size != sizeof(__u32)) - return false; - return true; -} - -static bool sock_ops_is_valid_access(int off, int size, - enum bpf_access_type type, - struct bpf_insn_access_aux *info) -{ if (type == BPF_WRITE) { switch (off) { - case offsetof(struct bpf_sock_ops, op) ... - offsetof(struct bpf_sock_ops, replylong[3]): + case offsetof(struct bpf_sock_ops, reply): + case offsetof(struct bpf_sock_ops, sk_txhash): + if (size != size_default) + return false; break; default: return false; } + } else { + switch (off) { + case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received, + bytes_acked): + if (size != sizeof(__u64)) + return false; + break; + default: + if (size != size_default) + return false; + break; + } } - return __is_valid_sock_ops_access(off, size); + return true; } static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, @@ -4303,6 +4394,24 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data_end)); break; + case offsetof(struct xdp_md, ingress_ifindex): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, rxq)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev), + si->dst_reg, si->dst_reg, + offsetof(struct xdp_rxq_info, dev)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct net_device, ifindex)); + break; + case offsetof(struct xdp_md, rx_queue_index): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, rxq)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct xdp_rxq_info, + queue_index)); + break; } return insn - insn_buf; @@ -4437,6 +4546,211 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_num)); break; + + case offsetof(struct bpf_sock_ops, is_fullsock): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, + is_fullsock), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + is_fullsock)); + break; + + case offsetof(struct bpf_sock_ops, state): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_state) != 1); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_state)); + break; + + case offsetof(struct bpf_sock_ops, rtt_min): + BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) != + sizeof(struct minmax)); + BUILD_BUG_ON(sizeof(struct minmax) < + sizeof(struct minmax_sample)); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct tcp_sock, rtt_min) + + FIELD_SIZEOF(struct minmax_sample, t)); + break; + +/* Helper macro for adding read access to tcp_sock or sock fields. */ +#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ + do { \ + BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ + FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, \ + is_fullsock), \ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + is_fullsock)); \ + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, sk),\ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, sk));\ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \ + OBJ_FIELD), \ + si->dst_reg, si->dst_reg, \ + offsetof(OBJ, OBJ_FIELD)); \ + } while (0) + +/* Helper macro for adding write access to tcp_sock or sock fields. + * The macro is called with two registers, dst_reg which contains a pointer + * to ctx (context) and src_reg which contains the value that should be + * stored. However, we need an additional register since we cannot overwrite + * dst_reg because it may be used later in the program. + * Instead we "borrow" one of the other register. We first save its value + * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore + * it at the end of the macro. + */ +#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ + do { \ + int reg = BPF_REG_9; \ + BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ + FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, \ + is_fullsock), \ + reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + is_fullsock)); \ + *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, sk),\ + reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, sk));\ + *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD), \ + reg, si->src_reg, \ + offsetof(OBJ, OBJ_FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + } while (0) + +#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \ + do { \ + if (TYPE == BPF_WRITE) \ + SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ + else \ + SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ + } while (0) + + case offsetof(struct bpf_sock_ops, snd_cwnd): + SOCK_OPS_GET_FIELD(snd_cwnd, snd_cwnd, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, srtt_us): + SOCK_OPS_GET_FIELD(srtt_us, srtt_us, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags): + SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, snd_ssthresh): + SOCK_OPS_GET_FIELD(snd_ssthresh, snd_ssthresh, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, rcv_nxt): + SOCK_OPS_GET_FIELD(rcv_nxt, rcv_nxt, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, snd_nxt): + SOCK_OPS_GET_FIELD(snd_nxt, snd_nxt, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, snd_una): + SOCK_OPS_GET_FIELD(snd_una, snd_una, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, mss_cache): + SOCK_OPS_GET_FIELD(mss_cache, mss_cache, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, ecn_flags): + SOCK_OPS_GET_FIELD(ecn_flags, ecn_flags, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, rate_delivered): + SOCK_OPS_GET_FIELD(rate_delivered, rate_delivered, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, rate_interval_us): + SOCK_OPS_GET_FIELD(rate_interval_us, rate_interval_us, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, packets_out): + SOCK_OPS_GET_FIELD(packets_out, packets_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, retrans_out): + SOCK_OPS_GET_FIELD(retrans_out, retrans_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, total_retrans): + SOCK_OPS_GET_FIELD(total_retrans, total_retrans, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, segs_in): + SOCK_OPS_GET_FIELD(segs_in, segs_in, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, data_segs_in): + SOCK_OPS_GET_FIELD(data_segs_in, data_segs_in, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, segs_out): + SOCK_OPS_GET_FIELD(segs_out, segs_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, data_segs_out): + SOCK_OPS_GET_FIELD(data_segs_out, data_segs_out, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, lost_out): + SOCK_OPS_GET_FIELD(lost_out, lost_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, sacked_out): + SOCK_OPS_GET_FIELD(sacked_out, sacked_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, sk_txhash): + SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash, + struct sock, type); + break; + + case offsetof(struct bpf_sock_ops, bytes_received): + SOCK_OPS_GET_FIELD(bytes_received, bytes_received, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, bytes_acked): + SOCK_OPS_GET_FIELD(bytes_acked, bytes_acked, struct tcp_sock); + break; + } return insn - insn_buf; } @@ -4473,6 +4787,7 @@ const struct bpf_verifier_ops sk_filter_verifier_ops = { }; const struct bpf_prog_ops sk_filter_prog_ops = { + .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops tc_cls_act_verifier_ops = { diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 15ce30063765..559db9ea8d86 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -24,6 +24,7 @@ #include <linux/tcp.h> #include <net/flow_dissector.h> #include <scsi/fc/fc_fcoe.h> +#include <uapi/linux/batadv_packet.h> static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) @@ -133,10 +134,10 @@ skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type, ctrl->addr_type = type; } -static void -__skb_flow_dissect_tunnel_info(const struct sk_buff *skb, - struct flow_dissector *flow_dissector, - void *target_container) +void +skb_flow_dissect_tunnel_info(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container) { struct ip_tunnel_info *info; struct ip_tunnel_key *key; @@ -212,6 +213,7 @@ __skb_flow_dissect_tunnel_info(const struct sk_buff *skb, tp->dst = key->tp_dst; } } +EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); static enum flow_dissect_ret __skb_flow_dissect_mpls(const struct sk_buff *skb, @@ -436,6 +438,57 @@ __skb_flow_dissect_gre(const struct sk_buff *skb, return FLOW_DISSECT_RET_PROTO_AGAIN; } +/** + * __skb_flow_dissect_batadv() - dissect batman-adv header + * @skb: sk_buff to with the batman-adv header + * @key_control: flow dissectors control key + * @data: raw buffer pointer to the packet, if NULL use skb->data + * @p_proto: pointer used to update the protocol to process next + * @p_nhoff: pointer used to update inner network header offset + * @hlen: packet header length + * @flags: any combination of FLOW_DISSECTOR_F_* + * + * ETH_P_BATMAN packets are tried to be dissected. Only + * &struct batadv_unicast packets are actually processed because they contain an + * inner ethernet header and are usually followed by actual network header. This + * allows the flow dissector to continue processing the packet. + * + * Return: FLOW_DISSECT_RET_PROTO_AGAIN when &struct batadv_unicast was found, + * FLOW_DISSECT_RET_OUT_GOOD when dissector should stop after encapsulation, + * otherwise FLOW_DISSECT_RET_OUT_BAD + */ +static enum flow_dissect_ret +__skb_flow_dissect_batadv(const struct sk_buff *skb, + struct flow_dissector_key_control *key_control, + void *data, __be16 *p_proto, int *p_nhoff, int hlen, + unsigned int flags) +{ + struct { + struct batadv_unicast_packet batadv_unicast; + struct ethhdr eth; + } *hdr, _hdr; + + hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr), data, hlen, + &_hdr); + if (!hdr) + return FLOW_DISSECT_RET_OUT_BAD; + + if (hdr->batadv_unicast.version != BATADV_COMPAT_VERSION) + return FLOW_DISSECT_RET_OUT_BAD; + + if (hdr->batadv_unicast.packet_type != BATADV_UNICAST) + return FLOW_DISSECT_RET_OUT_BAD; + + *p_proto = hdr->eth.h_proto; + *p_nhoff += sizeof(*hdr); + + key_control->flags |= FLOW_DIS_ENCAPSULATION; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) + return FLOW_DISSECT_RET_OUT_GOOD; + + return FLOW_DISSECT_RET_PROTO_AGAIN; +} + static void __skb_flow_dissect_tcp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, @@ -576,9 +629,6 @@ bool __skb_flow_dissect(const struct sk_buff *skb, FLOW_DISSECTOR_KEY_BASIC, target_container); - __skb_flow_dissect_tunnel_info(skb, flow_dissector, - target_container); - if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct ethhdr *eth = eth_hdr(skb); @@ -817,6 +867,11 @@ proto_again: nhoff, hlen); break; + case htons(ETH_P_BATMAN): + fdret = __skb_flow_dissect_batadv(skb, key_control, data, + &proto, &nhoff, hlen, flags); + break; + default: fdret = FLOW_DISSECT_RET_OUT_BAD; break; @@ -976,8 +1031,8 @@ ip_proto_again: out_good: ret = true; - key_control->thoff = (u16)nhoff; out: + key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen); key_basic->n_proto = proto; key_basic->ip_proto = ip_proto; @@ -985,7 +1040,6 @@ out: out_bad: ret = false; - key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen); goto out; } EXPORT_SYMBOL(__skb_flow_dissect); diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 9834cfa21b21..0a3f88f08727 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -159,7 +159,11 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, est->intvl_log = intvl_log; est->cpu_bstats = cpu_bstats; + if (stats_lock) + local_bh_disable(); est_fetch_counters(est, &b); + if (stats_lock) + local_bh_enable(); est->last_bytes = b.bytes; est->last_packets = b.packets; old = rcu_dereference_protected(*rate_est, 1); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 87f28557b329..b2b2323bdc84 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -252,10 +252,10 @@ __gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats, } } -static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, - const struct gnet_stats_queue __percpu *cpu, - const struct gnet_stats_queue *q, - __u32 qlen) +void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *cpu, + const struct gnet_stats_queue *q, + __u32 qlen) { if (cpu) { __gnet_stats_copy_queue_cpu(qstats, cpu); @@ -269,6 +269,7 @@ static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, qstats->qlen = qlen; } +EXPORT_SYMBOL(__gnet_stats_copy_queue); /** * gnet_stats_copy_queue - copy queue statistics into statistics TLV diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 982861607f88..e38e641e98d5 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -92,7 +92,7 @@ static bool linkwatch_urgent_event(struct net_device *dev) if (dev->ifindex != dev_get_iflink(dev)) return true; - if (dev->priv_flags & IFF_TEAM_PORT) + if (netif_is_lag_port(dev) || netif_is_lag_master(dev)) return true; return netif_carrier_ok(dev) && qdisc_tx_changing(dev); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index d1f5fe986edd..7b7a14abba28 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -532,7 +532,7 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, if (atomic_read(&tbl->entries) > (1 << nht->hash_shift)) nht = neigh_hash_grow(tbl, nht->hash_shift + 1); - hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); + hash_val = tbl->hash(n->primary_key, dev, nht->hash_rnd) >> (32 - nht->hash_shift); if (n->parms->dead) { rc = ERR_PTR(-EINVAL); @@ -544,7 +544,7 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, n1 != NULL; n1 = rcu_dereference_protected(n1->next, lockdep_is_held(&tbl->lock))) { - if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) { + if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) { if (want_ref) neigh_hold(n1); rc = n1; @@ -2862,7 +2862,6 @@ static int neigh_stat_seq_open(struct inode *inode, struct file *file) }; static const struct file_operations neigh_stat_seq_fops = { - .owner = THIS_MODULE, .open = neigh_stat_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 615ccab55f38..e010bb800d7b 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -182,7 +182,6 @@ static int dev_seq_open(struct inode *inode, struct file *file) } static const struct file_operations dev_seq_fops = { - .owner = THIS_MODULE, .open = dev_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -202,7 +201,6 @@ static int softnet_seq_open(struct inode *inode, struct file *file) } static const struct file_operations softnet_seq_fops = { - .owner = THIS_MODULE, .open = softnet_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -306,7 +304,6 @@ static int ptype_seq_open(struct inode *inode, struct file *file) } static const struct file_operations ptype_seq_fops = { - .owner = THIS_MODULE, .open = ptype_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -387,7 +384,6 @@ static int dev_mc_seq_open(struct inode *inode, struct file *file) } static const struct file_operations dev_mc_seq_fops = { - .owner = THIS_MODULE, .open = dev_mc_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 799b75268291..60a5ad2c33ee 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -295,10 +295,31 @@ static ssize_t carrier_changes_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); return sprintf(buf, fmt_dec, - atomic_read(&netdev->carrier_changes)); + atomic_read(&netdev->carrier_up_count) + + atomic_read(&netdev->carrier_down_count)); } static DEVICE_ATTR_RO(carrier_changes); +static ssize_t carrier_up_count_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + + return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_up_count)); +} +static DEVICE_ATTR_RO(carrier_up_count); + +static ssize_t carrier_down_count_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + + return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_down_count)); +} +static DEVICE_ATTR_RO(carrier_down_count); + /* read-write attributes */ static int change_mtu(struct net_device *dev, unsigned long new_mtu) @@ -325,29 +346,6 @@ static ssize_t flags_store(struct device *dev, struct device_attribute *attr, } NETDEVICE_SHOW_RW(flags, fmt_hex); -static int change_tx_queue_len(struct net_device *dev, unsigned long new_len) -{ - unsigned int orig_len = dev->tx_queue_len; - int res; - - if (new_len != (unsigned int)new_len) - return -ERANGE; - - if (new_len != orig_len) { - dev->tx_queue_len = new_len; - res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev); - res = notifier_to_errno(res); - if (res) { - netdev_err(dev, - "refused to change device tx_queue_len\n"); - dev->tx_queue_len = orig_len; - return -EFAULT; - } - } - - return 0; -} - static ssize_t tx_queue_len_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) @@ -355,7 +353,7 @@ static ssize_t tx_queue_len_store(struct device *dev, if (!capable(CAP_NET_ADMIN)) return -EPERM; - return netdev_store(dev, attr, buf, len, change_tx_queue_len); + return netdev_store(dev, attr, buf, len, dev_change_tx_queue_len); } NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec); @@ -547,6 +545,8 @@ static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_phys_port_name.attr, &dev_attr_phys_switch_id.attr, &dev_attr_proto_down.attr, + &dev_attr_carrier_up_count.attr, + &dev_attr_carrier_down_count.attr, NULL, }; ATTRIBUTE_GROUPS(net_class); @@ -961,7 +961,7 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) while (--i >= new_num) { struct kobject *kobj = &dev->_rx[i].kobj; - if (!atomic_read(&dev_net(dev)->count)) + if (!refcount_read(&dev_net(dev)->count)) kobj->uevent_suppress = 1; if (dev->sysfs_rx_queue_group) sysfs_remove_group(kobj, dev->sysfs_rx_queue_group); @@ -1367,7 +1367,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) while (--i >= new_num) { struct netdev_queue *queue = dev->_tx + i; - if (!atomic_read(&dev_net(dev)->count)) + if (!refcount_read(&dev_net(dev)->count)) queue->kobj.uevent_suppress = 1; #ifdef CONFIG_BQL sysfs_remove_group(&queue->kobj, &dql_group); @@ -1558,7 +1558,7 @@ void netdev_unregister_kobject(struct net_device *ndev) { struct device *dev = &ndev->dev; - if (!atomic_read(&dev_net(ndev)->count)) + if (!refcount_read(&dev_net(ndev)->count)) dev_set_uevent_suppress(dev, 1); kobject_get(&dev->kobj); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index b797832565d3..3cad5f51afd3 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -35,7 +35,7 @@ LIST_HEAD(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list); struct net init_net = { - .count = ATOMIC_INIT(1), + .count = REFCOUNT_INIT(1), .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head), }; EXPORT_SYMBOL(init_net); @@ -221,17 +221,26 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id); */ int peernet2id_alloc(struct net *net, struct net *peer) { - bool alloc; + bool alloc = false, alive = false; int id; - if (atomic_read(&net->count) == 0) + if (refcount_read(&net->count) == 0) return NETNSA_NSID_NOT_ASSIGNED; spin_lock_bh(&net->nsid_lock); - alloc = atomic_read(&peer->count) == 0 ? false : true; + /* + * When peer is obtained from RCU lists, we may race with + * its cleanup. Check whether it's alive, and this guarantees + * we never hash a peer back to net->netns_ids, after it has + * just been idr_remove()'d from there in cleanup_net(). + */ + if (maybe_get_net(peer)) + alive = alloc = true; id = __peernet2id_alloc(net, peer, &alloc); spin_unlock_bh(&net->nsid_lock); if (alloc && id >= 0) rtnl_net_notifyid(net, RTM_NEWNSID, id); + if (alive) + put_net(peer); return id; } EXPORT_SYMBOL_GPL(peernet2id_alloc); @@ -264,11 +273,9 @@ struct net *get_net_ns_by_id(struct net *net, int id) return NULL; rcu_read_lock(); - spin_lock_bh(&net->nsid_lock); peer = idr_find(&net->netns_ids, id); if (peer) - get_net(peer); - spin_unlock_bh(&net->nsid_lock); + peer = maybe_get_net(peer); rcu_read_unlock(); return peer; @@ -284,7 +291,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) int error = 0; LIST_HEAD(net_exit_list); - atomic_set(&net->count, 1); + refcount_set(&net->count, 1); refcount_set(&net->passive, 1); net->dev_base_seq = 1; net->user_ns = user_ns; @@ -432,13 +439,40 @@ struct net *copy_net_ns(unsigned long flags, return net; } +static void unhash_nsid(struct net *net, struct net *last) +{ + struct net *tmp; + /* This function is only called from cleanup_net() work, + * and this work is the only process, that may delete + * a net from net_namespace_list. So, when the below + * is executing, the list may only grow. Thus, we do not + * use for_each_net_rcu() or rtnl_lock(). + */ + for_each_net(tmp) { + int id; + + spin_lock_bh(&tmp->nsid_lock); + id = __peernet2id(tmp, net); + if (id >= 0) + idr_remove(&tmp->netns_ids, id); + spin_unlock_bh(&tmp->nsid_lock); + if (id >= 0) + rtnl_net_notifyid(tmp, RTM_DELNSID, id); + if (tmp == last) + break; + } + spin_lock_bh(&net->nsid_lock); + idr_destroy(&net->netns_ids); + spin_unlock_bh(&net->nsid_lock); +} + static DEFINE_SPINLOCK(cleanup_list_lock); static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */ static void cleanup_net(struct work_struct *work) { const struct pernet_operations *ops; - struct net *net, *tmp; + struct net *net, *tmp, *last; struct list_head net_kill_list; LIST_HEAD(net_exit_list); @@ -451,26 +485,25 @@ static void cleanup_net(struct work_struct *work) /* Don't let anyone else find us. */ rtnl_lock(); - list_for_each_entry(net, &net_kill_list, cleanup_list) { + list_for_each_entry(net, &net_kill_list, cleanup_list) list_del_rcu(&net->list); - list_add_tail(&net->exit_list, &net_exit_list); - for_each_net(tmp) { - int id; - - spin_lock_bh(&tmp->nsid_lock); - id = __peernet2id(tmp, net); - if (id >= 0) - idr_remove(&tmp->netns_ids, id); - spin_unlock_bh(&tmp->nsid_lock); - if (id >= 0) - rtnl_net_notifyid(tmp, RTM_DELNSID, id); - } - spin_lock_bh(&net->nsid_lock); - idr_destroy(&net->netns_ids); - spin_unlock_bh(&net->nsid_lock); + /* Cache last net. After we unlock rtnl, no one new net + * added to net_namespace_list can assign nsid pointer + * to a net from net_kill_list (see peernet2id_alloc()). + * So, we skip them in unhash_nsid(). + * + * Note, that unhash_nsid() does not delete nsid links + * between net_kill_list's nets, as they've already + * deleted from net_namespace_list. But, this would be + * useless anyway, as netns_ids are destroyed there. + */ + last = list_last_entry(&net_namespace_list, struct net, list); + rtnl_unlock(); + list_for_each_entry(net, &net_kill_list, cleanup_list) { + unhash_nsid(net, last); + list_add_tail(&net->exit_list, &net_exit_list); } - rtnl_unlock(); /* * Another CPU might be rcu-iterating the list, wait for it. diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 1c4810919a0a..b9057478d69c 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -14,7 +14,6 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> -#include <linux/module.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> diff --git a/net/core/pktgen.c b/net/core/pktgen.c index f95a15086225..b8ab5c829511 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -184,25 +184,44 @@ #define func_enter() pr_debug("entering %s\n", __func__); +#define PKT_FLAGS \ + pf(IPV6) /* Interface in IPV6 Mode */ \ + pf(IPSRC_RND) /* IP-Src Random */ \ + pf(IPDST_RND) /* IP-Dst Random */ \ + pf(TXSIZE_RND) /* Transmit size is random */ \ + pf(UDPSRC_RND) /* UDP-Src Random */ \ + pf(UDPDST_RND) /* UDP-Dst Random */ \ + pf(UDPCSUM) /* Include UDP checksum */ \ + pf(NO_TIMESTAMP) /* Don't timestamp packets (default TS) */ \ + pf(MPLS_RND) /* Random MPLS labels */ \ + pf(QUEUE_MAP_RND) /* queue map Random */ \ + pf(QUEUE_MAP_CPU) /* queue map mirrors smp_processor_id() */ \ + pf(FLOW_SEQ) /* Sequential flows */ \ + pf(IPSEC) /* ipsec on for flows */ \ + pf(MACSRC_RND) /* MAC-Src Random */ \ + pf(MACDST_RND) /* MAC-Dst Random */ \ + pf(VID_RND) /* Random VLAN ID */ \ + pf(SVID_RND) /* Random SVLAN ID */ \ + pf(NODE) /* Node memory alloc*/ \ + +#define pf(flag) flag##_SHIFT, +enum pkt_flags { + PKT_FLAGS +}; +#undef pf + /* Device flag bits */ -#define F_IPSRC_RND (1<<0) /* IP-Src Random */ -#define F_IPDST_RND (1<<1) /* IP-Dst Random */ -#define F_UDPSRC_RND (1<<2) /* UDP-Src Random */ -#define F_UDPDST_RND (1<<3) /* UDP-Dst Random */ -#define F_MACSRC_RND (1<<4) /* MAC-Src Random */ -#define F_MACDST_RND (1<<5) /* MAC-Dst Random */ -#define F_TXSIZE_RND (1<<6) /* Transmit size is random */ -#define F_IPV6 (1<<7) /* Interface in IPV6 Mode */ -#define F_MPLS_RND (1<<8) /* Random MPLS labels */ -#define F_VID_RND (1<<9) /* Random VLAN ID */ -#define F_SVID_RND (1<<10) /* Random SVLAN ID */ -#define F_FLOW_SEQ (1<<11) /* Sequential flows */ -#define F_IPSEC_ON (1<<12) /* ipsec on for flows */ -#define F_QUEUE_MAP_RND (1<<13) /* queue map Random */ -#define F_QUEUE_MAP_CPU (1<<14) /* queue map mirrors smp_processor_id() */ -#define F_NODE (1<<15) /* Node memory alloc*/ -#define F_UDPCSUM (1<<16) /* Include UDP checksum */ -#define F_NO_TIMESTAMP (1<<17) /* Don't timestamp packets (default TS) */ +#define pf(flag) static const __u32 F_##flag = (1<<flag##_SHIFT); +PKT_FLAGS +#undef pf + +#define pf(flag) __stringify(flag), +static char *pkt_flag_names[] = { + PKT_FLAGS +}; +#undef pf + +#define NR_PKT_FLAGS ARRAY_SIZE(pkt_flag_names) /* Thread control flag bits */ #define T_STOP (1<<0) /* Stop run */ @@ -399,7 +418,7 @@ struct pktgen_dev { __u8 ipsmode; /* IPSEC mode (config) */ __u8 ipsproto; /* IPSEC type (config) */ __u32 spi; - struct dst_entry dst; + struct xfrm_dst xdst; struct dst_ops dstops; #endif char result[512]; @@ -523,7 +542,6 @@ static int pgctrl_open(struct inode *inode, struct file *file) } static const struct file_operations pktgen_fops = { - .owner = THIS_MODULE, .open = pgctrl_open, .read = seq_read, .llseek = seq_lseek, @@ -535,6 +553,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v) { const struct pktgen_dev *pkt_dev = seq->private; ktime_t stopped; + unsigned int i; u64 idle; seq_printf(seq, @@ -596,7 +615,6 @@ static int pktgen_if_show(struct seq_file *seq, void *v) pkt_dev->src_mac_count, pkt_dev->dst_mac_count); if (pkt_dev->nr_labels) { - unsigned int i; seq_puts(seq, " mpls: "); for (i = 0; i < pkt_dev->nr_labels; i++) seq_printf(seq, "%08x%s", ntohl(pkt_dev->labels[i]), @@ -632,68 +650,21 @@ static int pktgen_if_show(struct seq_file *seq, void *v) seq_puts(seq, " Flags: "); - if (pkt_dev->flags & F_IPV6) - seq_puts(seq, "IPV6 "); - - if (pkt_dev->flags & F_IPSRC_RND) - seq_puts(seq, "IPSRC_RND "); - - if (pkt_dev->flags & F_IPDST_RND) - seq_puts(seq, "IPDST_RND "); - - if (pkt_dev->flags & F_TXSIZE_RND) - seq_puts(seq, "TXSIZE_RND "); - - if (pkt_dev->flags & F_UDPSRC_RND) - seq_puts(seq, "UDPSRC_RND "); - - if (pkt_dev->flags & F_UDPDST_RND) - seq_puts(seq, "UDPDST_RND "); - - if (pkt_dev->flags & F_UDPCSUM) - seq_puts(seq, "UDPCSUM "); - - if (pkt_dev->flags & F_NO_TIMESTAMP) - seq_puts(seq, "NO_TIMESTAMP "); - - if (pkt_dev->flags & F_MPLS_RND) - seq_puts(seq, "MPLS_RND "); - - if (pkt_dev->flags & F_QUEUE_MAP_RND) - seq_puts(seq, "QUEUE_MAP_RND "); + for (i = 0; i < NR_PKT_FLAGS; i++) { + if (i == F_FLOW_SEQ) + if (!pkt_dev->cflows) + continue; - if (pkt_dev->flags & F_QUEUE_MAP_CPU) - seq_puts(seq, "QUEUE_MAP_CPU "); - - if (pkt_dev->cflows) { - if (pkt_dev->flags & F_FLOW_SEQ) - seq_puts(seq, "FLOW_SEQ "); /*in sequence flows*/ - else - seq_puts(seq, "FLOW_RND "); - } + if (pkt_dev->flags & (1 << i)) + seq_printf(seq, "%s ", pkt_flag_names[i]); + else if (i == F_FLOW_SEQ) + seq_puts(seq, "FLOW_RND "); #ifdef CONFIG_XFRM - if (pkt_dev->flags & F_IPSEC_ON) { - seq_puts(seq, "IPSEC "); - if (pkt_dev->spi) + if (i == F_IPSEC && pkt_dev->spi) seq_printf(seq, "spi:%u", pkt_dev->spi); - } #endif - - if (pkt_dev->flags & F_MACSRC_RND) - seq_puts(seq, "MACSRC_RND "); - - if (pkt_dev->flags & F_MACDST_RND) - seq_puts(seq, "MACDST_RND "); - - if (pkt_dev->flags & F_VID_RND) - seq_puts(seq, "VID_RND "); - - if (pkt_dev->flags & F_SVID_RND) - seq_puts(seq, "SVID_RND "); - - if (pkt_dev->flags & F_NODE) - seq_puts(seq, "NODE_ALLOC "); + } seq_puts(seq, "\n"); @@ -859,6 +830,35 @@ static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev) return i; } +static __u32 pktgen_read_flag(const char *f, bool *disable) +{ + __u32 i; + + if (f[0] == '!') { + *disable = true; + f++; + } + + for (i = 0; i < NR_PKT_FLAGS; i++) { + if (!IS_ENABLED(CONFIG_XFRM) && i == IPSEC_SHIFT) + continue; + + /* allow only disabling ipv6 flag */ + if (!*disable && i == IPV6_SHIFT) + continue; + + if (strcmp(f, pkt_flag_names[i]) == 0) + return 1 << i; + } + + if (strcmp(f, "FLOW_RND") == 0) { + *disable = !*disable; + return F_FLOW_SEQ; + } + + return 0; +} + static ssize_t pktgen_if_write(struct file *file, const char __user * user_buffer, size_t count, loff_t * offset) @@ -1216,7 +1216,10 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "flag")) { + __u32 flag; char f[32]; + bool disable = false; + memset(f, 0, 32); len = strn_len(&user_buffer[i], sizeof(f) - 1); if (len < 0) @@ -1225,107 +1228,15 @@ static ssize_t pktgen_if_write(struct file *file, if (copy_from_user(f, &user_buffer[i], len)) return -EFAULT; i += len; - if (strcmp(f, "IPSRC_RND") == 0) - pkt_dev->flags |= F_IPSRC_RND; - - else if (strcmp(f, "!IPSRC_RND") == 0) - pkt_dev->flags &= ~F_IPSRC_RND; - - else if (strcmp(f, "TXSIZE_RND") == 0) - pkt_dev->flags |= F_TXSIZE_RND; - - else if (strcmp(f, "!TXSIZE_RND") == 0) - pkt_dev->flags &= ~F_TXSIZE_RND; - - else if (strcmp(f, "IPDST_RND") == 0) - pkt_dev->flags |= F_IPDST_RND; - - else if (strcmp(f, "!IPDST_RND") == 0) - pkt_dev->flags &= ~F_IPDST_RND; - - else if (strcmp(f, "UDPSRC_RND") == 0) - pkt_dev->flags |= F_UDPSRC_RND; - - else if (strcmp(f, "!UDPSRC_RND") == 0) - pkt_dev->flags &= ~F_UDPSRC_RND; - - else if (strcmp(f, "UDPDST_RND") == 0) - pkt_dev->flags |= F_UDPDST_RND; - - else if (strcmp(f, "!UDPDST_RND") == 0) - pkt_dev->flags &= ~F_UDPDST_RND; - - else if (strcmp(f, "MACSRC_RND") == 0) - pkt_dev->flags |= F_MACSRC_RND; - - else if (strcmp(f, "!MACSRC_RND") == 0) - pkt_dev->flags &= ~F_MACSRC_RND; - - else if (strcmp(f, "MACDST_RND") == 0) - pkt_dev->flags |= F_MACDST_RND; - - else if (strcmp(f, "!MACDST_RND") == 0) - pkt_dev->flags &= ~F_MACDST_RND; - - else if (strcmp(f, "MPLS_RND") == 0) - pkt_dev->flags |= F_MPLS_RND; - - else if (strcmp(f, "!MPLS_RND") == 0) - pkt_dev->flags &= ~F_MPLS_RND; - else if (strcmp(f, "VID_RND") == 0) - pkt_dev->flags |= F_VID_RND; + flag = pktgen_read_flag(f, &disable); - else if (strcmp(f, "!VID_RND") == 0) - pkt_dev->flags &= ~F_VID_RND; - - else if (strcmp(f, "SVID_RND") == 0) - pkt_dev->flags |= F_SVID_RND; - - else if (strcmp(f, "!SVID_RND") == 0) - pkt_dev->flags &= ~F_SVID_RND; - - else if (strcmp(f, "FLOW_SEQ") == 0) - pkt_dev->flags |= F_FLOW_SEQ; - - else if (strcmp(f, "QUEUE_MAP_RND") == 0) - pkt_dev->flags |= F_QUEUE_MAP_RND; - - else if (strcmp(f, "!QUEUE_MAP_RND") == 0) - pkt_dev->flags &= ~F_QUEUE_MAP_RND; - - else if (strcmp(f, "QUEUE_MAP_CPU") == 0) - pkt_dev->flags |= F_QUEUE_MAP_CPU; - - else if (strcmp(f, "!QUEUE_MAP_CPU") == 0) - pkt_dev->flags &= ~F_QUEUE_MAP_CPU; -#ifdef CONFIG_XFRM - else if (strcmp(f, "IPSEC") == 0) - pkt_dev->flags |= F_IPSEC_ON; -#endif - - else if (strcmp(f, "!IPV6") == 0) - pkt_dev->flags &= ~F_IPV6; - - else if (strcmp(f, "NODE_ALLOC") == 0) - pkt_dev->flags |= F_NODE; - - else if (strcmp(f, "!NODE_ALLOC") == 0) - pkt_dev->flags &= ~F_NODE; - - else if (strcmp(f, "UDPCSUM") == 0) - pkt_dev->flags |= F_UDPCSUM; - - else if (strcmp(f, "!UDPCSUM") == 0) - pkt_dev->flags &= ~F_UDPCSUM; - - else if (strcmp(f, "NO_TIMESTAMP") == 0) - pkt_dev->flags |= F_NO_TIMESTAMP; - - else if (strcmp(f, "!NO_TIMESTAMP") == 0) - pkt_dev->flags &= ~F_NO_TIMESTAMP; - - else { + if (flag) { + if (disable) + pkt_dev->flags &= ~flag; + else + pkt_dev->flags |= flag; + } else { sprintf(pg_result, "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s", f, @@ -1804,7 +1715,6 @@ static int pktgen_if_open(struct inode *inode, struct file *file) } static const struct file_operations pktgen_if_fops = { - .owner = THIS_MODULE, .open = pktgen_if_open, .read = seq_read, .llseek = seq_lseek, @@ -1942,7 +1852,6 @@ static int pktgen_thread_open(struct inode *inode, struct file *file) } static const struct file_operations pktgen_thread_fops = { - .owner = THIS_MODULE, .open = pktgen_thread_open, .read = seq_read, .llseek = seq_lseek, @@ -2544,7 +2453,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) pkt_dev->flows[flow].cur_daddr = pkt_dev->cur_daddr; #ifdef CONFIG_XFRM - if (pkt_dev->flags & F_IPSEC_ON) + if (pkt_dev->flags & F_IPSEC) get_ipsec_sa(pkt_dev, flow); #endif pkt_dev->nflows++; @@ -2609,7 +2518,7 @@ static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev) * supports both transport/tunnel mode + ESP/AH type. */ if ((x->props.mode == XFRM_MODE_TUNNEL) && (pkt_dev->spi != 0)) - skb->_skb_refdst = (unsigned long)&pkt_dev->dst | SKB_DST_NOREF; + skb->_skb_refdst = (unsigned long)&pkt_dev->xdst.u.dst | SKB_DST_NOREF; rcu_read_lock_bh(); err = x->outer_mode->output(x, skb); @@ -2649,7 +2558,7 @@ static void free_SAs(struct pktgen_dev *pkt_dev) static int process_ipsec(struct pktgen_dev *pkt_dev, struct sk_buff *skb, __be16 protocol) { - if (pkt_dev->flags & F_IPSEC_ON) { + if (pkt_dev->flags & F_IPSEC) { struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x; int nhead = 0; if (x) { @@ -3742,10 +3651,10 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) * performance under such circumstance. */ pkt_dev->dstops.family = AF_INET; - pkt_dev->dst.dev = pkt_dev->odev; - dst_init_metrics(&pkt_dev->dst, pktgen_dst_metrics, false); - pkt_dev->dst.child = &pkt_dev->dst; - pkt_dev->dst.ops = &pkt_dev->dstops; + pkt_dev->xdst.u.dst.dev = pkt_dev->odev; + dst_init_metrics(&pkt_dev->xdst.u.dst, pktgen_dst_metrics, false); + pkt_dev->xdst.child = &pkt_dev->xdst.u.dst; + pkt_dev->xdst.u.dst.ops = &pkt_dev->dstops; #endif return add_dev_to_thread(t, pkt_dev); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index dabba2a91fc8..56af8e41abfc 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -62,7 +62,9 @@ struct rtnl_link { rtnl_doit_func doit; rtnl_dumpit_func dumpit; + struct module *owner; unsigned int flags; + struct rcu_head rcu; }; static DEFINE_MUTEX(rtnl_mutex); @@ -127,8 +129,7 @@ bool lockdep_rtnl_is_held(void) EXPORT_SYMBOL(lockdep_rtnl_is_held); #endif /* #ifdef CONFIG_PROVE_LOCKING */ -static struct rtnl_link __rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; -static refcount_t rtnl_msg_handlers_ref[RTNL_FAMILY_MAX + 1]; +static struct rtnl_link *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; static inline int rtm_msgindex(int msgtype) { @@ -144,72 +145,127 @@ static inline int rtm_msgindex(int msgtype) return msgindex; } -/** - * __rtnl_register - Register a rtnetlink message type - * @protocol: Protocol family or PF_UNSPEC - * @msgtype: rtnetlink message type - * @doit: Function pointer called for each request message - * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message - * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions - * - * Registers the specified function pointers (at least one of them has - * to be non-NULL) to be called whenever a request message for the - * specified protocol family and message type is received. - * - * The special protocol family PF_UNSPEC may be used to define fallback - * function pointers for the case when no entry for the specific protocol - * family exists. - * - * Returns 0 on success or a negative error code. - */ -int __rtnl_register(int protocol, int msgtype, - rtnl_doit_func doit, rtnl_dumpit_func dumpit, - unsigned int flags) +static struct rtnl_link *rtnl_get_link(int protocol, int msgtype) +{ + struct rtnl_link **tab; + + if (protocol >= ARRAY_SIZE(rtnl_msg_handlers)) + protocol = PF_UNSPEC; + + tab = rcu_dereference_rtnl(rtnl_msg_handlers[protocol]); + if (!tab) + tab = rcu_dereference_rtnl(rtnl_msg_handlers[PF_UNSPEC]); + + return tab[msgtype]; +} + +static int rtnl_register_internal(struct module *owner, + int protocol, int msgtype, + rtnl_doit_func doit, rtnl_dumpit_func dumpit, + unsigned int flags) { - struct rtnl_link *tab; + struct rtnl_link *link, *old; + struct rtnl_link __rcu **tab; int msgindex; + int ret = -ENOBUFS; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); - tab = rcu_dereference_raw(rtnl_msg_handlers[protocol]); + rtnl_lock(); + tab = rtnl_msg_handlers[protocol]; if (tab == NULL) { - tab = kcalloc(RTM_NR_MSGTYPES, sizeof(*tab), GFP_KERNEL); - if (tab == NULL) - return -ENOBUFS; + tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL); + if (!tab) + goto unlock; + /* ensures we see the 0 stores */ rcu_assign_pointer(rtnl_msg_handlers[protocol], tab); } + old = rtnl_dereference(tab[msgindex]); + if (old) { + link = kmemdup(old, sizeof(*old), GFP_KERNEL); + if (!link) + goto unlock; + } else { + link = kzalloc(sizeof(*link), GFP_KERNEL); + if (!link) + goto unlock; + } + + WARN_ON(link->owner && link->owner != owner); + link->owner = owner; + + WARN_ON(doit && link->doit && link->doit != doit); if (doit) - tab[msgindex].doit = doit; + link->doit = doit; + WARN_ON(dumpit && link->dumpit && link->dumpit != dumpit); if (dumpit) - tab[msgindex].dumpit = dumpit; - tab[msgindex].flags |= flags; + link->dumpit = dumpit; - return 0; + link->flags |= flags; + + /* publish protocol:msgtype */ + rcu_assign_pointer(tab[msgindex], link); + ret = 0; + if (old) + kfree_rcu(old, rcu); +unlock: + rtnl_unlock(); + return ret; } -EXPORT_SYMBOL_GPL(__rtnl_register); + +/** + * rtnl_register_module - Register a rtnetlink message type + * + * @owner: module registering the hook (THIS_MODULE) + * @protocol: Protocol family or PF_UNSPEC + * @msgtype: rtnetlink message type + * @doit: Function pointer called for each request message + * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message + * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions + * + * Like rtnl_register, but for use by removable modules. + */ +int rtnl_register_module(struct module *owner, + int protocol, int msgtype, + rtnl_doit_func doit, rtnl_dumpit_func dumpit, + unsigned int flags) +{ + return rtnl_register_internal(owner, protocol, msgtype, + doit, dumpit, flags); +} +EXPORT_SYMBOL_GPL(rtnl_register_module); /** * rtnl_register - Register a rtnetlink message type + * @protocol: Protocol family or PF_UNSPEC + * @msgtype: rtnetlink message type + * @doit: Function pointer called for each request message + * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message + * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions + * + * Registers the specified function pointers (at least one of them has + * to be non-NULL) to be called whenever a request message for the + * specified protocol family and message type is received. * - * Identical to __rtnl_register() but panics on failure. This is useful - * as failure of this function is very unlikely, it can only happen due - * to lack of memory when allocating the chain to store all message - * handlers for a protocol. Meant for use in init functions where lack - * of memory implies no sense in continuing. + * The special protocol family PF_UNSPEC may be used to define fallback + * function pointers for the case when no entry for the specific protocol + * family exists. */ void rtnl_register(int protocol, int msgtype, rtnl_doit_func doit, rtnl_dumpit_func dumpit, unsigned int flags) { - if (__rtnl_register(protocol, msgtype, doit, dumpit, flags) < 0) - panic("Unable to register rtnetlink message handler, " - "protocol = %d, message type = %d\n", - protocol, msgtype); + int err; + + err = rtnl_register_internal(NULL, protocol, msgtype, doit, dumpit, + flags); + if (err) + pr_err("Unable to register rtnetlink message handler, " + "protocol = %d, message type = %d\n", protocol, msgtype); } -EXPORT_SYMBOL_GPL(rtnl_register); /** * rtnl_unregister - Unregister a rtnetlink message type @@ -220,24 +276,25 @@ EXPORT_SYMBOL_GPL(rtnl_register); */ int rtnl_unregister(int protocol, int msgtype) { - struct rtnl_link *handlers; + struct rtnl_link **tab, *link; int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); rtnl_lock(); - handlers = rtnl_dereference(rtnl_msg_handlers[protocol]); - if (!handlers) { + tab = rtnl_dereference(rtnl_msg_handlers[protocol]); + if (!tab) { rtnl_unlock(); return -ENOENT; } - handlers[msgindex].doit = NULL; - handlers[msgindex].dumpit = NULL; - handlers[msgindex].flags = 0; + link = tab[msgindex]; + rcu_assign_pointer(tab[msgindex], NULL); rtnl_unlock(); + kfree_rcu(link, rcu); + return 0; } EXPORT_SYMBOL_GPL(rtnl_unregister); @@ -251,20 +308,27 @@ EXPORT_SYMBOL_GPL(rtnl_unregister); */ void rtnl_unregister_all(int protocol) { - struct rtnl_link *handlers; + struct rtnl_link **tab, *link; + int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); rtnl_lock(); - handlers = rtnl_dereference(rtnl_msg_handlers[protocol]); + tab = rtnl_msg_handlers[protocol]; RCU_INIT_POINTER(rtnl_msg_handlers[protocol], NULL); + for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) { + link = tab[msgindex]; + if (!link) + continue; + + rcu_assign_pointer(tab[msgindex], NULL); + kfree_rcu(link, rcu); + } rtnl_unlock(); synchronize_net(); - while (refcount_read(&rtnl_msg_handlers_ref[protocol]) > 1) - schedule(); - kfree(handlers); + kfree(tab); } EXPORT_SYMBOL_GPL(rtnl_unregister_all); @@ -840,6 +904,10 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_MULTICAST */ nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_RX_DROPPED */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_TX_DROPPED */ + nla_total_size_64bit(sizeof(__u64)) + nla_total_size(sizeof(struct ifla_vf_trust))); return size; } else @@ -920,8 +988,11 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_xdp_size() /* IFLA_XDP */ + nla_total_size(4) /* IFLA_EVENT */ + nla_total_size(4) /* IFLA_NEW_NETNSID */ + + nla_total_size(4) /* IFLA_NEW_IFINDEX */ + nla_total_size(1) /* IFLA_PROTO_DOWN */ + nla_total_size(4) /* IFLA_IF_NETNSID */ + + nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */ + + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */ + 0; } @@ -1194,7 +1265,11 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, vf_stats.broadcast, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, - vf_stats.multicast, IFLA_VF_STATS_PAD)) { + vf_stats.multicast, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED, + vf_stats.rx_dropped, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED, + vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) { nla_nest_cancel(skb, vfstats); goto nla_put_vf_failure; } @@ -1261,6 +1336,7 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) { const struct net_device_ops *ops = dev->netdev_ops; const struct bpf_prog *generic_xdp_prog; + struct netdev_bpf xdp; ASSERT_RTNL(); @@ -1273,7 +1349,10 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) if (!ops->ndo_bpf) return XDP_ATTACHED_NONE; - return __dev_xdp_attached(dev, ops->ndo_bpf, prog_id); + __dev_xdp_query(dev, ops->ndo_bpf, &xdp); + *prog_id = xdp.prog_id; + + return xdp.prog_attached; } static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) @@ -1433,7 +1512,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *src_net, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask, - u32 event, int *new_nsid, int tgt_netnsid) + u32 event, int *new_nsid, int new_ifindex, + int tgt_netnsid) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; @@ -1475,8 +1555,13 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) || nla_put_ifalias(skb, dev) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, - atomic_read(&dev->carrier_changes)) || - nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) + atomic_read(&dev->carrier_up_count) + + atomic_read(&dev->carrier_down_count)) || + nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down) || + nla_put_u32(skb, IFLA_CARRIER_UP_COUNT, + atomic_read(&dev->carrier_up_count)) || + nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT, + atomic_read(&dev->carrier_down_count))) goto nla_put_failure; if (event != IFLA_EVENT_NONE) { @@ -1525,6 +1610,10 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, if (new_nsid && nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0) goto nla_put_failure; + if (new_ifindex && + nla_put_s32(skb, IFLA_NEW_IFINDEX, new_ifindex) < 0) + goto nla_put_failure; + rcu_read_lock(); if (rtnl_fill_link_af(skb, dev, ext_filter_mask)) @@ -1569,6 +1658,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_PROMISCUITY] = { .type = NLA_U32 }, [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 }, [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, + [IFLA_GSO_MAX_SEGS] = { .type = NLA_U32 }, + [IFLA_GSO_MAX_SIZE] = { .type = NLA_U32 }, [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */ [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, @@ -1578,6 +1669,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_EVENT] = { .type = NLA_U32 }, [IFLA_GROUP] = { .type = NLA_U32 }, [IFLA_IF_NETNSID] = { .type = NLA_S32 }, + [IFLA_CARRIER_UP_COUNT] = { .type = NLA_U32 }, + [IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -1681,18 +1774,18 @@ static bool link_dump_filtered(struct net_device *dev, return false; } -static struct net *get_target_net(struct sk_buff *skb, int netnsid) +static struct net *get_target_net(struct sock *sk, int netnsid) { struct net *net; - net = get_net_ns_by_id(sock_net(skb->sk), netnsid); + net = get_net_ns_by_id(sock_net(sk), netnsid); if (!net) return ERR_PTR(-EINVAL); /* For now, the caller is required to have CAP_NET_ADMIN in * the user namespace owning the target net ns. */ - if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { + if (!sk_ns_capable(sk, net->user_ns, CAP_NET_ADMIN)) { put_net(net); return ERR_PTR(-EACCES); } @@ -1733,7 +1826,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) ifla_policy, NULL) >= 0) { if (tb[IFLA_IF_NETNSID]) { netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]); - tgt_net = get_target_net(skb, netnsid); + tgt_net = get_target_net(skb->sk, netnsid); if (IS_ERR(tgt_net)) { tgt_net = net; netnsid = -1; @@ -1766,7 +1859,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, flags, - ext_filter_mask, 0, NULL, + ext_filter_mask, 0, NULL, 0, netnsid); if (err < 0) { @@ -1815,6 +1908,49 @@ struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) } EXPORT_SYMBOL(rtnl_link_get_net); +/* Figure out which network namespace we are talking about by + * examining the link attributes in the following order: + * + * 1. IFLA_NET_NS_PID + * 2. IFLA_NET_NS_FD + * 3. IFLA_IF_NETNSID + */ +static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net, + struct nlattr *tb[]) +{ + struct net *net; + + if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) + return rtnl_link_get_net(src_net, tb); + + if (!tb[IFLA_IF_NETNSID]) + return get_net(src_net); + + net = get_net_ns_by_id(src_net, nla_get_u32(tb[IFLA_IF_NETNSID])); + if (!net) + return ERR_PTR(-EINVAL); + + return net; +} + +static struct net *rtnl_link_get_net_capable(const struct sk_buff *skb, + struct net *src_net, + struct nlattr *tb[], int cap) +{ + struct net *net; + + net = rtnl_link_get_net_by_nlattr(src_net, tb); + if (IS_ERR(net)) + return net; + + if (!netlink_ns_capable(skb, net->user_ns, cap)) { + put_net(net); + return ERR_PTR(-EPERM); + } + + return net; +} + static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) { if (dev) { @@ -2077,17 +2213,14 @@ static int do_setlink(const struct sk_buff *skb, const struct net_device_ops *ops = dev->netdev_ops; int err; - if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) { - struct net *net = rtnl_link_get_net(dev_net(dev), tb); + if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_IF_NETNSID]) { + struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev), + tb, CAP_NET_ADMIN); if (IS_ERR(net)) { err = PTR_ERR(net); goto errout; } - if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { - put_net(net); - err = -EPERM; - goto errout; - } + err = dev_change_net_namespace(dev, net, ifname); put_net(net); if (err) @@ -2204,17 +2337,37 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_TXQLEN]) { unsigned int value = nla_get_u32(tb[IFLA_TXQLEN]); - unsigned int orig_len = dev->tx_queue_len; - - if (dev->tx_queue_len ^ value) { - dev->tx_queue_len = value; - err = call_netdevice_notifiers( - NETDEV_CHANGE_TX_QUEUE_LEN, dev); - err = notifier_to_errno(err); - if (err) { - dev->tx_queue_len = orig_len; - goto errout; - } + + err = dev_change_tx_queue_len(dev, value); + if (err) + goto errout; + status |= DO_SETLINK_MODIFIED; + } + + if (tb[IFLA_GSO_MAX_SIZE]) { + u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]); + + if (max_size > GSO_MAX_SIZE) { + err = -EINVAL; + goto errout; + } + + if (dev->gso_max_size ^ max_size) { + netif_set_gso_max_size(dev, max_size); + status |= DO_SETLINK_MODIFIED; + } + } + + if (tb[IFLA_GSO_MAX_SEGS]) { + u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); + + if (max_segs > GSO_MAX_SEGS) { + err = -EINVAL; + goto errout; + } + + if (dev->gso_max_segs ^ max_segs) { + dev->gso_max_segs = max_segs; status |= DO_SETLINK_MODIFIED; } } @@ -2400,9 +2553,6 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout; - if (tb[IFLA_IF_NETNSID]) - return -EOPNOTSUPP; - if (tb[IFLA_IFNAME]) nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else @@ -2487,36 +2637,53 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); - struct net_device *dev; + struct net *tgt_net = net; + struct net_device *dev = NULL; struct ifinfomsg *ifm; char ifname[IFNAMSIZ]; struct nlattr *tb[IFLA_MAX+1]; int err; + int netnsid = -1; err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err < 0) return err; - if (tb[IFLA_IF_NETNSID]) - return -EOPNOTSUPP; - if (tb[IFLA_IFNAME]) nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + if (tb[IFLA_IF_NETNSID]) { + netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]); + tgt_net = get_target_net(NETLINK_CB(skb).sk, netnsid); + if (IS_ERR(tgt_net)) + return PTR_ERR(tgt_net); + } + + err = -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) - dev = __dev_get_by_index(net, ifm->ifi_index); + dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME]) - dev = __dev_get_by_name(net, ifname); + dev = __dev_get_by_name(tgt_net, ifname); else if (tb[IFLA_GROUP]) - return rtnl_group_dellink(net, nla_get_u32(tb[IFLA_GROUP])); + err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP])); else - return -EINVAL; + goto out; - if (!dev) - return -ENODEV; + if (!dev) { + if (tb[IFLA_IFNAME] || ifm->ifi_index > 0) + err = -ENODEV; + + goto out; + } + + err = rtnl_delete_link(dev); + +out: + if (netnsid >= 0) + put_net(tgt_net); - return rtnl_delete_link(dev); + return err; } int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) @@ -2583,6 +2750,10 @@ struct net_device *rtnl_create_link(struct net *net, dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); if (tb[IFLA_GROUP]) dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); + if (tb[IFLA_GSO_MAX_SIZE]) + netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE])); + if (tb[IFLA_GSO_MAX_SEGS]) + dev->gso_max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); return dev; } @@ -2631,9 +2802,6 @@ replay: if (err < 0) return err; - if (tb[IFLA_IF_NETNSID]) - return -EOPNOTSUPP; - if (tb[IFLA_IFNAME]) nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else @@ -2781,14 +2949,10 @@ replay: name_assign_type = NET_NAME_ENUM; } - dest_net = rtnl_link_get_net(net, tb); + dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); if (IS_ERR(dest_net)) return PTR_ERR(dest_net); - err = -EPERM; - if (!netlink_ns_capable(skb, dest_net->user_ns, CAP_NET_ADMIN)) - goto out; - if (tb[IFLA_LINK_NETNSID]) { int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); @@ -2883,7 +3047,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[IFLA_IF_NETNSID]) { netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]); - tgt_net = get_target_net(skb, netnsid); + tgt_net = get_target_net(NETLINK_CB(skb).sk, netnsid); if (IS_ERR(tgt_net)) return PTR_ERR(tgt_net); } @@ -2915,7 +3079,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, err = rtnl_fill_ifinfo(nskb, dev, net, RTM_NEWLINK, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, 0, ext_filter_mask, - 0, NULL, netnsid); + 0, NULL, 0, netnsid); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); @@ -2973,18 +3137,26 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) s_idx = 1; for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) { + struct rtnl_link **tab; int type = cb->nlh->nlmsg_type-RTM_BASE; - struct rtnl_link *handlers; + struct rtnl_link *link; rtnl_dumpit_func dumpit; if (idx < s_idx || idx == PF_PACKET) continue; - handlers = rtnl_dereference(rtnl_msg_handlers[idx]); - if (!handlers) + if (type < 0 || type >= RTM_NR_MSGTYPES) continue; - dumpit = READ_ONCE(handlers[type].dumpit); + tab = rcu_dereference_rtnl(rtnl_msg_handlers[idx]); + if (!tab) + continue; + + link = tab[type]; + if (!link) + continue; + + dumpit = link->dumpit; if (!dumpit) continue; @@ -3003,7 +3175,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned int change, - u32 event, gfp_t flags, int *new_nsid) + u32 event, gfp_t flags, int *new_nsid, + int new_ifindex) { struct net *net = dev_net(dev); struct sk_buff *skb; @@ -3016,7 +3189,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, err = rtnl_fill_ifinfo(skb, dev, dev_net(dev), type, 0, 0, change, 0, 0, event, - new_nsid, -1); + new_nsid, new_ifindex, -1); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -3039,14 +3212,15 @@ void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags) static void rtmsg_ifinfo_event(int type, struct net_device *dev, unsigned int change, u32 event, - gfp_t flags, int *new_nsid) + gfp_t flags, int *new_nsid, int new_ifindex) { struct sk_buff *skb; if (dev->reg_state != NETREG_REGISTERED) return; - skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid); + skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid, + new_ifindex); if (skb) rtmsg_ifinfo_send(skb, dev, flags); } @@ -3054,14 +3228,15 @@ static void rtmsg_ifinfo_event(int type, struct net_device *dev, void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, gfp_t flags) { - rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, NULL); + rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, + NULL, 0); } void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, - gfp_t flags, int *new_nsid) + gfp_t flags, int *new_nsid, int new_ifindex) { rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, - new_nsid); + new_nsid, new_ifindex); } static int nlmsg_populate_fdb_fill(struct sk_buff *skb, @@ -4314,7 +4489,8 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); - struct rtnl_link *handlers; + struct rtnl_link *link; + struct module *owner; int err = -EOPNOTSUPP; rtnl_doit_func doit; unsigned int flags; @@ -4338,79 +4514,85 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, if (kind != 2 && !netlink_net_capable(skb, CAP_NET_ADMIN)) return -EPERM; - if (family >= ARRAY_SIZE(rtnl_msg_handlers)) - family = PF_UNSPEC; - rcu_read_lock(); - handlers = rcu_dereference(rtnl_msg_handlers[family]); - if (!handlers) { - family = PF_UNSPEC; - handlers = rcu_dereference(rtnl_msg_handlers[family]); - } - if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { struct sock *rtnl; rtnl_dumpit_func dumpit; u16 min_dump_alloc = 0; - dumpit = READ_ONCE(handlers[type].dumpit); - if (!dumpit) { + link = rtnl_get_link(family, type); + if (!link || !link->dumpit) { family = PF_UNSPEC; - handlers = rcu_dereference(rtnl_msg_handlers[PF_UNSPEC]); - if (!handlers) - goto err_unlock; - - dumpit = READ_ONCE(handlers[type].dumpit); - if (!dumpit) + link = rtnl_get_link(family, type); + if (!link || !link->dumpit) goto err_unlock; } - - refcount_inc(&rtnl_msg_handlers_ref[family]); + owner = link->owner; + dumpit = link->dumpit; if (type == RTM_GETLINK - RTM_BASE) min_dump_alloc = rtnl_calcit(skb, nlh); + err = 0; + /* need to do this before rcu_read_unlock() */ + if (!try_module_get(owner)) + err = -EPROTONOSUPPORT; + rcu_read_unlock(); rtnl = net->rtnl; - { + if (err == 0) { struct netlink_dump_control c = { .dump = dumpit, .min_dump_alloc = min_dump_alloc, + .module = owner, }; err = netlink_dump_start(rtnl, skb, nlh, &c); + /* netlink_dump_start() will keep a reference on + * module if dump is still in progress. + */ + module_put(owner); } - refcount_dec(&rtnl_msg_handlers_ref[family]); return err; } - doit = READ_ONCE(handlers[type].doit); - if (!doit) { + link = rtnl_get_link(family, type); + if (!link || !link->doit) { family = PF_UNSPEC; - handlers = rcu_dereference(rtnl_msg_handlers[family]); + link = rtnl_get_link(PF_UNSPEC, type); + if (!link || !link->doit) + goto out_unlock; + } + + owner = link->owner; + if (!try_module_get(owner)) { + err = -EPROTONOSUPPORT; + goto out_unlock; } - flags = READ_ONCE(handlers[type].flags); + flags = link->flags; if (flags & RTNL_FLAG_DOIT_UNLOCKED) { - refcount_inc(&rtnl_msg_handlers_ref[family]); - doit = READ_ONCE(handlers[type].doit); + doit = link->doit; rcu_read_unlock(); if (doit) err = doit(skb, nlh, extack); - refcount_dec(&rtnl_msg_handlers_ref[family]); + module_put(owner); return err; } - rcu_read_unlock(); rtnl_lock(); - handlers = rtnl_dereference(rtnl_msg_handlers[family]); - if (handlers) { - doit = READ_ONCE(handlers[type].doit); - if (doit) - err = doit(skb, nlh, extack); - } + link = rtnl_get_link(family, type); + if (link && link->doit) + err = link->doit(skb, nlh, extack); rtnl_unlock(); + + module_put(owner); + + return err; + +out_unlock: + rcu_read_unlock(); return err; err_unlock: @@ -4454,7 +4636,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_CHANGELOWERSTATE: case NETDEV_CHANGE_TX_QUEUE_LEN: rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event), - GFP_KERNEL, NULL); + GFP_KERNEL, NULL, 0); break; default: break; @@ -4498,11 +4680,6 @@ static struct pernet_operations rtnetlink_net_ops = { void __init rtnetlink_init(void) { - int i; - - for (i = 0; i < ARRAY_SIZE(rtnl_msg_handlers_ref); i++) - refcount_set(&rtnl_msg_handlers_ref[i], 1); - if (register_pernet_subsys(&rtnetlink_net_ops)) panic("rtnetlink_init: cannot initialize rtnetlink\n"); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6b0ff396fa9d..8c61c27c1b28 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1177,12 +1177,12 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) int i, new_frags; u32 d_off; - if (!num_frags) - return 0; - if (skb_shared(skb) || skb_unclone(skb, gfp_mask)) return -EINVAL; + if (!num_frags) + goto release; + new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT; for (i = 0; i < new_frags; i++) { page = alloc_page(gfp_mask); @@ -1238,6 +1238,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off); skb_shinfo(skb)->nr_frags = new_frags; +release: skb_zcopy_clear(skb, false); return 0; } @@ -3654,7 +3655,9 @@ normal: skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags & SKBTX_SHARED_FRAG; - if (skb_zerocopy_clone(nskb, head_skb, GFP_ATOMIC)) + + if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || + skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) goto err; while (pos < offset + len) { @@ -3668,6 +3671,11 @@ normal: BUG_ON(!nfrags); + if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || + skb_zerocopy_clone(nskb, frag_skb, + GFP_ATOMIC)) + goto err; + list_skb = list_skb->next; } @@ -3679,9 +3687,6 @@ normal: goto err; } - if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC))) - goto err; - *nskb_frag = *frag; __skb_frag_ref(nskb_frag); size = skb_frag_size(nskb_frag); @@ -4293,7 +4298,7 @@ void skb_complete_tx_timestamp(struct sk_buff *skb, struct sock *sk = skb->sk; if (!skb_may_tx_timestamp(sk, false)) - return; + goto err; /* Take a reference to prevent skb_orphan() from freeing the socket, * but only if the socket refcount is not zero. @@ -4302,7 +4307,11 @@ void skb_complete_tx_timestamp(struct sk_buff *skb, *skb_hwtstamps(skb) = *hwtstamps; __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false); sock_put(sk); + return; } + +err: + kfree_skb(skb); } EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); @@ -4905,37 +4914,74 @@ unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) EXPORT_SYMBOL_GPL(skb_gso_transport_seglen); /** - * skb_gso_validate_mtu - Return in case such skb fits a given MTU + * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS * - * @skb: GSO skb - * @mtu: MTU to validate against + * There are a couple of instances where we have a GSO skb, and we + * want to determine what size it would be after it is segmented. * - * skb_gso_validate_mtu validates if a given skb will fit a wanted MTU - * once split. + * We might want to check: + * - L3+L4+payload size (e.g. IP forwarding) + * - L2+L3+L4+payload size (e.g. sanity check before passing to driver) + * + * This is a helper to do that correctly considering GSO_BY_FRAGS. + * + * @seg_len: The segmented length (from skb_gso_*_seglen). In the + * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS]. + * + * @max_len: The maximum permissible length. + * + * Returns true if the segmented length <= max length. */ -bool skb_gso_validate_mtu(const struct sk_buff *skb, unsigned int mtu) -{ +static inline bool skb_gso_size_check(const struct sk_buff *skb, + unsigned int seg_len, + unsigned int max_len) { const struct skb_shared_info *shinfo = skb_shinfo(skb); const struct sk_buff *iter; - unsigned int hlen; - - hlen = skb_gso_network_seglen(skb); if (shinfo->gso_size != GSO_BY_FRAGS) - return hlen <= mtu; + return seg_len <= max_len; /* Undo this so we can re-use header sizes */ - hlen -= GSO_BY_FRAGS; + seg_len -= GSO_BY_FRAGS; skb_walk_frags(skb, iter) { - if (hlen + skb_headlen(iter) > mtu) + if (seg_len + skb_headlen(iter) > max_len) return false; } return true; } + +/** + * skb_gso_validate_mtu - Return in case such skb fits a given MTU + * + * @skb: GSO skb + * @mtu: MTU to validate against + * + * skb_gso_validate_mtu validates if a given skb will fit a wanted MTU + * once split. + */ +bool skb_gso_validate_mtu(const struct sk_buff *skb, unsigned int mtu) +{ + return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu); +} EXPORT_SYMBOL_GPL(skb_gso_validate_mtu); +/** + * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length? + * + * @skb: GSO skb + * @len: length to validate against + * + * skb_gso_validate_mac_len validates if a given skb will fit a wanted + * length once split, including L2, L3 and L4 headers and the payload. + */ +bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len) +{ + return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len); +} +EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len); + static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) { if (skb_cow(skb, skb_headroom(skb)) < 0) { diff --git a/net/core/sock.c b/net/core/sock.c index c0b5b2f17412..e50e7b3f2223 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -145,6 +145,8 @@ static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); +static void sock_inuse_add(struct net *net, int val); + /** * sk_ns_capable - General socket capability test * @sk: Socket to use a capability on or through @@ -1531,8 +1533,11 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, sk->sk_kern_sock = kern; sock_lock_init(sk); sk->sk_net_refcnt = kern ? 0 : 1; - if (likely(sk->sk_net_refcnt)) + if (likely(sk->sk_net_refcnt)) { get_net(net); + sock_inuse_add(net, 1); + } + sock_net_set(sk, net); refcount_set(&sk->sk_wmem_alloc, 1); @@ -1595,6 +1600,9 @@ void sk_destruct(struct sock *sk) static void __sk_free(struct sock *sk) { + if (likely(sk->sk_net_refcnt)) + sock_inuse_add(sock_net(sk), -1); + if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt)) sock_diag_broadcast_destroy(sk); else @@ -1675,16 +1683,13 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_dst_pending_confirm = 0; newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; - - /* sk->sk_memcg will be populated at accept() time */ - newsk->sk_memcg = NULL; - atomic_set(&newsk->sk_drops, 0); newsk->sk_send_head = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; atomic_set(&newsk->sk_zckey, 0); sock_reset_flag(newsk, SOCK_DONE); + mem_cgroup_sk_alloc(newsk); cgroup_sk_alloc(&newsk->sk_cgrp_data); rcu_read_lock(); @@ -1716,6 +1721,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_priority = 0; newsk->sk_incoming_cpu = raw_smp_processor_id(); atomic64_set(&newsk->sk_cookie, 0); + if (likely(newsk->sk_net_refcnt)) + sock_inuse_add(sock_net(newsk), 1); /* * Before updating sk_refcnt, we must commit prior changes to memory @@ -2496,7 +2503,7 @@ int sock_no_getname(struct socket *sock, struct sockaddr *saddr, } EXPORT_SYMBOL(sock_no_getname); -unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt) +__poll_t sock_no_poll(struct file *file, struct socket *sock, poll_table *pt) { return 0; } @@ -3045,7 +3052,7 @@ static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) { - __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val); + __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val); } EXPORT_SYMBOL_GPL(sock_prot_inuse_add); @@ -3055,21 +3062,50 @@ int sock_prot_inuse_get(struct net *net, struct proto *prot) int res = 0; for_each_possible_cpu(cpu) - res += per_cpu_ptr(net->core.inuse, cpu)->val[idx]; + res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; return res >= 0 ? res : 0; } EXPORT_SYMBOL_GPL(sock_prot_inuse_get); +static void sock_inuse_add(struct net *net, int val) +{ + this_cpu_add(*net->core.sock_inuse, val); +} + +int sock_inuse_get(struct net *net) +{ + int cpu, res = 0; + + for_each_possible_cpu(cpu) + res += *per_cpu_ptr(net->core.sock_inuse, cpu); + + return res; +} + +EXPORT_SYMBOL_GPL(sock_inuse_get); + static int __net_init sock_inuse_init_net(struct net *net) { - net->core.inuse = alloc_percpu(struct prot_inuse); - return net->core.inuse ? 0 : -ENOMEM; + net->core.prot_inuse = alloc_percpu(struct prot_inuse); + if (net->core.prot_inuse == NULL) + return -ENOMEM; + + net->core.sock_inuse = alloc_percpu(int); + if (net->core.sock_inuse == NULL) + goto out; + + return 0; + +out: + free_percpu(net->core.prot_inuse); + return -ENOMEM; } static void __net_exit sock_inuse_exit_net(struct net *net) { - free_percpu(net->core.inuse); + free_percpu(net->core.prot_inuse); + free_percpu(net->core.sock_inuse); } static struct pernet_operations net_inuse_ops = { @@ -3112,6 +3148,10 @@ static inline void assign_proto_idx(struct proto *prot) static inline void release_proto_idx(struct proto *prot) { } + +static void sock_inuse_add(struct net *net, int val) +{ +} #endif static void req_prot_cleanup(struct request_sock_ops *rsk_prot) @@ -3319,7 +3359,6 @@ static int proto_seq_open(struct inode *inode, struct file *file) } static const struct file_operations proto_seq_fops = { - .owner = THIS_MODULE, .open = proto_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 217f4e3b82f6..146b50e30659 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -288,7 +288,7 @@ static int sock_diag_bind(struct net *net, int group) case SKNLGRP_INET6_UDP_DESTROY: if (!sock_diag_handlers[AF_INET6]) request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, - NETLINK_SOCK_DIAG, AF_INET); + NETLINK_SOCK_DIAG, AF_INET6); break; } return 0; diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 5eeb1d20cc38..064acb04be0f 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -94,6 +94,16 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) return more_reuse; } +static void reuseport_free_rcu(struct rcu_head *head) +{ + struct sock_reuseport *reuse; + + reuse = container_of(head, struct sock_reuseport, rcu); + if (reuse->prog) + bpf_prog_destroy(reuse->prog); + kfree(reuse); +} + /** * reuseport_add_sock - Add a socket to the reuseport group of another. * @sk: New socket to add to the group. @@ -102,7 +112,7 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) */ int reuseport_add_sock(struct sock *sk, struct sock *sk2) { - struct sock_reuseport *reuse; + struct sock_reuseport *old_reuse, *reuse; if (!rcu_access_pointer(sk2->sk_reuseport_cb)) { int err = reuseport_alloc(sk2); @@ -113,10 +123,13 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2) spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk2->sk_reuseport_cb, - lockdep_is_held(&reuseport_lock)), - WARN_ONCE(rcu_dereference_protected(sk->sk_reuseport_cb, - lockdep_is_held(&reuseport_lock)), - "socket already in reuseport group"); + lockdep_is_held(&reuseport_lock)); + old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + if (old_reuse && old_reuse->num_socks != 1) { + spin_unlock_bh(&reuseport_lock); + return -EBUSY; + } if (reuse->num_socks == reuse->max_socks) { reuse = reuseport_grow(reuse); @@ -134,19 +147,11 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2) spin_unlock_bh(&reuseport_lock); + if (old_reuse) + call_rcu(&old_reuse->rcu, reuseport_free_rcu); return 0; } -static void reuseport_free_rcu(struct rcu_head *head) -{ - struct sock_reuseport *reuse; - - reuse = container_of(head, struct sock_reuseport, rcu); - if (reuse->prog) - bpf_prog_destroy(reuse->prog); - kfree(reuse); -} - void reuseport_detach_sock(struct sock *sk) { struct sock_reuseport *reuse; @@ -235,7 +240,9 @@ struct sock *reuseport_select_sock(struct sock *sk, if (prog && skb) sk2 = run_bpf(reuse, socks, prog, skb, hdr_len); - else + + /* no bpf or invalid bpf result: fall back to hash usage */ + if (!sk2) sk2 = reuse->socks[reciprocal_scale(hash, socks)]; } diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index cbc3dde4cfcc..f2d0462611c3 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -25,6 +25,7 @@ static int zero = 0; static int one = 1; +static int two __maybe_unused = 2; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; @@ -250,6 +251,46 @@ static int proc_do_rss_key(struct ctl_table *table, int write, return proc_dostring(&fake_table, write, buffer, lenp, ppos); } +#ifdef CONFIG_BPF_JIT +static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret, jit_enable = *(int *)table->data; + struct ctl_table tmp = *table; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + tmp.data = &jit_enable; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret) { + if (jit_enable < 2 || + (jit_enable == 2 && bpf_dump_raw_ok())) { + *(int *)table->data = jit_enable; + if (jit_enable == 2) + pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n"); + } else { + ret = -EPERM; + } + } + return ret; +} + +# ifdef CONFIG_HAVE_EBPF_JIT +static int +proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} +# endif +#endif + static struct ctl_table net_core_table[] = { #ifdef CONFIG_NET { @@ -325,7 +366,14 @@ static struct ctl_table net_core_table[] = { .data = &bpf_jit_enable, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax_bpf_enable, +# ifdef CONFIG_BPF_JIT_ALWAYS_ON + .extra1 = &one, + .extra2 = &one, +# else + .extra1 = &zero, + .extra2 = &two, +# endif }, # ifdef CONFIG_HAVE_EBPF_JIT { @@ -333,14 +381,18 @@ static struct ctl_table net_core_table[] = { .data = &bpf_jit_harden, .maxlen = sizeof(int), .mode = 0600, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax_bpf_restricted, + .extra1 = &zero, + .extra2 = &two, }, { .procname = "bpf_jit_kallsyms", .data = &bpf_jit_kallsyms, .maxlen = sizeof(int), .mode = 0600, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax_bpf_restricted, + .extra1 = &zero, + .extra2 = &one, }, # endif #endif diff --git a/net/core/xdp.c b/net/core/xdp.c new file mode 100644 index 000000000000..097a0f74e004 --- /dev/null +++ b/net/core/xdp.c @@ -0,0 +1,73 @@ +/* net/core/xdp.c + * + * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. + * Released under terms in GPL version 2. See COPYING. + */ +#include <linux/types.h> +#include <linux/mm.h> + +#include <net/xdp.h> + +#define REG_STATE_NEW 0x0 +#define REG_STATE_REGISTERED 0x1 +#define REG_STATE_UNREGISTERED 0x2 +#define REG_STATE_UNUSED 0x3 + +void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) +{ + /* Simplify driver cleanup code paths, allow unreg "unused" */ + if (xdp_rxq->reg_state == REG_STATE_UNUSED) + return; + + WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG"); + + xdp_rxq->reg_state = REG_STATE_UNREGISTERED; + xdp_rxq->dev = NULL; +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg); + +static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq) +{ + memset(xdp_rxq, 0, sizeof(*xdp_rxq)); +} + +/* Returns 0 on success, negative on failure */ +int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, + struct net_device *dev, u32 queue_index) +{ + if (xdp_rxq->reg_state == REG_STATE_UNUSED) { + WARN(1, "Driver promised not to register this"); + return -EINVAL; + } + + if (xdp_rxq->reg_state == REG_STATE_REGISTERED) { + WARN(1, "Missing unregister, handled but fix driver"); + xdp_rxq_info_unreg(xdp_rxq); + } + + if (!dev) { + WARN(1, "Missing net_device from driver"); + return -ENODEV; + } + + /* State either UNREGISTERED or NEW */ + xdp_rxq_info_init(xdp_rxq); + xdp_rxq->dev = dev; + xdp_rxq->queue_index = queue_index; + + xdp_rxq->reg_state = REG_STATE_REGISTERED; + return 0; +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_reg); + +void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq) +{ + xdp_rxq->reg_state = REG_STATE_UNUSED; +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_unused); + +bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq) +{ + return (xdp_rxq->reg_state == REG_STATE_REGISTERED); +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg); diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig index 8c0ef71bed2f..b270e84d9c13 100644 --- a/net/dccp/Kconfig +++ b/net/dccp/Kconfig @@ -39,23 +39,6 @@ config IP_DCCP_DEBUG Just say N. -config NET_DCCPPROBE - tristate "DCCP connection probing" - depends on PROC_FS && KPROBES - ---help--- - This module allows for capturing the changes to DCCP connection - state in response to incoming packets. It is used for debugging - DCCP congestion avoidance modules. If you don't understand - what was just said, you don't need it: say N. - - Documentation on how to use DCCP connection probing can be found - at: - - http://www.linuxfoundation.org/collaborate/workgroups/networking/dccpprobe - - To compile this code as a module, choose M here: the - module will be called dccp_probe. - endmenu diff --git a/net/dccp/Makefile b/net/dccp/Makefile index 2e7b56097bc4..5b4ff37bc806 100644 --- a/net/dccp/Makefile +++ b/net/dccp/Makefile @@ -21,9 +21,10 @@ obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o dccp_ipv6-y := ipv6.o obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o -obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o dccp-$(CONFIG_SYSCTL) += sysctl.o dccp_diag-y := diag.o -dccp_probe-y := probe.o + +# build with local directory for trace.h +CFLAGS_proto.o := -I$(src) diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index 3de0d0362d7f..2a24f7d171a5 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -228,7 +228,7 @@ static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets, } if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) { - DCCP_CRIT("Ack Vector buffer overflow: dropping old entries\n"); + DCCP_CRIT("Ack Vector buffer overflow: dropping old entries"); av->av_overflow = true; } diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 1c75cd1255f6..92d016e87816 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -140,6 +140,9 @@ static void ccid2_hc_tx_rto_expire(struct timer_list *t) ccid2_pr_debug("RTO_EXPIRE\n"); + if (sk->sk_state == DCCP_CLOSED) + goto out; + /* back-off timer */ hc->tx_rto <<= 1; if (hc->tx_rto > DCCP_RTO_MAX) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 0c55ffb859bf..f91e3816806b 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -316,7 +316,7 @@ int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); void dccp_shutdown(struct sock *sk, int how); int inet_dccp_listen(struct socket *sock, int backlog); -unsigned int dccp_poll(struct file *file, struct socket *sock, +__poll_t dccp_poll(struct file *file, struct socket *sock, poll_table *wait); int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); void dccp_req_err(struct sock *sk, u64 seq); diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index abd07a443219..37ccbe62eb1a 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -57,10 +57,17 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) if (state == DCCP_TIME_WAIT) timeo = DCCP_TIMEWAIT_LEN; + /* tw_timer is pinned, so we need to make sure BH are disabled + * in following section, otherwise timer handler could run before + * we complete the initialization. + */ + local_bh_disable(); inet_twsk_schedule(tw, timeo); - /* Linkage updates. */ - __inet_twsk_hashdance(tw, sk, &dccp_hashinfo); - inet_twsk_put(tw); + /* Linkage updates. + * Note that access to tw after this point is illegal. + */ + inet_twsk_hashdance(tw, sk, &dccp_hashinfo); + local_bh_enable(); } else { /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than diff --git a/net/dccp/probe.c b/net/dccp/probe.c deleted file mode 100644 index 3d3fda05b32d..000000000000 --- a/net/dccp/probe.c +++ /dev/null @@ -1,203 +0,0 @@ -/* - * dccp_probe - Observe the DCCP flow with kprobes. - * - * The idea for this came from Werner Almesberger's umlsim - * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org> - * - * Modified for DCCP from Stephen Hemminger's code - * Copyright (C) 2006, Ian McDonald <ian.mcdonald@jandi.co.nz> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include <linux/kernel.h> -#include <linux/kprobes.h> -#include <linux/socket.h> -#include <linux/dccp.h> -#include <linux/proc_fs.h> -#include <linux/module.h> -#include <linux/kfifo.h> -#include <linux/vmalloc.h> -#include <linux/time64.h> -#include <linux/gfp.h> -#include <net/net_namespace.h> - -#include "dccp.h" -#include "ccid.h" -#include "ccids/ccid3.h" - -static int port; - -static int bufsize = 64 * 1024; - -static const char procname[] = "dccpprobe"; - -static struct { - struct kfifo fifo; - spinlock_t lock; - wait_queue_head_t wait; - struct timespec64 tstart; -} dccpw; - -static void printl(const char *fmt, ...) -{ - va_list args; - int len; - struct timespec64 now; - char tbuf[256]; - - va_start(args, fmt); - getnstimeofday64(&now); - - now = timespec64_sub(now, dccpw.tstart); - - len = sprintf(tbuf, "%lu.%06lu ", - (unsigned long) now.tv_sec, - (unsigned long) now.tv_nsec / NSEC_PER_USEC); - len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args); - va_end(args); - - kfifo_in_locked(&dccpw.fifo, tbuf, len, &dccpw.lock); - wake_up(&dccpw.wait); -} - -static int jdccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) -{ - const struct inet_sock *inet = inet_sk(sk); - struct ccid3_hc_tx_sock *hc = NULL; - - if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3) - hc = ccid3_hc_tx_sk(sk); - - if (port == 0 || ntohs(inet->inet_dport) == port || - ntohs(inet->inet_sport) == port) { - if (hc) - printl("%pI4:%u %pI4:%u %d %d %d %d %u %llu %llu %d\n", - &inet->inet_saddr, ntohs(inet->inet_sport), - &inet->inet_daddr, ntohs(inet->inet_dport), size, - hc->tx_s, hc->tx_rtt, hc->tx_p, - hc->tx_x_calc, hc->tx_x_recv >> 6, - hc->tx_x >> 6, hc->tx_t_ipi); - else - printl("%pI4:%u %pI4:%u %d\n", - &inet->inet_saddr, ntohs(inet->inet_sport), - &inet->inet_daddr, ntohs(inet->inet_dport), - size); - } - - jprobe_return(); - return 0; -} - -static struct jprobe dccp_send_probe = { - .kp = { - .symbol_name = "dccp_sendmsg", - }, - .entry = jdccp_sendmsg, -}; - -static int dccpprobe_open(struct inode *inode, struct file *file) -{ - kfifo_reset(&dccpw.fifo); - getnstimeofday64(&dccpw.tstart); - return 0; -} - -static ssize_t dccpprobe_read(struct file *file, char __user *buf, - size_t len, loff_t *ppos) -{ - int error = 0, cnt = 0; - unsigned char *tbuf; - - if (!buf) - return -EINVAL; - - if (len == 0) - return 0; - - tbuf = vmalloc(len); - if (!tbuf) - return -ENOMEM; - - error = wait_event_interruptible(dccpw.wait, - kfifo_len(&dccpw.fifo) != 0); - if (error) - goto out_free; - - cnt = kfifo_out_locked(&dccpw.fifo, tbuf, len, &dccpw.lock); - error = copy_to_user(buf, tbuf, cnt) ? -EFAULT : 0; - -out_free: - vfree(tbuf); - - return error ? error : cnt; -} - -static const struct file_operations dccpprobe_fops = { - .owner = THIS_MODULE, - .open = dccpprobe_open, - .read = dccpprobe_read, - .llseek = noop_llseek, -}; - -static __init int dccpprobe_init(void) -{ - int ret = -ENOMEM; - - init_waitqueue_head(&dccpw.wait); - spin_lock_init(&dccpw.lock); - if (kfifo_alloc(&dccpw.fifo, bufsize, GFP_KERNEL)) - return ret; - if (!proc_create(procname, S_IRUSR, init_net.proc_net, &dccpprobe_fops)) - goto err0; - - ret = register_jprobe(&dccp_send_probe); - if (ret) { - ret = request_module("dccp"); - if (!ret) - ret = register_jprobe(&dccp_send_probe); - } - - if (ret) - goto err1; - - pr_info("DCCP watch registered (port=%d)\n", port); - return 0; -err1: - remove_proc_entry(procname, init_net.proc_net); -err0: - kfifo_free(&dccpw.fifo); - return ret; -} -module_init(dccpprobe_init); - -static __exit void dccpprobe_exit(void) -{ - kfifo_free(&dccpw.fifo); - remove_proc_entry(procname, init_net.proc_net); - unregister_jprobe(&dccp_send_probe); - -} -module_exit(dccpprobe_exit); - -MODULE_PARM_DESC(port, "Port to match (0=all)"); -module_param(port, int, 0); - -MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)"); -module_param(bufsize, int, 0); - -MODULE_AUTHOR("Ian McDonald <ian.mcdonald@jandi.co.nz>"); -MODULE_DESCRIPTION("DCCP snooper"); -MODULE_LICENSE("GPL"); diff --git a/net/dccp/proto.c b/net/dccp/proto.c index b68168fcc06a..74685fecfdb9 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -38,6 +38,9 @@ #include "dccp.h" #include "feat.h" +#define CREATE_TRACE_POINTS +#include "trace.h" + DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly; EXPORT_SYMBOL_GPL(dccp_statistics); @@ -110,7 +113,7 @@ void dccp_set_state(struct sock *sk, const int state) /* Change state AFTER socket is unhashed to avoid closed * socket sitting in hash tables. */ - sk->sk_state = state; + inet_sk_set_state(sk, state); } EXPORT_SYMBOL_GPL(dccp_set_state); @@ -259,6 +262,7 @@ int dccp_disconnect(struct sock *sk, int flags) { struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); + struct dccp_sock *dp = dccp_sk(sk); int err = 0; const int old_state = sk->sk_state; @@ -278,6 +282,10 @@ int dccp_disconnect(struct sock *sk, int flags) sk->sk_err = ECONNRESET; dccp_clear_xmit_timers(sk); + ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); + ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); + dp->dccps_hc_rx_ccid = NULL; + dp->dccps_hc_tx_ccid = NULL; __skb_queue_purge(&sk->sk_receive_queue); __skb_queue_purge(&sk->sk_write_queue); @@ -313,10 +321,10 @@ EXPORT_SYMBOL_GPL(dccp_disconnect); * take care of normal races (between the test and the event) and we don't * go look at any of the socket buffers directly. */ -unsigned int dccp_poll(struct file *file, struct socket *sock, +__poll_t dccp_poll(struct file *file, struct socket *sock, poll_table *wait) { - unsigned int mask; + __poll_t mask; struct sock *sk = sock->sk; sock_poll_wait(file, sk_sleep(sk), wait); @@ -756,6 +764,8 @@ int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int rc, size; long timeo; + trace_dccp_probe(sk, len); + if (len > dp->dccps_mss_cache) return -EMSGSIZE; diff --git a/net/dccp/trace.h b/net/dccp/trace.h new file mode 100644 index 000000000000..5062421beee9 --- /dev/null +++ b/net/dccp/trace.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM dccp + +#if !defined(_TRACE_DCCP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_DCCP_H + +#include <net/sock.h> +#include "dccp.h" +#include "ccids/ccid3.h" +#include <linux/tracepoint.h> +#include <trace/events/net_probe_common.h> + +TRACE_EVENT(dccp_probe, + + TP_PROTO(struct sock *sk, size_t size), + + TP_ARGS(sk, size), + + TP_STRUCT__entry( + /* sockaddr_in6 is always bigger than sockaddr_in */ + __array(__u8, saddr, sizeof(struct sockaddr_in6)) + __array(__u8, daddr, sizeof(struct sockaddr_in6)) + __field(__u16, sport) + __field(__u16, dport) + __field(__u16, size) + __field(__u16, tx_s) + __field(__u32, tx_rtt) + __field(__u32, tx_p) + __field(__u32, tx_x_calc) + __field(__u64, tx_x_recv) + __field(__u64, tx_x) + __field(__u32, tx_t_ipi) + ), + + TP_fast_assign( + const struct inet_sock *inet = inet_sk(sk); + struct ccid3_hc_tx_sock *hc = NULL; + + if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3) + hc = ccid3_hc_tx_sk(sk); + + memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); + memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); + + TP_STORE_ADDR_PORTS(__entry, inet, sk); + + /* For filtering use */ + __entry->sport = ntohs(inet->inet_sport); + __entry->dport = ntohs(inet->inet_dport); + + __entry->size = size; + if (hc) { + __entry->tx_s = hc->tx_s; + __entry->tx_rtt = hc->tx_rtt; + __entry->tx_p = hc->tx_p; + __entry->tx_x_calc = hc->tx_x_calc; + __entry->tx_x_recv = hc->tx_x_recv >> 6; + __entry->tx_x = hc->tx_x >> 6; + __entry->tx_t_ipi = hc->tx_t_ipi; + } else { + __entry->tx_s = 0; + memset(&__entry->tx_rtt, 0, (void *)&__entry->tx_t_ipi - + (void *)&__entry->tx_rtt + + sizeof(__entry->tx_t_ipi)); + } + ), + + TP_printk("src=%pISpc dest=%pISpc size=%d tx_s=%d tx_rtt=%d " + "tx_p=%d tx_x_calc=%u tx_x_recv=%llu tx_x=%llu tx_t_ipi=%d", + __entry->saddr, __entry->daddr, __entry->size, + __entry->tx_s, __entry->tx_rtt, __entry->tx_p, + __entry->tx_x_calc, __entry->tx_x_recv, __entry->tx_x, + __entry->tx_t_ipi) +); + +#endif /* _TRACE_TCP_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace +#include <trace/define_trace.h> diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 518cea17b811..cc1b505453a8 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1209,11 +1209,11 @@ static int dn_getname(struct socket *sock, struct sockaddr *uaddr,int *uaddr_len } -static unsigned int dn_poll(struct file *file, struct socket *sock, poll_table *wait) +static __poll_t dn_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct dn_scp *scp = DN_SK(sk); - int mask = datagram_poll(file, sock, wait); + __poll_t mask = datagram_poll(file, sock, wait); if (!skb_queue_empty(&scp->other_receive_queue)) mask |= POLLRDBAND; @@ -2320,7 +2320,6 @@ static int dn_socket_seq_open(struct inode *inode, struct file *file) } static const struct file_operations dn_socket_seq_fops = { - .owner = THIS_MODULE, .open = dn_socket_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 9153247dad28..c9f5e1ebb9c8 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -1389,7 +1389,6 @@ static int dn_dev_seq_open(struct inode *inode, struct file *file) } static const struct file_operations dn_dev_seq_fops = { - .owner = THIS_MODULE, .open = dn_dev_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -1418,9 +1417,12 @@ void __init dn_dev_init(void) dn_dev_devices_on(); - rtnl_register(PF_DECnet, RTM_NEWADDR, dn_nl_newaddr, NULL, 0); - rtnl_register(PF_DECnet, RTM_DELADDR, dn_nl_deladdr, NULL, 0); - rtnl_register(PF_DECnet, RTM_GETADDR, NULL, dn_nl_dump_ifaddr, 0); + rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_NEWADDR, + dn_nl_newaddr, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_DELADDR, + dn_nl_deladdr, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_GETADDR, + NULL, dn_nl_dump_ifaddr, 0); proc_create("decnet_dev", S_IRUGO, init_net.proc_net, &dn_dev_seq_fops); diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c index b37a1b833c77..fce94cbd4378 100644 --- a/net/decnet/dn_fib.c +++ b/net/decnet/dn_fib.c @@ -792,8 +792,10 @@ void __init dn_fib_init(void) register_dnaddr_notifier(&dn_fib_dnaddr_notifier); - rtnl_register(PF_DECnet, RTM_NEWROUTE, dn_fib_rtm_newroute, NULL, 0); - rtnl_register(PF_DECnet, RTM_DELROUTE, dn_fib_rtm_delroute, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_NEWROUTE, + dn_fib_rtm_newroute, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_DELROUTE, + dn_fib_rtm_delroute, NULL, 0); } diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index 528119a5618e..6e37d9e6345e 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -597,7 +597,6 @@ static int dn_neigh_seq_open(struct inode *inode, struct file *file) } static const struct file_operations dn_neigh_seq_fops = { - .owner = THIS_MODULE, .open = dn_neigh_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 324cb9f2f551..ef20b8e31669 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -199,11 +199,11 @@ static void dn_dst_check_expire(struct timer_list *unused) lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) { if (atomic_read(&rt->dst.__refcnt) > 1 || (now - rt->dst.lastuse) < expire) { - rtp = &rt->dst.dn_next; + rtp = &rt->dn_next; continue; } - *rtp = rt->dst.dn_next; - rt->dst.dn_next = NULL; + *rtp = rt->dn_next; + rt->dn_next = NULL; dst_dev_put(&rt->dst); dst_release(&rt->dst); } @@ -233,11 +233,11 @@ static int dn_dst_gc(struct dst_ops *ops) lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) { if (atomic_read(&rt->dst.__refcnt) > 1 || (now - rt->dst.lastuse) < expire) { - rtp = &rt->dst.dn_next; + rtp = &rt->dn_next; continue; } - *rtp = rt->dst.dn_next; - rt->dst.dn_next = NULL; + *rtp = rt->dn_next; + rt->dn_next = NULL; dst_dev_put(&rt->dst); dst_release(&rt->dst); break; @@ -333,8 +333,8 @@ static int dn_insert_route(struct dn_route *rt, unsigned int hash, struct dn_rou lockdep_is_held(&dn_rt_hash_table[hash].lock))) != NULL) { if (compare_keys(&rth->fld, &rt->fld)) { /* Put it first */ - *rthp = rth->dst.dn_next; - rcu_assign_pointer(rth->dst.dn_next, + *rthp = rth->dn_next; + rcu_assign_pointer(rth->dn_next, dn_rt_hash_table[hash].chain); rcu_assign_pointer(dn_rt_hash_table[hash].chain, rth); @@ -345,10 +345,10 @@ static int dn_insert_route(struct dn_route *rt, unsigned int hash, struct dn_rou *rp = rth; return 0; } - rthp = &rth->dst.dn_next; + rthp = &rth->dn_next; } - rcu_assign_pointer(rt->dst.dn_next, dn_rt_hash_table[hash].chain); + rcu_assign_pointer(rt->dn_next, dn_rt_hash_table[hash].chain); rcu_assign_pointer(dn_rt_hash_table[hash].chain, rt); dst_hold_and_use(&rt->dst, now); @@ -369,8 +369,8 @@ static void dn_run_flush(struct timer_list *unused) goto nothing_to_declare; for(; rt; rt = next) { - next = rcu_dereference_raw(rt->dst.dn_next); - RCU_INIT_POINTER(rt->dst.dn_next, NULL); + next = rcu_dereference_raw(rt->dn_next); + RCU_INIT_POINTER(rt->dn_next, NULL); dst_dev_put(&rt->dst); dst_release(&rt->dst); } @@ -1183,6 +1183,7 @@ make_route: if (rt == NULL) goto e_nobufs; + rt->dn_next = NULL; memset(&rt->fld, 0, sizeof(rt->fld)); rt->fld.saddr = oldflp->saddr; rt->fld.daddr = oldflp->daddr; @@ -1252,7 +1253,7 @@ static int __dn_route_output_key(struct dst_entry **pprt, const struct flowidn * if (!(flags & MSG_TRYHARD)) { rcu_read_lock_bh(); for (rt = rcu_dereference_bh(dn_rt_hash_table[hash].chain); rt; - rt = rcu_dereference_bh(rt->dst.dn_next)) { + rt = rcu_dereference_bh(rt->dn_next)) { if ((flp->daddr == rt->fld.daddr) && (flp->saddr == rt->fld.saddr) && (flp->flowidn_mark == rt->fld.flowidn_mark) && @@ -1448,6 +1449,7 @@ make_route: if (rt == NULL) goto e_nobufs; + rt->dn_next = NULL; memset(&rt->fld, 0, sizeof(rt->fld)); rt->rt_saddr = fld.saddr; rt->rt_daddr = fld.daddr; @@ -1529,7 +1531,7 @@ static int dn_route_input(struct sk_buff *skb) rcu_read_lock(); for(rt = rcu_dereference(dn_rt_hash_table[hash].chain); rt != NULL; - rt = rcu_dereference(rt->dst.dn_next)) { + rt = rcu_dereference(rt->dn_next)) { if ((rt->fld.saddr == cb->src) && (rt->fld.daddr == cb->dst) && (rt->fld.flowidn_oif == 0) && @@ -1749,7 +1751,7 @@ int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock_bh(); for(rt = rcu_dereference_bh(dn_rt_hash_table[h].chain), idx = 0; rt; - rt = rcu_dereference_bh(rt->dst.dn_next), idx++) { + rt = rcu_dereference_bh(rt->dn_next), idx++) { if (idx < s_idx) continue; skb_dst_set(skb, dst_clone(&rt->dst)); @@ -1795,7 +1797,7 @@ static struct dn_route *dn_rt_cache_get_next(struct seq_file *seq, struct dn_rou { struct dn_rt_cache_iter_state *s = seq->private; - rt = rcu_dereference_bh(rt->dst.dn_next); + rt = rcu_dereference_bh(rt->dn_next); while (!rt) { rcu_read_unlock_bh(); if (--s->bucket < 0) @@ -1858,7 +1860,6 @@ static int dn_rt_cache_seq_open(struct inode *inode, struct file *file) } static const struct file_operations dn_rt_cache_seq_fops = { - .owner = THIS_MODULE, .open = dn_rt_cache_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -1921,11 +1922,11 @@ void __init dn_route_init(void) &dn_rt_cache_seq_fops); #ifdef CONFIG_DECNET_ROUTER - rtnl_register(PF_DECnet, RTM_GETROUTE, dn_cache_getroute, - dn_fib_dump, 0); + rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_GETROUTE, + dn_cache_getroute, dn_fib_dump, 0); #else - rtnl_register(PF_DECnet, RTM_GETROUTE, dn_cache_getroute, - dn_cache_dump, 0); + rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_GETROUTE, + dn_cache_getroute, dn_cache_dump, 0); #endif } diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 03c3bdf25468..bbf2c82cf7b2 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -16,6 +16,15 @@ config NET_DSA if NET_DSA +config NET_DSA_LEGACY + bool "Support for older platform device and Device Tree registration" + default y + ---help--- + Say Y if you want to enable support for the older platform device and + deprecated Device Tree binding registration. + + This feature is scheduled for removal in 4.17. + # tagging formats config NET_DSA_TAG_BRCM bool diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 0e13c1f95d13..9e4d3536f977 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -1,7 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 # the core obj-$(CONFIG_NET_DSA) += dsa_core.o -dsa_core-y += dsa.o dsa2.o legacy.o master.o port.o slave.o switch.o +dsa_core-y += dsa.o dsa2.o master.o port.o slave.o switch.o +dsa_core-$(CONFIG_NET_DSA_LEGACY) += legacy.o # tagging formats dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 44e3fb7dec8c..adf50fbc4c13 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -51,9 +51,7 @@ static struct dsa_switch_tree *dsa_tree_alloc(int index) INIT_LIST_HEAD(&dst->list); list_add_tail(&dsa_tree_list, &dst->list); - /* Initialize the reference counter to the number of switches, not 1 */ kref_init(&dst->refcount); - refcount_set(&dst->refcount.refcount, 0); return dst; } @@ -64,20 +62,23 @@ static void dsa_tree_free(struct dsa_switch_tree *dst) kfree(dst); } -static struct dsa_switch_tree *dsa_tree_touch(int index) +static struct dsa_switch_tree *dsa_tree_get(struct dsa_switch_tree *dst) { - struct dsa_switch_tree *dst; - - dst = dsa_tree_find(index); - if (!dst) - dst = dsa_tree_alloc(index); + if (dst) + kref_get(&dst->refcount); return dst; } -static void dsa_tree_get(struct dsa_switch_tree *dst) +static struct dsa_switch_tree *dsa_tree_touch(int index) { - kref_get(&dst->refcount); + struct dsa_switch_tree *dst; + + dst = dsa_tree_find(index); + if (dst) + return dsa_tree_get(dst); + else + return dsa_tree_alloc(index); } static void dsa_tree_release(struct kref *ref) @@ -91,7 +92,8 @@ static void dsa_tree_release(struct kref *ref) static void dsa_tree_put(struct dsa_switch_tree *dst) { - kref_put(&dst->refcount, dsa_tree_release); + if (dst) + kref_put(&dst->refcount, dsa_tree_release); } static bool dsa_port_is_dsa(struct dsa_port *port) @@ -239,7 +241,7 @@ static int dsa_tree_setup_default_cpu(struct dsa_switch_tree *dst) for (port = 0; port < ds->num_ports; port++) { dp = &ds->ports[port]; - if (dsa_port_is_user(dp)) + if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp)) dp->cpu_dp = dst->cpu_dp; } } @@ -269,13 +271,12 @@ static int dsa_port_setup(struct dsa_port *dp) break; case DSA_PORT_TYPE_CPU: case DSA_PORT_TYPE_DSA: - err = dsa_port_fixed_link_register_of(dp); + err = dsa_port_link_register_of(dp); if (err) { - dev_err(ds->dev, "failed to register fixed link for port %d.%d\n", + dev_err(ds->dev, "failed to setup link for port %d.%d\n", ds->index, dp->index); return err; } - break; case DSA_PORT_TYPE_USER: err = dsa_slave_create(dp); @@ -299,7 +300,7 @@ static void dsa_port_teardown(struct dsa_port *dp) break; case DSA_PORT_TYPE_CPU: case DSA_PORT_TYPE_DSA: - dsa_port_fixed_link_unregister_of(dp); + dsa_port_link_unregister_of(dp); break; case DSA_PORT_TYPE_USER: if (dp->slave) { @@ -765,6 +766,7 @@ int dsa_register_switch(struct dsa_switch *ds) mutex_lock(&dsa2_mutex); err = dsa_switch_probe(ds); + dsa_tree_put(ds->dst); mutex_unlock(&dsa2_mutex); return err; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 7d036696e8c4..70de7895e5b8 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -97,8 +97,17 @@ const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol); bool dsa_schedule_work(struct work_struct *work); /* legacy.c */ +#if IS_ENABLED(CONFIG_NET_DSA_LEGACY) int dsa_legacy_register(void); void dsa_legacy_unregister(void); +#else +static inline int dsa_legacy_register(void) +{ + return 0; +} + +static inline void dsa_legacy_unregister(void) { } +#endif int dsa_legacy_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, @@ -157,8 +166,8 @@ int dsa_port_vlan_add(struct dsa_port *dp, struct switchdev_trans *trans); int dsa_port_vlan_del(struct dsa_port *dp, const struct switchdev_obj_port_vlan *vlan); -int dsa_port_fixed_link_register_of(struct dsa_port *dp); -void dsa_port_fixed_link_unregister_of(struct dsa_port *dp); +int dsa_port_link_register_of(struct dsa_port *dp); +void dsa_port_link_unregister_of(struct dsa_port *dp); /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 84611d7fcfa2..cb54b81d0bd9 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -86,7 +86,7 @@ static int dsa_cpu_dsa_setups(struct dsa_switch *ds) if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) continue; - ret = dsa_port_fixed_link_register_of(&ds->ports[port]); + ret = dsa_port_link_register_of(&ds->ports[port]); if (ret) return ret; } @@ -275,7 +275,7 @@ static void dsa_switch_destroy(struct dsa_switch *ds) for (port = 0; port < ds->num_ports; port++) { if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) continue; - dsa_port_fixed_link_unregister_of(&ds->ports[port]); + dsa_port_link_unregister_of(&ds->ports[port]); } if (ds->slave_mii_bus && ds->ops->phy_read) @@ -718,26 +718,6 @@ static int dsa_resume(struct device *d) } #endif -/* legacy way, bypassing the bridge *****************************************/ -int dsa_legacy_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], - struct net_device *dev, - const unsigned char *addr, u16 vid, - u16 flags) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - - return dsa_port_fdb_add(dp, addr, vid); -} - -int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], - struct net_device *dev, - const unsigned char *addr, u16 vid) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - - return dsa_port_fdb_del(dp, addr, vid); -} - static SIMPLE_DEV_PM_OPS(dsa_pm_ops, dsa_suspend, dsa_resume); static const struct of_device_id dsa_of_match_table[] = { diff --git a/net/dsa/port.c b/net/dsa/port.c index bb4be2679904..7acc1169d75e 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -273,7 +273,56 @@ int dsa_port_vlan_del(struct dsa_port *dp, return 0; } -int dsa_port_fixed_link_register_of(struct dsa_port *dp) +static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable) +{ + struct device_node *port_dn = dp->dn; + struct device_node *phy_dn; + struct dsa_switch *ds = dp->ds; + struct phy_device *phydev; + int port = dp->index; + int err = 0; + + phy_dn = of_parse_phandle(port_dn, "phy-handle", 0); + if (!phy_dn) + return 0; + + phydev = of_phy_find_device(phy_dn); + if (!phydev) { + err = -EPROBE_DEFER; + goto err_put_of; + } + + if (enable) { + err = genphy_config_init(phydev); + if (err < 0) + goto err_put_dev; + + err = genphy_resume(phydev); + if (err < 0) + goto err_put_dev; + + err = genphy_read_status(phydev); + if (err < 0) + goto err_put_dev; + } else { + err = genphy_suspend(phydev); + if (err < 0) + goto err_put_dev; + } + + if (ds->ops->adjust_link) + ds->ops->adjust_link(ds, port, phydev); + + dev_dbg(ds->dev, "enabled port's phy: %s", phydev_name(phydev)); + +err_put_dev: + put_device(&phydev->mdio.dev); +err_put_of: + of_node_put(phy_dn); + return err; +} + +static int dsa_port_fixed_link_register_of(struct dsa_port *dp) { struct device_node *dn = dp->dn; struct dsa_switch *ds = dp->ds; @@ -282,38 +331,44 @@ int dsa_port_fixed_link_register_of(struct dsa_port *dp) int mode; int err; - if (of_phy_is_fixed_link(dn)) { - err = of_phy_register_fixed_link(dn); - if (err) { - dev_err(ds->dev, - "failed to register the fixed PHY of port %d\n", - port); - return err; - } + err = of_phy_register_fixed_link(dn); + if (err) { + dev_err(ds->dev, + "failed to register the fixed PHY of port %d\n", + port); + return err; + } - phydev = of_phy_find_device(dn); + phydev = of_phy_find_device(dn); - mode = of_get_phy_mode(dn); - if (mode < 0) - mode = PHY_INTERFACE_MODE_NA; - phydev->interface = mode; + mode = of_get_phy_mode(dn); + if (mode < 0) + mode = PHY_INTERFACE_MODE_NA; + phydev->interface = mode; - genphy_config_init(phydev); - genphy_read_status(phydev); + genphy_config_init(phydev); + genphy_read_status(phydev); - if (ds->ops->adjust_link) - ds->ops->adjust_link(ds, port, phydev); + if (ds->ops->adjust_link) + ds->ops->adjust_link(ds, port, phydev); - put_device(&phydev->mdio.dev); - } + put_device(&phydev->mdio.dev); return 0; } -void dsa_port_fixed_link_unregister_of(struct dsa_port *dp) +int dsa_port_link_register_of(struct dsa_port *dp) { - struct device_node *dn = dp->dn; + if (of_phy_is_fixed_link(dp->dn)) + return dsa_port_fixed_link_register_of(dp); + else + return dsa_port_setup_phy_of(dp, true); +} - if (of_phy_is_fixed_link(dn)) - of_phy_deregister_fixed_link(dn); +void dsa_port_link_unregister_of(struct dsa_port *dp) +{ + if (of_phy_is_fixed_link(dp->dn)) + of_phy_deregister_fixed_link(dp->dn); + else + dsa_port_setup_phy_of(dp, false); } diff --git a/net/dsa/slave.c b/net/dsa/slave.c index d6e7a642493b..f52307296de4 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -16,7 +16,6 @@ #include <linux/of_net.h> #include <linux/of_mdio.h> #include <linux/mdio.h> -#include <linux/list.h> #include <net/rtnetlink.h> #include <net/pkt_cls.h> #include <net/tc_act/tc_mirred.h> @@ -709,14 +708,12 @@ static int dsa_slave_add_cls_matchall(struct net_device *dev, struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_mall_tc_entry *mall_tc_entry; __be16 protocol = cls->common.protocol; - struct net *net = dev_net(dev); struct dsa_switch *ds = dp->ds; struct net_device *to_dev; const struct tc_action *a; struct dsa_port *to_dp; int err = -EOPNOTSUPP; LIST_HEAD(actions); - int ifindex; if (!ds->ops->port_mirror_add) return err; @@ -730,8 +727,7 @@ static int dsa_slave_add_cls_matchall(struct net_device *dev, if (is_tcf_mirred_egress_mirror(a) && protocol == htons(ETH_P_ALL)) { struct dsa_mall_mirror_tc_entry *mirror; - ifindex = tcf_mirred_ifindex(a); - to_dev = __dev_get_by_index(net, ifindex); + to_dev = tcf_mirred_dev(a); if (!to_dev) return -EINVAL; @@ -944,6 +940,26 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .set_rxnfc = dsa_slave_set_rxnfc, }; +/* legacy way, bypassing the bridge *****************************************/ +int dsa_legacy_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *dev, + const unsigned char *addr, u16 vid, + u16 flags) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + + return dsa_port_fdb_add(dp, addr, vid); +} + +int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *dev, + const unsigned char *addr, u16 vid) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + + return dsa_port_fdb_del(dp, addr, vid); +} + static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_open = dsa_slave_open, .ndo_stop = dsa_slave_close, diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 29608d087a7c..b93511726069 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -83,29 +83,52 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, static int dsa_switch_fdb_add(struct dsa_switch *ds, struct dsa_notifier_fdb_info *info) { - /* Do not care yet about other switch chips of the fabric */ - if (ds->index != info->sw_index) - return 0; + int port = dsa_towards_port(ds, info->sw_index, info->port); if (!ds->ops->port_fdb_add) return -EOPNOTSUPP; - return ds->ops->port_fdb_add(ds, info->port, info->addr, - info->vid); + return ds->ops->port_fdb_add(ds, port, info->addr, info->vid); } static int dsa_switch_fdb_del(struct dsa_switch *ds, struct dsa_notifier_fdb_info *info) { - /* Do not care yet about other switch chips of the fabric */ - if (ds->index != info->sw_index) - return 0; + int port = dsa_towards_port(ds, info->sw_index, info->port); if (!ds->ops->port_fdb_del) return -EOPNOTSUPP; - return ds->ops->port_fdb_del(ds, info->port, info->addr, - info->vid); + return ds->ops->port_fdb_del(ds, port, info->addr, info->vid); +} + +static int +dsa_switch_mdb_prepare_bitmap(struct dsa_switch *ds, + const struct switchdev_obj_port_mdb *mdb, + const unsigned long *bitmap) +{ + int port, err; + + if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add) + return -EOPNOTSUPP; + + for_each_set_bit(port, bitmap, ds->num_ports) { + err = ds->ops->port_mdb_prepare(ds, port, mdb); + if (err) + return err; + } + + return 0; +} + +static void dsa_switch_mdb_add_bitmap(struct dsa_switch *ds, + const struct switchdev_obj_port_mdb *mdb, + const unsigned long *bitmap) +{ + int port; + + for_each_set_bit(port, bitmap, ds->num_ports) + ds->ops->port_mdb_add(ds, port, mdb); } static int dsa_switch_mdb_add(struct dsa_switch *ds, @@ -114,7 +137,7 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds, const struct switchdev_obj_port_mdb *mdb = info->mdb; struct switchdev_trans *trans = info->trans; DECLARE_BITMAP(group, ds->num_ports); - int port, err; + int port; /* Build a mask of Multicast group members */ bitmap_zero(group, ds->num_ports); @@ -124,21 +147,10 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds, if (dsa_is_dsa_port(ds, port)) set_bit(port, group); - if (switchdev_trans_ph_prepare(trans)) { - if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add) - return -EOPNOTSUPP; - - for_each_set_bit(port, group, ds->num_ports) { - err = ds->ops->port_mdb_prepare(ds, port, mdb, trans); - if (err) - return err; - } - - return 0; - } + if (switchdev_trans_ph_prepare(trans)) + return dsa_switch_mdb_prepare_bitmap(ds, mdb, group); - for_each_set_bit(port, group, ds->num_ports) - ds->ops->port_mdb_add(ds, port, mdb, trans); + dsa_switch_mdb_add_bitmap(ds, mdb, group); return 0; } @@ -157,13 +169,43 @@ static int dsa_switch_mdb_del(struct dsa_switch *ds, return 0; } +static int +dsa_switch_vlan_prepare_bitmap(struct dsa_switch *ds, + const struct switchdev_obj_port_vlan *vlan, + const unsigned long *bitmap) +{ + int port, err; + + if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add) + return -EOPNOTSUPP; + + for_each_set_bit(port, bitmap, ds->num_ports) { + err = ds->ops->port_vlan_prepare(ds, port, vlan); + if (err) + return err; + } + + return 0; +} + +static void +dsa_switch_vlan_add_bitmap(struct dsa_switch *ds, + const struct switchdev_obj_port_vlan *vlan, + const unsigned long *bitmap) +{ + int port; + + for_each_set_bit(port, bitmap, ds->num_ports) + ds->ops->port_vlan_add(ds, port, vlan); +} + static int dsa_switch_vlan_add(struct dsa_switch *ds, struct dsa_notifier_vlan_info *info) { const struct switchdev_obj_port_vlan *vlan = info->vlan; struct switchdev_trans *trans = info->trans; DECLARE_BITMAP(members, ds->num_ports); - int port, err; + int port; /* Build a mask of VLAN members */ bitmap_zero(members, ds->num_ports); @@ -173,21 +215,10 @@ static int dsa_switch_vlan_add(struct dsa_switch *ds, if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)) set_bit(port, members); - if (switchdev_trans_ph_prepare(trans)) { - if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add) - return -EOPNOTSUPP; - - for_each_set_bit(port, members, ds->num_ports) { - err = ds->ops->port_vlan_prepare(ds, port, vlan, trans); - if (err) - return err; - } - - return 0; - } + if (switchdev_trans_ph_prepare(trans)) + return dsa_switch_vlan_prepare_bitmap(ds, vlan, members); - for_each_set_bit(port, members, ds->num_ports) - ds->ops->port_vlan_add(ds, port, vlan, trans); + dsa_switch_vlan_add_bitmap(ds, vlan, members); return 0; } diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index e6e0b7b6025c..2b06bb91318b 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -70,6 +70,18 @@ static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb, if (skb_cow_head(skb, BRCM_TAG_LEN) < 0) return NULL; + /* The Ethernet switch we are interfaced with needs packets to be at + * least 64 bytes (including FCS) otherwise they will be discarded when + * they enter the switch port logic. When Broadcom tags are enabled, we + * need to make sure that packets are at least 68 bytes + * (including FCS and tag) because the length verification is done after + * the Broadcom tag is stripped off the ingress packet. + * + * Let dsa_slave_xmit() free the SKB + */ + if (__skb_put_padto(skb, ETH_ZLEN + BRCM_TAG_LEN, false)) + return NULL; + skb_push(skb, BRCM_TAG_LEN); if (offset) diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index 8475434af7d5..11535bc70743 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -13,10 +13,13 @@ */ #include <linux/etherdevice.h> +#include <linux/if_vlan.h> #include "dsa_priv.h" #define MTK_HDR_LEN 4 +#define MTK_HDR_XMIT_UNTAGGED 0 +#define MTK_HDR_XMIT_TAGGED_TPID_8100 1 #define MTK_HDR_RECV_SOURCE_PORT_MASK GENMASK(2, 0) #define MTK_HDR_XMIT_DP_BIT_MASK GENMASK(5, 0) @@ -25,20 +28,37 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, { struct dsa_port *dp = dsa_slave_to_port(dev); u8 *mtk_tag; + bool is_vlan_skb = true; - if (skb_cow_head(skb, MTK_HDR_LEN) < 0) - return NULL; - - skb_push(skb, MTK_HDR_LEN); + /* Build the special tag after the MAC Source Address. If VLAN header + * is present, it's required that VLAN header and special tag is + * being combined. Only in this way we can allow the switch can parse + * the both special and VLAN tag at the same time and then look up VLAN + * table with VID. + */ + if (!skb_vlan_tagged(skb)) { + if (skb_cow_head(skb, MTK_HDR_LEN) < 0) + return NULL; - memmove(skb->data, skb->data + MTK_HDR_LEN, 2 * ETH_ALEN); + skb_push(skb, MTK_HDR_LEN); + memmove(skb->data, skb->data + MTK_HDR_LEN, 2 * ETH_ALEN); + is_vlan_skb = false; + } - /* Build the tag after the MAC Source Address */ mtk_tag = skb->data + 2 * ETH_ALEN; - mtk_tag[0] = 0; + + /* Mark tag attribute on special tag insertion to notify hardware + * whether that's a combined special tag with 802.1Q header. + */ + mtk_tag[0] = is_vlan_skb ? MTK_HDR_XMIT_TAGGED_TPID_8100 : + MTK_HDR_XMIT_UNTAGGED; mtk_tag[1] = (1 << dp->index) & MTK_HDR_XMIT_DP_BIT_MASK; - mtk_tag[2] = 0; - mtk_tag[3] = 0; + + /* Tag control information is kept for 802.1Q */ + if (!is_vlan_skb) { + mtk_tag[2] = 0; + mtk_tag[3] = 0; + } return skb; } diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index c6c8ad1d4b6d..47a0a6649a9d 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -43,7 +43,6 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o -obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f00499a46927..c24008daa3d8 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -121,6 +121,7 @@ #endif #include <net/l3mdev.h> +#include <trace/events/sock.h> /* The inetsw table contains everything that inet_create needs to * build a new socket. @@ -789,7 +790,8 @@ int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int addr_len = 0; int err; - sock_rps_record_flow(sk); + if (likely(!(flags & MSG_ERRQUEUE))) + sock_rps_record_flow(sk); err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, flags & ~MSG_DONTWAIT, &addr_len); @@ -870,6 +872,9 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) struct sock *sk = sock->sk; int err = 0; struct net *net = sock_net(sk); + void __user *p = (void __user *)arg; + struct ifreq ifr; + struct rtentry rt; switch (cmd) { case SIOCGSTAMP: @@ -880,8 +885,12 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) break; case SIOCADDRT: case SIOCDELRT: + if (copy_from_user(&rt, p, sizeof(struct rtentry))) + return -EFAULT; + err = ip_rt_ioctl(net, cmd, &rt); + break; case SIOCRTMSG: - err = ip_rt_ioctl(net, cmd, (void __user *)arg); + err = -EINVAL; break; case SIOCDARP: case SIOCGARP: @@ -889,17 +898,26 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) err = arp_ioctl(net, cmd, (void __user *)arg); break; case SIOCGIFADDR: - case SIOCSIFADDR: case SIOCGIFBRDADDR: - case SIOCSIFBRDADDR: case SIOCGIFNETMASK: - case SIOCSIFNETMASK: case SIOCGIFDSTADDR: + case SIOCGIFPFLAGS: + if (copy_from_user(&ifr, p, sizeof(struct ifreq))) + return -EFAULT; + err = devinet_ioctl(net, cmd, &ifr); + if (!err && copy_to_user(p, &ifr, sizeof(struct ifreq))) + err = -EFAULT; + break; + + case SIOCSIFADDR: + case SIOCSIFBRDADDR: + case SIOCSIFNETMASK: case SIOCSIFDSTADDR: case SIOCSIFPFLAGS: - case SIOCGIFPFLAGS: case SIOCSIFFLAGS: - err = devinet_ioctl(net, cmd, (void __user *)arg); + if (copy_from_user(&ifr, p, sizeof(struct ifreq))) + return -EFAULT; + err = devinet_ioctl(net, cmd, &ifr); break; default: if (sk->sk_prot->ioctl) @@ -1220,6 +1238,19 @@ int inet_sk_rebuild_header(struct sock *sk) } EXPORT_SYMBOL(inet_sk_rebuild_header); +void inet_sk_set_state(struct sock *sk, int state) +{ + trace_inet_sock_set_state(sk, sk->sk_state, state); + sk->sk_state = state; +} +EXPORT_SYMBOL(inet_sk_set_state); + +void inet_sk_state_store(struct sock *sk, int newstate) +{ + trace_inet_sock_set_state(sk, sk->sk_state, newstate); + smp_store_release(&sk->sk_state, newstate); +} + struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features) { diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index a8d7c5a9fb05..f28f06c91ead 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -223,11 +223,16 @@ static bool arp_key_eq(const struct neighbour *neigh, const void *pkey) static int arp_constructor(struct neighbour *neigh) { - __be32 addr = *(__be32 *)neigh->primary_key; + __be32 addr; struct net_device *dev = neigh->dev; struct in_device *in_dev; struct neigh_parms *parms; + u32 inaddr_any = INADDR_ANY; + if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) + memcpy(neigh->primary_key, &inaddr_any, arp_tbl.key_len); + + addr = *(__be32 *)neigh->primary_key; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (!in_dev) { @@ -1420,7 +1425,6 @@ static int arp_seq_open(struct inode *inode, struct file *file) } static const struct file_operations arp_seq_fops = { - .owner = THIS_MODULE, .open = arp_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index a4573bccd6da..40f001782c1b 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -946,11 +946,10 @@ static int inet_abc_len(__be32 addr) } -int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) +int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) { - struct ifreq ifr; struct sockaddr_in sin_orig; - struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr; + struct sockaddr_in *sin = (struct sockaddr_in *)&ifr->ifr_addr; struct in_device *in_dev; struct in_ifaddr **ifap = NULL; struct in_ifaddr *ifa = NULL; @@ -959,22 +958,16 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) int ret = -EFAULT; int tryaddrmatch = 0; - /* - * Fetch the caller's info block into kernel space - */ - - if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) - goto out; - ifr.ifr_name[IFNAMSIZ - 1] = 0; + ifr->ifr_name[IFNAMSIZ - 1] = 0; /* save original address for comparison */ memcpy(&sin_orig, sin, sizeof(*sin)); - colon = strchr(ifr.ifr_name, ':'); + colon = strchr(ifr->ifr_name, ':'); if (colon) *colon = 0; - dev_load(net, ifr.ifr_name); + dev_load(net, ifr->ifr_name); switch (cmd) { case SIOCGIFADDR: /* Get interface address */ @@ -1014,7 +1007,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) rtnl_lock(); ret = -ENODEV; - dev = __dev_get_by_name(net, ifr.ifr_name); + dev = __dev_get_by_name(net, ifr->ifr_name); if (!dev) goto done; @@ -1031,7 +1024,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) This is checked above. */ for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; ifap = &ifa->ifa_next) { - if (!strcmp(ifr.ifr_name, ifa->ifa_label) && + if (!strcmp(ifr->ifr_name, ifa->ifa_label) && sin_orig.sin_addr.s_addr == ifa->ifa_local) { break; /* found */ @@ -1044,7 +1037,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (!ifa) { for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; ifap = &ifa->ifa_next) - if (!strcmp(ifr.ifr_name, ifa->ifa_label)) + if (!strcmp(ifr->ifr_name, ifa->ifa_label)) break; } } @@ -1055,20 +1048,24 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) switch (cmd) { case SIOCGIFADDR: /* Get interface address */ + ret = 0; sin->sin_addr.s_addr = ifa->ifa_local; - goto rarok; + break; case SIOCGIFBRDADDR: /* Get the broadcast address */ + ret = 0; sin->sin_addr.s_addr = ifa->ifa_broadcast; - goto rarok; + break; case SIOCGIFDSTADDR: /* Get the destination address */ + ret = 0; sin->sin_addr.s_addr = ifa->ifa_address; - goto rarok; + break; case SIOCGIFNETMASK: /* Get the netmask for the interface */ + ret = 0; sin->sin_addr.s_addr = ifa->ifa_mask; - goto rarok; + break; case SIOCSIFFLAGS: if (colon) { @@ -1076,11 +1073,11 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (!ifa) break; ret = 0; - if (!(ifr.ifr_flags & IFF_UP)) + if (!(ifr->ifr_flags & IFF_UP)) inet_del_ifa(in_dev, ifap, 1); break; } - ret = dev_change_flags(dev, ifr.ifr_flags); + ret = dev_change_flags(dev, ifr->ifr_flags); break; case SIOCSIFADDR: /* Set interface address (and family) */ @@ -1095,7 +1092,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) break; INIT_HLIST_NODE(&ifa->hash); if (colon) - memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ); + memcpy(ifa->ifa_label, ifr->ifr_name, IFNAMSIZ); else memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); } else { @@ -1182,28 +1179,27 @@ done: rtnl_unlock(); out: return ret; -rarok: - rtnl_unlock(); - ret = copy_to_user(arg, &ifr, sizeof(struct ifreq)) ? -EFAULT : 0; - goto out; } -static int inet_gifconf(struct net_device *dev, char __user *buf, int len) +static int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size) { struct in_device *in_dev = __in_dev_get_rtnl(dev); struct in_ifaddr *ifa; struct ifreq ifr; int done = 0; + if (WARN_ON(size > sizeof(struct ifreq))) + goto out; + if (!in_dev) goto out; for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { if (!buf) { - done += sizeof(ifr); + done += size; continue; } - if (len < (int) sizeof(ifr)) + if (len < size) break; memset(&ifr, 0, sizeof(struct ifreq)); strcpy(ifr.ifr_name, ifa->ifa_label); @@ -1212,13 +1208,12 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len) (*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr = ifa->ifa_local; - if (copy_to_user(buf, &ifr, sizeof(struct ifreq))) { + if (copy_to_user(buf + done, &ifr, size)) { done = -EFAULT; break; } - buf += sizeof(struct ifreq); - len -= sizeof(struct ifreq); - done += sizeof(struct ifreq); + len -= size; + done += size; } out: return done; @@ -1428,7 +1423,7 @@ skip: static bool inetdev_valid_mtu(unsigned int mtu) { - return mtu >= 68; + return mtu >= IPV4_MIN_MTU; } static void inetdev_send_gratuitous_arp(struct net_device *dev, diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index d57aa64fa7c7..296d0b956bfe 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -121,14 +121,32 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) static void esp_output_done(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; + struct xfrm_offload *xo = xfrm_offload(skb); void *tmp; - struct dst_entry *dst = skb_dst(skb); - struct xfrm_state *x = dst->xfrm; + struct xfrm_state *x; + + if (xo && (xo->flags & XFRM_DEV_RESUME)) + x = skb->sp->xvec[skb->sp->len - 1]; + else + x = skb_dst(skb)->xfrm; tmp = ESP_SKB_CB(skb)->tmp; esp_ssg_unref(x, tmp); kfree(tmp); - xfrm_output_resume(skb, err); + + if (xo && (xo->flags & XFRM_DEV_RESUME)) { + if (err) { + XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR); + kfree_skb(skb); + return; + } + + skb_push(skb, skb->data - skb_mac_header(skb)); + secpath_reset(skb); + xfrm_dev_resume(skb); + } else { + xfrm_output_resume(skb, err); + } } /* Move ESP header back into place. */ @@ -825,17 +843,13 @@ static int esp_init_aead(struct xfrm_state *x) char aead_name[CRYPTO_MAX_ALG_NAME]; struct crypto_aead *aead; int err; - u32 mask = 0; err = -ENAMETOOLONG; if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME) goto error; - if (x->xso.offload_handle) - mask |= CRYPTO_ALG_ASYNC; - - aead = crypto_alloc_aead(aead_name, 0, mask); + aead = crypto_alloc_aead(aead_name, 0, 0); err = PTR_ERR(aead); if (IS_ERR(aead)) goto error; @@ -865,7 +879,6 @@ static int esp_init_authenc(struct xfrm_state *x) char authenc_name[CRYPTO_MAX_ALG_NAME]; unsigned int keylen; int err; - u32 mask = 0; err = -EINVAL; if (!x->ealg) @@ -891,10 +904,7 @@ static int esp_init_authenc(struct xfrm_state *x) goto error; } - if (x->xso.offload_handle) - mask |= CRYPTO_ALG_ASYNC; - - aead = crypto_alloc_aead(authenc_name, 0, mask); + aead = crypto_alloc_aead(authenc_name, 0, 0); err = PTR_ERR(aead); if (IS_ERR(aead)) goto error; @@ -981,6 +991,7 @@ static int esp_init_state(struct xfrm_state *x) switch (encap->encap_type) { default: + err = -EINVAL; goto error; case UDP_ENCAP_ESPINUDP: x->props.header_len += sizeof(struct udphdr); diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index f8b918c766b0..da5635fc52c2 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -38,7 +38,8 @@ static struct sk_buff **esp4_gro_receive(struct sk_buff **head, __be32 spi; int err; - skb_pull(skb, offset); + if (!pskb_pull(skb, offset)) + return NULL; if ((err = xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq)) != 0) goto out; @@ -108,75 +109,39 @@ static void esp4_gso_encap(struct xfrm_state *x, struct sk_buff *skb) static struct sk_buff *esp4_gso_segment(struct sk_buff *skb, netdev_features_t features) { - __u32 seq; - int err = 0; - struct sk_buff *skb2; struct xfrm_state *x; struct ip_esp_hdr *esph; struct crypto_aead *aead; - struct sk_buff *segs = ERR_PTR(-EINVAL); netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); if (!xo) - goto out; + return ERR_PTR(-EINVAL); - seq = xo->seq.low; + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_ESP)) + return ERR_PTR(-EINVAL); x = skb->sp->xvec[skb->sp->len - 1]; aead = x->data; esph = ip_esp_hdr(skb); if (esph->spi != x->id.spi) - goto out; + return ERR_PTR(-EINVAL); if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead))) - goto out; + return ERR_PTR(-EINVAL); __skb_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)); skb->encap_hdr_csum = 1; - if (!(features & NETIF_F_HW_ESP)) + if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle || + (x->xso.dev != skb->dev)) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); - segs = x->outer_mode->gso_segment(x, skb, esp_features); - if (IS_ERR_OR_NULL(segs)) - goto out; - - __skb_pull(skb, skb->data - skb_mac_header(skb)); - - skb2 = segs; - do { - struct sk_buff *nskb = skb2->next; - - xo = xfrm_offload(skb2); - xo->flags |= XFRM_GSO_SEGMENT; - xo->seq.low = seq; - xo->seq.hi = xfrm_replay_seqhi(x, seq); - - if(!(features & NETIF_F_HW_ESP)) - xo->flags |= CRYPTO_FALLBACK; - - x->outer_mode->xmit(x, skb2); - - err = x->type_offload->xmit(x, skb2, esp_features); - if (err) { - kfree_skb_list(segs); - return ERR_PTR(err); - } - - if (!skb_is_gso(skb2)) - seq++; - else - seq += skb_shinfo(skb2)->gso_segs; - - skb_push(skb2, skb2->mac_len); - skb2 = nskb; - } while (skb2); + xo->flags |= XFRM_GSO_SEGMENT; -out: - return segs; + return x->outer_mode->gso_segment(x, skb, esp_features); } static int esp_input_tail(struct xfrm_state *x, struct sk_buff *skb) @@ -203,6 +168,7 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_ struct crypto_aead *aead; struct esp_info esp; bool hw_offload = true; + __u32 seq; esp.inplace = true; @@ -241,23 +207,30 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_ return esp.nfrags; } + seq = xo->seq.low; + esph = esp.esph; esph->spi = x->id.spi; skb_push(skb, -skb_network_offset(skb)); if (xo->flags & XFRM_GSO_SEGMENT) { - esph->seq_no = htonl(xo->seq.low); - } else { - ip_hdr(skb)->tot_len = htons(skb->len); - ip_send_check(ip_hdr(skb)); + esph->seq_no = htonl(seq); + + if (!skb_is_gso(skb)) + xo->seq.low++; + else + xo->seq.low += skb_shinfo(skb)->gso_segs; } + esp.seqno = cpu_to_be64(seq + ((u64)xo->seq.hi << 32)); + + ip_hdr(skb)->tot_len = htons(skb->len); + ip_send_check(ip_hdr(skb)); + if (hw_offload) return 0; - esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32)); - err = esp_output_tail(x, skb, &esp); if (err) return err; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index f52d27a422c3..f05afaf3235c 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -587,10 +587,9 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, * Handle IP routing ioctl calls. * These are used to manipulate the routing tables */ -int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) +int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt) { struct fib_config cfg; - struct rtentry rt; int err; switch (cmd) { @@ -599,11 +598,8 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; - if (copy_from_user(&rt, arg, sizeof(rt))) - return -EFAULT; - rtnl_lock(); - err = rtentry_to_fib_config(net, cmd, &rt, &cfg); + err = rtentry_to_fib_config(net, cmd, rt, &cfg); if (err == 0) { struct fib_table *tb; @@ -1298,14 +1294,19 @@ err_table_hash_alloc: static void ip_fib_net_exit(struct net *net) { - unsigned int i; + int i; rtnl_lock(); #ifdef CONFIG_IP_MULTIPLE_TABLES RCU_INIT_POINTER(net->ipv4.fib_main, NULL); RCU_INIT_POINTER(net->ipv4.fib_default, NULL); #endif - for (i = 0; i < FIB_TABLE_HASHSZ; i++) { + /* Destroy the tables in reverse order to guarantee that the + * local table, ID 255, is destroyed before the main table, ID + * 254. This is necessary as the local table may contain + * references to data contained in the main table. + */ + for (i = FIB_TABLE_HASHSZ - 1; i >= 0; i--) { struct hlist_head *head = &net->ipv4.fib_table_hash[i]; struct hlist_node *tmp; struct fib_table *tb; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index f04d944f8abe..c586597da20d 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -698,7 +698,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { int type = nla_type(nla); - u32 val; + u32 fi_val, val; if (!type) continue; @@ -715,7 +715,11 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) val = nla_get_u32(nla); } - if (fi->fib_metrics->metrics[type - 1] != val) + fi_val = fi->fib_metrics->metrics[type - 1]; + if (type == RTAX_FEATURES) + fi_val &= ~DST_FEATURE_ECN_CA; + + if (fi_val != val) return false; } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 5ddc4aefff12..5530cd6fdbc7 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2334,7 +2334,6 @@ static int fib_triestat_seq_open(struct inode *inode, struct file *file) } static const struct file_operations fib_triestat_fops = { - .owner = THIS_MODULE, .open = fib_triestat_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -2521,7 +2520,6 @@ static int fib_trie_seq_open(struct inode *inode, struct file *file) } static const struct file_operations fib_trie_fops = { - .owner = THIS_MODULE, .open = fib_trie_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -2715,7 +2713,6 @@ static int fib_route_seq_open(struct inode *inode, struct file *file) } static const struct file_operations fib_route_fops = { - .owner = THIS_MODULE, .open = fib_route_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index d1f8f302dbf3..f2402581fef1 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -89,6 +89,7 @@ #include <linux/rtnetlink.h> #include <linux/times.h> #include <linux/pkt_sched.h> +#include <linux/byteorder/generic.h> #include <net/net_namespace.h> #include <net/arp.h> @@ -321,6 +322,23 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) return scount; } +/* source address selection per RFC 3376 section 4.2.13 */ +static __be32 igmpv3_get_srcaddr(struct net_device *dev, + const struct flowi4 *fl4) +{ + struct in_device *in_dev = __in_dev_get_rcu(dev); + + if (!in_dev) + return htonl(INADDR_ANY); + + for_ifa(in_dev) { + if (fl4->saddr == ifa->ifa_local) + return fl4->saddr; + } endfor_ifa(in_dev); + + return htonl(INADDR_ANY); +} + static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) { struct sk_buff *skb; @@ -368,7 +386,11 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) pip->frag_off = htons(IP_DF); pip->ttl = 1; pip->daddr = fl4.daddr; - pip->saddr = fl4.saddr; + + rcu_read_lock(); + pip->saddr = igmpv3_get_srcaddr(dev, &fl4); + rcu_read_unlock(); + pip->protocol = IPPROTO_IGMP; pip->tot_len = 0; /* filled in later */ ip_select_ident(net, skb, NULL); @@ -404,16 +426,17 @@ static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel) } static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc, - int type, struct igmpv3_grec **ppgr) + int type, struct igmpv3_grec **ppgr, unsigned int mtu) { struct net_device *dev = pmc->interface->dev; struct igmpv3_report *pih; struct igmpv3_grec *pgr; - if (!skb) - skb = igmpv3_newpack(dev, dev->mtu); - if (!skb) - return NULL; + if (!skb) { + skb = igmpv3_newpack(dev, mtu); + if (!skb) + return NULL; + } pgr = skb_put(skb, sizeof(struct igmpv3_grec)); pgr->grec_type = type; pgr->grec_auxwords = 0; @@ -436,12 +459,17 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, struct igmpv3_grec *pgr = NULL; struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list; int scount, stotal, first, isquery, truncate; + unsigned int mtu; if (pmc->multiaddr == IGMP_ALL_HOSTS) return skb; if (ipv4_is_local_multicast(pmc->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports) return skb; + mtu = READ_ONCE(dev->mtu); + if (mtu < IPV4_MIN_MTU) + return skb; + isquery = type == IGMPV3_MODE_IS_INCLUDE || type == IGMPV3_MODE_IS_EXCLUDE; truncate = type == IGMPV3_MODE_IS_EXCLUDE || @@ -462,7 +490,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) { if (skb) igmpv3_sendpack(skb); - skb = igmpv3_newpack(dev, dev->mtu); + skb = igmpv3_newpack(dev, mtu); } } first = 1; @@ -498,12 +526,12 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, pgr->grec_nsrcs = htons(scount); if (skb) igmpv3_sendpack(skb); - skb = igmpv3_newpack(dev, dev->mtu); + skb = igmpv3_newpack(dev, mtu); first = 1; scount = 0; } if (first) { - skb = add_grhead(skb, pmc, type, &pgr); + skb = add_grhead(skb, pmc, type, &pgr, mtu); first = 0; } if (!skb) @@ -538,7 +566,7 @@ empty_source: igmpv3_sendpack(skb); skb = NULL; /* add_grhead will get a new one */ } - skb = add_grhead(skb, pmc, type, &pgr); + skb = add_grhead(skb, pmc, type, &pgr, mtu); } } if (pgr) @@ -2808,7 +2836,6 @@ static int igmp_mc_seq_open(struct inode *inode, struct file *file) } static const struct file_operations igmp_mc_seq_fops = { - .owner = THIS_MODULE, .open = igmp_mc_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -2955,7 +2982,6 @@ static int igmp_mcf_seq_open(struct inode *inode, struct file *file) } static const struct file_operations igmp_mcf_seq_fops = { - .owner = THIS_MODULE, .open = igmp_mcf_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 4ca46dc08e63..881ac6d046f2 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -475,7 +475,6 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern) } spin_unlock_bh(&queue->fastopenq.lock); } - mem_cgroup_sk_alloc(newsk); out: release_sock(sk); if (req) @@ -685,7 +684,7 @@ static void reqsk_timer_handler(struct timer_list *t) int max_retries, thresh; u8 defer_accept; - if (sk_state_load(sk_listener) != TCP_LISTEN) + if (inet_sk_state_load(sk_listener) != TCP_LISTEN) goto drop; max_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries; @@ -783,7 +782,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, if (newsk) { struct inet_connection_sock *newicsk = inet_csk(newsk); - newsk->sk_state = TCP_SYN_RECV; + inet_sk_set_state(newsk, TCP_SYN_RECV); newicsk->icsk_bind_hash = NULL; inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port; @@ -877,7 +876,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog) * It is OK, because this socket enters to hash table only * after validation is complete. */ - sk_state_store(sk, TCP_LISTEN); + inet_sk_state_store(sk, TCP_LISTEN); if (!sk->sk_prot->get_port(sk, inet->inet_num)) { inet->inet_sport = htons(inet->inet_num); @@ -888,7 +887,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog) return 0; } - sk->sk_state = TCP_CLOSE; + inet_sk_set_state(sk, TCP_CLOSE); return err; } EXPORT_SYMBOL_GPL(inet_csk_listen_start); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index c9c35b61a027..a383f299ce24 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -564,12 +564,18 @@ static int inet_diag_bc_run(const struct nlattr *_bc, case INET_DIAG_BC_JMP: yes = 0; break; + case INET_DIAG_BC_S_EQ: + yes = entry->sport == op[1].no; + break; case INET_DIAG_BC_S_GE: yes = entry->sport >= op[1].no; break; case INET_DIAG_BC_S_LE: yes = entry->sport <= op[1].no; break; + case INET_DIAG_BC_D_EQ: + yes = entry->dport == op[1].no; + break; case INET_DIAG_BC_D_GE: yes = entry->dport >= op[1].no; break; @@ -802,8 +808,10 @@ static int inet_diag_bc_audit(const struct nlattr *attr, if (!valid_devcond(bc, len, &min_len)) return -EINVAL; break; + case INET_DIAG_BC_S_EQ: case INET_DIAG_BC_S_GE: case INET_DIAG_BC_S_LE: + case INET_DIAG_BC_D_EQ: case INET_DIAG_BC_D_GE: case INET_DIAG_BC_D_LE: if (!valid_port_comparison(bc, len, &min_len)) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index e7d15fb0d94d..31ff46daae97 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -19,6 +19,7 @@ #include <linux/slab.h> #include <linux/wait.h> #include <linux/vmalloc.h> +#include <linux/bootmem.h> #include <net/addrconf.h> #include <net/inet_connection_sock.h> @@ -168,6 +169,60 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) } EXPORT_SYMBOL_GPL(__inet_inherit_port); +static struct inet_listen_hashbucket * +inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk) +{ + u32 hash; + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + hash = ipv6_portaddr_hash(sock_net(sk), + &sk->sk_v6_rcv_saddr, + inet_sk(sk)->inet_num); + else +#endif + hash = ipv4_portaddr_hash(sock_net(sk), + inet_sk(sk)->inet_rcv_saddr, + inet_sk(sk)->inet_num); + return inet_lhash2_bucket(h, hash); +} + +static void inet_hash2(struct inet_hashinfo *h, struct sock *sk) +{ + struct inet_listen_hashbucket *ilb2; + + if (!h->lhash2) + return; + + ilb2 = inet_lhash2_bucket_sk(h, sk); + + spin_lock(&ilb2->lock); + if (sk->sk_reuseport && sk->sk_family == AF_INET6) + hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, + &ilb2->head); + else + hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, + &ilb2->head); + ilb2->count++; + spin_unlock(&ilb2->lock); +} + +static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk) +{ + struct inet_listen_hashbucket *ilb2; + + if (!h->lhash2 || + WARN_ON_ONCE(hlist_unhashed(&inet_csk(sk)->icsk_listen_portaddr_node))) + return; + + ilb2 = inet_lhash2_bucket_sk(h, sk); + + spin_lock(&ilb2->lock); + hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node); + ilb2->count--; + spin_unlock(&ilb2->lock); +} + static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const __be32 daddr, const int dif, const int sdif, bool exact_dif) @@ -207,6 +262,40 @@ static inline int compute_score(struct sock *sk, struct net *net, */ /* called with rcu_read_lock() : No refcount taken on the socket */ +static struct sock *inet_lhash2_lookup(struct net *net, + struct inet_listen_hashbucket *ilb2, + struct sk_buff *skb, int doff, + const __be32 saddr, __be16 sport, + const __be32 daddr, const unsigned short hnum, + const int dif, const int sdif) +{ + bool exact_dif = inet_exact_dif_match(net, skb); + struct inet_connection_sock *icsk; + struct sock *sk, *result = NULL; + int score, hiscore = 0; + u32 phash = 0; + + inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { + sk = (struct sock *)icsk; + score = compute_score(sk, net, hnum, daddr, + dif, sdif, exact_dif); + if (score > hiscore) { + if (sk->sk_reuseport) { + phash = inet_ehashfn(net, daddr, hnum, + saddr, sport); + result = reuseport_select_sock(sk, phash, + skb, doff); + if (result) + return result; + } + result = sk; + hiscore = score; + } + } + + return result; +} + struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, @@ -216,32 +305,57 @@ struct sock *__inet_lookup_listener(struct net *net, { unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; - int score, hiscore = 0, matches = 0, reuseport = 0; bool exact_dif = inet_exact_dif_match(net, skb); + struct inet_listen_hashbucket *ilb2; struct sock *sk, *result = NULL; + int score, hiscore = 0; + unsigned int hash2; u32 phash = 0; + if (ilb->count <= 10 || !hashinfo->lhash2) + goto port_lookup; + + /* Too many sk in the ilb bucket (which is hashed by port alone). + * Try lhash2 (which is hashed by port and addr) instead. + */ + + hash2 = ipv4_portaddr_hash(net, daddr, hnum); + ilb2 = inet_lhash2_bucket(hashinfo, hash2); + if (ilb2->count > ilb->count) + goto port_lookup; + + result = inet_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + if (result) + return result; + + /* Lookup lhash2 with INADDR_ANY */ + + hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); + ilb2 = inet_lhash2_bucket(hashinfo, hash2); + if (ilb2->count > ilb->count) + goto port_lookup; + + return inet_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + +port_lookup: sk_for_each_rcu(sk, &ilb->head) { score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif); if (score > hiscore) { - reuseport = sk->sk_reuseport; - if (reuseport) { + if (sk->sk_reuseport) { phash = inet_ehashfn(net, daddr, hnum, saddr, sport); result = reuseport_select_sock(sk, phash, skb, doff); if (result) return result; - matches = 1; } result = sk; hiscore = score; - } else if (score == hiscore && reuseport) { - matches++; - if (reciprocal_scale(phash, matches) == 0) - result = sk; - phash = next_pseudo_random32(phash); } } return result; @@ -430,7 +544,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } else { percpu_counter_inc(sk->sk_prot->orphan_count); - sk->sk_state = TCP_CLOSE; + inet_sk_set_state(sk, TCP_CLOSE); sock_set_flag(sk, SOCK_DEAD); inet_csk_destroy_sock(sk); } @@ -483,6 +597,8 @@ int __inet_hash(struct sock *sk, struct sock *osk) hlist_add_tail_rcu(&sk->sk_node, &ilb->head); else hlist_add_head_rcu(&sk->sk_node, &ilb->head); + inet_hash2(hashinfo, sk); + ilb->count++; sock_set_flag(sk, SOCK_RCU_FREE); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); unlock: @@ -509,28 +625,33 @@ EXPORT_SYMBOL_GPL(inet_hash); void inet_unhash(struct sock *sk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + struct inet_listen_hashbucket *ilb = NULL; spinlock_t *lock; - bool listener = false; - int done; if (sk_unhashed(sk)) return; if (sk->sk_state == TCP_LISTEN) { - lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock; - listener = true; + ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; + lock = &ilb->lock; } else { lock = inet_ehash_lockp(hashinfo, sk->sk_hash); } spin_lock_bh(lock); + if (sk_unhashed(sk)) + goto unlock; + if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); - if (listener) - done = __sk_del_node_init(sk); - else - done = __sk_nulls_del_node_init_rcu(sk); - if (done) - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + if (ilb) { + inet_unhash2(hashinfo, sk); + __sk_del_node_init(sk); + ilb->count--; + } else { + __sk_nulls_del_node_init_rcu(sk); + } + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); +unlock: spin_unlock_bh(lock); } EXPORT_SYMBOL_GPL(inet_unhash); @@ -665,10 +786,37 @@ void inet_hashinfo_init(struct inet_hashinfo *h) for (i = 0; i < INET_LHTABLE_SIZE; i++) { spin_lock_init(&h->listening_hash[i].lock); INIT_HLIST_HEAD(&h->listening_hash[i].head); + h->listening_hash[i].count = 0; } + + h->lhash2 = NULL; } EXPORT_SYMBOL_GPL(inet_hashinfo_init); +void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, + unsigned long numentries, int scale, + unsigned long low_limit, + unsigned long high_limit) +{ + unsigned int i; + + h->lhash2 = alloc_large_system_hash(name, + sizeof(*h->lhash2), + numentries, + scale, + 0, + NULL, + &h->lhash2_mask, + low_limit, + high_limit); + + for (i = 0; i <= h->lhash2_mask; i++) { + spin_lock_init(&h->lhash2[i].lock); + INIT_HLIST_HEAD(&h->lhash2[i].head); + h->lhash2[i].count = 0; + } +} + int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) { unsigned int locksz = sizeof(spinlock_t); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index c690cd0d9b3f..c3ea4906d237 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -93,11 +93,11 @@ static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, } /* - * Enter the time wait state. + * Enter the time wait state. This is called with locally disabled BH. * Essentially we whip up a timewait bucket, copy the relevant info into it * from the SK, and mess with hash chains and list linkage. */ -void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, +void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, struct inet_hashinfo *hashinfo) { const struct inet_sock *inet = inet_sk(sk); @@ -111,7 +111,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, */ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num, hashinfo->bhash_size)]; - spin_lock_bh(&bhead->lock); + spin_lock(&bhead->lock); tw->tw_tb = icsk->icsk_bind_hash; WARN_ON(!icsk->icsk_bind_hash); inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); @@ -119,27 +119,26 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, spin_lock(lock); - /* - * Step 2: Hash TW into tcp ehash chain. - * Notes : - * - tw_refcnt is set to 4 because : - * - We have one reference from bhash chain. - * - We have one reference from ehash chain. - * - We have one reference from timer. - * - One reference for ourself (our caller will release it). - * We can use atomic_set() because prior spin_lock()/spin_unlock() - * committed into memory all tw fields. - */ - refcount_set(&tw->tw_refcnt, 4); inet_twsk_add_node_rcu(tw, &ehead->chain); /* Step 3: Remove SK from hash chain */ if (__sk_nulls_del_node_init_rcu(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); - spin_unlock_bh(lock); + spin_unlock(lock); + + /* tw_refcnt is set to 3 because we have : + * - one reference for bhash chain. + * - one reference for ehash chain. + * - one reference for timer. + * We can use atomic_set() because prior spin_lock()/spin_unlock() + * committed into memory all tw fields. + * Also note that after this point, we lost our implicit reference + * so we are not allowed to use tw anymore. + */ + refcount_set(&tw->tw_refcnt, 3); } -EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); +EXPORT_SYMBOL_GPL(inet_twsk_hashdance); static void tw_timer_handler(struct timer_list *t) { @@ -271,14 +270,14 @@ restart: continue; tw = inet_twsk(sk); if ((tw->tw_family != family) || - atomic_read(&twsk_net(tw)->count)) + refcount_read(&twsk_net(tw)->count)) continue; if (unlikely(!refcount_inc_not_zero(&tw->tw_refcnt))) continue; if (unlikely((tw->tw_family != family) || - atomic_read(&twsk_net(tw)->count))) { + refcount_read(&twsk_net(tw)->count))) { inet_twsk_put(tw); goto restart; } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index bb6239169b1a..6ec670fbbbdd 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -114,7 +114,8 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static struct rtnl_link_ops ipgre_link_ops __read_mostly; static int ipgre_tunnel_init(struct net_device *dev); static void erspan_build_header(struct sk_buff *skb, - __be32 id, u32 index, bool truncate); + u32 id, u32 index, + bool truncate, bool is_ipv4); static unsigned int ipgre_net_id __read_mostly; static unsigned int gre_tap_net_id __read_mostly; @@ -255,34 +256,43 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, { struct net *net = dev_net(skb->dev); struct metadata_dst *tun_dst = NULL; + struct erspan_base_hdr *ershdr; + struct erspan_metadata *pkt_md; struct ip_tunnel_net *itn; struct ip_tunnel *tunnel; - struct erspanhdr *ershdr; const struct iphdr *iph; - __be32 index; + int ver; int len; itn = net_generic(net, erspan_net_id); len = gre_hdr_len + sizeof(*ershdr); + /* Check based hdr len */ if (unlikely(!pskb_may_pull(skb, len))) - return -ENOMEM; + return PACKET_REJECT; iph = ip_hdr(skb); - ershdr = (struct erspanhdr *)(skb->data + gre_hdr_len); + ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); + ver = ershdr->ver; /* The original GRE header does not have key field, * Use ERSPAN 10-bit session ID as key. */ - tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK); - index = ershdr->md.index; + tpi->key = cpu_to_be32(get_session_id(ershdr)); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags | TUNNEL_KEY, iph->saddr, iph->daddr, tpi->key); if (tunnel) { + len = gre_hdr_len + erspan_hdr_len(ver); + if (unlikely(!pskb_may_pull(skb, len))) + return PACKET_REJECT; + + ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); + pkt_md = (struct erspan_metadata *)(ershdr + 1); + if (__iptunnel_pull_header(skb, - gre_hdr_len + sizeof(*ershdr), + len, htons(ETH_P_TEB), false, false) < 0) goto drop; @@ -303,15 +313,21 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, return PACKET_REJECT; md = ip_tunnel_info_opts(&tun_dst->u.tun_info); - if (!md) - return PACKET_REJECT; + memcpy(md, pkt_md, sizeof(*md)); + md->version = ver; - md->index = index; info = &tun_dst->u.tun_info; info->key.tun_flags |= TUNNEL_ERSPAN_OPT; info->options_len = sizeof(*md); } else { - tunnel->index = ntohl(index); + tunnel->erspan_ver = ver; + if (ver == 1) { + tunnel->index = ntohl(pkt_md->u.index); + } else { + tunnel->dir = pkt_md->u.md2.dir; + tunnel->hwid = get_hwid(&pkt_md->u.md2); + } + } skb_reset_mac_header(skb); @@ -405,14 +421,17 @@ static int gre_rcv(struct sk_buff *skb) if (hdr_len < 0) goto drop; - if (unlikely(tpi.proto == htons(ETH_P_ERSPAN))) { + if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) || + tpi.proto == htons(ETH_P_ERSPAN2))) { if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) return 0; + goto out; } if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) return 0; +out: icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); drop: kfree_skb(skb); @@ -560,6 +579,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, bool truncate = false; struct flowi4 fl; int tunnel_hlen; + int version; __be16 df; tun_info = skb_tunnel_info(skb); @@ -568,9 +588,13 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, goto err_free_skb; key = &tun_info->key; + md = ip_tunnel_info_opts(tun_info); + if (!md) + goto err_free_rt; /* ERSPAN has fixed 8 byte GRE header */ - tunnel_hlen = 8 + sizeof(struct erspanhdr); + version = md->version; + tunnel_hlen = 8 + erspan_hdr_len(version); rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen); if (!rt) @@ -584,12 +608,18 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, truncate = true; } - md = ip_tunnel_info_opts(tun_info); - if (!md) + if (version == 1) { + erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)), + ntohl(md->u.index), truncate, true); + } else if (version == 2) { + erspan_build_header_v2(skb, + ntohl(tunnel_id_to_key32(key->tun_id)), + md->u.md2.dir, + get_hwid(&md->u.md2), + truncate, true); + } else { goto err_free_rt; - - erspan_build_header(skb, tunnel_id_to_key32(key->tun_id), - ntohl(md->index), truncate); + } gre_build_header(skb, 8, TUNNEL_SEQ, htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++)); @@ -668,52 +698,6 @@ free_skb: return NETDEV_TX_OK; } -static inline u8 tos_to_cos(u8 tos) -{ - u8 dscp, cos; - - dscp = tos >> 2; - cos = dscp >> 3; - return cos; -} - -static void erspan_build_header(struct sk_buff *skb, - __be32 id, u32 index, bool truncate) -{ - struct iphdr *iphdr = ip_hdr(skb); - struct ethhdr *eth = eth_hdr(skb); - enum erspan_encap_type enc_type; - struct erspanhdr *ershdr; - struct qtag_prefix { - __be16 eth_type; - __be16 tci; - } *qp; - u16 vlan_tci = 0; - - enc_type = ERSPAN_ENCAP_NOVLAN; - - /* If mirrored packet has vlan tag, extract tci and - * perserve vlan header in the mirrored frame. - */ - if (eth->h_proto == htons(ETH_P_8021Q)) { - qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN); - vlan_tci = ntohs(qp->tci); - enc_type = ERSPAN_ENCAP_INFRAME; - } - - skb_push(skb, sizeof(*ershdr)); - ershdr = (struct erspanhdr *)skb->data; - memset(ershdr, 0, sizeof(*ershdr)); - - ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) | - (ERSPAN_VERSION << VER_OFFSET)); - ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) | - ((tos_to_cos(iphdr->tos) << COS_OFFSET) & COS_MASK) | - (enc_type << EN_OFFSET & EN_MASK) | - ((truncate << T_OFFSET) & T_MASK)); - ershdr->md.index = htonl(index & INDEX_MASK); -} - static netdev_tx_t erspan_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -737,7 +721,15 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, } /* Push ERSPAN header */ - erspan_build_header(skb, tunnel->parms.o_key, tunnel->index, truncate); + if (tunnel->erspan_ver == 1) + erspan_build_header(skb, ntohl(tunnel->parms.o_key), + tunnel->index, + truncate, true); + else + erspan_build_header_v2(skb, ntohl(tunnel->parms.o_key), + tunnel->dir, tunnel->hwid, + truncate, true); + tunnel->parms.o_flags &= ~TUNNEL_KEY; __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN)); return NETDEV_TX_OK; @@ -1209,13 +1201,32 @@ static int ipgre_netlink_parms(struct net_device *dev, if (data[IFLA_GRE_FWMARK]) *fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]); - if (data[IFLA_GRE_ERSPAN_INDEX]) { - t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); + if (data[IFLA_GRE_ERSPAN_VER]) { + t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); - if (t->index & ~INDEX_MASK) + if (t->erspan_ver != 1 && t->erspan_ver != 2) return -EINVAL; } + if (t->erspan_ver == 1) { + if (data[IFLA_GRE_ERSPAN_INDEX]) { + t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); + if (t->index & ~INDEX_MASK) + return -EINVAL; + } + } else if (t->erspan_ver == 2) { + if (data[IFLA_GRE_ERSPAN_DIR]) { + t->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]); + if (t->dir & ~(DIR_MASK >> DIR_OFFSET)) + return -EINVAL; + } + if (data[IFLA_GRE_ERSPAN_HWID]) { + t->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]); + if (t->hwid & ~(HWID_MASK >> HWID_OFFSET)) + return -EINVAL; + } + } + return 0; } @@ -1282,7 +1293,7 @@ static int erspan_tunnel_init(struct net_device *dev) tunnel->tun_hlen = 8; tunnel->parms.iph.protocol = IPPROTO_GRE; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + - sizeof(struct erspanhdr); + erspan_hdr_len(tunnel->erspan_ver); t_hlen = tunnel->hlen + sizeof(struct iphdr); dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4; @@ -1310,6 +1321,7 @@ static const struct net_device_ops erspan_netdev_ops = { static void ipgre_tap_setup(struct net_device *dev) { ether_setup(dev); + dev->max_mtu = 0; dev->netdev_ops = &gre_tap_netdev_ops; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; @@ -1412,6 +1424,12 @@ static size_t ipgre_get_size(const struct net_device *dev) nla_total_size(4) + /* IFLA_GRE_ERSPAN_INDEX */ nla_total_size(4) + + /* IFLA_GRE_ERSPAN_VER */ + nla_total_size(1) + + /* IFLA_GRE_ERSPAN_DIR */ + nla_total_size(1) + + /* IFLA_GRE_ERSPAN_HWID */ + nla_total_size(2) + 0; } @@ -1454,9 +1472,18 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) goto nla_put_failure; } - if (t->index) + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver)) + goto nla_put_failure; + + if (t->erspan_ver == 1) { if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index)) goto nla_put_failure; + } else if (t->erspan_ver == 2) { + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir)) + goto nla_put_failure; + if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid)) + goto nla_put_failure; + } return 0; @@ -1492,6 +1519,9 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_IGNORE_DF] = { .type = NLA_U8 }, [IFLA_GRE_FWMARK] = { .type = NLA_U32 }, [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 }, + [IFLA_GRE_ERSPAN_VER] = { .type = NLA_U8 }, + [IFLA_GRE_ERSPAN_DIR] = { .type = NLA_U8 }, + [IFLA_GRE_ERSPAN_HWID] = { .type = NLA_U16 }, }; static struct rtnl_link_ops ipgre_link_ops __read_mostly = { diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 60fb1eb7d7d8..008be04ac1cc 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -808,6 +808,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, { struct net_device *dev = NULL; int ifindex; + int midx; if (optlen != sizeof(int)) goto e_inval; @@ -823,10 +824,13 @@ static int do_ip_setsockopt(struct sock *sk, int level, err = -EADDRNOTAVAIL; if (!dev) break; + + midx = l3mdev_master_ifindex(dev); dev_put(dev); err = -EINVAL; - if (sk->sk_bound_dev_if) + if (sk->sk_bound_dev_if && + (!midx || midx != sk->sk_bound_dev_if)) break; inet->uc_index = ifindex; @@ -1251,11 +1255,8 @@ int ip_setsockopt(struct sock *sk, int level, if (err == -ENOPROTOOPT && optname != IP_HDRINCL && optname != IP_IPSEC_POLICY && optname != IP_XFRM_POLICY && - !ip_mroute_opt(optname)) { - lock_sock(sk); + !ip_mroute_opt(optname)) err = nf_setsockopt(sk, PF_INET, optname, optval, optlen); - release_sock(sk); - } #endif return err; } @@ -1280,12 +1281,9 @@ int compat_ip_setsockopt(struct sock *sk, int level, int optname, if (err == -ENOPROTOOPT && optname != IP_HDRINCL && optname != IP_IPSEC_POLICY && optname != IP_XFRM_POLICY && - !ip_mroute_opt(optname)) { - lock_sock(sk); - err = compat_nf_setsockopt(sk, PF_INET, optname, - optval, optlen); - release_sock(sk); - } + !ip_mroute_opt(optname)) + err = compat_nf_setsockopt(sk, PF_INET, optname, optval, + optlen); #endif return err; } diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index fe6fee728ce4..d786a8441bce 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -349,8 +349,8 @@ static int ip_tunnel_bind_dev(struct net_device *dev) dev->needed_headroom = t_hlen + hlen; mtu -= (dev->hard_header_len + t_hlen); - if (mtu < 68) - mtu = 68; + if (mtu < IPV4_MIN_MTU) + mtu = IPV4_MIN_MTU; return mtu; } @@ -520,8 +520,7 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, else mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; - if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + skb_dst_update_pmtu(skb, mtu); if (skb->protocol == htons(ETH_P_IP)) { if (!skb_is_gso(skb) && @@ -711,9 +710,16 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } } - init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, - tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link, - tunnel->fwmark); + if (tunnel->fwmark) { + init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, + tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link, + tunnel->fwmark); + } + else { + init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, + tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link, + skb->mark); + } if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) goto tx_error; diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 949f432a5f04..51b1669334fe 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -200,7 +200,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, mtu = dst_mtu(dst); if (skb->len > mtu) { - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + skb_dst_update_pmtu(skb, mtu); if (skb->protocol == htons(ETH_P_IP)) { icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index abdebca848c9..f75802ad960f 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -329,39 +329,6 @@ set_sockaddr(struct sockaddr_in *sin, __be32 addr, __be16 port) sin->sin_port = port; } -static int __init ic_devinet_ioctl(unsigned int cmd, struct ifreq *arg) -{ - int res; - - mm_segment_t oldfs = get_fs(); - set_fs(get_ds()); - res = devinet_ioctl(&init_net, cmd, (struct ifreq __user *) arg); - set_fs(oldfs); - return res; -} - -static int __init ic_dev_ioctl(unsigned int cmd, struct ifreq *arg) -{ - int res; - - mm_segment_t oldfs = get_fs(); - set_fs(get_ds()); - res = dev_ioctl(&init_net, cmd, (struct ifreq __user *) arg); - set_fs(oldfs); - return res; -} - -static int __init ic_route_ioctl(unsigned int cmd, struct rtentry *arg) -{ - int res; - - mm_segment_t oldfs = get_fs(); - set_fs(get_ds()); - res = ip_rt_ioctl(&init_net, cmd, (void __user *) arg); - set_fs(oldfs); - return res; -} - /* * Set up interface addresses and routes. */ @@ -375,19 +342,19 @@ static int __init ic_setup_if(void) memset(&ir, 0, sizeof(ir)); strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->dev->name); set_sockaddr(sin, ic_myaddr, 0); - if ((err = ic_devinet_ioctl(SIOCSIFADDR, &ir)) < 0) { + if ((err = devinet_ioctl(&init_net, SIOCSIFADDR, &ir)) < 0) { pr_err("IP-Config: Unable to set interface address (%d)\n", err); return -1; } set_sockaddr(sin, ic_netmask, 0); - if ((err = ic_devinet_ioctl(SIOCSIFNETMASK, &ir)) < 0) { + if ((err = devinet_ioctl(&init_net, SIOCSIFNETMASK, &ir)) < 0) { pr_err("IP-Config: Unable to set interface netmask (%d)\n", err); return -1; } set_sockaddr(sin, ic_myaddr | ~ic_netmask, 0); - if ((err = ic_devinet_ioctl(SIOCSIFBRDADDR, &ir)) < 0) { + if ((err = devinet_ioctl(&init_net, SIOCSIFBRDADDR, &ir)) < 0) { pr_err("IP-Config: Unable to set interface broadcast address (%d)\n", err); return -1; @@ -397,11 +364,11 @@ static int __init ic_setup_if(void) * out, we'll try to muddle along. */ if (ic_dev_mtu != 0) { - strcpy(ir.ifr_name, ic_dev->dev->name); - ir.ifr_mtu = ic_dev_mtu; - if ((err = ic_dev_ioctl(SIOCSIFMTU, &ir)) < 0) + rtnl_lock(); + if ((err = dev_set_mtu(ic_dev->dev, ic_dev_mtu)) < 0) pr_err("IP-Config: Unable to set interface mtu to %d (%d)\n", ic_dev_mtu, err); + rtnl_unlock(); } return 0; } @@ -423,7 +390,7 @@ static int __init ic_setup_routes(void) set_sockaddr((struct sockaddr_in *) &rm.rt_genmask, 0, 0); set_sockaddr((struct sockaddr_in *) &rm.rt_gateway, ic_gateway, 0); rm.rt_flags = RTF_UP | RTF_GATEWAY; - if ((err = ic_route_ioctl(SIOCADDRT, &rm)) < 0) { + if ((err = ip_rt_ioctl(&init_net, SIOCADDRT, &rm)) < 0) { pr_err("IP-Config: Cannot add default route (%d)\n", err); return -1; @@ -1322,7 +1289,6 @@ static int pnp_seq_open(struct inode *indoe, struct file *file) } static const struct file_operations pnp_seq_fops = { - .owner = THIS_MODULE, .open = pnp_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index fd5f19c988e4..b05689bbba31 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -3022,7 +3022,7 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v) const char *name = vif->dev ? vif->dev->name : "none"; seq_printf(seq, - "%2zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", + "%2td %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", vif - mrt->vif_table, name, vif->bytes_in, vif->pkt_in, vif->bytes_out, vif->pkt_out, @@ -3045,7 +3045,6 @@ static int ipmr_vif_open(struct inode *inode, struct file *file) } static const struct file_operations ipmr_vif_fops = { - .owner = THIS_MODULE, .open = ipmr_vif_open, .read = seq_read, .llseek = seq_lseek, @@ -3198,7 +3197,6 @@ static int ipmr_mfc_open(struct inode *inode, struct file *file) } static const struct file_operations ipmr_mfc_fops = { - .owner = THIS_MODULE, .open = ipmr_mfc_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index c0cc6aa8cfaa..e6774ccb7731 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -80,35 +80,7 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t } EXPORT_SYMBOL(ip_route_me_harder); -/* - * Extra routing may needed on local out, as the QUEUE target never - * returns control to the table. - */ - -struct ip_rt_info { - __be32 daddr; - __be32 saddr; - u_int8_t tos; - u_int32_t mark; -}; - -static void nf_ip_saveroute(const struct sk_buff *skb, - struct nf_queue_entry *entry) -{ - struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); - - if (entry->state.hook == NF_INET_LOCAL_OUT) { - const struct iphdr *iph = ip_hdr(skb); - - rt_info->tos = iph->tos; - rt_info->daddr = iph->daddr; - rt_info->saddr = iph->saddr; - rt_info->mark = skb->mark; - } -} - -static int nf_ip_reroute(struct net *net, struct sk_buff *skb, - const struct nf_queue_entry *entry) +int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry) { const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); @@ -119,10 +91,12 @@ static int nf_ip_reroute(struct net *net, struct sk_buff *skb, skb->mark == rt_info->mark && iph->daddr == rt_info->daddr && iph->saddr == rt_info->saddr)) - return ip_route_me_harder(net, skb, RTN_UNSPEC); + return ip_route_me_harder(entry->state.net, skb, + RTN_UNSPEC); } return 0; } +EXPORT_SYMBOL_GPL(nf_ip_reroute); __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u_int8_t protocol) @@ -155,9 +129,9 @@ __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, } EXPORT_SYMBOL(nf_ip_checksum); -static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, unsigned int len, - u_int8_t protocol) +__sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, + unsigned int dataoff, unsigned int len, + u_int8_t protocol) { const struct iphdr *iph = ip_hdr(skb); __sum16 csum = 0; @@ -175,9 +149,10 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, } return csum; } +EXPORT_SYMBOL_GPL(nf_ip_checksum_partial); -static int nf_ip_route(struct net *net, struct dst_entry **dst, - struct flowi *fl, bool strict __always_unused) +int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl, + bool strict __always_unused) { struct rtable *rt = ip_route_output_key(net, &fl->u.ip4); if (IS_ERR(rt)) @@ -185,19 +160,4 @@ static int nf_ip_route(struct net *net, struct dst_entry **dst, *dst = &rt->dst; return 0; } - -static const struct nf_afinfo nf_ip_afinfo = { - .family = AF_INET, - .checksum = nf_ip_checksum, - .checksum_partial = nf_ip_checksum_partial, - .route = nf_ip_route, - .saveroute = nf_ip_saveroute, - .reroute = nf_ip_reroute, - .route_key_size = sizeof(struct ip_rt_info), -}; - -static int __init ipv4_netfilter_init(void) -{ - return nf_register_afinfo(&nf_ip_afinfo); -} -subsys_initcall(ipv4_netfilter_init); +EXPORT_SYMBOL_GPL(nf_ip_route); diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index c11eb1744ab1..5f52236780b4 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -72,11 +72,21 @@ endif # NF_TABLES_IPV4 config NF_TABLES_ARP tristate "ARP nf_tables support" + select NETFILTER_FAMILY_ARP help This option enables the ARP support for nf_tables. endif # NF_TABLES +config NF_FLOW_TABLE_IPV4 + tristate "Netfilter flow table IPv4 module" + depends on NF_CONNTRACK && NF_TABLES + select NF_FLOW_TABLE + help + This option adds the flow table IPv4 support. + + To compile it as a module, choose M here. + config NF_DUP_IPV4 tristate "Netfilter IPv4 packet duplication to alternate destination" depends on !NF_CONNTRACK || NF_CONNTRACK @@ -148,6 +158,7 @@ config NF_NAT_SNMP_BASIC depends on NF_CONNTRACK_SNMP depends on NETFILTER_ADVANCED default NF_NAT && NF_CONNTRACK_SNMP + select ASN1 ---help--- This module implements an Application Layer Gateway (ALG) for @@ -333,6 +344,7 @@ config IP_NF_TARGET_CLUSTERIP depends on NF_CONNTRACK_IPV4 depends on NETFILTER_ADVANCED select NF_CONNTRACK_MARK + select NETFILTER_FAMILY_ARP help The CLUSTERIP target allows you to build load-balancing clusters of network servers without having a dedicated load-balancing @@ -392,6 +404,7 @@ endif # IP_NF_IPTABLES config IP_NF_ARPTABLES tristate "ARP tables support" select NETFILTER_XTABLES + select NETFILTER_FAMILY_ARP depends on NETFILTER_ADVANCED help arptables is a general, extensible packet identification framework. diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index adcdae358365..2dad20eefd26 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -27,9 +27,15 @@ obj-$(CONFIG_NF_REJECT_IPV4) += nf_reject_ipv4.o # NAT helpers (nf_conntrack) obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o + +nf_nat_snmp_basic-y := nf_nat_snmp_basic-asn1.o nf_nat_snmp_basic_main.o +nf_nat_snmp_basic-y : nf_nat_snmp_basic-asn1.h nf_nat_snmp_basic-asn1.c obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o +clean-files := nf_nat_snmp_basic-asn1.c nf_nat_snmp_basic-asn1.h + obj-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o + # NAT protocols (nf_nat) obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o @@ -43,6 +49,9 @@ obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o +# flow table support +obj-$(CONFIG_NF_FLOW_TABLE_IPV4) += nf_flow_table_ipv4.o + # generic IP tables obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index f88221aebc9d..4ffe302f9b82 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -202,13 +202,8 @@ unsigned int arpt_do_table(struct sk_buff *skb, local_bh_disable(); addend = xt_write_recseq_begin(); - private = table->private; + private = READ_ONCE(table->private); /* Address dependency. */ cpu = smp_processor_id(); - /* - * Ensure we load private-> members after we've fetched the base - * pointer. - */ - smp_read_barrier_depends(); table_base = private->entries; jumpstack = (struct arpt_entry **)private->jumpstack[cpu]; @@ -373,7 +368,6 @@ static int mark_source_chains(const struct xt_table_info *newinfo, if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; - e = entry0 + newpos; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; @@ -811,9 +805,8 @@ static int get_info(struct net *net, void __user *user, if (compat) xt_compat_lock(NFPROTO_ARP); #endif - t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name), - "arptable_%s", name); - if (t) { + t = xt_request_find_table_lock(net, NFPROTO_ARP, name); + if (!IS_ERR(t)) { struct arpt_getinfo info; const struct xt_table_info *private = t->private; #ifdef CONFIG_COMPAT @@ -842,7 +835,7 @@ static int get_info(struct net *net, void __user *user, xt_table_unlock(t); module_put(t->me); } else - ret = -ENOENT; + ret = PTR_ERR(t); #ifdef CONFIG_COMPAT if (compat) xt_compat_unlock(NFPROTO_ARP); @@ -867,7 +860,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, NFPROTO_ARP, get.name); - if (t) { + if (!IS_ERR(t)) { const struct xt_table_info *private = t->private; if (get.size == private->size) @@ -879,7 +872,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, module_put(t->me); xt_table_unlock(t); } else - ret = -ENOENT; + ret = PTR_ERR(t); return ret; } @@ -904,10 +897,9 @@ static int __do_replace(struct net *net, const char *name, goto out; } - t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name), - "arptable_%s", name); - if (!t) { - ret = -ENOENT; + t = xt_request_find_table_lock(net, NFPROTO_ARP, name); + if (IS_ERR(t)) { + ret = PTR_ERR(t); goto free_newinfo_counters_untrans; } @@ -1021,8 +1013,8 @@ static int do_add_counters(struct net *net, const void __user *user, return PTR_ERR(paddc); t = xt_find_table_lock(net, NFPROTO_ARP, tmp.name); - if (!t) { - ret = -ENOENT; + if (IS_ERR(t)) { + ret = PTR_ERR(t); goto free; } @@ -1409,7 +1401,7 @@ static int compat_get_entries(struct net *net, xt_compat_lock(NFPROTO_ARP); t = xt_find_table_lock(net, NFPROTO_ARP, get.name); - if (t) { + if (!IS_ERR(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; @@ -1424,7 +1416,7 @@ static int compat_get_entries(struct net *net, module_put(t->me); xt_table_unlock(t); } else - ret = -ENOENT; + ret = PTR_ERR(t); xt_compat_unlock(NFPROTO_ARP); return ret; @@ -1659,7 +1651,6 @@ static int __init arp_tables_init(void) if (ret < 0) goto err4; - pr_info("arp_tables: (C) 2002 David S. Miller\n"); return 0; err4: diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 4cbe5e80f3bf..9a71f3149507 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -260,13 +260,8 @@ ipt_do_table(struct sk_buff *skb, WARN_ON(!(table->valid_hooks & (1 << hook))); local_bh_disable(); addend = xt_write_recseq_begin(); - private = table->private; + private = READ_ONCE(table->private); /* Address dependency. */ cpu = smp_processor_id(); - /* - * Ensure we load private-> members after we've fetched the base - * pointer. - */ - smp_read_barrier_depends(); table_base = private->entries; jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; @@ -439,7 +434,6 @@ mark_source_chains(const struct xt_table_info *newinfo, if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; - e = entry0 + newpos; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; @@ -974,9 +968,8 @@ static int get_info(struct net *net, void __user *user, if (compat) xt_compat_lock(AF_INET); #endif - t = try_then_request_module(xt_find_table_lock(net, AF_INET, name), - "iptable_%s", name); - if (t) { + t = xt_request_find_table_lock(net, AF_INET, name); + if (!IS_ERR(t)) { struct ipt_getinfo info; const struct xt_table_info *private = t->private; #ifdef CONFIG_COMPAT @@ -1006,7 +999,7 @@ static int get_info(struct net *net, void __user *user, xt_table_unlock(t); module_put(t->me); } else - ret = -ENOENT; + ret = PTR_ERR(t); #ifdef CONFIG_COMPAT if (compat) xt_compat_unlock(AF_INET); @@ -1031,7 +1024,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr, get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, AF_INET, get.name); - if (t) { + if (!IS_ERR(t)) { const struct xt_table_info *private = t->private; if (get.size == private->size) ret = copy_entries_to_user(private->size, @@ -1042,7 +1035,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr, module_put(t->me); xt_table_unlock(t); } else - ret = -ENOENT; + ret = PTR_ERR(t); return ret; } @@ -1065,10 +1058,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, goto out; } - t = try_then_request_module(xt_find_table_lock(net, AF_INET, name), - "iptable_%s", name); - if (!t) { - ret = -ENOENT; + t = xt_request_find_table_lock(net, AF_INET, name); + if (IS_ERR(t)) { + ret = PTR_ERR(t); goto free_newinfo_counters_untrans; } @@ -1182,8 +1174,8 @@ do_add_counters(struct net *net, const void __user *user, return PTR_ERR(paddc); t = xt_find_table_lock(net, AF_INET, tmp.name); - if (!t) { - ret = -ENOENT; + if (IS_ERR(t)) { + ret = PTR_ERR(t); goto free; } @@ -1626,7 +1618,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, xt_compat_lock(AF_INET); t = xt_find_table_lock(net, AF_INET, get.name); - if (t) { + if (!IS_ERR(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; ret = compat_table_info(private, &info); @@ -1640,7 +1632,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, module_put(t->me); xt_table_unlock(t); } else - ret = -ENOENT; + ret = PTR_ERR(t); xt_compat_unlock(AF_INET); return ret; @@ -1942,7 +1934,6 @@ static int __init ip_tables_init(void) if (ret < 0) goto err5; - pr_info("(C) 2000-2006 Netfilter Core Team\n"); return 0; err5: diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 17b4ca562944..3a84a60f6b39 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -431,7 +431,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; const struct ipt_entry *e = par->entryinfo; struct clusterip_config *config; - int ret; + int ret, i; if (par->nft_compat) { pr_err("cannot use CLUSTERIP target from nftables compat\n"); @@ -450,8 +450,18 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) pr_info("Please specify destination IP\n"); return -EINVAL; } - - /* FIXME: further sanity checks */ + if (cipinfo->num_local_nodes > ARRAY_SIZE(cipinfo->local_nodes)) { + pr_info("bad num_local_nodes %u\n", cipinfo->num_local_nodes); + return -EINVAL; + } + for (i = 0; i < cipinfo->num_local_nodes; i++) { + if (cipinfo->local_nodes[i] - 1 >= + sizeof(config->local_nodes) * 8) { + pr_info("bad local_nodes[%d] %u\n", + i, cipinfo->local_nodes[i]); + return -EINVAL; + } + } config = clusterip_config_find_get(par->net, e->ip.dst.s_addr, 1); if (!config) { @@ -776,7 +786,6 @@ static ssize_t clusterip_proc_write(struct file *file, const char __user *input, } static const struct file_operations clusterip_proc_fops = { - .owner = THIS_MODULE, .open = clusterip_proc_open, .read = seq_read, .write = clusterip_proc_write, @@ -813,12 +822,13 @@ static int clusterip_net_init(struct net *net) static void clusterip_net_exit(struct net *net) { -#ifdef CONFIG_PROC_FS struct clusterip_net *cn = net_generic(net, clusterip_net_id); +#ifdef CONFIG_PROC_FS proc_remove(cn->procdir); cn->procdir = NULL; #endif nf_unregister_net_hook(net, &cip_arp_ops); + WARN_ON_ONCE(!list_empty(&cn->configs)); } static struct pernet_operations clusterip_net_ops = { diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 7667f223d7f8..9ac92ea7b93c 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -38,12 +38,6 @@ static unsigned int iptable_filter_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - if (state->hook == NF_INET_LOCAL_OUT && - (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr))) - /* root is playing with raw sockets. */ - return NF_ACCEPT; - return ipt_do_table(skb, state, state->net->ipv4.iptable_filter); } diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index aebdb337fd7e..dea138ca8925 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -49,11 +49,6 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) u_int32_t mark; int err; - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) - return NF_ACCEPT; - /* Save things which could affect route */ mark = skb->mark; iph = ip_hdr(skb); diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index a1a07b338ccf..0f7255cc65ee 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -72,6 +72,7 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = { { .hook = iptable_nat_ipv4_in, .pf = NFPROTO_IPV4, + .nat_hook = true, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_NAT_DST, }, @@ -79,6 +80,7 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = { { .hook = iptable_nat_ipv4_out, .pf = NFPROTO_IPV4, + .nat_hook = true, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_NAT_SRC, }, @@ -86,6 +88,7 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = { { .hook = iptable_nat_ipv4_local_fn, .pf = NFPROTO_IPV4, + .nat_hook = true, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST, }, @@ -93,6 +96,7 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = { { .hook = iptable_nat_ipv4_fn, .pf = NFPROTO_IPV4, + .nat_hook = true, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC, }, diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 2642ecd2645c..960625aabf04 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -3,6 +3,7 @@ * * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/slab.h> @@ -12,6 +13,10 @@ static int __net_init iptable_raw_table_init(struct net *net); +static bool raw_before_defrag __read_mostly; +MODULE_PARM_DESC(raw_before_defrag, "Enable raw table before defrag"); +module_param(raw_before_defrag, bool, 0000); + static const struct xt_table packet_raw = { .name = "raw", .valid_hooks = RAW_VALID_HOOKS, @@ -21,17 +26,20 @@ static const struct xt_table packet_raw = { .table_init = iptable_raw_table_init, }; +static const struct xt_table packet_raw_before_defrag = { + .name = "raw", + .valid_hooks = RAW_VALID_HOOKS, + .me = THIS_MODULE, + .af = NFPROTO_IPV4, + .priority = NF_IP_PRI_RAW_BEFORE_DEFRAG, + .table_init = iptable_raw_table_init, +}; + /* The work comes in here from netfilter.c. */ static unsigned int iptable_raw_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - if (state->hook == NF_INET_LOCAL_OUT && - (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr))) - /* root is playing with raw sockets. */ - return NF_ACCEPT; - return ipt_do_table(skb, state, state->net->ipv4.iptable_raw); } @@ -40,15 +48,19 @@ static struct nf_hook_ops *rawtable_ops __read_mostly; static int __net_init iptable_raw_table_init(struct net *net) { struct ipt_replace *repl; + const struct xt_table *table = &packet_raw; int ret; + if (raw_before_defrag) + table = &packet_raw_before_defrag; + if (net->ipv4.iptable_raw) return 0; - repl = ipt_alloc_initial_table(&packet_raw); + repl = ipt_alloc_initial_table(table); if (repl == NULL) return -ENOMEM; - ret = ipt_register_table(net, &packet_raw, repl, rawtable_ops, + ret = ipt_register_table(net, table, repl, rawtable_ops, &net->ipv4.iptable_raw); kfree(repl); return ret; @@ -69,8 +81,15 @@ static struct pernet_operations iptable_raw_net_ops = { static int __init iptable_raw_init(void) { int ret; + const struct xt_table *table = &packet_raw; + + if (raw_before_defrag) { + table = &packet_raw_before_defrag; + + pr_info("Enabling raw table before defrag\n"); + } - rawtable_ops = xt_hook_ops_alloc(&packet_raw, iptable_raw_hook); + rawtable_ops = xt_hook_ops_alloc(table, iptable_raw_hook); if (IS_ERR(rawtable_ops)) return PTR_ERR(rawtable_ops); diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index ff226596e4b5..e5379fe57b64 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -43,12 +43,6 @@ static unsigned int iptable_security_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - if (state->hook == NF_INET_LOCAL_OUT && - (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr))) - /* Somebody is playing with raw sockets. */ - return NF_ACCEPT; - return ipt_do_table(skb, state, state->net->ipv4.iptable_security); } diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 89af9d88ca21..b50721d9d30e 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -154,11 +154,6 @@ static unsigned int ipv4_conntrack_local(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) - return NF_ACCEPT; - if (ip_is_fragment(ip_hdr(skb))) /* IP_NODEFRAG setsockopt set */ return NF_ACCEPT; @@ -218,15 +213,19 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) struct nf_conntrack_tuple tuple; memset(&tuple, 0, sizeof(tuple)); + + lock_sock(sk); tuple.src.u3.ip = inet->inet_rcv_saddr; tuple.src.u.tcp.port = inet->inet_sport; tuple.dst.u3.ip = inet->inet_daddr; tuple.dst.u.tcp.port = inet->inet_dport; tuple.src.l3num = PF_INET; tuple.dst.protonum = sk->sk_protocol; + release_sock(sk); /* We only do TCP and SCTP at the moment: is there a better way? */ - if (sk->sk_protocol != IPPROTO_TCP && sk->sk_protocol != IPPROTO_SCTP) { + if (tuple.dst.protonum != IPPROTO_TCP && + tuple.dst.protonum != IPPROTO_SCTP) { pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n"); return -ENOPROTOOPT; } @@ -368,7 +367,7 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET)); MODULE_ALIAS("ip_conntrack"); MODULE_LICENSE("GPL"); -static struct nf_conntrack_l4proto *builtin_l4proto4[] = { +static const struct nf_conntrack_l4proto * const builtin_l4proto4[] = { &nf_conntrack_l4proto_tcp4, &nf_conntrack_l4proto_udp4, &nf_conntrack_l4proto_icmp, diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 1849fedd9b81..5c15beafa711 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -22,7 +22,7 @@ #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_log.h> -static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ; +static const unsigned int nf_ct_icmp_timeout = 30*HZ; static inline struct nf_icmp_net *icmp_pernet(struct net *net) { @@ -351,7 +351,7 @@ static struct nf_proto_net *icmp_get_net_proto(struct net *net) return &net->ct.nf_ct_proto.icmp.pn; } -struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = +const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp = { .l3proto = PF_INET, .l4proto = IPPROTO_ICMP, diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 37fe1616ca0b..a0d3ad60a411 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -78,6 +78,8 @@ static unsigned int ipv4_conntrack_defrag(void *priv, if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb))) return NF_ACCEPT; #endif + if (skb->_nfct == IP_CT_UNTRACKED) + return NF_ACCEPT; #endif /* Gather fragments. */ if (ip_is_fragment(ip_hdr(skb))) { diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c new file mode 100644 index 000000000000..b2d01eb25f2c --- /dev/null +++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c @@ -0,0 +1,284 @@ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/rhashtable.h> +#include <linux/ip.h> +#include <linux/netdevice.h> +#include <net/ip.h> +#include <net/neighbour.h> +#include <net/netfilter/nf_flow_table.h> +#include <net/netfilter/nf_tables.h> +/* For layer 4 checksum field offset. */ +#include <linux/tcp.h> +#include <linux/udp.h> + +static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff, + __be32 addr, __be32 new_addr) +{ + struct tcphdr *tcph; + + if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) || + skb_try_make_writable(skb, thoff + sizeof(*tcph))) + return -1; + + tcph = (void *)(skb_network_header(skb) + thoff); + inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true); + + return 0; +} + +static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff, + __be32 addr, __be32 new_addr) +{ + struct udphdr *udph; + + if (!pskb_may_pull(skb, thoff + sizeof(*udph)) || + skb_try_make_writable(skb, thoff + sizeof(*udph))) + return -1; + + udph = (void *)(skb_network_header(skb) + thoff); + if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { + inet_proto_csum_replace4(&udph->check, skb, addr, + new_addr, true); + if (!udph->check) + udph->check = CSUM_MANGLED_0; + } + + return 0; +} + +static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph, + unsigned int thoff, __be32 addr, + __be32 new_addr) +{ + switch (iph->protocol) { + case IPPROTO_TCP: + if (nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr) < 0) + return NF_DROP; + break; + case IPPROTO_UDP: + if (nf_flow_nat_ip_udp(skb, thoff, addr, new_addr) < 0) + return NF_DROP; + break; + } + + return 0; +} + +static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb, + struct iphdr *iph, unsigned int thoff, + enum flow_offload_tuple_dir dir) +{ + __be32 addr, new_addr; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = iph->saddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr; + iph->saddr = new_addr; + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = iph->daddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr; + iph->daddr = new_addr; + break; + default: + return -1; + } + csum_replace4(&iph->check, addr, new_addr); + + return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr); +} + +static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb, + struct iphdr *iph, unsigned int thoff, + enum flow_offload_tuple_dir dir) +{ + __be32 addr, new_addr; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = iph->daddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr; + iph->daddr = new_addr; + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = iph->saddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr; + iph->saddr = new_addr; + break; + default: + return -1; + } + + return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr); +} + +static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb, + enum flow_offload_tuple_dir dir) +{ + struct iphdr *iph = ip_hdr(skb); + unsigned int thoff = iph->ihl * 4; + + if (flow->flags & FLOW_OFFLOAD_SNAT && + (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 || + nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0)) + return -1; + if (flow->flags & FLOW_OFFLOAD_DNAT && + (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 || + nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0)) + return -1; + + return 0; +} + +static bool ip_has_options(unsigned int thoff) +{ + return thoff != sizeof(struct iphdr); +} + +static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev, + struct flow_offload_tuple *tuple) +{ + struct flow_ports *ports; + unsigned int thoff; + struct iphdr *iph; + + if (!pskb_may_pull(skb, sizeof(*iph))) + return -1; + + iph = ip_hdr(skb); + thoff = iph->ihl * 4; + + if (ip_is_fragment(iph) || + unlikely(ip_has_options(thoff))) + return -1; + + if (iph->protocol != IPPROTO_TCP && + iph->protocol != IPPROTO_UDP) + return -1; + + thoff = iph->ihl * 4; + if (!pskb_may_pull(skb, thoff + sizeof(*ports))) + return -1; + + ports = (struct flow_ports *)(skb_network_header(skb) + thoff); + + tuple->src_v4.s_addr = iph->saddr; + tuple->dst_v4.s_addr = iph->daddr; + tuple->src_port = ports->source; + tuple->dst_port = ports->dest; + tuple->l3proto = AF_INET; + tuple->l4proto = iph->protocol; + tuple->iifidx = dev->ifindex; + + return 0; +} + +/* Based on ip_exceeds_mtu(). */ +static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) +{ + if (skb->len <= mtu) + return false; + + if ((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) + return false; + + if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) + return false; + + return true; +} + +static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rtable *rt) +{ + u32 mtu; + + mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); + if (__nf_flow_exceeds_mtu(skb, mtu)) + return true; + + return false; +} + +unsigned int +nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct flow_offload_tuple_rhash *tuplehash; + struct nf_flowtable *flow_table = priv; + struct flow_offload_tuple tuple = {}; + enum flow_offload_tuple_dir dir; + struct flow_offload *flow; + struct net_device *outdev; + const struct rtable *rt; + struct iphdr *iph; + __be32 nexthop; + + if (skb->protocol != htons(ETH_P_IP)) + return NF_ACCEPT; + + if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0) + return NF_ACCEPT; + + tuplehash = flow_offload_lookup(flow_table, &tuple); + if (tuplehash == NULL) + return NF_ACCEPT; + + outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx); + if (!outdev) + return NF_ACCEPT; + + dir = tuplehash->tuple.dir; + flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + + rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache; + if (unlikely(nf_flow_exceeds_mtu(skb, rt))) + return NF_ACCEPT; + + if (skb_try_make_writable(skb, sizeof(*iph))) + return NF_DROP; + + if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) && + nf_flow_nat_ip(flow, skb, dir) < 0) + return NF_DROP; + + flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; + iph = ip_hdr(skb); + ip_decrease_ttl(iph); + + skb->dev = outdev; + nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); + neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb); + + return NF_STOLEN; +} +EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook); + +static struct nf_flowtable_type flowtable_ipv4 = { + .family = NFPROTO_IPV4, + .params = &nf_flow_offload_rhash_params, + .gc = nf_flow_offload_work_gc, + .hook = nf_flow_offload_ip_hook, + .owner = THIS_MODULE, +}; + +static int __init nf_flow_ipv4_module_init(void) +{ + nft_register_flowtable_type(&flowtable_ipv4); + + return 0; +} + +static void __exit nf_flow_ipv4_module_exit(void) +{ + nft_unregister_flowtable_type(&flowtable_ipv4); +} + +module_init(nf_flow_ipv4_module_init); +module_exit(nf_flow_ipv4_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_ALIAS_NF_FLOWTABLE(AF_INET); diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index 0443ca4120b0..f7ff6a364d7b 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -356,11 +356,6 @@ nf_nat_ipv4_out(void *priv, struct sk_buff *skb, #endif unsigned int ret; - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) - return NF_ACCEPT; - ret = nf_nat_ipv4_fn(priv, skb, state, do_chain); #ifdef CONFIG_XFRM if (ret != NF_DROP && ret != NF_STOLEN && @@ -396,11 +391,6 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, unsigned int ret; int err; - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) - return NF_ACCEPT; - ret = nf_nat_ipv4_fn(priv, skb, state, do_chain); if (ret != NF_DROP && ret != NF_STOLEN && (ct = nf_ct_get(skb, &ctinfo)) != NULL) { diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.asn1 b/net/ipv4/netfilter/nf_nat_snmp_basic.asn1 new file mode 100644 index 000000000000..24b73268f362 --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.asn1 @@ -0,0 +1,177 @@ +Message ::= + SEQUENCE { + version + INTEGER ({snmp_version}), + + community + OCTET STRING, + + pdu + PDUs + } + + +ObjectName ::= + OBJECT IDENTIFIER + +ObjectSyntax ::= + CHOICE { + simple + SimpleSyntax, + + application-wide + ApplicationSyntax + } + +SimpleSyntax ::= + CHOICE { + integer-value + INTEGER, + + string-value + OCTET STRING, + + objectID-value + OBJECT IDENTIFIER + } + +ApplicationSyntax ::= + CHOICE { + ipAddress-value + IpAddress, + + counter-value + Counter32, + + timeticks-value + TimeTicks, + + arbitrary-value + Opaque, + + big-counter-value + Counter64, + + unsigned-integer-value + Unsigned32 + } + +IpAddress ::= + [APPLICATION 0] + IMPLICIT OCTET STRING OPTIONAL ({snmp_helper}) + +Counter32 ::= + [APPLICATION 1] + IMPLICIT INTEGER OPTIONAL + +Unsigned32 ::= + [APPLICATION 2] + IMPLICIT INTEGER OPTIONAL + +Gauge32 ::= Unsigned32 OPTIONAL + +TimeTicks ::= + [APPLICATION 3] + IMPLICIT INTEGER OPTIONAL + +Opaque ::= + [APPLICATION 4] + IMPLICIT OCTET STRING OPTIONAL + +Counter64 ::= + [APPLICATION 6] + IMPLICIT INTEGER OPTIONAL + +PDUs ::= + CHOICE { + get-request + GetRequest-PDU, + + get-next-request + GetNextRequest-PDU, + + get-bulk-request + GetBulkRequest-PDU, + + response + Response-PDU, + + set-request + SetRequest-PDU, + + inform-request + InformRequest-PDU, + + snmpV2-trap + SNMPv2-Trap-PDU, + + report + Report-PDU + } + +GetRequest-PDU ::= + [0] IMPLICIT PDU OPTIONAL + +GetNextRequest-PDU ::= + [1] IMPLICIT PDU OPTIONAL + +Response-PDU ::= + [2] IMPLICIT PDU OPTIONAL + +SetRequest-PDU ::= + [3] IMPLICIT PDU OPTIONAL + +-- [4] is obsolete + +GetBulkRequest-PDU ::= + [5] IMPLICIT PDU OPTIONAL + +InformRequest-PDU ::= + [6] IMPLICIT PDU OPTIONAL + +SNMPv2-Trap-PDU ::= + [7] IMPLICIT PDU OPTIONAL + +Report-PDU ::= + [8] IMPLICIT PDU OPTIONAL + +PDU ::= + SEQUENCE { + request-id + INTEGER, + + error-status + INTEGER, + + error-index + INTEGER, + + variable-bindings + VarBindList + } + + +VarBind ::= + SEQUENCE { + name + ObjectName, + + CHOICE { + value + ObjectSyntax, + + unSpecified + NULL, + + noSuchObject + [0] IMPLICIT NULL, + + noSuchInstance + [1] IMPLICIT NULL, + + endOfMibView + [2] IMPLICIT NULL + } +} + +VarBindList ::= SEQUENCE OF VarBind diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c deleted file mode 100644 index d5b1e0b3f687..000000000000 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ /dev/null @@ -1,1286 +0,0 @@ -/* - * nf_nat_snmp_basic.c - * - * Basic SNMP Application Layer Gateway - * - * This IP NAT module is intended for use with SNMP network - * discovery and monitoring applications where target networks use - * conflicting private address realms. - * - * Static NAT is used to remap the networks from the view of the network - * management system at the IP layer, and this module remaps some application - * layer addresses to match. - * - * The simplest form of ALG is performed, where only tagged IP addresses - * are modified. The module does not need to be MIB aware and only scans - * messages at the ASN.1/BER level. - * - * Currently, only SNMPv1 and SNMPv2 are supported. - * - * More information on ALG and associated issues can be found in - * RFC 2962 - * - * The ASB.1/BER parsing code is derived from the gxsnmp package by Gregory - * McLean & Jochen Friedrich, stripped down for use in the kernel. - * - * Copyright (c) 2000 RP Internet (www.rpi.net.au). - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. - * - * Author: James Morris <jmorris@intercode.com.au> - * - * Copyright (c) 2006-2010 Patrick McHardy <kaber@trash.net> - */ -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/udp.h> -#include <net/checksum.h> -#include <net/udp.h> - -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_conntrack_expect.h> -#include <net/netfilter/nf_conntrack_helper.h> -#include <net/netfilter/nf_nat_helper.h> -#include <linux/netfilter/nf_conntrack_snmp.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); -MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway"); -MODULE_ALIAS("ip_nat_snmp_basic"); - -#define SNMP_PORT 161 -#define SNMP_TRAP_PORT 162 -#define NOCT1(n) (*(u8 *)(n)) - -static int debug; -static DEFINE_SPINLOCK(snmp_lock); - -/* - * Application layer address mapping mimics the NAT mapping, but - * only for the first octet in this case (a more flexible system - * can be implemented if needed). - */ -struct oct1_map -{ - u_int8_t from; - u_int8_t to; -}; - - -/***************************************************************************** - * - * Basic ASN.1 decoding routines (gxsnmp author Dirk Wisse) - * - *****************************************************************************/ - -/* Class */ -#define ASN1_UNI 0 /* Universal */ -#define ASN1_APL 1 /* Application */ -#define ASN1_CTX 2 /* Context */ -#define ASN1_PRV 3 /* Private */ - -/* Tag */ -#define ASN1_EOC 0 /* End Of Contents */ -#define ASN1_BOL 1 /* Boolean */ -#define ASN1_INT 2 /* Integer */ -#define ASN1_BTS 3 /* Bit String */ -#define ASN1_OTS 4 /* Octet String */ -#define ASN1_NUL 5 /* Null */ -#define ASN1_OJI 6 /* Object Identifier */ -#define ASN1_OJD 7 /* Object Description */ -#define ASN1_EXT 8 /* External */ -#define ASN1_SEQ 16 /* Sequence */ -#define ASN1_SET 17 /* Set */ -#define ASN1_NUMSTR 18 /* Numerical String */ -#define ASN1_PRNSTR 19 /* Printable String */ -#define ASN1_TEXSTR 20 /* Teletext String */ -#define ASN1_VIDSTR 21 /* Video String */ -#define ASN1_IA5STR 22 /* IA5 String */ -#define ASN1_UNITIM 23 /* Universal Time */ -#define ASN1_GENTIM 24 /* General Time */ -#define ASN1_GRASTR 25 /* Graphical String */ -#define ASN1_VISSTR 26 /* Visible String */ -#define ASN1_GENSTR 27 /* General String */ - -/* Primitive / Constructed methods*/ -#define ASN1_PRI 0 /* Primitive */ -#define ASN1_CON 1 /* Constructed */ - -/* - * Error codes. - */ -#define ASN1_ERR_NOERROR 0 -#define ASN1_ERR_DEC_EMPTY 2 -#define ASN1_ERR_DEC_EOC_MISMATCH 3 -#define ASN1_ERR_DEC_LENGTH_MISMATCH 4 -#define ASN1_ERR_DEC_BADVALUE 5 - -/* - * ASN.1 context. - */ -struct asn1_ctx -{ - int error; /* Error condition */ - unsigned char *pointer; /* Octet just to be decoded */ - unsigned char *begin; /* First octet */ - unsigned char *end; /* Octet after last octet */ -}; - -/* - * Octet string (not null terminated) - */ -struct asn1_octstr -{ - unsigned char *data; - unsigned int len; -}; - -static void asn1_open(struct asn1_ctx *ctx, - unsigned char *buf, - unsigned int len) -{ - ctx->begin = buf; - ctx->end = buf + len; - ctx->pointer = buf; - ctx->error = ASN1_ERR_NOERROR; -} - -static unsigned char asn1_octet_decode(struct asn1_ctx *ctx, unsigned char *ch) -{ - if (ctx->pointer >= ctx->end) { - ctx->error = ASN1_ERR_DEC_EMPTY; - return 0; - } - *ch = *(ctx->pointer)++; - return 1; -} - -static unsigned char asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag) -{ - unsigned char ch; - - *tag = 0; - - do - { - if (!asn1_octet_decode(ctx, &ch)) - return 0; - *tag <<= 7; - *tag |= ch & 0x7F; - } while ((ch & 0x80) == 0x80); - return 1; -} - -static unsigned char asn1_id_decode(struct asn1_ctx *ctx, - unsigned int *cls, - unsigned int *con, - unsigned int *tag) -{ - unsigned char ch; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *cls = (ch & 0xC0) >> 6; - *con = (ch & 0x20) >> 5; - *tag = (ch & 0x1F); - - if (*tag == 0x1F) { - if (!asn1_tag_decode(ctx, tag)) - return 0; - } - return 1; -} - -static unsigned char asn1_length_decode(struct asn1_ctx *ctx, - unsigned int *def, - unsigned int *len) -{ - unsigned char ch, cnt; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - if (ch == 0x80) - *def = 0; - else { - *def = 1; - - if (ch < 0x80) - *len = ch; - else { - cnt = ch & 0x7F; - *len = 0; - - while (cnt > 0) { - if (!asn1_octet_decode(ctx, &ch)) - return 0; - *len <<= 8; - *len |= ch; - cnt--; - } - } - } - - /* don't trust len bigger than ctx buffer */ - if (*len > ctx->end - ctx->pointer) - return 0; - - return 1; -} - -static unsigned char asn1_header_decode(struct asn1_ctx *ctx, - unsigned char **eoc, - unsigned int *cls, - unsigned int *con, - unsigned int *tag) -{ - unsigned int def, len; - - if (!asn1_id_decode(ctx, cls, con, tag)) - return 0; - - def = len = 0; - if (!asn1_length_decode(ctx, &def, &len)) - return 0; - - /* primitive shall be definite, indefinite shall be constructed */ - if (*con == ASN1_PRI && !def) - return 0; - - if (def) - *eoc = ctx->pointer + len; - else - *eoc = NULL; - return 1; -} - -static unsigned char asn1_eoc_decode(struct asn1_ctx *ctx, unsigned char *eoc) -{ - unsigned char ch; - - if (eoc == NULL) { - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - if (ch != 0x00) { - ctx->error = ASN1_ERR_DEC_EOC_MISMATCH; - return 0; - } - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - if (ch != 0x00) { - ctx->error = ASN1_ERR_DEC_EOC_MISMATCH; - return 0; - } - return 1; - } else { - if (ctx->pointer != eoc) { - ctx->error = ASN1_ERR_DEC_LENGTH_MISMATCH; - return 0; - } - return 1; - } -} - -static unsigned char asn1_null_decode(struct asn1_ctx *ctx, unsigned char *eoc) -{ - ctx->pointer = eoc; - return 1; -} - -static unsigned char asn1_long_decode(struct asn1_ctx *ctx, - unsigned char *eoc, - long *integer) -{ - unsigned char ch; - unsigned int len; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer = (signed char) ch; - len = 1; - - while (ctx->pointer < eoc) { - if (++len > sizeof (long)) { - ctx->error = ASN1_ERR_DEC_BADVALUE; - return 0; - } - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer <<= 8; - *integer |= ch; - } - return 1; -} - -static unsigned char asn1_uint_decode(struct asn1_ctx *ctx, - unsigned char *eoc, - unsigned int *integer) -{ - unsigned char ch; - unsigned int len; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer = ch; - if (ch == 0) len = 0; - else len = 1; - - while (ctx->pointer < eoc) { - if (++len > sizeof (unsigned int)) { - ctx->error = ASN1_ERR_DEC_BADVALUE; - return 0; - } - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer <<= 8; - *integer |= ch; - } - return 1; -} - -static unsigned char asn1_ulong_decode(struct asn1_ctx *ctx, - unsigned char *eoc, - unsigned long *integer) -{ - unsigned char ch; - unsigned int len; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer = ch; - if (ch == 0) len = 0; - else len = 1; - - while (ctx->pointer < eoc) { - if (++len > sizeof (unsigned long)) { - ctx->error = ASN1_ERR_DEC_BADVALUE; - return 0; - } - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer <<= 8; - *integer |= ch; - } - return 1; -} - -static unsigned char asn1_octets_decode(struct asn1_ctx *ctx, - unsigned char *eoc, - unsigned char **octets, - unsigned int *len) -{ - unsigned char *ptr; - - *len = 0; - - *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC); - if (*octets == NULL) - return 0; - - ptr = *octets; - while (ctx->pointer < eoc) { - if (!asn1_octet_decode(ctx, ptr++)) { - kfree(*octets); - *octets = NULL; - return 0; - } - (*len)++; - } - return 1; -} - -static unsigned char asn1_subid_decode(struct asn1_ctx *ctx, - unsigned long *subid) -{ - unsigned char ch; - - *subid = 0; - - do { - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *subid <<= 7; - *subid |= ch & 0x7F; - } while ((ch & 0x80) == 0x80); - return 1; -} - -static unsigned char asn1_oid_decode(struct asn1_ctx *ctx, - unsigned char *eoc, - unsigned long **oid, - unsigned int *len) -{ - unsigned long subid; - unsigned long *optr; - size_t size; - - size = eoc - ctx->pointer + 1; - - /* first subid actually encodes first two subids */ - if (size < 2 || size > ULONG_MAX/sizeof(unsigned long)) - return 0; - - *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); - if (*oid == NULL) - return 0; - - optr = *oid; - - if (!asn1_subid_decode(ctx, &subid)) { - kfree(*oid); - *oid = NULL; - return 0; - } - - if (subid < 40) { - optr[0] = 0; - optr[1] = subid; - } else if (subid < 80) { - optr[0] = 1; - optr[1] = subid - 40; - } else { - optr[0] = 2; - optr[1] = subid - 80; - } - - *len = 2; - optr += 2; - - while (ctx->pointer < eoc) { - if (++(*len) > size) { - ctx->error = ASN1_ERR_DEC_BADVALUE; - kfree(*oid); - *oid = NULL; - return 0; - } - - if (!asn1_subid_decode(ctx, optr++)) { - kfree(*oid); - *oid = NULL; - return 0; - } - } - return 1; -} - -/***************************************************************************** - * - * SNMP decoding routines (gxsnmp author Dirk Wisse) - * - *****************************************************************************/ - -/* SNMP Versions */ -#define SNMP_V1 0 -#define SNMP_V2C 1 -#define SNMP_V2 2 -#define SNMP_V3 3 - -/* Default Sizes */ -#define SNMP_SIZE_COMM 256 -#define SNMP_SIZE_OBJECTID 128 -#define SNMP_SIZE_BUFCHR 256 -#define SNMP_SIZE_BUFINT 128 -#define SNMP_SIZE_SMALLOBJECTID 16 - -/* Requests */ -#define SNMP_PDU_GET 0 -#define SNMP_PDU_NEXT 1 -#define SNMP_PDU_RESPONSE 2 -#define SNMP_PDU_SET 3 -#define SNMP_PDU_TRAP1 4 -#define SNMP_PDU_BULK 5 -#define SNMP_PDU_INFORM 6 -#define SNMP_PDU_TRAP2 7 - -/* Errors */ -#define SNMP_NOERROR 0 -#define SNMP_TOOBIG 1 -#define SNMP_NOSUCHNAME 2 -#define SNMP_BADVALUE 3 -#define SNMP_READONLY 4 -#define SNMP_GENERROR 5 -#define SNMP_NOACCESS 6 -#define SNMP_WRONGTYPE 7 -#define SNMP_WRONGLENGTH 8 -#define SNMP_WRONGENCODING 9 -#define SNMP_WRONGVALUE 10 -#define SNMP_NOCREATION 11 -#define SNMP_INCONSISTENTVALUE 12 -#define SNMP_RESOURCEUNAVAILABLE 13 -#define SNMP_COMMITFAILED 14 -#define SNMP_UNDOFAILED 15 -#define SNMP_AUTHORIZATIONERROR 16 -#define SNMP_NOTWRITABLE 17 -#define SNMP_INCONSISTENTNAME 18 - -/* General SNMP V1 Traps */ -#define SNMP_TRAP_COLDSTART 0 -#define SNMP_TRAP_WARMSTART 1 -#define SNMP_TRAP_LINKDOWN 2 -#define SNMP_TRAP_LINKUP 3 -#define SNMP_TRAP_AUTFAILURE 4 -#define SNMP_TRAP_EQPNEIGHBORLOSS 5 -#define SNMP_TRAP_ENTSPECIFIC 6 - -/* SNMPv1 Types */ -#define SNMP_NULL 0 -#define SNMP_INTEGER 1 /* l */ -#define SNMP_OCTETSTR 2 /* c */ -#define SNMP_DISPLAYSTR 2 /* c */ -#define SNMP_OBJECTID 3 /* ul */ -#define SNMP_IPADDR 4 /* uc */ -#define SNMP_COUNTER 5 /* ul */ -#define SNMP_GAUGE 6 /* ul */ -#define SNMP_TIMETICKS 7 /* ul */ -#define SNMP_OPAQUE 8 /* c */ - -/* Additional SNMPv2 Types */ -#define SNMP_UINTEGER 5 /* ul */ -#define SNMP_BITSTR 9 /* uc */ -#define SNMP_NSAP 10 /* uc */ -#define SNMP_COUNTER64 11 /* ul */ -#define SNMP_NOSUCHOBJECT 12 -#define SNMP_NOSUCHINSTANCE 13 -#define SNMP_ENDOFMIBVIEW 14 - -union snmp_syntax -{ - unsigned char uc[0]; /* 8 bit unsigned */ - char c[0]; /* 8 bit signed */ - unsigned long ul[0]; /* 32 bit unsigned */ - long l[0]; /* 32 bit signed */ -}; - -struct snmp_object -{ - unsigned long *id; - unsigned int id_len; - unsigned short type; - unsigned int syntax_len; - union snmp_syntax syntax; -}; - -struct snmp_request -{ - unsigned long id; - unsigned int error_status; - unsigned int error_index; -}; - -struct snmp_v1_trap -{ - unsigned long *id; - unsigned int id_len; - unsigned long ip_address; /* pointer */ - unsigned int general; - unsigned int specific; - unsigned long time; -}; - -/* SNMP types */ -#define SNMP_IPA 0 -#define SNMP_CNT 1 -#define SNMP_GGE 2 -#define SNMP_TIT 3 -#define SNMP_OPQ 4 -#define SNMP_C64 6 - -/* SNMP errors */ -#define SERR_NSO 0 -#define SERR_NSI 1 -#define SERR_EOM 2 - -static inline void mangle_address(unsigned char *begin, - unsigned char *addr, - const struct oct1_map *map, - __sum16 *check); -struct snmp_cnv -{ - unsigned int class; - unsigned int tag; - int syntax; -}; - -static const struct snmp_cnv snmp_conv[] = { - {ASN1_UNI, ASN1_NUL, SNMP_NULL}, - {ASN1_UNI, ASN1_INT, SNMP_INTEGER}, - {ASN1_UNI, ASN1_OTS, SNMP_OCTETSTR}, - {ASN1_UNI, ASN1_OTS, SNMP_DISPLAYSTR}, - {ASN1_UNI, ASN1_OJI, SNMP_OBJECTID}, - {ASN1_APL, SNMP_IPA, SNMP_IPADDR}, - {ASN1_APL, SNMP_CNT, SNMP_COUNTER}, /* Counter32 */ - {ASN1_APL, SNMP_GGE, SNMP_GAUGE}, /* Gauge32 == Unsigned32 */ - {ASN1_APL, SNMP_TIT, SNMP_TIMETICKS}, - {ASN1_APL, SNMP_OPQ, SNMP_OPAQUE}, - - /* SNMPv2 data types and errors */ - {ASN1_UNI, ASN1_BTS, SNMP_BITSTR}, - {ASN1_APL, SNMP_C64, SNMP_COUNTER64}, - {ASN1_CTX, SERR_NSO, SNMP_NOSUCHOBJECT}, - {ASN1_CTX, SERR_NSI, SNMP_NOSUCHINSTANCE}, - {ASN1_CTX, SERR_EOM, SNMP_ENDOFMIBVIEW}, - {0, 0, -1} -}; - -static unsigned char snmp_tag_cls2syntax(unsigned int tag, - unsigned int cls, - unsigned short *syntax) -{ - const struct snmp_cnv *cnv; - - cnv = snmp_conv; - - while (cnv->syntax != -1) { - if (cnv->tag == tag && cnv->class == cls) { - *syntax = cnv->syntax; - return 1; - } - cnv++; - } - return 0; -} - -static unsigned char snmp_object_decode(struct asn1_ctx *ctx, - struct snmp_object **obj) -{ - unsigned int cls, con, tag, len, idlen; - unsigned short type; - unsigned char *eoc, *end, *p; - unsigned long *lp, *id; - unsigned long ul; - long l; - - *obj = NULL; - id = NULL; - - if (!asn1_header_decode(ctx, &eoc, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ) - return 0; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI) - return 0; - - if (!asn1_oid_decode(ctx, end, &id, &idlen)) - return 0; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) { - kfree(id); - return 0; - } - - if (con != ASN1_PRI) { - kfree(id); - return 0; - } - - type = 0; - if (!snmp_tag_cls2syntax(tag, cls, &type)) { - kfree(id); - return 0; - } - - l = 0; - switch (type) { - case SNMP_INTEGER: - len = sizeof(long); - if (!asn1_long_decode(ctx, end, &l)) { - kfree(id); - return 0; - } - *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); - if (*obj == NULL) { - kfree(id); - return 0; - } - (*obj)->syntax.l[0] = l; - break; - case SNMP_OCTETSTR: - case SNMP_OPAQUE: - if (!asn1_octets_decode(ctx, end, &p, &len)) { - kfree(id); - return 0; - } - *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); - if (*obj == NULL) { - kfree(p); - kfree(id); - return 0; - } - memcpy((*obj)->syntax.c, p, len); - kfree(p); - break; - case SNMP_NULL: - case SNMP_NOSUCHOBJECT: - case SNMP_NOSUCHINSTANCE: - case SNMP_ENDOFMIBVIEW: - len = 0; - *obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC); - if (*obj == NULL) { - kfree(id); - return 0; - } - if (!asn1_null_decode(ctx, end)) { - kfree(id); - kfree(*obj); - *obj = NULL; - return 0; - } - break; - case SNMP_OBJECTID: - if (!asn1_oid_decode(ctx, end, &lp, &len)) { - kfree(id); - return 0; - } - len *= sizeof(unsigned long); - *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); - if (*obj == NULL) { - kfree(lp); - kfree(id); - return 0; - } - memcpy((*obj)->syntax.ul, lp, len); - kfree(lp); - break; - case SNMP_IPADDR: - if (!asn1_octets_decode(ctx, end, &p, &len)) { - kfree(id); - return 0; - } - if (len != 4) { - kfree(p); - kfree(id); - return 0; - } - *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); - if (*obj == NULL) { - kfree(p); - kfree(id); - return 0; - } - memcpy((*obj)->syntax.uc, p, len); - kfree(p); - break; - case SNMP_COUNTER: - case SNMP_GAUGE: - case SNMP_TIMETICKS: - len = sizeof(unsigned long); - if (!asn1_ulong_decode(ctx, end, &ul)) { - kfree(id); - return 0; - } - *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); - if (*obj == NULL) { - kfree(id); - return 0; - } - (*obj)->syntax.ul[0] = ul; - break; - default: - kfree(id); - return 0; - } - - (*obj)->syntax_len = len; - (*obj)->type = type; - (*obj)->id = id; - (*obj)->id_len = idlen; - - if (!asn1_eoc_decode(ctx, eoc)) { - kfree(id); - kfree(*obj); - *obj = NULL; - return 0; - } - return 1; -} - -static unsigned char noinline_for_stack -snmp_request_decode(struct asn1_ctx *ctx, struct snmp_request *request) -{ - unsigned int cls, con, tag; - unsigned char *end; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) - return 0; - - if (!asn1_ulong_decode(ctx, end, &request->id)) - return 0; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) - return 0; - - if (!asn1_uint_decode(ctx, end, &request->error_status)) - return 0; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) - return 0; - - if (!asn1_uint_decode(ctx, end, &request->error_index)) - return 0; - - return 1; -} - -/* - * Fast checksum update for possibly oddly-aligned UDP byte, from the - * code example in the draft. - */ -static void fast_csum(__sum16 *csum, - const unsigned char *optr, - const unsigned char *nptr, - int offset) -{ - unsigned char s[4]; - - if (offset & 1) { - s[0] = ~0; - s[1] = ~*optr; - s[2] = 0; - s[3] = *nptr; - } else { - s[0] = ~*optr; - s[1] = ~0; - s[2] = *nptr; - s[3] = 0; - } - - *csum = csum_fold(csum_partial(s, 4, ~csum_unfold(*csum))); -} - -/* - * Mangle IP address. - * - begin points to the start of the snmp messgae - * - addr points to the start of the address - */ -static inline void mangle_address(unsigned char *begin, - unsigned char *addr, - const struct oct1_map *map, - __sum16 *check) -{ - if (map->from == NOCT1(addr)) { - u_int32_t old; - - if (debug) - memcpy(&old, addr, sizeof(old)); - - *addr = map->to; - - /* Update UDP checksum if being used */ - if (*check) { - fast_csum(check, - &map->from, &map->to, addr - begin); - - } - - if (debug) - printk(KERN_DEBUG "bsalg: mapped %pI4 to %pI4\n", - &old, addr); - } -} - -static unsigned char noinline_for_stack -snmp_trap_decode(struct asn1_ctx *ctx, struct snmp_v1_trap *trap, - const struct oct1_map *map, - __sum16 *check) -{ - unsigned int cls, con, tag, len; - unsigned char *end; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI) - return 0; - - if (!asn1_oid_decode(ctx, end, &trap->id, &trap->id_len)) - return 0; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - goto err_id_free; - - if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_IPA) || - (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_OTS))) - goto err_id_free; - - if (!asn1_octets_decode(ctx, end, (unsigned char **)&trap->ip_address, &len)) - goto err_id_free; - - /* IPv4 only */ - if (len != 4) - goto err_addr_free; - - mangle_address(ctx->begin, ctx->pointer - 4, map, check); - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - goto err_addr_free; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) - goto err_addr_free; - - if (!asn1_uint_decode(ctx, end, &trap->general)) - goto err_addr_free; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - goto err_addr_free; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) - goto err_addr_free; - - if (!asn1_uint_decode(ctx, end, &trap->specific)) - goto err_addr_free; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - goto err_addr_free; - - if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_TIT) || - (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_INT))) - goto err_addr_free; - - if (!asn1_ulong_decode(ctx, end, &trap->time)) - goto err_addr_free; - - return 1; - -err_addr_free: - kfree((unsigned long *)trap->ip_address); - -err_id_free: - kfree(trap->id); - - return 0; -} - -/***************************************************************************** - * - * Misc. routines - * - *****************************************************************************/ - -/* - * Parse and mangle SNMP message according to mapping. - * (And this is the fucking 'basic' method). - */ -static int snmp_parse_mangle(unsigned char *msg, - u_int16_t len, - const struct oct1_map *map, - __sum16 *check) -{ - unsigned char *eoc, *end; - unsigned int cls, con, tag, vers, pdutype; - struct asn1_ctx ctx; - struct asn1_octstr comm; - struct snmp_object *obj; - - if (debug > 1) - print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_NONE, 16, 1, - msg, len, 0); - - asn1_open(&ctx, msg, len); - - /* - * Start of SNMP message. - */ - if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag)) - return 0; - if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ) - return 0; - - /* - * Version 1 or 2 handled. - */ - if (!asn1_header_decode(&ctx, &end, &cls, &con, &tag)) - return 0; - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) - return 0; - if (!asn1_uint_decode (&ctx, end, &vers)) - return 0; - if (debug > 1) - pr_debug("bsalg: snmp version: %u\n", vers + 1); - if (vers > 1) - return 1; - - /* - * Community. - */ - if (!asn1_header_decode (&ctx, &end, &cls, &con, &tag)) - return 0; - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OTS) - return 0; - if (!asn1_octets_decode(&ctx, end, &comm.data, &comm.len)) - return 0; - if (debug > 1) { - unsigned int i; - - pr_debug("bsalg: community: "); - for (i = 0; i < comm.len; i++) - pr_cont("%c", comm.data[i]); - pr_cont("\n"); - } - kfree(comm.data); - - /* - * PDU type - */ - if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &pdutype)) - return 0; - if (cls != ASN1_CTX || con != ASN1_CON) - return 0; - if (debug > 1) { - static const unsigned char *const pdus[] = { - [SNMP_PDU_GET] = "get", - [SNMP_PDU_NEXT] = "get-next", - [SNMP_PDU_RESPONSE] = "response", - [SNMP_PDU_SET] = "set", - [SNMP_PDU_TRAP1] = "trapv1", - [SNMP_PDU_BULK] = "bulk", - [SNMP_PDU_INFORM] = "inform", - [SNMP_PDU_TRAP2] = "trapv2" - }; - - if (pdutype > SNMP_PDU_TRAP2) - pr_debug("bsalg: bad pdu type %u\n", pdutype); - else - pr_debug("bsalg: pdu: %s\n", pdus[pdutype]); - } - if (pdutype != SNMP_PDU_RESPONSE && - pdutype != SNMP_PDU_TRAP1 && pdutype != SNMP_PDU_TRAP2) - return 1; - - /* - * Request header or v1 trap - */ - if (pdutype == SNMP_PDU_TRAP1) { - struct snmp_v1_trap trap; - unsigned char ret = snmp_trap_decode(&ctx, &trap, map, check); - - if (ret) { - kfree(trap.id); - kfree((unsigned long *)trap.ip_address); - } else - return ret; - - } else { - struct snmp_request req; - - if (!snmp_request_decode(&ctx, &req)) - return 0; - - if (debug > 1) - pr_debug("bsalg: request: id=0x%lx error_status=%u " - "error_index=%u\n", req.id, req.error_status, - req.error_index); - } - - /* - * Loop through objects, look for IP addresses to mangle. - */ - if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ) - return 0; - - while (!asn1_eoc_decode(&ctx, eoc)) { - unsigned int i; - - if (!snmp_object_decode(&ctx, &obj)) { - if (obj) { - kfree(obj->id); - kfree(obj); - } - return 0; - } - - if (debug > 1) { - pr_debug("bsalg: object: "); - for (i = 0; i < obj->id_len; i++) { - if (i > 0) - pr_cont("."); - pr_cont("%lu", obj->id[i]); - } - pr_cont(": type=%u\n", obj->type); - - } - - if (obj->type == SNMP_IPADDR) - mangle_address(ctx.begin, ctx.pointer - 4, map, check); - - kfree(obj->id); - kfree(obj); - } - - if (!asn1_eoc_decode(&ctx, eoc)) - return 0; - - return 1; -} - -/***************************************************************************** - * - * NAT routines. - * - *****************************************************************************/ - -/* - * SNMP translation routine. - */ -static int snmp_translate(struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - struct sk_buff *skb) -{ - struct iphdr *iph = ip_hdr(skb); - struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl); - u_int16_t udplen = ntohs(udph->len); - u_int16_t paylen = udplen - sizeof(struct udphdr); - int dir = CTINFO2DIR(ctinfo); - struct oct1_map map; - - /* - * Determine mappping for application layer addresses based - * on NAT manipulations for the packet. - */ - if (dir == IP_CT_DIR_ORIGINAL) { - /* SNAT traps */ - map.from = NOCT1(&ct->tuplehash[dir].tuple.src.u3.ip); - map.to = NOCT1(&ct->tuplehash[!dir].tuple.dst.u3.ip); - } else { - /* DNAT replies */ - map.from = NOCT1(&ct->tuplehash[!dir].tuple.src.u3.ip); - map.to = NOCT1(&ct->tuplehash[dir].tuple.dst.u3.ip); - } - - if (map.from == map.to) - return NF_ACCEPT; - - if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr), - paylen, &map, &udph->check)) { - net_warn_ratelimited("bsalg: parser failed\n"); - return NF_DROP; - } - return NF_ACCEPT; -} - -/* We don't actually set up expectations, just adjust internal IP - * addresses if this is being NATted */ -static int help(struct sk_buff *skb, unsigned int protoff, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo) -{ - int dir = CTINFO2DIR(ctinfo); - unsigned int ret; - const struct iphdr *iph = ip_hdr(skb); - const struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl); - - /* SNMP replies and originating SNMP traps get mangled */ - if (udph->source == htons(SNMP_PORT) && dir != IP_CT_DIR_REPLY) - return NF_ACCEPT; - if (udph->dest == htons(SNMP_TRAP_PORT) && dir != IP_CT_DIR_ORIGINAL) - return NF_ACCEPT; - - /* No NAT? */ - if (!(ct->status & IPS_NAT_MASK)) - return NF_ACCEPT; - - /* - * Make sure the packet length is ok. So far, we were only guaranteed - * to have a valid length IP header plus 8 bytes, which means we have - * enough room for a UDP header. Just verify the UDP length field so we - * can mess around with the payload. - */ - if (ntohs(udph->len) != skb->len - (iph->ihl << 2)) { - net_warn_ratelimited("SNMP: dropping malformed packet src=%pI4 dst=%pI4\n", - &iph->saddr, &iph->daddr); - return NF_DROP; - } - - if (!skb_make_writable(skb, skb->len)) - return NF_DROP; - - spin_lock_bh(&snmp_lock); - ret = snmp_translate(ct, ctinfo, skb); - spin_unlock_bh(&snmp_lock); - return ret; -} - -static const struct nf_conntrack_expect_policy snmp_exp_policy = { - .max_expected = 0, - .timeout = 180, -}; - -static struct nf_conntrack_helper snmp_trap_helper __read_mostly = { - .me = THIS_MODULE, - .help = help, - .expect_policy = &snmp_exp_policy, - .name = "snmp_trap", - .tuple.src.l3num = AF_INET, - .tuple.src.u.udp.port = cpu_to_be16(SNMP_TRAP_PORT), - .tuple.dst.protonum = IPPROTO_UDP, -}; - -/***************************************************************************** - * - * Module stuff. - * - *****************************************************************************/ - -static int __init nf_nat_snmp_basic_init(void) -{ - BUG_ON(nf_nat_snmp_hook != NULL); - RCU_INIT_POINTER(nf_nat_snmp_hook, help); - - return nf_conntrack_helper_register(&snmp_trap_helper); -} - -static void __exit nf_nat_snmp_basic_fini(void) -{ - RCU_INIT_POINTER(nf_nat_snmp_hook, NULL); - synchronize_rcu(); - nf_conntrack_helper_unregister(&snmp_trap_helper); -} - -module_init(nf_nat_snmp_basic_init); -module_exit(nf_nat_snmp_basic_fini); - -module_param(debug, int, 0600); diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c new file mode 100644 index 000000000000..b6e277093e7e --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c @@ -0,0 +1,235 @@ +/* + * nf_nat_snmp_basic.c + * + * Basic SNMP Application Layer Gateway + * + * This IP NAT module is intended for use with SNMP network + * discovery and monitoring applications where target networks use + * conflicting private address realms. + * + * Static NAT is used to remap the networks from the view of the network + * management system at the IP layer, and this module remaps some application + * layer addresses to match. + * + * The simplest form of ALG is performed, where only tagged IP addresses + * are modified. The module does not need to be MIB aware and only scans + * messages at the ASN.1/BER level. + * + * Currently, only SNMPv1 and SNMPv2 are supported. + * + * More information on ALG and associated issues can be found in + * RFC 2962 + * + * The ASB.1/BER parsing code is derived from the gxsnmp package by Gregory + * McLean & Jochen Friedrich, stripped down for use in the kernel. + * + * Copyright (c) 2000 RP Internet (www.rpi.net.au). + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + * + * Author: James Morris <jmorris@intercode.com.au> + * + * Copyright (c) 2006-2010 Patrick McHardy <kaber@trash.net> + */ +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/udp.h> +#include <net/checksum.h> +#include <net/udp.h> + +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <linux/netfilter/nf_conntrack_snmp.h> +#include "nf_nat_snmp_basic-asn1.h" + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); +MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway"); +MODULE_ALIAS("ip_nat_snmp_basic"); + +#define SNMP_PORT 161 +#define SNMP_TRAP_PORT 162 + +static DEFINE_SPINLOCK(snmp_lock); + +struct snmp_ctx { + unsigned char *begin; + __sum16 *check; + __be32 from; + __be32 to; +}; + +static void fast_csum(struct snmp_ctx *ctx, unsigned char offset) +{ + unsigned char s[12] = {0,}; + int size; + + if (offset & 1) { + memcpy(&s[1], &ctx->from, 4); + memcpy(&s[7], &ctx->to, 4); + s[0] = ~0; + s[1] = ~s[1]; + s[2] = ~s[2]; + s[3] = ~s[3]; + s[4] = ~s[4]; + s[5] = ~0; + size = 12; + } else { + memcpy(&s[0], &ctx->from, 4); + memcpy(&s[4], &ctx->to, 4); + s[0] = ~s[0]; + s[1] = ~s[1]; + s[2] = ~s[2]; + s[3] = ~s[3]; + size = 8; + } + *ctx->check = csum_fold(csum_partial(s, size, + ~csum_unfold(*ctx->check))); +} + +int snmp_version(void *context, size_t hdrlen, unsigned char tag, + const void *data, size_t datalen) +{ + if (*(unsigned char *)data > 1) + return -ENOTSUPP; + return 1; +} + +int snmp_helper(void *context, size_t hdrlen, unsigned char tag, + const void *data, size_t datalen) +{ + struct snmp_ctx *ctx = (struct snmp_ctx *)context; + __be32 *pdata = (__be32 *)data; + + if (*pdata == ctx->from) { + pr_debug("%s: %pI4 to %pI4\n", __func__, + (void *)&ctx->from, (void *)&ctx->to); + + if (*ctx->check) + fast_csum(ctx, (unsigned char *)data - ctx->begin); + *pdata = ctx->to; + } + + return 1; +} + +static int snmp_translate(struct nf_conn *ct, int dir, struct sk_buff *skb) +{ + struct iphdr *iph = ip_hdr(skb); + struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl); + u16 datalen = ntohs(udph->len) - sizeof(struct udphdr); + char *data = (unsigned char *)udph + sizeof(struct udphdr); + struct snmp_ctx ctx; + int ret; + + if (dir == IP_CT_DIR_ORIGINAL) { + ctx.from = ct->tuplehash[dir].tuple.src.u3.ip; + ctx.to = ct->tuplehash[!dir].tuple.dst.u3.ip; + } else { + ctx.from = ct->tuplehash[!dir].tuple.src.u3.ip; + ctx.to = ct->tuplehash[dir].tuple.dst.u3.ip; + } + + if (ctx.from == ctx.to) + return NF_ACCEPT; + + ctx.begin = (unsigned char *)udph + sizeof(struct udphdr); + ctx.check = &udph->check; + ret = asn1_ber_decoder(&nf_nat_snmp_basic_decoder, &ctx, data, datalen); + if (ret < 0) { + nf_ct_helper_log(skb, ct, "parser failed\n"); + return NF_DROP; + } + + return NF_ACCEPT; +} + +/* We don't actually set up expectations, just adjust internal IP + * addresses if this is being NATted + */ +static int help(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + int dir = CTINFO2DIR(ctinfo); + unsigned int ret; + const struct iphdr *iph = ip_hdr(skb); + const struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl); + + /* SNMP replies and originating SNMP traps get mangled */ + if (udph->source == htons(SNMP_PORT) && dir != IP_CT_DIR_REPLY) + return NF_ACCEPT; + if (udph->dest == htons(SNMP_TRAP_PORT) && dir != IP_CT_DIR_ORIGINAL) + return NF_ACCEPT; + + /* No NAT? */ + if (!(ct->status & IPS_NAT_MASK)) + return NF_ACCEPT; + + /* Make sure the packet length is ok. So far, we were only guaranteed + * to have a valid length IP header plus 8 bytes, which means we have + * enough room for a UDP header. Just verify the UDP length field so we + * can mess around with the payload. + */ + if (ntohs(udph->len) != skb->len - (iph->ihl << 2)) { + nf_ct_helper_log(skb, ct, "dropping malformed packet\n"); + return NF_DROP; + } + + if (!skb_make_writable(skb, skb->len)) { + nf_ct_helper_log(skb, ct, "cannot mangle packet"); + return NF_DROP; + } + + spin_lock_bh(&snmp_lock); + ret = snmp_translate(ct, dir, skb); + spin_unlock_bh(&snmp_lock); + return ret; +} + +static const struct nf_conntrack_expect_policy snmp_exp_policy = { + .max_expected = 0, + .timeout = 180, +}; + +static struct nf_conntrack_helper snmp_trap_helper __read_mostly = { + .me = THIS_MODULE, + .help = help, + .expect_policy = &snmp_exp_policy, + .name = "snmp_trap", + .tuple.src.l3num = AF_INET, + .tuple.src.u.udp.port = cpu_to_be16(SNMP_TRAP_PORT), + .tuple.dst.protonum = IPPROTO_UDP, +}; + +static int __init nf_nat_snmp_basic_init(void) +{ + BUG_ON(nf_nat_snmp_hook != NULL); + RCU_INIT_POINTER(nf_nat_snmp_hook, help); + + return nf_conntrack_helper_register(&snmp_trap_helper); +} + +static void __exit nf_nat_snmp_basic_fini(void) +{ + RCU_INIT_POINTER(nf_nat_snmp_hook, NULL); + synchronize_rcu(); + nf_conntrack_helper_unregister(&snmp_trap_helper); +} + +module_init(nf_nat_snmp_basic_init); +module_exit(nf_nat_snmp_basic_fini); diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c index 4bbc273b45e8..036c074736b0 100644 --- a/net/ipv4/netfilter/nf_tables_arp.c +++ b/net/ipv4/netfilter/nf_tables_arp.c @@ -21,51 +21,12 @@ nft_do_chain_arp(void *priv, { struct nft_pktinfo pkt; - nft_set_pktinfo_unspec(&pkt, skb, state); + nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_unspec(&pkt, skb); return nft_do_chain(&pkt, priv); } -static struct nft_af_info nft_af_arp __read_mostly = { - .family = NFPROTO_ARP, - .nhooks = NF_ARP_NUMHOOKS, - .owner = THIS_MODULE, - .nops = 1, - .hooks = { - [NF_ARP_IN] = nft_do_chain_arp, - [NF_ARP_OUT] = nft_do_chain_arp, - [NF_ARP_FORWARD] = nft_do_chain_arp, - }, -}; - -static int nf_tables_arp_init_net(struct net *net) -{ - net->nft.arp = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); - if (net->nft.arp== NULL) - return -ENOMEM; - - memcpy(net->nft.arp, &nft_af_arp, sizeof(nft_af_arp)); - - if (nft_register_afinfo(net, net->nft.arp) < 0) - goto err; - - return 0; -err: - kfree(net->nft.arp); - return -ENOMEM; -} - -static void nf_tables_arp_exit_net(struct net *net) -{ - nft_unregister_afinfo(net, net->nft.arp); - kfree(net->nft.arp); -} - -static struct pernet_operations nf_tables_arp_net_ops = { - .init = nf_tables_arp_init_net, - .exit = nf_tables_arp_exit_net, -}; - static const struct nf_chain_type filter_arp = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, @@ -73,26 +34,19 @@ static const struct nf_chain_type filter_arp = { .owner = THIS_MODULE, .hook_mask = (1 << NF_ARP_IN) | (1 << NF_ARP_OUT), + .hooks = { + [NF_ARP_IN] = nft_do_chain_arp, + [NF_ARP_OUT] = nft_do_chain_arp, + }, }; static int __init nf_tables_arp_init(void) { - int ret; - - ret = nft_register_chain_type(&filter_arp); - if (ret < 0) - return ret; - - ret = register_pernet_subsys(&nf_tables_arp_net_ops); - if (ret < 0) - nft_unregister_chain_type(&filter_arp); - - return ret; + return nft_register_chain_type(&filter_arp); } static void __exit nf_tables_arp_exit(void) { - unregister_pernet_subsys(&nf_tables_arp_net_ops); nft_unregister_chain_type(&filter_arp); } @@ -101,4 +55,4 @@ module_exit(nf_tables_arp_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_FAMILY(3); /* NFPROTO_ARP */ +MODULE_ALIAS_NFT_CHAIN(3, "filter"); /* NFPROTO_ARP */ diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c index 2840a29b2e04..96f955496d5f 100644 --- a/net/ipv4/netfilter/nf_tables_ipv4.c +++ b/net/ipv4/netfilter/nf_tables_ipv4.c @@ -24,69 +24,12 @@ static unsigned int nft_do_chain_ipv4(void *priv, { struct nft_pktinfo pkt; - nft_set_pktinfo_ipv4(&pkt, skb, state); + nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_ipv4(&pkt, skb); return nft_do_chain(&pkt, priv); } -static unsigned int nft_ipv4_output(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - if (unlikely(skb->len < sizeof(struct iphdr) || - ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) { - if (net_ratelimit()) - pr_info("nf_tables_ipv4: ignoring short SOCK_RAW " - "packet\n"); - return NF_ACCEPT; - } - - return nft_do_chain_ipv4(priv, skb, state); -} - -struct nft_af_info nft_af_ipv4 __read_mostly = { - .family = NFPROTO_IPV4, - .nhooks = NF_INET_NUMHOOKS, - .owner = THIS_MODULE, - .nops = 1, - .hooks = { - [NF_INET_LOCAL_IN] = nft_do_chain_ipv4, - [NF_INET_LOCAL_OUT] = nft_ipv4_output, - [NF_INET_FORWARD] = nft_do_chain_ipv4, - [NF_INET_PRE_ROUTING] = nft_do_chain_ipv4, - [NF_INET_POST_ROUTING] = nft_do_chain_ipv4, - }, -}; -EXPORT_SYMBOL_GPL(nft_af_ipv4); - -static int nf_tables_ipv4_init_net(struct net *net) -{ - net->nft.ipv4 = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); - if (net->nft.ipv4 == NULL) - return -ENOMEM; - - memcpy(net->nft.ipv4, &nft_af_ipv4, sizeof(nft_af_ipv4)); - - if (nft_register_afinfo(net, net->nft.ipv4) < 0) - goto err; - - return 0; -err: - kfree(net->nft.ipv4); - return -ENOMEM; -} - -static void nf_tables_ipv4_exit_net(struct net *net) -{ - nft_unregister_afinfo(net, net->nft.ipv4); - kfree(net->nft.ipv4); -} - -static struct pernet_operations nf_tables_ipv4_net_ops = { - .init = nf_tables_ipv4_init_net, - .exit = nf_tables_ipv4_exit_net, -}; - static const struct nf_chain_type filter_ipv4 = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, @@ -97,26 +40,22 @@ static const struct nf_chain_type filter_ipv4 = { (1 << NF_INET_FORWARD) | (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING), + .hooks = { + [NF_INET_LOCAL_IN] = nft_do_chain_ipv4, + [NF_INET_LOCAL_OUT] = nft_do_chain_ipv4, + [NF_INET_FORWARD] = nft_do_chain_ipv4, + [NF_INET_PRE_ROUTING] = nft_do_chain_ipv4, + [NF_INET_POST_ROUTING] = nft_do_chain_ipv4, + }, }; static int __init nf_tables_ipv4_init(void) { - int ret; - - ret = nft_register_chain_type(&filter_ipv4); - if (ret < 0) - return ret; - - ret = register_pernet_subsys(&nf_tables_ipv4_net_ops); - if (ret < 0) - nft_unregister_chain_type(&filter_ipv4); - - return ret; + return nft_register_chain_type(&filter_ipv4); } static void __exit nf_tables_ipv4_exit(void) { - unregister_pernet_subsys(&nf_tables_ipv4_net_ops); nft_unregister_chain_type(&filter_ipv4); } @@ -125,4 +64,4 @@ module_exit(nf_tables_ipv4_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_FAMILY(AF_INET); +MODULE_ALIAS_NFT_CHAIN(AF_INET, "filter"); diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c index f5c66a7a4bf2..f2a490981594 100644 --- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c @@ -33,7 +33,8 @@ static unsigned int nft_nat_do_chain(void *priv, { struct nft_pktinfo pkt; - nft_set_pktinfo_ipv4(&pkt, skb, state); + nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_ipv4(&pkt, skb); return nft_do_chain(&pkt, priv); } diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c index 30493beb611a..d965c225b9f6 100644 --- a/net/ipv4/netfilter/nft_chain_route_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c @@ -33,12 +33,8 @@ static unsigned int nf_route_table_hook(void *priv, const struct iphdr *iph; int err; - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) - return NF_ACCEPT; - - nft_set_pktinfo_ipv4(&pkt, skb, state); + nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_ipv4(&pkt, skb); mark = skb->mark; iph = ip_hdr(skb); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 9f37c4727861..dc5edc8f7564 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -83,7 +83,6 @@ static int sockstat_seq_open(struct inode *inode, struct file *file) } static const struct file_operations sockstat_seq_fops = { - .owner = THIS_MODULE, .open = sockstat_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -467,7 +466,6 @@ static int snmp_seq_open(struct inode *inode, struct file *file) } static const struct file_operations snmp_seq_fops = { - .owner = THIS_MODULE, .open = snmp_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -515,7 +513,6 @@ static int netstat_seq_open(struct inode *inode, struct file *file) } static const struct file_operations netstat_seq_fops = { - .owner = THIS_MODULE, .open = netstat_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 33b70bfd1122..7c509697ebc7 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -513,11 +513,18 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int err; struct ip_options_data opt_copy; struct raw_frag_vec rfv; + int hdrincl; err = -EMSGSIZE; if (len > 0xFFFF) goto out; + /* hdrincl should be READ_ONCE(inet->hdrincl) + * but READ_ONCE() doesn't work with bit fields. + * Doing this indirectly yields the same result. + */ + hdrincl = inet->hdrincl; + hdrincl = READ_ONCE(hdrincl); /* * Check the flags. */ @@ -593,7 +600,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) /* Linux does not mangle headers on raw sockets, * so that IP options + IP_HDRINCL is non-sense. */ - if (inet->hdrincl) + if (hdrincl) goto done; if (ipc.opt->opt.srr) { if (!daddr) @@ -610,17 +617,30 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc.oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; - } else if (!ipc.oif) + } else if (!ipc.oif) { ipc.oif = inet->uc_index; + } else if (ipv4_is_lbcast(daddr) && inet->uc_index) { + /* oif is set, packet is to local broadcast and + * and uc_index is set. oif is most likely set + * by sk_bound_dev_if. If uc_index != oif check if the + * oif is an L3 master and uc_index is an L3 slave. + * If so, we want to allow the send using the uc_index. + */ + if (ipc.oif != inet->uc_index && + ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk), + inet->uc_index)) { + ipc.oif = inet->uc_index; + } + } flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, - inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, + hdrincl ? IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk) | - (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), + (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0, sk->sk_uid); - if (!inet->hdrincl) { + if (!hdrincl) { rfv.msg = msg; rfv.hlen = 0; @@ -645,7 +665,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto do_confirm; back_from_confirm: - if (inet->hdrincl) + if (hdrincl) err = raw_send_hdrinc(sk, &fl4, msg, len, &rt, msg->msg_flags, &ipc.sockc); @@ -1112,7 +1132,6 @@ static int raw_v4_seq_open(struct inode *inode, struct file *file) } static const struct file_operations raw_seq_fops = { - .owner = THIS_MODULE, .open = raw_v4_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 43b69af242e1..49cc1c1df1ba 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -240,7 +240,6 @@ static int rt_cache_seq_open(struct inode *inode, struct file *file) } static const struct file_operations rt_cache_seq_fops = { - .owner = THIS_MODULE, .open = rt_cache_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -331,7 +330,6 @@ static int rt_cpu_seq_open(struct inode *inode, struct file *file) } static const struct file_operations rt_cpu_seq_fops = { - .owner = THIS_MODULE, .open = rt_cpu_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -369,7 +367,6 @@ static int rt_acct_proc_open(struct inode *inode, struct file *file) } static const struct file_operations rt_acct_proc_fops = { - .owner = THIS_MODULE, .open = rt_acct_proc_open, .read = seq_read, .llseek = seq_lseek, @@ -1106,7 +1103,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) new = true; } - __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu); + __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu); if (!dst_check(&rt->dst, 0)) { if (new) @@ -2762,6 +2759,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (err == 0 && rt->dst.error) err = -rt->dst.error; } else { + fl4.flowi4_iif = LOOPBACK_IFINDEX; rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb); err = 0; if (IS_ERR(rt)) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bf97317e6c97..c059aa7df0a9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -283,8 +283,6 @@ #include <asm/ioctls.h> #include <net/busy_poll.h> -#include <trace/events/tcp.h> - struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count); @@ -465,7 +463,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op) tcp_mtup_init(sk); icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); - tcp_call_bpf(sk, bpf_op); + tcp_call_bpf(sk, bpf_op, 0, NULL); tcp_init_congestion_control(sk); tcp_init_buffer_space(sk); } @@ -493,18 +491,16 @@ static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) * take care of normal races (between the test and the event) and we don't * go look at any of the socket buffers directly. */ -unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) +__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) { - unsigned int mask; + __poll_t mask; struct sock *sk = sock->sk; const struct tcp_sock *tp = tcp_sk(sk); int state; - sock_rps_record_flow(sk); - sock_poll_wait(file, sk_sleep(sk), wait); - state = sk_state_load(sk); + state = inet_sk_state_load(sk); if (state == TCP_LISTEN) return inet_csk_listen_poll(sk); @@ -1106,12 +1102,15 @@ static int linear_payload_sz(bool first_skb) return 0; } -static int select_size(const struct sock *sk, bool sg, bool first_skb) +static int select_size(const struct sock *sk, bool sg, bool first_skb, bool zc) { const struct tcp_sock *tp = tcp_sk(sk); int tmp = tp->mss_cache; if (sg) { + if (zc) + return 0; + if (sk_can_gso(sk)) { tmp = linear_payload_sz(first_skb); } else { @@ -1188,7 +1187,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) int flags, err, copied = 0; int mss_now = 0, size_goal, copied_syn = 0; bool process_backlog = false; - bool sg; + bool sg, zc = false; long timeo; flags = msg->msg_flags; @@ -1206,7 +1205,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) goto out_err; } - if (!(sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG)) + zc = sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG; + if (!zc) uarg->zerocopy = 0; } @@ -1283,6 +1283,7 @@ restart: if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { bool first_skb; + int linear; new_segment: /* Allocate new segment. If the interface is SG, @@ -1296,9 +1297,8 @@ new_segment: goto restart; } first_skb = tcp_rtx_and_write_queues_empty(sk); - skb = sk_stream_alloc_skb(sk, - select_size(sk, sg, first_skb), - sk->sk_allocation, + linear = select_size(sk, sg, first_skb, zc); + skb = sk_stream_alloc_skb(sk, linear, sk->sk_allocation, first_skb); if (!skb) goto wait_for_memory; @@ -1327,13 +1327,13 @@ new_segment: copy = msg_data_left(msg); /* Where to copy to? */ - if (skb_availroom(skb) > 0) { + if (skb_availroom(skb) > 0 && !zc) { /* We have some space in skb head. Superb! */ copy = min_t(int, copy, skb_availroom(skb)); err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy); if (err) goto do_fault; - } else if (!uarg || !uarg->zerocopy) { + } else if (!zc) { bool merge = true; int i = skb_shinfo(skb)->nr_frags; struct page_frag *pfrag = sk_page_frag(sk); @@ -1373,8 +1373,10 @@ new_segment: pfrag->offset += copy; } else { err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg); - if (err == -EMSGSIZE || err == -EEXIST) + if (err == -EMSGSIZE || err == -EEXIST) { + tcp_mark_push(tp, skb); goto new_segment; + } if (err < 0) goto do_error; copy = err; @@ -1731,8 +1733,8 @@ static void tcp_update_recv_tstamps(struct sk_buff *skb, } /* Similar to __sock_recv_timestamp, but does not require an skb */ -void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, - struct scm_timestamping *tss) +static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, + struct scm_timestamping *tss) { struct timeval tv; bool has_timestamping = false; @@ -2040,7 +2042,29 @@ void tcp_set_state(struct sock *sk, int state) { int oldstate = sk->sk_state; - trace_tcp_set_state(sk, oldstate, state); + /* We defined a new enum for TCP states that are exported in BPF + * so as not force the internal TCP states to be frozen. The + * following checks will detect if an internal state value ever + * differs from the BPF value. If this ever happens, then we will + * need to remap the internal value to the BPF value before calling + * tcp_call_bpf_2arg. + */ + BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED); + BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT); + BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV); + BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1); + BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2); + BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT); + BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE); + BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT); + BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK); + BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN); + BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING); + BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV); + BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES); + + if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG)) + tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state); switch (state) { case TCP_ESTABLISHED: @@ -2065,7 +2089,7 @@ void tcp_set_state(struct sock *sk, int state) /* Change state AFTER socket is unhashed to avoid closed * socket sitting in hash tables. */ - sk_state_store(sk, state); + inet_sk_state_store(sk, state); #ifdef STATE_TRACE SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]); @@ -2298,6 +2322,9 @@ adjudge_to_death: tcp_send_active_reset(sk, GFP_ATOMIC); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); + } else if (!check_net(sock_net(sk))) { + /* Not possible to send reset; just close */ + tcp_set_state(sk, TCP_CLOSE); } } @@ -2412,6 +2439,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->snd_cwnd_cnt = 0; tp->window_clamp = 0; tcp_set_ca_state(sk, TCP_CA_Open); + tp->is_sack_reneg = 0; tcp_clear_retrans(tp); inet_csk_delack_init(sk); /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0 @@ -2430,6 +2458,12 @@ int tcp_disconnect(struct sock *sk, int flags) WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); + if (sk->sk_frag.page) { + put_page(sk->sk_frag.page); + sk->sk_frag.page = NULL; + sk->sk_frag.offset = 0; + } + sk->sk_error_report(sk); return err; } @@ -2919,7 +2953,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (sk->sk_type != SOCK_STREAM) return; - info->tcpi_state = sk_state_load(sk); + info->tcpi_state = inet_sk_state_load(sk); /* Report meaningful fields for all TCP states, including listeners */ rate = READ_ONCE(sk->sk_pacing_rate); @@ -3577,6 +3611,9 @@ void __init tcp_init(void) percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL); inet_hashinfo_init(&tcp_hashinfo); + inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash", + thash_entries, 21, /* one slot per 2 MB*/ + 0, 64 * 1024); tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 69ee877574d0..a471f696e13c 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -110,7 +110,8 @@ struct bbr { u32 lt_last_lost; /* LT intvl start: tp->lost */ u32 pacing_gain:10, /* current gain for setting pacing rate */ cwnd_gain:10, /* current gain for setting cwnd */ - full_bw_cnt:3, /* number of rounds without large bw gains */ + full_bw_reached:1, /* reached full bw in Startup? */ + full_bw_cnt:2, /* number of rounds without large bw gains */ cycle_idx:3, /* current index in pacing_gain cycle array */ has_seen_rtt:1, /* have we seen an RTT sample yet? */ unused_b:5; @@ -180,7 +181,7 @@ static bool bbr_full_bw_reached(const struct sock *sk) { const struct bbr *bbr = inet_csk_ca(sk); - return bbr->full_bw_cnt >= bbr_full_bw_cnt; + return bbr->full_bw_reached; } /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ @@ -480,7 +481,8 @@ static void bbr_advance_cycle_phase(struct sock *sk) bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); bbr->cycle_mstamp = tp->delivered_mstamp; - bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; + bbr->pacing_gain = bbr->lt_use_bw ? BBR_UNIT : + bbr_pacing_gain[bbr->cycle_idx]; } /* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ @@ -489,8 +491,7 @@ static void bbr_update_cycle_phase(struct sock *sk, { struct bbr *bbr = inet_csk_ca(sk); - if ((bbr->mode == BBR_PROBE_BW) && !bbr->lt_use_bw && - bbr_is_next_cycle_phase(sk, rs)) + if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs)) bbr_advance_cycle_phase(sk); } @@ -717,6 +718,7 @@ static void bbr_check_full_bw_reached(struct sock *sk, return; } ++bbr->full_bw_cnt; + bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; } /* If pipe is probably full, drain the queue and then enter steady-state. */ @@ -764,7 +766,8 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) filter_expired = after(tcp_jiffies32, bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); if (rs->rtt_us >= 0 && - (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) { + (rs->rtt_us <= bbr->min_rtt_us || + (filter_expired && !rs->is_ack_delayed))) { bbr->min_rtt_us = rs->rtt_us; bbr->min_rtt_stamp = tcp_jiffies32; } @@ -850,6 +853,7 @@ static void bbr_init(struct sock *sk) bbr->restore_cwnd = 0; bbr->round_start = 0; bbr->idle_restart = 0; + bbr->full_bw_reached = 0; bbr->full_bw = 0; bbr->full_bw_cnt = 0; bbr->cycle_mstamp = 0; @@ -871,6 +875,11 @@ static u32 bbr_sndbuf_expand(struct sock *sk) */ static u32 bbr_undo_cwnd(struct sock *sk) { + struct bbr *bbr = inet_csk_ca(sk); + + bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ + bbr->full_bw_cnt = 0; + bbr_reset_lt_bw_sampling(sk); return tcp_sk(sk)->snd_cwnd; } diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index abbf0edcf6c2..81148f7a2323 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -24,7 +24,7 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, { struct tcp_info *info = _info; - if (sk_state_load(sk) == TCP_LISTEN) { + if (inet_sk_state_load(sk) == TCP_LISTEN) { r->idiag_rqueue = sk->sk_ack_backlog; r->idiag_wqueue = sk->sk_max_ack_backlog; } else if (sk->sk_type == SOCK_STREAM) { diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 78c192ee03a4..018a48477355 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -379,18 +379,9 @@ fastopen: bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie) { - unsigned long last_syn_loss = 0; const struct dst_entry *dst; - int syn_loss = 0; - tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss); - - /* Recurring FO SYN losses: no cookie or data in SYN */ - if (syn_loss > 1 && - time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) { - cookie->len = -1; - return false; - } + tcp_fastopen_cache_get(sk, mss, cookie); /* Firewall blackhole issue check */ if (tcp_fastopen_active_should_disable(sk)) { @@ -448,6 +439,8 @@ EXPORT_SYMBOL(tcp_fastopen_defer_connect); * following circumstances: * 1. client side TFO socket receives out of order FIN * 2. client side TFO socket receives out of order RST + * 3. client side TFO socket has timed out three times consecutively during + * or after handshake * We disable active side TFO globally for 1hr at first. Then if it * happens again, we disable it for 2h, then 4h, 8h, ... * And we reset the timeout back to 1hr when we see a successful active @@ -524,3 +517,20 @@ void tcp_fastopen_active_disable_ofo_check(struct sock *sk) dst_release(dst); } } + +void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired) +{ + u32 timeouts = inet_csk(sk)->icsk_retransmits; + struct tcp_sock *tp = tcp_sk(sk); + + /* Broken middle-boxes may black-hole Fast Open connection during or + * even after the handshake. Be extremely conservative and pause + * Fast Open globally after hitting the third consecutive timeout or + * exceeding the configured timeout limit. + */ + if ((tp->syn_fastopen || tp->syn_data || tp->syn_data_acked) && + (timeouts == 2 || (timeouts < 2 && expired))) { + tcp_fastopen_active_disable(sk); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); + } +} diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 734cfc8ff76e..cfa51cfd2d99 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -97,6 +97,7 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE; #define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ #define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ #define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */ +#define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) @@ -508,9 +509,6 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) u32 new_sample = tp->rcv_rtt_est.rtt_us; long m = sample; - if (m == 0) - m = 1; - if (new_sample != 0) { /* If we sample in larger samples in the non-timestamp * case, we could grossly overestimate the RTT especially @@ -547,6 +545,8 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) return; delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time); + if (!delta_us) + delta_us = 1; tcp_rcv_rtt_update(tp, delta_us, 1); new_measure: @@ -563,8 +563,11 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) { u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; - u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); + u32 delta_us; + if (!delta) + delta = 1; + delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); tcp_rcv_rtt_update(tp, delta_us, 0); } } @@ -576,9 +579,10 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, void tcp_rcv_space_adjust(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + u32 copied; int time; - int copied; + tcp_mstamp_refresh(tp); time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time); if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0) return; @@ -599,38 +603,31 @@ void tcp_rcv_space_adjust(struct sock *sk) if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - int rcvwin, rcvmem, rcvbuf; + int rcvmem, rcvbuf; + u64 rcvwin, grow; /* minimal window to cope with packet losses, assuming * steady state. Add some cushion because of small variations. */ - rcvwin = (copied << 1) + 16 * tp->advmss; + rcvwin = ((u64)copied << 1) + 16 * tp->advmss; - /* If rate increased by 25%, - * assume slow start, rcvwin = 3 * copied - * If rate increased by 50%, - * assume sender can use 2x growth, rcvwin = 4 * copied - */ - if (copied >= - tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) { - if (copied >= - tp->rcvq_space.space + (tp->rcvq_space.space >> 1)) - rcvwin <<= 1; - else - rcvwin += (rcvwin >> 1); - } + /* Accommodate for sender rate increase (eg. slow start) */ + grow = rcvwin * (copied - tp->rcvq_space.space); + do_div(grow, tp->rcvq_space.space); + rcvwin += (grow << 1); rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); while (tcp_win_from_space(sk, rcvmem) < tp->advmss) rcvmem += 128; - rcvbuf = min(rcvwin / tp->advmss * rcvmem, - sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); + do_div(rcvwin, tp->advmss); + rcvbuf = min_t(u64, rcvwin * rcvmem, + sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); if (rcvbuf > sk->sk_rcvbuf) { sk->sk_rcvbuf = rcvbuf; /* Make the window clamp follow along. */ - tp->window_clamp = rcvwin; + tp->window_clamp = tcp_win_from_space(sk, rcvbuf); } } tp->rcvq_space.space = copied; @@ -1941,6 +1938,8 @@ void tcp_enter_loss(struct sock *sk) if (is_reneg) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); tp->sacked_out = 0; + /* Mark SACK reneging until we recover from this loss event. */ + tp->is_sack_reneg = 1; } tcp_clear_all_retrans_hints(tp); @@ -2326,6 +2325,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) } tp->snd_cwnd_stamp = tcp_jiffies32; tp->undo_marker = 0; + tp->rack.advanced = 1; /* Force RACK to re-exam losses */ } static inline bool tcp_may_undo(const struct tcp_sock *tp) @@ -2364,6 +2364,7 @@ static bool tcp_try_undo_recovery(struct sock *sk) return true; } tcp_set_ca_state(sk, TCP_CA_Open); + tp->is_sack_reneg = 0; return false; } @@ -2397,8 +2398,10 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); inet_csk(sk)->icsk_retransmits = 0; - if (frto_undo || tcp_is_sack(tp)) + if (frto_undo || tcp_is_sack(tp)) { tcp_set_ca_state(sk, TCP_CA_Open); + tp->is_sack_reneg = 0; + } return true; } return false; @@ -2855,11 +2858,18 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, *rexmit = REXMIT_LOST; } -static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) +static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag) { u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ; struct tcp_sock *tp = tcp_sk(sk); + if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) { + /* If the remote keeps returning delayed ACKs, eventually + * the min filter would pick it up and overestimate the + * prop. delay when it expires. Skip suspected delayed ACKs. + */ + return; + } minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32, rtt_us ? : jiffies_to_usecs(1)); } @@ -2899,7 +2909,7 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag, * always taken together with ACK, SACK, or TS-opts. Any negative * values will be skipped with the seq_rtt_us < 0 check above. */ - tcp_update_rtt_min(sk, ca_rtt_us); + tcp_update_rtt_min(sk, ca_rtt_us, flag); tcp_rtt_estimator(sk, seq_rtt_us); tcp_set_rto(sk); @@ -3123,6 +3133,17 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) { seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt); ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt); + + if (pkts_acked == 1 && last_in_flight < tp->mss_cache && + last_in_flight && !prior_sacked && fully_acked && + sack->rate->prior_delivered + 1 == tp->delivered && + !(flag & (FLAG_CA_ALERT | FLAG_SYN_ACKED))) { + /* Conservatively mark a delayed ACK. It's typically + * from a lone runt packet over the round trip to + * a receiver w/o out-of-order or CE events. + */ + flag |= FLAG_ACK_MAYBE_DELAYED; + } } if (sack->first_sackt) { sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt); @@ -3495,6 +3516,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) struct tcp_sacktag_state sack_state; struct rate_sample rs = { .prior_delivered = 0 }; u32 prior_snd_una = tp->snd_una; + bool is_sack_reneg = tp->is_sack_reneg; u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; bool is_dupack = false; @@ -3611,7 +3633,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ lost = tp->lost - lost; /* freshly marked lost */ - tcp_rate_gen(sk, delivered, lost, sack_state.rate); + rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit); return 1; @@ -5296,6 +5319,9 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, unsigned int len = skb->len; struct tcp_sock *tp = tcp_sk(sk); + /* TCP congestion window tracking */ + trace_tcp_probe(sk, skb); + tcp_mstamp_refresh(tp); if (unlikely(!sk->sk_rx_dst)) inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c6bc0c4d19c6..95738aa0d8a6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -848,7 +848,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, 0, - tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, + tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr, AF_INET), inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, ip_hdr(skb)->tos); @@ -1591,6 +1591,34 @@ int tcp_filter(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(tcp_filter); +static void tcp_v4_restore_cb(struct sk_buff *skb) +{ + memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, + sizeof(struct inet_skb_parm)); +} + +static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, + const struct tcphdr *th) +{ + /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() + * barrier() makes sure compiler wont play fool^Waliasing games. + */ + memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), + sizeof(struct inet_skb_parm)); + barrier(); + + TCP_SKB_CB(skb)->seq = ntohl(th->seq); + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + + skb->len - th->doff * 4); + TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); + TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); + TCP_SKB_CB(skb)->tcp_tw_isn = 0; + TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); + TCP_SKB_CB(skb)->sacked = 0; + TCP_SKB_CB(skb)->has_rxtstamp = + skb->tstamp || skb_hwtstamps(skb)->hwtstamp; +} + /* * From tcp_input.c */ @@ -1631,24 +1659,6 @@ int tcp_v4_rcv(struct sk_buff *skb) th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); - /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() - * barrier() makes sure compiler wont play fool^Waliasing games. - */ - memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), - sizeof(struct inet_skb_parm)); - barrier(); - - TCP_SKB_CB(skb)->seq = ntohl(th->seq); - TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + - skb->len - th->doff * 4); - TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); - TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); - TCP_SKB_CB(skb)->tcp_tw_isn = 0; - TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); - TCP_SKB_CB(skb)->sacked = 0; - TCP_SKB_CB(skb)->has_rxtstamp = - skb->tstamp || skb_hwtstamps(skb)->hwtstamp; - lookup: sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, th->dest, sdif, &refcounted); @@ -1679,14 +1689,19 @@ process: sock_hold(sk); refcounted = true; nsk = NULL; - if (!tcp_filter(sk, skb)) + if (!tcp_filter(sk, skb)) { + th = (const struct tcphdr *)skb->data; + iph = ip_hdr(skb); + tcp_v4_fill_cb(skb, iph, th); nsk = tcp_check_req(sk, skb, req, false); + } if (!nsk) { reqsk_put(req); goto discard_and_relse; } if (nsk == sk) { reqsk_put(req); + tcp_v4_restore_cb(skb); } else if (tcp_child_process(sk, nsk, skb)) { tcp_v4_send_reset(nsk, skb); goto discard_and_relse; @@ -1712,6 +1727,7 @@ process: goto discard_and_relse; th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); + tcp_v4_fill_cb(skb, iph, th); skb->dev = NULL; @@ -1742,6 +1758,8 @@ no_tcp_socket: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard_it; + tcp_v4_fill_cb(skb, iph, th); + if (tcp_checksum_complete(skb)) { csum_error: __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); @@ -1768,6 +1786,8 @@ do_time_wait: goto discard_it; } + tcp_v4_fill_cb(skb, iph, th); + if (tcp_checksum_complete(skb)) { inet_twsk_put(inet_twsk(sk)); goto csum_error; @@ -1784,6 +1804,7 @@ do_time_wait: if (sk2) { inet_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; + tcp_v4_restore_cb(skb); refcounted = false; goto process; } @@ -1890,7 +1911,7 @@ void tcp_v4_destroy_sock(struct sock *sk) /* Clean up the MD5 key list, if any */ if (tp->md5sig_info) { tcp_clear_md5_list(sk); - kfree_rcu(tp->md5sig_info, rcu); + kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu); tp->md5sig_info = NULL; } #endif @@ -2260,7 +2281,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) timer_expires = jiffies; } - state = sk_state_load(sk); + state = inet_sk_state_load(sk); if (state == TCP_LISTEN) rx_queue = sk->sk_ack_backlog; else @@ -2337,7 +2358,6 @@ out: } static const struct file_operations tcp_afinfo_seq_fops = { - .owner = THIS_MODULE, .open = tcp_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 7097f92d16e5..03b51cdcc731 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -546,8 +546,7 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst) static DEFINE_SEQLOCK(fastopen_seqlock); void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, - struct tcp_fastopen_cookie *cookie, - int *syn_loss, unsigned long *last_syn_loss) + struct tcp_fastopen_cookie *cookie) { struct tcp_metrics_block *tm; @@ -564,8 +563,6 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, *cookie = tfom->cookie; if (cookie->len <= 0 && tfom->try_exp == 1) cookie->exp = true; - *syn_loss = tfom->syn_loss; - *last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0; } while (read_seqretry(&fastopen_seqlock, seq)); } rcu_read_unlock(); @@ -895,7 +892,7 @@ static void tcp_metrics_flush_all(struct net *net) pp = &hb->chain; for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) { match = net ? net_eq(tm_net(tm), net) : - !atomic_read(&tm_net(tm)->count); + !refcount_read(&tm_net(tm)->count); if (match) { *pp = tm->tcpm_next; kfree_rcu(tm, rcu_head); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index e36eff0403f4..a8384b0c11f8 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -310,10 +310,17 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) if (state == TCP_TIME_WAIT) timeo = TCP_TIMEWAIT_LEN; + /* tw_timer is pinned, so we need to make sure BH are disabled + * in following section, otherwise timer handler could run before + * we complete the initialization. + */ + local_bh_disable(); inet_twsk_schedule(tw, timeo); - /* Linkage updates. */ - __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); - inet_twsk_put(tw); + /* Linkage updates. + * Note that access to tw after this point is illegal. + */ + inet_twsk_hashdance(tw, sk, &tcp_hashinfo); + local_bh_enable(); } else { /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c index 0b5a05bd82e3..764298e52577 100644 --- a/net/ipv4/tcp_nv.c +++ b/net/ipv4/tcp_nv.c @@ -146,7 +146,7 @@ static void tcpnv_init(struct sock *sk) * within a datacenter, where we have reasonable estimates of * RTTs */ - base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT); + base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT, 0, NULL); if (base_rtt > 0) { ca->nv_base_rtt = base_rtt; ca->nv_lower_bound_rtt = (base_rtt * 205) >> 8; /* 80% */ @@ -364,7 +364,7 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample) */ cwnd_by_slope = (u32) div64_u64(((u64)ca->nv_rtt_max_rate) * ca->nv_min_rtt, - (u64)(80000 * tp->mss_cache)); + 80000ULL * tp->mss_cache); max_win = cwnd_by_slope + nv_pad; /* If cwnd > max_win, decrease cwnd diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index b6a2aa1dcf56..4d58e2ce0b5b 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -32,6 +32,9 @@ static void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq, static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb, netdev_features_t features) { + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)) + return ERR_PTR(-EINVAL); + if (!pskb_may_pull(skb, sizeof(struct tcphdr))) return ERR_PTR(-EINVAL); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a4d214c7b506..e9f985e42405 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1944,7 +1944,8 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, in_flight = tcp_packets_in_flight(tp); - BUG_ON(tcp_skb_pcount(skb) <= 1 || (tp->snd_cwnd <= in_flight)); + BUG_ON(tcp_skb_pcount(skb) <= 1); + BUG_ON(tp->snd_cwnd <= in_flight); send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; @@ -2414,15 +2415,12 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans; /* Schedule a loss probe in 2*RTT for SACK capable connections - * in Open state, that are either limited by cwnd or application. + * not in loss recovery, that are either limited by cwnd or application. */ if ((early_retrans != 3 && early_retrans != 4) || !tp->packets_out || !tcp_is_sack(tp) || - icsk->icsk_ca_state != TCP_CA_Open) - return false; - - if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && - !tcp_write_queue_empty(sk)) + (icsk->icsk_ca_state != TCP_CA_Open && + icsk->icsk_ca_state != TCP_CA_CWR)) return false; /* Probe timeout is 2*rtt. Add minimum RTO to account @@ -2907,6 +2905,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } + if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG)) + tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB, + TCP_SKB_CB(skb)->seq, segs, err); + if (likely(!err)) { TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; trace_tcp_retransmit_skb(sk, skb); @@ -3471,7 +3473,7 @@ int tcp_connect(struct sock *sk) struct sk_buff *buff; int err; - tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB); + tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL); if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) return -EHOSTUNREACH; /* Routing failure or similar. */ diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c deleted file mode 100644 index 697f4c67b2e3..000000000000 --- a/net/ipv4/tcp_probe.c +++ /dev/null @@ -1,301 +0,0 @@ -/* - * tcpprobe - Observe the TCP flow with kprobes. - * - * The idea for this came from Werner Almesberger's umlsim - * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/kernel.h> -#include <linux/kprobes.h> -#include <linux/socket.h> -#include <linux/tcp.h> -#include <linux/slab.h> -#include <linux/proc_fs.h> -#include <linux/module.h> -#include <linux/ktime.h> -#include <linux/time.h> -#include <net/net_namespace.h> - -#include <net/tcp.h> - -MODULE_AUTHOR("Stephen Hemminger <shemminger@linux-foundation.org>"); -MODULE_DESCRIPTION("TCP cwnd snooper"); -MODULE_LICENSE("GPL"); -MODULE_VERSION("1.1"); - -static int port __read_mostly; -MODULE_PARM_DESC(port, "Port to match (0=all)"); -module_param(port, int, 0); - -static unsigned int bufsize __read_mostly = 4096; -MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)"); -module_param(bufsize, uint, 0); - -static unsigned int fwmark __read_mostly; -MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)"); -module_param(fwmark, uint, 0); - -static int full __read_mostly; -MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)"); -module_param(full, int, 0); - -static const char procname[] = "tcpprobe"; - -struct tcp_log { - ktime_t tstamp; - union { - struct sockaddr raw; - struct sockaddr_in v4; - struct sockaddr_in6 v6; - } src, dst; - u16 length; - u32 snd_nxt; - u32 snd_una; - u32 snd_wnd; - u32 rcv_wnd; - u32 snd_cwnd; - u32 ssthresh; - u32 srtt; -}; - -static struct { - spinlock_t lock; - wait_queue_head_t wait; - ktime_t start; - u32 lastcwnd; - - unsigned long head, tail; - struct tcp_log *log; -} tcp_probe; - -static inline int tcp_probe_used(void) -{ - return (tcp_probe.head - tcp_probe.tail) & (bufsize - 1); -} - -static inline int tcp_probe_avail(void) -{ - return bufsize - tcp_probe_used() - 1; -} - -#define tcp_probe_copy_fl_to_si4(inet, si4, mem) \ - do { \ - si4.sin_family = AF_INET; \ - si4.sin_port = inet->inet_##mem##port; \ - si4.sin_addr.s_addr = inet->inet_##mem##addr; \ - } while (0) \ - -/* - * Hook inserted to be called before each receive packet. - * Note: arguments must match tcp_rcv_established()! - */ -static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th) -{ - unsigned int len = skb->len; - const struct tcp_sock *tp = tcp_sk(sk); - const struct inet_sock *inet = inet_sk(sk); - - /* Only update if port or skb mark matches */ - if (((port == 0 && fwmark == 0) || - ntohs(inet->inet_dport) == port || - ntohs(inet->inet_sport) == port || - (fwmark > 0 && skb->mark == fwmark)) && - (full || tp->snd_cwnd != tcp_probe.lastcwnd)) { - - spin_lock(&tcp_probe.lock); - /* If log fills, just silently drop */ - if (tcp_probe_avail() > 1) { - struct tcp_log *p = tcp_probe.log + tcp_probe.head; - - p->tstamp = ktime_get(); - switch (sk->sk_family) { - case AF_INET: - tcp_probe_copy_fl_to_si4(inet, p->src.v4, s); - tcp_probe_copy_fl_to_si4(inet, p->dst.v4, d); - break; - case AF_INET6: - memset(&p->src.v6, 0, sizeof(p->src.v6)); - memset(&p->dst.v6, 0, sizeof(p->dst.v6)); -#if IS_ENABLED(CONFIG_IPV6) - p->src.v6.sin6_family = AF_INET6; - p->src.v6.sin6_port = inet->inet_sport; - p->src.v6.sin6_addr = inet6_sk(sk)->saddr; - - p->dst.v6.sin6_family = AF_INET6; - p->dst.v6.sin6_port = inet->inet_dport; - p->dst.v6.sin6_addr = sk->sk_v6_daddr; -#endif - break; - default: - BUG(); - } - - p->length = len; - p->snd_nxt = tp->snd_nxt; - p->snd_una = tp->snd_una; - p->snd_cwnd = tp->snd_cwnd; - p->snd_wnd = tp->snd_wnd; - p->rcv_wnd = tp->rcv_wnd; - p->ssthresh = tcp_current_ssthresh(sk); - p->srtt = tp->srtt_us >> 3; - - tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1); - } - tcp_probe.lastcwnd = tp->snd_cwnd; - spin_unlock(&tcp_probe.lock); - - wake_up(&tcp_probe.wait); - } - - jprobe_return(); -} - -static struct jprobe tcp_jprobe = { - .kp = { - .symbol_name = "tcp_rcv_established", - }, - .entry = jtcp_rcv_established, -}; - -static int tcpprobe_open(struct inode *inode, struct file *file) -{ - /* Reset (empty) log */ - spin_lock_bh(&tcp_probe.lock); - tcp_probe.head = tcp_probe.tail = 0; - tcp_probe.start = ktime_get(); - spin_unlock_bh(&tcp_probe.lock); - - return 0; -} - -static int tcpprobe_sprint(char *tbuf, int n) -{ - const struct tcp_log *p - = tcp_probe.log + tcp_probe.tail; - struct timespec64 ts - = ktime_to_timespec64(ktime_sub(p->tstamp, tcp_probe.start)); - - return scnprintf(tbuf, n, - "%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n", - (unsigned long)ts.tv_sec, - (unsigned long)ts.tv_nsec, - &p->src, &p->dst, p->length, p->snd_nxt, p->snd_una, - p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd); -} - -static ssize_t tcpprobe_read(struct file *file, char __user *buf, - size_t len, loff_t *ppos) -{ - int error = 0; - size_t cnt = 0; - - if (!buf) - return -EINVAL; - - while (cnt < len) { - char tbuf[256]; - int width; - - /* Wait for data in buffer */ - error = wait_event_interruptible(tcp_probe.wait, - tcp_probe_used() > 0); - if (error) - break; - - spin_lock_bh(&tcp_probe.lock); - if (tcp_probe.head == tcp_probe.tail) { - /* multiple readers race? */ - spin_unlock_bh(&tcp_probe.lock); - continue; - } - - width = tcpprobe_sprint(tbuf, sizeof(tbuf)); - - if (cnt + width < len) - tcp_probe.tail = (tcp_probe.tail + 1) & (bufsize - 1); - - spin_unlock_bh(&tcp_probe.lock); - - /* if record greater than space available - return partial buffer (so far) */ - if (cnt + width >= len) - break; - - if (copy_to_user(buf + cnt, tbuf, width)) - return -EFAULT; - cnt += width; - } - - return cnt == 0 ? error : cnt; -} - -static const struct file_operations tcpprobe_fops = { - .owner = THIS_MODULE, - .open = tcpprobe_open, - .read = tcpprobe_read, - .llseek = noop_llseek, -}; - -static __init int tcpprobe_init(void) -{ - int ret = -ENOMEM; - - /* Warning: if the function signature of tcp_rcv_established, - * has been changed, you also have to change the signature of - * jtcp_rcv_established, otherwise you end up right here! - */ - BUILD_BUG_ON(__same_type(tcp_rcv_established, - jtcp_rcv_established) == 0); - - init_waitqueue_head(&tcp_probe.wait); - spin_lock_init(&tcp_probe.lock); - - if (bufsize == 0) - return -EINVAL; - - bufsize = roundup_pow_of_two(bufsize); - tcp_probe.log = kcalloc(bufsize, sizeof(struct tcp_log), GFP_KERNEL); - if (!tcp_probe.log) - goto err0; - - if (!proc_create(procname, S_IRUSR, init_net.proc_net, &tcpprobe_fops)) - goto err0; - - ret = register_jprobe(&tcp_jprobe); - if (ret) - goto err1; - - pr_info("probe registered (port=%d/fwmark=%u) bufsize=%u\n", - port, fwmark, bufsize); - return 0; - err1: - remove_proc_entry(procname, init_net.proc_net); - err0: - kfree(tcp_probe.log); - return ret; -} -module_init(tcpprobe_init); - -static __exit void tcpprobe_exit(void) -{ - remove_proc_entry(procname, init_net.proc_net); - unregister_jprobe(&tcp_jprobe); - kfree(tcp_probe.log); -} -module_exit(tcpprobe_exit); diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 3330a370d306..c61240e43923 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -106,7 +106,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, /* Update the connection delivery information and generate a rate sample. */ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, - struct rate_sample *rs) + bool is_sack_reneg, struct rate_sample *rs) { struct tcp_sock *tp = tcp_sk(sk); u32 snd_us, ack_us; @@ -124,8 +124,12 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, rs->acked_sacked = delivered; /* freshly ACKed or SACKed */ rs->losses = lost; /* freshly marked lost */ - /* Return an invalid sample if no timing information is available. */ - if (!rs->prior_mstamp) { + /* Return an invalid sample if no timing information is available or + * in recovery from loss with SACK reneging. Rate samples taken during + * a SACK reneging event may overestimate bw by including packets that + * were SACKed before the reneg. + */ + if (!rs->prior_mstamp || is_sack_reneg) { rs->delivered = -1; rs->interval_us = -1; return; diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index d3ea89020c69..3a81720ac0c4 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -55,7 +55,8 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) * to queuing or delayed ACKs. */ reo_wnd = 1000; - if ((tp->rack.reord || !tp->lost_out) && min_rtt != ~0U) { + if ((tp->rack.reord || inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery) && + min_rtt != ~0U) { reo_wnd = max((min_rtt >> 2) * tp->rack.reo_wnd_steps, reo_wnd); reo_wnd = min(reo_wnd, tp->srtt_us >> 3); } @@ -79,12 +80,12 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) */ remaining = tp->rack.rtt_us + reo_wnd - tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp); - if (remaining < 0) { + if (remaining <= 0) { tcp_rack_mark_skb_lost(sk, skb); list_del_init(&skb->tcp_tsorted_anchor); } else { - /* Record maximum wait time (+1 to avoid 0) */ - *reo_timeout = max_t(u32, *reo_timeout, 1 + remaining); + /* Record maximum wait time */ + *reo_timeout = max_t(u32, *reo_timeout, remaining); } } } @@ -116,13 +117,8 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, { u32 rtt_us; - if (tp->rack.mstamp && - !tcp_rack_sent_after(xmit_time, tp->rack.mstamp, - end_seq, tp->rack.end_seq)) - return; - rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time); - if (sacked & TCPCB_RETRANS) { + if (rtt_us < tcp_min_rtt(tp) && (sacked & TCPCB_RETRANS)) { /* If the sacked packet was retransmitted, it's ambiguous * whether the retransmission or the original (or the prior * retransmission) was sacked. @@ -133,13 +129,15 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, * so it's at least one RTT (i.e., retransmission is at least * an RTT later). */ - if (rtt_us < tcp_min_rtt(tp)) - return; + return; } - tp->rack.rtt_us = rtt_us; - tp->rack.mstamp = xmit_time; - tp->rack.end_seq = end_seq; tp->rack.advanced = 1; + tp->rack.rtt_us = rtt_us; + if (tcp_rack_sent_after(xmit_time, tp->rack.mstamp, + end_seq, tp->rack.end_seq)) { + tp->rack.mstamp = xmit_time; + tp->rack.end_seq = end_seq; + } } /* We have waited long enough to accommodate reordering. Mark the expired diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 16df6dd44b98..71fc60f1b326 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -48,11 +48,19 @@ static void tcp_write_err(struct sock *sk) * to prevent DoS attacks. It is called when a retransmission timeout * or zero probe timeout occurs on orphaned socket. * + * Also close if our net namespace is exiting; in that case there is no + * hope of ever communicating again since all netns interfaces are already + * down (or about to be down), and we need to release our dst references, + * which have been moved to the netns loopback interface, so the namespace + * can finish exiting. This condition is only possible if we are a kernel + * socket, as those do not hold references to the namespace. + * * Criteria is still not confirmed experimentally and may change. * We kill the socket, if: * 1. If number of orphaned sockets exceeds an administratively configured * limit. * 2. If we have strong memory pressure. + * 3. If our net namespace is exiting. */ static int tcp_out_of_resources(struct sock *sk, bool do_reset) { @@ -81,6 +89,13 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); return 1; } + + if (!check_net(sock_net(sk))) { + /* Not possible to send reset; just close */ + tcp_done(sk); + return 1; + } + return 0; } @@ -183,11 +198,6 @@ static int tcp_write_timeout(struct sock *sk) if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (icsk->icsk_retransmits) { dst_negative_advice(sk); - if (tp->syn_fastopen || tp->syn_data) - tcp_fastopen_cache_set(sk, 0, NULL, true, 0); - if (tp->syn_data && icsk->icsk_retransmits == 1) - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPFASTOPENACTIVEFAIL); } else if (!tp->syn_data && !tp->syn_fastopen) { sk_rethink_txhash(sk); } @@ -195,17 +205,6 @@ static int tcp_write_timeout(struct sock *sk) expired = icsk->icsk_retransmits >= retry_until; } else { if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) { - /* Some middle-boxes may black-hole Fast Open _after_ - * the handshake. Therefore we conservatively disable - * Fast Open on this path on recurring timeouts after - * successful Fast Open. - */ - if (tp->syn_data_acked) { - tcp_fastopen_cache_set(sk, 0, NULL, true, 0); - if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1) - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPFASTOPENACTIVEFAIL); - } /* Black hole detection */ tcp_mtu_probing(icsk, sk); @@ -228,11 +227,19 @@ static int tcp_write_timeout(struct sock *sk) expired = retransmits_timed_out(sk, retry_until, icsk->icsk_user_timeout); } + tcp_fastopen_active_detect_blackhole(sk, expired); + + if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG)) + tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RTO_CB, + icsk->icsk_retransmits, + icsk->icsk_rto, (int)expired); + if (expired) { /* Has it gone just too far? */ tcp_write_err(sk); return 1; } + return 0; } @@ -264,6 +271,7 @@ void tcp_delack_timer_handler(struct sock *sk) icsk->icsk_ack.pingpong = 0; icsk->icsk_ack.ato = TCP_ATO_MIN; } + tcp_mstamp_refresh(tcp_sk(sk)); tcp_send_ack(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS); } @@ -632,6 +640,7 @@ static void tcp_keepalive_timer (struct timer_list *t) goto out; } + tcp_mstamp_refresh(tp); if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { if (tp->linger2 >= 0) { const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e4ff25c947c5..f81f969f9c06 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -357,18 +357,12 @@ fail: } EXPORT_SYMBOL(udp_lib_get_port); -static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr, - unsigned int port) -{ - return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port; -} - int udp_v4_get_port(struct sock *sk, unsigned short snum) { unsigned int hash2_nulladdr = - udp4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum); + ipv4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum); unsigned int hash2_partial = - udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0); + ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0); /* precompute partial secondary hash */ udp_sk(sk)->udp_portaddr_hash = hash2_partial; @@ -445,7 +439,7 @@ static struct sock *udp4_lib_lookup2(struct net *net, struct sk_buff *skb) { struct sock *sk, *result; - int score, badness, matches = 0, reuseport = 0; + int score, badness; u32 hash = 0; result = NULL; @@ -454,23 +448,16 @@ static struct sock *udp4_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif, exact_dif); if (score > badness) { - reuseport = sk->sk_reuseport; - if (reuseport) { + if (sk->sk_reuseport) { hash = udp_ehashfn(net, daddr, hnum, saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); if (result) return result; - matches = 1; } badness = score; result = sk; - } else if (score == badness && reuseport) { - matches++; - if (reciprocal_scale(hash, matches) == 0) - result = sk; - hash = next_pseudo_random32(hash); } } return result; @@ -488,11 +475,11 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; bool exact_dif = udp_lib_exact_dif_match(net, skb); - int score, badness, matches = 0, reuseport = 0; + int score, badness; u32 hash = 0; if (hslot->count > 10) { - hash2 = udp4_portaddr_hash(net, daddr, hnum); + hash2 = ipv4_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; hslot2 = &udptable->hash2[slot2]; if (hslot->count < hslot2->count) @@ -503,7 +490,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, exact_dif, hslot2, skb); if (!result) { unsigned int old_slot2 = slot2; - hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum); + hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); slot2 = hash2 & udptable->mask; /* avoid searching the same slot again. */ if (unlikely(slot2 == old_slot2)) @@ -526,23 +513,16 @@ begin: score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif, exact_dif); if (score > badness) { - reuseport = sk->sk_reuseport; - if (reuseport) { + if (sk->sk_reuseport) { hash = udp_ehashfn(net, daddr, hnum, saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); if (result) return result; - matches = 1; } result = sk; badness = score; - } else if (score == badness && reuseport) { - matches++; - if (reciprocal_scale(hash, matches) == 0) - result = sk; - hash = next_pseudo_random32(hash); } } return result; @@ -997,8 +977,21 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (!saddr) saddr = inet->mc_addr; connected = 0; - } else if (!ipc.oif) + } else if (!ipc.oif) { ipc.oif = inet->uc_index; + } else if (ipv4_is_lbcast(daddr) && inet->uc_index) { + /* oif is set, packet is to local broadcast and + * and uc_index is set. oif is most likely set + * by sk_bound_dev_if. If uc_index != oif check if the + * oif is an L3 master and uc_index is an L3 slave. + * If so, we want to allow the send using the uc_index. + */ + if (ipc.oif != inet->uc_index && + ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk), + inet->uc_index)) { + ipc.oif = inet->uc_index; + } + } if (connected) rt = (struct rtable *)sk_dst_check(sk, 0); @@ -1775,7 +1768,7 @@ EXPORT_SYMBOL(udp_lib_rehash); static void udp_v4_rehash(struct sock *sk) { - u16 new_hash = udp4_portaddr_hash(sock_net(sk), + u16 new_hash = ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, inet_sk(sk)->inet_num); udp_lib_rehash(sk, new_hash); @@ -1966,9 +1959,9 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct sk_buff *nskb; if (use_hash2) { - hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & + hash2_any = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & udptable->mask; - hash2 = udp4_portaddr_hash(net, daddr, hnum) & udptable->mask; + hash2 = ipv4_portaddr_hash(net, daddr, hnum) & udptable->mask; start_lookup: hslot = &udptable->hash2[hash2]; offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); @@ -2200,7 +2193,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, int dif, int sdif) { unsigned short hnum = ntohs(loc_port); - unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum); + unsigned int hash2 = ipv4_portaddr_hash(net, loc_addr, hnum); unsigned int slot2 = hash2 & udp_table.mask; struct udp_hslot *hslot2 = &udp_table.hash2[slot2]; INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr); @@ -2502,16 +2495,14 @@ int compat_udp_getsockopt(struct sock *sk, int level, int optname, * but then block when reading it. Add special case code * to work around these arguably broken applications. */ -unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) +__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait) { - unsigned int mask = datagram_poll(file, sock, wait); + __poll_t mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; if (!skb_queue_empty(&udp_sk(sk)->reader_queue)) mask |= POLLIN | POLLRDNORM; - sock_rps_record_flow(sk); - /* Check for false positives due to checksum errors */ if ((mask & POLLRDNORM) && !(file->f_flags & O_NONBLOCK) && !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1) @@ -2736,7 +2727,6 @@ int udp4_seq_show(struct seq_file *seq, void *v) } static const struct file_operations udp_afinfo_seq_fops = { - .owner = THIS_MODULE, .open = udp_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 01801b77bd0d..ea6e6e7df0ee 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -203,6 +203,9 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, goto out; } + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP)) + goto out; + if (!pskb_may_pull(skb, sizeof(struct udphdr))) goto out; diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 59f10fe9782e..f96614e9b9a5 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -75,7 +75,6 @@ static struct inet_protosw udplite4_protosw = { #ifdef CONFIG_PROC_FS static const struct file_operations udplite_afinfo_seq_fops = { - .owner = THIS_MODULE, .open = udp_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index e50b7fea57ee..bcfc00e88756 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -23,6 +23,12 @@ int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb) return xfrm4_extract_header(skb); } +static int xfrm4_rcv_encap_finish2(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + return dst_input(skb); +} + static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { @@ -33,7 +39,11 @@ static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk, iph->tos, skb->dev)) goto drop; } - return dst_input(skb); + + if (xfrm_trans_queue(skb, xfrm4_rcv_encap_finish2)) + goto drop; + + return 0; drop: kfree_skb(skb); return NET_RX_DROP; diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index e6265e2c274e..63faeee989a9 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -62,7 +62,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); - top_iph->ttl = ip4_dst_hoplimit(dst->child); + top_iph->ttl = ip4_dst_hoplimit(xfrm_dst_child(dst)); top_iph->saddr = x->props.saddr.a4; top_iph->daddr = x->id.daddr.a4; @@ -92,6 +92,7 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) skb_reset_network_header(skb); skb_mac_header_rebuild(skb); + eth_hdr(skb)->h_proto = skb->protocol; err = 0; @@ -105,18 +106,15 @@ static struct sk_buff *xfrm4_mode_tunnel_gso_segment(struct xfrm_state *x, { __skb_push(skb, skb->mac_len); return skb_mac_gso_segment(skb, features); - } static void xfrm4_mode_tunnel_xmit(struct xfrm_state *x, struct sk_buff *skb) { struct xfrm_offload *xo = xfrm_offload(skb); - if (xo->flags & XFRM_GSO_SEGMENT) { - skb->network_header = skb->network_header - x->props.header_len; + if (xo->flags & XFRM_GSO_SEGMENT) skb->transport_header = skb->network_header + sizeof(struct iphdr); - } skb_reset_mac_len(skb); pskb_pull(skb, skb->mac_len + x->props.header_len); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f49bd7897e95..e1846b97ee69 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -186,7 +186,8 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, static void addrconf_dad_start(struct inet6_ifaddr *ifp); static void addrconf_dad_work(struct work_struct *w); -static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id); +static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id, + bool send_na); static void addrconf_dad_run(struct inet6_dev *idev); static void addrconf_rs_timer(struct timer_list *t); static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); @@ -3438,6 +3439,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, } else if (event == NETDEV_CHANGE) { if (!addrconf_link_ready(dev)) { /* device is still not ready. */ + rt6_sync_down_dev(dev, event); break; } @@ -3449,6 +3451,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, * multicast snooping switches */ ipv6_mc_up(idev); + rt6_sync_up(dev, RTNH_F_LINKDOWN); break; } idev->if_flags |= IF_READY; @@ -3484,6 +3487,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, if (run_pending) addrconf_dad_run(idev); + /* Device has an address by now */ + rt6_sync_up(dev, RTNH_F_DEAD); + /* * If the MTU changed during the interface down, * when the interface up, the changed MTU must be @@ -3577,6 +3583,7 @@ static bool addr_is_local(const struct in6_addr *addr) static int addrconf_ifdown(struct net_device *dev, int how) { + unsigned long event = how ? NETDEV_UNREGISTER : NETDEV_DOWN; struct net *net = dev_net(dev); struct inet6_dev *idev; struct inet6_ifaddr *ifa, *tmp; @@ -3586,8 +3593,7 @@ static int addrconf_ifdown(struct net_device *dev, int how) ASSERT_RTNL(); - rt6_ifdown(net, dev); - neigh_ifdown(&nd_tbl, dev); + rt6_disable_ip(dev, event); idev = __in6_dev_get(dev); if (!idev) @@ -3833,12 +3839,17 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) idev->cnf.accept_dad < 1) || !(ifp->flags&IFA_F_TENTATIVE) || ifp->flags & IFA_F_NODAD) { + bool send_na = false; + + if (ifp->flags & IFA_F_TENTATIVE && + !(ifp->flags & IFA_F_OPTIMISTIC)) + send_na = true; bump_id = ifp->flags & IFA_F_TENTATIVE; ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED); spin_unlock(&ifp->lock); read_unlock_bh(&idev->lock); - addrconf_dad_completed(ifp, bump_id); + addrconf_dad_completed(ifp, bump_id, send_na); return; } @@ -3967,16 +3978,21 @@ static void addrconf_dad_work(struct work_struct *w) } if (ifp->dad_probes == 0) { + bool send_na = false; + /* * DAD was successful */ + if (ifp->flags & IFA_F_TENTATIVE && + !(ifp->flags & IFA_F_OPTIMISTIC)) + send_na = true; bump_id = ifp->flags & IFA_F_TENTATIVE; ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED); spin_unlock(&ifp->lock); write_unlock_bh(&idev->lock); - addrconf_dad_completed(ifp, bump_id); + addrconf_dad_completed(ifp, bump_id, send_na); goto out; } @@ -4014,7 +4030,8 @@ static bool ipv6_lonely_lladdr(struct inet6_ifaddr *ifp) return true; } -static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id) +static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id, + bool send_na) { struct net_device *dev = ifp->idev->dev; struct in6_addr lladdr; @@ -4046,6 +4063,16 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id) if (send_mld) ipv6_mc_dad_complete(ifp->idev); + /* send unsolicited NA if enabled */ + if (send_na && + (ifp->idev->cnf.ndisc_notify || + dev_net(dev)->ipv6.devconf_all->ndisc_notify)) { + ndisc_send_na(dev, &in6addr_linklocal_allnodes, &ifp->addr, + /*router=*/ !!ifp->idev->cnf.forwarding, + /*solicited=*/ false, /*override=*/ true, + /*inc_opt=*/ true); + } + if (send_rs) { /* * If a host as already performed a random delay @@ -4209,7 +4236,6 @@ static int if6_seq_open(struct inode *inode, struct file *file) } static const struct file_operations if6_fops = { - .owner = THIS_MODULE, .open = if6_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -4352,9 +4378,11 @@ restart: spin_lock(&ifpub->lock); ifpub->regen_count = 0; spin_unlock(&ifpub->lock); + rcu_read_unlock_bh(); ipv6_create_tempaddr(ifpub, ifp, true); in6_ifa_put(ifpub); in6_ifa_put(ifp); + rcu_read_lock_bh(); goto restart; } } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next)) @@ -6595,27 +6623,45 @@ int __init addrconf_init(void) rtnl_af_register(&inet6_ops); - err = __rtnl_register(PF_INET6, RTM_GETLINK, NULL, inet6_dump_ifinfo, - 0); + err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETLINK, + NULL, inet6_dump_ifinfo, 0); if (err < 0) goto errout; - /* Only the first call to __rtnl_register can fail */ - __rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL, 0); - __rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL, 0); - __rtnl_register(PF_INET6, RTM_GETADDR, inet6_rtm_getaddr, - inet6_dump_ifaddr, RTNL_FLAG_DOIT_UNLOCKED); - __rtnl_register(PF_INET6, RTM_GETMULTICAST, NULL, - inet6_dump_ifmcaddr, 0); - __rtnl_register(PF_INET6, RTM_GETANYCAST, NULL, - inet6_dump_ifacaddr, 0); - __rtnl_register(PF_INET6, RTM_GETNETCONF, inet6_netconf_get_devconf, - inet6_netconf_dump_devconf, RTNL_FLAG_DOIT_UNLOCKED); - - ipv6_addr_label_rtnl_register(); + err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWADDR, + inet6_rtm_newaddr, NULL, 0); + if (err < 0) + goto errout; + err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELADDR, + inet6_rtm_deladdr, NULL, 0); + if (err < 0) + goto errout; + err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETADDR, + inet6_rtm_getaddr, inet6_dump_ifaddr, + RTNL_FLAG_DOIT_UNLOCKED); + if (err < 0) + goto errout; + err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETMULTICAST, + NULL, inet6_dump_ifmcaddr, 0); + if (err < 0) + goto errout; + err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETANYCAST, + NULL, inet6_dump_ifacaddr, 0); + if (err < 0) + goto errout; + err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETNETCONF, + inet6_netconf_get_devconf, + inet6_netconf_dump_devconf, + RTNL_FLAG_DOIT_UNLOCKED); + if (err < 0) + goto errout; + err = ipv6_addr_label_rtnl_register(); + if (err < 0) + goto errout; return 0; errout: + rtnl_unregister_all(PF_INET6); rtnl_af_unregister(&inet6_ops); unregister_netdevice_notifier(&ipv6_dev_notf); errlo: diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 00e1f8ee08f8..1d6ced37ad71 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -547,13 +547,22 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, return err; } -void __init ipv6_addr_label_rtnl_register(void) +int __init ipv6_addr_label_rtnl_register(void) { - __rtnl_register(PF_INET6, RTM_NEWADDRLABEL, ip6addrlbl_newdel, - NULL, RTNL_FLAG_DOIT_UNLOCKED); - __rtnl_register(PF_INET6, RTM_DELADDRLABEL, ip6addrlbl_newdel, - NULL, RTNL_FLAG_DOIT_UNLOCKED); - __rtnl_register(PF_INET6, RTM_GETADDRLABEL, ip6addrlbl_get, - ip6addrlbl_dump, RTNL_FLAG_DOIT_UNLOCKED); -} + int ret; + ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWADDRLABEL, + ip6addrlbl_newdel, + NULL, RTNL_FLAG_DOIT_UNLOCKED); + if (ret < 0) + return ret; + ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELADDRLABEL, + ip6addrlbl_newdel, + NULL, RTNL_FLAG_DOIT_UNLOCKED); + if (ret < 0) + return ret; + ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETADDRLABEL, + ip6addrlbl_get, + ip6addrlbl_dump, RTNL_FLAG_DOIT_UNLOCKED); + return ret; +} diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index c26f71234b9c..416917719a6f 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -210,7 +210,6 @@ lookup_protocol: np->mcast_hops = IPV6_DEFAULT_MCASTHOPS; np->mc_loop = 1; np->pmtudisc = IPV6_PMTUDISC_WANT; - np->autoflowlabel = ip6_default_np_autolabel(net); np->repflow = net->ipv6.sysctl.flowlabel_reflect; sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; @@ -285,6 +284,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct net *net = sock_net(sk); __be32 v4addr = 0; unsigned short snum; + bool saved_ipv6only; int addr_type = 0; int err = 0; @@ -390,19 +390,21 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (!(addr_type & IPV6_ADDR_MULTICAST)) np->saddr = addr->sin6_addr; + saved_ipv6only = sk->sk_ipv6only; + if (addr_type != IPV6_ADDR_ANY && addr_type != IPV6_ADDR_MAPPED) + sk->sk_ipv6only = 1; + /* Make sure we are allowed to bind here. */ if ((snum || !inet->bind_address_no_port) && sk->sk_prot->get_port(sk, snum)) { + sk->sk_ipv6only = saved_ipv6only; inet_reset_saddr(sk); err = -EADDRINUSE; goto out; } - if (addr_type != IPV6_ADDR_ANY) { + if (addr_type != IPV6_ADDR_ANY) sk->sk_userlocks |= SOCK_BINDADDR_LOCK; - if (addr_type != IPV6_ADDR_MAPPED) - sk->sk_ipv6only = 1; - } if (snum) sk->sk_userlocks |= SOCK_BINDPORT_LOCK; inet->inet_sport = htons(inet->inet_num); diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 0bbab8a4b5d8..8e085cc05aeb 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -533,7 +533,6 @@ static int ac6_seq_open(struct inode *inode, struct file *file) } static const struct file_operations ac6_seq_fops = { - .owner = THIS_MODULE, .open = ac6_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index a1f918713006..fbf08ce3f5ab 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -221,8 +221,7 @@ ipv4_connected: if (__ipv6_addr_needs_scope_id(addr_type)) { if (addr_len >= sizeof(struct sockaddr_in6) && usin->sin6_scope_id) { - if (sk->sk_bound_dev_if && - sk->sk_bound_dev_if != usin->sin6_scope_id) { + if (!sk_dev_equal_l3scope(sk, usin->sin6_scope_id)) { err = -EINVAL; goto out; } diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index a902ff8f59be..97513f35bcc5 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -141,14 +141,32 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) static void esp_output_done(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; + struct xfrm_offload *xo = xfrm_offload(skb); void *tmp; - struct dst_entry *dst = skb_dst(skb); - struct xfrm_state *x = dst->xfrm; + struct xfrm_state *x; + + if (xo && (xo->flags & XFRM_DEV_RESUME)) + x = skb->sp->xvec[skb->sp->len - 1]; + else + x = skb_dst(skb)->xfrm; tmp = ESP_SKB_CB(skb)->tmp; esp_ssg_unref(x, tmp); kfree(tmp); - xfrm_output_resume(skb, err); + + if (xo && (xo->flags & XFRM_DEV_RESUME)) { + if (err) { + XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR); + kfree_skb(skb); + return; + } + + skb_push(skb, skb->data - skb_mac_header(skb)); + secpath_reset(skb); + xfrm_dev_resume(skb); + } else { + xfrm_output_resume(skb, err); + } } /* Move ESP header back into place. */ @@ -734,17 +752,13 @@ static int esp_init_aead(struct xfrm_state *x) char aead_name[CRYPTO_MAX_ALG_NAME]; struct crypto_aead *aead; int err; - u32 mask = 0; err = -ENAMETOOLONG; if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME) goto error; - if (x->xso.offload_handle) - mask |= CRYPTO_ALG_ASYNC; - - aead = crypto_alloc_aead(aead_name, 0, mask); + aead = crypto_alloc_aead(aead_name, 0, 0); err = PTR_ERR(aead); if (IS_ERR(aead)) goto error; @@ -774,7 +788,6 @@ static int esp_init_authenc(struct xfrm_state *x) char authenc_name[CRYPTO_MAX_ALG_NAME]; unsigned int keylen; int err; - u32 mask = 0; err = -EINVAL; if (!x->ealg) @@ -800,10 +813,7 @@ static int esp_init_authenc(struct xfrm_state *x) goto error; } - if (x->xso.offload_handle) - mask |= CRYPTO_ALG_ASYNC; - - aead = crypto_alloc_aead(authenc_name, 0, mask); + aead = crypto_alloc_aead(authenc_name, 0, 0); err = PTR_ERR(aead); if (IS_ERR(aead)) goto error; @@ -890,13 +900,12 @@ static int esp6_init_state(struct xfrm_state *x) x->props.header_len += IPV4_BEET_PHMAXLEN + (sizeof(struct ipv6hdr) - sizeof(struct iphdr)); break; + default: case XFRM_MODE_TRANSPORT: break; case XFRM_MODE_TUNNEL: x->props.header_len += sizeof(struct ipv6hdr); break; - default: - goto error; } align = ALIGN(crypto_aead_blocksize(aead), 4); diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 333a478aa161..3fd1ec775dc2 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -60,7 +60,8 @@ static struct sk_buff **esp6_gro_receive(struct sk_buff **head, int nhoff; int err; - skb_pull(skb, offset); + if (!pskb_pull(skb, offset)) + return NULL; if ((err = xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq)) != 0) goto out; @@ -135,75 +136,39 @@ static void esp6_gso_encap(struct xfrm_state *x, struct sk_buff *skb) static struct sk_buff *esp6_gso_segment(struct sk_buff *skb, netdev_features_t features) { - __u32 seq; - int err = 0; - struct sk_buff *skb2; struct xfrm_state *x; struct ip_esp_hdr *esph; struct crypto_aead *aead; - struct sk_buff *segs = ERR_PTR(-EINVAL); netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); if (!xo) - goto out; + return ERR_PTR(-EINVAL); - seq = xo->seq.low; + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_ESP)) + return ERR_PTR(-EINVAL); x = skb->sp->xvec[skb->sp->len - 1]; aead = x->data; esph = ip_esp_hdr(skb); if (esph->spi != x->id.spi) - goto out; + return ERR_PTR(-EINVAL); if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead))) - goto out; + return ERR_PTR(-EINVAL); __skb_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)); skb->encap_hdr_csum = 1; - if (!(features & NETIF_F_HW_ESP)) + if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle || + (x->xso.dev != skb->dev)) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); - segs = x->outer_mode->gso_segment(x, skb, esp_features); - if (IS_ERR_OR_NULL(segs)) - goto out; - - __skb_pull(skb, skb->data - skb_mac_header(skb)); + xo->flags |= XFRM_GSO_SEGMENT; - skb2 = segs; - do { - struct sk_buff *nskb = skb2->next; - - xo = xfrm_offload(skb2); - xo->flags |= XFRM_GSO_SEGMENT; - xo->seq.low = seq; - xo->seq.hi = xfrm_replay_seqhi(x, seq); - - if(!(features & NETIF_F_HW_ESP)) - xo->flags |= CRYPTO_FALLBACK; - - x->outer_mode->xmit(x, skb2); - - err = x->type_offload->xmit(x, skb2, esp_features); - if (err) { - kfree_skb_list(segs); - return ERR_PTR(err); - } - - if (!skb_is_gso(skb2)) - seq++; - else - seq += skb_shinfo(skb2)->gso_segs; - - skb_push(skb2, skb2->mac_len); - skb2 = nskb; - } while (skb2); - -out: - return segs; + return x->outer_mode->gso_segment(x, skb, esp_features); } static int esp6_input_tail(struct xfrm_state *x, struct sk_buff *skb) @@ -222,6 +187,7 @@ static int esp6_input_tail(struct xfrm_state *x, struct sk_buff *skb) static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { + int len; int err; int alen; int blksize; @@ -230,6 +196,7 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features struct crypto_aead *aead; struct esp_info esp; bool hw_offload = true; + __u32 seq; esp.inplace = true; @@ -265,28 +232,33 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features return esp.nfrags; } + seq = xo->seq.low; + esph = ip_esp_hdr(skb); esph->spi = x->id.spi; skb_push(skb, -skb_network_offset(skb)); if (xo->flags & XFRM_GSO_SEGMENT) { - esph->seq_no = htonl(xo->seq.low); - } else { - int len; - - len = skb->len - sizeof(struct ipv6hdr); - if (len > IPV6_MAXPLEN) - len = 0; + esph->seq_no = htonl(seq); - ipv6_hdr(skb)->payload_len = htons(len); + if (!skb_is_gso(skb)) + xo->seq.low++; + else + xo->seq.low += skb_shinfo(skb)->gso_segs; } + esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32)); + + len = skb->len - sizeof(struct ipv6hdr); + if (len > IPV6_MAXPLEN) + len = 0; + + ipv6_hdr(skb)->payload_len = htons(len); + if (hw_offload) return 0; - esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32)); - err = esp6_output_tail(x, skb, &esp); if (err) return err; diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 83bd75713535..bc68eb661970 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -925,6 +925,15 @@ static void ipv6_push_rthdr4(struct sk_buff *skb, u8 *proto, sr_phdr->segments[0] = **addr_p; *addr_p = &sr_ihdr->segments[sr_ihdr->segments_left]; + if (sr_ihdr->hdrlen > hops * 2) { + int tlvs_offset, tlvs_length; + + tlvs_offset = (1 + hops * 2) << 3; + tlvs_length = (sr_ihdr->hdrlen - hops * 2) << 3; + memcpy((char *)sr_phdr + tlvs_offset, + (char *)sr_ihdr + tlvs_offset, tlvs_length); + } + #ifdef CONFIG_IPV6_SEG6_HMAC if (sr_has_hmac(sr_phdr)) { struct net *net = NULL; diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 6eb5e68f112a..44c39c5f0638 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -512,9 +512,7 @@ static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) struct ila_map *ila; int ret; - ret = rhashtable_walk_start(rhiter); - if (ret && ret != -EAGAIN) - goto done; + rhashtable_walk_start(rhiter); for (;;) { ila = rhashtable_walk_next(rhiter); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index b01858f5deb1..2febe26de6a1 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -125,6 +125,40 @@ static inline int compute_score(struct sock *sk, struct net *net, } /* called with rcu_read_lock() */ +static struct sock *inet6_lhash2_lookup(struct net *net, + struct inet_listen_hashbucket *ilb2, + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, + const __be16 sport, const struct in6_addr *daddr, + const unsigned short hnum, const int dif, const int sdif) +{ + bool exact_dif = inet6_exact_dif_match(net, skb); + struct inet_connection_sock *icsk; + struct sock *sk, *result = NULL; + int score, hiscore = 0; + u32 phash = 0; + + inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { + sk = (struct sock *)icsk; + score = compute_score(sk, net, hnum, daddr, dif, sdif, + exact_dif); + if (score > hiscore) { + if (sk->sk_reuseport) { + phash = inet6_ehashfn(net, daddr, hnum, + saddr, sport); + result = reuseport_select_sock(sk, phash, + skb, doff); + if (result) + return result; + } + result = sk; + hiscore = score; + } + } + + return result; +} + struct sock *inet6_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, @@ -134,31 +168,56 @@ struct sock *inet6_lookup_listener(struct net *net, { unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; - int score, hiscore = 0, matches = 0, reuseport = 0; bool exact_dif = inet6_exact_dif_match(net, skb); + struct inet_listen_hashbucket *ilb2; struct sock *sk, *result = NULL; + int score, hiscore = 0; + unsigned int hash2; u32 phash = 0; + if (ilb->count <= 10 || !hashinfo->lhash2) + goto port_lookup; + + /* Too many sk in the ilb bucket (which is hashed by port alone). + * Try lhash2 (which is hashed by port and addr) instead. + */ + + hash2 = ipv6_portaddr_hash(net, daddr, hnum); + ilb2 = inet_lhash2_bucket(hashinfo, hash2); + if (ilb2->count > ilb->count) + goto port_lookup; + + result = inet6_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + if (result) + return result; + + /* Lookup lhash2 with in6addr_any */ + + hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); + ilb2 = inet_lhash2_bucket(hashinfo, hash2); + if (ilb2->count > ilb->count) + goto port_lookup; + + return inet6_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + +port_lookup: sk_for_each(sk, &ilb->head) { score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif); if (score > hiscore) { - reuseport = sk->sk_reuseport; - if (reuseport) { + if (sk->sk_reuseport) { phash = inet6_ehashfn(net, daddr, hnum, saddr, sport); result = reuseport_select_sock(sk, phash, skb, doff); if (result) return result; - matches = 1; } result = sk; hiscore = score; - } else if (score == hiscore && reuseport) { - matches++; - if (reciprocal_scale(phash, matches) == 0) - result = sk; - phash = next_pseudo_random32(phash); } } return result; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index f5285f4e1d08..92b8d8c75eed 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -107,16 +107,13 @@ enum { void fib6_update_sernum(struct rt6_info *rt) { - struct fib6_table *table = rt->rt6i_table; struct net *net = dev_net(rt->dst.dev); struct fib6_node *fn; - spin_lock_bh(&table->tb6_lock); fn = rcu_dereference_protected(rt->rt6i_node, - lockdep_is_held(&table->tb6_lock)); + lockdep_is_held(&rt->rt6i_table->tb6_lock)); if (fn) fn->fn_sernum = fib6_new_sernum(net); - spin_unlock_bh(&table->tb6_lock); } /* @@ -640,6 +637,11 @@ static struct fib6_node *fib6_add_1(struct net *net, if (!(fn->fn_flags & RTN_RTINFO)) { RCU_INIT_POINTER(fn->leaf, NULL); rt6_release(leaf); + /* remove null_entry in the root node */ + } else if (fn->fn_flags & RTN_TL_ROOT && + rcu_access_pointer(fn->leaf) == + net->ipv6.ip6_null_entry) { + RCU_INIT_POINTER(fn->leaf, NULL); } return fn; @@ -799,12 +801,6 @@ insert_above: return ln; } -static bool rt6_qualify_for_ecmp(struct rt6_info *rt) -{ - return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == - RTF_GATEWAY; -} - static void fib6_copy_metrics(u32 *mp, const struct mx6_config *mxc) { int i; @@ -893,7 +889,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, ins = &fn->leaf; for (iter = leaf; iter; - iter = rcu_dereference_protected(iter->dst.rt6_next, + iter = rcu_dereference_protected(iter->rt6_next, lockdep_is_held(&rt->rt6i_table->tb6_lock))) { /* * Search for duplicates @@ -950,7 +946,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, break; next_iter: - ins = &iter->dst.rt6_next; + ins = &iter->rt6_next; } if (fallback_ins && !found) { @@ -979,7 +975,7 @@ next_iter: &sibling->rt6i_siblings); break; } - sibling = rcu_dereference_protected(sibling->dst.rt6_next, + sibling = rcu_dereference_protected(sibling->rt6_next, lockdep_is_held(&rt->rt6i_table->tb6_lock)); } /* For each sibling in the list, increment the counter of @@ -994,6 +990,7 @@ next_iter: rt6i_nsiblings++; } BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings); + rt6_multipath_rebalance(temp_sibling); } /* @@ -1009,7 +1006,7 @@ add: if (err) return err; - rcu_assign_pointer(rt->dst.rt6_next, iter); + rcu_assign_pointer(rt->rt6_next, iter); atomic_inc(&rt->rt6i_ref); rcu_assign_pointer(rt->rt6i_node, fn); rcu_assign_pointer(*ins, rt); @@ -1040,7 +1037,7 @@ add: atomic_inc(&rt->rt6i_ref); rcu_assign_pointer(rt->rt6i_node, fn); - rt->dst.rt6_next = iter->dst.rt6_next; + rt->rt6_next = iter->rt6_next; rcu_assign_pointer(*ins, rt); call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, rt, extack); @@ -1059,14 +1056,14 @@ add: if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ - ins = &rt->dst.rt6_next; + ins = &rt->rt6_next; iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->rt6i_table->tb6_lock)); while (iter) { if (iter->rt6i_metric > rt->rt6i_metric) break; if (rt6_qualify_for_ecmp(iter)) { - *ins = iter->dst.rt6_next; + *ins = iter->rt6_next; iter->rt6i_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) @@ -1075,7 +1072,7 @@ add: nsiblings--; info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { - ins = &iter->dst.rt6_next; + ins = &iter->rt6_next; } iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->rt6i_table->tb6_lock)); @@ -1102,8 +1099,8 @@ void fib6_force_start_gc(struct net *net) jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } -static void fib6_update_sernum_upto_root(struct rt6_info *rt, - int sernum) +static void __fib6_update_sernum_upto_root(struct rt6_info *rt, + int sernum) { struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, lockdep_is_held(&rt->rt6i_table->tb6_lock)); @@ -1117,6 +1114,11 @@ static void fib6_update_sernum_upto_root(struct rt6_info *rt, } } +void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt) +{ + __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net)); +} + /* * Add routing information to the routing tree. * <destination addr>/<source addr> @@ -1221,8 +1223,14 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, } if (!rcu_access_pointer(fn->leaf)) { - atomic_inc(&rt->rt6i_ref); - rcu_assign_pointer(fn->leaf, rt); + if (fn->fn_flags & RTN_TL_ROOT) { + /* put back null_entry for root node */ + rcu_assign_pointer(fn->leaf, + info->nl_net->ipv6.ip6_null_entry); + } else { + atomic_inc(&rt->rt6i_ref); + rcu_assign_pointer(fn->leaf, rt); + } } fn = sn; } @@ -1230,7 +1238,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, err = fib6_add_rt2node(fn, rt, info, mxc, extack); if (!err) { - fib6_update_sernum_upto_root(rt, sernum); + __fib6_update_sernum_upto_root(rt, sernum); fib6_start_gc(info->nl_net, rt); } @@ -1241,23 +1249,28 @@ out: * If fib6_add_1 has cleared the old leaf pointer in the * super-tree leaf node we have to find a new one for it. */ - struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf, - lockdep_is_held(&table->tb6_lock)); - if (pn != fn && pn_leaf == rt) { - pn_leaf = NULL; - RCU_INIT_POINTER(pn->leaf, NULL); - atomic_dec(&rt->rt6i_ref); - } - if (pn != fn && !pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { - pn_leaf = fib6_find_prefix(info->nl_net, table, pn); -#if RT6_DEBUG >= 2 - if (!pn_leaf) { - WARN_ON(!pn_leaf); - pn_leaf = info->nl_net->ipv6.ip6_null_entry; + if (pn != fn) { + struct rt6_info *pn_leaf = + rcu_dereference_protected(pn->leaf, + lockdep_is_held(&table->tb6_lock)); + if (pn_leaf == rt) { + pn_leaf = NULL; + RCU_INIT_POINTER(pn->leaf, NULL); + atomic_dec(&rt->rt6i_ref); } + if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { + pn_leaf = fib6_find_prefix(info->nl_net, table, + pn); +#if RT6_DEBUG >= 2 + if (!pn_leaf) { + WARN_ON(!pn_leaf); + pn_leaf = + info->nl_net->ipv6.ip6_null_entry; + } #endif - atomic_inc(&pn_leaf->rt6i_ref); - rcu_assign_pointer(pn->leaf, pn_leaf); + atomic_inc(&pn_leaf->rt6i_ref); + rcu_assign_pointer(pn->leaf, pn_leaf); + } } #endif goto failure; @@ -1265,13 +1278,17 @@ out: return err; failure: - /* fn->leaf could be NULL if fn is an intermediate node and we - * failed to add the new route to it in both subtree creation - * failure and fib6_add_rt2node() failure case. - * In both cases, fib6_repair_tree() should be called to fix - * fn->leaf. + /* fn->leaf could be NULL and fib6_repair_tree() needs to be called if: + * 1. fn is an intermediate node and we failed to add the new + * route to it in both subtree creation failure and fib6_add_rt2node() + * failure case. + * 2. fn is the root node in the table and we fail to add the first + * default route to it. */ - if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) + if (fn && + (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) || + (fn->fn_flags & RTN_TL_ROOT && + !rcu_access_pointer(fn->leaf)))) fib6_repair_tree(info->nl_net, table, fn); /* Always release dst as dst->__refcnt is guaranteed * to be taken before entering this function @@ -1526,6 +1543,12 @@ static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_walker *w; int iter = 0; + /* Set fn->leaf to null_entry for root node. */ + if (fn->fn_flags & RTN_TL_ROOT) { + rcu_assign_pointer(fn->leaf, net->ipv6.ip6_null_entry); + return fn; + } + for (;;) { struct fib6_node *fn_r = rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)); @@ -1644,7 +1667,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE); /* Unlink it */ - *rtp = rt->dst.rt6_next; + *rtp = rt->rt6_next; rt->rt6i_node = NULL; net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; @@ -1665,6 +1688,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, sibling->rt6i_nsiblings--; rt->rt6i_nsiblings = 0; list_del_init(&rt->rt6i_siblings); + rt6_multipath_rebalance(next_sibling); } /* Adjust walkers */ @@ -1672,7 +1696,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { RT6_TRACE("walker %p adjusted by delroute\n", w); - w->leaf = rcu_dereference_protected(rt->dst.rt6_next, + w->leaf = rcu_dereference_protected(rt->rt6_next, lockdep_is_held(&table->tb6_lock)); if (!w->leaf) w->state = FWS_U; @@ -1680,10 +1704,15 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, } read_unlock(&net->ipv6.fib6_walker_lock); - /* If it was last route, expunge its radix tree node */ + /* If it was last route, call fib6_repair_tree() to: + * 1. For root node, put back null_entry as how the table was created. + * 2. For other nodes, expunge its radix tree node. + */ if (!rcu_access_pointer(fn->leaf)) { - fn->fn_flags &= ~RTN_RTINFO; - net->ipv6.rt6_stats->fib_route_nodes--; + if (!(fn->fn_flags & RTN_TL_ROOT)) { + fn->fn_flags &= ~RTN_RTINFO; + net->ipv6.rt6_stats->fib_route_nodes--; + } fn = fib6_repair_tree(net, table, fn); } @@ -1731,7 +1760,7 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) fib6_del_route(table, fn, rtp, info); return 0; } - rtp_next = &cur->dst.rt6_next; + rtp_next = &cur->rt6_next; } return -ENOENT; } @@ -1887,7 +1916,7 @@ static int fib6_clean_node(struct fib6_walker *w) for_each_fib6_walker_rt(w) { res = c->func(rt, c->arg); - if (res < 0) { + if (res == -1) { w->leaf = rt; res = fib6_del(rt, &info); if (res) { @@ -1900,6 +1929,12 @@ static int fib6_clean_node(struct fib6_walker *w) continue; } return 0; + } else if (res == -2) { + if (WARN_ON(!rt->rt6i_nsiblings)) + continue; + rt = list_last_entry(&rt->rt6i_siblings, + struct rt6_info, rt6i_siblings); + continue; } WARN_ON(res != 0); } @@ -1911,7 +1946,8 @@ static int fib6_clean_node(struct fib6_walker *w) * Convenient frontend to tree walker. * * func is called on each route. - * It may return -1 -> delete this route. + * It may return -2 -> skip multipath route. + * -1 -> delete this route. * 0 -> continue walking */ @@ -2103,7 +2139,6 @@ static void fib6_net_exit(struct net *net) { unsigned int i; - rt6_ifdown(net, NULL); del_timer_sync(&net->ipv6.ip6_fib_timer); for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { @@ -2142,8 +2177,8 @@ int __init fib6_init(void) if (ret) goto out_kmem_cache_create; - ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib, - 0); + ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, NULL, + inet6_dump_fib, 0); if (ret) goto out_unregister_subsys; @@ -2208,7 +2243,7 @@ static int ipv6_route_yield(struct fib6_walker *w) do { iter->w.leaf = rcu_dereference_protected( - iter->w.leaf->dst.rt6_next, + iter->w.leaf->rt6_next, lockdep_is_held(&iter->tbl->tb6_lock)); iter->skip--; if (!iter->skip && iter->w.leaf) @@ -2274,7 +2309,7 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (!v) goto iter_table; - n = rcu_dereference_bh(((struct rt6_info *)v)->dst.rt6_next); + n = rcu_dereference_bh(((struct rt6_info *)v)->rt6_next); if (n) { ++*pos; return n; diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 7f59c8fabeeb..3dab664ff503 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -836,7 +836,6 @@ static int ip6fl_seq_release(struct inode *inode, struct file *file) } static const struct file_operations ip6fl_seq_fops = { - .owner = THIS_MODULE, .open = ip6fl_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 4cfd8e0696fe..05f070e123e4 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -55,6 +55,8 @@ #include <net/ip6_route.h> #include <net/ip6_tunnel.h> #include <net/gre.h> +#include <net/erspan.h> +#include <net/dst_metadata.h> static bool log_ecn_error = true; @@ -68,11 +70,13 @@ static unsigned int ip6gre_net_id __read_mostly; struct ip6gre_net { struct ip6_tnl __rcu *tunnels[4][IP6_GRE_HASH_SIZE]; + struct ip6_tnl __rcu *collect_md_tun; struct net_device *fb_tunnel_dev; }; static struct rtnl_link_ops ip6gre_link_ops __read_mostly; static struct rtnl_link_ops ip6gre_tap_ops __read_mostly; +static struct rtnl_link_ops ip6erspan_tap_ops __read_mostly; static int ip6gre_tunnel_init(struct net_device *dev); static void ip6gre_tunnel_setup(struct net_device *dev); static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t); @@ -121,7 +125,8 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, unsigned int h1 = HASH_KEY(key); struct ip6_tnl *t, *cand = NULL; struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); - int dev_type = (gre_proto == htons(ETH_P_TEB)) ? + int dev_type = (gre_proto == htons(ETH_P_TEB) || + gre_proto == htons(ETH_P_ERSPAN)) ? ARPHRD_ETHER : ARPHRD_IP6GRE; int score, cand_score = 4; @@ -226,6 +231,10 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, if (cand) return cand; + t = rcu_dereference(ign->collect_md_tun); + if (t && t->dev->flags & IFF_UP) + return t; + dev = ign->fb_tunnel_dev; if (dev->flags & IFF_UP) return netdev_priv(dev); @@ -261,6 +270,9 @@ static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp = ip6gre_bucket(ign, t); + if (t->parms.collect_md) + rcu_assign_pointer(ign->collect_md_tun, t); + rcu_assign_pointer(t->next, rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); } @@ -270,6 +282,9 @@ static void ip6gre_tunnel_unlink(struct ip6gre_net *ign, struct ip6_tnl *t) struct ip6_tnl __rcu **tp; struct ip6_tnl *iter; + if (t->parms.collect_md) + rcu_assign_pointer(ign->collect_md_tun, NULL); + for (tp = ip6gre_bucket(ign, t); (iter = rtnl_dereference(*tp)) != NULL; tp = &iter->next) { @@ -337,11 +352,12 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net, nt->dev = dev; nt->net = dev_net(dev); - ip6gre_tnl_link_config(nt, 1); if (register_netdevice(dev) < 0) goto failed_free; + ip6gre_tnl_link_config(nt, 1); + /* Can use a lockless transmit, unless we generate output sequences */ if (!(nt->parms.o_flags & TUNNEL_SEQ)) dev->features |= NETIF_F_LLTX; @@ -460,7 +476,101 @@ static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) &ipv6h->saddr, &ipv6h->daddr, tpi->key, tpi->proto); if (tunnel) { - ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error); + if (tunnel->parms.collect_md) { + struct metadata_dst *tun_dst; + __be64 tun_id; + __be16 flags; + + flags = tpi->flags; + tun_id = key32_to_tunnel_id(tpi->key); + + tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id, 0); + if (!tun_dst) + return PACKET_REJECT; + + ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); + } else { + ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error); + } + + return PACKET_RCVD; + } + + return PACKET_REJECT; +} + +static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len, + struct tnl_ptk_info *tpi) +{ + struct erspan_base_hdr *ershdr; + struct erspan_metadata *pkt_md; + const struct ipv6hdr *ipv6h; + struct ip6_tnl *tunnel; + u8 ver; + + if (unlikely(!pskb_may_pull(skb, sizeof(*ershdr)))) + return PACKET_REJECT; + + ipv6h = ipv6_hdr(skb); + ershdr = (struct erspan_base_hdr *)skb->data; + ver = ershdr->ver; + tpi->key = cpu_to_be32(get_session_id(ershdr)); + + tunnel = ip6gre_tunnel_lookup(skb->dev, + &ipv6h->saddr, &ipv6h->daddr, tpi->key, + tpi->proto); + if (tunnel) { + int len = erspan_hdr_len(ver); + + if (unlikely(!pskb_may_pull(skb, len))) + return PACKET_REJECT; + + ershdr = (struct erspan_base_hdr *)skb->data; + pkt_md = (struct erspan_metadata *)(ershdr + 1); + + if (__iptunnel_pull_header(skb, len, + htons(ETH_P_TEB), + false, false) < 0) + return PACKET_REJECT; + + if (tunnel->parms.collect_md) { + struct metadata_dst *tun_dst; + struct ip_tunnel_info *info; + struct erspan_metadata *md; + __be64 tun_id; + __be16 flags; + + tpi->flags |= TUNNEL_KEY; + flags = tpi->flags; + tun_id = key32_to_tunnel_id(tpi->key); + + tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id, + sizeof(*md)); + if (!tun_dst) + return PACKET_REJECT; + + info = &tun_dst->u.tun_info; + md = ip_tunnel_info_opts(info); + + memcpy(md, pkt_md, sizeof(*md)); + md->version = ver; + info->key.tun_flags |= TUNNEL_ERSPAN_OPT; + info->options_len = sizeof(*md); + + ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); + + } else { + tunnel->parms.erspan_ver = ver; + + if (ver == 1) { + tunnel->parms.index = ntohl(pkt_md->u.index); + } else { + tunnel->parms.dir = pkt_md->u.md2.dir; + tunnel->parms.hwid = get_hwid(&pkt_md->u.md2); + } + + ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error); + } return PACKET_RCVD; } @@ -481,9 +591,17 @@ static int gre_rcv(struct sk_buff *skb) if (iptunnel_pull_header(skb, hdr_len, tpi.proto, false)) goto drop; + if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) || + tpi.proto == htons(ETH_P_ERSPAN2))) { + if (ip6erspan_rcv(skb, hdr_len, &tpi) == PACKET_RCVD) + return 0; + goto out; + } + if (ip6gre_rcv(skb, &tpi) == PACKET_RCVD) return 0; +out: icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); drop: kfree_skb(skb); @@ -496,6 +614,78 @@ static int gre_handle_offloads(struct sk_buff *skb, bool csum) csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } +static void prepare_ip6gre_xmit_ipv4(struct sk_buff *skb, + struct net_device *dev, + struct flowi6 *fl6, __u8 *dsfield, + int *encap_limit) +{ + const struct iphdr *iph = ip_hdr(skb); + struct ip6_tnl *t = netdev_priv(dev); + + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + *encap_limit = t->parms.encap_limit; + + memcpy(fl6, &t->fl.u.ip6, sizeof(*fl6)); + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + *dsfield = ipv4_get_dsfield(iph); + else + *dsfield = ip6_tclass(t->parms.flowinfo); + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) + fl6->flowi6_mark = skb->mark; + else + fl6->flowi6_mark = t->parms.fwmark; + + fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL); +} + +static int prepare_ip6gre_xmit_ipv6(struct sk_buff *skb, + struct net_device *dev, + struct flowi6 *fl6, __u8 *dsfield, + int *encap_limit) +{ + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + struct ip6_tnl *t = netdev_priv(dev); + __u16 offset; + + offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); + /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */ + + if (offset > 0) { + struct ipv6_tlv_tnl_enc_lim *tel; + + tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset]; + if (tel->encap_limit == 0) { + icmpv6_send(skb, ICMPV6_PARAMPROB, + ICMPV6_HDR_FIELD, offset + 2); + return -1; + } + *encap_limit = tel->encap_limit - 1; + } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) { + *encap_limit = t->parms.encap_limit; + } + + memcpy(fl6, &t->fl.u.ip6, sizeof(*fl6)); + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + *dsfield = ipv6_get_dsfield(ipv6h); + else + *dsfield = ip6_tclass(t->parms.flowinfo); + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) + fl6->flowlabel |= ip6_flowlabel(ipv6h); + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) + fl6->flowi6_mark = skb->mark; + else + fl6->flowi6_mark = t->parms.fwmark; + + fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL); + + return 0; +} + static netdev_tx_t __gre6_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, struct flowi6 *fl6, int encap_limit, @@ -517,8 +707,38 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, /* Push GRE header. */ protocol = (dev->type == ARPHRD_ETHER) ? htons(ETH_P_TEB) : proto; - gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, - protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno)); + + if (tunnel->parms.collect_md) { + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; + __be16 flags; + + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || + !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET6)) + return -EINVAL; + + key = &tun_info->key; + memset(fl6, 0, sizeof(*fl6)); + fl6->flowi6_proto = IPPROTO_GRE; + fl6->daddr = key->u.ipv6.dst; + fl6->flowlabel = key->label; + fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL); + + dsfield = key->tos; + flags = key->tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); + tunnel->tun_hlen = gre_calc_hlen(flags); + + gre_build_header(skb, tunnel->tun_hlen, + flags, protocol, + tunnel_id_to_key32(tun_info->key.tun_id), 0); + + } else { + gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, + protocol, tunnel->parms.o_key, + htonl(tunnel->o_seqno)); + } return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu, NEXTHDR_GRE); @@ -527,30 +747,17 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - const struct iphdr *iph = ip_hdr(skb); int encap_limit = -1; struct flowi6 fl6; - __u8 dsfield; + __u8 dsfield = 0; __u32 mtu; int err; memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - encap_limit = t->parms.encap_limit; - - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - dsfield = ipv4_get_dsfield(iph); - else - dsfield = ip6_tclass(t->parms.flowinfo); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) - fl6.flowi6_mark = skb->mark; - else - fl6.flowi6_mark = t->parms.fwmark; - - fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); + if (!t->parms.collect_md) + prepare_ip6gre_xmit_ipv4(skb, dev, &fl6, + &dsfield, &encap_limit); err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)); if (err) @@ -574,46 +781,17 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) struct ip6_tnl *t = netdev_priv(dev); struct ipv6hdr *ipv6h = ipv6_hdr(skb); int encap_limit = -1; - __u16 offset; struct flowi6 fl6; - __u8 dsfield; + __u8 dsfield = 0; __u32 mtu; int err; if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr)) return -1; - offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); - /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */ - ipv6h = ipv6_hdr(skb); - - if (offset > 0) { - struct ipv6_tlv_tnl_enc_lim *tel; - tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset]; - if (tel->encap_limit == 0) { - icmpv6_send(skb, ICMPV6_PARAMPROB, - ICMPV6_HDR_FIELD, offset + 2); - return -1; - } - encap_limit = tel->encap_limit - 1; - } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - encap_limit = t->parms.encap_limit; - - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - dsfield = ipv6_get_dsfield(ipv6h); - else - dsfield = ip6_tclass(t->parms.flowinfo); - - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) - fl6.flowlabel |= ip6_flowlabel(ipv6h); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) - fl6.flowi6_mark = skb->mark; - else - fl6.flowi6_mark = t->parms.fwmark; - - fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); + if (!t->parms.collect_md && + prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, &dsfield, &encap_limit)) + return -1; if (gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM))) return -1; @@ -660,7 +838,8 @@ static int ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev) if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); + if (!t->parms.collect_md) + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)); if (err) @@ -705,6 +884,137 @@ tx_err: return NETDEV_TX_OK; } +static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + struct ip6_tnl *t = netdev_priv(dev); + struct dst_entry *dst = skb_dst(skb); + struct net_device_stats *stats; + bool truncate = false; + int encap_limit = -1; + __u8 dsfield = false; + struct flowi6 fl6; + int err = -EINVAL; + __u32 mtu; + + if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr)) + goto tx_err; + + if (gre_handle_offloads(skb, false)) + goto tx_err; + + if (skb->len > dev->mtu + dev->hard_header_len) { + pskb_trim(skb, dev->mtu + dev->hard_header_len); + truncate = true; + } + + t->parms.o_flags &= ~TUNNEL_KEY; + IPCB(skb)->flags = 0; + + /* For collect_md mode, derive fl6 from the tunnel key, + * for native mode, call prepare_ip6gre_xmit_{ipv4,ipv6}. + */ + if (t->parms.collect_md) { + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; + struct erspan_metadata *md; + __be32 tun_id; + + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || + !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET6)) + return -EINVAL; + + key = &tun_info->key; + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_GRE; + fl6.daddr = key->u.ipv6.dst; + fl6.flowlabel = key->label; + fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); + + dsfield = key->tos; + md = ip_tunnel_info_opts(tun_info); + if (!md) + goto tx_err; + + tun_id = tunnel_id_to_key32(key->tun_id); + if (md->version == 1) { + erspan_build_header(skb, + ntohl(tun_id), + ntohl(md->u.index), truncate, + false); + } else if (md->version == 2) { + erspan_build_header_v2(skb, + ntohl(tun_id), + md->u.md2.dir, + get_hwid(&md->u.md2), + truncate, false); + } + } else { + switch (skb->protocol) { + case htons(ETH_P_IP): + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + prepare_ip6gre_xmit_ipv4(skb, dev, &fl6, + &dsfield, &encap_limit); + break; + case htons(ETH_P_IPV6): + if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr)) + goto tx_err; + if (prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, + &dsfield, &encap_limit)) + goto tx_err; + break; + default: + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); + break; + } + + if (t->parms.erspan_ver == 1) + erspan_build_header(skb, ntohl(t->parms.o_key), + t->parms.index, + truncate, false); + else + erspan_build_header_v2(skb, ntohl(t->parms.o_key), + t->parms.dir, + t->parms.hwid, + truncate, false); + fl6.daddr = t->parms.raddr; + } + + /* Push GRE header. */ + gre_build_header(skb, 8, TUNNEL_SEQ, + htons(ETH_P_ERSPAN), 0, htonl(t->o_seqno++)); + + /* TooBig packet may have updated dst->dev's mtu */ + if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu) + dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu); + + err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, + NEXTHDR_GRE); + if (err != 0) { + /* XXX: send ICMP error even if DF is not set. */ + if (err == -EMSGSIZE) { + if (skb->protocol == htons(ETH_P_IP)) + icmp_send(skb, ICMP_DEST_UNREACH, + ICMP_FRAG_NEEDED, htonl(mtu)); + else + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + } + + goto tx_err; + } + return NETDEV_TX_OK; + +tx_err: + stats = &t->dev->stats; + stats->tx_errors++; + stats->tx_dropped++; + kfree_skb(skb); + return NETDEV_TX_OK; +} + static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu) { struct net_device *dev = t->dev; @@ -1014,6 +1324,36 @@ static void ip6gre_tunnel_setup(struct net_device *dev) eth_random_addr(dev->perm_addr); } +#define GRE6_FEATURES (NETIF_F_SG | \ + NETIF_F_FRAGLIST | \ + NETIF_F_HIGHDMA | \ + NETIF_F_HW_CSUM) + +static void ip6gre_tnl_init_features(struct net_device *dev) +{ + struct ip6_tnl *nt = netdev_priv(dev); + + dev->features |= GRE6_FEATURES; + dev->hw_features |= GRE6_FEATURES; + + if (!(nt->parms.o_flags & TUNNEL_SEQ)) { + /* TCP offload with GRE SEQ is not supported, nor + * can we support 2 levels of outer headers requiring + * an update. + */ + if (!(nt->parms.o_flags & TUNNEL_CSUM) || + nt->encap.type == TUNNEL_ENCAP_NONE) { + dev->features |= NETIF_F_GSO_SOFTWARE; + dev->hw_features |= NETIF_F_GSO_SOFTWARE; + } + + /* Can use a lockless transmit, unless we generate + * output sequences + */ + dev->features |= NETIF_F_LLTX; + } +} + static int ip6gre_tunnel_init_common(struct net_device *dev) { struct ip6_tnl *tunnel; @@ -1048,6 +1388,12 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; + if (tunnel->parms.collect_md) { + dev->features |= NETIF_F_NETNS_LOCAL; + netif_keep_dst(dev); + } + ip6gre_tnl_init_features(dev); + return 0; } @@ -1062,6 +1408,9 @@ static int ip6gre_tunnel_init(struct net_device *dev) tunnel = netdev_priv(dev); + if (tunnel->parms.collect_md) + return 0; + memcpy(dev->dev_addr, &tunnel->parms.laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &tunnel->parms.raddr, sizeof(struct in6_addr)); @@ -1084,7 +1433,6 @@ static void ip6gre_fb_tunnel_init(struct net_device *dev) dev_hold(dev); } - static struct inet6_protocol ip6gre_protocol __read_mostly = { .handler = gre_rcv, .err_handler = ip6gre_err, @@ -1099,7 +1447,8 @@ static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head) for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &ip6gre_link_ops || - dev->rtnl_link_ops == &ip6gre_tap_ops) + dev->rtnl_link_ops == &ip6gre_tap_ops || + dev->rtnl_link_ops == &ip6erspan_tap_ops) unregister_netdevice_queue(dev, head); for (prio = 0; prio < 4; prio++) { @@ -1221,6 +1570,70 @@ out: return ip6gre_tunnel_validate(tb, data, extack); } +static int ip6erspan_tap_validate(struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + __be16 flags = 0; + int ret, ver = 0; + + if (!data) + return 0; + + ret = ip6gre_tap_validate(tb, data, extack); + if (ret) + return ret; + + /* ERSPAN should only have GRE sequence and key flag */ + if (data[IFLA_GRE_OFLAGS]) + flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); + if (data[IFLA_GRE_IFLAGS]) + flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); + if (!data[IFLA_GRE_COLLECT_METADATA] && + flags != (GRE_SEQ | GRE_KEY)) + return -EINVAL; + + /* ERSPAN Session ID only has 10-bit. Since we reuse + * 32-bit key field as ID, check it's range. + */ + if (data[IFLA_GRE_IKEY] && + (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK)) + return -EINVAL; + + if (data[IFLA_GRE_OKEY] && + (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK)) + return -EINVAL; + + if (data[IFLA_GRE_ERSPAN_VER]) { + ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); + if (ver != 1 && ver != 2) + return -EINVAL; + } + + if (ver == 1) { + if (data[IFLA_GRE_ERSPAN_INDEX]) { + u32 index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); + + if (index & ~INDEX_MASK) + return -EINVAL; + } + } else if (ver == 2) { + if (data[IFLA_GRE_ERSPAN_DIR]) { + u16 dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]); + + if (dir & ~(DIR_MASK >> DIR_OFFSET)) + return -EINVAL; + } + + if (data[IFLA_GRE_ERSPAN_HWID]) { + u16 hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]); + + if (hwid & ~(HWID_MASK >> HWID_OFFSET)) + return -EINVAL; + } + } + + return 0; +} static void ip6gre_netlink_parms(struct nlattr *data[], struct __ip6_tnl_parm *parms) @@ -1267,11 +1680,26 @@ static void ip6gre_netlink_parms(struct nlattr *data[], if (data[IFLA_GRE_FWMARK]) parms->fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]); + + if (data[IFLA_GRE_COLLECT_METADATA]) + parms->collect_md = true; + + if (data[IFLA_GRE_ERSPAN_VER]) + parms->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); + + if (parms->erspan_ver == 1) { + if (data[IFLA_GRE_ERSPAN_INDEX]) + parms->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); + } else if (parms->erspan_ver == 2) { + if (data[IFLA_GRE_ERSPAN_DIR]) + parms->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]); + if (data[IFLA_GRE_ERSPAN_HWID]) + parms->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]); + } } static int ip6gre_tap_init(struct net_device *dev) { - struct ip6_tnl *tunnel; int ret; ret = ip6gre_tunnel_init_common(dev); @@ -1280,10 +1708,6 @@ static int ip6gre_tap_init(struct net_device *dev) dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; - tunnel = netdev_priv(dev); - - ip6gre_tnl_link_config(tunnel, 1); - return 0; } @@ -1298,16 +1722,65 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = { .ndo_get_iflink = ip6_tnl_get_iflink, }; -#define GRE6_FEATURES (NETIF_F_SG | \ - NETIF_F_FRAGLIST | \ - NETIF_F_HIGHDMA | \ - NETIF_F_HW_CSUM) +static int ip6erspan_tap_init(struct net_device *dev) +{ + struct ip6_tnl *tunnel; + int t_hlen; + int ret; + + tunnel = netdev_priv(dev); + + tunnel->dev = dev; + tunnel->net = dev_net(dev); + strcpy(tunnel->parms.name, dev->name); + + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + + ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); + if (ret) { + free_percpu(dev->tstats); + dev->tstats = NULL; + return ret; + } + + tunnel->tun_hlen = 8; + tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + + erspan_hdr_len(tunnel->parms.erspan_ver); + t_hlen = tunnel->hlen + sizeof(struct ipv6hdr); + + dev->hard_header_len = LL_MAX_HEADER + t_hlen; + dev->mtu = ETH_DATA_LEN - t_hlen; + if (dev->type == ARPHRD_ETHER) + dev->mtu -= ETH_HLEN; + if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + dev->mtu -= 8; + + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + tunnel = netdev_priv(dev); + ip6gre_tnl_link_config(tunnel, 1); + + return 0; +} + +static const struct net_device_ops ip6erspan_netdev_ops = { + .ndo_init = ip6erspan_tap_init, + .ndo_uninit = ip6gre_tunnel_uninit, + .ndo_start_xmit = ip6erspan_tunnel_xmit, + .ndo_set_mac_address = eth_mac_addr, + .ndo_validate_addr = eth_validate_addr, + .ndo_change_mtu = ip6_tnl_change_mtu, + .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_iflink = ip6_tnl_get_iflink, +}; static void ip6gre_tap_setup(struct net_device *dev) { ether_setup(dev); + dev->max_mtu = 0; dev->netdev_ops = &ip6gre_tap_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; @@ -1372,40 +1845,29 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev, ip6gre_netlink_parms(data, &nt->parms); - if (ip6gre_tunnel_find(net, &nt->parms, dev->type)) - return -EEXIST; + if (nt->parms.collect_md) { + if (rtnl_dereference(ign->collect_md_tun)) + return -EEXIST; + } else { + if (ip6gre_tunnel_find(net, &nt->parms, dev->type)) + return -EEXIST; + } if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) eth_hw_addr_random(dev); nt->dev = dev; nt->net = dev_net(dev); - ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]); - - dev->features |= GRE6_FEATURES; - dev->hw_features |= GRE6_FEATURES; - - if (!(nt->parms.o_flags & TUNNEL_SEQ)) { - /* TCP offload with GRE SEQ is not supported, nor - * can we support 2 levels of outer headers requiring - * an update. - */ - if (!(nt->parms.o_flags & TUNNEL_CSUM) || - (nt->encap.type == TUNNEL_ENCAP_NONE)) { - dev->features |= NETIF_F_GSO_SOFTWARE; - dev->hw_features |= NETIF_F_GSO_SOFTWARE; - } - - /* Can use a lockless transmit, unless we generate - * output sequences - */ - dev->features |= NETIF_F_LLTX; - } err = register_netdevice(dev); if (err) goto out; + ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]); + + if (tb[IFLA_MTU]) + ip6_tnl_change_mtu(dev, nla_get_u32(tb[IFLA_MTU])); + dev_hold(dev); ip6gre_tunnel_link(ign, nt); @@ -1492,8 +1954,12 @@ static size_t ip6gre_get_size(const struct net_device *dev) nla_total_size(2) + /* IFLA_GRE_ENCAP_DPORT */ nla_total_size(2) + + /* IFLA_GRE_COLLECT_METADATA */ + nla_total_size(0) + /* IFLA_GRE_FWMARK */ nla_total_size(4) + + /* IFLA_GRE_ERSPAN_INDEX */ + nla_total_size(4) + 0; } @@ -1515,7 +1981,8 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_GRE_ENCAP_LIMIT, p->encap_limit) || nla_put_be32(skb, IFLA_GRE_FLOWINFO, p->flowinfo) || nla_put_u32(skb, IFLA_GRE_FLAGS, p->flags) || - nla_put_u32(skb, IFLA_GRE_FWMARK, p->fwmark)) + nla_put_u32(skb, IFLA_GRE_FWMARK, p->fwmark) || + nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, p->index)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE, @@ -1528,6 +1995,24 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) t->encap.flags)) goto nla_put_failure; + if (p->collect_md) { + if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA)) + goto nla_put_failure; + } + + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, p->erspan_ver)) + goto nla_put_failure; + + if (p->erspan_ver == 1) { + if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, p->index)) + goto nla_put_failure; + } else if (p->erspan_ver == 2) { + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, p->dir)) + goto nla_put_failure; + if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, p->hwid)) + goto nla_put_failure; + } + return 0; nla_put_failure: @@ -1550,9 +2035,28 @@ static const struct nla_policy ip6gre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, + [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_GRE_FWMARK] = { .type = NLA_U32 }, + [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 }, + [IFLA_GRE_ERSPAN_VER] = { .type = NLA_U8 }, + [IFLA_GRE_ERSPAN_DIR] = { .type = NLA_U8 }, + [IFLA_GRE_ERSPAN_HWID] = { .type = NLA_U16 }, }; +static void ip6erspan_tap_setup(struct net_device *dev) +{ + ether_setup(dev); + + dev->netdev_ops = &ip6erspan_netdev_ops; + dev->needs_free_netdev = true; + dev->priv_destructor = ip6gre_dev_free; + + dev->features |= NETIF_F_NETNS_LOCAL; + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); +} + static struct rtnl_link_ops ip6gre_link_ops __read_mostly = { .kind = "ip6gre", .maxtype = IFLA_GRE_MAX, @@ -1582,6 +2086,20 @@ static struct rtnl_link_ops ip6gre_tap_ops __read_mostly = { .get_link_net = ip6_tnl_get_link_net, }; +static struct rtnl_link_ops ip6erspan_tap_ops __read_mostly = { + .kind = "ip6erspan", + .maxtype = IFLA_GRE_MAX, + .policy = ip6gre_policy, + .priv_size = sizeof(struct ip6_tnl), + .setup = ip6erspan_tap_setup, + .validate = ip6erspan_tap_validate, + .newlink = ip6gre_newlink, + .changelink = ip6gre_changelink, + .get_size = ip6gre_get_size, + .fill_info = ip6gre_fill_info, + .get_link_net = ip6_tnl_get_link_net, +}; + /* * And now the modules code and kernel interface. */ @@ -1610,9 +2128,15 @@ static int __init ip6gre_init(void) if (err < 0) goto tap_ops_failed; + err = rtnl_link_register(&ip6erspan_tap_ops); + if (err < 0) + goto erspan_link_failed; + out: return err; +erspan_link_failed: + rtnl_link_unregister(&ip6gre_tap_ops); tap_ops_failed: rtnl_link_unregister(&ip6gre_link_ops); rtnl_link_failed: @@ -1626,6 +2150,7 @@ static void __exit ip6gre_fini(void) { rtnl_link_unregister(&ip6gre_tap_ops); rtnl_link_unregister(&ip6gre_link_ops); + rtnl_link_unregister(&ip6erspan_tap_ops); inet6_del_protocol(&ip6gre_protocol, IPPROTO_GRE); unregister_pernet_device(&ip6gre_net_ops); } @@ -1637,4 +2162,5 @@ MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)"); MODULE_DESCRIPTION("GRE over IPv6 tunneling device"); MODULE_ALIAS_RTNL_LINK("ip6gre"); MODULE_ALIAS_RTNL_LINK("ip6gretap"); +MODULE_ALIAS_RTNL_LINK("ip6erspan"); MODULE_ALIAS_NETDEV("ip6gre0"); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 5110a418cc4d..997c7f19ad62 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -138,6 +138,14 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s return ret; } +#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) + /* Policy lookup after SNAT yielded a new policy */ + if (skb_dst(skb)->xfrm) { + IPCB(skb)->flags |= IPSKB_REROUTED; + return dst_output(net, sk, skb); + } +#endif + if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || dst_allfrag(skb_dst(skb)) || (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) @@ -166,6 +174,14 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } +bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np) +{ + if (!np->autoflowlabel_set) + return ip6_default_np_autolabel(net); + else + return np->autoflowlabel; +} + /* * xmit an sk_buff (used by TCP, SCTP and DCCP) * Note : socket lock is not held for SYNACK packets, but might be modified @@ -230,7 +246,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, hlimit = ip6_dst_hoplimit(dst); ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, - np->autoflowlabel, fl6)); + ip6_autoflowlabel(net, np), fl6)); hdr->payload_len = htons(seg_len); hdr->nexthdr = proto; @@ -362,7 +378,7 @@ static inline int ip6_forward_finish(struct net *net, struct sock *sk, return dst_output(net, sk, skb); } -static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) +unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) { unsigned int mtu; struct inet6_dev *idev; @@ -382,6 +398,7 @@ static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) return mtu; } +EXPORT_SYMBOL_GPL(ip6_dst_mtu_forward); static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) { @@ -1198,16 +1215,18 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, v6_cork->tclass = ipc6->tclass; if (rt->dst.flags & DST_XFRM_TUNNEL) mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? - rt->dst.dev->mtu : dst_mtu(&rt->dst); + READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst); else mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? - rt->dst.dev->mtu : dst_mtu(rt->dst.path); + READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst)); if (np->frag_size < mtu) { if (np->frag_size) mtu = np->frag_size; } + if (mtu < IPV6_MIN_MTU) + return -EINVAL; cork->base.fragsize = mtu; - if (dst_allfrag(rt->dst.path)) + if (dst_allfrag(xfrm_dst_path(&rt->dst))) cork->base.flags |= IPCORK_ALLFRAG; cork->base.length = 0; @@ -1626,7 +1645,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, ip6_flow_hdr(hdr, v6_cork->tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, - np->autoflowlabel, fl6)); + ip6_autoflowlabel(net, np), fl6)); hdr->hop_limit = v6_cork->hop_limit; hdr->nexthdr = proto; hdr->saddr = fl6->saddr; @@ -1725,11 +1744,13 @@ struct sk_buff *ip6_make_skb(struct sock *sk, cork.base.flags = 0; cork.base.addr = 0; cork.base.opt = NULL; + cork.base.dst = NULL; v6_cork.opt = NULL; err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6); - if (err) + if (err) { + ip6_cork_release(&cork, &v6_cork); return ERR_PTR(err); - + } if (ipc6->dontfrag < 0) ipc6->dontfrag = inet6_sk(sk)->dontfrag; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 3d3092adf1d2..4b15fe928278 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -642,8 +642,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (rel_info > dst_mtu(skb_dst(skb2))) goto out; - skb_dst(skb2)->ops->update_pmtu(skb_dst(skb2), NULL, skb2, - rel_info); + skb_dst_update_pmtu(skb2, rel_info); } icmp_send(skb2, rel_type, rel_code, htonl(rel_info)); @@ -861,7 +860,7 @@ int ip6_tnl_rcv(struct ip6_tnl *t, struct sk_buff *skb, struct metadata_dst *tun_dst, bool log_ecn_err) { - return __ip6_tnl_rcv(t, skb, tpi, NULL, ip6ip6_dscp_ecn_decapsulate, + return __ip6_tnl_rcv(t, skb, tpi, tun_dst, ip6ip6_dscp_ecn_decapsulate, log_ecn_err); } EXPORT_SYMBOL(ip6_tnl_rcv); @@ -904,7 +903,7 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, if (t->parms.collect_md) { tun_dst = ipv6_tun_rx_dst(skb, 0, 0, 0); if (!tun_dst) - return 0; + goto drop; } ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate, log_ecn_error); @@ -979,6 +978,9 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t, int ret = 0; struct net *net = t->net; + if (t->parms.collect_md) + return 1; + if ((p->flags & IP6_TNL_F_CAP_XMIT) || ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_XMIT))) { @@ -1074,10 +1076,11 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); neigh_release(neigh); } - } else if (!(t->parms.flags & - (IP6_TNL_F_USE_ORIG_TCLASS | IP6_TNL_F_USE_ORIG_FWMARK))) { - /* enable the cache only only if the routing decision does - * not depend on the current inner header value + } else if (t->parms.proto != 0 && !(t->parms.flags & + (IP6_TNL_F_USE_ORIG_TCLASS | + IP6_TNL_F_USE_ORIG_FWMARK))) { + /* enable the cache only if neither the outer protocol nor the + * routing decision depends on the current inner header value */ use_cache = true; } @@ -1123,10 +1126,14 @@ route_lookup: max_headroom += 8; mtu -= 8; } - if (mtu < IPV6_MIN_MTU) - mtu = IPV6_MIN_MTU; - if (skb_dst(skb) && !t->parms.collect_md) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + if (skb->protocol == htons(ETH_P_IPV6)) { + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + } else if (mtu < 576) { + mtu = 576; + } + + skb_dst_update_pmtu(skb, mtu); if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { *pmtu = mtu; err = -EMSGSIZE; @@ -1671,11 +1678,11 @@ int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) { struct ip6_tnl *tnl = netdev_priv(dev); - if (tnl->parms.proto == IPPROTO_IPIP) { - if (new_mtu < ETH_MIN_MTU) + if (tnl->parms.proto == IPPROTO_IPV6) { + if (new_mtu < IPV6_MIN_MTU) return -EINVAL; } else { - if (new_mtu < IPV6_MIN_MTU) + if (new_mtu < ETH_MIN_MTU) return -EINVAL; } if (new_mtu > 0xFFF8 - dev->hard_header_len) diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index dbb74f3c57a7..fa3ae1cb50d3 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -483,7 +483,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) mtu = dst_mtu(dst); if (!skb->ignore_df && skb->len > mtu) { - skb_dst(skb)->ops->update_pmtu(dst, NULL, skb, mtu); + skb_dst_update_pmtu(skb, mtu); if (skb->protocol == htons(ETH_P_IPV6)) { if (mtu < IPV6_MIN_MTU) @@ -626,6 +626,7 @@ static void vti6_link_config(struct ip6_tnl *t) { struct net_device *dev = t->dev; struct __ip6_tnl_parm *p = &t->parms; + struct net_device *tdev = NULL; memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); @@ -638,6 +639,25 @@ static void vti6_link_config(struct ip6_tnl *t) dev->flags |= IFF_POINTOPOINT; else dev->flags &= ~IFF_POINTOPOINT; + + if (p->flags & IP6_TNL_F_CAP_XMIT) { + int strict = (ipv6_addr_type(&p->raddr) & + (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)); + struct rt6_info *rt = rt6_lookup(t->net, + &p->raddr, &p->laddr, + p->link, strict); + + if (rt) + tdev = rt->dst.dev; + ip6_rt_put(rt); + } + + if (!tdev && p->link) + tdev = __dev_get_by_index(t->net, p->link); + + if (tdev) + dev->mtu = max_t(int, tdev->mtu - dev->hard_header_len, + IPV6_MIN_MTU); } /** diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index a2e1a864eb46..9f6cace9c817 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -477,7 +477,6 @@ static int ip6mr_vif_open(struct inode *inode, struct file *file) } static const struct file_operations ip6mr_vif_fops = { - .owner = THIS_MODULE, .open = ip6mr_vif_open, .read = seq_read, .llseek = seq_lseek, @@ -495,6 +494,7 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) return ERR_PTR(-ENOENT); it->mrt = mrt; + it->cache = NULL; return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1) : SEQ_START_TOKEN; } @@ -609,7 +609,6 @@ static int ipmr_mfc_open(struct inode *inode, struct file *file) } static const struct file_operations ip6mr_mfc_fops = { - .owner = THIS_MODULE, .open = ipmr_mfc_open, .read = seq_read, .llseek = seq_lseek, @@ -1425,10 +1424,13 @@ int __init ip6_mr_init(void) goto add_proto_fail; } #endif - rtnl_register(RTNL_FAMILY_IP6MR, RTM_GETROUTE, NULL, - ip6mr_rtm_dumproute, 0); - return 0; + err = rtnl_register_module(THIS_MODULE, RTNL_FAMILY_IP6MR, RTM_GETROUTE, + NULL, ip6mr_rtm_dumproute, 0); + if (err == 0) + return 0; + #ifdef CONFIG_IPV6_PIMSM_V2 + inet6_del_protocol(&pim6_protocol, IPPROTO_PIM); add_proto_fail: unregister_netdevice_notifier(&ip6_mr_notifier); #endif diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index b9404feabd78..d78d41fc4b1a 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -886,6 +886,7 @@ pref_skip_coa: break; case IPV6_AUTOFLOWLABEL: np->autoflowlabel = valbool; + np->autoflowlabel_set = 1; retv = 0; break; case IPV6_RECVFRAGSIZE: @@ -922,12 +923,8 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IPV6_IPSEC_POLICY && - optname != IPV6_XFRM_POLICY) { - lock_sock(sk); - err = nf_setsockopt(sk, PF_INET6, optname, optval, - optlen); - release_sock(sk); - } + optname != IPV6_XFRM_POLICY) + err = nf_setsockopt(sk, PF_INET6, optname, optval, optlen); #endif return err; } @@ -957,12 +954,9 @@ int compat_ipv6_setsockopt(struct sock *sk, int level, int optname, #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IPV6_IPSEC_POLICY && - optname != IPV6_XFRM_POLICY) { - lock_sock(sk); - err = compat_nf_setsockopt(sk, PF_INET6, optname, - optval, optlen); - release_sock(sk); - } + optname != IPV6_XFRM_POLICY) + err = compat_nf_setsockopt(sk, PF_INET6, optname, optval, + optlen); #endif return err; } @@ -1335,7 +1329,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, break; case IPV6_AUTOFLOWLABEL: - val = np->autoflowlabel; + val = ip6_autoflowlabel(sock_net(sk), np); break; case IPV6_RECVFRAGSIZE: diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index fc6d7d143f2c..6a5d0e39bb87 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1655,8 +1655,6 @@ static void mld_sendpack(struct sk_buff *skb) if (err) goto err_out; - payload_len = skb->len; - err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, net->ipv6.igmp_sk, skb, NULL, skb->dev, dst_output); @@ -1682,16 +1680,16 @@ static int grec_size(struct ifmcaddr6 *pmc, int type, int gdel, int sdel) } static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc, - int type, struct mld2_grec **ppgr) + int type, struct mld2_grec **ppgr, unsigned int mtu) { - struct net_device *dev = pmc->idev->dev; struct mld2_report *pmr; struct mld2_grec *pgr; - if (!skb) - skb = mld_newpack(pmc->idev, dev->mtu); - if (!skb) - return NULL; + if (!skb) { + skb = mld_newpack(pmc->idev, mtu); + if (!skb) + return NULL; + } pgr = skb_put(skb, sizeof(struct mld2_grec)); pgr->grec_type = type; pgr->grec_auxwords = 0; @@ -1714,10 +1712,15 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, struct mld2_grec *pgr = NULL; struct ip6_sf_list *psf, *psf_next, *psf_prev, **psf_list; int scount, stotal, first, isquery, truncate; + unsigned int mtu; if (pmc->mca_flags & MAF_NOREPORT) return skb; + mtu = READ_ONCE(dev->mtu); + if (mtu < IPV6_MIN_MTU) + return skb; + isquery = type == MLD2_MODE_IS_INCLUDE || type == MLD2_MODE_IS_EXCLUDE; truncate = type == MLD2_MODE_IS_EXCLUDE || @@ -1738,7 +1741,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) { if (skb) mld_sendpack(skb); - skb = mld_newpack(idev, dev->mtu); + skb = mld_newpack(idev, mtu); } } first = 1; @@ -1774,12 +1777,12 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, pgr->grec_nsrcs = htons(scount); if (skb) mld_sendpack(skb); - skb = mld_newpack(idev, dev->mtu); + skb = mld_newpack(idev, mtu); first = 1; scount = 0; } if (first) { - skb = add_grhead(skb, pmc, type, &pgr); + skb = add_grhead(skb, pmc, type, &pgr, mtu); first = 0; } if (!skb) @@ -1814,7 +1817,7 @@ empty_source: mld_sendpack(skb); skb = NULL; /* add_grhead will get a new one */ } - skb = add_grhead(skb, pmc, type, &pgr); + skb = add_grhead(skb, pmc, type, &pgr, mtu); } } if (pgr) @@ -2753,7 +2756,6 @@ static int igmp6_mc_seq_open(struct inode *inode, struct file *file) } static const struct file_operations igmp6_mc_seq_fops = { - .owner = THIS_MODULE, .open = igmp6_mc_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -2908,7 +2910,6 @@ static int igmp6_mcf_seq_open(struct inode *inode, struct file *file) } static const struct file_operations igmp6_mcf_seq_fops = { - .owner = THIS_MODULE, .open = igmp6_mcf_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index b3cea200c85e..f61a5b613b52 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -566,6 +566,11 @@ static void ndisc_send_unsol_na(struct net_device *dev) read_lock_bh(&idev->lock); list_for_each_entry(ifa, &idev->addr_list, if_list) { + /* skip tentative addresses until dad completes */ + if (ifa->flags & IFA_F_TENTATIVE && + !(ifa->flags & IFA_F_OPTIMISTIC)) + continue; + ndisc_send_na(dev, &in6addr_linklocal_allnodes, &ifa->addr, /*router=*/ !!idev->cnf.forwarding, /*solicited=*/ false, /*override=*/ true, diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 39970e212ad5..d95ceca7ff8f 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -68,32 +68,7 @@ int ip6_route_me_harder(struct net *net, struct sk_buff *skb) } EXPORT_SYMBOL(ip6_route_me_harder); -/* - * Extra routing may needed on local out, as the QUEUE target never - * returns control to the table. - */ - -struct ip6_rt_info { - struct in6_addr daddr; - struct in6_addr saddr; - u_int32_t mark; -}; - -static void nf_ip6_saveroute(const struct sk_buff *skb, - struct nf_queue_entry *entry) -{ - struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry); - - if (entry->state.hook == NF_INET_LOCAL_OUT) { - const struct ipv6hdr *iph = ipv6_hdr(skb); - - rt_info->daddr = iph->daddr; - rt_info->saddr = iph->saddr; - rt_info->mark = skb->mark; - } -} - -static int nf_ip6_reroute(struct net *net, struct sk_buff *skb, +static int nf_ip6_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry) { struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry); @@ -103,7 +78,7 @@ static int nf_ip6_reroute(struct net *net, struct sk_buff *skb, if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) || !ipv6_addr_equal(&iph->saddr, &rt_info->saddr) || skb->mark != rt_info->mark) - return ip6_route_me_harder(net, skb); + return ip6_route_me_harder(entry->state.net, skb); } return 0; } @@ -190,25 +165,19 @@ static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook, }; static const struct nf_ipv6_ops ipv6ops = { - .chk_addr = ipv6_chk_addr, - .route_input = ip6_route_input, - .fragment = ip6_fragment -}; - -static const struct nf_afinfo nf_ip6_afinfo = { - .family = AF_INET6, + .chk_addr = ipv6_chk_addr, + .route_input = ip6_route_input, + .fragment = ip6_fragment, .checksum = nf_ip6_checksum, .checksum_partial = nf_ip6_checksum_partial, .route = nf_ip6_route, - .saveroute = nf_ip6_saveroute, .reroute = nf_ip6_reroute, - .route_key_size = sizeof(struct ip6_rt_info), }; int __init ipv6_netfilter_init(void) { RCU_INIT_POINTER(nf_ipv6_ops, &ipv6ops); - return nf_register_afinfo(&nf_ip6_afinfo); + return 0; } /* This can be called from inet6_init() on errors, so it cannot @@ -217,5 +186,4 @@ int __init ipv6_netfilter_init(void) void ipv6_netfilter_fini(void) { RCU_INIT_POINTER(nf_ipv6_ops, NULL); - nf_unregister_afinfo(&nf_ip6_afinfo); } diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 6acb2eecd986..4a634b7a2c80 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -71,6 +71,15 @@ config NFT_FIB_IPV6 endif # NF_TABLES_IPV6 endif # NF_TABLES +config NF_FLOW_TABLE_IPV6 + tristate "Netfilter flow table IPv6 module" + depends on NF_CONNTRACK && NF_TABLES + select NF_FLOW_TABLE + help + This option adds the flow table IPv6 support. + + To compile it as a module, choose M here. + config NF_DUP_IPV6 tristate "Netfilter IPv6 packet duplication to alternate destination" depends on !NF_CONNTRACK || NF_CONNTRACK @@ -232,6 +241,15 @@ config IP6_NF_MATCH_RT To compile it as a module, choose M here. If unsure, say N. +config IP6_NF_MATCH_SRH + tristate '"srh" Segment Routing header match support' + depends on NETFILTER_ADVANCED + help + srh matching allows you to match packets based on the segment + routing header of the packet. + + To compile it as a module, choose M here. If unsure, say N. + # The targets config IP6_NF_TARGET_HL tristate '"HL" hoplimit target support' diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index c6ee0cdd0ba9..d984057b8395 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -45,6 +45,9 @@ obj-$(CONFIG_NFT_REDIR_IPV6) += nft_redir_ipv6.o obj-$(CONFIG_NFT_DUP_IPV6) += nft_dup_ipv6.o obj-$(CONFIG_NFT_FIB_IPV6) += nft_fib_ipv6.o +# flow table support +obj-$(CONFIG_NF_FLOW_TABLE_IPV6) += nf_flow_table_ipv6.o + # matches obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o @@ -54,6 +57,7 @@ obj-$(CONFIG_IP6_NF_MATCH_MH) += ip6t_mh.o obj-$(CONFIG_IP6_NF_MATCH_OPTS) += ip6t_hbh.o obj-$(CONFIG_IP6_NF_MATCH_RPFILTER) += ip6t_rpfilter.o obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o +obj-$(CONFIG_IP6_NF_MATCH_SRH) += ip6t_srh.o # targets obj-$(CONFIG_IP6_NF_TARGET_MASQUERADE) += ip6t_MASQUERADE.o diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index f06e25065a34..af4c917e0836 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -282,12 +282,7 @@ ip6t_do_table(struct sk_buff *skb, local_bh_disable(); addend = xt_write_recseq_begin(); - private = table->private; - /* - * Ensure we load private-> members after we've fetched the base - * pointer. - */ - smp_read_barrier_depends(); + private = READ_ONCE(table->private); /* Address dependency. */ cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct ip6t_entry **)private->jumpstack[cpu]; @@ -458,7 +453,6 @@ mark_source_chains(const struct xt_table_info *newinfo, if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; - e = entry0 + newpos; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; @@ -992,9 +986,8 @@ static int get_info(struct net *net, void __user *user, if (compat) xt_compat_lock(AF_INET6); #endif - t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name), - "ip6table_%s", name); - if (t) { + t = xt_request_find_table_lock(net, AF_INET6, name); + if (!IS_ERR(t)) { struct ip6t_getinfo info; const struct xt_table_info *private = t->private; #ifdef CONFIG_COMPAT @@ -1024,7 +1017,7 @@ static int get_info(struct net *net, void __user *user, xt_table_unlock(t); module_put(t->me); } else - ret = -ENOENT; + ret = PTR_ERR(t); #ifdef CONFIG_COMPAT if (compat) xt_compat_unlock(AF_INET6); @@ -1050,7 +1043,7 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr, get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, AF_INET6, get.name); - if (t) { + if (!IS_ERR(t)) { struct xt_table_info *private = t->private; if (get.size == private->size) ret = copy_entries_to_user(private->size, @@ -1061,7 +1054,7 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr, module_put(t->me); xt_table_unlock(t); } else - ret = -ENOENT; + ret = PTR_ERR(t); return ret; } @@ -1084,10 +1077,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, goto out; } - t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name), - "ip6table_%s", name); - if (!t) { - ret = -ENOENT; + t = xt_request_find_table_lock(net, AF_INET6, name); + if (IS_ERR(t)) { + ret = PTR_ERR(t); goto free_newinfo_counters_untrans; } @@ -1200,8 +1192,8 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len, if (IS_ERR(paddc)) return PTR_ERR(paddc); t = xt_find_table_lock(net, AF_INET6, tmp.name); - if (!t) { - ret = -ENOENT; + if (IS_ERR(t)) { + ret = PTR_ERR(t); goto free; } @@ -1637,7 +1629,7 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, xt_compat_lock(AF_INET6); t = xt_find_table_lock(net, AF_INET6, get.name); - if (t) { + if (!IS_ERR(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; ret = compat_table_info(private, &info); @@ -1651,7 +1643,7 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, module_put(t->me); xt_table_unlock(t); } else - ret = -ENOENT; + ret = PTR_ERR(t); xt_compat_unlock(AF_INET6); return ret; @@ -1955,7 +1947,6 @@ static int __init ip6_tables_init(void) if (ret < 0) goto err5; - pr_info("(C) 2000-2006 Netfilter Core Team\n"); return 0; err5: diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c index 2b1a15846f9a..92c0047e7e33 100644 --- a/net/ipv6/netfilter/ip6t_MASQUERADE.c +++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c @@ -33,13 +33,19 @@ static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par) if (range->flags & NF_NAT_RANGE_MAP_IPS) return -EINVAL; - return 0; + return nf_ct_netns_get(par->net, par->family); +} + +static void masquerade_tg6_destroy(const struct xt_tgdtor_param *par) +{ + nf_ct_netns_put(par->net, par->family); } static struct xt_target masquerade_tg6_reg __read_mostly = { .name = "MASQUERADE", .family = NFPROTO_IPV6, .checkentry = masquerade_tg6_checkentry, + .destroy = masquerade_tg6_destroy, .target = masquerade_tg6, .targetsize = sizeof(struct nf_nat_range), .table = "nat", diff --git a/net/ipv6/netfilter/ip6t_srh.c b/net/ipv6/netfilter/ip6t_srh.c new file mode 100644 index 000000000000..9642164107ce --- /dev/null +++ b/net/ipv6/netfilter/ip6t_srh.c @@ -0,0 +1,161 @@ +/* Kernel module to match Segment Routing Header (SRH) parameters. */ + +/* Author: + * Ahmed Abdelsalam <amsalam20@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ipv6.h> +#include <linux/types.h> +#include <net/ipv6.h> +#include <net/seg6.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_ipv6/ip6t_srh.h> +#include <linux/netfilter_ipv6/ip6_tables.h> + +/* Test a struct->mt_invflags and a boolean for inequality */ +#define NF_SRH_INVF(ptr, flag, boolean) \ + ((boolean) ^ !!((ptr)->mt_invflags & (flag))) + +static bool srh_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct ip6t_srh *srhinfo = par->matchinfo; + struct ipv6_sr_hdr *srh; + struct ipv6_sr_hdr _srh; + int hdrlen, srhoff = 0; + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return false; + srh = skb_header_pointer(skb, srhoff, sizeof(_srh), &_srh); + if (!srh) + return false; + + hdrlen = ipv6_optlen(srh); + if (skb->len - srhoff < hdrlen) + return false; + + if (srh->type != IPV6_SRCRT_TYPE_4) + return false; + + if (srh->segments_left > srh->first_segment) + return false; + + /* Next Header matching */ + if (srhinfo->mt_flags & IP6T_SRH_NEXTHDR) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NEXTHDR, + !(srh->nexthdr == srhinfo->next_hdr))) + return false; + + /* Header Extension Length matching */ + if (srhinfo->mt_flags & IP6T_SRH_LEN_EQ) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_EQ, + !(srh->hdrlen == srhinfo->hdr_len))) + return false; + + if (srhinfo->mt_flags & IP6T_SRH_LEN_GT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_GT, + !(srh->hdrlen > srhinfo->hdr_len))) + return false; + + if (srhinfo->mt_flags & IP6T_SRH_LEN_LT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_LT, + !(srh->hdrlen < srhinfo->hdr_len))) + return false; + + /* Segments Left matching */ + if (srhinfo->mt_flags & IP6T_SRH_SEGS_EQ) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_EQ, + !(srh->segments_left == srhinfo->segs_left))) + return false; + + if (srhinfo->mt_flags & IP6T_SRH_SEGS_GT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_GT, + !(srh->segments_left > srhinfo->segs_left))) + return false; + + if (srhinfo->mt_flags & IP6T_SRH_SEGS_LT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_LT, + !(srh->segments_left < srhinfo->segs_left))) + return false; + + /** + * Last Entry matching + * Last_Entry field was introduced in revision 6 of the SRH draft. + * It was called First_Segment in the previous revision + */ + if (srhinfo->mt_flags & IP6T_SRH_LAST_EQ) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_EQ, + !(srh->first_segment == srhinfo->last_entry))) + return false; + + if (srhinfo->mt_flags & IP6T_SRH_LAST_GT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_GT, + !(srh->first_segment > srhinfo->last_entry))) + return false; + + if (srhinfo->mt_flags & IP6T_SRH_LAST_LT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_LT, + !(srh->first_segment < srhinfo->last_entry))) + return false; + + /** + * Tag matchig + * Tag field was introduced in revision 6 of the SRH draft. + */ + if (srhinfo->mt_flags & IP6T_SRH_TAG) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_TAG, + !(srh->tag == srhinfo->tag))) + return false; + return true; +} + +static int srh_mt6_check(const struct xt_mtchk_param *par) +{ + const struct ip6t_srh *srhinfo = par->matchinfo; + + if (srhinfo->mt_flags & ~IP6T_SRH_MASK) { + pr_err("unknown srh match flags %X\n", srhinfo->mt_flags); + return -EINVAL; + } + + if (srhinfo->mt_invflags & ~IP6T_SRH_INV_MASK) { + pr_err("unknown srh invflags %X\n", srhinfo->mt_invflags); + return -EINVAL; + } + + return 0; +} + +static struct xt_match srh_mt6_reg __read_mostly = { + .name = "srh", + .family = NFPROTO_IPV6, + .match = srh_mt6, + .matchsize = sizeof(struct ip6t_srh), + .checkentry = srh_mt6_check, + .me = THIS_MODULE, +}; + +static int __init srh_mt6_init(void) +{ + return xt_register_match(&srh_mt6_reg); +} + +static void __exit srh_mt6_exit(void) +{ + xt_unregister_match(&srh_mt6_reg); +} + +module_init(srh_mt6_init); +module_exit(srh_mt6_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: IPv6 Segment Routing Header match"); +MODULE_AUTHOR("Ahmed Abdelsalam <amsalam20@gmail.com>"); diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index 2b1a9dcdbcb3..b0524b18c4fb 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -42,14 +42,6 @@ ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) u_int8_t hop_limit; u_int32_t flowlabel, mark; int err; -#if 0 - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) { - net_warn_ratelimited("ip6t_hook: happy cracking\n"); - return NF_ACCEPT; - } -#endif /* save source/dest address, mark, hoplimit, flowlabel, priority, */ memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr)); diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 991512576c8c..47306e45a80a 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -74,6 +74,7 @@ static const struct nf_hook_ops nf_nat_ipv6_ops[] = { { .hook = ip6table_nat_in, .pf = NFPROTO_IPV6, + .nat_hook = true, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP6_PRI_NAT_DST, }, @@ -81,6 +82,7 @@ static const struct nf_hook_ops nf_nat_ipv6_ops[] = { { .hook = ip6table_nat_out, .pf = NFPROTO_IPV6, + .nat_hook = true, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_NAT_SRC, }, @@ -88,12 +90,14 @@ static const struct nf_hook_ops nf_nat_ipv6_ops[] = { { .hook = ip6table_nat_local_fn, .pf = NFPROTO_IPV6, + .nat_hook = true, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST, }, /* After packet filtering, change source */ { .hook = ip6table_nat_fn, + .nat_hook = true, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC, diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index d4bc56443dc1..710fa0806c37 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -3,6 +3,7 @@ * * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/slab.h> @@ -11,6 +12,10 @@ static int __net_init ip6table_raw_table_init(struct net *net); +static bool raw_before_defrag __read_mostly; +MODULE_PARM_DESC(raw_before_defrag, "Enable raw table before defrag"); +module_param(raw_before_defrag, bool, 0000); + static const struct xt_table packet_raw = { .name = "raw", .valid_hooks = RAW_VALID_HOOKS, @@ -20,6 +25,15 @@ static const struct xt_table packet_raw = { .table_init = ip6table_raw_table_init, }; +static const struct xt_table packet_raw_before_defrag = { + .name = "raw", + .valid_hooks = RAW_VALID_HOOKS, + .me = THIS_MODULE, + .af = NFPROTO_IPV6, + .priority = NF_IP6_PRI_RAW_BEFORE_DEFRAG, + .table_init = ip6table_raw_table_init, +}; + /* The work comes in here from netfilter.c. */ static unsigned int ip6table_raw_hook(void *priv, struct sk_buff *skb, @@ -33,15 +47,19 @@ static struct nf_hook_ops *rawtable_ops __read_mostly; static int __net_init ip6table_raw_table_init(struct net *net) { struct ip6t_replace *repl; + const struct xt_table *table = &packet_raw; int ret; + if (raw_before_defrag) + table = &packet_raw_before_defrag; + if (net->ipv6.ip6table_raw) return 0; - repl = ip6t_alloc_initial_table(&packet_raw); + repl = ip6t_alloc_initial_table(table); if (repl == NULL) return -ENOMEM; - ret = ip6t_register_table(net, &packet_raw, repl, rawtable_ops, + ret = ip6t_register_table(net, table, repl, rawtable_ops, &net->ipv6.ip6table_raw); kfree(repl); return ret; @@ -62,9 +80,16 @@ static struct pernet_operations ip6table_raw_net_ops = { static int __init ip6table_raw_init(void) { int ret; + const struct xt_table *table = &packet_raw; + + if (raw_before_defrag) { + table = &packet_raw_before_defrag; + + pr_info("Enabling raw table before defrag\n"); + } /* Register hooks */ - rawtable_ops = xt_hook_ops_alloc(&packet_raw, ip6table_raw_hook); + rawtable_ops = xt_hook_ops_alloc(table, ip6table_raw_hook); if (IS_ERR(rawtable_ops)) return PTR_ERR(rawtable_ops); diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 3b80a38f62b8..663827ee3cf8 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -176,11 +176,6 @@ static unsigned int ipv6_conntrack_local(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct ipv6hdr)) { - net_notice_ratelimited("ipv6_conntrack_local: packet too short\n"); - return NF_ACCEPT; - } return nf_conntrack_in(state->net, PF_INET6, state->hook, skb); } @@ -226,20 +221,27 @@ static const struct nf_hook_ops ipv6_conntrack_ops[] = { static int ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len) { - const struct inet_sock *inet = inet_sk(sk); + struct nf_conntrack_tuple tuple = { .src.l3num = NFPROTO_IPV6 }; const struct ipv6_pinfo *inet6 = inet6_sk(sk); + const struct inet_sock *inet = inet_sk(sk); const struct nf_conntrack_tuple_hash *h; struct sockaddr_in6 sin6; - struct nf_conntrack_tuple tuple = { .src.l3num = NFPROTO_IPV6 }; struct nf_conn *ct; + __be32 flow_label; + int bound_dev_if; + lock_sock(sk); tuple.src.u3.in6 = sk->sk_v6_rcv_saddr; tuple.src.u.tcp.port = inet->inet_sport; tuple.dst.u3.in6 = sk->sk_v6_daddr; tuple.dst.u.tcp.port = inet->inet_dport; tuple.dst.protonum = sk->sk_protocol; + bound_dev_if = sk->sk_bound_dev_if; + flow_label = inet6->flow_label; + release_sock(sk); - if (sk->sk_protocol != IPPROTO_TCP && sk->sk_protocol != IPPROTO_SCTP) + if (tuple.dst.protonum != IPPROTO_TCP && + tuple.dst.protonum != IPPROTO_SCTP) return -ENOPROTOOPT; if (*len < 0 || (unsigned int) *len < sizeof(sin6)) @@ -257,14 +259,13 @@ ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len) sin6.sin6_family = AF_INET6; sin6.sin6_port = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port; - sin6.sin6_flowinfo = inet6->flow_label & IPV6_FLOWINFO_MASK; + sin6.sin6_flowinfo = flow_label & IPV6_FLOWINFO_MASK; memcpy(&sin6.sin6_addr, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6, sizeof(sin6.sin6_addr)); nf_ct_put(ct); - sin6.sin6_scope_id = ipv6_iface_scope_id(&sin6.sin6_addr, - sk->sk_bound_dev_if); + sin6.sin6_scope_id = ipv6_iface_scope_id(&sin6.sin6_addr, bound_dev_if); return copy_to_user(user, &sin6, sizeof(sin6)) ? -EFAULT : 0; } @@ -368,7 +369,7 @@ static struct nf_sockopt_ops so_getorigdst6 = { .owner = THIS_MODULE, }; -static struct nf_conntrack_l4proto *builtin_l4proto6[] = { +static const struct nf_conntrack_l4proto * const builtin_l4proto6[] = { &nf_conntrack_l4proto_tcp6, &nf_conntrack_l4proto_udp6, &nf_conntrack_l4proto_icmpv6, diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 3ac0d826afc4..2548e2c8aedd 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -27,7 +27,7 @@ #include <net/netfilter/ipv6/nf_conntrack_icmpv6.h> #include <net/netfilter/nf_log.h> -static unsigned int nf_ct_icmpv6_timeout __read_mostly = 30*HZ; +static const unsigned int nf_ct_icmpv6_timeout = 30*HZ; static inline struct nf_icmp_net *icmpv6_pernet(struct net *net) { @@ -352,7 +352,7 @@ static struct nf_proto_net *icmpv6_get_net_proto(struct net *net) return &net->ct.nf_ct_proto.icmpv6.pn; } -struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 __read_mostly = +const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 = { .l3proto = PF_INET6, .l4proto = IPPROTO_ICMPV6, diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 977d8900cfd1..ce53dcfda88a 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -231,7 +231,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, if ((unsigned int)end > IPV6_MAXPLEN) { pr_debug("offset is too large.\n"); - return -1; + return -EINVAL; } ecn = ip6_frag_ecn(ipv6_hdr(skb)); @@ -264,7 +264,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, * this case. -DaveM */ pr_debug("end of fragment not rounded to 8 bytes.\n"); - return -1; + return -EPROTO; } if (end > fq->q.len) { /* Some bits beyond end -> corruption. */ @@ -358,7 +358,7 @@ found: discard_fq: inet_frag_kill(&fq->q, &nf_frags); err: - return -1; + return -EINVAL; } /* @@ -567,6 +567,7 @@ find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff) int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) { + u16 savethdr = skb->transport_header; struct net_device *dev = skb->dev; int fhoff, nhoff, ret; struct frag_hdr *fhdr; @@ -600,8 +601,12 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) spin_lock_bh(&fq->q.lock); - if (nf_ct_frag6_queue(fq, skb, fhdr, nhoff) < 0) { - ret = -EINVAL; + ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff); + if (ret < 0) { + if (ret == -EPROTO) { + skb->transport_header = savethdr; + ret = 0; + } goto out_unlock; } diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index b326da59257f..c87b48359e8f 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -63,6 +63,9 @@ static unsigned int ipv6_defrag(void *priv, /* Previously seen (loopback)? */ if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb))) return NF_ACCEPT; + + if (skb->_nfct == IP_CT_UNTRACKED) + return NF_ACCEPT; #endif err = nf_ct_frag6_gather(state->net, skb, diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c new file mode 100644 index 000000000000..fff21602875a --- /dev/null +++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c @@ -0,0 +1,277 @@ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/rhashtable.h> +#include <linux/ipv6.h> +#include <linux/netdevice.h> +#include <net/ipv6.h> +#include <net/ip6_route.h> +#include <net/neighbour.h> +#include <net/netfilter/nf_flow_table.h> +#include <net/netfilter/nf_tables.h> +/* For layer 4 checksum field offset. */ +#include <linux/tcp.h> +#include <linux/udp.h> + +static int nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff, + struct in6_addr *addr, + struct in6_addr *new_addr) +{ + struct tcphdr *tcph; + + if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) || + skb_try_make_writable(skb, thoff + sizeof(*tcph))) + return -1; + + tcph = (void *)(skb_network_header(skb) + thoff); + inet_proto_csum_replace16(&tcph->check, skb, addr->s6_addr32, + new_addr->s6_addr32, true); + + return 0; +} + +static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff, + struct in6_addr *addr, + struct in6_addr *new_addr) +{ + struct udphdr *udph; + + if (!pskb_may_pull(skb, thoff + sizeof(*udph)) || + skb_try_make_writable(skb, thoff + sizeof(*udph))) + return -1; + + udph = (void *)(skb_network_header(skb) + thoff); + if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { + inet_proto_csum_replace16(&udph->check, skb, addr->s6_addr32, + new_addr->s6_addr32, true); + if (!udph->check) + udph->check = CSUM_MANGLED_0; + } + + return 0; +} + +static int nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int thoff, struct in6_addr *addr, + struct in6_addr *new_addr) +{ + switch (ip6h->nexthdr) { + case IPPROTO_TCP: + if (nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr) < 0) + return NF_DROP; + break; + case IPPROTO_UDP: + if (nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr) < 0) + return NF_DROP; + break; + } + + return 0; +} + +static int nf_flow_snat_ipv6(const struct flow_offload *flow, + struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int thoff, + enum flow_offload_tuple_dir dir) +{ + struct in6_addr addr, new_addr; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = ip6h->saddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6; + ip6h->saddr = new_addr; + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = ip6h->daddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6; + ip6h->daddr = new_addr; + break; + default: + return -1; + } + + return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr); +} + +static int nf_flow_dnat_ipv6(const struct flow_offload *flow, + struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int thoff, + enum flow_offload_tuple_dir dir) +{ + struct in6_addr addr, new_addr; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = ip6h->daddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6; + ip6h->daddr = new_addr; + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = ip6h->saddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6; + ip6h->saddr = new_addr; + break; + default: + return -1; + } + + return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr); +} + +static int nf_flow_nat_ipv6(const struct flow_offload *flow, + struct sk_buff *skb, + enum flow_offload_tuple_dir dir) +{ + struct ipv6hdr *ip6h = ipv6_hdr(skb); + unsigned int thoff = sizeof(*ip6h); + + if (flow->flags & FLOW_OFFLOAD_SNAT && + (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || + nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0)) + return -1; + if (flow->flags & FLOW_OFFLOAD_DNAT && + (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || + nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0)) + return -1; + + return 0; +} + +static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, + struct flow_offload_tuple *tuple) +{ + struct flow_ports *ports; + struct ipv6hdr *ip6h; + unsigned int thoff; + + if (!pskb_may_pull(skb, sizeof(*ip6h))) + return -1; + + ip6h = ipv6_hdr(skb); + + if (ip6h->nexthdr != IPPROTO_TCP && + ip6h->nexthdr != IPPROTO_UDP) + return -1; + + thoff = sizeof(*ip6h); + if (!pskb_may_pull(skb, thoff + sizeof(*ports))) + return -1; + + ports = (struct flow_ports *)(skb_network_header(skb) + thoff); + + tuple->src_v6 = ip6h->saddr; + tuple->dst_v6 = ip6h->daddr; + tuple->src_port = ports->source; + tuple->dst_port = ports->dest; + tuple->l3proto = AF_INET6; + tuple->l4proto = ip6h->nexthdr; + tuple->iifidx = dev->ifindex; + + return 0; +} + +/* Based on ip_exceeds_mtu(). */ +static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) +{ + if (skb->len <= mtu) + return false; + + if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) + return false; + + return true; +} + +static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rt6_info *rt) +{ + u32 mtu; + + mtu = ip6_dst_mtu_forward(&rt->dst); + if (__nf_flow_exceeds_mtu(skb, mtu)) + return true; + + return false; +} + +unsigned int +nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct flow_offload_tuple_rhash *tuplehash; + struct nf_flowtable *flow_table = priv; + struct flow_offload_tuple tuple = {}; + enum flow_offload_tuple_dir dir; + struct flow_offload *flow; + struct net_device *outdev; + struct in6_addr *nexthop; + struct ipv6hdr *ip6h; + struct rt6_info *rt; + + if (skb->protocol != htons(ETH_P_IPV6)) + return NF_ACCEPT; + + if (nf_flow_tuple_ipv6(skb, state->in, &tuple) < 0) + return NF_ACCEPT; + + tuplehash = flow_offload_lookup(flow_table, &tuple); + if (tuplehash == NULL) + return NF_ACCEPT; + + outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx); + if (!outdev) + return NF_ACCEPT; + + dir = tuplehash->tuple.dir; + flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + + rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache; + if (unlikely(nf_flow_exceeds_mtu(skb, rt))) + return NF_ACCEPT; + + if (skb_try_make_writable(skb, sizeof(*ip6h))) + return NF_DROP; + + if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) && + nf_flow_nat_ipv6(flow, skb, dir) < 0) + return NF_DROP; + + flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; + ip6h = ipv6_hdr(skb); + ip6h->hop_limit--; + + skb->dev = outdev; + nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6); + neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb); + + return NF_STOLEN; +} +EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook); + +static struct nf_flowtable_type flowtable_ipv6 = { + .family = NFPROTO_IPV6, + .params = &nf_flow_offload_rhash_params, + .gc = nf_flow_offload_work_gc, + .hook = nf_flow_offload_ipv6_hook, + .owner = THIS_MODULE, +}; + +static int __init nf_flow_ipv6_module_init(void) +{ + nft_register_flowtable_type(&flowtable_ipv6); + + return 0; +} + +static void __exit nf_flow_ipv6_module_exit(void) +{ + nft_unregister_flowtable_type(&flowtable_ipv6); +} + +module_init(nf_flow_ipv6_module_init); +module_exit(nf_flow_ipv6_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_ALIAS_NF_FLOWTABLE(AF_INET6); diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index 1d2fb9267d6f..bed57ee65f7b 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -369,10 +369,6 @@ nf_nat_ipv6_out(void *priv, struct sk_buff *skb, #endif unsigned int ret; - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct ipv6hdr)) - return NF_ACCEPT; - ret = nf_nat_ipv6_fn(priv, skb, state, do_chain); #ifdef CONFIG_XFRM if (ret != NF_DROP && ret != NF_STOLEN && @@ -408,10 +404,6 @@ nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, unsigned int ret; int err; - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct ipv6hdr)) - return NF_ACCEPT; - ret = nf_nat_ipv6_fn(priv, skb, state, do_chain); if (ret != NF_DROP && ret != NF_STOLEN && (ct = nf_ct_get(skb, &ctinfo)) != NULL) { diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c index d6e4ba5de916..17e03589331c 100644 --- a/net/ipv6/netfilter/nf_tables_ipv6.c +++ b/net/ipv6/netfilter/nf_tables_ipv6.c @@ -22,68 +22,12 @@ static unsigned int nft_do_chain_ipv6(void *priv, { struct nft_pktinfo pkt; - nft_set_pktinfo_ipv6(&pkt, skb, state); + nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_ipv6(&pkt, skb); return nft_do_chain(&pkt, priv); } -static unsigned int nft_ipv6_output(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - if (unlikely(skb->len < sizeof(struct ipv6hdr))) { - if (net_ratelimit()) - pr_info("nf_tables_ipv6: ignoring short SOCK_RAW " - "packet\n"); - return NF_ACCEPT; - } - - return nft_do_chain_ipv6(priv, skb, state); -} - -struct nft_af_info nft_af_ipv6 __read_mostly = { - .family = NFPROTO_IPV6, - .nhooks = NF_INET_NUMHOOKS, - .owner = THIS_MODULE, - .nops = 1, - .hooks = { - [NF_INET_LOCAL_IN] = nft_do_chain_ipv6, - [NF_INET_LOCAL_OUT] = nft_ipv6_output, - [NF_INET_FORWARD] = nft_do_chain_ipv6, - [NF_INET_PRE_ROUTING] = nft_do_chain_ipv6, - [NF_INET_POST_ROUTING] = nft_do_chain_ipv6, - }, -}; -EXPORT_SYMBOL_GPL(nft_af_ipv6); - -static int nf_tables_ipv6_init_net(struct net *net) -{ - net->nft.ipv6 = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); - if (net->nft.ipv6 == NULL) - return -ENOMEM; - - memcpy(net->nft.ipv6, &nft_af_ipv6, sizeof(nft_af_ipv6)); - - if (nft_register_afinfo(net, net->nft.ipv6) < 0) - goto err; - - return 0; -err: - kfree(net->nft.ipv6); - return -ENOMEM; -} - -static void nf_tables_ipv6_exit_net(struct net *net) -{ - nft_unregister_afinfo(net, net->nft.ipv6); - kfree(net->nft.ipv6); -} - -static struct pernet_operations nf_tables_ipv6_net_ops = { - .init = nf_tables_ipv6_init_net, - .exit = nf_tables_ipv6_exit_net, -}; - static const struct nf_chain_type filter_ipv6 = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, @@ -94,26 +38,22 @@ static const struct nf_chain_type filter_ipv6 = { (1 << NF_INET_FORWARD) | (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING), + .hooks = { + [NF_INET_LOCAL_IN] = nft_do_chain_ipv6, + [NF_INET_LOCAL_OUT] = nft_do_chain_ipv6, + [NF_INET_FORWARD] = nft_do_chain_ipv6, + [NF_INET_PRE_ROUTING] = nft_do_chain_ipv6, + [NF_INET_POST_ROUTING] = nft_do_chain_ipv6, + }, }; static int __init nf_tables_ipv6_init(void) { - int ret; - - ret = nft_register_chain_type(&filter_ipv6); - if (ret < 0) - return ret; - - ret = register_pernet_subsys(&nf_tables_ipv6_net_ops); - if (ret < 0) - nft_unregister_chain_type(&filter_ipv6); - - return ret; + return nft_register_chain_type(&filter_ipv6); } static void __exit nf_tables_ipv6_exit(void) { - unregister_pernet_subsys(&nf_tables_ipv6_net_ops); nft_unregister_chain_type(&filter_ipv6); } @@ -122,4 +62,4 @@ module_exit(nf_tables_ipv6_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_FAMILY(AF_INET6); +MODULE_ALIAS_NFT_CHAIN(AF_INET6, "filter"); diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c index 443cd306c0b0..73fe2bd13fcf 100644 --- a/net/ipv6/netfilter/nft_chain_nat_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c @@ -31,7 +31,8 @@ static unsigned int nft_nat_do_chain(void *priv, { struct nft_pktinfo pkt; - nft_set_pktinfo_ipv6(&pkt, skb, state); + nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_ipv6(&pkt, skb); return nft_do_chain(&pkt, priv); } diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c index f2727475895e..11d3c3b9aa18 100644 --- a/net/ipv6/netfilter/nft_chain_route_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c @@ -33,7 +33,8 @@ static unsigned int nf_route_table_hook(void *priv, u32 mark, flowlabel; int err; - nft_set_pktinfo_ipv6(&pkt, skb, state); + nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_ipv6(&pkt, skb); /* save source/dest address, mark, hoplimit, flowlabel, priority */ memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr)); diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index 54b5899543ef..cc5174c7254c 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -60,7 +60,6 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv, { const struct net_device *dev = NULL; const struct nf_ipv6_ops *v6ops; - const struct nf_afinfo *afinfo; int route_err, addrtype; struct rt6_info *rt; struct flowi6 fl6 = { @@ -69,8 +68,8 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv, }; u32 ret = 0; - afinfo = nf_get_afinfo(NFPROTO_IPV6); - if (!afinfo) + v6ops = nf_get_ipv6_ops(); + if (!v6ops) return RTN_UNREACHABLE; if (priv->flags & NFTA_FIB_F_IIF) @@ -80,12 +79,11 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv, nft_fib6_flowi_init(&fl6, priv, pkt, dev, iph); - v6ops = nf_get_ipv6_ops(); - if (dev && v6ops && v6ops->chk_addr(nft_net(pkt), &fl6.daddr, dev, true)) + if (dev && v6ops->chk_addr(nft_net(pkt), &fl6.daddr, dev, true)) ret = RTN_LOCAL; - route_err = afinfo->route(nft_net(pkt), (struct dst_entry **)&rt, - flowi6_to_flowi(&fl6), false); + route_err = v6ops->route(nft_net(pkt), (struct dst_entry **)&rt, + flowi6_to_flowi(&fl6), false); if (route_err) goto err; diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index e88bcb8ff0fd..b67814242f78 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -58,7 +58,6 @@ static int sockstat6_seq_open(struct inode *inode, struct file *file) } static const struct file_operations sockstat6_seq_fops = { - .owner = THIS_MODULE, .open = sockstat6_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -248,7 +247,6 @@ static int snmp6_seq_open(struct inode *inode, struct file *file) } static const struct file_operations snmp6_seq_fops = { - .owner = THIS_MODULE, .open = snmp6_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -274,7 +272,6 @@ static int snmp6_dev_seq_open(struct inode *inode, struct file *file) } static const struct file_operations snmp6_dev_seq_fops = { - .owner = THIS_MODULE, .open = snmp6_dev_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 761a473a07c5..ddda7eb3c623 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1308,7 +1308,6 @@ static int raw6_seq_open(struct inode *inode, struct file *file) } static const struct file_operations raw6_seq_fops = { - .owner = THIS_MODULE, .open = raw6_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 7a8d1500d374..fb2d251c0500 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -186,7 +186,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt) { - return dst_metrics_write_ptr(rt->dst.from); + return dst_metrics_write_ptr(&rt->from->dst); } static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old) @@ -391,7 +391,7 @@ static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *)dst; struct rt6_exception_bucket *bucket; - struct dst_entry *from = dst->from; + struct rt6_info *from = rt->from; struct inet6_dev *idev; dst_destroy_metrics_generic(dst); @@ -409,8 +409,8 @@ static void ip6_dst_destroy(struct dst_entry *dst) kfree(bucket); } - dst->from = NULL; - dst_release(from); + rt->from = NULL; + dst_release(&from->dst); } static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, @@ -443,9 +443,9 @@ static bool rt6_check_expired(const struct rt6_info *rt) if (rt->rt6i_flags & RTF_EXPIRES) { if (time_after(jiffies, rt->dst.expires)) return true; - } else if (rt->dst.from) { + } else if (rt->from) { return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK || - rt6_check_expired((struct rt6_info *)rt->dst.from); + rt6_check_expired(rt->from); } return false; } @@ -455,7 +455,6 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, int strict) { struct rt6_info *sibling, *next_sibling; - int route_choosen; /* We might have already computed the hash for ICMPv6 errors. In such * case it will always be non-zero. Otherwise now is the time to do it. @@ -463,26 +462,19 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, if (!fl6->mp_hash) fl6->mp_hash = rt6_multipath_hash(fl6, NULL); - route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1); - /* Don't change the route, if route_choosen == 0 - * (siblings does not include ourself) - */ - if (route_choosen) - list_for_each_entry_safe(sibling, next_sibling, - &match->rt6i_siblings, rt6i_siblings) { - route_choosen--; - if (route_choosen == 0) { - struct inet6_dev *idev = sibling->rt6i_idev; - - if (!netif_carrier_ok(sibling->dst.dev) && - idev->cnf.ignore_routes_with_linkdown) - break; - if (rt6_score_route(sibling, oif, strict) < 0) - break; - match = sibling; - break; - } - } + if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound)) + return match; + + list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings, + rt6i_siblings) { + if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound)) + continue; + if (rt6_score_route(sibling, oif, strict) < 0) + break; + match = sibling; + break; + } + return match; } @@ -499,12 +491,15 @@ static inline struct rt6_info *rt6_device_match(struct net *net, struct rt6_info *local = NULL; struct rt6_info *sprt; - if (!oif && ipv6_addr_any(saddr)) - goto out; + if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD)) + return rt; - for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) { + for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) { struct net_device *dev = sprt->dst.dev; + if (sprt->rt6i_nh_flags & RTNH_F_DEAD) + continue; + if (oif) { if (dev->ifindex == oif) return sprt; @@ -533,8 +528,8 @@ static inline struct rt6_info *rt6_device_match(struct net *net, if (flags & RT6_LOOKUP_F_IFACE) return net->ipv6.ip6_null_entry; } -out: - return rt; + + return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt; } #ifdef CONFIG_IPV6_ROUTER_PREF @@ -679,10 +674,12 @@ static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict, int m; bool match_do_rr = false; struct inet6_dev *idev = rt->rt6i_idev; - struct net_device *dev = rt->dst.dev; - if (dev && !netif_carrier_ok(dev) && - idev->cnf.ignore_routes_with_linkdown && + if (rt->rt6i_nh_flags & RTNH_F_DEAD) + goto out; + + if (idev->cnf.ignore_routes_with_linkdown && + rt->rt6i_nh_flags & RTNH_F_LINKDOWN && !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) goto out; @@ -721,7 +718,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, match = NULL; cont = NULL; - for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) { + for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) { if (rt->rt6i_metric != metric) { cont = rt; break; @@ -731,7 +728,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, } for (rt = leaf; rt && rt != rr_head; - rt = rcu_dereference(rt->dst.rt6_next)) { + rt = rcu_dereference(rt->rt6_next)) { if (rt->rt6i_metric != metric) { cont = rt; break; @@ -743,7 +740,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, if (match || !cont) return match; - for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next)) + for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next)) match = find_match(rt, oif, strict, &mpri, match, do_rr); return match; @@ -781,7 +778,7 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, &do_rr); if (do_rr) { - struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next); + struct rt6_info *next = rcu_dereference(rt0->rt6_next); /* no entries matched; do round-robin */ if (!next || next->rt6i_metric != rt0->rt6i_metric) @@ -1054,7 +1051,7 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, */ if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) - ort = (struct rt6_info *)ort->dst.from; + ort = ort->from; rcu_read_lock(); dev = ip6_rt_get_dev_rcu(ort); @@ -1274,7 +1271,7 @@ static int rt6_insert_exception(struct rt6_info *nrt, /* ort can't be a cache or pcpu route */ if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) - ort = (struct rt6_info *)ort->dst.from; + ort = ort->from; WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)); spin_lock_bh(&rt6_exception_lock); @@ -1346,7 +1343,9 @@ out: /* Update fn->fn_sernum to invalidate all cached dst */ if (!err) { + spin_lock_bh(&ort->rt6i_table->tb6_lock); fib6_update_sernum(ort); + spin_unlock_bh(&ort->rt6i_table->tb6_lock); fib6_force_start_gc(net); } @@ -1415,8 +1414,8 @@ static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, /* Remove the passed in cached rt from the hash table that contains it */ int rt6_remove_exception_rt(struct rt6_info *rt) { - struct rt6_info *from = (struct rt6_info *)rt->dst.from; struct rt6_exception_bucket *bucket; + struct rt6_info *from = rt->from; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; int err; @@ -1460,8 +1459,8 @@ int rt6_remove_exception_rt(struct rt6_info *rt) */ static void rt6_update_exception_stamp_rt(struct rt6_info *rt) { - struct rt6_info *from = (struct rt6_info *)rt->dst.from; struct rt6_exception_bucket *bucket; + struct rt6_info *from = rt->from; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; @@ -1586,12 +1585,19 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when * expired, independently from their aging, as per RFC 8201 section 4 */ - if (!(rt->rt6i_flags & RTF_EXPIRES) && - time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { - RT6_TRACE("aging clone %p\n", rt); + if (!(rt->rt6i_flags & RTF_EXPIRES)) { + if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { + RT6_TRACE("aging clone %p\n", rt); + rt6_remove_exception(bucket, rt6_ex); + return; + } + } else if (time_after(jiffies, rt->dst.expires)) { + RT6_TRACE("purging expired route %p\n", rt); rt6_remove_exception(bucket, rt6_ex); return; - } else if (rt->rt6i_flags & RTF_GATEWAY) { + } + + if (rt->rt6i_flags & RTF_GATEWAY) { struct neighbour *neigh; __u8 neigh_flags = 0; @@ -1606,11 +1612,8 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, rt6_remove_exception(bucket, rt6_ex); return; } - } else if (__rt6_check_expired(rt)) { - RT6_TRACE("purging expired route %p\n", rt); - rt6_remove_exception(bucket, rt6_ex); - return; } + gc_args->more++; } @@ -1824,10 +1827,10 @@ u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb) if (skb) { ip6_multipath_l3_keys(skb, &hash_keys); - return flow_hash_from_keys(&hash_keys); + return flow_hash_from_keys(&hash_keys) >> 1; } - return get_hash_from_flowi6(fl6); + return get_hash_from_flowi6(fl6) >> 1; } void ip6_route_input(struct sk_buff *skb) @@ -1929,9 +1932,9 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori static void rt6_dst_from_metrics_check(struct rt6_info *rt) { - if (rt->dst.from && - dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from)) - dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true); + if (rt->from && + dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst)) + dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true); } static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) @@ -1951,7 +1954,7 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie) { if (!__rt6_check_expired(rt) && rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && - rt6_check((struct rt6_info *)(rt->dst.from), cookie)) + rt6_check(rt->from, cookie)) return &rt->dst; else return NULL; @@ -1971,7 +1974,7 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) rt6_dst_from_metrics_check(rt); if (rt->rt6i_flags & RTF_PCPU || - (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from)) + (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from)) return rt6_dst_from_check(rt, cookie); else return rt6_check(rt, cookie); @@ -2154,6 +2157,8 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: for_each_fib6_node_rt_rcu(fn) { + if (rt->rt6i_nh_flags & RTNH_F_DEAD) + continue; if (rt6_check_expired(rt)) continue; if (rt->dst.error) @@ -2336,6 +2341,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, } rt->dst.flags |= DST_HOST; + rt->dst.input = ip6_input; rt->dst.output = ip6_output; rt->rt6i_gateway = fl6->daddr; rt->rt6i_dst.addr = fl6->daddr; @@ -2343,7 +2349,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, rt->rt6i_idev = idev; dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); - /* Add this dst into uncached_list so that rt6_ifdown() can + /* Add this dst into uncached_list so that rt6_disable_ip() can * do proper release of the net_device */ rt6_uncached_list_add(rt); @@ -2438,7 +2444,8 @@ static int ip6_convert_metrics(struct mx6_config *mxc, static struct rt6_info *ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, - const struct in6_addr *gw_addr) + const struct in6_addr *gw_addr, + u32 tbid, int flags) { struct flowi6 fl6 = { .flowi6_oif = cfg->fc_ifindex, @@ -2447,15 +2454,15 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net, }; struct fib6_table *table; struct rt6_info *rt; - int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE; - table = fib6_get_table(net, cfg->fc_table); + table = fib6_get_table(net, tbid); if (!table) return NULL; if (!ipv6_addr_any(&cfg->fc_prefsrc)) flags |= RT6_LOOKUP_F_HAS_SADDR; + flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags); /* if table lookup failed, fall back to full lookup */ @@ -2467,6 +2474,82 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net, return rt; } +static int ip6_route_check_nh_onlink(struct net *net, + struct fib6_config *cfg, + struct net_device *dev, + struct netlink_ext_ack *extack) +{ + u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL; + const struct in6_addr *gw_addr = &cfg->fc_gateway; + u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; + struct rt6_info *grt; + int err; + + err = 0; + grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); + if (grt) { + if (grt->rt6i_flags & flags || dev != grt->dst.dev) { + NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); + err = -EINVAL; + } + + ip6_rt_put(grt); + } + + return err; +} + +static int ip6_route_check_nh(struct net *net, + struct fib6_config *cfg, + struct net_device **_dev, + struct inet6_dev **idev) +{ + const struct in6_addr *gw_addr = &cfg->fc_gateway; + struct net_device *dev = _dev ? *_dev : NULL; + struct rt6_info *grt = NULL; + int err = -EHOSTUNREACH; + + if (cfg->fc_table) { + int flags = RT6_LOOKUP_F_IFACE; + + grt = ip6_nh_lookup_table(net, cfg, gw_addr, + cfg->fc_table, flags); + if (grt) { + if (grt->rt6i_flags & RTF_GATEWAY || + (dev && dev != grt->dst.dev)) { + ip6_rt_put(grt); + grt = NULL; + } + } + } + + if (!grt) + grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1); + + if (!grt) + goto out; + + if (dev) { + if (dev != grt->dst.dev) { + ip6_rt_put(grt); + goto out; + } + } else { + *_dev = dev = grt->dst.dev; + *idev = grt->rt6i_idev; + dev_hold(dev); + in6_dev_hold(grt->rt6i_idev); + } + + if (!(grt->rt6i_flags & RTF_GATEWAY)) + err = 0; + + ip6_rt_put(grt); + +out: + return err; +} + static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, struct netlink_ext_ack *extack) { @@ -2518,6 +2601,21 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, if (cfg->fc_metric == 0) cfg->fc_metric = IP6_RT_PRIO_USER; + if (cfg->fc_flags & RTNH_F_ONLINK) { + if (!dev) { + NL_SET_ERR_MSG(extack, + "Nexthop device required for onlink"); + err = -ENODEV; + goto out; + } + + if (!(dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, "Nexthop device is not up"); + err = -ENETDOWN; + goto out; + } + } + err = -ENOBUFS; if (cfg->fc_nlinfo.nlh && !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { @@ -2592,6 +2690,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, #endif rt->rt6i_metric = cfg->fc_metric; + rt->rt6i_nh_weight = 1; /* We cannot add true routes via loopback here, they would result in kernel looping; promote them to reject routes @@ -2661,8 +2760,6 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, rt->rt6i_gateway = *gw_addr; if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) { - struct rt6_info *grt = NULL; - /* IPv6 strictly inhibits using not link-local addresses as nexthop address. Otherwise, router will not able to send redirects. @@ -2679,40 +2776,12 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } - if (cfg->fc_table) { - grt = ip6_nh_lookup_table(net, cfg, gw_addr); - - if (grt) { - if (grt->rt6i_flags & RTF_GATEWAY || - (dev && dev != grt->dst.dev)) { - ip6_rt_put(grt); - grt = NULL; - } - } - } - - if (!grt) - grt = rt6_lookup(net, gw_addr, NULL, - cfg->fc_ifindex, 1); - - err = -EHOSTUNREACH; - if (!grt) - goto out; - if (dev) { - if (dev != grt->dst.dev) { - ip6_rt_put(grt); - goto out; - } + if (cfg->fc_flags & RTNH_F_ONLINK) { + err = ip6_route_check_nh_onlink(net, cfg, dev, + extack); } else { - dev = grt->dst.dev; - idev = grt->rt6i_idev; - dev_hold(dev); - in6_dev_hold(grt->rt6i_idev); + err = ip6_route_check_nh(net, cfg, &dev, &idev); } - if (!(grt->rt6i_flags & RTF_GATEWAY)) - err = 0; - ip6_rt_put(grt); - if (err) goto out; } @@ -2731,6 +2800,12 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, if (!dev) goto out; + if (!(dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, "Nexthop device is not up"); + err = -ENETDOWN; + goto out; + } + if (!ipv6_addr_any(&cfg->fc_prefsrc)) { if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { NL_SET_ERR_MSG(extack, "Invalid source address"); @@ -2745,6 +2820,10 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, rt->rt6i_flags = cfg->fc_flags; install_route: + if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) && + !netif_carrier_ok(dev)) + rt->rt6i_nh_flags |= RTNH_F_LINKDOWN; + rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); rt->dst.dev = dev; rt->rt6i_idev = idev; rt->rt6i_table = table; @@ -3055,11 +3134,11 @@ out: static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) { - BUG_ON(from->dst.from); + BUG_ON(from->from); rt->rt6i_flags &= ~RTF_EXPIRES; dst_hold(&from->dst); - rt->dst.from = &from->dst; + rt->from = from; dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true); } @@ -3458,37 +3537,249 @@ void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) fib6_clean_all(net, fib6_clean_tohost, gateway); } -struct arg_dev_net { - struct net_device *dev; - struct net *net; +struct arg_netdev_event { + const struct net_device *dev; + union { + unsigned int nh_flags; + unsigned long event; + }; }; +static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt) +{ + struct rt6_info *iter; + struct fib6_node *fn; + + fn = rcu_dereference_protected(rt->rt6i_node, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); + iter = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); + while (iter) { + if (iter->rt6i_metric == rt->rt6i_metric && + rt6_qualify_for_ecmp(iter)) + return iter; + iter = rcu_dereference_protected(iter->rt6_next, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); + } + + return NULL; +} + +static bool rt6_is_dead(const struct rt6_info *rt) +{ + if (rt->rt6i_nh_flags & RTNH_F_DEAD || + (rt->rt6i_nh_flags & RTNH_F_LINKDOWN && + rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) + return true; + + return false; +} + +static int rt6_multipath_total_weight(const struct rt6_info *rt) +{ + struct rt6_info *iter; + int total = 0; + + if (!rt6_is_dead(rt)) + total += rt->rt6i_nh_weight; + + list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) { + if (!rt6_is_dead(iter)) + total += iter->rt6i_nh_weight; + } + + return total; +} + +static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total) +{ + int upper_bound = -1; + + if (!rt6_is_dead(rt)) { + *weight += rt->rt6i_nh_weight; + upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, + total) - 1; + } + atomic_set(&rt->rt6i_nh_upper_bound, upper_bound); +} + +static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total) +{ + struct rt6_info *iter; + int weight = 0; + + rt6_upper_bound_set(rt, &weight, total); + + list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + rt6_upper_bound_set(iter, &weight, total); +} + +void rt6_multipath_rebalance(struct rt6_info *rt) +{ + struct rt6_info *first; + int total; + + /* In case the entire multipath route was marked for flushing, + * then there is no need to rebalance upon the removal of every + * sibling route. + */ + if (!rt->rt6i_nsiblings || rt->should_flush) + return; + + /* During lookup routes are evaluated in order, so we need to + * make sure upper bounds are assigned from the first sibling + * onwards. + */ + first = rt6_multipath_first_sibling(rt); + if (WARN_ON_ONCE(!first)) + return; + + total = rt6_multipath_total_weight(first); + rt6_multipath_upper_bound_set(first, total); +} + +static int fib6_ifup(struct rt6_info *rt, void *p_arg) +{ + const struct arg_netdev_event *arg = p_arg; + const struct net *net = dev_net(arg->dev); + + if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) { + rt->rt6i_nh_flags &= ~arg->nh_flags; + fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt); + rt6_multipath_rebalance(rt); + } + + return 0; +} + +void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) +{ + struct arg_netdev_event arg = { + .dev = dev, + { + .nh_flags = nh_flags, + }, + }; + + if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) + arg.nh_flags |= RTNH_F_LINKDOWN; + + fib6_clean_all(dev_net(dev), fib6_ifup, &arg); +} + +static bool rt6_multipath_uses_dev(const struct rt6_info *rt, + const struct net_device *dev) +{ + struct rt6_info *iter; + + if (rt->dst.dev == dev) + return true; + list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + if (iter->dst.dev == dev) + return true; + + return false; +} + +static void rt6_multipath_flush(struct rt6_info *rt) +{ + struct rt6_info *iter; + + rt->should_flush = 1; + list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + iter->should_flush = 1; +} + +static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt, + const struct net_device *down_dev) +{ + struct rt6_info *iter; + unsigned int dead = 0; + + if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD) + dead++; + list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + if (iter->dst.dev == down_dev || + iter->rt6i_nh_flags & RTNH_F_DEAD) + dead++; + + return dead; +} + +static void rt6_multipath_nh_flags_set(struct rt6_info *rt, + const struct net_device *dev, + unsigned int nh_flags) +{ + struct rt6_info *iter; + + if (rt->dst.dev == dev) + rt->rt6i_nh_flags |= nh_flags; + list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + if (iter->dst.dev == dev) + iter->rt6i_nh_flags |= nh_flags; +} + /* called with write lock held for table with rt */ -static int fib6_ifdown(struct rt6_info *rt, void *arg) +static int fib6_ifdown(struct rt6_info *rt, void *p_arg) { - const struct arg_dev_net *adn = arg; - const struct net_device *dev = adn->dev; + const struct arg_netdev_event *arg = p_arg; + const struct net_device *dev = arg->dev; + const struct net *net = dev_net(dev); - if ((rt->dst.dev == dev || !dev) && - rt != adn->net->ipv6.ip6_null_entry && - (rt->rt6i_nsiblings == 0 || - (dev && netdev_unregistering(dev)) || - !rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) - return -1; + if (rt == net->ipv6.ip6_null_entry) + return 0; + + switch (arg->event) { + case NETDEV_UNREGISTER: + return rt->dst.dev == dev ? -1 : 0; + case NETDEV_DOWN: + if (rt->should_flush) + return -1; + if (!rt->rt6i_nsiblings) + return rt->dst.dev == dev ? -1 : 0; + if (rt6_multipath_uses_dev(rt, dev)) { + unsigned int count; + + count = rt6_multipath_dead_count(rt, dev); + if (rt->rt6i_nsiblings + 1 == count) { + rt6_multipath_flush(rt); + return -1; + } + rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | + RTNH_F_LINKDOWN); + fib6_update_sernum(rt); + rt6_multipath_rebalance(rt); + } + return -2; + case NETDEV_CHANGE: + if (rt->dst.dev != dev || + rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) + break; + rt->rt6i_nh_flags |= RTNH_F_LINKDOWN; + rt6_multipath_rebalance(rt); + break; + } return 0; } -void rt6_ifdown(struct net *net, struct net_device *dev) +void rt6_sync_down_dev(struct net_device *dev, unsigned long event) { - struct arg_dev_net adn = { + struct arg_netdev_event arg = { .dev = dev, - .net = net, + { + .event = event, + }, }; - fib6_clean_all(net, fib6_ifdown, &adn); - if (dev) - rt6_uncached_list_flush_dev(net, dev); + fib6_clean_all(dev_net(dev), fib6_ifdown, &arg); +} + +void rt6_disable_ip(struct net_device *dev, unsigned long event) +{ + rt6_sync_down_dev(dev, event); + rt6_uncached_list_flush_dev(dev_net(dev), dev); + neigh_ifdown(&nd_tbl, dev); } struct rt6_mtu_change_arg { @@ -3602,6 +3893,8 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, if (rtm->rtm_flags & RTM_F_CLONED) cfg->fc_flags |= RTF_CACHE; + cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); + cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; cfg->fc_nlinfo.nlh = nlh; cfg->fc_nlinfo.nl_net = sock_net(skb->sk); @@ -3811,6 +4104,8 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, goto cleanup; } + rt->rt6i_nh_weight = rtnh->rtnh_hops + 1; + err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); if (err) { dst_release_immediate(&rt->dst); @@ -3991,7 +4286,10 @@ static size_t rt6_nlmsg_size(struct rt6_info *rt) static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt, unsigned int *flags, bool skip_oif) { - if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) { + if (rt->rt6i_nh_flags & RTNH_F_DEAD) + *flags |= RTNH_F_DEAD; + + if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) { *flags |= RTNH_F_LINKDOWN; if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown) *flags |= RTNH_F_DEAD; @@ -4002,6 +4300,7 @@ static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt, goto nla_put_failure; } + *flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK); if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD) *flags |= RTNH_F_OFFLOAD; @@ -4030,7 +4329,7 @@ static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt) if (!rtnh) goto nla_put_failure; - rtnh->rtnh_hops = 0; + rtnh->rtnh_hops = rt->rt6i_nh_weight - 1; rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0; if (rt6_nexthop_info(skb, rt, &flags, true) < 0) @@ -4297,19 +4596,13 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (!ipv6_addr_any(&fl6.saddr)) flags |= RT6_LOOKUP_F_HAS_SADDR; - if (!fibmatch) - dst = ip6_route_input_lookup(net, dev, &fl6, flags); - else - dst = ip6_route_lookup(net, &fl6, 0); + dst = ip6_route_input_lookup(net, dev, &fl6, flags); rcu_read_unlock(); } else { fl6.flowi6_oif = oif; - if (!fibmatch) - dst = ip6_route_output(net, NULL, &fl6); - else - dst = ip6_route_lookup(net, &fl6, 0); + dst = ip6_route_output(net, NULL, &fl6); } @@ -4326,6 +4619,14 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, goto errout; } + if (fibmatch && rt->from) { + struct rt6_info *ort = rt->from; + + dst_hold(&ort->dst); + ip6_rt_put(rt); + rt = ort; + } + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) { ip6_rt_put(rt); @@ -4423,7 +4724,6 @@ static int ip6_route_dev_notify(struct notifier_block *this, #ifdef CONFIG_PROC_FS static const struct file_operations ipv6_route_proc_fops = { - .owner = THIS_MODULE, .open = ipv6_route_open, .read = seq_read, .llseek = seq_lseek, @@ -4451,7 +4751,6 @@ static int rt6_stats_seq_open(struct inode *inode, struct file *file) } static const struct file_operations rt6_stats_seq_fops = { - .owner = THIS_MODULE, .open = rt6_stats_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -4596,8 +4895,6 @@ static int __net_init ip6_route_net_init(struct net *net) GFP_KERNEL); if (!net->ipv6.ip6_null_entry) goto out_ip6_dst_entries; - net->ipv6.ip6_null_entry->dst.path = - (struct dst_entry *)net->ipv6.ip6_null_entry; net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_null_entry->dst, ip6_template_metrics, true); @@ -4609,8 +4906,6 @@ static int __net_init ip6_route_net_init(struct net *net) GFP_KERNEL); if (!net->ipv6.ip6_prohibit_entry) goto out_ip6_null_entry; - net->ipv6.ip6_prohibit_entry->dst.path = - (struct dst_entry *)net->ipv6.ip6_prohibit_entry; net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, ip6_template_metrics, true); @@ -4620,8 +4915,6 @@ static int __net_init ip6_route_net_init(struct net *net) GFP_KERNEL); if (!net->ipv6.ip6_blk_hole_entry) goto out_ip6_prohibit_entry; - net->ipv6.ip6_blk_hole_entry->dst.path = - (struct dst_entry *)net->ipv6.ip6_blk_hole_entry; net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, ip6_template_metrics, true); @@ -4778,11 +5071,20 @@ int __init ip6_route_init(void) if (ret) goto fib6_rules_init; - ret = -ENOBUFS; - if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) || - __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) || - __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, - RTNL_FLAG_DOIT_UNLOCKED)) + ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE, + inet6_rtm_newroute, NULL, 0); + if (ret < 0) + goto out_register_late_subsys; + + ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE, + inet6_rtm_delroute, NULL, 0); + if (ret < 0) + goto out_register_late_subsys; + + ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, + inet6_rtm_getroute, NULL, + RTNL_FLAG_DOIT_UNLOCKED); + if (ret < 0) goto out_register_late_subsys; ret = register_netdevice_notifier(&ip6_route_dev_notifier); @@ -4800,6 +5102,7 @@ out: return ret; out_register_late_subsys: + rtnl_unregister_all(PF_INET6); unregister_pernet_subsys(&ip6_route_net_late_ops); fib6_rules_init: fib6_rules_cleanup(); diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index c81407770956..7f5621d09571 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -306,9 +306,7 @@ static int seg6_genl_dumphmac(struct sk_buff *skb, struct netlink_callback *cb) struct seg6_hmac_info *hinfo; int ret; - ret = rhashtable_walk_start(iter); - if (ret && ret != -EAGAIN) - goto done; + rhashtable_walk_start(iter); for (;;) { hinfo = rhashtable_walk_next(iter); diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 825b8e01f947..ba3767ef5e93 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -501,7 +501,7 @@ static struct seg6_action_desc *__get_action_desc(int action) struct seg6_action_desc *desc; int i, count; - count = sizeof(seg6_action_table) / sizeof(struct seg6_action_desc); + count = ARRAY_SIZE(seg6_action_table); for (i = 0; i < count; i++) { desc = &seg6_action_table[i]; if (desc->action == action) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index d60ddcb0bfe2..3873d3877135 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -934,8 +934,8 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, df = 0; } - if (tunnel->parms.iph.daddr && skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + if (tunnel->parms.iph.daddr) + skb_dst_update_pmtu(skb, mtu); if (skb->len > mtu && !skb_is_gso(skb)) { icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); @@ -1098,6 +1098,7 @@ static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p, ipip6_tunnel_link(sitn, t); t->parms.iph.ttl = p->iph.ttl; t->parms.iph.tos = p->iph.tos; + t->parms.iph.frag_off = p->iph.frag_off; if (t->parms.link != p->link || t->fwmark != fwmark) { t->parms.link = p->link; t->fwmark = fwmark; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 6bb98c93edfe..a1ab29e2ab3b 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -176,8 +176,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, /* If interface is set while binding, indices * must coincide. */ - if (sk->sk_bound_dev_if && - sk->sk_bound_dev_if != usin->sin6_scope_id) + if (!sk_dev_equal_l3scope(sk, usin->sin6_scope_id)) return -EINVAL; sk->sk_bound_dev_if = usin->sin6_scope_id; @@ -994,7 +993,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, sk->sk_bound_dev_if, - tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), + tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr), 0, 0); } @@ -1454,7 +1453,6 @@ process: struct sock *nsk; sk = req->rsk_listener; - tcp_v6_fill_cb(skb, hdr, th); if (tcp_v6_inbound_md5_hash(sk, skb)) { sk_drops_add(sk, skb); reqsk_put(req); @@ -1467,8 +1465,12 @@ process: sock_hold(sk); refcounted = true; nsk = NULL; - if (!tcp_filter(sk, skb)) + if (!tcp_filter(sk, skb)) { + th = (const struct tcphdr *)skb->data; + hdr = ipv6_hdr(skb); + tcp_v6_fill_cb(skb, hdr, th); nsk = tcp_check_req(sk, skb, req, false); + } if (!nsk) { reqsk_put(req); goto discard_and_relse; @@ -1492,8 +1494,6 @@ process: if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; - tcp_v6_fill_cb(skb, hdr, th); - if (tcp_v6_inbound_md5_hash(sk, skb)) goto discard_and_relse; @@ -1501,6 +1501,7 @@ process: goto discard_and_relse; th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); + tcp_v6_fill_cb(skb, hdr, th); skb->dev = NULL; @@ -1590,7 +1591,6 @@ do_time_wait: tcp_v6_timewait_ack(sk, skb); break; case TCP_TW_RST: - tcp_v6_restore_cb(skb); tcp_v6_send_reset(sk, skb); inet_twsk_deschedule_put(inet_twsk(sk)); goto discard_it; @@ -1794,7 +1794,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) timer_expires = jiffies; } - state = sk_state_load(sp); + state = inet_sk_state_load(sp); if (state == TCP_LISTEN) rx_queue = sp->sk_ack_backlog; else @@ -1883,7 +1883,6 @@ out: } static const struct file_operations tcp6_afinfo_seq_fops = { - .owner = THIS_MODULE, .open = tcp_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index d883c9204c01..278e49cd67d4 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -46,6 +46,9 @@ static struct sk_buff *tcp6_gso_segment(struct sk_buff *skb, { struct tcphdr *th; + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6)) + return ERR_PTR(-EINVAL); + if (!pskb_may_pull(skb, sizeof(*th))) return ERR_PTR(-EINVAL); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 3f30fa313bf2..52e3ea0e6f50 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -89,28 +89,12 @@ static u32 udp6_ehashfn(const struct net *net, udp_ipv6_hash_secret + net_hash_mix(net)); } -static u32 udp6_portaddr_hash(const struct net *net, - const struct in6_addr *addr6, - unsigned int port) -{ - unsigned int hash, mix = net_hash_mix(net); - - if (ipv6_addr_any(addr6)) - hash = jhash_1word(0, mix); - else if (ipv6_addr_v4mapped(addr6)) - hash = jhash_1word((__force u32)addr6->s6_addr32[3], mix); - else - hash = jhash2((__force u32 *)addr6->s6_addr32, 4, mix); - - return hash ^ port; -} - int udp_v6_get_port(struct sock *sk, unsigned short snum) { unsigned int hash2_nulladdr = - udp6_portaddr_hash(sock_net(sk), &in6addr_any, snum); + ipv6_portaddr_hash(sock_net(sk), &in6addr_any, snum); unsigned int hash2_partial = - udp6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, 0); + ipv6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, 0); /* precompute partial secondary hash */ udp_sk(sk)->udp_portaddr_hash = hash2_partial; @@ -119,7 +103,7 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum) static void udp_v6_rehash(struct sock *sk) { - u16 new_hash = udp6_portaddr_hash(sock_net(sk), + u16 new_hash = ipv6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, inet_sk(sk)->inet_num); @@ -184,7 +168,7 @@ static struct sock *udp6_lib_lookup2(struct net *net, struct udp_hslot *hslot2, struct sk_buff *skb) { struct sock *sk, *result; - int score, badness, matches = 0, reuseport = 0; + int score, badness; u32 hash = 0; result = NULL; @@ -193,8 +177,7 @@ static struct sock *udp6_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif, exact_dif); if (score > badness) { - reuseport = sk->sk_reuseport; - if (reuseport) { + if (sk->sk_reuseport) { hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); @@ -202,15 +185,9 @@ static struct sock *udp6_lib_lookup2(struct net *net, sizeof(struct udphdr)); if (result) return result; - matches = 1; } result = sk; badness = score; - } else if (score == badness && reuseport) { - matches++; - if (reciprocal_scale(hash, matches) == 0) - result = sk; - hash = next_pseudo_random32(hash); } } return result; @@ -228,11 +205,11 @@ struct sock *__udp6_lib_lookup(struct net *net, unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; bool exact_dif = udp6_lib_exact_dif_match(net, skb); - int score, badness, matches = 0, reuseport = 0; + int score, badness; u32 hash = 0; if (hslot->count > 10) { - hash2 = udp6_portaddr_hash(net, daddr, hnum); + hash2 = ipv6_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; hslot2 = &udptable->hash2[slot2]; if (hslot->count < hslot2->count) @@ -243,7 +220,7 @@ struct sock *__udp6_lib_lookup(struct net *net, hslot2, skb); if (!result) { unsigned int old_slot2 = slot2; - hash2 = udp6_portaddr_hash(net, &in6addr_any, hnum); + hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); slot2 = hash2 & udptable->mask; /* avoid searching the same slot again. */ if (unlikely(slot2 == old_slot2)) @@ -267,23 +244,16 @@ begin: score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif, exact_dif); if (score > badness) { - reuseport = sk->sk_reuseport; - if (reuseport) { + if (sk->sk_reuseport) { hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); if (result) return result; - matches = 1; } result = sk; badness = score; - } else if (score == badness && reuseport) { - matches++; - if (reciprocal_scale(hash, matches) == 0) - result = sk; - hash = next_pseudo_random32(hash); } } return result; @@ -719,9 +689,9 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct sk_buff *nskb; if (use_hash2) { - hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) & + hash2_any = ipv6_portaddr_hash(net, &in6addr_any, hnum) & udptable->mask; - hash2 = udp6_portaddr_hash(net, daddr, hnum) & udptable->mask; + hash2 = ipv6_portaddr_hash(net, daddr, hnum) & udptable->mask; start_lookup: hslot = &udptable->hash2[hash2]; offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); @@ -909,7 +879,7 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net, int dif, int sdif) { unsigned short hnum = ntohs(loc_port); - unsigned int hash2 = udp6_portaddr_hash(net, loc_addr, hnum); + unsigned int hash2 = ipv6_portaddr_hash(net, loc_addr, hnum); unsigned int slot2 = hash2 & udp_table.mask; struct udp_hslot *hslot2 = &udp_table.hash2[slot2]; const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum); @@ -1509,7 +1479,6 @@ int udp6_seq_show(struct seq_file *seq, void *v) } static const struct file_operations udp6_afinfo_seq_fops = { - .owner = THIS_MODULE, .open = udp_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index a0f89ad76f9d..2a04dc9c781b 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -42,6 +42,9 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, const struct ipv6hdr *ipv6h; struct udphdr *uh; + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP)) + goto out; + if (!pskb_may_pull(skb, sizeof(struct udphdr))) goto out; diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index 2784cc363f2b..14ae32bb1f3d 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -94,7 +94,6 @@ void udplitev6_exit(void) #ifdef CONFIG_PROC_FS static const struct file_operations udplite6_afinfo_seq_fops = { - .owner = THIS_MODULE, .open = udp_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index fe04e23af986..841f4a07438e 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -32,6 +32,14 @@ int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, } EXPORT_SYMBOL(xfrm6_rcv_spi); +static int xfrm6_transport_finish2(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + if (xfrm_trans_queue(skb, ip6_rcv_finish)) + __kfree_skb(skb); + return -1; +} + int xfrm6_transport_finish(struct sk_buff *skb, int async) { struct xfrm_offload *xo = xfrm_offload(skb); @@ -56,7 +64,7 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, dev_net(skb->dev), NULL, skb, skb->dev, NULL, - ip6_rcv_finish); + xfrm6_transport_finish2); return -1; } diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c index 02556e356f87..bb935a3b7fea 100644 --- a/net/ipv6/xfrm6_mode_tunnel.c +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -59,7 +59,7 @@ static int xfrm6_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) if (x->props.flags & XFRM_STATE_NOECN) dsfield &= ~INET_ECN_MASK; ipv6_change_dsfield(top_iph, 0, dsfield); - top_iph->hop_limit = ip6_dst_hoplimit(dst->child); + top_iph->hop_limit = ip6_dst_hoplimit(xfrm_dst_child(dst)); top_iph->saddr = *(struct in6_addr *)&x->props.saddr; top_iph->daddr = *(struct in6_addr *)&x->id.daddr; return 0; @@ -92,6 +92,7 @@ static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) skb_reset_network_header(skb); skb_mac_header_rebuild(skb); + eth_hdr(skb)->h_proto = skb->protocol; err = 0; @@ -105,17 +106,14 @@ static struct sk_buff *xfrm6_mode_tunnel_gso_segment(struct xfrm_state *x, { __skb_push(skb, skb->mac_len); return skb_mac_gso_segment(skb, features); - } static void xfrm6_mode_tunnel_xmit(struct xfrm_state *x, struct sk_buff *skb) { struct xfrm_offload *xo = xfrm_offload(skb); - if (xo->flags & XFRM_GSO_SEGMENT) { - skb->network_header = skb->network_header - x->props.header_len; + if (xo->flags & XFRM_GSO_SEGMENT) skb->transport_header = skb->network_header + sizeof(struct ipv6hdr); - } skb_reset_mac_len(skb); pskb_pull(skb, skb->mac_len + x->props.header_len); diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 885ade234a49..09fb44ee3b45 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -265,7 +265,7 @@ static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, in6_dev_put(xdst->u.rt6.rt6i_idev); xdst->u.rt6.rt6i_idev = loopback_idev; in6_dev_hold(loopback_idev); - xdst = (struct xfrm_dst *)xdst->u.dst.child; + xdst = (struct xfrm_dst *)xfrm_dst_child(&xdst->u.dst); } while (xdst->u.dst.xfrm); __in6_dev_put(loopback_idev); diff --git a/net/ipx/Kconfig b/net/ipx/Kconfig deleted file mode 100644 index e9ad0062fbb6..000000000000 --- a/net/ipx/Kconfig +++ /dev/null @@ -1,60 +0,0 @@ -# -# IPX configuration -# -config IPX - tristate "The IPX protocol" - select LLC - ---help--- - This is support for the Novell networking protocol, IPX, commonly - used for local networks of Windows machines. You need it if you - want to access Novell NetWare file or print servers using the Linux - Novell client ncpfs (available from - <ftp://platan.vc.cvut.cz/pub/linux/ncpfs/>) or from - within the Linux DOS emulator DOSEMU (read the DOSEMU-HOWTO, - available from <http://www.tldp.org/docs.html#howto>). In order - to do the former, you'll also have to say Y to "NCP file system - support", below. - - IPX is similar in scope to IP, while SPX, which runs on top of IPX, - is similar to TCP. - - To turn your Linux box into a fully featured NetWare file server and - IPX router, say Y here and fetch either lwared from - <ftp://ibiblio.org/pub/Linux/system/network/daemons/> or - mars_nwe from <ftp://www.compu-art.de/mars_nwe/>. For more - information, read the IPX-HOWTO available from - <http://www.tldp.org/docs.html#howto>. - - The IPX driver would enlarge your kernel by about 16 KB. To compile - this driver as a module, choose M here: the module will be called ipx. - Unless you want to integrate your Linux box with a local Novell - network, say N. - -config IPX_INTERN - bool "IPX: Full internal IPX network" - depends on IPX - ---help--- - Every IPX network has an address that identifies it. Sometimes it is - useful to give an IPX "network" address to your Linux box as well - (for example if your box is acting as a file server for different - IPX networks: it will then be accessible from everywhere using the - same address). The way this is done is to create a virtual internal - "network" inside your box and to assign an IPX address to this - network. Say Y here if you want to do this; read the IPX-HOWTO at - <http://www.tldp.org/docs.html#howto> for details. - - The full internal IPX network enables you to allocate sockets on - different virtual nodes of the internal network. This is done by - evaluating the field sipx_node of the socket address given to the - bind call. So applications should always initialize the node field - to 0 when binding a socket on the primary network. In this case the - socket is assigned the default node that has been given to the - kernel when the internal network was created. By enabling the full - internal IPX network the cross-forwarding of packets targeted at - 'special' sockets to sockets listening on the primary network is - disabled. This might break existing applications, especially RIP/SAP - daemons. A RIP/SAP daemon that works well with the full internal net - can be found on <ftp://ftp.gwdg.de/pub/linux/misc/ncpfs/>. - - If you don't know what you are doing, say N. - diff --git a/net/ipx/Makefile b/net/ipx/Makefile deleted file mode 100644 index 440fafa9fd07..000000000000 --- a/net/ipx/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -# -# Makefile for the Linux IPX layer. -# - -obj-$(CONFIG_IPX) += ipx.o - -ipx-y := af_ipx.o ipx_route.o ipx_proc.o pe2.o -ipx-$(CONFIG_SYSCTL) += sysctl_net_ipx.o diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c deleted file mode 100644 index d21a9d128d3e..000000000000 --- a/net/ipx/af_ipx.c +++ /dev/null @@ -1,2084 +0,0 @@ -/* - * Implements an IPX socket layer. - * - * This code is derived from work by - * Ross Biro : Writing the original IP stack - * Fred Van Kempen : Tidying up the TCP/IP - * - * Many thanks go to Keith Baker, Institute For Industrial Information - * Technology Ltd, Swansea University for allowing me to work on this - * in my own time even though it was in some ways related to commercial - * work I am currently employed to do there. - * - * All the material in this file is subject to the Gnu license version 2. - * Neither Alan Cox nor the Swansea University Computer Society admit - * liability nor provide warranty for any of this software. This material - * is provided as is and at no charge. - * - * Portions Copyright (c) 2000-2003 Conectiva, Inc. <acme@conectiva.com.br> - * Neither Arnaldo Carvalho de Melo nor Conectiva, Inc. admit liability nor - * provide warranty for any of this software. This material is provided - * "AS-IS" and at no charge. - * - * Portions Copyright (c) 1995 Caldera, Inc. <greg@caldera.com> - * Neither Greg Page nor Caldera, Inc. admit liability nor provide - * warranty for any of this software. This material is provided - * "AS-IS" and at no charge. - * - * See net/ipx/ChangeLog. - */ - -#include <linux/capability.h> -#include <linux/errno.h> -#include <linux/if_arp.h> -#include <linux/if_ether.h> -#include <linux/init.h> -#include <linux/ipx.h> -#include <linux/kernel.h> -#include <linux/list.h> -#include <linux/module.h> -#include <linux/net.h> -#include <linux/netdevice.h> -#include <linux/uio.h> -#include <linux/slab.h> -#include <linux/skbuff.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/termios.h> - -#include <net/ipx.h> -#include <net/p8022.h> -#include <net/psnap.h> -#include <net/sock.h> -#include <net/datalink.h> -#include <net/tcp_states.h> -#include <net/net_namespace.h> - -#include <linux/uaccess.h> - -/* Configuration Variables */ -static unsigned char ipxcfg_max_hops = 16; -static char ipxcfg_auto_select_primary; -static char ipxcfg_auto_create_interfaces; -int sysctl_ipx_pprop_broadcasting = 1; - -/* Global Variables */ -static struct datalink_proto *p8022_datalink; -static struct datalink_proto *pEII_datalink; -static struct datalink_proto *p8023_datalink; -static struct datalink_proto *pSNAP_datalink; - -static const struct proto_ops ipx_dgram_ops; - -LIST_HEAD(ipx_interfaces); -DEFINE_SPINLOCK(ipx_interfaces_lock); - -struct ipx_interface *ipx_primary_net; -struct ipx_interface *ipx_internal_net; - -struct ipx_interface *ipx_interfaces_head(void) -{ - struct ipx_interface *rc = NULL; - - if (!list_empty(&ipx_interfaces)) - rc = list_entry(ipx_interfaces.next, - struct ipx_interface, node); - return rc; -} - -static void ipxcfg_set_auto_select(char val) -{ - ipxcfg_auto_select_primary = val; - if (val && !ipx_primary_net) - ipx_primary_net = ipx_interfaces_head(); -} - -static int ipxcfg_get_config_data(struct ipx_config_data __user *arg) -{ - struct ipx_config_data vals; - - vals.ipxcfg_auto_create_interfaces = ipxcfg_auto_create_interfaces; - vals.ipxcfg_auto_select_primary = ipxcfg_auto_select_primary; - - return copy_to_user(arg, &vals, sizeof(vals)) ? -EFAULT : 0; -} - -/* - * Note: Sockets may not be removed _during_ an interrupt or inet_bh - * handler using this technique. They can be added although we do not - * use this facility. - */ - -static void ipx_remove_socket(struct sock *sk) -{ - /* Determine interface with which socket is associated */ - struct ipx_interface *intrfc = ipx_sk(sk)->intrfc; - - if (!intrfc) - goto out; - - ipxitf_hold(intrfc); - spin_lock_bh(&intrfc->if_sklist_lock); - sk_del_node_init(sk); - spin_unlock_bh(&intrfc->if_sklist_lock); - ipxitf_put(intrfc); -out: - return; -} - -static void ipx_destroy_socket(struct sock *sk) -{ - ipx_remove_socket(sk); - skb_queue_purge(&sk->sk_receive_queue); - sk_refcnt_debug_dec(sk); -} - -/* - * The following code is used to support IPX Interfaces (IPXITF). An - * IPX interface is defined by a physical device and a frame type. - */ - -/* ipxitf_clear_primary_net has to be called with ipx_interfaces_lock held */ - -static void ipxitf_clear_primary_net(void) -{ - ipx_primary_net = NULL; - if (ipxcfg_auto_select_primary) - ipx_primary_net = ipx_interfaces_head(); -} - -static struct ipx_interface *__ipxitf_find_using_phys(struct net_device *dev, - __be16 datalink) -{ - struct ipx_interface *i; - - list_for_each_entry(i, &ipx_interfaces, node) - if (i->if_dev == dev && i->if_dlink_type == datalink) - goto out; - i = NULL; -out: - return i; -} - -static struct ipx_interface *ipxitf_find_using_phys(struct net_device *dev, - __be16 datalink) -{ - struct ipx_interface *i; - - spin_lock_bh(&ipx_interfaces_lock); - i = __ipxitf_find_using_phys(dev, datalink); - if (i) - ipxitf_hold(i); - spin_unlock_bh(&ipx_interfaces_lock); - return i; -} - -struct ipx_interface *ipxitf_find_using_net(__be32 net) -{ - struct ipx_interface *i; - - spin_lock_bh(&ipx_interfaces_lock); - if (net) { - list_for_each_entry(i, &ipx_interfaces, node) - if (i->if_netnum == net) - goto hold; - i = NULL; - goto unlock; - } - - i = ipx_primary_net; - if (i) -hold: - ipxitf_hold(i); -unlock: - spin_unlock_bh(&ipx_interfaces_lock); - return i; -} - -/* Sockets are bound to a particular IPX interface. */ -static void ipxitf_insert_socket(struct ipx_interface *intrfc, struct sock *sk) -{ - ipxitf_hold(intrfc); - spin_lock_bh(&intrfc->if_sklist_lock); - ipx_sk(sk)->intrfc = intrfc; - sk_add_node(sk, &intrfc->if_sklist); - spin_unlock_bh(&intrfc->if_sklist_lock); - ipxitf_put(intrfc); -} - -/* caller must hold intrfc->if_sklist_lock */ -static struct sock *__ipxitf_find_socket(struct ipx_interface *intrfc, - __be16 port) -{ - struct sock *s; - - sk_for_each(s, &intrfc->if_sklist) - if (ipx_sk(s)->port == port) - goto found; - s = NULL; -found: - return s; -} - -/* caller must hold a reference to intrfc */ -static struct sock *ipxitf_find_socket(struct ipx_interface *intrfc, - __be16 port) -{ - struct sock *s; - - spin_lock_bh(&intrfc->if_sklist_lock); - s = __ipxitf_find_socket(intrfc, port); - if (s) - sock_hold(s); - spin_unlock_bh(&intrfc->if_sklist_lock); - - return s; -} - -#ifdef CONFIG_IPX_INTERN -static struct sock *ipxitf_find_internal_socket(struct ipx_interface *intrfc, - unsigned char *ipx_node, - __be16 port) -{ - struct sock *s; - - ipxitf_hold(intrfc); - spin_lock_bh(&intrfc->if_sklist_lock); - - sk_for_each(s, &intrfc->if_sklist) { - struct ipx_sock *ipxs = ipx_sk(s); - - if (ipxs->port == port && - !memcmp(ipx_node, ipxs->node, IPX_NODE_LEN)) - goto found; - } - s = NULL; -found: - spin_unlock_bh(&intrfc->if_sklist_lock); - ipxitf_put(intrfc); - return s; -} -#endif - -static void __ipxitf_down(struct ipx_interface *intrfc) -{ - struct sock *s; - struct hlist_node *t; - - /* Delete all routes associated with this interface */ - ipxrtr_del_routes(intrfc); - - spin_lock_bh(&intrfc->if_sklist_lock); - /* error sockets */ - sk_for_each_safe(s, t, &intrfc->if_sklist) { - struct ipx_sock *ipxs = ipx_sk(s); - - s->sk_err = ENOLINK; - s->sk_error_report(s); - ipxs->intrfc = NULL; - ipxs->port = 0; - sock_set_flag(s, SOCK_ZAPPED); /* Indicates it is no longer bound */ - sk_del_node_init(s); - } - INIT_HLIST_HEAD(&intrfc->if_sklist); - spin_unlock_bh(&intrfc->if_sklist_lock); - - /* remove this interface from list */ - list_del(&intrfc->node); - - /* remove this interface from *special* networks */ - if (intrfc == ipx_primary_net) - ipxitf_clear_primary_net(); - if (intrfc == ipx_internal_net) - ipx_internal_net = NULL; - - if (intrfc->if_dev) - dev_put(intrfc->if_dev); - kfree(intrfc); -} - -void ipxitf_down(struct ipx_interface *intrfc) -{ - spin_lock_bh(&ipx_interfaces_lock); - __ipxitf_down(intrfc); - spin_unlock_bh(&ipx_interfaces_lock); -} - -static void __ipxitf_put(struct ipx_interface *intrfc) -{ - if (refcount_dec_and_test(&intrfc->refcnt)) - __ipxitf_down(intrfc); -} - -static int ipxitf_device_event(struct notifier_block *notifier, - unsigned long event, void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct ipx_interface *i, *tmp; - - if (!net_eq(dev_net(dev), &init_net)) - return NOTIFY_DONE; - - if (event != NETDEV_DOWN && event != NETDEV_UP) - goto out; - - spin_lock_bh(&ipx_interfaces_lock); - list_for_each_entry_safe(i, tmp, &ipx_interfaces, node) - if (i->if_dev == dev) { - if (event == NETDEV_UP) - ipxitf_hold(i); - else - __ipxitf_put(i); - } - spin_unlock_bh(&ipx_interfaces_lock); -out: - return NOTIFY_DONE; -} - - -static __exit void ipxitf_cleanup(void) -{ - struct ipx_interface *i, *tmp; - - spin_lock_bh(&ipx_interfaces_lock); - list_for_each_entry_safe(i, tmp, &ipx_interfaces, node) - __ipxitf_put(i); - spin_unlock_bh(&ipx_interfaces_lock); -} - -static void ipxitf_def_skb_handler(struct sock *sock, struct sk_buff *skb) -{ - if (sock_queue_rcv_skb(sock, skb) < 0) - kfree_skb(skb); -} - -/* - * On input skb->sk is NULL. Nobody is charged for the memory. - */ - -/* caller must hold a reference to intrfc */ - -#ifdef CONFIG_IPX_INTERN -static int ipxitf_demux_socket(struct ipx_interface *intrfc, - struct sk_buff *skb, int copy) -{ - struct ipxhdr *ipx = ipx_hdr(skb); - int is_broadcast = !memcmp(ipx->ipx_dest.node, ipx_broadcast_node, - IPX_NODE_LEN); - struct sock *s; - int rc; - - spin_lock_bh(&intrfc->if_sklist_lock); - - sk_for_each(s, &intrfc->if_sklist) { - struct ipx_sock *ipxs = ipx_sk(s); - - if (ipxs->port == ipx->ipx_dest.sock && - (is_broadcast || !memcmp(ipx->ipx_dest.node, - ipxs->node, IPX_NODE_LEN))) { - /* We found a socket to which to send */ - struct sk_buff *skb1; - - if (copy) { - skb1 = skb_clone(skb, GFP_ATOMIC); - rc = -ENOMEM; - if (!skb1) - goto out; - } else { - skb1 = skb; - copy = 1; /* skb may only be used once */ - } - ipxitf_def_skb_handler(s, skb1); - - /* On an external interface, one socket can listen */ - if (intrfc != ipx_internal_net) - break; - } - } - - /* skb was solely for us, and we did not make a copy, so free it. */ - if (!copy) - kfree_skb(skb); - - rc = 0; -out: - spin_unlock_bh(&intrfc->if_sklist_lock); - return rc; -} -#else -static struct sock *ncp_connection_hack(struct ipx_interface *intrfc, - struct ipxhdr *ipx) -{ - /* The packet's target is a NCP connection handler. We want to hand it - * to the correct socket directly within the kernel, so that the - * mars_nwe packet distribution process does not have to do it. Here we - * only care about NCP and BURST packets. - * - * You might call this a hack, but believe me, you do not want a - * complete NCP layer in the kernel, and this is VERY fast as well. */ - struct sock *sk = NULL; - int connection = 0; - u8 *ncphdr = (u8 *)(ipx + 1); - - if (*ncphdr == 0x22 && *(ncphdr + 1) == 0x22) /* NCP request */ - connection = (((int) *(ncphdr + 5)) << 8) | (int) *(ncphdr + 3); - else if (*ncphdr == 0x77 && *(ncphdr + 1) == 0x77) /* BURST packet */ - connection = (((int) *(ncphdr + 9)) << 8) | (int) *(ncphdr + 8); - - if (connection) { - /* Now we have to look for a special NCP connection handling - * socket. Only these sockets have ipx_ncp_conn != 0, set by - * SIOCIPXNCPCONN. */ - spin_lock_bh(&intrfc->if_sklist_lock); - sk_for_each(sk, &intrfc->if_sklist) - if (ipx_sk(sk)->ipx_ncp_conn == connection) { - sock_hold(sk); - goto found; - } - sk = NULL; - found: - spin_unlock_bh(&intrfc->if_sklist_lock); - } - return sk; -} - -static int ipxitf_demux_socket(struct ipx_interface *intrfc, - struct sk_buff *skb, int copy) -{ - struct ipxhdr *ipx = ipx_hdr(skb); - struct sock *sock1 = NULL, *sock2 = NULL; - struct sk_buff *skb1 = NULL, *skb2 = NULL; - int rc; - - if (intrfc == ipx_primary_net && ntohs(ipx->ipx_dest.sock) == 0x451) - sock1 = ncp_connection_hack(intrfc, ipx); - if (!sock1) - /* No special socket found, forward the packet the normal way */ - sock1 = ipxitf_find_socket(intrfc, ipx->ipx_dest.sock); - - /* - * We need to check if there is a primary net and if - * this is addressed to one of the *SPECIAL* sockets because - * these need to be propagated to the primary net. - * The *SPECIAL* socket list contains: 0x452(SAP), 0x453(RIP) and - * 0x456(Diagnostic). - */ - - if (ipx_primary_net && intrfc != ipx_primary_net) { - const int dsock = ntohs(ipx->ipx_dest.sock); - - if (dsock == 0x452 || dsock == 0x453 || dsock == 0x456) - /* The appropriate thing to do here is to dup the - * packet and route to the primary net interface via - * ipxitf_send; however, we'll cheat and just demux it - * here. */ - sock2 = ipxitf_find_socket(ipx_primary_net, - ipx->ipx_dest.sock); - } - - /* - * If there is nothing to do return. The kfree will cancel any charging. - */ - rc = 0; - if (!sock1 && !sock2) { - if (!copy) - kfree_skb(skb); - goto out; - } - - /* - * This next segment of code is a little awkward, but it sets it up - * so that the appropriate number of copies of the SKB are made and - * that skb1 and skb2 point to it (them) so that it (they) can be - * demuxed to sock1 and/or sock2. If we are unable to make enough - * copies, we do as much as is possible. - */ - - if (copy) - skb1 = skb_clone(skb, GFP_ATOMIC); - else - skb1 = skb; - - rc = -ENOMEM; - if (!skb1) - goto out_put; - - /* Do we need 2 SKBs? */ - if (sock1 && sock2) - skb2 = skb_clone(skb1, GFP_ATOMIC); - else - skb2 = skb1; - - if (sock1) - ipxitf_def_skb_handler(sock1, skb1); - - if (!skb2) - goto out_put; - - if (sock2) - ipxitf_def_skb_handler(sock2, skb2); - - rc = 0; -out_put: - if (sock1) - sock_put(sock1); - if (sock2) - sock_put(sock2); -out: - return rc; -} -#endif /* CONFIG_IPX_INTERN */ - -static struct sk_buff *ipxitf_adjust_skbuff(struct ipx_interface *intrfc, - struct sk_buff *skb) -{ - struct sk_buff *skb2; - int in_offset = (unsigned char *)ipx_hdr(skb) - skb->head; - int out_offset = intrfc->if_ipx_offset; - int len; - - /* Hopefully, most cases */ - if (in_offset >= out_offset) - return skb; - - /* Need new SKB */ - len = skb->len + out_offset; - skb2 = alloc_skb(len, GFP_ATOMIC); - if (skb2) { - skb_reserve(skb2, out_offset); - skb_reset_network_header(skb2); - skb_reset_transport_header(skb2); - skb_put(skb2, skb->len); - memcpy(ipx_hdr(skb2), ipx_hdr(skb), skb->len); - memcpy(skb2->cb, skb->cb, sizeof(skb->cb)); - } - kfree_skb(skb); - return skb2; -} - -/* caller must hold a reference to intrfc and the skb has to be unshared */ -int ipxitf_send(struct ipx_interface *intrfc, struct sk_buff *skb, char *node) -{ - struct ipxhdr *ipx = ipx_hdr(skb); - struct net_device *dev = intrfc->if_dev; - struct datalink_proto *dl = intrfc->if_dlink; - char dest_node[IPX_NODE_LEN]; - int send_to_wire = 1; - int addr_len; - - ipx->ipx_tctrl = IPX_SKB_CB(skb)->ipx_tctrl; - ipx->ipx_dest.net = IPX_SKB_CB(skb)->ipx_dest_net; - ipx->ipx_source.net = IPX_SKB_CB(skb)->ipx_source_net; - - /* see if we need to include the netnum in the route list */ - if (IPX_SKB_CB(skb)->last_hop.index >= 0) { - __be32 *last_hop = (__be32 *)(((u8 *) skb->data) + - sizeof(struct ipxhdr) + - IPX_SKB_CB(skb)->last_hop.index * - sizeof(__be32)); - *last_hop = IPX_SKB_CB(skb)->last_hop.netnum; - IPX_SKB_CB(skb)->last_hop.index = -1; - } - - /* - * We need to know how many skbuffs it will take to send out this - * packet to avoid unnecessary copies. - */ - - if (!dl || !dev || dev->flags & IFF_LOOPBACK) - send_to_wire = 0; /* No non looped */ - - /* - * See if this should be demuxed to sockets on this interface - * - * We want to ensure the original was eaten or that we only use - * up clones. - */ - - if (ipx->ipx_dest.net == intrfc->if_netnum) { - /* - * To our own node, loop and free the original. - * The internal net will receive on all node address. - */ - if (intrfc == ipx_internal_net || - !memcmp(intrfc->if_node, node, IPX_NODE_LEN)) { - /* Don't charge sender */ - skb_orphan(skb); - - /* Will charge receiver */ - return ipxitf_demux_socket(intrfc, skb, 0); - } - - /* Broadcast, loop and possibly keep to send on. */ - if (!memcmp(ipx_broadcast_node, node, IPX_NODE_LEN)) { - if (!send_to_wire) - skb_orphan(skb); - ipxitf_demux_socket(intrfc, skb, send_to_wire); - if (!send_to_wire) - goto out; - } - } - - /* - * If the originating net is not equal to our net; this is routed - * We are still charging the sender. Which is right - the driver - * free will handle this fairly. - */ - if (ipx->ipx_source.net != intrfc->if_netnum) { - /* - * Unshare the buffer before modifying the count in - * case it's a flood or tcpdump - */ - skb = skb_unshare(skb, GFP_ATOMIC); - if (!skb) - goto out; - if (++ipx->ipx_tctrl > ipxcfg_max_hops) - send_to_wire = 0; - } - - if (!send_to_wire) { - kfree_skb(skb); - goto out; - } - - /* Determine the appropriate hardware address */ - addr_len = dev->addr_len; - if (!memcmp(ipx_broadcast_node, node, IPX_NODE_LEN)) - memcpy(dest_node, dev->broadcast, addr_len); - else - memcpy(dest_node, &(node[IPX_NODE_LEN-addr_len]), addr_len); - - /* Make any compensation for differing physical/data link size */ - skb = ipxitf_adjust_skbuff(intrfc, skb); - if (!skb) - goto out; - - /* set up data link and physical headers */ - skb->dev = dev; - skb->protocol = htons(ETH_P_IPX); - - /* Send it out */ - dl->request(dl, skb, dest_node); -out: - return 0; -} - -static int ipxitf_add_local_route(struct ipx_interface *intrfc) -{ - return ipxrtr_add_route(intrfc->if_netnum, intrfc, NULL); -} - -static void ipxitf_discover_netnum(struct ipx_interface *intrfc, - struct sk_buff *skb); -static int ipxitf_pprop(struct ipx_interface *intrfc, struct sk_buff *skb); - -static int ipxitf_rcv(struct ipx_interface *intrfc, struct sk_buff *skb) -{ - struct ipxhdr *ipx = ipx_hdr(skb); - int rc = 0; - - ipxitf_hold(intrfc); - - /* See if we should update our network number */ - if (!intrfc->if_netnum) /* net number of intrfc not known yet */ - ipxitf_discover_netnum(intrfc, skb); - - IPX_SKB_CB(skb)->last_hop.index = -1; - if (ipx->ipx_type == IPX_TYPE_PPROP) { - rc = ipxitf_pprop(intrfc, skb); - if (rc) - goto out_free_skb; - } - - /* local processing follows */ - if (!IPX_SKB_CB(skb)->ipx_dest_net) - IPX_SKB_CB(skb)->ipx_dest_net = intrfc->if_netnum; - if (!IPX_SKB_CB(skb)->ipx_source_net) - IPX_SKB_CB(skb)->ipx_source_net = intrfc->if_netnum; - - /* it doesn't make sense to route a pprop packet, there's no meaning - * in the ipx_dest_net for such packets */ - if (ipx->ipx_type != IPX_TYPE_PPROP && - intrfc->if_netnum != IPX_SKB_CB(skb)->ipx_dest_net) { - /* We only route point-to-point packets. */ - if (skb->pkt_type == PACKET_HOST) { - skb = skb_unshare(skb, GFP_ATOMIC); - if (skb) - rc = ipxrtr_route_skb(skb); - goto out_intrfc; - } - - goto out_free_skb; - } - - /* see if we should keep it */ - if (!memcmp(ipx_broadcast_node, ipx->ipx_dest.node, IPX_NODE_LEN) || - !memcmp(intrfc->if_node, ipx->ipx_dest.node, IPX_NODE_LEN)) { - rc = ipxitf_demux_socket(intrfc, skb, 0); - goto out_intrfc; - } - - /* we couldn't pawn it off so unload it */ -out_free_skb: - kfree_skb(skb); -out_intrfc: - ipxitf_put(intrfc); - return rc; -} - -static void ipxitf_discover_netnum(struct ipx_interface *intrfc, - struct sk_buff *skb) -{ - const struct ipx_cb *cb = IPX_SKB_CB(skb); - - /* see if this is an intra packet: source_net == dest_net */ - if (cb->ipx_source_net == cb->ipx_dest_net && cb->ipx_source_net) { - struct ipx_interface *i = - ipxitf_find_using_net(cb->ipx_source_net); - /* NB: NetWare servers lie about their hop count so we - * dropped the test based on it. This is the best way - * to determine this is a 0 hop count packet. */ - if (!i) { - intrfc->if_netnum = cb->ipx_source_net; - ipxitf_add_local_route(intrfc); - } else { - printk(KERN_WARNING "IPX: Network number collision " - "%lx\n %s %s and %s %s\n", - (unsigned long) ntohl(cb->ipx_source_net), - ipx_device_name(i), - ipx_frame_name(i->if_dlink_type), - ipx_device_name(intrfc), - ipx_frame_name(intrfc->if_dlink_type)); - ipxitf_put(i); - } - } -} - -/** - * ipxitf_pprop - Process packet propagation IPX packet type 0x14, used for - * NetBIOS broadcasts - * @intrfc: IPX interface receiving this packet - * @skb: Received packet - * - * Checks if packet is valid: if its more than %IPX_MAX_PPROP_HOPS hops or if it - * is smaller than a IPX header + the room for %IPX_MAX_PPROP_HOPS hops we drop - * it, not even processing it locally, if it has exact %IPX_MAX_PPROP_HOPS we - * don't broadcast it, but process it locally. See chapter 5 of Novell's "IPX - * RIP and SAP Router Specification", Part Number 107-000029-001. - * - * If it is valid, check if we have pprop broadcasting enabled by the user, - * if not, just return zero for local processing. - * - * If it is enabled check the packet and don't broadcast it if we have already - * seen this packet. - * - * Broadcast: send it to the interfaces that aren't on the packet visited nets - * array, just after the IPX header. - * - * Returns -EINVAL for invalid packets, so that the calling function drops - * the packet without local processing. 0 if packet is to be locally processed. - */ -static int ipxitf_pprop(struct ipx_interface *intrfc, struct sk_buff *skb) -{ - struct ipxhdr *ipx = ipx_hdr(skb); - int i, rc = -EINVAL; - struct ipx_interface *ifcs; - char *c; - __be32 *l; - - /* Illegal packet - too many hops or too short */ - /* We decide to throw it away: no broadcasting, no local processing. - * NetBIOS unaware implementations route them as normal packets - - * tctrl <= 15, any data payload... */ - if (IPX_SKB_CB(skb)->ipx_tctrl > IPX_MAX_PPROP_HOPS || - ntohs(ipx->ipx_pktsize) < sizeof(struct ipxhdr) + - IPX_MAX_PPROP_HOPS * sizeof(u32)) - goto out; - /* are we broadcasting this damn thing? */ - rc = 0; - if (!sysctl_ipx_pprop_broadcasting) - goto out; - /* We do broadcast packet on the IPX_MAX_PPROP_HOPS hop, but we - * process it locally. All previous hops broadcasted it, and process it - * locally. */ - if (IPX_SKB_CB(skb)->ipx_tctrl == IPX_MAX_PPROP_HOPS) - goto out; - - c = ((u8 *) ipx) + sizeof(struct ipxhdr); - l = (__be32 *) c; - - /* Don't broadcast packet if already seen this net */ - for (i = 0; i < IPX_SKB_CB(skb)->ipx_tctrl; i++) - if (*l++ == intrfc->if_netnum) - goto out; - - /* < IPX_MAX_PPROP_HOPS hops && input interface not in list. Save the - * position where we will insert recvd netnum into list, later on, - * in ipxitf_send */ - IPX_SKB_CB(skb)->last_hop.index = i; - IPX_SKB_CB(skb)->last_hop.netnum = intrfc->if_netnum; - /* xmit on all other interfaces... */ - spin_lock_bh(&ipx_interfaces_lock); - list_for_each_entry(ifcs, &ipx_interfaces, node) { - /* Except unconfigured interfaces */ - if (!ifcs->if_netnum) - continue; - - /* That aren't in the list */ - if (ifcs == intrfc) - continue; - l = (__be32 *) c; - /* don't consider the last entry in the packet list, - * it is our netnum, and it is not there yet */ - for (i = 0; i < IPX_SKB_CB(skb)->ipx_tctrl; i++) - if (ifcs->if_netnum == *l++) - break; - if (i == IPX_SKB_CB(skb)->ipx_tctrl) { - struct sk_buff *s = skb_copy(skb, GFP_ATOMIC); - - if (s) { - IPX_SKB_CB(s)->ipx_dest_net = ifcs->if_netnum; - ipxrtr_route_skb(s); - } - } - } - spin_unlock_bh(&ipx_interfaces_lock); -out: - return rc; -} - -static void ipxitf_insert(struct ipx_interface *intrfc) -{ - spin_lock_bh(&ipx_interfaces_lock); - list_add_tail(&intrfc->node, &ipx_interfaces); - spin_unlock_bh(&ipx_interfaces_lock); - - if (ipxcfg_auto_select_primary && !ipx_primary_net) - ipx_primary_net = intrfc; -} - -static struct ipx_interface *ipxitf_alloc(struct net_device *dev, __be32 netnum, - __be16 dlink_type, - struct datalink_proto *dlink, - unsigned char internal, - int ipx_offset) -{ - struct ipx_interface *intrfc = kmalloc(sizeof(*intrfc), GFP_ATOMIC); - - if (intrfc) { - intrfc->if_dev = dev; - intrfc->if_netnum = netnum; - intrfc->if_dlink_type = dlink_type; - intrfc->if_dlink = dlink; - intrfc->if_internal = internal; - intrfc->if_ipx_offset = ipx_offset; - intrfc->if_sknum = IPX_MIN_EPHEMERAL_SOCKET; - INIT_HLIST_HEAD(&intrfc->if_sklist); - refcount_set(&intrfc->refcnt, 1); - spin_lock_init(&intrfc->if_sklist_lock); - } - - return intrfc; -} - -static int ipxitf_create_internal(struct ipx_interface_definition *idef) -{ - struct ipx_interface *intrfc; - int rc = -EEXIST; - - /* Only one primary network allowed */ - if (ipx_primary_net) - goto out; - - /* Must have a valid network number */ - rc = -EADDRNOTAVAIL; - if (!idef->ipx_network) - goto out; - intrfc = ipxitf_find_using_net(idef->ipx_network); - rc = -EADDRINUSE; - if (intrfc) { - ipxitf_put(intrfc); - goto out; - } - intrfc = ipxitf_alloc(NULL, idef->ipx_network, 0, NULL, 1, 0); - rc = -EAGAIN; - if (!intrfc) - goto out; - memcpy((char *)&(intrfc->if_node), idef->ipx_node, IPX_NODE_LEN); - ipx_internal_net = ipx_primary_net = intrfc; - ipxitf_hold(intrfc); - ipxitf_insert(intrfc); - - rc = ipxitf_add_local_route(intrfc); - ipxitf_put(intrfc); -out: - return rc; -} - -static __be16 ipx_map_frame_type(unsigned char type) -{ - __be16 rc = 0; - - switch (type) { - case IPX_FRAME_ETHERII: rc = htons(ETH_P_IPX); break; - case IPX_FRAME_8022: rc = htons(ETH_P_802_2); break; - case IPX_FRAME_SNAP: rc = htons(ETH_P_SNAP); break; - case IPX_FRAME_8023: rc = htons(ETH_P_802_3); break; - } - - return rc; -} - -static int ipxitf_create(struct ipx_interface_definition *idef) -{ - struct net_device *dev; - __be16 dlink_type = 0; - struct datalink_proto *datalink = NULL; - struct ipx_interface *intrfc; - int rc; - - if (idef->ipx_special == IPX_INTERNAL) { - rc = ipxitf_create_internal(idef); - goto out; - } - - rc = -EEXIST; - if (idef->ipx_special == IPX_PRIMARY && ipx_primary_net) - goto out; - - intrfc = ipxitf_find_using_net(idef->ipx_network); - rc = -EADDRINUSE; - if (idef->ipx_network && intrfc) { - ipxitf_put(intrfc); - goto out; - } - - if (intrfc) - ipxitf_put(intrfc); - - dev = dev_get_by_name(&init_net, idef->ipx_device); - rc = -ENODEV; - if (!dev) - goto out; - - switch (idef->ipx_dlink_type) { - case IPX_FRAME_8022: - dlink_type = htons(ETH_P_802_2); - datalink = p8022_datalink; - break; - case IPX_FRAME_ETHERII: - if (dev->type != ARPHRD_IEEE802) { - dlink_type = htons(ETH_P_IPX); - datalink = pEII_datalink; - break; - } - /* fall through */ - case IPX_FRAME_SNAP: - dlink_type = htons(ETH_P_SNAP); - datalink = pSNAP_datalink; - break; - case IPX_FRAME_8023: - dlink_type = htons(ETH_P_802_3); - datalink = p8023_datalink; - break; - case IPX_FRAME_NONE: - default: - rc = -EPROTONOSUPPORT; - goto out_dev; - } - - rc = -ENETDOWN; - if (!(dev->flags & IFF_UP)) - goto out_dev; - - /* Check addresses are suitable */ - rc = -EINVAL; - if (dev->addr_len > IPX_NODE_LEN) - goto out_dev; - - intrfc = ipxitf_find_using_phys(dev, dlink_type); - if (!intrfc) { - /* Ok now create */ - intrfc = ipxitf_alloc(dev, idef->ipx_network, dlink_type, - datalink, 0, dev->hard_header_len + - datalink->header_length); - rc = -EAGAIN; - if (!intrfc) - goto out_dev; - /* Setup primary if necessary */ - if (idef->ipx_special == IPX_PRIMARY) - ipx_primary_net = intrfc; - if (!memcmp(idef->ipx_node, "\000\000\000\000\000\000", - IPX_NODE_LEN)) { - memset(intrfc->if_node, 0, IPX_NODE_LEN); - memcpy(intrfc->if_node + IPX_NODE_LEN - dev->addr_len, - dev->dev_addr, dev->addr_len); - } else - memcpy(intrfc->if_node, idef->ipx_node, IPX_NODE_LEN); - ipxitf_hold(intrfc); - ipxitf_insert(intrfc); - } - - - /* If the network number is known, add a route */ - rc = 0; - if (!intrfc->if_netnum) - goto out_intrfc; - - rc = ipxitf_add_local_route(intrfc); -out_intrfc: - ipxitf_put(intrfc); - goto out; -out_dev: - dev_put(dev); -out: - return rc; -} - -static int ipxitf_delete(struct ipx_interface_definition *idef) -{ - struct net_device *dev = NULL; - __be16 dlink_type = 0; - struct ipx_interface *intrfc; - int rc = 0; - - spin_lock_bh(&ipx_interfaces_lock); - if (idef->ipx_special == IPX_INTERNAL) { - if (ipx_internal_net) { - __ipxitf_put(ipx_internal_net); - goto out; - } - rc = -ENOENT; - goto out; - } - - dlink_type = ipx_map_frame_type(idef->ipx_dlink_type); - rc = -EPROTONOSUPPORT; - if (!dlink_type) - goto out; - - dev = __dev_get_by_name(&init_net, idef->ipx_device); - rc = -ENODEV; - if (!dev) - goto out; - - intrfc = __ipxitf_find_using_phys(dev, dlink_type); - rc = -EINVAL; - if (!intrfc) - goto out; - __ipxitf_put(intrfc); - - rc = 0; -out: - spin_unlock_bh(&ipx_interfaces_lock); - return rc; -} - -static struct ipx_interface *ipxitf_auto_create(struct net_device *dev, - __be16 dlink_type) -{ - struct ipx_interface *intrfc = NULL; - struct datalink_proto *datalink; - - if (!dev) - goto out; - - /* Check addresses are suitable */ - if (dev->addr_len > IPX_NODE_LEN) - goto out; - - switch (ntohs(dlink_type)) { - case ETH_P_IPX: datalink = pEII_datalink; break; - case ETH_P_802_2: datalink = p8022_datalink; break; - case ETH_P_SNAP: datalink = pSNAP_datalink; break; - case ETH_P_802_3: datalink = p8023_datalink; break; - default: goto out; - } - - intrfc = ipxitf_alloc(dev, 0, dlink_type, datalink, 0, - dev->hard_header_len + datalink->header_length); - - if (intrfc) { - memset(intrfc->if_node, 0, IPX_NODE_LEN); - memcpy((char *)&(intrfc->if_node[IPX_NODE_LEN-dev->addr_len]), - dev->dev_addr, dev->addr_len); - spin_lock_init(&intrfc->if_sklist_lock); - refcount_set(&intrfc->refcnt, 1); - ipxitf_insert(intrfc); - dev_hold(dev); - } - -out: - return intrfc; -} - -static int ipxitf_ioctl(unsigned int cmd, void __user *arg) -{ - int rc = -EINVAL; - struct ifreq ifr; - int val; - - switch (cmd) { - case SIOCSIFADDR: { - struct sockaddr_ipx *sipx; - struct ipx_interface_definition f; - - rc = -EFAULT; - if (copy_from_user(&ifr, arg, sizeof(ifr))) - break; - sipx = (struct sockaddr_ipx *)&ifr.ifr_addr; - rc = -EINVAL; - if (sipx->sipx_family != AF_IPX) - break; - f.ipx_network = sipx->sipx_network; - memcpy(f.ipx_device, ifr.ifr_name, - sizeof(f.ipx_device)); - memcpy(f.ipx_node, sipx->sipx_node, IPX_NODE_LEN); - f.ipx_dlink_type = sipx->sipx_type; - f.ipx_special = sipx->sipx_special; - - if (sipx->sipx_action == IPX_DLTITF) - rc = ipxitf_delete(&f); - else - rc = ipxitf_create(&f); - break; - } - case SIOCGIFADDR: { - struct sockaddr_ipx *sipx; - struct ipx_interface *ipxif; - struct net_device *dev; - - rc = -EFAULT; - if (copy_from_user(&ifr, arg, sizeof(ifr))) - break; - sipx = (struct sockaddr_ipx *)&ifr.ifr_addr; - dev = __dev_get_by_name(&init_net, ifr.ifr_name); - rc = -ENODEV; - if (!dev) - break; - ipxif = ipxitf_find_using_phys(dev, - ipx_map_frame_type(sipx->sipx_type)); - rc = -EADDRNOTAVAIL; - if (!ipxif) - break; - - sipx->sipx_family = AF_IPX; - sipx->sipx_network = ipxif->if_netnum; - memcpy(sipx->sipx_node, ipxif->if_node, - sizeof(sipx->sipx_node)); - rc = 0; - if (copy_to_user(arg, &ifr, sizeof(ifr))) - rc = -EFAULT; - ipxitf_put(ipxif); - break; - } - case SIOCAIPXITFCRT: - rc = -EFAULT; - if (get_user(val, (unsigned char __user *) arg)) - break; - rc = 0; - ipxcfg_auto_create_interfaces = val; - break; - case SIOCAIPXPRISLT: - rc = -EFAULT; - if (get_user(val, (unsigned char __user *) arg)) - break; - rc = 0; - ipxcfg_set_auto_select(val); - break; - } - - return rc; -} - -/* - * Checksum routine for IPX - */ - -/* Note: We assume ipx_tctrl==0 and htons(length)==ipx_pktsize */ -/* This functions should *not* mess with packet contents */ - -__be16 ipx_cksum(struct ipxhdr *packet, int length) -{ - /* - * NOTE: sum is a net byte order quantity, which optimizes the - * loop. This only works on big and little endian machines. (I - * don't know of a machine that isn't.) - */ - /* handle the first 3 words separately; checksum should be skipped - * and ipx_tctrl masked out */ - __u16 *p = (__u16 *)packet; - __u32 sum = p[1] + (p[2] & (__force u16)htons(0x00ff)); - __u32 i = (length >> 1) - 3; /* Number of remaining complete words */ - - /* Loop through them */ - p += 3; - while (i--) - sum += *p++; - - /* Add on the last part word if it exists */ - if (packet->ipx_pktsize & htons(1)) - sum += (__force u16)htons(0xff00) & *p; - - /* Do final fixup */ - sum = (sum & 0xffff) + (sum >> 16); - - /* It's a pity there's no concept of carry in C */ - if (sum >= 0x10000) - sum++; - - /* - * Leave 0 alone; we don't want 0xffff here. Note that we can't get - * here with 0x10000, so this check is the same as ((__u16)sum) - */ - if (sum) - sum = ~sum; - - return (__force __be16)sum; -} - -const char *ipx_frame_name(__be16 frame) -{ - char* rc = "None"; - - switch (ntohs(frame)) { - case ETH_P_IPX: rc = "EtherII"; break; - case ETH_P_802_2: rc = "802.2"; break; - case ETH_P_SNAP: rc = "SNAP"; break; - case ETH_P_802_3: rc = "802.3"; break; - } - - return rc; -} - -const char *ipx_device_name(struct ipx_interface *intrfc) -{ - return intrfc->if_internal ? "Internal" : - intrfc->if_dev ? intrfc->if_dev->name : "Unknown"; -} - -/* Handling for system calls applied via the various interfaces to an IPX - * socket object. */ - -static int ipx_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) -{ - struct sock *sk = sock->sk; - int opt; - int rc = -EINVAL; - - lock_sock(sk); - if (optlen != sizeof(int)) - goto out; - - rc = -EFAULT; - if (get_user(opt, (unsigned int __user *)optval)) - goto out; - - rc = -ENOPROTOOPT; - if (!(level == SOL_IPX && optname == IPX_TYPE)) - goto out; - - ipx_sk(sk)->type = opt; - rc = 0; -out: - release_sock(sk); - return rc; -} - -static int ipx_getsockopt(struct socket *sock, int level, int optname, - char __user *optval, int __user *optlen) -{ - struct sock *sk = sock->sk; - int val = 0; - int len; - int rc = -ENOPROTOOPT; - - lock_sock(sk); - if (!(level == SOL_IPX && optname == IPX_TYPE)) - goto out; - - val = ipx_sk(sk)->type; - - rc = -EFAULT; - if (get_user(len, optlen)) - goto out; - - len = min_t(unsigned int, len, sizeof(int)); - rc = -EINVAL; - if(len < 0) - goto out; - - rc = -EFAULT; - if (put_user(len, optlen) || copy_to_user(optval, &val, len)) - goto out; - - rc = 0; -out: - release_sock(sk); - return rc; -} - -static struct proto ipx_proto = { - .name = "IPX", - .owner = THIS_MODULE, - .obj_size = sizeof(struct ipx_sock), -}; - -static int ipx_create(struct net *net, struct socket *sock, int protocol, - int kern) -{ - int rc = -ESOCKTNOSUPPORT; - struct sock *sk; - - if (!net_eq(net, &init_net)) - return -EAFNOSUPPORT; - - /* - * SPX support is not anymore in the kernel sources. If you want to - * ressurrect it, completing it and making it understand shared skbs, - * be fully multithreaded, etc, grab the sources in an early 2.5 kernel - * tree. - */ - if (sock->type != SOCK_DGRAM) - goto out; - - rc = -ENOMEM; - sk = sk_alloc(net, PF_IPX, GFP_KERNEL, &ipx_proto, kern); - if (!sk) - goto out; - - sk_refcnt_debug_inc(sk); - sock_init_data(sock, sk); - sk->sk_no_check_tx = 1; /* Checksum off by default */ - sock->ops = &ipx_dgram_ops; - rc = 0; -out: - return rc; -} - -static int ipx_release(struct socket *sock) -{ - struct sock *sk = sock->sk; - - if (!sk) - goto out; - - lock_sock(sk); - sk->sk_shutdown = SHUTDOWN_MASK; - if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_state_change(sk); - - sock_set_flag(sk, SOCK_DEAD); - sock->sk = NULL; - sk_refcnt_debug_release(sk); - ipx_destroy_socket(sk); - release_sock(sk); - sock_put(sk); -out: - return 0; -} - -/* caller must hold a reference to intrfc */ - -static __be16 ipx_first_free_socketnum(struct ipx_interface *intrfc) -{ - unsigned short socketNum = intrfc->if_sknum; - - spin_lock_bh(&intrfc->if_sklist_lock); - - if (socketNum < IPX_MIN_EPHEMERAL_SOCKET) - socketNum = IPX_MIN_EPHEMERAL_SOCKET; - - while (__ipxitf_find_socket(intrfc, htons(socketNum))) - if (socketNum > IPX_MAX_EPHEMERAL_SOCKET) - socketNum = IPX_MIN_EPHEMERAL_SOCKET; - else - socketNum++; - - spin_unlock_bh(&intrfc->if_sklist_lock); - intrfc->if_sknum = socketNum; - - return htons(socketNum); -} - -static int __ipx_bind(struct socket *sock, - struct sockaddr *uaddr, int addr_len) -{ - struct sock *sk = sock->sk; - struct ipx_sock *ipxs = ipx_sk(sk); - struct ipx_interface *intrfc; - struct sockaddr_ipx *addr = (struct sockaddr_ipx *)uaddr; - int rc = -EINVAL; - - if (!sock_flag(sk, SOCK_ZAPPED) || addr_len != sizeof(struct sockaddr_ipx)) - goto out; - - intrfc = ipxitf_find_using_net(addr->sipx_network); - rc = -EADDRNOTAVAIL; - if (!intrfc) - goto out; - - if (!addr->sipx_port) { - addr->sipx_port = ipx_first_free_socketnum(intrfc); - rc = -EINVAL; - if (!addr->sipx_port) - goto out_put; - } - - /* protect IPX system stuff like routing/sap */ - rc = -EACCES; - if (ntohs(addr->sipx_port) < IPX_MIN_EPHEMERAL_SOCKET && - !capable(CAP_NET_ADMIN)) - goto out_put; - - ipxs->port = addr->sipx_port; - -#ifdef CONFIG_IPX_INTERN - if (intrfc == ipx_internal_net) { - /* The source address is to be set explicitly if the - * socket is to be bound on the internal network. If a - * node number 0 was specified, the default is used. - */ - - rc = -EINVAL; - if (!memcmp(addr->sipx_node, ipx_broadcast_node, IPX_NODE_LEN)) - goto out_put; - if (!memcmp(addr->sipx_node, ipx_this_node, IPX_NODE_LEN)) - memcpy(ipxs->node, intrfc->if_node, IPX_NODE_LEN); - else - memcpy(ipxs->node, addr->sipx_node, IPX_NODE_LEN); - - rc = -EADDRINUSE; - if (ipxitf_find_internal_socket(intrfc, ipxs->node, - ipxs->port)) { - SOCK_DEBUG(sk, - "IPX: bind failed because port %X in use.\n", - ntohs(addr->sipx_port)); - goto out_put; - } - } else { - /* Source addresses are easy. It must be our - * network:node pair for an interface routed to IPX - * with the ipx routing ioctl() - */ - - memcpy(ipxs->node, intrfc->if_node, IPX_NODE_LEN); - - rc = -EADDRINUSE; - if (ipxitf_find_socket(intrfc, addr->sipx_port)) { - SOCK_DEBUG(sk, - "IPX: bind failed because port %X in use.\n", - ntohs(addr->sipx_port)); - goto out_put; - } - } - -#else /* !def CONFIG_IPX_INTERN */ - - /* Source addresses are easy. It must be our network:node pair for - an interface routed to IPX with the ipx routing ioctl() */ - - rc = -EADDRINUSE; - if (ipxitf_find_socket(intrfc, addr->sipx_port)) { - SOCK_DEBUG(sk, "IPX: bind failed because port %X in use.\n", - ntohs((int)addr->sipx_port)); - goto out_put; - } - -#endif /* CONFIG_IPX_INTERN */ - - ipxitf_insert_socket(intrfc, sk); - sock_reset_flag(sk, SOCK_ZAPPED); - - rc = 0; -out_put: - ipxitf_put(intrfc); -out: - return rc; -} - -static int ipx_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) -{ - struct sock *sk = sock->sk; - int rc; - - lock_sock(sk); - rc = __ipx_bind(sock, uaddr, addr_len); - release_sock(sk); - - return rc; -} - -static int ipx_connect(struct socket *sock, struct sockaddr *uaddr, - int addr_len, int flags) -{ - struct sock *sk = sock->sk; - struct ipx_sock *ipxs = ipx_sk(sk); - struct sockaddr_ipx *addr; - int rc = -EINVAL; - struct ipx_route *rt; - - sk->sk_state = TCP_CLOSE; - sock->state = SS_UNCONNECTED; - - lock_sock(sk); - if (addr_len != sizeof(*addr)) - goto out; - addr = (struct sockaddr_ipx *)uaddr; - - /* put the autobinding in */ - if (!ipxs->port) { - struct sockaddr_ipx uaddr; - - uaddr.sipx_port = 0; - uaddr.sipx_network = 0; - -#ifdef CONFIG_IPX_INTERN - rc = -ENETDOWN; - if (!ipxs->intrfc) - goto out; /* Someone zonked the iface */ - memcpy(uaddr.sipx_node, ipxs->intrfc->if_node, - IPX_NODE_LEN); -#endif /* CONFIG_IPX_INTERN */ - - rc = __ipx_bind(sock, (struct sockaddr *)&uaddr, - sizeof(struct sockaddr_ipx)); - if (rc) - goto out; - } - - /* We can either connect to primary network or somewhere - * we can route to */ - rt = ipxrtr_lookup(addr->sipx_network); - rc = -ENETUNREACH; - if (!rt && !(!addr->sipx_network && ipx_primary_net)) - goto out; - - ipxs->dest_addr.net = addr->sipx_network; - ipxs->dest_addr.sock = addr->sipx_port; - memcpy(ipxs->dest_addr.node, addr->sipx_node, IPX_NODE_LEN); - ipxs->type = addr->sipx_type; - - if (sock->type == SOCK_DGRAM) { - sock->state = SS_CONNECTED; - sk->sk_state = TCP_ESTABLISHED; - } - - if (rt) - ipxrtr_put(rt); - rc = 0; -out: - release_sock(sk); - return rc; -} - - -static int ipx_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) -{ - struct ipx_address *addr; - struct sockaddr_ipx sipx; - struct sock *sk = sock->sk; - struct ipx_sock *ipxs = ipx_sk(sk); - int rc; - - *uaddr_len = sizeof(struct sockaddr_ipx); - - lock_sock(sk); - if (peer) { - rc = -ENOTCONN; - if (sk->sk_state != TCP_ESTABLISHED) - goto out; - - addr = &ipxs->dest_addr; - sipx.sipx_network = addr->net; - sipx.sipx_port = addr->sock; - memcpy(sipx.sipx_node, addr->node, IPX_NODE_LEN); - } else { - if (ipxs->intrfc) { - sipx.sipx_network = ipxs->intrfc->if_netnum; -#ifdef CONFIG_IPX_INTERN - memcpy(sipx.sipx_node, ipxs->node, IPX_NODE_LEN); -#else - memcpy(sipx.sipx_node, ipxs->intrfc->if_node, - IPX_NODE_LEN); -#endif /* CONFIG_IPX_INTERN */ - - } else { - sipx.sipx_network = 0; - memset(sipx.sipx_node, '\0', IPX_NODE_LEN); - } - - sipx.sipx_port = ipxs->port; - } - - sipx.sipx_family = AF_IPX; - sipx.sipx_type = ipxs->type; - sipx.sipx_zero = 0; - memcpy(uaddr, &sipx, sizeof(sipx)); - - rc = 0; -out: - release_sock(sk); - return rc; -} - -static int ipx_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) -{ - /* NULL here for pt means the packet was looped back */ - struct ipx_interface *intrfc; - struct ipxhdr *ipx; - u16 ipx_pktsize; - int rc = 0; - - if (!net_eq(dev_net(dev), &init_net)) - goto drop; - - /* Not ours */ - if (skb->pkt_type == PACKET_OTHERHOST) - goto drop; - - if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) - goto out; - - if (!pskb_may_pull(skb, sizeof(struct ipxhdr))) - goto drop; - - ipx_pktsize = ntohs(ipx_hdr(skb)->ipx_pktsize); - - /* Too small or invalid header? */ - if (ipx_pktsize < sizeof(struct ipxhdr) || - !pskb_may_pull(skb, ipx_pktsize)) - goto drop; - - ipx = ipx_hdr(skb); - if (ipx->ipx_checksum != IPX_NO_CHECKSUM && - ipx->ipx_checksum != ipx_cksum(ipx, ipx_pktsize)) - goto drop; - - IPX_SKB_CB(skb)->ipx_tctrl = ipx->ipx_tctrl; - IPX_SKB_CB(skb)->ipx_dest_net = ipx->ipx_dest.net; - IPX_SKB_CB(skb)->ipx_source_net = ipx->ipx_source.net; - - /* Determine what local ipx endpoint this is */ - intrfc = ipxitf_find_using_phys(dev, pt->type); - if (!intrfc) { - if (ipxcfg_auto_create_interfaces && - IPX_SKB_CB(skb)->ipx_dest_net) { - intrfc = ipxitf_auto_create(dev, pt->type); - if (intrfc) - ipxitf_hold(intrfc); - } - - if (!intrfc) /* Not one of ours */ - /* or invalid packet for auto creation */ - goto drop; - } - - rc = ipxitf_rcv(intrfc, skb); - ipxitf_put(intrfc); - goto out; -drop: - kfree_skb(skb); -out: - return rc; -} - -static int ipx_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) -{ - struct sock *sk = sock->sk; - struct ipx_sock *ipxs = ipx_sk(sk); - DECLARE_SOCKADDR(struct sockaddr_ipx *, usipx, msg->msg_name); - struct sockaddr_ipx local_sipx; - int rc = -EINVAL; - int flags = msg->msg_flags; - - lock_sock(sk); - /* Socket gets bound below anyway */ -/* if (sk->sk_zapped) - return -EIO; */ /* Socket not bound */ - if (flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT)) - goto out; - - /* Max possible packet size limited by 16 bit pktsize in header */ - if (len >= 65535 - sizeof(struct ipxhdr)) - goto out; - - if (usipx) { - if (!ipxs->port) { - struct sockaddr_ipx uaddr; - - uaddr.sipx_port = 0; - uaddr.sipx_network = 0; -#ifdef CONFIG_IPX_INTERN - rc = -ENETDOWN; - if (!ipxs->intrfc) - goto out; /* Someone zonked the iface */ - memcpy(uaddr.sipx_node, ipxs->intrfc->if_node, - IPX_NODE_LEN); -#endif - rc = __ipx_bind(sock, (struct sockaddr *)&uaddr, - sizeof(struct sockaddr_ipx)); - if (rc) - goto out; - } - - rc = -EINVAL; - if (msg->msg_namelen < sizeof(*usipx) || - usipx->sipx_family != AF_IPX) - goto out; - } else { - rc = -ENOTCONN; - if (sk->sk_state != TCP_ESTABLISHED) - goto out; - - usipx = &local_sipx; - usipx->sipx_family = AF_IPX; - usipx->sipx_type = ipxs->type; - usipx->sipx_port = ipxs->dest_addr.sock; - usipx->sipx_network = ipxs->dest_addr.net; - memcpy(usipx->sipx_node, ipxs->dest_addr.node, IPX_NODE_LEN); - } - - rc = ipxrtr_route_packet(sk, usipx, msg, len, flags & MSG_DONTWAIT); - if (rc >= 0) - rc = len; -out: - release_sock(sk); - return rc; -} - - -static int ipx_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, - int flags) -{ - struct sock *sk = sock->sk; - struct ipx_sock *ipxs = ipx_sk(sk); - DECLARE_SOCKADDR(struct sockaddr_ipx *, sipx, msg->msg_name); - struct ipxhdr *ipx = NULL; - struct sk_buff *skb; - int copied, rc; - bool locked = true; - - lock_sock(sk); - /* put the autobinding in */ - if (!ipxs->port) { - struct sockaddr_ipx uaddr; - - uaddr.sipx_port = 0; - uaddr.sipx_network = 0; - -#ifdef CONFIG_IPX_INTERN - rc = -ENETDOWN; - if (!ipxs->intrfc) - goto out; /* Someone zonked the iface */ - memcpy(uaddr.sipx_node, ipxs->intrfc->if_node, IPX_NODE_LEN); -#endif /* CONFIG_IPX_INTERN */ - - rc = __ipx_bind(sock, (struct sockaddr *)&uaddr, - sizeof(struct sockaddr_ipx)); - if (rc) - goto out; - } - - rc = -ENOTCONN; - if (sock_flag(sk, SOCK_ZAPPED)) - goto out; - - release_sock(sk); - locked = false; - skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, - flags & MSG_DONTWAIT, &rc); - if (!skb) { - if (rc == -EAGAIN && (sk->sk_shutdown & RCV_SHUTDOWN)) - rc = 0; - goto out; - } - - ipx = ipx_hdr(skb); - copied = ntohs(ipx->ipx_pktsize) - sizeof(struct ipxhdr); - if (copied > size) { - copied = size; - msg->msg_flags |= MSG_TRUNC; - } - - rc = skb_copy_datagram_msg(skb, sizeof(struct ipxhdr), msg, copied); - if (rc) - goto out_free; - if (skb->tstamp) - sk->sk_stamp = skb->tstamp; - - if (sipx) { - sipx->sipx_family = AF_IPX; - sipx->sipx_port = ipx->ipx_source.sock; - memcpy(sipx->sipx_node, ipx->ipx_source.node, IPX_NODE_LEN); - sipx->sipx_network = IPX_SKB_CB(skb)->ipx_source_net; - sipx->sipx_type = ipx->ipx_type; - sipx->sipx_zero = 0; - msg->msg_namelen = sizeof(*sipx); - } - rc = copied; - -out_free: - skb_free_datagram(sk, skb); -out: - if (locked) - release_sock(sk); - return rc; -} - - -static int ipx_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) -{ - int rc = 0; - long amount = 0; - struct sock *sk = sock->sk; - void __user *argp = (void __user *)arg; - - lock_sock(sk); - switch (cmd) { - case TIOCOUTQ: - amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk); - if (amount < 0) - amount = 0; - rc = put_user(amount, (int __user *)argp); - break; - case TIOCINQ: { - struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); - /* These two are safe on a single CPU system as only - * user tasks fiddle here */ - if (skb) - amount = skb->len - sizeof(struct ipxhdr); - rc = put_user(amount, (int __user *)argp); - break; - } - case SIOCADDRT: - case SIOCDELRT: - rc = -EPERM; - if (capable(CAP_NET_ADMIN)) - rc = ipxrtr_ioctl(cmd, argp); - break; - case SIOCSIFADDR: - case SIOCAIPXITFCRT: - case SIOCAIPXPRISLT: - rc = -EPERM; - if (!capable(CAP_NET_ADMIN)) - break; - /* fall through */ - case SIOCGIFADDR: - rc = ipxitf_ioctl(cmd, argp); - break; - case SIOCIPXCFGDATA: - rc = ipxcfg_get_config_data(argp); - break; - case SIOCIPXNCPCONN: - /* - * This socket wants to take care of the NCP connection - * handed to us in arg. - */ - rc = -EPERM; - if (!capable(CAP_NET_ADMIN)) - break; - rc = get_user(ipx_sk(sk)->ipx_ncp_conn, - (const unsigned short __user *)argp); - break; - case SIOCGSTAMP: - rc = sock_get_timestamp(sk, argp); - break; - case SIOCGIFDSTADDR: - case SIOCSIFDSTADDR: - case SIOCGIFBRDADDR: - case SIOCSIFBRDADDR: - case SIOCGIFNETMASK: - case SIOCSIFNETMASK: - rc = -EINVAL; - break; - default: - rc = -ENOIOCTLCMD; - break; - } - release_sock(sk); - - return rc; -} - - -#ifdef CONFIG_COMPAT -static int ipx_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) -{ - /* - * These 4 commands use same structure on 32bit and 64bit. Rest of IPX - * commands is handled by generic ioctl code. As these commands are - * SIOCPROTOPRIVATE..SIOCPROTOPRIVATE+3, they cannot be handled by generic - * code. - */ - switch (cmd) { - case SIOCAIPXITFCRT: - case SIOCAIPXPRISLT: - case SIOCIPXCFGDATA: - case SIOCIPXNCPCONN: - return ipx_ioctl(sock, cmd, arg); - default: - return -ENOIOCTLCMD; - } -} -#endif - -static int ipx_shutdown(struct socket *sock, int mode) -{ - struct sock *sk = sock->sk; - - if (mode < SHUT_RD || mode > SHUT_RDWR) - return -EINVAL; - /* This maps: - * SHUT_RD (0) -> RCV_SHUTDOWN (1) - * SHUT_WR (1) -> SEND_SHUTDOWN (2) - * SHUT_RDWR (2) -> SHUTDOWN_MASK (3) - */ - ++mode; - - lock_sock(sk); - sk->sk_shutdown |= mode; - release_sock(sk); - sk->sk_state_change(sk); - - return 0; -} - -/* - * Socket family declarations - */ - -static const struct net_proto_family ipx_family_ops = { - .family = PF_IPX, - .create = ipx_create, - .owner = THIS_MODULE, -}; - -static const struct proto_ops ipx_dgram_ops = { - .family = PF_IPX, - .owner = THIS_MODULE, - .release = ipx_release, - .bind = ipx_bind, - .connect = ipx_connect, - .socketpair = sock_no_socketpair, - .accept = sock_no_accept, - .getname = ipx_getname, - .poll = datagram_poll, - .ioctl = ipx_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = ipx_compat_ioctl, -#endif - .listen = sock_no_listen, - .shutdown = ipx_shutdown, - .setsockopt = ipx_setsockopt, - .getsockopt = ipx_getsockopt, - .sendmsg = ipx_sendmsg, - .recvmsg = ipx_recvmsg, - .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, -}; - -static struct packet_type ipx_8023_packet_type __read_mostly = { - .type = cpu_to_be16(ETH_P_802_3), - .func = ipx_rcv, -}; - -static struct packet_type ipx_dix_packet_type __read_mostly = { - .type = cpu_to_be16(ETH_P_IPX), - .func = ipx_rcv, -}; - -static struct notifier_block ipx_dev_notifier = { - .notifier_call = ipxitf_device_event, -}; - -static const unsigned char ipx_8022_type = 0xE0; -static const unsigned char ipx_snap_id[5] = { 0x0, 0x0, 0x0, 0x81, 0x37 }; -static const char ipx_EII_err_msg[] __initconst = - KERN_CRIT "IPX: Unable to register with Ethernet II\n"; -static const char ipx_8023_err_msg[] __initconst = - KERN_CRIT "IPX: Unable to register with 802.3\n"; -static const char ipx_llc_err_msg[] __initconst = - KERN_CRIT "IPX: Unable to register with 802.2\n"; -static const char ipx_snap_err_msg[] __initconst = - KERN_CRIT "IPX: Unable to register with SNAP\n"; - -static int __init ipx_init(void) -{ - int rc = proto_register(&ipx_proto, 1); - - if (rc != 0) - goto out; - - sock_register(&ipx_family_ops); - - pEII_datalink = make_EII_client(); - if (pEII_datalink) - dev_add_pack(&ipx_dix_packet_type); - else - printk(ipx_EII_err_msg); - - p8023_datalink = make_8023_client(); - if (p8023_datalink) - dev_add_pack(&ipx_8023_packet_type); - else - printk(ipx_8023_err_msg); - - p8022_datalink = register_8022_client(ipx_8022_type, ipx_rcv); - if (!p8022_datalink) - printk(ipx_llc_err_msg); - - pSNAP_datalink = register_snap_client(ipx_snap_id, ipx_rcv); - if (!pSNAP_datalink) - printk(ipx_snap_err_msg); - - register_netdevice_notifier(&ipx_dev_notifier); - ipx_register_sysctl(); - ipx_proc_init(); -out: - return rc; -} - -static void __exit ipx_proto_finito(void) -{ - ipx_proc_exit(); - ipx_unregister_sysctl(); - - unregister_netdevice_notifier(&ipx_dev_notifier); - - ipxitf_cleanup(); - - if (pSNAP_datalink) { - unregister_snap_client(pSNAP_datalink); - pSNAP_datalink = NULL; - } - - if (p8022_datalink) { - unregister_8022_client(p8022_datalink); - p8022_datalink = NULL; - } - - dev_remove_pack(&ipx_8023_packet_type); - if (p8023_datalink) { - destroy_8023_client(p8023_datalink); - p8023_datalink = NULL; - } - - dev_remove_pack(&ipx_dix_packet_type); - if (pEII_datalink) { - destroy_EII_client(pEII_datalink); - pEII_datalink = NULL; - } - - proto_unregister(&ipx_proto); - sock_unregister(ipx_family_ops.family); -} - -module_init(ipx_init); -module_exit(ipx_proto_finito); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_NETPROTO(PF_IPX); diff --git a/net/ipx/ipx_proc.c b/net/ipx/ipx_proc.c deleted file mode 100644 index 38a3d51d9ead..000000000000 --- a/net/ipx/ipx_proc.c +++ /dev/null @@ -1,341 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * IPX proc routines - * - * Copyright(C) Arnaldo Carvalho de Melo <acme@conectiva.com.br>, 2002 - */ - -#include <linux/init.h> -#ifdef CONFIG_PROC_FS -#include <linux/proc_fs.h> -#include <linux/spinlock.h> -#include <linux/seq_file.h> -#include <linux/export.h> -#include <net/net_namespace.h> -#include <net/tcp_states.h> -#include <net/ipx.h> - -static void *ipx_seq_interface_start(struct seq_file *seq, loff_t *pos) -{ - spin_lock_bh(&ipx_interfaces_lock); - return seq_list_start_head(&ipx_interfaces, *pos); -} - -static void *ipx_seq_interface_next(struct seq_file *seq, void *v, loff_t *pos) -{ - return seq_list_next(v, &ipx_interfaces, pos); -} - -static void ipx_seq_interface_stop(struct seq_file *seq, void *v) -{ - spin_unlock_bh(&ipx_interfaces_lock); -} - -static int ipx_seq_interface_show(struct seq_file *seq, void *v) -{ - struct ipx_interface *i; - - if (v == &ipx_interfaces) { - seq_puts(seq, "Network Node_Address Primary Device " - "Frame_Type"); -#ifdef IPX_REFCNT_DEBUG - seq_puts(seq, " refcnt"); -#endif - seq_puts(seq, "\n"); - goto out; - } - - i = list_entry(v, struct ipx_interface, node); - seq_printf(seq, "%08X ", ntohl(i->if_netnum)); - seq_printf(seq, "%02X%02X%02X%02X%02X%02X ", - i->if_node[0], i->if_node[1], i->if_node[2], - i->if_node[3], i->if_node[4], i->if_node[5]); - seq_printf(seq, "%-9s", i == ipx_primary_net ? "Yes" : "No"); - seq_printf(seq, "%-11s", ipx_device_name(i)); - seq_printf(seq, "%-9s", ipx_frame_name(i->if_dlink_type)); -#ifdef IPX_REFCNT_DEBUG - seq_printf(seq, "%6d", refcount_read(&i->refcnt)); -#endif - seq_puts(seq, "\n"); -out: - return 0; -} - -static void *ipx_seq_route_start(struct seq_file *seq, loff_t *pos) -{ - read_lock_bh(&ipx_routes_lock); - return seq_list_start_head(&ipx_routes, *pos); -} - -static void *ipx_seq_route_next(struct seq_file *seq, void *v, loff_t *pos) -{ - return seq_list_next(v, &ipx_routes, pos); -} - -static void ipx_seq_route_stop(struct seq_file *seq, void *v) -{ - read_unlock_bh(&ipx_routes_lock); -} - -static int ipx_seq_route_show(struct seq_file *seq, void *v) -{ - struct ipx_route *rt; - - if (v == &ipx_routes) { - seq_puts(seq, "Network Router_Net Router_Node\n"); - goto out; - } - - rt = list_entry(v, struct ipx_route, node); - - seq_printf(seq, "%08X ", ntohl(rt->ir_net)); - if (rt->ir_routed) - seq_printf(seq, "%08X %02X%02X%02X%02X%02X%02X\n", - ntohl(rt->ir_intrfc->if_netnum), - rt->ir_router_node[0], rt->ir_router_node[1], - rt->ir_router_node[2], rt->ir_router_node[3], - rt->ir_router_node[4], rt->ir_router_node[5]); - else - seq_puts(seq, "Directly Connected\n"); -out: - return 0; -} - -static __inline__ struct sock *ipx_get_socket_idx(loff_t pos) -{ - struct sock *s = NULL; - struct ipx_interface *i; - - list_for_each_entry(i, &ipx_interfaces, node) { - spin_lock_bh(&i->if_sklist_lock); - sk_for_each(s, &i->if_sklist) { - if (!pos) - break; - --pos; - } - spin_unlock_bh(&i->if_sklist_lock); - if (!pos) { - if (s) - goto found; - break; - } - } - s = NULL; -found: - return s; -} - -static void *ipx_seq_socket_start(struct seq_file *seq, loff_t *pos) -{ - loff_t l = *pos; - - spin_lock_bh(&ipx_interfaces_lock); - return l ? ipx_get_socket_idx(--l) : SEQ_START_TOKEN; -} - -static void *ipx_seq_socket_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct sock* sk, *next; - struct ipx_interface *i; - struct ipx_sock *ipxs; - - ++*pos; - if (v == SEQ_START_TOKEN) { - sk = NULL; - i = ipx_interfaces_head(); - if (!i) - goto out; - sk = sk_head(&i->if_sklist); - if (sk) - spin_lock_bh(&i->if_sklist_lock); - goto out; - } - sk = v; - next = sk_next(sk); - if (next) { - sk = next; - goto out; - } - ipxs = ipx_sk(sk); - i = ipxs->intrfc; - spin_unlock_bh(&i->if_sklist_lock); - sk = NULL; - for (;;) { - if (i->node.next == &ipx_interfaces) - break; - i = list_entry(i->node.next, struct ipx_interface, node); - spin_lock_bh(&i->if_sklist_lock); - if (!hlist_empty(&i->if_sklist)) { - sk = sk_head(&i->if_sklist); - break; - } - spin_unlock_bh(&i->if_sklist_lock); - } -out: - return sk; -} - -static int ipx_seq_socket_show(struct seq_file *seq, void *v) -{ - struct sock *s; - struct ipx_sock *ipxs; - - if (v == SEQ_START_TOKEN) { -#ifdef CONFIG_IPX_INTERN - seq_puts(seq, "Local_Address " - "Remote_Address Tx_Queue " - "Rx_Queue State Uid\n"); -#else - seq_puts(seq, "Local_Address Remote_Address " - "Tx_Queue Rx_Queue State Uid\n"); -#endif - goto out; - } - - s = v; - ipxs = ipx_sk(s); -#ifdef CONFIG_IPX_INTERN - seq_printf(seq, "%08X:%02X%02X%02X%02X%02X%02X:%04X ", - ntohl(ipxs->intrfc->if_netnum), - ipxs->node[0], ipxs->node[1], ipxs->node[2], ipxs->node[3], - ipxs->node[4], ipxs->node[5], ntohs(ipxs->port)); -#else - seq_printf(seq, "%08X:%04X ", ntohl(ipxs->intrfc->if_netnum), - ntohs(ipxs->port)); -#endif /* CONFIG_IPX_INTERN */ - if (s->sk_state != TCP_ESTABLISHED) - seq_printf(seq, "%-28s", "Not_Connected"); - else { - seq_printf(seq, "%08X:%02X%02X%02X%02X%02X%02X:%04X ", - ntohl(ipxs->dest_addr.net), - ipxs->dest_addr.node[0], ipxs->dest_addr.node[1], - ipxs->dest_addr.node[2], ipxs->dest_addr.node[3], - ipxs->dest_addr.node[4], ipxs->dest_addr.node[5], - ntohs(ipxs->dest_addr.sock)); - } - - seq_printf(seq, "%08X %08X %02X %03u\n", - sk_wmem_alloc_get(s), - sk_rmem_alloc_get(s), - s->sk_state, - from_kuid_munged(seq_user_ns(seq), sock_i_uid(s))); -out: - return 0; -} - -static const struct seq_operations ipx_seq_interface_ops = { - .start = ipx_seq_interface_start, - .next = ipx_seq_interface_next, - .stop = ipx_seq_interface_stop, - .show = ipx_seq_interface_show, -}; - -static const struct seq_operations ipx_seq_route_ops = { - .start = ipx_seq_route_start, - .next = ipx_seq_route_next, - .stop = ipx_seq_route_stop, - .show = ipx_seq_route_show, -}; - -static const struct seq_operations ipx_seq_socket_ops = { - .start = ipx_seq_socket_start, - .next = ipx_seq_socket_next, - .stop = ipx_seq_interface_stop, - .show = ipx_seq_socket_show, -}; - -static int ipx_seq_route_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &ipx_seq_route_ops); -} - -static int ipx_seq_interface_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &ipx_seq_interface_ops); -} - -static int ipx_seq_socket_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &ipx_seq_socket_ops); -} - -static const struct file_operations ipx_seq_interface_fops = { - .owner = THIS_MODULE, - .open = ipx_seq_interface_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static const struct file_operations ipx_seq_route_fops = { - .owner = THIS_MODULE, - .open = ipx_seq_route_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static const struct file_operations ipx_seq_socket_fops = { - .owner = THIS_MODULE, - .open = ipx_seq_socket_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static struct proc_dir_entry *ipx_proc_dir; - -int __init ipx_proc_init(void) -{ - struct proc_dir_entry *p; - int rc = -ENOMEM; - - ipx_proc_dir = proc_mkdir("ipx", init_net.proc_net); - - if (!ipx_proc_dir) - goto out; - p = proc_create("interface", S_IRUGO, - ipx_proc_dir, &ipx_seq_interface_fops); - if (!p) - goto out_interface; - - p = proc_create("route", S_IRUGO, ipx_proc_dir, &ipx_seq_route_fops); - if (!p) - goto out_route; - - p = proc_create("socket", S_IRUGO, ipx_proc_dir, &ipx_seq_socket_fops); - if (!p) - goto out_socket; - - rc = 0; -out: - return rc; -out_socket: - remove_proc_entry("route", ipx_proc_dir); -out_route: - remove_proc_entry("interface", ipx_proc_dir); -out_interface: - remove_proc_entry("ipx", init_net.proc_net); - goto out; -} - -void __exit ipx_proc_exit(void) -{ - remove_proc_entry("interface", ipx_proc_dir); - remove_proc_entry("route", ipx_proc_dir); - remove_proc_entry("socket", ipx_proc_dir); - remove_proc_entry("ipx", init_net.proc_net); -} - -#else /* CONFIG_PROC_FS */ - -int __init ipx_proc_init(void) -{ - return 0; -} - -void __exit ipx_proc_exit(void) -{ -} - -#endif /* CONFIG_PROC_FS */ diff --git a/net/ipx/ipx_route.c b/net/ipx/ipx_route.c deleted file mode 100644 index 3cf93aa9f284..000000000000 --- a/net/ipx/ipx_route.c +++ /dev/null @@ -1,293 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Implements the IPX routing routines. - * Code moved from af_ipx.c. - * - * Arnaldo Carvalho de Melo <acme@conectiva.com.br>, 2003 - * - * See net/ipx/ChangeLog. - */ - -#include <linux/list.h> -#include <linux/route.h> -#include <linux/slab.h> -#include <linux/spinlock.h> - -#include <net/ipx.h> -#include <net/sock.h> - -LIST_HEAD(ipx_routes); -DEFINE_RWLOCK(ipx_routes_lock); - -extern struct ipx_interface *ipx_internal_net; - -extern struct ipx_interface *ipxitf_find_using_net(__be32 net); -extern int ipxitf_demux_socket(struct ipx_interface *intrfc, - struct sk_buff *skb, int copy); -extern int ipxitf_demux_socket(struct ipx_interface *intrfc, - struct sk_buff *skb, int copy); - -struct ipx_route *ipxrtr_lookup(__be32 net) -{ - struct ipx_route *r; - - read_lock_bh(&ipx_routes_lock); - list_for_each_entry(r, &ipx_routes, node) - if (r->ir_net == net) { - ipxrtr_hold(r); - goto unlock; - } - r = NULL; -unlock: - read_unlock_bh(&ipx_routes_lock); - return r; -} - -/* - * Caller must hold a reference to intrfc - */ -int ipxrtr_add_route(__be32 network, struct ipx_interface *intrfc, - unsigned char *node) -{ - struct ipx_route *rt; - int rc; - - /* Get a route structure; either existing or create */ - rt = ipxrtr_lookup(network); - if (!rt) { - rt = kmalloc(sizeof(*rt), GFP_ATOMIC); - rc = -EAGAIN; - if (!rt) - goto out; - - refcount_set(&rt->refcnt, 1); - ipxrtr_hold(rt); - write_lock_bh(&ipx_routes_lock); - list_add(&rt->node, &ipx_routes); - write_unlock_bh(&ipx_routes_lock); - } else { - rc = -EEXIST; - if (intrfc == ipx_internal_net) - goto out_put; - } - - rt->ir_net = network; - rt->ir_intrfc = intrfc; - if (!node) { - memset(rt->ir_router_node, '\0', IPX_NODE_LEN); - rt->ir_routed = 0; - } else { - memcpy(rt->ir_router_node, node, IPX_NODE_LEN); - rt->ir_routed = 1; - } - - rc = 0; -out_put: - ipxrtr_put(rt); -out: - return rc; -} - -void ipxrtr_del_routes(struct ipx_interface *intrfc) -{ - struct ipx_route *r, *tmp; - - write_lock_bh(&ipx_routes_lock); - list_for_each_entry_safe(r, tmp, &ipx_routes, node) - if (r->ir_intrfc == intrfc) { - list_del(&r->node); - ipxrtr_put(r); - } - write_unlock_bh(&ipx_routes_lock); -} - -static int ipxrtr_create(struct ipx_route_definition *rd) -{ - struct ipx_interface *intrfc; - int rc = -ENETUNREACH; - - /* Find the appropriate interface */ - intrfc = ipxitf_find_using_net(rd->ipx_router_network); - if (!intrfc) - goto out; - rc = ipxrtr_add_route(rd->ipx_network, intrfc, rd->ipx_router_node); - ipxitf_put(intrfc); -out: - return rc; -} - -static int ipxrtr_delete(__be32 net) -{ - struct ipx_route *r, *tmp; - int rc; - - write_lock_bh(&ipx_routes_lock); - list_for_each_entry_safe(r, tmp, &ipx_routes, node) - if (r->ir_net == net) { - /* Directly connected; can't lose route */ - rc = -EPERM; - if (!r->ir_routed) - goto out; - list_del(&r->node); - ipxrtr_put(r); - rc = 0; - goto out; - } - rc = -ENOENT; -out: - write_unlock_bh(&ipx_routes_lock); - return rc; -} - -/* - * The skb has to be unshared, we'll end up calling ipxitf_send, that'll - * modify the packet - */ -int ipxrtr_route_skb(struct sk_buff *skb) -{ - struct ipxhdr *ipx = ipx_hdr(skb); - struct ipx_route *r = ipxrtr_lookup(IPX_SKB_CB(skb)->ipx_dest_net); - - if (!r) { /* no known route */ - kfree_skb(skb); - return 0; - } - - ipxitf_hold(r->ir_intrfc); - ipxitf_send(r->ir_intrfc, skb, r->ir_routed ? - r->ir_router_node : ipx->ipx_dest.node); - ipxitf_put(r->ir_intrfc); - ipxrtr_put(r); - - return 0; -} - -/* - * Route an outgoing frame from a socket. - */ -int ipxrtr_route_packet(struct sock *sk, struct sockaddr_ipx *usipx, - struct msghdr *msg, size_t len, int noblock) -{ - struct sk_buff *skb; - struct ipx_sock *ipxs = ipx_sk(sk); - struct ipx_interface *intrfc; - struct ipxhdr *ipx; - size_t size; - int ipx_offset; - struct ipx_route *rt = NULL; - int rc; - - /* Find the appropriate interface on which to send packet */ - if (!usipx->sipx_network && ipx_primary_net) { - usipx->sipx_network = ipx_primary_net->if_netnum; - intrfc = ipx_primary_net; - } else { - rt = ipxrtr_lookup(usipx->sipx_network); - rc = -ENETUNREACH; - if (!rt) - goto out; - intrfc = rt->ir_intrfc; - } - - ipxitf_hold(intrfc); - ipx_offset = intrfc->if_ipx_offset; - size = sizeof(struct ipxhdr) + len + ipx_offset; - - skb = sock_alloc_send_skb(sk, size, noblock, &rc); - if (!skb) - goto out_put; - - skb_reserve(skb, ipx_offset); - skb->sk = sk; - - /* Fill in IPX header */ - skb_reset_network_header(skb); - skb_reset_transport_header(skb); - skb_put(skb, sizeof(struct ipxhdr)); - ipx = ipx_hdr(skb); - ipx->ipx_pktsize = htons(len + sizeof(struct ipxhdr)); - IPX_SKB_CB(skb)->ipx_tctrl = 0; - ipx->ipx_type = usipx->sipx_type; - - IPX_SKB_CB(skb)->last_hop.index = -1; -#ifdef CONFIG_IPX_INTERN - IPX_SKB_CB(skb)->ipx_source_net = ipxs->intrfc->if_netnum; - memcpy(ipx->ipx_source.node, ipxs->node, IPX_NODE_LEN); -#else - rc = ntohs(ipxs->port); - if (rc == 0x453 || rc == 0x452) { - /* RIP/SAP special handling for mars_nwe */ - IPX_SKB_CB(skb)->ipx_source_net = intrfc->if_netnum; - memcpy(ipx->ipx_source.node, intrfc->if_node, IPX_NODE_LEN); - } else { - IPX_SKB_CB(skb)->ipx_source_net = ipxs->intrfc->if_netnum; - memcpy(ipx->ipx_source.node, ipxs->intrfc->if_node, - IPX_NODE_LEN); - } -#endif /* CONFIG_IPX_INTERN */ - ipx->ipx_source.sock = ipxs->port; - IPX_SKB_CB(skb)->ipx_dest_net = usipx->sipx_network; - memcpy(ipx->ipx_dest.node, usipx->sipx_node, IPX_NODE_LEN); - ipx->ipx_dest.sock = usipx->sipx_port; - - rc = memcpy_from_msg(skb_put(skb, len), msg, len); - if (rc) { - kfree_skb(skb); - goto out_put; - } - - /* Apply checksum. Not allowed on 802.3 links. */ - if (sk->sk_no_check_tx || - intrfc->if_dlink_type == htons(IPX_FRAME_8023)) - ipx->ipx_checksum = htons(0xFFFF); - else - ipx->ipx_checksum = ipx_cksum(ipx, len + sizeof(struct ipxhdr)); - - rc = ipxitf_send(intrfc, skb, (rt && rt->ir_routed) ? - rt->ir_router_node : ipx->ipx_dest.node); -out_put: - ipxitf_put(intrfc); - if (rt) - ipxrtr_put(rt); -out: - return rc; -} - -/* - * We use a normal struct rtentry for route handling - */ -int ipxrtr_ioctl(unsigned int cmd, void __user *arg) -{ - struct rtentry rt; /* Use these to behave like 'other' stacks */ - struct sockaddr_ipx *sg, *st; - int rc = -EFAULT; - - if (copy_from_user(&rt, arg, sizeof(rt))) - goto out; - - sg = (struct sockaddr_ipx *)&rt.rt_gateway; - st = (struct sockaddr_ipx *)&rt.rt_dst; - - rc = -EINVAL; - if (!(rt.rt_flags & RTF_GATEWAY) || /* Direct routes are fixed */ - sg->sipx_family != AF_IPX || - st->sipx_family != AF_IPX) - goto out; - - switch (cmd) { - case SIOCDELRT: - rc = ipxrtr_delete(st->sipx_network); - break; - case SIOCADDRT: { - struct ipx_route_definition f; - f.ipx_network = st->sipx_network; - f.ipx_router_network = sg->sipx_network; - memcpy(f.ipx_router_node, sg->sipx_node, IPX_NODE_LEN); - rc = ipxrtr_create(&f); - break; - } - } - -out: - return rc; -} diff --git a/net/ipx/pe2.c b/net/ipx/pe2.c deleted file mode 100644 index ba7d4214bbff..000000000000 --- a/net/ipx/pe2.c +++ /dev/null @@ -1,36 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/in.h> -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/netdevice.h> -#include <linux/skbuff.h> -#include <linux/slab.h> - -#include <net/datalink.h> - -static int pEII_request(struct datalink_proto *dl, - struct sk_buff *skb, unsigned char *dest_node) -{ - struct net_device *dev = skb->dev; - - skb->protocol = htons(ETH_P_IPX); - dev_hard_header(skb, dev, ETH_P_IPX, dest_node, NULL, skb->len); - return dev_queue_xmit(skb); -} - -struct datalink_proto *make_EII_client(void) -{ - struct datalink_proto *proto = kmalloc(sizeof(*proto), GFP_ATOMIC); - - if (proto) { - proto->header_length = 0; - proto->request = pEII_request; - } - - return proto; -} - -void destroy_EII_client(struct datalink_proto *dl) -{ - kfree(dl); -} diff --git a/net/ipx/sysctl_net_ipx.c b/net/ipx/sysctl_net_ipx.c deleted file mode 100644 index c3eef457db88..000000000000 --- a/net/ipx/sysctl_net_ipx.c +++ /dev/null @@ -1,40 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* -*- linux-c -*- - * sysctl_net_ipx.c: sysctl interface to net IPX subsystem. - * - * Begun April 1, 1996, Mike Shaver. - * Added /proc/sys/net/ipx directory entry (empty =) ). [MS] - * Added /proc/sys/net/ipx/ipx_pprop_broadcasting - acme March 4, 2001 - */ - -#include <linux/mm.h> -#include <linux/sysctl.h> -#include <net/net_namespace.h> -#include <net/ipx.h> - -#ifndef CONFIG_SYSCTL -#error This file should not be compiled without CONFIG_SYSCTL defined -#endif - -static struct ctl_table ipx_table[] = { - { - .procname = "ipx_pprop_broadcasting", - .data = &sysctl_ipx_pprop_broadcasting, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { }, -}; - -static struct ctl_table_header *ipx_table_header; - -void ipx_register_sysctl(void) -{ - ipx_table_header = register_net_sysctl(&init_net, "net/ipx", ipx_table); -} - -void ipx_unregister_sysctl(void) -{ - unregister_net_sysctl_table(ipx_table_header); -} diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 148533169b1d..64331158d693 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1474,7 +1474,7 @@ done: return copied; } -static inline unsigned int iucv_accept_poll(struct sock *parent) +static inline __poll_t iucv_accept_poll(struct sock *parent) { struct iucv_sock *isk, *n; struct sock *sk; @@ -1489,11 +1489,11 @@ static inline unsigned int iucv_accept_poll(struct sock *parent) return 0; } -unsigned int iucv_sock_poll(struct file *file, struct socket *sock, +__poll_t iucv_sock_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; - unsigned int mask = 0; + __poll_t mask = 0; sock_poll_wait(file, sk_sleep(sk), wait); diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c index bd5723315069..9d5649e4e8b7 100644 --- a/net/kcm/kcmproc.c +++ b/net/kcm/kcmproc.c @@ -247,7 +247,6 @@ static int kcm_seq_show(struct seq_file *seq, void *v) } static const struct file_operations kcm_seq_fops = { - .owner = THIS_MODULE, .open = kcm_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -397,7 +396,6 @@ static int kcm_stats_seq_open(struct inode *inode, struct file *file) } static const struct file_operations kcm_stats_seq_fops = { - .owner = THIS_MODULE, .open = kcm_stats_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 0b750a22c4b9..4a8d407f8902 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1387,8 +1387,13 @@ static int kcm_attach(struct socket *sock, struct socket *csock, if (!csk) return -EINVAL; - /* We must prevent loops or risk deadlock ! */ - if (csk->sk_family == PF_KCM) + /* Only allow TCP sockets to be attached for now */ + if ((csk->sk_family != AF_INET && csk->sk_family != AF_INET6) || + csk->sk_protocol != IPPROTO_TCP) + return -EOPNOTSUPP; + + /* Don't allow listeners or closed sockets */ + if (csk->sk_state == TCP_LISTEN || csk->sk_state == TCP_CLOSE) return -EOPNOTSUPP; psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL); @@ -1405,9 +1410,18 @@ static int kcm_attach(struct socket *sock, struct socket *csock, return err; } - sock_hold(csk); - write_lock_bh(&csk->sk_callback_lock); + + /* Check if sk_user_data is aready by KCM or someone else. + * Must be done under lock to prevent race conditions. + */ + if (csk->sk_user_data) { + write_unlock_bh(&csk->sk_callback_lock); + strp_done(&psock->strp); + kmem_cache_free(kcm_psockp, psock); + return -EALREADY; + } + psock->save_data_ready = csk->sk_data_ready; psock->save_write_space = csk->sk_write_space; psock->save_state_change = csk->sk_state_change; @@ -1415,8 +1429,11 @@ static int kcm_attach(struct socket *sock, struct socket *csock, csk->sk_data_ready = psock_data_ready; csk->sk_write_space = psock_write_space; csk->sk_state_change = psock_state_change; + write_unlock_bh(&csk->sk_callback_lock); + sock_hold(csk); + /* Finished initialization, now add the psock to the MUX. */ spin_lock_bh(&mux->lock); head = &mux->psocks; @@ -1625,60 +1642,30 @@ static struct proto kcm_proto = { }; /* Clone a kcm socket. */ -static int kcm_clone(struct socket *osock, struct kcm_clone *info, - struct socket **newsockp) +static struct file *kcm_clone(struct socket *osock) { struct socket *newsock; struct sock *newsk; - struct file *newfile; - int err, newfd; - err = -ENFILE; newsock = sock_alloc(); if (!newsock) - goto out; + return ERR_PTR(-ENFILE); newsock->type = osock->type; newsock->ops = osock->ops; __module_get(newsock->ops->owner); - newfd = get_unused_fd_flags(0); - if (unlikely(newfd < 0)) { - err = newfd; - goto out_fd_fail; - } - - newfile = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name); - if (IS_ERR(newfile)) { - err = PTR_ERR(newfile); - goto out_sock_alloc_fail; - } - newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL, &kcm_proto, true); if (!newsk) { - err = -ENOMEM; - goto out_sk_alloc_fail; + sock_release(newsock); + return ERR_PTR(-ENOMEM); } - sock_init_data(newsock, newsk); init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux); - fd_install(newfd, newfile); - *newsockp = newsock; - info->fd = newfd; - - return 0; - -out_sk_alloc_fail: - fput(newfile); -out_sock_alloc_fail: - put_unused_fd(newfd); -out_fd_fail: - sock_release(newsock); -out: - return err; + return sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name); } static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) @@ -1708,17 +1695,25 @@ static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } case SIOCKCMCLONE: { struct kcm_clone info; - struct socket *newsock = NULL; - - err = kcm_clone(sock, &info, &newsock); - if (!err) { - if (copy_to_user((void __user *)arg, &info, - sizeof(info))) { - err = -EFAULT; - sys_close(info.fd); - } - } + struct file *file; + info.fd = get_unused_fd_flags(0); + if (unlikely(info.fd < 0)) + return info.fd; + + file = kcm_clone(sock); + if (IS_ERR(file)) { + put_unused_fd(info.fd); + return PTR_ERR(file); + } + if (copy_to_user((void __user *)arg, &info, + sizeof(info))) { + put_unused_fd(info.fd); + fput(file); + return -EFAULT; + } + fd_install(info.fd, file); + err = 0; break; } default: diff --git a/net/key/af_key.c b/net/key/af_key.c index 3dffb892d52c..7e2e7188e7f4 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -401,6 +401,11 @@ static int verify_address_len(const void *p) #endif int len; + if (sp->sadb_address_len < + DIV_ROUND_UP(sizeof(*sp) + offsetofend(typeof(*addr), sa_family), + sizeof(uint64_t))) + return -EINVAL; + switch (addr->sa_family) { case AF_INET: len = DIV_ROUND_UP(sizeof(*sp) + sizeof(*sin), sizeof(uint64_t)); @@ -511,6 +516,9 @@ static int parse_exthdrs(struct sk_buff *skb, const struct sadb_msg *hdr, void * uint16_t ext_type; int ext_len; + if (len < sizeof(*ehdr)) + return -EINVAL; + ext_len = ehdr->sadb_ext_len; ext_len *= sizeof(uint64_t); ext_type = ehdr->sadb_ext_type; @@ -2194,8 +2202,10 @@ static int key_notify_policy(struct xfrm_policy *xp, int dir, const struct km_ev return PTR_ERR(out_skb); err = pfkey_xfrm_policy2msg(out_skb, xp, dir); - if (err < 0) + if (err < 0) { + kfree_skb(out_skb); return err; + } out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = PF_KEY_V2; diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 115918ad8eca..194a7483bb93 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -662,10 +662,9 @@ discard: * |x|S|x|x|x|x|x|x| Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * - * Cookie value, sublayer format and offset (pad) are negotiated with - * the peer when the session is set up. Unlike L2TPv2, we do not need - * to parse the packet header to determine if optional fields are - * present. + * Cookie value and sublayer format are negotiated with the peer when + * the session is set up. Unlike L2TPv2, we do not need to parse the + * packet header to determine if optional fields are present. * * Caller must already have parsed the frame and determined that it is * a data (not control) frame before coming here. Fields up to the @@ -731,11 +730,9 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, "%s: recv data ns=%u, session nr=%u\n", session->name, ns, session->nr); } + ptr += 4; } - /* Advance past L2-specific header, if present */ - ptr += session->l2specific_len; - if (L2TP_SKB_CB(skb)->has_seq) { /* Received a packet with sequence numbers. If we're the LNS, * check if we sre sending sequence numbers and if not, @@ -780,10 +777,8 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, } } - /* Session data offset is handled differently for L2TPv2 and - * L2TPv3. For L2TPv2, there is an optional 16-bit value in - * the header. For L2TPv3, the offset is negotiated using AVPs - * in the session setup control protocol. + /* Session data offset is defined only for L2TPv2 and is + * indicated by an optional 16-bit value in the header. */ if (tunnel->version == L2TP_HDR_VER_2) { /* If offset bit set, skip it. */ @@ -791,8 +786,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, offset = ntohs(*(__be16 *)ptr); ptr += 2 + offset; } - } else - ptr += session->offset; + } offset = ptr - optr; if (!pskb_may_pull(skb, offset)) @@ -1052,24 +1046,21 @@ static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf) memcpy(bufp, &session->cookie[0], session->cookie_len); bufp += session->cookie_len; } - if (session->l2specific_len) { - if (session->l2specific_type == L2TP_L2SPECTYPE_DEFAULT) { - u32 l2h = 0; - if (session->send_seq) { - l2h = 0x40000000 | session->ns; - session->ns++; - session->ns &= 0xffffff; - l2tp_dbg(session, L2TP_MSG_SEQ, - "%s: updated ns to %u\n", - session->name, session->ns); - } + if (session->l2specific_type == L2TP_L2SPECTYPE_DEFAULT) { + u32 l2h = 0; - *((__be32 *) bufp) = htonl(l2h); + if (session->send_seq) { + l2h = 0x40000000 | session->ns; + session->ns++; + session->ns &= 0xffffff; + l2tp_dbg(session, L2TP_MSG_SEQ, + "%s: updated ns to %u\n", + session->name, session->ns); } - bufp += session->l2specific_len; + + *((__be32 *)bufp) = htonl(l2h); + bufp += 4; } - if (session->offset) - bufp += session->offset; return bufp - optr; } @@ -1725,7 +1716,7 @@ int l2tp_session_delete(struct l2tp_session *session) EXPORT_SYMBOL_GPL(l2tp_session_delete); /* We come here whenever a session's send_seq, cookie_len or - * l2specific_len parameters are set. + * l2specific_type parameters are set. */ void l2tp_session_set_header_len(struct l2tp_session *session, int version) { @@ -1734,7 +1725,8 @@ void l2tp_session_set_header_len(struct l2tp_session *session, int version) if (session->send_seq) session->hdr_len += 4; } else { - session->hdr_len = 4 + session->cookie_len + session->l2specific_len + session->offset; + session->hdr_len = 4 + session->cookie_len; + session->hdr_len += l2tp_get_l2specific_len(session); if (session->tunnel->encap == L2TP_ENCAPTYPE_UDP) session->hdr_len += 4; } @@ -1784,9 +1776,7 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn session->recv_seq = cfg->recv_seq; session->lns_mode = cfg->lns_mode; session->reorder_timeout = cfg->reorder_timeout; - session->offset = cfg->offset; session->l2specific_type = cfg->l2specific_type; - session->l2specific_len = cfg->l2specific_len; session->cookie_len = cfg->cookie_len; memcpy(&session->cookie[0], &cfg->cookie[0], cfg->cookie_len); session->peer_cookie_len = cfg->peer_cookie_len; diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 9534e16965cc..9bbee90e9963 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -59,8 +59,6 @@ struct l2tp_session_cfg { int debug; /* bitmask of debug message * categories */ u16 vlan_id; /* VLAN pseudowire only */ - u16 offset; /* offset to payload */ - u16 l2specific_len; /* Layer 2 specific length */ u16 l2specific_type; /* Layer 2 specific type */ u8 cookie[8]; /* optional cookie */ int cookie_len; /* 0, 4 or 8 bytes */ @@ -86,9 +84,6 @@ struct l2tp_session { int cookie_len; u8 peer_cookie[8]; int peer_cookie_len; - u16 offset; /* offset from end of L2TP header - to beginning of data */ - u16 l2specific_len; u16 l2specific_type; u16 hdr_len; u32 nr; /* session NR state (receive) */ @@ -305,6 +300,17 @@ static inline void l2tp_session_dec_refcount(struct l2tp_session *session) l2tp_session_free(session); } +static inline int l2tp_get_l2specific_len(struct l2tp_session *session) +{ + switch (session->l2specific_type) { + case L2TP_L2SPECTYPE_DEFAULT: + return 4; + case L2TP_L2SPECTYPE_NONE: + default: + return 0; + } +} + #define l2tp_printk(ptr, type, func, fmt, ...) \ do { \ if (((ptr)->debug) & (type)) \ diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c index eb69411bcb47..72e713da4733 100644 --- a/net/l2tp/l2tp_debugfs.c +++ b/net/l2tp/l2tp_debugfs.c @@ -180,8 +180,8 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v) session->lns_mode ? "LNS" : "LAC", session->debug, jiffies_to_msecs(session->reorder_timeout)); - seq_printf(m, " offset %hu l2specific %hu/%hu\n", - session->offset, session->l2specific_type, session->l2specific_len); + seq_printf(m, " offset 0 l2specific %hu/%hu\n", + session->l2specific_type, l2tp_get_l2specific_len(session)); if (session->cookie_len) { seq_printf(m, " cookie %02x%02x%02x%02x", session->cookie[0], session->cookie[1], diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index a1f24fb2be98..e7ea9c4b89ff 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -547,19 +547,19 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf } if (tunnel->version > 2) { - if (info->attrs[L2TP_ATTR_OFFSET]) - cfg.offset = nla_get_u16(info->attrs[L2TP_ATTR_OFFSET]); - if (info->attrs[L2TP_ATTR_DATA_SEQ]) cfg.data_seq = nla_get_u8(info->attrs[L2TP_ATTR_DATA_SEQ]); - cfg.l2specific_type = L2TP_L2SPECTYPE_DEFAULT; - if (info->attrs[L2TP_ATTR_L2SPEC_TYPE]) + if (info->attrs[L2TP_ATTR_L2SPEC_TYPE]) { cfg.l2specific_type = nla_get_u8(info->attrs[L2TP_ATTR_L2SPEC_TYPE]); - - cfg.l2specific_len = 4; - if (info->attrs[L2TP_ATTR_L2SPEC_LEN]) - cfg.l2specific_len = nla_get_u8(info->attrs[L2TP_ATTR_L2SPEC_LEN]); + if (cfg.l2specific_type != L2TP_L2SPECTYPE_DEFAULT && + cfg.l2specific_type != L2TP_L2SPECTYPE_NONE) { + ret = -EINVAL; + goto out_tunnel; + } + } else { + cfg.l2specific_type = L2TP_L2SPECTYPE_DEFAULT; + } if (info->attrs[L2TP_ATTR_COOKIE]) { u16 len = nla_len(info->attrs[L2TP_ATTR_COOKIE]); @@ -620,27 +620,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf goto out_tunnel; } - /* Check that pseudowire-specific params are present */ - switch (cfg.pw_type) { - case L2TP_PWTYPE_NONE: - break; - case L2TP_PWTYPE_ETH_VLAN: - if (!info->attrs[L2TP_ATTR_VLAN_ID]) { - ret = -EINVAL; - goto out_tunnel; - } - break; - case L2TP_PWTYPE_ETH: - break; - case L2TP_PWTYPE_PPP: - case L2TP_PWTYPE_PPP_AC: - break; - case L2TP_PWTYPE_IP: - default: - ret = -EPROTONOSUPPORT; - break; - } - ret = l2tp_nl_cmd_ops[cfg.pw_type]->session_create(net, tunnel, session_id, peer_session_id, diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index b412fc3351dc..59f246d7b290 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1734,7 +1734,6 @@ static int pppol2tp_proc_open(struct inode *inode, struct file *file) } static const struct file_operations pppol2tp_proc_fops = { - .owner = THIS_MODULE, .open = pppol2tp_proc_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/llc/llc_proc.c b/net/llc/llc_proc.c index 29c509c54bb2..66821e8a2b7a 100644 --- a/net/llc/llc_proc.c +++ b/net/llc/llc_proc.c @@ -225,7 +225,6 @@ static int llc_seq_core_open(struct inode *inode, struct file *file) } static const struct file_operations llc_seq_socket_fops = { - .owner = THIS_MODULE, .open = llc_seq_socket_open, .read = seq_read, .llseek = seq_lseek, @@ -233,7 +232,6 @@ static const struct file_operations llc_seq_socket_fops = { }; static const struct file_operations llc_seq_core_fops = { - .owner = THIS_MODULE, .open = llc_seq_core_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index d444752dbf40..a8b1616cec41 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -153,27 +153,16 @@ EXPORT_SYMBOL(ieee80211_stop_rx_ba_session); */ static void sta_rx_agg_session_timer_expired(struct timer_list *t) { - struct tid_ampdu_rx *tid_rx_timer = - from_timer(tid_rx_timer, t, session_timer); - struct sta_info *sta = tid_rx_timer->sta; - u8 tid = tid_rx_timer->tid; - struct tid_ampdu_rx *tid_rx; + struct tid_ampdu_rx *tid_rx = from_timer(tid_rx, t, session_timer); + struct sta_info *sta = tid_rx->sta; + u8 tid = tid_rx->tid; unsigned long timeout; - rcu_read_lock(); - tid_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]); - if (!tid_rx) { - rcu_read_unlock(); - return; - } - timeout = tid_rx->last_rx + TU_TO_JIFFIES(tid_rx->timeout); if (time_is_after_jiffies(timeout)) { mod_timer(&tid_rx->session_timer, timeout); - rcu_read_unlock(); return; } - rcu_read_unlock(); ht_dbg(sta->sdata, "RX session timer expired on %pM tid %d\n", sta->sta.addr, tid); @@ -415,10 +404,11 @@ end: timeout); } -void __ieee80211_start_rx_ba_session(struct sta_info *sta, - u8 dialog_token, u16 timeout, - u16 start_seq_num, u16 ba_policy, u16 tid, - u16 buf_size, bool tx, bool auto_seq) +static void __ieee80211_start_rx_ba_session(struct sta_info *sta, + u8 dialog_token, u16 timeout, + u16 start_seq_num, u16 ba_policy, + u16 tid, u16 buf_size, bool tx, + bool auto_seq) { mutex_lock(&sta->ampdu_mlme.mtx); ___ieee80211_start_rx_ba_session(sta, dialog_token, timeout, diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 5f8ab5be369f..595c662a61e8 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -392,7 +392,8 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, * telling the driver. New packets will not go through since * the aggregation session is no longer OPERATIONAL. */ - synchronize_net(); + if (!local->in_reconfig) + synchronize_net(); tid_tx->stop_initiator = reason == AGG_STOP_PEER_REQUEST ? WLAN_BACK_RECIPIENT : @@ -429,18 +430,12 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, */ static void sta_addba_resp_timer_expired(struct timer_list *t) { - struct tid_ampdu_tx *tid_tx_timer = - from_timer(tid_tx_timer, t, addba_resp_timer); - struct sta_info *sta = tid_tx_timer->sta; - u8 tid = tid_tx_timer->tid; - struct tid_ampdu_tx *tid_tx; + struct tid_ampdu_tx *tid_tx = from_timer(tid_tx, t, addba_resp_timer); + struct sta_info *sta = tid_tx->sta; + u8 tid = tid_tx->tid; /* check if the TID waits for addBA response */ - rcu_read_lock(); - tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); - if (!tid_tx || - test_bit(HT_AGG_STATE_RESPONSE_RECEIVED, &tid_tx->state)) { - rcu_read_unlock(); + if (test_bit(HT_AGG_STATE_RESPONSE_RECEIVED, &tid_tx->state)) { ht_dbg(sta->sdata, "timer expired on %pM tid %d not expecting addBA response\n", sta->sta.addr, tid); @@ -451,7 +446,6 @@ static void sta_addba_resp_timer_expired(struct timer_list *t) sta->sta.addr, tid); ieee80211_stop_tx_ba_session(&sta->sta, tid); - rcu_read_unlock(); } void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) @@ -529,29 +523,21 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) */ static void sta_tx_agg_session_timer_expired(struct timer_list *t) { - struct tid_ampdu_tx *tid_tx_timer = - from_timer(tid_tx_timer, t, session_timer); - struct sta_info *sta = tid_tx_timer->sta; - u8 tid = tid_tx_timer->tid; - struct tid_ampdu_tx *tid_tx; + struct tid_ampdu_tx *tid_tx = from_timer(tid_tx, t, session_timer); + struct sta_info *sta = tid_tx->sta; + u8 tid = tid_tx->tid; unsigned long timeout; - rcu_read_lock(); - tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); - if (!tid_tx || test_bit(HT_AGG_STATE_STOPPING, &tid_tx->state)) { - rcu_read_unlock(); + if (test_bit(HT_AGG_STATE_STOPPING, &tid_tx->state)) { return; } timeout = tid_tx->last_tx + TU_TO_JIFFIES(tid_tx->timeout); if (time_is_after_jiffies(timeout)) { mod_timer(&tid_tx->session_timer, timeout); - rcu_read_unlock(); return; } - rcu_read_unlock(); - ht_dbg(sta->sdata, "tx session timer expired on %pM tid %d\n", sta->sta.addr, tid); diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index fb15d3b97cb2..46028e12e216 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -573,10 +573,12 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, case WLAN_CIPHER_SUITE_BIP_CMAC_256: BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) != offsetof(typeof(kseq), aes_cmac)); + /* fall through */ case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) != offsetof(typeof(kseq), aes_gmac)); + /* fall through */ case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) != @@ -2205,6 +2207,7 @@ static int ieee80211_scan(struct wiphy *wiphy, * for now fall through to allow scanning only when * beaconing hasn't been configured yet */ + /* fall through */ case NL80211_IFTYPE_AP: /* * If the scan has been forced (and the driver supports @@ -2373,10 +2376,17 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy, struct ieee80211_sub_if_data *sdata; enum nl80211_tx_power_setting txp_type = type; bool update_txp_type = false; + bool has_monitor = false; if (wdev) { sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { + sdata = rtnl_dereference(local->monitor_sdata); + if (!sdata) + return -EOPNOTSUPP; + } + switch (type) { case NL80211_TX_POWER_AUTOMATIC: sdata->user_power_level = IEEE80211_UNSET_POWER_LEVEL; @@ -2415,15 +2425,34 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy, mutex_lock(&local->iflist_mtx); list_for_each_entry(sdata, &local->interfaces, list) { + if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { + has_monitor = true; + continue; + } sdata->user_power_level = local->user_power_level; if (txp_type != sdata->vif.bss_conf.txpower_type) update_txp_type = true; sdata->vif.bss_conf.txpower_type = txp_type; } - list_for_each_entry(sdata, &local->interfaces, list) + list_for_each_entry(sdata, &local->interfaces, list) { + if (sdata->vif.type == NL80211_IFTYPE_MONITOR) + continue; ieee80211_recalc_txpower(sdata, update_txp_type); + } mutex_unlock(&local->iflist_mtx); + if (has_monitor) { + sdata = rtnl_dereference(local->monitor_sdata); + if (sdata) { + sdata->user_power_level = local->user_power_level; + if (txp_type != sdata->vif.bss_conf.txpower_type) + update_txp_type = true; + sdata->vif.bss_conf.txpower_type = txp_type; + + ieee80211_recalc_txpower(sdata, update_txp_type); + } + } + return 0; } diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 5fae001f286c..1f466d12a6bc 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -211,6 +211,7 @@ static const char *hw_flag_names[] = { FLAG(TX_FRAG_LIST), FLAG(REPORTS_LOW_ACK), FLAG(SUPPORTS_TX_FRAG), + FLAG(SUPPORTS_TDLS_BUFFER_STA), #undef FLAG }; diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index b15412c21ac9..444ea8d127fe 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -420,7 +420,7 @@ static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf, default: p += scnprintf(p, sizeof(buf) + buf - p, "\t\tMAX-MPDU-UNKNOWN\n"); - }; + } switch (vhtc->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) { case 0: p += scnprintf(p, sizeof(buf) + buf - p, @@ -438,7 +438,7 @@ static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf, p += scnprintf(p, sizeof(buf) + buf - p, "\t\tUNKNOWN-MHZ: 0x%x\n", (vhtc->cap >> 2) & 0x3); - }; + } PFLAG(RXLDPC, "RXLDPC"); PFLAG(SHORT_GI_80, "SHORT-GI-80"); PFLAG(SHORT_GI_160, "SHORT-GI-160"); diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index c7f93fd9ca7a..4d82fe7d627c 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -165,7 +165,8 @@ static inline void drv_bss_info_changed(struct ieee80211_local *local, if (WARN_ON_ONCE(sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE || sdata->vif.type == NL80211_IFTYPE_NAN || (sdata->vif.type == NL80211_IFTYPE_MONITOR && - !sdata->vif.mu_mimo_owner))) + !sdata->vif.mu_mimo_owner && + !(changed & BSS_CHANGED_TXPOWER)))) return; if (!check_sdata_in_driver(sdata)) diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 41f5e48f8021..d7523530d3f8 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -291,13 +291,14 @@ void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, int i; mutex_lock(&sta->ampdu_mlme.mtx); - for (i = 0; i < IEEE80211_NUM_TIDS; i++) { - ___ieee80211_stop_tx_ba_session(sta, i, reason); + for (i = 0; i < IEEE80211_NUM_TIDS; i++) ___ieee80211_stop_rx_ba_session(sta, i, WLAN_BACK_RECIPIENT, WLAN_REASON_QSTA_LEAVE_QBSS, reason != AGG_STOP_DESTROY_STA && reason != AGG_STOP_PEER_REQUEST); - } + + for (i = 0; i < IEEE80211_NUM_TIDS; i++) + ___ieee80211_stop_tx_ba_session(sta, i, reason); mutex_unlock(&sta->ampdu_mlme.mtx); /* stopping might queue the work again - so cancel only afterwards */ @@ -491,6 +492,7 @@ int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, case IEEE80211_SMPS_AUTOMATIC: case IEEE80211_SMPS_NUM_MODES: WARN_ON(1); + /* fall through */ case IEEE80211_SMPS_OFF: action_frame->u.action.u.ht_smps.smps_control = WLAN_HT_SMPS_CONTROL_DISABLED; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 885d00b41911..26900025de2f 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1757,10 +1757,6 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, u16 initiator, u16 reason, bool stop); void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, u16 initiator, u16 reason, bool stop); -void __ieee80211_start_rx_ba_session(struct sta_info *sta, - u8 dialog_token, u16 timeout, - u16 start_seq_num, u16 ba_policy, u16 tid, - u16 buf_size, bool tx, bool auto_seq); void ___ieee80211_start_rx_ba_session(struct sta_info *sta, u8 dialog_token, u16 timeout, u16 start_seq_num, u16 ba_policy, u16 tid, diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 13b16f90e1cf..5fe01f82df12 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1474,7 +1474,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, break; case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: - BUG(); + WARN_ON(1); break; } @@ -1633,7 +1633,7 @@ static void ieee80211_assign_perm_addr(struct ieee80211_local *local, goto out_unlock; } } - /* otherwise fall through */ + /* fall through */ default: /* assign a new address if possible -- try n_addresses first */ for (i = 0; i < local->hw.wiphy->n_addresses; i++) { diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 938049395f90..aee05ec3f7ea 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -178,13 +178,17 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) if (!ret) { key->flags |= KEY_FLAG_UPLOADED_TO_HARDWARE; - if (!((key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC) || + if (!((key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC | + IEEE80211_KEY_FLAG_PUT_MIC_SPACE)) || (key->conf.flags & IEEE80211_KEY_FLAG_RESERVE_TAILROOM))) decrease_tailroom_need_count(sdata, 1); WARN_ON((key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE) && (key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)); + WARN_ON((key->conf.flags & IEEE80211_KEY_FLAG_PUT_MIC_SPACE) && + (key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC)); + return 0; } @@ -237,7 +241,8 @@ static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key) sta = key->sta; sdata = key->sdata; - if (!((key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC) || + if (!((key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC | + IEEE80211_KEY_FLAG_PUT_MIC_SPACE)) || (key->conf.flags & IEEE80211_KEY_FLAG_RESERVE_TAILROOM))) increment_tailroom_need_count(sdata); @@ -1104,7 +1109,8 @@ void ieee80211_remove_key(struct ieee80211_key_conf *keyconf) if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) { key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE; - if (!((key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC) || + if (!((key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC | + IEEE80211_KEY_FLAG_PUT_MIC_SPACE)) || (key->conf.flags & IEEE80211_KEY_FLAG_RESERVE_TAILROOM))) increment_tailroom_need_count(key->sdata); } diff --git a/net/mac80211/main.c b/net/mac80211/main.c index e054a2fd8d38..0785d04a80bc 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -263,6 +263,9 @@ static void ieee80211_restart_work(struct work_struct *work) flush_delayed_work(&local->roc_work); flush_work(&local->hw_roc_done); + /* wait for all packet processing to be done */ + synchronize_net(); + ieee80211_reconfig(local); rtnl_unlock(); } diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 5e27364e10ac..73ac607beb5d 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -989,8 +989,10 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata, switch (sdata->vif.bss_conf.chandef.width) { case NL80211_CHAN_WIDTH_20_NOHT: sta_flags |= IEEE80211_STA_DISABLE_HT; + /* fall through */ case NL80211_CHAN_WIDTH_20: sta_flags |= IEEE80211_STA_DISABLE_40MHZ; + /* fall through */ case NL80211_CHAN_WIDTH_40: sta_flags |= IEEE80211_STA_DISABLE_VHT; break; diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 4f7826d7b47c..35ad3983ae4b 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -797,7 +797,7 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata, struct mesh_path *mpath; u8 ttl, flags, hopcount; const u8 *orig_addr; - u32 orig_sn, metric, metric_txsta, interval; + u32 orig_sn, new_metric, orig_metric, last_hop_metric, interval; bool root_is_gate; ttl = rann->rann_ttl; @@ -808,7 +808,7 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata, interval = le32_to_cpu(rann->rann_interval); hopcount = rann->rann_hopcount; hopcount++; - metric = le32_to_cpu(rann->rann_metric); + orig_metric = le32_to_cpu(rann->rann_metric); /* Ignore our own RANNs */ if (ether_addr_equal(orig_addr, sdata->vif.addr)) @@ -825,7 +825,10 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata, return; } - metric_txsta = airtime_link_metric_get(local, sta); + last_hop_metric = airtime_link_metric_get(local, sta); + new_metric = orig_metric + last_hop_metric; + if (new_metric < orig_metric) + new_metric = MAX_METRIC; mpath = mesh_path_lookup(sdata, orig_addr); if (!mpath) { @@ -838,7 +841,7 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata, } if (!(SN_LT(mpath->sn, orig_sn)) && - !(mpath->sn == orig_sn && metric < mpath->rann_metric)) { + !(mpath->sn == orig_sn && new_metric < mpath->rann_metric)) { rcu_read_unlock(); return; } @@ -856,7 +859,7 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata, } mpath->sn = orig_sn; - mpath->rann_metric = metric + metric_txsta; + mpath->rann_metric = new_metric; mpath->is_root = true; /* Recording RANNs sender address to send individually * addressed PREQs destined for root mesh STA */ @@ -876,7 +879,7 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata, mesh_path_sel_frame_tx(MPATH_RANN, flags, orig_addr, orig_sn, 0, NULL, 0, broadcast_addr, hopcount, ttl, interval, - metric + metric_txsta, 0, sdata); + new_metric, 0, sdata); } rcu_read_unlock(); @@ -1247,6 +1250,7 @@ void mesh_path_tx_root_frame(struct ieee80211_sub_if_data *sdata) break; case IEEE80211_PROACTIVE_PREQ_WITH_PREP: flags |= IEEE80211_PREQ_PROACTIVE_PREP_FLAG; + /* fall through */ case IEEE80211_PROACTIVE_PREQ_NO_PREP: interval = ifmsh->mshcfg.dot11MeshHWMPactivePathToRootTimeout; target_flags |= IEEE80211_PREQ_TO_FLAG | diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index 86c8dfef56a4..a5125624a76d 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -257,9 +257,7 @@ __mesh_path_lookup_by_idx(struct mesh_table *tbl, int idx) if (ret) return NULL; - ret = rhashtable_walk_start(&iter); - if (ret && ret != -EAGAIN) - goto err; + rhashtable_walk_start(&iter); while ((mpath = rhashtable_walk_next(&iter))) { if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN) @@ -269,7 +267,6 @@ __mesh_path_lookup_by_idx(struct mesh_table *tbl, int idx) if (i++ == idx) break; } -err: rhashtable_walk_stop(&iter); rhashtable_walk_exit(&iter); @@ -513,9 +510,7 @@ void mesh_plink_broken(struct sta_info *sta) if (ret) return; - ret = rhashtable_walk_start(&iter); - if (ret && ret != -EAGAIN) - goto out; + rhashtable_walk_start(&iter); while ((mpath = rhashtable_walk_next(&iter))) { if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN) @@ -535,7 +530,6 @@ void mesh_plink_broken(struct sta_info *sta) WLAN_REASON_MESH_PATH_DEST_UNREACHABLE, bcast); } } -out: rhashtable_walk_stop(&iter); rhashtable_walk_exit(&iter); } @@ -584,9 +578,7 @@ void mesh_path_flush_by_nexthop(struct sta_info *sta) if (ret) return; - ret = rhashtable_walk_start(&iter); - if (ret && ret != -EAGAIN) - goto out; + rhashtable_walk_start(&iter); while ((mpath = rhashtable_walk_next(&iter))) { if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN) @@ -597,7 +589,7 @@ void mesh_path_flush_by_nexthop(struct sta_info *sta) if (rcu_access_pointer(mpath->next_hop) == sta) __mesh_path_del(tbl, mpath); } -out: + rhashtable_walk_stop(&iter); rhashtable_walk_exit(&iter); } @@ -614,9 +606,7 @@ static void mpp_flush_by_proxy(struct ieee80211_sub_if_data *sdata, if (ret) return; - ret = rhashtable_walk_start(&iter); - if (ret && ret != -EAGAIN) - goto out; + rhashtable_walk_start(&iter); while ((mpath = rhashtable_walk_next(&iter))) { if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN) @@ -627,7 +617,7 @@ static void mpp_flush_by_proxy(struct ieee80211_sub_if_data *sdata, if (ether_addr_equal(mpath->mpp, proxy)) __mesh_path_del(tbl, mpath); } -out: + rhashtable_walk_stop(&iter); rhashtable_walk_exit(&iter); } @@ -642,9 +632,7 @@ static void table_flush_by_iface(struct mesh_table *tbl) if (ret) return; - ret = rhashtable_walk_start(&iter); - if (ret && ret != -EAGAIN) - goto out; + rhashtable_walk_start(&iter); while ((mpath = rhashtable_walk_next(&iter))) { if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN) @@ -653,7 +641,7 @@ static void table_flush_by_iface(struct mesh_table *tbl) break; __mesh_path_del(tbl, mpath); } -out: + rhashtable_walk_stop(&iter); rhashtable_walk_exit(&iter); } @@ -873,9 +861,7 @@ void mesh_path_tbl_expire(struct ieee80211_sub_if_data *sdata, if (ret) return; - ret = rhashtable_walk_start(&iter); - if (ret && ret != -EAGAIN) - goto out; + rhashtable_walk_start(&iter); while ((mpath = rhashtable_walk_next(&iter))) { if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN) @@ -887,7 +873,7 @@ void mesh_path_tbl_expire(struct ieee80211_sub_if_data *sdata, time_after(jiffies, mpath->exp_time + MESH_PATH_EXPIRE)) __mesh_path_del(tbl, mpath); } -out: + rhashtable_walk_stop(&iter); rhashtable_walk_exit(&iter); } diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index e2d00cce3c17..0f6c9ca59062 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -672,7 +672,7 @@ void mesh_plink_timer(struct timer_list *t) break; } reason = WLAN_REASON_MESH_MAX_RETRIES; - /* fall through on else */ + /* fall through */ case NL80211_PLINK_CNF_RCVD: /* confirm timer */ if (!reason) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 04460440d731..39b660b9a908 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -473,6 +473,7 @@ static void ieee80211_add_ht_ie(struct ieee80211_sub_if_data *sdata, case IEEE80211_SMPS_AUTOMATIC: case IEEE80211_SMPS_NUM_MODES: WARN_ON(1); + /* fall through */ case IEEE80211_SMPS_OFF: cap |= WLAN_HT_CAP_SM_PS_DISABLED << IEEE80211_HT_CAP_SM_PS_SHIFT; @@ -895,7 +896,7 @@ void ieee80211_send_nullfunc(struct ieee80211_local *local, struct ieee80211_hdr_3addr *nullfunc; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif); + skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif, true); if (!skb) return; @@ -2861,10 +2862,11 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, aid = le16_to_cpu(mgmt->u.assoc_resp.aid); capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info); - if ((aid & (BIT(15) | BIT(14))) != (BIT(15) | BIT(14))) - sdata_info(sdata, "invalid AID value 0x%x; bits 15:14 not set\n", - aid); - aid &= ~(BIT(15) | BIT(14)); + /* + * The 5 MSB of the AID field are reserved + * (802.11-2016 9.4.1.8 AID field) + */ + aid &= 0x7ff; ifmgd->broken_ap = false; diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index faf4f6055000..f1d40b6645ff 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -801,14 +801,14 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, case NL80211_IFTYPE_ADHOC: if (!sdata->vif.bss_conf.ibss_joined) need_offchan = true; - /* fall through */ #ifdef CONFIG_MAC80211_MESH + /* fall through */ case NL80211_IFTYPE_MESH_POINT: if (ieee80211_vif_is_mesh(&sdata->vif) && !sdata->u.mesh.mesh_id_len) need_offchan = true; - /* fall through */ #endif + /* fall through */ case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_P2P_GO: diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 70e9d2ca8bbe..fd580614085b 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1607,23 +1607,16 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) /* * Change STA power saving mode only at the end of a frame - * exchange sequence. + * exchange sequence, and only for a data or management + * frame as specified in IEEE 802.11-2016 11.2.3.2 */ if (!ieee80211_hw_check(&sta->local->hw, AP_LINK_PS) && !ieee80211_has_morefrags(hdr->frame_control) && - !ieee80211_is_back_req(hdr->frame_control) && + (ieee80211_is_mgmt(hdr->frame_control) || + ieee80211_is_data(hdr->frame_control)) && !(status->rx_flags & IEEE80211_RX_DEFERRED_RELEASE) && (rx->sdata->vif.type == NL80211_IFTYPE_AP || - rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) && - /* - * PM bit is only checked in frames where it isn't reserved, - * in AP mode it's reserved in non-bufferable management frames - * (cf. IEEE 802.11-2012 8.2.4.1.7 Power Management field) - * BAR frames should be ignored as specified in - * IEEE 802.11-2012 10.2.1.2. - */ - (!ieee80211_is_mgmt(hdr->frame_control) || - ieee80211_is_bufferable_mmpdu(hdr->frame_control))) { + rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN)) { if (test_sta_flag(sta, WLAN_STA_PS_STA)) { if (!ieee80211_has_pm(hdr->frame_control)) sta_ps_end(sta); @@ -3632,6 +3625,8 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) } return true; case NL80211_IFTYPE_MESH_POINT: + if (ether_addr_equal(sdata->vif.addr, hdr->addr2)) + return false; if (multicast) return true; return ether_addr_equal(sdata->vif.addr, hdr->addr1); diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 91093d4a2f84..5cd5e6e5834e 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -47,6 +47,8 @@ static void ieee80211_tdls_add_ext_capab(struct ieee80211_sub_if_data *sdata, NL80211_FEATURE_TDLS_CHANNEL_SWITCH; bool wider_band = ieee80211_hw_check(&local->hw, TDLS_WIDER_BW) && !ifmgd->tdls_wider_bw_prohibited; + bool buffer_sta = ieee80211_hw_check(&local->hw, + SUPPORTS_TDLS_BUFFER_STA); struct ieee80211_supported_band *sband = ieee80211_get_sband(sdata); bool vht = sband && sband->vht_cap.vht_supported; u8 *pos = skb_put(skb, 10); @@ -56,7 +58,8 @@ static void ieee80211_tdls_add_ext_capab(struct ieee80211_sub_if_data *sdata, *pos++ = 0x0; *pos++ = 0x0; *pos++ = 0x0; - *pos++ = chan_switch ? WLAN_EXT_CAPA4_TDLS_CHAN_SWITCH : 0; + *pos++ = (chan_switch ? WLAN_EXT_CAPA4_TDLS_CHAN_SWITCH : 0) | + (buffer_sta ? WLAN_EXT_CAPA4_TDLS_BUFFER_STA : 0); *pos++ = WLAN_EXT_CAPA5_TDLS_ENABLED; *pos++ = 0; *pos++ = 0; @@ -236,6 +239,7 @@ static enum ieee80211_ac_numbers ieee80211_ac_from_wmm(int ac) switch (ac) { default: WARN_ON_ONCE(1); + /* fall through */ case 0: return IEEE80211_AC_BE; case 1: diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 7b8154474b9e..25904af38839 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2922,7 +2922,9 @@ void ieee80211_check_fast_xmit(struct sta_info *sta) gen_iv = build.key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV; iv_spc = build.key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE; - mmic = build.key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC; + mmic = build.key->conf.flags & + (IEEE80211_KEY_FLAG_GENERATE_MMIC | + IEEE80211_KEY_FLAG_PUT_MIC_SPACE); /* don't handle software crypto */ if (!(build.key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) @@ -4438,13 +4440,15 @@ struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw, EXPORT_SYMBOL(ieee80211_pspoll_get); struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw, - struct ieee80211_vif *vif) + struct ieee80211_vif *vif, + bool qos_ok) { struct ieee80211_hdr_3addr *nullfunc; struct ieee80211_sub_if_data *sdata; struct ieee80211_if_managed *ifmgd; struct ieee80211_local *local; struct sk_buff *skb; + bool qos = false; if (WARN_ON(vif->type != NL80211_IFTYPE_STATION)) return NULL; @@ -4453,7 +4457,17 @@ struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw, ifmgd = &sdata->u.mgd; local = sdata->local; - skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*nullfunc)); + if (qos_ok) { + struct sta_info *sta; + + rcu_read_lock(); + sta = sta_info_get(sdata, ifmgd->bssid); + qos = sta && sta->sta.wme; + rcu_read_unlock(); + } + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + + sizeof(*nullfunc) + 2); if (!skb) return NULL; @@ -4463,6 +4477,19 @@ struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw, nullfunc->frame_control = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC | IEEE80211_FCTL_TODS); + if (qos) { + __le16 qos = cpu_to_le16(7); + + BUILD_BUG_ON((IEEE80211_STYPE_QOS_NULLFUNC | + IEEE80211_STYPE_NULLFUNC) != + IEEE80211_STYPE_QOS_NULLFUNC); + nullfunc->frame_control |= + cpu_to_le16(IEEE80211_STYPE_QOS_NULLFUNC); + skb->priority = 7; + skb_set_queue_mapping(skb, IEEE80211_AC_VO); + skb_put_data(skb, &qos, sizeof(qos)); + } + memcpy(nullfunc->addr1, ifmgd->bssid, ETH_ALEN); memcpy(nullfunc->addr2, vif->addr, ETH_ALEN); memcpy(nullfunc->addr3, ifmgd->bssid, ETH_ALEN); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index d57e5f6bd8b6..1f82191ce601 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -2110,15 +2110,6 @@ int ieee80211_reconfig(struct ieee80211_local *local) cfg80211_sched_scan_stopped_rtnl(local->hw.wiphy, 0); wake_up: - if (local->in_reconfig) { - local->in_reconfig = false; - barrier(); - - /* Restart deferred ROCs */ - mutex_lock(&local->mtx); - ieee80211_start_next_roc(local); - mutex_unlock(&local->mtx); - } if (local->monitors == local->open_count && local->monitors > 0) ieee80211_add_virtual_monitor(local); @@ -2146,6 +2137,16 @@ int ieee80211_reconfig(struct ieee80211_local *local) mutex_unlock(&local->sta_mtx); } + if (local->in_reconfig) { + local->in_reconfig = false; + barrier(); + + /* Restart deferred ROCs */ + mutex_lock(&local->mtx); + ieee80211_start_next_roc(local); + mutex_unlock(&local->mtx); + } + ieee80211_wake_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_SUSPEND, false); diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c index 3e3d3014e9ab..5f7c96368b11 100644 --- a/net/mac80211/wme.c +++ b/net/mac80211/wme.c @@ -165,6 +165,7 @@ u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, qos = sta->sta.wme; break; } + /* fall through */ case NL80211_IFTYPE_AP: ra = skb->data; break; diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index b58722d9de37..785056cb76f6 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -1,7 +1,7 @@ /* * Copyright 2002-2004, Instant802 Networks, Inc. * Copyright 2008, Jouni Malinen <j@w1.fi> - * Copyright (C) 2016 Intel Deutschland GmbH + * Copyright (C) 2016-2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -59,8 +59,9 @@ ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx) if (info->control.hw_key && (info->flags & IEEE80211_TX_CTL_DONTFRAG || ieee80211_hw_check(&tx->local->hw, SUPPORTS_TX_FRAG)) && - !(tx->key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC)) { - /* hwaccel - with no need for SW-generated MMIC */ + !(tx->key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC | + IEEE80211_KEY_FLAG_PUT_MIC_SPACE))) { + /* hwaccel - with no need for SW-generated MMIC or MIC space */ return TX_CONTINUE; } @@ -75,8 +76,15 @@ ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx) skb_tailroom(skb), tail)) return TX_DROP; - key = &tx->key->conf.key[NL80211_TKIP_DATA_OFFSET_TX_MIC_KEY]; mic = skb_put(skb, MICHAEL_MIC_LEN); + + if (tx->key->conf.flags & IEEE80211_KEY_FLAG_PUT_MIC_SPACE) { + /* Zeroed MIC can help with debug */ + memset(mic, 0, MICHAEL_MIC_LEN); + return TX_CONTINUE; + } + + key = &tx->key->conf.key[NL80211_TKIP_DATA_OFFSET_TX_MIC_KEY]; michael_mic(key, hdr, data, data_len, mic); if (unlikely(info->flags & IEEE80211_TX_INTFL_TKIP_MIC_FAILURE)) mic[0]++; diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 8ca9915befc8..5dce8336d33f 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -2510,12 +2510,15 @@ static int __init mpls_init(void) rtnl_af_register(&mpls_af_ops); - rtnl_register(PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, 0); - rtnl_register(PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, 0); - rtnl_register(PF_MPLS, RTM_GETROUTE, mpls_getroute, mpls_dump_routes, - 0); - rtnl_register(PF_MPLS, RTM_GETNETCONF, mpls_netconf_get_devconf, - mpls_netconf_dump_devconf, 0); + rtnl_register_module(THIS_MODULE, PF_MPLS, RTM_NEWROUTE, + mpls_rtm_newroute, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_MPLS, RTM_DELROUTE, + mpls_rtm_delroute, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_MPLS, RTM_GETROUTE, + mpls_getroute, mpls_dump_routes, 0); + rtnl_register_module(THIS_MODULE, PF_MPLS, RTM_GETNETCONF, + mpls_netconf_get_devconf, + mpls_netconf_dump_devconf, 0); err = ipgre_tunnel_encap_add_mpls_ops(); if (err) pr_err("Can't add mpls over gre tunnel ops\n"); diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c index 67e708e98ccf..e7b05de1e6d1 100644 --- a/net/ncsi/ncsi-aen.c +++ b/net/ncsi/ncsi-aen.c @@ -143,43 +143,14 @@ static int ncsi_aen_handler_hncdsc(struct ncsi_dev_priv *ndp, if (!nc) return -ENODEV; - /* If the channel is active one, we need reconfigure it */ spin_lock_irqsave(&nc->lock, flags); ncm = &nc->modes[NCSI_MODE_LINK]; hncdsc = (struct ncsi_aen_hncdsc_pkt *)h; ncm->data[3] = ntohl(hncdsc->status); - netdev_info(ndp->ndev.dev, "NCSI: HNCDSC AEN - channel %u state %s\n", - nc->id, ncm->data[3] & 0x3 ? "up" : "down"); - if (!list_empty(&nc->link) || - nc->state != NCSI_CHANNEL_ACTIVE) { - spin_unlock_irqrestore(&nc->lock, flags); - return 0; - } - - spin_unlock_irqrestore(&nc->lock, flags); - if (!(ndp->flags & NCSI_DEV_HWA) && !(ncm->data[3] & 0x1)) - ndp->flags |= NCSI_DEV_RESHUFFLE; - - /* If this channel is the active one and the link doesn't - * work, we have to choose another channel to be active one. - * The logic here is exactly similar to what we do when link - * is down on the active channel. - * - * On the other hand, we need configure it when host driver - * state on the active channel becomes ready. - */ - ncsi_stop_channel_monitor(nc); - - spin_lock_irqsave(&nc->lock, flags); - nc->state = (ncm->data[3] & 0x1) ? NCSI_CHANNEL_INACTIVE : - NCSI_CHANNEL_ACTIVE; spin_unlock_irqrestore(&nc->lock, flags); - - spin_lock_irqsave(&ndp->lock, flags); - list_add_tail_rcu(&nc->link, &ndp->channel_queue); - spin_unlock_irqrestore(&ndp->lock, flags); - - ncsi_process_next_channel(ndp); + netdev_printk(KERN_DEBUG, ndp->ndev.dev, + "NCSI: host driver %srunning on channel %u\n", + ncm->data[3] & 0x1 ? "" : "not ", nc->id); return 0; } diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index e4a13cc8a2e7..9019fa98003d 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -12,6 +12,12 @@ config NETFILTER_INGRESS config NETFILTER_NETLINK tristate +config NETFILTER_FAMILY_BRIDGE + bool + +config NETFILTER_FAMILY_ARP + bool + config NETFILTER_NETLINK_ACCT tristate "Netfilter NFACCT over NFNETLINK interface" depends on NETFILTER_ADVANCED @@ -62,6 +68,8 @@ config NF_LOG_NETDEV select NF_LOG_COMMON if NF_CONNTRACK +config NETFILTER_CONNCOUNT + tristate config NF_CONNTRACK_MARK bool 'Connection mark tracking support' @@ -497,6 +505,13 @@ config NFT_CT This option adds the "ct" expression that you can use to match connection tracking information such as the flow state. +config NFT_FLOW_OFFLOAD + depends on NF_CONNTRACK && NF_FLOW_TABLE + tristate "Netfilter nf_tables hardware flow offload module" + help + This option adds the "flow_offload" expression that you can use to + choose what flows are placed into the hardware. + config NFT_SET_RBTREE tristate "Netfilter nf_tables rbtree set module" help @@ -649,6 +664,23 @@ endif # NF_TABLES_NETDEV endif # NF_TABLES +config NF_FLOW_TABLE_INET + tristate "Netfilter flow table mixed IPv4/IPv6 module" + depends on NF_FLOW_TABLE_IPV4 && NF_FLOW_TABLE_IPV6 + select NF_FLOW_TABLE + help + This option adds the flow table mixed IPv4/IPv6 support. + + To compile it as a module, choose M here. + +config NF_FLOW_TABLE + tristate "Netfilter flow table module" + depends on NF_CONNTRACK && NF_TABLES + help + This option adds the flow table core infrastructure. + + To compile it as a module, choose M here. + config NETFILTER_XTABLES tristate "Netfilter Xtables support (required for ip_tables)" default m if NETFILTER_ADVANCED=n @@ -1120,6 +1152,7 @@ config NETFILTER_XT_MATCH_CONNLIMIT tristate '"connlimit" match support' depends on NF_CONNTRACK depends on NETFILTER_ADVANCED + select NETFILTER_CONNCOUNT ---help--- This match allows you to match against the number of parallel connections to a server per client IP address (or address block). diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index f78ed2470831..5d9b8b959e58 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o +netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o utils.o nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o @@ -67,6 +67,8 @@ obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o # SYNPROXY obj-$(CONFIG_NETFILTER_SYNPROXY) += nf_synproxy_core.o +obj-$(CONFIG_NETFILTER_CONNCOUNT) += nf_conncount.o + # generic packet duplication from netdev family obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o @@ -84,6 +86,7 @@ obj-$(CONFIG_NFT_META) += nft_meta.o obj-$(CONFIG_NFT_RT) += nft_rt.o obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o obj-$(CONFIG_NFT_CT) += nft_ct.o +obj-$(CONFIG_NFT_FLOW_OFFLOAD) += nft_flow_offload.o obj-$(CONFIG_NFT_LIMIT) += nft_limit.o obj-$(CONFIG_NFT_NAT) += nft_nat.o obj-$(CONFIG_NFT_OBJREF) += nft_objref.o @@ -107,6 +110,10 @@ obj-$(CONFIG_NFT_FIB_NETDEV) += nft_fib_netdev.o obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o +# flow table infrastructure +obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o +obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o + # generic X tables obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 52cd2901a097..0f6b8172fb9a 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -4,8 +4,7 @@ * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any * way. * - * Rusty Russell (C)2000 -- This code is GPL. - * Patrick McHardy (c) 2006-2012 + * This code is GPL. */ #include <linux/kernel.h> #include <linux/netfilter.h> @@ -28,34 +27,12 @@ #include "nf_internals.h" -static DEFINE_MUTEX(afinfo_mutex); - -const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly; -EXPORT_SYMBOL(nf_afinfo); const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly; EXPORT_SYMBOL_GPL(nf_ipv6_ops); DEFINE_PER_CPU(bool, nf_skb_duplicated); EXPORT_SYMBOL_GPL(nf_skb_duplicated); -int nf_register_afinfo(const struct nf_afinfo *afinfo) -{ - mutex_lock(&afinfo_mutex); - RCU_INIT_POINTER(nf_afinfo[afinfo->family], afinfo); - mutex_unlock(&afinfo_mutex); - return 0; -} -EXPORT_SYMBOL_GPL(nf_register_afinfo); - -void nf_unregister_afinfo(const struct nf_afinfo *afinfo) -{ - mutex_lock(&afinfo_mutex); - RCU_INIT_POINTER(nf_afinfo[afinfo->family], NULL); - mutex_unlock(&afinfo_mutex); - synchronize_rcu(); -} -EXPORT_SYMBOL_GPL(nf_unregister_afinfo); - #ifdef HAVE_JUMP_LABEL struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); @@ -74,7 +51,8 @@ static struct nf_hook_entries *allocate_hook_entries_size(u16 num) struct nf_hook_entries *e; size_t alloc = sizeof(*e) + sizeof(struct nf_hook_entry) * num + - sizeof(struct nf_hook_ops *) * num; + sizeof(struct nf_hook_ops *) * num + + sizeof(struct nf_hook_entries_rcu_head); if (num == 0) return NULL; @@ -85,6 +63,30 @@ static struct nf_hook_entries *allocate_hook_entries_size(u16 num) return e; } +static void __nf_hook_entries_free(struct rcu_head *h) +{ + struct nf_hook_entries_rcu_head *head; + + head = container_of(h, struct nf_hook_entries_rcu_head, head); + kvfree(head->allocation); +} + +static void nf_hook_entries_free(struct nf_hook_entries *e) +{ + struct nf_hook_entries_rcu_head *head; + struct nf_hook_ops **ops; + unsigned int num; + + if (!e) + return; + + num = e->num_hook_entries; + ops = nf_hook_entries_get_hook_ops(e); + head = (void *)&ops[num]; + head->allocation = e; + call_rcu(&head->head, __nf_hook_entries_free); +} + static unsigned int accept_all(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -135,6 +137,12 @@ nf_hook_entries_grow(const struct nf_hook_entries *old, ++i; continue; } + + if (reg->nat_hook && orig_ops[i]->nat_hook) { + kvfree(new); + return ERR_PTR(-EBUSY); + } + if (inserted || reg->priority > orig_ops[i]->priority) { new_ops[nhooks] = (void *)orig_ops[i]; new->hooks[nhooks] = old->hooks[i]; @@ -237,27 +245,61 @@ out_assign: return old; } -static struct nf_hook_entries __rcu **nf_hook_entry_head(struct net *net, const struct nf_hook_ops *reg) +static struct nf_hook_entries __rcu ** +nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum, + struct net_device *dev) { - if (reg->pf != NFPROTO_NETDEV) - return net->nf.hooks[reg->pf]+reg->hooknum; + switch (pf) { + case NFPROTO_NETDEV: + break; +#ifdef CONFIG_NETFILTER_FAMILY_ARP + case NFPROTO_ARP: + if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_arp) <= hooknum)) + return NULL; + return net->nf.hooks_arp + hooknum; +#endif +#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE + case NFPROTO_BRIDGE: + if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_bridge) <= hooknum)) + return NULL; + return net->nf.hooks_bridge + hooknum; +#endif + case NFPROTO_IPV4: + if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv4) <= hooknum)) + return NULL; + return net->nf.hooks_ipv4 + hooknum; + case NFPROTO_IPV6: + if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv6) <= hooknum)) + return NULL; + return net->nf.hooks_ipv6 + hooknum; +#if IS_ENABLED(CONFIG_DECNET) + case NFPROTO_DECNET: + if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_decnet) <= hooknum)) + return NULL; + return net->nf.hooks_decnet + hooknum; +#endif + default: + WARN_ON_ONCE(1); + return NULL; + } #ifdef CONFIG_NETFILTER_INGRESS - if (reg->hooknum == NF_NETDEV_INGRESS) { - if (reg->dev && dev_net(reg->dev) == net) - return ®->dev->nf_hooks_ingress; + if (hooknum == NF_NETDEV_INGRESS) { + if (dev && dev_net(dev) == net) + return &dev->nf_hooks_ingress; } #endif WARN_ON_ONCE(1); return NULL; } -int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) +static int __nf_register_net_hook(struct net *net, int pf, + const struct nf_hook_ops *reg) { struct nf_hook_entries *p, *new_hooks; struct nf_hook_entries __rcu **pp; - if (reg->pf == NFPROTO_NETDEV) { + if (pf == NFPROTO_NETDEV) { #ifndef CONFIG_NETFILTER_INGRESS if (reg->hooknum == NF_NETDEV_INGRESS) return -EOPNOTSUPP; @@ -267,7 +309,7 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) return -EINVAL; } - pp = nf_hook_entry_head(net, reg); + pp = nf_hook_entry_head(net, pf, reg->hooknum, reg->dev); if (!pp) return -EINVAL; @@ -285,21 +327,19 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) hooks_validate(new_hooks); #ifdef CONFIG_NETFILTER_INGRESS - if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) + if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) net_inc_ingress_queue(); #endif #ifdef HAVE_JUMP_LABEL - static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]); + static_key_slow_inc(&nf_hooks_needed[pf][reg->hooknum]); #endif - synchronize_net(); BUG_ON(p == new_hooks); - kvfree(p); + nf_hook_entries_free(p); return 0; } -EXPORT_SYMBOL(nf_register_net_hook); /* - * __nf_unregister_net_hook - remove a hook from blob + * nf_remove_net_hook - remove a hook from blob * * @oldp: current address of hook blob * @unreg: hook to unregister @@ -307,8 +347,8 @@ EXPORT_SYMBOL(nf_register_net_hook); * This cannot fail, hook unregistration must always succeed. * Therefore replace the to-be-removed hook with a dummy hook. */ -static void __nf_unregister_net_hook(struct nf_hook_entries *old, - const struct nf_hook_ops *unreg) +static void nf_remove_net_hook(struct nf_hook_entries *old, + const struct nf_hook_ops *unreg, int pf) { struct nf_hook_ops **orig_ops; bool found = false; @@ -326,24 +366,24 @@ static void __nf_unregister_net_hook(struct nf_hook_entries *old, if (found) { #ifdef CONFIG_NETFILTER_INGRESS - if (unreg->pf == NFPROTO_NETDEV && unreg->hooknum == NF_NETDEV_INGRESS) + if (pf == NFPROTO_NETDEV && unreg->hooknum == NF_NETDEV_INGRESS) net_dec_ingress_queue(); #endif #ifdef HAVE_JUMP_LABEL - static_key_slow_dec(&nf_hooks_needed[unreg->pf][unreg->hooknum]); + static_key_slow_dec(&nf_hooks_needed[pf][unreg->hooknum]); #endif } else { - WARN_ONCE(1, "hook not found, pf %d num %d", unreg->pf, unreg->hooknum); + WARN_ONCE(1, "hook not found, pf %d num %d", pf, unreg->hooknum); } } -void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) +static void __nf_unregister_net_hook(struct net *net, int pf, + const struct nf_hook_ops *reg) { struct nf_hook_entries __rcu **pp; struct nf_hook_entries *p; - unsigned int nfq; - pp = nf_hook_entry_head(net, reg); + pp = nf_hook_entry_head(net, pf, reg->hooknum, reg->dev); if (!pp) return; @@ -355,23 +395,52 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) return; } - __nf_unregister_net_hook(p, reg); + nf_remove_net_hook(p, reg, pf); p = __nf_hook_entries_try_shrink(pp); mutex_unlock(&nf_hook_mutex); if (!p) return; - synchronize_net(); + nf_queue_nf_hook_drop(net); + nf_hook_entries_free(p); +} - /* other cpu might still process nfqueue verdict that used reg */ - nfq = nf_queue_nf_hook_drop(net); - if (nfq) - synchronize_net(); - kvfree(p); +void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) +{ + if (reg->pf == NFPROTO_INET) { + __nf_unregister_net_hook(net, NFPROTO_IPV4, reg); + __nf_unregister_net_hook(net, NFPROTO_IPV6, reg); + } else { + __nf_unregister_net_hook(net, reg->pf, reg); + } } EXPORT_SYMBOL(nf_unregister_net_hook); +int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) +{ + int err; + + if (reg->pf == NFPROTO_INET) { + err = __nf_register_net_hook(net, NFPROTO_IPV4, reg); + if (err < 0) + return err; + + err = __nf_register_net_hook(net, NFPROTO_IPV6, reg); + if (err < 0) { + __nf_unregister_net_hook(net, NFPROTO_IPV4, reg); + return err; + } + } else { + err = __nf_register_net_hook(net, reg->pf, reg); + if (err < 0) + return err; + } + + return 0; +} +EXPORT_SYMBOL(nf_register_net_hook); + int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg, unsigned int n) { @@ -395,63 +464,10 @@ EXPORT_SYMBOL(nf_register_net_hooks); void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg, unsigned int hookcount) { - struct nf_hook_entries *to_free[16], *p; - struct nf_hook_entries __rcu **pp; - unsigned int i, j, n; - - mutex_lock(&nf_hook_mutex); - for (i = 0; i < hookcount; i++) { - pp = nf_hook_entry_head(net, ®[i]); - if (!pp) - continue; - - p = nf_entry_dereference(*pp); - if (WARN_ON_ONCE(!p)) - continue; - __nf_unregister_net_hook(p, ®[i]); - } - mutex_unlock(&nf_hook_mutex); - - do { - n = min_t(unsigned int, hookcount, ARRAY_SIZE(to_free)); - - mutex_lock(&nf_hook_mutex); - - for (i = 0, j = 0; i < hookcount && j < n; i++) { - pp = nf_hook_entry_head(net, ®[i]); - if (!pp) - continue; - - p = nf_entry_dereference(*pp); - if (!p) - continue; - - to_free[j] = __nf_hook_entries_try_shrink(pp); - if (to_free[j]) - ++j; - } - - mutex_unlock(&nf_hook_mutex); - - if (j) { - unsigned int nfq; - - synchronize_net(); - - /* need 2nd synchronize_net() if nfqueue is used, skb - * can get reinjected right before nf_queue_hook_drop() - */ - nfq = nf_queue_nf_hook_drop(net); - if (nfq) - synchronize_net(); - - for (i = 0; i < j; i++) - kvfree(to_free[i]); - } + unsigned int i; - reg += n; - hookcount -= n; - } while (hookcount > 0); + for (i = 0; i < hookcount; i++) + nf_unregister_net_hook(net, ®[i]); } EXPORT_SYMBOL(nf_unregister_net_hooks); @@ -569,14 +585,27 @@ void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *); EXPORT_SYMBOL(nf_nat_decode_session_hook); #endif -static int __net_init netfilter_net_init(struct net *net) +static void __net_init __netfilter_net_init(struct nf_hook_entries **e, int max) { - int i, h; + int h; - for (i = 0; i < ARRAY_SIZE(net->nf.hooks); i++) { - for (h = 0; h < NF_MAX_HOOKS; h++) - RCU_INIT_POINTER(net->nf.hooks[i][h], NULL); - } + for (h = 0; h < max; h++) + RCU_INIT_POINTER(e[h], NULL); +} + +static int __net_init netfilter_net_init(struct net *net) +{ + __netfilter_net_init(net->nf.hooks_ipv4, ARRAY_SIZE(net->nf.hooks_ipv4)); + __netfilter_net_init(net->nf.hooks_ipv6, ARRAY_SIZE(net->nf.hooks_ipv6)); +#ifdef CONFIG_NETFILTER_FAMILY_ARP + __netfilter_net_init(net->nf.hooks_arp, ARRAY_SIZE(net->nf.hooks_arp)); +#endif +#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE + __netfilter_net_init(net->nf.hooks_bridge, ARRAY_SIZE(net->nf.hooks_bridge)); +#endif +#if IS_ENABLED(CONFIG_DECNET) + __netfilter_net_init(net->nf.hooks_decnet, ARRAY_SIZE(net->nf.hooks_decnet)); +#endif #ifdef CONFIG_PROC_FS net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter", diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index 5ca18f07683b..257ca393e6f2 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -127,14 +127,7 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, if (ret <= 0) return ret; - if (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(x, set))) - return 0; - if (SET_WITH_COUNTER(set)) - ip_set_update_counter(ext_counter(x, set), ext, mext, flags); - if (SET_WITH_SKBINFO(set)) - ip_set_get_skbinfo(ext_skbinfo(x, set), ext, mext, flags); - return 1; + return ip_set_match_extensions(set, ext, mext, flags, x); } static int @@ -227,6 +220,7 @@ mtype_list(const struct ip_set *set, rcu_read_lock(); for (; cb->args[IPSET_CB_ARG0] < map->elements; cb->args[IPSET_CB_ARG0]++) { + cond_resched_rcu(); id = cb->args[IPSET_CB_ARG0]; x = get_ext(set, map, id); if (!test_bit(id, map->members) || diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index d8975a0b4282..488d6d05c65c 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -263,12 +263,8 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &last_ip); if (ret) return ret; - if (first_ip > last_ip) { - u32 tmp = first_ip; - - first_ip = last_ip; - last_ip = tmp; - } + if (first_ip > last_ip) + swap(first_ip, last_ip); } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 4c279fbd2d5d..c00b6a2e8e3c 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -337,12 +337,8 @@ bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[], ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &last_ip); if (ret) return ret; - if (first_ip > last_ip) { - u32 tmp = first_ip; - - first_ip = last_ip; - last_ip = tmp; - } + if (first_ip > last_ip) + swap(first_ip, last_ip); } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index 7f9bbd7c98b5..b561ca8b3659 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -238,12 +238,8 @@ bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[], first_port = ip_set_get_h16(tb[IPSET_ATTR_PORT]); last_port = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); - if (first_port > last_port) { - u16 tmp = first_port; - - first_port = last_port; - last_port = tmp; - } + if (first_port > last_port) + swap(first_port, last_port); elements = last_port - first_port + 1; set->dsize = ip_set_elem_len(set, tb, 0, 0); diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index cf84f7b37cd9..975a85a48d39 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -57,7 +57,7 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); /* When the nfnl mutex is held: */ #define ip_set_dereference(p) \ - rcu_dereference_protected(p, 1) + rcu_dereference_protected(p, lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET)) #define ip_set(inst, id) \ ip_set_dereference((inst)->ip_set_list)[id] @@ -472,6 +472,31 @@ ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set, } EXPORT_SYMBOL_GPL(ip_set_put_extensions); +bool +ip_set_match_extensions(struct ip_set *set, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags, void *data) +{ + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(data, set))) + return false; + if (SET_WITH_COUNTER(set)) { + struct ip_set_counter *counter = ext_counter(data, set); + + if (flags & IPSET_FLAG_MATCH_COUNTERS && + !(ip_set_match_counter(ip_set_get_packets(counter), + mext->packets, mext->packets_op) && + ip_set_match_counter(ip_set_get_bytes(counter), + mext->bytes, mext->bytes_op))) + return false; + ip_set_update_counter(counter, ext, flags); + } + if (SET_WITH_SKBINFO(set)) + ip_set_get_skbinfo(ext_skbinfo(data, set), + ext, mext, flags); + return true; +} +EXPORT_SYMBOL_GPL(ip_set_match_extensions); + /* Creating/destroying/renaming/swapping affect the existence and * the properties of a set. All of these can be executed from userspace * only and serialized by the nfnl mutex indirectly from nfnetlink. @@ -1386,11 +1411,9 @@ dump_last: goto next_set; if (set->variant->uref) set->variant->uref(set, cb, true); - /* Fall through and add elements */ + /* fall through */ default: - rcu_read_lock_bh(); ret = set->variant->list(set, skb, cb); - rcu_read_unlock_bh(); if (!cb->args[IPSET_CB_ARG0]) /* Set is done, proceed with next one */ goto next_set; @@ -2055,6 +2078,7 @@ ip_set_net_exit(struct net *net) inst->is_deleted = true; /* flag for ip_set_nfnl_put */ + nfnl_lock(NFNL_SUBSYS_IPSET); for (i = 0; i < inst->ip_set_max; i++) { set = ip_set(inst, i); if (set) { @@ -2062,6 +2086,7 @@ ip_set_net_exit(struct net *net) ip_set_destroy_set(set); } } + nfnl_unlock(NFNL_SUBSYS_IPSET); kfree(rcu_dereference_protected(inst->ip_set_list, 1)); } @@ -2097,7 +2122,6 @@ ip_set_init(void) return ret; } - pr_info("ip_set: protocol %u\n", IPSET_PROTOCOL); return 0; } @@ -2113,3 +2137,5 @@ ip_set_fini(void) module_init(ip_set_init); module_exit(ip_set_fini); + +MODULE_DESCRIPTION("ip_set: protocol " __stringify(IPSET_PROTOCOL)); diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index efffc8eabafe..bbad940c0137 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -917,12 +917,9 @@ static inline int mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext, struct ip_set_ext *mext, struct ip_set *set, u32 flags) { - if (SET_WITH_COUNTER(set)) - ip_set_update_counter(ext_counter(data, set), - ext, mext, flags); - if (SET_WITH_SKBINFO(set)) - ip_set_get_skbinfo(ext_skbinfo(data, set), - ext, mext, flags); + if (!ip_set_match_extensions(set, ext, mext, flags, data)) + return 0; + /* nomatch entries return -ENOTEMPTY */ return mtype_do_data_match(data); } @@ -941,9 +938,9 @@ mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d, struct mtype_elem *data; #if IPSET_NET_COUNT == 2 struct mtype_elem orig = *d; - int i, j = 0, k; + int ret, i, j = 0, k; #else - int i, j = 0; + int ret, i, j = 0; #endif u32 key, multi = 0; @@ -969,18 +966,13 @@ mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d, data = ahash_data(n, i, set->dsize); if (!mtype_data_equal(data, d, &multi)) continue; - if (SET_WITH_TIMEOUT(set)) { - if (!ip_set_timeout_expired( - ext_timeout(data, set))) - return mtype_data_match(data, ext, - mext, set, - flags); + ret = mtype_data_match(data, ext, mext, set, flags); + if (ret != 0) + return ret; #ifdef IP_SET_HASH_WITH_MULTI - multi = 0; + /* No match, reset multiple match flag */ + multi = 0; #endif - } else - return mtype_data_match(data, ext, - mext, set, flags); } #if IPSET_NET_COUNT == 2 } @@ -1027,12 +1019,11 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, if (!test_bit(i, n->used)) continue; data = ahash_data(n, i, set->dsize); - if (mtype_data_equal(data, d, &multi) && - !(SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(data, set)))) { - ret = mtype_data_match(data, ext, mext, set, flags); + if (!mtype_data_equal(data, d, &multi)) + continue; + ret = mtype_data_match(data, ext, mext, set, flags); + if (ret != 0) goto out; - } } out: return ret; @@ -1143,6 +1134,7 @@ mtype_list(const struct ip_set *set, rcu_read_lock(); for (; cb->args[IPSET_CB_ARG0] < jhash_size(t->htable_bits); cb->args[IPSET_CB_ARG0]++) { + cond_resched_rcu(); incomplete = skb_tail_pointer(skb); n = rcu_dereference(hbucket(t, cb->args[IPSET_CB_ARG0])); pr_debug("cb->arg bucket: %lu, t %p n %p\n", diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 0f164e986bf1..88b83d6d3084 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -168,7 +168,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], struct hash_ipportnet4_elem e = { .cidr = HOST_MASK - 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, p = 0, port, port_to; - u32 ip2_from = 0, ip2_to = 0, ip2_last, ip2; + u32 ip2_from = 0, ip2_to = 0, ip2; bool with_ports = false; u8 cidr; int ret; @@ -269,22 +269,21 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], ip_set_mask_from_to(ip2_from, ip2_to, e.cidr + 1); } - if (retried) + if (retried) { ip = ntohl(h->next.ip); + p = ntohs(h->next.port); + ip2 = ntohl(h->next.ip2); + } else { + p = port; + ip2 = ip2_from; + } for (; ip <= ip_to; ip++) { e.ip = htonl(ip); - p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) - : port; for (; p <= port_to; p++) { e.port = htons(p); - ip2 = retried && - ip == ntohl(h->next.ip) && - p == ntohs(h->next.port) - ? ntohl(h->next.ip2) : ip2_from; - while (ip2 <= ip2_to) { + do { e.ip2 = htonl(ip2); - ip2_last = ip_set_range_to_cidr(ip2, ip2_to, - &cidr); + ip2 = ip_set_range_to_cidr(ip2, ip2_to, &cidr); e.cidr = cidr - 1; ret = adtfn(set, &e, &ext, &ext, flags); @@ -292,9 +291,10 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; ret = 0; - ip2 = ip2_last + 1; - } + } while (ip2++ < ip2_to); + ip2 = ip2_from; } + p = port; } return ret; } diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 1c67a1761e45..5449e23af13a 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -143,7 +143,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_net4_elem e = { .cidr = HOST_MASK }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip = 0, ip_to = 0, last; + u32 ip = 0, ip_to = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -193,16 +193,15 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], } if (retried) ip = ntohl(h->next.ip); - while (ip <= ip_to) { + do { e.ip = htonl(ip); - last = ip_set_range_to_cidr(ip, ip_to, &e.cidr); + ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; - ip = last + 1; - } + } while (ip++ < ip_to); return ret; } diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index d417074f1c1a..f5164c1efce2 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -200,7 +200,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip = 0, ip_to = 0, last; + u32 ip = 0, ip_to = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -255,17 +255,16 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - while (ip <= ip_to) { + do { e.ip = htonl(ip); - last = ip_set_range_to_cidr(ip, ip_to, &e.cidr); + ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; - ip = last + 1; - } + } while (ip++ < ip_to); return ret; } diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 7f9ae2e9645b..5a2b923bd81f 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -169,8 +169,8 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip = 0, ip_to = 0, last; - u32 ip2 = 0, ip2_from = 0, ip2_to = 0, last2; + u32 ip = 0, ip_to = 0; + u32 ip2 = 0, ip2_from = 0, ip2_to = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -247,27 +247,27 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]); } - if (retried) + if (retried) { ip = ntohl(h->next.ip[0]); + ip2 = ntohl(h->next.ip[1]); + } else { + ip2 = ip2_from; + } - while (ip <= ip_to) { + do { e.ip[0] = htonl(ip); - last = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]); - ip2 = (retried && - ip == ntohl(h->next.ip[0])) ? ntohl(h->next.ip[1]) - : ip2_from; - while (ip2 <= ip2_to) { + ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]); + do { e.ip[1] = htonl(ip2); - last2 = ip_set_range_to_cidr(ip2, ip2_to, &e.cidr[1]); + ip2 = ip_set_range_to_cidr(ip2, ip2_to, &e.cidr[1]); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; - ip2 = last2 + 1; - } - ip = last + 1; - } + } while (ip2++ < ip2_to); + ip2 = ip2_from; + } while (ip++ < ip_to); return ret; } diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index e6ef382febe4..1a187be9ebc8 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -161,7 +161,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netport4_elem e = { .cidr = HOST_MASK - 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 port, port_to, p = 0, ip = 0, ip_to = 0, last; + u32 port, port_to, p = 0, ip = 0, ip_to = 0; bool with_ports = false; u8 cidr; int ret; @@ -239,25 +239,26 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], ip_set_mask_from_to(ip, ip_to, e.cidr + 1); } - if (retried) + if (retried) { ip = ntohl(h->next.ip); - while (ip <= ip_to) { + p = ntohs(h->next.port); + } else { + p = port; + } + do { e.ip = htonl(ip); - last = ip_set_range_to_cidr(ip, ip_to, &cidr); + ip = ip_set_range_to_cidr(ip, ip_to, &cidr); e.cidr = cidr - 1; - p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) - : port; for (; p <= port_to; p++) { e.port = htons(p); ret = adtfn(set, &e, &ext, &ext, flags); - if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } - ip = last + 1; - } + p = port; + } while (ip++ < ip_to); return ret; } diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 8602f2595a1a..d391485a6acd 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -184,8 +184,8 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netportnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip = 0, ip_to = 0, ip_last, p = 0, port, port_to; - u32 ip2_from = 0, ip2_to = 0, ip2_last, ip2; + u32 ip = 0, ip_to = 0, p = 0, port, port_to; + u32 ip2_from = 0, ip2_to = 0, ip2; bool with_ports = false; int ret; @@ -288,33 +288,34 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]); } - if (retried) + if (retried) { ip = ntohl(h->next.ip[0]); + p = ntohs(h->next.port); + ip2 = ntohl(h->next.ip[1]); + } else { + p = port; + ip2 = ip2_from; + } - while (ip <= ip_to) { + do { e.ip[0] = htonl(ip); - ip_last = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]); - p = retried && ip == ntohl(h->next.ip[0]) ? ntohs(h->next.port) - : port; + ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]); for (; p <= port_to; p++) { e.port = htons(p); - ip2 = (retried && ip == ntohl(h->next.ip[0]) && - p == ntohs(h->next.port)) ? ntohl(h->next.ip[1]) - : ip2_from; - while (ip2 <= ip2_to) { + do { e.ip[1] = htonl(ip2); - ip2_last = ip_set_range_to_cidr(ip2, ip2_to, - &e.cidr[1]); + ip2 = ip_set_range_to_cidr(ip2, ip2_to, + &e.cidr[1]); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; - ip2 = ip2_last + 1; - } + } while (ip2++ < ip2_to); + ip2 = ip2_from; } - ip = ip_last + 1; - } + p = port; + } while (ip++ < ip_to); return ret; } diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index e864681b8dc5..072a658fde04 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -55,8 +55,9 @@ list_set_ktest(struct ip_set *set, const struct sk_buff *skb, struct ip_set_adt_opt *opt, const struct ip_set_ext *ext) { struct list_set *map = set->data; + struct ip_set_ext *mext = &opt->ext; struct set_elem *e; - u32 cmdflags = opt->cmdflags; + u32 flags = opt->cmdflags; int ret; /* Don't lookup sub-counters at all */ @@ -64,21 +65,11 @@ list_set_ktest(struct ip_set *set, const struct sk_buff *skb, if (opt->cmdflags & IPSET_FLAG_SKIP_SUBCOUNTER_UPDATE) opt->cmdflags &= ~IPSET_FLAG_SKIP_COUNTER_UPDATE; list_for_each_entry_rcu(e, &map->members, list) { - if (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(e, set))) - continue; ret = ip_set_test(e->id, skb, par, opt); - if (ret > 0) { - if (SET_WITH_COUNTER(set)) - ip_set_update_counter(ext_counter(e, set), - ext, &opt->ext, - cmdflags); - if (SET_WITH_SKBINFO(set)) - ip_set_get_skbinfo(ext_skbinfo(e, set), - ext, &opt->ext, - cmdflags); - return ret; - } + if (ret <= 0) + continue; + if (ip_set_match_extensions(set, ext, mext, flags, e)) + return 1; } return 0; } diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index 299edc6add5a..1c98c907bc63 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -595,7 +595,6 @@ static int ip_vs_app_open(struct inode *inode, struct file *file) } static const struct file_operations ip_vs_app_fops = { - .owner = THIS_MODULE, .open = ip_vs_app_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 3e053cb30070..370abbf6f421 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -322,7 +322,7 @@ ip_vs_conn_fill_param_proto(struct netns_ipvs *ipvs, { __be16 _ports[2], *pptr; - pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph); + pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports); if (pptr == NULL) return 1; @@ -1143,7 +1143,6 @@ static int ip_vs_conn_open(struct inode *inode, struct file *file) } static const struct file_operations ip_vs_conn_fops = { - .owner = THIS_MODULE, .open = ip_vs_conn_open, .read = seq_read, .llseek = seq_lseek, @@ -1221,7 +1220,6 @@ static int ip_vs_conn_sync_open(struct inode *inode, struct file *file) } static const struct file_operations ip_vs_conn_sync_fops = { - .owner = THIS_MODULE, .open = ip_vs_conn_sync_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 5cb7cac9177d..5f6f73cf2174 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -433,7 +433,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, /* * IPv6 frags, only the first hit here. */ - pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph); + pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports); if (pptr == NULL) return NULL; @@ -566,7 +566,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, struct netns_ipvs *ipvs = svc->ipvs; struct net *net = ipvs->net; - pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph); + pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports); if (!pptr) return NF_DROP; dport = likely(!ip_vs_iph_inverse(iph)) ? pptr[1] : pptr[0]; @@ -982,7 +982,7 @@ static int ip_vs_out_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb, unsigned int offset; *related = 1; - ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph, ipvsh); + ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph); if (ic == NULL) return NF_DROP; @@ -1214,7 +1214,7 @@ static struct ip_vs_conn *__ip_vs_rs_conn_out(unsigned int hooknum, return NULL; pptr = frag_safe_skb_hp(skb, iph->len, - sizeof(_ports), _ports, iph); + sizeof(_ports), _ports); if (!pptr) return NULL; @@ -1407,7 +1407,7 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in __be16 _ports[2], *pptr; pptr = frag_safe_skb_hp(skb, iph.len, - sizeof(_ports), _ports, &iph); + sizeof(_ports), _ports); if (pptr == NULL) return NF_ACCEPT; /* Not for me */ if (ip_vs_has_real_service(ipvs, af, iph.protocol, &iph.saddr, @@ -1741,7 +1741,7 @@ static int ip_vs_in_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb, *related = 1; - ic = frag_safe_skb_hp(skb, iph->len, sizeof(_icmph), &_icmph, iph); + ic = frag_safe_skb_hp(skb, iph->len, sizeof(_icmph), &_icmph); if (ic == NULL) return NF_DROP; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index fff213eacf2a..5ebde4b15810 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2116,7 +2116,6 @@ static int ip_vs_info_open(struct inode *inode, struct file *file) } static const struct file_operations ip_vs_info_fops = { - .owner = THIS_MODULE, .open = ip_vs_info_open, .read = seq_read, .llseek = seq_lseek, @@ -2161,7 +2160,6 @@ static int ip_vs_stats_seq_open(struct inode *inode, struct file *file) } static const struct file_operations ip_vs_stats_fops = { - .owner = THIS_MODULE, .open = ip_vs_stats_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -2230,7 +2228,6 @@ static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file) } static const struct file_operations ip_vs_stats_percpu_fops = { - .owner = THIS_MODULE, .open = ip_vs_stats_percpu_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 121a321b91be..bcd9b7bde4ee 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -315,6 +315,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) switch (skb->ip_summed) { case CHECKSUM_NONE: skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); + /* fall through */ case CHECKSUM_COMPLETE: #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index 30e11cd6aa8a..c15ef7c2a1fa 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -319,6 +319,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) case CHECKSUM_NONE: skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); + /* fall through */ case CHECKSUM_COMPLETE: #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 9ee71cb276d7..fbaf3bd05b2e 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1636,17 +1636,14 @@ static int ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) { struct msghdr msg = {NULL,}; - struct kvec iov; + struct kvec iov = {buffer, buflen}; int len; EnterFunction(7); /* Receive a packet */ - iov.iov_base = buffer; - iov.iov_len = (size_t)buflen; - - len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, MSG_DONTWAIT); - + iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, buflen); + len = sock_recvmsg(sock, &msg, MSG_DONTWAIT); if (len < 0) return len; diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c new file mode 100644 index 000000000000..6d65389e308f --- /dev/null +++ b/net/netfilter/nf_conncount.c @@ -0,0 +1,373 @@ +/* + * count the number of connections matching an arbitrary key. + * + * (C) 2017 Red Hat GmbH + * Author: Florian Westphal <fw@strlen.de> + * + * split from xt_connlimit.c: + * (c) 2000 Gerd Knorr <kraxel@bytesex.org> + * Nov 2002: Martin Bene <martin.bene@icomedias.com>: + * only ignore TIME_WAIT or gone connections + * (C) CC Computer Consultants GmbH, 2007 + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/jhash.h> +#include <linux/slab.h> +#include <linux/list.h> +#include <linux/rbtree.h> +#include <linux/module.h> +#include <linux/random.h> +#include <linux/skbuff.h> +#include <linux/spinlock.h> +#include <linux/netfilter/nf_conntrack_tcp.h> +#include <linux/netfilter/x_tables.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_count.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_tuple.h> +#include <net/netfilter/nf_conntrack_zones.h> + +#define CONNCOUNT_SLOTS 256U + +#ifdef CONFIG_LOCKDEP +#define CONNCOUNT_LOCK_SLOTS 8U +#else +#define CONNCOUNT_LOCK_SLOTS 256U +#endif + +#define CONNCOUNT_GC_MAX_NODES 8 +#define MAX_KEYLEN 5 + +/* we will save the tuples of all connections we care about */ +struct nf_conncount_tuple { + struct hlist_node node; + struct nf_conntrack_tuple tuple; +}; + +struct nf_conncount_rb { + struct rb_node node; + struct hlist_head hhead; /* connections/hosts in same subnet */ + u32 key[MAX_KEYLEN]; +}; + +static spinlock_t nf_conncount_locks[CONNCOUNT_LOCK_SLOTS] __cacheline_aligned_in_smp; + +struct nf_conncount_data { + unsigned int keylen; + struct rb_root root[CONNCOUNT_SLOTS]; +}; + +static u_int32_t conncount_rnd __read_mostly; +static struct kmem_cache *conncount_rb_cachep __read_mostly; +static struct kmem_cache *conncount_conn_cachep __read_mostly; + +static inline bool already_closed(const struct nf_conn *conn) +{ + if (nf_ct_protonum(conn) == IPPROTO_TCP) + return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT || + conn->proto.tcp.state == TCP_CONNTRACK_CLOSE; + else + return false; +} + +static int key_diff(const u32 *a, const u32 *b, unsigned int klen) +{ + return memcmp(a, b, klen * sizeof(u32)); +} + +static bool add_hlist(struct hlist_head *head, + const struct nf_conntrack_tuple *tuple) +{ + struct nf_conncount_tuple *conn; + + conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); + if (conn == NULL) + return false; + conn->tuple = *tuple; + hlist_add_head(&conn->node, head); + return true; +} + +static unsigned int check_hlist(struct net *net, + struct hlist_head *head, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone, + bool *addit) +{ + const struct nf_conntrack_tuple_hash *found; + struct nf_conncount_tuple *conn; + struct hlist_node *n; + struct nf_conn *found_ct; + unsigned int length = 0; + + *addit = true; + + /* check the saved connections */ + hlist_for_each_entry_safe(conn, n, head, node) { + found = nf_conntrack_find_get(net, zone, &conn->tuple); + if (found == NULL) { + hlist_del(&conn->node); + kmem_cache_free(conncount_conn_cachep, conn); + continue; + } + + found_ct = nf_ct_tuplehash_to_ctrack(found); + + if (nf_ct_tuple_equal(&conn->tuple, tuple)) { + /* + * Just to be sure we have it only once in the list. + * We should not see tuples twice unless someone hooks + * this into a table without "-p tcp --syn". + */ + *addit = false; + } else if (already_closed(found_ct)) { + /* + * we do not care about connections which are + * closed already -> ditch it + */ + nf_ct_put(found_ct); + hlist_del(&conn->node); + kmem_cache_free(conncount_conn_cachep, conn); + continue; + } + + nf_ct_put(found_ct); + length++; + } + + return length; +} + +static void tree_nodes_free(struct rb_root *root, + struct nf_conncount_rb *gc_nodes[], + unsigned int gc_count) +{ + struct nf_conncount_rb *rbconn; + + while (gc_count) { + rbconn = gc_nodes[--gc_count]; + rb_erase(&rbconn->node, root); + kmem_cache_free(conncount_rb_cachep, rbconn); + } +} + +static unsigned int +count_tree(struct net *net, struct rb_root *root, + const u32 *key, u8 keylen, + u8 family, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone) +{ + struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES]; + struct rb_node **rbnode, *parent; + struct nf_conncount_rb *rbconn; + struct nf_conncount_tuple *conn; + unsigned int gc_count; + bool no_gc = false; + + restart: + gc_count = 0; + parent = NULL; + rbnode = &(root->rb_node); + while (*rbnode) { + int diff; + bool addit; + + rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node); + + parent = *rbnode; + diff = key_diff(key, rbconn->key, keylen); + if (diff < 0) { + rbnode = &((*rbnode)->rb_left); + } else if (diff > 0) { + rbnode = &((*rbnode)->rb_right); + } else { + /* same source network -> be counted! */ + unsigned int count; + count = check_hlist(net, &rbconn->hhead, tuple, zone, &addit); + + tree_nodes_free(root, gc_nodes, gc_count); + if (!addit) + return count; + + if (!add_hlist(&rbconn->hhead, tuple)) + return 0; /* hotdrop */ + + return count + 1; + } + + if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes)) + continue; + + /* only used for GC on hhead, retval and 'addit' ignored */ + check_hlist(net, &rbconn->hhead, tuple, zone, &addit); + if (hlist_empty(&rbconn->hhead)) + gc_nodes[gc_count++] = rbconn; + } + + if (gc_count) { + no_gc = true; + tree_nodes_free(root, gc_nodes, gc_count); + /* tree_node_free before new allocation permits + * allocator to re-use newly free'd object. + * + * This is a rare event; in most cases we will find + * existing node to re-use. (or gc_count is 0). + */ + goto restart; + } + + /* no match, need to insert new node */ + rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); + if (rbconn == NULL) + return 0; + + conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); + if (conn == NULL) { + kmem_cache_free(conncount_rb_cachep, rbconn); + return 0; + } + + conn->tuple = *tuple; + memcpy(rbconn->key, key, sizeof(u32) * keylen); + + INIT_HLIST_HEAD(&rbconn->hhead); + hlist_add_head(&conn->node, &rbconn->hhead); + + rb_link_node(&rbconn->node, parent, rbnode); + rb_insert_color(&rbconn->node, root); + return 1; +} + +unsigned int nf_conncount_count(struct net *net, + struct nf_conncount_data *data, + const u32 *key, + unsigned int family, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone) +{ + struct rb_root *root; + int count; + u32 hash; + + hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS; + root = &data->root[hash]; + + spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); + + count = count_tree(net, root, key, data->keylen, family, tuple, zone); + + spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); + + return count; +} +EXPORT_SYMBOL_GPL(nf_conncount_count); + +struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family, + unsigned int keylen) +{ + struct nf_conncount_data *data; + int ret, i; + + if (keylen % sizeof(u32) || + keylen / sizeof(u32) > MAX_KEYLEN || + keylen == 0) + return ERR_PTR(-EINVAL); + + net_get_random_once(&conncount_rnd, sizeof(conncount_rnd)); + + data = kmalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return ERR_PTR(-ENOMEM); + + ret = nf_ct_netns_get(net, family); + if (ret < 0) { + kfree(data); + return ERR_PTR(ret); + } + + for (i = 0; i < ARRAY_SIZE(data->root); ++i) + data->root[i] = RB_ROOT; + + data->keylen = keylen / sizeof(u32); + + return data; +} +EXPORT_SYMBOL_GPL(nf_conncount_init); + +static void destroy_tree(struct rb_root *r) +{ + struct nf_conncount_tuple *conn; + struct nf_conncount_rb *rbconn; + struct hlist_node *n; + struct rb_node *node; + + while ((node = rb_first(r)) != NULL) { + rbconn = rb_entry(node, struct nf_conncount_rb, node); + + rb_erase(node, r); + + hlist_for_each_entry_safe(conn, n, &rbconn->hhead, node) + kmem_cache_free(conncount_conn_cachep, conn); + + kmem_cache_free(conncount_rb_cachep, rbconn); + } +} + +void nf_conncount_destroy(struct net *net, unsigned int family, + struct nf_conncount_data *data) +{ + unsigned int i; + + nf_ct_netns_put(net, family); + + for (i = 0; i < ARRAY_SIZE(data->root); ++i) + destroy_tree(&data->root[i]); + + kfree(data); +} +EXPORT_SYMBOL_GPL(nf_conncount_destroy); + +static int __init nf_conncount_modinit(void) +{ + int i; + + BUILD_BUG_ON(CONNCOUNT_LOCK_SLOTS > CONNCOUNT_SLOTS); + BUILD_BUG_ON((CONNCOUNT_SLOTS % CONNCOUNT_LOCK_SLOTS) != 0); + + for (i = 0; i < CONNCOUNT_LOCK_SLOTS; ++i) + spin_lock_init(&nf_conncount_locks[i]); + + conncount_conn_cachep = kmem_cache_create("nf_conncount_tuple", + sizeof(struct nf_conncount_tuple), + 0, 0, NULL); + if (!conncount_conn_cachep) + return -ENOMEM; + + conncount_rb_cachep = kmem_cache_create("nf_conncount_rb", + sizeof(struct nf_conncount_rb), + 0, 0, NULL); + if (!conncount_rb_cachep) { + kmem_cache_destroy(conncount_conn_cachep); + return -ENOMEM; + } + + return 0; +} + +static void __exit nf_conncount_modexit(void) +{ + kmem_cache_destroy(conncount_conn_cachep); + kmem_cache_destroy(conncount_rb_cachep); +} + +module_init(nf_conncount_modinit); +module_exit(nf_conncount_modexit); +MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); +MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); +MODULE_DESCRIPTION("netfilter: count number of connections matching a key"); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 85f643c1e227..705198de671d 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -58,8 +58,6 @@ #include "nf_internals.h" -#define NF_CONNTRACK_VERSION "0.5.0" - int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr) __read_mostly; @@ -901,6 +899,9 @@ static unsigned int early_drop_list(struct net *net, hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) { tmp = nf_ct_tuplehash_to_ctrack(h); + if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) + continue; + if (nf_ct_is_expired(tmp)) { nf_ct_gc_expired(tmp); continue; @@ -975,6 +976,18 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct) return false; } +#define DAY (86400 * HZ) + +/* Set an arbitrary timeout large enough not to ever expire, this save + * us a check for the IPS_OFFLOAD_BIT from the packet path via + * nf_ct_is_expired(). + */ +static void nf_ct_offload_timeout(struct nf_conn *ct) +{ + if (nf_ct_expires(ct) < DAY / 2) + ct->timeout = nfct_time_stamp + DAY; +} + static void gc_worker(struct work_struct *work) { unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u); @@ -1011,6 +1024,11 @@ static void gc_worker(struct work_struct *work) tmp = nf_ct_tuplehash_to_ctrack(h); scanned++; + if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) { + nf_ct_offload_timeout(tmp); + continue; + } + if (nf_ct_is_expired(tmp)) { nf_ct_gc_expired(tmp); expired_count++; @@ -1044,7 +1062,7 @@ static void gc_worker(struct work_struct *work) * we will just continue with next hash slot. */ rcu_read_unlock(); - cond_resched_rcu_qs(); + cond_resched(); } while (++buckets < goal); if (gc_work->exiting) @@ -2048,10 +2066,6 @@ int nf_conntrack_init_start(void) if (!nf_conntrack_cachep) goto err_cachep; - printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n", - NF_CONNTRACK_VERSION, nf_conntrack_htable_size, - nf_conntrack_max); - ret = nf_conntrack_expect_init(); if (ret < 0) goto err_expect; diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index d6748a8a79c5..8ef21d9f9a00 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -649,7 +649,6 @@ static int exp_open(struct inode *inode, struct file *file) } static const struct file_operations exp_file_ops = { - .owner = THIS_MODULE, .open = exp_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c index cf1bf2605c10..1601275efe2d 100644 --- a/net/netfilter/nf_conntrack_h323_asn1.c +++ b/net/netfilter/nf_conntrack_h323_asn1.c @@ -1,4 +1,4 @@ -/**************************************************************************** +/* * ip_conntrack_helper_h323_asn1.c - BER and PER decoding library for H.323 * conntrack/NAT module. * @@ -8,7 +8,7 @@ * * See ip_conntrack_helper_h323_asn1.h for details. * - ****************************************************************************/ + */ #ifdef __KERNEL__ #include <linux/kernel.h> @@ -103,7 +103,6 @@ struct bitstr { #define INC_BIT(bs) if((++(bs)->bit)>7){(bs)->cur++;(bs)->bit=0;} #define INC_BITS(bs,b) if(((bs)->bit+=(b))>7){(bs)->cur+=(bs)->bit>>3;(bs)->bit&=7;} #define BYTE_ALIGN(bs) if((bs)->bit){(bs)->cur++;(bs)->bit=0;} -#define CHECK_BOUND(bs,n) if((bs)->cur+(n)>(bs)->end)return(H323_ERROR_BOUND) static unsigned int get_len(struct bitstr *bs); static unsigned int get_bit(struct bitstr *bs); static unsigned int get_bits(struct bitstr *bs, unsigned int b); @@ -141,14 +140,15 @@ static const decoder_t Decoders[] = { decode_choice, }; -/**************************************************************************** +/* * H.323 Types - ****************************************************************************/ + */ #include "nf_conntrack_h323_types.c" -/**************************************************************************** +/* * Functions - ****************************************************************************/ + */ + /* Assume bs is aligned && v < 16384 */ static unsigned int get_len(struct bitstr *bs) { @@ -165,7 +165,19 @@ static unsigned int get_len(struct bitstr *bs) return v; } -/****************************************************************************/ +static int nf_h323_error_boundary(struct bitstr *bs, size_t bytes, size_t bits) +{ + bits += bs->bit; + bytes += bits / BITS_PER_BYTE; + if (bits % BITS_PER_BYTE > 0) + bytes++; + + if (*bs->cur + bytes > *bs->end) + return 1; + + return 0; +} + static unsigned int get_bit(struct bitstr *bs) { unsigned int b = (*bs->cur) & (0x80 >> bs->bit); @@ -175,7 +187,6 @@ static unsigned int get_bit(struct bitstr *bs) return b; } -/****************************************************************************/ /* Assume b <= 8 */ static unsigned int get_bits(struct bitstr *bs, unsigned int b) { @@ -201,7 +212,6 @@ static unsigned int get_bits(struct bitstr *bs, unsigned int b) return v; } -/****************************************************************************/ /* Assume b <= 32 */ static unsigned int get_bitmap(struct bitstr *bs, unsigned int b) { @@ -239,9 +249,9 @@ static unsigned int get_bitmap(struct bitstr *bs, unsigned int b) return v; } -/**************************************************************************** +/* * Assume bs is aligned and sizeof(unsigned int) == 4 - ****************************************************************************/ + */ static unsigned int get_uint(struct bitstr *bs, int b) { unsigned int v = 0; @@ -250,12 +260,15 @@ static unsigned int get_uint(struct bitstr *bs, int b) case 4: v |= *bs->cur++; v <<= 8; + /* fall through */ case 3: v |= *bs->cur++; v <<= 8; + /* fall through */ case 2: v |= *bs->cur++; v <<= 8; + /* fall through */ case 1: v |= *bs->cur++; break; @@ -263,7 +276,6 @@ static unsigned int get_uint(struct bitstr *bs, int b) return v; } -/****************************************************************************/ static int decode_nul(struct bitstr *bs, const struct field_t *f, char *base, int level) { @@ -272,19 +284,17 @@ static int decode_nul(struct bitstr *bs, const struct field_t *f, return H323_ERROR_NONE; } -/****************************************************************************/ static int decode_bool(struct bitstr *bs, const struct field_t *f, char *base, int level) { PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); INC_BIT(bs); - - CHECK_BOUND(bs, 0); + if (nf_h323_error_boundary(bs, 0, 0)) + return H323_ERROR_BOUND; return H323_ERROR_NONE; } -/****************************************************************************/ static int decode_oid(struct bitstr *bs, const struct field_t *f, char *base, int level) { @@ -293,15 +303,17 @@ static int decode_oid(struct bitstr *bs, const struct field_t *f, PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); BYTE_ALIGN(bs); - CHECK_BOUND(bs, 1); + if (nf_h323_error_boundary(bs, 1, 0)) + return H323_ERROR_BOUND; + len = *bs->cur++; bs->cur += len; + if (nf_h323_error_boundary(bs, 0, 0)) + return H323_ERROR_BOUND; - CHECK_BOUND(bs, 0); return H323_ERROR_NONE; } -/****************************************************************************/ static int decode_int(struct bitstr *bs, const struct field_t *f, char *base, int level) { @@ -319,6 +331,8 @@ static int decode_int(struct bitstr *bs, const struct field_t *f, bs->cur += 2; break; case CONS: /* 64K < Range < 4G */ + if (nf_h323_error_boundary(bs, 0, 2)) + return H323_ERROR_BOUND; len = get_bits(bs, 2) + 1; BYTE_ALIGN(bs); if (base && (f->attr & DECODE)) { /* timeToLive */ @@ -330,7 +344,8 @@ static int decode_int(struct bitstr *bs, const struct field_t *f, break; case UNCO: BYTE_ALIGN(bs); - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2, 0)) + return H323_ERROR_BOUND; len = get_len(bs); bs->cur += len; break; @@ -341,11 +356,11 @@ static int decode_int(struct bitstr *bs, const struct field_t *f, PRINT("\n"); - CHECK_BOUND(bs, 0); + if (nf_h323_error_boundary(bs, 0, 0)) + return H323_ERROR_BOUND; return H323_ERROR_NONE; } -/****************************************************************************/ static int decode_enum(struct bitstr *bs, const struct field_t *f, char *base, int level) { @@ -357,11 +372,11 @@ static int decode_enum(struct bitstr *bs, const struct field_t *f, INC_BITS(bs, f->sz); } - CHECK_BOUND(bs, 0); + if (nf_h323_error_boundary(bs, 0, 0)) + return H323_ERROR_BOUND; return H323_ERROR_NONE; } -/****************************************************************************/ static int decode_bitstr(struct bitstr *bs, const struct field_t *f, char *base, int level) { @@ -375,12 +390,14 @@ static int decode_bitstr(struct bitstr *bs, const struct field_t *f, len = f->lb; break; case WORD: /* 2-byte length */ - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2, 0)) + return H323_ERROR_BOUND; len = (*bs->cur++) << 8; len += (*bs->cur++) + f->lb; break; case SEMI: - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2, 0)) + return H323_ERROR_BOUND; len = get_len(bs); break; default: @@ -391,11 +408,11 @@ static int decode_bitstr(struct bitstr *bs, const struct field_t *f, bs->cur += len >> 3; bs->bit = len & 7; - CHECK_BOUND(bs, 0); + if (nf_h323_error_boundary(bs, 0, 0)) + return H323_ERROR_BOUND; return H323_ERROR_NONE; } -/****************************************************************************/ static int decode_numstr(struct bitstr *bs, const struct field_t *f, char *base, int level) { @@ -404,16 +421,18 @@ static int decode_numstr(struct bitstr *bs, const struct field_t *f, PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); /* 2 <= Range <= 255 */ + if (nf_h323_error_boundary(bs, 0, f->sz)) + return H323_ERROR_BOUND; len = get_bits(bs, f->sz) + f->lb; BYTE_ALIGN(bs); INC_BITS(bs, (len << 2)); - CHECK_BOUND(bs, 0); + if (nf_h323_error_boundary(bs, 0, 0)) + return H323_ERROR_BOUND; return H323_ERROR_NONE; } -/****************************************************************************/ static int decode_octstr(struct bitstr *bs, const struct field_t *f, char *base, int level) { @@ -440,15 +459,19 @@ static int decode_octstr(struct bitstr *bs, const struct field_t *f, break; case BYTE: /* Range == 256 */ BYTE_ALIGN(bs); - CHECK_BOUND(bs, 1); + if (nf_h323_error_boundary(bs, 1, 0)) + return H323_ERROR_BOUND; len = (*bs->cur++) + f->lb; break; case SEMI: BYTE_ALIGN(bs); - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2, 0)) + return H323_ERROR_BOUND; len = get_len(bs) + f->lb; break; default: /* 2 <= Range <= 255 */ + if (nf_h323_error_boundary(bs, 0, f->sz)) + return H323_ERROR_BOUND; len = get_bits(bs, f->sz) + f->lb; BYTE_ALIGN(bs); break; @@ -458,11 +481,11 @@ static int decode_octstr(struct bitstr *bs, const struct field_t *f, PRINT("\n"); - CHECK_BOUND(bs, 0); + if (nf_h323_error_boundary(bs, 0, 0)) + return H323_ERROR_BOUND; return H323_ERROR_NONE; } -/****************************************************************************/ static int decode_bmpstr(struct bitstr *bs, const struct field_t *f, char *base, int level) { @@ -473,10 +496,13 @@ static int decode_bmpstr(struct bitstr *bs, const struct field_t *f, switch (f->sz) { case BYTE: /* Range == 256 */ BYTE_ALIGN(bs); - CHECK_BOUND(bs, 1); + if (nf_h323_error_boundary(bs, 1, 0)) + return H323_ERROR_BOUND; len = (*bs->cur++) + f->lb; break; default: /* 2 <= Range <= 255 */ + if (nf_h323_error_boundary(bs, 0, f->sz)) + return H323_ERROR_BOUND; len = get_bits(bs, f->sz) + f->lb; BYTE_ALIGN(bs); break; @@ -484,11 +510,11 @@ static int decode_bmpstr(struct bitstr *bs, const struct field_t *f, bs->cur += len << 1; - CHECK_BOUND(bs, 0); + if (nf_h323_error_boundary(bs, 0, 0)) + return H323_ERROR_BOUND; return H323_ERROR_NONE; } -/****************************************************************************/ static int decode_seq(struct bitstr *bs, const struct field_t *f, char *base, int level) { @@ -503,9 +529,13 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, base = (base && (f->attr & DECODE)) ? base + f->offset : NULL; /* Extensible? */ + if (nf_h323_error_boundary(bs, 0, 1)) + return H323_ERROR_BOUND; ext = (f->attr & EXT) ? get_bit(bs) : 0; /* Get fields bitmap */ + if (nf_h323_error_boundary(bs, 0, f->sz)) + return H323_ERROR_BOUND; bmp = get_bitmap(bs, f->sz); if (base) *(unsigned int *)base = bmp; @@ -525,9 +555,11 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, /* Decode */ if (son->attr & OPEN) { /* Open field */ - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2, 0)) + return H323_ERROR_BOUND; len = get_len(bs); - CHECK_BOUND(bs, len); + if (nf_h323_error_boundary(bs, len, 0)) + return H323_ERROR_BOUND; if (!base || !(son->attr & DECODE)) { PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", son->name); @@ -555,8 +587,11 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, return H323_ERROR_NONE; /* Get the extension bitmap */ + if (nf_h323_error_boundary(bs, 0, 7)) + return H323_ERROR_BOUND; bmp2_len = get_bits(bs, 7) + 1; - CHECK_BOUND(bs, (bmp2_len + 7) >> 3); + if (nf_h323_error_boundary(bs, 0, bmp2_len)) + return H323_ERROR_BOUND; bmp2 = get_bitmap(bs, bmp2_len); bmp |= bmp2 >> f->sz; if (base) @@ -567,9 +602,11 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, for (opt = 0; opt < bmp2_len; opt++, i++, son++) { /* Check Range */ if (i >= f->ub) { /* Newer Version? */ - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2, 0)) + return H323_ERROR_BOUND; len = get_len(bs); - CHECK_BOUND(bs, len); + if (nf_h323_error_boundary(bs, len, 0)) + return H323_ERROR_BOUND; bs->cur += len; continue; } @@ -583,9 +620,11 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, if (!((0x80000000 >> opt) & bmp2)) /* Not present */ continue; - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2, 0)) + return H323_ERROR_BOUND; len = get_len(bs); - CHECK_BOUND(bs, len); + if (nf_h323_error_boundary(bs, len, 0)) + return H323_ERROR_BOUND; if (!base || !(son->attr & DECODE)) { PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", son->name); @@ -605,7 +644,6 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, return H323_ERROR_NONE; } -/****************************************************************************/ static int decode_seqof(struct bitstr *bs, const struct field_t *f, char *base, int level) { @@ -623,22 +661,27 @@ static int decode_seqof(struct bitstr *bs, const struct field_t *f, switch (f->sz) { case BYTE: BYTE_ALIGN(bs); - CHECK_BOUND(bs, 1); + if (nf_h323_error_boundary(bs, 1, 0)) + return H323_ERROR_BOUND; count = *bs->cur++; break; case WORD: BYTE_ALIGN(bs); - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2, 0)) + return H323_ERROR_BOUND; count = *bs->cur++; count <<= 8; count += *bs->cur++; break; case SEMI: BYTE_ALIGN(bs); - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2, 0)) + return H323_ERROR_BOUND; count = get_len(bs); break; default: + if (nf_h323_error_boundary(bs, 0, f->sz)) + return H323_ERROR_BOUND; count = get_bits(bs, f->sz); break; } @@ -658,8 +701,11 @@ static int decode_seqof(struct bitstr *bs, const struct field_t *f, for (i = 0; i < count; i++) { if (son->attr & OPEN) { BYTE_ALIGN(bs); + if (nf_h323_error_boundary(bs, 2, 0)) + return H323_ERROR_BOUND; len = get_len(bs); - CHECK_BOUND(bs, len); + if (nf_h323_error_boundary(bs, len, 0)) + return H323_ERROR_BOUND; if (!base || !(son->attr & DECODE)) { PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", son->name); @@ -694,8 +740,6 @@ static int decode_seqof(struct bitstr *bs, const struct field_t *f, return H323_ERROR_NONE; } - -/****************************************************************************/ static int decode_choice(struct bitstr *bs, const struct field_t *f, char *base, int level) { @@ -710,11 +754,17 @@ static int decode_choice(struct bitstr *bs, const struct field_t *f, base = (base && (f->attr & DECODE)) ? base + f->offset : NULL; /* Decode the choice index number */ + if (nf_h323_error_boundary(bs, 0, 1)) + return H323_ERROR_BOUND; if ((f->attr & EXT) && get_bit(bs)) { ext = 1; + if (nf_h323_error_boundary(bs, 0, 7)) + return H323_ERROR_BOUND; type = get_bits(bs, 7) + f->lb; } else { ext = 0; + if (nf_h323_error_boundary(bs, 0, f->sz)) + return H323_ERROR_BOUND; type = get_bits(bs, f->sz); if (type >= f->lb) return H323_ERROR_RANGE; @@ -727,8 +777,11 @@ static int decode_choice(struct bitstr *bs, const struct field_t *f, /* Check Range */ if (type >= f->ub) { /* Newer version? */ BYTE_ALIGN(bs); + if (nf_h323_error_boundary(bs, 2, 0)) + return H323_ERROR_BOUND; len = get_len(bs); - CHECK_BOUND(bs, len); + if (nf_h323_error_boundary(bs, len, 0)) + return H323_ERROR_BOUND; bs->cur += len; return H323_ERROR_NONE; } @@ -742,8 +795,11 @@ static int decode_choice(struct bitstr *bs, const struct field_t *f, if (ext || (son->attr & OPEN)) { BYTE_ALIGN(bs); + if (nf_h323_error_boundary(bs, len, 0)) + return H323_ERROR_BOUND; len = get_len(bs); - CHECK_BOUND(bs, len); + if (nf_h323_error_boundary(bs, len, 0)) + return H323_ERROR_BOUND; if (!base || !(son->attr & DECODE)) { PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", son->name); @@ -765,7 +821,6 @@ static int decode_choice(struct bitstr *bs, const struct field_t *f, return H323_ERROR_NONE; } -/****************************************************************************/ int DecodeRasMessage(unsigned char *buf, size_t sz, RasMessage *ras) { static const struct field_t ras_message = { @@ -781,7 +836,6 @@ int DecodeRasMessage(unsigned char *buf, size_t sz, RasMessage *ras) return decode_choice(&bs, &ras_message, (char *) ras, 0); } -/****************************************************************************/ static int DecodeH323_UserInformation(unsigned char *buf, unsigned char *beg, size_t sz, H323_UserInformation *uuie) { @@ -799,7 +853,6 @@ static int DecodeH323_UserInformation(unsigned char *buf, unsigned char *beg, return decode_seq(&bs, &h323_userinformation, (char *) uuie, 0); } -/****************************************************************************/ int DecodeMultimediaSystemControlMessage(unsigned char *buf, size_t sz, MultimediaSystemControlMessage * mscm) @@ -818,7 +871,6 @@ int DecodeMultimediaSystemControlMessage(unsigned char *buf, size_t sz, (char *) mscm, 0); } -/****************************************************************************/ int DecodeQ931(unsigned char *buf, size_t sz, Q931 *q931) { unsigned char *p = buf; diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index f71f0d2558fd..005589c6d0f6 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -24,6 +24,7 @@ #include <linux/skbuff.h> #include <net/route.h> #include <net/ip6_route.h> +#include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> @@ -115,7 +116,6 @@ static struct nf_conntrack_helper nf_conntrack_helper_h245; static struct nf_conntrack_helper nf_conntrack_helper_q931[]; static struct nf_conntrack_helper nf_conntrack_helper_ras[]; -/****************************************************************************/ static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned char **data, int *datalen, int *dataoff) @@ -219,7 +219,6 @@ static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff, return 0; } -/****************************************************************************/ static int get_h245_addr(struct nf_conn *ct, const unsigned char *data, H245_TransportAddress *taddr, union nf_inet_addr *addr, __be16 *port) @@ -254,7 +253,6 @@ static int get_h245_addr(struct nf_conn *ct, const unsigned char *data, return 1; } -/****************************************************************************/ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff, @@ -328,7 +326,6 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, return ret; } -/****************************************************************************/ static int expect_t120(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, @@ -380,7 +377,6 @@ static int expect_t120(struct sk_buff *skb, return ret; } -/****************************************************************************/ static int process_h245_channel(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, @@ -410,7 +406,6 @@ static int process_h245_channel(struct sk_buff *skb, return 0; } -/****************************************************************************/ static int process_olc(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff, @@ -472,7 +467,6 @@ static int process_olc(struct sk_buff *skb, struct nf_conn *ct, return 0; } -/****************************************************************************/ static int process_olca(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff, unsigned char **data, int dataoff, @@ -542,7 +536,6 @@ static int process_olca(struct sk_buff *skb, struct nf_conn *ct, return 0; } -/****************************************************************************/ static int process_h245(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff, unsigned char **data, int dataoff, @@ -578,7 +571,6 @@ static int process_h245(struct sk_buff *skb, struct nf_conn *ct, return 0; } -/****************************************************************************/ static int h245_help(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { @@ -628,7 +620,6 @@ static int h245_help(struct sk_buff *skb, unsigned int protoff, return NF_DROP; } -/****************************************************************************/ static const struct nf_conntrack_expect_policy h245_exp_policy = { .max_expected = H323_RTP_CHANNEL_MAX * 4 + 2 /* T.120 */, .timeout = 240, @@ -643,7 +634,6 @@ static struct nf_conntrack_helper nf_conntrack_helper_h245 __read_mostly = { .expect_policy = &h245_exp_policy, }; -/****************************************************************************/ int get_h225_addr(struct nf_conn *ct, unsigned char *data, TransportAddress *taddr, union nf_inet_addr *addr, __be16 *port) @@ -675,7 +665,6 @@ int get_h225_addr(struct nf_conn *ct, unsigned char *data, return 1; } -/****************************************************************************/ static int expect_h245(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff, unsigned char **data, int dataoff, @@ -726,20 +715,15 @@ static int expect_h245(struct sk_buff *skb, struct nf_conn *ct, } /* If the calling party is on the same side of the forward-to party, - * we don't need to track the second call */ + * we don't need to track the second call + */ static int callforward_do_filter(struct net *net, const union nf_inet_addr *src, const union nf_inet_addr *dst, u_int8_t family) { - const struct nf_afinfo *afinfo; int ret = 0; - /* rcu_read_lock()ed by nf_hook_thresh */ - afinfo = nf_get_afinfo(family); - if (!afinfo) - return 0; - switch (family) { case AF_INET: { struct flowi4 fl1, fl2; @@ -750,10 +734,10 @@ static int callforward_do_filter(struct net *net, memset(&fl2, 0, sizeof(fl2)); fl2.daddr = dst->ip; - if (!afinfo->route(net, (struct dst_entry **)&rt1, - flowi4_to_flowi(&fl1), false)) { - if (!afinfo->route(net, (struct dst_entry **)&rt2, - flowi4_to_flowi(&fl2), false)) { + if (!nf_ip_route(net, (struct dst_entry **)&rt1, + flowi4_to_flowi(&fl1), false)) { + if (!nf_ip_route(net, (struct dst_entry **)&rt2, + flowi4_to_flowi(&fl2), false)) { if (rt_nexthop(rt1, fl1.daddr) == rt_nexthop(rt2, fl2.daddr) && rt1->dst.dev == rt2->dst.dev) @@ -766,18 +750,23 @@ static int callforward_do_filter(struct net *net, } #if IS_ENABLED(CONFIG_NF_CONNTRACK_IPV6) case AF_INET6: { - struct flowi6 fl1, fl2; + const struct nf_ipv6_ops *v6ops; struct rt6_info *rt1, *rt2; + struct flowi6 fl1, fl2; + + v6ops = nf_get_ipv6_ops(); + if (!v6ops) + return 0; memset(&fl1, 0, sizeof(fl1)); fl1.daddr = src->in6; memset(&fl2, 0, sizeof(fl2)); fl2.daddr = dst->in6; - if (!afinfo->route(net, (struct dst_entry **)&rt1, - flowi6_to_flowi(&fl1), false)) { - if (!afinfo->route(net, (struct dst_entry **)&rt2, - flowi6_to_flowi(&fl2), false)) { + if (!v6ops->route(net, (struct dst_entry **)&rt1, + flowi6_to_flowi(&fl1), false)) { + if (!v6ops->route(net, (struct dst_entry **)&rt2, + flowi6_to_flowi(&fl2), false)) { if (ipv6_addr_equal(rt6_nexthop(rt1, &fl1.daddr), rt6_nexthop(rt2, &fl2.daddr)) && rt1->dst.dev == rt2->dst.dev) @@ -794,7 +783,6 @@ static int callforward_do_filter(struct net *net, } -/****************************************************************************/ static int expect_callforwarding(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, @@ -815,7 +803,8 @@ static int expect_callforwarding(struct sk_buff *skb, return 0; /* If the calling party is on the same side of the forward-to party, - * we don't need to track the second call */ + * we don't need to track the second call + */ if (callforward_filter && callforward_do_filter(net, &addr, &ct->tuplehash[!dir].tuple.src.u3, nf_ct_l3num(ct))) { @@ -854,7 +843,6 @@ static int expect_callforwarding(struct sk_buff *skb, return ret; } -/****************************************************************************/ static int process_setup(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff, @@ -925,7 +913,6 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct, return 0; } -/****************************************************************************/ static int process_callproceeding(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, @@ -958,7 +945,6 @@ static int process_callproceeding(struct sk_buff *skb, return 0; } -/****************************************************************************/ static int process_connect(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff, @@ -990,7 +976,6 @@ static int process_connect(struct sk_buff *skb, struct nf_conn *ct, return 0; } -/****************************************************************************/ static int process_alerting(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff, @@ -1022,7 +1007,6 @@ static int process_alerting(struct sk_buff *skb, struct nf_conn *ct, return 0; } -/****************************************************************************/ static int process_facility(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff, @@ -1063,7 +1047,6 @@ static int process_facility(struct sk_buff *skb, struct nf_conn *ct, return 0; } -/****************************************************************************/ static int process_progress(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff, @@ -1095,7 +1078,6 @@ static int process_progress(struct sk_buff *skb, struct nf_conn *ct, return 0; } -/****************************************************************************/ static int process_q931(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff, unsigned char **data, int dataoff, @@ -1154,7 +1136,6 @@ static int process_q931(struct sk_buff *skb, struct nf_conn *ct, return 0; } -/****************************************************************************/ static int q931_help(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { @@ -1203,7 +1184,6 @@ static int q931_help(struct sk_buff *skb, unsigned int protoff, return NF_DROP; } -/****************************************************************************/ static const struct nf_conntrack_expect_policy q931_exp_policy = { /* T.120 and H.245 */ .max_expected = H323_RTP_CHANNEL_MAX * 4 + 4, @@ -1231,7 +1211,6 @@ static struct nf_conntrack_helper nf_conntrack_helper_q931[] __read_mostly = { }, }; -/****************************************************************************/ static unsigned char *get_udp_data(struct sk_buff *skb, unsigned int protoff, int *datalen) { @@ -1249,7 +1228,6 @@ static unsigned char *get_udp_data(struct sk_buff *skb, unsigned int protoff, return skb_header_pointer(skb, dataoff, *datalen, h323_buffer); } -/****************************************************************************/ static struct nf_conntrack_expect *find_expect(struct nf_conn *ct, union nf_inet_addr *addr, __be16 port) @@ -1270,7 +1248,6 @@ static struct nf_conntrack_expect *find_expect(struct nf_conn *ct, return NULL; } -/****************************************************************************/ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff, unsigned char **data, @@ -1328,7 +1305,6 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct, return ret; } -/****************************************************************************/ static int process_grq(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff, @@ -1346,7 +1322,6 @@ static int process_grq(struct sk_buff *skb, struct nf_conn *ct, return 0; } -/****************************************************************************/ static int process_gcf(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff, @@ -1391,7 +1366,6 @@ static int process_gcf(struct sk_buff *skb, struct nf_conn *ct, return ret; } -/****************************************************************************/ static int process_rrq(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff, @@ -1428,7 +1402,6 @@ static int process_rrq(struct sk_buff *skb, struct nf_conn *ct, return 0; } -/****************************************************************************/ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff, @@ -1480,7 +1453,6 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct, return 0; } -/****************************************************************************/ static int process_urq(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff, @@ -1514,7 +1486,6 @@ static int process_urq(struct sk_buff *skb, struct nf_conn *ct, return 0; } -/****************************************************************************/ static int process_arq(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff, @@ -1559,7 +1530,6 @@ static int process_arq(struct sk_buff *skb, struct nf_conn *ct, return 0; } -/****************************************************************************/ static int process_acf(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff, @@ -1608,7 +1578,6 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct, return ret; } -/****************************************************************************/ static int process_lrq(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff, @@ -1626,7 +1595,6 @@ static int process_lrq(struct sk_buff *skb, struct nf_conn *ct, return 0; } -/****************************************************************************/ static int process_lcf(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff, @@ -1666,7 +1634,6 @@ static int process_lcf(struct sk_buff *skb, struct nf_conn *ct, return ret; } -/****************************************************************************/ static int process_irr(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff, @@ -1700,7 +1667,6 @@ static int process_irr(struct sk_buff *skb, struct nf_conn *ct, return 0; } -/****************************************************************************/ static int process_ras(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff, @@ -1745,7 +1711,6 @@ static int process_ras(struct sk_buff *skb, struct nf_conn *ct, return 0; } -/****************************************************************************/ static int ras_help(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { @@ -1788,7 +1753,6 @@ static int ras_help(struct sk_buff *skb, unsigned int protoff, return NF_DROP; } -/****************************************************************************/ static const struct nf_conntrack_expect_policy ras_exp_policy = { .max_expected = 32, .timeout = 240, @@ -1849,7 +1813,6 @@ static void __exit h323_helper_exit(void) nf_conntrack_helper_unregister(&nf_conntrack_helper_h245); } -/****************************************************************************/ static void __exit nf_conntrack_h323_fini(void) { h323_helper_exit(); @@ -1857,7 +1820,6 @@ static void __exit nf_conntrack_h323_fini(void) pr_debug("nf_ct_h323: fini\n"); } -/****************************************************************************/ static int __init nf_conntrack_h323_init(void) { int ret; @@ -1877,7 +1839,6 @@ err1: return ret; } -/****************************************************************************/ module_init(nf_conntrack_h323_init); module_exit(nf_conntrack_h323_fini); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 59c08997bfdf..dd177ebee9aa 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -45,7 +45,6 @@ #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_timestamp.h> #include <net/netfilter/nf_conntrack_labels.h> -#include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_synproxy.h> #ifdef CONFIG_NF_NAT_NEEDED #include <net/netfilter/nf_nat_core.h> @@ -58,8 +57,6 @@ MODULE_LICENSE("GPL"); -static char __initdata version[] = "0.93"; - static int ctnetlink_dump_tuples_proto(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_l4proto *l4proto) @@ -545,7 +542,7 @@ static size_t ctnetlink_proto_size(const struct nf_conn *ct) len *= 3u; /* ORIG, REPLY, MASTER */ l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); - len += l4proto->nla_size; + len += l4proto->nlattr_size; if (l4proto->nlattr_tuple_size) { len4 = l4proto->nlattr_tuple_size(); len4 *= 3u; /* ORIG, REPLY, MASTER */ @@ -1111,6 +1108,14 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = { .len = NF_CT_LABELS_MAX_SIZE }, }; +static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data) +{ + if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) + return 0; + + return ctnetlink_filter_match(ct, data); +} + static int ctnetlink_flush_conntrack(struct net *net, const struct nlattr * const cda[], u32 portid, int report) @@ -1123,7 +1128,7 @@ static int ctnetlink_flush_conntrack(struct net *net, return PTR_ERR(filter); } - nf_ct_iterate_cleanup_net(net, ctnetlink_filter_match, filter, + nf_ct_iterate_cleanup_net(net, ctnetlink_flush_iterate, filter, portid, report); kfree(filter); @@ -1169,6 +1174,11 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl, ct = nf_ct_tuplehash_to_ctrack(h); + if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) { + nf_ct_put(ct); + return -EBUSY; + } + if (cda[CTA_ID]) { u_int32_t id = ntohl(nla_get_be32(cda[CTA_ID])); if (id != (u32)(unsigned long)ct) { @@ -1566,9 +1576,11 @@ static int ctnetlink_change_helper(struct nf_conn *ct, static int ctnetlink_change_timeout(struct nf_conn *ct, const struct nlattr * const cda[]) { - u_int32_t timeout = ntohl(nla_get_be32(cda[CTA_TIMEOUT])); + u64 timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ; - ct->timeout = nfct_time_stamp + timeout * HZ; + if (timeout > INT_MAX) + timeout = INT_MAX; + ct->timeout = nfct_time_stamp + (u32)timeout; if (test_bit(IPS_DYING_BIT, &ct->status)) return -ETIME; @@ -1768,6 +1780,7 @@ ctnetlink_create_conntrack(struct net *net, int err = -EINVAL; struct nf_conntrack_helper *helper; struct nf_conn_tstamp *tstamp; + u64 timeout; ct = nf_conntrack_alloc(net, zone, otuple, rtuple, GFP_ATOMIC); if (IS_ERR(ct)) @@ -1776,7 +1789,10 @@ ctnetlink_create_conntrack(struct net *net, if (!cda[CTA_TIMEOUT]) goto err1; - ct->timeout = nfct_time_stamp + ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ; + timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ; + if (timeout > INT_MAX) + timeout = INT_MAX; + ct->timeout = (u32)timeout + nfct_time_stamp; rcu_read_lock(); if (cda[CTA_HELP]) { @@ -3407,7 +3423,6 @@ static int __init ctnetlink_init(void) { int ret; - pr_info("ctnetlink v%s: registering with nfnetlink.\n", version); ret = nfnetlink_subsys_register(&ctnl_subsys); if (ret < 0) { pr_err("ctnetlink_init: cannot register with nfnetlink.\n"); @@ -3441,8 +3456,6 @@ err_out: static void __exit ctnetlink_exit(void) { - pr_info("ctnetlink: unregistering from nfnetlink.\n"); - unregister_pernet_subsys(&ctnetlink_net_ops); nfnetlink_subsys_unregister(&ctnl_exp_subsys); nfnetlink_subsys_unregister(&ctnl_subsys); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index c8e9c9503a08..afdeca53e88b 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -385,14 +385,14 @@ void nf_ct_l4proto_unregister_sysctl(struct net *net, /* FIXME: Allow NULL functions and sub in pointers to generic for them. --RR */ -int nf_ct_l4proto_register_one(struct nf_conntrack_l4proto *l4proto) +int nf_ct_l4proto_register_one(const struct nf_conntrack_l4proto *l4proto) { int ret = 0; if (l4proto->l3proto >= ARRAY_SIZE(nf_ct_protos)) return -EBUSY; - if ((l4proto->to_nlattr && !l4proto->nlattr_size) || + if ((l4proto->to_nlattr && l4proto->nlattr_size == 0) || (l4proto->tuple_to_nlattr && !l4proto->nlattr_tuple_size)) return -EINVAL; @@ -428,10 +428,6 @@ int nf_ct_l4proto_register_one(struct nf_conntrack_l4proto *l4proto) goto out_unlock; } - l4proto->nla_size = 0; - if (l4proto->nlattr_size) - l4proto->nla_size += l4proto->nlattr_size(); - rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], l4proto); out_unlock: @@ -502,7 +498,7 @@ void nf_ct_l4proto_pernet_unregister_one(struct net *net, } EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister_one); -int nf_ct_l4proto_register(struct nf_conntrack_l4proto *l4proto[], +int nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[], unsigned int num_proto) { int ret = -EINVAL, ver; @@ -524,7 +520,7 @@ int nf_ct_l4proto_register(struct nf_conntrack_l4proto *l4proto[], EXPORT_SYMBOL_GPL(nf_ct_l4proto_register); int nf_ct_l4proto_pernet_register(struct net *net, - struct nf_conntrack_l4proto *const l4proto[], + const struct nf_conntrack_l4proto *const l4proto[], unsigned int num_proto) { int ret = -EINVAL; @@ -545,7 +541,7 @@ int nf_ct_l4proto_pernet_register(struct net *net, } EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register); -void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *l4proto[], +void nf_ct_l4proto_unregister(const struct nf_conntrack_l4proto * const l4proto[], unsigned int num_proto) { mutex_lock(&nf_ct_proto_mutex); @@ -555,12 +551,12 @@ void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *l4proto[], synchronize_net(); /* Remove all contrack entries for this protocol */ - nf_ct_iterate_destroy(kill_l4proto, l4proto); + nf_ct_iterate_destroy(kill_l4proto, (void *)l4proto); } EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister); void nf_ct_l4proto_pernet_unregister(struct net *net, - struct nf_conntrack_l4proto *const l4proto[], + const struct nf_conntrack_l4proto *const l4proto[], unsigned int num_proto) { while (num_proto-- != 0) diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 2a446f4a554c..abe647d5b8c6 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -654,6 +654,12 @@ static const struct nla_policy dccp_nla_policy[CTA_PROTOINFO_DCCP_MAX + 1] = { [CTA_PROTOINFO_DCCP_PAD] = { .type = NLA_UNSPEC }, }; +#define DCCP_NLATTR_SIZE ( \ + NLA_ALIGN(NLA_HDRLEN + 1) + \ + NLA_ALIGN(NLA_HDRLEN + 1) + \ + NLA_ALIGN(NLA_HDRLEN + sizeof(u64)) + \ + NLA_ALIGN(NLA_HDRLEN + 0)) + static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) { struct nlattr *attr = cda[CTA_PROTOINFO_DCCP]; @@ -691,13 +697,6 @@ static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) spin_unlock_bh(&ct->lock); return 0; } - -static int dccp_nlattr_size(void) -{ - return nla_total_size(0) /* CTA_PROTOINFO_DCCP */ - + nla_policy_len(dccp_nla_policy, CTA_PROTOINFO_DCCP_MAX + 1); -} - #endif #if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) @@ -862,7 +861,7 @@ static struct nf_proto_net *dccp_get_net_proto(struct net *net) return &net->ct.nf_ct_proto.dccp.pn; } -struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 __read_mostly = { +const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 = { .l3proto = AF_INET, .l4proto = IPPROTO_DCCP, .pkt_to_tuple = dccp_pkt_to_tuple, @@ -876,8 +875,8 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 __read_mostly = { .print_conntrack = dccp_print_conntrack, #endif #if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .nlattr_size = DCCP_NLATTR_SIZE, .to_nlattr = dccp_to_nlattr, - .nlattr_size = dccp_nlattr_size, .from_nlattr = nlattr_to_dccp, .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, @@ -898,7 +897,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 __read_mostly = { }; EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp4); -struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 __read_mostly = { +const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 = { .l3proto = AF_INET6, .l4proto = IPPROTO_DCCP, .pkt_to_tuple = dccp_pkt_to_tuple, @@ -912,8 +911,8 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 __read_mostly = { .print_conntrack = dccp_print_conntrack, #endif #if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .nlattr_size = DCCP_NLATTR_SIZE, .to_nlattr = dccp_to_nlattr, - .nlattr_size = dccp_nlattr_size, .from_nlattr = nlattr_to_dccp, .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index 1f86ddf6649a..6c6896d21cd7 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -12,7 +12,7 @@ #include <linux/netfilter.h> #include <net/netfilter/nf_conntrack_l4proto.h> -static unsigned int nf_ct_generic_timeout __read_mostly = 600*HZ; +static const unsigned int nf_ct_generic_timeout = 600*HZ; static bool nf_generic_should_process(u8 proto) { @@ -163,7 +163,7 @@ static struct nf_proto_net *generic_get_net_proto(struct net *net) return &net->ct.nf_ct_proto.generic.pn; } -struct nf_conntrack_l4proto nf_conntrack_l4proto_generic __read_mostly = +const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic = { .l3proto = PF_UNSPEC, .l4proto = 255, diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index a2503005d80b..d049ea5a3770 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -48,7 +48,7 @@ enum grep_conntrack { GRE_CT_MAX }; -static unsigned int gre_timeouts[GRE_CT_MAX] = { +static const unsigned int gre_timeouts[GRE_CT_MAX] = { [GRE_CT_UNREPLIED] = 30*HZ, [GRE_CT_REPLIED] = 180*HZ, }; @@ -352,7 +352,7 @@ static int gre_init_net(struct net *net, u_int16_t proto) } /* protocol helper struct */ -static struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 __read_mostly = { +static const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 = { .l3proto = AF_INET, .l4proto = IPPROTO_GRE, .pkt_to_tuple = gre_pkt_to_tuple, diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 80faf04ddf15..fb9a35d16069 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -52,7 +52,7 @@ static const char *const sctp_conntrack_names[] = { #define HOURS * 60 MINS #define DAYS * 24 HOURS -static unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] __read_mostly = { +static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = { [SCTP_CONNTRACK_CLOSED] = 10 SECS, [SCTP_CONNTRACK_COOKIE_WAIT] = 3 SECS, [SCTP_CONNTRACK_COOKIE_ECHOED] = 3 SECS, @@ -578,6 +578,11 @@ static const struct nla_policy sctp_nla_policy[CTA_PROTOINFO_SCTP_MAX+1] = { [CTA_PROTOINFO_SCTP_VTAG_REPLY] = { .type = NLA_U32 }, }; +#define SCTP_NLATTR_SIZE ( \ + NLA_ALIGN(NLA_HDRLEN + 1) + \ + NLA_ALIGN(NLA_HDRLEN + 4) + \ + NLA_ALIGN(NLA_HDRLEN + 4)) + static int nlattr_to_sctp(struct nlattr *cda[], struct nf_conn *ct) { struct nlattr *attr = cda[CTA_PROTOINFO_SCTP]; @@ -608,12 +613,6 @@ static int nlattr_to_sctp(struct nlattr *cda[], struct nf_conn *ct) return 0; } - -static int sctp_nlattr_size(void) -{ - return nla_total_size(0) /* CTA_PROTOINFO_SCTP */ - + nla_policy_len(sctp_nla_policy, CTA_PROTOINFO_SCTP_MAX + 1); -} #endif #if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) @@ -778,7 +777,7 @@ static struct nf_proto_net *sctp_get_net_proto(struct net *net) return &net->ct.nf_ct_proto.sctp.pn; } -struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = { +const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 = { .l3proto = PF_INET, .l4proto = IPPROTO_SCTP, .pkt_to_tuple = sctp_pkt_to_tuple, @@ -793,8 +792,8 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = { .can_early_drop = sctp_can_early_drop, .me = THIS_MODULE, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .nlattr_size = SCTP_NLATTR_SIZE, .to_nlattr = sctp_to_nlattr, - .nlattr_size = sctp_nlattr_size, .from_nlattr = nlattr_to_sctp, .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, @@ -815,7 +814,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = { }; EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp4); -struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = { +const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 = { .l3proto = PF_INET6, .l4proto = IPPROTO_SCTP, .pkt_to_tuple = sctp_pkt_to_tuple, @@ -830,8 +829,8 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = { .can_early_drop = sctp_can_early_drop, .me = THIS_MODULE, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .nlattr_size = SCTP_NLATTR_SIZE, .to_nlattr = sctp_to_nlattr, - .nlattr_size = sctp_nlattr_size, .from_nlattr = nlattr_to_sctp, .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index b12fc07111d0..e97cdc1cf98c 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -68,7 +68,7 @@ static const char *const tcp_conntrack_names[] = { #define HOURS * 60 MINS #define DAYS * 24 HOURS -static unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] __read_mostly = { +static const unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] = { [TCP_CONNTRACK_SYN_SENT] = 2 MINS, [TCP_CONNTRACK_SYN_RECV] = 60 SECS, [TCP_CONNTRACK_ESTABLISHED] = 5 DAYS, @@ -305,6 +305,9 @@ static bool tcp_invert_tuple(struct nf_conntrack_tuple *tuple, /* Print out the private part of the conntrack. */ static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { + if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) + return; + seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]); } #endif @@ -1039,6 +1042,9 @@ static int tcp_packet(struct nf_conn *ct, IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED && timeouts[new_state] > timeouts[TCP_CONNTRACK_UNACK]) timeout = timeouts[TCP_CONNTRACK_UNACK]; + else if (ct->proto.tcp.last_win == 0 && + timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS]) + timeout = timeouts[TCP_CONNTRACK_RETRANS]; else timeout = timeouts[new_state]; spin_unlock_bh(&ct->lock); @@ -1219,6 +1225,12 @@ static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = { [CTA_PROTOINFO_TCP_FLAGS_REPLY] = { .len = sizeof(struct nf_ct_tcp_flags) }, }; +#define TCP_NLATTR_SIZE ( \ + NLA_ALIGN(NLA_HDRLEN + 1) + \ + NLA_ALIGN(NLA_HDRLEN + 1) + \ + NLA_ALIGN(NLA_HDRLEN + sizeof(sizeof(struct nf_ct_tcp_flags))) + \ + NLA_ALIGN(NLA_HDRLEN + sizeof(sizeof(struct nf_ct_tcp_flags)))) + static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct) { struct nlattr *pattr = cda[CTA_PROTOINFO_TCP]; @@ -1271,12 +1283,6 @@ static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct) return 0; } -static int tcp_nlattr_size(void) -{ - return nla_total_size(0) /* CTA_PROTOINFO_TCP */ - + nla_policy_len(tcp_nla_policy, CTA_PROTOINFO_TCP_MAX + 1); -} - static unsigned int tcp_nlattr_tuple_size(void) { static unsigned int size __read_mostly; @@ -1538,7 +1544,7 @@ static struct nf_proto_net *tcp_get_net_proto(struct net *net) return &net->ct.nf_ct_proto.tcp.pn; } -struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly = +const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 = { .l3proto = PF_INET, .l4proto = IPPROTO_TCP, @@ -1554,11 +1560,11 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly = .can_early_drop = tcp_can_early_drop, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .to_nlattr = tcp_to_nlattr, - .nlattr_size = tcp_nlattr_size, .from_nlattr = nlattr_to_tcp, .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nlattr_tuple_size = tcp_nlattr_tuple_size, + .nlattr_size = TCP_NLATTR_SIZE, .nla_policy = nf_ct_port_nla_policy, #endif #if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) @@ -1576,7 +1582,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly = }; EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp4); -struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly = +const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 = { .l3proto = PF_INET6, .l4proto = IPPROTO_TCP, @@ -1591,8 +1597,8 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly = .error = tcp_error, .can_early_drop = tcp_can_early_drop, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .nlattr_size = TCP_NLATTR_SIZE, .to_nlattr = tcp_to_nlattr, - .nlattr_size = tcp_nlattr_size, .from_nlattr = nlattr_to_tcp, .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 3a5f727103af..fe7243970aa4 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -26,7 +26,7 @@ #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> -static unsigned int udp_timeouts[UDP_CT_MAX] = { +static const unsigned int udp_timeouts[UDP_CT_MAX] = { [UDP_CT_UNREPLIED] = 30*HZ, [UDP_CT_REPLIED] = 180*HZ, }; @@ -296,7 +296,7 @@ static struct nf_proto_net *udp_get_net_proto(struct net *net) return &net->ct.nf_ct_proto.udp.pn; } -struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly = +const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 = { .l3proto = PF_INET, .l4proto = IPPROTO_UDP, @@ -328,7 +328,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly = EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp4); #ifdef CONFIG_NF_CT_PROTO_UDPLITE -struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly = +const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 = { .l3proto = PF_INET, .l4proto = IPPROTO_UDPLITE, @@ -360,7 +360,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly = EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite4); #endif -struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly = +const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 = { .l3proto = PF_INET6, .l4proto = IPPROTO_UDP, @@ -392,7 +392,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly = EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6); #ifdef CONFIG_NF_CT_PROTO_UDPLITE -struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly = +const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 = { .l3proto = PF_INET6, .l4proto = IPPROTO_UDPLITE, diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 5a101caa3e12..9123fdec5e14 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -309,10 +309,12 @@ static int ct_seq_show(struct seq_file *s, void *v) WARN_ON(!l4proto); ret = -ENOSPC; - seq_printf(s, "%-8s %u %-8s %u %ld ", + seq_printf(s, "%-8s %u %-8s %u ", l3proto_name(l3proto->l3proto), nf_ct_l3num(ct), - l4proto_name(l4proto->l4proto), nf_ct_protonum(ct), - nf_ct_expires(ct) / HZ); + l4proto_name(l4proto->l4proto), nf_ct_protonum(ct)); + + if (!test_bit(IPS_OFFLOAD_BIT, &ct->status)) + seq_printf(s, "%ld ", nf_ct_expires(ct) / HZ); if (l4proto->print_conntrack) l4proto->print_conntrack(s, ct); @@ -339,7 +341,9 @@ static int ct_seq_show(struct seq_file *s, void *v) if (seq_print_acct(s, ct, IP_CT_DIR_REPLY)) goto release; - if (test_bit(IPS_ASSURED_BIT, &ct->status)) + if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) + seq_puts(s, "[OFFLOAD] "); + else if (test_bit(IPS_ASSURED_BIT, &ct->status)) seq_puts(s, "[ASSURED] "); if (seq_has_overflowed(s)) @@ -378,7 +382,6 @@ static int ct_open(struct inode *inode, struct file *file) } static const struct file_operations ct_file_ops = { - .owner = THIS_MODULE, .open = ct_open, .read = seq_read, .llseek = seq_lseek, @@ -471,7 +474,6 @@ static int ct_cpu_seq_open(struct inode *inode, struct file *file) } static const struct file_operations ct_cpu_seq_fops = { - .owner = THIS_MODULE, .open = ct_cpu_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/netfilter/nf_flow_table.c b/net/netfilter/nf_flow_table.c new file mode 100644 index 000000000000..2f5099cb85b8 --- /dev/null +++ b/net/netfilter/nf_flow_table.c @@ -0,0 +1,429 @@ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/rhashtable.h> +#include <linux/netdevice.h> +#include <net/netfilter/nf_flow_table.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_tuple.h> + +struct flow_offload_entry { + struct flow_offload flow; + struct nf_conn *ct; + struct rcu_head rcu_head; +}; + +struct flow_offload * +flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route) +{ + struct flow_offload_entry *entry; + struct flow_offload *flow; + + if (unlikely(nf_ct_is_dying(ct) || + !atomic_inc_not_zero(&ct->ct_general.use))) + return NULL; + + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (!entry) + goto err_ct_refcnt; + + flow = &entry->flow; + + if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst)) + goto err_dst_cache_original; + + if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst)) + goto err_dst_cache_reply; + + entry->ct = ct; + + switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num) { + case NFPROTO_IPV4: + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4 = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in; + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4 = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4 = + ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4 = + ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in; + break; + case NFPROTO_IPV6: + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6 = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in6; + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6 = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6 = + ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in6; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6 = + ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in6; + break; + } + + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l3proto = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l3proto = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l4proto = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum; + + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache = + route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache = + route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst; + + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port; + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port = + ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u.tcp.port; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port = + ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port; + + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dir = + FLOW_OFFLOAD_DIR_ORIGINAL; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dir = + FLOW_OFFLOAD_DIR_REPLY; + + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.iifidx = + route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].ifindex; + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.oifidx = + route->tuple[FLOW_OFFLOAD_DIR_REPLY].ifindex; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.iifidx = + route->tuple[FLOW_OFFLOAD_DIR_REPLY].ifindex; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.oifidx = + route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].ifindex; + + if (ct->status & IPS_SRC_NAT) + flow->flags |= FLOW_OFFLOAD_SNAT; + else if (ct->status & IPS_DST_NAT) + flow->flags |= FLOW_OFFLOAD_DNAT; + + return flow; + +err_dst_cache_reply: + dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst); +err_dst_cache_original: + kfree(entry); +err_ct_refcnt: + nf_ct_put(ct); + + return NULL; +} +EXPORT_SYMBOL_GPL(flow_offload_alloc); + +void flow_offload_free(struct flow_offload *flow) +{ + struct flow_offload_entry *e; + + dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache); + dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache); + e = container_of(flow, struct flow_offload_entry, flow); + kfree(e); +} +EXPORT_SYMBOL_GPL(flow_offload_free); + +void flow_offload_dead(struct flow_offload *flow) +{ + flow->flags |= FLOW_OFFLOAD_DYING; +} +EXPORT_SYMBOL_GPL(flow_offload_dead); + +int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) +{ + flow->timeout = (u32)jiffies; + + rhashtable_insert_fast(&flow_table->rhashtable, + &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node, + *flow_table->type->params); + rhashtable_insert_fast(&flow_table->rhashtable, + &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, + *flow_table->type->params); + return 0; +} +EXPORT_SYMBOL_GPL(flow_offload_add); + +void flow_offload_del(struct nf_flowtable *flow_table, + struct flow_offload *flow) +{ + struct flow_offload_entry *e; + + rhashtable_remove_fast(&flow_table->rhashtable, + &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node, + *flow_table->type->params); + rhashtable_remove_fast(&flow_table->rhashtable, + &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, + *flow_table->type->params); + + e = container_of(flow, struct flow_offload_entry, flow); + kfree_rcu(e, rcu_head); +} +EXPORT_SYMBOL_GPL(flow_offload_del); + +struct flow_offload_tuple_rhash * +flow_offload_lookup(struct nf_flowtable *flow_table, + struct flow_offload_tuple *tuple) +{ + return rhashtable_lookup_fast(&flow_table->rhashtable, tuple, + *flow_table->type->params); +} +EXPORT_SYMBOL_GPL(flow_offload_lookup); + +static void nf_flow_release_ct(const struct flow_offload *flow) +{ + struct flow_offload_entry *e; + + e = container_of(flow, struct flow_offload_entry, flow); + nf_ct_delete(e->ct, 0, 0); + nf_ct_put(e->ct); +} + +int nf_flow_table_iterate(struct nf_flowtable *flow_table, + void (*iter)(struct flow_offload *flow, void *data), + void *data) +{ + struct flow_offload_tuple_rhash *tuplehash; + struct rhashtable_iter hti; + struct flow_offload *flow; + int err; + + err = rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL); + if (err) + return err; + + rhashtable_walk_start(&hti); + + while ((tuplehash = rhashtable_walk_next(&hti))) { + if (IS_ERR(tuplehash)) { + err = PTR_ERR(tuplehash); + if (err != -EAGAIN) + goto out; + + continue; + } + if (tuplehash->tuple.dir) + continue; + + flow = container_of(tuplehash, struct flow_offload, tuplehash[0]); + + iter(flow, data); + } +out: + rhashtable_walk_stop(&hti); + rhashtable_walk_exit(&hti); + + return err; +} +EXPORT_SYMBOL_GPL(nf_flow_table_iterate); + +static inline bool nf_flow_has_expired(const struct flow_offload *flow) +{ + return (__s32)(flow->timeout - (u32)jiffies) <= 0; +} + +static inline bool nf_flow_is_dying(const struct flow_offload *flow) +{ + return flow->flags & FLOW_OFFLOAD_DYING; +} + +void nf_flow_offload_work_gc(struct work_struct *work) +{ + struct flow_offload_tuple_rhash *tuplehash; + struct nf_flowtable *flow_table; + struct rhashtable_iter hti; + struct flow_offload *flow; + int err; + + flow_table = container_of(work, struct nf_flowtable, gc_work.work); + + err = rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL); + if (err) + goto schedule; + + rhashtable_walk_start(&hti); + + while ((tuplehash = rhashtable_walk_next(&hti))) { + if (IS_ERR(tuplehash)) { + err = PTR_ERR(tuplehash); + if (err != -EAGAIN) + goto out; + + continue; + } + if (tuplehash->tuple.dir) + continue; + + flow = container_of(tuplehash, struct flow_offload, tuplehash[0]); + + if (nf_flow_has_expired(flow) || + nf_flow_is_dying(flow)) { + flow_offload_del(flow_table, flow); + nf_flow_release_ct(flow); + } + } +out: + rhashtable_walk_stop(&hti); + rhashtable_walk_exit(&hti); +schedule: + queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ); +} +EXPORT_SYMBOL_GPL(nf_flow_offload_work_gc); + +static u32 flow_offload_hash(const void *data, u32 len, u32 seed) +{ + const struct flow_offload_tuple *tuple = data; + + return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed); +} + +static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed) +{ + const struct flow_offload_tuple_rhash *tuplehash = data; + + return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed); +} + +static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const struct flow_offload_tuple *tuple = arg->key; + const struct flow_offload_tuple_rhash *x = ptr; + + if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir))) + return 1; + + return 0; +} + +const struct rhashtable_params nf_flow_offload_rhash_params = { + .head_offset = offsetof(struct flow_offload_tuple_rhash, node), + .hashfn = flow_offload_hash, + .obj_hashfn = flow_offload_hash_obj, + .obj_cmpfn = flow_offload_hash_cmp, + .automatic_shrinking = true, +}; +EXPORT_SYMBOL_GPL(nf_flow_offload_rhash_params); + +static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff, + __be16 port, __be16 new_port) +{ + struct tcphdr *tcph; + + if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) || + skb_try_make_writable(skb, thoff + sizeof(*tcph))) + return -1; + + tcph = (void *)(skb_network_header(skb) + thoff); + inet_proto_csum_replace2(&tcph->check, skb, port, new_port, true); + + return 0; +} + +static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff, + __be16 port, __be16 new_port) +{ + struct udphdr *udph; + + if (!pskb_may_pull(skb, thoff + sizeof(*udph)) || + skb_try_make_writable(skb, thoff + sizeof(*udph))) + return -1; + + udph = (void *)(skb_network_header(skb) + thoff); + if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { + inet_proto_csum_replace2(&udph->check, skb, port, + new_port, true); + if (!udph->check) + udph->check = CSUM_MANGLED_0; + } + + return 0; +} + +static int nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff, + u8 protocol, __be16 port, __be16 new_port) +{ + switch (protocol) { + case IPPROTO_TCP: + if (nf_flow_nat_port_tcp(skb, thoff, port, new_port) < 0) + return NF_DROP; + break; + case IPPROTO_UDP: + if (nf_flow_nat_port_udp(skb, thoff, port, new_port) < 0) + return NF_DROP; + break; + } + + return 0; +} + +int nf_flow_snat_port(const struct flow_offload *flow, + struct sk_buff *skb, unsigned int thoff, + u8 protocol, enum flow_offload_tuple_dir dir) +{ + struct flow_ports *hdr; + __be16 port, new_port; + + if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) || + skb_try_make_writable(skb, thoff + sizeof(*hdr))) + return -1; + + hdr = (void *)(skb_network_header(skb) + thoff); + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + port = hdr->source; + new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port; + hdr->source = new_port; + break; + case FLOW_OFFLOAD_DIR_REPLY: + port = hdr->dest; + new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port; + hdr->dest = new_port; + break; + default: + return -1; + } + + return nf_flow_nat_port(skb, thoff, protocol, port, new_port); +} +EXPORT_SYMBOL_GPL(nf_flow_snat_port); + +int nf_flow_dnat_port(const struct flow_offload *flow, + struct sk_buff *skb, unsigned int thoff, + u8 protocol, enum flow_offload_tuple_dir dir) +{ + struct flow_ports *hdr; + __be16 port, new_port; + + if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) || + skb_try_make_writable(skb, thoff + sizeof(*hdr))) + return -1; + + hdr = (void *)(skb_network_header(skb) + thoff); + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + port = hdr->dest; + new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port; + hdr->dest = new_port; + break; + case FLOW_OFFLOAD_DIR_REPLY: + port = hdr->source; + new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port; + hdr->source = new_port; + break; + default: + return -1; + } + + return nf_flow_nat_port(skb, thoff, protocol, port, new_port); +} +EXPORT_SYMBOL_GPL(nf_flow_dnat_port); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c new file mode 100644 index 000000000000..281209aeba8f --- /dev/null +++ b/net/netfilter/nf_flow_table_inet.c @@ -0,0 +1,48 @@ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/rhashtable.h> +#include <net/netfilter/nf_flow_table.h> +#include <net/netfilter/nf_tables.h> + +static unsigned int +nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + switch (skb->protocol) { + case htons(ETH_P_IP): + return nf_flow_offload_ip_hook(priv, skb, state); + case htons(ETH_P_IPV6): + return nf_flow_offload_ipv6_hook(priv, skb, state); + } + + return NF_ACCEPT; +} + +static struct nf_flowtable_type flowtable_inet = { + .family = NFPROTO_INET, + .params = &nf_flow_offload_rhash_params, + .gc = nf_flow_offload_work_gc, + .hook = nf_flow_offload_inet_hook, + .owner = THIS_MODULE, +}; + +static int __init nf_flow_inet_module_init(void) +{ + nft_register_flowtable_type(&flowtable_inet); + + return 0; +} + +static void __exit nf_flow_inet_module_exit(void) +{ + nft_unregister_flowtable_type(&flowtable_inet); +} + +module_init(nf_flow_inet_module_init); +module_exit(nf_flow_inet_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_ALIAS_NF_FLOWTABLE(1); /* NFPROTO_INET */ diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index 44284cd2528d..18f6d7ae995b 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -10,7 +10,7 @@ int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, const struct nf_hook_entries *entries, unsigned int index, unsigned int verdict); -unsigned int nf_queue_nf_hook_drop(struct net *net); +void nf_queue_nf_hook_drop(struct net *net); /* nf_log.c */ int __init netfilter_log_init(void); diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 8bb152a7cca4..c2c1b16b7538 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -402,7 +402,6 @@ static int nflog_open(struct inode *inode, struct file *file) } static const struct file_operations nflog_file_ops = { - .owner = THIS_MODULE, .open = nflog_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index f7e21953b1de..d67a96a25a68 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -10,6 +10,8 @@ #include <linux/proc_fs.h> #include <linux/skbuff.h> #include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> #include <linux/netfilter_bridge.h> #include <linux/seq_file.h> #include <linux/rcupdate.h> @@ -96,30 +98,56 @@ void nf_queue_entry_get_refs(struct nf_queue_entry *entry) } EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); -unsigned int nf_queue_nf_hook_drop(struct net *net) +void nf_queue_nf_hook_drop(struct net *net) { const struct nf_queue_handler *qh; - unsigned int count = 0; rcu_read_lock(); qh = rcu_dereference(net->nf.queue_handler); if (qh) - count = qh->nf_hook_drop(net); + qh->nf_hook_drop(net); rcu_read_unlock(); - - return count; } EXPORT_SYMBOL_GPL(nf_queue_nf_hook_drop); +static void nf_ip_saveroute(const struct sk_buff *skb, + struct nf_queue_entry *entry) +{ + struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); + + if (entry->state.hook == NF_INET_LOCAL_OUT) { + const struct iphdr *iph = ip_hdr(skb); + + rt_info->tos = iph->tos; + rt_info->daddr = iph->daddr; + rt_info->saddr = iph->saddr; + rt_info->mark = skb->mark; + } +} + +static void nf_ip6_saveroute(const struct sk_buff *skb, + struct nf_queue_entry *entry) +{ + struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry); + + if (entry->state.hook == NF_INET_LOCAL_OUT) { + const struct ipv6hdr *iph = ipv6_hdr(skb); + + rt_info->daddr = iph->daddr; + rt_info->saddr = iph->saddr; + rt_info->mark = skb->mark; + } +} + static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, const struct nf_hook_entries *entries, unsigned int index, unsigned int queuenum) { int status = -ENOENT; struct nf_queue_entry *entry = NULL; - const struct nf_afinfo *afinfo; const struct nf_queue_handler *qh; struct net *net = state->net; + unsigned int route_key_size; /* QUEUE == DROP if no one is waiting, to be safe. */ qh = rcu_dereference(net->nf.queue_handler); @@ -128,11 +156,19 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, goto err; } - afinfo = nf_get_afinfo(state->pf); - if (!afinfo) - goto err; + switch (state->pf) { + case AF_INET: + route_key_size = sizeof(struct ip_rt_info); + break; + case AF_INET6: + route_key_size = sizeof(struct ip6_rt_info); + break; + default: + route_key_size = 0; + break; + } - entry = kmalloc(sizeof(*entry) + afinfo->route_key_size, GFP_ATOMIC); + entry = kmalloc(sizeof(*entry) + route_key_size, GFP_ATOMIC); if (!entry) { status = -ENOMEM; goto err; @@ -142,12 +178,21 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, .skb = skb, .state = *state, .hook_index = index, - .size = sizeof(*entry) + afinfo->route_key_size, + .size = sizeof(*entry) + route_key_size, }; nf_queue_entry_get_refs(entry); skb_dst_force(skb); - afinfo->saveroute(skb, entry); + + switch (entry->state.pf) { + case AF_INET: + nf_ip_saveroute(skb, entry); + break; + case AF_INET6: + nf_ip6_saveroute(skb, entry); + break; + } + status = qh->outfn(entry, queuenum); if (status < 0) { @@ -204,13 +249,31 @@ repeat: return NF_ACCEPT; } +static struct nf_hook_entries *nf_hook_entries_head(const struct net *net, u8 pf, u8 hooknum) +{ + switch (pf) { +#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE + case NFPROTO_BRIDGE: + return rcu_dereference(net->nf.hooks_bridge[hooknum]); +#endif + case NFPROTO_IPV4: + return rcu_dereference(net->nf.hooks_ipv4[hooknum]); + case NFPROTO_IPV6: + return rcu_dereference(net->nf.hooks_ipv6[hooknum]); + default: + WARN_ON_ONCE(1); + return NULL; + } + + return NULL; +} + /* Caller must hold rcu read-side lock */ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) { const struct nf_hook_entry *hook_entry; const struct nf_hook_entries *hooks; struct sk_buff *skb = entry->skb; - const struct nf_afinfo *afinfo; const struct net *net; unsigned int i; int err; @@ -219,12 +282,12 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) net = entry->state.net; pf = entry->state.pf; - hooks = rcu_dereference(net->nf.hooks[pf][entry->state.hook]); + hooks = nf_hook_entries_head(net, pf, entry->state.hook); nf_queue_entry_release_refs(entry); i = entry->hook_index; - if (WARN_ON_ONCE(i >= hooks->num_hook_entries)) { + if (WARN_ON_ONCE(!hooks || i >= hooks->num_hook_entries)) { kfree_skb(skb); kfree(entry); return; @@ -237,8 +300,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) verdict = nf_hook_entry_hookfn(hook_entry, skb, &entry->state); if (verdict == NF_ACCEPT) { - afinfo = nf_get_afinfo(entry->state.pf); - if (!afinfo || afinfo->reroute(entry->state.net, skb, entry) < 0) + if (nf_reroute(skb, entry) < 0) verdict = NF_DROP; } diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 49bd8bb16b18..92139a087260 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -317,7 +317,6 @@ static int synproxy_cpu_seq_open(struct inode *inode, struct file *file) } static const struct file_operations synproxy_cpu_seq_fops = { - .owner = THIS_MODULE, .open = synproxy_cpu_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d8327b43e4dc..0791813a1e7d 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -17,6 +17,7 @@ #include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_flow_table.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> #include <net/net_namespace.h> @@ -24,86 +25,20 @@ static LIST_HEAD(nf_tables_expressions); static LIST_HEAD(nf_tables_objects); - -/** - * nft_register_afinfo - register nf_tables address family info - * - * @afi: address family info to register - * - * Register the address family for use with nf_tables. Returns zero on - * success or a negative errno code otherwise. - */ -int nft_register_afinfo(struct net *net, struct nft_af_info *afi) -{ - INIT_LIST_HEAD(&afi->tables); - nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_add_tail_rcu(&afi->list, &net->nft.af_info); - nfnl_unlock(NFNL_SUBSYS_NFTABLES); - return 0; -} -EXPORT_SYMBOL_GPL(nft_register_afinfo); - -static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi); - -/** - * nft_unregister_afinfo - unregister nf_tables address family info - * - * @afi: address family info to unregister - * - * Unregister the address family for use with nf_tables. - */ -void nft_unregister_afinfo(struct net *net, struct nft_af_info *afi) -{ - nfnl_lock(NFNL_SUBSYS_NFTABLES); - __nft_release_afinfo(net, afi); - list_del_rcu(&afi->list); - nfnl_unlock(NFNL_SUBSYS_NFTABLES); -} -EXPORT_SYMBOL_GPL(nft_unregister_afinfo); - -static struct nft_af_info *nft_afinfo_lookup(struct net *net, int family) -{ - struct nft_af_info *afi; - - list_for_each_entry(afi, &net->nft.af_info, list) { - if (afi->family == family) - return afi; - } - return NULL; -} - -static struct nft_af_info * -nf_tables_afinfo_lookup(struct net *net, int family, bool autoload) -{ - struct nft_af_info *afi; - - afi = nft_afinfo_lookup(net, family); - if (afi != NULL) - return afi; -#ifdef CONFIG_MODULES - if (autoload) { - nfnl_unlock(NFNL_SUBSYS_NFTABLES); - request_module("nft-afinfo-%u", family); - nfnl_lock(NFNL_SUBSYS_NFTABLES); - afi = nft_afinfo_lookup(net, family); - if (afi != NULL) - return ERR_PTR(-EAGAIN); - } -#endif - return ERR_PTR(-EAFNOSUPPORT); -} +static LIST_HEAD(nf_tables_flowtables); +static u64 table_handle; static void nft_ctx_init(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, const struct nlmsghdr *nlh, - struct nft_af_info *afi, + u8 family, struct nft_table *table, struct nft_chain *chain, const struct nlattr * const *nla) { ctx->net = net; - ctx->afi = afi; + ctx->family = family; ctx->table = table; ctx->chain = chain; ctx->nla = nla; @@ -139,29 +74,26 @@ static void nft_trans_destroy(struct nft_trans *trans) kfree(trans); } -static int nf_tables_register_hooks(struct net *net, - const struct nft_table *table, - struct nft_chain *chain, - unsigned int hook_nops) +static int nf_tables_register_hook(struct net *net, + const struct nft_table *table, + struct nft_chain *chain) { if (table->flags & NFT_TABLE_F_DORMANT || !nft_is_base_chain(chain)) return 0; - return nf_register_net_hooks(net, nft_base_chain(chain)->ops, - hook_nops); + return nf_register_net_hook(net, &nft_base_chain(chain)->ops); } -static void nf_tables_unregister_hooks(struct net *net, - const struct nft_table *table, - struct nft_chain *chain, - unsigned int hook_nops) +static void nf_tables_unregister_hook(struct net *net, + const struct nft_table *table, + struct nft_chain *chain) { if (table->flags & NFT_TABLE_F_DORMANT || !nft_is_base_chain(chain)) return; - nf_unregister_net_hooks(net, nft_base_chain(chain)->ops, hook_nops); + nf_unregister_net_hook(net, &nft_base_chain(chain)->ops); } static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) @@ -348,34 +280,99 @@ static int nft_delobj(struct nft_ctx *ctx, struct nft_object *obj) return err; } +static int nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type, + struct nft_flowtable *flowtable) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc(ctx, msg_type, + sizeof(struct nft_trans_flowtable)); + if (trans == NULL) + return -ENOMEM; + + if (msg_type == NFT_MSG_NEWFLOWTABLE) + nft_activate_next(ctx->net, flowtable); + + nft_trans_flowtable(trans) = flowtable; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + + return 0; +} + +static int nft_delflowtable(struct nft_ctx *ctx, + struct nft_flowtable *flowtable) +{ + int err; + + err = nft_trans_flowtable_add(ctx, NFT_MSG_DELFLOWTABLE, flowtable); + if (err < 0) + return err; + + nft_deactivate_next(ctx->net, flowtable); + ctx->table->use--; + + return err; +} + /* * Tables */ -static struct nft_table *nft_table_lookup(const struct nft_af_info *afi, +static struct nft_table *nft_table_lookup(const struct net *net, const struct nlattr *nla, - u8 genmask) + u8 family, u8 genmask) { struct nft_table *table; - list_for_each_entry(table, &afi->tables, list) { + list_for_each_entry(table, &net->nft.tables, list) { if (!nla_strcmp(nla, table->name) && + table->family == family && + nft_active_genmask(table, genmask)) + return table; + } + return NULL; +} + +static struct nft_table *nft_table_lookup_byhandle(const struct net *net, + const struct nlattr *nla, + u8 genmask) +{ + struct nft_table *table; + + list_for_each_entry(table, &net->nft.tables, list) { + if (be64_to_cpu(nla_get_be64(nla)) == table->handle && nft_active_genmask(table, genmask)) return table; } return NULL; } -static struct nft_table *nf_tables_table_lookup(const struct nft_af_info *afi, +static struct nft_table *nf_tables_table_lookup(const struct net *net, const struct nlattr *nla, - u8 genmask) + u8 family, u8 genmask) +{ + struct nft_table *table; + + if (nla == NULL) + return ERR_PTR(-EINVAL); + + table = nft_table_lookup(net, nla, family, genmask); + if (table != NULL) + return table; + + return ERR_PTR(-ENOENT); +} + +static struct nft_table *nf_tables_table_lookup_byhandle(const struct net *net, + const struct nlattr *nla, + u8 genmask) { struct nft_table *table; if (nla == NULL) return ERR_PTR(-EINVAL); - table = nft_table_lookup(afi, nla, genmask); + table = nft_table_lookup_byhandle(net, nla, genmask); if (table != NULL) return table; @@ -390,7 +387,7 @@ static inline u64 nf_tables_alloc_handle(struct nft_table *table) static const struct nf_chain_type *chain_type[NFPROTO_NUMPROTO][NFT_CHAIN_T_MAX]; static const struct nf_chain_type * -__nf_tables_chain_type_lookup(int family, const struct nlattr *nla) +__nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family) { int i; @@ -403,22 +400,20 @@ __nf_tables_chain_type_lookup(int family, const struct nlattr *nla) } static const struct nf_chain_type * -nf_tables_chain_type_lookup(const struct nft_af_info *afi, - const struct nlattr *nla, - bool autoload) +nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family, bool autoload) { const struct nf_chain_type *type; - type = __nf_tables_chain_type_lookup(afi->family, nla); + type = __nf_tables_chain_type_lookup(nla, family); if (type != NULL) return type; #ifdef CONFIG_MODULES if (autoload) { nfnl_unlock(NFNL_SUBSYS_NFTABLES); - request_module("nft-chain-%u-%.*s", afi->family, + request_module("nft-chain-%u-%.*s", family, nla_len(nla), (const char *)nla_data(nla)); nfnl_lock(NFNL_SUBSYS_NFTABLES); - type = __nf_tables_chain_type_lookup(afi->family, nla); + type = __nf_tables_chain_type_lookup(nla, family); if (type != NULL) return ERR_PTR(-EAGAIN); } @@ -430,6 +425,7 @@ static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { [NFTA_TABLE_NAME] = { .type = NLA_STRING, .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_TABLE_FLAGS] = { .type = NLA_U32 }, + [NFTA_TABLE_HANDLE] = { .type = NLA_U64 }, }; static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, @@ -451,7 +447,9 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) || nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags)) || - nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use))) + nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use)) || + nla_put_be64(skb, NFTA_TABLE_HANDLE, cpu_to_be64(table->handle), + NFTA_TABLE_PAD)) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -476,7 +474,7 @@ static void nf_tables_table_notify(const struct nft_ctx *ctx, int event) goto err; err = nf_tables_fill_table_info(skb, ctx->net, ctx->portid, ctx->seq, - event, 0, ctx->afi->family, ctx->table); + event, 0, ctx->family, ctx->table); if (err < 0) { kfree_skb(skb); goto err; @@ -493,7 +491,6 @@ static int nf_tables_dump_tables(struct sk_buff *skb, struct netlink_callback *cb) { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); - const struct nft_af_info *afi; const struct nft_table *table; unsigned int idx = 0, s_idx = cb->args[0]; struct net *net = sock_net(skb->sk); @@ -502,30 +499,27 @@ static int nf_tables_dump_tables(struct sk_buff *skb, rcu_read_lock(); cb->seq = net->nft.base_seq; - list_for_each_entry_rcu(afi, &net->nft.af_info, list) { - if (family != NFPROTO_UNSPEC && family != afi->family) + list_for_each_entry_rcu(table, &net->nft.tables, list) { + if (family != NFPROTO_UNSPEC && family != table->family) continue; - list_for_each_entry_rcu(table, &afi->tables, list) { - if (idx < s_idx) - goto cont; - if (idx > s_idx) - memset(&cb->args[1], 0, - sizeof(cb->args) - sizeof(cb->args[0])); - if (!nft_is_active(net, table)) - continue; - if (nf_tables_fill_table_info(skb, net, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NFT_MSG_NEWTABLE, - NLM_F_MULTI, - afi->family, table) < 0) - goto done; - - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + if (idx < s_idx) + goto cont; + if (idx > s_idx) + memset(&cb->args[1], 0, + sizeof(cb->args) - sizeof(cb->args[0])); + if (!nft_is_active(net, table)) + continue; + if (nf_tables_fill_table_info(skb, net, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFT_MSG_NEWTABLE, NLM_F_MULTI, + table->family, table) < 0) + goto done; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: - idx++; - } + idx++; } done: rcu_read_unlock(); @@ -540,7 +534,6 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_cur(net); - const struct nft_af_info *afi; const struct nft_table *table; struct sk_buff *skb2; int family = nfmsg->nfgen_family; @@ -553,11 +546,8 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk, return netlink_dump_start(nlsk, skb, nlh, &c); } - afi = nf_tables_afinfo_lookup(net, family, false); - if (IS_ERR(afi)) - return PTR_ERR(afi); - - table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME], genmask); + table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -578,10 +568,7 @@ err: return err; } -static void _nf_tables_table_disable(struct net *net, - const struct nft_af_info *afi, - struct nft_table *table, - u32 cnt) +static void nft_table_disable(struct net *net, struct nft_table *table, u32 cnt) { struct nft_chain *chain; u32 i = 0; @@ -595,14 +582,11 @@ static void _nf_tables_table_disable(struct net *net, if (cnt && i++ == cnt) break; - nf_unregister_net_hooks(net, nft_base_chain(chain)->ops, - afi->nops); + nf_unregister_net_hook(net, &nft_base_chain(chain)->ops); } } -static int nf_tables_table_enable(struct net *net, - const struct nft_af_info *afi, - struct nft_table *table) +static int nf_tables_table_enable(struct net *net, struct nft_table *table) { struct nft_chain *chain; int err, i = 0; @@ -613,8 +597,7 @@ static int nf_tables_table_enable(struct net *net, if (!nft_is_base_chain(chain)) continue; - err = nf_register_net_hooks(net, nft_base_chain(chain)->ops, - afi->nops); + err = nf_register_net_hook(net, &nft_base_chain(chain)->ops); if (err < 0) goto err; @@ -623,15 +606,13 @@ static int nf_tables_table_enable(struct net *net, return 0; err: if (i) - _nf_tables_table_disable(net, afi, table, i); + nft_table_disable(net, table, i); return err; } -static void nf_tables_table_disable(struct net *net, - const struct nft_af_info *afi, - struct nft_table *table) +static void nf_tables_table_disable(struct net *net, struct nft_table *table) { - _nf_tables_table_disable(net, afi, table, 0); + nft_table_disable(net, table, 0); } static int nf_tables_updtable(struct nft_ctx *ctx) @@ -660,7 +641,7 @@ static int nf_tables_updtable(struct nft_ctx *ctx) nft_trans_table_enable(trans) = false; } else if (!(flags & NFT_TABLE_F_DORMANT) && ctx->table->flags & NFT_TABLE_F_DORMANT) { - ret = nf_tables_table_enable(ctx->net, ctx->afi, ctx->table); + ret = nf_tables_table_enable(ctx->net, ctx->table); if (ret >= 0) { ctx->table->flags &= ~NFT_TABLE_F_DORMANT; nft_trans_table_enable(trans) = true; @@ -685,19 +666,14 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); const struct nlattr *name; - struct nft_af_info *afi; struct nft_table *table; int family = nfmsg->nfgen_family; u32 flags = 0; struct nft_ctx ctx; int err; - afi = nf_tables_afinfo_lookup(net, family, true); - if (IS_ERR(afi)) - return PTR_ERR(afi); - name = nla[NFTA_TABLE_NAME]; - table = nf_tables_table_lookup(afi, name, genmask); + table = nf_tables_table_lookup(net, name, family, genmask); if (IS_ERR(table)) { if (PTR_ERR(table) != -ENOENT) return PTR_ERR(table); @@ -707,7 +683,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); return nf_tables_updtable(&ctx); } @@ -717,47 +693,45 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, return -EINVAL; } - err = -EAFNOSUPPORT; - if (!try_module_get(afi->owner)) - goto err1; - err = -ENOMEM; table = kzalloc(sizeof(*table), GFP_KERNEL); if (table == NULL) - goto err2; + goto err_kzalloc; table->name = nla_strdup(name, GFP_KERNEL); if (table->name == NULL) - goto err3; + goto err_strdup; INIT_LIST_HEAD(&table->chains); INIT_LIST_HEAD(&table->sets); INIT_LIST_HEAD(&table->objects); + INIT_LIST_HEAD(&table->flowtables); + table->family = family; table->flags = flags; + table->handle = ++table_handle; - nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE); if (err < 0) - goto err4; + goto err_trans; - list_add_tail_rcu(&table->list, &afi->tables); + list_add_tail_rcu(&table->list, &net->nft.tables); return 0; -err4: +err_trans: kfree(table->name); -err3: +err_strdup: kfree(table); -err2: - module_put(afi->owner); -err1: +err_kzalloc: return err; } static int nft_flush_table(struct nft_ctx *ctx) { - int err; + struct nft_flowtable *flowtable, *nft; struct nft_chain *chain, *nc; struct nft_object *obj, *ne; struct nft_set *set, *ns; + int err; list_for_each_entry(chain, &ctx->table->chains, list) { if (!nft_is_active_next(ctx->net, chain)) @@ -774,7 +748,7 @@ static int nft_flush_table(struct nft_ctx *ctx) if (!nft_is_active_next(ctx->net, set)) continue; - if (set->flags & NFT_SET_ANONYMOUS && + if (nft_set_is_anonymous(set) && !list_empty(&set->bindings)) continue; @@ -783,6 +757,12 @@ static int nft_flush_table(struct nft_ctx *ctx) goto out; } + list_for_each_entry_safe(flowtable, nft, &ctx->table->flowtables, list) { + err = nft_delflowtable(ctx, flowtable); + if (err < 0) + goto out; + } + list_for_each_entry_safe(obj, ne, &ctx->table->objects, list) { err = nft_delobj(ctx, obj); if (err < 0) @@ -807,30 +787,28 @@ out: static int nft_flush(struct nft_ctx *ctx, int family) { - struct nft_af_info *afi; struct nft_table *table, *nt; const struct nlattr * const *nla = ctx->nla; int err = 0; - list_for_each_entry(afi, &ctx->net->nft.af_info, list) { - if (family != AF_UNSPEC && afi->family != family) + list_for_each_entry_safe(table, nt, &ctx->net->nft.tables, list) { + if (family != AF_UNSPEC && table->family != family) continue; - ctx->afi = afi; - list_for_each_entry_safe(table, nt, &afi->tables, list) { - if (!nft_is_active_next(ctx->net, table)) - continue; + ctx->family = table->family; - if (nla[NFTA_TABLE_NAME] && - nla_strcmp(nla[NFTA_TABLE_NAME], table->name) != 0) - continue; + if (!nft_is_active_next(ctx->net, table)) + continue; - ctx->table = table; + if (nla[NFTA_TABLE_NAME] && + nla_strcmp(nla[NFTA_TABLE_NAME], table->name) != 0) + continue; - err = nft_flush_table(ctx); - if (err < 0) - goto out; - } + ctx->table = table; + + err = nft_flush_table(ctx); + if (err < 0) + goto out; } out: return err; @@ -843,20 +821,23 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); - struct nft_af_info *afi; struct nft_table *table; int family = nfmsg->nfgen_family; struct nft_ctx ctx; - nft_ctx_init(&ctx, net, skb, nlh, NULL, NULL, NULL, nla); - if (family == AF_UNSPEC || nla[NFTA_TABLE_NAME] == NULL) + nft_ctx_init(&ctx, net, skb, nlh, 0, NULL, NULL, nla); + if (family == AF_UNSPEC || + (!nla[NFTA_TABLE_NAME] && !nla[NFTA_TABLE_HANDLE])) return nft_flush(&ctx, family); - afi = nf_tables_afinfo_lookup(net, family, false); - if (IS_ERR(afi)) - return PTR_ERR(afi); + if (nla[NFTA_TABLE_HANDLE]) + table = nf_tables_table_lookup_byhandle(net, + nla[NFTA_TABLE_HANDLE], + genmask); + else + table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME], + family, genmask); - table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME], genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -864,7 +845,7 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk, table->use > 0) return -EBUSY; - ctx.afi = afi; + ctx.family = family; ctx.table = table; return nft_flush_table(&ctx); @@ -876,7 +857,6 @@ static void nf_tables_table_destroy(struct nft_ctx *ctx) kfree(ctx->table->name); kfree(ctx->table); - module_put(ctx->afi->owner); } int nft_register_chain_type(const struct nf_chain_type *ctype) @@ -1026,7 +1006,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, if (nft_is_base_chain(chain)) { const struct nft_base_chain *basechain = nft_base_chain(chain); - const struct nf_hook_ops *ops = &basechain->ops[0]; + const struct nf_hook_ops *ops = &basechain->ops; struct nlattr *nest; nest = nla_nest_start(skb, NFTA_CHAIN_HOOK); @@ -1077,7 +1057,7 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event) goto err; err = nf_tables_fill_chain_info(skb, ctx->net, ctx->portid, ctx->seq, - event, 0, ctx->afi->family, ctx->table, + event, 0, ctx->family, ctx->table, ctx->chain); if (err < 0) { kfree_skb(skb); @@ -1095,7 +1075,6 @@ static int nf_tables_dump_chains(struct sk_buff *skb, struct netlink_callback *cb) { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); - const struct nft_af_info *afi; const struct nft_table *table; const struct nft_chain *chain; unsigned int idx = 0, s_idx = cb->args[0]; @@ -1105,31 +1084,30 @@ static int nf_tables_dump_chains(struct sk_buff *skb, rcu_read_lock(); cb->seq = net->nft.base_seq; - list_for_each_entry_rcu(afi, &net->nft.af_info, list) { - if (family != NFPROTO_UNSPEC && family != afi->family) + list_for_each_entry_rcu(table, &net->nft.tables, list) { + if (family != NFPROTO_UNSPEC && family != table->family) continue; - list_for_each_entry_rcu(table, &afi->tables, list) { - list_for_each_entry_rcu(chain, &table->chains, list) { - if (idx < s_idx) - goto cont; - if (idx > s_idx) - memset(&cb->args[1], 0, - sizeof(cb->args) - sizeof(cb->args[0])); - if (!nft_is_active(net, chain)) - continue; - if (nf_tables_fill_chain_info(skb, net, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NFT_MSG_NEWCHAIN, - NLM_F_MULTI, - afi->family, table, chain) < 0) - goto done; + list_for_each_entry_rcu(chain, &table->chains, list) { + if (idx < s_idx) + goto cont; + if (idx > s_idx) + memset(&cb->args[1], 0, + sizeof(cb->args) - sizeof(cb->args[0])); + if (!nft_is_active(net, chain)) + continue; + if (nf_tables_fill_chain_info(skb, net, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFT_MSG_NEWCHAIN, + NLM_F_MULTI, + table->family, table, + chain) < 0) + goto done; - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: - idx++; - } + idx++; } } done: @@ -1145,7 +1123,6 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_cur(net); - const struct nft_af_info *afi; const struct nft_table *table; const struct nft_chain *chain; struct sk_buff *skb2; @@ -1159,11 +1136,8 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, return netlink_dump_start(nlsk, skb, nlh, &c); } - afi = nf_tables_afinfo_lookup(net, family, false); - if (IS_ERR(afi)) - return PTR_ERR(afi); - - table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], genmask); + table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -1227,13 +1201,13 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr) static void nft_chain_stats_replace(struct nft_base_chain *chain, struct nft_stats __percpu *newstats) { + struct nft_stats __percpu *oldstats; + if (newstats == NULL) return; if (chain->stats) { - struct nft_stats __percpu *oldstats = - nft_dereference(chain->stats); - + oldstats = nfnl_dereference(chain->stats, NFNL_SUBSYS_NFTABLES); rcu_assign_pointer(chain->stats, newstats); synchronize_rcu(); free_percpu(oldstats); @@ -1252,8 +1226,8 @@ static void nf_tables_chain_destroy(struct nft_chain *chain) free_percpu(basechain->stats); if (basechain->stats) static_branch_dec(&nft_counters_enabled); - if (basechain->ops[0].dev != NULL) - dev_put(basechain->ops[0].dev); + if (basechain->ops.dev != NULL) + dev_put(basechain->ops.dev); kfree(chain->name); kfree(basechain); } else { @@ -1264,15 +1238,15 @@ static void nf_tables_chain_destroy(struct nft_chain *chain) struct nft_chain_hook { u32 num; - u32 priority; + s32 priority; const struct nf_chain_type *type; struct net_device *dev; }; static int nft_chain_parse_hook(struct net *net, const struct nlattr * const nla[], - struct nft_af_info *afi, - struct nft_chain_hook *hook, bool create) + struct nft_chain_hook *hook, u8 family, + bool create) { struct nlattr *ha[NFTA_HOOK_MAX + 1]; const struct nf_chain_type *type; @@ -1289,27 +1263,29 @@ static int nft_chain_parse_hook(struct net *net, return -EINVAL; hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM])); - if (hook->num >= afi->nhooks) - return -EINVAL; - hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY])); - type = chain_type[afi->family][NFT_CHAIN_T_DEFAULT]; + type = chain_type[family][NFT_CHAIN_T_DEFAULT]; if (nla[NFTA_CHAIN_TYPE]) { - type = nf_tables_chain_type_lookup(afi, nla[NFTA_CHAIN_TYPE], - create); + type = nf_tables_chain_type_lookup(nla[NFTA_CHAIN_TYPE], + family, create); if (IS_ERR(type)) return PTR_ERR(type); } if (!(type->hook_mask & (1 << hook->num))) return -EOPNOTSUPP; + + if (type->type == NFT_CHAIN_T_NAT && + hook->priority <= NF_IP_PRI_CONNTRACK) + return -EOPNOTSUPP; + if (!try_module_get(type->owner)) return -ENOENT; hook->type = type; hook->dev = NULL; - if (afi->flags & NFT_AF_NEEDS_DEV) { + if (family == NFPROTO_NETDEV) { char ifname[IFNAMSIZ]; if (!ha[NFTA_HOOK_DEV]) { @@ -1344,12 +1320,10 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, { const struct nlattr * const *nla = ctx->nla; struct nft_table *table = ctx->table; - struct nft_af_info *afi = ctx->afi; struct nft_base_chain *basechain; struct nft_stats __percpu *stats; struct net *net = ctx->net; struct nft_chain *chain; - unsigned int i; int err; if (table->use == UINT_MAX) @@ -1358,9 +1332,8 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (nla[NFTA_CHAIN_HOOK]) { struct nft_chain_hook hook; struct nf_hook_ops *ops; - nf_hookfn *hookfn; - err = nft_chain_parse_hook(net, nla, afi, &hook, create); + err = nft_chain_parse_hook(net, nla, &hook, family, create); if (err < 0) return err; @@ -1384,23 +1357,19 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, static_branch_inc(&nft_counters_enabled); } - hookfn = hook.type->hooks[hook.num]; basechain->type = hook.type; chain = &basechain->chain; - for (i = 0; i < afi->nops; i++) { - ops = &basechain->ops[i]; - ops->pf = family; - ops->hooknum = hook.num; - ops->priority = hook.priority; - ops->priv = chain; - ops->hook = afi->hooks[ops->hooknum]; - ops->dev = hook.dev; - if (hookfn) - ops->hook = hookfn; - if (afi->hook_ops_init) - afi->hook_ops_init(ops, i); - } + ops = &basechain->ops; + ops->pf = family; + ops->hooknum = hook.num; + ops->priority = hook.priority; + ops->priv = chain; + ops->hook = hook.type->hooks[ops->hooknum]; + ops->dev = hook.dev; + + if (basechain->type->type == NFT_CHAIN_T_NAT) + ops->nat_hook = true; chain->flags |= NFT_BASE_CHAIN; basechain->policy = policy; @@ -1418,7 +1387,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, goto err1; } - err = nf_tables_register_hooks(net, table, chain, afi->nops); + err = nf_tables_register_hook(net, table, chain); if (err < 0) goto err1; @@ -1432,7 +1401,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, return 0; err2: - nf_tables_unregister_hooks(net, table, chain, afi->nops); + nf_tables_unregister_hook(net, table, chain); err1: nf_tables_chain_destroy(chain); @@ -1445,20 +1414,19 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, const struct nlattr * const *nla = ctx->nla; struct nft_table *table = ctx->table; struct nft_chain *chain = ctx->chain; - struct nft_af_info *afi = ctx->afi; struct nft_base_chain *basechain; struct nft_stats *stats = NULL; struct nft_chain_hook hook; const struct nlattr *name; struct nf_hook_ops *ops; struct nft_trans *trans; - int err, i; + int err; if (nla[NFTA_CHAIN_HOOK]) { if (!nft_is_base_chain(chain)) return -EBUSY; - err = nft_chain_parse_hook(ctx->net, nla, ctx->afi, &hook, + err = nft_chain_parse_hook(ctx->net, nla, &hook, ctx->family, create); if (err < 0) return err; @@ -1469,14 +1437,12 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, return -EBUSY; } - for (i = 0; i < afi->nops; i++) { - ops = &basechain->ops[i]; - if (ops->hooknum != hook.num || - ops->priority != hook.priority || - ops->dev != hook.dev) { - nft_chain_release_hook(&hook); - return -EBUSY; - } + ops = &basechain->ops; + if (ops->hooknum != hook.num || + ops->priority != hook.priority || + ops->dev != hook.dev) { + nft_chain_release_hook(&hook); + return -EBUSY; } nft_chain_release_hook(&hook); } @@ -1539,7 +1505,6 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, const struct nlattr * uninitialized_var(name); u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; - struct nft_af_info *afi; struct nft_table *table; struct nft_chain *chain; u8 policy = NF_ACCEPT; @@ -1549,11 +1514,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; - afi = nf_tables_afinfo_lookup(net, family, true); - if (IS_ERR(afi)) - return PTR_ERR(afi); - - table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], genmask); + table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -1593,7 +1555,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, } } - nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); if (chain != NULL) { if (nlh->nlmsg_flags & NLM_F_EXCL) @@ -1614,24 +1576,26 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); - struct nft_af_info *afi; struct nft_table *table; struct nft_chain *chain; struct nft_rule *rule; int family = nfmsg->nfgen_family; struct nft_ctx ctx; + u64 handle; u32 use; int err; - afi = nf_tables_afinfo_lookup(net, family, false); - if (IS_ERR(afi)) - return PTR_ERR(afi); - - table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], genmask); + table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); - chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); + if (nla[NFTA_CHAIN_HANDLE]) { + handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE])); + chain = nf_tables_chain_lookup_byhandle(table, handle, genmask); + } else { + chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); + } if (IS_ERR(chain)) return PTR_ERR(chain); @@ -1639,7 +1603,7 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, chain->use > 0) return -EBUSY; - nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); use = chain->use; list_for_each_entry(rule, &chain->rules, list) { @@ -1804,7 +1768,7 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx, if (err < 0) return err; - type = nft_expr_type_get(ctx->afi->family, tb[NFTA_EXPR_NAME]); + type = nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]); if (IS_ERR(type)) return PTR_ERR(type); @@ -2027,7 +1991,7 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx, goto err; err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq, - event, 0, ctx->afi->family, ctx->table, + event, 0, ctx->family, ctx->table, ctx->chain, rule); if (err < 0) { kfree_skb(skb); @@ -2051,7 +2015,6 @@ static int nf_tables_dump_rules(struct sk_buff *skb, { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); const struct nft_rule_dump_ctx *ctx = cb->data; - const struct nft_af_info *afi; const struct nft_table *table; const struct nft_chain *chain; const struct nft_rule *rule; @@ -2062,39 +2025,37 @@ static int nf_tables_dump_rules(struct sk_buff *skb, rcu_read_lock(); cb->seq = net->nft.base_seq; - list_for_each_entry_rcu(afi, &net->nft.af_info, list) { - if (family != NFPROTO_UNSPEC && family != afi->family) + list_for_each_entry_rcu(table, &net->nft.tables, list) { + if (family != NFPROTO_UNSPEC && family != table->family) continue; - list_for_each_entry_rcu(table, &afi->tables, list) { - if (ctx && ctx->table && - strcmp(ctx->table, table->name) != 0) + if (ctx && ctx->table && strcmp(ctx->table, table->name) != 0) + continue; + + list_for_each_entry_rcu(chain, &table->chains, list) { + if (ctx && ctx->chain && + strcmp(ctx->chain, chain->name) != 0) continue; - list_for_each_entry_rcu(chain, &table->chains, list) { - if (ctx && ctx->chain[0] && - strcmp(ctx->chain, chain->name) != 0) - continue; - - list_for_each_entry_rcu(rule, &chain->rules, list) { - if (!nft_is_active(net, rule)) - goto cont; - if (idx < s_idx) - goto cont; - if (idx > s_idx) - memset(&cb->args[1], 0, - sizeof(cb->args) - sizeof(cb->args[0])); - if (nf_tables_fill_rule_info(skb, net, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NFT_MSG_NEWRULE, - NLM_F_MULTI | NLM_F_APPEND, - afi->family, table, chain, rule) < 0) - goto done; - - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + list_for_each_entry_rcu(rule, &chain->rules, list) { + if (!nft_is_active(net, rule)) + goto cont; + if (idx < s_idx) + goto cont; + if (idx > s_idx) + memset(&cb->args[1], 0, + sizeof(cb->args) - sizeof(cb->args[0])); + if (nf_tables_fill_rule_info(skb, net, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFT_MSG_NEWRULE, + NLM_F_MULTI | NLM_F_APPEND, + table->family, + table, chain, rule) < 0) + goto done; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: - idx++; - } + idx++; } } } @@ -2124,7 +2085,6 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_cur(net); - const struct nft_af_info *afi; const struct nft_table *table; const struct nft_chain *chain; const struct nft_rule *rule; @@ -2168,11 +2128,8 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, return netlink_dump_start(nlsk, skb, nlh, &c); } - afi = nf_tables_afinfo_lookup(net, family, false); - if (IS_ERR(afi)) - return PTR_ERR(afi); - - table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], genmask); + table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -2229,7 +2186,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); - struct nft_af_info *afi; + int family = nfmsg->nfgen_family; struct nft_table *table; struct nft_chain *chain; struct nft_rule *rule, *old_rule = NULL; @@ -2245,11 +2202,8 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; - afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, create); - if (IS_ERR(afi)) - return PTR_ERR(afi); - - table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], genmask); + table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -2288,7 +2242,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, return PTR_ERR(old_rule); } - nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); n = 0; size = 0; @@ -2412,18 +2366,14 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); - struct nft_af_info *afi; struct nft_table *table; struct nft_chain *chain = NULL; struct nft_rule *rule; int family = nfmsg->nfgen_family, err = 0; struct nft_ctx ctx; - afi = nf_tables_afinfo_lookup(net, family, false); - if (IS_ERR(afi)) - return PTR_ERR(afi); - - table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], genmask); + table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -2434,7 +2384,7 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, return PTR_ERR(chain); } - nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); if (chain) { if (nla[NFTA_RULE_HANDLE]) { @@ -2601,6 +2551,7 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_USERDATA] = { .type = NLA_BINARY, .len = NFT_USERDATA_MAXLEN }, [NFTA_SET_OBJ_TYPE] = { .type = NLA_U32 }, + [NFTA_SET_HANDLE] = { .type = NLA_U64 }, }; static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { @@ -2614,26 +2565,17 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, u8 genmask) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - struct nft_af_info *afi = NULL; + int family = nfmsg->nfgen_family; struct nft_table *table = NULL; - if (nfmsg->nfgen_family != NFPROTO_UNSPEC) { - afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false); - if (IS_ERR(afi)) - return PTR_ERR(afi); - } - if (nla[NFTA_SET_TABLE] != NULL) { - if (afi == NULL) - return -EAFNOSUPPORT; - - table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE], - genmask); + table = nf_tables_table_lookup(net, nla[NFTA_SET_TABLE], + family, genmask); if (IS_ERR(table)) return PTR_ERR(table); } - nft_ctx_init(ctx, net, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla); return 0; } @@ -2653,6 +2595,22 @@ static struct nft_set *nf_tables_set_lookup(const struct nft_table *table, return ERR_PTR(-ENOENT); } +static struct nft_set *nf_tables_set_lookup_byhandle(const struct nft_table *table, + const struct nlattr *nla, u8 genmask) +{ + struct nft_set *set; + + if (nla == NULL) + return ERR_PTR(-EINVAL); + + list_for_each_entry(set, &table->sets, list) { + if (be64_to_cpu(nla_get_be64(nla)) == set->handle && + nft_active_genmask(set, genmask)) + return set; + } + return ERR_PTR(-ENOENT); +} + static struct nft_set *nf_tables_set_lookup_byid(const struct net *net, const struct nlattr *nla, u8 genmask) @@ -2760,7 +2718,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, goto nla_put_failure; nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = ctx->afi->family; + nfmsg->nfgen_family = ctx->family; nfmsg->version = NFNETLINK_V0; nfmsg->res_id = htons(ctx->net->nft.base_seq & 0xffff); @@ -2768,6 +2726,9 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, goto nla_put_failure; if (nla_put_string(skb, NFTA_SET_NAME, set->name)) goto nla_put_failure; + if (nla_put_be64(skb, NFTA_SET_HANDLE, cpu_to_be64(set->handle), + NFTA_SET_PAD)) + goto nla_put_failure; if (set->flags != 0) if (nla_put_be32(skb, NFTA_SET_FLAGS, htonl(set->flags))) goto nla_put_failure; @@ -2852,10 +2813,8 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) { const struct nft_set *set; unsigned int idx, s_idx = cb->args[0]; - struct nft_af_info *afi; struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2]; struct net *net = sock_net(skb->sk); - int cur_family = cb->args[3]; struct nft_ctx *ctx = cb->data, ctx_set; if (cb->args[1]) @@ -2864,51 +2823,44 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); cb->seq = net->nft.base_seq; - list_for_each_entry_rcu(afi, &net->nft.af_info, list) { - if (ctx->afi && ctx->afi != afi) + list_for_each_entry_rcu(table, &net->nft.tables, list) { + if (ctx->family != NFPROTO_UNSPEC && + ctx->family != table->family) + continue; + + if (ctx->table && ctx->table != table) continue; - if (cur_family) { - if (afi->family != cur_family) + if (cur_table) { + if (cur_table != table) continue; - cur_family = 0; + cur_table = NULL; } - list_for_each_entry_rcu(table, &afi->tables, list) { - if (ctx->table && ctx->table != table) - continue; + idx = 0; + list_for_each_entry_rcu(set, &table->sets, list) { + if (idx < s_idx) + goto cont; + if (!nft_is_active(net, set)) + goto cont; - if (cur_table) { - if (cur_table != table) - continue; + ctx_set = *ctx; + ctx_set.table = table; + ctx_set.family = table->family; - cur_table = NULL; + if (nf_tables_fill_set(skb, &ctx_set, set, + NFT_MSG_NEWSET, + NLM_F_MULTI) < 0) { + cb->args[0] = idx; + cb->args[2] = (unsigned long) table; + goto done; } - idx = 0; - list_for_each_entry_rcu(set, &table->sets, list) { - if (idx < s_idx) - goto cont; - if (!nft_is_active(net, set)) - goto cont; - - ctx_set = *ctx; - ctx_set.table = table; - ctx_set.afi = afi; - if (nf_tables_fill_set(skb, &ctx_set, set, - NFT_MSG_NEWSET, - NLM_F_MULTI) < 0) { - cb->args[0] = idx; - cb->args[2] = (unsigned long) table; - cb->args[3] = afi->family; - goto done; - } - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: - idx++; - } - if (s_idx) - s_idx = 0; + idx++; } + if (s_idx) + s_idx = 0; } cb->args[1] = 1; done: @@ -3006,8 +2958,8 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); + int family = nfmsg->nfgen_family; const struct nft_set_ops *ops; - struct nft_af_info *afi; struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; @@ -3114,15 +3066,12 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; - afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, create); - if (IS_ERR(afi)) - return PTR_ERR(afi); - - table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE], genmask); + table = nf_tables_table_lookup(net, nla[NFTA_SET_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); - nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) { @@ -3188,6 +3137,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, set->udata = udata; set->timeout = timeout; set->gc_int = gc_int; + set->handle = nf_tables_alloc_handle(table); err = ops->init(set, &desc, nla); if (err < 0) @@ -3245,7 +3195,10 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk, if (err < 0) return err; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); + if (nla[NFTA_SET_HANDLE]) + set = nf_tables_set_lookup_byhandle(ctx.table, nla[NFTA_SET_HANDLE], genmask); + else + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) return PTR_ERR(set); @@ -3277,7 +3230,7 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *i; struct nft_set_iter iter; - if (!list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS) + if (!list_empty(&set->bindings) && nft_set_is_anonymous(set)) return -EBUSY; if (binding->flags & NFT_SET_MAP) { @@ -3312,7 +3265,7 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, { list_del_rcu(&binding->list); - if (list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS && + if (list_empty(&set->bindings) && nft_set_is_anonymous(set) && nft_is_active(ctx->net, set)) nf_tables_set_destroy(ctx, set); } @@ -3380,19 +3333,15 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net, u8 genmask) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - struct nft_af_info *afi; + int family = nfmsg->nfgen_family; struct nft_table *table; - afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false); - if (IS_ERR(afi)) - return PTR_ERR(afi); - - table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE], - genmask); + table = nf_tables_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], + family, genmask); if (IS_ERR(table)) return PTR_ERR(table); - nft_ctx_init(ctx, net, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla); return 0; } @@ -3497,7 +3446,6 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) { struct nft_set_dump_ctx *dump_ctx = cb->data; struct net *net = sock_net(skb->sk); - struct nft_af_info *afi; struct nft_table *table; struct nft_set *set; struct nft_set_dump_args args; @@ -3509,21 +3457,19 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) int event; rcu_read_lock(); - list_for_each_entry_rcu(afi, &net->nft.af_info, list) { - if (afi != dump_ctx->ctx.afi) + list_for_each_entry_rcu(table, &net->nft.tables, list) { + if (dump_ctx->ctx.family != NFPROTO_UNSPEC && + dump_ctx->ctx.family != table->family) continue; - list_for_each_entry_rcu(table, &afi->tables, list) { - if (table != dump_ctx->ctx.table) - continue; + if (table != dump_ctx->ctx.table) + continue; - list_for_each_entry_rcu(set, &table->sets, list) { - if (set == dump_ctx->set) { - set_found = true; - break; - } + list_for_each_entry_rcu(set, &table->sets, list) { + if (set == dump_ctx->set) { + set_found = true; + break; } - break; } break; } @@ -3543,7 +3489,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) goto nla_put_failure; nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = afi->family; + nfmsg->nfgen_family = table->family; nfmsg->version = NFNETLINK_V0; nfmsg->res_id = htons(net->nft.base_seq & 0xffff); @@ -3606,7 +3552,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, goto nla_put_failure; nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = ctx->afi->family; + nfmsg->nfgen_family = ctx->family; nfmsg->version = NFNETLINK_V0; nfmsg->res_id = htons(ctx->net->nft.base_seq & 0xffff); @@ -3963,7 +3909,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, list_for_each_entry(binding, &set->bindings, list) { struct nft_ctx bind_ctx = { .net = ctx->net, - .afi = ctx->afi, + .family = ctx->family, .table = ctx->table, .chain = (struct nft_chain *)binding->chain, }; @@ -4382,6 +4328,21 @@ struct nft_object *nf_tables_obj_lookup(const struct nft_table *table, } EXPORT_SYMBOL_GPL(nf_tables_obj_lookup); +struct nft_object *nf_tables_obj_lookup_byhandle(const struct nft_table *table, + const struct nlattr *nla, + u32 objtype, u8 genmask) +{ + struct nft_object *obj; + + list_for_each_entry(obj, &table->objects, list) { + if (be64_to_cpu(nla_get_be64(nla)) == obj->handle && + objtype == obj->ops->type->type && + nft_active_genmask(obj, genmask)) + return obj; + } + return ERR_PTR(-ENOENT); +} + static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = { [NFTA_OBJ_TABLE] = { .type = NLA_STRING, .len = NFT_TABLE_MAXNAMELEN - 1 }, @@ -4389,6 +4350,7 @@ static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = { .len = NFT_OBJ_MAXNAMELEN - 1 }, [NFTA_OBJ_TYPE] = { .type = NLA_U32 }, [NFTA_OBJ_DATA] = { .type = NLA_NESTED }, + [NFTA_OBJ_HANDLE] = { .type = NLA_U64}, }; static struct nft_object *nft_obj_init(const struct nft_ctx *ctx, @@ -4494,7 +4456,6 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, const struct nft_object_type *type; u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; - struct nft_af_info *afi; struct nft_table *table; struct nft_object *obj; struct nft_ctx ctx; @@ -4506,11 +4467,8 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, !nla[NFTA_OBJ_DATA]) return -EINVAL; - afi = nf_tables_afinfo_lookup(net, family, true); - if (IS_ERR(afi)) - return PTR_ERR(afi); - - table = nf_tables_table_lookup(afi, nla[NFTA_OBJ_TABLE], genmask); + table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -4528,7 +4486,7 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, return 0; } - nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); type = nft_obj_type_get(objtype); if (IS_ERR(type)) @@ -4540,6 +4498,8 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, goto err1; } obj->table = table; + obj->handle = nf_tables_alloc_handle(table); + obj->name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL); if (!obj->name) { err = -ENOMEM; @@ -4586,7 +4546,9 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net, nla_put_string(skb, NFTA_OBJ_NAME, obj->name) || nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->ops->type->type)) || nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) || - nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset)) + nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset) || + nla_put_be64(skb, NFTA_OBJ_HANDLE, cpu_to_be64(obj->handle), + NFTA_OBJ_PAD)) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -4605,7 +4567,6 @@ struct nft_obj_filter { static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); - const struct nft_af_info *afi; const struct nft_table *table; unsigned int idx = 0, s_idx = cb->args[0]; struct nft_obj_filter *filter = cb->data; @@ -4620,38 +4581,37 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); cb->seq = net->nft.base_seq; - list_for_each_entry_rcu(afi, &net->nft.af_info, list) { - if (family != NFPROTO_UNSPEC && family != afi->family) + list_for_each_entry_rcu(table, &net->nft.tables, list) { + if (family != NFPROTO_UNSPEC && family != table->family) continue; - list_for_each_entry_rcu(table, &afi->tables, list) { - list_for_each_entry_rcu(obj, &table->objects, list) { - if (!nft_is_active(net, obj)) - goto cont; - if (idx < s_idx) - goto cont; - if (idx > s_idx) - memset(&cb->args[1], 0, - sizeof(cb->args) - sizeof(cb->args[0])); - if (filter && filter->table[0] && - strcmp(filter->table, table->name)) - goto cont; - if (filter && - filter->type != NFT_OBJECT_UNSPEC && - obj->ops->type->type != filter->type) - goto cont; + list_for_each_entry_rcu(obj, &table->objects, list) { + if (!nft_is_active(net, obj)) + goto cont; + if (idx < s_idx) + goto cont; + if (idx > s_idx) + memset(&cb->args[1], 0, + sizeof(cb->args) - sizeof(cb->args[0])); + if (filter && filter->table[0] && + strcmp(filter->table, table->name)) + goto cont; + if (filter && + filter->type != NFT_OBJECT_UNSPEC && + obj->ops->type->type != filter->type) + goto cont; - if (nf_tables_fill_obj_info(skb, net, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NFT_MSG_NEWOBJ, - NLM_F_MULTI | NLM_F_APPEND, - afi->family, table, obj, reset) < 0) - goto done; + if (nf_tables_fill_obj_info(skb, net, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFT_MSG_NEWOBJ, + NLM_F_MULTI | NLM_F_APPEND, + table->family, table, + obj, reset) < 0) + goto done; - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: - idx++; - } + idx++; } } done: @@ -4665,8 +4625,10 @@ static int nf_tables_dump_obj_done(struct netlink_callback *cb) { struct nft_obj_filter *filter = cb->data; - kfree(filter->table); - kfree(filter); + if (filter) { + kfree(filter->table); + kfree(filter); + } return 0; } @@ -4701,7 +4663,6 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_cur(net); int family = nfmsg->nfgen_family; - const struct nft_af_info *afi; const struct nft_table *table; struct nft_object *obj; struct sk_buff *skb2; @@ -4732,11 +4693,8 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, !nla[NFTA_OBJ_TYPE]) return -EINVAL; - afi = nf_tables_afinfo_lookup(net, family, false); - if (IS_ERR(afi)) - return PTR_ERR(afi); - - table = nf_tables_table_lookup(afi, nla[NFTA_OBJ_TABLE], genmask); + table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -4782,32 +4740,33 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk, const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; - struct nft_af_info *afi; struct nft_table *table; struct nft_object *obj; struct nft_ctx ctx; u32 objtype; if (!nla[NFTA_OBJ_TYPE] || - !nla[NFTA_OBJ_NAME]) + (!nla[NFTA_OBJ_NAME] && !nla[NFTA_OBJ_HANDLE])) return -EINVAL; - afi = nf_tables_afinfo_lookup(net, family, true); - if (IS_ERR(afi)) - return PTR_ERR(afi); - - table = nf_tables_table_lookup(afi, nla[NFTA_OBJ_TABLE], genmask); + table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); - obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask); + if (nla[NFTA_OBJ_HANDLE]) + obj = nf_tables_obj_lookup_byhandle(table, nla[NFTA_OBJ_HANDLE], + objtype, genmask); + else + obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], + objtype, genmask); if (IS_ERR(obj)) return PTR_ERR(obj); if (obj->use > 0) return -EBUSY; - nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); return nft_delobj(&ctx, obj); } @@ -4845,7 +4804,613 @@ static void nf_tables_obj_notify(const struct nft_ctx *ctx, struct nft_object *obj, int event) { nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, ctx->seq, event, - ctx->afi->family, ctx->report, GFP_KERNEL); + ctx->family, ctx->report, GFP_KERNEL); +} + +/* + * Flow tables + */ +void nft_register_flowtable_type(struct nf_flowtable_type *type) +{ + nfnl_lock(NFNL_SUBSYS_NFTABLES); + list_add_tail_rcu(&type->list, &nf_tables_flowtables); + nfnl_unlock(NFNL_SUBSYS_NFTABLES); +} +EXPORT_SYMBOL_GPL(nft_register_flowtable_type); + +void nft_unregister_flowtable_type(struct nf_flowtable_type *type) +{ + nfnl_lock(NFNL_SUBSYS_NFTABLES); + list_del_rcu(&type->list); + nfnl_unlock(NFNL_SUBSYS_NFTABLES); +} +EXPORT_SYMBOL_GPL(nft_unregister_flowtable_type); + +static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = { + [NFTA_FLOWTABLE_TABLE] = { .type = NLA_STRING, + .len = NFT_NAME_MAXLEN - 1 }, + [NFTA_FLOWTABLE_NAME] = { .type = NLA_STRING, + .len = NFT_NAME_MAXLEN - 1 }, + [NFTA_FLOWTABLE_HOOK] = { .type = NLA_NESTED }, + [NFTA_FLOWTABLE_HANDLE] = { .type = NLA_U64 }, +}; + +struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table, + const struct nlattr *nla, + u8 genmask) +{ + struct nft_flowtable *flowtable; + + list_for_each_entry(flowtable, &table->flowtables, list) { + if (!nla_strcmp(nla, flowtable->name) && + nft_active_genmask(flowtable, genmask)) + return flowtable; + } + return ERR_PTR(-ENOENT); +} +EXPORT_SYMBOL_GPL(nf_tables_flowtable_lookup); + +struct nft_flowtable * +nf_tables_flowtable_lookup_byhandle(const struct nft_table *table, + const struct nlattr *nla, u8 genmask) +{ + struct nft_flowtable *flowtable; + + list_for_each_entry(flowtable, &table->flowtables, list) { + if (be64_to_cpu(nla_get_be64(nla)) == flowtable->handle && + nft_active_genmask(flowtable, genmask)) + return flowtable; + } + return ERR_PTR(-ENOENT); +} + +#define NFT_FLOWTABLE_DEVICE_MAX 8 + +static int nf_tables_parse_devices(const struct nft_ctx *ctx, + const struct nlattr *attr, + struct net_device *dev_array[], int *len) +{ + const struct nlattr *tmp; + struct net_device *dev; + char ifname[IFNAMSIZ]; + int rem, n = 0, err; + + nla_for_each_nested(tmp, attr, rem) { + if (nla_type(tmp) != NFTA_DEVICE_NAME) { + err = -EINVAL; + goto err1; + } + + nla_strlcpy(ifname, tmp, IFNAMSIZ); + dev = dev_get_by_name(ctx->net, ifname); + if (!dev) { + err = -ENOENT; + goto err1; + } + + dev_array[n++] = dev; + if (n == NFT_FLOWTABLE_DEVICE_MAX) { + err = -EFBIG; + goto err1; + } + } + if (!len) + return -EINVAL; + + err = 0; +err1: + *len = n; + return err; +} + +static const struct nla_policy nft_flowtable_hook_policy[NFTA_FLOWTABLE_HOOK_MAX + 1] = { + [NFTA_FLOWTABLE_HOOK_NUM] = { .type = NLA_U32 }, + [NFTA_FLOWTABLE_HOOK_PRIORITY] = { .type = NLA_U32 }, + [NFTA_FLOWTABLE_HOOK_DEVS] = { .type = NLA_NESTED }, +}; + +static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx, + const struct nlattr *attr, + struct nft_flowtable *flowtable) +{ + struct net_device *dev_array[NFT_FLOWTABLE_DEVICE_MAX]; + struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1]; + struct nf_hook_ops *ops; + int hooknum, priority; + int err, n = 0, i; + + err = nla_parse_nested(tb, NFTA_FLOWTABLE_HOOK_MAX, attr, + nft_flowtable_hook_policy, NULL); + if (err < 0) + return err; + + if (!tb[NFTA_FLOWTABLE_HOOK_NUM] || + !tb[NFTA_FLOWTABLE_HOOK_PRIORITY] || + !tb[NFTA_FLOWTABLE_HOOK_DEVS]) + return -EINVAL; + + hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM])); + if (hooknum != NF_NETDEV_INGRESS) + return -EINVAL; + + priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY])); + + err = nf_tables_parse_devices(ctx, tb[NFTA_FLOWTABLE_HOOK_DEVS], + dev_array, &n); + if (err < 0) + goto err1; + + ops = kzalloc(sizeof(struct nf_hook_ops) * n, GFP_KERNEL); + if (!ops) { + err = -ENOMEM; + goto err1; + } + + flowtable->hooknum = hooknum; + flowtable->priority = priority; + flowtable->ops = ops; + flowtable->ops_len = n; + + for (i = 0; i < n; i++) { + flowtable->ops[i].pf = NFPROTO_NETDEV; + flowtable->ops[i].hooknum = hooknum; + flowtable->ops[i].priority = priority; + flowtable->ops[i].priv = &flowtable->data.rhashtable; + flowtable->ops[i].hook = flowtable->data.type->hook; + flowtable->ops[i].dev = dev_array[i]; + } + + err = 0; +err1: + for (i = 0; i < n; i++) + dev_put(dev_array[i]); + + return err; +} + +static const struct nf_flowtable_type *__nft_flowtable_type_get(u8 family) +{ + const struct nf_flowtable_type *type; + + list_for_each_entry(type, &nf_tables_flowtables, list) { + if (family == type->family) + return type; + } + return NULL; +} + +static const struct nf_flowtable_type *nft_flowtable_type_get(u8 family) +{ + const struct nf_flowtable_type *type; + + type = __nft_flowtable_type_get(family); + if (type != NULL && try_module_get(type->owner)) + return type; + +#ifdef CONFIG_MODULES + if (type == NULL) { + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + request_module("nf-flowtable-%u", family); + nfnl_lock(NFNL_SUBSYS_NFTABLES); + if (__nft_flowtable_type_get(family)) + return ERR_PTR(-EAGAIN); + } +#endif + return ERR_PTR(-ENOENT); +} + +void nft_flow_table_iterate(struct net *net, + void (*iter)(struct nf_flowtable *flowtable, void *data), + void *data) +{ + struct nft_flowtable *flowtable; + const struct nft_table *table; + + rcu_read_lock(); + list_for_each_entry_rcu(table, &net->nft.tables, list) { + list_for_each_entry_rcu(flowtable, &table->flowtables, list) { + iter(&flowtable->data, data); + } + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(nft_flow_table_iterate); + +static void nft_unregister_flowtable_net_hooks(struct net *net, + struct nft_flowtable *flowtable) +{ + int i; + + for (i = 0; i < flowtable->ops_len; i++) { + if (!flowtable->ops[i].dev) + continue; + + nf_unregister_net_hook(net, &flowtable->ops[i]); + } +} + +static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, + struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[], + struct netlink_ext_ack *extack) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + const struct nf_flowtable_type *type; + u8 genmask = nft_genmask_next(net); + int family = nfmsg->nfgen_family; + struct nft_flowtable *flowtable; + struct nft_table *table; + struct nft_ctx ctx; + int err, i, k; + + if (!nla[NFTA_FLOWTABLE_TABLE] || + !nla[NFTA_FLOWTABLE_NAME] || + !nla[NFTA_FLOWTABLE_HOOK]) + return -EINVAL; + + table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], + family, genmask); + if (IS_ERR(table)) + return PTR_ERR(table); + + flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], + genmask); + if (IS_ERR(flowtable)) { + err = PTR_ERR(flowtable); + if (err != -ENOENT) + return err; + } else { + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + + return 0; + } + + nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + + flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL); + if (!flowtable) + return -ENOMEM; + + flowtable->table = table; + flowtable->handle = nf_tables_alloc_handle(table); + + flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL); + if (!flowtable->name) { + err = -ENOMEM; + goto err1; + } + + type = nft_flowtable_type_get(family); + if (IS_ERR(type)) { + err = PTR_ERR(type); + goto err2; + } + + flowtable->data.type = type; + err = rhashtable_init(&flowtable->data.rhashtable, type->params); + if (err < 0) + goto err3; + + err = nf_tables_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK], + flowtable); + if (err < 0) + goto err3; + + for (i = 0; i < flowtable->ops_len; i++) { + err = nf_register_net_hook(net, &flowtable->ops[i]); + if (err < 0) + goto err4; + } + + err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable); + if (err < 0) + goto err5; + + INIT_DEFERRABLE_WORK(&flowtable->data.gc_work, type->gc); + queue_delayed_work(system_power_efficient_wq, + &flowtable->data.gc_work, HZ); + + list_add_tail_rcu(&flowtable->list, &table->flowtables); + table->use++; + + return 0; +err5: + i = flowtable->ops_len; +err4: + for (k = i - 1; k >= 0; k--) + nf_unregister_net_hook(net, &flowtable->ops[i]); + + kfree(flowtable->ops); +err3: + module_put(type->owner); +err2: + kfree(flowtable->name); +err1: + kfree(flowtable); + return err; +} + +static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, + struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[], + struct netlink_ext_ack *extack) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_next(net); + int family = nfmsg->nfgen_family; + struct nft_flowtable *flowtable; + struct nft_table *table; + struct nft_ctx ctx; + + table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], + family, genmask); + if (IS_ERR(table)) + return PTR_ERR(table); + + if (nla[NFTA_FLOWTABLE_HANDLE]) + flowtable = nf_tables_flowtable_lookup_byhandle(table, + nla[NFTA_FLOWTABLE_HANDLE], + genmask); + else + flowtable = nf_tables_flowtable_lookup(table, + nla[NFTA_FLOWTABLE_NAME], + genmask); + if (IS_ERR(flowtable)) + return PTR_ERR(flowtable); + if (flowtable->use > 0) + return -EBUSY; + + nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + + return nft_delflowtable(&ctx, flowtable); +} + +static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, + u32 portid, u32 seq, int event, + u32 flags, int family, + struct nft_flowtable *flowtable) +{ + struct nlattr *nest, *nest_devs; + struct nfgenmsg *nfmsg; + struct nlmsghdr *nlh; + int i; + + event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); + if (nlh == NULL) + goto nla_put_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = family; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = htons(net->nft.base_seq & 0xffff); + + if (nla_put_string(skb, NFTA_FLOWTABLE_TABLE, flowtable->table->name) || + nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) || + nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) || + nla_put_be64(skb, NFTA_FLOWTABLE_HANDLE, cpu_to_be64(flowtable->handle), + NFTA_FLOWTABLE_PAD)) + goto nla_put_failure; + + nest = nla_nest_start(skb, NFTA_FLOWTABLE_HOOK); + if (nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_NUM, htonl(flowtable->hooknum)) || + nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_PRIORITY, htonl(flowtable->priority))) + goto nla_put_failure; + + nest_devs = nla_nest_start(skb, NFTA_FLOWTABLE_HOOK_DEVS); + if (!nest_devs) + goto nla_put_failure; + + for (i = 0; i < flowtable->ops_len; i++) { + if (flowtable->ops[i].dev && + nla_put_string(skb, NFTA_DEVICE_NAME, + flowtable->ops[i].dev->name)) + goto nla_put_failure; + } + nla_nest_end(skb, nest_devs); + nla_nest_end(skb, nest); + + nlmsg_end(skb, nlh); + return 0; + +nla_put_failure: + nlmsg_trim(skb, nlh); + return -1; +} + +struct nft_flowtable_filter { + char *table; +}; + +static int nf_tables_dump_flowtable(struct sk_buff *skb, + struct netlink_callback *cb) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + struct nft_flowtable_filter *filter = cb->data; + unsigned int idx = 0, s_idx = cb->args[0]; + struct net *net = sock_net(skb->sk); + int family = nfmsg->nfgen_family; + struct nft_flowtable *flowtable; + const struct nft_table *table; + + rcu_read_lock(); + cb->seq = net->nft.base_seq; + + list_for_each_entry_rcu(table, &net->nft.tables, list) { + if (family != NFPROTO_UNSPEC && family != table->family) + continue; + + list_for_each_entry_rcu(flowtable, &table->flowtables, list) { + if (!nft_is_active(net, flowtable)) + goto cont; + if (idx < s_idx) + goto cont; + if (idx > s_idx) + memset(&cb->args[1], 0, + sizeof(cb->args) - sizeof(cb->args[0])); + if (filter && filter->table[0] && + strcmp(filter->table, table->name)) + goto cont; + + if (nf_tables_fill_flowtable_info(skb, net, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFT_MSG_NEWFLOWTABLE, + NLM_F_MULTI | NLM_F_APPEND, + table->family, flowtable) < 0) + goto done; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + idx++; + } + } +done: + rcu_read_unlock(); + + cb->args[0] = idx; + return skb->len; +} + +static int nf_tables_dump_flowtable_done(struct netlink_callback *cb) +{ + struct nft_flowtable_filter *filter = cb->data; + + if (!filter) + return 0; + + kfree(filter->table); + kfree(filter); + + return 0; +} + +static struct nft_flowtable_filter * +nft_flowtable_filter_alloc(const struct nlattr * const nla[]) +{ + struct nft_flowtable_filter *filter; + + filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!filter) + return ERR_PTR(-ENOMEM); + + if (nla[NFTA_FLOWTABLE_TABLE]) { + filter->table = nla_strdup(nla[NFTA_FLOWTABLE_TABLE], + GFP_KERNEL); + if (!filter->table) { + kfree(filter); + return ERR_PTR(-ENOMEM); + } + } + return filter; +} + +static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, + struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[], + struct netlink_ext_ack *extack) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_cur(net); + int family = nfmsg->nfgen_family; + struct nft_flowtable *flowtable; + const struct nft_table *table; + struct sk_buff *skb2; + int err; + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = nf_tables_dump_flowtable, + .done = nf_tables_dump_flowtable_done, + }; + + if (nla[NFTA_FLOWTABLE_TABLE]) { + struct nft_flowtable_filter *filter; + + filter = nft_flowtable_filter_alloc(nla); + if (IS_ERR(filter)) + return -ENOMEM; + + c.data = filter; + } + return netlink_dump_start(nlsk, skb, nlh, &c); + } + + if (!nla[NFTA_FLOWTABLE_NAME]) + return -EINVAL; + + table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], + family, genmask); + if (IS_ERR(table)) + return PTR_ERR(table); + + flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], + genmask); + if (IS_ERR(flowtable)) + return PTR_ERR(flowtable); + + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb2) + return -ENOMEM; + + err = nf_tables_fill_flowtable_info(skb2, net, NETLINK_CB(skb).portid, + nlh->nlmsg_seq, + NFT_MSG_NEWFLOWTABLE, 0, family, + flowtable); + if (err < 0) + goto err; + + return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); +err: + kfree_skb(skb2); + return err; +} + +static void nf_tables_flowtable_notify(struct nft_ctx *ctx, + struct nft_flowtable *flowtable, + int event) +{ + struct sk_buff *skb; + int err; + + if (ctx->report && + !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) + return; + + skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb == NULL) + goto err; + + err = nf_tables_fill_flowtable_info(skb, ctx->net, ctx->portid, + ctx->seq, event, 0, + ctx->family, flowtable); + if (err < 0) { + kfree_skb(skb); + goto err; + } + + nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, + ctx->report, GFP_KERNEL); + return; +err: + nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); +} + +static void nft_flowtable_destroy(void *ptr, void *arg) +{ + kfree(ptr); +} + +static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) +{ + cancel_delayed_work_sync(&flowtable->data.gc_work); + kfree(flowtable->name); + rhashtable_free_and_destroy(&flowtable->data.rhashtable, + nft_flowtable_destroy, NULL); + module_put(flowtable->data.type->owner); } static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, @@ -4878,6 +5443,46 @@ nla_put_failure: return -EMSGSIZE; } +static void nft_flowtable_event(unsigned long event, struct net_device *dev, + struct nft_flowtable *flowtable) +{ + int i; + + for (i = 0; i < flowtable->ops_len; i++) { + if (flowtable->ops[i].dev != dev) + continue; + + nf_unregister_net_hook(dev_net(dev), &flowtable->ops[i]); + flowtable->ops[i].dev = NULL; + break; + } +} + +static int nf_tables_flowtable_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct nft_flowtable *flowtable; + struct nft_table *table; + + if (event != NETDEV_UNREGISTER) + return 0; + + nfnl_lock(NFNL_SUBSYS_NFTABLES); + list_for_each_entry(table, &dev_net(dev)->nft.tables, list) { + list_for_each_entry(flowtable, &table->flowtables, list) { + nft_flowtable_event(event, dev, flowtable); + } + } + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + + return NOTIFY_DONE; +} + +static struct notifier_block nf_tables_flowtable_notifier = { + .notifier_call = nf_tables_flowtable_event, +}; + static void nf_tables_gen_notify(struct net *net, struct sk_buff *skb, int event) { @@ -5030,6 +5635,21 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, + [NFT_MSG_NEWFLOWTABLE] = { + .call_batch = nf_tables_newflowtable, + .attr_count = NFTA_FLOWTABLE_MAX, + .policy = nft_flowtable_policy, + }, + [NFT_MSG_GETFLOWTABLE] = { + .call = nf_tables_getflowtable, + .attr_count = NFTA_FLOWTABLE_MAX, + .policy = nft_flowtable_policy, + }, + [NFT_MSG_DELFLOWTABLE] = { + .call_batch = nf_tables_delflowtable, + .attr_count = NFTA_FLOWTABLE_MAX, + .policy = nft_flowtable_policy, + }, }; static void nft_chain_commit_update(struct nft_trans *trans) @@ -5075,6 +5695,9 @@ static void nf_tables_commit_release(struct nft_trans *trans) case NFT_MSG_DELOBJ: nft_obj_destroy(nft_trans_obj(trans)); break; + case NFT_MSG_DELFLOWTABLE: + nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); + break; } kfree(trans); } @@ -5101,7 +5724,6 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) if (nft_trans_table_update(trans)) { if (!nft_trans_table_enable(trans)) { nf_tables_table_disable(net, - trans->ctx.afi, trans->ctx.table); trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; } @@ -5127,10 +5749,9 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) case NFT_MSG_DELCHAIN: list_del_rcu(&trans->ctx.chain->list); nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN); - nf_tables_unregister_hooks(trans->ctx.net, - trans->ctx.table, - trans->ctx.chain, - trans->ctx.afi->nops); + nf_tables_unregister_hook(trans->ctx.net, + trans->ctx.table, + trans->ctx.chain); break; case NFT_MSG_NEWRULE: nft_clear(trans->ctx.net, nft_trans_rule(trans)); @@ -5150,7 +5771,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) /* This avoids hitting -EBUSY when deleting the table * from the transaction. */ - if (nft_trans_set(trans)->flags & NFT_SET_ANONYMOUS && + if (nft_set_is_anonymous(nft_trans_set(trans)) && !list_empty(&nft_trans_set(trans)->bindings)) trans->ctx.table->use--; @@ -5193,6 +5814,21 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans), NFT_MSG_DELOBJ); break; + case NFT_MSG_NEWFLOWTABLE: + nft_clear(net, nft_trans_flowtable(trans)); + nf_tables_flowtable_notify(&trans->ctx, + nft_trans_flowtable(trans), + NFT_MSG_NEWFLOWTABLE); + nft_trans_destroy(trans); + break; + case NFT_MSG_DELFLOWTABLE: + list_del_rcu(&nft_trans_flowtable(trans)->list); + nf_tables_flowtable_notify(&trans->ctx, + nft_trans_flowtable(trans), + NFT_MSG_DELFLOWTABLE); + nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans)); + break; } } @@ -5230,6 +5866,9 @@ static void nf_tables_abort_release(struct nft_trans *trans) case NFT_MSG_NEWOBJ: nft_obj_destroy(nft_trans_obj(trans)); break; + case NFT_MSG_NEWFLOWTABLE: + nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); + break; } kfree(trans); } @@ -5246,7 +5885,6 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) if (nft_trans_table_update(trans)) { if (nft_trans_table_enable(trans)) { nf_tables_table_disable(net, - trans->ctx.afi, trans->ctx.table); trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; } @@ -5267,10 +5905,9 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) } else { trans->ctx.table->use--; list_del_rcu(&trans->ctx.chain->list); - nf_tables_unregister_hooks(trans->ctx.net, - trans->ctx.table, - trans->ctx.chain, - trans->ctx.afi->nops); + nf_tables_unregister_hook(trans->ctx.net, + trans->ctx.table, + trans->ctx.chain); } break; case NFT_MSG_DELCHAIN: @@ -5320,6 +5957,17 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) nft_clear(trans->ctx.net, nft_trans_obj(trans)); nft_trans_destroy(trans); break; + case NFT_MSG_NEWFLOWTABLE: + trans->ctx.table->use--; + list_del_rcu(&nft_trans_flowtable(trans)->list); + nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans)); + break; + case NFT_MSG_DELFLOWTABLE: + trans->ctx.table->use++; + nft_clear(trans->ctx.net, nft_trans_flowtable(trans)); + nft_trans_destroy(trans); + break; } } @@ -5371,7 +6019,7 @@ int nft_chain_validate_hooks(const struct nft_chain *chain, if (nft_is_base_chain(chain)) { basechain = nft_base_chain(chain); - if ((1 << basechain->ops[0].hooknum) & hook_flags) + if ((1 << basechain->ops.hooknum) & hook_flags) return 0; return -EOPNOTSUPP; @@ -5839,22 +6487,13 @@ int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, } EXPORT_SYMBOL_GPL(nft_data_dump); -static int __net_init nf_tables_init_net(struct net *net) -{ - INIT_LIST_HEAD(&net->nft.af_info); - INIT_LIST_HEAD(&net->nft.commit_list); - net->nft.base_seq = 1; - return 0; -} - int __nft_release_basechain(struct nft_ctx *ctx) { struct nft_rule *rule, *nr; BUG_ON(!nft_is_base_chain(ctx->chain)); - nf_tables_unregister_hooks(ctx->net, ctx->chain->table, ctx->chain, - ctx->afi->nops); + nf_tables_unregister_hook(ctx->net, ctx->chain->table, ctx->chain); list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) { list_del(&rule->list); ctx->chain->use--; @@ -5868,9 +6507,9 @@ int __nft_release_basechain(struct nft_ctx *ctx) } EXPORT_SYMBOL_GPL(__nft_release_basechain); -/* Called by nft_unregister_afinfo() from __net_exit path, nfnl_lock is held. */ -static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi) +static void __nft_release_tables(struct net *net) { + struct nft_flowtable *flowtable, *nf; struct nft_table *table, *nt; struct nft_chain *chain, *nc; struct nft_object *obj, *ne; @@ -5878,13 +6517,16 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi) struct nft_set *set, *ns; struct nft_ctx ctx = { .net = net, - .afi = afi, }; - list_for_each_entry_safe(table, nt, &afi->tables, list) { + list_for_each_entry_safe(table, nt, &net->nft.tables, list) { + ctx.family = table->family; + list_for_each_entry(chain, &table->chains, list) - nf_tables_unregister_hooks(net, table, chain, - afi->nops); + nf_tables_unregister_hook(net, table, chain); + list_for_each_entry(flowtable, &table->flowtables, list) + nf_unregister_net_hooks(net, flowtable->ops, + flowtable->ops_len); /* No packets are walking on these chains anymore. */ ctx.table = table; list_for_each_entry(chain, &table->chains, list) { @@ -5895,6 +6537,11 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi) nf_tables_rule_destroy(&ctx, rule); } } + list_for_each_entry_safe(flowtable, nf, &table->flowtables, list) { + list_del(&flowtable->list); + table->use--; + nf_tables_flowtable_destroy(flowtable); + } list_for_each_entry_safe(set, ns, &table->sets, list) { list_del(&set->list); table->use--; @@ -5915,8 +6562,24 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi) } } +static int __net_init nf_tables_init_net(struct net *net) +{ + INIT_LIST_HEAD(&net->nft.tables); + INIT_LIST_HEAD(&net->nft.commit_list); + net->nft.base_seq = 1; + return 0; +} + +static void __net_exit nf_tables_exit_net(struct net *net) +{ + __nft_release_tables(net); + WARN_ON_ONCE(!list_empty(&net->nft.tables)); + WARN_ON_ONCE(!list_empty(&net->nft.commit_list)); +} + static struct pernet_operations nf_tables_net_ops = { .init = nf_tables_init_net, + .exit = nf_tables_exit_net, }; static int __init nf_tables_module_init(void) @@ -5938,7 +6601,8 @@ static int __init nf_tables_module_init(void) if (err < 0) goto err3; - pr_info("nf_tables: (c) 2007-2009 Patrick McHardy <kaber@trash.net>\n"); + register_netdevice_notifier(&nf_tables_flowtable_notifier); + return register_pernet_subsys(&nf_tables_net_ops); err3: nf_tables_core_module_exit(); @@ -5952,6 +6616,7 @@ static void __exit nf_tables_module_exit(void) { unregister_pernet_subsys(&nf_tables_net_ops); nfnetlink_subsys_unregister(&nf_tables_subsys); + unregister_netdevice_notifier(&nf_tables_flowtable_notifier); rcu_barrier(); nf_tables_core_module_exit(); kfree(info); diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c index f713cc205669..e30c7da09d0d 100644 --- a/net/netfilter/nf_tables_inet.c +++ b/net/netfilter/nf_tables_inet.c @@ -9,6 +9,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/ip.h> +#include <linux/ipv6.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_tables.h> @@ -16,56 +17,27 @@ #include <net/netfilter/nf_tables_ipv6.h> #include <net/ip.h> -static void nft_inet_hook_ops_init(struct nf_hook_ops *ops, unsigned int n) +static unsigned int nft_do_chain_inet(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) { - struct nft_af_info *afi; - - if (n == 1) - afi = &nft_af_ipv4; - else - afi = &nft_af_ipv6; - - ops->pf = afi->family; - if (afi->hooks[ops->hooknum]) - ops->hook = afi->hooks[ops->hooknum]; -} - -static struct nft_af_info nft_af_inet __read_mostly = { - .family = NFPROTO_INET, - .nhooks = NF_INET_NUMHOOKS, - .owner = THIS_MODULE, - .nops = 2, - .hook_ops_init = nft_inet_hook_ops_init, -}; - -static int __net_init nf_tables_inet_init_net(struct net *net) -{ - net->nft.inet = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); - if (net->nft.inet == NULL) - return -ENOMEM; - memcpy(net->nft.inet, &nft_af_inet, sizeof(nft_af_inet)); - - if (nft_register_afinfo(net, net->nft.inet) < 0) - goto err; - - return 0; - -err: - kfree(net->nft.inet); - return -ENOMEM; -} - -static void __net_exit nf_tables_inet_exit_net(struct net *net) -{ - nft_unregister_afinfo(net, net->nft.inet); - kfree(net->nft.inet); + struct nft_pktinfo pkt; + + nft_set_pktinfo(&pkt, skb, state); + + switch (state->pf) { + case NFPROTO_IPV4: + nft_set_pktinfo_ipv4(&pkt, skb); + break; + case NFPROTO_IPV6: + nft_set_pktinfo_ipv6(&pkt, skb); + break; + default: + break; + } + + return nft_do_chain(&pkt, priv); } -static struct pernet_operations nf_tables_inet_net_ops = { - .init = nf_tables_inet_init_net, - .exit = nf_tables_inet_exit_net, -}; - static const struct nf_chain_type filter_inet = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, @@ -76,26 +48,22 @@ static const struct nf_chain_type filter_inet = { (1 << NF_INET_FORWARD) | (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING), + .hooks = { + [NF_INET_LOCAL_IN] = nft_do_chain_inet, + [NF_INET_LOCAL_OUT] = nft_do_chain_inet, + [NF_INET_FORWARD] = nft_do_chain_inet, + [NF_INET_PRE_ROUTING] = nft_do_chain_inet, + [NF_INET_POST_ROUTING] = nft_do_chain_inet, + }, }; static int __init nf_tables_inet_init(void) { - int ret; - - ret = nft_register_chain_type(&filter_inet); - if (ret < 0) - return ret; - - ret = register_pernet_subsys(&nf_tables_inet_net_ops); - if (ret < 0) - nft_unregister_chain_type(&filter_inet); - - return ret; + return nft_register_chain_type(&filter_inet); } static void __exit nf_tables_inet_exit(void) { - unregister_pernet_subsys(&nf_tables_inet_net_ops); nft_unregister_chain_type(&filter_inet); } @@ -104,4 +72,4 @@ module_exit(nf_tables_inet_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_FAMILY(1); +MODULE_ALIAS_NFT_CHAIN(1, "filter"); diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c index 403432988313..4041fafca934 100644 --- a/net/netfilter/nf_tables_netdev.c +++ b/net/netfilter/nf_tables_netdev.c @@ -21,66 +21,32 @@ nft_do_chain_netdev(void *priv, struct sk_buff *skb, { struct nft_pktinfo pkt; + nft_set_pktinfo(&pkt, skb, state); + switch (skb->protocol) { case htons(ETH_P_IP): - nft_set_pktinfo_ipv4_validate(&pkt, skb, state); + nft_set_pktinfo_ipv4_validate(&pkt, skb); break; case htons(ETH_P_IPV6): - nft_set_pktinfo_ipv6_validate(&pkt, skb, state); + nft_set_pktinfo_ipv6_validate(&pkt, skb); break; default: - nft_set_pktinfo_unspec(&pkt, skb, state); + nft_set_pktinfo_unspec(&pkt, skb); break; } return nft_do_chain(&pkt, priv); } -static struct nft_af_info nft_af_netdev __read_mostly = { - .family = NFPROTO_NETDEV, - .nhooks = NF_NETDEV_NUMHOOKS, - .owner = THIS_MODULE, - .flags = NFT_AF_NEEDS_DEV, - .nops = 1, - .hooks = { - [NF_NETDEV_INGRESS] = nft_do_chain_netdev, - }, -}; - -static int nf_tables_netdev_init_net(struct net *net) -{ - net->nft.netdev = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); - if (net->nft.netdev == NULL) - return -ENOMEM; - - memcpy(net->nft.netdev, &nft_af_netdev, sizeof(nft_af_netdev)); - - if (nft_register_afinfo(net, net->nft.netdev) < 0) - goto err; - - return 0; -err: - kfree(net->nft.netdev); - return -ENOMEM; -} - -static void nf_tables_netdev_exit_net(struct net *net) -{ - nft_unregister_afinfo(net, net->nft.netdev); - kfree(net->nft.netdev); -} - -static struct pernet_operations nf_tables_netdev_net_ops = { - .init = nf_tables_netdev_init_net, - .exit = nf_tables_netdev_exit_net, -}; - static const struct nf_chain_type nft_filter_chain_netdev = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, .family = NFPROTO_NETDEV, .owner = THIS_MODULE, .hook_mask = (1 << NF_NETDEV_INGRESS), + .hooks = { + [NF_NETDEV_INGRESS] = nft_do_chain_netdev, + }, }; static void nft_netdev_event(unsigned long event, struct net_device *dev, @@ -96,7 +62,7 @@ static void nft_netdev_event(unsigned long event, struct net_device *dev, __nft_release_basechain(ctx); break; case NETDEV_CHANGENAME: - if (dev->ifindex != basechain->ops[0].dev->ifindex) + if (dev->ifindex != basechain->ops.dev->ifindex) return; strncpy(basechain->dev_name, dev->name, IFNAMSIZ); @@ -108,7 +74,6 @@ static int nf_tables_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct nft_af_info *afi; struct nft_table *table; struct nft_chain *chain, *nr; struct nft_ctx ctx = { @@ -120,20 +85,18 @@ static int nf_tables_netdev_event(struct notifier_block *this, return NOTIFY_DONE; nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_for_each_entry(afi, &dev_net(dev)->nft.af_info, list) { - ctx.afi = afi; - if (afi->family != NFPROTO_NETDEV) + list_for_each_entry(table, &ctx.net->nft.tables, list) { + if (table->family != NFPROTO_NETDEV) continue; - list_for_each_entry(table, &afi->tables, list) { - ctx.table = table; - list_for_each_entry_safe(chain, nr, &table->chains, list) { - if (!nft_is_base_chain(chain)) - continue; + ctx.family = table->family; + ctx.table = table; + list_for_each_entry_safe(chain, nr, &table->chains, list) { + if (!nft_is_base_chain(chain)) + continue; - ctx.chain = chain; - nft_netdev_event(event, dev, &ctx); - } + ctx.chain = chain; + nft_netdev_event(event, dev, &ctx); } } nfnl_unlock(NFNL_SUBSYS_NFTABLES); @@ -153,27 +116,21 @@ static int __init nf_tables_netdev_init(void) if (ret) return ret; - ret = register_pernet_subsys(&nf_tables_netdev_net_ops); - if (ret) - goto err1; - ret = register_netdevice_notifier(&nf_tables_netdev_notifier); if (ret) - goto err2; + goto err_register_netdevice_notifier; return 0; -err2: - unregister_pernet_subsys(&nf_tables_netdev_net_ops); -err1: +err_register_netdevice_notifier: nft_unregister_chain_type(&nft_filter_chain_netdev); + return ret; } static void __exit nf_tables_netdev_exit(void) { unregister_netdevice_notifier(&nf_tables_netdev_notifier); - unregister_pernet_subsys(&nf_tables_netdev_net_ops); nft_unregister_chain_type(&nft_filter_chain_netdev); } @@ -182,4 +139,4 @@ module_exit(nf_tables_netdev_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); -MODULE_ALIAS_NFT_FAMILY(5); /* NFPROTO_NETDEV */ +MODULE_ALIAS_NFT_CHAIN(5, "filter"); /* NFPROTO_NETDEV */ diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 733d3e4a30d8..03ead8a9e90c 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -37,8 +37,6 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER); rcu_dereference_protected(table[(id)].subsys, \ lockdep_nfnl_is_held((id))) -static char __initdata nfversion[] = "0.30"; - static struct { struct mutex mutex; const struct nfnetlink_subsystem __rcu *subsys; @@ -580,13 +578,11 @@ static int __init nfnetlink_init(void) for (i=0; i<NFNL_SUBSYS_COUNT; i++) mutex_init(&table[i].mutex); - pr_info("Netfilter messages via NETLINK v%s.\n", nfversion); return register_pernet_subsys(&nfnetlink_net_ops); } static void __exit nfnetlink_exit(void) { - pr_info("Removing netfilter NETLINK layer.\n"); unregister_pernet_subsys(&nfnetlink_net_ops); } module_init(nfnetlink_init); diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index c45e6d4358ab..88d427f9f9e6 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -527,7 +527,6 @@ static int __init nfnl_acct_init(void) goto err_out; } - pr_info("nfnl_acct: registering with nfnetlink.\n"); ret = nfnetlink_subsys_register(&nfnl_acct_subsys); if (ret < 0) { pr_err("nfnl_acct_init: cannot register with nfnetlink.\n"); @@ -543,7 +542,6 @@ err_out: static void __exit nfnl_acct_exit(void) { - pr_info("nfnl_acct: unregistering from nfnetlink.\n"); nfnetlink_subsys_unregister(&nfnl_acct_subsys); unregister_pernet_subsys(&nfnl_acct_ops); } diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 41628b393673..d33ce6d5ebce 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -17,6 +17,7 @@ #include <linux/types.h> #include <linux/list.h> #include <linux/errno.h> +#include <linux/capability.h> #include <net/netlink.h> #include <net/sock.h> @@ -407,6 +408,9 @@ static int nfnl_cthelper_new(struct net *net, struct sock *nfnl, struct nfnl_cthelper *nlcth; int ret = 0; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (!tb[NFCTH_NAME] || !tb[NFCTH_TUPLE]) return -EINVAL; @@ -611,6 +615,9 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl, struct nfnl_cthelper *nlcth; bool tuple_set = false; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nfnl_cthelper_dump_table, @@ -678,6 +685,9 @@ static int nfnl_cthelper_del(struct net *net, struct sock *nfnl, struct nfnl_cthelper *nlcth, *n; int j = 0, ret; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (tb[NFCTH_NAME]) helper_name = nla_data(tb[NFCTH_NAME]); diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 32b1c0b44e79..95b04702a655 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -615,8 +615,6 @@ err_out: static void __exit cttimeout_exit(void) { - pr_info("cttimeout: unregistering from nfnetlink.\n"); - nfnetlink_subsys_unregister(&cttimeout_subsys); unregister_pernet_subsys(&cttimeout_ops); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index e5afab86381c..7b46aa4c478d 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -1054,7 +1054,6 @@ static int nful_open(struct inode *inode, struct file *file) } static const struct file_operations nful_file_ops = { - .owner = THIS_MODULE, .open = nful_open, .read = seq_read, .llseek = seq_lseek, @@ -1093,10 +1092,15 @@ static int __net_init nfnl_log_net_init(struct net *net) static void __net_exit nfnl_log_net_exit(struct net *net) { + struct nfnl_log_net *log = nfnl_log_pernet(net); + unsigned int i; + #ifdef CONFIG_PROC_FS remove_proc_entry("nfnetlink_log", net->nf.proc_netfilter); #endif nf_log_unset(net, &nfulnl_logger); + for (i = 0; i < INSTANCE_BUCKETS; i++) + WARN_ON_ONCE(!hlist_empty(&log->instance_table[i])); } static struct pernet_operations nfnl_log_net_ops = { diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index a16356cacec3..8bba23160a68 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -941,23 +941,18 @@ static struct notifier_block nfqnl_dev_notifier = { .notifier_call = nfqnl_rcv_dev_event, }; -static unsigned int nfqnl_nf_hook_drop(struct net *net) +static void nfqnl_nf_hook_drop(struct net *net) { struct nfnl_queue_net *q = nfnl_queue_pernet(net); - unsigned int instances = 0; int i; for (i = 0; i < INSTANCE_BUCKETS; i++) { struct nfqnl_instance *inst; struct hlist_head *head = &q->instance_table[i]; - hlist_for_each_entry_rcu(inst, head, hlist) { + hlist_for_each_entry_rcu(inst, head, hlist) nfqnl_flush(inst, NULL, 0); - instances++; - } } - - return instances; } static int @@ -1482,7 +1477,6 @@ static int nfqnl_open(struct inode *inode, struct file *file) } static const struct file_operations nfqnl_file_ops = { - .owner = THIS_MODULE, .open = nfqnl_open, .read = seq_read, .llseek = seq_lseek, @@ -1512,10 +1506,15 @@ static int __net_init nfnl_queue_net_init(struct net *net) static void __net_exit nfnl_queue_net_exit(struct net *net) { + struct nfnl_queue_net *q = nfnl_queue_pernet(net); + unsigned int i; + nf_unregister_queue_handler(net); #ifdef CONFIG_PROC_FS remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter); #endif + for (i = 0; i < INSTANCE_BUCKETS; i++) + WARN_ON_ONCE(!hlist_empty(&q->instance_table[i])); } static void nfnl_queue_net_exit_batch(struct list_head *net_exit_list) diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index c2945eb3397c..fa90a8402845 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -44,6 +44,7 @@ static void nft_cmp_eval(const struct nft_expr *expr, case NFT_CMP_LT: if (d == 0) goto mismatch; + /* fall through */ case NFT_CMP_LTE: if (d > 0) goto mismatch; @@ -51,6 +52,7 @@ static void nft_cmp_eval(const struct nft_expr *expr, case NFT_CMP_GT: if (d == 0) goto mismatch; + /* fall through */ case NFT_CMP_GTE: if (d < 0) goto mismatch; diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index b89f4f65b2a0..8e23726b9081 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -144,7 +144,7 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par, { par->net = ctx->net; par->table = ctx->table->name; - switch (ctx->afi->family) { + switch (ctx->family) { case AF_INET: entry->e4.ip.proto = proto; entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0; @@ -169,13 +169,13 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par, if (nft_is_base_chain(ctx->chain)) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); - const struct nf_hook_ops *ops = &basechain->ops[0]; + const struct nf_hook_ops *ops = &basechain->ops; par->hook_mask = 1 << ops->hooknum; } else { par->hook_mask = 0; } - par->family = ctx->afi->family; + par->family = ctx->family; par->nft_compat = true; } @@ -267,7 +267,7 @@ nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) par.net = ctx->net; par.target = target; par.targinfo = info; - par.family = ctx->afi->family; + par.family = ctx->family; if (par.target->destroy != NULL) par.target->destroy(&par); @@ -302,7 +302,7 @@ static int nft_target_validate(const struct nft_ctx *ctx, if (nft_is_base_chain(ctx->chain)) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); - const struct nf_hook_ops *ops = &basechain->ops[0]; + const struct nf_hook_ops *ops = &basechain->ops; hook_mask = 1 << ops->hooknum; if (target->hooks && !(hook_mask & target->hooks)) @@ -358,7 +358,7 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, { par->net = ctx->net; par->table = ctx->table->name; - switch (ctx->afi->family) { + switch (ctx->family) { case AF_INET: entry->e4.ip.proto = proto; entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0; @@ -383,13 +383,13 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, if (nft_is_base_chain(ctx->chain)) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); - const struct nf_hook_ops *ops = &basechain->ops[0]; + const struct nf_hook_ops *ops = &basechain->ops; par->hook_mask = 1 << ops->hooknum; } else { par->hook_mask = 0; } - par->family = ctx->afi->family; + par->family = ctx->family; par->nft_compat = true; } @@ -446,7 +446,7 @@ nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) par.net = ctx->net; par.match = match; par.matchinfo = info; - par.family = ctx->afi->family; + par.family = ctx->family; if (par.match->destroy != NULL) par.match->destroy(&par); @@ -481,7 +481,7 @@ static int nft_match_validate(const struct nft_ctx *ctx, if (nft_is_base_chain(ctx->chain)) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); - const struct nf_hook_ops *ops = &basechain->ops[0]; + const struct nf_hook_ops *ops = &basechain->ops; hook_mask = 1 << ops->hooknum; if (match->hooks && !(hook_mask & match->hooks)) @@ -648,7 +648,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, mt_name = nla_data(tb[NFTA_MATCH_NAME]); rev = ntohl(nla_get_be32(tb[NFTA_MATCH_REV])); - family = ctx->afi->family; + family = ctx->family; /* Re-use the existing match if it's already loaded. */ list_for_each_entry(nft_match, &nft_match_list, head) { @@ -733,7 +733,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, tg_name = nla_data(tb[NFTA_TARGET_NAME]); rev = ntohl(nla_get_be32(tb[NFTA_TARGET_REV])); - family = ctx->afi->family; + family = ctx->family; /* Re-use the existing target if it's already loaded. */ list_for_each_entry(nft_target, &nft_target_list, head) { @@ -812,8 +812,6 @@ static int __init nft_compat_module_init(void) goto err_target; } - pr_info("nf_tables_compat: (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>\n"); - return ret; err_target: diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 2647b895f4b0..6ab274b14484 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -405,7 +405,7 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, if (tb[NFTA_CT_DIRECTION] == NULL) return -EINVAL; - switch (ctx->afi->family) { + switch (ctx->family) { case NFPROTO_IPV4: len = FIELD_SIZEOF(struct nf_conntrack_tuple, src.u3.ip); @@ -456,7 +456,7 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, if (err < 0) return err; - err = nf_ct_netns_get(ctx->net, ctx->afi->family); + err = nf_ct_netns_get(ctx->net, ctx->family); if (err < 0) return err; @@ -550,7 +550,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, if (err < 0) goto err1; - err = nf_ct_netns_get(ctx->net, ctx->afi->family); + err = nf_ct_netns_get(ctx->net, ctx->family); if (err < 0) goto err1; @@ -564,7 +564,7 @@ err1: static void nft_ct_get_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { - nf_ct_netns_put(ctx->net, ctx->afi->family); + nf_ct_netns_put(ctx->net, ctx->family); } static void nft_ct_set_destroy(const struct nft_ctx *ctx, @@ -573,7 +573,7 @@ static void nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv = nft_expr_priv(expr); __nft_ct_set_destroy(ctx, priv); - nf_ct_netns_put(ctx->net, ctx->afi->family); + nf_ct_netns_put(ctx->net, ctx->family); } static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) @@ -734,7 +734,7 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx, struct nft_ct_helper_obj *priv = nft_obj_data(obj); struct nf_conntrack_helper *help4, *help6; char name[NF_CT_HELPER_NAME_LEN]; - int family = ctx->afi->family; + int family = ctx->family; if (!tb[NFTA_CT_HELPER_NAME] || !tb[NFTA_CT_HELPER_L4PROTO]) return -EINVAL; @@ -753,14 +753,14 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx, switch (family) { case NFPROTO_IPV4: - if (ctx->afi->family == NFPROTO_IPV6) + if (ctx->family == NFPROTO_IPV6) return -EINVAL; help4 = nf_conntrack_helper_try_module_get(name, family, priv->l4proto); break; case NFPROTO_IPV6: - if (ctx->afi->family == NFPROTO_IPV4) + if (ctx->family == NFPROTO_IPV4) return -EINVAL; help6 = nf_conntrack_helper_try_module_get(name, family, diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 66221ad891a9..fc83e29d6634 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -164,7 +164,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx, } priv->sreg_key = nft_parse_register(tb[NFTA_DYNSET_SREG_KEY]); - err = nft_validate_register_load(priv->sreg_key, set->klen);; + err = nft_validate_register_load(priv->sreg_key, set->klen); if (err < 0) return err; @@ -184,7 +184,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx, if (tb[NFTA_DYNSET_EXPR] != NULL) { if (!(set->flags & NFT_SET_EVAL)) return -EINVAL; - if (!(set->flags & NFT_SET_ANONYMOUS)) + if (!nft_set_is_anonymous(set)) return -EOPNOTSUPP; priv->expr = nft_expr_init(ctx, tb[NFTA_DYNSET_EXPR]); diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index a0a93d987a3b..47ec1046ad11 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -214,6 +214,8 @@ static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { [NFTA_EXTHDR_OFFSET] = { .type = NLA_U32 }, [NFTA_EXTHDR_LEN] = { .type = NLA_U32 }, [NFTA_EXTHDR_FLAGS] = { .type = NLA_U32 }, + [NFTA_EXTHDR_OP] = { .type = NLA_U32 }, + [NFTA_EXTHDR_SREG] = { .type = NLA_U32 }, }; static int nft_exthdr_init(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c new file mode 100644 index 000000000000..4503b8dcf9c0 --- /dev/null +++ b/net/netfilter/nft_flow_offload.c @@ -0,0 +1,264 @@ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/workqueue.h> +#include <linux/spinlock.h> +#include <linux/netfilter/nf_tables.h> +#include <net/ip.h> /* for ipv4 options. */ +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <linux/netfilter/nf_conntrack_common.h> +#include <net/netfilter/nf_flow_table.h> + +struct nft_flow_offload { + struct nft_flowtable *flowtable; +}; + +static int nft_flow_route(const struct nft_pktinfo *pkt, + const struct nf_conn *ct, + struct nf_flow_route *route, + enum ip_conntrack_dir dir) +{ + struct dst_entry *this_dst = skb_dst(pkt->skb); + struct dst_entry *other_dst = NULL; + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + fl.u.ip4.daddr = ct->tuplehash[!dir].tuple.dst.u3.ip; + break; + case NFPROTO_IPV6: + fl.u.ip6.daddr = ct->tuplehash[!dir].tuple.dst.u3.in6; + break; + } + + nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt)); + if (!other_dst) + return -ENOENT; + + route->tuple[dir].dst = this_dst; + route->tuple[dir].ifindex = nft_in(pkt)->ifindex; + route->tuple[!dir].dst = other_dst; + route->tuple[!dir].ifindex = nft_out(pkt)->ifindex; + + return 0; +} + +static bool nft_flow_offload_skip(struct sk_buff *skb) +{ + struct ip_options *opt = &(IPCB(skb)->opt); + + if (unlikely(opt->optlen)) + return true; + if (skb_sec_path(skb)) + return true; + + return false; +} + +static void nft_flow_offload_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_flow_offload *priv = nft_expr_priv(expr); + struct nf_flowtable *flowtable = &priv->flowtable->data; + enum ip_conntrack_info ctinfo; + struct nf_flow_route route; + struct flow_offload *flow; + enum ip_conntrack_dir dir; + struct nf_conn *ct; + int ret; + + if (nft_flow_offload_skip(pkt->skb)) + goto out; + + ct = nf_ct_get(pkt->skb, &ctinfo); + if (!ct) + goto out; + + switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) { + case IPPROTO_TCP: + case IPPROTO_UDP: + break; + default: + goto out; + } + + if (test_bit(IPS_HELPER_BIT, &ct->status)) + goto out; + + if (ctinfo == IP_CT_NEW || + ctinfo == IP_CT_RELATED) + goto out; + + if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status)) + goto out; + + dir = CTINFO2DIR(ctinfo); + if (nft_flow_route(pkt, ct, &route, dir) < 0) + goto err_flow_route; + + flow = flow_offload_alloc(ct, &route); + if (!flow) + goto err_flow_alloc; + + ret = flow_offload_add(flowtable, flow); + if (ret < 0) + goto err_flow_add; + + return; + +err_flow_add: + flow_offload_free(flow); +err_flow_alloc: + dst_release(route.tuple[!dir].dst); +err_flow_route: + clear_bit(IPS_OFFLOAD_BIT, &ct->status); +out: + regs->verdict.code = NFT_BREAK; +} + +static int nft_flow_offload_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + unsigned int hook_mask = (1 << NF_INET_FORWARD); + + return nft_chain_validate_hooks(ctx->chain, hook_mask); +} + +static int nft_flow_offload_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_flow_offload *priv = nft_expr_priv(expr); + u8 genmask = nft_genmask_next(ctx->net); + struct nft_flowtable *flowtable; + + if (!tb[NFTA_FLOW_TABLE_NAME]) + return -EINVAL; + + flowtable = nf_tables_flowtable_lookup(ctx->table, + tb[NFTA_FLOW_TABLE_NAME], + genmask); + if (IS_ERR(flowtable)) + return PTR_ERR(flowtable); + + priv->flowtable = flowtable; + flowtable->use++; + + return nf_ct_netns_get(ctx->net, ctx->family); +} + +static void nft_flow_offload_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_flow_offload *priv = nft_expr_priv(expr); + + priv->flowtable->use--; + nf_ct_netns_put(ctx->net, ctx->family); +} + +static int nft_flow_offload_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + struct nft_flow_offload *priv = nft_expr_priv(expr); + + if (nla_put_string(skb, NFTA_FLOW_TABLE_NAME, priv->flowtable->name)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_flow_offload_type; +static const struct nft_expr_ops nft_flow_offload_ops = { + .type = &nft_flow_offload_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_flow_offload)), + .eval = nft_flow_offload_eval, + .init = nft_flow_offload_init, + .destroy = nft_flow_offload_destroy, + .validate = nft_flow_offload_validate, + .dump = nft_flow_offload_dump, +}; + +static struct nft_expr_type nft_flow_offload_type __read_mostly = { + .name = "flow_offload", + .ops = &nft_flow_offload_ops, + .maxattr = NFTA_FLOW_MAX, + .owner = THIS_MODULE, +}; + +static void flow_offload_iterate_cleanup(struct flow_offload *flow, void *data) +{ + struct net_device *dev = data; + + if (dev && flow->tuplehash[0].tuple.iifidx != dev->ifindex) + return; + + flow_offload_dead(flow); +} + +static void nft_flow_offload_iterate_cleanup(struct nf_flowtable *flowtable, + void *data) +{ + nf_flow_table_iterate(flowtable, flow_offload_iterate_cleanup, data); +} + +static int flow_offload_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + if (event != NETDEV_DOWN) + return NOTIFY_DONE; + + nft_flow_table_iterate(dev_net(dev), nft_flow_offload_iterate_cleanup, dev); + + return NOTIFY_DONE; +} + +static struct notifier_block flow_offload_netdev_notifier = { + .notifier_call = flow_offload_netdev_event, +}; + +static int __init nft_flow_offload_module_init(void) +{ + int err; + + register_netdevice_notifier(&flow_offload_netdev_notifier); + + err = nft_register_expr(&nft_flow_offload_type); + if (err < 0) + goto register_expr; + + return 0; + +register_expr: + unregister_netdevice_notifier(&flow_offload_netdev_notifier); + return err; +} + +static void __exit nft_flow_offload_module_exit(void) +{ + struct net *net; + + nft_unregister_expr(&nft_flow_offload_type); + unregister_netdevice_notifier(&flow_offload_netdev_notifier); + rtnl_lock(); + for_each_net(net) + nft_flow_table_iterate(net, nft_flow_offload_iterate_cleanup, NULL); + rtnl_unlock(); +} + +module_init(nft_flow_offload_module_init); +module_exit(nft_flow_offload_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_ALIAS_NFT_EXPR("flow_offload"); diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index 6f6e64423643..a27be36dc0af 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -112,7 +112,7 @@ static int nft_log_init(const struct nft_ctx *ctx, break; } - err = nf_logger_find_get(ctx->afi->family, li->type); + err = nf_logger_find_get(ctx->family, li->type); if (err < 0) goto err1; @@ -133,7 +133,7 @@ static void nft_log_destroy(const struct nft_ctx *ctx, if (priv->prefix != nft_log_null_prefix) kfree(priv->prefix); - nf_logger_put(ctx->afi->family, li->type); + nf_logger_put(ctx->family, li->type); } static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index 6ac03d4266c9..9d8655bc1bea 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -73,7 +73,7 @@ int nft_masq_init(const struct nft_ctx *ctx, } } - return nf_ct_netns_get(ctx->net, ctx->afi->family); + return nf_ct_netns_get(ctx->net, ctx->family); } EXPORT_SYMBOL_GPL(nft_masq_init); diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 5a60eb23a7ed..8fb91940e2e7 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -210,6 +210,11 @@ void nft_meta_get_eval(const struct nft_expr *expr, *dest = prandom_u32_state(state); break; } +#ifdef CONFIG_XFRM + case NFT_META_SECPATH: + nft_reg_store8(dest, !!skb->sp); + break; +#endif default: WARN_ON(1); goto err; @@ -308,6 +313,11 @@ int nft_meta_get_init(const struct nft_ctx *ctx, prandom_init_once(&nft_prandom_state); len = sizeof(u32); break; +#ifdef CONFIG_XFRM + case NFT_META_SECPATH: + len = sizeof(u8); + break; +#endif default: return -EOPNOTSUPP; } @@ -318,6 +328,38 @@ int nft_meta_get_init(const struct nft_ctx *ctx, } EXPORT_SYMBOL_GPL(nft_meta_get_init); +static int nft_meta_get_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ +#ifdef CONFIG_XFRM + const struct nft_meta *priv = nft_expr_priv(expr); + unsigned int hooks; + + if (priv->key != NFT_META_SECPATH) + return 0; + + switch (ctx->family) { + case NFPROTO_NETDEV: + hooks = 1 << NF_NETDEV_INGRESS; + break; + case NFPROTO_IPV4: + case NFPROTO_IPV6: + case NFPROTO_INET: + hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD); + break; + default: + return -EOPNOTSUPP; + } + + return nft_chain_validate_hooks(ctx->chain, hooks); +#else + return 0; +#endif +} + int nft_meta_set_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) @@ -328,7 +370,7 @@ int nft_meta_set_validate(const struct nft_ctx *ctx, if (priv->key != NFT_META_PKTTYPE) return 0; - switch (ctx->afi->family) { + switch (ctx->family) { case NFPROTO_BRIDGE: hooks = 1 << NF_BR_PRE_ROUTING; break; @@ -434,6 +476,7 @@ static const struct nft_expr_ops nft_meta_get_ops = { .eval = nft_meta_get_eval, .init = nft_meta_get_init, .dump = nft_meta_get_dump, + .validate = nft_meta_get_validate, }; static const struct nft_expr_ops nft_meta_set_ops = { diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index ed548d06b6dd..1f36954c2ba9 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -142,7 +142,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return -EINVAL; family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY])); - if (family != ctx->afi->family) + if (family != ctx->family) return -EOPNOTSUPP; switch (family) { diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c index 1e66538bf0ff..c64cbe78dee7 100644 --- a/net/netfilter/nft_redir.c +++ b/net/netfilter/nft_redir.c @@ -75,7 +75,7 @@ int nft_redir_init(const struct nft_ctx *ctx, return -EINVAL; } - return nf_ct_netns_get(ctx->net, ctx->afi->family); + return nf_ct_netns_get(ctx->net, ctx->family); } EXPORT_SYMBOL_GPL(nft_redir_init); diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c index a6b7d05aeacf..11a2071b6dd4 100644 --- a/net/netfilter/nft_rt.c +++ b/net/netfilter/nft_rt.c @@ -27,7 +27,7 @@ static u16 get_tcpmss(const struct nft_pktinfo *pkt, const struct dst_entry *skb { u32 minlen = sizeof(struct ipv6hdr), mtu = dst_mtu(skbdst); const struct sk_buff *skb = pkt->skb; - const struct nf_afinfo *ai; + struct dst_entry *dst = NULL; struct flowi fl; memset(&fl, 0, sizeof(fl)); @@ -43,15 +43,10 @@ static u16 get_tcpmss(const struct nft_pktinfo *pkt, const struct dst_entry *skb break; } - ai = nf_get_afinfo(nft_pf(pkt)); - if (ai) { - struct dst_entry *dst = NULL; - - ai->route(nft_net(pkt), &dst, &fl, false); - if (dst) { - mtu = min(mtu, dst_mtu(dst)); - dst_release(dst); - } + nf_route(nft_net(pkt), &dst, &fl, false, nft_pf(pkt)); + if (dst) { + mtu = min(mtu, dst_mtu(dst)); + dst_release(dst); } if (mtu <= minlen || mtu > 0xffff) diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index f8166c1d5430..3f1624ee056f 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -251,11 +251,7 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, if (err) return; - err = rhashtable_walk_start(&hti); - if (err && err != -EAGAIN) { - iter->err = err; - goto out; - } + rhashtable_walk_start(&hti); while ((he = rhashtable_walk_next(&hti))) { if (IS_ERR(he)) { @@ -306,9 +302,7 @@ static void nft_rhash_gc(struct work_struct *work) if (err) goto schedule; - err = rhashtable_walk_start(&hti); - if (err && err != -EAGAIN) - goto out; + rhashtable_walk_start(&hti); while ((he = rhashtable_walk_next(&hti))) { if (IS_ERR(he)) { diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c new file mode 100644 index 000000000000..0b660c568156 --- /dev/null +++ b/net/netfilter/utils.c @@ -0,0 +1,90 @@ +#include <linux/kernel.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <net/netfilter/nf_queue.h> + +__sum16 nf_checksum(struct sk_buff *skb, unsigned int hook, + unsigned int dataoff, u_int8_t protocol, + unsigned short family) +{ + const struct nf_ipv6_ops *v6ops; + __sum16 csum = 0; + + switch (family) { + case AF_INET: + csum = nf_ip_checksum(skb, hook, dataoff, protocol); + break; + case AF_INET6: + v6ops = rcu_dereference(nf_ipv6_ops); + if (v6ops) + csum = v6ops->checksum(skb, hook, dataoff, protocol); + break; + } + + return csum; +} +EXPORT_SYMBOL_GPL(nf_checksum); + +__sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook, + unsigned int dataoff, unsigned int len, + u_int8_t protocol, unsigned short family) +{ + const struct nf_ipv6_ops *v6ops; + __sum16 csum = 0; + + switch (family) { + case AF_INET: + csum = nf_ip_checksum_partial(skb, hook, dataoff, len, + protocol); + break; + case AF_INET6: + v6ops = rcu_dereference(nf_ipv6_ops); + if (v6ops) + csum = v6ops->checksum_partial(skb, hook, dataoff, len, + protocol); + break; + } + + return csum; +} +EXPORT_SYMBOL_GPL(nf_checksum_partial); + +int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl, + bool strict, unsigned short family) +{ + const struct nf_ipv6_ops *v6ops; + int ret = 0; + + switch (family) { + case AF_INET: + ret = nf_ip_route(net, dst, fl, strict); + break; + case AF_INET6: + v6ops = rcu_dereference(nf_ipv6_ops); + if (v6ops) + ret = v6ops->route(net, dst, fl, strict); + break; + } + + return ret; +} +EXPORT_SYMBOL_GPL(nf_route); + +int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry) +{ + const struct nf_ipv6_ops *v6ops; + int ret = 0; + + switch (entry->state.pf) { + case AF_INET: + ret = nf_ip_reroute(skb, entry); + break; + case AF_INET6: + v6ops = rcu_dereference(nf_ipv6_ops); + if (v6ops) + ret = v6ops->reroute(skb, entry); + break; + } + return ret; +} diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index a77dd514297c..8fa4d37141a7 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -39,7 +39,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module"); -#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) #define XT_PCPU_BLOCK_SIZE 4096 struct compat_delta { @@ -210,6 +209,9 @@ xt_request_find_match(uint8_t nfproto, const char *name, uint8_t revision) { struct xt_match *match; + if (strnlen(name, XT_EXTENSION_MAXNAMELEN) == XT_EXTENSION_MAXNAMELEN) + return ERR_PTR(-EINVAL); + match = xt_find_match(nfproto, name, revision); if (IS_ERR(match)) { request_module("%st_%s", xt_prefix[nfproto], name); @@ -252,6 +254,9 @@ struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision) { struct xt_target *target; + if (strnlen(name, XT_EXTENSION_MAXNAMELEN) == XT_EXTENSION_MAXNAMELEN) + return ERR_PTR(-EINVAL); + target = xt_find_target(af, name, revision); if (IS_ERR(target)) { request_module("%st_%s", xt_prefix[af], name); @@ -1000,7 +1005,7 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size) return NULL; /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ - if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages) + if ((size >> PAGE_SHIFT) + 2 > totalram_pages) return NULL; info = kvmalloc(sz, GFP_KERNEL); @@ -1027,7 +1032,7 @@ void xt_free_table_info(struct xt_table_info *info) } EXPORT_SYMBOL(xt_free_table_info); -/* Find table by name, grabs mutex & ref. Returns NULL on error. */ +/* Find table by name, grabs mutex & ref. Returns ERR_PTR on error. */ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, const char *name) { @@ -1043,17 +1048,17 @@ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, /* Table doesn't exist in this netns, re-try init */ list_for_each_entry(t, &init_net.xt.tables[af], list) { + int err; + if (strcmp(t->name, name)) continue; - if (!try_module_get(t->me)) { - mutex_unlock(&xt[af].mutex); - return NULL; - } - + if (!try_module_get(t->me)) + goto out; mutex_unlock(&xt[af].mutex); - if (t->table_init(net) != 0) { + err = t->table_init(net); + if (err < 0) { module_put(t->me); - return NULL; + return ERR_PTR(err); } found = t; @@ -1073,10 +1078,28 @@ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, module_put(found->me); out: mutex_unlock(&xt[af].mutex); - return NULL; + return ERR_PTR(-ENOENT); } EXPORT_SYMBOL_GPL(xt_find_table_lock); +struct xt_table *xt_request_find_table_lock(struct net *net, u_int8_t af, + const char *name) +{ + struct xt_table *t = xt_find_table_lock(net, af, name); + +#ifdef CONFIG_MODULES + if (IS_ERR(t)) { + int err = request_module("%stable_%s", xt_prefix[af], name); + if (err < 0) + return ERR_PTR(err); + t = xt_find_table_lock(net, af, name); + } +#endif + + return t; +} +EXPORT_SYMBOL_GPL(xt_request_find_table_lock); + void xt_table_unlock(struct xt_table *table) { mutex_unlock(&xt[table->af].mutex); @@ -1344,7 +1367,6 @@ static int xt_table_open(struct inode *inode, struct file *file) } static const struct file_operations xt_table_ops = { - .owner = THIS_MODULE, .open = xt_table_open, .read = seq_read, .llseek = seq_lseek, @@ -1397,7 +1419,7 @@ static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos, trav->curr = trav->curr->next; if (trav->curr != trav->head) break; - /* fallthru, _stop will unlock */ + /* fall through */ default: return NULL; } @@ -1480,7 +1502,6 @@ static int xt_match_open(struct inode *inode, struct file *file) } static const struct file_operations xt_match_ops = { - .owner = THIS_MODULE, .open = xt_match_open, .read = seq_read, .llseek = seq_lseek, @@ -1533,7 +1554,6 @@ static int xt_target_open(struct inode *inode, struct file *file) } static const struct file_operations xt_target_ops = { - .owner = THIS_MODULE, .open = xt_target_open, .read = seq_read, .llseek = seq_lseek, @@ -1729,8 +1749,17 @@ static int __net_init xt_net_init(struct net *net) return 0; } +static void __net_exit xt_net_exit(struct net *net) +{ + int i; + + for (i = 0; i < NFPROTO_NUMPROTO; i++) + WARN_ON_ONCE(!list_empty(&net->xt.tables[i])); +} + static struct pernet_operations xt_net_ops = { .init = xt_net_init, + .exit = xt_net_exit, }; static int __init xt_init(void) diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index ee3421ad108d..6c2482b709b1 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -252,6 +252,7 @@ static struct xt_target idletimer_tg __read_mostly = { .family = NFPROTO_UNSPEC, .target = idletimer_tg_target, .targetsize = sizeof(struct idletimer_tg_info), + .usersize = offsetof(struct idletimer_tg_info, timer), .checkentry = idletimer_tg_checkentry, .destroy = idletimer_tg_destroy, .me = THIS_MODULE, diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c index 0971634e5444..1dcad893df78 100644 --- a/net/netfilter/xt_LED.c +++ b/net/netfilter/xt_LED.c @@ -198,6 +198,7 @@ static struct xt_target led_tg_reg __read_mostly = { .family = NFPROTO_UNSPEC, .target = led_tg, .targetsize = sizeof(struct xt_led_info), + .usersize = offsetof(struct xt_led_info, internal_data), .checkentry = led_tg_check, .destroy = led_tg_destroy, .me = THIS_MODULE, diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index 9dae4d665965..99bb8e410f22 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -48,7 +48,6 @@ static u_int32_t tcpmss_reverse_mtu(struct net *net, unsigned int family) { struct flowi fl; - const struct nf_afinfo *ai; struct rtable *rt = NULL; u_int32_t mtu = ~0U; @@ -62,10 +61,8 @@ static u_int32_t tcpmss_reverse_mtu(struct net *net, memset(fl6, 0, sizeof(*fl6)); fl6->daddr = ipv6_hdr(skb)->saddr; } - ai = nf_get_afinfo(family); - if (ai != NULL) - ai->route(net, (struct dst_entry **)&rt, &fl, false); + nf_route(net, (struct dst_entry **)&rt, &fl, false, family); if (rt != NULL) { mtu = dst_mtu(&rt->dst); dst_release(&rt->dst); diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c index 3b2be2ae6987..911a7c0da504 100644 --- a/net/netfilter/xt_addrtype.c +++ b/net/netfilter/xt_addrtype.c @@ -36,7 +36,7 @@ MODULE_ALIAS("ip6t_addrtype"); static u32 match_lookup_rt6(struct net *net, const struct net_device *dev, const struct in6_addr *addr, u16 mask) { - const struct nf_afinfo *afinfo; + const struct nf_ipv6_ops *v6ops; struct flowi6 flow; struct rt6_info *rt; u32 ret = 0; @@ -47,17 +47,14 @@ static u32 match_lookup_rt6(struct net *net, const struct net_device *dev, if (dev) flow.flowi6_oif = dev->ifindex; - afinfo = nf_get_afinfo(NFPROTO_IPV6); - if (afinfo != NULL) { - const struct nf_ipv6_ops *v6ops; - + v6ops = nf_get_ipv6_ops(); + if (v6ops) { if (dev && (mask & XT_ADDRTYPE_LOCAL)) { - v6ops = nf_get_ipv6_ops(); - if (v6ops && v6ops->chk_addr(net, addr, dev, true)) + if (v6ops->chk_addr(net, addr, dev, true)) ret = XT_ADDRTYPE_LOCAL; } - route_err = afinfo->route(net, (struct dst_entry **)&rt, - flowi6_to_flowi(&flow), false); + route_err = v6ops->route(net, (struct dst_entry **)&rt, + flowi6_to_flowi(&flow), false); } else { route_err = 1; } diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index 041da0d9c06f..06b090d8e901 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -27,6 +27,9 @@ static int __bpf_mt_check_bytecode(struct sock_filter *insns, __u16 len, { struct sock_fprog_kern program; + if (len > XT_BPF_MAX_NUM_INSTR) + return -EINVAL; + program.len = len; program.filter = insns; @@ -52,18 +55,11 @@ static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret) static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret) { - mm_segment_t oldfs = get_fs(); - int retval, fd; - - set_fs(KERNEL_DS); - fd = bpf_obj_get_user(path, 0); - set_fs(oldfs); - if (fd < 0) - return fd; - - retval = __bpf_mt_check_fd(fd, ret); - sys_close(fd); - return retval; + if (strnlen(path, XT_BPF_PATH_MAX) == XT_BPF_PATH_MAX) + return -EINVAL; + + *ret = bpf_prog_get_type_path(path, BPF_PROG_TYPE_SOCKET_FILTER); + return PTR_ERR_OR_ZERO(*ret); } static int bpf_mt_check(const struct xt_mtchk_param *par) diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index a6214f235333..b1b17b9353e1 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -12,292 +12,30 @@ * GPL (C) 1999 Rusty Russell (rusty@rustcorp.com.au). */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/in.h> -#include <linux/in6.h> -#include <linux/ip.h> -#include <linux/ipv6.h> -#include <linux/jhash.h> -#include <linux/slab.h> -#include <linux/list.h> -#include <linux/rbtree.h> + #include <linux/module.h> -#include <linux/random.h> #include <linux/skbuff.h> -#include <linux/spinlock.h> -#include <linux/netfilter/nf_conntrack_tcp.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_connlimit.h> + #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_zones.h> - -#define CONNLIMIT_SLOTS 256U - -#ifdef CONFIG_LOCKDEP -#define CONNLIMIT_LOCK_SLOTS 8U -#else -#define CONNLIMIT_LOCK_SLOTS 256U -#endif - -#define CONNLIMIT_GC_MAX_NODES 8 - -/* we will save the tuples of all connections we care about */ -struct xt_connlimit_conn { - struct hlist_node node; - struct nf_conntrack_tuple tuple; -}; - -struct xt_connlimit_rb { - struct rb_node node; - struct hlist_head hhead; /* connections/hosts in same subnet */ - union nf_inet_addr addr; /* search key */ -}; - -static spinlock_t xt_connlimit_locks[CONNLIMIT_LOCK_SLOTS] __cacheline_aligned_in_smp; - -struct xt_connlimit_data { - struct rb_root climit_root[CONNLIMIT_SLOTS]; -}; - -static u_int32_t connlimit_rnd __read_mostly; -static struct kmem_cache *connlimit_rb_cachep __read_mostly; -static struct kmem_cache *connlimit_conn_cachep __read_mostly; - -static inline unsigned int connlimit_iphash(__be32 addr) -{ - return jhash_1word((__force __u32)addr, - connlimit_rnd) % CONNLIMIT_SLOTS; -} - -static inline unsigned int -connlimit_iphash6(const union nf_inet_addr *addr) -{ - return jhash2((u32 *)addr->ip6, ARRAY_SIZE(addr->ip6), - connlimit_rnd) % CONNLIMIT_SLOTS; -} - -static inline bool already_closed(const struct nf_conn *conn) -{ - if (nf_ct_protonum(conn) == IPPROTO_TCP) - return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT || - conn->proto.tcp.state == TCP_CONNTRACK_CLOSE; - else - return 0; -} - -static int -same_source(const union nf_inet_addr *addr, - const union nf_inet_addr *u3, u_int8_t family) -{ - if (family == NFPROTO_IPV4) - return ntohl(addr->ip) - ntohl(u3->ip); - - return memcmp(addr->ip6, u3->ip6, sizeof(addr->ip6)); -} - -static bool add_hlist(struct hlist_head *head, - const struct nf_conntrack_tuple *tuple, - const union nf_inet_addr *addr) -{ - struct xt_connlimit_conn *conn; - - conn = kmem_cache_alloc(connlimit_conn_cachep, GFP_ATOMIC); - if (conn == NULL) - return false; - conn->tuple = *tuple; - hlist_add_head(&conn->node, head); - return true; -} - -static unsigned int check_hlist(struct net *net, - struct hlist_head *head, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone, - bool *addit) -{ - const struct nf_conntrack_tuple_hash *found; - struct xt_connlimit_conn *conn; - struct hlist_node *n; - struct nf_conn *found_ct; - unsigned int length = 0; - - *addit = true; - - /* check the saved connections */ - hlist_for_each_entry_safe(conn, n, head, node) { - found = nf_conntrack_find_get(net, zone, &conn->tuple); - if (found == NULL) { - hlist_del(&conn->node); - kmem_cache_free(connlimit_conn_cachep, conn); - continue; - } - - found_ct = nf_ct_tuplehash_to_ctrack(found); - - if (nf_ct_tuple_equal(&conn->tuple, tuple)) { - /* - * Just to be sure we have it only once in the list. - * We should not see tuples twice unless someone hooks - * this into a table without "-p tcp --syn". - */ - *addit = false; - } else if (already_closed(found_ct)) { - /* - * we do not care about connections which are - * closed already -> ditch it - */ - nf_ct_put(found_ct); - hlist_del(&conn->node); - kmem_cache_free(connlimit_conn_cachep, conn); - continue; - } - - nf_ct_put(found_ct); - length++; - } - - return length; -} - -static void tree_nodes_free(struct rb_root *root, - struct xt_connlimit_rb *gc_nodes[], - unsigned int gc_count) -{ - struct xt_connlimit_rb *rbconn; - - while (gc_count) { - rbconn = gc_nodes[--gc_count]; - rb_erase(&rbconn->node, root); - kmem_cache_free(connlimit_rb_cachep, rbconn); - } -} - -static unsigned int -count_tree(struct net *net, struct rb_root *root, - const struct nf_conntrack_tuple *tuple, - const union nf_inet_addr *addr, - u8 family, const struct nf_conntrack_zone *zone) -{ - struct xt_connlimit_rb *gc_nodes[CONNLIMIT_GC_MAX_NODES]; - struct rb_node **rbnode, *parent; - struct xt_connlimit_rb *rbconn; - struct xt_connlimit_conn *conn; - unsigned int gc_count; - bool no_gc = false; - - restart: - gc_count = 0; - parent = NULL; - rbnode = &(root->rb_node); - while (*rbnode) { - int diff; - bool addit; - - rbconn = rb_entry(*rbnode, struct xt_connlimit_rb, node); - - parent = *rbnode; - diff = same_source(addr, &rbconn->addr, family); - if (diff < 0) { - rbnode = &((*rbnode)->rb_left); - } else if (diff > 0) { - rbnode = &((*rbnode)->rb_right); - } else { - /* same source network -> be counted! */ - unsigned int count; - count = check_hlist(net, &rbconn->hhead, tuple, zone, &addit); - - tree_nodes_free(root, gc_nodes, gc_count); - if (!addit) - return count; - - if (!add_hlist(&rbconn->hhead, tuple, addr)) - return 0; /* hotdrop */ - - return count + 1; - } - - if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes)) - continue; - - /* only used for GC on hhead, retval and 'addit' ignored */ - check_hlist(net, &rbconn->hhead, tuple, zone, &addit); - if (hlist_empty(&rbconn->hhead)) - gc_nodes[gc_count++] = rbconn; - } - - if (gc_count) { - no_gc = true; - tree_nodes_free(root, gc_nodes, gc_count); - /* tree_node_free before new allocation permits - * allocator to re-use newly free'd object. - * - * This is a rare event; in most cases we will find - * existing node to re-use. (or gc_count is 0). - */ - goto restart; - } - - /* no match, need to insert new node */ - rbconn = kmem_cache_alloc(connlimit_rb_cachep, GFP_ATOMIC); - if (rbconn == NULL) - return 0; - - conn = kmem_cache_alloc(connlimit_conn_cachep, GFP_ATOMIC); - if (conn == NULL) { - kmem_cache_free(connlimit_rb_cachep, rbconn); - return 0; - } - - conn->tuple = *tuple; - rbconn->addr = *addr; - - INIT_HLIST_HEAD(&rbconn->hhead); - hlist_add_head(&conn->node, &rbconn->hhead); - - rb_link_node(&rbconn->node, parent, rbnode); - rb_insert_color(&rbconn->node, root); - return 1; -} - -static int count_them(struct net *net, - struct xt_connlimit_data *data, - const struct nf_conntrack_tuple *tuple, - const union nf_inet_addr *addr, - u_int8_t family, - const struct nf_conntrack_zone *zone) -{ - struct rb_root *root; - int count; - u32 hash; - - if (family == NFPROTO_IPV6) - hash = connlimit_iphash6(addr); - else - hash = connlimit_iphash(addr->ip); - root = &data->climit_root[hash]; - - spin_lock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]); - - count = count_tree(net, root, tuple, addr, family, zone); - - spin_unlock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]); - - return count; -} +#include <net/netfilter/nf_conntrack_count.h> static bool connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) { struct net *net = xt_net(par); const struct xt_connlimit_info *info = par->matchinfo; - union nf_inet_addr addr; struct nf_conntrack_tuple tuple; const struct nf_conntrack_tuple *tuple_ptr = &tuple; const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; enum ip_conntrack_info ctinfo; const struct nf_conn *ct; unsigned int connections; + u32 key[5]; ct = nf_ct_get(skb, &ctinfo); if (ct != NULL) { @@ -310,6 +48,7 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) if (xt_family(par) == NFPROTO_IPV6) { const struct ipv6hdr *iph = ipv6_hdr(skb); + union nf_inet_addr addr; unsigned int i; memcpy(&addr.ip6, (info->flags & XT_CONNLIMIT_DADDR) ? @@ -317,22 +56,24 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) for (i = 0; i < ARRAY_SIZE(addr.ip6); ++i) addr.ip6[i] &= info->mask.ip6[i]; + memcpy(key, &addr, sizeof(addr.ip6)); + key[4] = zone->id; } else { const struct iphdr *iph = ip_hdr(skb); - addr.ip = (info->flags & XT_CONNLIMIT_DADDR) ? + key[0] = (info->flags & XT_CONNLIMIT_DADDR) ? iph->daddr : iph->saddr; - addr.ip &= info->mask.ip; + key[0] &= info->mask.ip; + key[1] = zone->id; } - connections = count_them(net, info->data, tuple_ptr, &addr, - xt_family(par), zone); + connections = nf_conncount_count(net, info->data, key, + xt_family(par), tuple_ptr, zone); if (connections == 0) /* kmalloc failed, drop it entirely */ goto hotdrop; - return (connections > info->limit) ^ - !!(info->flags & XT_CONNLIMIT_INVERT); + return (connections > info->limit) ^ !!(info->flags & XT_CONNLIMIT_INVERT); hotdrop: par->hotdrop = true; @@ -342,61 +83,27 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) static int connlimit_mt_check(const struct xt_mtchk_param *par) { struct xt_connlimit_info *info = par->matchinfo; - unsigned int i; - int ret; + unsigned int keylen; - net_get_random_once(&connlimit_rnd, sizeof(connlimit_rnd)); - - ret = nf_ct_netns_get(par->net, par->family); - if (ret < 0) { - pr_info("cannot load conntrack support for " - "address family %u\n", par->family); - return ret; - } + keylen = sizeof(u32); + if (par->family == NFPROTO_IPV6) + keylen += sizeof(struct in6_addr); + else + keylen += sizeof(struct in_addr); /* init private data */ - info->data = kmalloc(sizeof(struct xt_connlimit_data), GFP_KERNEL); - if (info->data == NULL) { - nf_ct_netns_put(par->net, par->family); - return -ENOMEM; - } - - for (i = 0; i < ARRAY_SIZE(info->data->climit_root); ++i) - info->data->climit_root[i] = RB_ROOT; + info->data = nf_conncount_init(par->net, par->family, keylen); + if (IS_ERR(info->data)) + return PTR_ERR(info->data); return 0; } -static void destroy_tree(struct rb_root *r) -{ - struct xt_connlimit_conn *conn; - struct xt_connlimit_rb *rbconn; - struct hlist_node *n; - struct rb_node *node; - - while ((node = rb_first(r)) != NULL) { - rbconn = rb_entry(node, struct xt_connlimit_rb, node); - - rb_erase(node, r); - - hlist_for_each_entry_safe(conn, n, &rbconn->hhead, node) - kmem_cache_free(connlimit_conn_cachep, conn); - - kmem_cache_free(connlimit_rb_cachep, rbconn); - } -} - static void connlimit_mt_destroy(const struct xt_mtdtor_param *par) { const struct xt_connlimit_info *info = par->matchinfo; - unsigned int i; - - nf_ct_netns_put(par->net, par->family); - - for (i = 0; i < ARRAY_SIZE(info->data->climit_root); ++i) - destroy_tree(&info->data->climit_root[i]); - kfree(info->data); + nf_conncount_destroy(par->net, par->family, info->data); } static struct xt_match connlimit_mt_reg __read_mostly = { @@ -413,40 +120,12 @@ static struct xt_match connlimit_mt_reg __read_mostly = { static int __init connlimit_mt_init(void) { - int ret, i; - - BUILD_BUG_ON(CONNLIMIT_LOCK_SLOTS > CONNLIMIT_SLOTS); - BUILD_BUG_ON((CONNLIMIT_SLOTS % CONNLIMIT_LOCK_SLOTS) != 0); - - for (i = 0; i < CONNLIMIT_LOCK_SLOTS; ++i) - spin_lock_init(&xt_connlimit_locks[i]); - - connlimit_conn_cachep = kmem_cache_create("xt_connlimit_conn", - sizeof(struct xt_connlimit_conn), - 0, 0, NULL); - if (!connlimit_conn_cachep) - return -ENOMEM; - - connlimit_rb_cachep = kmem_cache_create("xt_connlimit_rb", - sizeof(struct xt_connlimit_rb), - 0, 0, NULL); - if (!connlimit_rb_cachep) { - kmem_cache_destroy(connlimit_conn_cachep); - return -ENOMEM; - } - ret = xt_register_match(&connlimit_mt_reg); - if (ret != 0) { - kmem_cache_destroy(connlimit_conn_cachep); - kmem_cache_destroy(connlimit_rb_cachep); - } - return ret; + return xt_register_match(&connlimit_mt_reg); } static void __exit connlimit_mt_exit(void) { xt_unregister_match(&connlimit_mt_reg); - kmem_cache_destroy(connlimit_conn_cachep); - kmem_cache_destroy(connlimit_rb_cachep); } module_init(connlimit_mt_init); diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 5da8746f7b88..ca6847403ca2 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -353,7 +353,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, static bool select_all(const struct xt_hashlimit_htable *ht, const struct dsthash_ent *he) { - return 1; + return true; } static bool select_gc(const struct xt_hashlimit_htable *ht, @@ -1266,7 +1266,6 @@ static int dl_proc_open(struct inode *inode, struct file *file) } static const struct file_operations dl_file_ops_v2 = { - .owner = THIS_MODULE, .open = dl_proc_open_v2, .read = seq_read, .llseek = seq_lseek, @@ -1274,7 +1273,6 @@ static const struct file_operations dl_file_ops_v2 = { }; static const struct file_operations dl_file_ops_v1 = { - .owner = THIS_MODULE, .open = dl_proc_open_v1, .read = seq_read, .llseek = seq_lseek, @@ -1282,7 +1280,6 @@ static const struct file_operations dl_file_ops_v1 = { }; static const struct file_operations dl_file_ops = { - .owner = THIS_MODULE, .open = dl_proc_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/netfilter/xt_ipcomp.c b/net/netfilter/xt_ipcomp.c index 000e70377f85..7ca64a50db04 100644 --- a/net/netfilter/xt_ipcomp.c +++ b/net/netfilter/xt_ipcomp.c @@ -58,7 +58,7 @@ static bool comp_mt(const struct sk_buff *skb, struct xt_action_param *par) */ pr_debug("Dropping evil IPComp tinygram.\n"); par->hotdrop = true; - return 0; + return false; } return spi_match(compinfo->spis[0], compinfo->spis[1], diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index d27b5f1ea619..61403b77361c 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -193,9 +193,8 @@ static struct xt_match limit_mt_reg __read_mostly = { .compatsize = sizeof(struct compat_xt_rateinfo), .compat_from_user = limit_mt_compat_from_user, .compat_to_user = limit_mt_compat_to_user, -#else - .usersize = offsetof(struct xt_rateinfo, prev), #endif + .usersize = offsetof(struct xt_rateinfo, prev), .me = THIS_MODULE, }; diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c index cc0518fe598e..6f92d25590a8 100644 --- a/net/netfilter/xt_nfacct.c +++ b/net/netfilter/xt_nfacct.c @@ -62,6 +62,7 @@ static struct xt_match nfacct_mt_reg __read_mostly = { .match = nfacct_mt, .destroy = nfacct_mt_destroy, .matchsize = sizeof(struct xt_nfacct_match_info), + .usersize = offsetof(struct xt_nfacct_match_info, nfacct), .me = THIS_MODULE, }; diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index 36e14b1f061d..a34f314a8c23 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -19,6 +19,7 @@ #include <linux/module.h> #include <linux/kernel.h> +#include <linux/capability.h> #include <linux/if.h> #include <linux/inetdevice.h> #include <linux/ip.h> @@ -70,6 +71,9 @@ static int xt_osf_add_callback(struct net *net, struct sock *ctnl, struct xt_osf_finger *kf = NULL, *sf; int err = 0; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; @@ -115,6 +119,9 @@ static int xt_osf_remove_callback(struct net *net, struct sock *ctnl, struct xt_osf_finger *sf; int err = -ENOENT; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c index 2b4ab189bba7..5639fb03bdd9 100644 --- a/net/netfilter/xt_policy.c +++ b/net/netfilter/xt_policy.c @@ -93,7 +93,8 @@ match_policy_out(const struct sk_buff *skb, const struct xt_policy_info *info, if (dst->xfrm == NULL) return -1; - for (i = 0; dst && dst->xfrm; dst = dst->child, i++) { + for (i = 0; dst && dst->xfrm; + dst = ((struct xfrm_dst *)dst)->child, i++) { pos = strict ? i : 0; if (pos >= info->len) return 0; diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index 64285702afd5..16b6b11ee83f 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -39,13 +39,17 @@ match_set(ip_set_id_t index, const struct sk_buff *skb, return inv; } -#define ADT_OPT(n, f, d, fs, cfs, t) \ -struct ip_set_adt_opt n = { \ - .family = f, \ - .dim = d, \ - .flags = fs, \ - .cmdflags = cfs, \ - .ext.timeout = t, \ +#define ADT_OPT(n, f, d, fs, cfs, t, p, b, po, bo) \ +struct ip_set_adt_opt n = { \ + .family = f, \ + .dim = d, \ + .flags = fs, \ + .cmdflags = cfs, \ + .ext.timeout = t, \ + .ext.packets = p, \ + .ext.bytes = b, \ + .ext.packets_op = po, \ + .ext.bytes_op = bo, \ } /* Revision 0 interface: backward compatible with netfilter/iptables */ @@ -56,7 +60,8 @@ set_match_v0(const struct sk_buff *skb, struct xt_action_param *par) const struct xt_set_info_match_v0 *info = par->matchinfo; ADT_OPT(opt, xt_family(par), info->match_set.u.compat.dim, - info->match_set.u.compat.flags, 0, UINT_MAX); + info->match_set.u.compat.flags, 0, UINT_MAX, + 0, 0, 0, 0); return match_set(info->match_set.index, skb, par, &opt, info->match_set.u.compat.flags & IPSET_INV_MATCH); @@ -119,7 +124,8 @@ set_match_v1(const struct sk_buff *skb, struct xt_action_param *par) const struct xt_set_info_match_v1 *info = par->matchinfo; ADT_OPT(opt, xt_family(par), info->match_set.dim, - info->match_set.flags, 0, UINT_MAX); + info->match_set.flags, 0, UINT_MAX, + 0, 0, 0, 0); if (opt.flags & IPSET_RETURN_NOMATCH) opt.cmdflags |= IPSET_FLAG_RETURN_NOMATCH; @@ -161,45 +167,21 @@ set_match_v1_destroy(const struct xt_mtdtor_param *par) /* Revision 3 match */ static bool -match_counter0(u64 counter, const struct ip_set_counter_match0 *info) -{ - switch (info->op) { - case IPSET_COUNTER_NONE: - return true; - case IPSET_COUNTER_EQ: - return counter == info->value; - case IPSET_COUNTER_NE: - return counter != info->value; - case IPSET_COUNTER_LT: - return counter < info->value; - case IPSET_COUNTER_GT: - return counter > info->value; - } - return false; -} - -static bool set_match_v3(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_set_info_match_v3 *info = par->matchinfo; - int ret; ADT_OPT(opt, xt_family(par), info->match_set.dim, - info->match_set.flags, info->flags, UINT_MAX); + info->match_set.flags, info->flags, UINT_MAX, + info->packets.value, info->bytes.value, + info->packets.op, info->bytes.op); if (info->packets.op != IPSET_COUNTER_NONE || info->bytes.op != IPSET_COUNTER_NONE) opt.cmdflags |= IPSET_FLAG_MATCH_COUNTERS; - ret = match_set(info->match_set.index, skb, par, &opt, - info->match_set.flags & IPSET_INV_MATCH); - - if (!(ret && opt.cmdflags & IPSET_FLAG_MATCH_COUNTERS)) - return ret; - - if (!match_counter0(opt.ext.packets, &info->packets)) - return false; - return match_counter0(opt.ext.bytes, &info->bytes); + return match_set(info->match_set.index, skb, par, &opt, + info->match_set.flags & IPSET_INV_MATCH); } #define set_match_v3_checkentry set_match_v1_checkentry @@ -208,45 +190,21 @@ set_match_v3(const struct sk_buff *skb, struct xt_action_param *par) /* Revision 4 match */ static bool -match_counter(u64 counter, const struct ip_set_counter_match *info) -{ - switch (info->op) { - case IPSET_COUNTER_NONE: - return true; - case IPSET_COUNTER_EQ: - return counter == info->value; - case IPSET_COUNTER_NE: - return counter != info->value; - case IPSET_COUNTER_LT: - return counter < info->value; - case IPSET_COUNTER_GT: - return counter > info->value; - } - return false; -} - -static bool set_match_v4(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_set_info_match_v4 *info = par->matchinfo; - int ret; ADT_OPT(opt, xt_family(par), info->match_set.dim, - info->match_set.flags, info->flags, UINT_MAX); + info->match_set.flags, info->flags, UINT_MAX, + info->packets.value, info->bytes.value, + info->packets.op, info->bytes.op); if (info->packets.op != IPSET_COUNTER_NONE || info->bytes.op != IPSET_COUNTER_NONE) opt.cmdflags |= IPSET_FLAG_MATCH_COUNTERS; - ret = match_set(info->match_set.index, skb, par, &opt, - info->match_set.flags & IPSET_INV_MATCH); - - if (!(ret && opt.cmdflags & IPSET_FLAG_MATCH_COUNTERS)) - return ret; - - if (!match_counter(opt.ext.packets, &info->packets)) - return false; - return match_counter(opt.ext.bytes, &info->bytes); + return match_set(info->match_set.index, skb, par, &opt, + info->match_set.flags & IPSET_INV_MATCH); } #define set_match_v4_checkentry set_match_v1_checkentry @@ -260,9 +218,11 @@ set_target_v0(struct sk_buff *skb, const struct xt_action_param *par) const struct xt_set_info_target_v0 *info = par->targinfo; ADT_OPT(add_opt, xt_family(par), info->add_set.u.compat.dim, - info->add_set.u.compat.flags, 0, UINT_MAX); + info->add_set.u.compat.flags, 0, UINT_MAX, + 0, 0, 0, 0); ADT_OPT(del_opt, xt_family(par), info->del_set.u.compat.dim, - info->del_set.u.compat.flags, 0, UINT_MAX); + info->del_set.u.compat.flags, 0, UINT_MAX, + 0, 0, 0, 0); if (info->add_set.index != IPSET_INVALID_ID) ip_set_add(info->add_set.index, skb, par, &add_opt); @@ -333,9 +293,11 @@ set_target_v1(struct sk_buff *skb, const struct xt_action_param *par) const struct xt_set_info_target_v1 *info = par->targinfo; ADT_OPT(add_opt, xt_family(par), info->add_set.dim, - info->add_set.flags, 0, UINT_MAX); + info->add_set.flags, 0, UINT_MAX, + 0, 0, 0, 0); ADT_OPT(del_opt, xt_family(par), info->del_set.dim, - info->del_set.flags, 0, UINT_MAX); + info->del_set.flags, 0, UINT_MAX, + 0, 0, 0, 0); if (info->add_set.index != IPSET_INVALID_ID) ip_set_add(info->add_set.index, skb, par, &add_opt); @@ -402,9 +364,11 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par) const struct xt_set_info_target_v2 *info = par->targinfo; ADT_OPT(add_opt, xt_family(par), info->add_set.dim, - info->add_set.flags, info->flags, info->timeout); + info->add_set.flags, info->flags, info->timeout, + 0, 0, 0, 0); ADT_OPT(del_opt, xt_family(par), info->del_set.dim, - info->del_set.flags, 0, UINT_MAX); + info->del_set.flags, 0, UINT_MAX, + 0, 0, 0, 0); /* Normalize to fit into jiffies */ if (add_opt.ext.timeout != IPSET_NO_TIMEOUT && @@ -432,11 +396,14 @@ set_target_v3(struct sk_buff *skb, const struct xt_action_param *par) int ret; ADT_OPT(add_opt, xt_family(par), info->add_set.dim, - info->add_set.flags, info->flags, info->timeout); + info->add_set.flags, info->flags, info->timeout, + 0, 0, 0, 0); ADT_OPT(del_opt, xt_family(par), info->del_set.dim, - info->del_set.flags, 0, UINT_MAX); + info->del_set.flags, 0, UINT_MAX, + 0, 0, 0, 0); ADT_OPT(map_opt, xt_family(par), info->map_set.dim, - info->map_set.flags, 0, UINT_MAX); + info->map_set.flags, 0, UINT_MAX, + 0, 0, 0, 0); /* Normalize to fit into jiffies */ if (add_opt.ext.timeout != IPSET_NO_TIMEOUT && diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c index 11de55e7a868..8710fdba2ae2 100644 --- a/net/netfilter/xt_statistic.c +++ b/net/netfilter/xt_statistic.c @@ -84,6 +84,7 @@ static struct xt_match xt_statistic_mt_reg __read_mostly = { .checkentry = statistic_mt_check, .destroy = statistic_mt_destroy, .matchsize = sizeof(struct xt_statistic_info), + .usersize = offsetof(struct xt_statistic_info, master), .me = THIS_MODULE, }; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index b9e0ee4e22f5..2ad445c1d27c 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -65,6 +65,7 @@ #include <linux/net_namespace.h> #include <net/net_namespace.h> +#include <net/netns/generic.h> #include <net/sock.h> #include <net/scm.h> #include <net/netlink.h> @@ -145,8 +146,6 @@ static atomic_t nl_table_users = ATOMIC_INIT(0); static BLOCKING_NOTIFIER_HEAD(netlink_chain); -static DEFINE_SPINLOCK(netlink_tap_lock); -static struct list_head netlink_tap_all __read_mostly; static const struct rhashtable_params netlink_rhashtable_params; @@ -173,14 +172,24 @@ static struct sk_buff *netlink_to_full_skb(const struct sk_buff *skb, return new; } +static unsigned int netlink_tap_net_id; + +struct netlink_tap_net { + struct list_head netlink_tap_all; + struct mutex netlink_tap_lock; +}; + int netlink_add_tap(struct netlink_tap *nt) { + struct net *net = dev_net(nt->dev); + struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); + if (unlikely(nt->dev->type != ARPHRD_NETLINK)) return -EINVAL; - spin_lock(&netlink_tap_lock); - list_add_rcu(&nt->list, &netlink_tap_all); - spin_unlock(&netlink_tap_lock); + mutex_lock(&nn->netlink_tap_lock); + list_add_rcu(&nt->list, &nn->netlink_tap_all); + mutex_unlock(&nn->netlink_tap_lock); __module_get(nt->module); @@ -190,12 +199,14 @@ EXPORT_SYMBOL_GPL(netlink_add_tap); static int __netlink_remove_tap(struct netlink_tap *nt) { + struct net *net = dev_net(nt->dev); + struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); bool found = false; struct netlink_tap *tmp; - spin_lock(&netlink_tap_lock); + mutex_lock(&nn->netlink_tap_lock); - list_for_each_entry(tmp, &netlink_tap_all, list) { + list_for_each_entry(tmp, &nn->netlink_tap_all, list) { if (nt == tmp) { list_del_rcu(&nt->list); found = true; @@ -205,7 +216,7 @@ static int __netlink_remove_tap(struct netlink_tap *nt) pr_warn("__netlink_remove_tap: %p not found\n", nt); out: - spin_unlock(&netlink_tap_lock); + mutex_unlock(&nn->netlink_tap_lock); if (found) module_put(nt->module); @@ -224,6 +235,26 @@ int netlink_remove_tap(struct netlink_tap *nt) } EXPORT_SYMBOL_GPL(netlink_remove_tap); +static __net_init int netlink_tap_init_net(struct net *net) +{ + struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); + + INIT_LIST_HEAD(&nn->netlink_tap_all); + mutex_init(&nn->netlink_tap_lock); + return 0; +} + +static void __net_exit netlink_tap_exit_net(struct net *net) +{ +} + +static struct pernet_operations netlink_tap_net_ops = { + .init = netlink_tap_init_net, + .exit = netlink_tap_exit_net, + .id = &netlink_tap_net_id, + .size = sizeof(struct netlink_tap_net), +}; + static bool netlink_filter_tap(const struct sk_buff *skb) { struct sock *sk = skb->sk; @@ -253,6 +284,9 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb, struct sock *sk = skb->sk; int ret = -ENOMEM; + if (!net_eq(dev_net(dev), sock_net(sk))) + return 0; + dev_hold(dev); if (is_vmalloc_addr(skb->head)) @@ -274,7 +308,7 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb, return ret; } -static void __netlink_deliver_tap(struct sk_buff *skb) +static void __netlink_deliver_tap(struct sk_buff *skb, struct netlink_tap_net *nn) { int ret; struct netlink_tap *tmp; @@ -282,19 +316,21 @@ static void __netlink_deliver_tap(struct sk_buff *skb) if (!netlink_filter_tap(skb)) return; - list_for_each_entry_rcu(tmp, &netlink_tap_all, list) { + list_for_each_entry_rcu(tmp, &nn->netlink_tap_all, list) { ret = __netlink_deliver_tap_skb(skb, tmp->dev); if (unlikely(ret)) break; } } -static void netlink_deliver_tap(struct sk_buff *skb) +static void netlink_deliver_tap(struct net *net, struct sk_buff *skb) { + struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); + rcu_read_lock(); - if (unlikely(!list_empty(&netlink_tap_all))) - __netlink_deliver_tap(skb); + if (unlikely(!list_empty(&nn->netlink_tap_all))) + __netlink_deliver_tap(skb, nn); rcu_read_unlock(); } @@ -303,7 +339,7 @@ static void netlink_deliver_tap_kernel(struct sock *dst, struct sock *src, struct sk_buff *skb) { if (!(netlink_is_kernel(dst) && netlink_is_kernel(src))) - netlink_deliver_tap(skb); + netlink_deliver_tap(sock_net(dst), skb); } static void netlink_overrun(struct sock *sk) @@ -1213,7 +1249,7 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb) { int len = skb->len; - netlink_deliver_tap(skb); + netlink_deliver_tap(sock_net(sk), skb); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); @@ -2381,13 +2417,14 @@ int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, struct nlmsghdr *, struct netlink_ext_ack *)) { - struct netlink_ext_ack extack = {}; + struct netlink_ext_ack extack; struct nlmsghdr *nlh; int err; while (skb->len >= nlmsg_total_size(0)) { int msglen; + memset(&extack, 0, sizeof(extack)); nlh = nlmsg_hdr(skb); err = 0; @@ -2478,8 +2515,9 @@ static int netlink_walk_start(struct nl_seq_iter *iter) return err; } - err = rhashtable_walk_start(&iter->hti); - return err == -EAGAIN ? 0 : err; + rhashtable_walk_start(&iter->hti); + + return 0; } static void netlink_walk_stop(struct nl_seq_iter *iter) @@ -2600,7 +2638,6 @@ static int netlink_seq_open(struct inode *inode, struct file *file) } static const struct file_operations netlink_seq_fops = { - .owner = THIS_MODULE, .open = netlink_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -2730,12 +2767,11 @@ static int __init netlink_proto_init(void) } } - INIT_LIST_HEAD(&netlink_tap_all); - netlink_add_usersock_entry(); sock_register(&netlink_family_ops); register_pernet_subsys(&netlink_net_ops); + register_pernet_subsys(&netlink_tap_net_ops); /* The netlink device handler may be needed early. */ rtnetlink_init(); out: diff --git a/net/netlink/diag.c b/net/netlink/diag.c index 8faa20b4d457..7dda33b9b784 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -115,11 +115,7 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, if (!s_num) rhashtable_walk_enter(&tbl->hash, hti); - ret = rhashtable_walk_start(hti); - if (ret == -EAGAIN) - ret = 0; - if (ret) - goto stop; + rhashtable_walk_start(hti); while ((nlsk = rhashtable_walk_next(hti))) { if (IS_ERR(nlsk)) { @@ -146,8 +142,8 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, } } -stop: rhashtable_walk_stop(hti); + if (ret) goto done; diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 7ed9d4422a73..9ba30c63be3d 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1344,7 +1344,6 @@ static int nr_info_open(struct inode *inode, struct file *file) } static const struct file_operations nr_info_fops = { - .owner = THIS_MODULE, .open = nr_info_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c index 75e6ba970fde..b5a7dcb30991 100644 --- a/net/netrom/nr_route.c +++ b/net/netrom/nr_route.c @@ -901,7 +901,6 @@ static int nr_node_info_open(struct inode *inode, struct file *file) } const struct file_operations nr_nodes_fops = { - .owner = THIS_MODULE, .open = nr_node_info_open, .read = seq_read, .llseek = seq_lseek, @@ -968,7 +967,6 @@ static int nr_neigh_info_open(struct inode *inode, struct file *file) } const struct file_operations nr_neigh_fops = { - .owner = THIS_MODULE, .open = nr_neigh_info_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index fb7afcaa3004..985909f105eb 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -531,7 +531,7 @@ static int llcp_sock_getname(struct socket *sock, struct sockaddr *uaddr, return 0; } -static inline unsigned int llcp_accept_poll(struct sock *parent) +static inline __poll_t llcp_accept_poll(struct sock *parent) { struct nfc_llcp_sock *llcp_sock, *parent_sock; struct sock *sk; @@ -549,11 +549,11 @@ static inline unsigned int llcp_accept_poll(struct sock *parent) return 0; } -static unsigned int llcp_sock_poll(struct file *file, struct socket *sock, +static __poll_t llcp_sock_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; - unsigned int mask = 0; + __poll_t mask = 0; pr_debug("%p\n", sk); diff --git a/net/nfc/nci/uart.c b/net/nfc/nci/uart.c index 8d104c1db628..a66f102c6c01 100644 --- a/net/nfc/nci/uart.c +++ b/net/nfc/nci/uart.c @@ -305,7 +305,7 @@ static ssize_t nci_uart_tty_write(struct tty_struct *tty, struct file *file, return 0; } -static unsigned int nci_uart_tty_poll(struct tty_struct *tty, +static __poll_t nci_uart_tty_poll(struct tty_struct *tty, struct file *filp, poll_table *wait) { return 0; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index b27c5c6d9cab..c5904f629091 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1098,6 +1098,36 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, return 0; } +/* Trim the skb to the length specified by the IP/IPv6 header, + * removing any trailing lower-layer padding. This prepares the skb + * for higher-layer processing that assumes skb->len excludes padding + * (such as nf_ip_checksum). The caller needs to pull the skb to the + * network header, and ensure ip_hdr/ipv6_hdr points to valid data. + */ +static int ovs_skb_network_trim(struct sk_buff *skb) +{ + unsigned int len; + int err; + + switch (skb->protocol) { + case htons(ETH_P_IP): + len = ntohs(ip_hdr(skb)->tot_len); + break; + case htons(ETH_P_IPV6): + len = sizeof(struct ipv6hdr) + + ntohs(ipv6_hdr(skb)->payload_len); + break; + default: + len = skb->len; + } + + err = pskb_trim_rcsum(skb, len); + if (err) + kfree_skb(skb); + + return err; +} + /* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero * value if 'skb' is freed. */ @@ -1112,6 +1142,10 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, nh_ofs = skb_network_offset(skb); skb_pull_rcsum(skb, nh_ofs); + err = ovs_skb_network_trim(skb); + if (err) + return err; + if (key->ip.frag != OVS_FRAG_TYPE_NONE) { err = handle_fragments(net, key, info->zone.id, skb); if (err) @@ -1266,14 +1300,14 @@ static int parse_nat(const struct nlattr *attr, /* Do not allow flags if no type is given. */ if (info->range.flags) { OVS_NLERR(log, - "NAT flags may be given only when NAT range (SRC or DST) is also specified.\n" + "NAT flags may be given only when NAT range (SRC or DST) is also specified." ); return -EINVAL; } info->nat = OVS_CT_NAT; /* NAT existing connections. */ } else if (!info->commit) { OVS_NLERR(log, - "NAT attributes may be specified only when CT COMMIT flag is also specified.\n" + "NAT attributes may be specified only when CT COMMIT flag is also specified." ); return -EINVAL; } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 99cfafc2a139..ef38e5aecd28 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -308,7 +308,7 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, const struct dp_upcall_info *upcall_info, uint32_t cutlen) { - unsigned short gso_type = skb_shinfo(skb)->gso_type; + unsigned int gso_type = skb_shinfo(skb)->gso_type; struct sw_flow_key later_key; struct sk_buff *segs, *nskb; int err; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index dbe2379329c5..56b8e7167790 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -56,12 +56,12 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies) { - struct timespec cur_ts; + struct timespec64 cur_ts; u64 cur_ms, idle_ms; - ktime_get_ts(&cur_ts); + ktime_get_ts64(&cur_ts); idle_ms = jiffies_to_msecs(jiffies - flow_jiffies); - cur_ms = (u64)cur_ts.tv_sec * MSEC_PER_SEC + + cur_ms = (u64)(u32)cur_ts.tv_sec * MSEC_PER_SEC + cur_ts.tv_nsec / NSEC_PER_MSEC; return cur_ms - idle_ms; @@ -579,6 +579,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) return -EINVAL; skb_reset_network_header(skb); + key->eth.type = skb->protocol; } else { eth = eth_hdr(skb); ether_addr_copy(key->eth.src, eth->h_source); @@ -592,15 +593,23 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) if (unlikely(parse_vlan(skb, key))) return -ENOMEM; - skb->protocol = parse_ethertype(skb); - if (unlikely(skb->protocol == htons(0))) + key->eth.type = parse_ethertype(skb); + if (unlikely(key->eth.type == htons(0))) return -ENOMEM; + /* Multiple tagged packets need to retain TPID to satisfy + * skb_vlan_pop(), which will later shift the ethertype into + * skb->protocol. + */ + if (key->eth.cvlan.tci & htons(VLAN_TAG_PRESENT)) + skb->protocol = key->eth.cvlan.tpid; + else + skb->protocol = key->eth.type; + skb_reset_network_header(skb); __skb_push(skb, skb->data - skb_mac_header(skb)); } skb_reset_mac_len(skb); - key->eth.type = skb->protocol; /* Network layer. */ if (key->eth.type == htons(ETH_P_IP)) { diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index dc424798ba6f..7322aa1e382e 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -330,12 +330,12 @@ size_t ovs_tun_key_attr_size(void) + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */ + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_OAM */ + nla_total_size(256) /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */ - /* OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS is mutually exclusive with + /* OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS and + * OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS is mutually exclusive with * OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS and covered by it. */ + nla_total_size(2) /* OVS_TUNNEL_KEY_ATTR_TP_SRC */ - + nla_total_size(2) /* OVS_TUNNEL_KEY_ATTR_TP_DST */ - + nla_total_size(4); /* OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS */ + + nla_total_size(2); /* OVS_TUNNEL_KEY_ATTR_TP_DST */ } static size_t ovs_nsh_key_attr_size(void) @@ -402,7 +402,7 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] .next = ovs_vxlan_ext_key_lens }, [OVS_TUNNEL_KEY_ATTR_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, [OVS_TUNNEL_KEY_ATTR_IPV6_DST] = { .len = sizeof(struct in6_addr) }, - [OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS] = { .len = sizeof(u32) }, + [OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS] = { .len = OVS_ATTR_VARIABLE }, }; static const struct ovs_len_tbl @@ -634,30 +634,30 @@ static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr, return 0; } -static int erspan_tun_opt_from_nlattr(const struct nlattr *attr, +static int erspan_tun_opt_from_nlattr(const struct nlattr *a, struct sw_flow_match *match, bool is_mask, bool log) { unsigned long opt_key_offset; - struct erspan_metadata opts; - BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts)); - - memset(&opts, 0, sizeof(opts)); - opts.index = nla_get_be32(attr); + BUILD_BUG_ON(sizeof(struct erspan_metadata) > + sizeof(match->key->tun_opts)); - /* Index has only 20-bit */ - if (ntohl(opts.index) & ~INDEX_MASK) { - OVS_NLERR(log, "ERSPAN index number %x too large.", - ntohl(opts.index)); + if (nla_len(a) > sizeof(match->key->tun_opts)) { + OVS_NLERR(log, "ERSPAN option length err (len %d, max %zu).", + nla_len(a), sizeof(match->key->tun_opts)); return -EINVAL; } - SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), is_mask); - opt_key_offset = TUN_METADATA_OFFSET(sizeof(opts)); - SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, &opts, sizeof(opts), - is_mask); + if (!is_mask) + SW_FLOW_KEY_PUT(match, tun_opts_len, + sizeof(struct erspan_metadata), false); + else + SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, true); + opt_key_offset = TUN_METADATA_OFFSET(nla_len(a)); + SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, nla_data(a), + nla_len(a), is_mask); return 0; } @@ -774,7 +774,8 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, return -EINVAL; } - err = erspan_tun_opt_from_nlattr(a, match, is_mask, log); + err = erspan_tun_opt_from_nlattr(a, match, is_mask, + log); if (err) return err; @@ -906,8 +907,8 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb, vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len)) return -EMSGSIZE; else if (output->tun_flags & TUNNEL_ERSPAN_OPT && - nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS, - ((struct erspan_metadata *)tun_opts)->index)) + nla_put(skb, OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS, + swkey_tun_opts_len, tun_opts)) return -EMSGSIZE; } @@ -2241,14 +2242,11 @@ int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb) #define MAX_ACTIONS_BUFSIZE (32 * 1024) -static struct sw_flow_actions *nla_alloc_flow_actions(int size, bool log) +static struct sw_flow_actions *nla_alloc_flow_actions(int size) { struct sw_flow_actions *sfa; - if (size > MAX_ACTIONS_BUFSIZE) { - OVS_NLERR(log, "Flow action size %u bytes exceeds max", size); - return ERR_PTR(-EINVAL); - } + WARN_ON_ONCE(size > MAX_ACTIONS_BUFSIZE); sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL); if (!sfa) @@ -2321,12 +2319,15 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, new_acts_size = ksize(*sfa) * 2; if (new_acts_size > MAX_ACTIONS_BUFSIZE) { - if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) + if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) { + OVS_NLERR(log, "Flow action size exceeds max %u", + MAX_ACTIONS_BUFSIZE); return ERR_PTR(-EMSGSIZE); + } new_acts_size = MAX_ACTIONS_BUFSIZE; } - acts = nla_alloc_flow_actions(new_acts_size, log); + acts = nla_alloc_flow_actions(new_acts_size); if (IS_ERR(acts)) return (void *)acts; @@ -2501,7 +2502,7 @@ static int validate_geneve_opts(struct sw_flow_key *key) option = (struct geneve_opt *)((u8 *)option + len); opts_len -= len; - }; + } key->tun_key.tun_flags |= crit_opt ? TUNNEL_CRIT_OPT : 0; @@ -2536,7 +2537,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: break; } - }; + } start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET, log); if (start < 0) @@ -3059,7 +3060,7 @@ int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, { int err; - *sfa = nla_alloc_flow_actions(nla_len(attr), log); + *sfa = nla_alloc_flow_actions(min(nla_len(attr), MAX_ACTIONS_BUFSIZE)); if (IS_ERR(*sfa)) return PTR_ERR(*sfa); diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 3fbfc78991ac..04b94281a30b 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -488,7 +488,7 @@ bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, long long int max_bucket_size; band = &meter->bands[i]; - max_bucket_size = (band->burst_size + band->rate) * 1000; + max_bucket_size = (band->burst_size + band->rate) * 1000LL; band->bucket += delta_ms * band->rate; if (band->bucket > max_bucket_size) diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 04a3128adcf0..bb95c43aae76 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -16,7 +16,6 @@ * 02110-1301, USA */ -#include <linux/hardirq.h> #include <linux/if_vlan.h> #include <linux/kernel.h> #include <linux/netdevice.h> @@ -126,18 +125,12 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) } } -static void internal_set_rx_headroom(struct net_device *dev, int new_hr) -{ - dev->needed_headroom = new_hr < 0 ? 0 : new_hr; -} - static const struct net_device_ops internal_dev_netdev_ops = { .ndo_open = internal_dev_open, .ndo_stop = internal_dev_stop, .ndo_start_xmit = internal_dev_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_get_stats64 = internal_get_stats, - .ndo_set_rx_headroom = internal_set_rx_headroom, }; static struct rtnl_link_ops internal_dev_link_ops __read_mostly = { @@ -154,7 +147,7 @@ static void do_setup(struct net_device *netdev) netdev->priv_flags &= ~IFF_TX_SKB_SHARING; netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH | - IFF_PHONY_HEADROOM | IFF_NO_QUEUE; + IFF_NO_QUEUE; netdev->needs_free_netdev = true; netdev->priv_destructor = internal_dev_destructor; netdev->ethtool_ops = &internal_dev_ethtool_ops; @@ -195,7 +188,6 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) err = -ENOMEM; goto error_free_netdev; } - vport->dev->needed_headroom = vport->dp->max_headroom; dev_net_set(vport->dev, ovs_dp_get_net(vport->dp)); internal_dev = internal_dev_priv(vport->dev); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 737092ca9b4e..1d1483007e46 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -247,12 +247,13 @@ static int packet_direct_xmit(struct sk_buff *skb) struct sk_buff *orig_skb = skb; struct netdev_queue *txq; int ret = NETDEV_TX_BUSY; + bool again = false; if (unlikely(!netif_running(dev) || !netif_carrier_ok(dev))) goto drop; - skb = validate_xmit_skb_list(skb, dev); + skb = validate_xmit_skb_list(skb, dev, &again); if (skb != orig_skb) goto drop; @@ -1687,7 +1688,6 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) atomic_long_set(&rollover->num, 0); atomic_long_set(&rollover->num_huge, 0); atomic_long_set(&rollover->num_failed, 0); - po->rollover = rollover; } if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) { @@ -1745,6 +1745,8 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) { __dev_remove_pack(&po->prot_hook); po->fanout = match; + po->rollover = rollover; + rollover = NULL; refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1); __fanout_link(sk, po); err = 0; @@ -1758,10 +1760,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) } out: - if (err && rollover) { - kfree_rcu(rollover, rcu); - po->rollover = NULL; - } + kfree(rollover); mutex_unlock(&fanout_mutex); return err; } @@ -1785,11 +1784,6 @@ static struct packet_fanout *fanout_release(struct sock *sk) list_del(&f->list); else f = NULL; - - if (po->rollover) { - kfree_rcu(po->rollover, rcu); - po->rollover = NULL; - } } mutex_unlock(&fanout_mutex); @@ -3029,6 +3023,7 @@ static int packet_release(struct socket *sock) synchronize_net(); if (f) { + kfree(po->rollover); fanout_release_data(f); kfree(f); } @@ -3097,6 +3092,10 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, if (need_rehook) { if (po->running) { rcu_read_unlock(); + /* prevents packet_notifier() from calling + * register_prot_hook() + */ + po->num = 0; __unregister_prot_hook(sk, true); rcu_read_lock(); dev_curr = po->prot_hook.dev; @@ -3105,6 +3104,7 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, dev->ifindex); } + BUG_ON(po->running); po->num = proto; po->prot_hook.type = proto; @@ -3843,7 +3843,6 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, void *data = &val; union tpacket_stats_u st; struct tpacket_rollover_stats rstats; - struct packet_rollover *rollover; if (level != SOL_PACKET) return -ENOPROTOOPT; @@ -3922,18 +3921,13 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, 0); break; case PACKET_ROLLOVER_STATS: - rcu_read_lock(); - rollover = rcu_dereference(po->rollover); - if (rollover) { - rstats.tp_all = atomic_long_read(&rollover->num); - rstats.tp_huge = atomic_long_read(&rollover->num_huge); - rstats.tp_failed = atomic_long_read(&rollover->num_failed); - data = &rstats; - lv = sizeof(rstats); - } - rcu_read_unlock(); - if (!rollover) + if (!po->rollover) return -EINVAL; + rstats.tp_all = atomic_long_read(&po->rollover->num); + rstats.tp_huge = atomic_long_read(&po->rollover->num_huge); + rstats.tp_failed = atomic_long_read(&po->rollover->num_failed); + data = &rstats; + lv = sizeof(rstats); break; case PACKET_TX_HAS_OFF: val = po->tp_tx_has_off; @@ -4080,12 +4074,12 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd, return 0; } -static unsigned int packet_poll(struct file *file, struct socket *sock, +static __poll_t packet_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); - unsigned int mask = datagram_poll(file, sock, wait); + __poll_t mask = datagram_poll(file, sock, wait); spin_lock_bh(&sk->sk_receive_queue.lock); if (po->rx_ring.pg_vec) { @@ -4537,7 +4531,6 @@ static int packet_seq_open(struct inode *inode, struct file *file) } static const struct file_operations packet_seq_fops = { - .owner = THIS_MODULE, .open = packet_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/packet/internal.h b/net/packet/internal.h index 562fbc155006..a1d2b2319ae9 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -95,7 +95,6 @@ struct packet_fanout { struct packet_rollover { int sock; - struct rcu_head rcu; atomic_long_t num; atomic_long_t num_huge; atomic_long_t num_failed; diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c index da754fc926e7..871eaf2cb85e 100644 --- a/net/phonet/pn_netlink.c +++ b/net/phonet/pn_netlink.c @@ -299,16 +299,21 @@ out: int __init phonet_netlink_register(void) { - int err = __rtnl_register(PF_PHONET, RTM_NEWADDR, addr_doit, - NULL, 0); + int err = rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_NEWADDR, + addr_doit, NULL, 0); if (err) return err; - /* Further __rtnl_register() cannot fail */ - __rtnl_register(PF_PHONET, RTM_DELADDR, addr_doit, NULL, 0); - __rtnl_register(PF_PHONET, RTM_GETADDR, NULL, getaddr_dumpit, 0); - __rtnl_register(PF_PHONET, RTM_NEWROUTE, route_doit, NULL, 0); - __rtnl_register(PF_PHONET, RTM_DELROUTE, route_doit, NULL, 0); - __rtnl_register(PF_PHONET, RTM_GETROUTE, NULL, route_dumpit, 0); + /* Further rtnl_register_module() cannot fail */ + rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_DELADDR, + addr_doit, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_GETADDR, + NULL, getaddr_dumpit, 0); + rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_NEWROUTE, + route_doit, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_DELROUTE, + route_doit, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_GETROUTE, + NULL, route_dumpit, 0); return 0; } diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 1b050dd17393..08f6751d2030 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -341,12 +341,12 @@ static int pn_socket_getname(struct socket *sock, struct sockaddr *addr, return 0; } -static unsigned int pn_socket_poll(struct file *file, struct socket *sock, +static __poll_t pn_socket_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct pep_sock *pn = pep_sk(sk); - unsigned int mask = 0; + __poll_t mask = 0; poll_wait(file, sk_sleep(sk), wait); @@ -635,7 +635,6 @@ static int pn_sock_open(struct inode *inode, struct file *file) } const struct file_operations pn_sock_seq_fops = { - .owner = THIS_MODULE, .open = pn_sock_open, .read = seq_read, .llseek = seq_lseek, @@ -818,7 +817,6 @@ static int pn_res_open(struct inode *inode, struct file *file) } const struct file_operations pn_res_seq_fops = { - .owner = THIS_MODULE, .open = pn_res_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 77ab05e23001..5fb3929e3d7d 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -1116,9 +1116,13 @@ static int __init qrtr_proto_init(void) return rc; } - rtnl_register(PF_QIPCRTR, RTM_NEWADDR, qrtr_addr_doit, NULL, 0); + rc = rtnl_register_module(THIS_MODULE, PF_QIPCRTR, RTM_NEWADDR, qrtr_addr_doit, NULL, 0); + if (rc) { + sock_unregister(qrtr_family.family); + proto_unregister(&qrtr_proto); + } - return 0; + return rc; } postcore_initcall(qrtr_proto_init); diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index b405f77d664c..88aa8ad0f5b6 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -152,12 +152,12 @@ static int rds_getname(struct socket *sock, struct sockaddr *uaddr, * to send to a congested destination, the system call may still fail (and * return ENOBUFS). */ -static unsigned int rds_poll(struct file *file, struct socket *sock, +static __poll_t rds_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct rds_sock *rs = rds_sk_to_rs(sk); - unsigned int mask = 0; + __poll_t mask = 0; unsigned long flags; poll_wait(file, sk_sleep(sk), wait); diff --git a/net/rds/bind.c b/net/rds/bind.c index 75d43dc8e96b..5aa3a64aa4f0 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -114,6 +114,7 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) rs, &addr, (int)ntohs(*port)); break; } else { + rs->rs_bound_addr = 0; rds_sock_put(rs); ret = -ENOMEM; break; diff --git a/net/rds/cong.c b/net/rds/cong.c index 8398fee7c866..8d19fd25dce3 100644 --- a/net/rds/cong.c +++ b/net/rds/cong.c @@ -219,7 +219,11 @@ void rds_cong_queue_updates(struct rds_cong_map *map) spin_lock_irqsave(&rds_cong_lock, flags); list_for_each_entry(conn, &map->m_conn_list, c_map_item) { - if (!test_and_set_bit(0, &conn->c_map_queued)) { + struct rds_conn_path *cp = &conn->c_path[0]; + + rcu_read_lock(); + if (!test_and_set_bit(0, &conn->c_map_queued) && + !test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) { rds_stats_inc(s_cong_update_queued); /* We cannot inline the call to rds_send_xmit() here * for two reasons (both pertaining to a TCP transport): @@ -235,9 +239,9 @@ void rds_cong_queue_updates(struct rds_cong_map *map) * therefore trigger warnings. * Defer the xmit to rds_send_worker() instead. */ - queue_delayed_work(rds_wq, - &conn->c_path[0].cp_send_w, 0); + queue_delayed_work(rds_wq, &cp->cp_send_w, 0); } + rcu_read_unlock(); } spin_unlock_irqrestore(&rds_cong_lock, flags); diff --git a/net/rds/connection.c b/net/rds/connection.c index 7ee2d5d68b78..b10c0ef36d8d 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -230,8 +230,8 @@ static struct rds_connection *__rds_conn_create(struct net *net, rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n", conn, &laddr, &faddr, - trans->t_name ? trans->t_name : "[unknown]", - is_outgoing ? "(outgoing)" : ""); + strnlen(trans->t_name, sizeof(trans->t_name)) ? trans->t_name : + "[unknown]", is_outgoing ? "(outgoing)" : ""); /* * Since we ran without holding the conn lock, someone could @@ -382,10 +382,13 @@ static void rds_conn_path_destroy(struct rds_conn_path *cp) { struct rds_message *rm, *rtmp; + set_bit(RDS_DESTROY_PENDING, &cp->cp_flags); + if (!cp->cp_transport_data) return; /* make sure lingering queued work won't try to ref the conn */ + synchronize_rcu(); cancel_delayed_work_sync(&cp->cp_send_w); cancel_delayed_work_sync(&cp->cp_recv_w); @@ -403,6 +406,11 @@ static void rds_conn_path_destroy(struct rds_conn_path *cp) if (cp->cp_xmit_rm) rds_message_put(cp->cp_xmit_rm); + WARN_ON(delayed_work_pending(&cp->cp_send_w)); + WARN_ON(delayed_work_pending(&cp->cp_recv_w)); + WARN_ON(delayed_work_pending(&cp->cp_conn_w)); + WARN_ON(work_pending(&cp->cp_down_w)); + cp->cp_conn->c_trans->conn_free(cp->cp_transport_data); } @@ -424,7 +432,6 @@ void rds_conn_destroy(struct rds_connection *conn) "%pI4\n", conn, &conn->c_laddr, &conn->c_faddr); - conn->c_destroy_in_prog = 1; /* Ensure conn will not be scheduled for reconnect */ spin_lock_irq(&rds_conn_lock); hlist_del_init_rcu(&conn->c_hash_node); @@ -445,7 +452,6 @@ void rds_conn_destroy(struct rds_connection *conn) */ rds_cong_remove_conn(conn); - put_net(conn->c_net); kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); @@ -684,10 +690,13 @@ void rds_conn_path_drop(struct rds_conn_path *cp, bool destroy) { atomic_set(&cp->cp_state, RDS_CONN_ERROR); - if (!destroy && cp->cp_conn->c_destroy_in_prog) + rcu_read_lock(); + if (!destroy && test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) { + rcu_read_unlock(); return; - + } queue_work(rds_wq, &cp->cp_down_w); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rds_conn_path_drop); @@ -704,9 +713,15 @@ EXPORT_SYMBOL_GPL(rds_conn_drop); */ void rds_conn_path_connect_if_down(struct rds_conn_path *cp) { + rcu_read_lock(); + if (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) { + rcu_read_unlock(); + return; + } if (rds_conn_path_state(cp) == RDS_CONN_DOWN && !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags)) queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down); diff --git a/net/rds/ib.c b/net/rds/ib.c index 36dd2099048a..b2a5067b4afe 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -301,13 +301,11 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); if (rds_conn_state(conn) == RDS_CONN_UP) { struct rds_ib_device *rds_ibdev; - struct rdma_dev_addr *dev_addr; ic = conn->c_transport_data; - dev_addr = &ic->i_cm_id->route.addr.dev_addr; - rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); - rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); + rdma_read_gids(ic->i_cm_id, (union ib_gid *)&iinfo->src_gid, + (union ib_gid *)&iinfo->dst_gid); rds_ibdev = ic->rds_ibdev; iinfo->max_send_wr = ic->i_send_ring.w_nr; diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 8886f15abe90..634cfcb7bba6 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -183,7 +183,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, long i; int ret; - if (rs->rs_bound_addr == 0) { + if (rs->rs_bound_addr == 0 || !rs->rs_transport) { ret = -ENOTCONN; /* XXX not a great errno */ goto out; } @@ -525,6 +525,9 @@ int rds_rdma_extra_size(struct rds_rdma_args *args) local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; + if (args->nr_local == 0) + return -EINVAL; + /* figure out the number of pages in the vector */ for (i = 0; i < args->nr_local; i++) { if (copy_from_user(&vec, &local_vec[i], @@ -874,6 +877,7 @@ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, err: if (page) put_page(page); + rm->atomic.op_active = 0; kfree(rm->atomic.op_notifier); return ret; diff --git a/net/rds/rds.h b/net/rds/rds.h index c349c71babff..374ae83b60d4 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -88,6 +88,7 @@ enum { #define RDS_RECONNECT_PENDING 1 #define RDS_IN_XMIT 2 #define RDS_RECV_REFILL 3 +#define RDS_DESTROY_PENDING 4 /* Max number of multipaths per RDS connection. Must be a power of 2 */ #define RDS_MPATH_WORKERS 8 @@ -139,8 +140,7 @@ struct rds_connection { __be32 c_faddr; unsigned int c_loopback:1, c_ping_triggered:1, - c_destroy_in_prog:1, - c_pad_to_32:29; + c_pad_to_32:30; int c_npaths; struct rds_connection *c_passive; struct rds_transport *c_trans; @@ -150,7 +150,7 @@ struct rds_connection { /* Protocol version */ unsigned int c_version; - struct net *c_net; + possible_net_t c_net; struct list_head c_map_item; unsigned long c_map_queued; @@ -165,13 +165,13 @@ struct rds_connection { static inline struct net *rds_conn_net(struct rds_connection *conn) { - return conn->c_net; + return read_pnet(&conn->c_net); } static inline void rds_conn_net_set(struct rds_connection *conn, struct net *net) { - conn->c_net = get_net(net); + write_pnet(&conn->c_net, net); } #define RDS_FLAG_CONG_BITMAP 0x01 diff --git a/net/rds/send.c b/net/rds/send.c index b52cdc8ae428..d3e32d1f3c7d 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -162,6 +162,12 @@ restart: goto out; } + if (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) { + release_in_xmit(cp); + ret = -ENETUNREACH; /* dont requeue send work */ + goto out; + } + /* * we record the send generation after doing the xmit acquire. * if someone else manages to jump in and do some work, we'll use @@ -437,7 +443,12 @@ over_batch: !list_empty(&cp->cp_send_queue)) && !raced) { if (batch_count < send_batch_count) goto restart; - queue_delayed_work(rds_wq, &cp->cp_send_w, 1); + rcu_read_lock(); + if (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) + ret = -ENETUNREACH; + else + queue_delayed_work(rds_wq, &cp->cp_send_w, 1); + rcu_read_unlock(); } else if (raced) { rds_stats_inc(s_send_lock_queue_raced); } @@ -1009,6 +1020,9 @@ static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes) continue; if (cmsg->cmsg_type == RDS_CMSG_RDMA_ARGS) { + if (cmsg->cmsg_len < + CMSG_LEN(sizeof(struct rds_rdma_args))) + return -EINVAL; args = CMSG_DATA(cmsg); *rdma_bytes += args->remote_vec.bytes; } @@ -1148,6 +1162,11 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) else cpath = &conn->c_path[0]; + if (test_bit(RDS_DESTROY_PENDING, &cpath->cp_flags)) { + ret = -EAGAIN; + goto out; + } + rds_conn_path_connect_if_down(cpath); ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs); @@ -1187,9 +1206,17 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) rds_stats_inc(s_send_queued); ret = rds_send_xmit(cpath); - if (ret == -ENOMEM || ret == -EAGAIN) - queue_delayed_work(rds_wq, &cpath->cp_send_w, 1); - + if (ret == -ENOMEM || ret == -EAGAIN) { + ret = 0; + rcu_read_lock(); + if (test_bit(RDS_DESTROY_PENDING, &cpath->cp_flags)) + ret = -ENETUNREACH; + else + queue_delayed_work(rds_wq, &cpath->cp_send_w, 1); + rcu_read_unlock(); + } + if (ret) + goto out; rds_message_put(rm); return payload_len; @@ -1267,7 +1294,10 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport, rds_stats_inc(s_send_pong); /* schedule the send work on rds_wq */ - queue_delayed_work(rds_wq, &cp->cp_send_w, 1); + rcu_read_lock(); + if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) + queue_delayed_work(rds_wq, &cp->cp_send_w, 1); + rcu_read_unlock(); rds_message_put(rm); return 0; diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 6b7ee71f40c6..9920d2f84eff 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -90,9 +90,10 @@ void rds_tcp_nonagle(struct socket *sock) sizeof(val)); } -u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc) +u32 rds_tcp_write_seq(struct rds_tcp_connection *tc) { - return tcp_sk(tc->t_sock->sk)->snd_nxt; + /* seq# of the last byte of data in tcp send buffer */ + return tcp_sk(tc->t_sock->sk)->write_seq; } u32 rds_tcp_snd_una(struct rds_tcp_connection *tc) @@ -270,16 +271,33 @@ static int rds_tcp_laddr_check(struct net *net, __be32 addr) return -EADDRNOTAVAIL; } +static void rds_tcp_conn_free(void *arg) +{ + struct rds_tcp_connection *tc = arg; + unsigned long flags; + + rdsdebug("freeing tc %p\n", tc); + + spin_lock_irqsave(&rds_tcp_conn_lock, flags); + if (!tc->t_tcp_node_detached) + list_del(&tc->t_tcp_node); + spin_unlock_irqrestore(&rds_tcp_conn_lock, flags); + + kmem_cache_free(rds_tcp_conn_slab, tc); +} + static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) { struct rds_tcp_connection *tc; - int i; + int i, j; + int ret = 0; for (i = 0; i < RDS_MPATH_WORKERS; i++) { tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp); - if (!tc) - return -ENOMEM; - + if (!tc) { + ret = -ENOMEM; + break; + } mutex_init(&tc->t_conn_path_lock); tc->t_sock = NULL; tc->t_tinc = NULL; @@ -290,26 +308,17 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) tc->t_cpath = &conn->c_path[i]; spin_lock_irq(&rds_tcp_conn_lock); + tc->t_tcp_node_detached = false; list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list); spin_unlock_irq(&rds_tcp_conn_lock); rdsdebug("rds_conn_path [%d] tc %p\n", i, conn->c_path[i].cp_transport_data); } - - return 0; -} - -static void rds_tcp_conn_free(void *arg) -{ - struct rds_tcp_connection *tc = arg; - unsigned long flags; - rdsdebug("freeing tc %p\n", tc); - - spin_lock_irqsave(&rds_tcp_conn_lock, flags); - list_del(&tc->t_tcp_node); - spin_unlock_irqrestore(&rds_tcp_conn_lock, flags); - - kmem_cache_free(rds_tcp_conn_slab, tc); + if (ret) { + for (j = 0; j < i; j++) + rds_tcp_conn_free(conn->c_path[j].cp_transport_data); + } + return ret; } static bool list_has_conn(struct list_head *list, struct rds_connection *conn) @@ -495,27 +504,6 @@ static struct pernet_operations rds_tcp_net_ops = { .size = sizeof(struct rds_tcp_net), }; -/* explicitly send a RST on each socket, thereby releasing any socket refcnts - * that may otherwise hold up netns deletion. - */ -static void rds_tcp_conn_paths_destroy(struct rds_connection *conn) -{ - struct rds_conn_path *cp; - struct rds_tcp_connection *tc; - int i; - struct sock *sk; - - for (i = 0; i < RDS_MPATH_WORKERS; i++) { - cp = &conn->c_path[i]; - tc = cp->cp_transport_data; - if (!tc->t_sock) - continue; - sk = tc->t_sock->sk; - sk->sk_prot->disconnect(sk, 0); - tcp_done(sk); - } -} - static void rds_tcp_kill_sock(struct net *net) { struct rds_tcp_connection *tc, *_tc; @@ -527,18 +515,20 @@ static void rds_tcp_kill_sock(struct net *net) rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w); spin_lock_irq(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { - struct net *c_net = tc->t_cpath->cp_conn->c_net; + struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); if (net != c_net || !tc->t_sock) continue; - if (!list_has_conn(&tmp_list, tc->t_cpath->cp_conn)) + if (!list_has_conn(&tmp_list, tc->t_cpath->cp_conn)) { list_move_tail(&tc->t_tcp_node, &tmp_list); + } else { + list_del(&tc->t_tcp_node); + tc->t_tcp_node_detached = true; + } } spin_unlock_irq(&rds_tcp_conn_lock); - list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) { - rds_tcp_conn_paths_destroy(tc->t_cpath->cp_conn); + list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) rds_conn_destroy(tc->t_cpath->cp_conn); - } } void *rds_tcp_listen_sock_def_readable(struct net *net) @@ -586,7 +576,7 @@ static void rds_tcp_sysctl_reset(struct net *net) spin_lock_irq(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { - struct net *c_net = tc->t_cpath->cp_conn->c_net; + struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); if (net != c_net || !tc->t_sock) continue; diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 1aafbf7c3011..c6fa080e9b6d 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -12,6 +12,7 @@ struct rds_tcp_incoming { struct rds_tcp_connection { struct list_head t_tcp_node; + bool t_tcp_node_detached; struct rds_conn_path *t_cpath; /* t_conn_path_lock synchronizes the connection establishment between * rds_tcp_accept_one and rds_tcp_conn_path_connect @@ -54,7 +55,7 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp); void rds_tcp_reset_callbacks(struct socket *sock, struct rds_conn_path *cp); void rds_tcp_restore_callbacks(struct socket *sock, struct rds_tcp_connection *tc); -u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc); +u32 rds_tcp_write_seq(struct rds_tcp_connection *tc); u32 rds_tcp_snd_una(struct rds_tcp_connection *tc); u64 rds_tcp_map_seq(struct rds_tcp_connection *tc, u32 seq); extern struct rds_transport rds_tcp_transport; diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 46f74dad0e16..534c67aeb20f 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -170,7 +170,7 @@ void rds_tcp_conn_path_shutdown(struct rds_conn_path *cp) cp->cp_conn, tc, sock); if (sock) { - if (cp->cp_conn->c_destroy_in_prog) + if (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) rds_tcp_set_linger(sock); sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN); lock_sock(sock->sk); diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index e006ef8e6d40..dd707b9e73e5 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -321,8 +321,12 @@ void rds_tcp_data_ready(struct sock *sk) ready = tc->t_orig_data_ready; rds_tcp_stats_inc(s_tcp_data_ready_calls); - if (rds_tcp_read_sock(cp, GFP_ATOMIC) == -ENOMEM) - queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); + if (rds_tcp_read_sock(cp, GFP_ATOMIC) == -ENOMEM) { + rcu_read_lock(); + if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) + queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); + rcu_read_unlock(); + } out: read_unlock_bh(&sk->sk_callback_lock); ready(sk); diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index dc860d1bb608..16f65744d984 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -86,7 +86,7 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, * m_ack_seq is set to the sequence number of the last byte of * header and data. see rds_tcp_is_acked(). */ - tc->t_last_sent_nxt = rds_tcp_snd_nxt(tc); + tc->t_last_sent_nxt = rds_tcp_write_seq(tc); rm->m_ack_seq = tc->t_last_sent_nxt + sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len) - 1; @@ -98,7 +98,7 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED; rdsdebug("rm %p tcp nxt %u ack_seq %llu\n", - rm, rds_tcp_snd_nxt(tc), + rm, rds_tcp_write_seq(tc), (unsigned long long)rm->m_ack_seq); } @@ -202,8 +202,11 @@ void rds_tcp_write_space(struct sock *sk) tc->t_last_seen_una = rds_tcp_snd_una(tc); rds_send_path_drop_acked(cp, rds_tcp_snd_una(tc), rds_tcp_is_acked); - if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) + rcu_read_lock(); + if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf && + !test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) queue_delayed_work(rds_wq, &cp->cp_send_w, 0); + rcu_read_unlock(); out: read_unlock_bh(&sk->sk_callback_lock); diff --git a/net/rds/threads.c b/net/rds/threads.c index f121daa402c8..eb76db1360b0 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -87,8 +87,12 @@ void rds_connect_path_complete(struct rds_conn_path *cp, int curr) cp->cp_reconnect_jiffies = 0; set_bit(0, &cp->cp_conn->c_map_queued); - queue_delayed_work(rds_wq, &cp->cp_send_w, 0); - queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); + rcu_read_lock(); + if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) { + queue_delayed_work(rds_wq, &cp->cp_send_w, 0); + queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); + } + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rds_connect_path_complete); @@ -133,7 +137,10 @@ void rds_queue_reconnect(struct rds_conn_path *cp) set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); if (cp->cp_reconnect_jiffies == 0) { cp->cp_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies; - queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); + rcu_read_lock(); + if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) + queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); + rcu_read_unlock(); return; } @@ -141,8 +148,11 @@ void rds_queue_reconnect(struct rds_conn_path *cp) rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n", rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies, conn, &conn->c_laddr, &conn->c_faddr); - queue_delayed_work(rds_wq, &cp->cp_conn_w, - rand % cp->cp_reconnect_jiffies); + rcu_read_lock(); + if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) + queue_delayed_work(rds_wq, &cp->cp_conn_w, + rand % cp->cp_reconnect_jiffies); + rcu_read_unlock(); cp->cp_reconnect_jiffies = min(cp->cp_reconnect_jiffies * 2, rds_sysctl_reconnect_max_jiffies); diff --git a/net/rfkill/core.c b/net/rfkill/core.c index 2064c3a35ef8..124c77e9d058 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -1139,10 +1139,10 @@ static int rfkill_fop_open(struct inode *inode, struct file *file) return -ENOMEM; } -static unsigned int rfkill_fop_poll(struct file *file, poll_table *wait) +static __poll_t rfkill_fop_poll(struct file *file, poll_table *wait) { struct rfkill_data *data = file->private_data; - unsigned int res = POLLOUT | POLLWRNORM; + __poll_t res = POLLOUT | POLLWRNORM; poll_wait(file, &data->read_wait, wait); diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 6a5c4992cf61..083bd251406f 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1461,7 +1461,6 @@ static int rose_info_open(struct inode *inode, struct file *file) } static const struct file_operations rose_info_fops = { - .owner = THIS_MODULE, .open = rose_info_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index 8ca3124df83f..178619ddab68 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -1156,7 +1156,6 @@ static int rose_nodes_open(struct inode *inode, struct file *file) } const struct file_operations rose_nodes_fops = { - .owner = THIS_MODULE, .open = rose_nodes_open, .read = seq_read, .llseek = seq_lseek, @@ -1240,7 +1239,6 @@ static int rose_neigh_open(struct inode *inode, struct file *file) } const struct file_operations rose_neigh_fops = { - .owner = THIS_MODULE, .open = rose_neigh_open, .read = seq_read, .llseek = seq_lseek, @@ -1326,7 +1324,6 @@ static int rose_route_open(struct inode *inode, struct file *file) } const struct file_operations rose_routes_fops = { - .owner = THIS_MODULE, .open = rose_route_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 9b5c46b052fd..21ad6a3a465c 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -285,6 +285,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, bool upgrade) { struct rxrpc_conn_parameters cp; + struct rxrpc_call_params p; struct rxrpc_call *call; struct rxrpc_sock *rx = rxrpc_sk(sock->sk); int ret; @@ -302,6 +303,10 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, if (key && !key->payload.data[0]) key = NULL; /* a no-security key */ + memset(&p, 0, sizeof(p)); + p.user_call_ID = user_call_ID; + p.tx_total_len = tx_total_len; + memset(&cp, 0, sizeof(cp)); cp.local = rx->local; cp.key = key; @@ -309,8 +314,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, cp.exclusive = false; cp.upgrade = upgrade; cp.service_id = srx->srx_service; - call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, tx_total_len, - gfp); + call = rxrpc_new_client_call(rx, &cp, srx, &p, gfp); /* The socket has been unlocked. */ if (!IS_ERR(call)) { call->notify_rx = notify_rx; @@ -725,12 +729,12 @@ static int rxrpc_getsockopt(struct socket *sock, int level, int optname, /* * permit an RxRPC socket to be polled */ -static unsigned int rxrpc_poll(struct file *file, struct socket *sock, +static __poll_t rxrpc_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct rxrpc_sock *rx = rxrpc_sk(sk); - unsigned int mask; + __poll_t mask; sock_poll_wait(file, sk_sleep(sk), wait); mask = 0; @@ -856,6 +860,7 @@ static void rxrpc_sock_destructor(struct sock *sk) static int rxrpc_release_sock(struct sock *sk) { struct rxrpc_sock *rx = rxrpc_sk(sk); + struct rxrpc_net *rxnet = rxrpc_net(sock_net(&rx->sk)); _enter("%p{%d,%d}", sk, sk->sk_state, refcount_read(&sk->sk_refcnt)); @@ -863,6 +868,19 @@ static int rxrpc_release_sock(struct sock *sk) sock_orphan(sk); sk->sk_shutdown = SHUTDOWN_MASK; + /* We want to kill off all connections from a service socket + * as fast as possible because we can't share these; client + * sockets, on the other hand, can share an endpoint. + */ + switch (sk->sk_state) { + case RXRPC_SERVER_BOUND: + case RXRPC_SERVER_BOUND2: + case RXRPC_SERVER_LISTENING: + case RXRPC_SERVER_LISTEN_DISABLED: + rx->local->service_closed = true; + break; + } + spin_lock_bh(&sk->sk_receive_queue.lock); sk->sk_state = RXRPC_CLOSE; spin_unlock_bh(&sk->sk_receive_queue.lock); @@ -878,6 +896,8 @@ static int rxrpc_release_sock(struct sock *sk) rxrpc_release_calls_on_socket(rx); flush_workqueue(rxrpc_workqueue); rxrpc_purge_queue(&sk->sk_receive_queue); + rxrpc_queue_work(&rxnet->service_conn_reaper); + rxrpc_queue_work(&rxnet->client_conn_reaper); rxrpc_put_local(rx->local); rx->local = NULL; diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index b2151993d384..416688381eb7 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -79,17 +79,20 @@ struct rxrpc_net { struct list_head conn_proc_list; /* List of conns in this namespace for proc */ struct list_head service_conns; /* Service conns in this namespace */ rwlock_t conn_lock; /* Lock for ->conn_proc_list, ->service_conns */ - struct delayed_work service_conn_reaper; + struct work_struct service_conn_reaper; + struct timer_list service_conn_reap_timer; unsigned int nr_client_conns; unsigned int nr_active_client_conns; bool kill_all_client_conns; + bool live; spinlock_t client_conn_cache_lock; /* Lock for ->*_client_conns */ spinlock_t client_conn_discard_lock; /* Prevent multiple discarders */ struct list_head waiting_client_conns; struct list_head active_client_conns; struct list_head idle_client_conns; - struct delayed_work client_conn_reaper; + struct work_struct client_conn_reaper; + struct timer_list client_conn_reap_timer; struct list_head local_endpoints; struct mutex local_mutex; /* Lock for ->local_endpoints */ @@ -265,6 +268,7 @@ struct rxrpc_local { rwlock_t services_lock; /* lock for services list */ int debug_id; /* debug ID for printks */ bool dead; + bool service_closed; /* Service socket closed */ struct sockaddr_rxrpc srx; /* local address */ }; @@ -338,8 +342,17 @@ enum rxrpc_conn_flag { RXRPC_CONN_DONT_REUSE, /* Don't reuse this connection */ RXRPC_CONN_COUNTED, /* Counted by rxrpc_nr_client_conns */ RXRPC_CONN_PROBING_FOR_UPGRADE, /* Probing for service upgrade */ + RXRPC_CONN_FINAL_ACK_0, /* Need final ACK for channel 0 */ + RXRPC_CONN_FINAL_ACK_1, /* Need final ACK for channel 1 */ + RXRPC_CONN_FINAL_ACK_2, /* Need final ACK for channel 2 */ + RXRPC_CONN_FINAL_ACK_3, /* Need final ACK for channel 3 */ }; +#define RXRPC_CONN_FINAL_ACK_MASK ((1UL << RXRPC_CONN_FINAL_ACK_0) | \ + (1UL << RXRPC_CONN_FINAL_ACK_1) | \ + (1UL << RXRPC_CONN_FINAL_ACK_2) | \ + (1UL << RXRPC_CONN_FINAL_ACK_3)) + /* * Events that can be raised upon a connection. */ @@ -393,6 +406,7 @@ struct rxrpc_connection { #define RXRPC_ACTIVE_CHANS_MASK ((1 << RXRPC_MAXCALLS) - 1) struct list_head waiting_calls; /* Calls waiting for channels */ struct rxrpc_channel { + unsigned long final_ack_at; /* Time at which to issue final ACK */ struct rxrpc_call __rcu *call; /* Active call */ u32 call_id; /* ID of current call */ u32 call_counter; /* Call ID counter */ @@ -404,6 +418,7 @@ struct rxrpc_connection { }; } channels[RXRPC_MAXCALLS]; + struct timer_list timer; /* Conn event timer */ struct work_struct processor; /* connection event processor */ union { struct rb_node client_node; /* Node in local->client_conns */ @@ -457,9 +472,10 @@ enum rxrpc_call_flag { enum rxrpc_call_event { RXRPC_CALL_EV_ACK, /* need to generate ACK */ RXRPC_CALL_EV_ABORT, /* need to generate abort */ - RXRPC_CALL_EV_TIMER, /* Timer expired */ RXRPC_CALL_EV_RESEND, /* Tx resend required */ RXRPC_CALL_EV_PING, /* Ping send required */ + RXRPC_CALL_EV_EXPIRED, /* Expiry occurred */ + RXRPC_CALL_EV_ACK_LOST, /* ACK may be lost, send ping */ }; /* @@ -503,10 +519,16 @@ struct rxrpc_call { struct rxrpc_peer *peer; /* Peer record for remote address */ struct rxrpc_sock __rcu *socket; /* socket responsible */ struct mutex user_mutex; /* User access mutex */ - ktime_t ack_at; /* When deferred ACK needs to happen */ - ktime_t resend_at; /* When next resend needs to happen */ - ktime_t ping_at; /* When next to send a ping */ - ktime_t expire_at; /* When the call times out */ + unsigned long ack_at; /* When deferred ACK needs to happen */ + unsigned long ack_lost_at; /* When ACK is figured as lost */ + unsigned long resend_at; /* When next resend needs to happen */ + unsigned long ping_at; /* When next to send a ping */ + unsigned long keepalive_at; /* When next to send a keepalive ping */ + unsigned long expect_rx_by; /* When we expect to get a packet by */ + unsigned long expect_req_by; /* When we expect to get a request DATA packet by */ + unsigned long expect_term_by; /* When we expect call termination by */ + u32 next_rx_timo; /* Timeout for next Rx packet (jif) */ + u32 next_req_timo; /* Timeout for next Rx request packet (jif) */ struct timer_list timer; /* Combined event timer */ struct work_struct processor; /* Event processor */ rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */ @@ -609,6 +631,8 @@ struct rxrpc_call { ktime_t acks_latest_ts; /* Timestamp of latest ACK received */ rxrpc_serial_t acks_latest; /* serial number of latest ACK received */ rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */ + rxrpc_seq_t acks_lost_top; /* tx_top at the time lost-ack ping sent */ + rxrpc_serial_t acks_lost_ping; /* Serial number of probe ACK */ }; /* @@ -632,6 +656,35 @@ struct rxrpc_ack_summary { u8 cumulative_acks; }; +/* + * sendmsg() cmsg-specified parameters. + */ +enum rxrpc_command { + RXRPC_CMD_SEND_DATA, /* send data message */ + RXRPC_CMD_SEND_ABORT, /* request abort generation */ + RXRPC_CMD_ACCEPT, /* [server] accept incoming call */ + RXRPC_CMD_REJECT_BUSY, /* [server] reject a call as busy */ +}; + +struct rxrpc_call_params { + s64 tx_total_len; /* Total Tx data length (if send data) */ + unsigned long user_call_ID; /* User's call ID */ + struct { + u32 hard; /* Maximum lifetime (sec) */ + u32 idle; /* Max time since last data packet (msec) */ + u32 normal; /* Max time since last call packet (msec) */ + } timeouts; + u8 nr_timeouts; /* Number of timeouts specified */ +}; + +struct rxrpc_send_params { + struct rxrpc_call_params call; + u32 abort_code; /* Abort code to Tx (if abort) */ + enum rxrpc_command command : 8; /* The command to implement */ + bool exclusive; /* Shared or exclusive call */ + bool upgrade; /* If the connection is upgradeable */ +}; + #include <trace/events/rxrpc.h> /* @@ -657,12 +710,19 @@ int rxrpc_reject_call(struct rxrpc_sock *); /* * call_event.c */ -void __rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace, ktime_t); -void rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace, ktime_t); void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool, enum rxrpc_propose_ack_trace); void rxrpc_process_call(struct work_struct *); +static inline void rxrpc_reduce_call_timer(struct rxrpc_call *call, + unsigned long expire_at, + unsigned long now, + enum rxrpc_timer_trace why) +{ + trace_rxrpc_timer(call, why, now); + timer_reduce(&call->timer, expire_at); +} + /* * call_object.c */ @@ -672,11 +732,11 @@ extern unsigned int rxrpc_max_call_lifetime; extern struct kmem_cache *rxrpc_call_jar; struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long); -struct rxrpc_call *rxrpc_alloc_call(gfp_t); +struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *, gfp_t); struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct rxrpc_conn_parameters *, struct sockaddr_rxrpc *, - unsigned long, s64, gfp_t); + struct rxrpc_call_params *, gfp_t); int rxrpc_retry_client_call(struct rxrpc_sock *, struct rxrpc_call *, struct rxrpc_conn_parameters *, @@ -803,8 +863,8 @@ static inline bool __rxrpc_abort_eproto(struct rxrpc_call *call, */ extern unsigned int rxrpc_max_client_connections; extern unsigned int rxrpc_reap_client_connections; -extern unsigned int rxrpc_conn_idle_client_expiry; -extern unsigned int rxrpc_conn_idle_client_fast_expiry; +extern unsigned long rxrpc_conn_idle_client_expiry; +extern unsigned long rxrpc_conn_idle_client_fast_expiry; extern struct idr rxrpc_client_conn_ids; void rxrpc_destroy_client_conn_ids(void); @@ -825,6 +885,7 @@ void rxrpc_process_connection(struct work_struct *); * conn_object.c */ extern unsigned int rxrpc_connection_expiry; +extern unsigned int rxrpc_closed_conn_expiry; struct rxrpc_connection *rxrpc_alloc_connection(gfp_t); struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *, @@ -861,6 +922,12 @@ static inline void rxrpc_put_connection(struct rxrpc_connection *conn) rxrpc_put_service_conn(conn); } +static inline void rxrpc_reduce_conn_timer(struct rxrpc_connection *conn, + unsigned long expire_at) +{ + timer_reduce(&conn->timer, expire_at); +} + /* * conn_service.c */ @@ -930,13 +997,13 @@ static inline void rxrpc_queue_local(struct rxrpc_local *local) * misc.c */ extern unsigned int rxrpc_max_backlog __read_mostly; -extern unsigned int rxrpc_requested_ack_delay; -extern unsigned int rxrpc_soft_ack_delay; -extern unsigned int rxrpc_idle_ack_delay; +extern unsigned long rxrpc_requested_ack_delay; +extern unsigned long rxrpc_soft_ack_delay; +extern unsigned long rxrpc_idle_ack_delay; extern unsigned int rxrpc_rx_window_size; extern unsigned int rxrpc_rx_mtu; extern unsigned int rxrpc_rx_jumbo_max; -extern unsigned int rxrpc_resend_timeout; +extern unsigned long rxrpc_resend_timeout; extern const s8 rxrpc_ack_priority[]; @@ -954,7 +1021,7 @@ static inline struct rxrpc_net *rxrpc_net(struct net *net) /* * output.c */ -int rxrpc_send_ack_packet(struct rxrpc_call *, bool); +int rxrpc_send_ack_packet(struct rxrpc_call *, bool, rxrpc_serial_t *); int rxrpc_send_abort_packet(struct rxrpc_call *); int rxrpc_send_data_packet(struct rxrpc_call *, struct sk_buff *, bool); void rxrpc_reject_packets(struct rxrpc_local *); diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index cbd1701e813a..3028298ca561 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -94,7 +94,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, /* Now it gets complicated, because calls get registered with the * socket here, particularly if a user ID is preassigned by the user. */ - call = rxrpc_alloc_call(gfp); + call = rxrpc_alloc_call(rx, gfp); if (!call) return -ENOMEM; call->flags |= (1 << RXRPC_CALL_IS_SERVICE); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 3574508baf9a..ad2ab1103189 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -22,80 +22,6 @@ #include "ar-internal.h" /* - * Set the timer - */ -void __rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why, - ktime_t now) -{ - unsigned long t_j, now_j = jiffies; - ktime_t t; - bool queue = false; - - if (call->state < RXRPC_CALL_COMPLETE) { - t = call->expire_at; - if (!ktime_after(t, now)) { - trace_rxrpc_timer(call, why, now, now_j); - queue = true; - goto out; - } - - if (!ktime_after(call->resend_at, now)) { - call->resend_at = call->expire_at; - if (!test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) - queue = true; - } else if (ktime_before(call->resend_at, t)) { - t = call->resend_at; - } - - if (!ktime_after(call->ack_at, now)) { - call->ack_at = call->expire_at; - if (!test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events)) - queue = true; - } else if (ktime_before(call->ack_at, t)) { - t = call->ack_at; - } - - if (!ktime_after(call->ping_at, now)) { - call->ping_at = call->expire_at; - if (!test_and_set_bit(RXRPC_CALL_EV_PING, &call->events)) - queue = true; - } else if (ktime_before(call->ping_at, t)) { - t = call->ping_at; - } - - t_j = nsecs_to_jiffies(ktime_to_ns(ktime_sub(t, now))); - t_j += jiffies; - - /* We have to make sure that the calculated jiffies value falls - * at or after the nsec value, or we may loop ceaselessly - * because the timer times out, but we haven't reached the nsec - * timeout yet. - */ - t_j++; - - if (call->timer.expires != t_j || !timer_pending(&call->timer)) { - mod_timer(&call->timer, t_j); - trace_rxrpc_timer(call, why, now, now_j); - } - } - -out: - if (queue) - rxrpc_queue_call(call); -} - -/* - * Set the timer - */ -void rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why, - ktime_t now) -{ - read_lock_bh(&call->state_lock); - __rxrpc_set_timer(call, why, now); - read_unlock_bh(&call->state_lock); -} - -/* * Propose a PING ACK be sent. */ static void rxrpc_propose_ping(struct rxrpc_call *call, @@ -106,12 +32,13 @@ static void rxrpc_propose_ping(struct rxrpc_call *call, !test_and_set_bit(RXRPC_CALL_EV_PING, &call->events)) rxrpc_queue_call(call); } else { - ktime_t now = ktime_get_real(); - ktime_t ping_at = ktime_add_ms(now, rxrpc_idle_ack_delay); + unsigned long now = jiffies; + unsigned long ping_at = now + rxrpc_idle_ack_delay; - if (ktime_before(ping_at, call->ping_at)) { - call->ping_at = ping_at; - rxrpc_set_timer(call, rxrpc_timer_set_for_ping, now); + if (time_before(ping_at, call->ping_at)) { + WRITE_ONCE(call->ping_at, ping_at); + rxrpc_reduce_call_timer(call, ping_at, now, + rxrpc_timer_set_for_ping); } } } @@ -125,8 +52,7 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, enum rxrpc_propose_ack_trace why) { enum rxrpc_propose_ack_outcome outcome = rxrpc_propose_ack_use; - unsigned int expiry = rxrpc_soft_ack_delay; - ktime_t now, ack_at; + unsigned long expiry = rxrpc_soft_ack_delay; s8 prior = rxrpc_ack_priority[ack_reason]; /* Pings are handled specially because we don't want to accidentally @@ -190,11 +116,18 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, background) rxrpc_queue_call(call); } else { - now = ktime_get_real(); - ack_at = ktime_add_ms(now, expiry); - if (ktime_before(ack_at, call->ack_at)) { - call->ack_at = ack_at; - rxrpc_set_timer(call, rxrpc_timer_set_for_ack, now); + unsigned long now = jiffies, ack_at; + + if (call->peer->rtt_usage > 0) + ack_at = nsecs_to_jiffies(call->peer->rtt); + else + ack_at = expiry; + + ack_at += now; + if (time_before(ack_at, call->ack_at)) { + WRITE_ONCE(call->ack_at, ack_at); + rxrpc_reduce_call_timer(call, ack_at, now, + rxrpc_timer_set_for_ack); } } @@ -227,18 +160,28 @@ static void rxrpc_congestion_timeout(struct rxrpc_call *call) /* * Perform retransmission of NAK'd and unack'd packets. */ -static void rxrpc_resend(struct rxrpc_call *call, ktime_t now) +static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) { struct rxrpc_skb_priv *sp; struct sk_buff *skb; + unsigned long resend_at; rxrpc_seq_t cursor, seq, top; - ktime_t max_age, oldest, ack_ts; + ktime_t now, max_age, oldest, ack_ts, timeout, min_timeo; int ix; u8 annotation, anno_type, retrans = 0, unacked = 0; _enter("{%d,%d}", call->tx_hard_ack, call->tx_top); - max_age = ktime_sub_ms(now, rxrpc_resend_timeout); + if (call->peer->rtt_usage > 1) + timeout = ns_to_ktime(call->peer->rtt * 3 / 2); + else + timeout = ms_to_ktime(rxrpc_resend_timeout); + min_timeo = ns_to_ktime((1000000000 / HZ) * 4); + if (ktime_before(timeout, min_timeo)) + timeout = min_timeo; + + now = ktime_get_real(); + max_age = ktime_sub(now, timeout); spin_lock_bh(&call->lock); @@ -282,7 +225,9 @@ static void rxrpc_resend(struct rxrpc_call *call, ktime_t now) ktime_to_ns(ktime_sub(skb->tstamp, max_age))); } - call->resend_at = ktime_add_ms(oldest, rxrpc_resend_timeout); + resend_at = nsecs_to_jiffies(ktime_to_ns(ktime_sub(oldest, now))); + resend_at += jiffies + rxrpc_resend_timeout; + WRITE_ONCE(call->resend_at, resend_at); if (unacked) rxrpc_congestion_timeout(call); @@ -292,14 +237,15 @@ static void rxrpc_resend(struct rxrpc_call *call, ktime_t now) * retransmitting data. */ if (!retrans) { - rxrpc_set_timer(call, rxrpc_timer_set_for_resend, now); + rxrpc_reduce_call_timer(call, resend_at, now, + rxrpc_timer_set_for_resend); spin_unlock_bh(&call->lock); ack_ts = ktime_sub(now, call->acks_latest_ts); if (ktime_to_ns(ack_ts) < call->peer->rtt) goto out; rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, false, rxrpc_propose_ack_ping_for_lost_ack); - rxrpc_send_ack_packet(call, true); + rxrpc_send_ack_packet(call, true, NULL); goto out; } @@ -364,7 +310,8 @@ void rxrpc_process_call(struct work_struct *work) { struct rxrpc_call *call = container_of(work, struct rxrpc_call, processor); - ktime_t now; + rxrpc_serial_t *send_ack; + unsigned long now, next, t; rxrpc_see_call(call); @@ -384,22 +331,89 @@ recheck_state: goto out_put; } - now = ktime_get_real(); - if (ktime_before(call->expire_at, now)) { + /* Work out if any timeouts tripped */ + now = jiffies; + t = READ_ONCE(call->expect_rx_by); + if (time_after_eq(now, t)) { + trace_rxrpc_timer(call, rxrpc_timer_exp_normal, now); + set_bit(RXRPC_CALL_EV_EXPIRED, &call->events); + } + + t = READ_ONCE(call->expect_req_by); + if (call->state == RXRPC_CALL_SERVER_RECV_REQUEST && + time_after_eq(now, t)) { + trace_rxrpc_timer(call, rxrpc_timer_exp_idle, now); + set_bit(RXRPC_CALL_EV_EXPIRED, &call->events); + } + + t = READ_ONCE(call->expect_term_by); + if (time_after_eq(now, t)) { + trace_rxrpc_timer(call, rxrpc_timer_exp_hard, now); + set_bit(RXRPC_CALL_EV_EXPIRED, &call->events); + } + + t = READ_ONCE(call->ack_at); + if (time_after_eq(now, t)) { + trace_rxrpc_timer(call, rxrpc_timer_exp_ack, now); + cmpxchg(&call->ack_at, t, now + MAX_JIFFY_OFFSET); + set_bit(RXRPC_CALL_EV_ACK, &call->events); + } + + t = READ_ONCE(call->ack_lost_at); + if (time_after_eq(now, t)) { + trace_rxrpc_timer(call, rxrpc_timer_exp_lost_ack, now); + cmpxchg(&call->ack_lost_at, t, now + MAX_JIFFY_OFFSET); + set_bit(RXRPC_CALL_EV_ACK_LOST, &call->events); + } + + t = READ_ONCE(call->keepalive_at); + if (time_after_eq(now, t)) { + trace_rxrpc_timer(call, rxrpc_timer_exp_keepalive, now); + cmpxchg(&call->keepalive_at, t, now + MAX_JIFFY_OFFSET); + rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, true, + rxrpc_propose_ack_ping_for_keepalive); + set_bit(RXRPC_CALL_EV_PING, &call->events); + } + + t = READ_ONCE(call->ping_at); + if (time_after_eq(now, t)) { + trace_rxrpc_timer(call, rxrpc_timer_exp_ping, now); + cmpxchg(&call->ping_at, t, now + MAX_JIFFY_OFFSET); + set_bit(RXRPC_CALL_EV_PING, &call->events); + } + + t = READ_ONCE(call->resend_at); + if (time_after_eq(now, t)) { + trace_rxrpc_timer(call, rxrpc_timer_exp_resend, now); + cmpxchg(&call->resend_at, t, now + MAX_JIFFY_OFFSET); + set_bit(RXRPC_CALL_EV_RESEND, &call->events); + } + + /* Process events */ + if (test_and_clear_bit(RXRPC_CALL_EV_EXPIRED, &call->events)) { rxrpc_abort_call("EXP", call, 0, RX_USER_ABORT, -ETIME); set_bit(RXRPC_CALL_EV_ABORT, &call->events); goto recheck_state; } - if (test_and_clear_bit(RXRPC_CALL_EV_ACK, &call->events)) { + send_ack = NULL; + if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events)) { + call->acks_lost_top = call->tx_top; + rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, false, + rxrpc_propose_ack_ping_for_lost_ack); + send_ack = &call->acks_lost_ping; + } + + if (test_and_clear_bit(RXRPC_CALL_EV_ACK, &call->events) || + send_ack) { if (call->ackr_reason) { - rxrpc_send_ack_packet(call, false); + rxrpc_send_ack_packet(call, false, send_ack); goto recheck_state; } } if (test_and_clear_bit(RXRPC_CALL_EV_PING, &call->events)) { - rxrpc_send_ack_packet(call, true); + rxrpc_send_ack_packet(call, true, NULL); goto recheck_state; } @@ -408,7 +422,24 @@ recheck_state: goto recheck_state; } - rxrpc_set_timer(call, rxrpc_timer_set_for_resend, now); + /* Make sure the timer is restarted */ + next = call->expect_rx_by; + +#define set(T) { t = READ_ONCE(T); if (time_before(t, next)) next = t; } + + set(call->expect_req_by); + set(call->expect_term_by); + set(call->ack_at); + set(call->ack_lost_at); + set(call->resend_at); + set(call->keepalive_at); + set(call->ping_at); + + now = jiffies; + if (time_after_eq(now, next)) + goto recheck_state; + + rxrpc_reduce_call_timer(call, next, now, rxrpc_timer_restart); /* other events may have been raised since we started checking */ if (call->events && call->state < RXRPC_CALL_COMPLETE) { diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 994dc2df57e4..0b2db38dd32d 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -51,10 +51,14 @@ static void rxrpc_call_timer_expired(struct timer_list *t) _enter("%d", call->debug_id); - if (call->state < RXRPC_CALL_COMPLETE) - rxrpc_set_timer(call, rxrpc_timer_expired, ktime_get_real()); + if (call->state < RXRPC_CALL_COMPLETE) { + trace_rxrpc_timer(call, rxrpc_timer_expired, jiffies); + rxrpc_queue_call(call); + } } +static struct lock_class_key rxrpc_call_user_mutex_lock_class_key; + /* * find an extant server call * - called in process context with IRQs enabled @@ -95,7 +99,7 @@ found_extant_call: /* * allocate a new call */ -struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) +struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp) { struct rxrpc_call *call; @@ -114,6 +118,14 @@ struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) goto nomem_2; mutex_init(&call->user_mutex); + + /* Prevent lockdep reporting a deadlock false positive between the afs + * filesystem and sys_sendmsg() via the mmap sem. + */ + if (rx->sk.sk_kern_sock) + lockdep_set_class(&call->user_mutex, + &rxrpc_call_user_mutex_lock_class_key); + timer_setup(&call->timer, rxrpc_call_timer_expired, 0); INIT_WORK(&call->processor, &rxrpc_process_call); INIT_LIST_HEAD(&call->link); @@ -128,6 +140,8 @@ struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) atomic_set(&call->usage, 1); call->debug_id = atomic_inc_return(&rxrpc_debug_id); call->tx_total_len = -1; + call->next_rx_timo = 20 * HZ; + call->next_req_timo = 1 * HZ; memset(&call->sock_node, 0xed, sizeof(call->sock_node)); @@ -150,7 +164,8 @@ nomem: /* * Allocate a new client call. */ -static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx, +static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, + struct sockaddr_rxrpc *srx, gfp_t gfp) { struct rxrpc_call *call; @@ -158,7 +173,7 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx, _enter(""); - call = rxrpc_alloc_call(gfp); + call = rxrpc_alloc_call(rx, gfp); if (!call) return ERR_PTR(-ENOMEM); call->state = RXRPC_CALL_CLIENT_AWAIT_CONN; @@ -177,15 +192,17 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx, */ static void rxrpc_start_call_timer(struct rxrpc_call *call) { - ktime_t now = ktime_get_real(), expire_at; - - expire_at = ktime_add_ms(now, rxrpc_max_call_lifetime); - call->expire_at = expire_at; - call->ack_at = expire_at; - call->ping_at = expire_at; - call->resend_at = expire_at; - call->timer.expires = jiffies + LONG_MAX / 2; - rxrpc_set_timer(call, rxrpc_timer_begin, now); + unsigned long now = jiffies; + unsigned long j = now + MAX_JIFFY_OFFSET; + + call->ack_at = j; + call->ack_lost_at = j; + call->resend_at = j; + call->ping_at = j; + call->expect_rx_by = j; + call->expect_req_by = j; + call->expect_term_by = j; + call->timer.expires = now; } /* @@ -196,8 +213,7 @@ static void rxrpc_start_call_timer(struct rxrpc_call *call) struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, struct rxrpc_conn_parameters *cp, struct sockaddr_rxrpc *srx, - unsigned long user_call_ID, - s64 tx_total_len, + struct rxrpc_call_params *p, gfp_t gfp) __releases(&rx->sk.sk_lock.slock) { @@ -207,18 +223,18 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, const void *here = __builtin_return_address(0); int ret; - _enter("%p,%lx", rx, user_call_ID); + _enter("%p,%lx", rx, p->user_call_ID); - call = rxrpc_alloc_client_call(srx, gfp); + call = rxrpc_alloc_client_call(rx, srx, gfp); if (IS_ERR(call)) { release_sock(&rx->sk); _leave(" = %ld", PTR_ERR(call)); return call; } - call->tx_total_len = tx_total_len; + call->tx_total_len = p->tx_total_len; trace_rxrpc_call(call, rxrpc_call_new_client, atomic_read(&call->usage), - here, (const void *)user_call_ID); + here, (const void *)p->user_call_ID); /* We need to protect a partially set up call against the user as we * will be acting outside the socket lock. @@ -234,16 +250,16 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, parent = *pp; xcall = rb_entry(parent, struct rxrpc_call, sock_node); - if (user_call_ID < xcall->user_call_ID) + if (p->user_call_ID < xcall->user_call_ID) pp = &(*pp)->rb_left; - else if (user_call_ID > xcall->user_call_ID) + else if (p->user_call_ID > xcall->user_call_ID) pp = &(*pp)->rb_right; else goto error_dup_user_ID; } rcu_assign_pointer(call->socket, rx); - call->user_call_ID = user_call_ID; + call->user_call_ID = p->user_call_ID; __set_bit(RXRPC_CALL_HAS_USERID, &call->flags); rxrpc_get_call(call, rxrpc_call_got_userid); rb_link_node(&call->sock_node, parent, pp); diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 5f9624bd311c..7f74ca3059f8 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -85,8 +85,8 @@ __read_mostly unsigned int rxrpc_max_client_connections = 1000; __read_mostly unsigned int rxrpc_reap_client_connections = 900; -__read_mostly unsigned int rxrpc_conn_idle_client_expiry = 2 * 60 * HZ; -__read_mostly unsigned int rxrpc_conn_idle_client_fast_expiry = 2 * HZ; +__read_mostly unsigned long rxrpc_conn_idle_client_expiry = 2 * 60 * HZ; +__read_mostly unsigned long rxrpc_conn_idle_client_fast_expiry = 2 * HZ; /* * We use machine-unique IDs for our client connections. @@ -554,6 +554,11 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, trace_rxrpc_client(conn, channel, rxrpc_client_chan_activate); + /* Cancel the final ACK on the previous call if it hasn't been sent yet + * as the DATA packet will implicitly ACK it. + */ + clear_bit(RXRPC_CONN_FINAL_ACK_0 + channel, &conn->flags); + write_lock_bh(&call->state_lock); if (!test_bit(RXRPC_CALL_TX_LASTQ, &call->flags)) call->state = RXRPC_CALL_CLIENT_SEND_REQUEST; @@ -686,7 +691,7 @@ int rxrpc_connect_call(struct rxrpc_call *call, _enter("{%d,%lx},", call->debug_id, call->user_call_ID); - rxrpc_discard_expired_client_conns(&rxnet->client_conn_reaper.work); + rxrpc_discard_expired_client_conns(&rxnet->client_conn_reaper); rxrpc_cull_active_client_conns(rxnet); ret = rxrpc_get_client_conn(call, cp, srx, gfp); @@ -752,6 +757,18 @@ void rxrpc_expose_client_call(struct rxrpc_call *call) } /* + * Set the reap timer. + */ +static void rxrpc_set_client_reap_timer(struct rxrpc_net *rxnet) +{ + unsigned long now = jiffies; + unsigned long reap_at = now + rxrpc_conn_idle_client_expiry; + + if (rxnet->live) + timer_reduce(&rxnet->client_conn_reap_timer, reap_at); +} + +/* * Disconnect a client call. */ void rxrpc_disconnect_client_call(struct rxrpc_call *call) @@ -813,6 +830,19 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) goto out_2; } + /* Schedule the final ACK to be transmitted in a short while so that it + * can be skipped if we find a follow-on call. The first DATA packet + * of the follow on call will implicitly ACK this call. + */ + if (test_bit(RXRPC_CALL_EXPOSED, &call->flags)) { + unsigned long final_ack_at = jiffies + 2; + + WRITE_ONCE(chan->final_ack_at, final_ack_at); + smp_wmb(); /* vs rxrpc_process_delayed_final_acks() */ + set_bit(RXRPC_CONN_FINAL_ACK_0 + channel, &conn->flags); + rxrpc_reduce_conn_timer(conn, final_ack_at); + } + /* Things are more complex and we need the cache lock. We might be * able to simply idle the conn or it might now be lurking on the wait * list. It might even get moved back to the active list whilst we're @@ -878,9 +908,7 @@ idle_connection: list_move_tail(&conn->cache_link, &rxnet->idle_client_conns); if (rxnet->idle_client_conns.next == &conn->cache_link && !rxnet->kill_all_client_conns) - queue_delayed_work(rxrpc_workqueue, - &rxnet->client_conn_reaper, - rxrpc_conn_idle_client_expiry); + rxrpc_set_client_reap_timer(rxnet); } else { trace_rxrpc_client(conn, channel, rxrpc_client_to_inactive); conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE; @@ -1018,8 +1046,7 @@ void rxrpc_discard_expired_client_conns(struct work_struct *work) { struct rxrpc_connection *conn; struct rxrpc_net *rxnet = - container_of(to_delayed_work(work), - struct rxrpc_net, client_conn_reaper); + container_of(work, struct rxrpc_net, client_conn_reaper); unsigned long expiry, conn_expires_at, now; unsigned int nr_conns; bool did_discard = false; @@ -1061,6 +1088,8 @@ next: expiry = rxrpc_conn_idle_client_expiry; if (nr_conns > rxrpc_reap_client_connections) expiry = rxrpc_conn_idle_client_fast_expiry; + if (conn->params.local->service_closed) + expiry = rxrpc_closed_conn_expiry * HZ; conn_expires_at = conn->idle_timestamp + expiry; @@ -1096,9 +1125,8 @@ not_yet_expired: */ _debug("not yet"); if (!rxnet->kill_all_client_conns) - queue_delayed_work(rxrpc_workqueue, - &rxnet->client_conn_reaper, - conn_expires_at - now); + timer_reduce(&rxnet->client_conn_reap_timer, + conn_expires_at); out: spin_unlock(&rxnet->client_conn_cache_lock); @@ -1118,9 +1146,9 @@ void rxrpc_destroy_all_client_connections(struct rxrpc_net *rxnet) rxnet->kill_all_client_conns = true; spin_unlock(&rxnet->client_conn_cache_lock); - cancel_delayed_work(&rxnet->client_conn_reaper); + del_timer_sync(&rxnet->client_conn_reap_timer); - if (!queue_delayed_work(rxrpc_workqueue, &rxnet->client_conn_reaper, 0)) + if (!rxrpc_queue_work(&rxnet->client_conn_reaper)) _debug("destroy: queue failed"); _leave(""); diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 59a51a56e7c8..4ca11be6be3c 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -24,31 +24,28 @@ * Retransmit terminal ACK or ABORT of the previous call. */ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, - struct sk_buff *skb) + struct sk_buff *skb, + unsigned int channel) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_skb_priv *sp = skb ? rxrpc_skb(skb) : NULL; struct rxrpc_channel *chan; struct msghdr msg; - struct kvec iov; + struct kvec iov[3]; struct { struct rxrpc_wire_header whdr; union { - struct { - __be32 code; - } abort; - struct { - struct rxrpc_ackpacket ack; - u8 padding[3]; - struct rxrpc_ackinfo info; - }; + __be32 abort_code; + struct rxrpc_ackpacket ack; }; } __attribute__((packed)) pkt; + struct rxrpc_ackinfo ack_info; size_t len; - u32 serial, mtu, call_id; + int ioc; + u32 serial, mtu, call_id, padding; _enter("%d", conn->debug_id); - chan = &conn->channels[sp->hdr.cid & RXRPC_CHANNELMASK]; + chan = &conn->channels[channel]; /* If the last call got moved on whilst we were waiting to run, just * ignore this packet. @@ -56,7 +53,7 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, call_id = READ_ONCE(chan->last_call); /* Sync with __rxrpc_disconnect_call() */ smp_rmb(); - if (call_id != sp->hdr.callNumber) + if (skb && call_id != sp->hdr.callNumber) return; msg.msg_name = &conn->params.peer->srx.transport; @@ -65,9 +62,16 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, msg.msg_controllen = 0; msg.msg_flags = 0; - pkt.whdr.epoch = htonl(sp->hdr.epoch); - pkt.whdr.cid = htonl(sp->hdr.cid); - pkt.whdr.callNumber = htonl(sp->hdr.callNumber); + iov[0].iov_base = &pkt; + iov[0].iov_len = sizeof(pkt.whdr); + iov[1].iov_base = &padding; + iov[1].iov_len = 3; + iov[2].iov_base = &ack_info; + iov[2].iov_len = sizeof(ack_info); + + pkt.whdr.epoch = htonl(conn->proto.epoch); + pkt.whdr.cid = htonl(conn->proto.cid); + pkt.whdr.callNumber = htonl(call_id); pkt.whdr.seq = 0; pkt.whdr.type = chan->last_type; pkt.whdr.flags = conn->out_clientflag; @@ -79,27 +83,35 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, len = sizeof(pkt.whdr); switch (chan->last_type) { case RXRPC_PACKET_TYPE_ABORT: - pkt.abort.code = htonl(chan->last_abort); - len += sizeof(pkt.abort); + pkt.abort_code = htonl(chan->last_abort); + iov[0].iov_len += sizeof(pkt.abort_code); + len += sizeof(pkt.abort_code); + ioc = 1; break; case RXRPC_PACKET_TYPE_ACK: mtu = conn->params.peer->if_mtu; mtu -= conn->params.peer->hdrsize; pkt.ack.bufferSpace = 0; - pkt.ack.maxSkew = htons(skb->priority); - pkt.ack.firstPacket = htonl(chan->last_seq); - pkt.ack.previousPacket = htonl(chan->last_seq - 1); - pkt.ack.serial = htonl(sp->hdr.serial); - pkt.ack.reason = RXRPC_ACK_DUPLICATE; + pkt.ack.maxSkew = htons(skb ? skb->priority : 0); + pkt.ack.firstPacket = htonl(chan->last_seq + 1); + pkt.ack.previousPacket = htonl(chan->last_seq); + pkt.ack.serial = htonl(skb ? sp->hdr.serial : 0); + pkt.ack.reason = skb ? RXRPC_ACK_DUPLICATE : RXRPC_ACK_IDLE; pkt.ack.nAcks = 0; - pkt.info.rxMTU = htonl(rxrpc_rx_mtu); - pkt.info.maxMTU = htonl(mtu); - pkt.info.rwind = htonl(rxrpc_rx_window_size); - pkt.info.jumbo_max = htonl(rxrpc_rx_jumbo_max); + ack_info.rxMTU = htonl(rxrpc_rx_mtu); + ack_info.maxMTU = htonl(mtu); + ack_info.rwind = htonl(rxrpc_rx_window_size); + ack_info.jumbo_max = htonl(rxrpc_rx_jumbo_max); pkt.whdr.flags |= RXRPC_SLOW_START_OK; - len += sizeof(pkt.ack) + sizeof(pkt.info); + padding = 0; + iov[0].iov_len += sizeof(pkt.ack); + len += sizeof(pkt.ack) + 3 + sizeof(ack_info); + ioc = 3; break; + + default: + return; } /* Resync with __rxrpc_disconnect_call() and check that the last call @@ -109,9 +121,6 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, if (READ_ONCE(chan->last_call) != call_id) return; - iov.iov_base = &pkt; - iov.iov_len = len; - serial = atomic_inc_return(&conn->serial); pkt.whdr.serial = htonl(serial); @@ -126,7 +135,7 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, break; } - kernel_sendmsg(conn->params.local->socket, &msg, &iov, 1, len); + kernel_sendmsg(conn->params.local->socket, &msg, iov, ioc, len); _leave(""); return; } @@ -272,7 +281,8 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_DATA: case RXRPC_PACKET_TYPE_ACK: - rxrpc_conn_retransmit_call(conn, skb); + rxrpc_conn_retransmit_call(conn, skb, + sp->hdr.cid & RXRPC_CHANNELMASK); return 0; case RXRPC_PACKET_TYPE_BUSY: @@ -379,6 +389,48 @@ abort: } /* + * Process delayed final ACKs that we haven't subsumed into a subsequent call. + */ +static void rxrpc_process_delayed_final_acks(struct rxrpc_connection *conn) +{ + unsigned long j = jiffies, next_j; + unsigned int channel; + bool set; + +again: + next_j = j + LONG_MAX; + set = false; + for (channel = 0; channel < RXRPC_MAXCALLS; channel++) { + struct rxrpc_channel *chan = &conn->channels[channel]; + unsigned long ack_at; + + if (!test_bit(RXRPC_CONN_FINAL_ACK_0 + channel, &conn->flags)) + continue; + + smp_rmb(); /* vs rxrpc_disconnect_client_call */ + ack_at = READ_ONCE(chan->final_ack_at); + + if (time_before(j, ack_at)) { + if (time_before(ack_at, next_j)) { + next_j = ack_at; + set = true; + } + continue; + } + + if (test_and_clear_bit(RXRPC_CONN_FINAL_ACK_0 + channel, + &conn->flags)) + rxrpc_conn_retransmit_call(conn, NULL, channel); + } + + j = jiffies; + if (time_before_eq(next_j, j)) + goto again; + if (set) + rxrpc_reduce_conn_timer(conn, next_j); +} + +/* * connection-level event processor */ void rxrpc_process_connection(struct work_struct *work) @@ -394,6 +446,10 @@ void rxrpc_process_connection(struct work_struct *work) if (test_and_clear_bit(RXRPC_CONN_EV_CHALLENGE, &conn->events)) rxrpc_secure_connection(conn); + /* Process delayed ACKs whose time has come. */ + if (conn->flags & RXRPC_CONN_FINAL_ACK_MASK) + rxrpc_process_delayed_final_acks(conn); + /* go through the conn-level event packets, releasing the ref on this * connection that each one has when we've finished with it */ while ((skb = skb_dequeue(&conn->rx_queue))) { diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index fe575798592f..c628351eb900 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -20,10 +20,19 @@ /* * Time till a connection expires after last use (in seconds). */ -unsigned int rxrpc_connection_expiry = 10 * 60; +unsigned int __read_mostly rxrpc_connection_expiry = 10 * 60; +unsigned int __read_mostly rxrpc_closed_conn_expiry = 10; static void rxrpc_destroy_connection(struct rcu_head *); +static void rxrpc_connection_timer(struct timer_list *timer) +{ + struct rxrpc_connection *conn = + container_of(timer, struct rxrpc_connection, timer); + + rxrpc_queue_conn(conn); +} + /* * allocate a new connection */ @@ -38,6 +47,7 @@ struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) INIT_LIST_HEAD(&conn->cache_link); spin_lock_init(&conn->channel_lock); INIT_LIST_HEAD(&conn->waiting_calls); + timer_setup(&conn->timer, &rxrpc_connection_timer, 0); INIT_WORK(&conn->processor, &rxrpc_process_connection); INIT_LIST_HEAD(&conn->proc_link); INIT_LIST_HEAD(&conn->link); @@ -301,21 +311,29 @@ rxrpc_get_connection_maybe(struct rxrpc_connection *conn) } /* + * Set the service connection reap timer. + */ +static void rxrpc_set_service_reap_timer(struct rxrpc_net *rxnet, + unsigned long reap_at) +{ + if (rxnet->live) + timer_reduce(&rxnet->service_conn_reap_timer, reap_at); +} + +/* * Release a service connection */ void rxrpc_put_service_conn(struct rxrpc_connection *conn) { - struct rxrpc_net *rxnet; const void *here = __builtin_return_address(0); int n; n = atomic_dec_return(&conn->usage); trace_rxrpc_conn(conn, rxrpc_conn_put_service, n, here); ASSERTCMP(n, >=, 0); - if (n == 0) { - rxnet = conn->params.local->rxnet; - rxrpc_queue_delayed_work(&rxnet->service_conn_reaper, 0); - } + if (n == 1) + rxrpc_set_service_reap_timer(conn->params.local->rxnet, + jiffies + rxrpc_connection_expiry); } /* @@ -332,6 +350,7 @@ static void rxrpc_destroy_connection(struct rcu_head *rcu) _net("DESTROY CONN %d", conn->debug_id); + del_timer_sync(&conn->timer); rxrpc_purge_queue(&conn->rx_queue); conn->security->clear(conn); @@ -351,17 +370,15 @@ void rxrpc_service_connection_reaper(struct work_struct *work) { struct rxrpc_connection *conn, *_p; struct rxrpc_net *rxnet = - container_of(to_delayed_work(work), - struct rxrpc_net, service_conn_reaper); - unsigned long reap_older_than, earliest, idle_timestamp, now; + container_of(work, struct rxrpc_net, service_conn_reaper); + unsigned long expire_at, earliest, idle_timestamp, now; LIST_HEAD(graveyard); _enter(""); now = jiffies; - reap_older_than = now - rxrpc_connection_expiry * HZ; - earliest = ULONG_MAX; + earliest = now + MAX_JIFFY_OFFSET; write_lock(&rxnet->conn_lock); list_for_each_entry_safe(conn, _p, &rxnet->service_conns, link) { @@ -371,15 +388,21 @@ void rxrpc_service_connection_reaper(struct work_struct *work) if (conn->state == RXRPC_CONN_SERVICE_PREALLOC) continue; - idle_timestamp = READ_ONCE(conn->idle_timestamp); - _debug("reap CONN %d { u=%d,t=%ld }", - conn->debug_id, atomic_read(&conn->usage), - (long)reap_older_than - (long)idle_timestamp); - - if (time_after(idle_timestamp, reap_older_than)) { - if (time_before(idle_timestamp, earliest)) - earliest = idle_timestamp; - continue; + if (rxnet->live) { + idle_timestamp = READ_ONCE(conn->idle_timestamp); + expire_at = idle_timestamp + rxrpc_connection_expiry * HZ; + if (conn->params.local->service_closed) + expire_at = idle_timestamp + rxrpc_closed_conn_expiry * HZ; + + _debug("reap CONN %d { u=%d,t=%ld }", + conn->debug_id, atomic_read(&conn->usage), + (long)expire_at - (long)now); + + if (time_before(now, expire_at)) { + if (time_before(expire_at, earliest)) + earliest = expire_at; + continue; + } } /* The usage count sits at 1 whilst the object is unused on the @@ -387,6 +410,7 @@ void rxrpc_service_connection_reaper(struct work_struct *work) */ if (atomic_cmpxchg(&conn->usage, 1, 0) != 1) continue; + trace_rxrpc_conn(conn, rxrpc_conn_reap_service, 0, 0); if (rxrpc_conn_is_client(conn)) BUG(); @@ -397,11 +421,10 @@ void rxrpc_service_connection_reaper(struct work_struct *work) } write_unlock(&rxnet->conn_lock); - if (earliest != ULONG_MAX) { - _debug("reschedule reaper %ld", (long) earliest - now); + if (earliest != now + MAX_JIFFY_OFFSET) { + _debug("reschedule reaper %ld", (long)earliest - (long)now); ASSERT(time_after(earliest, now)); - rxrpc_queue_delayed_work(&rxnet->client_conn_reaper, - earliest - now); + rxrpc_set_service_reap_timer(rxnet, earliest); } while (!list_empty(&graveyard)) { @@ -429,9 +452,8 @@ void rxrpc_destroy_all_connections(struct rxrpc_net *rxnet) rxrpc_destroy_all_client_connections(rxnet); - rxrpc_connection_expiry = 0; - cancel_delayed_work(&rxnet->client_conn_reaper); - rxrpc_queue_delayed_work(&rxnet->client_conn_reaper, 0); + del_timer_sync(&rxnet->service_conn_reap_timer); + rxrpc_queue_work(&rxnet->service_conn_reaper); flush_workqueue(rxrpc_workqueue); write_lock(&rxnet->conn_lock); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 1b592073ec96..6fc61400337f 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -318,16 +318,18 @@ bad_state: static bool rxrpc_receiving_reply(struct rxrpc_call *call) { struct rxrpc_ack_summary summary = { 0 }; + unsigned long now, timo; rxrpc_seq_t top = READ_ONCE(call->tx_top); if (call->ackr_reason) { spin_lock_bh(&call->lock); call->ackr_reason = 0; - call->resend_at = call->expire_at; - call->ack_at = call->expire_at; spin_unlock_bh(&call->lock); - rxrpc_set_timer(call, rxrpc_timer_init_for_reply, - ktime_get_real()); + now = jiffies; + timo = now + MAX_JIFFY_OFFSET; + WRITE_ONCE(call->resend_at, timo); + WRITE_ONCE(call->ack_at, timo); + trace_rxrpc_timer(call, rxrpc_timer_init_for_reply, now); } if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) @@ -437,6 +439,19 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb, if (state >= RXRPC_CALL_COMPLETE) return; + if (call->state == RXRPC_CALL_SERVER_RECV_REQUEST) { + unsigned long timo = READ_ONCE(call->next_req_timo); + unsigned long now, expect_req_by; + + if (timo) { + now = jiffies; + expect_req_by = now + timo; + WRITE_ONCE(call->expect_req_by, expect_req_by); + rxrpc_reduce_call_timer(call, expect_req_by, now, + rxrpc_timer_set_for_idle); + } + } + /* Received data implicitly ACKs all of the request packets we sent * when we're acting as a client. */ @@ -616,6 +631,43 @@ found: } /* + * Process the response to a ping that we sent to find out if we lost an ACK. + * + * If we got back a ping response that indicates a lower tx_top than what we + * had at the time of the ping transmission, we adjudge all the DATA packets + * sent between the response tx_top and the ping-time tx_top to have been lost. + */ +static void rxrpc_input_check_for_lost_ack(struct rxrpc_call *call) +{ + rxrpc_seq_t top, bottom, seq; + bool resend = false; + + spin_lock_bh(&call->lock); + + bottom = call->tx_hard_ack + 1; + top = call->acks_lost_top; + if (before(bottom, top)) { + for (seq = bottom; before_eq(seq, top); seq++) { + int ix = seq & RXRPC_RXTX_BUFF_MASK; + u8 annotation = call->rxtx_annotations[ix]; + u8 anno_type = annotation & RXRPC_TX_ANNO_MASK; + + if (anno_type != RXRPC_TX_ANNO_UNACK) + continue; + annotation &= ~RXRPC_TX_ANNO_MASK; + annotation |= RXRPC_TX_ANNO_RETRANS; + call->rxtx_annotations[ix] = annotation; + resend = true; + } + } + + spin_unlock_bh(&call->lock); + + if (resend && !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) + rxrpc_queue_call(call); +} + +/* * Process a ping response. */ static void rxrpc_input_ping_response(struct rxrpc_call *call, @@ -630,6 +682,9 @@ static void rxrpc_input_ping_response(struct rxrpc_call *call, smp_rmb(); ping_serial = call->ping_serial; + if (orig_serial == call->acks_lost_ping) + rxrpc_input_check_for_lost_ack(call); + if (!test_bit(RXRPC_CALL_PINGING, &call->flags) || before(orig_serial, ping_serial)) return; @@ -908,9 +963,20 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call, struct sk_buff *skb, u16 skew) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + unsigned long timo; _enter("%p,%p", call, skb); + timo = READ_ONCE(call->next_rx_timo); + if (timo) { + unsigned long now = jiffies, expect_rx_by; + + expect_rx_by = jiffies + timo; + WRITE_ONCE(call->expect_rx_by, expect_rx_by); + rxrpc_reduce_call_timer(call, expect_rx_by, now, + rxrpc_timer_set_for_normal); + } + switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_DATA: rxrpc_input_data(call, skb, skew); @@ -1147,7 +1213,7 @@ void rxrpc_data_ready(struct sock *udp_sk) goto reupgrade; conn->service_id = sp->hdr.serviceId; } - + if (sp->hdr.callNumber == 0) { /* Connection-level packet */ _debug("CONN %p {%d}", conn, conn->debug_id); diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 1a2d4b112064..c1d9e7fd7448 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -21,33 +21,28 @@ unsigned int rxrpc_max_backlog __read_mostly = 10; /* - * Maximum lifetime of a call (in mx). - */ -unsigned int rxrpc_max_call_lifetime = 60 * 1000; - -/* * How long to wait before scheduling ACK generation after seeing a - * packet with RXRPC_REQUEST_ACK set (in ms). + * packet with RXRPC_REQUEST_ACK set (in jiffies). */ -unsigned int rxrpc_requested_ack_delay = 1; +unsigned long rxrpc_requested_ack_delay = 1; /* - * How long to wait before scheduling an ACK with subtype DELAY (in ms). + * How long to wait before scheduling an ACK with subtype DELAY (in jiffies). * * We use this when we've received new data packets. If those packets aren't * all consumed within this time we will send a DELAY ACK if an ACK was not * requested to let the sender know it doesn't need to resend. */ -unsigned int rxrpc_soft_ack_delay = 1 * 1000; +unsigned long rxrpc_soft_ack_delay = HZ; /* - * How long to wait before scheduling an ACK with subtype IDLE (in ms). + * How long to wait before scheduling an ACK with subtype IDLE (in jiffies). * * We use this when we've consumed some previously soft-ACK'd packets when * further packets aren't immediately received to decide when to send an IDLE * ACK let the other end know that it can free up its Tx buffer space. */ -unsigned int rxrpc_idle_ack_delay = 0.5 * 1000; +unsigned long rxrpc_idle_ack_delay = HZ / 2; /* * Receive window size in packets. This indicates the maximum number of @@ -75,7 +70,7 @@ unsigned int rxrpc_rx_jumbo_max = 4; /* * Time till packet resend (in milliseconds). */ -unsigned int rxrpc_resend_timeout = 4 * 1000; +unsigned long rxrpc_resend_timeout = 4 * HZ; const s8 rxrpc_ack_priority[] = { [0] = 0, diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index 7edceb8522f5..f18c9248e0d4 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -14,6 +14,24 @@ unsigned int rxrpc_net_id; +static void rxrpc_client_conn_reap_timeout(struct timer_list *timer) +{ + struct rxrpc_net *rxnet = + container_of(timer, struct rxrpc_net, client_conn_reap_timer); + + if (rxnet->live) + rxrpc_queue_work(&rxnet->client_conn_reaper); +} + +static void rxrpc_service_conn_reap_timeout(struct timer_list *timer) +{ + struct rxrpc_net *rxnet = + container_of(timer, struct rxrpc_net, service_conn_reap_timer); + + if (rxnet->live) + rxrpc_queue_work(&rxnet->service_conn_reaper); +} + /* * Initialise a per-network namespace record. */ @@ -22,6 +40,7 @@ static __net_init int rxrpc_init_net(struct net *net) struct rxrpc_net *rxnet = rxrpc_net(net); int ret; + rxnet->live = true; get_random_bytes(&rxnet->epoch, sizeof(rxnet->epoch)); rxnet->epoch |= RXRPC_RANDOM_EPOCH; @@ -31,8 +50,10 @@ static __net_init int rxrpc_init_net(struct net *net) INIT_LIST_HEAD(&rxnet->conn_proc_list); INIT_LIST_HEAD(&rxnet->service_conns); rwlock_init(&rxnet->conn_lock); - INIT_DELAYED_WORK(&rxnet->service_conn_reaper, - rxrpc_service_connection_reaper); + INIT_WORK(&rxnet->service_conn_reaper, + rxrpc_service_connection_reaper); + timer_setup(&rxnet->service_conn_reap_timer, + rxrpc_service_conn_reap_timeout, 0); rxnet->nr_client_conns = 0; rxnet->nr_active_client_conns = 0; @@ -42,8 +63,10 @@ static __net_init int rxrpc_init_net(struct net *net) INIT_LIST_HEAD(&rxnet->waiting_client_conns); INIT_LIST_HEAD(&rxnet->active_client_conns); INIT_LIST_HEAD(&rxnet->idle_client_conns); - INIT_DELAYED_WORK(&rxnet->client_conn_reaper, - rxrpc_discard_expired_client_conns); + INIT_WORK(&rxnet->client_conn_reaper, + rxrpc_discard_expired_client_conns); + timer_setup(&rxnet->client_conn_reap_timer, + rxrpc_client_conn_reap_timeout, 0); INIT_LIST_HEAD(&rxnet->local_endpoints); mutex_init(&rxnet->local_mutex); @@ -60,6 +83,7 @@ static __net_init int rxrpc_init_net(struct net *net) return 0; err_proc: + rxnet->live = false; return ret; } @@ -70,6 +94,7 @@ static __net_exit void rxrpc_exit_net(struct net *net) { struct rxrpc_net *rxnet = rxrpc_net(net); + rxnet->live = false; rxrpc_destroy_all_calls(rxnet); rxrpc_destroy_all_connections(rxnet); rxrpc_destroy_all_locals(rxnet); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index f47659c7b224..42410e910aff 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -33,6 +33,24 @@ struct rxrpc_abort_buffer { }; /* + * Arrange for a keepalive ping a certain time after we last transmitted. This + * lets the far side know we're still interested in this call and helps keep + * the route through any intervening firewall open. + * + * Receiving a response to the ping will prevent the ->expect_rx_by timer from + * expiring. + */ +static void rxrpc_set_keepalive(struct rxrpc_call *call) +{ + unsigned long now = jiffies, keepalive_at = call->next_rx_timo / 6; + + keepalive_at += now; + WRITE_ONCE(call->keepalive_at, keepalive_at); + rxrpc_reduce_call_timer(call, keepalive_at, now, + rxrpc_timer_set_for_keepalive); +} + +/* * Fill out an ACK packet. */ static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn, @@ -95,7 +113,8 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn, /* * Send an ACK call packet. */ -int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping) +int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, + rxrpc_serial_t *_serial) { struct rxrpc_connection *conn = NULL; struct rxrpc_ack_buffer *pkt; @@ -165,6 +184,8 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping) ntohl(pkt->ack.firstPacket), ntohl(pkt->ack.serial), pkt->ack.reason, pkt->ack.nAcks); + if (_serial) + *_serial = serial; if (ping) { call->ping_serial = serial; @@ -202,6 +223,8 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping) call->ackr_seen = top; spin_unlock_bh(&call->lock); } + + rxrpc_set_keepalive(call); } out: @@ -323,7 +346,8 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, * ACKs if a DATA packet appears to have been lost. */ if (!(sp->hdr.flags & RXRPC_LAST_PACKET) && - (retrans || + (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events) || + retrans || call->cong_mode == RXRPC_CALL_SLOW_START || (call->peer->rtt_usage < 3 && sp->hdr.seq & 1) || ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), @@ -370,8 +394,23 @@ done: if (whdr.flags & RXRPC_REQUEST_ACK) { call->peer->rtt_last_req = now; trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, serial); + if (call->peer->rtt_usage > 1) { + unsigned long nowj = jiffies, ack_lost_at; + + ack_lost_at = nsecs_to_jiffies(2 * call->peer->rtt); + if (ack_lost_at < 1) + ack_lost_at = 1; + + ack_lost_at += nowj; + WRITE_ONCE(call->ack_lost_at, ack_lost_at); + rxrpc_reduce_call_timer(call, ack_lost_at, nowj, + rxrpc_timer_set_for_lost_ack); + } } } + + rxrpc_set_keepalive(call); + _leave(" = %d [%u]", ret, call->peer->maxdata); return ret; diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 7421656963a9..f79f260c6ddc 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -125,7 +125,6 @@ static int rxrpc_call_seq_open(struct inode *inode, struct file *file) } const struct file_operations rxrpc_call_seq_fops = { - .owner = THIS_MODULE, .open = rxrpc_call_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -217,7 +216,6 @@ static int rxrpc_connection_seq_open(struct inode *inode, struct file *file) } const struct file_operations rxrpc_connection_seq_fops = { - .owner = THIS_MODULE, .open = rxrpc_connection_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 8510a98b87e1..cc21e8db25b0 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -144,11 +144,13 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) trace_rxrpc_receive(call, rxrpc_receive_end, 0, call->rx_top); ASSERTCMP(call->rx_hard_ack, ==, call->rx_top); +#if 0 // TODO: May want to transmit final ACK under some circumstances anyway if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) { rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, serial, true, false, rxrpc_propose_ack_terminal_ack); - rxrpc_send_ack_packet(call, false); + rxrpc_send_ack_packet(call, false, NULL); } +#endif write_lock_bh(&call->state_lock); @@ -161,7 +163,7 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) case RXRPC_CALL_SERVER_RECV_REQUEST: call->tx_phase = true; call->state = RXRPC_CALL_SERVER_ACK_REQUEST; - call->ack_at = call->expire_at; + call->expect_req_by = jiffies + MAX_JIFFY_OFFSET; write_unlock_bh(&call->state_lock); rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, 0, serial, false, true, rxrpc_propose_ack_processing_op); @@ -217,10 +219,10 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) after_eq(top, call->ackr_seen + 2) || (hard_ack == top && after(hard_ack, call->ackr_consumed))) rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, 0, serial, - true, false, + true, true, rxrpc_propose_ack_rotate_rx); - if (call->ackr_reason) - rxrpc_send_ack_packet(call, false); + if (call->ackr_reason && call->ackr_reason != RXRPC_ACK_DELAY) + rxrpc_send_ack_packet(call, false, NULL); } } diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 7d2595582c09..09f2a3e05221 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -21,22 +21,6 @@ #include <net/af_rxrpc.h> #include "ar-internal.h" -enum rxrpc_command { - RXRPC_CMD_SEND_DATA, /* send data message */ - RXRPC_CMD_SEND_ABORT, /* request abort generation */ - RXRPC_CMD_ACCEPT, /* [server] accept incoming call */ - RXRPC_CMD_REJECT_BUSY, /* [server] reject a call as busy */ -}; - -struct rxrpc_send_params { - s64 tx_total_len; /* Total Tx data length (if send data) */ - unsigned long user_call_ID; /* User's call ID */ - u32 abort_code; /* Abort code to Tx (if abort) */ - enum rxrpc_command command : 8; /* The command to implement */ - bool exclusive; /* Shared or exclusive call */ - bool upgrade; /* If the connection is upgradeable */ -}; - /* * Wait for space to appear in the Tx queue or a signal to occur. */ @@ -174,6 +158,7 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, rxrpc_notify_end_tx_t notify_end_tx) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + unsigned long now; rxrpc_seq_t seq = sp->hdr.seq; int ret, ix; u8 annotation = RXRPC_TX_ANNO_UNACK; @@ -213,11 +198,11 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, break; case RXRPC_CALL_SERVER_ACK_REQUEST: call->state = RXRPC_CALL_SERVER_SEND_REPLY; - call->ack_at = call->expire_at; + now = jiffies; + WRITE_ONCE(call->ack_at, now + MAX_JIFFY_OFFSET); if (call->ackr_reason == RXRPC_ACK_DELAY) call->ackr_reason = 0; - __rxrpc_set_timer(call, rxrpc_timer_init_for_send_reply, - ktime_get_real()); + trace_rxrpc_timer(call, rxrpc_timer_init_for_send_reply, now); if (!last) break; /* Fall through */ @@ -239,14 +224,19 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, _debug("need instant resend %d", ret); rxrpc_instant_resend(call, ix); } else { - ktime_t now = ktime_get_real(), resend_at; - - resend_at = ktime_add_ms(now, rxrpc_resend_timeout); - - if (ktime_before(resend_at, call->resend_at)) { - call->resend_at = resend_at; - rxrpc_set_timer(call, rxrpc_timer_set_for_send, now); - } + unsigned long now = jiffies, resend_at; + + if (call->peer->rtt_usage > 1) + resend_at = nsecs_to_jiffies(call->peer->rtt * 3 / 2); + else + resend_at = rxrpc_resend_timeout; + if (resend_at < 1) + resend_at = 1; + + resend_at += now; + WRITE_ONCE(call->resend_at, resend_at); + rxrpc_reduce_call_timer(call, resend_at, now, + rxrpc_timer_set_for_send); } rxrpc_free_skb(skb, rxrpc_skb_tx_freed); @@ -295,7 +285,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, do { /* Check to see if there's a ping ACK to reply to. */ if (call->ackr_reason == RXRPC_ACK_PING_RESPONSE) - rxrpc_send_ack_packet(call, false); + rxrpc_send_ack_packet(call, false, NULL); if (!skb) { size_t size, chunk, max, space; @@ -480,11 +470,11 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, struct rxrpc_send_params *p) if (msg->msg_flags & MSG_CMSG_COMPAT) { if (len != sizeof(u32)) return -EINVAL; - p->user_call_ID = *(u32 *)CMSG_DATA(cmsg); + p->call.user_call_ID = *(u32 *)CMSG_DATA(cmsg); } else { if (len != sizeof(unsigned long)) return -EINVAL; - p->user_call_ID = *(unsigned long *) + p->call.user_call_ID = *(unsigned long *) CMSG_DATA(cmsg); } got_user_ID = true; @@ -522,11 +512,24 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, struct rxrpc_send_params *p) break; case RXRPC_TX_LENGTH: - if (p->tx_total_len != -1 || len != sizeof(__s64)) + if (p->call.tx_total_len != -1 || len != sizeof(__s64)) + return -EINVAL; + p->call.tx_total_len = *(__s64 *)CMSG_DATA(cmsg); + if (p->call.tx_total_len < 0) return -EINVAL; - p->tx_total_len = *(__s64 *)CMSG_DATA(cmsg); - if (p->tx_total_len < 0) + break; + + case RXRPC_SET_CALL_TIMEOUT: + if (len & 3 || len < 4 || len > 12) return -EINVAL; + memcpy(&p->call.timeouts, CMSG_DATA(cmsg), len); + p->call.nr_timeouts = len / 4; + if (p->call.timeouts.hard > INT_MAX / HZ) + return -ERANGE; + if (p->call.nr_timeouts >= 2 && p->call.timeouts.idle > 60 * 60 * 1000) + return -ERANGE; + if (p->call.nr_timeouts >= 3 && p->call.timeouts.normal > 60 * 60 * 1000) + return -ERANGE; break; default: @@ -536,7 +539,7 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, struct rxrpc_send_params *p) if (!got_user_ID) return -EINVAL; - if (p->tx_total_len != -1 && p->command != RXRPC_CMD_SEND_DATA) + if (p->call.tx_total_len != -1 && p->command != RXRPC_CMD_SEND_DATA) return -EINVAL; _leave(" = 0"); return 0; @@ -576,8 +579,7 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, cp.exclusive = rx->exclusive | p->exclusive; cp.upgrade = p->upgrade; cp.service_id = srx->srx_service; - call = rxrpc_new_client_call(rx, &cp, srx, p->user_call_ID, - p->tx_total_len, GFP_KERNEL); + call = rxrpc_new_client_call(rx, &cp, srx, &p->call, GFP_KERNEL); /* The socket is now unlocked */ _leave(" = %p\n", call); @@ -594,15 +596,17 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) { enum rxrpc_call_state state; struct rxrpc_call *call; + unsigned long now, j; int ret; struct rxrpc_send_params p = { - .tx_total_len = -1, - .user_call_ID = 0, - .abort_code = 0, - .command = RXRPC_CMD_SEND_DATA, - .exclusive = false, - .upgrade = true, + .call.tx_total_len = -1, + .call.user_call_ID = 0, + .call.nr_timeouts = 0, + .abort_code = 0, + .command = RXRPC_CMD_SEND_DATA, + .exclusive = false, + .upgrade = false, }; _enter(""); @@ -615,15 +619,15 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) ret = -EINVAL; if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) goto error_release_sock; - call = rxrpc_accept_call(rx, p.user_call_ID, NULL); + call = rxrpc_accept_call(rx, p.call.user_call_ID, NULL); /* The socket is now unlocked. */ if (IS_ERR(call)) return PTR_ERR(call); - rxrpc_put_call(call, rxrpc_call_put); - return 0; + ret = 0; + goto out_put_unlock; } - call = rxrpc_find_call_by_user_ID(rx, p.user_call_ID); + call = rxrpc_find_call_by_user_ID(rx, p.call.user_call_ID); if (!call) { ret = -EBADSLT; if (p.command != RXRPC_CMD_SEND_DATA) @@ -653,14 +657,39 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) goto error_put; } - if (p.tx_total_len != -1) { + if (p.call.tx_total_len != -1) { ret = -EINVAL; if (call->tx_total_len != -1 || call->tx_pending || call->tx_top != 0) goto error_put; - call->tx_total_len = p.tx_total_len; + call->tx_total_len = p.call.tx_total_len; + } + } + + switch (p.call.nr_timeouts) { + case 3: + j = msecs_to_jiffies(p.call.timeouts.normal); + if (p.call.timeouts.normal > 0 && j == 0) + j = 1; + WRITE_ONCE(call->next_rx_timo, j); + /* Fall through */ + case 2: + j = msecs_to_jiffies(p.call.timeouts.idle); + if (p.call.timeouts.idle > 0 && j == 0) + j = 1; + WRITE_ONCE(call->next_req_timo, j); + /* Fall through */ + case 1: + if (p.call.timeouts.hard > 0) { + j = msecs_to_jiffies(p.call.timeouts.hard); + now = jiffies; + j += now; + WRITE_ONCE(call->expect_term_by, j); + rxrpc_reduce_call_timer(call, j, now, + rxrpc_timer_set_for_hard); } + break; } state = READ_ONCE(call->state); @@ -689,6 +718,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) ret = rxrpc_send_data(rx, call, msg, len, NULL); } +out_put_unlock: mutex_unlock(&call->user_mutex); error_put: rxrpc_put_call(call, rxrpc_call_put); diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index 34c706d2f79c..4a7af7aff37d 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -21,6 +21,8 @@ static const unsigned int four = 4; static const unsigned int thirtytwo = 32; static const unsigned int n_65535 = 65535; static const unsigned int n_max_acks = RXRPC_RXTX_BUFF_SIZE - 1; +static const unsigned long one_jiffy = 1; +static const unsigned long max_jiffies = MAX_JIFFY_OFFSET; /* * RxRPC operating parameters. @@ -29,64 +31,60 @@ static const unsigned int n_max_acks = RXRPC_RXTX_BUFF_SIZE - 1; * information on the individual parameters. */ static struct ctl_table rxrpc_sysctl_table[] = { - /* Values measured in milliseconds */ + /* Values measured in milliseconds but used in jiffies */ { .procname = "req_ack_delay", .data = &rxrpc_requested_ack_delay, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = (void *)&zero, + .proc_handler = proc_doulongvec_ms_jiffies_minmax, + .extra1 = (void *)&one_jiffy, + .extra2 = (void *)&max_jiffies, }, { .procname = "soft_ack_delay", .data = &rxrpc_soft_ack_delay, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = (void *)&one, + .proc_handler = proc_doulongvec_ms_jiffies_minmax, + .extra1 = (void *)&one_jiffy, + .extra2 = (void *)&max_jiffies, }, { .procname = "idle_ack_delay", .data = &rxrpc_idle_ack_delay, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = (void *)&one, - }, - { - .procname = "resend_timeout", - .data = &rxrpc_resend_timeout, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = (void *)&one, + .proc_handler = proc_doulongvec_ms_jiffies_minmax, + .extra1 = (void *)&one_jiffy, + .extra2 = (void *)&max_jiffies, }, { .procname = "idle_conn_expiry", .data = &rxrpc_conn_idle_client_expiry, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, - .extra1 = (void *)&one, + .proc_handler = proc_doulongvec_ms_jiffies_minmax, + .extra1 = (void *)&one_jiffy, + .extra2 = (void *)&max_jiffies, }, { .procname = "idle_conn_fast_expiry", .data = &rxrpc_conn_idle_client_fast_expiry, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, - .extra1 = (void *)&one, + .proc_handler = proc_doulongvec_ms_jiffies_minmax, + .extra1 = (void *)&one_jiffy, + .extra2 = (void *)&max_jiffies, }, - - /* Values measured in seconds but used in jiffies */ { - .procname = "max_call_lifetime", - .data = &rxrpc_max_call_lifetime, - .maxlen = sizeof(unsigned int), + .procname = "resend_timeout", + .data = &rxrpc_resend_timeout, + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = (void *)&one, + .proc_handler = proc_doulongvec_ms_jiffies_minmax, + .extra1 = (void *)&one_jiffy, + .extra2 = (void *)&max_jiffies, }, /* Non-time values */ diff --git a/net/sched/Kconfig b/net/sched/Kconfig index c03d86a7775e..f24a6ae6819a 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -857,17 +857,14 @@ config NET_ACT_TUNNEL_KEY config NET_IFE_SKBMARK tristate "Support to encoding decoding skb mark on IFE action" depends on NET_ACT_IFE - ---help--- config NET_IFE_SKBPRIO tristate "Support to encoding decoding skb prio on IFE action" depends on NET_ACT_IFE - ---help--- config NET_IFE_SKBTCINDEX tristate "Support to encoding decoding skb tcindex on IFE action" depends on NET_ACT_IFE - ---help--- config NET_CLS_IND bool "Incoming device classification" diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 4d33a50a8a6d..52622a3d2517 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -99,7 +99,7 @@ int __tcf_idr_release(struct tc_action *p, bool bind, bool strict) p->tcfa_refcnt--; if (p->tcfa_bindcnt <= 0 && p->tcfa_refcnt <= 0) { if (p->ops->cleanup) - p->ops->cleanup(p, bind); + p->ops->cleanup(p); tcf_idr_remove(p->idrinfo, p); ret = ACT_P_DELETED; } diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 5ef8ce8c83d4..b3f2c15affa7 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -357,7 +357,7 @@ out: return ret; } -static void tcf_bpf_cleanup(struct tc_action *act, int bind) +static void tcf_bpf_cleanup(struct tc_action *act) { struct tcf_bpf_cfg tmp; @@ -401,16 +401,14 @@ static __net_init int bpf_init_net(struct net *net) return tc_action_net_init(tn, &act_bpf_ops); } -static void __net_exit bpf_exit_net(struct net *net) +static void __net_exit bpf_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, bpf_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, bpf_net_id); } static struct pernet_operations bpf_net_ops = { .init = bpf_init_net, - .exit = bpf_exit_net, + .exit_batch = bpf_exit_net, .id = &bpf_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 10b7a8855a6c..2b15ba84e0c8 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -209,16 +209,14 @@ static __net_init int connmark_init_net(struct net *net) return tc_action_net_init(tn, &act_connmark_ops); } -static void __net_exit connmark_exit_net(struct net *net) +static void __net_exit connmark_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, connmark_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, connmark_net_id); } static struct pernet_operations connmark_net_ops = { .init = connmark_init_net, - .exit = connmark_exit_net, + .exit_batch = connmark_exit_net, .id = &connmark_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index d836f998117b..b7ba9b06b147 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -49,6 +49,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, int bind) { struct tc_action_net *tn = net_generic(net, csum_net_id); + struct tcf_csum_params *params_old, *params_new; struct nlattr *tb[TCA_CSUM_MAX + 1]; struct tc_csum *parm; struct tcf_csum *p; @@ -67,7 +68,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, if (!tcf_idr_check(tn, parm->index, a, bind)) { ret = tcf_idr_create(tn, parm->index, est, a, - &act_csum_ops, bind, false); + &act_csum_ops, bind, true); if (ret) return ret; ret = ACT_P_CREATED; @@ -80,10 +81,21 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, } p = to_tcf_csum(*a); - spin_lock_bh(&p->tcf_lock); - p->tcf_action = parm->action; - p->update_flags = parm->update_flags; - spin_unlock_bh(&p->tcf_lock); + ASSERT_RTNL(); + + params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); + if (unlikely(!params_new)) { + if (ret == ACT_P_CREATED) + tcf_idr_release(*a, bind); + return -ENOMEM; + } + params_old = rtnl_dereference(p->params); + + params_new->action = parm->action; + params_new->update_flags = parm->update_flags; + rcu_assign_pointer(p->params, params_new); + if (params_old) + kfree_rcu(params_old, rcu); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -539,19 +551,21 @@ static int tcf_csum(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_csum *p = to_tcf_csum(a); - int action; + struct tcf_csum_params *params; u32 update_flags; + int action; + + rcu_read_lock(); + params = rcu_dereference(p->params); - spin_lock(&p->tcf_lock); tcf_lastuse_update(&p->tcf_tm); - bstats_update(&p->tcf_bstats, skb); - action = p->tcf_action; - update_flags = p->update_flags; - spin_unlock(&p->tcf_lock); + bstats_cpu_update(this_cpu_ptr(p->common.cpu_bstats), skb); + action = params->action; if (unlikely(action == TC_ACT_SHOT)) - goto drop; + goto drop_stats; + update_flags = params->update_flags; switch (tc_skb_protocol(skb)) { case cpu_to_be16(ETH_P_IP): if (!tcf_csum_ipv4(skb, update_flags)) @@ -563,13 +577,16 @@ static int tcf_csum(struct sk_buff *skb, const struct tc_action *a, break; } +unlock: + rcu_read_unlock(); return action; drop: - spin_lock(&p->tcf_lock); - p->tcf_qstats.drops++; - spin_unlock(&p->tcf_lock); - return TC_ACT_SHOT; + action = TC_ACT_SHOT; + +drop_stats: + qstats_drop_inc(this_cpu_ptr(p->common.cpu_qstats)); + goto unlock; } static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, @@ -577,15 +594,18 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, { unsigned char *b = skb_tail_pointer(skb); struct tcf_csum *p = to_tcf_csum(a); + struct tcf_csum_params *params; struct tc_csum opt = { - .update_flags = p->update_flags, .index = p->tcf_index, - .action = p->tcf_action, .refcnt = p->tcf_refcnt - ref, .bindcnt = p->tcf_bindcnt - bind, }; struct tcf_t t; + params = rtnl_dereference(p->params); + opt.action = params->action; + opt.update_flags = params->update_flags; + if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -600,6 +620,15 @@ nla_put_failure: return -1; } +static void tcf_csum_cleanup(struct tc_action *a) +{ + struct tcf_csum *p = to_tcf_csum(a); + struct tcf_csum_params *params; + + params = rcu_dereference_protected(p->params, 1); + kfree_rcu(params, rcu); +} + static int tcf_csum_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, const struct tc_action_ops *ops) @@ -623,6 +652,7 @@ static struct tc_action_ops act_csum_ops = { .act = tcf_csum, .dump = tcf_csum_dump, .init = tcf_csum_init, + .cleanup = tcf_csum_cleanup, .walk = tcf_csum_walker, .lookup = tcf_csum_search, .size = sizeof(struct tcf_csum), @@ -635,16 +665,14 @@ static __net_init int csum_init_net(struct net *net) return tc_action_net_init(tn, &act_csum_ops); } -static void __net_exit csum_exit_net(struct net *net) +static void __net_exit csum_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, csum_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, csum_net_id); } static struct pernet_operations csum_net_ops = { .init = csum_init_net, - .exit = csum_exit_net, + .exit_batch = csum_exit_net, .id = &csum_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index e29a48ef7fc3..b56986d41c87 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -159,7 +159,7 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets, if (action == TC_ACT_SHOT) this_cpu_ptr(gact->common.cpu_qstats)->drops += packets; - tm->lastuse = lastuse; + tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, @@ -235,16 +235,14 @@ static __net_init int gact_init_net(struct net *net) return tc_action_net_init(tn, &act_gact_ops); } -static void __net_exit gact_exit_net(struct net *net) +static void __net_exit gact_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, gact_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, gact_net_id); } static struct pernet_operations gact_net_ops = { .init = gact_init_net, - .exit = gact_exit_net, + .exit_batch = gact_exit_net, .id = &gact_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 3007cb1310ea..5954e992685a 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -387,7 +387,7 @@ out_nlmsg_trim: } /* under ife->tcf_lock */ -static void _tcf_ife_cleanup(struct tc_action *a, int bind) +static void _tcf_ife_cleanup(struct tc_action *a) { struct tcf_ife_info *ife = to_ife(a); struct tcf_meta_info *e, *n; @@ -405,13 +405,13 @@ static void _tcf_ife_cleanup(struct tc_action *a, int bind) } } -static void tcf_ife_cleanup(struct tc_action *a, int bind) +static void tcf_ife_cleanup(struct tc_action *a) { struct tcf_ife_info *ife = to_ife(a); struct tcf_ife_params *p; spin_lock_bh(&ife->tcf_lock); - _tcf_ife_cleanup(a, bind); + _tcf_ife_cleanup(a); spin_unlock_bh(&ife->tcf_lock); p = rcu_dereference_protected(ife->params, 1); @@ -546,7 +546,7 @@ metadata_parse_err: if (exists) tcf_idr_release(*a, bind); if (ret == ACT_P_CREATED) - _tcf_ife_cleanup(*a, bind); + _tcf_ife_cleanup(*a); if (exists) spin_unlock_bh(&ife->tcf_lock); @@ -567,7 +567,7 @@ metadata_parse_err: err = use_all_metadata(ife); if (err) { if (ret == ACT_P_CREATED) - _tcf_ife_cleanup(*a, bind); + _tcf_ife_cleanup(*a); if (exists) spin_unlock_bh(&ife->tcf_lock); @@ -858,16 +858,14 @@ static __net_init int ife_init_net(struct net *net) return tc_action_net_init(tn, &act_ife_ops); } -static void __net_exit ife_exit_net(struct net *net) +static void __net_exit ife_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, ife_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, ife_net_id); } static struct pernet_operations ife_net_ops = { .init = ife_init_net, - .exit = ife_exit_net, + .exit_batch = ife_exit_net, .id = &ife_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index d9e399a7e3d5..06e380ae0928 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -77,7 +77,7 @@ static void ipt_destroy_target(struct xt_entry_target *t) module_put(par.target->me); } -static void tcf_ipt_release(struct tc_action *a, int bind) +static void tcf_ipt_release(struct tc_action *a) { struct tcf_ipt *ipt = to_ipt(a); ipt_destroy_target(ipt->tcfi_t); @@ -337,16 +337,14 @@ static __net_init int ipt_init_net(struct net *net) return tc_action_net_init(tn, &act_ipt_ops); } -static void __net_exit ipt_exit_net(struct net *net) +static void __net_exit ipt_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, ipt_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, ipt_net_id); } static struct pernet_operations ipt_net_ops = { .init = ipt_init_net, - .exit = ipt_exit_net, + .exit_batch = ipt_exit_net, .id = &ipt_net_id, .size = sizeof(struct tc_action_net), }; @@ -387,16 +385,14 @@ static __net_init int xt_init_net(struct net *net) return tc_action_net_init(tn, &act_xt_ops); } -static void __net_exit xt_exit_net(struct net *net) +static void __net_exit xt_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, xt_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, xt_net_id); } static struct pernet_operations xt_net_ops = { .init = xt_init_net, - .exit = xt_exit_net, + .exit_batch = xt_exit_net, .id = &xt_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_meta_mark.c b/net/sched/act_meta_mark.c index 1e3f10e5da99..6445184b2759 100644 --- a/net/sched/act_meta_mark.c +++ b/net/sched/act_meta_mark.c @@ -22,7 +22,6 @@ #include <net/pkt_sched.h> #include <uapi/linux/tc_act/tc_ife.h> #include <net/tc_act/tc_ife.h> -#include <linux/rtnetlink.h> static int skbmark_encode(struct sk_buff *skb, void *skbdata, struct tcf_meta_info *e) diff --git a/net/sched/act_meta_skbtcindex.c b/net/sched/act_meta_skbtcindex.c index 2ea1f26c9e96..7221437ca3a6 100644 --- a/net/sched/act_meta_skbtcindex.c +++ b/net/sched/act_meta_skbtcindex.c @@ -22,7 +22,6 @@ #include <net/pkt_sched.h> #include <uapi/linux/tc_act/tc_ife.h> #include <net/tc_act/tc_ife.h> -#include <linux/rtnetlink.h> static int skbtcindex_encode(struct sk_buff *skb, void *skbdata, struct tcf_meta_info *e) diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 8b3e59388480..e6ff88f72900 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -29,7 +29,6 @@ #include <net/tc_act/tc_mirred.h> static LIST_HEAD(mirred_list); -static DEFINE_SPINLOCK(mirred_list_lock); static bool tcf_mirred_is_act_redirect(int action) { @@ -50,18 +49,15 @@ static bool tcf_mirred_act_wants_ingress(int action) } } -static void tcf_mirred_release(struct tc_action *a, int bind) +static void tcf_mirred_release(struct tc_action *a) { struct tcf_mirred *m = to_mirred(a); struct net_device *dev; - /* We could be called either in a RCU callback or with RTNL lock held. */ - spin_lock_bh(&mirred_list_lock); list_del(&m->tcfm_list); - dev = rcu_dereference_protected(m->tcfm_dev, 1); + dev = rtnl_dereference(m->tcfm_dev); if (dev) dev_put(dev); - spin_unlock_bh(&mirred_list_lock); } static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { @@ -139,8 +135,6 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, m->tcf_action = parm->action; m->tcfm_eaction = parm->eaction; if (dev != NULL) { - m->tcfm_ifindex = parm->ifindex; - m->net = net; if (ret != ACT_P_CREATED) dev_put(rcu_dereference_protected(m->tcfm_dev, 1)); dev_hold(dev); @@ -149,9 +143,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, } if (ret == ACT_P_CREATED) { - spin_lock_bh(&mirred_list_lock); list_add(&m->tcfm_list, &mirred_list); - spin_unlock_bh(&mirred_list_lock); tcf_idr_insert(tn, *a); } @@ -239,7 +231,7 @@ static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets, struct tcf_t *tm = &m->tcf_tm; _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); - tm->lastuse = lastuse; + tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, @@ -247,13 +239,14 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, { unsigned char *b = skb_tail_pointer(skb); struct tcf_mirred *m = to_mirred(a); + struct net_device *dev = rtnl_dereference(m->tcfm_dev); struct tc_mirred opt = { .index = m->tcf_index, .action = m->tcf_action, .refcnt = m->tcf_refcnt - ref, .bindcnt = m->tcf_bindcnt - bind, .eaction = m->tcfm_eaction, - .ifindex = m->tcfm_ifindex, + .ifindex = dev ? dev->ifindex : 0, }; struct tcf_t t; @@ -294,7 +287,6 @@ static int mirred_device_event(struct notifier_block *unused, ASSERT_RTNL(); if (event == NETDEV_UNREGISTER) { - spin_lock_bh(&mirred_list_lock); list_for_each_entry(m, &mirred_list, tcfm_list) { if (rcu_access_pointer(m->tcfm_dev) == dev) { dev_put(dev); @@ -304,7 +296,6 @@ static int mirred_device_event(struct notifier_block *unused, RCU_INIT_POINTER(m->tcfm_dev, NULL); } } - spin_unlock_bh(&mirred_list_lock); } return NOTIFY_DONE; @@ -318,7 +309,7 @@ static struct net_device *tcf_mirred_get_dev(const struct tc_action *a) { struct tcf_mirred *m = to_mirred(a); - return __dev_get_by_index(m->net, m->tcfm_ifindex); + return rtnl_dereference(m->tcfm_dev); } static struct tc_action_ops act_mirred_ops = { @@ -343,16 +334,14 @@ static __net_init int mirred_init_net(struct net *net) return tc_action_net_init(tn, &act_mirred_ops); } -static void __net_exit mirred_exit_net(struct net *net) +static void __net_exit mirred_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, mirred_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, mirred_net_id); } static struct pernet_operations mirred_net_ops = { .init = mirred_init_net, - .exit = mirred_exit_net, + .exit_batch = mirred_exit_net, .id = &mirred_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index c365d01b99c8..98c6a4b2f523 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -310,16 +310,14 @@ static __net_init int nat_init_net(struct net *net) return tc_action_net_init(tn, &act_nat_ops); } -static void __net_exit nat_exit_net(struct net *net) +static void __net_exit nat_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, nat_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, nat_net_id); } static struct pernet_operations nat_net_ops = { .init = nat_init_net, - .exit = nat_exit_net, + .exit_batch = nat_exit_net, .id = &nat_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 491fe5deb09e..349beaffb29e 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -216,7 +216,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, return ret; } -static void tcf_pedit_cleanup(struct tc_action *a, int bind) +static void tcf_pedit_cleanup(struct tc_action *a) { struct tcf_pedit *p = to_pedit(a); struct tc_pedit_key *keys = p->tcfp_keys; @@ -453,16 +453,14 @@ static __net_init int pedit_init_net(struct net *net) return tc_action_net_init(tn, &act_pedit_ops); } -static void __net_exit pedit_exit_net(struct net *net) +static void __net_exit pedit_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, pedit_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, pedit_net_id); } static struct pernet_operations pedit_net_ops = { .init = pedit_init_net, - .exit = pedit_exit_net, + .exit_batch = pedit_exit_net, .id = &pedit_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 3bb2ebf9e9ae..95d3c9097b25 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -118,13 +118,13 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla, police = to_police(*a); if (parm->rate.rate) { err = -ENOMEM; - R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE]); + R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE], NULL); if (R_tab == NULL) goto failure; if (parm->peakrate.rate) { P_tab = qdisc_get_rtab(&parm->peakrate, - tb[TCA_POLICE_PEAKRATE]); + tb[TCA_POLICE_PEAKRATE], NULL); if (P_tab == NULL) goto failure; } @@ -334,16 +334,14 @@ static __net_init int police_init_net(struct net *net) return tc_action_net_init(tn, &act_police_ops); } -static void __net_exit police_exit_net(struct net *net) +static void __net_exit police_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, police_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, police_net_id); } static struct pernet_operations police_net_ops = { .init = police_init_net, - .exit = police_exit_net, + .exit_batch = police_exit_net, .id = &police_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 8b5abcd2f32f..1ba0df238756 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -96,23 +96,16 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, return ret; } -static void tcf_sample_cleanup_rcu(struct rcu_head *rcu) +static void tcf_sample_cleanup(struct tc_action *a) { - struct tcf_sample *s = container_of(rcu, struct tcf_sample, rcu); + struct tcf_sample *s = to_sample(a); struct psample_group *psample_group; - psample_group = rcu_dereference_protected(s->psample_group, 1); + psample_group = rtnl_dereference(s->psample_group); RCU_INIT_POINTER(s->psample_group, NULL); psample_group_put(psample_group); } -static void tcf_sample_cleanup(struct tc_action *a, int bind) -{ - struct tcf_sample *s = to_sample(a); - - call_rcu(&s->rcu, tcf_sample_cleanup_rcu); -} - static bool tcf_sample_dev_ok_push(struct net_device *dev) { switch (dev->type) { @@ -243,16 +236,14 @@ static __net_init int sample_init_net(struct net *net) return tc_action_net_init(tn, &act_sample_ops); } -static void __net_exit sample_exit_net(struct net *net) +static void __net_exit sample_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, sample_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, sample_net_id); } static struct pernet_operations sample_net_ops = { .init = sample_init_net, - .exit = sample_exit_net, + .exit_batch = sample_exit_net, .id = &sample_net_id, .size = sizeof(struct tc_action_net), }; @@ -264,7 +255,6 @@ static int __init sample_init_module(void) static void __exit sample_cleanup_module(void) { - rcu_barrier(); tcf_unregister_action(&act_sample_ops, &sample_net_ops); } diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index e7b57e5071a3..425eac11f6da 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -47,7 +47,7 @@ static int tcf_simp(struct sk_buff *skb, const struct tc_action *a, return d->tcf_action; } -static void tcf_simp_release(struct tc_action *a, int bind) +static void tcf_simp_release(struct tc_action *a) { struct tcf_defact *d = to_defact(a); kfree(d->tcfd_defdata); @@ -204,16 +204,14 @@ static __net_init int simp_init_net(struct net *net) return tc_action_net_init(tn, &act_simp_ops); } -static void __net_exit simp_exit_net(struct net *net) +static void __net_exit simp_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, simp_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, simp_net_id); } static struct pernet_operations simp_net_ops = { .init = simp_init_net, - .exit = simp_exit_net, + .exit_batch = simp_exit_net, .id = &simp_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 59949d61f20d..5a3f691bb545 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -241,16 +241,14 @@ static __net_init int skbedit_init_net(struct net *net) return tc_action_net_init(tn, &act_skbedit_ops); } -static void __net_exit skbedit_exit_net(struct net *net) +static void __net_exit skbedit_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, skbedit_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, skbedit_net_id); } static struct pernet_operations skbedit_net_ops = { .init = skbedit_init_net, - .exit = skbedit_exit_net, + .exit_batch = skbedit_exit_net, .id = &skbedit_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index b642ad3d39dd..fa975262dbac 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -184,7 +184,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, return ret; } -static void tcf_skbmod_cleanup(struct tc_action *a, int bind) +static void tcf_skbmod_cleanup(struct tc_action *a) { struct tcf_skbmod *d = to_skbmod(a); struct tcf_skbmod_params *p; @@ -266,16 +266,14 @@ static __net_init int skbmod_init_net(struct net *net) return tc_action_net_init(tn, &act_skbmod_ops); } -static void __net_exit skbmod_exit_net(struct net *net) +static void __net_exit skbmod_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, skbmod_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, skbmod_net_id); } static struct pernet_operations skbmod_net_ops = { .init = skbmod_init_net, - .exit = skbmod_exit_net, + .exit_batch = skbmod_exit_net, .id = &skbmod_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 30c96274c638..0e23aac09ad6 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -201,7 +201,7 @@ err_out: return ret; } -static void tunnel_key_release(struct tc_action *a, int bind) +static void tunnel_key_release(struct tc_action *a) { struct tcf_tunnel_key *t = to_tunnel_key(a); struct tcf_tunnel_key_params *params; @@ -325,16 +325,14 @@ static __net_init int tunnel_key_init_net(struct net *net) return tc_action_net_init(tn, &act_tunnel_key_ops); } -static void __net_exit tunnel_key_exit_net(struct net *net) +static void __net_exit tunnel_key_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, tunnel_key_net_id); } static struct pernet_operations tunnel_key_net_ops = { .init = tunnel_key_init_net, - .exit = tunnel_key_exit_net, + .exit_batch = tunnel_key_exit_net, .id = &tunnel_key_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 97f717a13ad5..e1a1b3f3983a 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -219,7 +219,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, return ret; } -static void tcf_vlan_cleanup(struct tc_action *a, int bind) +static void tcf_vlan_cleanup(struct tc_action *a) { struct tcf_vlan *v = to_vlan(a); struct tcf_vlan_params *p; @@ -301,16 +301,14 @@ static __net_init int vlan_init_net(struct net *net) return tc_action_net_init(tn, &act_vlan_ops); } -static void __net_exit vlan_exit_net(struct net *net) +static void __net_exit vlan_exit_net(struct list_head *net_list) { - struct tc_action_net *tn = net_generic(net, vlan_net_id); - - tc_action_net_exit(tn); + tc_action_net_exit(net_list, vlan_net_id); } static struct pernet_operations vlan_net_ops = { .init = vlan_init_net, - .exit = vlan_exit_net, + .exit_batch = vlan_exit_net, .id = &vlan_net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 7d97f612c9b9..bcb4ccb5f894 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -23,8 +23,8 @@ #include <linux/skbuff.h> #include <linux/init.h> #include <linux/kmod.h> -#include <linux/err.h> #include <linux/slab.h> +#include <linux/idr.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/netlink.h> @@ -122,8 +122,8 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp) } static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, - u32 prio, u32 parent, struct Qdisc *q, - struct tcf_chain *chain) + u32 prio, struct tcf_chain *chain, + struct netlink_ext_ack *extack) { struct tcf_proto *tp; int err; @@ -149,6 +149,7 @@ static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, module_put(tp->ops->owner); err = -EAGAIN; } else { + NL_SET_ERR_MSG(extack, "TC classifier not found"); err = -ENOENT; } goto errout; @@ -157,8 +158,6 @@ static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, tp->classify = tp->ops->classify; tp->protocol = protocol; tp->prio = prio; - tp->classid = parent; - tp->q = q; tp->chain = chain; err = tp->ops->init(tp); @@ -173,13 +172,20 @@ errout: return ERR_PTR(err); } -static void tcf_proto_destroy(struct tcf_proto *tp) +static void tcf_proto_destroy(struct tcf_proto *tp, + struct netlink_ext_ack *extack) { - tp->ops->destroy(tp); + tp->ops->destroy(tp, extack); module_put(tp->ops->owner); kfree_rcu(tp, rcu); } +struct tcf_filter_chain_list_item { + struct list_head list; + tcf_chain_head_change_t *chain_head_change; + void *chain_head_change_priv; +}; + static struct tcf_chain *tcf_chain_create(struct tcf_block *block, u32 chain_index) { @@ -188,6 +194,7 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block, chain = kzalloc(sizeof(*chain), GFP_KERNEL); if (!chain) return NULL; + INIT_LIST_HEAD(&chain->filter_chain_list); list_add_tail(&chain->list, &block->chain_list); chain->block = block; chain->index = chain_index; @@ -195,12 +202,19 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block, return chain; } +static void tcf_chain_head_change_item(struct tcf_filter_chain_list_item *item, + struct tcf_proto *tp_head) +{ + if (item->chain_head_change) + item->chain_head_change(tp_head, item->chain_head_change_priv); +} static void tcf_chain_head_change(struct tcf_chain *chain, struct tcf_proto *tp_head) { - if (chain->chain_head_change) - chain->chain_head_change(tp_head, - chain->chain_head_change_priv); + struct tcf_filter_chain_list_item *item; + + list_for_each_entry(item, &chain->filter_chain_list, list) + tcf_chain_head_change_item(item, tp_head); } static void tcf_chain_flush(struct tcf_chain *chain) @@ -210,7 +224,7 @@ static void tcf_chain_flush(struct tcf_chain *chain) tcf_chain_head_change(chain, NULL); while (tp) { RCU_INIT_POINTER(chain->filter_chain, tp->next); - tcf_proto_destroy(tp); + tcf_proto_destroy(tp, NULL); tp = rtnl_dereference(chain->filter_chain); tcf_chain_put(chain); } @@ -218,8 +232,12 @@ static void tcf_chain_flush(struct tcf_chain *chain) static void tcf_chain_destroy(struct tcf_chain *chain) { + struct tcf_block *block = chain->block; + list_del(&chain->list); kfree(chain); + if (list_empty(&block->chain_list)) + kfree(block); } static void tcf_chain_hold(struct tcf_chain *chain) @@ -250,62 +268,300 @@ void tcf_chain_put(struct tcf_chain *chain) } EXPORT_SYMBOL(tcf_chain_put); -static void tcf_block_offload_cmd(struct tcf_block *block, struct Qdisc *q, - struct tcf_block_ext_info *ei, - enum tc_block_command command) +static bool tcf_block_offload_in_use(struct tcf_block *block) +{ + return block->offloadcnt; +} + +static int tcf_block_offload_cmd(struct tcf_block *block, + struct net_device *dev, + struct tcf_block_ext_info *ei, + enum tc_block_command command) { - struct net_device *dev = q->dev_queue->dev; struct tc_block_offload bo = {}; - if (!dev->netdev_ops->ndo_setup_tc) - return; bo.command = command; bo.binder_type = ei->binder_type; bo.block = block; - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); + return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); } -static void tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, - struct tcf_block_ext_info *ei) +static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, + struct tcf_block_ext_info *ei) { - tcf_block_offload_cmd(block, q, ei, TC_BLOCK_BIND); + struct net_device *dev = q->dev_queue->dev; + int err; + + if (!dev->netdev_ops->ndo_setup_tc) + goto no_offload_dev_inc; + + /* If tc offload feature is disabled and the block we try to bind + * to already has some offloaded filters, forbid to bind. + */ + if (!tc_can_offload(dev) && tcf_block_offload_in_use(block)) + return -EOPNOTSUPP; + + err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND); + if (err == -EOPNOTSUPP) + goto no_offload_dev_inc; + return err; + +no_offload_dev_inc: + if (tcf_block_offload_in_use(block)) + return -EOPNOTSUPP; + block->nooffloaddevcnt++; + return 0; } static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei) { - tcf_block_offload_cmd(block, q, ei, TC_BLOCK_UNBIND); + struct net_device *dev = q->dev_queue->dev; + int err; + + if (!dev->netdev_ops->ndo_setup_tc) + goto no_offload_dev_dec; + err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND); + if (err == -EOPNOTSUPP) + goto no_offload_dev_dec; + return; + +no_offload_dev_dec: + WARN_ON(block->nooffloaddevcnt-- == 0); } -int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, - struct tcf_block_ext_info *ei) +static int +tcf_chain_head_change_cb_add(struct tcf_chain *chain, + struct tcf_block_ext_info *ei, + struct netlink_ext_ack *extack) +{ + struct tcf_filter_chain_list_item *item; + + item = kmalloc(sizeof(*item), GFP_KERNEL); + if (!item) { + NL_SET_ERR_MSG(extack, "Memory allocation for head change callback item failed"); + return -ENOMEM; + } + item->chain_head_change = ei->chain_head_change; + item->chain_head_change_priv = ei->chain_head_change_priv; + if (chain->filter_chain) + tcf_chain_head_change_item(item, chain->filter_chain); + list_add(&item->list, &chain->filter_chain_list); + return 0; +} + +static void +tcf_chain_head_change_cb_del(struct tcf_chain *chain, + struct tcf_block_ext_info *ei) +{ + struct tcf_filter_chain_list_item *item; + + list_for_each_entry(item, &chain->filter_chain_list, list) { + if ((!ei->chain_head_change && !ei->chain_head_change_priv) || + (item->chain_head_change == ei->chain_head_change && + item->chain_head_change_priv == ei->chain_head_change_priv)) { + tcf_chain_head_change_item(item, NULL); + list_del(&item->list); + kfree(item); + return; + } + } + WARN_ON(1); +} + +struct tcf_net { + struct idr idr; +}; + +static unsigned int tcf_net_id; + +static int tcf_block_insert(struct tcf_block *block, struct net *net, + u32 block_index, struct netlink_ext_ack *extack) +{ + struct tcf_net *tn = net_generic(net, tcf_net_id); + int err; + + err = idr_alloc_ext(&tn->idr, block, NULL, block_index, + block_index + 1, GFP_KERNEL); + if (err) + return err; + block->index = block_index; + return 0; +} + +static void tcf_block_remove(struct tcf_block *block, struct net *net) +{ + struct tcf_net *tn = net_generic(net, tcf_net_id); + + idr_remove_ext(&tn->idr, block->index); +} + +static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q, + struct netlink_ext_ack *extack) { - struct tcf_block *block = kzalloc(sizeof(*block), GFP_KERNEL); + struct tcf_block *block; struct tcf_chain *chain; int err; - if (!block) - return -ENOMEM; + block = kzalloc(sizeof(*block), GFP_KERNEL); + if (!block) { + NL_SET_ERR_MSG(extack, "Memory allocation for block failed"); + return ERR_PTR(-ENOMEM); + } INIT_LIST_HEAD(&block->chain_list); INIT_LIST_HEAD(&block->cb_list); + INIT_LIST_HEAD(&block->owner_list); /* Create chain 0 by default, it has to be always present. */ chain = tcf_chain_create(block, 0); if (!chain) { + NL_SET_ERR_MSG(extack, "Failed to create new tcf chain"); err = -ENOMEM; goto err_chain_create; } - WARN_ON(!ei->chain_head_change); - chain->chain_head_change = ei->chain_head_change; - chain->chain_head_change_priv = ei->chain_head_change_priv; block->net = qdisc_net(q); + block->refcnt = 1; + block->net = net; block->q = q; - tcf_block_offload_bind(block, q, ei); - *p_block = block; - return 0; + return block; err_chain_create: kfree(block); + return ERR_PTR(err); +} + +static struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index) +{ + struct tcf_net *tn = net_generic(net, tcf_net_id); + + return idr_find_ext(&tn->idr, block_index); +} + +static struct tcf_chain *tcf_block_chain_zero(struct tcf_block *block) +{ + return list_first_entry(&block->chain_list, struct tcf_chain, list); +} + +struct tcf_block_owner_item { + struct list_head list; + struct Qdisc *q; + enum tcf_block_binder_type binder_type; +}; + +static void +tcf_block_owner_netif_keep_dst(struct tcf_block *block, + struct Qdisc *q, + enum tcf_block_binder_type binder_type) +{ + if (block->keep_dst && + binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS && + binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS) + netif_keep_dst(qdisc_dev(q)); +} + +void tcf_block_netif_keep_dst(struct tcf_block *block) +{ + struct tcf_block_owner_item *item; + + block->keep_dst = true; + list_for_each_entry(item, &block->owner_list, list) + tcf_block_owner_netif_keep_dst(block, item->q, + item->binder_type); +} +EXPORT_SYMBOL(tcf_block_netif_keep_dst); + +static int tcf_block_owner_add(struct tcf_block *block, + struct Qdisc *q, + enum tcf_block_binder_type binder_type) +{ + struct tcf_block_owner_item *item; + + item = kmalloc(sizeof(*item), GFP_KERNEL); + if (!item) + return -ENOMEM; + item->q = q; + item->binder_type = binder_type; + list_add(&item->list, &block->owner_list); + return 0; +} + +static void tcf_block_owner_del(struct tcf_block *block, + struct Qdisc *q, + enum tcf_block_binder_type binder_type) +{ + struct tcf_block_owner_item *item; + + list_for_each_entry(item, &block->owner_list, list) { + if (item->q == q && item->binder_type == binder_type) { + list_del(&item->list); + kfree(item); + return; + } + } + WARN_ON(1); +} + +int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, + struct tcf_block_ext_info *ei, + struct netlink_ext_ack *extack) +{ + struct net *net = qdisc_net(q); + struct tcf_block *block = NULL; + bool created = false; + int err; + + if (ei->block_index) { + /* block_index not 0 means the shared block is requested */ + block = tcf_block_lookup(net, ei->block_index); + if (block) + block->refcnt++; + } + + if (!block) { + block = tcf_block_create(net, q, extack); + if (IS_ERR(block)) + return PTR_ERR(block); + created = true; + if (ei->block_index) { + err = tcf_block_insert(block, net, + ei->block_index, extack); + if (err) + goto err_block_insert; + } + } + + err = tcf_block_owner_add(block, q, ei->binder_type); + if (err) + goto err_block_owner_add; + + tcf_block_owner_netif_keep_dst(block, q, ei->binder_type); + + err = tcf_chain_head_change_cb_add(tcf_block_chain_zero(block), + ei, extack); + if (err) + goto err_chain_head_change_cb_add; + + err = tcf_block_offload_bind(block, q, ei); + if (err) + goto err_block_offload_bind; + + *p_block = block; + return 0; + +err_block_offload_bind: + tcf_chain_head_change_cb_del(tcf_block_chain_zero(block), ei); +err_chain_head_change_cb_add: + tcf_block_owner_del(block, q, ei->binder_type); +err_block_owner_add: + if (created) { + if (tcf_block_shared(block)) + tcf_block_remove(block, net); +err_block_insert: + kfree(tcf_block_chain_zero(block)); + kfree(block); + } else { + block->refcnt--; + } return err; } EXPORT_SYMBOL(tcf_block_get_ext); @@ -318,7 +574,8 @@ static void tcf_chain_head_change_dflt(struct tcf_proto *tp_head, void *priv) } int tcf_block_get(struct tcf_block **p_block, - struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q) + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, + struct netlink_ext_ack *extack) { struct tcf_block_ext_info ei = { .chain_head_change = tcf_chain_head_change_dflt, @@ -326,44 +583,47 @@ int tcf_block_get(struct tcf_block **p_block, }; WARN_ON(!p_filter_chain); - return tcf_block_get_ext(p_block, q, &ei); + return tcf_block_get_ext(p_block, q, &ei, extack); } EXPORT_SYMBOL(tcf_block_get); -static void tcf_block_put_final(struct work_struct *work) -{ - struct tcf_block *block = container_of(work, struct tcf_block, work); - struct tcf_chain *chain, *tmp; - - rtnl_lock(); - /* Only chain 0 should be still here. */ - list_for_each_entry_safe(chain, tmp, &block->chain_list, list) - tcf_chain_put(chain); - rtnl_unlock(); - kfree(block); -} - /* XXX: Standalone actions are not allowed to jump to any chain, and bound - * actions should be all removed after flushing. However, filters are now - * destroyed in tc filter workqueue with RTNL lock, they can not race here. + * actions should be all removed after flushing. */ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei) { struct tcf_chain *chain, *tmp; - list_for_each_entry_safe(chain, tmp, &block->chain_list, list) - tcf_chain_flush(chain); + if (!block) + return; + tcf_chain_head_change_cb_del(tcf_block_chain_zero(block), ei); + tcf_block_owner_del(block, q, ei->binder_type); + + if (--block->refcnt == 0) { + if (tcf_block_shared(block)) + tcf_block_remove(block, block->net); + + /* Hold a refcnt for all chains, so that they don't disappear + * while we are iterating. + */ + list_for_each_entry(chain, &block->chain_list, list) + tcf_chain_hold(chain); + + list_for_each_entry(chain, &block->chain_list, list) + tcf_chain_flush(chain); + } tcf_block_offload_unbind(block, q, ei); - INIT_WORK(&block->work, tcf_block_put_final); - /* Wait for existing RCU callbacks to cool down, make sure their works - * have been queued before this. We can not flush pending works here - * because we are holding the RTNL lock. - */ - rcu_barrier(); - tcf_queue_work(&block->work); + if (block->refcnt == 0) { + /* At this point, all the chains should have refcnt >= 1. */ + list_for_each_entry_safe(chain, tmp, &block->chain_list, list) + tcf_chain_put(chain); + + /* Finally, put chain 0 and allow block to be freed. */ + tcf_chain_put(tcf_block_chain_zero(block)); + } } EXPORT_SYMBOL(tcf_block_put_ext); @@ -421,9 +681,16 @@ struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, { struct tcf_block_cb *block_cb; + /* At this point, playback of previous block cb calls is not supported, + * so forbid to register to block which already has some offloaded + * filters present. + */ + if (tcf_block_offload_in_use(block)) + return ERR_PTR(-EOPNOTSUPP); + block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL); if (!block_cb) - return NULL; + return ERR_PTR(-ENOMEM); block_cb->cb = cb; block_cb->cb_ident = cb_ident; block_cb->cb_priv = cb_priv; @@ -439,7 +706,7 @@ int tcf_block_cb_register(struct tcf_block *block, struct tcf_block_cb *block_cb; block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv); - return block_cb ? 0 : -ENOMEM; + return IS_ERR(block_cb) ? PTR_ERR(block_cb) : 0; } EXPORT_SYMBOL(tcf_block_cb_register); @@ -469,6 +736,10 @@ static int tcf_block_cb_call(struct tcf_block *block, enum tc_setup_type type, int ok_count = 0; int err; + /* Make sure all netdevs sharing this block are offload-capable. */ + if (block->nooffloaddevcnt && err_stop) + return -EOPNOTSUPP; + list_for_each_entry(block_cb, &block->cb_list, list) { err = block_cb->cb(type, type_data, block_cb->cb_priv); if (err) { @@ -522,8 +793,9 @@ reclassify: #ifdef CONFIG_NET_CLS_ACT reset: if (unlikely(limit++ >= max_reclassify_loop)) { - net_notice_ratelimited("%s: reclassify loop, rule prio %u, protocol %02x\n", - tp->q->ops->id, tp->prio & 0xffff, + net_notice_ratelimited("%u: reclassify loop, rule prio %u, protocol %02x\n", + tp->chain->block->index, + tp->prio & 0xffff, ntohs(tp->protocol)); return TC_ACT_SHOT; } @@ -596,8 +868,9 @@ static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain, } static int tcf_fill_node(struct net *net, struct sk_buff *skb, - struct tcf_proto *tp, struct Qdisc *q, u32 parent, - void *fh, u32 portid, u32 seq, u16 flags, int event) + struct tcf_proto *tp, struct tcf_block *block, + struct Qdisc *q, u32 parent, void *fh, + u32 portid, u32 seq, u16 flags, int event) { struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -610,8 +883,13 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, tcm->tcm_family = AF_UNSPEC; tcm->tcm__pad1 = 0; tcm->tcm__pad2 = 0; - tcm->tcm_ifindex = qdisc_dev(q)->ifindex; - tcm->tcm_parent = parent; + if (q) { + tcm->tcm_ifindex = qdisc_dev(q)->ifindex; + tcm->tcm_parent = parent; + } else { + tcm->tcm_ifindex = TCM_IFINDEX_MAGIC_BLOCK; + tcm->tcm_block_index = block->index; + } tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); if (nla_put_string(skb, TCA_KIND, tp->ops->kind)) goto nla_put_failure; @@ -634,8 +912,8 @@ nla_put_failure: static int tfilter_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, - struct Qdisc *q, u32 parent, - void *fh, int event, bool unicast) + struct tcf_block *block, struct Qdisc *q, + u32 parent, void *fh, int event, bool unicast) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; @@ -644,8 +922,8 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, if (!skb) return -ENOBUFS; - if (tcf_fill_node(net, skb, tp, q, parent, fh, portid, n->nlmsg_seq, - n->nlmsg_flags, event) <= 0) { + if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, + n->nlmsg_seq, n->nlmsg_flags, event) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -659,8 +937,9 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, - struct Qdisc *q, u32 parent, - void *fh, bool unicast, bool *last) + struct tcf_block *block, struct Qdisc *q, + u32 parent, void *fh, bool unicast, bool *last, + struct netlink_ext_ack *extack) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; @@ -670,13 +949,14 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, if (!skb) return -ENOBUFS; - if (tcf_fill_node(net, skb, tp, q, parent, fh, portid, n->nlmsg_seq, - n->nlmsg_flags, RTM_DELTFILTER) <= 0) { + if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, + n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER) <= 0) { + NL_SET_ERR_MSG(extack, "Failed to build del event notification"); kfree_skb(skb); return -EINVAL; } - err = tp->ops->delete(tp, fh, last); + err = tp->ops->delete(tp, fh, last, extack); if (err) { kfree_skb(skb); return err; @@ -685,20 +965,24 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, if (unicast) return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT); - return rtnetlink_send(skb, net, portid, RTNLGRP_TC, - n->nlmsg_flags & NLM_F_ECHO); + err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); + if (err < 0) + NL_SET_ERR_MSG(extack, "Failed to send filter delete notification"); + return err; } static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb, - struct Qdisc *q, u32 parent, - struct nlmsghdr *n, + struct tcf_block *block, struct Qdisc *q, + u32 parent, struct nlmsghdr *n, struct tcf_chain *chain, int event) { struct tcf_proto *tp; for (tp = rtnl_dereference(chain->filter_chain); tp; tp = rtnl_dereference(tp->next)) - tfilter_notify(net, oskb, n, tp, q, parent, 0, event, false); + tfilter_notify(net, oskb, n, tp, block, + q, parent, 0, event, false); } /* Add/change/delete/get a filter node */ @@ -714,13 +998,11 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, bool prio_allocate; u32 parent; u32 chain_index; - struct net_device *dev; - struct Qdisc *q; + struct Qdisc *q = NULL; struct tcf_chain_info chain_info; struct tcf_chain *chain = NULL; struct tcf_block *block; struct tcf_proto *tp; - const struct Qdisc_class_ops *cops; unsigned long cl; void *fh; int err; @@ -747,8 +1029,10 @@ replay: if (prio == 0) { switch (n->nlmsg_type) { case RTM_DELTFILTER: - if (protocol || t->tcm_handle || tca[TCA_KIND]) + if (protocol || t->tcm_handle || tca[TCA_KIND]) { + NL_SET_ERR_MSG(extack, "Cannot flush filters with protocol, handle or kind set"); return -ENOENT; + } break; case RTM_NEWTFILTER: /* If no priority is provided by the user, @@ -761,63 +1045,91 @@ replay: } /* fall-through */ default: + NL_SET_ERR_MSG(extack, "Invalid filter command with priority of zero"); return -ENOENT; } } /* Find head of filter chain. */ - /* Find link */ - dev = __dev_get_by_index(net, t->tcm_ifindex); - if (dev == NULL) - return -ENODEV; - - /* Find qdisc */ - if (!parent) { - q = dev->qdisc; - parent = q->handle; + if (t->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) { + block = tcf_block_lookup(net, t->tcm_block_index); + if (!block) { + NL_SET_ERR_MSG(extack, "Block of given index was not found"); + err = -EINVAL; + goto errout; + } } else { - q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent)); - if (q == NULL) - return -EINVAL; - } + const struct Qdisc_class_ops *cops; + struct net_device *dev; - /* Is it classful? */ - cops = q->ops->cl_ops; - if (!cops) - return -EINVAL; + /* Find link */ + dev = __dev_get_by_index(net, t->tcm_ifindex); + if (!dev) + return -ENODEV; - if (!cops->tcf_block) - return -EOPNOTSUPP; + /* Find qdisc */ + if (!parent) { + q = dev->qdisc; + parent = q->handle; + } else { + q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent)); + if (!q) { + NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists"); + return -EINVAL; + } + } - /* Do we search for filter, attached to class? */ - if (TC_H_MIN(parent)) { - cl = cops->find(q, parent); - if (cl == 0) - return -ENOENT; - } + /* Is it classful? */ + cops = q->ops->cl_ops; + if (!cops) { + NL_SET_ERR_MSG(extack, "Qdisc not classful"); + return -EINVAL; + } - /* And the last stroke */ - block = cops->tcf_block(q, cl); - if (!block) { - err = -EINVAL; - goto errout; + if (!cops->tcf_block) { + NL_SET_ERR_MSG(extack, "Class doesn't support blocks"); + return -EOPNOTSUPP; + } + + /* Do we search for filter, attached to class? */ + if (TC_H_MIN(parent)) { + cl = cops->find(q, parent); + if (cl == 0) { + NL_SET_ERR_MSG(extack, "Specified class doesn't exist"); + return -ENOENT; + } + } + + /* And the last stroke */ + block = cops->tcf_block(q, cl, extack); + if (!block) { + err = -EINVAL; + goto errout; + } + if (tcf_block_shared(block)) { + NL_SET_ERR_MSG(extack, "This filter block is shared. Please use the block index to manipulate the filters"); + err = -EOPNOTSUPP; + goto errout; + } } chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0; if (chain_index > TC_ACT_EXT_VAL_MASK) { + NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit"); err = -EINVAL; goto errout; } chain = tcf_chain_get(block, chain_index, n->nlmsg_type == RTM_NEWTFILTER); if (!chain) { + NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); err = n->nlmsg_type == RTM_NEWTFILTER ? -ENOMEM : -EINVAL; goto errout; } if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) { - tfilter_notify_chain(net, skb, q, parent, n, + tfilter_notify_chain(net, skb, block, q, parent, n, chain, RTM_DELTFILTER); tcf_chain_flush(chain); err = 0; @@ -827,6 +1139,7 @@ replay: tp = tcf_chain_tp_find(chain, &chain_info, protocol, prio, prio_allocate); if (IS_ERR(tp)) { + NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found"); err = PTR_ERR(tp); goto errout; } @@ -835,12 +1148,14 @@ replay: /* Proto-tcf does not exist, create new one */ if (tca[TCA_KIND] == NULL || !protocol) { + NL_SET_ERR_MSG(extack, "Filter kind and protocol must be specified"); err = -EINVAL; goto errout; } if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags & NLM_F_CREATE)) { + NL_SET_ERR_MSG(extack, "Need both RTM_NEWTFILTER and NLM_F_CREATE to create a new filter"); err = -ENOENT; goto errout; } @@ -849,13 +1164,14 @@ replay: prio = tcf_auto_prio(tcf_chain_tp_prev(&chain_info)); tp = tcf_proto_create(nla_data(tca[TCA_KIND]), - protocol, prio, parent, q, chain); + protocol, prio, chain, extack); if (IS_ERR(tp)) { err = PTR_ERR(tp); goto errout; } tp_created = 1; } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) { + NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one"); err = -EINVAL; goto errout; } @@ -865,15 +1181,16 @@ replay: if (!fh) { if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) { tcf_chain_tp_remove(chain, &chain_info, tp); - tfilter_notify(net, skb, n, tp, q, parent, fh, + tfilter_notify(net, skb, n, tp, block, q, parent, fh, RTM_DELTFILTER, false); - tcf_proto_destroy(tp); + tcf_proto_destroy(tp, extack); err = 0; goto errout; } if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags & NLM_F_CREATE)) { + NL_SET_ERR_MSG(extack, "Need both RTM_NEWTFILTER and NLM_F_CREATE to create a new filter"); err = -ENOENT; goto errout; } @@ -884,41 +1201,47 @@ replay: case RTM_NEWTFILTER: if (n->nlmsg_flags & NLM_F_EXCL) { if (tp_created) - tcf_proto_destroy(tp); + tcf_proto_destroy(tp, NULL); + NL_SET_ERR_MSG(extack, "Filter already exists"); err = -EEXIST; goto errout; } break; case RTM_DELTFILTER: - err = tfilter_del_notify(net, skb, n, tp, q, parent, - fh, false, &last); + err = tfilter_del_notify(net, skb, n, tp, block, + q, parent, fh, false, &last, + extack); if (err) goto errout; if (last) { tcf_chain_tp_remove(chain, &chain_info, tp); - tcf_proto_destroy(tp); + tcf_proto_destroy(tp, extack); } goto errout; case RTM_GETTFILTER: - err = tfilter_notify(net, skb, n, tp, q, parent, fh, - RTM_NEWTFILTER, true); + err = tfilter_notify(net, skb, n, tp, block, q, parent, + fh, RTM_NEWTFILTER, true); + if (err < 0) + NL_SET_ERR_MSG(extack, "Failed to send filter notify message"); goto errout; default: + NL_SET_ERR_MSG(extack, "Invalid netlink message type"); err = -EINVAL; goto errout; } } err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, - n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE); + n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE, + extack); if (err == 0) { if (tp_created) tcf_chain_tp_insert(chain, &chain_info, tp); - tfilter_notify(net, skb, n, tp, q, parent, fh, + tfilter_notify(net, skb, n, tp, block, q, parent, fh, RTM_NEWTFILTER, false); } else { if (tp_created) - tcf_proto_destroy(tp); + tcf_proto_destroy(tp, NULL); } errout: @@ -934,6 +1257,7 @@ struct tcf_dump_args { struct tcf_walker w; struct sk_buff *skb; struct netlink_callback *cb; + struct tcf_block *block; struct Qdisc *q; u32 parent; }; @@ -943,7 +1267,7 @@ static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg) struct tcf_dump_args *a = (void *)arg; struct net *net = sock_net(a->skb->sk); - return tcf_fill_node(net, a->skb, tp, a->q, a->parent, + return tcf_fill_node(net, a->skb, tp, a->block, a->q, a->parent, n, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER); @@ -954,6 +1278,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, long index_start, long *p_index) { struct net *net = sock_net(skb->sk); + struct tcf_block *block = chain->block; struct tcmsg *tcm = nlmsg_data(cb->nlh); struct tcf_dump_args arg; struct tcf_proto *tp; @@ -972,7 +1297,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); if (cb->args[1] == 0) { - if (tcf_fill_node(net, skb, tp, q, parent, 0, + if (tcf_fill_node(net, skb, tp, block, q, parent, 0, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER) <= 0) @@ -985,6 +1310,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, arg.w.fn = tcf_node_dump; arg.skb = skb; arg.cb = cb; + arg.block = block; arg.q = q; arg.parent = parent; arg.w.stop = 0; @@ -1003,13 +1329,10 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; - struct net_device *dev; - struct Qdisc *q; + struct Qdisc *q = NULL; struct tcf_block *block; struct tcf_chain *chain; struct tcmsg *tcm = nlmsg_data(cb->nlh); - unsigned long cl = 0; - const struct Qdisc_class_ops *cops; long index_start; long index; u32 parent; @@ -1022,32 +1345,51 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) if (err) return err; - dev = __dev_get_by_index(net, tcm->tcm_ifindex); - if (!dev) - return skb->len; - - parent = tcm->tcm_parent; - if (!parent) { - q = dev->qdisc; - parent = q->handle; + if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) { + block = tcf_block_lookup(net, tcm->tcm_block_index); + if (!block) + goto out; + /* If we work with block index, q is NULL and parent value + * will never be used in the following code. The check + * in tcf_fill_node prevents it. However, compiler does not + * see that far, so set parent to zero to silence the warning + * about parent being uninitialized. + */ + parent = 0; } else { - q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); - } - if (!q) - goto out; - cops = q->ops->cl_ops; - if (!cops) - goto out; - if (!cops->tcf_block) - goto out; - if (TC_H_MIN(tcm->tcm_parent)) { - cl = cops->find(q, tcm->tcm_parent); - if (cl == 0) + const struct Qdisc_class_ops *cops; + struct net_device *dev; + unsigned long cl = 0; + + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) + return skb->len; + + parent = tcm->tcm_parent; + if (!parent) { + q = dev->qdisc; + parent = q->handle; + } else { + q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); + } + if (!q) + goto out; + cops = q->ops->cl_ops; + if (!cops) + goto out; + if (!cops->tcf_block) + goto out; + if (TC_H_MIN(tcm->tcm_parent)) { + cl = cops->find(q, tcm->tcm_parent); + if (cl == 0) + goto out; + } + block = cops->tcf_block(q, cl, NULL); + if (!block) goto out; + if (tcf_block_shared(block)) + q = NULL; } - block = cops->tcf_block(q, cl); - if (!block) - goto out; index_start = cb->args[0]; index = 0; @@ -1082,7 +1424,8 @@ void tcf_exts_destroy(struct tcf_exts *exts) EXPORT_SYMBOL(tcf_exts_destroy); int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, - struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr) + struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr, + struct netlink_ext_ack *extack) { #ifdef CONFIG_NET_CLS_ACT { @@ -1115,8 +1458,10 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, } #else if ((exts->action && tb[exts->action]) || - (exts->police && tb[exts->police])) + (exts->police && tb[exts->police])) { + NL_SET_ERR_MSG(extack, "Classifier actions are not supported per compile options (CONFIG_NET_CLS_ACT)"); return -EOPNOTSUPP; + } #endif return 0; @@ -1250,18 +1595,50 @@ int tc_setup_cb_call(struct tcf_block *block, struct tcf_exts *exts, } EXPORT_SYMBOL(tc_setup_cb_call); +static __net_init int tcf_net_init(struct net *net) +{ + struct tcf_net *tn = net_generic(net, tcf_net_id); + + idr_init(&tn->idr); + return 0; +} + +static void __net_exit tcf_net_exit(struct net *net) +{ + struct tcf_net *tn = net_generic(net, tcf_net_id); + + idr_destroy(&tn->idr); +} + +static struct pernet_operations tcf_net_ops = { + .init = tcf_net_init, + .exit = tcf_net_exit, + .id = &tcf_net_id, + .size = sizeof(struct tcf_net), +}; + static int __init tc_filter_init(void) { + int err; + tc_filter_wq = alloc_ordered_workqueue("tc_filter_workqueue", 0); if (!tc_filter_wq) return -ENOMEM; + err = register_pernet_subsys(&tcf_net_ops); + if (err) + goto err_register_pernet_subsys; + rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter, tc_dump_tfilter, 0); return 0; + +err_register_pernet_subsys: + destroy_workqueue(tc_filter_wq); + return err; } subsys_initcall(tc_filter_init); diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 5f169ded347e..d333f5c5101d 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -112,7 +112,7 @@ static void basic_delete_filter(struct rcu_head *head) tcf_queue_work(&f->work); } -static void basic_destroy(struct tcf_proto *tp) +static void basic_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct basic_head *head = rtnl_dereference(tp->root); struct basic_filter *f, *n; @@ -130,7 +130,8 @@ static void basic_destroy(struct tcf_proto *tp) kfree_rcu(head, rcu); } -static int basic_delete(struct tcf_proto *tp, void *arg, bool *last) +static int basic_delete(struct tcf_proto *tp, void *arg, bool *last, + struct netlink_ext_ack *extack) { struct basic_head *head = rtnl_dereference(tp->root); struct basic_filter *f = arg; @@ -152,11 +153,12 @@ static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = { static int basic_set_parms(struct net *net, struct tcf_proto *tp, struct basic_filter *f, unsigned long base, struct nlattr **tb, - struct nlattr *est, bool ovr) + struct nlattr *est, bool ovr, + struct netlink_ext_ack *extack) { int err; - err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr); + err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, extack); if (err < 0) return err; @@ -175,7 +177,8 @@ static int basic_set_parms(struct net *net, struct tcf_proto *tp, static int basic_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, void **arg, bool ovr) + struct nlattr **tca, void **arg, bool ovr, + struct netlink_ext_ack *extack) { int err; struct basic_head *head = rtnl_dereference(tp->root); @@ -221,7 +224,8 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, fnew->handle = idr_index; } - err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr); + err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr, + extack); if (err < 0) { if (!fold) idr_remove_ext(&head->handle_idr, fnew->handle); diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index a9f3e317055c..8e5326bc6440 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -42,7 +42,6 @@ struct cls_bpf_prog { struct list_head link; struct tcf_result res; bool exts_integrated; - bool offloaded; u32 gen_flags; struct tcf_exts exts; u32 handle; @@ -148,99 +147,95 @@ static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog) } static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, - enum tc_clsbpf_command cmd) + struct cls_bpf_prog *oldprog, + struct netlink_ext_ack *extack) { - bool addorrep = cmd == TC_CLSBPF_ADD || cmd == TC_CLSBPF_REPLACE; struct tcf_block *block = tp->chain->block; - bool skip_sw = tc_skip_sw(prog->gen_flags); struct tc_cls_bpf_offload cls_bpf = {}; + struct cls_bpf_prog *obj; + bool skip_sw; int err; - tc_cls_common_offload_init(&cls_bpf.common, tp); - cls_bpf.command = cmd; - cls_bpf.exts = &prog->exts; - cls_bpf.prog = prog->filter; - cls_bpf.name = prog->bpf_name; - cls_bpf.exts_integrated = prog->exts_integrated; - cls_bpf.gen_flags = prog->gen_flags; + skip_sw = prog && tc_skip_sw(prog->gen_flags); + obj = prog ?: oldprog; + + tc_cls_common_offload_init(&cls_bpf.common, tp, obj->gen_flags, + extack); + cls_bpf.command = TC_CLSBPF_OFFLOAD; + cls_bpf.exts = &obj->exts; + cls_bpf.prog = prog ? prog->filter : NULL; + cls_bpf.oldprog = oldprog ? oldprog->filter : NULL; + cls_bpf.name = obj->bpf_name; + cls_bpf.exts_integrated = obj->exts_integrated; + + if (oldprog) + tcf_block_offload_dec(block, &oldprog->gen_flags); err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSBPF, &cls_bpf, skip_sw); - if (addorrep) { + if (prog) { if (err < 0) { - cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_DESTROY); + cls_bpf_offload_cmd(tp, oldprog, prog, extack); return err; } else if (err > 0) { - prog->gen_flags |= TCA_CLS_FLAGS_IN_HW; + tcf_block_offload_inc(block, &prog->gen_flags); } } - if (addorrep && skip_sw && !(prog->gen_flags & TCA_CLS_FLAGS_IN_HW)) + if (prog && skip_sw && !(prog->gen_flags & TCA_CLS_FLAGS_IN_HW)) return -EINVAL; return 0; } -static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, - struct cls_bpf_prog *oldprog) +static u32 cls_bpf_flags(u32 flags) { - struct cls_bpf_prog *obj = prog; - enum tc_clsbpf_command cmd; - bool skip_sw; - int ret; - - skip_sw = tc_skip_sw(prog->gen_flags) || - (oldprog && tc_skip_sw(oldprog->gen_flags)); - - if (oldprog && oldprog->offloaded) { - if (!tc_skip_hw(prog->gen_flags)) { - cmd = TC_CLSBPF_REPLACE; - } else if (!tc_skip_sw(prog->gen_flags)) { - obj = oldprog; - cmd = TC_CLSBPF_DESTROY; - } else { - return -EINVAL; - } - } else { - if (tc_skip_hw(prog->gen_flags)) - return skip_sw ? -EINVAL : 0; - cmd = TC_CLSBPF_ADD; - } + return flags & CLS_BPF_SUPPORTED_GEN_FLAGS; +} - ret = cls_bpf_offload_cmd(tp, obj, cmd); - if (ret) - return ret; +static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, + struct cls_bpf_prog *oldprog, + struct netlink_ext_ack *extack) +{ + if (prog && oldprog && + cls_bpf_flags(prog->gen_flags) != + cls_bpf_flags(oldprog->gen_flags)) + return -EINVAL; - obj->offloaded = true; - if (oldprog) - oldprog->offloaded = false; + if (prog && tc_skip_hw(prog->gen_flags)) + prog = NULL; + if (oldprog && tc_skip_hw(oldprog->gen_flags)) + oldprog = NULL; + if (!prog && !oldprog) + return 0; - return 0; + return cls_bpf_offload_cmd(tp, prog, oldprog, extack); } static void cls_bpf_stop_offload(struct tcf_proto *tp, - struct cls_bpf_prog *prog) + struct cls_bpf_prog *prog, + struct netlink_ext_ack *extack) { int err; - if (!prog->offloaded) - return; - - err = cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_DESTROY); - if (err) { + err = cls_bpf_offload_cmd(tp, NULL, prog, extack); + if (err) pr_err("Stopping hardware offload failed: %d\n", err); - return; - } - - prog->offloaded = false; } static void cls_bpf_offload_update_stats(struct tcf_proto *tp, struct cls_bpf_prog *prog) { - if (!prog->offloaded) - return; + struct tcf_block *block = tp->chain->block; + struct tc_cls_bpf_offload cls_bpf = {}; - cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_STATS); + tc_cls_common_offload_init(&cls_bpf.common, tp, prog->gen_flags, NULL); + cls_bpf.command = TC_CLSBPF_STATS; + cls_bpf.exts = &prog->exts; + cls_bpf.prog = prog->filter; + cls_bpf.name = prog->bpf_name; + cls_bpf.exts_integrated = prog->exts_integrated; + + tc_setup_cb_call(block, NULL, TC_SETUP_CLSBPF, &cls_bpf, false); } static int cls_bpf_init(struct tcf_proto *tp) @@ -258,11 +253,8 @@ static int cls_bpf_init(struct tcf_proto *tp) return 0; } -static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog) +static void cls_bpf_free_parms(struct cls_bpf_prog *prog) { - tcf_exts_destroy(&prog->exts); - tcf_exts_put_net(&prog->exts); - if (cls_bpf_is_ebpf(prog)) bpf_prog_put(prog->filter); else @@ -270,6 +262,14 @@ static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog) kfree(prog->bpf_name); kfree(prog->bpf_ops); +} + +static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog) +{ + tcf_exts_destroy(&prog->exts); + tcf_exts_put_net(&prog->exts); + + cls_bpf_free_parms(prog); kfree(prog); } @@ -290,12 +290,13 @@ static void cls_bpf_delete_prog_rcu(struct rcu_head *rcu) tcf_queue_work(&prog->work); } -static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog) +static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog, + struct netlink_ext_ack *extack) { struct cls_bpf_head *head = rtnl_dereference(tp->root); idr_remove_ext(&head->handle_idr, prog->handle); - cls_bpf_stop_offload(tp, prog); + cls_bpf_stop_offload(tp, prog, extack); list_del_rcu(&prog->link); tcf_unbind_filter(tp, &prog->res); if (tcf_exts_get_net(&prog->exts)) @@ -304,22 +305,24 @@ static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog) __cls_bpf_delete_prog(prog); } -static int cls_bpf_delete(struct tcf_proto *tp, void *arg, bool *last) +static int cls_bpf_delete(struct tcf_proto *tp, void *arg, bool *last, + struct netlink_ext_ack *extack) { struct cls_bpf_head *head = rtnl_dereference(tp->root); - __cls_bpf_delete(tp, arg); + __cls_bpf_delete(tp, arg, extack); *last = list_empty(&head->plist); return 0; } -static void cls_bpf_destroy(struct tcf_proto *tp) +static void cls_bpf_destroy(struct tcf_proto *tp, + struct netlink_ext_ack *extack) { struct cls_bpf_head *head = rtnl_dereference(tp->root); struct cls_bpf_prog *prog, *tmp; list_for_each_entry_safe(prog, tmp, &head->plist, link) - __cls_bpf_delete(tp, prog); + __cls_bpf_delete(tp, prog, extack); idr_destroy(&head->handle_idr); kfree_rcu(head, rcu); @@ -404,15 +407,16 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog, prog->bpf_name = name; prog->filter = fp; - if (fp->dst_needed && !(tp->q->flags & TCQ_F_INGRESS)) - netif_keep_dst(qdisc_dev(tp->q)); + if (fp->dst_needed) + tcf_block_netif_keep_dst(tp->chain->block); return 0; } static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp, struct cls_bpf_prog *prog, unsigned long base, - struct nlattr **tb, struct nlattr *est, bool ovr) + struct nlattr **tb, struct nlattr *est, bool ovr, + struct netlink_ext_ack *extack) { bool is_bpf, is_ebpf, have_exts = false; u32 gen_flags = 0; @@ -423,7 +427,7 @@ static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp, if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf)) return -EINVAL; - ret = tcf_exts_validate(net, tp, tb, est, &prog->exts, ovr); + ret = tcf_exts_validate(net, tp, tb, est, &prog->exts, ovr, extack); if (ret < 0) return ret; @@ -461,7 +465,7 @@ static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp, static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr) + void **arg, bool ovr, struct netlink_ext_ack *extack) { struct cls_bpf_head *head = rtnl_dereference(tp->root); struct cls_bpf_prog *oldprog = *arg; @@ -509,17 +513,14 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, prog->handle = handle; } - ret = cls_bpf_set_parms(net, tp, prog, base, tb, tca[TCA_RATE], ovr); + ret = cls_bpf_set_parms(net, tp, prog, base, tb, tca[TCA_RATE], ovr, + extack); if (ret < 0) goto errout_idr; - ret = cls_bpf_offload(tp, prog, oldprog); - if (ret) { - if (!oldprog) - idr_remove_ext(&head->handle_idr, prog->handle); - __cls_bpf_delete_prog(prog); - return ret; - } + ret = cls_bpf_offload(tp, prog, oldprog, extack); + if (ret) + goto errout_parms; if (!tc_in_hw(prog->gen_flags)) prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW; @@ -537,6 +538,8 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, *arg = prog; return 0; +errout_parms: + cls_bpf_free_parms(prog); errout_idr: if (!oldprog) idr_remove_ext(&head->handle_idr, prog->handle); diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 309d5899265f..762da5c0cf5e 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -91,7 +91,8 @@ static void cls_cgroup_destroy_rcu(struct rcu_head *root) static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr) + void **arg, bool ovr, + struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_CGROUP_MAX + 1]; struct cls_cgroup_head *head = rtnl_dereference(tp->root); @@ -121,7 +122,8 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, if (err < 0) goto errout; - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &new->exts, ovr); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &new->exts, ovr, + extack); if (err < 0) goto errout; @@ -141,7 +143,8 @@ errout: return err; } -static void cls_cgroup_destroy(struct tcf_proto *tp) +static void cls_cgroup_destroy(struct tcf_proto *tp, + struct netlink_ext_ack *extack) { struct cls_cgroup_head *head = rtnl_dereference(tp->root); @@ -154,7 +157,8 @@ static void cls_cgroup_destroy(struct tcf_proto *tp) } } -static int cls_cgroup_delete(struct tcf_proto *tp, void *arg, bool *last) +static int cls_cgroup_delete(struct tcf_proto *tp, void *arg, bool *last, + struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 25c2a888e1f0..cd5fe383afdd 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -401,7 +401,7 @@ static void flow_destroy_filter(struct rcu_head *head) static int flow_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr) + void **arg, bool ovr, struct netlink_ext_ack *extack) { struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *fold, *fnew; @@ -454,7 +454,8 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, if (err < 0) goto err2; - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &fnew->exts, ovr); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &fnew->exts, ovr, + extack); if (err < 0) goto err2; @@ -526,7 +527,7 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, timer_setup(&fnew->perturb_timer, flow_perturbation, TIMER_DEFERRABLE); - netif_keep_dst(qdisc_dev(tp->q)); + tcf_block_netif_keep_dst(tp->chain->block); if (tb[TCA_FLOW_KEYS]) { fnew->keymask = keymask; @@ -574,7 +575,8 @@ err1: return err; } -static int flow_delete(struct tcf_proto *tp, void *arg, bool *last) +static int flow_delete(struct tcf_proto *tp, void *arg, bool *last, + struct netlink_ext_ack *extack) { struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *f = arg; @@ -598,7 +600,7 @@ static int flow_init(struct tcf_proto *tp) return 0; } -static void flow_destroy(struct tcf_proto *tp) +static void flow_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *f, *next; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 543a3e875d05..dc9acaafc0a8 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -166,6 +166,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, * so do it rather here. */ skb_key.basic.n_proto = skb->protocol; + skb_flow_dissect_tunnel_info(skb, &head->dissector, &skb_key); skb_flow_dissect(skb, &head->dissector, &skb_key, 0); fl_set_masked_key(&skb_mkey, &skb_key, &head->mask); @@ -217,30 +218,33 @@ static void fl_destroy_filter(struct rcu_head *head) tcf_queue_work(&f->work); } -static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f) +static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f, + struct netlink_ext_ack *extack) { struct tc_cls_flower_offload cls_flower = {}; struct tcf_block *block = tp->chain->block; - tc_cls_common_offload_init(&cls_flower.common, tp); + tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack); cls_flower.command = TC_CLSFLOWER_DESTROY; cls_flower.cookie = (unsigned long) f; tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER, &cls_flower, false); + tcf_block_offload_dec(block, &f->flags); } static int fl_hw_replace_filter(struct tcf_proto *tp, struct flow_dissector *dissector, struct fl_flow_key *mask, - struct cls_fl_filter *f) + struct cls_fl_filter *f, + struct netlink_ext_ack *extack) { struct tc_cls_flower_offload cls_flower = {}; struct tcf_block *block = tp->chain->block; bool skip_sw = tc_skip_sw(f->flags); int err; - tc_cls_common_offload_init(&cls_flower.common, tp); + tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack); cls_flower.command = TC_CLSFLOWER_REPLACE; cls_flower.cookie = (unsigned long) f; cls_flower.dissector = dissector; @@ -252,10 +256,10 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, err = tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER, &cls_flower, skip_sw); if (err < 0) { - fl_hw_destroy_filter(tp, f); + fl_hw_destroy_filter(tp, f, NULL); return err; } else if (err > 0) { - f->flags |= TCA_CLS_FLAGS_IN_HW; + tcf_block_offload_inc(block, &f->flags); } if (skip_sw && !(f->flags & TCA_CLS_FLAGS_IN_HW)) @@ -269,7 +273,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) struct tc_cls_flower_offload cls_flower = {}; struct tcf_block *block = tp->chain->block; - tc_cls_common_offload_init(&cls_flower.common, tp); + tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, NULL); cls_flower.command = TC_CLSFLOWER_STATS; cls_flower.cookie = (unsigned long) f; cls_flower.exts = &f->exts; @@ -279,14 +283,15 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) &cls_flower, false); } -static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f) +static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f, + struct netlink_ext_ack *extack) { struct cls_fl_head *head = rtnl_dereference(tp->root); idr_remove_ext(&head->handle_idr, f->handle); list_del_rcu(&f->list); if (!tc_skip_hw(f->flags)) - fl_hw_destroy_filter(tp, f); + fl_hw_destroy_filter(tp, f, extack); tcf_unbind_filter(tp, &f->res); if (tcf_exts_get_net(&f->exts)) call_rcu(&f->rcu, fl_destroy_filter); @@ -312,13 +317,13 @@ static void fl_destroy_rcu(struct rcu_head *rcu) schedule_work(&head->work); } -static void fl_destroy(struct tcf_proto *tp) +static void fl_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *f, *next; list_for_each_entry_safe(f, next, &head->filters, list) - __fl_delete(tp, f); + __fl_delete(tp, f, extack); idr_destroy(&head->handle_idr); __module_get(THIS_MODULE); @@ -524,13 +529,14 @@ static void fl_set_key_ip(struct nlattr **tb, } static int fl_set_key(struct net *net, struct nlattr **tb, - struct fl_flow_key *key, struct fl_flow_key *mask) + struct fl_flow_key *key, struct fl_flow_key *mask, + struct netlink_ext_ack *extack) { __be16 ethertype; int ret = 0; #ifdef CONFIG_NET_CLS_IND if (tb[TCA_FLOWER_INDEV]) { - int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV]); + int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV], extack); if (err < 0) return err; key->indev_ifindex = err; @@ -825,11 +831,12 @@ static int fl_check_assign_mask(struct cls_fl_head *head, static int fl_set_parms(struct net *net, struct tcf_proto *tp, struct cls_fl_filter *f, struct fl_flow_mask *mask, unsigned long base, struct nlattr **tb, - struct nlattr *est, bool ovr) + struct nlattr *est, bool ovr, + struct netlink_ext_ack *extack) { int err; - err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr); + err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, extack); if (err < 0) return err; @@ -838,7 +845,7 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp, tcf_bind_filter(tp, &f->res, base); } - err = fl_set_key(net, tb, &f->key, &mask->key); + err = fl_set_key(net, tb, &f->key, &mask->key, extack); if (err) return err; @@ -851,7 +858,7 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp, static int fl_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr) + void **arg, bool ovr, struct netlink_ext_ack *extack) { struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *fold = *arg; @@ -914,7 +921,8 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, } } - err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr); + err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr, + extack); if (err) goto errout_idr; @@ -938,7 +946,8 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, err = fl_hw_replace_filter(tp, &head->dissector, &mask.key, - fnew); + fnew, + extack); if (err) goto errout_idr; } @@ -951,7 +960,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, rhashtable_remove_fast(&head->ht, &fold->ht_node, head->ht_params); if (!tc_skip_hw(fold->flags)) - fl_hw_destroy_filter(tp, fold); + fl_hw_destroy_filter(tp, fold, NULL); } *arg = fnew; @@ -981,7 +990,8 @@ errout_tb: return err; } -static int fl_delete(struct tcf_proto *tp, void *arg, bool *last) +static int fl_delete(struct tcf_proto *tp, void *arg, bool *last, + struct netlink_ext_ack *extack) { struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *f = arg; @@ -989,7 +999,7 @@ static int fl_delete(struct tcf_proto *tp, void *arg, bool *last) if (!tc_skip_sw(f->flags)) rhashtable_remove_fast(&head->ht, &f->ht_node, head->ht_params); - __fl_delete(tp, f); + __fl_delete(tp, f, extack); *last = list_empty(&head->filters); return 0; } diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 20f0de1a960a..8b207723fbc2 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -149,7 +149,7 @@ static void fw_delete_filter(struct rcu_head *head) tcf_queue_work(&f->work); } -static void fw_destroy(struct tcf_proto *tp) +static void fw_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f; @@ -172,7 +172,8 @@ static void fw_destroy(struct tcf_proto *tp) kfree_rcu(head, rcu); } -static int fw_delete(struct tcf_proto *tp, void *arg, bool *last) +static int fw_delete(struct tcf_proto *tp, void *arg, bool *last, + struct netlink_ext_ack *extack) { struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f = arg; @@ -218,13 +219,15 @@ static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = { static int fw_set_parms(struct net *net, struct tcf_proto *tp, struct fw_filter *f, struct nlattr **tb, - struct nlattr **tca, unsigned long base, bool ovr) + struct nlattr **tca, unsigned long base, bool ovr, + struct netlink_ext_ack *extack) { struct fw_head *head = rtnl_dereference(tp->root); u32 mask; int err; - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &f->exts, ovr); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &f->exts, ovr, + extack); if (err < 0) return err; @@ -236,7 +239,7 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp, #ifdef CONFIG_NET_CLS_IND if (tb[TCA_FW_INDEV]) { int ret; - ret = tcf_change_indev(net, tb[TCA_FW_INDEV]); + ret = tcf_change_indev(net, tb[TCA_FW_INDEV], extack); if (ret < 0) return ret; f->ifindex = ret; @@ -257,7 +260,7 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp, static int fw_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, void **arg, - bool ovr) + bool ovr, struct netlink_ext_ack *extack) { struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f = *arg; @@ -296,7 +299,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, return err; } - err = fw_set_parms(net, tp, fnew, tb, tca, base, ovr); + err = fw_set_parms(net, tp, fnew, tb, tca, base, ovr, extack); if (err < 0) { tcf_exts_destroy(&fnew->exts); kfree(fnew); @@ -345,7 +348,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, f->id = handle; f->tp = tp; - err = fw_set_parms(net, tp, f, tb, tca, base, ovr); + err = fw_set_parms(net, tp, f, tb, tca, base, ovr, extack); if (err < 0) goto errout; diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 66d4e0099158..2ba721a590a7 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -71,28 +71,31 @@ static void mall_destroy_rcu(struct rcu_head *rcu) static void mall_destroy_hw_filter(struct tcf_proto *tp, struct cls_mall_head *head, - unsigned long cookie) + unsigned long cookie, + struct netlink_ext_ack *extack) { struct tc_cls_matchall_offload cls_mall = {}; struct tcf_block *block = tp->chain->block; - tc_cls_common_offload_init(&cls_mall.common, tp); + tc_cls_common_offload_init(&cls_mall.common, tp, head->flags, extack); cls_mall.command = TC_CLSMATCHALL_DESTROY; cls_mall.cookie = cookie; tc_setup_cb_call(block, NULL, TC_SETUP_CLSMATCHALL, &cls_mall, false); + tcf_block_offload_dec(block, &head->flags); } static int mall_replace_hw_filter(struct tcf_proto *tp, struct cls_mall_head *head, - unsigned long cookie) + unsigned long cookie, + struct netlink_ext_ack *extack) { struct tc_cls_matchall_offload cls_mall = {}; struct tcf_block *block = tp->chain->block; bool skip_sw = tc_skip_sw(head->flags); int err; - tc_cls_common_offload_init(&cls_mall.common, tp); + tc_cls_common_offload_init(&cls_mall.common, tp, head->flags, extack); cls_mall.command = TC_CLSMATCHALL_REPLACE; cls_mall.exts = &head->exts; cls_mall.cookie = cookie; @@ -100,10 +103,10 @@ static int mall_replace_hw_filter(struct tcf_proto *tp, err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSMATCHALL, &cls_mall, skip_sw); if (err < 0) { - mall_destroy_hw_filter(tp, head, cookie); + mall_destroy_hw_filter(tp, head, cookie, NULL); return err; } else if (err > 0) { - head->flags |= TCA_CLS_FLAGS_IN_HW; + tcf_block_offload_inc(block, &head->flags); } if (skip_sw && !(head->flags & TCA_CLS_FLAGS_IN_HW)) @@ -112,7 +115,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp, return 0; } -static void mall_destroy(struct tcf_proto *tp) +static void mall_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct cls_mall_head *head = rtnl_dereference(tp->root); @@ -120,7 +123,7 @@ static void mall_destroy(struct tcf_proto *tp) return; if (!tc_skip_hw(head->flags)) - mall_destroy_hw_filter(tp, head, (unsigned long) head); + mall_destroy_hw_filter(tp, head, (unsigned long) head, extack); if (tcf_exts_get_net(&head->exts)) call_rcu(&head->rcu, mall_destroy_rcu); @@ -141,11 +144,12 @@ static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = { static int mall_set_parms(struct net *net, struct tcf_proto *tp, struct cls_mall_head *head, unsigned long base, struct nlattr **tb, - struct nlattr *est, bool ovr) + struct nlattr *est, bool ovr, + struct netlink_ext_ack *extack) { int err; - err = tcf_exts_validate(net, tp, tb, est, &head->exts, ovr); + err = tcf_exts_validate(net, tp, tb, est, &head->exts, ovr, extack); if (err < 0) return err; @@ -159,7 +163,7 @@ static int mall_set_parms(struct net *net, struct tcf_proto *tp, static int mall_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr) + void **arg, bool ovr, struct netlink_ext_ack *extack) { struct cls_mall_head *head = rtnl_dereference(tp->root); struct nlattr *tb[TCA_MATCHALL_MAX + 1]; @@ -197,12 +201,14 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, new->handle = handle; new->flags = flags; - err = mall_set_parms(net, tp, new, base, tb, tca[TCA_RATE], ovr); + err = mall_set_parms(net, tp, new, base, tb, tca[TCA_RATE], ovr, + extack); if (err) goto err_set_parms; if (!tc_skip_hw(new->flags)) { - err = mall_replace_hw_filter(tp, new, (unsigned long) new); + err = mall_replace_hw_filter(tp, new, (unsigned long)new, + extack); if (err) goto err_replace_hw_filter; } @@ -222,7 +228,8 @@ err_exts_init: return err; } -static int mall_delete(struct tcf_proto *tp, void *arg, bool *last) +static int mall_delete(struct tcf_proto *tp, void *arg, bool *last, + struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index ac9a5b8825b9..21a03a8ee029 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -281,7 +281,7 @@ static void route4_delete_filter(struct rcu_head *head) tcf_queue_work(&f->work); } -static void route4_destroy(struct tcf_proto *tp) +static void route4_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct route4_head *head = rtnl_dereference(tp->root); int h1, h2; @@ -316,7 +316,8 @@ static void route4_destroy(struct tcf_proto *tp) kfree_rcu(head, rcu); } -static int route4_delete(struct tcf_proto *tp, void *arg, bool *last) +static int route4_delete(struct tcf_proto *tp, void *arg, bool *last, + struct netlink_ext_ack *extack) { struct route4_head *head = rtnl_dereference(tp->root); struct route4_filter *f = arg; @@ -389,7 +390,7 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, struct route4_filter *f, u32 handle, struct route4_head *head, struct nlattr **tb, struct nlattr *est, int new, - bool ovr) + bool ovr, struct netlink_ext_ack *extack) { u32 id = 0, to = 0, nhandle = 0x8000; struct route4_filter *fp; @@ -397,7 +398,7 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, struct route4_bucket *b; int err; - err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr); + err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, extack); if (err < 0) return err; @@ -471,7 +472,8 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, static int route4_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, void **arg, bool ovr) + struct nlattr **tca, void **arg, bool ovr, + struct netlink_ext_ack *extack) { struct route4_head *head = rtnl_dereference(tp->root); struct route4_filter __rcu **fp; @@ -515,7 +517,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, } err = route4_set_parms(net, tp, base, f, handle, head, tb, - tca[TCA_RATE], new, ovr); + tca[TCA_RATE], new, ovr, extack); if (err < 0) goto errout; @@ -527,7 +529,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, if (f->handle < f1->handle) break; - netif_keep_dst(qdisc_dev(tp->q)); + tcf_block_netif_keep_dst(tp->chain->block); rcu_assign_pointer(f->next, f1); rcu_assign_pointer(*fp, f); diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index cf325625c99d..4f1297657c27 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -322,7 +322,7 @@ static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) __rsvp_delete_filter(f); } -static void rsvp_destroy(struct tcf_proto *tp) +static void rsvp_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct rsvp_head *data = rtnl_dereference(tp->root); int h1, h2; @@ -350,7 +350,8 @@ static void rsvp_destroy(struct tcf_proto *tp) kfree_rcu(data, rcu); } -static int rsvp_delete(struct tcf_proto *tp, void *arg, bool *last) +static int rsvp_delete(struct tcf_proto *tp, void *arg, bool *last, + struct netlink_ext_ack *extack) { struct rsvp_head *head = rtnl_dereference(tp->root); struct rsvp_filter *nfp, *f = arg; @@ -486,7 +487,7 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr) + void **arg, bool ovr, struct netlink_ext_ack *extack) { struct rsvp_head *data = rtnl_dereference(tp->root); struct rsvp_filter *f, *nfp; @@ -511,7 +512,7 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, err = tcf_exts_init(&e, TCA_RSVP_ACT, TCA_RSVP_POLICE); if (err < 0) return err; - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr, extack); if (err < 0) goto errout2; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 67467ae24c97..b49cc990a000 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -193,7 +193,8 @@ static void tcindex_destroy_fexts(struct rcu_head *head) tcf_queue_work(&f->work); } -static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last) +static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last, + struct netlink_ext_ack *extack) { struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter_result *r = arg; @@ -246,7 +247,7 @@ static int tcindex_destroy_element(struct tcf_proto *tp, { bool last; - return tcindex_delete(tp, arg, &last); + return tcindex_delete(tp, arg, &last, NULL); } static void __tcindex_destroy(struct rcu_head *head) @@ -322,7 +323,7 @@ static int tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, u32 handle, struct tcindex_data *p, struct tcindex_filter_result *r, struct nlattr **tb, - struct nlattr *est, bool ovr) + struct nlattr *est, bool ovr, struct netlink_ext_ack *extack) { struct tcindex_filter_result new_filter_result, *old_r = r; struct tcindex_filter_result cr; @@ -334,7 +335,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, err = tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); if (err < 0) return err; - err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + err = tcf_exts_validate(net, tp, tb, est, &e, ovr, extack); if (err < 0) goto errout; @@ -520,7 +521,8 @@ errout: static int tcindex_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, void **arg, bool ovr) + struct nlattr **tca, void **arg, bool ovr, + struct netlink_ext_ack *extack) { struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_TCINDEX_MAX + 1]; @@ -540,7 +542,7 @@ tcindex_change(struct net *net, struct sk_buff *in_skb, return err; return tcindex_set_parms(net, tp, base, handle, p, r, tb, - tca[TCA_RATE], ovr); + tca[TCA_RATE], ovr, extack); } static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) @@ -579,7 +581,8 @@ static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) } } -static void tcindex_destroy(struct tcf_proto *tp) +static void tcindex_destroy(struct tcf_proto *tp, + struct netlink_ext_ack *extack) { struct tcindex_data *p = rtnl_dereference(tp->root); struct tcf_walker walker; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index ac152b4f4247..e3c5e390ec23 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -45,7 +45,6 @@ #include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> -#include <linux/netdevice.h> #include <linux/idr.h> struct tc_u_knode { @@ -88,6 +87,7 @@ struct tc_u_hnode { unsigned int divisor; struct idr handle_idr; struct rcu_head rcu; + u32 flags; /* The 'ht' field MUST be the last field in structure to allow for * more entries allocated at end of structure. */ @@ -487,12 +487,13 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) return 0; } -static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h) +static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, + struct netlink_ext_ack *extack) { struct tcf_block *block = tp->chain->block; struct tc_cls_u32_offload cls_u32 = {}; - tc_cls_common_offload_init(&cls_u32.common, tp); + tc_cls_common_offload_init(&cls_u32.common, tp, h->flags, extack); cls_u32.command = TC_CLSU32_DELETE_HNODE; cls_u32.hnode.divisor = h->divisor; cls_u32.hnode.handle = h->handle; @@ -502,7 +503,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h) } static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, - u32 flags) + u32 flags, struct netlink_ext_ack *extack) { struct tcf_block *block = tp->chain->block; struct tc_cls_u32_offload cls_u32 = {}; @@ -510,7 +511,7 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, bool offloaded = false; int err; - tc_cls_common_offload_init(&cls_u32.common, tp); + tc_cls_common_offload_init(&cls_u32.common, tp, flags, extack); cls_u32.command = TC_CLSU32_NEW_HNODE; cls_u32.hnode.divisor = h->divisor; cls_u32.hnode.handle = h->handle; @@ -518,7 +519,7 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, skip_sw); if (err < 0) { - u32_clear_hw_hnode(tp, h); + u32_clear_hw_hnode(tp, h, NULL); return err; } else if (err > 0) { offloaded = true; @@ -530,27 +531,30 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, return 0; } -static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle) +static void u32_remove_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, + struct netlink_ext_ack *extack) { struct tcf_block *block = tp->chain->block; struct tc_cls_u32_offload cls_u32 = {}; - tc_cls_common_offload_init(&cls_u32.common, tp); + tc_cls_common_offload_init(&cls_u32.common, tp, n->flags, extack); cls_u32.command = TC_CLSU32_DELETE_KNODE; - cls_u32.knode.handle = handle; + cls_u32.knode.handle = n->handle; tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, false); + tcf_block_offload_dec(block, &n->flags); } static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, - u32 flags) + u32 flags, struct netlink_ext_ack *extack) { + struct tc_u_hnode *ht = rtnl_dereference(n->ht_down); struct tcf_block *block = tp->chain->block; struct tc_cls_u32_offload cls_u32 = {}; bool skip_sw = tc_skip_sw(flags); int err; - tc_cls_common_offload_init(&cls_u32.common, tp); + tc_cls_common_offload_init(&cls_u32.common, tp, flags, extack); cls_u32.command = TC_CLSU32_REPLACE_KNODE; cls_u32.knode.handle = n->handle; cls_u32.knode.fshift = n->fshift; @@ -564,14 +568,14 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, cls_u32.knode.sel = &n->sel; cls_u32.knode.exts = &n->exts; if (n->ht_down) - cls_u32.knode.link_handle = n->ht_down->handle; + cls_u32.knode.link_handle = ht->handle; err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, skip_sw); if (err < 0) { - u32_remove_hw_knode(tp, n->handle); + u32_remove_hw_knode(tp, n, NULL); return err; } else if (err > 0) { - n->flags |= TCA_CLS_FLAGS_IN_HW; + tcf_block_offload_inc(block, &n->flags); } if (skip_sw && !(n->flags & TCA_CLS_FLAGS_IN_HW)) @@ -580,7 +584,8 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, return 0; } -static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) +static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, + struct netlink_ext_ack *extack) { struct tc_u_knode *n; unsigned int h; @@ -590,7 +595,7 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) RCU_INIT_POINTER(ht->ht[h], rtnl_dereference(n->next)); tcf_unbind_filter(tp, &n->res); - u32_remove_hw_knode(tp, n->handle); + u32_remove_hw_knode(tp, n, extack); idr_remove_ext(&ht->handle_idr, n->handle); if (tcf_exts_get_net(&n->exts)) call_rcu(&n->rcu, u32_delete_key_freepf_rcu); @@ -600,7 +605,8 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) } } -static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) +static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, + struct netlink_ext_ack *extack) { struct tc_u_common *tp_c = tp->data; struct tc_u_hnode __rcu **hn; @@ -608,14 +614,14 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) WARN_ON(ht->refcnt); - u32_clear_hnode(tp, ht); + u32_clear_hnode(tp, ht, extack); hn = &tp_c->hlist; for (phn = rtnl_dereference(*hn); phn; hn = &phn->next, phn = rtnl_dereference(*hn)) { if (phn == ht) { - u32_clear_hw_hnode(tp, ht); + u32_clear_hw_hnode(tp, ht, extack); idr_destroy(&ht->handle_idr); idr_remove_ext(&tp_c->handle_idr, ht->handle); RCU_INIT_POINTER(*hn, ht->next); @@ -638,7 +644,7 @@ static bool ht_empty(struct tc_u_hnode *ht) return true; } -static void u32_destroy(struct tcf_proto *tp) +static void u32_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct tc_u_common *tp_c = tp->data; struct tc_u_hnode *root_ht = rtnl_dereference(tp->root); @@ -646,7 +652,7 @@ static void u32_destroy(struct tcf_proto *tp) WARN_ON(root_ht == NULL); if (root_ht && --root_ht->refcnt == 0) - u32_destroy_hnode(tp, root_ht); + u32_destroy_hnode(tp, root_ht, extack); if (--tp_c->refcnt == 0) { struct tc_u_hnode *ht; @@ -657,7 +663,7 @@ static void u32_destroy(struct tcf_proto *tp) ht; ht = rtnl_dereference(ht->next)) { ht->refcnt--; - u32_clear_hnode(tp, ht); + u32_clear_hnode(tp, ht, extack); } while ((ht = rtnl_dereference(tp_c->hlist)) != NULL) { @@ -672,7 +678,8 @@ static void u32_destroy(struct tcf_proto *tp) tp->data = NULL; } -static int u32_delete(struct tcf_proto *tp, void *arg, bool *last) +static int u32_delete(struct tcf_proto *tp, void *arg, bool *last, + struct netlink_ext_ack *extack) { struct tc_u_hnode *ht = arg; struct tc_u_hnode *root_ht = rtnl_dereference(tp->root); @@ -683,18 +690,21 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last) goto out; if (TC_U32_KEY(ht->handle)) { - u32_remove_hw_knode(tp, ht->handle); + u32_remove_hw_knode(tp, (struct tc_u_knode *)ht, extack); ret = u32_delete_key(tp, (struct tc_u_knode *)ht); goto out; } - if (root_ht == ht) + if (root_ht == ht) { + NL_SET_ERR_MSG_MOD(extack, "Not allowed to delete root node"); return -EINVAL; + } if (ht->refcnt == 1) { ht->refcnt--; - u32_destroy_hnode(tp, ht); + u32_destroy_hnode(tp, ht, extack); } else { + NL_SET_ERR_MSG_MOD(extack, "Can not delete in-use filter"); return -EBUSY; } @@ -765,11 +775,12 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { static int u32_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, struct tc_u_hnode *ht, struct tc_u_knode *n, struct nlattr **tb, - struct nlattr *est, bool ovr) + struct nlattr *est, bool ovr, + struct netlink_ext_ack *extack) { int err; - err = tcf_exts_validate(net, tp, tb, est, &n->exts, ovr); + err = tcf_exts_validate(net, tp, tb, est, &n->exts, ovr, extack); if (err < 0) return err; @@ -777,14 +788,18 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp, u32 handle = nla_get_u32(tb[TCA_U32_LINK]); struct tc_u_hnode *ht_down = NULL, *ht_old; - if (TC_U32_KEY(handle)) + if (TC_U32_KEY(handle)) { + NL_SET_ERR_MSG_MOD(extack, "u32 Link handle must be a hash table"); return -EINVAL; + } if (handle) { ht_down = u32_lookup_ht(ht->tp_c, handle); - if (ht_down == NULL) + if (!ht_down) { + NL_SET_ERR_MSG_MOD(extack, "Link hash table not found"); return -EINVAL; + } ht_down->refcnt++; } @@ -802,7 +817,7 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp, #ifdef CONFIG_NET_CLS_IND if (tb[TCA_U32_INDEV]) { int ret; - ret = tcf_change_indev(net, tb[TCA_U32_INDEV]); + ret = tcf_change_indev(net, tb[TCA_U32_INDEV], extack); if (ret < 0) return -EINVAL; n->ifindex = ret; @@ -841,8 +856,9 @@ static void u32_replace_knode(struct tcf_proto *tp, struct tc_u_common *tp_c, static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp, struct tc_u_knode *n) { - struct tc_u_knode *new; + struct tc_u_hnode *ht = rtnl_dereference(n->ht_down); struct tc_u32_sel *s = &n->sel; + struct tc_u_knode *new; new = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL); @@ -860,11 +876,11 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp, new->fshift = n->fshift; new->res = n->res; new->flags = n->flags; - RCU_INIT_POINTER(new->ht_down, n->ht_down); + RCU_INIT_POINTER(new->ht_down, ht); /* bump reference count as long as we hold pointer to structure */ - if (new->ht_down) - new->ht_down->refcnt++; + if (ht) + ht->refcnt++; #ifdef CONFIG_CLS_U32_PERF /* Statistics may be incremented by readers during update @@ -893,7 +909,8 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp, static int u32_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, void **arg, bool ovr) + struct nlattr **tca, void **arg, bool ovr, + struct netlink_ext_ack *extack) { struct tc_u_common *tp_c = tp->data; struct tc_u_hnode *ht; @@ -907,28 +924,40 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, size_t size; #endif - if (opt == NULL) - return handle ? -EINVAL : 0; + if (!opt) { + if (handle) { + NL_SET_ERR_MSG_MOD(extack, "Filter handle requires options"); + return -EINVAL; + } else { + return 0; + } + } - err = nla_parse_nested(tb, TCA_U32_MAX, opt, u32_policy, NULL); + err = nla_parse_nested(tb, TCA_U32_MAX, opt, u32_policy, extack); if (err < 0) return err; if (tb[TCA_U32_FLAGS]) { flags = nla_get_u32(tb[TCA_U32_FLAGS]); - if (!tc_flags_valid(flags)) + if (!tc_flags_valid(flags)) { + NL_SET_ERR_MSG_MOD(extack, "Invalid filter flags"); return -EINVAL; + } } n = *arg; if (n) { struct tc_u_knode *new; - if (TC_U32_KEY(n->handle) == 0) + if (TC_U32_KEY(n->handle) == 0) { + NL_SET_ERR_MSG_MOD(extack, "Key node id cannot be zero"); return -EINVAL; + } - if (n->flags != flags) + if (n->flags != flags) { + NL_SET_ERR_MSG_MOD(extack, "Key node flags do not match passed flags"); return -EINVAL; + } new = u32_init_knode(tp, n); if (!new) @@ -936,14 +965,14 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, err = u32_set_parms(net, tp, base, rtnl_dereference(n->ht_up), new, tb, - tca[TCA_RATE], ovr); + tca[TCA_RATE], ovr, extack); if (err) { u32_destroy_key(tp, new, false); return err; } - err = u32_replace_hw_knode(tp, new, flags); + err = u32_replace_hw_knode(tp, new, flags, extack); if (err) { u32_destroy_key(tp, new, false); return err; @@ -962,10 +991,14 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (tb[TCA_U32_DIVISOR]) { unsigned int divisor = nla_get_u32(tb[TCA_U32_DIVISOR]); - if (--divisor > 0x100) + if (--divisor > 0x100) { + NL_SET_ERR_MSG_MOD(extack, "Exceeded maximum 256 hash buckets"); return -EINVAL; - if (TC_U32_KEY(handle)) + } + if (TC_U32_KEY(handle)) { + NL_SET_ERR_MSG_MOD(extack, "Divisor can only be used on a hash table"); return -EINVAL; + } ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL); if (ht == NULL) return -ENOBUFS; @@ -989,8 +1022,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, ht->handle = handle; ht->prio = tp->prio; idr_init(&ht->handle_idr); + ht->flags = flags; - err = u32_replace_hw_hnode(tp, ht, flags); + err = u32_replace_hw_hnode(tp, ht, flags, extack); if (err) { idr_remove_ext(&tp_c->handle_idr, handle); kfree(ht); @@ -1011,20 +1045,26 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, htid = ht->handle; } else { ht = u32_lookup_ht(tp->data, TC_U32_HTID(htid)); - if (ht == NULL) + if (!ht) { + NL_SET_ERR_MSG_MOD(extack, "Specified hash table not found"); return -EINVAL; + } } } else { ht = rtnl_dereference(tp->root); htid = ht->handle; } - if (ht->divisor < TC_U32_HASH(htid)) + if (ht->divisor < TC_U32_HASH(htid)) { + NL_SET_ERR_MSG_MOD(extack, "Specified hash table buckets exceed configured value"); return -EINVAL; + } if (handle) { - if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid)) + if (TC_U32_HTID(handle) && TC_U32_HTID(handle ^ htid)) { + NL_SET_ERR_MSG_MOD(extack, "Handle specified hash table address mismatch"); return -EINVAL; + } handle = htid | TC_U32_NODE(handle); err = idr_alloc_ext(&ht->handle_idr, NULL, NULL, handle, handle + 1, @@ -1035,6 +1075,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, handle = gen_new_kid(ht, htid); if (tb[TCA_U32_SEL] == NULL) { + NL_SET_ERR_MSG_MOD(extack, "Selector not specified"); err = -EINVAL; goto erridr; } @@ -1083,12 +1124,13 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, } #endif - err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE], ovr); + err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE], ovr, + extack); if (err == 0) { struct tc_u_knode __rcu **ins; struct tc_u_knode *pins; - err = u32_replace_hw_knode(tp, n, flags); + err = u32_replace_hw_knode(tp, n, flags, extack); if (err) goto errhw; diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c index df3110d69585..07c10bac06a0 100644 --- a/net/sched/em_nbyte.c +++ b/net/sched/em_nbyte.c @@ -51,7 +51,7 @@ static int em_nbyte_match(struct sk_buff *skb, struct tcf_ematch *em, if (!tcf_valid_offset(skb, ptr, nbyte->hdr.len)) return 0; - return !memcmp(ptr + nbyte->hdr.off, nbyte->pattern, nbyte->hdr.len); + return !memcmp(ptr, nbyte->pattern, nbyte->hdr.len); } static struct tcf_ematch_ops em_nbyte_ops = { diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index b6c4f536876b..d512f49ee83c 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -393,13 +393,16 @@ static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab) static struct qdisc_rate_table *qdisc_rtab_list; struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, - struct nlattr *tab) + struct nlattr *tab, + struct netlink_ext_ack *extack) { struct qdisc_rate_table *rtab; if (tab == NULL || r->rate == 0 || r->cell_log == 0 || - nla_len(tab) != TC_RTAB_SIZE) + nla_len(tab) != TC_RTAB_SIZE) { + NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching"); return NULL; + } for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) { if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) && @@ -418,6 +421,8 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, r->linklayer = __detect_linklayer(r, rtab->data); rtab->next = qdisc_rtab_list; qdisc_rtab_list = rtab; + } else { + NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table"); } return rtab; } @@ -449,7 +454,8 @@ static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = { [TCA_STAB_DATA] = { .type = NLA_BINARY }, }; -static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt) +static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt, + struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_STAB_MAX + 1]; struct qdisc_size_table *stab; @@ -458,23 +464,29 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt) u16 *tab = NULL; int err; - err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, NULL); + err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, extack); if (err < 0) return ERR_PTR(err); - if (!tb[TCA_STAB_BASE]) + if (!tb[TCA_STAB_BASE]) { + NL_SET_ERR_MSG(extack, "Size table base attribute is missing"); return ERR_PTR(-EINVAL); + } s = nla_data(tb[TCA_STAB_BASE]); if (s->tsize > 0) { - if (!tb[TCA_STAB_DATA]) + if (!tb[TCA_STAB_DATA]) { + NL_SET_ERR_MSG(extack, "Size table data attribute is missing"); return ERR_PTR(-EINVAL); + } tab = nla_data(tb[TCA_STAB_DATA]); tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16); } - if (tsize != s->tsize || (!tab && tsize > 0)) + if (tsize != s->tsize || (!tab && tsize > 0)) { + NL_SET_ERR_MSG(extack, "Invalid size of size table"); return ERR_PTR(-EINVAL); + } list_for_each_entry(stab, &qdisc_stab_list, list) { if (memcmp(&stab->szopts, s, sizeof(*s))) @@ -669,7 +681,7 @@ int qdisc_class_hash_init(struct Qdisc_class_hash *clhash) unsigned int size = 4; clhash->hash = qdisc_class_hash_alloc(size); - if (clhash->hash == NULL) + if (!clhash->hash) return -ENOMEM; clhash->hashsize = size; clhash->hashmask = size - 1; @@ -779,6 +791,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, unsigned char *b = skb_tail_pointer(skb); struct gnet_dump d; struct qdisc_size_table *stab; + u32 block_index; __u32 qlen; cond_resched(); @@ -795,9 +808,23 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, tcm->tcm_info = refcount_read(&q->refcnt); if (nla_put_string(skb, TCA_KIND, q->ops->id)) goto nla_put_failure; + if (q->ops->ingress_block_get) { + block_index = q->ops->ingress_block_get(q); + if (block_index && + nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index)) + goto nla_put_failure; + } + if (q->ops->egress_block_get) { + block_index = q->ops->egress_block_get(q); + if (block_index && + nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index)) + goto nla_put_failure; + } if (q->ops->dump && q->ops->dump(q, skb) < 0) goto nla_put_failure; - qlen = q->q.qlen; + if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED))) + goto nla_put_failure; + qlen = qdisc_qlen_sum(q); stab = rtnl_dereference(q->stab); if (stab && qdisc_dump_stab(skb, stab) < 0) @@ -896,7 +923,8 @@ static void notify_and_destroy(struct net *net, struct sk_buff *skb, static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, struct sk_buff *skb, struct nlmsghdr *n, u32 classid, - struct Qdisc *new, struct Qdisc *old) + struct Qdisc *new, struct Qdisc *old, + struct netlink_ext_ack *extack) { struct Qdisc *q = old; struct net *net = dev_net(dev); @@ -911,8 +939,10 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, (new && new->flags & TCQ_F_INGRESS)) { num_q = 1; ingress = 1; - if (!dev_ingress_queue(dev)) + if (!dev_ingress_queue(dev)) { + NL_SET_ERR_MSG(extack, "Device does not have an ingress queue"); return -ENOENT; + } } if (dev->flags & IFF_UP) @@ -954,14 +984,22 @@ skip: } else { const struct Qdisc_class_ops *cops = parent->ops->cl_ops; + /* Only support running class lockless if parent is lockless */ + if (new && (new->flags & TCQ_F_NOLOCK) && + parent && !(parent->flags & TCQ_F_NOLOCK)) + new->flags &= ~TCQ_F_NOLOCK; + err = -EOPNOTSUPP; if (cops && cops->graft) { unsigned long cl = cops->find(parent, classid); - if (cl) - err = cops->graft(parent, cl, new, &old); - else + if (cl) { + err = cops->graft(parent, cl, new, &old, + extack); + } else { + NL_SET_ERR_MSG(extack, "Specified class not found"); err = -ENOENT; + } } if (!err) notify_and_destroy(net, skb, n, classid, old, new); @@ -969,6 +1007,40 @@ skip: return err; } +static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca, + struct netlink_ext_ack *extack) +{ + u32 block_index; + + if (tca[TCA_INGRESS_BLOCK]) { + block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]); + + if (!block_index) { + NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0"); + return -EINVAL; + } + if (!sch->ops->ingress_block_set) { + NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported"); + return -EOPNOTSUPP; + } + sch->ops->ingress_block_set(sch, block_index); + } + if (tca[TCA_EGRESS_BLOCK]) { + block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]); + + if (!block_index) { + NL_SET_ERR_MSG(extack, "Egress block index cannot be 0"); + return -EINVAL; + } + if (!sch->ops->egress_block_set) { + NL_SET_ERR_MSG(extack, "Egress block sharing is not supported"); + return -EOPNOTSUPP; + } + sch->ops->egress_block_set(sch, block_index); + } + return 0; +} + /* lockdep annotation is needed for ingress; egress gets it only for name */ static struct lock_class_key qdisc_tx_lock; static struct lock_class_key qdisc_rx_lock; @@ -982,7 +1054,8 @@ static struct lock_class_key qdisc_rx_lock; static struct Qdisc *qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, struct Qdisc *p, u32 parent, u32 handle, - struct nlattr **tca, int *errp) + struct nlattr **tca, int *errp, + struct netlink_ext_ack *extack) { int err; struct nlattr *kind = tca[TCA_KIND]; @@ -1020,10 +1093,12 @@ static struct Qdisc *qdisc_create(struct net_device *dev, #endif err = -ENOENT; - if (ops == NULL) + if (!ops) { + NL_SET_ERR_MSG(extack, "Specified qdisc not found"); goto err_out; + } - sch = qdisc_alloc(dev_queue, ops); + sch = qdisc_alloc(dev_queue, ops, extack); if (IS_ERR(sch)) { err = PTR_ERR(sch); goto err_out2; @@ -1060,60 +1135,63 @@ static struct Qdisc *qdisc_create(struct net_device *dev, netdev_info(dev, "Caught tx_queue_len zero misconfig\n"); } - if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) { - if (qdisc_is_percpu_stats(sch)) { - sch->cpu_bstats = - netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); - if (!sch->cpu_bstats) - goto err_out4; + err = qdisc_block_indexes_set(sch, tca, extack); + if (err) + goto err_out3; + + if (ops->init) { + err = ops->init(sch, tca[TCA_OPTIONS], extack); + if (err != 0) + goto err_out5; + } - sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue); - if (!sch->cpu_qstats) - goto err_out4; + if (tca[TCA_STAB]) { + stab = qdisc_get_stab(tca[TCA_STAB], extack); + if (IS_ERR(stab)) { + err = PTR_ERR(stab); + goto err_out4; } + rcu_assign_pointer(sch->stab, stab); + } + if (tca[TCA_RATE]) { + seqcount_t *running; - if (tca[TCA_STAB]) { - stab = qdisc_get_stab(tca[TCA_STAB]); - if (IS_ERR(stab)) { - err = PTR_ERR(stab); - goto err_out4; - } - rcu_assign_pointer(sch->stab, stab); + err = -EOPNOTSUPP; + if (sch->flags & TCQ_F_MQROOT) { + NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc"); + goto err_out4; } - if (tca[TCA_RATE]) { - seqcount_t *running; - - err = -EOPNOTSUPP; - if (sch->flags & TCQ_F_MQROOT) - goto err_out4; - - if ((sch->parent != TC_H_ROOT) && - !(sch->flags & TCQ_F_INGRESS) && - (!p || !(p->flags & TCQ_F_MQROOT))) - running = qdisc_root_sleeping_running(sch); - else - running = &sch->running; - - err = gen_new_estimator(&sch->bstats, - sch->cpu_bstats, - &sch->rate_est, - NULL, - running, - tca[TCA_RATE]); - if (err) - goto err_out4; + + if (sch->parent != TC_H_ROOT && + !(sch->flags & TCQ_F_INGRESS) && + (!p || !(p->flags & TCQ_F_MQROOT))) + running = qdisc_root_sleeping_running(sch); + else + running = &sch->running; + + err = gen_new_estimator(&sch->bstats, + sch->cpu_bstats, + &sch->rate_est, + NULL, + running, + tca[TCA_RATE]); + if (err) { + NL_SET_ERR_MSG(extack, "Failed to generate new estimator"); + goto err_out4; } + } - qdisc_hash_add(sch, false); + qdisc_hash_add(sch, false); - return sch; - } + return sch; + +err_out5: /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */ if (ops->destroy) ops->destroy(sch); err_out3: dev_put(dev); - kfree((char *) sch - sch->padded); + qdisc_free(sch); err_out2: module_put(ops->owner); err_out: @@ -1121,8 +1199,6 @@ err_out: return NULL; err_out4: - free_percpu(sch->cpu_bstats); - free_percpu(sch->cpu_qstats); /* * Any broken qdiscs that would require a ops->reset() here? * The qdisc was never in action so it shouldn't be necessary. @@ -1133,21 +1209,28 @@ err_out4: goto err_out3; } -static int qdisc_change(struct Qdisc *sch, struct nlattr **tca) +static int qdisc_change(struct Qdisc *sch, struct nlattr **tca, + struct netlink_ext_ack *extack) { struct qdisc_size_table *ostab, *stab = NULL; int err = 0; if (tca[TCA_OPTIONS]) { - if (sch->ops->change == NULL) + if (!sch->ops->change) { + NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc"); return -EINVAL; - err = sch->ops->change(sch, tca[TCA_OPTIONS]); + } + if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) { + NL_SET_ERR_MSG(extack, "Change of blocks is not supported"); + return -EOPNOTSUPP; + } + err = sch->ops->change(sch, tca[TCA_OPTIONS], extack); if (err) return err; } if (tca[TCA_STAB]) { - stab = qdisc_get_stab(tca[TCA_STAB]); + stab = qdisc_get_stab(tca[TCA_STAB], extack); if (IS_ERR(stab)) return PTR_ERR(stab); } @@ -1245,8 +1328,10 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, if (clid != TC_H_ROOT) { if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) { p = qdisc_lookup(dev, TC_H_MAJ(clid)); - if (!p) + if (!p) { + NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid"); return -ENOENT; + } q = qdisc_leaf(p, clid); } else if (dev_ingress_queue(dev)) { q = dev_ingress_queue(dev)->qdisc_sleeping; @@ -1254,26 +1339,38 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, } else { q = dev->qdisc; } - if (!q) + if (!q) { + NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device"); return -ENOENT; + } - if (tcm->tcm_handle && q->handle != tcm->tcm_handle) + if (tcm->tcm_handle && q->handle != tcm->tcm_handle) { + NL_SET_ERR_MSG(extack, "Invalid handle"); return -EINVAL; + } } else { q = qdisc_lookup(dev, tcm->tcm_handle); - if (!q) + if (!q) { + NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle"); return -ENOENT; + } } - if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) + if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) { + NL_SET_ERR_MSG(extack, "Invalid qdisc name"); return -EINVAL; + } if (n->nlmsg_type == RTM_DELQDISC) { - if (!clid) + if (!clid) { + NL_SET_ERR_MSG(extack, "Classid cannot be zero"); return -EINVAL; - if (q->handle == 0) + } + if (q->handle == 0) { + NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero"); return -ENOENT; - err = qdisc_graft(dev, p, skb, n, clid, NULL, q); + } + err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack); if (err != 0) return err; } else { @@ -1319,8 +1416,10 @@ replay: if (clid != TC_H_ROOT) { if (clid != TC_H_INGRESS) { p = qdisc_lookup(dev, TC_H_MAJ(clid)); - if (!p) + if (!p) { + NL_SET_ERR_MSG(extack, "Failed to find specified qdisc"); return -ENOENT; + } q = qdisc_leaf(p, clid); } else if (dev_ingress_queue_create(dev)) { q = dev_ingress_queue(dev)->qdisc_sleeping; @@ -1335,20 +1434,31 @@ replay: if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) { if (tcm->tcm_handle) { - if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) + if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) { + NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override"); return -EEXIST; - if (TC_H_MIN(tcm->tcm_handle)) + } + if (TC_H_MIN(tcm->tcm_handle)) { + NL_SET_ERR_MSG(extack, "Invalid minor handle"); return -EINVAL; + } q = qdisc_lookup(dev, tcm->tcm_handle); if (!q) goto create_n_graft; - if (n->nlmsg_flags & NLM_F_EXCL) + if (n->nlmsg_flags & NLM_F_EXCL) { + NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override"); return -EEXIST; - if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) + } + if (tca[TCA_KIND] && + nla_strcmp(tca[TCA_KIND], q->ops->id)) { + NL_SET_ERR_MSG(extack, "Invalid qdisc name"); return -EINVAL; + } if (q == p || - (p && check_loop(q, p, 0))) + (p && check_loop(q, p, 0))) { + NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected"); return -ELOOP; + } qdisc_refcount_inc(q); goto graft; } else { @@ -1383,33 +1493,45 @@ replay: } } } else { - if (!tcm->tcm_handle) + if (!tcm->tcm_handle) { + NL_SET_ERR_MSG(extack, "Handle cannot be zero"); return -EINVAL; + } q = qdisc_lookup(dev, tcm->tcm_handle); } /* Change qdisc parameters */ - if (q == NULL) + if (!q) { + NL_SET_ERR_MSG(extack, "Specified qdisc not found"); return -ENOENT; - if (n->nlmsg_flags & NLM_F_EXCL) + } + if (n->nlmsg_flags & NLM_F_EXCL) { + NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify"); return -EEXIST; - if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) + } + if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) { + NL_SET_ERR_MSG(extack, "Invalid qdisc name"); return -EINVAL; - err = qdisc_change(q, tca); + } + err = qdisc_change(q, tca, extack); if (err == 0) qdisc_notify(net, skb, n, clid, NULL, q); return err; create_n_graft: - if (!(n->nlmsg_flags & NLM_F_CREATE)) + if (!(n->nlmsg_flags & NLM_F_CREATE)) { + NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag"); return -ENOENT; + } if (clid == TC_H_INGRESS) { - if (dev_ingress_queue(dev)) + if (dev_ingress_queue(dev)) { q = qdisc_create(dev, dev_ingress_queue(dev), p, tcm->tcm_parent, tcm->tcm_parent, - tca, &err); - else + tca, &err, extack); + } else { + NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device"); err = -ENOENT; + } } else { struct netdev_queue *dev_queue; @@ -1422,7 +1544,7 @@ create_n_graft: q = qdisc_create(dev, dev_queue, p, tcm->tcm_parent, tcm->tcm_handle, - tca, &err); + tca, &err, extack); } if (q == NULL) { if (err == -EAGAIN) @@ -1431,7 +1553,7 @@ create_n_graft: } graft: - err = qdisc_graft(dev, p, skb, n, clid, q, NULL); + err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack); if (err) { if (q) qdisc_destroy(q); @@ -1683,7 +1805,7 @@ static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid, cl = cops->find(q, portid); if (!cl) return; - block = cops->tcf_block(q, cl); + block = cops->tcf_block(q, cl, NULL); if (!block) return; list_for_each_entry(chain, &block->chain_list, list) { @@ -1827,10 +1949,15 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, } } + if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) { + NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes"); + return -EOPNOTSUPP; + } + new_cl = cl; err = -EOPNOTSUPP; if (cops->change) - err = cops->change(q, clid, portid, tca, &new_cl); + err = cops->change(q, clid, portid, tca, &new_cl, extack); if (err == 0) { tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS); /* We just create a new class, need to do reverse binding. */ @@ -1966,7 +2093,6 @@ static int psched_open(struct inode *inode, struct file *file) } static const struct file_operations psched_fops = { - .owner = THIS_MODULE, .open = psched_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 2dbd249c0b2f..cd49afca9617 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -82,7 +82,8 @@ static inline struct atm_flow_data *lookup_flow(struct Qdisc *sch, u32 classid) } static int atm_tc_graft(struct Qdisc *sch, unsigned long arg, - struct Qdisc *new, struct Qdisc **old) + struct Qdisc *new, struct Qdisc **old, + struct netlink_ext_ack *extack) { struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow = (struct atm_flow_data *)arg; @@ -191,7 +192,8 @@ static const struct nla_policy atm_policy[TCA_ATM_MAX + 1] = { }; static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, - struct nlattr **tca, unsigned long *arg) + struct nlattr **tca, unsigned long *arg, + struct netlink_ext_ack *extack) { struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow = (struct atm_flow_data *)*arg; @@ -281,13 +283,15 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, goto err_out; } - error = tcf_block_get(&flow->block, &flow->filter_list, sch); + error = tcf_block_get(&flow->block, &flow->filter_list, sch, + extack); if (error) { kfree(flow); goto err_out; } - flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid); + flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid, + extack); if (!flow->q) flow->q = &noop_qdisc; pr_debug("atm_tc_change: qdisc %p\n", flow->q); @@ -356,7 +360,8 @@ static void atm_tc_walk(struct Qdisc *sch, struct qdisc_walker *walker) } } -static struct tcf_block *atm_tc_tcf_block(struct Qdisc *sch, unsigned long cl) +static struct tcf_block *atm_tc_tcf_block(struct Qdisc *sch, unsigned long cl, + struct netlink_ext_ack *extack) { struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow = (struct atm_flow_data *)cl; @@ -531,7 +536,8 @@ static struct sk_buff *atm_tc_peek(struct Qdisc *sch) return p->link.q->ops->peek(p->link.q); } -static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt) +static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct atm_qdisc_data *p = qdisc_priv(sch); int err; @@ -541,12 +547,13 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt) INIT_LIST_HEAD(&p->link.list); list_add(&p->link.list, &p->flows); p->link.q = qdisc_create_dflt(sch->dev_queue, - &pfifo_qdisc_ops, sch->handle); + &pfifo_qdisc_ops, sch->handle, extack); if (!p->link.q) p->link.q = &noop_qdisc; pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q); - err = tcf_block_get(&p->link.block, &p->link.filter_list, sch); + err = tcf_block_get(&p->link.block, &p->link.filter_list, sch, + extack); if (err) return err; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 6361be7881f1..f42025d53cfe 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1132,7 +1132,8 @@ static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = { [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) }, }; -static int cbq_init(struct Qdisc *sch, struct nlattr *opt) +static int cbq_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct cbq_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_CBQ_MAX + 1]; @@ -1143,30 +1144,39 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt) hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); q->delay_timer.function = cbq_undelay; - if (!opt) + if (!opt) { + NL_SET_ERR_MSG(extack, "CBQ options are required for this operation"); return -EINVAL; + } - err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, NULL); + err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, extack); if (err < 0) return err; - if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL) + if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE]) { + NL_SET_ERR_MSG(extack, "Rate specification missing or incomplete"); return -EINVAL; + } r = nla_data(tb[TCA_CBQ_RATE]); - if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL) + q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB], extack); + if (!q->link.R_tab) return -EINVAL; + err = tcf_block_get(&q->link.block, &q->link.filter_list, sch, extack); + if (err) + goto put_rtab; + err = qdisc_class_hash_init(&q->clhash); if (err < 0) - goto put_rtab; + goto put_block; q->link.sibling = &q->link; q->link.common.classid = sch->handle; q->link.qdisc = sch; q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, - sch->handle); + sch->handle, NULL); if (!q->link.q) q->link.q = &noop_qdisc; else @@ -1194,6 +1204,9 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt) cbq_addprio(q, &q->link); return 0; +put_block: + tcf_block_put(q->link.block); + put_rtab: qdisc_put_rtab(q->link.R_tab); return err; @@ -1362,13 +1375,13 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, } static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, - struct Qdisc **old) + struct Qdisc **old, struct netlink_ext_ack *extack) { struct cbq_class *cl = (struct cbq_class *)arg; if (new == NULL) { - new = qdisc_create_dflt(sch->dev_queue, - &pfifo_qdisc_ops, cl->common.classid); + new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + cl->common.classid, extack); if (new == NULL) return -ENOBUFS; } @@ -1443,7 +1456,7 @@ static void cbq_destroy(struct Qdisc *sch) static int cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca, - unsigned long *arg) + unsigned long *arg, struct netlink_ext_ack *extack) { int err; struct cbq_sched_data *q = qdisc_priv(sch); @@ -1453,29 +1466,37 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t struct cbq_class *parent; struct qdisc_rate_table *rtab = NULL; - if (opt == NULL) + if (!opt) { + NL_SET_ERR_MSG(extack, "Mandatory qdisc options missing"); return -EINVAL; + } - err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, NULL); + err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, extack); if (err < 0) return err; - if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) + if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) { + NL_SET_ERR_MSG(extack, "Neither overlimit strategy nor policing attributes can be used for changing class params"); return -EOPNOTSUPP; + } if (cl) { /* Check parent */ if (parentid) { if (cl->tparent && - cl->tparent->common.classid != parentid) + cl->tparent->common.classid != parentid) { + NL_SET_ERR_MSG(extack, "Invalid parent id"); return -EINVAL; - if (!cl->tparent && parentid != TC_H_ROOT) + } + if (!cl->tparent && parentid != TC_H_ROOT) { + NL_SET_ERR_MSG(extack, "Parent must be root"); return -EINVAL; + } } if (tb[TCA_CBQ_RATE]) { rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), - tb[TCA_CBQ_RTAB]); + tb[TCA_CBQ_RTAB], extack); if (rtab == NULL) return -EINVAL; } @@ -1487,6 +1508,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) { + NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator"); qdisc_put_rtab(rtab); return err; } @@ -1525,19 +1547,23 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t if (parentid == TC_H_ROOT) return -EINVAL; - if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL || - tb[TCA_CBQ_LSSOPT] == NULL) + if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT]) { + NL_SET_ERR_MSG(extack, "One of the following attributes MUST be specified: WRR, rate or link sharing"); return -EINVAL; + } - rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]); + rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB], + extack); if (rtab == NULL) return -EINVAL; if (classid) { err = -EINVAL; if (TC_H_MAJ(classid ^ sch->handle) || - cbq_class_lookup(q, classid)) + cbq_class_lookup(q, classid)) { + NL_SET_ERR_MSG(extack, "Specified class not found"); goto failure; + } } else { int i; classid = TC_H_MAKE(sch->handle, 0x8000); @@ -1549,8 +1575,10 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t break; } err = -ENOSR; - if (i >= 0x8000) + if (i >= 0x8000) { + NL_SET_ERR_MSG(extack, "Unable to generate classid"); goto failure; + } classid = classid|q->hgenerator; } @@ -1558,8 +1586,10 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t if (parentid) { parent = cbq_class_lookup(q, parentid); err = -EINVAL; - if (parent == NULL) + if (!parent) { + NL_SET_ERR_MSG(extack, "Failed to find parentid"); goto failure; + } } err = -ENOBUFS; @@ -1567,7 +1597,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t if (cl == NULL) goto failure; - err = tcf_block_get(&cl->block, &cl->filter_list, sch); + err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack); if (err) { kfree(cl); return err; @@ -1579,6 +1609,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) { + NL_SET_ERR_MSG(extack, "Couldn't create new estimator"); tcf_block_put(cl->block); kfree(cl); goto failure; @@ -1587,7 +1618,8 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t cl->R_tab = rtab; rtab = NULL; - cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid); + cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid, + NULL); if (!cl->q) cl->q = &noop_qdisc; else @@ -1671,7 +1703,8 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg) return 0; } -static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg) +static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg, + struct netlink_ext_ack *extack) { struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl = (struct cbq_class *)arg; diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index 7a72980c1509..cdd96b9a27bc 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -219,14 +219,17 @@ static void cbs_disable_offload(struct net_device *dev, } static int cbs_enable_offload(struct net_device *dev, struct cbs_sched_data *q, - const struct tc_cbs_qopt *opt) + const struct tc_cbs_qopt *opt, + struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; struct tc_cbs_qopt_offload cbs = { }; int err; - if (!ops->ndo_setup_tc) + if (!ops->ndo_setup_tc) { + NL_SET_ERR_MSG(extack, "Specified device does not support cbs offload"); return -EOPNOTSUPP; + } cbs.queue = q->queue; @@ -237,8 +240,10 @@ static int cbs_enable_offload(struct net_device *dev, struct cbs_sched_data *q, cbs.sendslope = opt->sendslope; err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_CBS, &cbs); - if (err < 0) + if (err < 0) { + NL_SET_ERR_MSG(extack, "Specified device failed to setup cbs hardware offload"); return err; + } q->enqueue = cbs_enqueue_offload; q->dequeue = cbs_dequeue_offload; @@ -246,7 +251,8 @@ static int cbs_enable_offload(struct net_device *dev, struct cbs_sched_data *q, return 0; } -static int cbs_change(struct Qdisc *sch, struct nlattr *opt) +static int cbs_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct cbs_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); @@ -254,12 +260,14 @@ static int cbs_change(struct Qdisc *sch, struct nlattr *opt) struct tc_cbs_qopt *qopt; int err; - err = nla_parse_nested(tb, TCA_CBS_MAX, opt, cbs_policy, NULL); + err = nla_parse_nested(tb, TCA_CBS_MAX, opt, cbs_policy, extack); if (err < 0) return err; - if (!tb[TCA_CBS_PARMS]) + if (!tb[TCA_CBS_PARMS]) { + NL_SET_ERR_MSG(extack, "Missing CBS parameter which are mandatory"); return -EINVAL; + } qopt = nla_data(tb[TCA_CBS_PARMS]); @@ -276,7 +284,7 @@ static int cbs_change(struct Qdisc *sch, struct nlattr *opt) cbs_disable_offload(dev, q); } else { - err = cbs_enable_offload(dev, q, qopt); + err = cbs_enable_offload(dev, q, qopt, extack); if (err < 0) return err; } @@ -291,13 +299,16 @@ static int cbs_change(struct Qdisc *sch, struct nlattr *opt) return 0; } -static int cbs_init(struct Qdisc *sch, struct nlattr *opt) +static int cbs_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct cbs_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); - if (!opt) + if (!opt) { + NL_SET_ERR_MSG(extack, "Missing CBS qdisc options which are mandatory"); return -EINVAL; + } q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0); @@ -306,7 +317,7 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt) qdisc_watchdog_init(&q->watchdog, sch); - return cbs_change(sch, opt); + return cbs_change(sch, opt, extack); } static void cbs_destroy(struct Qdisc *sch) diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index b30a2c70bd48..eafc0d17d174 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -344,7 +344,8 @@ static void choke_free(void *addr) kvfree(addr); } -static int choke_change(struct Qdisc *sch, struct nlattr *opt) +static int choke_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct choke_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_CHOKE_MAX + 1]; @@ -369,6 +370,9 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) ctl = nla_data(tb[TCA_CHOKE_PARMS]); + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) + return -EINVAL; + if (ctl->limit > CHOKE_MAX_QUEUE) return -EINVAL; @@ -428,9 +432,10 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) return 0; } -static int choke_init(struct Qdisc *sch, struct nlattr *opt) +static int choke_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { - return choke_change(sch, opt); + return choke_change(sch, opt, extack); } static int choke_dump(struct Qdisc *sch, struct sk_buff *skb) diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index c518a1efcb9d..17cd81f84b5d 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -130,7 +130,8 @@ static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = { [TCA_CODEL_CE_THRESHOLD]= { .type = NLA_U32 }, }; -static int codel_change(struct Qdisc *sch, struct nlattr *opt) +static int codel_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct codel_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_CODEL_MAX + 1]; @@ -184,7 +185,8 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt) return 0; } -static int codel_init(struct Qdisc *sch, struct nlattr *opt) +static int codel_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct codel_sched_data *q = qdisc_priv(sch); @@ -196,7 +198,7 @@ static int codel_init(struct Qdisc *sch, struct nlattr *opt) q->params.mtu = psched_mtu(qdisc_dev(sch)); if (opt) { - int err = codel_change(sch, opt); + int err = codel_change(sch, opt, extack); if (err) return err; diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 5bbcef3dcd8c..e0b0cf8a9939 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -64,7 +64,8 @@ static const struct nla_policy drr_policy[TCA_DRR_MAX + 1] = { }; static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, - struct nlattr **tca, unsigned long *arg) + struct nlattr **tca, unsigned long *arg, + struct netlink_ext_ack *extack) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl = (struct drr_class *)*arg; @@ -73,17 +74,21 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, u32 quantum; int err; - if (!opt) + if (!opt) { + NL_SET_ERR_MSG(extack, "DRR options are required for this operation"); return -EINVAL; + } - err = nla_parse_nested(tb, TCA_DRR_MAX, opt, drr_policy, NULL); + err = nla_parse_nested(tb, TCA_DRR_MAX, opt, drr_policy, extack); if (err < 0) return err; if (tb[TCA_DRR_QUANTUM]) { quantum = nla_get_u32(tb[TCA_DRR_QUANTUM]); - if (quantum == 0) + if (quantum == 0) { + NL_SET_ERR_MSG(extack, "Specified DRR quantum cannot be zero"); return -EINVAL; + } } else quantum = psched_mtu(qdisc_dev(sch)); @@ -94,8 +99,10 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, NULL, qdisc_root_sleeping_running(sch), tca[TCA_RATE]); - if (err) + if (err) { + NL_SET_ERR_MSG(extack, "Failed to replace estimator"); return err; + } } sch_tree_lock(sch); @@ -113,7 +120,8 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, cl->common.classid = classid; cl->quantum = quantum; cl->qdisc = qdisc_create_dflt(sch->dev_queue, - &pfifo_qdisc_ops, classid); + &pfifo_qdisc_ops, classid, + NULL); if (cl->qdisc == NULL) cl->qdisc = &noop_qdisc; else @@ -125,6 +133,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) { + NL_SET_ERR_MSG(extack, "Failed to replace estimator"); qdisc_destroy(cl->qdisc); kfree(cl); return err; @@ -172,12 +181,15 @@ static unsigned long drr_search_class(struct Qdisc *sch, u32 classid) return (unsigned long)drr_find_class(sch, classid); } -static struct tcf_block *drr_tcf_block(struct Qdisc *sch, unsigned long cl) +static struct tcf_block *drr_tcf_block(struct Qdisc *sch, unsigned long cl, + struct netlink_ext_ack *extack) { struct drr_sched *q = qdisc_priv(sch); - if (cl) + if (cl) { + NL_SET_ERR_MSG(extack, "DRR classid must be zero"); return NULL; + } return q->block; } @@ -201,13 +213,14 @@ static void drr_unbind_tcf(struct Qdisc *sch, unsigned long arg) } static int drr_graft_class(struct Qdisc *sch, unsigned long arg, - struct Qdisc *new, struct Qdisc **old) + struct Qdisc *new, struct Qdisc **old, + struct netlink_ext_ack *extack) { struct drr_class *cl = (struct drr_class *)arg; if (new == NULL) { - new = qdisc_create_dflt(sch->dev_queue, - &pfifo_qdisc_ops, cl->common.classid); + new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + cl->common.classid, NULL); if (new == NULL) new = &noop_qdisc; } @@ -408,12 +421,13 @@ out: return NULL; } -static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt) +static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct drr_sched *q = qdisc_priv(sch); int err; - err = tcf_block_get(&q->block, &q->filter_list, sch); + err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) return err; err = qdisc_class_hash_init(&q->clhash); diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index fb4fb71c68cf..049714c57075 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -61,7 +61,8 @@ static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index) /* ------------------------- Class/flow operations ------------------------- */ static int dsmark_graft(struct Qdisc *sch, unsigned long arg, - struct Qdisc *new, struct Qdisc **old) + struct Qdisc *new, struct Qdisc **old, + struct netlink_ext_ack *extack) { struct dsmark_qdisc_data *p = qdisc_priv(sch); @@ -70,7 +71,7 @@ static int dsmark_graft(struct Qdisc *sch, unsigned long arg, if (new == NULL) { new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, - sch->handle); + sch->handle, NULL); if (new == NULL) new = &noop_qdisc; } @@ -112,7 +113,8 @@ static const struct nla_policy dsmark_policy[TCA_DSMARK_MAX + 1] = { }; static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent, - struct nlattr **tca, unsigned long *arg) + struct nlattr **tca, unsigned long *arg, + struct netlink_ext_ack *extack) { struct dsmark_qdisc_data *p = qdisc_priv(sch); struct nlattr *opt = tca[TCA_OPTIONS]; @@ -184,7 +186,8 @@ ignore: } } -static struct tcf_block *dsmark_tcf_block(struct Qdisc *sch, unsigned long cl) +static struct tcf_block *dsmark_tcf_block(struct Qdisc *sch, unsigned long cl, + struct netlink_ext_ack *extack) { struct dsmark_qdisc_data *p = qdisc_priv(sch); @@ -330,7 +333,8 @@ static struct sk_buff *dsmark_peek(struct Qdisc *sch) return p->q->ops->peek(p->q); } -static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) +static int dsmark_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct dsmark_qdisc_data *p = qdisc_priv(sch); struct nlattr *tb[TCA_DSMARK_MAX + 1]; @@ -344,7 +348,7 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) if (!opt) goto errout; - err = tcf_block_get(&p->block, &p->filter_list, sch); + err = tcf_block_get(&p->block, &p->filter_list, sch, extack); if (err) return err; @@ -377,7 +381,8 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) p->default_index = default_index; p->set_tc_index = nla_get_flag(tb[TCA_DSMARK_SET_TC_INDEX]); - p->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle); + p->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle, + NULL); if (p->q == NULL) p->q = &noop_qdisc; else diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index 1e37247656f8..24893d3b5d22 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -55,7 +55,8 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch, return NET_XMIT_CN; } -static int fifo_init(struct Qdisc *sch, struct nlattr *opt) +static int fifo_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { bool bypass; bool is_bfifo = sch->ops == &bfifo_qdisc_ops; @@ -157,7 +158,7 @@ int fifo_set_limit(struct Qdisc *q, unsigned int limit) nla->nla_len = nla_attr_size(sizeof(struct tc_fifo_qopt)); ((struct tc_fifo_qopt *)nla_data(nla))->limit = limit; - ret = q->ops->change(q, nla); + ret = q->ops->change(q, nla, NULL); kfree(nla); } return ret; @@ -165,12 +166,14 @@ int fifo_set_limit(struct Qdisc *q, unsigned int limit) EXPORT_SYMBOL(fifo_set_limit); struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops, - unsigned int limit) + unsigned int limit, + struct netlink_ext_ack *extack) { struct Qdisc *q; int err = -ENOMEM; - q = qdisc_create_dflt(sch->dev_queue, ops, TC_H_MAKE(sch->handle, 1)); + q = qdisc_create_dflt(sch->dev_queue, ops, TC_H_MAKE(sch->handle, 1), + extack); if (q) { err = fifo_set_limit(q, limit); if (err < 0) { diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 263d16e3219e..a366e4c9413a 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -685,7 +685,8 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 }, }; -static int fq_change(struct Qdisc *sch, struct nlattr *opt) +static int fq_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct fq_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_FQ_MAX + 1]; @@ -788,7 +789,8 @@ static void fq_destroy(struct Qdisc *sch) qdisc_watchdog_cancel(&q->watchdog); } -static int fq_init(struct Qdisc *sch, struct nlattr *opt) +static int fq_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct fq_sched_data *q = qdisc_priv(sch); int err; @@ -811,7 +813,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt) qdisc_watchdog_init(&q->watchdog, sch); if (opt) - err = fq_change(sch, opt); + err = fq_change(sch, opt, extack); else err = fq_resize(sch, q->fq_trees_log); diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 0305d791ea94..22fa13cf5d8b 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -377,7 +377,8 @@ static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = { [TCA_FQ_CODEL_MEMORY_LIMIT] = { .type = NLA_U32 }, }; -static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) +static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct fq_codel_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_FQ_CODEL_MAX + 1]; @@ -458,7 +459,8 @@ static void fq_codel_destroy(struct Qdisc *sch) kvfree(q->flows); } -static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) +static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct fq_codel_sched_data *q = qdisc_priv(sch); int i; @@ -477,12 +479,12 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) q->cparams.mtu = psched_mtu(qdisc_dev(sch)); if (opt) { - int err = fq_codel_change(sch, opt); + int err = fq_codel_change(sch, opt, extack); if (err) return err; } - err = tcf_block_get(&q->block, &q->filter_list, sch); + err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) return err; @@ -595,7 +597,8 @@ static void fq_codel_unbind(struct Qdisc *q, unsigned long cl) { } -static struct tcf_block *fq_codel_tcf_block(struct Qdisc *sch, unsigned long cl) +static struct tcf_block *fq_codel_tcf_block(struct Qdisc *sch, unsigned long cl, + struct netlink_ext_ack *extack) { struct fq_codel_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 3839cbbdc32b..190570f21b20 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -26,10 +26,13 @@ #include <linux/list.h> #include <linux/slab.h> #include <linux/if_vlan.h> +#include <linux/skb_array.h> +#include <linux/if_macvlan.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> #include <net/dst.h> #include <trace/events/qdisc.h> +#include <net/xfrm.h> /* Qdisc to use by default */ const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops; @@ -46,17 +49,115 @@ EXPORT_SYMBOL(default_qdisc_ops); * - updates to tree and tree walking are only done under the rtnl mutex. */ -static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) +static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q) +{ + const struct netdev_queue *txq = q->dev_queue; + spinlock_t *lock = NULL; + struct sk_buff *skb; + + if (q->flags & TCQ_F_NOLOCK) { + lock = qdisc_lock(q); + spin_lock(lock); + } + + skb = skb_peek(&q->skb_bad_txq); + if (skb) { + /* check the reason of requeuing without tx lock first */ + txq = skb_get_tx_queue(txq->dev, skb); + if (!netif_xmit_frozen_or_stopped(txq)) { + skb = __skb_dequeue(&q->skb_bad_txq); + if (qdisc_is_percpu_stats(q)) { + qdisc_qstats_cpu_backlog_dec(q, skb); + qdisc_qstats_cpu_qlen_dec(q); + } else { + qdisc_qstats_backlog_dec(q, skb); + q->q.qlen--; + } + } else { + skb = NULL; + } + } + + if (lock) + spin_unlock(lock); + + return skb; +} + +static inline struct sk_buff *qdisc_dequeue_skb_bad_txq(struct Qdisc *q) +{ + struct sk_buff *skb = skb_peek(&q->skb_bad_txq); + + if (unlikely(skb)) + skb = __skb_dequeue_bad_txq(q); + + return skb; +} + +static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q, + struct sk_buff *skb) +{ + spinlock_t *lock = NULL; + + if (q->flags & TCQ_F_NOLOCK) { + lock = qdisc_lock(q); + spin_lock(lock); + } + + __skb_queue_tail(&q->skb_bad_txq, skb); + + if (lock) + spin_unlock(lock); +} + +static inline int __dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) { - q->gso_skb = skb; - q->qstats.requeues++; - qdisc_qstats_backlog_inc(q, skb); - q->q.qlen++; /* it's still part of the queue */ + while (skb) { + struct sk_buff *next = skb->next; + + __skb_queue_tail(&q->gso_skb, skb); + q->qstats.requeues++; + qdisc_qstats_backlog_inc(q, skb); + q->q.qlen++; /* it's still part of the queue */ + + skb = next; + } + __netif_schedule(q); + + return 0; +} + +static inline int dev_requeue_skb_locked(struct sk_buff *skb, struct Qdisc *q) +{ + spinlock_t *lock = qdisc_lock(q); + + spin_lock(lock); + while (skb) { + struct sk_buff *next = skb->next; + + __skb_queue_tail(&q->gso_skb, skb); + + qdisc_qstats_cpu_requeues_inc(q); + qdisc_qstats_cpu_backlog_inc(q, skb); + qdisc_qstats_cpu_qlen_inc(q); + + skb = next; + } + spin_unlock(lock); + __netif_schedule(q); return 0; } +static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) +{ + if (q->flags & TCQ_F_NOLOCK) + return dev_requeue_skb_locked(skb, q); + else + return __dev_requeue_skb(skb, q); +} + static void try_bulk_dequeue_skb(struct Qdisc *q, struct sk_buff *skb, const struct netdev_queue *txq, @@ -94,9 +195,15 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q, if (!nskb) break; if (unlikely(skb_get_queue_mapping(nskb) != mapping)) { - q->skb_bad_txq = nskb; - qdisc_qstats_backlog_inc(q, nskb); - q->q.qlen++; + qdisc_enqueue_skb_bad_txq(q, nskb); + + if (qdisc_is_percpu_stats(q)) { + qdisc_qstats_cpu_backlog_inc(q, nskb); + qdisc_qstats_cpu_qlen_inc(q); + } else { + qdisc_qstats_backlog_inc(q, nskb); + q->q.qlen++; + } break; } skb->next = nskb; @@ -112,40 +219,62 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q, static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, int *packets) { - struct sk_buff *skb = q->gso_skb; const struct netdev_queue *txq = q->dev_queue; + struct sk_buff *skb = NULL; *packets = 1; - if (unlikely(skb)) { + if (unlikely(!skb_queue_empty(&q->gso_skb))) { + spinlock_t *lock = NULL; + + if (q->flags & TCQ_F_NOLOCK) { + lock = qdisc_lock(q); + spin_lock(lock); + } + + skb = skb_peek(&q->gso_skb); + + /* skb may be null if another cpu pulls gso_skb off in between + * empty check and lock. + */ + if (!skb) { + if (lock) + spin_unlock(lock); + goto validate; + } + /* skb in gso_skb were already validated */ *validate = false; + if (xfrm_offload(skb)) + *validate = true; /* check the reason of requeuing without tx lock first */ txq = skb_get_tx_queue(txq->dev, skb); if (!netif_xmit_frozen_or_stopped(txq)) { - q->gso_skb = NULL; - qdisc_qstats_backlog_dec(q, skb); - q->q.qlen--; - } else + skb = __skb_dequeue(&q->gso_skb); + if (qdisc_is_percpu_stats(q)) { + qdisc_qstats_cpu_backlog_dec(q, skb); + qdisc_qstats_cpu_qlen_dec(q); + } else { + qdisc_qstats_backlog_dec(q, skb); + q->q.qlen--; + } + } else { skb = NULL; - goto trace; - } - *validate = true; - skb = q->skb_bad_txq; - if (unlikely(skb)) { - /* check the reason of requeuing without tx lock first */ - txq = skb_get_tx_queue(txq->dev, skb); - if (!netif_xmit_frozen_or_stopped(txq)) { - q->skb_bad_txq = NULL; - qdisc_qstats_backlog_dec(q, skb); - q->q.qlen--; - goto bulk; } - skb = NULL; + if (lock) + spin_unlock(lock); goto trace; } - if (!(q->flags & TCQ_F_ONETXQUEUE) || - !netif_xmit_frozen_or_stopped(txq)) - skb = q->dequeue(q); +validate: + *validate = true; + + if ((q->flags & TCQ_F_ONETXQUEUE) && + netif_xmit_frozen_or_stopped(txq)) + return skb; + + skb = qdisc_dequeue_skb_bad_txq(q); + if (unlikely(skb)) + goto bulk; + skb = q->dequeue(q); if (skb) { bulk: if (qdisc_may_bulk(q)) @@ -164,21 +293,33 @@ trace: * only one CPU can execute this function. * * Returns to the caller: - * 0 - queue is empty or throttled. - * >0 - queue is not empty. + * false - hardware queue frozen backoff + * true - feel free to send more pkts */ -int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, - struct net_device *dev, struct netdev_queue *txq, - spinlock_t *root_lock, bool validate) +bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, + struct net_device *dev, struct netdev_queue *txq, + spinlock_t *root_lock, bool validate) { int ret = NETDEV_TX_BUSY; + bool again = false; /* And release qdisc */ - spin_unlock(root_lock); + if (root_lock) + spin_unlock(root_lock); /* Note that we validate skb (GSO, checksum, ...) outside of locks */ if (validate) - skb = validate_xmit_skb_list(skb, dev); + skb = validate_xmit_skb_list(skb, dev, &again); + +#ifdef CONFIG_XFRM_OFFLOAD + if (unlikely(again)) { + if (root_lock) + spin_lock(root_lock); + + dev_requeue_skb(skb, q); + return false; + } +#endif if (likely(skb)) { HARD_TX_LOCK(dev, txq, smp_processor_id()); @@ -187,27 +328,28 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, HARD_TX_UNLOCK(dev, txq); } else { - spin_lock(root_lock); - return qdisc_qlen(q); + if (root_lock) + spin_lock(root_lock); + return true; } - spin_lock(root_lock); - if (dev_xmit_complete(ret)) { - /* Driver sent out skb successfully or skb was consumed */ - ret = qdisc_qlen(q); - } else { + if (root_lock) + spin_lock(root_lock); + + if (!dev_xmit_complete(ret)) { /* Driver returned NETDEV_TX_BUSY - requeue skb */ if (unlikely(ret != NETDEV_TX_BUSY)) net_warn_ratelimited("BUG %s code %d qlen %d\n", dev->name, ret, q->q.qlen); - ret = dev_requeue_skb(skb, q); + dev_requeue_skb(skb, q); + return false; } if (ret && netif_xmit_frozen_or_stopped(txq)) - ret = 0; + return false; - return ret; + return true; } /* @@ -229,20 +371,22 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, * >0 - queue is not empty. * */ -static inline int qdisc_restart(struct Qdisc *q, int *packets) +static inline bool qdisc_restart(struct Qdisc *q, int *packets) { + spinlock_t *root_lock = NULL; struct netdev_queue *txq; struct net_device *dev; - spinlock_t *root_lock; struct sk_buff *skb; bool validate; /* Dequeue packet */ skb = dequeue_skb(q, &validate, packets); if (unlikely(!skb)) - return 0; + return false; + + if (!(q->flags & TCQ_F_NOLOCK)) + root_lock = qdisc_lock(q); - root_lock = qdisc_lock(q); dev = qdisc_dev(q); txq = skb_get_tx_queue(dev, skb); @@ -266,8 +410,6 @@ void __qdisc_run(struct Qdisc *q) break; } } - - qdisc_run_end(q); } unsigned long dev_trans_start(struct net_device *dev) @@ -277,6 +419,8 @@ unsigned long dev_trans_start(struct net_device *dev) if (is_vlan_dev(dev)) dev = vlan_dev_real_dev(dev); + else if (netif_is_macvlan(dev)) + dev = macvlan_dev_real_dev(dev); res = netdev_get_tx_queue(dev, 0)->trans_start; for (i = 1; i < dev->num_tx_queues; i++) { val = netdev_get_tx_queue(dev, i)->trans_start; @@ -366,7 +510,7 @@ void netif_carrier_on(struct net_device *dev) if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) { if (dev->reg_state == NETREG_UNINITIALIZED) return; - atomic_inc(&dev->carrier_changes); + atomic_inc(&dev->carrier_up_count); linkwatch_fire_event(dev); if (netif_running(dev)) __netdev_watchdog_up(dev); @@ -385,7 +529,7 @@ void netif_carrier_off(struct net_device *dev) if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) { if (dev->reg_state == NETREG_UNINITIALIZED) return; - atomic_inc(&dev->carrier_changes); + atomic_inc(&dev->carrier_down_count); linkwatch_fire_event(dev); } } @@ -434,7 +578,8 @@ struct Qdisc noop_qdisc = { }; EXPORT_SYMBOL(noop_qdisc); -static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt) +static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt, + struct netlink_ext_ack *extack) { /* register_qdisc() assigns a default of noop_enqueue if unset, * but __dev_queue_xmit() treats noqueue only as such @@ -465,93 +610,99 @@ static const u8 prio2band[TC_PRIO_MAX + 1] = { /* * Private data for a pfifo_fast scheduler containing: - * - queues for the three band - * - bitmap indicating which of the bands contain skbs + * - rings for priority bands */ struct pfifo_fast_priv { - u32 bitmap; - struct qdisc_skb_head q[PFIFO_FAST_BANDS]; + struct skb_array q[PFIFO_FAST_BANDS]; }; -/* - * Convert a bitmap to the first band number where an skb is queued, where: - * bitmap=0 means there are no skbs on any band. - * bitmap=1 means there is an skb on band 0. - * bitmap=7 means there are skbs on all 3 bands, etc. - */ -static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0}; - -static inline struct qdisc_skb_head *band2list(struct pfifo_fast_priv *priv, - int band) +static inline struct skb_array *band2list(struct pfifo_fast_priv *priv, + int band) { - return priv->q + band; + return &priv->q[band]; } static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, struct sk_buff **to_free) { - if (qdisc->q.qlen < qdisc_dev(qdisc)->tx_queue_len) { - int band = prio2band[skb->priority & TC_PRIO_MAX]; - struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - struct qdisc_skb_head *list = band2list(priv, band); - - priv->bitmap |= (1 << band); - qdisc->q.qlen++; - return __qdisc_enqueue_tail(skb, qdisc, list); - } + int band = prio2band[skb->priority & TC_PRIO_MAX]; + struct pfifo_fast_priv *priv = qdisc_priv(qdisc); + struct skb_array *q = band2list(priv, band); + int err; - return qdisc_drop(skb, qdisc, to_free); + err = skb_array_produce(q, skb); + + if (unlikely(err)) + return qdisc_drop_cpu(skb, qdisc, to_free); + + qdisc_qstats_cpu_qlen_inc(qdisc); + qdisc_qstats_cpu_backlog_inc(qdisc, skb); + return NET_XMIT_SUCCESS; } static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) { struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - int band = bitmap2band[priv->bitmap]; + struct sk_buff *skb = NULL; + int band; - if (likely(band >= 0)) { - struct qdisc_skb_head *qh = band2list(priv, band); - struct sk_buff *skb = __qdisc_dequeue_head(qh); + for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) { + struct skb_array *q = band2list(priv, band); - if (likely(skb != NULL)) { - qdisc_qstats_backlog_dec(qdisc, skb); - qdisc_bstats_update(qdisc, skb); - } + if (__skb_array_empty(q)) + continue; - qdisc->q.qlen--; - if (qh->qlen == 0) - priv->bitmap &= ~(1 << band); - - return skb; + skb = skb_array_consume_bh(q); + } + if (likely(skb)) { + qdisc_qstats_cpu_backlog_dec(qdisc, skb); + qdisc_bstats_cpu_update(qdisc, skb); + qdisc_qstats_cpu_qlen_dec(qdisc); } - return NULL; + return skb; } static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc) { struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - int band = bitmap2band[priv->bitmap]; + struct sk_buff *skb = NULL; + int band; - if (band >= 0) { - struct qdisc_skb_head *qh = band2list(priv, band); + for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) { + struct skb_array *q = band2list(priv, band); - return qh->head; + skb = __skb_array_peek(q); } - return NULL; + return skb; } static void pfifo_fast_reset(struct Qdisc *qdisc) { - int prio; + int i, band; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) - __qdisc_reset_queue(band2list(priv, prio)); + for (band = 0; band < PFIFO_FAST_BANDS; band++) { + struct skb_array *q = band2list(priv, band); + struct sk_buff *skb; - priv->bitmap = 0; - qdisc->qstats.backlog = 0; - qdisc->q.qlen = 0; + /* NULL ring is possible if destroy path is due to a failed + * skb_array_init() in pfifo_fast_init() case. + */ + if (!q->ring.queue) + continue; + + while ((skb = skb_array_consume_bh(q)) != NULL) + kfree_skb(skb); + } + + for_each_possible_cpu(i) { + struct gnet_stats_queue *q = per_cpu_ptr(qdisc->cpu_qstats, i); + + q->backlog = 0; + q->qlen = 0; + } } static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb) @@ -567,19 +718,68 @@ nla_put_failure: return -1; } -static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt) +static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt, + struct netlink_ext_ack *extack) { - int prio; + unsigned int qlen = qdisc_dev(qdisc)->tx_queue_len; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); + int prio; - for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) - qdisc_skb_head_init(band2list(priv, prio)); + /* guard against zero length rings */ + if (!qlen) + return -EINVAL; + + for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { + struct skb_array *q = band2list(priv, prio); + int err; + + err = skb_array_init(q, qlen, GFP_KERNEL); + if (err) + return -ENOMEM; + } /* Can by-pass the queue discipline */ qdisc->flags |= TCQ_F_CAN_BYPASS; return 0; } +static void pfifo_fast_destroy(struct Qdisc *sch) +{ + struct pfifo_fast_priv *priv = qdisc_priv(sch); + int prio; + + for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { + struct skb_array *q = band2list(priv, prio); + + /* NULL ring is possible if destroy path is due to a failed + * skb_array_init() in pfifo_fast_init() case. + */ + if (!q->ring.queue) + continue; + /* Destroy ring but no need to kfree_skb because a call to + * pfifo_fast_reset() has already done that work. + */ + ptr_ring_cleanup(&q->ring, NULL); + } +} + +static int pfifo_fast_change_tx_queue_len(struct Qdisc *sch, + unsigned int new_len) +{ + struct pfifo_fast_priv *priv = qdisc_priv(sch); + struct skb_array *bands[PFIFO_FAST_BANDS]; + int prio; + + for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { + struct skb_array *q = band2list(priv, prio); + + bands[prio] = q; + } + + return skb_array_resize_multiple(bands, PFIFO_FAST_BANDS, new_len, + GFP_KERNEL); +} + struct Qdisc_ops pfifo_fast_ops __read_mostly = { .id = "pfifo_fast", .priv_size = sizeof(struct pfifo_fast_priv), @@ -587,9 +787,12 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { .dequeue = pfifo_fast_dequeue, .peek = pfifo_fast_peek, .init = pfifo_fast_init, + .destroy = pfifo_fast_destroy, .reset = pfifo_fast_reset, .dump = pfifo_fast_dump, + .change_tx_queue_len = pfifo_fast_change_tx_queue_len, .owner = THIS_MODULE, + .static_flags = TCQ_F_NOLOCK | TCQ_F_CPUSTATS, }; EXPORT_SYMBOL(pfifo_fast_ops); @@ -597,7 +800,8 @@ static struct lock_class_key qdisc_tx_busylock; static struct lock_class_key qdisc_running_key; struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, - const struct Qdisc_ops *ops) + const struct Qdisc_ops *ops, + struct netlink_ext_ack *extack) { void *p; struct Qdisc *sch; @@ -606,6 +810,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, struct net_device *dev; if (!dev_queue) { + NL_SET_ERR_MSG(extack, "No device queue given"); err = -EINVAL; goto errout; } @@ -627,9 +832,24 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p); sch->padded = (char *) sch - (char *) p; } + __skb_queue_head_init(&sch->gso_skb); + __skb_queue_head_init(&sch->skb_bad_txq); qdisc_skb_head_init(&sch->q); spin_lock_init(&sch->q.lock); + if (ops->static_flags & TCQ_F_CPUSTATS) { + sch->cpu_bstats = + netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); + if (!sch->cpu_bstats) + goto errout1; + + sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue); + if (!sch->cpu_qstats) { + free_percpu(sch->cpu_bstats); + goto errout1; + } + } + spin_lock_init(&sch->busylock); lockdep_set_class(&sch->busylock, dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); @@ -639,6 +859,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, dev->qdisc_running_key ?: &qdisc_running_key); sch->ops = ops; + sch->flags = ops->static_flags; sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; sch->dev_queue = dev_queue; @@ -646,27 +867,32 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, refcount_set(&sch->refcnt, 1); return sch; +errout1: + kfree(p); errout: return ERR_PTR(err); } struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, - unsigned int parentid) + unsigned int parentid, + struct netlink_ext_ack *extack) { struct Qdisc *sch; - if (!try_module_get(ops->owner)) + if (!try_module_get(ops->owner)) { + NL_SET_ERR_MSG(extack, "Failed to increase module reference counter"); return NULL; + } - sch = qdisc_alloc(dev_queue, ops); + sch = qdisc_alloc(dev_queue, ops, extack); if (IS_ERR(sch)) { module_put(ops->owner); return NULL; } sch->parent = parentid; - if (!ops->init || ops->init(sch, NULL) == 0) + if (!ops->init || ops->init(sch, NULL, extack) == 0) return sch; qdisc_destroy(sch); @@ -679,23 +905,27 @@ EXPORT_SYMBOL(qdisc_create_dflt); void qdisc_reset(struct Qdisc *qdisc) { const struct Qdisc_ops *ops = qdisc->ops; + struct sk_buff *skb, *tmp; if (ops->reset) ops->reset(qdisc); - kfree_skb(qdisc->skb_bad_txq); - qdisc->skb_bad_txq = NULL; + skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) { + __skb_unlink(skb, &qdisc->gso_skb); + kfree_skb_list(skb); + } - if (qdisc->gso_skb) { - kfree_skb_list(qdisc->gso_skb); - qdisc->gso_skb = NULL; + skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) { + __skb_unlink(skb, &qdisc->skb_bad_txq); + kfree_skb_list(skb); } + qdisc->q.qlen = 0; qdisc->qstats.backlog = 0; } EXPORT_SYMBOL(qdisc_reset); -static void qdisc_free(struct Qdisc *qdisc) +void qdisc_free(struct Qdisc *qdisc) { if (qdisc_is_percpu_stats(qdisc)) { free_percpu(qdisc->cpu_bstats); @@ -708,6 +938,7 @@ static void qdisc_free(struct Qdisc *qdisc) void qdisc_destroy(struct Qdisc *qdisc) { const struct Qdisc_ops *ops = qdisc->ops; + struct sk_buff *skb, *tmp; if (qdisc->flags & TCQ_F_BUILTIN || !refcount_dec_and_test(&qdisc->refcnt)) @@ -727,8 +958,16 @@ void qdisc_destroy(struct Qdisc *qdisc) module_put(ops->owner); dev_put(qdisc_dev(qdisc)); - kfree_skb_list(qdisc->gso_skb); - kfree_skb(qdisc->skb_bad_txq); + skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) { + __skb_unlink(skb, &qdisc->gso_skb); + kfree_skb_list(skb); + } + + skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) { + __skb_unlink(skb, &qdisc->skb_bad_txq); + kfree_skb_list(skb); + } + qdisc_free(qdisc); } EXPORT_SYMBOL(qdisc_destroy); @@ -743,10 +982,6 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, root_lock = qdisc_lock(oqdisc); spin_lock_bh(root_lock); - /* Prune old scheduler */ - if (oqdisc && refcount_read(&oqdisc->refcnt) <= 1) - qdisc_reset(oqdisc); - /* ... and graft new one */ if (qdisc == NULL) qdisc = &noop_qdisc; @@ -769,7 +1004,7 @@ static void attach_one_default_qdisc(struct net_device *dev, if (dev->priv_flags & IFF_NO_QUEUE) ops = &noqueue_qdisc_ops; - qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT); + qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL); if (!qdisc) { netdev_info(dev, "activation failed\n"); return; @@ -792,7 +1027,7 @@ static void attach_default_qdiscs(struct net_device *dev) dev->qdisc = txq->qdisc_sleeping; qdisc_refcount_inc(dev->qdisc); } else { - qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT); + qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT, NULL); if (qdisc) { dev->qdisc = qdisc; qdisc->ops->attach(qdisc); @@ -882,14 +1117,18 @@ static bool some_qdisc_is_busy(struct net_device *dev) dev_queue = netdev_get_tx_queue(dev, i); q = dev_queue->qdisc_sleeping; - root_lock = qdisc_lock(q); - spin_lock_bh(root_lock); + if (q->flags & TCQ_F_NOLOCK) { + val = test_bit(__QDISC_STATE_SCHED, &q->state); + } else { + root_lock = qdisc_lock(q); + spin_lock_bh(root_lock); - val = (qdisc_is_running(q) || - test_bit(__QDISC_STATE_SCHED, &q->state)); + val = (qdisc_is_running(q) || + test_bit(__QDISC_STATE_SCHED, &q->state)); - spin_unlock_bh(root_lock); + spin_unlock_bh(root_lock); + } if (val) return true; @@ -897,6 +1136,16 @@ static bool some_qdisc_is_busy(struct net_device *dev) return false; } +static void dev_qdisc_reset(struct net_device *dev, + struct netdev_queue *dev_queue, + void *none) +{ + struct Qdisc *qdisc = dev_queue->qdisc_sleeping; + + if (qdisc) + qdisc_reset(qdisc); +} + /** * dev_deactivate_many - deactivate transmissions on several devices * @head: list of devices to deactivate @@ -907,7 +1156,6 @@ static bool some_qdisc_is_busy(struct net_device *dev) void dev_deactivate_many(struct list_head *head) { struct net_device *dev; - bool sync_needed = false; list_for_each_entry(dev, head, close_list) { netdev_for_each_tx_queue(dev, dev_deactivate_queue, @@ -917,20 +1165,25 @@ void dev_deactivate_many(struct list_head *head) &noop_qdisc); dev_watchdog_down(dev); - sync_needed |= !dev->dismantle; } /* Wait for outstanding qdisc-less dev_queue_xmit calls. * This is avoided if all devices are in dismantle phase : * Caller will call synchronize_net() for us */ - if (sync_needed) - synchronize_net(); + synchronize_net(); /* Wait for outstanding qdisc_run calls. */ - list_for_each_entry(dev, head, close_list) + list_for_each_entry(dev, head, close_list) { while (some_qdisc_is_busy(dev)) yield(); + /* The new qdisc is assigned at this point so we can safely + * unwind stale skb lists and qdisc statistics + */ + netdev_for_each_tx_queue(dev, dev_qdisc_reset, NULL); + if (dev_ingress_queue(dev)) + dev_qdisc_reset(dev, dev_ingress_queue(dev), NULL); + } } void dev_deactivate(struct net_device *dev) @@ -943,6 +1196,39 @@ void dev_deactivate(struct net_device *dev) } EXPORT_SYMBOL(dev_deactivate); +static int qdisc_change_tx_queue_len(struct net_device *dev, + struct netdev_queue *dev_queue) +{ + struct Qdisc *qdisc = dev_queue->qdisc_sleeping; + const struct Qdisc_ops *ops = qdisc->ops; + + if (ops->change_tx_queue_len) + return ops->change_tx_queue_len(qdisc, dev->tx_queue_len); + return 0; +} + +int dev_qdisc_change_tx_queue_len(struct net_device *dev) +{ + bool up = dev->flags & IFF_UP; + unsigned int i; + int ret = 0; + + if (up) + dev_deactivate(dev); + + for (i = 0; i < dev->num_tx_queues; i++) { + ret = qdisc_change_tx_queue_len(dev, &dev->_tx[i]); + + /* TODO: revert changes on a partial failure */ + if (ret) + break; + } + + if (up) + dev_activate(dev); + return ret; +} + static void dev_init_scheduler_queue(struct net_device *dev, struct netdev_queue *dev_queue, void *_qdisc) @@ -951,6 +1237,8 @@ static void dev_init_scheduler_queue(struct net_device *dev, rcu_assign_pointer(dev_queue->qdisc, qdisc); dev_queue->qdisc_sleeping = qdisc; + __skb_queue_head_init(&qdisc->gso_skb); + __skb_queue_head_init(&qdisc->skb_bad_txq); } void dev_init_scheduler(struct net_device *dev) @@ -1037,6 +1325,8 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, if (!tp_head) { RCU_INIT_POINTER(*miniqp->p_miniq, NULL); + /* Wait for flying RCU callback before it is freed. */ + rcu_barrier_bh(); return; } @@ -1052,7 +1342,7 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, rcu_assign_pointer(*miniqp->p_miniq, miniq); if (miniq_old) - /* This is counterpart of the rcu barrier above. We need to + /* This is counterpart of the rcu barriers above. We need to * block potential new user of miniq_old until all readers * are not seeing it. */ diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 17c7130454bd..cbe4831f46f4 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -306,12 +306,13 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps) struct tc_gred_sopt *sopt; int i; - if (dps == NULL) + if (!dps) return -EINVAL; sopt = nla_data(dps); - if (sopt->DPs > MAX_DPs || sopt->DPs == 0 || sopt->def_DP >= sopt->DPs) + if (sopt->DPs > MAX_DPs || sopt->DPs == 0 || + sopt->def_DP >= sopt->DPs) return -EINVAL; sch_tree_lock(sch); @@ -356,6 +357,9 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp, struct gred_sched *table = qdisc_priv(sch); struct gred_sched_data *q = table->tab[dp]; + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) + return -EINVAL; + if (!q) { table->tab[dp] = q = *prealloc; *prealloc = NULL; @@ -388,7 +392,8 @@ static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = { [TCA_GRED_LIMIT] = { .type = NLA_U32 }, }; -static int gred_change(struct Qdisc *sch, struct nlattr *opt) +static int gred_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct gred_sched *table = qdisc_priv(sch); struct tc_gred_qopt *ctl; @@ -462,12 +467,13 @@ errout: return err; } -static int gred_init(struct Qdisc *sch, struct nlattr *opt) +static int gred_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_GRED_MAX + 1]; int err; - if (opt == NULL) + if (!opt) return -EINVAL; err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, NULL); diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index d04068a97d81..3ae9877ea205 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -921,7 +921,8 @@ static const struct nla_policy hfsc_policy[TCA_HFSC_MAX + 1] = { static int hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, - struct nlattr **tca, unsigned long *arg) + struct nlattr **tca, unsigned long *arg, + struct netlink_ext_ack *extack) { struct hfsc_sched *q = qdisc_priv(sch); struct hfsc_class *cl = (struct hfsc_class *)*arg; @@ -1033,7 +1034,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl == NULL) return -ENOBUFS; - err = tcf_block_get(&cl->block, &cl->filter_list, sch); + err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack); if (err) { kfree(cl); return err; @@ -1061,8 +1062,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, cl->cl_common.classid = classid; cl->sched = q; cl->cl_parent = parent; - cl->qdisc = qdisc_create_dflt(sch->dev_queue, - &pfifo_qdisc_ops, classid); + cl->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + classid, NULL); if (cl->qdisc == NULL) cl->qdisc = &noop_qdisc; else @@ -1176,7 +1177,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) static int hfsc_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, - struct Qdisc **old) + struct Qdisc **old, struct netlink_ext_ack *extack) { struct hfsc_class *cl = (struct hfsc_class *)arg; @@ -1184,7 +1185,7 @@ hfsc_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, return -EINVAL; if (new == NULL) { new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, - cl->cl_common.classid); + cl->cl_common.classid, NULL); if (new == NULL) new = &noop_qdisc; } @@ -1246,7 +1247,8 @@ hfsc_unbind_tcf(struct Qdisc *sch, unsigned long arg) cl->filter_cnt--; } -static struct tcf_block *hfsc_tcf_block(struct Qdisc *sch, unsigned long arg) +static struct tcf_block *hfsc_tcf_block(struct Qdisc *sch, unsigned long arg, + struct netlink_ext_ack *extack) { struct hfsc_sched *q = qdisc_priv(sch); struct hfsc_class *cl = (struct hfsc_class *)arg; @@ -1388,7 +1390,8 @@ hfsc_schedule_watchdog(struct Qdisc *sch) } static int -hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt) +hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct hfsc_sched *q = qdisc_priv(sch); struct tc_hfsc_qopt *qopt; @@ -1396,7 +1399,7 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt) qdisc_watchdog_init(&q->watchdog, sch); - if (opt == NULL || nla_len(opt) < sizeof(*qopt)) + if (!opt || nla_len(opt) < sizeof(*qopt)) return -EINVAL; qopt = nla_data(opt); @@ -1406,14 +1409,14 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt) return err; q->eligible = RB_ROOT; - err = tcf_block_get(&q->root.block, &q->root.filter_list, sch); + err = tcf_block_get(&q->root.block, &q->root.filter_list, sch, extack); if (err) return err; q->root.cl_common.classid = sch->handle; q->root.sched = q; q->root.qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, - sch->handle); + sch->handle, NULL); if (q->root.qdisc == NULL) q->root.qdisc = &noop_qdisc; else @@ -1429,7 +1432,8 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt) } static int -hfsc_change_qdisc(struct Qdisc *sch, struct nlattr *opt) +hfsc_change_qdisc(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct hfsc_sched *q = qdisc_priv(sch); struct tc_hfsc_qopt *qopt; diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 73a53c08091b..bce2632212d3 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -504,7 +504,8 @@ static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = { [TCA_HHF_NON_HH_WEIGHT] = { .type = NLA_U32 }, }; -static int hhf_change(struct Qdisc *sch, struct nlattr *opt) +static int hhf_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct hhf_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_HHF_MAX + 1]; @@ -571,7 +572,8 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt) return 0; } -static int hhf_init(struct Qdisc *sch, struct nlattr *opt) +static int hhf_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct hhf_sched_data *q = qdisc_priv(sch); int i; @@ -589,7 +591,7 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt) q->hhf_non_hh_weight = 2; if (opt) { - int err = hhf_change(sch, opt); + int err = hhf_change(sch, opt, extack); if (err) return err; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index fa0380730ff0..1ea9846cc6ce 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1017,7 +1017,8 @@ static void htb_work_func(struct work_struct *work) rcu_read_unlock(); } -static int htb_init(struct Qdisc *sch, struct nlattr *opt) +static int htb_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct htb_sched *q = qdisc_priv(sch); struct nlattr *tb[TCA_HTB_MAX + 1]; @@ -1031,7 +1032,7 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) if (!opt) return -EINVAL; - err = tcf_block_get(&q->block, &q->filter_list, sch); + err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) return err; @@ -1171,7 +1172,7 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) } static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, - struct Qdisc **old) + struct Qdisc **old, struct netlink_ext_ack *extack) { struct htb_class *cl = (struct htb_class *)arg; @@ -1179,7 +1180,7 @@ static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, return -EINVAL; if (new == NULL && (new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, - cl->common.classid)) == NULL) + cl->common.classid, extack)) == NULL) return -ENOBUFS; *old = qdisc_replace(sch, new, &cl->un.leaf.q); @@ -1289,7 +1290,8 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg) if (!cl->level && htb_parent_last_child(cl)) { new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, - cl->parent->common.classid); + cl->parent->common.classid, + NULL); last_child = 1; } @@ -1326,7 +1328,7 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg) static int htb_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca, - unsigned long *arg) + unsigned long *arg, struct netlink_ext_ack *extack) { int err = -EINVAL; struct htb_sched *q = qdisc_priv(sch); @@ -1356,10 +1358,12 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, /* Keeping backward compatible with rate_table based iproute2 tc */ if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE) - qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB])); + qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB], + NULL)); if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE) - qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB])); + qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB], + NULL)); if (!cl) { /* new class */ struct Qdisc *new_q; @@ -1394,7 +1398,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (!cl) goto failure; - err = tcf_block_get(&cl->block, &cl->filter_list, sch); + err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack); if (err) { kfree(cl); goto failure; @@ -1423,8 +1427,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, * so that can't be used inside of sch_tree_lock * -- thanks to Karlis Peisenieks */ - new_q = qdisc_create_dflt(sch->dev_queue, - &pfifo_qdisc_ops, classid); + new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + classid, NULL); sch_tree_lock(sch); if (parent && !parent->level) { unsigned int qlen = parent->un.leaf.q->q.qlen; @@ -1524,7 +1528,8 @@ failure: return err; } -static struct tcf_block *htb_tcf_block(struct Qdisc *sch, unsigned long arg) +static struct tcf_block *htb_tcf_block(struct Qdisc *sch, unsigned long arg, + struct netlink_ext_ack *extack) { struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = (struct htb_class *)arg; diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 5ecc38f35d47..ce3f55259d0d 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -48,7 +48,8 @@ static void ingress_walk(struct Qdisc *sch, struct qdisc_walker *walker) { } -static struct tcf_block *ingress_tcf_block(struct Qdisc *sch, unsigned long cl) +static struct tcf_block *ingress_tcf_block(struct Qdisc *sch, unsigned long cl, + struct netlink_ext_ack *extack) { struct ingress_sched_data *q = qdisc_priv(sch); @@ -60,13 +61,29 @@ static void clsact_chain_head_change(struct tcf_proto *tp_head, void *priv) struct mini_Qdisc_pair *miniqp = priv; mini_qdisc_pair_swap(miniqp, tp_head); +}; + +static void ingress_ingress_block_set(struct Qdisc *sch, u32 block_index) +{ + struct ingress_sched_data *q = qdisc_priv(sch); + + q->block_info.block_index = block_index; } -static int ingress_init(struct Qdisc *sch, struct nlattr *opt) +static u32 ingress_ingress_block_get(struct Qdisc *sch) +{ + struct ingress_sched_data *q = qdisc_priv(sch); + + return q->block_info.block_index; +} + +static int ingress_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct ingress_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); - int err; + + net_inc_ingress_queue(); mini_qdisc_pair_init(&q->miniqp, sch, &dev->miniq_ingress); @@ -74,14 +91,7 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) q->block_info.chain_head_change = clsact_chain_head_change; q->block_info.chain_head_change_priv = &q->miniqp; - err = tcf_block_get_ext(&q->block, sch, &q->block_info); - if (err) - return err; - - net_inc_ingress_queue(); - sch->flags |= TCQ_F_CPUSTATS; - - return 0; + return tcf_block_get_ext(&q->block, sch, &q->block_info, extack); } static void ingress_destroy(struct Qdisc *sch) @@ -117,13 +127,16 @@ static const struct Qdisc_class_ops ingress_class_ops = { }; static struct Qdisc_ops ingress_qdisc_ops __read_mostly = { - .cl_ops = &ingress_class_ops, - .id = "ingress", - .priv_size = sizeof(struct ingress_sched_data), - .init = ingress_init, - .destroy = ingress_destroy, - .dump = ingress_dump, - .owner = THIS_MODULE, + .cl_ops = &ingress_class_ops, + .id = "ingress", + .priv_size = sizeof(struct ingress_sched_data), + .static_flags = TCQ_F_CPUSTATS, + .init = ingress_init, + .destroy = ingress_destroy, + .dump = ingress_dump, + .ingress_block_set = ingress_ingress_block_set, + .ingress_block_get = ingress_ingress_block_get, + .owner = THIS_MODULE, }; struct clsact_sched_data { @@ -152,7 +165,8 @@ static unsigned long clsact_bind_filter(struct Qdisc *sch, return clsact_find(sch, classid); } -static struct tcf_block *clsact_tcf_block(struct Qdisc *sch, unsigned long cl) +static struct tcf_block *clsact_tcf_block(struct Qdisc *sch, unsigned long cl, + struct netlink_ext_ack *extack) { struct clsact_sched_data *q = qdisc_priv(sch); @@ -166,19 +180,52 @@ static struct tcf_block *clsact_tcf_block(struct Qdisc *sch, unsigned long cl) } } -static int clsact_init(struct Qdisc *sch, struct nlattr *opt) +static void clsact_ingress_block_set(struct Qdisc *sch, u32 block_index) +{ + struct clsact_sched_data *q = qdisc_priv(sch); + + q->ingress_block_info.block_index = block_index; +} + +static void clsact_egress_block_set(struct Qdisc *sch, u32 block_index) +{ + struct clsact_sched_data *q = qdisc_priv(sch); + + q->egress_block_info.block_index = block_index; +} + +static u32 clsact_ingress_block_get(struct Qdisc *sch) +{ + struct clsact_sched_data *q = qdisc_priv(sch); + + return q->ingress_block_info.block_index; +} + +static u32 clsact_egress_block_get(struct Qdisc *sch) +{ + struct clsact_sched_data *q = qdisc_priv(sch); + + return q->egress_block_info.block_index; +} + +static int clsact_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct clsact_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int err; + net_inc_ingress_queue(); + net_inc_egress_queue(); + mini_qdisc_pair_init(&q->miniqp_ingress, sch, &dev->miniq_ingress); q->ingress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS; q->ingress_block_info.chain_head_change = clsact_chain_head_change; q->ingress_block_info.chain_head_change_priv = &q->miniqp_ingress; - err = tcf_block_get_ext(&q->ingress_block, sch, &q->ingress_block_info); + err = tcf_block_get_ext(&q->ingress_block, sch, &q->ingress_block_info, + extack); if (err) return err; @@ -188,20 +235,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) q->egress_block_info.chain_head_change = clsact_chain_head_change; q->egress_block_info.chain_head_change_priv = &q->miniqp_egress; - err = tcf_block_get_ext(&q->egress_block, sch, &q->egress_block_info); - if (err) - goto err_egress_block_get; - - net_inc_ingress_queue(); - net_inc_egress_queue(); - - sch->flags |= TCQ_F_CPUSTATS; - - return 0; - -err_egress_block_get: - tcf_block_put_ext(q->ingress_block, sch, &q->ingress_block_info); - return err; + return tcf_block_get_ext(&q->egress_block, sch, &q->egress_block_info, extack); } static void clsact_destroy(struct Qdisc *sch) @@ -225,13 +259,18 @@ static const struct Qdisc_class_ops clsact_class_ops = { }; static struct Qdisc_ops clsact_qdisc_ops __read_mostly = { - .cl_ops = &clsact_class_ops, - .id = "clsact", - .priv_size = sizeof(struct clsact_sched_data), - .init = clsact_init, - .destroy = clsact_destroy, - .dump = ingress_dump, - .owner = THIS_MODULE, + .cl_ops = &clsact_class_ops, + .id = "clsact", + .priv_size = sizeof(struct clsact_sched_data), + .static_flags = TCQ_F_CPUSTATS, + .init = clsact_init, + .destroy = clsact_destroy, + .dump = ingress_dump, + .ingress_block_set = clsact_ingress_block_set, + .egress_block_set = clsact_egress_block_set, + .ingress_block_get = clsact_ingress_block_get, + .egress_block_get = clsact_egress_block_get, + .owner = THIS_MODULE, }; static int __init ingress_module_init(void) diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index 213b586a06a0..f062a18e9162 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -17,6 +17,7 @@ #include <linux/skbuff.h> #include <net/netlink.h> #include <net/pkt_sched.h> +#include <net/sch_generic.h> struct mq_sched { struct Qdisc **qdiscs; @@ -35,7 +36,8 @@ static void mq_destroy(struct Qdisc *sch) kfree(priv->qdiscs); } -static int mq_init(struct Qdisc *sch, struct nlattr *opt) +static int mq_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct net_device *dev = qdisc_dev(sch); struct mq_sched *priv = qdisc_priv(sch); @@ -59,7 +61,8 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt) dev_queue = netdev_get_tx_queue(dev, ntx); qdisc = qdisc_create_dflt(dev_queue, get_default_qdisc_ops(dev, ntx), TC_H_MAKE(TC_H_MAJ(sch->handle), - TC_H_MIN(ntx + 1))); + TC_H_MIN(ntx + 1)), + extack); if (!qdisc) return -ENOMEM; priv->qdiscs[ntx] = qdisc; @@ -97,23 +100,42 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) struct net_device *dev = qdisc_dev(sch); struct Qdisc *qdisc; unsigned int ntx; + __u32 qlen = 0; sch->q.qlen = 0; memset(&sch->bstats, 0, sizeof(sch->bstats)); memset(&sch->qstats, 0, sizeof(sch->qstats)); + /* MQ supports lockless qdiscs. However, statistics accounting needs + * to account for all, none, or a mix of locked and unlocked child + * qdiscs. Percpu stats are added to counters in-band and locking + * qdisc totals are added at end. + */ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping; spin_lock_bh(qdisc_lock(qdisc)); - sch->q.qlen += qdisc->q.qlen; - sch->bstats.bytes += qdisc->bstats.bytes; - sch->bstats.packets += qdisc->bstats.packets; - sch->qstats.backlog += qdisc->qstats.backlog; - sch->qstats.drops += qdisc->qstats.drops; - sch->qstats.requeues += qdisc->qstats.requeues; - sch->qstats.overlimits += qdisc->qstats.overlimits; + + if (qdisc_is_percpu_stats(qdisc)) { + qlen = qdisc_qlen_sum(qdisc); + __gnet_stats_copy_basic(NULL, &sch->bstats, + qdisc->cpu_bstats, + &qdisc->bstats); + __gnet_stats_copy_queue(&sch->qstats, + qdisc->cpu_qstats, + &qdisc->qstats, qlen); + } else { + sch->q.qlen += qdisc->q.qlen; + sch->bstats.bytes += qdisc->bstats.bytes; + sch->bstats.packets += qdisc->bstats.packets; + sch->qstats.backlog += qdisc->qstats.backlog; + sch->qstats.drops += qdisc->qstats.drops; + sch->qstats.requeues += qdisc->qstats.requeues; + sch->qstats.overlimits += qdisc->qstats.overlimits; + } + spin_unlock_bh(qdisc_lock(qdisc)); } + return 0; } @@ -134,7 +156,7 @@ static struct netdev_queue *mq_select_queue(struct Qdisc *sch, } static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, - struct Qdisc **old) + struct Qdisc **old, struct netlink_ext_ack *extack) { struct netdev_queue *dev_queue = mq_queue_get(sch, cl); struct net_device *dev = qdisc_dev(sch); diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index b85885a9d8a1..0e9d761cdd80 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -132,7 +132,8 @@ static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, return 0; } -static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) +static int mqprio_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct net_device *dev = qdisc_dev(sch); struct mqprio_sched *priv = qdisc_priv(sch); @@ -229,7 +230,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) qdisc = qdisc_create_dflt(dev_queue, get_default_qdisc_ops(dev, i), TC_H_MAKE(TC_H_MAJ(sch->handle), - TC_H_MIN(i + 1))); + TC_H_MIN(i + 1)), extack); if (!qdisc) return -ENOMEM; @@ -319,7 +320,7 @@ static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch, } static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, - struct Qdisc **old) + struct Qdisc **old, struct netlink_ext_ack *extack) { struct net_device *dev = qdisc_dev(sch); struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); @@ -388,22 +389,40 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) struct nlattr *nla = (struct nlattr *)skb_tail_pointer(skb); struct tc_mqprio_qopt opt = { 0 }; struct Qdisc *qdisc; - unsigned int i; + unsigned int ntx, tc; sch->q.qlen = 0; memset(&sch->bstats, 0, sizeof(sch->bstats)); memset(&sch->qstats, 0, sizeof(sch->qstats)); - for (i = 0; i < dev->num_tx_queues; i++) { - qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc); + /* MQ supports lockless qdiscs. However, statistics accounting needs + * to account for all, none, or a mix of locked and unlocked child + * qdiscs. Percpu stats are added to counters in-band and locking + * qdisc totals are added at end. + */ + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { + qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping; spin_lock_bh(qdisc_lock(qdisc)); - sch->q.qlen += qdisc->q.qlen; - sch->bstats.bytes += qdisc->bstats.bytes; - sch->bstats.packets += qdisc->bstats.packets; - sch->qstats.backlog += qdisc->qstats.backlog; - sch->qstats.drops += qdisc->qstats.drops; - sch->qstats.requeues += qdisc->qstats.requeues; - sch->qstats.overlimits += qdisc->qstats.overlimits; + + if (qdisc_is_percpu_stats(qdisc)) { + __u32 qlen = qdisc_qlen_sum(qdisc); + + __gnet_stats_copy_basic(NULL, &sch->bstats, + qdisc->cpu_bstats, + &qdisc->bstats); + __gnet_stats_copy_queue(&sch->qstats, + qdisc->cpu_qstats, + &qdisc->qstats, qlen); + } else { + sch->q.qlen += qdisc->q.qlen; + sch->bstats.bytes += qdisc->bstats.bytes; + sch->bstats.packets += qdisc->bstats.packets; + sch->qstats.backlog += qdisc->qstats.backlog; + sch->qstats.drops += qdisc->qstats.drops; + sch->qstats.requeues += qdisc->qstats.requeues; + sch->qstats.overlimits += qdisc->qstats.overlimits; + } + spin_unlock_bh(qdisc_lock(qdisc)); } @@ -411,9 +430,9 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map)); opt.hw = priv->hw_offload; - for (i = 0; i < netdev_get_num_tc(dev); i++) { - opt.count[i] = dev->tc_to_txq[i].count; - opt.offset[i] = dev->tc_to_txq[i].offset; + for (tc = 0; tc < netdev_get_num_tc(dev); tc++) { + opt.count[tc] = dev->tc_to_txq[tc].count; + opt.offset[tc] = dev->tc_to_txq[tc].offset; } if (nla_put(skb, TCA_OPTIONS, NLA_ALIGN(sizeof(opt)), &opt)) @@ -495,7 +514,6 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, if (cl >= TC_H_MIN_PRIORITY) { int i; __u32 qlen = 0; - struct Qdisc *qdisc; struct gnet_stats_queue qstats = {0}; struct gnet_stats_basic_packed bstats = {0}; struct net_device *dev = qdisc_dev(sch); @@ -511,18 +529,26 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, for (i = tc.offset; i < tc.offset + tc.count; i++) { struct netdev_queue *q = netdev_get_tx_queue(dev, i); + struct Qdisc *qdisc = rtnl_dereference(q->qdisc); + struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL; + struct gnet_stats_queue __percpu *cpu_qstats = NULL; - qdisc = rtnl_dereference(q->qdisc); spin_lock_bh(qdisc_lock(qdisc)); - qlen += qdisc->q.qlen; - bstats.bytes += qdisc->bstats.bytes; - bstats.packets += qdisc->bstats.packets; - qstats.backlog += qdisc->qstats.backlog; - qstats.drops += qdisc->qstats.drops; - qstats.requeues += qdisc->qstats.requeues; - qstats.overlimits += qdisc->qstats.overlimits; + if (qdisc_is_percpu_stats(qdisc)) { + cpu_bstats = qdisc->cpu_bstats; + cpu_qstats = qdisc->cpu_qstats; + } + + qlen = qdisc_qlen_sum(qdisc); + __gnet_stats_copy_basic(NULL, &sch->bstats, + cpu_bstats, &qdisc->bstats); + __gnet_stats_copy_queue(&sch->qstats, + cpu_qstats, + &qdisc->qstats, + qlen); spin_unlock_bh(qdisc_lock(qdisc)); } + /* Reclaim root sleeping lock before completing stats */ if (d->lock) spin_lock_bh(d->lock); diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 012216386c0b..1da7ea8de0ad 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -180,7 +180,8 @@ multiq_destroy(struct Qdisc *sch) kfree(q->queues); } -static int multiq_tune(struct Qdisc *sch, struct nlattr *opt) +static int multiq_tune(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct multiq_sched_data *q = qdisc_priv(sch); struct tc_multiq_qopt *qopt; @@ -215,7 +216,7 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt) child = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, TC_H_MAKE(sch->handle, - i + 1)); + i + 1), extack); if (child) { sch_tree_lock(sch); old = q->queues[i]; @@ -236,17 +237,18 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt) return 0; } -static int multiq_init(struct Qdisc *sch, struct nlattr *opt) +static int multiq_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct multiq_sched_data *q = qdisc_priv(sch); int i, err; q->queues = NULL; - if (opt == NULL) + if (!opt) return -EINVAL; - err = tcf_block_get(&q->block, &q->filter_list, sch); + err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) return err; @@ -258,7 +260,7 @@ static int multiq_init(struct Qdisc *sch, struct nlattr *opt) for (i = 0; i < q->max_bands; i++) q->queues[i] = &noop_qdisc; - return multiq_tune(sch, opt); + return multiq_tune(sch, opt, extack); } static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb) @@ -281,7 +283,7 @@ nla_put_failure: } static int multiq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, - struct Qdisc **old) + struct Qdisc **old, struct netlink_ext_ack *extack) { struct multiq_sched_data *q = qdisc_priv(sch); unsigned long band = arg - 1; @@ -369,7 +371,8 @@ static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg) } } -static struct tcf_block *multiq_tcf_block(struct Qdisc *sch, unsigned long cl) +static struct tcf_block *multiq_tcf_block(struct Qdisc *sch, unsigned long cl, + struct netlink_ext_ack *extack) { struct multiq_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index dd70924cbcdf..7bbc13b8ca47 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -893,7 +893,8 @@ static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, } /* Parse netlink message to set options */ -static int netem_change(struct Qdisc *sch, struct nlattr *opt) +static int netem_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct netem_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_NETEM_MAX + 1]; @@ -984,7 +985,8 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt) return ret; } -static int netem_init(struct Qdisc *sch, struct nlattr *opt) +static int netem_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct netem_sched_data *q = qdisc_priv(sch); int ret; @@ -995,7 +997,7 @@ static int netem_init(struct Qdisc *sch, struct nlattr *opt) return -EINVAL; q->loss_model = CLG_RANDOM; - ret = netem_change(sch, opt); + ret = netem_change(sch, opt, extack); if (ret) pr_info("netem: change failed\n"); return ret; @@ -1157,7 +1159,7 @@ static int netem_dump_class(struct Qdisc *sch, unsigned long cl, } static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, - struct Qdisc **old) + struct Qdisc **old, struct netlink_ext_ack *extack) { struct netem_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index 776c694c77c7..18d30bb86881 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -181,7 +181,8 @@ static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = { [TCA_PIE_BYTEMODE] = {.type = NLA_U32}, }; -static int pie_change(struct Qdisc *sch, struct nlattr *opt) +static int pie_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct pie_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_PIE_MAX + 1]; @@ -439,7 +440,8 @@ static void pie_timer(struct timer_list *t) } -static int pie_init(struct Qdisc *sch, struct nlattr *opt) +static int pie_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct pie_sched_data *q = qdisc_priv(sch); @@ -451,7 +453,7 @@ static int pie_init(struct Qdisc *sch, struct nlattr *opt) timer_setup(&q->adapt_timer, pie_timer, 0); if (opt) { - int err = pie_change(sch, opt); + int err = pie_change(sch, opt, extack); if (err) return err; diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c index 1c6cbab3e7b9..5619d2eb17b6 100644 --- a/net/sched/sch_plug.c +++ b/net/sched/sch_plug.c @@ -123,7 +123,8 @@ static struct sk_buff *plug_dequeue(struct Qdisc *sch) return qdisc_dequeue_head(sch); } -static int plug_init(struct Qdisc *sch, struct nlattr *opt) +static int plug_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct plug_sched_data *q = qdisc_priv(sch); @@ -158,7 +159,8 @@ static int plug_init(struct Qdisc *sch, struct nlattr *opt) * command is received (just act as a pass-thru queue). * TCQ_PLUG_LIMIT: Increase/decrease queue size */ -static int plug_change(struct Qdisc *sch, struct nlattr *opt) +static int plug_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct plug_sched_data *q = qdisc_priv(sch); struct tc_plug_qopt *msg; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 2c79559a0d31..efbf51f35778 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -142,6 +142,31 @@ prio_reset(struct Qdisc *sch) sch->q.qlen = 0; } +static int prio_offload(struct Qdisc *sch, bool enable) +{ + struct prio_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct tc_prio_qopt_offload opt = { + .handle = sch->handle, + .parent = sch->parent, + }; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return -EOPNOTSUPP; + + if (enable) { + opt.command = TC_PRIO_REPLACE; + opt.replace_params.bands = q->bands; + memcpy(&opt.replace_params.priomap, q->prio2band, + TC_PRIO_MAX + 1); + opt.replace_params.qstats = &sch->qstats; + } else { + opt.command = TC_PRIO_DESTROY; + } + + return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_PRIO, &opt); +} + static void prio_destroy(struct Qdisc *sch) { @@ -149,11 +174,13 @@ prio_destroy(struct Qdisc *sch) struct prio_sched_data *q = qdisc_priv(sch); tcf_block_put(q->block); + prio_offload(sch, false); for (prio = 0; prio < q->bands; prio++) qdisc_destroy(q->queues[prio]); } -static int prio_tune(struct Qdisc *sch, struct nlattr *opt) +static int prio_tune(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct prio_sched_data *q = qdisc_priv(sch); struct Qdisc *queues[TCQ_PRIO_BANDS]; @@ -175,7 +202,8 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt) /* Before commit, make sure we can allocate all new qdiscs */ for (i = oldbands; i < qopt->bands; i++) { queues[i] = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, - TC_H_MAKE(sch->handle, i + 1)); + TC_H_MAKE(sch->handle, i + 1), + extack); if (!queues[i]) { while (i > oldbands) qdisc_destroy(queues[--i]); @@ -202,10 +230,12 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt) } sch_tree_unlock(sch); + prio_offload(sch, true); return 0; } -static int prio_init(struct Qdisc *sch, struct nlattr *opt) +static int prio_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct prio_sched_data *q = qdisc_priv(sch); int err; @@ -213,11 +243,42 @@ static int prio_init(struct Qdisc *sch, struct nlattr *opt) if (!opt) return -EINVAL; - err = tcf_block_get(&q->block, &q->filter_list, sch); + err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) return err; - return prio_tune(sch, opt); + return prio_tune(sch, opt, extack); +} + +static int prio_dump_offload(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct tc_prio_qopt_offload hw_stats = { + .command = TC_PRIO_STATS, + .handle = sch->handle, + .parent = sch->parent, + { + .stats = { + .bstats = &sch->bstats, + .qstats = &sch->qstats, + }, + }, + }; + int err; + + sch->flags &= ~TCQ_F_OFFLOADED; + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return 0; + + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_PRIO, + &hw_stats); + if (err == -EOPNOTSUPP) + return 0; + + if (!err) + sch->flags |= TCQ_F_OFFLOADED; + + return err; } static int prio_dump(struct Qdisc *sch, struct sk_buff *skb) @@ -225,10 +286,15 @@ static int prio_dump(struct Qdisc *sch, struct sk_buff *skb) struct prio_sched_data *q = qdisc_priv(sch); unsigned char *b = skb_tail_pointer(skb); struct tc_prio_qopt opt; + int err; opt.bands = q->bands; memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX + 1); + err = prio_dump_offload(sch); + if (err) + goto nla_put_failure; + if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) goto nla_put_failure; @@ -240,7 +306,7 @@ nla_put_failure: } static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, - struct Qdisc **old) + struct Qdisc **old, struct netlink_ext_ack *extack) { struct prio_sched_data *q = qdisc_priv(sch); unsigned long band = arg - 1; @@ -327,7 +393,8 @@ static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg) } } -static struct tcf_block *prio_tcf_block(struct Qdisc *sch, unsigned long cl) +static struct tcf_block *prio_tcf_block(struct Qdisc *sch, unsigned long cl, + struct netlink_ext_ack *extack) { struct prio_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 6962b37a3ad3..bb1a9c11fc54 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -402,7 +402,8 @@ static int qfq_change_agg(struct Qdisc *sch, struct qfq_class *cl, u32 weight, } static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, - struct nlattr **tca, unsigned long *arg) + struct nlattr **tca, unsigned long *arg, + struct netlink_ext_ack *extack) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl = (struct qfq_class *)*arg; @@ -479,8 +480,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, cl->common.classid = classid; cl->deficit = lmax; - cl->qdisc = qdisc_create_dflt(sch->dev_queue, - &pfifo_qdisc_ops, classid); + cl->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + classid, NULL); if (cl->qdisc == NULL) cl->qdisc = &noop_qdisc; @@ -564,7 +565,8 @@ static unsigned long qfq_search_class(struct Qdisc *sch, u32 classid) return (unsigned long)qfq_find_class(sch, classid); } -static struct tcf_block *qfq_tcf_block(struct Qdisc *sch, unsigned long cl) +static struct tcf_block *qfq_tcf_block(struct Qdisc *sch, unsigned long cl, + struct netlink_ext_ack *extack) { struct qfq_sched *q = qdisc_priv(sch); @@ -593,13 +595,14 @@ static void qfq_unbind_tcf(struct Qdisc *sch, unsigned long arg) } static int qfq_graft_class(struct Qdisc *sch, unsigned long arg, - struct Qdisc *new, struct Qdisc **old) + struct Qdisc *new, struct Qdisc **old, + struct netlink_ext_ack *extack) { struct qfq_class *cl = (struct qfq_class *)arg; if (new == NULL) { - new = qdisc_create_dflt(sch->dev_queue, - &pfifo_qdisc_ops, cl->common.classid); + new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + cl->common.classid, NULL); if (new == NULL) new = &noop_qdisc; } @@ -1413,14 +1416,15 @@ static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg) qfq_deactivate_class(q, cl); } -static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt) +static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_group *grp; int i, j, err; u32 max_cl_shift, maxbudg_shift, max_classes; - err = tcf_block_get(&q->block, &q->filter_list, sch); + err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) return err; diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 7f8ea9e297c3..16644b3d2362 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -167,6 +167,7 @@ static int red_offload(struct Qdisc *sch, bool enable) opt.set.max = q->parms.qth_max >> q->parms.Wlog; opt.set.probability = q->parms.max_P; opt.set.is_ecn = red_use_ecn(q); + opt.set.qstats = &sch->qstats; } else { opt.command = TC_RED_DESTROY; } @@ -189,7 +190,8 @@ static const struct nla_policy red_policy[TCA_RED_MAX + 1] = { [TCA_RED_MAX_P] = { .type = NLA_U32 }, }; -static int red_change(struct Qdisc *sch, struct nlattr *opt) +static int red_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct red_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_RED_MAX + 1]; @@ -212,9 +214,12 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt) max_P = tb[TCA_RED_MAX_P] ? nla_get_u32(tb[TCA_RED_MAX_P]) : 0; ctl = nla_data(tb[TCA_RED_PARMS]); + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) + return -EINVAL; if (ctl->limit > 0) { - child = fifo_create_dflt(sch, &bfifo_qdisc_ops, ctl->limit); + child = fifo_create_dflt(sch, &bfifo_qdisc_ops, ctl->limit, + extack); if (IS_ERR(child)) return PTR_ERR(child); } @@ -262,17 +267,18 @@ static inline void red_adaptative_timer(struct timer_list *t) spin_unlock(root_lock); } -static int red_init(struct Qdisc *sch, struct nlattr *opt) +static int red_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct red_sched_data *q = qdisc_priv(sch); q->qdisc = &noop_qdisc; q->sch = sch; timer_setup(&q->adapt_timer, red_adaptative_timer, 0); - return red_change(sch, opt); + return red_change(sch, opt, extack); } -static int red_dump_offload(struct Qdisc *sch, struct tc_red_qopt *opt) +static int red_dump_offload_stats(struct Qdisc *sch, struct tc_red_qopt *opt) { struct net_device *dev = qdisc_dev(sch); struct tc_red_qopt_offload hw_stats = { @@ -286,7 +292,8 @@ static int red_dump_offload(struct Qdisc *sch, struct tc_red_qopt *opt) }; int err; - opt->flags &= ~TC_RED_OFFLOADED; + sch->flags &= ~TCQ_F_OFFLOADED; + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return 0; @@ -296,7 +303,7 @@ static int red_dump_offload(struct Qdisc *sch, struct tc_red_qopt *opt) return 0; if (!err) - opt->flags |= TC_RED_OFFLOADED; + sch->flags |= TCQ_F_OFFLOADED; return err; } @@ -316,8 +323,7 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) }; int err; - sch->qstats.backlog = q->qdisc->qstats.backlog; - err = red_dump_offload(sch, &opt); + err = red_dump_offload_stats(sch, &opt); if (err) goto nla_put_failure; @@ -338,32 +344,24 @@ static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct red_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); - struct tc_red_xstats st = { - .early = q->stats.prob_drop + q->stats.forced_drop, - .pdrop = q->stats.pdrop, - .other = q->stats.other, - .marked = q->stats.prob_mark + q->stats.forced_mark, - }; + struct tc_red_xstats st = {0}; - if (tc_can_offload(dev) && dev->netdev_ops->ndo_setup_tc) { - struct red_stats hw_stats = {0}; + if (sch->flags & TCQ_F_OFFLOADED) { struct tc_red_qopt_offload hw_stats_request = { .command = TC_RED_XSTATS, .handle = sch->handle, .parent = sch->parent, { - .xstats = &hw_stats, + .xstats = &q->stats, }, }; - if (!dev->netdev_ops->ndo_setup_tc(dev, - TC_SETUP_QDISC_RED, - &hw_stats_request)) { - st.early += hw_stats.prob_drop + hw_stats.forced_drop; - st.pdrop += hw_stats.pdrop; - st.other += hw_stats.other; - st.marked += hw_stats.prob_mark + hw_stats.forced_mark; - } + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, + &hw_stats_request); } + st.early = q->stats.prob_drop + q->stats.forced_drop; + st.pdrop = q->stats.pdrop; + st.other = q->stats.other; + st.marked = q->stats.prob_mark + q->stats.forced_mark; return gnet_stats_copy_app(d, &st, sizeof(st)); } @@ -379,7 +377,7 @@ static int red_dump_class(struct Qdisc *sch, unsigned long cl, } static int red_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, - struct Qdisc **old) + struct Qdisc **old, struct netlink_ext_ack *extack) { struct red_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 0678debdd856..7cbdad8419b7 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -488,7 +488,8 @@ static const struct tc_sfb_qopt sfb_default_ops = { .penalty_burst = 20, }; -static int sfb_change(struct Qdisc *sch, struct nlattr *opt) +static int sfb_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct sfb_sched_data *q = qdisc_priv(sch); struct Qdisc *child; @@ -512,7 +513,7 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt) if (limit == 0) limit = qdisc_dev(sch)->tx_queue_len; - child = fifo_create_dflt(sch, &pfifo_qdisc_ops, limit); + child = fifo_create_dflt(sch, &pfifo_qdisc_ops, limit, extack); if (IS_ERR(child)) return PTR_ERR(child); @@ -549,17 +550,18 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt) return 0; } -static int sfb_init(struct Qdisc *sch, struct nlattr *opt) +static int sfb_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct sfb_sched_data *q = qdisc_priv(sch); int err; - err = tcf_block_get(&q->block, &q->filter_list, sch); + err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) return err; q->qdisc = &noop_qdisc; - return sfb_change(sch, opt); + return sfb_change(sch, opt, extack); } static int sfb_dump(struct Qdisc *sch, struct sk_buff *skb) @@ -615,7 +617,7 @@ static int sfb_dump_class(struct Qdisc *sch, unsigned long cl, } static int sfb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, - struct Qdisc **old) + struct Qdisc **old, struct netlink_ext_ack *extack) { struct sfb_sched_data *q = qdisc_priv(sch); @@ -643,7 +645,8 @@ static void sfb_unbind(struct Qdisc *sch, unsigned long arg) } static int sfb_change_class(struct Qdisc *sch, u32 classid, u32 parentid, - struct nlattr **tca, unsigned long *arg) + struct nlattr **tca, unsigned long *arg, + struct netlink_ext_ack *extack) { return -ENOSYS; } @@ -665,7 +668,8 @@ static void sfb_walk(struct Qdisc *sch, struct qdisc_walker *walker) } } -static struct tcf_block *sfb_tcf_block(struct Qdisc *sch, unsigned long cl) +static struct tcf_block *sfb_tcf_block(struct Qdisc *sch, unsigned long cl, + struct netlink_ext_ack *extack) { struct sfb_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 890f4a4564e7..2f2678197760 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -639,6 +639,9 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) if (ctl->divisor && (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536)) return -EINVAL; + if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max, + ctl_v1->Wlog)) + return -EINVAL; if (ctl_v1 && ctl_v1->qth_min) { p = kmalloc(sizeof(*p), GFP_KERNEL); if (!p) @@ -718,15 +721,17 @@ static void sfq_destroy(struct Qdisc *sch) kfree(q->red_parms); } -static int sfq_init(struct Qdisc *sch, struct nlattr *opt) +static int sfq_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct sfq_sched_data *q = qdisc_priv(sch); int i; int err; + q->sch = sch; timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE); - err = tcf_block_get(&q->block, &q->filter_list, sch); + err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) return err; @@ -832,7 +837,8 @@ static void sfq_unbind(struct Qdisc *q, unsigned long cl) { } -static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl) +static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl, + struct netlink_ext_ack *extack) { struct sfq_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 120f4f365967..229172d509cc 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -142,16 +142,6 @@ static u64 psched_ns_t2l(const struct psched_ratecfg *r, return len; } -/* - * Return length of individual segments of a gso packet, - * including all headers (MAC, IP, TCP/UDP) - */ -static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) -{ - unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); - return hdr_len + skb_gso_transport_seglen(skb); -} - /* GSO packet is too big, segment it so that tbf can transmit * each segment in time */ @@ -302,7 +292,8 @@ static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = { [TCA_TBF_PBURST] = { .type = NLA_U32 }, }; -static int tbf_change(struct Qdisc *sch, struct nlattr *opt) +static int tbf_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { int err; struct tbf_sched_data *q = qdisc_priv(sch); @@ -326,11 +317,13 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt) qopt = nla_data(tb[TCA_TBF_PARMS]); if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE) qdisc_put_rtab(qdisc_get_rtab(&qopt->rate, - tb[TCA_TBF_RTAB])); + tb[TCA_TBF_RTAB], + NULL)); if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE) qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate, - tb[TCA_TBF_PTAB])); + tb[TCA_TBF_PTAB], + NULL)); buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U); mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U); @@ -383,7 +376,8 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt) if (err) goto done; } else if (qopt->limit > 0) { - child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit); + child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit, + extack); if (IS_ERR(child)) { err = PTR_ERR(child); goto done; @@ -421,19 +415,20 @@ done: return err; } -static int tbf_init(struct Qdisc *sch, struct nlattr *opt) +static int tbf_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct tbf_sched_data *q = qdisc_priv(sch); qdisc_watchdog_init(&q->watchdog, sch); q->qdisc = &noop_qdisc; - if (opt == NULL) + if (!opt) return -EINVAL; q->t_c = ktime_get_ns(); - return tbf_change(sch, opt); + return tbf_change(sch, opt, extack); } static void tbf_destroy(struct Qdisc *sch) @@ -494,7 +489,7 @@ static int tbf_dump_class(struct Qdisc *sch, unsigned long cl, } static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, - struct Qdisc **old) + struct Qdisc **old, struct netlink_ext_ack *extack) { struct tbf_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 9fe6b427afed..93f04cf5cac1 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -167,7 +167,8 @@ teql_destroy(struct Qdisc *sch) } } -static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt) +static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct net_device *dev = qdisc_dev(sch); struct teql_master *m = (struct teql_master *)sch->ops; diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig index d9c04dc1b3f3..c740b189d4ba 100644 --- a/net/sctp/Kconfig +++ b/net/sctp/Kconfig @@ -37,18 +37,6 @@ menuconfig IP_SCTP if IP_SCTP -config NET_SCTPPROBE - tristate "SCTP: Association probing" - depends on PROC_FS && KPROBES - ---help--- - This module allows for capturing the changes to SCTP association - state in response to incoming packets. It is used for debugging - SCTP congestion control algorithms. If you don't understand - what was just said, you don't need it: say N. - - To compile this code as a module, choose M here: the - module will be called sctp_probe. - config SCTP_DBG_OBJCNT bool "SCTP: Debug object counts" depends on PROC_FS diff --git a/net/sctp/Makefile b/net/sctp/Makefile index 1ca84a288443..6776582ec449 100644 --- a/net/sctp/Makefile +++ b/net/sctp/Makefile @@ -4,7 +4,6 @@ # obj-$(CONFIG_IP_SCTP) += sctp.o -obj-$(CONFIG_NET_SCTPPROBE) += sctp_probe.o obj-$(CONFIG_INET_SCTP_DIAG) += sctp_diag.o sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \ @@ -14,9 +13,7 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \ tsnmap.o bind_addr.o socket.o primitive.o \ output.o input.o debug.o stream.o auth.o \ offload.o stream_sched.o stream_sched_prio.o \ - stream_sched_rr.o - -sctp_probe-y := probe.o + stream_sched_rr.o stream_interleave.o sctp-$(CONFIG_SCTP_DBG_OBJCNT) += objcnt.o sctp-$(CONFIG_PROC_FS) += proc.o diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 69394f4d6091..837806dd5799 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -861,7 +861,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, event = sctp_ulpevent_make_peer_addr_change(asoc, &addr, 0, spc_state, error, GFP_ATOMIC); if (event) - sctp_ulpq_tail_event(&asoc->ulpq, event); + asoc->stream.si->enqueue_event(&asoc->ulpq, event); } /* Select new active and retran paths. */ diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 7b261afc47b9..991a530c6b31 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -53,6 +53,7 @@ static void sctp_datamsg_init(struct sctp_datamsg *msg) msg->send_failed = 0; msg->send_error = 0; msg->can_delay = 1; + msg->abandoned = 0; msg->expires_at = 0; INIT_LIST_HEAD(&msg->chunks); } @@ -123,7 +124,7 @@ static void sctp_datamsg_destroy(struct sctp_datamsg *msg) ev = sctp_ulpevent_make_send_failed(asoc, chunk, sent, error, GFP_ATOMIC); if (ev) - sctp_ulpq_tail_event(&asoc->ulpq, ev); + asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } sctp_chunk_put(chunk); @@ -190,7 +191,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, */ max_data = asoc->pathmtu - sctp_sk(asoc->base.sk)->pf->af->net_header_len - - sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk); + sizeof(struct sctphdr) - sctp_datachk_len(&asoc->stream); max_data = SCTP_TRUNC4(max_data); /* If the the peer requested that we authenticate DATA chunks @@ -263,8 +264,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, frag |= SCTP_DATA_SACK_IMM; } - chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag, - 0, GFP_KERNEL); + chunk = asoc->stream.si->make_datafrag(asoc, sinfo, len, frag, + GFP_KERNEL); if (!chunk) { err = -ENOMEM; goto errout; @@ -304,6 +305,13 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) if (!chunk->asoc->peer.prsctp_capable) return 0; + if (chunk->msg->abandoned) + return 1; + + if (!chunk->has_tsn && + !(chunk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG)) + return 0; + if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) && time_after(jiffies, chunk->msg->expires_at)) { struct sctp_stream_out *streamout = @@ -316,6 +324,7 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) chunk->asoc->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; streamout->ext->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; } + chunk->msg->abandoned = 1; return 1; } else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) && chunk->sent_count > chunk->sinfo.sinfo_timetolive) { @@ -324,10 +333,12 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++; streamout->ext->abandoned_sent[SCTP_PR_INDEX(RTX)]++; + chunk->msg->abandoned = 1; return 1; } else if (!SCTP_PR_POLICY(chunk->sinfo.sinfo_flags) && chunk->msg->expires_at && time_after(jiffies, chunk->msg->expires_at)) { + chunk->msg->abandoned = 1; return 1; } /* PRIO policy is processed by sendmsg, not here */ diff --git a/net/sctp/debug.c b/net/sctp/debug.c index 3f619fdcbf0a..291c97b07058 100644 --- a/net/sctp/debug.c +++ b/net/sctp/debug.c @@ -78,6 +78,9 @@ const char *sctp_cname(const union sctp_subtype cid) case SCTP_CID_AUTH: return "AUTH"; + case SCTP_CID_RECONF: + return "RECONF"; + default: break; } diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index ee1e601a0b11..8b3146816519 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -232,7 +232,7 @@ void sctp_endpoint_free(struct sctp_endpoint *ep) { ep->base.dead = true; - ep->base.sk->sk_state = SCTP_SS_CLOSED; + inet_sk_set_state(ep->base.sk, SCTP_SS_CLOSED); /* Unlink this endpoint, so we can't find it again! */ sctp_unhash_endpoint(ep); diff --git a/net/sctp/input.c b/net/sctp/input.c index 621b5ca3fd1c..141c9c466ec1 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -399,20 +399,24 @@ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc, return; } - if (t->param_flags & SPP_PMTUD_ENABLE) { - /* Update transports view of the MTU */ - sctp_transport_update_pmtu(t, pmtu); - - /* Update association pmtu. */ - sctp_assoc_sync_pmtu(asoc); - } + if (!(t->param_flags & SPP_PMTUD_ENABLE)) + /* We can't allow retransmitting in such case, as the + * retransmission would be sized just as before, and thus we + * would get another icmp, and retransmit again. + */ + return; - /* Retransmit with the new pmtu setting. - * Normally, if PMTU discovery is disabled, an ICMP Fragmentation - * Needed will never be sent, but if a message was sent before - * PMTU discovery was disabled that was larger than the PMTU, it - * would not be fragmented, so it must be re-transmitted fragmented. + /* Update transports view of the MTU. Return if no update was needed. + * If an update wasn't needed/possible, it also doesn't make sense to + * try to retransmit now. */ + if (!sctp_transport_update_pmtu(t, pmtu)) + return; + + /* Update association pmtu. */ + sctp_assoc_sync_pmtu(asoc); + + /* Retransmit with the new pmtu setting. */ sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD); } diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 3b18085e3b10..5d4c15bf66d2 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -826,6 +826,7 @@ static int sctp_inet6_af_supported(sa_family_t family, struct sctp_sock *sp) case AF_INET: if (!__ipv6_only_sock(sctp_opt2sk(sp))) return 1; + /* fallthru */ default: return 0; } diff --git a/net/sctp/offload.c b/net/sctp/offload.c index 275925b93b29..35bc7106d182 100644 --- a/net/sctp/offload.c +++ b/net/sctp/offload.c @@ -45,6 +45,9 @@ static struct sk_buff *sctp_gso_segment(struct sk_buff *skb, struct sk_buff *segs = ERR_PTR(-EINVAL); struct sctphdr *sh; + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_SCTP)) + goto out; + sh = sctp_hdr(skb); if (!pskb_may_pull(skb, sizeof(*sh))) goto out; diff --git a/net/sctp/output.c b/net/sctp/output.c index 4a865cd06d76..01a26ee051e3 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -313,6 +313,7 @@ static enum sctp_xmit __sctp_packet_append_chunk(struct sctp_packet *packet, /* We believe that this chunk is OK to add to the packet */ switch (chunk->chunk_hdr->type) { case SCTP_CID_DATA: + case SCTP_CID_I_DATA: /* Account for the data being in the packet */ sctp_packet_append_data(packet, chunk); /* Disallow SACK bundling after DATA. */ @@ -724,7 +725,7 @@ static enum sctp_xmit sctp_packet_can_append_data(struct sctp_packet *packet, * or delay in hopes of bundling a full sized packet. */ if (chunk->skb->len + q->out_qlen > transport->pathmtu - - packet->overhead - sizeof(struct sctp_data_chunk) - 4) + packet->overhead - sctp_datachk_len(&chunk->asoc->stream) - 4) /* Enough data queued to fill a packet */ return SCTP_XMIT_OK; @@ -759,7 +760,7 @@ static void sctp_packet_append_data(struct sctp_packet *packet, asoc->peer.rwnd = rwnd; sctp_chunk_assign_tsn(chunk); - sctp_chunk_assign_ssn(chunk); + asoc->stream.si->assign_number(chunk); } static enum sctp_xmit sctp_packet_will_fit(struct sctp_packet *packet, diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 4db012aa25f7..f211b3db6a35 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -67,8 +67,6 @@ static void sctp_mark_missing(struct sctp_outq *q, __u32 highest_new_tsn, int count_of_newacks); -static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn); - static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp); /* Add data to the front of the queue. */ @@ -364,10 +362,12 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc, list_for_each_entry_safe(chk, temp, queue, transmitted_list) { struct sctp_stream_out *streamout; - if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || - chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive) + if (!chk->msg->abandoned && + (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || + chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)) continue; + chk->msg->abandoned = 1; list_del_init(&chk->transmitted_list); sctp_insert_list(&asoc->outqueue.abandoned, &chk->transmitted_list); @@ -377,7 +377,8 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc, asoc->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; streamout->ext->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; - if (!chk->tsn_gap_acked) { + if (queue != &asoc->outqueue.retransmit && + !chk->tsn_gap_acked) { if (chk->transport) chk->transport->flight_size -= sctp_data_size(chk); @@ -403,10 +404,13 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, q->sched->unsched_all(&asoc->stream); list_for_each_entry_safe(chk, temp, &q->out_chunk_list, list) { - if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || - chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive) + if (!chk->msg->abandoned && + (!(chk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG) || + !SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || + chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)) continue; + chk->msg->abandoned = 1; sctp_sched_dequeue_common(q, chk); asoc->sent_cnt_removable--; asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; @@ -585,7 +589,7 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport, * following the procedures outlined in C1 - C5. */ if (reason == SCTP_RTXR_T3_RTX) - sctp_generate_fwdtsn(q, q->asoc->ctsn_ack_point); + q->asoc->stream.si->generate_ftsn(q, q->asoc->ctsn_ack_point); /* Flush the queues only on timeout, since fast_rtx is only * triggered during sack processing and the queue @@ -912,9 +916,9 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) break; case SCTP_CID_ABORT: - if (sctp_test_T_bit(chunk)) { + if (sctp_test_T_bit(chunk)) packet->vtag = asoc->c.my_vtag; - } + /* fallthru */ /* The following chunks are "response" chunks, i.e. * they are generated in response to something we * received. If we are sending these, then we can @@ -936,6 +940,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) case SCTP_CID_ECN_ECNE: case SCTP_CID_ASCONF: case SCTP_CID_FWD_TSN: + case SCTP_CID_I_FWD_TSN: case SCTP_CID_RECONF: status = sctp_packet_transmit_chunk(packet, chunk, one_packet, gfp); @@ -950,7 +955,8 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) * sender MUST assure that at least one T3-rtx * timer is running. */ - if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN) { + if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN || + chunk->chunk_hdr->type == SCTP_CID_I_FWD_TSN) { sctp_transport_reset_t3_rtx(transport); transport->last_time_sent = jiffies; } @@ -1366,7 +1372,7 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk) asoc->peer.rwnd = sack_a_rwnd; - sctp_generate_fwdtsn(q, sack_ctsn); + asoc->stream.si->generate_ftsn(q, sack_ctsn); pr_debug("%s: sack cumulative tsn ack:0x%x\n", __func__, sack_ctsn); pr_debug("%s: cumulative tsn ack of assoc:%p is 0x%x, " @@ -1434,7 +1440,8 @@ static void sctp_check_transmitted(struct sctp_outq *q, /* If this chunk has not been acked, stop * considering it as 'outstanding'. */ - if (!tchunk->tsn_gap_acked) { + if (transmitted_queue != &q->retransmit && + !tchunk->tsn_gap_acked) { if (tchunk->transport) tchunk->transport->flight_size -= sctp_data_size(tchunk); @@ -1788,7 +1795,7 @@ static inline int sctp_get_skip_pos(struct sctp_fwdtsn_skip *skiplist, } /* Create and add a fwdtsn chunk to the outq's control queue if needed. */ -static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn) +void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn) { struct sctp_association *asoc = q->asoc; struct sctp_chunk *ftsn_chunk = NULL; diff --git a/net/sctp/probe.c b/net/sctp/probe.c deleted file mode 100644 index 1280f85a598d..000000000000 --- a/net/sctp/probe.c +++ /dev/null @@ -1,244 +0,0 @@ -/* - * sctp_probe - Observe the SCTP flow with kprobes. - * - * The idea for this came from Werner Almesberger's umlsim - * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org> - * - * Modified for SCTP from Stephen Hemminger's code - * Copyright (C) 2010, Wei Yongjun <yjwei@cn.fujitsu.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/kernel.h> -#include <linux/kprobes.h> -#include <linux/socket.h> -#include <linux/sctp.h> -#include <linux/proc_fs.h> -#include <linux/vmalloc.h> -#include <linux/module.h> -#include <linux/kfifo.h> -#include <linux/time.h> -#include <net/net_namespace.h> - -#include <net/sctp/sctp.h> -#include <net/sctp/sm.h> - -MODULE_SOFTDEP("pre: sctp"); -MODULE_AUTHOR("Wei Yongjun <yjwei@cn.fujitsu.com>"); -MODULE_DESCRIPTION("SCTP snooper"); -MODULE_LICENSE("GPL"); - -static int port __read_mostly = 0; -MODULE_PARM_DESC(port, "Port to match (0=all)"); -module_param(port, int, 0); - -static unsigned int fwmark __read_mostly = 0; -MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)"); -module_param(fwmark, uint, 0); - -static int bufsize __read_mostly = 64 * 1024; -MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)"); -module_param(bufsize, int, 0); - -static int full __read_mostly = 1; -MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)"); -module_param(full, int, 0); - -static const char procname[] = "sctpprobe"; - -static struct { - struct kfifo fifo; - spinlock_t lock; - wait_queue_head_t wait; - struct timespec64 tstart; -} sctpw; - -static __printf(1, 2) void printl(const char *fmt, ...) -{ - va_list args; - int len; - char tbuf[256]; - - va_start(args, fmt); - len = vscnprintf(tbuf, sizeof(tbuf), fmt, args); - va_end(args); - - kfifo_in_locked(&sctpw.fifo, tbuf, len, &sctpw.lock); - wake_up(&sctpw.wait); -} - -static int sctpprobe_open(struct inode *inode, struct file *file) -{ - kfifo_reset(&sctpw.fifo); - ktime_get_ts64(&sctpw.tstart); - - return 0; -} - -static ssize_t sctpprobe_read(struct file *file, char __user *buf, - size_t len, loff_t *ppos) -{ - int error = 0, cnt = 0; - unsigned char *tbuf; - - if (!buf) - return -EINVAL; - - if (len == 0) - return 0; - - tbuf = vmalloc(len); - if (!tbuf) - return -ENOMEM; - - error = wait_event_interruptible(sctpw.wait, - kfifo_len(&sctpw.fifo) != 0); - if (error) - goto out_free; - - cnt = kfifo_out_locked(&sctpw.fifo, tbuf, len, &sctpw.lock); - error = copy_to_user(buf, tbuf, cnt) ? -EFAULT : 0; - -out_free: - vfree(tbuf); - - return error ? error : cnt; -} - -static const struct file_operations sctpprobe_fops = { - .owner = THIS_MODULE, - .open = sctpprobe_open, - .read = sctpprobe_read, - .llseek = noop_llseek, -}; - -static enum sctp_disposition jsctp_sf_eat_sack( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) -{ - struct sctp_chunk *chunk = arg; - struct sk_buff *skb = chunk->skb; - struct sctp_transport *sp; - static __u32 lcwnd = 0; - struct timespec64 now; - - sp = asoc->peer.primary_path; - - if (((port == 0 && fwmark == 0) || - asoc->peer.port == port || - ep->base.bind_addr.port == port || - (fwmark > 0 && skb->mark == fwmark)) && - (full || sp->cwnd != lcwnd)) { - lcwnd = sp->cwnd; - - ktime_get_ts64(&now); - now = timespec64_sub(now, sctpw.tstart); - - printl("%lu.%06lu ", (unsigned long) now.tv_sec, - (unsigned long) now.tv_nsec / NSEC_PER_USEC); - - printl("%p %5d %5d %5d %8d %5d ", asoc, - ep->base.bind_addr.port, asoc->peer.port, - asoc->pathmtu, asoc->peer.rwnd, asoc->unack_data); - - list_for_each_entry(sp, &asoc->peer.transport_addr_list, - transports) { - if (sp == asoc->peer.primary_path) - printl("*"); - - printl("%pISc %2u %8u %8u %8u %8u %8u ", - &sp->ipaddr, sp->state, sp->cwnd, sp->ssthresh, - sp->flight_size, sp->partial_bytes_acked, - sp->pathmtu); - } - printl("\n"); - } - - jprobe_return(); - return 0; -} - -static struct jprobe sctp_recv_probe = { - .kp = { - .symbol_name = "sctp_sf_eat_sack_6_2", - }, - .entry = jsctp_sf_eat_sack, -}; - -static __init int sctp_setup_jprobe(void) -{ - int ret = register_jprobe(&sctp_recv_probe); - - if (ret) { - if (request_module("sctp")) - goto out; - ret = register_jprobe(&sctp_recv_probe); - } - -out: - return ret; -} - -static __init int sctpprobe_init(void) -{ - int ret = -ENOMEM; - - /* Warning: if the function signature of sctp_sf_eat_sack_6_2, - * has been changed, you also have to change the signature of - * jsctp_sf_eat_sack, otherwise you end up right here! - */ - BUILD_BUG_ON(__same_type(sctp_sf_eat_sack_6_2, - jsctp_sf_eat_sack) == 0); - - init_waitqueue_head(&sctpw.wait); - spin_lock_init(&sctpw.lock); - if (kfifo_alloc(&sctpw.fifo, bufsize, GFP_KERNEL)) - return ret; - - if (!proc_create(procname, S_IRUSR, init_net.proc_net, - &sctpprobe_fops)) - goto free_kfifo; - - ret = sctp_setup_jprobe(); - if (ret) - goto remove_proc; - - pr_info("probe registered (port=%d/fwmark=%u) bufsize=%u\n", - port, fwmark, bufsize); - return 0; - -remove_proc: - remove_proc_entry(procname, init_net.proc_net); -free_kfifo: - kfifo_free(&sctpw.fifo); - return ret; -} - -static __exit void sctpprobe_exit(void) -{ - kfifo_free(&sctpw.fifo); - remove_proc_entry(procname, init_net.proc_net); - unregister_jprobe(&sctp_recv_probe); -} - -module_init(sctpprobe_init); -module_exit(sctpprobe_exit); diff --git a/net/sctp/proc.c b/net/sctp/proc.c index 26b4be6b4172..537545ebcb0e 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -95,7 +95,6 @@ static int sctp_snmp_seq_open(struct inode *inode, struct file *file) } static const struct file_operations sctp_snmp_seq_fops = { - .owner = THIS_MODULE, .open = sctp_snmp_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -288,12 +287,8 @@ struct sctp_ht_iter { static void *sctp_transport_seq_start(struct seq_file *seq, loff_t *pos) { struct sctp_ht_iter *iter = seq->private; - int err = sctp_transport_walk_start(&iter->hti); - if (err) { - iter->start_fail = 1; - return ERR_PTR(err); - } + sctp_transport_walk_start(&iter->hti); iter->start_fail = 0; return sctp_transport_get_idx(seq_file_net(seq), &iter->hti, *pos); diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index f5172c21349b..6a38c2503649 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1499,6 +1499,7 @@ static __init int sctp_init(void) INIT_LIST_HEAD(&sctp_address_families); sctp_v4_pf_init(); sctp_v6_pf_init(); + sctp_sched_ops_init(); status = register_pernet_subsys(&sctp_defaults_ops); if (status) diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 9bf575f2e8ed..793b05ec692b 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -228,7 +228,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, struct sctp_inithdr init; union sctp_params addrs; struct sctp_sock *sp; - __u8 extensions[4]; + __u8 extensions[5]; size_t chunksize; __be16 types[2]; int num_ext = 0; @@ -278,6 +278,11 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, if (sp->adaptation_ind) chunksize += sizeof(aiparam); + if (sp->strm_interleave) { + extensions[num_ext] = SCTP_CID_I_DATA; + num_ext += 1; + } + chunksize += vparam_len; /* Account for AUTH related parameters */ @@ -392,7 +397,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, struct sctp_inithdr initack; union sctp_params addrs; struct sctp_sock *sp; - __u8 extensions[4]; + __u8 extensions[5]; size_t chunksize; int num_ext = 0; int cookie_len; @@ -442,6 +447,11 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, if (sp->adaptation_ind) chunksize += sizeof(aiparam); + if (asoc->intl_enable) { + extensions[num_ext] = SCTP_CID_I_DATA; + num_ext += 1; + } + if (asoc->peer.auth_capable) { auth_random = (struct sctp_paramhdr *)asoc->c.auth_random; chunksize += ntohs(auth_random->length); @@ -711,38 +721,31 @@ nodata: /* Make a DATA chunk for the given association from the provided * parameters. However, do not populate the data payload. */ -struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc, +struct sctp_chunk *sctp_make_datafrag_empty(const struct sctp_association *asoc, const struct sctp_sndrcvinfo *sinfo, - int data_len, __u8 flags, __u16 ssn, - gfp_t gfp) + int len, __u8 flags, gfp_t gfp) { struct sctp_chunk *retval; struct sctp_datahdr dp; - int chunk_len; /* We assign the TSN as LATE as possible, not here when * creating the chunk. */ - dp.tsn = 0; + memset(&dp, 0, sizeof(dp)); + dp.ppid = sinfo->sinfo_ppid; dp.stream = htons(sinfo->sinfo_stream); - dp.ppid = sinfo->sinfo_ppid; /* Set the flags for an unordered send. */ - if (sinfo->sinfo_flags & SCTP_UNORDERED) { + if (sinfo->sinfo_flags & SCTP_UNORDERED) flags |= SCTP_DATA_UNORDERED; - dp.ssn = 0; - } else - dp.ssn = htons(ssn); - chunk_len = sizeof(dp) + data_len; - retval = sctp_make_data(asoc, flags, chunk_len, gfp); + retval = sctp_make_data(asoc, flags, sizeof(dp) + len, gfp); if (!retval) - goto nodata; + return NULL; retval->subh.data_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp); memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo)); -nodata: return retval; } @@ -1273,7 +1276,6 @@ struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc) struct sctp_authhdr auth_hdr; struct sctp_hmac *hmac_desc; struct sctp_chunk *retval; - __u8 *hmac; /* Get the first hmac that the peer told us to use */ hmac_desc = sctp_auth_asoc_get_hmac(asoc); @@ -1292,7 +1294,7 @@ struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc) retval->subh.auth_hdr = sctp_addto_chunk(retval, sizeof(auth_hdr), &auth_hdr); - hmac = skb_put_zero(retval->skb, hmac_desc->hmac_len); + skb_put_zero(retval->skb, hmac_desc->hmac_len); /* Adjust the chunk header to include the empty MAC */ retval->chunk_hdr->length = @@ -1415,6 +1417,12 @@ static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc, return _sctp_make_chunk(asoc, SCTP_CID_DATA, flags, paylen, gfp); } +struct sctp_chunk *sctp_make_idata(const struct sctp_association *asoc, + __u8 flags, int paylen, gfp_t gfp) +{ + return _sctp_make_chunk(asoc, SCTP_CID_I_DATA, flags, paylen, gfp); +} + static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc, __u8 type, __u8 flags, int paylen, gfp_t gfp) @@ -2032,6 +2040,10 @@ static void sctp_process_ext_param(struct sctp_association *asoc, if (net->sctp.addip_enable) asoc->peer.asconf_capable = 1; break; + case SCTP_CID_I_DATA: + if (sctp_sk(asoc->base.sk)->strm_interleave) + asoc->intl_enable = 1; + break; default: break; } @@ -3523,6 +3535,30 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc, return retval; } +struct sctp_chunk *sctp_make_ifwdtsn(const struct sctp_association *asoc, + __u32 new_cum_tsn, size_t nstreams, + struct sctp_ifwdtsn_skip *skiplist) +{ + struct sctp_chunk *retval = NULL; + struct sctp_ifwdtsn_hdr ftsn_hdr; + size_t hint; + + hint = (nstreams + 1) * sizeof(__u32); + + retval = sctp_make_control(asoc, SCTP_CID_I_FWD_TSN, 0, hint, + GFP_ATOMIC); + if (!retval) + return NULL; + + ftsn_hdr.new_cum_tsn = htonl(new_cum_tsn); + retval->subh.ifwdtsn_hdr = + sctp_addto_chunk(retval, sizeof(ftsn_hdr), &ftsn_hdr); + + sctp_addto_chunk(retval, nstreams * sizeof(skiplist[0]), skiplist); + + return retval; +} + /* RE-CONFIG 3.1 (RE-CONFIG chunk) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index df94d77401e7..b71e7fb0a20a 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -632,7 +632,7 @@ static void sctp_cmd_assoc_failed(struct sctp_cmd_seq *commands, struct sctp_chunk *abort; /* Cancel any partial delivery in progress. */ - sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC); + asoc->stream.si->abort_pd(&asoc->ulpq, GFP_ATOMIC); if (event_type == SCTP_EVENT_T_CHUNK && subtype.chunk == SCTP_CID_ABORT) event = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_LOST, @@ -878,12 +878,12 @@ static void sctp_cmd_new_state(struct sctp_cmd_seq *cmds, * successfully completed a connect() call. */ if (sctp_state(asoc, ESTABLISHED) && sctp_sstate(sk, CLOSED)) - sk->sk_state = SCTP_SS_ESTABLISHED; + inet_sk_set_state(sk, SCTP_SS_ESTABLISHED); /* Set the RCV_SHUTDOWN flag when a SHUTDOWN is received. */ if (sctp_state(asoc, SHUTDOWN_RECEIVED) && sctp_sstate(sk, ESTABLISHED)) { - sk->sk_state = SCTP_SS_CLOSING; + inet_sk_set_state(sk, SCTP_SS_CLOSING); sk->sk_shutdown |= RCV_SHUTDOWN; } } @@ -972,7 +972,7 @@ static void sctp_cmd_process_operr(struct sctp_cmd_seq *cmds, if (!ev) return; - sctp_ulpq_tail_event(&asoc->ulpq, ev); + asoc->stream.si->enqueue_event(&asoc->ulpq, ev); switch (err_hdr->cause) { case SCTP_ERROR_UNKNOWN_CHUNK: @@ -1007,18 +1007,6 @@ static void sctp_cmd_process_operr(struct sctp_cmd_seq *cmds, } } -/* Process variable FWDTSN chunk information. */ -static void sctp_cmd_process_fwdtsn(struct sctp_ulpq *ulpq, - struct sctp_chunk *chunk) -{ - struct sctp_fwdtsn_skip *skip; - - /* Walk through all the skipped SSNs */ - sctp_walk_fwdtsn(skip, chunk) { - sctp_ulpq_skip(ulpq, ntohs(skip->stream), ntohs(skip->ssn)); - } -} - /* Helper function to remove the association non-primary peer * transports. */ @@ -1058,7 +1046,7 @@ static void sctp_cmd_assoc_change(struct sctp_cmd_seq *commands, asoc->c.sinit_max_instreams, NULL, GFP_ATOMIC); if (ev) - sctp_ulpq_tail_event(&asoc->ulpq, ev); + asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } /* Helper function to generate an adaptation indication event */ @@ -1070,7 +1058,7 @@ static void sctp_cmd_adaptation_ind(struct sctp_cmd_seq *commands, ev = sctp_ulpevent_make_adaptation_indication(asoc, GFP_ATOMIC); if (ev) - sctp_ulpq_tail_event(&asoc->ulpq, ev); + asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } @@ -1368,18 +1356,12 @@ static int sctp_cmd_interpreter(enum sctp_event event_type, break; case SCTP_CMD_REPORT_FWDTSN: - /* Move the Cumulattive TSN Ack ahead. */ - sctp_tsnmap_skip(&asoc->peer.tsn_map, cmd->obj.u32); - - /* purge the fragmentation queue */ - sctp_ulpq_reasm_flushtsn(&asoc->ulpq, cmd->obj.u32); - - /* Abort any in progress partial delivery. */ - sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC); + asoc->stream.si->report_ftsn(&asoc->ulpq, cmd->obj.u32); break; case SCTP_CMD_PROCESS_FWDTSN: - sctp_cmd_process_fwdtsn(&asoc->ulpq, cmd->obj.chunk); + asoc->stream.si->handle_ftsn(&asoc->ulpq, + cmd->obj.chunk); break; case SCTP_CMD_GEN_SACK: @@ -1483,8 +1465,9 @@ static int sctp_cmd_interpreter(enum sctp_event event_type, pr_debug("%s: sm_sideff: chunk_up:%p, ulpq:%p\n", __func__, cmd->obj.chunk, &asoc->ulpq); - sctp_ulpq_tail_data(&asoc->ulpq, cmd->obj.chunk, - GFP_ATOMIC); + asoc->stream.si->ulpevent_data(&asoc->ulpq, + cmd->obj.chunk, + GFP_ATOMIC); break; case SCTP_CMD_EVENT_ULP: @@ -1492,7 +1475,8 @@ static int sctp_cmd_interpreter(enum sctp_event event_type, pr_debug("%s: sm_sideff: event_up:%p, ulpq:%p\n", __func__, cmd->obj.ulpevent, &asoc->ulpq); - sctp_ulpq_tail_event(&asoc->ulpq, cmd->obj.ulpevent); + asoc->stream.si->enqueue_event(&asoc->ulpq, + cmd->obj.ulpevent); break; case SCTP_CMD_REPLY: @@ -1729,12 +1713,13 @@ static int sctp_cmd_interpreter(enum sctp_event event_type, break; case SCTP_CMD_PART_DELIVER: - sctp_ulpq_partial_delivery(&asoc->ulpq, GFP_ATOMIC); + asoc->stream.si->start_pd(&asoc->ulpq, GFP_ATOMIC); break; case SCTP_CMD_RENEGE: - sctp_ulpq_renege(&asoc->ulpq, cmd->obj.chunk, - GFP_ATOMIC); + asoc->stream.si->renege_events(&asoc->ulpq, + cmd->obj.chunk, + GFP_ATOMIC); break; case SCTP_CMD_SETUP_T4: diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 8f8ccded13e4..eb7905ffe5f2 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -59,6 +59,9 @@ #include <net/sctp/sm.h> #include <net/sctp/structs.h> +#define CREATE_TRACE_POINTS +#include <trace/events/sctp.h> + static struct sctp_packet *sctp_abort_pkt_new( struct net *net, const struct sctp_endpoint *ep, @@ -3013,7 +3016,7 @@ enum sctp_disposition sctp_sf_eat_data_6_2(struct net *net, return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); } - if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_data_chunk))) + if (!sctp_chunk_length_valid(chunk, sctp_datachk_len(&asoc->stream))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); @@ -3034,7 +3037,7 @@ enum sctp_disposition sctp_sf_eat_data_6_2(struct net *net, case SCTP_IERROR_PROTO_VIOLATION: return sctp_sf_abort_violation(net, ep, asoc, chunk, commands, (u8 *)chunk->subh.data_hdr, - sizeof(struct sctp_datahdr)); + sctp_datahdr_len(&asoc->stream)); default: BUG(); } @@ -3133,7 +3136,7 @@ enum sctp_disposition sctp_sf_eat_data_fast_4_4( return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); } - if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_data_chunk))) + if (!sctp_chunk_length_valid(chunk, sctp_datachk_len(&asoc->stream))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); @@ -3150,7 +3153,7 @@ enum sctp_disposition sctp_sf_eat_data_fast_4_4( case SCTP_IERROR_PROTO_VIOLATION: return sctp_sf_abort_violation(net, ep, asoc, chunk, commands, (u8 *)chunk->subh.data_hdr, - sizeof(struct sctp_datahdr)); + sctp_datahdr_len(&asoc->stream)); default: BUG(); } @@ -3219,6 +3222,8 @@ enum sctp_disposition sctp_sf_eat_sack_6_2(struct net *net, struct sctp_sackhdr *sackh; __u32 ctsn; + trace_sctp_probe(ep, asoc, chunk); + if (!sctp_vtag_verify(chunk, asoc)) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); @@ -3957,7 +3962,6 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn(struct net *net, { struct sctp_fwdtsn_hdr *fwdtsn_hdr; struct sctp_chunk *chunk = arg; - struct sctp_fwdtsn_skip *skip; __u16 len; __u32 tsn; @@ -3971,7 +3975,7 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn(struct net *net, return sctp_sf_unk_chunk(net, ep, asoc, type, arg, commands); /* Make sure that the FORWARD_TSN chunk has valid length. */ - if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk))) + if (!sctp_chunk_length_valid(chunk, sctp_ftsnchk_len(&asoc->stream))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); @@ -3990,14 +3994,11 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn(struct net *net, if (sctp_tsnmap_check(&asoc->peer.tsn_map, tsn) < 0) goto discard_noforce; - /* Silently discard the chunk if stream-id is not valid */ - sctp_walk_fwdtsn(skip, chunk) { - if (ntohs(skip->stream) >= asoc->stream.incnt) - goto discard_noforce; - } + if (!asoc->stream.si->validate_ftsn(chunk)) + goto discard_noforce; sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_FWDTSN, SCTP_U32(tsn)); - if (len > sizeof(struct sctp_fwdtsn_hdr)) + if (len > sctp_ftsnhdr_len(&asoc->stream)) sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_FWDTSN, SCTP_CHUNK(chunk)); @@ -4028,7 +4029,6 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn_fast( { struct sctp_fwdtsn_hdr *fwdtsn_hdr; struct sctp_chunk *chunk = arg; - struct sctp_fwdtsn_skip *skip; __u16 len; __u32 tsn; @@ -4042,7 +4042,7 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn_fast( return sctp_sf_unk_chunk(net, ep, asoc, type, arg, commands); /* Make sure that the FORWARD_TSN chunk has a valid length. */ - if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk))) + if (!sctp_chunk_length_valid(chunk, sctp_ftsnchk_len(&asoc->stream))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); @@ -4061,14 +4061,11 @@ enum sctp_disposition sctp_sf_eat_fwd_tsn_fast( if (sctp_tsnmap_check(&asoc->peer.tsn_map, tsn) < 0) goto gen_shutdown; - /* Silently discard the chunk if stream-id is not valid */ - sctp_walk_fwdtsn(skip, chunk) { - if (ntohs(skip->stream) >= asoc->stream.incnt) - goto gen_shutdown; - } + if (!asoc->stream.si->validate_ftsn(chunk)) + goto gen_shutdown; sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_FWDTSN, SCTP_U32(tsn)); - if (len > sizeof(struct sctp_fwdtsn_hdr)) + if (len > sctp_ftsnhdr_len(&asoc->stream)) sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_FWDTSN, SCTP_CHUNK(chunk)); @@ -6244,14 +6241,12 @@ static int sctp_eat_data(const struct sctp_association *asoc, struct sctp_chunk *err; enum sctp_verb deliver; size_t datalen; - u8 ordered = 0; - u16 ssn, sid; __u32 tsn; int tmp; data_hdr = (struct sctp_datahdr *)chunk->skb->data; chunk->subh.data_hdr = data_hdr; - skb_pull(chunk->skb, sizeof(*data_hdr)); + skb_pull(chunk->skb, sctp_datahdr_len(&asoc->stream)); tsn = ntohl(data_hdr->tsn); pr_debug("%s: TSN 0x%x\n", __func__, tsn); @@ -6299,7 +6294,7 @@ static int sctp_eat_data(const struct sctp_association *asoc, * Actually, allow a little bit of overflow (up to a MTU). */ datalen = ntohs(chunk->chunk_hdr->length); - datalen -= sizeof(struct sctp_data_chunk); + datalen -= sctp_datachk_len(&asoc->stream); deliver = SCTP_CMD_CHUNK_ULP; @@ -6394,7 +6389,6 @@ static int sctp_eat_data(const struct sctp_association *asoc, SCTP_INC_STATS(net, SCTP_MIB_INORDERCHUNKS); if (chunk->asoc) chunk->asoc->stats.iodchunks++; - ordered = 1; } /* RFC 2960 6.5 Stream Identifier and Stream Sequence Number @@ -6405,8 +6399,7 @@ static int sctp_eat_data(const struct sctp_association *asoc, * with cause set to "Invalid Stream Identifier" (See Section 3.3.10) * and discard the DATA chunk. */ - sid = ntohs(data_hdr->stream); - if (sid >= asoc->stream.incnt) { + if (ntohs(data_hdr->stream) >= asoc->stream.incnt) { /* Mark tsn as received even though we drop it */ sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_TSN, SCTP_U32(tsn)); @@ -6427,8 +6420,7 @@ static int sctp_eat_data(const struct sctp_association *asoc, * SSN is smaller then the next expected one. If it is, it wrapped * and is invalid. */ - ssn = ntohs(data_hdr->ssn); - if (ordered && SSN_lt(ssn, sctp_ssn_peek(&asoc->stream, in, sid))) + if (!asoc->stream.si->validate_data(chunk)) return SCTP_IERROR_PROTO_VIOLATION; /* Send the data up to the user. Note: Schedule the diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c index 79b6bee5b768..691d9dc620e3 100644 --- a/net/sctp/sm_statetable.c +++ b/net/sctp/sm_statetable.c @@ -985,11 +985,14 @@ static const struct sctp_sm_table_entry *sctp_chunk_event_lookup( if (state > SCTP_STATE_MAX) return &bug; + if (cid == SCTP_CID_I_DATA) + cid = SCTP_CID_DATA; + if (cid <= SCTP_CID_BASE_MAX) return &chunk_event_table[cid][state]; if (net->sctp.prsctp_enable) { - if (cid == SCTP_CID_FWD_TSN) + if (cid == SCTP_CID_FWD_TSN || cid == SCTP_CID_I_FWD_TSN) return &prsctp_chunk_event_table[0][state]; } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 3204a9b29407..356e387f82e7 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -85,7 +85,7 @@ static int sctp_writeable(struct sock *sk); static void sctp_wfree(struct sk_buff *skb); static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, - size_t msg_len, struct sock **orig_sk); + size_t msg_len); static int sctp_wait_for_packet(struct sock *sk, int *err, long *timeo_p); static int sctp_wait_for_connect(struct sctp_association *, long *timeo_p); static int sctp_wait_for_accept(struct sock *sk, long timeo); @@ -188,19 +188,35 @@ static void sctp_for_each_tx_datachunk(struct sctp_association *asoc, list_for_each_entry(chunk, &t->transmitted, transmitted_list) cb(chunk); - list_for_each_entry(chunk, &q->retransmit, list) + list_for_each_entry(chunk, &q->retransmit, transmitted_list) cb(chunk); - list_for_each_entry(chunk, &q->sacked, list) + list_for_each_entry(chunk, &q->sacked, transmitted_list) cb(chunk); - list_for_each_entry(chunk, &q->abandoned, list) + list_for_each_entry(chunk, &q->abandoned, transmitted_list) cb(chunk); list_for_each_entry(chunk, &q->out_chunk_list, list) cb(chunk); } +static void sctp_for_each_rx_skb(struct sctp_association *asoc, struct sock *sk, + void (*cb)(struct sk_buff *, struct sock *)) + +{ + struct sk_buff *skb, *tmp; + + sctp_skb_for_each(skb, &asoc->ulpq.lobby, tmp) + cb(skb, sk); + + sctp_skb_for_each(skb, &asoc->ulpq.reasm, tmp) + cb(skb, sk); + + sctp_skb_for_each(skb, &asoc->ulpq.reasm_uo, tmp) + cb(skb, sk); +} + /* Verify that this is a valid address. */ static inline int sctp_verify_addr(struct sock *sk, union sctp_addr *addr, int len) @@ -335,16 +351,14 @@ static struct sctp_af *sctp_sockaddr_af(struct sctp_sock *opt, if (len < sizeof (struct sockaddr)) return NULL; + if (!opt->pf->af_supported(addr->sa.sa_family, opt)) + return NULL; + /* V4 mapped address are really of AF_INET family */ if (addr->sa.sa_family == AF_INET6 && - ipv6_addr_v4mapped(&addr->v6.sin6_addr)) { - if (!opt->pf->af_supported(AF_INET, opt)) - return NULL; - } else { - /* Does this PF support this AF? */ - if (!opt->pf->af_supported(addr->sa.sa_family, opt)) - return NULL; - } + ipv6_addr_v4mapped(&addr->v6.sin6_addr) && + !opt->pf->af_supported(AF_INET, opt)) + return NULL; /* If we get this far, af is valid. */ af = sctp_get_af_specific(addr->sa.sa_family); @@ -970,13 +984,6 @@ int sctp_asconf_mgmt(struct sctp_sock *sp, struct sctp_sockaddr_entry *addrw) * This is used for tunneling the sctp_bindx() request through sctp_setsockopt() * from userspace. * - * We don't use copy_from_user() for optimization: we first do the - * sanity checks (buffer size -fast- and access check-healthy - * pointer); if all of those succeed, then we can alloc the memory - * (expensive operation) needed to copy the data to kernel. Then we do - * the copying without checking the user space area - * (__copy_from_user()). - * * On exit there is no need to do sockfd_put(), sys_setsockopt() does * it. * @@ -1006,25 +1013,15 @@ static int sctp_setsockopt_bindx(struct sock *sk, if (unlikely(addrs_size <= 0)) return -EINVAL; - /* Check the user passed a healthy pointer. */ - if (unlikely(!access_ok(VERIFY_READ, addrs, addrs_size))) - return -EFAULT; - - /* Alloc space for the address array in kernel memory. */ - kaddrs = kmalloc(addrs_size, GFP_USER | __GFP_NOWARN); - if (unlikely(!kaddrs)) - return -ENOMEM; - - if (__copy_from_user(kaddrs, addrs, addrs_size)) { - kfree(kaddrs); - return -EFAULT; - } + kaddrs = vmemdup_user(addrs, addrs_size); + if (unlikely(IS_ERR(kaddrs))) + return PTR_ERR(kaddrs); /* Walk through the addrs buffer and count the number of addresses. */ addr_buf = kaddrs; while (walk_size < addrs_size) { if (walk_size + sizeof(sa_family_t) > addrs_size) { - kfree(kaddrs); + kvfree(kaddrs); return -EINVAL; } @@ -1035,7 +1032,7 @@ static int sctp_setsockopt_bindx(struct sock *sk, * causes the address buffer to overflow return EINVAL. */ if (!af || (walk_size + af->sockaddr_len) > addrs_size) { - kfree(kaddrs); + kvfree(kaddrs); return -EINVAL; } addrcnt++; @@ -1065,7 +1062,7 @@ static int sctp_setsockopt_bindx(struct sock *sk, } out: - kfree(kaddrs); + kvfree(kaddrs); return err; } @@ -1323,13 +1320,6 @@ out_free: * land and invoking either sctp_connectx(). This is used for tunneling * the sctp_connectx() request through sctp_setsockopt() from userspace. * - * We don't use copy_from_user() for optimization: we first do the - * sanity checks (buffer size -fast- and access check-healthy - * pointer); if all of those succeed, then we can alloc the memory - * (expensive operation) needed to copy the data to kernel. Then we do - * the copying without checking the user space area - * (__copy_from_user()). - * * On exit there is no need to do sockfd_put(), sys_setsockopt() does * it. * @@ -1345,7 +1335,6 @@ static int __sctp_setsockopt_connectx(struct sock *sk, sctp_assoc_t *assoc_id) { struct sockaddr *kaddrs; - gfp_t gfp = GFP_KERNEL; int err = 0; pr_debug("%s: sk:%p addrs:%p addrs_size:%d\n", @@ -1354,24 +1343,12 @@ static int __sctp_setsockopt_connectx(struct sock *sk, if (unlikely(addrs_size <= 0)) return -EINVAL; - /* Check the user passed a healthy pointer. */ - if (unlikely(!access_ok(VERIFY_READ, addrs, addrs_size))) - return -EFAULT; + kaddrs = vmemdup_user(addrs, addrs_size); + if (unlikely(IS_ERR(kaddrs))) + return PTR_ERR(kaddrs); - /* Alloc space for the address array in kernel memory. */ - if (sk->sk_socket->file) - gfp = GFP_USER | __GFP_NOWARN; - kaddrs = kmalloc(addrs_size, gfp); - if (unlikely(!kaddrs)) - return -ENOMEM; - - if (__copy_from_user(kaddrs, addrs, addrs_size)) { - err = -EFAULT; - } else { - err = __sctp_connect(sk, kaddrs, addrs_size, assoc_id); - } - - kfree(kaddrs); + err = __sctp_connect(sk, kaddrs, addrs_size, assoc_id); + kvfree(kaddrs); return err; } @@ -1528,7 +1505,7 @@ static void sctp_close(struct sock *sk, long timeout) lock_sock_nested(sk, SINGLE_DEPTH_NESTING); sk->sk_shutdown = SHUTDOWN_MASK; - sk->sk_state = SCTP_SS_CLOSING; + inet_sk_set_state(sk, SCTP_SS_CLOSING); ep = sctp_sk(sk)->ep; @@ -1554,6 +1531,7 @@ static void sctp_close(struct sock *sk, long timeout) if (data_was_unread || !skb_queue_empty(&asoc->ulpq.lobby) || !skb_queue_empty(&asoc->ulpq.reasm) || + !skb_queue_empty(&asoc->ulpq.reasm_uo) || (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime)) { struct sctp_chunk *chunk; @@ -1883,8 +1861,14 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) */ if (sinit) { if (sinit->sinit_num_ostreams) { - asoc->c.sinit_num_ostreams = - sinit->sinit_num_ostreams; + __u16 outcnt = sinit->sinit_num_ostreams; + + asoc->c.sinit_num_ostreams = outcnt; + /* outcnt has been changed, so re-init stream */ + err = sctp_stream_init(&asoc->stream, outcnt, 0, + GFP_KERNEL); + if (err) + goto out_free; } if (sinit->sinit_max_instreams) { asoc->c.sinit_max_instreams = @@ -1971,7 +1955,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); if (!sctp_wspace(asoc)) { /* sk can be changed by peel off when waiting for buf. */ - err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len, &sk); + err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); if (err) { if (err == -ESRCH) { /* asoc is already dead. */ @@ -2002,7 +1986,20 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) if (err < 0) goto out_free; - wait_connect = true; + /* If stream interleave is enabled, wait_connect has to be + * done earlier than data enqueue, as it needs to make data + * or idata according to asoc->intl_enable which is set + * after connection is done. + */ + if (sctp_sk(asoc->base.sk)->strm_interleave) { + timeo = sock_sndtimeo(sk, 0); + err = sctp_wait_for_connect(asoc, &timeo); + if (err) + goto out_unlock; + } else { + wait_connect = true; + } + pr_debug("%s: we associated primitively\n", __func__); } @@ -2277,11 +2274,11 @@ static int sctp_setsockopt_events(struct sock *sk, char __user *optval, if (asoc && sctp_outq_is_empty(&asoc->outqueue)) { event = sctp_ulpevent_make_sender_dry_event(asoc, - GFP_ATOMIC); + GFP_USER | __GFP_NOWARN); if (!event) return -ENOMEM; - sctp_ulpq_tail_event(&asoc->ulpq, event); + asoc->stream.si->enqueue_event(&asoc->ulpq, event); } } @@ -3180,7 +3177,7 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned if (val == 0) { val = asoc->pathmtu - sp->pf->af->net_header_len; val -= sizeof(struct sctphdr) + - sizeof(struct sctp_data_chunk); + sctp_datachk_len(&asoc->stream); } asoc->user_frag = val; asoc->frag_point = sctp_frag_point(asoc, asoc->pathmtu); @@ -3350,7 +3347,10 @@ static int sctp_setsockopt_fragment_interleave(struct sock *sk, if (get_user(val, (int __user *)optval)) return -EFAULT; - sctp_sk(sk)->frag_interleave = (val == 0) ? 0 : 1; + sctp_sk(sk)->frag_interleave = !!val; + + if (!sctp_sk(sk)->frag_interleave) + sctp_sk(sk)->strm_interleave = 0; return 0; } @@ -3498,6 +3498,8 @@ static int sctp_setsockopt_hmac_ident(struct sock *sk, if (optlen < sizeof(struct sctp_hmacalgo)) return -EINVAL; + optlen = min_t(unsigned int, optlen, sizeof(struct sctp_hmacalgo) + + SCTP_AUTH_NUM_HMACS * sizeof(u16)); hmacs = memdup_user(optval, optlen); if (IS_ERR(hmacs)) @@ -3536,6 +3538,11 @@ static int sctp_setsockopt_auth_key(struct sock *sk, if (optlen <= sizeof(struct sctp_authkey)) return -EINVAL; + /* authkey->sca_keylength is u16, so optlen can't be bigger than + * this. + */ + optlen = min_t(unsigned int, optlen, USHRT_MAX + + sizeof(struct sctp_authkey)); authkey = memdup_user(optval, optlen); if (IS_ERR(authkey)) @@ -3891,13 +3898,20 @@ static int sctp_setsockopt_reset_streams(struct sock *sk, struct sctp_association *asoc; int retval = -EINVAL; - if (optlen < sizeof(struct sctp_reset_streams)) + if (optlen < sizeof(*params)) return -EINVAL; + /* srs_number_streams is u16, so optlen can't be bigger than this. */ + optlen = min_t(unsigned int, optlen, USHRT_MAX + + sizeof(__u16) * sizeof(*params)); params = memdup_user(optval, optlen); if (IS_ERR(params)) return PTR_ERR(params); + if (params->srs_number_streams * sizeof(__u16) > + optlen - sizeof(*params)) + goto out; + asoc = sctp_id2assoc(sk, params->srs_assoc_id); if (!asoc) goto out; @@ -4019,6 +4033,40 @@ out: return retval; } +static int sctp_setsockopt_interleaving_supported(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_sock *sp = sctp_sk(sk); + struct net *net = sock_net(sk); + struct sctp_assoc_value params; + int retval = -EINVAL; + + if (optlen < sizeof(params)) + goto out; + + optlen = sizeof(params); + if (copy_from_user(¶ms, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + if (params.assoc_id) + goto out; + + if (!net->sctp.intl_enable || !sp->frag_interleave) { + retval = -EPERM; + goto out; + } + + sp->strm_interleave = !!params.assoc_value; + + retval = 0; + +out: + return retval; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -4206,6 +4254,10 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_STREAM_SCHEDULER_VALUE: retval = sctp_setsockopt_scheduler_value(sk, optval, optlen); break; + case SCTP_INTERLEAVING_SUPPORTED: + retval = sctp_setsockopt_interleaving_supported(sk, optval, + optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -4494,7 +4546,7 @@ static int sctp_init_sock(struct sock *sk) SCTP_DBG_OBJCNT_INC(sock); local_bh_disable(); - percpu_counter_inc(&sctp_sockets_allocated); + sk_sockets_allocated_inc(sk); sock_prot_inuse_add(net, sk->sk_prot, 1); /* Nothing can fail after this block, otherwise @@ -4538,7 +4590,7 @@ static void sctp_destroy_sock(struct sock *sk) } sctp_endpoint_free(sp->ep); local_bh_disable(); - percpu_counter_dec(&sctp_sockets_allocated); + sk_sockets_allocated_dec(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); local_bh_enable(); } @@ -4582,7 +4634,7 @@ static void sctp_shutdown(struct sock *sk, int how) if (how & SEND_SHUTDOWN && !list_empty(&ep->asocs)) { struct sctp_association *asoc; - sk->sk_state = SCTP_SS_CLOSING; + inet_sk_set_state(sk, SCTP_SS_CLOSING); asoc = list_entry(ep->asocs.next, struct sctp_association, asocs); sctp_primitive_SHUTDOWN(net, asoc, NULL); @@ -4676,20 +4728,11 @@ int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc, EXPORT_SYMBOL_GPL(sctp_get_sctp_info); /* use callback to avoid exporting the core structure */ -int sctp_transport_walk_start(struct rhashtable_iter *iter) +void sctp_transport_walk_start(struct rhashtable_iter *iter) { - int err; - rhltable_walk_enter(&sctp_transport_hashtable, iter); - err = rhashtable_walk_start(iter); - if (err && err != -EAGAIN) { - rhashtable_walk_stop(iter); - rhashtable_walk_exit(iter); - return err; - } - - return 0; + rhashtable_walk_start(iter); } void sctp_transport_walk_stop(struct rhashtable_iter *iter) @@ -4783,9 +4826,8 @@ int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *), int ret; again: - ret = sctp_transport_walk_start(&hti); - if (ret) - return ret; + ret = 0; + sctp_transport_walk_start(&hti); tsp = sctp_transport_get_idx(net, &hti, *pos + 1); for (; !IS_ERR_OR_NULL(tsp); tsp = sctp_transport_get_next(net, &hti)) { @@ -5011,7 +5053,7 @@ static int sctp_getsockopt_autoclose(struct sock *sk, int len, char __user *optv len = sizeof(int); if (put_user(len, optlen)) return -EFAULT; - if (copy_to_user(optval, &sctp_sk(sk)->autoclose, sizeof(int))) + if (copy_to_user(optval, &sctp_sk(sk)->autoclose, len)) return -EFAULT; return 0; } @@ -5080,7 +5122,6 @@ static int sctp_getsockopt_peeloff_common(struct sock *sk, sctp_peeloff_arg_t *p *newfile = sock_alloc_file(newsock, 0, NULL); if (IS_ERR(*newfile)) { put_unused_fd(retval); - sock_release(newsock); retval = PTR_ERR(*newfile); *newfile = NULL; return retval; @@ -5642,6 +5683,9 @@ copy_getaddrs: err = -EFAULT; goto out; } + /* XXX: We should have accounted for sizeof(struct sctp_getaddrs) too, + * but we can't change it anymore. + */ if (put_user(bytes_copied, optlen)) err = -EFAULT; out: @@ -6078,7 +6122,7 @@ static int sctp_getsockopt_maxseg(struct sock *sk, int len, params.assoc_id = 0; } else if (len >= sizeof(struct sctp_assoc_value)) { len = sizeof(struct sctp_assoc_value); - if (copy_from_user(¶ms, optval, sizeof(params))) + if (copy_from_user(¶ms, optval, len)) return -EFAULT; } else return -EINVAL; @@ -6248,7 +6292,9 @@ static int sctp_getsockopt_active_key(struct sock *sk, int len, if (len < sizeof(struct sctp_authkeyid)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(struct sctp_authkeyid))) + + len = sizeof(struct sctp_authkeyid); + if (copy_from_user(&val, optval, len)) return -EFAULT; asoc = sctp_id2assoc(sk, val.scact_assoc_id); @@ -6260,7 +6306,6 @@ static int sctp_getsockopt_active_key(struct sock *sk, int len, else val.scact_keynumber = ep->active_key_id; - len = sizeof(struct sctp_authkeyid); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) @@ -6286,7 +6331,7 @@ static int sctp_getsockopt_peer_auth_chunks(struct sock *sk, int len, if (len < sizeof(struct sctp_authchunks)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(struct sctp_authchunks))) + if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; to = p->gauth_chunks; @@ -6331,7 +6376,7 @@ static int sctp_getsockopt_local_auth_chunks(struct sock *sk, int len, if (len < sizeof(struct sctp_authchunks)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(struct sctp_authchunks))) + if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; to = p->gauth_chunks; @@ -6981,6 +7026,47 @@ out: return retval; } +static int sctp_getsockopt_interleaving_supported(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(¶ms, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (asoc) { + params.assoc_value = asoc->intl_enable; + } else if (!params.assoc_id) { + struct sctp_sock *sp = sctp_sk(sk); + + params.assoc_value = sp->strm_interleave; + } else { + retval = -EINVAL; + goto out; + } + + if (put_user(len, optlen)) + goto out; + + if (copy_to_user(optval, ¶ms, len)) + goto out; + + retval = 0; + +out: + return retval; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -7171,6 +7257,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_scheduler_value(sk, len, optval, optlen); break; + case SCTP_INTERLEAVING_SUPPORTED: + retval = sctp_getsockopt_interleaving_supported(sk, len, optval, + optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -7405,13 +7495,13 @@ static int sctp_listen_start(struct sock *sk, int backlog) * sockets. * */ - sk->sk_state = SCTP_SS_LISTENING; + inet_sk_set_state(sk, SCTP_SS_LISTENING); if (!ep->base.bind_addr.port) { if (sctp_autobind(sk)) return -EAGAIN; } else { if (sctp_get_port(sk, inet_sk(sk)->inet_num)) { - sk->sk_state = SCTP_SS_CLOSED; + inet_sk_set_state(sk, SCTP_SS_CLOSED); return -EADDRINUSE; } } @@ -7497,11 +7587,11 @@ out: * here, again, by modeling the current TCP/UDP code. We don't have * a good way to test with it yet. */ -unsigned int sctp_poll(struct file *file, struct socket *sock, poll_table *wait) +__poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct sctp_sock *sp = sctp_sk(sk); - unsigned int mask; + __poll_t mask; poll_wait(file, sk_sleep(sk), wait); @@ -7999,12 +8089,12 @@ void sctp_sock_rfree(struct sk_buff *skb) /* Helper function to wait for space in the sndbuf. */ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, - size_t msg_len, struct sock **orig_sk) + size_t msg_len) { struct sock *sk = asoc->base.sk; - int err = 0; long current_timeo = *timeo_p; DEFINE_WAIT(wait); + int err = 0; pr_debug("%s: asoc:%p, timeo:%ld, msg_len:%zu\n", __func__, asoc, *timeo_p, msg_len); @@ -8033,17 +8123,13 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, release_sock(sk); current_timeo = schedule_timeout(current_timeo); lock_sock(sk); - if (sk != asoc->base.sk) { - release_sock(sk); - sk = asoc->base.sk; - lock_sock(sk); - } + if (sk != asoc->base.sk) + goto do_error; *timeo_p = current_timeo; } out: - *orig_sk = sk; finish_wait(&asoc->wait, &wait); /* Release the association's refcnt. */ @@ -8408,11 +8494,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, } - sctp_skb_for_each(skb, &assoc->ulpq.reasm, tmp) - sctp_skb_set_owner_r_frag(skb, newsk); - - sctp_skb_for_each(skb, &assoc->ulpq.lobby, tmp) - sctp_skb_set_owner_r_frag(skb, newsk); + sctp_for_each_rx_skb(assoc, newsk, sctp_skb_set_owner_r_frag); /* Set the type of socket to indicate that it is peeled off from the * original UDP-style socket or created with the accept() call on a @@ -8438,10 +8520,10 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, * is called, set RCV_SHUTDOWN flag. */ if (sctp_state(assoc, CLOSED) && sctp_style(newsk, TCP)) { - newsk->sk_state = SCTP_SS_CLOSED; + inet_sk_set_state(newsk, SCTP_SS_CLOSED); newsk->sk_shutdown |= RCV_SHUTDOWN; } else { - newsk->sk_state = SCTP_SS_ESTABLISHED; + inet_sk_set_state(newsk, SCTP_SS_ESTABLISHED); } release_sock(newsk); diff --git a/net/sctp/stream.c b/net/sctp/stream.c index a11db21dc8a0..cedf672487f9 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -64,7 +64,7 @@ static void sctp_stream_outq_migrate(struct sctp_stream *stream, */ /* Mark as failed send. */ - sctp_chunk_fail(ch, SCTP_ERROR_INV_STRM); + sctp_chunk_fail(ch, (__force __u32)SCTP_ERROR_INV_STRM); if (asoc->peer.prsctp_capable && SCTP_PR_PRIO_ENABLED(ch->sinfo.sinfo_flags)) asoc->sent_cnt_removable--; @@ -156,9 +156,9 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, sctp_stream_outq_migrate(stream, NULL, outcnt); sched->sched_all(stream); - i = sctp_stream_alloc_out(stream, outcnt, gfp); - if (i) - return i; + ret = sctp_stream_alloc_out(stream, outcnt, gfp); + if (ret) + goto out; stream->outcnt = outcnt; for (i = 0; i < stream->outcnt; i++) @@ -167,22 +167,21 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, sched->init(stream); in: + sctp_stream_interleave_init(stream); if (!incnt) goto out; - i = sctp_stream_alloc_in(stream, incnt, gfp); - if (i) { - ret = -ENOMEM; - goto free; + ret = sctp_stream_alloc_in(stream, incnt, gfp); + if (ret) { + sched->free(stream); + kfree(stream->out); + stream->out = NULL; + stream->outcnt = 0; + goto out; } stream->incnt = incnt; - goto out; -free: - sched->free(stream); - kfree(stream->out); - stream->out = NULL; out: return ret; } @@ -215,11 +214,13 @@ void sctp_stream_clear(struct sctp_stream *stream) { int i; - for (i = 0; i < stream->outcnt; i++) - stream->out[i].ssn = 0; + for (i = 0; i < stream->outcnt; i++) { + stream->out[i].mid = 0; + stream->out[i].mid_uo = 0; + } for (i = 0; i < stream->incnt; i++) - stream->in[i].ssn = 0; + stream->in[i].mid = 0; } void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new) @@ -254,6 +255,30 @@ static int sctp_send_reconf(struct sctp_association *asoc, return retval; } +static bool sctp_stream_outq_is_empty(struct sctp_stream *stream, + __u16 str_nums, __be16 *str_list) +{ + struct sctp_association *asoc; + __u16 i; + + asoc = container_of(stream, struct sctp_association, stream); + if (!asoc->outqueue.out_qlen) + return true; + + if (!str_nums) + return false; + + for (i = 0; i < str_nums; i++) { + __u16 sid = ntohs(str_list[i]); + + if (stream->out[sid].ext && + !list_empty(&stream->out[sid].ext->outq)) + return false; + } + + return true; +} + int sctp_send_reset_streams(struct sctp_association *asoc, struct sctp_reset_streams *params) { @@ -317,6 +342,11 @@ int sctp_send_reset_streams(struct sctp_association *asoc, for (i = 0; i < str_nums; i++) nstr_list[i] = htons(str_list[i]); + if (out && !sctp_stream_outq_is_empty(stream, str_nums, nstr_list)) { + retval = -EAGAIN; + goto out; + } + chunk = sctp_make_strreset_req(asoc, str_nums, nstr_list, out, in); kfree(nstr_list); @@ -377,6 +407,9 @@ int sctp_send_reset_assoc(struct sctp_association *asoc) if (asoc->strreset_outstanding) return -EINPROGRESS; + if (!sctp_outq_is_empty(&asoc->outqueue)) + return -EAGAIN; + chunk = sctp_make_strreset_tsnreq(asoc); if (!chunk) return -ENOMEM; @@ -563,7 +596,7 @@ struct sctp_chunk *sctp_process_strreset_outreq( flags = SCTP_STREAM_RESET_INCOMING_SSN; } - nums = (ntohs(param.p->length) - sizeof(*outreq)) / 2; + nums = (ntohs(param.p->length) - sizeof(*outreq)) / sizeof(__u16); if (nums) { str_p = outreq->list_of_streams; for (i = 0; i < nums; i++) { @@ -574,10 +607,10 @@ struct sctp_chunk *sctp_process_strreset_outreq( } for (i = 0; i < nums; i++) - stream->in[ntohs(str_p[i])].ssn = 0; + stream->in[ntohs(str_p[i])].mid = 0; } else { for (i = 0; i < stream->incnt; i++) - stream->in[i].ssn = 0; + stream->in[i].mid = 0; } result = SCTP_STRRESET_PERFORMED; @@ -627,7 +660,7 @@ struct sctp_chunk *sctp_process_strreset_inreq( goto out; } - nums = (ntohs(param.p->length) - sizeof(*inreq)) / 2; + nums = (ntohs(param.p->length) - sizeof(*inreq)) / sizeof(__u16); str_p = inreq->list_of_streams; for (i = 0; i < nums; i++) { if (ntohs(str_p[i]) >= stream->outcnt) { @@ -636,6 +669,12 @@ struct sctp_chunk *sctp_process_strreset_inreq( } } + if (!sctp_stream_outq_is_empty(stream, nums, str_p)) { + result = SCTP_STRRESET_IN_PROGRESS; + asoc->strreset_inseq--; + goto err; + } + chunk = sctp_make_strreset_req(asoc, nums, str_p, 1, 0); if (!chunk) goto out; @@ -687,12 +726,18 @@ struct sctp_chunk *sctp_process_strreset_tsnreq( i = asoc->strreset_inseq - request_seq - 1; result = asoc->strreset_result[i]; if (result == SCTP_STRRESET_PERFORMED) { - next_tsn = asoc->next_tsn; + next_tsn = asoc->ctsn_ack_point + 1; init_tsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + 1; } goto err; } + + if (!sctp_outq_is_empty(&asoc->outqueue)) { + result = SCTP_STRRESET_IN_PROGRESS; + goto err; + } + asoc->strreset_inseq++; if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_ASSOC_REQ)) @@ -703,13 +748,13 @@ struct sctp_chunk *sctp_process_strreset_tsnreq( goto out; } - /* G3: The same processing as though a SACK chunk with no gap report - * and a cumulative TSN ACK of the Sender's Next TSN minus 1 were - * received MUST be performed. + /* G4: The same processing as though a FWD-TSN chunk (as defined in + * [RFC3758]) with all streams affected and a new cumulative TSN + * ACK of the Receiver's Next TSN minus 1 were received MUST be + * performed. */ max_tsn_seen = sctp_tsnmap_get_max_tsn_seen(&asoc->peer.tsn_map); - sctp_ulpq_reasm_flushtsn(&asoc->ulpq, max_tsn_seen); - sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC); + asoc->stream.si->report_ftsn(&asoc->ulpq, max_tsn_seen); /* G1: Compute an appropriate value for the Receiver's Next TSN -- the * TSN that the peer should use to send the next DATA chunk. The @@ -720,10 +765,9 @@ struct sctp_chunk *sctp_process_strreset_tsnreq( sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL, init_tsn, GFP_ATOMIC); - /* G4: The same processing as though a FWD-TSN chunk (as defined in - * [RFC3758]) with all streams affected and a new cumulative TSN - * ACK of the Receiver's Next TSN minus 1 were received MUST be - * performed. + /* G3: The same processing as though a SACK chunk with no gap report + * and a cumulative TSN ACK of the Sender's Next TSN minus 1 were + * received MUST be performed. */ sctp_outq_free(&asoc->outqueue); @@ -739,10 +783,12 @@ struct sctp_chunk *sctp_process_strreset_tsnreq( /* G5: The next expected and outgoing SSNs MUST be reset to 0 for all * incoming and outgoing streams. */ - for (i = 0; i < stream->outcnt; i++) - stream->out[i].ssn = 0; + for (i = 0; i < stream->outcnt; i++) { + stream->out[i].mid = 0; + stream->out[i].mid_uo = 0; + } for (i = 0; i < stream->incnt; i++) - stream->in[i].ssn = 0; + stream->in[i].mid = 0; result = SCTP_STRRESET_PERFORMED; @@ -927,15 +973,20 @@ struct sctp_chunk *sctp_process_strreset_resp( outreq = (struct sctp_strreset_outreq *)req; str_p = outreq->list_of_streams; - nums = (ntohs(outreq->param_hdr.length) - sizeof(*outreq)) / 2; + nums = (ntohs(outreq->param_hdr.length) - sizeof(*outreq)) / + sizeof(__u16); if (result == SCTP_STRRESET_PERFORMED) { if (nums) { - for (i = 0; i < nums; i++) - stream->out[ntohs(str_p[i])].ssn = 0; + for (i = 0; i < nums; i++) { + stream->out[ntohs(str_p[i])].mid = 0; + stream->out[ntohs(str_p[i])].mid_uo = 0; + } } else { - for (i = 0; i < stream->outcnt; i++) - stream->out[i].ssn = 0; + for (i = 0; i < stream->outcnt; i++) { + stream->out[i].mid = 0; + stream->out[i].mid_uo = 0; + } } flags = SCTP_STREAM_RESET_OUTGOING_SSN; @@ -956,7 +1007,8 @@ struct sctp_chunk *sctp_process_strreset_resp( inreq = (struct sctp_strreset_inreq *)req; str_p = inreq->list_of_streams; - nums = (ntohs(inreq->param_hdr.length) - sizeof(*inreq)) / 2; + nums = (ntohs(inreq->param_hdr.length) - sizeof(*inreq)) / + sizeof(__u16); *evp = sctp_ulpevent_make_stream_reset_event(asoc, flags, nums, str_p, GFP_ATOMIC); @@ -975,24 +1027,32 @@ struct sctp_chunk *sctp_process_strreset_resp( if (result == SCTP_STRRESET_PERFORMED) { __u32 mtsn = sctp_tsnmap_get_max_tsn_seen( &asoc->peer.tsn_map); + LIST_HEAD(temp); - sctp_ulpq_reasm_flushtsn(&asoc->ulpq, mtsn); - sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC); + asoc->stream.si->report_ftsn(&asoc->ulpq, mtsn); sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL, stsn, GFP_ATOMIC); + /* Clean up sacked and abandoned queues only. As the + * out_chunk_list may not be empty, splice it to temp, + * then get it back after sctp_outq_free is done. + */ + list_splice_init(&asoc->outqueue.out_chunk_list, &temp); sctp_outq_free(&asoc->outqueue); + list_splice_init(&temp, &asoc->outqueue.out_chunk_list); asoc->next_tsn = rtsn; asoc->ctsn_ack_point = asoc->next_tsn - 1; asoc->adv_peer_ack_point = asoc->ctsn_ack_point; - for (i = 0; i < stream->outcnt; i++) - stream->out[i].ssn = 0; + for (i = 0; i < stream->outcnt; i++) { + stream->out[i].mid = 0; + stream->out[i].mid_uo = 0; + } for (i = 0; i < stream->incnt; i++) - stream->in[i].ssn = 0; + stream->in[i].mid = 0; } for (i = 0; i < stream->outcnt; i++) diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c new file mode 100644 index 000000000000..8c7cf8f08711 --- /dev/null +++ b/net/sctp/stream_interleave.c @@ -0,0 +1,1334 @@ +/* SCTP kernel implementation + * (C) Copyright Red Hat Inc. 2017 + * + * This file is part of the SCTP kernel implementation + * + * These functions manipulate sctp stream queue/scheduling. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, see + * <http://www.gnu.org/licenses/>. + * + * Please send any bug reports or fixes you make to the + * email addresched(es): + * lksctp developers <linux-sctp@vger.kernel.org> + * + * Written or modified by: + * Xin Long <lucien.xin@gmail.com> + */ + +#include <net/busy_poll.h> +#include <net/sctp/sctp.h> +#include <net/sctp/sm.h> +#include <net/sctp/ulpevent.h> +#include <linux/sctp.h> + +static struct sctp_chunk *sctp_make_idatafrag_empty( + const struct sctp_association *asoc, + const struct sctp_sndrcvinfo *sinfo, + int len, __u8 flags, gfp_t gfp) +{ + struct sctp_chunk *retval; + struct sctp_idatahdr dp; + + memset(&dp, 0, sizeof(dp)); + dp.stream = htons(sinfo->sinfo_stream); + + if (sinfo->sinfo_flags & SCTP_UNORDERED) + flags |= SCTP_DATA_UNORDERED; + + retval = sctp_make_idata(asoc, flags, sizeof(dp) + len, gfp); + if (!retval) + return NULL; + + retval->subh.idata_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp); + memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo)); + + return retval; +} + +static void sctp_chunk_assign_mid(struct sctp_chunk *chunk) +{ + struct sctp_stream *stream; + struct sctp_chunk *lchunk; + __u32 cfsn = 0; + __u16 sid; + + if (chunk->has_mid) + return; + + sid = sctp_chunk_stream_no(chunk); + stream = &chunk->asoc->stream; + + list_for_each_entry(lchunk, &chunk->msg->chunks, frag_list) { + struct sctp_idatahdr *hdr; + __u32 mid; + + lchunk->has_mid = 1; + + hdr = lchunk->subh.idata_hdr; + + if (lchunk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG) + hdr->ppid = lchunk->sinfo.sinfo_ppid; + else + hdr->fsn = htonl(cfsn++); + + if (lchunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) { + mid = lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG ? + sctp_mid_uo_next(stream, out, sid) : + sctp_mid_uo_peek(stream, out, sid); + } else { + mid = lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG ? + sctp_mid_next(stream, out, sid) : + sctp_mid_peek(stream, out, sid); + } + hdr->mid = htonl(mid); + } +} + +static bool sctp_validate_data(struct sctp_chunk *chunk) +{ + const struct sctp_stream *stream; + __u16 sid, ssn; + + if (chunk->chunk_hdr->type != SCTP_CID_DATA) + return false; + + if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) + return true; + + stream = &chunk->asoc->stream; + sid = sctp_chunk_stream_no(chunk); + ssn = ntohs(chunk->subh.data_hdr->ssn); + + return !SSN_lt(ssn, sctp_ssn_peek(stream, in, sid)); +} + +static bool sctp_validate_idata(struct sctp_chunk *chunk) +{ + struct sctp_stream *stream; + __u32 mid; + __u16 sid; + + if (chunk->chunk_hdr->type != SCTP_CID_I_DATA) + return false; + + if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) + return true; + + stream = &chunk->asoc->stream; + sid = sctp_chunk_stream_no(chunk); + mid = ntohl(chunk->subh.idata_hdr->mid); + + return !MID_lt(mid, sctp_mid_peek(stream, in, sid)); +} + +static void sctp_intl_store_reasm(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sctp_ulpevent *cevent; + struct sk_buff *pos; + + pos = skb_peek_tail(&ulpq->reasm); + if (!pos) { + __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); + return; + } + + cevent = sctp_skb2event(pos); + + if (event->stream == cevent->stream && + event->mid == cevent->mid && + (cevent->msg_flags & SCTP_DATA_FIRST_FRAG || + (!(event->msg_flags & SCTP_DATA_FIRST_FRAG) && + event->fsn > cevent->fsn))) { + __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); + return; + } + + if ((event->stream == cevent->stream && + MID_lt(cevent->mid, event->mid)) || + event->stream > cevent->stream) { + __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); + return; + } + + skb_queue_walk(&ulpq->reasm, pos) { + cevent = sctp_skb2event(pos); + + if (event->stream < cevent->stream || + (event->stream == cevent->stream && + MID_lt(event->mid, cevent->mid))) + break; + + if (event->stream == cevent->stream && + event->mid == cevent->mid && + !(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) && + (event->msg_flags & SCTP_DATA_FIRST_FRAG || + event->fsn < cevent->fsn)) + break; + } + + __skb_queue_before(&ulpq->reasm, pos, sctp_event2skb(event)); +} + +static struct sctp_ulpevent *sctp_intl_retrieve_partial( + struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sk_buff *first_frag = NULL; + struct sk_buff *last_frag = NULL; + struct sctp_ulpevent *retval; + struct sctp_stream_in *sin; + struct sk_buff *pos; + __u32 next_fsn = 0; + int is_last = 0; + + sin = sctp_stream_in(ulpq->asoc, event->stream); + + skb_queue_walk(&ulpq->reasm, pos) { + struct sctp_ulpevent *cevent = sctp_skb2event(pos); + + if (cevent->stream < event->stream) + continue; + + if (cevent->stream > event->stream || + cevent->mid != sin->mid) + break; + + switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { + case SCTP_DATA_FIRST_FRAG: + goto out; + case SCTP_DATA_MIDDLE_FRAG: + if (!first_frag) { + if (cevent->fsn == sin->fsn) { + first_frag = pos; + last_frag = pos; + next_fsn = cevent->fsn + 1; + } + } else if (cevent->fsn == next_fsn) { + last_frag = pos; + next_fsn++; + } else { + goto out; + } + break; + case SCTP_DATA_LAST_FRAG: + if (!first_frag) { + if (cevent->fsn == sin->fsn) { + first_frag = pos; + last_frag = pos; + next_fsn = 0; + is_last = 1; + } + } else if (cevent->fsn == next_fsn) { + last_frag = pos; + next_fsn = 0; + is_last = 1; + } + goto out; + default: + goto out; + } + } + +out: + if (!first_frag) + return NULL; + + retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk), + &ulpq->reasm, first_frag, + last_frag); + if (retval) { + sin->fsn = next_fsn; + if (is_last) { + retval->msg_flags |= MSG_EOR; + sin->pd_mode = 0; + } + } + + return retval; +} + +static struct sctp_ulpevent *sctp_intl_retrieve_reassembled( + struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sctp_association *asoc = ulpq->asoc; + struct sk_buff *pos, *first_frag = NULL; + struct sctp_ulpevent *retval = NULL; + struct sk_buff *pd_first = NULL; + struct sk_buff *pd_last = NULL; + struct sctp_stream_in *sin; + __u32 next_fsn = 0; + __u32 pd_point = 0; + __u32 pd_len = 0; + __u32 mid = 0; + + sin = sctp_stream_in(ulpq->asoc, event->stream); + + skb_queue_walk(&ulpq->reasm, pos) { + struct sctp_ulpevent *cevent = sctp_skb2event(pos); + + if (cevent->stream < event->stream) + continue; + if (cevent->stream > event->stream) + break; + + if (MID_lt(cevent->mid, event->mid)) + continue; + if (MID_lt(event->mid, cevent->mid)) + break; + + switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { + case SCTP_DATA_FIRST_FRAG: + if (cevent->mid == sin->mid) { + pd_first = pos; + pd_last = pos; + pd_len = pos->len; + } + + first_frag = pos; + next_fsn = 0; + mid = cevent->mid; + break; + + case SCTP_DATA_MIDDLE_FRAG: + if (first_frag && cevent->mid == mid && + cevent->fsn == next_fsn) { + next_fsn++; + if (pd_first) { + pd_last = pos; + pd_len += pos->len; + } + } else { + first_frag = NULL; + } + break; + + case SCTP_DATA_LAST_FRAG: + if (first_frag && cevent->mid == mid && + cevent->fsn == next_fsn) + goto found; + else + first_frag = NULL; + break; + } + } + + if (!pd_first) + goto out; + + pd_point = sctp_sk(asoc->base.sk)->pd_point; + if (pd_point && pd_point <= pd_len) { + retval = sctp_make_reassembled_event(sock_net(asoc->base.sk), + &ulpq->reasm, + pd_first, pd_last); + if (retval) { + sin->fsn = next_fsn; + sin->pd_mode = 1; + } + } + goto out; + +found: + retval = sctp_make_reassembled_event(sock_net(asoc->base.sk), + &ulpq->reasm, + first_frag, pos); + if (retval) + retval->msg_flags |= MSG_EOR; + +out: + return retval; +} + +static struct sctp_ulpevent *sctp_intl_reasm(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sctp_ulpevent *retval = NULL; + struct sctp_stream_in *sin; + + if (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK)) { + event->msg_flags |= MSG_EOR; + return event; + } + + sctp_intl_store_reasm(ulpq, event); + + sin = sctp_stream_in(ulpq->asoc, event->stream); + if (sin->pd_mode && event->mid == sin->mid && + event->fsn == sin->fsn) + retval = sctp_intl_retrieve_partial(ulpq, event); + + if (!retval) + retval = sctp_intl_retrieve_reassembled(ulpq, event); + + return retval; +} + +static void sctp_intl_store_ordered(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sctp_ulpevent *cevent; + struct sk_buff *pos; + + pos = skb_peek_tail(&ulpq->lobby); + if (!pos) { + __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); + return; + } + + cevent = (struct sctp_ulpevent *)pos->cb; + if (event->stream == cevent->stream && + MID_lt(cevent->mid, event->mid)) { + __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); + return; + } + + if (event->stream > cevent->stream) { + __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); + return; + } + + skb_queue_walk(&ulpq->lobby, pos) { + cevent = (struct sctp_ulpevent *)pos->cb; + + if (cevent->stream > event->stream) + break; + + if (cevent->stream == event->stream && + MID_lt(event->mid, cevent->mid)) + break; + } + + __skb_queue_before(&ulpq->lobby, pos, sctp_event2skb(event)); +} + +static void sctp_intl_retrieve_ordered(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sk_buff_head *event_list; + struct sctp_stream *stream; + struct sk_buff *pos, *tmp; + __u16 sid = event->stream; + + stream = &ulpq->asoc->stream; + event_list = (struct sk_buff_head *)sctp_event2skb(event)->prev; + + sctp_skb_for_each(pos, &ulpq->lobby, tmp) { + struct sctp_ulpevent *cevent = (struct sctp_ulpevent *)pos->cb; + + if (cevent->stream > sid) + break; + + if (cevent->stream < sid) + continue; + + if (cevent->mid != sctp_mid_peek(stream, in, sid)) + break; + + sctp_mid_next(stream, in, sid); + + __skb_unlink(pos, &ulpq->lobby); + + __skb_queue_tail(event_list, pos); + } +} + +static struct sctp_ulpevent *sctp_intl_order(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sctp_stream *stream; + __u16 sid; + + stream = &ulpq->asoc->stream; + sid = event->stream; + + if (event->mid != sctp_mid_peek(stream, in, sid)) { + sctp_intl_store_ordered(ulpq, event); + return NULL; + } + + sctp_mid_next(stream, in, sid); + + sctp_intl_retrieve_ordered(ulpq, event); + + return event; +} + +static int sctp_enqueue_event(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sk_buff *skb = sctp_event2skb(event); + struct sock *sk = ulpq->asoc->base.sk; + struct sctp_sock *sp = sctp_sk(sk); + struct sk_buff_head *skb_list; + + skb_list = (struct sk_buff_head *)skb->prev; + + if (sk->sk_shutdown & RCV_SHUTDOWN && + (sk->sk_shutdown & SEND_SHUTDOWN || + !sctp_ulpevent_is_notification(event))) + goto out_free; + + if (!sctp_ulpevent_is_notification(event)) { + sk_mark_napi_id(sk, skb); + sk_incoming_cpu_update(sk); + } + + if (!sctp_ulpevent_is_enabled(event, &sp->subscribe)) + goto out_free; + + if (skb_list) + skb_queue_splice_tail_init(skb_list, + &sk->sk_receive_queue); + else + __skb_queue_tail(&sk->sk_receive_queue, skb); + + if (!sp->data_ready_signalled) { + sp->data_ready_signalled = 1; + sk->sk_data_ready(sk); + } + + return 1; + +out_free: + if (skb_list) + sctp_queue_purge_ulpevents(skb_list); + else + sctp_ulpevent_free(event); + + return 0; +} + +static void sctp_intl_store_reasm_uo(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sctp_ulpevent *cevent; + struct sk_buff *pos; + + pos = skb_peek_tail(&ulpq->reasm_uo); + if (!pos) { + __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event)); + return; + } + + cevent = sctp_skb2event(pos); + + if (event->stream == cevent->stream && + event->mid == cevent->mid && + (cevent->msg_flags & SCTP_DATA_FIRST_FRAG || + (!(event->msg_flags & SCTP_DATA_FIRST_FRAG) && + event->fsn > cevent->fsn))) { + __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event)); + return; + } + + if ((event->stream == cevent->stream && + MID_lt(cevent->mid, event->mid)) || + event->stream > cevent->stream) { + __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event)); + return; + } + + skb_queue_walk(&ulpq->reasm_uo, pos) { + cevent = sctp_skb2event(pos); + + if (event->stream < cevent->stream || + (event->stream == cevent->stream && + MID_lt(event->mid, cevent->mid))) + break; + + if (event->stream == cevent->stream && + event->mid == cevent->mid && + !(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) && + (event->msg_flags & SCTP_DATA_FIRST_FRAG || + event->fsn < cevent->fsn)) + break; + } + + __skb_queue_before(&ulpq->reasm_uo, pos, sctp_event2skb(event)); +} + +static struct sctp_ulpevent *sctp_intl_retrieve_partial_uo( + struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sk_buff *first_frag = NULL; + struct sk_buff *last_frag = NULL; + struct sctp_ulpevent *retval; + struct sctp_stream_in *sin; + struct sk_buff *pos; + __u32 next_fsn = 0; + int is_last = 0; + + sin = sctp_stream_in(ulpq->asoc, event->stream); + + skb_queue_walk(&ulpq->reasm_uo, pos) { + struct sctp_ulpevent *cevent = sctp_skb2event(pos); + + if (cevent->stream < event->stream) + continue; + if (cevent->stream > event->stream) + break; + + if (MID_lt(cevent->mid, sin->mid_uo)) + continue; + if (MID_lt(sin->mid_uo, cevent->mid)) + break; + + switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { + case SCTP_DATA_FIRST_FRAG: + goto out; + case SCTP_DATA_MIDDLE_FRAG: + if (!first_frag) { + if (cevent->fsn == sin->fsn_uo) { + first_frag = pos; + last_frag = pos; + next_fsn = cevent->fsn + 1; + } + } else if (cevent->fsn == next_fsn) { + last_frag = pos; + next_fsn++; + } else { + goto out; + } + break; + case SCTP_DATA_LAST_FRAG: + if (!first_frag) { + if (cevent->fsn == sin->fsn_uo) { + first_frag = pos; + last_frag = pos; + next_fsn = 0; + is_last = 1; + } + } else if (cevent->fsn == next_fsn) { + last_frag = pos; + next_fsn = 0; + is_last = 1; + } + goto out; + default: + goto out; + } + } + +out: + if (!first_frag) + return NULL; + + retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk), + &ulpq->reasm_uo, first_frag, + last_frag); + if (retval) { + sin->fsn_uo = next_fsn; + if (is_last) { + retval->msg_flags |= MSG_EOR; + sin->pd_mode_uo = 0; + } + } + + return retval; +} + +static struct sctp_ulpevent *sctp_intl_retrieve_reassembled_uo( + struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sctp_association *asoc = ulpq->asoc; + struct sk_buff *pos, *first_frag = NULL; + struct sctp_ulpevent *retval = NULL; + struct sk_buff *pd_first = NULL; + struct sk_buff *pd_last = NULL; + struct sctp_stream_in *sin; + __u32 next_fsn = 0; + __u32 pd_point = 0; + __u32 pd_len = 0; + __u32 mid = 0; + + sin = sctp_stream_in(ulpq->asoc, event->stream); + + skb_queue_walk(&ulpq->reasm_uo, pos) { + struct sctp_ulpevent *cevent = sctp_skb2event(pos); + + if (cevent->stream < event->stream) + continue; + if (cevent->stream > event->stream) + break; + + if (MID_lt(cevent->mid, event->mid)) + continue; + if (MID_lt(event->mid, cevent->mid)) + break; + + switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { + case SCTP_DATA_FIRST_FRAG: + if (!sin->pd_mode_uo) { + sin->mid_uo = cevent->mid; + pd_first = pos; + pd_last = pos; + pd_len = pos->len; + } + + first_frag = pos; + next_fsn = 0; + mid = cevent->mid; + break; + + case SCTP_DATA_MIDDLE_FRAG: + if (first_frag && cevent->mid == mid && + cevent->fsn == next_fsn) { + next_fsn++; + if (pd_first) { + pd_last = pos; + pd_len += pos->len; + } + } else { + first_frag = NULL; + } + break; + + case SCTP_DATA_LAST_FRAG: + if (first_frag && cevent->mid == mid && + cevent->fsn == next_fsn) + goto found; + else + first_frag = NULL; + break; + } + } + + if (!pd_first) + goto out; + + pd_point = sctp_sk(asoc->base.sk)->pd_point; + if (pd_point && pd_point <= pd_len) { + retval = sctp_make_reassembled_event(sock_net(asoc->base.sk), + &ulpq->reasm_uo, + pd_first, pd_last); + if (retval) { + sin->fsn_uo = next_fsn; + sin->pd_mode_uo = 1; + } + } + goto out; + +found: + retval = sctp_make_reassembled_event(sock_net(asoc->base.sk), + &ulpq->reasm_uo, + first_frag, pos); + if (retval) + retval->msg_flags |= MSG_EOR; + +out: + return retval; +} + +static struct sctp_ulpevent *sctp_intl_reasm_uo(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sctp_ulpevent *retval = NULL; + struct sctp_stream_in *sin; + + if (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK)) { + event->msg_flags |= MSG_EOR; + return event; + } + + sctp_intl_store_reasm_uo(ulpq, event); + + sin = sctp_stream_in(ulpq->asoc, event->stream); + if (sin->pd_mode_uo && event->mid == sin->mid_uo && + event->fsn == sin->fsn_uo) + retval = sctp_intl_retrieve_partial_uo(ulpq, event); + + if (!retval) + retval = sctp_intl_retrieve_reassembled_uo(ulpq, event); + + return retval; +} + +static struct sctp_ulpevent *sctp_intl_retrieve_first_uo(struct sctp_ulpq *ulpq) +{ + struct sctp_stream_in *csin, *sin = NULL; + struct sk_buff *first_frag = NULL; + struct sk_buff *last_frag = NULL; + struct sctp_ulpevent *retval; + struct sk_buff *pos; + __u32 next_fsn = 0; + __u16 sid = 0; + + skb_queue_walk(&ulpq->reasm_uo, pos) { + struct sctp_ulpevent *cevent = sctp_skb2event(pos); + + csin = sctp_stream_in(ulpq->asoc, cevent->stream); + if (csin->pd_mode_uo) + continue; + + switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { + case SCTP_DATA_FIRST_FRAG: + if (first_frag) + goto out; + first_frag = pos; + last_frag = pos; + next_fsn = 0; + sin = csin; + sid = cevent->stream; + sin->mid_uo = cevent->mid; + break; + case SCTP_DATA_MIDDLE_FRAG: + if (!first_frag) + break; + if (cevent->stream == sid && + cevent->mid == sin->mid_uo && + cevent->fsn == next_fsn) { + next_fsn++; + last_frag = pos; + } else { + goto out; + } + break; + case SCTP_DATA_LAST_FRAG: + if (first_frag) + goto out; + break; + default: + break; + } + } + + if (!first_frag) + return NULL; + +out: + retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk), + &ulpq->reasm_uo, first_frag, + last_frag); + if (retval) { + sin->fsn_uo = next_fsn; + sin->pd_mode_uo = 1; + } + + return retval; +} + +static int sctp_ulpevent_idata(struct sctp_ulpq *ulpq, + struct sctp_chunk *chunk, gfp_t gfp) +{ + struct sctp_ulpevent *event; + struct sk_buff_head temp; + int event_eor = 0; + + event = sctp_ulpevent_make_rcvmsg(chunk->asoc, chunk, gfp); + if (!event) + return -ENOMEM; + + event->mid = ntohl(chunk->subh.idata_hdr->mid); + if (event->msg_flags & SCTP_DATA_FIRST_FRAG) + event->ppid = chunk->subh.idata_hdr->ppid; + else + event->fsn = ntohl(chunk->subh.idata_hdr->fsn); + + if (!(event->msg_flags & SCTP_DATA_UNORDERED)) { + event = sctp_intl_reasm(ulpq, event); + if (event && event->msg_flags & MSG_EOR) { + skb_queue_head_init(&temp); + __skb_queue_tail(&temp, sctp_event2skb(event)); + + event = sctp_intl_order(ulpq, event); + } + } else { + event = sctp_intl_reasm_uo(ulpq, event); + } + + if (event) { + event_eor = (event->msg_flags & MSG_EOR) ? 1 : 0; + sctp_enqueue_event(ulpq, event); + } + + return event_eor; +} + +static struct sctp_ulpevent *sctp_intl_retrieve_first(struct sctp_ulpq *ulpq) +{ + struct sctp_stream_in *csin, *sin = NULL; + struct sk_buff *first_frag = NULL; + struct sk_buff *last_frag = NULL; + struct sctp_ulpevent *retval; + struct sk_buff *pos; + __u32 next_fsn = 0; + __u16 sid = 0; + + skb_queue_walk(&ulpq->reasm, pos) { + struct sctp_ulpevent *cevent = sctp_skb2event(pos); + + csin = sctp_stream_in(ulpq->asoc, cevent->stream); + if (csin->pd_mode) + continue; + + switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { + case SCTP_DATA_FIRST_FRAG: + if (first_frag) + goto out; + if (cevent->mid == csin->mid) { + first_frag = pos; + last_frag = pos; + next_fsn = 0; + sin = csin; + sid = cevent->stream; + } + break; + case SCTP_DATA_MIDDLE_FRAG: + if (!first_frag) + break; + if (cevent->stream == sid && + cevent->mid == sin->mid && + cevent->fsn == next_fsn) { + next_fsn++; + last_frag = pos; + } else { + goto out; + } + break; + case SCTP_DATA_LAST_FRAG: + if (first_frag) + goto out; + break; + default: + break; + } + } + + if (!first_frag) + return NULL; + +out: + retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk), + &ulpq->reasm, first_frag, + last_frag); + if (retval) { + sin->fsn = next_fsn; + sin->pd_mode = 1; + } + + return retval; +} + +static void sctp_intl_start_pd(struct sctp_ulpq *ulpq, gfp_t gfp) +{ + struct sctp_ulpevent *event; + + if (!skb_queue_empty(&ulpq->reasm)) { + do { + event = sctp_intl_retrieve_first(ulpq); + if (event) + sctp_enqueue_event(ulpq, event); + } while (event); + } + + if (!skb_queue_empty(&ulpq->reasm_uo)) { + do { + event = sctp_intl_retrieve_first_uo(ulpq); + if (event) + sctp_enqueue_event(ulpq, event); + } while (event); + } +} + +static void sctp_renege_events(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, + gfp_t gfp) +{ + struct sctp_association *asoc = ulpq->asoc; + __u32 freed = 0; + __u16 needed; + + if (chunk) { + needed = ntohs(chunk->chunk_hdr->length); + needed -= sizeof(struct sctp_idata_chunk); + } else { + needed = SCTP_DEFAULT_MAXWINDOW; + } + + if (skb_queue_empty(&asoc->base.sk->sk_receive_queue)) { + freed = sctp_ulpq_renege_list(ulpq, &ulpq->lobby, needed); + if (freed < needed) + freed += sctp_ulpq_renege_list(ulpq, &ulpq->reasm, + needed); + if (freed < needed) + freed += sctp_ulpq_renege_list(ulpq, &ulpq->reasm_uo, + needed); + } + + if (chunk && freed >= needed) + if (sctp_ulpevent_idata(ulpq, chunk, gfp) <= 0) + sctp_intl_start_pd(ulpq, gfp); + + sk_mem_reclaim(asoc->base.sk); +} + +static void sctp_intl_stream_abort_pd(struct sctp_ulpq *ulpq, __u16 sid, + __u32 mid, __u16 flags, gfp_t gfp) +{ + struct sock *sk = ulpq->asoc->base.sk; + struct sctp_ulpevent *ev = NULL; + + if (!sctp_ulpevent_type_enabled(SCTP_PARTIAL_DELIVERY_EVENT, + &sctp_sk(sk)->subscribe)) + return; + + ev = sctp_ulpevent_make_pdapi(ulpq->asoc, SCTP_PARTIAL_DELIVERY_ABORTED, + sid, mid, flags, gfp); + if (ev) { + __skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev)); + + if (!sctp_sk(sk)->data_ready_signalled) { + sctp_sk(sk)->data_ready_signalled = 1; + sk->sk_data_ready(sk); + } + } +} + +static void sctp_intl_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid) +{ + struct sctp_stream *stream = &ulpq->asoc->stream; + struct sctp_ulpevent *cevent, *event = NULL; + struct sk_buff_head *lobby = &ulpq->lobby; + struct sk_buff *pos, *tmp; + struct sk_buff_head temp; + __u16 csid; + __u32 cmid; + + skb_queue_head_init(&temp); + sctp_skb_for_each(pos, lobby, tmp) { + cevent = (struct sctp_ulpevent *)pos->cb; + csid = cevent->stream; + cmid = cevent->mid; + + if (csid > sid) + break; + + if (csid < sid) + continue; + + if (!MID_lt(cmid, sctp_mid_peek(stream, in, csid))) + break; + + __skb_unlink(pos, lobby); + if (!event) + event = sctp_skb2event(pos); + + __skb_queue_tail(&temp, pos); + } + + if (!event && pos != (struct sk_buff *)lobby) { + cevent = (struct sctp_ulpevent *)pos->cb; + csid = cevent->stream; + cmid = cevent->mid; + + if (csid == sid && cmid == sctp_mid_peek(stream, in, csid)) { + sctp_mid_next(stream, in, csid); + __skb_unlink(pos, lobby); + __skb_queue_tail(&temp, pos); + event = sctp_skb2event(pos); + } + } + + if (event) { + sctp_intl_retrieve_ordered(ulpq, event); + sctp_enqueue_event(ulpq, event); + } +} + +static void sctp_intl_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp) +{ + struct sctp_stream *stream = &ulpq->asoc->stream; + __u16 sid; + + for (sid = 0; sid < stream->incnt; sid++) { + struct sctp_stream_in *sin = &stream->in[sid]; + __u32 mid; + + if (sin->pd_mode_uo) { + sin->pd_mode_uo = 0; + + mid = sin->mid_uo; + sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x1, gfp); + } + + if (sin->pd_mode) { + sin->pd_mode = 0; + + mid = sin->mid; + sctp_intl_stream_abort_pd(ulpq, sid, mid, 0, gfp); + sctp_mid_skip(stream, in, sid, mid); + + sctp_intl_reap_ordered(ulpq, sid); + } + } + + /* intl abort pd happens only when all data needs to be cleaned */ + sctp_ulpq_flush(ulpq); +} + +static inline int sctp_get_skip_pos(struct sctp_ifwdtsn_skip *skiplist, + int nskips, __be16 stream, __u8 flags) +{ + int i; + + for (i = 0; i < nskips; i++) + if (skiplist[i].stream == stream && + skiplist[i].flags == flags) + return i; + + return i; +} + +#define SCTP_FTSN_U_BIT 0x1 +static void sctp_generate_iftsn(struct sctp_outq *q, __u32 ctsn) +{ + struct sctp_ifwdtsn_skip ftsn_skip_arr[10]; + struct sctp_association *asoc = q->asoc; + struct sctp_chunk *ftsn_chunk = NULL; + struct list_head *lchunk, *temp; + int nskips = 0, skip_pos; + struct sctp_chunk *chunk; + __u32 tsn; + + if (!asoc->peer.prsctp_capable) + return; + + if (TSN_lt(asoc->adv_peer_ack_point, ctsn)) + asoc->adv_peer_ack_point = ctsn; + + list_for_each_safe(lchunk, temp, &q->abandoned) { + chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); + tsn = ntohl(chunk->subh.data_hdr->tsn); + + if (TSN_lte(tsn, ctsn)) { + list_del_init(lchunk); + sctp_chunk_free(chunk); + } else if (TSN_lte(tsn, asoc->adv_peer_ack_point + 1)) { + __be16 sid = chunk->subh.idata_hdr->stream; + __be32 mid = chunk->subh.idata_hdr->mid; + __u8 flags = 0; + + if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) + flags |= SCTP_FTSN_U_BIT; + + asoc->adv_peer_ack_point = tsn; + skip_pos = sctp_get_skip_pos(&ftsn_skip_arr[0], nskips, + sid, flags); + ftsn_skip_arr[skip_pos].stream = sid; + ftsn_skip_arr[skip_pos].reserved = 0; + ftsn_skip_arr[skip_pos].flags = flags; + ftsn_skip_arr[skip_pos].mid = mid; + if (skip_pos == nskips) + nskips++; + if (nskips == 10) + break; + } else { + break; + } + } + + if (asoc->adv_peer_ack_point > ctsn) + ftsn_chunk = sctp_make_ifwdtsn(asoc, asoc->adv_peer_ack_point, + nskips, &ftsn_skip_arr[0]); + + if (ftsn_chunk) { + list_add_tail(&ftsn_chunk->list, &q->control_chunk_list); + SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_OUTCTRLCHUNKS); + } +} + +#define _sctp_walk_ifwdtsn(pos, chunk, end) \ + for (pos = chunk->subh.ifwdtsn_hdr->skip; \ + (void *)pos < (void *)chunk->subh.ifwdtsn_hdr->skip + (end); pos++) + +#define sctp_walk_ifwdtsn(pos, ch) \ + _sctp_walk_ifwdtsn((pos), (ch), ntohs((ch)->chunk_hdr->length) - \ + sizeof(struct sctp_ifwdtsn_chunk)) + +static bool sctp_validate_fwdtsn(struct sctp_chunk *chunk) +{ + struct sctp_fwdtsn_skip *skip; + __u16 incnt; + + if (chunk->chunk_hdr->type != SCTP_CID_FWD_TSN) + return false; + + incnt = chunk->asoc->stream.incnt; + sctp_walk_fwdtsn(skip, chunk) + if (ntohs(skip->stream) >= incnt) + return false; + + return true; +} + +static bool sctp_validate_iftsn(struct sctp_chunk *chunk) +{ + struct sctp_ifwdtsn_skip *skip; + __u16 incnt; + + if (chunk->chunk_hdr->type != SCTP_CID_I_FWD_TSN) + return false; + + incnt = chunk->asoc->stream.incnt; + sctp_walk_ifwdtsn(skip, chunk) + if (ntohs(skip->stream) >= incnt) + return false; + + return true; +} + +static void sctp_report_fwdtsn(struct sctp_ulpq *ulpq, __u32 ftsn) +{ + /* Move the Cumulattive TSN Ack ahead. */ + sctp_tsnmap_skip(&ulpq->asoc->peer.tsn_map, ftsn); + /* purge the fragmentation queue */ + sctp_ulpq_reasm_flushtsn(ulpq, ftsn); + /* Abort any in progress partial delivery. */ + sctp_ulpq_abort_pd(ulpq, GFP_ATOMIC); +} + +static void sctp_intl_reasm_flushtsn(struct sctp_ulpq *ulpq, __u32 ftsn) +{ + struct sk_buff *pos, *tmp; + + skb_queue_walk_safe(&ulpq->reasm, pos, tmp) { + struct sctp_ulpevent *event = sctp_skb2event(pos); + __u32 tsn = event->tsn; + + if (TSN_lte(tsn, ftsn)) { + __skb_unlink(pos, &ulpq->reasm); + sctp_ulpevent_free(event); + } + } + + skb_queue_walk_safe(&ulpq->reasm_uo, pos, tmp) { + struct sctp_ulpevent *event = sctp_skb2event(pos); + __u32 tsn = event->tsn; + + if (TSN_lte(tsn, ftsn)) { + __skb_unlink(pos, &ulpq->reasm_uo); + sctp_ulpevent_free(event); + } + } +} + +static void sctp_report_iftsn(struct sctp_ulpq *ulpq, __u32 ftsn) +{ + /* Move the Cumulattive TSN Ack ahead. */ + sctp_tsnmap_skip(&ulpq->asoc->peer.tsn_map, ftsn); + /* purge the fragmentation queue */ + sctp_intl_reasm_flushtsn(ulpq, ftsn); + /* abort only when it's for all data */ + if (ftsn == sctp_tsnmap_get_max_tsn_seen(&ulpq->asoc->peer.tsn_map)) + sctp_intl_abort_pd(ulpq, GFP_ATOMIC); +} + +static void sctp_handle_fwdtsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk) +{ + struct sctp_fwdtsn_skip *skip; + + /* Walk through all the skipped SSNs */ + sctp_walk_fwdtsn(skip, chunk) + sctp_ulpq_skip(ulpq, ntohs(skip->stream), ntohs(skip->ssn)); +} + +static void sctp_intl_skip(struct sctp_ulpq *ulpq, __u16 sid, __u32 mid, + __u8 flags) +{ + struct sctp_stream_in *sin = sctp_stream_in(ulpq->asoc, sid); + struct sctp_stream *stream = &ulpq->asoc->stream; + + if (flags & SCTP_FTSN_U_BIT) { + if (sin->pd_mode_uo && MID_lt(sin->mid_uo, mid)) { + sin->pd_mode_uo = 0; + sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x1, + GFP_ATOMIC); + } + return; + } + + if (MID_lt(mid, sctp_mid_peek(stream, in, sid))) + return; + + if (sin->pd_mode) { + sin->pd_mode = 0; + sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x0, GFP_ATOMIC); + } + + sctp_mid_skip(stream, in, sid, mid); + + sctp_intl_reap_ordered(ulpq, sid); +} + +static void sctp_handle_iftsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk) +{ + struct sctp_ifwdtsn_skip *skip; + + /* Walk through all the skipped MIDs and abort stream pd if possible */ + sctp_walk_ifwdtsn(skip, chunk) + sctp_intl_skip(ulpq, ntohs(skip->stream), + ntohl(skip->mid), skip->flags); +} + +static struct sctp_stream_interleave sctp_stream_interleave_0 = { + .data_chunk_len = sizeof(struct sctp_data_chunk), + .ftsn_chunk_len = sizeof(struct sctp_fwdtsn_chunk), + /* DATA process functions */ + .make_datafrag = sctp_make_datafrag_empty, + .assign_number = sctp_chunk_assign_ssn, + .validate_data = sctp_validate_data, + .ulpevent_data = sctp_ulpq_tail_data, + .enqueue_event = sctp_ulpq_tail_event, + .renege_events = sctp_ulpq_renege, + .start_pd = sctp_ulpq_partial_delivery, + .abort_pd = sctp_ulpq_abort_pd, + /* FORWARD-TSN process functions */ + .generate_ftsn = sctp_generate_fwdtsn, + .validate_ftsn = sctp_validate_fwdtsn, + .report_ftsn = sctp_report_fwdtsn, + .handle_ftsn = sctp_handle_fwdtsn, +}; + +static struct sctp_stream_interleave sctp_stream_interleave_1 = { + .data_chunk_len = sizeof(struct sctp_idata_chunk), + .ftsn_chunk_len = sizeof(struct sctp_ifwdtsn_chunk), + /* I-DATA process functions */ + .make_datafrag = sctp_make_idatafrag_empty, + .assign_number = sctp_chunk_assign_mid, + .validate_data = sctp_validate_idata, + .ulpevent_data = sctp_ulpevent_idata, + .enqueue_event = sctp_enqueue_event, + .renege_events = sctp_renege_events, + .start_pd = sctp_intl_start_pd, + .abort_pd = sctp_intl_abort_pd, + /* I-FORWARD-TSN process functions */ + .generate_ftsn = sctp_generate_iftsn, + .validate_ftsn = sctp_validate_iftsn, + .report_ftsn = sctp_report_iftsn, + .handle_ftsn = sctp_handle_iftsn, +}; + +void sctp_stream_interleave_init(struct sctp_stream *stream) +{ + struct sctp_association *asoc; + + asoc = container_of(stream, struct sctp_association, stream); + stream->si = asoc->intl_enable ? &sctp_stream_interleave_1 + : &sctp_stream_interleave_0; +} diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c index 0b83ec51e43b..f5fcd425232a 100644 --- a/net/sctp/stream_sched.c +++ b/net/sctp/stream_sched.c @@ -119,16 +119,27 @@ static struct sctp_sched_ops sctp_sched_fcfs = { .unsched_all = sctp_sched_fcfs_unsched_all, }; +static void sctp_sched_ops_fcfs_init(void) +{ + sctp_sched_ops_register(SCTP_SS_FCFS, &sctp_sched_fcfs); +} + /* API to other parts of the stack */ -extern struct sctp_sched_ops sctp_sched_prio; -extern struct sctp_sched_ops sctp_sched_rr; +static struct sctp_sched_ops *sctp_sched_ops[SCTP_SS_MAX + 1]; -static struct sctp_sched_ops *sctp_sched_ops[] = { - &sctp_sched_fcfs, - &sctp_sched_prio, - &sctp_sched_rr, -}; +void sctp_sched_ops_register(enum sctp_sched_type sched, + struct sctp_sched_ops *sched_ops) +{ + sctp_sched_ops[sched] = sched_ops; +} + +void sctp_sched_ops_init(void) +{ + sctp_sched_ops_fcfs_init(); + sctp_sched_ops_prio_init(); + sctp_sched_ops_rr_init(); +} int sctp_sched_set_sched(struct sctp_association *asoc, enum sctp_sched_type sched) @@ -231,7 +242,8 @@ int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid, void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch) { - if (!list_is_last(&ch->frag_list, &ch->msg->chunks)) { + if (!list_is_last(&ch->frag_list, &ch->msg->chunks) && + !q->asoc->intl_enable) { struct sctp_stream_out *sout; __u16 sid; diff --git a/net/sctp/stream_sched_prio.c b/net/sctp/stream_sched_prio.c index 384dbf3c8760..7997d35dd0fd 100644 --- a/net/sctp/stream_sched_prio.c +++ b/net/sctp/stream_sched_prio.c @@ -333,7 +333,7 @@ static void sctp_sched_prio_unsched_all(struct sctp_stream *stream) sctp_sched_prio_unsched(soute); } -struct sctp_sched_ops sctp_sched_prio = { +static struct sctp_sched_ops sctp_sched_prio = { .set = sctp_sched_prio_set, .get = sctp_sched_prio_get, .init = sctp_sched_prio_init, @@ -345,3 +345,8 @@ struct sctp_sched_ops sctp_sched_prio = { .sched_all = sctp_sched_prio_sched_all, .unsched_all = sctp_sched_prio_unsched_all, }; + +void sctp_sched_ops_prio_init(void) +{ + sctp_sched_ops_register(SCTP_SS_PRIO, &sctp_sched_prio); +} diff --git a/net/sctp/stream_sched_rr.c b/net/sctp/stream_sched_rr.c index 7612a438c5b9..1155692448f1 100644 --- a/net/sctp/stream_sched_rr.c +++ b/net/sctp/stream_sched_rr.c @@ -187,7 +187,7 @@ static void sctp_sched_rr_unsched_all(struct sctp_stream *stream) sctp_sched_rr_unsched(stream, soute); } -struct sctp_sched_ops sctp_sched_rr = { +static struct sctp_sched_ops sctp_sched_rr = { .set = sctp_sched_rr_set, .get = sctp_sched_rr_get, .init = sctp_sched_rr_init, @@ -199,3 +199,8 @@ struct sctp_sched_ops sctp_sched_rr = { .sched_all = sctp_sched_rr_sched_all, .unsched_all = sctp_sched_rr_unsched_all, }; + +void sctp_sched_ops_rr_init(void) +{ + sctp_sched_ops_register(SCTP_SS_RR, &sctp_sched_rr); +} diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index ef7ca44d6e6a..33ca5b73cdb3 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -289,6 +289,13 @@ static struct ctl_table sctp_net_table[] = { .proc_handler = proc_sctp_do_auth, }, { + .procname = "intl_enable", + .data = &init_net.sctp.intl_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "addr_scope_policy", .data = &init_net.sctp.scope_policy, .maxlen = sizeof(int), diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 1e5a22430cf5..47f82bd794d9 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -248,28 +248,37 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk) transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; } -void sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) +bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) { struct dst_entry *dst = sctp_transport_dst_check(t); + bool change = true; if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { - pr_warn("%s: Reported pmtu %d too low, using default minimum of %d\n", - __func__, pmtu, SCTP_DEFAULT_MINSEGMENT); - /* Use default minimum segment size and disable - * pmtu discovery on this transport. - */ - t->pathmtu = SCTP_DEFAULT_MINSEGMENT; - } else { - t->pathmtu = pmtu; + pr_warn_ratelimited("%s: Reported pmtu %d too low, using default minimum of %d\n", + __func__, pmtu, SCTP_DEFAULT_MINSEGMENT); + /* Use default minimum segment instead */ + pmtu = SCTP_DEFAULT_MINSEGMENT; } + pmtu = SCTP_TRUNC4(pmtu); if (dst) { dst->ops->update_pmtu(dst, t->asoc->base.sk, NULL, pmtu); dst = sctp_transport_dst_check(t); } - if (!dst) + if (!dst) { t->af_specific->get_dst(t, &t->saddr, &t->fl, t->asoc->base.sk); + dst = t->dst; + } + + if (dst) { + /* Re-fetch, as under layers may have a higher minimum size */ + pmtu = SCTP_TRUNC4(dst_mtu(dst)); + change = t->pathmtu != pmtu; + } + t->pathmtu = pmtu; + + return change; } /* Caches the dst entry and source address for a transport's destination diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index 5447228bf1a0..84207ad33e8e 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -443,8 +443,8 @@ struct sctp_ulpevent *sctp_ulpevent_make_send_failed( goto fail; /* Pull off the common chunk header and DATA header. */ - skb_pull(skb, sizeof(struct sctp_data_chunk)); - len -= sizeof(struct sctp_data_chunk); + skb_pull(skb, sctp_datachk_len(&asoc->stream)); + len -= sctp_datachk_len(&asoc->stream); /* Embed the event fields inside the cloned skb. */ event = sctp_skb2event(skb); @@ -705,8 +705,6 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, sctp_ulpevent_receive_data(event, asoc); event->stream = ntohs(chunk->subh.data_hdr->stream); - event->ssn = ntohs(chunk->subh.data_hdr->ssn); - event->ppid = chunk->subh.data_hdr->ppid; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) { event->flags |= SCTP_UNORDERED; event->cumtsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); @@ -732,8 +730,9 @@ fail: * various events. */ struct sctp_ulpevent *sctp_ulpevent_make_pdapi( - const struct sctp_association *asoc, __u32 indication, - gfp_t gfp) + const struct sctp_association *asoc, + __u32 indication, __u32 sid, __u32 seq, + __u32 flags, gfp_t gfp) { struct sctp_ulpevent *event; struct sctp_pdapi_event *pd; @@ -754,7 +753,9 @@ struct sctp_ulpevent *sctp_ulpevent_make_pdapi( * Currently unused. */ pd->pdapi_type = SCTP_PARTIAL_DELIVERY_EVENT; - pd->pdapi_flags = 0; + pd->pdapi_flags = flags; + pd->pdapi_stream = sid; + pd->pdapi_seq = seq; /* pdapi_length: 32 bits (unsigned integer) * diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index a71be33f3afe..0b427100b0d4 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -60,6 +60,7 @@ struct sctp_ulpq *sctp_ulpq_init(struct sctp_ulpq *ulpq, ulpq->asoc = asoc; skb_queue_head_init(&ulpq->reasm); + skb_queue_head_init(&ulpq->reasm_uo); skb_queue_head_init(&ulpq->lobby); ulpq->pd_mode = 0; @@ -83,6 +84,10 @@ void sctp_ulpq_flush(struct sctp_ulpq *ulpq) sctp_ulpevent_free(event); } + while ((skb = __skb_dequeue(&ulpq->reasm_uo)) != NULL) { + event = sctp_skb2event(skb); + sctp_ulpevent_free(event); + } } /* Dispose of a ulpqueue. */ @@ -104,6 +109,9 @@ int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, if (!event) return -ENOMEM; + event->ssn = ntohs(chunk->subh.data_hdr->ssn); + event->ppid = chunk->subh.data_hdr->ppid; + /* Do reassembly if needed. */ event = sctp_ulpq_reasm(ulpq, event); @@ -328,9 +336,10 @@ static void sctp_ulpq_store_reasm(struct sctp_ulpq *ulpq, * payload was fragmented on the way and ip had to reassemble them. * We add the rest of skb's to the first skb's fraglist. */ -static struct sctp_ulpevent *sctp_make_reassembled_event(struct net *net, - struct sk_buff_head *queue, struct sk_buff *f_frag, - struct sk_buff *l_frag) +struct sctp_ulpevent *sctp_make_reassembled_event(struct net *net, + struct sk_buff_head *queue, + struct sk_buff *f_frag, + struct sk_buff *l_frag) { struct sk_buff *pos; struct sk_buff *new = NULL; @@ -853,7 +862,7 @@ static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq, struct sctp_stream *stream; /* Check if this message needs ordering. */ - if (SCTP_DATA_UNORDERED & event->msg_flags) + if (event->msg_flags & SCTP_DATA_UNORDERED) return event; /* Note: The stream ID must be verified before this routine. */ @@ -974,8 +983,8 @@ void sctp_ulpq_skip(struct sctp_ulpq *ulpq, __u16 sid, __u16 ssn) sctp_ulpq_reap_ordered(ulpq, sid); } -static __u16 sctp_ulpq_renege_list(struct sctp_ulpq *ulpq, - struct sk_buff_head *list, __u16 needed) +__u16 sctp_ulpq_renege_list(struct sctp_ulpq *ulpq, struct sk_buff_head *list, + __u16 needed) { __u16 freed = 0; __u32 tsn, last_tsn; @@ -1084,29 +1093,21 @@ void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq, void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, gfp_t gfp) { - struct sctp_association *asoc; - __u16 needed, freed; + struct sctp_association *asoc = ulpq->asoc; + __u32 freed = 0; + __u16 needed; - asoc = ulpq->asoc; - - if (chunk) { - needed = ntohs(chunk->chunk_hdr->length); - needed -= sizeof(struct sctp_data_chunk); - } else - needed = SCTP_DEFAULT_MAXWINDOW; - - freed = 0; + needed = ntohs(chunk->chunk_hdr->length) - + sizeof(struct sctp_data_chunk); if (skb_queue_empty(&asoc->base.sk->sk_receive_queue)) { freed = sctp_ulpq_renege_order(ulpq, needed); - if (freed < needed) { + if (freed < needed) freed += sctp_ulpq_renege_frags(ulpq, needed - freed); - } } /* If able to free enough room, accept this chunk. */ - if (chunk && (freed >= needed)) { - int retval; - retval = sctp_ulpq_tail_data(ulpq, chunk, gfp); + if (freed >= needed) { + int retval = sctp_ulpq_tail_data(ulpq, chunk, gfp); /* * Enter partial delivery if chunk has not been * delivered; otherwise, drain the reassembly queue. @@ -1140,7 +1141,7 @@ void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp) &sctp_sk(sk)->subscribe)) ev = sctp_ulpevent_make_pdapi(ulpq->asoc, SCTP_PARTIAL_DELIVERY_ABORTED, - gfp); + 0, 0, 0, gfp); if (ev) __skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev)); diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 6451c5013e06..3583c8ab1bae 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -115,7 +115,6 @@ static int smc_release(struct socket *sock) goto out; smc = smc_sk(sk); - sock_hold(sk); if (sk->sk_state == SMC_LISTEN) /* smc_close_non_accepted() is called and acquires * sock lock for child sockets again @@ -124,10 +123,7 @@ static int smc_release(struct socket *sock) else lock_sock(sk); - if (smc->use_fallback) { - sk->sk_state = SMC_CLOSED; - sk->sk_state_change(sk); - } else { + if (!smc->use_fallback) { rc = smc_close_active(smc); sock_set_flag(sk, SOCK_DEAD); sk->sk_shutdown |= SHUTDOWN_MASK; @@ -136,20 +132,21 @@ static int smc_release(struct socket *sock) sock_release(smc->clcsock); smc->clcsock = NULL; } + if (smc->use_fallback) { + sock_put(sk); /* passive closing */ + sk->sk_state = SMC_CLOSED; + sk->sk_state_change(sk); + } /* detach socket */ sock_orphan(sk); sock->sk = NULL; - if (smc->use_fallback) { - schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN); - } else if (sk->sk_state == SMC_CLOSED) { + if (!smc->use_fallback && sk->sk_state == SMC_CLOSED) smc_conn_free(&smc->conn); - schedule_delayed_work(&smc->sock_put_work, - SMC_CLOSE_SOCK_PUT_DELAY); - } release_sock(sk); - sock_put(sk); + sk->sk_prot->unhash(sk); + sock_put(sk); /* final sock_put */ out: return rc; } @@ -181,7 +178,6 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock) INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_LIST_HEAD(&smc->accept_q); spin_lock_init(&smc->accept_q_lock); - INIT_DELAYED_WORK(&smc->sock_put_work, smc_close_sock_put_work); sk->sk_prot->hash(sk); sk_refcnt_debug_inc(sk); @@ -377,6 +373,15 @@ static void smc_link_save_peer_info(struct smc_link *link, link->peer_mtu = clc->qp_mtu; } +static void smc_lgr_forget(struct smc_link_group *lgr) +{ + spin_lock_bh(&smc_lgr_list.lock); + /* do not use this link group for new connections */ + if (!list_empty(&lgr->list)) + list_del_init(&lgr->list); + spin_unlock_bh(&smc_lgr_list.lock); +} + /* setup for RDMA connection of client */ static int smc_connect_rdma(struct smc_sock *smc) { @@ -390,6 +395,8 @@ static int smc_connect_rdma(struct smc_sock *smc) int rc = 0; u8 ibport; + sock_hold(&smc->sk); /* sock put in passive closing */ + if (!tcp_sk(smc->clcsock->sk)->syn_smc) { /* peer has not signalled SMC-capability */ smc->use_fallback = true; @@ -513,6 +520,8 @@ out_connected: return rc ? rc : local_contact; decline_rdma_unlock: + if (local_contact == SMC_FIRST_CONTACT) + smc_lgr_forget(smc->conn.lgr); mutex_unlock(&smc_create_lgr_pending); smc_conn_free(&smc->conn); decline_rdma: @@ -520,15 +529,19 @@ decline_rdma: smc->use_fallback = true; if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) { rc = smc_clc_send_decline(smc, reason_code); - if (rc < sizeof(struct smc_clc_msg_decline)) + if (rc < 0) goto out_err; } goto out_connected; out_err_unlock: + if (local_contact == SMC_FIRST_CONTACT) + smc_lgr_forget(smc->conn.lgr); mutex_unlock(&smc_create_lgr_pending); smc_conn_free(&smc->conn); out_err: + if (smc->sk.sk_state == SMC_INIT) + sock_put(&smc->sk); /* passive closing */ return rc; } @@ -581,40 +594,33 @@ out_err: static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) { - struct sock *sk = &lsmc->sk; - struct socket *new_clcsock; + struct socket *new_clcsock = NULL; + struct sock *lsk = &lsmc->sk; struct sock *new_sk; int rc; - release_sock(&lsmc->sk); - new_sk = smc_sock_alloc(sock_net(sk), NULL); + release_sock(lsk); + new_sk = smc_sock_alloc(sock_net(lsk), NULL); if (!new_sk) { rc = -ENOMEM; - lsmc->sk.sk_err = ENOMEM; + lsk->sk_err = ENOMEM; *new_smc = NULL; - lock_sock(&lsmc->sk); + lock_sock(lsk); goto out; } *new_smc = smc_sk(new_sk); rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0); - lock_sock(&lsmc->sk); - if (rc < 0) { - lsmc->sk.sk_err = -rc; - new_sk->sk_state = SMC_CLOSED; - sock_set_flag(new_sk, SOCK_DEAD); - sk->sk_prot->unhash(new_sk); - sock_put(new_sk); - *new_smc = NULL; - goto out; - } - if (lsmc->sk.sk_state == SMC_CLOSED) { + lock_sock(lsk); + if (rc < 0) + lsk->sk_err = -rc; + if (rc < 0 || lsk->sk_state == SMC_CLOSED) { if (new_clcsock) sock_release(new_clcsock); new_sk->sk_state = SMC_CLOSED; sock_set_flag(new_sk, SOCK_DEAD); - sk->sk_prot->unhash(new_sk); - sock_put(new_sk); + new_sk->sk_prot->unhash(new_sk); + sock_put(new_sk); /* final */ *new_smc = NULL; goto out; } @@ -631,7 +637,7 @@ static void smc_accept_enqueue(struct sock *parent, struct sock *sk) { struct smc_sock *par = smc_sk(parent); - sock_hold(sk); + sock_hold(sk); /* sock_put in smc_accept_unlink () */ spin_lock(&par->accept_q_lock); list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q); spin_unlock(&par->accept_q_lock); @@ -647,7 +653,7 @@ static void smc_accept_unlink(struct sock *sk) list_del_init(&smc_sk(sk)->accept_q); spin_unlock(&par->accept_q_lock); sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk); - sock_put(sk); + sock_put(sk); /* sock_hold in smc_accept_enqueue */ } /* remove a sock from the accept queue to bind it to a new socket created @@ -664,8 +670,12 @@ struct sock *smc_accept_dequeue(struct sock *parent, smc_accept_unlink(new_sk); if (new_sk->sk_state == SMC_CLOSED) { + if (isk->clcsock) { + sock_release(isk->clcsock); + isk->clcsock = NULL; + } new_sk->sk_prot->unhash(new_sk); - sock_put(new_sk); + sock_put(new_sk); /* final */ continue; } if (new_sock) @@ -680,14 +690,11 @@ void smc_close_non_accepted(struct sock *sk) { struct smc_sock *smc = smc_sk(sk); - sock_hold(sk); lock_sock(sk); if (!sk->sk_lingertime) /* wait for peer closing */ sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT; - if (smc->use_fallback) { - sk->sk_state = SMC_CLOSED; - } else { + if (!smc->use_fallback) { smc_close_active(smc); sock_set_flag(sk, SOCK_DEAD); sk->sk_shutdown |= SHUTDOWN_MASK; @@ -700,14 +707,15 @@ void smc_close_non_accepted(struct sock *sk) sock_release(tcp); } if (smc->use_fallback) { - schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN); - } else if (sk->sk_state == SMC_CLOSED) { - smc_conn_free(&smc->conn); - schedule_delayed_work(&smc->sock_put_work, - SMC_CLOSE_SOCK_PUT_DELAY); + sock_put(sk); /* passive closing */ + sk->sk_state = SMC_CLOSED; + } else { + if (sk->sk_state == SMC_CLOSED) + smc_conn_free(&smc->conn); } release_sock(sk); - sock_put(sk); + sk->sk_prot->unhash(sk); + sock_put(sk); /* final sock_put */ } static int smc_serv_conf_first_link(struct smc_sock *smc) @@ -751,14 +759,16 @@ static void smc_listen_work(struct work_struct *work) { struct smc_sock *new_smc = container_of(work, struct smc_sock, smc_listen_work); + struct smc_clc_msg_proposal_prefix *pclc_prfx; struct socket *newclcsock = new_smc->clcsock; struct smc_sock *lsmc = new_smc->listen_smc; struct smc_clc_msg_accept_confirm cclc; int local_contact = SMC_REUSE_CONTACT; struct sock *newsmcsk = &new_smc->sk; - struct smc_clc_msg_proposal pclc; + struct smc_clc_msg_proposal *pclc; struct smc_ib_device *smcibdev; struct sockaddr_in peeraddr; + u8 buf[SMC_CLC_MAX_LEN]; struct smc_link *link; int reason_code = 0; int rc = 0, len; @@ -775,7 +785,7 @@ static void smc_listen_work(struct work_struct *work) /* do inband token exchange - *wait for and receive SMC Proposal CLC message */ - reason_code = smc_clc_wait_msg(new_smc, &pclc, sizeof(pclc), + reason_code = smc_clc_wait_msg(new_smc, &buf, sizeof(buf), SMC_CLC_PROPOSAL); if (reason_code < 0) goto out_err; @@ -804,8 +814,11 @@ static void smc_listen_work(struct work_struct *work) reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ goto decline_rdma; } - if ((pclc.outgoing_subnet != subnet) || - (pclc.prefix_len != prefix_len)) { + + pclc = (struct smc_clc_msg_proposal *)&buf; + pclc_prfx = smc_clc_proposal_get_prefix(pclc); + if (pclc_prfx->outgoing_subnet != subnet || + pclc_prfx->prefix_len != prefix_len) { reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ goto decline_rdma; } @@ -816,7 +829,7 @@ static void smc_listen_work(struct work_struct *work) /* allocate connection / link group */ mutex_lock(&smc_create_lgr_pending); local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr, - smcibdev, ibport, &pclc.lcl, 0); + smcibdev, ibport, &pclc->lcl, 0); if (local_contact < 0) { rc = local_contact; if (rc == -ENOMEM) @@ -879,11 +892,9 @@ static void smc_listen_work(struct work_struct *work) } /* QP confirmation over RoCE fabric */ reason_code = smc_serv_conf_first_link(new_smc); - if (reason_code < 0) { + if (reason_code < 0) /* peer is not aware of a problem */ - rc = reason_code; goto out_err_unlock; - } if (reason_code > 0) goto decline_rdma_unlock; } @@ -910,21 +921,26 @@ enqueue: return; decline_rdma_unlock: + if (local_contact == SMC_FIRST_CONTACT) + smc_lgr_forget(new_smc->conn.lgr); mutex_unlock(&smc_create_lgr_pending); decline_rdma: /* RDMA setup failed, switch back to TCP */ smc_conn_free(&new_smc->conn); new_smc->use_fallback = true; if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) { - rc = smc_clc_send_decline(new_smc, reason_code); - if (rc < sizeof(struct smc_clc_msg_decline)) + if (smc_clc_send_decline(new_smc, reason_code) < 0) goto out_err; } goto out_connected; out_err_unlock: + if (local_contact == SMC_FIRST_CONTACT) + smc_lgr_forget(new_smc->conn.lgr); mutex_unlock(&smc_create_lgr_pending); out_err: + if (newsmcsk->sk_state == SMC_INIT) + sock_put(&new_smc->sk); /* passive closing */ newsmcsk->sk_state = SMC_CLOSED; smc_conn_free(&new_smc->conn); goto enqueue; /* queue new sock with sk_err set */ @@ -934,11 +950,12 @@ static void smc_tcp_listen_work(struct work_struct *work) { struct smc_sock *lsmc = container_of(work, struct smc_sock, tcp_listen_work); + struct sock *lsk = &lsmc->sk; struct smc_sock *new_smc; int rc = 0; - lock_sock(&lsmc->sk); - while (lsmc->sk.sk_state == SMC_LISTEN) { + lock_sock(lsk); + while (lsk->sk_state == SMC_LISTEN) { rc = smc_clcsock_accept(lsmc, &new_smc); if (rc) goto out; @@ -947,15 +964,25 @@ static void smc_tcp_listen_work(struct work_struct *work) new_smc->listen_smc = lsmc; new_smc->use_fallback = false; /* assume rdma capability first*/ - sock_hold(&lsmc->sk); /* sock_put in smc_listen_work */ + sock_hold(lsk); /* sock_put in smc_listen_work */ INIT_WORK(&new_smc->smc_listen_work, smc_listen_work); smc_copy_sock_settings_to_smc(new_smc); - schedule_work(&new_smc->smc_listen_work); + sock_hold(&new_smc->sk); /* sock_put in passive closing */ + if (!schedule_work(&new_smc->smc_listen_work)) + sock_put(&new_smc->sk); } out: - release_sock(&lsmc->sk); - lsmc->sk.sk_data_ready(&lsmc->sk); /* no more listening, wake accept */ + if (lsmc->clcsock) { + sock_release(lsmc->clcsock); + lsmc->clcsock = NULL; + } + release_sock(lsk); + /* no more listening, wake up smc_close_wait_listen_clcsock and + * accept + */ + lsk->sk_state_change(lsk); + sock_put(&lsmc->sk); /* sock_hold in smc_listen */ } static int smc_listen(struct socket *sock, int backlog) @@ -989,7 +1016,9 @@ static int smc_listen(struct socket *sock, int backlog) sk->sk_ack_backlog = 0; sk->sk_state = SMC_LISTEN; INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); - schedule_work(&smc->tcp_listen_work); + sock_hold(sk); /* sock_hold in tcp_listen_worker */ + if (!schedule_work(&smc->tcp_listen_work)) + sock_put(sk); out: release_sock(sk); @@ -1006,6 +1035,7 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, int rc = 0; lsmc = smc_sk(sk); + sock_hold(sk); /* sock_put below */ lock_sock(sk); if (lsmc->sk.sk_state != SMC_LISTEN) { @@ -1040,6 +1070,7 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, out: release_sock(sk); + sock_put(sk); /* sock_hold above */ return rc; } @@ -1107,36 +1138,36 @@ out: return rc; } -static unsigned int smc_accept_poll(struct sock *parent) +static __poll_t smc_accept_poll(struct sock *parent) { - struct smc_sock *isk; - struct sock *sk; + struct smc_sock *isk = smc_sk(parent); + int mask = 0; - lock_sock(parent); - list_for_each_entry(isk, &smc_sk(parent)->accept_q, accept_q) { - sk = (struct sock *)isk; + spin_lock(&isk->accept_q_lock); + if (!list_empty(&isk->accept_q)) + mask = POLLIN | POLLRDNORM; + spin_unlock(&isk->accept_q_lock); - if (sk->sk_state == SMC_ACTIVE) { - release_sock(parent); - return POLLIN | POLLRDNORM; - } - } - release_sock(parent); - - return 0; + return mask; } -static unsigned int smc_poll(struct file *file, struct socket *sock, +static __poll_t smc_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; - unsigned int mask = 0; + __poll_t mask = 0; struct smc_sock *smc; int rc; + if (!sk) + return POLLNVAL; + smc = smc_sk(sock->sk); + sock_hold(sk); + lock_sock(sk); if ((sk->sk_state == SMC_INIT) || smc->use_fallback) { /* delegate to CLC child sock */ + release_sock(sk); mask = smc->clcsock->ops->poll(file, smc->clcsock, wait); /* if non-blocking connect finished ... */ lock_sock(sk); @@ -1148,37 +1179,43 @@ static unsigned int smc_poll(struct file *file, struct socket *sock, rc = smc_connect_rdma(smc); if (rc < 0) mask |= POLLERR; - else - /* success cases including fallback */ - mask |= POLLOUT | POLLWRNORM; + /* success cases including fallback */ + mask |= POLLOUT | POLLWRNORM; } } - release_sock(sk); } else { - sock_poll_wait(file, sk_sleep(sk), wait); - if (sk->sk_state == SMC_LISTEN) - /* woken up by sk_data_ready in smc_listen_work() */ - mask |= smc_accept_poll(sk); + if (sk->sk_state != SMC_CLOSED) { + release_sock(sk); + sock_poll_wait(file, sk_sleep(sk), wait); + lock_sock(sk); + } if (sk->sk_err) mask |= POLLERR; - if (atomic_read(&smc->conn.sndbuf_space) || - (sk->sk_shutdown & SEND_SHUTDOWN)) { - mask |= POLLOUT | POLLWRNORM; - } else { - sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - } - if (atomic_read(&smc->conn.bytes_to_rcv)) - mask |= POLLIN | POLLRDNORM; if ((sk->sk_shutdown == SHUTDOWN_MASK) || (sk->sk_state == SMC_CLOSED)) mask |= POLLHUP; - if (sk->sk_shutdown & RCV_SHUTDOWN) - mask |= POLLIN | POLLRDNORM | POLLRDHUP; - if (sk->sk_state == SMC_APPCLOSEWAIT1) - mask |= POLLIN; + if (sk->sk_state == SMC_LISTEN) { + /* woken up by sk_data_ready in smc_listen_work() */ + mask = smc_accept_poll(sk); + } else { + if (atomic_read(&smc->conn.sndbuf_space) || + sk->sk_shutdown & SEND_SHUTDOWN) { + mask |= POLLOUT | POLLWRNORM; + } else { + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + } + if (atomic_read(&smc->conn.bytes_to_rcv)) + mask |= POLLIN | POLLRDNORM; + if (sk->sk_shutdown & RCV_SHUTDOWN) + mask |= POLLIN | POLLRDNORM | POLLRDHUP; + if (sk->sk_state == SMC_APPCLOSEWAIT1) + mask |= POLLIN; + } } + release_sock(sk); + sock_put(sk); return mask; } diff --git a/net/smc/smc.h b/net/smc/smc.h index 0bee9d16cf29..9518986c97b1 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -178,7 +178,6 @@ struct smc_sock { /* smc sock container */ struct work_struct smc_listen_work;/* prepare new accept socket */ struct list_head accept_q; /* sockets to be accepted */ spinlock_t accept_q_lock; /* protects accept_q */ - struct delayed_work sock_put_work; /* final socket freeing */ bool use_fallback; /* fallback to tcp */ u8 wait_close_tx_prepared : 1; /* shutdown wr or close @@ -253,12 +252,12 @@ static inline int smc_uncompress_bufsize(u8 compressed) static inline bool using_ipsec(struct smc_sock *smc) { return (smc->clcsock->sk->sk_policy[0] || - smc->clcsock->sk->sk_policy[1]) ? 1 : 0; + smc->clcsock->sk->sk_policy[1]) ? true : false; } #else static inline bool using_ipsec(struct smc_sock *smc) { - return 0; + return false; } #endif diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 87f7bede6eab..3cd086e5bd28 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -57,9 +57,6 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, cdcpend->conn); } smc_tx_sndbuf_nonfull(smc); - if (smc->sk.sk_state != SMC_ACTIVE) - /* wake up smc_close_wait_tx_pends() */ - smc->sk.sk_state_change(&smc->sk); bh_unlock_sock(&smc->sk); } @@ -68,9 +65,14 @@ int smc_cdc_get_free_slot(struct smc_connection *conn, struct smc_cdc_tx_pend **pend) { struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + int rc; - return smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf, - (struct smc_wr_tx_pend_priv **)pend); + rc = smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf, + (struct smc_wr_tx_pend_priv **)pend); + if (!conn->alert_token_local) + /* abnormal termination */ + rc = -EPIPE; + return rc; } static inline void smc_cdc_add_pending_send(struct smc_connection *conn, @@ -155,14 +157,6 @@ void smc_cdc_tx_dismiss_slots(struct smc_connection *conn) (unsigned long)conn); } -bool smc_cdc_tx_has_pending(struct smc_connection *conn) -{ - struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK]; - - return smc_wr_tx_has_pending(link, SMC_CDC_MSG_TYPE, - smc_cdc_tx_filter, (unsigned long)conn); -} - /********************************* receive ***********************************/ static inline bool smc_cdc_before(u16 seq1, u16 seq2) @@ -213,6 +207,17 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, /* guarantee 0 <= bytes_to_rcv <= rmbe_size */ smp_mb__after_atomic(); smc->sk.sk_data_ready(&smc->sk); + } else if ((conn->local_rx_ctrl.prod_flags.write_blocked) || + (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req)) { + smc->sk.sk_data_ready(&smc->sk); + } + + /* piggy backed tx info */ + /* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */ + if (diff_cons && smc_tx_prepared_sends(conn)) { + smc_tx_sndbuf_nonempty(conn); + /* trigger socket release if connection closed */ + smc_close_wake_tx_prepared(smc); } if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) { @@ -224,25 +229,10 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, if (smc->clcsock && smc->clcsock->sk) smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN; sock_set_flag(&smc->sk, SOCK_DONE); - schedule_work(&conn->close_work); + sock_hold(&smc->sk); /* sock_put in close_work */ + if (!schedule_work(&conn->close_work)) + sock_put(&smc->sk); } - - /* piggy backed tx info */ - /* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */ - if (diff_cons && smc_tx_prepared_sends(conn)) { - smc_tx_sndbuf_nonempty(conn); - /* trigger socket release if connection closed */ - smc_close_wake_tx_prepared(smc); - } - - /* socket connected but not accepted */ - if (!smc->sk.sk_socket) - return; - - /* data available */ - if ((conn->local_rx_ctrl.prod_flags.write_blocked) || - (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req)) - smc_tx_consumer_update(conn); } /* called under tasklet context */ diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 149ceda1b088..ab240b37ad11 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -214,7 +214,6 @@ void smc_cdc_tx_dismiss_slots(struct smc_connection *conn); int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, struct smc_cdc_tx_pend *pend); int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn); -bool smc_cdc_tx_has_pending(struct smc_connection *conn); int smc_cdc_init(void) __init; #endif /* SMC_CDC_H */ diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 1800e16b2a02..8ac51583a063 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -22,6 +22,54 @@ #include "smc_clc.h" #include "smc_ib.h" +/* check if received message has a correct header length and contains valid + * heading and trailing eyecatchers + */ +static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm) +{ + struct smc_clc_msg_proposal_prefix *pclc_prfx; + struct smc_clc_msg_accept_confirm *clc; + struct smc_clc_msg_proposal *pclc; + struct smc_clc_msg_decline *dclc; + struct smc_clc_msg_trail *trl; + + if (memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER))) + return false; + switch (clcm->type) { + case SMC_CLC_PROPOSAL: + pclc = (struct smc_clc_msg_proposal *)clcm; + pclc_prfx = smc_clc_proposal_get_prefix(pclc); + if (ntohs(pclc->hdr.length) != + sizeof(*pclc) + ntohs(pclc->iparea_offset) + + sizeof(*pclc_prfx) + + pclc_prfx->ipv6_prefixes_cnt * + sizeof(struct smc_clc_ipv6_prefix) + + sizeof(*trl)) + return false; + trl = (struct smc_clc_msg_trail *) + ((u8 *)pclc + ntohs(pclc->hdr.length) - sizeof(*trl)); + break; + case SMC_CLC_ACCEPT: + case SMC_CLC_CONFIRM: + clc = (struct smc_clc_msg_accept_confirm *)clcm; + if (ntohs(clc->hdr.length) != sizeof(*clc)) + return false; + trl = &clc->trl; + break; + case SMC_CLC_DECLINE: + dclc = (struct smc_clc_msg_decline *)clcm; + if (ntohs(dclc->hdr.length) != sizeof(*dclc)) + return false; + trl = &dclc->trl; + break; + default: + return false; + } + if (memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER))) + return false; + return true; +} + /* Wait for data on the tcp-socket, analyze received data * Returns: * 0 if success and it was not a decline that we received. @@ -35,7 +83,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, struct smc_clc_msg_hdr *clcm = buf; struct msghdr msg = {NULL, 0}; int reason_code = 0; - struct kvec vec; + struct kvec vec = {buf, buflen}; int len, datlen; int krflags; @@ -43,12 +91,15 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, * so we don't consume any subsequent CLC message or payload data * in the TCP byte stream */ - vec.iov_base = buf; - vec.iov_len = buflen; + /* + * Caller must make sure that buflen is no less than + * sizeof(struct smc_clc_msg_hdr) + */ krflags = MSG_PEEK | MSG_WAITALL; smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME; - len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1, - sizeof(struct smc_clc_msg_hdr), krflags); + iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &vec, 1, + sizeof(struct smc_clc_msg_hdr)); + len = sock_recvmsg(smc->clcsock, &msg, krflags); if (signal_pending(current)) { reason_code = -EINTR; clc_sk->sk_err = EINTR; @@ -72,9 +123,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, } datlen = ntohs(clcm->length); if ((len < sizeof(struct smc_clc_msg_hdr)) || - (datlen < sizeof(struct smc_clc_msg_decline)) || - (datlen > sizeof(struct smc_clc_msg_accept_confirm)) || - memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) || + (datlen > buflen) || ((clcm->type != SMC_CLC_DECLINE) && (clcm->type != expected_type))) { smc->sk.sk_err = EPROTO; @@ -83,13 +132,12 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, } /* receive the complete CLC message */ - vec.iov_base = buf; - vec.iov_len = buflen; memset(&msg, 0, sizeof(struct msghdr)); + iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &vec, 1, buflen); krflags = MSG_WAITALL; smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME; - len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1, datlen, krflags); - if (len < datlen) { + len = sock_recvmsg(smc->clcsock, &msg, krflags); + if (len < datlen || !smc_clc_msg_hdr_valid(clcm)) { smc->sk.sk_err = EPROTO; reason_code = -EPROTO; goto out; @@ -133,7 +181,7 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info) smc->sk.sk_err = EPROTO; if (len < 0) smc->sk.sk_err = -len; - return len; + return sock_error(&smc->sk); } /* send CLC PROPOSAL message across internal TCP socket */ @@ -141,33 +189,43 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_ib_device *smcibdev, u8 ibport) { + struct smc_clc_msg_proposal_prefix pclc_prfx; struct smc_clc_msg_proposal pclc; + struct smc_clc_msg_trail trl; int reason_code = 0; + struct kvec vec[3]; struct msghdr msg; - struct kvec vec; - int len, rc; + int len, plen, rc; /* send SMC Proposal CLC message */ + plen = sizeof(pclc) + sizeof(pclc_prfx) + sizeof(trl); memset(&pclc, 0, sizeof(pclc)); memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); pclc.hdr.type = SMC_CLC_PROPOSAL; - pclc.hdr.length = htons(sizeof(pclc)); + pclc.hdr.length = htons(plen); pclc.hdr.version = SMC_CLC_V1; /* SMC version */ memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); memcpy(&pclc.lcl.gid, &smcibdev->gid[ibport - 1], SMC_GID_SIZE); memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1], ETH_ALEN); + pclc.iparea_offset = htons(0); + memset(&pclc_prfx, 0, sizeof(pclc_prfx)); /* determine subnet and mask from internal TCP socket */ - rc = smc_netinfo_by_tcpsk(smc->clcsock, &pclc.outgoing_subnet, - &pclc.prefix_len); + rc = smc_netinfo_by_tcpsk(smc->clcsock, &pclc_prfx.outgoing_subnet, + &pclc_prfx.prefix_len); if (rc) return SMC_CLC_DECL_CNFERR; /* configuration error */ - memcpy(pclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + pclc_prfx.ipv6_prefixes_cnt = 0; + memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); memset(&msg, 0, sizeof(msg)); - vec.iov_base = &pclc; - vec.iov_len = sizeof(pclc); + vec[0].iov_base = &pclc; + vec[0].iov_len = sizeof(pclc); + vec[1].iov_base = &pclc_prfx; + vec[1].iov_len = sizeof(pclc_prfx); + vec[2].iov_base = &trl; + vec[2].iov_len = sizeof(trl); /* due to the few bytes needed for clc-handshake this cannot block */ - len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(pclc)); + len = kernel_sendmsg(smc->clcsock, &msg, vec, 3, plen); if (len < sizeof(pclc)) { if (len >= 0) { reason_code = -ENETUNREACH; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 12a9af1539a2..c145a0f36a68 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -44,7 +44,7 @@ struct smc_clc_msg_hdr { /* header1 of clc messages */ #if defined(__BIG_ENDIAN_BITFIELD) u8 version : 4, flag : 1, - rsvd : 3; + rsvd : 3; #elif defined(__LITTLE_ENDIAN_BITFIELD) u8 rsvd : 3, flag : 1, @@ -62,17 +62,31 @@ struct smc_clc_msg_local { /* header2 of clc messages */ u8 mac[6]; /* mac of ib_device port */ }; -struct smc_clc_msg_proposal { /* clc proposal message */ - struct smc_clc_msg_hdr hdr; - struct smc_clc_msg_local lcl; - __be16 iparea_offset; /* offset to IP address information area */ +struct smc_clc_ipv6_prefix { + u8 prefix[4]; + u8 prefix_len; +} __packed; + +struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/ __be32 outgoing_subnet; /* subnet mask */ u8 prefix_len; /* number of significant bits in mask */ u8 reserved[2]; u8 ipv6_prefixes_cnt; /* number of IPv6 prefixes in prefix array */ - struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */ } __aligned(4); +struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */ + struct smc_clc_msg_hdr hdr; + struct smc_clc_msg_local lcl; + __be16 iparea_offset; /* offset to IP address information area */ +} __aligned(4); + +#define SMC_CLC_PROPOSAL_MAX_OFFSET 0x28 +#define SMC_CLC_PROPOSAL_MAX_PREFIX (8 * sizeof(struct smc_clc_ipv6_prefix)) +#define SMC_CLC_MAX_LEN (sizeof(struct smc_clc_msg_proposal) + \ + SMC_CLC_PROPOSAL_MAX_OFFSET + \ + SMC_CLC_PROPOSAL_MAX_PREFIX + \ + sizeof(struct smc_clc_msg_trail)) + struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */ struct smc_clc_msg_hdr hdr; struct smc_clc_msg_local lcl; @@ -102,6 +116,14 @@ struct smc_clc_msg_decline { /* clc decline message */ struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */ } __aligned(4); +/* determine start of the prefix area within the proposal message */ +static inline struct smc_clc_msg_proposal_prefix * +smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc) +{ + return (struct smc_clc_msg_proposal_prefix *) + ((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset)); +} + struct smc_sock; struct smc_ib_device; diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 48615d2ac4aa..e339c0186dcf 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -19,7 +19,7 @@ #include "smc_cdc.h" #include "smc_close.h" -#define SMC_CLOSE_WAIT_TX_PENDS_TIME (5 * HZ) +#define SMC_CLOSE_WAIT_LISTEN_CLCSOCK_TIME (5 * HZ) static void smc_close_cleanup_listen(struct sock *parent) { @@ -30,23 +30,24 @@ static void smc_close_cleanup_listen(struct sock *parent) smc_close_non_accepted(sk); } -static void smc_close_wait_tx_pends(struct smc_sock *smc) +static void smc_close_wait_listen_clcsock(struct smc_sock *smc) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct sock *sk = &smc->sk; signed long timeout; - timeout = SMC_CLOSE_WAIT_TX_PENDS_TIME; + timeout = SMC_CLOSE_WAIT_LISTEN_CLCSOCK_TIME; add_wait_queue(sk_sleep(sk), &wait); - while (!signal_pending(current) && timeout) { - int rc; - - rc = sk_wait_event(sk, &timeout, - !smc_cdc_tx_has_pending(&smc->conn), - &wait); - if (rc) + do { + release_sock(sk); + if (smc->clcsock) + timeout = wait_woken(&wait, TASK_UNINTERRUPTIBLE, + timeout); + sched_annotate_sleep(); + lock_sock(sk); + if (!smc->clcsock) break; - } + } while (timeout); remove_wait_queue(sk_sleep(sk), &wait); } @@ -111,58 +112,63 @@ static int smc_close_abort(struct smc_connection *conn) } /* terminate smc socket abnormally - active abort - * RDMA communication no longer possible + * link group is terminated, i.e. RDMA communication no longer possible */ -void smc_close_active_abort(struct smc_sock *smc) +static void smc_close_active_abort(struct smc_sock *smc) { + struct sock *sk = &smc->sk; + struct smc_cdc_conn_state_flags *txflags = &smc->conn.local_tx_ctrl.conn_state_flags; - smc->sk.sk_err = ECONNABORTED; + sk->sk_err = ECONNABORTED; if (smc->clcsock && smc->clcsock->sk) { smc->clcsock->sk->sk_err = ECONNABORTED; smc->clcsock->sk->sk_state_change(smc->clcsock->sk); } - switch (smc->sk.sk_state) { + switch (sk->sk_state) { case SMC_INIT: case SMC_ACTIVE: - smc->sk.sk_state = SMC_PEERABORTWAIT; + sk->sk_state = SMC_PEERABORTWAIT; + release_sock(sk); + cancel_delayed_work_sync(&smc->conn.tx_work); + lock_sock(sk); + sock_put(sk); /* passive closing */ break; case SMC_APPCLOSEWAIT1: case SMC_APPCLOSEWAIT2: - txflags->peer_conn_abort = 1; - sock_release(smc->clcsock); if (!smc_cdc_rxed_any_close(&smc->conn)) - smc->sk.sk_state = SMC_PEERABORTWAIT; + sk->sk_state = SMC_PEERABORTWAIT; else - smc->sk.sk_state = SMC_CLOSED; + sk->sk_state = SMC_CLOSED; + release_sock(sk); + cancel_delayed_work_sync(&smc->conn.tx_work); + lock_sock(sk); break; case SMC_PEERCLOSEWAIT1: case SMC_PEERCLOSEWAIT2: if (!txflags->peer_conn_closed) { - smc->sk.sk_state = SMC_PEERABORTWAIT; - txflags->peer_conn_abort = 1; - sock_release(smc->clcsock); + /* just SHUTDOWN_SEND done */ + sk->sk_state = SMC_PEERABORTWAIT; } else { - smc->sk.sk_state = SMC_CLOSED; + sk->sk_state = SMC_CLOSED; } + sock_put(sk); /* passive closing */ break; case SMC_PROCESSABORT: case SMC_APPFINCLOSEWAIT: - if (!txflags->peer_conn_closed) { - txflags->peer_conn_abort = 1; - sock_release(smc->clcsock); - } - smc->sk.sk_state = SMC_CLOSED; + sk->sk_state = SMC_CLOSED; break; case SMC_PEERFINCLOSEWAIT: + sock_put(sk); /* passive closing */ + break; case SMC_PEERABORTWAIT: case SMC_CLOSED: break; } - sock_set_flag(&smc->sk, SOCK_DEAD); - smc->sk.sk_state_change(&smc->sk); + sock_set_flag(sk, SOCK_DEAD); + sk->sk_state_change(sk); } static inline bool smc_close_sent_any_close(struct smc_connection *conn) @@ -185,13 +191,11 @@ int smc_close_active(struct smc_sock *smc) 0 : sock_flag(sk, SOCK_LINGER) ? sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT; -again: old_state = sk->sk_state; - switch (old_state) { +again: + switch (sk->sk_state) { case SMC_INIT: sk->sk_state = SMC_CLOSED; - if (smc->smc_listen_work.func) - cancel_work_sync(&smc->smc_listen_work); break; case SMC_LISTEN: sk->sk_state = SMC_CLOSED; @@ -200,11 +204,9 @@ again: rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); /* wake up kernel_accept of smc_tcp_listen_worker */ smc->clcsock->sk->sk_data_ready(smc->clcsock->sk); + smc_close_wait_listen_clcsock(smc); } - release_sock(sk); smc_close_cleanup_listen(sk); - cancel_work_sync(&smc->smc_listen_work); - lock_sock(sk); break; case SMC_ACTIVE: smc_close_stream_wait(smc, timeout); @@ -214,6 +216,8 @@ again: if (sk->sk_state == SMC_ACTIVE) { /* send close request */ rc = smc_close_final(conn); + if (rc) + break; sk->sk_state = SMC_PEERCLOSEWAIT1; } else { /* peer event has changed the state */ @@ -226,9 +230,10 @@ again: !smc_close_sent_any_close(conn)) { /* just shutdown wr done, send close request */ rc = smc_close_final(conn); + if (rc) + break; } sk->sk_state = SMC_CLOSED; - smc_close_wait_tx_pends(smc); break; case SMC_APPCLOSEWAIT1: case SMC_APPCLOSEWAIT2: @@ -237,19 +242,21 @@ again: release_sock(sk); cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); - if (sk->sk_err != ECONNABORTED) { - /* confirm close from peer */ - rc = smc_close_final(conn); - if (rc) - break; - } - if (smc_cdc_rxed_any_close(conn)) + if (sk->sk_state != SMC_APPCLOSEWAIT1 && + sk->sk_state != SMC_APPCLOSEWAIT2) + goto again; + /* confirm close from peer */ + rc = smc_close_final(conn); + if (rc) + break; + if (smc_cdc_rxed_any_close(conn)) { /* peer has closed the socket already */ sk->sk_state = SMC_CLOSED; - else + sock_put(sk); /* postponed passive closing */ + } else { /* peer has just issued a shutdown write */ sk->sk_state = SMC_PEERFINCLOSEWAIT; - smc_close_wait_tx_pends(smc); + } break; case SMC_PEERCLOSEWAIT1: case SMC_PEERCLOSEWAIT2: @@ -257,6 +264,8 @@ again: !smc_close_sent_any_close(conn)) { /* just shutdown wr done, send close request */ rc = smc_close_final(conn); + if (rc) + break; } /* peer sending PeerConnectionClosed will cause transition */ break; @@ -264,12 +273,8 @@ again: /* peer sending PeerConnectionClosed will cause transition */ break; case SMC_PROCESSABORT: - release_sock(sk); - cancel_delayed_work_sync(&conn->tx_work); - lock_sock(sk); smc_close_abort(conn); sk->sk_state = SMC_CLOSED; - smc_close_wait_tx_pends(smc); break; case SMC_PEERABORTWAIT: case SMC_CLOSED: @@ -278,7 +283,7 @@ again: } if (old_state != sk->sk_state) - sk->sk_state_change(&smc->sk); + sk->sk_state_change(sk); return rc; } @@ -289,37 +294,42 @@ static void smc_close_passive_abort_received(struct smc_sock *smc) struct sock *sk = &smc->sk; switch (sk->sk_state) { + case SMC_INIT: case SMC_ACTIVE: - case SMC_APPFINCLOSEWAIT: case SMC_APPCLOSEWAIT1: - case SMC_APPCLOSEWAIT2: - smc_close_abort(&smc->conn); + sk->sk_state = SMC_PROCESSABORT; + sock_put(sk); /* passive closing */ + break; + case SMC_APPFINCLOSEWAIT: sk->sk_state = SMC_PROCESSABORT; break; case SMC_PEERCLOSEWAIT1: case SMC_PEERCLOSEWAIT2: if (txflags->peer_done_writing && - !smc_close_sent_any_close(&smc->conn)) { + !smc_close_sent_any_close(&smc->conn)) /* just shutdown, but not yet closed locally */ - smc_close_abort(&smc->conn); sk->sk_state = SMC_PROCESSABORT; - } else { + else sk->sk_state = SMC_CLOSED; - } + sock_put(sk); /* passive closing */ break; + case SMC_APPCLOSEWAIT2: case SMC_PEERFINCLOSEWAIT: + sk->sk_state = SMC_CLOSED; + sock_put(sk); /* passive closing */ + break; case SMC_PEERABORTWAIT: sk->sk_state = SMC_CLOSED; break; - case SMC_INIT: case SMC_PROCESSABORT: /* nothing to do, add tracing in future patch */ break; } } -/* Some kind of closing has been received: peer_conn_closed, peer_conn_abort, - * or peer_done_writing. +/* Either some kind of closing has been received: peer_conn_closed, + * peer_conn_abort, or peer_done_writing + * or the link group of the connection terminates abnormally. */ static void smc_close_passive_work(struct work_struct *work) { @@ -331,7 +341,7 @@ static void smc_close_passive_work(struct work_struct *work) struct sock *sk = &smc->sk; int old_state; - lock_sock(&smc->sk); + lock_sock(sk); old_state = sk->sk_state; if (!conn->alert_token_local) { @@ -340,23 +350,32 @@ static void smc_close_passive_work(struct work_struct *work) goto wakeup; } - rxflags = &smc->conn.local_rx_ctrl.conn_state_flags; + rxflags = &conn->local_rx_ctrl.conn_state_flags; if (rxflags->peer_conn_abort) { + /* peer has not received all data */ smc_close_passive_abort_received(smc); + release_sock(&smc->sk); + cancel_delayed_work_sync(&conn->tx_work); + lock_sock(&smc->sk); goto wakeup; } switch (sk->sk_state) { case SMC_INIT: - if (atomic_read(&smc->conn.bytes_to_rcv) || + if (atomic_read(&conn->bytes_to_rcv) || (rxflags->peer_done_writing && - !smc_cdc_rxed_any_close(conn))) + !smc_cdc_rxed_any_close(conn))) { sk->sk_state = SMC_APPCLOSEWAIT1; - else + } else { sk->sk_state = SMC_CLOSED; + sock_put(sk); /* passive closing */ + } break; case SMC_ACTIVE: sk->sk_state = SMC_APPCLOSEWAIT1; + /* postpone sock_put() for passive closing to cover + * received SEND_SHUTDOWN as well + */ break; case SMC_PEERCLOSEWAIT1: if (rxflags->peer_done_writing) @@ -364,8 +383,7 @@ static void smc_close_passive_work(struct work_struct *work) /* fall through */ /* to check for closing */ case SMC_PEERCLOSEWAIT2: - case SMC_PEERFINCLOSEWAIT: - if (!smc_cdc_rxed_any_close(&smc->conn)) + if (!smc_cdc_rxed_any_close(conn)) break; if (sock_flag(sk, SOCK_DEAD) && smc_close_sent_any_close(conn)) { @@ -375,9 +393,20 @@ static void smc_close_passive_work(struct work_struct *work) /* just shutdown, but not yet closed locally */ sk->sk_state = SMC_APPFINCLOSEWAIT; } + sock_put(sk); /* passive closing */ + break; + case SMC_PEERFINCLOSEWAIT: + if (smc_cdc_rxed_any_close(conn)) { + sk->sk_state = SMC_CLOSED; + sock_put(sk); /* passive closing */ + } break; case SMC_APPCLOSEWAIT1: case SMC_APPCLOSEWAIT2: + /* postpone sock_put() for passive closing to cover + * received SEND_SHUTDOWN as well + */ + break; case SMC_APPFINCLOSEWAIT: case SMC_PEERABORTWAIT: case SMC_PROCESSABORT: @@ -393,23 +422,11 @@ wakeup: if (old_state != sk->sk_state) { sk->sk_state_change(sk); if ((sk->sk_state == SMC_CLOSED) && - (sock_flag(sk, SOCK_DEAD) || !sk->sk_socket)) { - smc_conn_free(&smc->conn); - schedule_delayed_work(&smc->sock_put_work, - SMC_CLOSE_SOCK_PUT_DELAY); - } + (sock_flag(sk, SOCK_DEAD) || !sk->sk_socket)) + smc_conn_free(conn); } - release_sock(&smc->sk); -} - -void smc_close_sock_put_work(struct work_struct *work) -{ - struct smc_sock *smc = container_of(to_delayed_work(work), - struct smc_sock, - sock_put_work); - - smc->sk.sk_prot->unhash(&smc->sk); - sock_put(&smc->sk); + release_sock(sk); + sock_put(sk); /* sock_hold done by schedulers of close_work */ } int smc_close_shutdown_write(struct smc_sock *smc) @@ -424,20 +441,21 @@ int smc_close_shutdown_write(struct smc_sock *smc) 0 : sock_flag(sk, SOCK_LINGER) ? sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT; -again: old_state = sk->sk_state; - switch (old_state) { +again: + switch (sk->sk_state) { case SMC_ACTIVE: smc_close_stream_wait(smc, timeout); release_sock(sk); cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); + if (sk->sk_state != SMC_ACTIVE) + goto again; /* send close wr request */ rc = smc_close_wr(conn); - if (sk->sk_state == SMC_ACTIVE) - sk->sk_state = SMC_PEERCLOSEWAIT1; - else - goto again; + if (rc) + break; + sk->sk_state = SMC_PEERCLOSEWAIT1; break; case SMC_APPCLOSEWAIT1: /* passive close */ @@ -446,8 +464,12 @@ again: release_sock(sk); cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); + if (sk->sk_state != SMC_APPCLOSEWAIT1) + goto again; /* confirm close from peer */ rc = smc_close_wr(conn); + if (rc) + break; sk->sk_state = SMC_APPCLOSEWAIT2; break; case SMC_APPCLOSEWAIT2: @@ -462,7 +484,7 @@ again: } if (old_state != sk->sk_state) - sk->sk_state_change(&smc->sk); + sk->sk_state_change(sk); return rc; } diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h index ed82506b1b0a..19eb6a211c23 100644 --- a/net/smc/smc_close.h +++ b/net/smc/smc_close.h @@ -20,9 +20,7 @@ #define SMC_CLOSE_SOCK_PUT_DELAY HZ void smc_close_wake_tx_prepared(struct smc_sock *smc); -void smc_close_active_abort(struct smc_sock *smc); int smc_close_active(struct smc_sock *smc); -void smc_close_sock_put_work(struct work_struct *work); int smc_close_shutdown_write(struct smc_sock *smc); void smc_close_init(struct smc_sock *smc); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 94f21116dac5..2424c7100aaf 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -128,6 +128,8 @@ static void smc_lgr_free_work(struct work_struct *work) bool conns; spin_lock_bh(&smc_lgr_list.lock); + if (list_empty(&lgr->list)) + goto free; read_lock_bh(&lgr->conns_lock); conns = RB_EMPTY_ROOT(&lgr->conns_all); read_unlock_bh(&lgr->conns_lock); @@ -136,6 +138,7 @@ static void smc_lgr_free_work(struct work_struct *work) return; } list_del_init(&lgr->list); /* remove from smc_lgr_list */ +free: spin_unlock_bh(&smc_lgr_list.lock); smc_lgr_free(lgr); } @@ -231,9 +234,7 @@ static void smc_buf_unuse(struct smc_connection *conn) /* remove a finished connection from its link group */ void smc_conn_free(struct smc_connection *conn) { - struct smc_link_group *lgr = conn->lgr; - - if (!lgr) + if (!conn->lgr) return; smc_cdc_tx_dismiss_slots(conn); smc_lgr_unregister_conn(conn); @@ -327,13 +328,17 @@ void smc_lgr_terminate(struct smc_link_group *lgr) while (node) { conn = rb_entry(node, struct smc_connection, alert_node); smc = container_of(conn, struct smc_sock, conn); - sock_hold(&smc->sk); + sock_hold(&smc->sk); /* sock_put in close work */ + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; __smc_lgr_unregister_conn(conn); - schedule_work(&conn->close_work); - sock_put(&smc->sk); + write_unlock_bh(&lgr->conns_lock); + if (!schedule_work(&conn->close_work)) + sock_put(&smc->sk); + write_lock_bh(&lgr->conns_lock); node = rb_first(&lgr->conns_all); } write_unlock_bh(&lgr->conns_lock); + wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait); } /* Determine vlan of internal TCP socket. diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index d2d01cf70224..427b91c1c964 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -86,7 +86,8 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns)) goto errout; - if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) && smc->conn.lgr) { + if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) && + smc->conn.alert_token_local) { struct smc_connection *conn = &smc->conn; struct smc_diag_conninfo cinfo = { .token = conn->alert_token_local, @@ -124,7 +125,8 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, goto errout; } - if ((req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && smc->conn.lgr) { + if ((req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && smc->conn.lgr && + !list_empty(&smc->conn.lgr->list)) { struct smc_diag_lgrinfo linfo = { .role = smc->conn.lgr->role, .lnk[0].ibport = smc->conn.lgr->lnk[0].ibport, diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 90f1a7f9085c..2a8957bd6d38 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -141,6 +141,17 @@ out: return rc; } +static void smc_ib_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) +{ + struct smc_link_group *lgr, *l; + + list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { + if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && + lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) + smc_lgr_terminate(lgr); + } +} + /* process context wrapper for might_sleep smc_ib_remember_port_attr */ static void smc_ib_port_event_work(struct work_struct *work) { @@ -151,6 +162,8 @@ static void smc_ib_port_event_work(struct work_struct *work) for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) { smc_ib_remember_port_attr(smcibdev, port_idx + 1); clear_bit(port_idx, &smcibdev->port_event_mask); + if (!smc_ib_port_active(smcibdev, port_idx + 1)) + smc_ib_port_terminate(smcibdev, port_idx + 1); } } @@ -165,15 +178,7 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler, switch (ibevent->event) { case IB_EVENT_PORT_ERR: - port_idx = ibevent->element.port_num - 1; - set_bit(port_idx, &smcibdev->port_event_mask); - schedule_work(&smcibdev->port_event_work); - /* fall through */ case IB_EVENT_DEVICE_FATAL: - /* tbd in follow-on patch: - * abnormal close of corresponding connections - */ - break; case IB_EVENT_PORT_ACTIVE: port_idx = ibevent->element.port_num - 1; set_bit(port_idx, &smcibdev->port_event_mask); @@ -186,7 +191,8 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler, void smc_ib_dealloc_protection_domain(struct smc_link *lnk) { - ib_dealloc_pd(lnk->roce_pd); + if (lnk->roce_pd) + ib_dealloc_pd(lnk->roce_pd); lnk->roce_pd = NULL; } @@ -203,14 +209,18 @@ int smc_ib_create_protection_domain(struct smc_link *lnk) static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) { + struct smc_ib_device *smcibdev = + (struct smc_ib_device *)ibevent->device; + u8 port_idx; + switch (ibevent->event) { case IB_EVENT_DEVICE_FATAL: case IB_EVENT_GID_CHANGE: case IB_EVENT_PORT_ERR: case IB_EVENT_QP_ACCESS_ERR: - /* tbd in follow-on patch: - * abnormal close of corresponding connections - */ + port_idx = ibevent->element.port_num - 1; + set_bit(port_idx, &smcibdev->port_event_mask); + schedule_work(&smcibdev->port_event_work); break; default: break; @@ -219,7 +229,8 @@ static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) void smc_ib_destroy_queue_pair(struct smc_link *lnk) { - ib_destroy_qp(lnk->roce_qp); + if (lnk->roce_qp) + ib_destroy_qp(lnk->roce_qp); lnk->roce_qp = NULL; } @@ -462,6 +473,7 @@ static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev) { if (!smcibdev->initialized) return; + smcibdev->initialized = 0; smc_wr_remove_dev(smcibdev); ib_unregister_event_handler(&smcibdev->event_handler); ib_destroy_cq(smcibdev->roce_cq_recv); diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index cbf58637ee14..9dc392ca06bf 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -65,7 +65,6 @@ static int smc_rx_wait_data(struct smc_sock *smc, long *timeo) rc = sk_wait_event(sk, timeo, sk->sk_err || sk->sk_shutdown & RCV_SHUTDOWN || - sock_flag(sk, SOCK_DONE) || atomic_read(&conn->bytes_to_rcv) || smc_cdc_rxed_any_close_or_senddone(conn), &wait); @@ -116,7 +115,7 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len, if (read_done) { if (sk->sk_err || sk->sk_state == SMC_CLOSED || - (sk->sk_shutdown & RCV_SHUTDOWN) || + sk->sk_shutdown & RCV_SHUTDOWN || !timeo || signal_pending(current) || smc_cdc_rxed_any_close_or_senddone(conn) || @@ -124,8 +123,6 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len, peer_conn_abort) break; } else { - if (sock_flag(sk, SOCK_DONE)) - break; if (sk->sk_err) { read_done = sock_error(sk); break; diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index c48dc2d5fd3a..838bce20c361 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -86,7 +86,7 @@ static int smc_tx_wait_memory(struct smc_sock *smc, int flags) rc = -EPIPE; break; } - if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) { + if (smc_cdc_rxed_any_close(conn)) { rc = -ECONNRESET; break; } @@ -104,14 +104,12 @@ static int smc_tx_wait_memory(struct smc_sock *smc, int flags) if (atomic_read(&conn->sndbuf_space)) break; /* at least 1 byte of free space available */ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - sk->sk_write_pending++; sk_wait_event(sk, &timeo, sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN) || - smc_cdc_rxed_any_close_or_senddone(conn) || + smc_cdc_rxed_any_close(conn) || atomic_read(&conn->sndbuf_space), &wait); - sk->sk_write_pending--; } remove_wait_queue(sk_sleep(sk), &wait); return rc; @@ -250,8 +248,10 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, peer_rmbe_offset; rdma_wr.rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey; rc = ib_post_send(link->roce_qp, &rdma_wr.wr, &failed_wr); - if (rc) + if (rc) { conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; + smc_lgr_terminate(lgr); + } return rc; } @@ -408,8 +408,9 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) goto out_unlock; } rc = 0; - schedule_delayed_work(&conn->tx_work, - SMC_TX_WORK_DELAY); + if (conn->alert_token_local) /* connection healthy */ + schedule_delayed_work(&conn->tx_work, + SMC_TX_WORK_DELAY); } goto out_unlock; } @@ -440,19 +441,24 @@ static void smc_tx_work(struct work_struct *work) int rc; lock_sock(&smc->sk); + if (smc->sk.sk_err || + !conn->alert_token_local || + conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) + goto out; + rc = smc_tx_sndbuf_nonempty(conn); if (!rc && conn->local_rx_ctrl.prod_flags.write_blocked && !atomic_read(&conn->bytes_to_rcv)) conn->local_rx_ctrl.prod_flags.write_blocked = 0; + +out: release_sock(&smc->sk); } void smc_tx_consumer_update(struct smc_connection *conn) { union smc_host_cursor cfed, cons; - struct smc_cdc_tx_pend *pend; - struct smc_wr_buf *wr_buf; - int to_confirm, rc; + int to_confirm; smc_curs_write(&cons, smc_curs_read(&conn->local_tx_ctrl.cons, conn), @@ -466,10 +472,8 @@ void smc_tx_consumer_update(struct smc_connection *conn) ((to_confirm > conn->rmbe_update_limit) && ((to_confirm > (conn->rmbe_size / 2)) || conn->local_rx_ctrl.prod_flags.write_blocked))) { - rc = smc_cdc_get_free_slot(conn, &wr_buf, &pend); - if (!rc) - rc = smc_cdc_msg_send(conn, wr_buf, pend); - if (rc < 0) { + if ((smc_cdc_get_slot_and_msg_send(conn) < 0) && + conn->alert_token_local) { /* connection healthy */ schedule_delayed_work(&conn->tx_work, SMC_TX_WORK_DELAY); return; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index de4537f66832..1b8af23e6e2b 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -122,6 +122,7 @@ static void smc_wr_tx_tasklet_fn(unsigned long data) again: polled++; do { + memset(&wc, 0, sizeof(wc)); rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc); if (polled == 1) { ib_req_notify_cq(dev->roce_cq_send, @@ -173,9 +174,9 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, struct smc_wr_tx_pend_priv **wr_pend_priv) { struct smc_wr_tx_pend *wr_pend; + u32 idx = link->wr_tx_cnt; struct ib_send_wr *wr_ib; u64 wr_id; - u32 idx; int rc; *wr_buf = NULL; @@ -185,21 +186,20 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, if (rc) return rc; } else { - rc = wait_event_interruptible_timeout( + struct smc_link_group *lgr; + + lgr = container_of(link, struct smc_link_group, + lnk[SMC_SINGLE_LINK]); + rc = wait_event_timeout( link->wr_tx_wait, + list_empty(&lgr->list) || /* lgr terminated */ (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY), SMC_WR_TX_WAIT_FREE_SLOT_TIME); if (!rc) { /* timeout - terminate connections */ - struct smc_link_group *lgr; - - lgr = container_of(link, struct smc_link_group, - lnk[SMC_SINGLE_LINK]); smc_lgr_terminate(lgr); return -EPIPE; } - if (rc == -ERESTARTSYS) - return -EINTR; if (idx == link->wr_tx_cnt) return -EPIPE; } @@ -249,8 +249,14 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) pend = container_of(priv, struct smc_wr_tx_pend, priv); rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], &failed_wr); - if (rc) + if (rc) { + struct smc_link_group *lgr = + container_of(link, struct smc_link_group, + lnk[SMC_SINGLE_LINK]); + smc_wr_tx_put_slot(link, priv); + smc_lgr_terminate(lgr); + } return rc; } @@ -300,18 +306,18 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) return rc; } -void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_rx_hdr_type, +void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_tx_hdr_type, smc_wr_tx_filter filter, smc_wr_tx_dismisser dismisser, unsigned long data) { struct smc_wr_tx_pend_priv *tx_pend; - struct smc_wr_rx_hdr *wr_rx; + struct smc_wr_rx_hdr *wr_tx; int i; for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) { - wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[i]; - if (wr_rx->type != wr_rx_hdr_type) + wr_tx = (struct smc_wr_rx_hdr *)&link->wr_tx_bufs[i]; + if (wr_tx->type != wr_tx_hdr_type) continue; tx_pend = &link->wr_tx_pends[i].priv; if (filter(tx_pend, data)) @@ -319,24 +325,6 @@ void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_rx_hdr_type, } } -bool smc_wr_tx_has_pending(struct smc_link *link, u8 wr_rx_hdr_type, - smc_wr_tx_filter filter, unsigned long data) -{ - struct smc_wr_tx_pend_priv *tx_pend; - struct smc_wr_rx_hdr *wr_rx; - int i; - - for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) { - wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[i]; - if (wr_rx->type != wr_rx_hdr_type) - continue; - tx_pend = &link->wr_tx_pends[i].priv; - if (filter(tx_pend, data)) - return true; - } - return false; -} - /****************************** receive queue ********************************/ int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler) diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 2acf12b06063..ef0c3494c9cb 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -93,8 +93,6 @@ int smc_wr_tx_put_slot(struct smc_link *link, int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *wr_pend_priv); void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context); -bool smc_wr_tx_has_pending(struct smc_link *link, u8 wr_rx_hdr_type, - smc_wr_tx_filter filter, unsigned long data); void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type, smc_wr_tx_filter filter, smc_wr_tx_dismisser dismisser, diff --git a/net/socket.c b/net/socket.c index 42d8e9c9ccd5..a93c99b518ca 100644 --- a/net/socket.c +++ b/net/socket.c @@ -118,7 +118,7 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from); static int sock_mmap(struct file *file, struct vm_area_struct *vma); static int sock_close(struct inode *inode, struct file *file); -static unsigned int sock_poll(struct file *file, +static __poll_t sock_poll(struct file *file, struct poll_table_struct *wait); static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #ifdef CONFIG_COMPAT @@ -163,12 +163,6 @@ static DEFINE_SPINLOCK(net_family_lock); static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly; /* - * Statistics counters of the socket lists - */ - -static DEFINE_PER_CPU(int, sockets_in_use); - -/* * Support routines. * Move socket addresses back and forth across the kernel/user * divide and look after the messy bits. @@ -406,8 +400,10 @@ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname) name.len = strlen(name.name); } path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name); - if (unlikely(!path.dentry)) + if (unlikely(!path.dentry)) { + sock_release(sock); return ERR_PTR(-ENOMEM); + } path.mnt = mntget(sock_mnt); d_instantiate(path.dentry, SOCK_INODE(sock)); @@ -415,9 +411,11 @@ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname) file = alloc_file(&path, FMODE_READ | FMODE_WRITE, &socket_file_ops); if (IS_ERR(file)) { - /* drop dentry, keep inode */ + /* drop dentry, keep inode for a bit */ ihold(d_inode(path.dentry)); path_put(&path); + /* ... and now kill it properly */ + sock_release(sock); return file; } @@ -432,8 +430,10 @@ static int sock_map_fd(struct socket *sock, int flags) { struct file *newfile; int fd = get_unused_fd_flags(flags); - if (unlikely(fd < 0)) + if (unlikely(fd < 0)) { + sock_release(sock); return fd; + } newfile = sock_alloc_file(sock, flags, NULL); if (likely(!IS_ERR(newfile))) { @@ -574,7 +574,6 @@ struct socket *sock_alloc(void) inode->i_gid = current_fsgid(); inode->i_op = &sockfs_inode_ops; - this_cpu_add(sockets_in_use, 1); return sock; } EXPORT_SYMBOL(sock_alloc); @@ -601,7 +600,6 @@ void sock_release(struct socket *sock) if (rcu_dereference_protected(sock->wq, 1)->fasync_list) pr_err("%s: fasync list not empty!\n", __func__); - this_cpu_sub(sockets_in_use, 1); if (!sock->file) { iput(SOCK_INODE(sock)); return; @@ -963,9 +961,28 @@ static long sock_do_ioctl(struct net *net, struct socket *sock, * If this ioctl is unknown try to hand it down * to the NIC driver. */ - if (err == -ENOIOCTLCMD) - err = dev_ioctl(net, cmd, argp); + if (err != -ENOIOCTLCMD) + return err; + if (cmd == SIOCGIFCONF) { + struct ifconf ifc; + if (copy_from_user(&ifc, argp, sizeof(struct ifconf))) + return -EFAULT; + rtnl_lock(); + err = dev_ifconf(net, &ifc, sizeof(struct ifreq)); + rtnl_unlock(); + if (!err && copy_to_user(argp, &ifc, sizeof(struct ifconf))) + err = -EFAULT; + } else { + struct ifreq ifr; + bool need_copyout; + if (copy_from_user(&ifr, argp, sizeof(struct ifreq))) + return -EFAULT; + err = dev_ioctl(net, cmd, &ifr, &need_copyout); + if (!err && need_copyout) + if (copy_to_user(argp, &ifr, sizeof(struct ifreq))) + return -EFAULT; + } return err; } @@ -990,12 +1007,19 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) sock = file->private_data; sk = sock->sk; net = sock_net(sk); - if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) { - err = dev_ioctl(net, cmd, argp); + if (unlikely(cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))) { + struct ifreq ifr; + bool need_copyout; + if (copy_from_user(&ifr, argp, sizeof(struct ifreq))) + return -EFAULT; + err = dev_ioctl(net, cmd, &ifr, &need_copyout); + if (!err && need_copyout) + if (copy_to_user(argp, &ifr, sizeof(struct ifreq))) + return -EFAULT; } else #ifdef CONFIG_WEXT_CORE if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { - err = dev_ioctl(net, cmd, argp); + err = wext_handle_ioctl(net, cmd, argp); } else #endif switch (cmd) { @@ -1091,9 +1115,9 @@ out_release: EXPORT_SYMBOL(sock_create_lite); /* No kernel lock held - perfect */ -static unsigned int sock_poll(struct file *file, poll_table *wait) +static __poll_t sock_poll(struct file *file, poll_table *wait) { - unsigned int busy_flag = 0; + __poll_t busy_flag = 0; struct socket *sock; /* @@ -1330,19 +1354,9 @@ SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol) retval = sock_create(family, type, protocol, &sock); if (retval < 0) - goto out; - - retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); - if (retval < 0) - goto out_release; - -out: - /* It may be already another descriptor 8) Not kernel problem. */ - return retval; + return retval; -out_release: - sock_release(sock); - return retval; + return sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); } /* @@ -1366,87 +1380,72 @@ SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol, flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; /* + * reserve descriptors and make sure we won't fail + * to return them to userland. + */ + fd1 = get_unused_fd_flags(flags); + if (unlikely(fd1 < 0)) + return fd1; + + fd2 = get_unused_fd_flags(flags); + if (unlikely(fd2 < 0)) { + put_unused_fd(fd1); + return fd2; + } + + err = put_user(fd1, &usockvec[0]); + if (err) + goto out; + + err = put_user(fd2, &usockvec[1]); + if (err) + goto out; + + /* * Obtain the first socket and check if the underlying protocol * supports the socketpair call. */ err = sock_create(family, type, protocol, &sock1); - if (err < 0) + if (unlikely(err < 0)) goto out; err = sock_create(family, type, protocol, &sock2); - if (err < 0) - goto out_release_1; - - err = sock1->ops->socketpair(sock1, sock2); - if (err < 0) - goto out_release_both; - - fd1 = get_unused_fd_flags(flags); - if (unlikely(fd1 < 0)) { - err = fd1; - goto out_release_both; + if (unlikely(err < 0)) { + sock_release(sock1); + goto out; } - fd2 = get_unused_fd_flags(flags); - if (unlikely(fd2 < 0)) { - err = fd2; - goto out_put_unused_1; + err = sock1->ops->socketpair(sock1, sock2); + if (unlikely(err < 0)) { + sock_release(sock2); + sock_release(sock1); + goto out; } newfile1 = sock_alloc_file(sock1, flags, NULL); if (IS_ERR(newfile1)) { err = PTR_ERR(newfile1); - goto out_put_unused_both; + sock_release(sock2); + goto out; } newfile2 = sock_alloc_file(sock2, flags, NULL); if (IS_ERR(newfile2)) { err = PTR_ERR(newfile2); - goto out_fput_1; + fput(newfile1); + goto out; } - err = put_user(fd1, &usockvec[0]); - if (err) - goto out_fput_both; - - err = put_user(fd2, &usockvec[1]); - if (err) - goto out_fput_both; - audit_fd_pair(fd1, fd2); fd_install(fd1, newfile1); fd_install(fd2, newfile2); - /* fd1 and fd2 may be already another descriptors. - * Not kernel problem. - */ - return 0; -out_fput_both: - fput(newfile2); - fput(newfile1); - put_unused_fd(fd2); - put_unused_fd(fd1); - goto out; - -out_fput_1: - fput(newfile1); - put_unused_fd(fd2); - put_unused_fd(fd1); - sock_release(sock2); - goto out; - -out_put_unused_both: +out: put_unused_fd(fd2); -out_put_unused_1: put_unused_fd(fd1); -out_release_both: - sock_release(sock2); -out_release_1: - sock_release(sock1); -out: return err; } @@ -1562,7 +1561,6 @@ SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr, if (IS_ERR(newfile)) { err = PTR_ERR(newfile); put_unused_fd(newfd); - sock_release(newsock); goto out_put; } @@ -2644,17 +2642,8 @@ core_initcall(sock_init); /* early initcall */ #ifdef CONFIG_PROC_FS void socket_seq_show(struct seq_file *seq) { - int cpu; - int counter = 0; - - for_each_possible_cpu(cpu) - counter += per_cpu(sockets_in_use, cpu); - - /* It can be negative, by the way. 8) */ - if (counter < 0) - counter = 0; - - seq_printf(seq, "sockets: used %d\n", counter); + seq_printf(seq, "sockets: used %d\n", + sock_inuse_get(seq->private)); } #endif /* CONFIG_PROC_FS */ @@ -2691,89 +2680,25 @@ static int do_siocgstampns(struct net *net, struct socket *sock, return err; } -static int dev_ifname32(struct net *net, struct compat_ifreq __user *uifr32) -{ - struct ifreq __user *uifr; - int err; - - uifr = compat_alloc_user_space(sizeof(struct ifreq)); - if (copy_in_user(uifr, uifr32, sizeof(struct compat_ifreq))) - return -EFAULT; - - err = dev_ioctl(net, SIOCGIFNAME, uifr); - if (err) - return err; - - if (copy_in_user(uifr32, uifr, sizeof(struct compat_ifreq))) - return -EFAULT; - - return 0; -} - -static int dev_ifconf(struct net *net, struct compat_ifconf __user *uifc32) +static int compat_dev_ifconf(struct net *net, struct compat_ifconf __user *uifc32) { struct compat_ifconf ifc32; struct ifconf ifc; - struct ifconf __user *uifc; - struct compat_ifreq __user *ifr32; - struct ifreq __user *ifr; - unsigned int i, j; int err; if (copy_from_user(&ifc32, uifc32, sizeof(struct compat_ifconf))) return -EFAULT; - memset(&ifc, 0, sizeof(ifc)); - if (ifc32.ifcbuf == 0) { - ifc32.ifc_len = 0; - ifc.ifc_len = 0; - ifc.ifc_req = NULL; - uifc = compat_alloc_user_space(sizeof(struct ifconf)); - } else { - size_t len = ((ifc32.ifc_len / sizeof(struct compat_ifreq)) + 1) * - sizeof(struct ifreq); - uifc = compat_alloc_user_space(sizeof(struct ifconf) + len); - ifc.ifc_len = len; - ifr = ifc.ifc_req = (void __user *)(uifc + 1); - ifr32 = compat_ptr(ifc32.ifcbuf); - for (i = 0; i < ifc32.ifc_len; i += sizeof(struct compat_ifreq)) { - if (copy_in_user(ifr, ifr32, sizeof(struct compat_ifreq))) - return -EFAULT; - ifr++; - ifr32++; - } - } - if (copy_to_user(uifc, &ifc, sizeof(struct ifconf))) - return -EFAULT; + ifc.ifc_len = ifc32.ifc_len; + ifc.ifc_req = compat_ptr(ifc32.ifcbuf); - err = dev_ioctl(net, SIOCGIFCONF, uifc); + rtnl_lock(); + err = dev_ifconf(net, &ifc, sizeof(struct compat_ifreq)); + rtnl_unlock(); if (err) return err; - if (copy_from_user(&ifc, uifc, sizeof(struct ifconf))) - return -EFAULT; - - ifr = ifc.ifc_req; - ifr32 = compat_ptr(ifc32.ifcbuf); - for (i = 0, j = 0; - i + sizeof(struct compat_ifreq) <= ifc32.ifc_len && j < ifc.ifc_len; - i += sizeof(struct compat_ifreq), j += sizeof(struct ifreq)) { - if (copy_in_user(ifr32, ifr, sizeof(struct compat_ifreq))) - return -EFAULT; - ifr32++; - ifr++; - } - - if (ifc32.ifcbuf == 0) { - /* Translate from 64-bit structure multiple to - * a 32-bit one. - */ - i = ifc.ifc_len; - i = ((i / sizeof(struct ifreq)) * sizeof(struct compat_ifreq)); - ifc32.ifc_len = i; - } else { - ifc32.ifc_len = i; - } + ifc32.ifc_len = ifc.ifc_len; if (copy_to_user(uifc32, &ifc32, sizeof(struct compat_ifconf))) return -EFAULT; @@ -2784,9 +2709,9 @@ static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32) { struct compat_ethtool_rxnfc __user *compat_rxnfc; bool convert_in = false, convert_out = false; - size_t buf_size = ALIGN(sizeof(struct ifreq), 8); - struct ethtool_rxnfc __user *rxnfc; - struct ifreq __user *ifr; + size_t buf_size = 0; + struct ethtool_rxnfc __user *rxnfc = NULL; + struct ifreq ifr; u32 rule_cnt = 0, actual_rule_cnt; u32 ethcmd; u32 data; @@ -2823,18 +2748,14 @@ static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32) case ETHTOOL_SRXCLSRLDEL: buf_size += sizeof(struct ethtool_rxnfc); convert_in = true; + rxnfc = compat_alloc_user_space(buf_size); break; } - ifr = compat_alloc_user_space(buf_size); - rxnfc = (void __user *)ifr + ALIGN(sizeof(struct ifreq), 8); - - if (copy_in_user(&ifr->ifr_name, &ifr32->ifr_name, IFNAMSIZ)) + if (copy_from_user(&ifr.ifr_name, &ifr32->ifr_name, IFNAMSIZ)) return -EFAULT; - if (put_user(convert_in ? rxnfc : compat_ptr(data), - &ifr->ifr_ifru.ifru_data)) - return -EFAULT; + ifr.ifr_data = convert_in ? rxnfc : (void __user *)compat_rxnfc; if (convert_in) { /* We expect there to be holes between fs.m_ext and @@ -2862,7 +2783,7 @@ static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32) return -EFAULT; } - ret = dev_ioctl(net, SIOCETHTOOL, ifr); + ret = dev_ioctl(net, SIOCETHTOOL, &ifr, NULL); if (ret) return ret; @@ -2903,113 +2824,43 @@ static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32) static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32) { - void __user *uptr; compat_uptr_t uptr32; - struct ifreq __user *uifr; + struct ifreq ifr; + void __user *saved; + int err; - uifr = compat_alloc_user_space(sizeof(*uifr)); - if (copy_in_user(uifr, uifr32, sizeof(struct compat_ifreq))) + if (copy_from_user(&ifr, uifr32, sizeof(struct compat_ifreq))) return -EFAULT; if (get_user(uptr32, &uifr32->ifr_settings.ifs_ifsu)) return -EFAULT; - uptr = compat_ptr(uptr32); - - if (put_user(uptr, &uifr->ifr_settings.ifs_ifsu.raw_hdlc)) - return -EFAULT; - - return dev_ioctl(net, SIOCWANDEV, uifr); -} + saved = ifr.ifr_settings.ifs_ifsu.raw_hdlc; + ifr.ifr_settings.ifs_ifsu.raw_hdlc = compat_ptr(uptr32); -static int bond_ioctl(struct net *net, unsigned int cmd, - struct compat_ifreq __user *ifr32) -{ - struct ifreq kifr; - mm_segment_t old_fs; - int err; - - switch (cmd) { - case SIOCBONDENSLAVE: - case SIOCBONDRELEASE: - case SIOCBONDSETHWADDR: - case SIOCBONDCHANGEACTIVE: - if (copy_from_user(&kifr, ifr32, sizeof(struct compat_ifreq))) - return -EFAULT; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = dev_ioctl(net, cmd, - (struct ifreq __user __force *) &kifr); - set_fs(old_fs); - - return err; - default: - return -ENOIOCTLCMD; + err = dev_ioctl(net, SIOCWANDEV, &ifr, NULL); + if (!err) { + ifr.ifr_settings.ifs_ifsu.raw_hdlc = saved; + if (copy_to_user(uifr32, &ifr, sizeof(struct compat_ifreq))) + err = -EFAULT; } + return err; } /* Handle ioctls that use ifreq::ifr_data and just need struct ifreq converted */ static int compat_ifr_data_ioctl(struct net *net, unsigned int cmd, struct compat_ifreq __user *u_ifreq32) { - struct ifreq __user *u_ifreq64; - char tmp_buf[IFNAMSIZ]; - void __user *data64; + struct ifreq ifreq; u32 data32; - if (copy_from_user(&tmp_buf[0], &(u_ifreq32->ifr_ifrn.ifrn_name[0]), - IFNAMSIZ)) + if (copy_from_user(ifreq.ifr_name, u_ifreq32->ifr_name, IFNAMSIZ)) return -EFAULT; - if (get_user(data32, &u_ifreq32->ifr_ifru.ifru_data)) + if (get_user(data32, &u_ifreq32->ifr_data)) return -EFAULT; - data64 = compat_ptr(data32); + ifreq.ifr_data = compat_ptr(data32); - u_ifreq64 = compat_alloc_user_space(sizeof(*u_ifreq64)); - - if (copy_to_user(&u_ifreq64->ifr_ifrn.ifrn_name[0], &tmp_buf[0], - IFNAMSIZ)) - return -EFAULT; - if (put_user(data64, &u_ifreq64->ifr_ifru.ifru_data)) - return -EFAULT; - - return dev_ioctl(net, cmd, u_ifreq64); -} - -static int dev_ifsioc(struct net *net, struct socket *sock, - unsigned int cmd, struct compat_ifreq __user *uifr32) -{ - struct ifreq __user *uifr; - int err; - - uifr = compat_alloc_user_space(sizeof(*uifr)); - if (copy_in_user(uifr, uifr32, sizeof(*uifr32))) - return -EFAULT; - - err = sock_do_ioctl(net, sock, cmd, (unsigned long)uifr); - - if (!err) { - switch (cmd) { - case SIOCGIFFLAGS: - case SIOCGIFMETRIC: - case SIOCGIFMTU: - case SIOCGIFMEM: - case SIOCGIFHWADDR: - case SIOCGIFINDEX: - case SIOCGIFADDR: - case SIOCGIFBRDADDR: - case SIOCGIFDSTADDR: - case SIOCGIFNETMASK: - case SIOCGIFPFLAGS: - case SIOCGIFTXQLEN: - case SIOCGMIIPHY: - case SIOCGMIIREG: - if (copy_in_user(uifr32, uifr, sizeof(*uifr32))) - err = -EFAULT; - break; - } - } - return err; + return dev_ioctl(net, cmd, &ifreq, NULL); } static int compat_sioc_ifmap(struct net *net, unsigned int cmd, @@ -3017,7 +2868,6 @@ static int compat_sioc_ifmap(struct net *net, unsigned int cmd, { struct ifreq ifr; struct compat_ifmap __user *uifmap32; - mm_segment_t old_fs; int err; uifmap32 = &uifr32->ifr_ifru.ifru_map; @@ -3031,10 +2881,7 @@ static int compat_sioc_ifmap(struct net *net, unsigned int cmd, if (err) return -EFAULT; - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = dev_ioctl(net, cmd, (void __user __force *)&ifr); - set_fs(old_fs); + err = dev_ioctl(net, cmd, &ifr, NULL); if (cmd == SIOCGIFMAP && !err) { err = copy_to_user(uifr32, &ifr, sizeof(ifr.ifr_name)); @@ -3167,10 +3014,8 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCSIFBR: case SIOCGIFBR: return old_bridge_ioctl(argp); - case SIOCGIFNAME: - return dev_ifname32(net, argp); case SIOCGIFCONF: - return dev_ifconf(net, argp); + return compat_dev_ifconf(net, argp); case SIOCETHTOOL: return ethtool_ioctl(net, argp); case SIOCWANDEV: @@ -3178,11 +3023,6 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCGIFMAP: case SIOCSIFMAP: return compat_sioc_ifmap(net, cmd, argp); - case SIOCBONDENSLAVE: - case SIOCBONDRELEASE: - case SIOCBONDSETHWADDR: - case SIOCBONDCHANGEACTIVE: - return bond_ioctl(net, cmd, argp); case SIOCADDRT: case SIOCDELRT: return routing_ioctl(net, sock, cmd, argp); @@ -3242,12 +3082,15 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCGMIIPHY: case SIOCGMIIREG: case SIOCSMIIREG: - return dev_ifsioc(net, sock, cmd, argp); - case SIOCSARP: case SIOCGARP: case SIOCDARP: case SIOCATMARK: + case SIOCBONDENSLAVE: + case SIOCBONDRELEASE: + case SIOCBONDSETHWADDR: + case SIOCBONDCHANGEACTIVE: + case SIOCGIFNAME: return sock_do_ioctl(net, sock, cmd, arg); } @@ -3402,19 +3245,6 @@ int kernel_sendpage_locked(struct sock *sk, struct page *page, int offset, } EXPORT_SYMBOL(kernel_sendpage_locked); -int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg) -{ - mm_segment_t oldfs = get_fs(); - int err; - - set_fs(KERNEL_DS); - err = sock->ops->ioctl(sock, cmd, arg); - set_fs(oldfs); - - return err; -} -EXPORT_SYMBOL(kernel_sock_ioctl); - int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how) { return sock->ops->shutdown(sock, how); diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index c5fda15ba319..1fdab5c4eda8 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -401,7 +401,7 @@ void strp_data_ready(struct strparser *strp) * allows a thread in BH context to safely check if the process * lock is held. In this case, if the lock is held, queue work. */ - if (sock_owned_by_user(strp->sk)) { + if (sock_owned_by_user_nocheck(strp->sk)) { queue_work(strp_wq, &strp->work); return; } diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index c4778cae58ef..444380f968f1 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -231,6 +231,7 @@ static int gssx_dec_linux_creds(struct xdr_stream *xdr, goto out_free_groups; creds->cr_group_info->gid[i] = kgid; } + groups_sort(creds->cr_group_info); return 0; out_free_groups: diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 73165e9ca5bf..26531193fce4 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -264,7 +264,7 @@ out: return status; } -static struct cache_detail rsi_cache_template = { +static const struct cache_detail rsi_cache_template = { .owner = THIS_MODULE, .hash_size = RSI_HASHMAX, .name = "auth.rpcsec.init", @@ -481,6 +481,7 @@ static int rsc_parse(struct cache_detail *cd, goto out; rsci.cred.cr_group_info->gid[i] = kgid; } + groups_sort(rsci.cred.cr_group_info); /* mech name */ len = qword_get(&mesg, buf, mlen); @@ -524,7 +525,7 @@ out: return status; } -static struct cache_detail rsc_cache_template = { +static const struct cache_detail rsc_cache_template = { .owner = THIS_MODULE, .hash_size = RSC_HASHMAX, .name = "auth.rpcsec.context", diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 79d55d949d9a..aa36dad32db1 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -930,10 +930,10 @@ out: static DECLARE_WAIT_QUEUE_HEAD(queue_wait); -static unsigned int cache_poll(struct file *filp, poll_table *wait, +static __poll_t cache_poll(struct file *filp, poll_table *wait, struct cache_detail *cd) { - unsigned int mask; + __poll_t mask; struct cache_reader *rp = filp->private_data; struct cache_queue *cq; @@ -1501,7 +1501,7 @@ static ssize_t cache_write_procfs(struct file *filp, const char __user *buf, return cache_write(filp, buf, count, ppos, cd); } -static unsigned int cache_poll_procfs(struct file *filp, poll_table *wait) +static __poll_t cache_poll_procfs(struct file *filp, poll_table *wait) { struct cache_detail *cd = PDE_DATA(file_inode(filp)); @@ -1674,7 +1674,7 @@ void cache_unregister_net(struct cache_detail *cd, struct net *net) } EXPORT_SYMBOL_GPL(cache_unregister_net); -struct cache_detail *cache_create_net(struct cache_detail *tmpl, struct net *net) +struct cache_detail *cache_create_net(const struct cache_detail *tmpl, struct net *net) { struct cache_detail *cd; int i; @@ -1720,7 +1720,7 @@ static ssize_t cache_write_pipefs(struct file *filp, const char __user *buf, return cache_write(filp, buf, count, ppos, cd); } -static unsigned int cache_poll_pipefs(struct file *filp, poll_table *wait) +static __poll_t cache_poll_pipefs(struct file *filp, poll_table *wait) { struct cache_detail *cd = RPC_I(file_inode(filp))->private; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index a801da812f86..6e432ecd7f99 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1376,22 +1376,6 @@ rpc_setbufsize(struct rpc_clnt *clnt, unsigned int sndsize, unsigned int rcvsize EXPORT_SYMBOL_GPL(rpc_setbufsize); /** - * rpc_protocol - Get transport protocol number for an RPC client - * @clnt: RPC client to query - * - */ -int rpc_protocol(struct rpc_clnt *clnt) -{ - int protocol; - - rcu_read_lock(); - protocol = rcu_dereference(clnt->cl_xprt)->prot; - rcu_read_unlock(); - return protocol; -} -EXPORT_SYMBOL_GPL(rpc_protocol); - -/** * rpc_net_ns - Get the network namespace for this RPC client * @clnt: RPC client to query * @@ -1841,6 +1825,7 @@ call_bind_status(struct rpc_task *task) case -ECONNABORTED: case -ENOTCONN: case -EHOSTDOWN: + case -ENETDOWN: case -EHOSTUNREACH: case -ENETUNREACH: case -ENOBUFS: @@ -1917,6 +1902,7 @@ call_connect_status(struct rpc_task *task) /* fall through */ case -ECONNRESET: case -ECONNABORTED: + case -ENETDOWN: case -ENETUNREACH: case -EHOSTUNREACH: case -EADDRINUSE: @@ -2022,6 +2008,7 @@ call_transmit_status(struct rpc_task *task) */ case -ECONNREFUSED: case -EHOSTDOWN: + case -ENETDOWN: case -EHOSTUNREACH: case -ENETUNREACH: case -EPERM: @@ -2071,6 +2058,7 @@ call_bc_transmit(struct rpc_task *task) switch (task->tk_status) { case 0: /* Success */ + case -ENETDOWN: case -EHOSTDOWN: case -EHOSTUNREACH: case -ENETUNREACH: @@ -2139,6 +2127,7 @@ call_status(struct rpc_task *task) task->tk_status = 0; switch(status) { case -EHOSTDOWN: + case -ENETDOWN: case -EHOSTUNREACH: case -ENETUNREACH: case -EPERM: diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 7803f3b6aa53..5c4330325787 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -340,12 +340,12 @@ rpc_pipe_write(struct file *filp, const char __user *buf, size_t len, loff_t *of return res; } -static unsigned int +static __poll_t rpc_pipe_poll(struct file *filp, struct poll_table_struct *wait) { struct inode *inode = file_inode(filp); struct rpc_inode *rpci = RPC_I(inode); - unsigned int mask = POLLOUT | POLLWRNORM; + __poll_t mask = POLLOUT | POLLWRNORM; poll_wait(filp, &rpci->waitq, wait); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index b1b49edd7c4d..896691afbb1a 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -755,22 +755,20 @@ static void __rpc_execute(struct rpc_task *task) void (*do_action)(struct rpc_task *); /* - * Execute any pending callback first. + * Perform the next FSM step or a pending callback. + * + * tk_action may be NULL if the task has been killed. + * In particular, note that rpc_killall_tasks may + * do this at any time, so beware when dereferencing. */ - do_action = task->tk_callback; - task->tk_callback = NULL; - if (do_action == NULL) { - /* - * Perform the next FSM step. - * tk_action may be NULL if the task has been killed. - * In particular, note that rpc_killall_tasks may - * do this at any time, so beware when dereferencing. - */ - do_action = task->tk_action; - if (do_action == NULL) - break; + do_action = task->tk_action; + if (task->tk_callback) { + do_action = task->tk_callback; + task->tk_callback = NULL; } - trace_rpc_task_run_action(task->tk_client, task, task->tk_action); + if (!do_action) + break; + trace_rpc_task_run_action(task->tk_client, task, do_action); do_action(task); /* diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index f81eaa8e0888..af7f28fb8102 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -520,6 +520,7 @@ static int unix_gid_parse(struct cache_detail *cd, ug.gi->gid[i] = kgid; } + groups_sort(ug.gi); ugp = unix_gid_lookup(cd, uid); if (ugp) { struct cache_head *ch; @@ -569,7 +570,7 @@ static int unix_gid_show(struct seq_file *m, return 0; } -static struct cache_detail unix_gid_cache_template = { +static const struct cache_detail unix_gid_cache_template = { .owner = THIS_MODULE, .hash_size = GID_HASHMAX, .name = "auth.unix.gid", @@ -819,6 +820,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) kgid_t kgid = make_kgid(&init_user_ns, svc_getnl(argv)); cred->cr_group_info->gid[i] = kgid; } + groups_sort(cred->cr_group_info); if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) { *authp = rpc_autherr_badverf; return SVC_DENIED; @@ -862,7 +864,7 @@ struct auth_ops svcauth_unix = { .set_client = svcauth_unix_set_client, }; -static struct cache_detail ip_map_cache_template = { +static const struct cache_detail ip_map_cache_template = { .owner = THIS_MODULE, .hash_size = IP_HASHMAX, .name = "auth.unix.ip", diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index ff8e06cd067e..5570719e4787 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -338,8 +338,8 @@ static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, rqstp->rq_xprt_hlen = 0; clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - len = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen, - msg.msg_flags); + iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, iov, nr, buflen); + len = sock_recvmsg(svsk->sk_sock, &msg, msg.msg_flags); /* If we read a full record, then assume there may be more * data to read (stream based sockets only!) */ diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 333b9d697ae5..2436fd1125fc 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -940,8 +940,8 @@ static void xprt_timer(struct rpc_task *task) if (task->tk_status != -ETIMEDOUT) return; - dprintk("RPC: %5u xprt_timer\n", task->tk_pid); + trace_xprt_timer(xprt, req->rq_xid, task->tk_status); if (!req->rq_reply_bytes_recvd) { if (xprt->ops->timer) xprt->ops->timer(xprt, task); @@ -1001,6 +1001,7 @@ void xprt_transmit(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; + unsigned int connect_cookie; int status, numreqs; dprintk("RPC: %5u xprt_transmit(%u)\n", task->tk_pid, req->rq_slen); @@ -1024,6 +1025,7 @@ void xprt_transmit(struct rpc_task *task) } else if (!req->rq_bytes_sent) return; + connect_cookie = xprt->connect_cookie; req->rq_xtime = ktime_get(); status = xprt->ops->send_request(task); trace_xprt_transmit(xprt, req->rq_xid, status); @@ -1047,20 +1049,28 @@ void xprt_transmit(struct rpc_task *task) xprt->stat.bklog_u += xprt->backlog.qlen; xprt->stat.sending_u += xprt->sending.qlen; xprt->stat.pending_u += xprt->pending.qlen; + spin_unlock_bh(&xprt->transport_lock); - /* Don't race with disconnect */ - if (!xprt_connected(xprt)) - task->tk_status = -ENOTCONN; - else { + req->rq_connect_cookie = connect_cookie; + if (rpc_reply_expected(task) && !READ_ONCE(req->rq_reply_bytes_recvd)) { /* - * Sleep on the pending queue since - * we're expecting a reply. + * Sleep on the pending queue if we're expecting a reply. + * The spinlock ensures atomicity between the test of + * req->rq_reply_bytes_recvd, and the call to rpc_sleep_on(). */ - if (!req->rq_reply_bytes_recvd && rpc_reply_expected(task)) + spin_lock(&xprt->recv_lock); + if (!req->rq_reply_bytes_recvd) { rpc_sleep_on(&xprt->pending, task, xprt_timer); - req->rq_connect_cookie = xprt->connect_cookie; + /* + * Send an extra queue wakeup call if the + * connection was dropped in case the call to + * rpc_sleep_on() raced. + */ + if (!xprt_connected(xprt)) + xprt_wake_pending_tasks(xprt, -ENOTCONN); + } + spin_unlock(&xprt->recv_lock); } - spin_unlock_bh(&xprt->transport_lock); } static void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task) diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 8b818bb3518a..ed1a4a3065ee 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -43,7 +43,6 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, req = rpcrdma_create_req(r_xprt); if (IS_ERR(req)) return PTR_ERR(req); - __set_bit(RPCRDMA_REQ_F_BACKCHANNEL, &req->rl_flags); rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE, DMA_TO_DEVICE, GFP_KERNEL); @@ -74,21 +73,13 @@ out_fail: static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt, unsigned int count) { - struct rpcrdma_rep *rep; int rc = 0; while (count--) { - rep = rpcrdma_create_rep(r_xprt); - if (IS_ERR(rep)) { - pr_err("RPC: %s: reply buffer alloc failed\n", - __func__); - rc = PTR_ERR(rep); + rc = rpcrdma_create_rep(r_xprt); + if (rc) break; - } - - rpcrdma_recv_buffer_put(rep); } - return rc; } @@ -129,6 +120,7 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs) rqst->rq_xprt = &r_xprt->rx_xprt; INIT_LIST_HEAD(&rqst->rq_list); INIT_LIST_HEAD(&rqst->rq_bc_list); + __set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); if (rpcrdma_bc_setup_rqst(r_xprt, rqst)) goto out_free; @@ -148,7 +140,7 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs) buffer->rb_bc_srv_max_requests = reqs; request_module("svcrdma"); - + trace_xprtrdma_cb_setup(r_xprt, reqs); return 0; out_free: @@ -196,13 +188,7 @@ size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt) return maxmsg - RPCRDMA_HDRLEN_MIN; } -/** - * rpcrdma_bc_marshal_reply - Send backwards direction reply - * @rqst: buffer containing RPC reply data - * - * Returns zero on success. - */ -int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) +static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); @@ -226,7 +212,46 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) if (rpcrdma_prepare_send_sges(r_xprt, req, RPCRDMA_HDRLEN_MIN, &rqst->rq_snd_buf, rpcrdma_noch)) return -EIO; + + trace_xprtrdma_cb_reply(rqst); + return 0; +} + +/** + * xprt_rdma_bc_send_reply - marshal and send a backchannel reply + * @rqst: RPC rqst with a backchannel RPC reply in rq_snd_buf + * + * Caller holds the transport's write lock. + * + * Returns: + * %0 if the RPC message has been sent + * %-ENOTCONN if the caller should reconnect and call again + * %-EIO if a permanent error occurred and the request was not + * sent. Do not try to send this message again. + */ +int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + int rc; + + if (!xprt_connected(rqst->rq_xprt)) + goto drop_connection; + + rc = rpcrdma_bc_marshal_reply(rqst); + if (rc < 0) + goto failed_marshal; + + if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) + goto drop_connection; return 0; + +failed_marshal: + if (rc != -ENOTCONN) + return rc; +drop_connection: + xprt_disconnect_done(rqst->rq_xprt); + return -ENOTCONN; } /** @@ -262,11 +287,6 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) dprintk("RPC: %s: freeing rqst %p (req %p)\n", __func__, rqst, rpcr_to_rdmar(rqst)); - smp_mb__before_atomic(); - WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state)); - clear_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); - smp_mb__after_atomic(); - spin_lock_bh(&xprt->bc_pa_lock); list_add_tail(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); spin_unlock_bh(&xprt->bc_pa_lock); @@ -274,7 +294,7 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) /** * rpcrdma_bc_receive_call - Handle a backward direction call - * @xprt: transport receiving the call + * @r_xprt: transport receiving the call * @rep: receive buffer containing the call * * Operational assumptions: @@ -313,7 +333,6 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, struct rpc_rqst, rq_bc_pa_list); list_del(&rqst->rq_bc_pa_list); spin_unlock(&xprt->bc_pa_lock); - dprintk("RPC: %s: using rqst %p\n", __func__, rqst); /* Prepare rqst */ rqst->rq_reply_bytes_recvd = 0; @@ -321,7 +340,6 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, rqst->rq_xid = *p; rqst->rq_private_buf.len = size; - set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); buf = &rqst->rq_rcv_buf; memset(buf, 0, sizeof(*buf)); @@ -335,12 +353,8 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, * the Upper Layer is done decoding it. */ req = rpcr_to_rdmar(rqst); - dprintk("RPC: %s: attaching rep %p to req %p\n", - __func__, rep, req); req->rl_reply = rep; - - /* Defeat the retransmit detection logic in send_request */ - req->rl_connect_cookie = 0; + trace_xprtrdma_cb_call(rqst); /* Queue rqst for ULP's callback service */ bc_serv = xprt->bc_serv; diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 29fc84c7ff98..d5f95bb39300 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2015, 2017 Oracle. All rights reserved. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. */ @@ -47,7 +47,7 @@ fmr_is_supported(struct rpcrdma_ia *ia) } static int -fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *mw) +fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) { static struct ib_fmr_attr fmr_attr = { .max_pages = RPCRDMA_MAX_FMR_SGES, @@ -55,106 +55,108 @@ fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *mw) .page_shift = PAGE_SHIFT }; - mw->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES, + mr->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES, sizeof(u64), GFP_KERNEL); - if (!mw->fmr.fm_physaddrs) + if (!mr->fmr.fm_physaddrs) goto out_free; - mw->mw_sg = kcalloc(RPCRDMA_MAX_FMR_SGES, - sizeof(*mw->mw_sg), GFP_KERNEL); - if (!mw->mw_sg) + mr->mr_sg = kcalloc(RPCRDMA_MAX_FMR_SGES, + sizeof(*mr->mr_sg), GFP_KERNEL); + if (!mr->mr_sg) goto out_free; - sg_init_table(mw->mw_sg, RPCRDMA_MAX_FMR_SGES); + sg_init_table(mr->mr_sg, RPCRDMA_MAX_FMR_SGES); - mw->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS, + mr->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS, &fmr_attr); - if (IS_ERR(mw->fmr.fm_mr)) + if (IS_ERR(mr->fmr.fm_mr)) goto out_fmr_err; return 0; out_fmr_err: dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__, - PTR_ERR(mw->fmr.fm_mr)); + PTR_ERR(mr->fmr.fm_mr)); out_free: - kfree(mw->mw_sg); - kfree(mw->fmr.fm_physaddrs); + kfree(mr->mr_sg); + kfree(mr->fmr.fm_physaddrs); return -ENOMEM; } static int -__fmr_unmap(struct rpcrdma_mw *mw) +__fmr_unmap(struct rpcrdma_mr *mr) { LIST_HEAD(l); int rc; - list_add(&mw->fmr.fm_mr->list, &l); + list_add(&mr->fmr.fm_mr->list, &l); rc = ib_unmap_fmr(&l); - list_del(&mw->fmr.fm_mr->list); + list_del(&mr->fmr.fm_mr->list); return rc; } static void -fmr_op_release_mr(struct rpcrdma_mw *r) +fmr_op_release_mr(struct rpcrdma_mr *mr) { LIST_HEAD(unmap_list); int rc; /* Ensure MW is not on any rl_registered list */ - if (!list_empty(&r->mw_list)) - list_del(&r->mw_list); + if (!list_empty(&mr->mr_list)) + list_del(&mr->mr_list); - kfree(r->fmr.fm_physaddrs); - kfree(r->mw_sg); + kfree(mr->fmr.fm_physaddrs); + kfree(mr->mr_sg); /* In case this one was left mapped, try to unmap it * to prevent dealloc_fmr from failing with EBUSY */ - rc = __fmr_unmap(r); + rc = __fmr_unmap(mr); if (rc) pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n", - r, rc); + mr, rc); - rc = ib_dealloc_fmr(r->fmr.fm_mr); + rc = ib_dealloc_fmr(mr->fmr.fm_mr); if (rc) pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n", - r, rc); + mr, rc); - kfree(r); + kfree(mr); } /* Reset of a single FMR. */ static void -fmr_op_recover_mr(struct rpcrdma_mw *mw) +fmr_op_recover_mr(struct rpcrdma_mr *mr) { - struct rpcrdma_xprt *r_xprt = mw->mw_xprt; + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; int rc; /* ORDER: invalidate first */ - rc = __fmr_unmap(mw); - - /* ORDER: then DMA unmap */ - ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, - mw->mw_sg, mw->mw_nents, mw->mw_dir); + rc = __fmr_unmap(mr); if (rc) goto out_release; - rpcrdma_put_mw(r_xprt, mw); + /* ORDER: then DMA unmap */ + rpcrdma_mr_unmap_and_put(mr); + r_xprt->rx_stats.mrs_recovered++; return; out_release: - pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mw); + pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mr); r_xprt->rx_stats.mrs_orphaned++; - spin_lock(&r_xprt->rx_buf.rb_mwlock); - list_del(&mw->mw_all); - spin_unlock(&r_xprt->rx_buf.rb_mwlock); + trace_xprtrdma_dma_unmap(mr); + ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + mr->mr_sg, mr->mr_nents, mr->mr_dir); + + spin_lock(&r_xprt->rx_buf.rb_mrlock); + list_del(&mr->mr_all); + spin_unlock(&r_xprt->rx_buf.rb_mrlock); - fmr_op_release_mr(mw); + fmr_op_release_mr(mr); } static int @@ -180,15 +182,15 @@ fmr_op_maxpages(struct rpcrdma_xprt *r_xprt) */ static struct rpcrdma_mr_seg * fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, - int nsegs, bool writing, struct rpcrdma_mw **out) + int nsegs, bool writing, struct rpcrdma_mr **out) { struct rpcrdma_mr_seg *seg1 = seg; int len, pageoff, i, rc; - struct rpcrdma_mw *mw; + struct rpcrdma_mr *mr; u64 *dma_pages; - mw = rpcrdma_get_mw(r_xprt); - if (!mw) + mr = rpcrdma_mr_get(r_xprt); + if (!mr) return ERR_PTR(-ENOBUFS); pageoff = offset_in_page(seg1->mr_offset); @@ -199,12 +201,12 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, nsegs = RPCRDMA_MAX_FMR_SGES; for (i = 0; i < nsegs;) { if (seg->mr_page) - sg_set_page(&mw->mw_sg[i], + sg_set_page(&mr->mr_sg[i], seg->mr_page, seg->mr_len, offset_in_page(seg->mr_offset)); else - sg_set_buf(&mw->mw_sg[i], seg->mr_offset, + sg_set_buf(&mr->mr_sg[i], seg->mr_offset, seg->mr_len); len += seg->mr_len; ++seg; @@ -214,40 +216,38 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) break; } - mw->mw_dir = rpcrdma_data_dir(writing); + mr->mr_dir = rpcrdma_data_dir(writing); - mw->mw_nents = ib_dma_map_sg(r_xprt->rx_ia.ri_device, - mw->mw_sg, i, mw->mw_dir); - if (!mw->mw_nents) + mr->mr_nents = ib_dma_map_sg(r_xprt->rx_ia.ri_device, + mr->mr_sg, i, mr->mr_dir); + if (!mr->mr_nents) goto out_dmamap_err; - for (i = 0, dma_pages = mw->fmr.fm_physaddrs; i < mw->mw_nents; i++) - dma_pages[i] = sg_dma_address(&mw->mw_sg[i]); - rc = ib_map_phys_fmr(mw->fmr.fm_mr, dma_pages, mw->mw_nents, + for (i = 0, dma_pages = mr->fmr.fm_physaddrs; i < mr->mr_nents; i++) + dma_pages[i] = sg_dma_address(&mr->mr_sg[i]); + rc = ib_map_phys_fmr(mr->fmr.fm_mr, dma_pages, mr->mr_nents, dma_pages[0]); if (rc) goto out_maperr; - mw->mw_handle = mw->fmr.fm_mr->rkey; - mw->mw_length = len; - mw->mw_offset = dma_pages[0] + pageoff; + mr->mr_handle = mr->fmr.fm_mr->rkey; + mr->mr_length = len; + mr->mr_offset = dma_pages[0] + pageoff; - *out = mw; + *out = mr; return seg; out_dmamap_err: pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", - mw->mw_sg, i); - rpcrdma_put_mw(r_xprt, mw); + mr->mr_sg, i); + rpcrdma_mr_put(mr); return ERR_PTR(-EIO); out_maperr: pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", len, (unsigned long long)dma_pages[0], - pageoff, mw->mw_nents, rc); - ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, - mw->mw_sg, mw->mw_nents, mw->mw_dir); - rpcrdma_put_mw(r_xprt, mw); + pageoff, mr->mr_nents, rc); + rpcrdma_mr_unmap_and_put(mr); return ERR_PTR(-EIO); } @@ -256,13 +256,13 @@ out_maperr: * Sleeps until it is safe for the host CPU to access the * previously mapped memory regions. * - * Caller ensures that @mws is not empty before the call. This + * Caller ensures that @mrs is not empty before the call. This * function empties the list. */ static void -fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) +fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) { - struct rpcrdma_mw *mw; + struct rpcrdma_mr *mr; LIST_HEAD(unmap_list); int rc; @@ -271,10 +271,11 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) * ib_unmap_fmr() is slow, so use a single call instead * of one call per mapped FMR. */ - list_for_each_entry(mw, mws, mw_list) { + list_for_each_entry(mr, mrs, mr_list) { dprintk("RPC: %s: unmapping fmr %p\n", - __func__, &mw->fmr); - list_add_tail(&mw->fmr.fm_mr->list, &unmap_list); + __func__, &mr->fmr); + trace_xprtrdma_localinv(mr); + list_add_tail(&mr->fmr.fm_mr->list, &unmap_list); } r_xprt->rx_stats.local_inv_needed++; rc = ib_unmap_fmr(&unmap_list); @@ -284,14 +285,10 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) /* ORDER: Now DMA unmap all of the req's MRs, and return * them to the free MW list. */ - while (!list_empty(mws)) { - mw = rpcrdma_pop_mw(mws); - dprintk("RPC: %s: DMA unmapping fmr %p\n", - __func__, &mw->fmr); - list_del(&mw->fmr.fm_mr->list); - ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, - mw->mw_sg, mw->mw_nents, mw->mw_dir); - rpcrdma_put_mw(r_xprt, mw); + while (!list_empty(mrs)) { + mr = rpcrdma_mr_pop(mrs); + list_del(&mr->fmr.fm_mr->list); + rpcrdma_mr_unmap_and_put(mr); } return; @@ -299,10 +296,10 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) out_reset: pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc); - while (!list_empty(mws)) { - mw = rpcrdma_pop_mw(mws); - list_del(&mw->fmr.fm_mr->list); - fmr_op_recover_mr(mw); + while (!list_empty(mrs)) { + mr = rpcrdma_mr_pop(mrs); + list_del(&mr->fmr.fm_mr->list); + fmr_op_recover_mr(mr); } } diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 773e66e10a15..90f688f19783 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -1,11 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2015, 2017 Oracle. All rights reserved. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. */ /* Lightweight memory registration using Fast Registration Work - * Requests (FRWR). Also referred to sometimes as FRMR mode. + * Requests (FRWR). * * FRWR features ordered asynchronous registration and deregistration * of arbitrarily sized memory regions. This is the fastest and safest @@ -15,9 +15,9 @@ /* Normal operation * * A Memory Region is prepared for RDMA READ or WRITE using a FAST_REG - * Work Request (frmr_op_map). When the RDMA operation is finished, this + * Work Request (frwr_op_map). When the RDMA operation is finished, this * Memory Region is invalidated using a LOCAL_INV Work Request - * (frmr_op_unmap). + * (frwr_op_unmap_sync). * * Typically these Work Requests are not signaled, and neither are RDMA * SEND Work Requests (with the exception of signaling occasionally to @@ -26,7 +26,7 @@ * * As an optimization, frwr_op_unmap marks MRs INVALID before the * LOCAL_INV WR is posted. If posting succeeds, the MR is placed on - * rb_mws immediately so that no work (like managing a linked list + * rb_mrs immediately so that no work (like managing a linked list * under a spinlock) is needed in the completion upcall. * * But this means that frwr_op_map() can occasionally encounter an MR @@ -60,7 +60,7 @@ * When frwr_op_map encounters FLUSHED and VALID MRs, they are recovered * with ib_dereg_mr and then are re-initialized. Because MR recovery * allocates fresh resources, it is deferred to a workqueue, and the - * recovered MRs are placed back on the rb_mws list when recovery is + * recovered MRs are placed back on the rb_mrs list when recovery is * complete. frwr_op_map allocates another MR for the current RPC while * the broken MR is reset. * @@ -96,26 +96,26 @@ out_not_supported: } static int -frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) +frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) { - unsigned int depth = ia->ri_max_frmr_depth; - struct rpcrdma_frmr *f = &r->frmr; + unsigned int depth = ia->ri_max_frwr_depth; + struct rpcrdma_frwr *frwr = &mr->frwr; int rc; - f->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth); - if (IS_ERR(f->fr_mr)) + frwr->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth); + if (IS_ERR(frwr->fr_mr)) goto out_mr_err; - r->mw_sg = kcalloc(depth, sizeof(*r->mw_sg), GFP_KERNEL); - if (!r->mw_sg) + mr->mr_sg = kcalloc(depth, sizeof(*mr->mr_sg), GFP_KERNEL); + if (!mr->mr_sg) goto out_list_err; - sg_init_table(r->mw_sg, depth); - init_completion(&f->fr_linv_done); + sg_init_table(mr->mr_sg, depth); + init_completion(&frwr->fr_linv_done); return 0; out_mr_err: - rc = PTR_ERR(f->fr_mr); + rc = PTR_ERR(frwr->fr_mr); dprintk("RPC: %s: ib_alloc_mr status %i\n", __func__, rc); return rc; @@ -124,83 +124,85 @@ out_list_err: rc = -ENOMEM; dprintk("RPC: %s: sg allocation failure\n", __func__); - ib_dereg_mr(f->fr_mr); + ib_dereg_mr(frwr->fr_mr); return rc; } static void -frwr_op_release_mr(struct rpcrdma_mw *r) +frwr_op_release_mr(struct rpcrdma_mr *mr) { int rc; - /* Ensure MW is not on any rl_registered list */ - if (!list_empty(&r->mw_list)) - list_del(&r->mw_list); + /* Ensure MR is not on any rl_registered list */ + if (!list_empty(&mr->mr_list)) + list_del(&mr->mr_list); - rc = ib_dereg_mr(r->frmr.fr_mr); + rc = ib_dereg_mr(mr->frwr.fr_mr); if (rc) pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n", - r, rc); - kfree(r->mw_sg); - kfree(r); + mr, rc); + kfree(mr->mr_sg); + kfree(mr); } static int -__frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) +__frwr_mr_reset(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) { - struct rpcrdma_frmr *f = &r->frmr; + struct rpcrdma_frwr *frwr = &mr->frwr; int rc; - rc = ib_dereg_mr(f->fr_mr); + rc = ib_dereg_mr(frwr->fr_mr); if (rc) { pr_warn("rpcrdma: ib_dereg_mr status %d, frwr %p orphaned\n", - rc, r); + rc, mr); return rc; } - f->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, - ia->ri_max_frmr_depth); - if (IS_ERR(f->fr_mr)) { + frwr->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, + ia->ri_max_frwr_depth); + if (IS_ERR(frwr->fr_mr)) { pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n", - PTR_ERR(f->fr_mr), r); - return PTR_ERR(f->fr_mr); + PTR_ERR(frwr->fr_mr), mr); + return PTR_ERR(frwr->fr_mr); } - dprintk("RPC: %s: recovered FRMR %p\n", __func__, f); - f->fr_state = FRMR_IS_INVALID; + dprintk("RPC: %s: recovered FRWR %p\n", __func__, frwr); + frwr->fr_state = FRWR_IS_INVALID; return 0; } -/* Reset of a single FRMR. Generate a fresh rkey by replacing the MR. +/* Reset of a single FRWR. Generate a fresh rkey by replacing the MR. */ static void -frwr_op_recover_mr(struct rpcrdma_mw *mw) +frwr_op_recover_mr(struct rpcrdma_mr *mr) { - enum rpcrdma_frmr_state state = mw->frmr.fr_state; - struct rpcrdma_xprt *r_xprt = mw->mw_xprt; + enum rpcrdma_frwr_state state = mr->frwr.fr_state; + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; struct rpcrdma_ia *ia = &r_xprt->rx_ia; int rc; - rc = __frwr_reset_mr(ia, mw); - if (state != FRMR_FLUSHED_LI) + rc = __frwr_mr_reset(ia, mr); + if (state != FRWR_FLUSHED_LI) { + trace_xprtrdma_dma_unmap(mr); ib_dma_unmap_sg(ia->ri_device, - mw->mw_sg, mw->mw_nents, mw->mw_dir); + mr->mr_sg, mr->mr_nents, mr->mr_dir); + } if (rc) goto out_release; - rpcrdma_put_mw(r_xprt, mw); + rpcrdma_mr_put(mr); r_xprt->rx_stats.mrs_recovered++; return; out_release: - pr_err("rpcrdma: FRMR reset failed %d, %p release\n", rc, mw); + pr_err("rpcrdma: FRWR reset failed %d, %p release\n", rc, mr); r_xprt->rx_stats.mrs_orphaned++; - spin_lock(&r_xprt->rx_buf.rb_mwlock); - list_del(&mw->mw_all); - spin_unlock(&r_xprt->rx_buf.rb_mwlock); + spin_lock(&r_xprt->rx_buf.rb_mrlock); + list_del(&mr->mr_all); + spin_unlock(&r_xprt->rx_buf.rb_mrlock); - frwr_op_release_mr(mw); + frwr_op_release_mr(mr); } static int @@ -214,31 +216,31 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG) ia->ri_mrtype = IB_MR_TYPE_SG_GAPS; - ia->ri_max_frmr_depth = + ia->ri_max_frwr_depth = min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, attrs->max_fast_reg_page_list_len); dprintk("RPC: %s: device's max FR page list len = %u\n", - __func__, ia->ri_max_frmr_depth); - - /* Add room for frmr register and invalidate WRs. - * 1. FRMR reg WR for head - * 2. FRMR invalidate WR for head - * 3. N FRMR reg WRs for pagelist - * 4. N FRMR invalidate WRs for pagelist - * 5. FRMR reg WR for tail - * 6. FRMR invalidate WR for tail + __func__, ia->ri_max_frwr_depth); + + /* Add room for frwr register and invalidate WRs. + * 1. FRWR reg WR for head + * 2. FRWR invalidate WR for head + * 3. N FRWR reg WRs for pagelist + * 4. N FRWR invalidate WRs for pagelist + * 5. FRWR reg WR for tail + * 6. FRWR invalidate WR for tail * 7. The RDMA_SEND WR */ depth = 7; - /* Calculate N if the device max FRMR depth is smaller than + /* Calculate N if the device max FRWR depth is smaller than * RPCRDMA_MAX_DATA_SEGS. */ - if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) { - delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frmr_depth; + if (ia->ri_max_frwr_depth < RPCRDMA_MAX_DATA_SEGS) { + delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frwr_depth; do { - depth += 2; /* FRMR reg + invalidate */ - delta -= ia->ri_max_frmr_depth; + depth += 2; /* FRWR reg + invalidate */ + delta -= ia->ri_max_frwr_depth; } while (delta > 0); } @@ -252,7 +254,7 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, } ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / - ia->ri_max_frmr_depth); + ia->ri_max_frwr_depth); return 0; } @@ -265,7 +267,7 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt) struct rpcrdma_ia *ia = &r_xprt->rx_ia; return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, - RPCRDMA_MAX_HDR_SEGS * ia->ri_max_frmr_depth); + RPCRDMA_MAX_HDR_SEGS * ia->ri_max_frwr_depth); } static void @@ -286,16 +288,16 @@ __frwr_sendcompletion_flush(struct ib_wc *wc, const char *wr) static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) { - struct rpcrdma_frmr *frmr; - struct ib_cqe *cqe; + struct ib_cqe *cqe = wc->wr_cqe; + struct rpcrdma_frwr *frwr = + container_of(cqe, struct rpcrdma_frwr, fr_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ if (wc->status != IB_WC_SUCCESS) { - cqe = wc->wr_cqe; - frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe); - frmr->fr_state = FRMR_FLUSHED_FR; + frwr->fr_state = FRWR_FLUSHED_FR; __frwr_sendcompletion_flush(wc, "fastreg"); } + trace_xprtrdma_wc_fastreg(wc, frwr); } /** @@ -307,16 +309,16 @@ frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) { - struct rpcrdma_frmr *frmr; - struct ib_cqe *cqe; + struct ib_cqe *cqe = wc->wr_cqe; + struct rpcrdma_frwr *frwr = container_of(cqe, struct rpcrdma_frwr, + fr_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ if (wc->status != IB_WC_SUCCESS) { - cqe = wc->wr_cqe; - frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe); - frmr->fr_state = FRMR_FLUSHED_LI; + frwr->fr_state = FRWR_FLUSHED_LI; __frwr_sendcompletion_flush(wc, "localinv"); } + trace_xprtrdma_wc_li(wc, frwr); } /** @@ -329,17 +331,17 @@ frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) { - struct rpcrdma_frmr *frmr; - struct ib_cqe *cqe; + struct ib_cqe *cqe = wc->wr_cqe; + struct rpcrdma_frwr *frwr = container_of(cqe, struct rpcrdma_frwr, + fr_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ - cqe = wc->wr_cqe; - frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe); if (wc->status != IB_WC_SUCCESS) { - frmr->fr_state = FRMR_FLUSHED_LI; + frwr->fr_state = FRWR_FLUSHED_LI; __frwr_sendcompletion_flush(wc, "localinv"); } - complete(&frmr->fr_linv_done); + complete(&frwr->fr_linv_done); + trace_xprtrdma_wc_li_wake(wc, frwr); } /* Post a REG_MR Work Request to register a memory region @@ -347,41 +349,39 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) */ static struct rpcrdma_mr_seg * frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, - int nsegs, bool writing, struct rpcrdma_mw **out) + int nsegs, bool writing, struct rpcrdma_mr **out) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; bool holes_ok = ia->ri_mrtype == IB_MR_TYPE_SG_GAPS; - struct rpcrdma_mw *mw; - struct rpcrdma_frmr *frmr; - struct ib_mr *mr; + struct rpcrdma_frwr *frwr; + struct rpcrdma_mr *mr; + struct ib_mr *ibmr; struct ib_reg_wr *reg_wr; struct ib_send_wr *bad_wr; int rc, i, n; u8 key; - mw = NULL; + mr = NULL; do { - if (mw) - rpcrdma_defer_mr_recovery(mw); - mw = rpcrdma_get_mw(r_xprt); - if (!mw) + if (mr) + rpcrdma_mr_defer_recovery(mr); + mr = rpcrdma_mr_get(r_xprt); + if (!mr) return ERR_PTR(-ENOBUFS); - } while (mw->frmr.fr_state != FRMR_IS_INVALID); - frmr = &mw->frmr; - frmr->fr_state = FRMR_IS_VALID; - mr = frmr->fr_mr; - reg_wr = &frmr->fr_regwr; - - if (nsegs > ia->ri_max_frmr_depth) - nsegs = ia->ri_max_frmr_depth; + } while (mr->frwr.fr_state != FRWR_IS_INVALID); + frwr = &mr->frwr; + frwr->fr_state = FRWR_IS_VALID; + + if (nsegs > ia->ri_max_frwr_depth) + nsegs = ia->ri_max_frwr_depth; for (i = 0; i < nsegs;) { if (seg->mr_page) - sg_set_page(&mw->mw_sg[i], + sg_set_page(&mr->mr_sg[i], seg->mr_page, seg->mr_len, offset_in_page(seg->mr_offset)); else - sg_set_buf(&mw->mw_sg[i], seg->mr_offset, + sg_set_buf(&mr->mr_sg[i], seg->mr_offset, seg->mr_len); ++seg; @@ -392,30 +392,29 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) break; } - mw->mw_dir = rpcrdma_data_dir(writing); + mr->mr_dir = rpcrdma_data_dir(writing); - mw->mw_nents = ib_dma_map_sg(ia->ri_device, mw->mw_sg, i, mw->mw_dir); - if (!mw->mw_nents) + mr->mr_nents = ib_dma_map_sg(ia->ri_device, mr->mr_sg, i, mr->mr_dir); + if (!mr->mr_nents) goto out_dmamap_err; - n = ib_map_mr_sg(mr, mw->mw_sg, mw->mw_nents, NULL, PAGE_SIZE); - if (unlikely(n != mw->mw_nents)) + ibmr = frwr->fr_mr; + n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE); + if (unlikely(n != mr->mr_nents)) goto out_mapmr_err; - dprintk("RPC: %s: Using frmr %p to map %u segments (%llu bytes)\n", - __func__, frmr, mw->mw_nents, mr->length); - - key = (u8)(mr->rkey & 0x000000FF); - ib_update_fast_reg_key(mr, ++key); + key = (u8)(ibmr->rkey & 0x000000FF); + ib_update_fast_reg_key(ibmr, ++key); + reg_wr = &frwr->fr_regwr; reg_wr->wr.next = NULL; reg_wr->wr.opcode = IB_WR_REG_MR; - frmr->fr_cqe.done = frwr_wc_fastreg; - reg_wr->wr.wr_cqe = &frmr->fr_cqe; + frwr->fr_cqe.done = frwr_wc_fastreg; + reg_wr->wr.wr_cqe = &frwr->fr_cqe; reg_wr->wr.num_sge = 0; reg_wr->wr.send_flags = 0; - reg_wr->mr = mr; - reg_wr->key = mr->rkey; + reg_wr->mr = ibmr; + reg_wr->key = ibmr->rkey; reg_wr->access = writing ? IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : IB_ACCESS_REMOTE_READ; @@ -424,47 +423,64 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, if (rc) goto out_senderr; - mw->mw_handle = mr->rkey; - mw->mw_length = mr->length; - mw->mw_offset = mr->iova; + mr->mr_handle = ibmr->rkey; + mr->mr_length = ibmr->length; + mr->mr_offset = ibmr->iova; - *out = mw; + *out = mr; return seg; out_dmamap_err: pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", - mw->mw_sg, i); - frmr->fr_state = FRMR_IS_INVALID; - rpcrdma_put_mw(r_xprt, mw); + mr->mr_sg, i); + frwr->fr_state = FRWR_IS_INVALID; + rpcrdma_mr_put(mr); return ERR_PTR(-EIO); out_mapmr_err: pr_err("rpcrdma: failed to map mr %p (%d/%d)\n", - frmr->fr_mr, n, mw->mw_nents); - rpcrdma_defer_mr_recovery(mw); + frwr->fr_mr, n, mr->mr_nents); + rpcrdma_mr_defer_recovery(mr); return ERR_PTR(-EIO); out_senderr: - pr_err("rpcrdma: FRMR registration ib_post_send returned %i\n", rc); - rpcrdma_defer_mr_recovery(mw); + pr_err("rpcrdma: FRWR registration ib_post_send returned %i\n", rc); + rpcrdma_mr_defer_recovery(mr); return ERR_PTR(-ENOTCONN); } +/* Handle a remotely invalidated mr on the @mrs list + */ +static void +frwr_op_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) +{ + struct rpcrdma_mr *mr; + + list_for_each_entry(mr, mrs, mr_list) + if (mr->mr_handle == rep->rr_inv_rkey) { + list_del(&mr->mr_list); + trace_xprtrdma_remoteinv(mr); + mr->frwr.fr_state = FRWR_IS_INVALID; + rpcrdma_mr_unmap_and_put(mr); + break; /* only one invalidated MR per RPC */ + } +} + /* Invalidate all memory regions that were registered for "req". * * Sleeps until it is safe for the host CPU to access the * previously mapped memory regions. * - * Caller ensures that @mws is not empty before the call. This + * Caller ensures that @mrs is not empty before the call. This * function empties the list. */ static void -frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) +frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) { struct ib_send_wr *first, **prev, *last, *bad_wr; struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_frmr *f; - struct rpcrdma_mw *mw; + struct rpcrdma_frwr *frwr; + struct rpcrdma_mr *mr; int count, rc; /* ORDER: Invalidate all of the MRs first @@ -472,31 +488,27 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) * Chain the LOCAL_INV Work Requests and post them with * a single ib_post_send() call. */ - f = NULL; + frwr = NULL; count = 0; prev = &first; - list_for_each_entry(mw, mws, mw_list) { - mw->frmr.fr_state = FRMR_IS_INVALID; + list_for_each_entry(mr, mrs, mr_list) { + mr->frwr.fr_state = FRWR_IS_INVALID; - if (mw->mw_flags & RPCRDMA_MW_F_RI) - continue; + frwr = &mr->frwr; + trace_xprtrdma_localinv(mr); - f = &mw->frmr; - dprintk("RPC: %s: invalidating frmr %p\n", - __func__, f); - - f->fr_cqe.done = frwr_wc_localinv; - last = &f->fr_invwr; + frwr->fr_cqe.done = frwr_wc_localinv; + last = &frwr->fr_invwr; memset(last, 0, sizeof(*last)); - last->wr_cqe = &f->fr_cqe; + last->wr_cqe = &frwr->fr_cqe; last->opcode = IB_WR_LOCAL_INV; - last->ex.invalidate_rkey = mw->mw_handle; + last->ex.invalidate_rkey = mr->mr_handle; count++; *prev = last; prev = &last->next; } - if (!f) + if (!frwr) goto unmap; /* Strong send queue ordering guarantees that when the @@ -504,8 +516,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) * are complete. */ last->send_flags = IB_SEND_SIGNALED; - f->fr_cqe.done = frwr_wc_localinv_wake; - reinit_completion(&f->fr_linv_done); + frwr->fr_cqe.done = frwr_wc_localinv_wake; + reinit_completion(&frwr->fr_linv_done); /* Transport disconnect drains the receive CQ before it * replaces the QP. The RPC reply handler won't call us @@ -515,36 +527,32 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) bad_wr = NULL; rc = ib_post_send(ia->ri_id->qp, first, &bad_wr); if (bad_wr != first) - wait_for_completion(&f->fr_linv_done); + wait_for_completion(&frwr->fr_linv_done); if (rc) goto reset_mrs; /* ORDER: Now DMA unmap all of the MRs, and return - * them to the free MW list. + * them to the free MR list. */ unmap: - while (!list_empty(mws)) { - mw = rpcrdma_pop_mw(mws); - dprintk("RPC: %s: DMA unmapping frmr %p\n", - __func__, &mw->frmr); - ib_dma_unmap_sg(ia->ri_device, - mw->mw_sg, mw->mw_nents, mw->mw_dir); - rpcrdma_put_mw(r_xprt, mw); + while (!list_empty(mrs)) { + mr = rpcrdma_mr_pop(mrs); + rpcrdma_mr_unmap_and_put(mr); } return; reset_mrs: - pr_err("rpcrdma: FRMR invalidate ib_post_send returned %i\n", rc); + pr_err("rpcrdma: FRWR invalidate ib_post_send returned %i\n", rc); /* Find and reset the MRs in the LOCAL_INV WRs that did not * get posted. */ while (bad_wr) { - f = container_of(bad_wr, struct rpcrdma_frmr, - fr_invwr); - mw = container_of(f, struct rpcrdma_mw, frmr); + frwr = container_of(bad_wr, struct rpcrdma_frwr, + fr_invwr); + mr = container_of(frwr, struct rpcrdma_mr, frwr); - __frwr_reset_mr(ia, mw); + __frwr_mr_reset(ia, mr); bad_wr = bad_wr->next; } @@ -553,6 +561,7 @@ reset_mrs: const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_map = frwr_op_map, + .ro_reminv = frwr_op_reminv, .ro_unmap_sync = frwr_op_unmap_sync, .ro_recover_mr = frwr_op_recover_mr, .ro_open = frwr_op_open, diff --git a/net/sunrpc/xprtrdma/module.c b/net/sunrpc/xprtrdma/module.c index 560712bd9fa2..a762d192372b 100644 --- a/net/sunrpc/xprtrdma/module.c +++ b/net/sunrpc/xprtrdma/module.c @@ -1,18 +1,20 @@ /* - * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2015, 2017 Oracle. All rights reserved. */ /* rpcrdma.ko module initialization */ +#include <linux/types.h> +#include <linux/compiler.h> #include <linux/module.h> #include <linux/init.h> #include <linux/sunrpc/svc_rdma.h> -#include "xprt_rdma.h" -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_TRANS -#endif +#include <asm/swab.h> + +#define CREATE_TRACE_POINTS +#include "xprt_rdma.h" MODULE_AUTHOR("Open Grid Computing and Network Appliance, Inc."); MODULE_DESCRIPTION("RPC/RDMA Transport"); diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index ed34dc0f144c..162e5dd82466 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -292,15 +292,15 @@ encode_item_not_present(struct xdr_stream *xdr) } static void -xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw) +xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr *mr) { - *iptr++ = cpu_to_be32(mw->mw_handle); - *iptr++ = cpu_to_be32(mw->mw_length); - xdr_encode_hyper(iptr, mw->mw_offset); + *iptr++ = cpu_to_be32(mr->mr_handle); + *iptr++ = cpu_to_be32(mr->mr_length); + xdr_encode_hyper(iptr, mr->mr_offset); } static int -encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw) +encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr) { __be32 *p; @@ -308,12 +308,12 @@ encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw) if (unlikely(!p)) return -EMSGSIZE; - xdr_encode_rdma_segment(p, mw); + xdr_encode_rdma_segment(p, mr); return 0; } static int -encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw, +encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr, u32 position) { __be32 *p; @@ -324,7 +324,7 @@ encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw, *p++ = xdr_one; /* Item present */ *p++ = cpu_to_be32(position); - xdr_encode_rdma_segment(p, mw); + xdr_encode_rdma_segment(p, mr); return 0; } @@ -348,7 +348,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, { struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; - struct rpcrdma_mw *mw; + struct rpcrdma_mr *mr; unsigned int pos; int nsegs; @@ -363,21 +363,17 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, do { seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, - false, &mw); + false, &mr); if (IS_ERR(seg)) return PTR_ERR(seg); - rpcrdma_push_mw(mw, &req->rl_registered); + rpcrdma_mr_push(mr, &req->rl_registered); - if (encode_read_segment(xdr, mw, pos) < 0) + if (encode_read_segment(xdr, mr, pos) < 0) return -EMSGSIZE; - dprintk("RPC: %5u %s: pos %u %u@0x%016llx:0x%08x (%s)\n", - rqst->rq_task->tk_pid, __func__, pos, - mw->mw_length, (unsigned long long)mw->mw_offset, - mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last"); - + trace_xprtrdma_read_chunk(rqst->rq_task, pos, mr, nsegs); r_xprt->rx_stats.read_chunk_count++; - nsegs -= mw->mw_nents; + nsegs -= mr->mr_nents; } while (nsegs); return 0; @@ -404,7 +400,7 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, { struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; - struct rpcrdma_mw *mw; + struct rpcrdma_mr *mr; int nsegs, nchunks; __be32 *segcount; @@ -425,23 +421,19 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, nchunks = 0; do { seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, - true, &mw); + true, &mr); if (IS_ERR(seg)) return PTR_ERR(seg); - rpcrdma_push_mw(mw, &req->rl_registered); + rpcrdma_mr_push(mr, &req->rl_registered); - if (encode_rdma_segment(xdr, mw) < 0) + if (encode_rdma_segment(xdr, mr) < 0) return -EMSGSIZE; - dprintk("RPC: %5u %s: %u@0x016%llx:0x%08x (%s)\n", - rqst->rq_task->tk_pid, __func__, - mw->mw_length, (unsigned long long)mw->mw_offset, - mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last"); - + trace_xprtrdma_write_chunk(rqst->rq_task, mr, nsegs); r_xprt->rx_stats.write_chunk_count++; - r_xprt->rx_stats.total_rdma_request += seg->mr_len; + r_xprt->rx_stats.total_rdma_request += mr->mr_length; nchunks++; - nsegs -= mw->mw_nents; + nsegs -= mr->mr_nents; } while (nsegs); /* Update count of segments in this Write chunk */ @@ -468,7 +460,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, { struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; - struct rpcrdma_mw *mw; + struct rpcrdma_mr *mr; int nsegs, nchunks; __be32 *segcount; @@ -487,23 +479,19 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, nchunks = 0; do { seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, - true, &mw); + true, &mr); if (IS_ERR(seg)) return PTR_ERR(seg); - rpcrdma_push_mw(mw, &req->rl_registered); + rpcrdma_mr_push(mr, &req->rl_registered); - if (encode_rdma_segment(xdr, mw) < 0) + if (encode_rdma_segment(xdr, mr) < 0) return -EMSGSIZE; - dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n", - rqst->rq_task->tk_pid, __func__, - mw->mw_length, (unsigned long long)mw->mw_offset, - mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last"); - + trace_xprtrdma_reply_chunk(rqst->rq_task, mr, nsegs); r_xprt->rx_stats.reply_chunk_count++; - r_xprt->rx_stats.total_rdma_request += seg->mr_len; + r_xprt->rx_stats.total_rdma_request += mr->mr_length; nchunks++; - nsegs -= mw->mw_nents; + nsegs -= mr->mr_nents; } while (nsegs); /* Update count of segments in the Reply chunk */ @@ -524,9 +512,6 @@ rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc) struct ib_sge *sge; unsigned int count; - dprintk("RPC: %s: unmapping %u sges for sc=%p\n", - __func__, sc->sc_unmap_count, sc); - /* The first two SGEs contain the transport header and * the inline buffer. These are always left mapped so * they can be cheaply re-used. @@ -754,11 +739,6 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) __be32 *p; int ret; -#if defined(CONFIG_SUNRPC_BACKCHANNEL) - if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state)) - return rpcrdma_bc_marshal_reply(rqst); -#endif - rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); xdr_init_encode(xdr, &req->rl_hdrbuf, req->rl_rdmabuf->rg_base); @@ -821,6 +801,17 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) rtype = rpcrdma_areadch; } + /* If this is a retransmit, discard previously registered + * chunks. Very likely the connection has been replaced, + * so these registrations are invalid and unusable. + */ + while (unlikely(!list_empty(&req->rl_registered))) { + struct rpcrdma_mr *mr; + + mr = rpcrdma_mr_pop(&req->rl_registered); + rpcrdma_mr_defer_recovery(mr); + } + /* This implementation supports the following combinations * of chunk lists in one RPC-over-RDMA Call message: * @@ -868,10 +859,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) if (ret) goto out_err; - dprintk("RPC: %5u %s: %s/%s: hdrlen %u rpclen\n", - rqst->rq_task->tk_pid, __func__, - transfertypes[rtype], transfertypes[wtype], - xdr_stream_pos(xdr)); + trace_xprtrdma_marshal(rqst, xdr_stream_pos(xdr), rtype, wtype); ret = rpcrdma_prepare_send_sges(r_xprt, req, xdr_stream_pos(xdr), &rqst->rq_snd_buf, rtype); @@ -926,8 +914,7 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) curlen = rqst->rq_rcv_buf.head[0].iov_len; if (curlen > copy_len) curlen = copy_len; - dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n", - __func__, srcp, copy_len, curlen); + trace_xprtrdma_fixup(rqst, copy_len, curlen); srcp += curlen; copy_len -= curlen; @@ -947,9 +934,8 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) if (curlen > pagelist_len) curlen = pagelist_len; - dprintk("RPC: %s: page %d" - " srcp 0x%p len %d curlen %d\n", - __func__, i, srcp, copy_len, curlen); + trace_xprtrdma_fixup_pg(rqst, i, srcp, + copy_len, curlen); destp = kmap_atomic(ppages[i]); memcpy(destp + page_base, srcp, curlen); flush_dcache_page(ppages[i]); @@ -984,24 +970,6 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) return fixup_copy_count; } -/* Caller must guarantee @rep remains stable during this call. - */ -static void -rpcrdma_mark_remote_invalidation(struct list_head *mws, - struct rpcrdma_rep *rep) -{ - struct rpcrdma_mw *mw; - - if (!(rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)) - return; - - list_for_each_entry(mw, mws, mw_list) - if (mw->mw_handle == rep->rr_inv_rkey) { - mw->mw_flags = RPCRDMA_MW_F_RI; - break; /* only one invalidated MR per RPC */ - } -} - /* By convention, backchannel calls arrive via rdma_msg type * messages, and never populate the chunk lists. This makes * the RPC/RDMA header small and fixed in size, so it is @@ -1058,26 +1026,19 @@ out_short: static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length) { + u32 handle; + u64 offset; __be32 *p; p = xdr_inline_decode(xdr, 4 * sizeof(*p)); if (unlikely(!p)) return -EIO; - ifdebug(FACILITY) { - u64 offset; - u32 handle; - - handle = be32_to_cpup(p++); - *length = be32_to_cpup(p++); - xdr_decode_hyper(p, &offset); - dprintk("RPC: %s: segment %u@0x%016llx:0x%08x\n", - __func__, *length, (unsigned long long)offset, - handle); - } else { - *length = be32_to_cpup(p + 1); - } + handle = be32_to_cpup(p++); + *length = be32_to_cpup(p++); + xdr_decode_hyper(p, &offset); + trace_xprtrdma_decode_seg(handle, *length, offset); return 0; } @@ -1098,8 +1059,6 @@ static int decode_write_chunk(struct xdr_stream *xdr, u32 *length) *length += seglength; } - dprintk("RPC: %s: segcount=%u, %u bytes\n", - __func__, be32_to_cpup(p), *length); return 0; } @@ -1296,8 +1255,7 @@ out: * being marshaled. */ out_badheader: - dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n", - rqst->rq_task->tk_pid, __func__, be32_to_cpu(rep->rr_proc)); + trace_xprtrdma_reply_hdr(rep); r_xprt->rx_stats.bad_reply_count++; status = -EIO; goto out; @@ -1339,9 +1297,12 @@ void rpcrdma_deferred_completion(struct work_struct *work) struct rpcrdma_rep *rep = container_of(work, struct rpcrdma_rep, rr_work); struct rpcrdma_req *req = rpcr_to_rdmar(rep->rr_rqst); + struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; - rpcrdma_mark_remote_invalidation(&req->rl_registered, rep); - rpcrdma_release_rqst(rep->rr_rxprt, req); + trace_xprtrdma_defer_cmp(rep); + if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) + r_xprt->rx_ia.ri_ops->ro_reminv(rep, &req->rl_registered); + rpcrdma_release_rqst(r_xprt, req); rpcrdma_complete_rqst(rep); } @@ -1360,8 +1321,6 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) u32 credits; __be32 *p; - dprintk("RPC: %s: incoming rep %p\n", __func__, rep); - if (rep->rr_hdrbuf.head[0].iov_len == 0) goto out_badstatus; @@ -1405,14 +1364,9 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) rep->rr_rqst = rqst; clear_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags); - dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n", - __func__, rep, req, be32_to_cpu(rep->rr_xid)); + trace_xprtrdma_reply(rqst->rq_task, rep, req, credits); - if (list_empty(&req->rl_registered) && - !test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) - rpcrdma_complete_rqst(rep); - else - queue_work(rpcrdma_receive_wq, &rep->rr_work); + queue_work_on(req->rl_cpu, rpcrdma_receive_wq, &rep->rr_work); return; out_badstatus: @@ -1424,8 +1378,7 @@ out_badstatus: return; out_badversion: - dprintk("RPC: %s: invalid version %d\n", - __func__, be32_to_cpu(rep->rr_vers)); + trace_xprtrdma_reply_vers(rep); goto repost; /* The RPC transaction has already been terminated, or the header @@ -1433,12 +1386,11 @@ out_badversion: */ out_norqst: spin_unlock(&xprt->recv_lock); - dprintk("RPC: %s: no match for incoming xid 0x%08x\n", - __func__, be32_to_cpu(rep->rr_xid)); + trace_xprtrdma_reply_rqst(rep); goto repost; out_shortreply: - dprintk("RPC: %s: short/invalid reply\n", __func__); + trace_xprtrdma_reply_short(rep); /* If no pending RPC transaction was matched, post a replacement * receive buffer before returning. diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 646c24494ea7..4b1ecfe979cf 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -52,6 +52,7 @@ #include <linux/slab.h> #include <linux/seq_file.h> #include <linux/sunrpc/addr.h> +#include <linux/smp.h> #include "xprt_rdma.h" @@ -66,8 +67,7 @@ static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; -static unsigned int xprt_rdma_inline_write_padding; -unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; +unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRWR; int xprt_rdma_pad_optimize; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) @@ -80,6 +80,7 @@ static unsigned int zero; static unsigned int max_padding = PAGE_SIZE; static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS; static unsigned int max_memreg = RPCRDMA_LAST - 1; +static unsigned int dummy; static struct ctl_table_header *sunrpc_table_header; @@ -113,7 +114,7 @@ static struct ctl_table xr_tunables_table[] = { }, { .procname = "rdma_inline_write_padding", - .data = &xprt_rdma_inline_write_padding, + .data = &dummy, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -258,13 +259,10 @@ xprt_rdma_connect_worker(struct work_struct *work) xprt_clear_connected(xprt); - dprintk("RPC: %s: %sconnect\n", __func__, - r_xprt->rx_ep.rep_connected != 0 ? "re" : ""); rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); if (rc) xprt_wake_pending_tasks(xprt, rc); - dprintk("RPC: %s: exit\n", __func__); xprt_clear_connecting(xprt); } @@ -274,7 +272,7 @@ xprt_rdma_inject_disconnect(struct rpc_xprt *xprt) struct rpcrdma_xprt *r_xprt = container_of(xprt, struct rpcrdma_xprt, rx_xprt); - pr_info("rpcrdma: injecting transport disconnect on xprt=%p\n", xprt); + trace_xprtrdma_inject_dsc(r_xprt); rdma_disconnect(r_xprt->rx_ia.ri_id); } @@ -294,7 +292,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - dprintk("RPC: %s: called\n", __func__); + trace_xprtrdma_destroy(r_xprt); cancel_delayed_work_sync(&r_xprt->rx_connect_worker); @@ -305,11 +303,8 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) rpcrdma_ia_close(&r_xprt->rx_ia); xprt_rdma_free_addresses(xprt); - xprt_free(xprt); - dprintk("RPC: %s: returning\n", __func__); - module_put(THIS_MODULE); } @@ -360,9 +355,7 @@ xprt_setup_rdma(struct xprt_create *args) /* * Set up RDMA-specific connect data. */ - - sap = (struct sockaddr *)&cdata.addr; - memcpy(sap, args->dstaddr, args->addrlen); + sap = args->dstaddr; /* Ensure xprt->addr holds valid server TCP (not RDMA) * address, for any side protocols which peek at it */ @@ -372,6 +365,7 @@ xprt_setup_rdma(struct xprt_create *args) if (rpc_get_port(sap)) xprt_set_bound(xprt); + xprt_rdma_format_addresses(xprt, sap); cdata.max_requests = xprt->max_reqs; @@ -386,8 +380,6 @@ xprt_setup_rdma(struct xprt_create *args) if (cdata.inline_rsize > cdata.rsize) cdata.inline_rsize = cdata.rsize; - cdata.padding = xprt_rdma_inline_write_padding; - /* * Create new transport instance, which includes initialized * o ia @@ -397,7 +389,7 @@ xprt_setup_rdma(struct xprt_create *args) new_xprt = rpcx_to_rdmax(xprt); - rc = rpcrdma_ia_open(new_xprt, sap); + rc = rpcrdma_ia_open(new_xprt); if (rc) goto out1; @@ -406,31 +398,19 @@ xprt_setup_rdma(struct xprt_create *args) */ new_xprt->rx_data = cdata; new_ep = &new_xprt->rx_ep; - new_ep->rep_remote_addr = cdata.addr; rc = rpcrdma_ep_create(&new_xprt->rx_ep, &new_xprt->rx_ia, &new_xprt->rx_data); if (rc) goto out2; - /* - * Allocate pre-registered send and receive buffers for headers and - * any inline data. Also specify any padding which will be provided - * from a preregistered zero buffer. - */ rc = rpcrdma_buffer_create(new_xprt); if (rc) goto out3; - /* - * Register a callback for connection events. This is necessary because - * connection loss notification is async. We also catch connection loss - * when reaping receives. - */ INIT_DELAYED_WORK(&new_xprt->rx_connect_worker, xprt_rdma_connect_worker); - xprt_rdma_format_addresses(xprt, sap); xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt); if (xprt->max_payload == 0) goto out4; @@ -444,16 +424,19 @@ xprt_setup_rdma(struct xprt_create *args) dprintk("RPC: %s: %s:%s\n", __func__, xprt->address_strings[RPC_DISPLAY_ADDR], xprt->address_strings[RPC_DISPLAY_PORT]); + trace_xprtrdma_create(new_xprt); return xprt; out4: - xprt_rdma_free_addresses(xprt); - rc = -EINVAL; + rpcrdma_buffer_destroy(&new_xprt->rx_buf); + rc = -ENODEV; out3: rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia); out2: rpcrdma_ia_close(&new_xprt->rx_ia); out1: + trace_xprtrdma_destroy(new_xprt); + xprt_rdma_free_addresses(xprt); xprt_free(xprt); return ERR_PTR(rc); } @@ -487,16 +470,34 @@ xprt_rdma_close(struct rpc_xprt *xprt) rpcrdma_ep_disconnect(ep, ia); } +/** + * xprt_rdma_set_port - update server port with rpcbind result + * @xprt: controlling RPC transport + * @port: new port value + * + * Transport connect status is unchanged. + */ static void xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port) { - struct sockaddr_in *sap; + struct sockaddr *sap = (struct sockaddr *)&xprt->addr; + char buf[8]; - sap = (struct sockaddr_in *)&xprt->addr; - sap->sin_port = htons(port); - sap = (struct sockaddr_in *)&rpcx_to_rdmad(xprt).addr; - sap->sin_port = htons(port); - dprintk("RPC: %s: %u\n", __func__, port); + dprintk("RPC: %s: setting port for xprt %p (%s:%s) to %u\n", + __func__, xprt, + xprt->address_strings[RPC_DISPLAY_ADDR], + xprt->address_strings[RPC_DISPLAY_PORT], + port); + + rpc_set_port(sap, port); + + kfree(xprt->address_strings[RPC_DISPLAY_PORT]); + snprintf(buf, sizeof(buf), "%u", port); + xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL); + + kfree(xprt->address_strings[RPC_DISPLAY_HEX_PORT]); + snprintf(buf, sizeof(buf), "%4hx", port); + xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL); } /** @@ -515,8 +516,6 @@ xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port) static void xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task) { - dprintk("RPC: %5u %s: xprt = %p\n", task->tk_pid, __func__, xprt); - xprt_force_disconnect(xprt); } @@ -639,7 +638,7 @@ xprt_rdma_allocate(struct rpc_task *task) req = rpcrdma_buffer_get(&r_xprt->rx_buf); if (req == NULL) - return -ENOMEM; + goto out_get; flags = RPCRDMA_DEF_GFP; if (RPC_IS_SWAPPER(task)) @@ -652,18 +651,18 @@ xprt_rdma_allocate(struct rpc_task *task) if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags)) goto out_fail; - dprintk("RPC: %5u %s: send size = %zd, recv size = %zd, req = %p\n", - task->tk_pid, __func__, rqst->rq_callsize, - rqst->rq_rcvsize, req); - + req->rl_cpu = smp_processor_id(); req->rl_connect_cookie = 0; /* our reserved value */ rpcrdma_set_xprtdata(rqst, req); rqst->rq_buffer = req->rl_sendbuf->rg_base; rqst->rq_rbuffer = req->rl_recvbuf->rg_base; + trace_xprtrdma_allocate(task, req); return 0; out_fail: rpcrdma_buffer_put(req); +out_get: + trace_xprtrdma_allocate(task, NULL); return -ENOMEM; } @@ -680,13 +679,9 @@ xprt_rdma_free(struct rpc_task *task) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - if (test_bit(RPCRDMA_REQ_F_BACKCHANNEL, &req->rl_flags)) - return; - - dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); - if (test_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags)) rpcrdma_release_rqst(r_xprt, req); + trace_xprtrdma_rpc_done(task, req); rpcrdma_buffer_put(req); } @@ -696,22 +691,12 @@ xprt_rdma_free(struct rpc_task *task) * * Caller holds the transport's write lock. * - * Return values: - * 0: The request has been sent - * ENOTCONN: Caller needs to invoke connect logic then call again - * ENOBUFS: Call again later to send the request - * EIO: A permanent error occurred. The request was not sent, - * and don't try it again - * - * send_request invokes the meat of RPC RDMA. It must do the following: - * - * 1. Marshal the RPC request into an RPC RDMA request, which means - * putting a header in front of data, and creating IOVs for RDMA - * from those in the request. - * 2. In marshaling, detect opportunities for RDMA, and use them. - * 3. Post a recv message to set up asynch completion, then send - * the request (rpcrdma_ep_post). - * 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP). + * Returns: + * %0 if the RPC message has been sent + * %-ENOTCONN if the caller should reconnect and call again + * %-ENOBUFS if the caller should call again later + * %-EIO if a permanent error occurred and the request was not + * sent. Do not try to send this message again. */ static int xprt_rdma_send_request(struct rpc_task *task) @@ -722,14 +707,14 @@ xprt_rdma_send_request(struct rpc_task *task) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); int rc = 0; +#if defined(CONFIG_SUNRPC_BACKCHANNEL) + if (unlikely(!rqst->rq_buffer)) + return xprt_rdma_bc_send_reply(rqst); +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ + if (!xprt_connected(xprt)) goto drop_connection; - /* On retransmit, remove any previously registered chunks */ - if (unlikely(!list_empty(&req->rl_registered))) - r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, - &req->rl_registered); - rc = rpcrdma_marshal_req(r_xprt, rqst); if (rc < 0) goto failed_marshal; @@ -742,7 +727,7 @@ xprt_rdma_send_request(struct rpc_task *task) goto drop_connection; req->rl_connect_cookie = xprt->connect_cookie; - set_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags); + __set_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags); if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) goto drop_connection; @@ -902,8 +887,7 @@ int xprt_rdma_init(void) "\tMaxInlineRead %d\n\tMaxInlineWrite %d\n", xprt_rdma_slot_table_entries, xprt_rdma_max_inline_read, xprt_rdma_max_inline_write); - dprintk("\tPadding %d\n\tMemreg %d\n", - xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy); + dprintk("\tPadding 0\n\tMemreg %d\n", xprt_rdma_memreg_strategy); #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) if (!sunrpc_table_header) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 710b3f77db82..f4eb63e8e689 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -71,8 +71,8 @@ /* * internal functions */ -static void rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt); -static void rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf); +static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); +static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf); static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb); struct workqueue_struct *rpcrdma_receive_wq __read_mostly; @@ -83,7 +83,7 @@ rpcrdma_alloc_wq(void) struct workqueue_struct *recv_wq; recv_wq = alloc_workqueue("xprtrdma_receive", - WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_HIGHPRI, + WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); if (!recv_wq) return -ENOMEM; @@ -108,7 +108,10 @@ static void rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) { struct rpcrdma_ep *ep = context; + struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt, + rx_ep); + trace_xprtrdma_qp_error(r_xprt, event); pr_err("rpcrdma: %s on device %s ep %p\n", ib_event_msg(event->event), event->device->name, context); @@ -133,6 +136,7 @@ rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) container_of(cqe, struct rpcrdma_sendctx, sc_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ + trace_xprtrdma_wc_send(sc, wc); if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR) pr_err("rpcrdma: Send: %s (%u/0x%x)\n", ib_wc_status_msg(wc->status), @@ -155,13 +159,11 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) rr_cqe); /* WARNING: Only wr_id and status are reliable at this point */ + trace_xprtrdma_wc_receive(rep, wc); if (wc->status != IB_WC_SUCCESS) goto out_fail; /* status == SUCCESS means all fields in wc are trustworthy */ - dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n", - __func__, rep, wc->byte_len); - rpcrdma_set_xdrlen(&rep->rr_hdrbuf, wc->byte_len); rep->rr_wc_flags = wc->wc_flags; rep->rr_inv_rkey = wc->ex.invalidate_rkey; @@ -192,7 +194,6 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt, unsigned int rsize, wsize; /* Default settings for RPC-over-RDMA Version One */ - r_xprt->rx_ia.ri_reminv_expected = false; r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize; rsize = RPCRDMA_V1_DEF_INLINE_SIZE; wsize = RPCRDMA_V1_DEF_INLINE_SIZE; @@ -200,7 +201,6 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt, if (pmsg && pmsg->cp_magic == rpcrdma_cmp_magic && pmsg->cp_version == RPCRDMA_CMP_VERSION) { - r_xprt->rx_ia.ri_reminv_expected = true; r_xprt->rx_ia.ri_implicit_roundup = true; rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size); wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size); @@ -221,11 +221,9 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) struct rpcrdma_xprt *xprt = id->context; struct rpcrdma_ia *ia = &xprt->rx_ia; struct rpcrdma_ep *ep = &xprt->rx_ep; -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr; -#endif int connstate = 0; + trace_xprtrdma_conn_upcall(xprt, event); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: case RDMA_CM_EVENT_ROUTE_RESOLVED: @@ -234,21 +232,17 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) break; case RDMA_CM_EVENT_ADDR_ERROR: ia->ri_async_rc = -EHOSTUNREACH; - dprintk("RPC: %s: CM address resolution error, ep 0x%p\n", - __func__, ep); complete(&ia->ri_done); break; case RDMA_CM_EVENT_ROUTE_ERROR: ia->ri_async_rc = -ENETUNREACH; - dprintk("RPC: %s: CM route resolution error, ep 0x%p\n", - __func__, ep); complete(&ia->ri_done); break; case RDMA_CM_EVENT_DEVICE_REMOVAL: #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - pr_info("rpcrdma: removing device %s for %pIS:%u\n", + pr_info("rpcrdma: removing device %s for %s:%s\n", ia->ri_device->name, - sap, rpc_get_port(sap)); + rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt)); #endif set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags); ep->rep_connected = -ENODEV; @@ -271,8 +265,8 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) connstate = -ENETDOWN; goto connected; case RDMA_CM_EVENT_REJECTED: - dprintk("rpcrdma: connection to %pIS:%u rejected: %s\n", - sap, rpc_get_port(sap), + dprintk("rpcrdma: connection to %s:%s rejected: %s\n", + rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt), rdma_reject_msg(id, event->status)); connstate = -ECONNREFUSED; if (event->status == IB_CM_REJ_STALE_CONN) @@ -287,8 +281,9 @@ connected: wake_up_all(&ep->rep_connect_wait); /*FALLTHROUGH*/ default: - dprintk("RPC: %s: %pIS:%u on %s/%s (ep 0x%p): %s\n", - __func__, sap, rpc_get_port(sap), + dprintk("RPC: %s: %s:%s on %s/%s (ep 0x%p): %s\n", + __func__, + rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt), ia->ri_device->name, ia->ri_ops->ro_displayname, ep, rdma_event_msg(event->event)); break; @@ -298,13 +293,14 @@ connected: } static struct rdma_cm_id * -rpcrdma_create_id(struct rpcrdma_xprt *xprt, - struct rpcrdma_ia *ia, struct sockaddr *addr) +rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia) { unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1; struct rdma_cm_id *id; int rc; + trace_xprtrdma_conn_start(xprt); + init_completion(&ia->ri_done); init_completion(&ia->ri_remove_done); @@ -318,7 +314,9 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, } ia->ri_async_rc = -ETIMEDOUT; - rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT); + rc = rdma_resolve_addr(id, NULL, + (struct sockaddr *)&xprt->rx_xprt.addr, + RDMA_RESOLVE_TIMEOUT); if (rc) { dprintk("RPC: %s: rdma_resolve_addr() failed %i\n", __func__, rc); @@ -326,8 +324,7 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, } rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); if (rc < 0) { - dprintk("RPC: %s: wait() exited: %i\n", - __func__, rc); + trace_xprtrdma_conn_tout(xprt); goto out; } @@ -344,8 +341,7 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, } rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); if (rc < 0) { - dprintk("RPC: %s: wait() exited: %i\n", - __func__, rc); + trace_xprtrdma_conn_tout(xprt); goto out; } rc = ia->ri_async_rc; @@ -365,19 +361,18 @@ out: /** * rpcrdma_ia_open - Open and initialize an Interface Adapter. - * @xprt: controlling transport - * @addr: IP address of remote peer + * @xprt: transport with IA to (re)initialize * * Returns 0 on success, negative errno if an appropriate * Interface Adapter could not be found and opened. */ int -rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr) +rpcrdma_ia_open(struct rpcrdma_xprt *xprt) { struct rpcrdma_ia *ia = &xprt->rx_ia; int rc; - ia->ri_id = rpcrdma_create_id(xprt, ia, addr); + ia->ri_id = rpcrdma_create_id(xprt, ia); if (IS_ERR(ia->ri_id)) { rc = PTR_ERR(ia->ri_id); goto out_err; @@ -392,7 +387,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr) } switch (xprt_rdma_memreg_strategy) { - case RPCRDMA_FRMR: + case RPCRDMA_FRWR: if (frwr_is_supported(ia)) { ia->ri_ops = &rpcrdma_frwr_memreg_ops; break; @@ -462,10 +457,12 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) rpcrdma_dma_unmap_regbuf(req->rl_sendbuf); rpcrdma_dma_unmap_regbuf(req->rl_recvbuf); } - rpcrdma_destroy_mrs(buf); + rpcrdma_mrs_destroy(buf); /* Allow waiters to continue */ complete(&ia->ri_remove_done); + + trace_xprtrdma_remove(r_xprt); } /** @@ -476,7 +473,6 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) void rpcrdma_ia_close(struct rpcrdma_ia *ia) { - dprintk("RPC: %s: entering\n", __func__); if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) { if (ia->ri_id->qp) rdma_destroy_qp(ia->ri_id); @@ -630,9 +626,6 @@ out1: void rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { - dprintk("RPC: %s: entering, connected is %d\n", - __func__, ep->rep_connected); - cancel_delayed_work_sync(&ep->rep_connect_worker); if (ia->ri_id->qp) { @@ -653,13 +646,12 @@ static int rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { - struct sockaddr *sap = (struct sockaddr *)&r_xprt->rx_data.addr; int rc, err; - pr_info("%s: r_xprt = %p\n", __func__, r_xprt); + trace_xprtrdma_reinsert(r_xprt); rc = -EHOSTUNREACH; - if (rpcrdma_ia_open(r_xprt, sap)) + if (rpcrdma_ia_open(r_xprt)) goto out1; rc = -ENOMEM; @@ -676,7 +668,7 @@ rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, goto out3; } - rpcrdma_create_mrs(r_xprt); + rpcrdma_mrs_create(r_xprt); return 0; out3: @@ -691,16 +683,15 @@ static int rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { - struct sockaddr *sap = (struct sockaddr *)&r_xprt->rx_data.addr; struct rdma_cm_id *id, *old; int err, rc; - dprintk("RPC: %s: reconnecting...\n", __func__); + trace_xprtrdma_reconnect(r_xprt); rpcrdma_ep_disconnect(ep, ia); rc = -EHOSTUNREACH; - id = rpcrdma_create_id(r_xprt, ia, sap); + id = rpcrdma_create_id(r_xprt, ia); if (IS_ERR(id)) goto out; @@ -817,16 +808,14 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) int rc; rc = rdma_disconnect(ia->ri_id); - if (!rc) { + if (!rc) /* returns without wait if not connected */ wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 1); - dprintk("RPC: %s: after wait, %sconnected\n", __func__, - (ep->rep_connected == 1) ? "still " : "dis"); - } else { - dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc); + else ep->rep_connected = rc; - } + trace_xprtrdma_disconnect(container_of(ep, struct rpcrdma_xprt, + rx_ep), rc); ib_drain_qp(ia->ri_id->qp); } @@ -998,15 +987,15 @@ rpcrdma_mr_recovery_worker(struct work_struct *work) { struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, rb_recovery_worker.work); - struct rpcrdma_mw *mw; + struct rpcrdma_mr *mr; spin_lock(&buf->rb_recovery_lock); while (!list_empty(&buf->rb_stale_mrs)) { - mw = rpcrdma_pop_mw(&buf->rb_stale_mrs); + mr = rpcrdma_mr_pop(&buf->rb_stale_mrs); spin_unlock(&buf->rb_recovery_lock); - dprintk("RPC: %s: recovering MR %p\n", __func__, mw); - mw->mw_xprt->rx_ia.ri_ops->ro_recover_mr(mw); + trace_xprtrdma_recover_mr(mr); + mr->mr_xprt->rx_ia.ri_ops->ro_recover_mr(mr); spin_lock(&buf->rb_recovery_lock); } @@ -1014,20 +1003,20 @@ rpcrdma_mr_recovery_worker(struct work_struct *work) } void -rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw) +rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr) { - struct rpcrdma_xprt *r_xprt = mw->mw_xprt; + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; struct rpcrdma_buffer *buf = &r_xprt->rx_buf; spin_lock(&buf->rb_recovery_lock); - rpcrdma_push_mw(mw, &buf->rb_stale_mrs); + rpcrdma_mr_push(mr, &buf->rb_stale_mrs); spin_unlock(&buf->rb_recovery_lock); schedule_delayed_work(&buf->rb_recovery_worker, 0); } static void -rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt) +rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_ia *ia = &r_xprt->rx_ia; @@ -1036,32 +1025,32 @@ rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt) LIST_HEAD(all); for (count = 0; count < 32; count++) { - struct rpcrdma_mw *mw; + struct rpcrdma_mr *mr; int rc; - mw = kzalloc(sizeof(*mw), GFP_KERNEL); - if (!mw) + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) break; - rc = ia->ri_ops->ro_init_mr(ia, mw); + rc = ia->ri_ops->ro_init_mr(ia, mr); if (rc) { - kfree(mw); + kfree(mr); break; } - mw->mw_xprt = r_xprt; + mr->mr_xprt = r_xprt; - list_add(&mw->mw_list, &free); - list_add(&mw->mw_all, &all); + list_add(&mr->mr_list, &free); + list_add(&mr->mr_all, &all); } - spin_lock(&buf->rb_mwlock); - list_splice(&free, &buf->rb_mws); + spin_lock(&buf->rb_mrlock); + list_splice(&free, &buf->rb_mrs); list_splice(&all, &buf->rb_all); r_xprt->rx_stats.mrs_allocated += count; - spin_unlock(&buf->rb_mwlock); + spin_unlock(&buf->rb_mrlock); - dprintk("RPC: %s: created %u MRs\n", __func__, count); + trace_xprtrdma_createmrs(r_xprt, count); } static void @@ -1072,7 +1061,7 @@ rpcrdma_mr_refresh_worker(struct work_struct *work) struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, rx_buf); - rpcrdma_create_mrs(r_xprt); + rpcrdma_mrs_create(r_xprt); } struct rpcrdma_req * @@ -1093,10 +1082,17 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) return req; } -struct rpcrdma_rep * +/** + * rpcrdma_create_rep - Allocate an rpcrdma_rep object + * @r_xprt: controlling transport + * + * Returns 0 on success or a negative errno on failure. + */ +int rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_rep *rep; int rc; @@ -1121,12 +1117,18 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) rep->rr_recv_wr.wr_cqe = &rep->rr_cqe; rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; rep->rr_recv_wr.num_sge = 1; - return rep; + + spin_lock(&buf->rb_lock); + list_add(&rep->rr_list, &buf->rb_recv_bufs); + spin_unlock(&buf->rb_lock); + return 0; out_free: kfree(rep); out: - return ERR_PTR(rc); + dprintk("RPC: %s: reply buffer %d alloc failed\n", + __func__, rc); + return rc; } int @@ -1137,10 +1139,10 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) buf->rb_max_requests = r_xprt->rx_data.max_requests; buf->rb_bc_srv_max_requests = 0; - spin_lock_init(&buf->rb_mwlock); + spin_lock_init(&buf->rb_mrlock); spin_lock_init(&buf->rb_lock); spin_lock_init(&buf->rb_recovery_lock); - INIT_LIST_HEAD(&buf->rb_mws); + INIT_LIST_HEAD(&buf->rb_mrs); INIT_LIST_HEAD(&buf->rb_all); INIT_LIST_HEAD(&buf->rb_stale_mrs); INIT_DELAYED_WORK(&buf->rb_refresh_worker, @@ -1148,7 +1150,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) INIT_DELAYED_WORK(&buf->rb_recovery_worker, rpcrdma_mr_recovery_worker); - rpcrdma_create_mrs(r_xprt); + rpcrdma_mrs_create(r_xprt); INIT_LIST_HEAD(&buf->rb_send_bufs); INIT_LIST_HEAD(&buf->rb_allreqs); @@ -1167,17 +1169,10 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) } INIT_LIST_HEAD(&buf->rb_recv_bufs); - for (i = 0; i < buf->rb_max_requests + RPCRDMA_MAX_BC_REQUESTS; i++) { - struct rpcrdma_rep *rep; - - rep = rpcrdma_create_rep(r_xprt); - if (IS_ERR(rep)) { - dprintk("RPC: %s: reply buffer %d alloc failed\n", - __func__, i); - rc = PTR_ERR(rep); + for (i = 0; i <= buf->rb_max_requests; i++) { + rc = rpcrdma_create_rep(r_xprt); + if (rc) goto out; - } - list_add(&rep->rr_list, &buf->rb_recv_bufs); } rc = rpcrdma_sendctxs_create(r_xprt); @@ -1229,26 +1224,26 @@ rpcrdma_destroy_req(struct rpcrdma_req *req) } static void -rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf) +rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) { struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, rx_buf); struct rpcrdma_ia *ia = rdmab_to_ia(buf); - struct rpcrdma_mw *mw; + struct rpcrdma_mr *mr; unsigned int count; count = 0; - spin_lock(&buf->rb_mwlock); + spin_lock(&buf->rb_mrlock); while (!list_empty(&buf->rb_all)) { - mw = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); - list_del(&mw->mw_all); + mr = list_entry(buf->rb_all.next, struct rpcrdma_mr, mr_all); + list_del(&mr->mr_all); - spin_unlock(&buf->rb_mwlock); - ia->ri_ops->ro_release_mr(mw); + spin_unlock(&buf->rb_mrlock); + ia->ri_ops->ro_release_mr(mr); count++; - spin_lock(&buf->rb_mwlock); + spin_lock(&buf->rb_mrlock); } - spin_unlock(&buf->rb_mwlock); + spin_unlock(&buf->rb_mrlock); r_xprt->rx_stats.mrs_allocated = 0; dprintk("RPC: %s: released %u MRs\n", __func__, count); @@ -1285,27 +1280,33 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) spin_unlock(&buf->rb_reqslock); buf->rb_recv_count = 0; - rpcrdma_destroy_mrs(buf); + rpcrdma_mrs_destroy(buf); } -struct rpcrdma_mw * -rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt) +/** + * rpcrdma_mr_get - Allocate an rpcrdma_mr object + * @r_xprt: controlling transport + * + * Returns an initialized rpcrdma_mr or NULL if no free + * rpcrdma_mr objects are available. + */ +struct rpcrdma_mr * +rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_mw *mw = NULL; + struct rpcrdma_mr *mr = NULL; - spin_lock(&buf->rb_mwlock); - if (!list_empty(&buf->rb_mws)) - mw = rpcrdma_pop_mw(&buf->rb_mws); - spin_unlock(&buf->rb_mwlock); + spin_lock(&buf->rb_mrlock); + if (!list_empty(&buf->rb_mrs)) + mr = rpcrdma_mr_pop(&buf->rb_mrs); + spin_unlock(&buf->rb_mrlock); - if (!mw) - goto out_nomws; - mw->mw_flags = 0; - return mw; + if (!mr) + goto out_nomrs; + return mr; -out_nomws: - dprintk("RPC: %s: no MWs available\n", __func__); +out_nomrs: + trace_xprtrdma_nomrs(r_xprt); if (r_xprt->rx_ep.rep_connected != -ENODEV) schedule_delayed_work(&buf->rb_refresh_worker, 0); @@ -1315,14 +1316,39 @@ out_nomws: return NULL; } +static void +__rpcrdma_mr_put(struct rpcrdma_buffer *buf, struct rpcrdma_mr *mr) +{ + spin_lock(&buf->rb_mrlock); + rpcrdma_mr_push(mr, &buf->rb_mrs); + spin_unlock(&buf->rb_mrlock); +} + +/** + * rpcrdma_mr_put - Release an rpcrdma_mr object + * @mr: object to release + * + */ void -rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw) +rpcrdma_mr_put(struct rpcrdma_mr *mr) { - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + __rpcrdma_mr_put(&mr->mr_xprt->rx_buf, mr); +} + +/** + * rpcrdma_mr_unmap_and_put - DMA unmap an MR and release it + * @mr: object to release + * + */ +void +rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr) +{ + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - spin_lock(&buf->rb_mwlock); - rpcrdma_push_mw(mw, &buf->rb_mws); - spin_unlock(&buf->rb_mwlock); + trace_xprtrdma_dma_unmap(mr); + ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + mr->mr_sg, mr->mr_nents, mr->mr_dir); + __rpcrdma_mr_put(&r_xprt->rx_buf, mr); } static struct rpcrdma_rep * @@ -1359,11 +1385,11 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) req = rpcrdma_buffer_get_req_locked(buffers); req->rl_reply = rpcrdma_buffer_get_rep(buffers); spin_unlock(&buffers->rb_lock); + return req; out_reqbuf: spin_unlock(&buffers->rb_lock); - pr_warn("RPC: %s: out of request buffers\n", __func__); return NULL; } @@ -1519,9 +1545,6 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, req->rl_reply = NULL; } - dprintk("RPC: %s: posting %d s/g entries\n", - __func__, send_wr->num_sge); - if (!ep->rep_send_count || test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) { send_wr->send_flags |= IB_SEND_SIGNALED; @@ -1530,14 +1553,12 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, send_wr->send_flags &= ~IB_SEND_SIGNALED; --ep->rep_send_count; } + rc = ib_post_send(ia->ri_id->qp, send_wr, &send_wr_fail); + trace_xprtrdma_post_send(req, rc); if (rc) - goto out_postsend_err; + return -ENOTCONN; return 0; - -out_postsend_err: - pr_err("rpcrdma: RDMA Send ib_post_send returned %i\n", rc); - return -ENOTCONN; } int @@ -1550,23 +1571,20 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, if (!rpcrdma_dma_map_regbuf(ia, rep->rr_rdmabuf)) goto out_map; rc = ib_post_recv(ia->ri_id->qp, &rep->rr_recv_wr, &recv_wr_fail); + trace_xprtrdma_post_recv(rep, rc); if (rc) - goto out_postrecv; + return -ENOTCONN; return 0; out_map: pr_err("rpcrdma: failed to DMA map the Receive buffer\n"); return -EIO; - -out_postrecv: - pr_err("rpcrdma: ib_post_recv returned %i\n", rc); - return -ENOTCONN; } /** * rpcrdma_ep_post_extra_recv - Post buffers for incoming backchannel requests * @r_xprt: transport associated with these backchannel resources - * @min_reqs: minimum number of incoming requests expected + * @count: minimum number of incoming requests expected * * Returns zero if all requested buffers were posted, or a negative errno. */ @@ -1594,7 +1612,7 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count) out_reqbuf: spin_unlock(&buffers->rb_lock); - pr_warn("%s: no extra receive buffers\n", __func__); + trace_xprtrdma_noreps(r_xprt); return -ENOMEM; out_rc: diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 51686d9eac5f..69883a960a3f 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -73,11 +73,10 @@ struct rpcrdma_ia { struct completion ri_remove_done; int ri_async_rc; unsigned int ri_max_segs; - unsigned int ri_max_frmr_depth; + unsigned int ri_max_frwr_depth; unsigned int ri_max_inline_write; unsigned int ri_max_inline_read; unsigned int ri_max_send_sges; - bool ri_reminv_expected; bool ri_implicit_roundup; enum ib_mr_type ri_mrtype; unsigned long ri_flags; @@ -101,7 +100,6 @@ struct rpcrdma_ep { wait_queue_head_t rep_connect_wait; struct rpcrdma_connect_private rep_cm_private; struct rdma_conn_param rep_remote_cma; - struct sockaddr_storage rep_remote_addr; struct delayed_work rep_connect_worker; }; @@ -232,29 +230,29 @@ enum { }; /* - * struct rpcrdma_mw - external memory region metadata + * struct rpcrdma_mr - external memory region metadata * * An external memory region is any buffer or page that is registered * on the fly (ie, not pre-registered). * - * Each rpcrdma_buffer has a list of free MWs anchored in rb_mws. During + * Each rpcrdma_buffer has a list of free MWs anchored in rb_mrs. During * call_allocate, rpcrdma_buffer_get() assigns one to each segment in * an rpcrdma_req. Then rpcrdma_register_external() grabs these to keep * track of registration metadata while each RPC is pending. * rpcrdma_deregister_external() uses this metadata to unmap and * release these resources when an RPC is complete. */ -enum rpcrdma_frmr_state { - FRMR_IS_INVALID, /* ready to be used */ - FRMR_IS_VALID, /* in use */ - FRMR_FLUSHED_FR, /* flushed FASTREG WR */ - FRMR_FLUSHED_LI, /* flushed LOCALINV WR */ +enum rpcrdma_frwr_state { + FRWR_IS_INVALID, /* ready to be used */ + FRWR_IS_VALID, /* in use */ + FRWR_FLUSHED_FR, /* flushed FASTREG WR */ + FRWR_FLUSHED_LI, /* flushed LOCALINV WR */ }; -struct rpcrdma_frmr { +struct rpcrdma_frwr { struct ib_mr *fr_mr; struct ib_cqe fr_cqe; - enum rpcrdma_frmr_state fr_state; + enum rpcrdma_frwr_state fr_state; struct completion fr_linv_done; union { struct ib_reg_wr fr_regwr; @@ -267,26 +265,20 @@ struct rpcrdma_fmr { u64 *fm_physaddrs; }; -struct rpcrdma_mw { - struct list_head mw_list; - struct scatterlist *mw_sg; - int mw_nents; - enum dma_data_direction mw_dir; - unsigned long mw_flags; +struct rpcrdma_mr { + struct list_head mr_list; + struct scatterlist *mr_sg; + int mr_nents; + enum dma_data_direction mr_dir; union { struct rpcrdma_fmr fmr; - struct rpcrdma_frmr frmr; + struct rpcrdma_frwr frwr; }; - struct rpcrdma_xprt *mw_xprt; - u32 mw_handle; - u32 mw_length; - u64 mw_offset; - struct list_head mw_all; -}; - -/* mw_flags */ -enum { - RPCRDMA_MW_F_RI = 1, + struct rpcrdma_xprt *mr_xprt; + u32 mr_handle; + u32 mr_length; + u64 mr_offset; + struct list_head mr_all; }; /* @@ -342,6 +334,7 @@ enum { struct rpcrdma_buffer; struct rpcrdma_req { struct list_head rl_list; + int rl_cpu; unsigned int rl_connect_cookie; struct rpcrdma_buffer *rl_buffer; struct rpcrdma_rep *rl_reply; @@ -361,8 +354,7 @@ struct rpcrdma_req { /* rl_flags */ enum { - RPCRDMA_REQ_F_BACKCHANNEL = 0, - RPCRDMA_REQ_F_PENDING, + RPCRDMA_REQ_F_PENDING = 0, RPCRDMA_REQ_F_TX_RESOURCES, }; @@ -373,25 +365,25 @@ rpcrdma_set_xprtdata(struct rpc_rqst *rqst, struct rpcrdma_req *req) } static inline struct rpcrdma_req * -rpcr_to_rdmar(struct rpc_rqst *rqst) +rpcr_to_rdmar(const struct rpc_rqst *rqst) { return rqst->rq_xprtdata; } static inline void -rpcrdma_push_mw(struct rpcrdma_mw *mw, struct list_head *list) +rpcrdma_mr_push(struct rpcrdma_mr *mr, struct list_head *list) { - list_add_tail(&mw->mw_list, list); + list_add_tail(&mr->mr_list, list); } -static inline struct rpcrdma_mw * -rpcrdma_pop_mw(struct list_head *list) +static inline struct rpcrdma_mr * +rpcrdma_mr_pop(struct list_head *list) { - struct rpcrdma_mw *mw; + struct rpcrdma_mr *mr; - mw = list_first_entry(list, struct rpcrdma_mw, mw_list); - list_del(&mw->mw_list); - return mw; + mr = list_first_entry(list, struct rpcrdma_mr, mr_list); + list_del(&mr->mr_list); + return mr; } /* @@ -401,8 +393,8 @@ rpcrdma_pop_mw(struct list_head *list) * One of these is associated with a transport instance */ struct rpcrdma_buffer { - spinlock_t rb_mwlock; /* protect rb_mws list */ - struct list_head rb_mws; + spinlock_t rb_mrlock; /* protect rb_mrs list */ + struct list_head rb_mrs; struct list_head rb_all; unsigned long rb_sc_head; @@ -437,13 +429,11 @@ struct rpcrdma_buffer { * This data should be set with mount options */ struct rpcrdma_create_data_internal { - struct sockaddr_storage addr; /* RDMA server address */ unsigned int max_requests; /* max requests (slots) in flight */ unsigned int rsize; /* mount rsize - max read hdr+data */ unsigned int wsize; /* mount wsize - max write hdr+data */ unsigned int inline_rsize; /* max non-rdma read data payload */ unsigned int inline_wsize; /* max non-rdma write data payload */ - unsigned int padding; /* non-rdma write header padding */ }; /* @@ -483,17 +473,19 @@ struct rpcrdma_memreg_ops { struct rpcrdma_mr_seg * (*ro_map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool, - struct rpcrdma_mw **); + struct rpcrdma_mr **); + void (*ro_reminv)(struct rpcrdma_rep *rep, + struct list_head *mrs); void (*ro_unmap_sync)(struct rpcrdma_xprt *, struct list_head *); - void (*ro_recover_mr)(struct rpcrdma_mw *); + void (*ro_recover_mr)(struct rpcrdma_mr *mr); int (*ro_open)(struct rpcrdma_ia *, struct rpcrdma_ep *, struct rpcrdma_create_data_internal *); size_t (*ro_maxpages)(struct rpcrdma_xprt *); int (*ro_init_mr)(struct rpcrdma_ia *, - struct rpcrdma_mw *); - void (*ro_release_mr)(struct rpcrdma_mw *); + struct rpcrdma_mr *); + void (*ro_release_mr)(struct rpcrdma_mr *mr); const char *ro_displayname; const int ro_send_w_inv_ok; }; @@ -524,6 +516,18 @@ struct rpcrdma_xprt { #define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, rx_xprt) #define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data) +static inline const char * +rpcrdma_addrstr(const struct rpcrdma_xprt *r_xprt) +{ + return r_xprt->rx_xprt.address_strings[RPC_DISPLAY_ADDR]; +} + +static inline const char * +rpcrdma_portstr(const struct rpcrdma_xprt *r_xprt) +{ + return r_xprt->rx_xprt.address_strings[RPC_DISPLAY_PORT]; +} + /* Setting this to 0 ensures interoperability with early servers. * Setting this to 1 enhances certain unaligned read/write performance. * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */ @@ -537,7 +541,7 @@ extern unsigned int xprt_rdma_memreg_strategy; /* * Interface Adapter calls - xprtrdma/verbs.c */ -int rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr); +int rpcrdma_ia_open(struct rpcrdma_xprt *xprt); void rpcrdma_ia_remove(struct rpcrdma_ia *ia); void rpcrdma_ia_close(struct rpcrdma_ia *); bool frwr_is_supported(struct rpcrdma_ia *); @@ -563,22 +567,23 @@ int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_rep *); * Buffer calls - xprtrdma/verbs.c */ struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *); -struct rpcrdma_rep *rpcrdma_create_rep(struct rpcrdma_xprt *); void rpcrdma_destroy_req(struct rpcrdma_req *); +int rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt); int rpcrdma_buffer_create(struct rpcrdma_xprt *); void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf); void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc); -struct rpcrdma_mw *rpcrdma_get_mw(struct rpcrdma_xprt *); -void rpcrdma_put_mw(struct rpcrdma_xprt *, struct rpcrdma_mw *); +struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); +void rpcrdma_mr_put(struct rpcrdma_mr *mr); +void rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr); +void rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr); + struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); void rpcrdma_buffer_put(struct rpcrdma_req *); void rpcrdma_recv_buffer_get(struct rpcrdma_req *); void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); -void rpcrdma_defer_mr_recovery(struct rpcrdma_mw *); - struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(size_t, enum dma_data_direction, gfp_t); bool __rpcrdma_dma_map_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *); @@ -662,7 +667,7 @@ int xprt_rdma_bc_up(struct svc_serv *, struct net *); size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *); int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int); void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *); -int rpcrdma_bc_marshal_reply(struct rpc_rqst *); +int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst); void xprt_rdma_bc_free_rqst(struct rpc_rqst *); void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int); #endif /* CONFIG_SUNRPC_BACKCHANNEL */ @@ -670,3 +675,5 @@ void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int); extern struct xprt_class xprt_rdma_bc; #endif /* _LINUX_SUNRPC_XPRT_RDMA_H */ + +#include <trace/events/rpcrdma.h> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 9cc850c2719e..18803021f242 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -52,6 +52,8 @@ #include "sunrpc.h" +#define RPC_TCP_READ_CHUNK_SZ (3*512*1024) + static void xs_close(struct rpc_xprt *xprt); static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, struct socket *sock); @@ -1003,6 +1005,7 @@ static void xs_local_data_receive(struct sock_xprt *transport) struct sock *sk; int err; +restart: mutex_lock(&transport->recv_mutex); sk = transport->inet; if (sk == NULL) @@ -1016,6 +1019,11 @@ static void xs_local_data_receive(struct sock_xprt *transport) } if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) break; + if (need_resched()) { + mutex_unlock(&transport->recv_mutex); + cond_resched(); + goto restart; + } } out: mutex_unlock(&transport->recv_mutex); @@ -1094,6 +1102,7 @@ static void xs_udp_data_receive(struct sock_xprt *transport) struct sock *sk; int err; +restart: mutex_lock(&transport->recv_mutex); sk = transport->inet; if (sk == NULL) @@ -1107,6 +1116,11 @@ static void xs_udp_data_receive(struct sock_xprt *transport) } if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) break; + if (need_resched()) { + mutex_unlock(&transport->recv_mutex); + cond_resched(); + goto restart; + } } out: mutex_unlock(&transport->recv_mutex); @@ -1479,6 +1493,7 @@ static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, uns .offset = offset, .count = len, }; + size_t ret; dprintk("RPC: xs_tcp_data_recv started\n"); do { @@ -1507,9 +1522,14 @@ static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, uns /* Skip over any trailing bytes on short reads */ xs_tcp_read_discard(transport, &desc); } while (desc.count); + ret = len - desc.count; + if (ret < rd_desc->count) + rd_desc->count -= ret; + else + rd_desc->count = 0; trace_xs_tcp_data_recv(transport); dprintk("RPC: xs_tcp_data_recv done\n"); - return len - desc.count; + return ret; } static void xs_tcp_data_receive(struct sock_xprt *transport) @@ -1517,30 +1537,34 @@ static void xs_tcp_data_receive(struct sock_xprt *transport) struct rpc_xprt *xprt = &transport->xprt; struct sock *sk; read_descriptor_t rd_desc = { - .count = 2*1024*1024, .arg.data = xprt, }; unsigned long total = 0; - int loop; int read = 0; +restart: mutex_lock(&transport->recv_mutex); sk = transport->inet; if (sk == NULL) goto out; /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */ - for (loop = 0; loop < 64; loop++) { + for (;;) { + rd_desc.count = RPC_TCP_READ_CHUNK_SZ; lock_sock(sk); read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); - if (read <= 0) { + if (rd_desc.count != 0 || read < 0) { clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); release_sock(sk); break; } release_sock(sk); total += read; - rd_desc.count = 65536; + if (need_resched()) { + mutex_unlock(&transport->recv_mutex); + cond_resched(); + goto restart; + } } if (test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) queue_work(xprtiod_workqueue, &transport->recv_worker); @@ -2440,7 +2464,9 @@ static void xs_tcp_setup_socket(struct work_struct *work) */ case -ECONNREFUSED: case -ECONNRESET: + case -ENETDOWN: case -ENETUNREACH: + case -EHOSTUNREACH: case -EADDRINUSE: case -ENOBUFS: /* diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 329325bd553e..37892b3909af 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -1,7 +1,7 @@ /* * net/tipc/bcast.c: TIPC broadcast code * - * Copyright (c) 2004-2006, 2014-2016, Ericsson AB + * Copyright (c) 2004-2006, 2014-2017, Ericsson AB * Copyright (c) 2004, Intel Corporation. * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. @@ -42,8 +42,8 @@ #include "link.h" #include "name_table.h" -#define BCLINK_WIN_DEFAULT 50 /* bcast link window size (default) */ -#define BCLINK_WIN_MIN 32 /* bcast minimum link window size */ +#define BCLINK_WIN_DEFAULT 50 /* bcast link window size (default) */ +#define BCLINK_WIN_MIN 32 /* bcast minimum link window size */ const char tipc_bclink_name[] = "broadcast-link"; @@ -74,6 +74,10 @@ static struct tipc_bc_base *tipc_bc_base(struct net *net) return tipc_net(net)->bcbase; } +/* tipc_bcast_get_mtu(): -get the MTU currently used by broadcast link + * Note: the MTU is decremented to give room for a tunnel header, in + * case the message needs to be sent as replicast + */ int tipc_bcast_get_mtu(struct net *net) { return tipc_link_mtu(tipc_bc_sndlink(net)) - INT_H_SIZE; @@ -515,7 +519,7 @@ int tipc_bcast_init(struct net *net) spin_lock_init(&tipc_net(net)->bclock); if (!tipc_link_bc_create(net, 0, 0, - U16_MAX, + FB_MTU, BCLINK_WIN_DEFAULT, 0, &bb->inputq, diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 47ec121574ce..c8001471da6c 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -324,6 +324,7 @@ restart: if (res) { pr_warn("Bearer <%s> rejected, enable failure (%d)\n", name, -res); + kfree(b); return -EINVAL; } @@ -347,8 +348,10 @@ restart: if (skb) tipc_bearer_xmit_skb(net, bearer_id, skb, &b->bcast_addr); - if (tipc_mon_create(net, bearer_id)) + if (tipc_mon_create(net, bearer_id)) { + bearer_disable(net, b); return -ENOMEM; + } pr_info("Enabled bearer <%s>, discovery domain %s, priority %u\n", name, diff --git a/net/tipc/core.h b/net/tipc/core.h index 964342689f2c..20b21af2ff14 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -49,7 +49,6 @@ #include <linux/uaccess.h> #include <linux/interrupt.h> #include <linux/atomic.h> -#include <asm/hardirq.h> #include <linux/netdevice.h> #include <linux/in.h> #include <linux/list.h> diff --git a/net/tipc/group.c b/net/tipc/group.c index 12777cac638a..122162a31816 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -49,8 +49,6 @@ #define ADV_ACTIVE (ADV_UNIT * 12) enum mbr_state { - MBR_QUARANTINED, - MBR_DISCOVERED, MBR_JOINING, MBR_PUBLISHED, MBR_JOINED, @@ -64,8 +62,7 @@ enum mbr_state { struct tipc_member { struct rb_node tree_node; struct list_head list; - struct list_head congested; - struct sk_buff *event_msg; + struct list_head small_win; struct sk_buff_head deferredq; struct tipc_group *group; u32 node; @@ -77,21 +74,18 @@ struct tipc_member { u16 bc_rcv_nxt; u16 bc_syncpt; u16 bc_acked; - bool usr_pending; }; struct tipc_group { struct rb_root members; - struct list_head congested; + struct list_head small_win; struct list_head pending; struct list_head active; - struct list_head reclaiming; struct tipc_nlist dests; struct net *net; int subid; u32 type; u32 instance; - u32 domain; u32 scope; u32 portid; u16 member_cnt; @@ -99,6 +93,7 @@ struct tipc_group { u16 max_active; u16 bc_snd_nxt; u16 bc_ackers; + bool *open; bool loopback; bool events; }; @@ -106,10 +101,21 @@ struct tipc_group { static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, int mtyp, struct sk_buff_head *xmitq); +static void tipc_group_open(struct tipc_member *m, bool *wakeup) +{ + *wakeup = false; + if (list_empty(&m->small_win)) + return; + list_del_init(&m->small_win); + *m->group->open = true; + *wakeup = true; +} + static void tipc_group_decr_active(struct tipc_group *grp, struct tipc_member *m) { - if (m->state == MBR_ACTIVE || m->state == MBR_RECLAIMING) + if (m->state == MBR_ACTIVE || m->state == MBR_RECLAIMING || + m->state == MBR_REMITTED) grp->active_cnt--; } @@ -136,14 +142,14 @@ u16 tipc_group_bc_snd_nxt(struct tipc_group *grp) return grp->bc_snd_nxt; } -static bool tipc_group_is_enabled(struct tipc_member *m) +static bool tipc_group_is_receiver(struct tipc_member *m) { - return m->state != MBR_QUARANTINED && m->state != MBR_LEAVING; + return m && m->state != MBR_JOINING && m->state != MBR_LEAVING; } -static bool tipc_group_is_receiver(struct tipc_member *m) +static bool tipc_group_is_sender(struct tipc_member *m) { - return m && m->state >= MBR_JOINED; + return m && m->state != MBR_JOINING && m->state != MBR_PUBLISHED; } u32 tipc_group_exclude(struct tipc_group *grp) @@ -159,8 +165,11 @@ int tipc_group_size(struct tipc_group *grp) } struct tipc_group *tipc_group_create(struct net *net, u32 portid, - struct tipc_group_req *mreq) + struct tipc_group_req *mreq, + bool *group_is_open) { + u32 filter = TIPC_SUB_PORTS | TIPC_SUB_NO_STATUS; + bool global = mreq->scope != TIPC_NODE_SCOPE; struct tipc_group *grp; u32 type = mreq->type; @@ -168,25 +177,41 @@ struct tipc_group *tipc_group_create(struct net *net, u32 portid, if (!grp) return NULL; tipc_nlist_init(&grp->dests, tipc_own_addr(net)); - INIT_LIST_HEAD(&grp->congested); + INIT_LIST_HEAD(&grp->small_win); INIT_LIST_HEAD(&grp->active); INIT_LIST_HEAD(&grp->pending); - INIT_LIST_HEAD(&grp->reclaiming); grp->members = RB_ROOT; grp->net = net; grp->portid = portid; - grp->domain = addr_domain(net, mreq->scope); grp->type = type; grp->instance = mreq->instance; grp->scope = mreq->scope; grp->loopback = mreq->flags & TIPC_GROUP_LOOPBACK; grp->events = mreq->flags & TIPC_GROUP_MEMBER_EVTS; - if (tipc_topsrv_kern_subscr(net, portid, type, 0, ~0, &grp->subid)) + grp->open = group_is_open; + filter |= global ? TIPC_SUB_CLUSTER_SCOPE : TIPC_SUB_NODE_SCOPE; + if (tipc_topsrv_kern_subscr(net, portid, type, 0, ~0, + filter, &grp->subid)) return grp; kfree(grp); return NULL; } +void tipc_group_join(struct net *net, struct tipc_group *grp, int *sk_rcvbuf) +{ + struct rb_root *tree = &grp->members; + struct tipc_member *m, *tmp; + struct sk_buff_head xmitq; + + skb_queue_head_init(&xmitq); + rbtree_postorder_for_each_entry_safe(m, tmp, tree, tree_node) { + tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, &xmitq); + tipc_group_update_member(m, 0); + } + tipc_node_distr_xmit(net, &xmitq); + *sk_rcvbuf = tipc_group_rcvbuf_limit(grp); +} + void tipc_group_delete(struct net *net, struct tipc_group *grp) { struct rb_root *tree = &grp->members; @@ -232,7 +257,7 @@ static struct tipc_member *tipc_group_find_dest(struct tipc_group *grp, struct tipc_member *m; m = tipc_group_find_member(grp, node, port); - if (m && tipc_group_is_enabled(m)) + if (m && tipc_group_is_receiver(m)) return m; return NULL; } @@ -277,7 +302,7 @@ static void tipc_group_add_to_tree(struct tipc_group *grp, static struct tipc_member *tipc_group_create_member(struct tipc_group *grp, u32 node, u32 port, - int state) + u32 instance, int state) { struct tipc_member *m; @@ -285,11 +310,12 @@ static struct tipc_member *tipc_group_create_member(struct tipc_group *grp, if (!m) return NULL; INIT_LIST_HEAD(&m->list); - INIT_LIST_HEAD(&m->congested); + INIT_LIST_HEAD(&m->small_win); __skb_queue_head_init(&m->deferredq); m->group = grp; m->node = node; m->port = port; + m->instance = instance; m->bc_acked = grp->bc_snd_nxt - 1; grp->member_cnt++; tipc_group_add_to_tree(grp, m); @@ -298,9 +324,10 @@ static struct tipc_member *tipc_group_create_member(struct tipc_group *grp, return m; } -void tipc_group_add_member(struct tipc_group *grp, u32 node, u32 port) +void tipc_group_add_member(struct tipc_group *grp, u32 node, + u32 port, u32 instance) { - tipc_group_create_member(grp, node, port, MBR_DISCOVERED); + tipc_group_create_member(grp, node, port, instance, MBR_PUBLISHED); } static void tipc_group_delete_member(struct tipc_group *grp, @@ -314,7 +341,7 @@ static void tipc_group_delete_member(struct tipc_group *grp, grp->bc_ackers--; list_del_init(&m->list); - list_del_init(&m->congested); + list_del_init(&m->small_win); tipc_group_decr_active(grp, m); /* If last member on a node, remove node from dest list */ @@ -343,7 +370,7 @@ void tipc_group_update_member(struct tipc_member *m, int len) struct tipc_group *grp = m->group; struct tipc_member *_m, *tmp; - if (!tipc_group_is_enabled(m)) + if (!tipc_group_is_receiver(m)) return; m->window -= len; @@ -351,17 +378,14 @@ void tipc_group_update_member(struct tipc_member *m, int len) if (m->window >= ADV_IDLE) return; - if (!list_empty(&m->congested)) - return; + list_del_init(&m->small_win); - /* Sort member into congested members' list */ - list_for_each_entry_safe(_m, tmp, &grp->congested, congested) { - if (m->window > _m->window) - continue; - list_add_tail(&m->congested, &_m->congested); - return; + /* Sort member into small_window members' list */ + list_for_each_entry_safe(_m, tmp, &grp->small_win, small_win) { + if (_m->window > m->window) + break; } - list_add_tail(&m->congested, &grp->congested); + list_add_tail(&m->small_win, &_m->small_win); } void tipc_group_update_bc_members(struct tipc_group *grp, int len, bool ack) @@ -369,18 +393,20 @@ void tipc_group_update_bc_members(struct tipc_group *grp, int len, bool ack) u16 prev = grp->bc_snd_nxt - 1; struct tipc_member *m; struct rb_node *n; + u16 ackers = 0; for (n = rb_first(&grp->members); n; n = rb_next(n)) { m = container_of(n, struct tipc_member, tree_node); - if (tipc_group_is_enabled(m)) { + if (tipc_group_is_receiver(m)) { tipc_group_update_member(m, len); m->bc_acked = prev; + ackers++; } } /* Mark number of acknowledges to expect, if any */ if (ack) - grp->bc_ackers = grp->member_cnt; + grp->bc_ackers = ackers; grp->bc_snd_nxt++; } @@ -392,20 +418,20 @@ bool tipc_group_cong(struct tipc_group *grp, u32 dnode, u32 dport, int adv, state; m = tipc_group_find_dest(grp, dnode, dport); - *mbr = m; - if (!m) + if (!tipc_group_is_receiver(m)) { + *mbr = NULL; return false; - if (m->usr_pending) - return true; + } + *mbr = m; + if (m->window >= len) return false; - m->usr_pending = true; + + *grp->open = false; /* If not fully advertised, do it now to prevent mutual blocking */ adv = m->advertised; state = m->state; - if (state < MBR_JOINED) - return true; if (state == MBR_JOINED && adv == ADV_IDLE) return true; if (state == MBR_ACTIVE && adv == ADV_ACTIVE) @@ -423,13 +449,14 @@ bool tipc_group_bc_cong(struct tipc_group *grp, int len) struct tipc_member *m = NULL; /* If prev bcast was replicast, reject until all receivers have acked */ - if (grp->bc_ackers) + if (grp->bc_ackers) { + *grp->open = false; return true; - - if (list_empty(&grp->congested)) + } + if (list_empty(&grp->small_win)) return false; - m = list_first_entry(&grp->congested, struct tipc_member, congested); + m = list_first_entry(&grp->small_win, struct tipc_member, small_win); if (m->window >= len) return false; @@ -484,7 +511,7 @@ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, goto drop; m = tipc_group_find_member(grp, node, port); - if (!tipc_group_is_receiver(m)) + if (!tipc_group_is_sender(m)) goto drop; if (less(msg_grp_bc_seqno(hdr), m->bc_rcv_nxt)) @@ -497,6 +524,7 @@ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, while ((skb = skb_peek(defq))) { hdr = buf_msg(skb); mtyp = msg_type(hdr); + blks = msg_blocks(hdr); deliver = true; ack = false; update = false; @@ -546,7 +574,6 @@ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, if (!update) continue; - blks = msg_blocks(hdr); tipc_group_update_rcv_win(grp, blks, node, port, xmitq); } return; @@ -561,7 +588,7 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, int max_active = grp->max_active; int reclaim_limit = max_active * 3 / 4; int active_cnt = grp->active_cnt; - struct tipc_member *m, *rm; + struct tipc_member *m, *rm, *pm; m = tipc_group_find_member(grp, node, port); if (!m) @@ -571,24 +598,34 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, switch (m->state) { case MBR_JOINED: - /* Reclaim advertised space from least active member */ - if (!list_empty(active) && active_cnt >= reclaim_limit) { + /* First, decide if member can go active */ + if (active_cnt <= max_active) { + m->state = MBR_ACTIVE; + list_add_tail(&m->list, active); + grp->active_cnt++; + tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); + } else { + m->state = MBR_PENDING; + list_add_tail(&m->list, &grp->pending); + } + + if (active_cnt < reclaim_limit) + break; + + /* Reclaim from oldest active member, if possible */ + if (!list_empty(active)) { rm = list_first_entry(active, struct tipc_member, list); rm->state = MBR_RECLAIMING; - list_move_tail(&rm->list, &grp->reclaiming); + list_del_init(&rm->list); tipc_group_proto_xmit(grp, rm, GRP_RECLAIM_MSG, xmitq); - } - /* If max active, become pending and wait for reclaimed space */ - if (active_cnt >= max_active) { - m->state = MBR_PENDING; - list_add_tail(&m->list, &grp->pending); break; } - /* Otherwise become active */ - m->state = MBR_ACTIVE; - list_add_tail(&m->list, &grp->active); - grp->active_cnt++; - /* Fall through */ + /* Nobody to reclaim from; - revert oldest pending to JOINED */ + pm = list_first_entry(&grp->pending, struct tipc_member, list); + list_del_init(&pm->list); + pm->state = MBR_JOINED; + tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq); + break; case MBR_ACTIVE: if (!list_is_last(&m->list, &grp->active)) list_move_tail(&m->list, &grp->active); @@ -600,13 +637,23 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, if (m->advertised > ADV_IDLE) break; m->state = MBR_JOINED; + grp->active_cnt--; if (m->advertised < ADV_IDLE) { pr_warn_ratelimited("Rcv unexpected msg after REMIT\n"); tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); } + + if (list_empty(&grp->pending)) + return; + + /* Set oldest pending member to active and advertise */ + pm = list_first_entry(&grp->pending, struct tipc_member, list); + pm->state = MBR_ACTIVE; + list_move_tail(&pm->list, &grp->active); + grp->active_cnt++; + tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq); break; case MBR_RECLAIMING: - case MBR_DISCOVERED: case MBR_JOINING: case MBR_LEAVING: default: @@ -614,6 +661,40 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, } } +static void tipc_group_create_event(struct tipc_group *grp, + struct tipc_member *m, + u32 event, u16 seqno, + struct sk_buff_head *inputq) +{ u32 dnode = tipc_own_addr(grp->net); + struct tipc_event evt; + struct sk_buff *skb; + struct tipc_msg *hdr; + + evt.event = event; + evt.found_lower = m->instance; + evt.found_upper = m->instance; + evt.port.ref = m->port; + evt.port.node = m->node; + evt.s.seq.type = grp->type; + evt.s.seq.lower = m->instance; + evt.s.seq.upper = m->instance; + + skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_GRP_MEMBER_EVT, + GROUP_H_SIZE, sizeof(evt), dnode, m->node, + grp->portid, m->port, 0); + if (!skb) + return; + + hdr = buf_msg(skb); + msg_set_nametype(hdr, grp->type); + msg_set_grp_evt(hdr, event); + msg_set_dest_droppable(hdr, true); + msg_set_grp_bc_seqno(hdr, seqno); + memcpy(msg_data(hdr), &evt, sizeof(evt)); + TIPC_SKB_CB(skb)->orig_member = m->instance; + __skb_queue_tail(inputq, skb); +} + static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, int mtyp, struct sk_buff_head *xmitq) { @@ -648,6 +729,7 @@ static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, } else if (mtyp == GRP_REMIT_MSG) { msg_set_grp_remitted(hdr, m->window); } + msg_set_dest_droppable(hdr, true); __skb_queue_tail(xmitq, skb); } @@ -658,108 +740,95 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, u32 node = msg_orignode(hdr); u32 port = msg_origport(hdr); struct tipc_member *m, *pm; - struct tipc_msg *ehdr; u16 remitted, in_flight; if (!grp) return; + if (grp->scope == TIPC_NODE_SCOPE && node != tipc_own_addr(grp->net)) + return; + m = tipc_group_find_member(grp, node, port); switch (msg_type(hdr)) { case GRP_JOIN_MSG: if (!m) m = tipc_group_create_member(grp, node, port, - MBR_QUARANTINED); + 0, MBR_JOINING); if (!m) return; m->bc_syncpt = msg_grp_bc_syncpt(hdr); m->bc_rcv_nxt = m->bc_syncpt; m->window += msg_adv_win(hdr); - /* Wait until PUBLISH event is received */ - if (m->state == MBR_DISCOVERED) { - m->state = MBR_JOINING; - } else if (m->state == MBR_PUBLISHED) { - m->state = MBR_JOINED; - *usr_wakeup = true; - m->usr_pending = false; - tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); - ehdr = buf_msg(m->event_msg); - msg_set_grp_bc_seqno(ehdr, m->bc_syncpt); - __skb_queue_tail(inputq, m->event_msg); - } - if (m->window < ADV_IDLE) - tipc_group_update_member(m, 0); - else - list_del_init(&m->congested); + /* Wait until PUBLISH event is received if necessary */ + if (m->state != MBR_PUBLISHED) + return; + + /* Member can be taken into service */ + m->state = MBR_JOINED; + tipc_group_open(m, usr_wakeup); + tipc_group_update_member(m, 0); + tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); + tipc_group_create_event(grp, m, TIPC_PUBLISHED, + m->bc_syncpt, inputq); return; case GRP_LEAVE_MSG: if (!m) return; m->bc_syncpt = msg_grp_bc_syncpt(hdr); - - /* Wait until WITHDRAW event is received */ - if (m->state != MBR_LEAVING) { - tipc_group_decr_active(grp, m); - m->state = MBR_LEAVING; - return; - } - /* Otherwise deliver already received WITHDRAW event */ - ehdr = buf_msg(m->event_msg); - msg_set_grp_bc_seqno(ehdr, m->bc_syncpt); - __skb_queue_tail(inputq, m->event_msg); - *usr_wakeup = true; - list_del_init(&m->congested); + list_del_init(&m->list); + tipc_group_open(m, usr_wakeup); + tipc_group_decr_active(grp, m); + m->state = MBR_LEAVING; + tipc_group_create_event(grp, m, TIPC_WITHDRAWN, + m->bc_syncpt, inputq); return; case GRP_ADV_MSG: if (!m) return; m->window += msg_adv_win(hdr); - *usr_wakeup = m->usr_pending; - m->usr_pending = false; - list_del_init(&m->congested); + tipc_group_open(m, usr_wakeup); return; case GRP_ACK_MSG: if (!m) return; m->bc_acked = msg_grp_bc_acked(hdr); if (--grp->bc_ackers) - break; + return; + list_del_init(&m->small_win); + *m->group->open = true; *usr_wakeup = true; - m->usr_pending = false; + tipc_group_update_member(m, 0); return; case GRP_RECLAIM_MSG: if (!m) return; - *usr_wakeup = m->usr_pending; - m->usr_pending = false; tipc_group_proto_xmit(grp, m, GRP_REMIT_MSG, xmitq); m->window = ADV_IDLE; + tipc_group_open(m, usr_wakeup); return; case GRP_REMIT_MSG: if (!m || m->state != MBR_RECLAIMING) return; - list_del_init(&m->list); - grp->active_cnt--; remitted = msg_grp_remitted(hdr); /* Messages preceding the REMIT still in receive queue */ if (m->advertised > remitted) { m->state = MBR_REMITTED; in_flight = m->advertised - remitted; + m->advertised = ADV_IDLE + in_flight; + return; } - /* All messages preceding the REMIT have been read */ - if (m->advertised <= remitted) { - m->state = MBR_JOINED; - in_flight = 0; - } - /* ..and the REMIT overtaken by more messages => re-advertise */ + /* This should never happen */ if (m->advertised < remitted) - tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); + pr_warn_ratelimited("Unexpected REMIT msg\n"); - m->advertised = ADV_IDLE + in_flight; + /* All messages preceding the REMIT have been read */ + m->state = MBR_JOINED; + grp->active_cnt--; + m->advertised = ADV_IDLE; /* Set oldest pending member to active and advertise */ if (list_empty(&grp->pending)) @@ -781,11 +850,10 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, void tipc_group_member_evt(struct tipc_group *grp, bool *usr_wakeup, int *sk_rcvbuf, - struct sk_buff *skb, + struct tipc_msg *hdr, struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { - struct tipc_msg *hdr = buf_msg(skb); struct tipc_event *evt = (void *)msg_data(hdr); u32 instance = evt->found_lower; u32 node = evt->port.node; @@ -793,79 +861,59 @@ void tipc_group_member_evt(struct tipc_group *grp, int event = evt->event; struct tipc_member *m; struct net *net; - bool node_up; u32 self; if (!grp) - goto drop; + return; net = grp->net; self = tipc_own_addr(net); if (!grp->loopback && node == self && port == grp->portid) - goto drop; - - /* Convert message before delivery to user */ - msg_set_hdr_sz(hdr, GROUP_H_SIZE); - msg_set_user(hdr, TIPC_CRITICAL_IMPORTANCE); - msg_set_type(hdr, TIPC_GRP_MEMBER_EVT); - msg_set_origport(hdr, port); - msg_set_orignode(hdr, node); - msg_set_nametype(hdr, grp->type); - msg_set_grp_evt(hdr, event); + return; m = tipc_group_find_member(grp, node, port); - if (event == TIPC_PUBLISHED) { - if (!m) - m = tipc_group_create_member(grp, node, port, - MBR_DISCOVERED); - if (!m) - goto drop; - - /* Hold back event if JOIN message not yet received */ - if (m->state == MBR_DISCOVERED) { - m->event_msg = skb; - m->state = MBR_PUBLISHED; - } else { - msg_set_grp_bc_seqno(hdr, m->bc_syncpt); - __skb_queue_tail(inputq, skb); - m->state = MBR_JOINED; - *usr_wakeup = true; - m->usr_pending = false; + switch (event) { + case TIPC_PUBLISHED: + /* Send and wait for arrival of JOIN message if necessary */ + if (!m) { + m = tipc_group_create_member(grp, node, port, instance, + MBR_PUBLISHED); + if (!m) + break; + tipc_group_update_member(m, 0); + tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq); + break; } + + if (m->state != MBR_JOINING) + break; + + /* Member can be taken into service */ m->instance = instance; - TIPC_SKB_CB(skb)->orig_member = m->instance; + m->state = MBR_JOINED; + tipc_group_open(m, usr_wakeup); + tipc_group_update_member(m, 0); tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq); - if (m->window < ADV_IDLE) - tipc_group_update_member(m, 0); - else - list_del_init(&m->congested); - } else if (event == TIPC_WITHDRAWN) { + tipc_group_create_event(grp, m, TIPC_PUBLISHED, + m->bc_syncpt, inputq); + break; + case TIPC_WITHDRAWN: if (!m) - goto drop; + break; - TIPC_SKB_CB(skb)->orig_member = m->instance; + tipc_group_decr_active(grp, m); + m->state = MBR_LEAVING; + list_del_init(&m->list); + tipc_group_open(m, usr_wakeup); - *usr_wakeup = true; - m->usr_pending = false; - node_up = tipc_node_is_up(net, node); - - /* Hold back event if more messages might be expected */ - if (m->state != MBR_LEAVING && node_up) { - m->event_msg = skb; - tipc_group_decr_active(grp, m); - m->state = MBR_LEAVING; - } else { - if (node_up) - msg_set_grp_bc_seqno(hdr, m->bc_syncpt); - else - msg_set_grp_bc_seqno(hdr, m->bc_rcv_nxt); - __skb_queue_tail(inputq, skb); - } - list_del_init(&m->congested); + /* Only send event if no LEAVE message can be expected */ + if (!tipc_node_is_up(net, node)) + tipc_group_create_event(grp, m, TIPC_WITHDRAWN, + m->bc_rcv_nxt, inputq); + break; + default: + break; } *sk_rcvbuf = tipc_group_rcvbuf_limit(grp); - return; -drop: - kfree_skb(skb); } diff --git a/net/tipc/group.h b/net/tipc/group.h index d525e1cd7de5..5996af6e9f1d 100644 --- a/net/tipc/group.h +++ b/net/tipc/group.h @@ -43,9 +43,12 @@ struct tipc_member; struct tipc_msg; struct tipc_group *tipc_group_create(struct net *net, u32 portid, - struct tipc_group_req *mreq); + struct tipc_group_req *mreq, + bool *group_is_open); +void tipc_group_join(struct net *net, struct tipc_group *grp, int *sk_rcv_buf); void tipc_group_delete(struct net *net, struct tipc_group *grp); -void tipc_group_add_member(struct tipc_group *grp, u32 node, u32 port); +void tipc_group_add_member(struct tipc_group *grp, u32 node, + u32 port, u32 instance); struct tipc_nlist *tipc_group_dests(struct tipc_group *grp); void tipc_group_self(struct tipc_group *grp, struct tipc_name_seq *seq, int *scope); @@ -54,7 +57,7 @@ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, struct sk_buff_head *xmitq); void tipc_group_member_evt(struct tipc_group *grp, bool *wakeup, - int *sk_rcvbuf, struct sk_buff *skb, + int *sk_rcvbuf, struct tipc_msg *hdr, struct sk_buff_head *inputq, struct sk_buff_head *xmitq); void tipc_group_proto_rcv(struct tipc_group *grp, bool *wakeup, @@ -69,5 +72,4 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, u32 port, struct sk_buff_head *xmitq); u16 tipc_group_bc_snd_nxt(struct tipc_group *grp); void tipc_group_update_member(struct tipc_member *m, int len); -int tipc_group_size(struct tipc_group *grp); #endif diff --git a/net/tipc/link.c b/net/tipc/link.c index 6bce0b1117bd..2d6b2aed30e0 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -483,7 +483,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, /** * tipc_link_bc_create - create new link to be used for broadcast * @n: pointer to associated node - * @mtu: mtu to be used + * @mtu: mtu to be used initially if no peers * @window: send window to be used * @inputq: queue to put messages ready for delivery * @namedq: queue to put binding table update messages ready for delivery diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index 8e884ed06d4b..32dc33a94bc7 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -642,9 +642,13 @@ void tipc_mon_delete(struct net *net, int bearer_id) { struct tipc_net *tn = tipc_net(net); struct tipc_monitor *mon = tipc_monitor(net, bearer_id); - struct tipc_peer *self = get_self(net, bearer_id); + struct tipc_peer *self; struct tipc_peer *peer, *tmp; + if (!mon) + return; + + self = get_self(net, bearer_id); write_lock_bh(&mon->lock); tn->monitors[bearer_id] = NULL; list_for_each_entry_safe(peer, tmp, &self->list, list) { diff --git a/net/tipc/msg.c b/net/tipc/msg.c index b0d07b35909d..55d8ba92291d 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -251,20 +251,23 @@ bool tipc_msg_validate(struct sk_buff **_skb) * @pktmax: Max packet size that can be used * @list: Buffer or chain of buffers to be returned to caller * + * Note that the recursive call we are making here is safe, since it can + * logically go only one further level down. + * * Returns message data size or errno: -ENOMEM, -EFAULT */ -int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, - int offset, int dsz, int pktmax, struct sk_buff_head *list) +int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, + int dsz, int pktmax, struct sk_buff_head *list) { int mhsz = msg_hdr_sz(mhdr); + struct tipc_msg pkthdr; int msz = mhsz + dsz; - int pktno = 1; - int pktsz; int pktrem = pktmax; - int drem = dsz; - struct tipc_msg pkthdr; struct sk_buff *skb; + int drem = dsz; + int pktno = 1; char *pktpos; + int pktsz; int rc; msg_set_size(mhdr, msz); @@ -272,8 +275,18 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, /* No fragmentation needed? */ if (likely(msz <= pktmax)) { skb = tipc_buf_acquire(msz, GFP_KERNEL); - if (unlikely(!skb)) + + /* Fall back to smaller MTU if node local message */ + if (unlikely(!skb)) { + if (pktmax != MAX_MSG_SIZE) + return -ENOMEM; + rc = tipc_msg_build(mhdr, m, offset, dsz, FB_MTU, list); + if (rc != dsz) + return rc; + if (tipc_msg_assemble(list)) + return dsz; return -ENOMEM; + } skb_orphan(skb); __skb_queue_tail(list, skb); skb_copy_to_linear_data(skb, mhdr, mhsz); @@ -589,6 +602,30 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) return true; } +/* tipc_msg_assemble() - assemble chain of fragments into one message + */ +bool tipc_msg_assemble(struct sk_buff_head *list) +{ + struct sk_buff *skb, *tmp = NULL; + + if (skb_queue_len(list) == 1) + return true; + + while ((skb = __skb_dequeue(list))) { + skb->next = NULL; + if (tipc_buf_append(&tmp, &skb)) { + __skb_queue_tail(list, skb); + return true; + } + if (!tmp) + break; + } + __skb_queue_purge(list); + __skb_queue_head_init(list); + pr_warn("Failed do assemble buffer\n"); + return false; +} + /* tipc_msg_reassemble() - clone a buffer chain of fragments and * reassemble the clones into one message */ diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 3e4384c222f7..b4ba1b4f9ae7 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -98,7 +98,7 @@ struct plist; #define MAX_H_SIZE 60 /* Largest possible TIPC header size */ #define MAX_MSG_SIZE (MAX_H_SIZE + TIPC_MAX_USER_MSG_SIZE) - +#define FB_MTU 3744 #define TIPC_MEDIA_INFO_OFFSET 5 struct tipc_skb_cb { @@ -943,6 +943,7 @@ bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos); int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, int dsz, int mtu, struct sk_buff_head *list); bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err); +bool tipc_msg_assemble(struct sk_buff_head *list); bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq); bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg, struct sk_buff_head *cpy); diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index b3829bcf63c7..ed0457cc99d6 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -328,7 +328,8 @@ static struct publication *tipc_nameseq_insert_publ(struct net *net, list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) { tipc_subscrp_report_overlap(s, publ->lower, publ->upper, TIPC_PUBLISHED, publ->ref, - publ->node, created_subseq); + publ->node, publ->scope, + created_subseq); } return publ; } @@ -398,19 +399,21 @@ found: list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) { tipc_subscrp_report_overlap(s, publ->lower, publ->upper, TIPC_WITHDRAWN, publ->ref, - publ->node, removed_subseq); + publ->node, publ->scope, + removed_subseq); } return publ; } /** - * tipc_nameseq_subscribe - attach a subscription, and issue - * the prescribed number of events if there is any sub- + * tipc_nameseq_subscribe - attach a subscription, and optionally + * issue the prescribed number of events if there is any sub- * sequence overlapping with the requested sequence */ static void tipc_nameseq_subscribe(struct name_seq *nseq, - struct tipc_subscription *s) + struct tipc_subscription *s, + bool status) { struct sub_seq *sseq = nseq->sseqs; struct tipc_name_seq ns; @@ -420,7 +423,7 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq, tipc_subscrp_get(s); list_add(&s->nameseq_list, &nseq->subscriptions); - if (!sseq) + if (!status || !sseq) return; while (sseq != &nseq->sseqs[nseq->first_free]) { @@ -434,6 +437,7 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq, sseq->upper, TIPC_PUBLISHED, crs->ref, crs->node, + crs->scope, must_report); must_report = 0; } @@ -597,7 +601,7 @@ not_found: return ref; } -bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 domain, +bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 scope, struct list_head *dsts, int *dstcnt, u32 exclude, bool all) { @@ -607,9 +611,6 @@ bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 domain, struct name_seq *seq; struct sub_seq *sseq; - if (!tipc_in_scope(domain, self)) - return false; - *dstcnt = 0; rcu_read_lock(); seq = nametbl_find_seq(net, type); @@ -620,7 +621,7 @@ bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 domain, if (likely(sseq)) { info = sseq->info; list_for_each_entry(publ, &info->zone_list, zone_list) { - if (!tipc_in_scope(domain, publ->node)) + if (publ->scope != scope) continue; if (publ->ref == exclude && publ->node == self) continue; @@ -638,13 +639,14 @@ exit: return !list_empty(dsts); } -int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, - u32 limit, struct list_head *dports) +int tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper, + u32 scope, bool exact, struct list_head *dports) { - struct name_seq *seq; - struct sub_seq *sseq; struct sub_seq *sseq_stop; struct name_info *info; + struct publication *p; + struct name_seq *seq; + struct sub_seq *sseq; int res = 0; rcu_read_lock(); @@ -656,15 +658,12 @@ int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, sseq = seq->sseqs + nameseq_locate_subseq(seq, lower); sseq_stop = seq->sseqs + seq->first_free; for (; sseq != sseq_stop; sseq++) { - struct publication *publ; - if (sseq->lower > upper) break; - info = sseq->info; - list_for_each_entry(publ, &info->node_list, node_list) { - if (publ->scope <= limit) - tipc_dest_push(dports, 0, publ->ref); + list_for_each_entry(p, &info->node_list, node_list) { + if (p->scope == scope || (!exact && p->scope < scope)) + tipc_dest_push(dports, 0, p->ref); } if (info->cluster_list_size != info->node_list_size) @@ -681,8 +680,7 @@ exit: * - Determines if any node local ports overlap */ void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, - u32 upper, u32 domain, - struct tipc_nlist *nodes) + u32 upper, struct tipc_nlist *nodes) { struct sub_seq *sseq, *stop; struct publication *publ; @@ -700,8 +698,7 @@ void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, for (; sseq != stop && sseq->lower <= upper; sseq++) { info = sseq->info; list_for_each_entry(publ, &info->zone_list, zone_list) { - if (tipc_in_scope(domain, publ->node)) - tipc_nlist_add(nodes, publ->node); + tipc_nlist_add(nodes, publ->node); } } spin_unlock_bh(&seq->lock); @@ -712,7 +709,7 @@ exit: /* tipc_nametbl_build_group - build list of communication group members */ void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, - u32 type, u32 domain) + u32 type, u32 scope) { struct sub_seq *sseq, *stop; struct name_info *info; @@ -730,9 +727,9 @@ void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, for (; sseq != stop; sseq++) { info = sseq->info; list_for_each_entry(p, &info->zone_list, zone_list) { - if (!tipc_in_scope(domain, p->node)) + if (p->scope != scope) continue; - tipc_group_add_member(grp, p->node, p->ref); + tipc_group_add_member(grp, p->node, p->ref, p->lower); } } spin_unlock_bh(&seq->lock); @@ -811,7 +808,7 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 ref, /** * tipc_nametbl_subscribe - add a subscription object to the name table */ -void tipc_nametbl_subscribe(struct tipc_subscription *s) +void tipc_nametbl_subscribe(struct tipc_subscription *s, bool status) { struct tipc_net *tn = net_generic(s->net, tipc_net_id); u32 type = tipc_subscrp_convert_seq_type(s->evt.s.seq.type, s->swap); @@ -825,7 +822,7 @@ void tipc_nametbl_subscribe(struct tipc_subscription *s) seq = tipc_nameseq_create(type, &tn->nametbl->seq_hlist[index]); if (seq) { spin_lock_bh(&seq->lock); - tipc_nameseq_subscribe(seq, s); + tipc_nameseq_subscribe(seq, s, status); spin_unlock_bh(&seq->lock); } else { tipc_subscrp_convert_seq(&s->evt.s.seq, s->swap, &ns); diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index 71926e429446..f56e7cb3d436 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -100,13 +100,12 @@ struct name_table { int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb); u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *node); -int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, - u32 limit, struct list_head *dports); +int tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper, + u32 scope, bool exact, struct list_head *dports); void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, u32 type, u32 domain); void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, - u32 upper, u32 domain, - struct tipc_nlist *nodes); + u32 upper, struct tipc_nlist *nodes); bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 domain, struct list_head *dsts, int *dstcnt, u32 exclude, bool all); @@ -121,7 +120,7 @@ struct publication *tipc_nametbl_insert_publ(struct net *net, u32 type, struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type, u32 lower, u32 node, u32 ref, u32 key); -void tipc_nametbl_subscribe(struct tipc_subscription *s); +void tipc_nametbl_subscribe(struct tipc_subscription *s, bool status); void tipc_nametbl_unsubscribe(struct tipc_subscription *s); int tipc_nametbl_init(struct net *net); void tipc_nametbl_stop(struct net *net); diff --git a/net/tipc/node.c b/net/tipc/node.c index 507017fe0f1b..9036d8756e73 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1880,36 +1880,38 @@ int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info) if (strcmp(name, tipc_bclink_name) == 0) { err = tipc_nl_add_bc_link(net, &msg); - if (err) { - nlmsg_free(msg.skb); - return err; - } + if (err) + goto err_free; } else { int bearer_id; struct tipc_node *node; struct tipc_link *link; node = tipc_node_find_by_name(net, name, &bearer_id); - if (!node) - return -EINVAL; + if (!node) { + err = -EINVAL; + goto err_free; + } tipc_node_read_lock(node); link = node->links[bearer_id].link; if (!link) { tipc_node_read_unlock(node); - nlmsg_free(msg.skb); - return -EINVAL; + err = -EINVAL; + goto err_free; } err = __tipc_nl_add_link(net, &msg, link, 0); tipc_node_read_unlock(node); - if (err) { - nlmsg_free(msg.skb); - return err; - } + if (err) + goto err_free; } return genlmsg_reply(msg.skb, info); + +err_free: + nlmsg_free(msg.skb); + return err; } int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info) diff --git a/net/tipc/server.c b/net/tipc/server.c index acaef80fb88c..df0c563c90cd 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -132,10 +132,11 @@ static struct tipc_conn *tipc_conn_lookup(struct tipc_server *s, int conid) spin_lock_bh(&s->idr_lock); con = idr_find(&s->conn_idr, conid); - if (con && test_bit(CF_CONNECTED, &con->flags)) - conn_get(con); - else - con = NULL; + if (con) { + if (!test_bit(CF_CONNECTED, &con->flags) || + !kref_get_unless_zero(&con->kref)) + con = NULL; + } spin_unlock_bh(&s->idr_lock); return con; } @@ -183,35 +184,28 @@ static void tipc_register_callbacks(struct socket *sock, struct tipc_conn *con) write_unlock_bh(&sk->sk_callback_lock); } -static void tipc_unregister_callbacks(struct tipc_conn *con) -{ - struct sock *sk = con->sock->sk; - - write_lock_bh(&sk->sk_callback_lock); - sk->sk_user_data = NULL; - write_unlock_bh(&sk->sk_callback_lock); -} - static void tipc_close_conn(struct tipc_conn *con) { struct tipc_server *s = con->server; + struct sock *sk = con->sock->sk; + bool disconnect = false; - if (test_and_clear_bit(CF_CONNECTED, &con->flags)) { - if (con->sock) - tipc_unregister_callbacks(con); - + write_lock_bh(&sk->sk_callback_lock); + disconnect = test_and_clear_bit(CF_CONNECTED, &con->flags); + if (disconnect) { + sk->sk_user_data = NULL; if (con->conid) s->tipc_conn_release(con->conid, con->usr_data); - - /* We shouldn't flush pending works as we may be in the - * thread. In fact the races with pending rx/tx work structs - * are harmless for us here as we have already deleted this - * connection from server connection list. - */ - if (con->sock) - kernel_sock_shutdown(con->sock, SHUT_RDWR); - conn_put(con); } + write_unlock_bh(&sk->sk_callback_lock); + + /* Handle concurrent calls from sending and receiving threads */ + if (!disconnect) + return; + + /* Don't flush pending works, -just let them expire */ + kernel_sock_shutdown(con->sock, SHUT_RDWR); + conn_put(con); } static struct tipc_conn *tipc_alloc_conn(struct tipc_server *s) @@ -248,9 +242,10 @@ static struct tipc_conn *tipc_alloc_conn(struct tipc_server *s) static int tipc_receive_from_sock(struct tipc_conn *con) { - struct msghdr msg = {}; struct tipc_server *s = con->server; + struct sock *sk = con->sock->sk; struct sockaddr_tipc addr; + struct msghdr msg = {}; struct kvec iov; void *buf; int ret; @@ -264,19 +259,22 @@ static int tipc_receive_from_sock(struct tipc_conn *con) iov.iov_base = buf; iov.iov_len = s->max_rcvbuf_size; msg.msg_name = &addr; - ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len, - MSG_DONTWAIT); + iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, iov.iov_len); + ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT); if (ret <= 0) { kmem_cache_free(s->rcvbuf_cache, buf); goto out_close; } - s->tipc_conn_recvmsg(sock_net(con->sock->sk), con->conid, &addr, - con->usr_data, buf, ret); - + read_lock_bh(&sk->sk_callback_lock); + if (test_bit(CF_CONNECTED, &con->flags)) + ret = s->tipc_conn_recvmsg(sock_net(con->sock->sk), con->conid, + &addr, con->usr_data, buf, ret); + read_unlock_bh(&sk->sk_callback_lock); kmem_cache_free(s->rcvbuf_cache, buf); - - return 0; + if (ret < 0) + tipc_conn_terminate(s, con->conid); + return ret; out_close: if (ret != -EWOULDBLOCK) @@ -314,6 +312,7 @@ static int tipc_accept_from_sock(struct tipc_conn *con) newcon->usr_data = s->tipc_conn_new(newcon->conid); if (!newcon->usr_data) { sock_release(newsock); + conn_put(newcon); return -ENOMEM; } @@ -488,8 +487,8 @@ void tipc_conn_terminate(struct tipc_server *s, int conid) } } -bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, - u32 lower, u32 upper, int *conid) +bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, + u32 upper, u32 filter, int *conid) { struct tipc_subscriber *scbr; struct tipc_subscr sub; @@ -500,7 +499,7 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, sub.seq.lower = lower; sub.seq.upper = upper; sub.timeout = TIPC_WAIT_FOREVER; - sub.filter = TIPC_SUB_PORTS; + sub.filter = filter; *(u32 *)&sub.usr_handle = port; con = tipc_alloc_conn(tipc_topsrv(net)); @@ -511,7 +510,7 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, s = con->server; scbr = s->tipc_conn_new(*conid); if (!scbr) { - tipc_close_conn(con); + conn_put(con); return false; } @@ -524,11 +523,17 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, void tipc_topsrv_kern_unsubscr(struct net *net, int conid) { struct tipc_conn *con; + struct tipc_server *srv; con = tipc_conn_lookup(tipc_topsrv(net), conid); if (!con) return; - tipc_close_conn(con); + + test_and_clear_bit(CF_CONNECTED, &con->flags); + srv = con->server; + if (con->conid) + srv->tipc_conn_release(con->conid, con->usr_data); + conn_put(con); conn_put(con); } diff --git a/net/tipc/server.h b/net/tipc/server.h index 2113c9192633..64df7513cd70 100644 --- a/net/tipc/server.h +++ b/net/tipc/server.h @@ -41,6 +41,9 @@ #include <net/net_namespace.h> #define TIPC_SERVER_NAME_LEN 32 +#define TIPC_SUB_CLUSTER_SCOPE 0x20 +#define TIPC_SUB_NODE_SCOPE 0x40 +#define TIPC_SUB_NO_STATUS 0x80 /** * struct tipc_server - TIPC server structure @@ -71,9 +74,9 @@ struct tipc_server { int max_rcvbuf_size; void *(*tipc_conn_new)(int conid); void (*tipc_conn_release)(int conid, void *usr_data); - void (*tipc_conn_recvmsg)(struct net *net, int conid, - struct sockaddr_tipc *addr, void *usr_data, - void *buf, size_t len); + int (*tipc_conn_recvmsg)(struct net *net, int conid, + struct sockaddr_tipc *addr, void *usr_data, + void *buf, size_t len); struct sockaddr_tipc *saddr; char name[TIPC_SERVER_NAME_LEN]; int imp; @@ -83,8 +86,8 @@ struct tipc_server { int tipc_conn_sendmsg(struct tipc_server *s, int conid, struct sockaddr_tipc *addr, void *data, size_t len); -bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, - u32 lower, u32 upper, int *conid); +bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, + u32 upper, u32 filter, int *conid); void tipc_topsrv_kern_unsubscr(struct net *net, int conid); /** diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 5d18c0caa92b..163f3a547501 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -116,6 +116,7 @@ struct tipc_sock { struct tipc_mc_method mc_method; struct rcu_head rcu; struct tipc_group *group; + bool group_is_open; }; static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb); @@ -710,13 +711,12 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr, * imply that the operation will succeed, merely that it should be performed * and will not block. */ -static unsigned int tipc_poll(struct file *file, struct socket *sock, +static __poll_t tipc_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - struct tipc_group *grp = tsk->group; - u32 revents = 0; + __poll_t revents = 0; sock_poll_wait(file, sk_sleep(sk), wait); @@ -727,18 +727,17 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, switch (sk->sk_state) { case TIPC_ESTABLISHED: + case TIPC_CONNECTING: if (!tsk->cong_link_cnt && !tsk_conn_cong(tsk)) revents |= POLLOUT; /* fall thru' */ case TIPC_LISTEN: - case TIPC_CONNECTING: if (!skb_queue_empty(&sk->sk_receive_queue)) revents |= POLLIN | POLLRDNORM; break; case TIPC_OPEN: - if (!grp || tipc_group_size(grp)) - if (!tsk->cong_link_cnt) - revents |= POLLOUT; + if (tsk->group_is_open && !tsk->cong_link_cnt) + revents |= POLLOUT; if (!tipc_sk_type_connectionless(sk)) break; if (skb_queue_empty(&sk->sk_receive_queue)) @@ -772,7 +771,6 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, struct net *net = sock_net(sk); int mtu = tipc_bcast_get_mtu(net); struct tipc_mc_method *method = &tsk->mc_method; - u32 domain = addr_domain(net, TIPC_CLUSTER_SCOPE); struct sk_buff_head pkts; struct tipc_nlist dsts; int rc; @@ -788,7 +786,7 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, /* Lookup destination nodes */ tipc_nlist_init(&dsts, tipc_own_addr(net)); tipc_nametbl_lookup_dst_nodes(net, seq->type, seq->lower, - seq->upper, domain, &dsts); + seq->upper, &dsts); if (!dsts.local && !dsts.remote) return -EHOSTUNREACH; @@ -928,21 +926,22 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m, struct list_head *cong_links = &tsk->cong_links; int blks = tsk_blocks(GROUP_H_SIZE + dlen); struct tipc_group *grp = tsk->group; + struct tipc_msg *hdr = &tsk->phdr; struct tipc_member *first = NULL; struct tipc_member *mbr = NULL; struct net *net = sock_net(sk); u32 node, port, exclude; - u32 type, inst, domain; struct list_head dsts; + u32 type, inst, scope; int lookups = 0; int dstcnt, rc; bool cong; INIT_LIST_HEAD(&dsts); - type = dest->addr.name.name.type; + type = msg_nametype(hdr); inst = dest->addr.name.name.instance; - domain = addr_domain(net, dest->scope); + scope = msg_lookup_scope(hdr); exclude = tipc_group_exclude(grp); while (++lookups < 4) { @@ -950,7 +949,7 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m, /* Look for a non-congested destination member, if any */ while (1) { - if (!tipc_nametbl_lookup(net, type, inst, domain, &dsts, + if (!tipc_nametbl_lookup(net, type, inst, scope, &dsts, &dstcnt, exclude, false)) return -EHOSTUNREACH; tipc_dest_pop(&dsts, &node, &port); @@ -1079,22 +1078,23 @@ static int tipc_send_group_mcast(struct socket *sock, struct msghdr *m, { struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); - struct tipc_name_seq *seq = &dest->addr.nameseq; struct tipc_sock *tsk = tipc_sk(sk); struct tipc_group *grp = tsk->group; + struct tipc_msg *hdr = &tsk->phdr; struct net *net = sock_net(sk); - u32 domain, exclude, dstcnt; + u32 type, inst, scope, exclude; struct list_head dsts; + u32 dstcnt; INIT_LIST_HEAD(&dsts); - if (seq->lower != seq->upper) - return -ENOTSUPP; - - domain = addr_domain(net, dest->scope); + type = msg_nametype(hdr); + inst = dest->addr.name.name.instance; + scope = msg_lookup_scope(hdr); exclude = tipc_group_exclude(grp); - if (!tipc_nametbl_lookup(net, seq->type, seq->lower, domain, - &dsts, &dstcnt, exclude, true)) + + if (!tipc_nametbl_lookup(net, type, inst, scope, &dsts, + &dstcnt, exclude, true)) return -EHOSTUNREACH; if (dstcnt == 1) { @@ -1116,49 +1116,64 @@ static int tipc_send_group_mcast(struct socket *sock, struct msghdr *m, void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, struct sk_buff_head *inputq) { - u32 scope = TIPC_CLUSTER_SCOPE; u32 self = tipc_own_addr(net); + u32 type, lower, upper, scope; struct sk_buff *skb, *_skb; - u32 lower = 0, upper = ~0; - struct sk_buff_head tmpq; u32 portid, oport, onode; + struct sk_buff_head tmpq; struct list_head dports; - struct tipc_msg *msg; - int user, mtyp, hsz; + struct tipc_msg *hdr; + int user, mtyp, hlen; + bool exact; __skb_queue_head_init(&tmpq); INIT_LIST_HEAD(&dports); skb = tipc_skb_peek(arrvq, &inputq->lock); for (; skb; skb = tipc_skb_peek(arrvq, &inputq->lock)) { - msg = buf_msg(skb); - user = msg_user(msg); - mtyp = msg_type(msg); + hdr = buf_msg(skb); + user = msg_user(hdr); + mtyp = msg_type(hdr); + hlen = skb_headroom(skb) + msg_hdr_sz(hdr); + oport = msg_origport(hdr); + onode = msg_orignode(hdr); + type = msg_nametype(hdr); + if (mtyp == TIPC_GRP_UCAST_MSG || user == GROUP_PROTOCOL) { spin_lock_bh(&inputq->lock); if (skb_peek(arrvq) == skb) { __skb_dequeue(arrvq); __skb_queue_tail(inputq, skb); } - refcount_dec(&skb->users); + kfree_skb(skb); spin_unlock_bh(&inputq->lock); continue; } - hsz = skb_headroom(skb) + msg_hdr_sz(msg); - oport = msg_origport(msg); - onode = msg_orignode(msg); - if (onode == self) - scope = TIPC_NODE_SCOPE; - - /* Create destination port list and message clones: */ - if (!msg_in_group(msg)) { - lower = msg_namelower(msg); - upper = msg_nameupper(msg); + + /* Group messages require exact scope match */ + if (msg_in_group(hdr)) { + lower = 0; + upper = ~0; + scope = msg_lookup_scope(hdr); + exact = true; + } else { + /* TIPC_NODE_SCOPE means "any scope" in this context */ + if (onode == self) + scope = TIPC_NODE_SCOPE; + else + scope = TIPC_CLUSTER_SCOPE; + exact = false; + lower = msg_namelower(hdr); + upper = msg_nameupper(hdr); } - tipc_nametbl_mc_translate(net, msg_nametype(msg), lower, upper, - scope, &dports); + + /* Create destination port list: */ + tipc_nametbl_mc_lookup(net, type, lower, upper, + scope, exact, &dports); + + /* Clone message per destination */ while (tipc_dest_pop(&dports, NULL, &portid)) { - _skb = __pskb_copy(skb, hsz, GFP_ATOMIC); + _skb = __pskb_copy(skb, hlen, GFP_ATOMIC); if (_skb) { msg_set_destport(buf_msg(_skb), portid); __skb_queue_tail(&tmpq, _skb); @@ -1933,8 +1948,7 @@ static void tipc_sk_proto_rcv(struct sock *sk, break; case TOP_SRV: tipc_group_member_evt(tsk->group, &wakeup, &sk->sk_rcvbuf, - skb, inputq, xmitq); - skb = NULL; + hdr, inputq, xmitq); break; default: break; @@ -2640,9 +2654,7 @@ void tipc_sk_reinit(struct net *net) rhashtable_walk_enter(&tn->sk_rht, &iter); do { - tsk = ERR_PTR(rhashtable_walk_start(&iter)); - if (IS_ERR(tsk)) - goto walk_stop; + rhashtable_walk_start(&iter); while ((tsk = rhashtable_walk_next(&iter)) && !IS_ERR(tsk)) { spin_lock_bh(&tsk->sk.sk_lock.slock); @@ -2651,7 +2663,7 @@ void tipc_sk_reinit(struct net *net) msg_set_orignode(msg, tn->own_addr); spin_unlock_bh(&tsk->sk.sk_lock.slock); } -walk_stop: + rhashtable_walk_stop(&iter); } while (tsk == ERR_PTR(-EAGAIN)); } @@ -2734,7 +2746,6 @@ void tipc_sk_rht_destroy(struct net *net) static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq) { struct net *net = sock_net(&tsk->sk); - u32 domain = addr_domain(net, mreq->scope); struct tipc_group *grp = tsk->group; struct tipc_msg *hdr = &tsk->phdr; struct tipc_name_seq seq; @@ -2742,9 +2753,11 @@ static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq) if (mreq->type < TIPC_RESERVED_TYPES) return -EACCES; + if (mreq->scope > TIPC_NODE_SCOPE) + return -EINVAL; if (grp) return -EACCES; - grp = tipc_group_create(net, tsk->portid, mreq); + grp = tipc_group_create(net, tsk->portid, mreq, &tsk->group_is_open); if (!grp) return -ENOMEM; tsk->group = grp; @@ -2754,16 +2767,17 @@ static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq) seq.type = mreq->type; seq.lower = mreq->instance; seq.upper = seq.lower; - tipc_nametbl_build_group(net, grp, mreq->type, domain); + tipc_nametbl_build_group(net, grp, mreq->type, mreq->scope); rc = tipc_sk_publish(tsk, mreq->scope, &seq); if (rc) { tipc_group_delete(net, grp); tsk->group = NULL; + return rc; } - - /* Eliminate any risk that a broadcast overtakes the sent JOIN */ + /* Eliminate any risk that a broadcast overtakes sent JOINs */ tsk->mc_method.rcast = true; tsk->mc_method.mandatory = true; + tipc_group_join(net, grp, &tsk->sk.sk_rcvbuf); return rc; } diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 251065dfd8df..68e26470c516 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -118,15 +118,19 @@ void tipc_subscrp_convert_seq(struct tipc_name_seq *in, int swap, void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower, u32 found_upper, u32 event, u32 port_ref, - u32 node, int must) + u32 node, u32 scope, int must) { + u32 filter = htohl(sub->evt.s.filter, sub->swap); struct tipc_name_seq seq; tipc_subscrp_convert_seq(&sub->evt.s.seq, sub->swap, &seq); if (!tipc_subscrp_check_overlap(&seq, found_lower, found_upper)) return; - if (!must && - !(htohl(sub->evt.s.filter, sub->swap) & TIPC_SUB_PORTS)) + if (!must && !(filter & TIPC_SUB_PORTS)) + return; + if (filter & TIPC_SUB_CLUSTER_SCOPE && scope == TIPC_NODE_SCOPE) + return; + if (filter & TIPC_SUB_NODE_SCOPE && scope != TIPC_NODE_SCOPE) return; tipc_subscrp_send_event(sub, found_lower, found_upper, event, port_ref, @@ -285,21 +289,21 @@ static struct tipc_subscription *tipc_subscrp_create(struct net *net, return sub; } -static void tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s, - struct tipc_subscriber *subscriber, int swap) +static int tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s, + struct tipc_subscriber *subscriber, int swap, + bool status) { - struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_subscription *sub = NULL; u32 timeout; sub = tipc_subscrp_create(net, s, swap); if (!sub) - return tipc_conn_terminate(tn->topsrv, subscriber->conid); + return -1; spin_lock_bh(&subscriber->lock); list_add(&sub->subscrp_list, &subscriber->subscrp_list); sub->subscriber = subscriber; - tipc_nametbl_subscribe(sub); + tipc_nametbl_subscribe(sub, status); tipc_subscrb_get(subscriber); spin_unlock_bh(&subscriber->lock); @@ -308,6 +312,7 @@ static void tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s, if (timeout != TIPC_WAIT_FOREVER) mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout)); + return 0; } /* Handle one termination request for the subscriber */ @@ -317,12 +322,13 @@ static void tipc_subscrb_release_cb(int conid, void *usr_data) } /* Handle one request to create a new subscription for the subscriber */ -static void tipc_subscrb_rcv_cb(struct net *net, int conid, - struct sockaddr_tipc *addr, void *usr_data, - void *buf, size_t len) +static int tipc_subscrb_rcv_cb(struct net *net, int conid, + struct sockaddr_tipc *addr, void *usr_data, + void *buf, size_t len) { struct tipc_subscriber *subscriber = usr_data; struct tipc_subscr *s = (struct tipc_subscr *)buf; + bool status; int swap; /* Determine subscriber's endianness */ @@ -332,10 +338,11 @@ static void tipc_subscrb_rcv_cb(struct net *net, int conid, /* Detect & process a subscription cancellation request */ if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) { s->filter &= ~htohl(TIPC_SUB_CANCEL, swap); - return tipc_subscrp_cancel(s, subscriber); + tipc_subscrp_cancel(s, subscriber); + return 0; } - - tipc_subscrp_subscribe(net, s, subscriber, swap); + status = !(s->filter & htohl(TIPC_SUB_NO_STATUS, swap)); + return tipc_subscrp_subscribe(net, s, subscriber, swap, status); } /* Handle one request to establish a new subscriber */ diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h index ee52957dc952..f3edca775d9f 100644 --- a/net/tipc/subscr.h +++ b/net/tipc/subscr.h @@ -71,7 +71,7 @@ int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower, u32 found_upper); void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower, u32 found_upper, u32 event, - u32 port_ref, u32 node, int must); + u32 port_ref, u32 node, u32 scope, int must); void tipc_subscrp_convert_seq(struct tipc_name_seq *in, int swap, struct tipc_name_seq *out); u32 tipc_subscrp_convert_seq_type(u32 type, int swap); diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index ecca64fc6a6f..3deabcab4882 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -371,10 +371,6 @@ static int tipc_udp_recv(struct sock *sk, struct sk_buff *skb) goto rcu_out; } - tipc_rcv(sock_net(sk), skb, b); - rcu_read_unlock(); - return 0; - rcu_out: rcu_read_unlock(); out: diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index e07ee3ae0023..736719c8314e 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -367,8 +367,10 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, crypto_info = &ctx->crypto_send; /* Currently we don't support set crypto info more than one time */ - if (TLS_CRYPTO_INFO_READY(crypto_info)) + if (TLS_CRYPTO_INFO_READY(crypto_info)) { + rc = -EBUSY; goto out; + } rc = copy_from_user(crypto_info, optval, sizeof(*crypto_info)); if (rc) { @@ -386,7 +388,7 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, case TLS_CIPHER_AES_GCM_128: { if (optlen != sizeof(struct tls12_crypto_info_aes_gcm_128)) { rc = -EINVAL; - goto out; + goto err_crypto_info; } rc = copy_from_user(crypto_info + 1, optval + sizeof(*crypto_info), optlen - sizeof(*crypto_info)); @@ -398,7 +400,7 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, } default: rc = -EINVAL; - goto out; + goto err_crypto_info; } /* currently SW is default, we will have ethtool in future */ @@ -454,6 +456,15 @@ static int tls_init(struct sock *sk) struct tls_context *ctx; int rc = 0; + /* The TLS ulp is currently supported only for TCP sockets + * in ESTABLISHED state. + * Supporting sockets in LISTEN state will require us + * to modify the accept implementation to clone rather then + * share the ulp context. + */ + if (sk->sk_state != TCP_ESTABLISHED) + return -ENOTSUPP; + /* allocate tls context */ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) { diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 73d19210dd49..f26376e954ae 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -214,7 +214,11 @@ static int tls_do_encryption(struct tls_context *tls_ctx, aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); aead_request_set_crypt(aead_req, ctx->sg_aead_in, ctx->sg_aead_out, data_len, tls_ctx->iv); - rc = crypto_aead_encrypt(aead_req); + + aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &ctx->async_wait); + + rc = crypto_wait_req(crypto_aead_encrypt(aead_req), &ctx->async_wait); ctx->sg_encrypted_data[0].offset -= tls_ctx->prepend_size; ctx->sg_encrypted_data[0].length += tls_ctx->prepend_size; @@ -391,7 +395,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) while (msg_data_left(msg)) { if (sk->sk_err) { - ret = sk->sk_err; + ret = -sk->sk_err; goto send_end; } @@ -544,7 +548,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, size_t copy, required_size; if (sk->sk_err) { - ret = sk->sk_err; + ret = -sk->sk_err; goto sendpage_end; } @@ -577,6 +581,8 @@ alloc_payload: get_page(page); sg = ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem; sg_set_page(sg, page, copy, offset); + sg_unmark_end(sg); + ctx->sg_plaintext_num_elem++; sk_mem_charge(sk, copy); @@ -663,6 +669,8 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) goto out; } + crypto_init_wait(&sw_ctx->async_wait); + ctx->priv_ctx = (struct tls_offload_context *)sw_ctx; crypto_info = &ctx->crypto_send; @@ -681,18 +689,17 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) } default: rc = -EINVAL; - goto out; + goto free_priv; } ctx->prepend_size = TLS_HEADER_SIZE + nonce_size; ctx->tag_size = tag_size; ctx->overhead_size = ctx->prepend_size + ctx->tag_size; ctx->iv_size = iv_size; - ctx->iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, - GFP_KERNEL); + ctx->iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, GFP_KERNEL); if (!ctx->iv) { rc = -ENOMEM; - goto out; + goto free_priv; } memcpy(ctx->iv, gcm_128_info->salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE); memcpy(ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size); @@ -740,7 +747,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) rc = crypto_aead_setauthsize(sw_ctx->aead_send, ctx->tag_size); if (!rc) - goto out; + return 0; free_aead: crypto_free_aead(sw_ctx->aead_send); @@ -751,6 +758,9 @@ free_rec_seq: free_iv: kfree(ctx->iv); ctx->iv = NULL; +free_priv: + kfree(ctx->priv_ctx); + ctx->priv_ctx = NULL; out: return rc; } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index a9ee634f3c42..0214acbd6bff 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -367,7 +367,7 @@ static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int /* relaying can only happen while the wq still exists */ u_sleep = sk_sleep(&u->sk); if (u_sleep) - wake_up_interruptible_poll(u_sleep, key); + wake_up_interruptible_poll(u_sleep, key_to_poll(key)); return 0; } @@ -638,8 +638,8 @@ static int unix_stream_connect(struct socket *, struct sockaddr *, static int unix_socketpair(struct socket *, struct socket *); static int unix_accept(struct socket *, struct socket *, int, bool); static int unix_getname(struct socket *, struct sockaddr *, int *, int); -static unsigned int unix_poll(struct file *, struct socket *, poll_table *); -static unsigned int unix_dgram_poll(struct file *, struct socket *, +static __poll_t unix_poll(struct file *, struct socket *, poll_table *); +static __poll_t unix_dgram_poll(struct file *, struct socket *, poll_table *); static int unix_ioctl(struct socket *, unsigned int, unsigned long); static int unix_shutdown(struct socket *, int); @@ -2640,10 +2640,10 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return err; } -static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait) +static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; - unsigned int mask; + __poll_t mask; sock_poll_wait(file, sk_sleep(sk), wait); mask = 0; @@ -2675,11 +2675,12 @@ static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table return mask; } -static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, +static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk, *other; - unsigned int mask, writable; + unsigned int writable; + __poll_t mask; sock_poll_wait(file, sk_sleep(sk), wait); mask = 0; @@ -2869,7 +2870,6 @@ static int unix_seq_open(struct inode *inode, struct file *file) } static const struct file_operations unix_seq_fops = { - .owner = THIS_MODULE, .open = unix_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 5d28abf87fbf..9d95e773f4c8 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -850,11 +850,11 @@ static int vsock_shutdown(struct socket *sock, int mode) return err; } -static unsigned int vsock_poll(struct file *file, struct socket *sock, +static __poll_t vsock_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk; - unsigned int mask; + __poll_t mask; struct vsock_sock *vsk; sk = sock->sk; @@ -951,7 +951,7 @@ static unsigned int vsock_poll(struct file *file, struct socket *sock, * POLLOUT|POLLWRNORM when peer is closed and nothing to read, * but local send is not shutdown. */ - if (sk->sk_state == TCP_CLOSE) { + if (sk->sk_state == TCP_CLOSE || sk->sk_state == TCP_CLOSING) { if (!(sk->sk_shutdown & SEND_SHUTDOWN)) mask |= POLLOUT | POLLWRNORM; diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 5583df708b8c..a827547aa102 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -487,7 +487,7 @@ static void hvs_release(struct vsock_sock *vsk) lock_sock(sk); - sk->sk_state = SS_DISCONNECTING; + sk->sk_state = TCP_CLOSING; vsock_remove_sock(vsk); release_sock(sk); diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 391775e3575c..a7a73ffe675b 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -797,11 +797,13 @@ static void vmci_transport_handle_detach(struct sock *sk) /* We should not be sending anymore since the peer won't be * there to receive, but we can still receive if there is data - * left in our consume queue. + * left in our consume queue. If the local endpoint is a host, + * we can't call vsock_stream_has_data, since that may block, + * but a host endpoint can't read data once the VM has + * detached, so there is no available data in that case. */ - if (vsock_stream_has_data(vsk) <= 0) { - sk->sk_state = TCP_CLOSE; - + if (vsk->local_addr.svm_cid == VMADDR_CID_HOST || + vsock_stream_has_data(vsk) <= 0) { if (sk->sk_state == TCP_SYN_SENT) { /* The peer may detach from a queue pair while * we are still in the connecting state, i.e., @@ -811,10 +813,12 @@ static void vmci_transport_handle_detach(struct sock *sk) * event like a reset. */ + sk->sk_state = TCP_CLOSE; sk->sk_err = ECONNRESET; sk->sk_error_report(sk); return; } + sk->sk_state = TCP_CLOSE; } sk->sk_state_change(sk); } @@ -2144,7 +2148,7 @@ module_exit(vmci_transport_exit); MODULE_AUTHOR("VMware, Inc."); MODULE_DESCRIPTION("VMCI transport for Virtual Sockets"); -MODULE_VERSION("1.0.4.0-k"); +MODULE_VERSION("1.0.5.0-k"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS("vmware_vsock"); MODULE_ALIAS_NETPROTO(PF_VSOCK); diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index da91bb547db3..1abcc4fc4df1 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -20,6 +20,10 @@ config CFG80211 tristate "cfg80211 - wireless configuration API" depends on RFKILL || !RFKILL select FW_LOADER + # may need to update this when certificates are changed and are + # using a different algorithm, though right now they shouldn't + # (this is here rather than below to allow it to be a module) + select CRYPTO_SHA256 if CFG80211_USE_KERNEL_REGDB_KEYS ---help--- cfg80211 is the Linux wireless LAN (802.11) configuration API. Enable this if you have a wireless device. @@ -113,6 +117,9 @@ config CFG80211_EXTRA_REGDB_KEYDIR certificates like in the kernel sources (net/wireless/certs/) that shall be accepted for a signed regulatory database. + Note that you need to also select the correct CRYPTO_<hash> modules + for your certificates, and if cfg80211 is built-in they also must be. + config CFG80211_REG_CELLULAR_HINTS bool "cfg80211 regulatory support for cellular base station hints" depends on CFG80211_CERTIFICATION_ONUS diff --git a/net/wireless/Makefile b/net/wireless/Makefile index 278d979c211a..1d84f91bbfb0 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -23,19 +23,36 @@ ifneq ($(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR),) cfg80211-y += extra-certs.o endif -$(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.x509) +$(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.hex) @$(kecho) " GEN $@" - @echo '#include "reg.h"' > $@ - @echo 'const u8 shipped_regdb_certs[] = {' >> $@ - @for f in $^ ; do hexdump -v -e '1/1 "0x%.2x," "\n"' < $$f >> $@ ; done - @echo '};' >> $@ - @echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);' >> $@ + @(echo '#include "reg.h"'; \ + echo 'const u8 shipped_regdb_certs[] = {'; \ + cat $^ ; \ + echo '};'; \ + echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);'; \ + ) > $@ $(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%) \ $(wildcard $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%)/*.x509) @$(kecho) " GEN $@" - @echo '#include "reg.h"' > $@ - @echo 'const u8 extra_regdb_certs[] = {' >> $@ - @for f in $^ ; do test -f $$f && hexdump -v -e '1/1 "0x%.2x," "\n"' < $$f >> $@ || true ; done - @echo '};' >> $@ - @echo 'unsigned int extra_regdb_certs_len = sizeof(extra_regdb_certs);' >> $@ + @(set -e; \ + allf=""; \ + for f in $^ ; do \ + # similar to hexdump -v -e '1/1 "0x%.2x," "\n"' \ + thisf=$$(od -An -v -tx1 < $$f | \ + sed -e 's/ /\n/g' | \ + sed -e 's/^[0-9a-f]\+$$/\0/;t;d' | \ + sed -e 's/^/0x/;s/$$/,/'); \ + # file should not be empty - maybe command substitution failed? \ + test ! -z "$$thisf";\ + allf=$$allf$$thisf;\ + done; \ + ( \ + echo '#include "reg.h"'; \ + echo 'const u8 extra_regdb_certs[] = {'; \ + echo "$$allf"; \ + echo '};'; \ + echo 'unsigned int extra_regdb_certs_len = sizeof(extra_regdb_certs);'; \ + ) > $@) + +clean-files += shipped-certs.c extra-certs.c diff --git a/net/wireless/certs/sforshee.hex b/net/wireless/certs/sforshee.hex new file mode 100644 index 000000000000..14ea66643ffa --- /dev/null +++ b/net/wireless/certs/sforshee.hex @@ -0,0 +1,86 @@ +/* Seth Forshee's regdb certificate */ +0x30, 0x82, 0x02, 0xa4, 0x30, 0x82, 0x01, 0x8c, +0x02, 0x09, 0x00, 0xb2, 0x8d, 0xdf, 0x47, 0xae, +0xf9, 0xce, 0xa7, 0x30, 0x0d, 0x06, 0x09, 0x2a, +0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x01, 0x0b, +0x05, 0x00, 0x30, 0x13, 0x31, 0x11, 0x30, 0x0f, +0x06, 0x03, 0x55, 0x04, 0x03, 0x0c, 0x08, 0x73, +0x66, 0x6f, 0x72, 0x73, 0x68, 0x65, 0x65, 0x30, +0x20, 0x17, 0x0d, 0x31, 0x37, 0x31, 0x30, 0x30, +0x36, 0x31, 0x39, 0x34, 0x30, 0x33, 0x35, 0x5a, +0x18, 0x0f, 0x32, 0x31, 0x31, 0x37, 0x30, 0x39, +0x31, 0x32, 0x31, 0x39, 0x34, 0x30, 0x33, 0x35, +0x5a, 0x30, 0x13, 0x31, 0x11, 0x30, 0x0f, 0x06, +0x03, 0x55, 0x04, 0x03, 0x0c, 0x08, 0x73, 0x66, +0x6f, 0x72, 0x73, 0x68, 0x65, 0x65, 0x30, 0x82, +0x01, 0x22, 0x30, 0x0d, 0x06, 0x09, 0x2a, 0x86, +0x48, 0x86, 0xf7, 0x0d, 0x01, 0x01, 0x01, 0x05, +0x00, 0x03, 0x82, 0x01, 0x0f, 0x00, 0x30, 0x82, +0x01, 0x0a, 0x02, 0x82, 0x01, 0x01, 0x00, 0xb5, +0x40, 0xe3, 0x9c, 0x28, 0x84, 0x39, 0x03, 0xf2, +0x39, 0xd7, 0x66, 0x2c, 0x41, 0x38, 0x15, 0xac, +0x7e, 0xa5, 0x83, 0x71, 0x25, 0x7e, 0x90, 0x7c, +0x68, 0xdd, 0x6f, 0x3f, 0xd9, 0xd7, 0x59, 0x38, +0x9f, 0x7c, 0x6a, 0x52, 0xc2, 0x03, 0x2a, 0x2d, +0x7e, 0x66, 0xf4, 0x1e, 0xb3, 0x12, 0x70, 0x20, +0x5b, 0xd4, 0x97, 0x32, 0x3d, 0x71, 0x8b, 0x3b, +0x1b, 0x08, 0x17, 0x14, 0x6b, 0x61, 0xc4, 0x57, +0x8b, 0x96, 0x16, 0x1c, 0xfd, 0x24, 0xd5, 0x0b, +0x09, 0xf9, 0x68, 0x11, 0x84, 0xfb, 0xca, 0x51, +0x0c, 0xd1, 0x45, 0x19, 0xda, 0x10, 0x44, 0x8a, +0xd9, 0xfe, 0x76, 0xa9, 0xfd, 0x60, 0x2d, 0x18, +0x0b, 0x28, 0x95, 0xb2, 0x2d, 0xea, 0x88, 0x98, +0xb8, 0xd1, 0x56, 0x21, 0xf0, 0x53, 0x1f, 0xf1, +0x02, 0x6f, 0xe9, 0x46, 0x9b, 0x93, 0x5f, 0x28, +0x90, 0x0f, 0xac, 0x36, 0xfa, 0x68, 0x23, 0x71, +0x57, 0x56, 0xf6, 0xcc, 0xd3, 0xdf, 0x7d, 0x2a, +0xd9, 0x1b, 0x73, 0x45, 0xeb, 0xba, 0x27, 0x85, +0xef, 0x7a, 0x7f, 0xa5, 0xcb, 0x80, 0xc7, 0x30, +0x36, 0xd2, 0x53, 0xee, 0xec, 0xac, 0x1e, 0xe7, +0x31, 0xf1, 0x36, 0xa2, 0x9c, 0x63, 0xc6, 0x65, +0x5b, 0x7f, 0x25, 0x75, 0x68, 0xa1, 0xea, 0xd3, +0x7e, 0x00, 0x5c, 0x9a, 0x5e, 0xd8, 0x20, 0x18, +0x32, 0x77, 0x07, 0x29, 0x12, 0x66, 0x1e, 0x36, +0x73, 0xe7, 0x97, 0x04, 0x41, 0x37, 0xb1, 0xb1, +0x72, 0x2b, 0xf4, 0xa1, 0x29, 0x20, 0x7c, 0x96, +0x79, 0x0b, 0x2b, 0xd0, 0xd8, 0xde, 0xc8, 0x6c, +0x3f, 0x93, 0xfb, 0xc5, 0xee, 0x78, 0x52, 0x11, +0x15, 0x1b, 0x7a, 0xf6, 0xe2, 0x68, 0x99, 0xe7, +0xfb, 0x46, 0x16, 0x84, 0xe3, 0xc7, 0xa1, 0xe6, +0xe0, 0xd2, 0x46, 0xd5, 0xe1, 0xc4, 0x5f, 0xa0, +0x66, 0xf4, 0xda, 0xc4, 0xff, 0x95, 0x1d, 0x02, +0x03, 0x01, 0x00, 0x01, 0x30, 0x0d, 0x06, 0x09, +0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x01, +0x0b, 0x05, 0x00, 0x03, 0x82, 0x01, 0x01, 0x00, +0x87, 0x03, 0xda, 0xf2, 0x82, 0xc2, 0xdd, 0xaf, +0x7c, 0x44, 0x2f, 0x86, 0xd3, 0x5f, 0x4c, 0x93, +0x48, 0xb9, 0xfe, 0x07, 0x17, 0xbb, 0x21, 0xf7, +0x25, 0x23, 0x4e, 0xaa, 0x22, 0x0c, 0x16, 0xb9, +0x73, 0xae, 0x9d, 0x46, 0x7c, 0x75, 0xd9, 0xc3, +0x49, 0x57, 0x47, 0xbf, 0x33, 0xb7, 0x97, 0xec, +0xf5, 0x40, 0x75, 0xc0, 0x46, 0x22, 0xf0, 0xa0, +0x5d, 0x9c, 0x79, 0x13, 0xa1, 0xff, 0xb8, 0xa3, +0x2f, 0x7b, 0x8e, 0x06, 0x3f, 0xc8, 0xb6, 0xe4, +0x6a, 0x28, 0xf2, 0x34, 0x5c, 0x23, 0x3f, 0x32, +0xc0, 0xe6, 0xad, 0x0f, 0xac, 0xcf, 0x55, 0x74, +0x47, 0x73, 0xd3, 0x01, 0x85, 0xb7, 0x0b, 0x22, +0x56, 0x24, 0x7d, 0x9f, 0x09, 0xa9, 0x0e, 0x86, +0x9e, 0x37, 0x5b, 0x9c, 0x6d, 0x02, 0xd9, 0x8c, +0xc8, 0x50, 0x6a, 0xe2, 0x59, 0xf3, 0x16, 0x06, +0xea, 0xb2, 0x42, 0xb5, 0x58, 0xfe, 0xba, 0xd1, +0x81, 0x57, 0x1a, 0xef, 0xb2, 0x38, 0x88, 0x58, +0xf6, 0xaa, 0xc4, 0x2e, 0x8b, 0x5a, 0x27, 0xe4, +0xa5, 0xe8, 0xa4, 0xca, 0x67, 0x5c, 0xac, 0x72, +0x67, 0xc3, 0x6f, 0x13, 0xc3, 0x2d, 0x35, 0x79, +0xd7, 0x8a, 0xe7, 0xf5, 0xd4, 0x21, 0x30, 0x4a, +0xd5, 0xf6, 0xa3, 0xd9, 0x79, 0x56, 0xf2, 0x0f, +0x10, 0xf7, 0x7d, 0xd0, 0x51, 0x93, 0x2f, 0x47, +0xf8, 0x7d, 0x4b, 0x0a, 0x84, 0x55, 0x12, 0x0a, +0x7d, 0x4e, 0x3b, 0x1f, 0x2b, 0x2f, 0xfc, 0x28, +0xb3, 0x69, 0x34, 0xe1, 0x80, 0x80, 0xbb, 0xe2, +0xaf, 0xb9, 0xd6, 0x30, 0xf1, 0x1d, 0x54, 0x87, +0x23, 0x99, 0x9f, 0x51, 0x03, 0x4c, 0x45, 0x7d, +0x02, 0x65, 0x73, 0xab, 0xfd, 0xcf, 0x94, 0xcc, +0x0d, 0x3a, 0x60, 0xfd, 0x3c, 0x14, 0x2f, 0x16, +0x33, 0xa9, 0x21, 0x1f, 0xcb, 0x50, 0xb1, 0x8f, +0x03, 0xee, 0xa0, 0x66, 0xa9, 0x16, 0x79, 0x14, diff --git a/net/wireless/certs/sforshee.x509 b/net/wireless/certs/sforshee.x509 Binary files differdeleted file mode 100644 index c6f8f9d6b988..000000000000 --- a/net/wireless/certs/sforshee.x509 +++ /dev/null diff --git a/net/wireless/core.c b/net/wireless/core.c index fdde0d98fde1..a6f3cac8c640 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -439,6 +439,8 @@ struct wiphy *wiphy_new_nm(const struct cfg80211_ops *ops, int sizeof_priv, if (rv) goto use_default_name; } else { + int rv; + use_default_name: /* NOTE: This is *probably* safe w/out holding rtnl because of * the restrictions on phy names. Probably this call could @@ -446,7 +448,11 @@ use_default_name: * phyX. But, might should add some locking and check return * value, and use a different name if this one exists? */ - dev_set_name(&rdev->wiphy.dev, PHY_NAME "%d", rdev->wiphy_idx); + rv = dev_set_name(&rdev->wiphy.dev, PHY_NAME "%d", rdev->wiphy_idx); + if (rv < 0) { + kfree(rdev); + return NULL; + } } INIT_LIST_HEAD(&rdev->wiphy.wdev_list); diff --git a/net/wireless/core.h b/net/wireless/core.h index d2f7e8b8a097..eaff636169c2 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -507,8 +507,6 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, void cfg80211_stop_nan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); -#define CFG80211_MAX_NUM_DIFFERENT_CHANNELS 10 - #ifdef CONFIG_CFG80211_DEVELOPER_WARNINGS #define CFG80211_DEV_WARN_ON(cond) WARN_ON(cond) #else diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index 413d4f4e6334..a1d10993d08a 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -126,6 +126,11 @@ static int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, wdev->ibss_fixed = params->channel_fixed; wdev->ibss_dfs_possible = params->userspace_handles_dfs; wdev->chandef = params->chandef; + if (connkeys) { + params->wep_keys = connkeys->params; + params->wep_tx_key = connkeys->def; + } + #ifdef CONFIG_CFG80211_WEXT wdev->wext.ibss.chandef = params->chandef; #endif diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index e7c64a8dce54..bbb9907bfa86 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -692,7 +692,7 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, return rdev_mgmt_tx(rdev, wdev, params, cookie); } -bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_mbm, +bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_dbm, const u8 *buf, size_t len, u32 flags) { struct wiphy *wiphy = wdev->wiphy; @@ -708,7 +708,7 @@ bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_mbm, cpu_to_le16(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE); u16 stype; - trace_cfg80211_rx_mgmt(wdev, freq, sig_mbm); + trace_cfg80211_rx_mgmt(wdev, freq, sig_dbm); stype = (le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE) >> 4; if (!(stypes->rx & BIT(stype))) { @@ -735,7 +735,7 @@ bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_mbm, /* Indicate the received Action frame to user space */ if (nl80211_send_mgmt(rdev, wdev, reg->nlportid, - freq, sig_mbm, + freq, sig_dbm, buf, len, flags, GFP_ATOMIC)) continue; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index b1ac23ca20c8..ab0c687d0c44 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -734,11 +734,12 @@ struct key_parse { bool def_uni, def_multi; }; -static int nl80211_parse_key_new(struct nlattr *key, struct key_parse *k) +static int nl80211_parse_key_new(struct genl_info *info, struct nlattr *key, + struct key_parse *k) { struct nlattr *tb[NL80211_KEY_MAX + 1]; int err = nla_parse_nested(tb, NL80211_KEY_MAX, key, - nl80211_key_policy, NULL); + nl80211_key_policy, info->extack); if (err) return err; @@ -771,7 +772,8 @@ static int nl80211_parse_key_new(struct nlattr *key, struct key_parse *k) if (tb[NL80211_KEY_TYPE]) { k->type = nla_get_u32(tb[NL80211_KEY_TYPE]); if (k->type < 0 || k->type >= NUM_NL80211_KEYTYPES) - return -EINVAL; + return genl_err_attr(info, -EINVAL, + tb[NL80211_KEY_TYPE]); } if (tb[NL80211_KEY_DEFAULT_TYPES]) { @@ -779,7 +781,8 @@ static int nl80211_parse_key_new(struct nlattr *key, struct key_parse *k) err = nla_parse_nested(kdt, NUM_NL80211_KEY_DEFAULT_TYPES - 1, tb[NL80211_KEY_DEFAULT_TYPES], - nl80211_key_default_policy, NULL); + nl80211_key_default_policy, + info->extack); if (err) return err; @@ -820,8 +823,10 @@ static int nl80211_parse_key_old(struct genl_info *info, struct key_parse *k) if (info->attrs[NL80211_ATTR_KEY_TYPE]) { k->type = nla_get_u32(info->attrs[NL80211_ATTR_KEY_TYPE]); - if (k->type < 0 || k->type >= NUM_NL80211_KEYTYPES) + if (k->type < 0 || k->type >= NUM_NL80211_KEYTYPES) { + GENL_SET_ERR_MSG(info, "key type out of range"); return -EINVAL; + } } if (info->attrs[NL80211_ATTR_KEY_DEFAULT_TYPES]) { @@ -850,31 +855,42 @@ static int nl80211_parse_key(struct genl_info *info, struct key_parse *k) k->type = -1; if (info->attrs[NL80211_ATTR_KEY]) - err = nl80211_parse_key_new(info->attrs[NL80211_ATTR_KEY], k); + err = nl80211_parse_key_new(info, info->attrs[NL80211_ATTR_KEY], k); else err = nl80211_parse_key_old(info, k); if (err) return err; - if (k->def && k->defmgmt) + if (k->def && k->defmgmt) { + GENL_SET_ERR_MSG(info, "key with def && defmgmt is invalid"); return -EINVAL; + } if (k->defmgmt) { - if (k->def_uni || !k->def_multi) + if (k->def_uni || !k->def_multi) { + GENL_SET_ERR_MSG(info, "defmgmt key must be mcast"); return -EINVAL; + } } if (k->idx != -1) { if (k->defmgmt) { - if (k->idx < 4 || k->idx > 5) + if (k->idx < 4 || k->idx > 5) { + GENL_SET_ERR_MSG(info, + "defmgmt key idx not 4 or 5"); return -EINVAL; + } } else if (k->def) { - if (k->idx < 0 || k->idx > 3) + if (k->idx < 0 || k->idx > 3) { + GENL_SET_ERR_MSG(info, "def key idx not 0-3"); return -EINVAL; + } } else { - if (k->idx < 0 || k->idx > 5) + if (k->idx < 0 || k->idx > 5) { + GENL_SET_ERR_MSG(info, "key idx not 0-5"); return -EINVAL; + } } } @@ -883,8 +899,9 @@ static int nl80211_parse_key(struct genl_info *info, struct key_parse *k) static struct cfg80211_cached_keys * nl80211_parse_connkeys(struct cfg80211_registered_device *rdev, - struct nlattr *keys, bool *no_ht) + struct genl_info *info, bool *no_ht) { + struct nlattr *keys = info->attrs[NL80211_ATTR_KEYS]; struct key_parse parse; struct nlattr *key; struct cfg80211_cached_keys *result; @@ -909,17 +926,22 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev, memset(&parse, 0, sizeof(parse)); parse.idx = -1; - err = nl80211_parse_key_new(key, &parse); + err = nl80211_parse_key_new(info, key, &parse); if (err) goto error; err = -EINVAL; if (!parse.p.key) goto error; - if (parse.idx < 0 || parse.idx > 3) + if (parse.idx < 0 || parse.idx > 3) { + GENL_SET_ERR_MSG(info, "key index out of range [0-3]"); goto error; + } if (parse.def) { - if (def) + if (def) { + GENL_SET_ERR_MSG(info, + "only one key can be default"); goto error; + } def = 1; result->def = parse.idx; if (!parse.def_uni || !parse.def_multi) @@ -932,6 +954,7 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev, goto error; if (parse.p.cipher != WLAN_CIPHER_SUITE_WEP40 && parse.p.cipher != WLAN_CIPHER_SUITE_WEP104) { + GENL_SET_ERR_MSG(info, "connect key must be WEP"); err = -EINVAL; goto error; } @@ -947,6 +970,7 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev, if (result->def < 0) { err = -EINVAL; + GENL_SET_ERR_MSG(info, "need a default/TX key"); goto error; } @@ -2610,7 +2634,7 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag case NL80211_IFTYPE_AP: if (wdev->ssid_len && nla_put(msg, NL80211_ATTR_SSID, wdev->ssid_len, wdev->ssid)) - goto nla_put_failure; + goto nla_put_failure_locked; break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: @@ -2618,12 +2642,13 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag const u8 *ssid_ie; if (!wdev->current_bss) break; + rcu_read_lock(); ssid_ie = ieee80211_bss_get_ie(&wdev->current_bss->pub, WLAN_EID_SSID); - if (!ssid_ie) - break; - if (nla_put(msg, NL80211_ATTR_SSID, ssid_ie[1], ssid_ie + 2)) - goto nla_put_failure; + if (ssid_ie && + nla_put(msg, NL80211_ATTR_SSID, ssid_ie[1], ssid_ie + 2)) + goto nla_put_failure_rcu_locked; + rcu_read_unlock(); break; } default: @@ -2635,6 +2660,10 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag genlmsg_end(msg, hdr); return 0; + nla_put_failure_rcu_locked: + rcu_read_unlock(); + nla_put_failure_locked: + wdev_unlock(wdev); nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; @@ -7815,6 +7844,11 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, intbss->ts_boottime, NL80211_BSS_PAD)) goto nla_put_failure; + if (!nl80211_put_signal(msg, intbss->pub.chains, + intbss->pub.chain_signal, + NL80211_BSS_CHAIN_SIGNAL)) + goto nla_put_failure; + switch (rdev->wiphy.signal_type) { case CFG80211_SIGNAL_TYPE_MBM: if (nla_put_u32(msg, NL80211_BSS_SIGNAL_MBM, res->signal)) @@ -8611,9 +8645,7 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) if (ibss.privacy && info->attrs[NL80211_ATTR_KEYS]) { bool no_ht = false; - connkeys = nl80211_parse_connkeys(rdev, - info->attrs[NL80211_ATTR_KEYS], - &no_ht); + connkeys = nl80211_parse_connkeys(rdev, info, &no_ht); if (IS_ERR(connkeys)) return PTR_ERR(connkeys); @@ -9017,8 +9049,7 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) } if (connect.privacy && info->attrs[NL80211_ATTR_KEYS]) { - connkeys = nl80211_parse_connkeys(rdev, - info->attrs[NL80211_ATTR_KEYS], NULL); + connkeys = nl80211_parse_connkeys(rdev, info, NULL); if (IS_ERR(connkeys)) return PTR_ERR(connkeys); } @@ -9804,7 +9835,7 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev, */ if (!wdev->cqm_config->last_rssi_event_value && wdev->current_bss && rdev->ops->get_station) { - struct station_info sinfo; + struct station_info sinfo = {}; u8 *mac_addr; mac_addr = wdev->current_bss->pub.bssid; @@ -11359,7 +11390,8 @@ static int nl80211_nan_add_func(struct sk_buff *skb, break; case NL80211_NAN_FUNC_FOLLOW_UP: if (!tb[NL80211_NAN_FUNC_FOLLOW_UP_ID] || - !tb[NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID]) { + !tb[NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID] || + !tb[NL80211_NAN_FUNC_FOLLOW_UP_DEST]) { err = -EINVAL; goto out; } @@ -13942,7 +13974,7 @@ void nl80211_send_disconnected(struct cfg80211_registered_device *rdev, if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) || - (from_ap && reason && + (reason && nla_put_u16(msg, NL80211_ATTR_REASON_CODE, reason)) || (from_ap && nla_put_flag(msg, NL80211_ATTR_DISCONNECTED_BY_AP)) || diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 78e71b0390be..7b42f0bacfd8 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1769,8 +1769,7 @@ static void handle_reg_beacon(struct wiphy *wiphy, unsigned int chan_idx, if (wiphy->regulatory_flags & REGULATORY_DISABLE_BEACON_HINTS) return; - chan_before.center_freq = chan->center_freq; - chan_before.flags = chan->flags; + chan_before = *chan; if (chan->flags & IEEE80211_CHAN_NO_IR) { chan->flags &= ~IEEE80211_CHAN_NO_IR; diff --git a/net/wireless/scan.c b/net/wireless/scan.c index f6c5fe482506..d36c3eb7b931 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -981,6 +981,9 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev, found->ts = tmp->ts; found->ts_boottime = tmp->ts_boottime; found->parent_tsf = tmp->parent_tsf; + found->pub.chains = tmp->pub.chains; + memcpy(found->pub.chain_signal, tmp->pub.chain_signal, + IEEE80211_MAX_CHAINS); ether_addr_copy(found->parent_bssid, tmp->parent_bssid); } else { struct cfg80211_internal_bss *new; @@ -1233,6 +1236,8 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, tmp.pub.capability = le16_to_cpu(mgmt->u.probe_resp.capab_info); tmp.ts_boottime = data->boottime_ns; tmp.parent_tsf = data->parent_tsf; + tmp.pub.chains = data->chains; + memcpy(tmp.pub.chain_signal, data->chain_signal, IEEE80211_MAX_CHAINS); ether_addr_copy(tmp.parent_bssid, data->parent_bssid); signal_valid = abs(data->chan->center_freq - channel->center_freq) <= diff --git a/net/wireless/trace.h b/net/wireless/trace.h index f3353fe5b35b..bcfedd39e7a3 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2544,20 +2544,20 @@ DEFINE_EVENT(cfg80211_netdev_mac_evt, cfg80211_del_sta, ); TRACE_EVENT(cfg80211_rx_mgmt, - TP_PROTO(struct wireless_dev *wdev, int freq, int sig_mbm), - TP_ARGS(wdev, freq, sig_mbm), + TP_PROTO(struct wireless_dev *wdev, int freq, int sig_dbm), + TP_ARGS(wdev, freq, sig_dbm), TP_STRUCT__entry( WDEV_ENTRY __field(int, freq) - __field(int, sig_mbm) + __field(int, sig_dbm) ), TP_fast_assign( WDEV_ASSIGN; __entry->freq = freq; - __entry->sig_mbm = sig_mbm; + __entry->sig_dbm = sig_dbm; ), - TP_printk(WDEV_PR_FMT ", freq: %d, sig mbm: %d", - WDEV_PR_ARG, __entry->freq, __entry->sig_mbm) + TP_printk(WDEV_PR_FMT ", freq: %d, sig dbm: %d", + WDEV_PR_ARG, __entry->freq, __entry->sig_dbm) ); TRACE_EVENT(cfg80211_mgmt_tx_status, diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 7ca04a7de85a..05186a47878f 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -1254,8 +1254,7 @@ static int cfg80211_wext_giwrate(struct net_device *dev, { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); - /* we are under RTNL - globally locked - so can use a static struct */ - static struct station_info sinfo; + struct station_info sinfo = {}; u8 addr[ETH_ALEN]; int err; diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index 6cdb054484d6..9efbfc753347 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -1035,18 +1035,23 @@ static int ioctl_standard_call(struct net_device * dev, } -int wext_handle_ioctl(struct net *net, struct iwreq *iwr, unsigned int cmd, - void __user *arg) +int wext_handle_ioctl(struct net *net, unsigned int cmd, void __user *arg) { struct iw_request_info info = { .cmd = cmd, .flags = 0 }; + struct iwreq iwr; int ret; - ret = wext_ioctl_dispatch(net, iwr, cmd, &info, + if (copy_from_user(&iwr, arg, sizeof(iwr))) + return -EFAULT; + + iwr.ifr_name[sizeof(iwr.ifr_name) - 1] = 0; + + ret = wext_ioctl_dispatch(net, &iwr, cmd, &info, ioctl_standard_call, ioctl_private_call); if (ret >= 0 && IW_IS_GET(cmd) && - copy_to_user(arg, iwr, sizeof(struct iwreq))) + copy_to_user(arg, &iwr, sizeof(struct iwreq))) return -EFAULT; return ret; diff --git a/net/wireless/wext-proc.c b/net/wireless/wext-proc.c index e98a01c1034f..5511f989ef47 100644 --- a/net/wireless/wext-proc.c +++ b/net/wireless/wext-proc.c @@ -133,7 +133,6 @@ static int seq_open_wireless(struct inode *inode, struct file *file) } static const struct file_operations wireless_seq_fops = { - .owner = THIS_MODULE, .open = seq_open_wireless, .read = seq_read, .llseek = seq_lseek, diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 30e5746085b8..8e70291e586a 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -23,32 +23,114 @@ #include <linux/notifier.h> #ifdef CONFIG_XFRM_OFFLOAD -int validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features) +struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again) { int err; + unsigned long flags; struct xfrm_state *x; + struct sk_buff *skb2; + struct softnet_data *sd; + netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); - if (skb_is_gso(skb)) - return 0; + if (!xo) + return skb; - if (xo) { - x = skb->sp->xvec[skb->sp->len - 1]; - if (xo->flags & XFRM_GRO || x->xso.flags & XFRM_OFFLOAD_INBOUND) - return 0; + if (!(features & NETIF_F_HW_ESP)) + esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); + + x = skb->sp->xvec[skb->sp->len - 1]; + if (xo->flags & XFRM_GRO || x->xso.flags & XFRM_OFFLOAD_INBOUND) + return skb; + local_irq_save(flags); + sd = this_cpu_ptr(&softnet_data); + err = !skb_queue_empty(&sd->xfrm_backlog); + local_irq_restore(flags); + + if (err) { + *again = true; + return skb; + } + + if (skb_is_gso(skb)) { + struct net_device *dev = skb->dev; + + if (unlikely(!x->xso.offload_handle || (x->xso.dev != dev))) { + struct sk_buff *segs; + + /* Packet got rerouted, fixup features and segment it. */ + esp_features = esp_features & ~(NETIF_F_HW_ESP + | NETIF_F_GSO_ESP); + + segs = skb_gso_segment(skb, esp_features); + if (IS_ERR(segs)) { + kfree_skb(skb); + atomic_long_inc(&dev->tx_dropped); + return NULL; + } else { + consume_skb(skb); + skb = segs; + } + } + } + + if (!skb->next) { x->outer_mode->xmit(x, skb); - err = x->type_offload->xmit(x, skb, features); + xo->flags |= XFRM_DEV_RESUME; + + err = x->type_offload->xmit(x, skb, esp_features); if (err) { + if (err == -EINPROGRESS) + return NULL; + XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR); - return err; + kfree_skb(skb); + return NULL; } skb_push(skb, skb->data - skb_mac_header(skb)); + + return skb; } - return 0; + skb2 = skb; + + do { + struct sk_buff *nskb = skb2->next; + skb2->next = NULL; + + xo = xfrm_offload(skb2); + xo->flags |= XFRM_DEV_RESUME; + + x->outer_mode->xmit(x, skb2); + + err = x->type_offload->xmit(x, skb2, esp_features); + if (!err) { + skb2->next = nskb; + } else if (err != -EINPROGRESS) { + XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR); + skb2->next = nskb; + kfree_skb_list(skb2); + return NULL; + } else { + if (skb == skb2) + skb = nskb; + + if (!skb) + return NULL; + + goto skip_push; + } + + skb_push(skb2, skb2->data - skb_mac_header(skb2)); + +skip_push: + skb2 = nskb; + } while (skb2); + + return skb; } EXPORT_SYMBOL_GPL(validate_xmit_xfrm); @@ -65,9 +147,9 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, if (!x->type_offload) return -EINVAL; - /* We don't yet support UDP encapsulation, TFC padding and ESN. */ - if (x->encap || x->tfcpad || (x->props.flags & XFRM_STATE_ESN)) - return 0; + /* We don't yet support UDP encapsulation and TFC padding. */ + if (x->encap || x->tfcpad) + return -EINVAL; dev = dev_get_by_index(net, xuo->ifindex); if (!dev) { @@ -96,12 +178,20 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, return 0; } + if (x->props.flags & XFRM_STATE_ESN && + !dev->xfrmdev_ops->xdo_dev_state_advance_esn) { + xso->dev = NULL; + dev_put(dev); + return -EINVAL; + } + xso->dev = dev; xso->num_exthdrs = 1; xso->flags = xuo->flags; err = dev->xfrmdev_ops->xdo_dev_state_add(x); if (err) { + xso->dev = NULL; dev_put(dev); return err; } @@ -120,8 +210,8 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) if (!x->type_offload || x->encap) return false; - if ((x->xso.offload_handle && (dev == dst->path->dev)) && - !dst->child->xfrm && x->type->get_mtu) { + if ((!dev || (x->xso.offload_handle && (dev == xfrm_dst_path(dst)->dev))) && + (!xdst->child->xfrm && x->type->get_mtu)) { mtu = x->type->get_mtu(x, xdst->child_mtu_cached); if (skb->len <= mtu) @@ -140,19 +230,82 @@ ok: return true; } EXPORT_SYMBOL_GPL(xfrm_dev_offload_ok); + +void xfrm_dev_resume(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + int ret = NETDEV_TX_BUSY; + struct netdev_queue *txq; + struct softnet_data *sd; + unsigned long flags; + + rcu_read_lock(); + txq = netdev_pick_tx(dev, skb, NULL); + + HARD_TX_LOCK(dev, txq, smp_processor_id()); + if (!netif_xmit_frozen_or_stopped(txq)) + skb = dev_hard_start_xmit(skb, dev, txq, &ret); + HARD_TX_UNLOCK(dev, txq); + + if (!dev_xmit_complete(ret)) { + local_irq_save(flags); + sd = this_cpu_ptr(&softnet_data); + skb_queue_tail(&sd->xfrm_backlog, skb); + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_restore(flags); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(xfrm_dev_resume); + +void xfrm_dev_backlog(struct softnet_data *sd) +{ + struct sk_buff_head *xfrm_backlog = &sd->xfrm_backlog; + struct sk_buff_head list; + struct sk_buff *skb; + + if (skb_queue_empty(xfrm_backlog)) + return; + + __skb_queue_head_init(&list); + + spin_lock(&xfrm_backlog->lock); + skb_queue_splice_init(xfrm_backlog, &list); + spin_unlock(&xfrm_backlog->lock); + + while (!skb_queue_empty(&list)) { + skb = __skb_dequeue(&list); + xfrm_dev_resume(skb); + } + +} #endif -static int xfrm_dev_register(struct net_device *dev) +static int xfrm_api_check(struct net_device *dev) { - if ((dev->features & NETIF_F_HW_ESP) && !dev->xfrmdev_ops) - return NOTIFY_BAD; +#ifdef CONFIG_XFRM_OFFLOAD if ((dev->features & NETIF_F_HW_ESP_TX_CSUM) && !(dev->features & NETIF_F_HW_ESP)) return NOTIFY_BAD; + if ((dev->features & NETIF_F_HW_ESP) && + (!(dev->xfrmdev_ops && + dev->xfrmdev_ops->xdo_dev_state_add && + dev->xfrmdev_ops->xdo_dev_state_delete))) + return NOTIFY_BAD; +#else + if (dev->features & (NETIF_F_HW_ESP | NETIF_F_HW_ESP_TX_CSUM)) + return NOTIFY_BAD; +#endif + return NOTIFY_DONE; } +static int xfrm_dev_register(struct net_device *dev) +{ + return xfrm_api_check(dev); +} + static int xfrm_dev_unregister(struct net_device *dev) { xfrm_policy_cache_flush(); @@ -161,16 +314,7 @@ static int xfrm_dev_unregister(struct net_device *dev) static int xfrm_dev_feat_change(struct net_device *dev) { - if ((dev->features & NETIF_F_HW_ESP) && !dev->xfrmdev_ops) - return NOTIFY_BAD; - else if (!(dev->features & NETIF_F_HW_ESP)) - dev->xfrmdev_ops = NULL; - - if ((dev->features & NETIF_F_HW_ESP_TX_CSUM) && - !(dev->features & NETIF_F_HW_ESP)) - return NOTIFY_BAD; - - return NOTIFY_DONE; + return xfrm_api_check(dev); } static int xfrm_dev_down(struct net_device *dev) diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 347ab31574d5..1472c0857975 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -8,15 +8,29 @@ * */ +#include <linux/bottom_half.h> +#include <linux/interrupt.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/netdevice.h> +#include <linux/percpu.h> #include <net/dst.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/ip_tunnels.h> #include <net/ip6_tunnel.h> +struct xfrm_trans_tasklet { + struct tasklet_struct tasklet; + struct sk_buff_head queue; +}; + +struct xfrm_trans_cb { + int (*finish)(struct net *net, struct sock *sk, struct sk_buff *skb); +}; + +#define XFRM_TRANS_SKB_CB(__skb) ((struct xfrm_trans_cb *)&((__skb)->cb[0])) + static struct kmem_cache *secpath_cachep __read_mostly; static DEFINE_SPINLOCK(xfrm_input_afinfo_lock); @@ -25,6 +39,8 @@ static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[AF_INET6 + 1]; static struct gro_cells gro_cells; static struct net_device xfrm_napi_dev; +static DEFINE_PER_CPU(struct xfrm_trans_tasklet, xfrm_trans_tasklet); + int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo) { int err = 0; @@ -207,7 +223,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) xfrm_address_t *daddr; struct xfrm_mode *inner_mode; u32 mark = skb->mark; - unsigned int family; + unsigned int family = AF_UNSPEC; int decaps = 0; int async = 0; bool xfrm_gro = false; @@ -216,6 +232,16 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) if (encap_type < 0) { x = xfrm_input_state(skb); + + if (unlikely(x->km.state != XFRM_STATE_VALID)) { + if (x->km.state == XFRM_STATE_ACQ) + XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR); + else + XFRM_INC_STATS(net, + LINUX_MIB_XFRMINSTATEINVALID); + goto drop; + } + family = x->outer_mode->afinfo->family; /* An encap_type of -1 indicates async resumption. */ @@ -231,7 +257,6 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) if (xo && (xo->flags & CRYPTO_DONE)) { crypto_done = true; - x = xfrm_input_state(skb); family = XFRM_SPI_SKB_CB(skb)->family; if (!(xo->status & CRYPTO_SUCCESS)) { @@ -467,9 +492,41 @@ int xfrm_input_resume(struct sk_buff *skb, int nexthdr) } EXPORT_SYMBOL(xfrm_input_resume); +static void xfrm_trans_reinject(unsigned long data) +{ + struct xfrm_trans_tasklet *trans = (void *)data; + struct sk_buff_head queue; + struct sk_buff *skb; + + __skb_queue_head_init(&queue); + skb_queue_splice_init(&trans->queue, &queue); + + while ((skb = __skb_dequeue(&queue))) + XFRM_TRANS_SKB_CB(skb)->finish(dev_net(skb->dev), NULL, skb); +} + +int xfrm_trans_queue(struct sk_buff *skb, + int (*finish)(struct net *, struct sock *, + struct sk_buff *)) +{ + struct xfrm_trans_tasklet *trans; + + trans = this_cpu_ptr(&xfrm_trans_tasklet); + + if (skb_queue_len(&trans->queue) >= netdev_max_backlog) + return -ENOBUFS; + + XFRM_TRANS_SKB_CB(skb)->finish = finish; + __skb_queue_tail(&trans->queue, skb); + tasklet_schedule(&trans->tasklet); + return 0; +} +EXPORT_SYMBOL(xfrm_trans_queue); + void __init xfrm_input_init(void) { int err; + int i; init_dummy_netdev(&xfrm_napi_dev); err = gro_cells_init(&gro_cells, &xfrm_napi_dev); @@ -480,4 +537,13 @@ void __init xfrm_input_init(void) sizeof(struct sec_path), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + + for_each_possible_cpu(i) { + struct xfrm_trans_tasklet *trans; + + trans = &per_cpu(xfrm_trans_tasklet, i); + __skb_queue_head_init(&trans->queue); + tasklet_init(&trans->tasklet, xfrm_trans_reinject, + (unsigned long)trans); + } } diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 73ad8c8ef344..23468672a767 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -44,7 +44,7 @@ static int xfrm_skb_check_space(struct sk_buff *skb) static struct dst_entry *skb_dst_pop(struct sk_buff *skb) { - struct dst_entry *child = dst_clone(skb_dst(skb)->child); + struct dst_entry *child = dst_clone(xfrm_dst_child(skb_dst(skb))); skb_dst_drop(skb); return child; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 9542975eb2f9..7a23078132cf 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -54,7 +54,7 @@ static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1] static struct kmem_cache *xfrm_dst_cache __read_mostly; static __read_mostly seqcount_t xfrm_policy_hash_generation; -static void xfrm_init_pmtu(struct dst_entry *dst); +static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr); static int stale_bundle(struct dst_entry *dst); static int xfrm_bundle_ok(struct xfrm_dst *xdst); static void xfrm_policy_queue_process(struct timer_list *t); @@ -609,7 +609,8 @@ static void xfrm_hash_rebuild(struct work_struct *work) /* re-insert all policies by order of creation */ list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { - if (xfrm_policy_id2dir(policy->index) >= XFRM_POLICY_MAX) { + if (policy->walk.dead || + xfrm_policy_id2dir(policy->index) >= XFRM_POLICY_MAX) { /* skip socket policies */ continue; } @@ -974,8 +975,6 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) } if (!cnt) err = -ESRCH; - else - xfrm_policy_cache_flush(); out: spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return err; @@ -1168,9 +1167,15 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, again: pol = rcu_dereference(sk->sk_policy[dir]); if (pol != NULL) { - bool match = xfrm_selector_match(&pol->selector, fl, family); + bool match; int err = 0; + if (pol->family != family) { + pol = NULL; + goto out; + } + + match = xfrm_selector_match(&pol->selector, fl, family); if (match) { if ((sk->sk_mark & pol->mark.m) != pol->mark.v) { pol = NULL; @@ -1251,7 +1256,7 @@ EXPORT_SYMBOL(xfrm_policy_delete); int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) { - struct net *net = xp_net(pol); + struct net *net = sock_net(sk); struct xfrm_policy *old_pol; #ifdef CONFIG_XFRM_SUB_POLICY @@ -1538,7 +1543,9 @@ static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, */ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, - struct xfrm_state **xfrm, int nx, + struct xfrm_state **xfrm, + struct xfrm_dst **bundle, + int nx, const struct flowi *fl, struct dst_entry *dst) { @@ -1546,8 +1553,8 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, unsigned long now = jiffies; struct net_device *dev; struct xfrm_mode *inner_mode; - struct dst_entry *dst_prev = NULL; - struct dst_entry *dst0 = NULL; + struct xfrm_dst *xdst_prev = NULL; + struct xfrm_dst *xdst0 = NULL; int i = 0; int err; int header_len = 0; @@ -1573,13 +1580,14 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, goto put_states; } - if (!dst_prev) - dst0 = dst1; + bundle[i] = xdst; + if (!xdst_prev) + xdst0 = xdst; else /* Ref count is taken during xfrm_alloc_dst() * No need to do dst_clone() on dst1 */ - dst_prev->child = dst1; + xfrm_dst_set_child(xdst_prev, &xdst->u.dst); if (xfrm[i]->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(xfrm[i], @@ -1616,8 +1624,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, dst1->input = dst_discard; dst1->output = inner_mode->afinfo->output; - dst1->next = dst_prev; - dst_prev = dst1; + xdst_prev = xdst; header_len += xfrm[i]->props.header_len; if (xfrm[i]->type->flags & XFRM_TYPE_NON_FRAGMENT) @@ -1625,40 +1632,39 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, trailer_len += xfrm[i]->props.trailer_len; } - dst_prev->child = dst; - dst0->path = dst; + xfrm_dst_set_child(xdst_prev, dst); + xdst0->path = dst; err = -ENODEV; dev = dst->dev; if (!dev) goto free_dst; - xfrm_init_path((struct xfrm_dst *)dst0, dst, nfheader_len); - xfrm_init_pmtu(dst_prev); + xfrm_init_path(xdst0, dst, nfheader_len); + xfrm_init_pmtu(bundle, nx); - for (dst_prev = dst0; dst_prev != dst; dst_prev = dst_prev->child) { - struct xfrm_dst *xdst = (struct xfrm_dst *)dst_prev; - - err = xfrm_fill_dst(xdst, dev, fl); + for (xdst_prev = xdst0; xdst_prev != (struct xfrm_dst *)dst; + xdst_prev = (struct xfrm_dst *) xfrm_dst_child(&xdst_prev->u.dst)) { + err = xfrm_fill_dst(xdst_prev, dev, fl); if (err) goto free_dst; - dst_prev->header_len = header_len; - dst_prev->trailer_len = trailer_len; - header_len -= xdst->u.dst.xfrm->props.header_len; - trailer_len -= xdst->u.dst.xfrm->props.trailer_len; + xdst_prev->u.dst.header_len = header_len; + xdst_prev->u.dst.trailer_len = trailer_len; + header_len -= xdst_prev->u.dst.xfrm->props.header_len; + trailer_len -= xdst_prev->u.dst.xfrm->props.trailer_len; } out: - return dst0; + return &xdst0->u.dst; put_states: for (; i < nx; i++) xfrm_state_put(xfrm[i]); free_dst: - if (dst0) - dst_release_immediate(dst0); - dst0 = ERR_PTR(err); + if (xdst0) + dst_release_immediate(&xdst0->u.dst); + xdst0 = ERR_PTR(err); goto out; } @@ -1737,6 +1743,8 @@ void xfrm_policy_cache_flush(void) bool found = 0; int cpu; + might_sleep(); + local_bh_disable(); rcu_read_lock(); for_each_possible_cpu(cpu) { @@ -1800,7 +1808,7 @@ static bool xfrm_xdst_can_reuse(struct xfrm_dst *xdst, for (i = 0; i < num; i++) { if (!dst || dst->xfrm != xfrm[i]) return false; - dst = dst->child; + dst = xfrm_dst_child(dst); } return xfrm_bundle_ok(xdst); @@ -1813,6 +1821,7 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, { struct net *net = xp_net(pols[0]); struct xfrm_state *xfrm[XFRM_MAX_DEPTH]; + struct xfrm_dst *bundle[XFRM_MAX_DEPTH]; struct xfrm_dst *xdst, *old; struct dst_entry *dst; int err; @@ -1833,6 +1842,7 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, sizeof(struct xfrm_policy *) * num_pols) == 0 && xfrm_xdst_can_reuse(xdst, xfrm, err)) { dst_hold(&xdst->u.dst); + xfrm_pols_put(pols, num_pols); while (err > 0) xfrm_state_put(xfrm[--err]); return xdst; @@ -1840,7 +1850,7 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, old = xdst; - dst = xfrm_bundle_create(pols[0], xfrm, err, fl, dst_orig); + dst = xfrm_bundle_create(pols[0], xfrm, bundle, err, fl, dst_orig); if (IS_ERR(dst)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR); return ERR_CAST(dst); @@ -1880,8 +1890,8 @@ static void xfrm_policy_queue_process(struct timer_list *t) xfrm_decode_session(skb, &fl, dst->ops->family); spin_unlock(&pq->hold_queue.lock); - dst_hold(dst->path); - dst = xfrm_lookup(net, dst->path, &fl, sk, 0); + dst_hold(xfrm_dst_path(dst)); + dst = xfrm_lookup(net, xfrm_dst_path(dst), &fl, sk, 0); if (IS_ERR(dst)) goto purge_queue; @@ -1910,8 +1920,8 @@ static void xfrm_policy_queue_process(struct timer_list *t) skb = __skb_dequeue(&list); xfrm_decode_session(skb, &fl, skb_dst(skb)->ops->family); - dst_hold(skb_dst(skb)->path); - dst = xfrm_lookup(net, skb_dst(skb)->path, &fl, skb->sk, 0); + dst_hold(xfrm_dst_path(skb_dst(skb))); + dst = xfrm_lookup(net, xfrm_dst_path(skb_dst(skb)), &fl, skb->sk, 0); if (IS_ERR(dst)) { kfree_skb(skb); continue; @@ -2012,8 +2022,8 @@ static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net, dst1->output = xdst_queue_output; dst_hold(dst); - dst1->child = dst; - dst1->path = dst; + xfrm_dst_set_child(xdst, dst); + xdst->path = dst; xfrm_init_path((struct xfrm_dst *)dst1, dst, 0); @@ -2055,8 +2065,11 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, if (num_xfrms <= 0) goto make_dummy_bundle; + local_bh_disable(); xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family, - xflo->dst_orig); + xflo->dst_orig); + local_bh_enable(); + if (IS_ERR(xdst)) { err = PTR_ERR(xdst); if (err != -EAGAIN) @@ -2143,9 +2156,12 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, goto no_transform; } + local_bh_disable(); xdst = xfrm_resolve_and_create_bundle( pols, num_pols, fl, family, dst_orig); + local_bh_enable(); + if (IS_ERR(xdst)) { xfrm_pols_put(pols, num_pols); err = PTR_ERR(xdst); @@ -2576,7 +2592,7 @@ static int stale_bundle(struct dst_entry *dst) void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { - while ((dst = dst->child) && dst->xfrm && dst->dev == dev) { + while ((dst = xfrm_dst_child(dst)) && dst->xfrm && dst->dev == dev) { dst->dev = dev_net(dev)->loopback_dev; dev_hold(dst->dev); dev_put(dev); @@ -2600,13 +2616,15 @@ static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst) return dst; } -static void xfrm_init_pmtu(struct dst_entry *dst) +static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr) { - do { - struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + while (nr--) { + struct xfrm_dst *xdst = bundle[nr]; u32 pmtu, route_mtu_cached; + struct dst_entry *dst; - pmtu = dst_mtu(dst->child); + dst = &xdst->u.dst; + pmtu = dst_mtu(xfrm_dst_child(dst)); xdst->child_mtu_cached = pmtu; pmtu = xfrm_state_mtu(dst->xfrm, pmtu); @@ -2618,7 +2636,7 @@ static void xfrm_init_pmtu(struct dst_entry *dst) pmtu = route_mtu_cached; dst_metric_set(dst, RTAX_MTU, pmtu); - } while ((dst = dst->next)); + } } /* Check that the bundle accepts the flow and its components are @@ -2627,19 +2645,20 @@ static void xfrm_init_pmtu(struct dst_entry *dst) static int xfrm_bundle_ok(struct xfrm_dst *first) { + struct xfrm_dst *bundle[XFRM_MAX_DEPTH]; struct dst_entry *dst = &first->u.dst; - struct xfrm_dst *last; + struct xfrm_dst *xdst; + int start_from, nr; u32 mtu; - if (!dst_check(dst->path, ((struct xfrm_dst *)dst)->path_cookie) || + if (!dst_check(xfrm_dst_path(dst), ((struct xfrm_dst *)dst)->path_cookie) || (dst->dev && !netif_running(dst->dev))) return 0; if (dst->flags & DST_XFRM_QUEUE) return 1; - last = NULL; - + start_from = nr = 0; do { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; @@ -2651,9 +2670,11 @@ static int xfrm_bundle_ok(struct xfrm_dst *first) xdst->policy_genid != atomic_read(&xdst->pols[0]->genid)) return 0; - mtu = dst_mtu(dst->child); + bundle[nr++] = xdst; + + mtu = dst_mtu(xfrm_dst_child(dst)); if (xdst->child_mtu_cached != mtu) { - last = xdst; + start_from = nr; xdst->child_mtu_cached = mtu; } @@ -2661,30 +2682,30 @@ static int xfrm_bundle_ok(struct xfrm_dst *first) return 0; mtu = dst_mtu(xdst->route); if (xdst->route_mtu_cached != mtu) { - last = xdst; + start_from = nr; xdst->route_mtu_cached = mtu; } - dst = dst->child; + dst = xfrm_dst_child(dst); } while (dst->xfrm); - if (likely(!last)) + if (likely(!start_from)) return 1; - mtu = last->child_mtu_cached; - for (;;) { - dst = &last->u.dst; + xdst = bundle[start_from - 1]; + mtu = xdst->child_mtu_cached; + while (start_from--) { + dst = &xdst->u.dst; mtu = xfrm_state_mtu(dst->xfrm, mtu); - if (mtu > last->route_mtu_cached) - mtu = last->route_mtu_cached; + if (mtu > xdst->route_mtu_cached) + mtu = xdst->route_mtu_cached; dst_metric_set(dst, RTAX_MTU, mtu); - - if (last == first) + if (!start_from) break; - last = (struct xfrm_dst *)last->u.dst.next; - last->child_mtu_cached = mtu; + xdst = bundle[start_from - 1]; + xdst->child_mtu_cached = mtu; } return 1; @@ -2692,22 +2713,20 @@ static int xfrm_bundle_ok(struct xfrm_dst *first) static unsigned int xfrm_default_advmss(const struct dst_entry *dst) { - return dst_metric_advmss(dst->path); + return dst_metric_advmss(xfrm_dst_path(dst)); } static unsigned int xfrm_mtu(const struct dst_entry *dst) { unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); - return mtu ? : dst_mtu(dst->path); + return mtu ? : dst_mtu(xfrm_dst_path(dst)); } static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst, const void *daddr) { - const struct dst_entry *path = dst->path; - - for (; dst != path; dst = dst->child) { + while (dst->xfrm) { const struct xfrm_state *xfrm = dst->xfrm; if (xfrm->props.mode == XFRM_MODE_TRANSPORT) @@ -2716,6 +2735,8 @@ static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst, daddr = xfrm->coaddr; else if (!(xfrm->type->flags & XFRM_TYPE_LOCAL_COADDR)) daddr = &xfrm->id.daddr; + + dst = xfrm_dst_child(dst); } return daddr; } @@ -2724,7 +2745,7 @@ static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { - const struct dst_entry *path = dst->path; + const struct dst_entry *path = xfrm_dst_path(dst); if (!skb) daddr = xfrm_get_dst_nexthop(dst, daddr); @@ -2733,7 +2754,7 @@ static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst, static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr) { - const struct dst_entry *path = dst->path; + const struct dst_entry *path = xfrm_dst_path(dst); daddr = xfrm_get_dst_nexthop(dst, daddr); path->ops->confirm_neigh(path, daddr); diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c index ba2b539879bc..6d5f85f4e672 100644 --- a/net/xfrm/xfrm_proc.c +++ b/net/xfrm/xfrm_proc.c @@ -71,7 +71,6 @@ static int xfrm_statistics_seq_open(struct inode *inode, struct file *file) } static const struct file_operations xfrm_statistics_seq_fops = { - .owner = THIS_MODULE, .open = xfrm_statistics_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 8b23c5bcf8e8..1d38c6acf8af 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -551,6 +551,8 @@ static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq) bitnr = replay_esn->replay_window - (diff - pos); } + xfrm_dev_state_advance_esn(x); + nr = bitnr >> 5; bitnr = bitnr & 0x1F; replay_esn->bmp[nr] |= (1U << bitnr); @@ -666,7 +668,7 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff if (unlikely(oseq < replay_esn->oseq)) { XFRM_SKB_CB(skb)->seq.output.hi = ++oseq_hi; xo->seq.hi = oseq_hi; - + replay_esn->oseq_hi = oseq_hi; if (replay_esn->oseq_hi == 0) { replay_esn->oseq--; replay_esn->oseq_hi--; @@ -678,7 +680,6 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff } replay_esn->oseq = oseq; - replay_esn->oseq_hi = oseq_hi; if (xfrm_aevent_is_on(net)) x->repl->notify(x, XFRM_REPLAY_UPDATE); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 065d89606888..54e21f19d722 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -313,13 +313,14 @@ retry: if ((type && !try_module_get(type->owner))) type = NULL; + rcu_read_unlock(); + if (!type && try_load) { request_module("xfrm-offload-%d-%d", family, proto); - try_load = 0; + try_load = false; goto retry; } - rcu_read_unlock(); return type; } @@ -1343,6 +1344,7 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, if (orig->aead) { x->aead = xfrm_algo_aead_clone(orig->aead); + x->geniv = orig->geniv; if (!x->aead) goto error; } @@ -1533,8 +1535,12 @@ out: err = -EINVAL; spin_lock_bh(&x1->lock); if (likely(x1->km.state == XFRM_STATE_VALID)) { - if (x->encap && x1->encap) + if (x->encap && x1->encap && + x->encap->encap_type == x1->encap->encap_type) memcpy(x1->encap, x->encap, sizeof(*x1->encap)); + else if (x->encap || x1->encap) + goto fail; + if (x->coaddr && x1->coaddr) { memcpy(x1->coaddr, x->coaddr, sizeof(*x1->coaddr)); } @@ -1551,6 +1557,8 @@ out: x->km.state = XFRM_STATE_DEAD; __xfrm_state_put(x); } + +fail: spin_unlock_bh(&x1->lock); xfrm_state_put(x1); @@ -2048,6 +2056,13 @@ int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen struct xfrm_mgr *km; struct xfrm_policy *pol = NULL; + if (!optval && !optlen) { + xfrm_sk_policy_insert(sk, XFRM_POLICY_IN, NULL); + xfrm_sk_policy_insert(sk, XFRM_POLICY_OUT, NULL); + __sk_dst_reset(sk); + return 0; + } + if (optlen <= 0 || optlen > PAGE_SIZE) return -EMSGSIZE; @@ -2264,8 +2279,6 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload) goto error; } - x->km.state = XFRM_STATE_VALID; - error: return err; } @@ -2274,7 +2287,13 @@ EXPORT_SYMBOL(__xfrm_init_state); int xfrm_init_state(struct xfrm_state *x) { - return __xfrm_init_state(x, true, false); + int err; + + err = __xfrm_init_state(x, true, false); + if (!err) + x->km.state = XFRM_STATE_VALID; + + return err; } EXPORT_SYMBOL(xfrm_init_state); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 983b0233767b..7f52b8eb177d 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -598,13 +598,6 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, goto error; } - if (attrs[XFRMA_OFFLOAD_DEV]) { - err = xfrm_dev_state_add(net, x, - nla_data(attrs[XFRMA_OFFLOAD_DEV])); - if (err) - goto error; - } - if ((err = xfrm_alloc_replay_state_esn(&x->replay_esn, &x->preplay_esn, attrs[XFRMA_REPLAY_ESN_VAL]))) goto error; @@ -620,6 +613,14 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, /* override default values from above */ xfrm_update_ae_params(x, attrs, 0); + /* configure the hardware if offload is requested */ + if (attrs[XFRMA_OFFLOAD_DEV]) { + err = xfrm_dev_state_add(net, x, + nla_data(attrs[XFRMA_OFFLOAD_DEV])); + if (err) + goto error; + } + return x; error: @@ -662,6 +663,9 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } + if (x->km.state == XFRM_STATE_VOID) + x->km.state = XFRM_STATE_VALID; + c.seq = nlh->nlmsg_seq; c.portid = nlh->nlmsg_pid; c.event = nlh->nlmsg_type; @@ -1419,11 +1423,14 @@ static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut, static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family) { + u16 prev_family; int i; if (nr > XFRM_MAX_DEPTH) return -EINVAL; + prev_family = family; + for (i = 0; i < nr; i++) { /* We never validated the ut->family value, so many * applications simply leave it at zero. The check was @@ -1435,6 +1442,12 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family) if (!ut[i].family) ut[i].family = family; + if ((ut[i].mode == XFRM_MODE_TRANSPORT) && + (ut[i].family != prev_family)) + return -EINVAL; + + prev_family = ut[i].family; + switch (ut[i].family) { case AF_INET: break; @@ -1445,6 +1458,21 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family) default: return -EINVAL; } + + switch (ut[i].id.proto) { + case IPPROTO_AH: + case IPPROTO_ESP: + case IPPROTO_COMP: +#if IS_ENABLED(CONFIG_IPV6) + case IPPROTO_ROUTING: + case IPPROTO_DSTOPTS: +#endif + case IPSEC_PROTO_ANY: + break; + default: + return -EINVAL; + } + } return 0; @@ -2470,7 +2498,7 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_PROTO] = { .type = NLA_U8 }, [XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) }, [XFRMA_OFFLOAD_DEV] = { .len = sizeof(struct xfrm_user_offload) }, - [XFRMA_OUTPUT_MARK] = { .len = NLA_U32 }, + [XFRMA_OUTPUT_MARK] = { .type = NLA_U32 }, }; static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = { |