From fd589a1be20fdd76ef97700dd0185e7a060546dc Mon Sep 17 00:00:00 2001 From: Jyri Sarha Date: Tue, 10 Nov 2015 18:12:42 +0200 Subject: ASoC: dapm: Reset dapm wcache after freeing damp widgets If there is anything in damp->path_source_cache or damp->path_sink_cache, it can not be valid after the widgets have been freed. Without this patch a repeated remove and load of a machine driver may cause NULL pointer reference in dapm_wcache_lookup() when a freed widget, not belonging to any list, is haunting in the wcache. Signed-off-by: Jyri Sarha Reported-by: Felipe Balbi Signed-off-by: Mark Brown --- include/sound/soc-dapm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h index 7855cfe46b69..95a937eafb79 100644 --- a/include/sound/soc-dapm.h +++ b/include/sound/soc-dapm.h @@ -398,6 +398,7 @@ int snd_soc_dapm_del_routes(struct snd_soc_dapm_context *dapm, int snd_soc_dapm_weak_routes(struct snd_soc_dapm_context *dapm, const struct snd_soc_dapm_route *route, int num); void snd_soc_dapm_free_widget(struct snd_soc_dapm_widget *w); +void snd_soc_dapm_reset_cache(struct snd_soc_dapm_context *dapm); /* dapm events */ void snd_soc_dapm_stream_event(struct snd_soc_pcm_runtime *rtd, int stream, -- cgit v1.2.3 From 34c06254ff82a815fdccdfae7517a06c9b768cee Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Nov 2015 00:12:24 -0500 Subject: cgroup: fix cftype->file_offset handling 6f60eade2433 ("cgroup: generalize obtaining the handles of and notifying cgroup files") introduced cftype->file_offset so that the handles for per-css file instances can be recorded. These handles then can be used, for example, to generate file modified notifications. Unfortunately, it made the wrong assumption that files are created once for a given css and removed on its destruction. Due to the dependencies among subsystems, a css may be hidden from userland and then later shown again. This is implemented by removing and re-creating the affected files, so the associated kernfs_node for a given cgroup file may change over time. This incorrect assumption led to the corruption of css->files lists. Reimplement cftype->file_offset handling so that cgroup_file->kn is protected by a lock and updated as files are created and destroyed. This also makes keeping them on per-cgroup list unnecessary. Signed-off-by: Tejun Heo Reported-by: James Sedgwick Fixes: 6f60eade2433 ("cgroup: generalize obtaining the handles of and notifying cgroup files") Acked-by: Johannes Weiner Acked-by: Zefan Li --- include/linux/cgroup-defs.h | 4 ---- include/linux/cgroup.h | 14 +------------- kernel/cgroup.c | 42 ++++++++++++++++++++++++++++++++++-------- 3 files changed, 35 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 60d44b26276d..869fd4a3d28e 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -90,7 +90,6 @@ enum { */ struct cgroup_file { /* do not access any fields from outside cgroup core */ - struct list_head node; /* anchored at css->files */ struct kernfs_node *kn; }; @@ -134,9 +133,6 @@ struct cgroup_subsys_state { */ u64 serial_nr; - /* all cgroup_files associated with this css */ - struct list_head files; - /* percpu_ref killing and RCU release */ struct rcu_head rcu_head; struct work_struct destroy_work; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 22e3754f89c5..f64083030ad5 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -88,6 +88,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); +void cgroup_file_notify(struct cgroup_file *cfile); char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); @@ -516,19 +517,6 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp) pr_cont_kernfs_path(cgrp->kn); } -/** - * cgroup_file_notify - generate a file modified event for a cgroup_file - * @cfile: target cgroup_file - * - * @cfile must have been obtained by setting cftype->file_offset. - */ -static inline void cgroup_file_notify(struct cgroup_file *cfile) -{ - /* might not have been created due to one of the CFTYPE selector flags */ - if (cfile->kn) - kernfs_notify(cfile->kn); -} - #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f1603c153890..b316debadeb3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -97,6 +97,12 @@ static DEFINE_SPINLOCK(css_set_lock); */ static DEFINE_SPINLOCK(cgroup_idr_lock); +/* + * Protects cgroup_file->kn for !self csses. It synchronizes notifications + * against file removal/re-creation across css hiding. + */ +static DEFINE_SPINLOCK(cgroup_file_kn_lock); + /* * Protects cgroup_subsys->release_agent_path. Modifying it also requires * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. @@ -1393,6 +1399,16 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) char name[CGROUP_FILE_NAME_MAX]; lockdep_assert_held(&cgroup_mutex); + + if (cft->file_offset) { + struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss); + struct cgroup_file *cfile = (void *)css + cft->file_offset; + + spin_lock_irq(&cgroup_file_kn_lock); + cfile->kn = NULL; + spin_unlock_irq(&cgroup_file_kn_lock); + } + kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); } @@ -1856,7 +1872,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->self.sibling); INIT_LIST_HEAD(&cgrp->self.children); - INIT_LIST_HEAD(&cgrp->self.files); INIT_LIST_HEAD(&cgrp->cset_links); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); @@ -3313,9 +3328,9 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, if (cft->file_offset) { struct cgroup_file *cfile = (void *)css + cft->file_offset; - kernfs_get(kn); + spin_lock_irq(&cgroup_file_kn_lock); cfile->kn = kn; - list_add(&cfile->node, &css->files); + spin_unlock_irq(&cgroup_file_kn_lock); } return 0; @@ -3552,6 +3567,22 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) return cgroup_add_cftypes(ss, cfts); } +/** + * cgroup_file_notify - generate a file modified event for a cgroup_file + * @cfile: target cgroup_file + * + * @cfile must have been obtained by setting cftype->file_offset. + */ +void cgroup_file_notify(struct cgroup_file *cfile) +{ + unsigned long flags; + + spin_lock_irqsave(&cgroup_file_kn_lock, flags); + if (cfile->kn) + kernfs_notify(cfile->kn); + spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); +} + /** * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question @@ -4613,13 +4644,9 @@ static void css_free_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; - struct cgroup_file *cfile; percpu_ref_exit(&css->refcnt); - list_for_each_entry(cfile, &css->files, node) - kernfs_put(cfile->kn); - if (ss) { /* css free path */ int id = css->id; @@ -4724,7 +4751,6 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css->ss = ss; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); - INIT_LIST_HEAD(&css->files); css->serial_nr = css_serial_nr_next++; if (cgroup_parent(cgrp)) { -- cgit v1.2.3 From aedf17f4515b12ba1cd73298e66baa69cf93010e Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Mon, 16 Nov 2015 15:34:36 +0100 Subject: lightnvm: change max_phys_sect to uint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The max_phys_sect variable is defined as a char. We do a boundary check to maximally allow 256 physical page descriptors per command. As we are not indexing from zero. This expression is always false. Bump the max_phys_sect to an unsigned int to support the range check. Signed-off-by: Matias Bjørling Reported-by: Geert Uytterhoeven Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 69c9057e1ab8..32b5369e814e 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -220,7 +220,7 @@ struct nvm_dev_ops { nvm_dev_dma_alloc_fn *dev_dma_alloc; nvm_dev_dma_free_fn *dev_dma_free; - uint8_t max_phys_sect; + unsigned int max_phys_sect; }; struct nvm_lun { -- cgit v1.2.3 From 11450469830f2481a9e7cb181609288d40f41323 Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Mon, 16 Nov 2015 15:34:37 +0100 Subject: lightnvm: update bad block table format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The specification was changed to reflect a multi-value bad block table. Instead of bit-based bad block table, the bad block table now allows eight bad block categories. Currently four are defined: * Factory bad blocks * Grown bad blocks * Device-side reserved blocks * Host-side reserved blocks The factory and grown bad blocks are the regular bad blocks. The reserved blocks are either for internal use or external use. In particular, the device-side reserved blocks allows the host to bootstrap from a limited number of flash blocks. Reducing the flash blocks to scan upon super block initialization. Support for both get bad block table and set bad block table is added. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/gennvm.c | 32 ++++++++---- drivers/lightnvm/gennvm.h | 2 + drivers/nvme/host/lightnvm.c | 113 ++++++++++++++++++++++++++++++++++--------- include/linux/lightnvm.h | 6 +-- 4 files changed, 117 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index ae1fb2bdc5f4..8cfc0114ff13 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -64,19 +64,22 @@ static int gennvm_luns_init(struct nvm_dev *dev, struct gen_nvm *gn) return 0; } -static int gennvm_block_bb(u32 lun_id, void *bb_bitmap, unsigned int nr_blocks, +static int gennvm_block_bb(struct ppa_addr ppa, int nr_blocks, u8 *blks, void *private) { struct gen_nvm *gn = private; - struct gen_lun *lun = &gn->luns[lun_id]; + struct nvm_dev *dev = gn->dev; + struct gen_lun *lun; struct nvm_block *blk; int i; - if (unlikely(bitmap_empty(bb_bitmap, nr_blocks))) - return 0; + ppa = addr_to_generic_mode(gn->dev, ppa); + lun = &gn->luns[(dev->nr_luns * ppa.g.ch) + ppa.g.lun]; + + for (i = 0; i < nr_blocks; i++) { + if (blks[i] == 0) + continue; - i = -1; - while ((i = find_next_bit(bb_bitmap, nr_blocks, i + 1)) < nr_blocks) { blk = &lun->vlun.blocks[i]; if (!blk) { pr_err("gennvm: BB data is out of bounds.\n"); @@ -171,8 +174,16 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) } if (dev->ops->get_bb_tbl) { - ret = dev->ops->get_bb_tbl(dev->q, lun->vlun.id, - dev->blks_per_lun, gennvm_block_bb, gn); + struct ppa_addr ppa; + + ppa.ppa = 0; + ppa.g.ch = lun->vlun.chnl_id; + ppa.g.lun = lun->vlun.id; + ppa = generic_to_addr_mode(dev, ppa); + + ret = dev->ops->get_bb_tbl(dev->q, ppa, + dev->blks_per_lun, + gennvm_block_bb, gn); if (ret) pr_err("gennvm: could not read BB table\n"); } @@ -199,6 +210,7 @@ static int gennvm_register(struct nvm_dev *dev) if (!gn) return -ENOMEM; + gn->dev = dev; gn->nr_luns = dev->nr_luns; dev->mp = gn; @@ -354,10 +366,10 @@ static void gennvm_mark_blk_bad(struct nvm_dev *dev, struct nvm_rq *rqd) { int i; - if (!dev->ops->set_bb) + if (!dev->ops->set_bb_tbl) return; - if (dev->ops->set_bb(dev->q, rqd, 1)) + if (dev->ops->set_bb_tbl(dev->q, rqd, 1)) return; gennvm_addr_to_generic_mode(dev, rqd); diff --git a/drivers/lightnvm/gennvm.h b/drivers/lightnvm/gennvm.h index d23bd3501ddc..9c24b5b32dac 100644 --- a/drivers/lightnvm/gennvm.h +++ b/drivers/lightnvm/gennvm.h @@ -35,6 +35,8 @@ struct gen_lun { }; struct gen_nvm { + struct nvm_dev *dev; + int nr_luns; struct gen_lun *luns; }; diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index e0b7b95813bc..2c3546516300 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -93,7 +93,7 @@ struct nvme_nvm_l2ptbl { __le16 cdw14[6]; }; -struct nvme_nvm_bbtbl { +struct nvme_nvm_getbbtbl { __u8 opcode; __u8 flags; __u16 command_id; @@ -101,10 +101,23 @@ struct nvme_nvm_bbtbl { __u64 rsvd[2]; __le64 prp1; __le64 prp2; - __le32 prp1_len; - __le32 prp2_len; - __le32 lbb; - __u32 rsvd11[3]; + __le64 spba; + __u32 rsvd4[4]; +}; + +struct nvme_nvm_setbbtbl { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __le64 rsvd[2]; + __le64 prp1; + __le64 prp2; + __le64 spba; + __le16 nlb; + __u8 value; + __u8 rsvd3; + __u32 rsvd4[3]; }; struct nvme_nvm_erase_blk { @@ -129,8 +142,8 @@ struct nvme_nvm_command { struct nvme_nvm_hb_rw hb_rw; struct nvme_nvm_ph_rw ph_rw; struct nvme_nvm_l2ptbl l2p; - struct nvme_nvm_bbtbl get_bb; - struct nvme_nvm_bbtbl set_bb; + struct nvme_nvm_getbbtbl get_bb; + struct nvme_nvm_setbbtbl set_bb; struct nvme_nvm_erase_blk erase; }; }; @@ -187,6 +200,20 @@ struct nvme_nvm_id { struct nvme_nvm_id_group groups[4]; } __packed; +struct nvme_nvm_bb_tbl { + __u8 tblid[4]; + __le16 verid; + __le16 revid; + __le32 rvsd1; + __le32 tblks; + __le32 tfact; + __le32 tgrown; + __le32 tdresv; + __le32 thresv; + __le32 rsvd2[8]; + __u8 blk[0]; +}; + /* * Check we didn't inadvertently grow the command struct */ @@ -195,12 +222,14 @@ static inline void _nvme_nvm_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_nvm_identity) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_hb_rw) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_ph_rw) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_bbtbl) != 64); + BUILD_BUG_ON(sizeof(struct nvme_nvm_getbbtbl) != 64); + BUILD_BUG_ON(sizeof(struct nvme_nvm_setbbtbl) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_l2ptbl) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_id_group) != 960); BUILD_BUG_ON(sizeof(struct nvme_nvm_addr_format) != 128); BUILD_BUG_ON(sizeof(struct nvme_nvm_id) != 4096); + BUILD_BUG_ON(sizeof(struct nvme_nvm_bb_tbl) != 512); } static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) @@ -322,43 +351,80 @@ out: return ret; } -static int nvme_nvm_get_bb_tbl(struct request_queue *q, int lunid, - unsigned int nr_blocks, - nvm_bb_update_fn *update_bbtbl, void *priv) +static int nvme_nvm_get_bb_tbl(struct request_queue *q, struct ppa_addr ppa, + int nr_blocks, nvm_bb_update_fn *update_bbtbl, + void *priv) { struct nvme_ns *ns = q->queuedata; struct nvme_dev *dev = ns->dev; struct nvme_nvm_command c = {}; - void *bb_bitmap; - u16 bb_bitmap_size; + struct nvme_nvm_bb_tbl *bb_tbl; + int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blocks; int ret = 0; c.get_bb.opcode = nvme_nvm_admin_get_bb_tbl; c.get_bb.nsid = cpu_to_le32(ns->ns_id); - c.get_bb.lbb = cpu_to_le32(lunid); - bb_bitmap_size = ((nr_blocks >> 15) + 1) * PAGE_SIZE; - bb_bitmap = kmalloc(bb_bitmap_size, GFP_KERNEL); - if (!bb_bitmap) - return -ENOMEM; + c.get_bb.spba = cpu_to_le64(ppa.ppa); - bitmap_zero(bb_bitmap, nr_blocks); + bb_tbl = kzalloc(tblsz, GFP_KERNEL); + if (!bb_tbl) + return -ENOMEM; - ret = nvme_submit_sync_cmd(q, (struct nvme_command *)&c, bb_bitmap, - bb_bitmap_size); + ret = nvme_submit_sync_cmd(q, (struct nvme_command *)&c, bb_tbl, tblsz); if (ret) { dev_err(dev->dev, "get bad block table failed (%d)\n", ret); ret = -EIO; goto out; } - ret = update_bbtbl(lunid, bb_bitmap, nr_blocks, priv); + if (bb_tbl->tblid[0] != 'B' || bb_tbl->tblid[1] != 'B' || + bb_tbl->tblid[2] != 'L' || bb_tbl->tblid[3] != 'T') { + dev_err(dev->dev, "bbt format mismatch\n"); + ret = -EINVAL; + goto out; + } + + if (le16_to_cpu(bb_tbl->verid) != 1) { + ret = -EINVAL; + dev_err(dev->dev, "bbt version not supported\n"); + goto out; + } + + if (le32_to_cpu(bb_tbl->tblks) != nr_blocks) { + ret = -EINVAL; + dev_err(dev->dev, "bbt unsuspected blocks returned (%u!=%u)", + le32_to_cpu(bb_tbl->tblks), nr_blocks); + goto out; + } + + ret = update_bbtbl(ppa, nr_blocks, bb_tbl->blk, priv); if (ret) { ret = -EINTR; goto out; } out: - kfree(bb_bitmap); + kfree(bb_tbl); + return ret; +} + +static int nvme_nvm_set_bb_tbl(struct request_queue *q, struct nvm_rq *rqd, + int type) +{ + struct nvme_ns *ns = q->queuedata; + struct nvme_dev *dev = ns->dev; + struct nvme_nvm_command c = {}; + int ret = 0; + + c.set_bb.opcode = nvme_nvm_admin_set_bb_tbl; + c.set_bb.nsid = cpu_to_le32(ns->ns_id); + c.set_bb.spba = cpu_to_le64(rqd->ppa_addr.ppa); + c.set_bb.nlb = cpu_to_le16(rqd->nr_pages - 1); + c.set_bb.value = type; + + ret = nvme_submit_sync_cmd(q, (struct nvme_command *)&c, NULL, 0); + if (ret) + dev_err(dev->dev, "set bad block table failed (%d)\n", ret); return ret; } @@ -474,6 +540,7 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = { .get_l2p_tbl = nvme_nvm_get_l2p_tbl, .get_bb_tbl = nvme_nvm_get_bb_tbl, + .set_bb_tbl = nvme_nvm_set_bb_tbl, .submit_io = nvme_nvm_submit_io, .erase_block = nvme_nvm_erase_block, diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 32b5369e814e..9b3dc1bc9296 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -191,11 +191,11 @@ static inline void *nvm_rq_to_pdu(struct nvm_rq *rqdata) struct nvm_block; typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *); -typedef int (nvm_bb_update_fn)(u32, void *, unsigned int, void *); +typedef int (nvm_bb_update_fn)(struct ppa_addr, int, u8 *, void *); typedef int (nvm_id_fn)(struct request_queue *, struct nvm_id *); typedef int (nvm_get_l2p_tbl_fn)(struct request_queue *, u64, u32, nvm_l2p_update_fn *, void *); -typedef int (nvm_op_bb_tbl_fn)(struct request_queue *, int, unsigned int, +typedef int (nvm_op_bb_tbl_fn)(struct request_queue *, struct ppa_addr, int, nvm_bb_update_fn *, void *); typedef int (nvm_op_set_bb_fn)(struct request_queue *, struct nvm_rq *, int); typedef int (nvm_submit_io_fn)(struct request_queue *, struct nvm_rq *); @@ -210,7 +210,7 @@ struct nvm_dev_ops { nvm_id_fn *identity; nvm_get_l2p_tbl_fn *get_l2p_tbl; nvm_op_bb_tbl_fn *get_bb_tbl; - nvm_op_set_bb_fn *set_bb; + nvm_op_set_bb_fn *set_bb_tbl; nvm_submit_io_fn *submit_io; nvm_erase_blk_fn *erase_block; -- cgit v1.2.3 From 12be5edf68e785dd5dc8665db5a88152b49c1fe8 Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Mon, 16 Nov 2015 15:34:39 +0100 Subject: lightnvm: expose mccap in identify command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mccap field is required for I/O command option support. It defines the following flash access modes: * SLC mode * Erase/Program Suspension * Scramble On/Off * Encryption It is slotted in between mpos and cpar, changing the offset for cpar as well. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/nvme/host/lightnvm.c | 4 +++- include/linux/lightnvm.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 60687ed68b5d..52b311cf694c 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -169,8 +169,9 @@ struct nvme_nvm_id_group { __le32 tbet; __le32 tbem; __le32 mpos; + __le32 mccap; __le16 cpar; - __u8 reserved[910]; + __u8 reserved[906]; } __packed; struct nvme_nvm_addr_format { @@ -265,6 +266,7 @@ static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) dst->tbet = le32_to_cpu(src->tbet); dst->tbem = le32_to_cpu(src->tbem); dst->mpos = le32_to_cpu(src->mpos); + dst->mccap = le32_to_cpu(src->mccap); dst->cpar = le16_to_cpu(src->cpar); } diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 9b3dc1bc9296..2572856e2a89 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -74,6 +74,7 @@ struct nvm_id_group { u32 tbet; u32 tbem; u32 mpos; + u32 mccap; u16 cpar; u8 res[913]; } __packed; -- cgit v1.2.3 From 73387e7bed260c89628fc6a4e3632b45be9776b0 Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Mon, 16 Nov 2015 15:34:40 +0100 Subject: lightnvm: remove unused attrs in nvm_id structs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The nvm_id, nvm_id_group and nvm_addr_format data structures contain reserved attributes. They are unused by media managers and targets. Remove them. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 2572856e2a89..e6ef8aaf533f 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -58,7 +58,6 @@ enum { struct nvm_id_group { u8 mtype; u8 fmtype; - u16 res16; u8 num_ch; u8 num_lun; u8 num_pln; @@ -76,8 +75,7 @@ struct nvm_id_group { u32 mpos; u32 mccap; u16 cpar; - u8 res[913]; -} __packed; +}; struct nvm_addr_format { u8 ch_offset; @@ -92,19 +90,16 @@ struct nvm_addr_format { u8 pg_len; u8 sect_offset; u8 sect_len; - u8 res[4]; }; struct nvm_id { u8 ver_id; u8 vmnt; u8 cgrps; - u8 res[5]; u32 cap; u32 dom; struct nvm_addr_format ppaf; u8 ppat; - u8 resv[224]; struct nvm_id_group groups[4]; } __packed; -- cgit v1.2.3 From 7386af270c72be65c7cb2ba4ad0d4e70dc373106 Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Mon, 16 Nov 2015 15:34:44 +0100 Subject: lightnvm: remove linear and device addr modes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The linear and device specific address modes can be replaced with a simple offset and bit length conversion that is generic across all devices. This both simplifies the specification and removes the special case for qemu nvme, that previously relied on the linear address mapping. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 3 +- drivers/lightnvm/gennvm.c | 12 ++-- drivers/lightnvm/rrpc.c | 32 ++++++++- drivers/nvme/host/lightnvm.c | 3 +- include/linux/lightnvm.h | 154 ++++++++++--------------------------------- 5 files changed, 73 insertions(+), 131 deletions(-) (limited to 'include') diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 899f6b9a9f68..790b1d7a8d43 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -174,8 +174,7 @@ static int nvm_core_init(struct nvm_dev *dev) dev->sec_size = grp->csecs; dev->oob_size = grp->sos; dev->sec_per_pg = grp->fpg_sz / grp->csecs; - dev->addr_mode = id->ppat; - dev->addr_format = id->ppaf; + memcpy(&dev->ppaf, &id->ppaf, sizeof(struct nvm_addr_format)); dev->plane_mode = NVM_PLANE_SINGLE; dev->max_rq_size = dev->ops->max_phys_sect * dev->sec_size; diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index 8cfc0114ff13..c0d0eb2357a8 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -73,7 +73,7 @@ static int gennvm_block_bb(struct ppa_addr ppa, int nr_blocks, u8 *blks, struct nvm_block *blk; int i; - ppa = addr_to_generic_mode(gn->dev, ppa); + ppa = dev_to_generic_addr(gn->dev, ppa); lun = &gn->luns[(dev->nr_luns * ppa.g.ch) + ppa.g.lun]; for (i = 0; i < nr_blocks; i++) { @@ -179,7 +179,7 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) ppa.ppa = 0; ppa.g.ch = lun->vlun.chnl_id; ppa.g.lun = lun->vlun.id; - ppa = generic_to_addr_mode(dev, ppa); + ppa = generic_to_dev_addr(dev, ppa); ret = dev->ops->get_bb_tbl(dev->q, ppa, dev->blks_per_lun, @@ -304,10 +304,10 @@ static void gennvm_addr_to_generic_mode(struct nvm_dev *dev, struct nvm_rq *rqd) if (rqd->nr_pages > 1) { for (i = 0; i < rqd->nr_pages; i++) - rqd->ppa_list[i] = addr_to_generic_mode(dev, + rqd->ppa_list[i] = dev_to_generic_addr(dev, rqd->ppa_list[i]); } else { - rqd->ppa_addr = addr_to_generic_mode(dev, rqd->ppa_addr); + rqd->ppa_addr = dev_to_generic_addr(dev, rqd->ppa_addr); } } @@ -317,10 +317,10 @@ static void gennvm_generic_to_addr_mode(struct nvm_dev *dev, struct nvm_rq *rqd) if (rqd->nr_pages > 1) { for (i = 0; i < rqd->nr_pages; i++) - rqd->ppa_list[i] = generic_to_addr_mode(dev, + rqd->ppa_list[i] = generic_to_dev_addr(dev, rqd->ppa_list[i]); } else { - rqd->ppa_addr = generic_to_addr_mode(dev, rqd->ppa_addr); + rqd->ppa_addr = generic_to_dev_addr(dev, rqd->ppa_addr); } } diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index 7ba64c87ba1c..75e59c3a3f96 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -123,12 +123,42 @@ static u64 block_to_addr(struct rrpc *rrpc, struct rrpc_block *rblk) return blk->id * rrpc->dev->pgs_per_blk; } +static struct ppa_addr linear_to_generic_addr(struct nvm_dev *dev, + struct ppa_addr r) +{ + struct ppa_addr l; + int secs, pgs, blks, luns; + sector_t ppa = r.ppa; + + l.ppa = 0; + + div_u64_rem(ppa, dev->sec_per_pg, &secs); + l.g.sec = secs; + + sector_div(ppa, dev->sec_per_pg); + div_u64_rem(ppa, dev->sec_per_blk, &pgs); + l.g.pg = pgs; + + sector_div(ppa, dev->pgs_per_blk); + div_u64_rem(ppa, dev->blks_per_lun, &blks); + l.g.blk = blks; + + sector_div(ppa, dev->blks_per_lun); + div_u64_rem(ppa, dev->luns_per_chnl, &luns); + l.g.lun = luns; + + sector_div(ppa, dev->luns_per_chnl); + l.g.ch = ppa; + + return l; +} + static struct ppa_addr rrpc_ppa_to_gaddr(struct nvm_dev *dev, u64 addr) { struct ppa_addr paddr; paddr.ppa = addr; - return __linear_to_generic_addr(dev, paddr); + return linear_to_generic_addr(dev, paddr); } /* requires lun->lock taken */ diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 52b311cf694c..9069be811f82 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -198,8 +198,7 @@ struct nvme_nvm_id { __le32 cap; __le32 dom; struct nvme_nvm_addr_format ppaf; - __u8 ppat; - __u8 resv[223]; + __u8 resv[224]; struct nvme_nvm_id_group groups[4]; } __packed; diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index e6ef8aaf533f..cbe288acb1de 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -99,7 +99,6 @@ struct nvm_id { u32 cap; u32 dom; struct nvm_addr_format ppaf; - u8 ppat; struct nvm_id_group groups[4]; } __packed; @@ -119,39 +118,28 @@ struct nvm_tgt_instance { #define NVM_VERSION_MINOR 0 #define NVM_VERSION_PATCH 0 -#define NVM_SEC_BITS (8) -#define NVM_PL_BITS (6) -#define NVM_PG_BITS (16) #define NVM_BLK_BITS (16) -#define NVM_LUN_BITS (10) +#define NVM_PG_BITS (16) +#define NVM_SEC_BITS (8) +#define NVM_PL_BITS (8) +#define NVM_LUN_BITS (8) #define NVM_CH_BITS (8) struct ppa_addr { + /* Generic structure for all addresses */ union { - /* Channel-based PPA format in nand 4x2x2x2x8x10 */ - struct { - u64 ch : 4; - u64 sec : 2; /* 4 sectors per page */ - u64 pl : 2; /* 4 planes per LUN */ - u64 lun : 2; /* 4 LUNs per channel */ - u64 pg : 8; /* 256 pages per block */ - u64 blk : 10;/* 1024 blocks per plane */ - u64 resved : 36; - } chnl; - - /* Generic structure for all addresses */ struct { + u64 blk : NVM_BLK_BITS; + u64 pg : NVM_PG_BITS; u64 sec : NVM_SEC_BITS; u64 pl : NVM_PL_BITS; - u64 pg : NVM_PG_BITS; - u64 blk : NVM_BLK_BITS; u64 lun : NVM_LUN_BITS; u64 ch : NVM_CH_BITS; } g; u64 ppa; }; -} __packed; +}; struct nvm_rq { struct nvm_tgt_instance *ins; @@ -259,8 +247,7 @@ struct nvm_dev { int blks_per_lun; int sec_size; int oob_size; - int addr_mode; - struct nvm_addr_format addr_format; + struct nvm_addr_format ppaf; /* Calculated/Cached values. These do not reflect the actual usable * blocks at run-time. @@ -286,118 +273,45 @@ struct nvm_dev { char name[DISK_NAME_LEN]; }; -/* fallback conversion */ -static struct ppa_addr __generic_to_linear_addr(struct nvm_dev *dev, - struct ppa_addr r) -{ - struct ppa_addr l; - - l.ppa = r.g.sec + - r.g.pg * dev->sec_per_pg + - r.g.blk * (dev->pgs_per_blk * - dev->sec_per_pg) + - r.g.lun * (dev->blks_per_lun * - dev->pgs_per_blk * - dev->sec_per_pg) + - r.g.ch * (dev->blks_per_lun * - dev->pgs_per_blk * - dev->luns_per_chnl * - dev->sec_per_pg); - - return l; -} - -/* fallback conversion */ -static struct ppa_addr __linear_to_generic_addr(struct nvm_dev *dev, - struct ppa_addr r) -{ - struct ppa_addr l; - int secs, pgs, blks, luns; - sector_t ppa = r.ppa; - - l.ppa = 0; - - div_u64_rem(ppa, dev->sec_per_pg, &secs); - l.g.sec = secs; - - sector_div(ppa, dev->sec_per_pg); - div_u64_rem(ppa, dev->sec_per_blk, &pgs); - l.g.pg = pgs; - - sector_div(ppa, dev->pgs_per_blk); - div_u64_rem(ppa, dev->blks_per_lun, &blks); - l.g.blk = blks; - - sector_div(ppa, dev->blks_per_lun); - div_u64_rem(ppa, dev->luns_per_chnl, &luns); - l.g.lun = luns; - - sector_div(ppa, dev->luns_per_chnl); - l.g.ch = ppa; - - return l; -} - -static struct ppa_addr __generic_to_chnl_addr(struct ppa_addr r) +static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev, + struct ppa_addr r) { struct ppa_addr l; - l.ppa = 0; - - l.chnl.sec = r.g.sec; - l.chnl.pl = r.g.pl; - l.chnl.pg = r.g.pg; - l.chnl.blk = r.g.blk; - l.chnl.lun = r.g.lun; - l.chnl.ch = r.g.ch; + l.ppa = ((u64)r.g.blk) << dev->ppaf.blk_offset; + l.ppa |= ((u64)r.g.pg) << dev->ppaf.pg_offset; + l.ppa |= ((u64)r.g.sec) << dev->ppaf.sect_offset; + l.ppa |= ((u64)r.g.pl) << dev->ppaf.pln_offset; + l.ppa |= ((u64)r.g.lun) << dev->ppaf.lun_offset; + l.ppa |= ((u64)r.g.ch) << dev->ppaf.ch_offset; return l; } -static struct ppa_addr __chnl_to_generic_addr(struct ppa_addr r) +static inline struct ppa_addr dev_to_generic_addr(struct nvm_dev *dev, + struct ppa_addr r) { struct ppa_addr l; - l.ppa = 0; - - l.g.sec = r.chnl.sec; - l.g.pl = r.chnl.pl; - l.g.pg = r.chnl.pg; - l.g.blk = r.chnl.blk; - l.g.lun = r.chnl.lun; - l.g.ch = r.chnl.ch; + /* + * (r.ppa << X offset) & X len bitmask. X eq. blk, pg, etc. + */ + l.g.blk = (r.ppa >> dev->ppaf.blk_offset) & + (((1 << dev->ppaf.blk_len) - 1)); + l.g.pg |= (r.ppa >> dev->ppaf.pg_offset) & + (((1 << dev->ppaf.pg_len) - 1)); + l.g.sec |= (r.ppa >> dev->ppaf.sect_offset) & + (((1 << dev->ppaf.sect_len) - 1)); + l.g.pl |= (r.ppa >> dev->ppaf.pln_offset) & + (((1 << dev->ppaf.pln_len) - 1)); + l.g.lun |= (r.ppa >> dev->ppaf.lun_offset) & + (((1 << dev->ppaf.lun_len) - 1)); + l.g.ch |= (r.ppa >> dev->ppaf.ch_offset) & + (((1 << dev->ppaf.ch_len) - 1)); return l; } -static inline struct ppa_addr addr_to_generic_mode(struct nvm_dev *dev, - struct ppa_addr gppa) -{ - switch (dev->addr_mode) { - case NVM_ADDRMODE_LINEAR: - return __linear_to_generic_addr(dev, gppa); - case NVM_ADDRMODE_CHANNEL: - return __chnl_to_generic_addr(gppa); - default: - BUG(); - } - return gppa; -} - -static inline struct ppa_addr generic_to_addr_mode(struct nvm_dev *dev, - struct ppa_addr gppa) -{ - switch (dev->addr_mode) { - case NVM_ADDRMODE_LINEAR: - return __generic_to_linear_addr(dev, gppa); - case NVM_ADDRMODE_CHANNEL: - return __generic_to_chnl_addr(gppa); - default: - BUG(); - } - return gppa; -} - static inline int ppa_empty(struct ppa_addr ppa_addr) { return (ppa_addr.ppa == ADDR_EMPTY); -- cgit v1.2.3 From 451c2b5caf37b526ae34a1081b71115e1de2d063 Mon Sep 17 00:00:00 2001 From: Aya Mahfouz Date: Wed, 18 Nov 2015 08:36:44 +0200 Subject: net: dns_resolver: convert time_t to time64_t Changes the definition of the pointer _expiry from time_t to time64_t. This is to handle the Y2038 problem where time_t will overflow in the year 2038. The change is safe because the kernel subsystems that call dns_query pass NULL. Signed-off-by: Arnd Bergmann Signed-off-by: Aya Mahfouz Signed-off-by: David S. Miller --- include/linux/dns_resolver.h | 2 +- net/dns_resolver/dns_query.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/dns_resolver.h b/include/linux/dns_resolver.h index cc92268af89a..6ac3cad9aef1 100644 --- a/include/linux/dns_resolver.h +++ b/include/linux/dns_resolver.h @@ -27,7 +27,7 @@ #ifdef __KERNEL__ extern int dns_query(const char *type, const char *name, size_t namelen, - const char *options, char **_result, time_t *_expiry); + const char *options, char **_result, time64_t *_expiry); #endif /* KERNEL */ diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c index 4677b6fa6dda..ecc28cff08ab 100644 --- a/net/dns_resolver/dns_query.c +++ b/net/dns_resolver/dns_query.c @@ -67,7 +67,7 @@ * Returns the size of the result on success, -ve error code otherwise. */ int dns_query(const char *type, const char *name, size_t namelen, - const char *options, char **_result, time_t *_expiry) + const char *options, char **_result, time64_t *_expiry) { struct key *rkey; const struct user_key_payload *upayload; -- cgit v1.2.3 From 851df3dc11136fde86ebd78ee7527cb43c7cd349 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 16 Nov 2015 22:34:58 +0100 Subject: scpi: hide get_scpi_ops in module from built-in code The scpi_clock driver can be built-in when CONFIG_COMPILE_TEST is set even when ARM_SCPI_PROTOCOL is a loadable module, and that results in a link error: drivers/built-in.o: In function `scpi_clocks_probe': (.text+0x14453c): undefined reference to `get_scpi_ops' Using #if IS_REACHABLE() around the get_scpi_ops() declaration makes it build successfully in this case for compile-testing, but the effect is the same as when ARM_SCPI_PROTOCOL is disabled, as the code will not be used. Signed-off-by: Arnd Bergmann Acked-by: Punit Agrawal --- include/linux/scpi_protocol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/scpi_protocol.h b/include/linux/scpi_protocol.h index 80af3cd35ae4..72ce932c69b2 100644 --- a/include/linux/scpi_protocol.h +++ b/include/linux/scpi_protocol.h @@ -71,7 +71,7 @@ struct scpi_ops { int (*sensor_get_value)(u16, u32 *); }; -#if IS_ENABLED(CONFIG_ARM_SCPI_PROTOCOL) +#if IS_REACHABLE(CONFIG_ARM_SCPI_PROTOCOL) struct scpi_ops *get_scpi_ops(void); #else static inline struct scpi_ops *get_scpi_ops(void) { return NULL; } -- cgit v1.2.3 From a35bb4458e5e5c9dc19a0daa0629409285f3b25e Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 19 Nov 2015 14:17:06 +0100 Subject: scsi: report 'INQUIRY result too short' once per host Some host adapters (e.g. Hyper-V storvsc) are known for not respecting the SPC-2/3/4 requirement for 'INQUIRY data (see table ...) shall contain at least 36 bytes'. As a result we get tons on 'scsi 0:7:1:1: scsi scan: INQUIRY result too short (5), using 36' messages on console. This can be problematic for slow consoles. Introduce short_inquiry flag in struct Scsi_Host to print the message once per host. Signed-off-by: Vitaly Kuznetsov Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_scan.c | 9 ++++++--- include/scsi/scsi_host.h | 3 +++ 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index d01e423ef44b..403a63310fb1 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -702,9 +702,12 @@ static int scsi_probe_lun(struct scsi_device *sdev, unsigned char *inq_result, * strings. */ if (sdev->inquiry_len < 36) { - sdev_printk(KERN_INFO, sdev, - "scsi scan: INQUIRY result too short (%d)," - " using 36\n", sdev->inquiry_len); + if (!sdev->host->short_inquiry) { + shost_printk(KERN_INFO, sdev->host, + "scsi scan: INQUIRY result too short (%d)," + " using 36\n", sdev->inquiry_len); + sdev->host->short_inquiry = 1; + } sdev->inquiry_len = 36; } diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index e113c757d555..3a22da73d59a 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -673,6 +673,9 @@ struct Scsi_Host { unsigned use_blk_mq:1; unsigned use_cmd_list:1; + /* Host responded with short (<36 bytes) INQUIRY result */ + unsigned short_inquiry:1; + /* * Optional work queue to be utilized by the transport */ -- cgit v1.2.3 From ac0621971a26526cad8cf9db7626d5e50562a441 Mon Sep 17 00:00:00 2001 From: Gregory Greenman Date: Tue, 17 Nov 2015 10:24:38 +0200 Subject: mac80211: always set the buf_size in AddBA req to 64 Advertising reordering window in ADDBA less than 64 can crash some APs, an example is LinkSys WRT120N (with FW v1.0.07 build 002 Jun 18 2012). On the other hand, a driver may need to limit Tx A-MPDU size for its own reasons, like specific HW limitations. Signed-off-by: Gregory Greenman Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 ++++-- net/mac80211/agg-tx.c | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 82045fca388b..760bc4d5a2cf 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2003,8 +2003,10 @@ enum ieee80211_hw_flags { * it shouldn't be set. * * @max_tx_aggregation_subframes: maximum number of subframes in an - * aggregate an HT driver will transmit, used by the peer as a - * hint to size its reorder buffer. + * aggregate an HT driver will transmit. Though ADDBA will advertise + * a constant value of 64 as some older APs can crash if the window + * size is smaller (an example is LinkSys WRT120N with FW v1.0.07 + * build 002 Jun 18 2012). * * @offchannel_tx_hw_queue: HW queue ID to use for offchannel TX * (if %IEEE80211_HW_QUEUE_CONTROL is set) diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index a758eb84e8f0..ff757181b0a8 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -500,7 +500,7 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) /* send AddBA request */ ieee80211_send_addba_request(sdata, sta->sta.addr, tid, tid_tx->dialog_token, start_seq_num, - local->hw.max_tx_aggregation_subframes, + IEEE80211_MAX_AMPDU_BUF, tid_tx->timeout); } @@ -926,6 +926,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local, amsdu = capab & IEEE80211_ADDBA_PARAM_AMSDU_MASK; tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2; buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6; + buf_size = min(buf_size, local->hw.max_tx_aggregation_subframes); mutex_lock(&sta->ampdu_mlme.mtx); -- cgit v1.2.3 From 0b59733b95f9d7af6bee6e6a4d0d444eb694c514 Mon Sep 17 00:00:00 2001 From: Javier Gonzalez Date: Fri, 20 Nov 2015 13:47:56 +0100 Subject: lightnvm: keep track of block counts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Maintain number of in use blocks, free blocks, and bad blocks in a per lun basis. This allows the upper layers to get information about the state of each lun. Also, account for blocks reserved to the device on the free block count. nr_free_blocks matches now the actual number of blocks on the free list when the device is booted. Signed-off-by: Javier Gonzalez Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/gennvm.c | 14 +++++++++++++- include/linux/lightnvm.h | 2 ++ 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index c0d0eb2357a8..43c01e0af887 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -60,6 +60,8 @@ static int gennvm_luns_init(struct nvm_dev *dev, struct gen_nvm *gn) lun->vlun.lun_id = i % dev->luns_per_chnl; lun->vlun.chnl_id = i / dev->luns_per_chnl; lun->vlun.nr_free_blocks = dev->blks_per_lun; + lun->vlun.nr_inuse_blocks = 0; + lun->vlun.nr_bad_blocks = 0; } return 0; } @@ -87,6 +89,7 @@ static int gennvm_block_bb(struct ppa_addr ppa, int nr_blocks, u8 *blks, } list_move_tail(&blk->list, &lun->bb_list); + lun->vlun.nr_bad_blocks++; } return 0; @@ -139,6 +142,7 @@ static int gennvm_block_map(u64 slba, u32 nlb, __le64 *entries, void *private) list_move_tail(&blk->list, &lun->used_list); blk->type = 1; lun->vlun.nr_free_blocks--; + lun->vlun.nr_inuse_blocks++; } } @@ -167,8 +171,10 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) block->id = cur_block_id++; /* First block is reserved for device */ - if (unlikely(lun_iter == 0 && blk_iter == 0)) + if (unlikely(lun_iter == 0 && blk_iter == 0)) { + lun->vlun.nr_free_blocks--; continue; + } list_add_tail(&block->list, &lun->free_list); } @@ -266,6 +272,7 @@ static struct nvm_block *gennvm_get_blk(struct nvm_dev *dev, blk->type = 1; lun->vlun.nr_free_blocks--; + lun->vlun.nr_inuse_blocks++; spin_unlock(&vlun->lock); out: @@ -283,16 +290,21 @@ static void gennvm_put_blk(struct nvm_dev *dev, struct nvm_block *blk) case 1: list_move_tail(&blk->list, &lun->free_list); lun->vlun.nr_free_blocks++; + lun->vlun.nr_inuse_blocks--; blk->type = 0; break; case 2: list_move_tail(&blk->list, &lun->bb_list); + lun->vlun.nr_bad_blocks++; + lun->vlun.nr_inuse_blocks--; break; default: WARN_ON_ONCE(1); pr_err("gennvm: erroneous block type (%lu -> %u)\n", blk->id, blk->type); list_move_tail(&blk->list, &lun->bb_list); + lun->vlun.nr_bad_blocks++; + lun->vlun.nr_inuse_blocks--; } spin_unlock(&vlun->lock); diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index cbe288acb1de..831a20cf070c 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -213,7 +213,9 @@ struct nvm_lun { int lun_id; int chnl_id; + unsigned int nr_inuse_blocks; /* Number of used blocks */ unsigned int nr_free_blocks; /* Number of unused blocks */ + unsigned int nr_bad_blocks; /* Number of bad blocks */ struct nvm_block *blocks; spinlock_t lock; -- cgit v1.2.3 From 2fde0e482db2b43bb4ed0e9aebfbe78ebcbbf5a6 Mon Sep 17 00:00:00 2001 From: Javier Gonzalez Date: Fri, 20 Nov 2015 13:47:57 +0100 Subject: lightnvm: add free and bad lun info to show luns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add free block, used block, and bad block information to the show debug interface. This information is used to debug how targets track blocks. Also, change debug function name to make it more generic. Signed-off-by: Javier Gonzalez Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 2 +- drivers/lightnvm/gennvm.c | 19 ++++++++++++++----- include/linux/lightnvm.h | 4 ++-- 3 files changed, 17 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index f61d325fd978..5178645ac42b 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -544,7 +544,7 @@ static int nvm_configure_show(const char *val) if (!dev->mt) return 0; - dev->mt->free_blocks_print(dev); + dev->mt->lun_info_print(dev); return 0; } diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index 43c01e0af887..e20e74ec6b91 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -464,15 +464,24 @@ static struct nvm_lun *gennvm_get_lun(struct nvm_dev *dev, int lunid) return &gn->luns[lunid].vlun; } -static void gennvm_free_blocks_print(struct nvm_dev *dev) +static void gennvm_lun_info_print(struct nvm_dev *dev) { struct gen_nvm *gn = dev->mp; struct gen_lun *lun; unsigned int i; - gennvm_for_each_lun(gn, lun, i) - pr_info("%s: lun%8u\t%u\n", - dev->name, i, lun->vlun.nr_free_blocks); + + gennvm_for_each_lun(gn, lun, i) { + spin_lock(&lun->vlun.lock); + + pr_info("%s: lun%8u\t%u\t%u\t%u\n", + dev->name, i, + lun->vlun.nr_free_blocks, + lun->vlun.nr_inuse_blocks, + lun->vlun.nr_bad_blocks); + + spin_unlock(&lun->vlun.lock); + } } static struct nvmm_type gennvm = { @@ -490,7 +499,7 @@ static struct nvmm_type gennvm = { .erase_blk = gennvm_erase_blk, .get_lun = gennvm_get_lun, - .free_blocks_print = gennvm_free_blocks_print, + .lun_info_print = gennvm_lun_info_print, }; static int __init gennvm_module_init(void) diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 831a20cf070c..3db5552b17d5 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -380,7 +380,7 @@ typedef int (nvmm_end_io_fn)(struct nvm_rq *, int); typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *, unsigned long); typedef struct nvm_lun *(nvmm_get_lun_fn)(struct nvm_dev *, int); -typedef void (nvmm_free_blocks_print_fn)(struct nvm_dev *); +typedef void (nvmm_lun_info_print_fn)(struct nvm_dev *); struct nvmm_type { const char *name; @@ -404,7 +404,7 @@ struct nvmm_type { nvmm_get_lun_fn *get_lun; /* Statistics */ - nvmm_free_blocks_print_fn *free_blocks_print; + nvmm_lun_info_print_fn *lun_info_print; struct list_head list; }; -- cgit v1.2.3 From 614e4c4ebc75517295bccd29b20ddbc5b52af6fc Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 12 Nov 2015 11:00:04 +0100 Subject: perf/core: Robustify the perf_cgroup_from_task() RCU checks This patch reinforces the lockdep checks performed by perf_cgroup_from_tsk() by passing the perf_event_context whenever possible. It is okay to not hold the RCU read lock when we know we hold the ctx->lock. This patch makes sure this property holds. In some functions, such as perf_cgroup_sched_in(), we do not pass the context because we are sure we are holding the RCU read lock. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Cc: edumazet@google.com Link: http://lkml.kernel.org/r/1447322404-10920-3-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 2 +- include/linux/perf_event.h | 6 ++++-- kernel/events/core.c | 20 +++++++++++++------- 3 files changed, 18 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index 377e8f8ed391..a316ca96f1b6 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -298,7 +298,7 @@ static bool __match_event(struct perf_event *a, struct perf_event *b) static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) { if (event->attach_state & PERF_ATTACH_TASK) - return perf_cgroup_from_task(event->hw.target); + return perf_cgroup_from_task(event->hw.target, event->ctx); return event->cgrp; } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index d841d33bcdc9..f9828a48f16a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -697,9 +697,11 @@ struct perf_cgroup { * if there is no cgroup event for the current CPU context. */ static inline struct perf_cgroup * -perf_cgroup_from_task(struct task_struct *task) +perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx) { - return container_of(task_css(task, perf_event_cgrp_id), + return container_of(task_css_check(task, perf_event_cgrp_id, + ctx ? lockdep_is_held(&ctx->lock) + : true), struct perf_cgroup, css); } #endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 60e71ca42c22..1ac857aff7b0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -435,7 +435,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) if (!is_cgroup_event(event)) return; - cgrp = perf_cgroup_from_task(current); + cgrp = perf_cgroup_from_task(current, event->ctx); /* * Do not update time when cgroup is not active */ @@ -458,7 +458,7 @@ perf_cgroup_set_timestamp(struct task_struct *task, if (!task || !ctx->nr_cgroups) return; - cgrp = perf_cgroup_from_task(task); + cgrp = perf_cgroup_from_task(task, ctx); info = this_cpu_ptr(cgrp->info); info->timestamp = ctx->timestamp; } @@ -521,8 +521,10 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) * set cgrp before ctxsw in to allow * event_filter_match() to not have to pass * task around + * we pass the cpuctx->ctx to perf_cgroup_from_task() + * because cgorup events are only per-cpu */ - cpuctx->cgrp = perf_cgroup_from_task(task); + cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx); cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); } perf_pmu_enable(cpuctx->ctx.pmu); @@ -542,15 +544,17 @@ static inline void perf_cgroup_sched_out(struct task_struct *task, rcu_read_lock(); /* * we come here when we know perf_cgroup_events > 0 + * we do not need to pass the ctx here because we know + * we are holding the rcu lock */ - cgrp1 = perf_cgroup_from_task(task); + cgrp1 = perf_cgroup_from_task(task, NULL); /* * next is NULL when called from perf_event_enable_on_exec() * that will systematically cause a cgroup_switch() */ if (next) - cgrp2 = perf_cgroup_from_task(next); + cgrp2 = perf_cgroup_from_task(next, NULL); /* * only schedule out current cgroup events if we know @@ -572,11 +576,13 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev, rcu_read_lock(); /* * we come here when we know perf_cgroup_events > 0 + * we do not need to pass the ctx here because we know + * we are holding the rcu lock */ - cgrp1 = perf_cgroup_from_task(task); + cgrp1 = perf_cgroup_from_task(task, NULL); /* prev can never be NULL */ - cgrp2 = perf_cgroup_from_task(prev); + cgrp2 = perf_cgroup_from_task(prev, NULL); /* * only need to schedule in cgroup events if we are changing -- cgit v1.2.3 From 90eec103b96e30401c0b846045bf8a1c7159b6da Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 16 Nov 2015 11:08:45 +0100 Subject: treewide: Remove old email address There were still a number of references to my old Red Hat email address in the kernel source. Remove these while keeping the Red Hat copyright notices intact. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/blackfin/kernel/perf_event.c | 2 +- arch/sh/kernel/perf_event.c | 2 +- arch/sparc/kernel/perf_event.c | 2 +- arch/tile/kernel/perf_event.c | 2 +- arch/x86/kernel/cpu/perf_event.c | 2 +- arch/x86/kernel/cpu/perf_event.h | 2 +- arch/x86/kernel/irq_work.c | 2 +- include/asm-generic/tlb.h | 2 +- include/linux/jump_label.h | 2 +- include/linux/lockdep.h | 2 +- include/linux/proportions.h | 2 +- include/linux/uprobes.h | 2 +- kernel/events/callchain.c | 2 +- kernel/events/core.c | 2 +- kernel/events/ring_buffer.c | 2 +- kernel/events/uprobes.c | 2 +- kernel/irq_work.c | 2 +- kernel/jump_label.c | 2 +- kernel/locking/lockdep.c | 2 +- kernel/locking/lockdep_proc.c | 2 +- kernel/sched/clock.c | 2 +- kernel/sched/fair.c | 2 +- kernel/trace/trace_event_perf.c | 2 +- lib/btree.c | 2 +- lib/proportions.c | 2 +- mm/page-writeback.c | 2 +- 26 files changed, 26 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/arch/blackfin/kernel/perf_event.c b/arch/blackfin/kernel/perf_event.c index 1e9c8b0bf486..170d786807c4 100644 --- a/arch/blackfin/kernel/perf_event.c +++ b/arch/blackfin/kernel/perf_event.c @@ -14,7 +14,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, * * ppc: diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c index 7cfd7f153966..4dca18347ee9 100644 --- a/arch/sh/kernel/perf_event.c +++ b/arch/sh/kernel/perf_event.c @@ -10,7 +10,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, * * ppc: diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index b0da5aedb336..3091267c5cc3 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -9,7 +9,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra */ #include diff --git a/arch/tile/kernel/perf_event.c b/arch/tile/kernel/perf_event.c index bb509cee3b59..8767060d70fb 100644 --- a/arch/tile/kernel/perf_event.c +++ b/arch/tile/kernel/perf_event.c @@ -21,7 +21,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, * Copyright (C) 2009 Google, Inc., Stephane Eranian */ diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 4562cf070c27..2bf79d7c97df 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -5,7 +5,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, * Copyright (C) 2009 Google, Inc., Stephane Eranian * diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index ffa7a92025e1..ab18b8a91583 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -5,7 +5,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, * Copyright (C) 2009 Google, Inc., Stephane Eranian * diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index dc5fa6a1e8d6..3512ba607361 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -1,7 +1,7 @@ /* * x86 specific code for irq_work * - * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra */ #include diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index db284bff29dc..9dbb739cafa0 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -5,7 +5,7 @@ * Copyright 2001 Red Hat, Inc. * Based on code from mm/memory.c Copyright Linus Torvalds and others. * - * Copyright 2011 Red Hat, Inc., Peter Zijlstra + * Copyright 2011 Red Hat, Inc., Peter Zijlstra * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 8dde55974f18..0536524bb9eb 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -5,7 +5,7 @@ * Jump label support * * Copyright (C) 2009-2012 Jason Baron - * Copyright (C) 2011-2012 Peter Zijlstra + * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra * * DEPRECATED API: * diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 70400dc7660f..c57e424d914b 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -2,7 +2,7 @@ * Runtime locking correctness validator * * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * see Documentation/locking/lockdep-design.txt for more details. */ diff --git a/include/linux/proportions.h b/include/linux/proportions.h index 5440f64d2942..21221338ad18 100644 --- a/include/linux/proportions.h +++ b/include/linux/proportions.h @@ -1,7 +1,7 @@ /* * FLoating proportions * - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * This file contains the public data structure and API definitions. */ diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 0bdc72f36905..4a29c75b146e 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -21,7 +21,7 @@ * Authors: * Srikar Dronamraju * Jim Keniston - * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra */ #include diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index d659487254d5..9c418002b8c1 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -3,7 +3,7 @@ * * Copyright (C) 2008 Thomas Gleixner * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright 2009 Paul Mackerras, IBM Corp. * * For licensing details see kernel-base/COPYING diff --git a/kernel/events/core.c b/kernel/events/core.c index 1ac857aff7b0..5854fcf7f05a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3,7 +3,7 @@ * * Copyright (C) 2008 Thomas Gleixner * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. * * For licensing details see kernel-base/COPYING diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index b5d1ea79c595..adfdc0536117 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -3,7 +3,7 @@ * * Copyright (C) 2008 Thomas Gleixner * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. * * For licensing details see kernel-base/COPYING diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4e5e9798aa0c..7dad84913abf 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -19,7 +19,7 @@ * Authors: * Srikar Dronamraju * Jim Keniston - * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra */ #include diff --git a/kernel/irq_work.c b/kernel/irq_work.c index cbf9fb899d92..bcf107ce0854 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra * * Provides a framework for enqueueing and running callbacks from hardirq * context. The enqueueing is NMI-safe. diff --git a/kernel/jump_label.c b/kernel/jump_label.c index f7dd15d537f9..05254eeb4b4e 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -2,7 +2,7 @@ * jump label support * * Copyright (C) 2009 Jason Baron - * Copyright (C) 2011 Peter Zijlstra + * Copyright (C) 2011 Peter Zijlstra * */ #include diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index deae3907ac1e..60ace56618f6 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -6,7 +6,7 @@ * Started by Ingo Molnar: * * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * this code maps all the lock dependencies as they occur in a live kernel * and will warn about the following classes of locking bugs: diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index d83d798bef95..dbb61a302548 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -6,7 +6,7 @@ * Started by Ingo Molnar: * * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * Code for /proc/lockdep and /proc/lockdep_stats: * diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c0a205101c23..caf4041f5b0a 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -1,7 +1,7 @@ /* * sched_clock for unstable cpu clocks * - * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra * * Updates and enhancements: * Copyright (C) 2008 Red Hat, Inc. Steven Rostedt diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f04fda8f669c..90e26b11deaa 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -17,7 +17,7 @@ * Copyright (C) 2007, Thomas Gleixner * * Adaptive scheduling granularity, math enhancements by Peter Zijlstra - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ #include diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index abfc903e741e..cc9f7a9319be 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -1,7 +1,7 @@ /* * trace event based perf event profiling/tracing * - * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra + * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra * Copyright (C) 2009-2010 Frederic Weisbecker */ diff --git a/lib/btree.c b/lib/btree.c index 4264871ea1a0..f93a945274af 100644 --- a/lib/btree.c +++ b/lib/btree.c @@ -5,7 +5,7 @@ * * Copyright (c) 2007-2008 Joern Engel * Bits and pieces stolen from Peter Zijlstra's code, which is - * Copyright 2007, Red Hat Inc. Peter Zijlstra + * Copyright 2007, Red Hat Inc. Peter Zijlstra * GPLv2 * * see http://programming.kicks-ass.net/kernel-patches/vma_lookup/btree.patch diff --git a/lib/proportions.c b/lib/proportions.c index 6f724298f67a..efa54f259ea9 100644 --- a/lib/proportions.c +++ b/lib/proportions.c @@ -1,7 +1,7 @@ /* * Floating proportions * - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * Description: * diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3e4d65445fa7..d15d88c8efa1 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2,7 +2,7 @@ * mm/page-writeback.c * * Copyright (C) 2002, Linus Torvalds. - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * Contains functions related to writing back dirty pages at the * address_space level. -- cgit v1.2.3 From 7d267278a9ece963d77eefec61630223fce08c6c Mon Sep 17 00:00:00 2001 From: Rainer Weikusat Date: Fri, 20 Nov 2015 22:07:23 +0000 Subject: unix: avoid use-after-free in ep_remove_wait_queue Rainer Weikusat writes: An AF_UNIX datagram socket being the client in an n:1 association with some server socket is only allowed to send messages to the server if the receive queue of this socket contains at most sk_max_ack_backlog datagrams. This implies that prospective writers might be forced to go to sleep despite none of the message presently enqueued on the server receive queue were sent by them. In order to ensure that these will be woken up once space becomes again available, the present unix_dgram_poll routine does a second sock_poll_wait call with the peer_wait wait queue of the server socket as queue argument (unix_dgram_recvmsg does a wake up on this queue after a datagram was received). This is inherently problematic because the server socket is only guaranteed to remain alive for as long as the client still holds a reference to it. In case the connection is dissolved via connect or by the dead peer detection logic in unix_dgram_sendmsg, the server socket may be freed despite "the polling mechanism" (in particular, epoll) still has a pointer to the corresponding peer_wait queue. There's no way to forcibly deregister a wait queue with epoll. Based on an idea by Jason Baron, the patch below changes the code such that a wait_queue_t belonging to the client socket is enqueued on the peer_wait queue of the server whenever the peer receive queue full condition is detected by either a sendmsg or a poll. A wake up on the peer queue is then relayed to the ordinary wait queue of the client socket via wake function. The connection to the peer wait queue is again dissolved if either a wake up is about to be relayed or the client socket reconnects or a dead peer is detected or the client socket is itself closed. This enables removing the second sock_poll_wait from unix_dgram_poll, thus avoiding the use-after-free, while still ensuring that no blocked writer sleeps forever. Signed-off-by: Rainer Weikusat Fixes: ec0d215f9420 ("af_unix: fix 'poll for write'/connected DGRAM sockets") Reviewed-by: Jason Baron Signed-off-by: David S. Miller --- include/net/af_unix.h | 1 + net/unix/af_unix.c | 183 ++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 165 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index b36d837c701e..2a91a0561a47 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -62,6 +62,7 @@ struct unix_sock { #define UNIX_GC_CANDIDATE 0 #define UNIX_GC_MAYBE_CYCLE 1 struct socket_wq peer_wq; + wait_queue_t peer_wake; }; static inline struct unix_sock *unix_sk(const struct sock *sk) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 955ec152cb71..4e95bdf973d9 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -326,6 +326,118 @@ found: return s; } +/* Support code for asymmetrically connected dgram sockets + * + * If a datagram socket is connected to a socket not itself connected + * to the first socket (eg, /dev/log), clients may only enqueue more + * messages if the present receive queue of the server socket is not + * "too large". This means there's a second writeability condition + * poll and sendmsg need to test. The dgram recv code will do a wake + * up on the peer_wait wait queue of a socket upon reception of a + * datagram which needs to be propagated to sleeping would-be writers + * since these might not have sent anything so far. This can't be + * accomplished via poll_wait because the lifetime of the server + * socket might be less than that of its clients if these break their + * association with it or if the server socket is closed while clients + * are still connected to it and there's no way to inform "a polling + * implementation" that it should let go of a certain wait queue + * + * In order to propagate a wake up, a wait_queue_t of the client + * socket is enqueued on the peer_wait queue of the server socket + * whose wake function does a wake_up on the ordinary client socket + * wait queue. This connection is established whenever a write (or + * poll for write) hit the flow control condition and broken when the + * association to the server socket is dissolved or after a wake up + * was relayed. + */ + +static int unix_dgram_peer_wake_relay(wait_queue_t *q, unsigned mode, int flags, + void *key) +{ + struct unix_sock *u; + wait_queue_head_t *u_sleep; + + u = container_of(q, struct unix_sock, peer_wake); + + __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait, + q); + u->peer_wake.private = NULL; + + /* relaying can only happen while the wq still exists */ + u_sleep = sk_sleep(&u->sk); + if (u_sleep) + wake_up_interruptible_poll(u_sleep, key); + + return 0; +} + +static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other) +{ + struct unix_sock *u, *u_other; + int rc; + + u = unix_sk(sk); + u_other = unix_sk(other); + rc = 0; + spin_lock(&u_other->peer_wait.lock); + + if (!u->peer_wake.private) { + u->peer_wake.private = other; + __add_wait_queue(&u_other->peer_wait, &u->peer_wake); + + rc = 1; + } + + spin_unlock(&u_other->peer_wait.lock); + return rc; +} + +static void unix_dgram_peer_wake_disconnect(struct sock *sk, + struct sock *other) +{ + struct unix_sock *u, *u_other; + + u = unix_sk(sk); + u_other = unix_sk(other); + spin_lock(&u_other->peer_wait.lock); + + if (u->peer_wake.private == other) { + __remove_wait_queue(&u_other->peer_wait, &u->peer_wake); + u->peer_wake.private = NULL; + } + + spin_unlock(&u_other->peer_wait.lock); +} + +static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk, + struct sock *other) +{ + unix_dgram_peer_wake_disconnect(sk, other); + wake_up_interruptible_poll(sk_sleep(sk), + POLLOUT | + POLLWRNORM | + POLLWRBAND); +} + +/* preconditions: + * - unix_peer(sk) == other + * - association is stable + */ +static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other) +{ + int connected; + + connected = unix_dgram_peer_wake_connect(sk, other); + + if (unix_recvq_full(other)) + return 1; + + if (connected) + unix_dgram_peer_wake_disconnect(sk, other); + + return 0; +} + static int unix_writable(const struct sock *sk) { return sk->sk_state != TCP_LISTEN && @@ -431,6 +543,8 @@ static void unix_release_sock(struct sock *sk, int embrion) skpair->sk_state_change(skpair); sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP); } + + unix_dgram_peer_wake_disconnect(sk, skpair); sock_put(skpair); /* It may now die */ unix_peer(sk) = NULL; } @@ -666,6 +780,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) INIT_LIST_HEAD(&u->link); mutex_init(&u->readlock); /* single task reading lock */ init_waitqueue_head(&u->peer_wait); + init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); unix_insert_socket(unix_sockets_unbound(sk), sk); out: if (sk == NULL) @@ -1033,6 +1148,8 @@ restart: if (unix_peer(sk)) { struct sock *old_peer = unix_peer(sk); unix_peer(sk) = other; + unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer); + unix_state_double_unlock(sk, other); if (other != old_peer) @@ -1472,6 +1589,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, struct scm_cookie scm; int max_level; int data_len = 0; + int sk_locked; wait_for_unix_gc(); err = scm_send(sock, msg, &scm, false); @@ -1550,12 +1668,14 @@ restart: goto out_free; } + sk_locked = 0; unix_state_lock(other); +restart_locked: err = -EPERM; if (!unix_may_send(sk, other)) goto out_unlock; - if (sock_flag(other, SOCK_DEAD)) { + if (unlikely(sock_flag(other, SOCK_DEAD))) { /* * Check with 1003.1g - what should * datagram error @@ -1563,10 +1683,14 @@ restart: unix_state_unlock(other); sock_put(other); + if (!sk_locked) + unix_state_lock(sk); + err = 0; - unix_state_lock(sk); if (unix_peer(sk) == other) { unix_peer(sk) = NULL; + unix_dgram_peer_wake_disconnect_wakeup(sk, other); + unix_state_unlock(sk); unix_dgram_disconnected(sk, other); @@ -1592,21 +1716,38 @@ restart: goto out_unlock; } - if (unix_peer(other) != sk && unix_recvq_full(other)) { - if (!timeo) { - err = -EAGAIN; - goto out_unlock; + if (unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { + if (timeo) { + timeo = unix_wait_for_peer(other, timeo); + + err = sock_intr_errno(timeo); + if (signal_pending(current)) + goto out_free; + + goto restart; } - timeo = unix_wait_for_peer(other, timeo); + if (!sk_locked) { + unix_state_unlock(other); + unix_state_double_lock(sk, other); + } - err = sock_intr_errno(timeo); - if (signal_pending(current)) - goto out_free; + if (unix_peer(sk) != other || + unix_dgram_peer_wake_me(sk, other)) { + err = -EAGAIN; + sk_locked = 1; + goto out_unlock; + } - goto restart; + if (!sk_locked) { + sk_locked = 1; + goto restart_locked; + } } + if (unlikely(sk_locked)) + unix_state_unlock(sk); + if (sock_flag(other, SOCK_RCVTSTAMP)) __net_timestamp(skb); maybe_add_creds(skb, sock, other); @@ -1620,6 +1761,8 @@ restart: return len; out_unlock: + if (sk_locked) + unix_state_unlock(sk); unix_state_unlock(other); out_free: kfree_skb(skb); @@ -2476,14 +2619,16 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, return mask; writable = unix_writable(sk); - other = unix_peer_get(sk); - if (other) { - if (unix_peer(other) != sk) { - sock_poll_wait(file, &unix_sk(other)->peer_wait, wait); - if (unix_recvq_full(other)) - writable = 0; - } - sock_put(other); + if (writable) { + unix_state_lock(sk); + + other = unix_peer(sk); + if (other && unix_peer(other) != sk && + unix_recvq_full(other) && + unix_dgram_peer_wake_me(sk, other)) + writable = 0; + + unix_state_unlock(sk); } if (writable) -- cgit v1.2.3 From c86b3de8c8b02d7e474fdc002c8df533b844524c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 17 Nov 2015 17:48:52 +0100 Subject: thermal: fix thermal_zone_bind_cooling_device prototype When the prototype for thermal_zone_bind_cooling_device changed, the static inline wrapper function was left alone, which in theory can cause build warnings: I have seen this error in the past: drivers/thermal/db8500_thermal.c: In function 'db8500_cdev_bind': drivers/thermal/db8500_thermal.c:78:9: error: too many arguments to function 'thermal_zone_bind_cooling_device' ret = thermal_zone_bind_cooling_device(thermal, i, cdev, while this one no longer shows up, there is no doubt that the prototype is still wrong, so let's just fix it anyway. Signed-off-by: Arnd Bergmann Fixes: 6cd9e9f629f1 ("thermal: of: fix cooling device weights in device tree") Signed-off-by: Eduardo Valentin --- include/linux/thermal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 4014a59828fc..613c29bd6baf 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -438,7 +438,8 @@ static inline void thermal_zone_device_unregister( static inline int thermal_zone_bind_cooling_device( struct thermal_zone_device *tz, int trip, struct thermal_cooling_device *cdev, - unsigned long upper, unsigned long lower) + unsigned long upper, unsigned long lower, + unsigned int weight) { return -ENODEV; } static inline int thermal_zone_unbind_cooling_device( struct thermal_zone_device *tz, int trip, -- cgit v1.2.3 From 0f42a6a9b807b092841f7e1b381f8c7e80a0d86a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 13 Nov 2015 09:38:48 +0100 Subject: nfs: use btrfs ioctl defintions for clone The NFS CLONE_RANGE defintion was wrong and thus never worked. Fix this by simply using the btrfs ioctl defintion. Signed-off-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/nfs4file.c | 10 ++++++---- include/uapi/linux/nfs.h | 11 ----------- 2 files changed, 6 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 61f1c1c02d06..135353074c25 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -7,6 +7,7 @@ #include #include #include +#include /* BTRFS_IOC_CLONE/BTRFS_IOC_CLONE_RANGE */ #include "delegation.h" #include "internal.h" #include "iostat.h" @@ -300,12 +301,13 @@ out_drop_write: static long nfs42_ioctl_clone_range(struct file *dst_file, void __user *argp) { - struct nfs_ioctl_clone_range_args args; + struct btrfs_ioctl_clone_range_args args; if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; - return nfs42_ioctl_clone(dst_file, args.src_fd, args.src_off, args.dst_off, args.count); + return nfs42_ioctl_clone(dst_file, args.src_fd, args.src_offset, + args.dest_offset, args.src_length); } #else static long nfs42_ioctl_clone(struct file *dst_file, unsigned long srcfd, @@ -325,9 +327,9 @@ long nfs4_ioctl(struct file *file, unsigned int cmd, unsigned long arg) void __user *argp = (void __user *)arg; switch (cmd) { - case NFS_IOC_CLONE: + case BTRFS_IOC_CLONE: return nfs42_ioctl_clone(file, arg, 0, 0, 0); - case NFS_IOC_CLONE_RANGE: + case BTRFS_IOC_CLONE_RANGE: return nfs42_ioctl_clone_range(file, argp); } diff --git a/include/uapi/linux/nfs.h b/include/uapi/linux/nfs.h index 654bae3f1a38..5e6296160361 100644 --- a/include/uapi/linux/nfs.h +++ b/include/uapi/linux/nfs.h @@ -33,17 +33,6 @@ #define NFS_PIPE_DIRNAME "nfs" -/* NFS ioctls */ -/* Let's follow btrfs lead on CLONE to avoid messing userspace */ -#define NFS_IOC_CLONE _IOW(0x94, 9, int) -#define NFS_IOC_CLONE_RANGE _IOW(0x94, 13, int) - -struct nfs_ioctl_clone_range_args { - __s64 src_fd; - __u64 src_off, count; - __u64 dst_off; -}; - /* * NFS stats. The good thing with these values is that NFSv3 errors are * a superset of NFSv2 errors (with the exception of NFSERR_WFLUSH which -- cgit v1.2.3 From 91ab4b4d16e6649fbbf65f303c0c4e20ed680bd1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 19 Nov 2015 14:30:26 -0500 Subject: nfs: use sliding delay when LAYOUTGET gets NFS4ERR_DELAY When LAYOUTGET gets NFS4ERR_DELAY, we currently will wait 15s before retrying the call. That is a _very_ long time, so add a timeout value to struct nfs4_layoutget and pass nfs4_async_handle_error a pointer to it. This allows the RPC engine to use a sliding delay window, instead of a 15s delay. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 +- include/linux/nfs_xdr.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 765a03559363..89818036f035 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7866,7 +7866,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) spin_unlock(&inode->i_lock); goto out_restart; } - if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) + if (nfs4_async_handle_error(task, server, state, &lgp->timeout) == -EAGAIN) goto out_restart; out: dprintk("<-- %s\n", __func__); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 570d630f98ae..11bbae44f4cb 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -251,6 +251,7 @@ struct nfs4_layoutget { struct nfs4_layoutget_res res; struct rpc_cred *cred; gfp_t gfp_flags; + long timeout; }; struct nfs4_getdeviceinfo_args { -- cgit v1.2.3 From c3ede03c881ca8ad618ad52c82b44ecb72c6e408 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Mon, 9 Nov 2015 16:43:09 +0100 Subject: gpu: ipu-v3: drop unused dmfc field from client platform data This field is never used, drop it. Signed-off-by: Philipp Zabel --- include/video/imx-ipu-v3.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/video/imx-ipu-v3.h b/include/video/imx-ipu-v3.h index 85dedca3dcfb..eeba75395f7d 100644 --- a/include/video/imx-ipu-v3.h +++ b/include/video/imx-ipu-v3.h @@ -343,7 +343,6 @@ struct ipu_client_platformdata { int di; int dc; int dp; - int dmfc; int dma[2]; }; -- cgit v1.2.3 From 264640fc2c5f4f913db5c73fa3eb1ead2c45e9d7 Mon Sep 17 00:00:00 2001 From: Michal Kubeček Date: Tue, 24 Nov 2015 15:07:11 +0100 Subject: ipv6: distinguish frag queues by device for multicast and link-local packets If a fragmented multicast packet is received on an ethernet device which has an active macvlan on top of it, each fragment is duplicated and received both on the underlying device and the macvlan. If some fragments for macvlan are processed before the whole packet for the underlying device is reassembled, the "overlapping fragments" test in ip6_frag_queue() discards the whole fragment queue. To resolve this, add device ifindex to the search key and require it to match reassembling multicast packets and packets to link-local addresses. Note: similar patch has been already submitted by Yoshifuji Hideaki in http://patchwork.ozlabs.org/patch/220979/ but got lost and forgotten for some reason. Signed-off-by: Michal Kubecek Signed-off-by: David S. Miller --- include/net/ipv6.h | 1 + net/ipv6/netfilter/nf_conntrack_reasm.c | 5 +++-- net/ipv6/reassembly.c | 10 +++++++--- 3 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index e1a10b0ac0b0..ea5a13ef85a6 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -490,6 +490,7 @@ struct ip6_create_arg { u32 user; const struct in6_addr *src; const struct in6_addr *dst; + int iif; u8 ecn; }; diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index d5efeb87350e..bab4441ed4e4 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -190,7 +190,7 @@ static void nf_ct_frag6_expire(unsigned long data) /* Creation primitives. */ static inline struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, struct in6_addr *src, - struct in6_addr *dst, u8 ecn) + struct in6_addr *dst, int iif, u8 ecn) { struct inet_frag_queue *q; struct ip6_create_arg arg; @@ -200,6 +200,7 @@ static inline struct frag_queue *fq_find(struct net *net, __be32 id, arg.user = user; arg.src = src; arg.dst = dst; + arg.iif = iif; arg.ecn = ecn; local_bh_disable(); @@ -601,7 +602,7 @@ struct sk_buff *nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 use fhdr = (struct frag_hdr *)skb_transport_header(clone); fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr, - ip6_frag_ecn(hdr)); + skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); if (fq == NULL) { pr_debug("Can't find and can't create new queue\n"); goto ret_orig; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 44e21a03cfc3..45f5ae51de65 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -108,7 +108,10 @@ bool ip6_frag_match(const struct inet_frag_queue *q, const void *a) return fq->id == arg->id && fq->user == arg->user && ipv6_addr_equal(&fq->saddr, arg->src) && - ipv6_addr_equal(&fq->daddr, arg->dst); + ipv6_addr_equal(&fq->daddr, arg->dst) && + (arg->iif == fq->iif || + !(ipv6_addr_type(arg->dst) & (IPV6_ADDR_MULTICAST | + IPV6_ADDR_LINKLOCAL))); } EXPORT_SYMBOL(ip6_frag_match); @@ -180,7 +183,7 @@ static void ip6_frag_expire(unsigned long data) static struct frag_queue * fq_find(struct net *net, __be32 id, const struct in6_addr *src, - const struct in6_addr *dst, u8 ecn) + const struct in6_addr *dst, int iif, u8 ecn) { struct inet_frag_queue *q; struct ip6_create_arg arg; @@ -190,6 +193,7 @@ fq_find(struct net *net, __be32 id, const struct in6_addr *src, arg.user = IP6_DEFRAG_LOCAL_DELIVER; arg.src = src; arg.dst = dst; + arg.iif = iif; arg.ecn = ecn; hash = inet6_hash_frag(id, src, dst); @@ -551,7 +555,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) } fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr, - ip6_frag_ecn(hdr)); + skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); if (fq) { int ret; -- cgit v1.2.3 From fbc416ff86183e2203cdf975e2881d7c164b0271 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 20 Nov 2015 12:12:21 +0100 Subject: arm64: fix building without CONFIG_UID16 As reported by Michal Simek, building an ARM64 kernel with CONFIG_UID16 disabled currently fails because the system call table still needs to reference the individual function entry points that are provided by kernel/sys_ni.c in this case, and the declarations are hidden inside of #ifdef CONFIG_UID16: arch/arm64/include/asm/unistd32.h:57:8: error: 'sys_lchown16' undeclared here (not in a function) __SYSCALL(__NR_lchown, sys_lchown16) I believe this problem only exists on ARM64, because older architectures tend to not need declarations when their system call table is built in assembly code, while newer architectures tend to not need UID16 support. ARM64 only uses these system calls for compatibility with 32-bit ARM binaries. This changes the CONFIG_UID16 check into CONFIG_HAVE_UID16, which is set unconditionally on ARM64 with CONFIG_COMPAT, so we see the declarations whenever we need them, but otherwise the behavior is unchanged. Fixes: af1839eb4bd4 ("Kconfig: clean up the long arch list for the UID16 config option") Signed-off-by: Arnd Bergmann Acked-by: Will Deacon Cc: stable@vger.kernel.org Signed-off-by: Catalin Marinas --- include/linux/syscalls.h | 2 +- include/linux/types.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a156b82dd14c..c2b66a277e98 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -524,7 +524,7 @@ asmlinkage long sys_chown(const char __user *filename, asmlinkage long sys_lchown(const char __user *filename, uid_t user, gid_t group); asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group); -#ifdef CONFIG_UID16 +#ifdef CONFIG_HAVE_UID16 asmlinkage long sys_chown16(const char __user *filename, old_uid_t user, old_gid_t group); asmlinkage long sys_lchown16(const char __user *filename, diff --git a/include/linux/types.h b/include/linux/types.h index 70d8500bddf1..70dd3dfde631 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -35,7 +35,7 @@ typedef __kernel_gid16_t gid16_t; typedef unsigned long uintptr_t; -#ifdef CONFIG_UID16 +#ifdef CONFIG_HAVE_UID16 /* This is defined by include/asm-{arch}/posix_types.h */ typedef __kernel_old_uid_t old_uid_t; typedef __kernel_old_gid_t old_gid_t; -- cgit v1.2.3 From c9da161c6517ba12154059d3b965c2cbaf16f90f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 24 Nov 2015 21:28:15 +0100 Subject: bpf: fix clearing on persistent program array maps Currently, when having map file descriptors pointing to program arrays, there's still the issue that we unconditionally flush program array contents via bpf_fd_array_map_clear() in bpf_map_release(). This happens when such a file descriptor is released and is independent of the map's refcount. Having this flush independent of the refcount is for a reason: there can be arbitrary complex dependency chains among tail calls, also circular ones (direct or indirect, nesting limit determined during runtime), and we need to make sure that the map drops all references to eBPF programs it holds, so that the map's refcount can eventually drop to zero and initiate its freeing. Btw, a walk of the whole dependency graph would not be possible for various reasons, one being complexity and another one inconsistency, i.e. new programs can be added to parts of the graph at any time, so there's no guaranteed consistent state for the time of such a walk. Now, the program array pinning itself works, but the issue is that each derived file descriptor on close would nevertheless call unconditionally into bpf_fd_array_map_clear(). Instead, keep track of users and postpone this flush until the last reference to a user is dropped. As this only concerns a subset of references (f.e. a prog array could hold a program that itself has reference on the prog array holding it, etc), we need to track them separately. Short analysis on the refcounting: on map creation time usercnt will be one, so there's no change in behaviour for bpf_map_release(), if unpinned. If we already fail in map_create(), we are immediately freed, and no file descriptor has been made public yet. In bpf_obj_pin_user(), we need to probe for a possible map in bpf_fd_probe_obj() already with a usercnt reference, so before we drop the reference on the fd with fdput(). Therefore, if actual pinning fails, we need to drop that reference again in bpf_any_put(), otherwise we keep holding it. When last reference drops on the inode, the bpf_any_put() in bpf_evict_inode() will take care of dropping the usercnt again. In the bpf_obj_get_user() case, the bpf_any_get() will grab a reference on the usercnt, still at a time when we have the reference on the path. Should we later on fail to grab a new file descriptor, bpf_any_put() will drop it, otherwise we hold it until bpf_map_release() time. Joint work with Alexei. Fixes: b2197755b263 ("bpf: add support for persistent maps/progs") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 5 ++++- kernel/bpf/inode.c | 6 +++--- kernel/bpf/syscall.c | 36 +++++++++++++++++++++++++----------- kernel/bpf/verifier.c | 3 +-- 4 files changed, 33 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index de464e6683b6..83d1926c61e4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -40,6 +40,7 @@ struct bpf_map { struct user_struct *user; const struct bpf_map_ops *ops; struct work_struct work; + atomic_t usercnt; }; struct bpf_map_type_list { @@ -167,8 +168,10 @@ struct bpf_prog *bpf_prog_get(u32 ufd); void bpf_prog_put(struct bpf_prog *prog); void bpf_prog_put_rcu(struct bpf_prog *prog); -struct bpf_map *bpf_map_get(u32 ufd); +struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); +void bpf_map_inc(struct bpf_map *map, bool uref); +void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); extern int sysctl_unprivileged_bpf_disabled; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index be6d726e31c9..5a8a797d50b7 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -34,7 +34,7 @@ static void *bpf_any_get(void *raw, enum bpf_type type) atomic_inc(&((struct bpf_prog *)raw)->aux->refcnt); break; case BPF_TYPE_MAP: - atomic_inc(&((struct bpf_map *)raw)->refcnt); + bpf_map_inc(raw, true); break; default: WARN_ON_ONCE(1); @@ -51,7 +51,7 @@ static void bpf_any_put(void *raw, enum bpf_type type) bpf_prog_put(raw); break; case BPF_TYPE_MAP: - bpf_map_put(raw); + bpf_map_put_with_uref(raw); break; default: WARN_ON_ONCE(1); @@ -64,7 +64,7 @@ static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type) void *raw; *type = BPF_TYPE_MAP; - raw = bpf_map_get(ufd); + raw = bpf_map_get_with_uref(ufd); if (IS_ERR(raw)) { *type = BPF_TYPE_PROG; raw = bpf_prog_get(ufd); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0d3313d02a7e..4a8f3c1d7da6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -82,6 +82,14 @@ static void bpf_map_free_deferred(struct work_struct *work) map->ops->map_free(map); } +static void bpf_map_put_uref(struct bpf_map *map) +{ + if (atomic_dec_and_test(&map->usercnt)) { + if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) + bpf_fd_array_map_clear(map); + } +} + /* decrement map refcnt and schedule it for freeing via workqueue * (unrelying map implementation ops->map_free() might sleep) */ @@ -93,17 +101,15 @@ void bpf_map_put(struct bpf_map *map) } } -static int bpf_map_release(struct inode *inode, struct file *filp) +void bpf_map_put_with_uref(struct bpf_map *map) { - struct bpf_map *map = filp->private_data; - - if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) - /* prog_array stores refcnt-ed bpf_prog pointers - * release them all when user space closes prog_array_fd - */ - bpf_fd_array_map_clear(map); - + bpf_map_put_uref(map); bpf_map_put(map); +} + +static int bpf_map_release(struct inode *inode, struct file *filp) +{ + bpf_map_put_with_uref(filp->private_data); return 0; } @@ -142,6 +148,7 @@ static int map_create(union bpf_attr *attr) return PTR_ERR(map); atomic_set(&map->refcnt, 1); + atomic_set(&map->usercnt, 1); err = bpf_map_charge_memlock(map); if (err) @@ -174,7 +181,14 @@ struct bpf_map *__bpf_map_get(struct fd f) return f.file->private_data; } -struct bpf_map *bpf_map_get(u32 ufd) +void bpf_map_inc(struct bpf_map *map, bool uref) +{ + atomic_inc(&map->refcnt); + if (uref) + atomic_inc(&map->usercnt); +} + +struct bpf_map *bpf_map_get_with_uref(u32 ufd) { struct fd f = fdget(ufd); struct bpf_map *map; @@ -183,7 +197,7 @@ struct bpf_map *bpf_map_get(u32 ufd) if (IS_ERR(map)) return map; - atomic_inc(&map->refcnt); + bpf_map_inc(map, true); fdput(f); return map; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c6073056badf..a7945d10b378 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2021,8 +2021,7 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) * will be used by the valid program until it's unloaded * and all maps are released in free_bpf_prog_info() */ - atomic_inc(&map->refcnt); - + bpf_map_inc(map, false); fdput(f); next_insn: insn++; -- cgit v1.2.3 From 7c7a0e945349a3d0d497d7f32db6ed33d4031110 Mon Sep 17 00:00:00 2001 From: Gabriele Paoloni Date: Wed, 11 Nov 2015 09:12:25 +0800 Subject: ARM/PCI: Move align_resource function pointer to pci_host_bridge structure Commit b3a72384fe29 ("ARM/PCI: Replace pci_sys_data->align_resource with global function pointer") introduced an ARM-specific align_resource() function pointer. This is not portable to other arches and doesn't work for platforms with two different PCIe host bridge controllers. Move the function pointer to the pci_host_bridge structure so each host bridge driver can specify its own align_resource() function. Signed-off-by: Gabriele Paoloni Signed-off-by: Bjorn Helgaas Reviewed-by: Arnd Bergmann --- arch/arm/kernel/bios32.c | 19 +++++++++++-------- drivers/pci/pci.h | 2 -- include/linux/pci.h | 9 +++++++++ 3 files changed, 20 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/arch/arm/kernel/bios32.c b/arch/arm/kernel/bios32.c index 6551d28c27e6..066f7f9ba411 100644 --- a/arch/arm/kernel/bios32.c +++ b/arch/arm/kernel/bios32.c @@ -17,11 +17,6 @@ #include static int debug_pci; -static resource_size_t (*align_resource)(struct pci_dev *dev, - const struct resource *res, - resource_size_t start, - resource_size_t size, - resource_size_t align) = NULL; /* * We can't use pci_get_device() here since we are @@ -461,7 +456,6 @@ static void pcibios_init_hw(struct device *parent, struct hw_pci *hw, sys->busnr = busnr; sys->swizzle = hw->swizzle; sys->map_irq = hw->map_irq; - align_resource = hw->align_resource; INIT_LIST_HEAD(&sys->resources); if (hw->private_data) @@ -470,6 +464,8 @@ static void pcibios_init_hw(struct device *parent, struct hw_pci *hw, ret = hw->setup(nr, sys); if (ret > 0) { + struct pci_host_bridge *host_bridge; + ret = pcibios_init_resources(nr, sys); if (ret) { kfree(sys); @@ -491,6 +487,9 @@ static void pcibios_init_hw(struct device *parent, struct hw_pci *hw, busnr = sys->bus->busn_res.end + 1; list_add(&sys->node, head); + + host_bridge = pci_find_host_bridge(sys->bus); + host_bridge->align_resource = hw->align_resource; } else { kfree(sys); if (ret < 0) @@ -578,14 +577,18 @@ resource_size_t pcibios_align_resource(void *data, const struct resource *res, { struct pci_dev *dev = data; resource_size_t start = res->start; + struct pci_host_bridge *host_bridge; if (res->flags & IORESOURCE_IO && start & 0x300) start = (start + 0x3ff) & ~0x3ff; start = (start + align - 1) & ~(align - 1); - if (align_resource) - return align_resource(dev, res, start, size, align); + host_bridge = pci_find_host_bridge(dev->bus); + + if (host_bridge->align_resource) + return host_bridge->align_resource(dev, res, + start, size, align); return start; } diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index fd2f03fa53f3..d390fc1475ec 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -337,6 +337,4 @@ static inline int pci_dev_specific_reset(struct pci_dev *dev, int probe) } #endif -struct pci_host_bridge *pci_find_host_bridge(struct pci_bus *bus); - #endif /* DRIVERS_PCI_H */ diff --git a/include/linux/pci.h b/include/linux/pci.h index e828e7b4afec..6ae25aae88fd 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -412,9 +412,18 @@ struct pci_host_bridge { void (*release_fn)(struct pci_host_bridge *); void *release_data; unsigned int ignore_reset_delay:1; /* for entire hierarchy */ + /* Resource alignment requirements */ + resource_size_t (*align_resource)(struct pci_dev *dev, + const struct resource *res, + resource_size_t start, + resource_size_t size, + resource_size_t align); }; #define to_pci_host_bridge(n) container_of(n, struct pci_host_bridge, dev) + +struct pci_host_bridge *pci_find_host_bridge(struct pci_bus *bus); + void pci_set_host_bridge_release(struct pci_host_bridge *bridge, void (*release_fn)(struct pci_host_bridge *), void *release_data); -- cgit v1.2.3 From ca369d51b3e1649be4a72addd6d6a168cfb3f537 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 13 Nov 2015 16:46:48 -0500 Subject: block/sd: Fix device-imposed transfer length limits Commit 4f258a46346c ("sd: Fix maximum I/O size for BLOCK_PC requests") had the unfortunate side-effect of removing an implicit clamp to BLK_DEF_MAX_SECTORS for REQ_TYPE_FS requests in the block layer code. This caused problems for some SMR drives. Debugging this issue revealed a few problems with the existing infrastructure since the block layer didn't know how to deal with device-imposed limits, only limits set by the I/O controller. - Introduce a new queue limit, max_dev_sectors, which is used by the ULD to signal the maximum sectors for a REQ_TYPE_FS request. - Ensure that max_dev_sectors is correctly stacked and taken into account when overriding max_sectors through sysfs. - Rework sd_read_block_limits() so it saves the max_xfer and opt_xfer values for later processing. - In sd_revalidate() set the queue's max_dev_sectors based on the MAXIMUM TRANSFER LENGTH value in the Block Limits VPD. If this value is not reported, fall back to a cap based on the CDB TRANSFER LENGTH field size. - In sd_revalidate(), use OPTIMAL TRANSFER LENGTH from the Block Limits VPD--if reported and sane--to signal the preferred device transfer size for FS requests. Otherwise use BLK_DEF_MAX_SECTORS. - blk_limits_max_hw_sectors() is no longer used and can be removed. Signed-off-by: Martin K. Petersen Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=93581 Reviewed-by: Christoph Hellwig Tested-by: sweeneygj@gmx.com Tested-by: Arzeets Tested-by: David Eisner Tested-by: Mario Kicherer Signed-off-by: Martin K. Petersen --- block/blk-settings.c | 36 ++++++++++++++++-------------------- block/blk-sysfs.c | 3 +++ drivers/scsi/sd.c | 46 ++++++++++++++++++++++++++++++---------------- drivers/scsi/sd.h | 1 + include/linux/blkdev.h | 2 +- 5 files changed, 51 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/block/blk-settings.c b/block/blk-settings.c index 7d8f129a1516..dd4973583978 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -91,7 +91,8 @@ void blk_set_default_limits(struct queue_limits *lim) lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; lim->virt_boundary_mask = 0; lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; - lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; + lim->max_sectors = lim->max_dev_sectors = lim->max_hw_sectors = + BLK_SAFE_MAX_SECTORS; lim->chunk_sectors = 0; lim->max_write_same_sectors = 0; lim->max_discard_sectors = 0; @@ -127,6 +128,7 @@ void blk_set_stacking_limits(struct queue_limits *lim) lim->max_hw_sectors = UINT_MAX; lim->max_segment_size = UINT_MAX; lim->max_sectors = UINT_MAX; + lim->max_dev_sectors = UINT_MAX; lim->max_write_same_sectors = UINT_MAX; } EXPORT_SYMBOL(blk_set_stacking_limits); @@ -214,8 +216,8 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 max_addr) EXPORT_SYMBOL(blk_queue_bounce_limit); /** - * blk_limits_max_hw_sectors - set hard and soft limit of max sectors for request - * @limits: the queue limits + * blk_queue_max_hw_sectors - set max sectors for a request for this queue + * @q: the request queue for the device * @max_hw_sectors: max hardware sectors in the usual 512b unit * * Description: @@ -224,13 +226,19 @@ EXPORT_SYMBOL(blk_queue_bounce_limit); * the device driver based upon the capabilities of the I/O * controller. * + * max_dev_sectors is a hard limit imposed by the storage device for + * READ/WRITE requests. It is set by the disk driver. + * * max_sectors is a soft limit imposed by the block layer for * filesystem type requests. This value can be overridden on a * per-device basis in /sys/block//queue/max_sectors_kb. * The soft limit can not exceed max_hw_sectors. **/ -void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_sectors) +void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) { + struct queue_limits *limits = &q->limits; + unsigned int max_sectors; + if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) { max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9); printk(KERN_INFO "%s: set to minimum %d\n", @@ -238,22 +246,9 @@ void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_ } limits->max_hw_sectors = max_hw_sectors; - limits->max_sectors = min_t(unsigned int, max_hw_sectors, - BLK_DEF_MAX_SECTORS); -} -EXPORT_SYMBOL(blk_limits_max_hw_sectors); - -/** - * blk_queue_max_hw_sectors - set max sectors for a request for this queue - * @q: the request queue for the device - * @max_hw_sectors: max hardware sectors in the usual 512b unit - * - * Description: - * See description for blk_limits_max_hw_sectors(). - **/ -void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) -{ - blk_limits_max_hw_sectors(&q->limits, max_hw_sectors); + max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors); + max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS); + limits->max_sectors = max_sectors; } EXPORT_SYMBOL(blk_queue_max_hw_sectors); @@ -527,6 +522,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); + t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); t->max_write_same_sectors = min(t->max_write_same_sectors, b->max_write_same_sectors); t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 3e44a9da2a13..55c637b9c42b 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -205,6 +205,9 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) if (ret < 0) return ret; + max_hw_sectors_kb = min_not_zero(max_hw_sectors_kb, (unsigned long) + q->limits.max_dev_sectors >> 1); + if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) return -EINVAL; diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index e868d39c39bb..7af47ed10d90 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -2238,11 +2238,8 @@ got_data: } } - if (sdkp->capacity > 0xffffffff) { + if (sdkp->capacity > 0xffffffff) sdp->use_16_for_rw = 1; - sdkp->max_xfer_blocks = SD_MAX_XFER_BLOCKS; - } else - sdkp->max_xfer_blocks = SD_DEF_XFER_BLOCKS; /* Rescale capacity to 512-byte units */ if (sector_size == 4096) @@ -2559,7 +2556,6 @@ static void sd_read_block_limits(struct scsi_disk *sdkp) { unsigned int sector_sz = sdkp->device->sector_size; const int vpd_len = 64; - u32 max_xfer_length; unsigned char *buffer = kmalloc(vpd_len, GFP_KERNEL); if (!buffer || @@ -2567,14 +2563,11 @@ static void sd_read_block_limits(struct scsi_disk *sdkp) scsi_get_vpd_page(sdkp->device, 0xb0, buffer, vpd_len)) goto out; - max_xfer_length = get_unaligned_be32(&buffer[8]); - if (max_xfer_length) - sdkp->max_xfer_blocks = max_xfer_length; - blk_queue_io_min(sdkp->disk->queue, get_unaligned_be16(&buffer[6]) * sector_sz); - blk_queue_io_opt(sdkp->disk->queue, - get_unaligned_be32(&buffer[12]) * sector_sz); + + sdkp->max_xfer_blocks = get_unaligned_be32(&buffer[8]); + sdkp->opt_xfer_blocks = get_unaligned_be32(&buffer[12]); if (buffer[3] == 0x3c) { unsigned int lba_count, desc_count; @@ -2723,6 +2716,11 @@ static int sd_try_extended_inquiry(struct scsi_device *sdp) return 0; } +static inline u32 logical_to_sectors(struct scsi_device *sdev, u32 blocks) +{ + return blocks << (ilog2(sdev->sector_size) - 9); +} + /** * sd_revalidate_disk - called the first time a new disk is seen, * performs disk spin up, read_capacity, etc. @@ -2732,8 +2730,9 @@ static int sd_revalidate_disk(struct gendisk *disk) { struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdp = sdkp->device; + struct request_queue *q = sdkp->disk->queue; unsigned char *buffer; - unsigned int max_xfer; + unsigned int dev_max, rw_max; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_revalidate_disk\n")); @@ -2781,11 +2780,26 @@ static int sd_revalidate_disk(struct gendisk *disk) */ sd_set_flush_flag(sdkp); - max_xfer = sdkp->max_xfer_blocks; - max_xfer <<= ilog2(sdp->sector_size) - 9; + /* Initial block count limit based on CDB TRANSFER LENGTH field size. */ + dev_max = sdp->use_16_for_rw ? SD_MAX_XFER_BLOCKS : SD_DEF_XFER_BLOCKS; + + /* Some devices report a maximum block count for READ/WRITE requests. */ + dev_max = min_not_zero(dev_max, sdkp->max_xfer_blocks); + q->limits.max_dev_sectors = logical_to_sectors(sdp, dev_max); + + /* + * Use the device's preferred I/O size for reads and writes + * unless the reported value is unreasonably large (or garbage). + */ + if (sdkp->opt_xfer_blocks && sdkp->opt_xfer_blocks <= dev_max && + sdkp->opt_xfer_blocks <= SD_DEF_XFER_BLOCKS) + rw_max = q->limits.io_opt = + logical_to_sectors(sdp, sdkp->opt_xfer_blocks); + else + rw_max = BLK_DEF_MAX_SECTORS; - sdkp->disk->queue->limits.max_sectors = - min_not_zero(queue_max_hw_sectors(sdkp->disk->queue), max_xfer); + /* Combine with controller limits */ + q->limits.max_sectors = min(rw_max, queue_max_hw_sectors(q)); set_capacity(disk, sdkp->capacity); sd_config_write_same(sdkp); diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 63ba5ca7f9a1..5f2a84aff29f 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -67,6 +67,7 @@ struct scsi_disk { atomic_t openers; sector_t capacity; /* size in 512-byte sectors */ u32 max_xfer_blocks; + u32 opt_xfer_blocks; u32 max_ws_blocks; u32 max_unmap_blocks; u32 unmap_granularity; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 38a5ff772a37..9dacb745fa96 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -253,6 +253,7 @@ struct queue_limits { unsigned long virt_boundary_mask; unsigned int max_hw_sectors; + unsigned int max_dev_sectors; unsigned int chunk_sectors; unsigned int max_sectors; unsigned int max_segment_size; @@ -948,7 +949,6 @@ extern struct request_queue *blk_init_allocated_queue(struct request_queue *, extern void blk_cleanup_queue(struct request_queue *); extern void blk_queue_make_request(struct request_queue *, make_request_fn *); extern void blk_queue_bounce_limit(struct request_queue *, u64); -extern void blk_limits_max_hw_sectors(struct queue_limits *, unsigned int); extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int); extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int); extern void blk_queue_max_segments(struct request_queue *, unsigned short); -- cgit v1.2.3 From 057085e522f8bf94c2e691a5b76880f68060f8ba Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Thu, 5 Nov 2015 23:37:59 -0800 Subject: target: Fix race for SCF_COMPARE_AND_WRITE_POST checking This patch addresses a race + use after free where the first stage of COMPARE_AND_WRITE in compare_and_write_callback() is rescheduled after the backend sends the secondary WRITE, resulting in second stage compare_and_write_post() callback completing in target_complete_ok_work() before the first can return. Because current code depends on checking se_cmd->se_cmd_flags after return from se_cmd->transport_complete_callback(), this results in first stage having SCF_COMPARE_AND_WRITE_POST set, which incorrectly falls through into second stage CAW processing code, eventually triggering a NULL pointer dereference due to use after free. To address this bug, pass in a new *post_ret parameter into se_cmd->transport_complete_callback(), and depend upon this value instead of ->se_cmd_flags to determine when to return or fall through into ->queue_status() code for CAW. Cc: Sagi Grimberg Cc: # v3.12+ Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_sbc.c | 13 +++++++++---- drivers/target/target_core_transport.c | 14 ++++++++------ include/target/target_core_base.h | 2 +- 3 files changed, 18 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/target/target_core_sbc.c b/drivers/target/target_core_sbc.c index 0b4b2a67d9f9..ae24d0fdcd76 100644 --- a/drivers/target/target_core_sbc.c +++ b/drivers/target/target_core_sbc.c @@ -371,7 +371,8 @@ sbc_setup_write_same(struct se_cmd *cmd, unsigned char *flags, struct sbc_ops *o return 0; } -static sense_reason_t xdreadwrite_callback(struct se_cmd *cmd, bool success) +static sense_reason_t xdreadwrite_callback(struct se_cmd *cmd, bool success, + int *post_ret) { unsigned char *buf, *addr; struct scatterlist *sg; @@ -437,7 +438,8 @@ sbc_execute_rw(struct se_cmd *cmd) cmd->data_direction); } -static sense_reason_t compare_and_write_post(struct se_cmd *cmd, bool success) +static sense_reason_t compare_and_write_post(struct se_cmd *cmd, bool success, + int *post_ret) { struct se_device *dev = cmd->se_dev; @@ -447,8 +449,10 @@ static sense_reason_t compare_and_write_post(struct se_cmd *cmd, bool success) * sent to the backend driver. */ spin_lock_irq(&cmd->t_state_lock); - if ((cmd->transport_state & CMD_T_SENT) && !cmd->scsi_status) + if ((cmd->transport_state & CMD_T_SENT) && !cmd->scsi_status) { cmd->se_cmd_flags |= SCF_COMPARE_AND_WRITE_POST; + *post_ret = 1; + } spin_unlock_irq(&cmd->t_state_lock); /* @@ -460,7 +464,8 @@ static sense_reason_t compare_and_write_post(struct se_cmd *cmd, bool success) return TCM_NO_SENSE; } -static sense_reason_t compare_and_write_callback(struct se_cmd *cmd, bool success) +static sense_reason_t compare_and_write_callback(struct se_cmd *cmd, bool success, + int *post_ret) { struct se_device *dev = cmd->se_dev; struct scatterlist *write_sg = NULL, *sg; diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 5bacc7b5ed6d..010b8c46f1ef 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -1658,7 +1658,7 @@ bool target_stop_cmd(struct se_cmd *cmd, unsigned long *flags) void transport_generic_request_failure(struct se_cmd *cmd, sense_reason_t sense_reason) { - int ret = 0; + int ret = 0, post_ret = 0; pr_debug("-----[ Storage Engine Exception for cmd: %p ITT: 0x%08llx" " CDB: 0x%02x\n", cmd, cmd->tag, cmd->t_task_cdb[0]); @@ -1680,7 +1680,7 @@ void transport_generic_request_failure(struct se_cmd *cmd, */ if ((cmd->se_cmd_flags & SCF_COMPARE_AND_WRITE) && cmd->transport_complete_callback) - cmd->transport_complete_callback(cmd, false); + cmd->transport_complete_callback(cmd, false, &post_ret); switch (sense_reason) { case TCM_NON_EXISTENT_LUN: @@ -2068,11 +2068,13 @@ static void target_complete_ok_work(struct work_struct *work) */ if (cmd->transport_complete_callback) { sense_reason_t rc; + bool caw = (cmd->se_cmd_flags & SCF_COMPARE_AND_WRITE); + bool zero_dl = !(cmd->data_length); + int post_ret = 0; - rc = cmd->transport_complete_callback(cmd, true); - if (!rc && !(cmd->se_cmd_flags & SCF_COMPARE_AND_WRITE_POST)) { - if ((cmd->se_cmd_flags & SCF_COMPARE_AND_WRITE) && - !cmd->data_length) + rc = cmd->transport_complete_callback(cmd, true, &post_ret); + if (!rc && !post_ret) { + if (caw && zero_dl) goto queue_rsp; return; diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index 0a2c74008e53..aabf0aca0171 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -474,7 +474,7 @@ struct se_cmd { struct completion cmd_wait_comp; const struct target_core_fabric_ops *se_tfo; sense_reason_t (*execute_cmd)(struct se_cmd *); - sense_reason_t (*transport_complete_callback)(struct se_cmd *, bool); + sense_reason_t (*transport_complete_callback)(struct se_cmd *, bool, int *); void *protocol_data; unsigned char *t_task_cdb; -- cgit v1.2.3 From 3a66d7dca186ebdef9b0bf55e216778fa598062c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 22 Oct 2015 16:02:14 -0700 Subject: kref: Remove kref_put_spinlock_irqsave() The last user is gone. Hence remove this function. Signed-off-by: Bart Van Assche Cc: Greg Kroah-Hartman Cc: Christoph Hellwig Cc: Joern Engel Signed-off-by: Nicholas Bellinger --- include/linux/kref.h | 33 --------------------------------- 1 file changed, 33 deletions(-) (limited to 'include') diff --git a/include/linux/kref.h b/include/linux/kref.h index 484604d184be..e15828fd71f1 100644 --- a/include/linux/kref.h +++ b/include/linux/kref.h @@ -19,7 +19,6 @@ #include #include #include -#include struct kref { atomic_t refcount; @@ -99,38 +98,6 @@ static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref) return kref_sub(kref, 1, release); } -/** - * kref_put_spinlock_irqsave - decrement refcount for object. - * @kref: object. - * @release: pointer to the function that will clean up the object when the - * last reference to the object is released. - * This pointer is required, and it is not acceptable to pass kfree - * in as this function. - * @lock: lock to take in release case - * - * Behaves identical to kref_put with one exception. If the reference count - * drops to zero, the lock will be taken atomically wrt dropping the reference - * count. The release function has to call spin_unlock() without _irqrestore. - */ -static inline int kref_put_spinlock_irqsave(struct kref *kref, - void (*release)(struct kref *kref), - spinlock_t *lock) -{ - unsigned long flags; - - WARN_ON(release == NULL); - if (atomic_add_unless(&kref->refcount, -1, 1)) - return 0; - spin_lock_irqsave(lock, flags); - if (atomic_dec_and_test(&kref->refcount)) { - release(kref); - local_irq_restore(flags); - return 1; - } - spin_unlock_irqrestore(lock, flags); - return 0; -} - static inline int kref_put_mutex(struct kref *kref, void (*release)(struct kref *kref), struct mutex *lock) -- cgit v1.2.3 From 08236c6bb2980561fba657c58fdc76f2865f236c Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Sat, 28 Nov 2015 16:49:27 +0100 Subject: lightnvm: unconverted ppa returned in get_bb_tbl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The get_bb_tbl function takes ppa as a generic address, which is converted to the ppa device address within the device driver. When the update_bbtbl callback is called from get_bb_tbl, the device specific ppa is used, instead of the generic ppa. Make sure to pass the generic ppa. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/gennvm.c | 3 +-- drivers/nvme/host/lightnvm.c | 4 +++- include/linux/lightnvm.h | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index 3969a9875e59..35dde84b71e9 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -75,7 +75,6 @@ static int gennvm_block_bb(struct ppa_addr ppa, int nr_blocks, u8 *blks, struct nvm_block *blk; int i; - ppa = dev_to_generic_addr(gn->dev, ppa); lun = &gn->luns[(dev->nr_luns * ppa.g.ch) + ppa.g.lun]; for (i = 0; i < nr_blocks; i++) { @@ -187,7 +186,7 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) ppa.g.lun = lun->vlun.id; ppa = generic_to_dev_addr(dev, ppa); - ret = dev->ops->get_bb_tbl(dev->q, ppa, + ret = dev->ops->get_bb_tbl(dev, ppa, dev->blks_per_lun, gennvm_block_bb, gn); if (ret) diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index b9e5cc74053f..06c336410235 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -355,10 +355,11 @@ out: return ret; } -static int nvme_nvm_get_bb_tbl(struct request_queue *q, struct ppa_addr ppa, +static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, int nr_blocks, nvm_bb_update_fn *update_bbtbl, void *priv) { + struct request_queue *q = nvmdev->q; struct nvme_ns *ns = q->queuedata; struct nvme_dev *dev = ns->dev; struct nvme_nvm_command c = {}; @@ -402,6 +403,7 @@ static int nvme_nvm_get_bb_tbl(struct request_queue *q, struct ppa_addr ppa, goto out; } + ppa = dev_to_generic_addr(nvmdev, ppa); ret = update_bbtbl(ppa, nr_blocks, bb_tbl->blk, priv); if (ret) { ret = -EINTR; diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 3db5552b17d5..c6916aec43b6 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -179,7 +179,7 @@ typedef int (nvm_bb_update_fn)(struct ppa_addr, int, u8 *, void *); typedef int (nvm_id_fn)(struct request_queue *, struct nvm_id *); typedef int (nvm_get_l2p_tbl_fn)(struct request_queue *, u64, u32, nvm_l2p_update_fn *, void *); -typedef int (nvm_op_bb_tbl_fn)(struct request_queue *, struct ppa_addr, int, +typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, int, nvm_bb_update_fn *, void *); typedef int (nvm_op_set_bb_fn)(struct request_queue *, struct nvm_rq *, int); typedef int (nvm_submit_io_fn)(struct request_queue *, struct nvm_rq *); -- cgit v1.2.3 From bf4e6b4e757488dee1b6a581f49c7ac34cd217f8 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 26 Nov 2015 08:46:57 +0100 Subject: block: Always check queue limits for cloned requests When a cloned request is retried on other queues it always needs to be checked against the queue limits of that queue. Otherwise the calculations for nr_phys_segments might be wrong, leading to a crash in scsi_init_sgtable(). To clarify this the patch renames blk_rq_check_limits() to blk_cloned_rq_check_limits() and removes the symbol export, as the new function should only be used for cloned requests and never exported. Cc: Mike Snitzer Cc: Ewan Milne Cc: Jeff Moyer Signed-off-by: Hannes Reinecke Fixes: e2a60da74 ("block: Clean up special command handling logic") Cc: stable@vger.kernel.org # 3.7+ Acked-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-core.c | 21 +++++++-------------- include/linux/blkdev.h | 1 - 2 files changed, 7 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/block/blk-core.c b/block/blk-core.c index 5131993b23a1..a0af4043dda2 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2114,7 +2114,8 @@ blk_qc_t submit_bio(int rw, struct bio *bio) EXPORT_SYMBOL(submit_bio); /** - * blk_rq_check_limits - Helper function to check a request for the queue limit + * blk_cloned_rq_check_limits - Helper function to check a cloned request + * for new the queue limits * @q: the queue * @rq: the request being checked * @@ -2125,20 +2126,13 @@ EXPORT_SYMBOL(submit_bio); * after it is inserted to @q, it should be checked against @q before * the insertion using this generic function. * - * This function should also be useful for request stacking drivers - * in some cases below, so export this function. * Request stacking drivers like request-based dm may change the queue - * limits while requests are in the queue (e.g. dm's table swapping). - * Such request stacking drivers should check those requests against - * the new queue limits again when they dispatch those requests, - * although such checkings are also done against the old queue limits - * when submitting requests. + * limits when retrying requests on other queues. Those requests need + * to be checked against the new queue limits again during dispatch. */ -int blk_rq_check_limits(struct request_queue *q, struct request *rq) +static int blk_cloned_rq_check_limits(struct request_queue *q, + struct request *rq) { - if (!rq_mergeable(rq)) - return 0; - if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, rq->cmd_flags)) { printk(KERN_ERR "%s: over max size limit.\n", __func__); return -EIO; @@ -2158,7 +2152,6 @@ int blk_rq_check_limits(struct request_queue *q, struct request *rq) return 0; } -EXPORT_SYMBOL_GPL(blk_rq_check_limits); /** * blk_insert_cloned_request - Helper for stacking drivers to submit a request @@ -2170,7 +2163,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) unsigned long flags; int where = ELEVATOR_INSERT_BACK; - if (blk_rq_check_limits(q, rq)) + if (blk_cloned_rq_check_limits(q, rq)) return -EIO; if (rq->rq_disk && diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c0d2b7927c1f..c06f8eaa42ff 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -773,7 +773,6 @@ extern void blk_rq_set_block_pc(struct request *); extern void blk_requeue_request(struct request_queue *, struct request *); extern void blk_add_request_payload(struct request *rq, struct page *page, unsigned int len); -extern int blk_rq_check_limits(struct request_queue *q, struct request *rq); extern int blk_lld_busy(struct request_queue *q); extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, struct bio_set *bs, gfp_t gfp_mask, -- cgit v1.2.3 From 880621c2605b82eb5af91a2c94223df6f5a3fb64 Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Sun, 22 Nov 2015 17:46:09 +0100 Subject: packet: Allow packets with only a header (but no payload) Commit 9c7077622dd91 ("packet: make packet_snd fail on len smaller than l2 header") added validation for the packet size in packet_snd. This change enforces that every packet needs a header (with at least hard_header_len bytes) plus a payload with at least one byte. Before this change the payload was optional. This fixes PPPoE connections which do not have a "Service" or "Host-Uniq" configured (which is violating the spec, but is still widely used in real-world setups). Those are currently failing with the following message: "pppd: packet size is too short (24 <= 24)" Signed-off-by: Martin Blumenstingl Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 ++- net/packet/af_packet.c | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 67bfac1abfc1..3b5d134e945a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1398,7 +1398,8 @@ enum netdev_priv_flags { * @dma: DMA channel * @mtu: Interface MTU value * @type: Interface hardware type - * @hard_header_len: Hardware header length + * @hard_header_len: Hardware header length, which means that this is the + * minimum size of a packet. * * @needed_headroom: Extra headroom the hardware may need, but not in all * cases can this be guaranteed diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 1cf928fb573e..992396aa635c 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2329,8 +2329,8 @@ static void tpacket_destruct_skb(struct sk_buff *skb) static bool ll_header_truncated(const struct net_device *dev, int len) { /* net device doesn't like empty head */ - if (unlikely(len <= dev->hard_header_len)) { - net_warn_ratelimited("%s: packet size is too short (%d <= %d)\n", + if (unlikely(len < dev->hard_header_len)) { + net_warn_ratelimited("%s: packet size is too short (%d < %d)\n", current->comm, len, dev->hard_header_len); return true; } -- cgit v1.2.3 From 304d888b29cf96f1dd53511ee686499cd8cdf249 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 27 Nov 2015 18:17:05 +0100 Subject: Revert "ipv6: ndisc: inherit metadata dst when creating ndisc requests" This reverts commit ab450605b35caa768ca33e86db9403229bf42be4. In IPv6, we cannot inherit the dst of the original dst. ndisc packets are IPv6 packets and may take another route than the original packet. This patch breaks the following scenario: a packet comes from eth0 and is forwarded through vxlan1. The encapsulated packet triggers an NS which cannot be sent because of the wrong route. CC: Jiri Benc CC: Thomas Graf Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/net/ndisc.h | 3 +-- net/ipv6/addrconf.c | 2 +- net/ipv6/ndisc.c | 10 +++------- net/ipv6/route.c | 2 +- 4 files changed, 6 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/ndisc.h b/include/net/ndisc.h index bf3937431030..2d8edaad29cb 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -181,8 +181,7 @@ void ndisc_cleanup(void); int ndisc_rcv(struct sk_buff *skb); void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, - const struct in6_addr *daddr, const struct in6_addr *saddr, - struct sk_buff *oskb); + const struct in6_addr *daddr, const struct in6_addr *saddr); void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d84742f003a9..61f26851655c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3642,7 +3642,7 @@ static void addrconf_dad_work(struct work_struct *w) /* send a neighbour solicitation for our addr */ addrconf_addr_solict_mult(&ifp->addr, &mcaddr); - ndisc_send_ns(ifp->idev->dev, &ifp->addr, &mcaddr, &in6addr_any, NULL); + ndisc_send_ns(ifp->idev->dev, &ifp->addr, &mcaddr, &in6addr_any); out: in6_ifa_put(ifp); rtnl_unlock(); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 3e0f855e1bea..d6161e1c48c8 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -556,8 +556,7 @@ static void ndisc_send_unsol_na(struct net_device *dev) } void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, - const struct in6_addr *daddr, const struct in6_addr *saddr, - struct sk_buff *oskb) + const struct in6_addr *daddr, const struct in6_addr *saddr) { struct sk_buff *skb; struct in6_addr addr_buf; @@ -593,9 +592,6 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, dev->dev_addr); - if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE) && oskb) - skb_dst_copy(skb, oskb); - ndisc_send_skb(skb, daddr, saddr); } @@ -682,12 +678,12 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) "%s: trying to ucast probe in NUD_INVALID: %pI6\n", __func__, target); } - ndisc_send_ns(dev, target, target, saddr, skb); + ndisc_send_ns(dev, target, target, saddr); } else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) { neigh_app_ns(neigh); } else { addrconf_addr_solict_mult(target, &mcaddr); - ndisc_send_ns(dev, target, &mcaddr, saddr, skb); + ndisc_send_ns(dev, target, &mcaddr, saddr); } } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6f01fe122abd..826e6aa44f8d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -523,7 +523,7 @@ static void rt6_probe_deferred(struct work_struct *w) container_of(w, struct __rt6_probe_work, work); addrconf_addr_solict_mult(&work->target, &mcaddr); - ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, NULL); + ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL); dev_put(work->dev); kfree(work); } -- cgit v1.2.3 From 9cd3e072b0be17446e37d7414eac8a3499e0601e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 29 Nov 2015 20:03:10 -0800 Subject: net: rename SOCK_ASYNC_NOSPACE and SOCK_ASYNC_WAITDATA This patch is a cleanup to make following patch easier to review. Goal is to move SOCK_ASYNC_NOSPACE and SOCK_ASYNC_WAITDATA from (struct socket)->flags to a (struct socket_wq)->flags to benefit from RCU protection in sock_wake_async() To ease backports, we rename both constants. Two new helpers, sk_set_bit(int nr, struct sock *sk) and sk_clear_bit(int net, struct sock *sk) are added so that following patch can change their implementation. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- crypto/algif_aead.c | 4 ++-- crypto/algif_skcipher.c | 6 +++--- drivers/net/macvtap.c | 4 ++-- drivers/net/tun.c | 4 ++-- fs/dlm/lowcomms.c | 4 ++-- include/linux/net.h | 6 +++--- include/net/sock.h | 10 ++++++++++ net/bluetooth/af_bluetooth.c | 6 +++--- net/caif/caif_socket.c | 4 ++-- net/core/datagram.c | 2 +- net/core/sock.c | 8 ++++---- net/core/stream.c | 4 ++-- net/dccp/proto.c | 3 +-- net/decnet/af_decnet.c | 8 ++++---- net/ipv4/tcp.c | 7 +++---- net/iucv/af_iucv.c | 2 +- net/nfc/llcp_sock.c | 2 +- net/rxrpc/ar-output.c | 2 +- net/sctp/socket.c | 2 +- net/socket.c | 4 ++-- net/sunrpc/xprtsock.c | 14 +++++++------- net/unix/af_unix.c | 6 +++--- 22 files changed, 60 insertions(+), 52 deletions(-) (limited to 'include') diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c index 0aa6fdfb448a..6d4d4569447e 100644 --- a/crypto/algif_aead.c +++ b/crypto/algif_aead.c @@ -125,7 +125,7 @@ static int aead_wait_for_data(struct sock *sk, unsigned flags) if (flags & MSG_DONTWAIT) return -EAGAIN; - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); for (;;) { if (signal_pending(current)) @@ -139,7 +139,7 @@ static int aead_wait_for_data(struct sock *sk, unsigned flags) } finish_wait(sk_sleep(sk), &wait); - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); return err; } diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c index af31a0ee4057..ca9efe17db1a 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -212,7 +212,7 @@ static int skcipher_wait_for_wmem(struct sock *sk, unsigned flags) if (flags & MSG_DONTWAIT) return -EAGAIN; - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); for (;;) { if (signal_pending(current)) @@ -258,7 +258,7 @@ static int skcipher_wait_for_data(struct sock *sk, unsigned flags) return -EAGAIN; } - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); for (;;) { if (signal_pending(current)) @@ -272,7 +272,7 @@ static int skcipher_wait_for_data(struct sock *sk, unsigned flags) } finish_wait(sk_sleep(sk), &wait); - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); return err; } diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 54036ae0a388..0fc521941c71 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -498,7 +498,7 @@ static void macvtap_sock_write_space(struct sock *sk) wait_queue_head_t *wqueue; if (!sock_writeable(sk) || - !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags)) + !test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags)) return; wqueue = sk_sleep(sk); @@ -585,7 +585,7 @@ static unsigned int macvtap_poll(struct file *file, poll_table * wait) mask |= POLLIN | POLLRDNORM; if (sock_writeable(&q->sk) || - (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock.flags) && + (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &q->sock.flags) && sock_writeable(&q->sk))) mask |= POLLOUT | POLLWRNORM; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index b1878faea397..f0db770e8b2f 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1040,7 +1040,7 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait) mask |= POLLIN | POLLRDNORM; if (sock_writeable(sk) || - (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags) && + (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) && sock_writeable(sk))) mask |= POLLOUT | POLLWRNORM; @@ -1488,7 +1488,7 @@ static void tun_sock_write_space(struct sock *sk) if (!sock_writeable(sk)) return; - if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags)) + if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags)) return; wqueue = sk_sleep(sk); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 87e9d796cf7d..3a37bd3f9637 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -421,7 +421,7 @@ static void lowcomms_write_space(struct sock *sk) if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) { con->sock->sk->sk_write_pending--; - clear_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags); + clear_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags); } if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) @@ -1448,7 +1448,7 @@ static void send_to_sock(struct connection *con) msg_flags); if (ret == -EAGAIN || ret == 0) { if (ret == -EAGAIN && - test_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags) && + test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) && !test_and_set_bit(CF_APP_LIMITED, &con->flags)) { /* Notify TCP that we're limited by the * application window size. diff --git a/include/linux/net.h b/include/linux/net.h index 70ac5e28e6b7..f514e4dd5521 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -34,8 +34,8 @@ struct inode; struct file; struct net; -#define SOCK_ASYNC_NOSPACE 0 -#define SOCK_ASYNC_WAITDATA 1 +#define SOCKWQ_ASYNC_NOSPACE 0 +#define SOCKWQ_ASYNC_WAITDATA 1 #define SOCK_NOSPACE 2 #define SOCK_PASSCRED 3 #define SOCK_PASSSEC 4 @@ -96,7 +96,7 @@ struct socket_wq { * struct socket - general BSD socket * @state: socket state (%SS_CONNECTED, etc) * @type: socket type (%SOCK_STREAM, etc) - * @flags: socket flags (%SOCK_ASYNC_NOSPACE, etc) + * @flags: socket flags (%SOCK_NOSPACE, etc) * @ops: protocol specific socket operations * @file: File back pointer for gc * @sk: internal networking protocol agnostic socket representation diff --git a/include/net/sock.h b/include/net/sock.h index 7f89e4ba18d1..c155d09d8af4 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2005,6 +2005,16 @@ static inline unsigned long sock_wspace(struct sock *sk) return amt; } +static inline void sk_set_bit(int nr, struct sock *sk) +{ + set_bit(nr, &sk->sk_socket->flags); +} + +static inline void sk_clear_bit(int nr, struct sock *sk) +{ + clear_bit(nr, &sk->sk_socket->flags); +} + static inline void sk_wake_async(struct sock *sk, int how, int band) { if (sock_flag(sk, SOCK_FASYNC)) diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index a3bffd1ec2b4..70306cc9d814 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -271,11 +271,11 @@ static long bt_sock_data_wait(struct sock *sk, long timeo) if (signal_pending(current) || !timeo) break; - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); } __set_current_state(TASK_RUNNING); @@ -441,7 +441,7 @@ unsigned int bt_sock_poll(struct file *file, struct socket *sock, if (!test_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags) && sock_writeable(sk)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index cc858919108e..aa209b1066c9 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -323,7 +323,7 @@ static long caif_stream_data_wait(struct sock *sk, long timeo) !timeo) break; - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); @@ -331,7 +331,7 @@ static long caif_stream_data_wait(struct sock *sk, long timeo) if (sock_flag(sk, SOCK_DEAD)) break; - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); } finish_wait(sk_sleep(sk), &wait); diff --git a/net/core/datagram.c b/net/core/datagram.c index 617088aee21d..d62af69ad844 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -785,7 +785,7 @@ unsigned int datagram_poll(struct file *file, struct socket *sock, if (sock_writeable(sk)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } diff --git a/net/core/sock.c b/net/core/sock.c index 1e4dd54bfb5a..9d79569935a3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1815,7 +1815,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo) { DEFINE_WAIT(wait); - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); for (;;) { if (!timeo) break; @@ -1861,7 +1861,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf) break; - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); err = -EAGAIN; if (!timeo) @@ -2048,9 +2048,9 @@ int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) DEFINE_WAIT(wait); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb); - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); finish_wait(sk_sleep(sk), &wait); return rc; } diff --git a/net/core/stream.c b/net/core/stream.c index d70f77a0c889..43309428644d 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -126,7 +126,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2; while (1) { - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); @@ -139,7 +139,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) } if (signal_pending(current)) goto do_interrupted; - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk_stream_memory_free(sk) && !vm_wait) break; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index b5cf13a28009..41e65804ddf5 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -339,8 +339,7 @@ unsigned int dccp_poll(struct file *file, struct socket *sock, if (sk_stream_is_writeable(sk)) { mask |= POLLOUT | POLLWRNORM; } else { /* send SIGIO later */ - set_bit(SOCK_ASYNC_NOSPACE, - &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); /* Race breaker. If space is freed after diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 675cf94e04f8..eebf5ac8ce18 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1747,9 +1747,9 @@ static int dn_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, } prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); sk_wait_event(sk, &timeo, dn_data_ready(sk, queue, flags, target)); - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); finish_wait(sk_sleep(sk), &wait); } @@ -2004,10 +2004,10 @@ static int dn_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) } prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); sk_wait_event(sk, &timeo, !dn_queue_too_long(scp, queue, flags)); - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); finish_wait(sk_sleep(sk), &wait); continue; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c1728771cf89..c82cca18c90f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -517,8 +517,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) if (sk_stream_is_writeable(sk)) { mask |= POLLOUT | POLLWRNORM; } else { /* send SIGIO later */ - set_bit(SOCK_ASYNC_NOSPACE, - &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); /* Race breaker. If space is freed after @@ -906,7 +905,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, goto out_err; } - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); mss_now = tcp_send_mss(sk, &size_goal, flags); copied = 0; @@ -1134,7 +1133,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) } /* This should be in poll */ - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); mss_now = tcp_send_mss(sk, &size_goal, flags); diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index fcb2752419c6..435608c4306d 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1483,7 +1483,7 @@ unsigned int iucv_sock_poll(struct file *file, struct socket *sock, if (sock_writeable(sk) && iucv_below_msglim(sk)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index b7de0da46acd..ecf0a0196f18 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -572,7 +572,7 @@ static unsigned int llcp_sock_poll(struct file *file, struct socket *sock, if (sock_writeable(sk) && sk->sk_state == LLCP_CONNECTED) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); pr_debug("mask 0x%x\n", mask); diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c index a40d3afe93b7..14c4e12c47b0 100644 --- a/net/rxrpc/ar-output.c +++ b/net/rxrpc/ar-output.c @@ -531,7 +531,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); /* this should be in poll */ - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) return -EPIPE; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 897c01c029ca..2353985d689c 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -6458,7 +6458,7 @@ unsigned int sctp_poll(struct file *file, struct socket *sock, poll_table *wait) if (sctp_writeable(sk)) { mask |= POLLOUT | POLLWRNORM; } else { - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); /* * Since the socket is not locked, the buffer * might be made available after the writeable check and diff --git a/net/socket.c b/net/socket.c index dd2c247c99e3..16be908205fc 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1072,11 +1072,11 @@ int sock_wake_async(struct socket *sock, int how, int band) } switch (how) { case SOCK_WAKE_WAITD: - if (test_bit(SOCK_ASYNC_WAITDATA, &sock->flags)) + if (test_bit(SOCKWQ_ASYNC_WAITDATA, &sock->flags)) break; goto call_kill; case SOCK_WAKE_SPACE: - if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags)) + if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sock->flags)) break; /* fall through */ case SOCK_WAKE_IO: diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 1d1a70498910..2ffaf6a79499 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -398,7 +398,7 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, if (unlikely(!sock)) return -ENOTSOCK; - clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags); + clear_bit(SOCKWQ_ASYNC_NOSPACE, &sock->flags); if (base != 0) { addr = NULL; addrlen = 0; @@ -442,7 +442,7 @@ static void xs_nospace_callback(struct rpc_task *task) struct sock_xprt *transport = container_of(task->tk_rqstp->rq_xprt, struct sock_xprt, xprt); transport->inet->sk_write_pending--; - clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags); } /** @@ -467,7 +467,7 @@ static int xs_nospace(struct rpc_task *task) /* Don't race with disconnect */ if (xprt_connected(xprt)) { - if (test_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags)) { + if (test_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags)) { /* * Notify TCP that we're limited by the application * window size @@ -478,7 +478,7 @@ static int xs_nospace(struct rpc_task *task) xprt_wait_for_buffer_space(task, xs_nospace_callback); } } else { - clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags); ret = -ENOTCONN; } @@ -626,7 +626,7 @@ process_status: case -EPERM: /* When the server has died, an ICMP port unreachable message * prompts ECONNREFUSED. */ - clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags); } return status; @@ -715,7 +715,7 @@ static int xs_tcp_send_request(struct rpc_task *task) case -EADDRINUSE: case -ENOBUFS: case -EPIPE: - clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags); } return status; @@ -1618,7 +1618,7 @@ static void xs_write_space(struct sock *sk) if (unlikely(!(xprt = xprt_from_sock(sk)))) return; - if (test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags) == 0) + if (test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sock->flags) == 0) return; xprt_write_space(xprt); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 6ced74690eee..45aebd966978 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2191,7 +2191,7 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, !timeo) break; - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); unix_state_unlock(sk); timeo = freezable_schedule_timeout(timeo); unix_state_lock(sk); @@ -2199,7 +2199,7 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, if (sock_flag(sk, SOCK_DEAD)) break; - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); } finish_wait(sk_sleep(sk), &wait); @@ -2683,7 +2683,7 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, if (writable) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } -- cgit v1.2.3 From ceb5d58b217098a657f3850b7a2640f995032e62 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 29 Nov 2015 20:03:11 -0800 Subject: net: fix sock_wake_async() rcu protection Dmitry provided a syzkaller (http://github.com/google/syzkaller) triggering a fault in sock_wake_async() when async IO is requested. Said program stressed af_unix sockets, but the issue is generic and should be addressed in core networking stack. The problem is that by the time sock_wake_async() is called, we should not access the @flags field of 'struct socket', as the inode containing this socket might be freed without further notice, and without RCU grace period. We already maintain an RCU protected structure, "struct socket_wq" so moving SOCKWQ_ASYNC_NOSPACE & SOCKWQ_ASYNC_WAITDATA into it is the safe route. It also reduces number of cache lines needing dirtying, so might provide a performance improvement anyway. In followup patches, we might move remaining flags (SOCK_NOSPACE, SOCK_PASSCRED, SOCK_PASSSEC) to save 8 bytes and let 'struct socket' being mostly read and let it being shared between cpus. Reported-by: Dmitry Vyukov Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/net.h | 7 ++++++- include/net/sock.h | 23 ++++++++++++++++------- net/core/stream.c | 2 +- net/sctp/socket.c | 24 ++++++++++++++---------- net/socket.c | 21 +++++++-------------- 5 files changed, 44 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/linux/net.h b/include/linux/net.h index f514e4dd5521..0b4ac7da583a 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -34,6 +34,10 @@ struct inode; struct file; struct net; +/* Historically, SOCKWQ_ASYNC_NOSPACE & SOCKWQ_ASYNC_WAITDATA were located + * in sock->flags, but moved into sk->sk_wq->flags to be RCU protected. + * Eventually all flags will be in sk->sk_wq_flags. + */ #define SOCKWQ_ASYNC_NOSPACE 0 #define SOCKWQ_ASYNC_WAITDATA 1 #define SOCK_NOSPACE 2 @@ -89,6 +93,7 @@ struct socket_wq { /* Note: wait MUST be first field of socket_wq */ wait_queue_head_t wait; struct fasync_struct *fasync_list; + unsigned long flags; /* %SOCKWQ_ASYNC_NOSPACE, etc */ struct rcu_head rcu; } ____cacheline_aligned_in_smp; @@ -202,7 +207,7 @@ enum { SOCK_WAKE_URG, }; -int sock_wake_async(struct socket *sk, int how, int band); +int sock_wake_async(struct socket_wq *sk_wq, int how, int band); int sock_register(const struct net_proto_family *fam); void sock_unregister(int family); int __sock_create(struct net *net, int family, int type, int proto, diff --git a/include/net/sock.h b/include/net/sock.h index c155d09d8af4..0434138c5f95 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -384,8 +384,10 @@ struct sock { int sk_rcvbuf; struct sk_filter __rcu *sk_filter; - struct socket_wq __rcu *sk_wq; - + union { + struct socket_wq __rcu *sk_wq; + struct socket_wq *sk_wq_raw; + }; #ifdef CONFIG_XFRM struct xfrm_policy *sk_policy[2]; #endif @@ -2005,20 +2007,27 @@ static inline unsigned long sock_wspace(struct sock *sk) return amt; } +/* Note: + * We use sk->sk_wq_raw, from contexts knowing this + * pointer is not NULL and cannot disappear/change. + */ static inline void sk_set_bit(int nr, struct sock *sk) { - set_bit(nr, &sk->sk_socket->flags); + set_bit(nr, &sk->sk_wq_raw->flags); } static inline void sk_clear_bit(int nr, struct sock *sk) { - clear_bit(nr, &sk->sk_socket->flags); + clear_bit(nr, &sk->sk_wq_raw->flags); } -static inline void sk_wake_async(struct sock *sk, int how, int band) +static inline void sk_wake_async(const struct sock *sk, int how, int band) { - if (sock_flag(sk, SOCK_FASYNC)) - sock_wake_async(sk->sk_socket, how, band); + if (sock_flag(sk, SOCK_FASYNC)) { + rcu_read_lock(); + sock_wake_async(rcu_dereference(sk->sk_wq), how, band); + rcu_read_unlock(); + } } /* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might diff --git a/net/core/stream.c b/net/core/stream.c index 43309428644d..b96f7a79e544 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -39,7 +39,7 @@ void sk_stream_write_space(struct sock *sk) wake_up_interruptible_poll(&wq->wait, POLLOUT | POLLWRNORM | POLLWRBAND); if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) - sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT); + sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); } } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 2353985d689c..5e35ef34008b 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -6801,26 +6801,30 @@ no_packet: static void __sctp_write_space(struct sctp_association *asoc) { struct sock *sk = asoc->base.sk; - struct socket *sock = sk->sk_socket; - if ((sctp_wspace(asoc) > 0) && sock) { - if (waitqueue_active(&asoc->wait)) - wake_up_interruptible(&asoc->wait); + if (sctp_wspace(asoc) <= 0) + return; + + if (waitqueue_active(&asoc->wait)) + wake_up_interruptible(&asoc->wait); - if (sctp_writeable(sk)) { - wait_queue_head_t *wq = sk_sleep(sk); + if (sctp_writeable(sk)) { + struct socket_wq *wq; - if (wq && waitqueue_active(wq)) - wake_up_interruptible(wq); + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq) { + if (waitqueue_active(&wq->wait)) + wake_up_interruptible(&wq->wait); /* Note that we try to include the Async I/O support * here by modeling from the current TCP/UDP code. * We have not tested with it yet. */ if (!(sk->sk_shutdown & SEND_SHUTDOWN)) - sock_wake_async(sock, - SOCK_WAKE_SPACE, POLL_OUT); + sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT); } + rcu_read_unlock(); } } diff --git a/net/socket.c b/net/socket.c index 16be908205fc..456fadb3d819 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1056,27 +1056,20 @@ static int sock_fasync(int fd, struct file *filp, int on) return 0; } -/* This function may be called only under socket lock or callback_lock or rcu_lock */ +/* This function may be called only under rcu_lock */ -int sock_wake_async(struct socket *sock, int how, int band) +int sock_wake_async(struct socket_wq *wq, int how, int band) { - struct socket_wq *wq; - - if (!sock) - return -1; - rcu_read_lock(); - wq = rcu_dereference(sock->wq); - if (!wq || !wq->fasync_list) { - rcu_read_unlock(); + if (!wq || !wq->fasync_list) return -1; - } + switch (how) { case SOCK_WAKE_WAITD: - if (test_bit(SOCKWQ_ASYNC_WAITDATA, &sock->flags)) + if (test_bit(SOCKWQ_ASYNC_WAITDATA, &wq->flags)) break; goto call_kill; case SOCK_WAKE_SPACE: - if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sock->flags)) + if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags)) break; /* fall through */ case SOCK_WAKE_IO: @@ -1086,7 +1079,7 @@ call_kill: case SOCK_WAKE_URG: kill_fasync(&wq->fasync_list, SIGURG, band); } - rcu_read_unlock(); + return 0; } EXPORT_SYMBOL(sock_wake_async); -- cgit v1.2.3 From 64031e3e8a5c042840c5123af695eec89f9e6a24 Mon Sep 17 00:00:00 2001 From: Hanjun Guo Date: Wed, 2 Dec 2015 15:44:22 +0800 Subject: ACPI / property: fix compile error for acpi_node_get_property_reference() when CONFIG_ACPI=n In commit 60ba032ed76e ("ACPI / property: Drop size_prop from acpi_dev_get_property_reference()"), the argument "const char *cells_name" was dropped, but forgot to update the stub function in no-ACPI case, it will lead to compile error when CONFIG_ACPI=n, easliy remove "const char *cells_name" to fix it. Fixes: 60ba032ed76e "ACPI / property: Drop size_prop from acpi_dev_get_property_reference()" Reported-by: Kejian Yan Signed-off-by: Hanjun Guo Acked-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 865d948c60e6..9e6f4bb4692f 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -782,8 +782,8 @@ static inline int acpi_dev_get_property(struct acpi_device *adev, } static inline int acpi_node_get_property_reference(struct fwnode_handle *fwnode, - const char *name, const char *cells_name, - size_t index, struct acpi_reference_args *args) + const char *name, size_t index, + struct acpi_reference_args *args) { return -ENXIO; } -- cgit v1.2.3 From 69030dd1c3671625c6f766af0b64a4bb4409ac3b Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 1 Dec 2015 16:52:14 -0800 Subject: cpufreq: use last policy after online for drivers with ->setpolicy For cpufreq drivers which use setpolicy interface, after offline->online the policy is set to default. This can be reproduced by setting the default policy of intel_pstate or longrun to ondemand and then change to "performance". After offline and online, the setpolicy will be called with the policy=ondemand. For drivers using governors this condition is handled by storing last_governor, during offline and restoring during online. The same should be done for drivers using setpolicy interface. Storing last_policy during offline and restoring during online. Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 14 ++++++++++---- include/linux/cpufreq.h | 1 + 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index a83c995a62df..8412ce5f93a7 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -976,10 +976,14 @@ static int cpufreq_init_policy(struct cpufreq_policy *policy) new_policy.governor = gov; - /* Use the default policy if its valid. */ - if (cpufreq_driver->setpolicy) - cpufreq_parse_governor(gov->name, &new_policy.policy, NULL); - + /* Use the default policy if there is no last_policy. */ + if (cpufreq_driver->setpolicy) { + if (policy->last_policy) + new_policy.policy = policy->last_policy; + else + cpufreq_parse_governor(gov->name, &new_policy.policy, + NULL); + } /* set default policy */ return cpufreq_set_policy(policy, &new_policy); } @@ -1330,6 +1334,8 @@ static void cpufreq_offline_prepare(unsigned int cpu) if (has_target()) strncpy(policy->last_governor, policy->governor->name, CPUFREQ_NAME_LEN); + else + policy->last_policy = policy->policy; } else if (cpu == policy->cpu) { /* Nominate new CPU */ policy->cpu = cpumask_any(policy->cpus); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index ef4c5b1a860f..177c7680c1a8 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -77,6 +77,7 @@ struct cpufreq_policy { unsigned int suspend_freq; /* freq to set during suspend */ unsigned int policy; /* see above */ + unsigned int last_policy; /* policy before unplug */ struct cpufreq_governor *governor; /* see below */ void *governor_data; bool governor_enabled; /* governor start/stop flag */ -- cgit v1.2.3 From 45f6fad84cc305103b28d73482b344d7f5b76f39 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 29 Nov 2015 19:37:57 -0800 Subject: ipv6: add complete rcu protection around np->opt This patch addresses multiple problems : UDP/RAW sendmsg() need to get a stable struct ipv6_txoptions while socket is not locked : Other threads can change np->opt concurrently. Dmitry posted a syzkaller (http://github.com/google/syzkaller) program desmonstrating use-after-free. Starting with TCP/DCCP lockless listeners, tcp_v6_syn_recv_sock() and dccp_v6_request_recv_sock() also need to use RCU protection to dereference np->opt once (before calling ipv6_dup_options()) This patch adds full RCU protection to np->opt Reported-by: Dmitry Vyukov Signed-off-by: Eric Dumazet Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/linux/ipv6.h | 2 +- include/net/ipv6.h | 21 ++++++++++++++++++++- net/dccp/ipv6.c | 33 +++++++++++++++++++++------------ net/ipv6/af_inet6.c | 13 +++++++++---- net/ipv6/datagram.c | 4 +++- net/ipv6/exthdrs.c | 3 ++- net/ipv6/inet6_connection_sock.c | 11 ++++++++--- net/ipv6/ipv6_sockglue.c | 33 ++++++++++++++++++++++----------- net/ipv6/raw.c | 8 ++++++-- net/ipv6/syncookies.c | 2 +- net/ipv6/tcp_ipv6.c | 28 +++++++++++++++++----------- net/ipv6/udp.c | 8 ++++++-- net/l2tp/l2tp_ip6.c | 8 ++++++-- 13 files changed, 122 insertions(+), 52 deletions(-) (limited to 'include') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 0ef2a97ccdb5..402753bccafa 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -227,7 +227,7 @@ struct ipv6_pinfo { struct ipv6_ac_socklist *ipv6_ac_list; struct ipv6_fl_socklist __rcu *ipv6_fl_list; - struct ipv6_txoptions *opt; + struct ipv6_txoptions __rcu *opt; struct sk_buff *pktoptions; struct sk_buff *rxpmtu; struct inet6_cork cork; diff --git a/include/net/ipv6.h b/include/net/ipv6.h index ea5a13ef85a6..9a5c9f013784 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -205,6 +205,7 @@ extern rwlock_t ip6_ra_lock; */ struct ipv6_txoptions { + atomic_t refcnt; /* Length of this structure */ int tot_len; @@ -217,7 +218,7 @@ struct ipv6_txoptions { struct ipv6_opt_hdr *dst0opt; struct ipv6_rt_hdr *srcrt; /* Routing Header */ struct ipv6_opt_hdr *dst1opt; - + struct rcu_head rcu; /* Option buffer, as read by IPV6_PKTOPTIONS, starts here. */ }; @@ -252,6 +253,24 @@ struct ipv6_fl_socklist { struct rcu_head rcu; }; +static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np) +{ + struct ipv6_txoptions *opt; + + rcu_read_lock(); + opt = rcu_dereference(np->opt); + if (opt && !atomic_inc_not_zero(&opt->refcnt)) + opt = NULL; + rcu_read_unlock(); + return opt; +} + +static inline void txopt_put(struct ipv6_txoptions *opt) +{ + if (opt && atomic_dec_and_test(&opt->refcnt)) + kfree_rcu(opt, rcu); +} + struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label); struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space, struct ip6_flowlabel *fl, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index db5fc2440a23..e7e0b9bc2a43 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -202,7 +202,9 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req security_req_classify_flow(req, flowi6_to_flowi(&fl6)); - final_p = fl6_update_dst(&fl6, np->opt, &final); + rcu_read_lock(); + final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); + rcu_read_unlock(); dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { @@ -219,7 +221,10 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req &ireq->ir_v6_loc_addr, &ireq->ir_v6_rmt_addr); fl6.daddr = ireq->ir_v6_rmt_addr; - err = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass); + rcu_read_lock(); + err = ip6_xmit(sk, skb, &fl6, rcu_dereference(np->opt), + np->tclass); + rcu_read_unlock(); err = net_xmit_eval(err); } @@ -387,6 +392,7 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *newnp; const struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_txoptions *opt; struct inet_sock *newinet; struct dccp6_sock *newdp6; struct sock *newsk; @@ -488,13 +494,15 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, * Yes, keeping reference count would be much more clever, but we make * one more one thing there: reattach optmem to newsk. */ - if (np->opt != NULL) - newnp->opt = ipv6_dup_options(newsk, np->opt); - + opt = rcu_dereference(np->opt); + if (opt) { + opt = ipv6_dup_options(newsk, opt); + RCU_INIT_POINTER(newnp->opt, opt); + } inet_csk(newsk)->icsk_ext_hdr_len = 0; - if (newnp->opt != NULL) - inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + - newnp->opt->opt_flen); + if (opt) + inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen + + opt->opt_flen; dccp_sync_mss(newsk, dst_mtu(dst)); @@ -757,6 +765,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, struct ipv6_pinfo *np = inet6_sk(sk); struct dccp_sock *dp = dccp_sk(sk); struct in6_addr *saddr = NULL, *final_p, final; + struct ipv6_txoptions *opt; struct flowi6 fl6; struct dst_entry *dst; int addr_type; @@ -856,7 +865,8 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.fl6_sport = inet->inet_sport; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - final_p = fl6_update_dst(&fl6, np->opt, &final); + opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); + final_p = fl6_update_dst(&fl6, opt, &final); dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { @@ -876,9 +886,8 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, __ip6_dst_store(sk, dst, NULL, NULL); icsk->icsk_ext_hdr_len = 0; - if (np->opt != NULL) - icsk->icsk_ext_hdr_len = (np->opt->opt_flen + - np->opt->opt_nflen); + if (opt) + icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; inet->inet_dport = usin->sin6_port; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 44bb66bde0e2..38d66ddfb937 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -428,9 +428,11 @@ void inet6_destroy_sock(struct sock *sk) /* Free tx options */ - opt = xchg(&np->opt, NULL); - if (opt) - sock_kfree_s(sk, opt, opt->tot_len); + opt = xchg((__force struct ipv6_txoptions **)&np->opt, NULL); + if (opt) { + atomic_sub(opt->tot_len, &sk->sk_omem_alloc); + txopt_put(opt); + } } EXPORT_SYMBOL_GPL(inet6_destroy_sock); @@ -659,7 +661,10 @@ int inet6_sk_rebuild_header(struct sock *sk) fl6.fl6_sport = inet->inet_sport; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - final_p = fl6_update_dst(&fl6, np->opt, &final); + rcu_read_lock(); + final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), + &final); + rcu_read_unlock(); dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index d70b0238f468..517c55b01ba8 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -167,8 +167,10 @@ ipv4_connected: security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - opt = flowlabel ? flowlabel->opt : np->opt; + rcu_read_lock(); + opt = flowlabel ? flowlabel->opt : rcu_dereference(np->opt); final_p = fl6_update_dst(&fl6, opt, &final); + rcu_read_unlock(); dst = ip6_dst_lookup_flow(sk, &fl6, final_p); err = 0; diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index ce203b0402be..ea7c4d64a00a 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -727,6 +727,7 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) *((char **)&opt2->dst1opt) += dif; if (opt2->srcrt) *((char **)&opt2->srcrt) += dif; + atomic_set(&opt2->refcnt, 1); } return opt2; } @@ -790,7 +791,7 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, return ERR_PTR(-ENOBUFS); memset(opt2, 0, tot_len); - + atomic_set(&opt2->refcnt, 1); opt2->tot_len = tot_len; p = (char *)(opt2 + 1); diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 5d1c7cee2cb2..3ff5208772bb 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -78,7 +78,9 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, memset(fl6, 0, sizeof(*fl6)); fl6->flowi6_proto = proto; fl6->daddr = ireq->ir_v6_rmt_addr; - final_p = fl6_update_dst(fl6, np->opt, &final); + rcu_read_lock(); + final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); + rcu_read_unlock(); fl6->saddr = ireq->ir_v6_loc_addr; fl6->flowi6_oif = ireq->ir_iif; fl6->flowi6_mark = ireq->ir_mark; @@ -142,7 +144,9 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, fl6->fl6_dport = inet->inet_dport; security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); - final_p = fl6_update_dst(fl6, np->opt, &final); + rcu_read_lock(); + final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); + rcu_read_unlock(); dst = __inet6_csk_dst_check(sk, np->dst_cookie); if (!dst) { @@ -175,7 +179,8 @@ int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused /* Restore final destination back after routing done */ fl6.daddr = sk->sk_v6_daddr; - res = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass); + res = ip6_xmit(sk, skb, &fl6, rcu_dereference(np->opt), + np->tclass); rcu_read_unlock(); return res; } diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 63e6956917c9..4449ad1f8114 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -111,7 +111,8 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk, icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } - opt = xchg(&inet6_sk(sk)->opt, opt); + opt = xchg((__force struct ipv6_txoptions **)&inet6_sk(sk)->opt, + opt); sk_dst_reset(sk); return opt; @@ -231,9 +232,12 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sk->sk_socket->ops = &inet_dgram_ops; sk->sk_family = PF_INET; } - opt = xchg(&np->opt, NULL); - if (opt) - sock_kfree_s(sk, opt, opt->tot_len); + opt = xchg((__force struct ipv6_txoptions **)&np->opt, + NULL); + if (opt) { + atomic_sub(opt->tot_len, &sk->sk_omem_alloc); + txopt_put(opt); + } pktopt = xchg(&np->pktoptions, NULL); kfree_skb(pktopt); @@ -403,7 +407,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW)) break; - opt = ipv6_renew_options(sk, np->opt, optname, + opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); + opt = ipv6_renew_options(sk, opt, optname, (struct ipv6_opt_hdr __user *)optval, optlen); if (IS_ERR(opt)) { @@ -432,8 +437,10 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = 0; opt = ipv6_update_options(sk, opt); sticky_done: - if (opt) - sock_kfree_s(sk, opt, opt->tot_len); + if (opt) { + atomic_sub(opt->tot_len, &sk->sk_omem_alloc); + txopt_put(opt); + } break; } @@ -486,6 +493,7 @@ sticky_done: break; memset(opt, 0, sizeof(*opt)); + atomic_set(&opt->refcnt, 1); opt->tot_len = sizeof(*opt) + optlen; retv = -EFAULT; if (copy_from_user(opt+1, optval, optlen)) @@ -502,8 +510,10 @@ update: retv = 0; opt = ipv6_update_options(sk, opt); done: - if (opt) - sock_kfree_s(sk, opt, opt->tot_len); + if (opt) { + atomic_sub(opt->tot_len, &sk->sk_omem_alloc); + txopt_put(opt); + } break; } case IPV6_UNICAST_HOPS: @@ -1110,10 +1120,11 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, case IPV6_RTHDR: case IPV6_DSTOPTS: { + struct ipv6_txoptions *opt; lock_sock(sk); - len = ipv6_getsockopt_sticky(sk, np->opt, - optname, optval, len); + opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); + len = ipv6_getsockopt_sticky(sk, opt, optname, optval, len); release_sock(sk); /* check if ipv6_getsockopt_sticky() returns err code */ if (len < 0) diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index dc65ec198f7c..99140986e887 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -733,6 +733,7 @@ static int raw6_getfrag(void *from, char *to, int offset, int len, int odd, static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { + struct ipv6_txoptions *opt_to_free = NULL; struct ipv6_txoptions opt_space; DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); struct in6_addr *daddr, *final_p, final; @@ -839,8 +840,10 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (!(opt->opt_nflen|opt->opt_flen)) opt = NULL; } - if (!opt) - opt = np->opt; + if (!opt) { + opt = txopt_get(np); + opt_to_free = opt; + } if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); @@ -906,6 +909,7 @@ done: dst_release(dst); out: fl6_sock_release(flowlabel); + txopt_put(opt_to_free); return err < 0 ? err : len; do_confirm: dst_confirm(dst); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index bb8f2fa1c7fb..eaf7ac496d50 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -222,7 +222,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_TCP; fl6.daddr = ireq->ir_v6_rmt_addr; - final_p = fl6_update_dst(&fl6, np->opt, &final); + final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); fl6.saddr = ireq->ir_v6_loc_addr; fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = ireq->ir_mark; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index c5429a636f1a..6a50bb4a0dae 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -120,6 +120,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct in6_addr *saddr = NULL, *final_p, final; + struct ipv6_txoptions *opt; struct flowi6 fl6; struct dst_entry *dst; int addr_type; @@ -235,7 +236,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.fl6_dport = usin->sin6_port; fl6.fl6_sport = inet->inet_sport; - final_p = fl6_update_dst(&fl6, np->opt, &final); + opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); + final_p = fl6_update_dst(&fl6, opt, &final); security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); @@ -263,9 +265,9 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, tcp_fetch_timewait_stamp(sk, dst); icsk->icsk_ext_hdr_len = 0; - if (np->opt) - icsk->icsk_ext_hdr_len = (np->opt->opt_flen + - np->opt->opt_nflen); + if (opt) + icsk->icsk_ext_hdr_len = opt->opt_flen + + opt->opt_nflen; tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); @@ -461,7 +463,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, if (np->repflow && ireq->pktopts) fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts)); - err = ip6_xmit(sk, skb, fl6, np->opt, np->tclass); + err = ip6_xmit(sk, skb, fl6, rcu_dereference(np->opt), + np->tclass); err = net_xmit_eval(err); } @@ -972,6 +975,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * struct inet_request_sock *ireq; struct ipv6_pinfo *newnp; const struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_txoptions *opt; struct tcp6_sock *newtcp6sk; struct inet_sock *newinet; struct tcp_sock *newtp; @@ -1098,13 +1102,15 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * but we make one more one thing there: reattach optmem to newsk. */ - if (np->opt) - newnp->opt = ipv6_dup_options(newsk, np->opt); - + opt = rcu_dereference(np->opt); + if (opt) { + opt = ipv6_dup_options(newsk, opt); + RCU_INIT_POINTER(newnp->opt, opt); + } inet_csk(newsk)->icsk_ext_hdr_len = 0; - if (newnp->opt) - inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + - newnp->opt->opt_flen); + if (opt) + inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen + + opt->opt_flen; tcp_ca_openreq_child(newsk, dst); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 01bcb49619ee..9da3287a3923 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1110,6 +1110,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); struct in6_addr *daddr, *final_p, final; struct ipv6_txoptions *opt = NULL; + struct ipv6_txoptions *opt_to_free = NULL; struct ip6_flowlabel *flowlabel = NULL; struct flowi6 fl6; struct dst_entry *dst; @@ -1263,8 +1264,10 @@ do_udp_sendmsg: opt = NULL; connected = 0; } - if (!opt) - opt = np->opt; + if (!opt) { + opt = txopt_get(np); + opt_to_free = opt; + } if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); @@ -1373,6 +1376,7 @@ release_dst: out: dst_release(dst); fl6_sock_release(flowlabel); + txopt_put(opt_to_free); if (!err) return len; /* diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index aca38d8aed8e..a2c8747d2936 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -486,6 +486,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) DECLARE_SOCKADDR(struct sockaddr_l2tpip6 *, lsa, msg->msg_name); struct in6_addr *daddr, *final_p, final; struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_txoptions *opt_to_free = NULL; struct ipv6_txoptions *opt = NULL; struct ip6_flowlabel *flowlabel = NULL; struct dst_entry *dst = NULL; @@ -575,8 +576,10 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) opt = NULL; } - if (opt == NULL) - opt = np->opt; + if (!opt) { + opt = txopt_get(np); + opt_to_free = opt; + } if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); @@ -631,6 +634,7 @@ done: dst_release(dst); out: fl6_sock_release(flowlabel); + txopt_put(opt_to_free); return err < 0 ? err : len; -- cgit v1.2.3 From 38ee8fb67c3457f36f5137073c4b8ac2436d2393 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Mon, 30 Nov 2015 12:17:06 -0200 Subject: sctp: convert sack_needed and sack_generation to bits They don't need to be any bigger than that and with this we start a new bitfield for tracking association runtime stuff, like zero window situation. Signed-off-by: Marcelo Ricardo Leitner Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 495c87e367b3..7bbb71081aeb 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -775,10 +775,10 @@ struct sctp_transport { hb_sent:1, /* Is the Path MTU update pending on this tranport */ - pmtu_pending:1; + pmtu_pending:1, - /* Has this transport moved the ctsn since we last sacked */ - __u32 sack_generation; + /* Has this transport moved the ctsn since we last sacked */ + sack_generation:1; u32 dst_cookie; struct flowi fl; @@ -1482,19 +1482,19 @@ struct sctp_association { prsctp_capable:1, /* Can peer do PR-SCTP? */ auth_capable:1; /* Is peer doing SCTP-AUTH? */ - /* Ack State : This flag indicates if the next received + /* sack_needed : This flag indicates if the next received * : packet is to be responded to with a - * : SACK. This is initializedto 0. When a packet - * : is received it is incremented. If this value + * : SACK. This is initialized to 0. When a packet + * : is received sack_cnt is incremented. If this value * : reaches 2 or more, a SACK is sent and the * : value is reset to 0. Note: This is used only * : when no DATA chunks are received out of * : order. When DATA chunks are out of order, * : SACK's are not delayed (see Section 6). */ - __u8 sack_needed; /* Do we need to sack the peer? */ + __u8 sack_needed:1, /* Do we need to sack the peer? */ + sack_generation:1; __u32 sack_cnt; - __u32 sack_generation; __u32 adaptation_ind; /* Adaptation Code point. */ -- cgit v1.2.3 From 1f7dd3e5a6e4f093017fff12232572ee1aa4639b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Dec 2015 10:18:21 -0500 Subject: cgroup: fix handling of multi-destination migration from subtree_control enabling Consider the following v2 hierarchy. P0 (+memory) --- P1 (-memory) --- A \- B P0 has memory enabled in its subtree_control while P1 doesn't. If both A and B contain processes, they would belong to the memory css of P1. Now if memory is enabled on P1's subtree_control, memory csses should be created on both A and B and A's processes should be moved to the former and B's processes the latter. IOW, enabling controllers can cause atomic migrations into different csses. The core cgroup migration logic has been updated accordingly but the controller migration methods haven't and still assume that all tasks migrate to a single target css; furthermore, the methods were fed the css in which subtree_control was updated which is the parent of the target csses. pids controller depends on the migration methods to move charges and this made the controller attribute charges to the wrong csses often triggering the following warning by driving a counter negative. WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40() Modules linked in: CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29 ... ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000 ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00 ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8 Call Trace: [] dump_stack+0x4e/0x82 [] warn_slowpath_common+0x82/0xc0 [] warn_slowpath_null+0x1a/0x20 [] pids_cancel.constprop.6+0x31/0x40 [] pids_can_attach+0x6d/0xf0 [] cgroup_taskset_migrate+0x6c/0x330 [] cgroup_migrate+0xf5/0x190 [] cgroup_attach_task+0x176/0x200 [] __cgroup_procs_write+0x2ad/0x460 [] cgroup_procs_write+0x14/0x20 [] cgroup_file_write+0x35/0x1c0 [] kernfs_fop_write+0x141/0x190 [] __vfs_write+0x28/0xe0 [] vfs_write+0xac/0x1a0 [] SyS_write+0x49/0xb0 [] entry_SYSCALL_64_fastpath+0x12/0x76 This patch fixes the bug by removing @css parameter from the three migration methods, ->can_attach, ->cancel_attach() and ->attach() and updating cgroup_taskset iteration helpers also return the destination css in addition to the task being migrated. All controllers are updated accordingly. * Controllers which don't care whether there are one or multiple target csses can be converted trivially. cpu, io, freezer, perf, netclassid and netprio fall in this category. * cpuset's current implementation assumes that there's single source and destination and thus doesn't support v2 hierarchy already. The only change made by this patchset is how that single destination css is obtained. * memory migration path already doesn't do anything on v2. How the single destination css is obtained is updated and the prep stage of mem_cgroup_can_attach() is reordered to accomodate the change. * pids is the only controller which was affected by this bug. It now correctly handles multi-destination migrations and no longer causes counter underflow from incorrect accounting. Signed-off-by: Tejun Heo Reported-and-tested-by: Daniel Wagner Cc: Aleksa Sarai --- block/blk-cgroup.c | 6 +++--- include/linux/cgroup-defs.h | 9 +++------ include/linux/cgroup.h | 33 +++++++++++++++++++++----------- kernel/cgroup.c | 43 +++++++++++++++++++++++++++++++++--------- kernel/cgroup_freezer.c | 6 +++--- kernel/cgroup_pids.c | 16 ++++++++-------- kernel/cpuset.c | 33 ++++++++++++++++++++------------ kernel/events/core.c | 6 +++--- kernel/sched/core.c | 12 ++++++------ mm/memcontrol.c | 45 ++++++++++++++++++++++---------------------- net/core/netclassid_cgroup.c | 11 ++++++----- net/core/netprio_cgroup.c | 9 +++++---- 12 files changed, 137 insertions(+), 92 deletions(-) (limited to 'include') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 5bcdfc10c23a..5a37188b559f 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1127,15 +1127,15 @@ void blkcg_exit_queue(struct request_queue *q) * of the main cic data structures. For now we allow a task to change * its cgroup only if it's the only owner of its ioc. */ -static int blkcg_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int blkcg_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *dst_css; struct io_context *ioc; int ret = 0; /* task_lock() is needed to avoid races with exit_io_context() */ - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, dst_css, tset) { task_lock(task); ioc = task->io_context; if (ioc && atomic_read(&ioc->nr_tasks) > 1) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 869fd4a3d28e..06b77f9dd3f2 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -422,12 +422,9 @@ struct cgroup_subsys { void (*css_reset)(struct cgroup_subsys_state *css); void (*css_e_css_changed)(struct cgroup_subsys_state *css); - int (*can_attach)(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset); - void (*cancel_attach)(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset); - void (*attach)(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset); + int (*can_attach)(struct cgroup_taskset *tset); + void (*cancel_attach)(struct cgroup_taskset *tset); + void (*attach)(struct cgroup_taskset *tset); int (*can_fork)(struct task_struct *task, void **priv_p); void (*cancel_fork)(struct task_struct *task, void *priv); void (*fork)(struct task_struct *task, void *priv); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f64083030ad5..cb91b44f5f78 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -120,8 +120,10 @@ struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *css); -struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); -struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); +struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, + struct cgroup_subsys_state **dst_cssp); +struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, + struct cgroup_subsys_state **dst_cssp); void css_task_iter_start(struct cgroup_subsys_state *css, struct css_task_iter *it); @@ -236,30 +238,39 @@ void css_task_iter_end(struct css_task_iter *it); /** * cgroup_taskset_for_each - iterate cgroup_taskset * @task: the loop cursor + * @dst_css: the destination css * @tset: taskset to iterate * * @tset may contain multiple tasks and they may belong to multiple - * processes. When there are multiple tasks in @tset, if a task of a - * process is in @tset, all tasks of the process are in @tset. Also, all - * are guaranteed to share the same source and destination csses. + * processes. + * + * On the v2 hierarchy, there may be tasks from multiple processes and they + * may not share the source or destination csses. + * + * On traditional hierarchies, when there are multiple tasks in @tset, if a + * task of a process is in @tset, all tasks of the process are in @tset. + * Also, all are guaranteed to share the same source and destination csses. * * Iteration is not in any specific order. */ -#define cgroup_taskset_for_each(task, tset) \ - for ((task) = cgroup_taskset_first((tset)); (task); \ - (task) = cgroup_taskset_next((tset))) +#define cgroup_taskset_for_each(task, dst_css, tset) \ + for ((task) = cgroup_taskset_first((tset), &(dst_css)); \ + (task); \ + (task) = cgroup_taskset_next((tset), &(dst_css))) /** * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset * @leader: the loop cursor + * @dst_css: the destination css * @tset: takset to iterate * * Iterate threadgroup leaders of @tset. For single-task migrations, @tset * may not contain any. */ -#define cgroup_taskset_for_each_leader(leader, tset) \ - for ((leader) = cgroup_taskset_first((tset)); (leader); \ - (leader) = cgroup_taskset_next((tset))) \ +#define cgroup_taskset_for_each_leader(leader, dst_css, tset) \ + for ((leader) = cgroup_taskset_first((tset), &(dst_css)); \ + (leader); \ + (leader) = cgroup_taskset_next((tset), &(dst_css))) \ if ((leader) != (leader)->group_leader) \ ; \ else diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5cea63fe4095..470f6536b9e8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2237,6 +2237,9 @@ struct cgroup_taskset { struct list_head src_csets; struct list_head dst_csets; + /* the subsys currently being processed */ + int ssid; + /* * Fields for cgroup_taskset_*() iteration. * @@ -2299,25 +2302,29 @@ static void cgroup_taskset_add(struct task_struct *task, /** * cgroup_taskset_first - reset taskset and return the first task * @tset: taskset of interest + * @dst_cssp: output variable for the destination css * * @tset iteration is initialized and the first task is returned. */ -struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) +struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, + struct cgroup_subsys_state **dst_cssp) { tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node); tset->cur_task = NULL; - return cgroup_taskset_next(tset); + return cgroup_taskset_next(tset, dst_cssp); } /** * cgroup_taskset_next - iterate to the next task in taskset * @tset: taskset of interest + * @dst_cssp: output variable for the destination css * * Return the next task in @tset. Iteration must have been initialized * with cgroup_taskset_first(). */ -struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) +struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, + struct cgroup_subsys_state **dst_cssp) { struct css_set *cset = tset->cur_cset; struct task_struct *task = tset->cur_task; @@ -2332,6 +2339,18 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) if (&task->cg_list != &cset->mg_tasks) { tset->cur_cset = cset; tset->cur_task = task; + + /* + * This function may be called both before and + * after cgroup_taskset_migrate(). The two cases + * can be distinguished by looking at whether @cset + * has its ->mg_dst_cset set. + */ + if (cset->mg_dst_cset) + *dst_cssp = cset->mg_dst_cset->subsys[tset->ssid]; + else + *dst_cssp = cset->subsys[tset->ssid]; + return task; } @@ -2367,7 +2386,8 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, /* check that we can legitimately attach to the cgroup */ for_each_e_css(css, i, dst_cgrp) { if (css->ss->can_attach) { - ret = css->ss->can_attach(css, tset); + tset->ssid = i; + ret = css->ss->can_attach(tset); if (ret) { failed_css = css; goto out_cancel_attach; @@ -2400,9 +2420,12 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, */ tset->csets = &tset->dst_csets; - for_each_e_css(css, i, dst_cgrp) - if (css->ss->attach) - css->ss->attach(css, tset); + for_each_e_css(css, i, dst_cgrp) { + if (css->ss->attach) { + tset->ssid = i; + css->ss->attach(tset); + } + } ret = 0; goto out_release_tset; @@ -2411,8 +2434,10 @@ out_cancel_attach: for_each_e_css(css, i, dst_cgrp) { if (css == failed_css) break; - if (css->ss->cancel_attach) - css->ss->cancel_attach(css, tset); + if (css->ss->cancel_attach) { + tset->ssid = i; + css->ss->cancel_attach(tset); + } } out_release_tset: spin_lock_bh(&css_set_lock); diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index ff02a8e51bb3..2d3df82c54f2 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -155,10 +155,10 @@ static void freezer_css_free(struct cgroup_subsys_state *css) * @freezer->lock. freezer_attach() makes the new tasks conform to the * current state and all following state changes can see the new tasks. */ -static void freezer_attach(struct cgroup_subsys_state *new_css, - struct cgroup_taskset *tset) +static void freezer_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *new_css; mutex_lock(&freezer_mutex); @@ -172,7 +172,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, * current state before executing the following - !frozen tasks may * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. */ - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, new_css, tset) { struct freezer *freezer = css_freezer(new_css); if (!(freezer->state & CGROUP_FREEZING)) { diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c index de3359a48dbb..8e27fc5dbb20 100644 --- a/kernel/cgroup_pids.c +++ b/kernel/cgroup_pids.c @@ -162,13 +162,13 @@ revert: return -EAGAIN; } -static int pids_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int pids_can_attach(struct cgroup_taskset *tset) { - struct pids_cgroup *pids = css_pids(css); struct task_struct *task; + struct cgroup_subsys_state *dst_css; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, dst_css, tset) { + struct pids_cgroup *pids = css_pids(dst_css); struct cgroup_subsys_state *old_css; struct pids_cgroup *old_pids; @@ -187,13 +187,13 @@ static int pids_can_attach(struct cgroup_subsys_state *css, return 0; } -static void pids_cancel_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void pids_cancel_attach(struct cgroup_taskset *tset) { - struct pids_cgroup *pids = css_pids(css); struct task_struct *task; + struct cgroup_subsys_state *dst_css; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, dst_css, tset) { + struct pids_cgroup *pids = css_pids(dst_css); struct cgroup_subsys_state *old_css; struct pids_cgroup *old_pids; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 10ae73611d80..02a8ea5c9963 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1429,15 +1429,16 @@ static int fmeter_getrate(struct fmeter *fmp) static struct cpuset *cpuset_attach_old_cs; /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ -static int cpuset_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int cpuset_can_attach(struct cgroup_taskset *tset) { - struct cpuset *cs = css_cs(css); + struct cgroup_subsys_state *css; + struct cpuset *cs; struct task_struct *task; int ret; /* used later by cpuset_attach() */ - cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset)); + cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); + cs = css_cs(css); mutex_lock(&cpuset_mutex); @@ -1447,7 +1448,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, css, tset) { ret = task_can_attach(task, cs->cpus_allowed); if (ret) goto out_unlock; @@ -1467,9 +1468,14 @@ out_unlock: return ret; } -static void cpuset_cancel_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cpuset_cancel_attach(struct cgroup_taskset *tset) { + struct cgroup_subsys_state *css; + struct cpuset *cs; + + cgroup_taskset_first(tset, &css); + cs = css_cs(css); + mutex_lock(&cpuset_mutex); css_cs(css)->attach_in_progress--; mutex_unlock(&cpuset_mutex); @@ -1482,16 +1488,19 @@ static void cpuset_cancel_attach(struct cgroup_subsys_state *css, */ static cpumask_var_t cpus_attach; -static void cpuset_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cpuset_attach(struct cgroup_taskset *tset) { /* static buf protected by cpuset_mutex */ static nodemask_t cpuset_attach_nodemask_to; struct task_struct *task; struct task_struct *leader; - struct cpuset *cs = css_cs(css); + struct cgroup_subsys_state *css; + struct cpuset *cs; struct cpuset *oldcs = cpuset_attach_old_cs; + cgroup_taskset_first(tset, &css); + cs = css_cs(css); + mutex_lock(&cpuset_mutex); /* prepare for attach */ @@ -1502,7 +1511,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, guarantee_online_mems(cs, &cpuset_attach_nodemask_to); - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, css, tset) { /* * can_attach beforehand should guarantee that this doesn't * fail. TODO: have a better way to handle failure here @@ -1518,7 +1527,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, * sleep and should be moved outside migration path proper. */ cpuset_attach_nodemask_to = cs->effective_mems; - cgroup_taskset_for_each_leader(leader, tset) { + cgroup_taskset_for_each_leader(leader, css, tset) { struct mm_struct *mm = get_task_mm(leader); if (mm) { diff --git a/kernel/events/core.c b/kernel/events/core.c index 36babfd20648..026305dfe523 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9456,12 +9456,12 @@ static int __perf_cgroup_move(void *info) return 0; } -static void perf_cgroup_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void perf_cgroup_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, tset) + cgroup_taskset_for_each(task, css, tset) task_function_call(task, __perf_cgroup_move, task); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4d568ac9319e..a9db4819e586 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8217,12 +8217,12 @@ static void cpu_cgroup_fork(struct task_struct *task, void *private) sched_move_task(task); } -static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, css, tset) { #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; @@ -8235,12 +8235,12 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, return 0; } -static void cpu_cgroup_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cpu_cgroup_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, tset) + cgroup_taskset_for_each(task, css, tset) sched_move_task(task); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9acfb165eb52..c92a65b2b4ab 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4779,23 +4779,18 @@ static void mem_cgroup_clear_mc(void) spin_unlock(&mc.lock); } -static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int mem_cgroup_can_attach(struct cgroup_taskset *tset) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct cgroup_subsys_state *css; + struct mem_cgroup *memcg; struct mem_cgroup *from; struct task_struct *leader, *p; struct mm_struct *mm; unsigned long move_flags; int ret = 0; - /* - * We are now commited to this value whatever it is. Changes in this - * tunable will only affect upcoming migrations, not the current one. - * So we need to save it, and keep it going. - */ - move_flags = READ_ONCE(memcg->move_charge_at_immigrate); - if (!move_flags) + /* charge immigration isn't supported on the default hierarchy */ + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return 0; /* @@ -4805,13 +4800,23 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, * multiple. */ p = NULL; - cgroup_taskset_for_each_leader(leader, tset) { + cgroup_taskset_for_each_leader(leader, css, tset) { WARN_ON_ONCE(p); p = leader; + memcg = mem_cgroup_from_css(css); } if (!p) return 0; + /* + * We are now commited to this value whatever it is. Changes in this + * tunable will only affect upcoming migrations, not the current one. + * So we need to save it, and keep it going. + */ + move_flags = READ_ONCE(memcg->move_charge_at_immigrate); + if (!move_flags) + return 0; + from = mem_cgroup_from_task(p); VM_BUG_ON(from == memcg); @@ -4842,8 +4847,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, return ret; } -static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) { if (mc.to) mem_cgroup_clear_mc(); @@ -4985,10 +4989,10 @@ retry: atomic_dec(&mc.from->moving_account); } -static void mem_cgroup_move_task(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void mem_cgroup_move_task(struct cgroup_taskset *tset) { - struct task_struct *p = cgroup_taskset_first(tset); + struct cgroup_subsys_state *css; + struct task_struct *p = cgroup_taskset_first(tset, &css); struct mm_struct *mm = get_task_mm(p); if (mm) { @@ -5000,17 +5004,14 @@ static void mem_cgroup_move_task(struct cgroup_subsys_state *css, mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ -static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int mem_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } -static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) { } -static void mem_cgroup_move_task(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void mem_cgroup_move_task(struct cgroup_taskset *tset) { } #endif diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 6441f47b1a8f..81cb3c72efe8 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -67,14 +67,15 @@ static int update_classid(const void *v, struct file *file, unsigned n) return 0; } -static void cgrp_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cgrp_attach(struct cgroup_taskset *tset) { - struct cgroup_cls_state *cs = css_cls_state(css); - void *v = (void *)(unsigned long)cs->classid; struct task_struct *p; + struct cgroup_subsys_state *css; + + cgroup_taskset_for_each(p, css, tset) { + struct cgroup_cls_state *cs = css_cls_state(css); + void *v = (void *)(unsigned long)cs->classid; - cgroup_taskset_for_each(p, tset) { task_lock(p); iterate_fd(p->files, 0, update_classid, v); task_unlock(p); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index cbd0a199bf52..40fd09fe06ae 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -218,13 +218,14 @@ static int update_netprio(const void *v, struct file *file, unsigned n) return 0; } -static void net_prio_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void net_prio_attach(struct cgroup_taskset *tset) { struct task_struct *p; - void *v = (void *)(unsigned long)css->cgroup->id; + struct cgroup_subsys_state *css; + + cgroup_taskset_for_each(p, css, tset) { + void *v = (void *)(unsigned long)css->cgroup->id; - cgroup_taskset_for_each(p, tset) { task_lock(p); iterate_fd(p->files, 0, update_netprio, v); task_unlock(p); -- cgit v1.2.3 From 6bd4f355df2eae80b8a5c7b097371cd1e05f20d5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Dec 2015 21:53:57 -0800 Subject: ipv6: kill sk_dst_lock While testing the np->opt RCU conversion, I found that UDP/IPv6 was using a mixture of xchg() and sk_dst_lock to protect concurrent changes to sk->sk_dst_cache, leading to possible corruptions and crashes. ip6_sk_dst_lookup_flow() uses sk_dst_check() anyway, so the simplest way to fix the mess is to remove sk_dst_lock completely, as we did for IPv4. __ip6_dst_store() and ip6_dst_store() share same implementation. sk_setup_caps() being called with socket lock being held or not, we have to use sk_dst_set() instead of __sk_dst_set() Note that I had to move the "np->dst_cookie = rt6_get_cookie(rt);" in ip6_dst_store() before the sk_setup_caps(sk, dst) call. This is because ip6_dst_store() can be called from process context, without any lock held. As soon as the dst is installed in sk->sk_dst_cache, dst can be freed from another cpu doing a concurrent ip6_dst_store() Doing the dst dereference before doing the install is needed to make sure no use after free would trigger. Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Signed-off-by: David S. Miller --- include/net/ip6_route.h | 17 ++++------------- include/net/sock.h | 3 +-- net/core/sock.c | 4 +--- net/dccp/ipv6.c | 4 ++-- net/ipv6/af_inet6.c | 2 +- net/ipv6/icmp.c | 14 -------------- net/ipv6/inet6_connection_sock.c | 10 +--------- net/ipv6/tcp_ipv6.c | 4 ++-- 8 files changed, 12 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 2bfb2ad2fab1..877f682989b8 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -133,27 +133,18 @@ void rt6_clean_tohost(struct net *net, struct in6_addr *gateway); /* * Store a destination cache entry in a socket */ -static inline void __ip6_dst_store(struct sock *sk, struct dst_entry *dst, - const struct in6_addr *daddr, - const struct in6_addr *saddr) +static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, + const struct in6_addr *daddr, + const struct in6_addr *saddr) { struct ipv6_pinfo *np = inet6_sk(sk); - struct rt6_info *rt = (struct rt6_info *) dst; + np->dst_cookie = rt6_get_cookie((struct rt6_info *)dst); sk_setup_caps(sk, dst); np->daddr_cache = daddr; #ifdef CONFIG_IPV6_SUBTREES np->saddr_cache = saddr; #endif - np->dst_cookie = rt6_get_cookie(rt); -} - -static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, - struct in6_addr *daddr, struct in6_addr *saddr) -{ - spin_lock(&sk->sk_dst_lock); - __ip6_dst_store(sk, dst, daddr, saddr); - spin_unlock(&sk->sk_dst_lock); } static inline bool ipv6_unicast_destination(const struct sk_buff *skb) diff --git a/include/net/sock.h b/include/net/sock.h index 0434138c5f95..52d27ee924f4 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -254,7 +254,6 @@ struct cg_proto; * @sk_wq: sock wait queue and async head * @sk_rx_dst: receive input route used by early demux * @sk_dst_cache: destination cache - * @sk_dst_lock: destination cache lock * @sk_policy: flow policy * @sk_receive_queue: incoming packets * @sk_wmem_alloc: transmit queue bytes committed @@ -393,7 +392,7 @@ struct sock { #endif struct dst_entry *sk_rx_dst; struct dst_entry __rcu *sk_dst_cache; - spinlock_t sk_dst_lock; + /* Note: 32bit hole on 64bit arches */ atomic_t sk_wmem_alloc; atomic_t sk_omem_alloc; int sk_sndbuf; diff --git a/net/core/sock.c b/net/core/sock.c index 9d79569935a3..e31dfcee1729 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1530,7 +1530,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) skb_queue_head_init(&newsk->sk_receive_queue); skb_queue_head_init(&newsk->sk_write_queue); - spin_lock_init(&newsk->sk_dst_lock); rwlock_init(&newsk->sk_callback_lock); lockdep_set_class_and_name(&newsk->sk_callback_lock, af_callback_keys + newsk->sk_family, @@ -1607,7 +1606,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { u32 max_segs = 1; - __sk_dst_set(sk, dst); + sk_dst_set(sk, dst); sk->sk_route_caps = dst->dev->features; if (sk->sk_route_caps & NETIF_F_GSO) sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; @@ -2388,7 +2387,6 @@ void sock_init_data(struct socket *sock, struct sock *sk) } else sk->sk_wq = NULL; - spin_lock_init(&sk->sk_dst_lock); rwlock_init(&sk->sk_callback_lock); lockdep_set_class_and_name(&sk->sk_callback_lock, af_callback_keys + sk->sk_family, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index e7e0b9bc2a43..9c6d0508e63a 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -459,7 +459,7 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, * comment in that function for the gory details. -acme */ - __ip6_dst_store(newsk, dst, NULL, NULL); + ip6_dst_store(newsk, dst, NULL, NULL); newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM | NETIF_F_TSO); newdp6 = (struct dccp6_sock *)newsk; @@ -883,7 +883,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, np->saddr = *saddr; inet->inet_rcv_saddr = LOOPBACK4_IPV6; - __ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, NULL); icsk->icsk_ext_hdr_len = 0; if (opt) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 38d66ddfb937..8ec0df75f1c4 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -673,7 +673,7 @@ int inet6_sk_rebuild_header(struct sock *sk) return PTR_ERR(dst); } - __ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, NULL); } return 0; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 36c5a98b0472..0a37ddc7af51 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -834,11 +834,6 @@ void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6, security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); } -/* - * Special lock-class for __icmpv6_sk: - */ -static struct lock_class_key icmpv6_socket_sk_dst_lock_key; - static int __net_init icmpv6_sk_init(struct net *net) { struct sock *sk; @@ -860,15 +855,6 @@ static int __net_init icmpv6_sk_init(struct net *net) net->ipv6.icmp_sk[i] = sk; - /* - * Split off their lock-class, because sk->sk_dst_lock - * gets used from softirqs, which is safe for - * __icmpv6_sk (because those never get directly used - * via userspace syscalls), but unsafe for normal sockets. - */ - lockdep_set_class(&sk->sk_dst_lock, - &icmpv6_socket_sk_dst_lock_key); - /* Enough space for 2 64K ICMP packets, including * sk_buff struct overhead. */ diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 3ff5208772bb..a7ca2cde2ecb 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -110,14 +110,6 @@ void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) } EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr); -static inline -void __inet6_csk_dst_store(struct sock *sk, struct dst_entry *dst, - const struct in6_addr *daddr, - const struct in6_addr *saddr) -{ - __ip6_dst_store(sk, dst, daddr, saddr); -} - static inline struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie) { @@ -153,7 +145,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, dst = ip6_dst_lookup_flow(sk, fl6, final_p); if (!IS_ERR(dst)) - __inet6_csk_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, NULL); } return dst; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 6a50bb4a0dae..e7aab561b7b4 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -257,7 +257,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, inet->inet_rcv_saddr = LOOPBACK4_IPV6; sk->sk_gso_type = SKB_GSO_TCPV6; - __ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, NULL); if (tcp_death_row.sysctl_tw_recycle && !tp->rx_opt.ts_recent_stamp && @@ -1060,7 +1060,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * */ newsk->sk_gso_type = SKB_GSO_TCPV6; - __ip6_dst_store(newsk, dst, NULL, NULL); + ip6_dst_store(newsk, dst, NULL, NULL); inet6_sk_rx_dst_set(newsk, skb); newtcp6sk = (struct tcp6_sock *)newsk; -- cgit v1.2.3 From 4eaf3b84f2881c9c028f1d5e76c52ab575fe3a66 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 1 Dec 2015 20:08:51 -0800 Subject: net_sched: fix qdisc_tree_decrease_qlen() races qdisc_tree_decrease_qlen() suffers from two problems on multiqueue devices. One problem is that it updates sch->q.qlen and sch->qstats.drops on the mq/mqprio root qdisc, while it should not : Daniele reported underflows errors : [ 681.774821] PAX: sch->q.qlen: 0 n: 1 [ 681.774825] PAX: size overflow detected in function qdisc_tree_decrease_qlen net/sched/sch_api.c:769 cicus.693_49 min, count: 72, decl: qlen; num: 0; context: sk_buff_head; [ 681.774954] CPU: 2 PID: 19 Comm: ksoftirqd/2 Tainted: G O 4.2.6.201511282239-1-grsec #1 [ 681.774955] Hardware name: ASUSTeK COMPUTER INC. X302LJ/X302LJ, BIOS X302LJ.202 03/05/2015 [ 681.774956] ffffffffa9a04863 0000000000000000 0000000000000000 ffffffffa990ff7c [ 681.774959] ffffc90000d3bc38 ffffffffa95d2810 0000000000000007 ffffffffa991002b [ 681.774960] ffffc90000d3bc68 ffffffffa91a44f4 0000000000000001 0000000000000001 [ 681.774962] Call Trace: [ 681.774967] [] dump_stack+0x4c/0x7f [ 681.774970] [] report_size_overflow+0x34/0x50 [ 681.774972] [] qdisc_tree_decrease_qlen+0x152/0x160 [ 681.774976] [] fq_codel_dequeue+0x7b1/0x820 [sch_fq_codel] [ 681.774978] [] ? qdisc_peek_dequeued+0xa0/0xa0 [sch_fq_codel] [ 681.774980] [] __qdisc_run+0x4d/0x1d0 [ 681.774983] [] net_tx_action+0xc2/0x160 [ 681.774985] [] __do_softirq+0xf1/0x200 [ 681.774987] [] run_ksoftirqd+0x1e/0x30 [ 681.774989] [] smpboot_thread_fn+0x150/0x260 [ 681.774991] [] ? sort_range+0x40/0x40 [ 681.774992] [] kthread+0xe4/0x100 [ 681.774994] [] ? kthread_worker_fn+0x170/0x170 [ 681.774995] [] ret_from_fork+0x3e/0x70 mq/mqprio have their own ways to report qlen/drops by folding stats on all their queues, with appropriate locking. A second problem is that qdisc_tree_decrease_qlen() calls qdisc_lookup() without proper locking : concurrent qdisc updates could corrupt the list that qdisc_match_from_root() parses to find a qdisc given its handle. Fix first problem adding a TCQ_F_NOPARENT qdisc flag that qdisc_tree_decrease_qlen() can use to abort its tree traversal, as soon as it meets a mq/mqprio qdisc children. Second problem can be fixed by RCU protection. Qdisc are already freed after RCU grace period, so qdisc_list_add() and qdisc_list_del() simply have to use appropriate rcu list variants. A future patch will add a per struct netdev_queue list anchor, so that qdisc_tree_decrease_qlen() can have more efficient lookups. Reported-by: Daniele Fucini Signed-off-by: Eric Dumazet Cc: Cong Wang Cc: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/sch_generic.h | 3 +++ net/sched/sch_api.c | 27 ++++++++++++++++++--------- net/sched/sch_generic.c | 2 +- net/sched/sch_mq.c | 4 ++-- net/sched/sch_mqprio.c | 4 ++-- 5 files changed, 26 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 4c79ce8c1f92..b2a8e6338576 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -61,6 +61,9 @@ struct Qdisc { */ #define TCQ_F_WARN_NONWC (1 << 16) #define TCQ_F_CPUSTATS 0x20 /* run using percpu statistics */ +#define TCQ_F_NOPARENT 0x40 /* root of its hierarchy : + * qdisc_tree_decrease_qlen() should stop. + */ u32 limit; const struct Qdisc_ops *ops; struct qdisc_size_table __rcu *stab; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index f43c8f33f09e..7ec667dd4ce1 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -253,7 +253,8 @@ int qdisc_set_default(const char *name) } /* We know handle. Find qdisc among all qdisc's attached to device - (root qdisc, all its children, children of children etc.) + * (root qdisc, all its children, children of children etc.) + * Note: caller either uses rtnl or rcu_read_lock() */ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) @@ -264,7 +265,7 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) root->handle == handle) return root; - list_for_each_entry(q, &root->list, list) { + list_for_each_entry_rcu(q, &root->list, list) { if (q->handle == handle) return q; } @@ -277,15 +278,18 @@ void qdisc_list_add(struct Qdisc *q) struct Qdisc *root = qdisc_dev(q)->qdisc; WARN_ON_ONCE(root == &noop_qdisc); - list_add_tail(&q->list, &root->list); + ASSERT_RTNL(); + list_add_tail_rcu(&q->list, &root->list); } } EXPORT_SYMBOL(qdisc_list_add); void qdisc_list_del(struct Qdisc *q) { - if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) - list_del(&q->list); + if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { + ASSERT_RTNL(); + list_del_rcu(&q->list); + } } EXPORT_SYMBOL(qdisc_list_del); @@ -750,14 +754,18 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) if (n == 0) return; drops = max_t(int, n, 0); + rcu_read_lock(); while ((parentid = sch->parent)) { if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS)) - return; + break; + if (sch->flags & TCQ_F_NOPARENT) + break; + /* TODO: perform the search on a per txq basis */ sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid)); if (sch == NULL) { - WARN_ON(parentid != TC_H_ROOT); - return; + WARN_ON_ONCE(parentid != TC_H_ROOT); + break; } cops = sch->ops->cl_ops; if (cops->qlen_notify) { @@ -768,6 +776,7 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) sch->q.qlen -= n; __qdisc_qstats_drop(sch, drops); } + rcu_read_unlock(); } EXPORT_SYMBOL(qdisc_tree_decrease_qlen); @@ -941,7 +950,7 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, } lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock); if (!netif_is_multiqueue(dev)) - sch->flags |= TCQ_F_ONETXQUEUE; + sch->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; } sch->handle = handle; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index cb5d4ad32946..e82a1ad80aa5 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -737,7 +737,7 @@ static void attach_one_default_qdisc(struct net_device *dev, return; } if (!netif_is_multiqueue(dev)) - qdisc->flags |= TCQ_F_ONETXQUEUE; + qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; dev_queue->qdisc_sleeping = qdisc; } diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index f3cbaecd283a..3e82f047caaf 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -63,7 +63,7 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt) if (qdisc == NULL) goto err; priv->qdiscs[ntx] = qdisc; - qdisc->flags |= TCQ_F_ONETXQUEUE; + qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; } sch->flags |= TCQ_F_MQROOT; @@ -156,7 +156,7 @@ static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, *old = dev_graft_qdisc(dev_queue, new); if (new) - new->flags |= TCQ_F_ONETXQUEUE; + new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; if (dev->flags & IFF_UP) dev_activate(dev); return 0; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 3811a745452c..ad70ecf57ce7 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -132,7 +132,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) goto err; } priv->qdiscs[i] = qdisc; - qdisc->flags |= TCQ_F_ONETXQUEUE; + qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; } /* If the mqprio options indicate that hardware should own @@ -209,7 +209,7 @@ static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, *old = dev_graft_qdisc(dev_queue, new); if (new) - new->flags |= TCQ_F_ONETXQUEUE; + new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; if (dev->flags & IFF_UP) dev_activate(dev); -- cgit v1.2.3 From a0af2e538c80f3e47f1d6ddf120a153ad909e8ad Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Wed, 2 Dec 2015 09:24:46 -0800 Subject: drm: Fix an unwanted master inheritance v2 A client calling drmSetMaster() using a file descriptor that was opened when another client was master would inherit the latter client's master object and all its authenticated clients. This is unwanted behaviour, and when this happens, instead allocate a brand new master object for the client calling drmSetMaster(). Fixes a BUG() throw in vmw_master_set(). Cc: Signed-off-by: Thomas Hellstrom Signed-off-by: Dave Airlie --- drivers/gpu/drm/drm_drv.c | 5 +++ drivers/gpu/drm/drm_fops.c | 84 ++++++++++++++++++++++++++++++---------------- include/drm/drmP.h | 6 ++++ 3 files changed, 67 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c index 9362609df38a..7dd6728dd092 100644 --- a/drivers/gpu/drm/drm_drv.c +++ b/drivers/gpu/drm/drm_drv.c @@ -160,6 +160,11 @@ int drm_setmaster_ioctl(struct drm_device *dev, void *data, goto out_unlock; } + if (!file_priv->allowed_master) { + ret = drm_new_set_master(dev, file_priv); + goto out_unlock; + } + file_priv->minor->master = drm_master_get(file_priv->master); file_priv->is_master = 1; if (dev->driver->master_set) { diff --git a/drivers/gpu/drm/drm_fops.c b/drivers/gpu/drm/drm_fops.c index c59ce4d0ef75..6b5625e66119 100644 --- a/drivers/gpu/drm/drm_fops.c +++ b/drivers/gpu/drm/drm_fops.c @@ -125,6 +125,60 @@ static int drm_cpu_valid(void) return 1; } +/** + * drm_new_set_master - Allocate a new master object and become master for the + * associated master realm. + * + * @dev: The associated device. + * @fpriv: File private identifying the client. + * + * This function must be called with dev::struct_mutex held. + * Returns negative error code on failure. Zero on success. + */ +int drm_new_set_master(struct drm_device *dev, struct drm_file *fpriv) +{ + struct drm_master *old_master; + int ret; + + lockdep_assert_held_once(&dev->master_mutex); + + /* create a new master */ + fpriv->minor->master = drm_master_create(fpriv->minor); + if (!fpriv->minor->master) + return -ENOMEM; + + /* take another reference for the copy in the local file priv */ + old_master = fpriv->master; + fpriv->master = drm_master_get(fpriv->minor->master); + + if (dev->driver->master_create) { + ret = dev->driver->master_create(dev, fpriv->master); + if (ret) + goto out_err; + } + if (dev->driver->master_set) { + ret = dev->driver->master_set(dev, fpriv, true); + if (ret) + goto out_err; + } + + fpriv->is_master = 1; + fpriv->allowed_master = 1; + fpriv->authenticated = 1; + if (old_master) + drm_master_put(&old_master); + + return 0; + +out_err: + /* drop both references and restore old master on failure */ + drm_master_put(&fpriv->minor->master); + drm_master_put(&fpriv->master); + fpriv->master = old_master; + + return ret; +} + /** * Called whenever a process opens /dev/drm. * @@ -189,35 +243,9 @@ static int drm_open_helper(struct file *filp, struct drm_minor *minor) mutex_lock(&dev->master_mutex); if (drm_is_primary_client(priv) && !priv->minor->master) { /* create a new master */ - priv->minor->master = drm_master_create(priv->minor); - if (!priv->minor->master) { - ret = -ENOMEM; + ret = drm_new_set_master(dev, priv); + if (ret) goto out_close; - } - - priv->is_master = 1; - /* take another reference for the copy in the local file priv */ - priv->master = drm_master_get(priv->minor->master); - priv->authenticated = 1; - - if (dev->driver->master_create) { - ret = dev->driver->master_create(dev, priv->master); - if (ret) { - /* drop both references if this fails */ - drm_master_put(&priv->minor->master); - drm_master_put(&priv->master); - goto out_close; - } - } - if (dev->driver->master_set) { - ret = dev->driver->master_set(dev, priv, true); - if (ret) { - /* drop both references if this fails */ - drm_master_put(&priv->minor->master); - drm_master_put(&priv->master); - goto out_close; - } - } } else if (drm_is_primary_client(priv)) { /* get a reference to the master */ priv->master = drm_master_get(priv->minor->master); diff --git a/include/drm/drmP.h b/include/drm/drmP.h index 0b921ae06cd8..441b26e846d8 100644 --- a/include/drm/drmP.h +++ b/include/drm/drmP.h @@ -309,6 +309,11 @@ struct drm_file { unsigned universal_planes:1; /* true if client understands atomic properties */ unsigned atomic:1; + /* + * This client is allowed to gain master privileges for @master. + * Protected by struct drm_device::master_mutex. + */ + unsigned allowed_master:1; struct pid *pid; kuid_t uid; @@ -910,6 +915,7 @@ extern int drm_open(struct inode *inode, struct file *filp); extern ssize_t drm_read(struct file *filp, char __user *buffer, size_t count, loff_t *offset); extern int drm_release(struct inode *inode, struct file *filp); +extern int drm_new_set_master(struct drm_device *dev, struct drm_file *fpriv); /* Mapping support (drm_vm.h) */ extern unsigned int drm_poll(struct file *filp, struct poll_table_struct *wait); -- cgit v1.2.3 From bbc8764f80eb872d2b36302882ddfc9882de4b16 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Tue, 10 Nov 2015 17:37:31 +0100 Subject: drm/nouveau: Fix pre-nv50 pageflip events (v4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apparently pre-nv50 pageflip events happen before the actual vblank period. Therefore that functionality got semi-disabled in commit af4870e406126b7ac0ae7c7ce5751f25ebe60f28 Author: Mario Kleiner Date: Tue May 13 00:42:08 2014 +0200 drm/nouveau/kms/nv04-nv40: fix pageflip events via special case. Unfortunately that hack got uprooted in commit cc1ef118fc099295ae6aabbacc8af94d8d8885eb Author: Thierry Reding Date: Wed Aug 12 17:00:31 2015 +0200 drm/irq: Make pipe unsigned and name consistent Triggering a warning when trying to sample the vblank timestamp for a non-existing pipe. There's a few ways to fix this: - Open-code the old behaviour, which just enshrines this slight breakage of the userspace ABI. - Revert Mario's commit and again inflict broken timestamps, again not pretty. - Fix this for real by delaying the pageflip TS until the next vblank interrupt, thereby making it accurate. This patch implements the third option. Since having a page flip interrupt that happens when the pageflip gets armed and not when it completes in the next vblank seems to be fairly common (older i915 hw works very similarly) create a new helper to arm vblank events for such drivers. v2 (Mario Kleiner): - Fix function prototypes in drmP.h - Add missing vblank_put() for pageflip completion without pageflip event. - Initialize sequence number for queued pageflip event to avoid trouble in drm_handle_vblank_events(). - Remove dead code and spelling fix. v3 (Mario Kleiner): - Add a signed-off-by and cc stable tag per Ilja's advice. v4 (Thierry Reding): - Fix kerneldoc typo, discovered by Michel Dänzer - Rearrange tags and changelog Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=106431 Cc: Thierry Reding Cc: Mario Kleiner Acked-by: Ben Skeggs Cc: Ilia Mirkin Signed-off-by: Daniel Vetter Reviewed-by: Mario Kleiner Cc: stable@vger.kernel.org # v4.3 Signed-off-by: Mario Kleiner Signed-off-by: Thierry Reding Signed-off-by: Dave Airlie --- drivers/gpu/drm/drm_irq.c | 54 ++++++++++++++++++++++++++++++- drivers/gpu/drm/nouveau/nouveau_display.c | 19 ++++++----- include/drm/drmP.h | 4 +++ 3 files changed, 68 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_irq.c b/drivers/gpu/drm/drm_irq.c index 2151ea551d3b..607f493ae801 100644 --- a/drivers/gpu/drm/drm_irq.c +++ b/drivers/gpu/drm/drm_irq.c @@ -980,7 +980,8 @@ static void send_vblank_event(struct drm_device *dev, struct drm_pending_vblank_event *e, unsigned long seq, struct timeval *now) { - WARN_ON_SMP(!spin_is_locked(&dev->event_lock)); + assert_spin_locked(&dev->event_lock); + e->event.sequence = seq; e->event.tv_sec = now->tv_sec; e->event.tv_usec = now->tv_usec; @@ -992,6 +993,57 @@ static void send_vblank_event(struct drm_device *dev, e->event.sequence); } +/** + * drm_arm_vblank_event - arm vblank event after pageflip + * @dev: DRM device + * @pipe: CRTC index + * @e: the event to prepare to send + * + * A lot of drivers need to generate vblank events for the very next vblank + * interrupt. For example when the page flip interrupt happens when the page + * flip gets armed, but not when it actually executes within the next vblank + * period. This helper function implements exactly the required vblank arming + * behaviour. + * + * Caller must hold event lock. Caller must also hold a vblank reference for + * the event @e, which will be dropped when the next vblank arrives. + * + * This is the legacy version of drm_crtc_arm_vblank_event(). + */ +void drm_arm_vblank_event(struct drm_device *dev, unsigned int pipe, + struct drm_pending_vblank_event *e) +{ + assert_spin_locked(&dev->event_lock); + + e->pipe = pipe; + e->event.sequence = drm_vblank_count(dev, pipe); + list_add_tail(&e->base.link, &dev->vblank_event_list); +} +EXPORT_SYMBOL(drm_arm_vblank_event); + +/** + * drm_crtc_arm_vblank_event - arm vblank event after pageflip + * @crtc: the source CRTC of the vblank event + * @e: the event to send + * + * A lot of drivers need to generate vblank events for the very next vblank + * interrupt. For example when the page flip interrupt happens when the page + * flip gets armed, but not when it actually executes within the next vblank + * period. This helper function implements exactly the required vblank arming + * behaviour. + * + * Caller must hold event lock. Caller must also hold a vblank reference for + * the event @e, which will be dropped when the next vblank arrives. + * + * This is the native KMS version of drm_arm_vblank_event(). + */ +void drm_crtc_arm_vblank_event(struct drm_crtc *crtc, + struct drm_pending_vblank_event *e) +{ + drm_arm_vblank_event(crtc->dev, drm_crtc_index(crtc), e); +} +EXPORT_SYMBOL(drm_crtc_arm_vblank_event); + /** * drm_send_vblank_event - helper to send vblank event after pageflip * @dev: DRM device diff --git a/drivers/gpu/drm/nouveau/nouveau_display.c b/drivers/gpu/drm/nouveau/nouveau_display.c index db6bc6760545..64c8d932d5f1 100644 --- a/drivers/gpu/drm/nouveau/nouveau_display.c +++ b/drivers/gpu/drm/nouveau/nouveau_display.c @@ -829,7 +829,6 @@ nouveau_finish_page_flip(struct nouveau_channel *chan, struct drm_device *dev = drm->dev; struct nouveau_page_flip_state *s; unsigned long flags; - int crtcid = -1; spin_lock_irqsave(&dev->event_lock, flags); @@ -841,15 +840,19 @@ nouveau_finish_page_flip(struct nouveau_channel *chan, s = list_first_entry(&fctx->flip, struct nouveau_page_flip_state, head); if (s->event) { - /* Vblank timestamps/counts are only correct on >= NV-50 */ - if (drm->device.info.family >= NV_DEVICE_INFO_V0_TESLA) - crtcid = s->crtc; + if (drm->device.info.family < NV_DEVICE_INFO_V0_TESLA) { + drm_arm_vblank_event(dev, s->crtc, s->event); + } else { + drm_send_vblank_event(dev, s->crtc, s->event); - drm_send_vblank_event(dev, crtcid, s->event); + /* Give up ownership of vblank for page-flipped crtc */ + drm_vblank_put(dev, s->crtc); + } + } + else { + /* Give up ownership of vblank for page-flipped crtc */ + drm_vblank_put(dev, s->crtc); } - - /* Give up ownership of vblank for page-flipped crtc */ - drm_vblank_put(dev, s->crtc); list_del(&s->head); if (ps) diff --git a/include/drm/drmP.h b/include/drm/drmP.h index 441b26e846d8..0a271ca1f7c7 100644 --- a/include/drm/drmP.h +++ b/include/drm/drmP.h @@ -953,6 +953,10 @@ extern void drm_send_vblank_event(struct drm_device *dev, unsigned int pipe, struct drm_pending_vblank_event *e); extern void drm_crtc_send_vblank_event(struct drm_crtc *crtc, struct drm_pending_vblank_event *e); +extern void drm_arm_vblank_event(struct drm_device *dev, unsigned int pipe, + struct drm_pending_vblank_event *e); +extern void drm_crtc_arm_vblank_event(struct drm_crtc *crtc, + struct drm_pending_vblank_event *e); extern bool drm_handle_vblank(struct drm_device *dev, unsigned int pipe); extern bool drm_crtc_handle_vblank(struct drm_crtc *crtc); extern int drm_vblank_get(struct drm_device *dev, unsigned int pipe); -- cgit v1.2.3 From ae5515d66362b9d96cdcfce504567f0b8b7bd83e Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 4 Dec 2015 08:38:42 -0700 Subject: Revert: "vfio: Include No-IOMMU mode" Revert commit 033291eccbdb ("vfio: Include No-IOMMU mode") due to lack of a user. This was originally intended to fill a need for the DPDK driver, but uptake has been slow so rather than support an unproven kernel interface revert it and revisit when userspace catches up. Signed-off-by: Alex Williamson --- drivers/vfio/Kconfig | 15 ---- drivers/vfio/pci/vfio_pci.c | 8 +- drivers/vfio/vfio.c | 186 ++------------------------------------------ include/linux/vfio.h | 3 - include/uapi/linux/vfio.h | 7 -- 5 files changed, 10 insertions(+), 209 deletions(-) (limited to 'include') diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index da6e2ce77495..850d86ca685b 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -31,21 +31,6 @@ menuconfig VFIO If you don't know what to do here, say N. -menuconfig VFIO_NOIOMMU - bool "VFIO No-IOMMU support" - depends on VFIO - help - VFIO is built on the ability to isolate devices using the IOMMU. - Only with an IOMMU can userspace access to DMA capable devices be - considered secure. VFIO No-IOMMU mode enables IOMMU groups for - devices without IOMMU backing for the purpose of re-using the VFIO - infrastructure in a non-secure mode. Use of this mode will result - in an unsupportable kernel and will therefore taint the kernel. - Device assignment to virtual machines is also not possible with - this mode since there is no IOMMU to provide DMA translation. - - If you don't know what to do here, say N. - source "drivers/vfio/pci/Kconfig" source "drivers/vfio/platform/Kconfig" source "virt/lib/Kconfig" diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 2760a7ba3f30..56bf6dbb93db 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -940,13 +940,13 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL) return -EINVAL; - group = vfio_iommu_group_get(&pdev->dev); + group = iommu_group_get(&pdev->dev); if (!group) return -EINVAL; vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); if (!vdev) { - vfio_iommu_group_put(group, &pdev->dev); + iommu_group_put(group); return -ENOMEM; } @@ -957,7 +957,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); if (ret) { - vfio_iommu_group_put(group, &pdev->dev); + iommu_group_put(group); kfree(vdev); return ret; } @@ -993,7 +993,7 @@ static void vfio_pci_remove(struct pci_dev *pdev) if (!vdev) return; - vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev); + iommu_group_put(pdev->dev.iommu_group); kfree(vdev); if (vfio_pci_is_vga(pdev)) { diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 9da0703e09d0..6070b793cbcb 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -62,7 +62,6 @@ struct vfio_container { struct rw_semaphore group_lock; struct vfio_iommu_driver *iommu_driver; void *iommu_data; - bool noiommu; }; struct vfio_unbound_dev { @@ -85,7 +84,6 @@ struct vfio_group { struct list_head unbound_list; struct mutex unbound_lock; atomic_t opened; - bool noiommu; }; struct vfio_device { @@ -97,147 +95,6 @@ struct vfio_device { void *device_data; }; -#ifdef CONFIG_VFIO_NOIOMMU -static bool noiommu __read_mostly; -module_param_named(enable_unsafe_noiommu_support, - noiommu, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)"); -#endif - -/* - * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe - * and remove functions, any use cases other than acquiring the first - * reference for the purpose of calling vfio_add_group_dev() or removing - * that symmetric reference after vfio_del_group_dev() should use the raw - * iommu_group_{get,put} functions. In particular, vfio_iommu_group_put() - * removes the device from the dummy group and cannot be nested. - */ -struct iommu_group *vfio_iommu_group_get(struct device *dev) -{ - struct iommu_group *group; - int __maybe_unused ret; - - group = iommu_group_get(dev); - -#ifdef CONFIG_VFIO_NOIOMMU - /* - * With noiommu enabled, an IOMMU group will be created for a device - * that doesn't already have one and doesn't have an iommu_ops on their - * bus. We use iommu_present() again in the main code to detect these - * fake groups. - */ - if (group || !noiommu || iommu_present(dev->bus)) - return group; - - group = iommu_group_alloc(); - if (IS_ERR(group)) - return NULL; - - iommu_group_set_name(group, "vfio-noiommu"); - ret = iommu_group_add_device(group, dev); - iommu_group_put(group); - if (ret) - return NULL; - - /* - * Where to taint? At this point we've added an IOMMU group for a - * device that is not backed by iommu_ops, therefore any iommu_ - * callback using iommu_ops can legitimately Oops. So, while we may - * be about to give a DMA capable device to a user without IOMMU - * protection, which is clearly taint-worthy, let's go ahead and do - * it here. - */ - add_taint(TAINT_USER, LOCKDEP_STILL_OK); - dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n"); -#endif - - return group; -} -EXPORT_SYMBOL_GPL(vfio_iommu_group_get); - -void vfio_iommu_group_put(struct iommu_group *group, struct device *dev) -{ -#ifdef CONFIG_VFIO_NOIOMMU - if (!iommu_present(dev->bus)) - iommu_group_remove_device(dev); -#endif - - iommu_group_put(group); -} -EXPORT_SYMBOL_GPL(vfio_iommu_group_put); - -#ifdef CONFIG_VFIO_NOIOMMU -static void *vfio_noiommu_open(unsigned long arg) -{ - if (arg != VFIO_NOIOMMU_IOMMU) - return ERR_PTR(-EINVAL); - if (!capable(CAP_SYS_RAWIO)) - return ERR_PTR(-EPERM); - - return NULL; -} - -static void vfio_noiommu_release(void *iommu_data) -{ -} - -static long vfio_noiommu_ioctl(void *iommu_data, - unsigned int cmd, unsigned long arg) -{ - if (cmd == VFIO_CHECK_EXTENSION) - return arg == VFIO_NOIOMMU_IOMMU ? 1 : 0; - - return -ENOTTY; -} - -static int vfio_iommu_present(struct device *dev, void *unused) -{ - return iommu_present(dev->bus) ? 1 : 0; -} - -static int vfio_noiommu_attach_group(void *iommu_data, - struct iommu_group *iommu_group) -{ - return iommu_group_for_each_dev(iommu_group, NULL, - vfio_iommu_present) ? -EINVAL : 0; -} - -static void vfio_noiommu_detach_group(void *iommu_data, - struct iommu_group *iommu_group) -{ -} - -static struct vfio_iommu_driver_ops vfio_noiommu_ops = { - .name = "vfio-noiommu", - .owner = THIS_MODULE, - .open = vfio_noiommu_open, - .release = vfio_noiommu_release, - .ioctl = vfio_noiommu_ioctl, - .attach_group = vfio_noiommu_attach_group, - .detach_group = vfio_noiommu_detach_group, -}; - -static struct vfio_iommu_driver vfio_noiommu_driver = { - .ops = &vfio_noiommu_ops, -}; - -/* - * Wrap IOMMU drivers, the noiommu driver is the one and only driver for - * noiommu groups (and thus containers) and not available for normal groups. - */ -#define vfio_for_each_iommu_driver(con, pos) \ - for (pos = con->noiommu ? &vfio_noiommu_driver : \ - list_first_entry(&vfio.iommu_drivers_list, \ - struct vfio_iommu_driver, vfio_next); \ - (con->noiommu ? pos != NULL : \ - &pos->vfio_next != &vfio.iommu_drivers_list); \ - pos = con->noiommu ? NULL : list_next_entry(pos, vfio_next)) -#else -#define vfio_for_each_iommu_driver(con, pos) \ - list_for_each_entry(pos, &vfio.iommu_drivers_list, vfio_next) -#endif - - /** * IOMMU driver registration */ @@ -342,8 +199,7 @@ static void vfio_group_unlock_and_free(struct vfio_group *group) /** * Group objects - create, release, get, put, search */ -static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, - bool noiommu) +static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group) { struct vfio_group *group, *tmp; struct device *dev; @@ -361,7 +217,6 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, atomic_set(&group->container_users, 0); atomic_set(&group->opened, 0); group->iommu_group = iommu_group; - group->noiommu = noiommu; group->nb.notifier_call = vfio_iommu_group_notifier; @@ -397,8 +252,7 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, dev = device_create(vfio.class, NULL, MKDEV(MAJOR(vfio.group_devt), minor), - group, "%s%d", noiommu ? "noiommu-" : "", - iommu_group_id(iommu_group)); + group, "%d", iommu_group_id(iommu_group)); if (IS_ERR(dev)) { vfio_free_group_minor(minor); vfio_group_unlock_and_free(group); @@ -786,8 +640,7 @@ int vfio_add_group_dev(struct device *dev, group = vfio_group_get_from_iommu(iommu_group); if (!group) { - group = vfio_create_group(iommu_group, - !iommu_present(dev->bus)); + group = vfio_create_group(iommu_group); if (IS_ERR(group)) { iommu_group_put(iommu_group); return PTR_ERR(group); @@ -999,7 +852,8 @@ static long vfio_ioctl_check_extension(struct vfio_container *container, */ if (!driver) { mutex_lock(&vfio.iommu_drivers_lock); - vfio_for_each_iommu_driver(container, driver) { + list_for_each_entry(driver, &vfio.iommu_drivers_list, + vfio_next) { if (!try_module_get(driver->ops->owner)) continue; @@ -1068,7 +922,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container, } mutex_lock(&vfio.iommu_drivers_lock); - vfio_for_each_iommu_driver(container, driver) { + list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) { void *data; if (!try_module_get(driver->ops->owner)) @@ -1333,9 +1187,6 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) if (atomic_read(&group->container_users)) return -EINVAL; - if (group->noiommu && !capable(CAP_SYS_RAWIO)) - return -EPERM; - f = fdget(container_fd); if (!f.file) return -EBADF; @@ -1351,13 +1202,6 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) down_write(&container->group_lock); - /* Real groups and fake groups cannot mix */ - if (!list_empty(&container->group_list) && - container->noiommu != group->noiommu) { - ret = -EPERM; - goto unlock_out; - } - driver = container->iommu_driver; if (driver) { ret = driver->ops->attach_group(container->iommu_data, @@ -1367,7 +1211,6 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) } group->container = container; - container->noiommu = group->noiommu; list_add(&group->container_next, &container->group_list); /* Get a reference on the container and mark a user within the group */ @@ -1398,9 +1241,6 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) !group->container->iommu_driver || !vfio_group_viable(group)) return -EINVAL; - if (group->noiommu && !capable(CAP_SYS_RAWIO)) - return -EPERM; - device = vfio_device_get_from_name(group, buf); if (!device) return -ENODEV; @@ -1443,10 +1283,6 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) fd_install(ret, filep); - if (group->noiommu) - dev_warn(device->dev, "vfio-noiommu device opened by user " - "(%s:%d)\n", current->comm, task_pid_nr(current)); - return ret; } @@ -1535,11 +1371,6 @@ static int vfio_group_fops_open(struct inode *inode, struct file *filep) if (!group) return -ENODEV; - if (group->noiommu && !capable(CAP_SYS_RAWIO)) { - vfio_group_put(group); - return -EPERM; - } - /* Do we need multiple instances of the group open? Seems not. */ opened = atomic_cmpxchg(&group->opened, 0, 1); if (opened) { @@ -1702,11 +1533,6 @@ struct vfio_group *vfio_group_get_external_user(struct file *filep) if (!atomic_inc_not_zero(&group->container_users)) return ERR_PTR(-EINVAL); - if (group->noiommu) { - atomic_dec(&group->container_users); - return ERR_PTR(-EPERM); - } - if (!group->container->iommu_driver || !vfio_group_viable(group)) { atomic_dec(&group->container_users); diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 610a86a892b8..ddb440975382 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -44,9 +44,6 @@ struct vfio_device_ops { void (*request)(void *device_data, unsigned int count); }; -extern struct iommu_group *vfio_iommu_group_get(struct device *dev); -extern void vfio_iommu_group_put(struct iommu_group *group, struct device *dev); - extern int vfio_add_group_dev(struct device *dev, const struct vfio_device_ops *ops, void *device_data); diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 751b69f858c8..9fd7b5d8df2f 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -38,13 +38,6 @@ #define VFIO_SPAPR_TCE_v2_IOMMU 7 -/* - * The No-IOMMU IOMMU offers no translation or isolation for devices and - * supports no ioctls outside of VFIO_CHECK_EXTENSION. Use of VFIO's No-IOMMU - * code will taint the host kernel and should be used with extreme caution. - */ -#define VFIO_NOIOMMU_IOMMU 8 - /* * The IOCTL interface is designed for extensibility by embedding the * structure length (argsz) and flags into structures passed between -- cgit v1.2.3 From 7c23b7c1996597dd9d60bb282fb5fa1be6ebd18b Mon Sep 17 00:00:00 2001 From: "Lu, Han" Date: Mon, 7 Dec 2015 15:59:13 +0800 Subject: ALSA: hda - Fix playback noise with 24/32 bit sample size on BXT In BXT-P A0, HD-Audio DMA requests is later than expected, and makes an audio stream sensitive to system latencies when 24/32 bits are playing. Adjusting threshold of DMA fifo to force the DMA request sooner to improve latency tolerance at the expense of power. v2: move Intel specific code to hda_intel.c Signed-off-by: Lu, Han Signed-off-by: Takashi Iwai --- include/sound/hda_register.h | 3 +++ sound/pci/hda/hda_intel.c | 23 +++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'include') diff --git a/include/sound/hda_register.h b/include/sound/hda_register.h index 2ae8812d7b1a..94dc6a9772e0 100644 --- a/include/sound/hda_register.h +++ b/include/sound/hda_register.h @@ -93,6 +93,9 @@ enum { SDI0, SDI1, SDI2, SDI3, SDO0, SDO1, SDO2, SDO3 }; #define AZX_REG_HSW_EM4 0x100c #define AZX_REG_HSW_EM5 0x1010 +/* Skylake/Broxton display HD-A controller Extended Mode registers */ +#define AZX_REG_SKL_EM4L 0x1040 + /* PCI space */ #define AZX_PCIREG_TCSEL 0x44 diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index 963f82430938..bff5c8b329d1 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -355,6 +355,8 @@ enum { ((pci)->device == 0x0d0c) || \ ((pci)->device == 0x160c)) +#define IS_BROXTON(pci) ((pci)->device == 0x5a98) + static char *driver_short_names[] = { [AZX_DRIVER_ICH] = "HDA Intel", [AZX_DRIVER_PCH] = "HDA Intel PCH", @@ -506,15 +508,36 @@ static void azx_init_pci(struct azx *chip) } } +/* + * In BXT-P A0, HD-Audio DMA requests is later than expected, + * and makes an audio stream sensitive to system latencies when + * 24/32 bits are playing. + * Adjusting threshold of DMA fifo to force the DMA request + * sooner to improve latency tolerance at the expense of power. + */ +static void bxt_reduce_dma_latency(struct azx *chip) +{ + u32 val; + + val = azx_readl(chip, SKL_EM4L); + val &= (0x3 << 20); + azx_writel(chip, SKL_EM4L, val); +} + static void hda_intel_init_chip(struct azx *chip, bool full_reset) { struct hdac_bus *bus = azx_bus(chip); + struct pci_dev *pci = chip->pci; if (chip->driver_caps & AZX_DCAPS_I915_POWERWELL) snd_hdac_set_codec_wakeup(bus, true); azx_init_chip(chip, full_reset); if (chip->driver_caps & AZX_DCAPS_I915_POWERWELL) snd_hdac_set_codec_wakeup(bus, false); + + /* reduce dma latency to avoid noise */ + if (IS_BROXTON(pci)) + bxt_reduce_dma_latency(chip); } /* calculate runtime delay from LPIB */ -- cgit v1.2.3 From ea013a9b205b47b1fcbc72522146fad560af0712 Mon Sep 17 00:00:00 2001 From: Andreas Werner Date: Fri, 4 Dec 2015 18:12:49 +0100 Subject: libata-eh.c: Introduce new ata port flag for controller which lockup on read log page Some controller lockup on a ata_read_log_page. Add new ata port flag ATA_FLAG_NO_LOG_PAGE which can used to blacklist a controller. If this flag is set, any attempt to read a log page returns an error without actually issuing the command. Signed-off-by: Andreas Werner Signed-off-by: Tejun Heo --- drivers/ata/libata-eh.c | 8 ++++++++ include/linux/libata.h | 1 + 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index cb0508af1459..961acc788f44 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -1505,12 +1505,20 @@ static const char *ata_err_string(unsigned int err_mask) unsigned int ata_read_log_page(struct ata_device *dev, u8 log, u8 page, void *buf, unsigned int sectors) { + unsigned long ap_flags = dev->link->ap->flags; struct ata_taskfile tf; unsigned int err_mask; bool dma = false; DPRINTK("read log page - log 0x%x, page 0x%x\n", log, page); + /* + * Return error without actually issuing the command on controllers + * which e.g. lockup on a read log page. + */ + if (ap_flags & ATA_FLAG_NO_LOG_PAGE) + return AC_ERR_DEV; + retry: ata_tf_init(dev, &tf); if (dev->dma_mode && ata_id_has_read_log_dma_ext(dev->id) && diff --git a/include/linux/libata.h b/include/linux/libata.h index 83577f8fd15b..600c1e0626a5 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -210,6 +210,7 @@ enum { ATA_FLAG_SLAVE_POSS = (1 << 0), /* host supports slave dev */ /* (doesn't imply presence) */ ATA_FLAG_SATA = (1 << 1), + ATA_FLAG_NO_LOG_PAGE = (1 << 5), /* do not issue log page read */ ATA_FLAG_NO_ATAPI = (1 << 6), /* No ATAPI support */ ATA_FLAG_PIO_DMA = (1 << 7), /* PIO cmds via DMA */ ATA_FLAG_PIO_LBA48 = (1 << 8), /* Host DMA engine is LBA28 only */ -- cgit v1.2.3 From 57b4bd06ff0372fe1e3617889c4b37fbd500364a Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Sun, 6 Dec 2015 11:25:47 +0100 Subject: lightnvm: comments on constants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is not obvious what NVM_IO_* and NVM_BLK_T_* are used for. Make sure to comment them appropriately as the other constants. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index c6916aec43b6..935ef3844c05 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -50,9 +50,16 @@ enum { NVM_IO_DUAL_ACCESS = 0x1, NVM_IO_QUAD_ACCESS = 0x2, + /* NAND Access Modes */ NVM_IO_SUSPEND = 0x80, NVM_IO_SLC_MODE = 0x100, NVM_IO_SCRAMBLE_DISABLE = 0x200, + + /* Block Types */ + NVM_BLK_T_FREE = 0x0, + NVM_BLK_T_BAD = 0x1, + NVM_BLK_T_DEV = 0x2, + NVM_BLK_T_HOST = 0x4, }; struct nvm_id_group { -- cgit v1.2.3 From 16f26c3aa9b9c36a9d1092ae3258461d1008481e Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Sun, 6 Dec 2015 11:25:48 +0100 Subject: lightnvm: replace req queue with nvmdev for lld MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the case where a request queue is passed to the low lever lightnvm device drive integration, the device driver might pass its admin commands through another queue. Instead pass nvm_dev, and let the low level drive the appropriate queue. Reported-by: Christoph Hellwig Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 9 +++++---- drivers/lightnvm/core.c | 7 +++---- drivers/lightnvm/gennvm.c | 8 ++++---- drivers/lightnvm/rrpc.c | 2 +- drivers/nvme/host/lightnvm.c | 24 +++++++++++++----------- include/linux/lightnvm.h | 14 +++++++------- 6 files changed, 33 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 0c3940ec5e62..7981b7407305 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -444,8 +444,9 @@ static void null_lnvm_end_io(struct request *rq, int error) blk_put_request(rq); } -static int null_lnvm_submit_io(struct request_queue *q, struct nvm_rq *rqd) +static int null_lnvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) { + struct request_queue *q = dev->q; struct request *rq; struct bio *bio = rqd->bio; @@ -470,7 +471,7 @@ static int null_lnvm_submit_io(struct request_queue *q, struct nvm_rq *rqd) return 0; } -static int null_lnvm_id(struct request_queue *q, struct nvm_id *id) +static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id) { sector_t size = gb * 1024 * 1024 * 1024ULL; sector_t blksize; @@ -523,7 +524,7 @@ static int null_lnvm_id(struct request_queue *q, struct nvm_id *id) return 0; } -static void *null_lnvm_create_dma_pool(struct request_queue *q, char *name) +static void *null_lnvm_create_dma_pool(struct nvm_dev *dev, char *name) { mempool_t *virtmem_pool; @@ -541,7 +542,7 @@ static void null_lnvm_destroy_dma_pool(void *pool) mempool_destroy(pool); } -static void *null_lnvm_dev_dma_alloc(struct request_queue *q, void *pool, +static void *null_lnvm_dev_dma_alloc(struct nvm_dev *dev, void *pool, gfp_t mem_flags, dma_addr_t *dma_handler) { return mempool_alloc(pool, mem_flags); diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 86ce887b2ed6..4a8d1fe34c4e 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -74,7 +74,7 @@ EXPORT_SYMBOL(nvm_unregister_target); void *nvm_dev_dma_alloc(struct nvm_dev *dev, gfp_t mem_flags, dma_addr_t *dma_handler) { - return dev->ops->dev_dma_alloc(dev->q, dev->ppalist_pool, mem_flags, + return dev->ops->dev_dma_alloc(dev, dev->ppalist_pool, mem_flags, dma_handler); } EXPORT_SYMBOL(nvm_dev_dma_alloc); @@ -246,7 +246,7 @@ static int nvm_init(struct nvm_dev *dev) if (!dev->q || !dev->ops) return ret; - if (dev->ops->identity(dev->q, &dev->identity)) { + if (dev->ops->identity(dev, &dev->identity)) { pr_err("nvm: device could not be identified\n"); goto err; } @@ -326,8 +326,7 @@ int nvm_register(struct request_queue *q, char *disk_name, } if (dev->ops->max_phys_sect > 1) { - dev->ppalist_pool = dev->ops->create_dma_pool(dev->q, - "ppalist"); + dev->ppalist_pool = dev->ops->create_dma_pool(dev, "ppalist"); if (!dev->ppalist_pool) { pr_err("nvm: could not create ppa pool\n"); ret = -ENOMEM; diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index ce6025487a5c..52b513a65946 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -195,7 +195,7 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) } if (dev->ops->get_l2p_tbl) { - ret = dev->ops->get_l2p_tbl(dev->q, 0, dev->total_pages, + ret = dev->ops->get_l2p_tbl(dev, 0, dev->total_pages, gennvm_block_map, dev); if (ret) { pr_err("gennvm: could not read L2P table.\n"); @@ -346,7 +346,7 @@ static int gennvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) gennvm_generic_to_addr_mode(dev, rqd); rqd->dev = dev; - return dev->ops->submit_io(dev->q, rqd); + return dev->ops->submit_io(dev, rqd); } static void gennvm_blk_set_type(struct nvm_dev *dev, struct ppa_addr *ppa, @@ -382,7 +382,7 @@ static void gennvm_mark_blk_bad(struct nvm_dev *dev, struct nvm_rq *rqd) if (!dev->ops->set_bb_tbl) return; - if (dev->ops->set_bb_tbl(dev->q, rqd, 1)) + if (dev->ops->set_bb_tbl(dev, rqd, 1)) return; gennvm_addr_to_generic_mode(dev, rqd); @@ -450,7 +450,7 @@ static int gennvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk, gennvm_generic_to_addr_mode(dev, &rqd); - ret = dev->ops->erase_block(dev->q, &rqd); + ret = dev->ops->erase_block(dev, &rqd); if (plane_cnt) nvm_dev_dma_free(dev, rqd.ppa_list, rqd.dma_ppa_list); diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index cf1a4a515b76..134e4faba482 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -1016,7 +1016,7 @@ static int rrpc_map_init(struct rrpc *rrpc) return 0; /* Bring up the mapping table from device */ - ret = dev->ops->get_l2p_tbl(dev->q, 0, dev->total_pages, + ret = dev->ops->get_l2p_tbl(dev, 0, dev->total_pages, rrpc_l2p_update, rrpc); if (ret) { pr_err("nvm: rrpc: could not read L2P table.\n"); diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 762c9a7cbfa6..15f2acb4d5cd 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -271,9 +271,9 @@ static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) return 0; } -static int nvme_nvm_identity(struct request_queue *q, struct nvm_id *nvm_id) +static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id) { - struct nvme_ns *ns = q->queuedata; + struct nvme_ns *ns = nvmdev->q->queuedata; struct nvme_dev *dev = ns->dev; struct nvme_nvm_id *nvme_nvm_id; struct nvme_nvm_command c = {}; @@ -308,10 +308,10 @@ out: return ret; } -static int nvme_nvm_get_l2p_tbl(struct request_queue *q, u64 slba, u32 nlb, +static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb, nvm_l2p_update_fn *update_l2p, void *priv) { - struct nvme_ns *ns = q->queuedata; + struct nvme_ns *ns = nvmdev->q->queuedata; struct nvme_dev *dev = ns->dev; struct nvme_nvm_command c = {}; u32 len = queue_max_hw_sectors(dev->admin_q) << 9; @@ -415,10 +415,10 @@ out: return ret; } -static int nvme_nvm_set_bb_tbl(struct request_queue *q, struct nvm_rq *rqd, +static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct nvm_rq *rqd, int type) { - struct nvme_ns *ns = q->queuedata; + struct nvme_ns *ns = nvmdev->q->queuedata; struct nvme_dev *dev = ns->dev; struct nvme_nvm_command c = {}; int ret = 0; @@ -463,8 +463,9 @@ static void nvme_nvm_end_io(struct request *rq, int error) blk_mq_free_request(rq); } -static int nvme_nvm_submit_io(struct request_queue *q, struct nvm_rq *rqd) +static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) { + struct request_queue *q = dev->q; struct nvme_ns *ns = q->queuedata; struct request *rq; struct bio *bio = rqd->bio; @@ -502,8 +503,9 @@ static int nvme_nvm_submit_io(struct request_queue *q, struct nvm_rq *rqd) return 0; } -static int nvme_nvm_erase_block(struct request_queue *q, struct nvm_rq *rqd) +static int nvme_nvm_erase_block(struct nvm_dev *dev, struct nvm_rq *rqd) { + struct request_queue *q = dev->q; struct nvme_ns *ns = q->queuedata; struct nvme_nvm_command c = {}; @@ -515,9 +517,9 @@ static int nvme_nvm_erase_block(struct request_queue *q, struct nvm_rq *rqd) return nvme_submit_sync_cmd(q, (struct nvme_command *)&c, NULL, 0); } -static void *nvme_nvm_create_dma_pool(struct request_queue *q, char *name) +static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name) { - struct nvme_ns *ns = q->queuedata; + struct nvme_ns *ns = nvmdev->q->queuedata; struct nvme_dev *dev = ns->dev; return dma_pool_create(name, dev->dev, PAGE_SIZE, PAGE_SIZE, 0); @@ -530,7 +532,7 @@ static void nvme_nvm_destroy_dma_pool(void *pool) dma_pool_destroy(dma_pool); } -static void *nvme_nvm_dev_dma_alloc(struct request_queue *q, void *pool, +static void *nvme_nvm_dev_dma_alloc(struct nvm_dev *dev, void *pool, gfp_t mem_flags, dma_addr_t *dma_handler) { return dma_pool_alloc(pool, mem_flags, dma_handler); diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 935ef3844c05..034117b3be5f 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -183,17 +183,17 @@ struct nvm_block; typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *); typedef int (nvm_bb_update_fn)(struct ppa_addr, int, u8 *, void *); -typedef int (nvm_id_fn)(struct request_queue *, struct nvm_id *); -typedef int (nvm_get_l2p_tbl_fn)(struct request_queue *, u64, u32, +typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *); +typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32, nvm_l2p_update_fn *, void *); typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, int, nvm_bb_update_fn *, void *); -typedef int (nvm_op_set_bb_fn)(struct request_queue *, struct nvm_rq *, int); -typedef int (nvm_submit_io_fn)(struct request_queue *, struct nvm_rq *); -typedef int (nvm_erase_blk_fn)(struct request_queue *, struct nvm_rq *); -typedef void *(nvm_create_dma_pool_fn)(struct request_queue *, char *); +typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct nvm_rq *, int); +typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); +typedef int (nvm_erase_blk_fn)(struct nvm_dev *, struct nvm_rq *); +typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *); typedef void (nvm_destroy_dma_pool_fn)(void *); -typedef void *(nvm_dev_dma_alloc_fn)(struct request_queue *, void *, gfp_t, +typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, dma_addr_t *); typedef void (nvm_dev_dma_free_fn)(void *, void*, dma_addr_t); -- cgit v1.2.3 From d144da8c6f51f48ec39d891ea9dff80169c45f3b Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 2 Nov 2015 12:13:25 -0500 Subject: IB/core: use RCU for uverbs id lookup The current implementation gets a spin_lock, and at any scale with qib and hfi1 post send, the lock contention grows exponentially with the number of QPs. idr_find() is RCU compatibile, so read doesn't need the lock. Change to use rcu_read_lock() and rcu_read_unlock() in __idr_get_uobj(). kfree_rcu() is used to insure a grace period between the idr removal and actual free. Reviewed-by: Ira Weiny Signed-off-by: Mike Marciniszyn Reviewed-By: Jason Gunthorpe Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_cmd.c | 12 +++++++----- include/rdma/ib_verbs.h | 1 + 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 4cb8e9d9966c..1c02deab068f 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -62,9 +62,11 @@ static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" }; * The ib_uobject locking scheme is as follows: * * - ib_uverbs_idr_lock protects the uverbs idrs themselves, so it - * needs to be held during all idr operations. When an object is + * needs to be held during all idr write operations. When an object is * looked up, a reference must be taken on the object's kref before - * dropping this lock. + * dropping this lock. For read operations, the rcu_read_lock() + * and rcu_write_lock() but similarly the kref reference is grabbed + * before the rcu_read_unlock(). * * - Each object also has an rwsem. This rwsem must be held for * reading while an operation that uses the object is performed. @@ -96,7 +98,7 @@ static void init_uobj(struct ib_uobject *uobj, u64 user_handle, static void release_uobj(struct kref *kref) { - kfree(container_of(kref, struct ib_uobject, ref)); + kfree_rcu(container_of(kref, struct ib_uobject, ref), rcu); } static void put_uobj(struct ib_uobject *uobj) @@ -145,7 +147,7 @@ static struct ib_uobject *__idr_get_uobj(struct idr *idr, int id, { struct ib_uobject *uobj; - spin_lock(&ib_uverbs_idr_lock); + rcu_read_lock(); uobj = idr_find(idr, id); if (uobj) { if (uobj->context == context) @@ -153,7 +155,7 @@ static struct ib_uobject *__idr_get_uobj(struct idr *idr, int id, else uobj = NULL; } - spin_unlock(&ib_uverbs_idr_lock); + rcu_read_unlock(); return uobj; } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 9a68a19532ba..120da1d7f57e 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1271,6 +1271,7 @@ struct ib_uobject { int id; /* index into kernel idr */ struct kref ref; struct rw_semaphore mutex; /* protects .live */ + struct rcu_head rcu; /* kfree_rcu() overhead */ int live; }; -- cgit v1.2.3 From 533708867dd6388f643f12c87465b59e732d729d Mon Sep 17 00:00:00 2001 From: Hal Rosenstock Date: Fri, 13 Nov 2015 15:22:22 -0500 Subject: IB/mad: Require CM send method for everything except ClassPortInfo Receipt of CM MAD with other than the Send method for an attribute other than the ClassPortInfo attribute is invalid. CM attributes other than ClassPortInfo only use the send method. The SRP initiator does not maintain a timeout policy for CM connect requests relies on the CM layer to do that. The result was that the SRP initiator hung as the connect request never completed. A new SRP target has been observed to respond to Send CM REQ with GetResp of CM REQ with bad status. This is non conformant with IBA spec but exposes a vulnerability in the current MAD/CM code which will respond to the incoming GetResp of CM REQ as if it was a valid incoming Send of CM REQ rather than tossing this on the floor. It also causes the MAD layer not to retransmit the original REQ even though it has not received a REP. Reviewed-by: Sagi Grimberg Signed-off-by: Hal Rosenstock Reviewed-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/core/mad.c | 5 +++++ include/rdma/ib_mad.h | 2 ++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 8d8af7a41a30..2281de122038 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -1811,6 +1811,11 @@ static int validate_mad(const struct ib_mad_hdr *mad_hdr, if (qp_num == 0) valid = 1; } else { + /* CM attributes other than ClassPortInfo only use Send method */ + if ((mad_hdr->mgmt_class == IB_MGMT_CLASS_CM) && + (mad_hdr->attr_id != IB_MGMT_CLASSPORTINFO_ATTR_ID) && + (mad_hdr->method != IB_MGMT_METHOD_SEND)) + goto out; /* Filter GSI packets sent to QP0 */ if (qp_num != 0) valid = 1; diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h index 188df91d5851..ec9b44dd3d80 100644 --- a/include/rdma/ib_mad.h +++ b/include/rdma/ib_mad.h @@ -237,6 +237,8 @@ struct ib_vendor_mad { u8 data[IB_MGMT_VENDOR_DATA]; }; +#define IB_MGMT_CLASSPORTINFO_ATTR_ID cpu_to_be16(0x0001) + struct ib_class_port_info { u8 base_version; u8 class_version; -- cgit v1.2.3 From a5e14ba334e202c58e45ef47414ec94c585c1a8c Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 28 Oct 2015 13:28:15 +0200 Subject: mlx4: Expose correct max_sge_rd limit mlx4 devices (ConnectX-2, ConnectX-3) has a limitation where rdma read work queue entries cannot exceed 512 bytes. A rdma_read wqe needs to fit in 512 bytes: - wqe control segment (16 bytes) - rdma segment (16 bytes) - scatter elements (16 bytes each) So max_sge_rd should be: (512 - 16 - 16) / 16 = 30. Signed-off-by: Sagi Grimberg Reviewed-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/main.c | 2 +- include/linux/mlx4/device.h | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index f567160a4a56..97d6878f9938 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -456,7 +456,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE; props->max_sge = min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg); - props->max_sge_rd = props->max_sge; + props->max_sge_rd = MLX4_MAX_SGE_RD; props->max_cq = dev->dev->quotas.cq; props->max_cqe = dev->dev->caps.max_cqes; props->max_mr = dev->dev->quotas.mpt; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 7501626ab529..d3133be12d92 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -426,6 +426,17 @@ enum { MLX4_MAX_FAST_REG_PAGES = 511, }; +enum { + /* + * Max wqe size for rdma read is 512 bytes, so this + * limits our max_sge_rd as the wqe needs to fit: + * - ctrl segment (16 bytes) + * - rdma segment (16 bytes) + * - scatter elements (16 bytes each) + */ + MLX4_MAX_SGE_RD = (512 - 16 - 16) / 16 +}; + enum { MLX4_DEV_PMC_SUBTYPE_GUID_INFO = 0x14, MLX4_DEV_PMC_SUBTYPE_PORT_INFO = 0x15, -- cgit v1.2.3 From 4c3141e09cfa6460bfcd5e90f73e498db654c917 Mon Sep 17 00:00:00 2001 From: Carlo Caione Date: Tue, 1 Dec 2015 17:24:17 +0100 Subject: of/irq: Export of_irq_find_parent again of_irq_find_parent was made static since it had no users outside of of_irq.c. Export it again since we are going to use it again. Signed-off-by: Carlo Caione [robh: move of_irq_find_parent to correct ifdef section] Signed-off-by: Rob Herring --- drivers/of/irq.c | 3 ++- include/linux/of_irq.h | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/of/irq.c b/drivers/of/irq.c index 902b89be7217..4fa916dffc91 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -53,7 +53,7 @@ EXPORT_SYMBOL_GPL(irq_of_parse_and_map); * Returns a pointer to the interrupt parent node, or NULL if the interrupt * parent could not be determined. */ -static struct device_node *of_irq_find_parent(struct device_node *child) +struct device_node *of_irq_find_parent(struct device_node *child) { struct device_node *p; const __be32 *parp; @@ -77,6 +77,7 @@ static struct device_node *of_irq_find_parent(struct device_node *child) return p; } +EXPORT_SYMBOL_GPL(of_irq_find_parent); /** * of_irq_parse_raw - Low level interrupt tree parsing diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index 039f2eec49ce..f648acf27ed7 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -46,6 +46,7 @@ extern int of_irq_get(struct device_node *dev, int index); extern int of_irq_get_byname(struct device_node *dev, const char *name); extern int of_irq_to_resource_table(struct device_node *dev, struct resource *res, int nr_irqs); +extern struct device_node *of_irq_find_parent(struct device_node *child); extern struct irq_domain *of_msi_get_domain(struct device *dev, struct device_node *np, enum irq_domain_bus_token token); @@ -70,6 +71,11 @@ static inline int of_irq_to_resource_table(struct device_node *dev, { return 0; } +static inline void *of_irq_find_parent(struct device_node *child) +{ + return NULL; +} + static inline struct irq_domain *of_msi_get_domain(struct device *dev, struct device_node *np, enum irq_domain_bus_token token) -- cgit v1.2.3 From eaddb5725357e9f05ffe5d271630f8197d089da4 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 9 Dec 2015 09:11:10 -0600 Subject: of/irq: move of_msi_map_rid declaration to the correct ifdef section In checking fixes for of_irq_find_parent declaration location, I found that of_msi_map_rid is also wrong. of_msi_map_rid is not implemented for Sparc, so it should not be in the Sparc specific section of the header. Move it to just depend on OF_IRQ. Cc: Frank Rowand Signed-off-by: Rob Herring --- include/linux/of_irq.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index f648acf27ed7..1e0deb8e8494 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -53,6 +53,7 @@ extern struct irq_domain *of_msi_get_domain(struct device *dev, extern struct irq_domain *of_msi_map_get_device_domain(struct device *dev, u32 rid); extern void of_msi_configure(struct device *dev, struct device_node *np); +u32 of_msi_map_rid(struct device *dev, struct device_node *msi_np, u32 rid_in); #else static inline int of_irq_count(struct device_node *dev) { @@ -90,6 +91,11 @@ static inline struct irq_domain *of_msi_map_get_device_domain(struct device *dev static inline void of_msi_configure(struct device *dev, struct device_node *np) { } +static inline u32 of_msi_map_rid(struct device *dev, + struct device_node *msi_np, u32 rid_in) +{ + return rid_in; +} #endif #if defined(CONFIG_OF_IRQ) || defined(CONFIG_SPARC) @@ -99,7 +105,6 @@ static inline void of_msi_configure(struct device *dev, struct device_node *np) * so declare it here regardless of the CONFIG_OF_IRQ setting. */ extern unsigned int irq_of_parse_and_map(struct device_node *node, int index); -u32 of_msi_map_rid(struct device *dev, struct device_node *msi_np, u32 rid_in); #else /* !CONFIG_OF && !CONFIG_SPARC */ static inline unsigned int irq_of_parse_and_map(struct device_node *dev, @@ -107,12 +112,6 @@ static inline unsigned int irq_of_parse_and_map(struct device_node *dev, { return 0; } - -static inline u32 of_msi_map_rid(struct device *dev, - struct device_node *msi_np, u32 rid_in) -{ - return rid_in; -} #endif /* !CONFIG_OF */ #endif /* __OF_IRQ_H */ -- cgit v1.2.3 From d7e35dfa2531b53618b9e6edcd8752ce988ac555 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 3 Dec 2015 22:04:01 -0500 Subject: bitops.h: correctly handle rol32 with 0 byte shift ROL on a 32 bit integer with a shift of 32 or more is undefined and the result is arch-dependent. Avoid this by handling the trivial case of roling by 0 correctly. The trivial solution of checking if shift is 0 breaks gcc's detection of this code as a ROL instruction, which is unacceptable. This bug was reported and fixed in GCC (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=57157): The standard rotate idiom, (x << n) | (x >> (32 - n)) is recognized by gcc (for concreteness, I discuss only the case that x is an uint32_t here). However, this is portable C only for n in the range 0 < n < 32. For n == 0, we get x >> 32 which gives undefined behaviour according to the C standard (6.5.7, Bitwise shift operators). To portably support n == 0, one has to write the rotate as something like (x << n) | (x >> ((-n) & 31)) And this is apparently not recognized by gcc. Note that this is broken on older GCCs and will result in slower ROL. Acked-by: Linus Torvalds Signed-off-by: Sasha Levin Signed-off-by: Linus Torvalds --- include/linux/bitops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 2b8ed123ad36..defeaac0745f 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -107,7 +107,7 @@ static inline __u64 ror64(__u64 word, unsigned int shift) */ static inline __u32 rol32(__u32 word, unsigned int shift) { - return (word << shift) | (word >> (32 - shift)); + return (word << shift) | (word >> ((-shift) & 31)); } /** -- cgit v1.2.3 From 059393c5bdd1420bdf1bed2972f33196dff263ae Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 7 Dec 2015 10:11:11 +0000 Subject: irqchip/gic-v3: Add missing struct device_node declaration When the GICv3 header file is used in a C file that doesn't include any of the OF stuff, we end up with a bunch of ugly warnings. Let's keep GCC quiet by adding a forward declaration. Signed-off-by: Marc Zyngier Cc: Cc: Jason Cooper Link: http://lkml.kernel.org/r/1449483072-17694-2-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- include/linux/irqchip/arm-gic-v3.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index c9ae0c6ec050..d5d798b35c1f 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -330,6 +330,7 @@ struct rdists { }; struct irq_domain; +struct device_node; int its_cpu_init(void); int its_init(struct device_node *node, struct rdists *rdists, struct irq_domain *domain); -- cgit v1.2.3 From ad87e03213b552a5c33d5e1e7a19a73768397010 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 10 Dec 2015 15:27:21 -0500 Subject: USB: add quirk for devices with broken LPM Some USB device / host controller combinations seem to have problems with Link Power Management. For example, Steinar found that his xHCI controller wouldn't handle bandwidth calculations correctly for two video cards simultaneously when LPM was enabled, even though the bus had plenty of bandwidth available. This patch introduces a new quirk flag for devices that should remain disabled for LPM, and creates quirk entries for Steinar's devices. Signed-off-by: Alan Stern Reported-by: Steinar H. Gunderson Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hub.c | 7 ++++++- drivers/usb/core/quirks.c | 6 ++++++ include/linux/usb/quirks.h | 3 +++ 3 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 585c3cb07da6..a5cc032ef77a 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -124,6 +124,10 @@ struct usb_hub *usb_hub_to_struct_hub(struct usb_device *hdev) int usb_device_supports_lpm(struct usb_device *udev) { + /* Some devices have trouble with LPM */ + if (udev->quirks & USB_QUIRK_NO_LPM) + return 0; + /* USB 2.1 (and greater) devices indicate LPM support through * their USB 2.0 Extended Capabilities BOS descriptor. */ @@ -4512,6 +4516,8 @@ hub_port_init(struct usb_hub *hub, struct usb_device *udev, int port1, goto fail; } + usb_detect_quirks(udev); + if (udev->wusb == 0 && le16_to_cpu(udev->descriptor.bcdUSB) >= 0x0201) { retval = usb_get_bos_descriptor(udev); if (!retval) { @@ -4710,7 +4716,6 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, if (status < 0) goto loop; - usb_detect_quirks(udev); if (udev->quirks & USB_QUIRK_DELAY_INIT) msleep(1000); diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index fcd6ac0c667f..6dc810bce295 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -202,6 +202,12 @@ static const struct usb_device_id usb_quirk_list[] = { { USB_DEVICE(0x1a0a, 0x0200), .driver_info = USB_QUIRK_LINEAR_UFRAME_INTR_BINTERVAL }, + /* Blackmagic Design Intensity Shuttle */ + { USB_DEVICE(0x1edb, 0xbd3b), .driver_info = USB_QUIRK_NO_LPM }, + + /* Blackmagic Design UltraStudio SDI */ + { USB_DEVICE(0x1edb, 0xbd4f), .driver_info = USB_QUIRK_NO_LPM }, + { } /* terminating entry must be last */ }; diff --git a/include/linux/usb/quirks.h b/include/linux/usb/quirks.h index 9948c874e3f1..1d0043dc34e4 100644 --- a/include/linux/usb/quirks.h +++ b/include/linux/usb/quirks.h @@ -47,4 +47,7 @@ /* device generates spurious wakeup, ignore remote wakeup capability */ #define USB_QUIRK_IGNORE_REMOTE_WAKEUP BIT(9) +/* device can't handle Link Power Management */ +#define USB_QUIRK_NO_LPM BIT(10) + #endif /* __LINUX_USB_QUIRKS_H */ -- cgit v1.2.3 From 98e89cf02aed11166698dd53c6f14865613babb3 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Fri, 11 Dec 2015 13:40:43 -0800 Subject: mm: kmemleak: mark kmemleak_init prototype as __init The kmemleak_init() definition in mm/kmemleak.c is marked __init but its prototype in include/linux/kmemleak.h is marked __ref since commit a6186d89c913 ("kmemleak: Mark the early log buffer as __initdata"). This causes a section mismatch which is reported as a warning when building with clang -Wsection, because kmemleak_init() is declared in section .ref.text but defined in .init.text. Fix this by marking kmemleak_init() prototype __init. Signed-off-by: Nicolas Iooss Signed-off-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kmemleak.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index d0a1f99e24e3..4894c6888bc6 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -25,7 +25,7 @@ #ifdef CONFIG_DEBUG_KMEMLEAK -extern void kmemleak_init(void) __ref; +extern void kmemleak_init(void) __init; extern void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) __ref; extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, -- cgit v1.2.3 From 86fffe4a61dd972d5a4e23260d530be6da02f614 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 11 Dec 2015 13:40:46 -0800 Subject: kernel: remove stop_machine() Kconfig dependency Currently the full stop_machine() routine is only enabled on SMP if module unloading is enabled, or if the CPUs are hotpluggable. This leads to configurations where stop_machine() is broken as it will then only run the callback on the local CPU with irqs disabled, and not stop the other CPUs or run the callback on them. For example, this breaks MTRR setup on x86 in certain configs since ea8596bb2d8d379 ("kprobes/x86: Remove unused text_poke_smp() and text_poke_smp_batch() functions") as the MTRR is only established on the boot CPU. This patch removes the Kconfig option for STOP_MACHINE and uses the SMP and HOTPLUG_CPU config options to compile the correct stop_machine() for the architecture, removing the false dependency on MODULE_UNLOAD in the process. Link: https://lkml.org/lkml/2014/10/8/124 References: https://bugs.freedesktop.org/show_bug.cgi?id=84794 Signed-off-by: Chris Wilson Acked-by: Ingo Molnar Cc: "Paul E. McKenney" Cc: Pranith Kumar Cc: Michal Hocko Cc: Vladimir Davydov Cc: Johannes Weiner Cc: H. Peter Anvin Cc: Tejun Heo Cc: Iulia Manda Cc: Andy Lutomirski Cc: Rusty Russell Cc: Peter Zijlstra Cc: Chuck Ebbert Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/stop_machine.h | 6 +++--- init/Kconfig | 7 ------- kernel/stop_machine.c | 4 ++-- 3 files changed, 5 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 0adedca24c5b..0e1b1540597a 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -99,7 +99,7 @@ static inline int try_stop_cpus(const struct cpumask *cpumask, * grabbing every spinlock (and more). So the "read" side to such a * lock is anything which disables preemption. */ -#if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP) +#if defined(CONFIG_SMP) || defined(CONFIG_HOTPLUG_CPU) /** * stop_machine: freeze the machine on all CPUs and run this function @@ -118,7 +118,7 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); -#else /* CONFIG_STOP_MACHINE && CONFIG_SMP */ +#else /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ static inline int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) @@ -137,5 +137,5 @@ static inline int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, return stop_machine(fn, data, cpus); } -#endif /* CONFIG_STOP_MACHINE && CONFIG_SMP */ +#endif /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ #endif /* _LINUX_STOP_MACHINE */ diff --git a/init/Kconfig b/init/Kconfig index c24b6f767bf0..235c7a2c0d20 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -2030,13 +2030,6 @@ config INIT_ALL_POSSIBLE it was better to provide this option than to break all the archs and have several arch maintainers pursuing me down dark alleys. -config STOP_MACHINE - bool - default y - depends on (SMP && MODULE_UNLOAD) || HOTPLUG_CPU - help - Need stop_machine() primitive. - source "block/Kconfig" config PREEMPT_NOTIFIERS diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 867bc20e1ef1..a3bbaee77c58 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -531,7 +531,7 @@ static int __init cpu_stop_init(void) } early_initcall(cpu_stop_init); -#ifdef CONFIG_STOP_MACHINE +#if defined(CONFIG_SMP) || defined(CONFIG_HOTPLUG_CPU) static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { @@ -631,4 +631,4 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, return ret ?: done.ret; } -#endif /* CONFIG_STOP_MACHINE */ +#endif /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ -- cgit v1.2.3 From dfd01f026058a59a513f8a365b439a0681b803af Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 13 Dec 2015 22:11:16 +0100 Subject: sched/wait: Fix the signal handling fix Jan Stancek reported that I wrecked things for him by fixing things for Vladimir :/ His report was due to an UNINTERRUPTIBLE wait getting -EINTR, which should not be possible, however my previous patch made this possible by unconditionally checking signal_pending(). We cannot use current->state as was done previously, because the instruction after the store to that variable it can be changed. We must instead pass the initial state along and use that. Fixes: 68985633bccb ("sched/wait: Fix signal handling in bit wait helpers") Reported-by: Jan Stancek Reported-by: Chris Mason Tested-by: Jan Stancek Tested-by: Vladimir Murzin Tested-by: Chris Mason Reviewed-by: Paul Turner Cc: Ingo Molnar Cc: tglx@linutronix.de Cc: Oleg Nesterov Cc: hpa@zytor.com Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Linus Torvalds --- fs/cifs/inode.c | 6 +++--- fs/nfs/inode.c | 6 +++--- fs/nfs/internal.h | 2 +- fs/nfs/pagelist.c | 2 +- fs/nfs/pnfs.c | 4 ++-- include/linux/wait.h | 10 +++++----- kernel/sched/wait.c | 20 ++++++++++---------- net/sunrpc/sched.c | 6 +++--- 8 files changed, 28 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 6b66dd5d1540..a329f5ba35aa 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1831,11 +1831,11 @@ cifs_invalidate_mapping(struct inode *inode) * @word: long word containing the bit lock */ static int -cifs_wait_bit_killable(struct wait_bit_key *key) +cifs_wait_bit_killable(struct wait_bit_key *key, int mode) { - if (fatal_signal_pending(current)) - return -ERESTARTSYS; freezable_schedule_unsafe(); + if (signal_pending_state(mode, current)) + return -ERESTARTSYS; return 0; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 31b0a52223a7..c7e8b87da5b2 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -75,11 +75,11 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks * @word: long word containing the bit lock */ -int nfs_wait_bit_killable(struct wait_bit_key *key) +int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) { - if (fatal_signal_pending(current)) - return -ERESTARTSYS; freezable_schedule_unsafe(); + if (signal_pending_state(mode, current)) + return -ERESTARTSYS; return 0; } EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 56cfde26fb9c..9dea85f7f918 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -379,7 +379,7 @@ extern int nfs_drop_inode(struct inode *); extern void nfs_clear_inode(struct inode *); extern void nfs_evict_inode(struct inode *); void nfs_zap_acl_cache(struct inode *inode); -extern int nfs_wait_bit_killable(struct wait_bit_key *key); +extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); /* super.c */ extern const struct super_operations nfs_sops; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index fe3ddd20ff89..452a011ba0d8 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -129,7 +129,7 @@ __nfs_iocounter_wait(struct nfs_io_counter *c) set_bit(NFS_IO_INPROGRESS, &c->flags); if (atomic_read(&c->io_count) == 0) break; - ret = nfs_wait_bit_killable(&q.key); + ret = nfs_wait_bit_killable(&q.key, TASK_KILLABLE); } while (atomic_read(&c->io_count) != 0 && !ret); finish_wait(wq, &q.wait); return ret; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 5a8ae2125b50..bec0384499f7 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1466,11 +1466,11 @@ static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx, } /* stop waiting if someone clears NFS_LAYOUT_RETRY_LAYOUTGET bit. */ -static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key) +static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key, int mode) { if (!test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, key->flags)) return 1; - return nfs_wait_bit_killable(key); + return nfs_wait_bit_killable(key, mode); } static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo) diff --git a/include/linux/wait.h b/include/linux/wait.h index 1e1bf9f963a9..513b36f04dfd 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -145,7 +145,7 @@ __remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old) list_del(&old->task_list); } -typedef int wait_bit_action_f(struct wait_bit_key *); +typedef int wait_bit_action_f(struct wait_bit_key *, int mode); void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key); @@ -960,10 +960,10 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); } while (0) -extern int bit_wait(struct wait_bit_key *); -extern int bit_wait_io(struct wait_bit_key *); -extern int bit_wait_timeout(struct wait_bit_key *); -extern int bit_wait_io_timeout(struct wait_bit_key *); +extern int bit_wait(struct wait_bit_key *, int); +extern int bit_wait_io(struct wait_bit_key *, int); +extern int bit_wait_timeout(struct wait_bit_key *, int); +extern int bit_wait_io_timeout(struct wait_bit_key *, int); /** * wait_on_bit - wait for a bit to be cleared diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index f10bd873e684..f15d6b6a538a 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -392,7 +392,7 @@ __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, do { prepare_to_wait(wq, &q->wait, mode); if (test_bit(q->key.bit_nr, q->key.flags)) - ret = (*action)(&q->key); + ret = (*action)(&q->key, mode); } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); finish_wait(wq, &q->wait); return ret; @@ -431,7 +431,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, prepare_to_wait_exclusive(wq, &q->wait, mode); if (!test_bit(q->key.bit_nr, q->key.flags)) continue; - ret = action(&q->key); + ret = action(&q->key, mode); if (!ret) continue; abort_exclusive_wait(wq, &q->wait, mode, &q->key); @@ -581,43 +581,43 @@ void wake_up_atomic_t(atomic_t *p) } EXPORT_SYMBOL(wake_up_atomic_t); -__sched int bit_wait(struct wait_bit_key *word) +__sched int bit_wait(struct wait_bit_key *word, int mode) { schedule(); - if (signal_pending(current)) + if (signal_pending_state(mode, current)) return -EINTR; return 0; } EXPORT_SYMBOL(bit_wait); -__sched int bit_wait_io(struct wait_bit_key *word) +__sched int bit_wait_io(struct wait_bit_key *word, int mode) { io_schedule(); - if (signal_pending(current)) + if (signal_pending_state(mode, current)) return -EINTR; return 0; } EXPORT_SYMBOL(bit_wait_io); -__sched int bit_wait_timeout(struct wait_bit_key *word) +__sched int bit_wait_timeout(struct wait_bit_key *word, int mode) { unsigned long now = READ_ONCE(jiffies); if (time_after_eq(now, word->timeout)) return -EAGAIN; schedule_timeout(word->timeout - now); - if (signal_pending(current)) + if (signal_pending_state(mode, current)) return -EINTR; return 0; } EXPORT_SYMBOL_GPL(bit_wait_timeout); -__sched int bit_wait_io_timeout(struct wait_bit_key *word) +__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) { unsigned long now = READ_ONCE(jiffies); if (time_after_eq(now, word->timeout)) return -EAGAIN; io_schedule_timeout(word->timeout - now); - if (signal_pending(current)) + if (signal_pending_state(mode, current)) return -EINTR; return 0; } diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index f14f24ee9983..73ad57a59989 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -250,11 +250,11 @@ void rpc_destroy_wait_queue(struct rpc_wait_queue *queue) } EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue); -static int rpc_wait_bit_killable(struct wait_bit_key *key) +static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode) { - if (fatal_signal_pending(current)) - return -ERESTARTSYS; freezable_schedule_unsafe(); + if (signal_pending_state(mode, current)) + return -ERESTARTSYS; return 0; } -- cgit v1.2.3 From 3c13b8f435acb452eac62d966148a8b6fa92151f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 1 Dec 2015 13:48:56 +0000 Subject: KVM: arm/arm64: vgic-v3: Make the LR indexing macro public We store GICv3 LRs in reverse order so that the CPU can save/restore them in rever order as well (don't ask why, the design is crazy), and yet generate memory traffic that doesn't completely suck. We need this macro to be available to the C version of save/restore. Signed-off-by: Marc Zyngier --- include/kvm/arm_vgic.h | 6 ++++++ virt/kvm/arm/vgic-v3.c | 10 ++-------- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index d2f41477f8ae..13a3d537811b 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -279,6 +279,12 @@ struct vgic_v2_cpu_if { u32 vgic_lr[VGIC_V2_MAX_LRS]; }; +/* + * LRs are stored in reverse order in memory. make sure we index them + * correctly. + */ +#define VGIC_V3_LR_INDEX(lr) (VGIC_V3_MAX_LRS - 1 - lr) + struct vgic_v3_cpu_if { #ifdef CONFIG_KVM_ARM_VGIC_V3 u32 vgic_hcr; diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c index 487d6357b7e7..3813d23ebb80 100644 --- a/virt/kvm/arm/vgic-v3.c +++ b/virt/kvm/arm/vgic-v3.c @@ -36,18 +36,12 @@ #define GICH_LR_PHYSID_CPUID (7UL << GICH_LR_PHYSID_CPUID_SHIFT) #define ICH_LR_VIRTUALID_MASK (BIT_ULL(32) - 1) -/* - * LRs are stored in reverse order in memory. make sure we index them - * correctly. - */ -#define LR_INDEX(lr) (VGIC_V3_MAX_LRS - 1 - lr) - static u32 ich_vtr_el2; static struct vgic_lr vgic_v3_get_lr(const struct kvm_vcpu *vcpu, int lr) { struct vgic_lr lr_desc; - u64 val = vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[LR_INDEX(lr)]; + u64 val = vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[VGIC_V3_LR_INDEX(lr)]; if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) lr_desc.irq = val & ICH_LR_VIRTUALID_MASK; @@ -111,7 +105,7 @@ static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr, lr_val |= ((u64)lr_desc.hwirq) << ICH_LR_PHYS_ID_SHIFT; } - vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[LR_INDEX(lr)] = lr_val; + vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[VGIC_V3_LR_INDEX(lr)] = lr_val; if (!(lr_desc.state & LR_STATE_MASK)) vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr |= (1U << lr); -- cgit v1.2.3 From 1431af367e52b08038e78d346822966d968f1694 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 19 Oct 2015 16:32:20 +0100 Subject: arm64: KVM: Implement timer save/restore Implement the timer save restore as a direct translation of the assembly code version. Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/Makefile | 1 + arch/arm64/kvm/hyp/hyp.h | 3 ++ arch/arm64/kvm/hyp/timer-sr.c | 71 ++++++++++++++++++++++++++++++++++++ include/clocksource/arm_arch_timer.h | 6 +++ 4 files changed, 81 insertions(+) create mode 100644 arch/arm64/kvm/hyp/timer-sr.c (limited to 'include') diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile index d1e38ce16bb5..455dc0a3acb5 100644 --- a/arch/arm64/kvm/hyp/Makefile +++ b/arch/arm64/kvm/hyp/Makefile @@ -4,3 +4,4 @@ obj-$(CONFIG_KVM_ARM_HOST) += vgic-v2-sr.o obj-$(CONFIG_KVM_ARM_HOST) += vgic-v3-sr.o +obj-$(CONFIG_KVM_ARM_HOST) += timer-sr.o diff --git a/arch/arm64/kvm/hyp/hyp.h b/arch/arm64/kvm/hyp/hyp.h index 5759f9f5ef14..f213e4652bbf 100644 --- a/arch/arm64/kvm/hyp/hyp.h +++ b/arch/arm64/kvm/hyp/hyp.h @@ -35,5 +35,8 @@ void __vgic_v2_restore_state(struct kvm_vcpu *vcpu); void __vgic_v3_save_state(struct kvm_vcpu *vcpu); void __vgic_v3_restore_state(struct kvm_vcpu *vcpu); +void __timer_save_state(struct kvm_vcpu *vcpu); +void __timer_restore_state(struct kvm_vcpu *vcpu); + #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/kvm/hyp/timer-sr.c b/arch/arm64/kvm/hyp/timer-sr.c new file mode 100644 index 000000000000..1051e5d7320f --- /dev/null +++ b/arch/arm64/kvm/hyp/timer-sr.c @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2012-2015 - ARM Ltd + * Author: Marc Zyngier + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include + +#include + +#include "hyp.h" + +/* vcpu is already in the HYP VA space */ +void __hyp_text __timer_save_state(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = kern_hyp_va(vcpu->kvm); + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + u64 val; + + if (kvm->arch.timer.enabled) { + timer->cntv_ctl = read_sysreg(cntv_ctl_el0); + timer->cntv_cval = read_sysreg(cntv_cval_el0); + } + + /* Disable the virtual timer */ + write_sysreg(0, cntv_ctl_el0); + + /* Allow physical timer/counter access for the host */ + val = read_sysreg(cnthctl_el2); + val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN; + write_sysreg(val, cnthctl_el2); + + /* Clear cntvoff for the host */ + write_sysreg(0, cntvoff_el2); +} + +void __hyp_text __timer_restore_state(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = kern_hyp_va(vcpu->kvm); + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + u64 val; + + /* + * Disallow physical timer access for the guest + * Physical counter access is allowed + */ + val = read_sysreg(cnthctl_el2); + val &= ~CNTHCTL_EL1PCEN; + val |= CNTHCTL_EL1PCTEN; + write_sysreg(val, cnthctl_el2); + + if (kvm->arch.timer.enabled) { + write_sysreg(kvm->arch.timer.cntvoff, cntvoff_el2); + write_sysreg(timer->cntv_cval, cntv_cval_el0); + isb(); + write_sysreg(timer->cntv_ctl, cntv_ctl_el0); + } +} diff --git a/include/clocksource/arm_arch_timer.h b/include/clocksource/arm_arch_timer.h index 9916d0e4eff5..25d0914481a2 100644 --- a/include/clocksource/arm_arch_timer.h +++ b/include/clocksource/arm_arch_timer.h @@ -23,6 +23,12 @@ #define ARCH_TIMER_CTRL_IT_MASK (1 << 1) #define ARCH_TIMER_CTRL_IT_STAT (1 << 2) +#define CNTHCTL_EL1PCTEN (1 << 0) +#define CNTHCTL_EL1PCEN (1 << 1) +#define CNTHCTL_EVNTEN (1 << 2) +#define CNTHCTL_EVNTDIR (1 << 3) +#define CNTHCTL_EVNTI (0xF << 4) + enum arch_timer_reg { ARCH_TIMER_REG_CTRL, ARCH_TIMER_REG_TVAL, -- cgit v1.2.3