diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-05-27 22:49:24 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-05-27 22:49:24 +0200 |
commit | 2566278551d3db875bc3bbfc41b42f2e80392108 (patch) | |
tree | a8d9923a1b0f16e5af40fce0ee30e03f15956106 /drivers/iommu/intel-iommu.c | |
parent | Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm (diff) | |
parent | iommu/vt-d: Use per-cpu IOVA caching (diff) | |
download | linux-2566278551d3db875bc3bbfc41b42f2e80392108.tar.xz linux-2566278551d3db875bc3bbfc41b42f2e80392108.zip |
Merge git://git.infradead.org/intel-iommu
Pull intel IOMMU updates from David Woodhouse:
"This patchset improves the scalability of the Intel IOMMU code by
resolving two spinlock bottlenecks and eliminating the linearity of
the IOVA allocator, yielding up to ~5x performance improvement and
approaching 'iommu=off' performance"
* git://git.infradead.org/intel-iommu:
iommu/vt-d: Use per-cpu IOVA caching
iommu/iova: introduce per-cpu caching to iova allocation
iommu/vt-d: change intel-iommu to use IOVA frame numbers
iommu/vt-d: avoid dev iotlb logic for domains with no dev iotlbs
iommu/vt-d: only unmap mapped entries
iommu/vt-d: correct flush_unmaps pfn usage
iommu/vt-d: per-cpu deferred invalidation queues
iommu/vt-d: refactoring of deferred flush entries
Diffstat (limited to 'drivers/iommu/intel-iommu.c')
-rw-r--r-- | drivers/iommu/intel-iommu.c | 318 |
1 files changed, 224 insertions, 94 deletions
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index b2bfb9594508..a644d0cec2d8 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -33,6 +33,7 @@ #include <linux/dma-mapping.h> #include <linux/mempool.h> #include <linux/memory.h> +#include <linux/cpu.h> #include <linux/timer.h> #include <linux/io.h> #include <linux/iova.h> @@ -390,6 +391,7 @@ struct dmar_domain { * domain ids are 16 bit wide according * to VT-d spec, section 9.3 */ + bool has_iotlb_device; struct list_head devices; /* all devices' list */ struct iova_domain iovad; /* iova's that belong to this domain */ @@ -456,27 +458,32 @@ static LIST_HEAD(dmar_rmrr_units); static void flush_unmaps_timeout(unsigned long data); -static DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0); +struct deferred_flush_entry { + unsigned long iova_pfn; + unsigned long nrpages; + struct dmar_domain *domain; + struct page *freelist; +}; #define HIGH_WATER_MARK 250 -struct deferred_flush_tables { +struct deferred_flush_table { int next; - struct iova *iova[HIGH_WATER_MARK]; - struct dmar_domain *domain[HIGH_WATER_MARK]; - struct page *freelist[HIGH_WATER_MARK]; + struct deferred_flush_entry entries[HIGH_WATER_MARK]; +}; + +struct deferred_flush_data { + spinlock_t lock; + int timer_on; + struct timer_list timer; + long size; + struct deferred_flush_table *tables; }; -static struct deferred_flush_tables *deferred_flush; +DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush); /* bitmap for indexing intel_iommus */ static int g_num_of_iommus; -static DEFINE_SPINLOCK(async_umap_flush_lock); -static LIST_HEAD(unmaps_to_do); - -static int timer_on; -static long list_size; - static void domain_exit(struct dmar_domain *domain); static void domain_remove_dev_info(struct dmar_domain *domain); static void dmar_remove_one_dev_info(struct dmar_domain *domain, @@ -1458,10 +1465,35 @@ iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu, return NULL; } +static void domain_update_iotlb(struct dmar_domain *domain) +{ + struct device_domain_info *info; + bool has_iotlb_device = false; + + assert_spin_locked(&device_domain_lock); + + list_for_each_entry(info, &domain->devices, link) { + struct pci_dev *pdev; + + if (!info->dev || !dev_is_pci(info->dev)) + continue; + + pdev = to_pci_dev(info->dev); + if (pdev->ats_enabled) { + has_iotlb_device = true; + break; + } + } + + domain->has_iotlb_device = has_iotlb_device; +} + static void iommu_enable_dev_iotlb(struct device_domain_info *info) { struct pci_dev *pdev; + assert_spin_locked(&device_domain_lock); + if (!info || !dev_is_pci(info->dev)) return; @@ -1481,6 +1513,7 @@ static void iommu_enable_dev_iotlb(struct device_domain_info *info) #endif if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) { info->ats_enabled = 1; + domain_update_iotlb(info->domain); info->ats_qdep = pci_ats_queue_depth(pdev); } } @@ -1489,6 +1522,8 @@ static void iommu_disable_dev_iotlb(struct device_domain_info *info) { struct pci_dev *pdev; + assert_spin_locked(&device_domain_lock); + if (!dev_is_pci(info->dev)) return; @@ -1497,6 +1532,7 @@ static void iommu_disable_dev_iotlb(struct device_domain_info *info) if (info->ats_enabled) { pci_disable_ats(pdev); info->ats_enabled = 0; + domain_update_iotlb(info->domain); } #ifdef CONFIG_INTEL_IOMMU_SVM if (info->pri_enabled) { @@ -1517,6 +1553,9 @@ static void iommu_flush_dev_iotlb(struct dmar_domain *domain, unsigned long flags; struct device_domain_info *info; + if (!domain->has_iotlb_device) + return; + spin_lock_irqsave(&device_domain_lock, flags); list_for_each_entry(info, &domain->devices, link) { if (!info->ats_enabled) @@ -1734,6 +1773,7 @@ static struct dmar_domain *alloc_domain(int flags) memset(domain, 0, sizeof(*domain)); domain->nid = -1; domain->flags = flags; + domain->has_iotlb_device = false; INIT_LIST_HEAD(&domain->devices); return domain; @@ -1918,8 +1958,12 @@ static void domain_exit(struct dmar_domain *domain) return; /* Flush any lazy unmaps that may reference this domain */ - if (!intel_iommu_strict) - flush_unmaps_timeout(0); + if (!intel_iommu_strict) { + int cpu; + + for_each_possible_cpu(cpu) + flush_unmaps_timeout(cpu); + } /* Remove associated devices and clear attached or cached domains */ rcu_read_lock(); @@ -3077,7 +3121,7 @@ static int __init init_dmars(void) bool copied_tables = false; struct device *dev; struct intel_iommu *iommu; - int i, ret; + int i, ret, cpu; /* * for each drhd @@ -3110,11 +3154,20 @@ static int __init init_dmars(void) goto error; } - deferred_flush = kzalloc(g_num_of_iommus * - sizeof(struct deferred_flush_tables), GFP_KERNEL); - if (!deferred_flush) { - ret = -ENOMEM; - goto free_g_iommus; + for_each_possible_cpu(cpu) { + struct deferred_flush_data *dfd = per_cpu_ptr(&deferred_flush, + cpu); + + dfd->tables = kzalloc(g_num_of_iommus * + sizeof(struct deferred_flush_table), + GFP_KERNEL); + if (!dfd->tables) { + ret = -ENOMEM; + goto free_g_iommus; + } + + spin_lock_init(&dfd->lock); + setup_timer(&dfd->timer, flush_unmaps_timeout, cpu); } for_each_active_iommu(iommu, drhd) { @@ -3291,19 +3344,20 @@ free_iommu: disable_dmar_iommu(iommu); free_dmar_iommu(iommu); } - kfree(deferred_flush); free_g_iommus: + for_each_possible_cpu(cpu) + kfree(per_cpu_ptr(&deferred_flush, cpu)->tables); kfree(g_iommus); error: return ret; } /* This takes a number of _MM_ pages, not VTD pages */ -static struct iova *intel_alloc_iova(struct device *dev, +static unsigned long intel_alloc_iova(struct device *dev, struct dmar_domain *domain, unsigned long nrpages, uint64_t dma_mask) { - struct iova *iova = NULL; + unsigned long iova_pfn = 0; /* Restrict dma_mask to the width that the iommu can handle */ dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask); @@ -3316,19 +3370,19 @@ static struct iova *intel_alloc_iova(struct device *dev, * DMA_BIT_MASK(32) and if that fails then try allocating * from higher range */ - iova = alloc_iova(&domain->iovad, nrpages, - IOVA_PFN(DMA_BIT_MASK(32)), 1); - if (iova) - return iova; + iova_pfn = alloc_iova_fast(&domain->iovad, nrpages, + IOVA_PFN(DMA_BIT_MASK(32))); + if (iova_pfn) + return iova_pfn; } - iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1); - if (unlikely(!iova)) { + iova_pfn = alloc_iova_fast(&domain->iovad, nrpages, IOVA_PFN(dma_mask)); + if (unlikely(!iova_pfn)) { pr_err("Allocating %ld-page iova for %s failed", nrpages, dev_name(dev)); - return NULL; + return 0; } - return iova; + return iova_pfn; } static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev) @@ -3426,7 +3480,7 @@ static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr, { struct dmar_domain *domain; phys_addr_t start_paddr; - struct iova *iova; + unsigned long iova_pfn; int prot = 0; int ret; struct intel_iommu *iommu; @@ -3444,8 +3498,8 @@ static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr, iommu = domain_get_iommu(domain); size = aligned_nrpages(paddr, size); - iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask); - if (!iova) + iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask); + if (!iova_pfn) goto error; /* @@ -3463,7 +3517,7 @@ static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr, * might have two guest_addr mapping to the same host paddr, but this * is not a big problem */ - ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo), + ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn), mm_to_dma_pfn(paddr_pfn), size, prot); if (ret) goto error; @@ -3471,18 +3525,18 @@ static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr, /* it's a non-present to present mapping. Only flush if caching mode */ if (cap_caching_mode(iommu->cap)) iommu_flush_iotlb_psi(iommu, domain, - mm_to_dma_pfn(iova->pfn_lo), + mm_to_dma_pfn(iova_pfn), size, 0, 1); else iommu_flush_write_buffer(iommu); - start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT; + start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT; start_paddr += paddr & ~PAGE_MASK; return start_paddr; error: - if (iova) - __free_iova(&domain->iovad, iova); + if (iova_pfn) + free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size)); pr_err("Device %s request: %zx@%llx dir %d --- failed\n", dev_name(dev), size, (unsigned long long)paddr, dir); return 0; @@ -3497,91 +3551,120 @@ static dma_addr_t intel_map_page(struct device *dev, struct page *page, dir, *dev->dma_mask); } -static void flush_unmaps(void) +static void flush_unmaps(struct deferred_flush_data *flush_data) { int i, j; - timer_on = 0; + flush_data->timer_on = 0; /* just flush them all */ for (i = 0; i < g_num_of_iommus; i++) { struct intel_iommu *iommu = g_iommus[i]; + struct deferred_flush_table *flush_table = + &flush_data->tables[i]; if (!iommu) continue; - if (!deferred_flush[i].next) + if (!flush_table->next) continue; /* In caching mode, global flushes turn emulation expensive */ if (!cap_caching_mode(iommu->cap)) iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); - for (j = 0; j < deferred_flush[i].next; j++) { + for (j = 0; j < flush_table->next; j++) { unsigned long mask; - struct iova *iova = deferred_flush[i].iova[j]; - struct dmar_domain *domain = deferred_flush[i].domain[j]; + struct deferred_flush_entry *entry = + &flush_table->entries[j]; + unsigned long iova_pfn = entry->iova_pfn; + unsigned long nrpages = entry->nrpages; + struct dmar_domain *domain = entry->domain; + struct page *freelist = entry->freelist; /* On real hardware multiple invalidations are expensive */ if (cap_caching_mode(iommu->cap)) iommu_flush_iotlb_psi(iommu, domain, - iova->pfn_lo, iova_size(iova), - !deferred_flush[i].freelist[j], 0); + mm_to_dma_pfn(iova_pfn), + nrpages, !freelist, 0); else { - mask = ilog2(mm_to_dma_pfn(iova_size(iova))); - iommu_flush_dev_iotlb(deferred_flush[i].domain[j], - (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask); + mask = ilog2(nrpages); + iommu_flush_dev_iotlb(domain, + (uint64_t)iova_pfn << PAGE_SHIFT, mask); } - __free_iova(&deferred_flush[i].domain[j]->iovad, iova); - if (deferred_flush[i].freelist[j]) - dma_free_pagelist(deferred_flush[i].freelist[j]); + free_iova_fast(&domain->iovad, iova_pfn, nrpages); + if (freelist) + dma_free_pagelist(freelist); } - deferred_flush[i].next = 0; + flush_table->next = 0; } - list_size = 0; + flush_data->size = 0; } -static void flush_unmaps_timeout(unsigned long data) +static void flush_unmaps_timeout(unsigned long cpuid) { + struct deferred_flush_data *flush_data = per_cpu_ptr(&deferred_flush, cpuid); unsigned long flags; - spin_lock_irqsave(&async_umap_flush_lock, flags); - flush_unmaps(); - spin_unlock_irqrestore(&async_umap_flush_lock, flags); + spin_lock_irqsave(&flush_data->lock, flags); + flush_unmaps(flush_data); + spin_unlock_irqrestore(&flush_data->lock, flags); } -static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist) +static void add_unmap(struct dmar_domain *dom, unsigned long iova_pfn, + unsigned long nrpages, struct page *freelist) { unsigned long flags; - int next, iommu_id; + int entry_id, iommu_id; struct intel_iommu *iommu; + struct deferred_flush_entry *entry; + struct deferred_flush_data *flush_data; + unsigned int cpuid; - spin_lock_irqsave(&async_umap_flush_lock, flags); - if (list_size == HIGH_WATER_MARK) - flush_unmaps(); + cpuid = get_cpu(); + flush_data = per_cpu_ptr(&deferred_flush, cpuid); + + /* Flush all CPUs' entries to avoid deferring too much. If + * this becomes a bottleneck, can just flush us, and rely on + * flush timer for the rest. + */ + if (flush_data->size == HIGH_WATER_MARK) { + int cpu; + + for_each_online_cpu(cpu) + flush_unmaps_timeout(cpu); + } + + spin_lock_irqsave(&flush_data->lock, flags); iommu = domain_get_iommu(dom); iommu_id = iommu->seq_id; - next = deferred_flush[iommu_id].next; - deferred_flush[iommu_id].domain[next] = dom; - deferred_flush[iommu_id].iova[next] = iova; - deferred_flush[iommu_id].freelist[next] = freelist; - deferred_flush[iommu_id].next++; + entry_id = flush_data->tables[iommu_id].next; + ++(flush_data->tables[iommu_id].next); - if (!timer_on) { - mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10)); - timer_on = 1; + entry = &flush_data->tables[iommu_id].entries[entry_id]; + entry->domain = dom; + entry->iova_pfn = iova_pfn; + entry->nrpages = nrpages; + entry->freelist = freelist; + + if (!flush_data->timer_on) { + mod_timer(&flush_data->timer, jiffies + msecs_to_jiffies(10)); + flush_data->timer_on = 1; } - list_size++; - spin_unlock_irqrestore(&async_umap_flush_lock, flags); + flush_data->size++; + spin_unlock_irqrestore(&flush_data->lock, flags); + + put_cpu(); } -static void intel_unmap(struct device *dev, dma_addr_t dev_addr) +static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size) { struct dmar_domain *domain; unsigned long start_pfn, last_pfn; - struct iova *iova; + unsigned long nrpages; + unsigned long iova_pfn; struct intel_iommu *iommu; struct page *freelist; @@ -3593,13 +3676,11 @@ static void intel_unmap(struct device *dev, dma_addr_t dev_addr) iommu = domain_get_iommu(domain); - iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr)); - if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n", - (unsigned long long)dev_addr)) - return; + iova_pfn = IOVA_PFN(dev_addr); - start_pfn = mm_to_dma_pfn(iova->pfn_lo); - last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1; + nrpages = aligned_nrpages(dev_addr, size); + start_pfn = mm_to_dma_pfn(iova_pfn); + last_pfn = start_pfn + nrpages - 1; pr_debug("Device %s unmapping: pfn %lx-%lx\n", dev_name(dev), start_pfn, last_pfn); @@ -3608,12 +3689,12 @@ static void intel_unmap(struct device *dev, dma_addr_t dev_addr) if (intel_iommu_strict) { iommu_flush_iotlb_psi(iommu, domain, start_pfn, - last_pfn - start_pfn + 1, !freelist, 0); + nrpages, !freelist, 0); /* free iova */ - __free_iova(&domain->iovad, iova); + free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages)); dma_free_pagelist(freelist); } else { - add_unmap(domain, iova, freelist); + add_unmap(domain, iova_pfn, nrpages, freelist); /* * queue up the release of the unmap to save the 1/6th of the * cpu used up by the iotlb flush operation... @@ -3625,7 +3706,7 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size, enum dma_data_direction dir, struct dma_attrs *attrs) { - intel_unmap(dev, dev_addr); + intel_unmap(dev, dev_addr, size); } static void *intel_alloc_coherent(struct device *dev, size_t size, @@ -3684,7 +3765,7 @@ static void intel_free_coherent(struct device *dev, size_t size, void *vaddr, size = PAGE_ALIGN(size); order = get_order(size); - intel_unmap(dev, dma_handle); + intel_unmap(dev, dma_handle, size); if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT)) __free_pages(page, order); } @@ -3693,7 +3774,16 @@ static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems, enum dma_data_direction dir, struct dma_attrs *attrs) { - intel_unmap(dev, sglist[0].dma_address); + dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK; + unsigned long nrpages = 0; + struct scatterlist *sg; + int i; + + for_each_sg(sglist, sg, nelems, i) { + nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg)); + } + + intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT); } static int intel_nontranslate_map_sg(struct device *hddev, @@ -3717,7 +3807,7 @@ static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nele struct dmar_domain *domain; size_t size = 0; int prot = 0; - struct iova *iova = NULL; + unsigned long iova_pfn; int ret; struct scatterlist *sg; unsigned long start_vpfn; @@ -3736,9 +3826,9 @@ static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nele for_each_sg(sglist, sg, nelems, i) size += aligned_nrpages(sg->offset, sg->length); - iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), + iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), *dev->dma_mask); - if (!iova) { + if (!iova_pfn) { sglist->dma_length = 0; return 0; } @@ -3753,13 +3843,13 @@ static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nele if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) prot |= DMA_PTE_WRITE; - start_vpfn = mm_to_dma_pfn(iova->pfn_lo); + start_vpfn = mm_to_dma_pfn(iova_pfn); ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot); if (unlikely(ret)) { dma_pte_free_pagetable(domain, start_vpfn, start_vpfn + size - 1); - __free_iova(&domain->iovad, iova); + free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size)); return 0; } @@ -4498,6 +4588,46 @@ static struct notifier_block intel_iommu_memory_nb = { .priority = 0 }; +static void free_all_cpu_cached_iovas(unsigned int cpu) +{ + int i; + + for (i = 0; i < g_num_of_iommus; i++) { + struct intel_iommu *iommu = g_iommus[i]; + struct dmar_domain *domain; + u16 did; + + if (!iommu) + continue; + + for (did = 0; did < 0xffff; did++) { + domain = get_iommu_domain(iommu, did); + + if (!domain) + continue; + free_cpu_cached_iovas(cpu, &domain->iovad); + } + } +} + +static int intel_iommu_cpu_notifier(struct notifier_block *nfb, + unsigned long action, void *v) +{ + unsigned int cpu = (unsigned long)v; + + switch (action) { + case CPU_DEAD: + case CPU_DEAD_FROZEN: + free_all_cpu_cached_iovas(cpu); + flush_unmaps_timeout(cpu); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block intel_iommu_cpu_nb = { + .notifier_call = intel_iommu_cpu_notifier, +}; static ssize_t intel_iommu_show_version(struct device *dev, struct device_attribute *attr, @@ -4631,7 +4761,6 @@ int __init intel_iommu_init(void) up_write(&dmar_global_lock); pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); - init_timer(&unmap_timer); #ifdef CONFIG_SWIOTLB swiotlb = 0; #endif @@ -4648,6 +4777,7 @@ int __init intel_iommu_init(void) bus_register_notifier(&pci_bus_type, &device_nb); if (si_domain && !hw_pass_through) register_memory_notifier(&intel_iommu_memory_nb); + register_hotcpu_notifier(&intel_iommu_cpu_nb); intel_iommu_enabled = 1; |