diff options
author | Alistair Popple <apopple@nvidia.com> | 2021-07-01 03:54:25 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-07-01 20:06:03 +0200 |
commit | b756a3b5e7ead8f6f4b03cea8ac22478ce04c8a8 (patch) | |
tree | fc6e3f788e1144e48ddc15475dc18d44448d6e22 /include | |
parent | mm/memory.c: allow different return codes for copy_nonpresent_pte() (diff) | |
download | linux-b756a3b5e7ead8f6f4b03cea8ac22478ce04c8a8.tar.xz linux-b756a3b5e7ead8f6f4b03cea8ac22478ce04c8a8.zip |
mm: device exclusive memory access
Some devices require exclusive write access to shared virtual memory (SVM)
ranges to perform atomic operations on that memory. This requires CPU
page tables to be updated to deny access whilst atomic operations are
occurring.
In order to do this introduce a new swap entry type
(SWP_DEVICE_EXCLUSIVE). When a SVM range needs to be marked for exclusive
access by a device all page table mappings for the particular range are
replaced with device exclusive swap entries. This causes any CPU access
to the page to result in a fault.
Faults are resovled by replacing the faulting entry with the original
mapping. This results in MMU notifiers being called which a driver uses
to update access permissions such as revoking atomic access. After
notifiers have been called the device will no longer have exclusive access
to the region.
Walking of the page tables to find the target pages is handled by
get_user_pages() rather than a direct page table walk. A direct page
table walk similar to what migrate_vma_collect()/unmap() does could also
have been utilised. However this resulted in more code similar in
functionality to what get_user_pages() provides as page faulting is
required to make the PTEs present and to break COW.
[dan.carpenter@oracle.com: fix signedness bug in make_device_exclusive_range()]
Link: https://lkml.kernel.org/r/YNIz5NVnZ5GiZ3u1@mwanda
Link: https://lkml.kernel.org/r/20210616105937.23201-8-apopple@nvidia.com
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Ben Skeggs <bskeggs@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'include')
-rw-r--r-- | include/linux/mmu_notifier.h | 6 | ||||
-rw-r--r-- | include/linux/rmap.h | 4 | ||||
-rw-r--r-- | include/linux/swap.h | 9 | ||||
-rw-r--r-- | include/linux/swapops.h | 44 |
4 files changed, 60 insertions, 3 deletions
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 8e428eb813b8..6692da8d121d 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -42,6 +42,11 @@ struct mmu_interval_notifier; * @MMU_NOTIFY_MIGRATE: used during migrate_vma_collect() invalidate to signal * a device driver to possibly ignore the invalidation if the * owner field matches the driver's device private pgmap owner. + * + * @MMU_NOTIFY_EXCLUSIVE: to signal a device driver that the device will no + * longer have exclusive access to the page. When sent during creation of an + * exclusive range the owner will be initialised to the value provided by the + * caller of make_device_exclusive_range(), otherwise the owner will be NULL. */ enum mmu_notifier_event { MMU_NOTIFY_UNMAP = 0, @@ -51,6 +56,7 @@ enum mmu_notifier_event { MMU_NOTIFY_SOFT_DIRTY, MMU_NOTIFY_RELEASE, MMU_NOTIFY_MIGRATE, + MMU_NOTIFY_EXCLUSIVE, }; #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b0ea9d98302f..83fb86133fe1 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -194,6 +194,10 @@ int page_referenced(struct page *, int is_locked, void try_to_migrate(struct page *page, enum ttu_flags flags); void try_to_unmap(struct page *, enum ttu_flags flags); +int make_device_exclusive_range(struct mm_struct *mm, unsigned long start, + unsigned long end, struct page **pages, + void *arg); + /* Avoid racy checks */ #define PVMW_SYNC (1 << 0) /* Look for migarion entries rather than present PTEs */ diff --git a/include/linux/swap.h b/include/linux/swap.h index df7cbb6b3d3e..6f5a43251593 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -62,12 +62,17 @@ static inline int current_is_kswapd(void) * migrate part of a process memory to device memory. * * When a page is migrated from CPU to device, we set the CPU page table entry - * to a special SWP_DEVICE_* entry. + * to a special SWP_DEVICE_{READ|WRITE} entry. + * + * When a page is mapped by the device for exclusive access we set the CPU page + * table entries to special SWP_DEVICE_EXCLUSIVE_* entries. */ #ifdef CONFIG_DEVICE_PRIVATE -#define SWP_DEVICE_NUM 2 +#define SWP_DEVICE_NUM 4 #define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM) #define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1) +#define SWP_DEVICE_EXCLUSIVE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2) +#define SWP_DEVICE_EXCLUSIVE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+3) #else #define SWP_DEVICE_NUM 0 #endif diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 04d76357aa0c..d356ab4047f7 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -127,6 +127,27 @@ static inline bool is_writable_device_private_entry(swp_entry_t entry) { return unlikely(swp_type(entry) == SWP_DEVICE_WRITE); } + +static inline swp_entry_t make_readable_device_exclusive_entry(pgoff_t offset) +{ + return swp_entry(SWP_DEVICE_EXCLUSIVE_READ, offset); +} + +static inline swp_entry_t make_writable_device_exclusive_entry(pgoff_t offset) +{ + return swp_entry(SWP_DEVICE_EXCLUSIVE_WRITE, offset); +} + +static inline bool is_device_exclusive_entry(swp_entry_t entry) +{ + return swp_type(entry) == SWP_DEVICE_EXCLUSIVE_READ || + swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE; +} + +static inline bool is_writable_device_exclusive_entry(swp_entry_t entry) +{ + return unlikely(swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE); +} #else /* CONFIG_DEVICE_PRIVATE */ static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) { @@ -147,6 +168,26 @@ static inline bool is_writable_device_private_entry(swp_entry_t entry) { return false; } + +static inline swp_entry_t make_readable_device_exclusive_entry(pgoff_t offset) +{ + return swp_entry(0, 0); +} + +static inline swp_entry_t make_writable_device_exclusive_entry(pgoff_t offset) +{ + return swp_entry(0, 0); +} + +static inline bool is_device_exclusive_entry(swp_entry_t entry) +{ + return false; +} + +static inline bool is_writable_device_exclusive_entry(swp_entry_t entry) +{ + return false; +} #endif /* CONFIG_DEVICE_PRIVATE */ #ifdef CONFIG_MIGRATION @@ -226,7 +267,8 @@ static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry) */ static inline bool is_pfn_swap_entry(swp_entry_t entry) { - return is_migration_entry(entry) || is_device_private_entry(entry); + return is_migration_entry(entry) || is_device_private_entry(entry) || + is_device_exclusive_entry(entry); } struct page_vma_mapped_walk; |