/*
 * linux/mm/percpu.c - percpu memory allocator
 *
 * Copyright (C) 2009		SUSE Linux Products GmbH
 * Copyright (C) 2009		Tejun Heo
 *
 * This file is released under the GPLv2.
 *
 * This is the percpu allocator which can handle both static and dynamic
 * areas.  Percpu areas are allocated in chunks in vmalloc area.  Each
 * chunk consists of num_possible_cpus() units and the first chunk
 * is used for static percpu variables in the kernel image (special
 * boot time alloc/init handling necessary as these areas need to be
 * brought up before allocation services are running).  Unit grows as
 * necessary and all units grow or shrink in unison.  When a chunk is
 * filled up, another chunk is allocated.  ie. in vmalloc area
 *
 *  c0                           c1                         c2
 *  -------------------          -------------------        ------------
 * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
 *  -------------------  ......  -------------------  ....  ------------
 *
 * Allocation is done in offset-size areas of single unit space.  Ie,
 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
 * c1:u1, c1:u2 and c1:u3.  Percpu access can be done by configuring
 * percpu base registers pcpu_unit_size apart.
 *
 * There are usually many small percpu allocations, many of them as
 * small as 4 bytes.  The allocator organizes chunks into lists
 * according to free size and tries to allocate from the fullest one.
 * Each chunk keeps the maximum contiguous area size hint which is
 * guaranteed to be equal to or larger than the maximum contiguous
 * area in the chunk.  This helps the allocator not to iterate the
 * chunk maps unnecessarily.
 *
 * Allocation state in each chunk is kept using an array of integers
 * on chunk->map.  A positive value in the map represents a free
 * region and negative allocated.  Allocation inside a chunk is done
 * by scanning this map sequentially and serving the first matching
 * entry.  This is mostly copied from the percpu_modalloc() allocator.
* Chunks can be determined from the address using the index field
 * in the page struct.  The index field contains a pointer to the chunk.
 *
 * To use this allocator, arch code should do the following.
 *
 * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA
 *
 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
 *   regular address to percpu pointer and back if they need to be
 *   different from the default
 *
 * - use pcpu_setup_first_chunk() during percpu area initialization to
 *   setup the first chunk containing the kernel static percpu area
 */

/*
 * NOTE(review): the header names of the #include directives below are
 * missing in this copy of the file; restore them from the original
 * source before building.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/*
 * NOTE(review): __pcpu_size_to_slot() computes
 * max(fls(size) - PCPU_SLOT_BASE_SHIFT + 2, 1), which puts sizes 1-15
 * in slot 1 and 16-31 in slot 2; confirm the range stated below.
 */
#define PCPU_SLOT_BASE_SHIFT		5	/* 1-31 shares the same slot */
#define PCPU_DFL_MAP_ALLOC		16	/* start a map with 16 ents */

/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
#ifndef __addr_to_pcpu_ptr
#define __addr_to_pcpu_ptr(addr)					\
	(void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr	\
		 + (unsigned long)__per_cpu_start)
#endif
#ifndef __pcpu_ptr_to_addr
#define __pcpu_ptr_to_addr(ptr)						\
	(void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr	\
		 - (unsigned long)__per_cpu_start)
#endif

/*
 * Per-chunk bookkeeping.  @map holds one int per area of the unit:
 * a positive value is the size of a free area and a negative value
 * the size of an allocated one (see the description at the top of
 * this file).
 */
struct pcpu_chunk {
	struct list_head	list;		/* linked to pcpu_slot lists */
	int			free_size;	/* free bytes in the chunk */
	int			contig_hint;	/* max contiguous size hint */
	struct vm_struct	*vm;		/* mapped vmalloc region */
	int			map_used;	/* # of map entries used */
	int			map_alloc;	/* # of map entries allocated */
	int			*map;		/* allocation map */
	bool			immutable;	/* no [de]population allowed */
	struct page		**page;		/* points to page array */
	struct page		*page_ar[];	/* #cpus * UNIT_PAGES */
};

/* all of the following are set once by pcpu_setup_first_chunk() */
static int pcpu_unit_pages __read_mostly;	/* pages per unit */
static int pcpu_unit_size __read_mostly;	/* bytes per unit */
static int pcpu_chunk_size __read_mostly;	/* #cpus * pcpu_unit_size */
static int pcpu_nr_slots __read_mostly;		/* # of pcpu_slot entries */
static size_t pcpu_chunk_struct_size __read_mostly; /* chunk + page_ar[] */

/* the address of the first chunk which starts with the kernel static area
*/
void *pcpu_base_addr __read_mostly;
EXPORT_SYMBOL_GPL(pcpu_base_addr);

/*
 * The first chunk which always exists.  Note that unlike other
 * chunks, this one can be allocated and mapped in several different
 * ways and thus often doesn't live in the vmalloc area.
 */
static struct pcpu_chunk *pcpu_first_chunk;

/*
 * Optional reserved chunk.  This chunk reserves part of the first
 * chunk and serves it for reserved allocations.  The amount of
 * reserved offset is in pcpu_reserved_chunk_limit (set to
 * static_size + reserved_size by pcpu_setup_first_chunk()).  When
 * reserved area doesn't exist, the following variables contain NULL
 * and 0 respectively.
 */
static struct pcpu_chunk *pcpu_reserved_chunk;
static int pcpu_reserved_chunk_limit;

/*
 * Synchronization rules.
 *
 * There are two locks - pcpu_alloc_mutex and pcpu_lock.  The former
 * protects allocation/reclaim paths, chunks and chunk->page arrays.
 * The latter is a spinlock and protects the index data structures -
 * chunk slots, chunks and area maps in chunks.
 *
 * During allocation, pcpu_alloc_mutex is kept locked all the time and
 * pcpu_lock is grabbed and released as necessary.  All actual memory
 * allocations are done using GFP_KERNEL with pcpu_lock released.
 *
 * Free path accesses and alters only the index data structures, so it
 * can be safely called from atomic context.  When memory needs to be
 * returned to the system, free path schedules reclaim_work which
 * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
 * reclaimed, releases both locks and frees the chunks.  Note that it's
 * necessary to grab both locks to remove a chunk from circulation as
 * allocation path might be referencing the chunk with only
 * pcpu_alloc_mutex locked.
*/ static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */ static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ /* reclaim work to release fully free chunks, scheduled from free path */ static void pcpu_reclaim(struct work_struct *work); static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); static int __pcpu_size_to_slot(int size) { int highbit = fls(size); /* size is in bytes */ return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1); } static int pcpu_size_to_slot(int size) { if (size == pcpu_unit_size) return pcpu_nr_slots - 1; return __pcpu_size_to_slot(size); } static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) { if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int)) return 0; return pcpu_size_to_slot(chunk->free_size); } static int pcpu_page_idx(unsigned int cpu, int page_idx) { return cpu * pcpu_unit_pages + page_idx; } static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk, unsigned int cpu, int page_idx) { return &chunk->page[pcpu_page_idx(cpu, page_idx)]; } static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, unsigned int cpu, int page_idx) { return (unsigned long)chunk->vm->addr + (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); } static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, int page_idx) { return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; } /* set the pointer to a chunk in a page struct */ static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu) { page->index = (unsigned long)pcpu; } /* obtain pointer to a chunk from a page struct */ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) { return (struct pcpu_chunk *)page->index; } /** * pcpu_mem_alloc - allocate memory * @size: bytes to allocate * * Allocate @size bytes. If @size is smaller than PAGE_SIZE, * kzalloc() is used; otherwise, vmalloc() is used. The returned * memory is always zeroed. 
* * CONTEXT: * Does GFP_KERNEL allocation. * * RETURNS: * Pointer to the allocated area on success, NULL on failure. */ static void *pcpu_mem_alloc(size_t size) { if (size <= PAGE_SIZE) return kzalloc(size, GFP_KERNEL); else { void *ptr = vmalloc(size); if (ptr) memset(ptr, 0, size); return ptr; } } /** * pcpu_mem_free - free memory * @ptr: memory to free * @size: size of the area * * Free @ptr. @ptr should have been allocated using pcpu_mem_alloc(). */ static void pcpu_mem_free(void *ptr, size_t size) { if (size <= PAGE_SIZE) kfree(ptr); else vfree(ptr); } /** * pcpu_chunk_relocate - put chunk in the appropriate chunk slot * @chunk: chunk of interest * @oslot: the previous slot it was on * * This function is called after an allocation or free changed @chunk. * New slot according to the changed state is determined and @chunk is * moved to the slot. Note that the reserved chunk is never put on * chunk slots. * * CONTEXT: * pcpu_lock. */ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) { int nslot = pcpu_chunk_slot(chunk); if (chunk != pcpu_reserved_chunk && oslot != nslot) { if (oslot < nslot) list_move(&chunk->list, &pcpu_slot[nslot]); else list_move_tail(&chunk->list, &pcpu_slot[nslot]); } } /** * pcpu_chunk_addr_search - determine chunk containing specified address * @addr: address for which the chunk needs to be determined. * * RETURNS: * The address of the found chunk. */ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) { void *first_start = pcpu_first_chunk->vm->addr; /* is it in the first chunk? */ if (addr >= first_start && addr < first_start + pcpu_unit_size) { /* is it in the reserved area? */ if (addr < first_start + pcpu_reserved_chunk_limit) return pcpu_reserved_chunk; return pcpu_first_chunk; } return pcpu_get_page_chunk(vmalloc_to_page(addr)); } /** * pcpu_extend_area_map - extend area map for allocation * @chunk: target chunk * * Extend area map of @chunk so that it can accomodate an allocation. 
* A single allocation can split an area into three areas, so this
 * function makes sure that @chunk->map has at least two extra slots.
 *
 * CONTEXT:
 * pcpu_alloc_mutex, pcpu_lock.  pcpu_lock is released and reacquired
 * if area map is extended or if the extension attempt fails.
 *
 * RETURNS:
 * 0 if noop (pcpu_lock was never dropped), 1 if successfully extended
 * (pcpu_lock was dropped, so the caller must not rely on anything it
 * looked up under the lock before the call), -errno on failure.
 */
static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
{
	int new_alloc;
	int *new, *old = NULL;
	size_t size;

	/* has enough? */
	if (chunk->map_alloc >= chunk->map_used + 2)
		return 0;

	/* pcpu_mem_alloc() does GFP_KERNEL allocation, drop the spinlock */
	spin_unlock_irq(&pcpu_lock);

	new_alloc = PCPU_DFL_MAP_ALLOC;
	while (new_alloc < chunk->map_used + 2)
		new_alloc *= 2;

	new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
	if (!new) {
		spin_lock_irq(&pcpu_lock);
		return -ENOMEM;
	}

	/*
	 * Acquire pcpu_lock and switch to new area map.  Only free
	 * could have happened inbetween, so map_used couldn't have
	 * grown.
	 */
	spin_lock_irq(&pcpu_lock);
	BUG_ON(new_alloc < chunk->map_used + 2);

	size = chunk->map_alloc * sizeof(chunk->map[0]);
	memcpy(new, chunk->map, size);

	/*
	 * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is
	 * one of the first chunks and still using static map, which
	 * must not be freed.
	 */
	if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
		old = chunk->map;

	chunk->map_alloc = new_alloc;
	chunk->map = new;

	/*
	 * pcpu_mem_free() may end up in vfree(), which must not be
	 * called with pcpu_lock held and irqs disabled.  The old map
	 * is already unreachable, so free it with the lock dropped.
	 * As above, only frees can happen meanwhile, so the two spare
	 * slots remain available afterwards.
	 */
	if (old) {
		spin_unlock_irq(&pcpu_lock);
		pcpu_mem_free(old, size);
		spin_lock_irq(&pcpu_lock);
	}

	/* tell the caller that pcpu_lock was dropped along the way */
	return 1;
}

/**
 * pcpu_split_block - split a map block
 * @chunk: chunk of interest
 * @i: index of map block to split
 * @head: head size in bytes (can be 0)
 * @tail: tail size in bytes (can be 0)
 *
 * Split the @i'th map block into two or three blocks.  If @head is
 * non-zero, @head bytes block is inserted before block @i moving it
 * to @i+1 and reducing its size by @head bytes.
 *
 * If @tail is non-zero, the target block, which can be @i or @i+1
 * depending on @head, is reduced by @tail bytes and @tail byte block
 * is inserted after the target block.
 *
 * @chunk->map must have enough free slots to accommodate the split.
 *
 * CONTEXT:
 * pcpu_lock.
*/
static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
			     int head, int tail)
{
	/* one extra map entry for each of head and tail that is non-zero */
	int nr_extra = !!head + !!tail;

	BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra);

	/* insert new subblocks: shift entries [i, map_used) up by nr_extra */
	memmove(&chunk->map[i + nr_extra], &chunk->map[i],
		sizeof(chunk->map[0]) * (chunk->map_used - i));
	chunk->map_used += nr_extra;

	if (head) {
		/* entry i becomes the head, the remainder moves to i + 1 */
		chunk->map[i + 1] = chunk->map[i] - head;
		chunk->map[i++] = head;
	}
	if (tail) {
		/* shrink the target block and put the tail right after it */
		chunk->map[i++] -= tail;
		chunk->map[i] = tail;
	}
}

/**
 * pcpu_alloc_area - allocate area from a pcpu_chunk
 * @chunk: chunk of interest
 * @size: wanted size in bytes
 * @align: wanted align
 *
 * Try to allocate @size bytes area aligned at @align from @chunk.
 * Note that this function only allocates the offset.  It doesn't
 * populate or map the area.
 *
 * The map is scanned from the start and the first free block that can
 * hold the alignment padding plus @size is used.  @chunk->contig_hint
 * is refreshed from the blocks seen during the scan and @chunk is
 * relocated to the slot matching its new state.
 *
 * @chunk->map must have at least two free slots.
 *
 * CONTEXT:
 * pcpu_lock.
 *
 * RETURNS:
 * Allocated offset in @chunk on success, -1 if no matching area is
 * found.
 */
static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
{
	int oslot = pcpu_chunk_slot(chunk);
	int max_contig = 0;
	int i, off;

	/* off tracks the byte offset of block i; entries are +/- sizes */
	for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {
		bool is_last = i + 1 == chunk->map_used;
		int head, tail;

		/* extra for alignment requirement */
		head = ALIGN(off, align) - off;
		/* offset 0 is aligned to anything, so block 0 has no head */
		BUG_ON(i == 0 && head != 0);

		/* negative entry: block is allocated, skip it */
		if (chunk->map[i] < 0)
			continue;
		/* free but too small; remember it for the contig hint */
		if (chunk->map[i] < head + size) {
			max_contig = max(chunk->map[i], max_contig);
			continue;
		}

		/*
		 * If head is small or the previous block is free,
		 * merge'em.  Note that 'small' is defined as smaller
		 * than sizeof(int), which is very small but isn't too
		 * uncommon for percpu allocations.
		 */
		if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) {
			if (chunk->map[i - 1] > 0)
				chunk->map[i - 1] += head;
			else {
				/*
				 * previous block is allocated: it grows
				 * by head bytes (more negative) and
				 * those bytes stop counting as free
				 */
				chunk->map[i - 1] -= head;
				chunk->free_size -= head;
			}
			chunk->map[i] -= head;
			off += head;
			head = 0;
		}

		/* if tail is small, just keep it around */
		tail = chunk->map[i] - head - size;
		if (tail < sizeof(int))
			tail = 0;

		/* split if warranted */
		if (head || tail) {
			pcpu_split_block(chunk, i, head, tail);
			if (head) {
				/* the target block now sits after the head */
				i++;
				off += head;
				max_contig = max(chunk->map[i - 1], max_contig);
			}
			if (tail)
				max_contig = max(chunk->map[i + 1], max_contig);
		}

		/* update hint and mark allocated */
		if (is_last)
			chunk->contig_hint = max_contig; /* fully scanned */
		else
			chunk->contig_hint = max(chunk->contig_hint,
						 max_contig);

		chunk->free_size -= chunk->map[i];
		chunk->map[i] = -chunk->map[i];

		pcpu_chunk_relocate(chunk, oslot);
		return off;
	}

	chunk->contig_hint = max_contig;	/* fully scanned */
	pcpu_chunk_relocate(chunk, oslot);

	/* tell the upper layer that this chunk has no matching area */
	return -1;
}

/**
 * pcpu_free_area - free area to a pcpu_chunk
 * @chunk: chunk of interest
 * @freeme: offset of area to free
 *
 * Free area starting from @freeme to @chunk.  Note that this function
 * only modifies the allocation map.  It doesn't depopulate or unmap
 * the area.
 *
 * @freeme must be the exact start offset of an allocated block; the
 * function BUG()s otherwise.  The freed block is merged with free
 * neighbours and @chunk is relocated to the matching slot.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
{
	int oslot = pcpu_chunk_slot(chunk);
	int i, off;

	/* find the block which starts at @freeme */
	for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++]))
		if (off == freeme)
			break;
	BUG_ON(off != freeme);
	BUG_ON(chunk->map[i] > 0);

	/* flip the sign to mark the block free and account for it */
	chunk->map[i] = -chunk->map[i];
	chunk->free_size += chunk->map[i];

	/* merge with previous? */
	if (i > 0 && chunk->map[i - 1] >= 0) {
		chunk->map[i - 1] += chunk->map[i];
		chunk->map_used--;
		memmove(&chunk->map[i], &chunk->map[i + 1],
			(chunk->map_used - i) * sizeof(chunk->map[0]));
		i--;
	}
	/* merge with next? */
	if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) {
		chunk->map[i] += chunk->map[i + 1];
		chunk->map_used--;
		memmove(&chunk->map[i + 1], &chunk->map[i + 2],
			(chunk->map_used - (i + 1)) * sizeof(chunk->map[0]));
	}

	/* the merged block may be the new largest contiguous area */
	chunk->contig_hint = max(chunk->map[i], chunk->contig_hint);
	pcpu_chunk_relocate(chunk, oslot);
}

/**
 * pcpu_unmap - unmap pages out of a pcpu_chunk
 * @chunk: chunk of interest
 * @page_start: page index of the first page to unmap
 * @page_end: page index of the last page to unmap + 1
 * @flush_tlb: whether to flush tlb or not
 *
 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
 * vcache is always flushed before unmapping; tlb is flushed after
 * only if @flush_tlb is true.
 */
static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
		       bool flush_tlb)
{
	unsigned int last = num_possible_cpus() - 1;
	unsigned int cpu;

	/* unmap must not be done on immutable chunk */
	WARN_ON(chunk->immutable);

	/*
	 * Each flushing trial can be very expensive, issue flush on
	 * the whole region at once rather than doing it for each cpu.
	 * This could be an overkill but is more scalable.
	 */
	flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
			   pcpu_chunk_addr(chunk, last, page_end));

	for_each_possible_cpu(cpu)
		unmap_kernel_range_noflush(
				pcpu_chunk_addr(chunk, cpu, page_start),
				(page_end - page_start) << PAGE_SHIFT);

	/* ditto as flush_cache_vunmap() */
	if (flush_tlb)
		flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
				       pcpu_chunk_addr(chunk, last, page_end));
}

/**
 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
 * @chunk: chunk to depopulate
 * @off: offset to the area to depopulate
 * @size: size of the area to depopulate in bytes
 * @flush: whether to flush tlb after unmapping or not
 *
 * For each cpu, free and unmap the populated pages backing
 * [@off, @off + @size) of @chunk.  @flush is handed to pcpu_unmap()
 * as its tlb flush flag.
 *
 * CONTEXT:
 * pcpu_alloc_mutex.
*/ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, bool flush) { int page_start = PFN_DOWN(off); int page_end = PFN_UP(off + size); int unmap_start = -1; int uninitialized_var(unmap_end); unsigned int cpu; int i; for (i = page_start; i < page_end; i++) { for_each_possible_cpu(cpu) { struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); if (!*pagep) continue; __free_page(*pagep); /* * If it's partial depopulation, it might get * populated or depopulated again. Mark the * page gone. */ *pagep = NULL; unmap_start = unmap_start < 0 ? i : unmap_start; unmap_end = i + 1; } } if (unmap_start >= 0) pcpu_unmap(chunk, unmap_start, unmap_end, flush); } static int __pcpu_map_pages(unsigned long addr, struct page **pages, int nr_pages) { return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT, PAGE_KERNEL, pages); } /** * pcpu_map - map pages into a pcpu_chunk * @chunk: chunk of interest * @page_start: page index of the first page to map * @page_end: page index of the last page to map + 1 * * For each cpu, map pages [@page_start,@page_end) into @chunk. * vcache is flushed afterwards. */ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) { unsigned int last = num_possible_cpus() - 1; unsigned int cpu; int err; /* map must not be done on immutable chunk */ WARN_ON(chunk->immutable); for_each_possible_cpu(cpu) { err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), pcpu_chunk_pagep(chunk, cpu, page_start), page_end - page_start); if (err < 0) return err; } /* flush at once, please read comments in pcpu_unmap() */ flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start), pcpu_chunk_addr(chunk, last, page_end)); return 0; } /** * pcpu_populate_chunk - populate and map an area of a pcpu_chunk * @chunk: chunk of interest * @off: offset to the area to populate * @size: size of the area to populate in bytes * * For each cpu, populate and map pages [@page_start,@page_end) into * @chunk. The area is cleared on return. 
* * CONTEXT: * pcpu_alloc_mutex, does GFP_KERNEL allocation. */ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) { const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; int page_start = PFN_DOWN(off); int page_end = PFN_UP(off + size); int map_start = -1; int uninitialized_var(map_end); unsigned int cpu; int i; for (i = page_start; i < page_end; i++) { if (pcpu_chunk_page_occupied(chunk, i)) { if (map_start >= 0) { if (pcpu_map(chunk, map_start, map_end)) goto err; map_start = -1; } continue; } map_start = map_start < 0 ? i : map_start; map_end = i + 1; for_each_possible_cpu(cpu) { struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); *pagep = alloc_pages_node(cpu_to_node(cpu), alloc_mask, 0); if (!*pagep) goto err; pcpu_set_page_chunk(*pagep, chunk); } } if (map_start >= 0 && pcpu_map(chunk, map_start, map_end)) goto err; for_each_possible_cpu(cpu) memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, size); return 0; err: /* likely under heavy memory pressure, give memory back */ pcpu_depopulate_chunk(chunk, off, size, true); return -ENOMEM; } static void free_pcpu_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; if (chunk->vm) free_vm_area(chunk->vm); pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); kfree(chunk); } static struct pcpu_chunk *alloc_pcpu_chunk(void) { struct pcpu_chunk *chunk; chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL); if (!chunk) return NULL; chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); chunk->map_alloc = PCPU_DFL_MAP_ALLOC; chunk->map[chunk->map_used++] = pcpu_unit_size; chunk->page = chunk->page_ar; chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); if (!chunk->vm) { free_pcpu_chunk(chunk); return NULL; } INIT_LIST_HEAD(&chunk->list); chunk->free_size = pcpu_unit_size; chunk->contig_hint = pcpu_unit_size; return chunk; } /** * pcpu_alloc - the percpu allocator * @size: size of area to allocate in bytes * @align: alignment of area (max 
PAGE_SIZE)
 * @reserved: allocate from the reserved chunk if available
 *
 * Allocate percpu area of @size bytes aligned at @align.  @size must
 * be non-zero and no larger than PCPU_MIN_UNIT_SIZE; illegal
 * arguments trigger a warning and fail the allocation.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
static void *pcpu_alloc(size_t size, size_t align, bool reserved)
{
	struct pcpu_chunk *chunk;
	int slot, off;

	if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
		WARN(true, "illegal size (%zu) or align (%zu) for "
		     "percpu allocation\n", size, align);
		return NULL;
	}

	/* the mutex is held throughout, pcpu_lock only around map access */
	mutex_lock(&pcpu_alloc_mutex);
	spin_lock_irq(&pcpu_lock);

	/* serve reserved allocations from the reserved chunk if available */
	if (reserved && pcpu_reserved_chunk) {
		chunk = pcpu_reserved_chunk;
		if (size > chunk->contig_hint ||
		    pcpu_extend_area_map(chunk) < 0)
			goto fail_unlock;
		off = pcpu_alloc_area(chunk, size, align);
		if (off >= 0)
			goto area_found;
		/* reserved allocations never fall back to normal chunks */
		goto fail_unlock;
	}

restart:
	/* search through normal chunks */
	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
			if (size > chunk->contig_hint)
				continue;

			/*
			 * NOTE(review): the restart below depends on
			 * pcpu_extend_area_map() returning 1 whenever
			 * it dropped pcpu_lock; verify that it honours
			 * its documented return values.
			 */
			switch (pcpu_extend_area_map(chunk)) {
			case 0:
				break;
			case 1:
				goto restart;	/* pcpu_lock dropped, restart */
			default:
				goto fail_unlock;
			}

			off = pcpu_alloc_area(chunk, size, align);
			if (off >= 0)
				goto area_found;
		}
	}

	/* hmmm... no space left, create a new chunk */
	spin_unlock_irq(&pcpu_lock);

	chunk = alloc_pcpu_chunk();
	if (!chunk)
		goto fail_unlock_mutex;

	/* put the new chunk on its slot and search again */
	spin_lock_irq(&pcpu_lock);
	pcpu_chunk_relocate(chunk, -1);
	goto restart;

area_found:
	spin_unlock_irq(&pcpu_lock);

	/* populate, map and clear the area */
	if (pcpu_populate_chunk(chunk, off, size)) {
		/* give the offset back, it can't be backed by memory */
		spin_lock_irq(&pcpu_lock);
		pcpu_free_area(chunk, off);
		goto fail_unlock;
	}

	mutex_unlock(&pcpu_alloc_mutex);

	return __addr_to_pcpu_ptr(chunk->vm->addr + off);

fail_unlock:
	spin_unlock_irq(&pcpu_lock);
fail_unlock_mutex:
	mutex_unlock(&pcpu_alloc_mutex);
	return NULL;
}

/**
 * __alloc_percpu - allocate dynamic percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Allocate percpu area of @size bytes aligned at @align.  Might
 * sleep.  Might trigger writeouts.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
void *__alloc_percpu(size_t size, size_t align)
{
	return pcpu_alloc(size, align, false);
}
EXPORT_SYMBOL_GPL(__alloc_percpu);

/**
 * __alloc_reserved_percpu - allocate reserved percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Allocate percpu area of @size bytes aligned at @align from reserved
 * percpu area if arch has set it up; otherwise, allocation is served
 * from the same dynamic area.  Might sleep.  Might trigger writeouts.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
void *__alloc_reserved_percpu(size_t size, size_t align)
{
	return pcpu_alloc(size, align, true);
}

/**
 * pcpu_reclaim - reclaim fully free chunks, workqueue function
 * @work: unused
 *
 * Reclaim all fully free chunks except for the first one.
 *
 * CONTEXT:
 * workqueue context.
*/ static void pcpu_reclaim(struct work_struct *work) { LIST_HEAD(todo); struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1]; struct pcpu_chunk *chunk, *next; mutex_lock(&pcpu_alloc_mutex); spin_lock_irq(&pcpu_lock); list_for_each_entry_safe(chunk, next, head, list) { WARN_ON(chunk->immutable); /* spare the first one */ if (chunk == list_first_entry(head, struct pcpu_chunk, list)) continue; list_move(&chunk->list, &todo); } spin_unlock_irq(&pcpu_lock); mutex_unlock(&pcpu_alloc_mutex); list_for_each_entry_safe(chunk, next, &todo, list) { pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); free_pcpu_chunk(chunk); } } /** * free_percpu - free percpu area * @ptr: pointer to area to free * * Free percpu area @ptr. * * CONTEXT: * Can be called from atomic context. */ void free_percpu(void *ptr) { void *addr = __pcpu_ptr_to_addr(ptr); struct pcpu_chunk *chunk; unsigned long flags; int off; if (!ptr) return; spin_lock_irqsave(&pcpu_lock, flags); chunk = pcpu_chunk_addr_search(addr); off = addr - chunk->vm->addr; pcpu_free_area(chunk, off); /* if there are more than one fully free chunks, wake up grim reaper */ if (chunk->free_size == pcpu_unit_size) { struct pcpu_chunk *pos; list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) if (pos != chunk) { schedule_work(&pcpu_reclaim_work); break; } } spin_unlock_irqrestore(&pcpu_lock, flags); } EXPORT_SYMBOL_GPL(free_percpu); /** * pcpu_setup_first_chunk - initialize the first percpu chunk * @get_page_fn: callback to fetch page pointer * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes, 0 for none * @dyn_size: free size for dynamic allocation in bytes, -1 for auto * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE * @base_addr: mapped address * * Initialize the first percpu chunk which contains the kernel static * perpcu area. This function is to be called from arch percpu area * setup path. 
*
 * @get_page_fn() should return pointer to percpu page given cpu
 * number and page number.  It should at least return enough pages to
 * cover the static area.  The returned pages for static area should
 * have been initialized with valid data.  It can also return pages
 * after the static area.  NULL return indicates end of pages for the
 * cpu.  Note that @get_page_fn() must return the same number of pages
 * for all cpus.
 *
 * @reserved_size, if non-zero, specifies the amount of bytes to
 * reserve after the static area in the first chunk.  This reserves
 * the first chunk such that it's available only through reserved
 * percpu allocation.  This is primarily used to serve module percpu
 * static areas on architectures where the addressing model has
 * limited offset range for symbol relocations to guarantee module
 * percpu symbols fall inside the relocatable range.
 *
 * @dyn_size, if non-negative, determines the number of bytes
 * available for dynamic allocation in the first chunk.  Specifying
 * non-negative value makes percpu leave alone the area beyond
 * @static_size + @reserved_size + @dyn_size.
 *
 * @unit_size specifies unit size and must be aligned to PAGE_SIZE and
 * equal to or larger than @static_size + @reserved_size + @dyn_size
 * (@dyn_size is counted only if it is non-negative).
 *
 * The caller should have mapped the first chunk at @base_addr and
 * copied static data to each unit.
 *
 * If the first chunk ends up with both reserved and dynamic areas, it
 * is served by two chunks - one to serve the core static and reserved
 * areas and the other for the dynamic area.  They share the same vm
 * and page map but use different area allocation maps to stay away
 * from each other.  The latter chunk is circulated in the chunk slots
 * and available for dynamic allocation like any other chunks.
 *
 * RETURNS:
 * The determined pcpu_unit_size which can be used to initialize
 * percpu access.
*/
size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
				     size_t static_size, size_t reserved_size,
				     ssize_t dyn_size, size_t unit_size,
				     void *base_addr)
{
	/* static so that they stay around after this __init function */
	static struct vm_struct first_vm;
	static int smap[2], dmap[2];
	size_t size_sum = static_size + reserved_size +
			  (dyn_size >= 0 ? dyn_size : 0);
	struct pcpu_chunk *schunk, *dchunk = NULL;
	unsigned int cpu;
	int i, nr_pages;

	/*
	 * sanity checks
	 *
	 * The static maps must be shorter than PCPU_DFL_MAP_ALLOC as
	 * pcpu_extend_area_map() uses map_alloc < PCPU_DFL_MAP_ALLOC to
	 * recognize maps which must not be freed.
	 */
	BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
		     ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
	BUG_ON(!static_size);
	BUG_ON(!base_addr);
	BUG_ON(unit_size < size_sum);
	BUG_ON(unit_size & ~PAGE_MASK);
	BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);

	/* derive the global geometry from @unit_size */
	pcpu_unit_pages = unit_size >> PAGE_SHIFT;
	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
	pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
		+ num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);

	/* auto dyn_size: whatever is left in the unit */
	if (dyn_size < 0)
		dyn_size = pcpu_unit_size - static_size - reserved_size;

	first_vm.flags = VM_ALLOC;
	first_vm.size = pcpu_chunk_size;
	first_vm.addr = base_addr;

	/*
	 * Allocate chunk slots.  The additional last slot is for
	 * empty chunks.
	 */
	pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
	pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
	for (i = 0; i < pcpu_nr_slots; i++)
		INIT_LIST_HEAD(&pcpu_slot[i]);

	/*
	 * Initialize static chunk.  If reserved_size is zero, the
	 * static chunk covers static area + dynamic allocation area
	 * in the first chunk.  If reserved_size is not zero, it
	 * covers static area + reserved area (mostly used for module
	 * static percpu allocation).
	 */
	schunk = alloc_bootmem(pcpu_chunk_struct_size);
	INIT_LIST_HEAD(&schunk->list);
	schunk->vm = &first_vm;
	schunk->map = smap;
	schunk->map_alloc = ARRAY_SIZE(smap);
	schunk->page = schunk->page_ar;
	schunk->immutable = true;

	if (reserved_size) {
		schunk->free_size = reserved_size;
		pcpu_reserved_chunk = schunk;
		pcpu_reserved_chunk_limit = static_size + reserved_size;
	} else {
		schunk->free_size = dyn_size;
		dyn_size = 0;			/* dynamic area covered */
	}
	schunk->contig_hint = schunk->free_size;

	/* the static area is recorded as allocated (negative entry) */
	schunk->map[schunk->map_used++] = -static_size;
	if (schunk->free_size)
		schunk->map[schunk->map_used++] = schunk->free_size;

	/* init dynamic chunk if necessary */
	if (dyn_size) {
		/* no page_ar[] of its own, hence the bare struct size */
		dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
		INIT_LIST_HEAD(&dchunk->list);
		dchunk->vm = &first_vm;
		dchunk->map = dmap;
		dchunk->map_alloc = ARRAY_SIZE(dmap);
		dchunk->page = schunk->page_ar;	/* share page map with schunk */
		dchunk->immutable = true;

		/* static + reserved areas are off limits for dchunk */
		dchunk->contig_hint = dchunk->free_size = dyn_size;
		dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
		dchunk->map[dchunk->map_used++] = dchunk->free_size;
	}

	/* assign pages */
	nr_pages = -1;
	for_each_possible_cpu(cpu) {
		for (i = 0; i < pcpu_unit_pages; i++) {
			struct page *page = get_page_fn(cpu, i);

			if (!page)
				break;
			*pcpu_chunk_pagep(schunk, cpu, i) = page;
		}

		/* the static area must be fully backed by pages */
		BUG_ON(i < PFN_UP(static_size));

		/* and every cpu must have got the same number of pages */
		if (nr_pages < 0)
			nr_pages = i;
		else
			BUG_ON(nr_pages != i);
	}

	/* link the first chunk in */
	pcpu_first_chunk = dchunk ?: schunk;
	pcpu_chunk_relocate(pcpu_first_chunk, -1);

	/* we're done */
	pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
	return pcpu_unit_size;
}

/*
 * Compute the page aligned size of the first chunk's unit from
 * @static_size, @reserved_size and *@dyn_sizep (counted only if
 * non-negative).  Unless *@dyn_sizep is 0, it is updated to whatever
 * is left of the aligned size after the static and reserved areas.
 * Returns the aligned size.
 */
static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size,
				 ssize_t *dyn_sizep)
{
	size_t size_sum;

	size_sum = PFN_ALIGN(static_size + reserved_size +
			     (*dyn_sizep >= 0 ? *dyn_sizep : 0));
	if (*dyn_sizep != 0)
		*dyn_sizep = size_sum - static_size - reserved_size;

	return size_sum;
}

/*
 * Embedding first chunk setup helper.
*/ static void *pcpue_ptr __initdata; static size_t pcpue_size __initdata; static size_t pcpue_unit_size __initdata; static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) { size_t off = (size_t)pageno << PAGE_SHIFT; if (off >= pcpue_size) return NULL; return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); } /** * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: free size for dynamic allocation in bytes, -1 for auto * * This is a helper to ease setting up embedded first percpu chunk and * can be called where pcpu_setup_first_chunk() is expected. * * If this function is used to setup the first chunk, it is allocated * as a contiguous area using bootmem allocator and used as-is without * being mapped into vmalloc area. This enables the first chunk to * piggy back on the linear physical mapping which often uses larger * page size. * * When @dyn_size is positive, dynamic area might be larger than * specified to fill page alignment. When @dyn_size is auto, * @dyn_size is just big enough to fill page alignment after static * and reserved areas. * * If the needed size is smaller than the minimum or specified unit * size, the leftover is returned to the bootmem allocator. * * RETURNS: * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. 
*/ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, ssize_t dyn_size) { size_t chunk_size; unsigned int cpu; /* determine parameters and allocate */ pcpue_size = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); chunk_size = pcpue_unit_size * num_possible_cpus(); pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); if (!pcpue_ptr) { pr_warning("PERCPU: failed to allocate %zu bytes for " "embedding\n", chunk_size); return -ENOMEM; } /* return the leftover and copy */ for_each_possible_cpu(cpu) { void *ptr = pcpue_ptr + cpu * pcpue_unit_size; free_bootmem(__pa(ptr + pcpue_size), pcpue_unit_size - pcpue_size); memcpy(ptr, __per_cpu_load, static_size); } /* we're ready, commit */ pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); return pcpu_setup_first_chunk(pcpue_get_page, static_size, reserved_size, dyn_size, pcpue_unit_size, pcpue_ptr); } /* * 4k page first chunk setup helper. */ static struct page **pcpu4k_pages __initdata; static int pcpu4k_unit_pages __initdata; static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) { if (pageno < pcpu4k_unit_pages) return pcpu4k_pages[cpu * pcpu4k_unit_pages + pageno]; return NULL; } /** * pcpu_4k_first_chunk - map the first chunk using PAGE_SIZE pages * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE * @free_fn: funtion to free percpu page, always called with PAGE_SIZE * @populate_pte_fn: function to populate pte * * This is a helper to ease setting up embedded first percpu chunk and * can be called where pcpu_setup_first_chunk() is expected. * * This is the basic allocator. Static percpu area is allocated * page-by-page into vmalloc area. 
* * RETURNS: * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. */ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct vm; size_t pages_size; unsigned int cpu; int i, j; ssize_t ret; pcpu4k_unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size, PCPU_MIN_UNIT_SIZE)); /* unaligned allocations can't be freed, round up to page size */ pages_size = PFN_ALIGN(pcpu4k_unit_pages * num_possible_cpus() * sizeof(pcpu4k_pages[0])); pcpu4k_pages = alloc_bootmem(pages_size); /* allocate pages */ j = 0; for_each_possible_cpu(cpu) for (i = 0; i < pcpu4k_unit_pages; i++) { void *ptr; ptr = alloc_fn(cpu, PAGE_SIZE); if (!ptr) { pr_warning("PERCPU: failed to allocate " "4k page for cpu%u\n", cpu); goto enomem; } pcpu4k_pages[j++] = virt_to_page(ptr); } /* allocate vm area, map the pages and copy static data */ vm.flags = VM_ALLOC; vm.size = num_possible_cpus() * pcpu4k_unit_pages << PAGE_SHIFT; vm_area_register_early(&vm, PAGE_SIZE); for_each_possible_cpu(cpu) { unsigned long unit_addr = (unsigned long)vm.addr + (cpu * pcpu4k_unit_pages << PAGE_SHIFT); for (i = 0; i < pcpu4k_unit_pages; i++) populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); /* pte already populated, the following shouldn't fail */ ret = __pcpu_map_pages(unit_addr, &pcpu4k_pages[cpu * pcpu4k_unit_pages], pcpu4k_unit_pages); if (ret < 0) panic("failed to map percpu area, err=%zd\n", ret); /* * FIXME: Archs with virtual cache should flush local * cache for the linear mapping here - something * equivalent to flush_cache_vmap() on the local cpu. * flush_cache_vmap() can't be used as most supporting * data structures are not set up yet. 
*/ /* copy static data */ memcpy((void *)unit_addr, __per_cpu_load, static_size); } /* we're ready, commit */ pr_info("PERCPU: %d 4k pages per cpu, static data %zu bytes\n", pcpu4k_unit_pages, static_size); ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, reserved_size, -1, pcpu4k_unit_pages << PAGE_SHIFT, vm.addr); goto out_free_ar; enomem: while (--j >= 0) free_fn(page_address(pcpu4k_pages[j]), PAGE_SIZE); ret = -ENOMEM; out_free_ar: free_bootmem(__pa(pcpu4k_pages), pages_size); return ret; } /* * Large page remapping first chunk setup helper */ #ifdef CONFIG_NEED_MULTIPLE_NODES struct pcpul_ent { unsigned int cpu; void *ptr; }; static size_t pcpul_size; static size_t pcpul_unit_size; static struct pcpul_ent *pcpul_map; static struct vm_struct pcpul_vm; static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) { size_t off = (size_t)pageno << PAGE_SHIFT; if (off >= pcpul_size) return NULL; return virt_to_page(pcpul_map[cpu].ptr + off); } /** * pcpu_lpage_first_chunk - remap the first percpu chunk using large page * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: free size for dynamic allocation in bytes, -1 for auto * @lpage_size: the size of a large page * @alloc_fn: function to allocate percpu lpage, always called with lpage_size * @free_fn: function to free percpu memory, @size <= lpage_size * @map_fn: function to map percpu lpage, always called with lpage_size * * This allocator uses large page as unit. A large page is allocated * for each cpu and each is remapped into vmalloc area using large * page mapping. As large page can be quite large, only part of it is * used for the first chunk. Unused part is returned to the bootmem * allocator. * * So, the large pages are mapped twice - once to the physical mapping * and to the vmalloc area for the first percpu chunk. 
The double * mapping does add one more large TLB entry pressure but still is * much better than only using 4k mappings while still being NUMA * friendly. * * RETURNS: * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. */ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, ssize_t dyn_size, size_t lpage_size, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn) { size_t size_sum; size_t map_size; unsigned int cpu; int i, j; ssize_t ret; /* * Currently supports only single page. Supporting multiple * pages won't be too difficult if it ever becomes necessary. */ size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); pcpul_unit_size = lpage_size; pcpul_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); if (pcpul_size > pcpul_unit_size) { pr_warning("PERCPU: static data is larger than large page, " "can't use large page\n"); return -EINVAL; } /* allocate pointer array and alloc large pages */ map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0])); pcpul_map = alloc_bootmem(map_size); for_each_possible_cpu(cpu) { void *ptr; ptr = alloc_fn(cpu, lpage_size); if (!ptr) { pr_warning("PERCPU: failed to allocate large page " "for cpu%u\n", cpu); goto enomem; } /* * Only use pcpul_size bytes and give back the rest. * * Ingo: The lpage_size up-rounding bootmem is needed * to make sure the partial lpage is still fully RAM - * it's not well-specified to have a incompatible area * (unmapped RAM, device memory, etc.) in that hole. 
*/ free_fn(ptr + pcpul_size, lpage_size - pcpul_size); pcpul_map[cpu].cpu = cpu; pcpul_map[cpu].ptr = ptr; memcpy(ptr, __per_cpu_load, static_size); } /* allocate address and map */ pcpul_vm.flags = VM_ALLOC; pcpul_vm.size = num_possible_cpus() * pcpul_unit_size; vm_area_register_early(&pcpul_vm, pcpul_unit_size); for_each_possible_cpu(cpu) map_fn(pcpul_map[cpu].ptr, pcpul_unit_size, pcpul_vm.addr + cpu * pcpul_unit_size); /* we're ready, commit */ pr_info("PERCPU: Remapped at %p with large pages, static data " "%zu bytes\n", pcpul_vm.addr, static_size); ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, reserved_size, dyn_size, pcpul_unit_size, pcpul_vm.addr); /* sort pcpul_map array for pcpu_lpage_remapped() */ for (i = 0; i < num_possible_cpus() - 1; i++) for (j = i + 1; j < num_possible_cpus(); j++) if (pcpul_map[i].ptr > pcpul_map[j].ptr) { struct pcpul_ent tmp = pcpul_map[i]; pcpul_map[i] = pcpul_map[j]; pcpul_map[j] = tmp; } return ret; enomem: for_each_possible_cpu(cpu) if (pcpul_map[cpu].ptr) free_fn(pcpul_map[cpu].ptr, pcpul_size); free_bootmem(__pa(pcpul_map), map_size); return -ENOMEM; } /** * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area * @kaddr: the kernel address in question * * Determine whether @kaddr falls in the pcpul recycled area. This is * used by pageattr to detect VM aliases and break up the pcpu large * page mapping such that the same physical page is not mapped under * different attributes. * * The recycled area is always at the tail of a partially used large * page. * * RETURNS: * Address of corresponding remapped pcpu address if match is found; * otherwise, NULL. */ void *pcpu_lpage_remapped(void *kaddr) { unsigned long unit_mask = pcpul_unit_size - 1; void *lpage_addr = (void *)((unsigned long)kaddr & ~unit_mask); unsigned long offset = (unsigned long)kaddr & unit_mask; int left = 0, right = num_possible_cpus() - 1; int pos; /* pcpul in use at all? 
*/ if (!pcpul_map) return NULL; /* okay, perform binary search */ while (left <= right) { pos = (left + right) / 2; if (pcpul_map[pos].ptr < lpage_addr) left = pos + 1; else if (pcpul_map[pos].ptr > lpage_addr) right = pos - 1; else { /* it shouldn't be in the area for the first chunk */ WARN_ON(offset < pcpul_size); return pcpul_vm.addr + pcpul_map[pos].cpu * pcpul_unit_size + offset; } } return NULL; } #endif /* * Generic percpu area setup. * * The embedding helper is used because its behavior closely resembles * the original non-dynamic generic percpu area setup. This is * important because many archs have addressing restrictions and might * fail if the percpu area is located far away from the previous * location. As an added bonus, in non-NUMA cases, embedding is * generally a good idea TLB-wise because percpu area can piggy back * on the physical linear memory mapping which uses large page * mappings on applicable archs. */ #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); void __init setup_per_cpu_areas(void) { size_t static_size = __per_cpu_end - __per_cpu_start; ssize_t unit_size; unsigned long delta; unsigned int cpu; /* * Always reserve area for module percpu variables. That's * what the legacy allocator did. */ unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE); if (unit_size < 0) panic("Failed to initialized percpu areas."); delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) __per_cpu_offset[cpu] = delta + cpu * unit_size; } #endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */