From 3cb989501c2688cacbb7dc4b0d353faf838f53a1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 24 Sep 2013 10:35:17 +0100 Subject: Add a generic associative array implementation. Add a generic associative array implementation that can be used as the container for keyrings, thereby massively increasing the capacity available whilst also speeding up searching in keyrings that contain a lot of keys. This may also be useful in FS-Cache for tracking cookies. Documentation is added into Documentation/associative_array.txt Some of the properties of the implementation are: (1) Objects are opaque pointers. The implementation does not care where they point (if anywhere) or what they point to (if anything). [!] NOTE: Pointers to objects _must_ be zero in the two least significant bits. (2) Objects do not need to contain linkage blocks for use by the array. This permits an object to be located in multiple arrays simultaneously. Rather, the array is made up of metadata blocks that point to objects. (3) Objects are labelled as being one of two types (the type is a bool value). This information is stored in the array, but has no consequence to the array itself or its algorithms. (4) Objects require index keys to locate them within the array. (5) Index keys must be unique. Inserting an object with the same key as one already in the array will replace the old object. (6) Index keys can be of any length and can be of different lengths. (7) Index keys should encode the length early on, before any variation due to length is seen. (8) Index keys can include a hash to scatter objects throughout the array. (9) The array can iterated over. The objects will not necessarily come out in key order. (10) The array can be iterated whilst it is being modified, provided the RCU readlock is being held by the iterator. Note, however, under these circumstances, some objects may be seen more than once. If this is a problem, the iterator should lock against modification. Objects will not be missed, however, unless deleted. (11) Objects in the array can be looked up by means of their index key. (12) Objects can be looked up whilst the array is being modified, provided the RCU readlock is being held by the thread doing the look up. The implementation uses a tree of 16-pointer nodes internally that are indexed on each level by nibbles from the index key. To improve memory efficiency, shortcuts can be emplaced to skip over what would otherwise be a series of single-occupancy nodes. Further, nodes pack leaf object pointers into spare space in the node rather than making an extra branch until as such time an object needs to be added to a full node. Signed-off-by: David Howells --- lib/Kconfig | 14 + lib/Makefile | 1 + lib/assoc_array.c | 1745 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1760 insertions(+) create mode 100644 lib/assoc_array.c (limited to 'lib') diff --git a/lib/Kconfig b/lib/Kconfig index b3c8be0da17f..3cb879b1f282 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -322,6 +322,20 @@ config TEXTSEARCH_FSM config BTREE boolean +config ASSOCIATIVE_ARRAY + bool + help + Generic associative array. Can be searched and iterated over whilst + it is being modified. It is also reasonably quick to search and + modify. The algorithms are non-recursive, and the trees are highly + capacious. + + See: + + Documentation/assoc_array.txt + + for more information. + config HAS_IOMEM boolean depends on !NO_IOMEM diff --git a/lib/Makefile b/lib/Makefile index f3bb2cb98adf..1e806477e472 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -51,6 +51,7 @@ CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS)) obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o obj-$(CONFIG_BTREE) += btree.o +obj-$(CONFIG_ASSOCIATIVE_ARRAY) += assoc_array.o obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o obj-$(CONFIG_DEBUG_LIST) += list_debug.o obj-$(CONFIG_DEBUG_OBJECTS) += debugobjects.o diff --git a/lib/assoc_array.c b/lib/assoc_array.c new file mode 100644 index 000000000000..a0952818f938 --- /dev/null +++ b/lib/assoc_array.c @@ -0,0 +1,1745 @@ +/* Generic associative array implementation. + * + * See Documentation/assoc_array.txt for information. + * + * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ +//#define DEBUG +#include +#include + +/* + * Iterate over an associative array. The caller must hold the RCU read lock + * or better. + */ +static int assoc_array_subtree_iterate(const struct assoc_array_ptr *root, + const struct assoc_array_ptr *stop, + int (*iterator)(const void *leaf, + void *iterator_data), + void *iterator_data) +{ + const struct assoc_array_shortcut *shortcut; + const struct assoc_array_node *node; + const struct assoc_array_ptr *cursor, *ptr, *parent; + unsigned long has_meta; + int slot, ret; + + cursor = root; + +begin_node: + if (assoc_array_ptr_is_shortcut(cursor)) { + /* Descend through a shortcut */ + shortcut = assoc_array_ptr_to_shortcut(cursor); + smp_read_barrier_depends(); + cursor = ACCESS_ONCE(shortcut->next_node); + } + + node = assoc_array_ptr_to_node(cursor); + smp_read_barrier_depends(); + slot = 0; + + /* We perform two passes of each node. + * + * The first pass does all the leaves in this node. This means we + * don't miss any leaves if the node is split up by insertion whilst + * we're iterating over the branches rooted here (we may, however, see + * some leaves twice). + */ + has_meta = 0; + for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { + ptr = ACCESS_ONCE(node->slots[slot]); + has_meta |= (unsigned long)ptr; + if (ptr && assoc_array_ptr_is_leaf(ptr)) { + /* We need a barrier between the read of the pointer + * and dereferencing the pointer - but only if we are + * actually going to dereference it. + */ + smp_read_barrier_depends(); + + /* Invoke the callback */ + ret = iterator(assoc_array_ptr_to_leaf(ptr), + iterator_data); + if (ret) + return ret; + } + } + + /* The second pass attends to all the metadata pointers. If we follow + * one of these we may find that we don't come back here, but rather go + * back to a replacement node with the leaves in a different layout. + * + * We are guaranteed to make progress, however, as the slot number for + * a particular portion of the key space cannot change - and we + * continue at the back pointer + 1. + */ + if (!(has_meta & ASSOC_ARRAY_PTR_META_TYPE)) + goto finished_node; + slot = 0; + +continue_node: + node = assoc_array_ptr_to_node(cursor); + smp_read_barrier_depends(); + + for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { + ptr = ACCESS_ONCE(node->slots[slot]); + if (assoc_array_ptr_is_meta(ptr)) { + cursor = ptr; + goto begin_node; + } + } + +finished_node: + /* Move up to the parent (may need to skip back over a shortcut) */ + parent = ACCESS_ONCE(node->back_pointer); + slot = node->parent_slot; + if (parent == stop) + return 0; + + if (assoc_array_ptr_is_shortcut(parent)) { + shortcut = assoc_array_ptr_to_shortcut(parent); + smp_read_barrier_depends(); + cursor = parent; + parent = ACCESS_ONCE(shortcut->back_pointer); + slot = shortcut->parent_slot; + if (parent == stop) + return 0; + } + + /* Ascend to next slot in parent node */ + cursor = parent; + slot++; + goto continue_node; +} + +/** + * assoc_array_iterate - Pass all objects in the array to a callback + * @array: The array to iterate over. + * @iterator: The callback function. + * @iterator_data: Private data for the callback function. + * + * Iterate over all the objects in an associative array. Each one will be + * presented to the iterator function. + * + * If the array is being modified concurrently with the iteration then it is + * possible that some objects in the array will be passed to the iterator + * callback more than once - though every object should be passed at least + * once. If this is undesirable then the caller must lock against modification + * for the duration of this function. + * + * The function will return 0 if no objects were in the array or else it will + * return the result of the last iterator function called. Iteration stops + * immediately if any call to the iteration function results in a non-zero + * return. + * + * The caller should hold the RCU read lock or better if concurrent + * modification is possible. + */ +int assoc_array_iterate(const struct assoc_array *array, + int (*iterator)(const void *object, + void *iterator_data), + void *iterator_data) +{ + struct assoc_array_ptr *root = ACCESS_ONCE(array->root); + + if (!root) + return 0; + return assoc_array_subtree_iterate(root, NULL, iterator, iterator_data); +} + +enum assoc_array_walk_status { + assoc_array_walk_tree_empty, + assoc_array_walk_found_terminal_node, + assoc_array_walk_found_wrong_shortcut, +} status; + +struct assoc_array_walk_result { + struct { + struct assoc_array_node *node; /* Node in which leaf might be found */ + int level; + int slot; + } terminal_node; + struct { + struct assoc_array_shortcut *shortcut; + int level; + int sc_level; + unsigned long sc_segments; + unsigned long dissimilarity; + } wrong_shortcut; +}; + +/* + * Navigate through the internal tree looking for the closest node to the key. + */ +static enum assoc_array_walk_status +assoc_array_walk(const struct assoc_array *array, + const struct assoc_array_ops *ops, + const void *index_key, + struct assoc_array_walk_result *result) +{ + struct assoc_array_shortcut *shortcut; + struct assoc_array_node *node; + struct assoc_array_ptr *cursor, *ptr; + unsigned long sc_segments, dissimilarity; + unsigned long segments; + int level, sc_level, next_sc_level; + int slot; + + pr_devel("-->%s()\n", __func__); + + cursor = ACCESS_ONCE(array->root); + if (!cursor) + return assoc_array_walk_tree_empty; + + level = 0; + + /* Use segments from the key for the new leaf to navigate through the + * internal tree, skipping through nodes and shortcuts that are on + * route to the destination. Eventually we'll come to a slot that is + * either empty or contains a leaf at which point we've found a node in + * which the leaf we're looking for might be found or into which it + * should be inserted. + */ +jumped: + segments = ops->get_key_chunk(index_key, level); + pr_devel("segments[%d]: %lx\n", level, segments); + + if (assoc_array_ptr_is_shortcut(cursor)) + goto follow_shortcut; + +consider_node: + node = assoc_array_ptr_to_node(cursor); + smp_read_barrier_depends(); + + slot = segments >> (level & ASSOC_ARRAY_KEY_CHUNK_MASK); + slot &= ASSOC_ARRAY_FAN_MASK; + ptr = ACCESS_ONCE(node->slots[slot]); + + pr_devel("consider slot %x [ix=%d type=%lu]\n", + slot, level, (unsigned long)ptr & 3); + + if (!assoc_array_ptr_is_meta(ptr)) { + /* The node doesn't have a node/shortcut pointer in the slot + * corresponding to the index key that we have to follow. + */ + result->terminal_node.node = node; + result->terminal_node.level = level; + result->terminal_node.slot = slot; + pr_devel("<--%s() = terminal_node\n", __func__); + return assoc_array_walk_found_terminal_node; + } + + if (assoc_array_ptr_is_node(ptr)) { + /* There is a pointer to a node in the slot corresponding to + * this index key segment, so we need to follow it. + */ + cursor = ptr; + level += ASSOC_ARRAY_LEVEL_STEP; + if ((level & ASSOC_ARRAY_KEY_CHUNK_MASK) != 0) + goto consider_node; + goto jumped; + } + + /* There is a shortcut in the slot corresponding to the index key + * segment. We follow the shortcut if its partial index key matches + * this leaf's. Otherwise we need to split the shortcut. + */ + cursor = ptr; +follow_shortcut: + shortcut = assoc_array_ptr_to_shortcut(cursor); + smp_read_barrier_depends(); + pr_devel("shortcut to %d\n", shortcut->skip_to_level); + sc_level = level + ASSOC_ARRAY_LEVEL_STEP; + BUG_ON(sc_level > shortcut->skip_to_level); + + do { + /* Check the leaf against the shortcut's index key a word at a + * time, trimming the final word (the shortcut stores the index + * key completely from the root to the shortcut's target). + */ + if ((sc_level & ASSOC_ARRAY_KEY_CHUNK_MASK) == 0) + segments = ops->get_key_chunk(index_key, sc_level); + + sc_segments = shortcut->index_key[sc_level >> ASSOC_ARRAY_KEY_CHUNK_SHIFT]; + dissimilarity = segments ^ sc_segments; + + if (round_up(sc_level, ASSOC_ARRAY_KEY_CHUNK_SIZE) > shortcut->skip_to_level) { + /* Trim segments that are beyond the shortcut */ + int shift = shortcut->skip_to_level & ASSOC_ARRAY_KEY_CHUNK_MASK; + dissimilarity &= ~(ULONG_MAX << shift); + next_sc_level = shortcut->skip_to_level; + } else { + next_sc_level = sc_level + ASSOC_ARRAY_KEY_CHUNK_SIZE; + next_sc_level = round_down(next_sc_level, ASSOC_ARRAY_KEY_CHUNK_SIZE); + } + + if (dissimilarity != 0) { + /* This shortcut points elsewhere */ + result->wrong_shortcut.shortcut = shortcut; + result->wrong_shortcut.level = level; + result->wrong_shortcut.sc_level = sc_level; + result->wrong_shortcut.sc_segments = sc_segments; + result->wrong_shortcut.dissimilarity = dissimilarity; + return assoc_array_walk_found_wrong_shortcut; + } + + sc_level = next_sc_level; + } while (sc_level < shortcut->skip_to_level); + + /* The shortcut matches the leaf's index to this point. */ + cursor = ACCESS_ONCE(shortcut->next_node); + if (((level ^ sc_level) & ~ASSOC_ARRAY_KEY_CHUNK_MASK) != 0) { + level = sc_level; + goto jumped; + } else { + level = sc_level; + goto consider_node; + } +} + +/** + * assoc_array_find - Find an object by index key + * @array: The associative array to search. + * @ops: The operations to use. + * @index_key: The key to the object. + * + * Find an object in an associative array by walking through the internal tree + * to the node that should contain the object and then searching the leaves + * there. NULL is returned if the requested object was not found in the array. + * + * The caller must hold the RCU read lock or better. + */ +void *assoc_array_find(const struct assoc_array *array, + const struct assoc_array_ops *ops, + const void *index_key) +{ + struct assoc_array_walk_result result; + const struct assoc_array_node *node; + const struct assoc_array_ptr *ptr; + const void *leaf; + int slot; + + if (assoc_array_walk(array, ops, index_key, &result) != + assoc_array_walk_found_terminal_node) + return NULL; + + node = result.terminal_node.node; + smp_read_barrier_depends(); + + /* If the target key is available to us, it's has to be pointed to by + * the terminal node. + */ + for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { + ptr = ACCESS_ONCE(node->slots[slot]); + if (ptr && assoc_array_ptr_is_leaf(ptr)) { + /* We need a barrier between the read of the pointer + * and dereferencing the pointer - but only if we are + * actually going to dereference it. + */ + leaf = assoc_array_ptr_to_leaf(ptr); + smp_read_barrier_depends(); + if (ops->compare_object(leaf, index_key)) + return (void *)leaf; + } + } + + return NULL; +} + +/* + * Destructively iterate over an associative array. The caller must prevent + * other simultaneous accesses. + */ +static void assoc_array_destroy_subtree(struct assoc_array_ptr *root, + const struct assoc_array_ops *ops) +{ + struct assoc_array_shortcut *shortcut; + struct assoc_array_node *node; + struct assoc_array_ptr *cursor, *parent = NULL; + int slot = -1; + + pr_devel("-->%s()\n", __func__); + + cursor = root; + if (!cursor) { + pr_devel("empty\n"); + return; + } + +move_to_meta: + if (assoc_array_ptr_is_shortcut(cursor)) { + /* Descend through a shortcut */ + pr_devel("[%d] shortcut\n", slot); + BUG_ON(!assoc_array_ptr_is_shortcut(cursor)); + shortcut = assoc_array_ptr_to_shortcut(cursor); + BUG_ON(shortcut->back_pointer != parent); + BUG_ON(slot != -1 && shortcut->parent_slot != slot); + parent = cursor; + cursor = shortcut->next_node; + slot = -1; + BUG_ON(!assoc_array_ptr_is_node(cursor)); + } + + pr_devel("[%d] node\n", slot); + node = assoc_array_ptr_to_node(cursor); + BUG_ON(node->back_pointer != parent); + BUG_ON(slot != -1 && node->parent_slot != slot); + slot = 0; + +continue_node: + pr_devel("Node %p [back=%p]\n", node, node->back_pointer); + for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { + struct assoc_array_ptr *ptr = node->slots[slot]; + if (!ptr) + continue; + if (assoc_array_ptr_is_meta(ptr)) { + parent = cursor; + cursor = ptr; + goto move_to_meta; + } + + if (ops) { + pr_devel("[%d] free leaf\n", slot); + ops->free_object(assoc_array_ptr_to_leaf(ptr)); + } + } + + parent = node->back_pointer; + slot = node->parent_slot; + pr_devel("free node\n"); + kfree(node); + if (!parent) + return; /* Done */ + + /* Move back up to the parent (may need to free a shortcut on + * the way up) */ + if (assoc_array_ptr_is_shortcut(parent)) { + shortcut = assoc_array_ptr_to_shortcut(parent); + BUG_ON(shortcut->next_node != cursor); + cursor = parent; + parent = shortcut->back_pointer; + slot = shortcut->parent_slot; + pr_devel("free shortcut\n"); + kfree(shortcut); + if (!parent) + return; + + BUG_ON(!assoc_array_ptr_is_node(parent)); + } + + /* Ascend to next slot in parent node */ + pr_devel("ascend to %p[%d]\n", parent, slot); + cursor = parent; + node = assoc_array_ptr_to_node(cursor); + slot++; + goto continue_node; +} + +/** + * assoc_array_destroy - Destroy an associative array + * @array: The array to destroy. + * @ops: The operations to use. + * + * Discard all metadata and free all objects in an associative array. The + * array will be empty and ready to use again upon completion. This function + * cannot fail. + * + * The caller must prevent all other accesses whilst this takes place as no + * attempt is made to adjust pointers gracefully to permit RCU readlock-holding + * accesses to continue. On the other hand, no memory allocation is required. + */ +void assoc_array_destroy(struct assoc_array *array, + const struct assoc_array_ops *ops) +{ + assoc_array_destroy_subtree(array->root, ops); + array->root = NULL; +} + +/* + * Handle insertion into an empty tree. + */ +static bool assoc_array_insert_in_empty_tree(struct assoc_array_edit *edit) +{ + struct assoc_array_node *new_n0; + + pr_devel("-->%s()\n", __func__); + + new_n0 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL); + if (!new_n0) + return false; + + edit->new_meta[0] = assoc_array_node_to_ptr(new_n0); + edit->leaf_p = &new_n0->slots[0]; + edit->adjust_count_on = new_n0; + edit->set[0].ptr = &edit->array->root; + edit->set[0].to = assoc_array_node_to_ptr(new_n0); + + pr_devel("<--%s() = ok [no root]\n", __func__); + return true; +} + +/* + * Handle insertion into a terminal node. + */ +static bool assoc_array_insert_into_terminal_node(struct assoc_array_edit *edit, + const struct assoc_array_ops *ops, + const void *index_key, + struct assoc_array_walk_result *result) +{ + struct assoc_array_shortcut *shortcut, *new_s0; + struct assoc_array_node *node, *new_n0, *new_n1, *side; + struct assoc_array_ptr *ptr; + unsigned long dissimilarity, base_seg, blank; + size_t keylen; + bool have_meta; + int level, diff; + int slot, next_slot, free_slot, i, j; + + node = result->terminal_node.node; + level = result->terminal_node.level; + edit->segment_cache[ASSOC_ARRAY_FAN_OUT] = result->terminal_node.slot; + + pr_devel("-->%s()\n", __func__); + + /* We arrived at a node which doesn't have an onward node or shortcut + * pointer that we have to follow. This means that (a) the leaf we + * want must go here (either by insertion or replacement) or (b) we + * need to split this node and insert in one of the fragments. + */ + free_slot = -1; + + /* Firstly, we have to check the leaves in this node to see if there's + * a matching one we should replace in place. + */ + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { + ptr = node->slots[i]; + if (!ptr) { + free_slot = i; + continue; + } + if (ops->compare_object(assoc_array_ptr_to_leaf(ptr), index_key)) { + pr_devel("replace in slot %d\n", i); + edit->leaf_p = &node->slots[i]; + edit->dead_leaf = node->slots[i]; + pr_devel("<--%s() = ok [replace]\n", __func__); + return true; + } + } + + /* If there is a free slot in this node then we can just insert the + * leaf here. + */ + if (free_slot >= 0) { + pr_devel("insert in free slot %d\n", free_slot); + edit->leaf_p = &node->slots[free_slot]; + edit->adjust_count_on = node; + pr_devel("<--%s() = ok [insert]\n", __func__); + return true; + } + + /* The node has no spare slots - so we're either going to have to split + * it or insert another node before it. + * + * Whatever, we're going to need at least two new nodes - so allocate + * those now. We may also need a new shortcut, but we deal with that + * when we need it. + */ + new_n0 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL); + if (!new_n0) + return false; + edit->new_meta[0] = assoc_array_node_to_ptr(new_n0); + new_n1 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL); + if (!new_n1) + return false; + edit->new_meta[1] = assoc_array_node_to_ptr(new_n1); + + /* We need to find out how similar the leaves are. */ + pr_devel("no spare slots\n"); + have_meta = false; + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { + ptr = node->slots[i]; + if (assoc_array_ptr_is_meta(ptr)) { + edit->segment_cache[i] = 0xff; + have_meta = true; + continue; + } + base_seg = ops->get_object_key_chunk( + assoc_array_ptr_to_leaf(ptr), level); + base_seg >>= level & ASSOC_ARRAY_KEY_CHUNK_MASK; + edit->segment_cache[i] = base_seg & ASSOC_ARRAY_FAN_MASK; + } + + if (have_meta) { + pr_devel("have meta\n"); + goto split_node; + } + + /* The node contains only leaves */ + dissimilarity = 0; + base_seg = edit->segment_cache[0]; + for (i = 1; i < ASSOC_ARRAY_FAN_OUT; i++) + dissimilarity |= edit->segment_cache[i] ^ base_seg; + + pr_devel("only leaves; dissimilarity=%lx\n", dissimilarity); + + if ((dissimilarity & ASSOC_ARRAY_FAN_MASK) == 0) { + /* The old leaves all cluster in the same slot. We will need + * to insert a shortcut if the new node wants to cluster with them. + */ + if ((edit->segment_cache[ASSOC_ARRAY_FAN_OUT] ^ base_seg) == 0) + goto all_leaves_cluster_together; + + /* Otherwise we can just insert a new node ahead of the old + * one. + */ + goto present_leaves_cluster_but_not_new_leaf; + } + +split_node: + pr_devel("split node\n"); + + /* We need to split the current node; we know that the node doesn't + * simply contain a full set of leaves that cluster together (it + * contains meta pointers and/or non-clustering leaves). + * + * We need to expel at least two leaves out of a set consisting of the + * leaves in the node and the new leaf. + * + * We need a new node (n0) to replace the current one and a new node to + * take the expelled nodes (n1). + */ + edit->set[0].to = assoc_array_node_to_ptr(new_n0); + new_n0->back_pointer = node->back_pointer; + new_n0->parent_slot = node->parent_slot; + new_n1->back_pointer = assoc_array_node_to_ptr(new_n0); + new_n1->parent_slot = -1; /* Need to calculate this */ + +do_split_node: + pr_devel("do_split_node\n"); + + new_n0->nr_leaves_on_branch = node->nr_leaves_on_branch; + new_n1->nr_leaves_on_branch = 0; + + /* Begin by finding two matching leaves. There have to be at least two + * that match - even if there are meta pointers - because any leaf that + * would match a slot with a meta pointer in it must be somewhere + * behind that meta pointer and cannot be here. Further, given N + * remaining leaf slots, we now have N+1 leaves to go in them. + */ + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { + slot = edit->segment_cache[i]; + if (slot != 0xff) + for (j = i + 1; j < ASSOC_ARRAY_FAN_OUT + 1; j++) + if (edit->segment_cache[j] == slot) + goto found_slot_for_multiple_occupancy; + } +found_slot_for_multiple_occupancy: + pr_devel("same slot: %x %x [%02x]\n", i, j, slot); + BUG_ON(i >= ASSOC_ARRAY_FAN_OUT); + BUG_ON(j >= ASSOC_ARRAY_FAN_OUT + 1); + BUG_ON(slot >= ASSOC_ARRAY_FAN_OUT); + + new_n1->parent_slot = slot; + + /* Metadata pointers cannot change slot */ + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) + if (assoc_array_ptr_is_meta(node->slots[i])) + new_n0->slots[i] = node->slots[i]; + else + new_n0->slots[i] = NULL; + BUG_ON(new_n0->slots[slot] != NULL); + new_n0->slots[slot] = assoc_array_node_to_ptr(new_n1); + + /* Filter the leaf pointers between the new nodes */ + free_slot = -1; + next_slot = 0; + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { + if (assoc_array_ptr_is_meta(node->slots[i])) + continue; + if (edit->segment_cache[i] == slot) { + new_n1->slots[next_slot++] = node->slots[i]; + new_n1->nr_leaves_on_branch++; + } else { + do { + free_slot++; + } while (new_n0->slots[free_slot] != NULL); + new_n0->slots[free_slot] = node->slots[i]; + } + } + + pr_devel("filtered: f=%x n=%x\n", free_slot, next_slot); + + if (edit->segment_cache[ASSOC_ARRAY_FAN_OUT] != slot) { + do { + free_slot++; + } while (new_n0->slots[free_slot] != NULL); + edit->leaf_p = &new_n0->slots[free_slot]; + edit->adjust_count_on = new_n0; + } else { + edit->leaf_p = &new_n1->slots[next_slot++]; + edit->adjust_count_on = new_n1; + } + + BUG_ON(next_slot <= 1); + + edit->set_backpointers_to = assoc_array_node_to_ptr(new_n0); + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { + if (edit->segment_cache[i] == 0xff) { + ptr = node->slots[i]; + BUG_ON(assoc_array_ptr_is_leaf(ptr)); + if (assoc_array_ptr_is_node(ptr)) { + side = assoc_array_ptr_to_node(ptr); + edit->set_backpointers[i] = &side->back_pointer; + } else { + shortcut = assoc_array_ptr_to_shortcut(ptr); + edit->set_backpointers[i] = &shortcut->back_pointer; + } + } + } + + ptr = node->back_pointer; + if (!ptr) + edit->set[0].ptr = &edit->array->root; + else if (assoc_array_ptr_is_node(ptr)) + edit->set[0].ptr = &assoc_array_ptr_to_node(ptr)->slots[node->parent_slot]; + else + edit->set[0].ptr = &assoc_array_ptr_to_shortcut(ptr)->next_node; + edit->excised_meta[0] = assoc_array_node_to_ptr(node); + pr_devel("<--%s() = ok [split node]\n", __func__); + return true; + +present_leaves_cluster_but_not_new_leaf: + /* All the old leaves cluster in the same slot, but the new leaf wants + * to go into a different slot, so we create a new node to hold the new + * leaf and a pointer to a new node holding all the old leaves. + */ + pr_devel("present leaves cluster but not new leaf\n"); + + new_n0->back_pointer = node->back_pointer; + new_n0->parent_slot = node->parent_slot; + new_n0->nr_leaves_on_branch = node->nr_leaves_on_branch; + new_n1->back_pointer = assoc_array_node_to_ptr(new_n0); + new_n1->parent_slot = edit->segment_cache[0]; + new_n1->nr_leaves_on_branch = node->nr_leaves_on_branch; + edit->adjust_count_on = new_n0; + + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) + new_n1->slots[i] = node->slots[i]; + + new_n0->slots[edit->segment_cache[0]] = assoc_array_node_to_ptr(new_n0); + edit->leaf_p = &new_n0->slots[edit->segment_cache[ASSOC_ARRAY_FAN_OUT]]; + + edit->set[0].ptr = &assoc_array_ptr_to_node(node->back_pointer)->slots[node->parent_slot]; + edit->set[0].to = assoc_array_node_to_ptr(new_n0); + edit->excised_meta[0] = assoc_array_node_to_ptr(node); + pr_devel("<--%s() = ok [insert node before]\n", __func__); + return true; + +all_leaves_cluster_together: + /* All the leaves, new and old, want to cluster together in this node + * in the same slot, so we have to replace this node with a shortcut to + * skip over the identical parts of the key and then place a pair of + * nodes, one inside the other, at the end of the shortcut and + * distribute the keys between them. + * + * Firstly we need to work out where the leaves start diverging as a + * bit position into their keys so that we know how big the shortcut + * needs to be. + * + * We only need to make a single pass of N of the N+1 leaves because if + * any keys differ between themselves at bit X then at least one of + * them must also differ with the base key at bit X or before. + */ + pr_devel("all leaves cluster together\n"); + diff = INT_MAX; + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { + int x = ops->diff_objects(assoc_array_ptr_to_leaf(edit->leaf), + assoc_array_ptr_to_leaf(node->slots[i])); + if (x < diff) { + BUG_ON(x < 0); + diff = x; + } + } + BUG_ON(diff == INT_MAX); + BUG_ON(diff < level + ASSOC_ARRAY_LEVEL_STEP); + + keylen = round_up(diff, ASSOC_ARRAY_KEY_CHUNK_SIZE); + keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT; + + new_s0 = kzalloc(sizeof(struct assoc_array_shortcut) + + keylen * sizeof(unsigned long), GFP_KERNEL); + if (!new_s0) + return false; + edit->new_meta[2] = assoc_array_shortcut_to_ptr(new_s0); + + edit->set[0].to = assoc_array_shortcut_to_ptr(new_s0); + new_s0->back_pointer = node->back_pointer; + new_s0->parent_slot = node->parent_slot; + new_s0->next_node = assoc_array_node_to_ptr(new_n0); + new_n0->back_pointer = assoc_array_shortcut_to_ptr(new_s0); + new_n0->parent_slot = 0; + new_n1->back_pointer = assoc_array_node_to_ptr(new_n0); + new_n1->parent_slot = -1; /* Need to calculate this */ + + new_s0->skip_to_level = level = diff & ~ASSOC_ARRAY_LEVEL_STEP_MASK; + pr_devel("skip_to_level = %d [diff %d]\n", level, diff); + BUG_ON(level <= 0); + + for (i = 0; i < keylen; i++) + new_s0->index_key[i] = + ops->get_key_chunk(index_key, i * ASSOC_ARRAY_KEY_CHUNK_SIZE); + + blank = ULONG_MAX << (level & ASSOC_ARRAY_KEY_CHUNK_MASK); + pr_devel("blank off [%zu] %d: %lx\n", keylen - 1, level, blank); + new_s0->index_key[keylen - 1] &= ~blank; + + /* This now reduces to a node splitting exercise for which we'll need + * to regenerate the disparity table. + */ + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { + ptr = node->slots[i]; + base_seg = ops->get_object_key_chunk(assoc_array_ptr_to_leaf(ptr), + level); + base_seg >>= level & ASSOC_ARRAY_KEY_CHUNK_MASK; + edit->segment_cache[i] = base_seg & ASSOC_ARRAY_FAN_MASK; + } + + base_seg = ops->get_key_chunk(index_key, level); + base_seg >>= level & ASSOC_ARRAY_KEY_CHUNK_MASK; + edit->segment_cache[ASSOC_ARRAY_FAN_OUT] = base_seg & ASSOC_ARRAY_FAN_MASK; + goto do_split_node; +} + +/* + * Handle insertion into the middle of a shortcut. + */ +static bool assoc_array_insert_mid_shortcut(struct assoc_array_edit *edit, + const struct assoc_array_ops *ops, + struct assoc_array_walk_result *result) +{ + struct assoc_array_shortcut *shortcut, *new_s0, *new_s1; + struct assoc_array_node *node, *new_n0, *side; + unsigned long sc_segments, dissimilarity, blank; + size_t keylen; + int level, sc_level, diff; + int sc_slot; + + shortcut = result->wrong_shortcut.shortcut; + level = result->wrong_shortcut.level; + sc_level = result->wrong_shortcut.sc_level; + sc_segments = result->wrong_shortcut.sc_segments; + dissimilarity = result->wrong_shortcut.dissimilarity; + + pr_devel("-->%s(ix=%d dis=%lx scix=%d)\n", + __func__, level, dissimilarity, sc_level); + + /* We need to split a shortcut and insert a node between the two + * pieces. Zero-length pieces will be dispensed with entirely. + * + * First of all, we need to find out in which level the first + * difference was. + */ + diff = __ffs(dissimilarity); + diff &= ~ASSOC_ARRAY_LEVEL_STEP_MASK; + diff += sc_level & ~ASSOC_ARRAY_KEY_CHUNK_MASK; + pr_devel("diff=%d\n", diff); + + if (!shortcut->back_pointer) { + edit->set[0].ptr = &edit->array->root; + } else if (assoc_array_ptr_is_node(shortcut->back_pointer)) { + node = assoc_array_ptr_to_node(shortcut->back_pointer); + edit->set[0].ptr = &node->slots[shortcut->parent_slot]; + } else { + BUG(); + } + + edit->excised_meta[0] = assoc_array_shortcut_to_ptr(shortcut); + + /* Create a new node now since we're going to need it anyway */ + new_n0 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL); + if (!new_n0) + return false; + edit->new_meta[0] = assoc_array_node_to_ptr(new_n0); + edit->adjust_count_on = new_n0; + + /* Insert a new shortcut before the new node if this segment isn't of + * zero length - otherwise we just connect the new node directly to the + * parent. + */ + level += ASSOC_ARRAY_LEVEL_STEP; + if (diff > level) { + pr_devel("pre-shortcut %d...%d\n", level, diff); + keylen = round_up(diff, ASSOC_ARRAY_KEY_CHUNK_SIZE); + keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT; + + new_s0 = kzalloc(sizeof(struct assoc_array_shortcut) + + keylen * sizeof(unsigned long), GFP_KERNEL); + if (!new_s0) + return false; + edit->new_meta[1] = assoc_array_shortcut_to_ptr(new_s0); + edit->set[0].to = assoc_array_shortcut_to_ptr(new_s0); + new_s0->back_pointer = shortcut->back_pointer; + new_s0->parent_slot = shortcut->parent_slot; + new_s0->next_node = assoc_array_node_to_ptr(new_n0); + new_s0->skip_to_level = diff; + + new_n0->back_pointer = assoc_array_shortcut_to_ptr(new_s0); + new_n0->parent_slot = 0; + + memcpy(new_s0->index_key, shortcut->index_key, + keylen * sizeof(unsigned long)); + + blank = ULONG_MAX << (diff & ASSOC_ARRAY_KEY_CHUNK_MASK); + pr_devel("blank off [%zu] %d: %lx\n", keylen - 1, diff, blank); + new_s0->index_key[keylen - 1] &= ~blank; + } else { + pr_devel("no pre-shortcut\n"); + edit->set[0].to = assoc_array_node_to_ptr(new_n0); + new_n0->back_pointer = shortcut->back_pointer; + new_n0->parent_slot = shortcut->parent_slot; + } + + side = assoc_array_ptr_to_node(shortcut->next_node); + new_n0->nr_leaves_on_branch = side->nr_leaves_on_branch; + + /* We need to know which slot in the new node is going to take a + * metadata pointer. + */ + sc_slot = sc_segments >> (diff & ASSOC_ARRAY_KEY_CHUNK_MASK); + sc_slot &= ASSOC_ARRAY_FAN_MASK; + + pr_devel("new slot %lx >> %d -> %d\n", + sc_segments, diff & ASSOC_ARRAY_KEY_CHUNK_MASK, sc_slot); + + /* Determine whether we need to follow the new node with a replacement + * for the current shortcut. We could in theory reuse the current + * shortcut if its parent slot number doesn't change - but that's a + * 1-in-16 chance so not worth expending the code upon. + */ + level = diff + ASSOC_ARRAY_LEVEL_STEP; + if (level < shortcut->skip_to_level) { + pr_devel("post-shortcut %d...%d\n", level, shortcut->skip_to_level); + keylen = round_up(shortcut->skip_to_level, ASSOC_ARRAY_KEY_CHUNK_SIZE); + keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT; + + new_s1 = kzalloc(sizeof(struct assoc_array_shortcut) + + keylen * sizeof(unsigned long), GFP_KERNEL); + if (!new_s1) + return false; + edit->new_meta[2] = assoc_array_shortcut_to_ptr(new_s1); + + new_s1->back_pointer = assoc_array_node_to_ptr(new_n0); + new_s1->parent_slot = sc_slot; + new_s1->next_node = shortcut->next_node; + new_s1->skip_to_level = shortcut->skip_to_level; + + new_n0->slots[sc_slot] = assoc_array_shortcut_to_ptr(new_s1); + + memcpy(new_s1->index_key, shortcut->index_key, + keylen * sizeof(unsigned long)); + + edit->set[1].ptr = &side->back_pointer; + edit->set[1].to = assoc_array_shortcut_to_ptr(new_s1); + } else { + pr_devel("no post-shortcut\n"); + + /* We don't have to replace the pointed-to node as long as we + * use memory barriers to make sure the parent slot number is + * changed before the back pointer (the parent slot number is + * irrelevant to the old parent shortcut). + */ + new_n0->slots[sc_slot] = shortcut->next_node; + edit->set_parent_slot[0].p = &side->parent_slot; + edit->set_parent_slot[0].to = sc_slot; + edit->set[1].ptr = &side->back_pointer; + edit->set[1].to = assoc_array_node_to_ptr(new_n0); + } + + /* Install the new leaf in a spare slot in the new node. */ + if (sc_slot == 0) + edit->leaf_p = &new_n0->slots[1]; + else + edit->leaf_p = &new_n0->slots[0]; + + pr_devel("<--%s() = ok [split shortcut]\n", __func__); + return edit; +} + +/** + * assoc_array_insert - Script insertion of an object into an associative array + * @array: The array to insert into. + * @ops: The operations to use. + * @index_key: The key to insert at. + * @object: The object to insert. + * + * Precalculate and preallocate a script for the insertion or replacement of an + * object in an associative array. This results in an edit script that can + * either be applied or cancelled. + * + * The function returns a pointer to an edit script or -ENOMEM. + * + * The caller should lock against other modifications and must continue to hold + * the lock until assoc_array_apply_edit() has been called. + * + * Accesses to the tree may take place concurrently with this function, + * provided they hold the RCU read lock. + */ +struct assoc_array_edit *assoc_array_insert(struct assoc_array *array, + const struct assoc_array_ops *ops, + const void *index_key, + void *object) +{ + struct assoc_array_walk_result result; + struct assoc_array_edit *edit; + + pr_devel("-->%s()\n", __func__); + + /* The leaf pointer we're given must not have the bottom bit set as we + * use those for type-marking the pointer. NULL pointers are also not + * allowed as they indicate an empty slot but we have to allow them + * here as they can be updated later. + */ + BUG_ON(assoc_array_ptr_is_meta(object)); + + edit = kzalloc(sizeof(struct assoc_array_edit), GFP_KERNEL); + if (!edit) + return ERR_PTR(-ENOMEM); + edit->array = array; + edit->ops = ops; + edit->leaf = assoc_array_leaf_to_ptr(object); + edit->adjust_count_by = 1; + + switch (assoc_array_walk(array, ops, index_key, &result)) { + case assoc_array_walk_tree_empty: + /* Allocate a root node if there isn't one yet */ + if (!assoc_array_insert_in_empty_tree(edit)) + goto enomem; + return edit; + + case assoc_array_walk_found_terminal_node: + /* We found a node that doesn't have a node/shortcut pointer in + * the slot corresponding to the index key that we have to + * follow. + */ + if (!assoc_array_insert_into_terminal_node(edit, ops, index_key, + &result)) + goto enomem; + return edit; + + case assoc_array_walk_found_wrong_shortcut: + /* We found a shortcut that didn't match our key in a slot we + * needed to follow. + */ + if (!assoc_array_insert_mid_shortcut(edit, ops, &result)) + goto enomem; + return edit; + } + +enomem: + /* Clean up after an out of memory error */ + pr_devel("enomem\n"); + assoc_array_cancel_edit(edit); + return ERR_PTR(-ENOMEM); +} + +/** + * assoc_array_insert_set_object - Set the new object pointer in an edit script + * @edit: The edit script to modify. + * @object: The object pointer to set. + * + * Change the object to be inserted in an edit script. The object pointed to + * by the old object is not freed. This must be done prior to applying the + * script. + */ +void assoc_array_insert_set_object(struct assoc_array_edit *edit, void *object) +{ + BUG_ON(!object); + edit->leaf = assoc_array_leaf_to_ptr(object); +} + +struct assoc_array_delete_collapse_context { + struct assoc_array_node *node; + const void *skip_leaf; + int slot; +}; + +/* + * Subtree collapse to node iterator. + */ +static int assoc_array_delete_collapse_iterator(const void *leaf, + void *iterator_data) +{ + struct assoc_array_delete_collapse_context *collapse = iterator_data; + + if (leaf == collapse->skip_leaf) + return 0; + + BUG_ON(collapse->slot >= ASSOC_ARRAY_FAN_OUT); + + collapse->node->slots[collapse->slot++] = assoc_array_leaf_to_ptr(leaf); + return 0; +} + +/** + * assoc_array_delete - Script deletion of an object from an associative array + * @array: The array to search. + * @ops: The operations to use. + * @index_key: The key to the object. + * + * Precalculate and preallocate a script for the deletion of an object from an + * associative array. This results in an edit script that can either be + * applied or cancelled. + * + * The function returns a pointer to an edit script if the object was found, + * NULL if the object was not found or -ENOMEM. + * + * The caller should lock against other modifications and must continue to hold + * the lock until assoc_array_apply_edit() has been called. + * + * Accesses to the tree may take place concurrently with this function, + * provided they hold the RCU read lock. + */ +struct assoc_array_edit *assoc_array_delete(struct assoc_array *array, + const struct assoc_array_ops *ops, + const void *index_key) +{ + struct assoc_array_delete_collapse_context collapse; + struct assoc_array_walk_result result; + struct assoc_array_node *node, *new_n0; + struct assoc_array_edit *edit; + struct assoc_array_ptr *ptr; + bool has_meta; + int slot, i; + + pr_devel("-->%s()\n", __func__); + + edit = kzalloc(sizeof(struct assoc_array_edit), GFP_KERNEL); + if (!edit) + return ERR_PTR(-ENOMEM); + edit->array = array; + edit->ops = ops; + edit->adjust_count_by = -1; + + switch (assoc_array_walk(array, ops, index_key, &result)) { + case assoc_array_walk_found_terminal_node: + /* We found a node that should contain the leaf we've been + * asked to remove - *if* it's in the tree. + */ + pr_devel("terminal_node\n"); + node = result.terminal_node.node; + + for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { + ptr = node->slots[slot]; + if (ptr && + assoc_array_ptr_is_leaf(ptr) && + ops->compare_object(assoc_array_ptr_to_leaf(ptr), + index_key)) + goto found_leaf; + } + case assoc_array_walk_tree_empty: + case assoc_array_walk_found_wrong_shortcut: + default: + assoc_array_cancel_edit(edit); + pr_devel("not found\n"); + return NULL; + } + +found_leaf: + BUG_ON(array->nr_leaves_on_tree <= 0); + + /* In the simplest form of deletion we just clear the slot and release + * the leaf after a suitable interval. + */ + edit->dead_leaf = node->slots[slot]; + edit->set[0].ptr = &node->slots[slot]; + edit->set[0].to = NULL; + edit->adjust_count_on = node; + + /* If that concludes erasure of the last leaf, then delete the entire + * internal array. + */ + if (array->nr_leaves_on_tree == 1) { + edit->set[1].ptr = &array->root; + edit->set[1].to = NULL; + edit->adjust_count_on = NULL; + edit->excised_subtree = array->root; + pr_devel("all gone\n"); + return edit; + } + + /* However, we'd also like to clear up some metadata blocks if we + * possibly can. + * + * We go for a simple algorithm of: if this node has FAN_OUT or fewer + * leaves in it, then attempt to collapse it - and attempt to + * recursively collapse up the tree. + * + * We could also try and collapse in partially filled subtrees to take + * up space in this node. + */ + if (node->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT + 1) { + struct assoc_array_node *parent, *grandparent; + struct assoc_array_ptr *ptr; + + /* First of all, we need to know if this node has metadata so + * that we don't try collapsing if all the leaves are already + * here. + */ + has_meta = false; + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { + ptr = node->slots[i]; + if (assoc_array_ptr_is_meta(ptr)) { + has_meta = true; + break; + } + } + + pr_devel("leaves: %ld [m=%d]\n", + node->nr_leaves_on_branch - 1, has_meta); + + /* Look further up the tree to see if we can collapse this node + * into a more proximal node too. + */ + parent = node; + collapse_up: + pr_devel("collapse subtree: %ld\n", parent->nr_leaves_on_branch); + + ptr = parent->back_pointer; + if (!ptr) + goto do_collapse; + if (assoc_array_ptr_is_shortcut(ptr)) { + struct assoc_array_shortcut *s = assoc_array_ptr_to_shortcut(ptr); + ptr = s->back_pointer; + if (!ptr) + goto do_collapse; + } + + grandparent = assoc_array_ptr_to_node(ptr); + if (grandparent->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT + 1) { + parent = grandparent; + goto collapse_up; + } + + do_collapse: + /* There's no point collapsing if the original node has no meta + * pointers to discard and if we didn't merge into one of that + * node's ancestry. + */ + if (has_meta || parent != node) { + node = parent; + + /* Create a new node to collapse into */ + new_n0 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL); + if (!new_n0) + goto enomem; + edit->new_meta[0] = assoc_array_node_to_ptr(new_n0); + + new_n0->back_pointer = node->back_pointer; + new_n0->parent_slot = node->parent_slot; + new_n0->nr_leaves_on_branch = node->nr_leaves_on_branch; + edit->adjust_count_on = new_n0; + + collapse.node = new_n0; + collapse.skip_leaf = assoc_array_ptr_to_leaf(edit->dead_leaf); + collapse.slot = 0; + assoc_array_subtree_iterate(assoc_array_node_to_ptr(node), + node->back_pointer, + assoc_array_delete_collapse_iterator, + &collapse); + pr_devel("collapsed %d,%lu\n", collapse.slot, new_n0->nr_leaves_on_branch); + BUG_ON(collapse.slot != new_n0->nr_leaves_on_branch - 1); + + if (!node->back_pointer) { + edit->set[1].ptr = &array->root; + } else if (assoc_array_ptr_is_leaf(node->back_pointer)) { + BUG(); + } else if (assoc_array_ptr_is_node(node->back_pointer)) { + struct assoc_array_node *p = + assoc_array_ptr_to_node(node->back_pointer); + edit->set[1].ptr = &p->slots[node->parent_slot]; + } else if (assoc_array_ptr_is_shortcut(node->back_pointer)) { + struct assoc_array_shortcut *s = + assoc_array_ptr_to_shortcut(node->back_pointer); + edit->set[1].ptr = &s->next_node; + } + edit->set[1].to = assoc_array_node_to_ptr(new_n0); + edit->excised_subtree = assoc_array_node_to_ptr(node); + } + } + + return edit; + +enomem: + /* Clean up after an out of memory error */ + pr_devel("enomem\n"); + assoc_array_cancel_edit(edit); + return ERR_PTR(-ENOMEM); +} + +/** + * assoc_array_clear - Script deletion of all objects from an associative array + * @array: The array to clear. + * @ops: The operations to use. + * + * Precalculate and preallocate a script for the deletion of all the objects + * from an associative array. This results in an edit script that can either + * be applied or cancelled. + * + * The function returns a pointer to an edit script if there are objects to be + * deleted, NULL if there are no objects in the array or -ENOMEM. + * + * The caller should lock against other modifications and must continue to hold + * the lock until assoc_array_apply_edit() has been called. + * + * Accesses to the tree may take place concurrently with this function, + * provided they hold the RCU read lock. + */ +struct assoc_array_edit *assoc_array_clear(struct assoc_array *array, + const struct assoc_array_ops *ops) +{ + struct assoc_array_edit *edit; + + pr_devel("-->%s()\n", __func__); + + if (!array->root) + return NULL; + + edit = kzalloc(sizeof(struct assoc_array_edit), GFP_KERNEL); + if (!edit) + return ERR_PTR(-ENOMEM); + edit->array = array; + edit->ops = ops; + edit->set[1].ptr = &array->root; + edit->set[1].to = NULL; + edit->excised_subtree = array->root; + edit->ops_for_excised_subtree = ops; + pr_devel("all gone\n"); + return edit; +} + +/* + * Handle the deferred destruction after an applied edit. + */ +static void assoc_array_rcu_cleanup(struct rcu_head *head) +{ + struct assoc_array_edit *edit = + container_of(head, struct assoc_array_edit, rcu); + int i; + + pr_devel("-->%s()\n", __func__); + + if (edit->dead_leaf) + edit->ops->free_object(assoc_array_ptr_to_leaf(edit->dead_leaf)); + for (i = 0; i < ARRAY_SIZE(edit->excised_meta); i++) + if (edit->excised_meta[i]) + kfree(assoc_array_ptr_to_node(edit->excised_meta[i])); + + if (edit->excised_subtree) { + BUG_ON(assoc_array_ptr_is_leaf(edit->excised_subtree)); + if (assoc_array_ptr_is_node(edit->excised_subtree)) { + struct assoc_array_node *n = + assoc_array_ptr_to_node(edit->excised_subtree); + n->back_pointer = NULL; + } else { + struct assoc_array_shortcut *s = + assoc_array_ptr_to_shortcut(edit->excised_subtree); + s->back_pointer = NULL; + } + assoc_array_destroy_subtree(edit->excised_subtree, + edit->ops_for_excised_subtree); + } + + kfree(edit); +} + +/** + * assoc_array_apply_edit - Apply an edit script to an associative array + * @edit: The script to apply. + * + * Apply an edit script to an associative array to effect an insertion, + * deletion or clearance. As the edit script includes preallocated memory, + * this is guaranteed not to fail. + * + * The edit script, dead objects and dead metadata will be scheduled for + * destruction after an RCU grace period to permit those doing read-only + * accesses on the array to continue to do so under the RCU read lock whilst + * the edit is taking place. + */ +void assoc_array_apply_edit(struct assoc_array_edit *edit) +{ + struct assoc_array_shortcut *shortcut; + struct assoc_array_node *node; + struct assoc_array_ptr *ptr; + int i; + + pr_devel("-->%s()\n", __func__); + + smp_wmb(); + if (edit->leaf_p) + *edit->leaf_p = edit->leaf; + + smp_wmb(); + for (i = 0; i < ARRAY_SIZE(edit->set_parent_slot); i++) + if (edit->set_parent_slot[i].p) + *edit->set_parent_slot[i].p = edit->set_parent_slot[i].to; + + smp_wmb(); + for (i = 0; i < ARRAY_SIZE(edit->set_backpointers); i++) + if (edit->set_backpointers[i]) + *edit->set_backpointers[i] = edit->set_backpointers_to; + + smp_wmb(); + for (i = 0; i < ARRAY_SIZE(edit->set); i++) + if (edit->set[i].ptr) + *edit->set[i].ptr = edit->set[i].to; + + if (edit->array->root == NULL) { + edit->array->nr_leaves_on_tree = 0; + } else if (edit->adjust_count_on) { + node = edit->adjust_count_on; + for (;;) { + node->nr_leaves_on_branch += edit->adjust_count_by; + + ptr = node->back_pointer; + if (!ptr) + break; + if (assoc_array_ptr_is_shortcut(ptr)) { + shortcut = assoc_array_ptr_to_shortcut(ptr); + ptr = shortcut->back_pointer; + if (!ptr) + break; + } + BUG_ON(!assoc_array_ptr_is_node(ptr)); + node = assoc_array_ptr_to_node(ptr); + } + + edit->array->nr_leaves_on_tree += edit->adjust_count_by; + } + + call_rcu(&edit->rcu, assoc_array_rcu_cleanup); +} + +/** + * assoc_array_cancel_edit - Discard an edit script. + * @edit: The script to discard. + * + * Free an edit script and all the preallocated data it holds without making + * any changes to the associative array it was intended for. + * + * NOTE! In the case of an insertion script, this does _not_ release the leaf + * that was to be inserted. That is left to the caller. + */ +void assoc_array_cancel_edit(struct assoc_array_edit *edit) +{ + struct assoc_array_ptr *ptr; + int i; + + pr_devel("-->%s()\n", __func__); + + /* Clean up after an out of memory error */ + for (i = 0; i < ARRAY_SIZE(edit->new_meta); i++) { + ptr = edit->new_meta[i]; + if (ptr) { + if (assoc_array_ptr_is_node(ptr)) + kfree(assoc_array_ptr_to_node(ptr)); + else + kfree(assoc_array_ptr_to_shortcut(ptr)); + } + } + kfree(edit); +} + +/** + * assoc_array_gc - Garbage collect an associative array. + * @array: The array to clean. + * @ops: The operations to use. + * @iterator: A callback function to pass judgement on each object. + * @iterator_data: Private data for the callback function. + * + * Collect garbage from an associative array and pack down the internal tree to + * save memory. + * + * The iterator function is asked to pass judgement upon each object in the + * array. If it returns false, the object is discard and if it returns true, + * the object is kept. If it returns true, it must increment the object's + * usage count (or whatever it needs to do to retain it) before returning. + * + * This function returns 0 if successful or -ENOMEM if out of memory. In the + * latter case, the array is not changed. + * + * The caller should lock against other modifications and must continue to hold + * the lock until assoc_array_apply_edit() has been called. + * + * Accesses to the tree may take place concurrently with this function, + * provided they hold the RCU read lock. + */ +int assoc_array_gc(struct assoc_array *array, + const struct assoc_array_ops *ops, + bool (*iterator)(void *object, void *iterator_data), + void *iterator_data) +{ + struct assoc_array_shortcut *shortcut, *new_s; + struct assoc_array_node *node, *new_n; + struct assoc_array_edit *edit; + struct assoc_array_ptr *cursor, *ptr; + struct assoc_array_ptr *new_root, *new_parent, **new_ptr_pp; + unsigned long nr_leaves_on_tree; + int keylen, slot, nr_free, next_slot, i; + + pr_devel("-->%s()\n", __func__); + + if (!array->root) + return 0; + + edit = kzalloc(sizeof(struct assoc_array_edit), GFP_KERNEL); + if (!edit) + return -ENOMEM; + edit->array = array; + edit->ops = ops; + edit->ops_for_excised_subtree = ops; + edit->set[0].ptr = &array->root; + edit->excised_subtree = array->root; + + new_root = new_parent = NULL; + new_ptr_pp = &new_root; + cursor = array->root; + +descend: + /* If this point is a shortcut, then we need to duplicate it and + * advance the target cursor. + */ + if (assoc_array_ptr_is_shortcut(cursor)) { + shortcut = assoc_array_ptr_to_shortcut(cursor); + keylen = round_up(shortcut->skip_to_level, ASSOC_ARRAY_KEY_CHUNK_SIZE); + keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT; + new_s = kmalloc(sizeof(struct assoc_array_shortcut) + + keylen * sizeof(unsigned long), GFP_KERNEL); + if (!new_s) + goto enomem; + pr_devel("dup shortcut %p -> %p\n", shortcut, new_s); + memcpy(new_s, shortcut, (sizeof(struct assoc_array_shortcut) + + keylen * sizeof(unsigned long))); + new_s->back_pointer = new_parent; + new_s->parent_slot = shortcut->parent_slot; + *new_ptr_pp = new_parent = assoc_array_shortcut_to_ptr(new_s); + new_ptr_pp = &new_s->next_node; + cursor = shortcut->next_node; + } + + /* Duplicate the node at this position */ + node = assoc_array_ptr_to_node(cursor); + new_n = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL); + if (!new_n) + goto enomem; + pr_devel("dup node %p -> %p\n", node, new_n); + new_n->back_pointer = new_parent; + new_n->parent_slot = node->parent_slot; + *new_ptr_pp = new_parent = assoc_array_node_to_ptr(new_n); + new_ptr_pp = NULL; + slot = 0; + +continue_node: + /* Filter across any leaves and gc any subtrees */ + for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { + ptr = node->slots[slot]; + if (!ptr) + continue; + + if (assoc_array_ptr_is_leaf(ptr)) { + if (iterator(assoc_array_ptr_to_leaf(ptr), + iterator_data)) + /* The iterator will have done any reference + * counting on the object for us. + */ + new_n->slots[slot] = ptr; + continue; + } + + new_ptr_pp = &new_n->slots[slot]; + cursor = ptr; + goto descend; + } + + pr_devel("-- compress node %p --\n", new_n); + + /* Count up the number of empty slots in this node and work out the + * subtree leaf count. + */ + new_n->nr_leaves_on_branch = 0; + nr_free = 0; + for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { + ptr = new_n->slots[slot]; + if (!ptr) + nr_free++; + else if (assoc_array_ptr_is_leaf(ptr)) + new_n->nr_leaves_on_branch++; + } + pr_devel("free=%d, leaves=%lu\n", nr_free, new_n->nr_leaves_on_branch); + + /* See what we can fold in */ + next_slot = 0; + for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { + struct assoc_array_shortcut *s; + struct assoc_array_node *child; + + ptr = new_n->slots[slot]; + if (!ptr || assoc_array_ptr_is_leaf(ptr)) + continue; + + s = NULL; + if (assoc_array_ptr_is_shortcut(ptr)) { + s = assoc_array_ptr_to_shortcut(ptr); + ptr = s->next_node; + } + + child = assoc_array_ptr_to_node(ptr); + new_n->nr_leaves_on_branch += child->nr_leaves_on_branch; + + if (child->nr_leaves_on_branch <= nr_free + 1) { + /* Fold the child node into this one */ + pr_devel("[%d] fold node %lu/%d [nx %d]\n", + slot, child->nr_leaves_on_branch, nr_free + 1, + next_slot); + + /* We would already have reaped an intervening shortcut + * on the way back up the tree. + */ + BUG_ON(s); + + new_n->slots[slot] = NULL; + nr_free++; + if (slot < next_slot) + next_slot = slot; + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { + struct assoc_array_ptr *p = child->slots[i]; + if (!p) + continue; + BUG_ON(assoc_array_ptr_is_meta(p)); + while (new_n->slots[next_slot]) + next_slot++; + BUG_ON(next_slot >= ASSOC_ARRAY_FAN_OUT); + new_n->slots[next_slot++] = p; + nr_free--; + } + kfree(child); + } else { + pr_devel("[%d] retain node %lu/%d [nx %d]\n", + slot, child->nr_leaves_on_branch, nr_free + 1, + next_slot); + } + } + + pr_devel("after: %lu\n", new_n->nr_leaves_on_branch); + + nr_leaves_on_tree = new_n->nr_leaves_on_branch; + + /* Excise this node if it is singly occupied by a shortcut */ + if (nr_free == ASSOC_ARRAY_FAN_OUT - 1) { + for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) + if ((ptr = new_n->slots[slot])) + break; + + if (assoc_array_ptr_is_meta(ptr) && + assoc_array_ptr_is_shortcut(ptr)) { + pr_devel("excise node %p with 1 shortcut\n", new_n); + new_s = assoc_array_ptr_to_shortcut(ptr); + new_parent = new_n->back_pointer; + slot = new_n->parent_slot; + kfree(new_n); + if (!new_parent) { + new_s->back_pointer = NULL; + new_s->parent_slot = 0; + new_root = ptr; + goto gc_complete; + } + + if (assoc_array_ptr_is_shortcut(new_parent)) { + /* We can discard any preceding shortcut also */ + struct assoc_array_shortcut *s = + assoc_array_ptr_to_shortcut(new_parent); + + pr_devel("excise preceding shortcut\n"); + + new_parent = new_s->back_pointer = s->back_pointer; + slot = new_s->parent_slot = s->parent_slot; + kfree(s); + if (!new_parent) { + new_s->back_pointer = NULL; + new_s->parent_slot = 0; + new_root = ptr; + goto gc_complete; + } + } + + new_s->back_pointer = new_parent; + new_s->parent_slot = slot; + new_n = assoc_array_ptr_to_node(new_parent); + new_n->slots[slot] = ptr; + goto ascend_old_tree; + } + } + + /* Excise any shortcuts we might encounter that point to nodes that + * only contain leaves. + */ + ptr = new_n->back_pointer; + if (!ptr) + goto gc_complete; + + if (assoc_array_ptr_is_shortcut(ptr)) { + new_s = assoc_array_ptr_to_shortcut(ptr); + new_parent = new_s->back_pointer; + slot = new_s->parent_slot; + + if (new_n->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT) { + struct assoc_array_node *n; + + pr_devel("excise shortcut\n"); + new_n->back_pointer = new_parent; + new_n->parent_slot = slot; + kfree(new_s); + if (!new_parent) { + new_root = assoc_array_node_to_ptr(new_n); + goto gc_complete; + } + + n = assoc_array_ptr_to_node(new_parent); + n->slots[slot] = assoc_array_node_to_ptr(new_n); + } + } else { + new_parent = ptr; + } + new_n = assoc_array_ptr_to_node(new_parent); + +ascend_old_tree: + ptr = node->back_pointer; + if (assoc_array_ptr_is_shortcut(ptr)) { + shortcut = assoc_array_ptr_to_shortcut(ptr); + slot = shortcut->parent_slot; + cursor = shortcut->back_pointer; + } else { + slot = node->parent_slot; + cursor = ptr; + } + BUG_ON(!ptr); + node = assoc_array_ptr_to_node(cursor); + slot++; + goto continue_node; + +gc_complete: + edit->set[0].to = new_root; + assoc_array_apply_edit(edit); + edit->array->nr_leaves_on_tree = nr_leaves_on_tree; + return 0; + +enomem: + pr_devel("enomem\n"); + assoc_array_destroy_subtree(new_root, edit->ops); + kfree(edit); + return -ENOMEM; +} -- cgit v1.2.3 From b2a4df200d570b2c33a57e1ebfa5896e4bc81b69 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 24 Sep 2013 10:35:18 +0100 Subject: KEYS: Expand the capacity of a keyring Expand the capacity of a keyring to be able to hold a lot more keys by using the previously added associative array implementation. Currently the maximum capacity is: (PAGE_SIZE - sizeof(header)) / sizeof(struct key *) which, on a 64-bit system, is a little more 500. However, since this is being used for the NFS uid mapper, we need more than that. The new implementation gives us effectively unlimited capacity. With some alterations, the keyutils testsuite runs successfully to completion after this patch is applied. The alterations are because (a) keyrings that are simply added to no longer appear ordered and (b) some of the errors have changed a bit. Signed-off-by: David Howells --- include/keys/keyring-type.h | 17 +- include/linux/key.h | 13 +- lib/assoc_array.c | 1 + security/keys/Kconfig | 1 + security/keys/gc.c | 33 +- security/keys/internal.h | 17 +- security/keys/key.c | 35 +- security/keys/keyring.c | 1436 ++++++++++++++++++++++--------------------- security/keys/request_key.c | 12 +- 9 files changed, 803 insertions(+), 762 deletions(-) (limited to 'lib') diff --git a/include/keys/keyring-type.h b/include/keys/keyring-type.h index cf49159b0e3a..fca5c62340a4 100644 --- a/include/keys/keyring-type.h +++ b/include/keys/keyring-type.h @@ -1,6 +1,6 @@ /* Keyring key type * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2008, 2013 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or @@ -13,19 +13,6 @@ #define _KEYS_KEYRING_TYPE_H #include -#include - -/* - * the keyring payload contains a list of the keys to which the keyring is - * subscribed - */ -struct keyring_list { - struct rcu_head rcu; /* RCU deletion hook */ - unsigned short maxkeys; /* max keys this list can hold */ - unsigned short nkeys; /* number of keys currently held */ - unsigned short delkey; /* key to be unlinked by RCU */ - struct key __rcu *keys[0]; -}; - +#include #endif /* _KEYS_KEYRING_TYPE_H */ diff --git a/include/linux/key.h b/include/linux/key.h index ef596c7af585..2417f789d29b 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -22,6 +22,7 @@ #include #include #include +#include #ifdef __KERNEL__ #include @@ -196,11 +197,13 @@ struct key { * whatever */ union { - unsigned long value; - void __rcu *rcudata; - void *data; - struct keyring_list __rcu *subscriptions; - } payload; + union { + unsigned long value; + void __rcu *rcudata; + void *data; + } payload; + struct assoc_array keys; + }; }; extern struct key *key_alloc(struct key_type *type, diff --git a/lib/assoc_array.c b/lib/assoc_array.c index a0952818f938..17edeaf19180 100644 --- a/lib/assoc_array.c +++ b/lib/assoc_array.c @@ -12,6 +12,7 @@ */ //#define DEBUG #include +#include #include /* diff --git a/security/keys/Kconfig b/security/keys/Kconfig index a90d6d300dbd..15e0dfe8c80f 100644 --- a/security/keys/Kconfig +++ b/security/keys/Kconfig @@ -4,6 +4,7 @@ config KEYS bool "Enable access key retention support" + select ASSOCIATIVE_ARRAY help This option provides support for retaining authentication tokens and access keys in the kernel. diff --git a/security/keys/gc.c b/security/keys/gc.c index d67c97bb1025..cce621c33dce 100644 --- a/security/keys/gc.c +++ b/security/keys/gc.c @@ -130,6 +130,13 @@ void key_gc_keytype(struct key_type *ktype) kleave(""); } +static int key_gc_keyring_func(const void *object, void *iterator_data) +{ + const struct key *key = object; + time_t *limit = iterator_data; + return key_is_dead(key, *limit); +} + /* * Garbage collect pointers from a keyring. * @@ -138,10 +145,9 @@ void key_gc_keytype(struct key_type *ktype) */ static void key_gc_keyring(struct key *keyring, time_t limit) { - struct keyring_list *klist; - int loop; + int result; - kenter("%x", key_serial(keyring)); + kenter("%x{%s}", keyring->serial, keyring->description ?: ""); if (keyring->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) @@ -149,27 +155,17 @@ static void key_gc_keyring(struct key *keyring, time_t limit) /* scan the keyring looking for dead keys */ rcu_read_lock(); - klist = rcu_dereference(keyring->payload.subscriptions); - if (!klist) - goto unlock_dont_gc; - - loop = klist->nkeys; - smp_rmb(); - for (loop--; loop >= 0; loop--) { - struct key *key = rcu_dereference(klist->keys[loop]); - if (key_is_dead(key, limit)) - goto do_gc; - } - -unlock_dont_gc: + result = assoc_array_iterate(&keyring->keys, + key_gc_keyring_func, &limit); rcu_read_unlock(); + if (result == true) + goto do_gc; + dont_gc: kleave(" [no gc]"); return; do_gc: - rcu_read_unlock(); - keyring_gc(keyring, limit); kleave(" [gc]"); } @@ -392,7 +388,6 @@ found_unreferenced_key: */ found_keyring: spin_unlock(&key_serial_lock); - kdebug("scan keyring %d", key->serial); key_gc_keyring(key, limit); goto maybe_resched; diff --git a/security/keys/internal.h b/security/keys/internal.h index 73950bf8f875..581c6f688352 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -90,20 +90,23 @@ extern void key_type_put(struct key_type *ktype); extern int __key_link_begin(struct key *keyring, const struct keyring_index_key *index_key, - unsigned long *_prealloc); + struct assoc_array_edit **_edit); extern int __key_link_check_live_key(struct key *keyring, struct key *key); -extern void __key_link(struct key *keyring, struct key *key, - unsigned long *_prealloc); +extern void __key_link(struct key *key, struct assoc_array_edit **_edit); extern void __key_link_end(struct key *keyring, const struct keyring_index_key *index_key, - unsigned long prealloc); + struct assoc_array_edit *edit); -extern key_ref_t __keyring_search_one(key_ref_t keyring_ref, - const struct keyring_index_key *index_key); +extern key_ref_t find_key_to_update(key_ref_t keyring_ref, + const struct keyring_index_key *index_key); extern struct key *keyring_search_instkey(struct key *keyring, key_serial_t target_id); +extern int iterate_over_keyring(const struct key *keyring, + int (*func)(const struct key *key, void *data), + void *data); + typedef int (*key_match_func_t)(const struct key *, const void *); struct keyring_search_context { @@ -119,6 +122,8 @@ struct keyring_search_context { #define KEYRING_SEARCH_NO_CHECK_PERM 0x0010 /* Don't check permissions */ #define KEYRING_SEARCH_DETECT_TOO_DEEP 0x0020 /* Give an error on excessive depth */ + int (*iterator)(const void *object, void *iterator_data); + /* Internal stuff */ int skipped_ret; bool possessed; diff --git a/security/keys/key.c b/security/keys/key.c index 7d716b82a61e..a819b5c7d4ec 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -409,7 +409,7 @@ static int __key_instantiate_and_link(struct key *key, struct key_preparsed_payload *prep, struct key *keyring, struct key *authkey, - unsigned long *_prealloc) + struct assoc_array_edit **_edit) { int ret, awaken; @@ -436,7 +436,7 @@ static int __key_instantiate_and_link(struct key *key, /* and link it into the destination keyring */ if (keyring) - __key_link(keyring, key, _prealloc); + __key_link(key, _edit); /* disable the authorisation key */ if (authkey) @@ -476,7 +476,7 @@ int key_instantiate_and_link(struct key *key, struct key *authkey) { struct key_preparsed_payload prep; - unsigned long prealloc; + struct assoc_array_edit *edit; int ret; memset(&prep, 0, sizeof(prep)); @@ -490,16 +490,15 @@ int key_instantiate_and_link(struct key *key, } if (keyring) { - ret = __key_link_begin(keyring, &key->index_key, &prealloc); + ret = __key_link_begin(keyring, &key->index_key, &edit); if (ret < 0) goto error_free_preparse; } - ret = __key_instantiate_and_link(key, &prep, keyring, authkey, - &prealloc); + ret = __key_instantiate_and_link(key, &prep, keyring, authkey, &edit); if (keyring) - __key_link_end(keyring, &key->index_key, prealloc); + __key_link_end(keyring, &key->index_key, edit); error_free_preparse: if (key->type->preparse) @@ -537,7 +536,7 @@ int key_reject_and_link(struct key *key, struct key *keyring, struct key *authkey) { - unsigned long prealloc; + struct assoc_array_edit *edit; struct timespec now; int ret, awaken, link_ret = 0; @@ -548,7 +547,7 @@ int key_reject_and_link(struct key *key, ret = -EBUSY; if (keyring) - link_ret = __key_link_begin(keyring, &key->index_key, &prealloc); + link_ret = __key_link_begin(keyring, &key->index_key, &edit); mutex_lock(&key_construction_mutex); @@ -570,7 +569,7 @@ int key_reject_and_link(struct key *key, /* and link it into the destination keyring */ if (keyring && link_ret == 0) - __key_link(keyring, key, &prealloc); + __key_link(key, &edit); /* disable the authorisation key */ if (authkey) @@ -580,7 +579,7 @@ int key_reject_and_link(struct key *key, mutex_unlock(&key_construction_mutex); if (keyring) - __key_link_end(keyring, &key->index_key, prealloc); + __key_link_end(keyring, &key->index_key, edit); /* wake up anyone waiting for a key to be constructed */ if (awaken) @@ -783,8 +782,8 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, .description = description, }; struct key_preparsed_payload prep; + struct assoc_array_edit *edit; const struct cred *cred = current_cred(); - unsigned long prealloc; struct key *keyring, *key = NULL; key_ref_t key_ref; int ret; @@ -828,7 +827,7 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, } index_key.desc_len = strlen(index_key.description); - ret = __key_link_begin(keyring, &index_key, &prealloc); + ret = __key_link_begin(keyring, &index_key, &edit); if (ret < 0) { key_ref = ERR_PTR(ret); goto error_free_prep; @@ -847,8 +846,8 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, * update that instead if possible */ if (index_key.type->update) { - key_ref = __keyring_search_one(keyring_ref, &index_key); - if (!IS_ERR(key_ref)) + key_ref = find_key_to_update(keyring_ref, &index_key); + if (key_ref) goto found_matching_key; } @@ -874,7 +873,7 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, } /* instantiate it and link it into the target keyring */ - ret = __key_instantiate_and_link(key, &prep, keyring, NULL, &prealloc); + ret = __key_instantiate_and_link(key, &prep, keyring, NULL, &edit); if (ret < 0) { key_put(key); key_ref = ERR_PTR(ret); @@ -884,7 +883,7 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, key_ref = make_key_ref(key, is_key_possessed(keyring_ref)); error_link_end: - __key_link_end(keyring, &index_key, prealloc); + __key_link_end(keyring, &index_key, edit); error_free_prep: if (index_key.type->preparse) index_key.type->free_preparse(&prep); @@ -897,7 +896,7 @@ error: /* we found a matching key, so we're going to try to update it * - we can drop the locks first as we have the key pinned */ - __key_link_end(keyring, &index_key, prealloc); + __key_link_end(keyring, &index_key, edit); key_ref = __key_update(key_ref, &prep); goto error_free_prep; diff --git a/security/keys/keyring.c b/security/keys/keyring.c index eeef1a073db4..f7cdea22214f 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -1,6 +1,6 @@ /* Keyring handling * - * Copyright (C) 2004-2005, 2008 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2004-2005, 2008, 2013 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or @@ -17,25 +17,11 @@ #include #include #include +#include +#include #include #include "internal.h" -#define rcu_dereference_locked_keyring(keyring) \ - (rcu_dereference_protected( \ - (keyring)->payload.subscriptions, \ - rwsem_is_locked((struct rw_semaphore *)&(keyring)->sem))) - -#define rcu_deref_link_locked(klist, index, keyring) \ - (rcu_dereference_protected( \ - (klist)->keys[index], \ - rwsem_is_locked((struct rw_semaphore *)&(keyring)->sem))) - -#define MAX_KEYRING_LINKS \ - min_t(size_t, USHRT_MAX - 1, \ - ((PAGE_SIZE - sizeof(struct keyring_list)) / sizeof(struct key *))) - -#define KEY_LINK_FIXQUOTA 1UL - /* * When plumbing the depths of the key tree, this sets a hard limit * set on how deep we're willing to go. @@ -47,6 +33,28 @@ */ #define KEYRING_NAME_HASH_SIZE (1 << 5) +/* + * We mark pointers we pass to the associative array with bit 1 set if + * they're keyrings and clear otherwise. + */ +#define KEYRING_PTR_SUBTYPE 0x2UL + +static inline bool keyring_ptr_is_keyring(const struct assoc_array_ptr *x) +{ + return (unsigned long)x & KEYRING_PTR_SUBTYPE; +} +static inline struct key *keyring_ptr_to_key(const struct assoc_array_ptr *x) +{ + void *object = assoc_array_ptr_to_leaf(x); + return (struct key *)((unsigned long)object & ~KEYRING_PTR_SUBTYPE); +} +static inline void *keyring_key_to_ptr(struct key *key) +{ + if (key->type == &key_type_keyring) + return (void *)((unsigned long)key | KEYRING_PTR_SUBTYPE); + return key; +} + static struct list_head keyring_name_hash[KEYRING_NAME_HASH_SIZE]; static DEFINE_RWLOCK(keyring_name_lock); @@ -67,7 +75,6 @@ static inline unsigned keyring_hash(const char *desc) */ static int keyring_instantiate(struct key *keyring, struct key_preparsed_payload *prep); -static int keyring_match(const struct key *keyring, const void *criterion); static void keyring_revoke(struct key *keyring); static void keyring_destroy(struct key *keyring); static void keyring_describe(const struct key *keyring, struct seq_file *m); @@ -76,9 +83,9 @@ static long keyring_read(const struct key *keyring, struct key_type key_type_keyring = { .name = "keyring", - .def_datalen = sizeof(struct keyring_list), + .def_datalen = 0, .instantiate = keyring_instantiate, - .match = keyring_match, + .match = user_match, .revoke = keyring_revoke, .destroy = keyring_destroy, .describe = keyring_describe, @@ -127,6 +134,7 @@ static int keyring_instantiate(struct key *keyring, ret = -EINVAL; if (prep->datalen == 0) { + assoc_array_init(&keyring->keys); /* make the keyring available by name if it has one */ keyring_publish_name(keyring); ret = 0; @@ -136,14 +144,225 @@ static int keyring_instantiate(struct key *keyring, } /* - * Match keyrings on their name + * Multiply 64-bits by 32-bits to 96-bits and fold back to 64-bit. Ideally we'd + * fold the carry back too, but that requires inline asm. + */ +static u64 mult_64x32_and_fold(u64 x, u32 y) +{ + u64 hi = (u64)(u32)(x >> 32) * y; + u64 lo = (u64)(u32)(x) * y; + return lo + ((u64)(u32)hi << 32) + (u32)(hi >> 32); +} + +/* + * Hash a key type and description. + */ +static unsigned long hash_key_type_and_desc(const struct keyring_index_key *index_key) +{ + const unsigned level_shift = ASSOC_ARRAY_LEVEL_STEP; + const unsigned long level_mask = ASSOC_ARRAY_LEVEL_STEP_MASK; + const char *description = index_key->description; + unsigned long hash, type; + u32 piece; + u64 acc; + int n, desc_len = index_key->desc_len; + + type = (unsigned long)index_key->type; + + acc = mult_64x32_and_fold(type, desc_len + 13); + acc = mult_64x32_and_fold(acc, 9207); + for (;;) { + n = desc_len; + if (n <= 0) + break; + if (n > 4) + n = 4; + piece = 0; + memcpy(&piece, description, n); + description += n; + desc_len -= n; + acc = mult_64x32_and_fold(acc, piece); + acc = mult_64x32_and_fold(acc, 9207); + } + + /* Fold the hash down to 32 bits if need be. */ + hash = acc; + if (ASSOC_ARRAY_KEY_CHUNK_SIZE == 32) + hash ^= acc >> 32; + + /* Squidge all the keyrings into a separate part of the tree to + * ordinary keys by making sure the lowest level segment in the hash is + * zero for keyrings and non-zero otherwise. + */ + if (index_key->type != &key_type_keyring && (hash & level_mask) == 0) + return hash | (hash >> (ASSOC_ARRAY_KEY_CHUNK_SIZE - level_shift)) | 1; + if (index_key->type == &key_type_keyring && (hash & level_mask) != 0) + return (hash + (hash << level_shift)) & ~level_mask; + return hash; +} + +/* + * Build the next index key chunk. + * + * On 32-bit systems the index key is laid out as: + * + * 0 4 5 9... + * hash desclen typeptr desc[] + * + * On 64-bit systems: + * + * 0 8 9 17... + * hash desclen typeptr desc[] + * + * We return it one word-sized chunk at a time. */ -static int keyring_match(const struct key *keyring, const void *description) +static unsigned long keyring_get_key_chunk(const void *data, int level) +{ + const struct keyring_index_key *index_key = data; + unsigned long chunk = 0; + long offset = 0; + int desc_len = index_key->desc_len, n = sizeof(chunk); + + level /= ASSOC_ARRAY_KEY_CHUNK_SIZE; + switch (level) { + case 0: + return hash_key_type_and_desc(index_key); + case 1: + return ((unsigned long)index_key->type << 8) | desc_len; + case 2: + if (desc_len == 0) + return (u8)((unsigned long)index_key->type >> + (ASSOC_ARRAY_KEY_CHUNK_SIZE - 8)); + n--; + offset = 1; + default: + offset += sizeof(chunk) - 1; + offset += (level - 3) * sizeof(chunk); + if (offset >= desc_len) + return 0; + desc_len -= offset; + if (desc_len > n) + desc_len = n; + offset += desc_len; + do { + chunk <<= 8; + chunk |= ((u8*)index_key->description)[--offset]; + } while (--desc_len > 0); + + if (level == 2) { + chunk <<= 8; + chunk |= (u8)((unsigned long)index_key->type >> + (ASSOC_ARRAY_KEY_CHUNK_SIZE - 8)); + } + return chunk; + } +} + +static unsigned long keyring_get_object_key_chunk(const void *object, int level) +{ + const struct key *key = keyring_ptr_to_key(object); + return keyring_get_key_chunk(&key->index_key, level); +} + +static bool keyring_compare_object(const void *object, const void *data) { - return keyring->description && - strcmp(keyring->description, description) == 0; + const struct keyring_index_key *index_key = data; + const struct key *key = keyring_ptr_to_key(object); + + return key->index_key.type == index_key->type && + key->index_key.desc_len == index_key->desc_len && + memcmp(key->index_key.description, index_key->description, + index_key->desc_len) == 0; } +/* + * Compare the index keys of a pair of objects and determine the bit position + * at which they differ - if they differ. + */ +static int keyring_diff_objects(const void *_a, const void *_b) +{ + const struct key *key_a = keyring_ptr_to_key(_a); + const struct key *key_b = keyring_ptr_to_key(_b); + const struct keyring_index_key *a = &key_a->index_key; + const struct keyring_index_key *b = &key_b->index_key; + unsigned long seg_a, seg_b; + int level, i; + + level = 0; + seg_a = hash_key_type_and_desc(a); + seg_b = hash_key_type_and_desc(b); + if ((seg_a ^ seg_b) != 0) + goto differ; + + /* The number of bits contributed by the hash is controlled by a + * constant in the assoc_array headers. Everything else thereafter we + * can deal with as being machine word-size dependent. + */ + level += ASSOC_ARRAY_KEY_CHUNK_SIZE / 8; + seg_a = a->desc_len; + seg_b = b->desc_len; + if ((seg_a ^ seg_b) != 0) + goto differ; + + /* The next bit may not work on big endian */ + level++; + seg_a = (unsigned long)a->type; + seg_b = (unsigned long)b->type; + if ((seg_a ^ seg_b) != 0) + goto differ; + + level += sizeof(unsigned long); + if (a->desc_len == 0) + goto same; + + i = 0; + if (((unsigned long)a->description | (unsigned long)b->description) & + (sizeof(unsigned long) - 1)) { + do { + seg_a = *(unsigned long *)(a->description + i); + seg_b = *(unsigned long *)(b->description + i); + if ((seg_a ^ seg_b) != 0) + goto differ_plus_i; + i += sizeof(unsigned long); + } while (i < (a->desc_len & (sizeof(unsigned long) - 1))); + } + + for (; i < a->desc_len; i++) { + seg_a = *(unsigned char *)(a->description + i); + seg_b = *(unsigned char *)(b->description + i); + if ((seg_a ^ seg_b) != 0) + goto differ_plus_i; + } + +same: + return -1; + +differ_plus_i: + level += i; +differ: + i = level * 8 + __ffs(seg_a ^ seg_b); + return i; +} + +/* + * Free an object after stripping the keyring flag off of the pointer. + */ +static void keyring_free_object(void *object) +{ + key_put(keyring_ptr_to_key(object)); +} + +/* + * Operations for keyring management by the index-tree routines. + */ +static const struct assoc_array_ops keyring_assoc_array_ops = { + .get_key_chunk = keyring_get_key_chunk, + .get_object_key_chunk = keyring_get_object_key_chunk, + .compare_object = keyring_compare_object, + .diff_objects = keyring_diff_objects, + .free_object = keyring_free_object, +}; + /* * Clean up a keyring when it is destroyed. Unpublish its name if it had one * and dispose of its data. @@ -155,9 +374,6 @@ static int keyring_match(const struct key *keyring, const void *description) */ static void keyring_destroy(struct key *keyring) { - struct keyring_list *klist; - int loop; - if (keyring->description) { write_lock(&keyring_name_lock); @@ -168,12 +384,7 @@ static void keyring_destroy(struct key *keyring) write_unlock(&keyring_name_lock); } - klist = rcu_access_pointer(keyring->payload.subscriptions); - if (klist) { - for (loop = klist->nkeys - 1; loop >= 0; loop--) - key_put(rcu_access_pointer(klist->keys[loop])); - kfree(klist); - } + assoc_array_destroy(&keyring->keys, &keyring_assoc_array_ops); } /* @@ -181,76 +392,88 @@ static void keyring_destroy(struct key *keyring) */ static void keyring_describe(const struct key *keyring, struct seq_file *m) { - struct keyring_list *klist; - if (keyring->description) seq_puts(m, keyring->description); else seq_puts(m, "[anon]"); if (key_is_instantiated(keyring)) { - rcu_read_lock(); - klist = rcu_dereference(keyring->payload.subscriptions); - if (klist) - seq_printf(m, ": %u/%u", klist->nkeys, klist->maxkeys); + if (keyring->keys.nr_leaves_on_tree != 0) + seq_printf(m, ": %lu", keyring->keys.nr_leaves_on_tree); else seq_puts(m, ": empty"); - rcu_read_unlock(); } } +struct keyring_read_iterator_context { + size_t qty; + size_t count; + key_serial_t __user *buffer; +}; + +static int keyring_read_iterator(const void *object, void *data) +{ + struct keyring_read_iterator_context *ctx = data; + const struct key *key = keyring_ptr_to_key(object); + int ret; + + kenter("{%s,%d},,{%zu/%zu}", + key->type->name, key->serial, ctx->count, ctx->qty); + + if (ctx->count >= ctx->qty) + return 1; + + ret = put_user(key->serial, ctx->buffer); + if (ret < 0) + return ret; + ctx->buffer++; + ctx->count += sizeof(key->serial); + return 0; +} + /* * Read a list of key IDs from the keyring's contents in binary form * - * The keyring's semaphore is read-locked by the caller. + * The keyring's semaphore is read-locked by the caller. This prevents someone + * from modifying it under us - which could cause us to read key IDs multiple + * times. */ static long keyring_read(const struct key *keyring, char __user *buffer, size_t buflen) { - struct keyring_list *klist; - struct key *key; - size_t qty, tmp; - int loop, ret; + struct keyring_read_iterator_context ctx; + unsigned long nr_keys; + int ret; - ret = 0; - klist = rcu_dereference_locked_keyring(keyring); - if (klist) { - /* calculate how much data we could return */ - qty = klist->nkeys * sizeof(key_serial_t); - - if (buffer && buflen > 0) { - if (buflen > qty) - buflen = qty; - - /* copy the IDs of the subscribed keys into the - * buffer */ - ret = -EFAULT; - - for (loop = 0; loop < klist->nkeys; loop++) { - key = rcu_deref_link_locked(klist, loop, - keyring); - - tmp = sizeof(key_serial_t); - if (tmp > buflen) - tmp = buflen; - - if (copy_to_user(buffer, - &key->serial, - tmp) != 0) - goto error; - - buflen -= tmp; - if (buflen == 0) - break; - buffer += tmp; - } - } + kenter("{%d},,%zu", key_serial(keyring), buflen); + + if (buflen & (sizeof(key_serial_t) - 1)) + return -EINVAL; + + nr_keys = keyring->keys.nr_leaves_on_tree; + if (nr_keys == 0) + return 0; - ret = qty; + /* Calculate how much data we could return */ + ctx.qty = nr_keys * sizeof(key_serial_t); + + if (!buffer || !buflen) + return ctx.qty; + + if (buflen > ctx.qty) + ctx.qty = buflen; + + /* Copy the IDs of the subscribed keys into the buffer */ + ctx.buffer = (key_serial_t __user *)buffer; + ctx.count = 0; + ret = assoc_array_iterate(&keyring->keys, keyring_read_iterator, &ctx); + if (ret < 0) { + kleave(" = %d [iterate]", ret); + return ret; } -error: - return ret; + kleave(" = %zu [ok]", ctx.count); + return ctx.count; } /* @@ -277,219 +500,360 @@ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, } EXPORT_SYMBOL(keyring_alloc); -/** - * keyring_search_aux - Search a keyring tree for a key matching some criteria - * @keyring_ref: A pointer to the keyring with possession indicator. - * @ctx: The keyring search context. - * - * Search the supplied keyring tree for a key that matches the criteria given. - * The root keyring and any linked keyrings must grant Search permission to the - * caller to be searchable and keys can only be found if they too grant Search - * to the caller. The possession flag on the root keyring pointer controls use - * of the possessor bits in permissions checking of the entire tree. In - * addition, the LSM gets to forbid keyring searches and key matches. - * - * The search is performed as a breadth-then-depth search up to the prescribed - * limit (KEYRING_SEARCH_MAX_DEPTH). - * - * Keys are matched to the type provided and are then filtered by the match - * function, which is given the description to use in any way it sees fit. The - * match function may use any attributes of a key that it wishes to to - * determine the match. Normally the match function from the key type would be - * used. - * - * RCU is used to prevent the keyring key lists from disappearing without the - * need to take lots of locks. - * - * Returns a pointer to the found key and increments the key usage count if - * successful; -EAGAIN if no matching keys were found, or if expired or revoked - * keys were found; -ENOKEY if only negative keys were found; -ENOTDIR if the - * specified keyring wasn't a keyring. - * - * In the case of a successful return, the possession attribute from - * @keyring_ref is propagated to the returned key reference. +/* + * Iteration function to consider each key found. */ -key_ref_t keyring_search_aux(key_ref_t keyring_ref, - struct keyring_search_context *ctx) +static int keyring_search_iterator(const void *object, void *iterator_data) { - struct { - /* Need a separate keylist pointer for RCU purposes */ - struct key *keyring; - struct keyring_list *keylist; - int kix; - } stack[KEYRING_SEARCH_MAX_DEPTH]; - - struct keyring_list *keylist; - unsigned long kflags; - struct key *keyring, *key; - key_ref_t key_ref; - long err; - int sp, nkeys, kix; + struct keyring_search_context *ctx = iterator_data; + const struct key *key = keyring_ptr_to_key(object); + unsigned long kflags = key->flags; - keyring = key_ref_to_ptr(keyring_ref); - ctx->possessed = is_key_possessed(keyring_ref); - key_check(keyring); + kenter("{%d}", key->serial); - /* top keyring must have search permission to begin the search */ - err = key_task_permission(keyring_ref, ctx->cred, KEY_SEARCH); - if (err < 0) { - key_ref = ERR_PTR(err); - goto error; + /* ignore keys not of this type */ + if (key->type != ctx->index_key.type) { + kleave(" = 0 [!type]"); + return 0; } - key_ref = ERR_PTR(-ENOTDIR); - if (keyring->type != &key_type_keyring) - goto error; + /* skip invalidated, revoked and expired keys */ + if (ctx->flags & KEYRING_SEARCH_DO_STATE_CHECK) { + if (kflags & ((1 << KEY_FLAG_INVALIDATED) | + (1 << KEY_FLAG_REVOKED))) { + ctx->result = ERR_PTR(-EKEYREVOKED); + kleave(" = %d [invrev]", ctx->skipped_ret); + goto skipped; + } - rcu_read_lock(); + if (key->expiry && ctx->now.tv_sec >= key->expiry) { + ctx->result = ERR_PTR(-EKEYEXPIRED); + kleave(" = %d [expire]", ctx->skipped_ret); + goto skipped; + } + } - ctx->now = current_kernel_time(); - err = -EAGAIN; - sp = 0; - - /* firstly we should check to see if this top-level keyring is what we - * are looking for */ - key_ref = ERR_PTR(-EAGAIN); - kflags = keyring->flags; - if (keyring->type == ctx->index_key.type && - ctx->match(keyring, ctx->match_data)) { - key = keyring; - if (ctx->flags & KEYRING_SEARCH_NO_STATE_CHECK) - goto found; + /* keys that don't match */ + if (!ctx->match(key, ctx->match_data)) { + kleave(" = 0 [!match]"); + return 0; + } - /* check it isn't negative and hasn't expired or been - * revoked */ - if (kflags & (1 << KEY_FLAG_REVOKED)) - goto error_2; - if (key->expiry && ctx->now.tv_sec >= key->expiry) - goto error_2; - key_ref = ERR_PTR(key->type_data.reject_error); - if (kflags & (1 << KEY_FLAG_NEGATIVE)) - goto error_2; - goto found; + /* key must have search permissions */ + if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM) && + key_task_permission(make_key_ref(key, ctx->possessed), + ctx->cred, KEY_SEARCH) < 0) { + ctx->result = ERR_PTR(-EACCES); + kleave(" = %d [!perm]", ctx->skipped_ret); + goto skipped; } - /* otherwise, the top keyring must not be revoked, expired, or - * negatively instantiated if we are to search it */ - key_ref = ERR_PTR(-EAGAIN); - if (kflags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED) | - (1 << KEY_FLAG_NEGATIVE)) || - (keyring->expiry && ctx->now.tv_sec >= keyring->expiry)) - goto error_2; - - /* start processing a new keyring */ -descend: - kflags = keyring->flags; - if (kflags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED))) - goto not_this_keyring; + if (ctx->flags & KEYRING_SEARCH_DO_STATE_CHECK) { + /* we set a different error code if we pass a negative key */ + if (kflags & (1 << KEY_FLAG_NEGATIVE)) { + ctx->result = ERR_PTR(key->type_data.reject_error); + kleave(" = %d [neg]", ctx->skipped_ret); + goto skipped; + } + } - keylist = rcu_dereference(keyring->payload.subscriptions); - if (!keylist) - goto not_this_keyring; + /* Found */ + ctx->result = make_key_ref(key, ctx->possessed); + kleave(" = 1 [found]"); + return 1; - /* iterate through the keys in this keyring first */ - nkeys = keylist->nkeys; - smp_rmb(); - for (kix = 0; kix < nkeys; kix++) { - key = rcu_dereference(keylist->keys[kix]); - kflags = key->flags; +skipped: + return ctx->skipped_ret; +} - /* ignore keys not of this type */ - if (key->type != ctx->index_key.type) - continue; +/* + * Search inside a keyring for a key. We can search by walking to it + * directly based on its index-key or we can iterate over the entire + * tree looking for it, based on the match function. + */ +static int search_keyring(struct key *keyring, struct keyring_search_context *ctx) +{ + if ((ctx->flags & KEYRING_SEARCH_LOOKUP_TYPE) == + KEYRING_SEARCH_LOOKUP_DIRECT) { + const void *object; + + object = assoc_array_find(&keyring->keys, + &keyring_assoc_array_ops, + &ctx->index_key); + return object ? ctx->iterator(object, ctx) : 0; + } + return assoc_array_iterate(&keyring->keys, ctx->iterator, ctx); +} - /* skip invalidated, revoked and expired keys */ - if (!(ctx->flags & KEYRING_SEARCH_NO_STATE_CHECK)) { - if (kflags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED))) - continue; +/* + * Search a tree of keyrings that point to other keyrings up to the maximum + * depth. + */ +static bool search_nested_keyrings(struct key *keyring, + struct keyring_search_context *ctx) +{ + struct { + struct key *keyring; + struct assoc_array_node *node; + int slot; + } stack[KEYRING_SEARCH_MAX_DEPTH]; - if (key->expiry && ctx->now.tv_sec >= key->expiry) - continue; - } + struct assoc_array_shortcut *shortcut; + struct assoc_array_node *node; + struct assoc_array_ptr *ptr; + struct key *key; + int sp = 0, slot; - /* keys that don't match */ - if (!ctx->match(key, ctx->match_data)) - continue; + kenter("{%d},{%s,%s}", + keyring->serial, + ctx->index_key.type->name, + ctx->index_key.description); - /* key must have search permissions */ - if (key_task_permission(make_key_ref(key, ctx->possessed), - ctx->cred, KEY_SEARCH) < 0) - continue; + if (ctx->index_key.description) + ctx->index_key.desc_len = strlen(ctx->index_key.description); - if (ctx->flags & KEYRING_SEARCH_NO_STATE_CHECK) + /* Check to see if this top-level keyring is what we are looking for + * and whether it is valid or not. + */ + if (ctx->flags & KEYRING_SEARCH_LOOKUP_ITERATE || + keyring_compare_object(keyring, &ctx->index_key)) { + ctx->skipped_ret = 2; + ctx->flags |= KEYRING_SEARCH_DO_STATE_CHECK; + switch (ctx->iterator(keyring_key_to_ptr(keyring), ctx)) { + case 1: goto found; - - /* we set a different error code if we pass a negative key */ - if (kflags & (1 << KEY_FLAG_NEGATIVE)) { - err = key->type_data.reject_error; - continue; + case 2: + return false; + default: + break; } + } + ctx->skipped_ret = 0; + if (ctx->flags & KEYRING_SEARCH_NO_STATE_CHECK) + ctx->flags &= ~KEYRING_SEARCH_DO_STATE_CHECK; + + /* Start processing a new keyring */ +descend_to_keyring: + kdebug("descend to %d", keyring->serial); + if (keyring->flags & ((1 << KEY_FLAG_INVALIDATED) | + (1 << KEY_FLAG_REVOKED))) + goto not_this_keyring; + + /* Search through the keys in this keyring before its searching its + * subtrees. + */ + if (search_keyring(keyring, ctx)) goto found; - } - /* search through the keyrings nested in this one */ - kix = 0; -ascend: - nkeys = keylist->nkeys; - smp_rmb(); - for (; kix < nkeys; kix++) { - key = rcu_dereference(keylist->keys[kix]); - if (key->type != &key_type_keyring) - continue; + /* Then manually iterate through the keyrings nested in this one. + * + * Start from the root node of the index tree. Because of the way the + * hash function has been set up, keyrings cluster on the leftmost + * branch of the root node (root slot 0) or in the root node itself. + * Non-keyrings avoid the leftmost branch of the root entirely (root + * slots 1-15). + */ + ptr = ACCESS_ONCE(keyring->keys.root); + if (!ptr) + goto not_this_keyring; - /* recursively search nested keyrings - * - only search keyrings for which we have search permission + if (assoc_array_ptr_is_shortcut(ptr)) { + /* If the root is a shortcut, either the keyring only contains + * keyring pointers (everything clusters behind root slot 0) or + * doesn't contain any keyring pointers. */ - if (sp >= KEYRING_SEARCH_MAX_DEPTH) + shortcut = assoc_array_ptr_to_shortcut(ptr); + smp_read_barrier_depends(); + if ((shortcut->index_key[0] & ASSOC_ARRAY_FAN_MASK) != 0) + goto not_this_keyring; + + ptr = ACCESS_ONCE(shortcut->next_node); + node = assoc_array_ptr_to_node(ptr); + goto begin_node; + } + + node = assoc_array_ptr_to_node(ptr); + smp_read_barrier_depends(); + + ptr = node->slots[0]; + if (!assoc_array_ptr_is_meta(ptr)) + goto begin_node; + +descend_to_node: + /* Descend to a more distal node in this keyring's content tree and go + * through that. + */ + kdebug("descend"); + if (assoc_array_ptr_is_shortcut(ptr)) { + shortcut = assoc_array_ptr_to_shortcut(ptr); + smp_read_barrier_depends(); + ptr = ACCESS_ONCE(shortcut->next_node); + BUG_ON(!assoc_array_ptr_is_node(ptr)); + node = assoc_array_ptr_to_node(ptr); + } + +begin_node: + kdebug("begin_node"); + smp_read_barrier_depends(); + slot = 0; +ascend_to_node: + /* Go through the slots in a node */ + for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { + ptr = ACCESS_ONCE(node->slots[slot]); + + if (assoc_array_ptr_is_meta(ptr) && node->back_pointer) + goto descend_to_node; + + if (!keyring_ptr_is_keyring(ptr)) continue; - if (key_task_permission(make_key_ref(key, ctx->possessed), + key = keyring_ptr_to_key(ptr); + + if (sp >= KEYRING_SEARCH_MAX_DEPTH) { + if (ctx->flags & KEYRING_SEARCH_DETECT_TOO_DEEP) { + ctx->result = ERR_PTR(-ELOOP); + return false; + } + goto not_this_keyring; + } + + /* Search a nested keyring */ + if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM) && + key_task_permission(make_key_ref(key, ctx->possessed), ctx->cred, KEY_SEARCH) < 0) continue; /* stack the current position */ stack[sp].keyring = keyring; - stack[sp].keylist = keylist; - stack[sp].kix = kix; + stack[sp].node = node; + stack[sp].slot = slot; sp++; /* begin again with the new keyring */ keyring = key; - goto descend; + goto descend_to_keyring; + } + + /* We've dealt with all the slots in the current node, so now we need + * to ascend to the parent and continue processing there. + */ + ptr = ACCESS_ONCE(node->back_pointer); + slot = node->parent_slot; + + if (ptr && assoc_array_ptr_is_shortcut(ptr)) { + shortcut = assoc_array_ptr_to_shortcut(ptr); + smp_read_barrier_depends(); + ptr = ACCESS_ONCE(shortcut->back_pointer); + slot = shortcut->parent_slot; + } + if (!ptr) + goto not_this_keyring; + node = assoc_array_ptr_to_node(ptr); + smp_read_barrier_depends(); + slot++; + + /* If we've ascended to the root (zero backpointer), we must have just + * finished processing the leftmost branch rather than the root slots - + * so there can't be any more keyrings for us to find. + */ + if (node->back_pointer) { + kdebug("ascend %d", slot); + goto ascend_to_node; } - /* the keyring we're looking at was disqualified or didn't contain a - * matching key */ + /* The keyring we're looking at was disqualified or didn't contain a + * matching key. + */ not_this_keyring: - if (sp > 0) { - /* resume the processing of a keyring higher up in the tree */ - sp--; - keyring = stack[sp].keyring; - keylist = stack[sp].keylist; - kix = stack[sp].kix + 1; - goto ascend; + kdebug("not_this_keyring %d", sp); + if (sp <= 0) { + kleave(" = false"); + return false; } - key_ref = ERR_PTR(err); - goto error_2; + /* Resume the processing of a keyring higher up in the tree */ + sp--; + keyring = stack[sp].keyring; + node = stack[sp].node; + slot = stack[sp].slot + 1; + kdebug("ascend to %d [%d]", keyring->serial, slot); + goto ascend_to_node; - /* we found a viable match */ + /* We found a viable match */ found: - __key_get(key); - key->last_used_at = ctx->now.tv_sec; - keyring->last_used_at = ctx->now.tv_sec; - while (sp > 0) - stack[--sp].keyring->last_used_at = ctx->now.tv_sec; + key = key_ref_to_ptr(ctx->result); key_check(key); - key_ref = make_key_ref(key, ctx->possessed); -error_2: + if (!(ctx->flags & KEYRING_SEARCH_NO_UPDATE_TIME)) { + key->last_used_at = ctx->now.tv_sec; + keyring->last_used_at = ctx->now.tv_sec; + while (sp > 0) + stack[--sp].keyring->last_used_at = ctx->now.tv_sec; + } + kleave(" = true"); + return true; +} + +/** + * keyring_search_aux - Search a keyring tree for a key matching some criteria + * @keyring_ref: A pointer to the keyring with possession indicator. + * @ctx: The keyring search context. + * + * Search the supplied keyring tree for a key that matches the criteria given. + * The root keyring and any linked keyrings must grant Search permission to the + * caller to be searchable and keys can only be found if they too grant Search + * to the caller. The possession flag on the root keyring pointer controls use + * of the possessor bits in permissions checking of the entire tree. In + * addition, the LSM gets to forbid keyring searches and key matches. + * + * The search is performed as a breadth-then-depth search up to the prescribed + * limit (KEYRING_SEARCH_MAX_DEPTH). + * + * Keys are matched to the type provided and are then filtered by the match + * function, which is given the description to use in any way it sees fit. The + * match function may use any attributes of a key that it wishes to to + * determine the match. Normally the match function from the key type would be + * used. + * + * RCU can be used to prevent the keyring key lists from disappearing without + * the need to take lots of locks. + * + * Returns a pointer to the found key and increments the key usage count if + * successful; -EAGAIN if no matching keys were found, or if expired or revoked + * keys were found; -ENOKEY if only negative keys were found; -ENOTDIR if the + * specified keyring wasn't a keyring. + * + * In the case of a successful return, the possession attribute from + * @keyring_ref is propagated to the returned key reference. + */ +key_ref_t keyring_search_aux(key_ref_t keyring_ref, + struct keyring_search_context *ctx) +{ + struct key *keyring; + long err; + + ctx->iterator = keyring_search_iterator; + ctx->possessed = is_key_possessed(keyring_ref); + ctx->result = ERR_PTR(-EAGAIN); + + keyring = key_ref_to_ptr(keyring_ref); + key_check(keyring); + + if (keyring->type != &key_type_keyring) + return ERR_PTR(-ENOTDIR); + + if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM)) { + err = key_task_permission(keyring_ref, ctx->cred, KEY_SEARCH); + if (err < 0) + return ERR_PTR(err); + } + + rcu_read_lock(); + ctx->now = current_kernel_time(); + if (search_nested_keyrings(keyring, ctx)) + __key_get(key_ref_to_ptr(ctx->result)); rcu_read_unlock(); -error: - return key_ref; + return ctx->result; } /** @@ -499,7 +863,7 @@ error: * @description: The name of the keyring we want to find. * * As keyring_search_aux() above, but using the current task's credentials and - * type's default matching function. + * type's default matching function and preferred search method. */ key_ref_t keyring_search(key_ref_t keyring, struct key_type *type, @@ -523,58 +887,49 @@ key_ref_t keyring_search(key_ref_t keyring, EXPORT_SYMBOL(keyring_search); /* - * Search the given keyring only (no recursion). + * Search the given keyring for a key that might be updated. * * The caller must guarantee that the keyring is a keyring and that the - * permission is granted to search the keyring as no check is made here. - * - * RCU is used to make it unnecessary to lock the keyring key list here. + * permission is granted to modify the keyring as no check is made here. The + * caller must also hold a lock on the keyring semaphore. * * Returns a pointer to the found key with usage count incremented if - * successful and returns -ENOKEY if not found. Revoked and invalidated keys - * are skipped over. + * successful and returns NULL if not found. Revoked and invalidated keys are + * skipped over. * * If successful, the possession indicator is propagated from the keyring ref * to the returned key reference. */ -key_ref_t __keyring_search_one(key_ref_t keyring_ref, - const struct keyring_index_key *index_key) +key_ref_t find_key_to_update(key_ref_t keyring_ref, + const struct keyring_index_key *index_key) { - struct keyring_list *klist; struct key *keyring, *key; - bool possessed; - int nkeys, loop; + const void *object; keyring = key_ref_to_ptr(keyring_ref); - possessed = is_key_possessed(keyring_ref); - rcu_read_lock(); + kenter("{%d},{%s,%s}", + keyring->serial, index_key->type->name, index_key->description); - klist = rcu_dereference(keyring->payload.subscriptions); - if (klist) { - nkeys = klist->nkeys; - smp_rmb(); - for (loop = 0; loop < nkeys ; loop++) { - key = rcu_dereference(klist->keys[loop]); - if (key->type == index_key->type && - (!key->type->match || - key->type->match(key, index_key->description)) && - !(key->flags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED))) - ) - goto found; - } - } + object = assoc_array_find(&keyring->keys, &keyring_assoc_array_ops, + index_key); - rcu_read_unlock(); - return ERR_PTR(-ENOKEY); + if (object) + goto found; + + kleave(" = NULL"); + return NULL; found: + key = keyring_ptr_to_key(object); + if (key->flags & ((1 << KEY_FLAG_INVALIDATED) | + (1 << KEY_FLAG_REVOKED))) { + kleave(" = NULL [x]"); + return NULL; + } __key_get(key); - keyring->last_used_at = key->last_used_at = - current_kernel_time().tv_sec; - rcu_read_unlock(); - return make_key_ref(key, possessed); + kleave(" = {%d}", key->serial); + return make_key_ref(key, is_key_possessed(keyring_ref)); } /* @@ -637,6 +992,19 @@ out: return keyring; } +static int keyring_detect_cycle_iterator(const void *object, + void *iterator_data) +{ + struct keyring_search_context *ctx = iterator_data; + const struct key *key = keyring_ptr_to_key(object); + + kenter("{%d}", key->serial); + + BUG_ON(key != ctx->match_data); + ctx->result = ERR_PTR(-EDEADLK); + return 1; +} + /* * See if a cycle will will be created by inserting acyclic tree B in acyclic * tree A at the topmost level (ie: as a direct child of A). @@ -646,117 +1014,39 @@ out: */ static int keyring_detect_cycle(struct key *A, struct key *B) { - struct { - struct keyring_list *keylist; - int kix; - } stack[KEYRING_SEARCH_MAX_DEPTH]; - - struct keyring_list *keylist; - struct key *subtree, *key; - int sp, nkeys, kix, ret; + struct keyring_search_context ctx = { + .index_key = A->index_key, + .match_data = A, + .iterator = keyring_detect_cycle_iterator, + .flags = (KEYRING_SEARCH_LOOKUP_DIRECT | + KEYRING_SEARCH_NO_STATE_CHECK | + KEYRING_SEARCH_NO_UPDATE_TIME | + KEYRING_SEARCH_NO_CHECK_PERM | + KEYRING_SEARCH_DETECT_TOO_DEEP), + }; rcu_read_lock(); - - ret = -EDEADLK; - if (A == B) - goto cycle_detected; - - subtree = B; - sp = 0; - - /* start processing a new keyring */ -descend: - if (test_bit(KEY_FLAG_REVOKED, &subtree->flags)) - goto not_this_keyring; - - keylist = rcu_dereference(subtree->payload.subscriptions); - if (!keylist) - goto not_this_keyring; - kix = 0; - -ascend: - /* iterate through the remaining keys in this keyring */ - nkeys = keylist->nkeys; - smp_rmb(); - for (; kix < nkeys; kix++) { - key = rcu_dereference(keylist->keys[kix]); - - if (key == A) - goto cycle_detected; - - /* recursively check nested keyrings */ - if (key->type == &key_type_keyring) { - if (sp >= KEYRING_SEARCH_MAX_DEPTH) - goto too_deep; - - /* stack the current position */ - stack[sp].keylist = keylist; - stack[sp].kix = kix; - sp++; - - /* begin again with the new keyring */ - subtree = key; - goto descend; - } - } - - /* the keyring we're looking at was disqualified or didn't contain a - * matching key */ -not_this_keyring: - if (sp > 0) { - /* resume the checking of a keyring higher up in the tree */ - sp--; - keylist = stack[sp].keylist; - kix = stack[sp].kix + 1; - goto ascend; - } - - ret = 0; /* no cycles detected */ - -error: + search_nested_keyrings(B, &ctx); rcu_read_unlock(); - return ret; - -too_deep: - ret = -ELOOP; - goto error; - -cycle_detected: - ret = -EDEADLK; - goto error; -} - -/* - * Dispose of a keyring list after the RCU grace period, freeing the unlinked - * key - */ -static void keyring_unlink_rcu_disposal(struct rcu_head *rcu) -{ - struct keyring_list *klist = - container_of(rcu, struct keyring_list, rcu); - - if (klist->delkey != USHRT_MAX) - key_put(rcu_access_pointer(klist->keys[klist->delkey])); - kfree(klist); + return PTR_ERR(ctx.result) == -EAGAIN ? 0 : PTR_ERR(ctx.result); } /* * Preallocate memory so that a key can be linked into to a keyring. */ -int __key_link_begin(struct key *keyring, const struct keyring_index_key *index_key, - unsigned long *_prealloc) +int __key_link_begin(struct key *keyring, + const struct keyring_index_key *index_key, + struct assoc_array_edit **_edit) __acquires(&keyring->sem) __acquires(&keyring_serialise_link_sem) { - struct keyring_list *klist, *nklist; - unsigned long prealloc; - unsigned max; - time_t lowest_lru; - size_t size; - int loop, lru, ret; + struct assoc_array_edit *edit; + int ret; kenter("%d,%s,%s,", - key_serial(keyring), index_key->type->name, index_key->description); + keyring->serial, index_key->type->name, index_key->description); + + BUG_ON(index_key->desc_len == 0); if (keyring->type != &key_type_keyring) return -ENOTDIR; @@ -772,88 +1062,25 @@ int __key_link_begin(struct key *keyring, const struct keyring_index_key *index_ if (index_key->type == &key_type_keyring) down_write(&keyring_serialise_link_sem); - klist = rcu_dereference_locked_keyring(keyring); - - /* see if there's a matching key we can displace */ - lru = -1; - if (klist && klist->nkeys > 0) { - lowest_lru = TIME_T_MAX; - for (loop = klist->nkeys - 1; loop >= 0; loop--) { - struct key *key = rcu_deref_link_locked(klist, loop, - keyring); - if (key->type == index_key->type && - strcmp(key->description, index_key->description) == 0) { - /* Found a match - we'll replace the link with - * one to the new key. We record the slot - * position. - */ - klist->delkey = loop; - prealloc = 0; - goto done; - } - if (key->last_used_at < lowest_lru) { - lowest_lru = key->last_used_at; - lru = loop; - } - } - } - - /* If the keyring is full then do an LRU discard */ - if (klist && - klist->nkeys == klist->maxkeys && - klist->maxkeys >= MAX_KEYRING_LINKS) { - kdebug("LRU discard %d\n", lru); - klist->delkey = lru; - prealloc = 0; - goto done; - } - /* check that we aren't going to overrun the user's quota */ ret = key_payload_reserve(keyring, keyring->datalen + KEYQUOTA_LINK_BYTES); if (ret < 0) goto error_sem; - if (klist && klist->nkeys < klist->maxkeys) { - /* there's sufficient slack space to append directly */ - klist->delkey = klist->nkeys; - prealloc = KEY_LINK_FIXQUOTA; - } else { - /* grow the key list */ - max = 4; - if (klist) { - max += klist->maxkeys; - if (max > MAX_KEYRING_LINKS) - max = MAX_KEYRING_LINKS; - BUG_ON(max <= klist->maxkeys); - } - - size = sizeof(*klist) + sizeof(struct key *) * max; - - ret = -ENOMEM; - nklist = kmalloc(size, GFP_KERNEL); - if (!nklist) - goto error_quota; - - nklist->maxkeys = max; - if (klist) { - memcpy(nklist->keys, klist->keys, - sizeof(struct key *) * klist->nkeys); - nklist->delkey = klist->nkeys; - nklist->nkeys = klist->nkeys + 1; - klist->delkey = USHRT_MAX; - } else { - nklist->nkeys = 1; - nklist->delkey = 0; - } - - /* add the key into the new space */ - RCU_INIT_POINTER(nklist->keys[nklist->delkey], NULL); - prealloc = (unsigned long)nklist | KEY_LINK_FIXQUOTA; + /* Create an edit script that will insert/replace the key in the + * keyring tree. + */ + edit = assoc_array_insert(&keyring->keys, + &keyring_assoc_array_ops, + index_key, + NULL); + if (IS_ERR(edit)) { + ret = PTR_ERR(edit); + goto error_quota; } -done: - *_prealloc = prealloc; + *_edit = edit; kleave(" = 0"); return 0; @@ -893,60 +1120,12 @@ int __key_link_check_live_key(struct key *keyring, struct key *key) * holds at most one link to any given key of a particular type+description * combination. */ -void __key_link(struct key *keyring, struct key *key, - unsigned long *_prealloc) +void __key_link(struct key *key, struct assoc_array_edit **_edit) { - struct keyring_list *klist, *nklist; - struct key *discard; - - nklist = (struct keyring_list *)(*_prealloc & ~KEY_LINK_FIXQUOTA); - *_prealloc = 0; - - kenter("%d,%d,%p", keyring->serial, key->serial, nklist); - - klist = rcu_dereference_locked_keyring(keyring); - __key_get(key); - keyring->last_used_at = key->last_used_at = - current_kernel_time().tv_sec; - - /* there's a matching key we can displace or an empty slot in a newly - * allocated list we can fill */ - if (nklist) { - kdebug("reissue %hu/%hu/%hu", - nklist->delkey, nklist->nkeys, nklist->maxkeys); - - RCU_INIT_POINTER(nklist->keys[nklist->delkey], key); - - rcu_assign_pointer(keyring->payload.subscriptions, nklist); - - /* dispose of the old keyring list and, if there was one, the - * displaced key */ - if (klist) { - kdebug("dispose %hu/%hu/%hu", - klist->delkey, klist->nkeys, klist->maxkeys); - call_rcu(&klist->rcu, keyring_unlink_rcu_disposal); - } - } else if (klist->delkey < klist->nkeys) { - kdebug("replace %hu/%hu/%hu", - klist->delkey, klist->nkeys, klist->maxkeys); - - discard = rcu_dereference_protected( - klist->keys[klist->delkey], - rwsem_is_locked(&keyring->sem)); - rcu_assign_pointer(klist->keys[klist->delkey], key); - /* The garbage collector will take care of RCU - * synchronisation */ - key_put(discard); - } else { - /* there's sufficient slack space to append directly */ - kdebug("append %hu/%hu/%hu", - klist->delkey, klist->nkeys, klist->maxkeys); - - RCU_INIT_POINTER(klist->keys[klist->delkey], key); - smp_wmb(); - klist->nkeys++; - } + assoc_array_insert_set_object(*_edit, keyring_key_to_ptr(key)); + assoc_array_apply_edit(*_edit); + *_edit = NULL; } /* @@ -956,23 +1135,20 @@ void __key_link(struct key *keyring, struct key *key, */ void __key_link_end(struct key *keyring, const struct keyring_index_key *index_key, - unsigned long prealloc) + struct assoc_array_edit *edit) __releases(&keyring->sem) __releases(&keyring_serialise_link_sem) { BUG_ON(index_key->type == NULL); - BUG_ON(index_key->type->name == NULL); - kenter("%d,%s,%lx", keyring->serial, index_key->type->name, prealloc); + kenter("%d,%s,", keyring->serial, index_key->type->name); if (index_key->type == &key_type_keyring) up_write(&keyring_serialise_link_sem); - if (prealloc) { - if (prealloc & KEY_LINK_FIXQUOTA) - key_payload_reserve(keyring, - keyring->datalen - - KEYQUOTA_LINK_BYTES); - kfree((struct keyring_list *)(prealloc & ~KEY_LINK_FIXQUOTA)); + if (edit) { + key_payload_reserve(keyring, + keyring->datalen - KEYQUOTA_LINK_BYTES); + assoc_array_cancel_edit(edit); } up_write(&keyring->sem); } @@ -999,20 +1175,24 @@ void __key_link_end(struct key *keyring, */ int key_link(struct key *keyring, struct key *key) { - unsigned long prealloc; + struct assoc_array_edit *edit; int ret; + kenter("{%d,%d}", keyring->serial, atomic_read(&keyring->usage)); + key_check(keyring); key_check(key); - ret = __key_link_begin(keyring, &key->index_key, &prealloc); + ret = __key_link_begin(keyring, &key->index_key, &edit); if (ret == 0) { + kdebug("begun {%d,%d}", keyring->serial, atomic_read(&keyring->usage)); ret = __key_link_check_live_key(keyring, key); if (ret == 0) - __key_link(keyring, key, &prealloc); - __key_link_end(keyring, &key->index_key, prealloc); + __key_link(key, &edit); + __key_link_end(keyring, &key->index_key, edit); } + kleave(" = %d {%d,%d}", ret, keyring->serial, atomic_read(&keyring->usage)); return ret; } EXPORT_SYMBOL(key_link); @@ -1036,90 +1216,36 @@ EXPORT_SYMBOL(key_link); */ int key_unlink(struct key *keyring, struct key *key) { - struct keyring_list *klist, *nklist; - int loop, ret; + struct assoc_array_edit *edit; + int ret; key_check(keyring); key_check(key); - ret = -ENOTDIR; if (keyring->type != &key_type_keyring) - goto error; + return -ENOTDIR; down_write(&keyring->sem); - klist = rcu_dereference_locked_keyring(keyring); - if (klist) { - /* search the keyring for the key */ - for (loop = 0; loop < klist->nkeys; loop++) - if (rcu_access_pointer(klist->keys[loop]) == key) - goto key_is_present; + edit = assoc_array_delete(&keyring->keys, &keyring_assoc_array_ops, + &key->index_key); + if (IS_ERR(edit)) { + ret = PTR_ERR(edit); + goto error; } - - up_write(&keyring->sem); ret = -ENOENT; - goto error; - -key_is_present: - /* we need to copy the key list for RCU purposes */ - nklist = kmalloc(sizeof(*klist) + - sizeof(struct key *) * klist->maxkeys, - GFP_KERNEL); - if (!nklist) - goto nomem; - nklist->maxkeys = klist->maxkeys; - nklist->nkeys = klist->nkeys - 1; - - if (loop > 0) - memcpy(&nklist->keys[0], - &klist->keys[0], - loop * sizeof(struct key *)); - - if (loop < nklist->nkeys) - memcpy(&nklist->keys[loop], - &klist->keys[loop + 1], - (nklist->nkeys - loop) * sizeof(struct key *)); - - /* adjust the user's quota */ - key_payload_reserve(keyring, - keyring->datalen - KEYQUOTA_LINK_BYTES); - - rcu_assign_pointer(keyring->payload.subscriptions, nklist); - - up_write(&keyring->sem); - - /* schedule for later cleanup */ - klist->delkey = loop; - call_rcu(&klist->rcu, keyring_unlink_rcu_disposal); + if (edit == NULL) + goto error; + assoc_array_apply_edit(edit); ret = 0; error: - return ret; -nomem: - ret = -ENOMEM; up_write(&keyring->sem); - goto error; + return ret; } EXPORT_SYMBOL(key_unlink); -/* - * Dispose of a keyring list after the RCU grace period, releasing the keys it - * links to. - */ -static void keyring_clear_rcu_disposal(struct rcu_head *rcu) -{ - struct keyring_list *klist; - int loop; - - klist = container_of(rcu, struct keyring_list, rcu); - - for (loop = klist->nkeys - 1; loop >= 0; loop--) - key_put(rcu_access_pointer(klist->keys[loop])); - - kfree(klist); -} - /** * keyring_clear - Clear a keyring * @keyring: The keyring to clear. @@ -1130,33 +1256,25 @@ static void keyring_clear_rcu_disposal(struct rcu_head *rcu) */ int keyring_clear(struct key *keyring) { - struct keyring_list *klist; + struct assoc_array_edit *edit; int ret; - ret = -ENOTDIR; - if (keyring->type == &key_type_keyring) { - /* detach the pointer block with the locks held */ - down_write(&keyring->sem); - - klist = rcu_dereference_locked_keyring(keyring); - if (klist) { - /* adjust the quota */ - key_payload_reserve(keyring, - sizeof(struct keyring_list)); - - rcu_assign_pointer(keyring->payload.subscriptions, - NULL); - } - - up_write(&keyring->sem); + if (keyring->type != &key_type_keyring) + return -ENOTDIR; - /* free the keys after the locks have been dropped */ - if (klist) - call_rcu(&klist->rcu, keyring_clear_rcu_disposal); + down_write(&keyring->sem); + edit = assoc_array_clear(&keyring->keys, &keyring_assoc_array_ops); + if (IS_ERR(edit)) { + ret = PTR_ERR(edit); + } else { + if (edit) + assoc_array_apply_edit(edit); + key_payload_reserve(keyring, 0); ret = 0; } + up_write(&keyring->sem); return ret; } EXPORT_SYMBOL(keyring_clear); @@ -1168,17 +1286,25 @@ EXPORT_SYMBOL(keyring_clear); */ static void keyring_revoke(struct key *keyring) { - struct keyring_list *klist; + struct assoc_array_edit *edit; - klist = rcu_dereference_locked_keyring(keyring); + edit = assoc_array_clear(&keyring->keys, &keyring_assoc_array_ops); + if (!IS_ERR(edit)) { + if (edit) + assoc_array_apply_edit(edit); + key_payload_reserve(keyring, 0); + } +} - /* adjust the quota */ - key_payload_reserve(keyring, 0); +static bool gc_iterator(void *object, void *iterator_data) +{ + struct key *key = keyring_ptr_to_key(object); + time_t *limit = iterator_data; - if (klist) { - rcu_assign_pointer(keyring->payload.subscriptions, NULL); - call_rcu(&klist->rcu, keyring_clear_rcu_disposal); - } + if (key_is_dead(key, *limit)) + return false; + key_get(key); + return true; } /* @@ -1191,88 +1317,12 @@ static void keyring_revoke(struct key *keyring) */ void keyring_gc(struct key *keyring, time_t limit) { - struct keyring_list *klist, *new; - struct key *key; - int loop, keep, max; - kenter("{%x,%s}", key_serial(keyring), keyring->description); down_write(&keyring->sem); - - klist = rcu_dereference_locked_keyring(keyring); - if (!klist) - goto no_klist; - - /* work out how many subscriptions we're keeping */ - keep = 0; - for (loop = klist->nkeys - 1; loop >= 0; loop--) - if (!key_is_dead(rcu_deref_link_locked(klist, loop, keyring), - limit)) - keep++; - - if (keep == klist->nkeys) - goto just_return; - - /* allocate a new keyring payload */ - max = roundup(keep, 4); - new = kmalloc(sizeof(struct keyring_list) + max * sizeof(struct key *), - GFP_KERNEL); - if (!new) - goto nomem; - new->maxkeys = max; - new->nkeys = 0; - new->delkey = 0; - - /* install the live keys - * - must take care as expired keys may be updated back to life - */ - keep = 0; - for (loop = klist->nkeys - 1; loop >= 0; loop--) { - key = rcu_deref_link_locked(klist, loop, keyring); - if (!key_is_dead(key, limit)) { - if (keep >= max) - goto discard_new; - RCU_INIT_POINTER(new->keys[keep++], key_get(key)); - } - } - new->nkeys = keep; - - /* adjust the quota */ - key_payload_reserve(keyring, - sizeof(struct keyring_list) + - KEYQUOTA_LINK_BYTES * keep); - - if (keep == 0) { - rcu_assign_pointer(keyring->payload.subscriptions, NULL); - kfree(new); - } else { - rcu_assign_pointer(keyring->payload.subscriptions, new); - } - - up_write(&keyring->sem); - - call_rcu(&klist->rcu, keyring_clear_rcu_disposal); - kleave(" [yes]"); - return; - -discard_new: - new->nkeys = keep; - keyring_clear_rcu_disposal(&new->rcu); + assoc_array_gc(&keyring->keys, &keyring_assoc_array_ops, + gc_iterator, &limit); up_write(&keyring->sem); - kleave(" [discard]"); - return; -just_return: - up_write(&keyring->sem); - kleave(" [no dead]"); - return; - -no_klist: - up_write(&keyring->sem); - kleave(" [no_klist]"); - return; - -nomem: - up_write(&keyring->sem); - kleave(" [oom]"); + kleave(""); } diff --git a/security/keys/request_key.c b/security/keys/request_key.c index ab75df4745af..df94827103d0 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -351,7 +351,7 @@ static int construct_alloc_key(struct keyring_search_context *ctx, struct key_user *user, struct key **_key) { - unsigned long prealloc; + struct assoc_array_edit *edit; struct key *key; key_perm_t perm; key_ref_t key_ref; @@ -380,7 +380,7 @@ static int construct_alloc_key(struct keyring_search_context *ctx, set_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags); if (dest_keyring) { - ret = __key_link_begin(dest_keyring, &ctx->index_key, &prealloc); + ret = __key_link_begin(dest_keyring, &ctx->index_key, &edit); if (ret < 0) goto link_prealloc_failed; } @@ -395,11 +395,11 @@ static int construct_alloc_key(struct keyring_search_context *ctx, goto key_already_present; if (dest_keyring) - __key_link(dest_keyring, key, &prealloc); + __key_link(key, &edit); mutex_unlock(&key_construction_mutex); if (dest_keyring) - __key_link_end(dest_keyring, &ctx->index_key, prealloc); + __key_link_end(dest_keyring, &ctx->index_key, edit); mutex_unlock(&user->cons_lock); *_key = key; kleave(" = 0 [%d]", key_serial(key)); @@ -414,8 +414,8 @@ key_already_present: if (dest_keyring) { ret = __key_link_check_live_key(dest_keyring, key); if (ret == 0) - __key_link(dest_keyring, key, &prealloc); - __key_link_end(dest_keyring, &ctx->index_key, prealloc); + __key_link(key, &edit); + __key_link_end(dest_keyring, &ctx->index_key, edit); if (ret < 0) goto link_check_failed; } -- cgit v1.2.3 From 4a2b4b222743bb07fedf985b884550f2ca067ea9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 Aug 2013 14:55:24 +0200 Subject: sched: Introduce preempt_count accessor functions Replace the single preempt_count() 'function' that's an lvalue with two proper functions: preempt_count() - returns the preempt_count value as rvalue preempt_count_set() - Allows setting the preempt-count value Also provide preempt_count_ptr() as a convenience wrapper to implement all modifying operations. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-orxrbycjozopqfhb4dxdkdvb@git.kernel.org [ Fixed build failure. ] Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 25 +++++++++++++++++++------ init/main.c | 2 +- kernel/sched/core.c | 4 ++-- kernel/softirq.c | 4 ++-- kernel/timer.c | 8 ++++---- lib/locking-selftest.c | 2 +- lib/smp_processor_id.c | 3 +-- 7 files changed, 30 insertions(+), 18 deletions(-) (limited to 'lib') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index f5d4723cdb3d..eaac52a8fe6a 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -10,19 +10,32 @@ #include #include +static __always_inline int preempt_count(void) +{ + return current_thread_info()->preempt_count; +} + +static __always_inline int *preempt_count_ptr(void) +{ + return ¤t_thread_info()->preempt_count; +} + +static __always_inline void preempt_count_set(int pc) +{ + *preempt_count_ptr() = pc; +} + #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) extern void add_preempt_count(int val); extern void sub_preempt_count(int val); #else -# define add_preempt_count(val) do { preempt_count() += (val); } while (0) -# define sub_preempt_count(val) do { preempt_count() -= (val); } while (0) +# define add_preempt_count(val) do { *preempt_count_ptr() += (val); } while (0) +# define sub_preempt_count(val) do { *preempt_count_ptr() -= (val); } while (0) #endif #define inc_preempt_count() add_preempt_count(1) #define dec_preempt_count() sub_preempt_count(1) -#define preempt_count() (current_thread_info()->preempt_count) - #ifdef CONFIG_PREEMPT asmlinkage void preempt_schedule(void); @@ -81,9 +94,9 @@ do { \ /* For debugging and tracer internals only! */ #define add_preempt_count_notrace(val) \ - do { preempt_count() += (val); } while (0) + do { *preempt_count_ptr() += (val); } while (0) #define sub_preempt_count_notrace(val) \ - do { preempt_count() -= (val); } while (0) + do { *preempt_count_ptr() -= (val); } while (0) #define inc_preempt_count_notrace() add_preempt_count_notrace(1) #define dec_preempt_count_notrace() sub_preempt_count_notrace(1) diff --git a/init/main.c b/init/main.c index af310afbef28..7cc4b7889a88 100644 --- a/init/main.c +++ b/init/main.c @@ -692,7 +692,7 @@ int __init_or_module do_one_initcall(initcall_t fn) if (preempt_count() != count) { sprintf(msgbuf, "preemption imbalance "); - preempt_count() = count; + preempt_count_set(count); } if (irqs_disabled()) { strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf)); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 242da0c03aba..fe89afac4d09 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2219,7 +2219,7 @@ void __kprobes add_preempt_count(int val) if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) return; #endif - preempt_count() += val; + add_preempt_count_notrace(val); #ifdef CONFIG_DEBUG_PREEMPT /* * Spinlock count overflowing soon? @@ -2250,7 +2250,7 @@ void __kprobes sub_preempt_count(int val) if (preempt_count() == val) trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); - preempt_count() -= val; + sub_preempt_count_notrace(val); } EXPORT_SYMBOL(sub_preempt_count); diff --git a/kernel/softirq.c b/kernel/softirq.c index 53cc09ceb0b8..a90de70cf1f3 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -106,7 +106,7 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt) * We must manually increment preempt_count here and manually * call the trace_preempt_off later. */ - preempt_count() += cnt; + add_preempt_count_notrace(cnt); /* * Were softirqs turned off above: */ @@ -256,7 +256,7 @@ restart: " exited with %08x?\n", vec_nr, softirq_to_name[vec_nr], h->action, prev_count, preempt_count()); - preempt_count() = prev_count; + preempt_count_set(prev_count); } rcu_bh_qs(cpu); diff --git a/kernel/timer.c b/kernel/timer.c index 4296d13db3d1..6582b82fa966 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1092,7 +1092,7 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index) static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), unsigned long data) { - int preempt_count = preempt_count(); + int count = preempt_count(); #ifdef CONFIG_LOCKDEP /* @@ -1119,16 +1119,16 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), lock_map_release(&lockdep_map); - if (preempt_count != preempt_count()) { + if (count != preempt_count()) { WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", - fn, preempt_count, preempt_count()); + fn, count, preempt_count()); /* * Restore the preempt count. That gives us a decent * chance to survive and extract information. If the * callback kept a lock held, bad luck, but not worse * than the BUG() we had. */ - preempt_count() = preempt_count; + preempt_count_set(count); } } diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 6dc09d8f4c24..872a15a2a637 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -1002,7 +1002,7 @@ static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask) * Some tests (e.g. double-unlock) might corrupt the preemption * count, so restore it: */ - preempt_count() = saved_preempt_count; + preempt_count_set(saved_preempt_count); #ifdef CONFIG_TRACE_IRQFLAGS if (softirq_count()) current->softirqs_enabled = 0; diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index 4c0d0e51d49e..04abe53f12a1 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -9,10 +9,9 @@ notrace unsigned int debug_smp_processor_id(void) { - unsigned long preempt_count = preempt_count(); int this_cpu = raw_smp_processor_id(); - if (likely(preempt_count)) + if (likely(preempt_count())) goto out; if (irqs_disabled()) -- cgit v1.2.3 From 4ff158229770d245ce0a961a594adfc0e86d1cc5 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 17 Sep 2013 15:14:52 +0400 Subject: MPILIB: add module description and license This patch fixes lack of license, otherwise mpi.ko taints kernel. Signed-off-by: Konstantin Khlebnikov Signed-off-by: David Howells --- lib/mpi/mpiutil.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'lib') diff --git a/lib/mpi/mpiutil.c b/lib/mpi/mpiutil.c index 657979f71bef..bf076d281d40 100644 --- a/lib/mpi/mpiutil.c +++ b/lib/mpi/mpiutil.c @@ -121,3 +121,6 @@ void mpi_free(MPI a) kfree(a); } EXPORT_SYMBOL_GPL(mpi_free); + +MODULE_DESCRIPTION("Multiprecision maths library"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From e34ff4906199d2ebd248ae897ae34f52bea151c9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 11 Sep 2013 22:29:05 -0400 Subject: sysfs: remove ktype->namespace() invocations in directory code For some unrecognizable reason, namespace information is communicated to sysfs through ktype->namespace() callback when there's *nothing* which needs the use of a callback. The whole sequence of operations is completely synchronous and sysfs operations simply end up calling back into the layer which just invoked it in order to find out the namespace information, which is completely backwards, obfuscates what's going on and unnecessarily tangles two separate layers. This patch doesn't remove ktype->namespace() but shifts its handling to kobject layer. We probably want to get rid of the callback in the long term. This patch adds an explicit param to sysfs_{create|rename|move}_dir() and renames them to sysfs_{create|rename|move}_dir_ns(), respectively. ktype->namespace() invocations are moved to the calling sites of the above functions. A new helper kboject_namespace() is introduced which directly tests kobj_ns_type_operations->type which should give the same result as testing sysfs_fs_type(parent_sd) and returns @kobj's namespace tag as necessary. kobject_namespace() is extern as it will be used from another file in the following patches. This patch should be an equivalent conversion without any functional difference. Signed-off-by: Tejun Heo Cc: Eric W. Biederman Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 23 ++++++++--------------- include/linux/kobject.h | 1 + include/linux/sysfs.h | 20 ++++++++++++-------- lib/kobject.c | 28 ++++++++++++++++++++++++---- 4 files changed, 45 insertions(+), 27 deletions(-) (limited to 'lib') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 834c64cb7f88..878ac3afe1b8 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -730,14 +730,14 @@ static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj) } /** - * sysfs_create_dir - create a directory for an object. - * @kobj: object we're creating directory for. + * sysfs_create_dir_ns - create a directory for an object with a namespace tag + * @kobj: object we're creating directory for + * @ns: the namespace tag to use */ -int sysfs_create_dir(struct kobject *kobj) +int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) { enum kobj_ns_type type; struct sysfs_dirent *parent_sd, *sd; - const void *ns = NULL; int error = 0; BUG_ON(!kobj); @@ -750,8 +750,6 @@ int sysfs_create_dir(struct kobject *kobj) if (!parent_sd) return -ENOENT; - if (sysfs_ns_type(parent_sd)) - ns = kobj->ktype->namespace(kobj); type = sysfs_read_ns_type(kobj); error = create_dir(kobj, parent_sd, type, ns, kobject_name(kobj), &sd); @@ -909,26 +907,21 @@ int sysfs_rename(struct sysfs_dirent *sd, return error; } -int sysfs_rename_dir(struct kobject *kobj, const char *new_name) +int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, + const void *new_ns) { struct sysfs_dirent *parent_sd = kobj->sd->s_parent; - const void *new_ns = NULL; - - if (sysfs_ns_type(parent_sd)) - new_ns = kobj->ktype->namespace(kobj); return sysfs_rename(kobj->sd, parent_sd, new_ns, new_name); } -int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj) +int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, + const void *new_ns) { struct sysfs_dirent *sd = kobj->sd; struct sysfs_dirent *new_parent_sd; - const void *new_ns = NULL; BUG_ON(!sd->s_parent); - if (sysfs_ns_type(sd->s_parent)) - new_ns = kobj->ktype->namespace(kobj); new_parent_sd = new_parent_kobj && new_parent_kobj->sd ? new_parent_kobj->sd : &sysfs_root; diff --git a/include/linux/kobject.h b/include/linux/kobject.h index de6dcbcc6ef7..e7ba650086ce 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -107,6 +107,7 @@ extern int __must_check kobject_move(struct kobject *, struct kobject *); extern struct kobject *kobject_get(struct kobject *kobj); extern void kobject_put(struct kobject *kobj); +extern const void *kobject_namespace(struct kobject *kobj); extern char *kobject_get_path(struct kobject *kobj, gfp_t flag); struct kobj_type { diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 82f7fac78e77..7f56bad3be82 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -182,11 +182,13 @@ struct sysfs_dirent; int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), void *data, struct module *owner); -int __must_check sysfs_create_dir(struct kobject *kobj); +int __must_check sysfs_create_dir_ns(struct kobject *kobj, const void *ns); void sysfs_remove_dir(struct kobject *kobj); -int __must_check sysfs_rename_dir(struct kobject *kobj, const char *new_name); -int __must_check sysfs_move_dir(struct kobject *kobj, - struct kobject *new_parent_kobj); +int __must_check sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, + const void *new_ns); +int __must_check sysfs_move_dir_ns(struct kobject *kobj, + struct kobject *new_parent_kobj, + const void *new_ns); int __must_check sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, @@ -258,7 +260,7 @@ static inline int sysfs_schedule_callback(struct kobject *kobj, return -ENOSYS; } -static inline int sysfs_create_dir(struct kobject *kobj) +static inline int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) { return 0; } @@ -267,13 +269,15 @@ static inline void sysfs_remove_dir(struct kobject *kobj) { } -static inline int sysfs_rename_dir(struct kobject *kobj, const char *new_name) +static inline int sysfs_rename_dir_ns(struct kobject *kobj, + const char *new_name, const void *new_ns) { return 0; } -static inline int sysfs_move_dir(struct kobject *kobj, - struct kobject *new_parent_kobj) +static inline int sysfs_move_dir_ns(struct kobject *kobj, + struct kobject *new_parent_kobj, + const void *new_ns) { return 0; } diff --git a/lib/kobject.c b/lib/kobject.c index 962175134702..85fb3a161b21 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -18,6 +18,24 @@ #include #include +/** + * kobject_namespace - return @kobj's namespace tag + * @kobj: kobject in question + * + * Returns namespace tag of @kobj if its parent has namespace ops enabled + * and thus @kobj should have a namespace tag associated with it. Returns + * %NULL otherwise. + */ +const void *kobject_namespace(struct kobject *kobj) +{ + const struct kobj_ns_type_operations *ns_ops = kobj_ns_ops(kobj); + + if (!ns_ops || ns_ops->type == KOBJ_NS_TYPE_NONE) + return NULL; + + return kobj->ktype->namespace(kobj); +} + /* * populate_dir - populate directory with attributes. * @kobj: object we're working on. @@ -46,8 +64,9 @@ static int populate_dir(struct kobject *kobj) static int create_dir(struct kobject *kobj) { - int error = 0; - error = sysfs_create_dir(kobj); + int error; + + error = sysfs_create_dir_ns(kobj, kobject_namespace(kobj)); if (!error) { error = populate_dir(kobj); if (error) @@ -428,7 +447,7 @@ int kobject_rename(struct kobject *kobj, const char *new_name) goto out; } - error = sysfs_rename_dir(kobj, new_name); + error = sysfs_rename_dir_ns(kobj, new_name, kobject_namespace(kobj)); if (error) goto out; @@ -472,6 +491,7 @@ int kobject_move(struct kobject *kobj, struct kobject *new_parent) if (kobj->kset) new_parent = kobject_get(&kobj->kset->kobj); } + /* old object path */ devpath = kobject_get_path(kobj, GFP_KERNEL); if (!devpath) { @@ -486,7 +506,7 @@ int kobject_move(struct kobject *kobj, struct kobject *new_parent) sprintf(devpath_string, "DEVPATH_OLD=%s", devpath); envp[0] = devpath_string; envp[1] = NULL; - error = sysfs_move_dir(kobj, new_parent); + error = sysfs_move_dir_ns(kobj, new_parent, kobject_namespace(kobj)); if (error) goto out; old_parent = kobj->parent; -- cgit v1.2.3 From cb26a311578e67769e92a39a0a63476533cb7e12 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 11 Sep 2013 22:29:07 -0400 Subject: sysfs: drop kobj_ns_type handling The way namespace tags are implemented in sysfs is more complicated than necessary. As each tag is a pointer value and required to be non-NULL under a namespace enabled parent, there's no need to record separately what type each tag is or where namespace is enabled. If multiple namespace types are needed, which currently aren't, we can simply compare the tag to a set of allowed tags in the superblock assuming that the tags, being pointers, won't have the same value across multiple types. Also, whether to filter by namespace tag or not can be trivially determined by whether the node has any tagged children or not. This patch rips out kobj_ns_type handling from sysfs. sysfs no longer cares whether specific type of namespace is enabled or not. If a sysfs_dirent has a non-NULL tag, the parent is marked as needing namespace filtering and the value is tested against the allowed set of tags for the superblock (currently only one but increasing this number isn't difficult) and the sysfs_dirent is ignored if it doesn't match. This removes most kobject namespace knowledge from sysfs proper which will enable proper separation and layering of sysfs. The namespace sanity checks in fs/sysfs/dir.c are replaced by the new sanity check in kobject_namespace(). As this is the only place ktype->namespace() is called for sysfs, this doesn't weaken the sanity check significantly. I omitted converting the sanity check in sysfs_do_create_link_sd(). While the check can be shifted to upper layer, mistakes there are well contained and should be easily visible anyway. Signed-off-by: Tejun Heo Cc: Eric W. Biederman Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 90 +++++++++++++++--------------------------------------- fs/sysfs/mount.c | 24 ++++----------- fs/sysfs/symlink.c | 27 ++++------------ fs/sysfs/sysfs.h | 25 +++++---------- lib/kobject.c | 5 ++- 5 files changed, 48 insertions(+), 123 deletions(-) (limited to 'lib') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 878ac3afe1b8..1dfb4aaf9446 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -111,6 +111,11 @@ static int sysfs_link_sibling(struct sysfs_dirent *sd) /* add new node and rebalance the tree */ rb_link_node(&sd->s_rb, parent, node); rb_insert_color(&sd->s_rb, &sd->s_parent->s_dir.children); + + /* if @sd has ns tag, mark the parent to enable ns filtering */ + if (sd->s_ns) + sd->s_parent->s_flags |= SYSFS_FLAG_HAS_NS; + return 0; } @@ -130,6 +135,13 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd) sd->s_parent->s_dir.subdirs--; rb_erase(&sd->s_rb, &sd->s_parent->s_dir.children); + + /* + * Either all or none of the children have tags. Clearing HAS_NS + * when there's no child left is enough to keep the flag synced. + */ + if (RB_EMPTY_ROOT(&sd->s_parent->s_dir.children)) + sd->s_parent->s_flags &= ~SYSFS_FLAG_HAS_NS; } #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -297,7 +309,6 @@ static int sysfs_dentry_delete(const struct dentry *dentry) static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags) { struct sysfs_dirent *sd; - int type; if (flags & LOOKUP_RCU) return -ECHILD; @@ -318,13 +329,8 @@ static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags) goto out_bad; /* The sysfs dirent has been moved to a different namespace */ - type = KOBJ_NS_TYPE_NONE; - if (sd->s_parent) { - type = sysfs_ns_type(sd->s_parent); - if (type != KOBJ_NS_TYPE_NONE && - sysfs_info(dentry->d_sb)->ns[type] != sd->s_ns) - goto out_bad; - } + if (sd->s_ns && sd->s_ns != sysfs_info(dentry->d_sb)->ns) + goto out_bad; mutex_unlock(&sysfs_mutex); out_valid: @@ -445,13 +451,6 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) struct sysfs_inode_attrs *ps_iattr; int ret; - if (!!sysfs_ns_type(acxt->parent_sd) != !!sd->s_ns) { - WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", - sysfs_ns_type(acxt->parent_sd) ? "required" : "invalid", - acxt->parent_sd->s_name, sd->s_name); - return -EINVAL; - } - sd->s_hash = sysfs_name_hash(sd->s_ns, sd->s_name); sd->s_parent = sysfs_get(acxt->parent_sd); @@ -613,13 +612,6 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, struct rb_node *node = parent_sd->s_dir.children.rb_node; unsigned int hash; - if (!!sysfs_ns_type(parent_sd) != !!ns) { - WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", - sysfs_ns_type(parent_sd) ? "required" : "invalid", - parent_sd->s_name, name); - return NULL; - } - hash = sysfs_name_hash(ns, name); while (node) { struct sysfs_dirent *sd; @@ -667,8 +659,7 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, EXPORT_SYMBOL_GPL(sysfs_get_dirent); static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, - enum kobj_ns_type type, const void *ns, const char *name, - struct sysfs_dirent **p_sd) + const void *ns, const char *name, struct sysfs_dirent **p_sd) { umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; struct sysfs_addrm_cxt acxt; @@ -680,7 +671,6 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, if (!sd) return -ENOMEM; - sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT); sd->s_ns = ns; sd->s_dir.kobj = kobj; @@ -700,33 +690,7 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, int sysfs_create_subdir(struct kobject *kobj, const char *name, struct sysfs_dirent **p_sd) { - return create_dir(kobj, kobj->sd, - KOBJ_NS_TYPE_NONE, NULL, name, p_sd); -} - -/** - * sysfs_read_ns_type: return associated ns_type - * @kobj: the kobject being queried - * - * Each kobject can be tagged with exactly one namespace type - * (i.e. network or user). Return the ns_type associated with - * this object if any - */ -static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj) -{ - const struct kobj_ns_type_operations *ops; - enum kobj_ns_type type; - - ops = kobj_child_ns_ops(kobj); - if (!ops) - return KOBJ_NS_TYPE_NONE; - - type = ops->type; - BUG_ON(type <= KOBJ_NS_TYPE_NONE); - BUG_ON(type >= KOBJ_NS_TYPES); - BUG_ON(!kobj_ns_type_registered(type)); - - return type; + return create_dir(kobj, kobj->sd, NULL, name, p_sd); } /** @@ -736,7 +700,6 @@ static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj) */ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) { - enum kobj_ns_type type; struct sysfs_dirent *parent_sd, *sd; int error = 0; @@ -750,9 +713,7 @@ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) if (!parent_sd) return -ENOENT; - type = sysfs_read_ns_type(kobj); - - error = create_dir(kobj, parent_sd, type, ns, kobject_name(kobj), &sd); + error = create_dir(kobj, parent_sd, ns, kobject_name(kobj), &sd); if (!error) kobj->sd = sd; return error; @@ -766,13 +727,12 @@ static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry, struct sysfs_dirent *parent_sd = parent->d_fsdata; struct sysfs_dirent *sd; struct inode *inode; - enum kobj_ns_type type; - const void *ns; + const void *ns = NULL; mutex_lock(&sysfs_mutex); - type = sysfs_ns_type(parent_sd); - ns = sysfs_info(dir->i_sb)->ns[type]; + if (parent_sd->s_flags & SYSFS_FLAG_HAS_NS) + ns = sysfs_info(dir->i_sb)->ns; sd = sysfs_find_dirent(parent_sd, ns, dentry->d_name.name); @@ -995,15 +955,15 @@ static int sysfs_readdir(struct file *file, struct dir_context *ctx) struct dentry *dentry = file->f_path.dentry; struct sysfs_dirent *parent_sd = dentry->d_fsdata; struct sysfs_dirent *pos = file->private_data; - enum kobj_ns_type type; - const void *ns; - - type = sysfs_ns_type(parent_sd); - ns = sysfs_info(dentry->d_sb)->ns[type]; + const void *ns = NULL; if (!dir_emit_dots(file, ctx)) return 0; mutex_lock(&sysfs_mutex); + + if (parent_sd->s_flags & SYSFS_FLAG_HAS_NS) + ns = sysfs_info(dentry->d_sb)->ns; + for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos); pos; pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) { diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 834ec2cdb7a3..8c24bce2f4ae 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -36,7 +36,7 @@ static const struct super_operations sysfs_ops = { struct sysfs_dirent sysfs_root = { .s_name = "", .s_count = ATOMIC_INIT(1), - .s_flags = SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT), + .s_flags = SYSFS_DIR, .s_mode = S_IFDIR | S_IRUGO | S_IXUGO, .s_ino = 1, }; @@ -77,14 +77,8 @@ static int sysfs_test_super(struct super_block *sb, void *data) { struct sysfs_super_info *sb_info = sysfs_info(sb); struct sysfs_super_info *info = data; - enum kobj_ns_type type; - int found = 1; - for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) { - if (sb_info->ns[type] != info->ns[type]) - found = 0; - } - return found; + return sb_info->ns == info->ns; } static int sysfs_set_super(struct super_block *sb, void *data) @@ -98,9 +92,7 @@ static int sysfs_set_super(struct super_block *sb, void *data) static void free_sysfs_super_info(struct sysfs_super_info *info) { - int type; - for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) - kobj_ns_drop(type, info->ns[type]); + kobj_ns_drop(KOBJ_NS_TYPE_NET, info->ns); kfree(info); } @@ -108,7 +100,6 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { struct sysfs_super_info *info; - enum kobj_ns_type type; struct super_block *sb; int error; @@ -116,18 +107,15 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type)) return ERR_PTR(-EPERM); - for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) { - if (!kobj_ns_current_may_mount(type)) - return ERR_PTR(-EPERM); - } + if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET)) + return ERR_PTR(-EPERM); } info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return ERR_PTR(-ENOMEM); - for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) - info->ns[type] = kobj_ns_grab_current(type); + info->ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); sb = sget(fs_type, sysfs_test_super, sysfs_set_super, flags, info); if (IS_ERR(sb) || sb->s_fs_info != info) diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 12d58ada3e6d..7d981ce2e87f 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -28,7 +28,6 @@ static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd, struct sysfs_dirent *target_sd = NULL; struct sysfs_dirent *sd = NULL; struct sysfs_addrm_cxt acxt; - enum kobj_ns_type ns_type; int error; BUG_ON(!name || !parent_sd); @@ -50,29 +49,15 @@ static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd, if (!sd) goto out_put; - ns_type = sysfs_ns_type(parent_sd); - if (ns_type) - sd->s_ns = target_sd->s_ns; + sd->s_ns = target_sd->s_ns; sd->s_symlink.target_sd = target_sd; target_sd = NULL; /* reference is now owned by the symlink */ sysfs_addrm_start(&acxt, parent_sd); - /* Symlinks must be between directories with the same ns_type */ - if (!ns_type || - (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) { - if (warn) - error = sysfs_add_one(&acxt, sd); - else - error = __sysfs_add_one(&acxt, sd); - } else { - error = -EINVAL; - WARN(1, KERN_WARNING - "sysfs: symlink across ns_types %s/%s -> %s/%s\n", - parent_sd->s_name, - sd->s_name, - sd->s_symlink.target_sd->s_parent->s_name, - sd->s_symlink.target_sd->s_name); - } + if (warn) + error = sysfs_add_one(&acxt, sd); + else + error = __sysfs_add_one(&acxt, sd); sysfs_addrm_finish(&acxt); if (error) @@ -156,7 +141,7 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, { const void *ns = NULL; spin_lock(&sysfs_assoc_lock); - if (targ->sd && sysfs_ns_type(kobj->sd)) + if (targ->sd) ns = targ->sd->s_ns; spin_unlock(&sysfs_assoc_lock); sysfs_hash_and_remove(kobj->sd, ns, name); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index a96da2559db2..7664d1b3d594 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -93,11 +93,8 @@ struct sysfs_dirent { #define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) #define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR) -/* identify any namespace tag on sysfs_dirents */ -#define SYSFS_NS_TYPE_MASK 0xf00 -#define SYSFS_NS_TYPE_SHIFT 8 - -#define SYSFS_FLAG_MASK ~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK) +#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK +#define SYSFS_FLAG_HAS_NS 0x01000 #define SYSFS_FLAG_REMOVED 0x02000 static inline unsigned int sysfs_type(struct sysfs_dirent *sd) @@ -105,15 +102,6 @@ static inline unsigned int sysfs_type(struct sysfs_dirent *sd) return sd->s_flags & SYSFS_TYPE_MASK; } -/* - * Return any namespace tags on this dirent. - * enum kobj_ns_type is defined in linux/kobject.h - */ -static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd) -{ - return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT; -} - #ifdef CONFIG_DEBUG_LOCK_ALLOC #define sysfs_dirent_init_lockdep(sd) \ do { \ @@ -141,12 +129,13 @@ struct sysfs_addrm_cxt { */ /* - * Each sb is associated with a set of namespace tags (i.e. - * the network namespace of the task which mounted this sysfs - * instance). + * Each sb is associated with one namespace tag, currently the network + * namespace of the task which mounted this sysfs instance. If multiple + * tags become necessary, make the following an array and compare + * sysfs_dirent tag against every entry. */ struct sysfs_super_info { - void *ns[KOBJ_NS_TYPES]; + void *ns; }; #define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info)) extern struct sysfs_dirent sysfs_root; diff --git a/lib/kobject.c b/lib/kobject.c index 85fb3a161b21..e769ee3c2fb9 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -29,11 +29,14 @@ const void *kobject_namespace(struct kobject *kobj) { const struct kobj_ns_type_operations *ns_ops = kobj_ns_ops(kobj); + const void *ns; if (!ns_ops || ns_ops->type == KOBJ_NS_TYPE_NONE) return NULL; - return kobj->ktype->namespace(kobj); + ns = kobj->ktype->namespace(kobj); + WARN_ON(!ns); /* @kobj in a namespace is required to have !NULL tag */ + return ns; } /* -- cgit v1.2.3 From eee031649707db3c9920d9498f8d03819b74fc23 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Wed, 11 Sep 2013 13:00:30 -0400 Subject: kobject: introduce kobj_completion A common way to handle kobject lifetimes in embedded in objects with different lifetime rules is to pair the kobject with a struct completion. This introduces a kobj_completion structure that can be used in place of the pairing, along with several convenience functions for initialization, release, and put-and-wait. Signed-off-by: Jeff Mahoney Signed-off-by: Greg Kroah-Hartman --- include/linux/kobj_completion.h | 18 +++++++++++++++ lib/kobject.c | 50 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 include/linux/kobj_completion.h (limited to 'lib') diff --git a/include/linux/kobj_completion.h b/include/linux/kobj_completion.h new file mode 100644 index 000000000000..a428f6436063 --- /dev/null +++ b/include/linux/kobj_completion.h @@ -0,0 +1,18 @@ +#ifndef _KOBJ_COMPLETION_H_ +#define _KOBJ_COMPLETION_H_ + +#include +#include + +struct kobj_completion { + struct kobject kc_kobj; + struct completion kc_unregister; +}; + +#define kobj_to_kobj_completion(kobj) \ + container_of(kobj, struct kobj_completion, kc_kobj) + +void kobj_completion_init(struct kobj_completion *kc, struct kobj_type *ktype); +void kobj_completion_release(struct kobject *kobj); +void kobj_completion_del_and_wait(struct kobj_completion *kc); +#endif /* _KOBJ_COMPLETION_H_ */ diff --git a/lib/kobject.c b/lib/kobject.c index e769ee3c2fb9..a5a9b13b0648 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -13,6 +13,7 @@ */ #include +#include #include #include #include @@ -749,6 +750,55 @@ const struct sysfs_ops kobj_sysfs_ops = { .store = kobj_attr_store, }; +/** + * kobj_completion_init - initialize a kobj_completion object. + * @kc: kobj_completion + * @ktype: type of kobject to initialize + * + * kobj_completion structures can be embedded within structures with different + * lifetime rules. During the release of the enclosing object, we can + * wait on the release of the kobject so that we don't free it while it's + * still busy. + */ +void kobj_completion_init(struct kobj_completion *kc, struct kobj_type *ktype) +{ + init_completion(&kc->kc_unregister); + kobject_init(&kc->kc_kobj, ktype); +} +EXPORT_SYMBOL_GPL(kobj_completion_init); + +/** + * kobj_completion_release - release a kobj_completion object + * @kobj: kobject embedded in kobj_completion + * + * Used with kobject_release to notify waiters that the kobject has been + * released. + */ +void kobj_completion_release(struct kobject *kobj) +{ + struct kobj_completion *kc = kobj_to_kobj_completion(kobj); + complete(&kc->kc_unregister); +} +EXPORT_SYMBOL_GPL(kobj_completion_release); + +/** + * kobj_completion_del_and_wait - release the kobject and wait for it + * @kc: kobj_completion object to release + * + * Delete the kobject from sysfs and drop the reference count. Then wait + * until any other outstanding references are also dropped. This routine + * is only necessary once other references may have been taken on the + * kobject. Typically this happens when the kobject has been published + * to sysfs via kobject_add. + */ +void kobj_completion_del_and_wait(struct kobj_completion *kc) +{ + kobject_del(&kc->kc_kobj); + kobject_put(&kc->kc_kobj); + wait_for_completion(&kc->kc_unregister); +} +EXPORT_SYMBOL_GPL(kobj_completion_del_and_wait); + /** * kset_register - initialize and add a kset. * @k: kset. -- cgit v1.2.3 From 2b2b614dd24e4e6474fcf2dcf69c95c908838959 Mon Sep 17 00:00:00 2001 From: Zoltan Kiss Date: Wed, 4 Sep 2013 21:11:05 +0100 Subject: tracing/events: Add bounce tracing to swiotbl Ftrace is currently not able to detect when SWIOTLB has to do double buffering. Under Xen you can only see it indirectly in function_graph, when xen_swiotlb_map_page() doesn't stop after range_straddles_page_boundary(), but calls spinlock functions, memcpy() and xen_phys_to_bus() as well. This patch introduces the swiotlb:swiotlb_bounced event, which also prints out the following informations to help you find out why bouncing happened: dev_name: 0000:08:00.0 dma_mask=ffffffffffffffff dev_addr=9149f000 size=32768 swiotlb_force=0 If you use Xen, and (dev_addr + size + 1) > dma_mask, the buffer is out of the device's DMA range. If swiotlb_force == 1, you should really change the kernel parameters. Otherwise, the buffer is not contiguous in mfn space. Signed-off-by: Zoltan Kiss [v1: Don't print 'swiotlb_force=X', just print swiotlb_force if it is enabled] Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/swiotlb-xen.c | 5 +++++ include/trace/events/swiotlb.h | 46 ++++++++++++++++++++++++++++++++++++++++++ lib/swiotlb.c | 4 ++++ 3 files changed, 55 insertions(+) create mode 100644 include/trace/events/swiotlb.h (limited to 'lib') diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 1b2277c311d2..b31081007a81 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -42,6 +42,9 @@ #include #include #include + +#define CREATE_TRACE_POINTS +#include /* * Used to do a quick range check in swiotlb_tbl_unmap_single and * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this @@ -358,6 +361,8 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, /* * Oh well, have to allocate and map a bounce buffer. */ + trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); + map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir); if (map == SWIOTLB_MAP_ERROR) return DMA_ERROR_CODE; diff --git a/include/trace/events/swiotlb.h b/include/trace/events/swiotlb.h new file mode 100644 index 000000000000..7ea4c5e7c448 --- /dev/null +++ b/include/trace/events/swiotlb.h @@ -0,0 +1,46 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM swiotlb + +#if !defined(_TRACE_SWIOTLB_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_SWIOTLB_H + +#include + +TRACE_EVENT(swiotlb_bounced, + + TP_PROTO(struct device *dev, + dma_addr_t dev_addr, + size_t size, + int swiotlb_force), + + TP_ARGS(dev, dev_addr, size, swiotlb_force), + + TP_STRUCT__entry( + __string( dev_name, dev_name(dev) ) + __field( u64, dma_mask ) + __field( dma_addr_t, dev_addr ) + __field( size_t, size ) + __field( int, swiotlb_force ) + ), + + TP_fast_assign( + __assign_str(dev_name, dev_name(dev)); + __entry->dma_mask = (dev->dma_mask ? *dev->dma_mask : 0); + __entry->dev_addr = dev_addr; + __entry->size = size; + __entry->swiotlb_force = swiotlb_force; + ), + + TP_printk("dev_name: %s dma_mask=%llx dev_addr=%llx " + "size=%zu %s", + __get_str(dev_name), + __entry->dma_mask, + (unsigned long long)__entry->dev_addr, + __entry->size, + __entry->swiotlb_force ? "swiotlb_force" : "" ) +); + +#endif /* _TRACE_SWIOTLB_H */ + +/* This part must be outside protection */ +#include diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 4e8686c7e5a4..f0d841907da6 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -38,6 +38,8 @@ #include #include +#include + #define OFFSET(val,align) ((unsigned long) \ ( (val) & ( (align) - 1))) @@ -726,6 +728,8 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, if (dma_capable(dev, dev_addr, size) && !swiotlb_force) return dev_addr; + trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); + /* Oh well, have to allocate and map a bounce buffer. */ map = map_single(dev, phys, size, dir); if (map == SWIOTLB_MAP_ERROR) { -- cgit v1.2.3 From 6b378382396feb02c79ad562a01955df0529df45 Mon Sep 17 00:00:00 2001 From: Nick Swenson Date: Wed, 2 Oct 2013 17:55:45 -0700 Subject: percpu_ida: Removing unused arguement from alloc_local_tag Removing unused struct percpu_ida *pool from arguements of alloc_local_tag, changed it's one use in percpu_ida.c (nab: Fixed reference of idr.c -> percpu_ida.c) Signed-Off-By: Nick Swenson Signed-off-by: Nicholas Bellinger --- lib/percpu_ida.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c index bab1ba2a4c71..e315903ec63a 100644 --- a/lib/percpu_ida.c +++ b/lib/percpu_ida.c @@ -126,8 +126,7 @@ static inline void alloc_global_tags(struct percpu_ida *pool, min(pool->nr_free, IDA_PCPU_BATCH_MOVE)); } -static inline unsigned alloc_local_tag(struct percpu_ida *pool, - struct percpu_ida_cpu *tags) +static inline unsigned alloc_local_tag(struct percpu_ida_cpu *tags) { int tag = -ENOSPC; @@ -168,7 +167,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, gfp_t gfp) tags = this_cpu_ptr(pool->tag_cpu); /* Fastpath */ - tag = alloc_local_tag(pool, tags); + tag = alloc_local_tag(tags); if (likely(tag >= 0)) { local_irq_restore(flags); return tag; -- cgit v1.2.3 From 26ea12dec0c84133add937455be76d44fe253d85 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 18 Sep 2013 17:15:36 -0400 Subject: kobject: grab an extra reference on kobject->sd to allow duplicate deletes sysfs currently has a rather weird behavior regarding removals. A directory removal would delete all files directly under it but wouldn't recurse into subdirectories, which, while a bit inconsistent, seems to make sense at the first glance as each directory is supposedly associated with a kobject and each kobject can take care of the directory deletion; however, this doesn't really hold as we have groups which can be directories without a kobject associated with it and require explicit deletions. We're in the process of separating out sysfs from kboject / driver core and want a consistent behavior. A removal should delete either only the specified node or everything under it. I think it is helpful to support recursive atomic removal and later patches will implement it. Such change means that a sysfs_dirent associated with kobject may be deleted before the kobject itself is removed if one of its ancestor gets removed before it. As sysfs_remove_dir() puts the base ref, we may end up with dangling pointer on descendants. This can be solved by holding an extra reference on the sd from kobject. Acquire an extra reference on the associated sysfs_dirent on directory creation and put it after removal. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 7 ++++++- lib/kobject.c | 12 ++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 671868914b5b..105a7e2d1660 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -549,7 +549,12 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) { struct sysfs_inode_attrs *ps_iattr; - BUG_ON(sd->s_flags & SYSFS_FLAG_REMOVED); + /* + * Removal can be called multiple times on the same node. Only the + * first invocation is effective and puts the base ref. + */ + if (sd->s_flags & SYSFS_FLAG_REMOVED) + return; sysfs_unlink_sibling(sd); diff --git a/lib/kobject.c b/lib/kobject.c index 151089788c21..2fdf7fa9e9bd 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -76,6 +76,13 @@ static int create_dir(struct kobject *kobj) if (error) sysfs_remove_dir(kobj); } + + /* + * @kobj->sd may be deleted by an ancestor going away. Hold an + * extra reference so that it stays until @kobj is gone. + */ + sysfs_get(kobj->sd); + return error; } @@ -532,10 +539,15 @@ out: */ void kobject_del(struct kobject *kobj) { + struct sysfs_dirent *sd; + if (!kobj) return; + sd = kobj->sd; sysfs_remove_dir(kobj); + sysfs_put(sd); + kobj->state_in_sysfs = 0; kobj_kset_leave(kobj); kobject_put(kobj->parent); -- cgit v1.2.3 From 1461c5be7becc6e65dba5cadb31fb5f4339609f5 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Wed, 9 Oct 2013 09:26:21 +0800 Subject: kobject: show debug info on delayed kobject release Useful for locating buggy drivers on kernel oops. It may add dozens of new lines to boot dmesg. DEBUG_KOBJECT_RELEASE is hopefully only enabled in debug kernels (like maybe the Fedora rawhide one, or at developers), so being a bit more verbose is likely ok. Signed-off-by: Fengguang Wu Acked-by: Russell King Signed-off-by: Greg Kroah-Hartman --- lib/kobject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/kobject.c b/lib/kobject.c index 2fdf7fa9e9bd..7a1c203083eb 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -628,7 +628,7 @@ static void kobject_release(struct kref *kref) { struct kobject *kobj = container_of(kref, struct kobject, kref); #ifdef CONFIG_DEBUG_KOBJECT_RELEASE - pr_debug("kobject: '%s' (%p): %s, parent %p (delayed)\n", + pr_info("kobject: '%s' (%p): %s, parent %p (delayed)\n", kobject_name(kobj), kobj, __func__, kobj->parent); INIT_DELAYED_WORK(&kobj->release, kobject_delayed_cleanup); schedule_delayed_work(&kobj->release, HZ); -- cgit v1.2.3 From e66cf161098a634dc96e32d0089c5767cf25668a Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 15 Oct 2013 15:18:08 +0100 Subject: GFS2: Use lockref for glocks Currently glocks have an atomic reference count and also a spinlock which covers various internal fields, such as the state. This intent of this patch is to replace the spinlock and the atomic reference count with a lockref structure. This contains a spinlock which we can continue to use as before, and a reference counter which is used in conjuction with the spinlock to replace the previous atomic counter. As a result of this there are some new rules for reference counting on glocks. We need to distinguish between reference count changes under gl_spin (which are now just increment or decrement of the new counter, provided the count cannot hit zero) and those which are outside of gl_spin, but which now take gl_spin internally. The conversion is relatively straight forward. There is probably some further clean up which can be done, but the priority at this stage is to make the change in as simple a manner as possible. A consequence of this change is that the reference count is being decoupled from the lru list processing. This should allow future adoption of the lru_list code with glocks in due course. The reason for using the "dead" state and not just relying on 0 being the "invalid state" is so that in due course 0 ref counts can be allowable. The intent is to eventually be able to remove the ref count changes which are currently hidden away in state_change(). Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 83 ++++++++++++++++++++++++------------------------- fs/gfs2/glock.h | 2 -- fs/gfs2/glops.c | 4 +-- fs/gfs2/incore.h | 5 +-- include/linux/lockref.h | 6 ++++ lib/lockref.c | 1 + 6 files changed, 52 insertions(+), 49 deletions(-) (limited to 'lib') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index c2f41b4d00b9..e66a8009aff1 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -129,10 +130,10 @@ void gfs2_glock_free(struct gfs2_glock *gl) * */ -void gfs2_glock_hold(struct gfs2_glock *gl) +static void gfs2_glock_hold(struct gfs2_glock *gl) { - GLOCK_BUG_ON(gl, atomic_read(&gl->gl_ref) == 0); - atomic_inc(&gl->gl_ref); + GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref)); + lockref_get(&gl->gl_lockref); } /** @@ -186,20 +187,6 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) spin_unlock(&lru_lock); } -/** - * gfs2_glock_put_nolock() - Decrement reference count on glock - * @gl: The glock to put - * - * This function should only be used if the caller has its own reference - * to the glock, in addition to the one it is dropping. - */ - -void gfs2_glock_put_nolock(struct gfs2_glock *gl) -{ - if (atomic_dec_and_test(&gl->gl_ref)) - GLOCK_BUG_ON(gl, 1); -} - /** * gfs2_glock_put() - Decrement reference count on glock * @gl: The glock to put @@ -211,17 +198,22 @@ void gfs2_glock_put(struct gfs2_glock *gl) struct gfs2_sbd *sdp = gl->gl_sbd; struct address_space *mapping = gfs2_glock2aspace(gl); - if (atomic_dec_and_lock(&gl->gl_ref, &lru_lock)) { - __gfs2_glock_remove_from_lru(gl); - spin_unlock(&lru_lock); - spin_lock_bucket(gl->gl_hash); - hlist_bl_del_rcu(&gl->gl_list); - spin_unlock_bucket(gl->gl_hash); - GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); - GLOCK_BUG_ON(gl, mapping && mapping->nrpages); - trace_gfs2_glock_put(gl); - sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); - } + if (lockref_put_or_lock(&gl->gl_lockref)) + return; + + lockref_mark_dead(&gl->gl_lockref); + + spin_lock(&lru_lock); + __gfs2_glock_remove_from_lru(gl); + spin_unlock(&lru_lock); + spin_unlock(&gl->gl_lockref.lock); + spin_lock_bucket(gl->gl_hash); + hlist_bl_del_rcu(&gl->gl_list); + spin_unlock_bucket(gl->gl_hash); + GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); + GLOCK_BUG_ON(gl, mapping && mapping->nrpages); + trace_gfs2_glock_put(gl); + sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); } /** @@ -244,7 +236,7 @@ static struct gfs2_glock *search_bucket(unsigned int hash, continue; if (gl->gl_sbd != sdp) continue; - if (atomic_inc_not_zero(&gl->gl_ref)) + if (lockref_get_not_dead(&gl->gl_lockref)) return gl; } @@ -396,10 +388,11 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state) held2 = (new_state != LM_ST_UNLOCKED); if (held1 != held2) { + GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref)); if (held2) - gfs2_glock_hold(gl); + gl->gl_lockref.count++; else - gfs2_glock_put_nolock(gl); + gl->gl_lockref.count--; } if (held1 && held2 && list_empty(&gl->gl_holders)) clear_bit(GLF_QUEUED, &gl->gl_flags); @@ -626,9 +619,9 @@ out: out_sched: clear_bit(GLF_LOCK, &gl->gl_flags); smp_mb__after_clear_bit(); - gfs2_glock_hold(gl); + gl->gl_lockref.count++; if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put_nolock(gl); + gl->gl_lockref.count--; return; out_unlock: @@ -754,7 +747,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, gl->gl_sbd = sdp; gl->gl_flags = 0; gl->gl_name = name; - atomic_set(&gl->gl_ref, 1); + gl->gl_lockref.count = 1; gl->gl_state = LM_ST_UNLOCKED; gl->gl_target = LM_ST_UNLOCKED; gl->gl_demote_state = LM_ST_EXCLUSIVE; @@ -1356,10 +1349,10 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) } } - spin_unlock(&gl->gl_spin); + gl->gl_lockref.count++; set_bit(GLF_REPLY_PENDING, &gl->gl_flags); - smp_wmb(); - gfs2_glock_hold(gl); + spin_unlock(&gl->gl_spin); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gfs2_glock_put(gl); } @@ -1404,15 +1397,19 @@ __acquires(&lru_lock) while(!list_empty(list)) { gl = list_entry(list->next, struct gfs2_glock, gl_lru); list_del_init(&gl->gl_lru); + if (!spin_trylock(&gl->gl_spin)) { + list_add(&gl->gl_lru, &lru_list); + atomic_inc(&lru_count); + continue; + } clear_bit(GLF_LRU, &gl->gl_flags); - gfs2_glock_hold(gl); spin_unlock(&lru_lock); - spin_lock(&gl->gl_spin); + gl->gl_lockref.count++; if (demote_ok(gl)) handle_callback(gl, LM_ST_UNLOCKED, 0, false); WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags)); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put_nolock(gl); + gl->gl_lockref.count--; spin_unlock(&gl->gl_spin); spin_lock(&lru_lock); } @@ -1493,7 +1490,7 @@ static void examine_bucket(glock_examiner examiner, const struct gfs2_sbd *sdp, rcu_read_lock(); hlist_bl_for_each_entry_rcu(gl, pos, head, gl_list) { - if ((gl->gl_sbd == sdp) && atomic_inc_not_zero(&gl->gl_ref)) + if ((gl->gl_sbd == sdp) && lockref_get_not_dead(&gl->gl_lockref)) examiner(gl); } rcu_read_unlock(); @@ -1746,7 +1743,7 @@ int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) state2str(gl->gl_demote_state), dtime, atomic_read(&gl->gl_ail_count), atomic_read(&gl->gl_revokes), - atomic_read(&gl->gl_ref), gl->gl_hold_time); + (int)gl->gl_lockref.count, gl->gl_hold_time); list_for_each_entry(gh, &gl->gl_holders, gh_list) { error = dump_holder(seq, gh); @@ -1902,7 +1899,7 @@ static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) gi->nhash = 0; } /* Skip entries for other sb and dead entries */ - } while (gi->sdp != gi->gl->gl_sbd || atomic_read(&gi->gl->gl_ref) == 0); + } while (gi->sdp != gi->gl->gl_sbd || __lockref_is_dead(&gl->gl_lockref)); return 0; } diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 69f66e3d22bf..6647d77366ba 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -181,8 +181,6 @@ static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl) extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, const struct gfs2_glock_operations *glops, int create, struct gfs2_glock **glp); -extern void gfs2_glock_hold(struct gfs2_glock *gl); -extern void gfs2_glock_put_nolock(struct gfs2_glock *gl); extern void gfs2_glock_put(struct gfs2_glock *gl); extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, struct gfs2_holder *gh); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index e2e0a90396e7..db908f697139 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -525,9 +525,9 @@ static void iopen_go_callback(struct gfs2_glock *gl, bool remote) if (gl->gl_demote_state == LM_ST_UNLOCKED && gl->gl_state == LM_ST_SHARED && ip) { - gfs2_glock_hold(gl); + gl->gl_lockref.count++; if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) - gfs2_glock_put_nolock(gl); + gl->gl_lockref.count--; } } diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 2ab4f8d8f4c4..bb88e417231f 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -21,6 +21,7 @@ #include #include #include +#include #define DIO_WAIT 0x00000010 #define DIO_METADATA 0x00000020 @@ -321,9 +322,9 @@ struct gfs2_glock { struct gfs2_sbd *gl_sbd; unsigned long gl_flags; /* GLF_... */ struct lm_lockname gl_name; - atomic_t gl_ref; - spinlock_t gl_spin; + struct lockref gl_lockref; +#define gl_spin gl_lockref.lock /* State fields protected by gl_spin */ unsigned int gl_state:2, /* Current state */ diff --git a/include/linux/lockref.h b/include/linux/lockref.h index f279ed9a9163..13dfd36a3294 100644 --- a/include/linux/lockref.h +++ b/include/linux/lockref.h @@ -36,4 +36,10 @@ extern int lockref_put_or_lock(struct lockref *); extern void lockref_mark_dead(struct lockref *); extern int lockref_get_not_dead(struct lockref *); +/* Must be called under spinlock for reliable results */ +static inline int __lockref_is_dead(const struct lockref *l) +{ + return ((int)l->count < 0); +} + #endif /* __LINUX_LOCKREF_H */ diff --git a/lib/lockref.c b/lib/lockref.c index e2cd2c0a8821..8ff162fe3413 100644 --- a/lib/lockref.c +++ b/lib/lockref.c @@ -136,6 +136,7 @@ void lockref_mark_dead(struct lockref *lockref) assert_spin_locked(&lockref->lock); lockref->count = -128; } +EXPORT_SYMBOL(lockref_mark_dead); /** * lockref_get_not_dead - Increments count unless the ref is dead -- cgit v1.2.3 From 8eaede49dfdc1ff1d727f9c913665b8009945191 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 7 Oct 2013 01:05:46 +0100 Subject: sysrq: Allow magic SysRq key functions to be disabled through Kconfig Turn the initial value of sysctl kernel.sysrq (SYSRQ_DEFAULT_ENABLE) into a Kconfig variable. Original version by Bastian Blank . Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- Documentation/sysrq.txt | 13 ++++++------- drivers/tty/sysrq.c | 2 +- include/linux/sysrq.h | 3 --- kernel/sysctl.c | 2 +- lib/Kconfig.debug | 9 +++++++++ 5 files changed, 17 insertions(+), 12 deletions(-) (limited to 'lib') diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt index 1c0471dc70fe..0e307c94809a 100644 --- a/Documentation/sysrq.txt +++ b/Documentation/sysrq.txt @@ -11,11 +11,9 @@ regardless of whatever else it is doing, unless it is completely locked up. You need to say "yes" to 'Magic SysRq key (CONFIG_MAGIC_SYSRQ)' when configuring the kernel. When running a kernel with SysRq compiled in, /proc/sys/kernel/sysrq controls the functions allowed to be invoked via -the SysRq key. By default the file contains 1 which means that every -possible SysRq request is allowed (in older versions SysRq was disabled -by default, and you were required to specifically enable it at run-time -but this is not the case any more). Here is the list of possible values -in /proc/sys/kernel/sysrq: +the SysRq key. The default value in this file is set by the +CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE config symbol, which itself defaults +to 1. Here is the list of possible values in /proc/sys/kernel/sysrq: 0 - disable sysrq completely 1 - enable all functions of sysrq >1 - bitmask of allowed sysrq functions (see below for detailed function @@ -32,8 +30,9 @@ in /proc/sys/kernel/sysrq: You can set the value in the file by the following command: echo "number" >/proc/sys/kernel/sysrq -The number may be written either as decimal or as hexadecimal with the -0x prefix. +The number may be written here either as decimal or as hexadecimal +with the 0x prefix. CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE must always be +written in hexadecimal. Note that the value of /proc/sys/kernel/sysrq influences only the invocation via a keyboard. Invocation of any operation via /proc/sysrq-trigger is always diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 40a9fe9d3b10..ce396ecdf412 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -51,7 +51,7 @@ #include /* Whether we react on sysrq keys or just ignore them */ -static int __read_mostly sysrq_enabled = SYSRQ_DEFAULT_ENABLE; +static int __read_mostly sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; static bool __read_mostly sysrq_always_enabled; unsigned short platform_sysrq_reset_seq[] __weak = { KEY_RESERVED }; diff --git a/include/linux/sysrq.h b/include/linux/sysrq.h index 7faf933cced7..387fa7d05c98 100644 --- a/include/linux/sysrq.h +++ b/include/linux/sysrq.h @@ -17,9 +17,6 @@ #include #include -/* Enable/disable SYSRQ support by default (0==no, 1==yes). */ -#define SYSRQ_DEFAULT_ENABLE 1 - /* Possible values of bitmask for enabling sysrq functions */ /* 0x0001 is reserved for enable everything */ #define SYSRQ_ENABLE_LOG 0x0002 diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b2f06f3c6a3f..8b80f1bae21a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -190,7 +190,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses it's own private copy */ -static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; +static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; static int sysrq_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *lenp, diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 06344d986eb9..29329374ff0d 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -312,6 +312,15 @@ config MAGIC_SYSRQ keys are documented in . Don't say Y unless you really know what this hack does. +config MAGIC_SYSRQ_DEFAULT_ENABLE + hex "Enable magic SysRq key functions by default" + depends on MAGIC_SYSRQ + default 0x1 + help + Specifies which SysRq key functions are enabled by default. + This may be set to 1 or 0 to enable or disable them all, or + to a bitmask as described in Documentation/sysrq.txt. + config DEBUG_KERNEL bool "Kernel debugging" help -- cgit v1.2.3 From ce5be5a16359962f78f0203f0ed7ad6d489492ab Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 23 Oct 2013 13:32:04 +0200 Subject: tracing/events: Fix swiotlb tracepoint creation Tracepoints are only created when Xen support is enabled, but they are also referenced within lib/swiotlb.c. So unless Xen support is enabled the tracepoints will be missing, therefore causing builds to fail. Fix this by moving the tracepoint creation to lib/swiotlb.c, which works nicely because the Xen swiotlb support selects the generic swiotlb support. Signed-off-by: Thierry Reding Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/swiotlb-xen.c | 1 - lib/swiotlb.c | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index b31081007a81..44af9d8577de 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -43,7 +43,6 @@ #include #include -#define CREATE_TRACE_POINTS #include /* * Used to do a quick range check in swiotlb_tbl_unmap_single and diff --git a/lib/swiotlb.c b/lib/swiotlb.c index f0d841907da6..55587060e893 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -38,6 +38,7 @@ #include #include +#define CREATE_TRACE_POINTS #include #define OFFSET(val,align) ((unsigned long) \ -- cgit v1.2.3 From 783d0281043b9a1111d81d11ed0610b83d8857ed Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Fri, 25 Oct 2013 10:33:26 +0000 Subject: swiotlb: print a warning when the swiotlb is full Signed-off-by: Stefano Stabellini Changes in v7: - use dev_warn instead of pr_warn. --- drivers/xen/swiotlb-xen.c | 1 + lib/swiotlb.c | 1 + 2 files changed, 2 insertions(+) (limited to 'lib') diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 4221cb52387d..4d50058d9630 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -542,6 +542,7 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, sg->length, dir); if (map == SWIOTLB_MAP_ERROR) { + dev_warn(hwdev, "swiotlb buffer is full\n"); /* Don't panic here, we expect map_sg users to do proper error handling. */ xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 4e8686c7e5a4..cdc051eaf667 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -502,6 +502,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, not_found: spin_unlock_irqrestore(&io_tlb_lock, flags); + dev_warn(hwdev, "swiotlb buffer is full\n"); return SWIOTLB_MAP_ERROR; found: spin_unlock_irqrestore(&io_tlb_lock, flags); -- cgit v1.2.3 From 098faf5805c80f951ce5e8b4a6842382ad793c38 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 24 Oct 2013 09:06:45 +0100 Subject: percpu_counter: make APIs irq safe In my usage, sometimes the percpu APIs are called with irq locked, sometimes not. lockdep complains there is potential deadlock. Let's always use percpucounter lock in irq safe way. There should be no performance penality, as all those are slow code path. Cc: Andrew Morton Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- lib/percpu_counter.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 93c5d5ecff4e..7473ee3b4ee7 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -60,14 +60,15 @@ static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc) void percpu_counter_set(struct percpu_counter *fbc, s64 amount) { int cpu; + unsigned long flags; - raw_spin_lock(&fbc->lock); + raw_spin_lock_irqsave(&fbc->lock, flags); for_each_possible_cpu(cpu) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); *pcount = 0; } fbc->count = amount; - raw_spin_unlock(&fbc->lock); + raw_spin_unlock_irqrestore(&fbc->lock, flags); } EXPORT_SYMBOL(percpu_counter_set); @@ -78,9 +79,10 @@ void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch) preempt_disable(); count = __this_cpu_read(*fbc->counters) + amount; if (count >= batch || count <= -batch) { - raw_spin_lock(&fbc->lock); + unsigned long flags; + raw_spin_lock_irqsave(&fbc->lock, flags); fbc->count += count; - raw_spin_unlock(&fbc->lock); + raw_spin_unlock_irqrestore(&fbc->lock, flags); __this_cpu_write(*fbc->counters, 0); } else { __this_cpu_write(*fbc->counters, count); @@ -97,14 +99,15 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc) { s64 ret; int cpu; + unsigned long flags; - raw_spin_lock(&fbc->lock); + raw_spin_lock_irqsave(&fbc->lock, flags); ret = fbc->count; for_each_online_cpu(cpu) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); ret += *pcount; } - raw_spin_unlock(&fbc->lock); + raw_spin_unlock_irqrestore(&fbc->lock, flags); return ret; } EXPORT_SYMBOL(__percpu_counter_sum); -- cgit v1.2.3 From e26b53d0b287056646a0dffce8bc6b0f053f3823 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 15 Oct 2013 09:05:01 +0800 Subject: percpu_ida: make percpu_ida percpu size/batch configurable Make percpu_ida percpu size/batch configurable. The block-mq-tag will use it. After block-mq uses percpu_ida to manage tags, performance is improved. My test is done in a 2 sockets machine, 12 process cross the 2 sockets. So if there is lock contention or ipi, should be stressed heavily. Testing is done for null-blk. hw_queue_depth nopatch iops patch iops 64 ~800k/s ~1470k/s 2048 ~4470k/s ~4340k/s Cc: Andrew Morton Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/percpu_ida.h | 18 +++++++++++++++++- lib/percpu_ida.c | 28 +++++++++++----------------- 2 files changed, 28 insertions(+), 18 deletions(-) (limited to 'lib') diff --git a/include/linux/percpu_ida.h b/include/linux/percpu_ida.h index 0b23edbee309..56c14033e7e7 100644 --- a/include/linux/percpu_ida.h +++ b/include/linux/percpu_ida.h @@ -16,6 +16,8 @@ struct percpu_ida { * percpu_ida_init() */ unsigned nr_tags; + unsigned percpu_max_size; + unsigned percpu_batch_size; struct percpu_ida_cpu __percpu *tag_cpu; @@ -51,10 +53,24 @@ struct percpu_ida { } ____cacheline_aligned_in_smp; }; +/* + * Number of tags we move between the percpu freelist and the global freelist at + * a time + */ +#define IDA_DEFAULT_PCPU_BATCH_MOVE 32U +/* Max size of percpu freelist, */ +#define IDA_DEFAULT_PCPU_SIZE ((IDA_DEFAULT_PCPU_BATCH_MOVE * 3) / 2) + int percpu_ida_alloc(struct percpu_ida *pool, gfp_t gfp); void percpu_ida_free(struct percpu_ida *pool, unsigned tag); void percpu_ida_destroy(struct percpu_ida *pool); -int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags); +int __percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags, + unsigned long max_size, unsigned long batch_size); +static inline int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags) +{ + return __percpu_ida_init(pool, nr_tags, IDA_DEFAULT_PCPU_SIZE, + IDA_DEFAULT_PCPU_BATCH_MOVE); +} #endif /* __PERCPU_IDA_H__ */ diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c index bab1ba2a4c71..a601d4259e13 100644 --- a/lib/percpu_ida.c +++ b/lib/percpu_ida.c @@ -30,15 +30,6 @@ #include #include -/* - * Number of tags we move between the percpu freelist and the global freelist at - * a time - */ -#define IDA_PCPU_BATCH_MOVE 32U - -/* Max size of percpu freelist, */ -#define IDA_PCPU_SIZE ((IDA_PCPU_BATCH_MOVE * 3) / 2) - struct percpu_ida_cpu { /* * Even though this is percpu, we need a lock for tag stealing by remote @@ -78,7 +69,7 @@ static inline void steal_tags(struct percpu_ida *pool, struct percpu_ida_cpu *remote; for (cpus_have_tags = cpumask_weight(&pool->cpus_have_tags); - cpus_have_tags * IDA_PCPU_SIZE > pool->nr_tags / 2; + cpus_have_tags * pool->percpu_max_size > pool->nr_tags / 2; cpus_have_tags--) { cpu = cpumask_next(cpu, &pool->cpus_have_tags); @@ -123,7 +114,7 @@ static inline void alloc_global_tags(struct percpu_ida *pool, { move_tags(tags->freelist, &tags->nr_free, pool->freelist, &pool->nr_free, - min(pool->nr_free, IDA_PCPU_BATCH_MOVE)); + min(pool->nr_free, pool->percpu_batch_size)); } static inline unsigned alloc_local_tag(struct percpu_ida *pool, @@ -245,17 +236,17 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag) wake_up(&pool->wait); } - if (nr_free == IDA_PCPU_SIZE) { + if (nr_free == pool->percpu_max_size) { spin_lock(&pool->lock); /* * Global lock held and irqs disabled, don't need percpu * lock */ - if (tags->nr_free == IDA_PCPU_SIZE) { + if (tags->nr_free == pool->percpu_max_size) { move_tags(pool->freelist, &pool->nr_free, tags->freelist, &tags->nr_free, - IDA_PCPU_BATCH_MOVE); + pool->percpu_batch_size); wake_up(&pool->wait); } @@ -292,7 +283,8 @@ EXPORT_SYMBOL_GPL(percpu_ida_destroy); * Allocation is percpu, but sharding is limited by nr_tags - for best * performance, the workload should not span more cpus than nr_tags / 128. */ -int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags) +int __percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags, + unsigned long max_size, unsigned long batch_size) { unsigned i, cpu, order; @@ -301,6 +293,8 @@ int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags) init_waitqueue_head(&pool->wait); spin_lock_init(&pool->lock); pool->nr_tags = nr_tags; + pool->percpu_max_size = max_size; + pool->percpu_batch_size = batch_size; /* Guard against overflow */ if (nr_tags > (unsigned) INT_MAX + 1) { @@ -319,7 +313,7 @@ int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags) pool->nr_free = nr_tags; pool->tag_cpu = __alloc_percpu(sizeof(struct percpu_ida_cpu) + - IDA_PCPU_SIZE * sizeof(unsigned), + pool->percpu_max_size * sizeof(unsigned), sizeof(unsigned)); if (!pool->tag_cpu) goto err; @@ -332,4 +326,4 @@ err: percpu_ida_destroy(pool); return -ENOMEM; } -EXPORT_SYMBOL_GPL(percpu_ida_init); +EXPORT_SYMBOL_GPL(__percpu_ida_init); -- cgit v1.2.3 From 7fc2ba17e8bf9f218cac10cc2a3de613d9d9086d Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 15 Oct 2013 09:05:02 +0800 Subject: percpu_ida: add percpu_ida_for_each_free Add a new API to iterate free ids. blk-mq-tag will use it. Note, this doesn't guarantee to iterate all free ids restrictly. Caller should be aware of this. blk-mq uses it to do sanity check for request timedout, so can tolerate the limitation. Cc: Andrew Morton Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/percpu_ida.h | 4 ++++ lib/percpu_ida.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) (limited to 'lib') diff --git a/include/linux/percpu_ida.h b/include/linux/percpu_ida.h index 56c14033e7e7..63510ae6f933 100644 --- a/include/linux/percpu_ida.h +++ b/include/linux/percpu_ida.h @@ -73,4 +73,8 @@ static inline int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags IDA_DEFAULT_PCPU_BATCH_MOVE); } +typedef int (*percpu_ida_cb)(unsigned, void *); +int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn, + void *data); + #endif /* __PERCPU_IDA_H__ */ diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c index a601d4259e13..0f51c1b556cf 100644 --- a/lib/percpu_ida.c +++ b/lib/percpu_ida.c @@ -327,3 +327,47 @@ err: return -ENOMEM; } EXPORT_SYMBOL_GPL(__percpu_ida_init); + +/** + * percpu_ida_for_each_free - iterate free ids of a pool + * @pool: pool to iterate + * @fn: interate callback function + * @data: parameter for @fn + * + * Note, this doesn't guarantee to iterate all free ids restrictly. Some free + * ids might be missed, some might be iterated duplicated, and some might + * be iterated and not free soon. + */ +int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn, + void *data) +{ + unsigned long flags; + struct percpu_ida_cpu *remote; + unsigned cpu, i, err = 0; + + local_irq_save(flags); + for_each_possible_cpu(cpu) { + remote = per_cpu_ptr(pool->tag_cpu, cpu); + spin_lock(&remote->lock); + for (i = 0; i < remote->nr_free; i++) { + err = fn(remote->freelist[i], data); + if (err) + break; + } + spin_unlock(&remote->lock); + if (err) + goto out; + } + + spin_lock(&pool->lock); + for (i = 0; i < pool->nr_free; i++) { + err = fn(pool->freelist[i], data); + if (err) + break; + } + spin_unlock(&pool->lock); +out: + local_irq_restore(flags); + return err; +} +EXPORT_SYMBOL_GPL(percpu_ida_for_each_free); -- cgit v1.2.3 From 1dddc01af0d42b21058e0cb9c1ca9e8d5204d9b0 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 15 Oct 2013 09:05:03 +0800 Subject: percpu_ida: add an API to return free tags Add an API to return free tags, blk-mq-tag will use it. Note, this just returns a snapshot of free tags number. blk-mq-tag has two usages of it. One is for info output for diagnosis. The other is to quickly check if there are free tags for request dispatch checking. Neither requires very precise. Cc: Andrew Morton Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/percpu_ida.h | 1 + lib/percpu_ida.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'lib') diff --git a/include/linux/percpu_ida.h b/include/linux/percpu_ida.h index 63510ae6f933..1900bd0fa639 100644 --- a/include/linux/percpu_ida.h +++ b/include/linux/percpu_ida.h @@ -77,4 +77,5 @@ typedef int (*percpu_ida_cb)(unsigned, void *); int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn, void *data); +unsigned percpu_ida_free_tags(struct percpu_ida *pool, int cpu); #endif /* __PERCPU_IDA_H__ */ diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c index 0f51c1b556cf..b0698ea972c6 100644 --- a/lib/percpu_ida.c +++ b/lib/percpu_ida.c @@ -371,3 +371,20 @@ out: return err; } EXPORT_SYMBOL_GPL(percpu_ida_for_each_free); + +/** + * percpu_ida_free_tags - return free tags number of a specific cpu or global pool + * @pool: pool related + * @cpu: specific cpu or global pool if @cpu == nr_cpu_ids + * + * Note: this just returns a snapshot of free tags number. + */ +unsigned percpu_ida_free_tags(struct percpu_ida *pool, int cpu) +{ + struct percpu_ida_cpu *remote; + if (cpu == nr_cpu_ids) + return pool->nr_free; + remote = per_cpu_ptr(pool->tag_cpu, cpu); + return remote->nr_free; +} +EXPORT_SYMBOL_GPL(percpu_ida_free_tags); -- cgit v1.2.3 From d921e049a02c7bd6ddf8f5e72303ca336009556d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 30 Oct 2013 11:50:48 +0100 Subject: lib: crc32: clean up spacing in test cases This is nothing more but a whitepace cleanup, as 80 chars is not a hard but soft limit, and otherwise makes the test cases array really look ugly. So fix it up. Signed-off-by: Daniel Borkmann Cc: linux-kernel@vger.kernel.org Signed-off-by: David S. Miller --- lib/crc32.c | 300 ++++++++++++++++++++---------------------------------------- 1 file changed, 100 insertions(+), 200 deletions(-) (limited to 'lib') diff --git a/lib/crc32.c b/lib/crc32.c index 410093dbe51c..429d61ce6aa0 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -795,206 +795,106 @@ static struct crc_test { u32 crc32c_le; /* expected crc32c_le result */ } test[] = { - {0x674bf11d, 0x00000038, 0x00000542, 0x0af6d466, 0xd8b6e4c1, - 0xf6e93d6c}, - {0x35c672c6, 0x0000003a, 0x000001aa, 0xc6d3dfba, 0x28aaf3ad, - 0x0fe92aca}, - {0x496da28e, 0x00000039, 0x000005af, 0xd933660f, 0x5d57e81f, - 0x52e1ebb8}, - {0x09a9b90e, 0x00000027, 0x000001f8, 0xb45fe007, 0xf45fca9a, - 0x0798af9a}, - {0xdc97e5a9, 0x00000025, 0x000003b6, 0xf81a3562, 0xe0126ba2, - 0x18eb3152}, - {0x47c58900, 0x0000000a, 0x000000b9, 0x8e58eccf, 0xf3afc793, - 0xd00d08c7}, - {0x292561e8, 0x0000000c, 0x00000403, 0xa2ba8aaf, 0x0b797aed, - 0x8ba966bc}, - {0x415037f6, 0x00000003, 0x00000676, 0xa17d52e8, 0x7f0fdf35, - 0x11d694a2}, - {0x3466e707, 0x00000026, 0x00000042, 0x258319be, 0x75c484a2, - 0x6ab3208d}, - {0xafd1281b, 0x00000023, 0x000002ee, 0x4428eaf8, 0x06c7ad10, - 0xba4603c5}, - {0xd3857b18, 0x00000028, 0x000004a2, 0x5c430821, 0xb062b7cb, - 0xe6071c6f}, - {0x1d825a8f, 0x0000002b, 0x0000050b, 0xd2c45f0c, 0xd68634e0, - 0x179ec30a}, - {0x5033e3bc, 0x0000000b, 0x00000078, 0xa3ea4113, 0xac6d31fb, - 0x0903beb8}, - {0x94f1fb5e, 0x0000000f, 0x000003a2, 0xfbfc50b1, 0x3cfe50ed, - 0x6a7cb4fa}, - {0xc9a0fe14, 0x00000009, 0x00000473, 0x5fb61894, 0x87070591, - 0xdb535801}, - {0x88a034b1, 0x0000001c, 0x000005ad, 0xc1b16053, 0x46f95c67, - 0x92bed597}, - {0xf0f72239, 0x00000020, 0x0000026d, 0xa6fa58f3, 0xf8c2c1dd, - 0x192a3f1b}, - {0xcc20a5e3, 0x0000003b, 0x0000067a, 0x7740185a, 0x308b979a, - 0xccbaec1a}, - {0xce589c95, 0x0000002b, 0x00000641, 0xd055e987, 0x40aae25b, - 0x7eabae4d}, - {0x78edc885, 0x00000035, 0x000005be, 0xa39cb14b, 0x035b0d1f, - 0x28c72982}, - {0x9d40a377, 0x0000003b, 0x00000038, 0x1f47ccd2, 0x197fbc9d, - 0xc3cd4d18}, - {0x703d0e01, 0x0000003c, 0x000006f1, 0x88735e7c, 0xfed57c5a, - 0xbca8f0e7}, - {0x776bf505, 0x0000000f, 0x000005b2, 0x5cc4fc01, 0xf32efb97, - 0x713f60b3}, - {0x4a3e7854, 0x00000027, 0x000004b8, 0x8d923c82, 0x0cbfb4a2, - 0xebd08fd5}, - {0x209172dd, 0x0000003b, 0x00000356, 0xb89e9c2b, 0xd7868138, - 0x64406c59}, - {0x3ba4cc5b, 0x0000002f, 0x00000203, 0xe51601a9, 0x5b2a1032, - 0x7421890e}, - {0xfc62f297, 0x00000000, 0x00000079, 0x71a8e1a2, 0x5d88685f, - 0xe9347603}, - {0x64280b8b, 0x00000016, 0x000007ab, 0x0fa7a30c, 0xda3a455f, - 0x1bef9060}, - {0x97dd724b, 0x00000033, 0x000007ad, 0x5788b2f4, 0xd7326d32, - 0x34720072}, - {0x61394b52, 0x00000035, 0x00000571, 0xc66525f1, 0xcabe7fef, - 0x48310f59}, - {0x29b4faff, 0x00000024, 0x0000006e, 0xca13751e, 0x993648e0, - 0x783a4213}, - {0x29bfb1dc, 0x0000000b, 0x00000244, 0x436c43f7, 0x429f7a59, - 0x9e8efd41}, - {0x86ae934b, 0x00000035, 0x00000104, 0x0760ec93, 0x9cf7d0f4, - 0xfc3d34a5}, - {0xc4c1024e, 0x0000002e, 0x000006b1, 0x6516a3ec, 0x19321f9c, - 0x17a52ae2}, - {0x3287a80a, 0x00000026, 0x00000496, 0x0b257eb1, 0x754ebd51, - 0x886d935a}, - {0xa4db423e, 0x00000023, 0x0000045d, 0x9b3a66dc, 0x873e9f11, - 0xeaaeaeb2}, - {0x7a1078df, 0x00000015, 0x0000014a, 0x8c2484c5, 0x6a628659, - 0x8e900a4b}, - {0x6048bd5b, 0x00000006, 0x0000006a, 0x897e3559, 0xac9961af, - 0xd74662b1}, - {0xd8f9ea20, 0x0000003d, 0x00000277, 0x60eb905b, 0xed2aaf99, - 0xd26752ba}, - {0xea5ec3b4, 0x0000002a, 0x000004fe, 0x869965dc, 0x6c1f833b, - 0x8b1fcd62}, - {0x2dfb005d, 0x00000016, 0x00000345, 0x6a3b117e, 0xf05e8521, - 0xf54342fe}, - {0x5a214ade, 0x00000020, 0x000005b6, 0x467f70be, 0xcb22ccd3, - 0x5b95b988}, - {0xf0ab9cca, 0x00000032, 0x00000515, 0xed223df3, 0x7f3ef01d, - 0x2e1176be}, - {0x91b444f9, 0x0000002e, 0x000007f8, 0x84e9a983, 0x5676756f, - 0x66120546}, - {0x1b5d2ddb, 0x0000002e, 0x0000012c, 0xba638c4c, 0x3f42047b, - 0xf256a5cc}, - {0xd824d1bb, 0x0000003a, 0x000007b5, 0x6288653b, 0x3a3ebea0, - 0x4af1dd69}, - {0x0470180c, 0x00000034, 0x000001f0, 0x9d5b80d6, 0x3de08195, - 0x56f0a04a}, - {0xffaa3a3f, 0x00000036, 0x00000299, 0xf3a82ab8, 0x53e0c13d, - 0x74f6b6b2}, - {0x6406cfeb, 0x00000023, 0x00000600, 0xa920b8e8, 0xe4e2acf4, - 0x085951fd}, - {0xb24aaa38, 0x0000003e, 0x000004a1, 0x657cc328, 0x5077b2c3, - 0xc65387eb}, - {0x58b2ab7c, 0x00000039, 0x000002b4, 0x3a17ee7e, 0x9dcb3643, - 0x1ca9257b}, - {0x3db85970, 0x00000006, 0x000002b6, 0x95268b59, 0xb9812c10, - 0xfd196d76}, - {0x857830c5, 0x00000003, 0x00000590, 0x4ef439d5, 0xf042161d, - 0x5ef88339}, - {0xe1fcd978, 0x0000003e, 0x000007d8, 0xae8d8699, 0xce0a1ef5, - 0x2c3714d9}, - {0xb982a768, 0x00000016, 0x000006e0, 0x62fad3df, 0x5f8a067b, - 0x58576548}, - {0x1d581ce8, 0x0000001e, 0x0000058b, 0xf0f5da53, 0x26e39eee, - 0xfd7c57de}, - {0x2456719b, 0x00000025, 0x00000503, 0x4296ac64, 0xd50e4c14, - 0xd5fedd59}, - {0xfae6d8f2, 0x00000000, 0x0000055d, 0x057fdf2e, 0x2a31391a, - 0x1cc3b17b}, - {0xcba828e3, 0x00000039, 0x000002ce, 0xe3f22351, 0x8f00877b, - 0x270eed73}, - {0x13d25952, 0x0000000a, 0x0000072d, 0x76d4b4cc, 0x5eb67ec3, - 0x91ecbb11}, - {0x0342be3f, 0x00000015, 0x00000599, 0xec75d9f1, 0x9d4d2826, - 0x05ed8d0c}, - {0xeaa344e0, 0x00000014, 0x000004d8, 0x72a4c981, 0x2064ea06, - 0x0b09ad5b}, - {0xbbb52021, 0x0000003b, 0x00000272, 0x04af99fc, 0xaf042d35, - 0xf8d511fb}, - {0xb66384dc, 0x0000001d, 0x000007fc, 0xd7629116, 0x782bd801, - 0x5ad832cc}, - {0x616c01b6, 0x00000022, 0x000002c8, 0x5b1dab30, 0x783ce7d2, - 0x1214d196}, - {0xce2bdaad, 0x00000016, 0x0000062a, 0x932535c8, 0x3f02926d, - 0x5747218a}, - {0x00fe84d7, 0x00000005, 0x00000205, 0x850e50aa, 0x753d649c, - 0xde8f14de}, - {0xbebdcb4c, 0x00000006, 0x0000055d, 0xbeaa37a2, 0x2d8c9eba, - 0x3563b7b9}, - {0xd8b1a02a, 0x00000010, 0x00000387, 0x5017d2fc, 0x503541a5, - 0x071475d0}, - {0x3b96cad2, 0x00000036, 0x00000347, 0x1d2372ae, 0x926cd90b, - 0x54c79d60}, - {0xc94c1ed7, 0x00000005, 0x0000038b, 0x9e9fdb22, 0x144a9178, - 0x4c53eee6}, - {0x1aad454e, 0x00000025, 0x000002b2, 0xc3f6315c, 0x5c7a35b3, - 0x10137a3c}, - {0xa4fec9a6, 0x00000000, 0x000006d6, 0x90be5080, 0xa4107605, - 0xaa9d6c73}, - {0x1bbe71e2, 0x0000001f, 0x000002fd, 0x4e504c3b, 0x284ccaf1, - 0xb63d23e7}, - {0x4201c7e4, 0x00000002, 0x000002b7, 0x7822e3f9, 0x0cc912a9, - 0x7f53e9cf}, - {0x23fddc96, 0x00000003, 0x00000627, 0x8a385125, 0x07767e78, - 0x13c1cd83}, - {0xd82ba25c, 0x00000016, 0x0000063e, 0x98e4148a, 0x283330c9, - 0x49ff5867}, - {0x786f2032, 0x0000002d, 0x0000060f, 0xf201600a, 0xf561bfcd, - 0x8467f211}, - {0xfebe4e1f, 0x0000002a, 0x000004f2, 0x95e51961, 0xfd80dcab, - 0x3f9683b2}, - {0x1a6e0a39, 0x00000008, 0x00000672, 0x8af6c2a5, 0x78dd84cb, - 0x76a3f874}, - {0x56000ab8, 0x0000000e, 0x000000e5, 0x36bacb8f, 0x22ee1f77, - 0x863b702f}, - {0x4717fe0c, 0x00000000, 0x000006ec, 0x8439f342, 0x5c8e03da, - 0xdc6c58ff}, - {0xd5d5d68e, 0x0000003c, 0x000003a3, 0x46fff083, 0x177d1b39, - 0x0622cc95}, - {0xc25dd6c6, 0x00000024, 0x000006c0, 0x5ceb8eb4, 0x892b0d16, - 0xe85605cd}, - {0xe9b11300, 0x00000023, 0x00000683, 0x07a5d59a, 0x6c6a3208, - 0x31da5f06}, - {0x95cd285e, 0x00000001, 0x00000047, 0x7b3a4368, 0x0202c07e, - 0xa1f2e784}, - {0xd9245a25, 0x0000001e, 0x000003a6, 0xd33c1841, 0x1936c0d5, - 0xb07cc616}, - {0x103279db, 0x00000006, 0x0000039b, 0xca09b8a0, 0x77d62892, - 0xbf943b6c}, - {0x1cba3172, 0x00000027, 0x000001c8, 0xcb377194, 0xebe682db, - 0x2c01af1c}, - {0x8f613739, 0x0000000c, 0x000001df, 0xb4b0bc87, 0x7710bd43, - 0x0fe5f56d}, - {0x1c6aa90d, 0x0000001b, 0x0000053c, 0x70559245, 0xda7894ac, - 0xf8943b2d}, - {0xaabe5b93, 0x0000003d, 0x00000715, 0xcdbf42fa, 0x0c3b99e7, - 0xe4d89272}, - {0xf15dd038, 0x00000006, 0x000006db, 0x6e104aea, 0x8d5967f2, - 0x7c2f6bbb}, - {0x584dd49c, 0x00000020, 0x000007bc, 0x36b6cfd6, 0xad4e23b2, - 0xabbf388b}, - {0x5d8c9506, 0x00000020, 0x00000470, 0x4c62378e, 0x31d92640, - 0x1dca1f4e}, - {0xb80d17b0, 0x00000032, 0x00000346, 0x22a5bb88, 0x9a7ec89f, - 0x5c170e23}, - {0xdaf0592e, 0x00000023, 0x000007b0, 0x3cab3f99, 0x9b1fdd99, - 0xc0e9d672}, - {0x4793cc85, 0x0000000d, 0x00000706, 0xe82e04f6, 0xed3db6b7, - 0xc18bdc86}, - {0x82ebf64e, 0x00000009, 0x000007c3, 0x69d590a9, 0x9efa8499, - 0xa874fcdd}, - {0xb18a0319, 0x00000026, 0x000007db, 0x1cf98dcc, 0x8fa9ad6a, - 0x9dc0bb48}, + {0x674bf11d, 0x00000038, 0x00000542, 0x0af6d466, 0xd8b6e4c1, 0xf6e93d6c}, + {0x35c672c6, 0x0000003a, 0x000001aa, 0xc6d3dfba, 0x28aaf3ad, 0x0fe92aca}, + {0x496da28e, 0x00000039, 0x000005af, 0xd933660f, 0x5d57e81f, 0x52e1ebb8}, + {0x09a9b90e, 0x00000027, 0x000001f8, 0xb45fe007, 0xf45fca9a, 0x0798af9a}, + {0xdc97e5a9, 0x00000025, 0x000003b6, 0xf81a3562, 0xe0126ba2, 0x18eb3152}, + {0x47c58900, 0x0000000a, 0x000000b9, 0x8e58eccf, 0xf3afc793, 0xd00d08c7}, + {0x292561e8, 0x0000000c, 0x00000403, 0xa2ba8aaf, 0x0b797aed, 0x8ba966bc}, + {0x415037f6, 0x00000003, 0x00000676, 0xa17d52e8, 0x7f0fdf35, 0x11d694a2}, + {0x3466e707, 0x00000026, 0x00000042, 0x258319be, 0x75c484a2, 0x6ab3208d}, + {0xafd1281b, 0x00000023, 0x000002ee, 0x4428eaf8, 0x06c7ad10, 0xba4603c5}, + {0xd3857b18, 0x00000028, 0x000004a2, 0x5c430821, 0xb062b7cb, 0xe6071c6f}, + {0x1d825a8f, 0x0000002b, 0x0000050b, 0xd2c45f0c, 0xd68634e0, 0x179ec30a}, + {0x5033e3bc, 0x0000000b, 0x00000078, 0xa3ea4113, 0xac6d31fb, 0x0903beb8}, + {0x94f1fb5e, 0x0000000f, 0x000003a2, 0xfbfc50b1, 0x3cfe50ed, 0x6a7cb4fa}, + {0xc9a0fe14, 0x00000009, 0x00000473, 0x5fb61894, 0x87070591, 0xdb535801}, + {0x88a034b1, 0x0000001c, 0x000005ad, 0xc1b16053, 0x46f95c67, 0x92bed597}, + {0xf0f72239, 0x00000020, 0x0000026d, 0xa6fa58f3, 0xf8c2c1dd, 0x192a3f1b}, + {0xcc20a5e3, 0x0000003b, 0x0000067a, 0x7740185a, 0x308b979a, 0xccbaec1a}, + {0xce589c95, 0x0000002b, 0x00000641, 0xd055e987, 0x40aae25b, 0x7eabae4d}, + {0x78edc885, 0x00000035, 0x000005be, 0xa39cb14b, 0x035b0d1f, 0x28c72982}, + {0x9d40a377, 0x0000003b, 0x00000038, 0x1f47ccd2, 0x197fbc9d, 0xc3cd4d18}, + {0x703d0e01, 0x0000003c, 0x000006f1, 0x88735e7c, 0xfed57c5a, 0xbca8f0e7}, + {0x776bf505, 0x0000000f, 0x000005b2, 0x5cc4fc01, 0xf32efb97, 0x713f60b3}, + {0x4a3e7854, 0x00000027, 0x000004b8, 0x8d923c82, 0x0cbfb4a2, 0xebd08fd5}, + {0x209172dd, 0x0000003b, 0x00000356, 0xb89e9c2b, 0xd7868138, 0x64406c59}, + {0x3ba4cc5b, 0x0000002f, 0x00000203, 0xe51601a9, 0x5b2a1032, 0x7421890e}, + {0xfc62f297, 0x00000000, 0x00000079, 0x71a8e1a2, 0x5d88685f, 0xe9347603}, + {0x64280b8b, 0x00000016, 0x000007ab, 0x0fa7a30c, 0xda3a455f, 0x1bef9060}, + {0x97dd724b, 0x00000033, 0x000007ad, 0x5788b2f4, 0xd7326d32, 0x34720072}, + {0x61394b52, 0x00000035, 0x00000571, 0xc66525f1, 0xcabe7fef, 0x48310f59}, + {0x29b4faff, 0x00000024, 0x0000006e, 0xca13751e, 0x993648e0, 0x783a4213}, + {0x29bfb1dc, 0x0000000b, 0x00000244, 0x436c43f7, 0x429f7a59, 0x9e8efd41}, + {0x86ae934b, 0x00000035, 0x00000104, 0x0760ec93, 0x9cf7d0f4, 0xfc3d34a5}, + {0xc4c1024e, 0x0000002e, 0x000006b1, 0x6516a3ec, 0x19321f9c, 0x17a52ae2}, + {0x3287a80a, 0x00000026, 0x00000496, 0x0b257eb1, 0x754ebd51, 0x886d935a}, + {0xa4db423e, 0x00000023, 0x0000045d, 0x9b3a66dc, 0x873e9f11, 0xeaaeaeb2}, + {0x7a1078df, 0x00000015, 0x0000014a, 0x8c2484c5, 0x6a628659, 0x8e900a4b}, + {0x6048bd5b, 0x00000006, 0x0000006a, 0x897e3559, 0xac9961af, 0xd74662b1}, + {0xd8f9ea20, 0x0000003d, 0x00000277, 0x60eb905b, 0xed2aaf99, 0xd26752ba}, + {0xea5ec3b4, 0x0000002a, 0x000004fe, 0x869965dc, 0x6c1f833b, 0x8b1fcd62}, + {0x2dfb005d, 0x00000016, 0x00000345, 0x6a3b117e, 0xf05e8521, 0xf54342fe}, + {0x5a214ade, 0x00000020, 0x000005b6, 0x467f70be, 0xcb22ccd3, 0x5b95b988}, + {0xf0ab9cca, 0x00000032, 0x00000515, 0xed223df3, 0x7f3ef01d, 0x2e1176be}, + {0x91b444f9, 0x0000002e, 0x000007f8, 0x84e9a983, 0x5676756f, 0x66120546}, + {0x1b5d2ddb, 0x0000002e, 0x0000012c, 0xba638c4c, 0x3f42047b, 0xf256a5cc}, + {0xd824d1bb, 0x0000003a, 0x000007b5, 0x6288653b, 0x3a3ebea0, 0x4af1dd69}, + {0x0470180c, 0x00000034, 0x000001f0, 0x9d5b80d6, 0x3de08195, 0x56f0a04a}, + {0xffaa3a3f, 0x00000036, 0x00000299, 0xf3a82ab8, 0x53e0c13d, 0x74f6b6b2}, + {0x6406cfeb, 0x00000023, 0x00000600, 0xa920b8e8, 0xe4e2acf4, 0x085951fd}, + {0xb24aaa38, 0x0000003e, 0x000004a1, 0x657cc328, 0x5077b2c3, 0xc65387eb}, + {0x58b2ab7c, 0x00000039, 0x000002b4, 0x3a17ee7e, 0x9dcb3643, 0x1ca9257b}, + {0x3db85970, 0x00000006, 0x000002b6, 0x95268b59, 0xb9812c10, 0xfd196d76}, + {0x857830c5, 0x00000003, 0x00000590, 0x4ef439d5, 0xf042161d, 0x5ef88339}, + {0xe1fcd978, 0x0000003e, 0x000007d8, 0xae8d8699, 0xce0a1ef5, 0x2c3714d9}, + {0xb982a768, 0x00000016, 0x000006e0, 0x62fad3df, 0x5f8a067b, 0x58576548}, + {0x1d581ce8, 0x0000001e, 0x0000058b, 0xf0f5da53, 0x26e39eee, 0xfd7c57de}, + {0x2456719b, 0x00000025, 0x00000503, 0x4296ac64, 0xd50e4c14, 0xd5fedd59}, + {0xfae6d8f2, 0x00000000, 0x0000055d, 0x057fdf2e, 0x2a31391a, 0x1cc3b17b}, + {0xcba828e3, 0x00000039, 0x000002ce, 0xe3f22351, 0x8f00877b, 0x270eed73}, + {0x13d25952, 0x0000000a, 0x0000072d, 0x76d4b4cc, 0x5eb67ec3, 0x91ecbb11}, + {0x0342be3f, 0x00000015, 0x00000599, 0xec75d9f1, 0x9d4d2826, 0x05ed8d0c}, + {0xeaa344e0, 0x00000014, 0x000004d8, 0x72a4c981, 0x2064ea06, 0x0b09ad5b}, + {0xbbb52021, 0x0000003b, 0x00000272, 0x04af99fc, 0xaf042d35, 0xf8d511fb}, + {0xb66384dc, 0x0000001d, 0x000007fc, 0xd7629116, 0x782bd801, 0x5ad832cc}, + {0x616c01b6, 0x00000022, 0x000002c8, 0x5b1dab30, 0x783ce7d2, 0x1214d196}, + {0xce2bdaad, 0x00000016, 0x0000062a, 0x932535c8, 0x3f02926d, 0x5747218a}, + {0x00fe84d7, 0x00000005, 0x00000205, 0x850e50aa, 0x753d649c, 0xde8f14de}, + {0xbebdcb4c, 0x00000006, 0x0000055d, 0xbeaa37a2, 0x2d8c9eba, 0x3563b7b9}, + {0xd8b1a02a, 0x00000010, 0x00000387, 0x5017d2fc, 0x503541a5, 0x071475d0}, + {0x3b96cad2, 0x00000036, 0x00000347, 0x1d2372ae, 0x926cd90b, 0x54c79d60}, + {0xc94c1ed7, 0x00000005, 0x0000038b, 0x9e9fdb22, 0x144a9178, 0x4c53eee6}, + {0x1aad454e, 0x00000025, 0x000002b2, 0xc3f6315c, 0x5c7a35b3, 0x10137a3c}, + {0xa4fec9a6, 0x00000000, 0x000006d6, 0x90be5080, 0xa4107605, 0xaa9d6c73}, + {0x1bbe71e2, 0x0000001f, 0x000002fd, 0x4e504c3b, 0x284ccaf1, 0xb63d23e7}, + {0x4201c7e4, 0x00000002, 0x000002b7, 0x7822e3f9, 0x0cc912a9, 0x7f53e9cf}, + {0x23fddc96, 0x00000003, 0x00000627, 0x8a385125, 0x07767e78, 0x13c1cd83}, + {0xd82ba25c, 0x00000016, 0x0000063e, 0x98e4148a, 0x283330c9, 0x49ff5867}, + {0x786f2032, 0x0000002d, 0x0000060f, 0xf201600a, 0xf561bfcd, 0x8467f211}, + {0xfebe4e1f, 0x0000002a, 0x000004f2, 0x95e51961, 0xfd80dcab, 0x3f9683b2}, + {0x1a6e0a39, 0x00000008, 0x00000672, 0x8af6c2a5, 0x78dd84cb, 0x76a3f874}, + {0x56000ab8, 0x0000000e, 0x000000e5, 0x36bacb8f, 0x22ee1f77, 0x863b702f}, + {0x4717fe0c, 0x00000000, 0x000006ec, 0x8439f342, 0x5c8e03da, 0xdc6c58ff}, + {0xd5d5d68e, 0x0000003c, 0x000003a3, 0x46fff083, 0x177d1b39, 0x0622cc95}, + {0xc25dd6c6, 0x00000024, 0x000006c0, 0x5ceb8eb4, 0x892b0d16, 0xe85605cd}, + {0xe9b11300, 0x00000023, 0x00000683, 0x07a5d59a, 0x6c6a3208, 0x31da5f06}, + {0x95cd285e, 0x00000001, 0x00000047, 0x7b3a4368, 0x0202c07e, 0xa1f2e784}, + {0xd9245a25, 0x0000001e, 0x000003a6, 0xd33c1841, 0x1936c0d5, 0xb07cc616}, + {0x103279db, 0x00000006, 0x0000039b, 0xca09b8a0, 0x77d62892, 0xbf943b6c}, + {0x1cba3172, 0x00000027, 0x000001c8, 0xcb377194, 0xebe682db, 0x2c01af1c}, + {0x8f613739, 0x0000000c, 0x000001df, 0xb4b0bc87, 0x7710bd43, 0x0fe5f56d}, + {0x1c6aa90d, 0x0000001b, 0x0000053c, 0x70559245, 0xda7894ac, 0xf8943b2d}, + {0xaabe5b93, 0x0000003d, 0x00000715, 0xcdbf42fa, 0x0c3b99e7, 0xe4d89272}, + {0xf15dd038, 0x00000006, 0x000006db, 0x6e104aea, 0x8d5967f2, 0x7c2f6bbb}, + {0x584dd49c, 0x00000020, 0x000007bc, 0x36b6cfd6, 0xad4e23b2, 0xabbf388b}, + {0x5d8c9506, 0x00000020, 0x00000470, 0x4c62378e, 0x31d92640, 0x1dca1f4e}, + {0xb80d17b0, 0x00000032, 0x00000346, 0x22a5bb88, 0x9a7ec89f, 0x5c170e23}, + {0xdaf0592e, 0x00000023, 0x000007b0, 0x3cab3f99, 0x9b1fdd99, 0xc0e9d672}, + {0x4793cc85, 0x0000000d, 0x00000706, 0xe82e04f6, 0xed3db6b7, 0xc18bdc86}, + {0x82ebf64e, 0x00000009, 0x000007c3, 0x69d590a9, 0x9efa8499, 0xa874fcdd}, + {0xb18a0319, 0x00000026, 0x000007db, 0x1cf98dcc, 0x8fa9ad6a, 0x9dc0bb48}, }; #include -- cgit v1.2.3 From 6e95fcaa42e5078ac265964deebed597f9eae07a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 30 Oct 2013 11:50:49 +0100 Subject: lib: crc32: add functionality to combine two crc32{, c}s in GF(2) This patch adds a combinator to merge two or more crc32{,c}s into a new one. This is useful for checksum computations of fragmented skbs that use crc32/crc32c as checksums. The arithmetics for combining both in the GF(2) was taken and slightly modified from zlib. Only passing two crcs is insufficient as two crcs and the length of the second piece is needed for merging. The code is made generic, so that only polynomials need to be passed for crc32_le resp. crc32c_le. Signed-off-by: Daniel Borkmann Cc: linux-kernel@vger.kernel.org Signed-off-by: David S. Miller --- include/linux/crc32.h | 40 +++++++++++++++++++++++++ lib/crc32.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 121 insertions(+) (limited to 'lib') diff --git a/include/linux/crc32.h b/include/linux/crc32.h index 68267b64bb98..7d275c4fc011 100644 --- a/include/linux/crc32.h +++ b/include/linux/crc32.h @@ -11,8 +11,48 @@ extern u32 crc32_le(u32 crc, unsigned char const *p, size_t len); extern u32 crc32_be(u32 crc, unsigned char const *p, size_t len); +/** + * crc32_le_combine - Combine two crc32 check values into one. For two + * sequences of bytes, seq1 and seq2 with lengths len1 + * and len2, crc32_le() check values were calculated + * for each, crc1 and crc2. + * + * @crc1: crc32 of the first block + * @crc2: crc32 of the second block + * @len2: length of the second block + * + * Return: The crc32_le() check value of seq1 and seq2 concatenated, + * requiring only crc1, crc2, and len2. Note: If seq_full denotes + * the concatenated memory area of seq1 with seq2, and crc_full + * the crc32_le() value of seq_full, then crc_full == + * crc32_le_combine(crc1, crc2, len2) when crc_full was seeded + * with the same initializer as crc1, and crc2 seed was 0. See + * also crc32_combine_test(). + */ +extern u32 crc32_le_combine(u32 crc1, u32 crc2, size_t len2); + extern u32 __crc32c_le(u32 crc, unsigned char const *p, size_t len); +/** + * __crc32c_le_combine - Combine two crc32c check values into one. For two + * sequences of bytes, seq1 and seq2 with lengths len1 + * and len2, __crc32c_le() check values were calculated + * for each, crc1 and crc2. + * + * @crc1: crc32c of the first block + * @crc2: crc32c of the second block + * @len2: length of the second block + * + * Return: The __crc32c_le() check value of seq1 and seq2 concatenated, + * requiring only crc1, crc2, and len2. Note: If seq_full denotes + * the concatenated memory area of seq1 with seq2, and crc_full + * the __crc32c_le() value of seq_full, then crc_full == + * __crc32c_le_combine(crc1, crc2, len2) when crc_full was + * seeded with the same initializer as crc1, and crc2 seed + * was 0. See also crc32c_combine_test(). + */ +extern u32 __crc32c_le_combine(u32 crc1, u32 crc2, size_t len2); + #define crc32(seed, data, length) crc32_le(seed, (unsigned char const *)(data), length) /* diff --git a/lib/crc32.c b/lib/crc32.c index 429d61ce6aa0..595205cddc30 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -49,6 +49,30 @@ MODULE_AUTHOR("Matt Domsch "); MODULE_DESCRIPTION("Various CRC32 calculations"); MODULE_LICENSE("GPL"); +#define GF2_DIM 32 + +static u32 gf2_matrix_times(u32 *mat, u32 vec) +{ + u32 sum = 0; + + while (vec) { + if (vec & 1) + sum ^= *mat; + vec >>= 1; + mat++; + } + + return sum; +} + +static void gf2_matrix_square(u32 *square, u32 *mat) +{ + int i; + + for (i = 0; i < GF2_DIM; i++) + square[i] = gf2_matrix_times(mat, mat[i]); +} + #if CRC_LE_BITS > 8 || CRC_BE_BITS > 8 /* implements slicing-by-4 or slicing-by-8 algorithm */ @@ -130,6 +154,52 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) } #endif +/* For conditions of distribution and use, see copyright notice in zlib.h */ +static u32 crc32_generic_combine(u32 crc1, u32 crc2, size_t len2, + u32 polynomial) +{ + u32 even[GF2_DIM]; /* Even-power-of-two zeros operator */ + u32 odd[GF2_DIM]; /* Odd-power-of-two zeros operator */ + u32 row; + int i; + + if (len2 <= 0) + return crc1; + + /* Put operator for one zero bit in odd */ + odd[0] = polynomial; + row = 1; + for (i = 1; i < GF2_DIM; i++) { + odd[i] = row; + row <<= 1; + } + + gf2_matrix_square(even, odd); /* Put operator for two zero bits in even */ + gf2_matrix_square(odd, even); /* Put operator for four zero bits in odd */ + + /* Apply len2 zeros to crc1 (first square will put the operator for one + * zero byte, eight zero bits, in even). + */ + do { + /* Apply zeros operator for this bit of len2 */ + gf2_matrix_square(even, odd); + if (len2 & 1) + crc1 = gf2_matrix_times(even, crc1); + len2 >>= 1; + /* If no more bits set, then done */ + if (len2 == 0) + break; + /* Another iteration of the loop with odd and even swapped */ + gf2_matrix_square(odd, even); + if (len2 & 1) + crc1 = gf2_matrix_times(odd, crc1); + len2 >>= 1; + } while (len2 != 0); + + crc1 ^= crc2; + return crc1; +} + /** * crc32_le_generic() - Calculate bitwise little-endian Ethernet AUTODIN II * CRC32/CRC32C @@ -200,8 +270,19 @@ u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len) (const u32 (*)[256])crc32ctable_le, CRC32C_POLY_LE); } #endif +u32 __pure crc32_le_combine(u32 crc1, u32 crc2, size_t len2) +{ + return crc32_generic_combine(crc1, crc2, len2, CRCPOLY_LE); +} + +u32 __pure __crc32c_le_combine(u32 crc1, u32 crc2, size_t len2) +{ + return crc32_generic_combine(crc1, crc2, len2, CRC32C_POLY_LE); +} EXPORT_SYMBOL(crc32_le); +EXPORT_SYMBOL(crc32_le_combine); EXPORT_SYMBOL(__crc32c_le); +EXPORT_SYMBOL(__crc32c_le_combine); /** * crc32_be_generic() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32 -- cgit v1.2.3 From efba721f636ee016859d86d15748650119402b10 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 30 Oct 2013 11:50:50 +0100 Subject: lib: crc32: add test cases for crc32{, c}_combine routines We already have 100 test cases for crcs itself, so split the test buffer with a-prio known checksums, and test crc of two blocks against crc of the whole block for the same results. Output/result with CONFIG_CRC32_SELFTEST=y: [ 2.687095] crc32: CRC_LE_BITS = 64, CRC_BE BITS = 64 [ 2.687097] crc32: self tests passed, processed 225944 bytes in 278177 nsec [ 2.687383] crc32c: CRC_LE_BITS = 64 [ 2.687385] crc32c: self tests passed, processed 225944 bytes in 141708 nsec [ 7.336771] crc32_combine: 113072 self tests passed [ 12.050479] crc32c_combine: 113072 self tests passed [ 17.633089] alg: No test for crc32 (crc32-pclmul) Signed-off-by: Daniel Borkmann Cc: linux-kernel@vger.kernel.org Signed-off-by: David S. Miller --- lib/crc32.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) (limited to 'lib') diff --git a/lib/crc32.c b/lib/crc32.c index 595205cddc30..69dd124f0cfc 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -1031,6 +1031,40 @@ static int __init crc32c_test(void) return 0; } +static int __init crc32c_combine_test(void) +{ + int i, j; + int errors = 0, runs = 0; + + for (i = 0; i < 100; i++) { + u32 crc_full; + + crc_full = __crc32c_le(test[i].crc, test_buf + test[i].start, + test[i].length); + for (j = 0; j <= test[i].length; ++j) { + u32 crc1, crc2; + u32 len1 = j, len2 = test[i].length - j; + + crc1 = __crc32c_le(test[i].crc, test_buf + + test[i].start, len1); + crc2 = __crc32c_le(0, test_buf + test[i].start + + len1, len2); + + if (!(crc_full == __crc32c_le_combine(crc1, crc2, len2) && + crc_full == test[i].crc32c_le)) + errors++; + runs++; + } + } + + if (errors) + pr_warn("crc32c_combine: %d/%d self tests failed\n", errors, runs); + else + pr_info("crc32c_combine: %d self tests passed\n", runs); + + return 0; +} + static int __init crc32_test(void) { int i; @@ -1090,10 +1124,48 @@ static int __init crc32_test(void) return 0; } +static int __init crc32_combine_test(void) +{ + int i, j; + int errors = 0, runs = 0; + + for (i = 0; i < 100; i++) { + u32 crc_full; + + crc_full = crc32_le(test[i].crc, test_buf + test[i].start, + test[i].length); + for (j = 0; j <= test[i].length; ++j) { + u32 crc1, crc2; + u32 len1 = j, len2 = test[i].length - j; + + crc1 = crc32_le(test[i].crc, test_buf + + test[i].start, len1); + crc2 = crc32_le(0, test_buf + test[i].start + + len1, len2); + + if (!(crc_full == crc32_le_combine(crc1, crc2, len2) && + crc_full == test[i].crc_le)) + errors++; + runs++; + } + } + + if (errors) + pr_warn("crc32_combine: %d/%d self tests failed\n", errors, runs); + else + pr_info("crc32_combine: %d self tests passed\n", runs); + + return 0; +} + static int __init crc32test_init(void) { crc32_test(); crc32c_test(); + + crc32_combine_test(); + crc32c_combine_test(); + return 0; } -- cgit v1.2.3 From cc0ac1999589c9f713550adde85a09b0dbb75d86 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 4 Nov 2013 17:10:26 +0100 Subject: lib: crc32: conditionally resched when running testcases Fengguang reports that when crc32 selftests are running on startup, on some e.g. 32bit systems, we can get a CPU stall like "INFO: rcu_sched self-detected stall on CPU { 0} (t=2101 jiffies g=4294967081 c=4294967080 q=41)". As this is not intended, add a cond_resched() at the end of a test case to fix it. Introduced by efba721f63 ("lib: crc32: add test cases for crc32{, c}_combine routines"). Reported-by: Fengguang Wu Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- lib/crc32.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'lib') diff --git a/lib/crc32.c b/lib/crc32.c index 69dd124f0cfc..3a1dfa84203c 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "crc32defs.h" #if CRC_LE_BITS > 8 @@ -1054,6 +1055,7 @@ static int __init crc32c_combine_test(void) crc_full == test[i].crc32c_le)) errors++; runs++; + cond_resched(); } } @@ -1147,6 +1149,7 @@ static int __init crc32_combine_test(void) crc_full == test[i].crc_le)) errors++; runs++; + cond_resched(); } } -- cgit v1.2.3 From 165148396d8a220c0fb62e30101ea99b0223864e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 4 Nov 2013 17:10:27 +0100 Subject: lib: crc32: reduce number of cases for crc32{, c}_combine We can safely reduce the number of test cases by a tenth. There is no particular need to run as many as we're running now for crc32{,c}_combine, that gives us still ~8000 tests we're doing if people run kernels with crc selftests enabled which is perfectly fine. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- lib/crc32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/crc32.c b/lib/crc32.c index 3a1dfa84203c..70f00ca5ef1e 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -1037,7 +1037,7 @@ static int __init crc32c_combine_test(void) int i, j; int errors = 0, runs = 0; - for (i = 0; i < 100; i++) { + for (i = 0; i < 10; i++) { u32 crc_full; crc_full = __crc32c_le(test[i].crc, test_buf + test[i].start, @@ -1131,7 +1131,7 @@ static int __init crc32_combine_test(void) int i, j; int errors = 0, runs = 0; - for (i = 0; i < 100; i++) { + for (i = 0; i < 10; i++) { u32 crc_full; crc_full = crc32_le(test[i].crc, test_buf + test[i].start, -- cgit v1.2.3 From 60fc28746a7b61775ae28950ddf7a4ac15955639 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 31 Oct 2013 18:15:36 +0100 Subject: locking: Move the spinlock code to kernel/locking/ Suggested-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-b81ol0z3mon45m51o131yc9j@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/Makefile | 3 - kernel/locking/Makefile | 4 + kernel/locking/spinlock.c | 399 ++++++++++++++++++++++++++++++++++++++++ kernel/locking/spinlock_debug.c | 302 ++++++++++++++++++++++++++++++ kernel/spinlock.c | 399 ---------------------------------------- lib/Makefile | 1 - lib/spinlock_debug.c | 302 ------------------------------ 7 files changed, 705 insertions(+), 705 deletions(-) create mode 100644 kernel/locking/spinlock.c create mode 100644 kernel/locking/spinlock_debug.c delete mode 100644 kernel/spinlock.c delete mode 100644 lib/spinlock_debug.c (limited to 'lib') diff --git a/kernel/Makefile b/kernel/Makefile index 4fffd6ee42c1..4bce165dce5d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -43,9 +43,6 @@ obj-$(CONFIG_SMP) += smp.o ifneq ($(CONFIG_SMP),y) obj-y += up.o endif -obj-$(CONFIG_SMP) += spinlock.o -obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o -obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index c103599fc1ba..674d2152d10f 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -13,3 +13,7 @@ obj-$(CONFIG_LOCKDEP) += lockdep.o ifeq ($(CONFIG_PROC_FS),y) obj-$(CONFIG_LOCKDEP) += lockdep_proc.o endif +obj-$(CONFIG_SMP) += spinlock.o +obj-$(CONFIG_PROVE_LOCKING) += spinlock.o +obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o +obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c new file mode 100644 index 000000000000..4b082b5cac9e --- /dev/null +++ b/kernel/locking/spinlock.c @@ -0,0 +1,399 @@ +/* + * Copyright (2004) Linus Torvalds + * + * Author: Zwane Mwaikambo + * + * Copyright (2004, 2005) Ingo Molnar + * + * This file contains the spinlock/rwlock implementations for the + * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) + * + * Note that some architectures have special knowledge about the + * stack frames of these functions in their profile_pc. If you + * change anything significant here that could change the stack + * frame contact the architecture maintainers. + */ + +#include +#include +#include +#include +#include +#include + +/* + * If lockdep is enabled then we use the non-preemption spin-ops + * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are + * not re-enabled during lock-acquire (which the preempt-spin-ops do): + */ +#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) +/* + * The __lock_function inlines are taken from + * include/linux/spinlock_api_smp.h + */ +#else +#define raw_read_can_lock(l) read_can_lock(l) +#define raw_write_can_lock(l) write_can_lock(l) + +/* + * Some architectures can relax in favour of the CPU owning the lock. + */ +#ifndef arch_read_relax +# define arch_read_relax(l) cpu_relax() +#endif +#ifndef arch_write_relax +# define arch_write_relax(l) cpu_relax() +#endif +#ifndef arch_spin_relax +# define arch_spin_relax(l) cpu_relax() +#endif + +/* + * We build the __lock_function inlines here. They are too large for + * inlining all over the place, but here is only one user per function + * which embedds them into the calling _lock_function below. + * + * This could be a long-held lock. We both prepare to spin for a long + * time (making _this_ CPU preemptable if possible), and we also signal + * towards that other CPU that it should break the lock ASAP. + */ +#define BUILD_LOCK_OPS(op, locktype) \ +void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ +{ \ + for (;;) { \ + preempt_disable(); \ + if (likely(do_raw_##op##_trylock(lock))) \ + break; \ + preempt_enable(); \ + \ + if (!(lock)->break_lock) \ + (lock)->break_lock = 1; \ + while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\ + arch_##op##_relax(&lock->raw_lock); \ + } \ + (lock)->break_lock = 0; \ +} \ + \ +unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ +{ \ + unsigned long flags; \ + \ + for (;;) { \ + preempt_disable(); \ + local_irq_save(flags); \ + if (likely(do_raw_##op##_trylock(lock))) \ + break; \ + local_irq_restore(flags); \ + preempt_enable(); \ + \ + if (!(lock)->break_lock) \ + (lock)->break_lock = 1; \ + while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\ + arch_##op##_relax(&lock->raw_lock); \ + } \ + (lock)->break_lock = 0; \ + return flags; \ +} \ + \ +void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \ +{ \ + _raw_##op##_lock_irqsave(lock); \ +} \ + \ +void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ +{ \ + unsigned long flags; \ + \ + /* */ \ + /* Careful: we must exclude softirqs too, hence the */ \ + /* irq-disabling. We use the generic preemption-aware */ \ + /* function: */ \ + /**/ \ + flags = _raw_##op##_lock_irqsave(lock); \ + local_bh_disable(); \ + local_irq_restore(flags); \ +} \ + +/* + * Build preemption-friendly versions of the following + * lock-spinning functions: + * + * __[spin|read|write]_lock() + * __[spin|read|write]_lock_irq() + * __[spin|read|write]_lock_irqsave() + * __[spin|read|write]_lock_bh() + */ +BUILD_LOCK_OPS(spin, raw_spinlock); +BUILD_LOCK_OPS(read, rwlock); +BUILD_LOCK_OPS(write, rwlock); + +#endif + +#ifndef CONFIG_INLINE_SPIN_TRYLOCK +int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) +{ + return __raw_spin_trylock(lock); +} +EXPORT_SYMBOL(_raw_spin_trylock); +#endif + +#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH +int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) +{ + return __raw_spin_trylock_bh(lock); +} +EXPORT_SYMBOL(_raw_spin_trylock_bh); +#endif + +#ifndef CONFIG_INLINE_SPIN_LOCK +void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) +{ + __raw_spin_lock(lock); +} +EXPORT_SYMBOL(_raw_spin_lock); +#endif + +#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE +unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock) +{ + return __raw_spin_lock_irqsave(lock); +} +EXPORT_SYMBOL(_raw_spin_lock_irqsave); +#endif + +#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ +void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock) +{ + __raw_spin_lock_irq(lock); +} +EXPORT_SYMBOL(_raw_spin_lock_irq); +#endif + +#ifndef CONFIG_INLINE_SPIN_LOCK_BH +void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) +{ + __raw_spin_lock_bh(lock); +} +EXPORT_SYMBOL(_raw_spin_lock_bh); +#endif + +#ifdef CONFIG_UNINLINE_SPIN_UNLOCK +void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) +{ + __raw_spin_unlock(lock); +} +EXPORT_SYMBOL(_raw_spin_unlock); +#endif + +#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE +void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) +{ + __raw_spin_unlock_irqrestore(lock, flags); +} +EXPORT_SYMBOL(_raw_spin_unlock_irqrestore); +#endif + +#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ +void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock) +{ + __raw_spin_unlock_irq(lock); +} +EXPORT_SYMBOL(_raw_spin_unlock_irq); +#endif + +#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH +void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) +{ + __raw_spin_unlock_bh(lock); +} +EXPORT_SYMBOL(_raw_spin_unlock_bh); +#endif + +#ifndef CONFIG_INLINE_READ_TRYLOCK +int __lockfunc _raw_read_trylock(rwlock_t *lock) +{ + return __raw_read_trylock(lock); +} +EXPORT_SYMBOL(_raw_read_trylock); +#endif + +#ifndef CONFIG_INLINE_READ_LOCK +void __lockfunc _raw_read_lock(rwlock_t *lock) +{ + __raw_read_lock(lock); +} +EXPORT_SYMBOL(_raw_read_lock); +#endif + +#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE +unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock) +{ + return __raw_read_lock_irqsave(lock); +} +EXPORT_SYMBOL(_raw_read_lock_irqsave); +#endif + +#ifndef CONFIG_INLINE_READ_LOCK_IRQ +void __lockfunc _raw_read_lock_irq(rwlock_t *lock) +{ + __raw_read_lock_irq(lock); +} +EXPORT_SYMBOL(_raw_read_lock_irq); +#endif + +#ifndef CONFIG_INLINE_READ_LOCK_BH +void __lockfunc _raw_read_lock_bh(rwlock_t *lock) +{ + __raw_read_lock_bh(lock); +} +EXPORT_SYMBOL(_raw_read_lock_bh); +#endif + +#ifndef CONFIG_INLINE_READ_UNLOCK +void __lockfunc _raw_read_unlock(rwlock_t *lock) +{ + __raw_read_unlock(lock); +} +EXPORT_SYMBOL(_raw_read_unlock); +#endif + +#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE +void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +{ + __raw_read_unlock_irqrestore(lock, flags); +} +EXPORT_SYMBOL(_raw_read_unlock_irqrestore); +#endif + +#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ +void __lockfunc _raw_read_unlock_irq(rwlock_t *lock) +{ + __raw_read_unlock_irq(lock); +} +EXPORT_SYMBOL(_raw_read_unlock_irq); +#endif + +#ifndef CONFIG_INLINE_READ_UNLOCK_BH +void __lockfunc _raw_read_unlock_bh(rwlock_t *lock) +{ + __raw_read_unlock_bh(lock); +} +EXPORT_SYMBOL(_raw_read_unlock_bh); +#endif + +#ifndef CONFIG_INLINE_WRITE_TRYLOCK +int __lockfunc _raw_write_trylock(rwlock_t *lock) +{ + return __raw_write_trylock(lock); +} +EXPORT_SYMBOL(_raw_write_trylock); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK +void __lockfunc _raw_write_lock(rwlock_t *lock) +{ + __raw_write_lock(lock); +} +EXPORT_SYMBOL(_raw_write_lock); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE +unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock) +{ + return __raw_write_lock_irqsave(lock); +} +EXPORT_SYMBOL(_raw_write_lock_irqsave); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ +void __lockfunc _raw_write_lock_irq(rwlock_t *lock) +{ + __raw_write_lock_irq(lock); +} +EXPORT_SYMBOL(_raw_write_lock_irq); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK_BH +void __lockfunc _raw_write_lock_bh(rwlock_t *lock) +{ + __raw_write_lock_bh(lock); +} +EXPORT_SYMBOL(_raw_write_lock_bh); +#endif + +#ifndef CONFIG_INLINE_WRITE_UNLOCK +void __lockfunc _raw_write_unlock(rwlock_t *lock) +{ + __raw_write_unlock(lock); +} +EXPORT_SYMBOL(_raw_write_unlock); +#endif + +#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE +void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +{ + __raw_write_unlock_irqrestore(lock, flags); +} +EXPORT_SYMBOL(_raw_write_unlock_irqrestore); +#endif + +#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ +void __lockfunc _raw_write_unlock_irq(rwlock_t *lock) +{ + __raw_write_unlock_irq(lock); +} +EXPORT_SYMBOL(_raw_write_unlock_irq); +#endif + +#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH +void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) +{ + __raw_write_unlock_bh(lock); +} +EXPORT_SYMBOL(_raw_write_unlock_bh); +#endif + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) +{ + preempt_disable(); + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); +} +EXPORT_SYMBOL(_raw_spin_lock_nested); + +unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, + int subclass) +{ + unsigned long flags; + + local_irq_save(flags); + preempt_disable(); + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock, + do_raw_spin_lock_flags, &flags); + return flags; +} +EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested); + +void __lockfunc _raw_spin_lock_nest_lock(raw_spinlock_t *lock, + struct lockdep_map *nest_lock) +{ + preempt_disable(); + spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); + LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); +} +EXPORT_SYMBOL(_raw_spin_lock_nest_lock); + +#endif + +notrace int in_lock_functions(unsigned long addr) +{ + /* Linker adds these: start and end of __lockfunc functions */ + extern char __lock_text_start[], __lock_text_end[]; + + return addr >= (unsigned long)__lock_text_start + && addr < (unsigned long)__lock_text_end; +} +EXPORT_SYMBOL(in_lock_functions); diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c new file mode 100644 index 000000000000..0374a596cffa --- /dev/null +++ b/kernel/locking/spinlock_debug.c @@ -0,0 +1,302 @@ +/* + * Copyright 2005, Red Hat, Inc., Ingo Molnar + * Released under the General Public License (GPL). + * + * This file contains the spinlock/rwlock implementations for + * DEBUG_SPINLOCK. + */ + +#include +#include +#include +#include +#include +#include + +void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif + lock->raw_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + lock->magic = SPINLOCK_MAGIC; + lock->owner = SPINLOCK_OWNER_INIT; + lock->owner_cpu = -1; +} + +EXPORT_SYMBOL(__raw_spin_lock_init); + +void __rwlock_init(rwlock_t *lock, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif + lock->raw_lock = (arch_rwlock_t) __ARCH_RW_LOCK_UNLOCKED; + lock->magic = RWLOCK_MAGIC; + lock->owner = SPINLOCK_OWNER_INIT; + lock->owner_cpu = -1; +} + +EXPORT_SYMBOL(__rwlock_init); + +static void spin_dump(raw_spinlock_t *lock, const char *msg) +{ + struct task_struct *owner = NULL; + + if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT) + owner = lock->owner; + printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n", + msg, raw_smp_processor_id(), + current->comm, task_pid_nr(current)); + printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, " + ".owner_cpu: %d\n", + lock, lock->magic, + owner ? owner->comm : "", + owner ? task_pid_nr(owner) : -1, + lock->owner_cpu); + dump_stack(); +} + +static void spin_bug(raw_spinlock_t *lock, const char *msg) +{ + if (!debug_locks_off()) + return; + + spin_dump(lock, msg); +} + +#define SPIN_BUG_ON(cond, lock, msg) if (unlikely(cond)) spin_bug(lock, msg) + +static inline void +debug_spin_lock_before(raw_spinlock_t *lock) +{ + SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); + SPIN_BUG_ON(lock->owner == current, lock, "recursion"); + SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), + lock, "cpu recursion"); +} + +static inline void debug_spin_lock_after(raw_spinlock_t *lock) +{ + lock->owner_cpu = raw_smp_processor_id(); + lock->owner = current; +} + +static inline void debug_spin_unlock(raw_spinlock_t *lock) +{ + SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); + SPIN_BUG_ON(!raw_spin_is_locked(lock), lock, "already unlocked"); + SPIN_BUG_ON(lock->owner != current, lock, "wrong owner"); + SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), + lock, "wrong CPU"); + lock->owner = SPINLOCK_OWNER_INIT; + lock->owner_cpu = -1; +} + +static void __spin_lock_debug(raw_spinlock_t *lock) +{ + u64 i; + u64 loops = loops_per_jiffy * HZ; + + for (i = 0; i < loops; i++) { + if (arch_spin_trylock(&lock->raw_lock)) + return; + __delay(1); + } + /* lockup suspected: */ + spin_dump(lock, "lockup suspected"); +#ifdef CONFIG_SMP + trigger_all_cpu_backtrace(); +#endif + + /* + * The trylock above was causing a livelock. Give the lower level arch + * specific lock code a chance to acquire the lock. We have already + * printed a warning/backtrace at this point. The non-debug arch + * specific code might actually succeed in acquiring the lock. If it is + * not successful, the end-result is the same - there is no forward + * progress. + */ + arch_spin_lock(&lock->raw_lock); +} + +void do_raw_spin_lock(raw_spinlock_t *lock) +{ + debug_spin_lock_before(lock); + if (unlikely(!arch_spin_trylock(&lock->raw_lock))) + __spin_lock_debug(lock); + debug_spin_lock_after(lock); +} + +int do_raw_spin_trylock(raw_spinlock_t *lock) +{ + int ret = arch_spin_trylock(&lock->raw_lock); + + if (ret) + debug_spin_lock_after(lock); +#ifndef CONFIG_SMP + /* + * Must not happen on UP: + */ + SPIN_BUG_ON(!ret, lock, "trylock failure on UP"); +#endif + return ret; +} + +void do_raw_spin_unlock(raw_spinlock_t *lock) +{ + debug_spin_unlock(lock); + arch_spin_unlock(&lock->raw_lock); +} + +static void rwlock_bug(rwlock_t *lock, const char *msg) +{ + if (!debug_locks_off()) + return; + + printk(KERN_EMERG "BUG: rwlock %s on CPU#%d, %s/%d, %p\n", + msg, raw_smp_processor_id(), current->comm, + task_pid_nr(current), lock); + dump_stack(); +} + +#define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg) + +#if 0 /* __write_lock_debug() can lock up - maybe this can too? */ +static void __read_lock_debug(rwlock_t *lock) +{ + u64 i; + u64 loops = loops_per_jiffy * HZ; + int print_once = 1; + + for (;;) { + for (i = 0; i < loops; i++) { + if (arch_read_trylock(&lock->raw_lock)) + return; + __delay(1); + } + /* lockup suspected: */ + if (print_once) { + print_once = 0; + printk(KERN_EMERG "BUG: read-lock lockup on CPU#%d, " + "%s/%d, %p\n", + raw_smp_processor_id(), current->comm, + current->pid, lock); + dump_stack(); + } + } +} +#endif + +void do_raw_read_lock(rwlock_t *lock) +{ + RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); + arch_read_lock(&lock->raw_lock); +} + +int do_raw_read_trylock(rwlock_t *lock) +{ + int ret = arch_read_trylock(&lock->raw_lock); + +#ifndef CONFIG_SMP + /* + * Must not happen on UP: + */ + RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP"); +#endif + return ret; +} + +void do_raw_read_unlock(rwlock_t *lock) +{ + RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); + arch_read_unlock(&lock->raw_lock); +} + +static inline void debug_write_lock_before(rwlock_t *lock) +{ + RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); + RWLOCK_BUG_ON(lock->owner == current, lock, "recursion"); + RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), + lock, "cpu recursion"); +} + +static inline void debug_write_lock_after(rwlock_t *lock) +{ + lock->owner_cpu = raw_smp_processor_id(); + lock->owner = current; +} + +static inline void debug_write_unlock(rwlock_t *lock) +{ + RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); + RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner"); + RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), + lock, "wrong CPU"); + lock->owner = SPINLOCK_OWNER_INIT; + lock->owner_cpu = -1; +} + +#if 0 /* This can cause lockups */ +static void __write_lock_debug(rwlock_t *lock) +{ + u64 i; + u64 loops = loops_per_jiffy * HZ; + int print_once = 1; + + for (;;) { + for (i = 0; i < loops; i++) { + if (arch_write_trylock(&lock->raw_lock)) + return; + __delay(1); + } + /* lockup suspected: */ + if (print_once) { + print_once = 0; + printk(KERN_EMERG "BUG: write-lock lockup on CPU#%d, " + "%s/%d, %p\n", + raw_smp_processor_id(), current->comm, + current->pid, lock); + dump_stack(); + } + } +} +#endif + +void do_raw_write_lock(rwlock_t *lock) +{ + debug_write_lock_before(lock); + arch_write_lock(&lock->raw_lock); + debug_write_lock_after(lock); +} + +int do_raw_write_trylock(rwlock_t *lock) +{ + int ret = arch_write_trylock(&lock->raw_lock); + + if (ret) + debug_write_lock_after(lock); +#ifndef CONFIG_SMP + /* + * Must not happen on UP: + */ + RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP"); +#endif + return ret; +} + +void do_raw_write_unlock(rwlock_t *lock) +{ + debug_write_unlock(lock); + arch_write_unlock(&lock->raw_lock); +} diff --git a/kernel/spinlock.c b/kernel/spinlock.c deleted file mode 100644 index 4b082b5cac9e..000000000000 --- a/kernel/spinlock.c +++ /dev/null @@ -1,399 +0,0 @@ -/* - * Copyright (2004) Linus Torvalds - * - * Author: Zwane Mwaikambo - * - * Copyright (2004, 2005) Ingo Molnar - * - * This file contains the spinlock/rwlock implementations for the - * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) - * - * Note that some architectures have special knowledge about the - * stack frames of these functions in their profile_pc. If you - * change anything significant here that could change the stack - * frame contact the architecture maintainers. - */ - -#include -#include -#include -#include -#include -#include - -/* - * If lockdep is enabled then we use the non-preemption spin-ops - * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are - * not re-enabled during lock-acquire (which the preempt-spin-ops do): - */ -#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) -/* - * The __lock_function inlines are taken from - * include/linux/spinlock_api_smp.h - */ -#else -#define raw_read_can_lock(l) read_can_lock(l) -#define raw_write_can_lock(l) write_can_lock(l) - -/* - * Some architectures can relax in favour of the CPU owning the lock. - */ -#ifndef arch_read_relax -# define arch_read_relax(l) cpu_relax() -#endif -#ifndef arch_write_relax -# define arch_write_relax(l) cpu_relax() -#endif -#ifndef arch_spin_relax -# define arch_spin_relax(l) cpu_relax() -#endif - -/* - * We build the __lock_function inlines here. They are too large for - * inlining all over the place, but here is only one user per function - * which embedds them into the calling _lock_function below. - * - * This could be a long-held lock. We both prepare to spin for a long - * time (making _this_ CPU preemptable if possible), and we also signal - * towards that other CPU that it should break the lock ASAP. - */ -#define BUILD_LOCK_OPS(op, locktype) \ -void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ -{ \ - for (;;) { \ - preempt_disable(); \ - if (likely(do_raw_##op##_trylock(lock))) \ - break; \ - preempt_enable(); \ - \ - if (!(lock)->break_lock) \ - (lock)->break_lock = 1; \ - while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\ - arch_##op##_relax(&lock->raw_lock); \ - } \ - (lock)->break_lock = 0; \ -} \ - \ -unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ -{ \ - unsigned long flags; \ - \ - for (;;) { \ - preempt_disable(); \ - local_irq_save(flags); \ - if (likely(do_raw_##op##_trylock(lock))) \ - break; \ - local_irq_restore(flags); \ - preempt_enable(); \ - \ - if (!(lock)->break_lock) \ - (lock)->break_lock = 1; \ - while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\ - arch_##op##_relax(&lock->raw_lock); \ - } \ - (lock)->break_lock = 0; \ - return flags; \ -} \ - \ -void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \ -{ \ - _raw_##op##_lock_irqsave(lock); \ -} \ - \ -void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ -{ \ - unsigned long flags; \ - \ - /* */ \ - /* Careful: we must exclude softirqs too, hence the */ \ - /* irq-disabling. We use the generic preemption-aware */ \ - /* function: */ \ - /**/ \ - flags = _raw_##op##_lock_irqsave(lock); \ - local_bh_disable(); \ - local_irq_restore(flags); \ -} \ - -/* - * Build preemption-friendly versions of the following - * lock-spinning functions: - * - * __[spin|read|write]_lock() - * __[spin|read|write]_lock_irq() - * __[spin|read|write]_lock_irqsave() - * __[spin|read|write]_lock_bh() - */ -BUILD_LOCK_OPS(spin, raw_spinlock); -BUILD_LOCK_OPS(read, rwlock); -BUILD_LOCK_OPS(write, rwlock); - -#endif - -#ifndef CONFIG_INLINE_SPIN_TRYLOCK -int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) -{ - return __raw_spin_trylock(lock); -} -EXPORT_SYMBOL(_raw_spin_trylock); -#endif - -#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH -int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) -{ - return __raw_spin_trylock_bh(lock); -} -EXPORT_SYMBOL(_raw_spin_trylock_bh); -#endif - -#ifndef CONFIG_INLINE_SPIN_LOCK -void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) -{ - __raw_spin_lock(lock); -} -EXPORT_SYMBOL(_raw_spin_lock); -#endif - -#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE -unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock) -{ - return __raw_spin_lock_irqsave(lock); -} -EXPORT_SYMBOL(_raw_spin_lock_irqsave); -#endif - -#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ -void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock) -{ - __raw_spin_lock_irq(lock); -} -EXPORT_SYMBOL(_raw_spin_lock_irq); -#endif - -#ifndef CONFIG_INLINE_SPIN_LOCK_BH -void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) -{ - __raw_spin_lock_bh(lock); -} -EXPORT_SYMBOL(_raw_spin_lock_bh); -#endif - -#ifdef CONFIG_UNINLINE_SPIN_UNLOCK -void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) -{ - __raw_spin_unlock(lock); -} -EXPORT_SYMBOL(_raw_spin_unlock); -#endif - -#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE -void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) -{ - __raw_spin_unlock_irqrestore(lock, flags); -} -EXPORT_SYMBOL(_raw_spin_unlock_irqrestore); -#endif - -#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ -void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock) -{ - __raw_spin_unlock_irq(lock); -} -EXPORT_SYMBOL(_raw_spin_unlock_irq); -#endif - -#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH -void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) -{ - __raw_spin_unlock_bh(lock); -} -EXPORT_SYMBOL(_raw_spin_unlock_bh); -#endif - -#ifndef CONFIG_INLINE_READ_TRYLOCK -int __lockfunc _raw_read_trylock(rwlock_t *lock) -{ - return __raw_read_trylock(lock); -} -EXPORT_SYMBOL(_raw_read_trylock); -#endif - -#ifndef CONFIG_INLINE_READ_LOCK -void __lockfunc _raw_read_lock(rwlock_t *lock) -{ - __raw_read_lock(lock); -} -EXPORT_SYMBOL(_raw_read_lock); -#endif - -#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE -unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock) -{ - return __raw_read_lock_irqsave(lock); -} -EXPORT_SYMBOL(_raw_read_lock_irqsave); -#endif - -#ifndef CONFIG_INLINE_READ_LOCK_IRQ -void __lockfunc _raw_read_lock_irq(rwlock_t *lock) -{ - __raw_read_lock_irq(lock); -} -EXPORT_SYMBOL(_raw_read_lock_irq); -#endif - -#ifndef CONFIG_INLINE_READ_LOCK_BH -void __lockfunc _raw_read_lock_bh(rwlock_t *lock) -{ - __raw_read_lock_bh(lock); -} -EXPORT_SYMBOL(_raw_read_lock_bh); -#endif - -#ifndef CONFIG_INLINE_READ_UNLOCK -void __lockfunc _raw_read_unlock(rwlock_t *lock) -{ - __raw_read_unlock(lock); -} -EXPORT_SYMBOL(_raw_read_unlock); -#endif - -#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE -void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) -{ - __raw_read_unlock_irqrestore(lock, flags); -} -EXPORT_SYMBOL(_raw_read_unlock_irqrestore); -#endif - -#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ -void __lockfunc _raw_read_unlock_irq(rwlock_t *lock) -{ - __raw_read_unlock_irq(lock); -} -EXPORT_SYMBOL(_raw_read_unlock_irq); -#endif - -#ifndef CONFIG_INLINE_READ_UNLOCK_BH -void __lockfunc _raw_read_unlock_bh(rwlock_t *lock) -{ - __raw_read_unlock_bh(lock); -} -EXPORT_SYMBOL(_raw_read_unlock_bh); -#endif - -#ifndef CONFIG_INLINE_WRITE_TRYLOCK -int __lockfunc _raw_write_trylock(rwlock_t *lock) -{ - return __raw_write_trylock(lock); -} -EXPORT_SYMBOL(_raw_write_trylock); -#endif - -#ifndef CONFIG_INLINE_WRITE_LOCK -void __lockfunc _raw_write_lock(rwlock_t *lock) -{ - __raw_write_lock(lock); -} -EXPORT_SYMBOL(_raw_write_lock); -#endif - -#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE -unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock) -{ - return __raw_write_lock_irqsave(lock); -} -EXPORT_SYMBOL(_raw_write_lock_irqsave); -#endif - -#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ -void __lockfunc _raw_write_lock_irq(rwlock_t *lock) -{ - __raw_write_lock_irq(lock); -} -EXPORT_SYMBOL(_raw_write_lock_irq); -#endif - -#ifndef CONFIG_INLINE_WRITE_LOCK_BH -void __lockfunc _raw_write_lock_bh(rwlock_t *lock) -{ - __raw_write_lock_bh(lock); -} -EXPORT_SYMBOL(_raw_write_lock_bh); -#endif - -#ifndef CONFIG_INLINE_WRITE_UNLOCK -void __lockfunc _raw_write_unlock(rwlock_t *lock) -{ - __raw_write_unlock(lock); -} -EXPORT_SYMBOL(_raw_write_unlock); -#endif - -#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE -void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) -{ - __raw_write_unlock_irqrestore(lock, flags); -} -EXPORT_SYMBOL(_raw_write_unlock_irqrestore); -#endif - -#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ -void __lockfunc _raw_write_unlock_irq(rwlock_t *lock) -{ - __raw_write_unlock_irq(lock); -} -EXPORT_SYMBOL(_raw_write_unlock_irq); -#endif - -#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH -void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) -{ - __raw_write_unlock_bh(lock); -} -EXPORT_SYMBOL(_raw_write_unlock_bh); -#endif - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - -void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) -{ - preempt_disable(); - spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); -} -EXPORT_SYMBOL(_raw_spin_lock_nested); - -unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, - int subclass) -{ - unsigned long flags; - - local_irq_save(flags); - preempt_disable(); - spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock, - do_raw_spin_lock_flags, &flags); - return flags; -} -EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested); - -void __lockfunc _raw_spin_lock_nest_lock(raw_spinlock_t *lock, - struct lockdep_map *nest_lock) -{ - preempt_disable(); - spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); - LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); -} -EXPORT_SYMBOL(_raw_spin_lock_nest_lock); - -#endif - -notrace int in_lock_functions(unsigned long addr) -{ - /* Linker adds these: start and end of __lockfunc functions */ - extern char __lock_text_start[], __lock_text_end[]; - - return addr >= (unsigned long)__lock_text_start - && addr < (unsigned long)__lock_text_end; -} -EXPORT_SYMBOL(in_lock_functions); diff --git a/lib/Makefile b/lib/Makefile index f3bb2cb98adf..bee27e1d3431 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -42,7 +42,6 @@ obj-$(CONFIG_GENERIC_PCI_IOMAP) += pci_iomap.o obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o -obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o diff --git a/lib/spinlock_debug.c b/lib/spinlock_debug.c deleted file mode 100644 index 0374a596cffa..000000000000 --- a/lib/spinlock_debug.c +++ /dev/null @@ -1,302 +0,0 @@ -/* - * Copyright 2005, Red Hat, Inc., Ingo Molnar - * Released under the General Public License (GPL). - * - * This file contains the spinlock/rwlock implementations for - * DEBUG_SPINLOCK. - */ - -#include -#include -#include -#include -#include -#include - -void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held lock: - */ - debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); -#endif - lock->raw_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; - lock->magic = SPINLOCK_MAGIC; - lock->owner = SPINLOCK_OWNER_INIT; - lock->owner_cpu = -1; -} - -EXPORT_SYMBOL(__raw_spin_lock_init); - -void __rwlock_init(rwlock_t *lock, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held lock: - */ - debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); -#endif - lock->raw_lock = (arch_rwlock_t) __ARCH_RW_LOCK_UNLOCKED; - lock->magic = RWLOCK_MAGIC; - lock->owner = SPINLOCK_OWNER_INIT; - lock->owner_cpu = -1; -} - -EXPORT_SYMBOL(__rwlock_init); - -static void spin_dump(raw_spinlock_t *lock, const char *msg) -{ - struct task_struct *owner = NULL; - - if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT) - owner = lock->owner; - printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n", - msg, raw_smp_processor_id(), - current->comm, task_pid_nr(current)); - printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, " - ".owner_cpu: %d\n", - lock, lock->magic, - owner ? owner->comm : "", - owner ? task_pid_nr(owner) : -1, - lock->owner_cpu); - dump_stack(); -} - -static void spin_bug(raw_spinlock_t *lock, const char *msg) -{ - if (!debug_locks_off()) - return; - - spin_dump(lock, msg); -} - -#define SPIN_BUG_ON(cond, lock, msg) if (unlikely(cond)) spin_bug(lock, msg) - -static inline void -debug_spin_lock_before(raw_spinlock_t *lock) -{ - SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); - SPIN_BUG_ON(lock->owner == current, lock, "recursion"); - SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), - lock, "cpu recursion"); -} - -static inline void debug_spin_lock_after(raw_spinlock_t *lock) -{ - lock->owner_cpu = raw_smp_processor_id(); - lock->owner = current; -} - -static inline void debug_spin_unlock(raw_spinlock_t *lock) -{ - SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); - SPIN_BUG_ON(!raw_spin_is_locked(lock), lock, "already unlocked"); - SPIN_BUG_ON(lock->owner != current, lock, "wrong owner"); - SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), - lock, "wrong CPU"); - lock->owner = SPINLOCK_OWNER_INIT; - lock->owner_cpu = -1; -} - -static void __spin_lock_debug(raw_spinlock_t *lock) -{ - u64 i; - u64 loops = loops_per_jiffy * HZ; - - for (i = 0; i < loops; i++) { - if (arch_spin_trylock(&lock->raw_lock)) - return; - __delay(1); - } - /* lockup suspected: */ - spin_dump(lock, "lockup suspected"); -#ifdef CONFIG_SMP - trigger_all_cpu_backtrace(); -#endif - - /* - * The trylock above was causing a livelock. Give the lower level arch - * specific lock code a chance to acquire the lock. We have already - * printed a warning/backtrace at this point. The non-debug arch - * specific code might actually succeed in acquiring the lock. If it is - * not successful, the end-result is the same - there is no forward - * progress. - */ - arch_spin_lock(&lock->raw_lock); -} - -void do_raw_spin_lock(raw_spinlock_t *lock) -{ - debug_spin_lock_before(lock); - if (unlikely(!arch_spin_trylock(&lock->raw_lock))) - __spin_lock_debug(lock); - debug_spin_lock_after(lock); -} - -int do_raw_spin_trylock(raw_spinlock_t *lock) -{ - int ret = arch_spin_trylock(&lock->raw_lock); - - if (ret) - debug_spin_lock_after(lock); -#ifndef CONFIG_SMP - /* - * Must not happen on UP: - */ - SPIN_BUG_ON(!ret, lock, "trylock failure on UP"); -#endif - return ret; -} - -void do_raw_spin_unlock(raw_spinlock_t *lock) -{ - debug_spin_unlock(lock); - arch_spin_unlock(&lock->raw_lock); -} - -static void rwlock_bug(rwlock_t *lock, const char *msg) -{ - if (!debug_locks_off()) - return; - - printk(KERN_EMERG "BUG: rwlock %s on CPU#%d, %s/%d, %p\n", - msg, raw_smp_processor_id(), current->comm, - task_pid_nr(current), lock); - dump_stack(); -} - -#define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg) - -#if 0 /* __write_lock_debug() can lock up - maybe this can too? */ -static void __read_lock_debug(rwlock_t *lock) -{ - u64 i; - u64 loops = loops_per_jiffy * HZ; - int print_once = 1; - - for (;;) { - for (i = 0; i < loops; i++) { - if (arch_read_trylock(&lock->raw_lock)) - return; - __delay(1); - } - /* lockup suspected: */ - if (print_once) { - print_once = 0; - printk(KERN_EMERG "BUG: read-lock lockup on CPU#%d, " - "%s/%d, %p\n", - raw_smp_processor_id(), current->comm, - current->pid, lock); - dump_stack(); - } - } -} -#endif - -void do_raw_read_lock(rwlock_t *lock) -{ - RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); - arch_read_lock(&lock->raw_lock); -} - -int do_raw_read_trylock(rwlock_t *lock) -{ - int ret = arch_read_trylock(&lock->raw_lock); - -#ifndef CONFIG_SMP - /* - * Must not happen on UP: - */ - RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP"); -#endif - return ret; -} - -void do_raw_read_unlock(rwlock_t *lock) -{ - RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); - arch_read_unlock(&lock->raw_lock); -} - -static inline void debug_write_lock_before(rwlock_t *lock) -{ - RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); - RWLOCK_BUG_ON(lock->owner == current, lock, "recursion"); - RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), - lock, "cpu recursion"); -} - -static inline void debug_write_lock_after(rwlock_t *lock) -{ - lock->owner_cpu = raw_smp_processor_id(); - lock->owner = current; -} - -static inline void debug_write_unlock(rwlock_t *lock) -{ - RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); - RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner"); - RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), - lock, "wrong CPU"); - lock->owner = SPINLOCK_OWNER_INIT; - lock->owner_cpu = -1; -} - -#if 0 /* This can cause lockups */ -static void __write_lock_debug(rwlock_t *lock) -{ - u64 i; - u64 loops = loops_per_jiffy * HZ; - int print_once = 1; - - for (;;) { - for (i = 0; i < loops; i++) { - if (arch_write_trylock(&lock->raw_lock)) - return; - __delay(1); - } - /* lockup suspected: */ - if (print_once) { - print_once = 0; - printk(KERN_EMERG "BUG: write-lock lockup on CPU#%d, " - "%s/%d, %p\n", - raw_smp_processor_id(), current->comm, - current->pid, lock); - dump_stack(); - } - } -} -#endif - -void do_raw_write_lock(rwlock_t *lock) -{ - debug_write_lock_before(lock); - arch_write_lock(&lock->raw_lock); - debug_write_lock_after(lock); -} - -int do_raw_write_trylock(rwlock_t *lock) -{ - int ret = arch_write_trylock(&lock->raw_lock); - - if (ret) - debug_write_lock_after(lock); -#ifndef CONFIG_SMP - /* - * Must not happen on UP: - */ - RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP"); -#endif - return ret; -} - -void do_raw_write_unlock(rwlock_t *lock) -{ - debug_write_unlock(lock); - arch_write_unlock(&lock->raw_lock); -} -- cgit v1.2.3 From ed428bfc3caaa4b1e6cd15ea12c90c30291903f0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 31 Oct 2013 18:19:28 +0100 Subject: locking: Move the rwsem code to kernel/locking/ Notably: changed lib/rwsem* targets from lib- to obj-, no idea about the ramifications of that. Suggested-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-g0kynfh5feriwc6p3h6kpbw6@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/Makefile | 2 +- kernel/locking/Makefile | 4 +- kernel/locking/rwsem-spinlock.c | 296 ++++++++++++++++++++++++++++++++++++++++ kernel/locking/rwsem-xadd.c | 293 +++++++++++++++++++++++++++++++++++++++ kernel/locking/rwsem.c | 157 +++++++++++++++++++++ kernel/rwsem.c | 157 --------------------- lib/Makefile | 2 - lib/rwsem-spinlock.c | 296 ---------------------------------------- lib/rwsem.c | 293 --------------------------------------- 9 files changed, 750 insertions(+), 750 deletions(-) create mode 100644 kernel/locking/rwsem-spinlock.c create mode 100644 kernel/locking/rwsem-xadd.c create mode 100644 kernel/locking/rwsem.c delete mode 100644 kernel/rwsem.c delete mode 100644 lib/rwsem-spinlock.c delete mode 100644 lib/rwsem.c (limited to 'lib') diff --git a/kernel/Makefile b/kernel/Makefile index 9c2ad1852223..1aef002aa56b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,7 @@ obj-y = fork.o exec_domain.o panic.o \ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ extable.o params.o posix-timers.o \ kthread.o sys_ni.o posix-cpu-timers.o \ - hrtimer.o rwsem.o nsproxy.o \ + hrtimer.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o groups.o lglock.o smpboot.o diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 59f66dec2bf9..b0e0d73516e3 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,5 +1,5 @@ -obj-y += mutex.o semaphore.o +obj-y += mutex.o semaphore.o rwsem.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = -pg @@ -20,3 +20,5 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o +obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o +obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o \ No newline at end of file diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c new file mode 100644 index 000000000000..9be8a9144978 --- /dev/null +++ b/kernel/locking/rwsem-spinlock.c @@ -0,0 +1,296 @@ +/* rwsem-spinlock.c: R/W semaphores: contention handling functions for + * generic spinlock implementation + * + * Copyright (c) 2001 David Howells (dhowells@redhat.com). + * - Derived partially from idea by Andrea Arcangeli + * - Derived also from comments by Linus + */ +#include +#include +#include + +enum rwsem_waiter_type { + RWSEM_WAITING_FOR_WRITE, + RWSEM_WAITING_FOR_READ +}; + +struct rwsem_waiter { + struct list_head list; + struct task_struct *task; + enum rwsem_waiter_type type; +}; + +int rwsem_is_locked(struct rw_semaphore *sem) +{ + int ret = 1; + unsigned long flags; + + if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { + ret = (sem->activity != 0); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + } + return ret; +} +EXPORT_SYMBOL(rwsem_is_locked); + +/* + * initialise the semaphore + */ +void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held semaphore: + */ + debug_check_no_locks_freed((void *)sem, sizeof(*sem)); + lockdep_init_map(&sem->dep_map, name, key, 0); +#endif + sem->activity = 0; + raw_spin_lock_init(&sem->wait_lock); + INIT_LIST_HEAD(&sem->wait_list); +} +EXPORT_SYMBOL(__init_rwsem); + +/* + * handle the lock release when processes blocked on it that can now run + * - if we come here, then: + * - the 'active count' _reached_ zero + * - the 'waiting count' is non-zero + * - the spinlock must be held by the caller + * - woken process blocks are discarded from the list after having task zeroed + * - writers are only woken if wakewrite is non-zero + */ +static inline struct rw_semaphore * +__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) +{ + struct rwsem_waiter *waiter; + struct task_struct *tsk; + int woken; + + waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); + + if (waiter->type == RWSEM_WAITING_FOR_WRITE) { + if (wakewrite) + /* Wake up a writer. Note that we do not grant it the + * lock - it will have to acquire it when it runs. */ + wake_up_process(waiter->task); + goto out; + } + + /* grant an infinite number of read locks to the front of the queue */ + woken = 0; + do { + struct list_head *next = waiter->list.next; + + list_del(&waiter->list); + tsk = waiter->task; + smp_mb(); + waiter->task = NULL; + wake_up_process(tsk); + put_task_struct(tsk); + woken++; + if (next == &sem->wait_list) + break; + waiter = list_entry(next, struct rwsem_waiter, list); + } while (waiter->type != RWSEM_WAITING_FOR_WRITE); + + sem->activity += woken; + + out: + return sem; +} + +/* + * wake a single writer + */ +static inline struct rw_semaphore * +__rwsem_wake_one_writer(struct rw_semaphore *sem) +{ + struct rwsem_waiter *waiter; + + waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); + wake_up_process(waiter->task); + + return sem; +} + +/* + * get a read lock on the semaphore + */ +void __sched __down_read(struct rw_semaphore *sem) +{ + struct rwsem_waiter waiter; + struct task_struct *tsk; + unsigned long flags; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + if (sem->activity >= 0 && list_empty(&sem->wait_list)) { + /* granted */ + sem->activity++; + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + goto out; + } + + tsk = current; + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + + /* set up my own style of waitqueue */ + waiter.task = tsk; + waiter.type = RWSEM_WAITING_FOR_READ; + get_task_struct(tsk); + + list_add_tail(&waiter.list, &sem->wait_list); + + /* we don't need to touch the semaphore struct anymore */ + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + + /* wait to be given the lock */ + for (;;) { + if (!waiter.task) + break; + schedule(); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + } + + tsk->state = TASK_RUNNING; + out: + ; +} + +/* + * trylock for reading -- returns 1 if successful, 0 if contention + */ +int __down_read_trylock(struct rw_semaphore *sem) +{ + unsigned long flags; + int ret = 0; + + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + if (sem->activity >= 0 && list_empty(&sem->wait_list)) { + /* granted */ + sem->activity++; + ret = 1; + } + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + + return ret; +} + +/* + * get a write lock on the semaphore + */ +void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) +{ + struct rwsem_waiter waiter; + struct task_struct *tsk; + unsigned long flags; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + /* set up my own style of waitqueue */ + tsk = current; + waiter.task = tsk; + waiter.type = RWSEM_WAITING_FOR_WRITE; + list_add_tail(&waiter.list, &sem->wait_list); + + /* wait for someone to release the lock */ + for (;;) { + /* + * That is the key to support write lock stealing: allows the + * task already on CPU to get the lock soon rather than put + * itself into sleep and waiting for system woke it or someone + * else in the head of the wait list up. + */ + if (sem->activity == 0) + break; + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + schedule(); + raw_spin_lock_irqsave(&sem->wait_lock, flags); + } + /* got the lock */ + sem->activity = -1; + list_del(&waiter.list); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +} + +void __sched __down_write(struct rw_semaphore *sem) +{ + __down_write_nested(sem, 0); +} + +/* + * trylock for writing -- returns 1 if successful, 0 if contention + */ +int __down_write_trylock(struct rw_semaphore *sem) +{ + unsigned long flags; + int ret = 0; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + if (sem->activity == 0) { + /* got the lock */ + sem->activity = -1; + ret = 1; + } + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + + return ret; +} + +/* + * release a read lock on the semaphore + */ +void __up_read(struct rw_semaphore *sem) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + if (--sem->activity == 0 && !list_empty(&sem->wait_list)) + sem = __rwsem_wake_one_writer(sem); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +} + +/* + * release a write lock on the semaphore + */ +void __up_write(struct rw_semaphore *sem) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + sem->activity = 0; + if (!list_empty(&sem->wait_list)) + sem = __rwsem_do_wake(sem, 1); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +} + +/* + * downgrade a write lock into a read lock + * - just wake up any readers at the front of the queue + */ +void __downgrade_write(struct rw_semaphore *sem) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + sem->activity = 1; + if (!list_empty(&sem->wait_list)) + sem = __rwsem_do_wake(sem, 0); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +} + diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c new file mode 100644 index 000000000000..19c5fa95e0b4 --- /dev/null +++ b/kernel/locking/rwsem-xadd.c @@ -0,0 +1,293 @@ +/* rwsem.c: R/W semaphores: contention handling functions + * + * Written by David Howells (dhowells@redhat.com). + * Derived from arch/i386/kernel/semaphore.c + * + * Writer lock-stealing by Alex Shi + * and Michel Lespinasse + */ +#include +#include +#include +#include + +/* + * Initialize an rwsem: + */ +void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held semaphore: + */ + debug_check_no_locks_freed((void *)sem, sizeof(*sem)); + lockdep_init_map(&sem->dep_map, name, key, 0); +#endif + sem->count = RWSEM_UNLOCKED_VALUE; + raw_spin_lock_init(&sem->wait_lock); + INIT_LIST_HEAD(&sem->wait_list); +} + +EXPORT_SYMBOL(__init_rwsem); + +enum rwsem_waiter_type { + RWSEM_WAITING_FOR_WRITE, + RWSEM_WAITING_FOR_READ +}; + +struct rwsem_waiter { + struct list_head list; + struct task_struct *task; + enum rwsem_waiter_type type; +}; + +enum rwsem_wake_type { + RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ + RWSEM_WAKE_READERS, /* Wake readers only */ + RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ +}; + +/* + * handle the lock release when processes blocked on it that can now run + * - if we come here from up_xxxx(), then: + * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) + * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) + * - there must be someone on the queue + * - the spinlock must be held by the caller + * - woken process blocks are discarded from the list after having task zeroed + * - writers are only woken if downgrading is false + */ +static struct rw_semaphore * +__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) +{ + struct rwsem_waiter *waiter; + struct task_struct *tsk; + struct list_head *next; + long oldcount, woken, loop, adjustment; + + waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); + if (waiter->type == RWSEM_WAITING_FOR_WRITE) { + if (wake_type == RWSEM_WAKE_ANY) + /* Wake writer at the front of the queue, but do not + * grant it the lock yet as we want other writers + * to be able to steal it. Readers, on the other hand, + * will block as they will notice the queued writer. + */ + wake_up_process(waiter->task); + goto out; + } + + /* Writers might steal the lock before we grant it to the next reader. + * We prefer to do the first reader grant before counting readers + * so we can bail out early if a writer stole the lock. + */ + adjustment = 0; + if (wake_type != RWSEM_WAKE_READ_OWNED) { + adjustment = RWSEM_ACTIVE_READ_BIAS; + try_reader_grant: + oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; + if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { + /* A writer stole the lock. Undo our reader grant. */ + if (rwsem_atomic_update(-adjustment, sem) & + RWSEM_ACTIVE_MASK) + goto out; + /* Last active locker left. Retry waking readers. */ + goto try_reader_grant; + } + } + + /* Grant an infinite number of read locks to the readers at the front + * of the queue. Note we increment the 'active part' of the count by + * the number of readers before waking any processes up. + */ + woken = 0; + do { + woken++; + + if (waiter->list.next == &sem->wait_list) + break; + + waiter = list_entry(waiter->list.next, + struct rwsem_waiter, list); + + } while (waiter->type != RWSEM_WAITING_FOR_WRITE); + + adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; + if (waiter->type != RWSEM_WAITING_FOR_WRITE) + /* hit end of list above */ + adjustment -= RWSEM_WAITING_BIAS; + + if (adjustment) + rwsem_atomic_add(adjustment, sem); + + next = sem->wait_list.next; + loop = woken; + do { + waiter = list_entry(next, struct rwsem_waiter, list); + next = waiter->list.next; + tsk = waiter->task; + smp_mb(); + waiter->task = NULL; + wake_up_process(tsk); + put_task_struct(tsk); + } while (--loop); + + sem->wait_list.next = next; + next->prev = &sem->wait_list; + + out: + return sem; +} + +/* + * wait for the read lock to be granted + */ +struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) +{ + long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; + struct rwsem_waiter waiter; + struct task_struct *tsk = current; + + /* set up my own style of waitqueue */ + waiter.task = tsk; + waiter.type = RWSEM_WAITING_FOR_READ; + get_task_struct(tsk); + + raw_spin_lock_irq(&sem->wait_lock); + if (list_empty(&sem->wait_list)) + adjustment += RWSEM_WAITING_BIAS; + list_add_tail(&waiter.list, &sem->wait_list); + + /* we're now waiting on the lock, but no longer actively locking */ + count = rwsem_atomic_update(adjustment, sem); + + /* If there are no active locks, wake the front queued process(es). + * + * If there are no writers and we are first in the queue, + * wake our own waiter to join the existing active readers ! + */ + if (count == RWSEM_WAITING_BIAS || + (count > RWSEM_WAITING_BIAS && + adjustment != -RWSEM_ACTIVE_READ_BIAS)) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); + + raw_spin_unlock_irq(&sem->wait_lock); + + /* wait to be given the lock */ + while (true) { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!waiter.task) + break; + schedule(); + } + + tsk->state = TASK_RUNNING; + + return sem; +} + +/* + * wait until we successfully acquire the write lock + */ +struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) +{ + long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; + struct rwsem_waiter waiter; + struct task_struct *tsk = current; + + /* set up my own style of waitqueue */ + waiter.task = tsk; + waiter.type = RWSEM_WAITING_FOR_WRITE; + + raw_spin_lock_irq(&sem->wait_lock); + if (list_empty(&sem->wait_list)) + adjustment += RWSEM_WAITING_BIAS; + list_add_tail(&waiter.list, &sem->wait_list); + + /* we're now waiting on the lock, but no longer actively locking */ + count = rwsem_atomic_update(adjustment, sem); + + /* If there were already threads queued before us and there are no + * active writers, the lock must be read owned; so we try to wake + * any read locks that were queued ahead of us. */ + if (count > RWSEM_WAITING_BIAS && + adjustment == -RWSEM_ACTIVE_WRITE_BIAS) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); + + /* wait until we successfully acquire the lock */ + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + while (true) { + if (!(count & RWSEM_ACTIVE_MASK)) { + /* Try acquiring the write lock. */ + count = RWSEM_ACTIVE_WRITE_BIAS; + if (!list_is_singular(&sem->wait_list)) + count += RWSEM_WAITING_BIAS; + + if (sem->count == RWSEM_WAITING_BIAS && + cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == + RWSEM_WAITING_BIAS) + break; + } + + raw_spin_unlock_irq(&sem->wait_lock); + + /* Block until there are no active lockers. */ + do { + schedule(); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + } while ((count = sem->count) & RWSEM_ACTIVE_MASK); + + raw_spin_lock_irq(&sem->wait_lock); + } + + list_del(&waiter.list); + raw_spin_unlock_irq(&sem->wait_lock); + tsk->state = TASK_RUNNING; + + return sem; +} + +/* + * handle waking up a waiter on the semaphore + * - up_read/up_write has decremented the active part of count if we come here + */ +struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + /* do nothing if list empty */ + if (!list_empty(&sem->wait_list)) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + + return sem; +} + +/* + * downgrade a write lock into a read lock + * - caller incremented waiting part of count and discovered it still negative + * - just wake up any readers at the front of the queue + */ +struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + /* do nothing if list empty */ + if (!list_empty(&sem->wait_list)) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + + return sem; +} + +EXPORT_SYMBOL(rwsem_down_read_failed); +EXPORT_SYMBOL(rwsem_down_write_failed); +EXPORT_SYMBOL(rwsem_wake); +EXPORT_SYMBOL(rwsem_downgrade_wake); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c new file mode 100644 index 000000000000..cfff1435bdfb --- /dev/null +++ b/kernel/locking/rwsem.c @@ -0,0 +1,157 @@ +/* kernel/rwsem.c: R/W semaphores, public implementation + * + * Written by David Howells (dhowells@redhat.com). + * Derived from asm-i386/semaphore.h + */ + +#include +#include +#include +#include +#include + +#include + +/* + * lock for reading + */ +void __sched down_read(struct rw_semaphore *sem) +{ + might_sleep(); + rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); + + LOCK_CONTENDED(sem, __down_read_trylock, __down_read); +} + +EXPORT_SYMBOL(down_read); + +/* + * trylock for reading -- returns 1 if successful, 0 if contention + */ +int down_read_trylock(struct rw_semaphore *sem) +{ + int ret = __down_read_trylock(sem); + + if (ret == 1) + rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); + return ret; +} + +EXPORT_SYMBOL(down_read_trylock); + +/* + * lock for writing + */ +void __sched down_write(struct rw_semaphore *sem) +{ + might_sleep(); + rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); + + LOCK_CONTENDED(sem, __down_write_trylock, __down_write); +} + +EXPORT_SYMBOL(down_write); + +/* + * trylock for writing -- returns 1 if successful, 0 if contention + */ +int down_write_trylock(struct rw_semaphore *sem) +{ + int ret = __down_write_trylock(sem); + + if (ret == 1) + rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); + return ret; +} + +EXPORT_SYMBOL(down_write_trylock); + +/* + * release a read lock + */ +void up_read(struct rw_semaphore *sem) +{ + rwsem_release(&sem->dep_map, 1, _RET_IP_); + + __up_read(sem); +} + +EXPORT_SYMBOL(up_read); + +/* + * release a write lock + */ +void up_write(struct rw_semaphore *sem) +{ + rwsem_release(&sem->dep_map, 1, _RET_IP_); + + __up_write(sem); +} + +EXPORT_SYMBOL(up_write); + +/* + * downgrade write lock to read lock + */ +void downgrade_write(struct rw_semaphore *sem) +{ + /* + * lockdep: a downgraded write will live on as a write + * dependency. + */ + __downgrade_write(sem); +} + +EXPORT_SYMBOL(downgrade_write); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +void down_read_nested(struct rw_semaphore *sem, int subclass) +{ + might_sleep(); + rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); + + LOCK_CONTENDED(sem, __down_read_trylock, __down_read); +} + +EXPORT_SYMBOL(down_read_nested); + +void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) +{ + might_sleep(); + rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); + + LOCK_CONTENDED(sem, __down_write_trylock, __down_write); +} + +EXPORT_SYMBOL(_down_write_nest_lock); + +void down_read_non_owner(struct rw_semaphore *sem) +{ + might_sleep(); + + __down_read(sem); +} + +EXPORT_SYMBOL(down_read_non_owner); + +void down_write_nested(struct rw_semaphore *sem, int subclass) +{ + might_sleep(); + rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); + + LOCK_CONTENDED(sem, __down_write_trylock, __down_write); +} + +EXPORT_SYMBOL(down_write_nested); + +void up_read_non_owner(struct rw_semaphore *sem) +{ + __up_read(sem); +} + +EXPORT_SYMBOL(up_read_non_owner); + +#endif + + diff --git a/kernel/rwsem.c b/kernel/rwsem.c deleted file mode 100644 index cfff1435bdfb..000000000000 --- a/kernel/rwsem.c +++ /dev/null @@ -1,157 +0,0 @@ -/* kernel/rwsem.c: R/W semaphores, public implementation - * - * Written by David Howells (dhowells@redhat.com). - * Derived from asm-i386/semaphore.h - */ - -#include -#include -#include -#include -#include - -#include - -/* - * lock for reading - */ -void __sched down_read(struct rw_semaphore *sem) -{ - might_sleep(); - rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); - - LOCK_CONTENDED(sem, __down_read_trylock, __down_read); -} - -EXPORT_SYMBOL(down_read); - -/* - * trylock for reading -- returns 1 if successful, 0 if contention - */ -int down_read_trylock(struct rw_semaphore *sem) -{ - int ret = __down_read_trylock(sem); - - if (ret == 1) - rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); - return ret; -} - -EXPORT_SYMBOL(down_read_trylock); - -/* - * lock for writing - */ -void __sched down_write(struct rw_semaphore *sem) -{ - might_sleep(); - rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); - - LOCK_CONTENDED(sem, __down_write_trylock, __down_write); -} - -EXPORT_SYMBOL(down_write); - -/* - * trylock for writing -- returns 1 if successful, 0 if contention - */ -int down_write_trylock(struct rw_semaphore *sem) -{ - int ret = __down_write_trylock(sem); - - if (ret == 1) - rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); - return ret; -} - -EXPORT_SYMBOL(down_write_trylock); - -/* - * release a read lock - */ -void up_read(struct rw_semaphore *sem) -{ - rwsem_release(&sem->dep_map, 1, _RET_IP_); - - __up_read(sem); -} - -EXPORT_SYMBOL(up_read); - -/* - * release a write lock - */ -void up_write(struct rw_semaphore *sem) -{ - rwsem_release(&sem->dep_map, 1, _RET_IP_); - - __up_write(sem); -} - -EXPORT_SYMBOL(up_write); - -/* - * downgrade write lock to read lock - */ -void downgrade_write(struct rw_semaphore *sem) -{ - /* - * lockdep: a downgraded write will live on as a write - * dependency. - */ - __downgrade_write(sem); -} - -EXPORT_SYMBOL(downgrade_write); - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - -void down_read_nested(struct rw_semaphore *sem, int subclass) -{ - might_sleep(); - rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); - - LOCK_CONTENDED(sem, __down_read_trylock, __down_read); -} - -EXPORT_SYMBOL(down_read_nested); - -void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) -{ - might_sleep(); - rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); - - LOCK_CONTENDED(sem, __down_write_trylock, __down_write); -} - -EXPORT_SYMBOL(_down_write_nest_lock); - -void down_read_non_owner(struct rw_semaphore *sem) -{ - might_sleep(); - - __down_read(sem); -} - -EXPORT_SYMBOL(down_read_non_owner); - -void down_write_nested(struct rw_semaphore *sem, int subclass) -{ - might_sleep(); - rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); - - LOCK_CONTENDED(sem, __down_write_trylock, __down_write); -} - -EXPORT_SYMBOL(down_write_nested); - -void up_read_non_owner(struct rw_semaphore *sem) -{ - __up_read(sem); -} - -EXPORT_SYMBOL(up_read_non_owner); - -#endif - - diff --git a/lib/Makefile b/lib/Makefile index bee27e1d3431..ca8cadc8fe4d 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -42,8 +42,6 @@ obj-$(CONFIG_GENERIC_PCI_IOMAP) += pci_iomap.o obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o -lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o -lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS)) diff --git a/lib/rwsem-spinlock.c b/lib/rwsem-spinlock.c deleted file mode 100644 index 9be8a9144978..000000000000 --- a/lib/rwsem-spinlock.c +++ /dev/null @@ -1,296 +0,0 @@ -/* rwsem-spinlock.c: R/W semaphores: contention handling functions for - * generic spinlock implementation - * - * Copyright (c) 2001 David Howells (dhowells@redhat.com). - * - Derived partially from idea by Andrea Arcangeli - * - Derived also from comments by Linus - */ -#include -#include -#include - -enum rwsem_waiter_type { - RWSEM_WAITING_FOR_WRITE, - RWSEM_WAITING_FOR_READ -}; - -struct rwsem_waiter { - struct list_head list; - struct task_struct *task; - enum rwsem_waiter_type type; -}; - -int rwsem_is_locked(struct rw_semaphore *sem) -{ - int ret = 1; - unsigned long flags; - - if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { - ret = (sem->activity != 0); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - } - return ret; -} -EXPORT_SYMBOL(rwsem_is_locked); - -/* - * initialise the semaphore - */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held semaphore: - */ - debug_check_no_locks_freed((void *)sem, sizeof(*sem)); - lockdep_init_map(&sem->dep_map, name, key, 0); -#endif - sem->activity = 0; - raw_spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); -} -EXPORT_SYMBOL(__init_rwsem); - -/* - * handle the lock release when processes blocked on it that can now run - * - if we come here, then: - * - the 'active count' _reached_ zero - * - the 'waiting count' is non-zero - * - the spinlock must be held by the caller - * - woken process blocks are discarded from the list after having task zeroed - * - writers are only woken if wakewrite is non-zero - */ -static inline struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) -{ - struct rwsem_waiter *waiter; - struct task_struct *tsk; - int woken; - - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - - if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - if (wakewrite) - /* Wake up a writer. Note that we do not grant it the - * lock - it will have to acquire it when it runs. */ - wake_up_process(waiter->task); - goto out; - } - - /* grant an infinite number of read locks to the front of the queue */ - woken = 0; - do { - struct list_head *next = waiter->list.next; - - list_del(&waiter->list); - tsk = waiter->task; - smp_mb(); - waiter->task = NULL; - wake_up_process(tsk); - put_task_struct(tsk); - woken++; - if (next == &sem->wait_list) - break; - waiter = list_entry(next, struct rwsem_waiter, list); - } while (waiter->type != RWSEM_WAITING_FOR_WRITE); - - sem->activity += woken; - - out: - return sem; -} - -/* - * wake a single writer - */ -static inline struct rw_semaphore * -__rwsem_wake_one_writer(struct rw_semaphore *sem) -{ - struct rwsem_waiter *waiter; - - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - wake_up_process(waiter->task); - - return sem; -} - -/* - * get a read lock on the semaphore - */ -void __sched __down_read(struct rw_semaphore *sem) -{ - struct rwsem_waiter waiter; - struct task_struct *tsk; - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->activity >= 0 && list_empty(&sem->wait_list)) { - /* granted */ - sem->activity++; - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - goto out; - } - - tsk = current; - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - - /* set up my own style of waitqueue */ - waiter.task = tsk; - waiter.type = RWSEM_WAITING_FOR_READ; - get_task_struct(tsk); - - list_add_tail(&waiter.list, &sem->wait_list); - - /* we don't need to touch the semaphore struct anymore */ - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - /* wait to be given the lock */ - for (;;) { - if (!waiter.task) - break; - schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - } - - tsk->state = TASK_RUNNING; - out: - ; -} - -/* - * trylock for reading -- returns 1 if successful, 0 if contention - */ -int __down_read_trylock(struct rw_semaphore *sem) -{ - unsigned long flags; - int ret = 0; - - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->activity >= 0 && list_empty(&sem->wait_list)) { - /* granted */ - sem->activity++; - ret = 1; - } - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return ret; -} - -/* - * get a write lock on the semaphore - */ -void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) -{ - struct rwsem_waiter waiter; - struct task_struct *tsk; - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - /* set up my own style of waitqueue */ - tsk = current; - waiter.task = tsk; - waiter.type = RWSEM_WAITING_FOR_WRITE; - list_add_tail(&waiter.list, &sem->wait_list); - - /* wait for someone to release the lock */ - for (;;) { - /* - * That is the key to support write lock stealing: allows the - * task already on CPU to get the lock soon rather than put - * itself into sleep and waiting for system woke it or someone - * else in the head of the wait list up. - */ - if (sem->activity == 0) - break; - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - schedule(); - raw_spin_lock_irqsave(&sem->wait_lock, flags); - } - /* got the lock */ - sem->activity = -1; - list_del(&waiter.list); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -void __sched __down_write(struct rw_semaphore *sem) -{ - __down_write_nested(sem, 0); -} - -/* - * trylock for writing -- returns 1 if successful, 0 if contention - */ -int __down_write_trylock(struct rw_semaphore *sem) -{ - unsigned long flags; - int ret = 0; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->activity == 0) { - /* got the lock */ - sem->activity = -1; - ret = 1; - } - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return ret; -} - -/* - * release a read lock on the semaphore - */ -void __up_read(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (--sem->activity == 0 && !list_empty(&sem->wait_list)) - sem = __rwsem_wake_one_writer(sem); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -/* - * release a write lock on the semaphore - */ -void __up_write(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - sem->activity = 0; - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 1); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -/* - * downgrade a write lock into a read lock - * - just wake up any readers at the front of the queue - */ -void __downgrade_write(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - sem->activity = 1; - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 0); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - diff --git a/lib/rwsem.c b/lib/rwsem.c deleted file mode 100644 index 19c5fa95e0b4..000000000000 --- a/lib/rwsem.c +++ /dev/null @@ -1,293 +0,0 @@ -/* rwsem.c: R/W semaphores: contention handling functions - * - * Written by David Howells (dhowells@redhat.com). - * Derived from arch/i386/kernel/semaphore.c - * - * Writer lock-stealing by Alex Shi - * and Michel Lespinasse - */ -#include -#include -#include -#include - -/* - * Initialize an rwsem: - */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held semaphore: - */ - debug_check_no_locks_freed((void *)sem, sizeof(*sem)); - lockdep_init_map(&sem->dep_map, name, key, 0); -#endif - sem->count = RWSEM_UNLOCKED_VALUE; - raw_spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); -} - -EXPORT_SYMBOL(__init_rwsem); - -enum rwsem_waiter_type { - RWSEM_WAITING_FOR_WRITE, - RWSEM_WAITING_FOR_READ -}; - -struct rwsem_waiter { - struct list_head list; - struct task_struct *task; - enum rwsem_waiter_type type; -}; - -enum rwsem_wake_type { - RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ - RWSEM_WAKE_READERS, /* Wake readers only */ - RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ -}; - -/* - * handle the lock release when processes blocked on it that can now run - * - if we come here from up_xxxx(), then: - * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) - * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) - * - there must be someone on the queue - * - the spinlock must be held by the caller - * - woken process blocks are discarded from the list after having task zeroed - * - writers are only woken if downgrading is false - */ -static struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) -{ - struct rwsem_waiter *waiter; - struct task_struct *tsk; - struct list_head *next; - long oldcount, woken, loop, adjustment; - - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - if (wake_type == RWSEM_WAKE_ANY) - /* Wake writer at the front of the queue, but do not - * grant it the lock yet as we want other writers - * to be able to steal it. Readers, on the other hand, - * will block as they will notice the queued writer. - */ - wake_up_process(waiter->task); - goto out; - } - - /* Writers might steal the lock before we grant it to the next reader. - * We prefer to do the first reader grant before counting readers - * so we can bail out early if a writer stole the lock. - */ - adjustment = 0; - if (wake_type != RWSEM_WAKE_READ_OWNED) { - adjustment = RWSEM_ACTIVE_READ_BIAS; - try_reader_grant: - oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; - if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { - /* A writer stole the lock. Undo our reader grant. */ - if (rwsem_atomic_update(-adjustment, sem) & - RWSEM_ACTIVE_MASK) - goto out; - /* Last active locker left. Retry waking readers. */ - goto try_reader_grant; - } - } - - /* Grant an infinite number of read locks to the readers at the front - * of the queue. Note we increment the 'active part' of the count by - * the number of readers before waking any processes up. - */ - woken = 0; - do { - woken++; - - if (waiter->list.next == &sem->wait_list) - break; - - waiter = list_entry(waiter->list.next, - struct rwsem_waiter, list); - - } while (waiter->type != RWSEM_WAITING_FOR_WRITE); - - adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; - if (waiter->type != RWSEM_WAITING_FOR_WRITE) - /* hit end of list above */ - adjustment -= RWSEM_WAITING_BIAS; - - if (adjustment) - rwsem_atomic_add(adjustment, sem); - - next = sem->wait_list.next; - loop = woken; - do { - waiter = list_entry(next, struct rwsem_waiter, list); - next = waiter->list.next; - tsk = waiter->task; - smp_mb(); - waiter->task = NULL; - wake_up_process(tsk); - put_task_struct(tsk); - } while (--loop); - - sem->wait_list.next = next; - next->prev = &sem->wait_list; - - out: - return sem; -} - -/* - * wait for the read lock to be granted - */ -struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) -{ - long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; - struct rwsem_waiter waiter; - struct task_struct *tsk = current; - - /* set up my own style of waitqueue */ - waiter.task = tsk; - waiter.type = RWSEM_WAITING_FOR_READ; - get_task_struct(tsk); - - raw_spin_lock_irq(&sem->wait_lock); - if (list_empty(&sem->wait_list)) - adjustment += RWSEM_WAITING_BIAS; - list_add_tail(&waiter.list, &sem->wait_list); - - /* we're now waiting on the lock, but no longer actively locking */ - count = rwsem_atomic_update(adjustment, sem); - - /* If there are no active locks, wake the front queued process(es). - * - * If there are no writers and we are first in the queue, - * wake our own waiter to join the existing active readers ! - */ - if (count == RWSEM_WAITING_BIAS || - (count > RWSEM_WAITING_BIAS && - adjustment != -RWSEM_ACTIVE_READ_BIAS)) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); - - raw_spin_unlock_irq(&sem->wait_lock); - - /* wait to be given the lock */ - while (true) { - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - if (!waiter.task) - break; - schedule(); - } - - tsk->state = TASK_RUNNING; - - return sem; -} - -/* - * wait until we successfully acquire the write lock - */ -struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) -{ - long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; - struct rwsem_waiter waiter; - struct task_struct *tsk = current; - - /* set up my own style of waitqueue */ - waiter.task = tsk; - waiter.type = RWSEM_WAITING_FOR_WRITE; - - raw_spin_lock_irq(&sem->wait_lock); - if (list_empty(&sem->wait_list)) - adjustment += RWSEM_WAITING_BIAS; - list_add_tail(&waiter.list, &sem->wait_list); - - /* we're now waiting on the lock, but no longer actively locking */ - count = rwsem_atomic_update(adjustment, sem); - - /* If there were already threads queued before us and there are no - * active writers, the lock must be read owned; so we try to wake - * any read locks that were queued ahead of us. */ - if (count > RWSEM_WAITING_BIAS && - adjustment == -RWSEM_ACTIVE_WRITE_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); - - /* wait until we successfully acquire the lock */ - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - while (true) { - if (!(count & RWSEM_ACTIVE_MASK)) { - /* Try acquiring the write lock. */ - count = RWSEM_ACTIVE_WRITE_BIAS; - if (!list_is_singular(&sem->wait_list)) - count += RWSEM_WAITING_BIAS; - - if (sem->count == RWSEM_WAITING_BIAS && - cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == - RWSEM_WAITING_BIAS) - break; - } - - raw_spin_unlock_irq(&sem->wait_lock); - - /* Block until there are no active lockers. */ - do { - schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - } while ((count = sem->count) & RWSEM_ACTIVE_MASK); - - raw_spin_lock_irq(&sem->wait_lock); - } - - list_del(&waiter.list); - raw_spin_unlock_irq(&sem->wait_lock); - tsk->state = TASK_RUNNING; - - return sem; -} - -/* - * handle waking up a waiter on the semaphore - * - up_read/up_write has decremented the active part of count if we come here - */ -struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - /* do nothing if list empty */ - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return sem; -} - -/* - * downgrade a write lock into a read lock - * - caller incremented waiting part of count and discovered it still negative - * - just wake up any readers at the front of the queue - */ -struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - /* do nothing if list empty */ - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return sem; -} - -EXPORT_SYMBOL(rwsem_down_read_failed); -EXPORT_SYMBOL(rwsem_down_write_failed); -EXPORT_SYMBOL(rwsem_wake); -EXPORT_SYMBOL(rwsem_downgrade_wake); -- cgit v1.2.3 From 32cf7c3c94623514eb882addae307212c1507239 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Nov 2013 18:05:09 +0100 Subject: locking: Move the percpu-rwsem code to kernel/locking/ Suggested-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-52bjmtty46we26hbfd9sc9iy@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/Makefile | 3 +- kernel/locking/percpu-rwsem.c | 165 ++++++++++++++++++++++++++++++++++++++++++ lib/Makefile | 1 - lib/percpu-rwsem.c | 165 ------------------------------------------ 4 files changed, 167 insertions(+), 167 deletions(-) create mode 100644 kernel/locking/percpu-rwsem.c delete mode 100644 lib/percpu-rwsem.c (limited to 'lib') diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index bdd313a3411d..baab8e5e7f66 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -21,4 +21,5 @@ obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o -obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o \ No newline at end of file +obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o +obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c new file mode 100644 index 000000000000..652a8ee8efe9 --- /dev/null +++ b/kernel/locking/percpu-rwsem.c @@ -0,0 +1,165 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, + const char *name, struct lock_class_key *rwsem_key) +{ + brw->fast_read_ctr = alloc_percpu(int); + if (unlikely(!brw->fast_read_ctr)) + return -ENOMEM; + + /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ + __init_rwsem(&brw->rw_sem, name, rwsem_key); + atomic_set(&brw->write_ctr, 0); + atomic_set(&brw->slow_read_ctr, 0); + init_waitqueue_head(&brw->write_waitq); + return 0; +} + +void percpu_free_rwsem(struct percpu_rw_semaphore *brw) +{ + free_percpu(brw->fast_read_ctr); + brw->fast_read_ctr = NULL; /* catch use after free bugs */ +} + +/* + * This is the fast-path for down_read/up_read, it only needs to ensure + * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the + * fast per-cpu counter. The writer uses synchronize_sched_expedited() to + * serialize with the preempt-disabled section below. + * + * The nontrivial part is that we should guarantee acquire/release semantics + * in case when + * + * R_W: down_write() comes after up_read(), the writer should see all + * changes done by the reader + * or + * W_R: down_read() comes after up_write(), the reader should see all + * changes done by the writer + * + * If this helper fails the callers rely on the normal rw_semaphore and + * atomic_dec_and_test(), so in this case we have the necessary barriers. + * + * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or + * __this_cpu_add() below can be reordered with any LOAD/STORE done by the + * reader inside the critical section. See the comments in down_write and + * up_write below. + */ +static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) +{ + bool success = false; + + preempt_disable(); + if (likely(!atomic_read(&brw->write_ctr))) { + __this_cpu_add(*brw->fast_read_ctr, val); + success = true; + } + preempt_enable(); + + return success; +} + +/* + * Like the normal down_read() this is not recursive, the writer can + * come after the first percpu_down_read() and create the deadlock. + * + * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep, + * percpu_up_read() does rwsem_release(). This pairs with the usage + * of ->rw_sem in percpu_down/up_write(). + */ +void percpu_down_read(struct percpu_rw_semaphore *brw) +{ + might_sleep(); + if (likely(update_fast_ctr(brw, +1))) { + rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); + return; + } + + down_read(&brw->rw_sem); + atomic_inc(&brw->slow_read_ctr); + /* avoid up_read()->rwsem_release() */ + __up_read(&brw->rw_sem); +} + +void percpu_up_read(struct percpu_rw_semaphore *brw) +{ + rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); + + if (likely(update_fast_ctr(brw, -1))) + return; + + /* false-positive is possible but harmless */ + if (atomic_dec_and_test(&brw->slow_read_ctr)) + wake_up_all(&brw->write_waitq); +} + +static int clear_fast_ctr(struct percpu_rw_semaphore *brw) +{ + unsigned int sum = 0; + int cpu; + + for_each_possible_cpu(cpu) { + sum += per_cpu(*brw->fast_read_ctr, cpu); + per_cpu(*brw->fast_read_ctr, cpu) = 0; + } + + return sum; +} + +/* + * A writer increments ->write_ctr to force the readers to switch to the + * slow mode, note the atomic_read() check in update_fast_ctr(). + * + * After that the readers can only inc/dec the slow ->slow_read_ctr counter, + * ->fast_read_ctr is stable. Once the writer moves its sum into the slow + * counter it represents the number of active readers. + * + * Finally the writer takes ->rw_sem for writing and blocks the new readers, + * then waits until the slow counter becomes zero. + */ +void percpu_down_write(struct percpu_rw_semaphore *brw) +{ + /* tell update_fast_ctr() there is a pending writer */ + atomic_inc(&brw->write_ctr); + /* + * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read + * so that update_fast_ctr() can't succeed. + * + * 2. Ensures we see the result of every previous this_cpu_add() in + * update_fast_ctr(). + * + * 3. Ensures that if any reader has exited its critical section via + * fast-path, it executes a full memory barrier before we return. + * See R_W case in the comment above update_fast_ctr(). + */ + synchronize_sched_expedited(); + + /* exclude other writers, and block the new readers completely */ + down_write(&brw->rw_sem); + + /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */ + atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr); + + /* wait for all readers to complete their percpu_up_read() */ + wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); +} + +void percpu_up_write(struct percpu_rw_semaphore *brw) +{ + /* release the lock, but the readers can't use the fast-path */ + up_write(&brw->rw_sem); + /* + * Insert the barrier before the next fast-path in down_read, + * see W_R case in the comment above update_fast_ctr(). + */ + synchronize_sched_expedited(); + /* the last writer unblocks update_fast_ctr() */ + atomic_dec(&brw->write_ctr); +} diff --git a/lib/Makefile b/lib/Makefile index ca8cadc8fe4d..8d3c62936c8b 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -42,7 +42,6 @@ obj-$(CONFIG_GENERIC_PCI_IOMAP) += pci_iomap.o obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o -lib-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS)) obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o diff --git a/lib/percpu-rwsem.c b/lib/percpu-rwsem.c deleted file mode 100644 index 652a8ee8efe9..000000000000 --- a/lib/percpu-rwsem.c +++ /dev/null @@ -1,165 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, - const char *name, struct lock_class_key *rwsem_key) -{ - brw->fast_read_ctr = alloc_percpu(int); - if (unlikely(!brw->fast_read_ctr)) - return -ENOMEM; - - /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ - __init_rwsem(&brw->rw_sem, name, rwsem_key); - atomic_set(&brw->write_ctr, 0); - atomic_set(&brw->slow_read_ctr, 0); - init_waitqueue_head(&brw->write_waitq); - return 0; -} - -void percpu_free_rwsem(struct percpu_rw_semaphore *brw) -{ - free_percpu(brw->fast_read_ctr); - brw->fast_read_ctr = NULL; /* catch use after free bugs */ -} - -/* - * This is the fast-path for down_read/up_read, it only needs to ensure - * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the - * fast per-cpu counter. The writer uses synchronize_sched_expedited() to - * serialize with the preempt-disabled section below. - * - * The nontrivial part is that we should guarantee acquire/release semantics - * in case when - * - * R_W: down_write() comes after up_read(), the writer should see all - * changes done by the reader - * or - * W_R: down_read() comes after up_write(), the reader should see all - * changes done by the writer - * - * If this helper fails the callers rely on the normal rw_semaphore and - * atomic_dec_and_test(), so in this case we have the necessary barriers. - * - * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or - * __this_cpu_add() below can be reordered with any LOAD/STORE done by the - * reader inside the critical section. See the comments in down_write and - * up_write below. - */ -static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) -{ - bool success = false; - - preempt_disable(); - if (likely(!atomic_read(&brw->write_ctr))) { - __this_cpu_add(*brw->fast_read_ctr, val); - success = true; - } - preempt_enable(); - - return success; -} - -/* - * Like the normal down_read() this is not recursive, the writer can - * come after the first percpu_down_read() and create the deadlock. - * - * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep, - * percpu_up_read() does rwsem_release(). This pairs with the usage - * of ->rw_sem in percpu_down/up_write(). - */ -void percpu_down_read(struct percpu_rw_semaphore *brw) -{ - might_sleep(); - if (likely(update_fast_ctr(brw, +1))) { - rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); - return; - } - - down_read(&brw->rw_sem); - atomic_inc(&brw->slow_read_ctr); - /* avoid up_read()->rwsem_release() */ - __up_read(&brw->rw_sem); -} - -void percpu_up_read(struct percpu_rw_semaphore *brw) -{ - rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); - - if (likely(update_fast_ctr(brw, -1))) - return; - - /* false-positive is possible but harmless */ - if (atomic_dec_and_test(&brw->slow_read_ctr)) - wake_up_all(&brw->write_waitq); -} - -static int clear_fast_ctr(struct percpu_rw_semaphore *brw) -{ - unsigned int sum = 0; - int cpu; - - for_each_possible_cpu(cpu) { - sum += per_cpu(*brw->fast_read_ctr, cpu); - per_cpu(*brw->fast_read_ctr, cpu) = 0; - } - - return sum; -} - -/* - * A writer increments ->write_ctr to force the readers to switch to the - * slow mode, note the atomic_read() check in update_fast_ctr(). - * - * After that the readers can only inc/dec the slow ->slow_read_ctr counter, - * ->fast_read_ctr is stable. Once the writer moves its sum into the slow - * counter it represents the number of active readers. - * - * Finally the writer takes ->rw_sem for writing and blocks the new readers, - * then waits until the slow counter becomes zero. - */ -void percpu_down_write(struct percpu_rw_semaphore *brw) -{ - /* tell update_fast_ctr() there is a pending writer */ - atomic_inc(&brw->write_ctr); - /* - * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read - * so that update_fast_ctr() can't succeed. - * - * 2. Ensures we see the result of every previous this_cpu_add() in - * update_fast_ctr(). - * - * 3. Ensures that if any reader has exited its critical section via - * fast-path, it executes a full memory barrier before we return. - * See R_W case in the comment above update_fast_ctr(). - */ - synchronize_sched_expedited(); - - /* exclude other writers, and block the new readers completely */ - down_write(&brw->rw_sem); - - /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */ - atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr); - - /* wait for all readers to complete their percpu_up_read() */ - wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); -} - -void percpu_up_write(struct percpu_rw_semaphore *brw) -{ - /* release the lock, but the readers can't use the fast-path */ - up_write(&brw->rw_sem); - /* - * Insert the barrier before the next fast-path in down_read, - * see W_R case in the comment above update_fast_ctr(). - */ - synchronize_sched_expedited(); - /* the last writer unblocks update_fast_ctr() */ - atomic_dec(&brw->write_ctr); -} -- cgit v1.2.3 From a1212d278c05ca0a38f5cbd7ae90ac2e367228a8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 7 Nov 2013 20:47:28 +0900 Subject: Revert "sysfs: drop kobj_ns_type handling" This reverts commit cb26a311578e67769e92a39a0a63476533cb7e12. It mysteriously causes NetworkManager to not find the wireless device for me. As far as I can tell, Tejun *meant* for this commit to not make any semantic changes, but there clearly are some. So revert it, taking into account some of the calling convention changes that happened in this area in subsequent commits. Cc: Tejun Heo Cc: Greg Kroah-Hartman Signed-off-by: Linus Torvalds --- fs/sysfs/dir.c | 88 +++++++++++++++++++++++++++++++++++++++--------------- fs/sysfs/mount.c | 24 +++++++++++---- fs/sysfs/symlink.c | 27 +++++++++++++---- fs/sysfs/sysfs.h | 25 +++++++++++----- lib/kobject.c | 5 +--- 5 files changed, 122 insertions(+), 47 deletions(-) (limited to 'lib') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 08c66969d52a..5e73d6626e50 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -111,11 +111,6 @@ static int sysfs_link_sibling(struct sysfs_dirent *sd) /* add new node and rebalance the tree */ rb_link_node(&sd->s_rb, parent, node); rb_insert_color(&sd->s_rb, &sd->s_parent->s_dir.children); - - /* if @sd has ns tag, mark the parent to enable ns filtering */ - if (sd->s_ns) - sd->s_parent->s_flags |= SYSFS_FLAG_HAS_NS; - return 0; } @@ -135,13 +130,6 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd) sd->s_parent->s_dir.subdirs--; rb_erase(&sd->s_rb, &sd->s_parent->s_dir.children); - - /* - * Either all or none of the children have tags. Clearing HAS_NS - * when there's no child left is enough to keep the flag synced. - */ - if (RB_EMPTY_ROOT(&sd->s_parent->s_dir.children)) - sd->s_parent->s_flags &= ~SYSFS_FLAG_HAS_NS; } /** @@ -291,6 +279,7 @@ static int sysfs_dentry_delete(const struct dentry *dentry) static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags) { struct sysfs_dirent *sd; + int type; if (flags & LOOKUP_RCU) return -ECHILD; @@ -311,8 +300,13 @@ static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags) goto out_bad; /* The sysfs dirent has been moved to a different namespace */ - if (sd->s_ns && sd->s_ns != sysfs_info(dentry->d_sb)->ns) - goto out_bad; + type = KOBJ_NS_TYPE_NONE; + if (sd->s_parent) { + type = sysfs_ns_type(sd->s_parent); + if (type != KOBJ_NS_TYPE_NONE && + sysfs_info(dentry->d_sb)->ns[type] != sd->s_ns) + goto out_bad; + } mutex_unlock(&sysfs_mutex); out_valid: @@ -432,6 +426,13 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, struct sysfs_inode_attrs *ps_iattr; int ret; + if (!!sysfs_ns_type(parent_sd) != !!sd->s_ns) { + WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", + sysfs_ns_type(parent_sd) ? "required" : "invalid", + parent_sd->s_name, sd->s_name); + return -EINVAL; + } + sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns); sd->s_parent = sysfs_get(parent_sd); @@ -611,6 +612,13 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, struct rb_node *node = parent_sd->s_dir.children.rb_node; unsigned int hash; + if (!!sysfs_ns_type(parent_sd) != !!ns) { + WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", + sysfs_ns_type(parent_sd) ? "required" : "invalid", + parent_sd->s_name, name); + return NULL; + } + hash = sysfs_name_hash(name, ns); while (node) { struct sysfs_dirent *sd; @@ -659,6 +667,7 @@ struct sysfs_dirent *sysfs_get_dirent_ns(struct sysfs_dirent *parent_sd, EXPORT_SYMBOL_GPL(sysfs_get_dirent_ns); static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, + enum kobj_ns_type type, const char *name, const void *ns, struct sysfs_dirent **p_sd) { @@ -672,6 +681,7 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, if (!sd) return -ENOMEM; + sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT); sd->s_ns = ns; sd->s_dir.kobj = kobj; @@ -691,7 +701,33 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, int sysfs_create_subdir(struct kobject *kobj, const char *name, struct sysfs_dirent **p_sd) { - return create_dir(kobj, kobj->sd, name, NULL, p_sd); + return create_dir(kobj, kobj->sd, + KOBJ_NS_TYPE_NONE, name, NULL, p_sd); +} + +/** + * sysfs_read_ns_type: return associated ns_type + * @kobj: the kobject being queried + * + * Each kobject can be tagged with exactly one namespace type + * (i.e. network or user). Return the ns_type associated with + * this object if any + */ +static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj) +{ + const struct kobj_ns_type_operations *ops; + enum kobj_ns_type type; + + ops = kobj_child_ns_ops(kobj); + if (!ops) + return KOBJ_NS_TYPE_NONE; + + type = ops->type; + BUG_ON(type <= KOBJ_NS_TYPE_NONE); + BUG_ON(type >= KOBJ_NS_TYPES); + BUG_ON(!kobj_ns_type_registered(type)); + + return type; } /** @@ -701,6 +737,7 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name, */ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) { + enum kobj_ns_type type; struct sysfs_dirent *parent_sd, *sd; int error = 0; @@ -714,7 +751,9 @@ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) if (!parent_sd) return -ENOENT; - error = create_dir(kobj, parent_sd, kobject_name(kobj), ns, &sd); + type = sysfs_read_ns_type(kobj); + + error = create_dir(kobj, parent_sd, type, kobject_name(kobj), ns, &sd); if (!error) kobj->sd = sd; return error; @@ -728,12 +767,13 @@ static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry, struct sysfs_dirent *parent_sd = parent->d_fsdata; struct sysfs_dirent *sd; struct inode *inode; - const void *ns = NULL; + enum kobj_ns_type type; + const void *ns; mutex_lock(&sysfs_mutex); - if (parent_sd->s_flags & SYSFS_FLAG_HAS_NS) - ns = sysfs_info(dir->i_sb)->ns; + type = sysfs_ns_type(parent_sd); + ns = sysfs_info(dir->i_sb)->ns[type]; sd = sysfs_find_dirent(parent_sd, dentry->d_name.name, ns); @@ -1056,15 +1096,15 @@ static int sysfs_readdir(struct file *file, struct dir_context *ctx) struct dentry *dentry = file->f_path.dentry; struct sysfs_dirent *parent_sd = dentry->d_fsdata; struct sysfs_dirent *pos = file->private_data; - const void *ns = NULL; + enum kobj_ns_type type; + const void *ns; + + type = sysfs_ns_type(parent_sd); + ns = sysfs_info(dentry->d_sb)->ns[type]; if (!dir_emit_dots(file, ctx)) return 0; mutex_lock(&sysfs_mutex); - - if (parent_sd->s_flags & SYSFS_FLAG_HAS_NS) - ns = sysfs_info(dentry->d_sb)->ns; - for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos); pos; pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) { diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 8c24bce2f4ae..834ec2cdb7a3 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -36,7 +36,7 @@ static const struct super_operations sysfs_ops = { struct sysfs_dirent sysfs_root = { .s_name = "", .s_count = ATOMIC_INIT(1), - .s_flags = SYSFS_DIR, + .s_flags = SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT), .s_mode = S_IFDIR | S_IRUGO | S_IXUGO, .s_ino = 1, }; @@ -77,8 +77,14 @@ static int sysfs_test_super(struct super_block *sb, void *data) { struct sysfs_super_info *sb_info = sysfs_info(sb); struct sysfs_super_info *info = data; + enum kobj_ns_type type; + int found = 1; - return sb_info->ns == info->ns; + for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) { + if (sb_info->ns[type] != info->ns[type]) + found = 0; + } + return found; } static int sysfs_set_super(struct super_block *sb, void *data) @@ -92,7 +98,9 @@ static int sysfs_set_super(struct super_block *sb, void *data) static void free_sysfs_super_info(struct sysfs_super_info *info) { - kobj_ns_drop(KOBJ_NS_TYPE_NET, info->ns); + int type; + for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) + kobj_ns_drop(type, info->ns[type]); kfree(info); } @@ -100,6 +108,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { struct sysfs_super_info *info; + enum kobj_ns_type type; struct super_block *sb; int error; @@ -107,15 +116,18 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type)) return ERR_PTR(-EPERM); - if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET)) - return ERR_PTR(-EPERM); + for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) { + if (!kobj_ns_current_may_mount(type)) + return ERR_PTR(-EPERM); + } } info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return ERR_PTR(-ENOMEM); - info->ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); + for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) + info->ns[type] = kobj_ns_grab_current(type); sb = sget(fs_type, sysfs_test_super, sysfs_set_super, flags, info); if (IS_ERR(sb) || sb->s_fs_info != info) diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 1a23681b8179..3ae3f1bf1a09 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -28,6 +28,7 @@ static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd, struct sysfs_dirent *target_sd = NULL; struct sysfs_dirent *sd = NULL; struct sysfs_addrm_cxt acxt; + enum kobj_ns_type ns_type; int error; BUG_ON(!name || !parent_sd); @@ -51,15 +52,29 @@ static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd, if (!sd) goto out_put; - sd->s_ns = target_sd->s_ns; + ns_type = sysfs_ns_type(parent_sd); + if (ns_type) + sd->s_ns = target_sd->s_ns; sd->s_symlink.target_sd = target_sd; target_sd = NULL; /* reference is now owned by the symlink */ sysfs_addrm_start(&acxt); - if (warn) - error = sysfs_add_one(&acxt, sd, parent_sd); - else - error = __sysfs_add_one(&acxt, sd, parent_sd); + /* Symlinks must be between directories with the same ns_type */ + if (!ns_type || + (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) { + if (warn) + error = sysfs_add_one(&acxt, sd, parent_sd); + else + error = __sysfs_add_one(&acxt, sd, parent_sd); + } else { + error = -EINVAL; + WARN(1, KERN_WARNING + "sysfs: symlink across ns_types %s/%s -> %s/%s\n", + parent_sd->s_name, + sd->s_name, + sd->s_symlink.target_sd->s_parent->s_name, + sd->s_symlink.target_sd->s_name); + } sysfs_addrm_finish(&acxt); if (error) @@ -149,7 +164,7 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, * sysfs_remove_dir() for details. */ spin_lock(&sysfs_symlink_target_lock); - if (targ->sd) + if (targ->sd && sysfs_ns_type(kobj->sd)) ns = targ->sd->s_ns; spin_unlock(&sysfs_symlink_target_lock); sysfs_hash_and_remove(kobj->sd, name, ns); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index e3aea92ebfa3..0af09fbfb3f6 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -90,8 +90,11 @@ struct sysfs_dirent { #define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) #define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR) -#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK -#define SYSFS_FLAG_HAS_NS 0x01000 +/* identify any namespace tag on sysfs_dirents */ +#define SYSFS_NS_TYPE_MASK 0xf00 +#define SYSFS_NS_TYPE_SHIFT 8 + +#define SYSFS_FLAG_MASK ~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK) #define SYSFS_FLAG_REMOVED 0x02000 static inline unsigned int sysfs_type(struct sysfs_dirent *sd) @@ -99,6 +102,15 @@ static inline unsigned int sysfs_type(struct sysfs_dirent *sd) return sd->s_flags & SYSFS_TYPE_MASK; } +/* + * Return any namespace tags on this dirent. + * enum kobj_ns_type is defined in linux/kobject.h + */ +static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd) +{ + return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT; +} + #ifdef CONFIG_DEBUG_LOCK_ALLOC #define sysfs_dirent_init_lockdep(sd) \ @@ -143,13 +155,12 @@ struct sysfs_addrm_cxt { */ /* - * Each sb is associated with one namespace tag, currently the network - * namespace of the task which mounted this sysfs instance. If multiple - * tags become necessary, make the following an array and compare - * sysfs_dirent tag against every entry. + * Each sb is associated with a set of namespace tags (i.e. + * the network namespace of the task which mounted this sysfs + * instance). */ struct sysfs_super_info { - void *ns; + void *ns[KOBJ_NS_TYPES]; }; #define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info)) extern struct sysfs_dirent sysfs_root; diff --git a/lib/kobject.c b/lib/kobject.c index 7a1c203083eb..5b4b8886435e 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -30,14 +30,11 @@ const void *kobject_namespace(struct kobject *kobj) { const struct kobj_ns_type_operations *ns_ops = kobj_ns_ops(kobj); - const void *ns; if (!ns_ops || ns_ops->type == KOBJ_NS_TYPE_NONE) return NULL; - ns = kobj->ktype->namespace(kobj); - WARN_ON(!ns); /* @kobj in a namespace is required to have !NULL tag */ - return ns; + return kobj->ktype->namespace(kobj); } /* -- cgit v1.2.3 From c9e8d128fe316751230ee0c53478740c64f58436 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Thu, 7 Nov 2013 10:45:35 -0800 Subject: percpu-refcount: Add EXPORT_SYMBOL to use percpu_ref from modules This patch adds EXPORT_SYMBOL() for percpu_ref_init(), percpu_ref_cancel_init() and percpu_ref_kill_and_confirm() so that percpu refcounting can be used by external modules. Cc: Kent Overstreet Signed-off-by: Nicholas Bellinger --- lib/percpu-refcount.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'lib') diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 7deeb6297a48..25b9ac7b93e5 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -53,6 +53,7 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release) ref->release = release; return 0; } +EXPORT_SYMBOL(percpu_ref_init); /** * percpu_ref_cancel_init - cancel percpu_ref_init() @@ -84,6 +85,7 @@ void percpu_ref_cancel_init(struct percpu_ref *ref) free_percpu(ref->pcpu_count); } } +EXPORT_SYMBOL(percpu_ref_cancel_init); static void percpu_ref_kill_rcu(struct rcu_head *rcu) { @@ -156,3 +158,4 @@ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu); } +EXPORT_SYMBOL(percpu_ref_kill_and_confirm); -- cgit v1.2.3 From 51c37a70aaa3f95773af560e6db3073520513912 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 11 Nov 2013 12:20:32 +0100 Subject: random32: fix off-by-one in seeding requirement For properly initialising the Tausworthe generator [1], we have a strict seeding requirement, that is, s1 > 1, s2 > 7, s3 > 15. Commit 697f8d0348 ("random32: seeding improvement") introduced a __seed() function that imposes boundary checks proposed by the errata paper [2] to properly ensure above conditions. However, we're off by one, as the function is implemented as: "return (x < m) ? x + m : x;", and called with __seed(X, 1), __seed(X, 7), __seed(X, 15). Thus, an unwanted seed of 1, 7, 15 would be possible, whereas the lower boundary should actually be of at least 2, 8, 16, just as GSL does. Fix this, as otherwise an initialization with an unwanted seed could have the effect that Tausworthe's PRNG properties cannot not be ensured. Note that this PRNG is *not* used for cryptography in the kernel. [1] http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme.ps [2] http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme2.ps Joint work with Hannes Frederic Sowa. Fixes: 697f8d0348a6 ("random32: seeding improvement") Cc: Stephen Hemminger Cc: Florian Weimer Cc: Theodore Ts'o Signed-off-by: Daniel Borkmann Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/linux/random.h | 6 +++--- lib/random32.c | 14 +++++++------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'lib') diff --git a/include/linux/random.h b/include/linux/random.h index 6312dd9ba449..bf9085e89fb5 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -50,9 +50,9 @@ static inline void prandom_seed_state(struct rnd_state *state, u64 seed) { u32 i = (seed >> 32) ^ (seed << 10) ^ seed; - state->s1 = __seed(i, 1); - state->s2 = __seed(i, 7); - state->s3 = __seed(i, 15); + state->s1 = __seed(i, 2); + state->s2 = __seed(i, 8); + state->s3 = __seed(i, 16); } #ifdef CONFIG_ARCH_RANDOM diff --git a/lib/random32.c b/lib/random32.c index 52280d5526be..01e8890d1089 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -141,7 +141,7 @@ void prandom_seed(u32 entropy) */ for_each_possible_cpu (i) { struct rnd_state *state = &per_cpu(net_rand_state, i); - state->s1 = __seed(state->s1 ^ entropy, 1); + state->s1 = __seed(state->s1 ^ entropy, 2); } } EXPORT_SYMBOL(prandom_seed); @@ -158,9 +158,9 @@ static int __init prandom_init(void) struct rnd_state *state = &per_cpu(net_rand_state,i); #define LCG(x) ((x) * 69069) /* super-duper LCG */ - state->s1 = __seed(LCG(i + jiffies), 1); - state->s2 = __seed(LCG(state->s1), 7); - state->s3 = __seed(LCG(state->s2), 15); + state->s1 = __seed(LCG(i + jiffies), 2); + state->s2 = __seed(LCG(state->s1), 8); + state->s3 = __seed(LCG(state->s2), 16); /* "warm it up" */ prandom_u32_state(state); @@ -187,9 +187,9 @@ static int __init prandom_reseed(void) u32 seeds[3]; get_random_bytes(&seeds, sizeof(seeds)); - state->s1 = __seed(seeds[0], 1); - state->s2 = __seed(seeds[1], 7); - state->s3 = __seed(seeds[2], 15); + state->s1 = __seed(seeds[0], 2); + state->s2 = __seed(seeds[1], 8); + state->s3 = __seed(seeds[2], 16); /* mix it in */ prandom_u32_state(state); -- cgit v1.2.3 From 6d31920246a9fc80be4f16acd27c0bbe8d7b8494 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 11 Nov 2013 12:20:33 +0100 Subject: random32: add periodic reseeding The current Tausworthe PRNG is never reseeded with truly random data after the first attempt in late_initcall. As this PRNG is used for some critical random data as e.g. UDP port randomization we should try better and reseed the PRNG once in a while with truly random data from get_random_bytes(). When we reseed with prandom_seed we now make also sure to throw the first output away. This suffices the reseeding procedure. The delay calculation is based on a proposal from Eric Dumazet. Joint work with Daniel Borkmann. Cc: Eric Dumazet Cc: Theodore Ts'o Signed-off-by: Hannes Frederic Sowa Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- lib/random32.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'lib') diff --git a/lib/random32.c b/lib/random32.c index 01e8890d1089..12215df701e8 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -142,6 +142,7 @@ void prandom_seed(u32 entropy) for_each_possible_cpu (i) { struct rnd_state *state = &per_cpu(net_rand_state, i); state->s1 = __seed(state->s1 ^ entropy, 2); + prandom_u32_state(state); } } EXPORT_SYMBOL(prandom_seed); @@ -174,6 +175,27 @@ static int __init prandom_init(void) } core_initcall(prandom_init); +static void __prandom_timer(unsigned long dontcare); +static DEFINE_TIMER(seed_timer, __prandom_timer, 0, 0); + +static void __prandom_timer(unsigned long dontcare) +{ + u32 entropy; + + get_random_bytes(&entropy, sizeof(entropy)); + prandom_seed(entropy); + /* reseed every ~60 seconds, in [40 .. 80) interval with slack */ + seed_timer.expires = jiffies + (40 * HZ + (prandom_u32() % (40 * HZ))); + add_timer(&seed_timer); +} + +static void prandom_start_seed_timer(void) +{ + set_timer_slack(&seed_timer, HZ); + seed_timer.expires = jiffies + 40 * HZ; + add_timer(&seed_timer); +} + /* * Generate better values after random number generator * is fully initialized. @@ -194,6 +216,7 @@ static int __init prandom_reseed(void) /* mix it in */ prandom_u32_state(state); } + prandom_start_seed_timer(); return 0; } late_initcall(prandom_reseed); -- cgit v1.2.3 From 4af712e8df998475736f3e2727701bd31e3751a9 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 11 Nov 2013 12:20:34 +0100 Subject: random32: add prandom_reseed_late() and call when nonblocking pool becomes initialized The Tausworthe PRNG is initialized at late_initcall time. At that time the entropy pool serving get_random_bytes is not filled sufficiently. This patch adds an additional reseeding step as soon as the nonblocking pool gets marked as initialized. On some machines it might be possible that late_initcall gets called after the pool has been initialized. In this situation we won't reseed again. (A call to prandom_seed_late blocks later invocations of early reseed attempts.) Joint work with Daniel Borkmann. Cc: Eric Dumazet Cc: Theodore Ts'o Signed-off-by: Hannes Frederic Sowa Signed-off-by: Daniel Borkmann Acked-by: "Theodore Ts'o" Signed-off-by: David S. Miller --- drivers/char/random.c | 5 ++++- include/linux/random.h | 1 + lib/random32.c | 23 ++++++++++++++++++++++- 3 files changed, 27 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/drivers/char/random.c b/drivers/char/random.c index 7a744d391756..4fe5609eeb72 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -603,8 +603,11 @@ retry: if (!r->initialized && nbits > 0) { r->entropy_total += nbits; - if (r->entropy_total > 128) + if (r->entropy_total > 128) { r->initialized = 1; + if (r == &nonblocking_pool) + prandom_reseed_late(); + } } trace_credit_entropy_bits(r->name, nbits, entropy_count, diff --git a/include/linux/random.h b/include/linux/random.h index bf9085e89fb5..5117ae348fe8 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -29,6 +29,7 @@ unsigned long randomize_range(unsigned long start, unsigned long end, unsigned l u32 prandom_u32(void); void prandom_bytes(void *buf, int nbytes); void prandom_seed(u32 seed); +void prandom_reseed_late(void); u32 prandom_u32_state(struct rnd_state *); void prandom_bytes_state(struct rnd_state *state, void *buf, int nbytes); diff --git a/lib/random32.c b/lib/random32.c index 12215df701e8..9f2f2fb03dfe 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -200,9 +200,18 @@ static void prandom_start_seed_timer(void) * Generate better values after random number generator * is fully initialized. */ -static int __init prandom_reseed(void) +static void __prandom_reseed(bool late) { int i; + unsigned long flags; + static bool latch = false; + static DEFINE_SPINLOCK(lock); + + /* only allow initial seeding (late == false) once */ + spin_lock_irqsave(&lock, flags); + if (latch && !late) + goto out; + latch = true; for_each_possible_cpu(i) { struct rnd_state *state = &per_cpu(net_rand_state,i); @@ -216,6 +225,18 @@ static int __init prandom_reseed(void) /* mix it in */ prandom_u32_state(state); } +out: + spin_unlock_irqrestore(&lock, flags); +} + +void prandom_reseed_late(void) +{ + __prandom_reseed(true); +} + +static int __init prandom_reseed(void) +{ + __prandom_reseed(false); prandom_start_seed_timer(); return 0; } -- cgit v1.2.3 From a98814cef87946d2708812ad9f8b1e03b8366b6f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 11 Nov 2013 12:20:36 +0100 Subject: random32: upgrade taus88 generator to taus113 from errata paper Since we use prandom*() functions quite often in networking code i.e. in UDP port selection, netfilter code, etc, upgrade the PRNG from Pierre L'Ecuyer's original paper "Maximally Equidistributed Combined Tausworthe Generators", Mathematics of Computation, 65, 213 (1996), 203--213 to the version published in his errata paper [1]. The Tausworthe generator is a maximally-equidistributed generator, that is fast and has good statistical properties [1]. The version presented there upgrades the 3 state LFSR to a 4 state LFSR with increased periodicity from about 2^88 to 2^113. The algorithm is presented in [1] by the very same author who also designed the original algorithm in [2]. Also, by increasing the state, we make it a bit harder for attackers to "guess" the PRNGs internal state. See also discussion in [3]. Now, as we use this sort of weak initialization discussed in [3] only between core_initcall() until late_initcall() time [*] for prandom32*() users, namely in prandom_init(), it is less relevant from late_initcall() onwards as we overwrite seeds through prandom_reseed() anyways with a seed source of higher entropy, that is, get_random_bytes(). In other words, a exhaustive keysearch of 96 bit would be needed. Now, with the help of this patch, this state-search increases further to 128 bit. Initialization needs to make sure that s1 > 1, s2 > 7, s3 > 15, s4 > 127. taus88 and taus113 algorithm is also part of GSL. I added a test case in the next patch to verify internal behaviour of this patch with GSL and ran tests with the dieharder 3.31.1 RNG test suite: $ dieharder -g 052 -a -m 10 -s 1 -S 4137730333 #taus88 $ dieharder -g 054 -a -m 10 -s 1 -S 4137730333 #taus113 With this seed configuration, in order to compare both, we get the following differences: algorithm taus88 taus113 rands/second [**] 1.61e+08 1.37e+08 sts_serial(4, 1st run) WEAK PASSED sts_serial(9, 2nd run) WEAK PASSED rgb_lagged_sum(31) WEAK PASSED We took out diehard_sums test as according to the authors it is considered broken and unusable [4]. Despite that and the slight decrease in performance (which is acceptable), taus113 here passes all 113 tests (only rgb_minimum_distance_5 in WEAK, the rest PASSED). In general, taus/taus113 is considered "very good" by the authors of dieharder [5]. The papers [1][2] states a single warm-up step is sufficient by running quicktaus once on each state to ensure proper initialization of ~s_{0}: Our selection of (s) according to Table 1 of [1] row 1 holds the condition L - k <= r - s, that is, (32 32 32 32) - (31 29 28 25) <= (25 27 15 22) - (18 2 7 13) with r = k - q and q = (6 2 13 3) as also stated by the paper. So according to [2] we are safe with one round of quicktaus for initialization. However we decided to include the warm-up phase of the PRNG as done in GSL in every case as a safety net. We also use the warm up phase to make the output of the RNG easier to verify by the GSL output. In prandom_init(), we also mix random_get_entropy() into it, just like drivers/char/random.c does it, jiffies ^ random_get_entropy(). random-get_entropy() is get_cycles(). xor is entropy preserving so it is fine if it is not implemented by some architectures. Note, this PRNG is *not* used for cryptography in the kernel, but rather as a fast PRNG for various randomizations i.e. in the networking code, or elsewhere for debugging purposes, for example. [*]: In order to generate some "sort of pseduo-randomness", since get_random_bytes() is not yet available for us, we use jiffies and initialize states s1 - s3 with a simple linear congruential generator (LCG), that is x <- x * 69069; and derive s2, s3, from the 32bit initialization from s1. So the above quote from [3] accounts only for the time from core to late initcall, not afterwards. [**] Single threaded run on MacBook Air w/ Intel Core i5-3317U [1] http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme2.ps [2] http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme.ps [3] http://thread.gmane.org/gmane.comp.encryption.general/12103/ [4] http://code.google.com/p/dieharder/source/browse/trunk/libdieharder/diehard_sums.c?spec=svn490&r=490#20 [5] http://www.phy.duke.edu/~rgb/General/dieharder.php Joint work with Hannes Frederic Sowa. Cc: Florian Weimer Cc: Theodore Ts'o Signed-off-by: Daniel Borkmann Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/linux/random.h | 11 +++---- lib/random32.c | 80 +++++++++++++++++++++++++++++--------------------- 2 files changed, 52 insertions(+), 39 deletions(-) (limited to 'lib') diff --git a/include/linux/random.h b/include/linux/random.h index 8ef0b70bd1f9..4002b3df4c85 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -32,10 +32,10 @@ void prandom_seed(u32 seed); void prandom_reseed_late(void); struct rnd_state { - __u32 s1, s2, s3; + __u32 s1, s2, s3, s4; }; -u32 prandom_u32_state(struct rnd_state *); +u32 prandom_u32_state(struct rnd_state *state); void prandom_bytes_state(struct rnd_state *state, void *buf, int nbytes); /* @@ -55,9 +55,10 @@ static inline void prandom_seed_state(struct rnd_state *state, u64 seed) { u32 i = (seed >> 32) ^ (seed << 10) ^ seed; - state->s1 = __seed(i, 2); - state->s2 = __seed(i, 8); - state->s3 = __seed(i, 16); + state->s1 = __seed(i, 2U); + state->s2 = __seed(i, 8U); + state->s3 = __seed(i, 16U); + state->s4 = __seed(i, 128U); } #ifdef CONFIG_ARCH_RANDOM diff --git a/lib/random32.c b/lib/random32.c index 9f2f2fb03dfe..27adb753180f 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -2,19 +2,19 @@ This is a maximally equidistributed combined Tausworthe generator based on code from GNU Scientific Library 1.5 (30 Jun 2004) - x_n = (s1_n ^ s2_n ^ s3_n) + lfsr113 version: - s1_{n+1} = (((s1_n & 4294967294) <<12) ^ (((s1_n <<13) ^ s1_n) >>19)) - s2_{n+1} = (((s2_n & 4294967288) << 4) ^ (((s2_n << 2) ^ s2_n) >>25)) - s3_{n+1} = (((s3_n & 4294967280) <<17) ^ (((s3_n << 3) ^ s3_n) >>11)) + x_n = (s1_n ^ s2_n ^ s3_n ^ s4_n) - The period of this generator is about 2^88. + s1_{n+1} = (((s1_n & 4294967294) << 18) ^ (((s1_n << 6) ^ s1_n) >> 13)) + s2_{n+1} = (((s2_n & 4294967288) << 2) ^ (((s2_n << 2) ^ s2_n) >> 27)) + s3_{n+1} = (((s3_n & 4294967280) << 7) ^ (((s3_n << 13) ^ s3_n) >> 21)) + s4_{n+1} = (((s4_n & 4294967168) << 13) ^ (((s4_n << 3) ^ s4_n) >> 12)) - From: P. L'Ecuyer, "Maximally Equidistributed Combined Tausworthe - Generators", Mathematics of Computation, 65, 213 (1996), 203--213. - - This is available on the net from L'Ecuyer's home page, + The period of this generator is about 2^113 (see erratum paper). + From: P. L'Ecuyer, "Maximally Equidistributed Combined Tausworthe + Generators", Mathematics of Computation, 65, 213 (1996), 203--213: http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme.ps ftp://ftp.iro.umontreal.ca/pub/simulation/lecuyer/papers/tausme.ps @@ -29,7 +29,7 @@ that paper.) This affects the seeding procedure by imposing the requirement - s1 > 1, s2 > 7, s3 > 15. + s1 > 1, s2 > 7, s3 > 15, s4 > 127. */ @@ -52,11 +52,12 @@ u32 prandom_u32_state(struct rnd_state *state) { #define TAUSWORTHE(s,a,b,c,d) ((s&c)<>b) - state->s1 = TAUSWORTHE(state->s1, 13, 19, 4294967294UL, 12); - state->s2 = TAUSWORTHE(state->s2, 2, 25, 4294967288UL, 4); - state->s3 = TAUSWORTHE(state->s3, 3, 11, 4294967280UL, 17); + state->s1 = TAUSWORTHE(state->s1, 6U, 13U, 4294967294U, 18U); + state->s2 = TAUSWORTHE(state->s2, 2U, 27U, 4294967288U, 2U); + state->s3 = TAUSWORTHE(state->s3, 13U, 21U, 4294967280U, 7U); + state->s4 = TAUSWORTHE(state->s4, 3U, 12U, 4294967168U, 13U); - return (state->s1 ^ state->s2 ^ state->s3); + return (state->s1 ^ state->s2 ^ state->s3 ^ state->s4); } EXPORT_SYMBOL(prandom_u32_state); @@ -126,6 +127,21 @@ void prandom_bytes(void *buf, int bytes) } EXPORT_SYMBOL(prandom_bytes); +static void prandom_warmup(struct rnd_state *state) +{ + /* Calling RNG ten times to satify recurrence condition */ + prandom_u32_state(state); + prandom_u32_state(state); + prandom_u32_state(state); + prandom_u32_state(state); + prandom_u32_state(state); + prandom_u32_state(state); + prandom_u32_state(state); + prandom_u32_state(state); + prandom_u32_state(state); + prandom_u32_state(state); +} + /** * prandom_seed - add entropy to pseudo random number generator * @seed: seed value @@ -141,8 +157,9 @@ void prandom_seed(u32 entropy) */ for_each_possible_cpu (i) { struct rnd_state *state = &per_cpu(net_rand_state, i); - state->s1 = __seed(state->s1 ^ entropy, 2); - prandom_u32_state(state); + + state->s1 = __seed(state->s1 ^ entropy, 2U); + prandom_warmup(state); } } EXPORT_SYMBOL(prandom_seed); @@ -158,18 +175,13 @@ static int __init prandom_init(void) for_each_possible_cpu(i) { struct rnd_state *state = &per_cpu(net_rand_state,i); -#define LCG(x) ((x) * 69069) /* super-duper LCG */ - state->s1 = __seed(LCG(i + jiffies), 2); - state->s2 = __seed(LCG(state->s1), 8); - state->s3 = __seed(LCG(state->s2), 16); - - /* "warm it up" */ - prandom_u32_state(state); - prandom_u32_state(state); - prandom_u32_state(state); - prandom_u32_state(state); - prandom_u32_state(state); - prandom_u32_state(state); +#define LCG(x) ((x) * 69069U) /* super-duper LCG */ + state->s1 = __seed(LCG((i + jiffies) ^ random_get_entropy()), 2U); + state->s2 = __seed(LCG(state->s1), 8U); + state->s3 = __seed(LCG(state->s2), 16U); + state->s4 = __seed(LCG(state->s3), 128U); + + prandom_warmup(state); } return 0; } @@ -215,15 +227,15 @@ static void __prandom_reseed(bool late) for_each_possible_cpu(i) { struct rnd_state *state = &per_cpu(net_rand_state,i); - u32 seeds[3]; + u32 seeds[4]; get_random_bytes(&seeds, sizeof(seeds)); - state->s1 = __seed(seeds[0], 2); - state->s2 = __seed(seeds[1], 8); - state->s3 = __seed(seeds[2], 16); + state->s1 = __seed(seeds[0], 2U); + state->s2 = __seed(seeds[1], 8U); + state->s3 = __seed(seeds[2], 16U); + state->s4 = __seed(seeds[3], 128U); - /* mix it in */ - prandom_u32_state(state); + prandom_warmup(state); } out: spin_unlock_irqrestore(&lock, flags); -- cgit v1.2.3 From a6a9c0f1bf5a9a5faa605773ea75e0b93c3ab108 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 11 Nov 2013 12:20:37 +0100 Subject: random32: add test cases for taus113 implementation We generated a battery of 100 test cases from GSL taus113 implemention and compare the results from a particular seed and a particular iteration with our implementation in the kernel. We have verified on 32 and 64 bit machines that our taus113 kernel implementation gives same results as GSL taus113 implementation: [ 0.147370] prandom: seed boundary self test passed [ 0.148078] prandom: 100 self tests passed This is a Kconfig option that is disabled on default, just like the crc32 init selftests in order to not unnecessary slow down boot process. We also refactored out prandom_seed_very_weak() as it's now used in multiple places in order to reduce redundant code. GSL code we used for generating test cases: int i, j; srand(time(NULL)); for (i = 0; i < 100; ++i) { int iteration = 500 + (rand() % 500); gsl_rng_default_seed = rand() + 1; gsl_rng *r = gsl_rng_alloc(gsl_rng_taus113); printf("\t{ %lu, ", gsl_rng_default_seed); for (j = 0; j < iteration - 1; ++j) gsl_rng_get(r); printf("%u, %lu },\n", iteration, gsl_rng_get(r)); gsl_rng_free(r); } Joint work with Hannes Frederic Sowa. Cc: Florian Weimer Cc: Theodore Ts'o Signed-off-by: Daniel Borkmann Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- lib/Kconfig | 7 +++ lib/random32.c | 195 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 196 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/lib/Kconfig b/lib/Kconfig index b3c8be0da17f..75485e163ca3 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -189,6 +189,13 @@ config AUDIT_GENERIC depends on AUDIT && !AUDIT_ARCH default y +config RANDOM32_SELFTEST + bool "PRNG perform self test on init" + default n + help + This option enables the 32 bit PRNG library functions to perform a + self test on initialization. + # # compression support is select'ed if needed # diff --git a/lib/random32.c b/lib/random32.c index 27adb753180f..82da4f4c3489 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -38,6 +38,11 @@ #include #include #include +#include + +#ifdef CONFIG_RANDOM32_SELFTEST +static void __init prandom_state_selftest(void); +#endif static DEFINE_PER_CPU(struct rnd_state, net_rand_state); @@ -142,6 +147,23 @@ static void prandom_warmup(struct rnd_state *state) prandom_u32_state(state); } +static void prandom_seed_very_weak(struct rnd_state *state, u32 seed) +{ + /* Note: This sort of seeding is ONLY used in test cases and + * during boot at the time from core_initcall until late_initcall + * as we don't have a stronger entropy source available yet. + * After late_initcall, we reseed entire state, we have to (!), + * otherwise an attacker just needs to search 32 bit space to + * probe for our internal 128 bit state if he knows a couple + * of prandom32 outputs! + */ +#define LCG(x) ((x) * 69069U) /* super-duper LCG */ + state->s1 = __seed(LCG(seed), 2U); + state->s2 = __seed(LCG(state->s1), 8U); + state->s3 = __seed(LCG(state->s2), 16U); + state->s4 = __seed(LCG(state->s3), 128U); +} + /** * prandom_seed - add entropy to pseudo random number generator * @seed: seed value @@ -172,15 +194,14 @@ static int __init prandom_init(void) { int i; +#ifdef CONFIG_RANDOM32_SELFTEST + prandom_state_selftest(); +#endif + for_each_possible_cpu(i) { struct rnd_state *state = &per_cpu(net_rand_state,i); -#define LCG(x) ((x) * 69069U) /* super-duper LCG */ - state->s1 = __seed(LCG((i + jiffies) ^ random_get_entropy()), 2U); - state->s2 = __seed(LCG(state->s1), 8U); - state->s3 = __seed(LCG(state->s2), 16U); - state->s4 = __seed(LCG(state->s3), 128U); - + prandom_seed_very_weak(state, (i + jiffies) ^ random_get_entropy()); prandom_warmup(state); } return 0; @@ -253,3 +274,165 @@ static int __init prandom_reseed(void) return 0; } late_initcall(prandom_reseed); + +#ifdef CONFIG_RANDOM32_SELFTEST +static struct prandom_test1 { + u32 seed; + u32 result; +} test1[] = { + { 1U, 3484351685U }, + { 2U, 2623130059U }, + { 3U, 3125133893U }, + { 4U, 984847254U }, +}; + +static struct prandom_test2 { + u32 seed; + u32 iteration; + u32 result; +} test2[] = { + /* Test cases against taus113 from GSL library. */ + { 931557656U, 959U, 2975593782U }, + { 1339693295U, 876U, 3887776532U }, + { 1545556285U, 961U, 1615538833U }, + { 601730776U, 723U, 1776162651U }, + { 1027516047U, 687U, 511983079U }, + { 416526298U, 700U, 916156552U }, + { 1395522032U, 652U, 2222063676U }, + { 366221443U, 617U, 2992857763U }, + { 1539836965U, 714U, 3783265725U }, + { 556206671U, 994U, 799626459U }, + { 684907218U, 799U, 367789491U }, + { 2121230701U, 931U, 2115467001U }, + { 1668516451U, 644U, 3620590685U }, + { 768046066U, 883U, 2034077390U }, + { 1989159136U, 833U, 1195767305U }, + { 536585145U, 996U, 3577259204U }, + { 1008129373U, 642U, 1478080776U }, + { 1740775604U, 939U, 1264980372U }, + { 1967883163U, 508U, 10734624U }, + { 1923019697U, 730U, 3821419629U }, + { 442079932U, 560U, 3440032343U }, + { 1961302714U, 845U, 841962572U }, + { 2030205964U, 962U, 1325144227U }, + { 1160407529U, 507U, 240940858U }, + { 635482502U, 779U, 4200489746U }, + { 1252788931U, 699U, 867195434U }, + { 1961817131U, 719U, 668237657U }, + { 1071468216U, 983U, 917876630U }, + { 1281848367U, 932U, 1003100039U }, + { 582537119U, 780U, 1127273778U }, + { 1973672777U, 853U, 1071368872U }, + { 1896756996U, 762U, 1127851055U }, + { 847917054U, 500U, 1717499075U }, + { 1240520510U, 951U, 2849576657U }, + { 1685071682U, 567U, 1961810396U }, + { 1516232129U, 557U, 3173877U }, + { 1208118903U, 612U, 1613145022U }, + { 1817269927U, 693U, 4279122573U }, + { 1510091701U, 717U, 638191229U }, + { 365916850U, 807U, 600424314U }, + { 399324359U, 702U, 1803598116U }, + { 1318480274U, 779U, 2074237022U }, + { 697758115U, 840U, 1483639402U }, + { 1696507773U, 840U, 577415447U }, + { 2081979121U, 981U, 3041486449U }, + { 955646687U, 742U, 3846494357U }, + { 1250683506U, 749U, 836419859U }, + { 595003102U, 534U, 366794109U }, + { 47485338U, 558U, 3521120834U }, + { 619433479U, 610U, 3991783875U }, + { 704096520U, 518U, 4139493852U }, + { 1712224984U, 606U, 2393312003U }, + { 1318233152U, 922U, 3880361134U }, + { 855572992U, 761U, 1472974787U }, + { 64721421U, 703U, 683860550U }, + { 678931758U, 840U, 380616043U }, + { 692711973U, 778U, 1382361947U }, + { 677703619U, 530U, 2826914161U }, + { 92393223U, 586U, 1522128471U }, + { 1222592920U, 743U, 3466726667U }, + { 358288986U, 695U, 1091956998U }, + { 1935056945U, 958U, 514864477U }, + { 735675993U, 990U, 1294239989U }, + { 1560089402U, 897U, 2238551287U }, + { 70616361U, 829U, 22483098U }, + { 368234700U, 731U, 2913875084U }, + { 20221190U, 879U, 1564152970U }, + { 539444654U, 682U, 1835141259U }, + { 1314987297U, 840U, 1801114136U }, + { 2019295544U, 645U, 3286438930U }, + { 469023838U, 716U, 1637918202U }, + { 1843754496U, 653U, 2562092152U }, + { 400672036U, 809U, 4264212785U }, + { 404722249U, 965U, 2704116999U }, + { 600702209U, 758U, 584979986U }, + { 519953954U, 667U, 2574436237U }, + { 1658071126U, 694U, 2214569490U }, + { 420480037U, 749U, 3430010866U }, + { 690103647U, 969U, 3700758083U }, + { 1029424799U, 937U, 3787746841U }, + { 2012608669U, 506U, 3362628973U }, + { 1535432887U, 998U, 42610943U }, + { 1330635533U, 857U, 3040806504U }, + { 1223800550U, 539U, 3954229517U }, + { 1322411537U, 680U, 3223250324U }, + { 1877847898U, 945U, 2915147143U }, + { 1646356099U, 874U, 965988280U }, + { 805687536U, 744U, 4032277920U }, + { 1948093210U, 633U, 1346597684U }, + { 392609744U, 783U, 1636083295U }, + { 690241304U, 770U, 1201031298U }, + { 1360302965U, 696U, 1665394461U }, + { 1220090946U, 780U, 1316922812U }, + { 447092251U, 500U, 3438743375U }, + { 1613868791U, 592U, 828546883U }, + { 523430951U, 548U, 2552392304U }, + { 726692899U, 810U, 1656872867U }, + { 1364340021U, 836U, 3710513486U }, + { 1986257729U, 931U, 935013962U }, + { 407983964U, 921U, 728767059U }, +}; + +static void __init prandom_state_selftest(void) +{ + int i, j, errors = 0, runs = 0; + bool error = false; + + for (i = 0; i < ARRAY_SIZE(test1); i++) { + struct rnd_state state; + + prandom_seed_very_weak(&state, test1[i].seed); + prandom_warmup(&state); + + if (test1[i].result != prandom_u32_state(&state)) + error = true; + } + + if (error) + pr_warn("prandom: seed boundary self test failed\n"); + else + pr_info("prandom: seed boundary self test passed\n"); + + for (i = 0; i < ARRAY_SIZE(test2); i++) { + struct rnd_state state; + + prandom_seed_very_weak(&state, test2[i].seed); + prandom_warmup(&state); + + for (j = 0; j < test2[i].iteration - 1; j++) + prandom_u32_state(&state); + + if (test2[i].result != prandom_u32_state(&state)) + errors++; + + runs++; + cond_resched(); + } + + if (errors) + pr_warn("prandom: %d/%d self tests failed\n", errors, runs); + else + pr_info("prandom: %d self tests passed\n", runs); +} +#endif -- cgit v1.2.3 From c78e93630d15b5f5774213aad9bdc9f52473a89b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 12 Nov 2013 15:08:15 -0800 Subject: mm: do not walk all of system memory during show_mem It has been reported on very large machines that show_mem is taking almost 5 minutes to display information. This is a serious problem if there is an OOM storm. The bulk of the cost is in show_mem doing a very expensive PFN walk to give us the following information Total RAM: Also available as totalram_pages Highmem pages: Also available as totalhigh_pages Reserved pages: Can be inferred from the zone structure Shared pages: PFN walk required Unshared pages: PFN walk required Quick pages: Per-cpu walk required Only the shared/unshared pages requires a full PFN walk but that information is useless. It is also inaccurate as page pins of unshared pages would be accounted for as shared. Even if the information was accurate, I'm struggling to think how the shared/unshared information could be useful for debugging OOM conditions. Maybe it was useful before rmap existed when reclaiming shared pages was costly but it is less relevant today. The PFN walk could be optimised a bit but why bother as the information is useless. This patch deletes the PFN walker and infers the total RAM, highmem and reserved pages count from struct zone. It omits the shared/unshared page usage on the grounds that it is useless. It also corrects the reporting of HighMem as HighMem/MovableOnly as ZONE_MOVABLE has similar problems to HighMem with respect to lowmem/highmem exhaustion. Signed-off-by: Mel Gorman Cc: David Rientjes Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/show_mem.c | 39 +++++++++++---------------------------- 1 file changed, 11 insertions(+), 28 deletions(-) (limited to 'lib') diff --git a/lib/show_mem.c b/lib/show_mem.c index b7c72311ad0c..5847a4921b8e 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -12,8 +12,7 @@ void show_mem(unsigned int filter) { pg_data_t *pgdat; - unsigned long total = 0, reserved = 0, shared = 0, - nonshared = 0, highmem = 0; + unsigned long total = 0, reserved = 0, highmem = 0; printk("Mem-Info:\n"); show_free_areas(filter); @@ -22,43 +21,27 @@ void show_mem(unsigned int filter) return; for_each_online_pgdat(pgdat) { - unsigned long i, flags; + unsigned long flags; + int zoneid; pgdat_resize_lock(pgdat, &flags); - for (i = 0; i < pgdat->node_spanned_pages; i++) { - struct page *page; - unsigned long pfn = pgdat->node_start_pfn + i; - - if (unlikely(!(i % MAX_ORDER_NR_PAGES))) - touch_nmi_watchdog(); - - if (!pfn_valid(pfn)) + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + struct zone *zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) continue; - page = pfn_to_page(pfn); - - if (PageHighMem(page)) - highmem++; + total += zone->present_pages; + reserved = zone->present_pages - zone->managed_pages; - if (PageReserved(page)) - reserved++; - else if (page_count(page) == 1) - nonshared++; - else if (page_count(page) > 1) - shared += page_count(page) - 1; - - total++; + if (is_highmem_idx(zoneid)) + highmem += zone->present_pages; } pgdat_resize_unlock(pgdat, &flags); } printk("%lu pages RAM\n", total); -#ifdef CONFIG_HIGHMEM - printk("%lu pages HighMem\n", highmem); -#endif + printk("%lu pages HighMem/MovableOnly\n", highmem); printk("%lu pages reserved\n", reserved); - printk("%lu pages shared\n", shared); - printk("%lu pages non-shared\n", nonshared); #ifdef CONFIG_QUICKLIST printk("%lu pages in pagetable cache\n", quicklist_total_size()); -- cgit v1.2.3 From 623fd8072c7c4d77a184bc9e35192acf480c18e4 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Tue, 12 Nov 2013 15:08:34 -0800 Subject: percpu: add test module for various percpu operations Tests various percpu operations. Enable with CONFIG_PERCPU_TEST=m. Signed-off-by: Greg Thelen Acked-by: Tejun Heo Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 9 ++++ lib/Makefile | 2 + lib/percpu_test.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 149 insertions(+) create mode 100644 lib/percpu_test.c (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ebef88f61b7d..db25707aa41b 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1481,6 +1481,15 @@ config INTERVAL_TREE_TEST help A benchmark measuring the performance of the interval tree library +config PERCPU_TEST + tristate "Per cpu operations test" + depends on m && DEBUG_KERNEL + help + Enable this option to build test module which validates per-cpu + operations. + + If unsure, say N. + config ATOMIC64_SELFTEST bool "Perform an atomic64_t self-test at boot" help diff --git a/lib/Makefile b/lib/Makefile index f3bb2cb98adf..bb016e116ba4 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -157,6 +157,8 @@ obj-$(CONFIG_INTERVAL_TREE_TEST) += interval_tree_test.o interval_tree_test-objs := interval_tree_test_main.o interval_tree.o +obj-$(CONFIG_PERCPU_TEST) += percpu_test.o + obj-$(CONFIG_ASN1) += asn1_decoder.o obj-$(CONFIG_FONT_SUPPORT) += fonts/ diff --git a/lib/percpu_test.c b/lib/percpu_test.c new file mode 100644 index 000000000000..0b5d14dadd1a --- /dev/null +++ b/lib/percpu_test.c @@ -0,0 +1,138 @@ +#include + +/* validate @native and @pcp counter values match @expected */ +#define CHECK(native, pcp, expected) \ + do { \ + WARN((native) != (expected), \ + "raw %ld (0x%lx) != expected %lld (0x%llx)", \ + (native), (native), \ + (long long)(expected), (long long)(expected)); \ + WARN(__this_cpu_read(pcp) != (expected), \ + "pcp %ld (0x%lx) != expected %lld (0x%llx)", \ + __this_cpu_read(pcp), __this_cpu_read(pcp), \ + (long long)(expected), (long long)(expected)); \ + } while (0) + +static DEFINE_PER_CPU(long, long_counter); +static DEFINE_PER_CPU(unsigned long, ulong_counter); + +static int __init percpu_test_init(void) +{ + /* + * volatile prevents compiler from optimizing it uses, otherwise the + * +ul_one/-ul_one below would replace with inc/dec instructions. + */ + volatile unsigned int ui_one = 1; + long l = 0; + unsigned long ul = 0; + + pr_info("percpu test start\n"); + + preempt_disable(); + + l += -1; + __this_cpu_add(long_counter, -1); + CHECK(l, long_counter, -1); + + l += 1; + __this_cpu_add(long_counter, 1); + CHECK(l, long_counter, 0); + + ul = 0; + __this_cpu_write(ulong_counter, 0); + + ul += 1UL; + __this_cpu_add(ulong_counter, 1UL); + CHECK(ul, ulong_counter, 1); + + ul += -1UL; + __this_cpu_add(ulong_counter, -1UL); + CHECK(ul, ulong_counter, 0); + + ul += -(unsigned long)1; + __this_cpu_add(ulong_counter, -(unsigned long)1); + CHECK(ul, ulong_counter, -1); + + ul = 0; + __this_cpu_write(ulong_counter, 0); + + ul -= 1; + __this_cpu_dec(ulong_counter); + CHECK(ul, ulong_counter, -1); + CHECK(ul, ulong_counter, ULONG_MAX); + + l += -ui_one; + __this_cpu_add(long_counter, -ui_one); + CHECK(l, long_counter, 0xffffffff); + + l += ui_one; + __this_cpu_add(long_counter, ui_one); + CHECK(l, long_counter, (long)0x100000000LL); + + + l = 0; + __this_cpu_write(long_counter, 0); + + l -= ui_one; + __this_cpu_sub(long_counter, ui_one); + CHECK(l, long_counter, -1); + + l = 0; + __this_cpu_write(long_counter, 0); + + l += ui_one; + __this_cpu_add(long_counter, ui_one); + CHECK(l, long_counter, 1); + + l += -ui_one; + __this_cpu_add(long_counter, -ui_one); + CHECK(l, long_counter, (long)0x100000000LL); + + l = 0; + __this_cpu_write(long_counter, 0); + + l -= ui_one; + this_cpu_sub(long_counter, ui_one); + CHECK(l, long_counter, -1); + CHECK(l, long_counter, ULONG_MAX); + + ul = 0; + __this_cpu_write(ulong_counter, 0); + + ul += ui_one; + __this_cpu_add(ulong_counter, ui_one); + CHECK(ul, ulong_counter, 1); + + ul = 0; + __this_cpu_write(ulong_counter, 0); + + ul -= ui_one; + __this_cpu_sub(ulong_counter, ui_one); + CHECK(ul, ulong_counter, -1); + CHECK(ul, ulong_counter, ULONG_MAX); + + ul = 3; + __this_cpu_write(ulong_counter, 3); + + ul = this_cpu_sub_return(ulong_counter, ui_one); + CHECK(ul, ulong_counter, 2); + + ul = __this_cpu_sub_return(ulong_counter, ui_one); + CHECK(ul, ulong_counter, 1); + + preempt_enable(); + + pr_info("percpu test done\n"); + return -EAGAIN; /* Fail will directly unload the module */ +} + +static void __exit percpu_test_exit(void) +{ +} + +module_init(percpu_test_init) +module_exit(percpu_test_exit) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Greg Thelen"); +MODULE_DESCRIPTION("percpu operations test"); -- cgit v1.2.3 From 312b4e226951f707e120b95b118cbc14f3d162b2 Mon Sep 17 00:00:00 2001 From: Ryan Mallon Date: Tue, 12 Nov 2013 15:08:51 -0800 Subject: vsprintf: check real user/group id for %pK Some setuid binaries will allow reading of files which have read permission by the real user id. This is problematic with files which use %pK because the file access permission is checked at open() time, but the kptr_restrict setting is checked at read() time. If a setuid binary opens a %pK file as an unprivileged user, and then elevates permissions before reading the file, then kernel pointer values may be leaked. This happens for example with the setuid pppd application on Ubuntu 12.04: $ head -1 /proc/kallsyms 00000000 T startup_32 $ pppd file /proc/kallsyms pppd: In file /proc/kallsyms: unrecognized option 'c1000000' This will only leak the pointer value from the first line, but other setuid binaries may leak more information. Fix this by adding a check that in addition to the current process having CAP_SYSLOG, that effective user and group ids are equal to the real ids. If a setuid binary reads the contents of a file which uses %pK then the pointer values will be printed as NULL if the real user is unprivileged. Update the sysctl documentation to reflect the changes, and also correct the documentation to state the kptr_restrict=0 is the default. This is a only temporary solution to the issue. The correct solution is to do the permission check at open() time on files, and to replace %pK with a function which checks the open() time permission. %pK uses in printk should be removed since no sane permission check can be done, and instead protected by using dmesg_restrict. Signed-off-by: Ryan Mallon Cc: Kees Cook Cc: Alexander Viro Cc: Joe Perches Cc: "Eric W. Biederman" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 25 ++++++++++++++++++------- lib/vsprintf.c | 33 ++++++++++++++++++++++++++++++--- 2 files changed, 48 insertions(+), 10 deletions(-) (limited to 'lib') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 4273b2d71a27..26b7ee491df8 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -290,13 +290,24 @@ Default value is "/sbin/hotplug". kptr_restrict: This toggle indicates whether restrictions are placed on -exposing kernel addresses via /proc and other interfaces. When -kptr_restrict is set to (0), there are no restrictions. When -kptr_restrict is set to (1), the default, kernel pointers -printed using the %pK format specifier will be replaced with 0's -unless the user has CAP_SYSLOG. When kptr_restrict is set to -(2), kernel pointers printed using %pK will be replaced with 0's -regardless of privileges. +exposing kernel addresses via /proc and other interfaces. + +When kptr_restrict is set to (0), the default, there are no restrictions. + +When kptr_restrict is set to (1), kernel pointers printed using the %pK +format specifier will be replaced with 0's unless the user has CAP_SYSLOG +and effective user and group ids are equal to the real ids. This is +because %pK checks are done at read() time rather than open() time, so +if permissions are elevated between the open() and the read() (e.g via +a setuid binary) then %pK will not leak kernel pointers to unprivileged +users. Note, this is a temporary solution only. The correct long-term +solution is to do the permission checks at open() time. Consider removing +world read permissions from files that use %pK, and using dmesg_restrict +to protect against uses of %pK in dmesg(8) if leaking kernel pointer +values to unprivileged users is a concern. + +When kptr_restrict is set to (2), kernel pointers printed using +%pK will be replaced with 0's regardless of privileges. ============================================================== diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 26559bdb4c49..d76555c45ff4 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include /* for PAGE_SIZE */ @@ -1312,11 +1313,37 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, spec.field_width = default_width; return string(buf, end, "pK-error", spec); } - if (!((kptr_restrict == 0) || - (kptr_restrict == 1 && - has_capability_noaudit(current, CAP_SYSLOG)))) + + switch (kptr_restrict) { + case 0: + /* Always print %pK values */ + break; + case 1: { + /* + * Only print the real pointer value if the current + * process has CAP_SYSLOG and is running with the + * same credentials it started with. This is because + * access to files is checked at open() time, but %pK + * checks permission at read() time. We don't want to + * leak pointer values if a binary opens a file using + * %pK and then elevates privileges before reading it. + */ + const struct cred *cred = current_cred(); + + if (!has_capability_noaudit(current, CAP_SYSLOG) || + !uid_eq(cred->euid, cred->uid) || + !gid_eq(cred->egid, cred->gid)) + ptr = NULL; + break; + } + case 2: + default: + /* Always print 0's for %pK */ ptr = NULL; + break; + } break; + case 'N': switch (fmt[1]) { case 'F': -- cgit v1.2.3 From d3773ba13c8ca63a4ee9e7926813f6e82be27a6c Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Tue, 12 Nov 2013 15:09:49 -0800 Subject: lib/debugobjects.c: remove unnecessary work pending test Remove unnecessary work pending test before calling schedule_work(). It has been tested in queue_work_on() already. No functional changed. Signed-off-by: Xie XiuQi Reviewed-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/debugobjects.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index bf2c8b1043d8..e0731c3db706 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -196,7 +196,7 @@ static void free_object(struct debug_obj *obj) * initialized: */ if (obj_pool_free > ODEBUG_POOL_SIZE && obj_cache) - sched = keventd_up() && !work_pending(&debug_obj_work); + sched = keventd_up(); hlist_add_head(&obj->node, &obj_pool); obj_pool_free++; obj_pool_used--; -- cgit v1.2.3 From c0d92a57a88586586b8cb9c7ac149bd10bc40d11 Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Tue, 12 Nov 2013 15:09:50 -0800 Subject: lib/vsprintf.c: document formats for dentry and struct file Looks like these were added to Documentation/printk-formats.txt but not the in-file table. Signed-off-by: Olof Johansson Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/vsprintf.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'lib') diff --git a/lib/vsprintf.c b/lib/vsprintf.c index d76555c45ff4..48586ac3a62e 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1219,6 +1219,8 @@ int kptr_restrict __read_mostly; * The maximum supported length is 64 bytes of the input. Consider * to use print_hex_dump() for the larger input. * - 'a' For a phys_addr_t type and its derivative types (passed by reference) + * - 'd[234]' For a dentry name (optionally 2-4 last components) + * - 'D[234]' Same as 'd' but for a struct file * * Note: The difference between 'S' and 'F' is that on ia64 and ppc64 * function pointers are really function descriptors, which contain a -- cgit v1.2.3 From ff6092a8a413f3db1938b4606e078fa751ed6cdb Mon Sep 17 00:00:00 2001 From: Duan Jiong Date: Tue, 12 Nov 2013 15:09:51 -0800 Subject: lib/digsig.c: use ERR_CAST inlined function instead of ERR_PTR(PTR_ERR(...)) Signed-off-by: Duan Jiong Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/digsig.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/digsig.c b/lib/digsig.c index 2f31e6a45f0a..8793aeda30ca 100644 --- a/lib/digsig.c +++ b/lib/digsig.c @@ -209,7 +209,7 @@ int digsig_verify(struct key *keyring, const char *sig, int siglen, kref = keyring_search(make_key_ref(keyring, 1UL), &key_type_user, name); if (IS_ERR(kref)) - key = ERR_PTR(PTR_ERR(kref)); + key = ERR_CAST(kref); else key = key_ref_to_ptr(kref); } else { -- cgit v1.2.3 From 684f0d3d14f2744ae7ad79063b21909e32bf444e Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 12 Nov 2013 15:09:52 -0800 Subject: lib/genalloc: add a helper function for DMA buffer allocation When using pool space for DMA buffer, there might be duplicated calling of gen_pool_alloc() and gen_pool_virt_to_phys() in each implementation. Thus it's better to add a simple helper function, a compatible one to the common dma_alloc_coherent(), to save some code. Signed-off-by: Nicolin Chen Cc: "Hans J. Koch" Cc: Dan Williams Cc: Eric Miao Cc: Grant Likely Cc: Greg Kroah-Hartman Cc: Haojian Zhuang Cc: Jaroslav Kysela Cc: Kevin Hilman Cc: Liam Girdwood Cc: Mark Brown Cc: Mauro Carvalho Chehab Cc: Rob Herring Cc: Russell King Cc: Sekhar Nori Cc: Takashi Iwai Cc: Vinod Koul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/genalloc.h | 2 ++ lib/genalloc.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'lib') diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index f8d41cb1cbe0..1eda33d7cb10 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -94,6 +94,8 @@ static inline int gen_pool_add(struct gen_pool *pool, unsigned long addr, } extern void gen_pool_destroy(struct gen_pool *); extern unsigned long gen_pool_alloc(struct gen_pool *, size_t); +extern void *gen_pool_dma_alloc(struct gen_pool *pool, size_t size, + dma_addr_t *dma); extern void gen_pool_free(struct gen_pool *, unsigned long, size_t); extern void gen_pool_for_each_chunk(struct gen_pool *, void (*)(struct gen_pool *, struct gen_pool_chunk *, void *), void *); diff --git a/lib/genalloc.c b/lib/genalloc.c index 26cf20be72b7..dda31168844f 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -312,6 +312,34 @@ retry: } EXPORT_SYMBOL(gen_pool_alloc); +/** + * gen_pool_dma_alloc - allocate special memory from the pool for DMA usage + * @pool: pool to allocate from + * @size: number of bytes to allocate from the pool + * @dma: dma-view physical address + * + * Allocate the requested number of bytes from the specified pool. + * Uses the pool allocation function (with first-fit algorithm by default). + * Can not be used in NMI handler on architectures without + * NMI-safe cmpxchg implementation. + */ +void *gen_pool_dma_alloc(struct gen_pool *pool, size_t size, dma_addr_t *dma) +{ + unsigned long vaddr; + + if (!pool) + return NULL; + + vaddr = gen_pool_alloc(pool, size); + if (!vaddr) + return NULL; + + *dma = gen_pool_virt_to_phys(pool, vaddr); + + return (void *)vaddr; +} +EXPORT_SYMBOL(gen_pool_dma_alloc); + /** * gen_pool_free - free allocated special memory back to the pool * @pool: pool to free to -- cgit v1.2.3 From 66b251422be7cb39e7619fee647724720f74d1f7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 12 Nov 2013 23:45:41 +0100 Subject: random32: add __init prefix to prandom_start_seed_timer We only call that in functions annotated with __init, so add __init prefix in prandom_start_seed_timer() as well, so that the kernel can make use of this hint and we can possibly free up resources after it's usage. And since it's an internal function rename it to __prandom_start_seed_timer(). Signed-off-by: Daniel Borkmann Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- lib/random32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/random32.c b/lib/random32.c index 82da4f4c3489..4f9d5dffc554 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -222,7 +222,7 @@ static void __prandom_timer(unsigned long dontcare) add_timer(&seed_timer); } -static void prandom_start_seed_timer(void) +static void __init __prandom_start_seed_timer(void) { set_timer_slack(&seed_timer, HZ); seed_timer.expires = jiffies + 40 * HZ; @@ -270,7 +270,7 @@ void prandom_reseed_late(void) static int __init prandom_reseed(void) { __prandom_reseed(false); - prandom_start_seed_timer(); + __prandom_start_seed_timer(); return 0; } late_initcall(prandom_reseed); -- cgit v1.2.3 From 0125737accc5aac532719aecd80615364caa9e0f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 12 Nov 2013 23:45:42 +0100 Subject: random32: use msecs_to_jiffies for reseed timer Use msecs_to_jiffies, for these calculations as different HZ considerations are taken into account for conversion of the timer shot, and also it makes the code more readable. Signed-off-by: Daniel Borkmann Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- lib/random32.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/random32.c b/lib/random32.c index 4f9d5dffc554..1e5b2df44291 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -214,18 +214,22 @@ static DEFINE_TIMER(seed_timer, __prandom_timer, 0, 0); static void __prandom_timer(unsigned long dontcare) { u32 entropy; + unsigned long expires; get_random_bytes(&entropy, sizeof(entropy)); prandom_seed(entropy); + /* reseed every ~60 seconds, in [40 .. 80) interval with slack */ - seed_timer.expires = jiffies + (40 * HZ + (prandom_u32() % (40 * HZ))); + expires = 40 + (prandom_u32() % 40); + seed_timer.expires = jiffies + msecs_to_jiffies(expires * MSEC_PER_SEC); + add_timer(&seed_timer); } static void __init __prandom_start_seed_timer(void) { set_timer_slack(&seed_timer, HZ); - seed_timer.expires = jiffies + 40 * HZ; + seed_timer.expires = jiffies + msecs_to_jiffies(40 * MSEC_PER_SEC); add_timer(&seed_timer); } -- cgit v1.2.3 From 57f4257eae33e036125973858934730250d464e3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Nov 2013 14:31:54 -0800 Subject: lockref: use BLOATED_SPINLOCKS to avoid explicit config dependencies Avoid the fragile Kconfig construct guestimating spinlock_t sizes; use a friendly compile-time test to determine this. [kirill.shutemov@linux.intel.com: drop CONFIG_CMPXCHG_LOCKREF] Signed-off-by: Peter Zijlstra Signed-off-by: Kirill A. Shutemov Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lockref.h | 7 ++++++- lib/Kconfig | 7 ------- lib/lockref.c | 2 +- 3 files changed, 7 insertions(+), 9 deletions(-) (limited to 'lib') diff --git a/include/linux/lockref.h b/include/linux/lockref.h index 13dfd36a3294..c8929c3832db 100644 --- a/include/linux/lockref.h +++ b/include/linux/lockref.h @@ -15,10 +15,15 @@ */ #include +#include + +#define USE_CMPXCHG_LOCKREF \ + (IS_ENABLED(CONFIG_ARCH_USE_CMPXCHG_LOCKREF) && \ + IS_ENABLED(CONFIG_SMP) && !BLOATED_SPINLOCKS) struct lockref { union { -#ifdef CONFIG_CMPXCHG_LOCKREF +#if USE_CMPXCHG_LOCKREF aligned_u64 lock_count; #endif struct { diff --git a/lib/Kconfig b/lib/Kconfig index 75485e163ca3..06dc74200a51 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -51,13 +51,6 @@ config PERCPU_RWSEM config ARCH_USE_CMPXCHG_LOCKREF bool -config CMPXCHG_LOCKREF - def_bool y if ARCH_USE_CMPXCHG_LOCKREF - depends on SMP - depends on !GENERIC_LOCKBREAK - depends on !DEBUG_SPINLOCK - depends on !DEBUG_LOCK_ALLOC - config CRC_CCITT tristate "CRC-CCITT functions" help diff --git a/lib/lockref.c b/lib/lockref.c index af6e95d0bed6..d2b123f8456b 100644 --- a/lib/lockref.c +++ b/lib/lockref.c @@ -1,7 +1,7 @@ #include #include -#ifdef CONFIG_CMPXCHG_LOCKREF +#if USE_CMPXCHG_LOCKREF /* * Allow weakly-ordered memory architectures to provide barrier-less -- cgit v1.2.3 From 9196436ab2f713b823a2ba2024cb69f40b2f54a5 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 14 Nov 2013 14:31:58 -0800 Subject: vsprintf: ignore %n again This ignores %n in printf again, as was originally documented. Implementing %n poses a greater security risk than utility, so it should stay ignored. To help anyone attempting to use %n, a warning will be emitted if it is encountered. Based on an earlier patch by Joe Perches. Because %n was designed to write to pointers on the stack, it has been frequently used as an attack vector when bugs are found that leak user-controlled strings into functions that ultimately process format strings. While this class of bug can still be turned into an information leak, removing %n eliminates the common method of elevating such a bug into an arbitrary kernel memory writing primitive, significantly reducing the danger of this class of bug. For seq_file users that need to know the length of a written string for padding, please see seq_setwidth() and seq_pad() instead. Signed-off-by: Kees Cook Cc: Joe Perches Cc: Tetsuo Handa Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/vsprintf.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'lib') diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 48586ac3a62e..10909c571494 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1712,18 +1712,16 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) break; case FORMAT_TYPE_NRCHARS: { - u8 qualifier = spec.qualifier; + /* + * Since %n poses a greater security risk than + * utility, ignore %n and skip its argument. + */ + void *skip_arg; - if (qualifier == 'l') { - long *ip = va_arg(args, long *); - *ip = (str - buf); - } else if (_tolower(qualifier) == 'z') { - size_t *ip = va_arg(args, size_t *); - *ip = (str - buf); - } else { - int *ip = va_arg(args, int *); - *ip = (str - buf); - } + WARN_ONCE(1, "Please remove ignored %%n in '%s'\n", + old_fmt); + + skip_arg = va_arg(args, void *); break; } -- cgit v1.2.3 From b89241e8cdb8321c20546d47645a9b65b58113b5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Nov 2013 14:32:11 -0800 Subject: llists: move llist_reverse_order from raid5 to llist.c Make this useful helper available for other users. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid5.c | 14 -------------- include/linux/llist.h | 2 ++ lib/llist.c | 22 ++++++++++++++++++++++ 3 files changed, 24 insertions(+), 14 deletions(-) (limited to 'lib') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f8b906843926..7f0e17a27aeb 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -293,20 +293,6 @@ static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) do_release_stripe(conf, sh); } -static struct llist_node *llist_reverse_order(struct llist_node *head) -{ - struct llist_node *new_head = NULL; - - while (head) { - struct llist_node *tmp = head; - head = head->next; - tmp->next = new_head; - new_head = tmp; - } - - return new_head; -} - /* should hold conf->device_lock already */ static int release_stripe_list(struct r5conf *conf) { diff --git a/include/linux/llist.h b/include/linux/llist.h index 8828a78dec9a..fbf10a0bc095 100644 --- a/include/linux/llist.h +++ b/include/linux/llist.h @@ -195,4 +195,6 @@ static inline struct llist_node *llist_del_all(struct llist_head *head) extern struct llist_node *llist_del_first(struct llist_head *head); +struct llist_node *llist_reverse_order(struct llist_node *head); + #endif /* LLIST_H */ diff --git a/lib/llist.c b/lib/llist.c index 4a70d120138c..ef48b87247ec 100644 --- a/lib/llist.c +++ b/lib/llist.c @@ -81,3 +81,25 @@ struct llist_node *llist_del_first(struct llist_head *head) return entry; } EXPORT_SYMBOL_GPL(llist_del_first); + +/** + * llist_reverse_order - reverse order of a llist chain + * @head: first item of the list to be reversed + * + * Reverse the oder of a chain of llist entries and return the + * new first entry. + */ +struct llist_node *llist_reverse_order(struct llist_node *head) +{ + struct llist_node *new_head = NULL; + + while (head) { + struct llist_node *tmp = head; + head = head->next; + tmp->next = new_head; + new_head = tmp; + } + + return new_head; +} +EXPORT_SYMBOL_GPL(llist_reverse_order); -- cgit v1.2.3 From 0791a6057cb60d12ec5e3182b99e6ffa8044ee3a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 14 Nov 2013 14:32:13 -0800 Subject: llists-move-llist_reverse_order-from-raid5-to-llistc-fix fix comment typo, per Jan Cc: Christoph Hellwig Cc: Jan Kara Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/llist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/llist.c b/lib/llist.c index ef48b87247ec..f76196d07409 100644 --- a/lib/llist.c +++ b/lib/llist.c @@ -86,7 +86,7 @@ EXPORT_SYMBOL_GPL(llist_del_first); * llist_reverse_order - reverse order of a llist chain * @head: first item of the list to be reversed * - * Reverse the oder of a chain of llist entries and return the + * Reverse the order of a chain of llist entries and return the * new first entry. */ struct llist_node *llist_reverse_order(struct llist_node *head) -- cgit v1.2.3 From a019e48cfbfb358786326db3dbc1c565b8f14a56 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Thu, 14 Nov 2013 14:32:16 -0800 Subject: kfifo: kfifo_copy_{to,from}_user: fix copied bytes calculation 'copied' and 'len' are in bytes, while 'ret' is in elements, so we need to multiply 'ret' with the size of one element to get the correct result. Signed-off-by: Lars-Peter Clausen Cc: Stefani Seibold Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/kfifo.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/kfifo.c b/lib/kfifo.c index 7b7f83027b7b..d79b9d222065 100644 --- a/lib/kfifo.c +++ b/lib/kfifo.c @@ -215,7 +215,7 @@ static unsigned long kfifo_copy_from_user(struct __kfifo *fifo, * incrementing the fifo->in index counter */ smp_wmb(); - *copied = len - ret; + *copied = len - ret * esize; /* return the number of elements which are not copied */ return ret; } @@ -275,7 +275,7 @@ static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to, * incrementing the fifo->out index counter */ smp_wmb(); - *copied = len - ret; + *copied = len - ret * esize; /* return the number of elements which are not copied */ return ret; } -- cgit v1.2.3 From fcd40d69afea8700eda8cbf381f6d160cf757ebf Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 19 Nov 2013 10:08:13 -0800 Subject: percpu-refcount: Add percpu-refcount.o to obj-y Drop percpu_ida.o from lib-y since it is also listed in obj-y and it doesn't need to be listed in both places. Move percpu-refcount.o from lib-y to obj-y to fix build errors in target_core_mod: ERROR: "percpu_ref_cancel_init" [drivers/target/target_core_mod.ko] undefined! ERROR: "percpu_ref_kill_and_confirm" [drivers/target/target_core_mod.ko] undefined! ERROR: "percpu_ref_init" [drivers/target/target_core_mod.ko] undefined! Signed-off-by: Randy Dunlap Signed-off-by: Nicholas Bellinger --- lib/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/Makefile b/lib/Makefile index f3bb2cb98adf..8ed321179849 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -13,7 +13,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \ proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ - earlycpio.o percpu-refcount.o percpu_ida.o + earlycpio.o obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o lib-$(CONFIG_MMU) += ioremap.o @@ -26,7 +26,7 @@ obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \ gcd.o lcm.o list_sort.o uuid.o flex_array.o iovec.o clz_ctz.o \ bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o \ - percpu_ida.o + percpu-refcount.o percpu_ida.o obj-y += string_helpers.o obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o obj-y += kstrtox.o -- cgit v1.2.3 From 14058d20c155ab3ff473fb60eca4fa7aa21a16ac Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 27 Nov 2013 13:52:53 +0000 Subject: lockref: include mutex.h rather than reinvent arch_mutex_cpu_relax arch_mutex_cpu_relax is already conditionally defined in mutex.h, so simply include that header rather than replicate the code here. Signed-off-by: Will Deacon Signed-off-by: Linus Torvalds --- lib/lockref.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'lib') diff --git a/lib/lockref.c b/lib/lockref.c index d2b123f8456b..f07a40d33871 100644 --- a/lib/lockref.c +++ b/lib/lockref.c @@ -1,5 +1,6 @@ #include #include +#include #if USE_CMPXCHG_LOCKREF @@ -11,14 +12,6 @@ # define cmpxchg64_relaxed cmpxchg64 #endif -/* - * Allow architectures to override the default cpu_relax() within CMPXCHG_LOOP. - * This is useful for architectures with an expensive cpu_relax(). - */ -#ifndef arch_mutex_cpu_relax -# define arch_mutex_cpu_relax() cpu_relax() -#endif - /* * Note that the "cmpxchg()" reloads the "old" value for the * failure case. -- cgit v1.2.3 From 23fd78d76415729b338ff1802a0066b4a62f7fb8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 2 Dec 2013 11:24:18 +0000 Subject: KEYS: Fix multiple key add into associative array If sufficient keys (or keyrings) are added into a keyring such that a node in the associative array's tree overflows (each node has a capacity N, currently 16) and such that all N+1 keys have the same index key segment for that level of the tree (the level'th nibble of the index key), then assoc_array_insert() calls ops->diff_objects() to indicate at which bit position the two index keys vary. However, __key_link_begin() passes a NULL object to assoc_array_insert() with the intention of supplying the correct pointer later before we commit the change. This means that keyring_diff_objects() is given a NULL pointer as one of its arguments which it does not expect. This results in an oops like the attached. With the previous patch to fix the keyring hash function, this can be forced much more easily by creating a keyring and only adding keyrings to it. Add any other sort of key and a different insertion path is taken - all 16+1 objects must want to cluster in the same node slot. This can be tested by: r=`keyctl newring sandbox @s` for ((i=0; i<=16; i++)); do keyctl newring ring$i $r; done This should work fine, but oopses when the 17th keyring is added. Since ops->diff_objects() is always called with the first pointer pointing to the object to be inserted (ie. the NULL pointer), we can fix the problem by changing the to-be-inserted object pointer to point to the index key passed into assoc_array_insert() instead. Whilst we're at it, we also switch the arguments so that they are the same as for ->compare_object(). BUG: unable to handle kernel NULL pointer dereference at 0000000000000088 IP: [] hash_key_type_and_desc+0x18/0xb0 ... RIP: 0010:[] hash_key_type_and_desc+0x18/0xb0 ... Call Trace: [] keyring_diff_objects+0x21/0xd2 [] assoc_array_insert+0x3b6/0x908 [] __key_link_begin+0x78/0xe5 [] key_create_or_update+0x17d/0x36a [] SyS_add_key+0x123/0x183 [] tracesys+0xdd/0xe2 Signed-off-by: David Howells Tested-by: Stephen Gallagher --- Documentation/assoc_array.txt | 6 +++--- include/linux/assoc_array.h | 6 +++--- lib/assoc_array.c | 4 ++-- security/keys/keyring.c | 7 +++---- 4 files changed, 11 insertions(+), 12 deletions(-) (limited to 'lib') diff --git a/Documentation/assoc_array.txt b/Documentation/assoc_array.txt index f4faec0f66e4..2f2c6cdd73c0 100644 --- a/Documentation/assoc_array.txt +++ b/Documentation/assoc_array.txt @@ -164,10 +164,10 @@ This points to a number of methods, all of which need to be provided: (4) Diff the index keys of two objects. - int (*diff_objects)(const void *a, const void *b); + int (*diff_objects)(const void *object, const void *index_key); - Return the bit position at which the index keys of two objects differ or - -1 if they are the same. + Return the bit position at which the index key of the specified object + differs from the given index key or -1 if they are the same. (5) Free an object. diff --git a/include/linux/assoc_array.h b/include/linux/assoc_array.h index 9a193b84238a..a89df3be1686 100644 --- a/include/linux/assoc_array.h +++ b/include/linux/assoc_array.h @@ -41,10 +41,10 @@ struct assoc_array_ops { /* Is this the object we're looking for? */ bool (*compare_object)(const void *object, const void *index_key); - /* How different are two objects, to a bit position in their keys? (or - * -1 if they're the same) + /* How different is an object from an index key, to a bit position in + * their keys? (or -1 if they're the same) */ - int (*diff_objects)(const void *a, const void *b); + int (*diff_objects)(const void *object, const void *index_key); /* Method to free an object. */ void (*free_object)(void *object); diff --git a/lib/assoc_array.c b/lib/assoc_array.c index 17edeaf19180..1b6a44f1ec3e 100644 --- a/lib/assoc_array.c +++ b/lib/assoc_array.c @@ -759,8 +759,8 @@ all_leaves_cluster_together: pr_devel("all leaves cluster together\n"); diff = INT_MAX; for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { - int x = ops->diff_objects(assoc_array_ptr_to_leaf(edit->leaf), - assoc_array_ptr_to_leaf(node->slots[i])); + int x = ops->diff_objects(assoc_array_ptr_to_leaf(node->slots[i]), + index_key); if (x < diff) { BUG_ON(x < 0); diff = x; diff --git a/security/keys/keyring.c b/security/keys/keyring.c index 0adbc77a59b9..3dd8445cd489 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -279,12 +279,11 @@ static bool keyring_compare_object(const void *object, const void *data) * Compare the index keys of a pair of objects and determine the bit position * at which they differ - if they differ. */ -static int keyring_diff_objects(const void *_a, const void *_b) +static int keyring_diff_objects(const void *object, const void *data) { - const struct key *key_a = keyring_ptr_to_key(_a); - const struct key *key_b = keyring_ptr_to_key(_b); + const struct key *key_a = keyring_ptr_to_key(object); const struct keyring_index_key *a = &key_a->index_key; - const struct keyring_index_key *b = &key_b->index_key; + const struct keyring_index_key *b = data; unsigned long seg_a, seg_b; int level, i; -- cgit v1.2.3 From 74e72f894d56eb9d2e1218530c658e7d297e002b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 14 Jan 2014 17:56:42 -0800 Subject: lib/percpu_counter.c: fix __percpu_counter_add() __percpu_counter_add() may be called in softirq/hardirq handler (such as, blk_mq_queue_exit() is typically called in hardirq/softirq handler), so we need to call this_cpu_add()(irq safe helper) to update percpu counter, otherwise counts may be lost. This fixes the problem that 'rmmod null_blk' hangs in blk_cleanup_queue() because of miscounting of request_queue->mq_usage_counter. This patch is the v1 of previous one of "lib/percpu_counter.c: disable local irq when updating percpu couter", and takes Andrew's approach which may be more efficient for ARCHs(x86, s390) that have optimized this_cpu_add(). Signed-off-by: Ming Lei Cc: Paul Gortmaker Cc: Shaohua Li Cc: Jens Axboe Cc: Fan Du Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/percpu_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 7473ee3b4ee7..1da85bb1bc07 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -82,10 +82,10 @@ void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch) unsigned long flags; raw_spin_lock_irqsave(&fbc->lock, flags); fbc->count += count; + __this_cpu_sub(*fbc->counters, count); raw_spin_unlock_irqrestore(&fbc->lock, flags); - __this_cpu_write(*fbc->counters, 0); } else { - __this_cpu_write(*fbc->counters, count); + this_cpu_add(*fbc->counters, amount); } preempt_enable(); } -- cgit v1.2.3 From d1969a84dd6a44d375aa82bba7d6c38713a429c3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 16 Jan 2014 15:26:48 -0800 Subject: percpu_counter: unbreak __percpu_counter_add() Commit 74e72f894d56 ("lib/percpu_counter.c: fix __percpu_counter_add()") looked very plausible, but its arithmetic was badly wrong: obvious once you see the fix, but maddening to get there from the weird tmpfs ENOSPCs Signed-off-by: Hugh Dickins Cc: Ming Lei Cc: Paul Gortmaker Cc: Shaohua Li Cc: Jens Axboe Cc: Fan Du Cc: Andrew Morton Signed-off-by: Linus Torvalds --- lib/percpu_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 1da85bb1bc07..8280a5dd1727 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -82,7 +82,7 @@ void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch) unsigned long flags; raw_spin_lock_irqsave(&fbc->lock, flags); fbc->count += count; - __this_cpu_sub(*fbc->counters, count); + __this_cpu_sub(*fbc->counters, count - amount); raw_spin_unlock_irqrestore(&fbc->lock, flags); } else { this_cpu_add(*fbc->counters, amount); -- cgit v1.2.3