diff options
89 files changed, 2238 insertions, 868 deletions
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl index f5696ba9ae96..9d0058e788e5 100644 --- a/Documentation/DocBook/kernel-api.tmpl +++ b/Documentation/DocBook/kernel-api.tmpl @@ -283,6 +283,7 @@ X!Earch/x86/kernel/mca_32.c <chapter id="security"> <title>Security Framework</title> !Isecurity/security.c +!Esecurity/inode.c </chapter> <chapter id="audit"> diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt index cf5562cbe356..6e253407b3dc 100644 --- a/Documentation/RCU/checklist.txt +++ b/Documentation/RCU/checklist.txt @@ -210,7 +210,7 @@ over a rather long period of time, but improvements are always welcome! number of updates per grace period. 9. All RCU list-traversal primitives, which include - rcu_dereference(), list_for_each_rcu(), list_for_each_entry_rcu(), + rcu_dereference(), list_for_each_entry_rcu(), list_for_each_continue_rcu(), and list_for_each_safe_rcu(), must be either within an RCU read-side critical section or must be protected by appropriate update-side locks. RCU diff --git a/Documentation/RCU/rcuref.txt b/Documentation/RCU/rcuref.txt index 451de2ad8329..4202ad093130 100644 --- a/Documentation/RCU/rcuref.txt +++ b/Documentation/RCU/rcuref.txt @@ -29,9 +29,9 @@ release_referenced() delete() } If this list/array is made lock free using RCU as in changing the -write_lock() in add() and delete() to spin_lock and changing read_lock -in search_and_reference to rcu_read_lock(), the atomic_get in -search_and_reference could potentially hold reference to an element which +write_lock() in add() and delete() to spin_lock() and changing read_lock() +in search_and_reference() to rcu_read_lock(), the atomic_inc() in +search_and_reference() could potentially hold reference to an element which has already been deleted from the list/array. Use atomic_inc_not_zero() in this scenario as follows: @@ -40,20 +40,20 @@ add() search_and_reference() { { alloc_object rcu_read_lock(); ... search_for_element - atomic_set(&el->rc, 1); if (atomic_inc_not_zero(&el->rc)) { - write_lock(&list_lock); rcu_read_unlock(); + atomic_set(&el->rc, 1); if (!atomic_inc_not_zero(&el->rc)) { + spin_lock(&list_lock); rcu_read_unlock(); return FAIL; add_element } ... ... - write_unlock(&list_lock); rcu_read_unlock(); + spin_unlock(&list_lock); rcu_read_unlock(); } } 3. 4. release_referenced() delete() { { - ... write_lock(&list_lock); + ... spin_lock(&list_lock); if (atomic_dec_and_test(&el->rc)) ... call_rcu(&el->head, el_free); delete_element - ... write_unlock(&list_lock); + ... spin_unlock(&list_lock); } ... if (atomic_dec_and_test(&el->rc)) call_rcu(&el->head, el_free); diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index e04d643a9f57..96170824a717 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -786,8 +786,6 @@ RCU pointer/list traversal: list_for_each_entry_rcu hlist_for_each_entry_rcu - list_for_each_rcu (to be deprecated in favor of - list_for_each_entry_rcu) list_for_each_continue_rcu (to be deprecated in favor of new list_for_each_entry_continue_rcu) diff --git a/Documentation/SELinux.txt b/Documentation/SELinux.txt new file mode 100644 index 000000000000..07eae00f3314 --- /dev/null +++ b/Documentation/SELinux.txt @@ -0,0 +1,27 @@ +If you want to use SELinux, chances are you will want +to use the distro-provided policies, or install the +latest reference policy release from + http://oss.tresys.com/projects/refpolicy + +However, if you want to install a dummy policy for +testing, you can do using 'mdp' provided under +scripts/selinux. Note that this requires the selinux +userspace to be installed - in particular you will +need checkpolicy to compile a kernel, and setfiles and +fixfiles to label the filesystem. + + 1. Compile the kernel with selinux enabled. + 2. Type 'make' to compile mdp. + 3. Make sure that you are not running with + SELinux enabled and a real policy. If + you are, reboot with selinux disabled + before continuing. + 4. Run install_policy.sh: + cd scripts/selinux + sh install_policy.sh + +Step 4 will create a new dummy policy valid for your +kernel, with a single selinux user, role, and type. +It will compile the policy, will set your SELINUXTYPE to +dummy in /etc/selinux/config, install the compiled policy +as 'dummy', and relabel your filesystem. diff --git a/Documentation/kernel-doc-nano-HOWTO.txt b/Documentation/kernel-doc-nano-HOWTO.txt index 0bd32748a467..c6841eee9598 100644 --- a/Documentation/kernel-doc-nano-HOWTO.txt +++ b/Documentation/kernel-doc-nano-HOWTO.txt @@ -168,10 +168,10 @@ if ($#ARGV < 0) { mkdir $ARGV[0],0777; $state = 0; while (<STDIN>) { - if (/^\.TH \"[^\"]*\" 4 \"([^\"]*)\"/) { + if (/^\.TH \"[^\"]*\" 9 \"([^\"]*)\"/) { if ($state == 1) { close OUT } $state = 1; - $fn = "$ARGV[0]/$1.4"; + $fn = "$ARGV[0]/$1.9"; print STDERR "Creating $fn\n"; open OUT, ">$fn" or die "can't open $fn: $!\n"; print OUT $_; diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt index 88bcb8767335..9d8eb553884c 100644 --- a/Documentation/scheduler/sched-design-CFS.txt +++ b/Documentation/scheduler/sched-design-CFS.txt @@ -1,151 +1,242 @@ + ============= + CFS Scheduler + ============= -This is the CFS scheduler. - -80% of CFS's design can be summed up in a single sentence: CFS basically -models an "ideal, precise multi-tasking CPU" on real hardware. - -"Ideal multi-tasking CPU" is a (non-existent :-)) CPU that has 100% -physical power and which can run each task at precise equal speed, in -parallel, each at 1/nr_running speed. For example: if there are 2 tasks -running then it runs each at 50% physical power - totally in parallel. - -On real hardware, we can run only a single task at once, so while that -one task runs, the other tasks that are waiting for the CPU are at a -disadvantage - the current task gets an unfair amount of CPU time. In -CFS this fairness imbalance is expressed and tracked via the per-task -p->wait_runtime (nanosec-unit) value. "wait_runtime" is the amount of -time the task should now run on the CPU for it to become completely fair -and balanced. - -( small detail: on 'ideal' hardware, the p->wait_runtime value would - always be zero - no task would ever get 'out of balance' from the - 'ideal' share of CPU time. ) - -CFS's task picking logic is based on this p->wait_runtime value and it -is thus very simple: it always tries to run the task with the largest -p->wait_runtime value. In other words, CFS tries to run the task with -the 'gravest need' for more CPU time. So CFS always tries to split up -CPU time between runnable tasks as close to 'ideal multitasking -hardware' as possible. - -Most of the rest of CFS's design just falls out of this really simple -concept, with a few add-on embellishments like nice levels, -multiprocessing and various algorithm variants to recognize sleepers. - -In practice it works like this: the system runs a task a bit, and when -the task schedules (or a scheduler tick happens) the task's CPU usage is -'accounted for': the (small) time it just spent using the physical CPU -is deducted from p->wait_runtime. [minus the 'fair share' it would have -gotten anyway]. Once p->wait_runtime gets low enough so that another -task becomes the 'leftmost task' of the time-ordered rbtree it maintains -(plus a small amount of 'granularity' distance relative to the leftmost -task so that we do not over-schedule tasks and trash the cache) then the -new leftmost task is picked and the current task is preempted. - -The rq->fair_clock value tracks the 'CPU time a runnable task would have -fairly gotten, had it been runnable during that time'. So by using -rq->fair_clock values we can accurately timestamp and measure the -'expected CPU time' a task should have gotten. All runnable tasks are -sorted in the rbtree by the "rq->fair_clock - p->wait_runtime" key, and -CFS picks the 'leftmost' task and sticks to it. As the system progresses -forwards, newly woken tasks are put into the tree more and more to the -right - slowly but surely giving a chance for every task to become the -'leftmost task' and thus get on the CPU within a deterministic amount of -time. - -Some implementation details: - - - the introduction of Scheduling Classes: an extensible hierarchy of - scheduler modules. These modules encapsulate scheduling policy - details and are handled by the scheduler core without the core - code assuming about them too much. - - - sched_fair.c implements the 'CFS desktop scheduler': it is a - replacement for the vanilla scheduler's SCHED_OTHER interactivity - code. - - I'd like to give credit to Con Kolivas for the general approach here: - he has proven via RSDL/SD that 'fair scheduling' is possible and that - it results in better desktop scheduling. Kudos Con! - - The CFS patch uses a completely different approach and implementation - from RSDL/SD. My goal was to make CFS's interactivity quality exceed - that of RSDL/SD, which is a high standard to meet :-) Testing - feedback is welcome to decide this one way or another. [ and, in any - case, all of SD's logic could be added via a kernel/sched_sd.c module - as well, if Con is interested in such an approach. ] - - CFS's design is quite radical: it does not use runqueues, it uses a - time-ordered rbtree to build a 'timeline' of future task execution, - and thus has no 'array switch' artifacts (by which both the vanilla - scheduler and RSDL/SD are affected). - - CFS uses nanosecond granularity accounting and does not rely on any - jiffies or other HZ detail. Thus the CFS scheduler has no notion of - 'timeslices' and has no heuristics whatsoever. There is only one - central tunable (you have to switch on CONFIG_SCHED_DEBUG): - - /proc/sys/kernel/sched_granularity_ns - - which can be used to tune the scheduler from 'desktop' (low - latencies) to 'server' (good batching) workloads. It defaults to a - setting suitable for desktop workloads. SCHED_BATCH is handled by the - CFS scheduler module too. - - Due to its design, the CFS scheduler is not prone to any of the - 'attacks' that exist today against the heuristics of the stock - scheduler: fiftyp.c, thud.c, chew.c, ring-test.c, massive_intr.c all - work fine and do not impact interactivity and produce the expected - behavior. - - the CFS scheduler has a much stronger handling of nice levels and - SCHED_BATCH: both types of workloads should be isolated much more - agressively than under the vanilla scheduler. - - ( another detail: due to nanosec accounting and timeline sorting, - sched_yield() support is very simple under CFS, and in fact under - CFS sched_yield() behaves much better than under any other - scheduler i have tested so far. ) - - - sched_rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler - way than the vanilla scheduler does. It uses 100 runqueues (for all - 100 RT priority levels, instead of 140 in the vanilla scheduler) - and it needs no expired array. - - - reworked/sanitized SMP load-balancing: the runqueue-walking - assumptions are gone from the load-balancing code now, and - iterators of the scheduling modules are used. The balancing code got - quite a bit simpler as a result. - - -Group scheduler extension to CFS -================================ - -Normally the scheduler operates on individual tasks and strives to provide -fair CPU time to each task. Sometimes, it may be desirable to group tasks -and provide fair CPU time to each such task group. For example, it may -be desirable to first provide fair CPU time to each user on the system -and then to each task belonging to a user. - -CONFIG_FAIR_GROUP_SCHED strives to achieve exactly that. It lets -SCHED_NORMAL/BATCH tasks be be grouped and divides CPU time fairly among such -groups. At present, there are two (mutually exclusive) mechanisms to group -tasks for CPU bandwidth control purpose: - - - Based on user id (CONFIG_FAIR_USER_SCHED) - In this option, tasks are grouped according to their user id. - - Based on "cgroup" pseudo filesystem (CONFIG_FAIR_CGROUP_SCHED) - This options lets the administrator create arbitrary groups - of tasks, using the "cgroup" pseudo filesystem. See - Documentation/cgroups.txt for more information about this - filesystem. -Only one of these options to group tasks can be chosen and not both. +1. OVERVIEW + +CFS stands for "Completely Fair Scheduler," and is the new "desktop" process +scheduler implemented by Ingo Molnar and merged in Linux 2.6.23. It is the +replacement for the previous vanilla scheduler's SCHED_OTHER interactivity +code. + +80% of CFS's design can be summed up in a single sentence: CFS basically models +an "ideal, precise multi-tasking CPU" on real hardware. + +"Ideal multi-tasking CPU" is a (non-existent :-)) CPU that has 100% physical +power and which can run each task at precise equal speed, in parallel, each at +1/nr_running speed. For example: if there are 2 tasks running, then it runs +each at 50% physical power --- i.e., actually in parallel. + +On real hardware, we can run only a single task at once, so we have to +introduce the concept of "virtual runtime." The virtual runtime of a task +specifies when its next timeslice would start execution on the ideal +multi-tasking CPU described above. In practice, the virtual runtime of a task +is its actual runtime normalized to the total number of running tasks. + + + +2. FEW IMPLEMENTATION DETAILS + +In CFS the virtual runtime is expressed and tracked via the per-task +p->se.vruntime (nanosec-unit) value. This way, it's possible to accurately +timestamp and measure the "expected CPU time" a task should have gotten. + +[ small detail: on "ideal" hardware, at any time all tasks would have the same + p->se.vruntime value --- i.e., tasks would execute simultaneously and no task + would ever get "out of balance" from the "ideal" share of CPU time. ] + +CFS's task picking logic is based on this p->se.vruntime value and it is thus +very simple: it always tries to run the task with the smallest p->se.vruntime +value (i.e., the task which executed least so far). CFS always tries to split +up CPU time between runnable tasks as close to "ideal multitasking hardware" as +possible. + +Most of the rest of CFS's design just falls out of this really simple concept, +with a few add-on embellishments like nice levels, multiprocessing and various +algorithm variants to recognize sleepers. + + + +3. THE RBTREE + +CFS's design is quite radical: it does not use the old data structures for the +runqueues, but it uses a time-ordered rbtree to build a "timeline" of future +task execution, and thus has no "array switch" artifacts (by which both the +previous vanilla scheduler and RSDL/SD are affected). + +CFS also maintains the rq->cfs.min_vruntime value, which is a monotonic +increasing value tracking the smallest vruntime among all tasks in the +runqueue. The total amount of work done by the system is tracked using +min_vruntime; that value is used to place newly activated entities on the left +side of the tree as much as possible. + +The total number of running tasks in the runqueue is accounted through the +rq->cfs.load value, which is the sum of the weights of the tasks queued on the +runqueue. + +CFS maintains a time-ordered rbtree, where all runnable tasks are sorted by the +p->se.vruntime key (there is a subtraction using rq->cfs.min_vruntime to +account for possible wraparounds). CFS picks the "leftmost" task from this +tree and sticks to it. +As the system progresses forwards, the executed tasks are put into the tree +more and more to the right --- slowly but surely giving a chance for every task +to become the "leftmost task" and thus get on the CPU within a deterministic +amount of time. + +Summing up, CFS works like this: it runs a task a bit, and when the task +schedules (or a scheduler tick happens) the task's CPU usage is "accounted +for": the (small) time it just spent using the physical CPU is added to +p->se.vruntime. Once p->se.vruntime gets high enough so that another task +becomes the "leftmost task" of the time-ordered rbtree it maintains (plus a +small amount of "granularity" distance relative to the leftmost task so that we +do not over-schedule tasks and trash the cache), then the new leftmost task is +picked and the current task is preempted. + + + +4. SOME FEATURES OF CFS + +CFS uses nanosecond granularity accounting and does not rely on any jiffies or +other HZ detail. Thus the CFS scheduler has no notion of "timeslices" in the +way the previous scheduler had, and has no heuristics whatsoever. There is +only one central tunable (you have to switch on CONFIG_SCHED_DEBUG): + + /proc/sys/kernel/sched_granularity_ns + +which can be used to tune the scheduler from "desktop" (i.e., low latencies) to +"server" (i.e., good batching) workloads. It defaults to a setting suitable +for desktop workloads. SCHED_BATCH is handled by the CFS scheduler module too. + +Due to its design, the CFS scheduler is not prone to any of the "attacks" that +exist today against the heuristics of the stock scheduler: fiftyp.c, thud.c, +chew.c, ring-test.c, massive_intr.c all work fine and do not impact +interactivity and produce the expected behavior. + +The CFS scheduler has a much stronger handling of nice levels and SCHED_BATCH +than the previous vanilla scheduler: both types of workloads are isolated much +more aggressively. + +SMP load-balancing has been reworked/sanitized: the runqueue-walking +assumptions are gone from the load-balancing code now, and iterators of the +scheduling modules are used. The balancing code got quite a bit simpler as a +result. + + + +5. Scheduling policies + +CFS implements three scheduling policies: + + - SCHED_NORMAL (traditionally called SCHED_OTHER): The scheduling + policy that is used for regular tasks. + + - SCHED_BATCH: Does not preempt nearly as often as regular tasks + would, thereby allowing tasks to run longer and make better use of + caches but at the cost of interactivity. This is well suited for + batch jobs. + + - SCHED_IDLE: This is even weaker than nice 19, but its not a true + idle timer scheduler in order to avoid to get into priority + inversion problems which would deadlock the machine. + +SCHED_FIFO/_RR are implemented in sched_rt.c and are as specified by +POSIX. + +The command chrt from util-linux-ng 2.13.1.1 can set all of these except +SCHED_IDLE. -Group scheduler tunables: -When CONFIG_FAIR_USER_SCHED is defined, a directory is created in sysfs for -each new user and a "cpu_share" file is added in that directory. + +6. SCHEDULING CLASSES + +The new CFS scheduler has been designed in such a way to introduce "Scheduling +Classes," an extensible hierarchy of scheduler modules. These modules +encapsulate scheduling policy details and are handled by the scheduler core +without the core code assuming too much about them. + +sched_fair.c implements the CFS scheduler described above. + +sched_rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler way than +the previous vanilla scheduler did. It uses 100 runqueues (for all 100 RT +priority levels, instead of 140 in the previous scheduler) and it needs no +expired array. + +Scheduling classes are implemented through the sched_class structure, which +contains hooks to functions that must be called whenever an interesting event +occurs. + +This is the (partial) list of the hooks: + + - enqueue_task(...) + + Called when a task enters a runnable state. + It puts the scheduling entity (task) into the red-black tree and + increments the nr_running variable. + + - dequeue_tree(...) + + When a task is no longer runnable, this function is called to keep the + corresponding scheduling entity out of the red-black tree. It decrements + the nr_running variable. + + - yield_task(...) + + This function is basically just a dequeue followed by an enqueue, unless the + compat_yield sysctl is turned on; in that case, it places the scheduling + entity at the right-most end of the red-black tree. + + - check_preempt_curr(...) + + This function checks if a task that entered the runnable state should + preempt the currently running task. + + - pick_next_task(...) + + This function chooses the most appropriate task eligible to run next. + + - set_curr_task(...) + + This function is called when a task changes its scheduling class or changes + its task group. + + - task_tick(...) + + This function is mostly called from time tick functions; it might lead to + process switch. This drives the running preemption. + + - task_new(...) + + The core scheduler gives the scheduling module an opportunity to manage new + task startup. The CFS scheduling module uses it for group scheduling, while + the scheduling module for a real-time task does not use it. + + + +7. GROUP SCHEDULER EXTENSIONS TO CFS + +Normally, the scheduler operates on individual tasks and strives to provide +fair CPU time to each task. Sometimes, it may be desirable to group tasks and +provide fair CPU time to each such task group. For example, it may be +desirable to first provide fair CPU time to each user on the system and then to +each task belonging to a user. + +CONFIG_GROUP_SCHED strives to achieve exactly that. It lets tasks to be +grouped and divides CPU time fairly among such groups. + +CONFIG_RT_GROUP_SCHED permits to group real-time (i.e., SCHED_FIFO and +SCHED_RR) tasks. + +CONFIG_FAIR_GROUP_SCHED permits to group CFS (i.e., SCHED_NORMAL and +SCHED_BATCH) tasks. + +At present, there are two (mutually exclusive) mechanisms to group tasks for +CPU bandwidth control purposes: + + - Based on user id (CONFIG_USER_SCHED) + + With this option, tasks are grouped according to their user id. + + - Based on "cgroup" pseudo filesystem (CONFIG_CGROUP_SCHED) + + This options needs CONFIG_CGROUPS to be defined, and lets the administrator + create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See + Documentation/cgroups.txt for more information about this filesystem. + +Only one of these options to group tasks can be chosen and not both. + +When CONFIG_USER_SCHED is defined, a directory is created in sysfs for each new +user and a "cpu_share" file is added in that directory. # cd /sys/kernel/uids # cat 512/cpu_share # Display user 512's CPU share @@ -155,16 +246,14 @@ each new user and a "cpu_share" file is added in that directory. 2048 # -CPU bandwidth between two users are divided in the ratio of their CPU shares. -For ex: if you would like user "root" to get twice the bandwidth of user -"guest", then set the cpu_share for both the users such that "root"'s -cpu_share is twice "guest"'s cpu_share - +CPU bandwidth between two users is divided in the ratio of their CPU shares. +For example: if you would like user "root" to get twice the bandwidth of user +"guest," then set the cpu_share for both the users such that "root"'s cpu_share +is twice "guest"'s cpu_share. -When CONFIG_FAIR_CGROUP_SCHED is defined, a "cpu.shares" file is created -for each group created using the pseudo filesystem. See example steps -below to create task groups and modify their CPU share using the "cgroups" -pseudo filesystem +When CONFIG_CGROUP_SCHED is defined, a "cpu.shares" file is created for each +group created using the pseudo filesystem. See example steps below to create +task groups and modify their CPU share using the "cgroups" pseudo filesystem. # mkdir /dev/cpuctl # mount -t cgroup -ocpu none /dev/cpuctl diff --git a/MAINTAINERS b/MAINTAINERS index 8dae4555f10e..7a03bd5a91a3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3649,8 +3649,9 @@ M: jmorris@namei.org P: Eric Paris M: eparis@parisplace.org L: linux-kernel@vger.kernel.org (kernel issues) -L: selinux@tycho.nsa.gov (subscribers-only, general discussion) -W: http://www.nsa.gov/selinux +L: selinux@tycho.nsa.gov (subscribers-only, general discussion) +W: http://selinuxproject.org +T: git kernel.org:pub/scm/linux/kernel/git/jmorris/security-testing-2.6.git S: Supported SENSABLE PHANTOM diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 83df541650fc..06b6fdab639f 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -149,6 +149,9 @@ smp_callin(void) atomic_inc(&init_mm.mm_count); current->active_mm = &init_mm; + /* inform the notifiers about the new cpu */ + notify_cpu_starting(cpuid); + /* Must have completely accurate bogos. */ local_irq_enable(); diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index e9842f6767f9..e42a749a56dd 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -277,6 +277,7 @@ asmlinkage void __cpuinit secondary_start_kernel(void) /* * Enable local interrupts. */ + notify_cpu_starting(cpu); local_irq_enable(); local_fiq_enable(); diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c index 952a24b2f5a9..52e16c6436f9 100644 --- a/arch/cris/arch-v32/kernel/smp.c +++ b/arch/cris/arch-v32/kernel/smp.c @@ -178,6 +178,7 @@ void __init smp_callin(void) unmask_irq(IPI_INTR_VECT); unmask_irq(TIMER0_INTR_VECT); preempt_disable(); + notify_cpu_starting(cpu); local_irq_enable(); cpu_set(cpu, cpu_online_map); diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index d8f05e504fbf..1dcbb85fc4ee 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -401,6 +401,7 @@ smp_callin (void) spin_lock(&vector_lock); /* Setup the per cpu irq handling data structures */ __setup_vector_irq(cpuid); + notify_cpu_starting(cpuid); cpu_set(cpuid, cpu_online_map); per_cpu(cpu_state, cpuid) = CPU_ONLINE; spin_unlock(&vector_lock); diff --git a/arch/m32r/kernel/smpboot.c b/arch/m32r/kernel/smpboot.c index 2c03ac1d005f..fc2994811f15 100644 --- a/arch/m32r/kernel/smpboot.c +++ b/arch/m32r/kernel/smpboot.c @@ -498,6 +498,8 @@ static void __init smp_online(void) { int cpu_id = smp_processor_id(); + notify_cpu_starting(cpu_id); + local_irq_enable(); /* Get our bogomips. */ diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index 4410f172b8ab..7b59cfb7e602 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -121,6 +121,8 @@ asmlinkage __cpuinit void start_secondary(void) cpu = smp_processor_id(); cpu_data[cpu].udelay_val = loops_per_jiffy; + notify_cpu_starting(cpu); + mp_ops->smp_finish(); set_cpu_sibling_map(cpu); diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 5337ca7bb649..c27b10a1bd79 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -453,6 +453,7 @@ int __devinit start_secondary(void *unused) secondary_cpu_time_init(); ipi_call_lock(); + notify_cpu_starting(cpu); cpu_set(cpu, cpu_online_map); /* Update sibling maps */ base = cpu_first_thread_in_core(cpu); diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 00b9b4dec5eb..9e8b1f9b8f4d 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -585,6 +585,8 @@ int __cpuinit start_secondary(void *cpuvoid) /* Enable pfault pseudo page faults on this cpu. */ pfault_init(); + /* call cpu notifiers */ + notify_cpu_starting(smp_processor_id()); /* Mark this cpu as online */ spin_lock(&call_lock); cpu_set(smp_processor_id(), cpu_online_map); diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index 60c50841143e..001778f9adaf 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c @@ -82,6 +82,8 @@ asmlinkage void __cpuinit start_secondary(void) preempt_disable(); + notify_cpu_starting(smp_processor_id()); + local_irq_enable(); calibrate_delay(); diff --git a/arch/sparc/kernel/sun4d_smp.c b/arch/sparc/kernel/sun4d_smp.c index 69596402a500..446767e8f569 100644 --- a/arch/sparc/kernel/sun4d_smp.c +++ b/arch/sparc/kernel/sun4d_smp.c @@ -88,6 +88,7 @@ void __init smp4d_callin(void) local_flush_cache_all(); local_flush_tlb_all(); + notify_cpu_starting(cpuid); /* * Unblock the master CPU _only_ when the scheduler state * of all secondary CPUs will be up-to-date, so after diff --git a/arch/sparc/kernel/sun4m_smp.c b/arch/sparc/kernel/sun4m_smp.c index a14a76ac7f36..9964890dc1db 100644 --- a/arch/sparc/kernel/sun4m_smp.c +++ b/arch/sparc/kernel/sun4m_smp.c @@ -71,6 +71,8 @@ void __cpuinit smp4m_callin(void) local_flush_cache_all(); local_flush_tlb_all(); + notify_cpu_starting(cpuid); + /* Get our local ticker going. */ smp_setup_percpu_timer(); diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c index be2d50c3aa95..045772142844 100644 --- a/arch/um/kernel/smp.c +++ b/arch/um/kernel/smp.c @@ -85,6 +85,7 @@ static int idle_proc(void *cpup) while (!cpu_isset(cpu, smp_commenced_mask)) cpu_relax(); + notify_cpu_starting(cpu); cpu_set(cpu, cpu_online_map); default_idle(); return 0; diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index dd097b835839..c24c4a487b7c 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -256,7 +256,8 @@ static u32 get_cur_val(const cpumask_t *mask) * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and * no meaning should be associated with absolute values of these MSRs. */ -static unsigned int get_measured_perf(unsigned int cpu) +static unsigned int get_measured_perf(struct cpufreq_policy *policy, + unsigned int cpu) { union { struct { @@ -326,7 +327,7 @@ static unsigned int get_measured_perf(unsigned int cpu) #endif - retval = per_cpu(drv_data, cpu)->max_freq * perf_percent / 100; + retval = per_cpu(drv_data, policy->cpu)->max_freq * perf_percent / 100; put_cpu(); set_cpus_allowed_ptr(current, &saved_mask); @@ -785,7 +786,11 @@ static int __init acpi_cpufreq_init(void) if (ret) return ret; - return cpufreq_register_driver(&acpi_cpufreq_driver); + ret = cpufreq_register_driver(&acpi_cpufreq_driver); + if (ret) + free_percpu(acpi_perf_data); + + return ret; } static void __exit acpi_cpufreq_exit(void) @@ -795,8 +800,6 @@ static void __exit acpi_cpufreq_exit(void) cpufreq_unregister_driver(&acpi_cpufreq_driver); free_percpu(acpi_perf_data); - - return; } module_param(acpi_pstate_strict, uint, 0644); diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c index e4a4bf870e94..fe613c93b366 100644 --- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c +++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c @@ -25,8 +25,8 @@ #include <linux/cpufreq.h> #include <asm/msr.h> -#include <asm/timex.h> -#include <asm/io.h> +#include <linux/timex.h> +#include <linux/io.h> #define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */ #define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */ @@ -82,7 +82,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu) u8 clockspeed_reg; /* Clock Speed Register */ local_irq_disable(); - outb_p(0x80,REG_CSCIR); + outb_p(0x80, REG_CSCIR); clockspeed_reg = inb_p(REG_CSCDR); local_irq_enable(); @@ -98,10 +98,10 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu) } /* 33 MHz is not 32 MHz... */ - if ((clockspeed_reg & 0xE0)==0xA0) + if ((clockspeed_reg & 0xE0) == 0xA0) return 33000; - return ((1<<((clockspeed_reg & 0xE0) >> 5)) * 1000); + return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000; } @@ -117,7 +117,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu) * There is no return value. */ -static void elanfreq_set_cpu_state (unsigned int state) +static void elanfreq_set_cpu_state(unsigned int state) { struct cpufreq_freqs freqs; @@ -144,20 +144,20 @@ static void elanfreq_set_cpu_state (unsigned int state) */ local_irq_disable(); - outb_p(0x40,REG_CSCIR); /* Disable hyperspeed mode */ - outb_p(0x00,REG_CSCDR); + outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */ + outb_p(0x00, REG_CSCDR); local_irq_enable(); /* wait till internal pipelines and */ udelay(1000); /* buffers have cleaned up */ local_irq_disable(); /* now, set the CPU clock speed register (0x80) */ - outb_p(0x80,REG_CSCIR); - outb_p(elan_multiplier[state].val80h,REG_CSCDR); + outb_p(0x80, REG_CSCIR); + outb_p(elan_multiplier[state].val80h, REG_CSCDR); /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */ - outb_p(0x40,REG_CSCIR); - outb_p(elan_multiplier[state].val40h,REG_CSCDR); + outb_p(0x40, REG_CSCIR); + outb_p(elan_multiplier[state].val40h, REG_CSCDR); udelay(10000); local_irq_enable(); @@ -173,12 +173,12 @@ static void elanfreq_set_cpu_state (unsigned int state) * for the hardware supported by the driver. */ -static int elanfreq_verify (struct cpufreq_policy *policy) +static int elanfreq_verify(struct cpufreq_policy *policy) { return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]); } -static int elanfreq_target (struct cpufreq_policy *policy, +static int elanfreq_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { @@ -205,7 +205,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy) /* capability check */ if ((c->x86_vendor != X86_VENDOR_AMD) || - (c->x86 != 4) || (c->x86_model!=10)) + (c->x86 != 4) || (c->x86_model != 10)) return -ENODEV; /* max freq */ @@ -213,7 +213,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy) max_freq = elanfreq_get_cpu_frequency(0); /* table init */ - for (i=0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) { + for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) { if (elanfreq_table[i].frequency > max_freq) elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID; } @@ -224,7 +224,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy) result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table); if (result) - return (result); + return result; cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu); return 0; @@ -260,7 +260,7 @@ __setup("elanfreq=", elanfreq_setup); #endif -static struct freq_attr* elanfreq_attr[] = { +static struct freq_attr *elanfreq_attr[] = { &cpufreq_freq_attr_scaling_available_freqs, NULL, }; @@ -284,9 +284,9 @@ static int __init elanfreq_init(void) /* Test if we have the right hardware */ if ((c->x86_vendor != X86_VENDOR_AMD) || - (c->x86 != 4) || (c->x86_model!=10)) { + (c->x86 != 4) || (c->x86_model != 10)) { printk(KERN_INFO "elanfreq: error: no Elan processor found!\n"); - return -ENODEV; + return -ENODEV; } return cpufreq_register_driver(&elanfreq_driver); } @@ -298,7 +298,7 @@ static void __exit elanfreq_exit(void) } -module_param (max_freq, int, 0444); +module_param(max_freq, int, 0444); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, Sven Geggus <sven@geggus.net>"); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c index eb9b62b0830c..b5ced806a316 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c @@ -15,12 +15,11 @@ #include <linux/slab.h> #include <asm/msr.h> -#include <asm/timex.h> -#include <asm/io.h> +#include <linux/timex.h> +#include <linux/io.h> - -#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long - as it is unused */ +#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long + as it is unused */ static unsigned int busfreq; /* FSB, in 10 kHz */ static unsigned int max_multiplier; @@ -53,7 +52,7 @@ static int powernow_k6_get_cpu_multiplier(void) msrval = POWERNOW_IOPORT + 0x1; wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ - invalue=inl(POWERNOW_IOPORT + 0x8); + invalue = inl(POWERNOW_IOPORT + 0x8); msrval = POWERNOW_IOPORT + 0x0; wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ @@ -67,9 +66,9 @@ static int powernow_k6_get_cpu_multiplier(void) * * Tries to change the PowerNow! multiplier */ -static void powernow_k6_set_state (unsigned int best_i) +static void powernow_k6_set_state(unsigned int best_i) { - unsigned long outvalue=0, invalue=0; + unsigned long outvalue = 0, invalue = 0; unsigned long msrval; struct cpufreq_freqs freqs; @@ -90,10 +89,10 @@ static void powernow_k6_set_state (unsigned int best_i) msrval = POWERNOW_IOPORT + 0x1; wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ - invalue=inl(POWERNOW_IOPORT + 0x8); + invalue = inl(POWERNOW_IOPORT + 0x8); invalue = invalue & 0xf; outvalue = outvalue | invalue; - outl(outvalue ,(POWERNOW_IOPORT + 0x8)); + outl(outvalue , (POWERNOW_IOPORT + 0x8)); msrval = POWERNOW_IOPORT + 0x0; wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ @@ -124,7 +123,7 @@ static int powernow_k6_verify(struct cpufreq_policy *policy) * * sets a new CPUFreq policy */ -static int powernow_k6_target (struct cpufreq_policy *policy, +static int powernow_k6_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { @@ -152,7 +151,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy) busfreq = cpu_khz / max_multiplier; /* table init */ - for (i=0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) { + for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) { if (clock_ratio[i].index > max_multiplier) clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID; else @@ -165,7 +164,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy) result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); if (result) - return (result); + return result; cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu); @@ -176,8 +175,8 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy) static int powernow_k6_cpu_exit(struct cpufreq_policy *policy) { unsigned int i; - for (i=0; i<8; i++) { - if (i==max_multiplier) + for (i = 0; i < 8; i++) { + if (i == max_multiplier) powernow_k6_set_state(i); } cpufreq_frequency_table_put_attr(policy->cpu); @@ -189,7 +188,7 @@ static unsigned int powernow_k6_get(unsigned int cpu) return busfreq * powernow_k6_get_cpu_multiplier(); } -static struct freq_attr* powernow_k6_attr[] = { +static struct freq_attr *powernow_k6_attr[] = { &cpufreq_freq_attr_scaling_available_freqs, NULL, }; @@ -227,7 +226,7 @@ static int __init powernow_k6_init(void) } if (cpufreq_register_driver(&powernow_k6_driver)) { - release_region (POWERNOW_IOPORT, 16); + release_region(POWERNOW_IOPORT, 16); return -EINVAL; } @@ -243,13 +242,13 @@ static int __init powernow_k6_init(void) static void __exit powernow_k6_exit(void) { cpufreq_unregister_driver(&powernow_k6_driver); - release_region (POWERNOW_IOPORT, 16); + release_region(POWERNOW_IOPORT, 16); } -MODULE_AUTHOR ("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>"); -MODULE_DESCRIPTION ("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); -MODULE_LICENSE ("GPL"); +MODULE_AUTHOR("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>"); +MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); +MODULE_LICENSE("GPL"); module_init(powernow_k6_init); module_exit(powernow_k6_exit); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 45531e3ba194..4e7ccb0e2a9b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -257,6 +257,7 @@ static void __cpuinit smp_callin(void) end_local_APIC_setup(); map_cpu_to_logical_apicid(); + notify_cpu_starting(cpuid); /* * Get our bogomips. * diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index ee0fba092157..199a5f4a873c 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -448,6 +448,8 @@ static void __init start_secondary(void *unused) VDEBUG(("VOYAGER SMP: CPU%d, stack at about %p\n", cpuid, &cpuid)); + notify_cpu_starting(cpuid); + /* enable interrupts */ local_irq_enable(); diff --git a/drivers/char/tpm/Kconfig b/drivers/char/tpm/Kconfig index 3738cfa209ff..f5fc64f89c5c 100644 --- a/drivers/char/tpm/Kconfig +++ b/drivers/char/tpm/Kconfig @@ -6,6 +6,7 @@ menuconfig TCG_TPM tristate "TPM Hardware Support" depends on HAS_IOMEM depends on EXPERIMENTAL + select SECURITYFS ---help--- If you have a TPM security chip in your system, which implements the Trusted Computing Group's specification, diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 8a67f16987db..31d6f535a79d 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1467,25 +1467,27 @@ int cpufreq_driver_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { - int ret; + int ret = -EINVAL; policy = cpufreq_cpu_get(policy->cpu); if (!policy) - return -EINVAL; + goto no_policy; if (unlikely(lock_policy_rwsem_write(policy->cpu))) - return -EINVAL; + goto fail; ret = __cpufreq_driver_target(policy, target_freq, relation); unlock_policy_rwsem_write(policy->cpu); +fail: cpufreq_cpu_put(policy); +no_policy: return ret; } EXPORT_SYMBOL_GPL(cpufreq_driver_target); -int __cpufreq_driver_getavg(struct cpufreq_policy *policy) +int __cpufreq_driver_getavg(struct cpufreq_policy *policy, unsigned int cpu) { int ret = 0; @@ -1493,8 +1495,8 @@ int __cpufreq_driver_getavg(struct cpufreq_policy *policy) if (!policy) return -EINVAL; - if (cpu_online(policy->cpu) && cpufreq_driver->getavg) - ret = cpufreq_driver->getavg(policy->cpu); + if (cpu_online(cpu) && cpufreq_driver->getavg) + ret = cpufreq_driver->getavg(policy, cpu); cpufreq_cpu_put(policy); return ret; @@ -1717,13 +1719,17 @@ int cpufreq_update_policy(unsigned int cpu) { struct cpufreq_policy *data = cpufreq_cpu_get(cpu); struct cpufreq_policy policy; - int ret = 0; + int ret; - if (!data) - return -ENODEV; + if (!data) { + ret = -ENODEV; + goto no_policy; + } - if (unlikely(lock_policy_rwsem_write(cpu))) - return -EINVAL; + if (unlikely(lock_policy_rwsem_write(cpu))) { + ret = -EINVAL; + goto fail; + } dprintk("updating policy for CPU %u\n", cpu); memcpy(&policy, data, sizeof(struct cpufreq_policy)); @@ -1750,7 +1756,9 @@ int cpufreq_update_policy(unsigned int cpu) unlock_policy_rwsem_write(cpu); +fail: cpufreq_cpu_put(data); +no_policy: return ret; } EXPORT_SYMBOL(cpufreq_update_policy); diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index ac0bbf2d234f..e2657837d954 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -460,6 +460,7 @@ static void do_dbs_timer(struct work_struct *work) static inline void dbs_timer_init(void) { + init_timer_deferrable(&dbs_work.timer); schedule_delayed_work(&dbs_work, usecs_to_jiffies(dbs_tuners_ins.sampling_rate)); return; @@ -575,13 +576,15 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, return 0; } +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE +static +#endif struct cpufreq_governor cpufreq_gov_conservative = { .name = "conservative", .governor = cpufreq_governor_dbs, .max_transition_latency = TRANSITION_LATENCY_LIMIT, .owner = THIS_MODULE, }; -EXPORT_SYMBOL(cpufreq_gov_conservative); static int __init cpufreq_gov_dbs_init(void) { diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 33855cb3cf16..2ab3c12b88af 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -18,13 +18,19 @@ #include <linux/jiffies.h> #include <linux/kernel_stat.h> #include <linux/mutex.h> +#include <linux/hrtimer.h> +#include <linux/tick.h> +#include <linux/ktime.h> /* * dbs is used in this file as a shortform for demandbased switching * It helps to keep variable names smaller, simpler */ +#define DEF_FREQUENCY_DOWN_DIFFERENTIAL (10) #define DEF_FREQUENCY_UP_THRESHOLD (80) +#define MICRO_FREQUENCY_DOWN_DIFFERENTIAL (3) +#define MICRO_FREQUENCY_UP_THRESHOLD (95) #define MIN_FREQUENCY_UP_THRESHOLD (11) #define MAX_FREQUENCY_UP_THRESHOLD (100) @@ -57,6 +63,7 @@ enum {DBS_NORMAL_SAMPLE, DBS_SUB_SAMPLE}; struct cpu_dbs_info_s { cputime64_t prev_cpu_idle; cputime64_t prev_cpu_wall; + cputime64_t prev_cpu_nice; struct cpufreq_policy *cur_policy; struct delayed_work work; struct cpufreq_frequency_table *freq_table; @@ -86,21 +93,24 @@ static struct workqueue_struct *kondemand_wq; static struct dbs_tuners { unsigned int sampling_rate; unsigned int up_threshold; + unsigned int down_differential; unsigned int ignore_nice; unsigned int powersave_bias; } dbs_tuners_ins = { .up_threshold = DEF_FREQUENCY_UP_THRESHOLD, + .down_differential = DEF_FREQUENCY_DOWN_DIFFERENTIAL, .ignore_nice = 0, .powersave_bias = 0, }; -static inline cputime64_t get_cpu_idle_time(unsigned int cpu) +static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu, + cputime64_t *wall) { cputime64_t idle_time; - cputime64_t cur_jiffies; + cputime64_t cur_wall_time; cputime64_t busy_time; - cur_jiffies = jiffies64_to_cputime64(get_jiffies_64()); + cur_wall_time = jiffies64_to_cputime64(get_jiffies_64()); busy_time = cputime64_add(kstat_cpu(cpu).cpustat.user, kstat_cpu(cpu).cpustat.system); @@ -113,7 +123,37 @@ static inline cputime64_t get_cpu_idle_time(unsigned int cpu) kstat_cpu(cpu).cpustat.nice); } - idle_time = cputime64_sub(cur_jiffies, busy_time); + idle_time = cputime64_sub(cur_wall_time, busy_time); + if (wall) + *wall = cur_wall_time; + + return idle_time; +} + +static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall) +{ + u64 idle_time = get_cpu_idle_time_us(cpu, wall); + + if (idle_time == -1ULL) + return get_cpu_idle_time_jiffy(cpu, wall); + + if (dbs_tuners_ins.ignore_nice) { + cputime64_t cur_nice; + unsigned long cur_nice_jiffies; + struct cpu_dbs_info_s *dbs_info; + + dbs_info = &per_cpu(cpu_dbs_info, cpu); + cur_nice = cputime64_sub(kstat_cpu(cpu).cpustat.nice, + dbs_info->prev_cpu_nice); + /* + * Assumption: nice time between sampling periods will be + * less than 2^32 jiffies for 32 bit sys + */ + cur_nice_jiffies = (unsigned long) + cputime64_to_jiffies64(cur_nice); + dbs_info->prev_cpu_nice = kstat_cpu(cpu).cpustat.nice; + return idle_time + jiffies_to_usecs(cur_nice_jiffies); + } return idle_time; } @@ -277,8 +317,8 @@ static ssize_t store_ignore_nice_load(struct cpufreq_policy *policy, for_each_online_cpu(j) { struct cpu_dbs_info_s *dbs_info; dbs_info = &per_cpu(cpu_dbs_info, j); - dbs_info->prev_cpu_idle = get_cpu_idle_time(j); - dbs_info->prev_cpu_wall = get_jiffies_64(); + dbs_info->prev_cpu_idle = get_cpu_idle_time(j, + &dbs_info->prev_cpu_wall); } mutex_unlock(&dbs_mutex); @@ -334,9 +374,7 @@ static struct attribute_group dbs_attr_group = { static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) { - unsigned int idle_ticks, total_ticks; - unsigned int load = 0; - cputime64_t cur_jiffies; + unsigned int max_load_freq; struct cpufreq_policy *policy; unsigned int j; @@ -346,13 +384,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) this_dbs_info->freq_lo = 0; policy = this_dbs_info->cur_policy; - cur_jiffies = jiffies64_to_cputime64(get_jiffies_64()); - total_ticks = (unsigned int) cputime64_sub(cur_jiffies, - this_dbs_info->prev_cpu_wall); - this_dbs_info->prev_cpu_wall = get_jiffies_64(); - if (!total_ticks) - return; /* * Every sampling_rate, we check, if current idle time is less * than 20% (default), then we try to increase frequency @@ -365,27 +397,44 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) * 5% (default) of current frequency */ - /* Get Idle Time */ - idle_ticks = UINT_MAX; + /* Get Absolute Load - in terms of freq */ + max_load_freq = 0; + for_each_cpu_mask_nr(j, policy->cpus) { - cputime64_t total_idle_ticks; - unsigned int tmp_idle_ticks; struct cpu_dbs_info_s *j_dbs_info; + cputime64_t cur_wall_time, cur_idle_time; + unsigned int idle_time, wall_time; + unsigned int load, load_freq; + int freq_avg; j_dbs_info = &per_cpu(cpu_dbs_info, j); - total_idle_ticks = get_cpu_idle_time(j); - tmp_idle_ticks = (unsigned int) cputime64_sub(total_idle_ticks, + + cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); + + wall_time = (unsigned int) cputime64_sub(cur_wall_time, + j_dbs_info->prev_cpu_wall); + j_dbs_info->prev_cpu_wall = cur_wall_time; + + idle_time = (unsigned int) cputime64_sub(cur_idle_time, j_dbs_info->prev_cpu_idle); - j_dbs_info->prev_cpu_idle = total_idle_ticks; + j_dbs_info->prev_cpu_idle = cur_idle_time; + + if (unlikely(!wall_time || wall_time < idle_time)) + continue; + + load = 100 * (wall_time - idle_time) / wall_time; + + freq_avg = __cpufreq_driver_getavg(policy, j); + if (freq_avg <= 0) + freq_avg = policy->cur; - if (tmp_idle_ticks < idle_ticks) - idle_ticks = tmp_idle_ticks; + load_freq = load * freq_avg; + if (load_freq > max_load_freq) + max_load_freq = load_freq; } - if (likely(total_ticks > idle_ticks)) - load = (100 * (total_ticks - idle_ticks)) / total_ticks; /* Check for frequency increase */ - if (load > dbs_tuners_ins.up_threshold) { + if (max_load_freq > dbs_tuners_ins.up_threshold * policy->cur) { /* if we are already at full speed then break out early */ if (!dbs_tuners_ins.powersave_bias) { if (policy->cur == policy->max) @@ -412,15 +461,13 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) * can support the current CPU usage without triggering the up * policy. To be safe, we focus 10 points under the threshold. */ - if (load < (dbs_tuners_ins.up_threshold - 10)) { - unsigned int freq_next, freq_cur; - - freq_cur = __cpufreq_driver_getavg(policy); - if (!freq_cur) - freq_cur = policy->cur; - - freq_next = (freq_cur * load) / - (dbs_tuners_ins.up_threshold - 10); + if (max_load_freq < + (dbs_tuners_ins.up_threshold - dbs_tuners_ins.down_differential) * + policy->cur) { + unsigned int freq_next; + freq_next = max_load_freq / + (dbs_tuners_ins.up_threshold - + dbs_tuners_ins.down_differential); if (!dbs_tuners_ins.powersave_bias) { __cpufreq_driver_target(policy, freq_next, @@ -526,8 +573,8 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, j_dbs_info = &per_cpu(cpu_dbs_info, j); j_dbs_info->cur_policy = policy; - j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j); - j_dbs_info->prev_cpu_wall = get_jiffies_64(); + j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, + &j_dbs_info->prev_cpu_wall); } this_dbs_info->cpu = cpu; /* @@ -579,22 +626,42 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, return 0; } +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND +static +#endif struct cpufreq_governor cpufreq_gov_ondemand = { .name = "ondemand", .governor = cpufreq_governor_dbs, .max_transition_latency = TRANSITION_LATENCY_LIMIT, .owner = THIS_MODULE, }; -EXPORT_SYMBOL(cpufreq_gov_ondemand); static int __init cpufreq_gov_dbs_init(void) { + int err; + cputime64_t wall; + u64 idle_time; + int cpu = get_cpu(); + + idle_time = get_cpu_idle_time_us(cpu, &wall); + put_cpu(); + if (idle_time != -1ULL) { + /* Idle micro accounting is supported. Use finer thresholds */ + dbs_tuners_ins.up_threshold = MICRO_FREQUENCY_UP_THRESHOLD; + dbs_tuners_ins.down_differential = + MICRO_FREQUENCY_DOWN_DIFFERENTIAL; + } + kondemand_wq = create_workqueue("kondemand"); if (!kondemand_wq) { printk(KERN_ERR "Creation of kondemand failed\n"); return -EFAULT; } - return cpufreq_register_governor(&cpufreq_gov_ondemand); + err = cpufreq_register_governor(&cpufreq_gov_ondemand); + if (err) + destroy_workqueue(kondemand_wq); + + return err; } static void __exit cpufreq_gov_dbs_exit(void) diff --git a/drivers/cpufreq/cpufreq_performance.c b/drivers/cpufreq/cpufreq_performance.c index e8e1451ef1c1..7e2e515087f8 100644 --- a/drivers/cpufreq/cpufreq_performance.c +++ b/drivers/cpufreq/cpufreq_performance.c @@ -36,12 +36,14 @@ static int cpufreq_governor_performance(struct cpufreq_policy *policy, return 0; } +#ifdef CONFIG_CPU_FREQ_GOV_PERFORMANCE_MODULE +static +#endif struct cpufreq_governor cpufreq_gov_performance = { .name = "performance", .governor = cpufreq_governor_performance, .owner = THIS_MODULE, }; -EXPORT_SYMBOL(cpufreq_gov_performance); static int __init cpufreq_gov_performance_init(void) diff --git a/drivers/cpufreq/cpufreq_powersave.c b/drivers/cpufreq/cpufreq_powersave.c index 88d2f44fba48..e6db5faf3eb1 100644 --- a/drivers/cpufreq/cpufreq_powersave.c +++ b/drivers/cpufreq/cpufreq_powersave.c @@ -35,12 +35,14 @@ static int cpufreq_governor_powersave(struct cpufreq_policy *policy, return 0; } +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE +static +#endif struct cpufreq_governor cpufreq_gov_powersave = { .name = "powersave", .governor = cpufreq_governor_powersave, .owner = THIS_MODULE, }; -EXPORT_SYMBOL(cpufreq_gov_powersave); static int __init cpufreq_gov_powersave_init(void) { diff --git a/drivers/cpufreq/cpufreq_userspace.c b/drivers/cpufreq/cpufreq_userspace.c index 32244aa7cc0c..1442bbada053 100644 --- a/drivers/cpufreq/cpufreq_userspace.c +++ b/drivers/cpufreq/cpufreq_userspace.c @@ -187,6 +187,9 @@ static int cpufreq_governor_userspace(struct cpufreq_policy *policy, } +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE +static +#endif struct cpufreq_governor cpufreq_gov_userspace = { .name = "userspace", .governor = cpufreq_governor_userspace, @@ -194,7 +197,6 @@ struct cpufreq_governor cpufreq_gov_userspace = { .show_setspeed = show_speed, .owner = THIS_MODULE, }; -EXPORT_SYMBOL(cpufreq_gov_userspace); static int __init cpufreq_gov_userspace_init(void) { diff --git a/include/linux/compiler.h b/include/linux/compiler.h index c8bd2daf95ec..8322141ee480 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -190,7 +190,9 @@ extern void __chk_io_ptr(const volatile void __iomem *); * ACCESS_ONCE() in different C statements. * * This macro does absolutely -nothing- to prevent the CPU from reordering, - * merging, or refetching absolutely anything at any time. + * merging, or refetching absolutely anything at any time. Its main intended + * use is to mediate communication between process-level code and irq/NMI + * handlers, all running on the same CPU. */ #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) diff --git a/include/linux/completion.h b/include/linux/completion.h index 02ef8835999c..4a6b604ef7e4 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -10,6 +10,18 @@ #include <linux/wait.h> +/** + * struct completion - structure used to maintain state for a "completion" + * + * This is the opaque structure used to maintain the state for a "completion". + * Completions currently use a FIFO to queue threads that have to wait for + * the "completion" event. + * + * See also: complete(), wait_for_completion() (and friends _timeout, + * _interruptible, _interruptible_timeout, and _killable), init_completion(), + * and macros DECLARE_COMPLETION(), DECLARE_COMPLETION_ONSTACK(), and + * INIT_COMPLETION(). + */ struct completion { unsigned int done; wait_queue_head_t wait; @@ -21,6 +33,14 @@ struct completion { #define COMPLETION_INITIALIZER_ONSTACK(work) \ ({ init_completion(&work); work; }) +/** + * DECLARE_COMPLETION: - declare and initialize a completion structure + * @work: identifier for the completion structure + * + * This macro declares and initializes a completion structure. Generally used + * for static declarations. You should use the _ONSTACK variant for automatic + * variables. + */ #define DECLARE_COMPLETION(work) \ struct completion work = COMPLETION_INITIALIZER(work) @@ -29,6 +49,13 @@ struct completion { * completions - so we use the _ONSTACK() variant for those that * are on the kernel stack: */ +/** + * DECLARE_COMPLETION_ONSTACK: - declare and initialize a completion structure + * @work: identifier for the completion structure + * + * This macro declares and initializes a completion structure on the kernel + * stack. + */ #ifdef CONFIG_LOCKDEP # define DECLARE_COMPLETION_ONSTACK(work) \ struct completion work = COMPLETION_INITIALIZER_ONSTACK(work) @@ -36,6 +63,13 @@ struct completion { # define DECLARE_COMPLETION_ONSTACK(work) DECLARE_COMPLETION(work) #endif +/** + * init_completion: - Initialize a dynamically allocated completion + * @x: completion structure that is to be initialized + * + * This inline function will initialize a dynamically created completion + * structure. + */ static inline void init_completion(struct completion *x) { x->done = 0; @@ -55,6 +89,13 @@ extern bool completion_done(struct completion *x); extern void complete(struct completion *); extern void complete_all(struct completion *); +/** + * INIT_COMPLETION: - reinitialize a completion structure + * @x: completion structure to be reinitialized + * + * This macro should be used to reinitialize a completion structure so it can + * be reused. This is especially important after complete_all() is used. + */ #define INIT_COMPLETION(x) ((x).done = 0) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index d7faf8808497..c2747ac2ae43 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -69,6 +69,7 @@ static inline void unregister_cpu_notifier(struct notifier_block *nb) #endif int cpu_up(unsigned int cpu); +void notify_cpu_starting(unsigned int cpu); extern void cpu_hotplug_init(void); extern void cpu_maps_update_begin(void); extern void cpu_maps_update_done(void); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 6fd5668aa572..1ee608fd7b77 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -187,7 +187,8 @@ extern int __cpufreq_driver_target(struct cpufreq_policy *policy, unsigned int relation); -extern int __cpufreq_driver_getavg(struct cpufreq_policy *policy); +extern int __cpufreq_driver_getavg(struct cpufreq_policy *policy, + unsigned int cpu); int cpufreq_register_governor(struct cpufreq_governor *governor); void cpufreq_unregister_governor(struct cpufreq_governor *governor); @@ -226,7 +227,9 @@ struct cpufreq_driver { unsigned int (*get) (unsigned int cpu); /* optional */ - unsigned int (*getavg) (unsigned int cpu); + unsigned int (*getavg) (struct cpufreq_policy *policy, + unsigned int cpu); + int (*exit) (struct cpufreq_policy *policy); int (*suspend) (struct cpufreq_policy *policy, pm_message_t pmsg); int (*resume) (struct cpufreq_policy *policy); diff --git a/include/linux/notifier.h b/include/linux/notifier.h index da2698b0fdd1..b86fa2ffca0c 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -213,9 +213,16 @@ static inline int notifier_to_errno(int ret) #define CPU_DOWN_FAILED 0x0006 /* CPU (unsigned)v NOT going down */ #define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */ #define CPU_DYING 0x0008 /* CPU (unsigned)v not running any task, - * not handling interrupts, soon dead */ + * not handling interrupts, soon dead. + * Called on the dying cpu, interrupts + * are already disabled. Must not + * sleep, must not fail */ #define CPU_POST_DEAD 0x0009 /* CPU (unsigned)v dead, cpu_hotplug * lock is dropped */ +#define CPU_STARTING 0x000A /* CPU (unsigned)v soon running. + * Called on the new cpu, just before + * enabling interrupts. Must not sleep, + * must not fail */ /* Used for CPU hotplug events occuring while tasks are frozen due to a suspend * operation in progress @@ -229,6 +236,7 @@ static inline int notifier_to_errno(int ret) #define CPU_DOWN_FAILED_FROZEN (CPU_DOWN_FAILED | CPU_TASKS_FROZEN) #define CPU_DEAD_FROZEN (CPU_DEAD | CPU_TASKS_FROZEN) #define CPU_DYING_FROZEN (CPU_DYING | CPU_TASKS_FROZEN) +#define CPU_STARTING_FROZEN (CPU_STARTING | CPU_TASKS_FROZEN) /* Hibernation and suspend events */ #define PM_HIBERNATION_PREPARE 0x0001 /* Going to hibernate */ diff --git a/include/linux/proportions.h b/include/linux/proportions.h index 5afc1b23346d..cf793bbbd05e 100644 --- a/include/linux/proportions.h +++ b/include/linux/proportions.h @@ -104,8 +104,8 @@ struct prop_local_single { * snapshot of the last seen global state * and a lock protecting this state */ - int shift; unsigned long period; + int shift; spinlock_t lock; /* protect the snapshot state */ }; diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h index 4ab843622727..5f89b62e6983 100644 --- a/include/linux/rcuclassic.h +++ b/include/linux/rcuclassic.h @@ -40,12 +40,21 @@ #include <linux/cpumask.h> #include <linux/seqlock.h> +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR +#define RCU_SECONDS_TILL_STALL_CHECK ( 3 * HZ) /* for rcp->jiffies_stall */ +#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rcp->jiffies_stall */ +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* Global control variables for rcupdate callback mechanism. */ struct rcu_ctrlblk { long cur; /* Current batch number. */ long completed; /* Number of the last completed batch */ - int next_pending; /* Is the next batch already waiting? */ + long pending; /* Number of the last pending batch */ +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + unsigned long gp_start; /* Time at which GP started in jiffies. */ + unsigned long jiffies_stall; + /* Time at which to check for CPU stalls. */ +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ int signaled; @@ -66,11 +75,7 @@ static inline int rcu_batch_after(long a, long b) return (a - b) > 0; } -/* - * Per-CPU data for Read-Copy UPdate. - * nxtlist - new callbacks are added here - * curlist - current batch for which quiescent cycle started if any - */ +/* Per-CPU data for Read-Copy UPdate. */ struct rcu_data { /* 1) quiescent state handling : */ long quiescbatch; /* Batch # for grace period */ @@ -78,12 +83,24 @@ struct rcu_data { int qs_pending; /* core waits for quiesc state */ /* 2) batch handling */ - long batch; /* Batch # for current RCU batch */ + /* + * if nxtlist is not NULL, then: + * batch: + * The batch # for the last entry of nxtlist + * [*nxttail[1], NULL = *nxttail[2]): + * Entries that batch # <= batch + * [*nxttail[0], *nxttail[1]): + * Entries that batch # <= batch - 1 + * [nxtlist, *nxttail[0]): + * Entries that batch # <= batch - 2 + * The grace period for these entries has completed, and + * the other grace-period-completed entries may be moved + * here temporarily in rcu_process_callbacks(). + */ + long batch; struct rcu_head *nxtlist; - struct rcu_head **nxttail; + struct rcu_head **nxttail[3]; long qlen; /* # of queued callbacks */ - struct rcu_head *curlist; - struct rcu_head **curtail; struct rcu_head *donelist; struct rcu_head **donetail; long blimit; /* Upper limit on a processed batch */ diff --git a/include/linux/rculist.h b/include/linux/rculist.h index eb4443c7e05b..e649bd3f2c97 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -198,20 +198,6 @@ static inline void list_splice_init_rcu(struct list_head *list, at->prev = last; } -/** - * list_for_each_rcu - iterate over an rcu-protected list - * @pos: the &struct list_head to use as a loop cursor. - * @head: the head for your list. - * - * This list-traversal primitive may safely run concurrently with - * the _rcu list-mutation primitives such as list_add_rcu() - * as long as the traversal is guarded by rcu_read_lock(). - */ -#define list_for_each_rcu(pos, head) \ - for (pos = rcu_dereference((head)->next); \ - prefetch(pos->next), pos != (head); \ - pos = rcu_dereference(pos->next)) - #define __list_for_each_rcu(pos, head) \ for (pos = rcu_dereference((head)->next); \ pos != (head); \ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index e8b4039cfb2f..86f1f5e43e33 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -133,6 +133,26 @@ struct rcu_head { #define rcu_read_unlock_bh() __rcu_read_unlock_bh() /** + * rcu_read_lock_sched - mark the beginning of a RCU-classic critical section + * + * Should be used with either + * - synchronize_sched() + * or + * - call_rcu_sched() and rcu_barrier_sched() + * on the write-side to insure proper synchronization. + */ +#define rcu_read_lock_sched() preempt_disable() + +/* + * rcu_read_unlock_sched - marks the end of a RCU-classic critical section + * + * See rcu_read_lock_sched for more information. + */ +#define rcu_read_unlock_sched() preempt_enable() + + + +/** * rcu_dereference - fetch an RCU-protected pointer in an * RCU read-side critical section. This pointer may later * be safely dereferenced. diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h index 0967f03b0705..3e05c09b54a2 100644 --- a/include/linux/rcupreempt.h +++ b/include/linux/rcupreempt.h @@ -57,7 +57,13 @@ static inline void rcu_qsctr_inc(int cpu) rdssp->sched_qs++; } #define rcu_bh_qsctr_inc(cpu) -#define call_rcu_bh(head, rcu) call_rcu(head, rcu) + +/* + * Someone might want to pass call_rcu_bh as a function pointer. + * So this needs to just be a rename and not a macro function. + * (no parentheses) + */ +#define call_rcu_bh call_rcu /** * call_rcu_sched - Queue RCU callback for invocation after sched grace period. @@ -111,7 +117,6 @@ extern struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu); struct softirq_action; #ifdef CONFIG_NO_HZ -DECLARE_PER_CPU(struct rcu_dyntick_sched, rcu_dyntick_sched); static inline void rcu_enter_nohz(void) { @@ -126,8 +131,8 @@ static inline void rcu_exit_nohz(void) { static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1); - smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ __get_cpu_var(rcu_dyntick_sched).dynticks++; + smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1), &rs); } diff --git a/include/linux/sched.h b/include/linux/sched.h index 3d9120c5ad15..5d0819ee442a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -451,8 +451,8 @@ struct signal_struct { * - everyone except group_exit_task is stopped during signal delivery * of fatal signals, group_exit_task processes the signal. */ - struct task_struct *group_exit_task; int notify_count; + struct task_struct *group_exit_task; /* thread group stop support, overloads group_exit_code too */ int group_stop_count; @@ -824,6 +824,9 @@ struct sched_domain { unsigned int ttwu_move_affine; unsigned int ttwu_move_balance; #endif +#ifdef CONFIG_SCHED_DEBUG + char *name; +#endif }; extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, @@ -897,7 +900,7 @@ struct sched_class { void (*yield_task) (struct rq *rq); int (*select_task_rq)(struct task_struct *p, int sync); - void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); + void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int sync); struct task_struct * (*pick_next_task) (struct rq *rq); void (*put_prev_task) (struct rq *rq, struct task_struct *p); @@ -1010,8 +1013,8 @@ struct sched_entity { struct sched_rt_entity { struct list_head run_list; - unsigned int time_slice; unsigned long timeout; + unsigned int time_slice; int nr_cpus_allowed; struct sched_rt_entity *back; diff --git a/include/linux/security.h b/include/linux/security.h index 80c4d002864c..f5c4a51eb42e 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1560,11 +1560,6 @@ struct security_operations { extern int security_init(void); extern int security_module_enable(struct security_operations *ops); extern int register_security(struct security_operations *ops); -extern struct dentry *securityfs_create_file(const char *name, mode_t mode, - struct dentry *parent, void *data, - const struct file_operations *fops); -extern struct dentry *securityfs_create_dir(const char *name, struct dentry *parent); -extern void securityfs_remove(struct dentry *dentry); /* Security operations */ int security_ptrace_may_access(struct task_struct *child, unsigned int mode); @@ -2424,25 +2419,6 @@ static inline int security_netlink_recv(struct sk_buff *skb, int cap) return cap_netlink_recv(skb, cap); } -static inline struct dentry *securityfs_create_dir(const char *name, - struct dentry *parent) -{ - return ERR_PTR(-ENODEV); -} - -static inline struct dentry *securityfs_create_file(const char *name, - mode_t mode, - struct dentry *parent, - void *data, - const struct file_operations *fops) -{ - return ERR_PTR(-ENODEV); -} - -static inline void securityfs_remove(struct dentry *dentry) -{ -} - static inline int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) { return -EOPNOTSUPP; @@ -2806,5 +2782,35 @@ static inline void security_audit_rule_free(void *lsmrule) #endif /* CONFIG_SECURITY */ #endif /* CONFIG_AUDIT */ +#ifdef CONFIG_SECURITYFS + +extern struct dentry *securityfs_create_file(const char *name, mode_t mode, + struct dentry *parent, void *data, + const struct file_operations *fops); +extern struct dentry *securityfs_create_dir(const char *name, struct dentry *parent); +extern void securityfs_remove(struct dentry *dentry); + +#else /* CONFIG_SECURITYFS */ + +static inline struct dentry *securityfs_create_dir(const char *name, + struct dentry *parent) +{ + return ERR_PTR(-ENODEV); +} + +static inline struct dentry *securityfs_create_file(const char *name, + mode_t mode, + struct dentry *parent, + void *data, + const struct file_operations *fops) +{ + return ERR_PTR(-ENODEV); +} + +static inline void securityfs_remove(struct dentry *dentry) +{} + +#endif + #endif /* ! __LINUX_SECURITY_H */ diff --git a/include/linux/tick.h b/include/linux/tick.h index 8cf8cfe2cc97..98921a3e1aa8 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -126,7 +126,7 @@ static inline ktime_t tick_nohz_get_sleep_length(void) return len; } static inline void tick_nohz_stop_idle(int cpu) { } -static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return 0; } +static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } # endif /* !NO_HZ */ #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index f17e9854c246..86d49045daed 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -199,13 +199,14 @@ static int __ref take_cpu_down(void *_param) struct take_cpu_down_param *param = _param; int err; - raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, - param->hcpu); /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); if (err < 0) return err; + raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, + param->hcpu); + /* Force idle task to run as soon as we yield: it should immediately notice cpu is offline and die quickly. */ sched_idle_next(); @@ -453,6 +454,25 @@ out: } #endif /* CONFIG_PM_SLEEP_SMP */ +/** + * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers + * @cpu: cpu that just started + * + * This function calls the cpu_chain notifiers with CPU_STARTING. + * It must be called by the arch code on the new cpu, before the new cpu + * enables interrupts and before the "boot" cpu returns from __cpu_up(). + */ +void notify_cpu_starting(unsigned int cpu) +{ + unsigned long val = CPU_STARTING; + +#ifdef CONFIG_PM_SLEEP_SMP + if (cpu_isset(cpu, frozen_cpus)) + val = CPU_STARTING_FROZEN; +#endif /* CONFIG_PM_SLEEP_SMP */ + raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu); +} + #endif /* CONFIG_SMP */ /* diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 827cd9adccb2..eab7bd6628e0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1921,7 +1921,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) * that has tasks along with an empty 'mems'. But if we did see such * a cpuset, we'd handle it just like we do if its 'cpus' was empty. */ -static void scan_for_empty_cpusets(const struct cpuset *root) +static void scan_for_empty_cpusets(struct cpuset *root) { LIST_HEAD(queue); struct cpuset *cp; /* scans cpusets being updated */ diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index aad93cdc9f68..37f72e551542 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -47,6 +47,7 @@ #include <linux/notifier.h> #include <linux/cpu.h> #include <linux/mutex.h> +#include <linux/time.h> #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; @@ -60,12 +61,14 @@ EXPORT_SYMBOL_GPL(rcu_lock_map); static struct rcu_ctrlblk rcu_ctrlblk = { .cur = -300, .completed = -300, + .pending = -300, .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), .cpumask = CPU_MASK_NONE, }; static struct rcu_ctrlblk rcu_bh_ctrlblk = { .cur = -300, .completed = -300, + .pending = -300, .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), .cpumask = CPU_MASK_NONE, }; @@ -83,7 +86,10 @@ static void force_quiescent_state(struct rcu_data *rdp, { int cpu; cpumask_t cpumask; + unsigned long flags; + set_need_resched(); + spin_lock_irqsave(&rcp->lock, flags); if (unlikely(!rcp->signaled)) { rcp->signaled = 1; /* @@ -109,6 +115,7 @@ static void force_quiescent_state(struct rcu_data *rdp, for_each_cpu_mask_nr(cpu, cpumask) smp_send_reschedule(cpu); } + spin_unlock_irqrestore(&rcp->lock, flags); } #else static inline void force_quiescent_state(struct rcu_data *rdp, @@ -118,6 +125,126 @@ static inline void force_quiescent_state(struct rcu_data *rdp, } #endif +static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp, + struct rcu_data *rdp) +{ + long batch; + + head->next = NULL; + smp_mb(); /* Read of rcu->cur must happen after any change by caller. */ + + /* + * Determine the batch number of this callback. + * + * Using ACCESS_ONCE to avoid the following error when gcc eliminates + * local variable "batch" and emits codes like this: + * 1) rdp->batch = rcp->cur + 1 # gets old value + * ...... + * 2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value + * then [*nxttail[0], *nxttail[1]) may contain callbacks + * that batch# = rdp->batch, see the comment of struct rcu_data. + */ + batch = ACCESS_ONCE(rcp->cur) + 1; + + if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) { + /* process callbacks */ + rdp->nxttail[0] = rdp->nxttail[1]; + rdp->nxttail[1] = rdp->nxttail[2]; + if (rcu_batch_after(batch - 1, rdp->batch)) + rdp->nxttail[0] = rdp->nxttail[2]; + } + + rdp->batch = batch; + *rdp->nxttail[2] = head; + rdp->nxttail[2] = &head->next; + + if (unlikely(++rdp->qlen > qhimark)) { + rdp->blimit = INT_MAX; + force_quiescent_state(rdp, &rcu_ctrlblk); + } +} + +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) +{ + rcp->gp_start = jiffies; + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; +} + +static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) +{ + int cpu; + long delta; + unsigned long flags; + + /* Only let one CPU complain about others per time interval. */ + + spin_lock_irqsave(&rcp->lock, flags); + delta = jiffies - rcp->jiffies_stall; + if (delta < 2 || rcp->cur != rcp->completed) { + spin_unlock_irqrestore(&rcp->lock, flags); + return; + } + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + spin_unlock_irqrestore(&rcp->lock, flags); + + /* OK, time to rat on our buddy... */ + + printk(KERN_ERR "RCU detected CPU stalls:"); + for_each_possible_cpu(cpu) { + if (cpu_isset(cpu, rcp->cpumask)) + printk(" %d", cpu); + } + printk(" (detected by %d, t=%ld jiffies)\n", + smp_processor_id(), (long)(jiffies - rcp->gp_start)); +} + +static void print_cpu_stall(struct rcu_ctrlblk *rcp) +{ + unsigned long flags; + + printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", + smp_processor_id(), jiffies, + jiffies - rcp->gp_start); + dump_stack(); + spin_lock_irqsave(&rcp->lock, flags); + if ((long)(jiffies - rcp->jiffies_stall) >= 0) + rcp->jiffies_stall = + jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + spin_unlock_irqrestore(&rcp->lock, flags); + set_need_resched(); /* kick ourselves to get things going. */ +} + +static void check_cpu_stall(struct rcu_ctrlblk *rcp) +{ + long delta; + + delta = jiffies - rcp->jiffies_stall; + if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { + + /* We haven't checked in, so go dump stack. */ + print_cpu_stall(rcp); + + } else if (rcp->cur != rcp->completed && delta >= 2) { + + /* They had two seconds to dump stack, so complain. */ + print_other_cpu_stall(rcp); + } +} + +#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) +{ +} + +static inline void check_cpu_stall(struct rcu_ctrlblk *rcp) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + /** * call_rcu - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. @@ -133,18 +260,10 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { unsigned long flags; - struct rcu_data *rdp; head->func = func; - head->next = NULL; local_irq_save(flags); - rdp = &__get_cpu_var(rcu_data); - *rdp->nxttail = head; - rdp->nxttail = &head->next; - if (unlikely(++rdp->qlen > qhimark)) { - rdp->blimit = INT_MAX; - force_quiescent_state(rdp, &rcu_ctrlblk); - } + __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data)); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(call_rcu); @@ -169,20 +288,10 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { unsigned long flags; - struct rcu_data *rdp; head->func = func; - head->next = NULL; local_irq_save(flags); - rdp = &__get_cpu_var(rcu_bh_data); - *rdp->nxttail = head; - rdp->nxttail = &head->next; - - if (unlikely(++rdp->qlen > qhimark)) { - rdp->blimit = INT_MAX; - force_quiescent_state(rdp, &rcu_bh_ctrlblk); - } - + __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(call_rcu_bh); @@ -211,12 +320,6 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); static inline void raise_rcu_softirq(void) { raise_softirq(RCU_SOFTIRQ); - /* - * The smp_mb() here is required to ensure that this cpu's - * __rcu_process_callbacks() reads the most recently updated - * value of rcu->cur. - */ - smp_mb(); } /* @@ -225,6 +328,7 @@ static inline void raise_rcu_softirq(void) */ static void rcu_do_batch(struct rcu_data *rdp) { + unsigned long flags; struct rcu_head *next, *list; int count = 0; @@ -239,9 +343,9 @@ static void rcu_do_batch(struct rcu_data *rdp) } rdp->donelist = list; - local_irq_disable(); + local_irq_save(flags); rdp->qlen -= count; - local_irq_enable(); + local_irq_restore(flags); if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) rdp->blimit = blimit; @@ -269,6 +373,7 @@ static void rcu_do_batch(struct rcu_data *rdp) * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace * period (if necessary). */ + /* * Register a new batch of callbacks, and start it up if there is currently no * active batch and the batch to be registered has not already occurred. @@ -276,15 +381,10 @@ static void rcu_do_batch(struct rcu_data *rdp) */ static void rcu_start_batch(struct rcu_ctrlblk *rcp) { - if (rcp->next_pending && + if (rcp->cur != rcp->pending && rcp->completed == rcp->cur) { - rcp->next_pending = 0; - /* - * next_pending == 0 must be visible in - * __rcu_process_callbacks() before it can see new value of cur. - */ - smp_wmb(); rcp->cur++; + record_gp_stall_check_time(rcp); /* * Accessing nohz_cpu_mask before incrementing rcp->cur needs a @@ -322,6 +422,8 @@ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + unsigned long flags; + if (rdp->quiescbatch != rcp->cur) { /* start new grace period: */ rdp->qs_pending = 1; @@ -345,7 +447,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, return; rdp->qs_pending = 0; - spin_lock(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags); /* * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync * during cpu startup. Ignore the quiescent state. @@ -353,7 +455,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, if (likely(rdp->quiescbatch == rcp->cur)) cpu_quiet(rdp->cpu, rcp); - spin_unlock(&rcp->lock); + spin_unlock_irqrestore(&rcp->lock, flags); } @@ -364,33 +466,38 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, * which is dead and hence not processing interrupts. */ static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, - struct rcu_head **tail) + struct rcu_head **tail, long batch) { - local_irq_disable(); - *this_rdp->nxttail = list; - if (list) - this_rdp->nxttail = tail; - local_irq_enable(); + unsigned long flags; + + if (list) { + local_irq_save(flags); + this_rdp->batch = batch; + *this_rdp->nxttail[2] = list; + this_rdp->nxttail[2] = tail; + local_irq_restore(flags); + } } static void __rcu_offline_cpu(struct rcu_data *this_rdp, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - /* if the cpu going offline owns the grace period + unsigned long flags; + + /* + * if the cpu going offline owns the grace period * we can block indefinitely waiting for it, so flush * it here */ - spin_lock_bh(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags); if (rcp->cur != rcp->completed) cpu_quiet(rdp->cpu, rcp); - spin_unlock_bh(&rcp->lock); - rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); - rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); - rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); + rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1); + rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1); + spin_unlock(&rcp->lock); - local_irq_disable(); this_rdp->qlen += rdp->qlen; - local_irq_enable(); + local_irq_restore(flags); } static void rcu_offline_cpu(int cpu) @@ -420,38 +527,52 @@ static void rcu_offline_cpu(int cpu) static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { - *rdp->donetail = rdp->curlist; - rdp->donetail = rdp->curtail; - rdp->curlist = NULL; - rdp->curtail = &rdp->curlist; - } + unsigned long flags; + long completed_snap; - if (rdp->nxtlist && !rdp->curlist) { - local_irq_disable(); - rdp->curlist = rdp->nxtlist; - rdp->curtail = rdp->nxttail; - rdp->nxtlist = NULL; - rdp->nxttail = &rdp->nxtlist; - local_irq_enable(); + if (rdp->nxtlist) { + local_irq_save(flags); + completed_snap = ACCESS_ONCE(rcp->completed); /* - * start the next batch of callbacks + * move the other grace-period-completed entries to + * [rdp->nxtlist, *rdp->nxttail[0]) temporarily */ + if (!rcu_batch_before(completed_snap, rdp->batch)) + rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2]; + else if (!rcu_batch_before(completed_snap, rdp->batch - 1)) + rdp->nxttail[0] = rdp->nxttail[1]; - /* determine batch number */ - rdp->batch = rcp->cur + 1; - /* see the comment and corresponding wmb() in - * the rcu_start_batch() + /* + * the grace period for entries in + * [rdp->nxtlist, *rdp->nxttail[0]) has completed and + * move these entries to donelist */ - smp_rmb(); + if (rdp->nxttail[0] != &rdp->nxtlist) { + *rdp->donetail = rdp->nxtlist; + rdp->donetail = rdp->nxttail[0]; + rdp->nxtlist = *rdp->nxttail[0]; + *rdp->donetail = NULL; + + if (rdp->nxttail[1] == rdp->nxttail[0]) + rdp->nxttail[1] = &rdp->nxtlist; + if (rdp->nxttail[2] == rdp->nxttail[0]) + rdp->nxttail[2] = &rdp->nxtlist; + rdp->nxttail[0] = &rdp->nxtlist; + } + + local_irq_restore(flags); + + if (rcu_batch_after(rdp->batch, rcp->pending)) { + unsigned long flags2; - if (!rcp->next_pending) { /* and start it/schedule start if it's a new batch */ - spin_lock(&rcp->lock); - rcp->next_pending = 1; - rcu_start_batch(rcp); - spin_unlock(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags2); + if (rcu_batch_after(rdp->batch, rcp->pending)) { + rcp->pending = rdp->batch; + rcu_start_batch(rcp); + } + spin_unlock_irqrestore(&rcp->lock, flags2); } } @@ -462,21 +583,53 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, static void rcu_process_callbacks(struct softirq_action *unused) { + /* + * Memory references from any prior RCU read-side critical sections + * executed by the interrupted code must be see before any RCU + * grace-period manupulations below. + */ + + smp_mb(); /* See above block comment. */ + __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); + + /* + * Memory references from any later RCU read-side critical sections + * executed by the interrupted code must be see after any RCU + * grace-period manupulations above. + */ + + smp_mb(); /* See above block comment. */ } static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - /* This cpu has pending rcu entries and the grace period - * for them has completed. - */ - if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) - return 1; + /* Check for CPU stalls, if enabled. */ + check_cpu_stall(rcp); - /* This cpu has no pending entries, but there are new entries */ - if (!rdp->curlist && rdp->nxtlist) - return 1; + if (rdp->nxtlist) { + long completed_snap = ACCESS_ONCE(rcp->completed); + + /* + * This cpu has pending rcu entries and the grace period + * for them has completed. + */ + if (!rcu_batch_before(completed_snap, rdp->batch)) + return 1; + if (!rcu_batch_before(completed_snap, rdp->batch - 1) && + rdp->nxttail[0] != rdp->nxttail[1]) + return 1; + if (rdp->nxttail[0] != &rdp->nxtlist) + return 1; + + /* + * This cpu has pending rcu entries and the new batch + * for then hasn't been started nor scheduled start + */ + if (rcu_batch_after(rdp->batch, rcp->pending)) + return 1; + } /* This cpu has finished callbacks to invoke */ if (rdp->donelist) @@ -512,9 +665,15 @@ int rcu_needs_cpu(int cpu) struct rcu_data *rdp = &per_cpu(rcu_data, cpu); struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); - return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); + return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu); } +/* + * Top-level function driving RCU grace-period detection, normally + * invoked from the scheduler-clock interrupt. This function simply + * increments counters that are read only from softirq by this same + * CPU, so there are no memory barriers required. + */ void rcu_check_callbacks(int cpu, int user) { if (user || @@ -558,14 +717,17 @@ void rcu_check_callbacks(int cpu, int user) static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + unsigned long flags; + + spin_lock_irqsave(&rcp->lock, flags); memset(rdp, 0, sizeof(*rdp)); - rdp->curtail = &rdp->curlist; - rdp->nxttail = &rdp->nxtlist; + rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist; rdp->donetail = &rdp->donelist; rdp->quiescbatch = rcp->completed; rdp->qs_pending = 0; rdp->cpu = cpu; rdp->blimit = blimit; + spin_unlock_irqrestore(&rcp->lock, flags); } static void __cpuinit rcu_online_cpu(int cpu) @@ -610,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = { */ void __init __rcu_init(void) { +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)smp_processor_id()); /* Register notifier for non-boot CPUs */ diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 27827931ca0d..ca4bbbe04aa4 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -59,14 +59,6 @@ #include <linux/rcupreempt_trace.h> /* - * Macro that prevents the compiler from reordering accesses, but does - * absolutely -nothing- to prevent CPUs from reordering. This is used - * only to mediate communication between mainline code and hardware - * interrupt and NMI handlers. - */ -#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) - -/* * PREEMPT_RCU data structures. */ diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c index 5edf82c34bbc..35c2d3360ecf 100644 --- a/kernel/rcupreempt_trace.c +++ b/kernel/rcupreempt_trace.c @@ -308,11 +308,16 @@ out: static int __init rcupreempt_trace_init(void) { + int ret; + mutex_init(&rcupreempt_trace_mutex); rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL); if (!rcupreempt_trace_buf) return 1; - return rcupreempt_debugfs_init(); + ret = rcupreempt_debugfs_init(); + if (ret) + kfree(rcupreempt_trace_buf); + return ret; } static void __exit rcupreempt_trace_cleanup(void) diff --git a/kernel/sched.c b/kernel/sched.c index ad1962dc0aa2..6f230596bd0c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -204,11 +204,16 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; } +static inline int rt_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} + static void start_rt_bandwidth(struct rt_bandwidth *rt_b) { ktime_t now; - if (rt_b->rt_runtime == RUNTIME_INF) + if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) return; if (hrtimer_active(&rt_b->rt_period_timer)) @@ -298,9 +303,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; #endif /* CONFIG_RT_GROUP_SCHED */ -#else /* !CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_USER_SCHED */ /* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. @@ -604,9 +609,9 @@ struct rq { static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) +static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) { - rq->curr->sched_class->check_preempt_curr(rq, p); + rq->curr->sched_class->check_preempt_curr(rq, p, sync); } static inline int cpu_of(struct rq *rq) @@ -1102,7 +1107,7 @@ static void hrtick_start(struct rq *rq, u64 delay) hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); } -static void init_hrtick(void) +static inline void init_hrtick(void) { } #endif /* CONFIG_SMP */ @@ -1121,7 +1126,7 @@ static void init_rq_hrtick(struct rq *rq) rq->hrtick_timer.function = hrtick; rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; } -#else +#else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) { } @@ -1133,7 +1138,7 @@ static inline void init_rq_hrtick(struct rq *rq) static inline void init_hrtick(void) { } -#endif +#endif /* CONFIG_SCHED_HRTICK */ /* * resched_task - mark a task 'to be rescheduled now'. @@ -1380,38 +1385,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) update_load_sub(&rq->load, load); } -#ifdef CONFIG_SMP -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); -static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); - -static unsigned long cpu_avg_load_per_task(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - if (rq->nr_running) - rq->avg_load_per_task = rq->load.weight / rq->nr_running; - - return rq->avg_load_per_task; -} - -#ifdef CONFIG_FAIR_GROUP_SCHED - -typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); +#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) +typedef int (*tg_visitor)(struct task_group *, void *); /* * Iterate the full tree, calling @down when first entering a node and @up when * leaving it for the final time. */ -static void -walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd) +static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) { struct task_group *parent, *child; + int ret; rcu_read_lock(); parent = &root_task_group; down: - (*down)(parent, cpu, sd); + ret = (*down)(parent, data); + if (ret) + goto out_unlock; list_for_each_entry_rcu(child, &parent->children, siblings) { parent = child; goto down; @@ -1419,15 +1410,43 @@ down: up: continue; } - (*up)(parent, cpu, sd); + ret = (*up)(parent, data); + if (ret) + goto out_unlock; child = parent; parent = parent->parent; if (parent) goto up; +out_unlock: rcu_read_unlock(); + + return ret; } +static int tg_nop(struct task_group *tg, void *data) +{ + return 0; +} +#endif + +#ifdef CONFIG_SMP +static unsigned long source_load(int cpu, int type); +static unsigned long target_load(int cpu, int type); +static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); + +static unsigned long cpu_avg_load_per_task(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->nr_running) + rq->avg_load_per_task = rq->load.weight / rq->nr_running; + + return rq->avg_load_per_task; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED + static void __set_se_shares(struct sched_entity *se, unsigned long shares); /* @@ -1486,11 +1505,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu, * This needs to be done in a bottom-up fashion because the rq weight of a * parent group depends on the shares of its child groups. */ -static void -tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) +static int tg_shares_up(struct task_group *tg, void *data) { unsigned long rq_weight = 0; unsigned long shares = 0; + struct sched_domain *sd = data; int i; for_each_cpu_mask(i, sd->span) { @@ -1515,6 +1534,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) __update_group_shares_cpu(tg, i, shares, rq_weight); spin_unlock_irqrestore(&rq->lock, flags); } + + return 0; } /* @@ -1522,10 +1543,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) * This needs to be done in a top-down fashion because the load of a child * group is a fraction of its parents load. */ -static void -tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) +static int tg_load_down(struct task_group *tg, void *data) { unsigned long load; + long cpu = (long)data; if (!tg->parent) { load = cpu_rq(cpu)->load.weight; @@ -1536,11 +1557,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) } tg->cfs_rq[cpu]->h_load = load; -} -static void -tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) -{ + return 0; } static void update_shares(struct sched_domain *sd) @@ -1550,7 +1568,7 @@ static void update_shares(struct sched_domain *sd) if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { sd->last_update = now; - walk_tg_tree(tg_nop, tg_shares_up, 0, sd); + walk_tg_tree(tg_nop, tg_shares_up, sd); } } @@ -1561,9 +1579,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) spin_lock(&rq->lock); } -static void update_h_load(int cpu) +static void update_h_load(long cpu) { - walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); + walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } #else @@ -1921,11 +1939,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) running = task_running(rq, p); on_rq = p->se.on_rq; ncsw = 0; - if (!match_state || p->state == match_state) { - ncsw = p->nivcsw + p->nvcsw; - if (unlikely(!ncsw)) - ncsw = 1; - } + if (!match_state || p->state == match_state) + ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ task_rq_unlock(rq, &flags); /* @@ -2285,7 +2300,7 @@ out_running: trace_mark(kernel_sched_wakeup, "pid %d state %ld ## rq %p task %p rq->curr %p", p->pid, p->state, rq, p, rq->curr); - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, sync); p->state = TASK_RUNNING; #ifdef CONFIG_SMP @@ -2420,7 +2435,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) trace_mark(kernel_sched_wakeup_new, "pid %d state %ld ## rq %p task %p rq->curr %p", p->pid, p->state, rq, p, rq->curr); - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) p->sched_class->task_wake_up(rq, p); @@ -2880,7 +2895,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. */ - check_preempt_curr(this_rq, p); + check_preempt_curr(this_rq, p, 0); } /* @@ -4627,6 +4642,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ +/** + * complete: - signals a single thread waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up a single thread waiting on this completion. Threads will be + * awakened in the same order in which they were queued. + * + * See also complete_all(), wait_for_completion() and related routines. + */ void complete(struct completion *x) { unsigned long flags; @@ -4638,6 +4662,12 @@ void complete(struct completion *x) } EXPORT_SYMBOL(complete); +/** + * complete_all: - signals all threads waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up all threads waiting on this particular completion event. + */ void complete_all(struct completion *x) { unsigned long flags; @@ -4658,10 +4688,7 @@ do_wait_for_common(struct completion *x, long timeout, int state) wait.flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue_tail(&x->wait, &wait); do { - if ((state == TASK_INTERRUPTIBLE && - signal_pending(current)) || - (state == TASK_KILLABLE && - fatal_signal_pending(current))) { + if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; break; } @@ -4689,12 +4716,31 @@ wait_for_common(struct completion *x, long timeout, int state) return timeout; } +/** + * wait_for_completion: - waits for completion of a task + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. + * + * See also similar routines (i.e. wait_for_completion_timeout()) with timeout + * and interrupt capability. Also see complete(). + */ void __sched wait_for_completion(struct completion *x) { wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(wait_for_completion); +/** + * wait_for_completion_timeout: - waits for completion of a task (w/timeout) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. It is not + * interruptible. + */ unsigned long __sched wait_for_completion_timeout(struct completion *x, unsigned long timeout) { @@ -4702,6 +4748,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout) } EXPORT_SYMBOL(wait_for_completion_timeout); +/** + * wait_for_completion_interruptible: - waits for completion of a task (w/intr) + * @x: holds the state of this particular completion + * + * This waits for completion of a specific task to be signaled. It is + * interruptible. + */ int __sched wait_for_completion_interruptible(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); @@ -4711,6 +4764,14 @@ int __sched wait_for_completion_interruptible(struct completion *x) } EXPORT_SYMBOL(wait_for_completion_interruptible); +/** + * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. It is interruptible. The timeout is in jiffies. + */ unsigned long __sched wait_for_completion_interruptible_timeout(struct completion *x, unsigned long timeout) @@ -4719,6 +4780,13 @@ wait_for_completion_interruptible_timeout(struct completion *x, } EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); +/** + * wait_for_completion_killable: - waits for completion of a task (killable) + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It can be + * interrupted by a kill signal. + */ int __sched wait_for_completion_killable(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); @@ -5121,7 +5189,8 @@ recheck: * Do not allow realtime tasks into groups that have no runtime * assigned. */ - if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) + if (rt_bandwidth_enabled() && rt_policy(policy) && + task_group(p)->rt_bandwidth.rt_runtime == 0) return -EPERM; #endif @@ -5957,7 +6026,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) set_task_cpu(p, dest_cpu); if (on_rq) { activate_task(rq_dest, p, 0); - check_preempt_curr(rq_dest, p); + check_preempt_curr(rq_dest, p, 0); } done: ret = 1; @@ -6282,7 +6351,7 @@ set_table_entry(struct ctl_table *entry, static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(12); + struct ctl_table *table = sd_alloc_ctl_entry(13); if (table == NULL) return NULL; @@ -6310,7 +6379,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[10], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); - /* &table[11] is terminator */ + set_table_entry(&table[11], "name", sd->name, + CORENAME_MAX_SIZE, 0444, proc_dostring); + /* &table[12] is terminator */ return table; } @@ -7194,13 +7265,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) * Non-inlined to reduce accumulated stack pressure in build_sched_domains() */ +#ifdef CONFIG_SCHED_DEBUG +# define SD_INIT_NAME(sd, type) sd->name = #type +#else +# define SD_INIT_NAME(sd, type) do { } while (0) +#endif + #define SD_INIT(sd, type) sd_init_##type(sd) + #define SD_INIT_FUNC(type) \ static noinline void sd_init_##type(struct sched_domain *sd) \ { \ memset(sd, 0, sizeof(*sd)); \ *sd = SD_##type##_INIT; \ sd->level = SD_LV_##type; \ + SD_INIT_NAME(sd, type); \ } SD_INIT_FUNC(CPU) @@ -8242,20 +8321,25 @@ void __might_sleep(char *file, int line) #ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ - if ((in_atomic() || irqs_disabled()) && - system_state == SYSTEM_RUNNING && !oops_in_progress) { - if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) - return; - prev_jiffy = jiffies; - printk(KERN_ERR "BUG: sleeping function called from invalid" - " context at %s:%d\n", file, line); - printk("in_atomic():%d, irqs_disabled():%d\n", - in_atomic(), irqs_disabled()); - debug_show_held_locks(current); - if (irqs_disabled()) - print_irqtrace_events(current); - dump_stack(); - } + if ((!in_atomic() && !irqs_disabled()) || + system_state != SYSTEM_RUNNING || oops_in_progress) + return; + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + + printk(KERN_ERR + "BUG: sleeping function called from invalid context at %s:%d\n", + file, line); + printk(KERN_ERR + "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), + current->pid, current->comm); + + debug_show_held_locks(current); + if (irqs_disabled()) + print_irqtrace_events(current); + dump_stack(); #endif } EXPORT_SYMBOL(__might_sleep); @@ -8753,73 +8837,95 @@ static DEFINE_MUTEX(rt_constraints_mutex); static unsigned long to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) - return 1ULL << 16; + return 1ULL << 20; - return div64_u64(runtime << 16, period); + return div64_u64(runtime << 20, period); } -#ifdef CONFIG_CGROUP_SCHED -static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) +/* Must be called with tasklist_lock held */ +static inline int tg_has_rt_tasks(struct task_group *tg) { - struct task_group *tgi, *parent = tg->parent; - unsigned long total = 0; + struct task_struct *g, *p; - if (!parent) { - if (global_rt_period() < period) - return 0; + do_each_thread(g, p) { + if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) + return 1; + } while_each_thread(g, p); - return to_ratio(period, runtime) < - to_ratio(global_rt_period(), global_rt_runtime()); - } + return 0; +} - if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) - return 0; +struct rt_schedulable_data { + struct task_group *tg; + u64 rt_period; + u64 rt_runtime; +}; - rcu_read_lock(); - list_for_each_entry_rcu(tgi, &parent->children, siblings) { - if (tgi == tg) - continue; +static int tg_schedulable(struct task_group *tg, void *data) +{ + struct rt_schedulable_data *d = data; + struct task_group *child; + unsigned long total, sum = 0; + u64 period, runtime; - total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), - tgi->rt_bandwidth.rt_runtime); + period = ktime_to_ns(tg->rt_bandwidth.rt_period); + runtime = tg->rt_bandwidth.rt_runtime; + + if (tg == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; } - rcu_read_unlock(); - return total + to_ratio(period, runtime) <= - to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), - parent->rt_bandwidth.rt_runtime); -} -#elif defined CONFIG_USER_SCHED -static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) -{ - struct task_group *tgi; - unsigned long total = 0; - unsigned long global_ratio = - to_ratio(global_rt_period(), global_rt_runtime()); + /* + * Cannot have more runtime than the period. + */ + if (runtime > period && runtime != RUNTIME_INF) + return -EINVAL; - rcu_read_lock(); - list_for_each_entry_rcu(tgi, &task_groups, list) { - if (tgi == tg) - continue; + /* + * Ensure we don't starve existing RT tasks. + */ + if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) + return -EBUSY; + + total = to_ratio(period, runtime); + + /* + * Nobody can have more than the global setting allows. + */ + if (total > to_ratio(global_rt_period(), global_rt_runtime())) + return -EINVAL; + + /* + * The sum of our children's runtime should not exceed our own. + */ + list_for_each_entry_rcu(child, &tg->children, siblings) { + period = ktime_to_ns(child->rt_bandwidth.rt_period); + runtime = child->rt_bandwidth.rt_runtime; + + if (child == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; + } - total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), - tgi->rt_bandwidth.rt_runtime); + sum += to_ratio(period, runtime); } - rcu_read_unlock(); - return total + to_ratio(period, runtime) < global_ratio; + if (sum > total) + return -EINVAL; + + return 0; } -#endif -/* Must be called with tasklist_lock held */ -static inline int tg_has_rt_tasks(struct task_group *tg) +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { - struct task_struct *g, *p; - do_each_thread(g, p) { - if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) - return 1; - } while_each_thread(g, p); - return 0; + struct rt_schedulable_data data = { + .tg = tg, + .rt_period = period, + .rt_runtime = runtime, + }; + + return walk_tg_tree(tg_schedulable, tg_nop, &data); } static int tg_set_bandwidth(struct task_group *tg, @@ -8829,14 +8935,9 @@ static int tg_set_bandwidth(struct task_group *tg, mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); - if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { - err = -EBUSY; + err = __rt_schedulable(tg, rt_period, rt_runtime); + if (err) goto unlock; - } - if (!__rt_schedulable(tg, rt_period, rt_runtime)) { - err = -EINVAL; - goto unlock; - } spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); @@ -8905,19 +9006,25 @@ long sched_group_rt_period(struct task_group *tg) static int sched_rt_global_constraints(void) { - struct task_group *tg = &root_task_group; - u64 rt_runtime, rt_period; + u64 runtime, period; int ret = 0; if (sysctl_sched_rt_period <= 0) return -EINVAL; - rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); - rt_runtime = tg->rt_bandwidth.rt_runtime; + runtime = global_rt_runtime(); + period = global_rt_period(); + + /* + * Sanity check on the sysctl variables. + */ + if (runtime > period && runtime != RUNTIME_INF) + return -EINVAL; mutex_lock(&rt_constraints_mutex); - if (!__rt_schedulable(tg, rt_period, rt_runtime)) - ret = -EINVAL; + read_lock(&tasklist_lock); + ret = __rt_schedulable(NULL, 0, 0); + read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); return ret; @@ -8991,7 +9098,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) if (!cgrp->parent) { /* This is early initialization for the top cgroup */ - init_task_group.css.cgroup = cgrp; return &init_task_group.css; } @@ -9000,9 +9106,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); - /* Bind the cgroup to task_group object we just created */ - tg->css.cgroup = cgrp; - return &tg->css; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index fb8994c6d4bb..18fd17172eb6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -409,64 +409,6 @@ static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) } /* - * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in - * that it favours >=0 over <0. - * - * -20 | - * | - * 0 --------+------- - * .' - * 19 .' - * - */ -static unsigned long -calc_delta_asym(unsigned long delta, struct sched_entity *se) -{ - struct load_weight lw = { - .weight = NICE_0_LOAD, - .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT) - }; - - for_each_sched_entity(se) { - struct load_weight *se_lw = &se->load; - unsigned long rw = cfs_rq_of(se)->load.weight; - -#ifdef CONFIG_FAIR_SCHED_GROUP - struct cfs_rq *cfs_rq = se->my_q; - struct task_group *tg = NULL - - if (cfs_rq) - tg = cfs_rq->tg; - - if (tg && tg->shares < NICE_0_LOAD) { - /* - * scale shares to what it would have been had - * tg->weight been NICE_0_LOAD: - * - * weight = 1024 * shares / tg->weight - */ - lw.weight *= se->load.weight; - lw.weight /= tg->shares; - - lw.inv_weight = 0; - - se_lw = &lw; - rw += lw.weight - se->load.weight; - } else -#endif - - if (se->load.weight < NICE_0_LOAD) { - se_lw = &lw; - rw += NICE_0_LOAD - se->load.weight; - } - - delta = calc_delta_mine(delta, rw, se_lw); - } - - return delta; -} - -/* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. */ @@ -586,11 +528,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_add(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) inc_cpu_load(rq_of(cfs_rq), se->load.weight); - if (entity_is_task(se)) + if (entity_is_task(se)) { add_cfs_task_weight(cfs_rq, se->load.weight); + list_add(&se->group_node, &cfs_rq->tasks); + } cfs_rq->nr_running++; se->on_rq = 1; - list_add(&se->group_node, &cfs_rq->tasks); } static void @@ -599,11 +542,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) dec_cpu_load(rq_of(cfs_rq), se->load.weight); - if (entity_is_task(se)) + if (entity_is_task(se)) { add_cfs_task_weight(cfs_rq, -se->load.weight); + list_del_init(&se->group_node); + } cfs_rq->nr_running--; se->on_rq = 0; - list_del_init(&se->group_node); } static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -1085,7 +1029,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { struct sched_entity *se = tg->se[cpu]; - long more_w; if (!tg->parent) return wl; @@ -1097,18 +1040,17 @@ static long effective_load(struct task_group *tg, int cpu, if (!wl && sched_feat(ASYM_EFF_LOAD)) return wl; - /* - * Instead of using this increment, also add the difference - * between when the shares were last updated and now. - */ - more_w = se->my_q->load.weight - se->my_q->rq_weight; - wl += more_w; - wg += more_w; - for_each_sched_entity(se) { -#define D(n) (likely(n) ? (n) : 1) - long S, rw, s, a, b; + long more_w; + + /* + * Instead of using this increment, also add the difference + * between when the shares were last updated and now. + */ + more_w = se->my_q->load.weight - se->my_q->rq_weight; + wl += more_w; + wg += more_w; S = se->my_q->tg->shares; s = se->my_q->shares; @@ -1117,7 +1059,11 @@ static long effective_load(struct task_group *tg, int cpu, a = S*(rw + wl); b = S*rw + s*wg; - wl = s*(a-b)/D(b); + wl = s*(a-b); + + if (likely(b)) + wl /= b; + /* * Assume the group is already running and will * thus already be accounted for in the weight. @@ -1126,7 +1072,6 @@ static long effective_load(struct task_group *tg, int cpu, * alter the group weight. */ wg = 0; -#undef D } return wl; @@ -1143,7 +1088,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, #endif static int -wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, +wake_affine(struct sched_domain *this_sd, struct rq *this_rq, struct task_struct *p, int prev_cpu, int this_cpu, int sync, int idx, unsigned long load, unsigned long this_load, unsigned int imbalance) @@ -1158,6 +1103,11 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) return 0; + if (!sync && sched_feat(SYNC_WAKEUPS) && + curr->se.avg_overlap < sysctl_sched_migration_cost && + p->se.avg_overlap < sysctl_sched_migration_cost) + sync = 1; + /* * If sync wakeup then subtract the (maximum possible) * effect of the currently running task from the load @@ -1182,17 +1132,14 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, * a reasonable amount of time then attract this newly * woken task: */ - if (sync && balanced) { - if (curr->se.avg_overlap < sysctl_sched_migration_cost && - p->se.avg_overlap < sysctl_sched_migration_cost) - return 1; - } + if (sync && balanced) + return 1; schedstat_inc(p, se.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); - if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || - balanced) { + if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <= + tl_per_task)) { /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and @@ -1211,16 +1158,17 @@ static int select_task_rq_fair(struct task_struct *p, int sync) struct sched_domain *sd, *this_sd = NULL; int prev_cpu, this_cpu, new_cpu; unsigned long load, this_load; - struct rq *rq, *this_rq; + struct rq *this_rq; unsigned int imbalance; int idx; prev_cpu = task_cpu(p); - rq = task_rq(p); this_cpu = smp_processor_id(); this_rq = cpu_rq(this_cpu); new_cpu = prev_cpu; + if (prev_cpu == this_cpu) + goto out; /* * 'this_sd' is the first domain that both * this_cpu and prev_cpu are present in: @@ -1248,13 +1196,10 @@ static int select_task_rq_fair(struct task_struct *p, int sync) load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); - if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, + if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, load, this_load, imbalance)) return this_cpu; - if (prev_cpu == this_cpu) - goto out; - /* * Start passive balancing when half the imbalance_pct * limit is reached. @@ -1281,62 +1226,20 @@ static unsigned long wakeup_gran(struct sched_entity *se) * + nice tasks. */ if (sched_feat(ASYM_GRAN)) - gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); - else - gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se); + gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load); return gran; } /* - * Should 'se' preempt 'curr'. - * - * |s1 - * |s2 - * |s3 - * g - * |<--->|c - * - * w(c, s1) = -1 - * w(c, s2) = 0 - * w(c, s3) = 1 - * - */ -static int -wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) -{ - s64 gran, vdiff = curr->vruntime - se->vruntime; - - if (vdiff < 0) - return -1; - - gran = wakeup_gran(curr); - if (vdiff > gran) - return 1; - - return 0; -} - -/* return depth at which a sched entity is present in the hierarchy */ -static inline int depth_se(struct sched_entity *se) -{ - int depth = 0; - - for_each_sched_entity(se) - depth++; - - return depth; -} - -/* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) +static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) { struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se, *pse = &p->se; - int se_depth, pse_depth; + s64 delta_exec; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@ -1351,6 +1254,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) cfs_rq_of(pse)->next = pse; /* + * We can come here with TIF_NEED_RESCHED already set from new task + * wake up path. + */ + if (test_tsk_need_resched(curr)) + return; + + /* * Batch tasks do not preempt (their preemption is driven by * the tick): */ @@ -1360,33 +1270,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) if (!sched_feat(WAKEUP_PREEMPT)) return; - /* - * preemption test can be made between sibling entities who are in the - * same cfs_rq i.e who have a common parent. Walk up the hierarchy of - * both tasks until we find their ancestors who are siblings of common - * parent. - */ - - /* First walk up until both entities are at same depth */ - se_depth = depth_se(se); - pse_depth = depth_se(pse); - - while (se_depth > pse_depth) { - se_depth--; - se = parent_entity(se); - } - - while (pse_depth > se_depth) { - pse_depth--; - pse = parent_entity(pse); - } - - while (!is_same_group(se, pse)) { - se = parent_entity(se); - pse = parent_entity(pse); + if (sched_feat(WAKEUP_OVERLAP) && (sync || + (se->avg_overlap < sysctl_sched_migration_cost && + pse->avg_overlap < sysctl_sched_migration_cost))) { + resched_task(curr); + return; } - if (wakeup_preempt_entity(se, pse) == 1) + delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime; + if (delta_exec > wakeup_gran(pse)) resched_task(curr); } @@ -1445,19 +1337,9 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) if (next == &cfs_rq->tasks) return NULL; - /* Skip over entities that are not tasks */ - do { - se = list_entry(next, struct sched_entity, group_node); - next = next->next; - } while (next != &cfs_rq->tasks && !entity_is_task(se)); - - if (next == &cfs_rq->tasks) - return NULL; - - cfs_rq->balance_iterator = next; - - if (entity_is_task(se)) - p = task_of(se); + se = list_entry(next, struct sched_entity, group_node); + p = task_of(se); + cfs_rq->balance_iterator = next->next; return p; } @@ -1507,7 +1389,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, rcu_read_lock(); update_h_load(busiest_cpu); - list_for_each_entry(tg, &task_groups, list) { + list_for_each_entry_rcu(tg, &task_groups, list) { struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; unsigned long busiest_h_load = busiest_cfs_rq->h_load; unsigned long busiest_weight = busiest_cfs_rq->load.weight; @@ -1620,10 +1502,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * 'current' within the tree based on its new key value. */ swap(curr->vruntime, se->vruntime); + resched_task(rq->curr); } enqueue_task_fair(rq, p, 0); - resched_task(rq->curr); } /* @@ -1642,7 +1524,7 @@ static void prio_changed_fair(struct rq *rq, struct task_struct *p, if (p->prio > oldprio) resched_task(rq->curr); } else - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); } /* @@ -1659,7 +1541,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p, if (running) resched_task(rq->curr); else - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); } /* Account for a task changing its policy or group. diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 9353ca78154e..7c9e8f4a049f 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -11,3 +11,4 @@ SCHED_FEAT(ASYM_GRAN, 1) SCHED_FEAT(LB_BIAS, 1) SCHED_FEAT(LB_WAKEUP_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) +SCHED_FEAT(WAKEUP_OVERLAP, 0) diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 3a4f92dbbe66..dec4ccabe2f5 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync) /* * Idle tasks are unconditionally rescheduled: */ -static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p) +static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync) { resched_task(rq->idle); } @@ -76,7 +76,7 @@ static void switched_to_idle(struct rq *rq, struct task_struct *p, if (running) resched_task(rq->curr); else - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); } static void prio_changed_idle(struct rq *rq, struct task_struct *p, @@ -93,7 +93,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p, if (p->prio > oldprio) resched_task(rq->curr); } else - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); } /* diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 1113157b2058..cdf5740ab03e 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -102,12 +102,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { + struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; struct sched_rt_entity *rt_se = rt_rq->rt_se; - if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) { - struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; - - enqueue_rt_entity(rt_se); + if (rt_rq->rt_nr_running) { + if (rt_se && !on_rt_rq(rt_se)) + enqueue_rt_entity(rt_se); if (rt_rq->highest_prio < curr->prio) resched_task(curr); } @@ -231,6 +231,9 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_SMP +/* + * We ran out of runtime, see if we can borrow some from our neighbours. + */ static int do_balance_runtime(struct rt_rq *rt_rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); @@ -250,9 +253,18 @@ static int do_balance_runtime(struct rt_rq *rt_rq) continue; spin_lock(&iter->rt_runtime_lock); + /* + * Either all rqs have inf runtime and there's nothing to steal + * or __disable_runtime() below sets a specific rq to inf to + * indicate its been disabled and disalow stealing. + */ if (iter->rt_runtime == RUNTIME_INF) goto next; + /* + * From runqueues with spare time, take 1/n part of their + * spare time, but no more than our period. + */ diff = iter->rt_runtime - iter->rt_time; if (diff > 0) { diff = div_u64((u64)diff, weight); @@ -274,6 +286,9 @@ next: return more; } +/* + * Ensure this RQ takes back all the runtime it lend to its neighbours. + */ static void __disable_runtime(struct rq *rq) { struct root_domain *rd = rq->rd; @@ -289,17 +304,33 @@ static void __disable_runtime(struct rq *rq) spin_lock(&rt_b->rt_runtime_lock); spin_lock(&rt_rq->rt_runtime_lock); + /* + * Either we're all inf and nobody needs to borrow, or we're + * already disabled and thus have nothing to do, or we have + * exactly the right amount of runtime to take out. + */ if (rt_rq->rt_runtime == RUNTIME_INF || rt_rq->rt_runtime == rt_b->rt_runtime) goto balanced; spin_unlock(&rt_rq->rt_runtime_lock); + /* + * Calculate the difference between what we started out with + * and what we current have, that's the amount of runtime + * we lend and now have to reclaim. + */ want = rt_b->rt_runtime - rt_rq->rt_runtime; + /* + * Greedy reclaim, take back as much as we can. + */ for_each_cpu_mask(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); s64 diff; + /* + * Can't reclaim from ourselves or disabled runqueues. + */ if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) continue; @@ -319,8 +350,16 @@ static void __disable_runtime(struct rq *rq) } spin_lock(&rt_rq->rt_runtime_lock); + /* + * We cannot be left wanting - that would mean some runtime + * leaked out of the system. + */ BUG_ON(want); balanced: + /* + * Disable all the borrow logic by pretending we have inf + * runtime - in which case borrowing doesn't make sense. + */ rt_rq->rt_runtime = RUNTIME_INF; spin_unlock(&rt_rq->rt_runtime_lock); spin_unlock(&rt_b->rt_runtime_lock); @@ -343,6 +382,9 @@ static void __enable_runtime(struct rq *rq) if (unlikely(!scheduler_running)) return; + /* + * Reset each runqueue's bandwidth settings + */ for_each_leaf_rt_rq(rt_rq, rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); @@ -389,7 +431,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) int i, idle = 1; cpumask_t span; - if (rt_b->rt_runtime == RUNTIME_INF) + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return 1; span = sched_rt_period_mask(); @@ -487,6 +529,9 @@ static void update_curr_rt(struct rq *rq) curr->se.exec_start = rq->clock; cpuacct_charge(curr, delta_exec); + if (!rt_bandwidth_enabled()) + return; + for_each_sched_rt_entity(rt_se) { rt_rq = rt_rq_of_se(rt_se); @@ -784,7 +829,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) +static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) { if (p->prio < rq->curr->prio) { resched_task(rq->curr); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index cb02324bdb88..a4d219398167 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -20,6 +20,7 @@ #include <linux/profile.h> #include <linux/sched.h> #include <linux/tick.h> +#include <linux/module.h> #include <asm/irq_regs.h> @@ -190,9 +191,17 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - *last_update_time = ktime_to_us(ts->idle_lastupdate); + if (!tick_nohz_enabled) + return -1; + + if (ts->idle_active) + *last_update_time = ktime_to_us(ts->idle_lastupdate); + else + *last_update_time = ktime_to_us(ktime_get()); + return ktime_to_us(ts->idle_sleeptime); } +EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); /** * tick_nohz_stop_sched_tick - stop the idle tick from the idle task diff --git a/kernel/user.c b/kernel/user.c index 865ecf57a096..39d6159fae43 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -169,7 +169,7 @@ static ssize_t cpu_rt_runtime_show(struct kobject *kobj, { struct user_struct *up = container_of(kobj, struct user_struct, kobj); - return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg)); + return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg)); } static ssize_t cpu_rt_runtime_store(struct kobject *kobj, @@ -180,7 +180,7 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj, unsigned long rt_runtime; int rc; - sscanf(buf, "%lu", &rt_runtime); + sscanf(buf, "%ld", &rt_runtime); rc = sched_group_set_rt_runtime(up->tg, rt_runtime); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7d7a31d0ddeb..ce697e0b319e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -597,6 +597,19 @@ config RCU_TORTURE_TEST_RUNNABLE Say N here if you want the RCU torture tests to start only after being manually enabled via /proc. +config RCU_CPU_STALL_DETECTOR + bool "Check for stalled CPUs delaying RCU grace periods" + depends on CLASSIC_RCU + default n + help + This option causes RCU to printk information on which + CPUs are delaying the current grace period, but only when + the grace period extends for excessive time periods. + + Say Y if you want RCU to perform such checks. + + Say N if you are unsure. + config KPROBES_SANITY_TEST bool "Kprobes sanity tests" depends on DEBUG_KERNEL diff --git a/scripts/Makefile b/scripts/Makefile index 1c73c5aea66b..aafdf064feef 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -20,6 +20,7 @@ hostprogs-y += unifdef subdir-$(CONFIG_MODVERSIONS) += genksyms subdir-y += mod +subdir-$(CONFIG_SECURITY_SELINUX) += selinux # Let clean descend into subdirs -subdir- += basic kconfig package +subdir- += basic kconfig package selinux diff --git a/scripts/selinux/Makefile b/scripts/selinux/Makefile new file mode 100644 index 000000000000..ca4b1ec01822 --- /dev/null +++ b/scripts/selinux/Makefile @@ -0,0 +1,2 @@ +subdir-y := mdp +subdir- += mdp diff --git a/scripts/selinux/README b/scripts/selinux/README new file mode 100644 index 000000000000..a936315ba2c8 --- /dev/null +++ b/scripts/selinux/README @@ -0,0 +1,2 @@ +Please see Documentation/SELinux.txt for information on +installing a dummy SELinux policy. diff --git a/scripts/selinux/install_policy.sh b/scripts/selinux/install_policy.sh new file mode 100644 index 000000000000..7b9ccf61f8f9 --- /dev/null +++ b/scripts/selinux/install_policy.sh @@ -0,0 +1,69 @@ +#!/bin/sh +if [ `id -u` -ne 0 ]; then + echo "$0: must be root to install the selinux policy" + exit 1 +fi +SF=`which setfiles` +if [ $? -eq 1 ]; then + if [ -f /sbin/setfiles ]; then + SF="/usr/setfiles" + else + echo "no selinux tools installed: setfiles" + exit 1 + fi +fi + +cd mdp + +CP=`which checkpolicy` +VERS=`$CP -V | awk '{print $1}'` + +./mdp policy.conf file_contexts +$CP -o policy.$VERS policy.conf + +mkdir -p /etc/selinux/dummy/policy +mkdir -p /etc/selinux/dummy/contexts/files + +cp file_contexts /etc/selinux/dummy/contexts/files +cp dbus_contexts /etc/selinux/dummy/contexts +cp policy.$VERS /etc/selinux/dummy/policy +FC_FILE=/etc/selinux/dummy/contexts/files/file_contexts + +if [ ! -d /etc/selinux ]; then + mkdir -p /etc/selinux +fi +if [ ! -f /etc/selinux/config ]; then + cat > /etc/selinux/config << EOF +SELINUX=enforcing +SELINUXTYPE=dummy +EOF +else + TYPE=`cat /etc/selinux/config | grep "^SELINUXTYPE" | tail -1 | awk -F= '{ print $2 '}` + if [ "eq$TYPE" != "eqdummy" ]; then + selinuxenabled + if [ $? -eq 0 ]; then + echo "SELinux already enabled with a non-dummy policy." + echo "Exiting. Please install policy by hand if that" + echo "is what you REALLY want." + exit 1 + fi + mv /etc/selinux/config /etc/selinux/config.mdpbak + grep -v "^SELINUXTYPE" /etc/selinux/config.mdpbak >> /etc/selinux/config + echo "SELINUXTYPE=dummy" >> /etc/selinux/config + fi +fi + +cd /etc/selinux/dummy/contexts/files +$SF file_contexts / + +mounts=`cat /proc/$$/mounts | egrep "ext2|ext3|xfs|jfs|ext4|ext4dev|gfs2" | awk '{ print $2 '}` +$SF file_contexts $mounts + + +dodev=`cat /proc/$$/mounts | grep "/dev "` +if [ "eq$dodev" != "eq" ]; then + mount --move /dev /mnt + $SF file_contexts /dev + mount --move /mnt /dev +fi + diff --git a/scripts/selinux/mdp/.gitignore b/scripts/selinux/mdp/.gitignore new file mode 100644 index 000000000000..654546d8dffd --- /dev/null +++ b/scripts/selinux/mdp/.gitignore @@ -0,0 +1,2 @@ +# Generated file +mdp diff --git a/scripts/selinux/mdp/Makefile b/scripts/selinux/mdp/Makefile new file mode 100644 index 000000000000..eb365b333441 --- /dev/null +++ b/scripts/selinux/mdp/Makefile @@ -0,0 +1,5 @@ +hostprogs-y := mdp +HOST_EXTRACFLAGS += -Isecurity/selinux/include + +always := $(hostprogs-y) +clean-files := $(hostprogs-y) policy.* file_contexts diff --git a/scripts/selinux/mdp/dbus_contexts b/scripts/selinux/mdp/dbus_contexts new file mode 100644 index 000000000000..116e684f9fc1 --- /dev/null +++ b/scripts/selinux/mdp/dbus_contexts @@ -0,0 +1,6 @@ +<!DOCTYPE busconfig PUBLIC "-//freedesktop//DTD D-BUS Bus Configuration 1.0//EN" + "http://www.freedesktop.org/standards/dbus/1.0/busconfig.dtd"> +<busconfig> + <selinux> + </selinux> +</busconfig> diff --git a/scripts/selinux/mdp/mdp.c b/scripts/selinux/mdp/mdp.c new file mode 100644 index 000000000000..ca757d486187 --- /dev/null +++ b/scripts/selinux/mdp/mdp.c @@ -0,0 +1,242 @@ +/* + * + * mdp - make dummy policy + * + * When pointed at a kernel tree, builds a dummy policy for that kernel + * with exactly one type with full rights to itself. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2006 + * + * Authors: Serge E. Hallyn <serue@us.ibm.com> + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> + +#include "flask.h" + +void usage(char *name) +{ + printf("usage: %s [-m] policy_file context_file\n", name); + exit(1); +} + +void find_common_name(char *cname, char *dest, int len) +{ + char *start, *end; + + start = strchr(cname, '_')+1; + end = strchr(start, '_'); + if (!start || !end || start-cname > len || end-start > len) { + printf("Error with commons defines\n"); + exit(1); + } + strncpy(dest, start, end-start); + dest[end-start] = '\0'; +} + +#define S_(x) x, +static char *classlist[] = { +#include "class_to_string.h" + NULL +}; +#undef S_ + +#include "initial_sid_to_string.h" + +#define TB_(x) char *x[] = { +#define TE_(x) NULL }; +#define S_(x) x, +#include "common_perm_to_string.h" +#undef TB_ +#undef TE_ +#undef S_ + +struct common { + char *cname; + char **perms; +}; +struct common common[] = { +#define TB_(x) { #x, x }, +#define S_(x) +#define TE_(x) +#include "common_perm_to_string.h" +#undef TB_ +#undef TE_ +#undef S_ +}; + +#define S_(x, y, z) {x, #y}, +struct av_inherit { + int class; + char *common; +}; +struct av_inherit av_inherit[] = { +#include "av_inherit.h" +}; +#undef S_ + +#include "av_permissions.h" +#define S_(x, y, z) {x, y, z}, +struct av_perms { + int class; + int perm_i; + char *perm_s; +}; +struct av_perms av_perms[] = { +#include "av_perm_to_string.h" +}; +#undef S_ + +int main(int argc, char *argv[]) +{ + int i, j, mls = 0; + char **arg, *polout, *ctxout; + int classlist_len, initial_sid_to_string_len; + FILE *fout; + + if (argc < 3) + usage(argv[0]); + arg = argv+1; + if (argc==4 && strcmp(argv[1], "-m") == 0) { + mls = 1; + arg++; + } + polout = *arg++; + ctxout = *arg; + + fout = fopen(polout, "w"); + if (!fout) { + printf("Could not open %s for writing\n", polout); + usage(argv[0]); + } + + classlist_len = sizeof(classlist) / sizeof(char *); + /* print out the classes */ + for (i=1; i < classlist_len; i++) { + if(classlist[i]) + fprintf(fout, "class %s\n", classlist[i]); + else + fprintf(fout, "class user%d\n", i); + } + fprintf(fout, "\n"); + + initial_sid_to_string_len = sizeof(initial_sid_to_string) / sizeof (char *); + /* print out the sids */ + for (i=1; i < initial_sid_to_string_len; i++) + fprintf(fout, "sid %s\n", initial_sid_to_string[i]); + fprintf(fout, "\n"); + + /* print out the commons */ + for (i=0; i< sizeof(common)/sizeof(struct common); i++) { + char cname[101]; + find_common_name(common[i].cname, cname, 100); + cname[100] = '\0'; + fprintf(fout, "common %s\n{\n", cname); + for (j=0; common[i].perms[j]; j++) + fprintf(fout, "\t%s\n", common[i].perms[j]); + fprintf(fout, "}\n\n"); + } + fprintf(fout, "\n"); + + /* print out the class permissions */ + for (i=1; i < classlist_len; i++) { + if (classlist[i]) { + int firstperm = -1, numperms = 0; + + fprintf(fout, "class %s\n", classlist[i]); + /* does it inherit from a common? */ + for (j=0; j < sizeof(av_inherit)/sizeof(struct av_inherit); j++) + if (av_inherit[j].class == i) + fprintf(fout, "inherits %s\n", av_inherit[j].common); + + for (j=0; j < sizeof(av_perms)/sizeof(struct av_perms); j++) { + if (av_perms[j].class == i) { + if (firstperm == -1) + firstperm = j; + numperms++; + } + } + if (!numperms) { + fprintf(fout, "\n"); + continue; + } + + fprintf(fout, "{\n"); + /* print out the av_perms */ + for (j=0; j < numperms; j++) { + fprintf(fout, "\t%s\n", av_perms[firstperm+j].perm_s); + } + fprintf(fout, "}\n\n"); + } + } + fprintf(fout, "\n"); + + /* NOW PRINT OUT MLS STUFF */ + if (mls) { + printf("MLS not yet implemented\n"); + exit(1); + } + + /* types, roles, and allows */ + fprintf(fout, "type base_t;\n"); + fprintf(fout, "role base_r types { base_t };\n"); + for (i=1; i < classlist_len; i++) { + if (classlist[i]) + fprintf(fout, "allow base_t base_t:%s *;\n", classlist[i]); + else + fprintf(fout, "allow base_t base_t:user%d *;\n", i); + } + fprintf(fout, "user user_u roles { base_r };\n"); + fprintf(fout, "\n"); + + /* default sids */ + for (i=1; i < initial_sid_to_string_len; i++) + fprintf(fout, "sid %s user_u:base_r:base_t\n", initial_sid_to_string[i]); + fprintf(fout, "\n"); + + + fprintf(fout, "fs_use_xattr ext2 user_u:base_r:base_t;\n"); + fprintf(fout, "fs_use_xattr ext3 user_u:base_r:base_t;\n"); + fprintf(fout, "fs_use_xattr jfs user_u:base_r:base_t;\n"); + fprintf(fout, "fs_use_xattr xfs user_u:base_r:base_t;\n"); + fprintf(fout, "fs_use_xattr reiserfs user_u:base_r:base_t;\n"); + + fprintf(fout, "fs_use_task pipefs user_u:base_r:base_t;\n"); + fprintf(fout, "fs_use_task sockfs user_u:base_r:base_t;\n"); + + fprintf(fout, "fs_use_trans devpts user_u:base_r:base_t;\n"); + fprintf(fout, "fs_use_trans tmpfs user_u:base_r:base_t;\n"); + fprintf(fout, "fs_use_trans shm user_u:base_r:base_t;\n"); + + fprintf(fout, "genfscon proc / user_u:base_r:base_t\n"); + + fclose(fout); + + fout = fopen(ctxout, "w"); + if (!fout) { + printf("Wrote policy, but cannot open %s for writing\n", ctxout); + usage(argv[0]); + } + fprintf(fout, "/ user_u:base_r:base_t\n"); + fprintf(fout, "/.* user_u:base_r:base_t\n"); + fclose(fout); + + return 0; +} diff --git a/security/Kconfig b/security/Kconfig index 559293922a47..d9f47ce7e207 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -51,6 +51,14 @@ config SECURITY If you are unsure how to answer this question, answer N. +config SECURITYFS + bool "Enable the securityfs filesystem" + help + This will build the securityfs filesystem. It is currently used by + the TPM bios character driver. It is not used by SELinux or SMACK. + + If you are unsure how to answer this question, answer N. + config SECURITY_NETWORK bool "Socket and Networking Security Hooks" depends on SECURITY diff --git a/security/Makefile b/security/Makefile index f65426099aa6..c05c127fff9a 100644 --- a/security/Makefile +++ b/security/Makefile @@ -10,7 +10,8 @@ subdir-$(CONFIG_SECURITY_SMACK) += smack obj-y += commoncap.o # Object file lists -obj-$(CONFIG_SECURITY) += security.o capability.o inode.o +obj-$(CONFIG_SECURITY) += security.o capability.o +obj-$(CONFIG_SECURITYFS) += inode.o # Must precede capability.o in order to stack properly. obj-$(CONFIG_SECURITY_SELINUX) += selinux/built-in.o obj-$(CONFIG_SECURITY_SMACK) += smack/built-in.o diff --git a/security/commoncap.c b/security/commoncap.c index e4c4b3fc0c04..399bfdb9e2da 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -541,7 +541,7 @@ int cap_task_post_setuid (uid_t old_ruid, uid_t old_euid, uid_t old_suid, * yet with increased caps. * So we check for increased caps on the target process. */ -static inline int cap_safe_nice(struct task_struct *p) +static int cap_safe_nice(struct task_struct *p) { if (!cap_issubset(p->cap_permitted, current->cap_permitted) && !capable(CAP_SYS_NICE)) diff --git a/security/inode.c b/security/inode.c index acc6cf0d7900..ca4958ebad8d 100644 --- a/security/inode.c +++ b/security/inode.c @@ -190,7 +190,7 @@ static int create_by_name(const char *name, mode_t mode, * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a - * directory dentry if set. If this paramater is NULL, then the + * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the securityfs filesystem. * @data: a pointer to something that the caller will want to get to later * on. The inode.i_private pointer will point to this value on @@ -199,18 +199,18 @@ static int create_by_name(const char *name, mode_t mode, * this file. * * This is the basic "create a file" function for securityfs. It allows for a - * wide range of flexibility in createing a file, or a directory (if you + * wide range of flexibility in creating a file, or a directory (if you * want to create a directory, the securityfs_create_dir() function is - * recommended to be used instead.) + * recommended to be used instead). * - * This function will return a pointer to a dentry if it succeeds. This + * This function returns a pointer to a dentry if it succeeds. This * pointer must be passed to the securityfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, NULL will be returned. + * you are responsible here). If an error occurs, %NULL is returned. * - * If securityfs is not enabled in the kernel, the value -ENODEV will be + * If securityfs is not enabled in the kernel, the value %-ENODEV is * returned. It is not wise to check for this value, but rather, check for - * NULL or !NULL instead as to eliminate the need for #ifdef in the calling + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling * code. */ struct dentry *securityfs_create_file(const char *name, mode_t mode, @@ -252,19 +252,19 @@ EXPORT_SYMBOL_GPL(securityfs_create_file); * @name: a pointer to a string containing the name of the directory to * create. * @parent: a pointer to the parent dentry for this file. This should be a - * directory dentry if set. If this paramater is NULL, then the + * directory dentry if set. If this parameter is %NULL, then the * directory will be created in the root of the securityfs filesystem. * - * This function creates a directory in securityfs with the given name. + * This function creates a directory in securityfs with the given @name. * - * This function will return a pointer to a dentry if it succeeds. This + * This function returns a pointer to a dentry if it succeeds. This * pointer must be passed to the securityfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, NULL will be returned. + * you are responsible here). If an error occurs, %NULL will be returned. * - * If securityfs is not enabled in the kernel, the value -ENODEV will be + * If securityfs is not enabled in the kernel, the value %-ENODEV is * returned. It is not wise to check for this value, but rather, check for - * NULL or !NULL instead as to eliminate the need for #ifdef in the calling + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling * code. */ struct dentry *securityfs_create_dir(const char *name, struct dentry *parent) @@ -278,16 +278,15 @@ EXPORT_SYMBOL_GPL(securityfs_create_dir); /** * securityfs_remove - removes a file or directory from the securityfs filesystem * - * @dentry: a pointer to a the dentry of the file or directory to be - * removed. + * @dentry: a pointer to a the dentry of the file or directory to be removed. * * This function removes a file or directory in securityfs that was previously * created with a call to another securityfs function (like * securityfs_create_file() or variants thereof.) * * This function is required to be called in order for the file to be - * removed, no automatic cleanup of files will happen when a module is - * removed, you are responsible here. + * removed. No automatic cleanup of files will happen when a module is + * removed; you are responsible here. */ void securityfs_remove(struct dentry *dentry) { diff --git a/security/security.c b/security/security.c index 3a4b4f55b33f..255b08559b2b 100644 --- a/security/security.c +++ b/security/security.c @@ -82,8 +82,8 @@ __setup("security=", choose_lsm); * * Return true if: * -The passed LSM is the one chosen by user at boot time, - * -or user didsn't specify a specific LSM and we're the first to ask - * for registeration permissoin, + * -or user didn't specify a specific LSM and we're the first to ask + * for registration permission, * -or the passed LSM is currently loaded. * Otherwise, return false. */ @@ -101,13 +101,13 @@ int __init security_module_enable(struct security_operations *ops) * register_security - registers a security framework with the kernel * @ops: a pointer to the struct security_options that is to be registered * - * This function is to allow a security module to register itself with the + * This function allows a security module to register itself with the * kernel security subsystem. Some rudimentary checking is done on the @ops * value passed to this function. You'll need to check first if your LSM * is allowed to register its @ops by calling security_module_enable(@ops). * * If there is already a security module registered with the kernel, - * an error will be returned. Otherwise 0 is returned on success. + * an error will be returned. Otherwise %0 is returned on success. */ int register_security(struct security_operations *ops) { diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig index a436d1cfa88b..26301dd651d3 100644 --- a/security/selinux/Kconfig +++ b/security/selinux/Kconfig @@ -6,9 +6,6 @@ config SECURITY_SELINUX help This selects NSA Security-Enhanced Linux (SELinux). You will also need a policy configuration and a labeled filesystem. - You can obtain the policy compiler (checkpolicy), the utility for - labeling filesystems (setfiles), and an example policy configuration - from <http://www.nsa.gov/selinux/>. If you are unsure how to answer this question, answer N. config SECURITY_SELINUX_BOOTPARAM diff --git a/security/selinux/avc.c b/security/selinux/avc.c index 114b4b4c97b2..cb30c7e350b3 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -136,7 +136,7 @@ static inline int avc_hash(u32 ssid, u32 tsid, u16 tclass) * @tclass: target security class * @av: access vector */ -static void avc_dump_av(struct audit_buffer *ab, u16 tclass, u32 av) +void avc_dump_av(struct audit_buffer *ab, u16 tclass, u32 av) { const char **common_pts = NULL; u32 common_base = 0; diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 03fc6a81ae32..4a7374c12d9c 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -957,7 +957,8 @@ out_err: return rc; } -void selinux_write_opts(struct seq_file *m, struct security_mnt_opts *opts) +static void selinux_write_opts(struct seq_file *m, + struct security_mnt_opts *opts) { int i; char *prefix; @@ -1290,7 +1291,7 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent /* Default to the fs superblock SID. */ isec->sid = sbsec->sid; - if (sbsec->proc) { + if (sbsec->proc && !S_ISLNK(inode->i_mode)) { struct proc_inode *proci = PROC_I(inode); if (proci->pde) { isec->sclass = inode_mode_to_security_class(inode->i_mode); @@ -3548,38 +3549,44 @@ out: #endif /* IPV6 */ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad, - char **addrp, int src, u8 *proto) + char **_addrp, int src, u8 *proto) { - int ret = 0; + char *addrp; + int ret; switch (ad->u.net.family) { case PF_INET: ret = selinux_parse_skb_ipv4(skb, ad, proto); - if (ret || !addrp) - break; - *addrp = (char *)(src ? &ad->u.net.v4info.saddr : - &ad->u.net.v4info.daddr); - break; + if (ret) + goto parse_error; + addrp = (char *)(src ? &ad->u.net.v4info.saddr : + &ad->u.net.v4info.daddr); + goto okay; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) case PF_INET6: ret = selinux_parse_skb_ipv6(skb, ad, proto); - if (ret || !addrp) - break; - *addrp = (char *)(src ? &ad->u.net.v6info.saddr : - &ad->u.net.v6info.daddr); - break; + if (ret) + goto parse_error; + addrp = (char *)(src ? &ad->u.net.v6info.saddr : + &ad->u.net.v6info.daddr); + goto okay; #endif /* IPV6 */ default: - break; + addrp = NULL; + goto okay; } - if (unlikely(ret)) - printk(KERN_WARNING - "SELinux: failure in selinux_parse_skb()," - " unable to parse packet\n"); - +parse_error: + printk(KERN_WARNING + "SELinux: failure in selinux_parse_skb()," + " unable to parse packet\n"); return ret; + +okay: + if (_addrp) + *_addrp = addrp; + return 0; } /** @@ -5219,8 +5226,12 @@ static int selinux_setprocattr(struct task_struct *p, if (sid == 0) return -EINVAL; - - /* Only allow single threaded processes to change context */ + /* + * SELinux allows to change context in the following case only. + * - Single threaded processes. + * - Multi threaded processes intend to change its context into + * more restricted domain (defined by TYPEBOUNDS statement). + */ if (atomic_read(&p->mm->mm_users) != 1) { struct task_struct *g, *t; struct mm_struct *mm = p->mm; @@ -5228,11 +5239,16 @@ static int selinux_setprocattr(struct task_struct *p, do_each_thread(g, t) { if (t->mm == mm && t != p) { read_unlock(&tasklist_lock); - return -EPERM; + error = security_bounded_transition(tsec->sid, sid); + if (!error) + goto boundary_ok; + + return error; } } while_each_thread(g, t); read_unlock(&tasklist_lock); } +boundary_ok: /* Check permissions for the transition. */ error = avc_has_perm(tsec->sid, sid, SECCLASS_PROCESS, diff --git a/security/selinux/include/avc.h b/security/selinux/include/avc.h index 7b9769f5e775..d12ff1a9c0aa 100644 --- a/security/selinux/include/avc.h +++ b/security/selinux/include/avc.h @@ -12,6 +12,7 @@ #include <linux/kdev_t.h> #include <linux/spinlock.h> #include <linux/init.h> +#include <linux/audit.h> #include <linux/in6.h> #include <linux/path.h> #include <asm/system.h> @@ -126,6 +127,9 @@ int avc_add_callback(int (*callback)(u32 event, u32 ssid, u32 tsid, u32 events, u32 ssid, u32 tsid, u16 tclass, u32 perms); +/* Shows permission in human readable form */ +void avc_dump_av(struct audit_buffer *ab, u16 tclass, u32 av); + /* Exported to selinuxfs */ int avc_get_hash_stats(char *page); extern unsigned int avc_cache_threshold; diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index 7c543003d653..72447370bc95 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -27,13 +27,14 @@ #define POLICYDB_VERSION_RANGETRANS 21 #define POLICYDB_VERSION_POLCAP 22 #define POLICYDB_VERSION_PERMISSIVE 23 +#define POLICYDB_VERSION_BOUNDARY 24 /* Range of policy versions we understand*/ #define POLICYDB_VERSION_MIN POLICYDB_VERSION_BASE #ifdef CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX #define POLICYDB_VERSION_MAX CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE #else -#define POLICYDB_VERSION_MAX POLICYDB_VERSION_PERMISSIVE +#define POLICYDB_VERSION_MAX POLICYDB_VERSION_BOUNDARY #endif #define CONTEXT_MNT 0x01 @@ -62,6 +63,16 @@ enum { extern int selinux_policycap_netpeer; extern int selinux_policycap_openperm; +/* + * type_datum properties + * available at the kernel policy version >= POLICYDB_VERSION_BOUNDARY + */ +#define TYPEDATUM_PROPERTY_PRIMARY 0x0001 +#define TYPEDATUM_PROPERTY_ATTRIBUTE 0x0002 + +/* limitation of boundary depth */ +#define POLICYDB_BOUNDS_MAXDEPTH 4 + int security_load_policy(void *data, size_t len); int security_policycap_supported(unsigned int req_cap); @@ -117,6 +128,8 @@ int security_node_sid(u16 domain, void *addr, u32 addrlen, int security_validate_transition(u32 oldsid, u32 newsid, u32 tasksid, u16 tclass); +int security_bounded_transition(u32 oldsid, u32 newsid); + int security_sid_mls_copy(u32 sid, u32 mls_sid, u32 *new_sid); int security_net_peersid_resolve(u32 nlbl_sid, u32 nlbl_type, diff --git a/security/selinux/ss/avtab.c b/security/selinux/ss/avtab.c index a1be97f8beea..1215b8e47dba 100644 --- a/security/selinux/ss/avtab.c +++ b/security/selinux/ss/avtab.c @@ -98,7 +98,7 @@ struct avtab_node * avtab_insert_nonunique(struct avtab *h, struct avtab_key *key, struct avtab_datum *datum) { int hvalue; - struct avtab_node *prev, *cur, *newnode; + struct avtab_node *prev, *cur; u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD); if (!h || !h->htable) @@ -122,9 +122,7 @@ avtab_insert_nonunique(struct avtab *h, struct avtab_key *key, struct avtab_datu key->target_class < cur->key.target_class) break; } - newnode = avtab_insert_node(h, hvalue, prev, cur, key, datum); - - return newnode; + return avtab_insert_node(h, hvalue, prev, cur, key, datum); } struct avtab_datum *avtab_search(struct avtab *h, struct avtab_key *key) @@ -231,7 +229,7 @@ void avtab_destroy(struct avtab *h) for (i = 0; i < h->nslot; i++) { cur = h->htable[i]; - while (cur != NULL) { + while (cur) { temp = cur; cur = cur->next; kmem_cache_free(avtab_node_cachep, temp); diff --git a/security/selinux/ss/conditional.c b/security/selinux/ss/conditional.c index fb4efe4f4bc8..4a4e35cac22b 100644 --- a/security/selinux/ss/conditional.c +++ b/security/selinux/ss/conditional.c @@ -29,7 +29,7 @@ static int cond_evaluate_expr(struct policydb *p, struct cond_expr *expr) int s[COND_EXPR_MAXDEPTH]; int sp = -1; - for (cur = expr; cur != NULL; cur = cur->next) { + for (cur = expr; cur; cur = cur->next) { switch (cur->expr_type) { case COND_BOOL: if (sp == (COND_EXPR_MAXDEPTH - 1)) @@ -97,14 +97,14 @@ int evaluate_cond_node(struct policydb *p, struct cond_node *node) if (new_state == -1) printk(KERN_ERR "SELinux: expression result was undefined - disabling all rules.\n"); /* turn the rules on or off */ - for (cur = node->true_list; cur != NULL; cur = cur->next) { + for (cur = node->true_list; cur; cur = cur->next) { if (new_state <= 0) cur->node->key.specified &= ~AVTAB_ENABLED; else cur->node->key.specified |= AVTAB_ENABLED; } - for (cur = node->false_list; cur != NULL; cur = cur->next) { + for (cur = node->false_list; cur; cur = cur->next) { /* -1 or 1 */ if (new_state) cur->node->key.specified &= ~AVTAB_ENABLED; @@ -128,7 +128,7 @@ int cond_policydb_init(struct policydb *p) static void cond_av_list_destroy(struct cond_av_list *list) { struct cond_av_list *cur, *next; - for (cur = list; cur != NULL; cur = next) { + for (cur = list; cur; cur = next) { next = cur->next; /* the avtab_ptr_t node is destroy by the avtab */ kfree(cur); @@ -139,7 +139,7 @@ static void cond_node_destroy(struct cond_node *node) { struct cond_expr *cur_expr, *next_expr; - for (cur_expr = node->expr; cur_expr != NULL; cur_expr = next_expr) { + for (cur_expr = node->expr; cur_expr; cur_expr = next_expr) { next_expr = cur_expr->next; kfree(cur_expr); } @@ -155,7 +155,7 @@ static void cond_list_destroy(struct cond_node *list) if (list == NULL) return; - for (cur = list; cur != NULL; cur = next) { + for (cur = list; cur; cur = next) { next = cur->next; cond_node_destroy(cur); } @@ -239,7 +239,7 @@ int cond_read_bool(struct policydb *p, struct hashtab *h, void *fp) rc = next_entry(key, fp, len); if (rc < 0) goto err; - key[len] = 0; + key[len] = '\0'; if (hashtab_insert(h, key, booldatum)) goto err; @@ -291,7 +291,7 @@ static int cond_insertf(struct avtab *a, struct avtab_key *k, struct avtab_datum goto err; } found = 0; - for (cur = other; cur != NULL; cur = cur->next) { + for (cur = other; cur; cur = cur->next) { if (cur->node == node_ptr) { found = 1; break; @@ -485,7 +485,7 @@ void cond_compute_av(struct avtab *ctab, struct avtab_key *key, struct av_decisi if (!ctab || !key || !avd) return; - for (node = avtab_search_node(ctab, key); node != NULL; + for (node = avtab_search_node(ctab, key); node; node = avtab_search_node_next(node, key->specified)) { if ((u16)(AVTAB_ALLOWED|AVTAB_ENABLED) == (node->key.specified & (AVTAB_ALLOWED|AVTAB_ENABLED))) diff --git a/security/selinux/ss/conditional.h b/security/selinux/ss/conditional.h index 65b9f8366e9c..53ddb013ae57 100644 --- a/security/selinux/ss/conditional.h +++ b/security/selinux/ss/conditional.h @@ -28,7 +28,7 @@ struct cond_expr { #define COND_XOR 5 /* bool ^ bool */ #define COND_EQ 6 /* bool == bool */ #define COND_NEQ 7 /* bool != bool */ -#define COND_LAST 8 +#define COND_LAST COND_NEQ __u32 expr_type; __u32 bool; struct cond_expr *next; diff --git a/security/selinux/ss/ebitmap.c b/security/selinux/ss/ebitmap.c index ddc275490af8..68c7348d1acc 100644 --- a/security/selinux/ss/ebitmap.c +++ b/security/selinux/ss/ebitmap.c @@ -109,7 +109,7 @@ int ebitmap_netlbl_export(struct ebitmap *ebmap, *catmap = c_iter; c_iter->startbit = e_iter->startbit & ~(NETLBL_CATMAP_SIZE - 1); - while (e_iter != NULL) { + while (e_iter) { for (i = 0; i < EBITMAP_UNIT_NUMS; i++) { unsigned int delta, e_startbit, c_endbit; @@ -197,7 +197,7 @@ int ebitmap_netlbl_import(struct ebitmap *ebmap, } } c_iter = c_iter->next; - } while (c_iter != NULL); + } while (c_iter); if (e_iter != NULL) ebmap->highbit = e_iter->startbit + EBITMAP_SIZE; else diff --git a/security/selinux/ss/hashtab.c b/security/selinux/ss/hashtab.c index 2e7788e13213..933e735bb185 100644 --- a/security/selinux/ss/hashtab.c +++ b/security/selinux/ss/hashtab.c @@ -81,7 +81,7 @@ void *hashtab_search(struct hashtab *h, const void *key) hvalue = h->hash_value(h, key); cur = h->htable[hvalue]; - while (cur != NULL && h->keycmp(h, key, cur->key) > 0) + while (cur && h->keycmp(h, key, cur->key) > 0) cur = cur->next; if (cur == NULL || (h->keycmp(h, key, cur->key) != 0)) @@ -100,7 +100,7 @@ void hashtab_destroy(struct hashtab *h) for (i = 0; i < h->size; i++) { cur = h->htable[i]; - while (cur != NULL) { + while (cur) { temp = cur; cur = cur->next; kfree(temp); @@ -127,7 +127,7 @@ int hashtab_map(struct hashtab *h, for (i = 0; i < h->size; i++) { cur = h->htable[i]; - while (cur != NULL) { + while (cur) { ret = apply(cur->key, cur->datum, args); if (ret) return ret; diff --git a/security/selinux/ss/mls.c b/security/selinux/ss/mls.c index 77d745da48bb..b5407f16c2a4 100644 --- a/security/selinux/ss/mls.c +++ b/security/selinux/ss/mls.c @@ -283,8 +283,8 @@ int mls_context_to_sid(struct policydb *pol, p++; delim = *p; - if (delim != 0) - *p++ = 0; + if (delim != '\0') + *p++ = '\0'; for (l = 0; l < 2; l++) { levdatum = hashtab_search(pol->p_levels.table, scontextp); @@ -302,14 +302,14 @@ int mls_context_to_sid(struct policydb *pol, while (*p && *p != ',' && *p != '-') p++; delim = *p; - if (delim != 0) - *p++ = 0; + if (delim != '\0') + *p++ = '\0'; /* Separate into range if exists */ rngptr = strchr(scontextp, '.'); if (rngptr != NULL) { /* Remove '.' */ - *rngptr++ = 0; + *rngptr++ = '\0'; } catdatum = hashtab_search(pol->p_cats.table, @@ -357,8 +357,8 @@ int mls_context_to_sid(struct policydb *pol, p++; delim = *p; - if (delim != 0) - *p++ = 0; + if (delim != '\0') + *p++ = '\0'; } else break; } diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c index 2391761ae422..72e4a54973aa 100644 --- a/security/selinux/ss/policydb.c +++ b/security/selinux/ss/policydb.c @@ -30,6 +30,7 @@ #include <linux/slab.h> #include <linux/string.h> #include <linux/errno.h> +#include <linux/audit.h> #include "security.h" #include "policydb.h" @@ -116,7 +117,12 @@ static struct policydb_compat_info policydb_compat[] = { .version = POLICYDB_VERSION_PERMISSIVE, .sym_num = SYM_NUM, .ocon_num = OCON_NUM, - } + }, + { + .version = POLICYDB_VERSION_BOUNDARY, + .sym_num = SYM_NUM, + .ocon_num = OCON_NUM, + }, }; static struct policydb_compat_info *policydb_lookup_compat(int version) @@ -254,7 +260,9 @@ static int role_index(void *key, void *datum, void *datap) role = datum; p = datap; - if (!role->value || role->value > p->p_roles.nprim) + if (!role->value + || role->value > p->p_roles.nprim + || role->bounds > p->p_roles.nprim) return -EINVAL; p->p_role_val_to_name[role->value - 1] = key; p->role_val_to_struct[role->value - 1] = role; @@ -270,9 +278,12 @@ static int type_index(void *key, void *datum, void *datap) p = datap; if (typdatum->primary) { - if (!typdatum->value || typdatum->value > p->p_types.nprim) + if (!typdatum->value + || typdatum->value > p->p_types.nprim + || typdatum->bounds > p->p_types.nprim) return -EINVAL; p->p_type_val_to_name[typdatum->value - 1] = key; + p->type_val_to_struct[typdatum->value - 1] = typdatum; } return 0; @@ -285,7 +296,9 @@ static int user_index(void *key, void *datum, void *datap) usrdatum = datum; p = datap; - if (!usrdatum->value || usrdatum->value > p->p_users.nprim) + if (!usrdatum->value + || usrdatum->value > p->p_users.nprim + || usrdatum->bounds > p->p_users.nprim) return -EINVAL; p->p_user_val_to_name[usrdatum->value - 1] = key; p->user_val_to_struct[usrdatum->value - 1] = usrdatum; @@ -438,6 +451,14 @@ static int policydb_index_others(struct policydb *p) goto out; } + p->type_val_to_struct = + kmalloc(p->p_types.nprim * sizeof(*(p->type_val_to_struct)), + GFP_KERNEL); + if (!p->type_val_to_struct) { + rc = -ENOMEM; + goto out; + } + if (cond_init_bool_indexes(p)) { rc = -ENOMEM; goto out; @@ -625,6 +646,7 @@ void policydb_destroy(struct policydb *p) kfree(p->class_val_to_struct); kfree(p->role_val_to_struct); kfree(p->user_val_to_struct); + kfree(p->type_val_to_struct); avtab_destroy(&p->te_avtab); @@ -932,7 +954,7 @@ static int perm_read(struct policydb *p, struct hashtab *h, void *fp) rc = next_entry(key, fp, len); if (rc < 0) goto bad; - key[len] = 0; + key[len] = '\0'; rc = hashtab_insert(h, key, perdatum); if (rc) @@ -979,7 +1001,7 @@ static int common_read(struct policydb *p, struct hashtab *h, void *fp) rc = next_entry(key, fp, len); if (rc < 0) goto bad; - key[len] = 0; + key[len] = '\0'; for (i = 0; i < nel; i++) { rc = perm_read(p, comdatum->permissions.table, fp); @@ -1117,7 +1139,7 @@ static int class_read(struct policydb *p, struct hashtab *h, void *fp) rc = next_entry(key, fp, len); if (rc < 0) goto bad; - key[len] = 0; + key[len] = '\0'; if (len2) { cladatum->comkey = kmalloc(len2 + 1, GFP_KERNEL); @@ -1128,7 +1150,7 @@ static int class_read(struct policydb *p, struct hashtab *h, void *fp) rc = next_entry(cladatum->comkey, fp, len2); if (rc < 0) goto bad; - cladatum->comkey[len2] = 0; + cladatum->comkey[len2] = '\0'; cladatum->comdatum = hashtab_search(p->p_commons.table, cladatum->comkey); @@ -1176,8 +1198,8 @@ static int role_read(struct policydb *p, struct hashtab *h, void *fp) { char *key = NULL; struct role_datum *role; - int rc; - __le32 buf[2]; + int rc, to_read = 2; + __le32 buf[3]; u32 len; role = kzalloc(sizeof(*role), GFP_KERNEL); @@ -1186,12 +1208,17 @@ static int role_read(struct policydb *p, struct hashtab *h, void *fp) goto out; } - rc = next_entry(buf, fp, sizeof buf); + if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) + to_read = 3; + + rc = next_entry(buf, fp, sizeof(buf[0]) * to_read); if (rc < 0) goto bad; len = le32_to_cpu(buf[0]); role->value = le32_to_cpu(buf[1]); + if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) + role->bounds = le32_to_cpu(buf[2]); key = kmalloc(len + 1, GFP_KERNEL); if (!key) { @@ -1201,7 +1228,7 @@ static int role_read(struct policydb *p, struct hashtab *h, void *fp) rc = next_entry(key, fp, len); if (rc < 0) goto bad; - key[len] = 0; + key[len] = '\0'; rc = ebitmap_read(&role->dominates, fp); if (rc) @@ -1236,8 +1263,8 @@ static int type_read(struct policydb *p, struct hashtab *h, void *fp) { char *key = NULL; struct type_datum *typdatum; - int rc; - __le32 buf[3]; + int rc, to_read = 3; + __le32 buf[4]; u32 len; typdatum = kzalloc(sizeof(*typdatum), GFP_KERNEL); @@ -1246,13 +1273,27 @@ static int type_read(struct policydb *p, struct hashtab *h, void *fp) return rc; } - rc = next_entry(buf, fp, sizeof buf); + if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) + to_read = 4; + + rc = next_entry(buf, fp, sizeof(buf[0]) * to_read); if (rc < 0) goto bad; len = le32_to_cpu(buf[0]); typdatum->value = le32_to_cpu(buf[1]); - typdatum->primary = le32_to_cpu(buf[2]); + if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) { + u32 prop = le32_to_cpu(buf[2]); + + if (prop & TYPEDATUM_PROPERTY_PRIMARY) + typdatum->primary = 1; + if (prop & TYPEDATUM_PROPERTY_ATTRIBUTE) + typdatum->attribute = 1; + + typdatum->bounds = le32_to_cpu(buf[3]); + } else { + typdatum->primary = le32_to_cpu(buf[2]); + } key = kmalloc(len + 1, GFP_KERNEL); if (!key) { @@ -1262,7 +1303,7 @@ static int type_read(struct policydb *p, struct hashtab *h, void *fp) rc = next_entry(key, fp, len); if (rc < 0) goto bad; - key[len] = 0; + key[len] = '\0'; rc = hashtab_insert(h, key, typdatum); if (rc) @@ -1309,8 +1350,8 @@ static int user_read(struct policydb *p, struct hashtab *h, void *fp) { char *key = NULL; struct user_datum *usrdatum; - int rc; - __le32 buf[2]; + int rc, to_read = 2; + __le32 buf[3]; u32 len; usrdatum = kzalloc(sizeof(*usrdatum), GFP_KERNEL); @@ -1319,12 +1360,17 @@ static int user_read(struct policydb *p, struct hashtab *h, void *fp) goto out; } - rc = next_entry(buf, fp, sizeof buf); + if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) + to_read = 3; + + rc = next_entry(buf, fp, sizeof(buf[0]) * to_read); if (rc < 0) goto bad; len = le32_to_cpu(buf[0]); usrdatum->value = le32_to_cpu(buf[1]); + if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) + usrdatum->bounds = le32_to_cpu(buf[2]); key = kmalloc(len + 1, GFP_KERNEL); if (!key) { @@ -1334,7 +1380,7 @@ static int user_read(struct policydb *p, struct hashtab *h, void *fp) rc = next_entry(key, fp, len); if (rc < 0) goto bad; - key[len] = 0; + key[len] = '\0'; rc = ebitmap_read(&usrdatum->roles, fp); if (rc) @@ -1388,7 +1434,7 @@ static int sens_read(struct policydb *p, struct hashtab *h, void *fp) rc = next_entry(key, fp, len); if (rc < 0) goto bad; - key[len] = 0; + key[len] = '\0'; levdatum->level = kmalloc(sizeof(struct mls_level), GFP_ATOMIC); if (!levdatum->level) { @@ -1440,7 +1486,7 @@ static int cat_read(struct policydb *p, struct hashtab *h, void *fp) rc = next_entry(key, fp, len); if (rc < 0) goto bad; - key[len] = 0; + key[len] = '\0'; rc = hashtab_insert(h, key, catdatum); if (rc) @@ -1465,6 +1511,133 @@ static int (*read_f[SYM_NUM]) (struct policydb *p, struct hashtab *h, void *fp) cat_read, }; +static int user_bounds_sanity_check(void *key, void *datum, void *datap) +{ + struct user_datum *upper, *user; + struct policydb *p = datap; + int depth = 0; + + upper = user = datum; + while (upper->bounds) { + struct ebitmap_node *node; + unsigned long bit; + + if (++depth == POLICYDB_BOUNDS_MAXDEPTH) { + printk(KERN_ERR "SELinux: user %s: " + "too deep or looped boundary", + (char *) key); + return -EINVAL; + } + + upper = p->user_val_to_struct[upper->bounds - 1]; + ebitmap_for_each_positive_bit(&user->roles, node, bit) { + if (ebitmap_get_bit(&upper->roles, bit)) + continue; + + printk(KERN_ERR + "SELinux: boundary violated policy: " + "user=%s role=%s bounds=%s\n", + p->p_user_val_to_name[user->value - 1], + p->p_role_val_to_name[bit], + p->p_user_val_to_name[upper->value - 1]); + + return -EINVAL; + } + } + + return 0; +} + +static int role_bounds_sanity_check(void *key, void *datum, void *datap) +{ + struct role_datum *upper, *role; + struct policydb *p = datap; + int depth = 0; + + upper = role = datum; + while (upper->bounds) { + struct ebitmap_node *node; + unsigned long bit; + + if (++depth == POLICYDB_BOUNDS_MAXDEPTH) { + printk(KERN_ERR "SELinux: role %s: " + "too deep or looped bounds\n", + (char *) key); + return -EINVAL; + } + + upper = p->role_val_to_struct[upper->bounds - 1]; + ebitmap_for_each_positive_bit(&role->types, node, bit) { + if (ebitmap_get_bit(&upper->types, bit)) + continue; + + printk(KERN_ERR + "SELinux: boundary violated policy: " + "role=%s type=%s bounds=%s\n", + p->p_role_val_to_name[role->value - 1], + p->p_type_val_to_name[bit], + p->p_role_val_to_name[upper->value - 1]); + + return -EINVAL; + } + } + + return 0; +} + +static int type_bounds_sanity_check(void *key, void *datum, void *datap) +{ + struct type_datum *upper, *type; + struct policydb *p = datap; + int depth = 0; + + upper = type = datum; + while (upper->bounds) { + if (++depth == POLICYDB_BOUNDS_MAXDEPTH) { + printk(KERN_ERR "SELinux: type %s: " + "too deep or looped boundary\n", + (char *) key); + return -EINVAL; + } + + upper = p->type_val_to_struct[upper->bounds - 1]; + if (upper->attribute) { + printk(KERN_ERR "SELinux: type %s: " + "bounded by attribute %s", + (char *) key, + p->p_type_val_to_name[upper->value - 1]); + return -EINVAL; + } + } + + return 0; +} + +static int policydb_bounds_sanity_check(struct policydb *p) +{ + int rc; + + if (p->policyvers < POLICYDB_VERSION_BOUNDARY) + return 0; + + rc = hashtab_map(p->p_users.table, + user_bounds_sanity_check, p); + if (rc) + return rc; + + rc = hashtab_map(p->p_roles.table, + role_bounds_sanity_check, p); + if (rc) + return rc; + + rc = hashtab_map(p->p_types.table, + type_bounds_sanity_check, p); + if (rc) + return rc; + + return 0; +} + extern int ss_initialized; /* @@ -1523,7 +1696,7 @@ int policydb_read(struct policydb *p, void *fp) kfree(policydb_str); goto bad; } - policydb_str[len] = 0; + policydb_str[len] = '\0'; if (strcmp(policydb_str, POLICYDB_STRING)) { printk(KERN_ERR "SELinux: policydb string %s does not match " "my string %s\n", policydb_str, POLICYDB_STRING); @@ -1961,6 +2134,10 @@ int policydb_read(struct policydb *p, void *fp) goto bad; } + rc = policydb_bounds_sanity_check(p); + if (rc) + goto bad; + rc = 0; out: return rc; diff --git a/security/selinux/ss/policydb.h b/security/selinux/ss/policydb.h index 4253370fda6a..55152d498b53 100644 --- a/security/selinux/ss/policydb.h +++ b/security/selinux/ss/policydb.h @@ -61,6 +61,7 @@ struct class_datum { /* Role attributes */ struct role_datum { u32 value; /* internal role value */ + u32 bounds; /* boundary of role */ struct ebitmap dominates; /* set of roles dominated by this role */ struct ebitmap types; /* set of authorized types for role */ }; @@ -81,12 +82,15 @@ struct role_allow { /* Type attributes */ struct type_datum { u32 value; /* internal type value */ + u32 bounds; /* boundary of type */ unsigned char primary; /* primary name? */ + unsigned char attribute;/* attribute ?*/ }; /* User attributes */ struct user_datum { u32 value; /* internal user value */ + u32 bounds; /* bounds of user */ struct ebitmap roles; /* set of authorized roles for user */ struct mls_range range; /* MLS range (min - max) for user */ struct mls_level dfltlevel; /* default login MLS level for user */ @@ -209,6 +213,7 @@ struct policydb { struct class_datum **class_val_to_struct; struct role_datum **role_val_to_struct; struct user_datum **user_val_to_struct; + struct type_datum **type_val_to_struct; /* type enforcement access vectors and transitions */ struct avtab te_avtab; diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 8551952ef329..ab0cc0c7b944 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -88,6 +88,11 @@ static u32 latest_granting; static int context_struct_to_string(struct context *context, char **scontext, u32 *scontext_len); +static int context_struct_compute_av(struct context *scontext, + struct context *tcontext, + u16 tclass, + u32 requested, + struct av_decision *avd); /* * Return the boolean value of a constraint expression * when it is applied to the specified source and target @@ -274,6 +279,100 @@ mls_ops: } /* + * security_boundary_permission - drops violated permissions + * on boundary constraint. + */ +static void type_attribute_bounds_av(struct context *scontext, + struct context *tcontext, + u16 tclass, + u32 requested, + struct av_decision *avd) +{ + struct context lo_scontext; + struct context lo_tcontext; + struct av_decision lo_avd; + struct type_datum *source + = policydb.type_val_to_struct[scontext->type - 1]; + struct type_datum *target + = policydb.type_val_to_struct[tcontext->type - 1]; + u32 masked = 0; + + if (source->bounds) { + memset(&lo_avd, 0, sizeof(lo_avd)); + + memcpy(&lo_scontext, scontext, sizeof(lo_scontext)); + lo_scontext.type = source->bounds; + + context_struct_compute_av(&lo_scontext, + tcontext, + tclass, + requested, + &lo_avd); + if ((lo_avd.allowed & avd->allowed) == avd->allowed) + return; /* no masked permission */ + masked = ~lo_avd.allowed & avd->allowed; + } + + if (target->bounds) { + memset(&lo_avd, 0, sizeof(lo_avd)); + + memcpy(&lo_tcontext, tcontext, sizeof(lo_tcontext)); + lo_tcontext.type = target->bounds; + + context_struct_compute_av(scontext, + &lo_tcontext, + tclass, + requested, + &lo_avd); + if ((lo_avd.allowed & avd->allowed) == avd->allowed) + return; /* no masked permission */ + masked = ~lo_avd.allowed & avd->allowed; + } + + if (source->bounds && target->bounds) { + memset(&lo_avd, 0, sizeof(lo_avd)); + /* + * lo_scontext and lo_tcontext are already + * set up. + */ + + context_struct_compute_av(&lo_scontext, + &lo_tcontext, + tclass, + requested, + &lo_avd); + if ((lo_avd.allowed & avd->allowed) == avd->allowed) + return; /* no masked permission */ + masked = ~lo_avd.allowed & avd->allowed; + } + + if (masked) { + struct audit_buffer *ab; + char *stype_name + = policydb.p_type_val_to_name[source->value - 1]; + char *ttype_name + = policydb.p_type_val_to_name[target->value - 1]; + char *tclass_name + = policydb.p_class_val_to_name[tclass - 1]; + + /* mask violated permissions */ + avd->allowed &= ~masked; + + /* notice to userspace via audit message */ + ab = audit_log_start(current->audit_context, + GFP_ATOMIC, AUDIT_SELINUX_ERR); + if (!ab) + return; + + audit_log_format(ab, "av boundary violation: " + "source=%s target=%s tclass=%s", + stype_name, ttype_name, tclass_name); + avc_dump_av(ab, tclass, masked); + audit_log_end(ab); + } +} + +/* * Compute access vectors based on a context structure pair for * the permissions in a particular class. */ @@ -356,7 +455,7 @@ static int context_struct_compute_av(struct context *scontext, avkey.source_type = i + 1; avkey.target_type = j + 1; for (node = avtab_search_node(&policydb.te_avtab, &avkey); - node != NULL; + node; node = avtab_search_node_next(node, avkey.specified)) { if (node->key.specified == AVTAB_ALLOWED) avd->allowed |= node->datum.data; @@ -404,6 +503,14 @@ static int context_struct_compute_av(struct context *scontext, PROCESS__DYNTRANSITION); } + /* + * If the given source and target types have boundary + * constraint, lazy checks have to mask any violated + * permission and notice it to userspace via audit. + */ + type_attribute_bounds_av(scontext, tcontext, + tclass, requested, avd); + return 0; inval_class: @@ -549,6 +656,69 @@ out: return rc; } +/* + * security_bounded_transition - check whether the given + * transition is directed to bounded, or not. + * It returns 0, if @newsid is bounded by @oldsid. + * Otherwise, it returns error code. + * + * @oldsid : current security identifier + * @newsid : destinated security identifier + */ +int security_bounded_transition(u32 old_sid, u32 new_sid) +{ + struct context *old_context, *new_context; + struct type_datum *type; + int index; + int rc = -EINVAL; + + read_lock(&policy_rwlock); + + old_context = sidtab_search(&sidtab, old_sid); + if (!old_context) { + printk(KERN_ERR "SELinux: %s: unrecognized SID %u\n", + __func__, old_sid); + goto out; + } + + new_context = sidtab_search(&sidtab, new_sid); + if (!new_context) { + printk(KERN_ERR "SELinux: %s: unrecognized SID %u\n", + __func__, new_sid); + goto out; + } + + /* type/domain unchaned */ + if (old_context->type == new_context->type) { + rc = 0; + goto out; + } + + index = new_context->type; + while (true) { + type = policydb.type_val_to_struct[index - 1]; + BUG_ON(!type); + + /* not bounded anymore */ + if (!type->bounds) { + rc = -EPERM; + break; + } + + /* @newsid is bounded by @oldsid */ + if (type->bounds == old_context->type) { + rc = 0; + break; + } + index = type->bounds; + } +out: + read_unlock(&policy_rwlock); + + return rc; +} + + /** * security_compute_av - Compute access vector decisions. * @ssid: source security identifier @@ -794,7 +964,7 @@ static int string_to_context_struct(struct policydb *pol, *p++ = 0; typdatum = hashtab_search(pol->p_types.table, scontextp); - if (!typdatum) + if (!typdatum || typdatum->attribute) goto out; ctx->type = typdatum->value; @@ -1037,7 +1207,7 @@ static int security_compute_sid(u32 ssid, /* If no permanent rule, also check for enabled conditional rules */ if (!avdatum) { node = avtab_search_node(&policydb.te_cond_avtab, &avkey); - for (; node != NULL; node = avtab_search_node_next(node, specified)) { + for (; node; node = avtab_search_node_next(node, specified)) { if (node->key.specified & AVTAB_ENABLED) { avdatum = &node->datum; break; @@ -2050,7 +2220,7 @@ int security_set_bools(int len, int *values) policydb.bool_val_to_struct[i]->state = 0; } - for (cur = policydb.cond_list; cur != NULL; cur = cur->next) { + for (cur = policydb.cond_list; cur; cur = cur->next) { rc = evaluate_cond_node(&policydb, cur); if (rc) goto out; @@ -2102,7 +2272,7 @@ static int security_preserve_bools(struct policydb *p) if (booldatum) booldatum->state = bvalues[i]; } - for (cur = p->cond_list; cur != NULL; cur = cur->next) { + for (cur = p->cond_list; cur; cur = cur->next) { rc = evaluate_cond_node(p, cur); if (rc) goto out; diff --git a/security/selinux/ss/sidtab.c b/security/selinux/ss/sidtab.c index a81ded104129..e817989764cd 100644 --- a/security/selinux/ss/sidtab.c +++ b/security/selinux/ss/sidtab.c @@ -43,7 +43,7 @@ int sidtab_insert(struct sidtab *s, u32 sid, struct context *context) hvalue = SIDTAB_HASH(sid); prev = NULL; cur = s->htable[hvalue]; - while (cur != NULL && sid > cur->sid) { + while (cur && sid > cur->sid) { prev = cur; cur = cur->next; } @@ -92,7 +92,7 @@ static struct context *sidtab_search_core(struct sidtab *s, u32 sid, int force) hvalue = SIDTAB_HASH(sid); cur = s->htable[hvalue]; - while (cur != NULL && sid > cur->sid) + while (cur && sid > cur->sid) cur = cur->next; if (force && cur && sid == cur->sid && cur->context.len) @@ -103,7 +103,7 @@ static struct context *sidtab_search_core(struct sidtab *s, u32 sid, int force) sid = SECINITSID_UNLABELED; hvalue = SIDTAB_HASH(sid); cur = s->htable[hvalue]; - while (cur != NULL && sid > cur->sid) + while (cur && sid > cur->sid) cur = cur->next; if (!cur || sid != cur->sid) return NULL; @@ -136,7 +136,7 @@ int sidtab_map(struct sidtab *s, for (i = 0; i < SIDTAB_SIZE; i++) { cur = s->htable[i]; - while (cur != NULL) { + while (cur) { rc = apply(cur->sid, &cur->context, args); if (rc) goto out; @@ -155,7 +155,7 @@ static inline u32 sidtab_search_context(struct sidtab *s, for (i = 0; i < SIDTAB_SIZE; i++) { cur = s->htable[i]; - while (cur != NULL) { + while (cur) { if (context_cmp(&cur->context, context)) return cur->sid; cur = cur->next; @@ -242,7 +242,7 @@ void sidtab_destroy(struct sidtab *s) for (i = 0; i < SIDTAB_SIZE; i++) { cur = s->htable[i]; - while (cur != NULL) { + while (cur) { temp = cur; cur = cur->next; context_destroy(&temp->context); diff --git a/security/smack/smack.h b/security/smack/smack.h index 4a4477f5afdc..31dce559595a 100644 --- a/security/smack/smack.h +++ b/security/smack/smack.h @@ -178,6 +178,7 @@ u32 smack_to_secid(const char *); extern int smack_cipso_direct; extern int smack_net_nltype; extern char *smack_net_ambient; +extern char *smack_onlycap; extern struct smack_known *smack_known; extern struct smack_known smack_known_floor; diff --git a/security/smack/smack_access.c b/security/smack/smack_access.c index f6b5f6eed6dd..79ff21ed4c3b 100644 --- a/security/smack/smack_access.c +++ b/security/smack/smack_access.c @@ -157,7 +157,7 @@ int smk_access(char *subject_label, char *object_label, int request) * * This function checks the current subject label/object label pair * in the access rule list and returns 0 if the access is permitted, - * non zero otherwise. It allows that current my have the capability + * non zero otherwise. It allows that current may have the capability * to override the rules. */ int smk_curacc(char *obj_label, u32 mode) @@ -168,6 +168,14 @@ int smk_curacc(char *obj_label, u32 mode) if (rc == 0) return 0; + /* + * Return if a specific label has been designated as the + * only one that gets privilege and current does not + * have that label. + */ + if (smack_onlycap != NULL && smack_onlycap != current->security) + return rc; + if (capable(CAP_MAC_OVERRIDE)) return 0; diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index 271a835fbbe3..e7c642458ec9 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -39,6 +39,7 @@ enum smk_inos { SMK_DIRECT = 6, /* CIPSO level indicating direct label */ SMK_AMBIENT = 7, /* internet ambient label */ SMK_NLTYPE = 8, /* label scheme to use by default */ + SMK_ONLYCAP = 9, /* the only "capable" label */ }; /* @@ -68,6 +69,16 @@ int smack_net_nltype = NETLBL_NLTYPE_CIPSOV4; */ int smack_cipso_direct = SMACK_CIPSO_DIRECT_DEFAULT; +/* + * Unless a process is running with this label even + * having CAP_MAC_OVERRIDE isn't enough to grant + * privilege to violate MAC policy. If no label is + * designated (the NULL case) capabilities apply to + * everyone. It is expected that the hat (^) label + * will be used if any label is used. + */ +char *smack_onlycap; + static int smk_cipso_doi_value = SMACK_CIPSO_DOI_DEFAULT; struct smk_list_entry *smack_list; @@ -787,6 +798,85 @@ static const struct file_operations smk_ambient_ops = { .write = smk_write_ambient, }; +/** + * smk_read_onlycap - read() for /smack/onlycap + * @filp: file pointer, not actually used + * @buf: where to put the result + * @cn: maximum to send along + * @ppos: where to start + * + * Returns number of bytes read or error code, as appropriate + */ +static ssize_t smk_read_onlycap(struct file *filp, char __user *buf, + size_t cn, loff_t *ppos) +{ + char *smack = ""; + ssize_t rc = -EINVAL; + int asize; + + if (*ppos != 0) + return 0; + + if (smack_onlycap != NULL) + smack = smack_onlycap; + + asize = strlen(smack) + 1; + + if (cn >= asize) + rc = simple_read_from_buffer(buf, cn, ppos, smack, asize); + + return rc; +} + +/** + * smk_write_onlycap - write() for /smack/onlycap + * @filp: file pointer, not actually used + * @buf: where to get the data from + * @count: bytes sent + * @ppos: where to start + * + * Returns number of bytes written or error code, as appropriate + */ +static ssize_t smk_write_onlycap(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + char in[SMK_LABELLEN]; + char *sp = current->security; + + if (!capable(CAP_MAC_ADMIN)) + return -EPERM; + + /* + * This can be done using smk_access() but is done + * explicitly for clarity. The smk_access() implementation + * would use smk_access(smack_onlycap, MAY_WRITE) + */ + if (smack_onlycap != NULL && smack_onlycap != sp) + return -EPERM; + + if (count >= SMK_LABELLEN) + return -EINVAL; + + if (copy_from_user(in, buf, count) != 0) + return -EFAULT; + + /* + * Should the null string be passed in unset the onlycap value. + * This seems like something to be careful with as usually + * smk_import only expects to return NULL for errors. It + * is usually the case that a nullstring or "\n" would be + * bad to pass to smk_import but in fact this is useful here. + */ + smack_onlycap = smk_import(in, count); + + return count; +} + +static const struct file_operations smk_onlycap_ops = { + .read = smk_read_onlycap, + .write = smk_write_onlycap, +}; + struct option_names { int o_number; char *o_name; @@ -919,6 +1009,8 @@ static int smk_fill_super(struct super_block *sb, void *data, int silent) {"ambient", &smk_ambient_ops, S_IRUGO|S_IWUSR}, [SMK_NLTYPE] = {"nltype", &smk_nltype_ops, S_IRUGO|S_IWUSR}, + [SMK_ONLYCAP] = + {"onlycap", &smk_onlycap_ops, S_IRUGO|S_IWUSR}, /* last one */ {""} }; |