From bc7ee55633867909bb05e71f957a4d3c1aa1b488 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 30 Nov 2011 14:27:08 +0000 Subject: regmap: Add trace event for successful cache reads Currently we only trace physical reads; there's no instrumentation if the read is satisfied from cache. Signed-off-by: Mark Brown --- include/trace/events/regmap.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/regmap.h b/include/trace/events/regmap.h index 1e3193b8fcc8..12fbf43524e9 100644 --- a/include/trace/events/regmap.h +++ b/include/trace/events/regmap.h @@ -55,6 +55,15 @@ DEFINE_EVENT(regmap_reg, regmap_reg_read, ); +DEFINE_EVENT(regmap_reg, regmap_reg_read_cache, + + TP_PROTO(struct device *dev, unsigned int reg, + unsigned int val), + + TP_ARGS(dev, reg, val) + +); + DECLARE_EVENT_CLASS(regmap_block, TP_PROTO(struct device *dev, unsigned int reg, int count), -- cgit v1.2.3 From 83712358ba0a1497ce59a4f84ce4dd0f803fe6fc Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sat, 11 Jun 2011 19:25:42 -0600 Subject: writeback: dirty ratelimit - think time compensation Compensate the task's think time when computing the final pause time, so that ->dirty_ratelimit can be executed accurately. think time := time spent outside of balance_dirty_pages() In the rare case that the task slept longer than the 200ms period time (resulting in a negative pause time), the sleep time will be compensated in the following periods, too, if it's less than 1 second. Accumulated errors are carefully avoided as long as the max pause area is not hit. 
Pseudo code: period = pages_dirtied / task_ratelimit; think = jiffies - dirty_paused_when; pause = period - think; 1) normal case: period > think pause = period - think dirty_paused_when = jiffies + pause nr_dirtied = 0 period time |===============================>| think time pause time |===============>|==============>| ------|----------------|---------------|------------------------ dirty_paused_when jiffies 2) no pause case: period <= think don't pause; reduce future pause time by: dirty_paused_when += period nr_dirtied = 0 period time |===============================>| think time |===================================================>| ------|--------------------------------+-------------------|---- dirty_paused_when jiffies Acked-by: Jan Kara Acked-by: Peter Zijlstra Signed-off-by: Wu Fengguang --- include/linux/sched.h | 1 + include/trace/events/writeback.h | 14 +++++++++++--- kernel/fork.c | 1 + mm/page-writeback.c | 36 ++++++++++++++++++++++++++++++++---- 4 files changed, 45 insertions(+), 7 deletions(-) (limited to 'include/trace') diff --git a/include/linux/sched.h b/include/linux/sched.h index 1c4f3e9b9bc5..984c3b295978 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1527,6 +1527,7 @@ struct task_struct { */ int nr_dirtied; int nr_dirtied_pause; + unsigned long dirty_paused_when; /* start of a write-and-pause period */ #ifdef CONFIG_LATENCYTOP int latency_record_count; diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 99d1d0decf88..8588a8918023 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -300,12 +300,13 @@ TRACE_EVENT(balance_dirty_pages, unsigned long dirty_ratelimit, unsigned long task_ratelimit, unsigned long dirtied, + unsigned long period, long pause, unsigned long start_time), TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty, dirty_ratelimit, task_ratelimit, - dirtied, pause, start_time), + dirtied, period, pause, start_time), 
TP_STRUCT__entry( __array( char, bdi, 32) @@ -320,6 +321,8 @@ TRACE_EVENT(balance_dirty_pages, __field(unsigned int, dirtied_pause) __field(unsigned long, paused) __field( long, pause) + __field(unsigned long, period) + __field( long, think) ), TP_fast_assign( @@ -336,6 +339,9 @@ TRACE_EVENT(balance_dirty_pages, __entry->task_ratelimit = KBps(task_ratelimit); __entry->dirtied = dirtied; __entry->dirtied_pause = current->nr_dirtied_pause; + __entry->think = current->dirty_paused_when == 0 ? 0 : + (long)(jiffies - current->dirty_paused_when) * 1000/HZ; + __entry->period = period * 1000 / HZ; __entry->pause = pause * 1000 / HZ; __entry->paused = (jiffies - start_time) * 1000 / HZ; ), @@ -346,7 +352,7 @@ TRACE_EVENT(balance_dirty_pages, "bdi_setpoint=%lu bdi_dirty=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "dirtied=%u dirtied_pause=%u " - "paused=%lu pause=%ld", + "paused=%lu pause=%ld period=%lu think=%ld", __entry->bdi, __entry->limit, __entry->setpoint, @@ -358,7 +364,9 @@ TRACE_EVENT(balance_dirty_pages, __entry->dirtied, __entry->dirtied_pause, __entry->paused, /* ms */ - __entry->pause /* ms */ + __entry->pause, /* ms */ + __entry->period, /* ms */ + __entry->think /* ms */ ) ); diff --git a/kernel/fork.c b/kernel/fork.c index da4a6a10d088..f8668cf6a32d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1296,6 +1296,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->nr_dirtied = 0; p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); + p->dirty_paused_when = 0; /* * Ok, make it visible to the rest of the system. 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 96b3e7aa705c..491932155825 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1016,6 +1016,7 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; + long period; long pause = 0; long uninitialized_var(max_pause); bool dirty_exceeded = false; @@ -1026,6 +1027,8 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long start_time = jiffies; for (;;) { + unsigned long now = jiffies; + /* * Unstable writes are a feature of certain networked * filesystems (i.e. NFS) in which data may have been @@ -1045,8 +1048,11 @@ static void balance_dirty_pages(struct address_space *mapping, */ freerun = dirty_freerun_ceiling(dirty_thresh, background_thresh); - if (nr_dirty <= freerun) + if (nr_dirty <= freerun) { + current->dirty_paused_when = now; + current->nr_dirtied = 0; break; + } if (unlikely(!writeback_in_progress(bdi))) bdi_start_background_writeback(bdi); @@ -1104,10 +1110,21 @@ static void balance_dirty_pages(struct address_space *mapping, task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >> RATELIMIT_CALC_SHIFT; if (unlikely(task_ratelimit == 0)) { + period = max_pause; pause = max_pause; goto pause; } - pause = HZ * pages_dirtied / task_ratelimit; + period = HZ * pages_dirtied / task_ratelimit; + pause = period; + if (current->dirty_paused_when) + pause -= now - current->dirty_paused_when; + /* + * For less than 1s think time (ext3/4 may block the dirtier + * for up to 800ms from time to time on 1-HDD; so does xfs, + * however at much less frequency), try to compensate it in + * future periods by updating the virtual time; otherwise just + * do a reset, as it may be a light dirtier. 
+ */ if (unlikely(pause <= 0)) { trace_balance_dirty_pages(bdi, dirty_thresh, @@ -1118,8 +1135,16 @@ static void balance_dirty_pages(struct address_space *mapping, dirty_ratelimit, task_ratelimit, pages_dirtied, + period, pause, start_time); + if (pause < -HZ) { + current->dirty_paused_when = now; + current->nr_dirtied = 0; + } else if (period) { + current->dirty_paused_when += period; + current->nr_dirtied = 0; + } pause = 1; /* avoid resetting nr_dirtied_pause below */ break; } @@ -1135,11 +1160,15 @@ pause: dirty_ratelimit, task_ratelimit, pages_dirtied, + period, pause, start_time); __set_current_state(TASK_KILLABLE); io_schedule_timeout(pause); + current->dirty_paused_when = now + pause; + current->nr_dirtied = 0; + /* * This is typically equal to (nr_dirty < dirty_thresh) and can * also keep "1000+ dd on a slow USB stick" under control. @@ -1167,11 +1196,10 @@ pause: if (!dirty_exceeded && bdi->dirty_exceeded) bdi->dirty_exceeded = 0; - current->nr_dirtied = 0; if (pause == 0) { /* in freerun area */ current->nr_dirtied_pause = dirty_poll_interval(nr_dirty, dirty_thresh); - } else if (pause <= max_pause / 4 && + } else if (period <= max_pause / 4 && pages_dirtied >= current->nr_dirtied_pause) { current->nr_dirtied_pause = clamp_val( dirty_ratelimit * (max_pause / 2) / HZ, -- cgit v1.2.3 From 60e07cf515e541ea3e13b888d273c9b19a2ad9dd Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Sun, 18 Dec 2011 15:49:54 -0500 Subject: ext4: do not reference pa_inode from group_pa pa_inode in group_pa is set to NULL in ext4_mb_new_group_pa, so pa_inode should not be referenced. 
Reported-by: Wu Fengguang Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 2 +- include/trace/events/ext4.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/trace') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e2d8be8f28bf..cb990b21c698 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3671,7 +3671,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, ext4_group_t group; ext4_grpblk_t bit; - trace_ext4_mb_release_group_pa(pa); + trace_ext4_mb_release_group_pa(sb, pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 748ff7cbe555..319538bf17d2 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -573,9 +573,9 @@ TRACE_EVENT(ext4_mb_release_inode_pa, ); TRACE_EVENT(ext4_mb_release_group_pa, - TP_PROTO(struct ext4_prealloc_space *pa), + TP_PROTO(struct super_block *sb, struct ext4_prealloc_space *pa), - TP_ARGS(pa), + TP_ARGS(sb, pa), TP_STRUCT__entry( __field( dev_t, dev ) @@ -585,7 +585,7 @@ TRACE_EVENT(ext4_mb_release_group_pa, ), TP_fast_assign( - __entry->dev = pa->pa_inode->i_sb->s_dev; + __entry->dev = sb->s_dev; __entry->pa_pstart = pa->pa_pstart; __entry->pa_len = pa->pa_len; ), -- cgit v1.2.3 From b413d48aa70605701c0b395b2e350ca15f5d643a Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 10 Jan 2012 15:07:09 -0800 Subject: mm-tracepoint: rename page-free events Rename mm_page_free_direct into mm_page_free and mm_pagevec_free into mm_page_free_batched Since v2.6.33-5426-gc475dab the kernel triggers mm_page_free_direct for all freed pages, not only for directly freed. So, let's name it properly. For pages freed via page-list we also trigger mm_page_free_batched event. 
Signed-off-by: Konstantin Khlebnikov Cc: Mel Gorman Cc: KOSAKI Motohiro Reviewed-by: Minchan Kim Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/trace/events-kmem.txt | 12 ++++++------ .../trace/postprocess/trace-pagealloc-postprocess.pl | 20 ++++++++++---------- include/trace/events/kmem.h | 4 ++-- mm/page_alloc.c | 4 ++-- 4 files changed, 20 insertions(+), 20 deletions(-) (limited to 'include/trace') diff --git a/Documentation/trace/events-kmem.txt b/Documentation/trace/events-kmem.txt index aa82ee4a5a87..194800410061 100644 --- a/Documentation/trace/events-kmem.txt +++ b/Documentation/trace/events-kmem.txt @@ -40,8 +40,8 @@ but the call_site can usually be used to extrapolate that information. ================== mm_page_alloc page=%p pfn=%lu order=%d migratetype=%d gfp_flags=%s mm_page_alloc_zone_locked page=%p pfn=%lu order=%u migratetype=%d cpu=%d percpu_refill=%d -mm_page_free_direct page=%p pfn=%lu order=%d -mm_pagevec_free page=%p pfn=%lu order=%d cold=%d +mm_page_free page=%p pfn=%lu order=%d +mm_page_free_batched page=%p pfn=%lu order=%d cold=%d These four events deal with page allocation and freeing. mm_page_alloc is a simple indicator of page allocator activity. Pages may be allocated from @@ -53,13 +53,13 @@ amounts of activity imply high activity on the zone->lock. Taking this lock impairs performance by disabling interrupts, dirtying cache lines between CPUs and serialising many CPUs. -When a page is freed directly by the caller, the mm_page_free_direct event +When a page is freed directly by the caller, the only mm_page_free event is triggered. Significant amounts of activity here could indicate that the callers should be batching their activities. -When pages are freed using a pagevec, the mm_pagevec_free is -triggered. Broadly speaking, pages are taken off the LRU lock in bulk and -freed in batch with a pagevec. 
Significant amounts of activity here could +When pages are freed in batch, the also mm_page_free_batched is triggered. +Broadly speaking, pages are taken off the LRU lock in bulk and +freed in batch with a page list. Significant amounts of activity here could indicate that the system is under memory pressure and can also indicate contention on the zone->lru_lock. diff --git a/Documentation/trace/postprocess/trace-pagealloc-postprocess.pl b/Documentation/trace/postprocess/trace-pagealloc-postprocess.pl index 7df50e8cf4d9..0a120aae33ce 100644 --- a/Documentation/trace/postprocess/trace-pagealloc-postprocess.pl +++ b/Documentation/trace/postprocess/trace-pagealloc-postprocess.pl @@ -17,8 +17,8 @@ use Getopt::Long; # Tracepoint events use constant MM_PAGE_ALLOC => 1; -use constant MM_PAGE_FREE_DIRECT => 2; -use constant MM_PAGEVEC_FREE => 3; +use constant MM_PAGE_FREE => 2; +use constant MM_PAGE_FREE_BATCHED => 3; use constant MM_PAGE_PCPU_DRAIN => 4; use constant MM_PAGE_ALLOC_ZONE_LOCKED => 5; use constant MM_PAGE_ALLOC_EXTFRAG => 6; @@ -223,10 +223,10 @@ EVENT_PROCESS: # Perl Switch() sucks majorly if ($tracepoint eq "mm_page_alloc") { $perprocesspid{$process_pid}->{MM_PAGE_ALLOC}++; - } elsif ($tracepoint eq "mm_page_free_direct") { - $perprocesspid{$process_pid}->{MM_PAGE_FREE_DIRECT}++; - } elsif ($tracepoint eq "mm_pagevec_free") { - $perprocesspid{$process_pid}->{MM_PAGEVEC_FREE}++; + } elsif ($tracepoint eq "mm_page_free") { + $perprocesspid{$process_pid}->{MM_PAGE_FREE}++ + } elsif ($tracepoint eq "mm_page_free_batched") { + $perprocesspid{$process_pid}->{MM_PAGE_FREE_BATCHED}++; } elsif ($tracepoint eq "mm_page_pcpu_drain") { $perprocesspid{$process_pid}->{MM_PAGE_PCPU_DRAIN}++; $perprocesspid{$process_pid}->{STATE_PCPU_PAGES_DRAINED}++; @@ -336,8 +336,8 @@ sub dump_stats { $process_pid, $stats{$process_pid}->{MM_PAGE_ALLOC}, $stats{$process_pid}->{MM_PAGE_ALLOC_ZONE_LOCKED}, - $stats{$process_pid}->{MM_PAGE_FREE_DIRECT}, - 
$stats{$process_pid}->{MM_PAGEVEC_FREE}, + $stats{$process_pid}->{MM_PAGE_FREE}, + $stats{$process_pid}->{MM_PAGE_FREE_BATCHED}, $stats{$process_pid}->{MM_PAGE_PCPU_DRAIN}, $stats{$process_pid}->{HIGH_PCPU_DRAINS}, $stats{$process_pid}->{HIGH_PCPU_REFILLS}, @@ -364,8 +364,8 @@ sub aggregate_perprocesspid() { $perprocess{$process}->{MM_PAGE_ALLOC} += $perprocesspid{$process_pid}->{MM_PAGE_ALLOC}; $perprocess{$process}->{MM_PAGE_ALLOC_ZONE_LOCKED} += $perprocesspid{$process_pid}->{MM_PAGE_ALLOC_ZONE_LOCKED}; - $perprocess{$process}->{MM_PAGE_FREE_DIRECT} += $perprocesspid{$process_pid}->{MM_PAGE_FREE_DIRECT}; - $perprocess{$process}->{MM_PAGEVEC_FREE} += $perprocesspid{$process_pid}->{MM_PAGEVEC_FREE}; + $perprocess{$process}->{MM_PAGE_FREE} += $perprocesspid{$process_pid}->{MM_PAGE_FREE}; + $perprocess{$process}->{MM_PAGE_FREE_BATCHED} += $perprocesspid{$process_pid}->{MM_PAGE_FREE_BATCHED}; $perprocess{$process}->{MM_PAGE_PCPU_DRAIN} += $perprocesspid{$process_pid}->{MM_PAGE_PCPU_DRAIN}; $perprocess{$process}->{HIGH_PCPU_DRAINS} += $perprocesspid{$process_pid}->{HIGH_PCPU_DRAINS}; $perprocess{$process}->{HIGH_PCPU_REFILLS} += $perprocesspid{$process_pid}->{HIGH_PCPU_REFILLS}; diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index a9c87ad8331c..5f889f16b0c8 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -147,7 +147,7 @@ DEFINE_EVENT(kmem_free, kmem_cache_free, TP_ARGS(call_site, ptr) ); -TRACE_EVENT(mm_page_free_direct, +TRACE_EVENT(mm_page_free, TP_PROTO(struct page *page, unsigned int order), @@ -169,7 +169,7 @@ TRACE_EVENT(mm_page_free_direct, __entry->order) ); -TRACE_EVENT(mm_pagevec_free, +TRACE_EVENT(mm_page_free_batched, TP_PROTO(struct page *page, int cold), diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6c77efbca5bc..516ab623d773 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -632,7 +632,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order) int i; int bad = 0; - 
+ trace_mm_page_free(page, order); kmemcheck_free_shadow(page, order); if (PageAnon(page)) @@ -1196,7 +1196,7 @@ void free_hot_cold_page_list(struct list_head *list, int cold) struct page *page, *next; list_for_each_entry_safe(page, next, list, lru) { - trace_mm_pagevec_free(page, cold); + trace_mm_page_free_batched(page, cold); free_hot_cold_page(page, cold); } } -- cgit v1.2.3 From 43d2b113241d6797b890318767e0af78e313414b Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 10 Jan 2012 15:08:09 -0800 Subject: tracepoint: add tracepoints for debugging oom_score_adj oom_score_adj is used for guarding processes from OOM-Killer. One problem is that it's inherited at fork(). When a daemon sets oom_score_adj and creates children, it's hard to know where the value is set. This patch adds some tracepoints useful for debugging. This patch adds 3 tracepoints. - creating new task - renaming a task (exec) - setting oom_score_adj To debug, users need to enable some tracepoints. Filtering may be useful, e.g. # EVENT=/sys/kernel/debug/tracing/events/task/ # echo "oom_score_adj != 0" > $EVENT/task_newtask/filter # echo "oom_score_adj != 0" > $EVENT/task_rename/filter # echo 1 > $EVENT/enable # EVENT=/sys/kernel/debug/tracing/events/oom/ # echo 1 > $EVENT/enable The output will look like this: 
# grep oom /sys/kernel/debug/tracing/trace bash-7699 [007] d..3 5140.744510: oom_score_adj_update: pid=7699 comm=bash oom_score_adj=-1000 bash-7699 [007] ...1 5151.818022: task_newtask: pid=7729 comm=bash clone_flags=1200011 oom_score_adj=-1000 ls-7729 [003] ...2 5151.818504: task_rename: pid=7729 oldcomm=bash newcomm=ls oom_score_adj=-1000 bash-7699 [002] ...1 5175.701468: task_newtask: pid=7730 comm=bash clone_flags=1200011 oom_score_adj=-1000 grep-7730 [007] ...2 5175.701993: task_rename: pid=7730 oldcomm=bash newcomm=grep oom_score_adj=-1000 Signed-off-by: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 4 +++ fs/proc/base.c | 3 +++ include/trace/events/oom.h | 33 ++++++++++++++++++++++++ include/trace/events/task.h | 61 +++++++++++++++++++++++++++++++++++++++++++++ kernel/fork.c | 6 +++++ mm/oom_kill.c | 6 +++++ 6 files changed, 113 insertions(+) create mode 100644 include/trace/events/oom.h create mode 100644 include/trace/events/task.h (limited to 'include/trace') diff --git a/fs/exec.c b/fs/exec.c index 3f64b9f26e7d..aeb135c7ff5c 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -59,6 +59,8 @@ #include #include #include + +#include #include "internal.h" int core_uses_pid; @@ -1054,6 +1056,8 @@ void set_task_comm(struct task_struct *tsk, char *buf) { task_lock(tsk); + trace_task_rename(tsk, buf); + /* * Threads may access current->comm without holding * the task lock, so write the string carefully. 
diff --git a/fs/proc/base.c b/fs/proc/base.c index a1dddda999f2..1aab5fe05a1b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -86,6 +86,7 @@ #ifdef CONFIG_HARDWALL #include #endif +#include #include "internal.h" /* NOTE: @@ -1010,6 +1011,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, else task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; + trace_oom_score_adj_update(task); err_sighand: unlock_task_sighand(task, &flags); err_task_lock: @@ -1097,6 +1099,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, task->signal->oom_score_adj = oom_score_adj; if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) task->signal->oom_score_adj_min = oom_score_adj; + trace_oom_score_adj_update(task); /* * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is * always attainable. diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h new file mode 100644 index 000000000000..dd4ba3b92002 --- /dev/null +++ b/include/trace/events/oom.h @@ -0,0 +1,33 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM oom + +#if !defined(_TRACE_OOM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_OOM_H +#include + +TRACE_EVENT(oom_score_adj_update, + + TP_PROTO(struct task_struct *task), + + TP_ARGS(task), + + TP_STRUCT__entry( + __field( pid_t, pid) + __array( char, comm, TASK_COMM_LEN ) + __field( int, oom_score_adj) + ), + + TP_fast_assign( + __entry->pid = task->pid; + memcpy(__entry->comm, task->comm, TASK_COMM_LEN); + __entry->oom_score_adj = task->signal->oom_score_adj; + ), + + TP_printk("pid=%d comm=%s oom_score_adj=%d", + __entry->pid, __entry->comm, __entry->oom_score_adj) +); + +#endif + +/* This part must be outside protection */ +#include diff --git a/include/trace/events/task.h b/include/trace/events/task.h new file mode 100644 index 000000000000..b53add02e929 --- /dev/null +++ b/include/trace/events/task.h @@ -0,0 +1,61 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM 
task + +#if !defined(_TRACE_TASK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_TASK_H +#include + +TRACE_EVENT(task_newtask, + + TP_PROTO(struct task_struct *task, unsigned long clone_flags), + + TP_ARGS(task, clone_flags), + + TP_STRUCT__entry( + __field( pid_t, pid) + __array( char, comm, TASK_COMM_LEN) + __field( unsigned long, clone_flags) + __field( int, oom_score_adj) + ), + + TP_fast_assign( + __entry->pid = task->pid; + memcpy(__entry->comm, task->comm, TASK_COMM_LEN); + __entry->clone_flags = clone_flags; + __entry->oom_score_adj = task->signal->oom_score_adj; + ), + + TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%d", + __entry->pid, __entry->comm, + __entry->clone_flags, __entry->oom_score_adj) +); + +TRACE_EVENT(task_rename, + + TP_PROTO(struct task_struct *task, char *comm), + + TP_ARGS(task, comm), + + TP_STRUCT__entry( + __field( pid_t, pid) + __array( char, oldcomm, TASK_COMM_LEN) + __array( char, newcomm, TASK_COMM_LEN) + __field( int, oom_score_adj) + ), + + TP_fast_assign( + __entry->pid = task->pid; + memcpy(entry->oldcomm, task->comm, TASK_COMM_LEN); + memcpy(entry->newcomm, comm, TASK_COMM_LEN); + __entry->oom_score_adj = task->signal->oom_score_adj; + ), + + TP_printk("pid=%d oldcomm=%s newcomm=%s oom_score_adj=%d", + __entry->pid, __entry->oldcomm, + __entry->newcomm, __entry->oom_score_adj) +); + +#endif + +/* This part must be outside protection */ +#include diff --git a/kernel/fork.c b/kernel/fork.c index b00711ce7c13..5e1391b5ade0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -76,6 +76,9 @@ #include +#define CREATE_TRACE_POINTS +#include + /* * Protected counters by write_lock_irq(&tasklist_lock) */ @@ -1370,6 +1373,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (clone_flags & CLONE_THREAD) threadgroup_change_end(current); perf_event_fork(p); + + trace_task_newtask(p, clone_flags); + return p; bad_fork_free_pid: diff --git a/mm/oom_kill.c b/mm/oom_kill.c index eeb27e27dce3..7c122faa05c5 
100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -33,6 +33,10 @@ #include #include #include +#include + +#define CREATE_TRACE_POINTS +#include int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; @@ -55,6 +59,7 @@ void compare_swap_oom_score_adj(int old_val, int new_val) spin_lock_irq(&sighand->siglock); if (current->signal->oom_score_adj == old_val) current->signal->oom_score_adj = new_val; + trace_oom_score_adj_update(current); spin_unlock_irq(&sighand->siglock); } @@ -74,6 +79,7 @@ int test_set_oom_score_adj(int new_val) spin_lock_irq(&sighand->siglock); old_val = current->signal->oom_score_adj; current->signal->oom_score_adj = new_val; + trace_oom_score_adj_update(current); spin_unlock_irq(&sighand->siglock); return old_val; -- cgit v1.2.3 From ea4d349ffa8028c655236497c2ba17c17aaa0d65 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 12 Jan 2012 17:19:20 -0800 Subject: vmscan/trace: Add 'file' info to trace_mm_vmscan_lru_isolate() In trace_mm_vmscan_lru_isolate(), we don't output 'file' information to the trace event and it is a bit inconvenient for the user to get the real information(like pasted below). 
mm_vmscan_lru_isolate: isolate_mode=2 order=0 nr_requested=32 nr_scanned=32 nr_taken=32 contig_taken=0 contig_dirty=0 contig_failed=0 'active' can be obtained by analyzing mode(Thanks go to Minchan and Mel), So this patch adds 'file' to the trace event and it now looks like: mm_vmscan_lru_isolate: isolate_mode=2 order=0 nr_requested=32 nr_scanned=32 nr_taken=32 contig_taken=0 contig_dirty=0 contig_failed=0 file=0 Signed-off-by: Tao Ma Acked-by: KOSAKI Motohiro Reviewed-by: KAMEZAWA Hiroyuki Cc: Mel Gorman Reviewed-by: Minchan Kim Cc: Rik van Riel Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/vmscan.h | 22 ++++++++++++++-------- mm/vmscan.c | 2 +- 2 files changed, 15 insertions(+), 9 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index edc4b3d25a2d..f64560e204bc 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -266,9 +266,10 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, unsigned long nr_lumpy_taken, unsigned long nr_lumpy_dirty, unsigned long nr_lumpy_failed, - isolate_mode_t isolate_mode), + isolate_mode_t isolate_mode, + int file), - TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode), + TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode, file), TP_STRUCT__entry( __field(int, order) @@ -279,6 +280,7 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, __field(unsigned long, nr_lumpy_dirty) __field(unsigned long, nr_lumpy_failed) __field(isolate_mode_t, isolate_mode) + __field(int, file) ), TP_fast_assign( @@ -290,9 +292,10 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, __entry->nr_lumpy_dirty = nr_lumpy_dirty; __entry->nr_lumpy_failed = nr_lumpy_failed; __entry->isolate_mode = isolate_mode; + __entry->file = file; ), - TP_printk("isolate_mode=%d order=%d 
nr_requested=%lu nr_scanned=%lu nr_taken=%lu contig_taken=%lu contig_dirty=%lu contig_failed=%lu", + TP_printk("isolate_mode=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu contig_taken=%lu contig_dirty=%lu contig_failed=%lu file=%d", __entry->isolate_mode, __entry->order, __entry->nr_requested, @@ -300,7 +303,8 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, __entry->nr_taken, __entry->nr_lumpy_taken, __entry->nr_lumpy_dirty, - __entry->nr_lumpy_failed) + __entry->nr_lumpy_failed, + __entry->file) ); DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate, @@ -312,9 +316,10 @@ DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate, unsigned long nr_lumpy_taken, unsigned long nr_lumpy_dirty, unsigned long nr_lumpy_failed, - isolate_mode_t isolate_mode), + isolate_mode_t isolate_mode, + int file), - TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode) + TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode, file) ); @@ -327,9 +332,10 @@ DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_memcg_isolate, unsigned long nr_lumpy_taken, unsigned long nr_lumpy_dirty, unsigned long nr_lumpy_failed, - isolate_mode_t isolate_mode), + isolate_mode_t isolate_mode, + int file), - TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode) + TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode, file) ); diff --git a/mm/vmscan.c b/mm/vmscan.c index e16ca8384ef7..a85a261bf8f9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1234,7 +1234,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, nr_to_scan, scan, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, - mode); + mode, file); return nr_taken; } -- cgit v1.2.3 From 3f7de037fb3727b20bc27332cdcf2488b702394c Mon Sep 17 00:00:00 2001 
From: Josef Bacik Date: Thu, 10 Nov 2011 08:29:20 -0500 Subject: Btrfs: add allocator tracepoints I used these tracepoints when figuring out what the cluster stuff was doing, so add them to mainline in case we need to profile this stuff again. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 9 +++ fs/btrfs/free-space-cache.c | 12 ++- include/trace/events/btrfs.h | 173 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 193 insertions(+), 1 deletion(-) (limited to 'include/trace') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a44072a692ab..ad1a20bc834d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5256,6 +5256,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, ins->objectid = 0; ins->offset = 0; + trace_find_free_extent(orig_root, num_bytes, empty_size, data); + space_info = __find_space_info(root->fs_info, data); if (!space_info) { printk(KERN_ERR "No space info for %llu\n", data); @@ -5432,6 +5434,8 @@ alloc: if (offset) { /* we have a block, we're done */ spin_unlock(&last_ptr->refill_lock); + trace_btrfs_reserve_extent_cluster(root, + block_group, search_start, num_bytes); goto checks; } @@ -5490,6 +5494,9 @@ refill_cluster: if (offset) { /* we found one, proceed */ spin_unlock(&last_ptr->refill_lock); + trace_btrfs_reserve_extent_cluster(root, + block_group, search_start, + num_bytes); goto checks; } } else if (!cached && loop > LOOP_CACHING_NOWAIT @@ -5576,6 +5583,8 @@ checks: ins->objectid = search_start; ins->offset = num_bytes; + trace_btrfs_reserve_extent(orig_root, block_group, + search_start, num_bytes); if (offset < search_start) btrfs_add_free_space(used_block_group, offset, search_start - offset); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 6c7887a7770c..efe20032e4a1 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2346,6 +2346,8 @@ again: &entry->offset_index, 1); BUG_ON(ret); + 
trace_btrfs_setup_cluster(block_group, cluster, + total_found * block_group->sectorsize, 1); return 0; } @@ -2368,6 +2370,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, u64 window_start; u64 window_free; u64 max_extent; + u64 total_size = 0; entry = tree_search_offset(ctl, offset, 0, 1); if (!entry) @@ -2433,11 +2436,12 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, rb_erase(&entry->offset_index, &ctl->free_space_offset); ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 0); + total_size += entry->bytes; BUG_ON(ret); } while (node && entry != last); cluster->max_size = max_extent; - + trace_btrfs_setup_cluster(block_group, cluster, total_size, 0); return 0; } @@ -2542,6 +2546,10 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, goto out; } + trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, + min_bytes); + + INIT_LIST_HEAD(&bitmaps); ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, bytes + empty_size, cont1_bytes, min_bytes); @@ -2559,6 +2567,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, list_add_tail(&cluster->block_group_list, &block_group->cluster_list); cluster->block_group = block_group; + } else { + trace_btrfs_failed_cluster_setup(block_group); } out: spin_unlock(&cluster->lock); diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index b31702ac15be..1750c0e6660c 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -16,6 +16,8 @@ struct btrfs_delayed_ref_node; struct btrfs_delayed_tree_ref; struct btrfs_delayed_data_ref; struct btrfs_delayed_ref_head; +struct btrfs_block_group_cache; +struct btrfs_free_cluster; struct map_lookup; struct extent_buffer; @@ -44,6 +46,15 @@ struct extent_buffer; obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) || \ (obj <= BTRFS_CSUM_TREE_OBJECTID )) ? 
__show_root_type(obj) : "-" +#define BTRFS_GROUP_FLAGS \ + { BTRFS_BLOCK_GROUP_DATA, "DATA"}, \ + { BTRFS_BLOCK_GROUP_SYSTEM, "SYSTEM"}, \ + { BTRFS_BLOCK_GROUP_METADATA, "METADATA"}, \ + { BTRFS_BLOCK_GROUP_RAID0, "RAID0"}, \ + { BTRFS_BLOCK_GROUP_RAID1, "RAID1"}, \ + { BTRFS_BLOCK_GROUP_DUP, "DUP"}, \ + { BTRFS_BLOCK_GROUP_RAID10, "RAID10"} + TRACE_EVENT(btrfs_transaction_commit, TP_PROTO(struct btrfs_root *root), @@ -659,6 +670,168 @@ DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_free, TP_ARGS(root, start, len) ); +TRACE_EVENT(find_free_extent, + + TP_PROTO(struct btrfs_root *root, u64 num_bytes, u64 empty_size, + u64 data), + + TP_ARGS(root, num_bytes, empty_size, data), + + TP_STRUCT__entry( + __field( u64, root_objectid ) + __field( u64, num_bytes ) + __field( u64, empty_size ) + __field( u64, data ) + ), + + TP_fast_assign( + __entry->root_objectid = root->root_key.objectid; + __entry->num_bytes = num_bytes; + __entry->empty_size = empty_size; + __entry->data = data; + ), + + TP_printk("root = %Lu(%s), len = %Lu, empty_size = %Lu, " + "flags = %Lu(%s)", show_root_type(__entry->root_objectid), + __entry->num_bytes, __entry->empty_size, __entry->data, + __print_flags((unsigned long)__entry->data, "|", + BTRFS_GROUP_FLAGS)) +); + +DECLARE_EVENT_CLASS(btrfs__reserve_extent, + + TP_PROTO(struct btrfs_root *root, + struct btrfs_block_group_cache *block_group, u64 start, + u64 len), + + TP_ARGS(root, block_group, start, len), + + TP_STRUCT__entry( + __field( u64, root_objectid ) + __field( u64, bg_objectid ) + __field( u64, flags ) + __field( u64, start ) + __field( u64, len ) + ), + + TP_fast_assign( + __entry->root_objectid = root->root_key.objectid; + __entry->bg_objectid = block_group->key.objectid; + __entry->flags = block_group->flags; + __entry->start = start; + __entry->len = len; + ), + + TP_printk("root = %Lu(%s), block_group = %Lu, flags = %Lu(%s), " + "start = %Lu, len = %Lu", + show_root_type(__entry->root_objectid), __entry->bg_objectid, 
+ __entry->flags, __print_flags((unsigned long)__entry->flags, + "|", BTRFS_GROUP_FLAGS), + __entry->start, __entry->len) +); + +DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent, + + TP_PROTO(struct btrfs_root *root, + struct btrfs_block_group_cache *block_group, u64 start, + u64 len), + + TP_ARGS(root, block_group, start, len) +); + +DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster, + + TP_PROTO(struct btrfs_root *root, + struct btrfs_block_group_cache *block_group, u64 start, + u64 len), + + TP_ARGS(root, block_group, start, len) +); + +TRACE_EVENT(btrfs_find_cluster, + + TP_PROTO(struct btrfs_block_group_cache *block_group, u64 start, + u64 bytes, u64 empty_size, u64 min_bytes), + + TP_ARGS(block_group, start, bytes, empty_size, min_bytes), + + TP_STRUCT__entry( + __field( u64, bg_objectid ) + __field( u64, flags ) + __field( u64, start ) + __field( u64, bytes ) + __field( u64, empty_size ) + __field( u64, min_bytes ) + ), + + TP_fast_assign( + __entry->bg_objectid = block_group->key.objectid; + __entry->flags = block_group->flags; + __entry->start = start; + __entry->bytes = bytes; + __entry->empty_size = empty_size; + __entry->min_bytes = min_bytes; + ), + + TP_printk("block_group = %Lu, flags = %Lu(%s), start = %Lu, len = %Lu," + " empty_size = %Lu, min_bytes = %Lu", __entry->bg_objectid, + __entry->flags, + __print_flags((unsigned long)__entry->flags, "|", + BTRFS_GROUP_FLAGS), __entry->start, + __entry->bytes, __entry->empty_size, __entry->min_bytes) +); + +TRACE_EVENT(btrfs_failed_cluster_setup, + + TP_PROTO(struct btrfs_block_group_cache *block_group), + + TP_ARGS(block_group), + + TP_STRUCT__entry( + __field( u64, bg_objectid ) + ), + + TP_fast_assign( + __entry->bg_objectid = block_group->key.objectid; + ), + + TP_printk("block_group = %Lu", __entry->bg_objectid) +); + +TRACE_EVENT(btrfs_setup_cluster, + + TP_PROTO(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, u64 size, int bitmap), + + 
TP_ARGS(block_group, cluster, size, bitmap), + + TP_STRUCT__entry( + __field( u64, bg_objectid ) + __field( u64, flags ) + __field( u64, start ) + __field( u64, max_size ) + __field( u64, size ) + __field( int, bitmap ) + ), + + TP_fast_assign( + __entry->bg_objectid = block_group->key.objectid; + __entry->flags = block_group->flags; + __entry->start = cluster->window_start; + __entry->max_size = cluster->max_size; + __entry->size = size; + __entry->bitmap = bitmap; + ), + + TP_printk("block_group = %Lu, flags = %Lu(%s), window_start = %Lu, " + "size = %Lu, max_size = %Lu, bitmap = %d", + __entry->bg_objectid, + __entry->flags, + __print_flags((unsigned long)__entry->flags, "|", + BTRFS_GROUP_FLAGS), __entry->start, + __entry->size, __entry->max_size, __entry->bitmap) +); + #endif /* _TRACE_BTRFS_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 8c2a3ca20f6233677ac3222c6506174010eb414f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 10 Jan 2012 10:31:31 -0500 Subject: Btrfs: space leak tracepoints This in addition to a script in my btrfs-tracing tree will help track down space leaks when we're getting space left over in block groups on umount. 
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/delayed-inode.c | 45 +++++++++++++++++++++++++--------- fs/btrfs/extent-tree.c | 58 ++++++++++++++++++++++++++++++++++++++------ fs/btrfs/inode-map.c | 4 +++ fs/btrfs/transaction.c | 2 ++ include/trace/events/btrfs.h | 30 +++++++++++++++++++++++ 5 files changed, 119 insertions(+), 20 deletions(-) (limited to 'include/trace') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 9c1eccc2c503..fe4cd0f1cef1 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -595,8 +595,12 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, num_bytes = btrfs_calc_trans_metadata_size(root, 1); ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); - if (!ret) + if (!ret) { + trace_btrfs_space_reservation(root->fs_info, "delayed_item", + item->key.objectid, + num_bytes, 1); item->bytes_reserved = num_bytes; + } return ret; } @@ -610,6 +614,9 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, return; rsv = &root->fs_info->delayed_block_rsv; + trace_btrfs_space_reservation(root->fs_info, "delayed_item", + item->key.objectid, item->bytes_reserved, + 0); btrfs_block_rsv_release(root, rsv, item->bytes_reserved); } @@ -624,7 +631,7 @@ static int btrfs_delayed_inode_reserve_metadata( struct btrfs_block_rsv *dst_rsv; u64 num_bytes; int ret; - int release = false; + bool release = false; src_rsv = trans->block_rsv; dst_rsv = &root->fs_info->delayed_block_rsv; @@ -651,8 +658,13 @@ static int btrfs_delayed_inode_reserve_metadata( */ if (ret == -EAGAIN) ret = -ENOSPC; - if (!ret) + if (!ret) { node->bytes_reserved = num_bytes; + trace_btrfs_space_reservation(root->fs_info, + "delayed_inode", + btrfs_ino(inode), + num_bytes, 1); + } return ret; } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { spin_lock(&BTRFS_I(inode)->lock); @@ -707,11 +719,17 @@ out: * reservation here. I think it may be time for a documentation page on * how block rsvs. 
work. */ - if (!ret) + if (!ret) { + trace_btrfs_space_reservation(root->fs_info, "delayed_inode", + btrfs_ino(inode), num_bytes, 1); node->bytes_reserved = num_bytes; + } - if (release) + if (release) { + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), num_bytes, 0); btrfs_block_rsv_release(root, src_rsv, num_bytes); + } return ret; } @@ -725,6 +743,8 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root, return; rsv = &root->fs_info->delayed_block_rsv; + trace_btrfs_space_reservation(root->fs_info, "delayed_inode", + node->inode_id, node->bytes_reserved, 0); btrfs_block_rsv_release(root, rsv, node->bytes_reserved); node->bytes_reserved = 0; @@ -1372,13 +1392,6 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, goto release_node; } - ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item); - /* - * we have reserved enough space when we start a new transaction, - * so reserving metadata failure is impossible - */ - BUG_ON(ret); - delayed_item->key.objectid = btrfs_ino(dir); btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY); delayed_item->key.offset = index; @@ -1391,6 +1404,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, dir_item->type = type; memcpy((char *)(dir_item + 1), name, name_len); + ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item); + /* + * we have reserved enough space when we start a new transaction, + * so reserving metadata failure is impossible + */ + BUG_ON(ret); + + mutex_lock(&delayed_node->mutex); ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); if (unlikely(ret)) { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ad1a20bc834d..556f9aa25bb7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3310,6 +3310,8 @@ commit_trans: return -ENOSPC; } data_sinfo->bytes_may_use += bytes; + trace_btrfs_space_reservation(root->fs_info, "space_info", + 
(u64)data_sinfo, bytes, 1); spin_unlock(&data_sinfo->lock); return 0; @@ -3329,6 +3331,8 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) data_sinfo = BTRFS_I(inode)->space_info; spin_lock(&data_sinfo->lock); data_sinfo->bytes_may_use -= bytes; + trace_btrfs_space_reservation(root->fs_info, "space_info", + (u64)data_sinfo, bytes, 0); spin_unlock(&data_sinfo->lock); } @@ -3686,6 +3690,10 @@ again: if (used <= space_info->total_bytes) { if (used + orig_bytes <= space_info->total_bytes) { space_info->bytes_may_use += orig_bytes; + trace_btrfs_space_reservation(root->fs_info, + "space_info", + (u64)space_info, + orig_bytes, 1); ret = 0; } else { /* @@ -3753,6 +3761,10 @@ again: if (used + num_bytes < space_info->total_bytes + avail) { space_info->bytes_may_use += orig_bytes; + trace_btrfs_space_reservation(root->fs_info, + "space_info", + (u64)space_info, + orig_bytes, 1); ret = 0; } else { wait_ordered = true; @@ -3859,7 +3871,8 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, spin_unlock(&block_rsv->lock); } -static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, +static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, struct btrfs_block_rsv *dest, u64 num_bytes) { struct btrfs_space_info *space_info = block_rsv->space_info; @@ -3895,6 +3908,9 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, if (num_bytes) { spin_lock(&space_info->lock); space_info->bytes_may_use -= num_bytes; + trace_btrfs_space_reservation(fs_info, "space_info", + (u64)space_info, + num_bytes, 0); space_info->reservation_progress++; spin_unlock(&space_info->lock); } @@ -4051,7 +4067,8 @@ void btrfs_block_rsv_release(struct btrfs_root *root, if (global_rsv->full || global_rsv == block_rsv || block_rsv->space_info != global_rsv->space_info) global_rsv = NULL; - block_rsv_release_bytes(block_rsv, global_rsv, num_bytes); + block_rsv_release_bytes(root->fs_info, block_rsv, 
global_rsv, + num_bytes); } /* @@ -4110,11 +4127,15 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) num_bytes = sinfo->total_bytes - num_bytes; block_rsv->reserved += num_bytes; sinfo->bytes_may_use += num_bytes; + trace_btrfs_space_reservation(fs_info, "space_info", + (u64)sinfo, num_bytes, 1); } if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; sinfo->bytes_may_use -= num_bytes; + trace_btrfs_space_reservation(fs_info, "space_info", + (u64)sinfo, num_bytes, 0); sinfo->reservation_progress++; block_rsv->reserved = block_rsv->size; block_rsv->full = 1; @@ -4149,7 +4170,8 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) static void release_global_block_rsv(struct btrfs_fs_info *fs_info) { - block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1); + block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, + (u64)-1); WARN_ON(fs_info->delalloc_block_rsv.size > 0); WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); WARN_ON(fs_info->trans_block_rsv.size > 0); @@ -4166,6 +4188,8 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, if (!trans->bytes_reserved) return; + trace_btrfs_space_reservation(root->fs_info, "transaction", (u64)trans, + trans->bytes_reserved, 0); btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); trans->bytes_reserved = 0; } @@ -4183,6 +4207,8 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, * when we are truly done with the orphan item. 
*/ u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); + trace_btrfs_space_reservation(root->fs_info, "orphan", + btrfs_ino(inode), num_bytes, 1); return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); } @@ -4190,6 +4216,8 @@ void btrfs_orphan_release_metadata(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); + trace_btrfs_space_reservation(root->fs_info, "orphan", + btrfs_ino(inode), num_bytes, 0); btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); } @@ -4370,8 +4398,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) if (dropped) to_free += btrfs_calc_trans_metadata_size(root, dropped); - if (to_free) + if (to_free) { btrfs_block_rsv_release(root, block_rsv, to_free); + trace_btrfs_space_reservation(root->fs_info, + "delalloc", + btrfs_ino(inode), + to_free, 0); + } return ret; } @@ -4383,6 +4416,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) BTRFS_I(inode)->reserved_extents += nr_extents; spin_unlock(&BTRFS_I(inode)->lock); + if (to_reserve) + trace_btrfs_space_reservation(root->fs_info,"delalloc", + btrfs_ino(inode), to_reserve, 1); block_rsv_add_bytes(block_rsv, to_reserve, 1); return 0; @@ -4412,6 +4448,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), to_free, 0); btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, to_free); } @@ -4666,7 +4704,10 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, cache->reserved += num_bytes; space_info->bytes_reserved += num_bytes; if (reserve == RESERVE_ALLOC) { - BUG_ON(space_info->bytes_may_use < num_bytes); + trace_btrfs_space_reservation(cache->fs_info, + "space_info", + (u64)space_info, + num_bytes, 0); space_info->bytes_may_use -= 
num_bytes; } } @@ -6126,10 +6167,11 @@ use_block_rsv(struct btrfs_trans_handle *trans, return ERR_PTR(-ENOSPC); } -static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize) +static void unuse_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, u32 blocksize) { block_rsv_add_bytes(block_rsv, blocksize, 0); - block_rsv_release_bytes(block_rsv, NULL, 0); + block_rsv_release_bytes(fs_info, block_rsv, NULL, 0); } /* @@ -6159,7 +6201,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, empty_size, hint, (u64)-1, &ins, 0); if (ret) { - unuse_block_rsv(block_rsv, blocksize); + unuse_block_rsv(root->fs_info, block_rsv, blocksize); return ERR_PTR(ret); } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index f8962a957d65..213ffa86ce1b 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -438,6 +438,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root, trans->bytes_reserved); if (ret) goto out; + trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans, + trans->bytes_reserved, 1); again: inode = lookup_free_ino_inode(root, path); if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { @@ -498,6 +500,8 @@ again: out_put: iput(inode); out_release: + trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans, + trans->bytes_reserved, 0); btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); out: trans->block_rsv = rsv; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d5f987b49d70..287a6728b1ad 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -326,6 +326,8 @@ again: } if (num_bytes) { + trace_btrfs_space_reservation(root->fs_info, "transaction", + (u64)h, num_bytes, 1); h->block_rsv = &root->fs_info->trans_block_rsv; h->bytes_reserved = num_bytes; } diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 1750c0e6660c..84f3001a568d 
100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -55,6 +55,8 @@ struct extent_buffer; { BTRFS_BLOCK_GROUP_DUP, "DUP"}, \ { BTRFS_BLOCK_GROUP_RAID10, "RAID10"} +#define BTRFS_UUID_SIZE 16 + TRACE_EVENT(btrfs_transaction_commit, TP_PROTO(struct btrfs_root *root), @@ -632,6 +634,34 @@ TRACE_EVENT(btrfs_cow_block, __entry->cow_level) ); +TRACE_EVENT(btrfs_space_reservation, + + TP_PROTO(struct btrfs_fs_info *fs_info, char *type, u64 val, + u64 bytes, int reserve), + + TP_ARGS(fs_info, type, val, bytes, reserve), + + TP_STRUCT__entry( + __array( u8, fsid, BTRFS_UUID_SIZE ) + __string( type, type ) + __field( u64, val ) + __field( u64, bytes ) + __field( int, reserve ) + ), + + TP_fast_assign( + memcpy(__entry->fsid, fs_info->fsid, BTRFS_UUID_SIZE); + __assign_str(type, type); + __entry->val = val; + __entry->bytes = bytes; + __entry->reserve = reserve; + ), + + TP_printk("%pU: %s: %Lu %s %Lu", __entry->fsid, __get_str(type), + __entry->val, __entry->reserve ? "reserve" : "release", + __entry->bytes) +); + DECLARE_EVENT_CLASS(btrfs__reserved_extent, TP_PROTO(struct btrfs_root *root, u64 start, u64 len), -- cgit v1.2.3