diff --git a/include/linux/sched.h b/include/linux/sched.h index 1c4f3e9b9bc5..984c3b295978 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1527,6 +1527,7 @@ struct task_struct { */ int nr_dirtied; int nr_dirtied_pause; + unsigned long dirty_paused_when; /* start of a write-and-pause period */ #ifdef CONFIG_LATENCYTOP int latency_record_count; diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 99d1d0decf88..8588a8918023 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -300,12 +300,13 @@ TRACE_EVENT(balance_dirty_pages, unsigned long dirty_ratelimit, unsigned long task_ratelimit, unsigned long dirtied, + unsigned long period, long pause, unsigned long start_time), TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty, dirty_ratelimit, task_ratelimit, - dirtied, pause, start_time), + dirtied, period, pause, start_time), TP_STRUCT__entry( __array( char, bdi, 32) @@ -320,6 +321,8 @@ TRACE_EVENT(balance_dirty_pages, __field(unsigned int, dirtied_pause) __field(unsigned long, paused) __field( long, pause) + __field(unsigned long, period) + __field( long, think) ), TP_fast_assign( @@ -336,6 +339,9 @@ TRACE_EVENT(balance_dirty_pages, __entry->task_ratelimit = KBps(task_ratelimit); __entry->dirtied = dirtied; __entry->dirtied_pause = current->nr_dirtied_pause; + __entry->think = current->dirty_paused_when == 0 ? 0 : + (long)(jiffies - current->dirty_paused_when) * 1000/HZ; + __entry->period = period * 1000 / HZ; __entry->pause = pause * 1000 / HZ; __entry->paused = (jiffies - start_time) * 1000 / HZ; ), @@ -346,7 +352,7 @@ TRACE_EVENT(balance_dirty_pages, "bdi_setpoint=%lu bdi_dirty=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "dirtied=%u dirtied_pause=%u " - "paused=%lu pause=%ld", + "paused=%lu pause=%ld period=%lu think=%ld", __entry->bdi, __entry->limit, __entry->setpoint, @@ -358,7 +364,9 @@ TRACE_EVENT(balance_dirty_pages, __entry->dirtied, __entry->dirtied_pause, __entry->paused, /* ms */ - __entry->pause /* ms */ + __entry->pause, /* ms */ + __entry->period, /* ms */ + __entry->think /* ms */ ) ); diff --git a/kernel/fork.c b/kernel/fork.c index da4a6a10d088..f8668cf6a32d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1296,6 +1296,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->nr_dirtied = 0; p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); + p->dirty_paused_when = 0; /* * Ok, make it visible to the rest of the system. diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 96b3e7aa705c..491932155825 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1016,6 +1016,7 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; + long period; long pause = 0; long uninitialized_var(max_pause); bool dirty_exceeded = false; @@ -1026,6 +1027,8 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long start_time = jiffies; for (;;) { + unsigned long now = jiffies; + /* * Unstable writes are a feature of certain networked * filesystems (i.e. NFS) in which data may have been @@ -1045,8 +1048,11 @@ static void balance_dirty_pages(struct address_space *mapping, */ freerun = dirty_freerun_ceiling(dirty_thresh, background_thresh); - if (nr_dirty <= freerun) + if (nr_dirty <= freerun) { + current->dirty_paused_when = now; + current->nr_dirtied = 0; break; + } if (unlikely(!writeback_in_progress(bdi))) bdi_start_background_writeback(bdi); @@ -1104,10 +1110,21 @@ static void balance_dirty_pages(struct address_space *mapping, task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >> RATELIMIT_CALC_SHIFT; if (unlikely(task_ratelimit == 0)) { + period = max_pause; pause = max_pause; goto pause; } - pause = HZ * pages_dirtied / task_ratelimit; + period = HZ * pages_dirtied / task_ratelimit; + pause = period; + if (current->dirty_paused_when) + pause -= now - current->dirty_paused_when; + /* + * For less than 1s think time (ext3/4 may block the dirtier + * for up to 800ms from time to time on 1-HDD; so does xfs, + * however at much less frequency), try to compensate it in + * future periods by updating the virtual time; otherwise just + * do a reset, as it may be a light dirtier. + */ if (unlikely(pause <= 0)) { trace_balance_dirty_pages(bdi, dirty_thresh, @@ -1118,8 +1135,16 @@ static void balance_dirty_pages(struct address_space *mapping, dirty_ratelimit, task_ratelimit, pages_dirtied, + period, pause, start_time); + if (pause < -HZ) { + current->dirty_paused_when = now; + current->nr_dirtied = 0; + } else if (period) { + current->dirty_paused_when += period; + current->nr_dirtied = 0; + } pause = 1; /* avoid resetting nr_dirtied_pause below */ break; } @@ -1135,11 +1160,15 @@ pause: dirty_ratelimit, task_ratelimit, pages_dirtied, + period, pause, start_time); __set_current_state(TASK_KILLABLE); io_schedule_timeout(pause); + current->dirty_paused_when = now + pause; + current->nr_dirtied = 0; + /* * This is typically equal to (nr_dirty < dirty_thresh) and can * also keep "1000+ dd on a slow USB stick" under control. @@ -1167,11 +1196,10 @@ pause: if (!dirty_exceeded && bdi->dirty_exceeded) bdi->dirty_exceeded = 0; - current->nr_dirtied = 0; if (pause == 0) { /* in freerun area */ current->nr_dirtied_pause = dirty_poll_interval(nr_dirty, dirty_thresh); - } else if (pause <= max_pause / 4 && + } else if (period <= max_pause / 4 && pages_dirtied >= current->nr_dirtied_pause) { current->nr_dirtied_pause = clamp_val( dirty_ratelimit * (max_pause / 2) / HZ,