sched: automated per-uid task group

Android is a privilege-separated operating system, in which each
application runs with a distinct system identity: the Linux user ID (uid).

Automatically create CFS task groups per UID. Each Android application runs
with its own distinct UID, so grouping tasks by UID and putting them in the
same task group gives each application a fair share of CPU time.

Testcase:
 - run 4 cpu hogs in the background as user app_35 (com.android.email in my
   case)
 - run the Quadrant benchmark
   (http://www.aurorasoftworks.com/products/quadrant) as a different user
   and measure the result with and without the patch

 # su - app_35
 $ for i in `seq 4`; do yes >/dev/null & done

Without the patch (output of top):

  PID  PPID USER     STAT   VSZ %MEM CPU %CPU COMMAND
 6533   123 10070    R     202m 48.6   0 20.0 com.aurorasoftworks.quadrant.ui.st
 6506     1 10035    R     1128  0.2   0 20.0 yes
 6507     1 10035    R     1128  0.2   0 20.0 yes
 6508     1 10035    R     1128  0.2   0 20.0 yes
 6509     1 10035    R     1128  0.2   0 20.0 yes

 Benchmark result: 676

 uid 10035 (cpu hog)  : 60.0 % cpu quota
 uid 10070 (benchmark): 20.0 % cpu quota

With automated per-uid task group (output of top):

  PID  PPID USER     STAT   VSZ %MEM CPU %CPU COMMAND
 6784   123 10070    S     209m 51.4   0 50.0 com.aurorasoftworks.quadrant.ui.st
 6852     1 10035    R     1128  0.2   0 12.5 yes
 6853     1 10035    R     1128  0.2   0 12.5 yes
 6854     1 10035    R     1128  0.2   0 12.5 yes
 6855     1 10035    R     1128  0.2   0 12.5 yes

 Benchmark result: 816

 uid 10035 (cpu hog)  : 45.9 % cpu quota
 uid 10070 (benchmark): 46.0 % cpu quota

Total speedup: ~1.2 (the benchmark is about 20% faster in this case)

Based on the patch "sched: automated per tty task groups" by Mike Galbraith.
Signed-off-by: Andrea Righi
Signed-off-by: Mike Galbraith
---
 fs/proc/base.c           |   36 +++++++++
 include/linux/sched.h    |   12 +++
 kernel/fork.c            |    2 +
 kernel/sched.c           |   14 +++-
 kernel/sched_autogroup.c |  196 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sys.c             |    2 +
 kernel/sysctl.c          |    9 ++
 7 files changed, 269 insertions(+), 2 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index cce9792..bf96f02 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1293,6 +1293,41 @@ static const struct file_operations proc_pid_sched_operations = {
 
 #endif
 
+static int sched_autogroup_show(struct seq_file *m, void *v)
+{
+	struct inode *inode = m->private;
+	struct task_struct *p;
+
+	p = get_proc_task(inode);
+	if (!p)
+		return -ESRCH;
+	proc_sched_autogroup_show_task(p, m);
+
+	put_task_struct(p);
+
+	return 0;
+}
+
+static int sched_autogroup_open(struct inode *inode, struct file *filp)
+{
+	int ret;
+
+	ret = single_open(filp, sched_autogroup_show, NULL);
+	if (!ret) {
+		struct seq_file *m = filp->private_data;
+
+		m->private = inode;
+	}
+	return ret;
+}
+
+static const struct file_operations proc_pid_sched_autogroup_operations = {
+	.open = sched_autogroup_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
 static ssize_t comm_write(struct file *file, const char __user *buf,
 				size_t count, loff_t *offset)
 {
@@ -2613,6 +2648,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_SCHED_DEBUG
 	REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
 #endif
+	REG("autogroup", S_IRUGO, proc_pid_sched_autogroup_operations),
 	REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
 	INF("syscall", S_IRUSR, proc_pid_syscall),
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f3418e0..92bfcc6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -513,6 +513,8 @@ struct thread_group_cputimer {
 	spinlock_t lock;
 };
 
+struct autogroup;
+
 /*
  * NOTE! "signal_struct" does not have it's own
  * locking, because a shared signal_struct always
@@ -580,6 +582,7 @@ struct signal_struct {
 
 	struct tty_struct *tty; /* NULL if no tty */
 
+	struct autogroup *autogroup;
 	/*
 	 * Cumulative resource counters for dead threads in the group,
 	 * and for reaped dead child processes forked by this group.
@@ -1908,6 +1911,15 @@ int sched_rt_handler(struct ctl_table *table, int write,
 
 extern unsigned int sysctl_sched_compat_yield;
 
+extern unsigned int sysctl_sched_autogroup_enabled;
+
+extern void sched_autogroup_create_attach(struct task_struct *p);
+extern void sched_autogroup_detach(struct task_struct *p);
+extern void sched_autogroup_fork(struct signal_struct *sig);
+extern void sched_autogroup_exit(struct signal_struct *sig);
+extern void proc_sched_autogroup_show_task(struct task_struct *p,
+						struct seq_file *m);
+
 #ifdef CONFIG_RT_MUTEXES
 extern int rt_mutex_getprio(struct task_struct *p);
 extern void rt_mutex_setprio(struct task_struct *p, int prio);
diff --git a/kernel/fork.c b/kernel/fork.c
index 6ebbd77..c313dca 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -171,6 +171,7 @@ EXPORT_SYMBOL(free_task);
 static inline void free_signal_struct(struct signal_struct *sig)
 {
 	taskstats_tgid_free(sig);
+	sched_autogroup_exit(sig);
 	kmem_cache_free(signal_cachep, sig);
 }
 
@@ -916,6 +917,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	posix_cpu_timers_init_group(sig);
 
 	tty_audit_fork(sig);
+	sched_autogroup_fork(sig);
 
 	sig->oom_adj = current->signal->oom_adj;
 
diff --git a/kernel/sched.c b/kernel/sched.c
index e88556e..4602746 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -268,6 +268,8 @@ struct task_group {
 	struct task_group *parent;
 	struct list_head siblings;
 	struct list_head children;
+
+	struct autogroup *autogroup;
 };
 
 #define root_task_group init_task_group
@@ -601,6 +603,9 @@ static inline int cpu_of(struct rq *rq)
 
 #ifdef CONFIG_CGROUP_SCHED
 
+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg);
+
 /*
  * Return the group to which this tasks belongs.
  *
@@ -611,11 +616,14 @@ static inline int cpu_of(struct rq *rq)
  */
 static inline struct task_group *task_group(struct task_struct *p)
 {
+	struct task_group *tg;
 	struct cgroup_subsys_state *css;
 
 	css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
 			lockdep_is_held(&task_rq(p)->lock));
-	return container_of(css, struct task_group, css);
+	tg = container_of(css, struct task_group, css);
+
+	return autogroup_task_group(p, tg);
 }
 
 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -1886,6 +1894,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
 #include "sched_idletask.c"
 #include "sched_fair.c"
 #include "sched_rt.c"
+#include "sched_autogroup.c"
 #ifdef CONFIG_SCHED_DEBUG
 # include "sched_debug.c"
 #endif
@@ -7554,7 +7563,7 @@ void __init sched_init(void)
 #ifdef CONFIG_CGROUP_SCHED
 	list_add(&init_task_group.list, &task_groups);
 	INIT_LIST_HEAD(&init_task_group.children);
-
+	autogroup_init(&init_task);
 #endif /* CONFIG_CGROUP_SCHED */
 
 #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
@@ -8015,6 +8024,7 @@ static void free_sched_group(struct task_group *tg)
 {
 	free_fair_sched_group(tg);
 	free_rt_sched_group(tg);
+	autogroup_free(tg);
 	kfree(tg);
 }
 
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
new file mode 100644
index 0000000..ad1cd14
--- /dev/null
+++ b/kernel/sched_autogroup.c
@@ -0,0 +1,196 @@
+/*
+ * An autogroup pairs a reference count with the task group it owns.  Each
+ * signal_struct holds one reference through sig->autogroup.  When the last
+ * reference is dropped the task group is destroyed, and free_sched_group()
+ * releases the autogroup itself through autogroup_free().
+ */
+struct autogroup {
+	struct kref		kref;
+	struct task_group	*tg;
+	unsigned long		id;
+};
+
+unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
+
+static struct autogroup autogroup_default;
+static atomic_t autogroup_seq_nr;
+
+/* Bind the default autogroup to the root task group and to init_task. */
+static void __init autogroup_init(struct task_struct *init_task)
+{
+	autogroup_default.tg = &init_task_group;
+	root_task_group.autogroup = &autogroup_default;
+	kref_init(&autogroup_default.kref);
+	init_task->signal->autogroup = &autogroup_default;
+}
+
+static inline void autogroup_free(struct task_group *tg)
+{
+	kfree(tg->autogroup);
+}
+
+static inline void autogroup_destroy(struct kref *kref)
+{
+	struct autogroup *ag = container_of(kref, struct autogroup, kref);
+
+	sched_destroy_group(ag->tg);
+}
+
+static inline void autogroup_kref_put(struct autogroup *ag)
+{
+	kref_put(&ag->kref, autogroup_destroy);
+}
+
+static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
+{
+	kref_get(&ag->kref);
+	return ag;
+}
+
+/*
+ * Allocate an autogroup and a task group under the root group.  The result
+ * carries one reference owned by the caller.  On failure a rate-limited
+ * warning is logged and a reference to autogroup_default is returned
+ * instead, so the return value is never NULL.
+ */
+static inline struct autogroup *autogroup_create(void)
+{
+	struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
+	struct task_group *tg;
+
+	if (!ag)
+		goto out_fail;
+
+	tg = sched_create_group(&root_task_group);
+
+	if (IS_ERR(tg))
+		goto out_free;
+
+	kref_init(&ag->kref);
+	ag->id = atomic_inc_return(&autogroup_seq_nr);
+	ag->tg = tg;
+	tg->autogroup = ag;
+
+	return ag;
+
+out_free:
+	kfree(ag);
+out_fail:
+	if (printk_ratelimit()) {
+		printk(KERN_WARNING "autogroup_create: %s failure.\n",
+			ag ? "sched_create_group()" : "kmalloc()");
+	}
+
+	return autogroup_kref_get(&autogroup_default);
+}
+
+/*
+ * Substitute the task's autogroup task group for @tg, but only when the
+ * sysctl is enabled, @tg is the root group, the task is in the fair class
+ * and it is not exiting.  Otherwise @tg is returned unchanged.
+ */
+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg)
+{
+	int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+
+	enabled &= (tg == &root_task_group);
+	enabled &= (p->sched_class == &fair_sched_class);
+	enabled &= (!(p->flags & PF_EXITING));
+
+	if (enabled)
+		return p->signal->autogroup->tg;
+
+	return tg;
+}
+
+/*
+ * Switch p's signal_struct to @ag and move every thread of p, all under
+ * p's siglock, which is taken here.  The reference on the previous
+ * autogroup is dropped after the lock is released.
+ */
+static void
+autogroup_move_group(struct task_struct *p, struct autogroup *ag)
+{
+	struct autogroup *prev;
+	struct task_struct *t;
+	unsigned long flags;
+
+	if (unlikely(!lock_task_sighand(p, &flags))) {
+		WARN_ON(1);
+		return;
+	}
+
+	prev = p->signal->autogroup;
+	if (prev == ag) {
+		unlock_task_sighand(p, &flags);
+		return;
+	}
+
+	p->signal->autogroup = autogroup_kref_get(ag);
+
+	t = p;
+	do {
+		sched_move_task(t);
+	} while_each_thread(p, t);
+
+	unlock_task_sighand(p, &flags);
+	autogroup_kref_put(prev);
+}
+
+/* Allocates GFP_KERNEL, cannot be called under any spinlock */
+void sched_autogroup_create_attach(struct task_struct *p)
+{
+	struct autogroup *ag = autogroup_create();
+
+	autogroup_move_group(p, ag);
+	/* drop extra reference added by autogroup_create() */
+	autogroup_kref_put(ag);
+}
+EXPORT_SYMBOL(sched_autogroup_create_attach);
+
+/* Cannot be called under siglock.  Currently has no users */
+void sched_autogroup_detach(struct task_struct *p)
+{
+	autogroup_move_group(p, &autogroup_default);
+}
+EXPORT_SYMBOL(sched_autogroup_detach);
+
+/*
+ * Return p's autogroup with a reference held, or autogroup_default (also
+ * referenced) when p's sighand cannot be locked.
+ */
+static struct autogroup *autogroup_task_get(struct task_struct *p)
+{
+	struct autogroup *ag;
+	unsigned long flags;
+
+	if (!lock_task_sighand(p, &flags))
+		return autogroup_kref_get(&autogroup_default);
+
+	ag = autogroup_kref_get(p->signal->autogroup);
+	unlock_task_sighand(p, &flags);
+
+	return ag;
+}
+
+/* A new signal_struct takes its own reference to current's autogroup. */
+void sched_autogroup_fork(struct signal_struct *sig)
+{
+	sig->autogroup = autogroup_task_get(current);
+}
+
+/* Drop the signal_struct's autogroup reference. */
+void sched_autogroup_exit(struct signal_struct *sig)
+{
+	autogroup_kref_put(sig->autogroup);
+}
+
+/* Emit "/autogroup-<id>" for p; id is unsigned long, hence %lu. */
+void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
+{
+	struct autogroup *ag = autogroup_task_get(p);
+
+	seq_printf(m, "/autogroup-%lu\n", ag->id);
+	autogroup_kref_put(ag);
+}
diff --git a/kernel/sys.c b/kernel/sys.c
index f4fcc6e..20392ce 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -573,6 +573,8 @@ static int set_user(struct cred *new)
 
 	free_uid(new->user);
 	new->user = new_user;
+	sched_autogroup_create_attach(current);
+
 	return 0;
 }
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0d95388..8ac6b7a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -383,6 +383,15 @@ static struct ctl_table kern_table[] = {
 		.mode = 0644,
 		.proc_handler = proc_dointvec,
 	},
+	{
+		.procname = "sched_autogroup_enabled",
+		.data = &sysctl_sched_autogroup_enabled,
+		.maxlen = sizeof(unsigned int),
+		.mode = 0644,
+		.proc_handler = proc_dointvec_minmax,
+		.extra1 = &zero,
+		.extra2 = &one,
+	},
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.procname = "prove_locking",