3.7.4.3. The Linux Process Scheduler: The Main Scheduler
The kernel provides two schedulers: the main scheduler and the periodic scheduler. Together they form the core scheduler, also called the generic scheduler.
In many places in the kernel, whenever the CPU is to be handed to a process other than the currently active one, the main scheduler function schedule() is called directly. The function performs the following work:
3.7.4.3.1. The schedule function
asmlinkage __visible void __sched schedule(void)
{
        struct task_struct *tsk = current;      // get the current task

        sched_submit_work(tsk);                 // flush plugged I/O etc. to avoid deadlocks
        do {
                preempt_disable();              // disable kernel preemption
                __schedule(false);              // do the actual scheduling
                sched_preempt_enable_no_resched();  // re-enable preemption without rescheduling
        } while (need_resched());   // if TIF_NEED_RESCHED was set again for this task, reschedule
        sched_update_worker(tsk);
}
EXPORT_SYMBOL(schedule);
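For orientation, a hedged sketch of the canonical way kernel code reaches schedule() voluntarily; the wake-up condition here is hypothetical, and real code would normally use a waitqueue helper instead:

#include <linux/sched.h>

static bool demo_condition;     // hypothetical, set by some waker

// Sketch: the classic voluntary-sleep loop around schedule().
static void demo_wait_for_event(void)
{
        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE); // announce intent to sleep
                if (demo_condition)                    // hypothetical wake condition
                        break;
                schedule();                            // give up the CPU until woken
        }
        __set_current_state(TASK_RUNNING);             // condition met: keep running
}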
Note
#define __sched __attribute__((__section__(".sched.text")))
__attribute__((__section__("..."))) is a GCC attribute; its purpose is to place the compiled code of the annotated functions into a specific section of the object file.
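A minimal user-space sketch of the same section attribute (the function and section names are made up; this is not kernel code):

#include <stdio.h>

#define __demo_sched __attribute__((__section__(".demo.sched.text")))

static void __demo_sched demo_wait(void)   // placed into .demo.sched.text
{
        puts("waiting");
}

int main(void)
{
        demo_wait();   // "objdump -t" on the binary shows demo_wait in .demo.sched.text
        return 0;
}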
static inline void sched_submit_work(struct task_struct *tsk)
{
        if (!tsk->state)        // tsk->state == 0 means TASK_RUNNING: nothing to submit
                return;

        /*
         * If a worker went to sleep, notify and ask workqueue whether
         * it wants to wake up a task to maintain concurrency.
         * As this function is called inside the schedule() context,
         * we disable preemption to avoid it calling schedule() again
         * in the possible wakeup of a kworker.
         */
        if (tsk->flags & PF_WQ_WORKER) {
                preempt_disable();
                wq_worker_sleeping(tsk);
                preempt_enable_no_resched();
        }

        if (tsk_is_pi_blocked(tsk))     // blocked on an rt_mutex (priority inheritance): bail out
                return;

        /*
         * If we are going to sleep and we have plugged IO queued,
         * make sure to submit it to avoid deadlocks.
         */
        if (blk_needs_flush_plug(tsk))  // flush plugged block I/O to avoid deadlocks
                blk_schedule_flush_plug(tsk);
}
Kernel preemption
Besides kernel mode, Linux also has user mode. User-program context belongs to user mode; system-call and interrupt-handler context belongs to kernel mode. If a process is preempted by another process while running in user mode, that is user preemption; if the process has entered kernel mode and is preempted there, we say kernel preemption has occurred.
The defining property of a preemptible kernel is that a process running in kernel mode can be replaced by another process precisely while it is executing a kernel function.
To support kernel preemption the kernel provides a number of mechanisms and structures, and disabling and re-enabling preemption at the right points is essential. These functions are defined in include/linux/preempt.h:
#define preempt_disable() \
do { \
        preempt_count_inc(); \
        barrier(); \
} while (0)

#define sched_preempt_enable_no_resched() \
do { \
        barrier(); \
        preempt_count_dec(); \
} while (0)

#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
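A hedged sketch of a typical use: disabling preemption around a short per-CPU operation so that the task cannot migrate to another CPU halfway through (the counter and function are hypothetical):

#include <linux/percpu.h>
#include <linux/preempt.h>

// Sketch: per-CPU data must not be touched with preemption enabled,
// or the task could migrate between reading and writing.
static DEFINE_PER_CPU(unsigned long, demo_counter);     // hypothetical counter

static void demo_percpu_inc(void)
{
        preempt_disable();              // pin the task to this CPU
        __this_cpu_inc(demo_counter);   // safe: no preemption, no migration
        preempt_enable();               // may reschedule if a resched is pending
}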
3.7.4.3.2. __schedule: the real scheduling work
__schedule does the actual scheduling work; it is defined in kernel/sched/core.c:
static void __sched notrace __schedule(bool preempt)
{
        struct task_struct *prev, *next;
        unsigned long *switch_count;
        struct rq_flags rf;
        struct rq *rq;
        int cpu;

        // find the runqueue rq of the current CPU and save the running task into prev
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
        prev = rq->curr;

        // calling cond_resched() with kernel preemption disabled is a bug;
        // schedule_debug() catches that kind of error
        schedule_debug(prev, preempt);

        if (sched_feat(HRTICK))
                hrtick_clear(rq);

        local_irq_disable();                    // disable local interrupts
        rcu_note_context_switch(preempt);       // update global state: this CPU is context switching

        /*
         * Make sure that signal_pending_state()->signal_pending() below
         * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
         * done by the caller to avoid the race with signal_wake_up().
         *
         * The membarrier system call requires a full memory barrier
         * after coming from user-space, before storing to rq->curr.
         */
        rq_lock(rq, &rf);
        smp_mb__after_spinlock();

        /* Promote REQ to ACT */
        rq->clock_update_flags <<= 1;
        update_rq_clock(rq);

        switch_count = &prev->nivcsw;   // by default, count this as an involuntary switch
        // check prev's state and the preemption flag: prev is not runnable
        // and this is not a kernel preemption
        if (!preempt && prev->state) {
                // if prev has a pending (non-blocked) signal and its state is
                // TASK_INTERRUPTIBLE, set it back to TASK_RUNNING instead of sleeping
                if (signal_pending_state(prev->state, prev)) {
                        prev->state = TASK_RUNNING;
                } else {
                        deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);  // remove prev from the runqueue

                        if (prev->in_iowait) {
                                atomic_inc(&rq->nr_iowait);
                                delayacct_blkio_start();
                        }
                }
                switch_count = &prev->nvcsw;    // voluntary switch: use the nvcsw counter
        }

        next = pick_next_task(rq, prev, &rf);   // pick the highest-priority runnable task
        clear_tsk_need_resched(prev);           // clear prev's TIF_NEED_RESCHED flag
        clear_preempt_need_resched();           // clear the preempt-need-resched flag

        if (likely(prev != next)) {     // next and prev are different tasks: a real switch
                rq->nr_switches++;      // update the runqueue's total switch count
                /*
                 * RCU users of rcu_dereference(rq->curr) may not see
                 * changes to task_struct made by pick_next_task().
                 */
                RCU_INIT_POINTER(rq->curr, next);   // mark next as the runqueue's current task
                /*
                 * The membarrier system call requires each architecture
                 * to have a full memory barrier after updating
                 * rq->curr, before returning to user-space.
                 *
                 * Here are the schemes providing that barrier on the
                 * various architectures:
                 * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
                 *   switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
                 * - finish_lock_switch() for weakly-ordered
                 *   architectures where spin_unlock is a full barrier,
                 * - switch_to() for arm64 (weakly-ordered, spin_unlock
                 *   is a RELEASE barrier),
                 */
                ++*switch_count;        // increment the task's context-switch counter

                trace_sched_switch(preempt, prev, next);

                /* Also unlocks the rq: */
                rq = context_switch(rq, prev, next, &rf);   // switch context between the two tasks
        } else {
                rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
                rq_unlock_irq(rq, &rf);
        }

        balance_callback(rq);
}
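As a hedged aside: long-running kernel loops cooperate with this machinery through cond_resched(), which checks the TIF_NEED_RESCHED flag cleared above and invokes the main scheduler when it is set. A minimal sketch with a hypothetical item type and list:

#include <linux/list.h>
#include <linux/sched.h>

struct demo_item {                      // hypothetical item type
        struct list_head node;
};

// Sketch: a long kernel-side loop yielding voluntarily.
static void demo_walk(struct list_head *demo_list)
{
        struct demo_item *item;

        list_for_each_entry(item, demo_list, node) {
                /* ... hypothetical per-item work ... */
                cond_resched();         // reschedule if TIF_NEED_RESCHED is set
        }
}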
3.7.4.3.2.1. pick_next_task: selecting the next process
The kernel selects the most suitable process from the CPU's runqueue to take over the CPU:
next = pick_next_task(rq, prev, &rf);
pick_next_task walks the scheduler classes in priority order, calling each class's pick_next_task hook to find the best process. In most systems, however, almost everything runnable is a normal non-real-time task scheduled by CFS, so the kernel applies an optimization. The flow is:
If every process on the current CPU is a normal non-real-time task scheduled by CFS, pick directly through CFS; if nothing is runnable there, schedule the idle task
Otherwise, walk the scheduler classes starting from the one with the highest priority
/*
 * Pick up the highest-prio task:
 */
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
        const struct sched_class *class;
        struct task_struct *p;

        /*
         * Optimization: we know that if all tasks are in the fair class we can
         * call that function directly, but only if the @prev task wasn't of a
         * higher scheduling class, because otherwise those lose the
         * opportunity to pull in more work from other CPUs.
         */
        // fast path: every runnable task is handled by CFS, no real-time tasks.
        // rq->nr_running == rq->cfs.h_nr_running means the runqueue holds exactly
        // the tasks of the CFS runqueue, i.e. only normal non-real-time tasks
        if (likely((prev->sched_class == &idle_sched_class ||
                    prev->sched_class == &fair_sched_class) &&
                   rq->nr_running == rq->cfs.h_nr_running)) {

                // let CFS's own pick_next_task find the best task
                p = fair_sched_class.pick_next_task(rq, prev, rf);
                if (unlikely(p == RETRY_TASK))
                        goto restart;

                /* Assumes fair_sched_class->next == idle_sched_class */
                if (unlikely(!p))       // nothing runnable: schedule the idle task
                        p = idle_sched_class.pick_next_task(rq, prev, rf);

                return p;
        }

restart:
#ifdef CONFIG_SMP
        /*
         * We must do the balancing pass before put_next_task(), such
         * that when we release the rq->lock the task is in the same
         * state as before we took rq->lock.
         *
         * We can terminate the balance pass as soon as we know there is
         * a runnable task of @class priority or higher.
         */
        for_class_range(class, prev->sched_class, &idle_sched_class) {
                if (class->balance(rq, prev, rf))
                        break;
        }
#endif

        put_prev_task(rq, prev);

        // the scheduler classes are linked together via their next fields;
        // they are walked in priority order: stop -> dl -> rt -> fair -> idle
        for_each_class(class) {
                p = class->pick_next_task(rq, NULL, NULL);
                if (p)
                        return p;
        }

        /* The idle class should always have a runnable task: */
        BUG();
}
"Make the common case fast" is a standard optimization principle in software development. The most common processes on a Linux system are non-real-time processes, and their scheduler is necessarily CFS.
likely is a macro around GCC's built-in __builtin_expect; it tells the compiler that the expression is very likely true, so the compiler can optimize the branch layout accordingly:
# ifndef likely
# define likely(x) (__builtin_expect(!!(x), 1))
# endif
# ifndef unlikely
# define unlikely(x) (__builtin_expect(!!(x), 0))
# endif
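As a small, self-contained illustration (plain user-space C, compilable with GCC; the function is made up), annotating a rare error path with unlikely lets the compiler keep the hot path as straight-line code:

#include <stdlib.h>

#define likely(x)       (__builtin_expect(!!(x), 1))
#define unlikely(x)     (__builtin_expect(!!(x), 0))

int *make_buffer(size_t n)              // hypothetical helper
{
        int *buf = malloc(n * sizeof(*buf));

        if (unlikely(!buf))             // rare error path, laid out out-of-line
                return NULL;
        buf[0] = 0;                     // hot path continues fall-through
        return buf;
}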
3.7.4.3.3. context_switch: process context switching
3.7.4.3.3.1. Process context switching
A context switch (sometimes called a process switch or task switch) is the CPU switching from one process or thread to another.
A context switch can be thought of as the kernel performing the following activities on the CPU:
Suspending one process and storing its CPU state (context) somewhere in memory
Retrieving the context of the next process from memory and restoring it in the CPU registers
Jumping back to the location the program counter points to (i.e. the line of code at which the process was interrupted) in order to resume that process
The context is therefore the contents of the CPU registers and the program counter at a given point in time; in a broader sense it also includes the process's virtual address mappings in memory.
Context switching can only take place in kernel mode. It is usually compute-intensive: each switch costs a considerable amount of processor time, and at tens or hundreds of switches per second this consumes a significant share of CPU time.
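The nvcsw/nivcsw counters that __schedule() updates through switch_count are exported to user space in /proc/<pid>/status; a hedged sketch that prints them for the current process:

#include <stdio.h>
#include <string.h>

// Sketch: voluntary_ctxt_switches / nonvoluntary_ctxt_switches in
// /proc/self/status mirror the task's nvcsw / nivcsw counters.
int main(void)
{
        char line[256];
        FILE *f = fopen("/proc/self/status", "r");

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f))
                if (strstr(line, "ctxt_switches"))
                        fputs(line, stdout);
        fclose(f);
        return 0;
}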
3.7.4.3.3.2. The context_switch flow
The context_switch function performs the actual process context switch; it is defined in kernel/sched/core.c:
/*
 * context_switch - switch to the new MM and the new thread's register state.
 */
static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
               struct task_struct *next, struct rq_flags *rf)
{
        prepare_task_switch(rq, prev, next);

        /*
         * For paravirt, this is coupled with an exit in switch_to to
         * combine the page table reload and the switch backend into
         * one hypercall.
         */
        arch_start_context_switch(prev);

        /*
         * kernel -> kernel   lazy + transfer active
         *   user -> kernel   lazy + mmgrab() active
         *
         * kernel ->   user   switch + mmdrop() active
         *   user ->   user   switch
         */
        if (!next->mm) {                                // to kernel
                enter_lazy_tlb(prev->active_mm, next);

                next->active_mm = prev->active_mm;
                if (prev->mm)                           // from user
                        mmgrab(prev->active_mm);
                else
                        prev->active_mm = NULL;
        } else {                                        // to user
                membarrier_switch_mm(rq, prev->active_mm, next->mm);
                /*
                 * sys_membarrier() requires an smp_mb() between setting
                 * rq->curr / membarrier_switch_mm() and returning to userspace.
                 *
                 * The below provides this either through switch_mm(), or in
                 * case 'prev->active_mm == next->mm' through
                 * finish_task_switch()'s mmdrop().
                 */
                switch_mm_irqs_off(prev->active_mm, next->mm, next);

                if (!prev->mm) {                        // from kernel
                        /* will mmdrop() in finish_task_switch(). */
                        rq->prev_mm = prev->active_mm;
                        prev->active_mm = NULL;
                }
        }

        rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);

        prepare_lock_switch(rq, next, rf);

        /* Here we just switch the register state and the stack. */
        switch_to(prev, next, prev);
        barrier();

        return finish_task_switch(prev);
}