3.7.5.5. pick_next_task_fair选择下一个被调度的进程
每个调度器类sched_class都必须提供一个pick_next_task函数用以在就绪队列中选择一个最优的进程来等待调度,而我们的cfs调度器类中,选择下一个将要运行的 进程由pick_next_task_fair函数来完成
之前我们在讲主调度器的时候,主调度器schedule函数在进程调度抢占时,会通过__schedule函数调用全局pick_next_task选择一个最优的进程,在pick_next_task 中我们就按照优先级依次调用不同调度器类提供的pick_next_task方法
pick_next_task_fair
选择下一个将要运行的进程pick_next_task_fair执行,其代码执行流程如下,对于此函数的讲解我们从simple标签开始,这个常规状态下的pick_next思路,简单来说就是 pick_next_task_fair函数的框架
again:
控制循坏来读取最优进程
#ifdef CONFIG_FAIR_GROUP_SCHED
完成组调度下的pick_next选择
返回被选择的调度实体指针
#endif
simple:
最基础的pick_next函数
返回被选择的调度实体指针
idle:
如果系统中没有可运行的进程,则需要调度idle进程
可见我们会发现:
simple标签时cfs中最基础的pick_next操作
idle则使得在没有进程被调度时,调度idle进程
again标签用于循环的进行pick_next操作
static struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;
struct task_struct *p;
int new_tasks;
again:
if (!sched_fair_runnable(rq)) //如果nr_running计数器为0,当前队列上没有可运行进程,则需要调度idle进程
goto idle;
#ifdef CONFIG_FAIR_GROUP_SCHED
if (!prev || prev->sched_class != &fair_sched_class) //如果当前运行的进程prev不是被fair调度的普通非实时进程
goto simple;
/*
* Because of the set_next_buddy() in dequeue_task_fair() it is rather
* likely that a next task is from the same cgroup as the current.
*
* Therefore attempt to avoid putting and setting the entire cgroup
* hierarchy, only change the part that actually changes.
*/
do {
struct sched_entity *curr = cfs_rq->curr;
/*
* Since we got here without doing put_prev_entity() we also
* have to consider cfs_rq->curr. If it is still a runnable
* entity, update_curr() will update its vruntime, otherwise
* forget we've ever seen it.
*/
if (curr) { //如果当前进程在curr在队列上,则需要更新其统计量和虚拟运行时间,否则设置curr为空
if (curr->on_rq)
update_curr(cfs_rq);
else
curr = NULL;
/*
* This call to check_cfs_rq_runtime() will do the
* throttle and dequeue its entity in the parent(s).
* Therefore the nr_running test will indeed
* be correct.
*/
if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
cfs_rq = &rq->cfs;
if (!cfs_rq->nr_running)
goto idle;
goto simple;
}
}
se = pick_next_entity(cfs_rq, curr); //选择一个最优的调度实体
cfs_rq = group_cfs_rq(se);
} while (cfs_rq); //如果调度的进程仍属于当前组,那么选取下一个可能被调度的任务,以保证子组件调度的公平性
p = task_of(se); //获取调度实体se的进程实体信息
/*
* Since we haven't yet done put_prev_entity and if the selected task
* is a different task than we started out with, try and touch the
* least amount of cfs_rqs.
*/
if (prev != p) {
struct sched_entity *pse = &prev->se;
while (!(cfs_rq = is_same_group(se, pse))) {
int se_depth = se->depth;
int pse_depth = pse->depth;
if (se_depth <= pse_depth) {
put_prev_entity(cfs_rq_of(pse), pse);
pse = parent_entity(pse);
}
if (se_depth >= pse_depth) {
set_next_entity(cfs_rq_of(se), se);
se = parent_entity(se);
}
}
put_prev_entity(cfs_rq, pse);
set_next_entity(cfs_rq, se);
}
goto done;
simple:
#endif
if (prev)
put_prev_task(rq, prev); //将当前进程放入运行队列的合适位置
do {
se = pick_next_entity(cfs_rq, NULL); //选出下一可执行的调度实体(进程)
set_next_entity(cfs_rq, se); //把选出的进程从红黑树中移除,更新红黑树,会调用__dequeue_entity完成此工作
cfs_rq = group_cfs_rq(se); //在非组调度的情况下返回NULL
} while (cfs_rq);
p = task_of(se); //获取到调度实体指代的进程信息
done: __maybe_unused;
#ifdef CONFIG_SMP
/*
* Move the next running task to the front of
* the list, so our cfs_tasks list becomes MRU
* one.
*/
list_move(&p->se.group_node, &rq->cfs_tasks);
#endif
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
update_misfit_status(p, rq);
return p;
idle:
if (!rf)
return NULL;
new_tasks = newidle_balance(rq, rf);
/*
* Because newidle_balance() releases (and re-acquires) rq->lock, it is
* possible for any higher priority task to appear. In that case we
* must re-start the pick_next_entity() loop.
*/
if (new_tasks < 0)
return RETRY_TASK;
if (new_tasks > 0)
goto again;
/*
* rq is about to be idle, check if we need to update the
* lost_idle_time of clock_pelt
*/
update_idle_rq_clock_pelt(rq);
return NULL;
}
3.7.5.5.1. put_prev_task函数
3.7.5.5.1.1. 全局put_prev_task函数
put_prev_task是用来将前一个进程prev放回到就绪队列中,这是一个全局函数,而每个调度器也必须实现一个自己的put_prev_task函数
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
prev->sched_class->put_prev_task(rq, prev);
}
3.7.5.5.1.2. CFS的put_prev_task_fair函数
static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
{
struct sched_entity *se = &prev->se;
struct cfs_rq *cfs_rq;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
put_prev_entity(cfs_rq, se);
}
}
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
/*
* If still on the runqueue then deactivate_task()
* was not called and update_curr() has to be done:
*/
if (prev->on_rq)
update_curr(cfs_rq);
/* throttle cfs_rqs exceeding runtime */
check_cfs_rq_runtime(cfs_rq);
check_spread(cfs_rq, prev);
if (prev->on_rq) {
update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
update_load_avg(cfs_rq, prev, 0);
}
cfs_rq->curr = NULL;
}
3.7.5.5.2. pick_next_entity函数
//1. 首先要确保任务组之间的公平这也是设置组的原因之一
//2. 选择下一个合适的(优先级比较高的)进程,因为它确实需要马上运行
//3. 如果没有条件2中进程,那么为了良好的局部性,选择上一次执行的进程
//4. 只要有任务存在就不要让CPU空转,只有在没有进程的情况下才会让CPU运行idle进程
static struct sched_entity *
pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
struct sched_entity *left = __pick_first_entity(cfs_rq); //选取红黑树最左边的进程
struct sched_entity *se;
/*
* If curr is set we have to see if its left of the leftmost entity
* still in the tree, provided there was anything in the tree at all.
*/
//如果left==null或者curr!=null 并且curr比left进程更优(即curr的虚拟运行时间更小)
//说明curr进程是自动放弃与运行权力,且其比最左进程更优
if (!left || (curr && entity_before(curr, left))
left = curr;
se = left; /* ideally we run the leftmost entity */ //存储了cfs_rq队列中最优的那个进程
/*
* Avoid running the skip buddy, if running something else can
* be done without getting too unfair.
*/
if (cfs_rq->skip == se) { //如果skip存储了需要跳过不参与调度的进程调度实体,那么我们需要选择次优的调度实体来进行调度
struct sched_entity *second;
if (se == curr) {
second = __pick_first_entity(cfs_rq); //se == curr == skip选择最左的那个调度实体left
} else {
second = __pick_next_entity(se); //选择红黑树上第二左的进程结点
if (!second || (curr && entity_before(curr, second))) //如果没有次优进程或者curr比second进程更优,则选择curr
second = curr;
}
if (second && wakeup_preempt_entity(second, left) < 1) //判断left和second的vruntime差距是否小于sysctl_sched_wakeup_granularity,即如果second能抢占left
se = second;
}
/*
* Prefer last buddy, try to return the CPU to a preempted task.
*/
if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
se = cfs_rq->last;
/*
* Someone really wants this to run. If it's not unfair, run it.
*/
if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
se = cfs_rq->next;
//清楚掉这个指针
clear_buddies(cfs_rq, se);
return se;
}
3.7.5.5.2.1. cfs的last和next指针域
在pick_next_entity的最后,要把红黑树最左下角的进程和另外两个进(next和last)做比较,next是抢占失败的进程,而last则是抢占成功后被抢占的进程,这三个进程到底哪一个是最优的next进程呢
linux CFS实现的判决条件是:
尽可能满足需要刚被唤醒的进程抢占其他进程的需求
尽可能减少以上这种抢占带来的缓存刷新的影响
cfs_rq的last和next指针,last表示最后一个执行wakeup的sched_entity, next表示最后一个被wakeup的sched_entity.他们在进程wakeup的时候会赋值,在pick新sched_entity的时候,会优先选择这些last或者next指针的sched_entity,有利于提高缓存的命中率
因此我们优选出来的进程必须同last和next指针域进行对比,其实就是检查就绪队列中的最优进程,即红黑树最左边节点last是否可以抢占last和next指针域,检查是否可以抢占是通过wake_up_preempt_entity函数来完成的
3.7.5.5.2.2. wakeup_preempt_entity检查是否可以被抢占
/*
* Should 'se' preempt 'curr'.
*
* |s1
* |s2
* |s3
* g
* |<--->|c
*
* w(c, s1) = -1
* w(c, s2) = 0
* w(c, s3) = 1
*
*/
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
{
s64 gran, vdiff = curr->vruntime - se->vruntime; //curr和se的vruntime的差值
if (vdiff <= 0) //如果curr的vruntime比较小,说明curr更加需要补偿,即se无法抢占curr
return -1;
gran = wakeup_gran(se); //计算curr的最小抢占期限粒度,当差值大于最小粒度时才抢占,这样可以避免频繁抢占
if (vdiff > gran)
return 1;
return 0;
}
static unsigned long wakeup_gran(struct sched_entity *se)
{
unsigned long gran = sysctl_sched_wakeup_granularity; //NICE_0_LOAD的基准最小期限
/*
* Since its curr running now, convert the gran from real-time
* to virtual-time in his units.
*
* By using 'se' instead of 'curr' we penalize light tasks, so
* they get preempted easier. That is, if 'se' < 'curr' then
* the resulting gran will be larger, therefore penalizing the
* lighter, if otoh 'se' > 'curr' then the resulting gran will
* be smaller, again penalizing the lighter task.
*
* This is especially important for buddies when the leftmost
* task is higher priority than the buddy.
*/
return calc_delta_fair(gran, se); //计算抢占的粒度
}
到底能不能选择last和next两个进程,则时wakeup_preempt_entity函数决定的,图解如下
