pick_next_task_fair选择下一个被调度的进程
==========================================

每个调度器类sched_class都必须提供一个pick_next_task函数用以在就绪队列中选择一个最优的进程来等待调度,而我们的cfs调度器类中,选择下一个将要运行的
进程由pick_next_task_fair函数来完成

之前我们在讲主调度器的时候,主调度器schedule函数在进程调度抢占时,会通过__schedule函数调用全局pick_next_task选择一个最优的进程，在pick_next_task
中我们就按照优先级依次调用不同调度器类提供的pick_next_task方法

- pick_next_task_fair

选择下一个将要运行的进程pick_next_task_fair执行，其代码执行流程如下，对于此函数的讲解我们从simple标签开始,这个常规状态下的pick_next思路，简单来说就是
pick_next_task_fair函数的框架

::

    again:
        控制循坏来读取最优进程

    #ifdef CONFIG_FAIR_GROUP_SCHED
        完成组调度下的pick_next选择
        返回被选择的调度实体指针
    #endif

    simple:
        最基础的pick_next函数
        返回被选择的调度实体指针
    
    idle:
        如果系统中没有可运行的进程,则需要调度idle进程

可见我们会发现:

- simple标签时cfs中最基础的pick_next操作

- idle则使得在没有进程被调度时,调度idle进程

- again标签用于循环的进行pick_next操作

::

    static struct task_struct *
    pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
    {
        struct cfs_rq *cfs_rq = &rq->cfs;
        struct sched_entity *se;
        struct task_struct *p;
        int new_tasks;

    again:
        if (!sched_fair_runnable(rq))   //如果nr_running计数器为0,当前队列上没有可运行进程,则需要调度idle进程
            goto idle;

    #ifdef CONFIG_FAIR_GROUP_SCHED
        if (!prev || prev->sched_class != &fair_sched_class)    //如果当前运行的进程prev不是被fair调度的普通非实时进程
            goto simple;

        /*
         * Because of the set_next_buddy() in dequeue_task_fair() it is rather
         * likely that a next task is from the same cgroup as the current.
         *
         * Therefore attempt to avoid putting and setting the entire cgroup
         * hierarchy, only change the part that actually changes.
         */

        do {
            struct sched_entity *curr = cfs_rq->curr;

            /*
             * Since we got here without doing put_prev_entity() we also
             * have to consider cfs_rq->curr. If it is still a runnable
             * entity, update_curr() will update its vruntime, otherwise
             * forget we've ever seen it.
             */
            if (curr) {     //如果当前进程在curr在队列上,则需要更新其统计量和虚拟运行时间，否则设置curr为空
                if (curr->on_rq)
                    update_curr(cfs_rq);
                else
                    curr = NULL;

                /*
                 * This call to check_cfs_rq_runtime() will do the
                 * throttle and dequeue its entity in the parent(s).
                 * Therefore the nr_running test will indeed
                 * be correct.
                 */
                if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
                    cfs_rq = &rq->cfs;

                    if (!cfs_rq->nr_running)
                        goto idle;

                    goto simple;
                }
            }

            se = pick_next_entity(cfs_rq, curr);    //选择一个最优的调度实体
            cfs_rq = group_cfs_rq(se);
        } while (cfs_rq);   //如果调度的进程仍属于当前组,那么选取下一个可能被调度的任务,以保证子组件调度的公平性

        p = task_of(se);    //获取调度实体se的进程实体信息

        /*
         * Since we haven't yet done put_prev_entity and if the selected task
         * is a different task than we started out with, try and touch the
         * least amount of cfs_rqs.
         */
        if (prev != p) {
            struct sched_entity *pse = &prev->se;

            while (!(cfs_rq = is_same_group(se, pse))) {
                int se_depth = se->depth;
                int pse_depth = pse->depth;

                if (se_depth <= pse_depth) {
                    put_prev_entity(cfs_rq_of(pse), pse);
                    pse = parent_entity(pse);
                }
                if (se_depth >= pse_depth) {
                    set_next_entity(cfs_rq_of(se), se);
                    se = parent_entity(se);
                }
            }

            put_prev_entity(cfs_rq, pse);
            set_next_entity(cfs_rq, se);
        }

        goto done;
    simple:
    #endif
        if (prev)
            put_prev_task(rq, prev);    //将当前进程放入运行队列的合适位置

        do {
            se = pick_next_entity(cfs_rq, NULL);    //选出下一可执行的调度实体(进程)
            set_next_entity(cfs_rq, se);    //把选出的进程从红黑树中移除，更新红黑树,会调用__dequeue_entity完成此工作
            cfs_rq = group_cfs_rq(se);  //在非组调度的情况下返回NULL
        } while (cfs_rq);

        p = task_of(se);    //获取到调度实体指代的进程信息

    done: __maybe_unused;
    #ifdef CONFIG_SMP
        /*
         * Move the next running task to the front of
         * the list, so our cfs_tasks list becomes MRU
         * one.
         */
        list_move(&p->se.group_node, &rq->cfs_tasks);
    #endif

        if (hrtick_enabled(rq))
            hrtick_start_fair(rq, p);

        update_misfit_status(p, rq);

        return p;

    idle:
        if (!rf)
            return NULL;

        new_tasks = newidle_balance(rq, rf);

        /*
         * Because newidle_balance() releases (and re-acquires) rq->lock, it is
         * possible for any higher priority task to appear. In that case we
         * must re-start the pick_next_entity() loop.
         */
        if (new_tasks < 0)
            return RETRY_TASK;

        if (new_tasks > 0)
            goto again;

        /*
         * rq is about to be idle, check if we need to update the
         * lost_idle_time of clock_pelt
         */
        update_idle_rq_clock_pelt(rq);

        return NULL;
    }

put_prev_task函数
------------------

全局put_prev_task函数
^^^^^^^^^^^^^^^^^^^^^^

put_prev_task是用来将前一个进程prev放回到就绪队列中,这是一个全局函数，而每个调度器也必须实现一个自己的put_prev_task函数


::

    static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
    {
        prev->sched_class->put_prev_task(rq, prev);
    }

CFS的put_prev_task_fair函数
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

::

    static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
    {
        struct sched_entity *se = &prev->se;
        struct cfs_rq *cfs_rq;

        for_each_sched_entity(se) {
            cfs_rq = cfs_rq_of(se);
            put_prev_entity(cfs_rq, se);
        }
    }

    static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
    {
        /*
         * If still on the runqueue then deactivate_task()
         * was not called and update_curr() has to be done:
         */
        if (prev->on_rq)
            update_curr(cfs_rq);

        /* throttle cfs_rqs exceeding runtime */
        check_cfs_rq_runtime(cfs_rq);

        check_spread(cfs_rq, prev);

        if (prev->on_rq) {
            update_stats_wait_start(cfs_rq, prev);
            /* Put 'current' back into the tree. */
            __enqueue_entity(cfs_rq, prev);
            /* in !on_rq case, update occurred at dequeue */
            update_load_avg(cfs_rq, prev, 0);
        }
        cfs_rq->curr = NULL;
    }


pick_next_entity函数
---------------------

::

    //1. 首先要确保任务组之间的公平这也是设置组的原因之一
    //2. 选择下一个合适的(优先级比较高的)进程,因为它确实需要马上运行
    //3. 如果没有条件2中进程，那么为了良好的局部性,选择上一次执行的进程
    //4. 只要有任务存在就不要让CPU空转，只有在没有进程的情况下才会让CPU运行idle进程
    static struct sched_entity *
    pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
    {
        struct sched_entity *left = __pick_first_entity(cfs_rq);    //选取红黑树最左边的进程
        struct sched_entity *se;

        /*
         * If curr is set we have to see if its left of the leftmost entity
         * still in the tree, provided there was anything in the tree at all.
         */
         //如果left==null或者curr!=null  并且curr比left进程更优(即curr的虚拟运行时间更小)
         //说明curr进程是自动放弃与运行权力，且其比最左进程更优
        if (!left || (curr && entity_before(curr, left))
            left = curr;

        se = left; /* ideally we run the leftmost entity */     //存储了cfs_rq队列中最优的那个进程

        /*
         * Avoid running the skip buddy, if running something else can
         * be done without getting too unfair.
         */
        if (cfs_rq->skip == se) {   //如果skip存储了需要跳过不参与调度的进程调度实体,那么我们需要选择次优的调度实体来进行调度
            struct sched_entity *second;

            if (se == curr) {
                second = __pick_first_entity(cfs_rq);   //se == curr == skip选择最左的那个调度实体left
            } else {
                second = __pick_next_entity(se);    //选择红黑树上第二左的进程结点
                if (!second || (curr && entity_before(curr, second)))   //如果没有次优进程或者curr比second进程更优，则选择curr
                    second = curr;
            }

            if (second && wakeup_preempt_entity(second, left) < 1)  //判断left和second的vruntime差距是否小于sysctl_sched_wakeup_granularity,即如果second能抢占left
                se = second;
        }

        /*
         * Prefer last buddy, try to return the CPU to a preempted task.
         */
        if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
            se = cfs_rq->last;

        /*
         * Someone really wants this to run. If it's not unfair, run it.
         */
        if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
            se = cfs_rq->next;
        //清楚掉这个指针
        clear_buddies(cfs_rq, se);

        return se;
    }

cfs的last和next指针域
^^^^^^^^^^^^^^^^^^^^^

在pick_next_entity的最后，要把红黑树最左下角的进程和另外两个进(next和last)做比较,next是抢占失败的进程,而last则是抢占成功后被抢占的进程,这三个进程到底哪一个是最优的next进程呢

linux CFS实现的判决条件是:

1) 尽可能满足需要刚被唤醒的进程抢占其他进程的需求

2) 尽可能减少以上这种抢占带来的缓存刷新的影响

- cfs_rq的last和next指针,last表示最后一个执行wakeup的sched_entity, next表示最后一个被wakeup的sched_entity.他们在进程wakeup的时候会赋值,在pick新sched_entity的时候,会优先选择这些last或者next指针的sched_entity,有利于提高缓存的命中率

因此我们优选出来的进程必须同last和next指针域进行对比，其实就是检查就绪队列中的最优进程,即红黑树最左边节点last是否可以抢占last和next指针域，检查是否可以抢占是通过wake_up_preempt_entity函数来完成的

wakeup_preempt_entity检查是否可以被抢占
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

::

    /*
     * Should 'se' preempt 'curr'.
     *
     *             |s1
     *        |s2
     *   |s3
     *         g
     *      |<--->|c
     *
     *  w(c, s1) = -1
     *  w(c, s2) =  0
     *  w(c, s3) =  1
     *
     */
    static int
    wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
    {
        s64 gran, vdiff = curr->vruntime - se->vruntime;    //curr和se的vruntime的差值

        if (vdiff <= 0)     //如果curr的vruntime比较小,说明curr更加需要补偿,即se无法抢占curr
            return -1;

        gran = wakeup_gran(se); //计算curr的最小抢占期限粒度，当差值大于最小粒度时才抢占,这样可以避免频繁抢占
        if (vdiff > gran)
            return 1;

        return 0;
    }

    static unsigned long wakeup_gran(struct sched_entity *se)
    {
        unsigned long gran = sysctl_sched_wakeup_granularity;   //NICE_0_LOAD的基准最小期限

        /*
         * Since its curr running now, convert the gran from real-time
         * to virtual-time in his units.
         *
         * By using 'se' instead of 'curr' we penalize light tasks, so
         * they get preempted easier. That is, if 'se' < 'curr' then
         * the resulting gran will be larger, therefore penalizing the
         * lighter, if otoh 'se' > 'curr' then the resulting gran will
         * be smaller, again penalizing the lighter task.
         *
         * This is especially important for buddies when the leftmost
         * task is higher priority than the buddy.
         */
        return calc_delta_fair(gran, se);   //计算抢占的粒度
    }


到底能不能选择last和next两个进程,则时wakeup_preempt_entity函数决定的,图解如下

.. image::
    res/wakeup_preempt_entity.png