3.7.4.3. The Linux process scheduler: the main scheduler

The kernel provides two schedulers, the main scheduler and the periodic scheduler; together they form the core scheduler, also called the generic scheduler.

In many places in the kernel, whenever the CPU is to be given to a process other than the currently active one, the main scheduler function schedule is called directly. The function does the following work.

3.7.4.3.1. The schedule function

asmlinkage __visible void __sched schedule(void)
{
    struct task_struct *tsk = current;  // get the current process

    sched_submit_work(tsk); // flush pending work (e.g. plugged I/O) to avoid deadlocks
    do {
        preempt_disable();  // disable kernel preemption
        __schedule(false);  // do the actual scheduling
        sched_preempt_enable_no_resched();  // re-enable preemption without rescheduling
    } while (need_resched());   // if TIF_NEED_RESCHED has been set for this task in the meantime, schedule again
    sched_update_worker(tsk);
}
EXPORT_SYMBOL(schedule);

Note

#define __sched __attribute__((__section__(".sched.text")))

__attribute__((__section__("..."))) is a GCC attribute; its purpose is to place the compiled code of the annotated functions into a specific section of the object file.
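
As a stand-alone sketch of what such a section attribute does (the section name .my.sched.text and the function names are invented for this illustration and are not part of the kernel), a function can be placed into a custom section and then be found with objdump -h:

/* user-space illustration of a section attribute, analogous to __sched/.sched.text */
#include <stdio.h>

#define __my_sched __attribute__((__section__(".my.sched.text")))

static void __my_sched my_schedule_like(void)
{
    printf("running from .my.sched.text\n");
}

int main(void)
{
    my_schedule_like();   /* behaves like any other function */
    return 0;             /* objdump -h ./a.out lists the .my.sched.text section */
}

The kernel relies on the same mechanism so the linker script can group all scheduler code together (used, for example, by in_sched_functions()).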

static inline void sched_submit_work(struct task_struct *tsk)
{
    if (!tsk->state) // state == 0 means TASK_RUNNING; a runnable task needs nothing here
        return;

    /*
     * If a worker went to sleep, notify and ask workqueue whether
     * it wants to wake up a task to maintain concurrency.
     * As this function is called inside the schedule() context,
     * we disable preemption to avoid it calling schedule() again
     * in the possible wakeup of a kworker.
     */
    if (tsk->flags & PF_WQ_WORKER) {
        preempt_disable();
        wq_worker_sleeping(tsk);
        preempt_enable_no_resched();
    }

    if (tsk_is_pi_blocked(tsk)) // task is blocked on an rt-mutex (priority inheritance); don't touch the I/O plug
        return;

    /*
     * If we are going to sleep and we have plugged IO queued,
     * make sure to submit it to avoid deadlocks.
     */
    if (blk_needs_flush_plug(tsk))  // if plugged block I/O is pending, flush it to avoid deadlocks
        blk_schedule_flush_plug(tsk);
}
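
sched_submit_work only matters when the task is about to sleep. A hedged sketch of the classic voluntary-sleep pattern that ends up calling schedule() is shown below (my_wait_for_event() and my_condition() are hypothetical names, not kernel APIs):

/* illustrative kernel-style wait loop, not taken from the scheduler sources */
#include <linux/sched.h>

static bool my_condition(void);                 /* hypothetical: the event we wait for */

static void my_wait_for_event(void)
{
    for (;;) {
        set_current_state(TASK_INTERRUPTIBLE);  /* mark ourselves as about to sleep */
        if (my_condition())                     /* re-check after changing state to avoid lost wakeups */
            break;
        schedule();                             /* give the CPU to another task */
    }
    __set_current_state(TASK_RUNNING);          /* we are runnable again */
}

In real code this pattern is usually wrapped by helpers such as wait_event_interruptible(), but the underlying call into schedule() is the same.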
  • Kernel preemption

Linux has a user mode in addition to kernel mode: a user program's context belongs to user mode, while the contexts of system calls and interrupt handlers belong to kernel mode. If a process is preempted by another process while executing in user mode, a user-mode preemption has occurred; if the process has entered kernel mode and is preempted there, we speak of kernel preemption.

The defining property of a preemptive kernel is that a process running in kernel mode can be replaced by another process precisely while it is executing a kernel function.

To support kernel preemption the kernel provides a number of mechanisms and data structures, and disabling and re-enabling preemption where necessary is part of that. These helpers are defined in include/linux/preempt.h:

#define preempt_disable() \
do { \
    preempt_count_inc(); \
    barrier(); \
} while (0)

#define sched_preempt_enable_no_resched() \
do { \
    barrier(); \
    preempt_count_dec(); \
} while (0)

#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
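
As a hedged sketch of how these macros are typically used (my_counter and my_bump_counter are made-up names; real code often reaches for get_cpu()/put_cpu() or the this_cpu_* helpers instead), a short per-CPU critical section looks like this:

/* illustrative kernel-style snippet, not taken from the scheduler itself */
#include <linux/preempt.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, my_counter);   /* hypothetical per-CPU counter */

static void my_bump_counter(void)
{
    preempt_disable();              /* we must not be migrated to another CPU here */
    __this_cpu_inc(my_counter);     /* safe: preemption is off on this CPU */
    preempt_enable();               /* may reschedule immediately if TIF_NEED_RESCHED was set */
}

Note that schedule() uses sched_preempt_enable_no_resched() rather than preempt_enable(): it re-enables preemption without checking for a reschedule, because the surrounding do/while loop checks need_resched() itself.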

3.7.4.3.2. __schedule: starting process scheduling

__schedule does the real scheduling work. It is defined in kernel/sched/core.c:

static void __sched notrace __schedule(bool preempt)
{
    struct task_struct *prev, *next;
    unsigned long *switch_count;
    struct rq_flags rf;
    struct rq *rq;
    int cpu;

    // find the runqueue rq of the current CPU and remember the currently running task in prev
    cpu = smp_processor_id();
    rq = cpu_rq(cpu);
    prev = rq->curr;

    // catch bugs such as scheduling while atomic, e.g. calling schedule()/cond_resched() with preemption disabled
    schedule_debug(prev, preempt);

    if (sched_feat(HRTICK))
        hrtick_clear(rq);

    local_irq_disable(); // disable local interrupts
    rcu_note_context_switch(preempt);   // update RCU's global state to note that this CPU is switching context

    /*
     * Make sure that signal_pending_state()->signal_pending() below
     * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
     * done by the caller to avoid the race with signal_wake_up().
     *
     * The membarrier system call requires a full memory barrier
     * after coming from user-space, before storing to rq->curr.
     */
    rq_lock(rq, &rf);
    smp_mb__after_spinlock();

    /* Promote REQ to ACT */
    rq->clock_update_flags <<= 1;
    update_rq_clock(rq);

    switch_count = &prev->nivcsw;   // by default count this as an involuntary context switch
    // check prev's state and the preemption flag: prev is leaving the CPU voluntarily
    // if it is no longer runnable and this is not a kernel preemption
    if (!preempt && prev->state) {
        // if a signal is pending and prev's state is interruptible (TASK_INTERRUPTIBLE),
        // set it back to TASK_RUNNING instead of taking it off the runqueue
        if (signal_pending_state(prev->state, prev)) {
            prev->state = TASK_RUNNING;
        } else {
            deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); // remove prev from the runqueue

            if (prev->in_iowait) {
                atomic_inc(&rq->nr_iowait);
                delayacct_blkio_start();
            }
        }
        switch_count = &prev->nvcsw; // count this as a voluntary context switch
    }

    next = pick_next_task(rq, prev, &rf);   // pick the highest-priority runnable task
    clear_tsk_need_resched(prev);           // clear prev's TIF_NEED_RESCHED flag
    clear_preempt_need_resched();           // clear the preempt-need-resched flag

    if (likely(prev != next)) { // only switch if next and prev are different tasks
        rq->nr_switches++;  // update the runqueue's switch counter
        /*
         * RCU users of rcu_dereference(rq->curr) may not see
         * changes to task_struct made by pick_next_task().
         */
        RCU_INIT_POINTER(rq->curr, next);   // make next the runqueue's current task
        /*
         * The membarrier system call requires each architecture
         * to have a full memory barrier after updating
         * rq->curr, before returning to user-space.
         *
         * Here are the schemes providing that barrier on the
         * various architectures:
         * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
         *   switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
         * - finish_lock_switch() for weakly-ordered
         *   architectures where spin_unlock is a full barrier,
         * - switch_to() for arm64 (weakly-ordered, spin_unlock
         *   is a RELEASE barrier),
         */
        ++*switch_count;    // increment the chosen context-switch counter

        trace_sched_switch(preempt, prev, next);    // tracepoint for the switch

        /* Also unlocks the rq: */
        rq = context_switch(rq, prev, next, &rf);   // perform the context switch between the two tasks
    } else {
        rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
        rq_unlock_irq(rq, &rf);
    }

    balance_callback(rq);
}

3.7.4.3.2.1. pick_next_task: selecting the process to run next

The kernel selects the most suitable process from the CPU's runqueue to take over the CPU:

next = pick_next_task(rq, prev, &rf);

pick_next_task walks the scheduling classes in priority order and calls each class's pick_next_task to find the best process. In most cases, however, every runnable process in the system is a non-real-time process scheduled by CFS, so the Linux kernel optimizes for that common case. The flow is as follows:

  1. If all processes on the current CPU are ordinary non-real-time processes scheduled by CFS, pick directly from the CFS class; if nothing is runnable there, schedule the idle task.

  2. Otherwise, walk the scheduling classes starting from the highest-priority class.

/*
 * Pick up the highest-prio task:
 */
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
    const struct sched_class *class;
    struct task_struct *p;

    /*
     * Optimization: we know that if all tasks are in the fair class we can
     * call that function directly, but only if the @prev task wasn't of a
     * higher scheduling class, because otherwise those loose the
     * opportunity to pull in more work from other CPUs.
     */
     // fast path: all runnable tasks are handled by CFS and there are no real-time tasks
    if (likely((prev->sched_class == &idle_sched_class ||
            prev->sched_class == &fair_sched_class) &&
           rq->nr_running == rq->cfs.h_nr_running)) {       // rq and cfs_rq hold the same number of tasks, so every runnable task on this CPU is an ordinary CFS task
        // let CFS's own pick_next_task find the best task
        p = fair_sched_class.pick_next_task(rq, prev, rf);
        if (unlikely(p == RETRY_TASK))
            goto restart;

        /* Assumes fair_sched_class->next == idle_sched_class */
        if (unlikely(!p))   // nothing runnable in CFS: fall back to the idle task
            p = idle_sched_class.pick_next_task(rq, prev, rf);

        return p;
    }

restart:
#ifdef CONFIG_SMP
    /*
     * We must do the balancing pass before put_next_task(), such
     * that when we release the rq->lock the task is in the same
     * state as before we took rq->lock.
     *
     * We can terminate the balance pass as soon as we know there is
     * a runnable task of @class priority or higher.
     */
    for_class_range(class, prev->sched_class, &idle_sched_class) {
        if (class->balance(rq, prev, rf))
            break;
    }
#endif

    put_prev_task(rq, prev);
    // the scheduling classes are linked through their next field and are walked in priority order: stop->dl->rt->fair->idle
    for_each_class(class) {
        p = class->pick_next_task(rq, NULL, NULL);
        if (p)
            return p;
    }

    /* The idle class should always have a runnable task: */
    BUG();
}

Making the common case fast is a classic optimization principle. The most common processes on a Linux system are non-real-time processes, and their scheduler is CFS, which is why the fast path above exists.

likely wraps the GCC built-in __builtin_expect: it tells the compiler that the expression is expected to be true in the vast majority of cases, so the compiler can optimize the generated branches accordingly.

# ifndef likely
#  define likely(x) (__builtin_expect(!!(x), 1))
# endif
# ifndef unlikely
#  define unlikely(x)       (__builtin_expect(!!(x), 0))
# endif
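
A minimal user-space sketch of the same idea (parse_positive() and its error handling are invented for this example):

#include <stdio.h>

/* same pattern as the kernel macros, usable in ordinary GCC/Clang user-space code */
#define likely(x)   (__builtin_expect(!!(x), 1))
#define unlikely(x) (__builtin_expect(!!(x), 0))

static int parse_positive(int value)
{
    if (unlikely(value < 0)) {          /* hint: the error path is rare */
        fprintf(stderr, "bad value\n");
        return -1;
    }
    return value * 2;                   /* the hot path stays on the fall-through branch */
}

int main(void)
{
    printf("%d\n", parse_positive(21));
    return 0;
}

The hint only affects branch layout and static branch prediction; it never changes the result of the expression.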

3.7.4.3.3. context_switch: the process context switch

3.7.4.3.3.1. Process context switching

A context switch (sometimes also called a process switch or task switch) is the CPU switching from one process or thread to another.

A context switch can be thought of as the kernel performing the following steps for a process on the CPU:

  1. Suspend one process and store its CPU state (its context) somewhere in memory.

  2. Retrieve the context of the next process from memory and restore it into the CPU's registers.

  3. Jump to the location the program counter points to (i.e. the line of code at which the process was interrupted), thereby resuming that process.

The context is therefore the content of the CPU registers and the program counter at a given point in time; in a broader sense it also includes the process's virtual address mappings in memory.

Context switches can only happen in kernel mode. They are comparatively expensive, requiring a noticeable amount of processor time, and with tens to hundreds of switches per second they can consume a significant share of CPU time.
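
The same save-restore-jump idea can be observed from user space with the POSIX ucontext API. The sketch below only illustrates the three steps above (ctx_main, ctx_task and task_body are invented names); it is of course not how the kernel itself switches tasks:

#include <stdio.h>
#include <ucontext.h>

static ucontext_t ctx_main, ctx_task;
static char task_stack[16 * 1024];          /* stack for the second context */

static void task_body(void)
{
    printf("running in the task context\n");
    /* returning resumes ctx_main because of uc_link below */
}

int main(void)
{
    getcontext(&ctx_task);                  /* capture the current register state */
    ctx_task.uc_stack.ss_sp = task_stack;
    ctx_task.uc_stack.ss_size = sizeof(task_stack);
    ctx_task.uc_link = &ctx_main;           /* where to continue when task_body returns */
    makecontext(&ctx_task, task_body, 0);

    /* save the current context and jump into the other one */
    swapcontext(&ctx_main, &ctx_task);
    printf("back in the main context\n");
    return 0;
}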

3.7.4.3.3.2. The context_switch flow

The context_switch function carries out the process context switch. It is defined in kernel/sched/core.c:

/*
 * context_switch - switch to the new MM and the new thread's register state.
 */
static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
           struct task_struct *next, struct rq_flags *rf)
{
    prepare_task_switch(rq, prev, next);

    /*
     * For paravirt, this is coupled with an exit in switch_to to
     * combine the page table reload and the switch backend into
     * one hypercall.
     */
    arch_start_context_switch(prev);

    /*
     * kernel -> kernel   lazy + transfer active
     *   user -> kernel   lazy + mmgrab() active
     *
     * kernel ->   user   switch + mmdrop() active
     *   user ->   user   switch
     */
    if (!next->mm) {                                // to kernel
        enter_lazy_tlb(prev->active_mm, next);

        next->active_mm = prev->active_mm;
        if (prev->mm)                           // from user
            mmgrab(prev->active_mm);
        else
            prev->active_mm = NULL;
    } else {                                        // to user
        membarrier_switch_mm(rq, prev->active_mm, next->mm);
        /*
         * sys_membarrier() requires an smp_mb() between setting
         * rq->curr / membarrier_switch_mm() and returning to userspace.
         *
         * The below provides this either through switch_mm(), or in
         * case 'prev->active_mm == next->mm' through
         * finish_task_switch()'s mmdrop().
         */
        switch_mm_irqs_off(prev->active_mm, next->mm, next);

        if (!prev->mm) {                        // from kernel
            /* will mmdrop() in finish_task_switch(). */
            rq->prev_mm = prev->active_mm;
            prev->active_mm = NULL;
        }
    }

    rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);

    prepare_lock_switch(rq, next, rf);

    /* Here we just switch the register state and the stack. */
    switch_to(prev, next, prev);
    barrier();

    return finish_task_switch(prev);
}