3.7.4.2. Design of the Linux process scheduler
The scheduler uses a series of data structures to sort and manage the processes in the system; the way the scheduler works is closely tied to these data structures.
3.7.4.2.1. Scheduling-related members of task_struct
struct task_struct
{
......
int on_rq; //whether the task is on a runqueue
/* Process priorities
* prio: dynamic priority, range 100~139, determined by the static priority and the bonus
* static_prio: static priority, static_prio = 100 + 20 + nice (nice ranges over -20~19, so static_prio ranges over 100~139)
* normal_prio: normal priority not affected by priority inheritance; see the normal_prio() function, it depends on the type of the process
*/
int prio, static_prio, normal_prio;
unsigned int rt_priority; //real-time priority
const struct sched_class *sched_class; //scheduling class, the set of scheduler handler functions for this task
struct sched_entity se; //scheduling entity (a node in the red-black tree)
struct sched_rt_entity rt;
struct sched_dl_entity dl;
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group; //points to the task group this task belongs to
#endif
};
3.7.4.2.1.1. Priorities
static_prio: holds the static priority, assigned when the process starts; it can be changed with nice or the sched_setscheduler system call, otherwise it stays constant for the lifetime of the process
prio: holds the dynamic priority of the process
normal_prio: the priority computed from the process's static priority and its scheduling policy; a normal process and a real-time process with the same static priority therefore have different normal priorities. On fork, the child inherits the parent's normal priority
rt_priority: holds the real-time priority
0-99: real-time processes, 100-139: non-real-time processes
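These ranges follow from the priority macros in include/linux/sched/prio.h (slightly condensed here), which also show the nice-to-static_prio conversion used above:
#define MAX_NICE        19
#define MIN_NICE        -20
#define NICE_WIDTH      (MAX_NICE - MIN_NICE + 1)       //40 nice levels
#define MAX_RT_PRIO     100                             //prio 0-99 are real-time
#define MAX_PRIO        (MAX_RT_PRIO + NICE_WIDTH)      //140
#define DEFAULT_PRIO    (MAX_RT_PRIO + NICE_WIDTH / 2)  //120, i.e. nice 0
//nice -20~19 maps to static_prio 100~139
#define NICE_TO_PRIO(nice)      ((nice) + DEFAULT_PRIO)
#define PRIO_TO_NICE(prio)      ((prio) - DEFAULT_PRIO)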
3.7.4.2.1.2. Scheduling policies
/*
* Scheduling policies
*/
#define SCHED_NORMAL 0
#define SCHED_FIFO 1
#define SCHED_RR 2
#define SCHED_BATCH 3 //for non-interactive, CPU-intensive batch processes
/* SCHED_ISO: reserved but not implemented yet */
#define SCHED_IDLE 5
#define SCHED_DEADLINE 6
SCHED_BATCH is meant for non-interactive, CPU-intensive batch processes. Scheduling decisions treat such processes "coldly": they never preempt another process managed by the CFS scheduler and therefore do not disturb interactive processes. If you want to use nice to lower a process's static priority without letting that process hurt the system's interactivity, this policy is the best fit.
Note
Despite its name, SCHED_IDLE is not responsible for scheduling the idle task; the idle task is handled by a separate mechanism in the kernel.
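How a process ends up with one of these policies can be seen from user space. A minimal sketch using the POSIX sched_setscheduler() interface (the priority value 10 is an arbitrary example):
#include <sched.h>
#include <stdio.h>

int main(void)
{
    struct sched_param param = { .sched_priority = 10 }; /* rt_priority, 1-99 */

    /* switch the calling process to the round-robin real-time policy;
     * for SCHED_NORMAL/SCHED_BATCH/SCHED_IDLE, sched_priority must be 0 */
    if (sched_setscheduler(0, SCHED_RR, &param) == -1) {
        perror("sched_setscheduler"); /* usually requires CAP_SYS_NICE */
        return 1;
    }
    printf("policy is now %d\n", sched_getscheduler(0));
    return 0;
}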
3.7.4.2.1.3. Scheduling classes
The sched_class structure represents a scheduling class. The classes provide the link between the generic scheduler and the individual scheduling methods: a scheduling class is represented by a number of function pointers collected in this special data structure, and every operation the global scheduler may request is available through one pointer. This makes it possible to build the generic scheduler without knowing the internals of the scheduling classes. It is defined in kernel/sched/sched.h.
struct sched_class {
//the scheduling classes in the system are chained, in order of their scheduling priority, into a list; this points to the class with the next-lower priority
//class priority order: stop_sched_class > dl_sched_class > rt_sched_class > fair_sched_class > idle_sched_class
const struct sched_class *next;
#ifdef CONFIG_UCLAMP_TASK
int uclamp_enabled;
#endif
//add a process to the runqueue, i.e. insert the scheduling entity (process) into the red-black tree, and increment nr_running
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
//remove a process from the runqueue and decrement nr_running
void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
//give up the CPU. With the compat_yield sysctl off, this function effectively dequeues and re-enqueues the entity; in that case it places the scheduling entity at the rightmost end of the red-black tree
void (*yield_task) (struct rq *rq);
bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt);
//check whether the current process can be preempted by a new process; before actually preempting the running task, the CFS scheduler module performs a fairness test
void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
/*
* Both @prev and @rf are optional and may be NULL, in which case the
* caller must already have invoked put_prev_task(rq, prev, rf).
*
* Otherwise it is the responsibility of the pick_next_task() to call
* put_prev_task() on the @prev task or something equivalent, IFF it
* returns a next task.
*
* In that case (@rf != NULL) it may return RETRY_TASK when it finds a
* higher prio class has runnable tasks.
*/
//pick the next process that should run
struct task_struct * (*pick_next_task)(struct rq *rq,
struct task_struct *prev,
struct rq_flags *rf);
//put the previously running task back on the runqueue (bookkeeping for the task being switched out)
void (*put_prev_task)(struct rq *rq, struct task_struct *p);
void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
#ifdef CONFIG_SMP
int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
//pick a suitable CPU for the process
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
//migrate the task to another CPU
void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
//called when a process is woken up
void (*task_woken)(struct rq *this_rq, struct task_struct *task);
//change the CPU affinity of the process
void (*set_cpus_allowed)(struct task_struct *p,
const struct cpumask *newmask);
//bring the runqueue online
void (*rq_online)(struct rq *rq);
//take the runqueue offline
void (*rq_offline)(struct rq *rq);
#endif
//used by the periodic scheduler each time it is activated; this function is typically called from the timer tick and may provoke a process switch
void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
//called when a process is created; initialization differs between scheduling policies
void (*task_fork)(struct task_struct *p);
//called when a process exits
void (*task_dead)(struct task_struct *p);
/*
* The switched_from() call is allowed to drop rq->lock, therefore we
* cannot assume the switched_from/switched_to pair is serliazed by
* rq->lock. They are however serialized by p->pi_lock.
*/
//called when the task switches away from, or to, this scheduling class
void (*switched_from)(struct rq *this_rq, struct task_struct *task);
void (*switched_to) (struct rq *this_rq, struct task_struct *task);
//called when the priority changes
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
int oldprio);
unsigned int (*get_rr_interval)(struct rq *rq,
struct task_struct *task);
void (*update_curr)(struct rq *rq);
#define TASK_SET_GROUP 0
#define TASK_MOVE_GROUP 1
#ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_change_group)(struct task_struct *p, int type);
#endif
};
For every scheduler class an instance of struct sched_class must be provided. The kernel currently implements the following five:
//the highest-priority class; its tasks interrupt all other tasks and cannot be interrupted themselves
//used 1. by cpu_stop_cpu_callback for task migration between CPUs 2. to stop tasks when a CPU is shut down under HOTPLUG_CPU
extern const struct sched_class stop_sched_class;
//deadline tasks (SCHED_DEADLINE)
extern const struct sched_class dl_sched_class;
//real-time tasks
extern const struct sched_class rt_sched_class;
//CFS (completely fair scheduler), used for ordinary tasks
extern const struct sched_class fair_sched_class;
//the first pid=0 task on each CPU, swapper, a static task whose scheduling class is idle_sched_class
extern const struct sched_class idle_sched_class;
SCHED_NORMAL, SCHED_BATCH and SCHED_IDLE are mapped to fair_sched_class
SCHED_RR and SCHED_FIFO are associated with rt_sched_class
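Thanks to the next pointers chaining the classes in priority order, the core scheduler can walk all classes with a simple macro. This is the for_each_class() helper from kernel/sched/sched.h (slightly simplified; on !CONFIG_SMP kernels the chain starts at dl_sched_class instead):
#define sched_class_highest (&stop_sched_class)

#define for_each_class(class) \
    for (class = sched_class_highest; class; class = class->next)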
3.7.4.2.1.4. Runqueues
The runqueue is the central data structure used by the core scheduler to manage active processes.
Each CPU has its own runqueue, and an active process appears on exactly one runqueue; running one process on several CPUs at the same time is impossible. The runqueue is the starting point for many operations of the global scheduler, but processes are not managed directly by the runqueue: that is the job of the individual scheduler classes, so class-specific sub-runqueues are embedded in each runqueue (the top-level CFS runqueue struct cfs_rq, the real-time runqueue struct rt_rq and the deadline runqueue struct dl_rq).
When scheduling, the scheduler first checks the real-time runqueue for a runnable real-time process and only looks at the CFS runqueue if there is none. This is why real-time processes take precedence over normal processes: the precedence shows up not only in the prio value but also in the design of the scheduler itself, as the sketch below shows.
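A simplified rendering of the slow path of pick_next_task() in kernel/sched/core.c makes this ordering explicit:
    const struct sched_class *class;
    struct task_struct *p;

    /* walk the classes from highest to lowest priority
     * (stop -> dl -> rt -> fair -> idle) and return the
     * first runnable task found */
    for_each_class(class) {
        p = class->pick_next_task(rq, NULL, NULL);
        if (p)
            return p;
    }

    /* the idle class should always have a runnable task */
    BUG();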
The runqueue is represented by struct rq, defined in kernel/sched/sched.h
/*
 * This is the main, per-CPU runqueue data structure.
*
* Locking rule: those places that want to lock multiple runqueues
* (such as the load balancing or the thread migration code), lock
* acquire operations must be ordered by ascending &runqueue.
*/
//each processor is equipped with one rq
struct rq {
/* runqueue lock: */
raw_spinlock_t lock;
/*
* nr_running and cpu_load should be in the same cacheline because
* remote CPUs use both these fields when doing load calculation.
*/
//records the number of runnable tasks currently on this processor's rq
unsigned int nr_running;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
unsigned int numa_migrate_on;
#endif
#ifdef CONFIG_NO_HZ_COMMON
#ifdef CONFIG_SMP
unsigned long last_load_update_tick;
unsigned long last_blocked_load_update_tick;
unsigned int has_blocked_load;
#endif /* CONFIG_SMP */
unsigned int nohz_tick_stopped;
atomic_t nohz_flags;
#endif /* CONFIG_NO_HZ_COMMON */
//incremented by one each time update_cpu_load is called from the scheduler tick; counts how many times the cpu load has been updated
unsigned long nr_load_updates;
//accumulates the number of context switches performed by this processor; incremented when schedule is called. The total over all processors
//is available through nr_context_switches(), and the system-wide number of context switches can also be read from /proc/stat
u64 nr_switches;
#ifdef CONFIG_UCLAMP_TASK
/* Utilization clamp values based on CPU's RUNNABLE tasks */
struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned;
unsigned int uclamp_flags;
#define UCLAMP_FLAG_IDLE 0x01
#endif
struct cfs_rq cfs; //runqueue of the cfs fair scheduling class
struct rt_rq rt; //runqueue of the real-time scheduling class
struct dl_rq dl; //runqueue of the deadline scheduling class
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this CPU: */
//with fair group scheduling configured, groups of tasks are formed on top of the base cfs rq; each group gets its own cfs_rq on which its own tasks are queued,
//and the processor time used by the tasks of a group is capped by the total share allotted to that group
//the cgroup-based fair group scheduling architecture allows building hierarchical task organizations: tasks are grouped by function and each group is assigned its share of processor resources
struct list_head leaf_cfs_rq_list;
struct list_head *tmp_alone_branch;
#endif /* CONFIG_FAIR_GROUP_SCHED */
/*
* This is part of a global counter where only the total sum
* over all CPUs matters. A task can increase this counter on
* one CPU and if it got migrated afterwards it may decrease
* it on another CPU. Always updated under the runqueue lock:
*/
//in general a task's state in the linux kernel can be TASK_RUNNING, TASK_INTERRUPTIBLE (sleep), TASK_UNINTERRUPTIBLE (deactivated task) or TASK_STOPPED
//this variable counts how many tasks on this rq are in TASK_UNINTERRUPTIBLE state; when activate_task is called, nr_uninterruptible is decremented
unsigned long nr_uninterruptible;
struct task_struct *curr; //the task currently executing on this processor
struct task_struct *idle; //the idle task, belonging to the idle-task scheduling class
struct task_struct *stop; //the highest-priority task, belonging to the stop-task scheduling class
unsigned long next_balance; //jiffies-based timestamp recording when the next balancing run on this processor is due
struct mm_struct *prev_mm; //stores the memory-management structure of the previous task when a context switch happens, so finish_task_switch() can release it via mmdrop()
unsigned int clock_update_flags;
u64 clock; //current clock value of this rq; essentially equal to the return value of sched_clock_cpu(cpu_of(rq)), refreshed by update_rq_clock() on every scheduler_tick
/* Ensure that all clocks are in the same cache line */
u64 clock_task ____cacheline_aligned;
u64 clock_pelt;
unsigned long lost_idle_time;
//counts how many tasks on this rq are sleeping waiting for i/o. In practice, when e.g. a driver is invoked by a task but has to wait for an i/o reply, it can call
//io_schedule() to make full use of the processor: nr_iowait is incremented, the task's in_iowait is set to 1, and scheduling is triggered so other tasks get processor time
atomic_t nr_iowait;
#ifdef CONFIG_MEMBARRIER
int membarrier_state;
#endif
#ifdef CONFIG_SMP
//the root domain is a mechanism for multi-core architectures; the rq records the root domain currently in use, which contains the current cpu masks (span, online, rt overload), a reference count and cpupri
struct root_domain *rd;
struct sched_domain __rcu *sd;
unsigned long cpu_capacity;
unsigned long cpu_capacity_orig;
struct callback_head *balance_callback;
unsigned char idle_balance;
unsigned long misfit_task_load;
//set to 1 while this rq is performing active load balancing
/* For active balancing */
int active_balance;
int push_cpu;
struct cpu_stop_work active_balance_work;
/* CPU of this runqueue: */
int cpu; //processor ID of this rq
int online; //1 if this rq is currently online on its processor
struct list_head cfs_tasks;
struct sched_avg avg_rt; //tracks the average execution time of real-time tasks: delta_exec = rq->clock_task - curr->se.exec_start is averaged into
//the rq's avg_rt via sched_rt_avg_update(); over a period this value reflects the average execution time real-time tasks get on the system
struct sched_avg avg_dl;
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
struct sched_avg avg_irq;
#endif
u64 idle_stamp; //timestamp of the moment this rq last went idle
u64 avg_idle;
/* This is used to determine avg_idle's max value */
u64 max_idle_balance_cost;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
#endif
#ifdef CONFIG_PARAVIRT
u64 prev_steal_time;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
u64 prev_steal_time_rq;
#endif
/* calc_load related fields */
unsigned long calc_load_update; //records when the next cpu load calculation is due
long calc_load_active;
#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
int hrtick_csd_pending;
call_single_data_t hrtick_csd;
#endif
struct hrtimer hrtick_timer;
#endif
#ifdef CONFIG_SCHEDSTATS
/* latency stats */
struct sched_info rq_sched_info;
unsigned long long rq_cpu_time;
/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
/* sys_sched_yield() stats */
unsigned int yld_count;
/* schedule() stats */
unsigned int sched_count; //number of times scheduling was triggered
unsigned int sched_goidle; //number of times the idle task was picked
/* try_to_wake_up() stats */
unsigned int ttwu_count; //number of task wakeups
unsigned int ttwu_local; //number of wakeups of tasks on the same processor
#endif
#ifdef CONFIG_SMP
struct llist_head wake_list;
#endif
#ifdef CONFIG_CPU_IDLE
/* Must be inspected within a rcu lock section */
struct cpuidle_state *idle_state;
#endif
};
The kernel also provides macros for accessing the runqueue of a given cpu:
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
#define this_rq() this_cpu_ptr(&runqueues)
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() raw_cpu_ptr(&runqueues)
The runqueue of the CFS fair scheduler
There is at least one CFS runqueue in the system, the root CFS runqueue, which contains all other task groups and tasks. Each task group in turn has its own CFS runqueue holding all the tasks of that group. When the scheduler picks a task group from the root CFS runqueue, the group picks a scheduling entity from its own CFS runqueue (that entity may be a task or a child task group), and so on down the hierarchy until finally a task is selected to run; a simplified sketch of this descent follows the structure definition below.
/* CFS-related fields in a runqueue */
//the CFS runqueue; each cpu's rq contains one cfs_rq, and each group-scheduling sched_entity also has its own cfs_rq
struct cfs_rq {
struct load_weight load; //total load of all processes on this cfs runqueue
unsigned long runnable_weight;
unsigned int nr_running; //number of scheduling entities on the queue
unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */ //hierarchical task count; only meaningful with task groups
unsigned int idle_h_nr_running; /* SCHED_IDLE */
u64 exec_clock;
u64 min_vruntime; //smallest vruntime on this cfs queue, monotonically increasing; updated in two cases: 1. when the accumulated runtime of the currently running task is updated
//2. when a task is removed from the queue (sleeps or exits): if the smallest vruntime of the remaining tasks is larger than min_vruntime, min_vruntime is raised to it
#ifndef CONFIG_64BIT
u64 min_vruntime_copy;
#endif
struct rb_root_cached tasks_timeline; //root of the red-black tree (with cached leftmost node)
/*
* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
struct sched_entity *curr;
struct sched_entity *next;
struct sched_entity *last;
struct sched_entity *skip;
#ifdef CONFIG_SCHED_DEBUG
unsigned int nr_spread_over;
#endif
#ifdef CONFIG_SMP
/*
* CFS load tracking
*/
struct sched_avg avg;
#ifndef CONFIG_64BIT
u64 load_last_update_time_copy;
#endif
struct {
raw_spinlock_t lock ____cacheline_aligned;
int nr;
unsigned long load_avg;
unsigned long util_avg;
unsigned long runnable_sum;
} removed;
#ifdef CONFIG_FAIR_GROUP_SCHED
unsigned long tg_load_avg_contrib;
long propagate;
long prop_runnable_sum;
/*
* h_load = weight * f(tg)
*
* Where f(tg) is the recursive weight fraction assigned to
* this group.
*/
unsigned long h_load;
u64 last_h_load_update;
struct sched_entity *h_load_next;
#endif /* CONFIG_FAIR_GROUP_SCHED */
#endif /* CONFIG_SMP */
#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */
/*
* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
* a hierarchy). Non-leaf lrqs hold other higher schedulable entities
* (like users, containers etc.)
*
* leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.
* This list is used during load balance.
*/
int on_list;
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
s64 runtime_remaining;
u64 throttled_clock;
u64 throttled_clock_task;
u64 throttled_clock_task_time;
int throttled;
int throttle_count;
struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
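The hierarchical descent described above can be seen in pick_next_task_fair(); a simplified excerpt of its simple path in kernel/sched/fair.c:
    struct cfs_rq *cfs_rq = &rq->cfs;  /* start at the root cfs runqueue */
    struct sched_entity *se;
    struct task_struct *p;

    do {
        /* pick the leftmost (smallest-vruntime) entity on this level */
        se = pick_next_entity(cfs_rq, NULL);
        set_next_entity(cfs_rq, se);
        /* for a group entity, my_q is its own cfs_rq and we descend into it;
         * group_cfs_rq() returns NULL when se is a task */
        cfs_rq = group_cfs_rq(se);
    } while (cfs_rq);

    p = task_of(se);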
The real-time runqueue rt_rq
/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
struct rt_prio_array active;
unsigned int rt_nr_running;
unsigned int rr_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
struct {
int curr; /* highest queued rt task prio */
#ifdef CONFIG_SMP
int next; /* next highest */
#endif
} highest_prio;
#endif
#ifdef CONFIG_SMP
unsigned long rt_nr_migratory;
unsigned long rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
#endif /* CONFIG_SMP */
int rt_queued;
int rt_throttled;
u64 rt_time;
u64 rt_runtime;
/* Nests inside the rq lock: */
raw_spinlock_t rt_runtime_lock;
#ifdef CONFIG_RT_GROUP_SCHED
unsigned long rt_nr_boosted;
struct rq *rq;
struct task_group *tg;
#endif
};
The deadline runqueue dl_rq
/* Deadline class' related fields in a runqueue */
struct dl_rq {
/* runqueue is an rbtree, ordered by deadline */
struct rb_root_cached root;
unsigned long dl_nr_running;
#ifdef CONFIG_SMP
/*
* Deadline values of the currently executing and the
* earliest ready task on this rq. Caching these facilitates
* the decision whether or not a ready but not running task
* should migrate somewhere else.
*/
struct {
u64 curr;
u64 next;
} earliest_dl;
unsigned long dl_nr_migratory;
int overloaded;
/*
* Tasks on this rq that can be pushed away. They are kept in
* an rb-tree, ordered by tasks' deadlines, with caching
* of the leftmost (earliest deadline) element.
*/
struct rb_root_cached pushable_dl_tasks_root;
#else
struct dl_bw dl_bw;
#endif
/*
* "Active utilization" for this runqueue: increased when a
* task wakes up (becomes TASK_RUNNING) and decreased when a
* task blocks
*/
u64 running_bw;
/*
* Utilization of the tasks "assigned" to this runqueue (including
* the tasks that are in runqueue and the tasks that executed on this
* CPU and blocked). Increased when a task moves to this runqueue, and
* decreased when the task moves away (migrates, changes scheduling
* policy, or terminates).
* This is needed to compute the "inactive utilization" for the
* runqueue (inactive utilization = this_bw - running_bw).
*/
u64 this_bw;
u64 extra_bw;
/*
* Inverse of the fraction of CPU utilization that can be reclaimed
* by the GRUB algorithm.
*/
u64 bw_ratio;
};
3.7.4.2.1.5. Scheduling entities
The scheduler is not limited to scheduling processes; it can also handle larger entities, which is what makes group scheduling possible.
//a scheduling entity (a node in the red-black tree); it covers one process or a group of processes, and contains its own runqueue, a parent pointer and a pointer to the runqueue on which it is to be scheduled
struct sched_entity {
/* For load-balancing: */
struct load_weight load; //weight; the array prio_to_weight maps priority values to weights
unsigned long runnable_weight;
struct rb_node run_node; //this entity's node in the red-black tree
struct list_head group_node; //list node in the owning task group
unsigned int on_rq; //whether the entity is on the red-black-tree runqueue
u64 exec_start; //time when the entity started running
u64 sum_exec_runtime; //accumulated runtime
u64 vruntime; //virtual runtime, updated on timer interrupts and on task state changes; it grows continuously, at a rate inversely proportional to the load weight: the higher the weight,
//the slower it grows and the more likely the entity sits at the leftmost end of the red-black tree and gets scheduled; every scheduler tick updates it
u64 prev_sum_exec_runtime; //value of sum_exec_runtime when the process was switched onto the CPU
u64 nr_migrations; //number of times this entity has been migrated to another CPU
struct sched_statistics statistics;
#ifdef CONFIG_FAIR_GROUP_SCHED
int depth; //depth of the task group; each group is one level deeper than its parent
struct sched_entity *parent; //pointer to the parent scheduling entity
/* rq on which this entity is (to be) queued: */
struct cfs_rq *cfs_rq; //the red-black-tree runqueue the entity is queued on
/* rq "owned" by this entity/group: */
struct cfs_rq *my_q; //NULL means this is a task; non-NULL marks a scheduling group (the group's own runqueue)
#endif
#endif
#ifdef CONFIG_SMP
/*
* Per entity load average tracking.
*
* Put into separate cache line so it does not
* collide with read-mostly values above.
*/
struct sched_avg avg;
#endif
};
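The inverse-to-weight growth of vruntime is implemented by update_curr() and calc_delta_fair(); a simplified excerpt from kernel/sched/fair.c:
    u64 delta_exec = now - curr->exec_start; /* wall-clock time run since the last update */

    curr->exec_start = now;
    curr->sum_exec_runtime += delta_exec;

    /*
     * calc_delta_fair() scales real time by NICE_0_LOAD / se->load.weight,
     * so vruntime advances more slowly for heavier (higher-priority) entities
     */
    curr->vruntime += calc_delta_fair(delta_exec, curr);

    update_min_vruntime(cfs_rq);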
The real-time scheduling entity sched_rt_entity
struct sched_rt_entity {
struct list_head run_list;
unsigned long timeout;
unsigned long watchdog_stamp;
unsigned int time_slice;
unsigned short on_rq;
unsigned short on_list;
struct sched_rt_entity *back;
#ifdef CONFIG_RT_GROUP_SCHED
struct sched_rt_entity *parent;
/* rq on which this entity is (to be) queued: */
struct rt_rq *rt_rq;
/* rq "owned" by this entity/group: */
struct rt_rq *my_q;
#endif
} __randomize_layout;
The deadline (EDF) scheduling entity sched_dl_entity
struct sched_dl_entity {
struct rb_node rb_node;
/*
* Original scheduling parameters. Copied here from sched_attr
* during sched_setattr(), they will remain the same until
* the next sched_setattr().
*/
u64 dl_runtime; /* Maximum runtime for each instance */
u64 dl_deadline; /* Relative deadline of each instance */
u64 dl_period; /* Separation of two instances (period) */
u64 dl_bw; /* dl_runtime / dl_period */
u64 dl_density; /* dl_runtime / dl_deadline */
/*
* Actual scheduling parameters. Initialized with the values above,
* they are continuously updated during task execution. Note that
* the remaining runtime could be < 0 in case we are in overrun.
*/
s64 runtime; /* Remaining runtime for this instance */
u64 deadline; /* Absolute deadline for this instance */
unsigned int flags; /* Specifying the scheduler behaviour */
/*
* Some bool flags:
*
* @dl_throttled tells if we exhausted the runtime. If so, the
* task has to wait for a replenishment to be performed at the
* next firing of dl_timer.
*
* @dl_boosted tells if we are boosted due to DI. If so we are
* outside bandwidth enforcement mechanism (but only until we
* exit the critical section);
*
* @dl_yielded tells if task gave up the CPU before consuming
* all its available runtime during the last job.
*
* @dl_non_contending tells if the task is inactive while still
* contributing to the active utilization. In other words, it
* indicates if the inactive timer has been armed and its handler
* has not been executed yet. This flag is useful to avoid race
* conditions between the inactive timer handler and the wakeup
* code.
*
* @dl_overrun tells if the task asked to be informed about runtime
* overruns.
*/
unsigned int dl_throttled : 1;
unsigned int dl_boosted : 1;
unsigned int dl_yielded : 1;
unsigned int dl_non_contending : 1;
unsigned int dl_overrun : 1;
/*
* Bandwidth enforcement timer. Each -deadline task has its
* own bandwidth to be enforced, thus we need one timer per task.
*/
struct hrtimer dl_timer;
/*
* Inactive timer, responsible for decreasing the active utilization
* at the "0-lag time". When a -deadline task blocks, it contributes
* to GRUB's active utilization until the "0-lag time", hence a
* timer is needed to decrease the active utilization at the correct
* time.
*/
struct hrtimer inactive_timer;
};
3.7.4.2.1.6. Group scheduling (struct task_group)
Linux is a multi-user system. If two processes belong to two different users and have different priorities, the two users end up with different shares of CPU time, which is clearly unfair (with a large priority gap, the user owning the low-priority process gets very little CPU time), so the kernel introduced group scheduling. With per-user grouping, both users get 50% of the CPU even if their processes' priorities differ.
If the task_group's runtime is not yet used up when the current process has consumed its own runtime, the next schedulable process within the task_group is picked; conversely, when the group's runtime is exhausted, the next schedulable entity one level up is picked. Note that a group may contain both real-time and normal processes, so a task group must be schedulable both by the real-time scheduler and by CFS.
Linux can group processes in two ways:
1) user ID: processes are grouped by their USER ID; a cpu.share file appears under the corresponding /sys/kernel/uid directory and can be used to configure the share of CPU time that user receives
2) cgroup (control group): a group is created to constrain all processes inside it. For example, create a group (empty at first), set its CPU usage to 10% and move a process into it: that process can then use at most 10% of the CPU. If several processes are put into the group, all of them together share that 10%. A sketch of this follows.
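A minimal sketch of the cgroup approach, using the CFS bandwidth files of the cgroup-v1 cpu controller (the mount point /sys/fs/cgroup/cpu and the group name "demo" are assumptions; the group must have been created beforehand, e.g. with mkdir):
#include <stdio.h>

static int write_str(const char *path, const char *val)
{
    FILE *f = fopen(path, "w");
    if (!f)
        return -1;
    fprintf(f, "%s", val);
    return fclose(f);
}

int main(void)
{
    /* 10000us of runtime per 100000us period == 10% of one CPU,
     * enforced through the cfs_bandwidth field of task_group below */
    write_str("/sys/fs/cgroup/cpu/demo/cpu.cfs_period_us", "100000");
    write_str("/sys/fs/cgroup/cpu/demo/cpu.cfs_quota_us", "10000");
    /* writing "0" moves the calling process into the group */
    write_str("/sys/fs/cgroup/cpu/demo/tasks", "0");
    return 0;
}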
/* Task group related information */
struct task_group {
struct cgroup_subsys_state css;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* schedulable entities of this group on each CPU */
struct sched_entity **se;
/* runqueue "owned" by this group on each CPU */
struct cfs_rq **cfs_rq;
unsigned long shares;
#ifdef CONFIG_SMP
/*
* load_avg can be heavily contended at clock tick time, so put
* it in its own cacheline separated from the fields above which
* will also be accessed at each tick.
*/
atomic_long_t load_avg ____cacheline_aligned;
#endif
#endif
#ifdef CONFIG_RT_GROUP_SCHED
struct sched_rt_entity **rt_se;
struct rt_rq **rt_rq;
struct rt_bandwidth rt_bandwidth;
#endif
struct rcu_head rcu;
struct list_head list;
struct task_group *parent;
struct list_head siblings;
struct list_head children;
#ifdef CONFIG_SCHED_AUTOGROUP
struct autogroup *autogroup;
#endif
struct cfs_bandwidth cfs_bandwidth;
#ifdef CONFIG_UCLAMP_TASK_GROUP
/* The two decimal precision [%] value requested from user-space */
unsigned int uclamp_pct[UCLAMP_CNT];
/* Clamp values requested for a task group */
struct uclamp_se uclamp_req[UCLAMP_CNT];
/* Effective clamp values used for a task group */
struct uclamp_se uclamp[UCLAMP_CNT];
#endif
};
3.7.4.2.2. The periodic scheduler
The periodic scheduler is implemented in scheduler_tick. While the system is active, the kernel calls this function automatically with frequency HZ. If no process is waiting to be scheduled, the kernel can switch the tick off, for example when power is scarce, to reduce energy consumption; this matters a lot for embedded devices.
3.7.4.2.2.1. Main flow of the periodic scheduler
The scheduler_tick function, defined in kernel/sched/core.c, has two main tasks
update statistics: manage the scheduling-related statistics kept for the system as a whole and for the individual processes; the main operations performed are incrementing various counters
invoke the periodic scheduling method of the scheduling class of the current process: check whether the process has run longer than its ideal_runtime and, if so, signal the system that the main scheduler must be started to switch processes
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
*/
void scheduler_tick(void)
{
//get the ID of the current CPU
int cpu = smp_processor_id();
//get the cpu's global runqueue rq
struct rq *rq = cpu_rq(cpu);
//get the task curr currently running on this runqueue
struct task_struct *curr = rq->curr;
struct rq_flags rf;
sched_clock_tick();
rq_lock(rq, &rf);
//update the rq's current timestamp, i.e. set rq->clock to the current time
update_rq_clock(rq);
//invoke the task_tick function of the current process's scheduling class for periodic scheduling
curr->sched_class->task_tick(rq, curr, 0);
//update the rq's load statistics
calc_global_load_tick(rq);
psi_task_tick(rq);
rq_unlock(rq, &rf);
perf_event_task_tick();
#ifdef CONFIG_SMP
//is the current CPU idle?
rq->idle_balance = idle_cpu(cpu);
//raise SCHED_SOFTIRQ if it is time for periodic load balancing
trigger_load_balance(rq);
#endif
}
Periodic activation of the scheduler by timers
Timers are a timing service provided by Linux: they wake a process at a specific time to do some work. After each low-resolution timer interrupt has updated the global statistics, every CPU performs the following in a softirq
update the kernel-mode and user-mode execution times of the current process on that cpu
run the timer functions that are due on that cpu
run the periodic scheduling work (scheduler_tick) that handles the periodic scheduling of the tasks on that cpu
On systems that support dynamic ticks, the tick can be switched off so the CPU can enter a deep sleep. scheduler_tick checks whether the current process has been running for too long; if so, it sets the process's TIF_NEED_RESCHED flag, and on return from the interrupt schedule() is called to switch processes. For CFS, the "too long" check is sketched below.
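A simplified excerpt of check_preempt_tick(), which task_tick_fair() calls for this purpose in kernel/sched/fair.c:
    unsigned long ideal_runtime, delta_exec;

    /* the slice this entity is entitled to in the current period */
    ideal_runtime = sched_slice(cfs_rq, curr);
    /* how long it has actually run since it was last picked */
    delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
    if (delta_exec > ideal_runtime) {
        /* sets TIF_NEED_RESCHED on the current task */
        resched_curr(rq_of(cfs_rq));
        return;
    }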