4.8.1. poll机制
使用 man 2 poll
命令可查看poll的使用方法和作用。函数原型如下
intt poll(struct pollfd *fds, nfds_t nfds, int timeout);
第一个结构体代表文件描述符,第二个参数代表文件的数量,第三个参数代表超时自动唤醒时间
以下内容会涉及到等待队列的使用,可参考 kernel_note章节的kernel queue note
4.8.1.1. poll函数相关参数
struct pollfd {
int fd; /*该进程中打开文件的文件描述符*/
short events; /*表示要请求的事件类型*/
short revents; /*表示返回的事件类型*/
};
struct timspec {
__kernel_time_t tv_sec; /*seconds,以s为单位的时间*/
long tv_nsec; /*nanoseconds,以ns为单位的时间*/
};
typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);
typedef struct poll_table_struct {
poll_queue_proc _qroc;
unsigned long _key;
} poll_table;
struct poll_list {
struct poll_list *next;
int len;
struct pollfd entries[0];
};
struct poll_wqueues {
poll_table pt;
struct poll_table_page *table;
struct task_struct *polling_task;
int triggered;
int error;
int inline_index;
struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
};
其中请求事件和返回事件可以是下面的几种
事件 |
描述 |
POLLIN |
有数据可读 |
POLLRDNORM |
普通数据可读 |
POLLRDBAND |
优先级数据可读 |
POLLPRI |
高优先级数据可读 |
POLLOUT |
数据可写 |
POLLWRNORM |
普通数据可写 |
POLLWRBAND |
优先级带数据可写 |
POLLERR |
发生错误 |
POLLHUP |
发生挂起 |
POLLNVAL |
描述字不是一个打开的文件 |
注解
后三个只能作为描述字的返回结果存放在revents中,而不能作为测试条件用于events中
4.8.1.2. sys_poll函数
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
int, timeout_msecs)
{
struct timespec64 end_time, *to = NULL;
int ret;
if (timeout_msecs >= 0) { //超时时间换算,最终换算成以struct timespec结构方式计算的秒和纳秒形式
to = &end_time;
poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
}
//执行真正的系统调用
ret = do_sys_poll(ufds, nfds, to);
//如果在poll期间有信号到来,回返回-ERESTARTNOHAND,这会导致重新调用
if (ret == -ERESTARTNOHAND) {
struct restart_block *restart_block;
restart_block = ¤t->restart_block;
restart_block->fn = do_restart_poll;
restart_block->poll.ufds = ufds;
restart_block->poll.nfds = nfds;
if (timeout_msecs >= 0) {
restart_block->poll.tv_sec = end_time.tv_sec;
restart_block->poll.tv_nsec = end_time.tv_nsec;
restart_block->poll.has_timeout = 1;
} else
restart_block->poll.has_timeout = 0;
ret = -ERESTART_RESTARTBLOCK;
}
return ret;
}
4.8.1.2.1. do_sys_poll
#define FRONTEND_STACK_ALLOC 256
#define SELECT_STACK_ALLOC FRONTEND_STACK_ALLOC
#define POLL_STACK_ALLOC FRONTEND_STACK_ALLOC
#define WQUEUES_STACK_ALLOC (MAX_STACK_ALLOC - FRONTEND_STACK_ALLOC)
#define N_INLINE_POLL_ENTRIES (WQUEUES_STACK_ALLOC / sizeof(struct poll_table_entry))
#define DEFAULT_POLLMASK (EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM)
static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
struct timespec64 *end_time)
{
struct poll_wqueues table; //局部变量分配poll_wqueues
int err = -EFAULT, fdcount, len;
/* Allocate small arguments on the stack to save memory and be
faster - use long to make sure the buffer is aligned properly
on 64 bit archs to avoid unaligned access */
long stack_pps[POLL_STACK_ALLOC/sizeof(long)]; //用来存放用户空间传过来的数据
struct poll_list *const head = (struct poll_list *)stack_pps;
struct poll_list *walk = head;
unsigned long todo = nfds; //等待的事件数目
if (nfds > rlimit(RLIMIT_NOFILE)) //检查监听的文件数量是否大于进行打开的文件数
return -EINVAL;
len = min_t(unsigned int, nfds, N_STACK_PPS);
for (;;) {
walk->next = NULL;
walk->len = len;
if (!len)
break;
//将用户空间的pollfd拷贝到可存放walk的entries空间中
if (copy_from_user(walk->entries, ufds + nfds-todo,
sizeof(struct pollfd) * walk->len))
goto out_fds;
todo -= walk->len;
if (!todo)
break;
/*检查一页(4K)和剩下没拷贝的那个小,返回较小者,如果一页也不够拷贝,下轮循环再申请一页
POLLFD_PER_PAGE表示一页的内存能够存储多少个struct pollfd,可以计算一下,一页是4K,而
struct pollfd的内存占用8个字节,就是一页的内存可以将近存储512个pollfd描述符。如果
在分配一页的内存之后,还不够nfds来用,没关系,循环不会退出的,会再分配一个页,并且所有分
配的块都被struct poll_list链接起来,上面可以看到,这个结构有一个next域,就是专门做这个
的。在这之后,就会形成一个以stack_pps存储空间为头,然后一页一页分配的内存为接点的链表,
这个链表上就存储了poll调用时传入的所有的fd描述符*/
len = min(todo, POLLFD_PER_PAGE);
//动态申请len长度,和64long长度的stack_pps组成一个单向链表
walk = walk->next = kmalloc(struct_size(walk, entries, len),
GFP_KERNEL);
if (!walk) {
err = -ENOMEM;
goto out_fds;
}
}
//初始化等待队列项
poll_initwait(&table);
//执行驱动中的poll函数
fdcount = do_poll(head, &table, end_time);
//删除等待队列,释放动态申请的空间
poll_freewait(&table);
//把内核空间每个文件描述符的返回值,拷贝到用户空间数组中
for (walk = head; walk; walk = walk->next) {
struct pollfd *fds = walk->entries;
int j;
for (j = 0; j < walk->len; j++, ufds++)
if (__put_user(fds[j].revents, &ufds->revents))
goto out_fds;
}
err = fdcount;
out_fds: //释放动态申请的内存
walk = head->next;
while (walk) {
struct poll_list *pos = walk;
walk = walk->next;
kfree(pos);
}
return err;
}
4.8.1.2.1.1. poll_initwait
//初始化等待队列
void poll_initwait(struct poll_wqueues *pwq)
{
init_poll_funcptr(&pwq->pt, __pollwait);
pwq->polling_task = current;
pwq->triggered = 0;
pwq->error = 0;
pwq->table = NULL;
pwq->inline_index = 0;
}
EXPORT_SYMBOL(poll_initwait);
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
pt->_qproc = qproc; //绑定poll_wait函数
pt->_key = ~(__poll_t)0; /* all events enabled */
}
//删除等待队列中的所有项,并释放动态申请的空间
void poll_freewait(struct poll_wqueues *pwq)
{
struct poll_table_page * p = pwq->table;
int i;
for (i = 0; i < pwq->inline_index; i++)
free_poll_entry(pwq->inline_entries + i); //从等待队列中删除所有等待项
while (p) { //删除等待列表中所有动态申请的项
struct poll_table_entry * entry;
struct poll_table_page *old;
entry = p->entry;
do {
entry--;
free_poll_entry(entry);
} while (entry > p->entries);
old = p;
p = p->next;
free_page((unsigned long) old);
}
}
EXPORT_SYMBOL(poll_freewait);
4.8.1.2.1.2. do_poll
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
struct timespec64 *end_time)
{
poll_table* pt = &wait->pt;
ktime_t expire, *to = NULL;
int timed_out = 0, count = 0;
u64 slack = 0;
__poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
unsigned long busy_start = 0;
/* Optimise the no-wait case */
if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
pt->_qproc = NULL;
timed_out = 1;
}
if (end_time && !timed_out)
slack = select_estimate_accuracy(end_time);
for (;;) {
struct poll_list *walk;
bool can_busy_loop = false;
//执行每个list链表项中的每个pollfd项
for (walk = list; walk != NULL; walk = walk->next) {
struct pollfd * pfd, * pfd_end;
pfd = walk->entries;
pfd_end = pfd + walk->len;
//每个链表中的entries上可能会挂载多个struct pollfd
for (; pfd != pfd_end; pfd++) {
/*
* Fish for events. If we found one, record it
* and kill poll_table->_qproc, so we don't
* needlessly register any other waiters after
* this. They'll get immediately deregistered
* when we break out and return.
*/
//处理当前进程的一个fd的poll操作,返回非0值表示有事件发生(或者错误)
if (do_pollfd(pfd, pt, &can_busy_loop,
busy_flag)) {
count++;
pt->_qproc = NULL;
/* found something, stop busy polling */
busy_flag = 0;
can_busy_loop = false;
}
}
}
/*
* All waiters have already been registered, so don't provide
* a poll_table->_qproc to them on the next loop iteration.
*/
pt->_qproc = NULL;
if (!count) {
count = wait->error;
if (signal_pending(current))
count = -ERESTARTNOHAND;
}
if (count || timed_out)
break;
/* only if found POLL_BUSY_LOOP sockets && not out of time */
if (can_busy_loop && !need_resched()) {
if (!busy_start) {
busy_start = busy_loop_current_time(); //如果有us级别的等待,不睡眠而是忙等(我们的调度是ms级别的)
continue;
}
if (!busy_loop_timeout(busy_start))
continue;
}
busy_flag = 0;
/*
* If this is the first loop and we have a timeout
* given, then we convert to ktime_t and set the to
* pointer to the expiry value.
*/
if (end_time && !to) {
expire = timespec64_to_ktime(*end_time);
to = &expire;
}
if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
timed_out = 1;
}
return count;
}
4.8.1.2.1.3. do_pollfd
static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
bool *can_busy_poll,
__poll_t busy_flag)
{
int fd = pollfd->fd; //得到文件描述符
__poll_t mask = 0, filter;
struct fd f;
if (fd < 0)
goto out;
mask = EPOLLNVAL;
f = fdget(fd); //通过文件描述符得到文件信息
if (!f.file)
goto out;
/* userland u16 ->events contains POLL... bitmap */
filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP; //错误和热插拔事件是必须要的
pwait->_key = filter | busy_flag;
mask = vfs_poll(f.file, pwait); //执行驱动中的poll函数
if (mask & busy_flag)
*can_busy_poll = true;
mask &= filter; /* Mask out unneeded events. */
fdput(f);
out:
/* ... and so does ->revents */
pollfd->revents = mangle_poll(mask);
return mask;
}
static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
{
if (unlikely(!file->f_op->poll))
return DEFAULT_POLLMASK;
return file->f_op->poll(file, pt);
}
4.8.1.3. 驱动程序
我们看一下驱动程序做了什么
static unsigned int button_drv_poll(struct file *file, struct poll_table_struct *wait)
{
int mask = 0;
//将进程挂载button_waitq队列上,不是在这里睡眠
poll_wait(file, &button_wait_q, wait);
if(ev_press)
{
msk = POLLIN | POLLRDNORM;
}
return mask;
}
poll_wait函数
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
if (p && p->_qproc && wait_address) /* 检查等待队列头和file中的_qproc存在 */
p->_qproc(filp, wait_address, p); /* 执行该函数 */
}
其中p->qproc是在初始化table的时候绑定的一个函数指针, poll_initwait(&table)函数将__pollwait绑定在_qproc上
/*Add a new entry*/
//__pollwait会把当前进程挂接到等待队列上,并不会睡眠该进程
//一旦有I/O事件到来,等待队列将被唤醒,就会唤醒等待队列上的进程
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
poll_table *p)
{
struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
struct poll_table_entry *entry = poll_get_entry(pwq);
if (!entry)
return;
entry->filp = get_file(filp);
entry->wait_address = wait_address; //驱动程序中的等待队列头
entry->key = p->_key; //设置等待的事件
init_waitqueue_func_entry(&entry->wait, pollwake); //绑定函数唤醒函数
entry->wait.private = pwq;
add_wait_queue(wait_address, &entry->wait); //把等待项加入等待队列
}
唤醒函数
static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
struct poll_wqueues *pwq = wait->private;
DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
/*
* Although this function is called under waitqueue lock, LOCK
* doesn't imply write barrier and the users expect write
* barrier semantics on wakeup functions. The following
* smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
* and is paired with smp_store_mb() in poll_schedule_timeout.
*/
smp_wmb();
pwq->triggered = 1;
/*
* Perform the default wake up operation using a dummy
* waitqueue.
*
* TODO: This is hacky but there currently is no interface to
* pass in @sync. @sync is scheduled to be removed and once
* that happens, wake_up_process() can be used directly.
*/
return default_wake_function(&dummy_wait, mode, sync, key);
}
static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
struct poll_table_entry *entry;
entry = container_of(wait, struct poll_table_entry, wait);
if (key && !(key_to_poll(key) & entry->key))
return 0;
return __pollwake(wait, mode, sync, key);
}