引言

这篇文章主要对于epoll_create以及epoll中所使用的重要数据结构进行分析,这些是在后续文章中搞懂epoll为何如此高效的前提.我们首先来看看两个数据结构.

struct eventpoll {
    /* Protect the access to this structure */
    spinlock_t lock; // spinlock giving fine-grained protection of this structure's fields

    /*
     * This mutex is used to ensure that files are not removed
     * while epoll is using them. This is held during the event
     * collection loop, the file cleanup path, the epoll file exit
     * code and the ctl operations.
     */
    struct mutex mtx; 
    // Main purpose: keep monitored files from being removed while epoll
    // is still using them; held while collecting events and during ctl operations.

    /* Wait queue used by sys_epoll_wait() */
    wait_queue_head_t wq; // tasks blocked in epoll_wait are queued here

    /* Wait queue used by file->poll() */
    wait_queue_head_t poll_wait;  // used when the epoll fd itself is being polled

    /* List of ready file descriptors */
    struct list_head rdllist; // list of items whose events are ready

    /* RB tree root used to store monitored fd structs */
    struct rb_root rbr; // root of the red-black tree holding every monitored fd

    /*
     * This is a single linked list that chains all the "struct epitem" that
     * happened while transferring ready events to userspace w/out
     * holding ->lock.
     */
    struct epitem *ovflist; 
    // While the ready list is being copied out to userspace, items that
    // become ready in the meantime are chained here instead, so that the
    // transfer itself does not have to hold ->lock.

    /* The user that created the eventpoll descriptor */
    struct user_struct *user;
};

/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree.
 *
 * In other words: one epitem is created per monitored fd.
 */
struct epitem {
    /* RB tree node used to link this structure to the eventpoll RB tree */
    struct rb_node rbn;
    // Node inside the tree whose root is "struct eventpoll"->rbr.

    /* List header used to link this structure to the eventpoll ready list */
    struct list_head rdllink; // ready items are linked into eventpoll->rdllist through this

    /*
     * Works together "struct eventpoll"->ovflist in keeping the
     * single linked chain of items.
     */
    struct epitem *next;

    /* The file descriptor information this item refers to */
    struct epoll_filefd ffd;
    // NOTE(review): presumably the (file, fd) pair used as the ordering key
    // in the RB tree - confirm against the tree insert/lookup code.

    /* Number of active wait queue attached to poll operations */
    int nwait; // count of wait queues attached, not a count of callbacks fired
    /* List containing poll wait queues */
    struct list_head pwqlist; // poll wait queues belonging to this item

    /* The "container" of this item */
    struct eventpoll *ep; // owning eventpoll (many epitems belong to one eventpoll)

    /* List header used to link this item to the "struct file" items list */
    // NOTE(review): presumably chained onto a per-file list of all epitems
    // watching that file - confirm against struct file.
    struct list_head fllink;

    /* The structure that describe the interested events and the source fd */
    struct epoll_event event; // the registered events, as supplied from userspace
};

接下来我们来看看epoll_create到底干了什么

SYSCALL_DEFINE1(epoll_create, int, size)
{
    /*
     * Legacy entry point. The size argument is only validated (it must be
     * positive) and is not passed on; the actual work is delegated to
     * sys_epoll_create1() with no flags set.
     */
    if (size > 0)
        return sys_epoll_create1(0);

    return -EINVAL;
}

SYSCALL_DEFINE1(epoll_create1, int, flags)
{
    struct eventpoll *ep = NULL;
    int fd_flags;
    int ret;

    /* Compile-time check that EPOLL_CLOEXEC and O_CLOEXEC share one value. */
    BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

    /* EPOLL_CLOEXEC is the only flag bit accepted. */
    if ((flags & ~EPOLL_CLOEXEC) != 0)
        return -EINVAL;

    /* Allocate and initialise the internal "struct eventpoll". */
    ret = ep_alloc(&ep);
    if (ret < 0)
        return ret;

    /*
     * Create the file and fd backing this eventpoll. Close-on-exec is
     * requested only when the caller passed it in: the mask below forwards
     * the caller's bit and never turns it on by itself.
     */
    fd_flags = O_RDWR | (flags & O_CLOEXEC);
    ret = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, fd_flags);
    if (ret < 0)
        ep_free(ep); /* no fd was produced, so release the eventpoll again */

    /* Either the new fd or a negative error code. */
    return ret;
}

/*
 * Allocate a zeroed "struct eventpoll", initialise its members and hand it
 * back through *pep (a pointer-to-pointer so the caller's pointer can be set).
 * Returns 0 on success, or -ENOMEM when the allocation fails, in which case
 * *pep is left untouched.
 */
static int ep_alloc(struct eventpoll **pep)
{
    struct user_struct *owner = get_current_user();
    struct eventpoll *new_ep = kzalloc(sizeof(*new_ep), GFP_KERNEL);

    if (unlikely(!new_ep)) {
        /* Give back the user reference taken above. */
        free_uid(owner);
        return -ENOMEM;
    }

    /* Locks. */
    spin_lock_init(&new_ep->lock);
    mutex_init(&new_ep->mtx);

    /* Wait queues. */
    init_waitqueue_head(&new_ep->wq);
    init_waitqueue_head(&new_ep->poll_wait);

    /* Containers: empty ready list and empty RB tree. */
    INIT_LIST_HEAD(&new_ep->rdllist);
    new_ep->rbr = RB_ROOT;

    /* NOTE(review): EP_UNACTIVE_PTR looks like a sentinel meaning "overflow
     * list not in use" - confirm against the event transfer path. */
    new_ep->ovflist = EP_UNACTIVE_PTR;
    new_ep->user = owner;

    *pep = new_ep;
    return 0;
}

void init_waitqueue_head(wwait_queue_head_t *q)
{
    spin_lock_init(&q->lock);
    INIT_LIST_HEAD(&q->task_list);
}

我们先把调用的过程放出来,也就是我们传入的参数.

 anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
                 O_RDWR | (flags & O_CLOEXEC));

/*
 * Reserve an unused fd, create an anonymous file for it and bind the two.
 * Returns the new fd on success or a negative error code; on failure the
 * reserved fd is released again.
 */
int anon_inode_getfd(const char *name, const struct file_operations *fops,
             void *priv, int flags)
{
    struct file *anon_file;
    int fd;

    /* Step 1: reserve a descriptor number. */
    fd = get_unused_fd_flags(flags);
    if (fd < 0)
        return fd;

    /*
     * Step 2: create the anonymous file.
     * NOTE(review): per the article, files made by anon_inode_getfile() all
     * share a single inode and priv ends up in file->private_data - neither
     * is visible in this function, confirm in anon_inode_getfile().
     */
    anon_file = anon_inode_getfile(name, fops, priv, flags);
    if (IS_ERR(anon_file)) {
        int err = PTR_ERR(anon_file);

        put_unused_fd(fd);
        return err;
    }

    /* Step 3: attach the file pointer to the reserved descriptor. */
    fd_install(fd, anon_file);
    return fd;
}

我们发现get_unused_fd_flags是一个宏

/* Allocate an unused fd, searching from 0, with the given flags. */
#define get_unused_fd_flags(flags) alloc_fd(0, (flags))

我们再去看看alloc_fd

/*
 * Allocate a file descriptor number in the current task's file table.
 *
 * start: lowest descriptor number that may be returned.
 * flags: if O_CLOEXEC is set the descriptor is marked close-on-exec,
 *        otherwise that mark is cleared.
 *
 * Returns the allocated descriptor, or a negative error code coming from
 * expand_files(). Only the number is reserved here; no file is installed.
 */
int alloc_fd(unsigned start, unsigned flags)
{    // the descriptor is taken from the current task's files_struct
    struct files_struct *files = current->files;
    unsigned int fd;
    int error;
    struct fdtable *fdt;

    spin_lock(&files->file_lock); 
repeat:
    fdt = files_fdtable(files); // fetch the fd table (re-read on every retry)
    fd = start;
    if (fd < files->next_fd)
        fd = files->next_fd;

    if (fd < fdt->max_fds)
        // look for the next clear bit, i.e. a descriptor that is not in use
        fd = find_next_zero_bit(fdt->open_fds->fds_bits,
                       fdt->max_fds, fd);

    // make sure the table is large enough to hold fd
    error = expand_files(files, fd);
    if (error < 0)
        goto out;
    // a non-zero, non-negative result restarts the search from the top
    /*
     * If we needed to expand the fs array we
     * might have blocked - try again.
     */
    if (error)
        goto repeat;

    // advance the next_fd search hint when the search began at or below it
    if (start <= files->next_fd)
        files->next_fd = fd + 1;

    // mark the descriptor as open and apply the close-on-exec choice
    FD_SET(fd, fdt->open_fds);
    if (flags & O_CLOEXEC)
        FD_SET(fd, fdt->close_on_exec);
    else
        FD_CLR(fd, fdt->close_on_exec);
    error = fd;
#if 1
    /* Sanity check */
    // the slot of a freshly allocated fd should be empty; warn and clear it if not
    if (rcu_dereference_raw(fdt->fd[fd]) != NULL) {
        printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
        rcu_assign_pointer(fdt->fd[fd], NULL);
    }
#endif

out:
    spin_unlock(&files->file_lock);
    return error;
}

总结

到了这里对于epoll_create的分析就结束了,其实这样想来并不困难,就是先对eventpoll对象调用ep_alloc分配空间,然后通过anon_inode_getfd中的get_unused_fd_flags获取文件描述符,再通过anon_inode_getfile获取file指针,这里做了一个小小的优化,即其实所有的epoll虽然fd不同,但其file指针共享一个inode.这样epoll_create的调用就完成了.这里其实有两个对我们平时写代码有所帮助的点:

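epoll_create唯一的参数size确实没什么用处(内核只检查它是否大于0),我们可以直接使用epoll_create1,还少一个函数调用.
epoll_create1的flags只接受EPOLL_CLOEXEC(其值等于O_CLOEXEC).内核只是把用户传入的这一位原样传下去(flags & O_CLOEXEC),并不会自动帮我们加上;所以需要close-on-exec时仍然要显式传入EPOLL_CLOEXEC.而通过epoll_create创建的fd一定不带该标志,因为它内部调用的是epoll_create1(0).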

我在其他博主的文章中看到了这样的说法:

在内核里，一切皆文件。所以，epoll向内核注册了一个文件系统，用于存储上述的被监控的fd.

每个版本内核可能都会修改.我在eventpoll.c中也找到了eventpoll_init的函数体,它没有被其他函数直接调用,而是通过文件末尾的fs_initcall(eventpoll_init)注册,在内核初始化阶段执行;我也查阅了比较新的内核版本中的实现(4.18),同样如此.从我所展示的示例代码的内核版本的实现,即2.6.38来看,其实并没有创建一个文件系统,只是从当前进程中获取fd,并通过匿名inode得到file指针(特殊的匿名文件).

原文链接: https://www.cnblogs.com/lizhaolong/p/16437327.html

欢迎关注

微信关注下方公众号，第一时间获取干货硬货；公众号内回复【pdf】免费获取数百本计算机经典书籍;

也有高质量的技术群，里面有嵌入式、搜广推等BAT大佬

原创文章受到原创版权保护。转载请注明出处：https://www.ccppcoding.com/archives/395987

非原创文章文中已经注明原地址，如有侵权，联系删除

关注公众号【高性能架构探索】，第一时间获取最新文章

转载文章受原作者版权保护。转载请注明原作者出处！

epoll源码解析(1) epoll_create

引言

总结

相关推荐