为什么epoll是线程安全的?

这个问题可以直接看一下linux的代码, 目前最新的稳定版本是v4.13, epoll的代码在fs/eventpoll.c

结论:

简要结论就是epoll是通过锁来保证线程安全的, epoll中粒度最小的自旋锁ep->lock(spinlock)用来保护就绪的队列, 互斥锁ep->mtx用来保护epoll的重要数据结构红黑树


先来看一下epoll的核心数据结构

/*
 * Excerpt of struct eventpoll showing only the members discussed in this
 * article; "..." marks members that have been elided.
 */
struct eventpoll {
	...
	/*
	 * Spinlock. In this article's excerpts it is taken around additions
	 * to the ready list (rdllist).
	 */
	spinlock_t lock;
		
	/*
	 * Mutex. In this article's excerpts it is held around operations on
	 * the red-black tree (rbr).
	 */
	struct mutex mtx;

	/* List of ready file descriptors */
	struct list_head rdllist;

	/* RB tree root used to store monitored fd structs */
	struct rb_root_cached rbr;
	...
};


其中epoll主要需要考虑是否线程安全的接口是epoll_ctl() 和 epoll_wait()

epoll_ctl()

以下代码即为epoll_ctl()接口的实现, 当需要根据不同的operation通过ep_insert() 或者ep_remove()等接口对epoll自身的数据结构进行操作时, 都提前获得了ep->mtx锁.

/ * epoll_ctl() 接口 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
	struct epoll_event __user *, event)
{
	...
	/* 获得 mtx 锁 */
	mutex_lock_nested(&ep->mtx, 0);
	...

	epi = ep_find(ep, tf.file, fd);

	error = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:
		/* 
		 * 通过ep_insert()接口来完成EPOLL_CTL_ADD的操作
		 * /
		if (!epi) {
			epds.events |= POLLERR | POLLHUP;
			error = ep_insert(ep, &epds, tf.file, fd, full_check);
		} else
			error = -EEXIST;
		if (full_check)
			clear_tfile_check_list();
		break;
	case EPOLL_CTL_DEL:
		/* 
		 * 通过ep_insert()接口来完成EPOLL_CTL_ADD的操作
		 * /
		if (epi)
			error = ep_remove(ep, epi);
		else
			error = -ENOENT;
		break;
	case EPOLL_CTL_MOD:
		/* 
		 * 通过ep_insert()接口来完成EPOLL_CTL_ADD的操作
		 * /
		if (epi) {
			if (!(epi->event.events & EPOLLEXCLUSIVE)) {
				epds.events |= POLLERR | POLLHUP;
				error = ep_modify(ep, epi, &epds);
			}
		} else
			error = -ENOENT;
		break;
	}
	if (tep != NULL)
		mutex_unlock(&tep->mtx);
	mutex_unlock(&ep->mtx);		/* 释放mtx锁 */
	...
}


挑选其中EPOLL_CTL_ADD来简单看一下ep_insert()的实现

/*
 * ep_insert() (excerpt; "..." marks elided code).
 *
 * Only the red-black tree insertion is shown, to illustrate which lock
 * covers it.
 */
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
		     struct file *tfile, int fd, int full_check)
{
	...
	/*
	 * Add the current item to the RB tree. All RB tree operations are
	 * protected by "mtx", and ep_insert() is called with "mtx" held.
	 *
	 * In other words, by the time ep_rbtree_insert() touches the
	 * red-black tree, the caller has already acquired ep->mtx.
	 */
	ep_rbtree_insert(ep, epi);

	...
}

所以当epoll操作红黑树时, 是通过ep->mtx来保证线程安全的

epoll_wait()

再来看看epoll_wait()的代码实现


/*
 * epoll_wait() system call (excerpt; "..." marks elided code).
 *
 * Only the call that performs the actual wait is shown.
 */
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout)
{
	...
	/* Time to fish for events ... */
	/* epoll_wait() delegates the wait for ready events to ep_poll(). */
	error = ep_poll(ep, events, maxevents, timeout);
	...
}


epoll_wait()既然是通过ep_poll()接口来等待就绪的fd队列,那可以通过ep_poll_callback()回调函数来看一下ep_poll()背后的机制到底是什么?

/*
 * ep_poll_callback() (excerpt; "..." marks elided code).
 *
 * Wait-queue callback: resolves the epitem and its owning eventpoll from the
 * wait-queue entry, then queues the item on the eventpoll's ready list while
 * holding ep->lock.
 */
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	int pwake = 0;
	unsigned long flags;
	struct epitem *epi = ep_item_from_wait(wait);
	struct eventpoll *ep = epi->ep;
	int ewake = 0;
	
	/*
	 * Take the ep->lock spinlock (saving the IRQ state in flags) to
	 * protect the ready list.
	 * NOTE(review): the matching unlock lies in the elided part of this
	 * excerpt; in fs/eventpoll.c it appears to be done by this callback
	 * itself (spin_unlock_irqrestore) before returning, not by
	 * ep_poll() - confirm against the kernel source.
	 */
	spin_lock_irqsave(&ep->lock, flags);

	/* If this file is already in the ready list we exit soon */
	/* Otherwise the ready item is appended to rdllist here. */
	if (!ep_is_linked(&epi->rdllink)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);
		ep_pm_stay_awake_rcu(epi);
	}
	...
}

编辑于 2017-11-11