一、epoll的应用
epoll在实际场景的应用是非常多的,特别是开源的框架中,基本都支持这种用法。大家可以在网上轻松地得到各种形式的epoll的封装代码,但是一定要明白的是,这些代码中哪些是利用了epoll的机制,哪些是上层多线程的封装,哪些又是数据处理的封装。甚至有些可能做了一些其它的辅助动作,如修改了一些内核的参数等等。
学习epoll,除了理解前面分析的各种内部原理和机制,更要根据实际情况进行应用。学不能致用,学和不学没有意义;用而不思考其中的技术,则遇到问题无从下手。
二、数据结构和API
1、epoll主要的数据结构如下:
/*
 * Event record exchanged with user space: an event mask plus a 64-bit
 * value stored alongside it.
 */
struct epoll_event {
	__poll_t events;	/* bitmask of poll events */
	__u64 data;		/* 64-bit payload kept together with the mask */
} EPOLL_PACKED;
/*
 * One entry of an eventpoll's interest set: it ties a monitored file
 * descriptor (ffd) to the owning eventpoll (ep) and to the event mask
 * requested for it (event).
 */
struct epitem {
	union {
		/* RB tree node links this structure to the eventpoll RB tree */
		struct rb_node rbn;
		/* Used to free the struct epitem */
		struct rcu_head rcu;
	};
	/* List header used to link this structure to the eventpoll ready list */
	struct list_head rdllink;
	/*
	 * Works together "struct eventpoll"->ovflist in keeping the
	 * single linked chain of items.
	 */
	struct epitem *next;
	/* The file descriptor information this item refers to */
	struct epoll_filefd ffd;
	/* List containing poll wait queues */
	struct eppoll_entry *pwqlist;
	/* The "container" of this item */
	struct eventpoll *ep;
	/* List header used to link this item to the "struct file" items list */
	struct hlist_node fllink;
	/* wakeup_source used when EPOLLWAKEUP is set */
	struct wakeup_source __rcu *ws;
	/* The structure that describe the interested events and the source fd */
	struct epoll_event event;
};
/*
 * Per-instance epoll state. do_epoll_create() allocates one of these and
 * stores it as the private data of the "[eventpoll]" anonymous file.
 */
struct eventpoll {
	/*
	 * This mutex is used to ensure that files are not removed
	 * while epoll is using them. This is held during the event
	 * collection loop, the file cleanup path, the epoll file exit
	 * code and the ctl operations.
	 */
	struct mutex mtx;
	/* Wait queue used by sys_epoll_wait() */
	wait_queue_head_t wq;
	/* Wait queue used by file->poll() */
	wait_queue_head_t poll_wait;
	/* List of ready file descriptors */
	struct list_head rdllist;
	/* Lock which protects rdllist and ovflist */
	rwlock_t lock;
	/* RB tree root used to store monitored fd structs */
	struct rb_root_cached rbr;
	/*
	 * This is a single linked list that chains all the "struct epitem" that
	 * happened while transferring ready events to userspace w/out
	 * holding ->lock.
	 */
	struct epitem *ovflist;
	/* wakeup_source used when ep_scan_ready_list is running */
	struct wakeup_source *ws;
	/* The user that created the eventpoll descriptor */
	struct user_struct *user;
	/* Backing file of this instance; assigned in do_epoll_create() */
	struct file *file;
	/* used to optimize loop detection check */
	u64 gen;
	/* NOTE(review): purpose not visible in this excerpt - confirm in fs/eventpoll.c */
	struct hlist_head refs;
#ifdef CONFIG_NET_RX_BUSY_POLL
	/* used to track busy poll napi_id */
	unsigned int napi_id;
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* tracks wakeup nests for lockdep validation */
	u8 nests;
#endif
};
/*
 * Wrapper struct used by poll queueing: pairs a poll_table with the
 * epitem it is being queued for.
 */
struct ep_pqueue {
	poll_table pt;		/* poll table handed to the poll machinery */
	struct epitem *epi;	/* item associated with this poll table */
};
2、其API主要有三个
/* Create an epoll instance; the kernel entry point rejects size <= 0. */
int epoll_create(int size);
/* Apply operation `op` (add / modify / delete) for `fd` on the epoll instance `epfd`. */
int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
/* Wait on `epfd`, storing at most `maxevents` entries in `events`; the Redis caller below passes `timeout` in milliseconds, -1 to wait forever. */
int epoll_wait(int epfd, struct epoll_event *events,int maxevents, int timeout);
其调用的是内核中的代码:
//fs/eventpoll.c
/*
 * Open an eventpoll file descriptor.
 *
 * @flags: only EPOLL_CLOEXEC is accepted; any other bit gives -EINVAL.
 *
 * Returns the new file descriptor on success, or a negative error code.
 * On failure everything acquired so far is released in reverse order via
 * the out_free_* labels.
 */
static int do_epoll_create(int flags)
{
	int error, fd;
	struct eventpoll *ep = NULL;
	struct file *file;
	/* Check the EPOLL_* constant for consistency. */
	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
	if (flags & ~EPOLL_CLOEXEC)
		return -EINVAL;
	/*
	 * Create the internal data structure ("struct eventpoll").
	 */
	error = ep_alloc(&ep);
	if (error < 0)
		return error;
	/*
	 * Creates all the items needed to setup an eventpoll file. That is,
	 * a file structure and a free file descriptor.
	 */
	fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
	if (fd < 0) {
		error = fd;
		goto out_free_ep;
	}
	file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
				 O_RDWR | (flags & O_CLOEXEC));
	if (IS_ERR(file)) {
		error = PTR_ERR(file);
		goto out_free_fd;
	}
	/* Link the eventpoll to its file, then publish the descriptor. */
	ep->file = file;
	fd_install(fd, file);
	return fd;
out_free_fd:
	put_unused_fd(fd);
out_free_ep:
	ep_free(ep);
	return error;
}
/*
 * epoll_create1 system call: forwards the caller's flags unchanged to
 * do_epoll_create(), which validates them.
 */
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
	return do_epoll_create(flags);
}
/*
 * epoll_create system call. The size argument is only validated (it must be
 * positive) and is otherwise unused: the instance is created with no flags.
 */
SYSCALL_DEFINE1(epoll_create, int, size)
{
	return (size > 0) ? do_epoll_create(0) : -EINVAL;
}
/*
 * Acquire @mutex either blocking or non-blocking.
 *
 * @depth:    value passed to mutex_lock_nested() on the blocking path.
 * @nonblock: when true only a trylock is attempted.
 *
 * Returns 0 once the mutex is held, or -EAGAIN when @nonblock is set and
 * the trylock did not succeed.
 */
static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
				   bool nonblock)
{
	if (nonblock)
		return mutex_trylock(mutex) ? 0 : -EAGAIN;

	mutex_lock_nested(mutex, depth);
	return 0;
}
/*
 * Core of epoll_ctl(): add, modify or delete the registration of @fd in the
 * eventpoll instance referred to by @epfd.
 *
 * @epfd:     descriptor of the eventpoll file.
 * @op:       EPOLL_CTL_ADD, EPOLL_CTL_DEL or EPOLL_CTL_MOD.
 * @fd:       target descriptor.
 * @epds:     event description; only read for operations for which
 *            ep_op_has_event() is true.
 * @nonblock: when true, mutexes are taken with trylock and -EAGAIN is
 *            returned instead of sleeping.
 *
 * Returns 0 or the result of ep_insert/ep_remove/ep_modify on the normal
 * path, otherwise a negative error code (-EBADF, -EPERM, -EINVAL, -ELOOP,
 * -EEXIST, -ENOENT, -EAGAIN as set below).
 */
int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
		 bool nonblock)
{
	int error;
	int full_check = 0;	/* set once the global epmutex has been taken */
	struct fd f, tf;
	struct eventpoll *ep;
	struct epitem *epi;
	struct eventpoll *tep = NULL;
	error = -EBADF;
	f = fdget(epfd);
	if (!f.file)
		goto error_return;
	/* Get the "struct file *" for the target file */
	tf = fdget(fd);
	if (!tf.file)
		goto error_fput;
	/* The target file descriptor must support poll */
	error = -EPERM;
	if (!file_can_poll(tf.file))
		goto error_tgt_fput;
	/* Check if EPOLLWAKEUP is allowed */
	if (ep_op_has_event(op))
		ep_take_care_of_epollwakeup(epds);
	/*
	 * We have to check that the file structure underneath the file descriptor
	 * the user passed to us _is_ an eventpoll file. And also we do not permit
	 * adding an epoll file descriptor inside itself.
	 */
	error = -EINVAL;
	if (f.file == tf.file || !is_file_epoll(f.file))
		goto error_tgt_fput;
	/*
	 * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
	 * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
	 * Also, we do not currently support nested exclusive wakeups.
	 */
	if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
		if (op == EPOLL_CTL_MOD)
			goto error_tgt_fput;
		if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
				(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
			goto error_tgt_fput;
	}
	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = f.file->private_data;
	/*
	 * When we insert an epoll file descriptor inside another epoll file
	 * descriptor, there is the chance of creating closed loops, which are
	 * better be handled here, than in more critical paths. While we are
	 * checking for loops we also determine the list of files reachable
	 * and hang them on the tfile_check_list, so we can check that we
	 * haven't created too many possible wakeup paths.
	 *
	 * We do not need to take the global 'epmutex' on EPOLL_CTL_ADD when
	 * the epoll file descriptor is attaching directly to a wakeup source,
	 * unless the epoll file descriptor is nested. The purpose of taking the
	 * 'epmutex' on add is to prevent complex topologies such as loops and
	 * deep wakeup paths from forming in parallel through multiple
	 * EPOLL_CTL_ADD operations.
	 */
	error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
	if (error)
		goto error_tgt_fput;
	if (op == EPOLL_CTL_ADD) {
		if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen ||
		    is_file_epoll(tf.file)) {
			/*
			 * Drop ep->mtx before taking epmutex; ep->mtx is
			 * re-acquired below once the loop check has passed.
			 */
			mutex_unlock(&ep->mtx);
			error = epoll_mutex_lock(&epmutex, 0, nonblock);
			if (error)
				goto error_tgt_fput;
			loop_check_gen++;
			/* From here on the exit path must release epmutex. */
			full_check = 1;
			if (is_file_epoll(tf.file)) {
				tep = tf.file->private_data;
				error = -ELOOP;
				if (ep_loop_check(ep, tep) != 0)
					goto error_tgt_fput;
			}
			error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
			if (error)
				goto error_tgt_fput;
		}
	}
	/*
	 * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
	 * above, we can be sure to be able to use the item looked up by
	 * ep_find() till we release the mutex.
	 */
	epi = ep_find(ep, tf.file, fd);
	/* Default result, e.g. for an unknown op or MOD on an exclusive item. */
	error = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:
		if (!epi) {
			/* EPOLLERR and EPOLLHUP are always added to the mask. */
			epds->events |= EPOLLERR | EPOLLHUP;
			error = ep_insert(ep, epds, tf.file, fd, full_check);
		} else
			error = -EEXIST;
		break;
	case EPOLL_CTL_DEL:
		if (epi)
			error = ep_remove(ep, epi);
		else
			error = -ENOENT;
		break;
	case EPOLL_CTL_MOD:
		if (epi) {
			/* Items registered with EPOLLEXCLUSIVE keep -EINVAL. */
			if (!(epi->event.events & EPOLLEXCLUSIVE)) {
				epds->events |= EPOLLERR | EPOLLHUP;
				error = ep_modify(ep, epi, epds);
			}
		} else
			error = -ENOENT;
		break;
	}
	mutex_unlock(&ep->mtx);
error_tgt_fput:
	/* Undo the full-check state only if epmutex was actually taken. */
	if (full_check) {
		clear_tfile_check_list();
		loop_check_gen++;
		mutex_unlock(&epmutex);
	}
	fdput(tf);
error_fput:
	fdput(f);
error_return:
	return error;
}
/*
 * The following function implements the controller interface for
 * the eventpoll file that enables the insertion/removal/change of
 * file descriptors inside the interest set.
 *
 * The user's event is copied in only when ep_op_has_event(op) is true;
 * for other operations epds is passed down uninitialized, and
 * do_epoll_ctl() only reads it under that same condition.
 * Returns -EFAULT if the copy from user space fails.
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
		struct epoll_event __user *, event)
{
	struct epoll_event epds;
	if (ep_op_has_event(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
		return -EFAULT;
	/* The syscall path always uses blocking lock acquisition. */
	return do_epoll_ctl(epfd, op, fd, &epds, false);
}
/*
 * Implement the event wait interface for the eventpoll file. It is the kernel
 * part of the user space epoll_wait(2).
 *
 * @epfd:      descriptor of the eventpoll file.
 * @events:    user buffer that receives the events.
 * @maxevents: capacity of @events; must be in 1..EP_MAX_EVENTS.
 * @to:        timeout handed through to ep_poll().
 *
 * Returns whatever ep_poll() returns, or -EINVAL / -EFAULT / -EBADF when
 * the arguments fail the checks below.
 */
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
			 int maxevents, struct timespec64 *to)
{
	int error;
	struct fd f;
	struct eventpoll *ep;
	/* The maximum number of event must be greater than zero */
	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
		return -EINVAL;
	/* Verify that the area passed by the user is writeable */
	if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
		return -EFAULT;
	/* Get the "struct file *" for the eventpoll file */
	f = fdget(epfd);
	if (!f.file)
		return -EBADF;
	/*
	 * We have to check that the file structure underneath the fd
	 * the user passed to us _is_ an eventpoll file.
	 */
	error = -EINVAL;
	if (!is_file_epoll(f.file))
		goto error_fput;
	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = f.file->private_data;
	/* Time to fish for events ... */
	error = ep_poll(ep, events, maxevents, to);
error_fput:
	fdput(f);
	return error;
}
/*
 * epoll_wait system call: converts the integer timeout with
 * ep_timeout_to_timespec() and delegates to do_epoll_wait().
 */
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout)
{
	struct timespec64 to;
	return do_epoll_wait(epfd, events, maxevents,
			     ep_timeout_to_timespec(&to, timeout));
}
/*
 * Implement the event wait interface for the eventpoll file. It is the kernel
 * part of the user space epoll_pwait(2).
 *
 * Installs the caller's signal mask, waits through do_epoll_wait(), and then
 * calls restore_saved_sigmask_unless() so that the saved mask is restored
 * unless the wait returned -EINTR.
 *
 * Returns the error from set_user_sigmask() if that fails, otherwise the
 * result of do_epoll_wait().
 */
static int do_epoll_pwait(int epfd, struct epoll_event __user *events,
			  int maxevents, struct timespec64 *to,
			  const sigset_t __user *sigmask, size_t sigsetsize)
{
	int error;
	/*
	 * If the caller wants a certain signal mask to be set during the wait,
	 * we apply it here.
	 */
	error = set_user_sigmask(sigmask, sigsetsize);
	if (error)
		return error;
	error = do_epoll_wait(epfd, events, maxevents, to);
	restore_saved_sigmask_unless(error == -EINTR);
	return error;
}
/*
 * epoll_pwait system call: same as epoll_wait but with a signal mask; the
 * integer timeout is converted with ep_timeout_to_timespec().
 */
SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout, const sigset_t __user *, sigmask,
		size_t, sigsetsize)
{
	struct timespec64 to;
	return do_epoll_pwait(epfd, events, maxevents,
			      ep_timeout_to_timespec(&to, timeout),
			      sigmask, sigsetsize);
}
/*
 * epoll_pwait2 system call: like epoll_pwait but the timeout is a
 * user-space __kernel_timespec pointer instead of an integer.
 *
 * A NULL timeout pointer leaves `to` NULL when calling do_epoll_pwait().
 * Returns -EFAULT if the timespec cannot be read and -EINVAL if
 * poll_select_set_timeout() rejects it.
 */
SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events,
		int, maxevents, const struct __kernel_timespec __user *, timeout,
		const sigset_t __user *, sigmask, size_t, sigsetsize)
{
	struct timespec64 ts, *to = NULL;
	if (timeout) {
		if (get_timespec64(&ts, timeout))
			return -EFAULT;
		to = &ts;
		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
			return -EINVAL;
	}
	return do_epoll_pwait(epfd, events, maxevents, to,
			      sigmask, sigsetsize);
}
#ifdef CONFIG_COMPAT
/*
 * Compat variant of do_epoll_pwait(): identical flow, but the signal mask is
 * a compat_sigset_t installed through set_compat_user_sigmask().
 *
 * Returns the error from set_compat_user_sigmask() if that fails, otherwise
 * the result of do_epoll_wait().
 */
static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events,
				 int maxevents, struct timespec64 *timeout,
				 const compat_sigset_t __user *sigmask,
				 compat_size_t sigsetsize)
{
	long err;
	/*
	 * If the caller wants a certain signal mask to be set during the wait,
	 * we apply it here.
	 */
	err = set_compat_user_sigmask(sigmask, sigsetsize);
	if (err)
		return err;
	err = do_epoll_wait(epfd, events, maxevents, timeout);
	restore_saved_sigmask_unless(err == -EINTR);
	return err;
}
epoll_create负责创建epoll对象,然后通过epoll_ctl向其注册、修改或删除需要监听的文件描述符及其事件,最后通过epoll_wait等待就绪事件。这样,再配合Socket自身的Send和Recv以及各种相关异常的处理,就可以实现初步的网络通信。
当然要想更好地实现epoll的应用,多线程(池)与前面分析过的同步、异步、阻塞和非阻塞都得协调好。更主要的是数据处理,要做到从数据结构的设计到处理速度都满足实际需求。
三、例程
1、Redis中的应用,看一下其相关代码:
//底层epoll的代码
#include <sys/epoll.h>
/* State private to the epoll backend, stored in eventLoop->apidata. */
typedef struct aeApiState {
    int epfd;                   /* descriptor returned by epoll_create() */
    struct epoll_event *events; /* buffer filled by epoll_wait(); allocated with setsize entries */
} aeApiState;
/* Set up the epoll backend for `eventLoop`.
 *
 * Allocates the backend state plus an event buffer of eventLoop->setsize
 * entries, creates the epoll descriptor and stores the state in
 * eventLoop->apidata.
 *
 * Returns 0 on success. On any failure everything allocated so far is
 * released and -1 is returned. */
static int aeApiCreate(aeEventLoop *eventLoop) {
    aeApiState *st = zmalloc(sizeof(aeApiState));

    if (st == NULL) return -1;

    st->events = zmalloc(sizeof(struct epoll_event)*eventLoop->setsize);
    if (st->events != NULL) {
        st->epfd = epoll_create(1024); /* 1024 is just a hint for the kernel */
        if (st->epfd != -1) {
            eventLoop->apidata = st;
            return 0;
        }
        zfree(st->events);
    }
    zfree(st);
    return -1;
}
/* Resize the epoll_wait() result buffer to hold `setsize` events.
 * Always returns 0: the zrealloc() result is stored without a check. */
static int aeApiResize(aeEventLoop *eventLoop, int setsize) {
    aeApiState *st = eventLoop->apidata;
    struct epoll_event *grown;

    grown = zrealloc(st->events, sizeof(struct epoll_event)*setsize);
    st->events = grown;
    return 0;
}
/* Tear down the epoll backend: close the epoll descriptor, then release the
 * event buffer and the state structure itself. */
static void aeApiFree(aeEventLoop *eventLoop) {
    aeApiState *st = eventLoop->apidata;
    struct epoll_event *buf = st->events;

    close(st->epfd);
    zfree(buf);
    zfree(st);
}
/* Register interest in `mask` (AE_READABLE / AE_WRITABLE) for `fd`.
 *
 * The new mask is merged with the one already recorded for the descriptor.
 * Returns 0 on success, -1 if epoll_ctl() fails. */
static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) {
    aeApiState *st = eventLoop->apidata;
    struct epoll_event ev = {0}; /* avoid valgrind warning */
    int previous = eventLoop->events[fd].mask;
    int op;

    /* If the fd was already monitored for some event, we need a MOD
     * operation. Otherwise we need an ADD operation. */
    if (previous == AE_NONE)
        op = EPOLL_CTL_ADD;
    else
        op = EPOLL_CTL_MOD;

    mask |= previous; /* Merge old events */
    if (mask & AE_READABLE) ev.events |= EPOLLIN;
    if (mask & AE_WRITABLE) ev.events |= EPOLLOUT;
    ev.data.fd = fd;

    return (epoll_ctl(st->epfd,op,fd,&ev) == -1) ? -1 : 0;
}
/* Stop monitoring the events in `delmask` for `fd`.
 *
 * If other events remain registered the descriptor is modified, otherwise
 * it is removed from the epoll set. The epoll_ctl() result is ignored. */
static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int delmask) {
    aeApiState *st = eventLoop->apidata;
    int remaining = eventLoop->events[fd].mask & (~delmask);
    struct epoll_event ev = {0}; /* avoid valgrind warning */
    int op;

    ev.data.fd = fd;
    if (remaining & AE_READABLE) ev.events |= EPOLLIN;
    if (remaining & AE_WRITABLE) ev.events |= EPOLLOUT;

    /* Note, Kernel < 2.6.9 requires a non null event pointer even for
     * EPOLL_CTL_DEL, so `ev` is passed in both cases. */
    op = (remaining != AE_NONE) ? EPOLL_CTL_MOD : EPOLL_CTL_DEL;
    epoll_ctl(st->epfd,op,fd,&ev);
}
/* Wait for events with epoll_wait() and translate them into fired entries.
 *
 * tvp: maximum time to wait, or NULL to block indefinitely.
 *
 * For every returned epoll event, eventLoop->fired[j] receives the fd and an
 * AE_* mask. EPOLLERR and EPOLLHUP are reported as both readable and
 * writable so that whichever handler is installed gets a chance to run.
 *
 * Returns the number of fired events (0 on timeout or on epoll_wait error). */
static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) {
    aeApiState *state = eventLoop->apidata;
    int retval, numevents = 0;

    /* epoll_wait() takes milliseconds. Round the microsecond part UP:
     * truncating would turn a sub-millisecond timeout into 0 and make the
     * caller spin instead of sleeping. */
    retval = epoll_wait(state->epfd,state->events,eventLoop->setsize,
            tvp ? (tvp->tv_sec*1000 + (tvp->tv_usec + 999)/1000) : -1);
    if (retval > 0) {
        int j;

        numevents = retval;
        for (j = 0; j < numevents; j++) {
            int mask = 0;
            struct epoll_event *e = state->events+j;

            if (e->events & EPOLLIN) mask |= AE_READABLE;
            if (e->events & EPOLLOUT) mask |= AE_WRITABLE;
            if (e->events & EPOLLERR) mask |= AE_WRITABLE|AE_READABLE;
            if (e->events & EPOLLHUP) mask |= AE_WRITABLE|AE_READABLE;
            eventLoop->fired[j].fd = e->data.fd;
            eventLoop->fired[j].mask = mask;
        }
    }
    return numevents;
}
/* Name of the multiplexing backend compiled in from this file. */
static char *aeApiName(void) {
    char *backend = "epoll";
    return backend;
}
redis在这个基础上为了普适又封装了一层,以兼容evport、epoll、kqueue、select等多路复用模型,其封装的代码如下:
#include <stdio.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>
#include <stdlib.h>
#include <poll.h>
#include <string.h>
#include <time.h>
#include <errno.h>
#include "ae.h"
#include "zmalloc.h"
#include "config.h"
/* Include the best multiplexing layer supported by this system.
* The following should be ordered by performances, descending. */
#ifdef HAVE_EVPORT
#include "ae_evport.c"
#else
#ifdef HAVE_EPOLL
#include "ae_epoll.c"
#else
#ifdef HAVE_KQUEUE
#include "ae_kqueue.c"
#else
#include "ae_select.c"
#endif
#endif
#endif
/* Create an event loop able to track `setsize` file descriptors.
 *
 * Allocates the loop, its registered-event and fired-event tables, sets all
 * bookkeeping fields to their defaults and initializes the multiplexing
 * backend. Every slot of the events table starts out as AE_NONE.
 *
 * Returns the new loop, or NULL (with everything released) on failure. */
aeEventLoop *aeCreateEventLoop(int setsize) {
    aeEventLoop *loop = zmalloc(sizeof(*loop));
    int slot;

    if (loop == NULL) return NULL;

    loop->events = zmalloc(sizeof(aeFileEvent)*setsize);
    loop->fired = zmalloc(sizeof(aeFiredEvent)*setsize);
    if (loop->events != NULL && loop->fired != NULL) {
        loop->setsize = setsize;
        loop->lastTime = time(NULL);
        loop->timeEventHead = NULL;
        loop->timeEventNextId = 0;
        loop->stop = 0;
        loop->maxfd = -1;
        loop->beforesleep = NULL;
        loop->aftersleep = NULL;
        loop->flags = 0;
        if (aeApiCreate(loop) != -1) {
            /* Events with mask == AE_NONE are not set. So let's
             * initialize the vector with it. */
            for (slot = 0; slot < setsize; slot++)
                loop->events[slot].mask = AE_NONE;
            return loop;
        }
    }

    /* Failure path: release whatever was allocated. */
    zfree(loop->events);
    zfree(loop->fired);
    zfree(loop);
    return NULL;
}
/* Report how many descriptor slots the event loop currently has. */
int aeGetSetSize(aeEventLoop *eventLoop) {
    int current = eventLoop->setsize;
    return current;
}
/* Turn the loop-level AE_DONT_WAIT flag on (noWait != 0) or off.
 * While the flag is set, aeProcessEvents() polls with a zero timeout. */
void aeSetDontWait(aeEventLoop *eventLoop, int noWait) {
    if (!noWait) {
        eventLoop->flags &= ~AE_DONT_WAIT;
        return;
    }
    eventLoop->flags |= AE_DONT_WAIT;
}
/* Change the number of descriptor slots of the event loop.
 *
 * Nothing happens (AE_OK) when the size is unchanged. Shrinking below a
 * descriptor that is still in use (maxfd >= setsize) is refused with AE_ERR
 * and leaves the loop untouched, as does a backend resize failure.
 *
 * On success the tables are reallocated, slots above maxfd are reset to
 * AE_NONE and AE_OK is returned. */
int aeResizeSetSize(aeEventLoop *eventLoop, int setsize) {
    int slot;

    if (eventLoop->setsize == setsize) return AE_OK;
    if (setsize <= eventLoop->maxfd) return AE_ERR;
    if (aeApiResize(eventLoop,setsize) == -1) return AE_ERR;

    eventLoop->events = zrealloc(eventLoop->events,sizeof(aeFileEvent)*setsize);
    eventLoop->fired = zrealloc(eventLoop->fired,sizeof(aeFiredEvent)*setsize);
    eventLoop->setsize = setsize;

    /* Make sure that if we created new slots, they are initialized with
     * an AE_NONE mask. */
    slot = eventLoop->maxfd+1;
    while (slot < setsize) {
        eventLoop->events[slot].mask = AE_NONE;
        slot++;
    }
    return AE_OK;
}
/* Destroy an event loop: release the backend state first (it is reached
 * through eventLoop), then the two tables, then the loop itself.
 * Time events still linked in timeEventHead are not freed here. */
void aeDeleteEventLoop(aeEventLoop *eventLoop) {
    aeApiFree(eventLoop);
    zfree(eventLoop->events);
    zfree(eventLoop->fired);
    zfree(eventLoop);
}
/* Ask aeMain() to leave its loop; the flag is checked before each iteration. */
void aeStop(aeEventLoop *eventLoop) {
    eventLoop->stop = 1;
}
/* Register `proc` as the handler for `mask` (AE_READABLE and/or AE_WRITABLE)
 * on descriptor `fd`.
 *
 * clientData is stored and later passed back to the handler.
 *
 * Returns AE_OK on success. Returns AE_ERR with errno = ERANGE when fd is
 * negative or not below the loop's setsize, and AE_ERR when the backend
 * refuses the registration. */
int aeCreateFileEvent(aeEventLoop *eventLoop, int fd, int mask,
        aeFileProc *proc, void *clientData)
{
    /* A negative fd must be rejected too: it would index before the start
     * of the events table. */
    if (fd < 0 || fd >= eventLoop->setsize) {
        errno = ERANGE;
        return AE_ERR;
    }
    aeFileEvent *fe = &eventLoop->events[fd];

    if (aeApiAddEvent(eventLoop, fd, mask) == -1)
        return AE_ERR;
    fe->mask |= mask;
    if (mask & AE_READABLE) fe->rfileProc = proc;
    if (mask & AE_WRITABLE) fe->wfileProc = proc;
    fe->clientData = clientData;
    if (fd > eventLoop->maxfd)
        eventLoop->maxfd = fd;
    return AE_OK;
}
/* Remove the events in `mask` from descriptor `fd`.
 *
 * Out-of-range descriptors (negative or >= setsize) and descriptors with no
 * registered events are ignored. When the highest registered descriptor
 * ends up with no events, maxfd is lowered to the next one in use (or -1). */
void aeDeleteFileEvent(aeEventLoop *eventLoop, int fd, int mask)
{
    /* Negative descriptors would index before the events table. */
    if (fd < 0 || fd >= eventLoop->setsize) return;
    aeFileEvent *fe = &eventLoop->events[fd];
    if (fe->mask == AE_NONE) return;

    /* We want to always remove AE_BARRIER if set when AE_WRITABLE
     * is removed. */
    if (mask & AE_WRITABLE) mask |= AE_BARRIER;

    aeApiDelEvent(eventLoop, fd, mask);
    fe->mask = fe->mask & (~mask);
    if (fd == eventLoop->maxfd && fe->mask == AE_NONE) {
        /* Update the max fd */
        int j;

        for (j = eventLoop->maxfd-1; j >= 0; j--)
            if (eventLoop->events[j].mask != AE_NONE) break;
        eventLoop->maxfd = j;
    }
}
/* Return the event mask currently registered for `fd`, or 0 when the
 * descriptor is outside the table (negative or >= setsize). */
int aeGetFileEvents(aeEventLoop *eventLoop, int fd) {
    /* Negative descriptors would index before the events table. */
    if (fd < 0 || fd >= eventLoop->setsize) return 0;
    aeFileEvent *fe = &eventLoop->events[fd];

    return fe->mask;
}
/* Read the wall clock with gettimeofday().
 *
 * seconds:      receives the whole seconds.
 * milliseconds: receives the sub-second part converted from microseconds
 *               to milliseconds (0..999). */
static void aeGetTime(long *seconds, long *milliseconds)
{
    struct timeval now;

    gettimeofday(&now, NULL);
    *seconds = now.tv_sec;
    *milliseconds = now.tv_usec/1000;
}
/* Compute the absolute time "now + milliseconds".
 *
 * sec / ms receive the result split into seconds and milliseconds; a single
 * carry is applied when the millisecond part reaches 1000. */
static void aeAddMillisecondsToNow(long long milliseconds, long *sec, long *ms) {
    long now_sec, now_ms;
    long target_sec, target_ms;

    aeGetTime(&now_sec, &now_ms);
    target_sec = now_sec + milliseconds/1000;
    target_ms = now_ms + milliseconds%1000;
    if (target_ms >= 1000) {
        target_sec += 1;
        target_ms -= 1000;
    }
    *sec = target_sec;
    *ms = target_ms;
}
/* Schedule `proc` to run `milliseconds` from now.
 *
 * The new timer is pushed at the head of the loop's doubly linked timer
 * list. finalizerProc (may be NULL) is invoked when the timer is eventually
 * removed. An id is consumed even if the allocation fails.
 *
 * Returns the id of the new timer, or AE_ERR when allocation fails. */
long long aeCreateTimeEvent(aeEventLoop *eventLoop, long long milliseconds,
        aeTimeProc *proc, void *clientData,
        aeEventFinalizerProc *finalizerProc)
{
    long long id = eventLoop->timeEventNextId++;
    aeTimeEvent *timer = zmalloc(sizeof(*timer));
    aeTimeEvent *old_head;

    if (timer == NULL) return AE_ERR;

    timer->id = id;
    aeAddMillisecondsToNow(milliseconds,&timer->when_sec,&timer->when_ms);
    timer->timeProc = proc;
    timer->finalizerProc = finalizerProc;
    timer->clientData = clientData;

    /* Link in front of the current head. */
    old_head = eventLoop->timeEventHead;
    timer->prev = NULL;
    timer->next = old_head;
    if (old_head != NULL)
        old_head->prev = timer;
    eventLoop->timeEventHead = timer;
    return id;
}
/* Mark the timer with the given id as deleted.
 *
 * The node is only flagged with AE_DELETED_EVENT_ID here; it is unlinked and
 * freed later by processTimeEvents().
 *
 * Returns AE_OK if the id was found, AE_ERR otherwise. */
int aeDeleteTimeEvent(aeEventLoop *eventLoop, long long id)
{
    aeTimeEvent *cur;

    for (cur = eventLoop->timeEventHead; cur != NULL; cur = cur->next) {
        if (cur->id != id) continue;
        cur->id = AE_DELETED_EVENT_ID;
        return AE_OK;
    }
    return AE_ERR; /* NO event with the specified ID found */
}
/* Find the timer that is due first.
 *
 * Used to decide how long the multiplexing call may sleep without delaying
 * any time event. Returns NULL when no timers exist.
 *
 * The list is unsorted, so this is a linear scan. Possible optimizations
 * (not needed by Redis so far, but...):
 * 1) Insert the event in order, so that the nearest is just the head.
 *    Much better but still insertion or deletion of timers is O(N).
 * 2) Use a skiplist to have this operation as O(1) and insertion as O(log(N)).
 */
static aeTimeEvent *aeSearchNearestTimer(aeEventLoop *eventLoop)
{
    aeTimeEvent *best = NULL;
    aeTimeEvent *cur;

    for (cur = eventLoop->timeEventHead; cur != NULL; cur = cur->next) {
        if (best == NULL) {
            best = cur;
            continue;
        }
        if (cur->when_sec < best->when_sec) {
            best = cur;
            continue;
        }
        /* Same second: the smaller millisecond part wins. */
        if (cur->when_sec == best->when_sec && cur->when_ms < best->when_ms)
            best = cur;
    }
    return best;
}
/* Process time events.
 *
 * Walks the timer list once: nodes flagged AE_DELETED_EVENT_ID are unlinked,
 * finalized and freed; due timers have their timeProc invoked. A timeProc
 * returning AE_NOMORE flags the timer for deletion (it is reaped on a later
 * pass); any other return value re-arms the timer that many milliseconds
 * from now.
 *
 * Returns the number of timer callbacks invoked. */
static int processTimeEvents(aeEventLoop *eventLoop) {
    int processed = 0;
    aeTimeEvent *te;
    long long maxId;
    time_t now = time(NULL);
    /* If the system clock is moved to the future, and then set back to the
     * right value, time events may be delayed in a random way. Often this
     * means that scheduled operations will not be performed soon enough.
     *
     * Here we try to detect system clock skews, and force all the time
     * events to be processed ASAP when this happens: the idea is that
     * processing events earlier is less dangerous than delaying them
     * indefinitely, and practice suggests it is. */
    if (now < eventLoop->lastTime) {
        te = eventLoop->timeEventHead;
        while(te) {
            te->when_sec = 0;
            te = te->next;
        }
    }
    eventLoop->lastTime = now;
    te = eventLoop->timeEventHead;
    /* Highest id that existed before this pass started. */
    maxId = eventLoop->timeEventNextId-1;
    while(te) {
        long now_sec, now_ms;
        long long id;
        /* Remove events scheduled for deletion. */
        if (te->id == AE_DELETED_EVENT_ID) {
            /* Save the successor before the node is freed. */
            aeTimeEvent *next = te->next;
            if (te->prev)
                te->prev->next = te->next;
            else
                eventLoop->timeEventHead = te->next;
            if (te->next)
                te->next->prev = te->prev;
            if (te->finalizerProc)
                te->finalizerProc(eventLoop, te->clientData);
            zfree(te);
            te = next;
            continue;
        }
        /* Make sure we don't process time events created by time events in
         * this iteration. Note that this check is currently useless: we always
         * add new timers on the head, however if we change the implementation
         * detail, this check may be useful again: we keep it here for future
         * defense. */
        if (te->id > maxId) {
            te = te->next;
            continue;
        }
        aeGetTime(&now_sec, &now_ms);
        if (now_sec > te->when_sec ||
            (now_sec == te->when_sec && now_ms >= te->when_ms))
        {
            int retval;
            id = te->id;
            retval = te->timeProc(eventLoop, id, te->clientData);
            processed++;
            if (retval != AE_NOMORE) {
                /* Periodic timer: retval is the delay until the next run. */
                aeAddMillisecondsToNow(retval,&te->when_sec,&te->when_ms);
            } else {
                te->id = AE_DELETED_EVENT_ID;
            }
        }
        te = te->next;
    }
    return processed;
}
/* Process every pending time event, then every pending file event
 * (that may be registered by time event callbacks just processed).
 * Without special flags the function sleeps until some file event
 * fires, or when the next time event occurs (if any).
 *
 * If flags is 0, the function does nothing and returns.
 * if flags has AE_ALL_EVENTS set, all the kind of events are processed.
 * if flags has AE_FILE_EVENTS set, file events are processed.
 * if flags has AE_TIME_EVENTS set, time events are processed.
 * if flags has AE_DONT_WAIT set the function returns ASAP until all
 * the events that's possible to process without to wait are processed.
 * if flags has AE_CALL_AFTER_SLEEP set, the aftersleep callback is called.
 *
 * The function returns the number of events processed. */
int aeProcessEvents(aeEventLoop *eventLoop, int flags)
{
    int processed = 0, numevents;

    /* Nothing to do? return ASAP */
    if (!(flags & AE_TIME_EVENTS) && !(flags & AE_FILE_EVENTS)) return 0;

    /* Note that we want call select() even if there are no
     * file events to process as long as we want to process time
     * events, in order to sleep until the next time event is ready
     * to fire. */
    if (eventLoop->maxfd != -1 ||
        ((flags & AE_TIME_EVENTS) && !(flags & AE_DONT_WAIT))) {
        int j;
        aeTimeEvent *shortest = NULL;
        struct timeval tv, *tvp;

        if (flags & AE_TIME_EVENTS && !(flags & AE_DONT_WAIT))
            shortest = aeSearchNearestTimer(eventLoop);
        if (shortest) {
            long now_sec, now_ms;

            aeGetTime(&now_sec, &now_ms);
            tvp = &tv;

            /* How many milliseconds we need to wait for the next
             * time event to fire? */
            long long ms =
                (shortest->when_sec - now_sec)*1000 +
                shortest->when_ms - now_ms;

            if (ms > 0) {
                tvp->tv_sec = ms/1000;
                tvp->tv_usec = (ms % 1000)*1000;
            } else {
                tvp->tv_sec = 0;
                tvp->tv_usec = 0;
            }
        } else {
            /* If we have to check for events but need to return
             * ASAP because of AE_DONT_WAIT we need to set the timeout
             * to zero */
            if (flags & AE_DONT_WAIT) {
                tv.tv_sec = tv.tv_usec = 0;
                tvp = &tv;
            } else {
                /* Otherwise we can block */
                tvp = NULL; /* wait forever */
            }
        }

        /* The loop-level AE_DONT_WAIT flag (see aeSetDontWait) overrides
         * whatever timeout was computed above. */
        if (eventLoop->flags & AE_DONT_WAIT) {
            tv.tv_sec = tv.tv_usec = 0;
            tvp = &tv;
        }

        /* Call the multiplexing API, will return only on timeout or when
         * some event fires. */
        numevents = aeApiPoll(eventLoop, tvp);

        /* After sleep callback. */
        if (eventLoop->aftersleep != NULL && flags & AE_CALL_AFTER_SLEEP)
            eventLoop->aftersleep(eventLoop);

        for (j = 0; j < numevents; j++) {
            int fd = eventLoop->fired[j].fd;
            int mask = eventLoop->fired[j].mask;
            aeFileEvent *fe = &eventLoop->events[fd];
            int fired = 0; /* Number of events fired for current fd. */

            /* Normally we execute the readable event first, and the writable
             * event later. This is useful as sometimes we may be able
             * to serve the reply of a query immediately after processing the
             * query.
             *
             * However if AE_BARRIER is set in the mask, our application is
             * asking us to do the reverse: never fire the writable event
             * after the readable. In such a case, we invert the calls.
             * This is useful when, for instance, we want to do things
             * in the beforeSleep() hook, like fsynching a file to disk,
             * before replying to a client. */
            int invert = fe->mask & AE_BARRIER;

            /* Note the "fe->mask & mask & ..." code: maybe an already
             * processed event removed an element that fired and we still
             * didn't processed, so we check if the event is still valid.
             *
             * Fire the readable event if the call sequence is not
             * inverted. */
            if (!invert && fe->mask & mask & AE_READABLE) {
                fe->rfileProc(eventLoop,fd,fe->clientData,mask);
                fired++;
                /* The handler may have called aeResizeSetSize(), which
                 * reallocates eventLoop->events: re-derive the pointer
                 * instead of using a possibly stale one. */
                fe = &eventLoop->events[fd];
            }

            /* Fire the writable event. */
            if (fe->mask & mask & AE_WRITABLE) {
                if (!fired || fe->wfileProc != fe->rfileProc) {
                    fe->wfileProc(eventLoop,fd,fe->clientData,mask);
                    fired++;
                    fe = &eventLoop->events[fd]; /* Refresh in case of resize. */
                }
            }

            /* If we have to invert the call, fire the readable event now
             * after the writable one. */
            if (invert && fe->mask & mask & AE_READABLE) {
                if (!fired || fe->wfileProc != fe->rfileProc) {
                    fe->rfileProc(eventLoop,fd,fe->clientData,mask);
                    fired++;
                }
            }

            processed++;
        }
    }
    /* Check time events */
    if (flags & AE_TIME_EVENTS)
        processed += processTimeEvents(eventLoop);

    return processed; /* return the number of processed file/time events */
}
/* Block for up to `milliseconds` until `fd` becomes readable and/or writable
 * (as selected by `mask`), using poll().
 *
 * Returns the AE_* mask of the conditions that occurred when poll() reports
 * exactly one ready descriptor; POLLERR and POLLHUP are reported as
 * AE_WRITABLE. Otherwise the raw poll() result is returned (0 on timeout,
 * -1 on error). */
int aeWait(int fd, int mask, long long milliseconds) {
    struct pollfd pfd;
    int ready;
    int result = 0;

    memset(&pfd, 0, sizeof(pfd));
    pfd.fd = fd;
    if (mask & AE_READABLE) pfd.events |= POLLIN;
    if (mask & AE_WRITABLE) pfd.events |= POLLOUT;

    ready = poll(&pfd, 1, milliseconds);
    if (ready != 1) return ready;

    if (pfd.revents & POLLIN) result |= AE_READABLE;
    if (pfd.revents & (POLLOUT|POLLERR|POLLHUP)) result |= AE_WRITABLE;
    return result;
}
/* Run the event loop until aeStop() sets the stop flag.
 *
 * Each iteration first invokes the beforesleep hook (if installed) and then
 * processes all event kinds, asking for the aftersleep hook to be called. */
void aeMain(aeEventLoop *eventLoop) {
    for (eventLoop->stop = 0; !eventLoop->stop; ) {
        if (eventLoop->beforesleep != NULL)
            eventLoop->beforesleep(eventLoop);
        aeProcessEvents(eventLoop, AE_ALL_EVENTS|AE_CALL_AFTER_SLEEP);
    }
}
/* Public accessor for the name of the multiplexing backend that was
 * compiled in (the value returned by the backend's aeApiName()). */
char *aeGetApiName(void) {
    return aeApiName();
}
/* Install (or clear, with NULL) the hook aeMain() calls before each
 * aeProcessEvents() iteration. */
void aeSetBeforeSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *beforesleep) {
    eventLoop->beforesleep = beforesleep;
}
/* Install (or clear, with NULL) the hook aeProcessEvents() calls right after
 * the multiplexing call returns, when AE_CALL_AFTER_SLEEP is requested.
 * Note the parameter reuses the aeBeforeSleepProc callback type. */
void aeSetAfterSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *aftersleep) {
    eventLoop->aftersleep = aftersleep;
}
再往上就是真正的应用了,这里不再贴更多的代码,有兴趣可以看看其源码,本代码的版本是redis 6.0。
2、自己写的一个例程
按照最大化利用多核优势的原则,将epoll中的接收和监听等抽象成多个类,然后再进行业务处理。它其实更类似于以分组的方式使用epoll,或者说简单地创建多个epoll然后分别监听。特点在于可以动态创建。代码很多,这里只列出一些头文件:
//Acceptor.h
#ifndef NETSERVER_ACCEPTOR_H
#define NETSERVER_ACCEPTOR_H
#include <memory>
#include <string>
#include "ThreadCondition.h"
namespace netserver
{
    class EpollRunner;
    struct NetReactorEx;

    // Accepts incoming connections on one or more listening sockets.
    // Derives from ThreadCondition and holds a shared EpollRunner and a
    // shared NetReactorEx.
    class Acceptor final :public ThreadCondition
    {
    public:
        Acceptor();
        ~Acceptor();
    public:
        int InitAcceptor(std::shared_ptr<NetReactorEx> reactorEx, int waitcount = global::WAIT_COUNT);
        int StartListen(int num = 1);// number of listeners to start; defaults to one
        void Close();
    private:
        int AcceptClient(int fd, int events);
    private:
        int AddListen(int fd,int num = 1);
        int InitServer( std::string ip = "", unsigned short port = 0);
    private:
        std::shared_ptr<EpollRunner> epRunner_ = nullptr;
        std::shared_ptr<NetReactorEx> netReactorEx_ = nullptr;
    private:
        int worker_ = 0; // worker counter
    private:
        std::string ip_ = "0.0.0.0";
        unsigned short port_ = 28888;
        int socketfd_[global::MAX_LISENER] = { 0 };// listening descriptors: timers, events and sockets can all be integrated here
    };
}
#endif
//Epoller.h
#pragma once
#include "INetServer.h"
#include <condition_variable>
#include <functional>
#include <memory>
#include <mutex>
#include <queue>
#include <string>
#include <thread>
#include <unordered_map>
#include <unordered_set>
#include "../dataparse/MemCache.h"
// Forward declarations of types from other modules that this header only
// names as pointer or template arguments.
namespace cmdcontrol
{
    class CmdControl;
}
namespace toolhelper
{
    class Timer;
}
namespace dataparse
{
    class DataParse;
    class AutoMachine;
    template<typename T, typename U, typename E>
    class MemCacheEx;
}
namespace netserver
{
    class SocketEntry;

    // epoll-based INetServer implementation. The declaration groups its
    // members into: server lifecycle, SocketEntry bookkeeping, timer /
    // heartbeat hooks, epoll event plumbing, lookup tables, worker threads
    // with a condition-variable protected work queue, and a memory cache.
    class Epoller :public INetServer
    {
    public:
        Epoller();
        ~Epoller();
    public:
        virtual void Init(std::string ip = "", unsigned short port = 0);
        int AddListener();
        virtual int DestoryServer();
        int StartWorker();
        std::size_t Send(SocketContext *sc, char *buf, std::size_t len);
    public:
        std::shared_ptr<SocketEntry> InitSocketEntry(SocketContext *context);
        std::shared_ptr<SocketEntry> InsertSet(SocketContext *context);
        int UpdateSocketEntry(std::string &id, std::string &pbkey,bool &flag, std::weak_ptr<SocketEntry> pws);
        void Dispatch(int type,std::shared_ptr<SocketEntry> se);
        void StatusClear(SocketContext * sc, const std::string &id,bool master);
    protected:
        int InitServer(std::string ip = "", unsigned short port = 0);
        int InitEpollParams();
        int InitDataParms();
        int CreateTimer(int type); // 0: asynchronous non-blocking timer, 1: blocking timer
        void SendBeat();
        void UpdateHeartSocketEntry(std::weak_ptr<SocketEntry> pwse);
        void OnTimer();// callback fired on timer messages
    private:
        void StartThread(int num) noexcept;
    private:
        int AddEvent(int epfd, SocketContext* context);
        int DelEvent(int epfd, SocketContext* context);
        int SetEvent(SocketContext *sc, int fd, int epfdpos,int event, int len, EVENT_STATUS status, CALLBACK callback,std::weak_ptr<SocketEntry> pwEntry);
        int AcceptClient(int fd, int events );
        ssize_t SendData(int fd, int events );
        ssize_t RecvData(int fd, int events );
        int CreateEpoll();
        int RunWorker(int pos);
    private:
        std::shared_ptr<SocketEntry> FindIdMap(const std::string &id);
        std::shared_ptr<SocketEntry> FindKeyMap(const std::string &pbkey);
        bool DelIdMap(const std::string &id);
        bool DelKeyMap(const std::string &pbkey);
        // query the request and the reply results
        std::size_t QueryAskAndSendSelf(std::shared_ptr<CacheDataEx> ptr);
        std::size_t QueryAckAndSendSelf(std::shared_ptr<CacheDataEx> ptr);
    private:
        inline bool Wait(int timeOut);
        inline void Wait();
        inline void Signal();
        void SetSignal(bool quit = false)noexcept;
    private:
        void DisplayMapMD();
    private:
        std::string ip_ = "0.0.0.0";
        unsigned short port_ = 28888;
        std::shared_ptr<toolhelper::Timer> pTimer_ = nullptr;
        int socketfd_ = 0; // listening handle
        //int fdArray[100] = {0};// listening array
        std::shared_ptr<SocketEntry> pListenerEntry_ = nullptr;
        std::shared_ptr<netserver::NetReactor> pNetReactor_ = nullptr;
        std::shared_ptr<cmdcontrol::CmdControl> pCmd_ = nullptr;
    private:
        // variables for resource lookup and the timer wheel
        // umapKey_: look up a resource by public key; umapID_: look up a resource by ID
        //std::unordered_map<std::string, std::weak_ptr<SocketEntry>> umapKey_;// deprecated
        std::unordered_map<std::string, std::weak_ptr<SocketEntry>> umapID_;
        std::queue<std::unordered_set<std::shared_ptr<SocketEntry>>> queue_;// this is the key piece
        // lock
        std::mutex mutex_;
        std::function<void(SocketContext*,const std::string&, bool)> func_ = nullptr;
    private:
        std::thread thread_[global::THREAD_COUNT]; // handle array of the thread pool
        std::thread epThread[global::MAX_EPFD];// array of epoll listening threads
        bool signaled_ = false;
        std::mutex lockMutex_;
        std::condition_variable cvLock_;
        std::queue<std::shared_ptr<CacheDataEx>> doQueue_;// processing queue
    private:
        // in-memory queue
        std::shared_ptr<dataparse::MemCacheEx<std::string, dataparse::ArrayCacheData, dataparse::CacheData>> pMem_ = nullptr;
        // heartbeat handling
        char beat_[100] = {0};
        int beatlen_ = 0;
    private:
        int epfdPos_ = 0; // listening position inside NetReactor; used to index the related epfd
    };
}
//EpollReactor.h
#ifndef NETSERVER_EPOLLREACTOR_H
#define NETSERVER_EPOLLREACTOR_H
#include <memory>
#include <unordered_map>
#include <functional>
#include <thread>
#include "netcommon.h"
#include "../dataparse/MemCache.h"
namespace dataparse
{
template<typename T, typename U, typename E>
class MemCacheEx;
}
namespace netserver
{
class Acceptor;
class EpollWorker;
class EpollReactor final
{
public:
EpollReactor();
~EpollReactor();
public:
int InitReactor(std::string ip = "", unsigned short port = 0);
int StartWorker();
void Close();
private:
int InitNetReactorEx();
int StartAcceptor(std::function<void(int,int)> func);
int StartEpollWorker();
private:
std::shared_ptr<NetReactorEx> reactorEx_ = nullptr;
//内存队列
std::shared_ptr<dataparse::MemCacheEx<std::string, CacheDataEx, dataparse::CacheData>> pMem_ = nullptr;
private:
std::thread tHandle_[global::MAX_EPFD];
};
}
#endif
//EpollRunner.h
#ifndef NETSERVER_EPOLLRUNNER_H
#define NETSERVER_EPOLLRUNNER_H
#include <functional>
#include "INetServer.h"
namespace netserver
{
// Signature of a handler that can be bound via EpollRunner::SetBindFunc:
// it receives two ints (fd, events) and returns an int.
// NOTE(review): EpollRunner stores its handlers as std::function<void(int, int)>,
// so the int result is presumably discarded - confirm in the implementation.
using BINDFUNC = std::function<int(int, int)>;// fd, events; "pos" (which WORKER or ACCEPTOR the event belongs to) is ignored for now
/// Epoll wrapper declared in EpollRunner.h; derives publicly from INetServer.
///
/// Holds one epoll descriptor (epfd_), the buffer of monitored events
/// (epEvents_), a wait count, and three bound handlers (receive / send / accept).
/// Only the declaration is visible here; behavior is defined in the .cpp.
/// NOTE(review): if INetServer declares a virtual destructor, the destructor
/// below could be marked `override` - INetServer is not visible from here.
class EpollRunner : public INetServer
{
public:
    EpollRunner();
    ~EpollRunner();

    /// @param waitcount  no default; see waitcount_.
    /// @param ip         defaults to an empty string.
    /// @param port       defaults to 0.
    /// @return int status defined by the implementation.
    int InitEpollRunner(int waitcount, std::string ip = "", unsigned short port = 0);

    int CreateEpoll();

    /// Bind a handler of the kind selected by `bt`.
    void SetBindFunc(BINDFUNC func, BIND_TYPE bt);

    void Close();

    /// @param timeout  defaults to 1000.
    /// NOTE(review): presumably milliseconds passed on to epoll_wait - confirm.
    int StartRunner(int timeout = 1000);

    int AddEvent(SocketContext* context);
    int DelEvent(SocketContext* context);
    int SetEvent(SocketContext *sc,
                 int fd,
                 int event,
                 int len,
                 EVENT_STATUS status,
                 CALLBACK callback,
                 std::shared_ptr<SocketEntry> pwEntry,
                 std::shared_ptr<EpollWorker> peWorker);

    // Default send and receive functions.
    ssize_t SendData(int fd, int events);
    ssize_t RecvData(int fd, int events);

private:
    // Bound handlers; each takes two ints and returns nothing.
    std::function<void(int, int)> recvFunc_{nullptr};
    std::function<void(int, int)> sendFunc_{nullptr};
    std::function<void(int, int)> acceptFunc_{nullptr};

    /// Private initialisation helper; ip defaults to "" and port to 0.
    void Init(std::string ip = "", unsigned short port = 0);

    int epfd_{0};
    struct epoll_event *epEvents_{nullptr}; // Set of all monitored events.
    int waitcount_{0};
};
}
#endif //NETSERVER_EPOLLRUNNER_H
//EpollWorker.h
#ifndef NETSERVER_EPOLLWORKER_H
#define NETSERVER_EPOLLWORKER_H
#include <queue>
#include <unordered_set>
#include <thread>
#include <mutex>
#include <condition_variable>
#include "ThreadCondition.h"
#include "netcommon.h"
#include "../dataparse/MemCache.h"
namespace cmdcontrol
{
class CmdControl;
}
namespace toolhelper
{
class Timer;
}
namespace dataparse
{
class DataParse;
class AutoMachine;
template<typename T, typename U, typename E>
class MemCacheEx;
}
namespace netserver
{
class SocketEntry;
class EpollRunner;
// Shared-pointer alias for the MemCacheEx instantiation used by EpollWorker
// (the type of EpollWorker::pMem_ and of the `ptr` argument of InitEpollWorker).
using MEMPTR = std::shared_ptr<dataparse::MemCacheEx<std::string, CacheDataEx, dataparse::CacheData>>;
// Map from std::string to char; not referenced elsewhere in the visible part of this header.
using TalkMap = std::unordered_map<std::string, char>;
/// Epoll worker declared in EpollWorker.h.
///
/// Derives from ThreadCondition and from std::enable_shared_from_this, so
/// GetPtr() is only valid while the object is owned by a std::shared_ptr.
/// Only the declaration is visible here: the worker holds an EpollRunner, a
/// processing queue, a map of monitored connections, a queue of connection
/// "buckets", a timer, a memory cache and a send buffer. Behavior is defined
/// in the .cpp.
class EpollWorker : public ThreadCondition, public std::enable_shared_from_this<EpollWorker>
{
public:
    EpollWorker();
    ~EpollWorker();

    /// @param reactorEx  overall control object passed in from the upper-level controller.
    /// @param waitcount  defaults to global::WAIT_COUNT.
    /// @param pos        defaults to 0; see curPos_.
    /// @param ptr        memory cache; defaults to nullptr.
    /// @return int status defined by the implementation.
    int InitEpollWorker(std::shared_ptr<NetReactorEx> reactorEx, int waitcount = global::WAIT_COUNT, int pos = 0, MEMPTR ptr = nullptr);

    /// @param timeout  defaults to 1000.
    /// NOTE(review): presumably milliseconds - confirm in the implementation.
    int StartWorker(int timeout = 1000);

    void Close();

    // Start the worker thread(s); `num` defaults to 1.
    void StartThread(int num = 1) noexcept;

    // Hand data that needs to be sent over to the worker thread.
    std::size_t SendWorker(std::shared_ptr<CacheDataEx>);

    bool DelIdMap(const std::string &id);
    bool DelIdMapClose(const std::string &id);
    bool DelIdMapOther(const std::string &id);
    std::shared_ptr<SocketEntry> FindIdMap(const std::string & id);
    std::shared_ptr<SocketEntry> FindIdMapAll(const std::string & id);

    /// Returns the index stored in curPos_. Read-only, hence const so it can be
    /// called through a const reference or pointer.
    int GetCurPos() const { return curPos_; }

    void DisplayMap();
    void DisplayQueue();

    // Asynchronous epoll send and receive functions; send is currently unused.
    ssize_t SendData(int fd, int events);
    ssize_t RecvData(int fd, int events);

    int AddEvent(SocketContext* context);
    int DelEvent(SocketContext* context);
    int SetEvent(SocketContext *sc, int fd, int event, int len, EVENT_STATUS status);

private:
    int InitEpoll(int waitcount);
    int InitParams(std::shared_ptr<NetReactorEx> reactorEx, int pos, MEMPTR ptr);
    int InitOperatorFunc();

    // Send function.
    std::size_t Send(SocketContext *sc, char *buf, std::size_t len);

    // Requires that *this is already owned by a std::shared_ptr.
    std::shared_ptr<EpollWorker> GetPtr() { return shared_from_this(); }

    void SetSendParams(char*buf,int len);

    // Dispatch data to the worker thread.
    void Dispatch(int type, std::shared_ptr<SocketEntry> ptmp,std::shared_ptr<CacheDataEx> pcd = nullptr);

    // Move a connection whose heartbeat succeeded into the bucket.
    void UpdateHeartSocketEntry(std::weak_ptr<SocketEntry> pwse);

    // Update the identifying ID of a connection.
    int UpdateSocketEntry(std::string &md, std::string &id, bool &flag, std::weak_ptr<SocketEntry> pws);

    // Initialise the resources of the current connection: first use vs. reuse.
    std::shared_ptr<SocketEntry> InitSocketEntry(SocketContext *context);

    // First-time initialisation of a connection's resources.
    std::shared_ptr<SocketEntry> InsertSet(SocketContext *context);

    void StatusClear(SocketContext * sc, const std::string &id, bool master); // Cleanup callback.
    int CreateTimer(int type);
    void OnTimer(); // Heartbeat control timer.
    void Close(int fd);

    std::shared_ptr<EpollRunner> epRunner_ = nullptr;
    bool quit_ = false;
    std::thread thread_; // Handle of the parse/send thread.
    std::shared_ptr<cmdcontrol::CmdControl> pCmd_ = nullptr; // Command parser.

    std::mutex mutex_;
    std::queue<std::shared_ptr<CacheDataEx>> doQueue_; // Processing queue.
    // Callback invoked after a bucket releases a connection, to clean up related resources.
    std::function<void(SocketContext*, const std::string&, bool)> clearfunc_ = nullptr;

    // Used for thread concurrency.
    bool signaled_ = false;
    std::mutex lockMutex_;
    std::condition_variable cvLock_;

    std::unordered_map<std::string, std::weak_ptr<SocketEntry>> umapID_; // Map of the currently monitored connections.
    std::queue<std::unordered_set<std::shared_ptr<SocketEntry>>> queue_; // The key piece: the connection control buckets.

    // Memory queue.
    MEMPTR pMem_ = nullptr;
    std::shared_ptr<toolhelper::Timer> pTimer_ = nullptr;

    // Overall control pointer passed in from the upper-level controller.
    std::shared_ptr<NetReactorEx> netReactorEx_ = nullptr;
    int curPos_ = 0; // Index number assigned to this worker at creation.

    // Bound callbacks - handle the different type requirements.
    std::function<int(int, int)> funcRecv_ = nullptr;
    std::function<int(int, int)> funcSend_ = nullptr;

    char sendBuf_[global::BUF_LEN] = {0};
    int sendLen_ = 0;
    std::string webId_ = "046EA8CFFFE99A27E1A83BF34A2A8BE9"; // ID of the websocket.
};
}
#endif
这块代码回头会在github上开源。主要是需要整理,防止把公司的一些业务相关信息暴露出来。上面的代码看头文件的名字就基本可以理解意思了,然后内部又有注释,所以这里就不再详细展开说明了。
四、总结
epoll的应用,不管是使用现成的库还是自己开发,基本上都是和各种线程(池)的应用紧密地绑定在一起的。那么,如何处理各种事件的实时动作,以及如何对数据和命令进行快速的分发与解析,就是一个非常重要的问题。再加上实际场景中各种情况都有可能发生,这就需要开发者把程序设计得尽量有弹性、容错性高。针对epoll的缺点,还应增加适当的限流和控制机制,确保程序整体的安全性。
学习别人的应用,最重要的是学会其设计思想和技术实现手段。融会贯通,方能真正把知识学到自己的头脑里——装进自己大脑里的东西才是自己的。