epoll IO多路复用器
最近在浏览webrtc代码时看到其socket中使用了epoll机制,由于之前对linux的epoll机制较为陌生,故研究记录如下:
IO多路复用存在的意义在于应用程序可以同时监测多个fd的事件,便于单线程处理多个fd,epoll是众多多路复用器的一种,类似的还有select、poll等。服务器程序通常需要具备较高处理用户并发的能力,使用多路复用器意味着可以用一个线程同时处理多个用户并发请求。
- 阻塞:
阻塞指的是用户态程序调用系统api进入内核态后,如果条件不满足则被加入到对应的等待队列中,直到条件满足。比如:sleep 2s。在此期间线程得不到CPU调度,自然也就不会往下执行,表现的现象为线程卡在系统api不返回。
- 非阻塞:
非阻塞则相反,不论条件是否满足都会立即返回到用户态,线程的CPU资源不会被剥夺,也就意味着程序可以继续往下执行。
在一次发送大量数据(超过发送缓冲区大小)的情况下,如果使用阻塞方式,程序一直阻塞,直到所有的数据都写入到缓冲区中。例如,要发送M字节数据,套接字发送缓冲区大小为B字节,只有当对端向本机返回ack表明其接收到大于等于M-B字节时,才意味着所有的数据都写入到缓冲区中。很明显,如果一次发送的数据量非常大,比如M=10GB、B=64KB,则:
1)一次发送过程中本机线程会在一个fd上阻塞相当长一段时间,其他fd得不到及时处理;
2)如果出现发送失败,无从得知到底有多少数据发送成功,应用程序只能选择重新发送这10G数据。
总之,上述两点都是无法接受的。因此,对性能有要求的服务器一般不采用阻塞而采用非阻塞。
采用非阻塞套接字一次发送大量数据的流程:
1)使劲往发送缓冲区中写数据,直到返回不可写;
2)等待下一次缓冲区可写;
可以有两种方式:
- 查询式,程序不停地查询是否可写,这种方式不仅效率低下,而且存在不确定性的处理延迟;
- 程序去干其他的事情,等多路复用器监测到可写事件后再接着写。
很明显方式2更加高效。
3)要发送的数据写完;
EPOLLOUT事件 就是以事件的方式通知用户程序,可以继续往缓冲区写数据了
EPOLLOUT事件 表示fd的发送缓冲区可写,在一次发送大量数据(超过发送缓冲区大小)的情况下很有用。
EPOLLIN事件 就是以事件的方式通知用户程序,可以接着从缓冲区读数据了
EPOLLIN事件 表示fd的接收缓冲区可读,在一次接收大块数据(超过接收缓冲区大小)的情况下很有用。
客户端每次按下任意键(由epoll监听处理),触发一次模拟的http请求,服务器接收请求并返回响应,此处用‘\0’ 填充了一个超大数据体,来模拟超过发送缓冲区的大块数据,服务器和客户端都采用epoll来处理对该大数据块的发送和接收;为了简化示例,此处客户端事先知道服务器要发送的数据大小。
server端
#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/epoll.h>
#include <sys/socket.h>
#include <unistd.h>
#include <map>
#include <string>
using namespace std;
/// Listening socket fd (assigned in main); compared against in log output to label fds.
int g_socketfd = -1;
/// Most recently accepted client fd; compared against in log output to label fds.
int g_clientfd = -1;

/// TCP port the server binds to.
#define SERV_PORT 8787

/// When `ret` is true: print the printf-style message, then the calling function,
/// line, errno and its text, and terminate the process with status 1.
/// Wrapped in do { } while (0) so the macro expands to exactly one statement and is
/// safe inside an unbraced if/else.
#define exit_if(ret, ...) \
    do { \
        if (ret) { \
            printf(__VA_ARGS__); \
            printf("->>> %s: %d error no: %d error msg %s\n",__func__,__LINE__, errno, strerror(errno)); \
            exit(1); \
        } \
    } while (0)

/// printf-style logging prefixed with the calling function name and line number.
#define log(...) do{printf("%s(%d): ", __func__, __LINE__); \
    printf(__VA_ARGS__);}while(0)

/// perror-style logging prefixed with the calling function name and line number.
#define err_log(errlog) do{printf("%s(%d): ", __func__, __LINE__);\
    perror(errlog);}while(0)
/// Put `fd` into non-blocking mode while keeping its other file status flags.
/// The process is terminated through exit_if if either fcntl call fails.
void SetNonBlock(int fd)
{
    const int current_flags = ::fcntl(fd, F_GETFL, 0);
    exit_if(current_flags < 0, "fcntl failed");

    // Only used for the log line: tell the listening socket apart from a client.
    const char *label = (fd == g_socketfd) ? "socketfd" : "clientfd";
    log("set %s O_NONBLOCK\n", label);

    const int rc = ::fcntl(fd, F_SETFL, current_flags | O_NONBLOCK);
    exit_if(rc < 0, "fcntl failed");
}
/// Apply epoll_ctl operation `op` for `fd` on epoll instance `efd`, registering
/// interest in the `events` mask. Logs the requested change first and terminates
/// the process through exit_if when epoll_ctl reports failure.
void UpdateEvents(int efd, int fd, int events, int op)
{
    struct epoll_event request;
    ::memset(&request, 0, sizeof(request));
    request.events = events;
    request.data.fd = fd;  // handed back by epoll_wait so the loop knows which fd fired

    const bool wants_read = (request.events & EPOLLIN) != 0;
    const bool wants_write = (request.events & EPOLLOUT) != 0;
    const char *op_name = (op == EPOLL_CTL_MOD) ? "mod" : "add";
    const char *fd_name = (fd == g_clientfd) ? "clientfd" : "socketfd";

    log("%s %s[%d], events read [%s] write [%s]\n",
        op_name,
        fd_name,
        fd,
        wants_read ? "SET" : "UNSET",
        wants_write ? "SET" : "UNSET");

    const int rc = ::epoll_ctl(efd, op, fd, &request);
    exit_if(rc, "epoll_ctl failed");
}
/// Accept one pending connection on listening socket `fd`, switch the new socket
/// to non-blocking mode and register it for EPOLLIN on epoll instance `efd`.
///
/// Transient accept failures (nothing pending on the non-blocking listener, an
/// interrupted call, or a connection aborted before accept) are logged and
/// ignored; any other failure terminates the process through exit_if.
void HandleAccept(int efd, int fd)
{
    struct sockaddr_in raddr;
    socklen_t rsz = sizeof(raddr);
    const int clientfd = ::accept(fd, reinterpret_cast<struct sockaddr *>(&raddr), &rsz);
    if (clientfd < 0) {
        const int accept_errno = errno;
        if (accept_errno == EAGAIN || accept_errno == EWOULDBLOCK ||
            accept_errno == EINTR || accept_errno == ECONNABORTED) {
            log("accept found no usable connection: %d %s\n", accept_errno, strerror(accept_errno));
            return;
        }
        errno = accept_errno;  // make sure exit_if reports the accept error
        exit_if(true, "accept failed");
    }

    // Only publish the fd for log labelling once accept is known to have succeeded.
    g_clientfd = clientfd;

    // accept already filled in the peer address, so no getpeername call is needed.
    log("accept a connection from %s\n", inet_ntoa(raddr.sin_addr));
    SetNonBlock(clientfd);
    UpdateEvents(efd, clientfd, EPOLLIN, EPOLL_CTL_ADD);
}
/// Per-fd transfer bookkeeping.
struct Counter
{
    string readed;          ///< bytes received so far on this fd
    size_t n_wt = 0;        ///< number of response bytes already written
    size_t n_rd = 0;        ///< read byte counter (starts at zero)
    bool write_en = false;  ///< true while EPOLLOUT interest is registered for the fd

    Counter() {}
};
/// Transfer state for each connection, keyed by the connection's fd.
std::map<int, Counter> g_counters;
/// The complete HTTP response (headers plus body) sent to every client; filled in by main.
string g_http_resp;
/// Thin logging wrapper around ::write.
///
/// @param fd    descriptor to write to
/// @param buf   data to send
/// @param count number of bytes to try to write
/// @return whatever ::write returned (bytes written, or -1 with errno set)
///
/// errno is restored after logging so that callers can still inspect the value
/// set by ::write even if the logging call modified it.
ssize_t WrapperWrite(int fd, const void *buf, size_t count)
{
    const ssize_t ret = ::write(fd, buf, count);  // keep the full ssize_t, no narrowing to int
    const int saved_errno = errno;
    log("write return = %zd\n", ret);
    errno = saved_errno;
    return ret;
}
/// Send as much of g_http_resp to `fd` as the socket accepts right now, resuming
/// from the offset stored in the fd's Counter.
///
/// - Everything sent: stop watching EPOLLOUT (if it was enabled) and drop the Counter.
/// - Socket buffer full (EAGAIN/EWOULDBLOCK): enable EPOLLOUT so the transfer is
///   resumed when the fd becomes writable again.
/// - Any other failure: log it, close the fd and drop the Counter.
///
/// @param efd epoll instance the fd is registered with
/// @param fd  client socket
void SendResp(int efd, int fd)
{
    Counter &cter = g_counters[fd];
    size_t left = g_http_resp.length() - cter.n_wt;
    const char *label = (fd == g_clientfd) ? "clientfd" : "socketfd";
    log("g_http_resp.length = %zu bytes, counter.n_wt= %zu bytes, left: %zu bytes\n",
        g_http_resp.length(), cter.n_wt, left);

    ssize_t wn = 0;
    int write_errno = 0;
    while (left > 0) {
        wn = WrapperWrite(fd, g_http_resp.data() + cter.n_wt, left);
        if (wn > 0) {
            cter.n_wt += static_cast<size_t>(wn);
            left -= static_cast<size_t>(wn);
            log("write %zd bytes, left: %zu bytes\n", wn, left);
            continue;
        }
        // Capture errno immediately: later logging calls may overwrite it.
        write_errno = (wn < 0) ? errno : 0;
        if (wn < 0 && write_errno == EINTR) {
            continue;  // interrupted before anything was written: just retry
        }
        break;
    }

    if (left == 0) {
        //close(fd);
        if (cter.write_en) {
            log("left is 0, %s[%d] updateEvents(EPOLL_CTL_MOD) to EPOLLIN aka monitor read\n",
                label, fd);
            // Once all data has been sent, stop watching for buffer-writable events.
            UpdateEvents(efd, fd, EPOLLIN, EPOLL_CTL_MOD);
            cter.write_en = false;
        }
        g_counters.erase(fd);  // cter is invalid from here on
        return;
    }

    if (wn < 0 && (write_errno == EAGAIN || write_errno == EWOULDBLOCK)) {
        if (!cter.write_en) { // default is false
            log("write return %zd, && errno is: %s, %s[%d] updateEvents(EPOLL_CTL_MOD) to EPOLLIN | EPOLLOUT\n",
                wn, (write_errno == EAGAIN) ? "EAGAIN" : "EWOULDBLOCK",
                label, fd);
            UpdateEvents(efd, fd, EPOLLIN | EPOLLOUT, EPOLL_CTL_MOD);
            cter.write_en = true;
        }
        return;
    }

    // Data is still pending and the failure was not "would block": give up on this fd.
    log("write error for %s[%d]: %d %s\n",
        label, fd, write_errno, strerror(write_errno));
    ::close(fd);
    g_counters.erase(fd);
}
/// Drain everything currently readable from `fd` into its Counter and start
/// sending the response as soon as the accumulated data ends with a blank line
/// ("\n\n" or "\r\n\r\n").
///
/// Returns with the fd left open when the socket has no more data for now
/// (EAGAIN/EWOULDBLOCK). On end-of-stream or any other read error the fd is
/// closed and its Counter is dropped.
///
/// NOTE(review): SendResp may itself close `fd` on a write error; this function
/// cannot observe that and will then see a failing read and close again.
///
/// @param efd epoll instance the fd is registered with
/// @param fd  client socket that reported readable
void HandleRead(int efd, int fd)
{
    char buf[4096];
    ssize_t rn = 0;
    int read_errno = 0;
    const char *label = (fd == g_clientfd) ? "clientfd" : "socketfd";
    log("reading data form %s[%d]\n", label, fd);

    for (;;) {
        rn = ::read(fd, buf, sizeof buf);
        if (rn < 0) {
            // Capture errno before any logging call can overwrite it.
            read_errno = errno;
            if (read_errno == EINTR) {
                continue;  // interrupted by a signal: retry the read
            }
            break;
        }
        if (rn == 0) {
            break;  // peer closed the connection
        }

        log("read %zd bytes\n", rn);
        std::string &readed = g_counters[fd].readed;
        readed.append(buf, static_cast<size_t>(rn));

        // Each terminator is checked against its own length so short inputs are handled too.
        const size_t len = readed.length();
        const bool lf_end = len >= 2 && readed.compare(len - 2, 2, "\n\n") == 0;
        const bool crlf_end = len >= 4 && readed.compare(len - 4, 4, "\r\n\r\n") == 0;
        if (lf_end || crlf_end) {
            // A complete HTTP request has been read: send the test response.
            log("parse http request success. sending response data to client\n");
            SendResp(efd, fd);
        }
    }

    if (rn < 0 && (read_errno == EAGAIN || read_errno == EWOULDBLOCK))
        return;

    if (rn < 0) {
        log("read %s[%d]error: %d %s\n", label, fd,
            read_errno, strerror(read_errno));
    }
    if (rn == 0) {
        log("%s[%d] closed\n", label, fd);
    }
    ::close(fd);
    g_counters.erase(fd);
}
/// EPOLLOUT handler for a client socket: `fd` can take more data, so carry on
/// sending the response.
void HandleWrite(int efd, int fd)
{
    // SendResp picks up from the write offset stored for this fd.
    return SendResp(efd, fd);
}
/// Run one iteration of the event loop: wait up to `waitms` milliseconds for
/// events on `efd` and dispatch each one.
///
/// - Readable, error or hang-up on the listening socket `sockfd` -> HandleAccept.
/// - Readable, error or hang-up on any other fd -> HandleRead (which closes the
///   fd when the read reports end-of-stream or an error).
/// - Writable -> HandleWrite.
///
/// An interrupted epoll_wait simply ends the iteration; any other epoll_wait
/// failure terminates the process through exit_if.
void MainLoop(int efd, int sockfd, int waitms)
{
    const int kMaxEvents = 20;
    struct epoll_event activeEvs[kMaxEvents];  // sized to match what epoll_wait may fill

    const int n = ::epoll_wait(efd, activeEvs, kMaxEvents, waitms);
    const int wait_errno = errno;  // capture before logging can overwrite it
    log("epoll_wait return %d\n", n);
    if (n < 0) {
        if (wait_errno == EINTR) {
            return;  // a signal arrived; the caller loops and waits again
        }
        errno = wait_errno;
        exit_if(true, "epoll_wait failed");
    }

    for (int i = 0; i < n; i++) {
        const int fd = activeEvs[i].data.fd;
        const unsigned int events = activeEvs[i].events;
        log("epoll get events from %s[%d], ",
            fd == g_clientfd ? "clientfd" : "socketfd", fd);

        // EPOLLHUP is treated like EPOLLIN/EPOLLERR so a hang-up is handled by the
        // read path instead of being reported as an unknown event.
        if (events & (EPOLLIN | EPOLLERR | EPOLLHUP)) {
            if (fd == sockfd) {
                log("handling EPOLLIN(accept)\n");
                HandleAccept(efd, fd);
            } else {
                log("handling EPOLLIN(read)\n");
                HandleRead(efd, fd);
            }
        } else if (events & EPOLLOUT) {
            log("handling EPOLLOUT(write)\n");
            HandleWrite(efd, fd);
        } else {
            exit_if(true, "unknown event");
        }
    }
}
/// Server entry point. Usage: ./xxx <ipv4-address>
///
/// Builds the oversized test response, creates a listening TCP socket bound to
/// argv[1]:SERV_PORT, registers it with a new epoll instance and runs the event
/// loop forever. Any setup failure terminates the process through exit_if.
int main(int argc, const char *argv[])
{
    exit_if(argc < 2, "./xxx ip");
    ::signal(SIGPIPE, SIG_IGN);  // a write to a closed peer should fail with an error, not kill us

    // NOTE(review): the Content-Length value below is literal text, not a number.
    // It is left untouched on purpose: the client program in this document
    // hard-codes the total response size (20971640 bytes = these headers + body),
    // so altering the header text would change that total.
    g_http_resp = "HTTP/1.1 200 OK\r\nConnection: Keep-Alive\r\nContent-Type: text/html; charset=UTF-8\r\nContent-Length: 104857600*2+6\r\n\r\n123456";
    // Pad the body with NUL bytes in a single append so the response is far
    // larger than the socket send buffer.
    g_http_resp.append(static_cast<size_t>(10485760) * 2, '\0');

    int epollfd = ::epoll_create(1);
    exit_if(epollfd < 0, "epoll_create failed");

    int sockfd = ::socket(AF_INET, SOCK_STREAM, 0);
    exit_if(sockfd< 0, "socket failed");
    g_socketfd = sockfd;

    struct sockaddr_in addr;
    memset(&addr, 0, sizeof addr);
    addr.sin_family = AF_INET;
    addr.sin_port = htons(SERV_PORT);
    // inet_pton returns 1 only when argv[1] is a valid IPv4 address.
    const int pton = ::inet_pton(AF_INET, argv[1], &addr.sin_addr);
    exit_if(pton != 1, "invalid IPv4 address %s", argv[1]);

    int ret = ::bind(sockfd, (struct sockaddr *) &addr, sizeof(addr));
    exit_if(ret, "bind to %s:%d failed %d %s", argv[1], SERV_PORT, errno, strerror(errno));
    ret = ::listen(sockfd, 20);
    exit_if(ret, "listen failed %d %s", errno, strerror(errno));
    log("fd %d listening at %s[%d]\n", sockfd, argv[1], SERV_PORT);

    SetNonBlock(sockfd);
    UpdateEvents(epollfd, sockfd, EPOLLIN, EPOLL_CTL_ADD);
    for (;;) { // a real application should install signal handlers and clean up on exit
        MainLoop(epollfd, sockfd, 10000);
    }
    return 0;
}
客户端
#include <netinet/in.h>
#include <sys/socket.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/epoll.h>
#include <signal.h>
#include <time.h>
#include <unistd.h>
#include <sys/types.h>
#include <arpa/inet.h>
#include <iostream>
#include <fcntl.h>
#include <map>
using namespace std;
/// fd of the socket connected to the server (assigned in main); also used to label fds in logs.
int g_socketfd = -1;

/// Size in bytes of the read buffers.
#define MAXSIZE 4096
/// TCP port of the server to connect to.
#define SERV_PORT 8787

/// When `ret` is true: print the printf-style message, then the calling function,
/// line, errno and its text, and terminate the process with status 1.
/// Wrapped in do { } while (0) so the macro expands to exactly one statement and is
/// safe inside an unbraced if/else.
#define exit_if(ret, ...) \
    do { \
        if (ret) { \
            printf(__VA_ARGS__); \
            printf("->>> %s: %d error no: %d error msg %s\n", __func__, __LINE__, errno, strerror(errno)); \
            exit(1); \
        } \
    } while (0)

/// printf-style logging prefixed with the calling function name and line number.
#define log(...) do{printf("%s(%d): ", __func__, __LINE__); \
    printf(__VA_ARGS__);}while(0)

/// perror-style logging prefixed with the calling function name and line number.
#define err_log(errlog) do{printf("%s(%d): ", __func__, __LINE__);\
    perror(errlog);}while(0)
/**<
* typedef union epoll_data {
* void *ptr;
* int fd;
* uint32_t u32;
* uint64_t u64;
* } epoll_data_t;
*
* struct epoll_event {
* uint32_t events; ///< Epoll events
* epoll_data_t data; ///< User data variable
* };
*/
/// Enable O_NONBLOCK on `fd` without disturbing its other file status flags.
/// Either fcntl failure terminates the process through exit_if.
void SetNonBlock(int fd)
{
    const int old_flags = ::fcntl(fd, F_GETFL, 0);
    exit_if(old_flags < 0, "fcntl failed");

    const bool is_server_socket = (fd == g_socketfd);
    log("set %s O_NONBLOCK\n", is_server_socket ? "socketfd" : "clientfd");

    const int status = ::fcntl(fd, F_SETFL, old_flags | O_NONBLOCK);
    exit_if(status < 0, "fcntl failed");
}
/// Apply epoll_ctl operation `op` (add / modify / delete) for `fd` on epoll
/// instance `efd` with the `events` interest mask. Logs the requested change
/// first and terminates the process through exit_if when epoll_ctl fails.
void UpdateEvents(int efd, int fd, int events, int op)
{
    struct epoll_event request;
    ::memset(&request, 0, sizeof(request));
    request.events = events;
    request.data.fd = fd;  // handed back by epoll_wait so the loop knows which fd fired

    const bool wants_read = (request.events & EPOLLIN) != 0;
    const bool wants_write = (request.events & EPOLLOUT) != 0;

    const char *op_name = "del";
    if (op == EPOLL_CTL_MOD) {
        op_name = "mod";
    } else if (op == EPOLL_CTL_ADD) {
        op_name = "add";
    }
    const char *fd_name = (fd == STDIN_FILENO) ? "STDIN_FILENO" : "socketfd";

    log("%s %s[%d], events read [%s] write [%s]\n",
        op_name,
        fd_name,
        fd,
        wants_read ? "SET" : "UNSET",
        wants_write ? "SET" : "UNSET");

    const int rc = ::epoll_ctl(efd, op, fd, &request);
    exit_if(rc, "epoll_ctl failed");
}
/// Per-fd transfer bookkeeping.
struct Counter
{
    std::string readed;       ///< bytes received so far on this fd
    unsigned long n_wt = 0;   ///< number of request bytes already written
    unsigned long n_rd = 0;   ///< number of bytes read so far
    bool write_en = false;    ///< true while EPOLLOUT interest was added after a partial write

    Counter() {}
};
/// Transfer state for each fd the client works with, keyed by fd.
std::map<int, Counter> g_counters;
/// Thin logging wrapper around ::read.
///
/// @param fd    descriptor to read from
/// @param buf   destination buffer
/// @param count capacity of `buf` in bytes; at most this many bytes are read
/// @return whatever ::read returned (bytes read, 0 at end-of-stream, or -1 with errno set)
///
/// errno is restored after logging so that callers can still inspect the value
/// set by ::read even if the logging call modified it.
ssize_t WrapperRead(int fd, void *buf, size_t count)
{
    log("prepare to read...\n");
    // Honour the caller's buffer size instead of always requesting MAXSIZE bytes.
    const ssize_t ret = ::read(fd, buf, count);
    const int saved_errno = errno;
    log("read %s[%d] ret = %zd\n", fd == STDIN_FILENO ? "STDIN_FILENO" : "socketfd", fd, ret);
    errno = saved_errno;
    return ret;
}
/// EPOLLIN handler for both stdin and the server socket.
///
/// - stdin: any input acts as the trigger for one request. stdin is removed from
///   the epoll set and the socket is added for EPOLLOUT so the request gets sent.
/// - socket: accumulate the response; once the expected total (20971640 bytes)
///   has arrived, re-arm stdin, remove the socket from the epoll set and drop
///   its Counter.
///
/// End-of-stream or a read error other than EAGAIN/EWOULDBLOCK closes the fd and
/// drops its Counter.
///
/// @param efd epoll instance
/// @param fd  descriptor that reported readable
static void HandleRead(int efd, int fd)
{
    const unsigned long kExpectedBytes = 20971640;  // total response size the client waits for
    char buf[MAXSIZE];
    ssize_t nr = 0;
    int read_errno = 0;
    Counter &cter = g_counters[fd];
    log("fd is %s\n", fd == g_socketfd ? "socketfd" : "STDIN_FILENO");

    for (;;) {
        nr = WrapperRead(fd, buf, MAXSIZE);
        if (nr < 0) {
            // Capture errno before any logging call can overwrite it.
            read_errno = errno;
            if (read_errno == EINTR) {
                continue;  // interrupted by a signal: retry the read
            }
            break;
        }
        if (nr == 0) {
            break;
        }

        cter.readed.append(buf, static_cast<size_t>(nr));
        cter.n_rd += static_cast<unsigned long>(nr);
        log("10485760*2 + 6(%d) vs %lu\n", 20971640, cter.n_rd);

        if (fd == STDIN_FILENO) {
            UpdateEvents(efd, STDIN_FILENO, EPOLLIN, EPOLL_CTL_DEL); // stop watching stdin
            UpdateEvents(efd, g_socketfd, EPOLLOUT, EPOLL_CTL_ADD);  // watch the socket for writability
            // The typed bytes are only a trigger; drop them so the stdin entry
            // does not grow with every key press.
            g_counters.erase(fd);
            return;
        }
        if (cter.n_rd >= kExpectedBytes) { // whole response received
            UpdateEvents(efd, STDIN_FILENO, EPOLLIN, EPOLL_CTL_ADD); // watch stdin again
            UpdateEvents(efd, fd, EPOLLIN, EPOLL_CTL_DEL);           // stop watching the socket
            log("received: %s\n", cter.readed.c_str());
            g_counters.erase(fd);
            // cter now refers to an erased element: leave instead of looping again.
            return;
        }
    }

    if (nr == 0) {
        log("server closed.\n");
        ::close(fd);
        g_counters.erase(fd);
        return;
    }
    if (read_errno == EAGAIN || read_errno == EWOULDBLOCK)
        return;

    // Any other read failure: report it and give up on this fd.
    log("read %s[%d]error: %d %s\n",
        fd == STDIN_FILENO ? "STDIN_FILENO" : "socketfd", fd,
        read_errno, strerror(read_errno));
    ::close(fd);
    g_counters.erase(fd);
}
/// The HTTP request text sent to the server; filled in by main.
std::string g_http_rqst;
/// Thin logging wrapper around ::write.
///
/// @param fd    descriptor to write to
/// @param buf   data to send
/// @param count number of bytes to try to write
/// @return whatever ::write returned (bytes written, or -1 with errno set)
///
/// errno is restored after logging so that callers can still inspect the value
/// set by ::write even if the logging call modified it.
ssize_t WrapperWrite(int fd, const void *buf, size_t count)
{
    const ssize_t ret = ::write(fd, buf, count);  // keep the full ssize_t, no narrowing to int
    const int saved_errno = errno;
    log("write return = %zd\n", ret);
    errno = saved_errno;
    return ret;
}
/// EPOLLOUT handler: send as much of g_http_rqst to `fd` as the socket accepts,
/// resuming from the offset stored in the fd's Counter.
///
/// - Everything sent: switch the fd to EPOLLIN only and drop the Counter.
/// - Socket buffer full (EAGAIN/EWOULDBLOCK): watch EPOLLIN | EPOLLOUT so the
///   transfer is resumed when the fd becomes writable again.
/// - Any other failure: log it, close the fd and drop the Counter.
///
/// @param efd epoll instance the fd is registered with
/// @param fd  socket connected to the server
static void HandleWrite(int efd, int fd)
{
    Counter &cter = g_counters[fd];
    unsigned long left = g_http_rqst.length() - cter.n_wt;
    const char *label = (fd == STDIN_FILENO) ? "STDIN_FILENO" : "socketfd";
    log("g_http_rqst.length = %zu bytes, counter.n_wt= %lu bytes, left: %lu bytes\n",
        g_http_rqst.length(), cter.n_wt, left);

    ssize_t wn = 0;
    int write_errno = 0;
    while (left > 0) {
        wn = WrapperWrite(fd, g_http_rqst.data() + cter.n_wt, left);
        if (wn > 0) {
            cter.n_wt += static_cast<unsigned long>(wn);
            left -= static_cast<unsigned long>(wn);
            log("write %zd bytes, left: %lu bytes\n", wn, left);
            continue;
        }
        // Capture errno immediately: later logging calls may overwrite it.
        write_errno = (wn < 0) ? errno : 0;
        if (wn < 0 && write_errno == EINTR) {
            continue;  // interrupted before anything was written: just retry
        }
        break;
    }

    if (left == 0) {
        log("left is 0, %s[%d] UpdateEvents(EPOLL_CTL_MOD) to EPOLLIN aka monitor read\n",
            label, fd);
        // Once all data has been sent, stop watching for buffer-writable events.
        UpdateEvents(efd, fd, EPOLLIN, EPOLL_CTL_MOD);
        g_counters.erase(fd);  // cter is invalid from here on
        return;
    }

    if (wn < 0 && (write_errno == EAGAIN || write_errno == EWOULDBLOCK)) {
        if (!cter.write_en) { // default is false
            log("write return %zd, && errno is: %s, %s[%d] UpdateEvents(EPOLL_CTL_MOD) to EPOLLIN | EPOLLOUT\n",
                wn, (write_errno == EAGAIN) ? "EAGAIN" : "EWOULDBLOCK",
                label, fd);
            UpdateEvents(efd, fd, EPOLLIN | EPOLLOUT, EPOLL_CTL_MOD);
            cter.write_en = true;
        }
        return;
    }

    // Data is still pending and the failure was not "would block": give up on this fd.
    log("write error for %s[%d]: %d %s\n",
        label, fd, write_errno, strerror(write_errno));
    ::close(fd);
    g_counters.erase(fd);
}
static void MainLoop(int efd, int sockfd, int waitms)
{
const int kMaxEvents = 20;
struct epoll_event activeEvs[100];
/**< initialize epoll_event */
char buf[MAXSIZE];
/**< events用来从内核得到事件的集合,
* 函数返回需要处理的事件数目,如返回0表示已超时
*/
int num = ::epoll_wait(efd, activeEvs, kMaxEvents, waitms);
for (int i = 0;i < num; i++) {
int afd = activeEvs[i].data.fd; ///< get collected active fd(Events that have occurred)
if (activeEvs[i].events & EPOLLIN) ///< fd can read
HandleRead(efd, afd);
else if (activeEvs[i].events & EPOLLOUT) ///< fd can write
HandleWrite(efd, afd);
}
}
/// Client entry point. Usage: ./xxx <server-ipv4-address>
///
/// Builds the HTTP request, connects to argv[1]:SERV_PORT, switches the socket
/// to non-blocking mode, registers stdin with a new epoll instance and runs the
/// event loop forever. Any setup failure terminates the process through exit_if.
int main(int argc,char *argv[])
{
    struct sockaddr_in servaddr;
    exit_if(argc < 2, "./xxx ip");
    ::signal(SIGPIPE, SIG_IGN);  // a write to a closed peer should fail with an error, not kill us

    g_http_rqst = "GET /cia/arts/stories/LuAenOdp11Es9xEbHLoqEnqXad07tlnk4Ovdm0n1rauI5iQ3xmdjOwRRXJBQ3jMQ.mp3 HTTP/1.1\r\n";
    g_http_rqst += "Host: public01-1255411705.file.myqcloud.com\r\n";
    g_http_rqst += "Connection: close\r\n\r\n";

    int epollfd = ::epoll_create(1);
    exit_if(epollfd < 0, "epoll_create failed");

    int sockfd = ::socket(AF_INET,SOCK_STREAM,0);
    exit_if(sockfd < 0, "socket failed");
    g_socketfd = sockfd;

    ::memset(&servaddr, 0, sizeof(servaddr));
    servaddr.sin_family = AF_INET;
    servaddr.sin_port = htons(SERV_PORT);
    // inet_pton returns 1 only when argv[1] is a valid IPv4 address.
    const int pton = ::inet_pton(AF_INET, argv[1], &servaddr.sin_addr);
    exit_if(pton != 1, "invalid IPv4 address %s", argv[1]);

    // The connect is done while the socket is still blocking; it is made
    // non-blocking only afterwards.
    int ret = ::connect(sockfd,(struct sockaddr*)&servaddr,sizeof(servaddr));
    exit_if(ret, "connect failed %d %s", errno, strerror(errno));
    log("fd %d connect to %s[%d]\n", sockfd, argv[1],SERV_PORT);
    SetNonBlock(sockfd);

    // Register interest in stdin with the epoll instance.
    // EPOLLIN: the descriptor is readable (this also covers an orderly close by the peer).
    UpdateEvents(epollfd, STDIN_FILENO, EPOLLIN, EPOLL_CTL_ADD);

    for (;;) {
        MainLoop(epollfd, sockfd, 10000);
    }
    // Not reached: the loop above never exits.
    ::close(epollfd);
    ::close(sockfd);
    return 0;
}