author    | Stefan Fritsch <sf@apache.org> | 2011-11-12 02:32:56 +0100
committer | Stefan Fritsch <sf@apache.org> | 2011-11-12 02:32:56 +0100
commit    | e6efd62b2bc80bd08a8a197f2ad93a1c81d7f2c9 (patch)
tree      | f9cf55711c294885ecb5a0f1db9e020578668bac
parent    | Change module sets and default activation status: (diff)
Fix an assertion failure during very high load by preventing a race
condition between appending connections to the timeout queues and adding
them to the pollset. We don't add any additional locking calls, but only
extend the existing critical sections to include the apr_pollset_add call;
therefore this hopefully should not cause too much of a performance
regression.

Also add some comments, and replace two AP_DEBUG_ASSERT calls with better
error handling.
git-svn-id: https://svn.apache.org/repos/asf/httpd/httpd/trunk@1201146 13f79535-47bb-0310-9956-ffa450edef68
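
The core of the change is visible in each hunk below: apr_pollset_add()
moves inside the critical section that already guards the timeout-queue
append, so membership in the queue and in the pollset changes atomically.
A minimal sketch of the before/after pattern; the extern declarations
stand in for httpd's real globals, and the queue append is elided to a
comment:

    #include <apr_thread_mutex.h>
    #include <apr_poll.h>

    /* Stand-ins for httpd's globals, assumed to be created elsewhere. */
    extern apr_thread_mutex_t *timeout_mutex;
    extern apr_pollset_t *event_pollset;

    /* Before: the unlock comes before the pollset add.  The listener can
     * run in that gap, time the connection out, and hand it to another
     * worker; a later re-add of the still-present descriptor then fails
     * with APR_EEXIST -- the "(17)File exists" abort quoted in STATUS
     * below. */
    static apr_status_t watch_racy(apr_pollfd_t *pfd)
    {
        apr_thread_mutex_lock(timeout_mutex);
        /* ... TO_QUEUE_APPEND(some_timeout_queue, cs) ... */
        apr_thread_mutex_unlock(timeout_mutex);
        return apr_pollset_add(event_pollset, pfd);   /* race window */
    }

    /* After: no new lock/unlock calls; the existing critical section is
     * simply extended so both updates happen under the same mutex. */
    static apr_status_t watch_atomic(apr_pollfd_t *pfd)
    {
        apr_status_t rv;
        apr_thread_mutex_lock(timeout_mutex);
        /* ... TO_QUEUE_APPEND(some_timeout_queue, cs) ... */
        rv = apr_pollset_add(event_pollset, pfd);     /* under the mutex */
        apr_thread_mutex_unlock(timeout_mutex);
        return rv;
    }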
-rw-r--r-- | CHANGES                  |   2
-rw-r--r-- | STATUS                   |   5
-rw-r--r-- | server/mpm/event/event.c | 105
3 files changed, 81 insertions, 31 deletions
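
Besides the locking change, the commit replaces AP_DEBUG_ASSERT(0) with
real recovery where recovery is possible: in start_lingering_close() a
failed apr_pollset_add() now unwinds the queue append and drops just that
connection instead of aborting the whole child. A condensed, self-contained
sketch of that unwind pattern (all helper names here are hypothetical
stand-ins, not httpd's API):

    #include <stdio.h>

    /* Hypothetical stand-ins for httpd's queue/pollset/pool operations. */
    typedef struct { int in_queue; } conn;

    static void queue_append(conn *c)  { c->in_queue = 1; }
    static void queue_remove(conn *c)  { c->in_queue = 0; }
    static int  pollset_add(conn *c)   { (void)c; return -1; /* simulated failure */ }
    static void close_and_recycle(conn *c) { (void)c; puts("connection dropped"); }

    /* Returns 1 if the connection is now being watched, 0 if dropped. */
    static int watch(conn *c)
    {
        queue_append(c);
        if (pollset_add(c) != 0) {
            /* Old code: AP_DEBUG_ASSERT(0) -- one bad descriptor aborts
             * the whole child in a debug build.  New code: undo the
             * append and give up on this one connection only. */
            queue_remove(c);
            close_and_recycle(c);
            return 0;
        }
        return 1;
    }

    int main(void)
    {
        conn c = { 0 };
        printf("watched: %d\n", watch(&c));
        return 0;
    }

Where no recovery makes sense -- the event loop's "unexpected state"
branch -- the assertion is instead promoted to ap_assert(0), which stays
active in non-debug builds.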
diff --git a/CHANGES b/CHANGES
--- a/CHANGES
+++ b/CHANGES
@@ -1,6 +1,8 @@
                                                   -*- coding: utf-8 -*-
 Changes with Apache 2.3.16
 
+  *) mpm_event: Fix assertion failure during very high load. [Stefan Fritsch]
+
   *) configure: Only load the really important modules (i.e. those enabled
      by the 'few' selection) by default. Don't handle modules enabled with
      --enable-foo specially. [Stefan Fritsch]
diff --git a/STATUS b/STATUS
--- a/STATUS
+++ b/STATUS
@@ -110,11 +110,6 @@ RELEASE SHOWSTOPPERS:
     the hackathon seems to be that mod_lua should be released as experimental
     with a note that the API may change during 2.4.x.
 
-  * mpm_event can abort under very high load with
-    (17)File exists: process_socket: apr_pollset_add failure
-    file event.c, line 952, assertion "rc == 0" failed
-    child pid 18196 exit signal Aborted (6)
-
 FOR BETA:
 
diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c
index 063d85ff7e..75af4f4995 100644
--- a/server/mpm/event/event.c
+++ b/server/mpm/event/event.c
@@ -190,6 +190,14 @@ struct timeout_queue {
     int count;
     const char *tag;
 };
+/*
+ * Several timeout queues that use different timeouts, so that we always can
+ * simply append to the end.
+ *     write_completion_q uses TimeOut
+ *     keepalive_q        uses KeepAliveTimeOut
+ *     linger_q           uses MAX_SECS_TO_LINGER
+ *     short_linger_q     uses SECONDS_TO_LINGER
+ */
 static struct timeout_queue write_completion_q, keepalive_q, linger_q,
                             short_linger_q;
 static apr_pollfd_t *listener_pollfd;
@@ -218,6 +226,15 @@ static apr_pollfd_t *listener_pollfd;
 
 #define TO_QUEUE_ELEM_INIT(el) APR_RING_ELEM_INIT(el, timeout_list)
 
+/*
+ * The pollset for sockets that are in any of the timeout queues. Currently
+ * we use the timeout_mutex to make sure that connections are added/removed
+ * atomically to/from both event_pollset and a timeout queue. Otherwise
+ * some confusion can happen under high load if timeout queues and pollset
+ * get out of sync.
+ * XXX: It should be possible to make the lock unnecessary in many or even all
+ * XXX: cases.
+ */
 static apr_pollset_t *event_pollset;
 
 #if HAVE_SERF
@@ -720,6 +737,14 @@ static void set_signals(void)
 #endif
 }
 
+/*
+ * close our side of the connection
+ * Pre-condition: cs is not in any timeout queue and not in the pollset,
+ *                timeout_mutex is not locked
+ * return: 0 if connection is fully closed,
+ *         1 if connection is lingering
+ * may be called by listener or by worker thread
+ */
 static int start_lingering_close(conn_state_t *cs)
 {
     apr_status_t rv;
@@ -753,18 +778,30 @@ static int start_lingering_close(conn_state_t *cs)
         }
         apr_thread_mutex_lock(timeout_mutex);
         TO_QUEUE_APPEND(*q, cs);
-        apr_thread_mutex_unlock(timeout_mutex);
         cs->pfd.reqevents = APR_POLLIN | APR_POLLHUP | APR_POLLERR;
         rv = apr_pollset_add(event_pollset, &cs->pfd);
+        apr_thread_mutex_unlock(timeout_mutex);
         if (rv != APR_SUCCESS && !APR_STATUS_IS_EEXIST(rv)) {
             ap_log_error(APLOG_MARK, APLOG_ERR, rv, ap_server_conf,
                          "start_lingering_close: apr_pollset_add failure");
-            AP_DEBUG_ASSERT(0);
+            apr_thread_mutex_lock(timeout_mutex);
+            TO_QUEUE_REMOVE(*q, cs);
+            apr_thread_mutex_unlock(timeout_mutex);
+            apr_socket_close(cs->pfd.desc.s);
+            apr_pool_clear(cs->p);
+            ap_push_pool(worker_queue_info, cs->p);
+            return 0;
         }
     }
     return 1;
 }
 
+/*
+ * forcibly close a lingering connection after the lingering period has
+ * expired
+ * Pre-condition: cs is not in any timeout queue and not in the pollset
+ * return: irrelevant (need same prototype as start_lingering_close)
+ */
 static int stop_lingering_close(conn_state_t *cs)
 {
     apr_status_t rv;
@@ -781,12 +818,11 @@ static int stop_lingering_close(conn_state_t *cs)
     return 0;
 }
 
-
-
-/*****************************************************************
- * Child process main loop.
+/*
+ * process one connection in the worker
+ * return: 1 if the connection has been completed,
+ *         0 if it is still open and waiting for some event
  */
-
 static int process_socket(apr_thread_t *thd, apr_pool_t * p, apr_socket_t * sock,
                           conn_state_t * cs, int my_child_num, int my_thread_num)
@@ -902,9 +938,9 @@ read_request:
             cs->expiration_time = ap_server_conf->timeout + apr_time_now();
             apr_thread_mutex_lock(timeout_mutex);
             TO_QUEUE_APPEND(write_completion_q, cs);
-            apr_thread_mutex_unlock(timeout_mutex);
             cs->pfd.reqevents = APR_POLLOUT | APR_POLLHUP | APR_POLLERR;
             rc = apr_pollset_add(event_pollset, &cs->pfd);
+            apr_thread_mutex_unlock(timeout_mutex);
             return 1;
         }
         else if (c->keepalive != AP_CONN_KEEPALIVE || c->aborted ||
@@ -939,11 +975,11 @@ read_request:
                                apr_time_now();
         apr_thread_mutex_lock(timeout_mutex);
         TO_QUEUE_APPEND(keepalive_q, cs);
-        apr_thread_mutex_unlock(timeout_mutex);
 
         /* Add work to pollset. */
         cs->pfd.reqevents = APR_POLLIN;
         rc = apr_pollset_add(event_pollset, &cs->pfd);
+        apr_thread_mutex_unlock(timeout_mutex);
 
         if (rc != APR_SUCCESS) {
             ap_log_error(APLOG_MARK, APLOG_ERR, rc, ap_server_conf,
@@ -1088,6 +1124,10 @@ static apr_status_t push_timer2worker(timer_event_t* te)
     return ap_queue_push_timer(worker_queue, te);
 }
 
+/*
+ * Pre-condition: pfd->cs is neither in pollset nor timeout queue
+ * this function may only be called by the listener
+ */
 static apr_status_t push2worker(const apr_pollfd_t * pfd,
                                 apr_pollset_t * pollset)
 {
@@ -1095,21 +1135,6 @@ static apr_status_t push2worker(const apr_pollfd_t * pfd,
     conn_state_t *cs = (conn_state_t *) pt->baton;
     apr_status_t rc;
 
-    rc = apr_pollset_remove(pollset, pfd);
-
-    /*
-     * Some of the pollset backends, like KQueue or Epoll
-     * automagically remove the FD if the socket is closed,
-     * therefore, we can accept _SUCCESS or _NOTFOUND,
-     * and we still want to keep going
-     */
-    if (rc != APR_SUCCESS && !APR_STATUS_IS_NOTFOUND(rc)) {
-        ap_log_error(APLOG_MARK, APLOG_ERR, rc, ap_server_conf,
-                     "pollset remove failed");
-        start_lingering_close(cs);
-        return rc;
-    }
-
     rc = ap_queue_push(worker_queue, cs->pfd.desc.s, cs, cs->p);
     if (rc != APR_SUCCESS) {
         /* trash the connection; we couldn't queue the connected
@@ -1223,6 +1248,12 @@ static apr_status_t event_register_timed_callback(apr_time_t t,
     return APR_SUCCESS;
 }
 
+/*
+ * Close socket and clean up if remote closed its end while we were in
+ * lingering close.
+ * Only to be called in the listener thread;
+ * Pre-condition: cs is in one of the linger queues and in the pollset
+ */
 static void process_lingering_close(conn_state_t *cs, const apr_pollfd_t *pfd)
 {
     apr_socket_t *csd = ap_get_conn_socket(cs->c);
@@ -1242,13 +1273,13 @@ static void process_lingering_close(conn_state_t *cs, const apr_pollfd_t *pfd)
         return;
     }
 
+    apr_thread_mutex_lock(timeout_mutex);
     rv = apr_pollset_remove(event_pollset, pfd);
     AP_DEBUG_ASSERT(rv == APR_SUCCESS);
 
     rv = apr_socket_close(csd);
     AP_DEBUG_ASSERT(rv == APR_SUCCESS);
 
-    apr_thread_mutex_lock(timeout_mutex);
     TO_QUEUE_REMOVE(*q, cs);
     apr_thread_mutex_unlock(timeout_mutex);
     TO_QUEUE_ELEM_INIT(cs);
@@ -1267,6 +1298,7 @@ static void process_timeout_queue(struct timeout_queue *q,
 {
     int count = 0;
     conn_state_t *first, *cs, *last;
+    apr_status_t rv;
     if (!q->count) {
         return;
     }
@@ -1276,6 +1308,11 @@ static void process_timeout_queue(struct timeout_queue *q,
     while (cs != APR_RING_SENTINEL(&q->head, conn_state_t, timeout_list)
            && cs->expiration_time < timeout_time) {
         last = cs;
+        rv = apr_pollset_remove(event_pollset, &cs->pfd);
+        if (rv != APR_SUCCESS && !APR_STATUS_IS_NOTFOUND(rv)) {
+            ap_log_cerror(APLOG_MARK, APLOG_ERR, rv, cs->c,
+                          "apr_pollset_remove failed");
+        }
         cs = APR_RING_NEXT(cs, timeout_list);
         count++;
     }
@@ -1448,6 +1485,22 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
                                      &workers_were_busy);
                     apr_thread_mutex_lock(timeout_mutex);
                     TO_QUEUE_REMOVE(*remove_from_q, cs);
+                    rc = apr_pollset_remove(event_pollset, &cs->pfd);
+
+                    /*
+                     * Some of the pollset backends, like KQueue or Epoll
+                     * automagically remove the FD if the socket is closed,
+                     * therefore, we can accept _SUCCESS or _NOTFOUND,
+                     * and we still want to keep going
+                     */
+                    if (rc != APR_SUCCESS && !APR_STATUS_IS_NOTFOUND(rc)) {
+                        ap_log_error(APLOG_MARK, APLOG_ERR, rc, ap_server_conf,
+                                     "pollset remove failed");
+                        apr_thread_mutex_unlock(timeout_mutex);
+                        start_lingering_close(cs);
+                        break;
+                    }
+
                     apr_thread_mutex_unlock(timeout_mutex);
                     TO_QUEUE_ELEM_INIT(cs);
                     /* If we didn't get a worker immediately for a keep-alive
@@ -1476,7 +1529,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
                                  ap_server_conf,
                                  "event_loop: unexpected state %d",
                                  cs->state);
-                    AP_DEBUG_ASSERT(0);
+                    ap_assert(0);
                 }
             }
             else if (pt->type == PT_ACCEPT) {
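
The comment added above the queue declarations documents the invariant
that makes these queues cheap: each queue has one fixed timeout, so
expiration times increase monotonically from head to tail. New entries
can therefore always be appended at the tail, and the expiry scan in
process_timeout_queue() can stop at the first unexpired entry. A
simplified, self-contained illustration of that invariant -- a plain
linked list with hypothetical names rather than httpd's APR_RING:

    #include <assert.h>
    #include <stdio.h>
    #include <stddef.h>

    /* One queue per fixed timeout (TimeOut, KeepAliveTimeOut, ...). */
    typedef struct entry {
        long expiration_time;
        struct entry *next;
    } entry;

    typedef struct {
        entry *head, *tail;
        long timeout;             /* the queue's single, fixed timeout */
    } timeout_queue;

    static void q_append(timeout_queue *q, entry *e, long now)
    {
        e->expiration_time = now + q->timeout;
        e->next = NULL;
        if (q->tail) {
            /* Tail insert keeps the queue sorted *because* the timeout
             * is fixed; per-entry timeouts could violate this. */
            assert(q->tail->expiration_time <= e->expiration_time);
            q->tail->next = e;
        }
        else {
            q->head = e;
        }
        q->tail = e;
    }

    /* Expire from the head; the first unexpired entry ends the scan. */
    static entry *q_pop_expired(timeout_queue *q, long now)
    {
        entry *e = q->head;
        if (e == NULL || e->expiration_time >= now)
            return NULL;
        q->head = e->next;
        if (q->head == NULL)
            q->tail = NULL;
        return e;
    }

    int main(void)
    {
        timeout_queue q = { NULL, NULL, 5 };
        entry a, b;
        q_append(&q, &a, 100);    /* expires at 105 */
        q_append(&q, &b, 101);    /* expires at 106 */
        printf("expired at t=106: %s\n",
               q_pop_expired(&q, 106) == &a ? "a" : "-");
        return 0;
    }

This is also why the code keeps four separate queues rather than one:
mixing different timeouts in a single queue would break the
sorted-by-expiration property and force either sorted insertion or a
full scan on every pass.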