From e6efd62b2bc80bd08a8a197f2ad93a1c81d7f2c9 Mon Sep 17 00:00:00 2001 From: Stefan Fritsch Date: Sat, 12 Nov 2011 01:32:56 +0000 Subject: Fix assertion failure during very high load by preventing race condition between appending to the timeout queues and adding to the pollset. We don't add additional locking calls but only extend the present calls to include the apr_pollset_add. Therefore this hopefully should not cause too much performance regression. Add some comments Replace two AP_DEBUG_ASSERTS with better error handling git-svn-id: https://svn.apache.org/repos/asf/httpd/httpd/trunk@1201146 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES | 2 + STATUS | 5 --- server/mpm/event/event.c | 105 +++++++++++++++++++++++++++++++++++------------ 3 files changed, 81 insertions(+), 31 deletions(-) diff --git a/CHANGES b/CHANGES index c921965922..2e1eaf8354 100644 --- a/CHANGES +++ b/CHANGES @@ -1,6 +1,8 @@ -*- coding: utf-8 -*- Changes with Apache 2.3.16 + *) mpm_event: Fix assertion failure during very high load. [Stefan Fritsch] + *) configure: Only load the really imporant modules (i.e. those enabled by the 'few' selection) by default. Don't handle modules enabled with --enable-foo specially. [Stefan Fritsch] diff --git a/STATUS b/STATUS index 739410f045..62476842d0 100644 --- a/STATUS +++ b/STATUS @@ -110,11 +110,6 @@ RELEASE SHOWSTOPPERS: the hackathon seems to be that mod_lua should be released as experimental with a note that the API may change during 2.4.x. - * mpm_event can abort under very high load with - (17)File exists: process_socket: apr_pollset_add failure - file event.c, line 952, assertion "rc == 0" failed - child pid 18196 exit signal Aborted (6) - FOR BETA: diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c index 063d85ff7e..75af4f4995 100644 --- a/server/mpm/event/event.c +++ b/server/mpm/event/event.c @@ -190,6 +190,14 @@ struct timeout_queue { int count; const char *tag; }; +/* + * Several timeout queues that use different timeouts, so that we always can + * simply append to the end. + * write_completion_q uses TimeOut + * keepalive_q uses KeepAliveTimeOut + * linger_q uses MAX_SECS_TO_LINGER + * short_linger_q uses SECONDS_TO_LINGER + */ static struct timeout_queue write_completion_q, keepalive_q, linger_q, short_linger_q; static apr_pollfd_t *listener_pollfd; @@ -218,6 +226,15 @@ static apr_pollfd_t *listener_pollfd; #define TO_QUEUE_ELEM_INIT(el) APR_RING_ELEM_INIT(el, timeout_list) +/* + * The pollset for sockets that are in any of the timeout queues. Currently + * we use the timeout_mutex to make sure that connections are added/removed + * atomically to/from both event_pollset and a timeout queue. Otherwise + * some confusion can happen under high load if timeout queues and pollset + * get out of sync. + * XXX: It should be possible to make the lock unnecessary in many or even all + * XXX: cases. + */ static apr_pollset_t *event_pollset; #if HAVE_SERF @@ -720,6 +737,14 @@ static void set_signals(void) #endif } +/* + * close our side of the connection + * Pre-condition: cs is not in any timeout queue and not in the pollset, + * timeout_mutex is not locked + * return: 0 if connection is fully closed, + * 1 if connection is lingering + * may be called by listener or by worker thread + */ static int start_lingering_close(conn_state_t *cs) { apr_status_t rv; @@ -753,18 +778,30 @@ static int start_lingering_close(conn_state_t *cs) } apr_thread_mutex_lock(timeout_mutex); TO_QUEUE_APPEND(*q, cs); - apr_thread_mutex_unlock(timeout_mutex); cs->pfd.reqevents = APR_POLLIN | APR_POLLHUP | APR_POLLERR; rv = apr_pollset_add(event_pollset, &cs->pfd); + apr_thread_mutex_unlock(timeout_mutex); if (rv != APR_SUCCESS && !APR_STATUS_IS_EEXIST(rv)) { ap_log_error(APLOG_MARK, APLOG_ERR, rv, ap_server_conf, "start_lingering_close: apr_pollset_add failure"); - AP_DEBUG_ASSERT(0); + apr_thread_mutex_lock(timeout_mutex); + TO_QUEUE_REMOVE(*q, cs); + apr_thread_mutex_unlock(timeout_mutex); + apr_socket_close(cs->pfd.desc.s); + apr_pool_clear(cs->p); + ap_push_pool(worker_queue_info, cs->p); + return 0; } } return 1; } +/* + * forcibly close a lingering connection after the lingering period has + * expired + * Pre-condition: cs is not in any timeout queue and not in the pollset + * return: irrelevant (need same prototype as start_lingering_close) + */ static int stop_lingering_close(conn_state_t *cs) { apr_status_t rv; @@ -781,12 +818,11 @@ static int stop_lingering_close(conn_state_t *cs) return 0; } - - -/***************************************************************** - * Child process main loop. +/* + * process one connection in the worker + * return: 1 if the connection has been completed, + * 0 if it is still open and waiting for some event */ - static int process_socket(apr_thread_t *thd, apr_pool_t * p, apr_socket_t * sock, conn_state_t * cs, int my_child_num, int my_thread_num) @@ -902,9 +938,9 @@ read_request: cs->expiration_time = ap_server_conf->timeout + apr_time_now(); apr_thread_mutex_lock(timeout_mutex); TO_QUEUE_APPEND(write_completion_q, cs); - apr_thread_mutex_unlock(timeout_mutex); cs->pfd.reqevents = APR_POLLOUT | APR_POLLHUP | APR_POLLERR; rc = apr_pollset_add(event_pollset, &cs->pfd); + apr_thread_mutex_unlock(timeout_mutex); return 1; } else if (c->keepalive != AP_CONN_KEEPALIVE || c->aborted || @@ -939,11 +975,11 @@ read_request: apr_time_now(); apr_thread_mutex_lock(timeout_mutex); TO_QUEUE_APPEND(keepalive_q, cs); - apr_thread_mutex_unlock(timeout_mutex); /* Add work to pollset. */ cs->pfd.reqevents = APR_POLLIN; rc = apr_pollset_add(event_pollset, &cs->pfd); + apr_thread_mutex_unlock(timeout_mutex); if (rc != APR_SUCCESS) { ap_log_error(APLOG_MARK, APLOG_ERR, rc, ap_server_conf, @@ -1088,6 +1124,10 @@ static apr_status_t push_timer2worker(timer_event_t* te) return ap_queue_push_timer(worker_queue, te); } +/* + * Pre-condition: pfd->cs is neither in pollset nor timeout queue + * this function may only be called by the listener + */ static apr_status_t push2worker(const apr_pollfd_t * pfd, apr_pollset_t * pollset) { @@ -1095,21 +1135,6 @@ static apr_status_t push2worker(const apr_pollfd_t * pfd, conn_state_t *cs = (conn_state_t *) pt->baton; apr_status_t rc; - rc = apr_pollset_remove(pollset, pfd); - - /* - * Some of the pollset backends, like KQueue or Epoll - * automagically remove the FD if the socket is closed, - * therefore, we can accept _SUCCESS or _NOTFOUND, - * and we still want to keep going - */ - if (rc != APR_SUCCESS && !APR_STATUS_IS_NOTFOUND(rc)) { - ap_log_error(APLOG_MARK, APLOG_ERR, rc, ap_server_conf, - "pollset remove failed"); - start_lingering_close(cs); - return rc; - } - rc = ap_queue_push(worker_queue, cs->pfd.desc.s, cs, cs->p); if (rc != APR_SUCCESS) { /* trash the connection; we couldn't queue the connected @@ -1223,6 +1248,12 @@ static apr_status_t event_register_timed_callback(apr_time_t t, return APR_SUCCESS; } +/* + * Close socket and clean up if remote closed its end while we were in + * lingering close. + * Only to be called in the listener thread; + * Pre-condition: cs is in one of the linger queues and in the pollset + */ static void process_lingering_close(conn_state_t *cs, const apr_pollfd_t *pfd) { apr_socket_t *csd = ap_get_conn_socket(cs->c); @@ -1242,13 +1273,13 @@ static void process_lingering_close(conn_state_t *cs, const apr_pollfd_t *pfd) return; } + apr_thread_mutex_lock(timeout_mutex); rv = apr_pollset_remove(event_pollset, pfd); AP_DEBUG_ASSERT(rv == APR_SUCCESS); rv = apr_socket_close(csd); AP_DEBUG_ASSERT(rv == APR_SUCCESS); - apr_thread_mutex_lock(timeout_mutex); TO_QUEUE_REMOVE(*q, cs); apr_thread_mutex_unlock(timeout_mutex); TO_QUEUE_ELEM_INIT(cs); @@ -1267,6 +1298,7 @@ static void process_timeout_queue(struct timeout_queue *q, { int count = 0; conn_state_t *first, *cs, *last; + apr_status_t rv; if (!q->count) { return; } @@ -1276,6 +1308,11 @@ static void process_timeout_queue(struct timeout_queue *q, while (cs != APR_RING_SENTINEL(&q->head, conn_state_t, timeout_list) && cs->expiration_time < timeout_time) { last = cs; + rv = apr_pollset_remove(event_pollset, &cs->pfd); + if (rv != APR_SUCCESS && !APR_STATUS_IS_NOTFOUND(rv)) { + ap_log_cerror(APLOG_MARK, APLOG_ERR, rv, cs->c, + "apr_pollset_remove failed"); + } cs = APR_RING_NEXT(cs, timeout_list); count++; } @@ -1448,6 +1485,22 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) &workers_were_busy); apr_thread_mutex_lock(timeout_mutex); TO_QUEUE_REMOVE(*remove_from_q, cs); + rc = apr_pollset_remove(event_pollset, &cs->pfd); + + /* + * Some of the pollset backends, like KQueue or Epoll + * automagically remove the FD if the socket is closed, + * therefore, we can accept _SUCCESS or _NOTFOUND, + * and we still want to keep going + */ + if (rc != APR_SUCCESS && !APR_STATUS_IS_NOTFOUND(rc)) { + ap_log_error(APLOG_MARK, APLOG_ERR, rc, ap_server_conf, + "pollset remove failed"); + apr_thread_mutex_unlock(timeout_mutex); + start_lingering_close(cs); + break; + } + apr_thread_mutex_unlock(timeout_mutex); TO_QUEUE_ELEM_INIT(cs); /* If we didn't get a worker immediately for a keep-alive @@ -1476,7 +1529,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) ap_server_conf, "event_loop: unexpected state %d", cs->state); - AP_DEBUG_ASSERT(0); + ap_assert(0); } } else if (pt->type == PT_ACCEPT) { -- cgit v1.2.3