From 84429564fc26fdfdf521ceaa383e8eda6b2a3a5d Mon Sep 17 00:00:00 2001 From: Tony Sun Date: Mon, 27 Jun 2016 18:43:57 -0700 Subject: [PATCH 1/2] Change process_response clause An older version of ibrowse would throw a {error, {'EXIT', Reason}}, when a connection:closed header was received. In the newer version of ibrowse, it throws {error, req_timedout} instead. This leads to a maybe_retry function call because we do not have a clause that handles this error. Which inevitably leads to the replication process dying once it exhausts the retry limit. So we change the process_response clause to address this bug. However, this also means we could end up trying forever for real timeouts. BugzId:69053 --- src/couch_replicator_httpc.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/couch_replicator_httpc.erl b/src/couch_replicator_httpc.erl index 668b218..17e2091 100644 --- a/src/couch_replicator_httpc.erl +++ b/src/couch_replicator_httpc.erl @@ -128,7 +128,7 @@ process_response({error, connection_closing}, Worker, HttpDb, Params, _Cb) -> stop_and_release_worker(HttpDb#httpdb.httpc_pool, Worker), throw({retry, HttpDb, Params}); -process_response({error, {'EXIT',{normal,_}}}, _Worker, HttpDb, Params, _Cb) -> +process_response({error, req_timedout}, _Worker, HttpDb, Params, _Cb) -> % ibrowse worker terminated because remote peer closed the socket % -> not an error throw({retry, HttpDb, Params}); From b7eb63dcdbbf482cc37b6d2cd7b94c5692680e29 Mon Sep 17 00:00:00 2001 From: Tony Sun Date: Mon, 27 Jun 2016 18:54:43 -0700 Subject: [PATCH 2/2] Check if worker is alive for clean_mailbox When a connection:closed header is sent from the server, we handle it by calling ibrowse:stop on the worker and release it from the worker pool. But our clean_mailbox tries to clean the mailbox of this worker when it's already dead, leading to a timeout that crashes the changes_reader process and subsequently the replicator process. So we check to ensure that the Worker is still alive before we call ibrowse:stream_next. BugzId:69053 --- src/couch_replicator_httpc.erl | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/src/couch_replicator_httpc.erl b/src/couch_replicator_httpc.erl index 17e2091..266b922 100644 --- a/src/couch_replicator_httpc.erl +++ b/src/couch_replicator_httpc.erl @@ -222,16 +222,12 @@ clean_mailbox(_ReqId, 0) -> clean_mailbox({ibrowse_req_id, ReqId}, Count) when Count > 0 -> case get(?STREAM_STATUS) of {streaming, Worker} -> - ibrowse:stream_next(ReqId), - receive - {ibrowse_async_response, ReqId, _} -> - clean_mailbox({ibrowse_req_id, ReqId}, Count - 1); - {ibrowse_async_response_end, ReqId} -> + case is_process_alive(Worker) of + true -> + discard_message(ReqId, Worker, Count); + false -> put(?STREAM_STATUS, ended), ok - after 30000 -> - exit(Worker, {timeout, ibrowse_stream_cleanup}), - exit({timeout, ibrowse_stream_cleanup}) end; Status when Status == init; Status == ended -> receive @@ -248,6 +244,20 @@ clean_mailbox(_, Count) when Count > 0 -> ok. +discard_message(ReqId, Worker, Count) -> + ibrowse:stream_next(ReqId), + receive + {ibrowse_async_response, ReqId, _} -> + clean_mailbox({ibrowse_req_id, ReqId}, Count - 1); + {ibrowse_async_response_end, ReqId} -> + put(?STREAM_STATUS, ended), + ok + after 30000 -> + exit(Worker, {timeout, ibrowse_stream_cleanup}), + exit({timeout, ibrowse_stream_cleanup}) + end. + + maybe_retry(Error, Worker, #httpdb{retries = 0} = HttpDb, Params) -> report_error(Worker, HttpDb, Params, {error, Error});