Skip to content

Commit

Permalink
Stage to master (#379)
Browse files Browse the repository at this point in the history
Improve response to partial file transfer failures
  • Loading branch information
jerryblakley authored Sep 20, 2017
1 parent d41d182 commit 3829533
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 24 deletions.
45 changes: 36 additions & 9 deletions src/filecache.c
Original file line number Diff line number Diff line change
Expand Up @@ -507,15 +507,25 @@ static void get_fresh_fd(filecache_t *cache,

if (slist) curl_slist_free_all(slist);

process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
bool non_retriable_error = process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
// Some errors should not be retried. (Non-errors will fail the
// for loop test and fall through naturally)
if (non_retriable_error) break;
}

if ((res != CURLE_OK || response_code >= 500) || inject_error(filecache_error_freshcurl1)) {
trigger_saint_event(CLUSTER_FAILURE);
set_dynamic_logging();
g_set_error(gerr, curl_quark(), E_FC_CURLERR, "%s: curl_easy_perform is not CURLE_OK or 500: %s",
funcname, curl_easy_strerror(res));
goto finish;
if (res == CURLE_PARTIAL_FILE) {
// We see errors on partial file transfers. This seems to mean that the
// file didn't actually make it to S3. Capture this particular error,
// and don't send into saint mode
response_code = 999; // fake code to avoid accidental match
} else {
trigger_saint_event(CLUSTER_FAILURE);
set_dynamic_logging();
g_set_error(gerr, curl_quark(), E_FC_CURLERR, "%s: curl_easy_perform is not CURLE_OK or 500: %s",
funcname, curl_easy_strerror(res));
goto finish;
}
} else {
trigger_saint_event(CLUSTER_SUCCESS);
}
Expand All @@ -531,7 +541,12 @@ static void get_fresh_fd(filecache_t *cache,
// but on the close (dav_flush/release), the PUT fails and the file never makes it to the server.
// On opening again, the server will deliver this unexpected 404. Changes for forensic-haven
// should prevent these errors in the future (2013-08-29)

// Update: if the offload of the file to S3 fails for some reason, either a partial file
// transfer error, which is a curl error so res != CURLE_OK, or a 404 is returned.

if (inject_error(filecache_error_fresh400)) response_code = 400;

if (response_code == 304) {
// This should never happen with a well-behaved server.
if (pdata == NULL || inject_error(filecache_error_freshcurl2)) {
Expand Down Expand Up @@ -713,16 +728,25 @@ static void get_fresh_fd(filecache_t *cache,
stats_timer("exceeded-time-small-GET-latency", elapsed_time);
}
}
else if (response_code == 404) {
else if (response_code == 404 || res == CURLE_PARTIAL_FILE) {
/* If two bindings are in a race condition on the same file, this
* can occur. Binding A does a propfind; Binding B does a propfind
* and the file is shown to exist. Binding A deletes the file
* then Binding B tries to access it and the server returns 404.
* Not sure how to remediate without doing way more work and
* introducing way more problems than the fix will fix.
* If a file doesn't make it to S3, fusedav often sees 'partial file transfer'
* In this case, treat more or less like a 404. The problem is that
* Valhalla Cassandra will think the file exists, and only fail when
* it tries to get content from S3. This will happen on each request
* for the file.
*/
struct stat_cache_value *value;
g_set_error(gerr, filecache_quark(), ENOENT, "%s: File expected to exist returns 404.", funcname);
if (res == CURLE_PARTIAL_FILE) {
g_set_error(gerr, filecache_quark(), ENOENT, "%s: File expected to exist returns 404-partial file transfer.", funcname);
} else {
g_set_error(gerr, filecache_quark(), ENOENT, "%s: File expected to exist returns 404.", funcname);
}
/* we get a 404 because the stat_cache returned that the file existed, but it
* was not on the server. Deleting it from the stat_cache makes the stat_cache
* consistent, so the next access to the file will be handled correctly.
Expand Down Expand Up @@ -1081,7 +1105,10 @@ static void put_return_etag(const char *path, int fd, char *etag, GError **gerr)

if (slist) curl_slist_free_all(slist);

process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
bool non_retriable_error = process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
// Some errors should not be retried. (Non-errors will fail the
// for loop test and fall through naturally)
if (non_retriable_error) break;
}

if ((res != CURLE_OK || response_code >= 500) || inject_error(filecache_error_etagcurl1)) {
Expand Down
25 changes: 20 additions & 5 deletions src/fusedav.c
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,10 @@ static void getdir_propfind_callback(__unused void *userdata, const char *path,
"%s: saw 410; calling HEAD on %s", funcname, path);
timed_curl_easy_perform(session, &res, &response_code, &elapsed_time);

process_status(funcname, session, res, response_code, elapsed_time, idx, path, tmp_session);
bool non_retriable_error = process_status(funcname, session, res, response_code, elapsed_time, idx, path, tmp_session);
// Some errors should not be retried. (Non-errors will fail the
// for loop test and fall through naturally)
if (non_retriable_error) break;
}

delete_tmp_session(session);
Expand Down Expand Up @@ -930,7 +933,10 @@ static void common_unlink(const char *path, bool do_unlink, GError **gerr) {

if (slist) curl_slist_free_all(slist);

process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
bool non_retriable_error = process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
// Some errors should not be retried. (Non-errors will fail the
// for loop test and fall through naturally)
if (non_retriable_error) break;
}

if(res != CURLE_OK || response_code >= 500 || inject_error(fusedav_error_cunlinkcurl)) {
Expand Down Expand Up @@ -1050,7 +1056,10 @@ static int dav_rmdir(const char *path) {

if (slist) curl_slist_free_all(slist);

process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
bool non_retriable_error = process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
// Some errors should not be retried. (Non-errors will fail the
// for loop test and fall through naturally)
if (non_retriable_error) break;
}

if (res != CURLE_OK || response_code >= 500) {
Expand Down Expand Up @@ -1120,7 +1129,10 @@ static int dav_mkdir(const char *path, mode_t mode) {

if (slist) curl_slist_free_all(slist);

process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
bool non_retriable_error = process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
// Some errors should not be retried. (Non-errors will fail the
// for loop test and fall through naturally)
if (non_retriable_error) break;
}

if (res != CURLE_OK || response_code >= 500) {
Expand Down Expand Up @@ -1222,7 +1234,10 @@ static int dav_rename(const char *from, const char *to) {

if (slist) curl_slist_free_all(slist);

process_status(funcname, session, res, response_code, elapsed_time, idx, to, false);
bool non_retriable_error = process_status(funcname, session, res, response_code, elapsed_time, idx, to, false);
// Some errors should not be retried. (Non-errors will fail the
// for loop test and fall through naturally)
if (non_retriable_error) break;
}

/* move:
Expand Down
2 changes: 1 addition & 1 deletion src/log.c
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ int logging(unsigned int log_level, unsigned int section) {

static int print_it(const char const *formatwithtid, const char const *msg, int log_level) {
int ret;
// fusedav-valhalla standardizing on names BINDING, SITE, and ENVIRONMENT
// fusedav-server standardizing on names BINDING, SITE, and ENVIRONMENT
ret = sd_journal_send("MESSAGE=%s%s", formatwithtid, msg,
"PRIORITY=%d", log_level,
"USER_AGENT=%s", get_user_agent(),
Expand Down
5 changes: 4 additions & 1 deletion src/props.c
Original file line number Diff line number Diff line change
Expand Up @@ -451,7 +451,10 @@ int simple_propfind(const char *path, size_t depth, time_t last_updated, props_r
free(query_string);
query_string = NULL;

process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
bool non_retriable_error = process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
// Some errors should not be retried. (Non-errors will fail the
// for loop test and fall through naturally)
if (non_retriable_error) break;
}

if (res != CURLE_OK || response_code >= 500 || inject_error(props_error_spropfindcurl)) {
Expand Down
33 changes: 26 additions & 7 deletions src/session.c
Original file line number Diff line number Diff line change
Expand Up @@ -1006,35 +1006,52 @@ static void increment_node_failure(char *addr, const CURLcode res, const long re
healthstatus->timestamp = time(NULL); // Most recent failure. We don't currently use this value, but it might be interesting
}

void process_status(const char *fcn_name, CURL *session, const CURLcode res,
bool process_status(const char *fcn_name, CURL *session, const CURLcode res,
const long response_code, const long elapsed_time, const int iter,
const char *path, bool tmp_session) {

bool non_retriable_error = false; // default to retry

stats_counter("attempts", 1);

if (res != CURLE_OK) {
print_errors(iter, "curl_failures", fcn_name, res, response_code, elapsed_time, path);
increment_node_failure(nodeaddr, res, response_code, elapsed_time);
delete_session(session, tmp_session);
return;
// Treat some curl failures differently than others
if (res == CURLE_PARTIAL_FILE) {
non_retriable_error = true;
print_errors(iter, "partial-file-transfer", fcn_name, res, response_code, elapsed_time, path);
// This error is not likely a reflection of the status of the node,
// but rather specific to the file being accessed. (We think it likely
// signals that the file was transferred to server but not to permanent storage.)
// So, don't retry, it's pretty certain it will have the same result.
} else {
print_errors(iter, "curl_failures", fcn_name, res, response_code, elapsed_time, path);
increment_node_failure(nodeaddr, res, response_code, elapsed_time);
delete_session(session, tmp_session);
}
return non_retriable_error;
}

if (response_code >= 500) {
// We could treat 50x errors differently here
print_errors(iter, "status500_failures", fcn_name, res, response_code, elapsed_time, path);
increment_node_failure(nodeaddr, res, response_code, elapsed_time);
delete_session(session, tmp_session);
return;
return non_retriable_error;
}

// We mark a node unhealthy if it takes longer than time_limit to respond, but since
// it is neither a curl error, nor a response_code >= 500 error, it won't retry
// anyway. For clarity, mark it non-retriable explicitly
if (elapsed_time > time_limit) {
non_retriable_error = true;
print_errors(iter, "slow_requests", fcn_name, res, response_code, elapsed_time, path);
increment_node_failure(nodeaddr, res, response_code, elapsed_time);
if (health_status_all_nodes() == UNHEALTHY) {
trigger_saint_event(CLUSTER_FAILURE);
set_dynamic_logging();
}
delete_session(session, tmp_session);
return;
return non_retriable_error;
}

// If it wasn't an error, and it isn't the 0'th iter, then we must have failed previously and now recovered
Expand All @@ -1044,6 +1061,8 @@ void process_status(const char *fcn_name, CURL *session, const CURLcode res,
"%s: curl iter %d on path %s -- fusedav.%s.server-%s.recoveries", fcn_name, iter, path, filesystem_cluster, nodeaddr);
increment_node_success(nodeaddr);
}

return non_retriable_error;
}

static bool valid_slist(void) {
Expand Down
2 changes: 1 addition & 1 deletion src/session.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ extern int num_filesystem_server_nodes;
int session_config_init(char *base, char *ca_cert, char *client_cert, bool grace);
CURL *session_request_init(const char *path, const char *query_string, bool temporary_handle);
void session_config_free(void);
void process_status(const char *fcn_name, CURL *session, const CURLcode res,
bool process_status(const char *fcn_name, CURL *session, const CURLcode res,
const long response_code, const long elapsed_time, const int iter,
const char *path, bool tmp_session);
const char *get_base_url(void);
Expand Down

0 comments on commit 3829533

Please sign in to comment.