diff --git a/src/filecache.c b/src/filecache.c
index a5a0e253..ffe1e068 100644
--- a/src/filecache.c
+++ b/src/filecache.c
@@ -507,15 +507,25 @@ static void get_fresh_fd(filecache_t *cache,
 
         if (slist) curl_slist_free_all(slist);
 
-        process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
+        bool non_retriable_error = process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
+        // Some errors should not be retried. (Non-errors will fail the
+        // for loop test and fall through naturally)
+        if (non_retriable_error) break;
     }
 
     if ((res != CURLE_OK || response_code >= 500) || inject_error(filecache_error_freshcurl1)) {
-        trigger_saint_event(CLUSTER_FAILURE);
-        set_dynamic_logging();
-        g_set_error(gerr, curl_quark(), E_FC_CURLERR, "%s: curl_easy_perform is not CURLE_OK or 500: %s",
-            funcname, curl_easy_strerror(res));
-        goto finish;
+        if (res == CURLE_PARTIAL_FILE) {
+            // We see errors on partial file transfers. This seems to mean that the
+            // file didn't actually make it to S3. Capture this particular error,
+            // and don't send into saint mode.
+            response_code = 999; // fake code to avoid accidental match
+        } else {
+            trigger_saint_event(CLUSTER_FAILURE);
+            set_dynamic_logging();
+            g_set_error(gerr, curl_quark(), E_FC_CURLERR, "%s: curl_easy_perform is not CURLE_OK or 500: %s",
+                funcname, curl_easy_strerror(res));
+            goto finish;
+        }
     } else {
         trigger_saint_event(CLUSTER_SUCCESS);
     }
@@ -531,7 +541,12 @@ static void get_fresh_fd(filecache_t *cache,
     // but on the close (dav_flush/release), the PUT fails and the file never makes it to the server.
     // On opening again, the server will deliver this unexpected 404. Changes for forensic-haven
     // should prevent these errors in the future (2013-08-29)
+
+    // Update: if the offload of the file to S3 fails for some reason, either a partial file
+    // transfer error, which is a curl error so res != CURLE_OK, or a 404 is returned.
+
     if (inject_error(filecache_error_fresh400)) response_code = 400;
+
     if (response_code == 304) {
         // This should never happen with a well-behaved server.
         if (pdata == NULL || inject_error(filecache_error_freshcurl2)) {
@@ -713,16 +728,25 @@ static void get_fresh_fd(filecache_t *cache,
                 stats_timer("exceeded-time-small-GET-latency", elapsed_time);
             }
         }
-        else if (response_code == 404) {
+        else if (response_code == 404 || res == CURLE_PARTIAL_FILE) {
            /* If two bindings are in a race condition on the same file, this
             * can occur. Binding A does a propfind; Binding B does a propfind
             * and the file is shown to exist. Binding A deletes the file
             * then Binding B tries to access it and the server returns 404.
             * Not sure how to remediate without doing way more work and
             * introducing way more problems than the fix will fix.
+            * If a file doesn't make it to S3, fusedav often sees 'partial file transfer'.
+            * In this case, treat it more or less like a 404. The problem is that
+            * Valhalla Cassandra will think the file exists, and only fail when
+            * it tries to get content from S3. This will happen on each request
+            * for the file.
            */
            struct stat_cache_value *value;
-           g_set_error(gerr, filecache_quark(), ENOENT, "%s: File expected to exist returns 404.", funcname);
+           if (res == CURLE_PARTIAL_FILE) {
+               g_set_error(gerr, filecache_quark(), ENOENT, "%s: File expected to exist returns 404-partial file transfer.", funcname);
+           } else {
+               g_set_error(gerr, filecache_quark(), ENOENT, "%s: File expected to exist returns 404.", funcname);
+           }
            /* we get a 404 because the stat_cache returned that the file existed, but it
             * was not on the server. Deleting it from the stat_cache makes the stat_cache
             * consistent, so the next access to the file will be handled correctly.
@@ -1081,7 +1105,10 @@ static void put_return_etag(const char *path, int fd, char *etag, GError **gerr)
 
         if (slist) curl_slist_free_all(slist);
 
-        process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
+        bool non_retriable_error = process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
+        // Some errors should not be retried. (Non-errors will fail the
+        // for loop test and fall through naturally)
+        if (non_retriable_error) break;
     }
 
     if ((res != CURLE_OK || response_code >= 500) || inject_error(filecache_error_etagcurl1)) {
diff --git a/src/fusedav.c b/src/fusedav.c
index 70a3aef2..2b0abd6e 100644
--- a/src/fusedav.c
+++ b/src/fusedav.c
@@ -275,7 +275,10 @@ static void getdir_propfind_callback(__unused void *userdata, const char *path,
                 "%s: saw 410; calling HEAD on %s", funcname, path);
             timed_curl_easy_perform(session, &res, &response_code, &elapsed_time);
 
-            process_status(funcname, session, res, response_code, elapsed_time, idx, path, tmp_session);
+            bool non_retriable_error = process_status(funcname, session, res, response_code, elapsed_time, idx, path, tmp_session);
+            // Some errors should not be retried. (Non-errors will fail the
+            // for loop test and fall through naturally)
+            if (non_retriable_error) break;
         }
 
         delete_tmp_session(session);
@@ -930,7 +933,10 @@ static void common_unlink(const char *path, bool do_unlink, GError **gerr) {
 
         if (slist) curl_slist_free_all(slist);
 
-        process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
+        bool non_retriable_error = process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
+        // Some errors should not be retried. (Non-errors will fail the
+        // for loop test and fall through naturally)
+        if (non_retriable_error) break;
     }
 
     if(res != CURLE_OK || response_code >= 500 || inject_error(fusedav_error_cunlinkcurl)) {
@@ -1050,7 +1056,10 @@ static int dav_rmdir(const char *path) {
 
         if (slist) curl_slist_free_all(slist);
 
-        process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
+        bool non_retriable_error = process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
+        // Some errors should not be retried. (Non-errors will fail the
+        // for loop test and fall through naturally)
+        if (non_retriable_error) break;
     }
 
     if (res != CURLE_OK || response_code >= 500) {
@@ -1120,7 +1129,10 @@ static int dav_mkdir(const char *path, mode_t mode) {
 
         if (slist) curl_slist_free_all(slist);
 
-        process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
+        bool non_retriable_error = process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
+        // Some errors should not be retried. (Non-errors will fail the
+        // for loop test and fall through naturally)
+        if (non_retriable_error) break;
     }
 
     if (res != CURLE_OK || response_code >= 500) {
@@ -1222,7 +1234,10 @@ static int dav_rename(const char *from, const char *to) {
 
         if (slist) curl_slist_free_all(slist);
 
-        process_status(funcname, session, res, response_code, elapsed_time, idx, to, false);
+        bool non_retriable_error = process_status(funcname, session, res, response_code, elapsed_time, idx, to, false);
+        // Some errors should not be retried. (Non-errors will fail the
+        // for loop test and fall through naturally)
+        if (non_retriable_error) break;
     }
 
     /* move:
diff --git a/src/log.c b/src/log.c
index ac359f00..6dd6d3b4 100644
--- a/src/log.c
+++ b/src/log.c
@@ -169,7 +169,7 @@ int logging(unsigned int log_level, unsigned int section) {
 static int print_it(const char const *formatwithtid, const char const *msg, int log_level) {
     int ret;
 
-    // fusedav-valhalla standardizing on names BINDING, SITE, and ENVIRONMENT
+    // fusedav-server standardizing on names BINDING, SITE, and ENVIRONMENT
     ret = sd_journal_send("MESSAGE=%s%s", formatwithtid, msg,
             "PRIORITY=%d", log_level,
             "USER_AGENT=%s", get_user_agent(),
diff --git a/src/props.c b/src/props.c
index 178ff404..f9f116db 100644
--- a/src/props.c
+++ b/src/props.c
@@ -451,7 +451,10 @@ int simple_propfind(const char *path, size_t depth, time_t last_updated, props_r
         free(query_string);
         query_string = NULL;
 
-        process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
+        bool non_retriable_error = process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
+        // Some errors should not be retried. (Non-errors will fail the
+        // for loop test and fall through naturally)
+        if (non_retriable_error) break;
     }
 
     if (res != CURLE_OK || response_code >= 500 || inject_error(props_error_spropfindcurl)) {
diff --git a/src/session.c b/src/session.c
index cb5dd0ad..220f9ff9 100644
--- a/src/session.c
+++ b/src/session.c
@@ -1006,27 +1006,44 @@ static void increment_node_failure(char *addr, const CURLcode res, const long re
     healthstatus->timestamp = time(NULL); // Most recent failure. We don't currently use this value, but it might be interesting
 }
 
-void process_status(const char *fcn_name, CURL *session, const CURLcode res,
+bool process_status(const char *fcn_name, CURL *session, const CURLcode res,
         const long response_code, const long elapsed_time, const int iter,
         const char *path, bool tmp_session) {
+    bool non_retriable_error = false; // default to retry
+
     stats_counter("attempts", 1);
 
     if (res != CURLE_OK) {
-        print_errors(iter, "curl_failures", fcn_name, res, response_code, elapsed_time, path);
-        increment_node_failure(nodeaddr, res, response_code, elapsed_time);
-        delete_session(session, tmp_session);
-        return;
+        // Treat some curl failures differently than others
+        if (res == CURLE_PARTIAL_FILE) {
+            non_retriable_error = true;
+            print_errors(iter, "partial-file-transfer", fcn_name, res, response_code, elapsed_time, path);
+            // This error is not likely a reflection of the status of the node,
+            // but rather specific to the file being accessed. (We think it likely
+            // signals that the file was transferred to the server but not to permanent storage.)
+            // So don't retry; it's pretty certain it will have the same result.
+        } else {
+            print_errors(iter, "curl_failures", fcn_name, res, response_code, elapsed_time, path);
+            increment_node_failure(nodeaddr, res, response_code, elapsed_time);
+            delete_session(session, tmp_session);
+        }
+        return non_retriable_error;
     }
 
     if (response_code >= 500) {
+        // We could treat 50x errors differently here
        print_errors(iter, "status500_failures", fcn_name, res, response_code, elapsed_time, path);
        increment_node_failure(nodeaddr, res, response_code, elapsed_time);
        delete_session(session, tmp_session);
-       return;
+       return non_retriable_error;
     }
 
+    // We mark a node unhealthy if it takes longer than time_limit to respond, but since
+    // it is neither a curl error nor a response_code >= 500 error, it won't be retried
+    // anyway. For clarity, mark it non-retriable explicitly.
     if (elapsed_time > time_limit) {
+        non_retriable_error = true;
        print_errors(iter, "slow_requests", fcn_name, res, response_code, elapsed_time, path);
        increment_node_failure(nodeaddr, res, response_code, elapsed_time);
        if (health_status_all_nodes() == UNHEALTHY) {
@@ -1034,7 +1051,7 @@ void process_status(const char *fcn_name, CURL *session, const CURLcode res,
            set_dynamic_logging();
        }
        delete_session(session, tmp_session);
-       return;
+       return non_retriable_error;
     }
 
     // If it wasn't an error, and it isn't the 0'th iter, then we must have failed previously and now recovered
@@ -1044,6 +1061,8 @@ void process_status(const char *fcn_name, CURL *session, const CURLcode res,
             "%s: curl iter %d on path %s -- fusedav.%s.server-%s.recoveries", fcn_name, iter, path, filesystem_cluster, nodeaddr);
         increment_node_success(nodeaddr);
     }
+
+    return non_retriable_error;
 }
 
 static bool valid_slist(void) {
diff --git a/src/session.h b/src/session.h
index 0062775e..0e9745ab 100644
--- a/src/session.h
+++ b/src/session.h
@@ -27,7 +27,7 @@ extern int num_filesystem_server_nodes;
 int session_config_init(char *base, char *ca_cert, char *client_cert, bool grace);
 CURL *session_request_init(const char *path, const char *query_string, bool temporary_handle);
 void session_config_free(void);
-void process_status(const char *fcn_name, CURL *session, const CURLcode res,
+bool process_status(const char *fcn_name, CURL *session, const CURLcode res,
         const long response_code, const long elapsed_time, const int iter,
         const char *path, bool tmp_session);
 const char *get_base_url(void);
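
Note on the calling pattern (editorial illustration, not part of the patch): process_status() now returns true for errors that should not be retried, and each retry loop touched by this diff breaks out early on that signal. The sketch below shows that caller-side pattern under stated assumptions; fetch_with_retry() and MAX_ATTEMPTS are hypothetical stand-ins for the real call sites, which use timed_curl_easy_perform() and per-node retry bounds, and the elapsed_time unit is assumed to be milliseconds.

#include <stdbool.h>
#include <curl/curl.h>

#define MAX_ATTEMPTS 3   /* hypothetical retry bound; the real code retries across filesystem nodes */

/* Declaration from session.h as changed by this patch: returns true when the
 * error should not be retried. */
bool process_status(const char *fcn_name, CURL *session, const CURLcode res,
        const long response_code, const long elapsed_time, const int iter,
        const char *path, bool tmp_session);

static void fetch_with_retry(CURL *session, const char *path) {
    static const char *funcname = "fetch_with_retry";
    CURLcode res = CURLE_FAILED_INIT;   /* non-OK so the loop runs at least once */
    long response_code = 0;
    long elapsed_time = 0;

    for (int idx = 0; idx < MAX_ATTEMPTS && (res != CURLE_OK || response_code >= 500); idx++) {
        double total_time = 0.0;

        /* Issue the request and collect status; the real code wraps this in
         * timed_curl_easy_perform(). */
        res = curl_easy_perform(session);
        curl_easy_getinfo(session, CURLINFO_RESPONSE_CODE, &response_code);
        curl_easy_getinfo(session, CURLINFO_TOTAL_TIME, &total_time);
        elapsed_time = (long)(total_time * 1000);   /* assumed unit: milliseconds */

        bool non_retriable_error = process_status(funcname, session, res,
                response_code, elapsed_time, idx, path, false);
        // Some errors (e.g. CURLE_PARTIAL_FILE) should not be retried; non-errors
        // fail the for-loop test and fall through naturally.
        if (non_retriable_error) break;
    }

    /* Callers then branch on res/response_code, e.g. treating CURLE_PARTIAL_FILE
     * like a 404 instead of triggering saint mode, as get_fresh_fd() does in the
     * filecache.c hunks above. */
}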