Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stage to master #379

Merged
merged 17 commits into from
Sep 20, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 36 additions & 9 deletions src/filecache.c
Original file line number Diff line number Diff line change
Expand Up @@ -507,15 +507,25 @@ static void get_fresh_fd(filecache_t *cache,

if (slist) curl_slist_free_all(slist);

process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
bool non_retriable_error = process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
// Some errors should not be retried. (Non-errors will fail the
// for loop test and fall through naturally)
if (non_retriable_error) break;
}

if ((res != CURLE_OK || response_code >= 500) || inject_error(filecache_error_freshcurl1)) {
trigger_saint_event(CLUSTER_FAILURE);
set_dynamic_logging();
g_set_error(gerr, curl_quark(), E_FC_CURLERR, "%s: curl_easy_perform is not CURLE_OK or 500: %s",
funcname, curl_easy_strerror(res));
goto finish;
if (res == CURLE_PARTIAL_FILE) {
// We see errors on partial file transfers. This seems to mean that the
// file didn't actually make it to S3. Capture this particular error,
// and don't send into saint mode
response_code = 999; // fake code to avoid accidental match
} else {
trigger_saint_event(CLUSTER_FAILURE);
set_dynamic_logging();
g_set_error(gerr, curl_quark(), E_FC_CURLERR, "%s: curl_easy_perform is not CURLE_OK or 500: %s",
funcname, curl_easy_strerror(res));
goto finish;
}
} else {
trigger_saint_event(CLUSTER_SUCCESS);
}
Expand All @@ -531,7 +541,12 @@ static void get_fresh_fd(filecache_t *cache,
// but on the close (dav_flush/release), the PUT fails and the file never makes it to the server.
// On opening again, the server will deliver this unexpected 404. Changes for forensic-haven
// should prevent these errors in the future (2013-08-29)

// Update: if the offload of the file to S3 fails for some reason, either a partial file
// transfer error, which is a curl error so res != CURLE_OK, or a 404 is returned.

if (inject_error(filecache_error_fresh400)) response_code = 400;

if (response_code == 304) {
// This should never happen with a well-behaved server.
if (pdata == NULL || inject_error(filecache_error_freshcurl2)) {
Expand Down Expand Up @@ -713,16 +728,25 @@ static void get_fresh_fd(filecache_t *cache,
stats_timer("exceeded-time-small-GET-latency", elapsed_time);
}
}
else if (response_code == 404) {
else if (response_code == 404 || res == CURLE_PARTIAL_FILE) {
/* If two bindings are in a race condition on the same file, this
* can occur. Binding A does a propfind; Binding B does a propfind
* and the file is shown to exist. Binding A deletes the file
* then Binding B tries to access it and the server returns 404.
* Not sure how to remediate without doing way more work and
* introducing way more problems than the fix will fix.
* If a file doesn't make it to S3, fusedav often sees 'partial file transfer'
* In this case, treat more or less like a 404. The problem is that
* Valhalla Cassandra will think the file exists, and only fail when
* it tries to get content from S3. This will happen on each request
* for the file.
*/
struct stat_cache_value *value;
g_set_error(gerr, filecache_quark(), ENOENT, "%s: File expected to exist returns 404.", funcname);
if (res == CURLE_PARTIAL_FILE) {
g_set_error(gerr, filecache_quark(), ENOENT, "%s: File expected to exist returns 404-partial file transfer.", funcname);
} else {
g_set_error(gerr, filecache_quark(), ENOENT, "%s: File expected to exist returns 404.", funcname);
}
/* we get a 404 because the stat_cache returned that the file existed, but it
* was not on the server. Deleting it from the stat_cache makes the stat_cache
* consistent, so the next access to the file will be handled correctly.
Expand Down Expand Up @@ -1081,7 +1105,10 @@ static void put_return_etag(const char *path, int fd, char *etag, GError **gerr)

if (slist) curl_slist_free_all(slist);

process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
bool non_retriable_error = process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
// Some errors should not be retried. (Non-errors will fail the
// for loop test and fall through naturally)
if (non_retriable_error) break;
}

if ((res != CURLE_OK || response_code >= 500) || inject_error(filecache_error_etagcurl1)) {
Expand Down
25 changes: 20 additions & 5 deletions src/fusedav.c
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,10 @@ static void getdir_propfind_callback(__unused void *userdata, const char *path,
"%s: saw 410; calling HEAD on %s", funcname, path);
timed_curl_easy_perform(session, &res, &response_code, &elapsed_time);

process_status(funcname, session, res, response_code, elapsed_time, idx, path, tmp_session);
bool non_retriable_error = process_status(funcname, session, res, response_code, elapsed_time, idx, path, tmp_session);
// Some errors should not be retried. (Non-errors will fail the
// for loop test and fall through naturally)
if (non_retriable_error) break;
}

delete_tmp_session(session);
Expand Down Expand Up @@ -930,7 +933,10 @@ static void common_unlink(const char *path, bool do_unlink, GError **gerr) {

if (slist) curl_slist_free_all(slist);

process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
bool non_retriable_error = process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
// Some errors should not be retried. (Non-errors will fail the
// for loop test and fall through naturally)
if (non_retriable_error) break;
}

if(res != CURLE_OK || response_code >= 500 || inject_error(fusedav_error_cunlinkcurl)) {
Expand Down Expand Up @@ -1050,7 +1056,10 @@ static int dav_rmdir(const char *path) {

if (slist) curl_slist_free_all(slist);

process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
bool non_retriable_error = process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
// Some errors should not be retried. (Non-errors will fail the
// for loop test and fall through naturally)
if (non_retriable_error) break;
}

if (res != CURLE_OK || response_code >= 500) {
Expand Down Expand Up @@ -1120,7 +1129,10 @@ static int dav_mkdir(const char *path, mode_t mode) {

if (slist) curl_slist_free_all(slist);

process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
bool non_retriable_error = process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
// Some errors should not be retried. (Non-errors will fail the
// for loop test and fall through naturally)
if (non_retriable_error) break;
}

if (res != CURLE_OK || response_code >= 500) {
Expand Down Expand Up @@ -1222,7 +1234,10 @@ static int dav_rename(const char *from, const char *to) {

if (slist) curl_slist_free_all(slist);

process_status(funcname, session, res, response_code, elapsed_time, idx, to, false);
bool non_retriable_error = process_status(funcname, session, res, response_code, elapsed_time, idx, to, false);
// Some errors should not be retried. (Non-errors will fail the
// for loop test and fall through naturally)
if (non_retriable_error) break;
}

/* move:
Expand Down
2 changes: 1 addition & 1 deletion src/log.c
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ int logging(unsigned int log_level, unsigned int section) {

static int print_it(const char const *formatwithtid, const char const *msg, int log_level) {
int ret;
// fusedav-valhalla standardizing on names BINDING, SITE, and ENVIRONMENT
// fusedav-server standardizing on names BINDING, SITE, and ENVIRONMENT
ret = sd_journal_send("MESSAGE=%s%s", formatwithtid, msg,
"PRIORITY=%d", log_level,
"USER_AGENT=%s", get_user_agent(),
Expand Down
5 changes: 4 additions & 1 deletion src/props.c
Original file line number Diff line number Diff line change
Expand Up @@ -451,7 +451,10 @@ int simple_propfind(const char *path, size_t depth, time_t last_updated, props_r
free(query_string);
query_string = NULL;

process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
bool non_retriable_error = process_status(funcname, session, res, response_code, elapsed_time, idx, path, false);
// Some errors should not be retried. (Non-errors will fail the
// for loop test and fall through naturally)
if (non_retriable_error) break;
}

if (res != CURLE_OK || response_code >= 500 || inject_error(props_error_spropfindcurl)) {
Expand Down
33 changes: 26 additions & 7 deletions src/session.c
Original file line number Diff line number Diff line change
Expand Up @@ -1006,35 +1006,52 @@ static void increment_node_failure(char *addr, const CURLcode res, const long re
healthstatus->timestamp = time(NULL); // Most recent failure. We don't currently use this value, but it might be interesting
}

void process_status(const char *fcn_name, CURL *session, const CURLcode res,
bool process_status(const char *fcn_name, CURL *session, const CURLcode res,
const long response_code, const long elapsed_time, const int iter,
const char *path, bool tmp_session) {

bool non_retriable_error = false; // default to retry

stats_counter("attempts", 1);

if (res != CURLE_OK) {
print_errors(iter, "curl_failures", fcn_name, res, response_code, elapsed_time, path);
increment_node_failure(nodeaddr, res, response_code, elapsed_time);
delete_session(session, tmp_session);
return;
// Treat some curl failures differently than others
if (res == CURLE_PARTIAL_FILE) {
non_retriable_error = true;
print_errors(iter, "partial-file-transfer", fcn_name, res, response_code, elapsed_time, path);
// This error is not likely a reflection of the status of the node,
// but rather specific to the file being accessed. (We think it likely
// signals that the file was transferred to server but not to permanent storage.)
// So, don't retry, it's pretty certain it will have the same result.
} else {
print_errors(iter, "curl_failures", fcn_name, res, response_code, elapsed_time, path);
increment_node_failure(nodeaddr, res, response_code, elapsed_time);
delete_session(session, tmp_session);
}
return non_retriable_error;
}

if (response_code >= 500) {
// We could treat 50x errors differently here
print_errors(iter, "status500_failures", fcn_name, res, response_code, elapsed_time, path);
increment_node_failure(nodeaddr, res, response_code, elapsed_time);
delete_session(session, tmp_session);
return;
return non_retriable_error;
}

// We mark a node unhealthy if it takes longer than time_limit to respond, but since
// it is neither a curl error, nor a response_code >= 500 error, it won't retry
// anyway. For clarity, mark it non-retriable explicitly
if (elapsed_time > time_limit) {
non_retriable_error = true;
print_errors(iter, "slow_requests", fcn_name, res, response_code, elapsed_time, path);
increment_node_failure(nodeaddr, res, response_code, elapsed_time);
if (health_status_all_nodes() == UNHEALTHY) {
trigger_saint_event(CLUSTER_FAILURE);
set_dynamic_logging();
}
delete_session(session, tmp_session);
return;
return non_retriable_error;
}

// If it wasn't an error, and it isn't the 0'th iter, then we must have failed previously and now recovered
Expand All @@ -1044,6 +1061,8 @@ void process_status(const char *fcn_name, CURL *session, const CURLcode res,
"%s: curl iter %d on path %s -- fusedav.%s.server-%s.recoveries", fcn_name, iter, path, filesystem_cluster, nodeaddr);
increment_node_success(nodeaddr);
}

return non_retriable_error;
}

static bool valid_slist(void) {
Expand Down
2 changes: 1 addition & 1 deletion src/session.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ extern int num_filesystem_server_nodes;
int session_config_init(char *base, char *ca_cert, char *client_cert, bool grace);
CURL *session_request_init(const char *path, const char *query_string, bool temporary_handle);
void session_config_free(void);
void process_status(const char *fcn_name, CURL *session, const CURLcode res,
bool process_status(const char *fcn_name, CURL *session, const CURLcode res,
const long response_code, const long elapsed_time, const int iter,
const char *path, bool tmp_session);
const char *get_base_url(void);
Expand Down