Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Send stats through socket rather than through logs #284

Merged
merged 5 commits into from
Apr 29, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 36 additions & 69 deletions src/filecache.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#include "stats.h"
#include "session.h"
#include "fusedav_config.h"
#include "fusedav-statsd.h"

#define REFRESH_INTERVAL 3
#define CACHE_FILE_ENTROPY 20
Expand Down Expand Up @@ -141,14 +142,14 @@ static char *path2key(const char *path) {
* to enhanced logging to LOG_NOTICE.
*/
struct curl_slist* enhanced_logging(struct curl_slist *slist, int log_level, int section, const char *format, ...) {
va_list ap;

// If we are injecting errors, we can trigger enhanced logging by decrementing
// the log level (LOG_INFO -> LOG_NOTICE)
if (inject_error(filecache_error_enhanced_logging)) {
log_level -= 1;
}
if (logging(log_level, section)) {
va_list ap;
char *user_agent = NULL;
char msg[81] = {0};
slist = curl_slist_append(slist, "Log-To-Journal: true");
Expand Down Expand Up @@ -363,13 +364,8 @@ static void get_fresh_fd(filecache_t *cache,
struct timespec start_time;
long response_code = 500; // seed it as bad so we can enter the loop
CURLcode res = CURLE_OK;
static __thread unsigned long lgcount = 0;
static __thread long lglatency = 0;
static __thread time_t lgtime = 0;
static __thread unsigned long sgcount = 0;
static __thread long sglatency = 0;
static __thread time_t sgtime = 0;
// Not to exceed time for operation, else it's an error. Allow large files a longer time
// Somewhat arbitrary
static const unsigned small_time_allotment = 2000; // 2 seconds
static const unsigned large_time_allotment = 8000; // 8 seconds

Expand Down Expand Up @@ -652,80 +648,68 @@ static void get_fresh_fd(filecache_t *cache,
latency = FETCH(filecache_get_xlg_timing);
count = FETCH(filecache_get_xlg_count);
sz = "XLG";
aggregate_log_print_server(LOG_INFO, SECTION_ENHANCED, funcname, &lgtime,
"large-gets", &lgcount, 1,
"large-get-latency", &lglatency, elapsed_time);
stats_counter("large-gets", 1);
stats_timer("large-get-latency", elapsed_time);
}
else if (st.st_size > LG) {
TIMING(filecache_get_lg_timing, elapsed_time);
BUMP(filecache_get_lg_count);
latency = FETCH(filecache_get_lg_timing);
count = FETCH(filecache_get_lg_count);
sz = "LG";
aggregate_log_print_server(LOG_INFO, SECTION_ENHANCED, funcname, &lgtime,
"large-gets", &lgcount, 1,
"large-get-latency", &lglatency, elapsed_time);
stats_counter("large-gets", 1);
stats_timer("large-get-latency", elapsed_time);
}
else if (st.st_size > MED) {
TIMING(filecache_get_med_timing, elapsed_time);
BUMP(filecache_get_med_count);
latency = FETCH(filecache_get_med_timing);
count = FETCH(filecache_get_med_count);
sz = "MED";
aggregate_log_print_server(LOG_INFO, SECTION_ENHANCED, funcname, &lgtime,
"large-gets", &lgcount, 1,
"large-get-latency", &lglatency, elapsed_time);
stats_counter("large-gets", 1);
stats_timer("large-get-latency", elapsed_time);
}
else if (st.st_size > SM) {
TIMING(filecache_get_sm_timing, elapsed_time);
BUMP(filecache_get_sm_count);
latency = FETCH(filecache_get_sm_timing);
count = FETCH(filecache_get_sm_count);
sz = "SM";
aggregate_log_print_server(LOG_INFO, SECTION_ENHANCED, funcname, &sgtime,
"small-gets", &sgcount, 1,
"small-get-latency", &sglatency, elapsed_time);
stats_counter("small-gets", 1);
stats_timer("small-get-latency", elapsed_time);
}
else if (st.st_size > XSM) {
TIMING(filecache_get_xsm_timing, elapsed_time);
BUMP(filecache_get_xsm_count);
latency = FETCH(filecache_get_xsm_timing);
count = FETCH(filecache_get_xsm_count);
sz = "XSM";
aggregate_log_print_server(LOG_INFO, SECTION_ENHANCED, funcname, &sgtime,
"small-gets", &sgcount, 1,
"small-get-latency", &sglatency, elapsed_time);
stats_counter("small-gets", 1);
stats_timer("small-get-latency", elapsed_time);
}
else {
TIMING(filecache_get_xxsm_timing, elapsed_time);
BUMP(filecache_get_xxsm_count);
latency = FETCH(filecache_get_xxsm_timing);
count = FETCH(filecache_get_xxsm_count);
sz = "XXSM";
aggregate_log_print_server(LOG_INFO, SECTION_ENHANCED, funcname, &sgtime,
"small-gets", &sgcount, 1,
"small-get-latency", &sglatency, elapsed_time);
stats_counter("small-gets", 1);
stats_timer("small-get-latency", elapsed_time);
}
log_print(LOG_DEBUG, SECTION_FILECACHE_OPEN, "put_fresh_fd: GET on size %s (%lu) for %s -- Current:Average latency %lu :: %lu",
sz, st.st_size, path, elapsed_time, (latency / count));

/* The aggregate_log_print_server routine is expecting a cumulative count and latency, but by
* passing NULL for time, we ensure that aggregate prints this message and resets to 0, so
* there's no need to keep a static variable for count and latency.
*/
if (st.st_size >= LG && elapsed_time > large_time_allotment) {
unsigned long exceeded_count = 0;
log_print(LOG_WARNING, SECTION_FILECACHE_OPEN, "put_fresh_fd: large (%lu) GET for %s exceeded time allotment %lu with %lu",
st.st_size, path, large_time_allotment, elapsed_time);
aggregate_log_print_server(LOG_INFO, SECTION_ENHANCED, "put_fresh_fd", NULL, "exceeded-time-large-GET-count",
&exceeded_count, 1, "exceeded-time-large-GET-latency", &elapsed_time, 0);
stats_counter("exceeded-time-large-GET-count", 1);
stats_timer("exceeded-time-large-GET-latency", elapsed_time);
}
else if (st.st_size < LG && elapsed_time > small_time_allotment) {
unsigned long exceeded_count = 0;
log_print(LOG_WARNING, SECTION_FILECACHE_OPEN, "put_fresh_fd: small (%lu) GET for %s exceeded time allotment %lu with %lu",
st.st_size, path, small_time_allotment, elapsed_time);
aggregate_log_print_server(LOG_INFO, SECTION_ENHANCED, "put_fresh_fd", NULL, "exceeded-time-small-GET-count",
&exceeded_count, 1, "exceeded-time-small-GET-latency", &elapsed_time, 0);
stats_counter("exceeded-time-small-GET-count", 1);
stats_timer("exceeded-time-small-GET-latency", elapsed_time);
}
}
else if (response_code == 404) {
Expand Down Expand Up @@ -1006,13 +990,8 @@ static void put_return_etag(const char *path, int fd, char *etag, GError **gerr)
struct timespec start_time;
long response_code = 500; // seed it as bad so we can enter the loop
CURLcode res = CURLE_OK;
static __thread unsigned long lpcount = 0;
static __thread long lplatency = 0;
static __thread time_t lptime = 0;
static __thread unsigned long spcount = 0;
static __thread long splatency = 0;
static __thread time_t sptime = 0;
// Not to exceed time for operation, else it's an error. Allow large files a longer time
// Somewhat arbitrary
static const unsigned small_time_allotment = 2000; // 2 seconds
static const unsigned large_time_allotment = 8000; // 8 seconds

Expand Down Expand Up @@ -1135,80 +1114,68 @@ static void put_return_etag(const char *path, int fd, char *etag, GError **gerr)
latency = FETCH(filecache_put_xlg_timing);
count = FETCH(filecache_put_xlg_count);
sz = "XLG";
aggregate_log_print_server(LOG_INFO, SECTION_ENHANCED, funcname, &lptime,
"large-puts", &lpcount, 1,
"large-put-latency", &lplatency, elapsed_time);
stats_counter("large-puts", 1);
stats_timer("large-put-latency", elapsed_time);
}
else if (st.st_size > LG) {
TIMING(filecache_put_lg_timing, elapsed_time);
BUMP(filecache_put_lg_count);
latency = FETCH(filecache_put_lg_timing);
count = FETCH(filecache_put_lg_count);
sz = "LG";
aggregate_log_print_server(LOG_INFO, SECTION_ENHANCED, funcname, &lptime,
"large-puts", &lpcount, 1,
"large-put-latency", &lplatency, elapsed_time);
stats_counter("large-puts", 1);
stats_timer("large-put-latency", elapsed_time);
}
else if (st.st_size > MED) {
TIMING(filecache_put_med_timing, elapsed_time);
BUMP(filecache_put_med_count);
latency = FETCH(filecache_put_med_timing);
count = FETCH(filecache_put_med_count);
sz = "MED";
aggregate_log_print_server(LOG_INFO, SECTION_ENHANCED, funcname, &lptime,
"large-puts", &lpcount, 1,
"large-put-latency", &lplatency, elapsed_time);
stats_counter("large-puts", 1);
stats_timer("large-put-latency", elapsed_time);
}
else if (st.st_size > SM) {
TIMING(filecache_put_sm_timing, elapsed_time);
BUMP(filecache_put_sm_count);
latency = FETCH(filecache_put_sm_timing);
count = FETCH(filecache_put_sm_count);
sz = "SM";
aggregate_log_print_server(LOG_INFO, SECTION_ENHANCED, funcname, &sptime,
"small-puts", &spcount, 1,
"small-put-latency", &splatency, elapsed_time);
stats_counter("small-puts", 1);
stats_timer("small-put-latency", elapsed_time);
}
else if (st.st_size > XSM) {
TIMING(filecache_put_xsm_timing, elapsed_time);
BUMP(filecache_put_xsm_count);
latency = FETCH(filecache_put_xsm_timing);
count = FETCH(filecache_put_xsm_count);
sz = "XSM";
aggregate_log_print_server(LOG_INFO, SECTION_ENHANCED, funcname, &sptime,
"small-puts", &spcount, 1,
"small-put-latency", &splatency, elapsed_time);
stats_counter("small-puts", 1);
stats_timer("small-put-latency", elapsed_time);
}
else {
TIMING(filecache_put_xxsm_timing, elapsed_time);
BUMP(filecache_put_xxsm_count);
latency = FETCH(filecache_put_xxsm_timing);
count = FETCH(filecache_put_xxsm_count);
sz = "XXSM";
aggregate_log_print_server(LOG_INFO, SECTION_ENHANCED, funcname, &sptime,
"small-puts", &spcount, 1,
"small-put-latency", &splatency, elapsed_time);
stats_counter("small-puts", 1);
stats_timer("small-put-latency", elapsed_time);
}
log_print(LOG_DEBUG, SECTION_FILECACHE_OPEN, "%s: PUT on size %s (%lu) for %s -- Current:Average latency %lu :: %lu",
funcname, sz, st.st_size, path, elapsed_time, (latency / count));

/* The aggregate_log_print_server routine is expecting a cumulative count and latency, but by
* passing NULL for time, we ensure that aggregate prints this message and resets to 0, so
* there's no need to keep a static variable for count and latency.
*/
if (st.st_size >= LG && elapsed_time > large_time_allotment) {
unsigned long exceeded_count = 0;
log_print(LOG_WARNING, SECTION_FILECACHE_OPEN, "%s: large (%lu) PUT for %s exceeded time allotment %lu with %lu",
funcname, st.st_size, path, large_time_allotment, elapsed_time);
aggregate_log_print_server(LOG_INFO, SECTION_ENHANCED, funcname, NULL, "exceeded-time-large-PUT-count",
&exceeded_count, 1, "exceeded-time-large-PUT-latency", &elapsed_time, 0);
stats_counter("exceeded-time-large-PUT-count", 1);
stats_timer("exceeded-time-large-PUT-latency", elapsed_time);
}
else if (st.st_size < LG && elapsed_time > small_time_allotment) {
unsigned long exceeded_count = 0;
log_print(LOG_WARNING, SECTION_FILECACHE_OPEN, "%s: small (%lu) PUT for %s exceeded time allotment %lu with %lu",
funcname, st.st_size, path, small_time_allotment, elapsed_time);
aggregate_log_print_server(LOG_INFO, SECTION_ENHANCED, funcname, NULL, "exceeded-time-small-PUT-count",
&exceeded_count, 1, "exceeded-time-small-PUT-latency", &elapsed_time, 0);
stats_counter("exceeded-time-small-PUT-count", 1);
stats_timer("exceeded-time-small-PUT-latency", elapsed_time);
}
}

Expand Down
43 changes: 15 additions & 28 deletions src/fusedav.c
Original file line number Diff line number Diff line change
Expand Up @@ -73,24 +73,22 @@ static G_DEFINE_QUARK("FUSEDAV", fusedav)

static int processed_gerror(const char *prefix, const char *path, GError **pgerr) {
int ret;
static __thread unsigned long count = 0;
static __thread time_t previous_time = 0;
GError *gerr = *pgerr;

log_print(LOG_ERR, SECTION_FUSEDAV_DEFAULT, "%s on %s: %s -- %d: %s",
prefix, path ? path : "null path", gerr->message, gerr->code, g_strerror(gerr->code));
ret = -gerr->code;
if (gerr->code == ENOENT) {
aggregate_log_print_server(LOG_INFO, SECTION_ENHANCED, "processed_gerror", &previous_time, "error-ENOENT", &count, 1, NULL, NULL, 0);
stats_counter("error-ENOENT", 1);
}
else if (gerr->code == ENETDOWN) {
aggregate_log_print_server(LOG_INFO, SECTION_ENHANCED, "processed_gerror", &previous_time, "error-ENETDOWN", &count, 1, NULL, NULL, 0);
stats_counter("error-ENETDOWN", 1);
}
else if (gerr->code == EIO) {
aggregate_log_print_server(LOG_INFO, SECTION_ENHANCED, "processed_gerror", &previous_time, "error-EIO", &count, 1, NULL, NULL, 0);
stats_counter("error-EIO", 1);
}
else {
aggregate_log_print_server(LOG_INFO, SECTION_ENHANCED, "processed_gerror", &previous_time, "error-OTHER", &count, 1, NULL, NULL, 0);
stats_counter("error-OTHER", 1);
}
g_clear_error(pgerr);

Expand All @@ -109,32 +107,24 @@ static int simple_propfind_with_redirect(
struct timespec start_time;
struct timespec now;
long elapsed_time;
static __thread unsigned long count = 0;
static __thread long latency = 0;
static __thread time_t time = 0;
unsigned long exceeded_count = 0;
// Alert on propfind taking longer than 10 seconds
// Alert on propfind taking longer than 4 seconds. This is rather arbitrary.
static const unsigned propfind_time_allotment = 4000; // 4 seconds
int ret;

log_print(LOG_DEBUG, SECTION_FUSEDAV_STAT, "simple_propfind_with_redirect: Performing (%s) PROPFIND of depth %d on path %s.", last_updated > 0 ? "progressive" : "complete", depth, path);
log_print(LOG_DEBUG, SECTION_FUSEDAV_STAT, "simple_propfind_with_redirect: Performing (%s) PROPFIND of depth %d on path %s.",
last_updated > 0 ? "progressive" : "complete", depth, path);

clock_gettime(CLOCK_MONOTONIC, &start_time);
ret = simple_propfind(path, depth, last_updated, result_callback, userdata, &subgerr);
aggregate_log_print_server(LOG_INFO, SECTION_ENHANCED, "simple_propfind_with_redirect", &time, "propfind-count", &count, 1,
"propfind-latency", &latency, elapsed_time);
stats_counter("propfind-count", 1);
clock_gettime(CLOCK_MONOTONIC, &now);
elapsed_time = ((now.tv_sec - start_time.tv_sec) * 1000) + ((now.tv_nsec - start_time.tv_nsec) / (1000 * 1000));
/* The aggregate_log_print_server routine is expecting a cumulative count and latency, but by
* passing NULL for time, we ensure that aggregate prints this message and resets to 0, so
* there's no need to keep a static variable for count and latency.
*/
stats_counter("propfind-count", 1);
stats_timer("propfind-latency", elapsed_time);
if (elapsed_time > propfind_time_allotment) {
log_print(LOG_DEBUG, SECTION_FUSEDAV_STAT, "simple_propfind_with_redirect: (%s) PROPFIND exceeded allotment of %u ms; took %u ms.",
last_updated > 0 ? "progressive" : "complete", propfind_time_allotment, elapsed_time);
aggregate_log_print_server(LOG_INFO, SECTION_ENHANCED, "simple_propfind_with_redirect", NULL, "exceeded-time-propfind-count",
&exceeded_count, 1, "exceeded-time-propfind-latency", &elapsed_time, 0);
stats_counter("exceeded-time-propfind-count", 1);
stats_timer("exceeded-time-propfind-latency", elapsed_time);
}
if (subgerr) {
g_propagate_prefixed_error(gerr, subgerr, "simple_propfind_with_redirect: ");
Expand Down Expand Up @@ -322,12 +312,12 @@ static void update_directory(const char *path, bool attempt_progressive_update,
struct fusedav_config *config = fuse_get_context()->private_data;
GError *tmpgerr = NULL;
bool needs_update = true;
time_t last_updated;
time_t timestamp;
int propfind_result;

// Attempt to freshen the cache.
if (attempt_progressive_update && config->progressive_propfind) {
time_t last_updated;
timestamp = time(NULL);
last_updated = stat_cache_read_updated_children(config->cache, path, &tmpgerr);
if (tmpgerr) {
Expand Down Expand Up @@ -409,7 +399,6 @@ static int dav_readdir(
struct fusedav_config *config = fuse_get_context()->private_data;
struct fill_info f;
GError *gerr = NULL;
int ret;
int iters = 2;
bool ignore_freshness = false;

Expand Down Expand Up @@ -445,6 +434,7 @@ static int dav_readdir(
* way, the first access after network failure acts like a subsequent access in saint mode.
*/
for (int idx = 0; idx < iters; idx++) {
int ret;
// First, attempt to hit the cache.
ret = stat_cache_enumerate(config->cache, path, getdir_cache_callback, &f, ignore_freshness);
if (ret < 0) {
Expand Down Expand Up @@ -596,9 +586,6 @@ static void get_stat(const char *path, struct stat *stbuf, GError **gerr) {
time_t parent_children_update_ts;
bool is_base_directory;
int ret = -ENOENT;
// We are sharing these variables between negative and nonnegative cache counting. Should be fine.
static __thread unsigned long count = 0;
static __thread time_t previous_time = 0;
enum ignore_freshness skip_freshness_check = OFF;

memset(stbuf, 0, sizeof(struct stat));
Expand Down Expand Up @@ -724,7 +711,7 @@ static void get_stat(const char *path, struct stat *stbuf, GError **gerr) {
if (parent_children_update_ts < (time(NULL) - STAT_CACHE_NEGATIVE_TTL)) {
GError *subgerr = NULL;

aggregate_log_print_local(LOG_INFO, SECTION_ENHANCED, "get_stat", &previous_time, "propfind-nonnegative-cache", &count, 1, NULL, NULL, 0);
stats_counter("propfind-nonnegative-cache", 1);
log_print(LOG_INFO, SECTION_FUSEDAV_STAT, "get_stat: Calling update_directory: %s; attempt_progressive_update will be %d",
parent_path, (parent_children_update_ts > 0));
// If parent_children_update_ts is 0, there are no entries for updated_children in statcache
Expand All @@ -735,7 +722,7 @@ static void get_stat(const char *path, struct stat *stbuf, GError **gerr) {
goto fail;
}
} else {
aggregate_log_print_local(LOG_INFO, SECTION_ENHANCED, "get_stat", &previous_time, "propfind-negative-cache", &count, 1, NULL, NULL, 0);
stats_counter("propfind-negative-cache", 1);
BUMP(propfind_negative_cache);
}

Expand Down
Loading