Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug 1644359: Support error reporting in Python bindings #1039

Merged
merged 4 commits into from
Jul 8, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
* BUGFIX: raise an error if Glean is initialized with an empty string as the `application_id`.
* Python
* BUGFIX: correctly set the `app_build` metric to the newly provided `application_build_id` initialization option.
* The Python bindings now report networking errors in the `glean.upload.ping_upload_failure` metric (like all the other bindings).

[Full changelog](https://github.com/mozilla/glean/compare/v31.2.3...main)

Expand Down
7 changes: 6 additions & 1 deletion glean-core/ffi/glean.h
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,12 @@ void glean_get_upload_task(FfiPingUploadTask *result, uint8_t log_ping);
*/
uint8_t glean_initialize(const FfiConfiguration *cfg);

uint8_t glean_initialize_standalone_uploader(FfiStr data_dir, FfiStr language_binding_name);
/**
* # Safety
*
* A valid and non-null configuration object is required for this function.
*/
uint8_t glean_initialize_for_subprocess(const FfiConfiguration *cfg);

uint8_t glean_is_dirty_flag_set(void);

Expand Down
103 changes: 14 additions & 89 deletions glean-core/ffi/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ use ffi_support::{define_string_destructor, ConcurrentHandleMap, FfiStr, IntoFfi
pub use glean_core::metrics::MemoryUnit;
pub use glean_core::metrics::TimeUnit;
pub use glean_core::upload::ffi_upload_result::*;
use glean_core::upload::PingUploadManager;
use glean_core::Glean;
pub use glean_core::Lifetime;

Expand Down Expand Up @@ -128,48 +127,6 @@ where
with_glean_mut(|glean| Ok(callback(glean)))
}

/// Execute the callback with a reference to the PingUploadManager singleton,
/// returning a `Result`.
///
/// The callback returns a `Result<T, E>` while:
///
/// - Catching panics, and logging them.
/// - Converting `T` to a C-compatible type using [`IntoFfi`].
/// - Logging `E` and returning a default value.
pub(crate) fn with_standalone_uploader<R, F>(callback: F) -> R::Value
where
F: UnwindSafe + FnOnce(&PingUploadManager) -> Result<R, glean_core::Error>,
R: IntoFfi,
{
let mut error = ffi_support::ExternError::success();
let res = ffi_support::abort_on_panic::call_with_result(&mut error, || {
match glean_core::upload::global_upload_manager() {
Some(upload_manager) => {
let upload_manager = upload_manager.lock().unwrap();
callback(&upload_manager)
}
None => Err(glean_core::Error::not_initialized()),
}
});
handlemap_ext::log_if_error(error);
res
}

/// Execute the callback with a reference to the PingUploadManager singleton,
/// returning a value.
///
/// The callback returns a value while:
///
/// - Catching panics, and logging them.
/// - Converting the returned value to a C-compatible type using [`IntoFfi`].
pub(crate) fn with_standalone_uploader_value<R, F>(callback: F) -> R::Value
where
F: UnwindSafe + FnOnce(&PingUploadManager) -> R,
R: IntoFfi,
{
with_standalone_uploader(|ping_uploader| Ok(callback(ping_uploader)))
}

/// Initialize the logging system based on the target platform. This ensures
/// that logging is shown when executing the Glean SDK unit tests.
#[no_mangle]
Expand Down Expand Up @@ -402,19 +359,6 @@ pub extern "C" fn glean_is_first_run() -> u8 {
// * `result`: the object the output task will be written to.
#[no_mangle]
pub extern "C" fn glean_get_upload_task(result: *mut FfiPingUploadTask, log_ping: u8) {
// If an upload manager instance is available, use that (it should only happen
// in processes which do not initialize Glean).
if glean_core::upload::global_upload_manager().is_some() {
with_standalone_uploader_value(|ping_uploader| {
let ffi_task = FfiPingUploadTask::from(ping_uploader.get_upload_task(log_ping != 0));
unsafe {
std::ptr::write(result, ffi_task);
}
});
return;
}

// Otherwise
with_glean_value(|glean| {
let ffi_task = FfiPingUploadTask::from(glean.get_upload_task(log_ping != 0));
unsafe {
Expand Down Expand Up @@ -451,22 +395,6 @@ pub unsafe extern "C" fn glean_process_ping_upload_response(
// but as it controls the memory, we put something valid in place, just in case.
let task = std::ptr::replace(task, FfiPingUploadTask::Done);

// If an upload manager instance is available, use that (it should only happen
// in processes which do not initialize Glean).
if glean_core::upload::global_upload_manager().is_some() {
with_standalone_uploader(|ping_uploader| {
if let FfiPingUploadTask::Upload { document_id, .. } = task {
assert!(!document_id.is_null());
let document_id_str = CStr::from_ptr(document_id)
.to_str()
.map_err(|_| glean_core::Error::utf8_error())?;
ping_uploader.process_ping_upload_response(document_id_str, status.into());
};
Ok(())
});
return;
}

with_glean(|glean| {
if let FfiPingUploadTask::Upload { document_id, .. } = task {
assert!(!document_id.is_null());
Expand All @@ -479,25 +407,22 @@ pub unsafe extern "C" fn glean_process_ping_upload_response(
});
}

/// # Safety
///
/// A valid and non-null configuration object is required for this function.
#[no_mangle]
pub extern "C" fn glean_initialize_standalone_uploader(
data_dir: FfiStr,
language_binding_name: FfiStr,
) -> u8 {
pub unsafe extern "C" fn glean_initialize_for_subprocess(cfg: *const FfiConfiguration) -> u8 {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What happens if two processes write/open the database at the same time? Thin of the 'pingsender' case on Windows: FOG might be still running while the pingsender is being used.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My understanding is that LMDB handles this for us. See the Threads and Processes section here.

What isn't supported is opening the DB multiple times from the same process.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's very interesting. Would you mind adding a small comment about this guarantee here or in some other place? It would also be great to have test coverage for this, but I guess it's fine to do this as a follow-up (or at least just file a bug)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new Python tests in this PR are already sort of testing this case. Even in test mode, it fires off a subprocess (the difference is that it "joins" and waits for it to finish). To be extra sure, we could write a test that repeatedly changes metrics while the ping uploader is running and reporting network errors... Should be easy enough, so I might as well do that now.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was wrong about testing mode still using real subprocesses. So good catch -- I updated the tests to actually do that so we are testing things running on separate processes, and also added one that sets metrics from both processes at the same time.

assert!(!cfg.is_null());

handlemap_ext::handle_result(|| {
// Init the upload manager to perform a synchronous ping directory scan.
// Since this method is meant to be called from a process used exclusively
// for uploading, this is fine.
let mut upload_manager = PingUploadManager::new(
data_dir.to_string_fallible()?,
&language_binding_name.to_string_fallible()?,
true,
);
upload_manager.set_rate_limiter(
/* seconds per interval */ 60, /* max tasks per interval */ 10,
);
glean_core::upload::setup_upload_manager(upload_manager)?;
log::info!("Glean initialized in upload-only mode");
// We can create a reference to the FfiConfiguration struct:
// 1. We did a null check
// 2. We're not holding on to it beyond this function
// and we copy out all data when needed.
let glean_cfg = glean_core::Configuration::try_from(&*cfg)?;
let glean = Glean::new_for_subprocess(&glean_cfg)?;
glean_core::setup_glean(glean)?;
log::info!("Glean initialized for subprocess");
Ok(true)
})
}
Expand Down
7 changes: 6 additions & 1 deletion glean-core/ios/Glean/GleanFfi.h
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,12 @@ void glean_get_upload_task(FfiPingUploadTask *result, uint8_t log_ping);
*/
uint8_t glean_initialize(const FfiConfiguration *cfg);

uint8_t glean_initialize_standalone_uploader(FfiStr data_dir, FfiStr language_binding_name);
/**
* # Safety
*
* A valid and non-null configuration object is required for this function.
*/
uint8_t glean_initialize_for_subprocess(const FfiConfiguration *cfg);

uint8_t glean_is_dirty_flag_set(void);

Expand Down
21 changes: 11 additions & 10 deletions glean-core/python/glean/net/ping_upload_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import logging
from pathlib import Path
import re
import sys
import time
from typing import List, Tuple

Expand Down Expand Up @@ -50,7 +51,7 @@ def _process(cls):
from .. import Glean

return ProcessDispatcher.dispatch(
_process, (Glean._data_dir, Glean._configuration)
_process, (Glean._data_dir, Glean._application_id, Glean._configuration)
)

@classmethod
Expand Down Expand Up @@ -101,21 +102,21 @@ def _parse_ping_headers(
return headers


def _process(data_dir: Path, configuration) -> bool:
def _process(data_dir: Path, application_id: str, configuration) -> bool:

# Import here to avoid cyclical import
from ..glean import Glean

if not Glean.is_initialized():
# Always load the Glean shared object / dll even if we're in a (ping upload worker)
# subprocess.
# To make startup time better in subprocesses, consumers can initialize just the
# ping upload manager.
data_dir = ffi_support.new("char[]", _ffi.ffi_encode_string(str(data_dir)))
language_binding_name = ffi_support.new(
"char[]", _ffi.ffi_encode_string(_ffi.LANGUAGE_BINDING_NAME)
# We don't want to send pings or otherwise update the database during
# initialization in a subprocess, so we use
# `glean_initialize_for_subprocess` rather than `glean_initialize` here.
cfg = _ffi.make_config(
data_dir, application_id, True, configuration.max_events,
)
_ffi.lib.glean_initialize_standalone_uploader(data_dir, language_binding_name)
if _ffi.lib.glean_initialize_for_subprocess(cfg) == 0:
log.error("Couldn't initialize Glean in subprocess")
sys.exit(1)

wait_attempts = 0

Expand Down
18 changes: 18 additions & 0 deletions glean-core/python/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,24 @@ def safe_httpserver(httpserver):
return httpserver


@pytest.fixture
def slow_httpserver(httpserver):
"""
An httpserver that takes 0.5 seconds to respond.
"""
wait_for_server(httpserver)

orig_call = httpserver.__call__

def __call__(self, *args, **kwargs):
time.sleep(0.5)
return orig_call(*args, **kwargs)

httpserver.__call__ = __call__

return httpserver


@pytest.fixture
def safe_httpsserver(httpsserver):
wait_for_server(httpsserver)
Expand Down
88 changes: 88 additions & 0 deletions glean-core/python/tests/test_network.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,30 @@


from glean import Glean
from glean import metrics
from glean._process_dispatcher import ProcessDispatcher
from glean.net import PingUploadWorker
from glean.net.http_client import HttpClientUploader
from glean.net import ping_uploader


def get_upload_failure_metric():
return metrics.LabeledCounterMetricType(
disabled=False,
send_in_pings=["metrics"],
name="ping_upload_failure",
category="glean.upload",
labels=[
"status_code_4xx",
"status_code_5xx",
"status_code_unknown",
"unrecoverable",
"recoverable",
],
lifetime=metrics.Lifetime.PING,
)


def test_400_error(safe_httpserver):
safe_httpserver.serve_content(b"", code=400)

Expand All @@ -24,6 +43,22 @@ def test_400_error(safe_httpserver):
assert 1 == len(safe_httpserver.requests)


def test_400_error_submit(safe_httpserver, monkeypatch):
safe_httpserver.serve_content(b"", code=400)

# Force the ping upload worker into a separate process
monkeypatch.setattr(PingUploadWorker, "process", PingUploadWorker._process)
Glean._configuration._server_endpoint = safe_httpserver.url
Glean._submit_ping_by_name("baseline")
ProcessDispatcher._wait_for_last_process()

assert 1 == len(safe_httpserver.requests)

metric = get_upload_failure_metric()
assert 1 == metric["status_code_4xx"].test_get_value()
assert not metric["status_code_5xx"].test_has_value()


def test_500_error(safe_httpserver):
safe_httpserver.serve_content(b"", code=500)

Expand All @@ -36,6 +71,59 @@ def test_500_error(safe_httpserver):
assert 1 == len(safe_httpserver.requests)


def test_500_error_submit(safe_httpserver, monkeypatch):
safe_httpserver.serve_content(b"", code=500)

# Force the ping upload worker into a separate process
monkeypatch.setattr(PingUploadWorker, "process", PingUploadWorker._process)
Glean._configuration._server_endpoint = safe_httpserver.url
Glean._submit_ping_by_name("baseline")
ProcessDispatcher._wait_for_last_process()

# This kind of recoverable error will be tried 10 times
assert 10 == len(safe_httpserver.requests)

metric = get_upload_failure_metric()
assert not metric["status_code_4xx"].test_has_value()
assert 10 == metric["status_code_5xx"].test_get_value()


def test_500_error_submit_concurrent_writing(slow_httpserver, monkeypatch):
# This tests that concurrently writing to the database from the main process
# and the ping uploading subprocess.
slow_httpserver.serve_content(b"", code=500)

counter = metrics.CounterMetricType(
disabled=False,
category="test",
name="counter",
send_in_pings=["metrics"],
lifetime=metrics.Lifetime.PING,
)

# Force the ping upload worker into a separate process
monkeypatch.setattr(PingUploadWorker, "process", PingUploadWorker._process)
Glean._configuration._server_endpoint = slow_httpserver.url
Glean._submit_ping_by_name("baseline")

# While the uploader is running, increment the counter as fast as we can
times = 0
last_process = ProcessDispatcher._last_process
while last_process.poll() is None:
counter.add()
times += 1

# This kind of recoverable error will be tried 10 times
assert 10 == len(slow_httpserver.requests)

metric = get_upload_failure_metric()
assert not metric["status_code_4xx"].test_has_value()
assert 10 == metric["status_code_5xx"].test_get_value()

assert times > 0
assert times == counter.test_get_value()


def test_unknown_scheme():
response = HttpClientUploader.upload(
url="ftp://example.com/", data=b"{}", headers=[]
Expand Down
Loading