From 5b775bf297c47a6bc50e36da89d1ec74a6fa01dc Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Sat, 16 Feb 2019 01:18:14 +1100 Subject: [PATCH 1/5] nsenter: cloned_binary: detect and handle short copies For a variety of reasons, sendfile(2) can end up doing a short-copy so we need to just loop until we hit the binary size. Since /proc/self/exe is tautologically our own binary, there's no chance someone is going to modify it underneath us (or change its size). Signed-off-by: Aleksa Sarai --- libcontainer/nsenter/cloned_binary.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c index c97dfcb70d3..104bab4fb4d 100644 --- a/libcontainer/nsenter/cloned_binary.c +++ b/libcontainer/nsenter/cloned_binary.c @@ -64,7 +64,6 @@ int memfd_create(const char *name, unsigned int flags) # define F_SEAL_WRITE 0x0008 /* prevent writes */ #endif -#define RUNC_SENDFILE_MAX 0x7FFFF000 /* sendfile(2) is limited to 2GB. 
*/ #ifdef HAVE_MEMFD_CREATE # define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe" # define RUNC_MEMFD_SEALS \ @@ -195,7 +194,8 @@ static int fetchve(char ***argv) static int clone_binary(void) { int binfd, memfd; - ssize_t sent = 0; + struct stat statbuf = {}; + size_t sent = 0; #ifdef HAVE_MEMFD_CREATE memfd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING); @@ -209,9 +209,17 @@ static int clone_binary(void) if (binfd < 0) goto error; - sent = sendfile(memfd, binfd, NULL, RUNC_SENDFILE_MAX); + if (fstat(binfd, &statbuf) < 0) + goto error_binfd; + + while (sent < statbuf.st_size) { + int n = sendfile(memfd, binfd, NULL, statbuf.st_size - sent); + if (n < 0) + goto error_binfd; + sent += n; + } close(binfd); - if (sent < 0) + if (sent != statbuf.st_size) goto error; #ifdef HAVE_MEMFD_CREATE @@ -235,6 +243,8 @@ static int clone_binary(void) #endif return memfd; +error_binfd: + close(binfd); error: close(memfd); return -EIO; From 2429d59352b81f6b9cc79b5ed26780c5fe6ba4ec Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Sat, 16 Feb 2019 04:34:27 +1100 Subject: [PATCH 2/5] nsenter: cloned_binary: expand and add pre-3.11 fallbacks In order to get around the memfd_create(2) requirement, 0a8e4117e7f7 ("nsenter: clone /proc/self/exe to avoid exposing host binary to container") added an O_TMPFILE fallback. However, this fallback was flawed in two ways: * It required O_TMPFILE which is relatively new (having been added to Linux 3.11). * The fallback choice was made at compile-time, not runtime. This results in several complications when it comes to running binaries on different machines to the ones they were built on. The easiest way to resolve these things is to have fallbacks work in a more procedural way (though it does make the code unfortunately more complicated) and to add a new fallback that uses mkostemp(3). 
Signed-off-by: Aleksa Sarai --- libcontainer/nsenter/cloned_binary.c | 193 ++++++++++++++++++++------- 1 file changed, 146 insertions(+), 47 deletions(-) diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c index 104bab4fb4d..24b79895147 100644 --- a/libcontainer/nsenter/cloned_binary.c +++ b/libcontainer/nsenter/cloned_binary.c @@ -36,18 +36,21 @@ #if !defined(SYS_memfd_create) && defined(__NR_memfd_create) # define SYS_memfd_create __NR_memfd_create #endif -#ifdef SYS_memfd_create -# define HAVE_MEMFD_CREATE /* memfd_create(2) flags -- copied from . */ -# ifndef MFD_CLOEXEC -# define MFD_CLOEXEC 0x0001U -# define MFD_ALLOW_SEALING 0x0002U -# endif +#ifndef MFD_CLOEXEC +# define MFD_CLOEXEC 0x0001U +# define MFD_ALLOW_SEALING 0x0002U +#endif int memfd_create(const char *name, unsigned int flags) { +#ifdef SYS_memfd_create return syscall(SYS_memfd_create, name, flags); -} +#else + errno = ENOSYS; + return -1; #endif +} + /* This comes directly from . */ #ifndef F_LINUX_SPECIFIC_BASE @@ -64,11 +67,9 @@ int memfd_create(const char *name, unsigned int flags) # define F_SEAL_WRITE 0x0008 /* prevent writes */ #endif -#ifdef HAVE_MEMFD_CREATE -# define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe" -# define RUNC_MEMFD_SEALS \ +#define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe" +#define RUNC_MEMFD_SEALS \ (F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE) -#endif static void *must_realloc(void *ptr, size_t size) { @@ -92,23 +93,29 @@ static int is_self_cloned(void) if (fd < 0) return -ENOTRECOVERABLE; -#ifdef HAVE_MEMFD_CREATE + /* First check memfd. 
*/ ret = fcntl(fd, F_GET_SEALS); - is_cloned = (ret == RUNC_MEMFD_SEALS); -#else - struct stat statbuf = {0}; - ret = fstat(fd, &statbuf); - if (ret >= 0) - is_cloned = (statbuf.st_nlink == 0); -#endif + if (ret >= 0) { + is_cloned = (ret == RUNC_MEMFD_SEALS); + } else { + /* + * Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6 + * which appears to have a borked backport of F_GET_SEALS. Either way, + * having a file which has no hardlinks indicates that we aren't using + * a host-side "runc" binary and this is something that a container + * cannot fake (because unlinking requires being able to resolve the + * path that you want to unlink). + */ + struct stat statbuf = {}; + if (fstat(fd, &statbuf) >= 0) + is_cloned = (statbuf.st_nlink == 0); + } + close(fd); return is_cloned; } -/* - * Basic wrapper around mmap(2) that gives you the file length so you can - * safely treat it as an ordinary buffer. Only gives you read access. - */ +/* Read a given file into a new buffer, and providing the length. */ static char *read_file(char *path, size_t *length) { int fd; @@ -191,18 +198,127 @@ static int fetchve(char ***argv) return -EINVAL; } +enum { + EFD_NONE = 0, + EFD_MEMFD, + EFD_FILE, +}; + +/* + * This comes from . We can't hard-code __O_TMPFILE because it + * changes depending on the architecture. If we don't have O_TMPFILE we always + * have the mkostemp(3) fallback. + */ +#ifndef O_TMPFILE +# if defined(__O_TMPFILE) && defined(O_DIRECTORY) +# define O_TMPFILE (__O_TMPFILE | O_DIRECTORY) +# endif +#endif + +static int make_execfd(int *fdtype) +{ + int fd; + char template[] = "/tmp/runc-cloned-binary.XXXXXX"; + + /* + * Try memfd first, it's much nicer since it's easily detected thanks to + * sealing and also doesn't require assumptions like /tmp. 
+ */ + *fdtype = EFD_MEMFD; + fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING); + if (fd >= 0) + return fd; + if (errno != ENOSYS) + goto err; + +#ifdef O_TMPFILE + /* + * Try O_TMPFILE to avoid races where someone might snatch our file. Note + * that O_EXCL isn't actually a security measure here (since you can just + * fd re-open it and clear O_EXCL). + */ + *fdtype = EFD_FILE; + fd = open("/tmp", O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700); + if (fd >= 0) { + struct stat statbuf = {}; + bool working_otmpfile = false; + + /* + * open(2) ignores unknown O_* flags -- yeah, I was surprised when I + * found this out too. As a result we can't check for EINVAL. However, + * if we get nlink != 0 (or EISDIR) then we know that this kernel + * doesn't support O_TMPFILE. + */ + if (fstat(fd, &statbuf) >= 0) + working_otmpfile = (statbuf.st_nlink == 0); + + if (working_otmpfile) + return fd; + + /* Pretend that we got EISDIR since O_TMPFILE failed. */ + close(fd); + errno = EISDIR; + } + if (errno != EISDIR) + goto err; +#endif /* defined(O_TMPFILE) */ + + /* + * Our final option is to create a temporary file the old-school way, and + * then unlink it so that nothing else sees it by accident. + */ + *fdtype = EFD_FILE; + fd = mkostemp(template, O_CLOEXEC); + if (fd >= 0) { + if (unlink(template) >= 0) + return fd; + close(fd); + } + +err: + *fdtype = EFD_NONE; + return -1; +} + +static int seal_execfd(int *fd, int fdtype) +{ + switch (fdtype) { + case EFD_MEMFD: + return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_SEALS); + case EFD_FILE: { + /* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. 
*/ + int newfd; + char fdpath[PATH_MAX] = {0}; + + if (fchmod(*fd, 0100) < 0) + return -1; + + if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0) + return -1; + + newfd = open(fdpath, O_PATH | O_CLOEXEC); + if (newfd < 0) + return -1; + + close(*fd); + *fd = newfd; + return 0; + } + default: + break; + } + return -1; +} + static int clone_binary(void) { int binfd, memfd; struct stat statbuf = {}; size_t sent = 0; + int fdtype = EFD_NONE; -#ifdef HAVE_MEMFD_CREATE - memfd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING); -#else - memfd = open("/tmp", O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0711); -#endif - if (memfd < 0) + memfd = make_execfd(&fdtype); + if (memfd < 0 || fdtype == EFD_NONE) return -ENOTRECOVERABLE; binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC); @@ -222,25 +338,8 @@ static int clone_binary(void) if (sent != statbuf.st_size) goto error; -#ifdef HAVE_MEMFD_CREATE - int err = fcntl(memfd, F_ADD_SEALS, RUNC_MEMFD_SEALS); - if (err < 0) - goto error; -#else - /* Need to re-open "memfd" as read-only to avoid execve(2) giving -EXTBUSY. */ - int newfd; - char *fdpath = NULL; - - if (asprintf(&fdpath, "/proc/self/fd/%d", memfd) < 0) - goto error; - newfd = open(fdpath, O_RDONLY | O_CLOEXEC); - free(fdpath); - if (newfd < 0) + if (seal_execfd(&memfd, fdtype) < 0) goto error; - - close(memfd); - memfd = newfd; -#endif return memfd; error_binfd: From af9da0a45082783f6005b252488943b5ee2e2138 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 19 Feb 2019 22:33:09 +1100 Subject: [PATCH 3/5] nsenter: cloned_binary: use the runc statedir for O_TMPFILE Writing a file to tmpfs actually incurs a memcg penalty, and thus the benefit of being able to disable memfd_create(2) with _LIBCONTAINER_DISABLE_MEMFD_CLONE is fairly minimal -- though it should be noted that quite a few distributions don't use tmpfs for /tmp (and instead have it as a regular directory or subvolume of the host filesystem). 
Since runc must have write access to the state directory anyway (and the state directory is usually not on a tmpfs) we can use that instead of /tmp -- avoiding potential memcg costs with no real downside. Signed-off-by: Aleksa Sarai --- libcontainer/container_linux.go | 1 + libcontainer/nsenter/cloned_binary.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 042bf1c4027..6f272a128fb 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -481,6 +481,7 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec. cmd.ExtraFiles = append(cmd.ExtraFiles, childPipe) cmd.Env = append(cmd.Env, fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1), + fmt.Sprintf("_LIBCONTAINER_STATEDIR=%s", c.root), ) // NOTE: when running a container with no PID namespace and the parent process spawning the container is // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c index 24b79895147..548ff9f2425 100644 --- a/libcontainer/nsenter/cloned_binary.c +++ b/libcontainer/nsenter/cloned_binary.c @@ -217,8 +217,14 @@ enum { static int make_execfd(int *fdtype) { - int fd; - char template[] = "/tmp/runc-cloned-binary.XXXXXX"; + int fd = -1; + char template[PATH_MAX] = {0}; + char *prefix = secure_getenv("_LIBCONTAINER_STATEDIR"); + + if (!prefix || *prefix != '/') + prefix = "/tmp"; + if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0) + return -1; /* * Try memfd first, it's much nicer since it's easily detected thanks to @@ -238,7 +244,7 @@ static int make_execfd(int *fdtype) * fd re-open it and clear O_EXCL). 
*/ *fdtype = EFD_FILE; - fd = open("/tmp", O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700); + fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700); if (fd >= 0) { struct stat statbuf = {}; bool working_otmpfile = false; From 16612d74de5f84977e50a9c8ead7f0e9e13b8628 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 26 Feb 2019 20:16:17 +1100 Subject: [PATCH 4/5] nsenter: cloned_binary: try to ro-bind /proc/self/exe before copying The usage of memfd_create(2) and other copying techniques is quite wasteful, despite attempts to minimise it with _LIBCONTAINER_STATEDIR. memfd_create(2) added ~10M of memory usage to the cgroup associated with the container, which can result in some setups getting OOM'd (or just hogging the hosts' memory when you have lots of created-but-not-started containers sticking around). The easiest way of solving this is by creating a read-only bind-mount of the binary, opening that read-only bindmount, and then umounting it to ensure that the host won't accidentally be re-mounted read-write. This avoids all copying and cleans up naturally like the other techniques used. Unfortunately, like the O_TMPFILE fallback, this requires being able to create a file inside _LIBCONTAINER_STATEDIR (since bind-mounting over the most obvious path -- /proc/self/exe -- is a *very bad idea*). Unfortunately detecting this isn't fool-proof -- on a system with a read-only root filesystem (that might become read-write during "runc init" execution), we cannot tell whether we have already done an ro remount. As a partial mitigation, we store a _LIBCONTAINER_CLONED_BINARY environment variable which is checked *alongside* the protection being present. 
Signed-off-by: Aleksa Sarai --- libcontainer/nsenter/cloned_binary.c | 157 ++++++++++++++++++++++----- 1 file changed, 131 insertions(+), 26 deletions(-) diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c index 548ff9f2425..1381324bcea 100644 --- a/libcontainer/nsenter/cloned_binary.c +++ b/libcontainer/nsenter/cloned_binary.c @@ -27,8 +27,10 @@ #include #include +#include #include #include +#include #include #include @@ -67,6 +69,7 @@ int memfd_create(const char *name, unsigned int flags) # define F_SEAL_WRITE 0x0008 /* prevent writes */ #endif +#define CLONED_BINARY_ENV "_LIBCONTAINER_CLONED_BINARY" #define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe" #define RUNC_MEMFD_SEALS \ (F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE) @@ -88,29 +91,56 @@ static void *must_realloc(void *ptr, size_t size) static int is_self_cloned(void) { int fd, ret, is_cloned = 0; + struct stat statbuf = {}; + struct statfs fsbuf = {}; fd = open("/proc/self/exe", O_RDONLY|O_CLOEXEC); if (fd < 0) return -ENOTRECOVERABLE; - /* First check memfd. */ + /* + * Is the binary a fully-sealed memfd? We don't need CLONED_BINARY_ENV for + * this, because you cannot write to a sealed memfd no matter what (so + * sharing it isn't a bad thing -- and an admin could bind-mount a sealed + * memfd to /usr/bin/runc to allow re-use). + */ ret = fcntl(fd, F_GET_SEALS); if (ret >= 0) { is_cloned = (ret == RUNC_MEMFD_SEALS); - } else { - /* - * Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6 - * which appears to have a borked backport of F_GET_SEALS. Either way, - * having a file which has no hardlinks indicates that we aren't using - * a host-side "runc" binary and this is something that a container - * cannot fake (because unlinking requires being able to resolve the - * path that you want to unlink). 
- */ - struct stat statbuf = {}; - if (fstat(fd, &statbuf) >= 0) - is_cloned = (statbuf.st_nlink == 0); + goto out; } + /* + * All other forms require CLONED_BINARY_ENV, since they are potentially + * writeable (or we can't tell if they're fully safe) and thus we must + * check the environment as an extra layer of defence. + */ + if (!getenv(CLONED_BINARY_ENV)) { + is_cloned = false; + goto out; + } + + /* + * Is the binary on a read-only filesystem? We can't detect bind-mounts in + * particular (in-kernel they are identical to regular mounts) but we can + * at least be sure that it's read-only. In addition, to make sure that + * it's *our* bind-mount we check CLONED_BINARY_ENV. + */ + if (fstatfs(fd, &fsbuf) >= 0) + is_cloned |= (fsbuf.f_flags & MS_RDONLY); + + /* + * Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6 + * which appears to have a borked backport of F_GET_SEALS. Either way, + * having a file which has no hardlinks indicates that we aren't using + * a host-side "runc" binary and this is something that a container + * cannot fake (because unlinking requires being able to resolve the + * path that you want to unlink). + */ + if (fstat(fd, &statbuf) >= 0) + is_cloned |= (statbuf.st_nlink == 0); + +out: close(fd); return is_cloned; } @@ -227,15 +257,16 @@ static int make_execfd(int *fdtype) return -1; /* - * Try memfd first, it's much nicer since it's easily detected thanks to - * sealing and also doesn't require assumptions like /tmp. + * Now try memfd, it's much nicer than actually creating a file in STATEDIR + * since it's easily detected thanks to sealing and also doesn't require + * assumptions about STATEDIR. 
*/ *fdtype = EFD_MEMFD; fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING); if (fd >= 0) return fd; - if (errno != ENOSYS) - goto err; + if (errno != ENOSYS && errno != EINVAL) + goto error; #ifdef O_TMPFILE /* @@ -266,7 +297,7 @@ static int make_execfd(int *fdtype) errno = EISDIR; } if (errno != EISDIR) - goto err; + goto error; #endif /* defined(O_TMPFILE) */ /* @@ -281,7 +312,7 @@ static int make_execfd(int *fdtype) close(fd); } -err: +error: *fdtype = EFD_NONE; return -1; } @@ -316,15 +347,83 @@ static int seal_execfd(int *fd, int fdtype) return -1; } +static int try_bindfd(void) +{ + int fd, ret = -1; + char template[PATH_MAX] = {0}; + char *prefix = secure_getenv("_LIBCONTAINER_STATEDIR"); + + if (!prefix || *prefix != '/') + prefix = "/tmp"; + if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0) + return ret; + + /* + * We need somewhere to mount it, mounting anything over /proc/self is a + * BAD idea on the host -- even if we do it temporarily. + */ + fd = mkstemp(template); + if (fd < 0) + return ret; + close(fd); + + /* + * For obvious reasons this won't work in rootless mode because we haven't + * created a userns+mntns -- but getting that to work will be a bit + * complicated and it's only worth doing if someone actually needs it. + */ + ret = -EPERM; + if (mount("/proc/self/exe", template, "", MS_BIND, "") < 0) + goto out; + if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0) + goto out_umount; + + + /* Get read-only handle that we're sure can't be made read-write. */ + ret = open(template, O_PATH | O_CLOEXEC); + +out_umount: + /* + * Make sure the MNT_DETACH works, otherwise we could get remounted + * read-write and that would be quite bad (the fd would be made read-write + * too, invalidating the protection). 
+ */ + if (umount2(template, MNT_DETACH) < 0) { + if (ret >= 0) + close(ret); + ret = -ENOTRECOVERABLE; + } + +out: + /* + * We don't care about unlink errors, the worst that happens is that + * there's an empty file left around in STATEDIR. + */ + unlink(template); + return ret; +} + static int clone_binary(void) { - int binfd, memfd; + int binfd, execfd; struct stat statbuf = {}; size_t sent = 0; int fdtype = EFD_NONE; - memfd = make_execfd(&fdtype); - if (memfd < 0 || fdtype == EFD_NONE) + /* + * Before we resort to copying, let's try creating an ro-binfd in one shot + * by getting a handle for a read-only bind-mount of the execfd. + */ + execfd = try_bindfd(); + if (execfd >= 0) + return execfd; + + /* + * Dammit, that didn't work -- time to copy the binary to a safe place we + * can seal the contents. + */ + execfd = make_execfd(&fdtype); + if (execfd < 0 || fdtype == EFD_NONE) return -ENOTRECOVERABLE; binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC); @@ -335,7 +434,7 @@ static int clone_binary(void) goto error_binfd; while (sent < statbuf.st_size) { - int n = sendfile(memfd, binfd, NULL, statbuf.st_size - sent); + int n = sendfile(execfd, binfd, NULL, statbuf.st_size - sent); if (n < 0) goto error_binfd; sent += n; @@ -344,14 +443,15 @@ static int clone_binary(void) if (sent != statbuf.st_size) goto error; - if (seal_execfd(&memfd, fdtype) < 0) + if (seal_execfd(&execfd, fdtype) < 0) goto error; - return memfd; + + return execfd; error_binfd: close(binfd); error: - close(memfd); + close(execfd); return -EIO; } @@ -375,6 +475,11 @@ int ensure_cloned_binary(void) if (execfd < 0) return -EIO; + if (putenv(CLONED_BINARY_ENV "=1")) + goto error; + fexecve(execfd, argv, environ); +error: + close(execfd); return -ENOEXEC; } From 2d4a37b427167907ef2402586a8e8e2931a22490 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Fri, 1 Mar 2019 18:23:54 +1100 Subject: [PATCH 5/5] nsenter: cloned_binary: userspace copy fallback if sendfile fails There are some circumstances 
where sendfile(2) can fail (one example is that AppArmor appears to block writing to deleted files with sendfile(2) under some circumstances) and so we need to have a userspace fallback. It's fairly trivial (and handles short-writes). Signed-off-by: Aleksa Sarai --- libcontainer/nsenter/cloned_binary.c | 37 +++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c index 1381324bcea..b410e295170 100644 --- a/libcontainer/nsenter/cloned_binary.c +++ b/libcontainer/nsenter/cloned_binary.c @@ -160,7 +160,7 @@ static char *read_file(char *path, size_t *length) *length = 0; for (;;) { - int n; + ssize_t n; n = read(fd, buf, sizeof(buf)); if (n < 0) @@ -403,6 +403,33 @@ static int try_bindfd(void) return ret; } +static ssize_t fd_to_fd(int outfd, int infd) +{ + ssize_t total = 0; + char buffer[4096]; + + for (;;) { + ssize_t nread, nwritten = 0; + + nread = read(infd, buffer, sizeof(buffer)); + if (nread < 0) + return -1; + if (!nread) + break; + + do { + ssize_t n = write(outfd, buffer + nwritten, nread - nwritten); + if (n < 0) + return -1; + nwritten += n; + } while(nwritten < nread); + + total += nwritten; + } + + return total; +} + static int clone_binary(void) { int binfd, execfd; @@ -435,8 +462,12 @@ static int clone_binary(void) while (sent < statbuf.st_size) { int n = sendfile(execfd, binfd, NULL, statbuf.st_size - sent); - if (n < 0) - goto error_binfd; + if (n < 0) { + /* sendfile can fail so we fallback to a dumb user-space copy. */ + n = fd_to_fd(execfd, binfd); + if (n < 0) + goto error_binfd; + } sent += n; } close(binfd);