diff --git a/src/condor_daemon_core.V6/condor_daemon_core.h b/src/condor_daemon_core.V6/condor_daemon_core.h index 3562577..d9d1736 100644 --- a/src/condor_daemon_core.V6/condor_daemon_core.h +++ b/src/condor_daemon_core.V6/condor_daemon_core.h @@ -192,6 +192,7 @@ struct FamilyInfo { gid_t* group_ptr; #endif const char* glexec_proxy; + bool want_pid_namespace; const char* cgroup; FamilyInfo() { @@ -201,6 +202,7 @@ struct FamilyInfo { group_ptr = NULL; #endif glexec_proxy = NULL; + want_pid_namespace = false; cgroup = NULL; } }; diff --git a/src/condor_daemon_core.V6/daemon_core.cpp b/src/condor_daemon_core.V6/daemon_core.cpp index e058fd3..74fe8a0 100644 --- a/src/condor_daemon_core.V6/daemon_core.cpp +++ b/src/condor_daemon_core.V6/daemon_core.cpp @@ -34,6 +34,7 @@ #if HAVE_CLONE #include #include +#include #endif #if HAVE_RESOLV_H && HAVE_DECL_RES_INIT @@ -112,6 +113,10 @@ CRITICAL_SECTION Big_fat_mutex; // coarse grained mutex for debugging purposes #include #endif +#if !defined(CLONE_NEWPID) +#define CLONE_NEWPID 0x20000000 +#endif + static const char* EMPTY_DESCRIP = ""; // special errno values that may be returned from Create_Process @@ -6566,7 +6571,9 @@ public: m_affinity_mask(affinity_mask), m_fs_remap(fs_remap), m_wrote_tracking_gid(false), - m_no_dprintf_allowed(false) + m_no_dprintf_allowed(false), + m_clone_newpid_pid(-1), + m_clone_newpid_ppid(-1) { } @@ -6627,6 +6634,10 @@ private: bool m_wrote_tracking_gid; bool m_no_dprintf_allowed; priv_state m_priv_state; + pid_t m_clone_newpid_pid; + pid_t m_clone_newpid_ppid; + + pid_t fork(int); }; enum { @@ -6650,7 +6661,19 @@ pid_t CreateProcessForkit::clone_safe_getpid() { // the pid of the parent process (presumably due to internal // caching in libc). Therefore, use the syscall to get // the answer directly. - return syscall(SYS_getpid); + + int retval = syscall(SYS_getpid); + + // If we were fork'd with CLONE_NEWPID, we think our PID is 1. + // In this case, ask the parent! + if (retval == 1) { + if (m_clone_newpid_pid == -1) { + EXCEPT("getpid is 1!"); + } + retval = m_clone_newpid_pid; + } + + return retval; #else return ::getpid(); #endif @@ -6659,12 +6682,115 @@ pid_t CreateProcessForkit::clone_safe_getppid() { #if HAVE_CLONE // See above comment for clone_safe_getpid() for explanation of // why we need to do this. - return syscall(SYS_getppid); + + int retval = syscall(SYS_getppid); + + // If ppid is 0, then either Condor is init (DEAR GOD) or we + // were created with CLONE_NEWPID; ask the parent! + if (retval == 0) { + if (m_clone_newpid_ppid == -1) { + EXCEPT("getppid is 0!"); + } + retval = m_clone_newpid_ppid; + } + + return retval; #else return ::getppid(); #endif } +/** + * fork allows one to use certain clone syscall flags, but provides more + * familiar POSIX fork semantics. + * NOTES: + * - We whitelist the flags you are allowed to pass. Currently supported: + * - CLONE_NEWPID. Implies CLONE_NEWNS. + * If the clone succeeds but the remount fails, the child calls _exit(1), + * but the parent will return successfully. + * It would be a simple fix to have the parent return the failure, if + * someone desired. + * Flags are whitelisted to help us adhere to the fork-like semantics (no + * shared memory between parent and child, for example). If you give other + * flags, they are silently ignored. + * - man pages indicate that clone on i386 is only fully functional when used + * via ASM, not the vsyscall interface. This doesn't appear to be relevant + * to this particular use case. + * - To avoid linking with pthreads (or copy/pasting lots of glibc code), I + * don't include integration with threads. This means various threading + * calls in the child may not function correctly (pre-exec; post-exec + * should be fine), and pthreads might not notice when the child exits. + * Traditional POSIX calls like wait will still function because the + * parent will receive the SIGCHLD. + * This is simple to fix if someone desired, but I'd mostly rather not link + * with pthreads. + */ + +#define ALLOWED_FLAGS (SIGCHLD | CLONE_NEWPID | CLONE_NEWNS ) + +pid_t CreateProcessForkit::fork(int flags) { + + // If you don't need any fancy flags, just do the old boring POSIX call + if (flags == 0) { + return ::fork(); + } + +#if HAVE_CLONE + + int rw[2]; // Communication pipes for the CLONE_NEWPID case. + + flags |= SIGCHLD; // The only necessary flag. + if (flags & CLONE_NEWPID) { + flags |= CLONE_NEWNS; + if (pipe(rw)) { + EXCEPT("UNABLE TO CREATE PIPE."); + } + } + + // fork as root if we have our fancy flags. + priv_state orig_state = set_priv(PRIV_ROOT); + int retval = syscall(SYS_clone, ALLOWED_FLAGS & flags, 0, NULL, NULL); + + // Child + if ((retval == 0) && (flags & CLONE_NEWPID)) { + + // If we should have forked as non-root, make things in life final. + set_priv(orig_state); + + if (full_read(rw[0], &m_clone_newpid_ppid, sizeof(pid_t)) != sizeof(pid_t)) { + EXCEPT("Unable to write into pipe."); + } + if (full_read(rw[0], &m_clone_newpid_pid, sizeof(pid_t)) != sizeof(pid_t)) { + EXCEPT("Unable to write into pipe."); + } + + // Parent + } else if (retval > 0) { + set_priv(orig_state); + pid_t ppid = getpid(); // We are parent, so don't need clone_safe_pid. + if (full_write(rw[1], &ppid, sizeof(ppid)) != sizeof(ppid)) { + EXCEPT("Unable to write into pipe."); + } + if (full_write(rw[1], &retval, sizeof(ppid)) != sizeof(ppid)) { + EXCEPT("Unable to write into pipe."); + } + } + // retval=-1 falls through here. + if (flags & CLONE_NEWPID) { + close(rw[0]); + close(rw[1]); + } + return retval; + +#else + + // Note we silently ignore flags if there's no clone on the platform. + return ::fork(); + +#endif + +} + pid_t CreateProcessForkit::fork_exec() { pid_t newpid; @@ -6736,7 +6862,11 @@ pid_t CreateProcessForkit::fork_exec() { } #endif /* HAVE_CLONE */ - newpid = fork(); + int fork_flags = 0; + if (m_family_info) { + fork_flags |= m_family_info->want_pid_namespace ? CLONE_NEWPID : 0; + } + newpid = this->fork(fork_flags); if( newpid == 0 ) { // in child enterCreateProcessChild(this); diff --git a/src/condor_starter.V6.1/vanilla_proc.cpp b/src/condor_starter.V6.1/vanilla_proc.cpp index 044cb10..8528ca7 100644 --- a/src/condor_starter.V6.1/vanilla_proc.cpp +++ b/src/condor_starter.V6.1/vanilla_proc.cpp @@ -360,6 +360,24 @@ VanillaProc::StartJob() } } +#if defined(LINUX) + // On Linux kernel 2.6.24 and later, we can give each + // job its own PID namespace + if (param_boolean("USE_PID_NAMESPACES", false)) { + if (!can_switch_ids()) { + EXCEPT("USE_PID_NAMESPACES enabled, but can't perform this " + "call in Linux unless running as root."); + } + fi.want_pid_namespace = true; + if (!fs_remap) { + fs_remap = new FilesystemRemap(); + } + fs_remap->RemapProc(); + } + dprintf(D_FULLDEBUG, "PID namespace option: %s\n", fi.want_pid_namespace ? "true" : "false"); +#endif + + // have OsProc start the job // int retval = OsProc::StartJob(&fi, fs_remap); diff --git a/src/condor_utils/filesystem_remap.cpp b/src/condor_utils/filesystem_remap.cpp index e0f2e61..735c744 100644 --- a/src/condor_utils/filesystem_remap.cpp +++ b/src/condor_utils/filesystem_remap.cpp @@ -29,7 +29,8 @@ FilesystemRemap::FilesystemRemap() : m_mappings(), - m_mounts_shared() + m_mounts_shared(), + m_remap_proc(false) { ParseMountinfo(); } @@ -120,6 +121,9 @@ int FilesystemRemap::PerformMappings() { break; } } + if ((!retval) && m_remap_proc) { + retval = mount("proc", "/proc", "proc", 0, NULL); + } #endif return retval; } @@ -148,6 +152,10 @@ std::string FilesystemRemap::RemapDir(std::string target) { return target; } +void FilesystemRemap::RemapProc() { + m_remap_proc = true; +} + /* Sample mountinfo contents (from http://www.kernel.org/doc/Documentation/filesystems/proc.txt): 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue diff --git a/src/condor_utils/filesystem_remap.h b/src/condor_utils/filesystem_remap.h index 5e9362d..2e17476 100644 --- a/src/condor_utils/filesystem_remap.h +++ b/src/condor_utils/filesystem_remap.h @@ -74,6 +74,12 @@ public: */ std::string RemapFile(std::string); + /** + * Indicate that we should remount /proc in the child process. + * Necessary for PID namespaces. + */ + void RemapProc(); + private: /** @@ -89,6 +95,7 @@ private: std::list m_mappings; std::list m_mounts_shared; std::list m_mounts_autofs; + bool m_remap_proc; }; #endif