b3fe293
diff --git a/src/condor_procd/proc_family.cpp b/src/condor_procd/proc_family.cpp
b3fe293
index d35ffcc..29d9471 100644
b3fe293
--- a/src/condor_procd/proc_family.cpp
b3fe293
+++ b/src/condor_procd/proc_family.cpp
b3fe293
@@ -54,7 +54,9 @@ ProcFamily::ProcFamily(ProcFamilyMonitor* monitor,
b3fe293
 	m_member_list(NULL)
b3fe293
 #if defined(HAVE_EXT_LIBCGROUP)
b3fe293
 	, m_cgroup_string(""),
b3fe293
-	m_cm(CgroupManager::getInstance())
b3fe293
+	m_cm(CgroupManager::getInstance()),
b3fe293
+	m_initial_user_cpu(0),
b3fe293
+	m_initial_sys_cpu(0)
b3fe293
 #endif
b3fe293
 {
b3fe293
 #if !defined(WIN32)
b3fe293
@@ -188,6 +190,7 @@ after_migrate:
b3fe293
 		cgroup_free(&orig_cgroup);
b3fe293
 	}
b3fe293
 
b3fe293
+
b3fe293
 after_restore:
b3fe293
 	if (orig_cgroup_string != NULL) {
b3fe293
 		free(orig_cgroup_string);
b3fe293
@@ -231,6 +234,27 @@ ProcFamily::set_cgroup(const std::string &cgroup_string)
b3fe293
 		member = member->m_next;
b3fe293
 	}
b3fe293
 
b3fe293
+	// Record the amount of pre-existing CPU usage here.
b3fe293
+	m_initial_user_cpu = 0;
b3fe293
+	m_initial_sys_cpu = 0;
b3fe293
+	get_cpu_usage_cgroup(m_initial_user_cpu, m_initial_sys_cpu);
b3fe293
+
b3fe293
+	// Reset block IO controller
b3fe293
+	if (m_cm.isMounted(CgroupManager::BLOCK_CONTROLLER)) {
b3fe293
+		struct cgroup *tmp_cgroup = cgroup_new_cgroup(m_cgroup_string.c_str());
b3fe293
+		struct cgroup_controller *blkio_controller = cgroup_add_controller(tmp_cgroup, BLOCK_CONTROLLER_STR);
b3fe293
+		ASSERT (blkio_controller != NULL); // Block IO controller should already exist.
b3fe293
+		cgroup_add_value_uint64(blkio_controller, "blkio.reset_stats", 0);
b3fe293
+		int err;
b3fe293
+		if ((err = cgroup_modify_cgroup(tmp_cgroup))) {
b3fe293
+			// Not allowed to reset stats?
b3fe293
+			dprintf(D_ALWAYS,
b3fe293
+				"Unable to reset cgroup %s block IO statistics. "
b3fe293
+				"Some block IO accounting will be inaccurate (ProcFamily %u): %u %s\n",
b3fe293
+				m_cgroup_string.c_str(), m_root_pid, err, cgroup_strerror(err));
b3fe293
+		}
b3fe293
+	}
b3fe293
+
b3fe293
 	return 0;
b3fe293
 }
b3fe293
 
b3fe293
@@ -486,6 +510,40 @@ ProcFamily::aggregate_usage_cgroup_blockio(ProcFamilyUsage* usage)
b3fe293
 	return 0;
b3fe293
 }
b3fe293
 
b3fe293
+int ProcFamily::get_cpu_usage_cgroup(long &user_time, long &sys_time) {
b3fe293
+
b3fe293
+	if (!m_cm.isMounted(CgroupManager::CPUACCT_CONTROLLER)) {
b3fe293
+		return 1;
b3fe293
+	}
b3fe293
+
b3fe293
+	void * handle = NULL;
b3fe293
+	u_int64_t tmp = 0;
b3fe293
+	struct cgroup_stat stats;
b3fe293
+	int err = cgroup_read_stats_begin(CPUACCT_CONTROLLER_STR, m_cgroup_string.c_str(), &handle, &stats);
b3fe293
+	while (err != ECGEOF) {
b3fe293
+		if (err > 0) {
b3fe293
+			dprintf(D_PROCFAMILY,
b3fe293
+				"Unable to read cgroup %s cpuacct stats (ProcFamily %u): %s.\n",
b3fe293
+				m_cgroup_string.c_str(), m_root_pid, cgroup_strerror(err));
b3fe293
+			break;
b3fe293
+		}
b3fe293
+		if (_check_stat_uint64(stats, "user", &tmp)) {
b3fe293
+			user_time = tmp/clock_tick-m_initial_user_cpu;
b3fe293
+		} else if (_check_stat_uint64(stats, "system", &tmp)) {
b3fe293
+			sys_time = tmp/clock_tick-m_initial_sys_cpu;
b3fe293
+		}
b3fe293
+			err = cgroup_read_stats_next(&handle, &stats);
b3fe293
+	}
b3fe293
+	if (handle != NULL) {
b3fe293
+		cgroup_read_stats_end(&handle);
b3fe293
+	}
b3fe293
+	if (err != ECGEOF) {
b3fe293
+		dprintf(D_ALWAYS, "Internal cgroup error when retrieving CPU statistics: %s\n", cgroup_strerror(err));
b3fe293
+		return 1;
b3fe293
+	}
b3fe293
+	return 0;
b3fe293
+}
b3fe293
+
b3fe293
 int
b3fe293
 ProcFamily::aggregate_usage_cgroup(ProcFamilyUsage* usage)
b3fe293
 {
b3fe293
@@ -496,16 +554,13 @@ ProcFamily::aggregate_usage_cgroup(ProcFamilyUsage* usage)
b3fe293
 
b3fe293
 	int err;
b3fe293
 	struct cgroup_stat stats;
b3fe293
-	void **handle;
b3fe293
+	void *handle = NULL;
b3fe293
 	u_int64_t tmp = 0, image = 0;
b3fe293
 	bool found_rss = false;
b3fe293
 
b3fe293
 	// Update memory
b3fe293
-	handle = (void **)malloc(sizeof(void*));
b3fe293
-	ASSERT (handle != NULL);
b3fe293
-	*handle = NULL;
b3fe293
 
b3fe293
-	err = cgroup_read_stats_begin(MEMORY_CONTROLLER_STR, m_cgroup_string.c_str(), handle, &stats);
b3fe293
+	err = cgroup_read_stats_begin(MEMORY_CONTROLLER_STR, m_cgroup_string.c_str(), &handle, &stats);
b3fe293
 	while (err != ECGEOF) {
b3fe293
 		if (err > 0) {
b3fe293
 			dprintf(D_PROCFAMILY,
b3fe293
@@ -522,10 +577,10 @@ ProcFamily::aggregate_usage_cgroup(ProcFamilyUsage* usage)
b3fe293
 		} else if (_check_stat_uint64(stats, "total_swap", &tmp)) {
b3fe293
 			image += tmp;
b3fe293
 		}
b3fe293
-		err = cgroup_read_stats_next(handle, &stats);
b3fe293
+		err = cgroup_read_stats_next(&handle, &stats);
b3fe293
 	}
b3fe293
-	if (*handle != NULL) {
b3fe293
-		cgroup_read_stats_end(handle);
b3fe293
+	if (handle != NULL) {
b3fe293
+		cgroup_read_stats_end(&handle);
b3fe293
 	}
b3fe293
 	if (found_rss) {
b3fe293
 		usage->total_image_size = image/1024;
b3fe293
@@ -540,29 +595,12 @@ ProcFamily::aggregate_usage_cgroup(ProcFamilyUsage* usage)
b3fe293
 		m_max_image_size = image/1024;
b3fe293
 	}
b3fe293
 	// Try updating the max size using cgroups
b3fe293
-	update_max_image_size_cgroup();
b3fe293
+	// XXX: This is taken out for now - kernel calculates max INCLUDING
b3fe293
+	// the filesystem cache.  Not what you want.
b3fe293
+	//update_max_image_size_cgroup();
b3fe293
 
b3fe293
 	// Update CPU
b3fe293
-	*handle = NULL;
b3fe293
-	err = cgroup_read_stats_begin(CPUACCT_CONTROLLER_STR, m_cgroup_string.c_str(), handle, &stats);
b3fe293
-	while (err != ECGEOF) {
b3fe293
-		if (err > 0) {
b3fe293
-			dprintf(D_PROCFAMILY,
b3fe293
-				"Unable to read cgroup %s cpuacct stats (ProcFamily %u): %s.\n",
b3fe293
-				m_cgroup_string.c_str(), m_root_pid, cgroup_strerror(err));
b3fe293
-			break;
b3fe293
-		}
b3fe293
-		if (_check_stat_uint64(stats, "user", &tmp)) {
b3fe293
-			usage->user_cpu_time = tmp/clock_tick;
b3fe293
-		} else if (_check_stat_uint64(stats, "system", &tmp)) {
b3fe293
-			usage->sys_cpu_time = tmp/clock_tick;
b3fe293
-		}
b3fe293
-		err = cgroup_read_stats_next(handle, &stats);
b3fe293
-	}
b3fe293
-	if (*handle != NULL) {
b3fe293
-		cgroup_read_stats_end(handle);
b3fe293
-	}
b3fe293
-	free(handle);
b3fe293
+	get_cpu_usage_cgroup(usage->user_cpu_time, usage->sys_cpu_time);
b3fe293
 
b3fe293
 	aggregate_usage_cgroup_blockio(usage);
b3fe293
b3fe293
--- a/src/condor_procd/proc_family.h
b3fe293
+++ b/src/condor_procd/proc_family.h
b3fe293
@@ -181,6 +181,11 @@ private:
b3fe293
 	std::string m_cgroup_string;
b3fe293
 	CgroupManager &m_cm;
b3fe293
 	static long clock_tick;
b3fe293
+	// Sometimes Condor doesn't successfully clear out the cgroup from the
b3fe293
+	// previous run.  Hence, we subtract off any CPU usage found at the
b3fe293
+	// start of the job.
b3fe293
+	long m_initial_user_cpu;
b3fe293
+	long m_initial_sys_cpu;
b3fe293
 	static bool have_warned_about_memsw;
b3fe293
 
b3fe293
 	int count_tasks_cgroup();
b3fe293
@@ -190,6 +195,7 @@ private:
b3fe293
 	int spree_cgroup(int);
b3fe293
 	int migrate_to_cgroup(pid_t);
b3fe293
 	void update_max_image_size_cgroup();
b3fe293
+	int get_cpu_usage_cgroup(long &user_cpu, long &sys_cpu);
b3fe293
 #endif
b3fe293
 };
b3fe293