/*
 * Copyright IBM Corporation. 2007
 *
 * Author:	Dhaval Giani <dhaval@linux.vnet.ibm.com>
 * Author:	Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2.1 of the GNU Lesser General Public License
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * TODOs:
 *	1. Add more APIs for the control groups.
 *	2. Handle the configuration related APIs.
 *
 * Code initiated and designed by Dhaval Giani. All faults are most likely
 * his mistake.
 *
 * Bharata B Rao <bharata@linux.vnet.ibm.com> is willing to take blame
 * for mistakes in APIs for reading statistics.
 */

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <dirent.h>
#include <errno.h>
#include <libcgroup.h>
#include <libcgroup-internal.h>
#include <mntent.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <fcntl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <fts.h>
#include <ctype.h>
#include <pwd.h>
#include <libgen.h>
#include <assert.h>
#include <linux/un.h>
#include <grp.h>

/*
 * The errno which happend the last time (have to be thread specific)
 */
__thread int last_errno;

#define MAXLEN 256

/* the value have to be thread specific */
61
static __thread char errtext[MAXLEN];
62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77

/* Task command name length */
#define TASK_COMM_LEN 16

/* Check if cgroup_init has been called or not. */
static int cgroup_initialized;

/* List of configuration rules */
static struct cgroup_rule_list rl;

/* Temporary list of configuration rules (for non-cache apps) */
static struct cgroup_rule_list trl;

/* Lock for the list of rules (rl) */
static pthread_rwlock_t rl_lock = PTHREAD_RWLOCK_INITIALIZER;

78 79 80
/* Namespace */
__thread char *cg_namespace_table[CG_CONTROLLER_MAX];

81 82 83 84
pthread_rwlock_t cg_mount_table_lock = PTHREAD_RWLOCK_INITIALIZER;
struct cg_mount_table_s cg_mount_table[CG_CONTROLLER_MAX];

/*
 * Human-readable messages for the library's error codes.
 * NOTE(review): presumably indexed by (error code - first error code);
 * the order must stay in sync with the error enum -- verify against
 * cgroup_strerror().
 *
 * The element type is spelled "const char *": the previous
 * "const char const *" merely repeated the same qualifier and denotes the
 * identical type.
 */
const char *cgroup_strerror_codes[] = {
	"Cgroup is not compiled in",
	"Cgroup is not mounted",
	"Cgroup does not exist",
	"Cgroup has not been created",
	"Cgroup one of the needed subsystems is not mounted",
	"Cgroup, request came in from non owner",
	"Cgroup controllers are bound to different mount points",
	"Cgroup, operation not allowed",
	"Cgroup value set exceeds maximum",
	"Cgroup controller already exists",
	"Cgroup value already exists",
	"Cgroup invalid operation",
	"Cgroup, creation of controller failed",
	"Cgroup operation failed",
	"Cgroup not initialized",
	"Cgroup, requested group parameter does not exist",
	"Cgroup generic error",
	"Cgroup values are not equal",
	"Cgroup controllers are different",
	"Cgroup parsing failed",
	"Cgroup, rules file does not exist",
	"Cgroup mounting failed",
	"End of File or iterator",
	"Failed to parse config file",
	"Have multiple paths for the same namespace",
	"Controller in namespace does not exist",
	"Either mount or namespace keyword has to be specified in the configuration file",
	"This kernel does not support this feature",
	"Value setting does not succeed",
	"Failed to remove a non-empty group",
};

/*
 * NULL-terminated list of file names, in the form expected by the
 * ignore_list argument of cg_chmod_recursive_controller().
 * ("const char const *" repeated the same qualifier; "const char *" is the
 * identical type.)
 */
static const char *cgroup_ignored_tasks_files[] = { "tasks", NULL };

/*
 * chown() wrapper: an owner or group equal to NO_UID_GID is replaced by the
 * calling process's real UID / GID before chown() is called.
 *
 *	@param filename Path whose ownership is changed
 *	@param owner New owner, or NO_UID_GID
 *	@param group New group, or NO_UID_GID
 *	@return The return value of chown()
 */
static int cg_chown(const char *filename, uid_t owner, gid_t group)
{
	if (owner == NO_UID_GID)
		owner = getuid();
	if (group == NO_UID_GID)
		group = getgid();

	return chown(filename, owner, group);
}
127 128 129 130
static int cg_chown_file(FTS *fts, FTSENT *ent, uid_t owner, gid_t group)
{
	int ret = 0;
	const char *filename = fts->fts_path;
Jon Bernard's avatar
Jon Bernard committed
131
	cgroup_dbg("chown: seeing file %s\n", filename);
132 133 134 135 136 137 138 139 140 141 142 143
	switch (ent->fts_info) {
	case FTS_ERR:
		errno = ent->fts_errno;
		break;
	case FTS_D:
	case FTS_DC:
	case FTS_NSOK:
	case FTS_NS:
	case FTS_DNR:
	case FTS_DP:
	case FTS_F:
	case FTS_DEFAULT:
Jon Bernard's avatar
Jon Bernard committed
144
		ret = cg_chown(filename, owner, group);
145 146 147
		break;
	}
	if (ret < 0) {
148 149
		cgroup_warn("Warning: cannot change owner of file %s: %s\n",
				filename, strerror(errno));
150 151 152 153 154 155 156 157 158 159 160 161
		last_errno = errno;
		ret = ECGOTHER;
	}
	return ret;
}

/*
 * TODO: Need to decide a better place to put this function.
 */
static int cg_chown_recursive(char **path, uid_t owner, gid_t group)
{
	int ret = 0;
162 163
	FTS *fts;

Jon Bernard's avatar
Jon Bernard committed
164
	cgroup_dbg("chown: path is %s\n", *path);
165
	fts = fts_open(path, FTS_PHYSICAL | FTS_NOCHDIR |
166
				FTS_NOSTAT, NULL);
Jon Bernard's avatar
Jon Bernard committed
167
	if (fts == NULL) {
168 169
		cgroup_warn("Warning: cannot open directory %s: %s\n",
				path, strerror(errno));
Jon Bernard's avatar
Jon Bernard committed
170 171 172
		last_errno = errno;
		return ECGOTHER;
	}
173 174 175 176
	while (1) {
		FTSENT *ent;
		ent = fts_read(fts);
		if (!ent) {
177
			cgroup_warn("Warning: fts_read failed\n");
178 179 180 181 182 183 184 185
			break;
		}
		ret = cg_chown_file(fts, ent, owner, group);
	}
	fts_close(fts);
	return ret;
}

Jon Bernard's avatar
Jon Bernard committed
186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214
int cg_chmod_path(const char *path, mode_t mode, int owner_is_umask)
{
	struct stat buf;
	mode_t mask = -1U;

	if (owner_is_umask) {
		mode_t umask, gmask, omask;

		/*
		 * Use owner permissions as an umask for group and others
		 * permissions because we trust kernel to initialize owner
		 * permissions to something useful.
		 * Keep SUID and SGID bits.
		 */
		if (stat(path, &buf) == -1)
			goto fail;
		umask = S_IRWXU & buf.st_mode;
		gmask = umask >> 3;
		omask = gmask >> 3;

		mask = umask|gmask|omask|S_ISUID|S_ISGID|S_ISVTX;
	}

	if (chmod(path, mode & mask))
		goto fail;

	return 0;

fail:
215 216
	cgroup_warn("Warning: cannot change permissions of file %s: %s\n", path,
			strerror(errno));
Jon Bernard's avatar
Jon Bernard committed
217 218 219 220
	last_errno = errno;
	return ECGOTHER;
}

/*
 * Change the permissions of the entry currently visited by an FTS
 * traversal. Directories get @dir_mode and everything else gets
 * @file_mode, each only when the matching *_change flag is non-zero.
 *
 * For FTS_ERR entries only errno is set from the entry; 0 is returned.
 *
 *	@param fts Traversal handle; fts->fts_path names the current entry
 *	@param ent Entry returned by fts_read()
 *	@param dir_mode Mode applied to directories
 *	@param dirm_change Non-zero to change directory permissions
 *	@param file_mode Mode applied to non-directories
 *	@param filem_change Non-zero to change file permissions
 *	@param owner_is_umask Passed through to cg_chmod_path()
 *	@return 0, or the error returned by cg_chmod_path()
 */
int cg_chmod_file(FTS *fts, FTSENT *ent, mode_t dir_mode,
	int dirm_change, mode_t file_mode, int filem_change,
	int owner_is_umask)
{
	int ret = 0;
	const char *filename = fts->fts_path;

	cgroup_dbg("chmod: seeing file %s\n", filename);

	switch (ent->fts_info) {
	case FTS_ERR:
		errno = ent->fts_errno;
		break;
	case FTS_D:
	case FTS_DC:
	case FTS_DNR:
	case FTS_DP:
		if (dirm_change)
			ret = cg_chmod_path(filename, dir_mode, owner_is_umask);
		break;
	case FTS_F:
	case FTS_NSOK:
	case FTS_NS:
	case FTS_DEFAULT:
		if (filem_change)
			ret = cg_chmod_path(filename, file_mode,
					owner_is_umask);
		break;
	}
	return ret;
}


Jon Bernard's avatar
Jon Bernard committed
254 255 256 257
/**
 * Changes permissions of all directories and control files (i.e. all
 * files except files named in ignore_list. The list must be terminated with
 * NULL.
Jon Bernard's avatar
Jon Bernard committed
258
 */
Jon Bernard's avatar
Jon Bernard committed
259 260 261
static int cg_chmod_recursive_controller(char *path, mode_t dir_mode,
		int dirm_change, mode_t file_mode, int filem_change,
		int owner_is_umask, const char const **ignore_list)
Jon Bernard's avatar
Jon Bernard committed
262 263 264 265 266
{
	int ret = 0;
	int final_ret =0;
	FTS *fts;
	char *fts_path[2];
Jon Bernard's avatar
Jon Bernard committed
267
	int i, ignored;
Jon Bernard's avatar
Jon Bernard committed
268

Jon Bernard's avatar
Jon Bernard committed
269
	fts_path[0] = path;
Jon Bernard's avatar
Jon Bernard committed
270 271 272 273 274 275
	fts_path[1] = NULL;
	cgroup_dbg("chmod: path is %s\n", path);

	fts = fts_open(fts_path, FTS_PHYSICAL | FTS_NOCHDIR |
			FTS_NOSTAT, NULL);
	if (fts == NULL) {
276 277
		cgroup_warn("Warning: cannot open directory %s: %s\n",
				fts_path, strerror(errno));
Jon Bernard's avatar
Jon Bernard committed
278
		last_errno = errno;
Jon Bernard's avatar
Jon Bernard committed
279
		return ECGOTHER;
Jon Bernard's avatar
Jon Bernard committed
280 281 282 283 284
	}
	while (1) {
		FTSENT *ent;
		ent = fts_read(fts);
		if (!ent) {
Jon Bernard's avatar
Jon Bernard committed
285 286 287 288 289
			if (errno != 0) {
				cgroup_dbg("fts_read failed\n");
				last_errno = errno;
				final_ret = ECGOTHER;
			}
Jon Bernard's avatar
Jon Bernard committed
290 291
			break;
		}
Jon Bernard's avatar
Jon Bernard committed
292 293 294 295 296 297 298 299 300 301
		ignored = 0;
		if (ignore_list != NULL)
			for (i = 0; ignore_list[i] != NULL; i++)
				if (!strcmp(ignore_list[i], ent->fts_name)) {
					ignored = 1;
					break;
				}
		if (ignored)
			continue;

Jon Bernard's avatar
Jon Bernard committed
302
		ret = cg_chmod_file(fts, ent, dir_mode, dirm_change,
Jon Bernard's avatar
Jon Bernard committed
303 304
				file_mode, filem_change,
				owner_is_umask);
Jon Bernard's avatar
Jon Bernard committed
305
		if (ret) {
306 307
			cgroup_warn("Warning: cannot change file mode %s: %s\n",
					fts_path, strerror(errno));
Jon Bernard's avatar
Jon Bernard committed
308 309 310 311 312
			last_errno = errno;
			final_ret = ECGOTHER;
		}
	}
	fts_close(fts);
Jon Bernard's avatar
Jon Bernard committed
313 314
	return final_ret;
}
Jon Bernard's avatar
Jon Bernard committed
315

Jon Bernard's avatar
Jon Bernard committed
316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340
int cg_chmod_recursive(struct cgroup *cgroup, mode_t dir_mode,
		int dirm_change, mode_t file_mode, int filem_change)
{
	int i;
	char *path;
	int final_ret = 0;
	int ret;

	path = malloc(FILENAME_MAX);
	if (!path) {
		last_errno = errno;
		return ECGOTHER;
	}
	for (i = 0; i < cgroup->index; i++) {
		if (!cg_build_path(cgroup->name, path,
				cgroup->controller[i]->name)) {
			final_ret = ECGFAIL;
			break;
		}
		ret = cg_chmod_recursive_controller(path, dir_mode, dirm_change,
				file_mode, filem_change, 0, NULL);
		if (ret)
			final_ret = ret;
	}
	free(path);
Jon Bernard's avatar
Jon Bernard committed
341 342 343
	return final_ret;
}

Jon Bernard's avatar
Jon Bernard committed
344 345 346 347 348 349 350 351
void cgroup_set_permissions(struct cgroup *cgroup,
		mode_t control_dperm, mode_t control_fperm,
		mode_t task_fperm)
{
	cgroup->control_dperm = control_dperm;
	cgroup->control_fperm = control_fperm;
	cgroup->task_fperm = task_fperm;
}
Jon Bernard's avatar
Jon Bernard committed
352

/*
 * Return a newly allocated copy of the last component of @path.
 *
 * basename() may modify its argument, so it is run on a private copy.
 *
 *	@param path Path to examine
 *	@return Heap string the caller must free(), or NULL if @path is NULL or
 *		memory cannot be allocated
 */
static char *cgroup_basename(const char *path)
{
	char *base;
	char *tmp_string;

	if (!path)
		return NULL;

	tmp_string = strdup(path);
	if (!tmp_string)
		return NULL;

	base = strdup(basename(tmp_string));

	free(tmp_string);

	return base;
}

370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388
static int cgroup_test_subsys_mounted(const char *name)
{
	int i;

	pthread_rwlock_rdlock(&cg_mount_table_lock);

	for (i = 0; cg_mount_table[i].name[0] != '\0'; i++) {
		if (strncmp(cg_mount_table[i].name, name,
				sizeof(cg_mount_table[i].name)) == 0) {
			pthread_rwlock_unlock(&cg_mount_table_lock);
			return 1;
		}
	}
	pthread_rwlock_unlock(&cg_mount_table_lock);
	return 0;
}

/**
 * Free a single cgroup_rule struct.
389
 *	@param r The rule to free from memory
390 391 392 393 394 395 396 397
 */
static void cgroup_free_rule(struct cgroup_rule *r)
{
	/* Loop variable */
	int i = 0;

	/* Make sure our rule is not NULL, first. */
	if (!r) {
398
		cgroup_warn("Warning: attempted to free NULL rule\n");
399 400 401 402 403 404 405
		return;
	}
	if (r->procname) {
		free(r->procname);
		r->procname = NULL;
	}
	/* We must free any used controller strings, too. */
406
	for (i = 0; i < MAX_MNT_ELEMENTS; i++) {
407 408 409 410 411 412 413 414 415 416
		if (r->controllers[i])
			free(r->controllers[i]);
	}

	free(r);
}

/**
 * Free a list of cgroup_rule structs.  If rl is the main list of rules,
 * the lock must be taken for writing before calling this function!
417
 *	@param rl Pointer to the list of rules to free from memory
418
 */
419
static void cgroup_free_rule_list(struct cgroup_rule_list *cg_rl)
420 421 422 423 424
{
	/* Temporary pointer */
	struct cgroup_rule *tmp = NULL;

	/* Make sure we're not freeing NULL memory! */
425
	if (!(cg_rl->head)) {
426
		cgroup_warn("Warning: attempted to free NULL list\n");
427 428 429
		return;
	}

430 431 432
	while (cg_rl->head) {
		tmp = cg_rl->head;
		cg_rl->head = tmp->next;
433 434 435 436
		cgroup_free_rule(tmp);
	}

	/* Don't leave wild pointers around! */
437 438
	cg_rl->head = NULL;
	cg_rl->tail = NULL;
439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479
}

/*
 * Strip the parts of a configuration line that carry no information.
 *
 * The line is modified in place: it is cut at the first '#' (comment) and
 * at the first newline. Leading spaces and tabs are then skipped.
 *
 *	@param rule Line buffer (NUL-terminated, writable)
 *	@return Pointer into @rule at the first non-blank character, or NULL if
 *		nothing is left on the line
 */
static char *cg_skip_unused_charactors_in_rule(char *rule)
{
	char *itr;

	/* We ignore anything after a # sign as comments. */
	itr = strchr(rule, '#');
	if (itr)
		*itr = '\0';

	/* We also need to remove the newline character. */
	itr = strchr(rule, '\n');
	if (itr)
		*itr = '\0';

	/*
	 * Now, skip any leading tabs and spaces. The cast is required:
	 * passing a negative char value to isblank() is undefined.
	 */
	itr = rule;
	while (isblank((unsigned char)*itr))
		itr++;

	/* If there's nothing left, we can ignore this line. */
	if (*itr == '\0')
		return NULL;

	return itr;
}

/**
 * Parse the configuration file that maps UID/GIDs to cgroups.  If ever the
 * configuration file is modified, applications should call this function to
 * load the new configuration rules.  The function caller is responsible for
 * calling free() on each rule in the list.
 *
 * The cache parameter alters the behavior of this function.  If true, this
 * function will read the entire configuration file and store the results in
 * rl (global rules list).  If false, this function will only parse until it
 * finds a rule matching the given UID or GID.  It will store this rule in rl,
 * as well as any children rules (rules that begin with a %) that it has.
 *
 * This function is NOT thread safe!
480 481 482 483
 *	@param cache True to cache rules, else false
 *	@param muid If cache is false, the UID to match against
 *	@param mgid If cache is false, the GID to match against
 *	@return 0 on success, -1 if no cache and match found, > 0 on error.
484 485 486
 * TODO: Make this function thread safe!
 */
static int cgroup_parse_rules(bool cache, uid_t muid,
487
					  gid_t mgid, const char *mprocname)
488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519
{
	/* File descriptor for the configuration file */
	FILE *fp = NULL;

	/* Buffer to store the line we're working on */
	char buff[CGROUP_RULE_MAXLINE] = { '\0' };

	/* Iterator for the line we're working on */
	char *itr = NULL;

	/* Pointer to process name in a line of the configuration file */
	char *procname = NULL;

	/* Pointer to the list that we're using */
	struct cgroup_rule_list *lst = NULL;

	/* Rule to add to the list */
	struct cgroup_rule *newrule = NULL;

	/* Structure to get GID from group name */
	struct group *grp = NULL;

	/* Structure to get UID from user name */
	struct passwd *pwd = NULL;

	/* Temporary storage for a configuration rule */
	char key[CGROUP_RULE_MAXKEY] = { '\0' };
	char user[LOGIN_NAME_MAX] = { '\0' };
	char controllers[CG_CONTROLLER_MAX] = { '\0' };
	char destination[FILENAME_MAX] = { '\0' };
	uid_t uid = CGRULE_INVALID;
	gid_t gid = CGRULE_INVALID;
520
	size_t len_username;
521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550
	int len_procname;

	/* The current line number */
	unsigned int linenum = 0;

	/* Did we skip the previous line? */
	bool skipped = false;

	/* Have we found a matching rule (non-cache mode)? */
	bool matched = false;

	/* Return codes */
	int ret = 0;

	/* Temporary buffer for strtok() */
	char *stok_buff = NULL;

	/* Loop variable. */
	int i = 0;

	/* Determine which list we're using. */
	if (cache)
		lst = &rl;
	else
		lst = &trl;

	/* If our list already exists, clean it. */
	if (lst->head)
		cgroup_free_rule_list(lst);

551 552 553 554 555 556 557 558 559
	/* Open the configuration file. */
	pthread_rwlock_wrlock(&rl_lock);
	fp = fopen(CGRULES_CONF_FILE, "re");
	if (!fp) {
		cgroup_warn("Warning: failed to open configuration file %s: %s\n",
				CGRULES_CONF_FILE, strerror(errno));
		goto unlock;
	}

560 561 562 563 564 565 566 567 568 569 570 571 572 573
	/* Now, parse the configuration file one line at a time. */
	cgroup_dbg("Parsing configuration file.\n");
	while (fgets(buff, sizeof(buff), fp) != NULL) {
		linenum++;

		itr = cg_skip_unused_charactors_in_rule(buff);
		if (!itr)
			continue;

		/*
		 * If we skipped the last rule and this rule is a continuation
		 * of it (begins with %), then we should skip this rule too.
		 */
		if (skipped && *itr == '%') {
574
			cgroup_warn("Warning: skipped child of invalid rule,"
575 576 577 578 579 580 581 582 583 584 585
					" line %d.\n", linenum);
			continue;
		}

		/*
		 * If there is something left, it should be a rule.  Otherwise,
		 * there's an error in the configuration file.
		 */
		skipped = false;
		i = sscanf(itr, "%s%s%s", key, controllers, destination);
		if (i != 3) {
586 587 588
			cgroup_err(
					"Error: failed to parse configuration file on line %d\n",
					linenum);
589 590 591 592 593 594 595 596 597
			goto parsefail;
		}
		procname = strchr(key, ':');
		if (procname) {
			/* <user>:<procname>  <subsystem>  <destination> */
			procname++;	/* skip ':' */
			len_username = procname - key - 1;
			len_procname = strlen(procname);
			if (len_procname < 0) {
598 599 600
				cgroup_err(
						"Error: failed to parse configuration file on line %d\n",
						linenum);
601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629
				goto parsefail;
			}
		} else {
			len_username = strlen(key);
			len_procname = 0;
		}
		len_username = min(len_username, sizeof(user) - 1);
		memset(user, '\0', sizeof(user));
		strncpy(user, key, len_username);

		/*
		 * Next, check the user/group.  If it's a % sign, then we
		 * are continuing another rule and UID/GID should not be
		 * reset.  If it's a @, we're dealing with a GID rule.  If
		 * it's a *, then we do not need to do a lookup because the
		 * rule always applies (it's a wildcard).  If we're using
		 * non-cache mode and we've found a matching rule, we only
		 * continue to parse if we're looking at a child rule.
		 */
		if ((!cache) && matched && (strncmp(user, "%", 1) != 0)) {
			/* If we make it here, we finished (non-cache). */
			cgroup_dbg("Parsing of configuration file"
				" complete.\n\n");
			ret = -1;
			goto close;
		}
		if (strncmp(user, "@", 1) == 0) {
			/* New GID rule. */
			itr = &(user[1]);
630 631
			grp = getgrnam(itr);
			if (grp) {
632 633 634 635 636 637 638 639 640 641 642 643 644 645 646
				uid = CGRULE_INVALID;
				gid = grp->gr_gid;
			} else {
				cgroup_dbg("Warning: Entry for %s not"
						"found.  Skipping rule on line"
						" %d.\n", itr, linenum);
				skipped = true;
				continue;
			}
		} else if (strncmp(user, "*", 1) == 0) {
			/* Special wildcard rule. */
			uid = CGRULE_WILD;
			gid = CGRULE_WILD;
		} else if (*itr != '%') {
			/* New UID rule. */
647 648
			pwd = getpwnam(user);
			if (pwd) {
649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669
				uid = pwd->pw_uid;
				gid = CGRULE_INVALID;
			} else {
				cgroup_dbg("Warning: Entry for %s not"
						"found.  Skipping rule on line"
						" %d.\n", user, linenum);
				skipped = true;
				continue;
			}
		} /* Else, we're continuing another rule (UID/GID are okay). */

		/*
		 * If we are not caching rules, then we need to check for a
		 * match before doing anything else.  We consider four cases:
		 * The UID matches, the GID matches, the UID is a member of the
		 * GID, or we're looking at the wildcard rule, which always
		 * matches.  If none of these are true, we simply continue to
		 * the next line in the file.
		 */
		if (grp && muid != CGRULE_INVALID) {
			pwd = getpwuid(muid);
Jon Bernard's avatar
Jon Bernard committed
670 671 672
			if (!pwd) {
				continue;
			}
673 674 675 676 677 678
			for (i = 0; grp->gr_mem[i]; i++) {
				if (!(strcmp(pwd->pw_name, grp->gr_mem[i])))
					matched = true;
			}
		}

679
		if (uid == muid || gid == mgid || uid == CGRULE_WILD)
680 681 682 683 684 685
			matched = true;

		if (!cache) {
			if (!matched)
				continue;
			if (len_procname) {
686
				char *mproc_base;
687 688 689 690 691 692 693 694 695 696
				/*
				 * If there is a rule based on process name,
				 * it should be matched with mprocname.
				 */
				if (!mprocname) {
					uid = CGRULE_INVALID;
					gid = CGRULE_INVALID;
					matched = false;
					continue;
				}
697 698

				mproc_base = cgroup_basename(mprocname);
699
				if (strcmp(mprocname, procname) &&
700
					strcmp(mproc_base, procname)) {
701 702 703
					uid = CGRULE_INVALID;
					gid = CGRULE_INVALID;
					matched = false;
704
					free(mproc_base);
705 706
					continue;
				}
707
				free(mproc_base);
708 709 710 711 712 713 714 715 716 717
			}
		}

		/*
		 * Now, we're either caching rules or we found a match.  Either
		 * way, copy everything into a new rule and push it into the
		 * list.
		 */
		newrule = calloc(1, sizeof(struct cgroup_rule));
		if (!newrule) {
718
			cgroup_err("Error: out of memory? Error was: %s\n",
719 720 721 722 723 724 725 726
				strerror(errno));
			last_errno = errno;
			ret = ECGOTHER;
			goto close;
		}

		newrule->uid = uid;
		newrule->gid = gid;
727 728
		len_username = min(len_username,
					sizeof(newrule->username) - 1);
729 730 731 732
		strncpy(newrule->username, user, len_username);
		if (len_procname) {
			newrule->procname = strdup(procname);
			if (!newrule->procname) {
733 734
				cgroup_err("Error: strdup failed to allocate memory %s\n",
						strerror(errno));
Jon Bernard's avatar
Jon Bernard committed
735
				free(newrule);
736 737 738 739 740 741 742 743 744 745 746 747 748 749
				last_errno = errno;
				ret = ECGOTHER;
				goto close;
			}
		} else {
			newrule->procname = NULL;
		}
		strncpy(newrule->destination, destination,
			sizeof(newrule->destination) - 1);
		newrule->next = NULL;

		/* Parse the controller list, and add that to newrule too. */
		stok_buff = strtok(controllers, ",");
		if (!stok_buff) {
750 751
			cgroup_err("Error: failed to parse controllers on line %d\n",
					linenum);
752 753 754 755 756 757
			goto destroyrule;
		}

		i = 0;
		do {
			if (i >= MAX_MNT_ELEMENTS) {
758 759
				cgroup_err("Error: too many controllers listed on line %d\n",
						linenum);
760 761 762 763 764 765
				goto destroyrule;
			}

			newrule->controllers[i] = strndup(stok_buff,
							strlen(stok_buff) + 1);
			if (!(newrule->controllers[i])) {
766 767
				cgroup_err("Error: out of memory? Error was: %s\n",
						strerror(errno));
768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784
				goto destroyrule;
			}
			i++;
		} while ((stok_buff = strtok(NULL, ",")));

		/* Now, push the rule. */
		if (lst->head == NULL) {
			lst->head = newrule;
			lst->tail = newrule;
		} else {
			lst->tail->next = newrule;
			lst->tail = newrule;
		}

		cgroup_dbg("Added rule %s (UID: %d, GID: %d) -> %s for"
			" controllers:", lst->tail->username, lst->tail->uid,
			lst->tail->gid, lst->tail->destination);
785
		for (i = 0; lst->tail->controllers[i]; i++)
786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802
			cgroup_dbg(" %s", lst->tail->controllers[i]);
		cgroup_dbg("\n");

		/* Finally, clear the buffer. */
		grp = NULL;
		pwd = NULL;
	}

	/* If we make it here, there were no errors. */
	cgroup_dbg("Parsing of configuration file complete.\n\n");
	ret = (matched && !cache) ? -1 : 0;
	goto close;

destroyrule:
	cgroup_free_rule(newrule);

parsefail:
803
	ret = ECGRULESPARSEFAIL;
804 805 806 807 808 809 810 811

close:
	fclose(fp);
unlock:
	pthread_rwlock_unlock(&rl_lock);
	return ret;
}

Jon Bernard's avatar
Jon Bernard committed
812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836
int cg_add_duplicate_mount(struct cg_mount_table_s *item, const char *path)
{
	struct cg_mount_point *mount, *it;

	mount = malloc(sizeof(struct cg_mount_point));
	if (!mount) {
		last_errno = errno;
		return ECGOTHER;
	}
	mount->next = NULL;
	strncpy(mount->path, path, sizeof(mount->path));
	mount->path[sizeof(mount->path)-1] = '\0';

	/*
	 * Add the mount point to the end of the list.
	 * Assuming the list is short, no optimization is done.
	 */
	it = &item->mount;
	while (it->next)
		it = it->next;

	it->next = mount;
	return 0;
}

837 838 839 840 841 842 843 844
/**
 * cgroup_init(), initializes the MOUNT_POINT.
 *
 * This code is theoretically thread safe now. Its not really tested
 * so it can blow up. If does for you, please let us know with your
 * test case and we can really make it thread safe.
 *
 */
845
int cgroup_init(void)
846 847 848 849 850 851 852 853 854 855
{
	FILE *proc_mount = NULL;
	struct mntent *ent = NULL;
	struct mntent *temp_ent = NULL;
	int found_mnt = 0;
	int ret = 0;
	static char *controllers[CG_CONTROLLER_MAX];
	FILE *proc_cgroup = NULL;
	char subsys_name[FILENAME_MAX];
	int hierarchy, num_cgroups, enabled;
856
	int i = 0;
857 858 859 860 861 862 863 864
	int j;
	int duplicate = 0;
	char *mntopt = NULL;
	int err;
	char *buf = NULL;
	char mntent_buffer[4 * FILENAME_MAX];
	char *strtok_buffer = NULL;

865 866
	cgroup_set_default_logger(-1);

867 868
	pthread_rwlock_wrlock(&cg_mount_table_lock);

Jon Bernard's avatar
Jon Bernard committed
869 870 871 872 873 874 875 876 877 878 879
	/* free global variables filled by previous cgroup_init() */
	for (i = 0; cg_mount_table[i].name[0] != '\0'; i++) {
		struct cg_mount_point *mount = cg_mount_table[i].mount.next;
		while (mount) {
			struct cg_mount_point *tmp = mount;
			mount = mount->next;
			free(tmp);
		}
	}
	memset(&cg_mount_table, 0, sizeof(cg_mount_table));

880
	proc_cgroup = fopen("/proc/cgroups", "re");
881 882

	if (!proc_cgroup) {
883 884
		cgroup_err("Error: cannot open /proc/cgroups: %s\n",
				strerror(errno));
885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903
		last_errno = errno;
		ret = ECGOTHER;
		goto unlock_exit;
	}

	/*
	 * The first line of the file has stuff we are not interested in.
	 * So just read it and discard the information.
	 *
	 * XX: fix the size for fgets
	 */
	buf = malloc(FILENAME_MAX);
	if (!buf) {
		last_errno = errno;
		ret = ECGOTHER;
		goto unlock_exit;
	}
	if (!fgets(buf, FILENAME_MAX, proc_cgroup)) {
		free(buf);
904 905
		cgroup_err("Error: cannot read /proc/cgroups: %s\n",
				strerror(errno));
906 907 908 909 910 911
		last_errno = errno;
		ret = ECGOTHER;
		goto unlock_exit;
	}
	free(buf);

Jon Bernard's avatar
Jon Bernard committed
912
	i = 0;
913 914 915 916 917 918 919 920 921 922
	while (!feof(proc_cgroup)) {
		err = fscanf(proc_cgroup, "%s %d %d %d", subsys_name,
				&hierarchy, &num_cgroups, &enabled);
		if (err < 0)
			break;
		controllers[i] = strdup(subsys_name);
		i++;
	}
	controllers[i] = NULL;

923
	proc_mount = fopen("/proc/mounts", "re");
924
	if (proc_mount == NULL) {
925 926 927 928
		cgroup_err("Error: cannot open /proc/mounts: %s\n",
				strerror(errno));
		last_errno = errno;
		ret = ECGOTHER;
929 930 931 932 933 934
		goto unlock_exit;
	}

	temp_ent = (struct mntent *) malloc(sizeof(struct mntent));

	if (!temp_ent) {
935
		last_errno = errno;
936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951
		ret = ECGOTHER;
		goto unlock_exit;
	}

	while ((ent = getmntent_r(proc_mount, temp_ent,
					mntent_buffer,
					sizeof(mntent_buffer))) != NULL) {
		if (strcmp(ent->mnt_type, "cgroup"))
			continue;

		for (i = 0; controllers[i] != NULL; i++) {
			mntopt = hasmntopt(ent, controllers[i]);

			if (!mntopt)
				continue;

Jon Bernard's avatar
Jon Bernard committed
952
			cgroup_dbg("found %s in %s\n", controllers[i], ent->mnt_opts);
953 954 955 956

			/* do not have duplicates in mount table */
			duplicate = 0;
			for  (j = 0; j < found_mnt; j++) {
Jon Bernard's avatar
Jon Bernard committed
957 958
				if (strncmp(controllers[i],
							cg_mount_table[j].name,
959
							FILENAME_MAX) == 0) {
960 961 962 963 964 965
					duplicate = 1;
					break;
				}
			}
			if (duplicate) {
				cgroup_dbg("controller %s is already mounted on %s\n",
Jon Bernard's avatar
Jon Bernard committed
966 967 968 969 970 971 972
					mntopt, cg_mount_table[j].mount.path);
				ret = cg_add_duplicate_mount(&cg_mount_table[j],
						ent->mnt_dir);
				if (ret)
					goto unlock_exit;
				/* continue with next controller */
				continue;
973 974
			}

Jon Bernard's avatar
Jon Bernard committed
975 976 977 978 979 980 981 982
			strncpy(cg_mount_table[found_mnt].name,
				controllers[i], FILENAME_MAX);
			cg_mount_table[found_mnt].name[FILENAME_MAX-1] = '\0';
			strncpy(cg_mount_table[found_mnt].mount.path,
				ent->mnt_dir, FILENAME_MAX);
			cg_mount_table[found_mnt].mount.path[FILENAME_MAX-1] =
				'\0';
			cg_mount_table[found_mnt].mount.next = NULL;
983 984 985 986
			cgroup_dbg("Found cgroup option %s, count %d\n",
				ent->mnt_opts, found_mnt);
			found_mnt++;
		}
987 988 989 990 991 992 993 994 995

		/*
		 * Doesn't match the controller.
		 * Check if it is a named hierarchy.
		 */
		mntopt = hasmntopt(ent, "name");

		if (mntopt) {
			mntopt = strtok_r(mntopt, ",", &strtok_buffer);
996 997
			if (!mntopt)
				continue;
998 999 1000 1001
			/*
			 * Check if it is a duplicate
			 */
			duplicate = 0;
Jon Bernard's avatar
Jon Bernard committed
1002 1003 1004 1005 1006 1007 1008 1009 1010

#ifdef OPAQUE_HIERARCHY
			/*
			 * Ignore the opaque hierarchy.
			 */
			if (strcmp(mntopt, OPAQUE_HIERARCHY) == 0)
					continue;
#endif

1011 1012 1013 1014 1015 1016 1017 1018 1019 1020
			for (j = 0; j < found_mnt; j++) {
				if (strncmp(mntopt, cg_mount_table[j].name,
							FILENAME_MAX) == 0) {
					duplicate = 1;
					break;
				}
			}

			if (duplicate) {
				cgroup_dbg("controller %s is already mounted on %s\n",
Jon Bernard's avatar
Jon Bernard committed
1021 1022 1023 1024 1025
					mntopt, cg_mount_table[j].mount.path);
				ret = cg_add_duplicate_mount(&cg_mount_table[j],
						ent->mnt_dir);
				if (ret)
					goto unlock_exit;
1026 1027 1028
				continue;
			}

Jon Bernard's avatar
Jon Bernard committed
1029 1030 1031 1032 1033 1034 1035 1036
			strncpy(cg_mount_table[found_mnt].name,
				mntopt, FILENAME_MAX);
			cg_mount_table[found_mnt].name[FILENAME_MAX-1] = '\0';
			strncpy(cg_mount_table[found_mnt].mount.path,
				ent->mnt_dir, FILENAME_MAX);
			cg_mount_table[found_mnt].mount.path[FILENAME_MAX-1] =
				'\0';
			cg_mount_table[found_mnt].mount.next = NULL;
1037 1038 1039 1040
			cgroup_dbg("Found cgroup option %s, count %d\n",
				ent->mnt_opts, found_mnt);
			found_mnt++;
		}
1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072
	}

	free(temp_ent);

	if (!found_mnt) {
		cg_mount_table[0].name[0] = '\0';
		ret = ECGROUPNOTMOUNTED;
		goto unlock_exit;
	}

	found_mnt++;
	cg_mount_table[found_mnt].name[0] = '\0';

	cgroup_initialized = 1;

unlock_exit:
	if (proc_cgroup)
		fclose(proc_cgroup);

	if (proc_mount)
		fclose(proc_mount);

	for (i = 0; controllers[i]; i++) {
		free(controllers[i]);
		controllers[i] = NULL;
	}

	pthread_rwlock_unlock(&cg_mount_table_lock);

	return ret;
}

1073
/**
 * Check whether a cgroup filesystem is currently mounted.
 *
 * Walks /proc/mounts and stops at the first entry whose filesystem type
 * is exactly "cgroup".
 *
 * @return 1 when such an entry exists, 0 when there is none or when
 *	/proc/mounts cannot be opened.
 */
static int cg_test_mounted_fs(void)
{
	FILE *proc_mount = NULL;
	struct mntent *ent = NULL;
	struct mntent temp_ent;
	char mntent_buff[4 * FILENAME_MAX];
	int ret = 0;

	proc_mount = fopen("/proc/mounts", "re");
	if (proc_mount == NULL)
		return 0;

	/* getmntent_r() fills temp_ent/mntent_buff and returns NULL at the end */
	while ((ent = getmntent_r(proc_mount, &temp_ent, mntent_buff,
						sizeof(mntent_buff))) != NULL) {
		if (strcmp(ent->mnt_type, "cgroup") == 0) {
			ret = 1;
			break;
		}
	}

	fclose(proc_mount);
	return ret;
}

1114
/**
 * Return the thread ID of the calling thread.
 *
 * The gettid system call is invoked directly through syscall().
 */
static inline pid_t cg_gettid(void)
{
	return syscall(__NR_gettid);
}

1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132
/**
 * Join a prefix and a suffix into a directory path ending in '/'.
 *
 * One leading '/' of suf is dropped, so the prefix is expected to carry the
 * separator. A trailing '/' is appended unless the result already ends with
 * one: suf ends in '/', or suf is empty and pref ends in '/'.
 *
 * @param pref Leading part of the path.
 * @param suf Trailing part of the path; may be empty.
 * @param path Output buffer, at least FILENAME_MAX bytes long; the result
 *	is truncated to fit.
 * @return path
 */
static char *cg_concat_path(const char *pref, const char *suf, char *path)
{
	size_t pref_len = strlen(pref);
	size_t suf_len = strlen(suf);
	int ends_with_slash;

	/* Look at the last character only when the string has one. */
	if (suf_len > 0)
		ends_with_slash = (suf[suf_len - 1] == '/');
	else
		ends_with_slash = (pref_len > 0 && pref[pref_len - 1] == '/');

	if (suf[0] == '/')
		suf++;

	snprintf(path, FILENAME_MAX, "%s%s%s", pref, suf,
		ends_with_slash ? "" : "/");
	path[FILENAME_MAX-1] = '\0';
	return path;
}

1133 1134

/* Call with cg_mount_table_lock taken */
Jon Bernard's avatar
Jon Bernard committed
1135
/* path value have to have size at least FILENAME_MAX */
1136 1137
static char *cg_build_path_locked(const char *name, char *path,
						const char *type)
1138 1139 1140 1141
{
	int i;
	for (i = 0; cg_mount_table[i].name[0] != '\0'; i++) {
		if (strcmp(cg_mount_table[i].name, type) == 0) {
1142
			if (cg_namespace_table[i]) {
Jon Bernard's avatar
Jon Bernard committed
1143 1144 1145 1146
				snprintf(path, FILENAME_MAX, "%s/%s/",
						cg_mount_table[i].mount.path,
						cg_namespace_table[i]);
				path[FILENAME_MAX-1] = '\0';
1147
			} else {
Jon Bernard's avatar
Jon Bernard committed
1148 1149 1150
				snprintf(path, FILENAME_MAX, "%s/",
						cg_mount_table[i].mount.path);
				path[FILENAME_MAX-1] = '\0';
1151 1152
			}

1153 1154 1155
			if (name) {
				char *tmp;
				tmp = strdup(path);
1156 1157 1158

				/* FIXME: missing OOM check here! */

1159
				cg_concat_path(tmp, name, path);
1160 1161 1162 1163 1164 1165 1166 1167
				free(tmp);
			}
			return path;
		}
	}
	return NULL;
}

1168
char *cg_build_path(const char *name, char *path, const char *type)
1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181
{
	pthread_rwlock_rdlock(&cg_mount_table_lock);
	path = cg_build_path_locked(name, path, type);
	pthread_rwlock_unlock(&cg_mount_table_lock);

	return path;
}

static int __cgroup_attach_task_pid(char *path, pid_t tid)
{
	int ret = 0;
	FILE *tasks = NULL;

1182
	tasks = fopen(path, "we");
1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207
	if (!tasks) {
		switch (errno) {
		case EPERM:
			return ECGROUPNOTOWNER;
		case ENOENT:
			return ECGROUPNOTEXIST;
		default:
			return ECGROUPNOTALLOWED;
		}
	}
	ret = fprintf(tasks, "%d", tid);
	if (ret < 0) {
		last_errno = errno;
		ret = ECGOTHER;
		goto err;
	}
	ret = fflush(tasks);
	if (ret) {
		last_errno = errno;
		ret = ECGOTHER;
		goto err;
	}
	fclose(tasks);
	return 0;
err:
1208
	cgroup_warn("Warning: cannot write tid %d to %s:%s\n",
1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227
			tid, path, strerror(errno));
	fclose(tasks);
	return ret;
}

/** cgroup_attach_task_pid is used to assign tasks to a cgroup.
 *  struct cgroup *cgroup: The cgroup to assign the thread to.
 *  pid_t tid: The thread to be assigned to the cgroup.
 *
 *  returns 0 on success.
 *  returns ECGROUPNOTOWNER if the caller does not have access to the cgroup.
 *  returns ECGROUPNOTALLOWED for other causes of failure.
 */
int cgroup_attach_task_pid(struct cgroup *cgroup, pid_t tid)
{
	char path[FILENAME_MAX];
	int i, ret = 0;

	if (!cgroup_initialized) {
1228
		cgroup_warn("Warning: libcgroup is not initialized\n");
1229 1230
		return ECGROUPNOTINITIALIZED;
	}
1231
	if (!cgroup) {
1232
		pthread_rwlock_rdlock(&cg_mount_table_lock);
1233 1234
		for (i = 0; i < CG_CONTROLLER_MAX &&
				cg_mount_table[i].name[0] != '\0'; i++) {
1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248
			if (!cg_build_path_locked(NULL, path,
						cg_mount_table[i].name))
				continue;
			strncat(path, "/tasks", sizeof(path) - strlen(path));
			ret = __cgroup_attach_task_pid(path, tid);
			if (ret) {
				pthread_rwlock_unlock(&cg_mount_table_lock);
				return ret;
			}
		}
		pthread_rwlock_unlock(&cg_mount_table_lock);
	} else {
		for (i = 0; i < cgroup->index; i++) {
			if (!cgroup_test_subsys_mounted(cgroup->controller[i]->name)) {
1249 1250
				cgroup_warn("Warning: subsystem %s is not mounted\n",
						cgroup->controller[i]->name);
1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289
				return ECGROUPSUBSYSNOTMOUNTED;
			}
		}

		for (i = 0; i < cgroup->index; i++) {
			if (!cg_build_path(cgroup->name, path,
					cgroup->controller[i]->name))
				continue;
			strncat(path, "/tasks", sizeof(path) - strlen(path));
			ret = __cgroup_attach_task_pid(path, tid);
			if (ret)
				return ret;
		}
	}
	return 0;
}

/** cgroup_attach_task attaches the calling thread to a cgroup.
 *  struct cgroup *cgroup: The cgroup to assign the calling thread to.
 *
 *  The thread ID comes from cg_gettid(); the result of
 *  cgroup_attach_task_pid() is returned unchanged.
 */
int cgroup_attach_task(struct cgroup *cgroup)
{
	return cgroup_attach_task_pid(cgroup, cg_gettid());
}

/**
 * cg_mkdir_p, emulate the mkdir -p command (recursively creating paths)
 * @path: path to create
 */
int cg_mkdir_p(const char *path)
{
	char *real_path = NULL;
Jon Bernard's avatar
Jon Bernard committed
1290
	int i = 0;
1291
	char pos;
1292 1293
	int ret = 0, stat_ret;
	struct stat st;
1294 1295 1296 1297 1298 1299 1300 1301

	real_path = strdup(path);
	if (!real_path) {
		last_errno = errno;
		return ECGOTHER;
	}

	do {
Jon Bernard's avatar
Jon Bernard committed
1302 1303 1304 1305 1306 1307 1308 1309 1310 1311
		while (real_path[i] != '\0' && real_path[i] == '/')
			i++;
		if (real_path[i] == '\0')
			break; /* The path ends with '/', ignore it. */
		while (real_path[i] != '\0' && real_path[i] != '/')
			i++;
		pos = real_path[i];
		real_path[i] = '\0';		/* Temporarily overwrite "/" */
		ret = mkdir(real_path, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
		real_path[i] = pos;
1312 1313 1314 1315 1316 1317 1318 1319 1320
		if (ret) {
			switch (errno) {
			case EEXIST:
				ret = 0;	/* Not fatal really */
				break;
			case EPERM:
				ret = ECGROUPNOTOWNER;
				goto done;
			default:
1321 1322 1323 1324 1325 1326 1327 1328
				/* Check if path exists */
				real_path[i] = '\0';
				stat_ret = stat(real_path, &st);
				real_path[i] = pos;
				if (stat_ret == 0) {
					ret = 0;	/* Path exists */
					break;
				}
1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345
				ret = ECGROUPNOTALLOWED;
				goto done;
			}
		}
	} while (real_path[i]);

done:
	free(real_path);
	return ret;
}

/*
 * create_control_group()
 * This is the basic function used to create the control group. This function
 * just makes the group. It does not set any permissions, or any control values.
 * The argument path is the fully qualified path name to make it generic.
 */
1346
static int cg_create_control_group(const char *path)
1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360
{
	int error;
	if (!cg_test_mounted_fs())
		return ECGROUPNOTMOUNTED;
	error = cg_mkdir_p(path);
	return error;
}

/*
 * set_control_value()
 * This is the low level function for putting in a value in a control file.
 * This function takes in the complete path and sets the value in val in that
 * file.
 */
1361
static int cg_set_control_value(char *path, const char *val)
1362 1363 1364 1365 1366
{
	FILE *control_file = NULL;
	if (!cg_test_mounted_fs())
		return ECGROUPNOTMOUNTED;

1367
	control_file = fopen(path, "r+e");
1368 1369 1370 1371 1372 1373 1374 1375 1376 1377

	if (!control_file) {
		if (errno == EPERM) {
			/*
			 * We need to set the correct error value, does the
			 * group exist but we don't have the subsystem
			 * mounted at that point, or is it that the group
			 * does not exist. So we check if the tasks file
			 * exist. Before that, we need to extract the path.
			 */
Jon Bernard's avatar
Jon Bernard committed
1378 1379 1380 1381 1382 1383 1384
			char *path_dir_end;
			char *tasks_path;

			path_dir_end = strrchr(path, '/');
			if (path_dir_end == NULL)
				return ECGROUPVALUENOTEXIST;
			path_dir_end = '\0';
1385

Jon Bernard's avatar
Jon Bernard committed
1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396
			/* task_path contain: $path/tasks */
			tasks_path = (char *)malloc(strlen(path) + 6 + 1);
			if (tasks_path == NULL) {
				last_errno = errno;
				return ECGOTHER;
			}
			strcpy(tasks_path, path);
			strcat(tasks_path, "/tasks");

			/* test tasks file for read flag */
			control_file = fopen(tasks_path, "re");
1397
			if (!control_file) {
Jon Bernard's avatar
Jon Bernard committed
1398 1399
				if (errno == ENOENT) {
					free(tasks_path);
1400
					return ECGROUPSUBSYSNOTMOUNTED;
Jon Bernard's avatar
Jon Bernard committed
1401 1402 1403
				}
			} else {
				fclose(control_file);
1404
			}
Jon Bernard's avatar
Jon Bernard committed
1405
			free(tasks_path);
1406 1407 1408 1409 1410
			return ECGROUPNOTALLOWED;
		}
		return ECGROUPVALUENOTEXIST;
	}

1411 1412 1413 1414 1415 1416 1417 1418 1419
	if (fprintf(control_file, "%s", val) < 0) {
		last_errno = errno;
		fclose(control_file);
		return ECGOTHER;
	}
	if (fclose(control_file) < 0) {
		last_errno = errno;
		return ECGOTHER;
	}
1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437
	return 0;
}

/** cgroup_modify_cgroup modifies the cgroup control files.
 * struct cgroup *cgroup: The name will be the cgroup to be modified.
 * The values will be the values to be modified, those not mentioned
 * in the structure will not be modified.
 *
 * The uids cannot be modified yet.
 *
 * returns 0 on success.
 *
 */

int cgroup_modify_cgroup(struct cgroup *cgroup)
{
	char *path, base[FILENAME_MAX];
	int i;
Jon Bernard's avatar
Jon Bernard committed
1438
	int error = 0;
1439 1440 1441 1442 1443 1444 1445 1446 1447 1448
	int ret;

	if (!cgroup_initialized)
		return ECGROUPNOTINITIALIZED;

	if (!cgroup)
		return ECGROUPNOTALLOWED;

	for (i = 0; i < cgroup->index; i++) {
		if (!cgroup_test_subsys_mounted(cgroup->controller[i]->name)) {
1449
			cgroup_warn("Warning: subsystem %s is not mounted\n",
1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471
				cgroup->controller[i]->name);
			return ECGROUPSUBSYSNOTMOUNTED;
		}
	}

	for (i = 0; i < cgroup->index; i++) {
		int j;
		if (!cg_build_path(cgroup->name, base,
			cgroup->controller[i]->name))
			continue;
		for (j = 0; j < cgroup->controller[i]->index; j++) {
			ret = asprintf(&path, "%s%s", base,
				cgroup->controller[i]->values[j]->name);
			if (ret < 0) {
				last_errno = errno;
				error = ECGOTHER;
				goto err;
			}
			error = cg_set_control_value(path,
				cgroup->controller[i]->values[j]->value);
			free(path);
			path = NULL;
1472 1473 1474 1475 1476 1477
			/* don't consider error in files directly written by
			 * the user as fatal */
			if (error && !cgroup->controller[i]->values[j]->dirty) {
				error = 0;
				continue;
			}
1478 1479
			if (error)
				goto err;
1480
			cgroup->controller[i]->values[j]->dirty = false;
1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493
		}
	}
err:
	return error;

}

/**
 * @dst: Destination controller
 * @src: Source controller from which values will be copied to dst
 *
 * Create a duplicate copy of values under the specified controller
 */
1494
static int cgroup_copy_controller_values(struct cgroup_controller *dst,
1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508
					struct cgroup_controller *src)
{
	int i, ret = 0;

	if (!dst || !src)
		return ECGFAIL;

	strncpy(dst->name, src->name, FILENAME_MAX);
	for (i = 0; i < src->index; i++, dst->index++) {
		struct control_value *src_val = src->values[i];
		struct control_value *dst_val;

		dst->values[i] = calloc(1, sizeof(struct control_value));
		if (!dst->values[i]) {
Jon Bernard's avatar
Jon Bernard committed
1509 1510
			last_errno = errno;
			ret = ECGOTHER;
1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549
			goto err;
		}

		dst_val = dst->values[i];
		strncpy(dst_val->value, src_val->value, CG_VALUE_MAX);
		strncpy(dst_val->name, src_val->name, FILENAME_MAX);
	}
err:
	return ret;
}

/**
 * @dst: Destination control group
 * @src: Source from which values will be copied to dst
 *
 * Create a duplicate copy of src in dst. This will be useful for those who
 * that intend to create new instances based on an existing control group
 */
int cgroup_copy_cgroup(struct cgroup *dst, struct cgroup *src)
{
	int ret = 0, i;

	if (!dst || !src)
		return ECGROUPNOTEXIST;

	/*
	 * Should we just use the restrict keyword instead?
	 */
	if (dst == src)
		return ECGFAIL;

	cgroup_free_controllers(dst);

	for (i = 0; i < src->index; i++, dst->index++) {
		struct cgroup_controller *src_ctlr = src->controller[i];
		struct cgroup_controller *dst_ctlr;

		dst->controller[i] = calloc(1, sizeof(struct cgroup_controller));
		if (!dst->controller[i]) {
Jon Bernard's avatar
Jon Bernard committed
1550 1551
			last_errno = errno;
			ret = ECGOTHER;
1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568
			goto err;
		}

		dst_ctlr = dst->controller[i];
		ret = cgroup_copy_controller_values(dst_ctlr, src_ctlr);
		if (ret)
			goto err;
	}
err:
	return ret;
}

/** cgroup_create_cgroup creates a new control group.
 * struct cgroup *cgroup: The control group to be created
 *
 * returns 0 on success. We recommend calling cg_delete_cgroup
 * if this routine fails. That should do the cleanup operation.
Jon Bernard's avatar
Jon Bernard committed
1569 1570
 * If ECGCANTSETVALUE is returned, the group was created successfully
 * but not all controller parameters were successfully set.
1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626
 */
int cgroup_create_cgroup(struct cgroup *cgroup, int ignore_ownership)
{
	char *fts_path[2];
	char *base = NULL;
	char *path = NULL;
	int i, j, k;
	int error = 0;
	int retval = 0;
	int ret;

	if (!cgroup_initialized)
		return ECGROUPNOTINITIALIZED;

	if (!cgroup)
		return ECGROUPNOTALLOWED;

	for (i = 0; i < cgroup->index;	i++) {
		if (!cgroup_test_subsys_mounted(cgroup->controller[i]->name))
			return ECGROUPSUBSYSNOTMOUNTED;
	}

	fts_path[0] = (char *)malloc(FILENAME_MAX);
	if (!fts_path[0]) {
		last_errno = errno;
		return ECGOTHER;
	}
	fts_path[1] = NULL;
	path = fts_path[0];

	/*
	 * XX: One important test to be done is to check, if you have multiple
	 * subsystems mounted at one point, all of them *have* be on the cgroup
	 * data structure. If not, we fail.
	 */
	for (k = 0; k < cgroup->index; k++) {
		if (!cg_build_path(cgroup->name, path,
				cgroup->controller[k]->name))
			continue;

		error = cg_create_control_group(path);
		if (error)
			goto err;

		base = strdup(path);

		if (!base) {
			last_errno = errno;
			error = ECGOTHER;
			goto err;
		}

		if (!ignore_ownership) {
			cgroup_dbg("Changing ownership of %s\n", fts_path[0]);
			error = cg_chown_recursive(fts_path,
				cgroup->control_uid, cgroup->control_gid);
Jon Bernard's avatar
Jon Bernard committed
1627 1628 1629 1630 1631 1632 1633
			if (!error)
				error = cg_chmod_recursive_controller(fts_path[0],
						cgroup->control_dperm,
						cgroup->control_dperm != NO_PERMS,
						cgroup->control_fperm,
						cgroup->control_fperm != NO_PERMS,
						1, cgroup_ignored_tasks_files);
1634 1635 1636 1637 1638 1639 1640 1641
		}

		if (error)
			goto err;

		for (j = 0; j < cgroup->controller[k]->index; j++) {
			ret = snprintf(path, FILENAME_MAX, "%s%s", base,
					cgroup->controller[k]->values[j]->name);
Jon Bernard's avatar
Jon Bernard committed
1642 1643
			cgroup_dbg("setting %s to \"%s\", pathlen %d\n", path,
				cgroup->controller[k]->values[j]->value, ret);
1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657
			if (ret < 0 || ret >= FILENAME_MAX) {
				last_errno = errno;
				error = ECGOTHER;
				goto err;
			}
			error = cg_set_control_value(path,
				cgroup->controller[k]->values[j]->value);
			/*
			 * Should we undo, what we've done in the loops above?
			 * An error should not be treated as fatal, since we
			 * have several read-only files and several files that
			 * are only conditionally created in the child.
			 *
			 * A middle ground would be to track that there
Jon Bernard's avatar
Jon Bernard committed
1658 1659 1660
			 * was an error and return a diagnostic value--
			 * callers don't get context for the error, but can
			 * ignore it specifically if they wish.
1661 1662
			 */
			if (error) {
1663 1664
				cgroup_err("Error: failed to set %s: %s\n",
					path, cgroup_strerror(error));
Jon Bernard's avatar
Jon Bernard committed
1665
				retval = ECGCANTSETVALUE;
1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676
				continue;
			}
		}

		if (!ignore_ownership) {
			ret = snprintf(path, FILENAME_MAX, "%s/tasks", base);
			if (ret < 0 || ret >= FILENAME_MAX) {
				last_errno = errno;
				error = ECGOTHER;
				goto err;
			}
Jon Bernard's avatar
Jon Bernard committed
1677
			error = cg_chown(path, cgroup->tasks_uid,
1678
							cgroup->tasks_gid);
Jon Bernard's avatar
Jon Bernard committed
1679 1680 1681 1682
			if (!error && cgroup->task_fperm != NO_PERMS)
				error = cg_chmod_path(path, cgroup->task_fperm,
						1);

1683 1684 1685 1686 1687
			if (error) {
				last_errno = errno;
				error = ECGOTHER;
				goto err;
			}
Jon Bernard's avatar
Jon Bernard committed
1688

1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703
		}
		free(base);
		base = NULL;
	}

err:
	if (path)
		free(path);
	if (base)
		free(base);
	if (retval && !error)
		error = retval;
	return error;
}

Jon Bernard's avatar
Jon Bernard committed
1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725
/**
 * Obtain the calculated parent name of specified cgroup; no validation
 * of the existence of the child or parent group is performed.
 *
 * Given the path-like hierarchy of cgroup names, this function returns
 * the dirname() of the cgroup name as the likely parent name; the caller
 * is responsible for validating parent as appropriate.
 *
 * @param cgroup The cgroup to query for parent's name
 * @param parent Output, name of parent's group, or NULL if the
 * 	provided cgroup is the root group.
 *	Caller is responsible to free the returned string.
 * @return 0 on success, > 0 on error
 */
static int cgroup_get_parent_name(struct cgroup *cgroup, char **parent)
{
	int ret = 0;
	char *dir = NULL;
	char *pdir = NULL;

	dir = strdup(cgroup->name);
	if (!dir) {
Jon Bernard's avatar
Jon Bernard committed
1726 1727
		last_errno = errno;
		return ECGOTHER;
Jon Bernard's avatar
Jon Bernard committed
1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741
	}
	cgroup_dbg("group name is %s\n", dir);

	pdir = dirname(dir);
	cgroup_dbg("parent's group name is %s\n", pdir);

	/* check for root group */
	if (strlen(cgroup->name) == 0 || !strcmp(cgroup->name, pdir)) {
		cgroup_dbg("specified cgroup \"%s\" is root group\n",
			cgroup->name);
		*parent = NULL;
	}
	else {
		*parent = strdup(pdir);
Jon Bernard's avatar
Jon Bernard committed
1742 1743 1744 1745
		if (*parent == NULL) {
			last_errno = errno;
			ret = ECGOTHER;
		}
Jon Bernard's avatar
Jon Bernard committed
1746 1747 1748 1749 1750 1751
	}
	free(dir);

	return ret;
}

1752
/**
Jon Bernard's avatar
Jon Bernard committed
1753 1754 1755 1756 1757 1758 1759
 * Find the parent of the specified directory. It returns the parent in
 * hierarchy of given controller (the parent is usually name/.. unless name is
 * a mount point.  It is assumed both the cgroup (and, therefore, parent)
 * already exist, and will fail otherwise.
 *
 * When namespaces are used, a group can have different parents for different
 * controllers.
1760 1761
 *
 * @param cgroup The cgroup
Jon Bernard's avatar
Jon Bernard committed
1762
 * @param controller The controller
1763 1764 1765 1766
 * @param parent Output, name of parent's group (if the group has parent) or
 *	NULL, if the provided cgroup is the root group and has no parent.
 *	Caller is responsible to free the returned string!
 * @return 0 on success, >0 on error.
1767
 */
Jon Bernard's avatar
Jon Bernard committed
1768 1769
static int cgroup_find_parent(struct cgroup *cgroup, char *controller,
		char **parent)
1770
{
1771 1772
	char child_path[FILENAME_MAX];
	char *parent_path = NULL;
1773
	struct stat stat_child, stat_parent;
1774 1775 1776
	int ret = 0;

	*parent = NULL;
1777 1778

	pthread_rwlock_rdlock(&cg_mount_table_lock);
1779
	if (!cg_build_path_locked(cgroup->name, child_path, controller)) {
1780
		pthread_rwlock_unlock(&cg_mount_table_lock);
1781
		return ECGFAIL;
1782 1783 1784
	}
	pthread_rwlock_unlock(&cg_mount_table_lock);

1785
	cgroup_dbg("path is %s\n", child_path);
1786

1787 1788
	if (asprintf(&parent_path, "%s/..", child_path) < 0)
		return ECGFAIL;
1789

1790
	cgroup_dbg("parent's name is %s\n", parent_path);
1791

1792 1793 1794
	if (stat(child_path, &stat_child) < 0) {
		last_errno = errno;
		ret = ECGOTHER;
1795
		goto free_parent;
1796
	}
1797

1798 1799 1800
	if (stat(parent_path, &stat_parent) < 0) {
		last_errno = errno;
		ret = ECGOTHER;
1801
		goto free_parent;
1802
	}
1803 1804 1805 1806 1807

	/*
	 * Is the specified "name" a mount point?
	 */
	if (stat_parent.st_dev != stat_child.st_dev) {
1808 1809 1810
		*parent = NULL;
		ret = 0;
		cgroup_dbg("Parent is on different device\n");
1811
	} else {
Jon Bernard's avatar
Jon Bernard committed
1812
		ret = cgroup_get_parent_name(cgroup, parent);
1813
	}
1814 1815

free_parent:
1816 1817
	free(parent_path);
	return ret;
1818 1819 1820 1821 1822 1823
}

/**
 * @cgroup: cgroup data structure to be filled with parent values and then
 *	  passed down for creation
 * @ignore_ownership: Ignore doing a chown on the newly created cgroup
Jon Bernard's avatar
Jon Bernard committed
1824 1825 1826
 * @return 0 on success, > 0 on failure.  If  ECGCANTSETVALUE is returned,
 * the group was created successfully, but not all controller parameters
 * were copied from the parent successfully; unfortunately, this is expected...
1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837
 */
int cgroup_create_cgroup_from_parent(struct cgroup *cgroup,
					int ignore_ownership)
{
	char *parent = NULL;
	struct cgroup *parent_cgroup = NULL;
	int ret = ECGFAIL;

	if (!cgroup_initialized)
		return ECGROUPNOTINITIALIZED;

Jon Bernard's avatar
Jon Bernard committed
1838
	ret = cgroup_get_parent_name(cgroup, &parent);
1839
	if (ret)
1840 1841
		return ret;

1842 1843 1844 1845 1846 1847 1848 1849
	if (parent == NULL) {
		/*
		 * The group to create is root group!
		 * TODO: find better error code?
		 */
		return ECGFAIL;
	}

1850 1851
	cgroup_dbg("parent is %s\n", parent);
	parent_cgroup = cgroup_new_cgroup(parent);
Jon Bernard's avatar
Jon Bernard committed
1852 1853
	if (!parent_cgroup) {
		ret = ECGFAIL;
1854
		goto err_nomem;
Jon Bernard's avatar
Jon Bernard committed
1855
	}
1856

Jon Bernard's avatar
Jon Bernard committed
1857 1858
	if (cgroup_get_cgroup(parent_cgroup)) {
		ret = ECGFAIL;
1859
		goto err_parent;
Jon Bernard's avatar
Jon Bernard committed
1860
	}
1861 1862 1863

	cgroup_dbg("got parent group for %s\n", parent_cgroup->name);
	ret = cgroup_copy_cgroup(cgroup, parent_cgroup);
Jon Bernard's avatar
Jon Bernard committed
1864
	if (ret) {
1865
		goto err_parent;
Jon Bernard's avatar
Jon Bernard committed
1866
	}
1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878

	cgroup_dbg("copied parent group %s to %s\n", parent_cgroup->name,
							cgroup->name);
	ret = cgroup_create_cgroup(cgroup, ignore_ownership);

err_parent:
	cgroup_free(&parent_cgroup);
err_nomem:
	free(parent);
	return ret;
}

1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931
/**
 * Copy every task ID found in one tasks file into another.
 * @param input_tasks Pre-opened file to read tasks from.
 * @param output_tasks Pre-opened file to write tasks to.
 * @return 0 once the input yields no further number, ECGOTHER (with
 *	last_errno set) when writing or flushing the output fails.
 */
static int cg_move_task_files(FILE *input_tasks, FILE *output_tasks)
{
	int tid;

	while (!feof(input_tasks)) {
		/* End of input or unparsable data both stop the copy. */
		if (fscanf(input_tasks, "%d", &tid) < 1)
			return 0;

		/*
		 * Flush after every number: only one process may be
		 * handed over per write() call.
		 */
		if (fprintf(output_tasks, "%d", tid) < 0 ||
				fflush(output_tasks) < 0) {
			last_errno = errno;
			return ECGOTHER;
		}
	}

	return 0;
}

/**
 * Remove one cgroup from specific controller. The function  moves all
 * processes from it to given target group.
 *
 * The function succeeds if the group to remove is already removed - when
 * cgroup_delete_cgroup is called with group with two controllers mounted
 * to the same hierarchy, this function is called once for each of these
 * controllers. And during the second call the group is already removed...
 *
 * @param cgroup_name Name of the group to remove.
 * @param controller  Name of the controller.
 * @param target_tasks Opened tasks file of the target group, where all
 *	processes should be moved.
 * @param flags Flag indicating whether the errors from task
1932
 *	migration should be ignored (CGROUP_DELETE_IGNORE_MIGRATION) or not (0).
1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943
 * @returns 0 on success, >0 on error.
 */
static int cg_delete_cgroup_controller(char *cgroup_name, char *controller,
		FILE *target_tasks, int flags)
{
	FILE *delete_tasks;
	char path[FILENAME_MAX];
	int ret = 0;

	cgroup_dbg("Removing group %s:%s\n", controller, cgroup_name);

Jon Bernard's avatar
Jon Bernard committed
1944
	if (!(flags & CGFLAG_DELETE_EMPTY_ONLY)) {
1945
		/*
Jon Bernard's avatar
Jon Bernard committed
1946
		 * Open tasks file of the group to delete.
1947
		 */
Jon Bernard's avatar
Jon Bernard committed
1948 1949 1950 1951 1952 1953 1954
		if (!cg_build_path(cgroup_name, path, controller))
			return ECGROUPSUBSYSNOTMOUNTED;
		strncat(path, "tasks", sizeof(path) - strlen(path));

		delete_tasks = fopen(path, "re");
		if (delete_tasks) {
			ret = cg_move_task_files(delete_tasks, target_tasks);
1955 1956 1957
			if (ret != 0)
				cgroup_warn("Warning: removing tasks from %s failed: %s\n",
						path, cgroup_strerror(ret));
Jon Bernard's avatar
Jon Bernard committed
1958 1959 1960 1961 1962 1963 1964 1965
			fclose(delete_tasks);
		} else {
			/*
			 * Can't open the tasks file. If the file does not
			 * exist, ignore it - the group has been already
			 * removed.
			 */
			if (errno != ENOENT) {
1966 1967
				cgroup_err("Error: cannot open %s: %s\n",
						path, strerror(errno));
Jon Bernard's avatar
Jon Bernard committed
1968 1969 1970
				last_errno = errno;
				ret = ECGOTHER;
			}
1971 1972
		}

Jon Bernard's avatar
Jon Bernard committed
1973 1974 1975
		if (ret != 0 && !(flags & CGFLAG_DELETE_IGNORE_MIGRATION))
			return ret;
	}
1976 1977 1978 1979 1980 1981 1982 1983

	/*
	 * Remove the group.
	 */
	if (!cg_build_path(cgroup_name, path, controller))
		return ECGROUPSUBSYSNOTMOUNTED;

	ret = rmdir(path);
Jon Bernard's avatar
Jon Bernard committed
1984 1985
	if (ret == 0 || errno == ENOENT)
		return 0;
1986