Hi. I have rewritten the jail limit patch for CURRENT. The new patch limits CPU, memory, open files, and processes per jail, and allows the limits to be changed on the fly. Any questions? My email: kostjn@peterhost.ru The CPU limit is soft. The implementation is simple: if a jail exceeds its limit, threads running in that jail are passed over by the scheduler. Limits are specified as a percentage of one processor core. The primary goal is to guarantee that each jail receives the processor time specified in its limit. Of course, this is only possible if the sum of the percentages for all jails running on one processor core does not exceed 100%. The memory limit is hard. The primary goal is to prevent a jail from consuming more memory than its limit specifies. If a jail exceeds the limit, it is forbidden to make the fork, mmap, and obreak system calls. The limits on the number of processes and open files are also hard. =========================================================================== How to use. =========================================================================== Build #cat ~/supfile *default date=2009.05.25.00.00.00 *default host=cvsup6.ru.FreeBSD.org *default prefix=/usr *default base=/var/db *default release=cvs delete use-rel-suffix compress src-all #cvsup ~/supfile #cd /usr/src #patch -p0 < patch-jail-limit-8CURRENT #make buildkernel #make buildworld #make installkernel #shutdown -r now #make installworld Create a new entry in login.conf, for example the class jail128: jail128:\ :cputime=10:\ :memoryuse=128M:\ :maxproc=256:\ :openfiles=1024:\ :tc=default: cputime is a percentage of one core. openfiles is the total number of file descriptors across all processes in the jail. Create a new jail. ... 
Add to /etc/rc.conf: jail_test_flags="-Ljail128" Run the new jail: /etc/rc.d/jail start test =========================================================================== Sysctl =========================================================================== [root@book ~/jail-limit]# sysctl security.jail.limit security.jail.limit.enable: 1 security.jail.limit.kill: 0 [root@book ~/jail-limit]# sysctl -d security.jail.limit security.jail.limit: Jail limit security.jail.limit.enable: Enable jail limit security.jail.limit.kill: Allow kill proc, if jail exceed memory limit =========================================================================== Jset and Jget =========================================================================== jset and jget are programs for setting new jail limits and getting the current limits. Example [root@book ~]# cat /etc/rc.conf | grep jail2 jail_list="jail1 jail2 jail3 jail4 jail5 jail6 jail7 jail8 jail9 jail10" jail_jail2_rootdir="/usr/jails/jail2/" jail_jail2_hostname="jail2.book.pht" jail_jail2_interface="re0" jail_jail2_ip="192.168.200.22" jail_jail2_flags="-Ljail64" [root@book ~]# /etc/rc.d/jail start jail2 Configuring jails:. Starting jails: jail2.book.pht. [root@book ~]# cd ~kostjn/ [root@book /home/kostjn]# ./jget.o 1 Jail limits and rusage, jid = 1 Limits: CPU 5, MEM 64M, NPROC 128, NOFILE 512 Usage: CPU 0, MEM 6M, NPROC 9, NOFILE 65 [root@book /home/kostjn]# ./jset.o 1 jail2048 Set new jail limits, jid = 1 Limits: CPU 30, MEM 2048M, NPROC 1024, NOFILE 2048 [root@book /home/kostjn]# ./jget.o 1 Jail limits and rusage, jid = 1 Limits: CPU 30, MEM 2048M, NPROC 1024, NOFILE 2048 Usage: CPU 0, MEM 6M, NPROC 9, NOFILE 65 You can see that the new limit is set. 
=========================================================================== Test =========================================================================== Cpu limit <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< Script [root@book /home/kostjn]# cat test.sh #!/bin/sh for i in `jot 8 1`; do cpuset -l0 jexec 1 /a.out & done for i in `jot 8 1`; do cpuset -l0 jexec 2 /a.out & done for i in `jot 8 1`; do cpuset -l0 jexec 3 /a.out & done for i in `jot 8 1`; do cpuset -l0 jexec 4 /a.out & done for i in `jot 8 1`; do cpuset -l0 jexec 5 /a.out & done for i in `jot 8 1`; do cpuset -l0 jexec 6 /a.out & done for i in `jot 8 1`; do cpuset -l0 jexec 7 /a.out & done for i in `jot 8 1`; do cpuset -l0 jexec 8 /a.out & done for i in `jot 8 1`; do cpuset -l0 jexec 9 /a.out & done cpuset -l0 jexec 10 /a.out & Set class for all jail. [root@book /home/kostjn]# for i in `jot 10 1`; do ./jset.o $i jail128 ;done Set new jail limits, jid = 1 Limits: CPU 10, MEM 128M, NPROC 256, NOFILE 1024 Set new jail limits, jid = 2 Limits: CPU 10, MEM 128M, NPROC 256, NOFILE 1024 Set new jail limits, jid = 3 Limits: CPU 10, MEM 128M, NPROC 256, NOFILE 1024 Set new jail limits, jid = 4 Limits: CPU 10, MEM 128M, NPROC 256, NOFILE 1024 Set new jail limits, jid = 5 Limits: CPU 10, MEM 128M, NPROC 256, NOFILE 1024 Set new jail limits, jid = 6 Limits: CPU 10, MEM 128M, NPROC 256, NOFILE 1024 Set new jail limits, jid = 7 Limits: CPU 10, MEM 128M, NPROC 256, NOFILE 1024 Set new jail limits, jid = 8 Limits: CPU 10, MEM 128M, NPROC 256, NOFILE 1024 Set new jail limits, jid = 9 Limits: CPU 10, MEM 128M, NPROC 256, NOFILE 1024 Set new jail limits, jid = 10 Limits: CPU 10, MEM 128M, NPROC 256, NOFILE 1024 [root@book /home/kostjn]# jexec 1 bash [root@jail1 /]# cat cpu.c #include #include #include #include int main(int argc,char *argv[]){ int64_t i,j=0; char *s; for (;;){ } } Run test.sh Result top last pid: 3513; load averages: 70.87, 37.58, 16.40 up 0+00:44:02 14:19:46 185 processes: 74 
running, 111 sleeping CPU: 49.9% user, 0.0% nice, 0.0% system, 0.2% interrupt, 49.9% idle Mem: 139M Active, 24M Inact, 47M Wired, 192K Cache, 29M Buf, 1785M Free Swap: 4044M Total, 4044M Free PID JID USERNAME THR PRI NICE SIZE RES STATE C TIME WCPU COMMAN 3502 10 root 1 97 0 1480K 1244K CPU0 0 0:13 8.79% a.out 3474 6 root 1 97 0 1480K 1244K RUN 0 0:04 4.69% a.out 3431 2 root 1 96 0 1480K 1244K RUN 0 0:03 4.30% a.out 3454 4 root 1 97 0 1480K 1244K RUN 0 0:03 4.05% a.out 3422 1 root 1 96 0 1480K 1244K RUN 0 0:04 3.86% a.out 3482 7 root 1 97 0 1480K 1244K RUN 0 0:03 3.86% a.out 3447 3 root 1 97 0 1480K 1244K RUN 0 0:03 3.86% a.out 3429 1 root 1 96 0 1480K 1244K RUN 0 0:03 3.66% a.out 3485 8 root 1 97 0 1480K 1244K RUN 0 0:05 3.56% a.out 3424 1 root 1 96 0 1480K 1244K RUN 0 0:04 3.56% a.out 3464 5 root 1 97 0 1480K 1244K RUN 0 0:02 3.56% a.out 3438 2 root 1 96 0 1480K 1244K RUN 0 0:03 3.47% a.out 3494 9 root 1 96 0 1480K 1244K RUN 0 0:03 3.27% a.out 3497 9 root 1 97 0 1480K 1244K RUN 0 0:05 3.17% a.out 3433 2 root 1 96 0 1480K 1244K RUN 0 0:03 2.88% a.out 3428 1 root 1 96 0 1480K 1244K RUN 0 0:02 2.88% a.out 3487 8 root 1 97 0 1480K 1244K RUN 0 0:04 2.78% a.out ps auxwwww -ojid | more root 3502 9.0 0.1 1480 1244 v2 RJ 2:15PM 0:07.40 /a.out 10 root 3476 4.4 0.1 1480 1244 v2 RJ 2:15PM 0:04.38 /a.out 7 root 3480 4.1 0.1 1480 1244 v2 RJ 2:15PM 0:03.02 /a.out 7 root 3498 3.9 0.1 1480 1244 v2 RJ 2:15PM 0:04.00 /a.out 9 root 3429 3.7 0.1 1480 1244 v2 RJ 2:15PM 0:01.38 /a.out 1 root 3487 3.6 0.1 1480 1244 v2 RJ 2:15PM 0:03.32 /a.out 8 root 3452 3.5 0.1 1480 1244 v2 RJ 2:15PM 0:01.37 /a.out 4 root 3463 3.5 0.1 1480 1244 v2 RJ 2:15PM 0:01.65 /a.out 5 root 3472 3.3 0.1 1480 1244 v2 RJ 2:15PM 0:02.63 /a.out 6 root 3437 3.2 0.1 1480 1244 v2 RJ 2:15PM 0:01.93 /a.out 2 root 3494 3.0 0.1 1480 1244 v2 RJ 2:15PM 0:02.92 /a.out 9 root 3500 3.0 0.1 1480 1244 v2 RJ 2:15PM 0:03.63 /a.out 9 We see that jail 10 (1 thread), used ~10 % cpu under heavy load. 
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< Resource usage computation [root@book /home/kostjn]# ./jset.o 1 jail64 Set new jail limits, jid = 1 Limits: CPU 5, MEM 64M, NPROC 128, NOFILE 512 [root@book /home/kostjn]# ./jget.o 1 Jail limits and rusage, jid = 1 Limits: CPU 5, MEM 64M, NPROC 128, NOFILE 512 Usage: CPU 0, MEM 6M, NPROC 9, NOFILE 65 [root@book /home/kostjn]# [root@book /home/kostjn]# jexec 1 bash [root@jail1 /]# apachectl stop /usr/local/sbin/apachectl stop: httpd stopped [root@jail1 /]# exit [root@book /home/kostjn]# ./jget.o 1 Jail limits and rusage, jid = 1 Limits: CPU 5, MEM 64M, NPROC 128, NOFILE 512 Usage: CPU 0, MEM 3M, NPROC 3, NOFILE 24 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< Resource limit [root@jail1 /]# cat mem.c #include #include #include #include #define GB 1000000000 #define MB 1000000 #define KB 1000 int main(int argc,char *argv[]){ int64_t i,j=0; char *s; for(;;){ s = malloc(10 * MB); } sleep(100); } [root@book /usr/home/kostjn]# ./jget.o 1 Jail limits and rusage, jid = 1 Limits: CPU 6, MEM 65536K, NPROC 128, NOFILE 512 Usage: CPU 0, MEM 17305K, NPROC 66, NOFILE 114 [root@jail1 /]# cc -o mem.o mem.c && ./mem.o & [1] 14083 [root@jail1 /]# sleep 1 bash: fork: Cannot allocate memory [root@book /usr/home/kostjn]# ./jget.o 1 Jail limits and rusage, jid = 1 Limits: CPU 6, MEM 65536K, NPROC 128, NOFILE 512 Usage: CPU 109, MEM 69140K, NPROC 68, NOFILE 114 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< We see that the jail has exceeded its memory limit, and new fork and mmap system calls are not permitted. If you set sysctl security.jail.limit.kill=1, a process that exceeds the memory limit will be killed. 
[root@jail1 /]# cc -o mem.o mem.c && ./mem.o & [1] 14099 [root@book /usr/home/kostjn]# tail -f -n1 /var/log/messages Jun 2 12:45:42 book kernel: pid 14104 (mem.o), uid 0, jid 1 was killed: Prison exceed memory limit <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< If you attempt to set a nonexistent class, the limits are set to infinity. [root@book /home/kostjn]# ./jset.o 1 jail123 Set new jail limits, jid = 1 Limits: CPU 9223372036854775807, MEM 20M, NPROC 9223372036854775807, NOFILE 9223372036854775807 =========================================================================== Problem =========================================================================== If you have a problem with this patch, add the following to your kernel config: options KTR options KTR_ENTRIES=1024 options KTR_COMPILE=(KTR_PROC|KTR_JAIL|KTR_SCHED|KTR_RUNQ|KTR_LOCK|KTR_CONTENTION) options KTR_MASK=KTR_JAIL options KTR_CPUMASK=0x3 options KTR_VERBOSE options PRINTF_BUFR_SIZE=128 Rebuild the kernel. Reboot. Set the sysctl: sysctl debug.ktr.mask=65536 and check /var/log/messages Patch diff -w -b -B -r -U3 /usr/src/sys/kern/kern_clock.c /usr/src.new8/sys/kern/kern_clock.c --- /usr/src/sys/kern/kern_clock.c 2009-05-18 12:03:43.000000000 +0000 +++ /usr/src.new8/sys/kern/kern_clock.c 2009-06-02 09:00:48.000000000 +0000 @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -502,6 +503,7 @@ "prio:%d", td->td_priority, "stathz:%d", (stathz)?stathz:hz); thread_lock_flags(td, MTX_QUIET); sched_clock(td); + prison_rusage_update_nolock(td, JRL_CPU, 1, "statclock"); thread_unlock(td); } diff -w -b -B -r -U3 /usr/src/sys/kern/kern_descrip.c /usr/src.new8/sys/kern/kern_descrip.c --- /usr/src/sys/kern/kern_descrip.c 2009-05-20 18:42:04.000000000 +0000 +++ /usr/src.new8/sys/kern/kern_descrip.c 2009-06-02 09:00:48.000000000 +0000 @@ -1453,7 +1453,13 @@ uma_zfree(file_zone, fp); return (ENFILE); } + + error = prison_rusage_check(td, JRL_FILE, 1, "falloc"); + if (error) + return (error); + 
atomic_add_int(&openfiles, 1); + prison_rusage_update(td, JRL_FILE, 1, "falloc"); /* * If the process has file descriptor zero open, add the new file @@ -2268,6 +2274,7 @@ if (fp->f_cdevpriv != NULL) devfs_fpdrop(fp); atomic_subtract_int(&openfiles, 1); + prison_rusage_update(td, JRL_FILE, -1, "fdrop"); crfree(fp->f_cred); uma_zfree(file_zone, fp); diff -w -b -B -r -U3 /usr/src/sys/kern/kern_exit.c /usr/src.new8/sys/kern/kern_exit.c --- /usr/src/sys/kern/kern_exit.c 2009-05-08 14:11:06.000000000 +0000 +++ /usr/src.new8/sys/kern/kern_exit.c 2009-06-02 09:00:48.000000000 +0000 @@ -457,7 +457,7 @@ /* In case we are jailed tell the prison that we are gone. */ if (jailed(p->p_ucred)) - prison_proc_free(p->p_ucred->cr_prison); + prison_proc_free(p); #ifdef KDTRACE_HOOKS /* @@ -550,6 +550,7 @@ p->p_state = PRS_ZOMBIE; PROC_UNLOCK(p->p_pptr); + prison_rusage_update(td, JRL_PROC, -1, "exit1"); /* * Hopefully no one will try to deliver a signal to the process this * late in the game. diff -w -b -B -r -U3 /usr/src/sys/kern/kern_fork.c /usr/src.new8/sys/kern/kern_fork.c --- /usr/src/sys/kern/kern_fork.c 2009-05-08 14:11:06.000000000 +0000 +++ /usr/src.new8/sys/kern/kern_fork.c 2009-06-02 09:00:48.000000000 +0000 @@ -222,6 +222,12 @@ return (EINVAL); p1 = td->td_proc; + error = prison_rusage_check(td, JRL_PROC, 1, "fork1"); + if (error) + return (error); + error = prison_rusage_check(td, JRL_MEM, 0, "fork1"); + if (error) + return (error); /* * Here we don't create a new process, but we divorce @@ -460,7 +466,7 @@ /* In case we are jailed tell the prison that we exist. */ if (jailed(p2->p_ucred)) - prison_proc_hold(p2->p_ucred->cr_prison); + prison_proc_hold(p2); PROC_UNLOCK(p2); @@ -766,6 +772,7 @@ cv_wait(&p2->p_pwait, &p2->p_mtx); PROC_UNLOCK(p2); + prison_rusage_update(td, JRL_PROC, 1, "fork1"); /* * Return child proc pointer to parent. 
*/ diff -w -b -B -r -U3 /usr/src/sys/kern/kern_jail.c /usr/src.new8/sys/kern/kern_jail.c --- /usr/src/sys/kern/kern_jail.c 2009-05-23 16:13:26.000000000 +0000 +++ /usr/src.new8/sys/kern/kern_jail.c 2009-06-02 09:00:48.000000000 +0000 @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +59,8 @@ #include #include #include +#include +#include #include #include #ifdef DDB @@ -114,6 +117,27 @@ &jail_max_af_ips, 0, "Number of IP addresses a jail may have at most per address family"); +SYSCTL_NODE(_security_jail, OID_AUTO, limit, CTLFLAG_RW, + 0, "Jail limit"); +int jlimit_mem_exceed_kill = 0; +SYSCTL_INT(_security_jail_limit, OID_AUTO, kill, + CTLFLAG_RW, &jlimit_mem_exceed_kill, 0, + "Allow kill proc, if jail exceed memory limit"); +int jlimit_enable = 1; +SYSCTL_INT(_security_jail_limit, OID_AUTO, enable, CTLFLAG_RW, + &jlimit_enable, 0, "Enable jail limit"); + +/* Kernel proc for compute jail resourse usage */ +static void j_daemon(void); +static struct proc *j_proc; +static struct kproc_desc j_kp = { + "jdaemon", + j_daemon, + &j_proc +}; + +SYSINIT(jdaemon, SI_SUB_KTHREAD_JAIL, SI_ORDER_FIRST, kproc_start, &j_kp); + /* allprison, lastprid, and prisoncount are protected by allprison_lock. 
*/ struct sx allprison_lock; SX_SYSINIT(allprison_lock, &allprison_lock, "allprison"); @@ -204,6 +228,7 @@ #ifdef INET6 struct in6_addr *u_ip6; #endif + rlim_t u_rlimit[JRL_NLIM]; uint32_t version; int error; @@ -371,6 +396,14 @@ } opt.uio_iovcnt++; #endif + /* Prison resource limits */ + optiov[opt.uio_iovcnt].iov_base = "rlimit"; + optiov[opt.uio_iovcnt].iov_len = sizeof("rlimit"); + opt.uio_iovcnt++; + optiov[opt.uio_iovcnt].iov_base = u_rlimit; + optiov[opt.uio_iovcnt].iov_len = sizeof(u_rlimit); + bcopy(j.rlimit, u_rlimit, sizeof(u_rlimit)); + opt.uio_iovcnt++; break; } @@ -418,6 +451,8 @@ #ifdef INET6 struct in6_addr *ip6; #endif + rlim_t *rlimit; + int set_rlimit = 0; struct vfsopt *opt; struct vfsoptlist *opts; struct prison *pr, *deadpr, *tpr; @@ -651,6 +686,20 @@ VFS_UNLOCK_GIANT(vfslocked); } } + rlimit = malloc(sizeof(rlim_t) * JRL_NLIM, M_PRISON, M_WAITOK); + error = vfs_getopt(opts, "rlimit", (void **)&rlimit, &len); + + if (error == 0 && len == sizeof(rlim_t) * JRL_NLIM){ + set_rlimit = 1; + } else if (error == ENOENT){ + /* Limits required at jail create */ + if ((flags & JAIL_CREATE) == JAIL_CREATE) + goto done_free; + } else if (len != sizeof(rlim_t) * JRL_NLIM){ + error = EINVAL; + goto done_free; + } else if (error) + goto done_free; /* * Grab the allprison lock before letting modules check their @@ -849,6 +898,7 @@ if (tpr == NULL) TAILQ_INSERT_TAIL(&allprison, pr, pr_list); prisoncount++; + LIST_INIT(&(pr->pr_proc_list)); pr->pr_id = jid; if (name == NULL) @@ -984,6 +1034,16 @@ } if (host != NULL) strlcpy(pr->pr_host, host, sizeof(pr->pr_host)); + + /* Set limits only if they reseived. */ + if (set_rlimit == 1){ + mtx_lock(&pr->pr_mtx); + bcopy(rlimit, pr->pr_rlimit, sizeof(rlim_t) * JRL_NLIM); + /* Cpu limit passed in percent, but real limit in stahz ticks. */ + pr->pr_rlimit[JRL_CPU] = + rlimit[JRL_CPU] * (stathz ? 
stathz : hz) / 100; + mtx_unlock(&pr->pr_mtx); + } /* * Persistent prisons get an extra reference, and prisons losing their * persist flag lose that reference. Only do this for existing prisons @@ -1298,6 +1358,15 @@ error = vfs_setopts(opts, "path", pr->pr_path); if (error != 0 && error != ENOENT) goto done_deref; + error = vfs_setopt(opts, "rlimit", pr->pr_rlimit, + sizeof(rlim_t) * JRL_NLIM); + if (error != 0 && error != ENOENT) + goto done_deref; + error = vfs_setopt(opts, "rusage", pr->pr_rusage, + sizeof(rlim_t) * JRL_NLIM); + if (error != 0 && error != ENOENT) + goto done_deref; + #ifdef INET error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4, pr->pr_ip4s * sizeof(*pr->pr_ip4)); @@ -1511,8 +1580,10 @@ * a process root from one prison, but attached to the jail * of another. */ + p = td->td_proc; pr->pr_ref++; pr->pr_uref++; + LIST_INSERT_HEAD(&(pr->pr_proc_list), p, pj_list); mtx_unlock(&pr->pr_mtx); /* Let modules do whatever they need to prepare for attaching. */ @@ -1526,7 +1597,6 @@ /* * Reparent the newly attached process to this jail. 
*/ - p = td->td_proc; error = cpuset_setproc_update_set(p, pr->pr_cpuset); if (error) goto e_revert_osd; @@ -1553,6 +1623,8 @@ p->p_ucred = newcred; PROC_UNLOCK(p); crfree(oldcred); + prison_rusage_update(td, JRL_PROC, 1, "jail_attach"); + prison_rusage_update(td, JRL_MEM, vmspace_usage(p), "jail_attach"); return (0); e_unlock: VOP_UNLOCK(pr->pr_root, 0); @@ -1711,6 +1783,7 @@ VFS_UNLOCK_GIANT(vfslocked); } mtx_destroy(&pr->pr_mtx); + free(pr->pr_linux, M_PRISON); #ifdef INET free(pr->pr_ip4, M_PRISON); #endif @@ -1743,23 +1816,27 @@ } void -prison_proc_hold(struct prison *pr) +prison_proc_hold(struct proc *p) { - + struct prison *pr; + pr = p->p_ucred->cr_prison; mtx_lock(&pr->pr_mtx); KASSERT(pr->pr_uref > 0, ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id)); pr->pr_uref++; + LIST_INSERT_HEAD(&(pr->pr_proc_list), p, pj_list); mtx_unlock(&pr->pr_mtx); } void -prison_proc_free(struct prison *pr) +prison_proc_free(struct proc *p) { - + struct prison *pr; + pr = p->p_ucred->cr_prison; mtx_lock(&pr->pr_mtx); KASSERT(pr->pr_uref > 0, ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id)); + LIST_REMOVE(p, pj_list); prison_deref(pr, PD_DEUREF | PD_LOCKED); } @@ -2527,6 +2604,121 @@ return (EPERM); } } +/* + * Daemon every second compute resource usage for all prison in system. + */ +static void +j_daemon(void) +{ + struct prison *pr; + for (;;){ + sx_xlock(&allprison_lock); + TAILQ_FOREACH(pr, &allprison, pr_list) { + prison_rusage_compute(pr); + } + sx_xunlock(&allprison_lock); + tsleep(j_proc, 0, "-", hz); + } +} +/* + * Check prison resourse usage. 
+ */ +int +prison_rusage_check(struct thread *td, int type, int diff, char *where) +{ + struct prison *pr; + if (td->td_proc->p_ucred == NULL || + !jailed(td->td_proc->p_ucred) || + jlimit_enable == 0) + return (0); + pr = td->td_proc->p_ucred->cr_prison; + + if (pr->pr_rusage[type] + diff >= pr->pr_rlimit[type]){/* EXCEED */ + switch (type){ + case JRL_CPU: + return EBUSY; + case JRL_MEM: + if (jlimit_mem_exceed_kill != 0) + killproc(td->td_proc, "Prison exceed memory limit"); + CTR4(KTR_JAIL, "exceed MEM in %s: pr=%u, lim=%lu, usage=%lu", + where, pr->pr_id, pr->pr_rlimit[type], pr->pr_rusage[type]); + return ENOMEM; + case JRL_PROC: + CTR4(KTR_JAIL, "exceed PROC in %s: pr=%u, lim=%lu, usage=%lu", + where, pr->pr_id, pr->pr_rlimit[type], pr->pr_rusage[type]); + return EPROCLIM; + case JRL_FILE: + CTR4(KTR_JAIL, "exceed FD in %s: pr=%u, lim=%lu, usage=%lu", + where, pr->pr_id, pr->pr_rlimit[type], pr->pr_rusage[type]); + return EMFILE; + /* Not implemented. */ + return (0); + } + } + return (0); +} +/* + * Used in functions which cannot be blocked. + */ +void inline +prison_rusage_update_nolock(struct thread *td, int type, + int diff, char *where) +{ + struct prison *pr; + if (td->td_proc->p_ucred == NULL || + !jailed(td->td_proc->p_ucred) || + jlimit_enable == 0) + return; + pr = td->td_proc->p_ucred->cr_prison; + if (mtx_trylock(&pr->pr_mtx) !=0 ){ + pr->pr_rusage[type] +=diff; + mtx_unlock(&pr->pr_mtx); + } +} +/* + * Update prison resourse usage. + */ +void +prison_rusage_update(struct thread *td, int type, + int diff, char *where) +{ + struct prison *pr; + if (td->td_proc->p_ucred == NULL || + !jailed(td->td_proc->p_ucred) || + jlimit_enable == 0) + return; + pr = td->td_proc->p_ucred->cr_prison; + mtx_lock(&pr->pr_mtx); + pr->pr_rusage[type] +=diff; + mtx_unlock(&pr->pr_mtx); + CTR4(KTR_JAIL, "Update %u in %s, diff %i, usage %li", + type, where, diff, pr->pr_rusage[type]); + if (pr->pr_rusage[type] < 0) + CTR4(KTR_JAIL, "ERROR! 
usage < 0, %u in %s, diff %i, usage %li", type, where, + diff, pr->pr_rusage[type]); +} +/* + * Recalculate prison memory usage. + */ +void +prison_rusage_compute(struct prison *pr) +{ + struct proc *p; + int reduce; + rlim_t pr_mem; + reduce = 2; + pr_mem = 0; + + if (jlimit_enable == 0) + return; + LIST_FOREACH(p, &(pr->pr_proc_list), pj_list){ + pr_mem += vmspace_usage(p); + } + mtx_lock(&pr->pr_mtx); + pr->pr_rusage[JRL_CPU] /= reduce; + pr->pr_rusage[JRL_MEM] = pr_mem; + mtx_unlock(&pr->pr_mtx); +} static int sysctl_jail_list(SYSCTL_HANDLER_ARGS) diff -w -b -B -r -U3 /usr/src/sys/kern/kern_sig.c /usr/src.new8/sys/kern/kern_sig.c --- /usr/src/sys/kern/kern_sig.c 2009-04-10 10:52:19.000000000 +0000 +++ /usr/src.new8/sys/kern/kern_sig.c 2009-06-02 09:00:48.000000000 +0000 @@ -57,6 +57,7 @@ #include #include #include +#include #include #include #include @@ -2672,6 +2673,11 @@ PROC_LOCK_ASSERT(p, MA_OWNED); CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)", p, p->p_pid, p->p_comm); + if (jailed(p->p_ucred)) + log(LOG_ERR, "pid %d (%s), uid %d, jid %d was killed: %s\n", p->p_pid, p->p_comm, + p->p_ucred ? p->p_ucred->cr_uid : -1, + p->p_ucred->cr_prison->pr_id, why); + else log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid, p->p_comm, p->p_ucred ? 
p->p_ucred->cr_uid : -1, why); psignal(p, SIGKILL); diff -w -b -B -r -U3 /usr/src/sys/kern/sched_ule.c /usr/src.new8/sys/kern/sched_ule.c --- /usr/src/sys/kern/sched_ule.c 2009-04-29 23:04:31.000000000 +0000 +++ /usr/src.new8/sys/kern/sched_ule.c 2009-06-02 09:00:48.000000000 +0000 @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -2229,14 +2230,30 @@ { struct thread *td; struct tdq *tdq; + int skip = 0; tdq = TDQ_SELF(); TDQ_LOCK_ASSERT(tdq, MA_OWNED); +retry: td = tdq_choose(tdq); if (td) { td->td_sched->ts_ltick = ticks; tdq_runq_rem(tdq, td); tdq->tdq_lowpri = td->td_priority; + skip++; + /* + * We skip jailed thread, that exceed limit, if no more + * thread in tdq_timeshare, allow run this thread. + */ + if (jailed(td->td_ucred) && skip < tdq->tdq_load && + prison_rusage_check(td, JRL_CPU, 0, "sched_ule") != 0){ + + if (tdq->tdq_idx == tdq->tdq_ridx) + tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS; + + tdq_runq_add(tdq, td, 0); + goto retry; + } return (td); } tdq->tdq_lowpri = PRI_MAX_IDLE; diff -w -b -B -r -U3 /usr/src/sys/sys/jail.h /usr/src.new8/sys/sys/jail.h --- /usr/src/sys/sys/jail.h 2009-05-07 18:36:47.000000000 +0000 +++ /usr/src.new8/sys/sys/jail.h 2009-06-02 09:01:06.000000000 +0000 @@ -24,12 +24,13 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: src/sys/sys/jail.h,v 1.41 2009/05/07 18:36:47 jamie Exp $ + * $FreeBSD: src/sys/sys/jail.h,v 1.40 2009/05/05 05:49:08 jamie Exp $ */ #ifndef _SYS_JAIL_H_ #define _SYS_JAIL_H_ +#include #ifdef _KERNEL struct jail_v0 { u_int32_t version; @@ -44,6 +45,7 @@ char *path; char *hostname; char *jailname; + rlim_t rlimit[JRL_NLIM]; uint32_t ip4s; uint32_t ip6s; struct in_addr *ip4; @@ -139,6 +141,8 @@ struct cpuset; +LIST_HEAD(prison_proc_list, proc); + /* * This structure describes a prison. It is pointed to by all struct * ucreds's of the inmates. 
pr_ref keeps track of them and is used to @@ -153,6 +157,7 @@ */ struct prison { TAILQ_ENTRY(prison) pr_list; /* (a) all prisons */ + struct prison_proc_list pr_proc_list; /* (p) list proc in prison */ int pr_id; /* (c) prison id */ int pr_ref; /* (p) refcount */ int pr_uref; /* (p) user (alive) refcount */ @@ -162,7 +167,9 @@ struct vnode *pr_root; /* (c) vnode to rdir */ char pr_host[MAXHOSTNAMELEN]; /* (p) jail hostname */ char pr_name[MAXHOSTNAMELEN]; /* (p) admin jail name */ - void *pr_spare; /* was pr_linux */ + rlim_t pr_rlimit[JRL_NLIM]; /* (p) resource limit */ + rlim_t pr_rusage[JRL_NLIM]; /* (p) resource usage */ + void *pr_linux; /* (p) linux abi */ int pr_securelevel; /* (p) securelevel */ struct task pr_task; /* (d) destroy task */ struct mtx pr_mtx; @@ -245,8 +252,8 @@ void prison_free_locked(struct prison *pr); void prison_hold(struct prison *pr); void prison_hold_locked(struct prison *pr); -void prison_proc_hold(struct prison *); -void prison_proc_free(struct prison *); +void prison_proc_hold(struct proc *p); +void prison_proc_free(struct proc *p); int prison_get_ip4(struct ucred *cred, struct in_addr *ia); int prison_local_ip4(struct ucred *cred, struct in_addr *ia); int prison_remote_ip4(struct ucred *cred, struct in_addr *ia); @@ -261,6 +268,14 @@ int prison_if(struct ucred *cred, struct sockaddr *sa); int prison_priv_check(struct ucred *cred, int priv); int sysctl_jail_param(struct sysctl_oid *, void *, int , struct sysctl_req *); +int prison_rusage_check(struct thread *td, int type, + int diff, char *where); +void prison_rusage_update_nolock(struct thread *td, int type, + int diff, char *where); +void prison_rusage_update(struct thread *td, int type, + int diff, char *where); +void prison_rusage_compute(struct prison *pr); + #endif /* _KERNEL */ #endif /* !_SYS_JAIL_H_ */ diff -w -b -B -r -U3 /usr/src/sys/sys/kernel.h /usr/src.new8/sys/sys/kernel.h --- /usr/src/sys/sys/kernel.h 2009-05-08 14:11:06.000000000 +0000 +++ 
/usr/src.new8/sys/sys/kernel.h 2009-06-02 09:01:06.000000000 +0000 @@ -172,6 +172,7 @@ SI_SUB_KTHREAD_BUF = 0xea00000, /* buffer daemon*/ SI_SUB_KTHREAD_UPDATE = 0xec00000, /* update daemon*/ SI_SUB_KTHREAD_IDLE = 0xee00000, /* idle procs*/ + SI_SUB_KTHREAD_JAIL = 0xed00000, /* jdaemon compute prison rusage */ SI_SUB_SMP = 0xf000000, /* start the APs*/ SI_SUB_RUN_SCHEDULER = 0xfffffff /* scheduler*/ }; diff -w -b -B -r -U3 /usr/src/sys/sys/ktr.h /usr/src.new8/sys/sys/ktr.h --- /usr/src/sys/sys/ktr.h 2009-04-29 09:54:33.000000000 +0000 +++ /usr/src.new8/sys/sys/ktr.h 2009-06-02 09:01:06.000000000 +0000 @@ -61,6 +61,7 @@ #define KTR_INIT 0x00004000 /* System initialization */ #define KTR_SPARE3 0x00008000 /* XXX Used by cxgb */ #define KTR_SPARE4 0x00010000 /* XXX Used by cxgb */ +#define KTR_JAIL 0x00010000 /* Jails */ #define KTR_EVH 0x00020000 /* Eventhandler */ #define KTR_VFS 0x00040000 /* VFS events */ #define KTR_VOP 0x00080000 /* Auto-generated vop events */ diff -w -b -B -r -U3 /usr/src/sys/sys/proc.h /usr/src.new8/sys/sys/proc.h --- /usr/src/sys/sys/proc.h 2009-05-20 18:45:49.000000000 +0000 +++ /usr/src.new8/sys/sys/proc.h 2009-06-02 09:01:06.000000000 +0000 @@ -449,6 +449,7 @@ */ struct proc { LIST_ENTRY(proc) p_list; /* (d) List of all processes. */ + LIST_ENTRY(proc) pj_list; /* List all process in prison*/ TAILQ_HEAD(, thread) p_threads; /* (c) all threads. */ struct mtx p_slock; /* process spin lock */ struct ucred *p_ucred; /* (c) Process owner's identity. 
*/ diff -w -b -B -r -U3 /usr/src/sys/sys/resource.h /usr/src.new8/sys/sys/resource.h --- /usr/src/sys/sys/resource.h 2008-12-11 18:32:05.000000000 +0000 +++ /usr/src.new8/sys/sys/resource.h 2009-06-02 09:01:07.000000000 +0000 @@ -97,6 +97,36 @@ #define RLIM_NLIMITS 12 /* number of resource limits */ +/* + * Jail resource limits + */ +/* Main */ +#define JRL_CPU 0 /* cpu in percent (1 core) */ +#define JRL_MEM 1 /* memory usage */ +#define JRL_VMEM 2 /* virtual memory usage */ +#define JRL_PROC 3 /* number of processes */ +#define JRL_FILE 4 /* total number of open files (all type) */ + +/* Disk quota */ +#define JDQ_BLOCK 5 /* number of disk block */ +#define JDQ_INODE 6 /* number of inode */ + +/* Network */ +#define JRL_SBSIZE 8 /* size of all socket buffers */ + +/* Descriptor */ +#define JRL_VNODE 9 /* number of open files */ +#define JRL_SOCKET 10 /* number of open socket */ +#define JRL_PIPE 11 /* number of open pipe */ +#define JRL_FIFO 12 /* number of open fifo */ +#define JRL_KQUEUE 13 /* number of kqueue */ +#define JRL_MQUEUE 14 /* number of posix messages kqueue */ +#define JRL_SHM 15 /* number of open shm segment */ +#define JRL_SEM 16 /* number of posix semaphore */ +#define JRL_PTS 17 /* number of open files */ + +#define JRL_NLIM 18 /* number of jail resource limits */ + #define RLIM_INFINITY ((rlim_t)(((uint64_t)1 << 63) - 1)) /* XXX Missing: RLIM_SAVED_MAX, RLIM_SAVED_CUR */ diff -w -b -B -r -U3 /usr/src/sys/vm/vm_extern.h /usr/src.new8/sys/vm/vm_extern.h --- /usr/src/sys/vm/vm_extern.h 2009-04-01 04:36:37.000000000 +0000 +++ /usr/src.new8/sys/vm/vm_extern.h 2009-06-02 09:01:08.000000000 +0000 @@ -70,6 +70,7 @@ struct vmspace *vmspace_acquire_ref(struct proc *); void vmspace_free(struct vmspace *); void vmspace_exitfree(struct proc *); +int vmspace_usage(struct proc *); void vnode_pager_setsize(struct vnode *, vm_ooffset_t); int vslock(void *, size_t); void vsunlock(void *, size_t); diff -w -b -B -r -U3 /usr/src/sys/vm/vm_map.c 
/usr/src.new8/sys/vm/vm_map.c --- /usr/src/sys/vm/vm_map.c 2009-04-19 00:34:34.000000000 +0000 +++ /usr/src.new8/sys/vm/vm_map.c 2009-06-02 09:01:08.000000000 +0000 @@ -361,6 +361,40 @@ KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace")); vmspace_free(vm); } +/* + * Compute memory usage for process. We can`t simply usage RSS, + * because process can have share vm object. + */ +int +vmspace_usage(struct proc *p) +{ + int mem_used; + struct vmspace *vm; + vm_map_t map; + vm_map_entry_t entry; + vm_object_t object; + mem_used =0; + vm = p->p_vmspace; + if (vm == NULL) + return (0); + map = &vm->vm_map; + vm_map_lock_read(map); + for (entry = map->header.next; + entry != &map->header; entry = entry->next) { + if ((object = entry->object.vm_object) == NULL) + continue; + VM_OBJECT_LOCK(object); + if (object->type == OBJT_DEVICE || object->ref_count == 0){ + VM_OBJECT_UNLOCK(object); + continue; + } + mem_used += object->resident_page_count * PAGE_SIZE / + object->ref_count; + VM_OBJECT_UNLOCK(object); + } + vm_map_unlock_read(map); + return mem_used; +} void vmspace_exit(struct thread *td) diff -w -b -B -r -U3 /usr/src/sys/vm/vm_mmap.c /usr/src.new8/sys/vm/vm_mmap.c --- /usr/src/sys/vm/vm_mmap.c 2009-04-04 23:12:14.000000000 +0000 +++ /usr/src.new8/sys/vm/vm_mmap.c 2009-06-02 09:01:08.000000000 +0000 @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -241,7 +242,9 @@ flags |= MAP_ANON; pos = 0; } - + error = prison_rusage_check(td, JRL_MEM, 0, "mmap"); + if (error) + return (error); /* * Align the file position to a page boundary, * and save its page offset component. 
diff -w -b -B -r -U3 /usr/src/sys/vm/vm_page.c /usr/src.new8/sys/vm/vm_page.c --- /usr/src/sys/vm/vm_page.c 2009-05-13 05:39:39.000000000 +0000 +++ /usr/src.new8/sys/vm/vm_page.c 2009-06-02 09:01:08.000000000 +0000 @@ -110,6 +110,7 @@ #include #include #include +#include #include #include #include @@ -701,6 +702,11 @@ */ if (m->flags & PG_WRITEABLE) vm_object_set_writeable_dirty(object); + + prison_rusage_check(curthread, JRL_MEM, PAGE_SIZE, + "vm_page_insert"); + prison_rusage_update_nolock(curthread, JRL_MEM, PAGE_SIZE, + "vm_page_insert"); } /* @@ -756,6 +762,9 @@ vdrop((struct vnode *)object->handle); m->object = NULL; + + prison_rusage_update_nolock(curthread, JRL_MEM, -PAGE_SIZE, + "vm_page_remove"); } /* diff -w -b -B -r -U3 /usr/src/sys/vm/vm_unix.c /usr/src.new8/sys/vm/vm_unix.c --- /usr/src/sys/vm/vm_unix.c 2009-04-11 22:34:08.000000000 +0000 +++ /usr/src.new8/sys/vm/vm_unix.c 2009-06-02 09:01:08.000000000 +0000 @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -78,6 +79,10 @@ int error = 0; boolean_t do_map_wirefuture; + error = prison_rusage_check(td, JRL_MEM, 0, "obreak"); + if (error) + return (error); + PROC_LOCK(td->td_proc); datalim = lim_cur(td->td_proc, RLIMIT_DATA); vmemlim = lim_cur(td->td_proc, RLIMIT_VMEM); diff -w -b -B -r -U3 /usr/src/usr.sbin/jail/jail.c /usr/src.new8/usr.sbin/jail/jail.c --- /usr/src/usr.sbin/jail/jail.c 2009-04-29 16:02:52.000000000 +0000 +++ /usr/src.new8/usr.sbin/jail/jail.c 2009-06-02 08:59:49.000000000 +0000 @@ -105,13 +105,15 @@ long ltmp; FILE *fp; struct addrinfo hints, *res0; + login_cap_t *lc; + lc = NULL; hflag = iflag = Jflag = lflag = uflag = Uflag = 0; securelevel = -1; jailname = username = JidFile = cleanenv = NULL; fp = NULL; - while ((ch = getopt(argc, argv, "hiln:s:u:U:J:")) != -1) { + while ((ch = getopt(argc, argv, "hiln:s:u:U:J:L:")) != -1) { switch (ch) { case 'h': hflag = 1; @@ -143,6 +145,9 @@ case 'l': lflag = 1; break; + case 'L': + lc = login_getclass(optarg); 
+ break; default: usage(); } @@ -222,6 +227,25 @@ if (fp == NULL) errx(1, "Could not create JidFile: %s", JidFile); } + + j.rlimit[JRL_CPU] = + login_getcaptime(lc, "cputime", RLIM_INFINITY, RLIM_INFINITY); + + j.rlimit[JRL_MEM] = + login_getcapsize(lc, "memoryuse", RLIM_INFINITY, RLIM_INFINITY); + + j.rlimit[JRL_PROC] = + login_getcapnum(lc, "maxproc", RLIM_INFINITY, RLIM_INFINITY); + + j.rlimit[JRL_FILE] = + login_getcapnum(lc, "openfiles", RLIM_INFINITY, RLIM_INFINITY); + + //printf("Limits cpu %llu, mem %llu, proc %llu file %llu", + // j.rlimit[JRL_CPU], + // j.rlimit[JRL_MEM], + // j.rlimit[JRL_PROC], + // j.rlimit[JRL_FILE]); + i = jail(&j); if (i == -1) err(1, "syscall failed with"); @@ -281,9 +305,10 @@ usage(void) { - (void)fprintf(stderr, "%s%s%s\n", + (void)fprintf(stderr, "%s%s%s%s\n", "usage: jail [-hi] [-n jailname] [-J jid_file] ", "[-s securelevel] [-l -u username | -U username] ", + "[-L login class] ", "path hostname [ip[,..]] command ..."); exit(1); }