traefik/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go
2018-01-09 21:46:04 +01:00

1579 lines
42 KiB
Go

// +build linux
package libcontainer
import (
"bytes"
"encoding/json"
"fmt"
"io"
"io/ioutil"
"os"
"os/exec"
"path/filepath"
"reflect"
"strings"
"sync"
"syscall"
"time"
"github.com/Sirupsen/logrus"
"github.com/golang/protobuf/proto"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/criurpc"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/syndtr/gocapability/capability"
"github.com/vishvananda/netlink/nl"
)
const stdioFdCount = 3
type linuxContainer struct {
id string
root string
config *configs.Config
cgroupManager cgroups.Manager
initArgs []string
initProcess parentProcess
initProcessStartTime string
criuPath string
m sync.Mutex
criuVersion int
state containerState
created time.Time
}
// State represents a running container's state
type State struct {
BaseState
// Platform specific fields below here
// Specifies if the container was started under the rootless mode.
Rootless bool `json:"rootless"`
// Path to all the cgroups setup for a container. Key is cgroup subsystem name
// with the value as the path.
CgroupPaths map[string]string `json:"cgroup_paths"`
// NamespacePaths are filepaths to the container's namespaces. Key is the namespace type
// with the value as the path.
NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"`
// Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore
ExternalDescriptors []string `json:"external_descriptors,omitempty"`
}
// Container is a libcontainer container object.
//
// Each container is thread-safe within the same process. Since a container can
// be destroyed by a separate process, any function may return that the container
// was not found.
type Container interface {
BaseContainer
// Methods below here are platform specific
// Checkpoint checkpoints the running container's state to disk using the criu(8) utility.
//
// errors:
// Systemerror - System error.
Checkpoint(criuOpts *CriuOpts) error
// Restore restores the checkpointed container to a running state using the criu(8) utility.
//
// errors:
// Systemerror - System error.
Restore(process *Process, criuOpts *CriuOpts) error
// If the Container state is RUNNING or CREATED, sets the Container state to PAUSING and pauses
// the execution of any user processes. Asynchronously, when the container finished being paused the
// state is changed to PAUSED.
// If the Container state is PAUSED, do nothing.
//
// errors:
// ContainerNotExists - Container no longer exists,
// ContainerNotRunning - Container not running or created,
// Systemerror - System error.
Pause() error
// If the Container state is PAUSED, resumes the execution of any user processes in the
// Container before setting the Container state to RUNNING.
// If the Container state is RUNNING, do nothing.
//
// errors:
// ContainerNotExists - Container no longer exists,
// ContainerNotPaused - Container is not paused,
// Systemerror - System error.
Resume() error
// NotifyOOM returns a read-only channel signaling when the container receives an OOM notification.
//
// errors:
// Systemerror - System error.
NotifyOOM() (<-chan struct{}, error)
// NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level
//
// errors:
// Systemerror - System error.
NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)
}
// ID returns the container's unique ID
func (c *linuxContainer) ID() string {
return c.id
}
// Config returns the container's configuration
func (c *linuxContainer) Config() configs.Config {
return *c.config
}
func (c *linuxContainer) Status() (Status, error) {
c.m.Lock()
defer c.m.Unlock()
return c.currentStatus()
}
func (c *linuxContainer) State() (*State, error) {
c.m.Lock()
defer c.m.Unlock()
return c.currentState()
}
func (c *linuxContainer) Processes() ([]int, error) {
pids, err := c.cgroupManager.GetAllPids()
if err != nil {
return nil, newSystemErrorWithCause(err, "getting all container pids from cgroups")
}
return pids, nil
}
func (c *linuxContainer) Stats() (*Stats, error) {
var (
err error
stats = &Stats{}
)
if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil {
return stats, newSystemErrorWithCause(err, "getting container stats from cgroups")
}
for _, iface := range c.config.Networks {
switch iface.Type {
case "veth":
istats, err := getNetworkInterfaceStats(iface.HostInterfaceName)
if err != nil {
return stats, newSystemErrorWithCausef(err, "getting network stats for interface %q", iface.HostInterfaceName)
}
stats.Interfaces = append(stats.Interfaces, istats)
}
}
return stats, nil
}
func (c *linuxContainer) Set(config configs.Config) error {
c.m.Lock()
defer c.m.Unlock()
status, err := c.currentStatus()
if err != nil {
return err
}
if status == Stopped {
return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
}
c.config = &config
return c.cgroupManager.Set(c.config)
}
func (c *linuxContainer) Start(process *Process) error {
c.m.Lock()
defer c.m.Unlock()
status, err := c.currentStatus()
if err != nil {
return err
}
if status == Stopped {
if err := c.createExecFifo(); err != nil {
return err
}
}
if err := c.start(process, status == Stopped); err != nil {
if status == Stopped {
c.deleteExecFifo()
}
return err
}
return nil
}
func (c *linuxContainer) Run(process *Process) error {
c.m.Lock()
status, err := c.currentStatus()
if err != nil {
c.m.Unlock()
return err
}
c.m.Unlock()
if err := c.Start(process); err != nil {
return err
}
if status == Stopped {
return c.exec()
}
return nil
}
func (c *linuxContainer) Exec() error {
c.m.Lock()
defer c.m.Unlock()
return c.exec()
}
func (c *linuxContainer) exec() error {
path := filepath.Join(c.root, execFifoFilename)
f, err := os.OpenFile(path, os.O_RDONLY, 0)
if err != nil {
return newSystemErrorWithCause(err, "open exec fifo for reading")
}
defer f.Close()
data, err := ioutil.ReadAll(f)
if err != nil {
return err
}
if len(data) > 0 {
os.Remove(path)
return nil
}
return fmt.Errorf("cannot start an already running container")
}
func (c *linuxContainer) start(process *Process, isInit bool) error {
parent, err := c.newParentProcess(process, isInit)
if err != nil {
return newSystemErrorWithCause(err, "creating new parent process")
}
if err := parent.start(); err != nil {
// terminate the process to ensure that it properly is reaped.
if err := parent.terminate(); err != nil {
logrus.Warn(err)
}
return newSystemErrorWithCause(err, "starting container process")
}
// generate a timestamp indicating when the container was started
c.created = time.Now().UTC()
if isInit {
c.state = &createdState{
c: c,
}
state, err := c.updateState(parent)
if err != nil {
return err
}
c.initProcessStartTime = state.InitProcessStartTime
if c.config.Hooks != nil {
s := configs.HookState{
Version: c.config.Version,
ID: c.id,
Pid: parent.pid(),
Bundle: utils.SearchLabels(c.config.Labels, "bundle"),
}
for i, hook := range c.config.Hooks.Poststart {
if err := hook.Run(s); err != nil {
if err := parent.terminate(); err != nil {
logrus.Warn(err)
}
return newSystemErrorWithCausef(err, "running poststart hook %d", i)
}
}
}
} else {
c.state = &runningState{
c: c,
}
}
return nil
}
func (c *linuxContainer) Signal(s os.Signal, all bool) error {
if all {
return signalAllProcesses(c.cgroupManager, s)
}
if err := c.initProcess.signal(s); err != nil {
return newSystemErrorWithCause(err, "signaling init process")
}
return nil
}
func (c *linuxContainer) createExecFifo() error {
rootuid, err := c.Config().HostRootUID()
if err != nil {
return err
}
rootgid, err := c.Config().HostRootGID()
if err != nil {
return err
}
fifoName := filepath.Join(c.root, execFifoFilename)
if _, err := os.Stat(fifoName); err == nil {
return fmt.Errorf("exec fifo %s already exists", fifoName)
}
oldMask := syscall.Umask(0000)
if err := syscall.Mkfifo(fifoName, 0622); err != nil {
syscall.Umask(oldMask)
return err
}
syscall.Umask(oldMask)
if err := os.Chown(fifoName, rootuid, rootgid); err != nil {
return err
}
return nil
}
func (c *linuxContainer) deleteExecFifo() {
fifoName := filepath.Join(c.root, execFifoFilename)
os.Remove(fifoName)
}
func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) {
parentPipe, childPipe, err := utils.NewSockPair("init")
if err != nil {
return nil, newSystemErrorWithCause(err, "creating new init pipe")
}
cmd, err := c.commandTemplate(p, childPipe)
if err != nil {
return nil, newSystemErrorWithCause(err, "creating new command template")
}
if !doInit {
return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
}
// We only set up rootDir if we're not doing a `runc exec`. The reason for
// this is to avoid cases where a racing, unprivileged process inside the
// container can get access to the statedir file descriptor (which would
// allow for container rootfs escape).
rootDir, err := os.Open(c.root)
if err != nil {
return nil, err
}
cmd.ExtraFiles = append(cmd.ExtraFiles, rootDir)
cmd.Env = append(cmd.Env,
fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir)
}
func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) {
cmd := exec.Command(c.initArgs[0], c.initArgs[1:]...)
cmd.Stdin = p.Stdin
cmd.Stdout = p.Stdout
cmd.Stderr = p.Stderr
cmd.Dir = c.config.Rootfs
if cmd.SysProcAttr == nil {
cmd.SysProcAttr = &syscall.SysProcAttr{}
}
cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...)
if p.ConsoleSocket != nil {
cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket)
cmd.Env = append(cmd.Env,
fmt.Sprintf("_LIBCONTAINER_CONSOLE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
)
}
cmd.ExtraFiles = append(cmd.ExtraFiles, childPipe)
cmd.Env = append(cmd.Env,
fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
)
// NOTE: when running a container with no PID namespace and the parent process spawning the container is
// PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
// even with the parent still running.
if c.config.ParentDeathSignal > 0 {
cmd.SysProcAttr.Pdeathsig = syscall.Signal(c.config.ParentDeathSignal)
}
return cmd, nil
}
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
nsMaps := make(map[configs.NamespaceType]string)
for _, ns := range c.config.Namespaces {
if ns.Path != "" {
nsMaps[ns.Type] = ns.Path
}
}
_, sharePidns := nsMaps[configs.NEWPID]
data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
if err != nil {
return nil, err
}
return &initProcess{
cmd: cmd,
childPipe: childPipe,
parentPipe: parentPipe,
manager: c.cgroupManager,
config: c.newInitConfig(p),
container: c,
process: p,
bootstrapData: data,
sharePidns: sharePidns,
rootDir: rootDir,
}, nil
}
func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
state, err := c.currentState()
if err != nil {
return nil, newSystemErrorWithCause(err, "getting container's current state")
}
// for setns process, we don't have to set cloneflags as the process namespaces
// will only be set via setns syscall
data, err := c.bootstrapData(0, state.NamespacePaths)
if err != nil {
return nil, err
}
return &setnsProcess{
cmd: cmd,
cgroupPaths: c.cgroupManager.GetPaths(),
childPipe: childPipe,
parentPipe: parentPipe,
config: c.newInitConfig(p),
process: p,
bootstrapData: data,
}, nil
}
func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
cfg := &initConfig{
Config: c.config,
Args: process.Args,
Env: process.Env,
User: process.User,
AdditionalGroups: process.AdditionalGroups,
Cwd: process.Cwd,
Capabilities: process.Capabilities,
PassedFilesCount: len(process.ExtraFiles),
ContainerId: c.ID(),
NoNewPrivileges: c.config.NoNewPrivileges,
Rootless: c.config.Rootless,
AppArmorProfile: c.config.AppArmorProfile,
ProcessLabel: c.config.ProcessLabel,
Rlimits: c.config.Rlimits,
}
if process.NoNewPrivileges != nil {
cfg.NoNewPrivileges = *process.NoNewPrivileges
}
if process.AppArmorProfile != "" {
cfg.AppArmorProfile = process.AppArmorProfile
}
if process.Label != "" {
cfg.ProcessLabel = process.Label
}
if len(process.Rlimits) > 0 {
cfg.Rlimits = process.Rlimits
}
cfg.CreateConsole = process.ConsoleSocket != nil
return cfg
}
func (c *linuxContainer) Destroy() error {
c.m.Lock()
defer c.m.Unlock()
return c.state.destroy()
}
func (c *linuxContainer) Pause() error {
c.m.Lock()
defer c.m.Unlock()
status, err := c.currentStatus()
if err != nil {
return err
}
switch status {
case Running, Created:
if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
return err
}
return c.state.transition(&pausedState{
c: c,
})
}
return newGenericError(fmt.Errorf("container not running or created: %s", status), ContainerNotRunning)
}
func (c *linuxContainer) Resume() error {
c.m.Lock()
defer c.m.Unlock()
status, err := c.currentStatus()
if err != nil {
return err
}
if status != Paused {
return newGenericError(fmt.Errorf("container not paused"), ContainerNotPaused)
}
if err := c.cgroupManager.Freeze(configs.Thawed); err != nil {
return err
}
return c.state.transition(&runningState{
c: c,
})
}
func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) {
// XXX(cyphar): This requires cgroups.
if c.config.Rootless {
return nil, fmt.Errorf("cannot get OOM notifications from rootless container")
}
return notifyOnOOM(c.cgroupManager.GetPaths())
}
func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) {
// XXX(cyphar): This requires cgroups.
if c.config.Rootless {
return nil, fmt.Errorf("cannot get memory pressure notifications from rootless container")
}
return notifyMemoryPressure(c.cgroupManager.GetPaths(), level)
}
var criuFeatures *criurpc.CriuFeatures
func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc.CriuOpts, criuFeat *criurpc.CriuFeatures) error {
var t criurpc.CriuReqType
t = criurpc.CriuReqType_FEATURE_CHECK
if err := c.checkCriuVersion("1.8"); err != nil {
// Feature checking was introduced with CRIU 1.8.
// Ignore the feature check if an older CRIU version is used
// and just act as before.
// As all automated PR testing is done using CRIU 1.7 this
// code will not be tested by automated PR testing.
return nil
}
// make sure the features we are looking for are really not from
// some previous check
criuFeatures = nil
req := &criurpc.CriuReq{
Type: &t,
// Theoretically this should not be necessary but CRIU
// segfaults if Opts is empty.
// Fixed in CRIU 2.12
Opts: rpcOpts,
Features: criuFeat,
}
err := c.criuSwrk(nil, req, criuOpts, false)
if err != nil {
logrus.Debugf("%s", err)
return fmt.Errorf("CRIU feature check failed")
}
logrus.Debugf("Feature check says: %s", criuFeatures)
missingFeatures := false
if *criuFeat.MemTrack && !*criuFeatures.MemTrack {
missingFeatures = true
logrus.Debugf("CRIU does not support MemTrack")
}
if missingFeatures {
return fmt.Errorf("CRIU is missing features")
}
return nil
}
// checkCriuVersion checks Criu version greater than or equal to minVersion
func (c *linuxContainer) checkCriuVersion(minVersion string) error {
var x, y, z, versionReq int
_, err := fmt.Sscanf(minVersion, "%d.%d.%d\n", &x, &y, &z) // 1.5.2
if err != nil {
_, err = fmt.Sscanf(minVersion, "Version: %d.%d\n", &x, &y) // 1.6
}
versionReq = x*10000 + y*100 + z
out, err := exec.Command(c.criuPath, "-V").Output()
if err != nil {
return fmt.Errorf("Unable to execute CRIU command: %s", c.criuPath)
}
x = 0
y = 0
z = 0
if ep := strings.Index(string(out), "-"); ep >= 0 {
// criu Git version format
var version string
if sp := strings.Index(string(out), "GitID"); sp > 0 {
version = string(out)[sp:ep]
} else {
return fmt.Errorf("Unable to parse the CRIU version: %s", c.criuPath)
}
n, err := fmt.Sscanf(string(version), "GitID: v%d.%d.%d", &x, &y, &z) // 1.5.2
if err != nil {
n, err = fmt.Sscanf(string(version), "GitID: v%d.%d", &x, &y) // 1.6
y++
} else {
z++
}
if n < 2 || err != nil {
return fmt.Errorf("Unable to parse the CRIU version: %s %d %s", version, n, err)
}
} else {
// criu release version format
n, err := fmt.Sscanf(string(out), "Version: %d.%d.%d\n", &x, &y, &z) // 1.5.2
if err != nil {
n, err = fmt.Sscanf(string(out), "Version: %d.%d\n", &x, &y) // 1.6
}
if n < 2 || err != nil {
return fmt.Errorf("Unable to parse the CRIU version: %s %d %s", out, n, err)
}
}
c.criuVersion = x*10000 + y*100 + z
if c.criuVersion < versionReq {
return fmt.Errorf("CRIU version must be %s or higher", minVersion)
}
return nil
}
const descriptorsFilename = "descriptors.json"
func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) {
mountDest := m.Destination
if strings.HasPrefix(mountDest, c.config.Rootfs) {
mountDest = mountDest[len(c.config.Rootfs):]
}
extMnt := &criurpc.ExtMountMap{
Key: proto.String(mountDest),
Val: proto.String(mountDest),
}
req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
}
func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error {
for _, path := range c.config.MaskPaths {
fi, err := os.Stat(fmt.Sprintf("/proc/%d/root/%s", c.initProcess.pid(), path))
if err != nil {
if os.IsNotExist(err) {
continue
}
return err
}
if fi.IsDir() {
continue
}
extMnt := &criurpc.ExtMountMap{
Key: proto.String(path),
Val: proto.String("/dev/null"),
}
req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
}
return nil
}
func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
c.m.Lock()
defer c.m.Unlock()
// TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has
// support for doing unprivileged dumps, but the setup of
// rootless containers might make this complicated.
if c.config.Rootless {
return fmt.Errorf("cannot checkpoint a rootless container")
}
if err := c.checkCriuVersion("1.5.2"); err != nil {
return err
}
if criuOpts.ImagesDirectory == "" {
return fmt.Errorf("invalid directory to save checkpoint")
}
// Since a container can be C/R'ed multiple times,
// the checkpoint directory may already exist.
if err := os.Mkdir(criuOpts.ImagesDirectory, 0755); err != nil && !os.IsExist(err) {
return err
}
if criuOpts.WorkDirectory == "" {
criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
}
if err := os.Mkdir(criuOpts.WorkDirectory, 0755); err != nil && !os.IsExist(err) {
return err
}
workDir, err := os.Open(criuOpts.WorkDirectory)
if err != nil {
return err
}
defer workDir.Close()
imageDir, err := os.Open(criuOpts.ImagesDirectory)
if err != nil {
return err
}
defer imageDir.Close()
rpcOpts := criurpc.CriuOpts{
ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
WorkDirFd: proto.Int32(int32(workDir.Fd())),
LogLevel: proto.Int32(4),
LogFile: proto.String("dump.log"),
Root: proto.String(c.config.Rootfs),
ManageCgroups: proto.Bool(true),
NotifyScripts: proto.Bool(true),
Pid: proto.Int32(int32(c.initProcess.pid())),
ShellJob: proto.Bool(criuOpts.ShellJob),
LeaveRunning: proto.Bool(criuOpts.LeaveRunning),
TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
FileLocks: proto.Bool(criuOpts.FileLocks),
EmptyNs: proto.Uint32(criuOpts.EmptyNs),
}
// append optional criu opts, e.g., page-server and port
if criuOpts.PageServer.Address != "" && criuOpts.PageServer.Port != 0 {
rpcOpts.Ps = &criurpc.CriuPageServerInfo{
Address: proto.String(criuOpts.PageServer.Address),
Port: proto.Int32(criuOpts.PageServer.Port),
}
}
//pre-dump may need parentImage param to complete iterative migration
if criuOpts.ParentImage != "" {
rpcOpts.ParentImg = proto.String(criuOpts.ParentImage)
rpcOpts.TrackMem = proto.Bool(true)
}
// append optional manage cgroups mode
if criuOpts.ManageCgroupsMode != 0 {
if err := c.checkCriuVersion("1.7"); err != nil {
return err
}
mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
rpcOpts.ManageCgroupsMode = &mode
}
var t criurpc.CriuReqType
if criuOpts.PreDump {
feat := criurpc.CriuFeatures{
MemTrack: proto.Bool(true),
}
if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil {
return err
}
t = criurpc.CriuReqType_PRE_DUMP
} else {
t = criurpc.CriuReqType_DUMP
}
req := &criurpc.CriuReq{
Type: &t,
Opts: &rpcOpts,
}
//no need to dump these information in pre-dump
if !criuOpts.PreDump {
for _, m := range c.config.Mounts {
switch m.Device {
case "bind":
c.addCriuDumpMount(req, m)
break
case "cgroup":
binds, err := getCgroupMounts(m)
if err != nil {
return err
}
for _, b := range binds {
c.addCriuDumpMount(req, b)
}
break
}
}
if err := c.addMaskPaths(req); err != nil {
return err
}
for _, node := range c.config.Devices {
m := &configs.Mount{Destination: node.Path, Source: node.Path}
c.addCriuDumpMount(req, m)
}
// Write the FD info to a file in the image directory
fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors())
if err != nil {
return err
}
err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0655)
if err != nil {
return err
}
}
err = c.criuSwrk(nil, req, criuOpts, false)
if err != nil {
return err
}
return nil
}
func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) {
mountDest := m.Destination
if strings.HasPrefix(mountDest, c.config.Rootfs) {
mountDest = mountDest[len(c.config.Rootfs):]
}
extMnt := &criurpc.ExtMountMap{
Key: proto.String(mountDest),
Val: proto.String(m.Source),
}
req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
}
func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) {
for _, iface := range c.config.Networks {
switch iface.Type {
case "veth":
veth := new(criurpc.CriuVethPair)
veth.IfOut = proto.String(iface.HostInterfaceName)
veth.IfIn = proto.String(iface.Name)
req.Opts.Veths = append(req.Opts.Veths, veth)
break
case "loopback":
break
}
}
for _, i := range criuOpts.VethPairs {
veth := new(criurpc.CriuVethPair)
veth.IfOut = proto.String(i.HostInterfaceName)
veth.IfIn = proto.String(i.ContainerInterfaceName)
req.Opts.Veths = append(req.Opts.Veths, veth)
}
}
func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
c.m.Lock()
defer c.m.Unlock()
// TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have
// support for unprivileged restore at the moment.
if c.config.Rootless {
return fmt.Errorf("cannot restore a rootless container")
}
if err := c.checkCriuVersion("1.5.2"); err != nil {
return err
}
if criuOpts.WorkDirectory == "" {
criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
}
// Since a container can be C/R'ed multiple times,
// the work directory may already exist.
if err := os.Mkdir(criuOpts.WorkDirectory, 0655); err != nil && !os.IsExist(err) {
return err
}
workDir, err := os.Open(criuOpts.WorkDirectory)
if err != nil {
return err
}
defer workDir.Close()
if criuOpts.ImagesDirectory == "" {
return fmt.Errorf("invalid directory to restore checkpoint")
}
imageDir, err := os.Open(criuOpts.ImagesDirectory)
if err != nil {
return err
}
defer imageDir.Close()
// CRIU has a few requirements for a root directory:
// * it must be a mount point
// * its parent must not be overmounted
// c.config.Rootfs is bind-mounted to a temporary directory
// to satisfy these requirements.
root := filepath.Join(c.root, "criu-root")
if err := os.Mkdir(root, 0755); err != nil {
return err
}
defer os.Remove(root)
root, err = filepath.EvalSymlinks(root)
if err != nil {
return err
}
err = syscall.Mount(c.config.Rootfs, root, "", syscall.MS_BIND|syscall.MS_REC, "")
if err != nil {
return err
}
defer syscall.Unmount(root, syscall.MNT_DETACH)
t := criurpc.CriuReqType_RESTORE
req := &criurpc.CriuReq{
Type: &t,
Opts: &criurpc.CriuOpts{
ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
WorkDirFd: proto.Int32(int32(workDir.Fd())),
EvasiveDevices: proto.Bool(true),
LogLevel: proto.Int32(4),
LogFile: proto.String("restore.log"),
RstSibling: proto.Bool(true),
Root: proto.String(root),
ManageCgroups: proto.Bool(true),
NotifyScripts: proto.Bool(true),
ShellJob: proto.Bool(criuOpts.ShellJob),
ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
FileLocks: proto.Bool(criuOpts.FileLocks),
EmptyNs: proto.Uint32(criuOpts.EmptyNs),
},
}
for _, m := range c.config.Mounts {
switch m.Device {
case "bind":
c.addCriuRestoreMount(req, m)
break
case "cgroup":
binds, err := getCgroupMounts(m)
if err != nil {
return err
}
for _, b := range binds {
c.addCriuRestoreMount(req, b)
}
break
}
}
if len(c.config.MaskPaths) > 0 {
m := &configs.Mount{Destination: "/dev/null", Source: "/dev/null"}
c.addCriuRestoreMount(req, m)
}
for _, node := range c.config.Devices {
m := &configs.Mount{Destination: node.Path, Source: node.Path}
c.addCriuRestoreMount(req, m)
}
if criuOpts.EmptyNs&syscall.CLONE_NEWNET == 0 {
c.restoreNetwork(req, criuOpts)
}
// append optional manage cgroups mode
if criuOpts.ManageCgroupsMode != 0 {
if err := c.checkCriuVersion("1.7"); err != nil {
return err
}
mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
req.Opts.ManageCgroupsMode = &mode
}
var (
fds []string
fdJSON []byte
)
if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
return err
}
if err := json.Unmarshal(fdJSON, &fds); err != nil {
return err
}
for i := range fds {
if s := fds[i]; strings.Contains(s, "pipe:") {
inheritFd := new(criurpc.InheritFd)
inheritFd.Key = proto.String(s)
inheritFd.Fd = proto.Int32(int32(i))
req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
}
}
return c.criuSwrk(process, req, criuOpts, true)
}
func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
// XXX: Do we need to deal with this case? AFAIK criu still requires root.
if err := c.cgroupManager.Apply(pid); err != nil {
return err
}
if err := c.cgroupManager.Set(c.config); err != nil {
return newSystemError(err)
}
path := fmt.Sprintf("/proc/%d/cgroup", pid)
cgroupsPaths, err := cgroups.ParseCgroupFile(path)
if err != nil {
return err
}
for c, p := range cgroupsPaths {
cgroupRoot := &criurpc.CgroupRoot{
Ctrl: proto.String(c),
Path: proto.String(p),
}
req.Opts.CgRoot = append(req.Opts.CgRoot, cgroupRoot)
}
return nil
}
func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool) error {
fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_SEQPACKET|syscall.SOCK_CLOEXEC, 0)
if err != nil {
return err
}
logPath := filepath.Join(opts.WorkDirectory, req.GetOpts().GetLogFile())
criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client")
criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server")
defer criuClient.Close()
defer criuServer.Close()
args := []string{"swrk", "3"}
logrus.Debugf("Using CRIU %d at: %s", c.criuVersion, c.criuPath)
logrus.Debugf("Using CRIU with following args: %s", args)
cmd := exec.Command(c.criuPath, args...)
if process != nil {
cmd.Stdin = process.Stdin
cmd.Stdout = process.Stdout
cmd.Stderr = process.Stderr
}
cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer)
if err := cmd.Start(); err != nil {
return err
}
criuServer.Close()
defer func() {
criuClient.Close()
_, err := cmd.Process.Wait()
if err != nil {
return
}
}()
if applyCgroups {
err := c.criuApplyCgroups(cmd.Process.Pid, req)
if err != nil {
return err
}
}
var extFds []string
if process != nil {
extFds, err = getPipeFds(cmd.Process.Pid)
if err != nil {
return err
}
}
logrus.Debugf("Using CRIU in %s mode", req.GetType().String())
// In the case of criurpc.CriuReqType_FEATURE_CHECK req.GetOpts()
// should be empty. For older CRIU versions it still will be
// available but empty.
if req.GetType() != criurpc.CriuReqType_FEATURE_CHECK {
val := reflect.ValueOf(req.GetOpts())
v := reflect.Indirect(val)
for i := 0; i < v.NumField(); i++ {
st := v.Type()
name := st.Field(i).Name
if strings.HasPrefix(name, "XXX_") {
continue
}
value := val.MethodByName("Get" + name).Call([]reflect.Value{})
logrus.Debugf("CRIU option %s with value %v", name, value[0])
}
}
data, err := proto.Marshal(req)
if err != nil {
return err
}
_, err = criuClient.Write(data)
if err != nil {
return err
}
buf := make([]byte, 10*4096)
for true {
n, err := criuClient.Read(buf)
if err != nil {
return err
}
if n == 0 {
return fmt.Errorf("unexpected EOF")
}
if n == len(buf) {
return fmt.Errorf("buffer is too small")
}
resp := new(criurpc.CriuResp)
err = proto.Unmarshal(buf[:n], resp)
if err != nil {
return err
}
if !resp.GetSuccess() {
typeString := req.GetType().String()
return fmt.Errorf("criu failed: type %s errno %d\nlog file: %s", typeString, resp.GetCrErrno(), logPath)
}
t := resp.GetType()
switch {
case t == criurpc.CriuReqType_FEATURE_CHECK:
logrus.Debugf("Feature check says: %s", resp)
criuFeatures = resp.GetFeatures()
break
case t == criurpc.CriuReqType_NOTIFY:
if err := c.criuNotifications(resp, process, opts, extFds); err != nil {
return err
}
t = criurpc.CriuReqType_NOTIFY
req = &criurpc.CriuReq{
Type: &t,
NotifySuccess: proto.Bool(true),
}
data, err = proto.Marshal(req)
if err != nil {
return err
}
_, err = criuClient.Write(data)
if err != nil {
return err
}
continue
case t == criurpc.CriuReqType_RESTORE:
case t == criurpc.CriuReqType_DUMP:
break
case t == criurpc.CriuReqType_PRE_DUMP:
// In pre-dump mode CRIU is in a loop and waits for
// the final DUMP command.
// The current runc pre-dump approach, however, is
// start criu in PRE_DUMP once for a single pre-dump
// and not the whole series of pre-dump, pre-dump, ...m, dump
// If we got the message CriuReqType_PRE_DUMP it means
// CRIU was successful and we need to forcefully stop CRIU
logrus.Debugf("PRE_DUMP finished. Send close signal to CRIU service")
criuClient.Close()
// Process status won't be success, because one end of sockets is closed
_, err := cmd.Process.Wait()
if err != nil {
logrus.Debugf("After PRE_DUMP CRIU exiting failed")
return err
}
return nil
default:
return fmt.Errorf("unable to parse the response %s", resp.String())
}
break
}
// cmd.Wait() waits cmd.goroutines which are used for proxying file descriptors.
// Here we want to wait only the CRIU process.
st, err := cmd.Process.Wait()
if err != nil {
return err
}
if !st.Success() {
return fmt.Errorf("criu failed: %s\nlog file: %s", st.String(), logPath)
}
return nil
}
// block any external network activity
func lockNetwork(config *configs.Config) error {
for _, config := range config.Networks {
strategy, err := getStrategy(config.Type)
if err != nil {
return err
}
if err := strategy.detach(config); err != nil {
return err
}
}
return nil
}
func unlockNetwork(config *configs.Config) error {
for _, config := range config.Networks {
strategy, err := getStrategy(config.Type)
if err != nil {
return err
}
if err = strategy.attach(config); err != nil {
return err
}
}
return nil
}
func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Process, opts *CriuOpts, fds []string) error {
notify := resp.GetNotify()
if notify == nil {
return fmt.Errorf("invalid response: %s", resp.String())
}
switch {
case notify.GetScript() == "post-dump":
f, err := os.Create(filepath.Join(c.root, "checkpoint"))
if err != nil {
return err
}
f.Close()
case notify.GetScript() == "network-unlock":
if err := unlockNetwork(c.config); err != nil {
return err
}
case notify.GetScript() == "network-lock":
if err := lockNetwork(c.config); err != nil {
return err
}
case notify.GetScript() == "setup-namespaces":
if c.config.Hooks != nil {
s := configs.HookState{
Version: c.config.Version,
ID: c.id,
Pid: int(notify.GetPid()),
Bundle: utils.SearchLabels(c.config.Labels, "bundle"),
}
for i, hook := range c.config.Hooks.Prestart {
if err := hook.Run(s); err != nil {
return newSystemErrorWithCausef(err, "running prestart hook %d", i)
}
}
}
case notify.GetScript() == "post-restore":
pid := notify.GetPid()
r, err := newRestoredProcess(int(pid), fds)
if err != nil {
return err
}
process.ops = r
if err := c.state.transition(&restoredState{
imageDir: opts.ImagesDirectory,
c: c,
}); err != nil {
return err
}
// create a timestamp indicating when the restored checkpoint was started
c.created = time.Now().UTC()
if _, err := c.updateState(r); err != nil {
return err
}
if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
if !os.IsNotExist(err) {
logrus.Error(err)
}
}
}
return nil
}
func (c *linuxContainer) updateState(process parentProcess) (*State, error) {
c.initProcess = process
state, err := c.currentState()
if err != nil {
return nil, err
}
err = c.saveState(state)
if err != nil {
return nil, err
}
return state, nil
}
func (c *linuxContainer) saveState(s *State) error {
f, err := os.Create(filepath.Join(c.root, stateFilename))
if err != nil {
return err
}
defer f.Close()
return utils.WriteJSON(f, s)
}
func (c *linuxContainer) deleteState() error {
return os.Remove(filepath.Join(c.root, stateFilename))
}
func (c *linuxContainer) currentStatus() (Status, error) {
if err := c.refreshState(); err != nil {
return -1, err
}
return c.state.status(), nil
}
// refreshState needs to be called to verify that the current state on the
// container is what is true. Because consumers of libcontainer can use it
// out of process we need to verify the container's status based on runtime
// information and not rely on our in process info.
func (c *linuxContainer) refreshState() error {
paused, err := c.isPaused()
if err != nil {
return err
}
if paused {
return c.state.transition(&pausedState{c: c})
}
t, err := c.runType()
if err != nil {
return err
}
switch t {
case Created:
return c.state.transition(&createdState{c: c})
case Running:
return c.state.transition(&runningState{c: c})
}
return c.state.transition(&stoppedState{c: c})
}
// doesInitProcessExist checks if the init process is still the same process
// as the initial one, it could happen that the original process has exited
// and a new process has been created with the same pid, in this case, the
// container would already be stopped.
func (c *linuxContainer) doesInitProcessExist(initPid int) (bool, error) {
startTime, err := system.GetProcessStartTime(initPid)
if err != nil {
return false, newSystemErrorWithCausef(err, "getting init process %d start time", initPid)
}
if c.initProcessStartTime != startTime {
return false, nil
}
return true, nil
}
func (c *linuxContainer) runType() (Status, error) {
if c.initProcess == nil {
return Stopped, nil
}
pid := c.initProcess.pid()
// return Running if the init process is alive
if err := syscall.Kill(pid, 0); err != nil {
if err == syscall.ESRCH {
// It means the process does not exist anymore, could happen when the
// process exited just when we call the function, we should not return
// error in this case.
return Stopped, nil
}
return Stopped, newSystemErrorWithCausef(err, "sending signal 0 to pid %d", pid)
}
// check if the process is still the original init process.
exist, err := c.doesInitProcessExist(pid)
if !exist || err != nil {
return Stopped, err
}
// We'll create exec fifo and blocking on it after container is created,
// and delete it after start container.
if _, err := os.Stat(filepath.Join(c.root, execFifoFilename)); err == nil {
return Created, nil
}
return Running, nil
}
func (c *linuxContainer) isPaused() (bool, error) {
fcg := c.cgroupManager.GetPaths()["freezer"]
if fcg == "" {
// A container doesn't have a freezer cgroup
return false, nil
}
data, err := ioutil.ReadFile(filepath.Join(fcg, "freezer.state"))
if err != nil {
// If freezer cgroup is not mounted, the container would just be not paused.
if os.IsNotExist(err) {
return false, nil
}
return false, newSystemErrorWithCause(err, "checking if container is paused")
}
return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil
}
func (c *linuxContainer) currentState() (*State, error) {
var (
startTime string
externalDescriptors []string
pid = -1
)
if c.initProcess != nil {
pid = c.initProcess.pid()
startTime, _ = c.initProcess.startTime()
externalDescriptors = c.initProcess.externalDescriptors()
}
state := &State{
BaseState: BaseState{
ID: c.ID(),
Config: *c.config,
InitProcessPid: pid,
InitProcessStartTime: startTime,
Created: c.created,
},
Rootless: c.config.Rootless,
CgroupPaths: c.cgroupManager.GetPaths(),
NamespacePaths: make(map[configs.NamespaceType]string),
ExternalDescriptors: externalDescriptors,
}
if pid > 0 {
for _, ns := range c.config.Namespaces {
state.NamespacePaths[ns.Type] = ns.GetPath(pid)
}
for _, nsType := range configs.NamespaceTypes() {
if !configs.IsNamespaceSupported(nsType) {
continue
}
if _, ok := state.NamespacePaths[nsType]; !ok {
ns := configs.Namespace{Type: nsType}
state.NamespacePaths[ns.Type] = ns.GetPath(pid)
}
}
}
return state, nil
}
// orderNamespacePaths sorts namespace paths into a list of paths that we
// can setns in order.
func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
paths := []string{}
order := []configs.NamespaceType{
// The user namespace *must* be done first.
configs.NEWUSER,
configs.NEWIPC,
configs.NEWUTS,
configs.NEWNET,
configs.NEWPID,
configs.NEWNS,
}
// Remove namespaces that we don't need to join.
var nsTypes []configs.NamespaceType
for _, ns := range order {
if c.config.Namespaces.Contains(ns) {
nsTypes = append(nsTypes, ns)
}
}
for _, nsType := range nsTypes {
if p, ok := namespaces[nsType]; ok && p != "" {
// check if the requested namespace is supported
if !configs.IsNamespaceSupported(nsType) {
return nil, newSystemError(fmt.Errorf("namespace %s is not supported", nsType))
}
// only set to join this namespace if it exists
if _, err := os.Lstat(p); err != nil {
return nil, newSystemErrorWithCausef(err, "running lstat on namespace path %q", p)
}
// do not allow namespace path with comma as we use it to separate
// the namespace paths
if strings.ContainsRune(p, ',') {
return nil, newSystemError(fmt.Errorf("invalid path %s", p))
}
paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(nsType), p))
}
}
return paths, nil
}
func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
data := bytes.NewBuffer(nil)
for _, im := range idMap {
line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size)
if _, err := data.WriteString(line); err != nil {
return nil, err
}
}
return data.Bytes(), nil
}
// bootstrapData encodes the necessary data in netlink binary format
// as a io.Reader.
// Consumer can write the data to a bootstrap program
// such as one that uses nsenter package to bootstrap the container's
// init process correctly, i.e. with correct namespaces, uid/gid
// mapping etc.
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (io.Reader, error) {
// create the netlink message
r := nl.NewNetlinkRequest(int(InitMsg), 0)
// write cloneFlags
r.AddData(&Int32msg{
Type: CloneFlagsAttr,
Value: uint32(cloneFlags),
})
// write custom namespace paths
if len(nsMaps) > 0 {
nsPaths, err := c.orderNamespacePaths(nsMaps)
if err != nil {
return nil, err
}
r.AddData(&Bytemsg{
Type: NsPathsAttr,
Value: []byte(strings.Join(nsPaths, ",")),
})
}
// write namespace paths only when we are not joining an existing user ns
_, joinExistingUser := nsMaps[configs.NEWUSER]
if !joinExistingUser {
// write uid mappings
if len(c.config.UidMappings) > 0 {
b, err := encodeIDMapping(c.config.UidMappings)
if err != nil {
return nil, err
}
r.AddData(&Bytemsg{
Type: UidmapAttr,
Value: b,
})
}
// write gid mappings
if len(c.config.GidMappings) > 0 {
b, err := encodeIDMapping(c.config.GidMappings)
if err != nil {
return nil, err
}
r.AddData(&Bytemsg{
Type: GidmapAttr,
Value: b,
})
// The following only applies if we are root.
if !c.config.Rootless {
// check if we have CAP_SETGID to setgroup properly
pid, err := capability.NewPid(os.Getpid())
if err != nil {
return nil, err
}
if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) {
r.AddData(&Boolmsg{
Type: SetgroupAttr,
Value: true,
})
}
}
}
}
// write oom_score_adj
r.AddData(&Bytemsg{
Type: OomScoreAdjAttr,
Value: []byte(fmt.Sprintf("%d", c.config.OomScoreAdj)),
})
// write rootless
r.AddData(&Boolmsg{
Type: RootlessAttr,
Value: c.config.Rootless,
})
return bytes.NewReader(r.Serialize()), nil
}