~comcloudway/builds.sr.ht

e147a87a3bad055c9813aae2d6aae584c4745c78 — Drew DeVault 5 years ago c49170d
Keep VMs alive post-failure for users to inspect
6 files changed, 90 insertions(+), 11 deletions(-)

M buildsrht-shell
M buildsrht/app.py
M buildsrht/templates/job.html
M worker/context.go
M worker/http.go
M worker/tasks.go
M buildsrht-shell => buildsrht-shell +9 -5
@@ 59,14 59,16 @@ deadline = datetime.utcfromtimestamp(info["deadline"])
manifest = Manifest(yaml.safe_load(info["manifest"]))

def connect(job_id, info):
    print("Your VM will be terminated "
            + naturaltime(datetime.utcnow() - deadline))
    """Opens a shell on the build VM"""
    limit = naturaltime(datetime.utcnow() - deadline)
    print(f"Your VM will be terminated {limit}, or when you log out.")
    print()
    requests.post(f"http://localhost:8080/job/{job_id}/claim")
    sys.stdout.flush()
    sys.stderr.flush()
    tty = os.open("/dev/tty", os.O_RDWR)
    os.dup2(0, tty)
    os.execvp("ssh", [
    subprocess.call([
        "ssh", "-qt",
        "-p", str(info["port"]),
        "-o", "UserKnownHostsFile=/dev/null",


@@ 74,8 76,10 @@ def connect(job_id, info):
        "-o", "LogLevel=quiet",
        "build@localhost", "bash"
    ])
    requests.post(f"http://localhost:8080/job/{job_id}/terminate")

def tail(job_id, info):
    """Tails the build logs to stdout"""
    logs = os.path.join(cfg("builds.sr.ht::worker", "buildlogs"), str(job_id))
    p = subprocess.Popen(["tail", "-f", os.path.join(logs, "log")])
    tasks = set()


@@ 88,7 92,7 @@ def tail(job_id, info):
            path = os.path.join(logs, task.name, "log")
            if os.path.exists(path):
                procs.append(subprocess.Popen(
                    f"tail -c +1 -f {shlex.quote(path)} | " +
                    f"tail -f {shlex.quote(path)} | " +
                    "awk '{ print \"[" + shlex.quote(task.name) + "] \" $0 }'",
                    shell=True))
                tasks.update({ task.name })


@@ 102,7 106,7 @@ def tail(job_id, info):
        time.sleep(3)

if op == "connect":
    if info["task"] != info["tasks"]:
    if info["task"] != info["tasks"] and info["status"] == "running":
        tail(job_id, info)
    connect(job_id, info)
elif op == "tail":

M buildsrht/app.py => buildsrht/app.py +6 -1
@@ 1,4 1,5 @@
from buildsrht.types import JobStatus, OAuthToken, User
from datetime import datetime, timedelta
from flask import session
from srht.config import cfg
from srht.database import DbSession


@@ 32,6 33,10 @@ class BuildApp(SrhtFlask):

        @self.context_processor
        def inject():
            return { "JobStatus": JobStatus }
            return {
                "datetime": datetime,
                "timedelta": timedelta,
                "JobStatus": JobStatus,
            }

app = BuildApp()

M buildsrht/templates/job.html => buildsrht/templates/job.html +14 -1
@@ 1,4 1,4 @@
{% extends "layout.html" %}
{% extends "layout-full.html" %}
{% block title %}
<title>
  builds #{{ job.id }} - {{ job.status.value }}


@@ 103,6 103,19 @@
      <a href="#bottom">go to bottom »</a>
    </div>
    <div class="col-md-9">
      {% if current_user and current_user.id == job.owner_id %}
      {% if (job.status.value == "failed" and
          datetime.utcnow() < job.updated + timedelta(minutes=10)) %}
      <div class="alert alert-danger">
        <strong>This build job failed.</strong> You may log into the failed
        build environment within 10 minutes to examine the results with the
        following command:
        <pre
          style="margin-bottom: 0; margin-top: 1rem;"
        >ssh builds@{{job.runner}} connect {{job.id}}</pre>
      </div>
      {% endif %}
      {% endif %}
      {% for log in logs %}
      <details open>
        {% if log["name"] %}

M worker/context.go => worker/context.go +39 -3
@@ 56,16 56,18 @@ type WorkerContext struct {

type JobContext struct {
	Cancel   context.CancelFunc
	Claimed  bool
	Conf     func(section, key string) string
	Context  context.Context
	Db       *sql.DB
	Deadline time.Time
	Job      *Job
	Log      *log.Logger
	LogDir   string
	LogFile  *os.File
	Log      *log.Logger
	Manifest *Manifest
	Port     int
	Settled  bool

	NTasks int
	Task   int


@@ 80,6 82,8 @@ func (wctx *WorkerContext) RunBuild(
		err error
		job *Job
		ctx *JobContext

		cleanup func()
	)

	timer := prometheus.NewTimer(buildDuration)


@@ 120,6 124,9 @@ func (wctx *WorkerContext) RunBuild(
					job.SetStatus("failed")
				}
				ctx.ProcessTriggers()
				if ctx.Settled {
					ctx.Standby()
				}
				if ctx.Log != nil {
					ctx.Log.Printf("Error: %v\n", err)
					ctx.LogFile.Close()


@@ 129,6 136,9 @@ func (wctx *WorkerContext) RunBuild(
			}
			failedBuilds.Inc()
		}
		if cleanup != nil {
			cleanup()
		}
	}()

	timeout, _ := time.ParseDuration(conf("builds.sr.ht::worker", "timeout"))


@@ 160,8 170,7 @@ func (wctx *WorkerContext) RunBuild(
	ctx.Log = log.New(io.MultiWriter(ctx.LogFile, os.Stdout),
		"[#"+strconv.Itoa(job.Id)+"] ", log.LstdFlags)

	cleanup := ctx.Boot(wctx.Redis)
	defer cleanup()
	cleanup = ctx.Boot(wctx.Redis)

	tasks := []func() error{
		ctx.Settle,


@@ 204,6 213,33 @@ func (wctx *WorkerContext) RunBuild(
	successfulBuilds.Inc()
}

// Standby keeps the build environment of a failed job alive so that it can
// be inspected over SSH.
//
// It writes the login instructions to the job log and then waits for up to
// 10 minutes. If ctx.Context is cancelled during that window, the
// cancellation is logged and Standby returns immediately. Once the window
// has elapsed:
//
//   - if ctx.Claimed is set, Standby keeps waiting until ctx.Deadline passes
//     or ctx.Context is cancelled, whichever happens first;
//   - otherwise it logs that the deadline elapsed and returns.
//
// Both waits use explicitly stopped timers instead of time.After, so a
// cancellation does not leave a long-lived timer pending in the runtime.
func (ctx *JobContext) Standby() {
	ctx.Log.Println("\x1B[1m\x1B[91mBuild failed.\x1B[0m")
	ctx.Log.Println("The build environment will be kept alive for 10 minutes.")
	ctx.Log.Println("To log in with SSH and examine it, use the following command:")
	ctx.Log.Println()
	// NOTE(review): Runner is dereferenced unconditionally — confirm it is
	// always populated by the time a job can reach standby.
	ctx.Log.Printf("\tssh -t builds@%s connect %d", *ctx.Job.Runner, ctx.Job.Id)
	ctx.Log.Println()
	ctx.Log.Println("After logging in, the deadline is increased to your remaining build time.")

	// Initial inspection window; must stay in sync with the "10 minutes"
	// announced in the log message above.
	grace := time.NewTimer(10 * time.Minute)
	defer grace.Stop()
	select {
	case <-grace.C:
	case <-ctx.Context.Done():
		ctx.Log.Println("Build cancelled. Terminating build environment.")
		return
	}

	// NOTE(review): Claimed is read here without holding any lock — verify
	// how writers of this field synchronise with this read.
	if !ctx.Claimed {
		ctx.Log.Println("Deadline elapsed. Terminating build environment.")
		return
	}

	// The environment was claimed: hold it until the job deadline or until
	// the job context is cancelled. A deadline already in the past yields a
	// non-positive duration, which makes the timer fire immediately.
	remaining := time.NewTimer(time.Until(ctx.Deadline))
	defer remaining.Stop()
	select {
	case <-remaining.C:
	case <-ctx.Context.Done():
	}
}

func (ctx *JobContext) Control(
	context context.Context, args ...string) *exec.Cmd {


M worker/http.go => worker/http.go +21 -1
@@ 56,6 56,8 @@ func HttpServer() {
				w.Write([]byte("404 not found"))
			}
		case "cancel":
			fallthrough
		case "terminate":
			if r.Method != "POST" {
				w.WriteHeader(405)
				w.Write([]byte("405 method not allowed"))


@@ 65,7 67,9 @@ func HttpServer() {
			defer jobsMutex.Unlock()
			if job, ok := jobs[jobId]; ok {
				job.Cancel()
				job.Job.SetStatus("cancelled")
				if op == "cancel" {
					job.Job.SetStatus("cancelled")
				}
			} else {
				w.WriteHeader(404)
				w.Write([]byte("404 not found"))


@@ 73,6 77,22 @@ func HttpServer() {
			}
			w.WriteHeader(200)
			w.Write([]byte("cancelled"))
		case "claim":
			if r.Method != "POST" {
				w.WriteHeader(405)
				w.Write([]byte("405 method not allowed"))
				return
			}
			jobsMutex.Lock()
			defer jobsMutex.Unlock()
			if job, ok := jobs[jobId]; ok {
				job.Claimed = true
				w.WriteHeader(200)
				w.Write([]byte("claimed"))
			} else {
				w.WriteHeader(404)
				w.Write([]byte("404 not found"))
			}
		default:
			w.WriteHeader(404)
			w.Write([]byte("404 not found"))

M worker/tasks.go => worker/tasks.go +1 -0
@@ 110,6 110,7 @@ func (ctx *JobContext) Settle() error {
			stdout, _ := ioutil.ReadAll(pipe)
			if err := check.Wait(); err == nil {
				if string(stdout) == "hello world" {
					ctx.Settled = true
					done <- nil
					return
				} else {