From e147a87a3bad055c9813aae2d6aae584c4745c78 Mon Sep 17 00:00:00 2001 From: Drew DeVault Date: Wed, 14 Aug 2019 16:03:15 +0900 Subject: [PATCH] Keep VMs alive post-failure for users to inspect --- buildsrht-shell | 14 +++++++----- buildsrht/app.py | 7 +++++- buildsrht/templates/job.html | 15 ++++++++++++- worker/context.go | 42 +++++++++++++++++++++++++++++++++--- worker/http.go | 22 ++++++++++++++++++- worker/tasks.go | 1 + 6 files changed, 90 insertions(+), 11 deletions(-) diff --git a/buildsrht-shell b/buildsrht-shell index 56b9e21..2cb0c2b 100755 --- a/buildsrht-shell +++ b/buildsrht-shell @@ -59,14 +59,16 @@ deadline = datetime.utcfromtimestamp(info["deadline"]) manifest = Manifest(yaml.safe_load(info["manifest"])) def connect(job_id, info): - print("Your VM will be terminated " - + naturaltime(datetime.utcnow() - deadline)) + """Opens a shell on the build VM""" + limit = naturaltime(datetime.utcnow() - deadline) + print(f"Your VM will be terminated {limit}, or when you log out.") print() + requests.post(f"http://localhost:8080/job/{job_id}/claim") sys.stdout.flush() sys.stderr.flush() tty = os.open("/dev/tty", os.O_RDWR) os.dup2(0, tty) - os.execvp("ssh", [ + subprocess.call([ "ssh", "-qt", "-p", str(info["port"]), "-o", "UserKnownHostsFile=/dev/null", @@ -74,8 +76,10 @@ def connect(job_id, info): "-o", "LogLevel=quiet", "build@localhost", "bash" ]) + requests.post(f"http://localhost:8080/job/{job_id}/terminate") def tail(job_id, info): + """Tails the build logs to stdout""" logs = os.path.join(cfg("builds.sr.ht::worker", "buildlogs"), str(job_id)) p = subprocess.Popen(["tail", "-f", os.path.join(logs, "log")]) tasks = set() @@ -88,7 +92,7 @@ def tail(job_id, info): path = os.path.join(logs, task.name, "log") if os.path.exists(path): procs.append(subprocess.Popen( - f"tail -c +1 -f {shlex.quote(path)} | " + + f"tail -f {shlex.quote(path)} | " + "awk '{ print \"[" + shlex.quote(task.name) + "] \" $0 }'", shell=True)) tasks.update({ task.name }) @@ -102,7 
+106,7 @@ def tail(job_id, info): time.sleep(3) if op == "connect": - if info["task"] != info["tasks"]: + if info["task"] != info["tasks"] and info["status"] == "running": tail(job_id, info) connect(job_id, info) elif op == "tail": diff --git a/buildsrht/app.py b/buildsrht/app.py index 1eccc01..e223479 100644 --- a/buildsrht/app.py +++ b/buildsrht/app.py @@ -1,4 +1,5 @@ from buildsrht.types import JobStatus, OAuthToken, User +from datetime import datetime, timedelta from flask import session from srht.config import cfg from srht.database import DbSession @@ -32,6 +33,10 @@ class BuildApp(SrhtFlask): @self.context_processor def inject(): - return { "JobStatus": JobStatus } + return { + "datetime": datetime, + "timedelta": timedelta, + "JobStatus": JobStatus, + } app = BuildApp() diff --git a/buildsrht/templates/job.html b/buildsrht/templates/job.html index daf9c76..3252c31 100644 --- a/buildsrht/templates/job.html +++ b/buildsrht/templates/job.html @@ -1,4 +1,4 @@ -{% extends "layout.html" %} +{% extends "layout-full.html" %} {% block title %} builds #{{ job.id }} - {{ job.status.value }} @@ -103,6 +103,19 @@ <a href="#bottom">go to bottom »</a> </div> <div class="col-md-9"> + {% if current_user and current_user.id == job.owner_id %} + {% if (job.status.value == "failed" and + datetime.utcnow() < job.updated + timedelta(minutes=10)) %} + <div class="alert alert-danger"> + <strong>This build job failed.</strong> You may log into the failed + build environment within 10 minutes to examine the results with the + following command: + <pre + style="margin-bottom: 0; margin-top: 1rem;" + >ssh builds@{{job.runner}} connect {{job.id}}</pre> + </div> + {% endif %} + {% endif %} {% for log in logs %} <details open> {% if log["name"] %} diff --git a/worker/context.go b/worker/context.go index a2bd3c3..2007998 100644 --- a/worker/context.go +++ b/worker/context.go @@ -56,16 +56,18 @@ type WorkerContext struct { type JobContext struct { Cancel context.CancelFunc + Claimed bool 
Conf func(section, key string) string Context context.Context Db *sql.DB Deadline time.Time Job *Job + Log *log.Logger LogDir string LogFile *os.File - Log *log.Logger Manifest *Manifest Port int + Settled bool NTasks int Task int @@ -80,6 +82,8 @@ func (wctx *WorkerContext) RunBuild( err error job *Job ctx *JobContext + + cleanup func() ) timer := prometheus.NewTimer(buildDuration) @@ -120,6 +124,9 @@ func (wctx *WorkerContext) RunBuild( job.SetStatus("failed") } ctx.ProcessTriggers() + if ctx.Settled { + ctx.Standby() + } if ctx.Log != nil { ctx.Log.Printf("Error: %v\n", err) ctx.LogFile.Close() @@ -129,6 +136,9 @@ func (wctx *WorkerContext) RunBuild( } failedBuilds.Inc() } + if cleanup != nil { + cleanup() + } }() timeout, _ := time.ParseDuration(conf("builds.sr.ht::worker", "timeout")) @@ -160,8 +170,7 @@ func (wctx *WorkerContext) RunBuild( ctx.Log = log.New(io.MultiWriter(ctx.LogFile, os.Stdout), "[#"+strconv.Itoa(job.Id)+"] ", log.LstdFlags) - cleanup := ctx.Boot(wctx.Redis) - defer cleanup() + cleanup = ctx.Boot(wctx.Redis) tasks := []func() error{ ctx.Settle, @@ -204,6 +213,33 @@ func (wctx *WorkerContext) RunBuild( successfulBuilds.Inc() } +func (ctx *JobContext) Standby() { + ctx.Log.Println("\x1B[1m\x1B[91mBuild failed.\x1B[0m") + ctx.Log.Println("The build environment will be kept alive for 10 minutes.") + ctx.Log.Println("To log in with SSH and examine it, use the following command:") + ctx.Log.Println() + ctx.Log.Printf("\tssh -t builds@%s connect %d", *ctx.Job.Runner, ctx.Job.Id) + ctx.Log.Println() + ctx.Log.Println("After logging in, the deadline is increased to your remaining build time.") + select { + case <-time.After(10*time.Minute): + break + case <-ctx.Context.Done(): + ctx.Log.Println("Build cancelled. Terminating build environment.") + return + } + if ctx.Claimed { + select { + case <-time.After(time.Until(ctx.Deadline)): + break + case <-ctx.Context.Done(): + break + } + } else { + ctx.Log.Println("Deadline elapsed. 
Terminating build environment.") + } +} + func (ctx *JobContext) Control( context context.Context, args ...string) *exec.Cmd { diff --git a/worker/http.go b/worker/http.go index f8ba97a..4ab5a93 100644 --- a/worker/http.go +++ b/worker/http.go @@ -56,6 +56,8 @@ func HttpServer() { w.Write([]byte("404 not found")) } case "cancel": + fallthrough + case "terminate": if r.Method != "POST" { w.WriteHeader(405) w.Write([]byte("405 method not allowed")) @@ -65,7 +67,9 @@ func HttpServer() { defer jobsMutex.Unlock() if job, ok := jobs[jobId]; ok { job.Cancel() - job.Job.SetStatus("cancelled") + if op == "cancel" { + job.Job.SetStatus("cancelled") + } } else { w.WriteHeader(404) w.Write([]byte("404 not found")) @@ -73,6 +77,22 @@ func HttpServer() { } w.WriteHeader(200) w.Write([]byte("cancelled")) + case "claim": + if r.Method != "POST" { + w.WriteHeader(405) + w.Write([]byte("405 method not allowed")) + return + } + jobsMutex.Lock() + defer jobsMutex.Unlock() + if job, ok := jobs[jobId]; ok { + job.Claimed = true + w.WriteHeader(200) + w.Write([]byte("claimed")) + } else { + w.WriteHeader(404) + w.Write([]byte("404 not found")) + } default: w.WriteHeader(404) w.Write([]byte("404 not found")) diff --git a/worker/tasks.go b/worker/tasks.go index 2f849e6..7fdf8df 100644 --- a/worker/tasks.go +++ b/worker/tasks.go @@ -110,6 +110,7 @@ func (ctx *JobContext) Settle() error { stdout, _ := ioutil.ReadAll(pipe) if err := check.Wait(); err == nil { if string(stdout) == "hello world" { + ctx.Settled = true done <- nil return } else { -- 2.38.5