feat: handle upgrade job

This commit is contained in:
2024-12-13 15:00:43 +03:30
parent 89702d287a
commit 9b219d967e
12 changed files with 147 additions and 127 deletions

View File

@@ -2,7 +2,6 @@ package managed_job
import (
"flink-kube-operator/internal/crd/v1alpha1"
"strings"
"time"
"flink-kube-operator/pkg"
@@ -10,83 +9,14 @@ import (
"go.uber.org/zap"
)
// func (job *ManagedJob) startCycle() {
// ticker := time.NewTicker(5 * time.Second)
// quit := make(chan struct{})
// // load job state from db
// job.loadState()
// go func() {
// for {
// select {
// case <-ticker.C:
// job.cycle()
// case <-quit:
// ticker.Stop()
// return
// }
// }
// }()
// }
func (job *ManagedJob) Cycle() {
pkg.Logger.Debug("[managed-job] [new] check cycle", zap.String("jobKey", string(job.def.UID)))
// Init job
if job.def.Status.JobStatus == "" {
if job.def.Status.LastSavepointPath == nil {
if job.def.Status.JarId == nil {
err := job.upload()
if err != nil {
job.crd.Patch(job.def.UID, map[string]interface{}{
"status": map[string]interface{}{
"error": "[upload-error] " + err.Error(),
},
})
return
}
}
for {
err := job.run()
if err != nil {
if strings.ContainsAny(err.Error(), ".jar does not exist") {
err := job.upload()
if err != nil {
job.crd.Patch(job.def.UID, map[string]interface{}{
"status": map[string]interface{}{
"error": "[upload-error] " + err.Error(),
},
})
return
}
continue
}
job.crd.Patch(job.def.UID, map[string]interface{}{
"status": map[string]interface{}{
"error": "[run-error] " + err.Error(),
},
})
return
}
return
}
} else {
job.restore()
}
if job.def.Status.LifeCycleStatus == "" && job.def.Status.JobStatus == "" {
job.run()
return
}
// job.crd.SetJobStatus(job.def.UID, v1alpha1.FlinkJobStatus{
// JobStatus: job.def.Status.JobStatus,
// })
// Check for set running or error state
/* if job.state.Status == JobStatusCreating || job.state.Status == JobStatusFailing {
err := job.checkStatus()
if errors.Is(err, ErrNoJobId) {
job.state = nil
}
return
} */
if job.def.Status.JobStatus == v1alpha1.JobStatusRunning {
if (job.def.Spec.SavepointInterval.Duration != 0) && ((job.def.Status.LastSavepointDate == nil) || time.Now().Add(-job.def.Spec.SavepointInterval.Duration).After(*job.def.Status.LastSavepointDate)) {
@@ -96,16 +26,20 @@ func (job *ManagedJob) Cycle() {
job.trackSavepoint()
}
}
if job.def.Status.RunningJarURI != nil && job.def.Spec.JarURI != *job.def.Status.RunningJarURI {
job.upgrade()
}
return
}
if job.def.Status.JobStatus == v1alpha1.JobStatusFailed && job.def.Status.LastSavepointPath != nil {
job.restore()
return
}
if job.def.Status.JobStatus == v1alpha1.JobStatusFailed && job.def.Status.LastSavepointPath == nil {
job.restore()
return
}
// if job.def.Status.JobStatus == v1alpha1.JobStatusFailed && job.def.Status.LastSavepointPath != nil {
// //job.restore()
// return
// }
// if job.def.Status.JobStatus == v1alpha1.JobStatusFailed && job.def.Status.LastSavepointPath == nil {
// //job.restore()
// return
// }
pkg.Logger.Warn("[managed-job] [cycle]", zap.String("unhanded job status", string(job.def.Status.JobStatus)))
}

View File

@@ -1,7 +1,6 @@
package managed_job
import (
"errors"
"flink-kube-operator/internal/crd/v1alpha1"
"strings"
"time"
@@ -12,8 +11,8 @@ import (
"go.uber.org/zap"
)
// restore the job from savepoint and jarId in managedJob
func (job *ManagedJob) restore() error {
// run the job from savepoint and jarId in managedJob
func (job *ManagedJob) run() error {
var savepointPath string
if job.def.Status.LastSavepointPath == nil {
pkg.Logger.Error("[managed-job] [restore]", zap.Error(v1alpha1.ErrNoSavepointPath))
@@ -21,39 +20,48 @@ func (job *ManagedJob) restore() error {
} else {
savepointPath = *job.def.Status.LastSavepointPath
}
if job.def.Status.JarId == nil {
err := errors.New("missing jar id")
pkg.Logger.Error("[managed-job] [restore]", zap.Error(err))
return err
}
pkg.Logger.Info("[managed-job] [restore] restoring job", zap.String("name", job.def.GetName()), zap.String("savepointPath", savepointPath))
var jobId *string
for {
runJarResp, err := job.client.RunJar(api.RunOpts{
JarID: *job.def.Status.JarId,
AllowNonRestoredState: true,
EntryClass: job.def.Spec.EntryClass,
SavepointPath: savepointPath,
})
if err != nil {
if strings.ContainsAny(err.Error(), ".jar does not exist") {
err := job.upload()
if err != nil {
job.crd.Patch(job.def.UID, map[string]interface{}{
"status": map[string]interface{}{
"error": "[upload-error] " + err.Error(),
},
})
return nil
shouldUpload := false
if job.def.Status.JarId == nil {
err := v1alpha1.ErrNoJarId
pkg.Logger.Error("[managed-job] [run]", zap.Error(err))
shouldUpload = true
} else {
runJarResp, err := job.client.RunJar(api.RunOpts{
JarID: *job.def.Status.JarId,
AllowNonRestoredState: true,
EntryClass: job.def.Spec.EntryClass,
SavepointPath: savepointPath,
})
if err == nil {
pkg.Logger.Info("[managed-job] [run] jar successfully ran", zap.Any("run-jar-resp", runJarResp))
jobId = &runJarResp.JobId
break
} else {
if strings.ContainsAny(err.Error(), ".jar does not exist") {
shouldUpload = true
} else {
pkg.Logger.Error("[managed-job] [run] unhandled jar run Flink error", zap.Error(err))
}
continue
}
pkg.Logger.Error("[managed-job] [restore]", zap.Error(err))
return err
}
jobId = &runJarResp.JobId
pkg.Logger.Debug("[main] after run jar", zap.Any("run-jar-resp", runJarResp))
break
if shouldUpload {
err := job.upload()
if err != nil {
job.crd.Patch(job.def.UID, map[string]interface{}{
"status": map[string]interface{}{
"error": "[upload-error] " + err.Error(),
},
})
return nil
}
continue
}
return nil
}
// job.def.Status.JobId = &runJarResp.JobId
@@ -62,6 +70,7 @@ func (job *ManagedJob) restore() error {
job.crd.Patch(job.def.UID, map[string]interface{}{
"status": map[string]interface{}{
"jobId": jobId,
"runningJarURI": job.def.Spec.JarURI,
"jobStatus": v1alpha1.JobStatusCreating,
"lifeCycleStatus": v1alpha1.LifeCycleStatusRestoring,
"lastRestoredSavepointDate": job.def.Status.LastSavepointDate,

View File

@@ -0,0 +1,28 @@
package managed_job
import (
"flink-kube-operator/internal/crd/v1alpha1"
"flink-kube-operator/pkg"
"go.uber.org/zap"
)
func (job *ManagedJob) upgrade() {
if job.def.Status.LastSavepointPath != nil {
pkg.Logger.Info("upgrading job ",
zap.String("jobName", job.def.GetName()),
zap.String("currentJarURI", job.def.Spec.JarURI),
zap.String("prevJarURI", *job.def.Status.RunningJarURI),
)
job.run()
} else {
err := "There is no savepoint path existing"
pkg.Logger.Error(err)
job.crd.Patch(job.def.UID, map[string]interface{}{
"status": map[string]interface{}{
"lifeCycleStatus": v1alpha1.LifeCycleStatusUpgradeFailed,
"error": err,
},
})
}
}

View File

@@ -0,0 +1,36 @@
package managed_job
import (
"flink-kube-operator/internal/jar"
"flink-kube-operator/pkg"
"go.uber.org/zap"
)
// upload jar file and set the jarId for later usages
func (job *ManagedJob) upload() error {
jarFile, err := jar.NewJarFile(job.def.Spec.JarURI)
if err != nil {
pkg.Logger.Debug("[main] error on download jar", zap.Error(err))
return err
}
jarId, err := jarFile.Upload(job.client)
if err != nil {
pkg.Logger.Debug("[main] error on upload jar", zap.Error(err))
return err
}
err = jarFile.Delete()
if err != nil {
pkg.Logger.Debug("[main] error on delete jar", zap.Error(err))
}
pkg.Logger.Debug("[main] after upload jar", zap.Any("upload-jar-resp", jarId))
job.def.Status.JarId = &jarId
job.crd.Patch(job.def.UID, map[string]interface{}{
"status": map[string]interface{}{
"jarId": job.def.Status.JarId,
},
})
return nil
}