From e223b43496a608745d0513fc47175feb27015c6f Mon Sep 17 00:00:00 2001
From: Piotr Roslaniec
Date: Thu, 9 Apr 2026 15:42:49 +0000
Subject: [PATCH 01/17] fix(covenantsigner): rename misleading test after
resilient loading change
TestStoreLoadFailsOnInvalidUpdatedAtForDuplicateRouteKeys now asserts
success (resilient loading), not failure. Rename to reflect actual
behavior.
---
pkg/covenantsigner/store_test.go | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pkg/covenantsigner/store_test.go b/pkg/covenantsigner/store_test.go
index fd271530ed..5c1e1589e8 100644
--- a/pkg/covenantsigner/store_test.go
+++ b/pkg/covenantsigner/store_test.go
@@ -205,7 +205,7 @@ func TestStoreLoadSelectsNewestJobForDuplicateRouteKeys(t *testing.T) {
}
}
-func TestStoreLoadFailsOnInvalidUpdatedAtForDuplicateRouteKeys(t *testing.T) {
+func TestStoreLoadResolvesInvalidUpdatedAtForDuplicateRouteKeys(t *testing.T) {
handle := newMemoryHandle()
first := &Job{
From 1ed010f8c0fce79f2937ee00afc72812e905362f Mon Sep 17 00:00:00 2001
From: Piotr Roslaniec
Date: Thu, 9 Apr 2026 15:43:53 +0000
Subject: [PATCH 02/17] fix(covenantsigner): use errors.Is for errJobNotFound
comparison in Poll
Direct == comparison is correct today since errJobNotFound is never
wrapped, but errors.Is is more resilient to future wrapping changes.
---
pkg/covenantsigner/service.go | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/pkg/covenantsigner/service.go b/pkg/covenantsigner/service.go
index 3d112f9e3f..1114b1382b 100644
--- a/pkg/covenantsigner/service.go
+++ b/pkg/covenantsigner/service.go
@@ -4,6 +4,7 @@ import (
"context"
"crypto/rand"
"encoding/hex"
+ "errors"
"fmt"
"reflect"
"sync"
@@ -378,7 +379,7 @@ func (s *Service) Poll(ctx context.Context, route TemplateID, input SignerPollIn
transition, pollErr := s.engine.OnPoll(ctx, job)
if pollErr != nil {
- if pollErr != errJobNotFound {
+ if !errors.Is(pollErr, errJobNotFound) {
return StepResult{}, pollErr
}
}
@@ -398,7 +399,7 @@ func (s *Service) Poll(ctx context.Context, route TemplateID, input SignerPollIn
return mapJobResult(currentJob), nil
}
- if pollErr == errJobNotFound {
+ if errors.Is(pollErr, errJobNotFound) {
applyTransition(currentJob, &Transition{
State: JobStateFailed,
Reason: ReasonJobNotFound,
From 31578278aaaf953321cbe0b0c323c43afcd940e0 Mon Sep 17 00:00:00 2001
From: Piotr Roslaniec
Date: Thu, 9 Apr 2026 15:44:04 +0000
Subject: [PATCH 03/17] fix(covenantsigner): restrict healthz auth bypass to
GET method
The auth bypass checked only the path, allowing any HTTP method to
skip bearer auth on /healthz. Restrict to GET to match the registered
handler and prevent unintended bypass on other methods.
---
pkg/covenantsigner/server.go | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pkg/covenantsigner/server.go b/pkg/covenantsigner/server.go
index 96ad083120..0bf38bf3fb 100644
--- a/pkg/covenantsigner/server.go
+++ b/pkg/covenantsigner/server.go
@@ -251,7 +251,7 @@ func newHandler(service *Service, serviceCtx context.Context, authToken string,
}
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
- if r.URL.Path == "/healthz" {
+ if r.Method == http.MethodGet && r.URL.Path == "/healthz" {
mux.ServeHTTP(w, r)
return
}
From b1482fee48c6ea2a4dd30403478640c317fd582d Mon Sep 17 00:00:00 2001
From: Piotr Roslaniec
Date: Thu, 9 Apr 2026 15:44:24 +0000
Subject: [PATCH 04/17] docs(covenantsigner): warn against CLI flag for
AuthToken
The auth token is visible in /proc/PID/cmdline when passed as a CLI
flag. Add documentation recommending environment variables or config
files for non-loopback deployments.
---
pkg/covenantsigner/config.go | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/pkg/covenantsigner/config.go b/pkg/covenantsigner/config.go
index 16ede9a4f9..e11f7fec0a 100644
--- a/pkg/covenantsigner/config.go
+++ b/pkg/covenantsigner/config.go
@@ -10,7 +10,9 @@ type Config struct {
// binds to. Empty defaults to loopback-only.
ListenAddress string
// AuthToken enables static Bearer authentication for signer endpoints.
- // Non-loopback binds must set this.
+ // Non-loopback binds must set this. Prefer environment variables or
+ // config files over CLI flags to avoid exposing the token in
+ // /proc/PID/cmdline.
AuthToken string
// EnableSelfV1 exposes the self_v1 signer HTTP routes. Keep this disabled
// for a qc_v1-first launch unless self_v1 has cleared its own go-live gate.
From 5cf862d36f3cf3497883ac182d78b7a6ef82ceb4 Mon Sep 17 00:00:00 2001
From: Piotr Roslaniec
Date: Thu, 9 Apr 2026 15:45:14 +0000
Subject: [PATCH 05/17] fix(covenantsigner): add aggregate load summary with
skip count
Operators previously saw only individual warnings per corrupt file but
had no summary of total loaded vs skipped. Add a summary log line at
the end of load() for operational visibility.
---
pkg/covenantsigner/store.go | 15 +++++++++++++++
1 file changed, 15 insertions(+)
diff --git a/pkg/covenantsigner/store.go b/pkg/covenantsigner/store.go
index 3684edb73e..e5432bdbf8 100644
--- a/pkg/covenantsigner/store.go
+++ b/pkg/covenantsigner/store.go
@@ -155,6 +155,8 @@ func (s *Store) load() error {
dataChan, errorChan := s.handle.ReadAll()
+ var loaded, skipped int
+
for dataChan != nil || errorChan != nil {
select {
case descriptor, ok := <-dataChan:
@@ -174,6 +176,7 @@ func (s *Store) load() error {
descriptor.Name(),
err,
)
+ skipped++
continue
}
@@ -184,6 +187,7 @@ func (s *Store) load() error {
descriptor.Name(),
err,
)
+ skipped++
continue
}
@@ -226,6 +230,7 @@ func (s *Store) load() error {
s.byRequestID[job.RequestID] = job
s.byRouteKey[key] = job.RequestID
+ loaded++
case err, ok := <-errorChan:
if !ok {
errorChan = nil
@@ -237,6 +242,16 @@ func (s *Store) load() error {
}
}
+ if skipped > 0 {
+ logger.Warnf(
+ "store load complete: loaded [%d] jobs, skipped [%d] unreadable or malformed files",
+ loaded,
+ skipped,
+ )
+ } else if loaded > 0 {
+ logger.Infof("store load complete: loaded [%d] jobs", loaded)
+ }
+
return nil
}
From cc6943454041706e58ee489243d606008b139ade Mon Sep 17 00:00:00 2001
From: Piotr Roslaniec
Date: Thu, 9 Apr 2026 15:45:39 +0000
Subject: [PATCH 06/17] fix(covenantsigner): remove superseded job from
byRequestID on dedup
When load replaces a job during route-key deduplication, the superseded
job's entry remained in byRequestID, leaving a stale job in the primary
index that no route key referenced any longer.
---
pkg/covenantsigner/store.go | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/pkg/covenantsigner/store.go b/pkg/covenantsigner/store.go
index e5432bdbf8..c5ac36b699 100644
--- a/pkg/covenantsigner/store.go
+++ b/pkg/covenantsigner/store.go
@@ -225,6 +225,10 @@ func (s *Store) load() error {
} else if existingIsNewerOrSame {
continue
}
+
+ // Remove the superseded job from the primary index
+ // so stale entries do not leak in byRequestID.
+ delete(s.byRequestID, existingID)
}
}
From 065519c3276716b688db05408dc65fc06676954e Mon Sep 17 00:00:00 2001
From: Piotr Roslaniec
Date: Thu, 9 Apr 2026 15:46:30 +0000
Subject: [PATCH 07/17] fix(covenantsigner): use deterministic tiebreaker when
both timestamps unparseable
When two duplicate-route-key jobs both have unparseable timestamps, the
winner previously depended on non-deterministic file iteration order.
Use lexicographic RequestID comparison as a stable tiebreaker.
---
pkg/covenantsigner/store.go | 35 +++++++++++++++++++++++++----------
1 file changed, 25 insertions(+), 10 deletions(-)
diff --git a/pkg/covenantsigner/store.go b/pkg/covenantsigner/store.go
index c5ac36b699..d3922f4350 100644
--- a/pkg/covenantsigner/store.go
+++ b/pkg/covenantsigner/store.go
@@ -206,22 +206,37 @@ func (s *Store) load() error {
// the existing job -- replace it. Otherwise skip the
// candidate.
if _, parseErr := time.Parse(time.RFC3339Nano, job.UpdatedAt); parseErr != nil {
+ // Both timestamps are unparseable. Use
+ // lexicographic RequestID as a deterministic
+ // tiebreaker so the outcome does not depend on
+ // file iteration order.
+ if existing.RequestID <= job.RequestID {
+ logger.Warnf(
+ "skipping job [%s] with invalid timestamp on duplicate route key [%s/%s] (keeping [%s]): [%v]",
+ job.RequestID,
+ job.Route,
+ job.RouteRequestID,
+ existing.RequestID,
+ err,
+ )
+ continue
+ }
logger.Warnf(
- "skipping job [%s] with invalid timestamp on duplicate route key [%s/%s]: [%v]",
- job.RequestID,
+ "replacing job [%s] with invalid timestamp on duplicate route key [%s/%s] (both unparseable, lexicographic tiebreak): [%v]",
+ existing.RequestID,
+ job.Route,
+ job.RouteRequestID,
+ err,
+ )
+ } else {
+ logger.Warnf(
+ "replacing job [%s] with invalid timestamp on duplicate route key [%s/%s]: [%v]",
+ existing.RequestID,
job.Route,
job.RouteRequestID,
err,
)
- continue
}
- logger.Warnf(
- "replacing job [%s] with invalid timestamp on duplicate route key [%s/%s]: [%v]",
- existing.RequestID,
- job.Route,
- job.RouteRequestID,
- err,
- )
} else if existingIsNewerOrSame {
continue
}
From 495caf39da02e0de74577065384296c386a67f15 Mon Sep 17 00:00:00 2001
From: Piotr Roslaniec
Date: Thu, 9 Apr 2026 16:00:53 +0000
Subject: [PATCH 08/17] fix(covenantsigner): poison route keys from skipped
jobs to preserve dedupe
When load() skips a malformed job file, GetByRouteRequest can no longer
find that job. A retry then silently creates a duplicate signing job,
breaking node-local idempotency. Fix by partially parsing skipped files
to extract route keys and marking them as poisoned. GetByRouteRequest
returns an error for poisoned keys, forcing the caller to investigate
rather than creating a duplicate.
---
pkg/covenantsigner/store.go | 73 ++++++++++++++++++++++++++++++++-----
1 file changed, 64 insertions(+), 9 deletions(-)
diff --git a/pkg/covenantsigner/store.go b/pkg/covenantsigner/store.go
index d3922f4350..9cedaad287 100644
--- a/pkg/covenantsigner/store.go
+++ b/pkg/covenantsigner/store.go
@@ -16,11 +16,13 @@ const jobsDirectory = "covenant-signer/jobs"
const lockFileName = ".lock"
type Store struct {
- handle persistence.BasicHandle
- mutex sync.Mutex
- lockFile *os.File
- byRequestID map[string]*Job
- byRouteKey map[string]string
+ handle persistence.BasicHandle
+ mutex sync.Mutex
+ lockFile *os.File
+ byRequestID map[string]*Job
+ byRouteKey map[string]string
+ poisonedRoutes map[string]bool
+ skippedJobFiles []string
}
// NewStore creates a new Store backed by the given persistence handle. When
@@ -30,9 +32,10 @@ type Store struct {
// error. When dataDir is empty (in-memory handles), file locking is skipped.
func NewStore(handle persistence.BasicHandle, dataDir string) (*Store, error) {
store := &Store{
- handle: handle,
- byRequestID: make(map[string]*Job),
- byRouteKey: make(map[string]string),
+ handle: handle,
+ byRequestID: make(map[string]*Job),
+ byRouteKey: make(map[string]string),
+ poisonedRoutes: make(map[string]bool),
}
if dataDir != "" {
@@ -107,10 +110,50 @@ func (s *Store) Close() error {
return err
}
+var errPoisonedRouteKey = fmt.Errorf(
+ "route key belongs to a job that could not be loaded; " +
+ "manual recovery of the corrupt job file is required",
+)
+
func routeKey(route TemplateID, routeRequestID string) string {
return fmt.Sprintf("%s:%s", route, routeRequestID)
}
+// poisonRouteFromPartialJob attempts a lenient parse of content to extract
+// Route and RouteRequestID. If successful, the route key is marked as
+// poisoned so that future submissions are rejected rather than silently
+// creating a duplicate job.
+func (s *Store) poisonRouteFromPartialJob(content []byte, fileName string) {
+ var partial struct {
+ Route TemplateID `json:"Route"`
+ RouteRequestID string `json:"RouteRequestID"`
+ }
+ if err := json.Unmarshal(content, &partial); err != nil {
+ return
+ }
+ if partial.Route == "" || partial.RouteRequestID == "" {
+ return
+ }
+ key := routeKey(partial.Route, partial.RouteRequestID)
+ s.poisonedRoutes[key] = true
+ logger.Warnf(
+ "poisoned route key [%s] from skipped job file [%s]",
+ key,
+ fileName,
+ )
+}
+
+// SkippedJobFiles returns the file names of job files that could not be
+// loaded during startup. Operators should investigate and repair or remove
+// these files.
+func (s *Store) SkippedJobFiles() []string {
+ s.mutex.Lock()
+ defer s.mutex.Unlock()
+ result := make([]string, len(s.skippedJobFiles))
+ copy(result, s.skippedJobFiles)
+ return result
+}
+
func cloneJob(job *Job) (*Job, error) {
payload, err := json.Marshal(job)
if err != nil {
@@ -176,6 +219,7 @@ func (s *Store) load() error {
descriptor.Name(),
err,
)
+ s.skippedJobFiles = append(s.skippedJobFiles, descriptor.Name())
skipped++
continue
}
@@ -187,6 +231,11 @@ func (s *Store) load() error {
descriptor.Name(),
err,
)
+ // Attempt partial parse to extract route info for
+ // poisoning. If the route key is recoverable, block
+ // future submissions for this route to preserve dedupe.
+ s.poisonRouteFromPartialJob(content, descriptor.Name())
+ s.skippedJobFiles = append(s.skippedJobFiles, descriptor.Name())
skipped++
continue
}
@@ -295,7 +344,13 @@ func (s *Store) GetByRouteRequest(route TemplateID, routeRequestID string) (*Job
s.mutex.Lock()
defer s.mutex.Unlock()
- requestID, ok := s.byRouteKey[routeKey(route, routeRequestID)]
+ key := routeKey(route, routeRequestID)
+
+ if s.poisonedRoutes[key] {
+ return nil, false, errPoisonedRouteKey
+ }
+
+ requestID, ok := s.byRouteKey[key]
if !ok {
return nil, false, nil
}
From 9eb4194b39a470d8b1989cf0285563c488e21b41 Mon Sep 17 00:00:00 2001
From: Piotr Roslaniec
Date: Thu, 9 Apr 2026 16:02:53 +0000
Subject: [PATCH 09/17] fix(covenantsigner): extract Submit critical section
into createOrDedup
The Submit method had 5 separate mutex.Unlock() call sites, making the
locking pattern fragile for a security-critical signing path. Extract
the dedup-check-and-create logic into a helper that uses defer Unlock,
reducing the main Submit method to two clean lock scopes.
---
pkg/covenantsigner/service.go | 104 ++++++++++++++++++++--------------
1 file changed, 60 insertions(+), 44 deletions(-)
diff --git a/pkg/covenantsigner/service.go b/pkg/covenantsigner/service.go
index 1114b1382b..fa339844d8 100644
--- a/pkg/covenantsigner/service.go
+++ b/pkg/covenantsigner/service.go
@@ -230,50 +230,28 @@ func (s *Service) loadPollJob(route TemplateID, input SignerPollInput) (*Job, er
return job, nil
}
-func (s *Service) Submit(ctx context.Context, route TemplateID, input SignerSubmitInput) (StepResult, error) {
- submitValidationOptions := validationOptions{
- migrationPlanQuoteTrustRoots: s.migrationPlanQuoteTrustRoots,
- depositorTrustRoots: s.depositorTrustRoots,
- custodianTrustRoots: s.custodianTrustRoots,
- requireFreshMigrationPlanQuote: true,
- migrationPlanQuoteVerificationNow: s.now(),
- signerApprovalVerifier: s.signerApprovalVerifier,
- }
- if err := validateSubmitInput(route, input, submitValidationOptions); err != nil {
- return StepResult{}, err
- }
-
- normalizedRequest, err := normalizeRouteSubmitRequest(
- input.Request,
- validationOptions{
- migrationPlanQuoteTrustRoots: s.migrationPlanQuoteTrustRoots,
- depositorTrustRoots: s.depositorTrustRoots,
- custodianTrustRoots: s.custodianTrustRoots,
- signerApprovalVerifier: s.signerApprovalVerifier,
- },
- )
- if err != nil {
- return StepResult{}, err
- }
-
- requestDigest, err := requestDigestFromNormalized(normalizedRequest)
- if err != nil {
- return StepResult{}, err
- }
-
+// createOrDedup creates a new job under the service mutex, or returns the
+// existing job result if the route request is already known. Returns
+// (job, nil, nil) for a new job, or (nil, result, nil) for a dedup hit.
+func (s *Service) createOrDedup(
+ route TemplateID,
+ input SignerSubmitInput,
+ normalizedRequest RouteSubmitRequest,
+ requestDigest string,
+) (*Job, *StepResult, error) {
s.mutex.Lock()
+ defer s.mutex.Unlock()
+
if existing, ok, err := s.store.GetByRouteRequest(route, input.RouteRequestID); err != nil {
- s.mutex.Unlock()
- return StepResult{}, err
+ return nil, nil, err
} else if ok {
if existing.RequestDigest != requestDigest {
- s.mutex.Unlock()
- return StepResult{}, &inputError{
+ return nil, nil, &inputError{
"routeRequestId already exists with a different request payload",
}
}
- s.mutex.Unlock()
- return mapJobResult(existing), nil
+ result := mapJobResult(existing)
+ return nil, &result, nil
}
requestIDPrefix := ""
@@ -283,14 +261,12 @@ func (s *Service) Submit(ctx context.Context, route TemplateID, input SignerSubm
case TemplateSelfV1:
requestIDPrefix = "kcs_self"
default:
- s.mutex.Unlock()
- return StepResult{}, fmt.Errorf("unsupported route: %s", route)
+ return nil, nil, fmt.Errorf("unsupported route: %s", route)
}
requestID, err := newRequestID(requestIDPrefix)
if err != nil {
- s.mutex.Unlock()
- return StepResult{}, err
+ return nil, nil, err
}
now := s.now()
@@ -310,10 +286,50 @@ func (s *Service) Submit(ctx context.Context, route TemplateID, input SignerSubm
}
if err := s.store.Put(job); err != nil {
- s.mutex.Unlock()
+ return nil, nil, err
+ }
+
+ return job, nil, nil
+}
+
+func (s *Service) Submit(ctx context.Context, route TemplateID, input SignerSubmitInput) (StepResult, error) {
+ submitValidationOptions := validationOptions{
+ migrationPlanQuoteTrustRoots: s.migrationPlanQuoteTrustRoots,
+ depositorTrustRoots: s.depositorTrustRoots,
+ custodianTrustRoots: s.custodianTrustRoots,
+ requireFreshMigrationPlanQuote: true,
+ migrationPlanQuoteVerificationNow: s.now(),
+ signerApprovalVerifier: s.signerApprovalVerifier,
+ }
+ if err := validateSubmitInput(route, input, submitValidationOptions); err != nil {
return StepResult{}, err
}
- s.mutex.Unlock()
+
+ normalizedRequest, err := normalizeRouteSubmitRequest(
+ input.Request,
+ validationOptions{
+ migrationPlanQuoteTrustRoots: s.migrationPlanQuoteTrustRoots,
+ depositorTrustRoots: s.depositorTrustRoots,
+ custodianTrustRoots: s.custodianTrustRoots,
+ signerApprovalVerifier: s.signerApprovalVerifier,
+ },
+ )
+ if err != nil {
+ return StepResult{}, err
+ }
+
+ requestDigest, err := requestDigestFromNormalized(normalizedRequest)
+ if err != nil {
+ return StepResult{}, err
+ }
+
+ job, existingResult, err := s.createOrDedup(route, input, normalizedRequest, requestDigest)
+ if err != nil {
+ return StepResult{}, err
+ }
+ if existingResult != nil {
+ return *existingResult, nil
+ }
transition, err := s.engine.OnSubmit(ctx, job)
if err != nil {
@@ -330,7 +346,7 @@ func (s *Service) Submit(ctx context.Context, route TemplateID, input SignerSubm
s.mutex.Lock()
defer s.mutex.Unlock()
- currentJob, ok, err := s.store.GetByRequestID(requestID)
+ currentJob, ok, err := s.store.GetByRequestID(job.RequestID)
if err != nil {
return StepResult{}, err
}
From 07b9bccd20132554e65b2bf1526c2b77b8f50d3f Mon Sep 17 00:00:00 2001
From: Piotr Roslaniec
Date: Thu, 9 Apr 2026 16:03:35 +0000
Subject: [PATCH 10/17] fix(covenantsigner): cancel service context on init
failure and OS signals
Two fixes:
- Call cancelService() when net.Listen fails after context creation to
prevent a context leak on initialization error.
- Add SIGINT/SIGTERM signal handling so in-flight signing operations are
cancelled promptly on any shutdown path, not only when the parent
context is cancelled.
---
pkg/covenantsigner/server.go | 18 +++++++++++++++++-
1 file changed, 17 insertions(+), 1 deletion(-)
diff --git a/pkg/covenantsigner/server.go b/pkg/covenantsigner/server.go
index 0bf38bf3fb..4679c8876a 100644
--- a/pkg/covenantsigner/server.go
+++ b/pkg/covenantsigner/server.go
@@ -10,8 +10,10 @@ import (
"net"
"net/http"
"net/url"
+ "os/signal"
"strconv"
"strings"
+ "syscall"
"time"
"github.com/ipfs/go-log/v2"
@@ -124,11 +126,25 @@ func Initialize(
listener, err := net.Listen("tcp", server.httpServer.Addr)
if err != nil {
+ cancelService()
return nil, false, fmt.Errorf("failed to bind covenant signer port [%d]: %w", config.Port, err)
}
+ // Listen for both the parent context cancellation and OS signals so
+ // that in-flight signing operations are cancelled promptly on any
+ // shutdown path, including SIGINT/SIGTERM.
+ signalCtx, stopSignal := signal.NotifyContext(
+ context.Background(),
+ syscall.SIGINT,
+ syscall.SIGTERM,
+ )
+
go func() {
- <-ctx.Done()
+ select {
+ case <-ctx.Done():
+ case <-signalCtx.Done():
+ }
+ stopSignal()
// Cancel the service context so in-flight threshold signing
// operations observe shutdown and terminate promptly.
From 272b66111b520a56bf66364a91e2c6b5b38a9797 Mon Sep 17 00:00:00 2001
From: Piotr Roslaniec
Date: Thu, 9 Apr 2026 16:03:50 +0000
Subject: [PATCH 11/17] docs(covenantsigner): document advisory flock
limitations and storage requirements
flock(2) is an advisory, BSD-derived lock that is not specified by
POSIX. Document that the data directory must use local or block-level
storage with single-writer access, not network filesystems.
---
pkg/covenantsigner/store.go | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/pkg/covenantsigner/store.go b/pkg/covenantsigner/store.go
index 9cedaad287..526d6b7256 100644
--- a/pkg/covenantsigner/store.go
+++ b/pkg/covenantsigner/store.go
@@ -58,6 +58,12 @@ func NewStore(handle persistence.BasicHandle, dataDir string) (*Store, error) {
// acquireFileLock creates and acquires an exclusive non-blocking advisory lock
// on a lock file inside the jobs directory. The returned file handle must be
// kept open for the lifetime of the lock; closing it releases the lock.
+//
+// IMPORTANT: This uses flock(2), an advisory BSD-style lock (not POSIX).
+// It protects against concurrent processes on the same host but does NOT
+// protect against concurrent access over network filesystems (NFS, EFS,
+// CIFS). The data directory MUST reside on local or block-level storage
+// with single-writer access (e.g., Kubernetes ReadWriteOnce PV).
func acquireFileLock(dataDir string) (*os.File, error) {
lockPath := filepath.Join(dataDir, jobsDirectory, lockFileName)
From 795f50b8ad460f14a8d2f6720598733264e5cdef Mon Sep 17 00:00:00 2001
From: Piotr Roslaniec
Date: Thu, 9 Apr 2026 16:04:30 +0000
Subject: [PATCH 12/17] fix(tbtc): improve error messages and docs for degraded
wallet registry
Three improvements for operator visibility during registry outages:
- Sentinel errors now mention that the wallet registry may be
unavailable, helping operators distinguish registry failures from
genuinely missing data.
- GetWallet log elevated from Warn to Error with actionable message
explaining that signer approval operations will fail.
- WalletChainData godoc documents zero-value semantics for registry-
sourced fields.
---
pkg/chain/ethereum/tbtc.go | 6 ++++--
pkg/tbtc/chain.go | 6 ++++++
pkg/tbtc/signer_approval_certificate.go | 10 ++++++++--
3 files changed, 18 insertions(+), 4 deletions(-)
diff --git a/pkg/chain/ethereum/tbtc.go b/pkg/chain/ethereum/tbtc.go
index d2d9efaf63..5b359c2cae 100644
--- a/pkg/chain/ethereum/tbtc.go
+++ b/pkg/chain/ethereum/tbtc.go
@@ -1478,8 +1478,10 @@ func (tc *TbtcChain) GetWallet(
walletRegistryWallet, err := tc.walletRegistry.GetWallet(wallet.EcdsaWalletID)
if err != nil {
- logger.Warnf(
- "cannot get wallet registry data for wallet [0x%x]: [%v]",
+ logger.Errorf(
+ "wallet registry unavailable for wallet [0x%x]; "+
+ "MembersIDsHash will be zero -- signer approval "+
+ "operations will fail until the registry recovers: [%v]",
wallet.EcdsaWalletID,
err,
)
diff --git a/pkg/tbtc/chain.go b/pkg/tbtc/chain.go
index c70e4b73c0..391906c91a 100644
--- a/pkg/tbtc/chain.go
+++ b/pkg/tbtc/chain.go
@@ -415,6 +415,12 @@ type DepositChainRequest struct {
}
// WalletChainData represents wallet data stored on-chain.
+//
+// EcdsaWalletID and MembersIDsHash are sourced from the wallet registry.
+// When the registry is unavailable during a fault-isolated GetWallet call,
+// these fields contain their zero values. Consumers that require registry
+// data (e.g. signer approval certificate computation) must guard against
+// zero values -- see ErrMissingWalletID and ErrMissingMembersIDsHash.
type WalletChainData struct {
EcdsaWalletID [32]byte
MembersIDsHash [32]byte
diff --git a/pkg/tbtc/signer_approval_certificate.go b/pkg/tbtc/signer_approval_certificate.go
index 72a5dfd5c8..442e995ccd 100644
--- a/pkg/tbtc/signer_approval_certificate.go
+++ b/pkg/tbtc/signer_approval_certificate.go
@@ -22,12 +22,18 @@ var (
// ErrMissingWalletID is returned when wallet chain data does not
// include a wallet ID, typically because the wallet registry was
// unavailable during a fault-isolated GetWallet call.
- ErrMissingWalletID = fmt.Errorf("wallet chain data must include wallet ID")
+ ErrMissingWalletID = fmt.Errorf(
+ "wallet chain data must include wallet ID; " +
+ "the wallet registry may be unavailable",
+ )
// ErrMissingMembersIDsHash is returned when wallet chain data does
// not include a members IDs hash, typically because the wallet
// registry was unavailable during a fault-isolated GetWallet call.
- ErrMissingMembersIDsHash = fmt.Errorf("wallet chain data must include members IDs hash")
+ ErrMissingMembersIDsHash = fmt.Errorf(
+ "wallet chain data must include members IDs hash; " +
+ "the wallet registry may be unavailable",
+ )
)
const (
From 054e261b91ac1073664a982a93f0b3e53e1bd5f8 Mon Sep 17 00:00:00 2001
From: Piotr Roslaniec
Date: Thu, 9 Apr 2026 16:05:42 +0000
Subject: [PATCH 13/17] fix(tbtc): use canonicaljson.Marshal for handoff
payload hash
Switch from encoding/json.Marshal to canonicaljson.Marshal for the
content-addressed handoff bundle ID. Both produce identical output for
current payloads (alphabetical key ordering, no HTML content), but
canonicaljson explicitly disables HTML escaping, making the
serialization contract clearer for non-Go consumers.
---
pkg/tbtc/covenant_signer.go | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/pkg/tbtc/covenant_signer.go b/pkg/tbtc/covenant_signer.go
index ba1ed8b7e2..9858c9cf77 100644
--- a/pkg/tbtc/covenant_signer.go
+++ b/pkg/tbtc/covenant_signer.go
@@ -16,6 +16,7 @@ import (
"github.com/btcsuite/btcd/txscript"
"github.com/keep-network/keep-core/pkg/bitcoin"
"github.com/keep-network/keep-core/pkg/covenantsigner"
+ "github.com/keep-network/keep-core/pkg/internal/canonicaljson"
"github.com/keep-network/keep-core/pkg/tecdsa"
)
@@ -870,10 +871,14 @@ func buildWitnessSignatureBytes(signature *tecdsa.Signature) ([]byte, error) {
}
func computeQcV1SignerHandoffPayloadHash(payload map[string]any) (string, error) {
- // The handoff bundle ID is content-addressed using Go's stable JSON map-key
- // ordering. Future non-Go custodian consumers that want to recompute this
- // hash must preserve the same canonical field set and serialization rules.
- rawPayload, err := json.Marshal(payload)
+ // The handoff bundle ID is content-addressed using canonical JSON
+ // (alphabetical key ordering, no HTML escaping, no trailing newline).
+ // Go's encoding/json.Marshal always emits map keys in sorted order,
+ // so using canonicaljson.Marshal produces identical
+ // output for non-HTML content while also disabling HTML escaping for
+ // safety. Non-Go custodian consumers that recompute this hash must
+ // use the same canonical serialization rules.
+ rawPayload, err := canonicaljson.Marshal(payload)
if err != nil {
return "", err
}
From 5ef7617093acae7c6d2780e2ac226a40b0831f09 Mon Sep 17 00:00:00 2001
From: Piotr Roslaniec
Date: Thu, 9 Apr 2026 16:07:06 +0000
Subject: [PATCH 14/17] fix(covenantsigner): correctly distinguish single vs
both unparseable timestamps
The earlier tiebreaker fix incorrectly entered the "both unparseable"
branch when only the candidate had an invalid timestamp. Parse both
timestamps explicitly to distinguish three cases: only candidate bad
(keep existing), only existing bad (replace), both bad (lexicographic
tiebreak).
---
pkg/covenantsigner/store.go | 42 +++++++++++++++++++++++++------------
1 file changed, 29 insertions(+), 13 deletions(-)
diff --git a/pkg/covenantsigner/store.go b/pkg/covenantsigner/store.go
index 526d6b7256..d0407824bb 100644
--- a/pkg/covenantsigner/store.go
+++ b/pkg/covenantsigner/store.go
@@ -260,14 +260,38 @@ func (s *Store) load() error {
// candidate's timestamp is valid, the failure is on
// the existing job -- replace it. Otherwise skip the
// candidate.
- if _, parseErr := time.Parse(time.RFC3339Nano, job.UpdatedAt); parseErr != nil {
+ _, existingParseErr := time.Parse(time.RFC3339Nano, existing.UpdatedAt)
+ _, candidateParseErr := time.Parse(time.RFC3339Nano, job.UpdatedAt)
+
+ switch {
+ case candidateParseErr != nil && existingParseErr == nil:
+ // Only the candidate is unparseable; keep existing.
+ logger.Warnf(
+ "skipping job [%s] with invalid timestamp on duplicate route key [%s/%s] (keeping [%s]): [%v]",
+ job.RequestID,
+ job.Route,
+ job.RouteRequestID,
+ existing.RequestID,
+ err,
+ )
+ continue
+ case candidateParseErr == nil && existingParseErr != nil:
+ // Only the existing is unparseable; replace with candidate.
+ logger.Warnf(
+ "replacing job [%s] with invalid timestamp on duplicate route key [%s/%s]: [%v]",
+ existing.RequestID,
+ job.Route,
+ job.RouteRequestID,
+ err,
+ )
+ default:
// Both timestamps are unparseable. Use
// lexicographic RequestID as a deterministic
- // tiebreaker so the outcome does not depend on
- // file iteration order.
+ // tiebreaker so the outcome does not depend
+ // on file iteration order.
if existing.RequestID <= job.RequestID {
logger.Warnf(
- "skipping job [%s] with invalid timestamp on duplicate route key [%s/%s] (keeping [%s]): [%v]",
+ "skipping job [%s] on duplicate route key [%s/%s] (keeping [%s], lexicographic tiebreak): [%v]",
job.RequestID,
job.Route,
job.RouteRequestID,
@@ -277,15 +301,7 @@ func (s *Store) load() error {
continue
}
logger.Warnf(
- "replacing job [%s] with invalid timestamp on duplicate route key [%s/%s] (both unparseable, lexicographic tiebreak): [%v]",
- existing.RequestID,
- job.Route,
- job.RouteRequestID,
- err,
- )
- } else {
- logger.Warnf(
- "replacing job [%s] with invalid timestamp on duplicate route key [%s/%s]: [%v]",
+ "replacing job [%s] on duplicate route key [%s/%s] (lexicographic tiebreak): [%v]",
existing.RequestID,
job.Route,
job.RouteRequestID,
From c9f9544d427b4cb92ad1c3a4930d235a716c2d06 Mon Sep 17 00:00:00 2001
From: Piotr Roslaniec
Date: Thu, 9 Apr 2026 16:17:24 +0000
Subject: [PATCH 15/17] fix(covenantsigner): correct context preservation test
to use service context
The test injected a value into the HTTP request context and expected it
to be visible in the engine, but the submit handler deliberately
derives its context from the service context (not the request context)
to survive HTTP disconnects. Fix the test to inject the value into the
service context, which matches the actual design contract.
---
pkg/covenantsigner/server_test.go | 18 ++++++++----------
1 file changed, 8 insertions(+), 10 deletions(-)
diff --git a/pkg/covenantsigner/server_test.go b/pkg/covenantsigner/server_test.go
index 4b4f651e65..a81d4c853d 100644
--- a/pkg/covenantsigner/server_test.go
+++ b/pkg/covenantsigner/server_test.go
@@ -867,7 +867,7 @@ func TestSubmitHandlerPreCancelledContextStillSucceeds(t *testing.T) {
type contextKey string
-func TestSubmitHandlerPreservesContextValues(t *testing.T) {
+func TestSubmitHandlerPreservesServiceContextValues(t *testing.T) {
const testKey contextKey = "test-trace-id"
const testValue = "trace-abc-123"
@@ -889,15 +889,13 @@ func TestSubmitHandlerPreservesContextValues(t *testing.T) {
t.Fatal(err)
}
- // Wrap the handler with middleware that injects a value into the request
- // context. The detached context should preserve this value.
- innerHandler := newHandler(service, context.Background(), "", true)
- wrappedHandler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
- enrichedCtx := context.WithValue(r.Context(), testKey, testValue)
- innerHandler.ServeHTTP(w, r.WithContext(enrichedCtx))
- })
+ // Inject a value into the service context. The submit handler derives
+ // its timeout context from serviceCtx (not from the HTTP request), so
+ // values on the service context must be visible to the engine.
+ serviceCtx := context.WithValue(context.Background(), testKey, testValue)
+ handler := newHandler(service, serviceCtx, "", true)
- server := httptest.NewServer(wrappedHandler)
+ server := httptest.NewServer(handler)
defer server.Close()
submitPayload := mustJSON(t, SignerSubmitInput{
@@ -925,7 +923,7 @@ func TestSubmitHandlerPreservesContextValues(t *testing.T) {
defer mu.Unlock()
if capturedValue != testValue {
t.Fatalf(
- "expected context value %q to be preserved through detachment, "+
+ "expected service context value %q to be visible in engine, "+
"got %v",
testValue,
capturedValue,
From 0c15ba1e63fe98361165b244d8a2550bbf388ed9 Mon Sep 17 00:00:00 2001
From: Piotr Roslaniec
Date: Thu, 9 Apr 2026 17:46:01 +0000
Subject: [PATCH 16/17] fix(covenantsigner): remove subsystem signal handler
that steals process signals
The signal.NotifyContext for SIGINT/SIGTERM inside Initialize consumed
the first signal, cancelling only the signer's service context while
the rest of the node kept running. The process root context in
cmd/start.go is context.Background() and relies on the OS default
signal handler to terminate. Revert to the original parent-context-only
shutdown path.
---
pkg/covenantsigner/server.go | 17 +----------------
1 file changed, 1 insertion(+), 16 deletions(-)
diff --git a/pkg/covenantsigner/server.go b/pkg/covenantsigner/server.go
index 4679c8876a..917a9a5651 100644
--- a/pkg/covenantsigner/server.go
+++ b/pkg/covenantsigner/server.go
@@ -10,10 +10,8 @@ import (
"net"
"net/http"
"net/url"
- "os/signal"
"strconv"
"strings"
- "syscall"
"time"
"github.com/ipfs/go-log/v2"
@@ -130,21 +128,8 @@ func Initialize(
return nil, false, fmt.Errorf("failed to bind covenant signer port [%d]: %w", config.Port, err)
}
- // Listen for both the parent context cancellation and OS signals so
- // that in-flight signing operations are cancelled promptly on any
- // shutdown path, including SIGINT/SIGTERM.
- signalCtx, stopSignal := signal.NotifyContext(
- context.Background(),
- syscall.SIGINT,
- syscall.SIGTERM,
- )
-
go func() {
- select {
- case <-ctx.Done():
- case <-signalCtx.Done():
- }
- stopSignal()
+ <-ctx.Done()
// Cancel the service context so in-flight threshold signing
// operations observe shutdown and terminate promptly.
From 7615a854f9c4844e110497a2d91d6045e14829bd Mon Sep 17 00:00:00 2001
From: Piotr Roslaniec
Date: Thu, 9 Apr 2026 17:46:06 +0000
Subject: [PATCH 17/17] fix(covenantsigner): clear route poison when a valid
job loads for same key
A malformed file processed before its valid sibling would poison the
route key, then the valid job would load into byRouteKey but remain
inaccessible via GetByRouteRequest because the poison check ran first.
Clear the poison entry when a valid job is successfully indexed for that
route key.
---
pkg/covenantsigner/store.go | 3 +++
1 file changed, 3 insertions(+)
diff --git a/pkg/covenantsigner/store.go b/pkg/covenantsigner/store.go
index d0407824bb..1e70bb2586 100644
--- a/pkg/covenantsigner/store.go
+++ b/pkg/covenantsigner/store.go
@@ -320,6 +320,9 @@ func (s *Store) load() error {
s.byRequestID[job.RequestID] = job
s.byRouteKey[key] = job.RequestID
+ // A valid job for this route supersedes any earlier poison
+ // from a malformed sibling file for the same route key.
+ delete(s.poisonedRoutes, key)
loaded++
case err, ok := <-errorChan:
if !ok {