From e16d46b8b4dc92e1a597205898a60fa0afbaa302 Mon Sep 17 00:00:00 2001 From: Samuel Attard Date: Wed, 8 Apr 2026 23:24:47 -0700 Subject: [PATCH] patch: replace errno-87 retry with dropping the redundant second open MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause is a bindflt.sys race on handle-relative NtCreateFile when a second relative open arrives while the first handle to the same target is live (see the investigation on electron/electron sam/windows-io-repro). Single opens on the same files, same relative paths, same bindflt emptyDir do not trigger it — so sharing the outer *os.File for ReadAt (which is documented concurrent-safe) removes the only overlapping-open in the manifest loader and makes the retry unnecessary. 7-line deletion; also halves CreateFileW calls per subninja. --- ...ient-ERROR_INVALID_PARAMETER-when-op.patch | 140 ------------------ ...uter-os.File-for-chunked-ReadAt-in-f.patch | 47 ++++++ 2 files changed, 47 insertions(+), 140 deletions(-) delete mode 100644 patches/0001-siso-retry-transient-ERROR_INVALID_PARAMETER-when-op.patch create mode 100644 patches/0001-siso-reuse-the-outer-os.File-for-chunked-ReadAt-in-f.patch diff --git a/patches/0001-siso-retry-transient-ERROR_INVALID_PARAMETER-when-op.patch b/patches/0001-siso-retry-transient-ERROR_INVALID_PARAMETER-when-op.patch deleted file mode 100644 index b9eb79d..0000000 --- a/patches/0001-siso-retry-transient-ERROR_INVALID_PARAMETER-when-op.patch +++ /dev/null @@ -1,140 +0,0 @@ -From 6459c2dbdfb40130943ed3e9b4645ce2a8d559a8 Mon Sep 17 00:00:00 2001 -From: Samuel Attard -Date: Wed, 8 Apr 2026 16:43:55 -0700 -Subject: [PATCH] siso: retry transient ERROR_INVALID_PARAMETER when opening - ninja files on Windows - -ManifestParser.Load fans out across all subninja files (~90k in a -Chromium build) at NumCPU parallelism, with each file opened twice by -fileParser.readFile. On Windows builders where out/ is served through -a filesystem filter driver (e.g. bindflt/wcifs for container bind -mounts), CreateFileW can intermittently return ERROR_INVALID_PARAMETER -under this concurrent open burst. There is currently no retry, so a -single transient failure aborts the entire manifest load. - -Wrap the two os.Open calls in readFile in a small Windows-only retry -for ERROR_INVALID_PARAMETER (5 attempts, 5-80ms backoff). Each retry -is logged via clog.Warningf and also written to stderr so it is -visible in CI step output where glog warnings are file-only by -default. Other platforms keep the direct os.Open path. ---- - siso/toolsupport/ninjautil/file_parser.go | 5 +- - siso/toolsupport/ninjautil/openfile_other.go | 18 +++++++ - .../toolsupport/ninjautil/openfile_windows.go | 50 +++++++++++++++++++ - 3 files changed, 70 insertions(+), 3 deletions(-) - create mode 100644 siso/toolsupport/ninjautil/openfile_other.go - create mode 100644 siso/toolsupport/ninjautil/openfile_windows.go - -diff --git a/siso/toolsupport/ninjautil/file_parser.go b/siso/toolsupport/ninjautil/file_parser.go -index 8c18d084..4289a479 100644 ---- a/siso/toolsupport/ninjautil/file_parser.go -+++ b/siso/toolsupport/ninjautil/file_parser.go -@@ -7,7 +7,6 @@ package ninjautil - import ( - "context" - "fmt" -- "os" - "runtime/trace" - "sync" - "time" -@@ -91,7 +90,7 @@ func (p *fileParser) parseFile(ctx context.Context, fname string) error { - // readFile reads a file of fname in parallel. - func (p *fileParser) readFile(ctx context.Context, fname string) ([]byte, error) { - defer trace.StartRegion(ctx, "ninja.read").End() -- f, err := os.Open(fname) -+ f, err := openFile(ctx, fname) - if err != nil { - return nil, err - } -@@ -111,7 +110,7 @@ func (p *fileParser) readFile(ctx context.Context, fname string) ([]byte, error) - eg.Go(func() error { - p.sema <- struct{}{} - defer func() { <-p.sema }() -- f, err := os.Open(fname) -+ f, err := openFile(ctx, fname) - if err != nil { - return err - } -diff --git a/siso/toolsupport/ninjautil/openfile_other.go b/siso/toolsupport/ninjautil/openfile_other.go -new file mode 100644 -index 00000000..9fca6901 ---- /dev/null -+++ b/siso/toolsupport/ninjautil/openfile_other.go -@@ -0,0 +1,18 @@ -+// Copyright 2026 The Chromium Authors -+// Use of this source code is governed by a BSD-style license that can be -+// found in the LICENSE file. -+ -+//go:build !windows -+ -+package ninjautil -+ -+import ( -+ "context" -+ "os" -+) -+ -+// openFile opens fname for reading. -+// See openfile_windows.go for the Windows variant with transient-error retry. -+func openFile(ctx context.Context, fname string) (*os.File, error) { -+ return os.Open(fname) -+} -diff --git a/siso/toolsupport/ninjautil/openfile_windows.go b/siso/toolsupport/ninjautil/openfile_windows.go -new file mode 100644 -index 00000000..f9d8e9dc ---- /dev/null -+++ b/siso/toolsupport/ninjautil/openfile_windows.go -@@ -0,0 +1,50 @@ -+// Copyright 2026 The Chromium Authors -+// Use of this source code is governed by a BSD-style license that can be -+// found in the LICENSE file. -+ -+//go:build windows -+ -+package ninjautil -+ -+import ( -+ "context" -+ "errors" -+ "fmt" -+ "os" -+ "time" -+ -+ "golang.org/x/sys/windows" -+ -+ "go.chromium.org/build/siso/o11y/clog" -+) -+ -+// openFile opens fname for reading, retrying transient -+// ERROR_INVALID_PARAMETER failures. -+// -+// On Windows, CreateFileW can intermittently return -+// ERROR_INVALID_PARAMETER when the target lives behind a filesystem -+// filter driver (e.g. bindflt/wcifs for container bind mounts) under -+// highly concurrent opens. loadFile fans out across ~90k subninja -+// files at NumCPU parallelism, so a single transient failure would -+// otherwise abort the whole manifest load. -+func openFile(ctx context.Context, fname string) (*os.File, error) { -+ const maxAttempts = 5 -+ delay := 5 * time.Millisecond -+ for i := 0; ; i++ { -+ f, err := os.Open(fname) -+ if err == nil { -+ return f, nil -+ } -+ if i+1 >= maxAttempts || !errors.Is(err, windows.ERROR_INVALID_PARAMETER) { -+ return nil, err -+ } -+ clog.Warningf(ctx, "open %s: %v; retrying (%d/%d) after %s", fname, err, i+1, maxAttempts, delay) -+ fmt.Fprintf(os.Stderr, "siso: open %s: %v; retrying (%d/%d) after %s\n", fname, err, i+1, maxAttempts, delay) -+ select { -+ case <-time.After(delay): -+ case <-ctx.Done(): -+ return nil, context.Cause(ctx) -+ } -+ delay *= 2 -+ } -+} --- -2.53.0 - diff --git a/patches/0001-siso-reuse-the-outer-os.File-for-chunked-ReadAt-in-f.patch b/patches/0001-siso-reuse-the-outer-os.File-for-chunked-ReadAt-in-f.patch new file mode 100644 index 0000000..8e85a6b --- /dev/null +++ b/patches/0001-siso-reuse-the-outer-os.File-for-chunked-ReadAt-in-f.patch @@ -0,0 +1,47 @@ +From 85b561ea4dbc76ba98af020b970f3aa6b20fdb9e Mon Sep 17 00:00:00 2001 +From: Samuel Attard +Date: Wed, 8 Apr 2026 23:24:15 -0700 +Subject: [PATCH] siso: reuse the outer *os.File for chunked ReadAt in + fileParser.readFile +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The per-chunk goroutine currently re-opens fname to get its own handle +for ReadAt. (*os.File).ReadAt is documented as safe for concurrent +calls on the same File (on Windows it is ReadFile with an OVERLAPPED +offset, so there is no shared seek state), so the extra open is +redundant — the goroutines can share the outer f. + +Besides halving the CreateFileW calls per subninja, this avoids an +intermittent 'The parameter is incorrect.' (ERROR_INVALID_PARAMETER) +from bindflt.sys when out/ is a mapped directory inside a Windows +container: bindflt's handle-relative NtCreateFile path races when a +second relative open arrives while the first handle to the same target +is still being set up. Absolute paths and single opens do not trigger +it; see microsoft/Windows-Containers#. +--- + siso/toolsupport/ninjautil/file_parser.go | 7 ------- + 1 file changed, 7 deletions(-) + +diff --git a/siso/toolsupport/ninjautil/file_parser.go b/siso/toolsupport/ninjautil/file_parser.go +index 8c18d084..63116662 100644 +--- a/siso/toolsupport/ninjautil/file_parser.go ++++ b/siso/toolsupport/ninjautil/file_parser.go +@@ -111,13 +111,6 @@ func (p *fileParser) readFile(ctx context.Context, fname string) ([]byte, error) + eg.Go(func() error { + p.sema <- struct{}{} + defer func() { <-p.sema }() +- f, err := os.Open(fname) +- if err != nil { +- return err +- } +- defer func() { +- _ = f.Close() +- }() + for len(chunkBuf) > 0 { + n, err := f.ReadAt(chunkBuf, pos) + if err != nil { +-- +2.53.0 +