From 0f58e3482ca05aefaa44669ba38e2e4e96cf2185 Mon Sep 17 00:00:00 2001 From: epiphyte Date: Mon, 19 Jan 2026 10:20:35 -0500 Subject: [PATCH 01/15] Initial open fd monitoring w/ cell write lock (SYN-10120) --- synapse/lib/cell.py | 73 +++++++++++++++++++++++++++++----- synapse/lib/platforms/linux.py | 5 +++ 2 files changed, 68 insertions(+), 10 deletions(-) diff --git a/synapse/lib/cell.py b/synapse/lib/cell.py index 9ecec45259e..aa1d945172c 100644 --- a/synapse/lib/cell.py +++ b/synapse/lib/cell.py @@ -85,7 +85,8 @@ feat_aha_callpeers_v1 = ('callpeers', 1) -diskspace = "Insufficient free space on disk." +diskspace_mesg = "Insufficient free space on disk." +openfd_mesg = "Insufficient open file descriptors available." def adminapi(log=False): ''' @@ -1026,6 +1027,13 @@ class Cell(s_nexus.Pusher, s_telepath.Aware): 'minimum': 0, 'maximum': 100, }, + 'limit:fd:free': { + 'default': 5, + 'description': 'Minimum amount, as a percentage, of unused file descriptors before setting the cell read-only.', + 'type': ['integer', 'null'], + 'minimum': 0, + 'maximum': 100, + }, 'health:sysctl:checks': { 'default': True, 'description': 'Enable sysctl parameter checks and warn if values are not optimal.', @@ -1156,6 +1164,7 @@ class Cell(s_nexus.Pusher, s_telepath.Aware): BACKUP_SPAWN_TIMEOUT = 60.0 FREE_SPACE_CHECK_FREQ = 60.0 + OPEN_FD_CHECK_FREQ = 60.0 COMMIT = s_version.commit VERSION = s_version.version @@ -1194,6 +1203,7 @@ async def __anit__(self, dirn, conf=None, readonly=False, parent=None): self.https_listeners = [] self.ahaclient = None self._checkspace = s_coro.Event() + self._checkopenfd = s_coro.Event() self._reloadfuncs = {} # name -> func self.nexslock = asyncio.Lock() @@ -1212,16 +1222,21 @@ async def __anit__(self, dirn, conf=None, readonly=False, parent=None): mesg = f'Booting {self.getCellType()} in safe-mode. Some functionality may be disabled.' 
logger.warning(mesg) - self.minfree = self.conf.get('limit:disk:free') - if self.minfree is not None: - self.minfree = self.minfree / 100 + self.min_disk_free = self.conf.get('limit:disk:free') + if self.min_disk_free is not None: + self.min_disk_free = self.min_disk_free / 100 disk = shutil.disk_usage(self.dirn) - if (disk.free / disk.total) <= self.minfree: + if (disk.free / disk.total) <= self.min_disk_free: free = disk.free / disk.total * 100 mesg = f'Free space on {self.dirn} below minimum threshold (currently {free:.2f}%)' raise s_exc.LowSpace(mesg=mesg, dirn=self.dirn) + self.min_fd_free = self.conf.get('limit:fd:free') + if self.min_fd_free is not None: + self.min_fd_free = self.min_fd_free / 100 + # DISCUSS When is it even possible to check this during bootsrapping? postAnit ? + self._delTmpFiles() if self.conf.get('onboot:optimize'): @@ -1769,6 +1784,9 @@ async def _bumpCellVers(self, name, updates, nexs=True): def checkFreeSpace(self): self._checkspace.set() + def checkOpenFD(self): + self._checkopenfd.set() + async def _runFreeSpaceLoop(self): while not self.isfini: @@ -1779,15 +1797,15 @@ async def _runFreeSpaceLoop(self): disk = shutil.disk_usage(self.dirn) - if (disk.free / disk.total) <= self.minfree: + if (disk.free / disk.total) <= self.min_disk_free: - await nexsroot.addWriteHold(diskspace) + await nexsroot.addWriteHold(diskspace_mesg) mesg = f'Free space on {self.dirn} below minimum threshold (currently ' \ f'{disk.free / disk.total * 100:.2f}%), setting Cell to read-only.' logger.error(mesg) - elif nexsroot.readonly and await nexsroot.delWriteHold(diskspace): + elif nexsroot.readonly and await nexsroot.delWriteHold(diskspace_mesg): mesg = f'Free space on {self.dirn} above minimum threshold (currently ' \ f'{disk.free / disk.total * 100:.2f}%), removing free space write hold.' 
@@ -1819,6 +1837,36 @@ async def _runSysctlLoop(self): await self.waitfini(self.SYSCTL_CHECK_FREQ) + async def _runOpenFdLoop(self): + + while not self.isfini: + + nexsroot = self.getCellNexsRoot() + + self._checkopenfd.clear() + + fdusage = s_thisplat.getOpenFdInfo() + + hard_limit = fdusage['hard_limit'] + usage = fdusage['usage'] + free = hard_limit - usage + + if ( free / hard_limit ) <= self.min_fd_free: + + await nexsroot.addWriteHold(openfd_mesg) + + mesg = f'Available file descriptors has dropped below minimum threshold' \ + f'(currently {free / hard_limit * 100:.2f}%), setting Cell to read-only.' + logger.error(mesg, extra={'synapse': fdusage}) + + elif nexsroot.readonly and await nexsroot.delWriteHold(openfd_mesg): + + mesg = f'Available file descriptors above minimum threshold' \ + f'(currently {free / hard_limit * 100:.2f}%), removing file descriptor write hold.' + logger.error(mesg, extra={'synapse': fdusage}) + + await self._checkopenfd.timewait(timeout=self.OPEN_FD_CHECK_FREQ) + async def _initAhaRegistry(self): ahaurls = self.conf.get('aha:registry') @@ -2081,9 +2129,12 @@ async def initNexusSubsystem(self): await self.nexsroot.startup() await self.setCellActive(self.conf.get('mirror') is None) - if self.minfree is not None: + if self.min_disk_free is not None: self.schedCoro(self._runFreeSpaceLoop()) + if self.min_fd_free is not None: + self.schedCoro(self._runOpenFdLoop()) + async def _bindDmonListen(self): # functionalized so downstream code can bind early. 
@@ -2650,7 +2701,7 @@ def _reqBackupSpace(self): cellsize, _ = s_common.getDirSize(self.dirn) if os.stat(self.dirn).st_dev == os.stat(self.backdirn).st_dev: - reqspace = self.minfree * disk.total + cellsize + reqspace = self.min_disk_free * disk.total + cellsize else: reqspace = cellsize @@ -4888,6 +4939,7 @@ async def getSystemInfo(self): availmem = s_thisplat.getAvailableMemory() pyversion = platform.python_version() cpucount = multiprocessing.cpu_count() + fdusage = s_thisplat.getOpenFdInfo() sysctls = s_thisplat.getSysctls() tmpdir = s_thisplat.getTempDir() @@ -4907,6 +4959,7 @@ async def getSystemInfo(self): 'cpucount': cpucount, # Number of CPUs on system 'sysctls': sysctls, # Performance related sysctls 'tmpdir': tmpdir, # Temporary File / Folder Directory + 'fdusage': fdusage, # Soft limits, hard limits, and open fd descriptors for the current process. } return retn diff --git a/synapse/lib/platforms/linux.py b/synapse/lib/platforms/linux.py index 55f2b1caa90..b467ec2bf91 100644 --- a/synapse/lib/platforms/linux.py +++ b/synapse/lib/platforms/linux.py @@ -105,6 +105,11 @@ def getTotalMemory(): logger.warning('Unable to find max memory limit') # pragma: no cover return 0 # pragma: no cover +def getOpenFdInfo(): + soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE) + usage = len(os.listdir(f'/proc/{os.getpid()}/fd')) + ret = {'soft_limit': soft_limit, 'hard_limit': hard_limit, 'usage': usage} + return ret def getSysctls(): _sysctls = ( From 1295774474989c0c549a423c01c20c5504daa70b Mon Sep 17 00:00:00 2001 From: epiphyte Date: Mon, 19 Jan 2026 10:37:32 -0500 Subject: [PATCH 02/15] todo comments --- synapse/lib/cell.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/synapse/lib/cell.py b/synapse/lib/cell.py index aa1d945172c..dcf4f35ae09 100644 --- a/synapse/lib/cell.py +++ b/synapse/lib/cell.py @@ -1784,7 +1784,8 @@ async def _bumpCellVers(self, name, updates, nexs=True): def checkFreeSpace(self): self._checkspace.set() -
def checkOpenFD(self): + def checkOpenFd(self): + # TODO Insert this call before places where a user may open persistent files ( mainly lmdb slabs ! ) self._checkopenfd.set() async def _runFreeSpaceLoop(self): @@ -1847,6 +1848,8 @@ async def _runOpenFdLoop(self): fdusage = s_thisplat.getOpenFdInfo() + # TODO Handle constant here https://docs.python.org/3/library/resource.html#resource.RLIM_INFINITY + # TODO soft_limit vs hard_limit hard_limit = fdusage['hard_limit'] usage = fdusage['usage'] free = hard_limit - usage From 96987d6acdb6310a88c326f2e8a3ba9e42b9903c Mon Sep 17 00:00:00 2001 From: epiphyte Date: Mon, 19 Jan 2026 11:42:23 -0500 Subject: [PATCH 03/15] Updates from testing --- synapse/cortex.py | 2 ++ synapse/lib/cell.py | 14 ++++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/synapse/cortex.py b/synapse/cortex.py index 37e2699550d..e149f1c3236 100644 --- a/synapse/cortex.py +++ b/synapse/cortex.py @@ -5119,6 +5119,7 @@ async def addView(self, vdef, nexs=True): s_schemas.reqValidView(vdef) if nexs: + self.checkOpenFd() return await self._push('view:add', vdef) else: return await self._addView(vdef) @@ -5459,6 +5460,7 @@ async def addLayer(self, ldef=None, nexs=True): s_layer.reqValidLdef(ldef) if nexs: + self.checkOpenFd() return await self._push('layer:add', ldef) else: return await self._addLayer(ldef, (None, None)) diff --git a/synapse/lib/cell.py b/synapse/lib/cell.py index dcf4f35ae09..24a481931e6 100644 --- a/synapse/lib/cell.py +++ b/synapse/lib/cell.py @@ -1849,23 +1849,25 @@ async def _runOpenFdLoop(self): fdusage = s_thisplat.getOpenFdInfo() # TODO Handle constant here https://docs.python.org/3/library/resource.html#resource.RLIM_INFINITY - # TODO soft_limit vs hard_limit - hard_limit = fdusage['hard_limit'] + limit = fdusage['soft_limit'] usage = fdusage['usage'] - free = hard_limit - usage + free = limit - usage - if ( free / hard_limit ) <= self.min_fd_free: + # TODO REMOVE ME + logger.debug(f'{usage=} {limit=} {free=} 
{free / limit} <= {self.min_fd_free=} ?') + + if ( free / limit ) <= self.min_fd_free: await nexsroot.addWriteHold(openfd_mesg) mesg = f'Available file descriptors has dropped below minimum threshold' \ - f'(currently {free / hard_limit * 100:.2f}%), setting Cell to read-only.' + f'(currently {free / limit * 100:.2f}%), setting Cell to read-only.' logger.error(mesg, extra={'synapse': fdusage}) elif nexsroot.readonly and await nexsroot.delWriteHold(openfd_mesg): mesg = f'Available file descriptors above minimum threshold' \ - f'(currently {free / hard_limit * 100:.2f}%), removing file descriptor write hold.' + f'(currently {free / limit * 100:.2f}%), removing file descriptor write hold.' logger.error(mesg, extra={'synapse': fdusage}) await self._checkopenfd.timewait(timeout=self.OPEN_FD_CHECK_FREQ) From 948b10df67f5963d6a76f8bd463ca631cb25c296 Mon Sep 17 00:00:00 2001 From: epiphyte Date: Mon, 19 Jan 2026 13:44:27 -0500 Subject: [PATCH 04/15] Add a darwin implementation for openfds since its a pretty straightfoward one. 
--- synapse/lib/platforms/darwin.py | 9 +++++++++ synapse/lib/platforms/linux.py | 1 + synapse/tests/test_lib_cell.py | 3 +++ synapse/tests/test_lib_platforms_linux.py | 9 +++++++++ 4 files changed, 22 insertions(+) diff --git a/synapse/lib/platforms/darwin.py b/synapse/lib/platforms/darwin.py index 321bf33791f..552b48ee9d1 100644 --- a/synapse/lib/platforms/darwin.py +++ b/synapse/lib/platforms/darwin.py @@ -1,4 +1,6 @@ +import os import logging +import resource logger = logging.getLogger(__name__) @@ -6,4 +8,11 @@ def initHostInfo(): return { 'format': 'macho', 'platform': 'darwin', + 'hasopenfds': True, } + +def getOpenFdInfo(): + soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE) + usage = len(os.listdir(f'/dev/fd')) + ret = {'soft_limit': soft_limit, 'hard_limit': hard_limit, 'usage': usage} + return ret diff --git a/synapse/lib/platforms/linux.py b/synapse/lib/platforms/linux.py index b467ec2bf91..45a7a8a869e 100644 --- a/synapse/lib/platforms/linux.py +++ b/synapse/lib/platforms/linux.py @@ -19,6 +19,7 @@ def initHostInfo(): 'platform': 'linux', 'hasmemlocking': True, # has mlock, and all the below related functions 'hassysctls': True, + 'hasopenfds': True, } def getFileMappedRegion(filename): diff --git a/synapse/tests/test_lib_cell.py b/synapse/tests/test_lib_cell.py index 5ca3d8e7d5c..1451fff5ed4 100644 --- a/synapse/tests/test_lib_cell.py +++ b/synapse/tests/test_lib_cell.py @@ -2561,6 +2561,9 @@ async def test_passwd_regression(self): self.false(await root.tryPasswd('root')) self.true(await root.tryPasswd('supersecretpassword')) + async def test_cell_minfiles(self): + pass + async def test_cell_minspace(self): with self.raises(s_exc.LowSpace): diff --git a/synapse/tests/test_lib_platforms_linux.py b/synapse/tests/test_lib_platforms_linux.py index bd4547b5bb9..3d10e4f5b83 100644 --- a/synapse/tests/test_lib_platforms_linux.py +++ b/synapse/tests/test_lib_platforms_linux.py @@ -64,3 +64,12 @@ def test_sysctls(self): 
self.isinstance(ret['vm.dirty_bytes'], int) self.isin('vm.dirty_background_bytes', ret) self.isinstance(ret['vm.dirty_background_bytes'], int) + + def test_openfds(self): + self.thisHostMust(hasopenfds=True) + ret = s_thisplat.getOpenFdInfo() + self.isinstance(ret, dict) + self.isin('hard_limit', ret) + self.isin('soft_limit', ret) + self.isin('usage', ret) + self.true(ret.get('usage') > 0) From 80b3822839ca6e6d3ba1fd5b7e93e6090941d710 Mon Sep 17 00:00:00 2001 From: epiphyte Date: Mon, 19 Jan 2026 15:03:20 -0500 Subject: [PATCH 05/15] Add some tests --- synapse/lib/cell.py | 2 +- synapse/lib/spooled.py | 6 ++ synapse/tests/test_lib_cell.py | 128 +++++++++++++++++++++++++++++++-- 3 files changed, 128 insertions(+), 8 deletions(-) diff --git a/synapse/lib/cell.py b/synapse/lib/cell.py index 24a481931e6..889571d335f 100644 --- a/synapse/lib/cell.py +++ b/synapse/lib/cell.py @@ -1854,7 +1854,7 @@ async def _runOpenFdLoop(self): free = limit - usage # TODO REMOVE ME - logger.debug(f'{usage=} {limit=} {free=} {free / limit} <= {self.min_fd_free=} ?') + logger.debug(f'{self.dirn=} {usage=} {limit=} {free=} {free / limit} <= {self.min_fd_free=} ?') if ( free / limit ) <= self.min_fd_free: diff --git a/synapse/lib/spooled.py b/synapse/lib/spooled.py index c89cfbc4acf..7e7cb9d210e 100644 --- a/synapse/lib/spooled.py +++ b/synapse/lib/spooled.py @@ -49,6 +49,12 @@ async def _initFallBack(self): slabpath = tempfile.mkdtemp(dir=dirn, prefix='spooled_', suffix='.lmdb') + if self.cell is not None: + # Wake the host cell and give it the opportunity to check openfd counts. + # This will not stop the spooled set creation but may prevent downstream + # activity from causing an issue. 
+ self.cell.checkOpenFd() + self.slab = await s_lmdbslab.Slab.anit(slabpath, map_size=DEFAULT_MAPSIZE) if self.cell is not None: self.slab.addResizeCallback(self.cell.checkFreeSpace) diff --git a/synapse/tests/test_lib_cell.py b/synapse/tests/test_lib_cell.py index 1451fff5ed4..bfad5557715 100644 --- a/synapse/tests/test_lib_cell.py +++ b/synapse/tests/test_lib_cell.py @@ -2562,7 +2562,123 @@ async def test_passwd_regression(self): self.true(await root.tryPasswd('supersecretpassword')) async def test_cell_minfiles(self): - pass + self.thisHostMust(hasopenfds=True) + + with self.raises(s_exc.IsReadOnly) as cm: + conf = {'limit:fd:free': 100} + async with self.getTestCell(conf=conf) as cell: + await asyncio.sleep(0.1) + cell.checkOpenFd() + await cell.sync() + self.isin('Insufficient open file descriptors available.', cm.exception.get('mesg')) + + revt = asyncio.Event() + addWriteHold = s_nexus.NexsRoot.addWriteHold + delWriteHold = s_nexus.NexsRoot.delWriteHold + + async def wrapAddWriteHold(root, reason): + retn = await addWriteHold(root, reason) + revt.set() + return retn + + async def wrapDelWriteHold(root, reason): + retn = await delWriteHold(root, reason) + revt.set() + return retn + + _ntuple_diskusage = collections.namedtuple('usage', 'total used free') + + def full_fds(): + print(f'IN FULL FDS') + return {'hard_limit': 256, 'soft_limit': 256, 'usage': 255} + + def unlimited_fds(): + return {'hard_limit': -1, 'soft_limit': -1, 'usage': 255} + + revt = asyncio.Event() + addWriteHold = s_nexus.NexsRoot.addWriteHold + delWriteHold = s_nexus.NexsRoot.delWriteHold + + async def wrapAddWriteHold(root, reason): + retn = await addWriteHold(root, reason) + revt.set() + return retn + + async def wrapDelWriteHold(root, reason): + retn = await delWriteHold(root, reason) + revt.set() + return retn + + with mock.patch.object(s_cell.Cell, 'OPEN_FD_CHECK_FREQ', 0.1), \ + mock.patch.object(s_nexus.NexsRoot, 'addWriteHold', wrapAddWriteHold), \ + 
mock.patch.object(s_nexus.NexsRoot, 'delWriteHold', wrapDelWriteHold): + async with self.getTestCore() as core: + + fork_q = 'view.fork --name somefork $lib.view.get().iden' + + await core.nodes(fork_q) + + with mock.patch('synapse.lib.thisplat.getOpenFdInfo', full_fds): + self.true(await asyncio.wait_for(revt.wait(), 6)) + + msgs = await core.stormlist(fork_q) + self.stormIsInErr(s_cell.openfd_mesg, msgs) + + revt.clear() + self.true(await asyncio.wait_for(revt.wait(), 6)) + + await core.nodes(fork_q) + + # Check with a unlimited ulimit + with mock.patch('synapse.lib.thisplat.getOpenFdInfo', unlimited_fds): + self.true(await asyncio.wait_for(revt.wait(), 6)) + + msgs = await core.stormlist(fork_q) + self.stormIsInErr(s_cell.openfd_mesg, msgs) + + # Mirrors can be blocked and then recover + with self.getTestDir() as dirn: + + path00 = s_common.gendir(dirn, 'core00') + path01 = s_common.gendir(dirn, 'core01') + + conf = {'limit:fd:free': 0} + async with self.getTestCore(dirn=path00, conf=conf) as core00: + await core00.nodes('[ inet:ipv4=1.2.3.4 ]') + + s_tools_backup.backup(path00, path01) + + async with self.getTestCore(dirn=path00, conf=conf) as core00: + + core01conf = {'mirror': core00.getLocalUrl()} + + async with self.getTestCore(dirn=path01, conf=core01conf) as core01: + + await core01.sync() + + revt.clear() + with mock.patch('synapse.lib.thisplat.getOpenFdInfo', full_fds): + self.true(await asyncio.wait_for(revt.wait(), 1)) + + msgs = await core01.stormlist('[inet:fqdn=newp.fail]') + self.stormIsInErr(s_cell.openfd_mesg, msgs) + msgs = await core01.stormlist('[inet:fqdn=newp.fail]') + self.stormIsInErr(s_cell.openfd_mesg, msgs) + self.len(1, await core00.nodes('[ inet:ipv4=2.3.4.5 ]')) + + offs = await core00.getNexsIndx() + self.false(await core01.waitNexsOffs(offs, 1)) + + self.len(1, await core01.nodes('inet:ipv4=1.2.3.4')) + self.len(0, await core01.nodes('inet:ipv4=2.3.4.5')) + revt.clear() + + revt.clear() + self.true(await 
asyncio.wait_for(revt.wait(), 1)) + await core01.sync() + + self.len(1, await core01.nodes('inet:ipv4=1.2.3.4')) + self.len(1, await core01.nodes('inet:ipv4=2.3.4.5')) async def test_cell_minspace(self): @@ -2589,8 +2705,6 @@ async def wrapDelWriteHold(root, reason): revt.set() return retn - errmsg = 'Insufficient free space on disk.' - with mock.patch.object(s_cell.Cell, 'FREE_SPACE_CHECK_FREQ', 0.1), \ mock.patch.object(s_nexus.NexsRoot, 'addWriteHold', wrapAddWriteHold), \ mock.patch.object(s_nexus.NexsRoot, 'delWriteHold', wrapDelWriteHold): @@ -2615,7 +2729,7 @@ async def wrapDelWriteHold(root, reason): self.true(await asyncio.wait_for(revt.wait(), 1)) msgs = await core.stormlist('[inet:fqdn=newp.fail]') - self.stormIsInErr(errmsg, msgs) + self.stormIsInErr(s_cell.diskspace_mesg, msgs) revt.clear() self.true(await asyncio.wait_for(revt.wait(), 1)) @@ -2646,9 +2760,9 @@ async def wrapDelWriteHold(root, reason): self.true(await asyncio.wait_for(revt.wait(), 1)) msgs = await core01.stormlist('[inet:fqdn=newp.fail]') - self.stormIsInErr(errmsg, msgs) + self.stormIsInErr(s_cell.diskspace_mesg, msgs) msgs = await core01.stormlist('[inet:fqdn=newp.fail]') - self.stormIsInErr(errmsg, msgs) + self.stormIsInErr(s_cell.diskspace_mesg, msgs) self.len(1, await core00.nodes('[ inet:ipv4=2.3.4.5 ]')) offs = await core00.getNexsIndx() @@ -2678,7 +2792,7 @@ async def wrapDelWriteHold(root, reason): with mock.patch('shutil.disk_usage', full_disk): opts = {'view': viewiden} msgs = await core.stormlist('for $x in $lib.range(20000) {[inet:ipv4=$x]}', opts=opts) - self.stormIsInErr(errmsg, msgs) + self.stormIsInErr(s_cell.diskspace_mesg, msgs) nodes = await core.nodes('inet:ipv4', opts=opts) self.gt(len(nodes), 0) self.lt(len(nodes), 20000) From 5c52daa51b3e03bfea86e5e84c0d64cec85f4e6a Mon Sep 17 00:00:00 2001 From: epiphyte Date: Mon, 19 Jan 2026 15:17:19 -0500 Subject: [PATCH 06/15] Handle resource.RLIM_INFINITY constant --- synapse/lib/cell.py | 16 +++++++++------- 
synapse/tests/test_lib_cell.py | 12 +++++++++--- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/synapse/lib/cell.py b/synapse/lib/cell.py index 889571d335f..6f1f033e8ae 100644 --- a/synapse/lib/cell.py +++ b/synapse/lib/cell.py @@ -13,6 +13,7 @@ import argparse import datetime import platform +import resource import tempfile import functools import contextlib @@ -1848,26 +1849,27 @@ async def _runOpenFdLoop(self): fdusage = s_thisplat.getOpenFdInfo() - # TODO Handle constant here https://docs.python.org/3/library/resource.html#resource.RLIM_INFINITY limit = fdusage['soft_limit'] usage = fdusage['usage'] - free = limit - usage - # TODO REMOVE ME - logger.debug(f'{self.dirn=} {usage=} {limit=} {free=} {free / limit} <= {self.min_fd_free=} ?') + free = (limit - usage) / limit - if ( free / limit ) <= self.min_fd_free: + # If the soft_limit is not unlimited ( signaled via resource.RLIM_INFINITY ) and the + # free percentage is < self.min_fd_free, we lock the cell; otherwise we remove our lock + # on the cell. + + if limit != resource.RLIM_INFINITY and free <= self.min_fd_free: await nexsroot.addWriteHold(openfd_mesg) mesg = f'Available file descriptors has dropped below minimum threshold' \ - f'(currently {free / limit * 100:.2f}%), setting Cell to read-only.' + f'(currently {free * 100:.2f}%), setting Cell to read-only.' logger.error(mesg, extra={'synapse': fdusage}) elif nexsroot.readonly and await nexsroot.delWriteHold(openfd_mesg): mesg = f'Available file descriptors above minimum threshold' \ - f'(currently {free / limit * 100:.2f}%), removing file descriptor write hold.' + f'(currently {free * 100:.2f}%), removing file descriptor write hold.' 
logger.error(mesg, extra={'synapse': fdusage}) await self._checkopenfd.timewait(timeout=self.OPEN_FD_CHECK_FREQ) diff --git a/synapse/tests/test_lib_cell.py b/synapse/tests/test_lib_cell.py index bfad5557715..6dc89589444 100644 --- a/synapse/tests/test_lib_cell.py +++ b/synapse/tests/test_lib_cell.py @@ -2629,12 +2629,18 @@ async def wrapDelWriteHold(root, reason): await core.nodes(fork_q) - # Check with a unlimited ulimit + # Check with an unlimited ulimit. + # First we can set the write hold manually + await cell.nexsroot.addWriteHold(s_cell.openfd_mesg) + self.true(cell.nexsroot.readonly) + with mock.patch('synapse.lib.thisplat.getOpenFdInfo', unlimited_fds): - self.true(await asyncio.wait_for(revt.wait(), 6)) + + # Then see it be cleared + self.true(await asyncio.wait_for(revt.wait(), 1)) msgs = await core.stormlist(fork_q) - self.stormIsInErr(s_cell.openfd_mesg, msgs) + self.stormHasNoWarnErr(msgs) # Mirrors can be blocked and then recover with self.getTestDir() as dirn: From 58b4f87d552329634f84cf5b3f578db5af5c794f Mon Sep 17 00:00:00 2001 From: epiphyte Date: Mon, 19 Jan 2026 15:19:15 -0500 Subject: [PATCH 07/15] comment --- synapse/lib/cell.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/synapse/lib/cell.py b/synapse/lib/cell.py index 6f1f033e8ae..cc80c32e7e8 100644 --- a/synapse/lib/cell.py +++ b/synapse/lib/cell.py @@ -1236,7 +1236,7 @@ async def __anit__(self, dirn, conf=None, readonly=False, parent=None): self.min_fd_free = self.conf.get('limit:fd:free') if self.min_fd_free is not None: self.min_fd_free = self.min_fd_free / 100 - # DISCUSS When is it even possible to check this during bootsrapping? postAnit ? + # DISCUSS When is it even possible to check this during startup? postAnit ? 
self._delTmpFiles() From 05a631ca99a0e2a8d53ffb7739c5129132e6fab4 Mon Sep 17 00:00:00 2001 From: epiphyte Date: Mon, 19 Jan 2026 15:20:44 -0500 Subject: [PATCH 08/15] Remove print statement --- synapse/tests/test_lib_cell.py | 1 - 1 file changed, 1 deletion(-) diff --git a/synapse/tests/test_lib_cell.py b/synapse/tests/test_lib_cell.py index 6dc89589444..a3e47f23a31 100644 --- a/synapse/tests/test_lib_cell.py +++ b/synapse/tests/test_lib_cell.py @@ -2589,7 +2589,6 @@ async def wrapDelWriteHold(root, reason): _ntuple_diskusage = collections.namedtuple('usage', 'total used free') def full_fds(): - print(f'IN FULL FDS') return {'hard_limit': 256, 'soft_limit': 256, 'usage': 255} def unlimited_fds(): From d68cc00c68d09f3ae221ea5ba418db63c3e3006f Mon Sep 17 00:00:00 2001 From: epiphyte Date: Mon, 19 Jan 2026 16:24:39 -0500 Subject: [PATCH 09/15] Add changelog and end of fini check --- changes/ab03fe160c11224d2e8b98043be9bf3d.yaml | 11 +++++++++++ synapse/lib/cell.py | 4 ++++ 2 files changed, 15 insertions(+) create mode 100644 changes/ab03fe160c11224d2e8b98043be9bf3d.yaml diff --git a/changes/ab03fe160c11224d2e8b98043be9bf3d.yaml b/changes/ab03fe160c11224d2e8b98043be9bf3d.yaml new file mode 100644 index 00000000000..ad27a0db441 --- /dev/null +++ b/changes/ab03fe160c11224d2e8b98043be9bf3d.yaml @@ -0,0 +1,11 @@ +--- +desc: Add a new Cell configuration option, ``limit:fd:free``. This represents the + minimum percentage of available file descriptors that a Synapse service that + is required in order to start up without entering a read-only state. This value + is also monitored every minute and will disable the Cell Nexus if the free + file descriptors drops below the specified value. This value defaults to five + percent ( ``5 %`` ) of available file descriptors. +desc:literal: false +prs: [] +type: feat +... 
diff --git a/synapse/lib/cell.py b/synapse/lib/cell.py index cc80c32e7e8..a209bc5bc4c 100644 --- a/synapse/lib/cell.py +++ b/synapse/lib/cell.py @@ -1413,6 +1413,10 @@ async def fini(): # phase 5 - service networking await self.initServiceNetwork() + # End of __anit__ - refire the fd loop in the event that conditions have + # changed such that the service would now go into a write only mode. + self.checkOpenFd() + async def _storCellHiveMigration(self): logger.warning(f'migrating Cell ({self.getCellType()}) info out of hive') From d8ee84aa03bb2c1c28b83bee63ee2247546940bf Mon Sep 17 00:00:00 2001 From: epiphyte Date: Mon, 19 Jan 2026 18:46:29 -0500 Subject: [PATCH 10/15] Allow getOpenFdInfo to survive emfil --- synapse/lib/cell.py | 2 ++ synapse/lib/platforms/darwin.py | 11 ++++++- synapse/lib/platforms/linux.py | 35 ++++++++++++++++------- synapse/tests/test_lib_platforms_linux.py | 25 ++++++++++++++++ 4 files changed, 61 insertions(+), 12 deletions(-) diff --git a/synapse/lib/cell.py b/synapse/lib/cell.py index a209bc5bc4c..fe49751025b 100644 --- a/synapse/lib/cell.py +++ b/synapse/lib/cell.py @@ -1857,6 +1857,8 @@ async def _runOpenFdLoop(self): usage = fdusage['usage'] free = (limit - usage) / limit + # TODO remove me + # logger.debug(f'{self.dirn=} {usage=} {limit=} {free=} {free / limit} <= {self.min_fd_free=} ?') # If the soft_limit is not unlimited ( signaled via resource.RLIM_INFINITY ) and the # free percentage is < self.min_fd_free, we lock the cell; otherwise we remove our lock diff --git a/synapse/lib/platforms/darwin.py b/synapse/lib/platforms/darwin.py index 552b48ee9d1..3b304e8989b 100644 --- a/synapse/lib/platforms/darwin.py +++ b/synapse/lib/platforms/darwin.py @@ -1,4 +1,5 @@ import os +import errno import logging import resource @@ -13,6 +14,14 @@ def initHostInfo(): def getOpenFdInfo(): soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE) - usage = len(os.listdir(f'/dev/fd')) + try: + usage = len(os.listdir(f'/dev/fd')) + except 
OSError as err: + if err.errno == errno.EMFILE: + # We've hit the maximum allowed files and cannot list contents of /dev/fd; + # so we set usage to soft_limit so the caller can know that we're exactly at the limit. + usage = soft_limit + else: + raise ret = {'soft_limit': soft_limit, 'hard_limit': hard_limit, 'usage': usage} return ret diff --git a/synapse/lib/platforms/linux.py b/synapse/lib/platforms/linux.py index 45a7a8a869e..b62fda508fa 100644 --- a/synapse/lib/platforms/linux.py +++ b/synapse/lib/platforms/linux.py @@ -1,4 +1,5 @@ import os +import errno import logging import resource import contextlib @@ -108,7 +109,15 @@ def getTotalMemory(): def getOpenFdInfo(): soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE) - usage = len(os.listdir(f'/proc/{os.getpid()}/fd')) + try: + usage = len(os.listdir(f'/proc/{os.getpid()}/fd')) + except OSError as err: + if err.errno == errno.EMFILE: + # We've hit the maximum allowed files and cannot list contents of /proc/; + # so we set usage to soft_limit so the caller can know that we're exactly at the limit.
+ usage = soft_limit + else: + raise ret = {'soft_limit': soft_limit, 'hard_limit': hard_limit, 'usage': usage} return ret @@ -124,16 +133,20 @@ def getSysctls(): ) ret = {} for key, fp, func in _sysctls: - if os.path.isfile(fp): - with open(fp) as f: - valu = f.read().strip() - try: - ret[key] = func(valu) - except Exception: # pragma: no cover - logger.exception(f'Error normalizing sysctl: {key} @ {fp}, valu={valu}') - ret[key] = None - else: # pragma: no cover - logger.warning(f'Missing sysctl: {key} @ {fp}') + try: + if os.path.isfile(fp): + with open(fp) as f: + valu = f.read().strip() + try: + ret[key] = func(valu) + except Exception: # pragma: no cover + logger.exception(f'Error normalizing sysctl: {key} @ {fp}, valu={valu}') + ret[key] = None + else: # pragma: no cover + logger.warning(f'Missing sysctl: {key} @ {fp}') + ret[key] = None + except: + logger.exception(f'Error while reading sysctl: {key} @ {fp}') ret[key] = None return ret diff --git a/synapse/tests/test_lib_platforms_linux.py b/synapse/tests/test_lib_platforms_linux.py index 3d10e4f5b83..fec5bc98607 100644 --- a/synapse/tests/test_lib_platforms_linux.py +++ b/synapse/tests/test_lib_platforms_linux.py @@ -1,5 +1,8 @@ +import errno import pathlib +import unittest.mock as mock + import synapse.exc as s_exc import synapse.tests.utils as s_t_utils @@ -73,3 +76,25 @@ def test_openfds(self): self.isin('soft_limit', ret) self.isin('usage', ret) self.true(ret.get('usage') > 0) + + def bad_listdir_emfile(path): + e = OSError('ruh roh') + e.errno = errno.EMFILE + raise e + + def bad_listdir_enoent(path): + e = OSError('ruh roh') + e.errno = errno.ENOENT + raise e + + with mock.patch('os.listdir', bad_listdir_emfile): + ret = s_thisplat.getOpenFdInfo() + self.isinstance(ret, dict) + self.isin('hard_limit', ret) + self.isin('soft_limit', ret) + self.true(ret.get('usage') > 0) + self.eq(ret.get('usage'), ret.get('soft_limit'), ret) + + with mock.patch('os.listdir', bad_listdir_enoent): + with 
self.raises(OSError) as cm: + s_thisplat.getOpenFdInfo() From 1bcb65fc1fdad126f9e2ee9d0fae1b2e2d0170f3 Mon Sep 17 00:00:00 2001 From: epiphyte Date: Mon, 19 Jan 2026 19:11:37 -0500 Subject: [PATCH 11/15] clean up again --- changes/ab03fe160c11224d2e8b98043be9bf3d.yaml | 2 +- synapse/lib/cell.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/changes/ab03fe160c11224d2e8b98043be9bf3d.yaml b/changes/ab03fe160c11224d2e8b98043be9bf3d.yaml index ad27a0db441..f00a50ad352 100644 --- a/changes/ab03fe160c11224d2e8b98043be9bf3d.yaml +++ b/changes/ab03fe160c11224d2e8b98043be9bf3d.yaml @@ -2,7 +2,7 @@ desc: Add a new Cell configuration option, ``limit:fd:free``. This represents the minimum percentage of available file descriptors that a Synapse service that is required in order to start up without entering a read-only state. This value - is also monitored every minute and will disable the Cell Nexus if the free + is also monitored every minute and will disable the Cell Nexus if the free file descriptors drops below the specified value. This value defaults to five percent ( ``5 %`` ) of available file descriptors. 
desc:literal: false diff --git a/synapse/lib/cell.py b/synapse/lib/cell.py index fe49751025b..a209bc5bc4c 100644 --- a/synapse/lib/cell.py +++ b/synapse/lib/cell.py @@ -1857,8 +1857,6 @@ async def _runOpenFdLoop(self): usage = fdusage['usage'] free = (limit - usage) / limit - # TODO remove me - # logger.debug(f'{self.dirn=} {usage=} {limit=} {free=} {free / limit} <= {self.min_fd_free=} ?') # If the soft_limit is not unlimited ( signaled via resource.RLIM_INFINITY ) and the # free percentage is < self.min_fd_free, we lock the cell; otherwise we remove our lock From 1ae10ce0141036ebd05d516d5ad3d61ee3c7ec40 Mon Sep 17 00:00:00 2001 From: epiphyte Date: Mon, 19 Jan 2026 19:13:51 -0500 Subject: [PATCH 12/15] cleanup --- synapse/lib/cell.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/synapse/lib/cell.py b/synapse/lib/cell.py index a209bc5bc4c..6af02fedf5b 100644 --- a/synapse/lib/cell.py +++ b/synapse/lib/cell.py @@ -1236,7 +1236,6 @@ async def __anit__(self, dirn, conf=None, readonly=False, parent=None): self.min_fd_free = self.conf.get('limit:fd:free') if self.min_fd_free is not None: self.min_fd_free = self.min_fd_free / 100 - # DISCUSS When is it even possible to check this during startup? postAnit ? self._delTmpFiles() @@ -1413,7 +1412,7 @@ async def fini(): # phase 5 - service networking await self.initServiceNetwork() - # End of __anit__ - refire the fd loop in the event that conditions have + # End of __anit__ - wake up the fd loop in the event that the cell has # changed such that the service would now go into a write only mode. 
self.checkOpenFd() From b3e4a060b912f68184efe7ec8f8531898706a630 Mon Sep 17 00:00:00 2001 From: epiphyte Date: Mon, 19 Jan 2026 19:26:58 -0500 Subject: [PATCH 13/15] Add another place to wake the cell for open fds --- synapse/lib/view.py | 1 + 1 file changed, 1 insertion(+) diff --git a/synapse/lib/view.py b/synapse/lib/view.py index 8e8cb8a8f03..dd101b75e42 100644 --- a/synapse/lib/view.py +++ b/synapse/lib/view.py @@ -1402,6 +1402,7 @@ async def insertParentFork(self, useriden, name=None): s_layer.reqValidLdef(ldef) s_schemas.reqValidView(vdef) + self.core.checkOpenFd() return await self._push('view:forkparent', ldef, vdef) @s_nexus.Pusher.onPush('view:forkparent', passitem=True) From 4501ff9558147590dd1de811a3b6dca85c81ed9c Mon Sep 17 00:00:00 2001 From: vEpiphyte Date: Tue, 20 Jan 2026 09:57:50 -0500 Subject: [PATCH 14/15] Update synapse/lib/platforms/linux.py Co-authored-by: blackout --- synapse/lib/platforms/linux.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/synapse/lib/platforms/linux.py b/synapse/lib/platforms/linux.py index b62fda508fa..7f32010c6b1 100644 --- a/synapse/lib/platforms/linux.py +++ b/synapse/lib/platforms/linux.py @@ -110,7 +110,7 @@ def getTotalMemory(): def getOpenFdInfo(): soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE) try: - usage = len(os.listdir(f'/proc/{os.getpid()}/fd')) + usage = len(os.listdir(f'/proc/self/fd')) except OSError as err: if err.errno == errno.EMFILE: # We've hit the maximum allowed files and cannot list contents of /proc/; From da6cdebbb6e3561c299b06373488f7f7b424de79 Mon Sep 17 00:00:00 2001 From: epiphyte Date: Wed, 21 Jan 2026 12:07:40 -0500 Subject: [PATCH 15/15] Back out sysctl related change --- synapse/lib/platforms/linux.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/synapse/lib/platforms/linux.py b/synapse/lib/platforms/linux.py index b62fda508fa..1a79ed93e56 100644 --- a/synapse/lib/platforms/linux.py +++ 
b/synapse/lib/platforms/linux.py @@ -133,20 +133,16 @@ def getSysctls(): ) ret = {} for key, fp, func in _sysctls: - try: - if os.path.isfile(fp): - with open(fp) as f: - valu = f.read().strip() - try: - ret[key] = func(valu) - except Exception: # pragma: no cover - logger.exception(f'Error normalizing sysctl: {key} @ {fp}, valu={valu}') - ret[key] = None - else: # pragma: no cover - logger.warning(f'Missing sysctl: {key} @ {fp}') - ret[key] = None - except: - logger.exception(f'Error while reading sysctl: {key} @ {fp}') + if os.path.isfile(fp): + with open(fp) as f: + valu = f.read().strip() + try: + ret[key] = func(valu) + except Exception: # pragma: no cover + logger.exception(f'Error normalizing sysctl: {key} @ {fp}, valu={valu}') + ret[key] = None + else: # pragma: no cover + logger.warning(f'Missing sysctl: {key} @ {fp}') ret[key] = None return ret