From 0a425e1ab43924dd5a9f1ec99fe8e3abf97cd3db Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Thu, 15 Jan 2026 22:37:26 -0800 Subject: [PATCH 1/4] pathfinder: Use LOAD_WITH_ALTERED_SEARCH_PATH for system DLL search on Windows When loading CUDA DLLs via system search on Windows, the previous approach using LoadLibraryExW with flags=0 would find the DLL on PATH but fail to locate its co-located dependencies (error 126). This fix uses SearchPathW to first find the DLL's full path, then loads it with LOAD_WITH_ALTERED_SEARCH_PATH so Windows searches for dependencies starting from the DLL's directory. --- .../_dynamic_libs/load_dl_windows.py | 50 +++++++++++++++++-- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_windows.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_windows.py index b9f15ea50b..247cc4d467 100644 --- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_windows.py +++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_windows.py @@ -13,6 +13,7 @@ ) # Mirrors WinBase.h (unfortunately not defined already elsewhere) +WINBASE_LOAD_WITH_ALTERED_SEARCH_PATH = 0x00000008 WINBASE_LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100 WINBASE_LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000 @@ -45,6 +46,17 @@ kernel32.AddDllDirectory.argtypes = [ctypes.wintypes.LPCWSTR] kernel32.AddDllDirectory.restype = ctypes.c_void_p # DLL_DIRECTORY_COOKIE +# SearchPathW - find a file in the system search path +kernel32.SearchPathW.argtypes = [ + ctypes.wintypes.LPCWSTR, # lpPath (NULL to use standard search) + ctypes.wintypes.LPCWSTR, # lpFileName + ctypes.wintypes.LPCWSTR, # lpExtension + ctypes.wintypes.DWORD, # nBufferLength + ctypes.wintypes.LPWSTR, # lpBuffer + ctypes.POINTER(ctypes.wintypes.LPWSTR), # lpFilePart +] +kernel32.SearchPathW.restype = ctypes.wintypes.DWORD + def ctypes_handle_to_unsigned_int(handle: ctypes.wintypes.HMODULE) -> int: """Convert ctypes HMODULE to unsigned int.""" @@ -113,6 +125,31 @@ def check_if_already_loaded_from_elsewhere(libname: str, have_abs_path: bool) -> return None +def _search_path_for_dll(dll_name: str) -> str | None: + """Search for a DLL using Windows SearchPathW. + + Args: + dll_name: The name of the DLL to find + + Returns: + The absolute path to the DLL if found, None otherwise + """ + buffer = ctypes.create_unicode_buffer(260) # MAX_PATH + length = kernel32.SearchPathW(None, dll_name, None, len(buffer), buffer, None) + + if length == 0: + return None + + # If buffer was too small, try with larger buffer + if length > len(buffer): + buffer = ctypes.create_unicode_buffer(length) + length = kernel32.SearchPathW(None, dll_name, None, len(buffer), buffer, None) + if length == 0: + return None + + return buffer.value + + def load_with_system_search(libname: str) -> LoadedDL | None: """Try to load a DLL using system search paths. @@ -124,10 +161,15 @@ def load_with_system_search(libname: str) -> LoadedDL | None: """ # Reverse tabulated names to achieve new → old search order. for dll_name in reversed(SUPPORTED_WINDOWS_DLLS.get(libname, ())): - handle = kernel32.LoadLibraryExW(dll_name, None, 0) - if handle: - abs_path = abs_path_for_dynamic_library(libname, handle) - return LoadedDL(abs_path, False, ctypes_handle_to_unsigned_int(handle), "system-search") + # First, find the DLL's full path using SearchPathW + found_path = _search_path_for_dll(dll_name) + if found_path: + # Load with LOAD_WITH_ALTERED_SEARCH_PATH so Windows searches for + # dependencies from the DLL's directory (required for CUDA DLLs + # whose dependencies are co-located) + handle = kernel32.LoadLibraryExW(found_path, None, WINBASE_LOAD_WITH_ALTERED_SEARCH_PATH) + if handle: + return LoadedDL(found_path, False, ctypes_handle_to_unsigned_int(handle), "system-search") return None From 2f024a57e24ee0cdfe336546b08d4304d54a1b2c Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Wed, 18 Mar 2026 17:30:25 -0700 Subject: [PATCH 2/4] test: add regression test for #1781 (nvrtc loading without CUDA_HOME/CUDA_PATH) Loads nvrtc in a subprocess with CUDA_HOME and CUDA_PATH stripped from the environment. On Windows CI where nvrtc is only reachable via PATH, this exercises the LOAD_WITH_ALTERED_SEARCH_PATH fix; on other platforms the test passes harmlessly via whatever search path finds nvrtc first. Made-with: Cursor --- .../tests/test_load_nvidia_dynamic_lib.py | 60 ++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) diff --git a/cuda_pathfinder/tests/test_load_nvidia_dynamic_lib.py b/cuda_pathfinder/tests/test_load_nvidia_dynamic_lib.py index 016acfd25d..abc0aa0ca9 100644 --- a/cuda_pathfinder/tests/test_load_nvidia_dynamic_lib.py +++ b/cuda_pathfinder/tests/test_load_nvidia_dynamic_lib.py @@ -3,9 +3,12 @@ import os import platform +import subprocess import pytest from child_load_nvidia_dynamic_lib_helper import ( + LOAD_NVIDIA_DYNAMIC_LIB_SUBPROCESS_CWD, + LOAD_NVIDIA_DYNAMIC_LIB_SUBPROCESS_MODE, build_child_process_failed_for_libname_message, run_load_nvidia_dynamic_lib_in_subprocess, ) @@ -14,7 +17,11 @@ from cuda.pathfinder import DynamicLibNotAvailableError, DynamicLibUnknownError, load_nvidia_dynamic_lib from cuda.pathfinder._dynamic_libs import load_nvidia_dynamic_lib as load_nvidia_dynamic_lib_module from cuda.pathfinder._dynamic_libs import supported_nvidia_libs -from cuda.pathfinder._dynamic_libs.subprocess_protocol import STATUS_NOT_FOUND, parse_dynamic_lib_subprocess_payload +from cuda.pathfinder._dynamic_libs.subprocess_protocol import ( + STATUS_NOT_FOUND, + build_dynamic_lib_subprocess_command, + parse_dynamic_lib_subprocess_payload, +) from cuda.pathfinder._utils.platform_aware import IS_WINDOWS, quote_for_shell STRICTNESS = os.environ.get("CUDA_PATHFINDER_TEST_LOAD_NVIDIA_DYNAMIC_LIB_STRICTNESS", "see_what_works") @@ -134,3 +141,54 @@ def raise_child_process_failed(): assert abs_path is not None info_summary_append(f"abs_path={quote_for_shell(abs_path)}") assert os.path.isfile(abs_path) # double-check the abs_path + + +def test_load_nvrtc_without_cuda_home_or_cuda_path(info_summary_append): + """Regression test for issue #1781: nvrtc must load without CUDA_HOME/CUDA_PATH. + + On Windows, when CUDA DLLs are discovered via PATH (system search), the + previous LoadLibraryExW(flags=0) call would find the DLL but fail to + resolve co-located dependencies like nvrtc-builtins (error 126). + + The fix uses SearchPathW to resolve the full path, then loads with + LOAD_WITH_ALTERED_SEARCH_PATH so dependency search starts from the + DLL's directory. + + This test strips CUDA_HOME and CUDA_PATH, then loads nvrtc in a fresh + subprocess. In CI environments where nvrtc is only available via system + search, this exercises the exact code path that was broken. + """ + env = os.environ.copy() + env.pop("CUDA_HOME", None) + env.pop("CUDA_PATH", None) + + timeout = 120 if IS_WINDOWS else 30 + command = build_dynamic_lib_subprocess_command(LOAD_NVIDIA_DYNAMIC_LIB_SUBPROCESS_MODE, "nvrtc") + result = subprocess.run( # noqa: S603 + command, + capture_output=True, + text=True, + timeout=timeout, + check=False, + env=env, + cwd=LOAD_NVIDIA_DYNAMIC_LIB_SUBPROCESS_CWD, + ) + + if result.returncode != 0: + raise RuntimeError(build_child_process_failed_for_libname_message("nvrtc", result)) + assert not result.stderr + + payload = parse_dynamic_lib_subprocess_payload( + result.stdout, + libname="nvrtc", + error_label="Load subprocess child process (no CUDA_HOME/CUDA_PATH)", + ) + + if payload.status == STATUS_NOT_FOUND: + info_summary_append("nvrtc not found without CUDA_HOME/CUDA_PATH") + pytest.skip("nvrtc not available without CUDA_HOME/CUDA_PATH") + + abs_path = payload.abs_path + assert abs_path is not None + info_summary_append(f"nvrtc (no CUDA_HOME/CUDA_PATH): abs_path={quote_for_shell(abs_path)}") + assert os.path.isfile(abs_path) From b2aabfe7eb29f477f33e1816801cdda6792ae891 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Thu, 19 Mar 2026 14:42:15 -0700 Subject: [PATCH 3/4] test: harden nvrtc regression test with PATH-based failure detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Also load nvrtc with the normal environment first. If the normal load finds nvrtc in a directory on PATH but the CUDA_HOME/CUDA_PATH-stripped load fails, the test now fails instead of skipping — directly catching the Windows bug where SetDefaultDllDirectories excludes PATH from LoadLibraryExW. Made-with: Cursor --- .../tests/test_load_nvidia_dynamic_lib.py | 89 +++++++++++++------ 1 file changed, 60 insertions(+), 29 deletions(-) diff --git a/cuda_pathfinder/tests/test_load_nvidia_dynamic_lib.py b/cuda_pathfinder/tests/test_load_nvidia_dynamic_lib.py index abc0aa0ca9..34ab208c43 100644 --- a/cuda_pathfinder/tests/test_load_nvidia_dynamic_lib.py +++ b/cuda_pathfinder/tests/test_load_nvidia_dynamic_lib.py @@ -146,25 +146,42 @@ def raise_child_process_failed(): def test_load_nvrtc_without_cuda_home_or_cuda_path(info_summary_append): """Regression test for issue #1781: nvrtc must load without CUDA_HOME/CUDA_PATH. - On Windows, when CUDA DLLs are discovered via PATH (system search), the - previous LoadLibraryExW(flags=0) call would find the DLL but fail to - resolve co-located dependencies like nvrtc-builtins (error 126). - - The fix uses SearchPathW to resolve the full path, then loads with - LOAD_WITH_ALTERED_SEARCH_PATH so dependency search starts from the - DLL's directory. - - This test strips CUDA_HOME and CUDA_PATH, then loads nvrtc in a fresh - subprocess. In CI environments where nvrtc is only available via system - search, this exercises the exact code path that was broken. + On Windows, Python 3.8+ calls SetDefaultDllDirectories(LOAD_LIBRARY_SEARCH_DEFAULT_DIRS) + at startup, which excludes PATH from the LoadLibraryExW search order. The fix uses + SearchPathW (unaffected by SetDefaultDllDirectories) to locate the DLL via PATH, then + loads it by absolute path with LOAD_WITH_ALTERED_SEARCH_PATH. + + This test loads nvrtc twice in fresh subprocesses: once with the normal environment, + once with CUDA_HOME and CUDA_PATH stripped. If the normal load finds nvrtc in a + directory on PATH, the stripped load must also succeed — otherwise the system search + is broken. """ + timeout = 120 if IS_WINDOWS else 30 + + # Phase 1: load nvrtc with normal environment. + normal_result = run_load_nvidia_dynamic_lib_in_subprocess("nvrtc", timeout=timeout) + if normal_result.returncode != 0: + raise RuntimeError(build_child_process_failed_for_libname_message("nvrtc", normal_result)) + assert not normal_result.stderr + normal_payload = parse_dynamic_lib_subprocess_payload( + normal_result.stdout, + libname="nvrtc", + error_label="Load subprocess child process (normal env)", + ) + if normal_payload.status == STATUS_NOT_FOUND: + info_summary_append("nvrtc not found (normal env)") + pytest.skip("nvrtc not available in this environment") + normal_abs_path = normal_payload.abs_path + assert normal_abs_path is not None + assert os.path.isfile(normal_abs_path) + info_summary_append(f"nvrtc (normal env): abs_path={quote_for_shell(normal_abs_path)}") + + # Phase 2: load nvrtc without CUDA_HOME/CUDA_PATH. env = os.environ.copy() env.pop("CUDA_HOME", None) env.pop("CUDA_PATH", None) - - timeout = 120 if IS_WINDOWS else 30 command = build_dynamic_lib_subprocess_command(LOAD_NVIDIA_DYNAMIC_LIB_SUBPROCESS_MODE, "nvrtc") - result = subprocess.run( # noqa: S603 + stripped_result = subprocess.run( # noqa: S603 command, capture_output=True, text=True, @@ -173,22 +190,36 @@ def test_load_nvrtc_without_cuda_home_or_cuda_path(info_summary_append): env=env, cwd=LOAD_NVIDIA_DYNAMIC_LIB_SUBPROCESS_CWD, ) - - if result.returncode != 0: - raise RuntimeError(build_child_process_failed_for_libname_message("nvrtc", result)) - assert not result.stderr - - payload = parse_dynamic_lib_subprocess_payload( - result.stdout, + if stripped_result.returncode != 0: + raise RuntimeError(build_child_process_failed_for_libname_message("nvrtc", stripped_result)) + assert not stripped_result.stderr + stripped_payload = parse_dynamic_lib_subprocess_payload( + stripped_result.stdout, libname="nvrtc", error_label="Load subprocess child process (no CUDA_HOME/CUDA_PATH)", ) - if payload.status == STATUS_NOT_FOUND: - info_summary_append("nvrtc not found without CUDA_HOME/CUDA_PATH") - pytest.skip("nvrtc not available without CUDA_HOME/CUDA_PATH") - - abs_path = payload.abs_path - assert abs_path is not None - info_summary_append(f"nvrtc (no CUDA_HOME/CUDA_PATH): abs_path={quote_for_shell(abs_path)}") - assert os.path.isfile(abs_path) + # Phase 3: evaluate. + if stripped_payload.status != STATUS_NOT_FOUND: + stripped_abs_path = stripped_payload.abs_path + assert stripped_abs_path is not None + assert os.path.isfile(stripped_abs_path) + info_summary_append(f"nvrtc (no CUDA_HOME/CUDA_PATH): abs_path={quote_for_shell(stripped_abs_path)}") + return + + # nvrtc was found normally but not without CUDA_HOME/CUDA_PATH. + # If the DLL's directory is on PATH, the system search should have found it. + dll_dir = os.path.normcase(os.path.normpath(os.path.dirname(normal_abs_path))) + on_path = any( + os.path.normcase(os.path.normpath(d)) == dll_dir for d in os.environ.get("PATH", "").split(os.pathsep) if d + ) + if on_path: + pytest.fail( + f"nvrtc was found at {normal_abs_path!r} (directory is on PATH) " + f"but could not be loaded without CUDA_HOME/CUDA_PATH. " + f"System search should find DLLs in PATH directories." + ) + info_summary_append( + f"nvrtc (no CUDA_HOME/CUDA_PATH): not found " + f"(normal-env directory not on PATH: {quote_for_shell(os.path.dirname(normal_abs_path))})" + ) From c3620f41ea3e059f48a016e1a1c34160b24e74cd Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Thu, 19 Mar 2026 15:06:53 -0700 Subject: [PATCH 4/4] fix: update comments in load_with_system_search to reflect correct diagnosis SearchPathW is the primary fix (bypasses Python 3.8+'s SetDefaultDllDirectories restriction); LOAD_WITH_ALTERED_SEARCH_PATH is a secondary benefit for dependency resolution. Made-with: Cursor --- .../cuda/pathfinder/_dynamic_libs/load_dl_windows.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_windows.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_windows.py index 7ee443d450..b7c0a8c4c3 100644 --- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_windows.py +++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_windows.py @@ -163,12 +163,11 @@ def load_with_system_search(desc: LibDescriptor) -> LoadedDL | None: """ # Reverse tabulated names to achieve new -> old search order. for dll_name in reversed(desc.windows_dlls): - # First, find the DLL's full path using SearchPathW + # SearchPathW bypasses Python 3.8+'s SetDefaultDllDirectories restriction. found_path = _search_path_for_dll(dll_name) if found_path: - # Load with LOAD_WITH_ALTERED_SEARCH_PATH so Windows searches for - # dependencies from the DLL's directory (required for CUDA DLLs - # whose dependencies are co-located) + # LOAD_WITH_ALTERED_SEARCH_PATH additionally ensures dependencies + # are resolved from the DLL's directory. handle = kernel32.LoadLibraryExW(found_path, None, WINBASE_LOAD_WITH_ALTERED_SEARCH_PATH) if handle: return LoadedDL(found_path, False, ctypes_handle_to_unsigned_int(handle), "system-search")