From afa766ceea0b0372ba1412c8210a61d08ddaf0b6 Mon Sep 17 00:00:00 2001 From: uday1o1 Date: Wed, 27 May 2026 09:43:38 -0700 Subject: [PATCH] Fix cuda.core example links in docs --- cuda_core/docs/source/conf.py | 14 ++++++++--- cuda_core/docs/source/examples.rst | 28 +++++++++++----------- cuda_core/docs/source/getting-started.rst | 4 ++-- cuda_core/docs/source/interoperability.rst | 6 ++--- 4 files changed, 30 insertions(+), 22 deletions(-) diff --git a/cuda_core/docs/source/conf.py b/cuda_core/docs/source/conf.py index 6c0fe6b3072..14d93297937 100644 --- a/cuda_core/docs/source/conf.py +++ b/cuda_core/docs/source/conf.py @@ -46,6 +46,7 @@ def _github_examples_ref(): "sphinx.ext.autosummary", "sphinx.ext.napoleon", "sphinx.ext.intersphinx", + "sphinx.ext.extlinks", "myst_nb", "sphinx_copybutton", "sphinx_toolbox.more_autodoc.autoprotocol", @@ -107,9 +108,16 @@ def _github_examples_ref(): # skip cmdline prompts copybutton_exclude = ".linenos, .gp" -rst_epilog = f""" -.. |cuda_core_github_ref| replace:: {GITHUB_EXAMPLES_REF} -""" +extlinks = { + "cuda-core-example": ( + f"https://github.com/NVIDIA/cuda-python/blob/{GITHUB_EXAMPLES_REF}/cuda_core/examples/%s", + "%s", + ), + "cuda-core-examples": ( + f"https://github.com/NVIDIA/cuda-python/tree/{GITHUB_EXAMPLES_REF}/cuda_core/examples%s", + "%s", + ), +} intersphinx_mapping = { "python": ("https://docs.python.org/3/", None), diff --git a/cuda_core/docs/source/examples.rst b/cuda_core/docs/source/examples.rst index 45044a0c905..e3b2ef8f3f5 100644 --- a/cuda_core/docs/source/examples.rst +++ b/cuda_core/docs/source/examples.rst @@ -5,55 +5,55 @@ Examples ======== This page links to the ``cuda.core`` examples shipped in the -`cuda-python repository `_. +:cuda-core-examples:`cuda-python repository `. Use it as a quick index when you want a runnable starting point for a specific workflow. Compilation and kernel launch ----------------------------- -- `vector_add.py `_ +- :cuda-core-example:`vector_add.py` compiles and launches a simple vector-add kernel with CuPy arrays. -- `saxpy.py `_ +- :cuda-core-example:`saxpy.py` JIT-compiles a templated SAXPY kernel and launches both float and double instantiations. -- `pytorch_example.py `_ +- :cuda-core-example:`pytorch_example.py` launches a CUDA kernel with PyTorch tensors and a wrapped PyTorch stream. Multi-device and advanced launch configuration ---------------------------------------------- -- `simple_multi_gpu_example.py `_ +- :cuda-core-example:`simple_multi_gpu_example.py` compiles and launches kernels across multiple GPUs. -- `thread_block_cluster.py `_ +- :cuda-core-example:`thread_block_cluster.py` demonstrates thread block cluster launch configuration on Hopper-class GPUs. -- `tma_tensor_map.py `_ +- :cuda-core-example:`tma_tensor_map.py` demonstrates Tensor Memory Accelerator descriptors and TMA-based bulk copies. Linking and graphs ------------------ -- `jit_lto_fractal.py `_ +- :cuda-core-example:`jit_lto_fractal.py` uses JIT link-time optimization to link user-provided device code into a fractal workflow at runtime. -- `cuda_graphs.py `_ +- :cuda-core-example:`cuda_graphs.py` captures and replays a multi-kernel CUDA graph to reduce launch overhead. Interoperability and memory access ---------------------------------- -- `memory_ops.py `_ +- :cuda-core-example:`memory_ops.py` covers memory resources, pinned memory, device transfers, and DLPack interop. -- `strided_memory_view_cpu.py `_ +- :cuda-core-example:`strided_memory_view_cpu.py` uses ``StridedMemoryView`` with JIT-compiled CPU code via ``cffi``. -- `strided_memory_view_gpu.py `_ +- :cuda-core-example:`strided_memory_view_gpu.py` uses ``StridedMemoryView`` with JIT-compiled GPU code and foreign GPU buffers. -- `gl_interop_plasma.py `_ +- :cuda-core-example:`gl_interop_plasma.py` renders a CUDA-generated plasma effect through OpenGL interop without CPU copies. System inspection ----------------- -- `show_device_properties.py `_ +- :cuda-core-example:`show_device_properties.py` prints a detailed report of the CUDA devices available on the system. diff --git a/cuda_core/docs/source/getting-started.rst b/cuda_core/docs/source/getting-started.rst index ebe97df8347..fb2f0b22fcf 100644 --- a/cuda_core/docs/source/getting-started.rst +++ b/cuda_core/docs/source/getting-started.rst @@ -32,7 +32,7 @@ Example: Compiling and Launching a CUDA kernel ---------------------------------------------- To get a taste for ``cuda.core``, let's walk through a simple example that compiles and launches a vector addition kernel. -You can find the complete example in `vector_add.py `_ +You can find the complete example in :cuda-core-example:`vector_add.py` and browse the :doc:`examples page ` for the rest of the shipped workflows. @@ -80,7 +80,7 @@ Note the use of the ``name_expressions`` parameter to the :meth:`Program.compile Next, we retrieve the compiled kernel from the CUBIN and prepare the arguments and kernel configuration. We're using `CuPy `_ arrays as inputs for this example, but you can use PyTorch tensors too (see -`pytorch_example.py `_ +:cuda-core-example:`pytorch_example.py` and the :doc:`examples page `). .. code-block:: python diff --git a/cuda_core/docs/source/interoperability.rst b/cuda_core/docs/source/interoperability.rst index 4aac89d13df..4aa155ce5f9 100644 --- a/cuda_core/docs/source/interoperability.rst +++ b/cuda_core/docs/source/interoperability.rst @@ -70,11 +70,11 @@ a few iterations to ensure correctness. for extracting the metadata (such as pointer address, shape, strides, and dtype) from any Python objects supporting either CAI or DLPack and returning a :class:`~utils.StridedMemoryView` object. See the -`strided_memory_view_constructors.py `_ +:cuda-core-example:`strided_memory_view_constructors.py` example for the explicit constructors, or -`strided_memory_view_cpu.py `_ +:cuda-core-example:`strided_memory_view_cpu.py` and -`strided_memory_view_gpu.py `_ +:cuda-core-example:`strided_memory_view_gpu.py` for decorator-based workflows. This provides a *concrete implementation* to both protocols that is **array-library-agnostic**, so that all Python projects can just rely on this without either re-implementing (the consumer-side of)