From afa766ceea0b0372ba1412c8210a61d08ddaf0b6 Mon Sep 17 00:00:00 2001
From: uday1o1 <uday1o1arora@gmail.com>
Date: Wed, 27 May 2026 09:43:38 -0700
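Subject: [PATCH] Fix cuda.core example links in docs

The example links embedded the |cuda_core_github_ref| substitution
inside hyperlink targets. reStructuredText does not expand
substitution references within an embedded URI, so the rendered URLs
contained the literal placeholder instead of the Git ref.

Replace the rst_epilog substitution with two sphinx.ext.extlinks
roles, :cuda-core-example: (a file under cuda_core/examples) and
:cuda-core-examples: (the examples directory), whose URL templates
are built from GITHUB_EXAMPLES_REF in conf.py, and switch
examples.rst, getting-started.rst and interoperability.rst over to
them.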
---
 cuda_core/docs/source/conf.py              | 14 ++++++++---
 cuda_core/docs/source/examples.rst         | 28 +++++++++++-----------
 cuda_core/docs/source/getting-started.rst  |  4 ++--
 cuda_core/docs/source/interoperability.rst |  6 ++---
 4 files changed, 30 insertions(+), 22 deletions(-)

diff --git a/cuda_core/docs/source/conf.py b/cuda_core/docs/source/conf.py
index 6c0fe6b3072..14d93297937 100644
--- a/cuda_core/docs/source/conf.py
+++ b/cuda_core/docs/source/conf.py
@@ -46,6 +46,7 @@ def _github_examples_ref():
     "sphinx.ext.autosummary",
     "sphinx.ext.napoleon",
     "sphinx.ext.intersphinx",
+    "sphinx.ext.extlinks",
     "myst_nb",
     "sphinx_copybutton",
     "sphinx_toolbox.more_autodoc.autoprotocol",
@@ -107,9 +108,16 @@ def _github_examples_ref():
 # skip cmdline prompts
 copybutton_exclude = ".linenos, .gp"
 
-rst_epilog = f"""
-.. |cuda_core_github_ref| replace:: {GITHUB_EXAMPLES_REF}
-"""
+extlinks = {
+    "cuda-core-example": (
+        f"https://github.com/NVIDIA/cuda-python/blob/{GITHUB_EXAMPLES_REF}/cuda_core/examples/%s",
+        "%s",
+    ),
+    "cuda-core-examples": (
+        f"https://github.com/NVIDIA/cuda-python/tree/{GITHUB_EXAMPLES_REF}/cuda_core/examples%s",
+        "%s",
+    ),
+}
 
 intersphinx_mapping = {
     "python": ("https://docs.python.org/3/", None),
diff --git a/cuda_core/docs/source/examples.rst b/cuda_core/docs/source/examples.rst
index 45044a0c905..e3b2ef8f3f5 100644
--- a/cuda_core/docs/source/examples.rst
+++ b/cuda_core/docs/source/examples.rst
@@ -5,55 +5,55 @@ Examples
 ========
 
 This page links to the ``cuda.core`` examples shipped in the
-`cuda-python repository <https://github.com/NVIDIA/cuda-python/tree/|cuda_core_github_ref|/cuda_core/examples>`_.
+:cuda-core-examples:`cuda-python repository </>`.
 Use it as a quick index when you want a runnable starting point for a specific
 workflow.
 
 Compilation and kernel launch
 -----------------------------
 
-- `vector_add.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/vector_add.py>`_
+- :cuda-core-example:`vector_add.py`
   compiles and launches a simple vector-add kernel with CuPy arrays.
-- `saxpy.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/saxpy.py>`_
+- :cuda-core-example:`saxpy.py`
   JIT-compiles a templated SAXPY kernel and launches both float and double
   instantiations.
-- `pytorch_example.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/pytorch_example.py>`_
+- :cuda-core-example:`pytorch_example.py`
   launches a CUDA kernel with PyTorch tensors and a wrapped PyTorch stream.
 
 Multi-device and advanced launch configuration
 ----------------------------------------------
 
-- `simple_multi_gpu_example.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/simple_multi_gpu_example.py>`_
+- :cuda-core-example:`simple_multi_gpu_example.py`
   compiles and launches kernels across multiple GPUs.
-- `thread_block_cluster.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/thread_block_cluster.py>`_
+- :cuda-core-example:`thread_block_cluster.py`
   demonstrates thread block cluster launch configuration on Hopper-class GPUs.
-- `tma_tensor_map.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/tma_tensor_map.py>`_
+- :cuda-core-example:`tma_tensor_map.py`
   demonstrates Tensor Memory Accelerator descriptors and TMA-based bulk copies.
 
 Linking and graphs
 ------------------
 
-- `jit_lto_fractal.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/jit_lto_fractal.py>`_
+- :cuda-core-example:`jit_lto_fractal.py`
   uses JIT link-time optimization to link user-provided device code into a
   fractal workflow at runtime.
-- `cuda_graphs.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/cuda_graphs.py>`_
+- :cuda-core-example:`cuda_graphs.py`
   captures and replays a multi-kernel CUDA graph to reduce launch overhead.
 
 Interoperability and memory access
 ----------------------------------
 
-- `memory_ops.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/memory_ops.py>`_
+- :cuda-core-example:`memory_ops.py`
   covers memory resources, pinned memory, device transfers, and DLPack interop.
-- `strided_memory_view_cpu.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/strided_memory_view_cpu.py>`_
+- :cuda-core-example:`strided_memory_view_cpu.py`
   uses ``StridedMemoryView`` with JIT-compiled CPU code via ``cffi``.
-- `strided_memory_view_gpu.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/strided_memory_view_gpu.py>`_
+- :cuda-core-example:`strided_memory_view_gpu.py`
   uses ``StridedMemoryView`` with JIT-compiled GPU code and foreign GPU buffers.
-- `gl_interop_plasma.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/gl_interop_plasma.py>`_
+- :cuda-core-example:`gl_interop_plasma.py`
   renders a CUDA-generated plasma effect through OpenGL interop without CPU
   copies.
 
 System inspection
 -----------------
 
-- `show_device_properties.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/show_device_properties.py>`_
+- :cuda-core-example:`show_device_properties.py`
   prints a detailed report of the CUDA devices available on the system.
diff --git a/cuda_core/docs/source/getting-started.rst b/cuda_core/docs/source/getting-started.rst
index ebe97df8347..fb2f0b22fcf 100644
--- a/cuda_core/docs/source/getting-started.rst
+++ b/cuda_core/docs/source/getting-started.rst
@@ -32,7 +32,7 @@ Example: Compiling and Launching a CUDA kernel
 ----------------------------------------------
 
 To get a taste for ``cuda.core``, let's walk through a simple example that compiles and launches a vector addition kernel.
-You can find the complete example in `vector_add.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/vector_add.py>`_
+You can find the complete example in :cuda-core-example:`vector_add.py`
 and browse the :doc:`examples page <examples>` for the rest of the shipped
 workflows.
 
@@ -80,7 +80,7 @@ Note the use of the ``name_expressions`` parameter to the :meth:`Program.compile
 Next, we retrieve the compiled kernel from the CUBIN and prepare the arguments and kernel configuration.
 We're using `CuPy <https://cupy.dev/>`_ arrays as inputs for this example, but
 you can use PyTorch tensors too (see
-`pytorch_example.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/pytorch_example.py>`_
+:cuda-core-example:`pytorch_example.py`
 and the :doc:`examples page <examples>`).
 
 .. code-block:: python
diff --git a/cuda_core/docs/source/interoperability.rst b/cuda_core/docs/source/interoperability.rst
index 4aac89d13df..4aa155ce5f9 100644
--- a/cuda_core/docs/source/interoperability.rst
+++ b/cuda_core/docs/source/interoperability.rst
@@ -70,11 +70,11 @@ a few iterations to ensure correctness.
 for extracting the metadata (such as pointer address, shape, strides, and
 dtype) from any Python objects supporting either CAI or DLPack and returning a
 :class:`~utils.StridedMemoryView` object. See the
-`strided_memory_view_constructors.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/strided_memory_view_constructors.py>`_
+:cuda-core-example:`strided_memory_view_constructors.py`
 example for the explicit constructors, or
-`strided_memory_view_cpu.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/strided_memory_view_cpu.py>`_
+:cuda-core-example:`strided_memory_view_cpu.py`
 and
-`strided_memory_view_gpu.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/strided_memory_view_gpu.py>`_
+:cuda-core-example:`strided_memory_view_gpu.py`
 for decorator-based workflows. This provides a *concrete implementation* to
 both protocols that is **array-library-agnostic**, so that all Python projects
 can just rely on this without either re-implementing (the consumer-side of)