diff --git a/.dockerignore b/.dockerignore index 038120b..a6f4128 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,3 +1,4 @@ +README.md bin/ .git/ -*.tar.gz \ No newline at end of file +*.tar.gz diff --git a/.github/workflows/e2e-tests.yaml b/.github/workflows/e2e-tests.yaml new file mode 100644 index 0000000..f3e8c45 --- /dev/null +++ b/.github/workflows/e2e-tests.yaml @@ -0,0 +1,109 @@ +name: e2e +on: + pull_request: + branches: [main] + push: + branches: [main] +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +env: + KIND_VERSION: v0.32.0 + IMAGE: ghcr.io/converged-computing/fluence:latest + +jobs: + e2e: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + #- name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + + # - name: Build fluence image + # uses: docker/build-push-action@v6 + # with: + # context: . + # file: ./Dockerfile + # push: false + # load: true + # tags: ${{ env.IMAGE }} + # cache-from: type=gha + # cache-to: type=gha,mode=max + - name: Create k8s Kind Cluster + uses: helm/kind-action@v1.10.0 + with: + version: v0.32.0 # Define your custom KinD CLI version here + node_image: kindest/node:v1.36.1 + config: ./deploy/kind-config.yaml + + - name: Free Disk Space (Ubuntu) + run: | + echo "=== Disk space before cleanup ===" + df -h + + # Remove large software runtimes and tools + sudo rm -rf /usr/share/dotnet + sudo rm -rf /usr/local/lib/android + sudo rm -rf /opt/ghc + sudo rm -rf /opt/hostedtoolcache/CodeQL + + # Clean package caches + sudo apt-get clean + echo "=== Disk space after cleanup ===" + df -h + + - name: Deploy fluence (base) + run: | + kubectl apply -f deploy/fluence-test.yaml + kubectl rollout status -n kube-system deployment/fluence --timeout=180s + POD=$(kubectl -n kube-system get pods -l app=fluence -o name | head -1) + kubectl -n kube-system exec "${POD#pod/}" -- ls /tmp/ + kubectl -n kube-system logs "${POD#pod/}" + kubectl -n kube-system exec "${POD#pod/}" -- /bin/bash -c "cat /tmp/fluence-graph-*.json" + kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{": cpu="}{.status.allocatable.cpu}{" mem="}{.status.allocatable.memory}{"\n"}{end}' + + - name: E2E - classical gang + run: bash test/e2e/01-classical-gang.sh + + - name: Deploy quantum add-on + run: | + # Includes the device plugin and oriented to testing container + kubectl apply -f deploy/fluence-resources-test.yaml + kubectl rollout restart -n kube-system deployment/fluence + kubectl rollout status -n kube-system deployment/fluence --timeout=60s + for i in $(seq 1 60); do + kubectl get nodes -o jsonpath='{range .items[*]}{.status.allocatable}{"\n"}{end}' + kubectl get nodes -o jsonpath='{range .items[*]}{.status.allocatable}{"\n"}{end}' | grep -q 'fluxion.flux-framework.org/qpu' && break + sleep 1 + done + POD=$(kubectl -n kube-system get pods -l app=fluence -o name | head -1) + kubectl -n kube-system exec "${POD#pod/}" -- /bin/bash -c "cat /tmp/fluence-graph-*.json" + + - name: Wait for webhook + run: | + + # wait for the deployment AND for the caBundle to be populated on the webhook config + kubectl -n kube-system rollout status deployment/fluence-webhook --timeout=120s + for i in $(seq 1 30); do + cab=$(kubectl get mutatingwebhookconfiguration fluence-webhook \ + -o jsonpath='{.webhooks[0].clientConfig.caBundle}' 2>/dev/null) + [ -n "$cab" ] && break + sleep 2 + done + # let TLS serving settle after caBundle patch + sleep 3 + + - name: E2E - quantum placement + run: bash test/e2e/02-quantum-placement.sh + + #- name: E2E - restart recovery (no double-book) + # run: bash test/e2e/03-restart-recovery.sh + + - name: Dump diagnostics on failure + if: failure() + run: | + kubectl get pods -A -o wide + kubectl logs -n kube-system deployment/fluence diff --git a/Dockerfile b/Dockerfile index ceed5da..2fe9596 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,33 +1,5 @@ -# Mr. Fluence! -# Multi-stage build for the fluence scheduler. -# The scheduler binary cgo-links flux-sched (Fluxion) for resource matching. +FROM ghcr.io/converged-computing/fluence-base:latest AS builder -FROM fluxrm/flux-core:noble AS builder - -USER root -ENV LD_LIBRARY_PATH=/usr/lib:/usr/local/lib -ENV DEBIAN_FRONTEND=noninteractive - -RUN apt-get update && apt-get install -y --no-install-recommends \ - libboost-graph-dev libboost-system-dev libboost-filesystem-dev \ - libboost-regex-dev libyaml-cpp-dev libedit-dev libczmq-dev \ - python3-yaml ninja-build cmake curl git wget ca-certificates \ - && rm -rf /var/lib/apt/lists/* - -# Go toolchain -RUN wget -q https://go.dev/dl/go1.26.0.linux-amd64.tar.gz \ - && tar -C /usr/local -xzf go1.26.0.linux-amd64.tar.gz && rm go1.26.0.linux-amd64.tar.gz -ENV PATH=$PATH:/usr/local/go/bin - -# flux-sched (Fluxion) with the Go reapi bindings -> /usr; build tree at /opt/flux-sched -#RUN git clone https://github.com/flux-framework/flux-sched /opt/flux-sched \ -RUN git clone -b implement-reapi-cli-update-allocate https://github.com/vsoch/flux-sched /opt/flux-sched \ - && export FLUX_SCHED_VERSION=0.53.0 \ - && cd /opt/flux-sched && export WITH_GO=yes && ./configure --prefix=/usr \ - && mkdir build && cd build && cmake ../ && cd ../ && make -j"$(nproc)" && make install -ENV FLUX_SCHED_ROOT=/opt/flux-sched - -# Build the scheduler WORKDIR /src COPY go.mod go.sum* ./ RUN go mod download || true diff --git a/Makefile b/Makefile index 5ea76a7..46c3344 100644 --- a/Makefile +++ b/Makefile @@ -43,6 +43,20 @@ test-restore: image: ## Build the scheduler container image docker build -t $(IMG) . +.PHONY: test-image +test-image: ## Build the scheduler container image + docker build -t $(IMG)-test . + docker push $(IMG)-test + +.PHONY: test-image-deploy +test-image-deploy: test-image + kubectl patch podgroup training -n default --type=merge -p '{"metadata":{"finalizers":null}}' || true + kubectl delete deployments --all + kubectl delete pods --all + kubectl delete -f deploy/fluence-test.yaml + kubectl delete pods --all + + .PHONY: deploy deploy: ## Install RBAC + scheduler into kube-system kubectl apply -f deploy/fluence.yaml @@ -50,4 +64,4 @@ deploy: ## Install RBAC + scheduler into kube-system .PHONY: help help: @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | \ - awk 'BEGIN{FS=":.*?## "}{printf " %-14s %s\n", $$1, $$2}' \ No newline at end of file + awk 'BEGIN{FS=":.*?## "}{printf " %-14s %s\n", $$1, $$2}' diff --git a/README.md b/README.md index b1b3d33..50849dc 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,8 @@ ![img/fluence.png](img/fluence.png) +🚧 **UNDER DEVELOPMENT** 🚧 not ready for production use! I rolled back features since the recorded demo, and am going to add them back with proper testing. I have not finished this yet, but anticipate later in the week of 6/16/2026. Thank you for your patience! -@vsoch + A Kubernetes scheduler plugin that places **pod groups** (and individual pods) by matching them against a [Fluxion](https://github.com/flux-framework/flux-sched) (flux-sched) resource graph built from the live cluster. diff --git a/deploy/fluence-resources-test.yaml b/deploy/fluence-resources-test.yaml new file mode 100644 index 0000000..d4fed97 --- /dev/null +++ b/deploy/fluence-resources-test.yaml @@ -0,0 +1,76 @@ +# Resources add-on for fluence. Turns on fluence-managed resources by supplying +# (1) the resources config and (2) the device plugin that advertises them. +# Quantum backends are just the example payload here; any resource type fluence +# can model goes in the same ConfigMap. Apply AFTER deploy/fluence.yaml: +# +# kubectl apply -f deploy/fluence.yaml # base scheduler (no devices) +# kubectl apply -f deploy/fluence-resources.yaml # + resources config + device plugin +# kubectl rollout restart deployment/fluence -n kube-system # scheduler re-reads resources +# +# The base scheduler already mounts the `fluence-resources` ConfigMap optionally +# and reads FLUENCE_RESOURCES, so this add-on is purely additive — no edits to +# the base Deployment. + +# Resources config: the SINGLE source of truth for the resource types fluence +# injects/advertises. The scheduler builds qpu/qubit graph vertices from it; the +# device plugin derives which extended resources to advertise from the SAME +# document (same rule), so the two cannot drift. +apiVersion: v1 +kind: ConfigMap +metadata: + name: fluence-resources + namespace: kube-system +data: + resources.yaml: | + backends: + - name: ibm_fez + num_qubits: 156 + vendor: ibm + qrmi_type: qiskit-runtime-service + - name: ibm_marrakesh + num_qubits: 156 + vendor: ibm + qrmi_type: qiskit-runtime-service +--- +# Device plugin: advertises the exotic Fluxion resource types (derived from the +# resources config above) on every node, so pods can request them via resources +# and NodeResourcesFit is satisfied. +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: fluence-deviceplugin + namespace: kube-system + labels: {app: fluence-deviceplugin} +spec: + selector: + matchLabels: {app: fluence-deviceplugin} + template: + metadata: + labels: {app: fluence-deviceplugin} + spec: + priorityClassName: system-node-critical + tolerations: + - operator: Exists # run on every node, including tainted/control-plane + containers: + - name: deviceplugin + image: ghcr.io/converged-computing/fluence:test + command: ["/bin/fluence-deviceplugin"] + env: + - name: FLUENCE_RESOURCES + value: /etc/fluence/resources.yaml + - name: FLUENCE_RESOURCE_CAPACITY + value: "1000" + securityContext: + privileged: true + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + - name: resources + mountPath: /etc/fluence + volumes: + - name: device-plugin + hostPath: + path: /var/lib/kubelet/device-plugins + - name: resources + configMap: + name: fluence-resources diff --git a/deploy/fluence-test.yaml b/deploy/fluence-test.yaml new file mode 100644 index 0000000..525713e --- /dev/null +++ b/deploy/fluence-test.yaml @@ -0,0 +1,224 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: fluence + namespace: kube-system +--- +# Bind the built-in scheduler roles so fluence (a full kube-scheduler build) has +# every list/watch the scheduling framework needs (nodes, pods, PV/PVC, CSI, +# storageclasses, resourceclaims/slices, volumeattachments, events, etc.). +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: fluence-as-kube-scheduler +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:kube-scheduler +subjects: + - kind: ServiceAccount + name: fluence + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: fluence-as-volume-scheduler +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:volume-scheduler +subjects: + - kind: ServiceAccount + name: fluence + namespace: kube-system +--- +# Delegated authentication: read the auth configmap in kube-system. This is the +# fix for the "extension-apiserver-authentication ... forbidden" errors. +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: fluence-extension-apiserver-authentication-reader + namespace: kube-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: extension-apiserver-authentication-reader +subjects: + - kind: ServiceAccount + name: fluence + namespace: kube-system +--- +# Extras the built-in scheduler role does not grant: the alpha PodGroup/Workload +# API (gang), and leader-election leases under our scheduler name. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: fluence-extra +rules: + - apiGroups: ["scheduling.k8s.io"] + resources: ["podgroups", "workloads", "podgroups/status", "workloads/status"] + verbs: ["get", "list", "watch", "update", "patch"] + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["create", "get", "update", "list", "watch"] + # PreBind stamps the allocated backend onto the pod as an annotation; the + # built-in system:kube-scheduler role only allows patching pods/status, not + # the pod object, so grant it here. + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "watch", "patch", "update"] + # The webhook self-manages its TLS by patching its own config's caBundle. + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["mutatingwebhookconfigurations"] + verbs: ["get", "list", "watch", "patch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: fluence-extra +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: fluence-extra +subjects: + - kind: ServiceAccount + name: fluence + namespace: kube-system +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: fluence-scheduler-config + namespace: kube-system +data: + scheduler-config.yaml: | + apiVersion: kubescheduler.config.k8s.io/v1 + kind: KubeSchedulerConfiguration + leaderElection: + leaderElect: false + profiles: + - schedulerName: fluence + plugins: + # multiPoint wires Fluence into every extension point its Go type + # implements: PreFilter, Filter, and PreBind (which stamps the backend + # annotation). Listing points individually risks omitting one — that is + # exactly what left PreBind unwired and the backend annotation unset. + multiPoint: + enabled: [{name: Fluence}] +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: fluence + namespace: kube-system + labels: {app: fluence} +spec: + replicas: 1 + selector: + matchLabels: {app: fluence} + template: + metadata: + labels: {app: fluence} + spec: + serviceAccountName: fluence + containers: + - name: fluence + image: ghcr.io/converged-computing/fluence:test + # Allows for kind load + # imagePullPolicy: Never + command: + - /bin/fluence + - --config=/etc/fluence/scheduler-config.yaml + # fluence is its own scheduler binary, so it needs the gang gates set + # here (the cluster-level kube-scheduler gates don't apply to it). + # Without these its PodGroup/GangScheduling plugin is inactive, pods + # schedule with no gang semantics, and PodGroup status stays Pending. + - --feature-gates=GenericWorkload=true,GangScheduling=true + - --v=4 + env: + # Path to the resources config (e.g. quantum backends). Unset/empty + # file -> classical-only graph. Supplied by the quantum add-on. + - name: FLUENCE_RESOURCES + value: /etc/fluence/resources.yaml + volumeMounts: + - name: config + mountPath: /etc/fluence + volumes: + - name: config + projected: + sources: + - configMap: {name: fluence-scheduler-config} + - configMap: {name: fluence-resources, optional: true} +--- +# Mutating webhook: injects scheduler-chosen values into pods at creation time +# (currently a downward-API QRMI_BACKEND env for quantum pods). It self-manages +# TLS — generates a CA + serving cert at startup and patches the caBundle below — +# so no cert-manager and no committed keys. failurePolicy Ignore keeps a webhook +# outage from blocking pod creation cluster-wide. +apiVersion: apps/v1 +kind: Deployment +metadata: + name: fluence-webhook + namespace: kube-system + labels: {app: fluence-webhook} +spec: + replicas: 1 + selector: + matchLabels: {app: fluence-webhook} + template: + metadata: + labels: {app: fluence-webhook} + spec: + serviceAccountName: fluence + containers: + - name: webhook + image: ghcr.io/converged-computing/fluence:test + # Allows for kind load + # imagePullPolicy: Never + command: ["/bin/fluence-webhook"] + ports: + - containerPort: 8443 + readinessProbe: + httpGet: {path: /healthz, port: 8443, scheme: HTTPS} + initialDelaySeconds: 2 +--- +apiVersion: v1 +kind: Service +metadata: + name: fluence-webhook + namespace: kube-system +spec: + selector: {app: fluence-webhook} + ports: + - port: 443 + targetPort: 8443 +--- +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: fluence-webhook +webhooks: + - name: pods.fluence.flux-framework.org + admissionReviewVersions: ["v1"] + sideEffects: None + failurePolicy: Ignore # never block pod creation if the webhook is down + # caBundle is filled in at runtime by the webhook patching this object. + clientConfig: + service: + name: fluence-webhook + namespace: kube-system + path: /mutate + port: 443 + rules: + - apiGroups: [""] + apiVersions: ["v1"] + operations: ["CREATE"] + resources: ["pods"] + scope: Namespaced + # Don't intercept system pods (and avoid bootstrap coupling). + namespaceSelector: + matchExpressions: + - key: kubernetes.io/metadata.name + operator: NotIn + values: ["kube-system"] diff --git a/deploy/fluence.yaml b/deploy/fluence.yaml index 8090a34..0b66246 100644 --- a/deploy/fluence.yaml +++ b/deploy/fluence.yaml @@ -125,8 +125,6 @@ spec: containers: - name: fluence image: ghcr.io/converged-computing/fluence:latest - # Uncomment for development or if loaded into kind - # imagePullPolicy: Never command: - /bin/fluence - --config=/etc/fluence/scheduler-config.yaml @@ -174,7 +172,6 @@ spec: containers: - name: webhook image: ghcr.io/converged-computing/fluence:latest - imagePullPolicy: Never command: ["/bin/fluence-webhook"] ports: - containerPort: 8443 diff --git a/deploy/kind-config.yaml b/deploy/kind-config.yaml index ec310bc..c94e070 100644 --- a/deploy/kind-config.yaml +++ b/deploy/kind-config.yaml @@ -32,4 +32,4 @@ nodes: - name: feature-gates value: "GenericWorkload=true" - role: worker - - role: worker \ No newline at end of file + - role: worker diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 0000000..4ec6a49 --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,25 @@ +FROM fluxrm/flux-core:noble AS builder + +# base image for fluence to speed up build time +# docker build -t ghcr.io/converged-computing/fluence-base:latest . + +USER root +ENV LD_LIBRARY_PATH=/usr/lib:/usr/local/lib +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && apt-get install -y --no-install-recommends \ + libboost-graph-dev libboost-system-dev libboost-filesystem-dev \ + libboost-regex-dev libyaml-cpp-dev libedit-dev libczmq-dev \ + python3-yaml ninja-build cmake curl git wget ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# Go toolchain +RUN wget -q https://go.dev/dl/go1.26.0.linux-amd64.tar.gz \ + && tar -C /usr/local -xzf go1.26.0.linux-amd64.tar.gz && rm go1.26.0.linux-amd64.tar.gz +ENV PATH=$PATH:/usr/local/go/bin + +RUN wget https://github.com/flux-framework/flux-sched/releases/download/v0.52.0/flux-sched-0.52.0.tar.gz && \ + tar -xzvf flux-sched-0.52.0.tar.gz && mv flux-sched-0.52.0 /opt/flux-sched \ + && cd /opt/flux-sched && export WITH_GO=yes && ./configure --prefix=/usr \ + && mkdir build && cd build && cmake ../ && cd ../ && make -j"$(nproc)" && make install +ENV FLUX_SCHED_ROOT=/opt/flux-sched diff --git a/examples/podgroup.yaml b/examples/podgroup.yaml index 068e56c..7184a8f 100644 --- a/examples/podgroup.yaml +++ b/examples/podgroup.yaml @@ -33,4 +33,4 @@ spec: resources: requests: cpu: "4" - memory: 8Gi \ No newline at end of file + memory: 8Gi diff --git a/examples/single-podgroup.yaml b/examples/single-podgroup.yaml new file mode 100644 index 0000000..e44780b --- /dev/null +++ b/examples/single-podgroup.yaml @@ -0,0 +1,35 @@ +# Native gang scheduling (k8s >= 1.36, GangScheduling/GenericWorkload gates on). +# Fluence does placement; the PodGroup gives all-or-nothing semantics. Pods link +# to the PodGroup via the first-class field spec.schedulingGroup.podGroupName. +apiVersion: scheduling.k8s.io/v1alpha2 +kind: PodGroup +metadata: + name: training +spec: + schedulingPolicy: + gang: + minCount: 1 +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: training +spec: + replicas: 1 + selector: + matchLabels: {app: training} + template: + metadata: + labels: + app: training + spec: + schedulerName: fluence + schedulingGroup: + podGroupName: training + containers: + - name: worker + image: busybox + command: ["sleep", "3600"] + resources: + requests: + cpu: "1" diff --git a/examples/test/e2e/quantum-pod-mock-2.yaml b/examples/test/e2e/quantum-pod-mock-2.yaml new file mode 100644 index 0000000..5286fe0 --- /dev/null +++ b/examples/test/e2e/quantum-pod-mock-2.yaml @@ -0,0 +1,19 @@ +# A second qpu pod, identical request to sampler-mock. Used by the restart-recovery +# test: after fluence restarts and replays the first allocation, this one must NOT be +# given the same exclusive backend (it should stay Pending if the backend is taken). +apiVersion: v1 +kind: Pod +metadata: + name: sampler-mock-2 +spec: + schedulerName: fluence + restartPolicy: Never + containers: + - name: sampler + image: busybox + command: ["sh", "-c", "echo BACKEND=$QRMI_BACKEND; sleep 3600"] + resources: + requests: + fluxion.flux-framework.org/qpu: "1" + limits: + fluxion.flux-framework.org/qpu: "1" diff --git a/examples/test/e2e/quantum-pod-mock.yaml b/examples/test/e2e/quantum-pod-mock.yaml new file mode 100644 index 0000000..92849dd --- /dev/null +++ b/examples/test/e2e/quantum-pod-mock.yaml @@ -0,0 +1,21 @@ +# CI mock of examples/quantum-pod.yaml: requests the same fluxion.flux-framework.org/qpu +# resource so fluence matches a backend and the webhook injects QRMI_BACKEND, but runs a +# trivial container instead of qrmi-sampler -- no IBM secrets, no real submission. Proves +# placement + the backend handoff end to end. +apiVersion: v1 +kind: Pod +metadata: + name: sampler-mock +spec: + schedulerName: fluence + restartPolicy: Never + containers: + - name: sampler + image: busybox + # Print the injected backend, then idle so we can assert on it. + command: ["sh", "-c", "echo BACKEND=$QRMI_BACKEND; sleep 3600"] + resources: + requests: + fluxion.flux-framework.org/qpu: "1" + limits: + fluxion.flux-framework.org/qpu: "1" diff --git a/pkg/fluence/fluence.go b/pkg/fluence/fluence.go index 58d069d..13cdcb3 100644 --- a/pkg/fluence/fluence.go +++ b/pkg/fluence/fluence.go @@ -106,6 +106,7 @@ func New(ctx context.Context, _ runtime.Object, h fwk.Handle) (fwk.Plugin, error if err != nil { return nil, fmt.Errorf("build resource graph: %w", err) } + fmt.Printf("Fluence resource graph:\n%s", jgfBytes) // FluxionGraph.Init reads from a file path, so stage the generated graph. tmp, err := os.CreateTemp("", "fluence-graph-*.json") @@ -168,17 +169,21 @@ func (f *Fluence) PreFilter( return nil, fwk.AsStatus(err) } + fmt.Printf("Attempting to match:\n%s\n", specYAML) f.matcherMu.Lock() req, err := f.matcher.MatchAllocateSpec(specYAML) f.matcherMu.Unlock() if err != nil { + fmt.Printf("FAIL Match failed: %s\n", err) return nil, fwk.NewStatus(fwk.Unschedulable, fmt.Sprintf("fluxion match failed: %v", err)) } place, err := placement.PlacementFromAllocation(req.Allocation) if err != nil { + fmt.Printf("FAIL Placement failed: %s\n", err) return nil, fwk.AsStatus(err) } if len(place.Nodes) == 0 && place.Backend == "" { + fmt.Println("FAIL No nodes") return nil, fwk.NewStatus(fwk.Unschedulable, "fluxion returned no allocation") } // A quantum-only allocation has a Backend but no Nodes (a qpu vertex lives diff --git a/pkg/jgf/jgf.go b/pkg/jgf/jgf.go index 140a39f..ca9bd98 100644 --- a/pkg/jgf/jgf.go +++ b/pkg/jgf/jgf.go @@ -10,6 +10,7 @@ import ( "encoding/json" "fmt" "sort" + "strings" ) const containment = "containment" @@ -188,6 +189,10 @@ type Doc struct { func (b *Builder) Doc() Doc { d := Doc{} for _, v := range b.vertices { + // Skip the control plane + if strings.Contains(v.name, "control-plane") { + continue + } d.Graph.Nodes = append(d.Graph.Nodes, node{ID: v.key, Metadata: metadata{v: v}}) } for _, e := range b.edges { diff --git a/pkg/placement/placement.go b/pkg/placement/placement.go index 46c4222..0213dc4 100644 --- a/pkg/placement/placement.go +++ b/pkg/placement/placement.go @@ -65,27 +65,17 @@ func podResources(p *corev1.Pod) map[string]int { } } } - // A pod that requested no exotic (non-classical) resource still needs at - // least one core to land on. - if !hasExotic(counts) && counts["core"] == 0 { + + // Every pod runs on a node, so always request at least one core, even when the + // pod also asks for an exotic resource (qpu/qubit). Without this a qpu-only pod + // produces a slot with no compute and Fluxion allocates a bare backend with no + // node to land the pod on. + if counts["core"] == 0 { counts["core"] = 1 } return counts } -// hasExotic reports whether counts contains any non-classical type (i.e. one -// that came through the Fluxion prefix, like qpu/qubit). -func hasExotic(counts map[string]int) bool { - for t := range counts { - switch t { - case "core", "memory", "gpu": - default: - return true - } - } - return false -} - // JobspecForGroup builds a Fluxion jobspec for a whole pod group: a slot per pod // (count = group size), each holding the per-pod resources as `with` entries — // one per requested Fluxion type. A hybrid pod (e.g. cores + a qpu) produces a diff --git a/test/e2e/01-classical-gang.sh b/test/e2e/01-classical-gang.sh new file mode 100644 index 0000000..7c3e4b2 --- /dev/null +++ b/test/e2e/01-classical-gang.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash +# Classical gang scheduling: a PodGroup of 2 must be placed all-or-nothing on real nodes. +set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE}/lib.sh" + +log "TEST 1: classical gang scheduling" +kubectl apply -f examples/single-podgroup.yaml + +# All pods in the 'training' deployment must reach Running (scheduled + started). +log "waiting for both training pods to schedule" +kubectl wait --for=condition=Ready pod -l app=training || fail "training gang did not all become Ready (gang scheduling failed)" + +# Each pod must have a real node assigned by fluence. +for p in $(kubectl get pods -l app=training -o name); do + pod="${p#pod/}" + assert_scheduled "$pod" || fail "$pod has no nodeName" + sched="$(kubectl get pod "$pod" -o jsonpath='{.spec.schedulerName}')" + [ "$sched" = "fluence" ] || fail "$pod was not scheduled by fluence (got: $sched)" +done + +count="$(kubectl get pods -l app=training --no-headers | wc -l | tr -d ' ')" +[ "$count" = "1" ] || fail "expected 2 training pods, got $count" + +log "PASS: classical gang placed all $count pods via fluence" +kubectl delete -f examples/single-podgroup.yaml --wait=false || true +kubectl patch podgroup training --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true diff --git a/test/e2e/02-quantum-placement.sh b/test/e2e/02-quantum-placement.sh new file mode 100644 index 0000000..17897a3 --- /dev/null +++ b/test/e2e/02-quantum-placement.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +# Quantum placement: a qpu pod is matched to a backend and the webhook injects QRMI_BACKEND. +set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE}/lib.sh" +ANN="fluence.flux-framework.org/backend" + +log "TEST 2: quantum placement and backend handoff" +kubectl apply -f examples/test/e2e/quantum-pod-mock.yaml + +wait_pod_phase sampler-mock Running 120 || fail "sampler-mock did not reach Running" + +# fluence must have stamped the chosen backend annotation. +backend="$(kubectl get pod sampler-mock -o jsonpath="{.metadata.annotations.${ANN//./\\.}}" 2>/dev/null || true)" +[ -n "$backend" ] || (show_webhook sampler-mock && fail "backend annotation ($ANN) was not set by fluence") +log "fluence chose backend: $backend" + +# The webhook must have surfaced it as QRMI_BACKEND inside the container. +out="$(kubectl logs sampler-mock || true)" +echo "$out" | grep -q "BACKEND=${backend}" \ + || (show_webhook sampler-mock && fail "QRMI_BACKEND in container ('$out') does not match annotation ($backend)") + +log "PASS: qpu pod scheduled, backend '$backend' chosen and injected as QRMI_BACKEND" +kubectl delete -f examples/test/e2e/quantum-pod-mock.yaml --wait=false || true diff --git a/test/e2e/03-restart-recovery.sh b/test/e2e/03-restart-recovery.sh new file mode 100644 index 0000000..20c1be9 --- /dev/null +++ b/test/e2e/03-restart-recovery.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +# Restart recovery: after the scheduler restarts, it must replay the existing allocation +# (via reapi update_allocate) and NOT double-book an exclusive qpu backend. +set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE}/lib.sh" +NS=default +ANN="fluence.flux-framework.org/backend" + +log "TEST 3: restart does not double-book an exclusive backend" + +# 1. Schedule the first qpu pod and capture its backend. +kubectl apply -f examples/test/e2e/quantum-pod-mock.yaml +wait_pod_phase sampler-mock "$NS" Running 120 || fail "sampler-mock did not reach Running" +backend="$(kubectl get pod sampler-mock -n "$NS" -o jsonpath="{.metadata.annotations.${ANN//./\\.}}" 2>/dev/null || true)" +[ -n "$backend" ] || fail "first pod has no backend annotation" +log "first pod holds backend: $backend" + +# 2. Restart the scheduler. Its in-memory Fluxion graph is rebuilt empty; recovery must +# replay the persisted allocation so the backend stays occupied. +log "restarting fluence scheduler" +kubectl rollout restart -n "${NS_KUBE}" deployment/fluence +wait_fluence_ready + +# 3. The original pod must still be Running and still hold the same backend. +wait_pod_phase sampler-mock "$NS" Running 30 || fail "first pod not Running after restart" + +# 4. A second pod requesting the same exclusive qpu must NOT get the same backend. +# If recovery worked, the backend is occupied and the second pod stays Pending. +kubectl apply -f examples/test/e2e/quantum-pod-mock-2.yaml +if assert_stays_pending sampler-mock-2 "$NS" 45; then + log "PASS: second qpu pod stayed Pending; backend '$backend' was not double-booked" +else + backend2="$(kubectl get pod sampler-mock-2 -n "$NS" -o jsonpath="{.metadata.annotations.${ANN//./\\.}}" 2>/dev/null || true)" + if [ "$backend2" = "$backend" ]; then + fail "DOUBLE-BOOK: second pod got the same exclusive backend '$backend' after restart (recovery did not replay)" + else + log "NOTE: second pod scheduled to a DIFFERENT backend '$backend2' (ok only if >1 backend configured)" + fi +fi + +kubectl delete -f examples/test/e2e/quantum-pod-mock-2.yaml --wait=false || true +kubectl delete -f examples/test/e2e/quantum-pod-mock.yaml --wait=false || true diff --git a/test/e2e/lib.sh b/test/e2e/lib.sh new file mode 100644 index 0000000..341634a --- /dev/null +++ b/test/e2e/lib.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# Shared helpers for fluence e2e tests. Sourced by each scenario script. +set -euo pipefail + +NS_KUBE="${NS_KUBE:-kube-system}" +TIMEOUT="${TIMEOUT:-180s}" + +log() { echo "=== $*"; } +fail() { echo "FAIL: $*" >&2; dump; exit 1; } + +# Dump cluster state to help debug a CI failure. +dump() { + echo "----- pods (all namespaces) -----" + kubectl get pods -A -o wide || true + echo "----- events -----" + kubectl get events -A --sort-by=.lastTimestamp | tail -40 || true + echo "----- fluence scheduler logs -----" + kubectl logs -n "${NS_KUBE}" deployment/fluence --tail=120 || true +} + +# Wait until the fluence scheduler deployment is Available. +wait_fluence_ready() { + log "waiting for fluence scheduler to be ready" + kubectl rollout status -n "${NS_KUBE}" deployment/fluence --timeout="${TIMEOUT}" \ + || fail "fluence deployment did not become ready" +} + +show_webhook() { + pod=$1 + echo "FAIL: QRMI_BACKEND mismatch" + kubectl get pod $pod -o jsonpath='{.spec.containers[0].env}'; echo + kubectl get pod $pod -o jsonpath='{.metadata.annotations}'; echo + kubectl -n kube-system logs deploy/fluence-webhook --tail=50 +} + +# wait_pod_phase [timeout] +wait_pod_phase() { + local pod="$1" want="$2" t="${3:-120}" + echo "pod: $pod" + echo "want: $want" + echo "t: $t" + local i=0 + while [ "$i" -lt "$t" ]; do + local got + got="$(kubectl get pod "$pod" -o jsonpath='{.status.phase}' 2>/dev/null || true)" + [ "$got" = "$want" ] && return 0 + sleep 1; i=$((i+1)) + done + return 1 +} + +# assert_scheduled : pod has a node assigned. +assert_scheduled() { + local pod="$1" node + node="$(kubectl get pod "$pod" -o jsonpath='{.spec.nodeName}' 2>/dev/null || true)" + [ -n "$node" ] || return 1 + echo " $pod scheduled on $node" +} + +# Assert a pod stays Pending for the whole window (used for the "must not double-book" check). +assert_stays_pending() { + local pod="$1" t="${2:-30}" i=0 + while [ "$i" -lt "$t" ]; do + local node + node="$(kubectl get pod "$pod" -o jsonpath='{.spec.nodeName}' 2>/dev/null || true)" + [ -n "$node" ] && return 1 # got scheduled -> failure for this assertion + sleep 1; i=$((i+1)) + done + return 0 +}