Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .dockerignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# Paths excluded from the Docker build context.
README.md
bin/
.git/
*.tar.gz
109 changes: 109 additions & 0 deletions .github/workflows/e2e-tests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
name: e2e

# Run for pull requests targeting main and for pushes to main.
on:
  pull_request:
    branches: [main]
  push:
    branches: [main]

# Least privilege: the job only needs to read the repository contents.
permissions:
  contents: read

# One active run per workflow + ref; a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

env:
  # KinD CLI version.
  KIND_VERSION: v0.32.0
  # Scheduler image tag; only referenced by the (commented-out) build step.
  IMAGE: ghcr.io/converged-computing/fluence:latest

jobs:
  # Single end-to-end job; its steps run sequentially on one hosted runner.
  e2e:
    runs-on: ubuntu-latest
    steps:
      # Fetch the repository so the deploy/ manifests and test/e2e scripts exist.
      - name: Checkout
        uses: actions/checkout@v4

#- name: Set up Docker Buildx
# uses: docker/setup-buildx-action@v3

# - name: Build fluence image
# uses: docker/build-push-action@v6
# with:
# context: .
# file: ./Dockerfile
# push: false
# load: true
# tags: ${{ env.IMAGE }}
# cache-from: type=gha
# cache-to: type=gha,mode=max
# Create the KinD cluster the e2e tests run against.
- name: Create k8s Kind Cluster
  uses: helm/kind-action@v1.10.0
  with:
    # Single source of truth: the workflow-level KIND_VERSION env var, so the
    # pinned KinD CLI version cannot drift from the value declared under `env`.
    version: ${{ env.KIND_VERSION }}
    node_image: kindest/node:v1.36.1
    config: ./deploy/kind-config.yaml

# Reclaim runner disk by deleting preinstalled toolchains, reporting usage
# before and after.
- name: Free Disk Space (Ubuntu)
  run: |
    echo "=== Disk space before cleanup ==="
    df -h

    # Remove large software runtimes and tools
    for unused_dir in \
      /usr/share/dotnet \
      /usr/local/lib/android \
      /opt/ghc \
      /opt/hostedtoolcache/CodeQL
    do
      sudo rm -rf "${unused_dir}"
    done

    # Clean package caches
    sudo apt-get clean

    echo "=== Disk space after cleanup ==="
    df -h

# Apply the base test manifest, wait for the scheduler rollout, then print
# debugging output: /tmp listing, scheduler logs, the graph JSON and each node's
# allocatable cpu/memory.
- name: Deploy fluence (base)
  run: |
    kubectl apply -f deploy/fluence-test.yaml
    kubectl rollout status -n kube-system deployment/fluence --timeout=180s

    # First pod labelled app=fluence; `-o name` yields "pod/<name>", so strip the prefix.
    scheduler_pod=$(kubectl -n kube-system get pods -l app=fluence -o name | head -1)
    scheduler_pod="${scheduler_pod#pod/}"

    kubectl -n kube-system exec "${scheduler_pod}" -- ls /tmp/
    kubectl -n kube-system logs "${scheduler_pod}"
    kubectl -n kube-system exec "${scheduler_pod}" -- /bin/bash -c "cat /tmp/fluence-graph-*.json"
    kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{": cpu="}{.status.allocatable.cpu}{" mem="}{.status.allocatable.memory}{"\n"}{end}'

# Run the first e2e script with bash; a non-zero exit fails the job.
# NOTE(review): the script body is outside this file; judging by its name it covers classical gang scheduling. Confirm.
- name: E2E - classical gang
run: bash test/e2e/01-classical-gang.sh

# Apply the resources add-on, restart the scheduler so it picks the add-on up,
# and require that a node advertises the qpu extended resource before going on.
- name: Deploy quantum add-on
  run: |
    # Includes the device plugin and oriented to testing container
    kubectl apply -f deploy/fluence-resources-test.yaml
    kubectl rollout restart -n kube-system deployment/fluence
    kubectl rollout status -n kube-system deployment/fluence --timeout=60s

    # Poll (about 60s) until some node's allocatable lists the qpu resource.
    # The node query runs once per iteration; `|| true` turns a transient
    # kubectl failure into a retry, because the default shell runs with `bash -e`.
    qpu_advertised=false
    for i in $(seq 1 60); do
      allocatable=$(kubectl get nodes -o jsonpath='{range .items[*]}{.status.allocatable}{"\n"}{end}' || true)
      printf '%s\n' "${allocatable}"
      if printf '%s\n' "${allocatable}" | grep -q 'fluxion.flux-framework.org/qpu'; then
        qpu_advertised=true
        break
      fi
      sleep 1
    done

    # Fail here, with a clear message, rather than later inside the e2e script.
    if [ "${qpu_advertised}" != "true" ]; then
      echo "ERROR: no node advertised fluxion.flux-framework.org/qpu within 60s" >&2
      exit 1
    fi

    POD=$(kubectl -n kube-system get pods -l app=fluence -o name | head -1)
    kubectl -n kube-system exec "${POD#pod/}" -- /bin/bash -c "cat /tmp/fluence-graph-*.json"

# Block until the webhook deployment is rolled out and its
# MutatingWebhookConfiguration carries a CA bundle; fail if that never happens.
- name: Wait for webhook
  run: |
    # wait for the deployment AND for the caBundle to be populated on the webhook config
    kubectl -n kube-system rollout status deployment/fluence-webhook --timeout=120s

    cab=""
    for i in $(seq 1 30); do
      # `|| true`: under the default `bash -e` shell a failed lookup inside the
      # assignment would abort the step; treat it as "not ready yet" and retry.
      cab=$(kubectl get mutatingwebhookconfiguration fluence-webhook \
        -o jsonpath='{.webhooks[0].clientConfig.caBundle}' 2>/dev/null || true)
      [ -n "$cab" ] && break
      sleep 2
    done

    # 30 attempts x 2s elapsed without a CA bundle: stop instead of running the
    # next test against a webhook that cannot serve TLS.
    if [ -z "$cab" ]; then
      echo "ERROR: caBundle not populated on mutatingwebhookconfiguration/fluence-webhook within 60s" >&2
      exit 1
    fi

    # let TLS serving settle after caBundle patch
    sleep 3

# Run the second e2e script with bash; a non-zero exit fails the job.
# NOTE(review): the script body is outside this file; judging by its name it covers placement onto quantum resources. Confirm.
- name: E2E - quantum placement
run: bash test/e2e/02-quantum-placement.sh

#- name: E2E - restart recovery (no double-book)
# run: bash test/e2e/03-restart-recovery.sh

# Runs only when an earlier step failed. Each command is best-effort
# (`|| true`) so one failing kubectl call does not hide the remaining output.
- name: Dump diagnostics on failure
  if: failure()
  run: |
    kubectl get pods -A -o wide || true
    kubectl logs -n kube-system deployment/fluence || true
    # The webhook and device plugin are deployed by earlier steps; include their
    # logs too (harmless if they were never created).
    kubectl logs -n kube-system deployment/fluence-webhook || true
    kubectl logs -n kube-system daemonset/fluence-deviceplugin || true
30 changes: 1 addition & 29 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,33 +1,5 @@
# Mr. Fluence!
# Multi-stage build for the fluence scheduler.
# The scheduler binary cgo-links flux-sched (Fluxion) for resource matching.
FROM ghcr.io/converged-computing/fluence-base:latest AS builder

FROM fluxrm/flux-core:noble AS builder

USER root
ENV LD_LIBRARY_PATH=/usr/lib:/usr/local/lib
ENV DEBIAN_FRONTEND=noninteractive

RUN apt-get update && apt-get install -y --no-install-recommends \
libboost-graph-dev libboost-system-dev libboost-filesystem-dev \
libboost-regex-dev libyaml-cpp-dev libedit-dev libczmq-dev \
python3-yaml ninja-build cmake curl git wget ca-certificates \
&& rm -rf /var/lib/apt/lists/*

# Go toolchain
RUN wget -q https://go.dev/dl/go1.26.0.linux-amd64.tar.gz \
&& tar -C /usr/local -xzf go1.26.0.linux-amd64.tar.gz && rm go1.26.0.linux-amd64.tar.gz
ENV PATH=$PATH:/usr/local/go/bin

# flux-sched (Fluxion) with the Go reapi bindings -> /usr; build tree at /opt/flux-sched
#RUN git clone https://github.com/flux-framework/flux-sched /opt/flux-sched \
RUN git clone -b implement-reapi-cli-update-allocate https://github.com/vsoch/flux-sched /opt/flux-sched \
&& export FLUX_SCHED_VERSION=0.53.0 \
&& cd /opt/flux-sched && export WITH_GO=yes && ./configure --prefix=/usr \
&& mkdir build && cd build && cmake ../ && cd ../ && make -j"$(nproc)" && make install
ENV FLUX_SCHED_ROOT=/opt/flux-sched

# Build the scheduler
WORKDIR /src
COPY go.mod go.sum* ./
RUN go mod download || true
Expand Down
16 changes: 15 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,25 @@ test-restore:
# Build the image from the repository root and tag it $(IMG); nothing is pushed.
image: ## Build the scheduler container image
docker build -t $(IMG) .

# Build the image tagged $(IMG)-test and push it to the registry.
.PHONY: test-image
test-image: ## Build and push the scheduler test image
	docker build -t $(IMG)-test .
	docker push $(IMG)-test

# Rebuild and push the test image, then tear down the current test deployment.
# The podgroup patch is best-effort (|| true). --ignore-not-found keeps the
# recipe going on a cluster where deploy/fluence-test.yaml was never applied.
# NOTE(review): despite its name this target only deletes resources and never
# re-applies the manifest; confirm whether a final `kubectl apply` is intended.
.PHONY: test-image-deploy
test-image-deploy: test-image ## Push the test image and tear down the test deployment
	kubectl patch podgroup training -n default --type=merge -p '{"metadata":{"finalizers":null}}' || true
	kubectl delete deployments --all
	kubectl delete pods --all
	kubectl delete -f deploy/fluence-test.yaml --ignore-not-found
	kubectl delete pods --all


# Apply deploy/fluence.yaml to whichever cluster kubectl currently targets.
.PHONY: deploy
deploy: ## Install RBAC + scheduler into kube-system
kubectl apply -f deploy/fluence.yaml

# Print every target that carries a "## description" suffix, one per line.
# The awk formatter appears exactly once, as the end of the grep pipeline; a
# second copy would run as its own command and block waiting on stdin.
.PHONY: help
help:
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | \
	awk 'BEGIN{FS=":.*?## "}{printf " %-14s %s\n", $$1, $$2}'
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

![img/fluence.png](img/fluence.png)

🚧 **UNDER DEVELOPMENT** 🚧 Not ready for production use! I rolled back features since the recorded demo and am adding them back with proper testing. This work is not finished yet, but I anticipate completing it later in the week of 6/16/2026. Thank you for your patience! -@vsoch

A Kubernetes scheduler plugin that places **pod groups** (and individual pods)
by matching them against a [Fluxion](https://github.com/flux-framework/flux-sched)
(flux-sched) resource graph built from the live cluster.
Expand Down
76 changes: 76 additions & 0 deletions deploy/fluence-resources-test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Resources add-on for fluence. Turns on fluence-managed resources by supplying
# (1) the resources config and (2) the device plugin that advertises them.
# Quantum backends are just the example payload here; any resource type fluence
# can model goes in the same ConfigMap. This is the test variant; apply it AFTER
# deploy/fluence-test.yaml:
#
# kubectl apply -f deploy/fluence-test.yaml # base scheduler (no devices)
# kubectl apply -f deploy/fluence-resources-test.yaml # + resources config + device plugin
# kubectl rollout restart deployment/fluence -n kube-system # scheduler re-reads resources
#
# The base scheduler already mounts the `fluence-resources` ConfigMap optionally
# and reads FLUENCE_RESOURCES, so this add-on is purely additive — no edits to
# the base Deployment.

# Resources config: the one authoritative list of resource types that fluence
# injects and advertises. The scheduler turns it into qpu/qubit graph vertices,
# and the device plugin derives the extended resources to advertise from this
# same document, so the two cannot drift apart.
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: kube-system
  name: fluence-resources
data:
  # Mounted into the device plugin at /etc/fluence/resources.yaml.
  resources.yaml: |
    backends:
      - name: ibm_fez
        num_qubits: 156
        vendor: ibm
        qrmi_type: qiskit-runtime-service
      - name: ibm_marrakesh
        num_qubits: 156
        vendor: ibm
        qrmi_type: qiskit-runtime-service
---
# Device plugin DaemonSet: advertises the exotic Fluxion resource types (derived
# from the fluence-resources ConfigMap) on every node, so pods can request them
# via resources and NodeResourcesFit is satisfied.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: fluence-deviceplugin
  namespace: kube-system
  labels:
    app: fluence-deviceplugin
spec:
  selector:
    matchLabels:
      app: fluence-deviceplugin
  template:
    metadata:
      labels:
        app: fluence-deviceplugin
    spec:
      priorityClassName: system-node-critical
      tolerations:
        # run on every node, including tainted/control-plane
        - operator: Exists
      containers:
        - name: deviceplugin
          image: ghcr.io/converged-computing/fluence:test
          command:
            - /bin/fluence-deviceplugin
          env:
            # Path of the resources document, supplied by the `resources` volume.
            - name: FLUENCE_RESOURCES
              value: /etc/fluence/resources.yaml
            # Quoted: env values must be strings.
            - name: FLUENCE_RESOURCE_CAPACITY
              value: "1000"
          securityContext:
            privileged: true
          volumeMounts:
            # The host's kubelet device-plugin directory.
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
            - name: resources
              mountPath: /etc/fluence
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
        - name: resources
          configMap:
            name: fluence-resources
Loading
Loading