Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
236 changes: 236 additions & 0 deletions .github/workflows/mshv-infra.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
# Reusable workflow (workflow_call only): provisions an Azure VM for MSHV
# testing and hands its private IP address back to the calling workflow.
name: MSHV Infra Setup
on:
  workflow_call:
    inputs:
      ARCH:
        description: 'Architecture for the VM'
        required: true
        type: string
      KEY:
        description: 'SSH Key Name'
        required: true
        type: string
      OS_DISK_SIZE:
        description: 'OS Disk Size in GB'
        required: true
        type: string
      RG:
        description: 'Resource Group Name'
        required: true
        type: string
      VM_SKU:
        description: 'VM SKU'
        required: true
        type: string
    secrets:
      MI_CLIENT_ID:
        required: true
      RUNNER_RG:
        required: true
      STORAGE_ACCOUNT_PATHS:
        required: true
      # NOTE(review): no step visible in this workflow reads ARCH_SOURCE_PATH;
      # it is kept because callers already pass it - confirm it is still needed.
      ARCH_SOURCE_PATH:
        required: true
      USERNAME:
        required: true
    outputs:
      PRIVATE_IP:
        description: 'Private IP of the VM'
        value: ${{ jobs.infra-setup.outputs.PRIVATE_IP }}
# Key the group on the pull request number when there is one. For
# pull_request_target events github.ref is the PR's *base* branch, so a group
# built from github.ref alone is shared by every open PR and, with
# cancel-in-progress, a new run for one PR would cancel another PR's run.
# Events without a pull request (e.g. merge_group) fall back to github.ref.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true
# Single job: provisions the VM on a runner labelled "mshv".
jobs:
infra-setup:
# The job name shows the architecture passed in by the caller.
name: ${{ inputs.ARCH }} VM Provision
runs-on: mshv
# PRIVATE_IP is produced by the step with id "get-vm-ip".
outputs:
PRIVATE_IP: ${{ steps.get-vm-ip.outputs.PRIVATE_IP }}
steps:
- name: Install & login to AZ CLI
  env:
    MI_CLIENT_ID: ${{ secrets.MI_CLIENT_ID }}
  run: |
    set -e
    # Step 1: make sure the `az` binary is on PATH, installing it if needed.
    echo "Installing Azure CLI if not already installed"
    if command -v az >/dev/null 2>&1; then
      echo "Azure CLI already installed"
    else
      curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
    fi
    az --version
    # Step 2: authenticate as the managed identity selected by MI_CLIENT_ID.
    echo "Logging into Azure CLI using Managed Identity"
    az login --identity --client-id ${MI_CLIENT_ID}

# Picks the first candidate region that has enough free vCPU quota for the
# requested SKU and publishes it as the step output `location`.
- name: Get Location
  id: get-location
  env:
    SKU: ${{ inputs.VM_SKU }}
    STORAGE_ACCOUNT_PATHS: ${{ secrets.STORAGE_ACCOUNT_PATHS }}
  run: |
    set -e
    # Extract vCPU count from SKU (e.g., "Standard_D2s_v3" => 2)
    vcpu=$(echo "$SKU" | sed -n 's/^Standard_[A-Za-z]\+\([0-9]\+\).*/\1/p')
    if [[ -z "$vcpu" ]]; then
      echo "Cannot extract vCPU count from SKU: $SKU"
      exit 1
    fi

    # Candidate regions are the top-level keys of the STORAGE_ACCOUNT_PATHS
    # JSON object.
    SUPPORTED_LOCATIONS=$(echo "$STORAGE_ACCOUNT_PATHS" | jq -r 'to_entries[] | .key')

    for location in $SUPPORTED_LOCATIONS; do
      # "|| true": under `set -e` a failed lookup in one region would abort
      # the whole search; treat it as "unknown" and try the next region.
      family=$(az vm list-skus --size "$SKU" --location "$location" --resource-type "virtualMachines" --query '[0].family' -o tsv || true)
      if [[ -z "$family" ]]; then
        echo "Cannot determine VM family for SKU: $SKU in $location"
        continue
      fi

      usage=$(az vm list-usage --location "$location" --query "[?name.value=='$family'] | [0]" -o json || true)
      # "// empty" turns a missing/null field into an empty string instead of
      # the literal text "null".
      current=$(echo "$usage" | jq -r '.currentValue // empty')
      limit=$(echo "$usage" | jq -r '.limit // empty')

      # Without a matching usage entry both values are empty and the
      # arithmetic below would be a syntax error; skip such regions.
      if ! [[ "$current" =~ ^[0-9]+$ && "$limit" =~ ^[0-9]+$ ]]; then
        echo "Cannot determine vCPU quota for family $family in $location"
        continue
      fi

      if [[ $((limit - current)) -ge $vcpu ]]; then
        echo "Sufficient quota found in $location"
        echo "location=$location" >> "$GITHUB_OUTPUT"
        exit 0
      fi
    done

    echo "No location found with sufficient vCPU quota for SKU: $SKU"
    exit 1

# Creates the per-run resource group in the region chosen by "get-location".
- name: Create Resource Group
  id: rg-setup
  env:
    # Only the values the script actually reads are exported; the
    # STORAGE_ACCOUNT_PATHS secret was passed here but never used.
    LOCATION: ${{ steps.get-location.outputs.location }}
    RG: ${{ inputs.RG }}
  run: |
    set -e
    echo "Creating Resource Group: $RG"
    # Create the resource group
    echo "Creating resource group in location: ${LOCATION}"
    # Quoted so an empty or unusual value fails inside az with a clear
    # message instead of shifting the argument list.
    az group create --name "${RG}" --location "${LOCATION}"
    echo "Resource group created successfully."

# Generates a passphrase-less RSA key pair at ~/.ssh/<KEY> on the runner.
- name: Generate SSH Key
  id: generate-ssh-key
  env:
    KEY: ${{ inputs.KEY }}
  run: |
    set -e
    echo "Generating SSH key: $KEY"
    mkdir -p ~/.ssh
    # ssh-keygen asks "Overwrite (y/n)?" when the target file exists and then
    # fails because the step has no interactive stdin. A leftover key with the
    # same name is possible when the workflow is re-run (the caller derives
    # KEY from github.run_id, which does not change on re-runs), so remove any
    # stale pair first.
    rm -f ~/.ssh/"${KEY}" ~/.ssh/"${KEY}.pub"
    ssh-keygen -t rsa -b 4096 -f ~/.ssh/"${KEY}" -N ""

# Creates the test VM in RG, attached to a subnet and built from an image that
# both live in the runner's resource group (RUNNER_RG).
- name: Create VM
  id: vm-setup
  env:
    KEY: ${{ inputs.KEY }}
    LOCATION: ${{ steps.get-location.outputs.location }}
    OS_DISK_SIZE: ${{ inputs.OS_DISK_SIZE }}
    RG: ${{ inputs.RG }}
    RUNNER_RG: ${{ secrets.RUNNER_RG }}
    USERNAME: ${{ secrets.USERNAME }}
    VM_SKU: ${{ inputs.VM_SKU }}
    VM_IMAGE_NAME: ${{ inputs.ARCH }}_${{ steps.get-location.outputs.location }}_image
    VM_NAME: ${{ inputs.ARCH }}_${{ steps.get-location.outputs.location }}_${{ github.run_id }}
  run: |
    set -e
    echo "Creating $VM_SKU VM: $VM_NAME"

    # Extract subnet ID from the runner VM: first subnet of the first VNet in
    # RUNNER_RG whose location contains LOCATION.
    echo "Retrieving subnet ID..."
    # "// empty": `jq -r` prints the literal string "null" for a missing
    # value, which the -z test below would accept as a valid ID.
    SUBNET_ID=$(az network vnet list --resource-group "${RUNNER_RG}" --query "[?contains(location, '${LOCATION}')].{SUBNETS:subnets}" | jq -r '.[0].SUBNETS[0].id // empty')
    if [[ -z "${SUBNET_ID}" ]]; then
      echo "ERROR: Failed to retrieve Subnet ID."
      exit 1
    fi

    # Extract image ID from the runner VM
    echo "Retrieving image ID..."
    IMAGE_ID=$(az image show --resource-group "${RUNNER_RG}" --name "${VM_IMAGE_NAME}" --query "id" -o tsv)
    if [[ -z "${IMAGE_ID}" ]]; then
      echo "ERROR: Failed to retrieve Image ID."
      exit 1
    fi

    # Create VM. --public-ip-address "" requests no public IP (per the
    # `az vm create` reference); later steps reach the VM via its private IP.
    az vm create \
      --resource-group "${RG}" \
      --name "${VM_NAME}" \
      --subnet "${SUBNET_ID}" \
      --size "${VM_SKU}" \
      --location "${LOCATION}" \
      --image "${IMAGE_ID}" \
      --os-disk-size-gb "${OS_DISK_SIZE}" \
      --public-ip-sku Standard \
      --storage-sku Premium_LRS \
      --public-ip-address "" \
      --admin-username "${USERNAME}" \
      --ssh-key-value ~/.ssh/"${KEY}.pub" \
      --security-type Standard \
      --output json

    echo "VM creation process completed successfully."

# Looks up the new VM's private address and exposes it as the step output
# PRIVATE_IP.
- name: Get VM Private IP
  id: get-vm-ip
  env:
    RG: ${{ inputs.RG }}
    VM_NAME: ${{ inputs.ARCH }}_${{ steps.get-location.outputs.location }}_${{ github.run_id }}
  run: |
    set -e
    echo "Retrieving VM Private IP address..."
    # Query the VM with --show-details and keep only the privateIps field.
    vm_private_ip=$(az vm show --resource-group ${RG} --name ${VM_NAME} --show-details --query privateIps --output tsv)
    # Guard clause: an empty answer means the lookup did not yield an address.
    [[ -n "$vm_private_ip" ]] || {
      echo "ERROR: Failed to retrieve private IP address."
      exit 1
    }
    echo "PRIVATE_IP=$vm_private_ip" >> $GITHUB_OUTPUT

# Blocks until an SSH login to the VM succeeds, giving up after 120 seconds.
- name: Wait for SSH availability
  env:
    KEY: ${{ inputs.KEY }}
    PRIVATE_IP: ${{ steps.get-vm-ip.outputs.PRIVATE_IP }}
    USERNAME: ${{ secrets.USERNAME }}
  run: |
    echo "Waiting for SSH to be accessible..."
    # The inner script is single-quoted, so KEY/USERNAME/PRIVATE_IP are
    # expanded by the child bash from the step environment. It retries every
    # 5 seconds; `timeout` ends the whole loop once 120 seconds have passed.
    timeout 120 bash -c '
      while ! ssh -o StrictHostKeyChecking=no -i ~/.ssh/${KEY} ${USERNAME}@${PRIVATE_IP} "exit" 2>/dev/null; do
        sleep 5
      done
    '
    echo "VM is accessible!"

# Drops the known_hosts entries recorded for the VM's private address.
# NOTE(review): presumably needed because private IPs are reused across runs
# on the same runner - confirm.
- name: Remove Old Host Key
  env:
    PRIVATE_IP: ${{ steps.get-vm-ip.outputs.PRIVATE_IP }}
  run: |
    set -e
    echo "Removing the old host key"
    ssh-keygen -R ${PRIVATE_IP}

# Installs the build toolchain, Rust and Docker on the freshly created VM by
# piping a script to it over SSH.
- name: SSH into VM and Install Dependencies
  env:
    KEY: ${{ inputs.KEY }}
    PRIVATE_IP: ${{ steps.get-vm-ip.outputs.PRIVATE_IP }}
    USERNAME: ${{ secrets.USERNAME }}
  run: |
    set -e
    # The heredoc delimiter is unquoted: ${USERNAME} is filled in here on the
    # runner, while the backslash-escaped \$HOME and \$PATH reach the VM
    # unexpanded and are resolved there.
    ssh -o StrictHostKeyChecking=no -i ~/.ssh/${KEY} ${USERNAME}@${PRIVATE_IP} <<EOF
    set -e
    echo "Logged in successfully."
    echo "Installing dependencies..."
    sudo tdnf install -y git moby-engine moby-cli clang llvm pkg-config make gcc glibc-devel
    echo "Installing Rust..."
    curl -sSf https://sh.rustup.rs | sh -s -- --default-toolchain stable --profile default -y
    export PATH="\$HOME/.cargo/bin:\$PATH"
    cargo --version
    sudo mkdir -p /etc/docker/
    echo '{"default-ulimits":{"nofile":{"Hard":65535,"Name":"nofile","Soft":65535}}}' | sudo tee /etc/docker/daemon.json
    sudo systemctl stop docker
    sudo systemctl enable docker.service
    sudo systemctl enable containerd.service
    sudo systemctl start docker
    sudo groupadd -f docker
    sudo usermod -a -G docker ${USERNAME}
    sudo systemctl restart docker
    EOF
108 changes: 108 additions & 0 deletions .github/workflows/mshv-integration.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Runs the MSHV integration tests for pull requests and merge-queue entries.
# NOTE(review): pull_request_target runs the base repository's workflow with
# access to secrets. Per the PR review discussion this is acceptable only as
# long as pull-request code is executed solely on the separately created
# Azure VM and never on the runner itself - keep it that way when editing.
name: MSHV Integration Tests
on: [pull_request_target, merge_group]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not recommended to use this - from the docs:

This event allows your workflow to do things like label or comment on pull requests from forks. Avoid using this event if you need to build or run code from the pull request

We should run only on merge_group, since by then the code has already been reviewed and we can be sure it isn't extracting secrets.

Copy link
Contributor Author

@gamora12 gamora12 Oct 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With pull_request_target, the CI will only run the base repository code. Even if someone tries to introduce code change that accesses secrets, the code won't be run until it's merged. The runner vm has access to the secrets but it only runs the workflow code (not the PR code), the cloud-hypervisor code will be run on a separate azure vm (which doesn't have access to secrets & can't label or comment on PR), so we're actually safe.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah - yes! Because dev_cli.sh is run inside a separately created VM. I'm happy if you're happy with that.


jobs:
  # Provisions the x86_64 test VM through the reusable infra workflow.
  # KEY and RG embed github.run_id so every workflow run gets its own SSH key
  # and resource group.
  infra-setup:
    name: MSHV Infra Setup (x86_64)
    uses: ./.github/workflows/mshv-infra.yaml
    with:
      ARCH: x86_64
      KEY: azure_key_${{ github.run_id }}
      # Quoted: the called workflow declares OS_DISK_SIZE as `type: string`,
      # and a bare 512 is a YAML integer.
      OS_DISK_SIZE: "512"
      RG: MSHV-INTEGRATION-${{ github.run_id }}
      VM_SKU: Standard_D16s_v5
    secrets:
      MI_CLIENT_ID: ${{ secrets.MSHV_MI_CLIENT_ID }}
      RUNNER_RG: ${{ secrets.MSHV_RUNNER_RG }}
      STORAGE_ACCOUNT_PATHS: ${{ secrets.MSHV_STORAGE_ACCOUNT_PATHS }}
      ARCH_SOURCE_PATH: ${{ secrets.MSHV_X86_SOURCE_PATH }}
      USERNAME: ${{ secrets.MSHV_USERNAME }}

# Runs the integration test suite on the VM created by infra-setup. The job
# only runs when provisioning succeeded, and continue-on-error keeps a test
# failure from failing the workflow run.
run-tests:
  name: Integration Tests
  needs: infra-setup
  if: ${{ always() && needs.infra-setup.result == 'success' }}
  runs-on: mshv
  continue-on-error: true
  steps:
    - name: Run integration tests
      env:
        # Workflow context values are passed through the environment instead
        # of being pasted into the script text with ${{ }}.
        EVENT_NAME: ${{ github.event_name }}
        KEY: azure_key_${{ github.run_id }}
        PR_NUMBER: ${{ github.event.pull_request.number }}
        REF_NAME: ${{ github.ref_name }}
        REPO_URL: https://github.com/cloud-hypervisor/cloud-hypervisor.git
        REPO_DIR: cloud-hypervisor
        PRIVATE_IP: ${{ needs.infra-setup.outputs.PRIVATE_IP }}
        # Same resource group name that infra-setup creates and cleanup deletes.
        RG: MSHV-INTEGRATION-${{ github.run_id }}
        USERNAME: ${{ secrets.MSHV_USERNAME }}
      run: |
        set -e
        echo "Connecting to the VM via SSH..."
        # The heredoc delimiter is unquoted, so the runner's shell expands
        # every unescaped $VAR before the script is sent to the VM. Values
        # that come from this step's env (REPO_URL, PR_NUMBER, ...) rely on
        # that; anything that must be evaluated on the VM (loop variables,
        # the remote \$USER, \$HOME, \$PATH) has to be backslash-escaped.
        ssh -i ~/.ssh/${KEY} -o StrictHostKeyChecking=no ${USERNAME}@${PRIVATE_IP} << EOF
        set -e
        echo "Logged in successfully."
        export PATH="\$HOME/.cargo/bin:\$PATH"

        if [[ "${EVENT_NAME}" == "pull_request_target" ]]; then
          git clone --depth 1 "$REPO_URL" "$REPO_DIR"
          cd "$REPO_DIR"
          git fetch origin pull/${PR_NUMBER}/merge
          git checkout FETCH_HEAD
        else
          git clone --depth 1 --single-branch --branch "${REF_NAME}" "$REPO_URL" "$REPO_DIR"
          cd "$REPO_DIR"
        fi

        echo "Loading VDPA kernel modules..."
        sudo modprobe vdpa
        sudo modprobe vhost_vdpa
        sudo modprobe vdpa_sim
        sudo modprobe vdpa_sim_blk
        sudo modprobe vdpa_sim_net

        echo "Creating VDPA devices..."
        sudo vdpa dev add name vdpa-blk0 mgmtdev vdpasim_blk
        sudo vdpa dev add name vdpa-blk1 mgmtdev vdpasim_blk
        sudo vdpa dev add name vdpa-blk2 mgmtdev vdpasim_net

        echo "Setting permissions..."
        for i in 0 1 2; do
          dev="/dev/vhost-vdpa-\$i"
          if [ -e "\$dev" ]; then
            sudo chown \$USER:\$USER "\$dev"
            sudo chmod 660 "\$dev"
          else
            echo "Warning: Device \$dev not found"
          fi
        done

        sudo ./scripts/dev_cli.sh tests --hypervisor mshv --integration -- -- --skip common_parallel::test_tpm --skip common_parallel::test_cpu_topology_421 --skip common_parallel::test_cpu_topology_142 --skip common_parallel::test_cpu_topology_262 --skip common_sequential::test_snapshot_restore_basic --skip common_sequential::test_snapshot_restore_with_fd --skip common_sequential::test_snapshot_restore_pvpanic --skip virtio_net_latency_us --skip common_parallel::test_cpu_hotplug
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe this list of skipped tests could be read from somewhere else, such as an environment variable or a file.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, this can be done.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What value would it add?

I imagine this list would shrink over time, so would it really be worth it?

Also, how would the environment variable be set?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What value would it add?

I imagine this list would shrink over time, so would it really be worth it?

Also, how would the environment variable be set?

Not much value, just cleaner code.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we should disable the tests in the integration.rs file instead. @likebreath?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the integration.rs file we do have some build-time filtering of tests that don't work on mshv. You will need to make sure you pass --no-default-features --features mshv when running/building the tests. You can still use a CH binary that supports both kvm and mshv - we just want to conditionally build the test suite.

Copy link
Contributor

@russell-islam russell-islam Oct 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think, since this CI will still be in testing mode for some time, we can merge this as is for now and work in parallel to make the changes in integration.rs and eventually remove these tests from here. We should create an issue, @gamora12, and start working on it. @rbradford?

Copy link
Contributor Author

@gamora12 gamora12 Oct 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, that's fine by me. I'm already working on it; I can raise a separate PR for these changes as well.

EOF

# Tears down what the run created: the per-run resource group and the SSH key
# pair stored on the runner. `if: always()` runs the job regardless of how
# the earlier jobs ended.
cleanup:
  name: Cleanup
  needs: run-tests
  if: always()
  runs-on: mshv
  steps:
    - name: Delete RG
      env:
        RG: MSHV-INTEGRATION-${{ github.run_id }}
      run: |
        # `az group exists` reports its answer by printing "true" or "false";
        # its exit status is 0 either way, so the printed value has to be
        # compared rather than the command used directly as the condition.
        if [[ "$(az group exists --name "${RG}")" == "true" ]]; then
          az group delete --name "${RG}" --yes --no-wait
        else
          echo "Resource Group ${RG} does not exist. Skipping deletion."
        fi
        echo "Cleanup process completed."

    - name: Delete SSH Key
      # Key removal is independent of the resource group; run it even when
      # the previous step failed so the private key is not left on the runner.
      if: always()
      env:
        KEY: azure_key_${{ github.run_id }}
      run: |
        if [ -f ~/.ssh/"${KEY}" ]; then
          rm -f ~/.ssh/"${KEY}" ~/.ssh/"${KEY}.pub"
          echo "SSH key deleted successfully."
        else
          echo "SSH key does not exist. Skipping deletion."
        fi
        echo "Cleanup process completed."
Loading