diff --git a/.envrc b/.envrc new file mode 100644 index 0000000000..3550a30f2d --- /dev/null +++ b/.envrc @@ -0,0 +1 @@ +use flake diff --git a/.github/workflows/build_nix.yaml b/.github/workflows/build_nix.yaml new file mode 100644 index 0000000000..586c985897 --- /dev/null +++ b/.github/workflows/build_nix.yaml @@ -0,0 +1,38 @@ +name: Cloud Hypervisor Build (Nix) +on: [push, pull_request, merge_group] +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build: + name: Build + runs-on: ubuntu-latest + steps: + - name: Code checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + - uses: cachix/install-nix-action@v31 + # We restore Nix evaluation and Nix tarball cache, speeding up the CI. + # This does not cover any Nix artifacts from the Nix store. + - name: Restore Nix cache + uses: actions/cache@v5 + with: + path: ~/.cache/nix + key: nix-cache-${{ github.job }} + # Nix binary cache + - uses: DeterminateSystems/magic-nix-cache-action@main + # Dedicated step to separate all the + # "copying path '/nix/store/...' from 'https://cache.nixos.org'." + # messages from the actual build output. + - name: Prepare Nix Store + run: nix develop --command bash -c "nix --version" + - name: Check Nix format + run: nix fmt -- --ci + - name: Check Nix Flake + run: nix flake check -L + - name: Build Cloud Hypervisor + run: | + nix build -L .#default + nix build -L .#cloud-hypervisor diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index e080c3fef9..34ee87f942 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -106,44 +106,6 @@ jobs: - name: Lint git commit messages run: | gitlint --commits "origin/$GITHUB_BASE_REF.." 
- lychee: - name: lychee - needs: [preflight] - if: needs.preflight.outputs.docs == 'true' || needs.preflight.outputs.full == 'true' - runs-on: ubuntu-latest - steps: - - name: Code checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - name: Get changed files in PR - id: changed-files - uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6 - with: - base_sha: ${{ github.event.pull_request.base.sha }} - - name: Verify Changed Files - run: | - set -eufo pipefail - echo "--- tj-actions/changed-files Outputs ---" - echo "any_changed: ${{ steps.changed-files.outputs.any_changed }}" - echo "all_changed_files: ${{ steps.changed-files.outputs.all_changed_files }}" - echo "added_files: ${{ steps.changed-files.outputs.added_files }}" - echo "modified_files: ${{ steps.changed-files.outputs.modified_files }}" - echo "deleted_files: ${{ steps.changed-files.outputs.deleted_files }}" - echo "renamed_files: ${{ steps.changed-files.outputs.renamed_files }}" - echo "----------------------------------------" - if [ -n "${{ steps.changed-files.outputs.all_changed_files }}" ]; then - echo "Detected changes: all_changed_files output is NOT empty." - else - echo "No changes detected: all_changed_files output IS empty." 
- fi - - name: Link Availability Check (Diff Only) - if: ${{ steps.changed-files.outputs.all_changed_files != '' }} - uses: lycheeverse/lychee-action@8646ba30535128ac92d33dfc9133794bfdd9b411 # v2.8.0 - with: - args: --verbose --config .lychee.toml ${{ steps.changed-files.outputs.all_changed_files }} - failIfEmpty: false - fail: true taplo: name: taplo needs: [preflight] @@ -314,16 +276,10 @@ jobs: fail-fast: false matrix: rust: - - beta - stable target: - - aarch64-unknown-linux-gnu - - aarch64-unknown-linux-musl - x86_64-unknown-linux-gnu - - x86_64-unknown-linux-musl include: - - rust: beta - experimental: true - rust: stable experimental: false steps: @@ -462,12 +418,8 @@ jobs: matrix: rust: - stable - - beta - - nightly - - "1.89.0" # MSRV — keep quoted. target: - x86_64-unknown-linux-gnu - - x86_64-unknown-linux-musl steps: - name: Code checkout uses: actions/checkout@v6 @@ -510,223 +462,6 @@ jobs: run: cargo build --locked --all --release --target=${{ matrix.target }} - name: Check build did not modify any files run: test -z "$(git status --porcelain)" - # garm-jammy + gnu: runs on PR and MQ. Other 3 matrix entries are in - # integration-x86-64-mq (sibling, MQ-only, runs in parallel). - integration-x86-64-pr: - name: integration-x86-64-pr - needs: [preflight, dco, quality, build] - if: >- - needs.preflight.outputs.full == 'true' && needs.dco.result == 'success' && needs.quality.result == 'success' && needs.build.result == 'success' - timeout-minutes: 80 - env: - # Our runner has 16 cores (nproc). - # We limit parallelism only to avoid exhausting disk space and memory - # resources, not to save CPU resources. 
- PARALLEL_INTEGRATION_TESTS_NUM: 12 - runs-on: garm-jammy-16 - steps: - - name: Code checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - name: Install Docker - run: | - set -eufo pipefail - sudo apt-get update - sudo apt-get -y install ca-certificates curl gnupg - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg - sudo chmod a+r /usr/share/keyrings/docker-archive-keyring.gpg - echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null - sudo apt-get update - sudo apt install -y docker-ce docker-ce-cli - - name: Prepare for VDPA - run: scripts/prepare_vdpa.sh - - name: Run unit tests - run: scripts/dev_cli.sh tests --unit --libc gnu - - name: Load openvswitch module - run: sudo modprobe openvswitch - - name: Run integration tests - timeout-minutes: 60 - run: scripts/dev_cli.sh tests --integration --libc gnu - # MQ-only: the 3 matrix entries that integration-x86-64-pr does not cover. - integration-x86-64-mq: - name: integration-x86-64-mq - needs: [preflight, dco, quality, build] - if: >- - github.event_name == 'merge_group' && needs.preflight.outputs.full == 'true' && needs.dco.result == 'success' && needs.quality.result == 'success' && needs.build.result == 'success' - timeout-minutes: 80 - env: - # Our runner has 16 cores (nproc). - # We limit parallelism only to avoid exhausting disk space and memory - # resources, not to save CPU resources. - PARALLEL_INTEGRATION_TESTS_NUM: 12 - strategy: - fail-fast: false - matrix: - include: - - {runner: garm-jammy, libc: musl} - - {runner: garm-jammy-amd, libc: gnu} - - {runner: garm-jammy-amd, libc: musl} - # format() because `${{ matrix.runner }}-16` is not valid in runs-on. 
- runs-on: ${{ format('{0}-16', matrix.runner) }} - steps: - - name: Code checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - name: Install Docker - run: | - set -eufo pipefail - sudo apt-get update - sudo apt-get -y install ca-certificates curl gnupg - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg - sudo chmod a+r /usr/share/keyrings/docker-archive-keyring.gpg - echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null - sudo apt-get update - sudo apt install -y docker-ce docker-ce-cli - - name: Prepare for VDPA - run: scripts/prepare_vdpa.sh - - name: Run unit tests - run: scripts/dev_cli.sh tests --unit --libc ${{ matrix.libc }} - - name: Load openvswitch module - run: sudo modprobe openvswitch - - name: Run integration tests - timeout-minutes: 60 - run: scripts/dev_cli.sh tests --integration --libc ${{ matrix.libc }} - integration-arm64: - name: integration-arm64 - needs: [preflight, dco, quality, build] - if: >- - github.event_name == 'merge_group' && needs.preflight.outputs.full == 'true' && needs.dco.result == 'success' && needs.quality.result == 'success' && needs.build.result == 'success' - timeout-minutes: 120 - env: - # Our runner has 80 cores (nproc). - # We limit parallelism only to avoid exhausting disk space and memory - # resources, not to save CPU resources. - PARALLEL_INTEGRATION_TESTS_NUM: 25 - runs-on: bookworm-arm64 - steps: - # arm64 runner user is "runner" (vfio's is "github-runner"). 
- - name: Fix workspace permissions - run: sudo chown -R runner:runner ${GITHUB_WORKSPACE} - - name: Code checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - name: Run unit tests (musl) - run: scripts/dev_cli.sh tests --unit --libc musl - - name: Load openvswitch module - run: sudo modprobe openvswitch - - name: Run integration tests (musl) - timeout-minutes: 60 - run: scripts/dev_cli.sh tests --integration --libc musl - - name: Install Azure CLI - run: | - set -eufo pipefail - sudo apt install -y ca-certificates curl apt-transport-https lsb-release gnupg - curl -sL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/microsoft.gpg > /dev/null - echo "deb [arch=arm64] https://packages.microsoft.com/repos/azure-cli/ bookworm main" | sudo tee /etc/apt/sources.list.d/azure-cli.list - sudo apt update - sudo apt install -y azure-cli - - name: Download Windows image - shell: bash - run: | - set -eufo pipefail - IMG_BASENAME=windows-11-iot-enterprise-aarch64.raw - IMG_PATH=$HOME/workloads/$IMG_BASENAME - IMG_GZ_PATH=$HOME/workloads/$IMG_BASENAME.gz - IMG_GZ_BLOB_NAME=windows-11-iot-enterprise-aarch64-9-min.raw.gz - cp "scripts/$IMG_BASENAME.sha1" "$HOME/workloads/" - pushd "$HOME/workloads" - if sha1sum "$IMG_BASENAME.sha1" --check; then - exit - fi - popd - mkdir -p "$HOME/workloads" - az storage blob download --container-name private-images --file "$IMG_GZ_PATH" --name "$IMG_GZ_BLOB_NAME" --connection-string "${{ secrets.CH_PRIVATE_IMAGES }}" - gzip -d "$IMG_GZ_PATH" - - name: Run Windows guest integration tests - timeout-minutes: 30 - run: scripts/dev_cli.sh tests --integration-windows --libc musl - integration-vfio: - name: integration-vfio - needs: [preflight, dco, quality, build] - if: >- - github.event_name == 'merge_group' && needs.preflight.outputs.full == 'true' && needs.dco.result == 'success' && needs.quality.result == 'success' && needs.build.result == 'success' - runs-on: vfio-nvidia - env: - 
AUTH_DOWNLOAD_TOKEN: ${{ secrets.AUTH_DOWNLOAD_TOKEN }} - steps: - # vfio-nvidia runner user is "github-runner" (not "runner" like arm64). - - name: Fix workspace permissions - run: sudo chown -R github-runner:github-runner "${GITHUB_WORKSPACE}" - - name: Code checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - name: Run VFIO integration tests - timeout-minutes: 25 - run: scripts/dev_cli.sh tests --integration-vfio - # Most tests are failing with musl, see #6790 - # - name: Run VFIO integration tests for musl - # timeout-minutes: 25 - # run: scripts/dev_cli.sh tests --integration-vfio --libc musl - integration-windows: - name: integration-windows - needs: [preflight, dco, quality, build] - if: >- - github.event_name == 'merge_group' && needs.preflight.outputs.full == 'true' && needs.dco.result == 'success' && needs.quality.result == 'success' && needs.build.result == 'success' - runs-on: garm-jammy-16 - steps: - - name: Code checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - name: Install Docker - run: | - set -eufo pipefail - sudo apt-get update - sudo apt-get -y install ca-certificates curl gnupg - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg - sudo chmod a+r /usr/share/keyrings/docker-archive-keyring.gpg - echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null - sudo apt-get update - sudo apt install -y docker-ce docker-ce-cli - - name: Install Azure CLI - run: | - set -eufo pipefail - sudo apt install -y ca-certificates curl apt-transport-https lsb-release gnupg - curl -sL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/microsoft.gpg > /dev/null - echo "deb [arch=amd64] https://packages.microsoft.com/repos/azure-cli/ jammy main" 
| sudo tee /etc/apt/sources.list.d/azure-cli.list - sudo apt update - sudo apt install -y azure-cli - - name: Download Windows image - run: | - set -eufo pipefail - mkdir $HOME/workloads - az storage blob download --container-name private-images --file "$HOME/workloads/windows-server-2025-amd64-1.raw" --name windows-server-2025-amd64-1.raw --connection-string "${{ secrets.CH_PRIVATE_IMAGES }}" - - name: Run Windows guest integration tests - timeout-minutes: 15 - run: scripts/dev_cli.sh tests --integration-windows - - name: Run Windows guest integration tests for musl - timeout-minutes: 15 - run: scripts/dev_cli.sh tests --integration-windows --libc musl - integration-rate-limiter: - name: integration-rate-limiter - needs: [preflight, dco, quality, build] - if: >- - github.event_name == 'merge_group' && needs.preflight.outputs.full == 'true' && needs.dco.result == 'success' && needs.quality.result == 'success' && needs.build.result == 'success' - runs-on: bare-metal-9950x - env: - AUTH_DOWNLOAD_TOKEN: ${{ secrets.AUTH_DOWNLOAD_TOKEN }} - steps: - - name: Code checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - name: Run rate-limiter integration tests - timeout-minutes: 20 - run: scripts/dev_cli.sh tests --integration-rate-limiter # The single required-status check. Branch protection requires this one job. 
all-green: name: all-green @@ -738,13 +473,6 @@ jobs: - fuzz-build - gitlint - hadolint - - integration-arm64 - # VFIO worker is failing #8160 - # - integration-vfio - # See: #8211 - # - integration-windows - - integration-x86-64-mq - - integration-x86-64-pr - openapi - package-consistency - preflight diff --git a/.github/workflows/docker-image.yaml b/.github/workflows/docker-image.yaml deleted file mode 100644 index 8636d35f00..0000000000 --- a/.github/workflows/docker-image.yaml +++ /dev/null @@ -1,65 +0,0 @@ -name: Cloud Hypervisor's Docker image update -on: - push: - branches: main - paths: resources/Dockerfile - pull_request: - paths: resources/Dockerfile -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event_name }} - cancel-in-progress: true - -env: - REGISTRY: ghcr.io - IMAGE_NAME: ${{ github.repository }} - -jobs: - main: - runs-on: ubuntu-latest - steps: - - name: Code checkout - uses: actions/checkout@v6 - - - name: Set up QEMU - uses: docker/setup-qemu-action@v4 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v4 - - - name: Login to ghcr - uses: docker/login-action@v4 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - - name: Docker meta - id: meta - uses: docker/metadata-action@v6 - with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} - # generate Docker tags based on the following events/attributes - tags: | - type=raw,value=20251114-0 - type=sha - - - name: Build and push - if: ${{ github.event_name == 'push' }} - uses: docker/build-push-action@v7 - with: - file: ./resources/Dockerfile - platforms: linux/amd64,linux/arm64 - push: true - tags: ${{ steps.meta.outputs.tags }} - - - name: Build only - if: ${{ github.event_name == 'pull_request' }} - uses: docker/build-push-action@v7 - with: - file: ./resources/Dockerfile - platforms: linux/amd64,linux/arm64 - tags: ${{ steps.meta.outputs.tags }} - - - 
name: Image digest - run: echo ${{ steps.docker_build.outputs.digest }} diff --git a/.github/workflows/integration-metrics.yaml b/.github/workflows/integration-metrics.yaml deleted file mode 100644 index 4e66f4b614..0000000000 --- a/.github/workflows/integration-metrics.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: Cloud Hypervisor Tests (Metrics) -on: - push: - branches: - - main - -jobs: - build: - name: Tests (Metrics) - runs-on: bare-metal-9950x - env: - METRICS_PUBLISH_KEY: ${{ secrets.METRICS_PUBLISH_KEY }} - steps: - - name: Code checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - name: Run metrics tests - timeout-minutes: 60 - run: scripts/dev_cli.sh tests --metrics -- --test-exclude micro_ -- --report-file /root/workloads/metrics.json - - name: Upload metrics report - run: 'curl -X PUT https://ch-metrics.azurewebsites.net/api/publishmetrics -H "x-functions-key: $METRICS_PUBLISH_KEY" -T ~/workloads/metrics.json' diff --git a/.github/workflows/mshv-infra.yaml b/.github/workflows/mshv-infra.yaml deleted file mode 100644 index 89cb5f6fbc..0000000000 --- a/.github/workflows/mshv-infra.yaml +++ /dev/null @@ -1,246 +0,0 @@ -name: MSHV Infra Setup -on: - workflow_call: - inputs: - ARCH: - description: 'Architecture for the VM' - required: true - type: string - KEY: - description: 'SSH Key Name' - required: true - type: string - OS_DISK_SIZE: - description: 'OS Disk Size in GB' - required: true - type: number - RG: - description: 'Resource Group Name' - required: true - type: string - VM_SKU: - description: 'VM SKU' - required: true - type: string - secrets: - MI_CLIENT_ID: - required: true - RUNNER_RG: - required: true - STORAGE_ACCOUNT_PATHS: - required: true - ARCH_SOURCE_PATH: - required: true - USERNAME: - required: true - outputs: - RG_NAME: - description: 'Resource group of the VM' - value: ${{ jobs.infra-setup.outputs.RG_NAME }} - VM_NAME: - description: 'Name of the VM' - value: ${{ jobs.infra-setup.outputs.VM_NAME }} - PRIVATE_IP: - 
description: 'Private IP of the VM' - value: ${{ jobs.infra-setup.outputs.PRIVATE_IP }} -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event_name }} - cancel-in-progress: true -jobs: - infra-setup: - name: ${{ inputs.ARCH }} VM Provision - runs-on: mshv - outputs: - RG_NAME: ${{ steps.rg-setup.outputs.RG_NAME }} - VM_NAME: ${{ steps.vm-setup.outputs.VM_NAME }} - PRIVATE_IP: ${{ steps.get-vm-ip.outputs.PRIVATE_IP }} - steps: - - name: Install & login to AZ CLI - env: - MI_CLIENT_ID: ${{ secrets.MI_CLIENT_ID }} - run: | - set -eufo pipefail - echo "Installing Azure CLI if not already installed" - if ! command -v az &>/dev/null; then - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - else - echo "Azure CLI already installed" - fi - az --version - echo "Logging into Azure CLI using Managed Identity" - az login --identity --client-id "${MI_CLIENT_ID}" - - - name: Get Location - id: get-location - env: - SKU: ${{ inputs.VM_SKU }} - STORAGE_ACCOUNT_PATHS: ${{ secrets.STORAGE_ACCOUNT_PATHS }} - run: | - set -eufo pipefail - # Extract vCPU count from SKU (e.g., "Standard_D2s_v3" => 2) - if ! 
[[ "$SKU" =~ ^Standard_[A-Za-z]+([1-9][0-9]*) ]]; then - printf 'Cannot extract vCPU count from SKU: %q\n' "$SKU" - exit 1 - fi - vcpu=${BASH_REMATCH[1]} - - SUPPORTED_LOCATIONS=$(echo "$STORAGE_ACCOUNT_PATHS" | jq -r 'to_entries[] | .key') - - for location in $SUPPORTED_LOCATIONS; do - family=$(az vm list-skus --size "$SKU" --location "$location" --resource-type "virtualMachines" --query '[0].family' -o tsv) - if [[ -z "$family" ]]; then - echo "Cannot determine VM family for SKU: $SKU in $location" - continue - fi - - remaining=$(az vm list-usage --location "$location" --query "[?name.value=='$family'] | [0]" -o json | - jq '(.limit | tonumber) - (.currentValue | tonumber) >= ($ARGS.positional[0] | tonumber)' --jsonargs "$vcpu") - if [[ "$remaining" = true ]]; then - echo "Sufficient quota found in $location" - echo "location=$location" >> "$GITHUB_OUTPUT" - exit 0 - fi - done - - echo "No location found with sufficient vCPU quota for SKU: $SKU" - exit 1 - - - name: Create Resource Group - id: rg-setup - env: - LOCATION: ${{ steps.get-location.outputs.location }} - RG: ${{ inputs.RG }} - STORAGE_ACCOUNT_PATHS: ${{ secrets.STORAGE_ACCOUNT_PATHS }} - run: | - set -eufo pipefail - echo "Creating Resource Group: $RG" - # Create the resource group - echo "Creating resource group in location: ${LOCATION}" - az group create --name "${RG}" --location "${LOCATION}" - echo "RG_NAME=${RG}" >> $GITHUB_OUTPUT - echo "Resource group created successfully." 
- - - name: Generate SSH Key - id: generate-ssh-key - env: - KEY: ${{ inputs.KEY }} - run: | - set -eufo pipefail - echo "Generating SSH key: $KEY" - mkdir -p ~/.ssh - ssh-keygen -t rsa -b 4096 -f ~/.ssh/"${KEY}" -N "" - - - name: Create VM - id: vm-setup - env: - KEY: ${{ inputs.KEY }} - LOCATION: ${{ steps.get-location.outputs.location }} - OS_DISK_SIZE: ${{ inputs.OS_DISK_SIZE }} - RG: ${{ inputs.RG }} - RUNNER_RG: ${{ secrets.RUNNER_RG }} - USERNAME: ${{ secrets.USERNAME }} - VM_SKU: ${{ inputs.VM_SKU }} - VM_IMAGE_NAME: ${{ inputs.ARCH }}_${{ steps.get-location.outputs.location }}_image - VM_NAME: ${{ inputs.ARCH }}_${{ steps.get-location.outputs.location }}_${{ github.run_id }} - run: | - set -eufo pipefail - echo "Creating $VM_SKU VM: $VM_NAME" - - # Extract subnet ID from the runner VM - echo "Retrieving subnet ID..." - SUBNET_ID=$(az network vnet list --resource-group "$RUNNER_RG" --query "[?contains(location, '${LOCATION}')].{SUBNETS:subnets}" | jq -r ".[0].SUBNETS[0].id") - if [[ -z "${SUBNET_ID}" ]]; then - echo "ERROR: Failed to retrieve Subnet ID." - exit 1 - fi - - # Extract image ID from the runner VM - echo "Retrieving image ID..." - IMAGE_ID=$(az image show --resource-group "$RUNNER_RG" --name "$VM_IMAGE_NAME" --query "id" -o tsv) - if [[ -z "${IMAGE_ID}" ]]; then - echo "ERROR: Failed to retrieve Image ID." - exit 1 - fi - - # Create VM - az vm create \ - --resource-group "${RG}" \ - --name "${VM_NAME}" \ - --subnet "${SUBNET_ID}" \ - --size "${VM_SKU}" \ - --location "${LOCATION}" \ - --image "${IMAGE_ID}" \ - --os-disk-size-gb "${OS_DISK_SIZE}" \ - --public-ip-sku Standard \ - --storage-sku Premium_LRS \ - --public-ip-address "" \ - --admin-username "${USERNAME}" \ - --ssh-key-value ~/.ssh/"${KEY}".pub \ - --security-type Standard \ - --output json - - az vm boot-diagnostics enable --name "${VM_NAME}" --resource-group "${RG}" - - echo "VM_NAME=${VM_NAME}" >> "$GITHUB_OUTPUT" - echo "VM creation process completed successfully." 
- - - name: Get VM Private IP - id: get-vm-ip - env: - RG: ${{ inputs.RG }} - VM_NAME: ${{ inputs.ARCH }}_${{ steps.get-location.outputs.location }}_${{ github.run_id }} - run: | - set -eufo pipefail - echo "Retrieving VM Private IP address..." - # Retrieve VM Private IP address - PRIVATE_IP=$(az vm show -g "${RG}" -n "${VM_NAME}" -d --query privateIps -o tsv) - if [[ -z "$PRIVATE_IP" ]]; then - echo "ERROR: Failed to retrieve private IP address." - exit 1 - fi - echo "PRIVATE_IP=$PRIVATE_IP" >> "$GITHUB_OUTPUT" - - - name: Wait for SSH availability - env: - KEY: ${{ inputs.KEY }} - PRIVATE_IP: ${{ steps.get-vm-ip.outputs.PRIVATE_IP }} - USERNAME: ${{ secrets.USERNAME }} - run: | - echo "Waiting for SSH to be accessible..." - timeout 120 bash -c 'until ssh -o StrictHostKeyChecking=no -i ~/.ssh/"${KEY}" -- "${USERNAME}@${PRIVATE_IP}" "exit" 2>/dev/null; do sleep 5; done' - echo "VM is accessible!" - - - name: Remove Old Host Key - env: - PRIVATE_IP: ${{ steps.get-vm-ip.outputs.PRIVATE_IP }} - run: | - set -eufo pipefail - echo "Removing the old host key" - ssh-keygen -R "$PRIVATE_IP" - - - name: SSH into VM and Install Dependencies - env: - KEY: ${{ inputs.KEY }} - PRIVATE_IP: ${{ steps.get-vm-ip.outputs.PRIVATE_IP }} - USERNAME: ${{ secrets.USERNAME }} - run: | - set -eufo pipefail - ssh -i ~/.ssh/"${KEY}" -o StrictHostKeyChecking=no -- "${USERNAME}@${PRIVATE_IP}" << EOF - set -eufo pipefail - echo "Logged in successfully." - echo "Installing dependencies..." - sudo tdnf install -y git moby-engine moby-cli clang llvm pkg-config make gcc glibc-devel - echo "Installing Rust..." 
- curl -sSf https://sh.rustup.rs | sh -s -- --default-toolchain stable --profile default -y - export PATH="\$HOME/.cargo/bin:\$PATH" - cargo --version - sudo mkdir -p /etc/docker/ - echo '{"default-ulimits":{"nofile":{"Hard":65535,"Name":"nofile","Soft":65535}}}' | sudo tee /etc/docker/daemon.json - sudo systemctl stop docker - sudo systemctl enable docker.service - sudo systemctl enable containerd.service - sudo systemctl start docker - sudo groupadd -f docker - sudo usermod -a -G docker "${USERNAME}" - sudo systemctl restart docker - EOF diff --git a/.github/workflows/mshv-integration.yaml b/.github/workflows/mshv-integration.yaml deleted file mode 100644 index 437cf44f6c..0000000000 --- a/.github/workflows/mshv-integration.yaml +++ /dev/null @@ -1,129 +0,0 @@ -name: Cloud Hypervisor Tests (MSHV) (x86_64) -on: [pull_request_target, merge_group] -permissions: {} - -jobs: - infra-setup: - name: MSHV Infra Setup (x86_64) - uses: ./.github/workflows/mshv-infra.yaml - with: - ARCH: x86_64 - KEY: azure_key_${{ github.run_id }} - OS_DISK_SIZE: 512 - RG: MSHV-INTEGRATION-${{ github.run_id }} - VM_SKU: Standard_D16s_v5 - secrets: - MI_CLIENT_ID: ${{ secrets.MSHV_MI_CLIENT_ID }} - RUNNER_RG: ${{ secrets.MSHV_RUNNER_RG }} - STORAGE_ACCOUNT_PATHS: ${{ secrets.MSHV_STORAGE_ACCOUNT_PATHS }} - ARCH_SOURCE_PATH: ${{ secrets.MSHV_X86_SOURCE_PATH }} - USERNAME: ${{ secrets.MSHV_USERNAME }} - - run-tests: - name: Integration Tests (x86_64) - needs: infra-setup - if: ${{ always() && needs.infra-setup.result == 'success' }} - runs-on: mshv - steps: - - name: Run integration tests - timeout-minutes: 60 - env: - KEY: azure_key_${{ github.run_id }} - PR_NUMBER: ${{ github.event.pull_request.number }} - REPO_URL: https://github.com/cloud-hypervisor/cloud-hypervisor.git - REPO_DIR: cloud-hypervisor - PRIVATE_IP: ${{ needs.infra-setup.outputs.PRIVATE_IP }} - RG: MSHV-${{ github.run_id }} - USERNAME: ${{ secrets.MSHV_USERNAME }} - run: | - set -eufo pipefail - echo "Connecting to the VM via 
SSH..." - ssh -i ~/.ssh/"${KEY}" -o StrictHostKeyChecking=no -- "${USERNAME}@${PRIVATE_IP}" << EOF - set -e - echo "Logged in successfully." - export PATH="\$HOME/.cargo/bin:\$PATH" - - if [[ "${{ github.event_name }}" == "pull_request_target" ]]; then - git clone --depth 1 "$REPO_URL" "$REPO_DIR" - cd "$REPO_DIR" - git fetch origin pull/${{ github.event.pull_request.number }}/merge - git checkout FETCH_HEAD - else - git clone --depth 1 --single-branch --branch "${{ github.ref_name }}" "$REPO_URL" "$REPO_DIR" - cd "$REPO_DIR" - fi - - echo "Loading VDPA kernel modules..." - sudo modprobe vdpa - sudo modprobe vhost_vdpa - sudo modprobe vdpa_sim - sudo modprobe vdpa_sim_blk - sudo modprobe vdpa_sim_net - - echo "Creating VDPA devices..." - sudo vdpa dev add name vdpa-blk0 mgmtdev vdpasim_blk - sudo vdpa dev add name vdpa-blk1 mgmtdev vdpasim_blk - sudo vdpa dev add name vdpa-blk2 mgmtdev vdpasim_net - - echo "Setting permissions..." - for i in 0 1 2; do - dev="/dev/vhost-vdpa-\$i" - if [ -e "\$dev" ]; then - sudo chown \$USER:\$USER "\$dev" - sudo chmod 660 "\$dev" - else - echo "Warning: Device \$dev not found" - fi - done - - sudo ./scripts/dev_cli.sh tests --hypervisor mshv --integration - EOF - - - name: Dump dmesg - if: always() - continue-on-error: true - env: - KEY: azure_key_${{ github.run_id }} - PRIVATE_IP: ${{ needs.infra-setup.outputs.PRIVATE_IP }} - USERNAME: ${{ secrets.MSHV_USERNAME }} - run: | - ssh -i ~/.ssh/"${KEY}" -o StrictHostKeyChecking=no -- "${USERNAME}@${PRIVATE_IP}" sudo dmesg - - - name: Dump serial console logs - if: always() - continue-on-error: true - env: - RG_NAME: ${{ needs.infra-setup.outputs.RG_NAME }} - VM_NAME: ${{ needs.infra-setup.outputs.VM_NAME }} - run: | - set -eufo pipefail - az vm boot-diagnostics get-boot-log --name "${VM_NAME}" --resource-group "${RG_NAME}" | jq -r - - cleanup: - name: Cleanup - needs: run-tests - if: always() - runs-on: mshv - steps: - - name: Delete RG - env: - RG: MSHV-INTEGRATION-${{ github.run_id }} 
- run: | - if az group exists --name "${RG}"; then - az group delete --name "${RG}" --yes --no-wait - else - echo "Resource Group ${RG} does not exist. Skipping deletion." - fi - echo "Cleanup process completed." - - - name: Delete SSH Key - env: - KEY: azure_key_${{ github.run_id }} - run: | - if [ -f ~/.ssh/"${KEY}" ]; then - rm -f ~/.ssh/"${KEY}" ~/.ssh/"${KEY}.pub" - echo "SSH key deleted successfully." - else - echo "SSH key does not exist. Skipping deletion." - fi - echo "Cleanup process completed." diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml deleted file mode 100644 index bc7c3e152e..0000000000 --- a/.github/workflows/release.yaml +++ /dev/null @@ -1,95 +0,0 @@ -name: Cloud Hypervisor Release -on: [create, merge_group] -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event_name }} - cancel-in-progress: true -env: - GITHUB_TOKEN: ${{ github.token }} - -jobs: - release: - if: (github.event_name == 'create' && github.event.ref_type == 'tag') || github.event_name == 'merge_group' - name: Release ${{ matrix.platform.target }} - strategy: - fail-fast: false - matrix: - platform: - - target: x86_64-unknown-linux-gnu - args: --all --release --features mshv - name_ch: cloud-hypervisor - name_ch_remote: ch-remote - - target: x86_64-unknown-linux-musl - args: --all --release --features mshv - name_ch: cloud-hypervisor-static - name_ch_remote: ch-remote-static - - target: aarch64-unknown-linux-musl - args: --all --release - name_ch: cloud-hypervisor-static-aarch64 - name_ch_remote: ch-remote-static-aarch64 - runs-on: ubuntu-latest - steps: - - name: Code checkout - uses: actions/checkout@v6 - - name: Install musl-gcc - if: contains(matrix.platform.target, 'musl') - run: sudo apt install -y musl-tools - - name: Create release directory - if: | - github.event_name == 'create' && github.event.ref_type == 'tag' && - matrix.platform.target == 'x86_64-unknown-linux-gnu' - run: rsync 
-rv --exclude=.git . ../cloud-hypervisor-${{ github.event.ref }} - - name: Build ${{ matrix.platform.target }} - uses: houseabsolute/actions-rust-cross@v1 - with: - command: build - target: ${{ matrix.platform.target }} - args: ${{ matrix.platform.args }} - strip: true - toolchain: "1.89.0" - - name: Copy Release Binaries - if: github.event_name == 'create' && github.event.ref_type == 'tag' - shell: bash - run: | - cp target/${{ matrix.platform.target }}/release/cloud-hypervisor ./${{ matrix.platform.name_ch }} - cp target/${{ matrix.platform.target }}/release/ch-remote ./${{ matrix.platform.name_ch_remote }} - - name: Upload Release Artifacts - if: github.event_name == 'create' && github.event.ref_type == 'tag' - uses: actions/upload-artifact@v7 - with: - name: Artifacts for ${{ matrix.platform.target }} - path: | - ./${{ matrix.platform.name_ch }} - ./${{ matrix.platform.name_ch_remote }} - - name: Vendor - if: | - github.event_name == 'create' && github.event.ref_type == 'tag' && - matrix.platform.target == 'x86_64-unknown-linux-gnu' - working-directory: ../cloud-hypervisor-${{ github.event.ref }} - run: | - mkdir ../vendor-cargo-home - export CARGO_HOME=$(realpath ../vendor-cargo-home) - mkdir .cargo - cargo vendor > .cargo/config.toml - - name: Create vendored source archive - if: | - github.event_name == 'create' && github.event.ref_type == 'tag' && - matrix.platform.target == 'x86_64-unknown-linux-gnu' - run: tar cJf cloud-hypervisor-${{ github.event.ref }}.tar.xz ../cloud-hypervisor-${{ github.event.ref }} - - name: Upload cloud-hypervisor vendored source archive - if: | - github.event_name == 'create' && github.event.ref_type == 'tag' && - matrix.platform.target == 'x86_64-unknown-linux-gnu' - id: upload-release-cloud-hypervisor-vendored-sources - uses: actions/upload-artifact@v7 - with: - path: cloud-hypervisor-${{ github.event.ref }}.tar.xz - name: cloud-hypervisor-${{ github.event.ref }}.tar.xz - - name: Create GitHub Release - if: github.event_name == 
'create' && github.event.ref_type == 'tag' - uses: softprops/action-gh-release@v3 - with: - draft: true - files: | - ./${{ matrix.platform.name_ch }} - ./${{ matrix.platform.name_ch_remote }} - ./cloud-hypervisor-${{ github.event.ref }}.tar.xz diff --git a/.reuse/dep5 b/.reuse/dep5 index 0e17b4b7e2..e624ecf662 100644 --- a/.reuse/dep5 +++ b/.reuse/dep5 @@ -7,6 +7,6 @@ Files: docs/*.md *.md Copyright: 2024 License: CC-BY-4.0 -Files: scripts/* test_data/* *.toml .git* .editorconfig fuzz/Cargo.lock fuzz/.gitignore resources/linux-config-* vmm/src/api/openapi/cloud-hypervisor.yaml CODEOWNERS Cargo.lock +Files: scripts/* test_data/* *.toml .git* .editorconfig fuzz/Cargo.lock fuzz/.gitignore resources/linux-config-* vmm/src/api/openapi/cloud-hypervisor.yaml CODEOWNERS Cargo.lock flake.nix flake.lock chv.nix .envrc Copyright: 2024 License: Apache-2.0 diff --git a/.typos.toml b/.typos.toml index 5dff00d28c..ef9c7b96d0 100644 --- a/.typos.toml +++ b/.typos.toml @@ -5,6 +5,8 @@ extend-exclude = [ "hypervisor/src/kvm/x86_64/mod.rs", "resources/linux-config-*", ] +[default] +extend-ignore-re = ["_TME_"] [default.extend-words] CLASSE = "CLASSE" @@ -26,3 +28,7 @@ fo = "fo" fpr = "fpr" # Public Linux API msg_controllen = "msg_controllen" +tme = "tme" +l3c_qm_conver_factor = "l3c_qm_conver_factor" +IA32_PMC_GPn_CFG_C = "IA32_PMC_GPn_CFG_C" +IA32_PMC_FXm_CFG_C = "IA32_PMC_FXm_CFG_C" diff --git a/AGENTS.md b/AGENTS.md index 94ad277331..2c6a5d4bf5 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -4,6 +4,10 @@ This is a compact [AGENTS.md](https://agents.md/) file for Cloud Hypervisor. It is meant to help automated coding agents make useful changes that stay safe, reviewable, and compatible with the project's normal engineering constraints. +This checkout is a Cyberus Technology fork of Cloud Hypervisor. It is maintained +independently from upstream, while still following upstream contribution and +code-quality guidance unless fork-specific requirements say otherwise. 
+ ## For LLMs ### Project Context @@ -14,6 +18,9 @@ reviewable, and compatible with the project's normal engineering constraints. - The main supported architectures are `x86_64` and `aarch64`; the main hypervisor backends are KVM and MSHV. `x86_64` with KVM gets the most regular exercise, but changes must not make the other first-class targets worse. +- Treat live migration and the vCPU lifecycle as first-class production areas. + Preserve deterministic state transfer, robust failure handling, correct device + and memory state, and explicit race-free vCPU state transitions. ### Change Guidelines @@ -30,6 +37,11 @@ reviewable, and compatible with the project's normal engineering constraints. - Preserve existing behavior unless the requested change explicitly needs a behavior change; refactors must preserve behavior. Call out compatibility or migration implications. +- Prefer simple solutions over unnecessary traits, excessive indirection, or + premature abstraction. +- Prefer `Result` over panics for recoverable production-path errors. Handle + syscall and KVM ioctl return values explicitly and include useful context in + error messages. - Do not invent APIs, behavior, or requirements. If something is uncertain, state the uncertainty and proceed only with minimal, explicit assumptions. @@ -39,6 +51,8 @@ reviewable, and compatible with the project's normal engineering constraints. comment with the invariants, and make sure the surrounding code upholds them. - Assume concurrency matters. Avoid races, unsynchronized shared state, and implicit ordering assumptions; prefer clear ownership and synchronization. +- Keep KVM code aligned with the kernel API. Do not rely on undocumented + behavior or ignore backend-specific failure modes. - Keep docs and comments short and useful. Document non-trivial invariants at struct definitions and critical state transitions. - Logging should be minimal and high signal. 
Use `info!` for important normal @@ -64,6 +78,11 @@ reviewable, and compatible with the project's normal engineering constraints. these code paths; otherwise the integration-test code is not included. Do not assume the tests can be run directly in a restricted agent environment; ask the developer to run them when real integration coverage is needed. +- For broader VM behavior, this fork also uses an external `libvirt-tests` suite + outside this repository. If a change likely needs that coverage, say so and + ask whether it should be run, skipped, or handled manually by the developer. + Only run it yourself if the developer provides the necessary instructions and + access details. ### Commit and Patch Formatting @@ -78,4 +97,15 @@ reviewable, and compatible with the project's normal engineering constraints. - Temporary allowances such as `#[allow(unused)]` or ignored tests are only acceptable if resolved within the same commit series or paired with a clear TODO referencing a ticket. Ask the developer if in doubt. +- Commits need a `On-behalf-of: SAP $firstname.$lastname@sap.com` trailer: e.g.: + ``` + $component: $summary + + $body + On-behalf-of: SAP philipp.schuster@sap.com + Signed-off-by: Philipp Schuster + ``` + as our work is sponsored by SAP, which gets its money from the EU (Apeiro + project). 
The enforcing CI rule is in + `./scripts/gitlint/rules/on-behalf-of-marker.py` diff --git a/Cargo.lock b/Cargo.lock index d0a21c5f0d..7dac00e030 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -71,7 +71,7 @@ version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -82,7 +82,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -114,12 +114,19 @@ version = "0.1.0" dependencies = [ "anyhow", "byteorder", + "clap", "fdt", + "flate2", "hypervisor", "libc", "linux-loader", "log", + "prettyplease", + "proptest", + "quote", "serde", + "serde_json", + "syn", "thiserror", "uuid", "vm-fdt", @@ -180,7 +187,7 @@ dependencies = [ "polling", "rustix", "slab", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -238,7 +245,7 @@ dependencies = [ "rustix", "signal-hook-registry", "slab", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -270,6 +277,28 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "aws-lc-rs" +version = "1.16.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ec6fb3fe69024a75fa7e1bfb48aa6cf59706a101658ea01bfd33b2b248a038f" +dependencies = [ + "aws-lc-sys", + "zeroize", +] + +[[package]] +name = "aws-lc-sys" +version = "0.40.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f50037ee5e1e41e7b8f9d161680a725bd1626cb6f8c7e901f91f942850852fe7" +dependencies = [ + "cc", + "cmake", + "dunce", + "fs_extra", +] + [[package]] name = "backtrace" version = "0.3.76" @@ -285,6 +314,21 @@ dependencies = [ "windows-link", ] +[[package]] +name = "bit-set" +version 
= "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + [[package]] name = "bitfield-struct" version = "0.10.1" @@ -404,7 +448,7 @@ checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601" dependencies = [ "cfg-if", "cpufeatures", - "rand_core", + "rand_core 0.10.1", ] [[package]] @@ -462,12 +506,22 @@ dependencies = [ "tpm", "tracer", "vm-memory", + "vm-migration", "vmm", "vmm-sys-util", "wait-timeout", "zbus", ] +[[package]] +name = "cmake" +version = "0.1.58" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0f78a02292a74a88ac736019ab962ece0bc380e3f977bf72e376c5d78ff0678" +dependencies = [ + "cc", +] + [[package]] name = "colorchoice" version = "1.0.5" @@ -654,9 +708,15 @@ dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys", + "windows-sys 0.61.2", ] +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + [[package]] name = "either" version = "1.15.0" @@ -736,7 +796,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -814,12 +874,24 @@ dependencies = [ "spin", ] +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + [[package]] name = "foldhash" version = "0.1.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "fs_extra" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + [[package]] name = "futures" version = "0.3.32" @@ -979,7 +1051,7 @@ dependencies = [ "cfg-if", "libc", "r-efi 6.0.0", - "rand_core", + "rand_core 0.10.1", "wasip2", "wasip3", ] @@ -1201,6 +1273,7 @@ dependencies = [ "portable-atomic", "portable-atomic-util", "serde_core", + "windows-sys 0.52.0", ] [[package]] @@ -1765,7 +1838,7 @@ dependencies = [ "hermit-abi", "pin-project-lite", "rustix", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -1783,6 +1856,15 @@ dependencies = [ "portable-atomic", ] +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + [[package]] name = "prettyplease" version = "0.2.37" @@ -1811,6 +1893,31 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "proptest" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b45fcc2344c680f5025fe57779faef368840d0bd1f42f216291f0dc4ace4744" +dependencies = [ + "bit-set", + "bit-vec", + "bitflags 2.11.1", + "num-traits", + "rand 0.9.4", + "rand_chacha", + "rand_xorshift", + "regex-syntax", + "rusty-fork", + "tempfile", + "unarray", +] + +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + [[package]] name = "quote" version = "1.0.45" @@ -1832,6 +1939,16 @@ version = "6.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" +[[package]] +name = "rand" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" +dependencies = [ + "rand_chacha", + "rand_core 0.9.5", +] + [[package]] name = "rand" version = "0.10.1" @@ -1840,7 +1957,26 @@ checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207" dependencies = [ "chacha20", "getrandom 0.4.2", - "rand_core", + "rand_core 0.10.1", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.5", +] + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", ] [[package]] @@ -1849,6 +1985,15 @@ version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69" +[[package]] +name = "rand_xorshift" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "513962919efc330f829edb2535844d1b912b0fbe2ca165d613e4e8788bb05a5a" +dependencies = [ + "rand_core 0.9.5", +] + [[package]] name = "range_map_vec" version = "0.2.0" @@ -1926,6 +2071,20 @@ dependencies = [ "syn", ] +[[package]] +name = "ring" +version = "0.17.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +dependencies = [ + "cc", + "cfg-if", + "getrandom 0.2.17", + "libc", + "untrusted", + "windows-sys 0.52.0", +] + [[package]] name = "rustc-demangle" version = "0.1.27" @@ -1948,7 +2107,42 @@ dependencies = [ 
"errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.61.2", +] + +[[package]] +name = "rustls" +version = "0.23.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef86cd5876211988985292b91c96a8f2d298df24e75989a43a3c73f2d4d8168b" +dependencies = [ + "aws-lc-rs", + "once_cell", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-pki-types" +version = "1.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30a7197ae7eb376e574fe940d068c30fe0462554a3ddbe4eca7838e049c937a9" +dependencies = [ + "zeroize", +] + +[[package]] +name = "rustls-webpki" +version = "0.103.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e" +dependencies = [ + "aws-lc-rs", + "ring", + "rustls-pki-types", + "untrusted", ] [[package]] @@ -1957,6 +2151,18 @@ version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" +[[package]] +name = "rusty-fork" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc6bf79ff24e648f6da1f8d1f011e9cac26491b619e6b9280f2b47f1774e6ee2" +dependencies = [ + "fnv", + "quick-error", + "tempfile", + "wait-timeout", +] + [[package]] name = "scopeguard" version = "1.2.0" @@ -2146,6 +2352,12 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + [[package]] name = "syn" version = "2.0.117" @@ -2167,7 +2379,7 @@ dependencies = [ "getrandom 0.4.2", "once_cell", "rustix", - "windows-sys", + "windows-sys 0.61.2", ] 
[[package]] @@ -2177,7 +2389,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "230a1b821ccbd75b185820a1f1ff7b14d21da1e442e22c0863ea5f08771a8874" dependencies = [ "rustix", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -2187,7 +2399,7 @@ dependencies = [ "dirs", "epoll", "libc", - "rand", + "rand 0.10.1", "serde_json", "ssh2", "thiserror", @@ -2317,9 +2529,15 @@ checksum = "f2f6fb2847f6742cd76af783a2a2c49e9375d0a111c7bef6f71cd9e738c72d6e" dependencies = [ "memoffset", "tempfile", - "windows-sys", + "windows-sys 0.61.2", ] +[[package]] +name = "unarray" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" + [[package]] name = "unicode-ident" version = "1.0.24" @@ -2332,6 +2550,12 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + [[package]] name = "utf8parse" version = "0.2.2" @@ -2346,7 +2570,7 @@ checksum = "ddd74a9687298c6858e9b88ec8935ec45d22e8fd5e6394fa1bd4e99a87789c76" dependencies = [ "getrandom 0.4.2", "js-sys", - "rand", + "rand 0.10.1", "serde_core", "wasm-bindgen", ] @@ -2495,6 +2719,7 @@ dependencies = [ "rate_limiter", "seccompiler", "serde", + "serde_json", "serde_with", "serial_buffer", "thiserror", @@ -2568,10 +2793,12 @@ version = "0.1.0" dependencies = [ "anyhow", "itertools", + "rustls", "serde", "serde_json", "thiserror", "vm-memory", + "zerocopy", ] [[package]] @@ -2607,6 +2834,7 @@ dependencies = [ "igvm", "igvm_defs", "iommufd-ioctls", + "kvm-bindings", "landlock", "libc", "linux-loader", @@ -2795,6 +3023,15 @@ version = "0.2.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + [[package]] name = "windows-sys" version = "0.61.2" @@ -2804,6 +3041,70 @@ dependencies = [ "windows-link", ] +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = 
"windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + [[package]] name = "winnow" version = "1.0.0" @@ -2935,7 +3236,7 @@ dependencies = [ "tracing", "uds_windows", "uuid", - "windows-sys", + "windows-sys 0.61.2", "winnow", "zbus_macros", "zbus_names", @@ -2988,6 +3289,12 @@ dependencies = [ "syn", ] +[[package]] +name = "zeroize" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" + [[package]] name = "zmij" version = "1.0.21" diff --git a/Cargo.toml b/Cargo.toml index 3ef574c2f4..e45dc39d19 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,7 +6,20 @@ codegen-units = 1 lto = true opt-level = "s" -strip = true + +# Tradeof between performance and fast compilation times for local testing and +# development with frequent rebuilds. +[profile.optimized-dev] +codegen-units = 16 +inherits = "release" +lto = false +opt-level = 2 +strip = false + +# Optimize more for dependencies: They don't require frequent rebuilds. 
+[profile.optimized-dev.package."*"] +codegen-units = 1 +opt-level = 3 [profile.profiling] debug = true @@ -90,11 +103,21 @@ dhat = "0.3.3" dirs = "6.0.0" env_logger = "0.11.10" epoll = "4.4.0" +# Used for (de-) compressing CPU profiles +flate2 = "1.1.9" flume = "0.12.0" itertools = "0.14.0" -jiff = { version = "0.2", default-features = false, features = ["std"] } +jiff = { version = "0.2.23", default-features = false, features = [ + "std", + "tz-system", +] } libc = "0.2.186" log = "0.4.29" +rustls = { version = "0.23.38", default-features = false, features = [ + "aws-lc-rs", + "std", + "tls12", +] } sha2 = "0.11.0" signal-hook = "0.4.4" thiserror = "2.0.18" diff --git a/README.md b/README.md index 80048ab328..0ee4b3af4b 100644 --- a/README.md +++ b/README.md @@ -1,408 +1,27 @@ -- [1. What is Cloud Hypervisor?](#1-what-is-cloud-hypervisor) - - [Objectives](#objectives) - - [High Level](#high-level) - - [Architectures](#architectures) - - [Guest OS](#guest-os) -- [2. Getting Started](#2-getting-started) - - [Host OS](#host-os) - - [Use Pre-built Binaries](#use-pre-built-binaries) - - [Packages](#packages) - - [Building from Source](#building-from-source) - - [Booting Linux](#booting-linux) - - [Firmware Booting](#firmware-booting) - - [Custom Kernel and Disk Image](#custom-kernel-and-disk-image) - - [Building your Kernel](#building-your-kernel) - - [Disk image](#disk-image) - - [Booting the guest VM](#booting-the-guest-vm) -- [3. Status](#3-status) - - [Hot Plug](#hot-plug) - - [Device Model](#device-model) - - [Roadmap](#roadmap) -- [4. Relationship with _Rust VMM_ Project](#4-relationship-with-rust-vmm-project) - - [Differences with Firecracker and crosvm](#differences-with-firecracker-and-crosvm) -- [5. Community](#5-community) - - [Contribute](#contribute) - - [Slack](#slack) - - [Mailing list](#mailing-list) - - [Security issues](#security-issues) - -# 1. What is Cloud Hypervisor? 
- -Cloud Hypervisor is an open source Virtual Machine Monitor (VMM) that runs on -top of the [KVM](https://www.kernel.org/doc/Documentation/virtual/kvm/api.txt) -hypervisor and the Microsoft Hypervisor (MSHV). - -The project focuses on running modern, _Cloud Workloads_, on specific, common, -hardware architectures. In this case _Cloud Workloads_ refers to those that are -run by customers inside a Cloud Service Provider. This means modern operating -systems with most I/O handled by -paravirtualised devices (e.g. _virtio_), no requirement for legacy devices, and -64-bit CPUs. - -Cloud Hypervisor is implemented in [Rust](https://www.rust-lang.org/) and is -based on the [Rust VMM](https://github.com/rust-vmm) crates. - -## Objectives - -### High Level - -- Runs on KVM or MSHV -- Minimal emulation -- Low latency -- Low memory footprint -- Low complexity -- High performance -- Small attack surface -- 64-bit support only -- CPU, memory, PCI hotplug -- Machine to machine migration - -### Architectures - -Cloud Hypervisor supports the `x86-64`, `AArch64` and `riscv64` -architectures, with functionality varying across these platforms. The -functionality differences between `x86-64` and `AArch64` are documented -in [#1125](https://github.com/cloud-hypervisor/cloud-hypervisor/issues/1125). -The `riscv64` architecture support is experimental and offers limited -functionality. For more details and instructions, please refer to [riscv -documentation](docs/riscv.md). - -### Guest OS - -Cloud Hypervisor supports `64-bit Linux` and Windows 10/Windows Server 2019. - -# 2. Getting Started - -The following sections describe how to build and run Cloud Hypervisor. - -## Prerequisites for AArch64 - -- AArch64 servers (recommended) or development boards equipped with the GICv3 - interrupt controller. - -## Host OS - -For required KVM functionality and adequate performance the recommended host -kernel version is 5.13. The majority of the CI currently tests with kernel -version 5.15. 
- -## Use Pre-built Binaries - -The recommended approach to getting started with Cloud Hypervisor is by using a -pre-built binary. Binaries are available for the [latest -release](https://github.com/cloud-hypervisor/cloud-hypervisor/releases/latest). -Use `cloud-hypervisor-static` for `x86-64` or `cloud-hypervisor-static-aarch64` -for `AArch64` platform. - -## Packages - -For convenience, packages are also available targeting some popular Linux -distributions. This is thanks to the [Open Build -Service](https://build.opensuse.org). The [OBS -README](https://github.com/cloud-hypervisor/obs-packaging) explains how to -enable the repository in a supported Linux distribution and install Cloud Hypervisor -and accompanying packages. Please report any packaging issues in the -[obs-packaging](https://github.com/cloud-hypervisor/obs-packaging) repository. - -## Building from Source - -Please see the [instructions for building from source](docs/building.md) if you -do not wish to use the pre-built binaries. - -## Booting Linux - -Cloud Hypervisor boots guests in one of two ways. The first is direct -kernel boot, where a kernel image is passed to `--kernel`. The x86-64 -kernel must be built with PVH support or be a bzImage. The second is -firmware boot, where a firmware image is passed to `--firmware` and -brings up the guest's normal boot loader. - -Two firmware options are supported, and which one works best depends -on the guest OS. [Rust Hypervisor -Firmware](https://github.com/cloud-hypervisor/rust-hypervisor-firmware) -is a lightweight Rust-based PVH firmware. The edk2 UEFI firmware is -called `CLOUDHV.fd` for x86-64 and `CLOUDHV_EFI.fd` for AArch64. -Prebuilt binaries for both are available at their respective releases -pages, [Rust Hypervisor -Firmware](https://github.com/cloud-hypervisor/rust-hypervisor-firmware/releases/latest) -and [our edk2 -fork](https://github.com/cloud-hypervisor/edk2/releases/latest). 
-The edk2 fork carries customizations required to boot AArch64 guests -on cloud-hypervisor. See [docs/uefi.md](docs/uefi.md) for differences -with upstream tianocore/edk2. - -### Firmware Booting - -Cloud Hypervisor supports booting disk images containing all needed components -to run cloud workloads, a.k.a. cloud images. - -The following sample commands will download an Ubuntu Cloud image, converting -it into a format that Cloud Hypervisor can use and a firmware to boot the image -with. - -```shell -$ wget https://cloud-images.ubuntu.com/focal/current/focal-server-cloudimg-amd64.img -$ qemu-img convert -p -f qcow2 -O raw focal-server-cloudimg-amd64.img focal-server-cloudimg-amd64.raw -$ wget https://github.com/cloud-hypervisor/rust-hypervisor-firmware/releases/download/0.4.2/hypervisor-fw -``` - -The Ubuntu cloud images do not ship with a default password so it necessary to -use a `cloud-init` disk image to customise the image on the first boot. A basic -`cloud-init` image is generated by this [script](scripts/create-cloud-init.sh). -This seeds the image with a default username/password of `cloud/cloud123`. It -is only necessary to add this disk image on the first boot. Script also assigns -default IP address using `test_data/cloud-init/ubuntu/local/network-config` details -with `--net "mac=12:34:56:78:90:ab,tap="` option. Then the matching mac address -interface will be enabled as per `network-config` details. - -```shell -$ sudo setcap cap_net_admin+ep ./cloud-hypervisor -$ ./create-cloud-init.sh -$ ./cloud-hypervisor \ - --firmware ./hypervisor-fw \ - --disk path=focal-server-cloudimg-amd64.raw path=/tmp/ubuntu-cloudinit.img \ - --cpus boot=4 \ - --memory size=1024M \ - --net "tap=,mac=,ip=,mask=" -``` - -If access to the firmware messages or interaction with the boot loader (e.g. -GRUB) is required then it necessary to switch to the serial console instead of -`virtio-console`. 
- -```shell -$ ./cloud-hypervisor \ - --kernel ./hypervisor-fw \ - --disk path=focal-server-cloudimg-amd64.raw path=/tmp/ubuntu-cloudinit.img \ - --cpus boot=4 \ - --memory size=1024M \ - --net "tap=,mac=,ip=,mask=" \ - --serial tty \ - --console off -``` - -## Booting: `--firmware` vs `--kernel` - -The following scenarios are supported by Cloud Hypervisor to bootstrap a VM, i.e., -to load a payload/bootitem(s): - -- Provide firmware -- Provide kernel \[+ cmdline\]\ [+ initrd\] - -Please note that our Cloud Hypervisor firmware (`hypervisor-fw`) has a Xen PVH -boot entry, therefore it can also be booted via the `--kernel` parameter, as -seen in some examples. - -### Custom Kernel and Disk Image - -#### Building your Kernel - -Cloud Hypervisor also supports direct kernel boot. For x86-64, a `vmlinux` ELF kernel (compiled with PVH support) or a regular bzImage are supported. In order to support development there is a custom branch; however provided the required options are enabled any recent kernel will suffice. - -To build the kernel: - -```shell -# Clone the Cloud Hypervisor Linux branch -$ git clone --depth 1 https://github.com/cloud-hypervisor/linux.git -b ch-6.12.8 linux-cloud-hypervisor -$ pushd linux-cloud-hypervisor -$ make ch_defconfig -# Do native build of the x86-64 kernel -$ KCFLAGS="-Wa,-mx86-used-note=no" make bzImage -j `nproc` -# Do native build of the AArch64 kernel -$ make -j `nproc` -$ popd -``` - -For x86-64, the `vmlinux` kernel image will then be located at -`linux-cloud-hypervisor/arch/x86/boot/compressed/vmlinux.bin`. -For AArch64, the `Image` kernel image will then be located at -`linux-cloud-hypervisor/arch/arm64/boot/Image`. - -#### Disk image - -For the disk image the same Ubuntu image as before can be used. This contains -an `ext4` root filesystem. 
- -```shell -$ wget https://cloud-images.ubuntu.com/focal/current/focal-server-cloudimg-amd64.img # x86-64 -$ wget https://cloud-images.ubuntu.com/focal/current/focal-server-cloudimg-arm64.img # AArch64 -$ qemu-img convert -p -f qcow2 -O raw focal-server-cloudimg-amd64.img focal-server-cloudimg-amd64.raw # x86-64 -$ qemu-img convert -p -f qcow2 -O raw focal-server-cloudimg-arm64.img focal-server-cloudimg-arm64.raw # AArch64 -``` - -#### Booting the guest VM - -These sample commands boot the disk image using the custom kernel whilst also -supplying the desired kernel command line. - -- x86-64 - -```shell -$ sudo setcap cap_net_admin+ep ./cloud-hypervisor -$ ./create-cloud-init.sh -$ ./cloud-hypervisor \ - --kernel ./linux-cloud-hypervisor/arch/x86/boot/compressed/vmlinux.bin \ - --disk path=focal-server-cloudimg-amd64.raw path=/tmp/ubuntu-cloudinit.img \ - --cmdline "console=hvc0 root=/dev/vda1 rw" \ - --cpus boot=4 \ - --memory size=1024M \ - --net "tap=,mac=,ip=,mask=" -``` - -- AArch64 - -```shell -$ sudo setcap cap_net_admin+ep ./cloud-hypervisor -$ ./create-cloud-init.sh -$ ./cloud-hypervisor \ - --kernel ./linux-cloud-hypervisor/arch/arm64/boot/Image \ - --disk path=focal-server-cloudimg-arm64.raw path=/tmp/ubuntu-cloudinit.img \ - --cmdline "console=hvc0 root=/dev/vda1 rw" \ - --cpus boot=4 \ - --memory size=1024M \ - --net "tap=,mac=,ip=,mask=" -``` - -If earlier kernel messages are required the serial console should be used instead of `virtio-console`. 
- -- x86-64 - -```shell -$ ./cloud-hypervisor \ - --kernel ./linux-cloud-hypervisor/arch/x86/boot/compressed/vmlinux.bin \ - --console off \ - --serial tty \ - --disk path=focal-server-cloudimg-amd64.raw \ - --cmdline "console=ttyS0 root=/dev/vda1 rw" \ - --cpus boot=4 \ - --memory size=1024M \ - --net "tap=,mac=,ip=,mask=" -``` - -- AArch64 - -```shell -$ ./cloud-hypervisor \ - --kernel ./linux-cloud-hypervisor/arch/arm64/boot/Image \ - --console off \ - --serial tty \ - --disk path=focal-server-cloudimg-arm64.raw \ - --cmdline "console=ttyAMA0 root=/dev/vda1 rw" \ - --cpus boot=4 \ - --memory size=1024M \ - --net "tap=,mac=,ip=,mask=" -``` - -# 3. Status - -Cloud Hypervisor is under active development. The following stability -guarantees are currently made: - -* The API (including command line options) will not be removed or changed in a - breaking way without a minimum of 2 major releases notice. Where possible - warnings will be given about the use of deprecated functionality and the - deprecations will be documented in the release notes. - -* Point releases will be made between individual releases where there are - substantial bug fixes or security issues that need to be fixed. These point - releases will only include bug fixes. - -Currently the following items are **not** guaranteed across updates: - -* Snapshot/restore is not supported across different versions -* Live migration is not supported across different versions -* The following features are considered experimental and may change - substantially between releases: TDX, vfio-user, vDPA. - -Further details can be found in the [release documentation](docs/releases.md). 
- -As of 2023-01-03, the following cloud images are supported: - -- [Ubuntu Focal](https://cloud-images.ubuntu.com/focal/current/) (focal-server-cloudimg-{amd64,arm64}.img) -- [Ubuntu Jammy](https://cloud-images.ubuntu.com/jammy/current/) (jammy-server-cloudimg-{amd64,arm64}.img) -- [Ubuntu Noble](https://cloud-images.ubuntu.com/noble/current/) (noble-server-cloudimg-{amd64,arm64}.img) -- [Fedora 36](https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/36/Cloud/) ([Fedora-Cloud-Base-36-1.5.x86_64.raw.xz](https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/36/Cloud/x86_64/images/) / [Fedora-Cloud-Base-36-1.5.aarch64.raw.xz](https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/36/Cloud/aarch64/images/)) - -Direct kernel boot to userspace should work with a rootfs from most -distributions although you may need to enable exotic filesystem types in the -reference kernel configuration (e.g. XFS or btrfs.) - -## Hot Plug - -Cloud Hypervisor supports hotplug of CPUs, passthrough devices (VFIO), -`virtio-{net,block,pmem,fs,vsock}` and memory resizing. This -[document](docs/hotplug.md) details how to add devices to a running VM. - -## Device Model - -Details of the device model can be found in this -[documentation](docs/device_model.md). - -## Roadmap - -The project roadmap is tracked through a [GitHub -project](https://github.com/orgs/cloud-hypervisor/projects/6). - -# 4. Relationship with _Rust VMM_ Project - -In order to satisfy the design goal of having a high-performance, -security-focused hypervisor the decision was made to use the -[Rust](https://www.rust-lang.org/) programming language. The language's strong -focus on memory and thread safety makes it an ideal candidate for implementing -VMMs. - -Instead of implementing the VMM components from scratch, Cloud Hypervisor is -importing the [Rust VMM](https://github.com/rust-vmm) crates, and sharing code -and architecture together with other VMMs like e.g. 
Amazon's -[Firecracker](https://firecracker-microvm.github.io/) and Google's -[crosvm](https://chromium.googlesource.com/chromiumos/platform/crosvm/). - -Cloud Hypervisor embraces the _Rust VMM_ project's goals, which is to be able -to share and re-use as many virtualization crates as possible. - -## Differences with Firecracker and crosvm - -A large part of the Cloud Hypervisor code is based on either the Firecracker or -the crosvm project's implementations. Both of these are VMMs written in Rust -with a focus on safety and security, like Cloud Hypervisor. - -The goal of the Cloud Hypervisor project differs from the aforementioned -projects in that it aims to be a general purpose VMM for _Cloud Workloads_ and -not limited to container/serverless or client workloads. - -The Cloud Hypervisor community thanks the communities of both the Firecracker -and crosvm projects for their excellent work. - -# 5. Community - -The Cloud Hypervisor project follows the governance, and community guidelines -described in the [Community](https://github.com/cloud-hypervisor/community) -repository. - -## Contribute - -The project strongly believes in building a global, diverse and collaborative -community around the Cloud Hypervisor project. Anyone who is interested in -[contributing](CONTRIBUTING.md) to the project is welcome to participate. - -Contributing to a open source project like Cloud Hypervisor covers a lot more -than just sending code. Testing, documentation, pull request -reviews, bug reports, feature requests, project improvement suggestions, etc, -are all equal and welcome means of contribution. See the -[CONTRIBUTING](CONTRIBUTING.md) document for more details. 
- -## Slack - -Get an [invite to our Slack channel](https://join.slack.com/t/cloud-hypervisor/shared_invite/enQtNjY3MTE3MDkwNDQ4LWQ1MTA1ZDVmODkwMWQ1MTRhYzk4ZGNlN2UwNTI3ZmFlODU0OTcwOWZjMTkwZDExYWE3YjFmNzgzY2FmNDAyMjI), - [join us on Slack](https://cloud-hypervisor.slack.com/), and [participate in our community activities](https://cloud-hypervisor.slack.com/archives/C04R5DUQVBN). - -## Mailing list - -Please report bugs using the [GitHub issue -tracker](https://github.com/cloud-hypervisor/cloud-hypervisor/issues) but for -broader community discussions you may use our [mailing -list](https://lists.cloudhypervisor.org/g/dev/). - -## Security issues - -Please contact the maintainers listed in the MAINTAINERS.md file with security issues. +# Cloud Hypervisor Fork for SAP gardenlinux + +The `gardenlinux` branch is the branch from which our SAP colleagues +[build][sap-gl-ci] their Cloud Hypervisor packages. + +## Development Model + +- The `gardenlinux` branch is always what SAP builds. From SAP's side, we can + force push or rewrite history on that branch. +- We use branch protection for `gardenlinux`, PRs, CI, and code reviews +- With every new CHV release, we rename `gardenlinux` to `gardenlinux-vXX` and + create a new `gardenlinux` branch manually: + - use release as base and push it into the repo + - cherry-pick all commits from `gardenlinux-vXX` that are still relevant onto a + new branch and create a pull request against this fork + - adapt git commit history +- PoC Development: + - happens here (in [cyberus-technology/cloud-hypervisor](https://github.com/cyberus-technology/cloud-hypervisor)) + - open PR against `gardenlinux` + - Branch name patterns **must not** follow `gardenlinux-*` pattern + - We recommend `cyberus-fork-*` as branch pattern to better keep the overview.
+- Productization: + - happens upstream (in [cloud-hypervisor/cloud-hypervisor](https://github.com/cloud-hypervisor/cloud-hypervisor)) + - We recommend `productize-*` as branch pattern to better keep the overview. + + +[sap-gl-ci]: https://github.com/gardenlinux/package-cloud-hypervisor-gl/blob/main/prepare_source#L1 diff --git a/arch/Cargo.toml b/arch/Cargo.toml index 2e30b9e532..a0b8fdb5df 100644 --- a/arch/Cargo.toml +++ b/arch/Cargo.toml @@ -5,29 +5,55 @@ name = "arch" rust-version.workspace = true version = "0.1.0" +# TODO: Consider making this a binary of the main package instead +[[bin]] +name = "generate-cpu-profile" +path = "src/bin/generate-cpu-profile.rs" +required-features = ["cpu_profile_generation"] + [features] default = [] fw_cfg = [] kvm = ["hypervisor/kvm"] sev_snp = [] tdx = [] +# Currently cpu profiles can only be generated with KVM +cpu_profile_generation = ["dep:clap", "kvm"] [dependencies] anyhow = { workspace = true } byteorder = { workspace = true } +clap = { workspace = true, optional = true } hypervisor = { path = "../hypervisor" } libc = { workspace = true } linux-loader = { workspace = true, features = ["bzimage", "elf", "pe"] } log = { workspace = true } serde = { workspace = true, features = ["derive", "rc"] } +# We currently use this for (de-)serializing CPU profile data +serde_json = { workspace = true } thiserror = { workspace = true } uuid = { workspace = true } vm-memory = { workspace = true, features = ["backend-bitmap", "backend-mmap"] } vmm-sys-util = { workspace = true, features = ["with-serde"] } +[target.'cfg(target_arch = "x86_64")'.dependencies] +flate2 = { workspace = true } + [target.'cfg(any(target_arch = "aarch64", target_arch = "riscv64"))'.dependencies] fdt_parser = { version = "0.1.5", package = "fdt" } vm-fdt = { workspace = true } +[build-dependencies] +anyhow = { workspace = true } +flate2 = { workspace = true } +prettyplease = "0.2.37" +quote = "1.0.45" +syn = "2.0.117" + +# Use this to test our custom 
serialization logic +[dev-dependencies] +proptest = "1.0.0" +serde_json = { workspace = true } + [lints] workspace = true diff --git a/arch/build.rs b/arch/build.rs new file mode 100644 index 0000000000..01170f7289 --- /dev/null +++ b/arch/build.rs @@ -0,0 +1,254 @@ +// Copyright © 2026 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::collections::BTreeSet; +use std::ffi::OsStr; +use std::io::{Read, Write}; +use std::path::Path; +use std::{env, fs}; + +use anyhow::Context; +use flate2::Compression; +use flate2::write::ZlibEncoder; +use quote::{format_ident, quote}; + +/// This is where the CPU profile generation tool writes the JSON files associated with +/// a CPU profile. +const X86_64_CPU_PROFILES_PATH: &str = "./src/x86_64/cpu_profiles"; + +fn main() -> anyhow::Result<()> { + let target_arch = std::env::var("CARGO_CFG_TARGET_ARCH") + .context("Could not get env var CARGO_CFG_TARGET_ARCH")?; + + if target_arch == "x86_64" { + generate_code_for_x86_64_cpu_profiles().context("CPU profile code generation failed")?; + // We only want the build script to be rerun if new CPU profiles are generated, or the + // build script itself changes (see the final println! before this function returns). + println!("cargo::rerun-if-changed={X86_64_CPU_PROFILES_PATH}"); + } + + // Disable automatic rerun after package changes. + // See: https://doc.rust-lang.org/cargo/reference/build-scripts.html#rerun-if-changed + println!("cargo::rerun-if-changed=build.rs"); + Ok(()) +} + +/// This function generates the `generated_cpu_profiles.rs` file which consists of the following: +/// +/// - a `CpuProfile` enum with a `Host` variant and one additional variant per pre-generated CPU profile. +/// - A function `compressed_cpuid_data` that takes a `&CpuProfile` and returns the compressed CPUID adjustment data required for the given CPU profile. 
+/// - A function `compressed_msr_data` that takes a `&CpuProfile` and returns the compressed MSR adjustment data required for the given CPU profile. +/// +/// This function works by traversing the JSON files in `X86_64_CPU_PROFILES_PATH` generated by +/// the CPU profile generation tool. +fn generate_code_for_x86_64_cpu_profiles() -> anyhow::Result<()> { + let out_dir = env::var_os("OUT_DIR").unwrap(); + let profile_names = x86_64_cpu_profile_names() + .context("Failed to extract CPU profile names from pre-generated JSON files")?; + // Compress each CPUID and MSR JSON file + compress_json_files(&profile_names, &out_dir) + .context("Failed to create compressed CPU profile data files")?; + + let mut out = generate_cpu_profile_enum(&profile_names); + out.push('\n'); + out.push_str(&generate_compressed_data_fn( + &profile_names, + DataType::Cpuid, + )); + out.push('\n'); + out.push_str(&generate_compressed_data_fn(&profile_names, DataType::Msr)); + + let generated_file_path = Path::new(&out_dir).join("generated_cpu_profiles.rs"); + let mut f = fs::File::create(&generated_file_path) + .with_context(|| format!("Could not create file with path:={generated_file_path:#?}"))?; + f.write_all(out.as_bytes()) + .with_context(|| format!("Could not write to file with path:={generated_file_path:#?}"))?; + Ok(()) +} + +/// The name of a pre-generated CPU profile. +/// +/// Each CPU profile has two associated JSON files: +/// +/// 1. .cpuid.json +/// 2. .msr.json +/// +/// and each instance of `ProfileName` is extracted from +/// ``. +struct ProfileName { + /// The `kebab_case` name converted to camel case. + camel_case: String, + kebab_case: String, +} + +/// Each CPU profile has two associated JSON files: +/// +/// one for CPUID adjustment data and one for MSR adjustment data. 
+#[derive(Copy, Clone)] +enum DataType { + Cpuid, + Msr, +} + +impl DataType { + fn as_str(&self) -> &str { + match self { + Self::Cpuid => "cpuid", + Self::Msr => "msr", + } + } +} + +/// Traverse the `X86_64_CPU_PROFILES_PATH` and extract a `[ProfileName]` per encountered +/// pre-generated CPU profile. +fn x86_64_cpu_profile_names() -> anyhow::Result> { + let dir = fs::read_dir(X86_64_CPU_PROFILES_PATH) + .with_context(|| format!("Could not read directory:={X86_64_CPU_PROFILES_PATH}"))?; + + let mut profile_names_kebab_case = BTreeSet::new(); + for entry in dir { + let file = entry.with_context(|| { + format!("Encountered error while traversing directory:={X86_64_CPU_PROFILES_PATH}") + })?; + let file_name = file.file_name().into_string().unwrap(); + let profile_name_kebab_case = { + let dot_pos = file_name + .find('.') + .expect("all files in the cpu_profiles directory should contain a '.' character"); + file_name[..dot_pos].to_string() + }; + profile_names_kebab_case.insert(profile_name_kebab_case); + } + + let profile_name_iter = profile_names_kebab_case.into_iter().map(|kebab_case| { + let mut camel_case = String::new(); + for part in kebab_case.split('-') { + if let Some(first_char) = part.chars().next() { + camel_case.extend(first_char.to_uppercase()); + let rest = &part[first_char.len_utf8()..]; + camel_case.push_str(rest); + } + } + ProfileName { + camel_case, + kebab_case, + } + }); + Ok(profile_name_iter.collect()) +} + +/// Compresses the CPUID and MSR related JSON files per CPU profile +/// that are found in `X86_64_CPU_PROFILES_PATH`. 
+fn compress_json_files(names: &[ProfileName], out_dir: &OsStr) -> anyhow::Result<()> { + for ProfileName { + kebab_case, + camel_case: _, + } in names + { + let file_bytes = |data_type: &str| -> anyhow::Result> { + let path = + Path::new(X86_64_CPU_PROFILES_PATH).join(format!("{kebab_case}.{data_type}.json")); + let mut file = fs::File::open(&path) + .with_context(|| format!("Could not open file with path:={path:#?}"))?; + let mut v = Vec::new(); + file.read_to_end(&mut v) + .with_context(|| format!("Could not read contents of file with path:={path:#?}"))?; + Ok(v) + }; + let cpuid_bytes = file_bytes("cpuid")?; + let msr_bytes = file_bytes("msr")?; + let compress_to_file = |data_type: &str, data: &[u8]| -> anyhow::Result<()> { + let path = Path::new(&out_dir).join(format!("{kebab_case}.{data_type}.zz")); + let file = fs::File::create(&path) + .with_context(|| format!("Could not create file with path:={path:#?}"))?; + let mut encoder = ZlibEncoder::new(file, Compression::best()); + encoder.write_all(data).with_context(|| { + format!("Could not write compressed bytes to file with path:={path:#?}") + })?; + encoder + .flush() + .with_context(|| format!("Could not flush to file with path:={path:#?}"))?; + Ok(()) + }; + compress_to_file(DataType::Cpuid.as_str(), &cpuid_bytes)?; + compress_to_file(DataType::Msr.as_str(), &msr_bytes)?; + } + + Ok(()) +} + +/// Generates Rust code as a String defining a `CpuProfile` enum with a `Host` variant +/// together with a variant per entry in `profile_names`. +fn generate_cpu_profile_enum(profile_names: &[ProfileName]) -> String { + // Obtain a vector of the non-host CPU profile enum variants from the previously parsed camel case names + let non_host_enum_variants = non_host_cpu_profile_variants(profile_names); + + // Use the quote crate to build the CpuProfile enum as a TokenStream. + let tokens = quote! 
{ + #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize, Default)] + #[serde(rename_all = "kebab-case")] + pub enum CpuProfile { + #[default] + Host, + #(#non_host_enum_variants),* + } + }; + + // Parse this to a syntax tree and return it and convert it to a pretty printed string of Rust code + let syntax_tree = syn::parse2(tokens).unwrap(); + prettyplease::unparse(&syntax_tree) +} + +/// Generates the function that extracts the compressed bytes for `data_type` corresponding to the user's +/// selected CPU profile. +fn generate_compressed_data_fn(profile_names: &[ProfileName], data_type: DataType) -> String { + let data_type_str = data_type.as_str(); + let doc_str = format!( + "Extract compressed {data_type_str} CPU profile data corresponding to the given profile" + ); + let non_host_enum_variants = non_host_cpu_profile_variants(profile_names); + let compressed_file_names: Vec = profile_names + .iter() + .map( + |ProfileName { + kebab_case, + camel_case: _, + }| format!("/{kebab_case}.{data_type_str}.zz"), + ) + .collect(); + + // Workaround to interpolate `data_type_str` in the function name within a `quote!` invocation. + let fn_name_ident = format_ident!("compressed_{data_type_str}_data"); + + // We now use quote! to produce our function that matches against each enum variant and returns the compressed file as a byte slice. + // + // Note that the compressed bytes are no longer stand alone files after compiling since we will use `include_bytes!` to compile them + // into the final binary. + let tokens = quote! 
{ + #[doc=#doc_str] + fn #fn_name_ident (profile: &CpuProfile) -> Option<&'static [u8]> { + use CpuProfile::*; + match profile { + Host => None, + #(#non_host_enum_variants => Some(&include_bytes!(concat!(env!("OUT_DIR"), #compressed_file_names))[..])),* + } + } + }; + + // Parse this to a syntax tree and return it and convert it to a pretty printed string of Rust code + let syntax_tree = syn::parse2(tokens).unwrap(); + prettyplease::unparse(&syntax_tree) +} + +/// Converts the parsed CPU profile names to a enum variants that may be placed into a token stream. +fn non_host_cpu_profile_variants(names: &[ProfileName]) -> Vec { + names + .iter() + .map(|name| { + syn::parse_str(name.camel_case.as_str()) + .expect("Should be able to parse camelcase name to syn::Variant") + }) + .collect() +} diff --git a/arch/src/bin/generate-cpu-profile.rs b/arch/src/bin/generate-cpu-profile.rs new file mode 100644 index 0000000000..4710fd277e --- /dev/null +++ b/arch/src/bin/generate-cpu-profile.rs @@ -0,0 +1,30 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// +#![cfg(all( + target_arch = "x86_64", + feature = "cpu_profile_generation", + feature = "kvm" +))] + +use anyhow::Context; +use clap::{Arg, Command}; + +fn main() -> anyhow::Result<()> { + let cmd_arg = Command::new("generate-cpu-profile") + .version(env!("CARGO_PKG_VERSION")) + .arg_required_else_help(true) + .arg( + Arg::new("name") + .help("The name to give the CPU profile") + .num_args(1) + .required(true), + ) + .get_matches(); + + let profile_name = cmd_arg.get_one::("name").unwrap(); + + let hypervisor = hypervisor::new().context("Could not obtain hypervisor")?; + arch::x86_64::cpu_profile_generation::generate_profile_data(hypervisor.as_ref(), profile_name) +} diff --git a/arch/src/lib.rs b/arch/src/lib.rs index c1c1973667..28c095fff6 100644 --- a/arch/src/lib.rs +++ b/arch/src/lib.rs @@ -9,12 +9,18 @@ //! Supported platforms: x86_64, aarch64, riscv64. 
use std::collections::BTreeMap; +use std::io::Write; +use std::str::FromStr; use std::sync::Arc; use std::{fmt, result}; -use serde::{Deserialize, Serialize}; +use serde::de::IntoDeserializer; +use serde::{Deserialize, Deserializer, Serialize, Serializer}; use thiserror::Error; +#[cfg(target_arch = "x86_64")] +pub use crate::x86_64::cpu_profile::CpuProfile; + type GuestMemoryMmap = vm_memory::GuestMemoryMmap; type GuestRegionMmap = vm_memory::GuestRegionMmap; @@ -53,6 +59,68 @@ pub enum Error { /// Type for returning public functions outcome. pub type Result = result::Result; +// If the target_arch is x86_64 we import CpuProfile from the x86_64 module, otherwise we +// declare it here. +#[cfg(not(target_arch = "x86_64"))] +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(rename_all = "kebab-case")] +/// A [`CpuProfile`] is a mechanism for ensuring live migration compatibility +/// between host's with potentially different CPU models. +pub enum CpuProfile { + #[default] + Host, +} + +impl FromStr for CpuProfile { + type Err = serde::de::value::Error; + fn from_str(s: &str) -> result::Result { + // Should accept both plain strings, and strings surrounded by `"`. + let normalized = s + .strip_prefix('"') + .unwrap_or(s) + .strip_suffix('"') + .unwrap_or(s); + Self::deserialize(normalized.into_deserializer()) + } +} + +// We introduce some utilities for serializing u32 values as hex. +// These are only necessary for (de-)serializing CPU profile data. + +/// Serializes the given `input` as a hex string (starting with "0x") +fn serialize_u32_hex( + input: &u32, + serializer: S, +) -> std::result::Result { + eval_u32_hex(*input, |hex| serializer.serialize_str(hex)) +} + +/// Converts `input` into a hex string representation (starting with "0x", but the length may vary) and +/// applies the given `callback` to it. 
/// Formats `input` as lowercase hex with a "0x" prefix and no zero padding (so the length
/// varies with the value), then hands the resulting string to `callback` and returns whatever
/// the callback returns.
///
/// The string lives in a stack buffer, so no heap allocation takes place.
fn eval_u32_hex<F, T>(input: u32, callback: F) -> T
where
    F: FnOnce(&str) -> T,
{
    // "0x" takes two bytes, the largest u32 needs eight hex digits.
    const CAPACITY: usize = 10;
    let mut storage = [0_u8; CAPACITY];

    // Writing through a `&mut [u8]` advances the slice, so what remains of it afterwards is
    // exactly the unused tail of the buffer.
    let unused = {
        let mut cursor: &mut [u8] = &mut storage;
        write!(cursor, "{input:#x}").expect("This write should be infallible");
        cursor.len()
    };

    let hex = core::str::from_utf8(&storage[..CAPACITY - unused])
        .expect("the buffer should be filled with valid UTF-8 bytes");
    callback(hex)
}
+// +// Furthermore the build script also generates the functions +// `compressed_cpuid_data`, `compressed_msr_data` for obtaining the +// compressed JSON data associated with the given cpu profile. +include!(concat!(env!("OUT_DIR"), "/generated_cpu_profiles.rs")); + +impl CpuProfile { + /// Loads pre-generated CPUID data associated with a CPU profile. + /// + /// If the `amx` flag is false then the AMX tile state components will be + /// zeroed out from the associated profile data. This is necessary because + /// they will then not be present in the vector of [`CpuidEntry`] values + /// obtained from the hypervisor. + // + // We can only generate CPU profiles for the KVM hypervisor for the time being. + pub(in crate::x86_64) fn cpuid_data(&self, amx: bool) -> Option { + const ESTIMATED_CPUID_CPU_PROFILE_DATA_COMPRESSION_RATIO: usize = 32; + + // The compressed_cpuid_data function is generated by build.rs + let compressed: &[u8] = compressed_cpuid_data(self)?; + let mut data: CpuIdProfileData = { + serde_json::from_slice(&Self::decompress_cpu_profile_data( + compressed, + ESTIMATED_CPUID_CPU_PROFILE_DATA_COMPRESSION_RATIO, + )) + .expect("Should be able to deserialize CPU profile CPUID data") + }; + if !amx { + // In this case we will need to wipe out the AMX tile state components (if they are included in the profile) + for adj in data.adjustments.iter_mut() { + if adj.0.sub_leaf.start() != adj.0.sub_leaf.end() { + continue; + } + let sub_leaf = *adj.0.sub_leaf.start(); + let leaf = adj.0.leaf; + if (leaf == 0xd) && (sub_leaf == 0) && (adj.0.register == CpuidReg::EAX) { + adj.1.replacements &= !((1 << 17) | (1 << 18)); + } + + if (leaf == 0xd) && (sub_leaf == 1) && (adj.0.register == CpuidReg::ECX) { + adj.1.replacements &= !((1 << 17) | (1 << 18)); + } + + if (leaf == 0xd) && ((sub_leaf == 17) | (sub_leaf == 18)) { + adj.1.replacements = 0; + } + } + } + + Some(data) + } + + /// Loads pre-generated MSR data associated with a CPU profile. 
+ pub(in crate::x86_64) fn msr_data(&self) -> Option { + const ESTIMATED_MSR_CPU_PROFILE_DATA_COMPRESSION_RATIO: usize = 4; + + // compressed_msr_data is created by build.rs + let compressed: &[u8] = compressed_msr_data(self)?; + serde_json::from_slice(&Self::decompress_cpu_profile_data( + compressed, + ESTIMATED_MSR_CPU_PROFILE_DATA_COMPRESSION_RATIO, + )) + .expect("Should be able to deserialize CPU profile MSR data") + } + + /// Decompress the `compressed` byte slice. + /// + /// The `estimated_compression_ratio` is just used for optimizing the number of necessary allocations + /// and does not have to be accurate. + fn decompress_cpu_profile_data( + compressed: &[u8], + estimated_compression_ratip: usize, + ) -> Vec { + let mut decoder = ZlibDecoder::new(compressed); + // Don't expect more than a 32x compression ratio + let mut v = Vec::with_capacity(compressed.len() * estimated_compression_ratip); + decoder + .read_to_end(&mut v) + .expect("Should be able to decompress CPU profile data"); + v + } +} + +/// Every [`CpuProfile`] different from `Host` has associated [`CpuIdProfileData`]. +/// +/// New constructors of this struct may only be generated through the CHV CLI (when built from source with +/// the `cpu-profile-generation` feature) which other hosts may then attempt to load in order to +/// increase the likelihood of successful live migrations among all hosts that opted in to the given +/// CPU profile. +#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] +#[allow(dead_code)] +pub struct CpuIdProfileData { + /// The hypervisor used when generating this CPU profile. + pub(in crate::x86_64) hypervisor: HypervisorType, + /// The vendor of the CPU belonging to the host that generated this CPU profile. + pub(in crate::x86_64) cpu_vendor: CpuVendor, + /// Adjustments necessary to become compatible with the desired target. 
+ pub(in crate::x86_64) adjustments: Vec<(Parameters, CpuidOutputRegisterAdjustments)>, +} + +/// Used for adjusting an entire cpuid output register (EAX, EBX, ECX or EDX) +#[derive(Debug, Clone, Copy, PartialEq, Eq, Deserialize)] +pub(super) struct CpuidOutputRegisterAdjustments { + #[serde(deserialize_with = "deserialize_u32_hex")] + pub(in crate::x86_64) replacements: u32, + /// Used to zero out the area `replacements` occupy. This mask is not necessarily !replacements, as replacements may pack values of different types (i.e. it is wrong to think of it as a bitset conceptually speaking). + #[serde(deserialize_with = "deserialize_u32_hex")] + pub(in crate::x86_64) mask: u32, +} + +/* +We want to serialize the values as 10 bytes, starting with 0x, +regardless of the value. This makes it easier for humans to compare different serialized values. +*/ +impl Serialize for CpuidOutputRegisterAdjustments { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let mut s = serializer.serialize_struct("CpuidOutputRegisterAdjustments", 2)?; + let mut serialize_field = |key, value| { + // two bytes for "0x" prefix and eight for the hex encoded number + let mut buffer = [0_u8; 10]; + write!(&mut buffer[..], "{value:#010x}").expect("This write should be infallible"); + let str = core::str::from_utf8(&buffer[..]) + .expect("the buffer should be filled with valid UTF-8 bytes"); + s.serialize_field(key, str) + }; + serialize_field("replacements", self.replacements)?; + serialize_field("mask", self.mask)?; + s.end() + } +} + +impl CpuidOutputRegisterAdjustments { + pub(in crate::x86_64) fn adjust(self, cpuid_output_register: &mut u32) { + let temp_register_copy = *cpuid_output_register; + let replacements_area_masked_in_temp_copy = temp_register_copy & self.mask; + *cpuid_output_register = replacements_area_masked_in_temp_copy | self.replacements; + } + + pub(in crate::x86_64) fn adjust_cpuid_entries( + mut cpuid: Vec, + adjustments: &[(Parameters, 
Self)], + ) -> Result, MissingCpuidEntriesError> { + for entry in &mut cpuid { + for (reg, reg_value) in [ + (CpuidReg::EAX, &mut entry.eax), + (CpuidReg::EBX, &mut entry.ebx), + (CpuidReg::ECX, &mut entry.ecx), + (CpuidReg::EDX, &mut entry.edx), + ] { + // Get the adjustment corresponding to the entry's function/leaf and index/sub-leaf for each of the register. If no such + // adjustment is found we use the trivial adjustment (leading to the register being zeroed out entirely). + let adjustment = adjustments + .iter() + .find_map(|(param, adjustment)| { + ((param.leaf == entry.function) + & param.sub_leaf.contains(&entry.index) + & (param.register == reg)) + .then_some(*adjustment) + }) + .unwrap_or(CpuidOutputRegisterAdjustments { + mask: 0, + replacements: 0, + }); + adjustment.adjust(reg_value); + } + } + + Self::expected_entries_found(&cpuid, adjustments).map(|_| cpuid) + } + + /// Check that we found every value that was supposed to be replaced with something else than 0 + /// + /// IMPORTANT: This function assumes that the given `cpuid` has already been adjusted with the + /// provided `adjustments`. + fn expected_entries_found( + cpuid: &[CpuIdEntry], + adjustments: &[(Parameters, Self)], + ) -> Result<(), MissingCpuidEntriesError> { + let mut missing_entry = false; + + // Invalid state components can be ignored. The next few lines obtain the relevant entries to + // check for this. 
+ let eax_0xd_0 = cpuid + .iter() + .find(|entry| (entry.function == 0xd) && (entry.index == 0)) + .map_or(0, |entry| entry.eax); + let ecx_0xd_1 = cpuid + .iter() + .find(|entry| (entry.function == 0xd) && (entry.index == 1)) + .map_or(0, |entry| entry.ecx); + + let edx_0xd_0 = cpuid + .iter() + .find(|entry| (entry.function == 0xd) && (entry.index == 0)) + .map_or(0, |entry| entry.edx); + let edx_0xd_1 = cpuid + .iter() + .find(|entry| (entry.function == 0xd) && (entry.index == 1)) + .map_or(0, |entry| entry.edx); + + for (param, adjustment) in adjustments { + if adjustment.replacements == 0 { + continue; + } + let sub_start = *param.sub_leaf.start(); + let sub_end = *param.sub_leaf.end(); + + let can_skip_lo = if (param.leaf == 0xd) && (2..32).contains(&sub_start) { + let start = sub_start; + let end = std::cmp::min(sub_end, 31); + let mask = (start..=end).fold(0, |acc, next| acc | (1 << next)); + ((mask & eax_0xd_0) == 0) & ((mask & ecx_0xd_1) == 0) + } else { + false + }; + + let can_skip_hi = if (param.leaf == 0xd) && (32..64).contains(&sub_end) { + let start = std::cmp::max(32, sub_start); + let end = sub_end; + let mask = (start..=end) + .map(|val| val - 32) + .fold(0, |acc, next| acc | (1 << next)); + ((mask & edx_0xd_0) == 0) & ((mask & edx_0xd_1) == 0) + } else { + false + }; + + if can_skip_lo && can_skip_hi { + // This means that all state components referred to by the specified sub-leaf range are not valid + // and may be skipped. + continue; + } + if !cpuid.iter().any(|entry| { + (entry.function == param.leaf) && (param.sub_leaf.contains(&entry.index)) + }) { + error!( + "cannot adjust CPU profile. 
No entry found matching the required parameters: {param:?}" + ); + missing_entry = true; + } + } + if missing_entry { + Err(MissingCpuidEntriesError) + } else { + Ok(()) + } + } +} + +#[derive(Debug, Clone, Eq, PartialEq)] +pub(in crate::x86_64) struct FeatureMsrAdjustment { + pub(in crate::x86_64) mask: u64, + pub(in crate::x86_64) replacements: u64, +} + +impl Serialize for FeatureMsrAdjustment { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let mut s = serializer.serialize_struct("FeatureMsrAdjustment", 2)?; + let mut serialize_field = |key, value| { + // two bytes for "0x" prefix and 16 for the hex encoded number + let mut buffer = [0_u8; 18]; + let _ = write!(&mut buffer[..], "{value:#018x}"); + let str = core::str::from_utf8(&buffer[..]) + .expect("the buffer should be filled with valid UTF-8 bytes"); + s.serialize_field(key, str) + }; + serialize_field("mask", self.mask)?; + serialize_field("replacements", self.replacements)?; + s.end() + } +} + +impl<'de> Deserialize<'de> for FeatureMsrAdjustment { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + #[derive(Deserialize)] + struct ProvisionalFeatureMsrAdjustment<'a> { + #[serde(borrow)] + mask: &'a str, + #[serde(borrow)] + replacements: &'a str, + } + + let ProvisionalFeatureMsrAdjustment { mask, replacements } = + ProvisionalFeatureMsrAdjustment::deserialize(deserializer)?; + let parse_u64 = |hex: &str, field_name: &str| { + u64::from_str_radix(hex.strip_prefix("0x").unwrap_or(""), 16).map_err(|_| { + ::custom(format!("Unable to deserialize FeatureMsrAdjustment: could not deserialize {field_name} the value {hex} is not a hex encoded 64 bit integer")) + }) + }; + let mask = parse_u64(mask, "mask")?; + let replacements = parse_u64(replacements, "replacements")?; + Ok(FeatureMsrAdjustment { mask, replacements }) + } +} + +impl FeatureMsrAdjustment { + /// Returns a struct describing the Feature MSRs that should be set + /// and the 
ones that should be denied based on `adjustments` and the given + /// `feature_msrs`. + /// + /// # Errors + /// + /// The only way for this to error is if there exists one or more entries in + /// `adjustments` that do not have a corresponding entry in `feature_msrs`. + /// In this case the missing MSR will be logged and the unit type is returned + /// as the error variant. + pub(in crate::x86_64) fn adjust_to( + adjustments: &[(RegisterAddress, FeatureMsrAdjustment)], + feature_msrs: &[MsrEntry], + ) -> Result, ()> { + let mut output_feature_msrs = Vec::with_capacity(feature_msrs.len()); + for (reg_address, adjustment) in adjustments { + let Some(entry) = feature_msrs + .iter() + .find(|entry| entry.index == reg_address.0) + else { + error!( + "Did not find feature based MSR entry for MSR:={:#x}", + reg_address.0 + ); + return Err(()); + }; + // Adjust the entry and push it to outputs + { + let mut entry = *entry; + let data = entry.data; + entry.data = (adjustment.mask & data) | adjustment.replacements; + // TODO: Perhaps trace! would be a better log level? + log::debug!( + "adjusted MSR-based feature: register address:={:#x} value:={:#x}, previous value:={data:#x}", + entry.index, + entry.data + ); + output_feature_msrs.push(entry); + } + } + Ok(output_feature_msrs) + } +} + +pub struct RequiredMsrUpdates { + pub msr_based_features: Vec, + pub denied_msrs: Vec, +} + +/// Every [`CpuProfile`] different from `Host` has associated [`MsrProfileData`]. +/// +/// New constructors of this struct may only be generated through the CHV CLI (when built from source with +/// the `cpu-profile-generation` feature) which other hosts may then attempt to load in order to +/// increase the likelihood of successful live migrations among all hosts that opted in to the given +/// CPU profile. 
+#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] +pub(in crate::x86_64) struct MsrProfileData { + pub(in crate::x86_64) cpu_vendor: CpuVendor, + pub(in crate::x86_64) hypervisor_type: HypervisorType, + pub(in crate::x86_64) adjustments: Vec<(RegisterAddress, FeatureMsrAdjustment)>, + pub(in crate::x86_64) permitted_msrs: Vec, +} + +#[derive(Debug, Error)] +#[error("Required CPUID entries not found")] +pub struct MissingCpuidEntriesError; + +#[derive(Debug, Error)] +#[error("Required MSR entries not found")] +pub struct MissingMsrEntriesError; + +#[cfg(test)] +mod tests { + use proptest::prelude::*; + + use super::CpuidOutputRegisterAdjustments; + use crate::CpuProfile; + #[cfg(feature = "kvm")] + use crate::x86_64::cpu_profile::{CpuIdProfileData, MsrProfileData}; + + // Check that serializing and then deserializing `CpuidOutputRegisterAdjustments` results in the same value we started with. + // + // Also check that the serialized numeric values satisfy our expectations: They are 10-byte hex encoded strings + proptest! { + #[test] + fn cpuid_output_register_adjustments_serialization_works(replacements in any::(), mask in any::()) { + // Randomly generate these values. Several of the generated values will not represent anything that may be + // produced in practice, but (de-)serialization does not take such domain knowledge into account (if that changes + // then this test will need to be updated). 
+ let adjustments = CpuidOutputRegisterAdjustments { + replacements, + mask + }; + let serialized = serde_json::to_string(&adjustments).unwrap(); + let deserialized: CpuidOutputRegisterAdjustments = serde_json::from_str(&serialized).unwrap(); + prop_assert_eq!(&deserialized, &adjustments); + let json = serde_json::to_value(adjustments).unwrap(); + let replacements_str = json.get("replacements").unwrap().as_str().unwrap(); + let mask_str = json.get("mask").unwrap().as_str().unwrap(); + let check_str_invariants = |value: &str| { + prop_assert!(value.starts_with("0x")); + prop_assert_eq!(value.len(),10); + prop_assert!(value.as_bytes().iter().all(|byte| byte.is_ascii())); + let is_hex_digit = |byte: &u8| -> bool { + byte.is_ascii_digit() | (*byte == b'a') | (*byte == b'b') | (*byte == b'c') | (*byte == b'd') | (*byte == b'e') | (*byte == b'f') + }; + prop_assert!( + value.as_bytes()[2..].iter().all(is_hex_digit) + ); + Ok(()) + }; + check_str_invariants(replacements_str)?; + check_str_invariants(mask_str)?; + } + } + + #[test] + fn cpu_profile_host_loads_no_data() { + assert_eq!(CpuProfile::Host.cpuid_data(true), None); + assert_eq!(CpuProfile::Host.cpuid_data(false), None); + assert_eq!(CpuProfile::Host.msr_data(), None); + } + + /// Check that the `CpuProfile::cpuid_data` and `CpuProfile::msr_data` methods + /// coincide with direct deserialization for the `sapphire-rapids` profile. + #[cfg(feature = "kvm")] + #[test] + fn cpu_profile_loading_sapphire_rapids() { + // Now check that the methods coincide with direct deserialization. For the + // Sapphire Rapids profile this should be the case when `amx` is enabled. 
+ let profile = CpuProfile::SapphireRapids; + let cpuid_data = profile.cpuid_data(true).unwrap(); + let deserialized_cpuid_data: CpuIdProfileData = + serde_json::from_slice(include_bytes!("./cpu_profiles/sapphire-rapids.cpuid.json")) + .unwrap(); + + assert_eq!(cpuid_data, deserialized_cpuid_data); + + let msr_data = profile.msr_data().unwrap(); + let deserialized_msr_data: MsrProfileData = + serde_json::from_slice(include_bytes!("./cpu_profiles/sapphire-rapids.msr.json")) + .unwrap(); + assert_eq!(msr_data, deserialized_msr_data); + } + + /// Check that the `CpuProfile::cpuid_data` and `CpuProfile::msr_data` methods + /// coincide with direct deserialization for the `skylake` profile. + #[cfg(feature = "kvm")] + #[test] + fn cpu_profile_loading_skylake() { + // Now check that the methods coincide with direct deserialization. For the + // Skylake profile this is checked with `cpuid_data(true)`. + let profile = CpuProfile::Skylake; + let cpuid_data = profile.cpuid_data(true).unwrap(); + let deserialized_cpuid_data: CpuIdProfileData = + serde_json::from_slice(include_bytes!("./cpu_profiles/skylake.cpuid.json")).unwrap(); + + assert_eq!(cpuid_data, deserialized_cpuid_data); + + let msr_data = profile.msr_data().unwrap(); + let deserialized_msr_data: MsrProfileData = + serde_json::from_slice(include_bytes!("./cpu_profiles/skylake.msr.json")).unwrap(); + assert_eq!(msr_data, deserialized_msr_data); + } +} diff --git a/arch/src/x86_64/cpu_profile_generation.rs b/arch/src/x86_64/cpu_profile_generation.rs new file mode 100644 index 0000000000..a66a336bfa --- /dev/null +++ b/arch/src/x86_64/cpu_profile_generation.rs @@ -0,0 +1,591 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::collections::HashSet; +use std::fs::File; +use std::io::Write; +use std::ops::{BitOr, RangeInclusive, Shl}; +use std::path::PathBuf; + +use anyhow::{Context, anyhow}; +use hypervisor::arch::x86::{CpuIdEntry, MsrEntry}; +use 
hypervisor::{CpuVendor, Hypervisor, HypervisorError, HypervisorType}; +use log::warn; + +use crate::x86_64::cpu_profile::{CpuIdProfileData, FeatureMsrAdjustment, MsrProfileData}; +#[cfg(feature = "kvm")] +use crate::x86_64::cpuid_definitions::CpuidDefinitions; +use crate::x86_64::cpuid_definitions::intel::INTEL_CPUID_DEFINITIONS; +use crate::x86_64::cpuid_definitions::kvm::KVM_CPUID_DEFINITIONS; +use crate::x86_64::cpuid_definitions::{Parameters, ProfilePolicy}; +use crate::x86_64::msr_definitions::{self, MsrDefinitions, RegisterAddress}; +use crate::x86_64::{CpuidOutputRegisterAdjustments, CpuidReg}; + +/// Generate CPU profile data (CPUID and MSR) and serialize it as JSON into files whose +/// names are derived from `profile_name`. +// +// NOTE: The MVP only works with KVM as the hypervisor and Intel CPUs. +#[cfg(feature = "kvm")] +pub fn generate_profile_data( + hypervisor: &dyn Hypervisor, + profile_name: &str, +) -> anyhow::Result<()> { + let cpu_vendor = hypervisor.get_cpu_vendor(); + if cpu_vendor != CpuVendor::Intel { + unimplemented!("CPU profiles can only be generated for Intel CPUs at this point in time"); + } + + let hypervisor_type = hypervisor.hypervisor_type(); + // This is just a reality check. 
+ if hypervisor_type != HypervisorType::Kvm { + unimplemented!( + "CPU profiles can only be generated when using KVM as the hypervisor at this point in time" + ); + } + + let brand_string_bytes = cpu_brand_string_bytes(cpu_vendor, profile_name)?; + let cpuid = supported_cpuid(hypervisor)?; + let cpuid = overwrite_brand_string(cpuid, brand_string_bytes); + let supported_cpuid_sorted = sort_entries(cpuid); + + let Files { + cpuid_data_file, + cpuid_data_license_file, + msr_data_file, + msr_data_license_file, + } = create_files(profile_name)?; + + generate_cpuid_profile_data_with( + hypervisor_type, + cpu_vendor, + &supported_cpuid_sorted, + &INTEL_CPUID_DEFINITIONS, + &KVM_CPUID_DEFINITIONS, + cpuid_data_file, + cpuid_data_license_file, + )?; + + let supported_feature_msrs = hypervisor.get_msr_based_features().context("CPU profile generation failed: Could not get the supported MSR-based features from the hypervisor")?; + let supported_msrs = hypervisor + .get_msr_index_list() + .context("CPU profile generation failed: Could not get MSR index list")? 
+ .into_iter() + .collect(); + + generate_msr_profile_data_with( + MsrProfileDataParams { + hypervisor_type, + cpu_vendor, + processor_feature_msr_definitions: + &msr_definitions::intel::INTEL_MSR_FEATURE_DEFINITIONS, + supported_feature_msrs: &supported_feature_msrs, + supported_msrs, + permitted_architectural_msrs: &msr_definitions::intel::PERMITTED_IA32_MSRS[..], + permitted_hypervisor_msrs: &msr_definitions::kvm::PROFILE_PERMITTED_KVM_MSRS[..], + permitted_hyperv_msrs: &msr_definitions::hyperv::HYPERV_MSRS[..], + non_architectural_msrs: &msr_definitions::intel::NON_ARCHITECTURAL_INTEL_MSRS[..], + forbidden_architectural_msrs: &msr_definitions::intel::FORBIDDEN_IA32_MSR_RANGES[..], + }, + msr_data_file, + msr_data_license_file, + ) +} + +struct Files { + cpuid_data_file: File, + cpuid_data_license_file: File, + msr_data_file: File, + msr_data_license_file: File, +} +/// Create empty files with names derived from the name given to the CPU profile. +/// The name will be lowercase and spaces are replaced with "-". 
+fn create_files(profile_name: &str) -> anyhow::Result { + let profile_file_name = { + let mut name = String::new(); + for part in profile_name.split_whitespace().map(|s| s.to_lowercase()) { + if !name.is_empty() { + name.push('-'); + } + name.push_str(&part); + } + name + }; + + let create_file = |path: PathBuf| { + File::create(path.clone()).with_context(|| { + format!( + "CPU profile generation failed: Could not create file:={}", + path.to_string_lossy() + ) + }) + }; + + let path_with_license = |mut path: PathBuf| { + path.as_mut_os_string().push(".license"); + path + }; + + let current_dir = std::env::current_dir() + .context("CPU profile generation failed: Unable to get the current working directory")?; + + let common_path = format!("arch/src/x86_64/cpu_profiles/{profile_file_name}"); + + let cpuid_profile_file_name = { + let mut path = current_dir.clone(); + path.push(format!("{common_path}.cpuid.json")); + path + }; + + let cpuid_data_file = create_file(cpuid_profile_file_name.clone())?; + + let cpuid_data_license_file_path = path_with_license(cpuid_profile_file_name); + + let cpuid_data_license_file = create_file(cpuid_data_license_file_path)?; + + let msr_profile_file_name = { + let mut path = current_dir; + path.push(format!("{common_path}.msr.json")); + path + }; + + let msr_data_file = create_file(msr_profile_file_name.clone())?; + + let msr_data_license_file_path = path_with_license(msr_profile_file_name); + let msr_data_license_file = create_file(msr_data_license_file_path)?; + + Ok(Files { + cpuid_data_file, + cpuid_data_license_file, + msr_data_file, + msr_data_license_file, + }) +} + +/// Prepare the bytes which the brand string should consist of +fn cpu_brand_string_bytes(cpu_vendor: CpuVendor, profile_name: &str) -> anyhow::Result<[u8; 48]> { + let cpu_vendor_str: String = serde_json::to_string(&cpu_vendor) + .expect("Should be possible to serialize CPU vendor to a string"); + let cpu_vendor_str = 
cpu_vendor_str.trim_start_matches('"').trim_end_matches('"'); + let mut brand_string_bytes = [0_u8; 4 * 3 * 4]; + if cpu_vendor_str.len() + 1 + profile_name.len() > brand_string_bytes.len() { + return Err(anyhow!( + "The profile name is too long. Try using a shorter name" + )); + } + for (b, brand_byte) in cpu_vendor_str + .as_bytes() + .iter() + .chain(std::iter::once(&b' ')) + .chain(profile_name.as_bytes()) + .zip(brand_string_bytes.iter_mut()) + { + *brand_byte = *b; + } + Ok(brand_string_bytes) +} +/// Computes [`CpuIdProfileData`] based on the given sorted vector of CPUID entries, hypervisor type, cpu_vendor +/// and cpuid_definitions. +/// +/// The computed [`CpuIdProfileData`] is then serialized as pretty-printed JSON into `cpuid_data_file`, after which +/// the license text is written to `cpuid_license_file`. +/// +// TODO: Consider making a snapshot test or two for this function. +fn generate_cpuid_profile_data_with( + hypervisor_type: HypervisorType, + cpu_vendor: CpuVendor, + supported_cpuid_sorted: &[CpuIdEntry], + processor_cpuid_definitions: &CpuidDefinitions, + hypervisor_cpuid_definitions: &CpuidDefinitions, + mut cpuid_data_file: impl Write, + cpuid_license_file: impl Write, +) -> anyhow::Result<()> { + let mut adjustments: Vec<(Parameters, CpuidOutputRegisterAdjustments)> = Vec::new(); + + for (parameter, values) in processor_cpuid_definitions + .as_slice() + .iter() + .chain(hypervisor_cpuid_definitions.as_slice().iter()) + { + for (sub_leaf_range, maybe_matching_register_output_value) in + extract_parameter_matches(parameter, supported_cpuid_sorted) + { + // If the compatibility target (current host) has multiple sub-leaves matching the parameter's range + // then we want to specialize: + let mut mask: u32 = 0; + let mut replacements: u32 = 0; + for value in values.as_slice() { + // Reality check on the bit range listed in `value` + { + assert!(value.bits_range.0 <= value.bits_range.1); + assert!(value.bits_range.1 < 32); + } + + match value.policy { + 
ProfilePolicy::Passthrough => { + // The profile should take whatever we get from the host, hence there is no adjustment, but our + // mask needs to retain all bits in the range of bits corresponding to this value + let (first_bit_pos, last_bit_pos) = value.bits_range; + mask |= bit_range_mask::(first_bit_pos, last_bit_pos); + } + ProfilePolicy::Static(overwrite_value) => { + replacements |= overwrite_value << value.bits_range.0; + } + ProfilePolicy::Inherit => { + // The value is supposed to be obtained from the compatibility target if it exists + let (first_bit_pos, last_bit_pos) = value.bits_range; + if let Some(matching_register_value) = maybe_matching_register_output_value + { + let extraction_mask = + bit_range_mask::(first_bit_pos, last_bit_pos); + let value = matching_register_value & extraction_mask; + replacements |= value; + } + } + } + } + adjustments.push(( + Parameters { + leaf: parameter.leaf, + sub_leaf: sub_leaf_range, + register: parameter.register, + }, + CpuidOutputRegisterAdjustments { mask, replacements }, + )); + } + } + + let cpuid_profile_data = CpuIdProfileData { + hypervisor: hypervisor_type, + cpu_vendor, + adjustments, + }; + + serde_json::to_writer_pretty(&mut cpuid_data_file, &cpuid_profile_data) + .context("Cpu profile generation failed: Could not serialize the generated cpuid profile data to the given writer")?; + cpuid_data_file + .flush() + .context("CPU profile generation failed: Unable to flush cpuid profile data")?; + write_license_file(cpuid_license_file, "CPUID") +} + +struct MsrProfileDataParams<'a, const N: usize> { + hypervisor_type: HypervisorType, + cpu_vendor: CpuVendor, + processor_feature_msr_definitions: &'a MsrDefinitions, + + /// MSR-based features supported by the hardware and hypervisor used to + /// generate this CPU profile. + supported_feature_msrs: &'a [MsrEntry], + /// MSRs supported by the hardware and hypervisor used to generate this + /// CPU profile. 
+ supported_msrs: HashSet, + /// A list of all architectural MSRs that are permitted if they are also + /// contained in `supported_msrs`. + permitted_architectural_msrs: &'a [u32], + /// MSRs defined by the hypervisor that are permitted if they are supported + /// by the hardware and hypervisor used when generating this CPU profile + /// + /// We let CHV make the final decision at runtime whether they should be + /// available to guests (currently via CPUID) + permitted_hypervisor_msrs: &'a [u32], + /// Hyper-V related MSRs. + /// + /// NOTE: We can only know if these are truly permitted when the profile is + ///applied at runtime, hence we include them in the profile regardless and + ///let CHV remove them if necessary upon applying the CPU profile. + permitted_hyperv_msrs: &'a [u32], + /// A list of known non-architectural MSRs. This list is only used to help + /// us detect MSRs that we might not be aware of. + non_architectural_msrs: &'a [u32], + /// A list of known ranges of architectural msrs, that should not be + /// permitted by any generated CPU profile. This list is only used to help + /// us detect MSRs that we might not be aware of. 
+ forbidden_architectural_msrs: &'a [(u32, u32)], +} + +fn generate_msr_profile_data_with<'a, const N: usize>( + MsrProfileDataParams { + hypervisor_type, + cpu_vendor, + processor_feature_msr_definitions, + supported_feature_msrs, + supported_msrs, + permitted_architectural_msrs, + permitted_hypervisor_msrs, + permitted_hyperv_msrs, + non_architectural_msrs, + forbidden_architectural_msrs, + }: MsrProfileDataParams<'a, N>, + mut msr_data_file: impl Write, + msr_license_file: impl Write, +) -> anyhow::Result<()> { + const KVM_GET_NOT_SET_MSRS: [RegisterAddress; 6] = [ + RegisterAddress::IA32_VMX_PINBASED_CTLS, + RegisterAddress::IA32_VMX_PROCBASED_CTLS, + RegisterAddress::IA32_VMX_EXIT_CTLS, + RegisterAddress::IA32_VMX_ENTRY_CTLS, + RegisterAddress::IA32_VMX_CR0_FIXED1, + RegisterAddress::IA32_VMX_CR4_FIXED1, + ]; + let mut entries_encountered = 0; + let mut adjustments = Vec::new(); + let mut permitted_msrs = HashSet::new(); + 'table: for (reg_addr, definitions) in processor_feature_msr_definitions.as_slice() { + let Some(entry) = supported_feature_msrs + .iter() + .find(|e| e.index == reg_addr.0) + else { + continue; + }; + entries_encountered += 1; + + // NOTE: For now this tool only supports KVM, but we insert this check so we don't forget + // about (possible) KVM specific behavior. + if hypervisor_type == HypervisorType::Kvm && KVM_GET_NOT_SET_MSRS.contains(reg_addr) { + // In this case we do not want to record an update, but just that the MSR is permitted. + permitted_msrs.insert(reg_addr.0); + continue; + } + + let value = entry.data; + let mut replacements = 0; + let mut mask = 0; + let mut bits_accounted_for = 0; + for msr_definitions::ValueDefinition { + policy, + bits_range: (first_bit_pos, last_bit_pos), + .. 
+ } in definitions.as_slice().iter().copied() + { + let temp_mask = bit_range_mask::(first_bit_pos, last_bit_pos); + bits_accounted_for |= temp_mask; + match policy { + msr_definitions::ProfilePolicy::Deny => { + // This can only be applied to the entire MSR + assert_eq!(first_bit_pos, 0); + assert_eq!(last_bit_pos, 63); + continue 'table; + } + msr_definitions::ProfilePolicy::Inherit => { + replacements |= value & temp_mask; + } + msr_definitions::ProfilePolicy::Passthrough => { + mask |= temp_mask; + } + msr_definitions::ProfilePolicy::Static(overwrite_value) => { + replacements |= (overwrite_value) << (first_bit_pos); + } + } + } + // Reserved bit positions within an MSR value may get assigned meaning by hardware vendors in the future. + // For this reason we decide to have an "inherit" policy for these bits during profile generation. + let reserved_values = value & (!bits_accounted_for); + replacements |= reserved_values; + + permitted_msrs.insert(reg_addr.0); + adjustments.push((*reg_addr, FeatureMsrAdjustment { mask, replacements })); + } + + if entries_encountered != supported_feature_msrs.len() { + let unknown_register_address = supported_feature_msrs.iter().find(|entry| !processor_feature_msr_definitions.as_slice().iter().any(|(reg_addr, _)| reg_addr.0 == entry.index )).expect("We have checked that there should be at least one unknown supported MSR-based feature").index; + Err(anyhow!( + "CPU profile generation failed: The hardware and hypervisor supports MSR-based feature with register address:={unknown_register_address:#x}, but the CPU profile generation tool does not know what to do with this MSR. Please update the appropriate `MsrDefinitions` and try again." 
+ ))?; + } + + for msr in permitted_architectural_msrs + .iter() + .chain(permitted_hypervisor_msrs) + .chain(permitted_hyperv_msrs) + { + if supported_msrs.contains(msr) { + let _ = permitted_msrs.insert(*msr); + } + } + + // Also check to see if there are any MSRs on the system that we are not aware of. In that case + // it might be a sign that this tool needs to update its definitions! + for msr in supported_msrs.difference(&permitted_msrs) { + let is_proc_feat_msr = processor_feature_msr_definitions + .as_slice() + .iter() + .any(|(reg_addr, _)| reg_addr.0 == *msr); + + let is_architectural_msr = forbidden_architectural_msrs + .iter() + .any(|r| (r.0..=r.1).contains(msr)); + + let is_non_architectural_msr = non_architectural_msrs.contains(msr); + + if is_proc_feat_msr || is_architectural_msr || is_non_architectural_msr { + continue; + } + + // TODO: Make this a hard error before upstreaming + warn!( + "Encountered unknown MSR:={:#x} when generating CPU profile. This CPU profile generation tool might not be up-to-date", + *msr + ); + } + + let permitted_msrs: Vec = { + let mut permitted_msrs: Vec = permitted_msrs.into_iter().collect(); + permitted_msrs.sort(); + permitted_msrs.into_iter().map(RegisterAddress).collect() + }; + + let msr_profile_data = MsrProfileData { + hypervisor_type, + cpu_vendor, + adjustments, + permitted_msrs, + }; + + serde_json::to_writer_pretty(&mut msr_data_file, &msr_profile_data) + .context("Cpu profile generation failed: Could not serialize the generated MSR profile data to the given writer")?; + msr_data_file + .flush() + .context("CPU profile generation failed: Unable to flush MSR profile data")?; + write_license_file(msr_license_file, "MSR") +} + +fn write_license_file(mut license_file: impl Write, data_type: &str) -> anyhow::Result<()> { + let license_text = { + r#"SPDX-FileCopyrightText: 2025 Cyberus Technology GmbH + +SPDX-License-Identifier: Apache-2.0 +"# + }; + license_file + .write_all(license_text.as_bytes()) + 
.with_context(|| { + format!("CPU profile generation failed: Unable to write to {data_type} profile data license file") + })?; + license_file.flush().with_context(|| { + format!("CPU profile generation failed: Unable to flush {data_type} profile data license file") + }) +} +/// Get as many of the supported CPUID entries from the hypervisor as possible. +/// +/// # Errors +/// +/// Fails if AMX state tiles cannot be requested for guests, if enabling the AMX state components fails +/// with an unexpected hypervisor error, or if the hypervisor cannot report its supported CPUID entries. +fn supported_cpuid(hypervisor: &dyn Hypervisor) -> anyhow::Result<Vec<CpuIdEntry>> { + // Check for AMX compatibility. If this is supported we need to call arch_prctl before requesting the supported + // CPUID entries from the hypervisor. We simply call the enable_amx_state_components method on the hypervisor and + // ignore any AMX not supported error to achieve this. + match hypervisor.enable_amx_state_components() { + Ok(()) => {} + Err(HypervisorError::CouldNotEnableAmxStateComponents(amx_err)) => { + if matches!( + amx_err, + hypervisor::arch::x86::AmxGuestSupportError::AmxGuestTileRequest { .. } + ) { + return Err(amx_err).context("Unable to enable AMX state tiles for guests"); + } + } + // Any other hypervisor error is reported to the caller instead of panicking + Err(other) => return Err(other).context("Unexpected error when checking AMX support"), + } + + hypervisor + .get_supported_cpuid() + .context("CPU profile data generation failed") +} + +/// Overwrite the Processor brand string with the given `brand_string_bytes` +/// +/// The 48 bytes are split into little-endian `u32` values that fill EAX, EBX, ECX and EDX of the leaves +/// 0x80000002, 0x80000003 and 0x80000004 (sub-leaf 0); a leaf that is missing from `cpuid` is appended. +fn overwrite_brand_string( + mut cpuid: Vec<CpuIdEntry>, + brand_string_bytes: [u8; 48], +) -> Vec<CpuIdEntry> { + let mut iter = brand_string_bytes + .as_chunks::<4>() + .0 + .iter() + .map(|c| u32::from_le_bytes(*c)); + let mut overwrite = |leaf: u32| CpuIdEntry { + function: leaf, + index: 0, + flags: 0, + eax: iter.next().unwrap_or(0), + ebx: iter.next().unwrap_or(0), + ecx: iter.next().unwrap_or(0), + edx: iter.next().unwrap_or(0), + }; + for leaf in [0x80000002, 0x80000003, 0x80000004] { + if let Some(entry) = cpuid + .iter_mut() + .find(|entry| (entry.function == leaf) && (entry.index == 0)) + { + *entry = overwrite(leaf); + } else { + cpuid.push(overwrite(leaf)); + } + } + cpuid +} + +/// Sort the CPUID entries by function 
and index +fn sort_entries(mut cpuid: Vec) -> Vec { + cpuid.sort_unstable_by(|entry, other_entry| { + let fn_cmp = entry.function.cmp(&other_entry.function); + if fn_cmp == core::cmp::Ordering::Equal { + entry.index.cmp(&other_entry.index) + } else { + fn_cmp + } + }); + cpuid +} + +/// Returns a numeric value where each bit between `first_bit_pos` and `last_bit_pos` is set (including both ends) and all other bits are 0. +fn bit_range_mask(first_bit_pos: u8, last_bit_pos: u8) -> T +where + T: Shl, + T: BitOr, + T: From, +{ + (first_bit_pos..=last_bit_pos).fold(T::from(0_u8), |acc, next| acc | ((T::from(1_u8)) << next)) +} +/// Returns a vector of exact parameter matches ((sub_leaf ..= sub_leaf), register_value) interleaved by +/// the sub_leaf ranges specified by `param` that did not match any cpuid entry. +fn extract_parameter_matches( + param: &Parameters, + supported_cpuid_sorted: &[CpuIdEntry], +) -> Vec<(RangeInclusive, Option)> { + let register_value = |entry: &CpuIdEntry| -> u32 { + match param.register { + CpuidReg::EAX => entry.eax, + CpuidReg::EBX => entry.ebx, + CpuidReg::ECX => entry.ecx, + CpuidReg::EDX => entry.edx, + } + }; + let mut out = Vec::new(); + let param_range = param.sub_leaf.clone(); + let mut range_for_consideration = param_range.clone(); + let range_end = *range_for_consideration.end(); + for sub_leaf_entry in supported_cpuid_sorted + .iter() + .filter(|entry| entry.function == param.leaf && param_range.contains(&entry.index)) + { + let matching_subleaf = sub_leaf_entry.index; + + // If we are in the middle of the range, it means there is no entry matching the first few sub-leaves within the range + let current_range_start = *range_for_consideration.start(); + if current_range_start < matching_subleaf { + let range_not_matching = RangeInclusive::new(current_range_start, matching_subleaf - 1); + out.push((range_not_matching, None)); + } + + out.push(( + RangeInclusive::new(matching_subleaf, matching_subleaf), + 
Some(register_value(sub_leaf_entry)), + )); + if matching_subleaf == range_end { + return out; + } + // Update range_for_consideration: Note that we must have index + 1 <= range_end + range_for_consideration = RangeInclusive::new(matching_subleaf + 1, range_end); + } + // We did not find the last entry within the range hence we push the final range for consideration together with no matching register value + out.push((range_for_consideration, None)); + out +} diff --git a/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json b/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json new file mode 100644 index 0000000000..b0790bb426 --- /dev/null +++ b/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json @@ -0,0 +1,3366 @@ +{ + "hypervisor": "Kvm", + "cpu_vendor": "Intel", + "adjustments": [ + [ + { + "leaf": "0x0", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000020", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x756e6547", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x6c65746e", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x49656e69", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x1", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x000806f8", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x1", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ff00" + } + ], + [ + { + "leaf": "0x1", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x76fa3223", + "mask": 
"0x89000000" + } + ], + [ + { + "leaf": "0x1", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x078bfbff", + "mask": "0x08000000" + } + ], + [ + { + "leaf": "0x2", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x2", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x2", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x2", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x2", + "end": "0x2" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x3", + "end": "0x3" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x4", + "end": "0x4" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x5", + "end": "0xffffffff" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, 
+ "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x2", + "end": "0x2" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x3", + "end": "0x3" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x4", + "end": "0x4" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x5", + "end": "0xffffffff" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x2", + "end": "0x2" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x3", + "end": "0x3" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x4", + "end": "0x4" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x5", + "end": "0xffffffff" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], 
+ [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x2", + "end": "0x2" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x3", + "end": "0x3" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x4", + "end": "0x4" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x5", + "end": "0xffffffff" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x5", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x5", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x5", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x5", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x6", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000004", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x6", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + 
}, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x6", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x6", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x7", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000002", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x7", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0xf1bf07ab", + "mask": "0x00002040" + } + ], + [ + { + "leaf": "0x7", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x1b415f46", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x7", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0xbfc04410", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x7", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EAX" + }, + { + "replacements": "0x00001c30", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x7", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x7", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x7", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x7", + "sub_leaf": { + "start": "0x2", + "end": "0x2" + }, + "register": "EDX" + }, + { + "replacements": "0x00000017", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x9", + 
"sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xa", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xa", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xa", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xa", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xb", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0xb", + "sub_leaf": { + "start": "0x1", + "end": "0xffffffff" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0xb", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0xb", + "sub_leaf": { + "start": "0x1", + "end": "0xffffffff" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0xb", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0xb", + "sub_leaf": { + "start": "0x1", + "end": "0xffffffff" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0xb", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + 
"replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0xb", + "sub_leaf": { + "start": "0x1", + "end": "0xffffffff" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x000602e7", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EAX" + }, + { + "replacements": "0x0000001f", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x2", + "end": "0x2" + }, + "register": "EAX" + }, + { + "replacements": "0x00000100", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x2", + "end": "0x2" + }, + "register": "EBX" + }, + { + "replacements": "0x00000240", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": 
{ + "start": "0x2", + "end": "0x2" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x3", + "end": "0x4" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x3", + "end": "0x4" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x3", + "end": "0x4" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x3", + "end": "0x4" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x5", + "end": "0x5" + }, + "register": "EAX" + }, + { + "replacements": "0x00000040", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x6", + "end": "0x6" + }, + "register": "EAX" + }, + { + "replacements": "0x00000200", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x7", + "end": "0x7" + }, + "register": "EAX" + }, + { + "replacements": "0x00000400", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x5", + "end": "0x5" + }, + "register": "EBX" + }, + { + "replacements": "0x00000440", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x6", + "end": "0x6" + }, + "register": "EBX" + }, + { + "replacements": "0x00000480", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x7", + "end": "0x7" + }, + "register": "EBX" + }, + { + "replacements": "0x00000680", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x5", + "end": "0x5" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + 
"mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x6", + "end": "0x6" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x7", + "end": "0x7" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x8", + "end": "0x8" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x8", + "end": "0x8" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x8", + "end": "0x8" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x8", + "end": "0x8" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x9", + "end": "0x9" + }, + "register": "EAX" + }, + { + "replacements": "0x00000008", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x9", + "end": "0x9" + }, + "register": "EBX" + }, + { + "replacements": "0x00000a80", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x9", + "end": "0x9" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xa", + "end": "0xa" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xa", + "end": "0xa" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xa", + "end": "0xa" + 
}, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xa", + "end": "0xa" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xb", + "end": "0xb" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xc", + "end": "0xc" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xb", + "end": "0xb" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xc", + "end": "0xc" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xb", + "end": "0xb" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xc", + "end": "0xc" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xb", + "end": "0xb" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xc", + "end": "0xc" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xd", + "end": "0xd" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xd", + "end": "0xd" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + 
"leaf": "0xd", + "sub_leaf": { + "start": "0xd", + "end": "0xd" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xd", + "end": "0xd" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xe", + "end": "0xe" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xe", + "end": "0xe" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xe", + "end": "0xe" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xe", + "end": "0xe" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xf", + "end": "0xf" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xf", + "end": "0xf" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xf", + "end": "0xf" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xf", + "end": "0xf" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x10", + "end": "0x10" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x10", + "end": "0x10" + }, + "register": "EBX" + }, + { + 
"replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x10", + "end": "0x10" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x10", + "end": "0x10" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x11", + "end": "0x11" + }, + "register": "EAX" + }, + { + "replacements": "0x00000040", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x12", + "end": "0x12" + }, + "register": "EAX" + }, + { + "replacements": "0x00002000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x13", + "end": "0x3f" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x11", + "end": "0x11" + }, + "register": "EBX" + }, + { + "replacements": "0x00000ac0", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x12", + "end": "0x12" + }, + "register": "EBX" + }, + { + "replacements": "0x00000b00", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x13", + "end": "0x3f" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x11", + "end": "0x11" + }, + "register": "ECX" + }, + { + "replacements": "0x00000002", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x12", + "end": "0x12" + }, + "register": "ECX" + }, + { + "replacements": "0x00000006", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x13", + "end": "0x3f" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": 
"0xf", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xf", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xf", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xf", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xf", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xf", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x10", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x10", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0x10", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x10", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x10", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x10", + "sub_leaf": { + "start": "0x2", + "end": "0x2" + }, + "register": "EAX" + }, + { + 
"replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0x10", + "sub_leaf": { + "start": "0x2", + "end": "0x2" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x10", + "sub_leaf": { + "start": "0x2", + "end": "0x2" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x10", + "sub_leaf": { + "start": "0x2", + "end": "0x2" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x10", + "sub_leaf": { + "start": "0x3", + "end": "0x3" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x10", + "sub_leaf": { + "start": "0x3", + "end": "0x3" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x10", + "sub_leaf": { + "start": "0x3", + "end": "0x3" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x10", + "sub_leaf": { + "start": "0x5", + "end": "0x5" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x10", + "sub_leaf": { + "start": "0x5", + "end": "0x5" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x10", + "sub_leaf": { + "start": "0x5", + "end": "0x5" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x14", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x14", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x14", + 
"sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x14", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x14", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x15", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x15", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x15", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x16", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x16", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x16", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x17", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x18", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x18", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + 
"replacements": "0x00000000", + "mask": "0xffff070f" + } + ], + [ + { + "leaf": "0x18", + "sub_leaf": { + "start": "0x1", + "end": "0xffffffff" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffff070f" + } + ], + [ + { + "leaf": "0x18", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x18", + "sub_leaf": { + "start": "0x1", + "end": "0xffffffff" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x18", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x03ffc1ff" + } + ], + [ + { + "leaf": "0x18", + "sub_leaf": { + "start": "0x1", + "end": "0xffffffff" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x03ffc1ff" + } + ], + [ + { + "leaf": "0x1c", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x1c", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x1c", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x1d", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000001", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x1d", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EAX" + }, + { + "replacements": "0x04002000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x1d", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EBX" + }, + { + "replacements": "0x00080040", + "mask": "0x00000000" + } + ], + [ + { + 
"leaf": "0x1d", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "ECX" + }, + { + "replacements": "0x00000010", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x1e", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x1e", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00004010", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x1e", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x1f", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0x1f", + "sub_leaf": { + "start": "0x1", + "end": "0xffffffff" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0x1f", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x1f", + "sub_leaf": { + "start": "0x1", + "end": "0xffffffff" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x1f", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x1f", + "sub_leaf": { + "start": "0x1", + "end": "0xffffffff" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x1f", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x1f", + "sub_leaf": { + "start": "0x1", + "end": "0xffffffff" + 
}, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x20", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x20", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x21", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x21", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x21", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x23", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x23", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x23", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x23", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x23", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x23", + "sub_leaf": { + "start": "0x2", + "end": "0x2" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], 
+ [ + { + "leaf": "0x23", + "sub_leaf": { + "start": "0x3", + "end": "0x3" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x23", + "sub_leaf": { + "start": "0x4", + "end": "0x4" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x23", + "sub_leaf": { + "start": "0x4", + "end": "0x4" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x23", + "sub_leaf": { + "start": "0x5", + "end": "0x5" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x23", + "sub_leaf": { + "start": "0x5", + "end": "0x5" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x23", + "sub_leaf": { + "start": "0x5", + "end": "0x5" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x23", + "sub_leaf": { + "start": "0x5", + "end": "0x5" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x24", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x24", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000000", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x80000008", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000000", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x80000000", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + 
}, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x80000000", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x80000001", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000121", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000001", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x2c100800", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000002", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x65746e49", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000002", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x6153206c", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000002", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x69687070", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000002", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x52206572", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000003", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x64697061", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000003", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000073", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000003", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000003", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + 
{ + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000004", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000004", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000004", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000004", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000006", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x80000007", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000100", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000008", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00ffffff" + } + ], + [ + { + "leaf": "0x80000008", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x40000000", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x40000000", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x40000000", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": 
"0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x40000000", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x40000001", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0103feff" + } + ], + [ + { + "leaf": "0x40000001", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000001" + } + ] + ] +} \ No newline at end of file diff --git a/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json.license b/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json.license new file mode 100644 index 0000000000..579657c531 --- /dev/null +++ b/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2025 Cyberus Technology GmbH + +SPDX-License-Identifier: Apache-2.0 diff --git a/arch/src/x86_64/cpu_profiles/sapphire-rapids.msr.json b/arch/src/x86_64/cpu_profiles/sapphire-rapids.msr.json new file mode 100644 index 0000000000..c9b5d42089 --- /dev/null +++ b/arch/src/x86_64/cpu_profiles/sapphire-rapids.msr.json @@ -0,0 +1,206 @@ +{ + "cpu_vendor": "Intel", + "hypervisor_type": "Kvm", + "adjustments": [ + [ + "0x8b", + { + "mask": "0xffffffff00000000", + "replacements": "0x0000000000000000" + } + ], + [ + "0x10a", + { + "mask": "0x4000000000000000", + "replacements": "0x000000000c08e06b" + } + ], + [ + "0x480", + { + "mask": "0x0000000000000000", + "replacements": "0x00d8100011e57ed0" + } + ], + [ + "0x485", + { + "mask": "0x000000000000001f", + "replacements": "0x0000000020000060" + } + ], + [ + "0x486", + { + "mask": "0x0000000000000000", + "replacements": "0x0000000080000021" + } + ], + [ + "0x488", + { + "mask": "0x0000000000000000", + "replacements": "0x0000000000002000" + } + ], + [ + "0x48a", + { + "mask": "0x0000000000000000", + 
"replacements": "0x0000000000000032" + } + ], + [ + "0x48b", + { + "mask": "0x0000000000000000", + "replacements": "0x06137bff00000000" + } + ], + [ + "0x48c", + { + "mask": "0x0000000000000000", + "replacements": "0x00000f01063340c1" + } + ], + [ + "0x48d", + { + "mask": "0x0000000000000000", + "replacements": "0x000000ff00000016" + } + ], + [ + "0x48e", + { + "mask": "0x0000000000000000", + "replacements": "0xfff9fffe04006172" + } + ], + [ + "0x48f", + { + "mask": "0x0000000000000000", + "replacements": "0x007fefff00036dfb" + } + ], + [ + "0x490", + { + "mask": "0x0000000000000000", + "replacements": "0x0000d3ff000011fb" + } + ], + [ + "0x491", + { + "mask": "0x0000000000000000", + "replacements": "0x0000000000000001" + } + ] + ], + "permitted_msrs": [ + "0x10", + "0x11", + "0x12", + "0x3a", + "0x3b", + "0x48", + "0x8b", + "0x10a", + "0x174", + "0x175", + "0x176", + "0x17a", + "0x1a0", + "0x1c4", + "0x1c5", + "0x200", + "0x201", + "0x202", + "0x203", + "0x204", + "0x205", + "0x206", + "0x207", + "0x208", + "0x209", + "0x20a", + "0x20b", + "0x20c", + "0x20d", + "0x20e", + "0x20f", + "0x250", + "0x258", + "0x259", + "0x268", + "0x269", + "0x26a", + "0x26b", + "0x26c", + "0x26d", + "0x26e", + "0x26f", + "0x277", + "0x2ff", + "0x480", + "0x481", + "0x482", + "0x483", + "0x484", + "0x485", + "0x486", + "0x487", + "0x488", + "0x489", + "0x48a", + "0x48b", + "0x48c", + "0x48d", + "0x48e", + "0x48f", + "0x490", + "0x491", + "0x6e0", + "0x40000000", + "0x40000001", + "0x40000002", + "0x40000003", + "0x40000010", + "0x40000020", + "0x40000021", + "0x40000022", + "0x40000023", + "0x40000073", + "0x40000080", + "0x400000b0", + "0x400000f1", + "0x400000f2", + "0x400000f3", + "0x400000f4", + "0x400000f5", + "0x40000100", + "0x40000101", + "0x40000102", + "0x40000103", + "0x40000104", + "0x40000105", + "0x4b564d00", + "0x4b564d01", + "0x4b564d02", + "0x4b564d03", + "0x4b564d04", + "0x4b564d05", + "0x4b564d06", + "0x4b564d07", + "0xc0000081", + "0xc0000082", + "0xc0000083", + 
"0xc0000084", + "0xc0000102", + "0xc0000103" + ] +} \ No newline at end of file diff --git a/arch/src/x86_64/cpu_profiles/sapphire-rapids.msr.json.license b/arch/src/x86_64/cpu_profiles/sapphire-rapids.msr.json.license new file mode 100644 index 0000000000..579657c531 --- /dev/null +++ b/arch/src/x86_64/cpu_profiles/sapphire-rapids.msr.json.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2025 Cyberus Technology GmbH + +SPDX-License-Identifier: Apache-2.0 diff --git a/arch/src/x86_64/cpu_profiles/skylake.cpuid.json b/arch/src/x86_64/cpu_profiles/skylake.cpuid.json new file mode 100644 index 0000000000..bbe3ec73a8 --- /dev/null +++ b/arch/src/x86_64/cpu_profiles/skylake.cpuid.json @@ -0,0 +1,3184 @@ +{ + "hypervisor": "Kvm", + "cpu_vendor": "Intel", + "adjustments": [ + [ + { + "leaf": "0x0", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000016", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x756e6547", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x6c65746e", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x49656e69", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x1", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00050654", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x1", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ff00" + } + ], + [ + { + "leaf": "0x1", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x76fa3223", + "mask": "0x89000000" + } + ], + [ + { + "leaf": 
"0x1", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x078bfbff", + "mask": "0x08000000" + } + ], + [ + { + "leaf": "0x2", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x2", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x2", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x2", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x2", + "end": "0x2" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x3", + "end": "0x3" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x4", + "end": "0x4" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x5", + "end": "0xffffffff" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + 
"replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x2", + "end": "0x2" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x3", + "end": "0x3" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x4", + "end": "0x4" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x5", + "end": "0xffffffff" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x2", + "end": "0x2" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x3", + "end": "0x3" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x4", + "end": "0x4" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x5", + "end": "0xffffffff" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x4", + 
"sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x2", + "end": "0x2" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x3", + "end": "0x3" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x4", + "end": "0x4" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x4", + "sub_leaf": { + "start": "0x5", + "end": "0xffffffff" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x5", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x5", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x5", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x5", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x6", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000004", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x6", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": 
"0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x6", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x6", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x7", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x7", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0xd19f07ab", + "mask": "0x00002040" + } + ], + [ + { + "leaf": "0x7", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000004", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x7", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0xbc000400", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x7", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x7", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x7", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x7", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x7", + "sub_leaf": { + "start": "0x2", + "end": "0x2" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x9", + "sub_leaf": { + "start": "0x0", + 
"end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xa", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xa", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xa", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xa", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xb", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0xb", + "sub_leaf": { + "start": "0x1", + "end": "0xffffffff" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0xb", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0xb", + "sub_leaf": { + "start": "0x1", + "end": "0xffffffff" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0xb", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0xb", + "sub_leaf": { + "start": "0x1", + "end": "0xffffffff" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0xb", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + 
"mask": "0xffffffff" + } + ], + [ + { + "leaf": "0xb", + "sub_leaf": { + "start": "0x1", + "end": "0xffffffff" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x000002e7", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EAX" + }, + { + "replacements": "0x0000000f", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x2", + "end": "0x2" + }, + "register": "EAX" + }, + { + "replacements": "0x00000100", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x2", + "end": "0x2" + }, + "register": "EBX" + }, + { + "replacements": "0x00000240", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x2", + "end": 
"0x2" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x3", + "end": "0x3" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x4", + "end": "0x4" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x3", + "end": "0x3" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x4", + "end": "0x4" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x3", + "end": "0x3" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x4", + "end": "0x4" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x3", + "end": "0x3" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x4", + "end": "0x4" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x5", + "end": "0x5" + }, + "register": "EAX" + }, + { + "replacements": "0x00000040", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x6", + "end": "0x6" + }, + "register": "EAX" + }, + { + "replacements": "0x00000200", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x7", + "end": "0x7" + }, + "register": "EAX" + }, + { + "replacements": "0x00000400", + "mask": "0x00000000" + } + ], + [ 
+ { + "leaf": "0xd", + "sub_leaf": { + "start": "0x5", + "end": "0x5" + }, + "register": "EBX" + }, + { + "replacements": "0x00000440", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x6", + "end": "0x6" + }, + "register": "EBX" + }, + { + "replacements": "0x00000480", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x7", + "end": "0x7" + }, + "register": "EBX" + }, + { + "replacements": "0x00000680", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x5", + "end": "0x5" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x6", + "end": "0x6" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x7", + "end": "0x7" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x8", + "end": "0x8" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x8", + "end": "0x8" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x8", + "end": "0x8" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x8", + "end": "0x8" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x9", + "end": "0x9" + }, + "register": "EAX" + }, + { + "replacements": "0x00000008", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x9", + "end": "0x9" + }, + "register": "EBX" + }, + { + 
"replacements": "0x00000a80", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x9", + "end": "0x9" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xa", + "end": "0xa" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xa", + "end": "0xa" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xa", + "end": "0xa" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xa", + "end": "0xa" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xb", + "end": "0xc" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xb", + "end": "0xc" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xb", + "end": "0xc" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xb", + "end": "0xc" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xd", + "end": "0xd" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xd", + "end": "0xd" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + 
"start": "0xd", + "end": "0xd" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xd", + "end": "0xd" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xe", + "end": "0xe" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xe", + "end": "0xe" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xe", + "end": "0xe" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xe", + "end": "0xe" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xf", + "end": "0xf" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xf", + "end": "0xf" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xf", + "end": "0xf" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xf", + "end": "0xf" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x10", + "end": "0x10" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x10", + "end": "0x10" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + 
"mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x10", + "end": "0x10" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x10", + "end": "0x10" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x11", + "end": "0x3f" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x11", + "end": "0x3f" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x11", + "end": "0x3f" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xf", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xf", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xf", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xf", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xf", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xf", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x10", + "sub_leaf": { + "start": "0x0", + 
"end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x10", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0x10", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x10", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x10", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x10", + "sub_leaf": { + "start": "0x2", + "end": "0x2" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0x10", + "sub_leaf": { + "start": "0x2", + "end": "0x2" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x10", + "sub_leaf": { + "start": "0x2", + "end": "0x2" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x10", + "sub_leaf": { + "start": "0x2", + "end": "0x2" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x10", + "sub_leaf": { + "start": "0x3", + "end": "0x3" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x10", + "sub_leaf": { + "start": "0x3", + "end": "0x3" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x10", + "sub_leaf": { + "start": "0x3", + "end": "0x3" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": 
"0x00000000" + } + ], + [ + { + "leaf": "0x10", + "sub_leaf": { + "start": "0x5", + "end": "0x5" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x10", + "sub_leaf": { + "start": "0x5", + "end": "0x5" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x10", + "sub_leaf": { + "start": "0x5", + "end": "0x5" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x14", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x14", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x14", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x14", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x14", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x15", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x15", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x15", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x16", + "sub_leaf": { + "start": "0x0", + "end": "0x0" 
+ }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x16", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x16", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x17", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x18", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x18", + "sub_leaf": { + "start": "0x0", + "end": "0xffffffff" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffff070f" + } + ], + [ + { + "leaf": "0x18", + "sub_leaf": { + "start": "0x0", + "end": "0xffffffff" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x18", + "sub_leaf": { + "start": "0x0", + "end": "0xffffffff" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x03ffc1ff" + } + ], + [ + { + "leaf": "0x1c", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x1c", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x1c", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x1d", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": 
"0x00000000" + } + ], + [ + { + "leaf": "0x1d", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x1d", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x1d", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x1e", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x1e", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x1e", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x1f", + "sub_leaf": { + "start": "0x0", + "end": "0xffffffff" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0x1f", + "sub_leaf": { + "start": "0x0", + "end": "0xffffffff" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x1f", + "sub_leaf": { + "start": "0x0", + "end": "0xffffffff" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x1f", + "sub_leaf": { + "start": "0x0", + "end": "0xffffffff" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x20", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x20", + "sub_leaf": { + 
"start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x21", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x21", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x21", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x23", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x23", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x23", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x23", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x23", + "sub_leaf": { + "start": "0x1", + "end": "0x1" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x23", + "sub_leaf": { + "start": "0x2", + "end": "0x2" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x23", + "sub_leaf": { + "start": "0x3", + "end": "0x3" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x23", + "sub_leaf": { + "start": "0x4", + "end": "0x4" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", 
+ "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x23", + "sub_leaf": { + "start": "0x4", + "end": "0x4" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x23", + "sub_leaf": { + "start": "0x5", + "end": "0x5" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x23", + "sub_leaf": { + "start": "0x5", + "end": "0x5" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x23", + "sub_leaf": { + "start": "0x5", + "end": "0x5" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x23", + "sub_leaf": { + "start": "0x5", + "end": "0x5" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x24", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x24", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000000", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x80000008", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000000", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x80000000", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x80000000", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x80000001", + 
"sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000121", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000001", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x2c100800", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000002", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x65746e49", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000002", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x6b53206c", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000002", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x6b616c79", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000002", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000065", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000003", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000003", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000003", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000003", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000004", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000004", + "sub_leaf": { + "start": 
"0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000004", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000004", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000006", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x80000007", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000100", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000008", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00ffffff" + } + ], + [ + { + "leaf": "0x80000008", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x40000000", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x40000000", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x40000000", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x40000000", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x40000001", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + 
"register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0103feff" + } + ], + [ + { + "leaf": "0x40000001", + "sub_leaf": { + "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000001" + } + ] + ] +} \ No newline at end of file diff --git a/arch/src/x86_64/cpu_profiles/skylake.cpuid.json.license b/arch/src/x86_64/cpu_profiles/skylake.cpuid.json.license new file mode 100644 index 0000000000..579657c531 --- /dev/null +++ b/arch/src/x86_64/cpu_profiles/skylake.cpuid.json.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2025 Cyberus Technology GmbH + +SPDX-License-Identifier: Apache-2.0 diff --git a/arch/src/x86_64/cpu_profiles/skylake.msr.json b/arch/src/x86_64/cpu_profiles/skylake.msr.json new file mode 100644 index 0000000000..eceb91fcda --- /dev/null +++ b/arch/src/x86_64/cpu_profiles/skylake.msr.json @@ -0,0 +1,204 @@ +{ + "cpu_vendor": "Intel", + "hypervisor_type": "Kvm", + "adjustments": [ + [ + "0x8b", + { + "mask": "0xffffffff00000000", + "replacements": "0x0000000000000000" + } + ], + [ + "0x10a", + { + "mask": "0x4000000000000000", + "replacements": "0x000000000c00004c" + } + ], + [ + "0x480", + { + "mask": "0x0000000000000000", + "replacements": "0x00d8100011e57ed0" + } + ], + [ + "0x485", + { + "mask": "0x000000000000001f", + "replacements": "0x0000000020000060" + } + ], + [ + "0x486", + { + "mask": "0x0000000000000000", + "replacements": "0x0000000080000021" + } + ], + [ + "0x488", + { + "mask": "0x0000000000000000", + "replacements": "0x0000000000002000" + } + ], + [ + "0x48a", + { + "mask": "0x0000000000000000", + "replacements": "0x0000000000000032" + } + ], + [ + "0x48b", + { + "mask": "0x0000000000000000", + "replacements": "0x02137bff00000000" + } + ], + [ + "0x48c", + { + "mask": "0x0000000000000000", + "replacements": "0x00000f0106334041" + } + ], + [ + "0x48d", + { + "mask": "0x0000000000000000", + "replacements": "0x000000ff00000016" + } + ], + [ + "0x48e", + { + "mask": 
"0x0000000000000000", + "replacements": "0xfff9fffe04006172" + } + ], + [ + "0x48f", + { + "mask": "0x0000000000000000", + "replacements": "0x007fefff00036dfb" + } + ], + [ + "0x490", + { + "mask": "0x0000000000000000", + "replacements": "0x0000d3ff000011fb" + } + ], + [ + "0x491", + { + "mask": "0x0000000000000000", + "replacements": "0x0000000000000001" + } + ] + ], + "permitted_msrs": [ + "0x10", + "0x11", + "0x12", + "0x3a", + "0x3b", + "0x48", + "0x8b", + "0x10a", + "0x174", + "0x175", + "0x176", + "0x17a", + "0x1a0", + "0x200", + "0x201", + "0x202", + "0x203", + "0x204", + "0x205", + "0x206", + "0x207", + "0x208", + "0x209", + "0x20a", + "0x20b", + "0x20c", + "0x20d", + "0x20e", + "0x20f", + "0x250", + "0x258", + "0x259", + "0x268", + "0x269", + "0x26a", + "0x26b", + "0x26c", + "0x26d", + "0x26e", + "0x26f", + "0x277", + "0x2ff", + "0x480", + "0x481", + "0x482", + "0x483", + "0x484", + "0x485", + "0x486", + "0x487", + "0x488", + "0x489", + "0x48a", + "0x48b", + "0x48c", + "0x48d", + "0x48e", + "0x48f", + "0x490", + "0x491", + "0x6e0", + "0x40000000", + "0x40000001", + "0x40000002", + "0x40000003", + "0x40000010", + "0x40000020", + "0x40000021", + "0x40000022", + "0x40000023", + "0x40000073", + "0x40000080", + "0x400000b0", + "0x400000f1", + "0x400000f2", + "0x400000f3", + "0x400000f4", + "0x400000f5", + "0x40000100", + "0x40000101", + "0x40000102", + "0x40000103", + "0x40000104", + "0x40000105", + "0x4b564d00", + "0x4b564d01", + "0x4b564d02", + "0x4b564d03", + "0x4b564d04", + "0x4b564d05", + "0x4b564d06", + "0x4b564d07", + "0xc0000081", + "0xc0000082", + "0xc0000083", + "0xc0000084", + "0xc0000102", + "0xc0000103" + ] +} \ No newline at end of file diff --git a/arch/src/x86_64/cpu_profiles/skylake.msr.json.license b/arch/src/x86_64/cpu_profiles/skylake.msr.json.license new file mode 100644 index 0000000000..579657c531 --- /dev/null +++ b/arch/src/x86_64/cpu_profiles/skylake.msr.json.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2025 Cyberus Technology GmbH 
+ +SPDX-License-Identifier: Apache-2.0 diff --git a/arch/src/x86_64/cpuid_definitions/intel.rs b/arch/src/x86_64/cpuid_definitions/intel.rs new file mode 100644 index 0000000000..61517e7e1b --- /dev/null +++ b/arch/src/x86_64/cpuid_definitions/intel.rs @@ -0,0 +1,5306 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +//! This module contains CPUID definitions for Intel CPUs. +use std::ops::RangeInclusive; + +use super::{ + CpuidDefinitions, CpuidReg, Parameters, ProfilePolicy, ValueDefinition, ValueDefinitions, +}; + +/// Contains CPUID definitions described in "Intel Architecture Instruction Set Extensions and Future Features" +/// +/// ## Missing leaves +/// +/// The following known CPUID leaves are left out of this table: +/// - 0x3 (Only relevant for Intel Pentium III), +/// - 0x12 (Only relevant for SGX which is deprecated), +/// - 0x19 (Key locker leaf. These features are not in scope for CPU profiles for the time being) +/// - 0x1a (Native Model ID Enumeration leaf), +/// - 0x1b (PCONFIG Information Sub-leaf. This is not in scope for CPU profiles for the time being), +/// - 0x27 (L3 Cache Intel RDT Monitoring Capability Asymmetric Enumeration), +/// - 0x28 (Intel Resource Director Technology Allocation Asymmetric Enumeration), +/// - 0x21 (Only relevant for Intel TDX which is not in scope for CPU profiles for the time being), +/// - 0x40000000 - 0x4FFFFFFF (Reserved for hypervisors), +/// +/// ### How we produced this table +/// +/// We first ran the [`cpuidgen` tool](https://gitlab.com/x86-cpuid.org/x86-cpuid-db), whose +/// output is licensed under the SPDX Creative Commons Zero 1.0 Universal License. We then wrote a +/// throw-away Rust script to modify the output into something more similar to Rust code. Following +/// this we used macros and other functionality in the [Helix editor](https://helix-editor.com/) to +/// get actual Rust code. 
+/// +/// We then read through the CPUID section (1.4) of the Intel Architecture Instruction Set +/// Extensions and Future Features manual and manually inserted several leaf definitions that +/// we noticed were missing from the table we had produced. During this process we also changed +/// a few of the short names and descriptions to be more in line with what is written in the +/// aforementioned Intel manual. Finally we decided on a [`ProfilePolicy`] to be set for every +/// single [`ValueDefinition`] and manually appended those. +pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<187> = const { + CpuidDefinitions([ + // ========================================================================================= + // Basic CPUID Information + // ========================================================================================= + ( + Parameters { + leaf: 0x0, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "max_std_leaf", + description: "Maximum Input value for Basic CPUID Information", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x0, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_vendorid_0", + description: "CPU vendor ID string bytes 0 - 3", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x0, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_vendorid_2", + description: "CPU vendor ID string bytes 8 - 11", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x0, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_vendorid_1", + description: "CPU vendor ID string bytes 4 - 7", + bits_range: (0, 31), + 
policy: ProfilePolicy::Inherit, + }]), + ), + // TODO: Do we really want to inherit these values from the corresponding CPU, or should we zero it out or set something else here? + ( + Parameters { + leaf: 0x1, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "stepping", + description: "Stepping ID", + bits_range: (0, 3), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "base_model", + description: "Base CPU model ID", + bits_range: (4, 7), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "base_family_id", + description: "Base CPU family ID", + bits_range: (8, 11), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "cpu_type", + description: "CPU type", + bits_range: (12, 13), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "ext_model", + description: "Extended CPU model ID", + bits_range: (16, 19), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "ext_family", + description: "Extended CPU family ID", + bits_range: (20, 27), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0x1, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "brand_id", + description: "Brand index", + bits_range: (0, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "clflush_size", + description: "CLFLUSH instruction cache line size", + bits_range: (8, 15), + policy: ProfilePolicy::Passthrough, + }, + // This is set by cloud hypervisor + ValueDefinition { + short: "n_logical_cpu", + description: "Logical CPU count", + bits_range: (16, 23), + policy: ProfilePolicy::Static(0), + }, + // This is set by cloud hypervisor + ValueDefinition { + short: "local_apic_id", + description: "Initial local APIC physical ID", + bits_range: (24, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { 
+ leaf: 0x1, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "sse3", + description: "Streaming SIMD Extensions 3 (SSE3)", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "pclmulqdq", + description: "PCLMULQDQ instruction support", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "dtes64", + description: "64-bit DS save area", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "monitor", + description: "MONITOR/MWAIT support", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "ds_cpl", + description: "CPL Qualified Debug Store", + bits_range: (4, 4), + policy: ProfilePolicy::Static(0), + }, + // TODO: Ideally configurable by the user (host must have this otherwise CHV will not run) + ValueDefinition { + short: "vmx", + description: "Virtual Machine Extensions", + bits_range: (5, 5), + policy: ProfilePolicy::Static(1), + }, + ValueDefinition { + short: "smx", + description: "Safer Mode Extensions", + bits_range: (6, 6), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "est", + description: "Enhanced Intel SpeedStep", + bits_range: (7, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "tm2", + description: "Thermal Monitor 2", + bits_range: (8, 8), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "ssse3", + description: "Supplemental SSE3", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "cnxt_id", + description: "L1 Context ID", + bits_range: (10, 10), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "sdbg", + description: "Silicon Debug", + bits_range: (11, 11), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "fma", + description: "FMA extensions using YMM 
state", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "cx16", + description: "CMPXCHG16B instruction support", + bits_range: (13, 13), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "xtpr", + description: "xTPR Update Control", + bits_range: (14, 14), + policy: ProfilePolicy::Static(0), + }, + // MSR related + ValueDefinition { + short: "pdcm", + description: "Perfmon and Debug Capability", + bits_range: (15, 15), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "pcid", + description: "Process-context identifiers", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "dca", + description: "Direct Cache Access", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "sse4_1", + description: "SSE4.1", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "sse4_2", + description: "SSE4.2", + bits_range: (20, 20), + policy: ProfilePolicy::Inherit, + }, + // Set by Cloud hypervisor + ValueDefinition { + short: "x2apic", + description: "X2APIC support", + bits_range: (21, 21), + policy: ProfilePolicy::Static(1), + }, + ValueDefinition { + short: "movbe", + description: "MOVBE instruction support", + bits_range: (22, 22), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "popcnt", + description: "POPCNT instruction support", + bits_range: (23, 23), + policy: ProfilePolicy::Inherit, + }, + // Set by Cloud hypervisor + ValueDefinition { + short: "tsc_deadline_timer", + description: "APIC timer one-shot operation", + bits_range: (24, 24), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "aes", + description: "AES instructions", + bits_range: (25, 25), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xsave", + description: "XSAVE (and related instructions) support", + bits_range: (26, 26), + policy: 
ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "osxsave", + description: "XSAVE (and related instructions) are enabled by OS", + bits_range: (27, 27), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "avx", + description: "AVX instructions support", + bits_range: (28, 28), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "f16c", + description: "Half-precision floating-point conversion support", + bits_range: (29, 29), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "rdrand", + description: "RDRAND instruction support", + bits_range: (30, 30), + policy: ProfilePolicy::Inherit, + }, + // TODO: If set by CHV set to 0 and write comment + ValueDefinition { + short: "guest_status", + description: "System is running as guest; (para-)virtualized system", + bits_range: (31, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x1, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "fpu", + description: "Floating-Point Unit on-chip (x87)", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "vme", + description: "Virtual-8086 Mode Extensions", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "de", + description: "Debugging Extensions", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "pse", + description: "Page Size Extension", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "tsc", + description: "Time Stamp Counter", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "msr", + description: "Model-Specific Registers (RDMSR and WRMSR support)", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "pae", + description: "Physical Address Extensions", + bits_range: 
(6, 6), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "mce", + description: "Machine Check Exception", + bits_range: (7, 7), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "cx8", + description: "CMPXCHG8B instruction", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "apic", + description: "APIC on-chip", + bits_range: (9, 9), + policy: ProfilePolicy::Static(1), + }, + // MSR related + ValueDefinition { + short: "sep", + description: "SYSENTER, SYSEXIT, and associated MSRs", + bits_range: (11, 11), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "mtrr", + description: "Memory Type Range Registers", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "pge", + description: "Page Global Extensions", + bits_range: (13, 13), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "mca", + description: "Machine Check Architecture", + bits_range: (14, 14), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "cmov", + description: "Conditional Move Instruction", + bits_range: (15, 15), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "pat", + description: "Page Attribute Table", + bits_range: (16, 16), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "pse36", + description: "Page Size Extension (36-bit)", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "psn", + description: "Processor Serial Number", + bits_range: (18, 18), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "clfsh", + description: "CLFLUSH instruction", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "ds", + description: "Debug Store", + bits_range: (21, 21), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "acpi", + description: "Thermal monitor and clock control", + 
bits_range: (22, 22), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "mmx", + description: "MMX instructions", + bits_range: (23, 23), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "fxsr", + description: "FXSAVE and FXRSTOR instructions", + bits_range: (24, 24), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "sse", + description: "SSE instructions", + bits_range: (25, 25), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "sse2", + description: "SSE2 instructions", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "ss", + description: "Self Snoop", + bits_range: (27, 27), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "htt", + description: "Hyper-threading", + bits_range: (28, 28), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "tm", + description: "Thermal Monitor", + bits_range: (29, 29), + policy: ProfilePolicy::Static(0), + }, + // MSR related + ValueDefinition { + short: "pbe", + description: "Pending Break Enable", + bits_range: (31, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // ========================================================================================= + // Cache and TLB Information + // ========================================================================================= + ( + Parameters { + leaf: 0x2, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "iteration_count", + description: "Number of times this leaf must be queried", + bits_range: (0, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc1", + description: "Descriptor #1", + bits_range: (8, 15), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc2", + description: "Descriptor #2", + bits_range: (16, 23), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { 
+ short: "desc3", + description: "Descriptor #3", + bits_range: (24, 30), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "eax_invalid", + description: "Descriptors 1-3 are invalid if set", + bits_range: (31, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x2, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "desc4", + description: "Descriptor #4", + bits_range: (0, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc5", + description: "Descriptor #5", + bits_range: (8, 15), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc6", + description: "Descriptor #6", + bits_range: (16, 23), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc7", + description: "Descriptor #7", + bits_range: (24, 30), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "ebx_invalid", + description: "Descriptors 4-7 are invalid if set", + bits_range: (31, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x2, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "desc8", + description: "Descriptor #8", + bits_range: (0, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc9", + description: "Descriptor #9", + bits_range: (8, 15), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc10", + description: "Descriptor #10", + bits_range: (16, 23), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc11", + description: "Descriptor #11", + bits_range: (24, 30), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "ecx_invalid", + description: "Descriptors 8-11 are invalid if set", + bits_range: (31, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + 
( + Parameters { + leaf: 0x2, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "desc12", + description: "Descriptor #12", + bits_range: (0, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc13", + description: "Descriptor #13", + bits_range: (8, 15), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc14", + description: "Descriptor #14", + bits_range: (16, 23), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc15", + description: "Descriptor #15", + bits_range: (24, 30), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "edx_invalid", + description: "Descriptors 12-15 are invalid if set", + bits_range: (31, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + // ========================================================================================= + // Deterministic Cache Parameters + // ========================================================================================= + ( + Parameters { + leaf: 0x4, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "cache_type", + description: "Cache type field", + bits_range: (0, 4), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "cache_level", + description: "Cache level (1-based)", + bits_range: (5, 7), + policy: ProfilePolicy::Passthrough, + }, + // TODO: Could there be a problem migrating from a CPU with self-initializing cache to one without? 
+ ValueDefinition { + short: "cache_self_init", + description: "Self-initializing cache level", + bits_range: (8, 8), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "fully_associative", + description: "Fully-associative cache", + bits_range: (9, 9), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "num_threads_sharing", + description: "Number logical CPUs sharing this cache", + bits_range: (14, 25), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "num_cores_on_die", + description: "Number of cores in the physical package", + bits_range: (26, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x4, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "cache_linesize", + description: "System coherency line size (0-based)", + bits_range: (0, 11), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "cache_npartitions", + description: "Physical line partitions (0-based)", + bits_range: (12, 21), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "cache_nways", + description: "Ways of associativity (0-based)", + bits_range: (22, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x4, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cache_nsets", + description: "Cache number of sets (0-based)", + bits_range: (0, 30), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x4, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "wbinvd_rll_no_guarantee", + description: "WBINVD/INVD not guaranteed for Remote Lower-Level caches", + bits_range: (0, 0), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: 
"ll_inclusive", + description: "Cache is inclusive of Lower-Level caches", + bits_range: (1, 1), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "complex_indexing", + description: "Not a direct-mapped cache (complex function)", + bits_range: (2, 2), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + // ========================================================================================= + // MONITOR/MWAIT + // ========================================================================================= + ( + Parameters { + leaf: 0x5, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "min_mon_size", + description: "Smallest monitor-line size, in bytes", + bits_range: (0, 15), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x5, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "max_mon_size", + description: "Largest monitor-line size, in bytes", + bits_range: (0, 15), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x5, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "mwait_ext", + description: "Enumeration of MONITOR/MWAIT extensions is supported", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "mwait_irq_break", + description: "Interrupts as a break-event for MWAIT is supported", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x5, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "n_c0_substates", + description: "Number of C0 sub C-states supported using MWAIT", + bits_range: (0, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "n_c1_substates", + description: 
"Number of C1 sub C-states supported using MWAIT", + bits_range: (4, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "n_c2_substates", + description: "Number of C2 sub C-states supported using MWAIT", + bits_range: (8, 11), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "n_c3_substates", + description: "Number of C3 sub C-states supported using MWAIT", + bits_range: (12, 15), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "n_c4_substates", + description: "Number of C4 sub C-states supported using MWAIT", + bits_range: (16, 19), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "n_c5_substates", + description: "Number of C5 sub C-states supported using MWAIT", + bits_range: (20, 23), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "n_c6_substates", + description: "Number of C6 sub C-states supported using MWAIT", + bits_range: (24, 27), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "n_c7_substates", + description: "Number of C7 sub C-states supported using MWAIT", + bits_range: (28, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // ========================================================================================= + // Thermal and Power Management + // ========================================================================================= + ( + Parameters { + leaf: 0x6, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "dtherm", + description: "Digital temperature sensor", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "turbo_boost", + description: "Intel Turbo Boost", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "arat", + description: "Always-Running APIC Timer (not affected by p-state)", + bits_range: (2, 2), + // The timer is emulated by KVM 
and thus always always-running :) + policy: ProfilePolicy::Static(1), + }, + ValueDefinition { + short: "pln", + description: "Power Limit Notification (PLN) event", + bits_range: (4, 4), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "ecmd", + description: "Clock modulation duty cycle extension", + bits_range: (5, 5), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "pts", + description: "Package thermal management", + bits_range: (6, 6), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hwp", + description: "HWP (Hardware P-states) base registers are supported", + bits_range: (7, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hwp_notify", + description: "HWP notification (IA32_HWP_INTERRUPT MSR)", + bits_range: (8, 8), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hwp_act_window", + description: "HWP activity window (IA32_HWP_REQUEST[bits 41:32]) supported", + bits_range: (9, 9), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hwp_epp", + description: "HWP Energy Performance Preference", + bits_range: (10, 10), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hwp_pkg_req", + description: "HWP Package Level Request", + bits_range: (11, 11), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hdc_base_regs", + description: "HDC base registers are supported", + bits_range: (13, 13), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "turbo_boost_3_0", + description: "Intel Turbo Boost Max 3.0", + bits_range: (14, 14), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hwp_capabilities", + description: "HWP Highest Performance change", + bits_range: (15, 15), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hwp_peci_override", + description: "HWP PECI override", + bits_range: (16, 16), + policy: ProfilePolicy::Static(0), + }, + 
ValueDefinition { + short: "hwp_flexible", + description: "Flexible HWP", + bits_range: (17, 17), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hwp_fast", + description: "IA32_HWP_REQUEST MSR fast access mode", + bits_range: (18, 18), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hfi", + description: "HW_FEEDBACK MSRs supported", + bits_range: (19, 19), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hwp_ignore_idle", + description: "Ignoring idle logical CPU HWP req is supported", + bits_range: (20, 20), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "thread_director", + description: "Intel thread director support", + bits_range: (23, 23), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "therm_interrupt_bit25", + description: "IA32_THERM_INTERRUPT MSR bit 25 is supported", + bits_range: (24, 24), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x6, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "n_therm_thresholds", + description: "Digital thermometer thresholds", + bits_range: (0, 3), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x6, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + // MSR related + ValueDefinition { + short: "aperfmperf", + description: "MPERF/APERF MSRs (effective frequency interface)", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + // MSR related + ValueDefinition { + short: "epb", + description: "IA32_ENERGY_PERF_BIAS MSR support", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "thrd_director_nclasses", + description: "Number of classes, Intel thread director", + bits_range: (8, 15), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x6, + sub_leaf: 
RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "perfcap_reporting", + description: "Performance capability reporting", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "encap_reporting", + description: "Energy efficiency capability reporting", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "feedback_sz", + description: "Feedback interface structure size, in 4K pages", + bits_range: (8, 11), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "this_lcpu_hwfdbk_idx", + description: "This logical CPU hardware feedback interface index", + bits_range: (16, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Structured Extended Feature Flags Enumeration Main Leaf + // =================================================================================================================== + ( + Parameters { + leaf: 0x7, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "leaf7_n_subleaves", + description: "Number of leaf 0x7 subleaves", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x7, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "fsgsbase", + description: "FSBASE/GSBASE read/write support", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "tsc_adjust", + description: "IA32_TSC_ADJUST MSR supported", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + // SGX is deprecated so we disable it unconditionally for all CPU profiles + ValueDefinition { + short: "sgx", + description: "Intel SGX (Software Guard 
Extensions)", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "bmi1", + description: "Bit manipulation extensions group 1", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit, + }, + // TSX related which is riddled with CVEs. Consider two profiles, or making it opt-in/out. QEMU always has a CPU model with and without TSX. + ValueDefinition { + short: "hle", + description: "Hardware Lock Elision", + bits_range: (4, 4), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "avx2", + description: "AVX2 instruction set", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit, + }, + /*The KVM docs recommend always setting this (https://docs.kernel.org/virt/kvm/x86/errata.html#kvm-get-supported-cpuid-issues). + + Keep in mind however that in my limited understanding this isn't about enabling or disabling a feature, but it describes critical behaviour. + Hence I am wondering whether it should be a hard error if the host does not have this bit set, but the desired CPU profile does? + + TODO: Check what KVM_GET_SUPPORTED_CPUID actually gives here (on the Skylake server) + */ + ValueDefinition { + short: "fdp_excptn_only", + description: "FPU Data Pointer updated only on x87 exceptions", + bits_range: (6, 6), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "smep", + description: "Supervisor Mode Execution Protection", + bits_range: (7, 7), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "bmi2", + description: "Bit manipulation extensions group 2", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "erms", + description: "Enhanced REP MOVSB/STOSB", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit, + }, + /* + The instruction enabled by this seems rather powerful. Are we sure that doesn't have security implications? + I included this because it seems like QEMU does (to the best of my understanding). 
+ */ + ValueDefinition { + short: "invpcid", + description: "INVPCID instruction (Invalidate Processor Context ID)", + bits_range: (10, 10), + policy: ProfilePolicy::Inherit, + }, + // This is TSX related. TSX is riddled with CVEs: Consider two profiles (one with it disabled) or an opt-in/out feature. + ValueDefinition { + short: "rtm", + description: "Intel restricted transactional memory", + bits_range: (11, 11), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "rdt_m", + description: "Supports Intel Resource Director Technology Monitoring Capability if 1", + bits_range: (12, 12), + policy: ProfilePolicy::Static(0), + }, + // The KVM docs recommend always setting this (https://docs.kernel.org/virt/kvm/x86/errata.html#kvm-get-supported-cpuid-issues). TODO: Is it OK to just set this to 1? + ValueDefinition { + short: "zero_fcs_fds", + description: "Deprecates FPU CS and FPU DS values if 1", + bits_range: (13, 13), + policy: ProfilePolicy::Passthrough, + }, + // This has been deprecated + ValueDefinition { + short: "mpx", + description: "Intel memory protection extensions", + bits_range: (14, 14), + policy: ProfilePolicy::Static(0), + }, + // This might be useful for certain high performance applications, but it also seems like a rather niche and advanced feature. QEMU does also not automatically enable this from what we can tell. + // TODO: Should we make this OPT-IN? + ValueDefinition { + short: "rdt_a", + description: "Intel RDT-A. Supports Intel Resource Director Technology Allocation Capability if 1", + bits_range: (15, 15), + policy: ProfilePolicy::Static(0), + }, + // TODO: Do the wider avx512 zmm registers work out of the box when the hardware supports it? 
+ ValueDefinition { + short: "avx512f", + description: "AVX-512 foundation instructions", + bits_range: (16, 16), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512dq", + description: "AVX-512 double/quadword instructions", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "rdseed", + description: "RDSEED instruction", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "adx", + description: "ADCX/ADOX instructions", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "smap", + description: "Supervisor mode access prevention", + bits_range: (20, 20), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512ifma", + description: "AVX-512 integer fused multiply add", + bits_range: (21, 21), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "clflushopt", + description: "CLFLUSHOPT instruction", + bits_range: (23, 23), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "clwb", + description: "CLWB instruction", + bits_range: (24, 24), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "intel_pt", + description: "Intel processor trace", + bits_range: (25, 25), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "avx512pf", + description: "AVX-512 prefetch instructions", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512er", + description: "AVX-512 exponent/reciprocal instructions", + bits_range: (27, 27), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512cd", + description: "AVX-512 conflict detection instructions", + bits_range: (28, 28), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "sha_ni", + description: "SHA/SHA256 instructions", + bits_range: (29, 29), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512bw", + 
description: "AVX-512 byte/word instructions", + bits_range: (30, 30), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512vl", + description: "AVX-512 VL (128/256 vector length) extensions", + bits_range: (31, 31), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0x7, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "prefetchwt1", + description: "PREFETCHWT1 (Intel Xeon Phi only)", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "avx512vbmi", + description: "AVX-512 Vector byte manipulation instructions", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + // Also set by QEMU for CPU models from what we can tell + ValueDefinition { + short: "umip", + description: "User mode instruction protection", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + // TODO: This is however set by QEMU for CPU models from what we can tell? + ValueDefinition { + short: "pku", + description: "Protection keys for user-space", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + // NOTE: This field is mutable in principle and can be changed by the OS (TODO: Under which circumstances?) + ValueDefinition { + short: "ospke", + description: "OS protection keys enable", + bits_range: (4, 4), + policy: ProfilePolicy::Static(0), + }, + // TODO: Revisit this decision. 
Setting this to 0 for now in order to be compatible with QEMU + ValueDefinition { + short: "waitpkg", + description: "WAITPKG instructions", + bits_range: (5, 5), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "avx512_vbmi2", + description: "AVX-512 vector byte manipulation instructions group 2", + bits_range: (6, 6), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "cet_ss", + description: "CET shadow stack features", + bits_range: (7, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "gfni", + description: "Galois field new instructions", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "vaes", + description: "Vector AES instructions", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "vpclmulqdq", + description: "VPCLMULQDQ 256-bit instruction support", + bits_range: (10, 10), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512_vnni", + description: "Vector neural network instructions", + bits_range: (11, 11), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512_bitalg", + description: "AVX-512 bitwise algorithms", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit, + }, + // Seems to be TDX related which is experimental in CHV. We disable this for CPU profiles for now, but could potentially add it as an opt-in feature eventually. 
+ ValueDefinition { + short: "tme", + description: "Intel total memory encryption", + bits_range: (13, 13), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "avx512_vpopcntdq", + description: "AVX-512: POPCNT for vectors of DWORD/QWORD", + bits_range: (14, 14), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "la57", + description: "57-bit linear addresses (five-level paging)", + bits_range: (16, 16), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "mawau_val_lm", + description: "BNDLDX/BNDSTX MAWAU value in 64-bit mode", + bits_range: (17, 21), + policy: ProfilePolicy::Static(0), + }, + // MSR related + ValueDefinition { + short: "rdpid", + description: "RDPID instruction", + bits_range: (22, 22), + policy: ProfilePolicy::Inherit, + }, + // We leave key locker support out for CPU profiles for the time being. We may want this to be opt-in in the future though + ValueDefinition { + short: "key_locker", + description: "Intel key locker support", + bits_range: (23, 23), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "bus_lock_detect", + description: "OS bus-lock detection", + bits_range: (24, 24), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "cldemote", + description: "CLDEMOTE instruction", + bits_range: (25, 25), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "movdiri", + description: "MOVDIRI instruction", + bits_range: (27, 27), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "movdir64b", + description: "MOVDIR64B instruction", + bits_range: (28, 28), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "enqcmd", + description: "Enqueue stores supported (ENQCMD{,S})", + bits_range: (29, 29), + policy: ProfilePolicy::Static(0), + }, + // SGX support is deprecated so we disable it unconditionally for CPU profiles + ValueDefinition { + short: "sgx_lc", + description: "Intel SGX launch configuration", + 
bits_range: (30, 30), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "pks", + description: "Protection keys for supervisor-mode pages", + bits_range: (31, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x7, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + // SGX is deprecated + ValueDefinition { + short: "sgx_keys", + description: "Intel SGX attestation services", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "avx512_4vnniw", + description: "AVX-512 neural network instructions (Intel Xeon Phi only)", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "avx512_4fmaps", + description: "AVX-512 multiply accumulation single precision (Intel Xeon Phi only)", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "fsrm", + description: "Fast short REP MOV", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "uintr", + description: "CPU supports user interrupts", + bits_range: (5, 5), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "avx512_vp2intersect", + description: "VP2INTERSECT{D,Q} instructions", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "srdbs_ctrl", + description: "SRBDS mitigation MSR available: If 1, enumerates support for the IA32_MCU_OPT_CTRL MSR and indicates that its bit 0 (RNGDS_MITG_DIS) is also supported.", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "md_clear", + description: "VERW MD_CLEAR microcode support", + bits_range: (10, 10), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "rtm_always_abort", + description: "XBEGIN (RTM transaction) always aborts", + bits_range: (11, 11), + policy: ProfilePolicy::Static(0), + }, + 
ValueDefinition { + short: "tsx_force_abort", + description: "MSR TSX_FORCE_ABORT, RTM_ABORT bit, supported", + bits_range: (13, 13), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "serialize", + description: "SERIALIZE instruction", + bits_range: (14, 14), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "hybrid_cpu", + description: "The CPU is identified as a 'hybrid part'", + bits_range: (15, 15), + policy: ProfilePolicy::Inherit, + }, + // TODO: This is TSX related which is riddled with CVEs. We could consider an additional profile enabling TSX in the future, but we leave it out for now. + ValueDefinition { + short: "tsxldtrk", + description: "TSX suspend/resume load address tracking", + bits_range: (16, 16), + policy: ProfilePolicy::Static(0), + }, + // Might be relevant for confidential computing + ValueDefinition { + short: "pconfig", + description: "PCONFIG instruction", + bits_range: (18, 18), + policy: ProfilePolicy::Static(0), + }, + // MSR related + ValueDefinition { + short: "arch_lbr", + description: "Intel architectural LBRs", + bits_range: (19, 19), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "ibt", + description: "CET indirect branch tracking", + bits_range: (20, 20), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "amx_bf16", + description: "AMX-BF16: tile bfloat16 support", + bits_range: (22, 22), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512_fp16", + description: "AVX-512 FP16 instructions", + bits_range: (23, 23), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_tile", + description: "AMX-TILE: tile architecture support", + bits_range: (24, 24), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_int8", + description: "AMX-INT8: tile 8-bit integer support", + bits_range: (25, 25), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "spec_ctrl", + 
description: "Speculation Control (IBRS/IBPB: indirect branch restrictions)", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "intel_stibp", + description: "Single thread indirect branch predictors", + bits_range: (27, 27), + policy: ProfilePolicy::Inherit, + }, + // MSR related + // + // TODO: Is passthrough correct? + // If this bit is set then MSR IA32_FLUSH_CMD + // becomes available, otherwise it is not. + ValueDefinition { + short: "flush_l1d", + description: "FLUSH L1D cache: IA32_FLUSH_CMD MSR", + bits_range: (28, 28), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "arch_capabilities", + description: "Intel IA32_ARCH_CAPABILITIES MSR", + bits_range: (29, 29), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "core_capabilities", + description: "IA32_CORE_CAPABILITIES MSR", + bits_range: (30, 30), + policy: ProfilePolicy::Static(0), + }, + // MSR related + ValueDefinition { + short: "spec_ctrl_ssbd", + description: "Speculative store bypass disable", + bits_range: (31, 31), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + // =================================================================================================================== + // Structured Extended Feature Flags Enumeration Sub-Leaf 1 + // =================================================================================================================== + ( + Parameters { + leaf: 0x7, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "sha512", + description: "SHA-512 extensions", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "sm3", + description: "SM3 instructions", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "sm4", + description: "SM4 instructions", + bits_range: (2, 2), + policy: 
ProfilePolicy::Inherit, + }, + // RAO-INT is deprecated and removed from many compilers as far as we are aware. + // This policy can be changed if requested in the future. + ValueDefinition { + short: "RAO-INT", + description: "RAO-INT instructions", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "avx_vnni", + description: "AVX-VNNI instructions", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512_bf16", + description: "AVX-512 bfloat16 instructions", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit, + }, + /* + Not set in QEMU from what we can tell, but it seems to be fine to expose this to guests + if we understood https://www.phoronix.com/news/Intel-Linux-LASS-KVM correctly. It is also + our understanding that this feature can enable guests opting in to more security (possibly at the cost of some performance). + */ + ValueDefinition { + short: "lass", + description: "Linear address space separation", + bits_range: (6, 6), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "cmpccxadd", + description: "CMPccXADD instructions", + bits_range: (7, 7), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "arch_perfmon_ext", + description: "ArchPerfmonExt: leaf 0x23 is supported", + bits_range: (8, 8), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "fzrm", + description: "Fast zero-length REP MOVSB", + bits_range: (10, 10), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "fsrs", + description: "Fast short REP STOSB", + bits_range: (11, 11), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "fsrc", + description: "Fast Short REP CMPSB/SCASB", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "fred", + description: "FRED: Flexible return and event delivery transitions", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit, + }, + 
ValueDefinition { + short: "lkgs", + description: "LKGS: Load 'kernel' (userspace) GS", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "wrmsrns", + description: "WRMSRNS instruction (WRMSR-non-serializing)", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "nmi_src", + description: "NMI-source reporting with FRED event data", + bits_range: (20, 20), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "amx_fp16", + description: "AMX-FP16: FP16 tile operations", + bits_range: (21, 21), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "hreset", + description: "History reset support", + bits_range: (22, 22), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "avx_ifma", + description: "Integer fused multiply add", + bits_range: (23, 23), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "lam", + description: "Linear address masking", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "rd_wr_msrlist", + description: "RDMSRLIST/WRMSRLIST instructions", + bits_range: (27, 27), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "invd_disable_post_bios_done", + description: "If 1, supports INVD execution prevention after BIOS Done", + bits_range: (30, 30), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "movrs", + description: "MOVRS", + bits_range: (31, 31), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0x7, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "intel_ppin", + description: "Protected processor inventory number (PPIN{,_CTL} MSRs)", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + // MSR related + ValueDefinition { + short: "pbndkb", + description: "PBNDKB instruction supported and enumerates the 
existence of the IA32_TSE_CAPABILITY MSR", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x7, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "asymmetric-rdt-M", + description: "At least one logical processor supports Asymmetrical Intel RDT Monitoring Capability", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "asymmetric-rdt-A", + description: "At least one logical processor supports Asymmetrical Intel RDT Allocation Capability", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "MSR_IMM", + description: "Immediate forms of the RDMSR and WRMSRNS instructions are supported", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0x7, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "avx_vnni_int8", + description: "AVX-VNNI-INT8 instructions", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx_ne_convert", + description: "AVX-NE-CONVERT instructions", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit, + }, + // NOTE: AMX currently requires opt-in, even for the host CPU profile. We still inherit this value for profiles as the value will be zeroed out if the user has not opted in for "amx" via CpuFeatures. 
+ ValueDefinition { + short: "amx_complex", + description: "AMX-COMPLEX instructions (starting from Granite Rapids)", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx_vnni_int16", + description: "AVX-VNNI-INT16 instructions", + bits_range: (10, 10), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "utmr", + description: "If 1, supports user-timer events", + bits_range: (13, 13), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "prefetchit_0_1", + description: "PREFETCHIT0/1 instructions", + bits_range: (14, 14), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "user_msr", + description: "If 1, supports the URDMSR and UWRMSR instructions", + bits_range: (15, 15), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "uiret_uif", + description: "If 1, UIRET sets UIF to the value of bit 1 of the RFLAGS image loaded from the stack", + bits_range: (15, 15), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "cet_sss", + description: "CET supervisor shadow stacks safe to use", + bits_range: (18, 18), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "avx10", + description: "If 1, supports the Intel AVX10 instructions and indicates the presence of leaf 0x24", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "apx_f", + description: "If 1, the processor provides foundational support for Intel Advanced Performance Extensions", + bits_range: (21, 21), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "mwait", + description: "If 1, MWAIT is supported even if (0x1 ECX bit 3 (monitor) is enumerated as 0)", + bits_range: (23, 23), + policy: ProfilePolicy::Static(0), + }, + // MSR related + ValueDefinition { + short: "slsm", + description: "If 1, indicates bit 0 of the IA32_INTEGRITY_STATUS MSR is supported. 
Bit 0 of this MSR indicates whether static lockstep is active on this logical processor", + bits_range: (24, 24), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Structured Extended Feature Flags Enumeration Sub-Leaf 2 + // =================================================================================================================== + ( + Parameters { + leaf: 0x7, + sub_leaf: RangeInclusive::new(2, 2), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + // MSR related + ValueDefinition { + short: "intel_psfd", + description: "If 1, indicates bit 7 of the IA32_SPEC_CTRL_MSR is supported. Bit 7 of this MSR disables fast store forwarding predictor without disabling speculative store bypass", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "ipred_ctrl", + description: "MSR bits IA32_SPEC_CTRL.IPRED_DIS_{U,S}", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "rrsba_ctrl", + description: "MSR bits IA32_SPEC_CTRL.RRSBA_DIS_{U,S}", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "ddp_ctrl", + description: "MSR bit IA32_SPEC_CTRL.DDPD_U", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "bhi_ctrl", + description: "MSR bit IA32_SPEC_CTRL.BHI_DIS_S", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "mcdt_no", + description: "MCDT mitigation not needed", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "uclock_disable", + description: "UC-lock disable is supported", + bits_range: (6, 6), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + // 
=================================================================================================================== + // Direct Cache Access Information + // =================================================================================================================== + ( + Parameters { + leaf: 0x9, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + // MSR related + ValueDefinition { + short: "dca_cap_msr_value", + description: "Value of bits [31:0] of IA32_PLATFORM_DCA_CAP MSR (address 1f8H)", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Architectural Performance Monitoring + // =================================================================================================================== + // We will just zero out everything to do with PMU for CPU profiles + ( + Parameters { + leaf: 0xa, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "pmu_version", + description: "Performance monitoring unit version ID", + bits_range: (0, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "pmu_n_gcounters", + description: "Number of general PMU counters per logical CPU", + bits_range: (8, 15), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "pmu_gcounters_nbits", + description: "Bitwidth of PMU general counters", + bits_range: (16, 23), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "pmu_cpuid_ebx_bits", + description: "Length of leaf 0xa EBX bit vector", + bits_range: (24, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0xa, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "no_core_cycle_evt", + description: 
"Core cycle event not available", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "no_insn_retired_evt", + description: "Instruction retired event not available", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "no_refcycle_evt", + description: "Reference cycles event not available", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "no_llc_ref_evt", + description: "LLC-reference event not available", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "no_llc_miss_evt", + description: "LLC-misses event not available", + bits_range: (4, 4), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "no_br_insn_ret_evt", + description: "Branch instruction retired event not available", + bits_range: (5, 5), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "no_br_mispredict_evt", + description: "Branch mispredict retired event not available", + bits_range: (6, 6), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "no_td_slots_evt", + description: "Topdown slots event not available", + bits_range: (7, 7), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0xa, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "pmu_fcounters_bitmap", + description: "Fixed-function PMU counters support bitmap", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xa, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "pmu_n_fcounters", + description: "Number of fixed PMU counters", + bits_range: (0, 4), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "pmu_fcounters_nbits", + description: "Bitwidth of PMU fixed counters", + 
bits_range: (5, 12), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "anythread_depr", + description: "AnyThread deprecation", + bits_range: (15, 15), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Extended Topology Enumeration + // =================================================================================================================== + + // Leaf 0xB must be set by CHV itself (and do all necessary checks) + ( + Parameters { + leaf: 0xb, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "x2apic_id_shift", + description: "Bit width of this level (previous levels inclusive)", + bits_range: (0, 4), + policy: ProfilePolicy::Passthrough, + }]), + ), + // Set by VMM/user provided config + ( + Parameters { + leaf: 0xb, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "domain_lcpus_count", + description: "Logical CPUs count across all instances of this domain", + bits_range: (0, 15), + policy: ProfilePolicy::Passthrough, + }]), + ), + // Set by VMM/user provided config + ( + Parameters { + leaf: 0xb, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "domain_nr", + description: "This domain level (subleaf ID)", + bits_range: (0, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "domain_type", + description: "This domain type", + bits_range: (8, 15), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + // Set by VMM/user provided config + ( + Parameters { + leaf: 0xb, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "x2apic_id", + description: "x2APIC ID of 
current logical CPU", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + // =================================================================================================================== + // Processor Extended State Enumeration Main Leaf + // =================================================================================================================== + // TODO: Implement CPUID compatibility checks in CHV for this leaf + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "xcr0_x87", + description: "XCR0.X87 (bit 0) supported", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xcr0_sse", + description: "XCR0.SEE (bit 1) supported", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xcr0_avx", + description: "XCR0.AVX (bit 2) supported", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + // MPX is deprecated + ValueDefinition { + short: "xcr0_mpx_bndregs", + description: "XCR0.BNDREGS (bit 3) supported (MPX BND0-BND3 registers)", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + // MPX is deprecated + ValueDefinition { + short: "xcr0_mpx_bndcsr", + description: "XCR0.BNDCSR (bit 4) supported (MPX BNDCFGU/BNDSTATUS registers)", + bits_range: (4, 4), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "xcr0_avx512_opmask", + description: "XCR0.OPMASK (bit 5) supported (AVX-512 k0-k7 registers)", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xcr0_avx512_zmm_hi256", + description: "XCR0.ZMM_Hi256 (bit 6) supported (AVX-512 ZMM0->ZMM7/15 registers)", + bits_range: (6, 6), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xcr0_avx512_hi16_zmm", + description: "XCR0.HI16_ZMM (bit 7) supported (AVX-512 ZMM16->ZMM31 registers)", + bits_range: (7, 7), 
+ policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "xcr0_ia32_xss", + description: "XCR0.IA32_XSS (bit 8) used for PT in IA32_XSS", + bits_range: (8, 8), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "xcr0_pkru", + description: "XCR0.PKRU (bit 9) supported (XSAVE PKRU registers)", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xcr0_ia32_xss_pasid", + description: "XCR0.IA32_XSS (bit 10) used for PASID in IA32_XSS", + bits_range: (10, 10), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "xcr0_ia32_xss_cet", + description: "XCR0.IA32_XSS (bits 11 - 12) used for IA32_XSS", + bits_range: (11, 12), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "xcr0_ia32_xss_hdc", + description: "XCR0.IA32_XSS (bit 13) used for IA32_XSS", + bits_range: (13, 13), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "xcr0_ia32_xss_UINTR", + description: "XCR0.IA32_XSS (bit 14) used for UINTR in IA32_XSS", + bits_range: (14, 14), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "xcr0_ia32_xss_LBR", + description: "XCR0.IA32_XSS (bit 15) used for LBR in IA32_XSS", + bits_range: (15, 15), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "xcr0_ia32_xss_bits_hwp", + description: "XCR0.IA32_XSS (bit 16) used for HWP in IA32_XSS", + bits_range: (16, 16), + policy: ProfilePolicy::Static(0), + }, + // NOTE: AMX currently requires opt-in, even for the host CPU profile. We still inherit this value for profiles and modify this value at runtime if AMX is not enabled by the user. + ValueDefinition { + short: "xcr0_tileconfig", + description: "XCR0.TILECONFIG (bit 17) supported (AMX can manage TILECONFIG)", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit, + }, + // NOTE: AMX currently requires opt-in, even for the host CPU profile. 
We still inherit this value for profiles and modify this value at runtime if AMX is not enabled by the user. + ValueDefinition { + short: "xcr0_tiledata", + description: "XCR0.TILEDATA (bit 18) supported (AMX can manage TILEDATA)", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + // This value can be changed by the OS and must thus be passthrough + ValueDefinitions::new(&[ValueDefinition { + short: "xsave_sz_xcr0_enabled", + description: "XSAVE/XRSTOR area byte size, for XCR0 enabled features", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + // This may be passthrough because we restrict each individual state component + ValueDefinitions::new(&[ValueDefinition { + short: "xsave_sz_max", + description: "XSAVE/XRSTOR area max byte size, all CPU features", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + // TODO: Do we know of any state components corresponding to the upper bits in XCR0? Perhaps it would be + // better to have `ProfilePolicy::Static(0)` here? 
+ ValueDefinitions::new(&[ValueDefinition { + short: "xcr0_upper_bits", + description: "Reports the valid bit fields of the upper 32 bits of the XCR0 register", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + // =================================================================================================================== + // Processor Extended State Enumeration Sub-leaf 1 + // =================================================================================================================== + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "xsaveopt", + description: "XSAVEOPT instruction", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xsavec", + description: "XSAVEC instruction", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xgetbv1", + description: "XGETBV instruction with ECX = 1", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + // TODO: Can this have security implications in terms of supervisor state getting exposed? + ValueDefinition { + short: "xsaves", + description: "XSAVES/XRSTORS instructions (and XSS MSR)", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xfd", + description: "Extended feature disable support", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + /*NOTE: This will depend on which CPU features (in CHV) are enabled and pre-computation can potentially lead to a combinatorial explosion. 
Luckily we can deal with each component (and its size) separately, hence we can just passthrough whatever we get from the host here.*/ + ValueDefinition { + short: "xsave_sz_xcr0_xmms_enabled", + description: "XSAVE area size, all XCR0 and IA32_XSS features enabled", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::ECX, + }, + /* Reports the supported bits of the lower IA32_XSS MSR. IA32_XSS[n] can be set to 1 only if ECX[n] = 1*/ + ValueDefinitions::new(&[ + ValueDefinition { + short: "xcr0_7bits", + description: "Used for XCR0", + bits_range: (0, 7), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xss_pt", + description: "PT state, supported", + bits_range: (8, 8), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "xcr0_bit9", + description: "Used for XCR0", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xss_pasid", + description: "PASID state, supported", + bits_range: (10, 10), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "xss_cet_u", + description: "CET user state, supported", + bits_range: (11, 11), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "xss_cet_p", + description: "CET supervisor state, supported", + bits_range: (12, 12), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "xss_hdc", + description: "HDC state, supported", + bits_range: (13, 13), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "xss_uintr", + description: "UINTR state, supported", + bits_range: (14, 14), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "xss_lbr", + description: "LBR state, supported", + bits_range: (15, 15), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "xss_hwp", + description: "HWP state, supported", + bits_range: (16, 16), + policy: 
ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "xcr0_bits", + description: "Used for XCR0", + bits_range: (17, 18), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EDX, + }, + /* Reports the supported bits of the upper 32 bits of the IA32_XSS MSR. IA32_XSS[n + 32 ] can be set to 1 only if EDX[n] = 1*/ + ValueDefinitions::new(&[ValueDefinition { + short: "ia32_xss_upper", + description: " Reports the supported bits of the upper 32 bits of the IA32_XSS MSR. IA32_XSS[n + 32 ] can be set to 1 only if EDX[n] = 1", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + // =================================================================================================================== + // Processor Extended State Enumeration Sub-leaves + // =================================================================================================================== + + /* LEAF 0xd sub-leaf n >=2 : + If ECX contains an invalid sub-leaf index, EAX/EBX/ECX/EDX return 0. Sub-leaf n (0 ≤ n ≤ 31) is + invalid if sub-leaf 0 returns 0 in EAX[n] and sub-leaf 1 returns 0 in ECX[n]. Sub-leaf n (32 ≤ n ≤ 63) + is invalid if sub-leaf 0 returns 0 in EDX[n-32] and sub-leaf 1 returns 0 in EDX[n-32]. 
+ */ + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(2, 2), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "xsave_sz", + description: "Size of save area for subleaf-N feature, in bytes", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(2, 2), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "xsave_offset", + description: "Offset of save area for subleaf-N feature, in bytes", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(2, 2), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "is_xss_bit", + description: "Subleaf N describes an XSS bit, otherwise XCR0 bit", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "compacted_xsave_64byte_aligned", + description: "When compacted, subleaf-N feature XSAVE area is 64-byte aligned", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xfd_faulting", + description: "Indicates support for xfd faulting", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + // Intel MPX is deprecated hence we zero out these sub-leaves + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(3, 4), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-3-4-eax-mpx-zero", + description: "This leaf has been zeroed out because MPX state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(3, 4), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-3-4-ebx-mpx-zero", + description: "This leaf has been zeroed out because MPX state components are disabled", + bits_range: (0, 31), + 
policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(3, 4), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-3-4-ecx-mpx-zero", + description: "This leaf has been zeroed out because MPX state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(3, 4), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-3-4-edx-mpx-zero", + description: "This leaf has been zeroed out because MPX state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(5, 7), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "xsave_sz", + description: "Size of save area for subleaf-N feature, in bytes", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(5, 7), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "xsave_offset", + description: "Offset of save area for subleaf-N feature, in bytes", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(5, 7), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "is_xss_bit", + description: "Subleaf N describes an XSS bit, otherwise XCR0 bit", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "compacted_xsave_64byte_aligned", + description: "When compacted, subleaf-N feature XSAVE area is 64-byte aligned", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xfd_faulting", + description: "Indicates support for xfd faulting", + bits_range: (2, 2), + policy: 
ProfilePolicy::Inherit, + }, + ]), + ), + // Disable PT for CPU profiles + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(8, 8), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-8-eax-pt-zero", + description: "This leaf has been zeroed out because PT state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(8, 8), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-8-ebx-pt-zero", + description: "This leaf has been zeroed out because PT state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(8, 8), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-8-ecx-pt-zero", + description: "This leaf has been zeroed out because PT state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(8, 8), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-8-edx-pt-zero", + description: "This leaf has been zeroed out because PT state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(9, 9), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "xsave_sz", + description: "Size of save area for subleaf-N feature, in bytes", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(9, 9), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "xsave_offset", + description: "Offset of save area for subleaf-N feature, in bytes", + bits_range: (0, 31), + 
policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(9, 9), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "is_xss_bit", + description: "Subleaf N describes an XSS bit, otherwise XCR0 bit", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "compacted_xsave_64byte_aligned", + description: "When compacted, subleaf-N feature XSAVE area is 64-byte aligned", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xfd_faulting", + description: "Indicates support for xfd faulting", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + // Disable PASID for CPU profiles + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(10, 10), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-10-eax-pasid-zero", + description: "This leaf has been zeroed out because PASID state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(10, 10), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-10-ebx-pasid-zero", + description: "This leaf has been zeroed out because PASID state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(10, 10), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-10-ecx-pasid-zero", + description: "This leaf has been zeroed out because PASID state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(10, 10), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-10-edx-pasid-zero", + description: 
"This leaf has been zeroed out because PASID state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + // We leave CET out of CPU profiles for the time being + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(11, 12), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-11-12-eax-cet-zero", + description: "This leaf has been zeroed out because CET state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(11, 12), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-11-12-ebx-cet-zero", + description: "This leaf has been zeroed out because CET state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(11, 12), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-11-12-ecx-cet-zero", + description: "This leaf has been zeroed out because CET state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(11, 12), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-11-12-edx-cet-zero", + description: "This leaf has been zeroed out because CET state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + // Disable HDC for CPU profiles + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(13, 13), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-13-eax-edc-zero", + description: "This leaf has been zeroed out because CET state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + 
sub_leaf: RangeInclusive::new(13, 13), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-13-ebx-hdc-zero", + description: "This leaf has been zeroed out because CET state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(13, 13), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-13-ecx-hdc-zero", + description: "This leaf has been zeroed out because CET state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(13, 13), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-13-edx-hdc-zero", + description: "This leaf has been zeroed out because CET state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + // We decided to disable UINTR for CPU profiles, hence we zero out these sub-leaves + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(14, 14), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-eax-uintr-zero", + description: "This leaf has been zeroed out because UINTR state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(14, 14), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-ebx-uintr-zero", + description: "This leaf has been zeroed out because UINTR state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(14, 14), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-ecx-uintr-zero", + description: "This leaf has been zeroed out because 
UINTR state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(14, 14), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-edx-uintr-zero", + description: "This leaf has been zeroed out because UINTR state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + // Disable LBR for CPU Profiles + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(15, 15), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-eax-lbr-zero", + description: "This leaf has been zeroed out because LBR state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(15, 15), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-ebx-lbr-zero", + description: "This leaf has been zeroed out because LBR state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(15, 15), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-ecx-lbr-zero", + description: "This leaf has been zeroed out because LBR state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(15, 15), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-edx-lbr-zero", + description: "This leaf has been zeroed out because LBR state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + // Disable HWP for CPU profiles + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(16, 16), + register: CpuidReg::EAX, + }, + 
ValueDefinitions::new(&[ValueDefinition { + short: "0xd-eax-hwp-zero", + description: "This leaf has been zeroed out because HWP state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(16, 16), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-ebx-hwp-zero", + description: "This leaf has been zeroed out because HWP state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(16, 16), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-ecx-hwp-zero", + description: "This leaf has been zeroed out because HWP state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(16, 16), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-edx-hwp-zero", + description: "This leaf has been zeroed out because HWP state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + // NOTE: Sub-leaves 17 & 18 are AMX related and we will alter the adjustments corresponding to + // the policy declared here at runtime for those values. 
+ ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(17, 63), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "xsave_sz", + description: "Size of save area for subleaf-N feature, in bytes", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(17, 63), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "xsave_offset", + description: "Offset of save area for subleaf-N feature, in bytes", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(17, 63), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "is_xss_bit", + description: "Subleaf N describes an XSS bit, otherwise XCR0 bit", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "compacted_xsave_64byte_aligned", + description: "When compacted, subleaf-N feature XSAVE area is 64-byte aligned", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xfd_faulting", + description: "Indicates support for xfd faulting", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + // =================================================================================================================== + // Intel Resource Director Technology Monitoring Enumeration + // =================================================================================================================== + ( + Parameters { + leaf: 0xf, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "core_rmid_max", + description: "RMID max, within this core, all types (0-based)", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xf, + sub_leaf: RangeInclusive::new(0, 0), + register: 
CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "l3-cache-rdt-monitoring", + description: "Supports L3 Cache Intel RDT Monitoring if 1", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }]), + ), + // =================================================================================================================== + // Intel Resource Director Technology Monitoring Enumeration Sub-leaf 1 + // =================================================================================================================== + ( + Parameters { + leaf: 0xf, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "l3c_qm_bitwidth", + description: "L3 QoS-monitoring counter bitwidth (24-based)", + bits_range: (0, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "l3c_qm_overflow_bit", + description: "QM_CTR MSR bit 61 is an overflow bit", + bits_range: (8, 8), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "l3c_qm_non_cpu_agent", + description: "If 1, indicates the presence of non-CPU agent Intel RDT CTM support", + bits_range: (9, 9), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "l3c_qm_non_cpu_agent", + description: "If 1, indicates the presence of non-CPU agent Intel RDT MBM support", + bits_range: (10, 10), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0xf, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "l3c_qm_conver_factor", + description: "QM_CTR MSR conversion factor to bytes", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xf, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "l3c_qm_rmid_max", + description: "L3 QoS-monitoring max RMID", + bits_range: (0, 31), + policy: 
ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xf, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "cqm_occup_llc", + description: "L3 QoS occupancy monitoring supported", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "cqm_mbm_total", + description: "L3 QoS total bandwidth monitoring supported", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "cqm_mbm_local", + description: "L3 QoS local bandwidth monitoring supported", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Intel Resource Director Technology Allocation Enumeration + // =================================================================================================================== + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + //TODO: These features may be good for increased performance. Perhaps there needs to be some mechanism to opt-in for non-host CPU profiles? 
+ ValueDefinitions::new(&[ + ValueDefinition { + short: "cat_l3", + description: "L3 Cache Allocation Technology supported", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "cat_l2", + description: "L2 Cache Allocation Technology supported", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "mba", + description: "Memory Bandwidth Allocation supported", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Intel Resource Director Technology Allocation Enumeration Sub-leaf (ECX = ResID = 1) + // =================================================================================================================== + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cat_cbm_len", + description: "L3_CAT capacity bitmask length, minus-one notation", + bits_range: (0, 4), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cat_units_bitmap", + description: "L3_CAT bitmap of allocation units", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::ECX, + }, + //TODO: These features may be good for increased performance. Perhaps there needs to be some mechanism to opt-in for non-host CPU profiles? 
+ ValueDefinitions::new(&[ + ValueDefinition { + short: "l3_cat_non_cpu_agents", + description: "L3_CAT for non-CPU agent is supported", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "cdp_l3", + description: "L3/L2_CAT CDP (Code and Data Prioritization)", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "cat_sparse_1s", + description: "L3/L2_CAT non-contiguous 1s value supported", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EDX, + }, + // TODO: We might need some way to opt in to use Intel cache allocation technology in guests with non-host CPU profiles. + ValueDefinitions::new(&[ValueDefinition { + short: "cat_cos_max", + description: "Highest COS number supported for this ResID", + bits_range: (0, 15), + policy: ProfilePolicy::Static(0), + }]), + ), + // =================================================================================================================== + // Intel Resource Director Technology Allocation Enumeration Sub-leaf (ECX = ResID = 2) + // =================================================================================================================== + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(2, 2), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cat_cbm_len", + description: "L2_CAT capacity bitmask length, minus-one notation", + bits_range: (0, 4), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(2, 2), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cat_units_bitmap", + description: "L2_CAT bitmap of allocation units", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(2, 2), + 
register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cat_cos_max", + description: "Highest COS number supported for this ResID", + bits_range: (0, 15), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(2, 2), + register: CpuidReg::ECX, + }, + // TODO: We might need some way to opt in to use Intel cache allocation technology in guests with non-host CPU profiles. + ValueDefinitions::new(&[ + ValueDefinition { + short: "cdp_l2", + description: "L2_CAT CDP (Code and Data Prioritization)", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "cat_sparse_1s", + description: "L2_CAT non-contiguous 1s value supported", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Intel Resource Director Technology Allocation Enumeration Sub-leaf (ECX = ResID = 3) + // =================================================================================================================== + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(3, 3), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + // TODO: We might need some way to opt in to use Intel MBA technology in guests with non-host CPU profiles. 
+ ValueDefinition { + short: "mba_max_delay", + description: "Max MBA throttling value; minus-one notation", + bits_range: (0, 11), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(3, 3), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "per_thread_mba", + description: "Per-thread MBA controls are supported", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "mba_delay_linear", + description: "Delay values are linear", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(3, 3), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "mba_cos_max", + description: "MBA max Class of Service supported", + bits_range: (0, 15), + policy: ProfilePolicy::Static(0), + }]), + ), + // =================================================================================================================== + // Intel Resource Director Technology Allocation Enumeration Sub-leaf (ECX = ResID = 5) + // =================================================================================================================== + // + // TODO: We may want to have some way to opt-in to use Intel RDT for guests with non-host CPU profiles. + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(5, 5), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "core_max_throttle", + description: "Max Core throttling level supported by the corresponding ResID", + bits_range: (0, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "core_scope", + description: "If 1, indicates the logical processor scope of the IA32_QoS_Core_BW_Thrtl_n MSRs. 
Other values are reserved", + bits_range: (8, 11), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(5, 5), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cba_delay_linear", + description: "The response of the bandwidth control is approximately linear", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(5, 5), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "core_cos_max", + description: "Core max Class of Service supported", + bits_range: (0, 15), + policy: ProfilePolicy::Static(0), + }]), + ), + // SGX is already disabled and deprecated so we don't need to worry about leaf 0x12 and its subleaves + + // =================================================================================================================== + // Intel Processor Trace Enumeration Main Leaf + // =================================================================================================================== + ( + Parameters { + leaf: 0x14, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "pt_max_subleaf", + description: "Maximum leaf 0x14 subleaf", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x14, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "cr3_filtering", + description: "IA32_RTIT_CR3_MATCH is accessible", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "psb_cyc", + description: "Configurable PSB and cycle-accurate mode", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "ip_filtering", + description: "IP/TraceStop filtering; Warm-reset PT MSRs preservation", + 
bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "mtc_timing", + description: "MTC timing packet; COFI-based packets suppression", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "ptwrite", + description: "PTWRITE support", + bits_range: (4, 4), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "power_event_trace", + description: "Power Event Trace support", + bits_range: (5, 5), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "psb_pmi_preserve", + description: "PSB and PMI preservation support", + bits_range: (6, 6), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "event_trace", + description: "Event Trace packet generation through IA32_RTIT_CTL.EventEn", + bits_range: (7, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "tnt_disable", + description: "TNT packet generation disable through IA32_RTIT_CTL.DisTNT", + bits_range: (8, 8), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x14, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "topa_output", + description: "ToPA output scheme support", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "topa_multiple_entries", + description: "ToPA tables can hold multiple entries", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "single_range_output", + description: "Single-range output scheme supported", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "trance_transport_output", + description: "Trace Transport subsystem output support", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "ip_payloads_lip", + description: "IP payloads have LIP values (CS base included)", + 
bits_range: (31, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Intel Processor Trace Enumeration Sub-leaf 1 + // =================================================================================================================== + ( + Parameters { + leaf: 0x14, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "num_address_ranges", + description: "Filtering number of configurable Address Ranges", + bits_range: (0, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "mtc_periods_bmp", + description: "Bitmap of supported MTC period encodings", + bits_range: (16, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x14, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "cycle_thresholds_bmp", + description: "Bitmap of supported Cycle Threshold encodings", + bits_range: (0, 15), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "psb_periods_bmp", + description: "Bitmap of supported Configurable PSB frequency encodings", + bits_range: (16, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Time Stamp Counter and Core Crystal Clock Information + // =================================================================================================================== + ( + Parameters { + leaf: 0x15, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "tsc_denominator", + description: "Denominator of the TSC/'core crystal clock' ratio", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + 
Parameters { + leaf: 0x15, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "tsc_numerator", + description: "Numerator of the TSC/'core crystal clock' ratio", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x15, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_crystal_hz", + description: "Core crystal clock nominal frequency, in Hz", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + // =================================================================================================================== + // Processor Frequency Information + // =================================================================================================================== + ( + Parameters { + leaf: 0x16, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_base_mhz", + description: "Processor base frequency, in MHz", + bits_range: (0, 15), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x16, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_max_mhz", + description: "Processor max frequency, in MHz", + bits_range: (0, 15), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x16, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "bus_mhz", + description: "Bus reference frequency, in MHz", + bits_range: (0, 15), + policy: ProfilePolicy::Passthrough, + }]), + ), + // =================================================================================================================== + // System-On-Chip Vendor Attribute Enumeration Main Leaf + // 
=================================================================================================================== + + // System-On-Chip should probably not be supported for CPU profiles for the foreseeable future. + ( + Parameters { + leaf: 0x17, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "soc_max_subleaf", + description: "Maximum leaf 0x17 subleaf", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + // =================================================================================================================== + // Deterministic Address Translation Parameters + // =================================================================================================================== + ( + Parameters { + leaf: 0x18, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "tlb_max_subleaf", + description: "Maximum leaf 0x18 subleaf", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x18, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "tlb_4k_page", + description: "TLB 4KB-page entries supported", + bits_range: (0, 0), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "tlb_2m_page", + description: "TLB 2MB-page entries supported", + bits_range: (1, 1), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "tlb_4m_page", + description: "TLB 4MB-page entries supported", + bits_range: (2, 2), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "tlb_1g_page", + description: "TLB 1GB-page entries supported", + bits_range: (3, 3), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "hard_partitioning", + description: "(Hard/Soft) partitioning between logical CPUs sharing this structure", 
+ bits_range: (8, 10), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "n_way_associative", + description: "Ways of associativity", + bits_range: (16, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x18, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "n_sets", + description: "Number of sets", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x18, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "tlb_type", + description: "Translation cache type (TLB type)", + bits_range: (0, 4), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "tlb_cache_level", + description: "Translation cache level (1-based)", + bits_range: (5, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "is_fully_associative", + description: "Fully-associative structure", + bits_range: (8, 8), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "tlb_max_addressable_ids", + description: "Max number of addressable IDs for logical CPUs sharing this TLB - 1", + bits_range: (14, 25), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + // We don't support key locker for now (leaf 0x19), hence we zero out leaf 0x19 for CPU profiles. We zero leaf + // 0x1A (Native Model ID Enumeration) out for CPU profiles. Leaf 0x1B (PCONFIG) is zeroed out for CPU profiles + // for now. + + // =================================================================================================================== + // Last Branch Records Information + // =================================================================================================================== + ( + Parameters { + leaf: 0x1c, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, 
ValueDefinitions::new(&[ + ValueDefinition { + short: "lbr_depth_8", + description: "Max stack depth (number of LBR entries) = 8", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_depth_16", + description: "Max stack depth (number of LBR entries) = 16", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_depth_24", + description: "Max stack depth (number of LBR entries) = 24", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_depth_32", + description: "Max stack depth (number of LBR entries) = 32", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_depth_40", + description: "Max stack depth (number of LBR entries) = 40", + bits_range: (4, 4), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_depth_48", + description: "Max stack depth (number of LBR entries) = 48", + bits_range: (5, 5), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_depth_56", + description: "Max stack depth (number of LBR entries) = 56", + bits_range: (6, 6), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_depth_64", + description: "Max stack depth (number of LBR entries) = 64", + bits_range: (7, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_deep_c_reset", + description: "LBRs maybe cleared on MWAIT C-state > C1", + bits_range: (30, 30), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_ip_is_lip", + description: "LBR IP contain Last IP, otherwise effective IP", + bits_range: (31, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x1c, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "lbr_cpl", + description: "CPL filtering (non-zero IA32_LBR_CTL[2:1]) supported", + 
bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_branch_filter", + description: "Branch filtering (non-zero IA32_LBR_CTL[22:16]) supported", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_call_stack", + description: "Call-stack mode (IA32_LBR_CTL[3] = 1) supported", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x1c, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "lbr_mispredict", + description: "Branch misprediction bit supported (IA32_LBR_x_INFO[63])", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_timed_lbr", + description: "Timed LBRs (CPU cycles since last LBR entry) supported", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_branch_type", + description: "Branch type field (IA32_LBR_INFO_x[59:56]) supported", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_events_gpc_bmp", + description: "LBR PMU-events logging support; bitmap for first 4 GP (general-purpose) Counters", + bits_range: (16, 19), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Tile Information Main Leaf + // =================================================================================================================== + // NOTE: AMX is opt-in, but there are no problems with inheriting these values. The CHV will take care of zeroing out the bits userspace applications should check for if the user did not opt-in to amx. 
+ ( + Parameters { + leaf: 0x1d, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "amx_max_palette", + description: "Highest palette ID / subleaf ID", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + // =================================================================================================================== + // Tile Palette 1 Sub-leaf + // =================================================================================================================== + // NOTE: AMX is opt-in, but there are no problems with inheriting these values. The CHV will take care of zeroing out the bits userspace applications should check for if the user did not opt-in to amx. + ( + Parameters { + leaf: 0x1d, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "amx_palette_size", + description: "AMX palette total tiles size, in bytes", + bits_range: (0, 15), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_tile_size", + description: "AMX single tile's size, in bytes", + bits_range: (16, 31), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0x1d, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "amx_tile_row_size", + description: "AMX tile single row's size, in bytes", + bits_range: (0, 15), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_palette_nr_tiles", + description: "AMX palette number of tiles", + bits_range: (16, 31), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0x1d, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "amx_tile_nr_rows", + description: "AMX tile max number of rows", + bits_range: (0, 15), + policy: ProfilePolicy::Inherit, + }]), + 
), + // =================================================================================================================== + // TMUL Information Main Leaf + // =================================================================================================================== + // NOTE: AMX is opt-in, but there are no problems with inheriting these values. The CHV will take care of zeroing out the bits userspace applications should check for if the user did not opt-in to amx. + ( + Parameters { + leaf: 0x1e, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "tmul_info_max", + description: "Reports the maximum number of sub-leaves that are supported in leaf 0x1e", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x1e, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "tmul_maxk", + description: "TMUL unit maximum height, K (rows or columns)", + bits_range: (0, 7), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "tmul_maxn", + description: "TMUL unit maximum SIMD dimension, N (column bytes)", + bits_range: (8, 23), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + // =================================================================================================================== + // TMUL Information Sub-leaf 1 + // =================================================================================================================== + // NOTE: AMX is opt-in, but there are no problems with inheriting these values. The CHV will take care of zeroing out the bits userspace applications should check for if the user did not opt-in to amx. + ( + Parameters { + leaf: 0x1e, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EAX, + }, + // NOTE: AMX currently requires opt-in, even for the host CPU profile. 
We still inherit this value for profiles as the relevant feature bits that userspace applications must check will be zeroed out if the user has not opted in for "amx" via CpuFeatures. + ValueDefinitions::new(&[ + ValueDefinition { + short: "amx_int8", + description: "If 1, the processor supports tile computational operations on 8-bit integers", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_bf16", + description: "If 1, the processor supports tile computational operations on bfloat16 numbers", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_complex", + description: "If 1, the processor supports the AMX-COMPLEX instructions", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_fp16", + description: "If 1, the processor supports tile computational operations on FP16 numbers", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_fp8", + description: "If 1, the processor supports tile computational operations on FP8 numbers", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_transpose", + description: "If 1, the processor supports the AMX-TRANSPOSE instructions", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_tf32", + description: "If 1, the processor supports the AMX-TF32 (FP19) instructions", + bits_range: (6, 6), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_avx512", + description: "If 1, the processor supports the AMX-AVX512 instructions", + bits_range: (7, 7), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_movrs", + description: "If 1, the processor supports the AMX-MOVRS instructions", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + // 
=================================================================================================================== + // V2 Extended Topology Enumeration + // =================================================================================================================== + + // The values in leaf 0x1f must be set by CHV itself. + ( + Parameters { + leaf: 0x1f, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "x2apic_id_shift", + description: "Bit width of this level (previous levels inclusive)", + bits_range: (0, 4), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x1f, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "domain_lcpus_count", + description: "Logical CPUs count across all instances of this domain", + bits_range: (0, 15), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x1f, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "domain_level", + description: "This domain level (subleaf ID)", + bits_range: (0, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "domain_type", + description: "This domain type", + bits_range: (8, 15), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x1f, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "x2apic_id", + description: "x2APIC ID of current logical CPU", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + // =================================================================================================================== + // Processor History Reset + // 
=================================================================================================================== + ( + Parameters { + leaf: 0x20, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "hreset_nr_subleaves", + description: "CPUID 0x20 max subleaf + 1", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x20, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "hreset_thread_director", + description: "HRESET of Intel thread director is supported", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }]), + ), + // =================================================================================================================== + // TDX + // =================================================================================================================== + + // TDX is not supported by CPU profiles for now. We just zero out this leaf for CPU profiles for the time being. 
+ ( + Parameters { + leaf: 0x21, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "tdx_vendorid_0", + description: "TDX vendor ID string bytes 0 - 3", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x21, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "tdx_vendorid_2", + description: "CPU vendor ID string bytes 8 - 11", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x21, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "tdx_vendorid_1", + description: "CPU vendor ID string bytes 4 - 7", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + // =================================================================================================================== + // Architectural Performance Monitoring Extended Main Leaf + // =================================================================================================================== + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "subleaf_0", + description: "If 1, subleaf 0 exists", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "subleaf_1", + description: "If 1, subleaf 1 exists", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "subleaf_2", + description: "If 1, subleaf 2 exists", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "subleaf_3", + description: "If 1, subleaf 3 exists", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "subleaf_4", + description: "If 1, subleaf 4 exists", + 
bits_range: (4, 4), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "subleaf_5", + description: "If 1, subleaf 5 exists. The processor supports Architectural PEBS. The IA32_PEBS_BASE and IA32_PEBS_INDEX MSRs exist", + bits_range: (5, 5), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "unitmask2", + description: "IA32_PERFEVTSELx MSRs UnitMask2 is supported", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "eq_bit", + description: "equal flag in the IA32_PERFEVTSELx MSR is supported", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "RDPMC_USR_DISABLE", + description: "RDPMC_USR_DISABLE", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "num_slots_per_cycle", + description: "Number of slots per cycle. 
This number can be multiplied by the number of cycles (from CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.CORE or IA32_FIXED_CTR1) to determine the total number of slots", + bits_range: (0, 7), + policy: ProfilePolicy::Static(0), + }]), + ), + // =================================================================================================================== + // Architectural Performance Monitoring Extended Sub-leaf 1 + // =================================================================================================================== + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "pmu_gp_counters_bitmap", + description: "General-purpose PMU counters bitmap", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "pmu_f_counters_bitmap", + description: "Fixed PMU counters bitmap", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + // =================================================================================================================== + // Architectural Performance Monitoring Extended Sub-leaf 2 + // =================================================================================================================== + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(2, 2), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "pmu_acr_bitmap", + description: "Bitmap of Auto Counter Reload (ACR) general-purpose counters that can be reloaded", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + // =================================================================================================================== + // Architectural Performance Monitoring Extended Sub-leaf 3 + // 
=================================================================================================================== + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(3, 3), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "core_cycles_evt", + description: "Core cycles event supported", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "insn_retired_evt", + description: "Instructions retired event supported", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "ref_cycles_evt", + description: "Reference cycles event supported", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "llc_refs_evt", + description: "Last-level cache references event supported", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "llc_misses_evt", + description: "Last-level cache misses event supported", + bits_range: (4, 4), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "br_insn_ret_evt", + description: "Branch instruction retired event supported", + bits_range: (5, 5), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "br_mispr_evt", + description: "Branch mispredict retired event supported", + bits_range: (6, 6), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "td_slots_evt", + description: "Topdown slots event supported", + bits_range: (7, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "td_backend_bound_evt", + description: "Topdown backend bound event supported", + bits_range: (8, 8), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "td_bad_spec_evt", + description: "Topdown bad speculation event supported", + bits_range: (9, 9), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "td_frontend_bound_evt", + description: "Topdown frontend bound 
event supported", + bits_range: (10, 10), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "td_retiring_evt", + description: "Topdown retiring event support", + bits_range: (11, 11), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_inserts", + description: "LBR support", + bits_range: (12, 12), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Architectural Performance Monitoring Extended Sub-leaf 4 + // =================================================================================================================== + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(4, 4), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "allow_in_record", + description: "If 1, indicates that the ALLOW_IN_RECORD bit is available in the IA32_PMC_GPn_CFG_C and IA32_PMC_FXm_CFG_C MSRs", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "cntr", + description: "Counters group sub-groups general-purpose counters, fixed-function counters, and performance metrics are available", + bits_range: (0, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr", + description: "LBR group and both bits [41:40] are available", + bits_range: (8, 9), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "xer", + description: "These bits correspond to XER group bits [55:49]", + bits_range: (17, 23), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "grp", + description: "If 1, the GRP group is available", + bits_range: (29, 29), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "aux", + description: "If 1, the AUX group is available", + bits_range: (30, 30), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x23, + sub_leaf: 
RangeInclusive::new(4, 4), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "allow_in_record", + description: "If 1, indicates that the ALLOW_IN_RECORD bit is available in the IA32_PMC_GPn_CFG_C and IA32_PMC_FXm_CFG_C MSRs", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "cntr", + description: "Counters group sub-groups general-purpose counters, fixed-function counters, and performance metrics are available", + bits_range: (0, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr", + description: "LBR group and both bits [41:40] are available", + bits_range: (8, 9), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "xer", + description: "These bits correspond to XER group bits [55:49]", + bits_range: (17, 23), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "grp", + description: "If 1, the GRP group is available", + bits_range: (29, 29), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "aux", + description: "If 1, the AUX group is available", + bits_range: (30, 30), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Architectural Performance Monitoring Extended Sub-leaf 5 + // =================================================================================================================== + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(5, 5), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "architectural_pebs_counters", + description: "General-purpose counters support Architectural PEBS. 
Bit vector of general-purpose counters for which the Architectural PEBS mechanism is available", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(5, 5), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "pebs_pdist_counters", + description: "General-purpose counters for which PEBS support PDIST", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(5, 5), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "pebs_fixed_function_counters", + description: "Fixed-function counters support Architectural PEBS. Bit vector of fixed-function counters for which the Architectural PEBS mechanism is available. If ECX[x] == 1, then the IA32_PMC_FXm_CFG_C MSR is available, and PEBS is supported", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(5, 5), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "pebs_fixed_function_pdist_counters", + description: "Fixed-function counters for which PEBS supports PDIST", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + // =================================================================================================================== + // Converged Vector ISA Main Leaf + // =================================================================================================================== + ( + Parameters { + leaf: 0x24, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "converged_vector_isa_max_sub_leaves", + description: "Reports the maximum number of sub-leaves that are supported in leaf 0x24", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 
0x24, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "avx_10_version", + description: "Reports the intel AVX10 Converged Vector ISA version", + bits_range: (0, 7), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx_10_lengths", + description: "Reserved at 111", + bits_range: (0, 7), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + // Hypervisor reserved CPUID leaves are set elsewhere + + // =================================================================================================================== + // Extended Function CPUID Information + // =================================================================================================================== + ( + Parameters { + leaf: 0x80000000, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "max_ext_leaf", + description: "Maximum extended CPUID leaf supported", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000000, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_vendorid_0", + description: "Vendor ID string bytes 0 - 3", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x80000000, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_vendorid_2", + description: "Vendor ID string bytes 8 - 11", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x80000000, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_vendorid_1", + description: "Vendor ID string bytes 4 - 7", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + // 
0x80000001.EAX and EBX are both Reserved on Intel hence we just zero them out + ( + Parameters { + leaf: 0x80000001, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "lahf_lm", + description: "LAHF and SAHF in 64-bit mode", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "lzcnt", + description: "LZCNT advanced bit manipulation", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "prefetchw", + description: "3DNow PREFETCH/PREFETCHW support", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0x80000001, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "syscall", + description: "SYSCALL and SYSRET instructions", + bits_range: (11, 11), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "nx", + description: "Execute Disable Bit available", + bits_range: (20, 20), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "pdpe1gb", + description: "1-GB large page support", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "rdtscp", + description: "RDTSCP instruction and IA32_TSC_AUX are available", + bits_range: (27, 27), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "lm", + description: "Long mode (x86-64, 64-bit support)", + bits_range: (29, 29), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + // The profile generation tool will actually modify the brand id string before + // acting on the policy set here. 
+ ( + Parameters { + leaf: 0x80000002, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_0", + description: "CPU brand ID string, bytes 0 - 3", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000002, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_1", + description: "CPU brand ID string, bytes 4 - 7", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000002, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_2", + description: "CPU brand ID string, bytes 8 - 11", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000002, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_3", + description: "CPU brand ID string, bytes 12 - 15", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000003, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_4", + description: "CPU brand ID string bytes, 16 - 19", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000003, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_5", + description: "CPU brand ID string bytes, 20 - 23", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000003, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_6", + 
description: "CPU brand ID string bytes, 24 - 27", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000003, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_7", + description: "CPU brand ID string bytes, 28 - 31", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000004, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_8", + description: "CPU brand ID string, bytes 32 - 35", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000004, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_9", + description: "CPU brand ID string, bytes 36 - 39", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000004, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_10", + description: "CPU brand ID string, bytes 40 - 43", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000004, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_11", + description: "CPU brand ID string, bytes 44 - 47", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000006, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "l2_line_size", + description: "L2 cache line size, in bytes", + bits_range: (0, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "l2_nlines", + description: "L2 cache number 
of lines per tag", + bits_range: (8, 11), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "l2_assoc", + description: "L2 cache associativity", + bits_range: (12, 15), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "l2_size_kb", + description: "L2 cache size, in KB", + bits_range: (16, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + // EAX, EBX and ECX of 0x8000_0007 are all reserved (=0) on Intel + ( + Parameters { + leaf: 0x80000007, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + // TODO: We may want some mechanism to let users opt-in to using an invariant TSC provided by the hardware (when available). + // TODO: Probably unconditionally set by CHV + ValueDefinition { + short: "constant_tsc", + description: "TSC ticks at constant rate across all P and C states", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0x80000008, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "phys_addr_bits", + description: "Max physical address bits", + bits_range: (0, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "virt_addr_bits", + description: "Max virtual address bits", + bits_range: (8, 15), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "guest_phys_addr_bits", + description: "Max nested-paging guest physical address bits", + bits_range: (16, 23), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x80000008, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "wbnoinvd", + description: "WBNOINVD supported", + bits_range: (9, 9), + policy: ProfilePolicy::Static(0), + }]), + ), + ]) +}; + +/// Compile time check that the given `BIT` in the CPUID output register specified by `params` is 
not +/// declared to be overwritten by `0` for non-host CPU profiles. +pub const fn assert_not_denied_cpuid_feature(params: &Parameters) { + if let Some(defs) = INTEL_CPUID_DEFINITIONS.get(params) + && let Some(def) = defs.find_bit::() + { + assert!(!matches!(def.policy, ProfilePolicy::Static(0))); + } else { + panic!("Unable to lookup CPUID value definition with the given parameters and feature bit"); + } +} + +// TODO: Also include assert_denied_cpuid_feature diff --git a/arch/src/x86_64/cpuid_definitions/kvm.rs b/arch/src/x86_64/cpuid_definitions/kvm.rs new file mode 100644 index 0000000000..9523f4ffab --- /dev/null +++ b/arch/src/x86_64/cpuid_definitions/kvm.rs @@ -0,0 +1,223 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +//! This module contains CPUID definitions for the KVM hypervisor. + +use std::ops::RangeInclusive; + +use crate::x86_64::CpuidReg; +use crate::x86_64::cpuid_definitions::{ + CpuidDefinitions, Parameters, ProfilePolicy, ValueDefinition, ValueDefinitions, +}; + +/// CPUID features defined for the KVM hypervisor. 
+/// +/// See https://www.kernel.org/doc/html/latest/virt/kvm/x86/cpuid.html +pub const KVM_CPUID_DEFINITIONS: CpuidDefinitions<6> = const { + CpuidDefinitions([ + //===================================================================== + // KVM CPUID Signature + // =================================================================== + ( + Parameters { + leaf: 0x4000_0000, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "max_hypervisor_leaf", + description: "The maximum valid leaf between 0x4000_0000 and 0x4FFF_FFF", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x4000_0000, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "hypervisor_string_ebx", + description: "Part of the hypervisor string", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x4000_0000, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "hypervisor_string_ecx", + description: "Part of the hypervisor string", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x4000_0000, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "hypervisor_string_edx", + description: "Part of the hypervisor string", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + //===================================================================== + // KVM CPUID Features + // =================================================================== + ( + Parameters { + leaf: 0x4000_0001, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "kvm_feature_clocksource", + description: "kvmclock available 
at MSRs 0x11 and 0x12", + bits_range: (0, 0), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_nop_io_delay", + description: "Not necessary to perform delays on PIO operations", + bits_range: (1, 1), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_mmu_op", + description: "Deprecated", + bits_range: (2, 2), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_clocksource2", + description: "kvmclock available at MSRs 0x4b564d00 and 0x4b564d01", + bits_range: (3, 3), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_async_pf", + description: "async pf can be enabled by writing to MSR 0x4b564d02", + bits_range: (4, 4), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_steal_time", + description: "steal time can be enabled by writing to msr 0x4b564d03", + bits_range: (5, 5), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_pv_eoi", + description: "paravirtualized end of interrupt handler can be enabled by writing to msr 0x4b564d04", + bits_range: (6, 6), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_pv_unhalt", + description: "guest checks this feature bit before enabling paravirtualized spinlock support", + bits_range: (7, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_pv_tlb_flush", + description: "guest checks this feature bit before enabling paravirtualized tlb flush", + bits_range: (9, 9), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_async_pf_vmexit", + description: "paravirtualized async PF VM EXIT can be enabled by setting bit 2 when writing to msr 0x4b564d02", + bits_range: (10, 10), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_pv_send_ipi", + description: "guest checks this feature bit before 
enabling paravirtualized send IPIs", + bits_range: (11, 11), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_poll_control", + description: "host-side polling on HLT can be disabled by writing to msr 0x4b564d05.", + bits_range: (12, 12), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_pv_sched_yield", + description: "guest checks this feature bit before using paravirtualized sched yield.", + bits_range: (13, 13), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_async_pf_int", + description: "guest checks this feature bit before using the second async pf control msr 0x4b564d06 and async pf acknowledgment msr 0x4b564d07.", + bits_range: (14, 14), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_msi_ext_dest_id", + description: "guest checks this feature bit before using extended destination ID bits in MSI address bits 11-5.", + bits_range: (15, 15), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_hc_map_gpa_range", + description: "guest checks this feature bit before using the map gpa range hypercall to notify the page state change", + bits_range: (16, 16), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_migration_control", + description: "guest checks this feature bit before using MSR_KVM_MIGRATION_CONTROL", + bits_range: (17, 17), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_clocksource_stable_bit", + description: "host will warn if no guest-side per-cpu warps are expected in kvmclock", + bits_range: (24, 24), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x4000_0001, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "kvm_hints_realtime", + description: "guest checks this feature bit to determine that 
vCPUs are never preempted for an unlimited time allowing optimizations", + bits_range: (0, 0), + policy: ProfilePolicy::Passthrough, + }]), + ), + ]) +}; + +/// Compile time check that the given `BIT` in the CPUID output register specified by `params` is not +/// declared to be overwritten by `0` for non-host CPU profiles. +pub const fn assert_not_denied_cpuid_feature(params: &Parameters) { + if let Some(defs) = KVM_CPUID_DEFINITIONS.get(params) + && let Some(def) = defs.find_bit::() + { + assert!(!matches!(def.policy, ProfilePolicy::Static(0))); + } else { + panic!("Unable to lookup CPUID value definition with the given parameters and feature bit"); + } +} + +// TODO: Also include assert_denied_cpuid_feature diff --git a/arch/src/x86_64/cpuid_definitions/mod.rs b/arch/src/x86_64/cpuid_definitions/mod.rs new file mode 100644 index 0000000000..f45dc4a9e4 --- /dev/null +++ b/arch/src/x86_64/cpuid_definitions/mod.rs @@ -0,0 +1,233 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::ops::RangeInclusive; + +use serde::{Deserialize, Serialize}; + +use crate::x86_64::CpuidReg; +use crate::{deserialize_u32_hex, serialize_u32_hex}; + +pub mod intel; +#[cfg(feature = "kvm")] +pub mod kvm; + +/// Parameters for inspecting CPUID definitions. 
+#[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize)] +pub struct Parameters { + // The leaf (EAX) parameter used with the CPUID instruction + #[serde( + serialize_with = "serialize_u32_hex", + deserialize_with = "deserialize_u32_hex" + )] + pub leaf: u32, + // The sub-leaf (ECX) parameter used with the CPUID instruction + #[serde( + serialize_with = "serialize_range_hex", + deserialize_with = "deserialize_range_hex" + )] + pub sub_leaf: RangeInclusive, + // The register we are interested in inspecting which gets filled by the CPUID instruction + pub register: CpuidReg, +} + +// Only used for (de-)serialization +#[derive(Debug, Serialize, Deserialize)] +struct ProvisionalRangeInclusive { + #[serde( + serialize_with = "serialize_u32_hex", + deserialize_with = "deserialize_u32_hex" + )] + start: u32, + #[serde( + serialize_with = "serialize_u32_hex", + deserialize_with = "deserialize_u32_hex" + )] + end: u32, +} + +fn serialize_range_hex( + input: &RangeInclusive, + serializer: S, +) -> Result { + let provisional = ProvisionalRangeInclusive { + start: *input.start(), + end: *input.end(), + }; + provisional.serialize(serializer) +} + +fn deserialize_range_hex<'de, D: serde::Deserializer<'de>>( + deserializer: D, +) -> Result, D::Error> { + let ProvisionalRangeInclusive { start, end } = + ProvisionalRangeInclusive::deserialize(deserializer)?; + Ok(start..=end) +} + +/// Describes a policy for how the corresponding CPUID data should be considered when building +/// a CPU profile. +/// +/// This enum is mostly intended for the CPU profile generation tool, but its debug representation +/// might also appear in logs if/when CPUID compatibility checks fail at runtime. +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub enum ProfilePolicy { + /// Store the corresponding data when building the CPU profile. + /// + /// When the CPU profile gets utilized the corresponding data will be set into the modified + /// CPUID instruction(s). 
+ Inherit, + /// Ignore the corresponding data when building the CPU profile. + /// + /// When the CPU profile gets utilized the corresponding data will then instead get + /// extracted from the host. + /// + /// This variant is typically set for data that has no effect on migration compatibility, + /// but there may be some exceptions such as data which is necessary to run the VM at all, + /// but must coincide with whatever is on the host. + Passthrough, + /// Set the following hardcoded value in the CPU profile. + /// + /// This variant is typically used for features/values that don't work well with live migration (even when using the exact same physical CPU model). + Static(u32), +} + +/// A description of a range of bits in a register populated by the CPUID instruction with specific parameters. +#[derive(Clone, Copy, Debug)] +pub struct ValueDefinition { + /// A short name for the value obtainable through CPUID + pub short: &'static str, + /// A description of the value obtainable through CPUID + pub description: &'static str, + /// The range of bits in the output register corresponding to this feature or value. + /// + /// This is not a `RangeInclusive` because that type does unfortunately not implement `Copy`. + pub bits_range: (u8, u8), + /// The policy corresponding to this value when building CPU profiles. + pub policy: ProfilePolicy, +} + +/// Describes values within a register populated by the CPUID instruction with specific parameters. +pub struct ValueDefinitions(&'static [ValueDefinition]); +impl ValueDefinitions { + /// Constructor permitting at most 32 entries. + const fn new(cpuid_descriptions: &'static [ValueDefinition]) -> Self { + // Note that this function is only called within this module, at compile time, hence it is fine to have some + // additional sanity checks such as the following assert. + assert!(cpuid_descriptions.len() <= 32); + Self(cpuid_descriptions) + } + /// Converts this into a slice representation. 
This is the only way to read values of this type. + pub const fn as_slice(&self) -> &'static [ValueDefinition] { + self.0 + } + + /// Lookup the [`ValueDefinition`] whose bits range contains the given `BIT`. + pub const fn find_bit(&self) -> Option<&ValueDefinition> { + let mut idx = 0; + let len = self.0.len(); + while idx < len { + let def = &self.0[idx]; + let start = def.bits_range.0; + let end = def.bits_range.1; + if (start <= BIT) & (end >= BIT) { + return Some(def); + } + idx += 1; + } + None + } +} + +/// Describes multiple CPUID outputs. +/// +/// Each wrapped [`ValueDefinitions`] corresponds to the given [`Parameters`] in the same tuple. +/// +pub struct CpuidDefinitions( + [(Parameters, ValueDefinitions); NUM_PARAMETERS], +); + +impl CpuidDefinitions { + pub const fn as_slice(&self) -> &[(Parameters, ValueDefinitions); NUM_PARAMETERS] { + &self.0 + } + + /// Lookup the [`ValueDefinitions`] corresponding to the given `parameters`. + pub const fn get(&self, parameters: &Parameters) -> Option<&ValueDefinitions> { + let mut idx = 0; + let len = self.0.len(); + let leaf = parameters.leaf; + let sub_leaf_start = *parameters.sub_leaf.start(); + let sub_leaf_end = *parameters.sub_leaf.end(); + // Note that as of today const Rust is quite a bit more verbose than normal Rust. + // This is why the following implementation doesn't look so idiomatic. 
+ let is_eax = matches!(parameters.register, CpuidReg::EAX); + let is_ebx = matches!(parameters.register, CpuidReg::EBX); + let is_ecx = matches!(parameters.register, CpuidReg::ECX); + let is_edx = matches!(parameters.register, CpuidReg::EDX); + while idx < len { + let (param, defs) = &self.0[idx]; + let matching_leaf = leaf == param.leaf; + let matching_sub_leaf = (sub_leaf_start >= *param.sub_leaf.start()) + & (sub_leaf_end <= *param.sub_leaf.end()); + let matching_reg = { + match param.register { + CpuidReg::EAX => is_eax, + CpuidReg::EBX => is_ebx, + CpuidReg::ECX => is_ecx, + CpuidReg::EDX => is_edx, + } + }; + if matching_leaf & matching_sub_leaf & matching_reg { + return Some(defs); + } + idx += 1; + } + None + } +} + +#[cfg(test)] +mod tests { + use proptest::prelude::*; + + use super::Parameters; + use crate::x86_64::CpuidReg; + + // Check that serializing and then deserializing a value of type `Parameters` results in the + // same value we started with. + // + // Also check that the serialized numeric values are hex strings + proptest! 
{ + #[test] + fn parameter_serialization_roundtrip_works(leaf in any::(), x1 in 0u32..100, x2 in 0u32..100, reg in 0..4) { + let sub_leaf_range_start = std::cmp::min(x1, x2); + let sub_leaf_range_end = std::cmp::max(x1,x2); + let sub_leaf = sub_leaf_range_start..=sub_leaf_range_end; + let register = match reg { + 0 => CpuidReg::EAX, + 1 => CpuidReg::EBX, + 2 => CpuidReg::ECX, + 3 => CpuidReg::EDX, + _ => unreachable!() + }; + let cpuid_parameters = Parameters { + leaf, + sub_leaf, + register + }; + let serialized = serde_json::to_string(&cpuid_parameters).unwrap(); + let deserialized: Parameters = serde_json::from_str(&serialized).unwrap(); + prop_assert_eq!(&deserialized, &cpuid_parameters); + + // Check that all numeric values are hex strings when serialized to json + let params_json = serde_json::to_value(cpuid_parameters).unwrap(); + prop_assert!(params_json.get("leaf").unwrap().as_str().unwrap().starts_with("0x")); + let sub_leaf_map = params_json.get("sub_leaf").unwrap().as_object().unwrap(); + prop_assert!(sub_leaf_map.get("start").unwrap().as_str().unwrap().starts_with("0x")); + prop_assert!(sub_leaf_map.get("end").unwrap().as_str().unwrap().starts_with("0x")); + } + } +} diff --git a/arch/src/x86_64/mod.rs b/arch/src/x86_64/mod.rs index 09577b436c..b8c10d9e0e 100644 --- a/arch/src/x86_64/mod.rs +++ b/arch/src/x86_64/mod.rs @@ -6,9 +6,13 @@ // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE-BSD-3-Clause file. 
- +pub mod cpu_profile; +#[cfg(feature = "cpu_profile_generation")] +pub mod cpu_profile_generation; +pub mod cpuid_definitions; pub mod interrupts; pub mod layout; +pub mod msr_definitions; pub mod regs; #[cfg(feature = "tdx")] @@ -16,25 +20,34 @@ pub mod tdx; mod mpspec; mod mptable; +mod msr_filter; mod smbios; use std::arch::x86_64; +use std::collections::{HashMap, HashSet}; use std::mem; -use hypervisor::arch::x86::{CPUID_FLAG_VALID_INDEX, CpuIdEntry}; -use hypervisor::{CpuVendor, HypervisorCpuError, HypervisorError}; +use hypervisor::arch::x86::{CPUID_FLAG_VALID_INDEX, CpuIdEntry, MsrEntry}; +use hypervisor::{CpuVendor, HypervisorCpuError, HypervisorError, HypervisorVmError}; use linux_loader::loader::bootparam::{boot_params, setup_header}; use linux_loader::loader::elf::start_info::{ hvm_memmap_table_entry, hvm_modlist_entry, hvm_start_info, }; -use log::{debug, error, info}; +use log::{debug, error, info, trace}; +pub use msr_filter::{MAX_BITMAP_SIZE, filter_denied_msrs}; +use serde::{Deserialize, Serialize}; +pub use smbios::{SmbiosChassisConfig, SmbiosConfig, SmbiosSystem}; use thiserror::Error; use vm_memory::{ Address, Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, GuestMemoryRegion, }; -use crate::{GuestMemoryMmap, InitramfsConfig, RegionType}; +use crate::x86_64::cpu_profile::{ + CpuidOutputRegisterAdjustments, FeatureMsrAdjustment, RequiredMsrUpdates, +}; +use crate::x86_64::msr_definitions::RegisterAddress; +use crate::{CpuProfile, GuestMemoryMmap, InitramfsConfig, RegionType}; // While modern architectures support more than 255 CPUs via x2APIC, // legacy devices such as mptable support at most 254 CPUs. 
@@ -93,6 +106,7 @@ pub struct CpuidConfig { #[cfg(feature = "tdx")] pub tdx: bool, pub amx: bool, + pub profile: CpuProfile, } #[derive(Debug, Error)] @@ -132,7 +146,47 @@ pub enum Error { /// Error getting supported CPUID through the hypervisor (kvm/mshv) API #[error("Error getting supported CPUID through the hypervisor API")] CpuidGetSupported(#[source] HypervisorError), - + /// Error getting the MSR-based features through the hypervisor (kvm) API + #[error("Error getting the MSR-based features through the hypervisor API")] + MsrBasedFeaturesGetSupported(#[source] HypervisorError), + + #[error("Error getting the MSRs supported by the hypervisor")] + MsrIndexList(#[source] HypervisorError), + + #[error( + "The selected CPU profile cannot be utilized because the host's CPUID entries are not compatible with the profile" + )] + CpuProfileCpuidIncompatibility, + + #[error( + "The selected CPU profile cannot be utilized because the host's MSR-based features are not compatible with the profile" + )] + CpuProfileMsrIncompatibility, + + #[error( + "Unable to apply MSR filter: Bitmaps exceed maximum permitted memory usage: {0} > {MAX_BITMAP_SIZE}" + )] + MsrFilterTooLarge(usize), + + #[error("The hypervisor failed to set the given MSR filter")] + MsrFilter(#[source] HypervisorVmError), + + /// Error because TDX cannot be enabled when a custom (non host) CPU profile has been selected + #[error("TDX cannot be enabled when a custom CPU profile has been selected")] + CpuProfileTdxIncompatibility, + #[error( + "The selected CPU profile cannot be utilized because a necessary CPUID entry was not found" + )] + /// Error when trying to apply a CPU profile because a necessary CPUID entry was not found + MissingExpectedCpuidEntry(#[source] cpu_profile::MissingCpuidEntriesError), + /// Error when trying to apply a CPU profile because the host has a CPU from a different vendor + #[error( + "The selected CPU profile cannot be utilized because the host has a CPU from a different vendor: 
host_vendor:={cpu_vendor_host:?}, expected_vendor:={cpu_vendor_profile:?}" + )] + CpuProfileVendorIncompatibility { + cpu_vendor_profile: CpuVendor, + cpu_vendor_host: CpuVendor, + }, /// Error populating CPUID with KVM HyperV emulation details #[error("Error populating CPUID with KVM HyperV emulation details")] CpuidKvmHyperV(#[source] vmm_sys_util::fam::Error), @@ -190,7 +244,7 @@ pub fn get_max_x2apic_id(topology: (u16, u16, u16, u16)) -> u32 { ) } -#[derive(Copy, Clone, Debug)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub enum CpuidReg { EAX, EBX, @@ -512,8 +566,62 @@ impl CpuidFeatureEntry { let src_vm_features = Self::get_features_from_cpuid(src_vm_cpuid, feature_entry_list); let dest_vm_features = Self::get_features_from_cpuid(dest_vm_cpuid, feature_entry_list); - // Loop on feature bit and check if the 'source vm' feature is a subset - // of those of the 'destination vm' feature + // If both processors are Intel then we can use the existing Intel CPUID definitions to log more + // precise information about potential errors + let both_intel = { + // Check if the vendor string is "GenuineIntel". This assumes that `leaf_0` is the entry + // corresponding to CPUID leaf 0. 
+ let is_intel = |leaf_0: &CpuIdEntry| { + leaf_0.ebx == 0x756e_6547 && leaf_0.ecx == 0x6c65_746e && leaf_0.edx == 0x4965_6e69 + }; + let src_0 = src_vm_cpuid + .iter() + .find(|entry| (entry.function == 0x0) & (entry.index == 0x0)); + let dest_0 = dest_vm_cpuid + .iter() + .find(|entry| (entry.function == 0x0) & (entry.index == 0x0)); + src_0 + .zip(dest_0) + .is_some_and(|(src, dest)| is_intel(src) & is_intel(dest)) + }; + let extra_reporting = |entry: &CpuidFeatureEntry, src_reg: u32, dest_reg: u32| { + if let Some((_, defs)) = cpuid_definitions::intel::INTEL_CPUID_DEFINITIONS + .as_slice() + .iter() + .find(|(param, _)| { + (param.leaf == entry.function) + && (param.sub_leaf.contains(&entry.index) + && (param.register == entry.feature_reg)) + }) + { + for def in defs.as_slice() { + let mask = (def.bits_range.0..=def.bits_range.1) + .fold(0, |acc, next| acc | (1 << next)); + + let src_val = src_reg & mask; + let dest_val = dest_reg & mask; + + let is_compatible = match entry.compatible_check { + CpuidCompatibleCheck::BitwiseSubset => (src_val & (!dest_val)) == 0, + CpuidCompatibleCheck::NumNotGreater => src_val <= dest_val, + CpuidCompatibleCheck::Equal => src_val == dest_val, + }; + if !is_compatible { + info!( + "CPUID incompatibility for value definition='{:?}' detected in leaf={:#02x}, sub-leaf={:#02x}, register={:?}, compatibility_check={:?}, source VM value='{:#04x}' destination VM value='{:#04x}'", + def, + entry.function, + entry.index, + entry.feature_reg, + entry.compatible_check, + src_val, + dest_val + ); + } + } + } + }; + let mut compatible = true; for (i, (src_vm_feature, dest_vm_feature)) in src_vm_features .iter() @@ -541,7 +649,9 @@ impl CpuidFeatureEntry { src_vm_feature, dest_vm_feature ); - + if both_intel { + extra_reporting(entry, *src_vm_feature, *dest_vm_feature); + } compatible = false; } } @@ -555,6 +665,10 @@ impl CpuidFeatureEntry { } } +/// This function generates the CPUID entries to be set for all CPUs. 
+/// +/// If the `config` has a CPU profile set (other than host) then the profile +/// will be applied pub fn generate_common_cpuid( hypervisor: &dyn hypervisor::Hypervisor, config: &CpuidConfig, @@ -621,135 +735,21 @@ pub fn generate_common_cpuid( }); } - // Supported CPUID - let mut cpuid = hypervisor + // Supported CPUID according to the host and hypervisor + let mut host_cpuid = hypervisor .get_supported_cpuid() .map_err(Error::CpuidGetSupported)?; - CpuidPatch::patch_cpuid(&mut cpuid, &cpuid_patches); - - #[cfg(feature = "tdx")] - let tdx_capabilities = if config.tdx { - let caps = hypervisor - .tdx_capabilities() - .map_err(Error::TdxCapabilities)?; - info!("TDX capabilities {caps:#?}"); - Some(caps) - } else { - None - }; - - // Update some existing CPUID - for entry in cpuid.as_mut_slice().iter_mut() { - #[allow(unused_unsafe)] - match entry.function { - // Clear AMX related bits if the AMX feature is not enabled - 0x7 if !config.amx => { - if entry.index == 0 { - entry.edx &= !((1 << AMX_BF16) | (1 << AMX_TILE) | (1 << AMX_INT8)); - } - if entry.index == 1 { - entry.eax &= !(1 << AMX_FP16); - entry.edx &= !(1 << AMX_COMPLEX); - } - } - 0xd => - { - #[cfg(feature = "tdx")] - if let Some(caps) = &tdx_capabilities { - let xcr0_mask: u64 = 0x82ff; - let xss_mask: u64 = !xcr0_mask; - if entry.index == 0 { - entry.eax &= (caps.xfam_fixed0 as u32) & (xcr0_mask as u32); - entry.eax |= (caps.xfam_fixed1 as u32) & (xcr0_mask as u32); - entry.edx &= ((caps.xfam_fixed0 & xcr0_mask) >> 32) as u32; - entry.edx |= ((caps.xfam_fixed1 & xcr0_mask) >> 32) as u32; - } else if entry.index == 1 { - entry.ecx &= (caps.xfam_fixed0 as u32) & (xss_mask as u32); - entry.ecx |= (caps.xfam_fixed1 as u32) & (xss_mask as u32); - entry.edx &= ((caps.xfam_fixed0 & xss_mask) >> 32) as u32; - entry.edx |= ((caps.xfam_fixed1 & xss_mask) >> 32) as u32; - } - } - } - // Tile Information (purely AMX related). 
- 0x1d if !config.amx => { - entry.eax = 0; - entry.ebx = 0; - entry.ecx = 0; - entry.edx = 0; - } - // TMUL information (purely AMX related) - 0x1e if !config.amx => { - entry.eax = 0; - entry.ebx = 0; - entry.ecx = 0; - entry.edx = 0; - } - - // Copy host L1 cache details if not populated by KVM - 0x8000_0005 - if entry.eax == 0 - && entry.ebx == 0 - && entry.ecx == 0 - && entry.edx == 0 - // SAFETY: cpuid called with valid leaves - && unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0005 => - { - // SAFETY: cpuid called with valid leaves - let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0005) }; - entry.eax = leaf.eax; - entry.ebx = leaf.ebx; - entry.ecx = leaf.ecx; - entry.edx = leaf.edx; - } - // Copy host L2 cache details if not populated by KVM - 0x8000_0006 - if entry.eax == 0 - && entry.ebx == 0 - && entry.ecx == 0 - && entry.edx == 0 - // SAFETY: cpuid called with valid leaves - && unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0006 => - { - // SAFETY: cpuid called with valid leaves - let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0006) }; - entry.eax = leaf.eax; - entry.ebx = leaf.ebx; - entry.ecx = leaf.ecx; - entry.edx = leaf.edx; - } - // Set CPU physical bits - 0x8000_0008 => { - entry.eax = (entry.eax & 0xffff_ff00) | (config.phys_bits as u32 & 0xff); - } - 0x4000_0001 => { - // Enable KVM_FEATURE_MSI_EXT_DEST_ID. This allows the guest to target - // device interrupts to cpus with APIC IDs > 254 without interrupt remapping. 
- entry.eax |= 1 << KVM_FEATURE_MSI_EXT_DEST_ID; - - // These features are not supported by TDX - #[cfg(feature = "tdx")] - if config.tdx { - entry.eax &= !((1 << KVM_FEATURE_CLOCKSOURCE_BIT) - | (1 << KVM_FEATURE_CLOCKSOURCE2_BIT) - | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) - | (1 << KVM_FEATURE_ASYNC_PF_BIT) - | (1 << KVM_FEATURE_ASYNC_PF_VMEXIT_BIT) - | (1 << KVM_FEATURE_STEAL_TIME_BIT)); - } - } - _ => {} - } - } - // Copy CPU identification string + // + // If a CPU profile has been applied then this will get + // overwritten as soon as the profile is applied for i in 0x8000_0002..=0x8000_0004 { - cpuid.retain(|c| c.function != i); + host_cpuid.retain(|c| c.function != i); // SAFETY: call cpuid with valid leaves #[allow(unused_unsafe)] let leaf = unsafe { std::arch::x86_64::__cpuid(i) }; - cpuid.push(CpuIdEntry { + host_cpuid.push(CpuIdEntry { function: i, eax: leaf.eax, ebx: leaf.ebx, @@ -759,54 +759,355 @@ pub fn generate_common_cpuid( }); } - if config.kvm_hyperv { - // Remove conflicting entries - cpuid.retain(|c| c.function != 0x4000_0000); - cpuid.retain(|c| c.function != 0x4000_0001); - // See "Hypervisor Top Level Functional Specification" for details - // Compliance with "Hv#1" requires leaves up to 0x4000_000a - cpuid.push(CpuIdEntry { - function: 0x40000000, - eax: 0x4000000a, // Maximum cpuid leaf - ebx: 0x756e694c, // "Linu" - ecx: 0x564b2078, // "x KV" - edx: 0x7648204d, // "M Hv" - ..Default::default() - }); - cpuid.push(CpuIdEntry { - function: 0x40000001, - eax: 0x31237648, // "Hv#1" - ..Default::default() - }); - cpuid.push(CpuIdEntry { - function: 0x40000002, - eax: 0x3839, // "Build number" - ebx: 0xa0000, // "Version" - ..Default::default() - }); - cpuid.push(CpuIdEntry { - function: 0x4000_0003, - eax: (1 << 1) // AccessPartitionReferenceCounter + let use_custom_profile = config.profile != CpuProfile::Host; + // Obtain cpuid entries that are adjusted to the specified CPU profile and the cpuid entries of the compatibility target + // 
TODO: Try to write this in a clearer way + let (host_adjusted_to_profile, profile_cpu_vendor) = { + config + .profile + .cpuid_data(config.amx) + .map_or((Ok(None), None), |profile_data| { + ( + CpuidOutputRegisterAdjustments::adjust_cpuid_entries( + host_cpuid.clone(), + &profile_data.adjustments, + ) + .map(Some), + Some(profile_data.cpu_vendor), + ) + }) + }; + let mut host_adjusted_to_profile = + host_adjusted_to_profile.map_err(Error::MissingExpectedCpuidEntry)?; + + // There should be relatively few cases where live migration can succeed between hosts from different + // CPU vendors and making our checks account for that possibility would complicate things substantially. + // We thus require that the host's cpu vendor matches the one used to generate the CPU profile. + if let Some(cpu_vendor_profile) = profile_cpu_vendor + && let cpu_vendor_host = hypervisor.get_cpu_vendor() + && cpu_vendor_profile != cpu_vendor_host + { + return Err(Error::CpuProfileVendorIncompatibility { + cpu_vendor_profile, + cpu_vendor_host, + } + .into()); + } + // We now make the modifications according to the config parameters to each of the cpuid entries + // declared above and then perform a compatibility check. 
+ for cpuid_option in [Some(&mut host_cpuid), host_adjusted_to_profile.as_mut()] { + let Some(cpuid) = cpuid_option else { + break; + }; + CpuidPatch::patch_cpuid(cpuid, &cpuid_patches); + + #[cfg(feature = "tdx")] + let tdx_capabilities = if config.tdx { + if use_custom_profile { + return Err(Error::CpuProfileTdxIncompatibility.into()); + } + let caps = hypervisor + .tdx_capabilities() + .map_err(Error::TdxCapabilities)?; + info!("TDX capabilities {caps:#?}"); + Some(caps) + } else { + None + }; + + // Update some existing CPUID + for entry in cpuid.as_mut_slice().iter_mut() { + match entry.function { + // Clear AMX related bits if the AMX feature is not enabled + 0x7 + if !config.amx =>{ + if entry.index == 0 { + entry.edx &= !((1 << AMX_BF16) | (1 << AMX_TILE) | (1 << AMX_INT8)); + } + if entry.index == 1 { + entry.eax &= !(1 << AMX_FP16); + entry.edx &= !(1 << AMX_COMPLEX); + } + } + + 0xd => + { + #[cfg(feature = "tdx")] + if let Some(caps) = &tdx_capabilities { + let xcr0_mask: u64 = 0x82ff; + let xss_mask: u64 = !xcr0_mask; + if entry.index == 0 { + entry.eax &= (caps.xfam_fixed0 as u32) & (xcr0_mask as u32); + entry.eax |= (caps.xfam_fixed1 as u32) & (xcr0_mask as u32); + entry.edx &= ((caps.xfam_fixed0 & xcr0_mask) >> 32) as u32; + entry.edx |= ((caps.xfam_fixed1 & xcr0_mask) >> 32) as u32; + } else if entry.index == 1 { + entry.ecx &= (caps.xfam_fixed0 as u32) & (xss_mask as u32); + entry.ecx |= (caps.xfam_fixed1 as u32) & (xss_mask as u32); + entry.edx &= ((caps.xfam_fixed0 & xss_mask) >> 32) as u32; + entry.edx |= ((caps.xfam_fixed1 & xss_mask) >> 32) as u32; + } + } + } + + 0x1d + // Tile Information (purely AMX related). 
+ if !config.amx =>{ + entry.eax = 0; + entry.ebx = 0; + entry.ecx = 0; + entry.edx = 0; + } + + 0x1e + // TMUL information (purely AMX related) + if !config.amx =>{ + entry.eax = 0; + entry.ebx = 0; + entry.ecx = 0; + entry.edx = 0; + } + + + // Copy host L1 cache details if not populated by KVM + #[allow(unused_unsafe)] + 0x8000_0005 + if entry.eax == 0 && entry.ebx == 0 && entry.ecx == 0 && entry.edx == 0 + + // SAFETY: cpuid called with valid leaves + && unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0005 =>{ + #[allow(unused_unsafe)] + // SAFETY: cpuid called with valid leaves + let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0005) }; + entry.eax = leaf.eax; + entry.ebx = leaf.ebx; + entry.ecx = leaf.ecx; + entry.edx = leaf.edx; + } + + // Copy host L2 cache details if not populated by KVM + #[allow(unused_unsafe)] + 0x8000_0006 + if entry.eax == 0 && entry.ebx == 0 && entry.ecx == 0 && entry.edx == 0 + + // SAFETY: cpuid called with valid leaves + && unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0006 =>{ + #[allow(unused_unsafe)] + // SAFETY: cpuid called with valid leaves + let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0006) }; + entry.eax = leaf.eax; + entry.ebx = leaf.ebx; + entry.ecx = leaf.ecx; + entry.edx = leaf.edx; + + } + // Set CPU physical bits + 0x8000_0008 => { + entry.eax = (entry.eax & 0xffff_ff00) | (config.phys_bits as u32 & 0xff); + } + 0x4000_0001 => { + // Enable KVM_FEATURE_MSI_EXT_DEST_ID. This allows the guest to target + // device interrupts to cpus with APIC IDs > 254 without interrupt remapping. 
+ entry.eax |= 1 << KVM_FEATURE_MSI_EXT_DEST_ID; + + // These features are not supported by TDX + #[cfg(feature = "tdx")] + if config.tdx { + entry.eax &= !((1 << KVM_FEATURE_CLOCKSOURCE_BIT) + | (1 << KVM_FEATURE_CLOCKSOURCE2_BIT) + | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) + | (1 << KVM_FEATURE_ASYNC_PF_BIT) + | (1 << KVM_FEATURE_ASYNC_PF_VMEXIT_BIT) + | (1 << KVM_FEATURE_STEAL_TIME_BIT)); + } + } + _ => {} + } + } + + if config.kvm_hyperv { + // Remove conflicting entries + cpuid.retain(|c| c.function != 0x4000_0000); + cpuid.retain(|c| c.function != 0x4000_0001); + // See "Hypervisor Top Level Functional Specification" for details + // Compliance with "Hv#1" requires leaves up to 0x4000_000a + cpuid.push(CpuIdEntry { + function: 0x40000000, + eax: 0x4000000a, // Maximum cpuid leaf + ebx: 0x756e694c, // "Linu" + ecx: 0x564b2078, // "x KV" + edx: 0x7648204d, // "M Hv" + ..Default::default() + }); + cpuid.push(CpuIdEntry { + function: 0x40000001, + eax: 0x31237648, // "Hv#1" + ..Default::default() + }); + cpuid.push(CpuIdEntry { + function: 0x40000002, + eax: 0x3839, // "Build number" + ebx: 0xa0000, // "Version" + ..Default::default() + }); + cpuid.push(CpuIdEntry { + function: 0x4000_0003, + eax: (1 << 1) // AccessPartitionReferenceCounter | (1 << 2) // AccessSynicRegs | (1 << 3) // AccessSyntheticTimerRegs | (1 << 9), // AccessPartitionReferenceTsc - edx: 1 << 3, // CPU dynamic partitioning - ..Default::default() - }); - cpuid.push(CpuIdEntry { - function: 0x4000_0004, - eax: 1 << 5, // Recommend relaxed timing - ..Default::default() - }); - for i in 0x4000_0005..=0x4000_000a { + edx: 1 << 3, // CPU dynamic partitioning + ..Default::default() + }); cpuid.push(CpuIdEntry { - function: i, + function: 0x4000_0004, + eax: 1 << 5, // Recommend relaxed timing ..Default::default() }); + for i in 0x4000_0005..=0x4000_000a { + cpuid.push(CpuIdEntry { + function: i, + ..Default::default() + }); + } } } - Ok(cpuid) + if use_custom_profile { + // Final compatibility 
checks to ensure that the CPUID values we return are compatible both with the CPU profile and the host we are currently running on. + let host_adjusted_to_profile = host_adjusted_to_profile.expect("The profile adjusted cpuid entries should exist as we checked that we have a custom CPU profile"); + + // Check that the host's cpuid is indeed compatible with the adjusted profile. This is not by construction. + info!("checking compatibility between host adjusted to profile and the host itself"); + CpuidFeatureEntry::check_cpuid_compatibility(&host_adjusted_to_profile, &host_cpuid) + .map_err(|_| Error::CpuProfileCpuidIncompatibility)?; + Ok(host_adjusted_to_profile) + } else { + Ok(host_cpuid) + } +} + +/// This function computes the [`RequiredMsrUpdates`] according to the +/// given `cpu_profile`, and `kvm_hyperv` parameters. +/// +/// If [`CpuProfile::Host`] is used then this function immediately returns `Ok(None)`, +/// regardless of the other parameters. +/// +/// ## Consistency with CPUID +/// +/// Some MSRs are only present when certain related bits in CPUID leaves are. +/// The CPU profile definition ensures consistency between the MSRs it permits and the +/// CPUID adjustments it prescribes. +/// +/// There are however certain CPUID values that can be modified by the VMM independently of the +/// CPUID profile and there may be corresponding MSRs that should then not be accessible. +/// At this point in time this only concerns the KVM and Hyper-V specific CPUID leaves and we +/// assume that the end user checks CPUID before accessing any of the related MSRs for now. +// TODO: Add `cpuid: &[CpuidEntry]` as a parameter and patch the permitted MSRs accordingly +// before upstreaming. 
+pub fn compute_required_msr_updates( + hypervisor: &dyn hypervisor::Hypervisor, + cpu_profile: CpuProfile, + kvm_hyperv: bool, +) -> super::Result> { + let Some(data) = cpu_profile.msr_data() else { + return Ok(None); + }; + + let cpu_vendor_host = hypervisor.get_cpu_vendor(); + let cpu_vendor_profile = data.cpu_vendor; + if cpu_vendor_host != cpu_vendor_profile { + return Err(Error::CpuProfileVendorIncompatibility { + cpu_vendor_profile, + cpu_vendor_host, + } + .into()); + } + + let msr_based_features = hypervisor + .get_msr_based_features() + .map_err(Error::MsrBasedFeaturesGetSupported)?; + + let msr_index_list = hypervisor + .get_msr_index_list() + .map_err(Error::MsrIndexList)?; + + let all_host_msrs: HashSet = msr_based_features + .iter() + .map(|entry| entry.index) + .chain(msr_index_list.iter().copied()) + .collect(); + + let mut permitted_msrs: HashSet = data.permitted_msrs.iter().map(|msr| msr.0).collect(); + + if kvm_hyperv { + // Log the Hyper-V MSRs that are not in the list of permitted MSRs. + // Some of these MSRs not being permitted by the profile might be benign or even intentional, + // but it might also indicate a BUG, or misconceptions that lead to bad CPU profiles. We thus + // log this at the info level for now. 
+ for msr in msr_definitions::hyperv::HYPERV_MSRS { + if !permitted_msrs.contains(&msr) { + info!( + "NOTE: Hyper-V MSR: {msr:#x} is not in the list of MSRs supported by the CPU profile" + ); + } + } + } else { + // Remove all HYPER-V MSRs from the list of permitted MSRs + for msr in msr_definitions::hyperv::HYPERV_MSRS { + if permitted_msrs.remove(&msr) { + trace!("Removed Hyper-V MSR {msr:#x} from the set of supported MSRs"); + } + } + } + + let forbidden_msrs: Vec = all_host_msrs + .difference(&permitted_msrs) + .map(|msr| RegisterAddress(*msr)) + .collect(); + + if (all_host_msrs.len() - forbidden_msrs.len()) != permitted_msrs.len() { + error!("Host does not have all the permitted MSRS"); + for msr in permitted_msrs.iter() { + if !all_host_msrs.contains(msr) { + error!("Host is missing the required MSR:={msr:#x}"); + } + } + Err(Error::CpuProfileMsrIncompatibility)?; + } + + // NOTE: It is fine to ignore the inner error because the called function logs any missing MSRs. + let adjusted_msr_based_features = + FeatureMsrAdjustment::adjust_to(&data.adjustments, &msr_based_features) + .map_err(|_| Error::CpuProfileMsrIncompatibility)?; + + // TODO: CPU profiles are only available for Intel CPUs at the moment. We need to branch on the vendor + // once we also have CPU profiles for AMD. 
+ assert!(matches!(cpu_vendor_host, CpuVendor::Intel)); + crate::x86_64::msr_definitions::intel::check_feature_msr_compatibility( + &HashMap::from_iter( + adjusted_msr_based_features + .iter() + .map(|entry| (entry.index, entry.data)), + ), + &HashMap::from_iter( + msr_based_features + .iter() + .map(|entry| (entry.index, entry.data)), + ), + "CPU Profile", + "Host", + ) + .map_err(|_| { + error!("feature-based MSR compatibility check failed"); + Error::CpuProfileMsrIncompatibility + })?; + + let update = RequiredMsrUpdates { + msr_based_features: adjusted_msr_based_features, + denied_msrs: forbidden_msrs, + }; + Ok(Some(update)) } #[allow(clippy::too_many_arguments)] @@ -815,6 +1116,7 @@ pub fn configure_vcpu( id: u32, boot_setup: Option<(EntryPoint, &GuestMemoryAtomic)>, cpuid: Vec, + feature_msrs: &[MsrEntry], kvm_hyperv: bool, cpu_vendor: CpuVendor, topology: (u16, u16, u16, u16), @@ -891,7 +1193,7 @@ pub fn configure_vcpu( vcpu.enable_hyperv_synic().unwrap(); } - regs::setup_msrs(vcpu).map_err(Error::MsrsConfiguration)?; + regs::setup_msrs(vcpu, feature_msrs).map_err(Error::MsrsConfiguration)?; if let Some((kernel_entry_point, guest_memory)) = boot_setup { if setup_registers { regs::setup_regs(vcpu, kernel_entry_point).map_err(Error::RegsConfiguration)?; @@ -958,9 +1260,7 @@ pub fn configure_system( _num_cpus: u32, setup_header: Option, rsdp_addr: Option, - serial_number: Option<&str>, - uuid: Option<&str>, - oem_strings: Option<&[&str]>, + smbios: Option<&SmbiosConfig>, topology: Option<(u16, u16, u16, u16)>, ) -> super::Result<()> { // Write EBDA address to location where ACPICA expects to find it @@ -968,8 +1268,7 @@ pub fn configure_system( .write_obj((layout::EBDA_START.0 >> 4) as u16, layout::EBDA_POINTER) .map_err(Error::EbdaSetup)?; - let size = smbios::setup_smbios(guest_mem, serial_number, uuid, oem_strings) - .map_err(Error::SmbiosSetup)?; + let size = smbios::setup_smbios(guest_mem, smbios).map_err(Error::SmbiosSetup)?; // Place the MP table after 
the SMIOS table aligned to 16 bytes let offset = GuestAddress(layout::SMBIOS_START).unchecked_add(size); @@ -1514,8 +1813,6 @@ mod unit_tests { Some(layout::RSDP_POINTER), None, None, - None, - None, ); config_err.unwrap_err(); @@ -1538,8 +1835,6 @@ mod unit_tests { None, None, None, - None, - None, ) .unwrap(); @@ -1567,8 +1862,6 @@ mod unit_tests { None, None, None, - None, - None, ) .unwrap(); @@ -1582,8 +1875,6 @@ mod unit_tests { None, None, None, - None, - None, ) .unwrap(); } diff --git a/arch/src/x86_64/msr_definitions/hyperv.rs b/arch/src/x86_64/msr_definitions/hyperv.rs new file mode 100644 index 0000000000..8d5b6577e4 --- /dev/null +++ b/arch/src/x86_64/msr_definitions/hyperv.rs @@ -0,0 +1,169 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +//! This module exports a list of all known Hyper-V MSRs that we found in Appendix F in +//! the Microsoft Hypervisor Top Level Functional Specification document from February 2017. + +const HV_X64_MSR_GUEST_OS_ID: u32 = 0x40000000; +const HV_X64_MSR_HYPERCALL: u32 = 0x40000001; +const HV_X64_MSR_VP_INDEX: u32 = 0x40000002; +const HV_X64_MSR_RESET: u32 = 0x40000003; +const HV_X64_MSR_VP_RUNTIME: u32 = 0x40000010; +const HV_X64_MSR_TIME_REF_COUNT: u32 = 0x40000020; +const HV_X64_MSR_REFERENCE_TSC: u32 = 0x40000021; +const HV_X64_MSR_TSC_FREQUENCY: u32 = 0x40000022; +const HV_X64_MSR_APIC_FREQUENCY: u32 = 0x40000023; +const HV_X64_MSR_EOI: u32 = 0x40000070; +const HV_X64_MSR_ICR: u32 = 0x40000071; +const HV_X64_MSR_TPR: u32 = 0x40000072; +const HV_X64_MSR_VP_ASSIST_PAGE: u32 = 0x40000073; +const HV_X64_MSR_SCONTROL: u32 = 0x40000080; +const HV_X64_MSR_SVERSION: u32 = 0x40000081; +const HV_X64_MSR_SIEFP: u32 = 0x40000082; +const HV_X64_MSR_SIMP: u32 = 0x40000083; +const HV_X64_MSR_EOM: u32 = 0x40000084; +const HV_X64_MSR_SINT0: u32 = 0x40000090; +const HV_X64_MSR_SINT1: u32 = 0x40000091; +const HV_X64_MSR_SINT2: u32 = 0x40000092; +const HV_X64_MSR_SINT3: u32 = 
0x40000093; +const HV_X64_MSR_SINT4: u32 = 0x40000094; +const HV_X64_MSR_SINT5: u32 = 0x40000095; +const HV_X64_MSR_SINT6: u32 = 0x40000096; +const HV_X64_MSR_SINT7: u32 = 0x40000097; +const HV_X64_MSR_SINT8: u32 = 0x40000098; +const HV_X64_MSR_SINT9: u32 = 0x40000099; +const HV_X64_MSR_SINT10: u32 = 0x4000009A; +const HV_X64_MSR_SINT11: u32 = 0x4000009B; +const HV_X64_MSR_SINT12: u32 = 0x4000009C; +const HV_X64_MSR_SINT13: u32 = 0x4000009D; +const HV_X64_MSR_SINT14: u32 = 0x4000009E; +const HV_X64_MSR_SINT15: u32 = 0x4000009F; +const HV_X64_MSR_STIMER0_CONFIG: u32 = 0x400000B0; +const HV_X64_MSR_STIMER0_COUNT: u32 = 0x400000B1; +const HV_X64_MSR_STIMER1_CONFIG: u32 = 0x400000B2; +const HV_X64_MSR_STIMER1_COUNT: u32 = 0x400000B3; +const HV_X64_MSR_STIMER2_CONFIG: u32 = 0x400000B4; +const HV_X64_MSR_STIMER2_COUNT: u32 = 0x400000B5; +const HV_X64_MSR_STIMER3_CONFIG: u32 = 0x400000B6; +const HV_X64_MSR_STIMER3_COUNT: u32 = 0x400000B7; +const HV_X64_MSR_POWER_STATE_TRIGGER_C1: u32 = 0x400000C1; +const HV_X64_MSR_POWER_STATE_TRIGGER_C2: u32 = 0x400000C2; +const HV_X64_MSR_POWER_STATE_TRIGGER_C3: u32 = 0x400000C3; +const HV_X64_MSR_POWER_STATE_CONFIG_C1: u32 = 0x400000D1; +const HV_X64_MSR_POWER_STATE_CONFIG_C2: u32 = 0x400000D2; +const HV_X64_MSR_POWER_STATE_CONFIG_C3: u32 = 0x400000D3; +const HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE: u32 = 0x400000E0; +const HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE: u32 = 0x400000E1; +const HV_X64_MSR_STATS_VP_RETAIL_PAGE: u32 = 0x400000E2; +const HV_X64_MSR_STATS_VP_INTERNAL_PAGE: u32 = 0x400000E3; +const HV_X64_MSR_GUEST_IDLE: u32 = 0x400000F0; +const HV_X64_MSR_SYNTH_DEBUG_CONTROL: u32 = 0x400000F1; +const HV_X64_MSR_SYNTH_DEBUG_STATUS: u32 = 0x400000F2; +const HV_X64_MSR_SYNTH_DEBUG_SEND_BUFFER: u32 = 0x400000F3; +const HV_X64_MSR_SYNTH_DEBUG_RECEIVE_BUFFER: u32 = 0x400000F4; +const HV_X64_MSR_SYNTH_DEBUG_PENDING_BUFFER: u32 = 0x400000F5; +const HV_X64_MSR_CRASH_P0: u32 = 0x40000100; +const HV_X64_MSR_CRASH_P1: u32 = 0x40000101; 
+const HV_X64_MSR_CRASH_P2: u32 = 0x40000102; +const HV_X64_MSR_CRASH_P3: u32 = 0x40000103; +const HV_X64_MSR_CRASH_P4: u32 = 0x40000104; +const HV_X64_MSR_CRASH_CTL: u32 = 0x40000105; + +/// This is a list of all Hyper-V MSRs that we found in Appendix F in the Microsoft +/// Hypervisor Top Level Functional Specification document from February 2017 +pub(in crate::x86_64) const HYPERV_MSRS: [u32; 64] = [ + HV_X64_MSR_GUEST_OS_ID, + HV_X64_MSR_HYPERCALL, + HV_X64_MSR_VP_INDEX, + HV_X64_MSR_RESET, + HV_X64_MSR_VP_RUNTIME, + HV_X64_MSR_TIME_REF_COUNT, + HV_X64_MSR_REFERENCE_TSC, + HV_X64_MSR_TSC_FREQUENCY, + HV_X64_MSR_APIC_FREQUENCY, + HV_X64_MSR_EOI, + HV_X64_MSR_ICR, + HV_X64_MSR_TPR, + HV_X64_MSR_VP_ASSIST_PAGE, + HV_X64_MSR_SCONTROL, + HV_X64_MSR_SVERSION, + HV_X64_MSR_SIEFP, + HV_X64_MSR_SIMP, + HV_X64_MSR_EOM, + HV_X64_MSR_SINT0, + HV_X64_MSR_SINT1, + HV_X64_MSR_SINT2, + HV_X64_MSR_SINT3, + HV_X64_MSR_SINT4, + HV_X64_MSR_SINT5, + HV_X64_MSR_SINT6, + HV_X64_MSR_SINT7, + HV_X64_MSR_SINT8, + HV_X64_MSR_SINT9, + HV_X64_MSR_SINT10, + HV_X64_MSR_SINT11, + HV_X64_MSR_SINT12, + HV_X64_MSR_SINT13, + HV_X64_MSR_SINT14, + HV_X64_MSR_SINT15, + HV_X64_MSR_STIMER0_CONFIG, + HV_X64_MSR_STIMER0_COUNT, + HV_X64_MSR_STIMER1_CONFIG, + HV_X64_MSR_STIMER1_COUNT, + HV_X64_MSR_STIMER2_CONFIG, + HV_X64_MSR_STIMER2_COUNT, + HV_X64_MSR_STIMER3_CONFIG, + HV_X64_MSR_STIMER3_COUNT, + HV_X64_MSR_POWER_STATE_TRIGGER_C1, + HV_X64_MSR_POWER_STATE_TRIGGER_C2, + HV_X64_MSR_POWER_STATE_TRIGGER_C3, + HV_X64_MSR_POWER_STATE_CONFIG_C1, + HV_X64_MSR_POWER_STATE_CONFIG_C2, + HV_X64_MSR_POWER_STATE_CONFIG_C3, + HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE, + HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, + HV_X64_MSR_STATS_VP_RETAIL_PAGE, + HV_X64_MSR_STATS_VP_INTERNAL_PAGE, + HV_X64_MSR_GUEST_IDLE, + HV_X64_MSR_SYNTH_DEBUG_CONTROL, + HV_X64_MSR_SYNTH_DEBUG_STATUS, + HV_X64_MSR_SYNTH_DEBUG_SEND_BUFFER, + HV_X64_MSR_SYNTH_DEBUG_RECEIVE_BUFFER, + HV_X64_MSR_SYNTH_DEBUG_PENDING_BUFFER, + HV_X64_MSR_CRASH_P0, + 
HV_X64_MSR_CRASH_P1, + HV_X64_MSR_CRASH_P2, + HV_X64_MSR_CRASH_P3, + HV_X64_MSR_CRASH_P4, + HV_X64_MSR_CRASH_CTL, +]; + +#[cfg(all(test, feature = "kvm", feature = "cpu_profile_generation"))] +mod tests { + use super::*; + use crate::x86_64::msr_definitions::intel::{ + INTEL_MSR_FEATURE_DEFINITIONS, PERMITTED_IA32_MSRS, + }; + use crate::x86_64::msr_definitions::kvm::PROFILE_PERMITTED_KVM_MSRS; + + // If this can be assumed than that simplifies some things. + // + // NOTE: It is perfectly possible to make this a compile time check instead, + // but that is more cumbersome hence we leave that for later. + #[test] + fn does_not_intersect_other_permitted_msr_sets() { + for msr in HYPERV_MSRS { + assert!( + !INTEL_MSR_FEATURE_DEFINITIONS + .as_slice() + .iter() + .map(|r| r.0.0) + .chain(PERMITTED_IA32_MSRS) + .chain(PROFILE_PERMITTED_KVM_MSRS) + .any(|other_permitted_msr| other_permitted_msr == msr) + ); + } + } +} diff --git a/arch/src/x86_64/msr_definitions/intel/architectural_msrs.rs b/arch/src/x86_64/msr_definitions/intel/architectural_msrs.rs new file mode 100644 index 0000000000..af7b4e7cc0 --- /dev/null +++ b/arch/src/x86_64/msr_definitions/intel/architectural_msrs.rs @@ -0,0 +1,1654 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// +//! This module contains lists of architectural MSRs (or more accurately MSR register addresses) that +//! are permitted and forbidden for use with CPU profiles. +//! +//! The CPU profile generation tool obtains all MSRS supported by both KVM and the hardware +//! when it runs and uses the permitted list to only record those that are permitted. +//! +//! The list of forbidden architectural MSRs is only used to rule out "false" new MSRs that otherwise +//! would require updating the CPU profile generation tool. + +// We occasionally write doc comments for constants that are defined in private modules. 
This +// is still helpful for developers as the LSP can then provide information about the constants +// directly at the site(s) where they are being used. +#![allow(unused_doc_comments)] + +pub(in crate::x86_64) use forbidden_architectural_msrs::FORBIDDEN_IA32_MSR_RANGES; +pub(in crate::x86_64) use permitted_architectural_msrs::PERMITTED_IA32_MSRS; + +use crate::x86_64::CpuidReg; +use crate::x86_64::cpuid_definitions::Parameters; +use crate::x86_64::cpuid_definitions::intel::assert_not_denied_cpuid_feature; + +mod permitted_architectural_msrs { + use read_only::READ_ONLY_IA32_MSRS; + use read_write::READ_WRITE_IA32_MSRS; + use write_only::WRITE_ONLY_IA32_MSRS; + + use super::{CpuidReg, Parameters}; + use crate::x86_64::msr_definitions::intel::architectural_msrs::assert_not_denied_cpuid_feature; + + mod read_only { + use super::{CpuidReg, Parameters, assert_not_denied_cpuid_feature}; + /// (R/O) + const IA32_BARRIER: u32 = 0x2f; + const _IA32_BARRIER_CPUID_CHECK: () = const { + assert_not_denied_cpuid_feature::<27>(&Parameters { + leaf: 0x7, + sub_leaf: 0..=0, + register: CpuidReg::EAX, + }); + }; + + /// MTRR Capability (R/O) + const IA32_MTRRCAP: u32 = 0xfe; + + // TODO: Not sure whether the IA32_FZM_* msrs should be permitted + const IA32_FZM_DOMAIN_CONFIG: u32 = 0x83; + const IA32_FZM_RANGE_STARTADDR: u32 = 0x84; + const IA32_FZM_RANGE_ENDADDR: u32 = 0x85; + const IA32_FZM_RANGE_WRITESTATUS: u32 = 0x86; + // NOTE: This is permitted, but will be zeroed out for all non-host CPU profiles. + const IA32_MCG_CAP: u32 = 0x179; + + /// DCA Capability (R) + const IA32_PLATFORM_DCA_CAP: u32 = 0x1f8; + /// If set, CPU supports Prefetch-Hint type + const IA32_CPU_DCA_CAP: u32 = 0x1f9; + + const _IA32_DCA_CAP_CPUID_CHECK: () = assert_not_denied_cpuid_feature::<18>(&Parameters { + leaf: 0x1, + sub_leaf: 0..=0, + register: CpuidReg::ECX, + }); + + // TODO: Can we rather place this MSR in the deny list? 
+ const IA32_MCU_STAGING_MBOX_ADDR: u32 = 0x7a5; + + // NOTE: THE X2APIC related MSRs cannot be filtered by KVM, but we include them here anyway for completeness sake. + const IA32_X2APIC_APICID: u32 = 0x802; + const IA32_X2APIC_VERSION: u32 = 0x803; + const IA32_X2APIC_PPR: u32 = 0x80a; + const IA32_X2APIC_LDR: u32 = 0x80d; + const IA32_X2APIC_ISR0: u32 = 0x810; + const IA32_X2APIC_ISR1: u32 = 0x811; + const IA32_X2APIC_ISR2: u32 = 0x812; + + const IA32_X2APIC_ISR3: u32 = 0x813; + const IA32_X2APIC_ISR4: u32 = 0x814; + const IA32_X2APIC_ISR5: u32 = 0x815; + const IA32_X2APIC_ISR6: u32 = 0x816; + const IA32_X2APIC_ISR7: u32 = 0x817; + const IA32_X2APIC_TMR0: u32 = 0x818; + const IA32_X2APIC_TMR1: u32 = 0x819; + const IA32_X2APIC_TMR2: u32 = 0x81a; + const IA32_X2APIC_TMR3: u32 = 0x81b; + const IA32_X2APIC_TMR4: u32 = 0x81c; + const IA32_X2APIC_TMR5: u32 = 0x81d; + const IA32_X2APIC_TMR6: u32 = 0x81e; + const IA32_X2APIC_TMR7: u32 = 0x81f; + const IA32_X2APIC_IRR0: u32 = 0x820; + const IA32_X2APIC_IRR1: u32 = 0x821; + const IA32_X2APIC_IRR2: u32 = 0x822; + const IA32_X2APIC_IRR3: u32 = 0x823; + const IA32_X2APIC_IRR4: u32 = 0x824; + const IA32_X2APIC_IRR5: u32 = 0x825; + const IA32_X2APIC_IRR6: u32 = 0x826; + const IA32_X2APIC_IRR7: u32 = 0x827; + const IA32_X2APIC_CUR_COUNT: u32 = 0x839; + + pub(super) const READ_ONLY_IA32_MSRS: [u32; 39] = [ + IA32_BARRIER, + IA32_MTRRCAP, + IA32_FZM_DOMAIN_CONFIG, + IA32_FZM_RANGE_STARTADDR, + IA32_FZM_RANGE_ENDADDR, + IA32_FZM_RANGE_WRITESTATUS, + IA32_MCG_CAP, + IA32_PLATFORM_DCA_CAP, + IA32_CPU_DCA_CAP, + IA32_MCU_STAGING_MBOX_ADDR, + IA32_X2APIC_APICID, + IA32_X2APIC_VERSION, + IA32_X2APIC_PPR, + IA32_X2APIC_LDR, + IA32_X2APIC_ISR0, + IA32_X2APIC_ISR1, + IA32_X2APIC_ISR2, + IA32_X2APIC_ISR3, + IA32_X2APIC_ISR4, + IA32_X2APIC_ISR5, + IA32_X2APIC_ISR6, + IA32_X2APIC_ISR7, + IA32_X2APIC_TMR0, + IA32_X2APIC_TMR1, + IA32_X2APIC_TMR2, + IA32_X2APIC_TMR3, + IA32_X2APIC_TMR4, + IA32_X2APIC_TMR5, + IA32_X2APIC_TMR6, + 
IA32_X2APIC_TMR7, + IA32_X2APIC_IRR0, + IA32_X2APIC_IRR1, + IA32_X2APIC_IRR2, + IA32_X2APIC_IRR3, + IA32_X2APIC_IRR4, + IA32_X2APIC_IRR5, + IA32_X2APIC_IRR6, + IA32_X2APIC_IRR7, + IA32_X2APIC_CUR_COUNT, + ]; + } + + mod read_write { + use super::{CpuidReg, Parameters, assert_not_denied_cpuid_feature}; + + const IA32_TIME_STAMP_COUNTER: u32 = 0x10; + + const IA32_APIC_BASE: u32 = 0x1b; + + const IA32_FEATURE_CONTROL: u32 = 0x3a; + + /// Per Logical Processor TSC Adjust (R/Write to clear) + const IA32_TSC_ADJUST: u32 = 0x3b; + const _IA32_TSC_ADJUST_CPUID_CHECK: () = + assert_not_denied_cpuid_feature::<1>(&Parameters { + leaf: 0x7, + sub_leaf: 0..=0, + register: CpuidReg::EBX, + }); + + const IA32_SPEC_CTRL: u32 = 0x48; + const _IA32_SPECT_CTRL_CPUID_CHECK: () = + assert_not_denied_cpuid_feature::<26>(&Parameters { + leaf: 0x7, + sub_leaf: 0..=0, + register: CpuidReg::EDX, + }); + + const IA32_MCU_OPT_CTRL: u32 = 0x123; + const _IA32_MCU_OPT_CTRL_CPUID_CHECK: () = + assert_not_denied_cpuid_feature::<9>(&Parameters { + leaf: 0x7, + sub_leaf: (0..=0), + register: CpuidReg::EDX, + }); + + /// SYSENTER_CS_MSR + const IA32_SYSENTER_CS: u32 = 0x174; + + /// SYSENTER_ESP_MSR + const IA32_SYSENTER_ESP: u32 = 0x175; + + /// SYSENTER_ESP_MSR + const IA32_SYSENTER_EIP: u32 = 0x176; + + // Technically permitted (as users will expect it given that MCA is available via CPUID), + // but probably not very useful since IA32_MCG_CAP will be zeroed out for all non-host + // CPU profiles + const IA32_MCG_STATUS: u32 = 0x17a; + + // TODO: Does it really make sense to permit this MSR? + const IA32_SMM_MONITOR_CTL: u32 = 0x9b; + const _IA32_SMM_MONITOR_CTL_CPUID_CHECK: () = + assert_not_denied_cpuid_feature::<5>(&Parameters { + leaf: 0x1, + sub_leaf: 0..=0, + register: CpuidReg::ECX, + }); + + /// Enable Misc. 
Processr Features + const IA32_MISC_ENABLE: u32 = 0x1a0; + + const IA32_XFD: u32 = 0x1c4; + const IA32_XFD_ERR: u32 = 0x1c5; + + const IA32_DCA_0_CAP: u32 = 0x1fa; + + const _IA32_DCA_0_CAP_CPUID_CHECK: () = + assert_not_denied_cpuid_feature::<18>(&Parameters { + leaf: 0x1, + sub_leaf: 0..=0, + register: CpuidReg::ECX, + }); + + const IA32_MTRR_PHYSBASE0: u32 = 0x200; + const IA32_MTRR_PHYSMASK0: u32 = 0x201; + const IA32_MTRR_PHYSBASE1: u32 = 0x202; + const IA32_MTRR_PHYSMASK1: u32 = 0x203; + const IA32_MTRR_PHYSBASE2: u32 = 0x204; + const IA32_MTRR_PHYSMASK2: u32 = 0x205; + const IA32_MTRR_PHYSBASE3: u32 = 0x206; + const IA32_MTRR_PHYSMASK3: u32 = 0x207; + const IA32_MTRR_PHYSBASE4: u32 = 0x208; + const IA32_MTRR_PHYSMASK4: u32 = 0x209; + const IA32_MTRR_PHYSBASE5: u32 = 0x20a; + const IA32_MTRR_PHYSMASK5: u32 = 0x20b; + const IA32_MTRR_PHYSBASE6: u32 = 0x20c; + const IA32_MTRR_PHYSMASK6: u32 = 0x20d; + const IA32_MTRR_PHYSBASE7: u32 = 0x20e; + const IA32_MTRR_PHYSMASK7: u32 = 0x20f; + const IA32_MTRR_PHYSBASE8: u32 = 0x210; + const IA32_MTRR_PHYSMASK8: u32 = 0x211; + const IA32_MTRR_PHYSBASE9: u32 = 0x212; + const IA32_MTRR_PHYSMASK9: u32 = 0x213; + + const IA32_MTRR_FIX64K_00000: u32 = 0x250; + const IA32_MTRR_FIX16K_80000: u32 = 0x258; + const IA32_MTRR_FIX16K_A0000: u32 = 0x259; + const IA32_MTRR_FIX4K_C0000: u32 = 0x268; + const IA32_MTRR_FIX4K_C8000: u32 = 0x269; + const IA32_MTRR_FIX4K_D0000: u32 = 0x26a; + const IA32_MTRR_FIX4K_D8000: u32 = 0x26b; + const IA32_MTRR_FIX4K_E0000: u32 = 0x26c; + const IA32_MTRR_FIX4K_E8000: u32 = 0x26d; + const IA32_MTRR_FIX4K_F0000: u32 = 0x26e; + const IA32_MTRR_FIX4K_F8000: u32 = 0x26f; + + const _IA32_MTRR_FIX_I_X_CPUID_CHECK: () = + assert_not_denied_cpuid_feature::<12>(&Parameters { + leaf: 0x1, + sub_leaf: 0..=0, + register: CpuidReg::EDX, + }); + + const IA32_PAT: u32 = 0x277; + const _IA32_PAT_CPUID_CHECK: () = assert_not_denied_cpuid_feature::<16>(&Parameters { + leaf: 0x1, + sub_leaf: 0..=0, + register: 
CpuidReg::EDX, + }); + + const IA32_MTRR_DEF_TYPE: u32 = 0x2ff; + + // Error reporting banks. KVM always reports 32 + // of them by default. + // TODO: Consider conditionally compiling this based + // on whether we are using KVM + const IA32_MC0_CTL: u32 = 0x400; + const IA32_MC0_STATUS: u32 = 0x401; + const IA32_MC0_ADDR: u32 = 0x402; + const IA32_MC0_MISC: u32 = 0x403; + const IA32_MC1_CTL: u32 = 0x404; + const IA32_MC1_STATUS: u32 = 0x405; + const IA32_MC1_ADDR: u32 = 0x406; + + const IA32_MC1_MISC: u32 = 0x407; + const IA32_MC2_CTL: u32 = 0x408; + const IA32_MC2_STATUS: u32 = 0x409; + const IA32_MC2_ADDR: u32 = 0x40a; + const IA32_MC2_MISC: u32 = 0x40b; + const IA32_MC3_CTL: u32 = 0x40c; + const IA32_MC3_STATUS: u32 = 0x40d; + const IA32_MC3_ADDR1: u32 = 0x40e; + const IA32_MC3_MISC: u32 = 0x40f; + const IA32_MC4_CTL: u32 = 0x410; + const IA32_MC4_STATUS: u32 = 0x411; + const IA32_MC4_ADDR: u32 = 0x412; + const IA32_MC4_MISC: u32 = 0x413; + const IA32_MC5_CTL: u32 = 0x414; + const IA32_MC5_STATUS: u32 = 0x415; + const IA32_MC5_ADDR: u32 = 0x416; + const IA32_MC5_MISC: u32 = 0x417; + const IA32_MC6_CTL: u32 = 0x418; + + const IA32_MC6_STATUS: u32 = 0x419; + const IA32_MC6_ADDR1: u32 = 0x41a; + const IA32_MC6_MISC: u32 = 0x41b; + const IA32_MC7_CTL: u32 = 0x41c; + const IA32_MC7_STATUS: u32 = 0x41d; + const IA32_MC7_ADDR: u32 = 0x41e; + const IA32_MC7_MISC: u32 = 0x41f; + const IA32_MC8_CTL: u32 = 0x420; + const IA32_MC8_STATUS: u32 = 0x421; + const IA32_MC8_ADDR: u32 = 0x422; + const IA32_MC8_MISC: u32 = 0x423; + const IA32_MC9_CTL: u32 = 0x424; + const IA32_MC9_STATUS: u32 = 0x425; + const IA32_MC9_ADDR: u32 = 0x426; + const IA32_MC9_MISC: u32 = 0x427; + const IA32_MC10_CTL: u32 = 0x428; + const IA32_MC10_STATUS: u32 = 0x429; + const IA32_MC10_ADDR: u32 = 0x42a; + const IA32_MC10_MISC: u32 = 0x42b; + + const IA32_MC11_CTL: u32 = 0x42c; + const IA32_MC11_STATUS: u32 = 0x42d; + const IA32_MC11_ADDR: u32 = 0x42e; + const IA32_MC11_MISC: u32 = 0x42f; + const 
IA32_MC12_CTL: u32 = 0x430; + const IA32_MC12_STATUS: u32 = 0x431; + const IA32_MC12_ADDR: u32 = 0x432; + const IA32_MC12_MISC: u32 = 0x433; + const IA32_MC13_CTL: u32 = 0x434; + const IA32_MC13_STATUS: u32 = 0x435; + const IA32_MC13_ADDR: u32 = 0x436; + const IA32_MC13_MISC: u32 = 0x437; + const IA32_MC14_CTL: u32 = 0x438; + const IA32_MC14_STATUS: u32 = 0x439; + const IA32_MC14_ADDR: u32 = 0x43a; + const IA32_MC14_MISC: u32 = 0x43b; + const IA32_MC15_CTL: u32 = 0x43c; + const IA32_MC15_STATUS: u32 = 0x43d; + + const IA32_MC15_ADDR: u32 = 0x43e; + const IA32_MC15_MISC: u32 = 0x43f; + const IA32_MC16_CTL: u32 = 0x440; + const IA32_MC16_STATUS: u32 = 0x441; + const IA32_MC16_ADDR: u32 = 0x442; + const IA32_MC16_MISC: u32 = 0x443; + const IA32_MC17_CTL: u32 = 0x444; + const IA32_MC17_STATUS: u32 = 0x445; + const IA32_MC17_ADDR: u32 = 0x446; + const IA32_MC17_MISC: u32 = 0x447; + const IA32_MC18_CTL: u32 = 0x448; + const IA32_MC18_STATUS: u32 = 0x449; + const IA32_MC18_ADDR: u32 = 0x44a; + const IA32_MC18_MISC: u32 = 0x44b; + const IA32_MC19_CTL: u32 = 0x44c; + const IA32_MC19_STATUS: u32 = 0x44d; + const IA32_MC19_ADDR: u32 = 0x44e; + const IA32_MC19_MISC: u32 = 0x44f; + const IA32_MC20_CTL: u32 = 0x450; + + const IA32_MC20_STATUS: u32 = 0x451; + const IA32_MC20_ADDR: u32 = 0x452; + const IA32_MC20_MISC: u32 = 0x453; + const IA32_MC21_CTL: u32 = 0x454; + const IA32_MC21_STATUS: u32 = 0x455; + const IA32_MC21_ADDR: u32 = 0x456; + const IA32_MC21_MISC: u32 = 0x457; + const IA32_MC22_CTL: u32 = 0x458; + const IA32_MC22_STATUS: u32 = 0x459; + const IA32_MC22_ADDR: u32 = 0x45a; + const IA32_MC22_MISC: u32 = 0x45b; + const IA32_MC23_CTL: u32 = 0x45c; + const IA32_MC23_STATUS: u32 = 0x45d; + const IA32_MC23_ADDR: u32 = 0x45e; + const IA32_MC23_MISC: u32 = 0x45f; + const IA32_MC24_CTL: u32 = 0x460; + const IA32_MC24_STATUS: u32 = 0x461; + const IA32_MC24_ADDR: u32 = 0x462; + + const IA32_MC24_MISC: u32 = 0x463; + const IA32_MC25_CTL: u32 = 0x464; + const IA32_MC25_STATUS: 
u32 = 0x465; + const IA32_MC25_ADDR: u32 = 0x466; + const IA32_MC25_MISC: u32 = 0x467; + const IA32_MC26_CTL: u32 = 0x468; + const IA32_MC26_STATUS: u32 = 0x469; + const IA32_MC26_ADDR: u32 = 0x46a; + const IA32_MC26_MISC: u32 = 0x46b; + const IA32_MC27_CTL: u32 = 0x46c; + const IA32_MC27_STATUS: u32 = 0x46d; + const IA32_MC27_ADDR: u32 = 0x46e; + const IA32_MC27_MISC: u32 = 0x46f; + const IA32_MC28_CTL: u32 = 0x470; + const IA32_MC28_STATUS: u32 = 0x471; + const IA32_MC28_ADDR: u32 = 0x472; + const IA32_MC28_MISC: u32 = 0x473; + const IA32_MC29_CTL: u32 = 0x474; + const IA32_MC29_STATUS: u32 = 0x475; + + const IA32_MC29_ADDR: u32 = 0x476; + const IA32_MC29_MISC: u32 = 0x477; + const IA32_MC30_CTL: u32 = 0x478; + const IA32_MC30_STATUS: u32 = 0x479; + const IA32_MC30_ADDR: u32 = 0x47a; + const IA32_MC30_MISC: u32 = 0x47b; + const IA32_MC31_CTL: u32 = 0x47c; + const IA32_MC31_STATUS: u32 = 0x47d; + const IA32_MC31_ADDR: u32 = 0x47e; + const IA32_MC31_MISC: u32 = 0x47f; + + const IA32_TSC_DEADLINE: u32 = 0x6e0; + const _IA32_TSC_DEADLINE_CPUID_CHECK: () = + assert_not_denied_cpuid_feature::<24>(&Parameters { + leaf: 0x1, + sub_leaf: 0..=0, + register: CpuidReg::ECX, + }); + + // NOTE: THE X2APIC related MSRs cannot be filtered by KVM, but we include them here anyway for completeness sake. 
+ const IA32_X2APIC_TPR: u32 = 0x808; + const IA32_X2APIC_SIVR: u32 = 0x80f; + + const IA32_X2APIC_ESR: u32 = 0x828; + const IA32_X2APIC_LVT_CMCI: u32 = 0x82f; + const IA32_X2APIC_ICR: u32 = 0x830; + const IA32_X2APIC_LVT_TIMER: u32 = 0x832; + const IA32_X2APIC_LVT_THERMAL: u32 = 0x833; + const IA32_X2APIC_LVT_PMI: u32 = 0x834; + const IA32_X2APIC_LVT_LINT0: u32 = 0x835; + + const IA32_X2APIC_LVT_LINT1: u32 = 0x836; + const IA32_X2APIC_LVT_ERROR: u32 = 0x837; + const IA32_X2APIC_INIT_COUNT: u32 = 0x838; + const IA32_X2APIC_DIV_CONF: u32 = 0x83e; + + /// Extended Feature Enable + const IA32_EFER: u32 = 0xc0000080; + + const IA32_STAR: u32 = 0xc000_0081; + const IA32_LSTAR: u32 = 0xc000_0082; + const IA32_CSTAR: u32 = 0xc000_0083; + const IA32_FMASK: u32 = 0xc000_0084; + const IA32_FS_BASE: u32 = 0xc000_0100; + const IA32_GS_BASE: u32 = 0xc000_0101; + const IA32_KERNEL_GS_BASE: u32 = 0xc000_0102; + const _IA32_EFER_UPTO_IA32_KERNEL_GS_BASE_CPUID_CHECK: () = + assert_not_denied_cpuid_feature::<29>(&Parameters { + leaf: 0x80000001, + sub_leaf: 0..=0, + register: CpuidReg::EDX, + }); + + const IA32_TSC_AUX: u32 = 0xc000_0103; + // NOTE That either the following has to pass, or the same test with 0x80000001.EDX[27] + const _IA32_TSC_AUX_CPUID_CHECK: () = assert_not_denied_cpuid_feature::<22>(&Parameters { + leaf: 0x7, + sub_leaf: 0..=0, + register: CpuidReg::ECX, + }); + + pub(super) const READ_WRITE_IA32_MSRS: [u32; 199] = [ + IA32_TIME_STAMP_COUNTER, + IA32_APIC_BASE, + IA32_FEATURE_CONTROL, + IA32_TSC_ADJUST, + IA32_SPEC_CTRL, + IA32_MCU_OPT_CTRL, + IA32_SYSENTER_CS, + IA32_SYSENTER_ESP, + IA32_SYSENTER_EIP, + IA32_MCG_STATUS, + IA32_SMM_MONITOR_CTL, + IA32_MISC_ENABLE, + IA32_XFD, + IA32_XFD_ERR, + IA32_DCA_0_CAP, + IA32_MTRR_PHYSBASE0, + IA32_MTRR_PHYSMASK0, + IA32_MTRR_PHYSBASE1, + IA32_MTRR_PHYSMASK1, + IA32_MTRR_PHYSBASE2, + IA32_MTRR_PHYSMASK2, + IA32_MTRR_PHYSBASE3, + IA32_MTRR_PHYSMASK3, + IA32_MTRR_PHYSBASE4, + IA32_MTRR_PHYSMASK4, + IA32_MTRR_PHYSBASE5, + 
IA32_MTRR_PHYSMASK5, + IA32_MTRR_PHYSBASE6, + IA32_MTRR_PHYSMASK6, + IA32_MTRR_PHYSBASE7, + IA32_MTRR_PHYSMASK7, + IA32_MTRR_PHYSBASE8, + IA32_MTRR_PHYSMASK8, + IA32_MTRR_PHYSBASE9, + IA32_MTRR_PHYSMASK9, + IA32_MTRR_FIX64K_00000, + IA32_MTRR_FIX16K_80000, + IA32_MTRR_FIX16K_A0000, + IA32_MTRR_FIX4K_C0000, + IA32_MTRR_FIX4K_C8000, + IA32_MTRR_FIX4K_D0000, + IA32_MTRR_FIX4K_D8000, + IA32_MTRR_FIX4K_E0000, + IA32_MTRR_FIX4K_E8000, + IA32_MTRR_FIX4K_F0000, + IA32_MTRR_FIX4K_F8000, + IA32_PAT, + IA32_MTRR_DEF_TYPE, + IA32_MC0_CTL, + IA32_MC0_STATUS, + IA32_MC0_ADDR, + IA32_MC0_MISC, + IA32_MC1_CTL, + IA32_MC1_STATUS, + IA32_MC1_ADDR, + IA32_MC1_MISC, + IA32_MC2_CTL, + IA32_MC2_STATUS, + IA32_MC2_ADDR, + IA32_MC2_MISC, + IA32_MC3_CTL, + IA32_MC3_STATUS, + IA32_MC3_ADDR1, + IA32_MC3_MISC, + IA32_MC4_CTL, + IA32_MC4_STATUS, + IA32_MC4_ADDR, + IA32_MC4_MISC, + IA32_MC5_CTL, + IA32_MC5_STATUS, + IA32_MC5_ADDR, + IA32_MC5_MISC, + IA32_MC6_CTL, + IA32_MC6_STATUS, + IA32_MC6_ADDR1, + IA32_MC6_MISC, + IA32_MC7_CTL, + IA32_MC7_STATUS, + IA32_MC7_ADDR, + IA32_MC7_MISC, + IA32_MC8_CTL, + IA32_MC8_STATUS, + IA32_MC8_ADDR, + IA32_MC8_MISC, + IA32_MC9_CTL, + IA32_MC9_STATUS, + IA32_MC9_ADDR, + IA32_MC9_MISC, + IA32_MC10_CTL, + IA32_MC10_STATUS, + IA32_MC10_ADDR, + IA32_MC10_MISC, + IA32_MC11_CTL, + IA32_MC11_STATUS, + IA32_MC11_ADDR, + IA32_MC11_MISC, + IA32_MC12_CTL, + IA32_MC12_STATUS, + IA32_MC12_ADDR, + IA32_MC12_MISC, + IA32_MC13_CTL, + IA32_MC13_STATUS, + IA32_MC13_ADDR, + IA32_MC13_MISC, + IA32_MC14_CTL, + IA32_MC14_STATUS, + IA32_MC14_ADDR, + IA32_MC14_MISC, + IA32_MC15_CTL, + IA32_MC15_STATUS, + IA32_MC15_ADDR, + IA32_MC15_MISC, + IA32_MC16_CTL, + IA32_MC16_STATUS, + IA32_MC16_ADDR, + IA32_MC16_MISC, + IA32_MC17_CTL, + IA32_MC17_STATUS, + IA32_MC17_ADDR, + IA32_MC17_MISC, + IA32_MC18_CTL, + IA32_MC18_STATUS, + IA32_MC18_ADDR, + IA32_MC18_MISC, + IA32_MC19_CTL, + IA32_MC19_STATUS, + IA32_MC19_ADDR, + IA32_MC19_MISC, + IA32_MC20_CTL, + IA32_MC20_STATUS, + IA32_MC20_ADDR, + 
IA32_MC20_MISC, + IA32_MC21_CTL, + IA32_MC21_STATUS, + IA32_MC21_ADDR, + IA32_MC21_MISC, + IA32_MC22_CTL, + IA32_MC22_STATUS, + IA32_MC22_ADDR, + IA32_MC22_MISC, + IA32_MC23_CTL, + IA32_MC23_STATUS, + IA32_MC23_ADDR, + IA32_MC23_MISC, + IA32_MC24_CTL, + IA32_MC24_STATUS, + IA32_MC24_ADDR, + IA32_MC24_MISC, + IA32_MC25_CTL, + IA32_MC25_STATUS, + IA32_MC25_ADDR, + IA32_MC25_MISC, + IA32_MC26_CTL, + IA32_MC26_STATUS, + IA32_MC26_ADDR, + IA32_MC26_MISC, + IA32_MC27_CTL, + IA32_MC27_STATUS, + IA32_MC27_ADDR, + IA32_MC27_MISC, + IA32_MC28_CTL, + IA32_MC28_STATUS, + IA32_MC28_ADDR, + IA32_MC28_MISC, + IA32_MC29_CTL, + IA32_MC29_STATUS, + IA32_MC29_ADDR, + IA32_MC29_MISC, + IA32_MC30_CTL, + IA32_MC30_STATUS, + IA32_MC30_ADDR, + IA32_MC30_MISC, + IA32_MC31_CTL, + IA32_MC31_STATUS, + IA32_MC31_ADDR, + IA32_MC31_MISC, + IA32_TSC_DEADLINE, + IA32_X2APIC_TPR, + IA32_X2APIC_SIVR, + IA32_X2APIC_ESR, + IA32_X2APIC_LVT_CMCI, + IA32_X2APIC_ICR, + IA32_X2APIC_LVT_TIMER, + IA32_X2APIC_LVT_THERMAL, + IA32_X2APIC_LVT_PMI, + IA32_X2APIC_LVT_LINT0, + IA32_X2APIC_LVT_LINT1, + IA32_X2APIC_LVT_ERROR, + IA32_X2APIC_INIT_COUNT, + IA32_X2APIC_DIV_CONF, + IA32_EFER, + IA32_STAR, + IA32_LSTAR, + IA32_CSTAR, + IA32_FMASK, + IA32_FS_BASE, + IA32_GS_BASE, + IA32_KERNEL_GS_BASE, + IA32_TSC_AUX, + ]; + } + + mod write_only { + use super::{CpuidReg, Parameters, assert_not_denied_cpuid_feature}; + + /// Prediction Command (WO) + const IA32_PRED_CMD: u32 = 0x49; + const _IA32_PRED_CMD_CPUID_CHECK: () = assert_not_denied_cpuid_feature::<26>(&Parameters { + leaf: 0x7, + sub_leaf: 0..=0, + register: CpuidReg::EDX, + }); + + /// Flush Command (WO) + const IA32_FLUSH_CMD: u32 = 0x10b; + + // TODO: Should probably use inherit policy here + const _IA32_FLUSH_CMD_CPUID_CHECK: () = + assert_not_denied_cpuid_feature::<28>(&Parameters { + leaf: 0x7, + sub_leaf: 0..=0, + register: CpuidReg::EDX, + }); + + // X2apic related MSRS cannot be filtered by KVM, but we include it here anyway for completeness sake + const 
IA32_X2APIC_EOI: u32 = 0x80b; + + const IA32_X2APIC_SELF_IPI: u32 = 0x83f; + + pub(super) const WRITE_ONLY_IA32_MSRS: [u32; 4] = [ + IA32_PRED_CMD, + IA32_FLUSH_CMD, + IA32_X2APIC_EOI, + IA32_X2APIC_SELF_IPI, + ]; + } + + /// A list of permitted Intel IA32 MSRs that are not considered MSR-based feature indices + /// by KVM, assembled at compile time as read-only, then write-only, then read/write MSRs. + /// + /// The MSRs listed here can be studied further in Table 2.2 in Section 2.1 of the Intel SDM + /// Vol. 4 from October 2025. + pub(in crate::x86_64) const PERMITTED_IA32_MSRS: [u32; 242] = const { + let mut permitted = [0u32; 242]; + let read_only_len = READ_ONLY_IA32_MSRS.len(); + let write_only_len = WRITE_ONLY_IA32_MSRS.len(); + let read_write_len = READ_WRITE_IA32_MSRS.len(); + assert!(permitted.len() == (read_only_len + write_only_len + read_write_len)); + let mut idx = 0; + // Copy the read-only MSRs to the front of the array + { + let mut i = 0; + while i < read_only_len { + permitted[idx + i] = READ_ONLY_IA32_MSRS[i]; + i += 1; + } + idx += read_only_len; + } + // Append the write-only MSRs, starting at offset `idx` + { + let mut i = 0; + while i < write_only_len { + permitted[idx + i] = WRITE_ONLY_IA32_MSRS[i]; + i += 1; + } + idx += write_only_len; + } + // Append the read/write MSRs; `idx` is not advanced because nothing follows + { + let mut i = 0; + while i < read_write_len { + permitted[idx + i] = READ_WRITE_IA32_MSRS[i]; + i += 1; + } + } + permitted + }; +} + +mod forbidden_architectural_msrs { + const IA32_P5_MC_ADDR: (u32, u32) = (0x0, 0x0); + const IA32_P5_MC_TYPE: (u32, u32) = (0x1, 0x1); + + const IA32_MONITOR_FILTER_SIZE: (u32, u32) = (0x6, 0x6); + // TODO: Not sure about this one + const IA32_PLATFORM_ID: (u32, u32) = (0x17, 0x17); + + /// Only available if CPUID 0x7.0x1.EBX[0] = 1, but this is always 0 for non-host CPU profiles + const IA32_PPIN_CTL: (u32, u32) = (0x4e, 0x4e); + + /// Only available if CPUID 0x7.0x1.EBX[0] = 1, but this is always 0 for non-host CPU profiles + const IA32_PPIN: (u32, u32) = (0x4f, 0x4f); + + /// Used for microcode updates. Should not be available for guests.
+ const IA32_BIOS_UPDT_TRIG: (u32, u32) = (0x79, 0x79); + + /// Currently only related to Secure enclaves/Keylocker which is not available for non-host CPU profiles + const IA32_FEATURE_ACTIVATION: (u32, u32) = (0x7a, 0x7a); + + /// Related to microcode updates + const IA32_MCU_ENUMERATION: (u32, u32) = (0x7b, 0x7b); + + const IA32_MCU_STATUS: (u32, u32) = (0x7c, 0x7c); + + // TODO: Not sure what this does and whether it should be enabled + const IA32_FZM_RANGE_INDEX: (u32, u32) = (0x82, 0x82); + + /// Related to total memory encryption + /// + const IA32_MKTME_KEYID_PARTITIONING: (u32, u32) = (0x87, 0x87); + + const IA32_SGXLEPUBKEYHASH0: (u32, u32) = (0x8c, 0x8c); + + const IA32_SGXLEPUBKEYHASH1: (u32, u32) = (0x8d, 0x8d); + + const IA32_SGXLEPUBKEYHASH2: (u32, u32) = (0x8e, 0x8e); + + const IA32_SGXLEPUBKEYHASH3: (u32, u32) = (0x8f, 0x8f); + + const IA32_SGXLEPUBKEYHASH4: (u32, u32) = (0x90, 0x90); + + const IA32_SGXLEPUBKEYHASH5: (u32, u32) = (0x91, 0x91); + + // TODO: Check this + const IA32_SMBASE: (u32, u32) = (0x9e, 0x9e); + + const IA32_MISC_PACKAGE_CTLS: (u32, u32) = (0xbc, 0xbc); + + /// xAPIC Disable Status + // TODO: Also check consistency with IA32_ARCH_CAPABILITIES[21] + const IA32_XAPIC_DISABLE_STATUS: (u32, u32) = (0xbd, 0xbd); + + const IA32_SMRR_PHYS_BASE_MASK: (u32, u32) = (0x1f2, 0x1f3); + + /// Overclocking Status (R/O) + // TODO: Also check consistency with IA32_ARCH_CAPABILITIES[23] + const IA32_OVERCLOCKING_STATUS: (u32, u32) = (0x195, 0x195); + + /// Clock Modulation Control + /// This is disabled via CPUID for non-host CPU profiles + const IA32_CLOCK_MODULATION: (u32, u32) = (0x19a, 0x19a); + + // IA32_PLI_SSP is disabled via CPUID for non-host profiles + const IA32_PLI_SSP: (u32, u32) = (0x6a4, 0x6a7); + + // This is disabled via CPUID for non-host profiles + const IA32_INTERRUPT_SSP_TABLE_ADDR: (u32, u32) = (0x6a8, 0x6a8); + + const IA32_PECI_HWP_REQUEST_INFO: (u32, u32) = (0x775, 0x775); + const IA32_PMC0: (u32, u32) = (0xc1, 0xc1); + 
const IA32_PMC1: (u32, u32) = (0xc2, 0xc2); + const IA32_PMC2: (u32, u32) = (0xc3, 0xc3); + const IA32_PMC3: (u32, u32) = (0xc4, 0xc4); + const IA32_PMC4: (u32, u32) = (0xc5, 0xc5); + const IA32_PMC5: (u32, u32) = (0xc6, 0xc6); + const IA32_PMC6: (u32, u32) = (0xc7, 0xc7); + const IA32_PMC7: (u32, u32) = (0xc8, 0xc8); + const IA32_PMC8: (u32, u32) = (0xc9, 0xc9); + const IA32_PMC9: (u32, u32) = (0xca, 0xca); + + const IA32_CORE_CAPABILITIES: (u32, u32) = (0xcf, 0xcf); + + // TODO: Do we really want to forbid this MSR? + const IA32_UMWAIT_CONTROL: (u32, u32) = (0xe1, 0xe1); + + // Disabled by CPUID for non-host CPU profiles + const IA32_MPERF: (u32, u32) = (0xe7, 0xe7); + + const IA32_APERF: (u32, u32) = (0xe8, 0xe8); + + const IA32_TSX_FORCE_ABORT: (u32, u32) = (0x10f, 0x10f); + + // Disabled via static IA32_ARCH_CAPABILITIES bit for non-host CPU profiles + const IA32_TSX_CTRL: (u32, u32) = (0x122, 0x122); + + // NOTE: IA32_MCU_OPT_CTRL must necessarily be available, due to + // what we set in CPUID for some CPU profiles (inherit policy) + + const IA32_MCG_CTL: (u32, u32) = (0x17b, 0x17b); + + // TODO: 0x180- 0x185 is reserved, we should not list these MSRS at all + + /// Disabled via CPUID for all non-host CPU profiles + const IA32_PERFEVTSEL0: (u32, u32) = (0x186, 0x186); + const IA32_PERFEVTSEL1: (u32, u32) = (0x187, 0x187); + const IA32_PERFEVTSEL2: (u32, u32) = (0x188, 0x188); + const IA32_PERFEVTSEL3: (u32, u32) = (0x189, 0x189); + const IA32_PERFEVTSEL4: (u32, u32) = (0x18a, 0x18a); + const IA32_PERFEVTSEL5: (u32, u32) = (0x18b, 0x18b); + const IA32_PERFEVTSEL6: (u32, u32) = (0x18c, 0x18c); + const IA32_PERFEVTSEL7: (u32, u32) = (0x18d, 0x18d); + const IA32_PERFEVTSEL8: (u32, u32) = (0x18e, 0x18e); + const IA32_PERFEVTSEL9: (u32, u32) = (0x18f, 0x18f); + + // TODO: 0x18a - 0x194 is reserved and should not be included in any list + + // TODO: 0x196, 197 is reserved and should not be included in any list + // + + const IA32_PERF_STATUS: (u32, u32) = (0x198, 
0x198); + + const IA32_PERF_CTL: (u32, u32) = (0x199, 0x199); + + // Disabled via CPUID for non-host profiles + const IA32_THERM_INTERRUPT: (u32, u32) = (0x19b, 0x19b); + + // Disabled via CPUID for non-host profiles + const IA32_THERM_STATUS: (u32, u32) = (0x19c, 0x19c); + + // Disabled via CPUID for non-host profiles + const IA32_ENERGY_PERF_BIAS: (u32, u32) = (0x1b0, 0x1b0); + + // Disabled via CPUID for non-host profiles + const IA32_PACKAGE_THERM_STATUS: (u32, u32) = (0x1b1, 0x1b1); + + // Disabled via CPUID for non-host profiles + const IA32_PACKAGE_THERM_INTERRUPT: (u32, u32) = (0x1b2, 0x1b2); + + const IA32_DEBUGCTL: (u32, u32) = (0x1d9, 0x1d9); + + const IA32_LER_FROM_IP: (u32, u32) = (0x1dd, 0x1dd); + + const IA32_LER_TO_IP: (u32, u32) = (0x1de, 0x1de); + + const IA32_LER_INFO: (u32, u32) = (0x1e0, 0x1e0); + + const IA32_MC_I_CTL2: (u32, u32) = (0x280, 0x29f); + + // Disabled via CPUID for non-host profiles + const IA32_INTEGRITY_STATUS: (u32, u32) = (0x2dc, 0x2dc); + + const IA32_FIXED_CTRI: (u32, u32) = (0x309, 0x30f); + + // IA32_PERF_CAPABILITIES is an MSR-based feature thus not listed here + + // Disabled via CPUID for non-host profiles + const IA32_FIXED_CTR_CTRL: (u32, u32) = (0x38d, 0x38d); + + // Disabled via CPUID for non-host profiles + const IA32_PERF_GLOBAL_STATUS: (u32, u32) = (0x38e, 0x38e); + + // Disabled via CPUID for non-host profiles + const IA32_PERF_GLOBAL_CTRL: (u32, u32) = (0x38f, 0x38f); + + // Disabled via CPUID for non-host profiles + const IA32_PERF_GLOBAL_STATUS_RESET: (u32, u32) = (0x390, 0x390); + + // Disabled via CPUID for non-host profiles + const IA32_PERF_GLOBAL_STATUS_SET: (u32, u32) = (0x391, 0x391); + + // Disabled via CPUID for non-host profiles + const IA32_PERF_GLOBAL_INUSE: (u32, u32) = (0x392, 0x392); + + // TODO: Not sure about this one, but seems to be related to performance monitoring which + // should be disabled for non-host CPU profiles. 
+ const IA32_PEBS_ENABLE: (u32, u32) = (0x3f1, 0x3f1); + + const IA32_A_PMC0: (u32, u32) = (0x4c1, 0x4c1); + const IA32_A_PMC1: (u32, u32) = (0x4c2, 0x4c2); + const IA32_A_PMC2: (u32, u32) = (0x4c3, 0x4c3); + const IA32_A_PMC3: (u32, u32) = (0x4c4, 0x4c4); + const IA32_A_PMC4: (u32, u32) = (0x4c5, 0x4c5); + const IA32_A_PMC5: (u32, u32) = (0x4c6, 0x4c6); + const IA32_A_PMC6: (u32, u32) = (0x4c7, 0x4c7); + const IA32_A_PMC7: (u32, u32) = (0x4c8, 0x4c8); + const IA32_A_PMC8: (u32, u32) = (0x4c9, 0x4c9); + const IA32_A_PMC9: (u32, u32) = (0x4ca, 0x4ca); + + const IA32_MCG_EXT_CTL: (u32, u32) = (0x4d0, 0x4d0); + + // SGX is disabled via CPUID for non-host CPU profiles + const IA32_SGX_SVN_STATUS: (u32, u32) = (0x500, 0x500); + + // Disabled via CPUID for non-host CPU profiles + const IA32_RTIT_OUTPUT_BASE: (u32, u32) = (0x560, 0x560); + + // Disabled via CPUID for non-host CPU profiles + const IA32_RTIT_OUTPUT_MASK_PTRS: (u32, u32) = (0x561, 0x561); + + // Disabled via CPUID for non-host CPU profiles + const IA32_RTIT_CTL: (u32, u32) = (0x570, 0x570); + + // Disabled via CPUID for non-host CPU profiles + const IA32_RTIT_STATUS: (u32, u32) = (0x571, 0x571); + + // Disabled via CPU profiles + const IA32_RTIT_CR3_MATCH: (u32, u32) = (0x572, 0x572); + + const IA32_RTIT_ADDR0_A: (u32, u32) = (0x580, 0x580); + const IA32_RTIT_ADDR0_B: (u32, u32) = (0x581, 0x581); + const IA32_RTIT_ADDR1_A: (u32, u32) = (0x582, 0x582); + const IA32_RTIT_ADDR1_B: (u32, u32) = (0x583, 0x583); + const IA32_RTIT_ADDR2_A: (u32, u32) = (0x584, 0x584); + const IA32_RTIT_ADDR2_B: (u32, u32) = (0x585, 0x585); + const IA32_RTIT_ADDR3_A: (u32, u32) = (0x586, 0x586); + const IA32_RTIT_ADDR3_B: (u32, u32) = (0x587, 0x587); + + // Disabled via CPUID for non-host CPU profiles + const IA32_DS_AREA: (u32, u32) = (0x600, 0x600); + + // U_CET and S_CET are disabled via CPUID + // TODO: Include compile time checks for that + const IA32_U_CET: (u32, u32) = (0x6a0, 0x6a0); + const IA32_S_CET: (u32, u32) = (0x6a2, 
0x6a2); + + // TODO: IA32_TSC_DEADLINE should be available because the TSC_DEADLINE CPUID bit + // is set by CHV unconditionally. The availability of this MSR probably needs to be + // handled by CHV itself and not the CPU profiles + + // Disabled via CPUID for non-host CPU profiles + const IA32_PKRS: (u32, u32) = (0x6e1, 0x6e1); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PM_ENABLE: (u32, u32) = (0x770, 0x770); + + // Disabled via CPUID for non-host CPU profiles + const IA32_HWP_CAPABILITIES: (u32, u32) = (0x771, 0x771); + + // Disabled via CPUID for non-host CPU profiles + const IA32_HWP_REQUEST_PKG: (u32, u32) = (0x772, 0x772); + + // Disabled via CPUID for non-host CPU profiles + const IA32_HWP_INTERRUPT: (u32, u32) = (0x773, 0x773); + + // Disabled via CPUID for non-host CPU profiles + const IA32_HWP_REQUEST: (u32, u32) = (0x774, 0x774); + + // Disabled via CPUID for non-host CPU profiles + const IA32_HWP_CTL: (u32, u32) = (0x776, 0x776); + + // Disabled via CPUID for non-host CPU profiles + const IA32_HWP_STATUS: (u32, u32) = (0x777, 0x777); + + const IA32_MCU_EXT_SERVICE: (u32, u32) = (0x7a3, 0x7a3); + + const IA32_MCU_ROLLBACK_MIN_ID: (u32, u32) = (0x7a4, 0x7a4); + + // TODO: Not sure about IA32_MCU_STAGING_MBOX_ADDR + + const IA32_ROLLBACK_SIGN_ID_0: (u32, u32) = (0x7b0, 0x7b0); + const IA32_ROLLBACK_SIGN_ID_1: (u32, u32) = (0x7b1, 0x7b1); + const IA32_ROLLBACK_SIGN_ID_2: (u32, u32) = (0x7b2, 0x7b2); + const IA32_ROLLBACK_SIGN_ID_3: (u32, u32) = (0x7b3, 0x7b3); + const IA32_ROLLBACK_SIGN_ID_4: (u32, u32) = (0x7b4, 0x7b4); + const IA32_ROLLBACK_SIGN_ID_5: (u32, u32) = (0x7b5, 0x7b5); + const IA32_ROLLBACK_SIGN_ID_6: (u32, u32) = (0x7b6, 0x7b6); + const IA32_ROLLBACK_SIGN_ID_7: (u32, u32) = (0x7b7, 0x7b7); + const IA32_ROLLBACK_SIGN_ID_8: (u32, u32) = (0x7b8, 0x7b8); + const IA32_ROLLBACK_SIGN_ID_9: (u32, u32) = (0x7b9, 0x7b9); + const IA32_ROLLBACK_SIGN_ID_10: (u32, u32) = (0x7ba, 0x7ba); + const IA32_ROLLBACK_SIGN_ID_11: (u32, u32) = 
(0x7bb, 0x7bb); + const IA32_ROLLBACK_SIGN_ID_12: (u32, u32) = (0x7bc, 0x7bc); + const IA32_ROLLBACK_SIGN_ID_13: (u32, u32) = (0x7bd, 0x7bd); + const IA32_ROLLBACK_SIGN_ID_14: (u32, u32) = (0x7be, 0x7be); + const IA32_ROLLBACK_SIGN_ID_15: (u32, u32) = (0x7bf, 0x7bf); + + // Disabled via CPUID for non-host CPU profiles + const IA32_TME_CAPABILITY: (u32, u32) = (0x981, 0x981); + + // Disabled via CPUID for non-host CPU profiles + const IA32_TME_ACTIVATE: (u32, u32) = (0x982, 0x982); + + // Disabled via CPUID for non-host CPU profiles + const IA32_TME_EXCLUDE_MASK: (u32, u32) = (0x983, 0x983); + + // Disabled via CPUID for non-host CPU profiles + const IA32_TME_EXCLUDE_BASE: (u32, u32) = (0x984, 0x984); + + // Disabled via CPUID for non-host CPU profiles + const IA32_UINTR_RR: (u32, u32) = (0x985, 0x985); + + // Disabled via CPUID for non-host CPU profiles + const IA32_UINTR_HANDLER: (u32, u32) = (0x986, 0x986); + + // Disabled via CPUID for non-host CPU profiles + const IA32_UINTR_STACKADJUST: (u32, u32) = (0x987, 0x987); + + // Disabled via CPUID for non-host CPU profiles + const IA32_UINTR_MISC: (u32, u32) = (0x988, 0x988); + + // Disabled via CPUID for non-host CPU profiles + const IA32_UINTR_PD: (u32, u32) = (0x989, 0x989); + + // Disabled via CPUID for non-host CPU profiles + const IA32_UINTR_TT: (u32, u32) = (0x98a, 0x98a); + + // Disabled via CPUID for non-host CPU profiles + const IA32_COPY_STATUS: (u32, u32) = (0x990, 0x990); + + // Disabled via CPUID for non-host CPU profiles + const IA32_IWKEYBACKUP_STATUS: (u32, u32) = (0x991, 0x991); + + const IA32_TME_CLEAR_SAVED_KEY: (u32, u32) = (0x9fb, 0x9fb); + + // Disabled via CPUID for non-host CPU profiles + const IA32_DEBUG_INTERFACE: (u32, u32) = (0xc80, 0xc80); + + // Disabled via CPUID for non-host CPU profiles + const IA32_L3_QOS_CFG: (u32, u32) = (0xc81, 0xc81); + + // Disabled via CPUID + const IA32_L2_QOS_CFG: (u32, u32) = (0xc82, 0xc82); + + // Disabled via CPUID + const IA32_L3_IO_QOS_CFG: (u32, u32) = 
(0xc83, 0xc83); + + const IA32_RESOURCE_PRIORITY: (u32, u32) = (0xc88, 0xc88); + const IA32_RESOURCE_PRIORITY_PKG: (u32, u32) = (0xc89, 0xc89); + + // Disabled via CPUID for non-host CPU profiles + const IA32_QM_EVTSEL: (u32, u32) = (0xc8d, 0xc8d); + + // Disabled via CPUID for non-host CPU profiles + const IA32_QM_CTR: (u32, u32) = (0xc8e, 0xc8e); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PQR_ASSOC: (u32, u32) = (0xc8f, 0xc8f); + + // Disabled via CPUID for non-host CPU profiles + const IA32_L3_MASK_0: (u32, u32) = (0xc90, 0xc90); + + const IA32_L3_MASK_N: (u32, u32) = (0xc91, 0xd8f); + + // Disabled via CPUID for non-host CPU profiles + const IA32_L2_MASK_0: (u32, u32) = (0xd10, 0xd10); + + // Disabled via CPUID for non-host CPU profiles + const IA32_L2_MASK_N: (u32, u32) = (0xd11, 0xd4f); + + // Disabled via CPUID for non-host CPU profiles + const IA32_L2_QOS_EXT_BW_THRTL_I: (u32, u32) = (0xd50, 0xd5e); + + // Disabled via CPUID for non-host CPU profiles + const IA32_BNDCFGS: (u32, u32) = (0xd90, 0xd90); + + // Disabled via CPUID for non-host CPU profiles + const IA32_COPY_LOCAL_TO_PLATFORM: (u32, u32) = (0xd91, 0xd91); + + // Disabled via CPUID for non-host CPU profiles + const IA32_COPY_PLATFORM_TO_LOCAL: (u32, u32) = (0xd92, 0xd92); + + const IA32_PASID: (u32, u32) = (0xd93, 0xd93); + + /* + IA32_XSS is a bit problematic: Only newer kernels will report it via + KVM_GET_MSR_INDEX_LIST, but CPUID 0xd.0x1.EAX[3] reports that this MSR + exists. + + In order for CPU profiles generated with recent kernels to work with + deployments operating with older kernels, we decide to forbid this MSR + for now even though CPUID indicates that it is available to the guest. + + We consider this OK because we have disabled every single IA32_XSS + related state component in the 0xd CPUID leaves, hence there is no + reason for the guest to want to use this.
+ */ + const IA32_XSS: (u32, u32) = (0xda0, 0xda0); + // Disabled via CPUID for non-host CPU profiles + const IA32_PKG_HDC_CTL: (u32, u32) = (0xdb0, 0xdb0); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PM_CTL1: (u32, u32) = (0xdb1, 0xdb1); + + // Disabled via CPUID for non-host CPU profiles + const IA32_THREAD_STALL: (u32, u32) = (0xdb2, 0xdb2); + + // Disabled via CPUID for non-host CPU profiles + const IA32_QOS_CORE_BW_THRTL_0: (u32, u32) = (0xe00, 0xe00); + + // Disabled via CPUID for non-host CPU profiles + const IA32_QOS_CORE_BW_THRTL_1: (u32, u32) = (0xe01, 0xe01); + + // Note that we have CPUID 0x7.EDX.[19] = 0 (ARCH_LBR) + const IA32_LBR_X_INFO: (u32, u32) = (0x1200, 0x121f); + + // TDX related. + const IA32_SEAMRR_BASE: (u32, u32) = (0x1400, 0x1400); + + // TDX related. + const IA32_SEAMRR_MASK: (u32, u32) = (0x1401, 0x1401); + + // Disabled via ARCH_CAPABILITIES for non-host CPU profiles + // TODO: Check that deny policy is compatible with + // the policy for IA32_ARCH_CAPABILITIES[9] + const IA32_MCU_CONTROL: (u32, u32) = (0x1406, 0x1406); + + const IA32_LBR_CTL: (u32, u32) = (0x14ce, 0x14ce); + + const IA32_LBR_DEPTH: (u32, u32) = (0x14cf, 0x14cf); + + const IA32_LBR_X_FROM_IP: (u32, u32) = (0x1500, 0x151f); + + const IA32_LBR_X_TO_IP: (u32, u32) = (0x1600, 0x161f); + + // Disabled via CPUID for non-host CPU profiles + const IA32_HW_FEEDBACK_PTR: (u32, u32) = (0x17d0, 0x17d0); + + // Disabled via CPUID for non-host CPU profiles + const IA32_HW_FEEDBACK_CONFIG: (u32, u32) = (0x17d1, 0x17d1); + + // Disabled via CPUID for non-host CPU profiles + const IA32_HW_FEEDBACK_THREAD_CHAR: (u32, u32) = (0x17d2, 0x17d2); + + const IA32_HW_FEEDBACK_THREAD_CONFIG: (u32, u32) = (0x17d4, 0x17d4); + + const IA32_HRESET_ENABLE: (u32, u32) = (0x17da, 0x17da); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP0_CTR: (u32, u32) = (0x1900, 0x1900); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP0_CFG_A: (u32,
u32) = (0x1901, 0x1901); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP0_CFG_C: (u32, u32) = (0x1903, 0x1903); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP1_CTR: (u32, u32) = (0x1904, 0x1904); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP1_CFG_A: (u32, u32) = (0x1905, 0x1905); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP1_CFG_C: (u32, u32) = (0x1907, 0x1907); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP2_CTR: (u32, u32) = (0x1908, 0x1908); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP2_CFG_A: (u32, u32) = (0x1909, 0x1909); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP2_CFG_B: (u32, u32) = (0x190a, 0x190a); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP2_CFG_C: (u32, u32) = (0x190b, 0x190b); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP3_CTR: (u32, u32) = (0x190c, 0x190c); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP3_CFG_A: (u32, u32) = (0x190d, 0x190d); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP3_CFG_B: (u32, u32) = (0x190e, 0x190e); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP3_CFG_C: (u32, u32) = (0x190f, 0x190f); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP4_CTR: (u32, u32) = (0x1910, 0x1910); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP4_CFG_A: (u32, u32) = (0x1911, 0x1911); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP4_CFG_B: (u32, u32) = (0x1912, 0x1912); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP4_CFG_C: (u32, u32) = (0x1913, 0x1913); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP5_CTR: (u32, u32) = (0x1914, 0x1914); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP5_CFG_A: (u32, u32) = (0x1915, 
0x1915); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP5_CFG_B: (u32, u32) = (0x1916, 0x1916); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP5_CFG_C: (u32, u32) = (0x1917, 0x1917); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP6_CTR: (u32, u32) = (0x1918, 0x1918); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP6_CFG_A: (u32, u32) = (0x1919, 0x1919); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP6_CFG_B: (u32, u32) = (0x191a, 0x191a); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP6_CFG_C: (u32, u32) = (0x191b, 0x191b); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP7_CTR: (u32, u32) = (0x191c, 0x191c); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP7_CFG_A: (u32, u32) = (0x191d, 0x191d); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP7_CFG_B: (u32, u32) = (0x191e, 0x191e); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP7_CFG_C: (u32, u32) = (0x191f, 0x191f); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP8_CTR: (u32, u32) = (0x1920, 0x1920); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP8_CFG_A: (u32, u32) = (0x1921, 0x1921); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP9_CTR: (u32, u32) = (0x1924, 0x1924); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP9_CFG_A: (u32, u32) = (0x1925, 0x1925); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_FX0_CTR: (u32, u32) = (0x1980, 0x1980); + + const IA32_PMC_FX0_CFG_B: (u32, u32) = (0x1982, 0x1982); + const IA32_PMC_FX0_CFG_C: (u32, u32) = (0x1983, 0x1983); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_FX1_CTR: (u32, u32) = (0x1984, 0x1984); + const IA32_PMC_FX1_CFG_B: (u32, u32) = (0x1986, 0x1986); + const IA32_PMC_FX1_CFG_C: (u32, u32) = (0x1987, 0x1987); + // Disabled 
via CPUID for non-host CPU profiles + const IA32_PMC_FX2_CTR: (u32, u32) = (0x1988, 0x1988); + + const IA32_PMC_FX2_CFG_C: (u32, u32) = (0x198b, 0x198b); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_FX3_CTR: (u32, u32) = (0x198c, 0x198c); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_FX4_CTR: (u32, u32) = (0x1990, 0x1990); + const IA32_PMC_FX4_CFG_C: (u32, u32) = (0x1993, 0x1993); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_FX5_CTR: (u32, u32) = (0x1994, 0x1994); + const IA32_PMC_FX5_CFG_C: (u32, u32) = (0x1997, 0x1997); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_FX6_CTR: (u32, u32) = (0x1998, 0x1998); + const IA32_PMC_FX6_CFG_C: (u32, u32) = (0x199b, 0x199b); + + // TODO: Check against IA32_ARCH_CAPABILITIES[12] + const IA32_UARCH_MISC_CTL: (u32, u32) = (0x1b01, 0x1b01); + /// A list of ARCHITECTURAL MSR register addresses that are forbidden for all non-host CPU profiles and also not + /// considered MSR-based FEATURE indices by KVM. + pub(in crate::x86_64) const FORBIDDEN_IA32_MSR_RANGES: [(u32, u32); 229] = [ + IA32_P5_MC_ADDR, + IA32_P5_MC_TYPE, + // TODO: Not sure about IA32_P5_MC_ADDR & IA32_P5_MC_TYPE + IA32_MONITOR_FILTER_SIZE, + // TODO: Not sure about this one + IA32_PLATFORM_ID, + // Only available if CPUID 0x7.0x1.EBX[0] = 1, but this is always 0 for non-host CPU profiles + IA32_PPIN_CTL, + // Only available if CPUID 0x7.0x1.EBX[0] = 1, but this is always 0 for non-host CPU profiles + IA32_PPIN, + // Used for microcode updates. Should not be available for guests.
+ IA32_BIOS_UPDT_TRIG, + // Currently only related to Secure enclaves/Keylocker which is not available for non-host CPU profiles + IA32_FEATURE_ACTIVATION, + IA32_FZM_RANGE_INDEX, + IA32_SMRR_PHYS_BASE_MASK, + IA32_PECI_HWP_REQUEST_INFO, + // Related to microcode updates + IA32_MCU_ENUMERATION, + IA32_MCU_STATUS, + // Related to total memory encryption + IA32_MKTME_KEYID_PARTITIONING, + // TODO: Not sure what to do about IA32_BIOS_SIGN_ID (note that it is also a MSR-based feature according to KVM) + IA32_SGXLEPUBKEYHASH0, + IA32_SGXLEPUBKEYHASH1, + IA32_SGXLEPUBKEYHASH2, + IA32_SGXLEPUBKEYHASH3, + IA32_SGXLEPUBKEYHASH4, + IA32_SGXLEPUBKEYHASH5, + // TODO: Check this + IA32_SMBASE, + IA32_MISC_PACKAGE_CTLS, + IA32_XAPIC_DISABLE_STATUS, + IA32_OVERCLOCKING_STATUS, + IA32_PMC0, + IA32_PMC1, + IA32_PMC2, + IA32_PMC3, + IA32_PMC4, + IA32_PMC5, + IA32_PMC6, + IA32_PMC7, + IA32_PMC8, + IA32_PMC9, + IA32_CORE_CAPABILITIES, + IA32_UMWAIT_CONTROL, + IA32_CLOCK_MODULATION, + IA32_PLI_SSP, + IA32_INTERRUPT_SSP_TABLE_ADDR, + // Disabled by CPUID for non-host CPU profiles + IA32_MPERF, + IA32_APERF, + IA32_TSX_FORCE_ABORT, + // Disabled via static IA32_ARCH_CAPABILITIES bit for non-host CPU profiles + IA32_TSX_CTRL, + // NOTE: IA32_MCU_OPT_CTRL must necessarily be available, due to + // what we set in CPUID for some CPU profiles (inherit policy) + + // TODO: Don't know about IA32_SYSENTER_CS, IA32_SYSENTER_ESP, + // IA32_SYSENTER_EIP + // + IA32_MCG_CTL, + // TODO: 0x180- 0x185 is reserved, we should not list these MSRS at all + // Disabled via CPUID for all non-host CPU profiles + IA32_PERFEVTSEL0, + IA32_PERFEVTSEL1, + IA32_PERFEVTSEL2, + IA32_PERFEVTSEL3, + IA32_PERFEVTSEL4, + IA32_PERFEVTSEL5, + IA32_PERFEVTSEL6, + IA32_PERFEVTSEL7, + IA32_PERFEVTSEL8, + IA32_PERFEVTSEL9, + // TODO: 0x18a - 0x194 is reserved and should not be included in any list + + // TODO: 0x196, 197 is reserved and should not be included in any list + // + IA32_PERF_STATUS, + IA32_PERF_CTL, + //
Disabled via CPUID for non-host profiles + IA32_THERM_INTERRUPT, + // Disabled via CPUID for non-host profiles + IA32_THERM_STATUS, + // TODO: Consider disabling IA32_MISC_ENABLE + + // Disabled via CPUID for non-host profiles + IA32_ENERGY_PERF_BIAS, + // Disabled via CPUID for non-host profiles + IA32_PACKAGE_THERM_STATUS, + // Disabled via CPUID for non-host profiles + IA32_PACKAGE_THERM_INTERRUPT, + IA32_DEBUGCTL, + IA32_LER_FROM_IP, + IA32_LER_TO_IP, + IA32_LER_INFO, + // TODO: Not sure about IA32_SMRR_PHYSBASE & IA32_SMRR_PHYSMASK + IA32_MC_I_CTL2, + // Disabled via CPUID for non-host profiles + IA32_INTEGRITY_STATUS, + IA32_FIXED_CTRI, + // IA32_PERF_CAPABILITIES is an MSR-based feature thus not listed here + + // Disabled via CPUID for non-host profiles + IA32_FIXED_CTR_CTRL, + // Disabled via CPUID for non-host profiles + IA32_PERF_GLOBAL_STATUS, + // Disabled via CPUID for non-host profiles + IA32_PERF_GLOBAL_CTRL, + // Disabled via CPUID for non-host profiles + IA32_PERF_GLOBAL_STATUS_RESET, + // Disabled via CPUID for non-host profiles + IA32_PERF_GLOBAL_STATUS_SET, + // Disabled via CPUID for non-host profiles + IA32_PERF_GLOBAL_INUSE, + // TODO: Not sure about this one, but seems to be related to performance monitoring which + // should be disabled for non-host CPU profiles. 
+ IA32_PEBS_ENABLE, + IA32_A_PMC0, + IA32_A_PMC1, + IA32_A_PMC2, + IA32_A_PMC3, + IA32_A_PMC4, + IA32_A_PMC5, + IA32_A_PMC6, + IA32_A_PMC7, + IA32_A_PMC8, + IA32_A_PMC9, + IA32_MCG_EXT_CTL, + // SGX is disabled via CPUID for non-host CPU profiles + IA32_SGX_SVN_STATUS, + // Disabled via CPUID for non-host CPU profiles + IA32_RTIT_OUTPUT_BASE, + // Disabled via CPUID for non-host CPU profiles + IA32_RTIT_OUTPUT_MASK_PTRS, + // Disabled via CPUID for non-host CPU profiles + IA32_RTIT_CTL, + // Disabled via CPUID for non-host CPU profiles + IA32_RTIT_STATUS, + // Disabled via CPU profiles + IA32_RTIT_CR3_MATCH, + IA32_RTIT_ADDR0_A, + IA32_RTIT_ADDR0_B, + IA32_RTIT_ADDR1_A, + IA32_RTIT_ADDR1_B, + IA32_RTIT_ADDR2_A, + IA32_RTIT_ADDR2_B, + IA32_RTIT_ADDR3_A, + IA32_RTIT_ADDR3_B, + // Disabled via CPUID for non-host CPU profiles + IA32_DS_AREA, + IA32_U_CET, + IA32_S_CET, + // Disabled via CPUID for non-host CPU profiles + IA32_PKRS, + // Disabled via CPUID for non-host CPU profiles + IA32_PM_ENABLE, + // Disabled via CPUID for non-host CPU profiles + IA32_HWP_CAPABILITIES, + // Disabled via CPUID for non-host CPU profiles + IA32_HWP_REQUEST_PKG, + // Disabled via CPUID for non-host CPU profiles + IA32_HWP_INTERRUPT, + // Disabled via CPUID for non-host CPU profiles + IA32_HWP_REQUEST, + // TODO: Can we also deny IA32_PECI_HWP_REQUEST_INFO? + + // Disabled via CPUID for non-host CPU profiles + IA32_HWP_CTL, + // Disabled via CPUID for non-host CPU profiles + IA32_HWP_STATUS, + // TODO: Currently permitted via IA32_ARCH_CAPABILITIES (bit 22), + // but that bit should probably have policy Static(0) ? 
+ IA32_MCU_EXT_SERVICE, + IA32_MCU_ROLLBACK_MIN_ID, + // TODO: Not sure about IA32_MCU_STAGING_MBOX_ADDR + IA32_ROLLBACK_SIGN_ID_0, + IA32_ROLLBACK_SIGN_ID_1, + IA32_ROLLBACK_SIGN_ID_2, + IA32_ROLLBACK_SIGN_ID_3, + IA32_ROLLBACK_SIGN_ID_4, + IA32_ROLLBACK_SIGN_ID_5, + IA32_ROLLBACK_SIGN_ID_6, + IA32_ROLLBACK_SIGN_ID_7, + IA32_ROLLBACK_SIGN_ID_8, + IA32_ROLLBACK_SIGN_ID_9, + IA32_ROLLBACK_SIGN_ID_10, + IA32_ROLLBACK_SIGN_ID_11, + IA32_ROLLBACK_SIGN_ID_12, + IA32_ROLLBACK_SIGN_ID_13, + IA32_ROLLBACK_SIGN_ID_14, + IA32_ROLLBACK_SIGN_ID_15, + // Disabled via CPUID for non-host CPU profiles + IA32_TME_CAPABILITY, + // Disabled via CPUID for non-host CPU profiles + IA32_TME_ACTIVATE, + // Disabled via CPUID for non-host CPU profiles + IA32_TME_EXCLUDE_MASK, + // Disabled via CPUID for non-host CPU profiles + IA32_TME_EXCLUDE_BASE, + // Disabled via CPUID for non-host CPU profiles + IA32_UINTR_RR, + // Disabled via CPUID for non-host CPU profiles + IA32_UINTR_HANDLER, + // Disabled via CPUID for non-host CPU profiles + IA32_UINTR_STACKADJUST, + // Disabled via CPUID for non-host CPU profiles + IA32_UINTR_MISC, + // Disabled via CPUID for non-host CPU profiles + IA32_UINTR_PD, + // Disabled via CPUID for non-host CPU profiles + IA32_UINTR_TT, + // Disabled via CPUID for non-host CPU profiles + IA32_COPY_STATUS, + // Disabled via CPUID for non-host CPU profiles + IA32_IWKEYBACKUP_STATUS, + IA32_TME_CLEAR_SAVED_KEY, + // Disabled via CPUID for non-host CPU profiles + IA32_DEBUG_INTERFACE, + // Disabled via CPUID for non-host CPU profiles + IA32_L3_QOS_CFG, + // Disabled via CPUID + IA32_L2_QOS_CFG, + // Disabled via CPUID + IA32_L3_IO_QOS_CFG, + IA32_RESOURCE_PRIORITY, + IA32_RESOURCE_PRIORITY_PKG, + // Disabled via CPUID for non-host CPU profiles + IA32_QM_EVTSEL, + // Disabled via CPUID for non-host CPU profiles + IA32_QM_CTR, + // Disabled via CPUID for non-host CPU profiles + IA32_PQR_ASSOC, + // Disabled via CPUID for non-host CPU profiles + IA32_L3_MASK_0, + 
IA32_L3_MASK_N, + // Disabled via CPUID for non-host CPU profiles + IA32_L2_MASK_0, + // Disabled via CPUID for non-host CPU profiles + IA32_L2_MASK_N, + // Disabled via CPUID for non-host CPU profiles + IA32_L2_QOS_EXT_BW_THRTL_I, + // Disabled via CPUID for non-host CPU profiles + IA32_BNDCFGS, + // Disabled via CPUID for non-host CPU profiles + IA32_COPY_LOCAL_TO_PLATFORM, + // Disabled via CPUID for non-host CPU profiles + IA32_COPY_PLATFORM_TO_LOCAL, + IA32_PASID, + IA32_XSS, + // Disabled via CPUID for non-host CPU profiles + IA32_PKG_HDC_CTL, + // Disabled via CPUID for non-host CPU profiles + IA32_PM_CTL1, + // Disabled via CPUID for non-host CPU profiles + IA32_THREAD_STALL, + // Disabled via CPUID for non-host CPU profiles + IA32_QOS_CORE_BW_THRTL_0, + // Disabled via CPUID for non-host CPU profiles + IA32_QOS_CORE_BW_THRTL_1, + // TODO: Is it OK to disable this for CPU profiles? + // Note that we have CPUID 0x7.EDX.[19] = 0 (ARCH_LBR) + IA32_LBR_X_INFO, + // TDX related. + IA32_SEAMRR_BASE, + // TDX related. 
+ IA32_SEAMRR_MASK, + // Disabled via ARCH_CAPABILITIES for non-host CPU profiles + IA32_MCU_CONTROL, + IA32_LBR_CTL, + IA32_LBR_DEPTH, + IA32_LBR_X_FROM_IP, + IA32_LBR_X_TO_IP, + // Disabled via CPUID for non-host CPU profiles + IA32_HW_FEEDBACK_PTR, + // Disabled via CPUID for non-host CPU profiles + IA32_HW_FEEDBACK_CONFIG, + // Disabled via CPUID for non-host CPU profiles + IA32_HW_FEEDBACK_THREAD_CHAR, + IA32_HW_FEEDBACK_THREAD_CONFIG, + IA32_HRESET_ENABLE, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP0_CTR, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP0_CFG_A, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP0_CFG_C, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP1_CTR, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP1_CFG_A, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP1_CFG_C, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP2_CTR, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP2_CFG_A, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP2_CFG_B, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP2_CFG_C, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP3_CTR, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP3_CFG_A, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP3_CFG_B, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP3_CFG_C, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP4_CTR, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP4_CFG_A, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP4_CFG_B, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP4_CFG_C, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP5_CTR, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP5_CFG_A, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP5_CFG_B, + // Disabled via CPUID for 
non-host CPU profiles + IA32_PMC_GP5_CFG_C, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP6_CTR, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP6_CFG_A, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP6_CFG_B, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP6_CFG_C, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP7_CTR, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP7_CFG_A, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP7_CFG_B, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP7_CFG_C, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP8_CTR, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP8_CFG_A, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP9_CTR, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP9_CFG_A, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_FX0_CTR, + IA32_PMC_FX0_CFG_B, + IA32_PMC_FX0_CFG_C, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_FX1_CTR, + IA32_PMC_FX1_CFG_B, + IA32_PMC_FX1_CFG_C, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_FX2_CTR, + IA32_PMC_FX2_CFG_C, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_FX3_CTR, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_FX4_CTR, + IA32_PMC_FX4_CFG_C, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_FX5_CTR, + IA32_PMC_FX5_CFG_C, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_FX6_CTR, + IA32_PMC_FX6_CFG_C, + IA32_UARCH_MISC_CTL, + ]; +} diff --git a/arch/src/x86_64/msr_definitions/intel/mod.rs b/arch/src/x86_64/msr_definitions/intel/mod.rs new file mode 100644 index 0000000000..c8e8a91d5c --- /dev/null +++ b/arch/src/x86_64/msr_definitions/intel/mod.rs @@ -0,0 +1,21 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +#[cfg(feature = "cpu_profile_generation")] +mod architectural_msrs; + 
+#[cfg(feature = "cpu_profile_generation")] +mod non_architectural_msrs; + +mod msr_based_features; + +#[cfg(feature = "cpu_profile_generation")] +pub(in crate::x86_64) use architectural_msrs::FORBIDDEN_IA32_MSR_RANGES; +#[cfg(feature = "cpu_profile_generation")] +pub(in crate::x86_64) use architectural_msrs::PERMITTED_IA32_MSRS; +pub use msr_based_features::INTEL_MSR_FEATURE_DEFINITIONS; +pub(in crate::x86_64) use msr_based_features::check_feature_msr_compatibility; +#[cfg(feature = "cpu_profile_generation")] +pub(in crate::x86_64) use non_architectural_msrs::NON_ARCHITECTURAL_INTEL_MSRS; diff --git a/arch/src/x86_64/msr_definitions/intel/msr_based_features.rs b/arch/src/x86_64/msr_definitions/intel/msr_based_features.rs new file mode 100644 index 0000000000..e5cb7b214d --- /dev/null +++ b/arch/src/x86_64/msr_definitions/intel/msr_based_features.rs @@ -0,0 +1,4442 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::collections::HashMap; + +use log::{debug, error, warn}; + +use crate::x86_64::msr_definitions::{ + MsrDefinitions, ProfilePolicy, RegisterAddress, ValueDefinition, ValueDefinitions, +}; +// Register addresses of the Intel MSRs used as entry keys in `INTEL_MSR_FEATURE_DEFINITIONS` below: architectural (IA32_*) MSRs first, then non-architectural ones. +impl RegisterAddress { + pub const IA32_BIOS_SIGN_ID: Self = Self(0x8b); + pub const IA32_ARCH_CAPABILITIES: Self = Self(0x10a); + pub const IA32_PERF_CAPABILITIES: Self = Self(0x345); + pub const IA32_VMX_BASIC: Self = Self(0x480); + pub const IA32_VMX_PINBASED_CTLS: Self = Self(0x481); + pub const IA32_VMX_PROCBASED_CTLS: Self = Self(0x482); + pub const IA32_VMX_EXIT_CTLS: Self = Self(0x483); + pub const IA32_VMX_ENTRY_CTLS: Self = Self(0x484); + pub const IA32_VMX_MISC: Self = Self(0x485); + pub const IA32_VMX_CR0_FIXED0: Self = Self(0x486); + pub const IA32_VMX_CR0_FIXED1: Self = Self(0x487); + pub const IA32_VMX_CR4_FIXED0: Self = Self(0x488); + pub const IA32_VMX_CR4_FIXED1: Self = Self(0x489); + pub const IA32_VMX_VMCS_ENUM: Self = Self(0x48a); + pub const IA32_VMX_PROCBASED_CTLS2: Self = Self(0x48b); + 
pub const IA32_VMX_EPT_VPID_CAP: Self = Self(0x48c); + pub const IA32_VMX_TRUE_PINBASED_CTLS: Self = Self(0x48d); + pub const IA32_VMX_TRUE_PROCBASED_CTLS: Self = Self(0x48e); + pub const IA32_VMX_TRUE_EXIT_CTLS: Self = Self(0x48f); + pub const IA32_VMX_TRUE_ENTRY_CTLS: Self = Self(0x490); + pub const IA32_VMX_VMFUNC: Self = Self(0x491); + pub const IA32_VMX_PROCBASED_CTLS3: Self = Self(0x492); + pub const IA32_VMX_EXIT_CTLS2: Self = Self(0x493); + + // =============== Non-architectural MSRs ======== + + // KVM on Intel Skylake reports this as an MSR-based feature + pub const MSR_PLATFORM_INFO: Self = Self(0xce); +} + +/// This table contains descriptions of all the MSRs whose register addresses can be contained in +/// the list returned by `KVM_GET_MSR_FEATURE_INDEX_LIST` when executed on an Intel CPU. +/// +/// The values described here are based on the Intel 64 and IA-32 Architectures Software Developer's +/// Manual Combined Volumes: 1, 2A, 2B, 2C, 2D, 3A, 3B, 3C, 3D, and 4 from October 2025. +/// +/// We try to use the same short descriptions (the `short` field) as Intel, but in the cases where we could not find an +/// official name for the bit field(s) we invented our own based on the description. +/// +/// The descriptions written here are based on those found in the aforementioned manual, but are often less +/// detailed. We recommend consulting the official Intel documentation whenever more information +/// is required. +/// +/// +/// ## Future-proofing +/// +/// Future processors and/or KVM versions may introduce more MSR-based features than those listed here at the time of writing. +/// To make sure that this is taken into account, the CPU profile generation tool reports an error when it detects such a feature. The person +/// attempting to create a new CPU profile should then update this table accordingly and try again. 
+pub static INTEL_MSR_FEATURE_DEFINITIONS: MsrDefinitions<24> = const { + MsrDefinitions([ + ( + RegisterAddress::IA32_BIOS_SIGN_ID, + ValueDefinitions::new(&[ + ValueDefinition { + short: "PATCH_SIGN_ID", + description: "Any non-zero value is the microcode update signature patch signature ID", + bits_range: (32, 63), + policy: ProfilePolicy::Passthrough, + } + ]) + ), + + ( + RegisterAddress::IA32_ARCH_CAPABILITIES, + ValueDefinitions::new(&[ + ValueDefinition { + short: "RDCL_NO", + description: "The processor is not susceptible to Rogue Data Cache Load (RDCL)", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "IBRS_ALL", + description: "The processor supports enhanced IBRS", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + // Skylake has this bit set, but not Sapphire Rapids + // TODO: Is Inherit the right policy here? (Will it still be possible to use the Skylake profile on a Sapphire Rapids machine?) + ValueDefinition { + short: "RSBA", + description: "The processor supports RSB Alternate", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "SKIP_L1DFL_VMENTRY", + description: "A value of 1 indicates the hypervisor need not flush the L1D on VM entry", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "SSB_NO", + description: "Processor is not susceptible to Speculation Store Bypass", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "MDS_NO", + description: "Processor is not susceptible to Microarchitectural Data Sampling (MDS)", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "IF_PSCHANGE_MC_NO", + description: "The processor is not susceptible to a machine check error due to modifying the size of a code page without TLB invalidation", + bits_range: (6, 6), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "TSX_CTRL", + 
description: "If 1, indicates presence of IA32_TSX_CTRL MSR", + bits_range: (7, 7), + // TSX is riddled with CVEs + // TODO: Check that this is indeed the right policy + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "TAA_NO", + description: "If 1, processor is not affected by TAA", + bits_range: (8, 8), + // This is TSX related which we disable anyway + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "MCU_CONTROL", + description: "If 1, the processor supports the IA32_MCU_CONTROL MSR", + bits_range: (9, 9), + // TODO: Check what the IA32_MCU_CONTROL MSR is + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "MISC_PACKAGE_CTLS", + description: "The processor supports IA32_MISC_PACKAGE_CTLS MSR", + bits_range: (10, 10), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "ENERGY_FILTERING_CTL", + description: "The processor supports setting and reading the IA32_MISC_PACKAGE_CTLS[0] (ENERGY_FILTERING_ENABLE) bit", + bits_range: (11, 11), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "DOITM:", + description: "If 1, the processor supports Data Operand Independent Timing Mode", + bits_range: (12, 12), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "SBDR_SSDP_NO", + description: "The processor is not affected by either the Shared Buffers Data Read (SBDR) vulnerability or the Sideband Stale Data Propagator (SSDP)", + bits_range: (13, 13), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "FBSDP_NO", + description: "The processor is not affected by the Fill Buffer Stale Data Propagator (DBSDP)", + bits_range: (14, 14), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "PSDP_NO", + description: "The processor is not affected by vulnerabilities involving the Primary Stale Data Propagator (PSDP)", + bits_range: (15, 15), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "MCU_ENUMERATION", + 
description: "If 1, the processor supportss the IA32_MCU_ENUMERATION and IA32_MCU_STATUS MSRs", + bits_range: (16, 16), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "FB_CLEAR", + description: "If 1, the processor supports overwrite of fill buffer values as part of MD_CLEAR operations with the VERW instruction. + On these processors L1D_FLUSH does not overwrite fill buffer values", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit, + }, + + ValueDefinition { + short: "FB_CLEAR_CTRL", + description: "If 1, the processor supports the IA32_MCU_OPT_CTRL MSR and allows software to set bit 3 of that MSR (FB_CLEAR_DIS)", + bits_range: (18, 18), + policy: ProfilePolicy::Static(0), + }, + + ValueDefinition { + short: "RRSBA", + description: "A value of 1 indicates the processor may have the RRSBA alternate prediction behavior, if not disabled by RRSBA_DIS_U or RRSBA_DIS_S", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit, + }, + + ValueDefinition { + short: "BHI_NO", + description: "A value of 1 indicates BHI_NO branch prediction behavior, regardless of the value of IA32_SPEC_CTRL[BHI_DIS_S] MSR bit", + bits_range: (20, 20), + policy: ProfilePolicy::Inherit, + }, + + ValueDefinition { + short: "XAPIC_DISABLE_STATUS", + description: "Enumerates that the IA32_XAPIC_DISABLE_STATUS MSR exists, and that bit 0 specifies whether the legacy xAPIC is disabled and APIC state is locked to x2APIC", + bits_range: (21, 21), + policy: ProfilePolicy::Static(0), + }, + + ValueDefinition { + short: "MCU_EXTENDED_SERVICE", + description: "If 1, the processor supports MCU extended servicing - IA32_MCU_EXT_SERVICE MSR", + bits_range: (22, 22), + // TODO: Check + policy: ProfilePolicy::Static(0), + }, + + ValueDefinition { + short: "OVERCLOCKING_STATUS", + description: "If set, the IA32_OVERCLOCKING_STATUS MSR exists", + bits_range: (23, 23), + // TODO: Check + policy: ProfilePolicy::Static(0), + }, + + ValueDefinition { + short: "PBRSB_NO", + description: 
"If 1, the processor is not affected by issues related to Post-Barrier Return Stack Buffer Predictions", + bits_range: (24, 24), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "GDS_CTRL", + description: "If 1, the processor supports the GDS_MITG_DIS and GDS_MITG_LOCK bits of the IA32_MCU_OPT_CTRL MSR", + bits_range: (25, 25), + // TODO: Check + policy: ProfilePolicy::Inherit, + }, + + ValueDefinition { + short: "GDS_NO", + description: "If 1, the processor is not affected by Gather Data Sampling", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit, + }, + + ValueDefinition { + short: "RFDS_NO", + description: "If 1, processor is not affected by Register File Data Sampling", + bits_range: (27, 27), + policy: ProfilePolicy::Inherit, + }, + + ValueDefinition { + short: "RFDS_CLEAR", + description: "If 1, when VERW is executed the processor will clear stale data from register files affected by Register File Data Sampling", + bits_range: (28, 28), + policy: ProfilePolicy::Inherit, + }, + + ValueDefinition { + short: "IGN_UMONITOR_SUPPORT", + description: "If 0, IA32_MCU_OPT_CTRL bit 6 (IGN_UMONITOR) is not supported. If 1, it indicates support of IA32_MCU_OPT_CTRL bit 6 (IGN_UMONITOR)", + bits_range: (29, 29), + policy: ProfilePolicy::Static(0), + }, + + ValueDefinition { + short: "MON_UMON_MITG_SUPPORT", + description: "If 1, indicates support for IA32_MCU_OPT_CTRL bit 7 (MON_UMON_MITG), otherwise it is not supported", + bits_range: (30, 30), + policy: ProfilePolicy::Static(0), + }, + + ValueDefinition { + short: "PBOPT_SUPPORT", + description: "If 1, IA32_PBOPT_CTRL bit 0 (Prediction Barrier Option (PBOPT)) is supported, otherwise it is not", + bits_range: (32, 32), + policy: ProfilePolicy::Inherit, + }, + + ValueDefinition { + short: "ITS_NO", + description: "If 0, the hypervisor indicates that the system is not affected by indirect Target Selection. 
If 1, then the hypervisor + indicates that the system may be affected by indirect Target Selection", + bits_range: (62, 62), + policy: ProfilePolicy::Passthrough, + + }, + + ]), + ), + + ( + RegisterAddress::IA32_PERF_CAPABILITIES, + ValueDefinitions::new(&[ + ValueDefinition { + short: "IA32_PERF_CAPABILITIES", + description: "Read Only MSR that enumerates the existence of performance monitoring features", + bits_range: (0, 63), + // This MSR is only valid if CPUID 0x1.ECX[15] is set, but that bit is always zeroed out for CPU profiles different from host + policy: ProfilePolicy::Deny + } + ]) + ), + + ( + RegisterAddress::IA32_VMX_BASIC, + ValueDefinitions::new(&[ + ValueDefinition { + short: "VMCS_REV_ID", + description: "31-bit VMCS revision identifier. Processors that use the same VMCS revision identifier + use the same size for VMCS regions", + bits_range: (0,31), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short: "REGION_SIZE", + description: "Number of bytes that software should allocate for the VMXON region and any VMCS region. It is a value greater than + 0 and at most 4096", + bits_range: (32, 44), + policy: ProfilePolicy::Inherit, + }, + + ValueDefinition { + short: "DUAL_MON", + description: " If 1, the logical processor supports the dual-monitor treatment of system-management + interrupts and system-management mode. See Section 33.15 for details of this treatment", + bits_range: (49, 49), + // TODO: Should we have Static(0)? 
here (I think that might be equivalent to what QEMU does) + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short: "MEM_TYPE", + description: "The memory type that should be used for the VMCS, for data structures referenced by pointers + in the VMCS (I/O bitmaps, virtual-APIC page, MSR areas for VMX transitions), and for the MSEG header", + bits_range: (50, 53), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short: "VM_EXIT_INFO_INS_OUTS", + description: " If 1, the processor reports information in the VM-exit instruction-information field on VM exits + due to execution of the INS and OUTS instructions. + ", + bits_range: (54, 54), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short: "VMX_CTRLS_DEFAULT_MUT", + description: "Any VMX controls that default to 1 may be cleared to 0", + bits_range: (55,55), + policy: ProfilePolicy::Inherit + }, + // This is only available for relatively recent kernels + // TODO: Revisit this policy + ValueDefinition { + short: "VM_ENTRY_HARDWARE_EXCEPTIONS", + description: "If 1, then software can use VM entry to deliver a hardware exception", + bits_range: (56, 56), + policy: ProfilePolicy::Static(0) + } + ]) + ), + + ( + RegisterAddress::IA32_VMX_PINBASED_CTLS, + ValueDefinitions::new(&[ + ValueDefinition { + short:"ALLOWED_ZERO_EXTERNAL_INTERRUPT_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_1_2", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (1, 2), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_NMI_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_4", + 
description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_VIRTUAL_NMIS", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ACTIVATE_VMX_PREEMPTION_TIMER", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (6, 6), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_PROCESS_POSTED_INTERRUPTS", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (7, 7), + policy: ProfilePolicy::Inherit + }, + + + ValueDefinition { + short: "ALLOWED_ZERO", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (8, 31), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short:"ALLOWED_ONE_EXTERNAL_INTERRUPT_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (32, 32), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_1_2", + description: "VM entry allows control X to be 1 if bit X in this MSR is 1", + bits_range: (33, 34), + policy: ProfilePolicy::Inherit + }, + ValueDefinition{ + short:"ALLOWED_ONE_NMI_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (35, 35), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_4", + description: "VM entry allows control X to be 1 if bit X in this MSR is 1", + bits_range: (36, 36), + policy: ProfilePolicy::Inherit + }, + ValueDefinition{ + short:"ALLOWED_ONE_VIRTUAL_NMIS", + description: "See Intel SDM Vol.3C 
Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (37, 37), + policy: ProfilePolicy::Inherit + }, + ValueDefinition{ + short:"ALLOWED_ONE_ACTIVATE_VMX__PREEMPTION_TIMER", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (38, 38), + policy: ProfilePolicy::Inherit + }, + ValueDefinition{ + short:"ALLOWED_ONE_PROCESS_POSTED_INTERRUPTS", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (39, 39), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (40, 63), + policy: ProfilePolicy::Inherit + } + ]) + ), + + ( + RegisterAddress::IA32_VMX_PROCBASED_CTLS, + ValueDefinitions::new(&[ + ValueDefinition { + short: "ALLOWED_ZERO_0_1", + description: "Control X is allowed to be 0 if bit X of this MSR is 0", + bits_range: (0, 1), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_INTERRUPT_WINDOW_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_USE_TSC_OFFSETTING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_4_6", + description: "Control X is allowed to be 0 if bit X of this MSR is 0", + bits_range: (4, 6), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_HLT_EXITING", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (7, 7), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_8", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_INVLPG_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_MWAIT_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (10, 10), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_RDPMC_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (11, 11), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_RDTSC_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_13_14", + description: "Control X is allowed to be 0 if bit X of this MSR is 0", + bits_range: (13, 14), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CR3_LOAD_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (15, 15), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CR3_STORE_EXITING", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (16, 16), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ACTIVATE_TERTIARY_CONTROLS", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_18", + description: "Control X is allowed to be 0 if bit X of this MSR is 0", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CR8_LOAD_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CR8_STORE_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (20, 20), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_USE_TPR_SHADOW", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (21, 21), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_NMI_WINDOW_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (22, 22), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_MOV_DR_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (23, 23), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_UNCONDITIONAL_I/O_EXITING", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (24, 24), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_USE_I/O_BITMAPS", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (25, 25), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_26", + description: "Control X is allowed to be 0 if bit X of this MSR is 0", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_MONITOR_TRAP_FLAG", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (27, 27), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_USE_MSR_BITMAPS", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (28, 28), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_MONITOR_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (29, 29), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_PAUSE_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (30, 30), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ACTIVATE_SECONDARY_CONTROLS", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (31, 31), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_0_1", + description: "Control X is allowed to be 1 if bit 32 + X of this MSR is 1", + bits_range: (32, 33), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_INTERRUPT_WINDOW_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (34, 34), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_USE_TSC_OFFSETTING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (35, 35), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_4_6", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (36, 38), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_HLT_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (39, 39), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_8", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (40, 40), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_INVLPG_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (41, 41), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_MWAIT_EXITING", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (42, 42), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_RDPMC_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (43, 43), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_RDTSC_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (44, 44), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short: "ALLOWED_ONE_13_14", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (45, 46), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_CR3_LOAD_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (47, 47), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_CR3_STORE_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (48, 48), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ACTIVATE_TERTIARY_CONTROLS", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (49, 49), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_18", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (50, 50), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short:"ALLOWED_ONE_CR8_LOAD_EXITING", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (51, 51), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_CR8_STORE_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (52, 52), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_USE_TPR_SHADOW", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (53, 53), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_NMI_WINDOW_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (54, 54), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_MOV_DR_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (55, 55), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_UNCONDITIONAL_I/O_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (56, 56), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_USE_I/O_BITMAPS", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (57, 57), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_26", + description: "Control X is allowed to be 1 if bit X of this MSR is 1", + bits_range: (58, 58), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short:"ALLOWED_ONE_MONITOR_TRAP_FLAG", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (59, 59), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_USE_MSR_BITMAPS", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (60, 60), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_MONITOR_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (61, 61), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_PAUSE_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (62, 62), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ACTIVATE_SECONDARY_CONTROLS", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (63, 63), + policy: ProfilePolicy::Inherit + }, + + ]) + ), + + ( + RegisterAddress::IA32_VMX_EXIT_CTLS, + ValueDefinitions::new(&[ + ValueDefinition { + short: "ALLOWED_ZERO_0_1", + description: "Control X is allowed to be 0 if bit X in this MSR is 0", + bits_range: (0, 1), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_SAVE_DEBUG_CONTROLS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_3_8", + description: "Control X is allowed to be 0 if bit X in this MSR is 0", + bits_range: (3, 8), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_HOST_ADDRESS_SPACE_SIZE", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_10_11", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (10, 11), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_PERF_GLOBAL_CTRL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_13_14", + description: "Control X is allowed to be 0 if bit X in this MSR is 0", + bits_range: (13, 14), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ACKNOWLEDGE_INTERRUPT_O_EXIT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (15, 15), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: 
"ALLOWED_ZERO_16_17", + description: "Control X is allowed to be 0 if bit X in this MSR is 0", + bits_range: (16, 17), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_SAVE_IA32_PAT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_PAT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_SAVE_IA32_EFER", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (20, 20), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_EFER", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (21, 21), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_SAVE_VMX_PREEMPTION_TIMER_VALUE", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (22, 22), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CLEAR_IA32_BNDCFGS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (23, 23), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CONCEAL_VMX_FROM_PT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (24, 24), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CLEAR_IA32_RTIT_CTL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (25, 25), + policy: 
ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CLEAR_IA32_LBR_CTL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (26, 26), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ZERO_CLEAR_UINV", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (27, 27), + policy: ProfilePolicy::Inherit + }, + // TODO: Also determines whether SSP is loaded on VM exit (do we need that?) + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_CET_STATE", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (28, 28), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_PKRS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (29, 29), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_SAVE_IA32_PERF_GLOBAL_CTL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (30, 30), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ACTIVATE_SECONDARY_CONTROLS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (31, 31), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_0_1", + description: "Control X is allowed to be 1 if bit X in this MSR is 1", + bits_range: (32, 33), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short:"ALLOWED_ONE_SAVE_DEBUG_CONTROLS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (34, 34), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short: "ALLOWED_ONE_3_8", + description: "Control X 
is allowed to be 1 if bit X in this MSR is 1", + bits_range: (35, 40), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_HOST_ADDRESS_SPACE_SIZE", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (41, 41), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_10_11", + description: "Control X is allowed to be 1 if bit X in this MSR is 1", + bits_range: (42, 43), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_PERF_GLOBAL_CTRL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (44, 44), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "ALLOWED_ONE_13_14", + description: "Control X is allowed to be 1 if bit X in this MSR is 1", + bits_range: (45, 46), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ACKNOWLEDGE_INTERRUPT_O_EXIT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (47, 47), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_16_17", + description: "Control X is allowed to be 1 if bit X in this MSR is 1", + bits_range: (48, 49), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_SAVE_IA32_PAT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (50, 50), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_PAT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (51, 51), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_SAVE_IA32_EFER", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit 
Controls)", + bits_range: (52, 52), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_EFER", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (53, 53), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_SAVE_VMX_PREEMPTION_TIMER_VALUE", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (54, 54), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_CLEAR_IA32_BNDCFGS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (55, 55), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_CONCEAL_VMX_FROM_PT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (56, 56), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_CLEAR_IA32_RTIT_CTL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (57, 57), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_CLEAR_IA32_LBR_CTL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (58, 58), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_CLEAR_UINV", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (59, 59), + policy: ProfilePolicy::Inherit + }, + // TODO: Also determines whether SSP is loaded on VM exit (do we need that?) 
+ ValueDefinition { + short:"ALLOWED_ONE_LOAD_CET_STATE", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (60, 60), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_PKRS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (61, 61), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_SAVE_IA32_PERF_GLOBAL_CTL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (62, 62), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_ACTIVATE_SECONDARY_CONTROLS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (63, 63), + policy: ProfilePolicy::Inherit + }, + ]) + ), + ( + RegisterAddress::IA32_VMX_ENTRY_CTLS, + ValueDefinitions::new(&[ + ValueDefinition { + short: "ALLOWED_ZERO_0_1", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (0, 1), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_DEBUG_CONTROLS", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_3_8", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (3, 8), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_IA_32E_MODE_GUES", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ENTRY_TO_SMM", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. 
(Definitions of VM-Entry Controls)", + bits_range: (10, 10), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_DEACTIVATE_DUAL__MONITOR_TREATMENT", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (11, 11), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_12", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_PERF_GLOBAL_CTRL", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (13, 13), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_PAT", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (14, 14), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_EFER", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (15, 15), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_BNDCFGS", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (16, 16), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CONCEAL_VMX_FROM_PT", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_RTIT_CTL", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. 
(Definitions of VM-Entry Controls)", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_UINV", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit + }, + // TODO: Also determines whether SSP is loaded on VM entry (do we need that?) + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_CET_STATE", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (20, 20), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_GUEST_IA32_LBR_CTL", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (21, 21), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_PKRS", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (22, 22), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_23_24", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (23, 24), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ALLOW_SEAM_GUEST_TELEMETRY", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. 
(Definitions of VM-Entry Controls)", + bits_range: (25, 25), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_26_31", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (26, 31), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_0_1", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (32, 33), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_DEBUG_CONTROLS", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (34, 34), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_3_8", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (35, 40), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_IA_32E_MODE_GUES", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (41, 41), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ENTRY_TO_SMM", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (42, 42), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_DEACTIVATE_DUAL__MONITOR_TREATMENT", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (43, 43), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_12", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (44, 44), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_PERF_GLOBAL_CTRL", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. 
(Definitions of VM-Entry Controls)", + bits_range: (45, 45), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_PAT", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (46, 46), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_EFER", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (47, 47), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_BNDCFGS", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (48, 48), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_CONCEAL_VMX_FROM_PT", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (49, 49), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_RTIT_CTL", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (50, 50), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_UINV", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (51, 51), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_CET_STATE", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (52, 52), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_GUEST_IA32_LBR_CTL", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. 
(Definitions of VM-Entry Controls)", + bits_range: (53, 53), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_PKRS", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (54, 54), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_23_24", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (55, 56), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ALLOW_SEAM_GUEST_TELEMETRY", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (57, 57), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_26_31", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (58, 63), + policy: ProfilePolicy::Inherit + }, + ]) + ), + + ( + RegisterAddress::IA32_VMX_MISC, + ValueDefinitions::new(&[ + ValueDefinition { + short: "VMX_PREEMPTION_TSC_REL", + description: "specifies the relationship between the rate of the VMX-preemption timer and that of the timestamp counter (TSC)", + bits_range: (0, 4), + policy: ProfilePolicy::Passthrough + }, + ValueDefinition { + short: "IA32_EFER.LMA_STORE", + description: "If 1, then VM exits store the value of IA32_EFER.LMA into the IA32-e mode guest VM-entry control", + bits_range: (5,5), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "HLT_STATE", + description: "Activity state 1 (HLT) is supported", + bits_range: (6,6), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "SHUTDOWN_STATE", + description: "Activity state 2 (shutdown) is supported", + bits_range: (7,7), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "WAIT_FOR_SIPI__STATE", + description: "Activity state 3 (wait-for-SIPI) is supported", + bits_range: (8,8), + policy: ProfilePolicy::Static(0) + }, + 
ValueDefinition { + short: "VMX_INTEL_PT", + description: "If 1 then Intel Processor Trace can be used in VMX operation", + bits_range: (14,14), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "RDMSR_SMM", + description: "If 1 then the RDMSR instruction can be used in system management mode (SMM) to read the IA32_SMBASE MSR", + bits_range: (15,15), + // TODO: Is this a reasonable policy? + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "VMX_NUM_CR3", + description: "The number of CR3-target values supported by the processor", + bits_range: (16,24), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "MAX_MSR_STORE_LISTS", + description: "If N then 512*(N +1) is the recommended maximum number of MSRs to be included each of the VM-exit MSR-store list, VM-exit-MSR-load-list, VM-entry MSR-load list", + bits_range: (25, 27), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "SMM_MONITOR_CTL_BIT2", + description: "If set then bit 2 of the IA32_SMM_MONITOR_CTL can be set to 1", + // TODO: Check policy. Perhaps this should rather be Static(0) ? + bits_range: (28, 28), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "VM_WRITE_EXIT_FIELDS", + description: "If 1 then software can use VMWRITE to write to any supported field in the VMCS", + bits_range: (29,29), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "VM_ENTRY_INJECTION", + description: "If 1 then VM entry permits injection of the following: software interrupt, software exception, or privileged software exception with an instruction length of 0", + bits_range: (30,30), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "MSEG_REV_ID", + description: "MSEG revision identifier used by the processor", + bits_range: (32,63), + // TODO: Should this be Passthrough? 
+ policy: ProfilePolicy::Inherit + }, + ]) + ), + + ( + RegisterAddress::IA32_VMX_CR0_FIXED0, + // NOTE 1: If any entry in IA32_VMX_CR0_FIXED1 has ProfilePolicy::Static(0) then the corresponding entry here must also have ProfilePolicy::Static(0) + // + // NOTE 2: We use the inherit policy for reserved fields. + ValueDefinitions::new(&[ + ValueDefinition { + short: "CR0.PE", + description: "If 0, then bit 0 (Protection Enable) of CR0 is allowed to be 0. bit 0 of CR0 enables real-address mode when clear.", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.MP", + description: "If 0, then bit 1 (Monitor Coprocessor) of CR0 is allowed to be 0. See Intel SDM Vol. 3A Section 2.5 for more information", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit + }, + // We expect this to be 0 for all modern processors, but Inherit is fine. + ValueDefinition { + short: "CR0.EM", + description: "If 0, then bit 2 (Emulation) of CR0 is allowed to be 0. See Intel SDM Vol. 3A Section 2.5 for more information", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.TS", + description: "If 0, then bit 3 (Task Switched) of CR0 is allowed to be 0. See Intel SDM Vol. 3A Section 2.5 for more information", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.ET", + description: "If 0, then bit 4 (Extension Type) of CR0 is allowed to be 0. See Intel SDM Vol. 3A Section 2.5 for more information", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.NE", + description: "If 0, then bit 5 (Numeric Error) of CR0 is allowed to be 0. 
Enables the PC-style x87 FPU error reporting mechanism when clear in CR0.", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short: "IA32_VMX_CR0_FIXED1_RESERVED_6_15", + description: "Reports bits allowed to be 0 in CR0", + bits_range: (6, 15), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.WP", + description: "If 0, then bit 16 (Write protect) of CR0 is allowed to be 0. If this bit is clear in CR0 then supervisor-level procedures are + allowed to write into read-only pages", + bits_range: (16, 16), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "IA32_VMX_CR0_FIXED1_RESERVED_17_17", + description: "Reports bits allowed to be 0 in CR0", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.AM", + description: "If 0, then bit 18 (Alignment Mask) of CR0 is allowed to be 0. If this bit is clear in CR0 then alignment checking is disabled.", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "IA32_VMX_CR0_FIXED1_RESERVED_19_28", + description: "Reports bits allowed to be 0 in CR0", + bits_range: (19, 28), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.NW", + description: "If 0, then bit 29 (Not Write-through) of CR0 is allowed to be 0. See Intel SDM Vol. 3A Section 2.5 for more information", + bits_range: (29, 29), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.CD", + description: "If 0, then bit 30 (Cache disable) of CR0 is allowed to be 0. If CR0 bits 30 and 29 are 0 then caching of memory locations + for the whole of physical memory in the processor's internal (and external) cache is enabled.", + bits_range: (30, 30), + policy: ProfilePolicy::Inherit + }, + // TODO: Disabling paging sounds bad, should we force this to 1? + ValueDefinition { + short: "CR0.PG", + description: "If 0, then bit 31 (Paging) of CR0 is allowed to be 0. 
If bit 31 of CR0 is cleared then paging is disabled (all linear addresses get treated as physical addresses).", + bits_range: (31, 31), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "IA32_VMX_CR0_FIXED1_RESERVED_32_63", + description: "Reports bits allowed to be 0 in CR0", + bits_range: (32, 63), + policy: ProfilePolicy::Inherit + }, + ]) + ), + + // NOTE: CR0_FIXED1 cannot be set by KVM, but this is OK, because its value is determined by CPUID anyway + ( + RegisterAddress::IA32_VMX_CR0_FIXED1, + ValueDefinitions::new(&[ + + ValueDefinition { + short: "CR0.PE", + description: "If 1, then bit 0 (Protection Enable) of CR0 is allowed to be 1. bit 0 of CR0 enables protected mode when set", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short: "CR0.MP", + description: "If 1, then bit 1 (Monitor Coprocessor) of CR0 is allowed to be 1. See Intel SDM Vol. 3A Section 2.5 for more information", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit + }, + // We expect this to be 0 for all modern processors, but Inherit is fine. + ValueDefinition { + short: "CR0.EM", + description: "If 1, then bit 2 (Emulation) of CR0 is allowed to be 1. See Intel SDM Vol. 3A Section 2.5 for more information", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.TS", + description: "If 1, then bit 3 (Task Switched) of CR0 is allowed to be 1. See Intel SDM Vol. 3A Section 2.5 for more information", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.ET", + description: "If 1, then bit 4 (Extension Type) of CR0 is allowed to be 1. See Intel SDM Vol. 3A Section 2.5 for more information", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.NE", + description: "If 1, then bit 5 (Numeric Error) of CR0 is allowed to be 1. 
This bit enables the native (internal) mechanism for reporting x87 FPU errors when set in CR0.", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "IA32_VMX_CR0_FIXED1_RESERVED_6_15", + description: "Reports bits allowed to be 1 in CR0", + bits_range: (6, 15), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.WP", + description: "If 1, then bit 16 (Write protect) of CR0 is allowed to be 1. If this bit is set in CR0 then supervisor-level procedures are + inhibited from writing into read-only pages", + bits_range: (16, 16), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "IA32_VMX_CR0_FIXED1_RESERVED_17_17", + description: "Reports bits allowed to be 1 in CR0", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.AM", + description: "If 1, then bit 18 (Alignment Mask) of CR0 is allowed to be 1. If bit 18 of CR0 is set then automatic alignment checking is possible.", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "IA32_VMX_CR0_FIXED1_RESERVED_19_28", + description: "Reports bits allowed to be 1 in CR0", + bits_range: (19, 28), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.NW", + description: "If 1, then bit 29 (Not Write-through) of CR0 is allowed to be 1. See Intel SDM Vol. 3A Section 2.5 for more information", + bits_range: (29, 29), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.CD", + description: "If 1, then bit 30 (Cache disable) of CR0 is allowed to be 1. 
If CR0 bit 30 is 1 then caching is restricted", + bits_range: (30, 30), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.PG", + description: "If 1, then bit 31 (Paging) of CR0 is allowed to be 1 which enables paging", + bits_range: (31, 31), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "IA32_VMX_CR0_FIXED1_RESERVED_32_63", + description: "Reports bits allowed to be 1 in CR0", + bits_range: (32, 63), + policy: ProfilePolicy::Inherit + }, + ]) + ), + + ( + RegisterAddress::IA32_VMX_CR4_FIXED0, + ValueDefinitions::new(&[ + ValueDefinition { + short: "CR4.VME", + description: "If 0, then bit 0 (Virtual-8086 Mode Extension) of CR4 is allowed to be 0. Bit 0 of CR4 disables the interrupt and exception-handling extensions in virtual-8086 mode when clear.", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.PVI", + description: "If 0, then bit 1 (Protected-Mode Virtual Interrupts) of CR4 is allowed to be 0. Bit 1 of CR4 disables the virtual interrupt flag in protected mode when clear.", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.TSD", + description: "If 0, then bit 2 (Time Stamp Disable) of CR4 is allowed to be 0. Bit 2 of CR4 allows RDTSC instruction to be executed at any privilege level when clear.", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.DE", + description: "If 0, then bit 3 (Debugging extensions) of CR4 is allowed to be 0. When Bit 3 of CR4 is clear the processor aliases references to registers DR4 and DR5 for compatibility with legacy software", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.PSE", + description: "If 0, then bit 4 (Page Size Extensions) of CR4 is allowed to be 0. 
Bit 4 of CR4 restricts 32-bit paging to pages of 4 KBytes when clear.", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.PAE", + description: "If 0, then bit 5 (Physical Address Extension) of CR4 is allowed to be 0. Bit 5 of CR4 restricts physical addresses to 32 bits when clear", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.MCE", + description: "If 0, then bit 6 (Machine-Check Enable) of CR4 is allowed to be 0. Bit 6 of CR4 disables the machine-check exception when clear", + bits_range: (6, 6), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.PGE", + description: "If 0, then bit 7 (Page Global Enable) of CR4 is allowed to be 0. Bit 7 of CR4 disables the global page feature when clear", + bits_range: (7, 7), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.PCE", + description: "If 0, then bit 8 (Performance-Monitoring Counter Enable) of CR4 is allowed to be 0. The RDPMC instruction can only be executed at protection level 0 when bit 8 of CR4 is clear", + bits_range: (8, 8), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "CR4.OSFXSR", + description: "If 0, then bit 9 (OS Support for FXSAVE and FXRSTOR) of CR4 is allowed to be 0. See Intel SDM Vol.3A Section 2.5 for more information.", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.OSXMMEXCPT", + description: "If 0, then bit 10 (OS Support for Unmasked SIMD Floating-Point Exceptions) of CR4 is allowed to be 0. See Intel SDM Vol.3A Section 2.5 for more information.", + bits_range: (10, 10), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.UMIP", + description: "If 0, then bit 11 (User-Mode instruction Prevention) of CR4 is allowed to be 0. 
See Intel SDM Vol.3A Section 2.5 for more information.", + bits_range: (11, 11), + policy: ProfilePolicy::Inherit + }, + // Maybe this could even be passthrough? CHV is 64-bit only. + ValueDefinition { + short: "CR4.LA57", + description: "If 0, then bit 12 (57-bit linear addresses) of CR4 is allowed to be 0. See Intel SDM Vol.3A Section 2.5 for more information.", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.VMXE", + description: "If 0, then bit 13 (VMX-Enable) of CR4 is allowed to be 0. See Intel SDM Vol.3A Section 2.5 for more information.", + bits_range: (13, 13), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.SMXE", + description: "If 0, then bit 14 (SMX-Enable) of CR4 is allowed to be 0. See Intel SDM Vol.3A Section 2.5 for more information.", + bits_range: (14, 14), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "CR4.RESERVED_15", + description: "If 0, then bit 15 (RESERVED) of CR4 is allowed to be 0. See Intel SDM Vol.3A Section 2.5 for more information.", + bits_range: (15, 15), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.FSGSBASE", + description: "If 0, then bit 16 (FSGSBASE-Enable) of CR4 is allowed to be 0. See Intel SDM Vol.3A Section 2.5 for more information", + bits_range: (16, 16), + policy: ProfilePolicy::Inherit + }, + // Probably irrelevant? + ValueDefinition { + short: "CR4.PCIDE", + description: "If 0, then bit 17 (PCID-Enable) of CR4 is allowed to be 0. See Intel SDM Vol.3A Section 2.5 for more information", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.OSXSAVE", + description: "If 0, then bit 18 (XSAVE and Processor Extended States-Enable) of CR4 is allowed to be 0. 
See Intel SDM Vol.3A Section 2.5 for more information", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit + }, + // CPU Profiles do not support Key locker features for now + ValueDefinition { + short: "CR4.KL", + description: "If 0, then bit 19 (Key-Locker-Enable) of CR4 is allowed to be 0. When bit 19 of CR4 is set, the LOADIWKEY instruction is enabled and CPUID.0x19.EBX[0] is set if support for AES key locker instructions has been activated by system firmware", + bits_range: (19, 19), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "CR4.SMEP", + description: "If 0, then bit 20 (SMEP-Enable) of CR4 is allowed to be 0. See Intel SDM Vol 3.A Section 2.5 for more information", + bits_range: (20, 20), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.SMAP", + description: "If 0, then bit 21 (SMAP-Enable) of CR4 is allowed to be 0. See Intel SDM Vol 3.A Section 2.5 for more information", + bits_range: (21, 21), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.PKE", + description: "If 0, then bit 22 (Enable protection keys for user-mode pages) of CR4 is allowed to be 0. See Intel SDM Vol. 3.A Section 2.5 for more information.", + bits_range: (22, 22), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "CR4.CET", + description: "If 0, then bit 23 (Control-flow Enforcement Technology) of CR4 is allowed to be 0. See Intel SDM Vol. 3.A Section 2.5 for more information.", + bits_range: (23, 23), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "CR4.PKS", + description: "If 0, then bit 24 (Enable protection keys for supervisor-mode pages) of CR4 is allowed to be 0. See Intel SDM Vol. 3.A Section 2.5 for more information.", + bits_range: (24, 24), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "CR4.UINTR", + description: "If 0, then bit 25 (User Interrupts Enable) of CR4 is allowed to be 0. See Intel SDM Vol. 
3.A Section 2.5 for more information.", + bits_range: (25, 25), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "CR4.RESERVED_26", + description: "If 0, then bit 26 (RESERVED) of CR4 is allowed to be 0. See Intel SDM Vol.3.A Section 2.5 for more information.", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.LASS", + description: "If 0, then bit 27 (Linear-Address-Space Separation) of CR4 is allowed to be 0. See Intel SDM Vol. 3.A Section 2.5 for more information.", + bits_range: (27, 27), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.LAM_SUP", + description: "If 0, then bit 28 (Supervisor LAM-enable) of CR4 is allowed to be 0. See Intel SDM Vol. 3.A Section 2.5 for more information.", + bits_range: (28, 28), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.RESERVED_29_63", + description: "Reports bits allowed to be 0 in CR4", + bits_range: (29, 63), + policy: ProfilePolicy::Inherit + } + ]) + ), + + // NOTE: CR4_FIXED1 cannot be set by KVM, but this is OK, because its value is determined by CPUID anyway + ( + RegisterAddress::IA32_VMX_CR4_FIXED1, + ValueDefinitions::new(&[ + ValueDefinition { + short: "CR4.VME", + description: "If 1, then bit 0 (Virtual-8086 Mode Extension) of CR4 is allowed to be 1. Bit 0 of CR4 enables the interrupt and exception-handling extensions in virtual-8086 mode when set.", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.PVI", + description: "If 1, then bit 1 (Protected-Mode Virtual Interrupts) of CR4 is allowed to be 1. Bit 1 of CR4 enables hardware support for a virtual interrupt flag in protected mode when set.", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.TSD", + description: "If 1, then bit 2 (Time Stamp Disable) of CR4 is allowed to be 1. 
Bit 2 of CR4 restricts the execution of the RDTS instruction to procedures running at privilege level 0 when set.", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.DE", + description: "If 1, then bit 3 (Debugging extensions) of CR4 is allowed to be 1. Bit 3 of CR4 make references to debug registers DR4 and DR5 cause an undefined opcode exception when set", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.PSE", + description: "If 1, then bit 4 (Page Size Extensions) of CR4 is allowed to be 1. Bit 4 of CR4 enables 4-MByte pages with 32-bit paging when set", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.PAE", + description: "If 1, then bit 5 (Physical Address Extension) of CR4 is allowed to be 1. Bit 5 of CR4 enables paging to produce physical addresses of more than 32 bits when set", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.MCE", + description: "If 1, then bit 6 (Machine-Check Enable) of CR4 is allowed to be 1. Bit 6 of CR4 enables the machine-check exception when set", + bits_range: (6, 6), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.PGE", + description: "If 1, then bit 7 (Page Global Enable) of CR4 is allowed to be 1. Bit 7 of CR4 enables the global page feature when set", + bits_range: (7, 7), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.PCE", + description: "If 1, then bit 8 (Performance-Monitoring Counter Enable) of CR4 is allowed to be 1. The RDPMC instruction can be executed at any protection level when bit 8 of CR4 is set.", + bits_range: (8, 8), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "CR4.OSFXSR", + description: "If 1, then bit 9 (OS Support for FXSAVE and FXRSTOR) of CR4 is allowed to be 1. 
See Intel SDM Vol.3A Section 2.5 for more information.", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.OSXMMEXCPT", + description: "If 1, then bit 10 (OS Support for Unmasked SIMD Floating-Point Exceptions) of CR4 is allowed to be 1. See Intel SDM Vol.3A Section 2.5 for more information.", + bits_range: (10, 10), + policy: ProfilePolicy::Inherit + }, + // TODO: Is this always 0 for QEMU? + ValueDefinition { + short: "CR4.UMIP", + description: "If 1, then bit 11 (User-Mode instruction Prevention) of CR4 is allowed to be 1. If bit 11 of CR4 is set and CPL > 0 then the SGDT,SIDT,SLDT,SMSW and STR instructions cannot be executed.", + bits_range: (11, 11), + policy: ProfilePolicy::Inherit + }, + // Maybe this could even be passthrough? CHV is 64-bit only. + ValueDefinition { + short: "CR4.LA57", + description: "If 1, then bit 12 (57-bit linear addresses) of CR4 is allowed to be 1. See Intel SDM Vol.3A Section 2.5 for more information.", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.VMXE", + description: "If 1, then bit 13 (VMX-Enable) of CR4 is allowed to be 1. Bit 13 of CR4 enables VMX operation when set.", + bits_range: (13, 13), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.SMXE", + description: "If 1, then bit 14 (SMX-Enable) of CR4 is allowed to be 1. Bit 14 of CR4 enables SMX operation when set.", + bits_range: (14, 14), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "CR4.RESERVED_15", + description: "If 1, then bit 15 (RESERVED) of CR4 is allowed to be 1. See Intel SDM Vol.3A Section 2.5 for more information.", + bits_range: (15, 15), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.FSGSBASE", + description: "If 1, then bit 16 (FSGSBASE-Enable) of CR4 is allowed to be 1. 
See Intel SDM Vol.3A Section 2.5 for more information", + bits_range: (16, 16), + policy: ProfilePolicy::Inherit + }, + // Probably irrelevant? + ValueDefinition { + short: "CR4.PCIDE", + description: "If 1, then bit 17 (PCID-Enable) of CR4 is allowed to be 1. Enables process-context identifiers (PCIDs) when bit 17 of CR4 is set. Applies only in IA-32e mode", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.OSXSAVE", + description: "If 1, then bit 18 (XSAVE and Processor Extended States-Enable) of CR4 is allowed to be 1. See Intel SDM Vol.3A Section 2.5 for more information", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit + }, + // CPU Profiles do not support Key locker features for now + ValueDefinition { + short: "CR4.KL", + description: "If 1, then bit 19 (Key-Locker-Enable) of CR4 is allowed to be 1. When bit 19 of CR4 is set, the LOADIWKEY instruction is enabled and CPUID.0x19.EBX[0] is set if support for AES key locker instructions has been activated by system firmware", + bits_range: (19, 19), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "CR4.SMEP", + description: "If 1, then bit 20 (SMEP-Enable) of CR4 is allowed to be 1. Bit 20 of CR4 enables supervisor-mode execution prevention when set", + bits_range: (20, 20), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.SMAP", + description: "If 1, then bit 21 (SMAP-Enable) of CR4 is allowed to be 1. Bit 21 of CR4 enables supervisor-mode access prevention when set", + bits_range: (21, 21), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.PKE", + description: "If 1, then bit 22 (Enable protection keys for user-mode pages) of CR4 is allowed to be 1. When bit 22 of CR4 is set, CPUID.0x7.ECX[4] is displayed as 1. See Intel SDM Vol. 
3.A Section 2.5 for more information.", + bits_range: (22, 22), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "CR4.CET", + description: "If 1, then bit 23 (Control-flow Enforcement Technology) of CR4 is allowed to be 1. See Intel SDM Vol. 3.A Section 2.5 for more information.", + bits_range: (23, 23), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "CR4.PKS", + description: "If 1, then bit 24 (Enable protection keys for supervisor-mode pages) of CR4 is allowed to be 1. See Intel SDM Vol. 3.A Section 2.5 for more information.", + bits_range: (24, 24), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "CR4.UINTR", + description: "If 1, then bit 25 (User Interrupts Enable) of CR4 is allowed to be 1. Bit 25 of CR4 enables user interrupts when set.", + bits_range: (25, 25), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "CR4.RESERVED_26", + description: "If 1, then bit 26 (RESERVED) of CR4 is allowed to be 1. See Intel SDM Vol.3A Section 2.5 for more information.", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.LASS", + description: "If 1, then bit 27 (Linear-Address-Space Separation) of CR4 is allowed to be 1. Bit 27 of CR4 enables LASS (Linear-Address-Space Separation) when set.", + bits_range: (27, 27), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.LAM_SUP", + description: "If 1, then bit 28 (Supervisor LAM-enable) of CR4 is allowed to be 1. 
Bit 28 of CR4 enables LAM (linear-address masking) for supervisor pointers when set.", + bits_range: (28, 28), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.RESERVED_29_63", + description: "Reports bits allowed to be 1 in CR4", + bits_range: (29, 63), + policy: ProfilePolicy::Inherit + } + ]) + ), + + ( + RegisterAddress::IA32_VMX_VMCS_ENUM, + ValueDefinitions::new(&[ + ValueDefinition{ + short: "MAX_INDEX", + description: "highest index value used for any VMCS encoding", + bits_range: (1, 9), + policy: ProfilePolicy::Inherit + } + ]) + + ), + + ( + RegisterAddress::IA32_VMX_PROCBASED_CTLS2, + ValueDefinitions::new(&[ + // Intel SDM Vol.3D A.3.3 documents that the ALLOWED_ZERO bits are actually always 0 for this MSR. + ValueDefinition { + short:"ALLOWED_ZERO_VIRTUALIZE_APIC_ACCESSES", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ENABLE_EPT", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_DESCRIPTOR_TABLE_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ENABLE_RDTSCP", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_VIRTUALIZE_X2APIC_MODE", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. 
(Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ENABLE_VPID", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_WBINVD_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (6, 6), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_UNRESTRICTED_GUEST", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (7, 7), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_APIC_REGISTER_VIRTUALIZATION", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_VIRTUAL_INTERRUPT_DELIVERY", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_PAUSE_LOOP_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (10, 10), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_RDRAND_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. 
(Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (11, 11), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ENABLE_INVPCID", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ENABLE_VM_FUNCTIONS", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (13, 13), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_VMCS_SHADOWING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (14, 14), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ENABLE_ENCLS_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (15, 15), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_RDSEED_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (16, 16), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ENABLE_PML", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_EPT_VIOLATION_#VE", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. 
(Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CONCEAL_VMX_FROM_PT", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ENABLE_XSAVES/XRSTORS", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (20, 20), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_PASID_TRANSLATION", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (21, 21), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_MODE_BASED_EXECUTE_CONTROL_FOR_EPT", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (22, 22), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_SUB_PAGE_WRITE_PERMISSIONS_FOR_EPT", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (23, 23), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_INTEL_PT_USES_GUEST_PHYSICAL_ADDRESSES", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (24, 24), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_USE_TSC_SCALING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. 
(Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (25, 25), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ENABLE_USER_WAIT_AND_PAUSE", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ENABLE_PCONFIG", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (27, 27), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_28_29", + description: "Control X is allowed to be 0 if bit X of this MSR is 0", + bits_range: (28, 29), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_VMM_BUS_LOCK_DETECTION", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (30, 30), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_INSTRUCTION_TIMEOUT", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (31, 31), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_VIRTUALIZE_APIC_ACCESSES", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (32, 32), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ENABLE_EPT", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (33, 33), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_DESCRIPTOR_TABLE_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. 
(Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (34, 34), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ENABLE_RDTSCP", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (35, 35), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_VIRTUALIZE_X2APIC_MODE", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (36, 36), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ENABLE_VPID", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (37, 37), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_WBINVD_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (38, 38), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_UNRESTRICTED_GUEST", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (39, 39), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_APIC_REGISTER_VIRTUALIZATION", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (40, 40), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_VIRTUAL_INTERRUPT_DELIVERY", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. 
(Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (41, 41), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_PAUSE_LOOP_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (42, 42), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_RDRAND_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (43, 43), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ENABLE_INVPCID", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (44, 44), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ENABLE_VM_FUNCTIONS", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (45, 45), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_VMCS_SHADOWING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (46, 46), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ENABLE_ENCLS_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (47, 47), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_RDSEED_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. 
(Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (48, 48), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ENABLE_PML", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (49, 49), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_EPT_VIOLATION_#VE", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (50, 50), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_CONCEAL_VMX_FROM_PT", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (51, 51), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ENABLE_XSAVES/XRSTORS", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (52, 52), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_PASID_TRANSLATION", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (53, 53), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_MODE_BASED_EXECUTE_CONTROL_FOR_EPT", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (54, 54), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_SUB_PAGE_WRITE_PERMISSIONS_FOR_EPT", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. 
(Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (55, 55), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_INTEL_PT_USES_GUEST_PHYSICAL_ADDRESSES", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (56, 56), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_USE_TSC_SCALING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (57, 57), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ENABLE_USER_WAIT_AND_PAUSE", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (58, 58), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ENABLE_PCONFIG", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (59, 59), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_28_29", + description: "Control X is allowed to be 1 if bit X of this MSR is 1", + bits_range: (60, 61), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_VMM_BUS_LOCK_DETECTION", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (62, 62), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_INSTRUCTION_TIMEOUT", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. 
(Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (63, 63), + policy: ProfilePolicy::Inherit + }, + ]) + ), + ( + RegisterAddress::IA32_VMX_EPT_VPID_CAP, + ValueDefinitions::new(&[ + ValueDefinition{ + short: "EPT_EXECUTE_ONLY", + description: "The processor supports execute-only translations by EPT", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short: "PAGE_WALK_LENGTH_4", + description: "Support for Page-walk length of 4", + bits_range: (6, 6), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short: "PAGE_WALK_LENGTH_5", + description: "Support for Page-walk length of 5", + bits_range: (7, 7), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short: "EPT_MEM_TYPE_UC", + description: "Software can configure the EPT paging-structure memory type to be uncacheable (UC)", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short: "EPT_MEM_TYPE_WB", + description: "Software can configure the EPT paging-structure memory type to be write-back (WB)", + bits_range: (14, 14), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short: "EPT_PDE_2M", + description: "Software can configure the EPT PDE to map a 2-Mbyte page", + bits_range: (16, 16), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short: "EPT_PDPTE_1G", + description: "Software can configure the EPT PDPTE to map a 1-Gbyte page", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short: "INVEPT", + description: "INVEPT instruction is supported", + bits_range: (20, 20), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short: "FLAGS_EPT", + description: "Accessed and dirty flags for EPT are supported", + bits_range: (21, 21), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short: "VM_EXIT_VIOLATIONS_INFO", + description: "If set, the processor reports advanced VM-exit information for EPT violations", + bits_range: 
(22, 22), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short: "SHADOW_STACK_CTL", + description: "Supervisor shadow-stack control is supported", + bits_range: (23, 23), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short: "SINGLE_CONTEXT_INVEPT", + description: "The single-context INVEPT type is supported", + bits_range: (25, 25), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short: "ALL_CONTEXT_INVEPT", + description: "The all-context INVEPT type is supported", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short: "INVVPID", + description: "INVVPID instruction is supported", + bits_range: (32, 32), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short: "INDIVIDUAL_ADDRESS_INVVPID", + description: "The individual address INVVPID type is supported", + bits_range: (40, 40), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short: "SINGLE_CONTEXT_INVVPID", + description: "The single-context INVVPID type is supported", + bits_range: (41, 41), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short: "ALL_CONTEXT_INVVPID", + description: "The all-context INVVPID type is supported", + bits_range: (42, 42), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short: "SINGLE_CONTEXT_RETAINING_GLOBALS_INVVPID", + description: "The single-context-retaining-globals INVVPID type is supported", + bits_range: (43, 43), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short: "MAX_HLAT_PREFIX", + description: "Enumerates the maximum HLAT prefix size", + bits_range: (48, 53), + policy: ProfilePolicy::Inherit + }, + ]) + ), + + ( + + RegisterAddress::IA32_VMX_TRUE_PINBASED_CTLS, + ValueDefinitions::new(&[ + ValueDefinition { + short:"ALLOWED_ZERO_EXTERNAL_INTERRUPT_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit + }, +
ValueDefinition { + short: "ALLOWED_ZERO_1_2", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (1, 2), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_NMI_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_4", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_VIRTUAL_NMIS", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ACTIVATE_VMX_PREEMPTION_TIMER", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (6, 6), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_PROCESS_POSTED_INTERRUPTS", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (7, 7), + policy: ProfilePolicy::Inherit + }, + + + ValueDefinition { + short: "ALLOWED_ZERO", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (8, 31), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short:"ALLOWED_ONE_EXTERNAL_INTERRUPT_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (32, 32), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_1_2", + description: "VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (33, 34), + policy: ProfilePolicy::Inherit + }, + ValueDefinition{ +
short:"ALLOWED_ONE_NMI_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (35, 35), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_4", + description: "VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (36, 36), + policy: ProfilePolicy::Inherit + }, + ValueDefinition{ + short:"ALLOWED_ONE_VIRTUAL_NMIS", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (37, 37), + policy: ProfilePolicy::Inherit + }, + ValueDefinition{ + short:"ALLOWED_ONE_ACTIVATE_VMX_PREEMPTION_TIMER", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (38, 38), + policy: ProfilePolicy::Inherit + }, + ValueDefinition{ + short:"ALLOWED_ONE_PROCESS_POSTED_INTERRUPTS", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (39, 39), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (40, 63), + policy: ProfilePolicy::Inherit + } + ]) + ), + + ( + RegisterAddress::IA32_VMX_TRUE_PROCBASED_CTLS, + ValueDefinitions::new(&[ + ValueDefinition { + short: "ALLOWED_ZERO_0_1", + description: "Control X is allowed to be 0 if bit X of this MSR is 0", + bits_range: (0, 1), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_INTERRUPT_WINDOW_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_USE_TSC_OFFSETTING", + description: "See Intel SDM.
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_4_6", + description: "Control X is allowed to be 0 if bit X of this MSR is 0", + bits_range: (4, 6), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_HLT_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (7, 7), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_8", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_INVLPG_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_MWAIT_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (10, 10), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_RDPMC_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (11, 11), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_RDTSC_EXITING", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_13_14", + description: "Control X is allowed to be 0 if bit X of this MSR is 0", + bits_range: (13, 14), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CR3_LOAD_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (15, 15), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CR3_STORE_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (16, 16), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ACTIVATE_TERTIARY_CONTROLS", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_18", + description: "Control X is allowed to be 0 if bit X of this MSR is 0", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CR8_LOAD_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CR8_STORE_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (20, 20), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_USE_TPR_SHADOW", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (21, 21), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_NMI_WINDOW_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (22, 22), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_MOV_DR_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (23, 23), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_UNCONDITIONAL_I/O_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (24, 24), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_USE_I/O_BITMAPS", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (25, 25), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_26", + description: "Control X is allowed to be 0 if bit X of this MSR is 0", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_MONITOR_TRAP_FLAG", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (27, 27), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_USE_MSR_BITMAPS", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (28, 28), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_MONITOR_EXITING", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (29, 29), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_PAUSE_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (30, 30), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ACTIVATE_SECONDARY_CONTROLS", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (31, 31), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_0_1", + description: "Control X is allowed to be 1 if bit 32 + X of this MSR is 1", + bits_range: (32, 33), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_INTERRUPT_WINDOW_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (34, 34), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_USE_TSC_OFFSETTING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (35, 35), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_4_6", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (36, 38), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_HLT_EXITING", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (39, 39), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_8", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (40, 40), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_INVLPG_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (41, 41), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_MWAIT_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (42, 42), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_RDPMC_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (43, 43), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_RDTSC_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (44, 44), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short: "ALLOWED_ONE_13_14", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (45, 46), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_CR3_LOAD_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (47, 47), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_CR3_STORE_EXITING", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (48, 48), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ACTIVATE_TERTIARY_CONTROLS", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (49, 49), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_18", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (50, 50), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short:"ALLOWED_ONE_CR8_LOAD_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (51, 51), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_CR8_STORE_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (52, 52), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_USE_TPR_SHADOW", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (53, 53), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_NMI_WINDOW_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (54, 54), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_MOV_DR_EXITING", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (55, 55), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_UNCONDITIONAL_I/O_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (56, 56), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_USE_I/O_BITMAPS", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (57, 57), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_26", + description: "Control X is allowed to be 1 if bit X of this MSR is 1", + bits_range: (58, 58), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short:"ALLOWED_ONE_MONITOR_TRAP_FLAG", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (59, 59), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_USE_MSR_BITMAPS", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (60, 60), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_MONITOR_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (61, 61), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_PAUSE_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (62, 62), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ACTIVATE_SECONDARY_CONTROLS", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (63, 63), + policy: ProfilePolicy::Inherit + }, + + ]) + ), + + ( + RegisterAddress::IA32_VMX_TRUE_EXIT_CTLS, + ValueDefinitions::new(&[ + ValueDefinition { + short: "ALLOWED_ZERO_0_1", + description: "Control X is allowed to be 0 if bit X in this MSR is 0", + bits_range: (0, 1), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_SAVE_DEBUG_CONTROLS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_3_8", + description: "Control X is allowed to be 0 if bit X in this MSR is 0", + bits_range: (3, 8), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_HOST_ADDRESS_SPACE_SIZE", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_10_11", + description: "Control X is allowed to be 0 if bit X in this MSR is 0", + bits_range: (10, 11), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_PERF_GLOBAL_CTRL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_13_14", + description: "Control X is allowed to be 0 if bit X in this MSR is 0", + bits_range: (13, 14), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ACKNOWLEDGE_INTERRUPT_ON_EXIT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (15, 15), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { +
short: "ALLOWED_ZERO_16_17", + description: "Control X is allowed to be 0 if bit X in this MSR is 0", + bits_range: (16, 17), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_SAVE_IA32_PAT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_PAT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_SAVE_IA32_EFER", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (20, 20), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_EFER", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (21, 21), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_SAVE_VMX_PREEMPTION_TIMER_VALUE", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (22, 22), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CLEAR_IA32_BNDCFGS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (23, 23), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CONCEAL_VMX_FROM_PT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (24, 24), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CLEAR_IA32_RTIT_CTL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (25, 25), + policy: 
ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CLEAR_IA32_LBR_CTL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (26, 26), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ZERO_CLEAR_UINV", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (27, 27), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_CET_STATE", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (28, 28), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_PKRS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (29, 29), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_SAVE_IA32_PERF_GLOBAL_CTL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (30, 30), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ACTIVATE_SECONDARY_CONTROLS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (31, 31), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_0_1", + description: "Control X is allowed to be 1 if bit X in this MSR is 1", + bits_range: (32, 33), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short:"ALLOWED_ONE_SAVE_DEBUG_CONTROLS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (34, 34), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short: "ALLOWED_ONE_3_8", + description: "Control X is allowed to be 1 if bit X in this MSR is 1", + bits_range: (35, 40), + policy: 
ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_HOST_ADDRESS_SPACE_SIZE", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (41, 41), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_10_11", + description: "Control X is allowed to be 1 if bit X + 32 in this MSR is 1", + bits_range: (42, 43), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_PERF_GLOBAL_CTRL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (44, 44), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "ALLOWED_ONE_13_14", + description: "Control X is allowed to be 1 if bit X + 32 in this MSR is 1", + bits_range: (45, 46), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ACKNOWLEDGE_INTERRUPT_ON_EXIT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (47, 47), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_16_17", + description: "Control X is allowed to be 1 if bit X + 32 in this MSR is 1", + bits_range: (48, 49), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_SAVE_IA32_PAT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (50, 50), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_PAT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (51, 51), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_SAVE_IA32_EFER", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (52, 52), + policy: ProfilePolicy::Inherit + }, +
ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_EFER", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (53, 53), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_SAVE_VMX_PREEMPTION_TIMER_VALUE", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (54, 54), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_CLEAR_IA32_BNDCFGS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (55, 55), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_CONCEAL_VMX_FROM_PT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (56, 56), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_CLEAR_IA32_RTIT_CTL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (57, 57), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_CLEAR_IA32_LBR_CTL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (58, 58), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_CLEAR_UINV", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (59, 59), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_CET_STATE", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (60, 60), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_PKRS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary 
VM-Exit Controls)", + bits_range: (61, 61), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_SAVE_IA32_PERF_GLOBAL_CTL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (62, 62), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_ACTIVATE_SECONDARY_CONTROLS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (63, 63), + policy: ProfilePolicy::Inherit + }, + ]) + ), + + ( + RegisterAddress::IA32_VMX_TRUE_ENTRY_CTLS, + ValueDefinitions::new(&[ + ValueDefinition { + short: "ALLOWED_ZERO_0_1", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (0, 1), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_DEBUG_CONTROLS", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_3_8", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (3, 8), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_IA_32E_MODE_GUEST", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ENTRY_TO_SMM", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (10, 10), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_DEACTIVATE_DUAL_MONITOR_TREATMENT", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17.
(Definitions of VM-Entry Controls)", + bits_range: (11, 11), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_12", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_PERF_GLOBAL_CTRL", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (13, 13), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_PAT", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (14, 14), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_EFER", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (15, 15), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_BNDCFGS", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (16, 16), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CONCEAL_VMX_FROM_PT", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_RTIT_CTL", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_UINV", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_CET_STATE", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. 
(Definitions of VM-Entry Controls)", + bits_range: (20, 20), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_GUEST_IA32_LBR_CTL", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (21, 21), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_PKRS", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (22, 22), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_23_24", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (23, 24), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ALLOW_SEAM_GUEST_TELEMETRY", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (25, 25), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_26_31", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (26, 31), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_0_1", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (32, 33), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_DEBUG_CONTROLS", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (34, 34), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_3_8", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (35, 40), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_IA_32E_MODE_GUES", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. 
(Definitions of VM-Entry Controls)", + bits_range: (41, 41), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ENTRY_TO_SMM", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (42, 42), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_DEACTIVATE_DUAL__MONITOR_TREATMENT", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (43, 43), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_12", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (44, 44), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_PERF_GLOBAL_CTRL", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (45, 45), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_PAT", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (46, 46), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_EFER", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (47, 47), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_BNDCFGS", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (48, 48), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_CONCEAL_VMX_FROM_PT", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. 
(Definitions of VM-Entry Controls)", + bits_range: (49, 49), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_RTIT_CTL", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (50, 50), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_UINV", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (51, 51), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_CET_STATE", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (52, 52), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_GUEST_IA32_LBR_CTL", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (53, 53), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_PKRS", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (54, 54), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_23_24", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (55, 56), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ALLOW_SEAM_GUEST_TELEMETRY", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. 
(Definitions of VM-Entry Controls)", + bits_range: (57, 57), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_26_31", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (58, 63), + policy: ProfilePolicy::Inherit + }, + ]) + ), + + ( + RegisterAddress::IA32_VMX_VMFUNC, + ValueDefinitions::new(&[ + ValueDefinition { + short:"ALLOWED_ONE_EPTP_SWITCHING", + description: "See Intel SDM Vol.3C Section 26.6.14 Table 26-10. (Definitions of VM-Function Controls)", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_1_63", + description: "See Intel SDM Vol.3C Section 26.6.14 Table 26-10. (Definitions of VM-Function Controls)", + bits_range: (1, 63), + policy: ProfilePolicy::Inherit + }, + + ]) + ), + + // NOTE: This MSR is currently not supported by KVM. We keep the definition here regardless. (TODO: Maybe it would be better to remove it?) + ( + RegisterAddress::IA32_VMX_PROCBASED_CTLS3, + ValueDefinitions::new(&[ + ValueDefinition { + short: "ALLOWED_ONE_LOADIWKEY_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-8 (Definitions of Tertiary Processor-Based VM-Execution Controls)", + bits_range: (0,0), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_ENABLE_HLAT", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-8 (Definitions of Tertiary Processor-Based VM-Execution Controls)", + bits_range: (1,1), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_EPT_PAGING_WRITE_CONTROL", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-8 (Definitions of Tertiary Processor-Based VM-Execution Controls)", + bits_range: (2,2), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_GUEST_PAGING_VERIFICATION", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-8 (Definitions of Tertiary Processor-Based VM-Execution Controls)", + 
bits_range: (3,3), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_IPI_VIRTUALIZATION", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-8 (Definitions of Tertiary Processor-Based VM-Execution Controls)", + bits_range: (4,4), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_SEAM_GUEST_PHYSICAL_ADDRESS_WIDTH", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-8 (Definitions of Tertiary Processor-Based VM-Execution Controls)", + bits_range: (5,5), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_ENABLE_MSR_LIST_INSTRUCTIONS", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-8 (Definitions of Tertiary Processor-Based VM-Execution Controls)", + bits_range: (6,6), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_VIRTUALIZE_IA32_SPEC_CTRL", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-8 (Definitions of Tertiary Processor-Based VM-Execution Controls)", + bits_range: (7,7), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_8_63", + description: "Control X is allowed to be 1 if bit X in this MSR is 1", + bits_range: (8,63), + policy: ProfilePolicy::Inherit + }, + ]) + ), + + // NOTE: This MSR is currently not supported by KVM. We keep the definition here regardless. (TODO: Maybe it would be better to remove it?) 
+ ( + RegisterAddress::IA32_VMX_EXIT_CTLS2, + ValueDefinitions::new(&[ + ValueDefinition { + short: "ALLOWED_ONE_0_2", + description:"VM entry allows control X to be 1 if bit X is 1", + bits_range: (0, 2), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_PREMATURELY_BUSY_SHADOW_STACK", + description:"See Intel SDM Vol.3C Section 26.7.1", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_4_63", + description:"VM entry allows control X to be 1 if bit X is 1", + bits_range: (4, 63), + policy: ProfilePolicy::Inherit + } + ]) + ), + ( + RegisterAddress::MSR_PLATFORM_INFO, + ValueDefinitions::new(&[ + ValueDefinition { + short: "PLATFORM_INFORMATION", + description: "Contains power management and other model specific features enumeration. In reality bits 15:8 describe the maximum frequency that does not require turbo. All other bits are reserved", + bits_range: (0, 63), + policy: ProfilePolicy::Deny + } + ]) + ) + ]) +}; + +/// Convenience function to lookup value definitions corresponding to the given MSR register address (as a const parameter). +/// +/// `REG_ADDR` is compared against the `.0` field of every key in [`INTEL_MSR_FEATURE_DEFINITIONS`] +/// and the definitions of the first matching entry are returned. +/// +/// The search runs inside a `const` block, so an address without an entry (or whose entry has no +/// definitions) is rejected during constant evaluation with "MSR definition not found". +#[cold] +#[inline(never)] +pub(in crate::x86_64) const fn msr_definitions<const REG_ADDR: u32>() -> &'static [ValueDefinition] +{ + const { + let mut out = [].as_slice(); + let intel_definitions = INTEL_MSR_FEATURE_DEFINITIONS.as_slice(); + let mut i = 0; + let length = intel_definitions.len(); + while i < length { + let (addr, definitions) = intel_definitions[i]; + if addr.0 == REG_ADDR { + out = definitions.as_slice(); + break; + } + i += 1; + } + // An empty result means that no entry matched (or that the matching entry is empty). + if out.is_empty() { + panic!("MSR definition not found"); + } + out + } +} + +/// Check that the `src_feature_msrs` are compatible with those given in `dest_feature_msrs`. +/// +/// If this check fails, then software that works under the `src_feature_msrs`, may no longer +/// behave correctly with `dest_feature_msrs`. 
+/// +/// The `src_id` and `dest_id` strings are only used for logging purposes to identify what +/// is being compared (e.g. CPU profile vs host where the profile should be applied, etc). +/// +/// NOTE: This function assumes CPUID compatibility. +/// +/// All register addresses/keys in [`INTEL_MSR_FEATURE_DEFINITIONS`] are checked, except for: +/// - IA32_BIOS_SIGN_ID, +/// - IA32_PERF_CAPABILITIES, +/// - MSR_PLATFORM_INFO +/// +/// IA32_PERF_CAPABILITIES are inherently incompatible between different VMs and we do not +/// think it makes much sense to compare IA32_BIOS_SIGN_ID or MSR_PLATFORM_INFO in this context. +/// +/// # Errors +/// +/// This function does not return early upon error, but rather attempts all MSR-based feature +/// checks while logging errors it encounters. If any of these checks fail an error is returned +/// at the end. +/// +/// We also just use the unit type as the error variant for now, as not much can be done to +/// recover from these errors at runtime and the logs should provide the user with enough +/// information to debug the problem. +/// +/// At this moment in time we prefer the aforementioned approach over designing a complex +/// error type capable of tracking everything that might fail. 
+pub(in crate::x86_64) fn check_feature_msr_compatibility( + src_feature_msrs: &HashMap<u32, u64>, + dest_feature_msrs: &HashMap<u32, u64>, + src_id: &str, + dest_id: &str, +) -> Result<(), ()> { + let mut is_err = false; + // First check IA32_ARCH_CAPABILITIES + // Since we are assuming CPUID to be compatible we + // may assume that either both src and dest have this + // MSR or none of them do + if let Some((src_val, dest_val)) = src_feature_msrs + .get(&RegisterAddress::IA32_ARCH_CAPABILITIES.0) + .zip(dest_feature_msrs.get(&RegisterAddress::IA32_ARCH_CAPABILITIES.0)) + { + is_err |= + check_arch_capabilities_compatibility(*src_val, *dest_val, src_id, dest_id).is_err(); + } + + // Next let us consider IA32_VMX_BASIC + let mut true_ctls_exist_src = false; + let mut true_ctls_exist_dest = false; + // Since we assume compatibility of CPUID we can again check that either both src and dest + // have the IA32_VMX_BASIC MSR or none of them do + if let Some((src_val, dest_val)) = src_feature_msrs + .get(&RegisterAddress::IA32_VMX_BASIC.0) + .zip(dest_feature_msrs.get(&RegisterAddress::IA32_VMX_BASIC.0)) + { + // Bit 55 of IA32_VMX_BASIC reports whether the IA32_VMX_TRUE_*_CTLS MSRs exist + true_ctls_exist_src = (*src_val & (1 << 55)) != 0; + true_ctls_exist_dest = (*dest_val & (1 << 55)) != 0; + is_err |= check_vmx_basic_compatibility(*src_val, *dest_val, src_id, dest_id).is_err(); + } + // The following closure saves us some boiler plate when checking the various VMX CTLS that have a default1 class + let check_vmx_ctls_with_default1_class = |vmx_ctrl_reg_address: RegisterAddress, + vmx_true_ctrl_reg_address: RegisterAddress, + check_id: &str, + src_id: &str, + dest_id: &str| + -> Result<(), ()> { + let mut is_err = false; + // `conditional_select(a, b, condition)` yields `a` when `condition` holds, hence the TRUE + // variant of the MSR goes first: it is the one to consult whenever it exists. + let src_reg_address = { + conditional_select( + vmx_true_ctrl_reg_address.0, + vmx_ctrl_reg_address.0, + true_ctls_exist_src, + ) + }; + + let dest_reg_address = { + conditional_select( + vmx_true_ctrl_reg_address.0, + vmx_ctrl_reg_address.0, + true_ctls_exist_dest, + ) + }; + + let src_val = src_feature_msrs.get(&src_reg_address); + let dest_val = 
dest_feature_msrs.get(&dest_reg_address); + if src_val.is_some() && dest_val.is_none() { + error!( + "{check_id} compatibility check failed: unable to compare value of MSR {src_reg_address:#x} of {src_id} with value of MSR {dest_reg_address:#x} of {dest_id}, because the latter value was not found" + ); + is_err = true; + } + if let Some((src_val, dest_val)) = src_val.zip(dest_val) + && let Err(CtlsCheck { + bitset_only_zero_src_lo, + bitset_only_one_src_hi, + }) = check_negative_subset_lo_and_subset_hi(*src_val, *dest_val) + { + is_err = true; + if let Some(bitset) = bitset_only_zero_src_lo { + for_each_bitpos(bitset, |bit_pos| { + debug!( + "{check_id} compatibility check failed: bit {bit_pos} is 0 in MSR:={src_reg_address:#x} of {src_id}, but 1 in MSR:={dest_reg_address:#x} of {dest_id}" + ); + }); + } + + if let Some(bitset) = bitset_only_one_src_hi { + for_each_bitpos(bitset, |bit_pos| { + debug!( + "{check_id} compatibility check failed: bit {bit_pos} is 1 in MSR:={src_reg_address:#x} of {src_id}, but 0 in MSR:={dest_reg_address:#x} of {dest_id}" + ); + }); + } + } + + if is_err { + if let Some(src_val) = src_val + && let Some(dest_val) = dest_val + { + error!( + "{check_id} compatibility check failed: {src_id} register address:={src_reg_address:#x}, {src_id} value:={:#x}, {dest_id} register address:={dest_reg_address:#x}, {dest_id} value:={:#x}", + *src_val, *dest_val + ); + } + Err(()) + } else { + Ok(()) + } + }; + + // Now we consider IA32_VMX_PINBASED_CTLS and/or IA32_VMX_TRUE_BINBASED_CTLS + // (Intel SDM Vol.3D A.3.1) + is_err |= check_vmx_ctls_with_default1_class( + RegisterAddress::IA32_VMX_PINBASED_CTLS, + RegisterAddress::IA32_VMX_TRUE_PINBASED_CTLS, + "IA32_VMX_PINBASED_CTLS", + src_id, + dest_id, + ) + .is_err(); + + // Next up is IA32_VMX_PROCBASED_CTLS and/or IA32_VMX_TRUE_PROCBASED_CTLS + // (Intel SDM Vol.3D A.3.2.) 
+ is_err |= check_vmx_ctls_with_default1_class( + RegisterAddress::IA32_VMX_PROCBASED_CTLS, + RegisterAddress::IA32_VMX_TRUE_PROCBASED_CTLS, + "IA32_PROCBASED_CTLS", + src_id, + dest_id, + ) + .is_err(); + // Check IA32_VMX_EXIT_CTLS and/or IA32_VMX_TRUE_EXIT_CTLS + // (Intel SDM Vol.3D A.4) + is_err |= check_vmx_ctls_with_default1_class( + RegisterAddress::IA32_VMX_EXIT_CTLS, + RegisterAddress::IA32_VMX_TRUE_EXIT_CTLS, + "IA32_VMX_EXIT_CTLS", + src_id, + dest_id, + ) + .is_err(); + // Check IA32_VMX_ENTRY_CTLS and/or IA32_VMX_TRUE_ENTRY_CTLS + // (Intel SDM Vol.3D A.5) + is_err |= check_vmx_ctls_with_default1_class( + RegisterAddress::IA32_VMX_ENTRY_CTLS, + RegisterAddress::IA32_VMX_TRUE_ENTRY_CTLS, + "IA32_VMX_ENTRY_CTLS", + src_id, + dest_id, + ) + .is_err(); + // Check IA32_VMX_MISC + if let Some((src_val, dest_val)) = src_feature_msrs + .get(&RegisterAddress::IA32_VMX_MISC.0) + .zip(dest_feature_msrs.get(&RegisterAddress::IA32_VMX_MISC.0)) + { + is_err |= check_vmx_misc_msr(*src_val, *dest_val, src_id, dest_id).is_err(); + } + // Check IA32_VMX_CR0_FIXED0 + if let Some((src_fixed0, dest_fixed0)) = src_feature_msrs + .get(&RegisterAddress::IA32_VMX_CR0_FIXED0.0) + .zip(dest_feature_msrs.get(&RegisterAddress::IA32_VMX_CR0_FIXED0.0)) + { + is_err |= + check_cr_i_compatibility::<0>(*src_fixed0, *dest_fixed0, src_id, dest_id).is_err(); + } + + // Check IA32_VMX_CR4_FIXED0 + if let Some((src_fixed0, dest_fixed0)) = src_feature_msrs + .get(&RegisterAddress::IA32_VMX_CR4_FIXED0.0) + .zip(dest_feature_msrs.get(&RegisterAddress::IA32_VMX_CR4_FIXED0.0)) + { + is_err |= + check_cr_i_compatibility::<4>(*src_fixed0, *dest_fixed0, src_id, dest_id).is_err(); + } + + // Check IA32_VMX_VMCS_ENUM + if let Some((src_val, dest_val)) = src_feature_msrs + .get(&RegisterAddress::IA32_VMX_VMCS_ENUM.0) + .zip(dest_feature_msrs.get(&RegisterAddress::IA32_VMX_VMCS_ENUM.0)) + { + is_err |= check_vmx_vmcs_enum_compatibility(*src_val, *dest_val, src_id, dest_id).is_err(); + } + + // Check 
IA32_VMX_PROCBASED_CTLS2 + // This MSR exists only if bit 63 of IA32_VMX_PROCBASED_CTLS is set + // (note that if it is set on src then our IA32_VMX_PROCBASED_CTLS check + // ensures that it is also set on dest) + if let Some((src_val, dest_val)) = src_feature_msrs + .get(&RegisterAddress::IA32_VMX_PROCBASED_CTLS2.0) + .zip(dest_feature_msrs.get(&RegisterAddress::IA32_VMX_PROCBASED_CTLS2.0)) + { + let src_val = *src_val; + let dest_val = *dest_val; + // First verify that the first 32 bits are indeed 0 as documented by Intel, otherwise we have misunderstood the documentation + // and we should not continue. + let lo_mask = u64::from(u32::MAX); + assert_eq!( + src_val & lo_mask, + 0, + "BUG: The 32-first bits of the IA32_VMX_PROCBASED_CTLS2 MSR were not zero for src" + ); + assert_eq!( + dest_val & lo_mask, + 0, + "BUG: The 32-first bits of the IA32_VMX_PROCBASED_CTLS2 MSR were not zero for dest" + ); + // Note that the 32-first bits are documented to always be 0 + if let Err(bits_only_in_src) = check_subset(src_val, dest_val) { + is_err = true; + error!( + "IA32_VMX_PROCBASED_CTLS2 compatibility check failed: {src_id} value:={src_val:#x}, {dest_id} value:={dest_val:#x}" + ); + for_each_bitpos(bits_only_in_src, |bit_pos| { + debug!( + "IA32_VMX_PROCBASED_CTLS2 check failed: VM entry allows control X:={bit_pos} to be 1 for {src_id}, but not for {dest_id}" + ); + }); + } + } + + // Check IA32_VMX_PROCBASED_CTLS3 + // This MSR exists only if bit 49 of IA32_VMX_PROCBASED_CTLS is set + // (note that if it is set on src then our IA32_VMX_PROCBASED_CTLS check + // ensures that it is also set on dest) + + if let Some((src_val, dest_val)) = src_feature_msrs + .get(&RegisterAddress::IA32_VMX_PROCBASED_CTLS3.0) + .zip(dest_feature_msrs.get(&RegisterAddress::IA32_VMX_PROCBASED_CTLS3.0)) + && let Err(bits_only_in_src) = check_subset(*src_val, *dest_val) + { + is_err = true; + error!( + "IA32_VMX_PROCBASED_CTLS3 compatibility check failed: {src_id} value:= {:#x}, {dest_id} 
value:={:#x}", + *src_val, *dest_val + ); + + for_each_bitpos(bits_only_in_src, |bit_pos| { + debug!( + "IA32_VMX_PROCBASED_CTLS3 compatibility check failed: VM entry allows control X:={bit_pos} for {src_id}, but not for {dest_id}" + ); + }); + } + + // Check IA32_VMX_EXIT_CTLS2 + // This MSR exists only if bit 63 of the IA32_VMX_EXIT_CTLS is set + // (note that if it is set on src then our IA32_VMX_EXIT_CTLS check + // ensures that it is also set on dest) + if let Some((src_val, dest_val)) = src_feature_msrs + .get(&RegisterAddress::IA32_VMX_EXIT_CTLS2.0) + .zip(dest_feature_msrs.get(&RegisterAddress::IA32_VMX_EXIT_CTLS2.0)) + && let Err(bits_only_in_src) = check_subset(*src_val, *dest_val) + { + is_err = true; + error!( + "IA32_VMX_EXIT_CTLS2 compatibility check failed: {src_id} value:={:#x}, {dest_id} value:={:#x}", + *src_val, *dest_val + ); + for_each_bitpos(bits_only_in_src, |bit_pos| { + debug!( + "IA32_VMX_EXIT_CTLS2 compatibility check failed: bit {bit_pos} is set for {src_id}, but not for {dest_id}" + ); + }); + } + + // Check IA32_VMX_EPT_VPID_CAP (Intel SDM Vol.3D A.10) + // + // This MSR is only available on processors where bit 63 of IA32_VMX_PROCBASED_CTLS is 1 and that either + // have bit 33 of IA32_VMX_PROCBASED_CTLS2 set, or bit 37 of IA32_VMX_PROC_BASED_CTLS2 set. Since we + // already check for compatibility of those bits, we may assume that if this MSR is available for src, then + // it is also available for dest. 
+ if let Some((src_val, dest_val)) = src_feature_msrs + .get(&RegisterAddress::IA32_VMX_EPT_VPID_CAP.0) + .zip(dest_feature_msrs.get(&RegisterAddress::IA32_VMX_EPT_VPID_CAP.0)) + { + is_err |= check_vpid_and_ept_capabilities(*src_val, *dest_val, src_id, dest_id).is_err(); + } + + if let Some((src_val, dest_val)) = src_feature_msrs + .get(&RegisterAddress::IA32_VMX_VMFUNC.0) + .zip(dest_feature_msrs.get(&RegisterAddress::IA32_VMX_VMFUNC.0)) + && let Err(bits_only_in_src) = check_subset(*src_val, *dest_val) + { + is_err = true; + error!( + "IA32_VMX_VMFUNC compatibility check failed: {src_id} value:={:#x}, {dest_id} value:={:#x}", + *src_val, *dest_val + ); + for_each_bitpos(bits_only_in_src, |bit_pos| { + debug!( + "IA32_VMX_VMFUNC compatibility check failed: VM entry allows bit X:={bit_pos} of the VM-function controls to be 1 for {src_id}, but not for {dest_id}" + ); + }); + } + + if is_err { Err(()) } else { Ok(()) } +} + +/// `a` if `condition` else `b` +fn conditional_select(a: u32, b: u32, condition: bool) -> u32 { + let a_mask = u32::from(condition).wrapping_neg(); + let b_mask = !a_mask; + (a & a_mask) | (b & b_mask) +} + +/// Check that the values of MSR IA32_ARCH_CAPABILITIES are compatible. +/// +/// If this check fails then programs that work when the value is `src_val`, may possibly +/// no longer work if the value is `dest_val`. +/// +/// See: Ch.2 Table 2-2. 
IA-32 Architectural MSRs in Intel SDM Vol.4 +fn check_arch_capabilities_compatibility( + src_val: u64, + dest_val: u64, + src_id: &str, + dest_id: &str, +) -> Result<(), ()> { + // Make a mask out of + const RDCL_NO: u64 = 1 << 0; + const IBRS_ALL: u64 = 1 << 1; + const SKIP_L1_DFL_VMENTRY: u64 = 1 << 3; + const SSB_NO: u64 = 1 << 4; + const MDS_NO: u64 = 1 << 5; + const TSX_CONTROL: u64 = 1 << 7; + const TAA_NO: u64 = 1 << 8; + const MCU_CONTROL: u64 = 1 << 9; + const MISC_PACKAGE_CTLS: u64 = 1 << 10; + const ENERGY_FILTERING_CTL: u64 = 1 << 11; + const DOITM: u64 = 1 << 12; + const MCU_ENUMERATION: u64 = 1 << 16; + const FB_CLEAR: u64 = 1 << 17; + const FB_CLEAR_CTRL: u64 = 1 << 18; + const BHI_NO: u64 = 1 << 20; + const XAPIC_DISABLE_STATUS: u64 = 1 << 21; + const MCU_EXTENDED_SERVICE: u64 = 1 << 22; + const OVERCLOCKING_STATUS: u64 = 1 << 23; + const PBRSB_NO: u64 = 1 << 24; + const GDS_CTRL: u64 = 1 << 25; + const GDS_NO: u64 = 1 << 26; + const RFDS_NO: u64 = 1 << 27; + // TODO: Should we perhaps ignore checking this (is it too strict)? 
+ const RFDS_CLEAR: u64 = 1 << 28; + const IGN_UMONITOR_SUPPORT: u64 = 1 << 29; + const MON_UMON_MITG_SUPPORT: u64 = 1 << 30; + const PBOPT_SUPPORT: u64 = 1 << 32; + + let mask: u64 = { + RDCL_NO + | IBRS_ALL + | SKIP_L1_DFL_VMENTRY + | SSB_NO + | MDS_NO + | TAA_NO + | TSX_CONTROL + | MCU_CONTROL + | MISC_PACKAGE_CTLS + | ENERGY_FILTERING_CTL + | DOITM + | MCU_ENUMERATION + | FB_CLEAR + | FB_CLEAR_CTRL + | XAPIC_DISABLE_STATUS + | MCU_EXTENDED_SERVICE + | OVERCLOCKING_STATUS + | GDS_CTRL + | IGN_UMONITOR_SUPPORT + | MON_UMON_MITG_SUPPORT + | PBOPT_SUPPORT + | RFDS_CLEAR + | PBRSB_NO + | GDS_NO + | RFDS_NO + | BHI_NO + }; + if let Err(only_in_src) = check_subset(src_val & mask, dest_val & mask) { + error!( + "IA32_ARCH_CAPABILITIES compatibility check failed: {src_id} value:={src_val:#x}, {dest_id} value:={dest_val:#x}" + ); + let definitions = msr_definitions::<{ RegisterAddress::IA32_ARCH_CAPABILITIES.0 }>(); + log_features_only_in_src(only_in_src, src_id, definitions, "IA32_ARCH_CAPABILITIES"); + Err(()) + } else { + Ok(()) + } +} + +/// Check that the values of MSR IA32_VMX_BASIC are compatible. 
+/// +/// See Intel SDM Vol.3D A.1 for more information about the IA32_VMX_BASIC MSR +/// +/// Bits 0 to 53 (except bit 49) must be equal, while bits 49, 54, 55 and 56 that are set in +/// `src_val` must also be set in `dest_val`. Failures are logged and reported as `Err(())`. +fn check_vmx_basic_compatibility( + src_val: u64, + dest_val: u64, + src_id: &str, + dest_id: &str, +) -> Result<(), ()> { + let mut is_err = false; + // All bits between 0 and 53 are expected to be equal (except bit 49) + let req_eq_mask: u64 = ((1 << 54) - 1) & (!(1 << 49)); + let src_req_eq = src_val & req_eq_mask; + let dest_req_eq = dest_val & req_eq_mask; + if src_req_eq != dest_req_eq { + is_err = true; + let definitions = msr_definitions::<{ RegisterAddress::IA32_VMX_BASIC.0 }>(); + // `log_inequalities` appends "compatibility failed" to the check id itself, so only the + // MSR name is passed here. + log_inequalities( + src_req_eq, + dest_req_eq, + definitions, + src_id, + dest_id, + "IA32_VMX_BASIC", + ); + } + // bits 49, 54, 55, and 56 indicate some form of capability and we need to check + // that these bits in the `src_val` are a subset of those in `dest_val` + let req_subset_eq_mask: u64 = (1 << 54) | (1 << 55) | (1 << 56) | (1 << 49); + let src_val_seq = req_subset_eq_mask & src_val; + let dest_val_seq = req_subset_eq_mask & dest_val; + is_err |= check_subset(src_val_seq, dest_val_seq).is_err(); + + if is_err { + error!( + "IA32_VMX_BASIC compatibility check failed: {src_id} value:={src_val:#x}, {dest_id} value:={dest_val:#x}" + ); + Err(()) + } else { + Ok(()) + } +} + +/// Check that every bit that is set in `src_val` is also set in `dest_val`. +/// +/// Upon error a bitset is returned with the +/// bits that are only set in `src_val` +fn check_subset(src_val: u64, dest_val: u64) -> Result<(), u64> { + let only_in_src_val = src_val & (src_val ^ dest_val); + if only_in_src_val != 0 { + Err(only_in_src_val) + } else { + Ok(()) + } +} + +/// Checks the following: +/// 1. For any X < 32; If bit X of src_val is 0 then bit X of dest_val is also 0 +/// 2. 
For any X >= 32; If bit X of src_val is 1 then bit X of dest_val is also 1 +struct CtlsCheck { + bitset_only_zero_src_lo: Option, + bitset_only_one_src_hi: Option, +} + +fn check_negative_subset_lo_and_subset_hi(src_val: u64, dest_val: u64) -> Result<(), CtlsCheck> { + let lo_mask = (1_u64 << 32) - 1; + let hi_mask = !lo_mask; + + let lo_check = check_subset((!src_val) & lo_mask, (!dest_val) & lo_mask); + + let hi_check = check_subset(src_val & hi_mask, dest_val & hi_mask); + + if lo_check.is_ok() && hi_check.is_ok() { + Ok(()) + } else { + Err(CtlsCheck { + bitset_only_zero_src_lo: lo_check.err(), + bitset_only_one_src_hi: hi_check.err(), + }) + } +} + +/// Check that the values of MSR IA32_VMX_MISC are compatible. +/// +/// See Intel SDM Vol.3D A.6 for more information about the IA32_VMX_MISC MSR +fn check_vmx_misc_msr( + src_value: u64, + dest_value: u64, + src_id: &str, + dest_id: &str, +) -> Result<(), ()> { + let mut is_err = false; + let subset_eq_check_mask: u64 = { + (1 << 5) + | (1 << 6) + | (1 << 7) + | (1 << 8) + | (1 << 14) + | (1 << 15) + | (1 << 28) + | (1 << 29) + | (1 << 30) + }; + if let Err(only_in_src) = check_subset( + subset_eq_check_mask & src_value, + subset_eq_check_mask & dest_value, + ) { + is_err = true; + let definitions = msr_definitions::<{ RegisterAddress::IA32_VMX_MISC.0 }>(); + log_features_only_in_src(only_in_src, src_id, definitions, "IA32_VMX_MISC"); + } + + let eq_mask: u64 = { + // TODO: Do we also need to check that the MSEG revisions match? 
+ (16..=24).fold(0_u64, |acc, next| acc | (1 << next)) + }; + + let src_req_eq_val = src_value & eq_mask; + let dest_req_eq_val = dest_value & eq_mask; + if src_req_eq_val != dest_req_eq_val { + is_err = true; + let definitions = msr_definitions::<{ RegisterAddress::IA32_VMX_MISC.0 }>(); + log_inequalities( + src_req_eq_val, + dest_req_eq_val, + definitions, + src_id, + dest_id, + "IA32_VMX_MISC", + ); + } + + let leq_mask: u64 = { (25..=27).fold(0_u64, |acc, next| acc | (1 << next)) }; + + let src_req_leq = src_value & leq_mask; + let dest_req_leq = dest_value & leq_mask; + if src_req_leq > dest_req_leq { + is_err = true; + debug!( + "IA32_VMX_MISC compatibility check failed when checking definition: {:?}, {src_id} has value:={src_req_leq}, {dest_id} has value:={dest_req_leq}", + max_msr_store_lists_def(), + ); + } + + if is_err { + error!( + "IA32_VMX_MISC compatibility check failed: {src_id} value:={src_value:#x}, {dest_id} value:={dest_value:#x}" + ); + Err(()) + } else { + Ok(()) + } +} + +/// Check compatibility of MSRs IA32_VMX_CR{I}_FIXED0 for I = 0, 4. +/// +/// See Intel SDM Vol.3D A.7 & A.8 for more information about these MSRs. +/// +/// NOTE: We don't need to check compatibility for CR{I}_FIXED1 because +/// that is ensured by CPUID. +fn check_cr_i_compatibility( + src_fixed0: u64, + dest_fixed0: u64, + src_id: &str, + dest_id: &str, +) -> Result<(), ()> { + let cri = const { + match I { + 0 => "CR0", + 4 => "CR4", + _ => { + panic!("only 0 and 4 may be used") + } + } + }; + + // Need to ensure that there are no bits that are only 0 in src_fixed0 and also no bits + // that are only 1 in src_fixed1. 
+ + if let Err(only_zero_in_src) = check_subset(!src_fixed0, !dest_fixed0) { + error!( + "IA32_VMX_{cri}_FIXED0 compatibility check failed: {src_id} value:={src_fixed0:#x}, {dest_id} value:={dest_fixed0:#x}" + ); + for_each_bitpos(only_zero_in_src, |bit_pos| { + debug!( + "IA32_VMX_{cri}_FIXED0 compatibility check failed: bit {bit_pos} is allowed to be 0 in {cri} for {src_id}, but not for {dest_id}" + ); + }); + Err(()) + } else { + Ok(()) + } +} + +/// Check compatibility of MSRs IA32_VMX_VMCS_ENUM. +/// +/// See Intel SDM Vol.3D A.9 for more information about IA32_VMX_VMCS_ENUM. +/// +/// The check fails (after logging an error) if the value held in bits 9:1 of `src_value` is +/// greater than the value held in the same bits of `dest_value`. +fn check_vmx_vmcs_enum_compatibility( + src_value: u64, + dest_value: u64, + src_id: &str, + dest_id: &str, +) -> Result<(), ()> { + let mask = (1..=9).fold(0_u64, |acc, next| acc | (1 << next)); + // The field starts at bit 1: shift it down so that the log reports the index itself rather + // than the raw masked register bits. The ordering of the two values is unaffected. + let src_req_leq = (src_value & mask) >> 1; + let dest_req_leq = (dest_value & mask) >> 1; + if src_req_leq > dest_req_leq { + error!( + "VMX_VMCS_ENUM compatibility check failed: MAX_INDEX for {src_id}:={src_req_leq} is greater than MAX_INDEX:={dest_req_leq} for {dest_id}" + ); + Err(()) + } else { + Ok(()) + } +} + +/// Check compatibility of MSRs IA32_VMX_EPT_VPID_CAP. +/// +/// See Intel SDM Vol. 3D A.10 for more information about IA32_VMX_EPT_VPID_CAP. 
+// Only if IA32_VMX_PROCBASED_CTLS[63] & (IA32_VMX_PROCBASED_CTLS2[33] | IA32_VMX_PROCBASED_CTLS2[37]) +fn check_vpid_and_ept_capabilities( + src_value: u64, + dest_value: u64, + src_id: &str, + dest_id: &str, +) -> Result<(), ()> { + let mut is_err = false; + let subset_eq_mask = { (1 << 44) - 1 }; + + if let Err(bits_only_in_src) = + check_subset(src_value & subset_eq_mask, dest_value & subset_eq_mask) + { + is_err = true; + let definitions = msr_definitions::<{ RegisterAddress::IA32_VMX_EPT_VPID_CAP.0 }>(); + log_features_only_in_src( + bits_only_in_src, + src_id, + definitions, + "IA32_VMX_EPT_VPID_CAP", + ); + } + + let leq_mask = { (48..=53).fold(0_u64, |acc, next| acc | (1 << next)) }; + let src_req_leq = src_value & leq_mask; + let dest_req_leq = dest_value & leq_mask; + if src_req_leq > dest_req_leq { + is_err = true; + debug!( + "IA32_VMX_EPT_VPID_CAP compatibility check failed: maximum HLAT prefix size is {src_req_leq} for {src_id}, but {dest_req_leq} for {dest_id}" + ); + } + if is_err { + error!( + "IA32_VMX_EPT_VPID_CAP compatibility check failed: {src_id} value:={src_value:#x}, {dest_id} value:={dest_value:#x}" + ); + Err(()) + } else { + Ok(()) + } +} + +fn for_each_bitpos(bits: u64, mut cb: impl FnMut(u8)) { + let mut bits = bits; + while bits != 0 { + let pos = bits.trailing_zeros() as u8; + cb(pos); + let lsb = bits & bits.wrapping_neg(); + bits ^= lsb; + } +} + +#[inline(never)] +#[cold] +fn log_features_only_in_src( + only_in_src: u64, + src_id: &str, + definitions: &[ValueDefinition], + check_id: &str, +) { + for_each_bitpos(only_in_src, |bit_pos| { + let Some(def) = definitions + .iter() + .find(|def| (def.bits_range.0..=def.bits_range.1).contains(&bit_pos)) + else { + debug!( + "{check_id} compatibility check failed: bit:={bit_pos} is only set for {src_id}" + ); + warn!( + "unable to produce proper debug log: No MSR value definition found for bit:={bit_pos} check:={check_id} compatibility" + ); + return; + }; + debug!( + "{check_id} 
compatibility check failed: feature bit {bit_pos} only set for {src_id}: feature definition:={def:?}" + ); + }); +} + +/// Logs, at debug level, every value definition whose bit range holds different values in +/// `src_val` and `dest_val`. +/// +/// `src_id`, `dest_id` and `check_id` are only used to label the log messages. +#[inline(never)] +#[cold] +fn log_inequalities( + src_val: u64, + dest_val: u64, + definitions: &[ValueDefinition], + src_id: &str, + dest_id: &str, + check_id: &str, +) { + for def in definitions { + let mask = + (def.bits_range.0..=def.bits_range.1).fold(0_u64, |acc, next| acc | (1_u64 << next)); + let val_src = mask & src_val; + let val_dest = mask & dest_val; + // Compare only the bits covered by this definition, not the whole registers. + if val_src != val_dest { + debug!( + "Check: {check_id} compatibility failed: on definition:={def:?}, values are required to be equal, but we have {src_id} value:={val_src:#x}, {dest_id} value:={val_dest:#x}" + ); + } + } +} + +#[inline(never)] +#[cold] +const fn max_msr_store_lists_def() -> &'static ValueDefinition { + const { + let defs = msr_definitions::<{ RegisterAddress::IA32_VMX_MISC.0 }>(); + // Currently stored at index = 8, if this changes we make sure that we fail at compile time. + // We do not perform a search as the order is unlikely to change frequently and we want to keep + // compile times down. + let def = &defs[8]; + assert!( + def.bits_range.0 == 25, + "MAX_MSR_STORE_LISTS definition is no longer at index 8 in the ValueDefinitions corresponding to IA32_VMX_MISC, please update the index" + ); + assert!( + def.bits_range.1 == 27, + "MAX_MSR_STORE_LISTS definition is no longer at index 8 in the ValueDefinitions corresponding to IA32_VMX_MISC, please update the index" + ); + def + } +} diff --git a/arch/src/x86_64/msr_definitions/intel/non_architectural_msrs.rs b/arch/src/x86_64/msr_definitions/intel/non_architectural_msrs.rs new file mode 100644 index 0000000000..b1f88aa809 --- /dev/null +++ b/arch/src/x86_64/msr_definitions/intel/non_architectural_msrs.rs @@ -0,0 +1,113 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +//! This module contains a list of all known non-architectural MSRS for various Intel +//! CPUs. 
This list only helps us detect new MSRs that we are not (yet) aware of when +//! generating CPU profiles, but has no importance beyond that. + +/// A list of known non-architectural MSRs +/// +/// Note: KVM_GET_MSR_FEATURE_INDEX_LIST may return non-architectural MSRS. We append those +/// to [`crate::x86_64::msr_definitions_intel::INTEL_MSR_FEATURE_DEFINITIONS`] and not here. +pub(in crate::x86_64) const NON_ARCHITECTURAL_INTEL_MSRS: [u32; 872] = [ + 0x11, 0x12, 0x13, 0x2a, 0x2b, 0x2c, 0x33, 0x34, 0x35, 0x39, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, + 0x46, 0x47, 0x53, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x80, 0x88, 0x89, 0x8a, 0x98, + 0x99, 0x9a, 0xa0, 0xa1, 0xa5, 0xa7, 0xcd, 0xe2, 0xe4, 0xed, 0xee, 0xef, 0xf0, 0x105, 0x116, + 0x118, 0x119, 0x11a, 0x11b, 0x11e, 0x13c, 0x140, 0x151, 0x17d, 0x17f, 0x180, 0x181, 0x182, + 0x183, 0x184, 0x185, 0x190, 0x191, 0x192, 0x193, 0x194, 0x196, 0x197, 0x19d, 0x1a1, 0x1a2, + 0x1a4, 0x1a6, 0x1a7, 0x1aa, 0x1ac, 0x1ad, 0x1ae, 0x1af, 0x1c8, 0x1c9, 0x1d7, 0x1d8, 0x1da, + 0x1db, 0x1dc, 0x1f1, 0x1f4, 0x1f5, 0x1fb, 0x1fc, 0x2a0, 0x2a1, 0x2a2, 0x2a3, 0x2a4, 0x2a5, + 0x2a6, 0x2a7, 0x2b8, 0x2b9, 0x2ba, 0x2bb, 0x2bc, 0x2bd, 0x2be, 0x2bf, 0x2c2, 0x2c3, 0x2c4, + 0x2c5, 0x2c6, 0x2c7, 0x2c8, 0x2c9, 0x2d6, 0x2d7, 0x2d9, 0x2f4, 0x2f5, 0x300, 0x301, 0x302, + 0x303, 0x304, 0x305, 0x306, 0x307, 0x308, 0x310, 0x311, 0x329, 0x350, 0x351, 0x354, 0x355, + 0x360, 0x361, 0x362, 0x363, 0x364, 0x365, 0x366, 0x367, 0x368, 0x369, 0x36a, 0x36b, 0x36c, + 0x36d, 0x36e, 0x36f, 0x370, 0x371, 0x393, 0x394, 0x395, 0x396, 0x39c, 0x3a0, 0x3a1, 0x3a2, + 0x3a3, 0x3a4, 0x3a5, 0x3a6, 0x3a7, 0x3a8, 0x3a9, 0x3aa, 0x3ab, 0x3ac, 0x3ad, 0x3ae, 0x3af, + 0x3b0, 0x3b1, 0x3b2, 0x3b3, 0x3b4, 0x3b5, 0x3b6, 0x3b7, 0x3b8, 0x3b9, 0x3ba, 0x3bb, 0x3bc, + 0x3bd, 0x3be, 0x3c0, 0x3c1, 0x3c2, 0x3c3, 0x3c4, 0x3c5, 0x3c6, 0x3c7, 0x3c8, 0x3c9, 0x3ca, + 0x3cb, 0x3cc, 0x3cd, 0x3e0, 0x3e1, 0x3f0, 0x3f2, 0x3f6, 0x3f7, 0x3f8, 0x3f9, 0x3fa, 0x3fc, + 0x3fd, 0x3fe, 0x3ff, 0x4e0, 0x4e2, 0x4e3, 
0x4f0, 0x4f8, 0x540, 0x541, 0x601, 0x606, 0x60a, + 0x60b, 0x60c, 0x60d, 0x610, 0x611, 0x612, 0x613, 0x614, 0x618, 0x619, 0x61b, 0x61c, 0x61e, + 0x620, 0x630, 0x631, 0x632, 0x638, 0x639, 0x63a, 0x640, 0x641, 0x642, 0x648, 0x649, 0x64a, + 0x64b, 0x64c, 0x64d, 0x64e, 0x64f, 0x650, 0x651, 0x652, 0x653, 0x655, 0x656, 0x657, 0x658, + 0x659, 0x65a, 0x65b, 0x65c, 0x65e, 0x65f, 0x660, 0x662, 0x664, 0x665, 0x666, 0x668, 0x669, + 0x66e, 0x680, 0x681, 0x682, 0x683, 0x684, 0x685, 0x686, 0x687, 0x688, 0x689, 0x68a, 0x68b, + 0x68c, 0x68d, 0x68e, 0x68f, 0x690, 0x691, 0x692, 0x693, 0x694, 0x695, 0x696, 0x697, 0x698, + 0x699, 0x69a, 0x69b, 0x69c, 0x69d, 0x69e, 0x69f, 0x6b0, 0x6b1, 0x6c0, 0x6c1, 0x6c2, 0x6c3, + 0x6c4, 0x6c5, 0x6c6, 0x6c7, 0x6c8, 0x6c9, 0x6ca, 0x6cb, 0x6cc, 0x6cd, 0x6ce, 0x6cf, 0x6d0, + 0x6d1, 0x6d2, 0x6d3, 0x6d4, 0x6d5, 0x6d6, 0x6d7, 0x6d8, 0x6d9, 0x6da, 0x6db, 0x6dc, 0x6dd, + 0x6de, 0x6df, 0x700, 0x701, 0x702, 0x703, 0x704, 0x705, 0x706, 0x707, 0x708, 0x709, 0x70a, + 0x70b, 0x710, 0x711, 0x712, 0x713, 0x714, 0x715, 0x716, 0x717, 0x718, 0x719, 0x71a, 0x71b, + 0x720, 0x721, 0x722, 0x723, 0x724, 0x725, 0x726, 0x727, 0x728, 0x729, 0x72a, 0x72b, 0x72c, + 0x72d, 0x72e, 0x72f, 0x730, 0x731, 0x732, 0x733, 0x734, 0x735, 0x736, 0x737, 0x738, 0x739, + 0x73a, 0x73b, 0x73c, 0x73d, 0x73e, 0x73f, 0x740, 0x741, 0x742, 0x743, 0x744, 0x745, 0x746, + 0x747, 0x748, 0x749, 0x9ff, 0xc00, 0xc01, 0xc02, 0xc06, 0xc08, 0xc09, 0xc10, 0xc11, 0xc16, + 0xc17, 0xc20, 0xc21, 0xc22, 0xc24, 0xc30, 0xc31, 0xc32, 0xc33, 0xc34, 0xc35, 0xc36, 0xc37, + 0xc38, 0xc39, 0xc40, 0xc41, 0xc42, 0xc50, 0xc51, 0xc52, 0xc53, 0xc54, 0xc55, 0xc56, 0xc57, + 0xc60, 0xc61, 0xc62, 0xc70, 0xc71, 0xc72, 0xc73, 0xc74, 0xc75, 0xc76, 0xc77, 0xc84, 0xd94, + 0xd95, 0xd96, 0xd97, 0xd98, 0xd99, 0xd9a, 0xd9b, 0xda1, 0xda2, 0xda4, 0xdb3, 0xdb4, 0xdb5, + 0xdb6, 0xdb7, 0xdb8, 0xdb9, 0xdba, 0xdbb, 0xdc0, 0xdc1, 0xdc2, 0xdc3, 0xdc4, 0xdc5, 0xdc6, + 0xdc7, 0xdc8, 0xdc9, 0xdca, 0xdcb, 0xdcc, 0xdcd, 0xdce, 0xdcf, 0xdd0, 0xdd1, 0xdd2, 
0xdd3, + 0xdd4, 0xdd5, 0xdd6, 0xdd7, 0xdd8, 0xdd9, 0xdda, 0xddb, 0xddc, 0xddd, 0xdde, 0xddf, 0xde0, + 0xde1, 0xde2, 0xde4, 0xdf0, 0xdf1, 0xdf2, 0xdf3, 0xdf4, 0xdf5, 0xdf6, 0xdf7, 0xdf8, 0xdf9, + 0xdfa, 0xdfb, 0xe02, 0xe03, 0xe04, 0xe05, 0xe06, 0xe07, 0xe08, 0xe09, 0xe0a, 0xe0b, 0xe0c, + 0xe0d, 0xe0e, 0xe0f, 0xe10, 0xe11, 0xe12, 0xe13, 0xe14, 0xe15, 0xe16, 0xe17, 0xe18, 0xe19, + 0xe1a, 0xe1b, 0xe1c, 0xe1d, 0xe1e, 0xe1f, 0xe20, 0xe21, 0xe22, 0xe23, 0xe24, 0xe25, 0xe26, + 0xe27, 0xe28, 0xe29, 0xe2a, 0xe2b, 0xe2c, 0xe2d, 0xe2e, 0xe2f, 0xe30, 0xe31, 0xe32, 0xe33, + 0xe34, 0xe35, 0xe36, 0xe37, 0xe38, 0xe39, 0xe3a, 0xe3b, 0xe3c, 0xe3d, 0xe3e, 0xe3f, 0xe40, + 0xe41, 0xe42, 0xe43, 0xe44, 0xe45, 0xe46, 0xe47, 0xe48, 0xe49, 0xe4a, 0xe4b, 0xe4d, 0xe4e, + 0xe50, 0xe51, 0xe52, 0xe53, 0xe54, 0xe55, 0xe56, 0xe57, 0xe58, 0xe59, 0xe5a, 0xe5c, 0xe5d, + 0xe5e, 0xe60, 0xe61, 0xe62, 0xe63, 0xe64, 0xe65, 0xe66, 0xe67, 0xe68, 0xe69, 0xe6a, 0xe6b, + 0xe70, 0xe71, 0xe72, 0xe73, 0xe74, 0xe75, 0xe76, 0xe77, 0xe78, 0xe79, 0xe7a, 0xe7b, 0xe80, + 0xe81, 0xe82, 0xe83, 0xe84, 0xe85, 0xe86, 0xe87, 0xe88, 0xe89, 0xe8b, 0xe90, 0xe91, 0xe92, + 0xe93, 0xe94, 0xe95, 0xe96, 0xe97, 0xe98, 0xe99, 0xe9a, 0xe9b, 0xea0, 0xea1, 0xea2, 0xea3, + 0xea4, 0xea5, 0xea6, 0xea7, 0xea8, 0xea9, 0xeaa, 0xeab, 0xeb0, 0xeb1, 0xeb2, 0xeb3, 0xeb4, + 0xeb5, 0xeb6, 0xeb7, 0xeb8, 0xeb9, 0xeba, 0xebb, 0xec0, 0xec1, 0xec2, 0xec3, 0xec4, 0xec5, + 0xec6, 0xec7, 0xec8, 0xec9, 0xeca, 0xecb, 0xed0, 0xed1, 0xed2, 0xed3, 0xed4, 0xed5, 0xed6, + 0xed7, 0xed8, 0xed9, 0xeda, 0xedb, 0xee0, 0xee1, 0xee2, 0xee3, 0xee4, 0xee5, 0xee6, 0xee7, + 0xee8, 0xee9, 0xeea, 0xeeb, 0xef0, 0xef1, 0xef2, 0xef3, 0xef4, 0xef5, 0xef6, 0xef7, 0xef8, + 0xef9, 0xefa, 0xefb, 0xf00, 0xf01, 0xf02, 0xf03, 0xf04, 0xf05, 0xf06, 0xf07, 0xf08, 0xf09, + 0xf0a, 0xf0b, 0xf10, 0xf11, 0xf12, 0xf13, 0xf14, 0xf15, 0xf16, 0xf17, 0xf18, 0xf19, 0xf1a, + 0xf1b, 0xf40, 0xf41, 0xf42, 0xf50, 0xf51, 0xf52, 0xf53, 0xf54, 0xf55, 0xf56, 0xf57, 0xf58, + 0xf59, 0xf5a, 0xf5b, 0xfc0, 0xfc1, 
0xfc2, 0xfd0, 0xfd1, 0xfd2, 0xfd3, 0xfd4, 0xfd5, 0xfd6, + 0xfd7, 0xfd8, 0xfd9, 0xfda, 0xfdb, 0x1309, 0x130a, 0x130b, 0x14c1, 0x14c2, 0x14c3, 0x14c4, + 0x14c5, 0x14c6, 0x14c7, 0x14c8, 0x1878, 0x1a8e, 0x1a8f, 0x2000, 0x2001, 0x2002, 0x2003, 0x2008, + 0x2009, 0x200a, 0x200b, 0x2010, 0x2011, 0x2012, 0x2013, 0x2018, 0x2019, 0x201a, 0x201b, 0x2020, + 0x2021, 0x2022, 0x2023, 0x2028, 0x2029, 0x202a, 0x202b, 0x2030, 0x2031, 0x2032, 0x2033, 0x2038, + 0x2039, 0x203a, 0x203b, 0x2040, 0x2041, 0x2042, 0x2043, 0x2048, 0x2049, 0x204a, 0x204b, 0x2fd0, + 0x2fd1, 0x2fd2, 0x2fd3, 0x2fd4, 0x2fd5, 0x2fd8, 0x2fd9, 0x2fda, 0x2fdb, 0x2fdc, 0x2fdd, 0x2fde, + 0x2fdf, 0x2ff0, 0x2ff2, 0x107cc, 0x107cd, 0x107ce, 0x107cf, 0x107d0, 0x107d1, 0x107d2, 0x107d3, + 0x107d8, +]; + +// TODO: Look out for 0x13c (used to check for AES instruction on Intel Atom and ..)? +// TODO: 0x35 gives THREAD_COUNT will some programs stop working if we deny this MSR? + +// TODO: It is perfectly possible to convert the following test into compile time checks. +// We take care of that later. 
+#[cfg(test)] +mod tests { + use super::super::msr_based_features::INTEL_MSR_FEATURE_DEFINITIONS; + use super::super::{FORBIDDEN_IA32_MSR_RANGES, PERMITTED_IA32_MSRS}; + use super::NON_ARCHITECTURAL_INTEL_MSRS; + #[test] + fn disjoint_from_others() { + let mut unique_count = 0; + for msr in NON_ARCHITECTURAL_INTEL_MSRS { + if (!PERMITTED_IA32_MSRS.contains(&msr)) + && (!FORBIDDEN_IA32_MSR_RANGES + .iter() + .any(|r| (r.0..=r.1).contains(&msr))) + && (!INTEL_MSR_FEATURE_DEFINITIONS + .as_slice() + .iter() + .any(|(address, _)| address.0 == msr)) + { + unique_count += 1; + } + } + assert_eq!(unique_count, NON_ARCHITECTURAL_INTEL_MSRS.len()); + } +} diff --git a/arch/src/x86_64/msr_definitions/kvm.rs b/arch/src/x86_64/msr_definitions/kvm.rs new file mode 100644 index 0000000000..d85cc22f98 --- /dev/null +++ b/arch/src/x86_64/msr_definitions/kvm.rs @@ -0,0 +1,93 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +//! This module lists KVM defined MSRS. It is currently only used when generating CPU profiles +//! (hence feature gated), but may possibly be extended and utilized for better debug logs in +//! the future. 
+pub(in crate::x86_64) use permitted_msrs::PROFILE_PERMITTED_KVM_MSRS; + +use crate::x86_64::CpuidReg; +use crate::x86_64::cpuid_definitions::Parameters; + +mod permitted_msrs { + use super::{CpuidReg, Parameters}; + use crate::x86_64::cpuid_definitions::kvm::assert_not_denied_cpuid_feature; + + const MSR_KVM_WALL_CLOCK: u32 = 0x11; + const MSR_KVM_SYSTEM_TIME: u32 = 0x12; + const _KVM_CLOCKSOURCE_CPUID_CHECK: () = assert_not_denied_cpuid_feature::<0>(&Parameters { + leaf: 0x4000_0001, + sub_leaf: (0..=0), + register: CpuidReg::EAX, + }); + + const MSR_KVM_WALL_CLOCK_NEW: u32 = 0x4b564d00; + const MSR_KVM_SYSTEM_TIME_NEW: u32 = 0x4b564d01; + const _KVM_CLOCKSOURCE2_CHECK: () = assert_not_denied_cpuid_feature::<3>(&Parameters { + leaf: 0x4000_0001, + sub_leaf: (0..=0), + register: CpuidReg::EAX, + }); + + const MSR_KVM_ASYNC_PF_EN: u32 = 0x4b564d02; + const _KVM_ASYNC_PF_CHECK: () = assert_not_denied_cpuid_feature::<4>(&Parameters { + leaf: 0x4000_0001, + sub_leaf: (0..=0), + register: CpuidReg::EAX, + }); + + const MSR_KVM_STEAL_TIME: u32 = 0x4b564d03; + const _KVM_STEAL_TIME_CHECK: () = assert_not_denied_cpuid_feature::<5>(&Parameters { + leaf: 0x4000_0001, + sub_leaf: (0..=0), + register: CpuidReg::EAX, + }); + + const MSR_KVM_EOI_EN: u32 = 0x4b564d04; + const _KVM_EOI_EN_CHECK: () = assert_not_denied_cpuid_feature::<6>(&Parameters { + leaf: 0x4000_0001, + sub_leaf: (0..=0), + register: CpuidReg::EAX, + }); + + const MSR_KVM_POLL_CONTROL: u32 = 0x4b564d05; + const _KVM_POLL_CONTROL_CHECK: () = assert_not_denied_cpuid_feature::<12>(&Parameters { + leaf: 0x4000_0001, + sub_leaf: (0..=0), + register: CpuidReg::EAX, + }); + + const MSR_KVM_ASYNC_PF_INT: u32 = 0x4b564d06; + const MSR_KVM_ASYNC_PF_ACK: u32 = 0x4b564d07; + const _KVM_ASYNC_PF_INT_ACK_CHECK: () = assert_not_denied_cpuid_feature::<14>(&Parameters { + leaf: 0x4000_0001, + sub_leaf: (0..=0), + register: CpuidReg::EAX, + }); + + const MSR_KVM_MIGRATION_CONTROL: u32 = 0x4b564d08; + const 
_KVM_MIGRATION_CONTROL_CHECK: () = assert_not_denied_cpuid_feature::<17>(&Parameters { + leaf: 0x4000_0001, + sub_leaf: (0..=0), + register: CpuidReg::EAX, + }); + + /// KVM defined MSRS that CPU profiles may include in their permitted MSR definitions. + /// + /// This list is (currently) only utilized when generating CPU profiles. + pub(in crate::x86_64) const PROFILE_PERMITTED_KVM_MSRS: [u32; 11] = [ + MSR_KVM_WALL_CLOCK, + MSR_KVM_SYSTEM_TIME, + MSR_KVM_WALL_CLOCK_NEW, + MSR_KVM_SYSTEM_TIME_NEW, + MSR_KVM_ASYNC_PF_EN, + MSR_KVM_STEAL_TIME, + MSR_KVM_EOI_EN, + MSR_KVM_POLL_CONTROL, + MSR_KVM_ASYNC_PF_INT, + MSR_KVM_ASYNC_PF_ACK, + MSR_KVM_MIGRATION_CONTROL, + ]; +} diff --git a/arch/src/x86_64/msr_definitions/mod.rs b/arch/src/x86_64/msr_definitions/mod.rs new file mode 100644 index 0000000000..805b83c863 --- /dev/null +++ b/arch/src/x86_64/msr_definitions/mod.rs @@ -0,0 +1,101 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +use serde::{Deserialize, Serialize}; +pub mod intel; +#[cfg(all(feature = "kvm", feature = "cpu_profile_generation"))] +pub mod kvm; + +pub mod hyperv; + +use crate::{deserialize_u32_hex, serialize_u32_hex}; +/// The register address of an MSR +#[derive(Debug, Copy, Clone, Eq, PartialEq, Serialize, Deserialize)] +pub struct RegisterAddress( + #[serde( + serialize_with = "serialize_u32_hex", + deserialize_with = "deserialize_u32_hex" + )] + pub u32, +); + +/// Describes a policy for how the corresponding MSR data should be considered when building +/// a CPU profile. +/// +/// This is the MSR analogue of [cpuid_definitions::ProfilePolicy](crate::x86_64::cpuid_definitions::ProfilePolicy) +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub enum ProfilePolicy { + /// Store the corresponding data when building the CPU profile. 
+ /// + /// When the CPU profile gets utilized the corresponding data will be set into the modified + /// MSR(s) + Inherit, + /// Ignore the corresponding data when building the CPU profile. + /// + /// When the CPU profile gets utilized the corresponding data will then instead get + /// extracted from the host. + /// + /// This variant is typically set for data that has no effect on migration compatibility, + /// but there may be some exceptions such as data which is necessary to run the VM at all, + /// but must coincide with whatever is on the host. + Passthrough, + /// Set the following hardcoded value in the CPU profile. + /// + /// This variant is typically used for features/values that don't work well with live migration (even when using the exact same physical CPU model). + Static(u64), + /// Deny read and write accesses to this MSR. + /// + /// This can only be applied to an MSR in its entirety and not to individual bit ranges + Deny, +} + +/// A description of a range of bits in an MSR. +/// +/// This is the MSR analogue of [cpuid_definitions::ValueDefinition](crate::x86_64::cpuid_definitions::ValueDefinition) +#[derive(Clone, Copy, Debug)] +pub struct ValueDefinition { + /// A short name for the value. + pub short: &'static str, + /// A description of the value. + pub description: &'static str, + /// The range of bits in the MSR corresponding to this feature or value. + /// + /// This is not a `RangeInclusive` because that type does unfortunately not implement `Copy`. + pub bits_range: (u8, u8), + /// The policy corresponding to this value when building CPU profiles. + pub policy: ProfilePolicy, +} + +/// Describes values within an MSR. +/// +/// NOTE: The only way to interact with this value (beyond this crate) is via the const [`Self::as_slice()`](Self::as_slice) method. 
+/// +/// This is the MSR analogue of [cpuid_definitions::ValueDefinitions](crate::x86_64::cpuid_definitions::ValueDefinitions) +#[derive(Clone, Copy, Debug)] +pub struct ValueDefinitions(&'static [ValueDefinition]); +impl ValueDefinitions { + /// Constructor permitting at most 64 entries. + const fn new(msr_descriptions: &'static [ValueDefinition]) -> Self { + // Note that this function is only called within this module, at compile time, hence it is fine to have some + // additional sanity checks such as the following assert. + assert!(msr_descriptions.len() <= 64); + Self(msr_descriptions) + } + /// Converts this into a slice representation. This is the only way to read values of this type. + pub const fn as_slice(&self) -> &'static [ValueDefinition] { + self.0 + } +} + +/// Describes multiple MSRs. +/// +/// Each wrapped [`ValueDefinitions`] corresponds to the given [`RegisterAddress`] in the same tuple. +pub struct MsrDefinitions([(RegisterAddress, ValueDefinitions); NUM]); + +impl MsrDefinitions { + pub const fn as_slice(&self) -> &[(RegisterAddress, ValueDefinitions); NUM] { + &self.0 + } +} diff --git a/arch/src/x86_64/msr_filter.rs b/arch/src/x86_64/msr_filter.rs new file mode 100644 index 0000000000..0edad0e364 --- /dev/null +++ b/arch/src/x86_64/msr_filter.rs @@ -0,0 +1,361 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::cell::Cell; +use std::fmt::Write; + +use hypervisor::MsrFilterRange; + +use super::Error; + +/// The maximum number of MSR filter ranges an MSR filter may consist of. +const MAX_FILTERS: usize = { + #[cfg(feature = "kvm")] + { + hypervisor::kvm::KVM_MSR_FILTER_MAX_RANGES + } + #[cfg(not(feature = "kvm"))] + { + // TODO: Change this when adding support for CPU profiles with MSHV + 16 + } +}; + +/// THE maximum number of bytes the bitmap arena used for the filter may occupy. +/// This is to ensure that we do not allocate too much memory for the bitmaps in +/// the filter ranges. 
+pub const MAX_BITMAP_SIZE: usize = MAX_FILTERS * 1024 * 1024; + +/// Apply a filter which denies guests any kind of access to the MSRs in `denied_msrs`. +/// +/// # Assumptions +/// +/// This function may explicitly mark certain MSRs different from those in `denied_msrs` as +/// both READ + Write permitted. We assume that the hypervisor will permit this filter being set +/// regardless and rather injects an exception if guests attempt to read/modify these MSRs in anyway +/// that is incompatible with the hardware and/or hypervisor. +/// +/// # Errors +/// +/// This errors if any of the following conditions hold: +/// +/// 1. Too much memory is required to construct the MSR filter that covers all of the denied MSRs. +/// 2. The VM/Hypervisor fails to apply the MSR filter. +pub fn filter_denied_msrs( + mut denied_msrs: Vec, + vm: &dyn hypervisor::Vm, +) -> Result<(), crate::Error> { + if denied_msrs.is_empty() { + return Ok(()); + } + denied_msrs.sort_unstable(); + + for msr in &denied_msrs { + log::debug!("MSR:={msr:#x} is set to be denied"); + } + + let mut bitmap_arena = Vec::new(); + let (filter, num_filter_ranges) = denied_to_filter(&denied_msrs, &mut bitmap_arena)?; + + if let Err(e) = vm.msr_filter(&filter[..num_filter_ranges], false) { + // Log more details at the debug level. Note that this error is likely to be reproducible and happens close to startup, hence + // it should be relatively easy to set the necessary log level if/when debugging becomes desirable. + for filter_range in &filter[..num_filter_ranges] { + // We want to encode the bitmap as a string of the form "[, ,... 
]" + let mut bitmap_hex_encoded = String::with_capacity((4 * filter_range.bitmap.len()) + 2); + let _ = write!(&mut bitmap_hex_encoded, "["); + for b in filter_range.bitmap.iter() { + let _ = write!(&mut bitmap_hex_encoded, "{b:#x},"); + } + // Remove the final "," from the string + bitmap_hex_encoded.pop(); + let _ = write!(&mut bitmap_hex_encoded, "]"); + log::debug!( + "Failed to set MSR filter containing filter range: base:={:#x}, nmsrs:={:#x}, bitmap:={}", + filter_range.base, + filter_range.nmsrs, + bitmap_hex_encoded + ); + } + Err(Into::into(Error::MsrFilter(e))) + } else { + Ok(()) + } +} + +/// Essentially partitions `denied_sorted` into up to [`MAX_FILTERS`] ranges of +/// indices. +/// +/// These ranges may then be used to place the MSRs into distinct [`MsrFilterRanges`](MsrFilterRange). +/// In other words: if (a,b) is an entry in the output of this function, then all MSRs in +/// `denied_sorted[a..=b]` are intended to be placed in the same filter range. +/// +/// This partition minimizes the amount of memory necessary to construct the bitmaps for each +/// MSR filter range, that collectively cover all MSRs in `denied_sorted`, under the constraint +/// that none of the MSR filter ranges can intersect the x2APIC-related MSR range (0x801..=0x8ff). +/// +/// ## Performance +/// +/// This function has complexity `O(MAX_FILTERS * denied_sorted.len())` and does not allocate. 
+fn denied_to_range_indices<'a>( + denied_sorted: &[u32], + r_buff: &'a mut [(usize, usize); MAX_FILTERS], +) -> &'a [(usize, usize)] { + let mut d_prevs = [u32::MAX; MAX_FILTERS]; + let mut r_cnt = 0; + let mut min_dprev = u32::MAX; + let mut min_pos = 0_usize; + + let compute_dprev = |p: u32, n: u32| { + // Make dprev impractically large if it overlaps the x2apic MSR range + if (p <= 0x8ff) && (n > 0x800) { + u32::MAX + } else { + n - p + } + }; + + // Called as soon as we discover a full contiguous range of MSRs to be denied + // `r_s` is the index of the first MSR in this range and `r_e` the last. + let mut eval_deny_range = |r_s: usize, r_e: usize| { + const LAST_IDX: usize = MAX_FILTERS - 1; + let is_first = r_cnt == 0; + + let d_prev = if is_first { + u32::MAX + } else { + let l_prev_idx = r_buff[r_cnt - 1].1; + let l_prev = denied_sorted[l_prev_idx]; + compute_dprev(l_prev, denied_sorted[r_s]) + }; + + if r_cnt < MAX_FILTERS { + d_prevs[r_cnt] = d_prev; + r_buff[r_cnt] = (r_s, r_e); + if d_prev < min_dprev { + min_dprev = d_prev; + min_pos = r_cnt; + } + r_cnt += 1; + } else { + // Need to join ranges to find space + // The idea is to merge the range groups closest to each other + if d_prev <= min_dprev { + // Make the final range group cover this range + r_buff[LAST_IDX].1 = r_e; + } else { + // Merge some previously gathered range groups to make space + r_buff[min_pos - 1].1 = r_buff[min_pos].1; + // shift every thing after min_pos left + { + shift_left(&mut r_buff[min_pos..]); + shift_left(&mut d_prevs[min_pos..]); + } + // Now we have space for the new entry + r_buff[LAST_IDX] = (r_s, r_e); + d_prevs[LAST_IDX] = d_prev; + // Recompute minimum meta data + min_dprev = *d_prevs.iter().min().unwrap(); + min_pos = d_prevs.iter().position(|d| *d == min_dprev).unwrap(); + } + } + }; + // Produce all range groups + let mut offset = 0_usize; + let mut deny_slice = denied_sorted; + while let Some(deny_slice_skip1) = deny_slice.get(1..) 
{ + let Some(pos) = deny_slice_skip1 + .iter() + .zip(deny_slice) + .position(|(n, p)| (n - p) > 1) + else { + break; + }; + let r_s = offset; + let r_e = offset + pos; + eval_deny_range(r_s, r_e); + offset = r_e + 1; + deny_slice = &denied_sorted[offset..]; + } + // Since there is no gap beyond the last element, we have one final deny range to + // evaluate + eval_deny_range(offset, denied_sorted.len() - 1); + &r_buff[..r_cnt] +} + +/// Construct `range_indices.len() (<= MAX_FILTERS)` [`MsrFilterRanges`](MsrFilterRange) +/// to deny all MSRs in `denied_sorted`. +/// +/// For each pair `(r_s, r_e)` in `range_indices` there will be a corresponding +/// filter range denying the MSRs in [`denied_sorted[r_s..=r_e]`]. +/// +/// # Errors +/// +/// This function can only error if more than [`MAX_BITMAP_SIZE`] bytes are required +/// to construct the filters. +/// +/// # Performance +/// +/// This function allocates once (but a possibly large allocation) and has otherwise +/// computational complexity `O(MAX_FILTERS * denied_sorted.len())`. 
+fn range_indices_to_filter<'a>( + denied_sorted: &[u32], + range_indices: &[(usize, usize)], + bitmap_arena: &'a mut Vec, +) -> Result<[MsrFilterRange<'a>; MAX_FILTERS], Error> { + let mut out = [MsrFilterRange::default().with_read_write_flags(); MAX_FILTERS]; + let bytes_to_allocate: usize = range_indices + .iter() + .copied() + .map(|(s, e)| ((denied_sorted[e] - denied_sorted[s]) + 1).div_ceil(8)) + .map(|v| v as usize) + .sum(); + + if bytes_to_allocate > MAX_BITMAP_SIZE { + return Err(Error::MsrFilterTooLarge(bytes_to_allocate)); + } + + bitmap_arena.extend(std::iter::repeat_n(u8::MAX, bytes_to_allocate)); + + let mut arena_slice = &mut bitmap_arena[..]; + for (idx, (r_s, r_e)) in range_indices.iter().enumerate() { + let base = denied_sorted[*r_s]; + let nmsrs = (denied_sorted[*r_e] - denied_sorted[*r_s]) + 1; + let (bm, rest) = arena_slice.split_at_mut(nmsrs.div_ceil(8) as usize); + arena_slice = rest; + for msr in &denied_sorted[*r_s..=*r_e] { + let d_base = *msr - base; + let byte_idx = (d_base) / 8; + let bit = 1 << (d_base % 8); + bm[byte_idx as usize] ^= bit; + } + // Set the fields in the range filter + { + let filter_range = &mut out[idx]; + filter_range.base = base; + filter_range.nmsrs = nmsrs; + filter_range.bitmap = bm; + } + } + + Ok(out) +} + +/// Prepare up to [`MAX_FILTERS`] [`MsrFilterRanges`](MsrFilterRange) +/// that collectively deny each of the MSRs specified in `denied_sorted`. +/// +/// The second component returned from this function is the number of +/// valid entries in the returned array. +/// +/// # Errors +/// +/// This function can only error if more than [`MAX_BITMAP_SIZE`] bytes are required +/// to construct the filters. 
+fn denied_to_filter<'a>( + denied_sorted: &[u32], + bitmap_arena: &'a mut Vec, +) -> Result<([MsrFilterRange<'a>; MAX_FILTERS], usize), Error> { + let mut range_indices_buffer = [(0, 0); MAX_FILTERS]; + let range_indices = denied_to_range_indices(denied_sorted, &mut range_indices_buffer); + + range_indices_to_filter(denied_sorted, range_indices, bitmap_arena) + .map(|filter| (filter, range_indices.len())) +} + +/// Convenience function that rotates the slice left by one position. +/// +/// Every element moves one slot towards the front of the slice, while the element that was +/// first (prior to calling this function) ends up in the last slot. +fn shift_left(slice: &mut [T]) { + for w in Cell::from_mut(slice).as_slice_of_cells().windows(2) { + Cell::swap(&w[0], &w[1]); + } +} + +#[cfg(test)] +mod unit_tests { + use hypervisor::MsrFilterRange; + use proptest::prelude::*; + + use super::{MAX_BITMAP_SIZE, MAX_FILTERS, denied_to_filter}; + + /// transforms entries out of the x2apic MSR range and sorts + dedups the vector + fn prepare(bases: Vec) -> Vec { + // Remove bases in the x2apic MSR range + let mut v: Vec = bases + .into_iter() + .map(|b| { + if (0x800..=0x8ff).contains(&b) { + b % 0x800 + } else { + b + } + }) + .collect(); + v.sort_unstable(); + v.dedup(); + v + } + + fn filter_to_msrs(filter: &[MsrFilterRange<'_>]) -> Vec { + let mut out = Vec::new(); + for filter_range in filter { + let base = filter_range.base; + let mut num_msrs: u32 = 0; + for byte in filter_range.bitmap { + let mut inverse = !(*byte); + while inverse != 0 { + let idx = inverse.trailing_zeros(); + if num_msrs + idx > filter_range.nmsrs { + break; + } + out.push(base + num_msrs + idx); + let lsb = inverse & inverse.wrapping_neg(); + inverse ^= lsb; + } + num_msrs += 8; + } + } + out + } + + proptest! 
{ + #[test] + fn denied_to_filer_works_short(prepared_msrs in (prop::collection::vec(0..u32::MAX, 1..MAX_FILTERS)).prop_map(prepare)) { + let mut bitmap_arena = Vec::new(); + let Ok((filter, num_filter_ranges)) = denied_to_filter(&prepared_msrs, &mut bitmap_arena) else { + return Ok(()); + }; + let mut recomputed_msrs = filter_to_msrs(&filter[..num_filter_ranges]); + recomputed_msrs.sort_unstable(); + prop_assert_eq!(prepared_msrs, recomputed_msrs); + } + } + + proptest! { + #[test] + fn denied_to_filer_works(prepared_msrs in (prop::collection::vec(0..u32::MAX, 17..70)).prop_map(prepare)) { + let mut bitmap_arena = Vec::new(); + let Ok((filter, num_filter_ranges)) = denied_to_filter(&prepared_msrs, &mut bitmap_arena) else { + return Ok(()); + }; + let mut recomputed_msrs = filter_to_msrs(&filter[..num_filter_ranges]); + recomputed_msrs.sort_unstable(); + prop_assert_eq!(prepared_msrs, recomputed_msrs); + } + } + + // Simple test that doesn't take too long to execute. We can + // include a more thorough test later if desired. + #[test] + fn catches_attempt_to_allocate_too_much_memory() { + let mut bitmap_arena = Vec::new(); + let denied_msrs: Vec = (0..MAX_FILTERS * 8 * 2) + .map(|i| i * MAX_BITMAP_SIZE) + .map(|v| u32::try_from(v).unwrap()) + .collect(); + let _ = denied_to_filter(&denied_msrs, &mut bitmap_arena).unwrap_err(); + } +} diff --git a/arch/src/x86_64/regs.rs b/arch/src/x86_64/regs.rs index c93f39520b..baaedf57ed 100644 --- a/arch/src/x86_64/regs.rs +++ b/arch/src/x86_64/regs.rs @@ -10,7 +10,7 @@ use std::{mem, result}; use hypervisor::arch::x86::gdt::{gdt_entry, segment_from_gdt}; use hypervisor::arch::x86::regs::CR0_PE; -use hypervisor::arch::x86::{FpuState, SpecialRegisters}; +use hypervisor::arch::x86::{FpuState, MsrEntry, SpecialRegisters}; use thiserror::Error; use vm_memory::{Address, Bytes, GuestMemory, GuestMemoryError}; @@ -33,6 +33,8 @@ pub enum Error { /// Setting up MSRs failed. 
#[error("Setting up MSRs failed")] SetModelSpecificRegisters(#[source] hypervisor::HypervisorCpuError), + #[error("Setting up MSRs failed: Not all MSRs could be set. See logs for more info.")] + SetModelSpecificRegistersPartial, /// Failed to set SREGs for this CPU. #[error("Failed to set SREGs for this CPU")] SetStatusRegisters(#[source] hypervisor::HypervisorCpuError), @@ -81,11 +83,35 @@ pub fn setup_fpu(vcpu: &dyn hypervisor::Vcpu) -> Result<()> { /// # Arguments /// /// * `vcpu` - Structure for the VCPU that holds the VCPU's fd. -pub fn setup_msrs(vcpu: &dyn hypervisor::Vcpu) -> Result<()> { - vcpu.set_msrs(vcpu.boot_msr_entries()) - .map_err(Error::SetModelSpecificRegisters)?; +/// * `feature_msr_updates` - A (possibly empty) slice of MSR-based features +/// that should be set as part of the setup. If the slice is empty then +/// only boot msr entries are set, otherwise the given slice will also be +/// included in the setup. +pub fn setup_msrs(vcpu: &dyn hypervisor::Vcpu, feature_msr_updates: &[MsrEntry]) -> Result<()> { + let boot_entries = vcpu.boot_msr_entries(); + let mut entries_for_update = Vec::new(); + let setup_entries: &mut &[MsrEntry] = &mut (&boot_entries[..]); - Ok(()) + if !feature_msr_updates.is_empty() { + entries_for_update.extend_from_slice(feature_msr_updates); + entries_for_update.extend_from_slice(boot_entries); + *setup_entries = &entries_for_update[..]; + } + let num_msrs_written = vcpu + .set_msrs(setup_entries) + .map_err(Error::SetModelSpecificRegisters)?; + if num_msrs_written < setup_entries.len() { + for msr in &setup_entries[num_msrs_written..] { + log::error!( + "Could not set MSR with register address:={:#x} and value:={:#x}", + msr.index, + msr.data + ); + } + Err(Into::into(Error::SetModelSpecificRegistersPartial)) + } else { + Ok(()) + } } /// Configure base registers for a given CPU. 
diff --git a/arch/src/x86_64/smbios.rs b/arch/src/x86_64/smbios.rs index 6f1139888b..a59c39fef3 100644 --- a/arch/src/x86_64/smbios.rs +++ b/arch/src/x86_64/smbios.rs @@ -35,6 +35,9 @@ pub enum Error { /// Failure to parse uuid, uuid format may be error #[error("Failure to parse uuid: {1}")] ParseUuid(#[source] uuid::Error, String), + /// SMBIOS string index overflow (u8 limit reached). + #[error("SMBIOS string index overflow (u8 limit reached)")] + TooManyStrings, } pub type Result = result::Result; @@ -44,9 +47,39 @@ const SM3_MAGIC_IDENT: &[u8; 5usize] = b"_SM3_"; const BIOS_INFORMATION: u8 = 0; const SYSTEM_INFORMATION: u8 = 1; const OEM_STRINGS: u8 = 11; +const SYSTEM_ENCLOSURE: u8 = 3; const END_OF_TABLE: u8 = 127; const PCI_SUPPORTED: u64 = 1 << 7; const IS_VIRTUAL_MACHINE: u8 = 1 << 4; +pub const DEFAULT_SYSTEM_MANUFACTURER: &str = "Cloud Hypervisor"; +pub const DEFAULT_SYSTEM_PRODUCT_NAME: &str = "cloud-hypervisor"; + +#[derive(Clone, Debug, Default, PartialEq, Eq)] +pub struct SmbiosConfig { + pub system: Option, + pub chassis: Option, + pub oem_strings: Vec, +} + +#[derive(Clone, Debug, Default, PartialEq, Eq)] +pub struct SmbiosSystem { + pub manufacturer: Option, + pub product_name: Option, + pub version: Option, + pub serial_number: Option, + pub uuid: Option, + pub sku_number: Option, + pub family: Option, +} + +#[derive(Clone, Debug, Default, PartialEq, Eq)] +pub struct SmbiosChassisConfig { + pub manufacturer: Option, + pub chassis_type: Option, + pub version: Option, + pub serial_number: Option, + pub asset_tag: Option, +} fn compute_checksum(v: &T) -> u8 { let v: *const T = v; @@ -59,8 +92,7 @@ fn compute_checksum(v: &T) -> u8 { (!checksum).wrapping_add(1) } -#[repr(C)] -#[repr(packed)] +#[repr(C, packed)] #[derive(Default, Copy, Clone)] struct Smbios30Entrypoint { signature: [u8; 5usize], @@ -75,8 +107,7 @@ struct Smbios30Entrypoint { physptr: u64, } -#[repr(C)] -#[repr(packed)] +#[repr(C, packed)] #[derive(Default, Copy, Clone)] struct 
SmbiosBiosInfo { r#type: u8, @@ -92,8 +123,7 @@ struct SmbiosBiosInfo { characteristics_ext2: u8, } -#[repr(C)] -#[repr(packed)] +#[repr(C, packed)] #[derive(Default, Copy, Clone)] struct SmbiosSysInfo { r#type: u8, @@ -109,8 +139,7 @@ struct SmbiosSysInfo { family: u8, } -#[repr(C)] -#[repr(packed)] +#[repr(C, packed)] #[derive(Default, Copy, Clone)] struct SmbiosOemStrings { r#type: u8, @@ -119,8 +148,34 @@ struct SmbiosOemStrings { count: u8, } -#[repr(C)] -#[repr(packed)] +/// SMBIOS Chassis Table (Type 3) as defined in DMTF SMBIOS 3.9.0: +/// https://www.dmtf.org/sites/default/files/standards/documents/DSP0134_3.9.0.pdf +/// Note: trailing fields are omitted, so this structure is not complete. +#[repr(C, packed)] +#[derive(Default, Copy, Clone)] +struct SmbiosChassis { + r#type: u8, + length: u8, + handle: u16, + manufacturer: u8, + chassis_type: u8, + version: u8, + serial_number: u8, + asset_tag: u8, + bootup_state: u8, + power_supply_state: u8, + thermal_state: u8, + security_status: u8, + oem_defined: u32, + height: u8, + number_of_power_cords: u8, + contained_element_count: u8, + contained_element_record_length: u8, + // followed by contained element records (optional, variable-length) + // followed by sku_number: u8, rack_type: u8, rack_height: u8 +} + +#[repr(C, packed)] #[derive(Default, Copy, Clone)] struct SmbiosEndOfTable { r#type: u8, @@ -137,6 +192,8 @@ unsafe impl ByteValued for SmbiosSysInfo {} // SAFETY: data structure only contain a series of integers unsafe impl ByteValued for SmbiosOemStrings {} // SAFETY: data structure only contain a series of integers +unsafe impl ByteValued for SmbiosChassis {} +// SAFETY: data structure only contain a series of integers unsafe impl ByteValued for SmbiosEndOfTable {} fn write_and_incr( @@ -163,12 +220,143 @@ fn write_string( Ok(curptr) } -pub fn setup_smbios( +fn write_opt_string( + mem: &GuestMemoryMmap, + s: Option<&str>, + cur: GuestAddress, +) -> Result { + if let Some(v) = s { + write_string(mem, v, 
cur) + } else { + Ok(cur) + } +} + +fn write_string_terminator( + mem: &GuestMemoryMmap, + cur: GuestAddress, + has_strings: bool, +) -> Result { + // SMBIOS DSP0134 §6.1.3: if all string-reference fields are 0, follow the + // formatted section with two null bytes (empty string-set). + if has_strings { + write_and_incr(mem, 0u8, cur) + } else { + let cur = write_and_incr(mem, 0u8, cur)?; + write_and_incr(mem, 0u8, cur) + } +} + +fn alloc_index(next: &mut u8, present: bool) -> Result { + if !present { + return Ok(0); + } + + let idx = *next; + if idx == 0 { + // wrapped around, next starts always initially at 1 + return Err(Error::TooManyStrings); + } + + *next = next.wrapping_add(1); + Ok(idx) +} + +fn write_type1_system( mem: &GuestMemoryMmap, - serial_number: Option<&str>, - uuid: Option<&str>, - oem_strings: Option<&[&str]>, -) -> Result { + curptr: &mut GuestAddress, + handle: &mut u16, + system: Option<&SmbiosSystem>, +) -> Result<()> { + *handle += 1; + + let manufacturer = system + .and_then(|s| s.manufacturer.as_deref()) + .unwrap_or(DEFAULT_SYSTEM_MANUFACTURER); + let product = system + .and_then(|s| s.product_name.as_deref()) + .unwrap_or(DEFAULT_SYSTEM_PRODUCT_NAME); + let version = system.and_then(|s| s.version.as_deref()); + let serial = system.and_then(|s| s.serial_number.as_deref()); + let uuid = system.and_then(|s| s.uuid.as_deref()); + let sku = system.and_then(|s| s.sku_number.as_deref()); + let family = system.and_then(|s| s.family.as_deref()); + + let uuid_number = uuid + .map(Uuid::parse_str) + .transpose() + .map_err(|e| Error::ParseUuid(e, uuid.unwrap().to_string()))? 
+ .unwrap_or(Uuid::nil()); + + let mut next = 1u8; + let manufacturer_idx = alloc_index(&mut next, true)?; + let product_idx = alloc_index(&mut next, true)?; + let version_idx = alloc_index(&mut next, version.is_some())?; + let serial_idx = alloc_index(&mut next, serial.is_some())?; + let sku_idx = alloc_index(&mut next, sku.is_some())?; + let family_idx = alloc_index(&mut next, family.is_some())?; + + let sys = SmbiosSysInfo { + r#type: SYSTEM_INFORMATION, + length: mem::size_of::() as u8, + handle: *handle, + manufacturer: manufacturer_idx, + product_name: product_idx, + version: version_idx, + serial_number: serial_idx, + uuid: uuid_number.to_bytes_le(), + sku: sku_idx, + family: family_idx, + ..Default::default() + }; + + *curptr = write_and_incr(mem, sys, *curptr)?; + *curptr = write_string(mem, manufacturer, *curptr)?; + *curptr = write_string(mem, product, *curptr)?; + *curptr = write_opt_string(mem, version, *curptr)?; + *curptr = write_opt_string(mem, serial, *curptr)?; + *curptr = write_opt_string(mem, sku, *curptr)?; + *curptr = write_opt_string(mem, family, *curptr)?; + *curptr = write_and_incr(mem, 0u8, *curptr)?; + Ok(()) +} + +fn write_type3_chassis( + mem: &GuestMemoryMmap, + curptr: &mut GuestAddress, + handle: &mut u16, + chassis: &SmbiosChassisConfig, +) -> Result<()> { + *handle += 1; + + let asset_tag = chassis.asset_tag.as_deref(); + let mut next = 1u8; + let asset_idx = alloc_index(&mut next, asset_tag.is_some())?; + + let ch = SmbiosChassis { + r#type: SYSTEM_ENCLOSURE, + length: mem::size_of::() as u8, + handle: *handle, + manufacturer: 0, + chassis_type: 0, + version: 0, + serial_number: 0, + asset_tag: asset_idx, + contained_element_count: 0, + contained_element_record_length: 0, + ..Default::default() + }; + + *curptr = write_and_incr(mem, ch, *curptr)?; + *curptr = write_opt_string(mem, asset_tag, *curptr)?; + *curptr = write_string_terminator(mem, *curptr, asset_tag.is_some())?; + Ok(()) +} + +pub fn setup_smbios(mem: &GuestMemoryMmap, 
smbios: Option<&SmbiosConfig>) -> Result { + let system = smbios.and_then(|cfg| cfg.system.as_ref()); + let chassis = smbios.and_then(|cfg| cfg.chassis.as_ref()); + let oem_strings: &[String] = smbios.map_or(&[] as &[String], |cfg| cfg.oem_strings.as_slice()); let physptr = GuestAddress(SMBIOS_START) .checked_add(mem::size_of::() as u64) .ok_or(Error::NotEnoughMemory)?; @@ -193,34 +381,13 @@ pub fn setup_smbios( curptr = write_and_incr(mem, 0u8, curptr)?; } - { - handle += 1; + write_type1_system(mem, &mut curptr, &mut handle, system)?; - let uuid_number = uuid - .map(Uuid::parse_str) - .transpose() - .map_err(|e| Error::ParseUuid(e, uuid.unwrap().to_string()))? - .unwrap_or(Uuid::nil()); - let smbios_sysinfo = SmbiosSysInfo { - r#type: SYSTEM_INFORMATION, - length: mem::size_of::() as u8, - handle, - manufacturer: 1, // First string written in this section - product_name: 2, // Second string written in this section - serial_number: serial_number.map(|_| 3).unwrap_or_default(), // 3rd string - uuid: uuid_number.to_bytes_le(), // set uuid - ..Default::default() - }; - curptr = write_and_incr(mem, smbios_sysinfo, curptr)?; - curptr = write_string(mem, "Cloud Hypervisor", curptr)?; - curptr = write_string(mem, "cloud-hypervisor", curptr)?; - if let Some(serial_number) = serial_number { - curptr = write_string(mem, serial_number, curptr)?; - } - curptr = write_and_incr(mem, 0u8, curptr)?; + if let Some(chassis) = chassis { + write_type3_chassis(mem, &mut curptr, &mut handle, chassis)?; } - if let Some(oem_strings) = oem_strings { + if !oem_strings.is_empty() { handle += 1; let smbios_oemstrings = SmbiosOemStrings { @@ -236,7 +403,7 @@ pub fn setup_smbios( curptr = write_string(mem, s, curptr)?; } - curptr = write_and_incr(mem, 0u8, curptr)?; + curptr = write_string_terminator(mem, curptr, true)?; } { @@ -276,8 +443,55 @@ pub fn setup_smbios( mod unit_tests { use super::*; + /// Collects all strings after a SMBIOS structure, stopping at the double-NUL terminator and 
returns next addr. + fn read_string_set(mem: &GuestMemoryMmap, addr: GuestAddress) -> (Vec, GuestAddress) { + let mut cur = addr; + let read_byte = |addr: GuestAddress| -> u8 { mem.read_obj(addr).unwrap() }; + + // SMBIOS string-set: NUL-terminated strings, terminated by an extra NUL. + // Empty string-set is exactly "\0\0". + if read_byte(cur) == 0 { + let next = cur.checked_add(1).unwrap(); + assert_eq!(read_byte(next), 0); + return (Vec::new(), next.checked_add(1).unwrap()); + } + + let mut strings = Vec::new(); + loop { + let mut bytes = Vec::new(); + loop { + let b = read_byte(cur); + cur = cur.checked_add(1).unwrap(); + if b == 0 { + break; + } + bytes.push(b); + } + strings.push(String::from_utf8(bytes).unwrap()); + + // If the next byte is NUL, that's the extra terminator. + if read_byte(cur) == 0 { + cur = cur.checked_add(1).unwrap(); + break; + } + } + + (strings, cur) + } + #[test] - fn struct_size() { + fn entrypoint_checksum() { + let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(SMBIOS_START), 4096)]).unwrap(); + + setup_smbios(&mem, None).unwrap(); + + let smbios_ep: Smbios30Entrypoint = mem.read_obj(GuestAddress(SMBIOS_START)).unwrap(); + + assert_eq!(compute_checksum(&smbios_ep), 0); + } + + #[test] + fn entrypoint_struct_size() { assert_eq!( mem::size_of::(), 0x18usize, @@ -296,13 +510,185 @@ mod unit_tests { } #[test] - fn entrypoint_checksum() { + fn smbios_chassis_empty_string_set_has_double_null() { let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(SMBIOS_START), 4096)]).unwrap(); + let smbios = SmbiosConfig { + chassis: Some(SmbiosChassisConfig::default()), + ..Default::default() + }; - setup_smbios(&mem, None, None, None).unwrap(); + setup_smbios(&mem, Some(&smbios)).unwrap(); let smbios_ep: Smbios30Entrypoint = mem.read_obj(GuestAddress(SMBIOS_START)).unwrap(); + let mut cur = GuestAddress(smbios_ep.physptr); + + let bios: SmbiosBiosInfo = mem.read_obj(cur).unwrap(); + cur = cur.checked_add(bios.length as u64).unwrap(); + let (_, 
next) = read_string_set(&mem, cur); + cur = next; + + let sys: SmbiosSysInfo = mem.read_obj(cur).unwrap(); + cur = cur.checked_add(sys.length as u64).unwrap(); + let (_, next) = read_string_set(&mem, cur); + cur = next; + + let chassis: SmbiosChassis = mem.read_obj(cur).unwrap(); + cur = cur.checked_add(chassis.length as u64).unwrap(); + // SMBIOS DSP0134 §6.1.3: empty string-set ends with double NUL. + let b0: u8 = mem.read_obj(cur).unwrap(); + let b1: u8 = mem.read_obj(cur.checked_add(1).unwrap()).unwrap(); + assert_eq!(b0, 0); + assert_eq!(b1, 0); + cur = cur.checked_add(2).unwrap(); + + let end: SmbiosEndOfTable = mem.read_obj(cur).unwrap(); + assert_eq!(end.r#type, END_OF_TABLE); + } - assert_eq!(compute_checksum(&smbios_ep), 0); + #[test] + fn smbios_chassis_oem_strings_layout() { + let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(SMBIOS_START), 4096)]).unwrap(); + + let smbios = SmbiosConfig { + chassis: Some(SmbiosChassisConfig { + asset_tag: Some("rack1".to_string()), + ..Default::default() + }), + oem_strings: vec!["o1".to_string(), "o2".to_string()], + ..Default::default() + }; + + setup_smbios(&mem, Some(&smbios)).unwrap(); + + let smbios_ep: Smbios30Entrypoint = mem.read_obj(GuestAddress(SMBIOS_START)).unwrap(); + let mut cur = GuestAddress(smbios_ep.physptr); + + let bios: SmbiosBiosInfo = mem.read_obj(cur).unwrap(); + cur = cur.checked_add(bios.length as u64).unwrap(); + let (_, next) = read_string_set(&mem, cur); + cur = next; + + let sys: SmbiosSysInfo = mem.read_obj(cur).unwrap(); + cur = cur.checked_add(sys.length as u64).unwrap(); + let (_, next) = read_string_set(&mem, cur); + cur = next; + + let chassis: SmbiosChassis = mem.read_obj(cur).unwrap(); + assert_eq!(chassis.r#type, SYSTEM_ENCLOSURE); + assert_eq!(chassis.asset_tag, 1); + cur = cur.checked_add(chassis.length as u64).unwrap(); + let (chassis_strings, next) = read_string_set(&mem, cur); + assert_eq!(chassis_strings, vec!["rack1"]); + cur = next; + + let oem: SmbiosOemStrings = 
mem.read_obj(cur).unwrap(); + assert_eq!(oem.r#type, OEM_STRINGS); + assert_eq!(oem.count, 2); + cur = cur.checked_add(oem.length as u64).unwrap(); + let (oem_strings, next) = read_string_set(&mem, cur); + assert_eq!(oem_strings, vec!["o1", "o2"]); + cur = next; + + let end: SmbiosEndOfTable = mem.read_obj(cur).unwrap(); + assert_eq!(end.r#type, END_OF_TABLE); + } + + #[test] + fn smbios_strings_terminators_default() { + let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(SMBIOS_START), 4096)]).unwrap(); + + setup_smbios(&mem, None).unwrap(); + + let smbios_ep: Smbios30Entrypoint = mem.read_obj(GuestAddress(SMBIOS_START)).unwrap(); + let mut cur = GuestAddress(smbios_ep.physptr); + + let bios: SmbiosBiosInfo = mem.read_obj(cur).unwrap(); + assert_eq!(bios.r#type, BIOS_INFORMATION); + cur = cur.checked_add(bios.length as u64).unwrap(); + let (bios_strings, next) = read_string_set(&mem, cur); + assert_eq!(bios_strings, vec!["cloud-hypervisor", "0"]); + cur = next; + + let sys: SmbiosSysInfo = mem.read_obj(cur).unwrap(); + assert_eq!(sys.r#type, SYSTEM_INFORMATION); + assert_eq!(sys.manufacturer, 1); + assert_eq!(sys.product_name, 2); + assert_eq!(sys.version, 0); + assert_eq!(sys.serial_number, 0); + assert_eq!(sys.sku, 0); + assert_eq!(sys.family, 0); + cur = cur.checked_add(sys.length as u64).unwrap(); + let (sys_strings, next) = read_string_set(&mem, cur); + assert_eq!( + sys_strings, + vec![DEFAULT_SYSTEM_MANUFACTURER, DEFAULT_SYSTEM_PRODUCT_NAME] + ); + cur = next; + + let end: SmbiosEndOfTable = mem.read_obj(cur).unwrap(); + assert_eq!(end.r#type, END_OF_TABLE); + } + + #[test] + fn smbios_strings_too_many() { + let mut next = 1u8; + for _ in 0..255 { + alloc_index(&mut next, true).unwrap(); + } + let err = alloc_index(&mut next, true).unwrap_err(); + assert!(matches!(err, Error::TooManyStrings)); + } + + #[test] + fn smbios_uuid_invalid_rejected() { + let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(SMBIOS_START), 4096)]).unwrap(); + let smbios = 
SmbiosConfig { + system: Some(SmbiosSystem { + uuid: Some("not-a-uuid".to_string()), + ..Default::default() + }), + ..Default::default() + }; + + let err = setup_smbios(&mem, Some(&smbios)).unwrap_err(); + assert!(matches!(err, Error::ParseUuid(_, _))); + } + + #[test] + fn smbios_uuid_written_le() { + let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(SMBIOS_START), 4096)]).unwrap(); + let uuid_str = "00112233-4455-6677-8899-aabbccddeeff"; + let smbios = SmbiosConfig { + system: Some(SmbiosSystem { + uuid: Some(uuid_str.to_string()), + ..Default::default() + }), + ..Default::default() + }; + + setup_smbios(&mem, Some(&smbios)).unwrap(); + + let smbios_ep: Smbios30Entrypoint = mem.read_obj(GuestAddress(SMBIOS_START)).unwrap(); + let mut cur = GuestAddress(smbios_ep.physptr); + + let bios: SmbiosBiosInfo = mem.read_obj(cur).unwrap(); + cur = cur.checked_add(bios.length as u64).unwrap(); + let (_, next) = read_string_set(&mem, cur); + cur = next; + + let sys: SmbiosSysInfo = mem.read_obj(cur).unwrap(); + assert_eq!(sys.uuid, Uuid::parse_str(uuid_str).unwrap().to_bytes_le()); + } + + #[test] + fn smbios_write_fails_with_too_small_memory() { + let mem = GuestMemoryMmap::from_ranges(&[( + GuestAddress(SMBIOS_START), + mem::size_of::(), + )]) + .unwrap(); + + let err = setup_smbios(&mem, None).unwrap_err(); + assert!(matches!(err, Error::WriteData)); } } diff --git a/chv.nix b/chv.nix new file mode 100644 index 0000000000..83b331ed8b --- /dev/null +++ b/chv.nix @@ -0,0 +1,69 @@ +# Builds Cloud Hypervisor with using crane. +# +# Uses a pragmatic release profile with debug-ability and faster +# compilation times in mind without sacrificing too much performance. + +{ + # helper from nixpkgs + lib, + openssl, + pkg-config, + # other helper + craneLib, + # other + meta, # meta of pkgs.cloud-hypervisor + src, # clean source + chExtraVersion, # Additional information to be appended to the version string. 
+}: +let + commonArgs = { + inherit meta src; + # Since Nov 2025 (v50), Cloud Hypervisor has a virtual manifest and the + # main package was moved into a sub directory. + cargoToml = "${src}/cloud-hypervisor/Cargo.toml"; + + # Pragmatic release profile with debug-ability and faster + # compilation times in mind. + env = { + CARGO_PROFILE_RELEASE_DEBUG_ASSERTIONS = "true"; + CARGO_PROFILE_RELEASE_OPT_LEVEL = 2; + CARGO_PROFILE_RELEASE_OVERFLOW_CHECKS = "true"; + CARGO_PROFILE_RELEASE_LTO = "thin"; + + # Fix build. Reference: + # - https://github.com/sfackler/rust-openssl/issues/1430 + # - https://docs.rs/openssl/latest/openssl/ + OPENSSL_NO_VENDOR = true; + + # Sets additional information to be appended to the version string. + CH_EXTRA_VERSION = chExtraVersion; + }; + + nativeBuildInputs = [ + pkg-config + ]; + buildInputs = [ + openssl + ]; + }; + + # Downloaded and compiled dependencies. + cargoArtifacts = craneLib.buildDepsOnly ( + commonArgs + // { + doCheck = false; + } + ); + + cargoPackageKvm = craneLib.buildPackage ( + commonArgs + // { + inherit cargoArtifacts; + # Don't execute tests here. Too expensive for local development with + # frequent rebuilds + little benefit. 
+ doCheck = false; + cargoExtraArgs = "--features kvm"; + } + ); +in +cargoPackageKvm diff --git a/cloud-hypervisor/Cargo.toml b/cloud-hypervisor/Cargo.toml index 39bcd3ff47..be61df65f6 100644 --- a/cloud-hypervisor/Cargo.toml +++ b/cloud-hypervisor/Cargo.toml @@ -30,6 +30,7 @@ thiserror = { workspace = true } tpm = { path = "../tpm" } tracer = { path = "../tracer" } vm-memory = { workspace = true } +vm-migration = { path = "../vm-migration" } vmm = { path = "../vmm" } vmm-sys-util = { workspace = true } zbus = { version = "5.15.0", optional = true } diff --git a/cloud-hypervisor/src/bin/ch-remote.rs b/cloud-hypervisor/src/bin/ch-remote.rs index 236e7438e0..dd2eefb79e 100644 --- a/cloud-hypervisor/src/bin/ch-remote.rs +++ b/cloud-hypervisor/src/bin/ch-remote.rs @@ -11,17 +11,20 @@ use std::io::Read; use std::marker::PhantomData; use std::os::unix::net::UnixStream; use std::process; +use std::thread::sleep; +use std::time::Duration; use api_client::{ - Error as ApiClientError, simple_api_command, simple_api_command_with_fds, - simple_api_full_command, + Error as ApiClientError, StatusCode, simple_api_command, simple_api_command_with_fds, + simple_api_full_command, simple_api_full_command_and_response, }; #[cfg(feature = "dbus_api")] use clap::ArgAction; use clap::{Arg, ArgMatches, Command}; -use log::error; +use log::{error, info}; use option_parser::{ByteSized, ByteSizedParseError}; use thiserror::Error; +use vm_migration::progress::{MigrationProgress, MigrationState}; use vmm::config::RestoreConfig; use vmm::vm_config::{ DeviceConfig, DiskConfig, FsConfig, GenericVhostUserConfig, NetConfig, PmemConfig, @@ -71,6 +74,8 @@ enum Error { ReadingFile(#[source] std::io::Error), #[error("Invalid disk size")] InvalidDiskSize(#[source] ByteSizedParseError), + #[error("Error parsing receive migration configuration")] + ReceiveMigrationConfig(#[from] vmm::api::VmReceiveMigrationConfigError), #[error("Error parsing send migration configuration")] SendMigrationConfig(#[from] 
vmm::api::VmSendMigrationConfigError), } @@ -105,6 +110,7 @@ trait DBusApi1 { fn vm_delete(&self) -> zbus::Result<()>; fn vm_info(&self) -> zbus::Result; fn vm_pause(&self) -> zbus::Result<()>; + fn vm_post_migration_announce(&self) -> zbus::Result<()>; fn vm_power_button(&self) -> zbus::Result<()>; fn vm_reboot(&self) -> zbus::Result<()>; fn vm_remove_device(&self, vm_remove_device: &str) -> zbus::Result<()>; @@ -220,6 +226,11 @@ impl<'a> DBusApi1ProxyBlocking<'a> { self.vm_pause().map_err(Error::DBusApiClient) } + fn api_vm_post_migration_announce(&self) -> ApiResult { + self.vm_post_migration_announce() + .map_err(Error::DBusApiClient) + } + fn api_vm_power_button(&self) -> ApiResult { self.vm_power_button().map_err(Error::DBusApiClient) } @@ -294,6 +305,10 @@ fn rest_api_do_command(matches: &ArgMatches, socket: &mut UnixStream) -> ApiResu Some("resume") => { simple_api_command(socket, "PUT", "resume", None).map_err(Error::HttpApiClient) } + Some("post-migration-announce") => { + simple_api_command(socket, "PUT", "post-migration-announce", None) + .map_err(Error::HttpApiClient) + } Some("power-button") => { simple_api_command(socket, "PUT", "power-button", None).map_err(Error::HttpApiClient) } @@ -315,6 +330,8 @@ fn rest_api_do_command(matches: &ArgMatches, socket: &mut UnixStream) -> ApiResu Some("shutdown") => { simple_api_command(socket, "PUT", "shutdown", None).map_err(Error::HttpApiClient) } + Some("migration-progress") => simple_api_command(socket, "GET", "migration-progress", None) + .map_err(Error::HttpApiClient), Some("nmi") => simple_api_command(socket, "PUT", "nmi", None).map_err(Error::HttpApiClient), Some("resize") => { let resize = resize_config( @@ -517,6 +534,14 @@ fn rest_api_do_command(matches: &ArgMatches, socket: &mut UnixStream) -> ApiResu .map_err(Error::HttpApiClient) } Some("send-migration") => { + let just_dispatch = matches + .subcommand_matches("send-migration") + .unwrap() + .get_one::("dispatch") + .cloned() + .unwrap_or(false); + 
let wait_for_migration = !just_dispatch; + let send_migration_data = send_migration_data( matches .subcommand_matches("send-migration") @@ -525,7 +550,65 @@ fn rest_api_do_command(matches: &ArgMatches, socket: &mut UnixStream) -> ApiResu .unwrap(), )?; simple_api_command(socket, "PUT", "send-migration", Some(&send_migration_data)) - .map_err(Error::HttpApiClient) + .map_err(Error::HttpApiClient)?; + + if !wait_for_migration { + return Ok(()); + } + loop { + let response = simple_api_full_command_and_response( + socket, + "GET", + "vm.migration-progress", + None, + ) + .map_err(Error::HttpApiClient)? + // should have response + .ok_or(Error::HttpApiClient(ApiClientError::ServerResponse( + StatusCode::Ok, + None, + )))?; + + // This is guaranteed by the SendMigration call + assert_ne!( + response, "null", + "migration progress should be there immediately when the migration was dispatched" + ); + + let progress = serde_json::from_slice::(response.as_bytes()) + .map_err(|e| { + error!("failed to parse response as MigrationProgress: {e}"); + Error::HttpApiClient(ApiClientError::ServerResponse( + StatusCode::Ok, + Some(response), + )) + })?; + + match progress.state { + MigrationState::Cancelled { .. } => { + info!("Migration was cancelled"); + break; + } + MigrationState::Failed { + error_msg, + error_msg_debug, + } => { + error!("Migration failed! {error_msg}\n{error_msg_debug}"); + break; + } + MigrationState::Finished { .. } => { + info!("Migration finished successfully. Shutting down Cloud Hypervisor"); + simple_api_full_command(socket, "PUT", "vmm.shutdown", None) + .map_err(Error::HttpApiClient)?; + break; + } + MigrationState::Ongoing { .. 
} => { + sleep(Duration::from_millis(50)); + continue; + } + } + } + Ok(()) } Some("receive-migration") => { let receive_migration_data = receive_migration_data( @@ -534,7 +617,7 @@ fn rest_api_do_command(matches: &ArgMatches, socket: &mut UnixStream) -> ApiResu .unwrap() .get_one::("receive_migration_config") .unwrap(), - ); + )?; simple_api_command( socket, "PUT", @@ -553,6 +636,8 @@ fn rest_api_do_command(matches: &ArgMatches, socket: &mut UnixStream) -> ApiResu )?; simple_api_command(socket, "PUT", "create", Some(&data)).map_err(Error::HttpApiClient) } + Some("cancel-migration") => simple_api_command(socket, "PUT", "cancel-migration", None) + .map_err(Error::HttpApiClient), _ => unreachable!(), } } @@ -564,6 +649,7 @@ fn dbus_api_do_command(matches: &ArgMatches, proxy: &DBusApi1ProxyBlocking<'_>) Some("delete") => proxy.api_vm_delete(), Some("shutdown-vmm") => proxy.api_vmm_shutdown(), Some("resume") => proxy.api_vm_resume(), + Some("post-migration-announce") => proxy.api_vm_post_migration_announce(), Some("power-button") => proxy.api_vm_power_button(), Some("reboot") => proxy.api_vm_reboot(), Some("pause") => proxy.api_vm_pause(), @@ -753,7 +839,7 @@ fn dbus_api_do_command(matches: &ArgMatches, proxy: &DBusApi1ProxyBlocking<'_>) .unwrap() .get_one::("receive_migration_config") .unwrap(), - ); + )?; proxy.api_vm_receive_migration(&receive_migration_data) } Some("create") => { @@ -941,12 +1027,10 @@ fn coredump_config(destination_url: &str) -> String { serde_json::to_string(&coredump_config).unwrap() } -fn receive_migration_data(url: &str) -> String { - let receive_migration_data = vmm::api::VmReceiveMigrationData { - receiver_url: url.to_owned(), - }; - - serde_json::to_string(&receive_migration_data).unwrap() +fn receive_migration_data(config: &str) -> Result { + let receive_migration_data = + vmm::api::VmReceiveMigrationData::parse(config).map_err(Error::ReceiveMigrationConfig)?; + Ok(serde_json::to_string(&receive_migration_data).unwrap()) } fn 
send_migration_data(config: &str) -> Result { @@ -1050,6 +1134,7 @@ fn get_cli_commands_sorted() -> Box<[Command]> { .about("Add vsock device") .arg(Arg::new("vsock_config").index(1).help(VsockConfig::SYNTAX)), Command::new("boot").about("Boot a created VM"), + Command::new("cancel-migration").about("Cancel any ongoing migration"), Command::new("coredump") .about("Create a coredump from VM") .arg(Arg::new("coredump_config").index(1).help("")), @@ -1059,9 +1144,11 @@ fn get_cli_commands_sorted() -> Box<[Command]> { .arg(Arg::new("path").index(1).default_value("-")), Command::new("delete").about("Delete a VM"), Command::new("info").about("Info on the VM"), + Command::new("migration-progress"), Command::new("nmi").about("Trigger NMI"), Command::new("pause").about("Pause the VM"), Command::new("ping").about("Ping the VMM to check for API server availability"), + Command::new("post-migration-announce").about("Trigger post-migration announcements"), Command::new("power-button").about("Trigger a power button in the VM"), Command::new("reboot").about("Reboot the VM"), Command::new("receive-migration") @@ -1069,7 +1156,7 @@ fn get_cli_commands_sorted() -> Box<[Command]> { .arg( Arg::new("receive_migration_config") .index(1) - .help(""), + .help(vmm::api::VmReceiveMigrationData::SYNTAX), ), Command::new("remove-device") .about("Remove VFIO and PCI device") @@ -1132,6 +1219,12 @@ fn get_cli_commands_sorted() -> Box<[Command]> { Command::new("resume").about("Resume the VM"), Command::new("send-migration") .about("Initiate a VM migration") + .arg( + Arg::new("dispatch") + .long("dispatch") + .help("just dispatch the migration without waiting for it to finish") + .num_args(0), + ) .arg( Arg::new("send_migration_config") .index(1) diff --git a/cloud-hypervisor/src/logger.rs b/cloud-hypervisor/src/logger.rs index 2245256c7b..6c0b40b375 100644 --- a/cloud-hypervisor/src/logger.rs +++ b/cloud-hypervisor/src/logger.rs @@ -98,7 +98,7 @@ fn parse_format(fmt: &str) -> Result, Error> { } 
pub const DEFAULT_FORMAT: &str = - "cloud-hypervisor: {boottime}s: <{thread}> {level}:{location} -- {msg}"; + "cloud-hypervisor: {wallclock}: <{thread}> {level}:{location} -- {msg}"; pub struct Logger { output: Mutex>, @@ -135,9 +135,7 @@ impl log::Log for Logger { Token::Literal(s) => out.write_all(s.as_bytes()), // 10: 6 decimal places + sep => whole seconds in range `0..=999` properly aligned Token::BootTime => write!(&mut *out, "{duration_s:>10.6?}"), - Token::WallClock => { - write!(out, "{:.6}", jiff::Timestamp::now()) - } + Token::WallClock => write!(out, "{:.6}", jiff::Zoned::now()), Token::Pid => write!(&mut *out, "{}", self.pid), // SAFETY: gettid(2) always succeeds Token::Tid => write!(&mut *out, "{}", unsafe { libc::gettid() }), @@ -237,7 +235,7 @@ mod tests { fn parse_default_format_succeeds() { let tokens = parse_format(DEFAULT_FORMAT).unwrap(); // Default format has 5 tokens interleaved with literals. - assert!(tokens.iter().any(|t| matches!(t, Token::BootTime))); + assert!(tokens.iter().any(|t| matches!(t, Token::WallClock))); assert!(tokens.iter().any(|t| matches!(t, Token::Thread))); assert!(tokens.iter().any(|t| matches!(t, Token::Level))); assert!(tokens.iter().any(|t| matches!(t, Token::Location))); @@ -367,14 +365,13 @@ mod tests { let out = buf.contents(); let out = out.trim(); - assert_eq!(out.len(), 27, "got: {out}"); + assert_eq!(out.len(), 40, "got: {out}"); assert_eq!(&out[4..5], "-", "got: {out}"); assert_eq!(&out[7..8], "-", "got: {out}"); assert_eq!(&out[10..11], "T", "got: {out}"); assert_eq!(&out[13..14], ":", "got: {out}"); assert_eq!(&out[16..17], ":", "got: {out}"); assert_eq!(&out[19..20], ".", "got: {out}"); - assert!(out.ends_with('Z'), "got: {out}"); } #[test] diff --git a/cloud-hypervisor/src/main.rs b/cloud-hypervisor/src/main.rs index 5de2e4c2df..46d97d912d 100644 --- a/cloud-hypervisor/src/main.rs +++ b/cloud-hypervisor/src/main.rs @@ -685,6 +685,11 @@ fn start_vmm( .map_err(Error::EventMonitorThread)?; } + info!( + 
"Cloud Hypervisor starting: build version: {}, date: {}", + env!("BUILD_VERSION"), + jiff::Zoned::now().strftime("%Y-%m-%dT%H:%M:%S%.f%:z") + ); event!("vmm", "starting"); let vmm_thread_handle = vmm::start_vmm_thread( @@ -975,6 +980,7 @@ mod unit_tests { features: CpuFeatures::default(), nested: true, core_scheduling: CoreScheduling::Vm, + profile: Default::default(), }, memory: MemoryConfig { size: 536_870_912, @@ -1017,6 +1023,7 @@ mod unit_tests { file: None, mode: ConsoleOutputMode::Null, socket: None, + url: None, }, }, console: ConsoleConfig { @@ -1024,6 +1031,7 @@ mod unit_tests { file: None, mode: ConsoleOutputMode::Tty, socket: None, + url: None, }, pci_common: PciDeviceCommonConfig::default(), }, diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index 693702b552..27e4c1f272 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -2219,7 +2219,7 @@ pub(crate) fn _test_dmi_serial_number(guest: &Guest) { let mut child = GuestCommand::new(guest) .default_cpus() .default_memory() - .default_kernel_cmdline_with_platform(Some("serial_number=a=b;c=d")) + .default_kernel_cmdline_with_platform(Some("system_serial_number=a=b;c=d")) .default_disks() .default_net() .capture_output() @@ -2248,7 +2248,9 @@ pub(crate) fn _test_dmi_uuid(guest: &Guest) { let mut child = GuestCommand::new(guest) .default_cpus() .default_memory() - .default_kernel_cmdline_with_platform(Some("uuid=1e8aa28a-435d-4027-87f4-40dceff1fa0a")) + .default_kernel_cmdline_with_platform(Some( + "system_uuid=1e8aa28a-435d-4027-87f4-40dceff1fa0a", + )) .default_disks() .default_net() .capture_output() diff --git a/devices/src/acpi.rs b/devices/src/acpi.rs index a9c86aa18e..49f166655d 100644 --- a/devices/src/acpi.rs +++ b/devices/src/acpi.rs @@ -24,6 +24,7 @@ pub struct AcpiShutdownDevice { guest_exit_evt: EventFd, reset_evt: EventFd, vcpus_kill_signalled: Arc, + 
vcpus_pause_signalled: Arc, } impl AcpiShutdownDevice { @@ -32,11 +33,13 @@ impl AcpiShutdownDevice { guest_exit_evt: EventFd, reset_evt: EventFd, vcpus_kill_signalled: Arc, + vcpus_pause_signalled: Arc, ) -> AcpiShutdownDevice { AcpiShutdownDevice { guest_exit_evt, reset_evt, vcpus_kill_signalled, + vcpus_pause_signalled, } } } @@ -56,7 +59,9 @@ impl BusDevice for AcpiShutdownDevice { } // Spin until we are sure the reset_evt has been handled and that when // we return from the KVM_RUN we will exit rather than re-enter the guest. - while !self.vcpus_kill_signalled.load(Ordering::SeqCst) { + while !self.vcpus_kill_signalled.load(Ordering::SeqCst) + && !self.vcpus_pause_signalled.load(Ordering::SeqCst) + { // This is more effective than thread::yield_now() at // avoiding a priority inversion with the VMM thread thread::sleep(std::time::Duration::from_millis(1)); @@ -73,7 +78,9 @@ impl BusDevice for AcpiShutdownDevice { } // Spin until we are sure the reset_evt has been handled and that when // we return from the KVM_RUN we will exit rather than re-enter the guest. 
- while !self.vcpus_kill_signalled.load(Ordering::SeqCst) { + while !self.vcpus_kill_signalled.load(Ordering::SeqCst) + && !self.vcpus_pause_signalled.load(Ordering::SeqCst) + { // This is more effective than thread::yield_now() at // avoiding a priority inversion with the VMM thread thread::sleep(std::time::Duration::from_millis(1)); diff --git a/devices/src/ioapic.rs b/devices/src/ioapic.rs index ba05c1ed5b..9312ab1156 100644 --- a/devices/src/ioapic.rs +++ b/devices/src/ioapic.rs @@ -172,7 +172,7 @@ impl BusDevice for Ioapic { return None; } - debug!("IOAPIC_W @ offset 0x{offset:x}"); + trace!("IOAPIC_W @ offset 0x{offset:x}"); let value = LittleEndian::read_u32(data); @@ -250,7 +250,7 @@ impl Ioapic { } fn ioapic_write(&mut self, val: u32) { - debug!("IOAPIC_W reg 0x{:x}, val 0x{:x}", self.reg_sel, val); + trace!("IOAPIC_W reg 0x{:x}, val 0x{:x}", self.reg_sel, val); match self.reg_sel as u8 { IOAPIC_REG_VERSION => { diff --git a/devices/src/legacy/cmos.rs b/devices/src/legacy/cmos.rs index 238f2184d6..8f4b44941b 100644 --- a/devices/src/legacy/cmos.rs +++ b/devices/src/legacy/cmos.rs @@ -27,7 +27,8 @@ pub struct Cmos { index: u8, data: [u8; DATA_LEN], reset_evt: EventFd, - vcpus_kill_signalled: Option>, + vcpus_kill_signalled: Arc, + vcpus_pause_signalled: Arc, } impl Cmos { @@ -38,7 +39,8 @@ impl Cmos { mem_below_4g: u64, mem_above_4g: u64, reset_evt: EventFd, - vcpus_kill_signalled: Option>, + vcpus_kill_signalled: Arc, + vcpus_pause_signalled: Arc, ) -> Cmos { let mut data = [0u8; DATA_LEN]; @@ -61,6 +63,7 @@ impl Cmos { data, reset_evt, vcpus_kill_signalled, + vcpus_pause_signalled, } } } @@ -78,14 +81,14 @@ impl BusDevice for Cmos { if self.index == 0x8f && data[0] == 0 { info!("CMOS reset"); self.reset_evt.write(1).unwrap(); - if let Some(vcpus_kill_signalled) = self.vcpus_kill_signalled.take() { - // Spin until we are sure the reset_evt has been handled and that when - // we return from the KVM_RUN we will exit rather than re-enter the guest. 
- while !vcpus_kill_signalled.load(Ordering::SeqCst) { - // This is more effective than thread::yield_now() at - // avoiding a priority inversion with the VMM thread - thread::sleep(std::time::Duration::from_millis(1)); - } + // Spin until we are sure the reset_evt has been handled and that when + // we return from the KVM_RUN we will exit rather than re-enter the guest. + while !self.vcpus_kill_signalled.load(Ordering::SeqCst) + && !self.vcpus_pause_signalled.load(Ordering::SeqCst) + { + // This is more effective than thread::yield_now() at + // avoiding a priority inversion with the VMM thread + thread::sleep(std::time::Duration::from_millis(1)); } } else { self.data[(self.index & INDEX_MASK) as usize] = data[0]; diff --git a/devices/src/legacy/i8042.rs b/devices/src/legacy/i8042.rs index 0e014ab8bf..7639f819e0 100644 --- a/devices/src/legacy/i8042.rs +++ b/devices/src/legacy/i8042.rs @@ -16,14 +16,20 @@ use vmm_sys_util::eventfd::EventFd; pub struct I8042Device { reset_evt: EventFd, vcpus_kill_signalled: Arc, + vcpus_pause_signalled: Arc, } impl I8042Device { /// Constructs a i8042 device that will signal the given event when the guest requests it. - pub fn new(reset_evt: EventFd, vcpus_kill_signalled: Arc) -> I8042Device { + pub fn new( + reset_evt: EventFd, + vcpus_kill_signalled: Arc, + vcpus_pause_signalled: Arc, + ) -> I8042Device { I8042Device { reset_evt, vcpus_kill_signalled, + vcpus_pause_signalled, } } } @@ -50,7 +56,9 @@ impl BusDevice for I8042Device { } // Spin until we are sure the reset_evt has been handled and that when // we return from the KVM_RUN we will exit rather than re-enter the guest. 
- while !self.vcpus_kill_signalled.load(Ordering::SeqCst) { + while !self.vcpus_kill_signalled.load(Ordering::SeqCst) + && !self.vcpus_pause_signalled.load(Ordering::SeqCst) + { // This is more effective than thread::yield_now() at // avoiding a priority inversion with the VMM thread thread::sleep(std::time::Duration::from_millis(1)); diff --git a/docs/api.md b/docs/api.md index cea3f31812..d6f9be6e9e 100644 --- a/docs/api.md +++ b/docs/api.md @@ -71,8 +71,8 @@ The Cloud Hypervisor API exposes the following actions through its endpoints: ##### Virtual Machine (VM) Actions -| Action | Endpoint | Request Body | Response Body | Prerequisites | -| --------------------------------------- | ---------------------------- | --------------------------------- | ------------------------ | ------------------------------------------------------ | +| Action | Endpoint | Request Body | Response Body | Prerequisites | +|-----------------------------------------| ---------------------------- | --------------------------------- | ------------------------ | ------------------------------------------------------ | | Create the VM | `/vm.create` | `/schemas/VmConfig` | N/A | The VM is not created yet | | Delete the VM | `/vm.delete` | N/A | N/A | N/A | | Boot the VM | `/vm.boot` | N/A | N/A | The VM is created but not booted | @@ -81,6 +81,7 @@ The Cloud Hypervisor API exposes the following actions through its endpoints: | Trigger power button of the VM | `/vm.power-button` | N/A | N/A | The VM is booted | | Pause the VM | `/vm.pause` | N/A | N/A | The VM is booted | | Resume the VM | `/vm.resume` | N/A | N/A | The VM is paused | +| Trigger post-migration announce | `/vm.post-migration-announce` | N/A | N/A | The VM is booted and not paused | | Take a snapshot of the VM | `/vm.snapshot` | `/schemas/VmSnapshotConfig` | N/A | The VM is paused | | Perform a coredump of the VM* | `/vm.coredump` | `/schemas/VmCoredumpData` | N/A | The VM is paused | | Restore the VM from a snapshot | 
`/vm.restore` | `/schemas/RestoreConfig` | N/A | The VM is created but not booted | diff --git a/docs/cpu_profile_generation.md b/docs/cpu_profile_generation.md new file mode 100644 index 0000000000..ccd7b47f70 --- /dev/null +++ b/docs/cpu_profile_generation.md @@ -0,0 +1,26 @@ +# CPU Profile Generation + +## Generating a CPU profile for a new target + +To generate a new CPU profile you execute the following command + +```shell +$ cargo run --release -p arch --bin generate-cpu-profile --features="cpu_profile_generation" "" +``` +on the machine you want to create a CPU profile for. This creates four new files in the `arch/src/x86_64/cpu_profiles` directory: +- `.cpuid.json` +- `.msr.json` +- one license file for each of the two files listed above + +Check them in to git and then simply rebuild cloud-hypervisor with `cargo build --release --bin cloud-hypervisor`. + +You can now use the new profile by adding `,profile=` to the list of `--cpus` configuration +options on the command line. + +## Can existing CPU profiles be updated? + +More recent KVM versions may introduce more support for already existing hardware features. When this happens it is of course +tempting to run the CPU profile generation tool again with the new KVM version as we then get a profile supporting more CPU +functionality. Doing this without giving the CPU profile a new name is however a breaking change and thus not permitted. +Such PRs will **not be accepted**. Instead we encourage you to add a `V2` (or higher number if `V` already exists) suffix +when generating the profile. diff --git a/docs/live_migration.md b/docs/live_migration.md index 81eed06665..36191dfc0c 100644 --- a/docs/live_migration.md +++ b/docs/live_migration.md @@ -134,6 +134,10 @@ src $ ch-remote --api-socket=/tmp/api send-migration unix:/tmp/sock When the above commands completed, the VM should be successfully migrated to the destination machine without interrupting the workload. 
+Cloud Hypervisor sends out RARP packets after the migration, to +announce the new location of the VM to the network. For `virtio-net` +devices, Cloud Hypervisor asks guests that negotiated +`VIRTIO_NET_F_GUEST_ANNOUNCE` to also re-announce themselves. ### TCP Socket Migration @@ -190,6 +194,40 @@ After completing the above commands, the source VM will be migrated to the destination host and continue running there. The source VM instance will terminate normally. All ongoing processes and connections within the VM should remain intact after the migration. +Cloud Hypervisor sends out RARP packets after the migration, to +announce the new location of the VM to the network. For `virtio-net` +devices, Cloud Hypervisor asks guests that negotiated +`VIRTIO_NET_F_GUEST_ANNOUNCE` to also re-announce themselves. + +#### Encryption + +TCP migration can be protected with TLS by passing `tls_dir=` to +both `receive-migration` and `send-migration`. + +The destination host needs a directory containing: + +- `server-cert.pem`: the certificate presented by the destination +- `server-key.pem`: the private key for `server-cert.pem` + +The source host needs a directory containing: + +- `ca-cert.pem`: the CA certificate used to verify the destination + certificate + +Example receiver command: + +```console +dst $ ch-remote --api-socket=/tmp/api receive-migration receiver_url=tcp:0.0.0.0:{port},tls_dir=/path/to/dst-tls +``` + +Example sender command: + +```console +src $ ch-remote --api-socket=/tmp/api send-migration destination_url=tcp:{dst}:{port},tls_dir=/path/to/src-tls +``` + +TLS encryption is only supported with `tcp:<host>:<port>` migration +URLs, not with local UNIX-socket migration. 
#### Migration Parameters diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000000..ba1d721934 --- /dev/null +++ b/flake.lock @@ -0,0 +1,81 @@ +{ + "nodes": { + "crane": { + "locked": { + "lastModified": 1779041105, + "narHash": "sha256-nnGD2f8OlAZT2i5OfwikJsw+ifWfiA4d6A8BWlgOXV0=", + "owner": "ipetkov", + "repo": "crane", + "rev": "10e6e3cb966f7cfcc789fe5eee7a85f3188ce08b", + "type": "github" + }, + "original": { + "owner": "ipetkov", + "ref": "master", + "repo": "crane", + "type": "github" + } + }, + "dried-nix-flakes": { + "locked": { + "lastModified": 1756139350, + "narHash": "sha256-pObQv94NclXVXjJV8sTiKwFes4fGEWpkNzDsXw5DqnY=", + "owner": "cyberus-technology", + "repo": "dried-nix-flakes", + "rev": "1b2ba62710c6c1d9eba0e8e3adc029cc2e9291a4", + "type": "github" + }, + "original": { + "owner": "cyberus-technology", + "repo": "dried-nix-flakes", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1778737229, + "narHash": "sha256-6xWoytx8jFW4PF1GjRm/i/53trbpKGfz6zjzQGBr4cI=", + "owner": "nixos", + "repo": "nixpkgs", + "rev": "d7a713c0b7e47c908258e71cba7a2d77cc8d71d5", + "type": "github" + }, + "original": { + "owner": "nixos", + "ref": "nixos-25.11", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "crane": "crane", + "dried-nix-flakes": "dried-nix-flakes", + "nixpkgs": "nixpkgs", + "rust-overlay": "rust-overlay" + } + }, + "rust-overlay": { + "inputs": { + "nixpkgs": [ + "nixpkgs" + ] + }, + "locked": { + "lastModified": 1779074409, + "narHash": "sha256-6aXy8Ga41iLVM8ibddFU1O5+wYWcBGNEfZzZuL91eIc=", + "owner": "oxalica", + "repo": "rust-overlay", + "rev": "2a77b5b1dc952f214e8102acdef1622b68515560", + "type": "github" + }, + "original": { + "owner": "oxalica", + "repo": "rust-overlay", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000000..512e718cef --- /dev/null +++ b/flake.nix @@ -0,0 +1,79 @@ +{ + 
description = "Cyberus Hypervisor for SAP / Apeiro"; + + inputs = { + dried-nix-flakes.url = "github:cyberus-technology/dried-nix-flakes"; + nixpkgs.url = "github:nixos/nixpkgs?ref=nixos-25.11"; + # Convenient Nix tooling to build Rust projects. + crane.url = "github:ipetkov/crane/master"; + # Get proper Rust toolchain, independent of pkgs.rustc. + rust-overlay = { + url = "github:oxalica/rust-overlay"; + inputs.nixpkgs.follows = "nixpkgs"; + }; + }; + + outputs = + inputs: + let + dnf = (inputs.dried-nix-flakes.for inputs).override { + systems = [ "x86_64-linux" ]; + }; + inherit (dnf) + exportOutputs + ; + in + exportOutputs ( + { + self, + # Keep list sorted: + crane, + nixpkgs, + rust-overlay, + ... + }: + let + pkgs = nixpkgs.legacyPackages; + lib = pkgs.lib; + rust-bin = (rust-overlay.lib.mkRustBin { }) pkgs; + in + { + + formatter = pkgs.nixfmt-tree; + devShells.default = pkgs.mkShellNoCC { + inputsFrom = builtins.attrValues self.packages; + packages = with pkgs; [ + gitlint + rustup + ]; + }; + packages = + let + jsonFilter = path: _type: builtins.match ".*json$" path != null; + sourceFilter = path: type: (jsonFilter path type) || (craneLib.filterCargoSources path type); + src = lib.cleanSourceWith { + src = self; + filter = sourceFilter; + name = "source"; + }; + + rustToolchain = rust-bin.stable.latest.default; + craneLib = crane.mkLib pkgs; + craneLib' = craneLib.overrideToolchain rustToolchain; + + cloud-hypervisor = pkgs.callPackage ./chv.nix { + inherit (pkgs.cloud-hypervisor) meta; + inherit src; + craneLib = craneLib'; + + # Query the repo revision to pass the cloud-hypervisor to be printed in the version string. 
+ chExtraVersion = self.dirtyRev or self.rev or "unknown-revision"; + }; + in + { + default = cloud-hypervisor; + inherit cloud-hypervisor; + }; + } + ); +} diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index 3b3011114f..dcc82409e8 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -53,7 +53,7 @@ version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -64,7 +64,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -113,6 +113,28 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "aws-lc-rs" +version = "1.16.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ec6fb3fe69024a75fa7e1bfb48aa6cf59706a101658ea01bfd33b2b248a038f" +dependencies = [ + "aws-lc-sys", + "zeroize", +] + +[[package]] +name = "aws-lc-sys" +version = "0.40.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f50037ee5e1e41e7b8f9d161680a725bd1626cb6f8c7e901f91f942850852fe7" +dependencies = [ + "cc", + "cmake", + "dunce", + "fs_extra", +] + [[package]] name = "bitfield-struct" version = "0.13.0" @@ -262,6 +284,15 @@ dependencies = [ "vmm-sys-util", ] +[[package]] +name = "cmake" +version = "0.1.58" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0f78a02292a74a88ac736019ab962ece0bc380e3f977bf72e376c5d78ff0678" +dependencies = [ + "cc", +] + [[package]] name = "colorchoice" version = "1.0.5" @@ -396,6 +427,12 @@ dependencies = [ "crypto-common", ] +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + [[package]] name = "either" version = "1.15.0" @@ -445,7 +482,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -508,6 +545,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "fs_extra" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + [[package]] name = "futures-core" version = "0.3.32" @@ -562,6 +605,17 @@ dependencies = [ "num-traits", ] +[[package]] +name = "getrandom" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + [[package]] name = "getrandom" version = "0.3.4" @@ -1086,6 +1140,55 @@ dependencies = [ "syn", ] +[[package]] +name = "ring" +version = "0.17.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +dependencies = [ + "cc", + "cfg-if", + "getrandom 0.2.17", + "libc", + "untrusted", + "windows-sys 0.52.0", +] + +[[package]] +name = "rustls" +version = "0.23.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef86cd5876211988985292b91c96a8f2d298df24e75989a43a3c73f2d4d8168b" +dependencies = [ + "aws-lc-rs", + "once_cell", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-pki-types" +version = "1.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"30a7197ae7eb376e574fe940d068c30fe0462554a3ddbe4eca7838e049c937a9" +dependencies = [ + "zeroize", +] + +[[package]] +name = "rustls-webpki" +version = "0.103.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e" +dependencies = [ + "aws-lc-rs", + "ring", + "rustls-pki-types", + "untrusted", +] + [[package]] name = "rustversion" version = "1.0.22" @@ -1252,6 +1355,12 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + [[package]] name = "syn" version = "2.0.117" @@ -1352,6 +1461,12 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + [[package]] name = "utf8parse" version = "0.2.2" @@ -1485,6 +1600,7 @@ version = "0.1.0" dependencies = [ "arch", "libc", + "thiserror", "vm-memory", ] @@ -1524,6 +1640,7 @@ version = "0.1.0" dependencies = [ "anyhow", "itertools", + "rustls", "serde", "serde_json", "thiserror", @@ -1557,6 +1674,7 @@ dependencies = [ "gdbstub_arch", "hypervisor", "iommufd-ioctls", + "kvm-bindings", "landlock", "libc", "linux-loader", @@ -1601,6 +1719,12 @@ dependencies = [ "serde_derive", ] +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + [[package]] name = "wasip2" version = 
"1.0.3+wasi-0.2.9" @@ -1726,6 +1850,15 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + [[package]] name = "windows-sys" version = "0.61.2" @@ -1735,6 +1868,70 @@ dependencies = [ "windows-link", ] +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + [[package]] name = "winnow" version = "1.0.2" @@ -1858,6 +2055,12 @@ dependencies = [ "syn", ] +[[package]] +name = "zeroize" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" + [[package]] name = "zmij" version = "1.0.21" diff --git a/fuzz/fuzz_targets/cmos.rs b/fuzz/fuzz_targets/cmos.rs index 9ae5b59da5..c925295790 100644 --- a/fuzz/fuzz_targets/cmos.rs +++ b/fuzz/fuzz_targets/cmos.rs @@ -3,6 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 #![no_main] +use std::sync::atomic::AtomicBool; +use std::sync::Arc; + use devices::legacy::Cmos; use libc::EFD_NONBLOCK; use libfuzzer_sys::{fuzz_target, Corpus}; @@ -25,7 +28,8 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { u64::from_le_bytes(below_4g), u64::from_le_bytes(above_4g), EventFd::new(EFD_NONBLOCK).unwrap(), - None, + Arc::new(AtomicBool::new(false)), + Arc::new(AtomicBool::new(false)), ); let mut i = 16; diff --git a/fuzz/fuzz_targets/http_api.rs b/fuzz/fuzz_targets/http_api.rs index 07a1effa4d..aa3841243d 100644 --- a/fuzz/fuzz_targets/http_api.rs +++ b/fuzz/fuzz_targets/http_api.rs @@ -11,6 +11,7 @@ use std::thread; use libfuzzer_sys::{fuzz_target, Corpus}; use micro_http::Request; +use 
vm_migration::progress::MigrationProgress; use vm_migration::MigratableError; use vmm::api::http::*; use vmm::api::{ @@ -138,6 +139,7 @@ impl RequestHandler for StubApiRequestHandler { features: CpuFeatures::default(), nested: true, core_scheduling: CoreScheduling::default(), + profile: Default::default(), }, memory: MemoryConfig { size: 536_870_912, @@ -176,6 +178,7 @@ impl RequestHandler for StubApiRequestHandler { file: None, mode: ConsoleOutputMode::Tty, socket: None, + url: None, }, }, console: ConsoleConfig { @@ -183,6 +186,7 @@ impl RequestHandler for StubApiRequestHandler { file: None, mode: ConsoleOutputMode::Tty, socket: None, + url: None, }, pci_common: PciDeviceCommonConfig::default(), }, @@ -301,6 +305,18 @@ impl RequestHandler for StubApiRequestHandler { fn vm_nmi(&mut self) -> Result<(), VmError> { Ok(()) } + + fn vm_migration_progress(&mut self) -> Option { + None + } + + fn vm_cancel_migration(&mut self) -> Result<(), MigratableError> { + Ok(()) + } + + fn vm_post_migration_announce(&mut self) -> Result<(), VmError> { + Ok(()) + } } fn http_receiver_stub(exit_evt: EventFd, api_evt: EventFd, api_receiver: Receiver) { diff --git a/hypervisor/src/cpu.rs b/hypervisor/src/cpu.rs index 044c81a2e8..39909ef370 100644 --- a/hypervisor/src/cpu.rs +++ b/hypervisor/src/cpu.rs @@ -10,6 +10,9 @@ // // +#[cfg(feature = "kvm")] +use std::os::fd::RawFd; + use thiserror::Error; #[cfg(not(target_arch = "riscv64"))] use {anyhow::anyhow, vm_memory::GuestAddress}; @@ -25,7 +28,7 @@ use crate::kvm::{TdxExitDetails, TdxExitStatus}; use crate::{CpuState, MpState, StandardRegisters}; #[cfg(target_arch = "x86_64")] -#[derive(Copy, Clone, Default)] +#[derive(Debug, Copy, Clone, Default, serde::Serialize, serde::Deserialize, Eq, PartialEq)] pub enum CpuVendor { #[default] Unknown, @@ -608,4 +611,11 @@ pub trait Vcpu: Send + Sync { /// Trigger NMI interrupt /// fn nmi(&self) -> Result<()>; + /// Returns the underlying vCPU FD of KVM. 
+ /// + /// # SAFETY + /// This is safe as we only use this to map the KVM_RUN structure for the + /// signal handler and only use it from there. + #[cfg(feature = "kvm")] + unsafe fn get_kvm_vcpu_raw_fd(&self) -> RawFd; } diff --git a/hypervisor/src/hypervisor.rs b/hypervisor/src/hypervisor.rs index 05852a230f..629306c3cf 100644 --- a/hypervisor/src/hypervisor.rs +++ b/hypervisor/src/hypervisor.rs @@ -16,6 +16,8 @@ use thiserror::Error; #[cfg(target_arch = "x86_64")] use crate::arch::x86::CpuIdEntry; #[cfg(target_arch = "x86_64")] +use crate::arch::x86::MsrEntry; +#[cfg(target_arch = "x86_64")] use crate::cpu::CpuVendor; #[cfg(feature = "tdx")] use crate::kvm::TdxCapabilities; @@ -60,6 +62,10 @@ pub enum HypervisorError { #[error("Failed to get the list of supported MSRs")] GetMsrList(#[source] anyhow::Error), /// + /// Failed to get MSRs from the hypervisor. + #[error("Failed to get MSRs")] + GetMsr(#[source] anyhow::Error), + /// /// API version is not compatible /// #[error("Incompatible API version")] @@ -128,6 +134,17 @@ pub trait Hypervisor: Send + Sync { /// Get the supported CpuID /// fn get_supported_cpuid(&self) -> Result>; + #[cfg(target_arch = "x86_64")] + /// + /// Get the MSR-based features supported by the hardware and hypervisor + /// + fn get_msr_based_features(&self) -> Result>; + + /// + /// Get the MSR indices supported by the hardware and hypervisor + /// + #[cfg(target_arch = "x86_64")] + fn get_msr_index_list(&self) -> Result>; /// /// Check particular extensions if any /// diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index 39f76dca31..a8c516c51f 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -17,9 +17,9 @@ use std::mem::offset_of; #[cfg(feature = "sev_snp")] use std::os::fd::FromRawFd; use std::os::fd::OwnedFd; -#[cfg(any(feature = "sev_snp", feature = "tdx"))] +#[cfg(any(feature = "kvm", feature = "sev_snp"))] use std::os::unix::io::AsRawFd; -#[cfg(feature = "tdx")] +#[cfg(any(feature = 
"kvm", feature = "tdx"))] use std::os::unix::io::RawFd; use std::result; #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] @@ -34,6 +34,7 @@ use kvm_bindings::kvm_create_guest_memfd; use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd}; #[cfg(feature = "sev_snp")] use log::debug; +use log::trace; #[cfg(target_arch = "x86_64")] use log::warn; use vmm_sys_util::errno; @@ -72,13 +73,13 @@ use x86_64::check_required_kvm_extensions; #[cfg(target_arch = "x86_64")] pub use x86_64::{CpuId, ExtendedControlRegisters, MsrEntries, VcpuKvmState}; -#[cfg(target_arch = "x86_64")] -use crate::ClockData; #[cfg(target_arch = "x86_64")] use crate::arch::x86::{ CpuIdEntry, FpuState, LapicState, MTRR_MSR_INDICES, MsrEntry, NUM_IOAPIC_PINS, SpecialRegisters, XsaveState, }; +#[cfg(target_arch = "x86_64")] +use crate::{ClockData, MsrFilterRange}; use crate::{ CpuState, HypervisorType, HypervisorVmConfig, InterruptSourceConfig, IoEventAddress, IrqRoutingEntry, MpState, StandardRegisters, USER_MEMORY_REGION_GUEST_MEMFD, @@ -211,6 +212,8 @@ const TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: u64 = 0x10004; const TDG_VP_VMCALL_SUCCESS: u64 = 0; #[cfg(feature = "tdx")] const TDG_VP_VMCALL_INVALID_OPERAND: u64 = 0x8000000000000000; +/// Maximum number of MSR ranges that KVM can filter +pub const KVM_MSR_FILTER_MAX_RANGES: usize = 16; #[cfg(feature = "tdx")] ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong); @@ -567,8 +570,6 @@ struct KvmMemorySlot { /// Wrapper over KVM VM ioctls. 
pub struct KvmVm { fd: Arc, - #[cfg(target_arch = "x86_64")] - msrs: Vec, #[cfg(all(feature = "sev_snp", target_arch = "x86_64"))] sev_fd: Option, #[cfg(all(feature = "sev_snp", target_arch = "x86_64"))] @@ -693,6 +694,69 @@ impl KvmVm { /// let vm = hypervisor.create_vm(HypervisorVmConfig::default()).expect("new VM fd creation failed"); /// ``` impl vm::Vm for KvmVm { + #[cfg(target_arch = "x86_64")] + fn msr_filter<'a>(&self, filter: &[MsrFilterRange<'a>], default_deny: bool) -> vm::Result<()> { + // Found here https://github.com/torvalds/linux/blob/master/include/uapi/linux/kvm.h#L929C9-L929C31 + const KVM_CAP_MSR_FILTER: u64 = 189; + // Can be computed from https://github.com/torvalds/linux/blob/master/include/uapi/linux/kvm.h#L1458 + const KVM_X86_SET_MSR_FILTER: u64 = 0x4188aec6; + + let cap_result = self.fd.check_extension_raw(KVM_CAP_MSR_FILTER); + if cap_result <= 0 { + return Err(vm::HypervisorVmError::MissingMsrFilterCapability { + error_code: cap_result, + }); + } + // Workaround until https://github.com/rust-vmm/kvm/pull/359 is merged + #[repr(C)] + #[derive(Clone, Copy, Default)] + struct KvmMsrFilterRange { + flags: u32, + nmrs: u32, + base: u32, + bitmap: *const u8, + } + + #[repr(C)] + struct KvmMsrFilter { + flags: u32, + ranges: [KvmMsrFilterRange; KVM_MSR_FILTER_MAX_RANGES], + } + + let mut kvm_filter = KvmMsrFilter { + flags: u32::from(default_deny), + ranges: [Default::default(); KVM_MSR_FILTER_MAX_RANGES], + }; + + let num_ranges = kvm_filter.ranges.len(); + if num_ranges > KVM_MSR_FILTER_MAX_RANGES { + return Err(vm::HypervisorVmError::TooManyMsrFilterRanges { + num_ranges, + num_permitted_ranges: KVM_MSR_FILTER_MAX_RANGES, + }); + } + + for (range, kvm_range) in filter.iter().zip(kvm_filter.ranges.iter_mut()) { + kvm_range.flags = range.flags; + kvm_range.nmrs = range.nmsrs; + kvm_range.base = range.base; + kvm_range.bitmap = range.bitmap.as_ptr(); + } + // SAFETY: SYSCALL with valid parameters. 
All raw pointers are derived from references that are valid for the duration of this entire method call. + let result = unsafe { + libc::ioctl( + self.fd.as_raw_fd(), + KVM_X86_SET_MSR_FILTER, + (&raw const kvm_filter).cast::(), + ) + }; + if result == 0 { + Ok(()) + } else { + Err(vm::HypervisorVmError::MsrFilter { error_code: result }) + } + } + #[cfg(all(feature = "sev_snp", target_arch = "x86_64"))] fn sev_snp_init(&self, guest_policy: igvm_defs::SnpPolicy) -> vm::Result<()> { self.sev_fd @@ -825,6 +889,7 @@ impl vm::Vm for KvmVm { &self, id: u32, vm_ops: Option>, + #[cfg(target_arch = "x86_64")] msrs: Vec, ) -> vm::Result> { let fd = self .fd @@ -844,7 +909,7 @@ impl vm::Vm for KvmVm { let vcpu = KvmVcpu { fd, #[cfg(target_arch = "x86_64")] - msrs: self.msrs.clone(), + msrs, vm_ops, #[cfg(target_arch = "x86_64")] hyperv_synic: AtomicBool::new(false), @@ -1632,7 +1697,6 @@ impl hypervisor::Hypervisor for KvmHypervisor { Ok(Arc::new(KvmVm { fd: Arc::new(fd), - msrs, dirty_log_slots: RwLock::new(HashMap::new()), #[cfg(feature = "sev_snp")] sev_fd, @@ -1672,6 +1736,50 @@ impl hypervisor::Hypervisor for KvmHypervisor { Ok(v) } + #[cfg(target_arch = "x86_64")] + fn get_msr_based_features(&self) -> hypervisor::Result> { + let list = self + .kvm + .get_msr_feature_index_list() + .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))?; + let list_len = list.as_fam_struct_ref().nmsrs; + trace!("number of MSR-based feature register addresses:={list_len}"); + let kvm_msrs: Vec = list + .as_slice() + .iter() + .copied() + .map(|index| kvm_msr_entry { + index, + ..Default::default() + }) + .collect(); + let mut kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap(); + let num_writes = self + .kvm + .get_msrs(&mut kvm_msrs) + .map_err(|e| hypervisor::HypervisorError::GetMsr(e.into()))?; + trace!("number of MSR-based feature MSRs written to by KVM:={num_writes}"); + Ok(kvm_msrs + .as_slice() + .iter() + .copied() + .map(MsrEntry::from) + .collect()) + } + + 
#[cfg(target_arch = "x86_64")] + fn get_msr_index_list(&self) -> hypervisor::Result> { + let list = self.get_msr_list()?; + let num_msrs = list.as_fam_struct_ref().nmsrs; + let actual_num_msrs = list.as_slice().len(); + assert_eq!( + actual_num_msrs, num_msrs as usize, + "BUG: the length of the MSR Index LIST FAM wrapper does not coincide with + the nmrs field value " + ); + Ok(list.as_slice().to_vec()) + } + #[cfg(target_arch = "aarch64")] /// /// Retrieve AArch64 host maximum IPA size supported by KVM. @@ -1792,7 +1900,7 @@ impl KvmVcpu { /// let kvm = KvmHypervisor::new().unwrap(); /// let hypervisor = Arc::new(kvm); /// let vm = hypervisor.create_vm(HypervisorVmConfig::default()).expect("new VM fd creation failed"); -/// let vcpu = vm.create_vcpu(0, None).unwrap(); +/// let vcpu = vm.create_vcpu(0, None, vec![]).unwrap(); /// ``` impl cpu::Vcpu for KvmVcpu { /// @@ -2533,7 +2641,11 @@ impl cpu::Vcpu for KvmVcpu { }, Err(ref e) => match e.errno() { - libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore), + libc::EINTR => { + self.fd.set_kvm_immediate_exit(0); + Ok(cpu::VmExit::Ignore) + } + libc::EAGAIN => Ok(cpu::VmExit::Ignore), _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!( "VCPU error {e:?}" ))), @@ -2856,7 +2968,7 @@ impl cpu::Vcpu for KvmVcpu { /// let hv = Arc::new(kvm); /// let vm = hv.create_vm(HypervisorVmConfig::default()).expect("new VM fd creation failed"); /// vm.enable_split_irq().unwrap(); - /// let vcpu = vm.create_vcpu(0, None).unwrap(); + /// let vcpu = vm.create_vcpu(0, None, vec![]).unwrap(); /// let state = vcpu.state().unwrap(); /// ``` fn state(&self) -> cpu::Result { @@ -3095,7 +3207,7 @@ impl cpu::Vcpu for KvmVcpu { /// let hv = Arc::new(kvm); /// let vm = hv.create_vm(HypervisorVmConfig::default()).expect("new VM fd creation failed"); /// vm.enable_split_irq().unwrap(); - /// let vcpu = vm.create_vcpu(0, None).unwrap(); + /// let vcpu = vm.create_vcpu(0, None, vec![]).unwrap(); /// let state = vcpu.state().unwrap(); /// 
vcpu.set_state(&state).unwrap(); /// ``` @@ -3222,6 +3334,11 @@ impl cpu::Vcpu for KvmVcpu { self.fd.set_kvm_immediate_exit(exit.into()); } + #[cfg(feature = "kvm")] + unsafe fn get_kvm_vcpu_raw_fd(&self) -> RawFd { + self.fd.as_raw_fd() + } + /// /// Returns the details about TDX exit reason /// diff --git a/hypervisor/src/kvm/x86_64/mod.rs b/hypervisor/src/kvm/x86_64/mod.rs index 62185fd84e..47f52d2be2 100644 --- a/hypervisor/src/kvm/x86_64/mod.rs +++ b/hypervisor/src/kvm/x86_64/mod.rs @@ -66,6 +66,7 @@ pub fn check_required_kvm_extensions(kvm: &Kvm) -> KvmResult<()> { check_extension!(Cap::VcpuEvents); check_extension!(Cap::Xcrs); check_extension!(Cap::Xsave); + check_extension!(Cap::GetMsrFeatures); Ok(()) } diff --git a/hypervisor/src/lib.rs b/hypervisor/src/lib.rs index f224e7217c..0d5dc91d74 100644 --- a/hypervisor/src/lib.rs +++ b/hypervisor/src/lib.rs @@ -64,7 +64,7 @@ pub use vm::{ pub use crate::hypervisor::{Hypervisor, HypervisorError}; -#[derive(Debug, Copy, Clone, PartialEq)] +#[derive(Debug, Copy, Clone, serde::Serialize, serde::Deserialize, PartialEq, Eq)] pub enum HypervisorType { #[cfg(feature = "kvm")] Kvm, @@ -212,6 +212,29 @@ pub enum VcpuInit { Mshv(mshv_bindings::MshvVcpuInit), } +#[cfg(target_arch = "x86_64")] +/// Parameters for filtering read and/or write accesses to a range of MSRs. +#[derive(Debug, Clone, Copy, Default)] +pub struct MsrFilterRange<'a> { + /// The type of operation(s) to filter: `1 << 0`, `1 << 1`, `(1 << 0) | (1 << 1)` refers to read, write, read and write respectively. + // TODO: Consider using an enum here + pub flags: u32, + /// The number of MSRs in this filter range. + pub nmsrs: u32, + /// The first MSR index the bitmap starts at. + pub base: u32, + /// For bit position P ( 0 <= P <= nmsrs), the operations in `flags` are allowed for MSR:= base + P if the bit is set, otherwise they are denied. 
+ pub bitmap: &'a [u8], +} + +impl<'a> MsrFilterRange<'a> { + /// Modify the `flags` so that the ops in the bitmap refer to both reads and writes. + pub fn with_read_write_flags(mut self) -> Self { + self.flags = 1 | (1 << 1); + self + } +} + #[derive(Debug, Clone, PartialEq)] pub enum RegList { #[cfg(all(feature = "kvm", any(target_arch = "aarch64", target_arch = "riscv64")))] diff --git a/hypervisor/src/mshv/mod.rs b/hypervisor/src/mshv/mod.rs index d38aff9860..de98c256e9 100644 --- a/hypervisor/src/mshv/mod.rs +++ b/hypervisor/src/mshv/mod.rs @@ -49,6 +49,8 @@ pub mod x86_64; // aarch64 dependencies #[cfg(target_arch = "aarch64")] pub mod aarch64; +#[cfg(feature = "kvm")] +use std::os::fd::RawFd; use std::os::unix::io::AsRawFd; #[cfg(target_arch = "aarch64")] use std::sync::Mutex; @@ -393,6 +395,17 @@ impl hypervisor::Hypervisor for MshvHypervisor { Ok(cpuid) } + #[cfg(target_arch = "x86_64")] + fn get_msr_index_list(&self) -> hypervisor::Result> { + // TODO: We need to implement this before upstreaming + unimplemented!() + } + + #[cfg(target_arch = "x86_64")] + fn get_msr_based_features(&self) -> hypervisor::Result> { + unimplemented!() + } + /// Get maximum number of vCPUs fn get_max_vcpus(&self) -> u32 { // TODO: Using HV_MAXIMUM_PROCESSORS would be better @@ -1661,6 +1674,11 @@ impl cpu::Vcpu for MshvVcpu { Ok(()) } + + #[cfg(feature = "kvm")] + unsafe fn get_kvm_vcpu_raw_fd(&self) -> RawFd { + unimplemented!() + } } impl MshvVcpu { @@ -1931,6 +1949,7 @@ impl vm::Vm for MshvVm { &self, id: u32, vm_ops: Option>, + #[cfg(target_arch = "x86_64")] _msrs: Vec, ) -> vm::Result> { let id: u8 = id.try_into().unwrap(); let vcpu_fd = self @@ -2601,4 +2620,13 @@ impl vm::Vm for MshvVm { } Ok(()) } + + #[cfg(target_arch = "x86_64")] + fn msr_filter<'a>( + &self, + _filter: &[crate::MsrFilterRange<'a>], + _default_deny: bool, + ) -> vm::Result<()> { + todo!() + } } diff --git a/hypervisor/src/vm.rs b/hypervisor/src/vm.rs index f671204c0c..41a7d27ba9 100644 --- 
a/hypervisor/src/vm.rs +++ b/hypervisor/src/vm.rs @@ -22,15 +22,17 @@ use igvm_defs::SnpPolicy; use thiserror::Error; use vmm_sys_util::eventfd::EventFd; -#[cfg(target_arch = "x86_64")] -use crate::ClockData; #[cfg(target_arch = "aarch64")] use crate::arch::aarch64::gic::{Vgic, VgicConfig}; #[cfg(target_arch = "riscv64")] use crate::arch::riscv64::aia::{Vaia, VaiaConfig}; #[cfg(feature = "tdx")] use crate::arch::x86::CpuIdEntry; +#[cfg(target_arch = "x86_64")] +use crate::arch::x86::MsrEntry; use crate::cpu::Vcpu; +#[cfg(target_arch = "x86_64")] +use crate::{ClockData, MsrFilterRange}; use crate::{IoEventAddress, IrqRoutingEntry}; /// @@ -61,6 +63,22 @@ pub enum HypervisorVmError { #[error("Failed to create Vcpu")] CreateVcpu(#[source] anyhow::Error), /// + /// Could not filter the given MSRs because too many MSR filter ranges were provided. + /// + #[error( + "Too many separate MSR ranges to filter. Number of given ranges:={num_ranges}, but number of permitted ranges:={num_permitted_ranges}" + )] + TooManyMsrFilterRanges { + num_ranges: usize, + num_permitted_ranges: usize, + }, + #[error( + "Could not filter the given MSR ranges: Failed to confirm MSR filtering capability: error_code:={error_code}" + )] + MissingMsrFilterCapability { error_code: i32 }, + #[error("Could not filter the given MSR ranges. Error code:={error_code}")] + MsrFilter { error_code: i32 }, + /// /// Identity map address error /// #[error("Failed to set identity map address")] @@ -323,8 +341,28 @@ pub trait Vm: Send + Sync + Any { fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> Result<()>; /// Unregister an event that will, when signaled, trigger the `gsi` IRQ. fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> Result<()>; + #[cfg(target_arch = "x86_64")] + /// Filter the given ranges of MSRs. This can be used to specify certain MSRs + /// that guests may not access. 
+ /// + /// If the `default_deny` flag is set, MSRs that do not match any of the given + /// ranges, will be automatically denied, otherwise they are allowed. + /// + /// # Important + /// + /// This method should be called once before creating any vCPUs and never again. + fn msr_filter<'a>(&self, filter: &[MsrFilterRange<'a>], default_deny: bool) -> Result<()>; /// Creates a new KVM vCPU file descriptor and maps the memory corresponding - fn create_vcpu(&self, id: u32, vm_ops: Option>) -> Result>; + /// + /// The `msr_buffer` is used to store MSR state. The entries given here are + /// expected to hold indices/register addresses supported by both the host's + /// hardware and the hypervisor. + fn create_vcpu( + &self, + id: u32, + vm_ops: Option>, + #[cfg(target_arch = "x86_64")] msr_buffer: Vec, + ) -> Result>; #[cfg(target_arch = "aarch64")] fn create_vgic(&self, config: &VgicConfig) -> Result>>; #[cfg(target_arch = "riscv64")] diff --git a/net_util/src/ctrl_queue.rs b/net_util/src/ctrl_queue.rs index 8b34a33a7a..530c9367ab 100644 --- a/net_util/src/ctrl_queue.rs +++ b/net_util/src/ctrl_queue.rs @@ -2,6 +2,9 @@ // // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; + use log::{debug, error, info, warn}; use thiserror::Error; use virtio_bindings::virtio_net::{ @@ -72,18 +75,23 @@ fn is_tolerated_ctrl_command(ctrl_hdr: ControlHeader) -> bool { u32::from(ctrl_hdr.cmd), VIRTIO_NET_CTRL_VLAN_ADD | VIRTIO_NET_CTRL_VLAN_DEL ), - VIRTIO_NET_CTRL_ANNOUNCE => u32::from(ctrl_hdr.cmd) == VIRTIO_NET_CTRL_ANNOUNCE_ACK, _ => false, } } pub struct CtrlQueue { pub taps: Vec, + /// Tracks whether the guest still needs to acknowledge a post-migration + /// announce request through the control queue. 
+ pub announce_pending: Arc, } impl CtrlQueue { - pub fn new(taps: Vec) -> Self { - CtrlQueue { taps } + pub fn new(taps: Vec, announce_pending: Arc) -> Self { + CtrlQueue { + taps, + announce_pending, + } } pub fn process( @@ -104,22 +112,19 @@ impl CtrlQueue { .map_err(|e| Error::GuestMemory(GuestMemoryError::IOError(e)))?, ) .map_err(Error::GuestMemory)?; - let data_desc = desc_chain.next().ok_or(Error::NoDataDescriptor)?; - - let data_desc_addr = data_desc - .addr() - .translate_gva(access_platform, data_desc.len() as usize) - .map_err(|e| Error::GuestMemory(GuestMemoryError::IOError(e)))?; - - let status_desc = desc_chain.next().ok_or(Error::NoStatusDescriptor)?; - - let ok = match u32::from(ctrl_hdr.class) { + let (ok, status_desc) = match u32::from(ctrl_hdr.class) { VIRTIO_NET_CTRL_MQ => { + let data_desc = desc_chain.next().ok_or(Error::NoDataDescriptor)?; + let data_desc_addr = data_desc + .addr() + .translate_gva(access_platform, data_desc.len() as usize) + .map_err(|e| Error::GuestMemory(GuestMemoryError::IOError(e)))?; + let status_desc = desc_chain.next().ok_or(Error::NoStatusDescriptor)?; let queue_pairs = desc_chain .memory() .read_obj::(data_desc_addr) .map_err(Error::GuestMemory)?; - if u32::from(ctrl_hdr.cmd) != VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET { + let ok = if u32::from(ctrl_hdr.cmd) != VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET { warn!("Unsupported command: {}", ctrl_hdr.cmd); false } else if (queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN as u16) @@ -130,14 +135,22 @@ impl CtrlQueue { } else { info!("Number of MQ pairs requested: {queue_pairs}"); true - } + }; + + (ok, status_desc) } VIRTIO_NET_CTRL_GUEST_OFFLOADS => { + let data_desc = desc_chain.next().ok_or(Error::NoDataDescriptor)?; + let data_desc_addr = data_desc + .addr() + .translate_gva(access_platform, data_desc.len() as usize) + .map_err(|e| Error::GuestMemory(GuestMemoryError::IOError(e)))?; + let status_desc = desc_chain.next().ok_or(Error::NoStatusDescriptor)?; let features = desc_chain 
.memory() .read_obj::(data_desc_addr) .map_err(Error::GuestMemory)?; - if u32::from(ctrl_hdr.cmd) == VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET { + let ok = if u32::from(ctrl_hdr.cmd) == VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET { let mut ok = true; for tap in self.taps.iter_mut() { info!("Reprogramming tap offload with features: {features}"); @@ -152,15 +165,33 @@ impl CtrlQueue { } else { warn!("Unsupported command: {}", ctrl_hdr.cmd); false - } + }; + + (ok, status_desc) } - _ if is_tolerated_ctrl_command(ctrl_hdr) => { - debug!("Ignoring unsupported but tolerated control command {ctrl_hdr:?}"); - true + VIRTIO_NET_CTRL_ANNOUNCE => { + let status_desc = desc_chain.next().ok_or(Error::NoStatusDescriptor)?; + let ok = if u32::from(ctrl_hdr.cmd) == VIRTIO_NET_CTRL_ANNOUNCE_ACK { + self.announce_pending.store(false, Ordering::Release); + true + } else { + warn!("Unsupported command: {}", ctrl_hdr.cmd); + false + }; + + (ok, status_desc) } _ => { - warn!("Unsupported command {ctrl_hdr:?}"); - false + let _data_desc = desc_chain.next().ok_or(Error::NoDataDescriptor)?; + let status_desc = desc_chain.next().ok_or(Error::NoStatusDescriptor)?; + let ok = if is_tolerated_ctrl_command(ctrl_hdr) { + debug!("Ignoring unsupported but tolerated control command {ctrl_hdr:?}"); + true + } else { + warn!("Unsupported command {ctrl_hdr:?}"); + false + }; + (ok, status_desc) } }; @@ -174,8 +205,6 @@ impl CtrlQueue { .map_err(|e| Error::GuestMemory(GuestMemoryError::IOError(e)))?, ) .map_err(Error::GuestMemory)?; - // Per the virtio spec the used length is bytes the device wrote - // to device-writable descriptors; here just the 1-byte ack. 
queue .add_used(desc_chain.memory(), desc_chain.head_index(), 1) .map_err(Error::QueueAddUsed)?; @@ -191,3 +220,67 @@ impl CtrlQueue { Ok(()) } } + +#[cfg(test)] +mod unit_tests { + use std::mem::size_of; + use std::sync::Arc; + use std::sync::atomic::{AtomicBool, Ordering}; + + use virtio_bindings::virtio_net::{VIRTIO_NET_CTRL_ANNOUNCE, VIRTIO_NET_CTRL_ANNOUNCE_ACK}; + use virtio_bindings::virtio_ring::{VRING_DESC_F_NEXT, VRING_DESC_F_WRITE}; + use vm_memory::{Bytes, GuestAddress}; + use vm_virtio::queue::testing::VirtQueue as GuestQ; + + use super::*; + + #[test] + fn test_process_announce_ack() { + // The guest acknowledges the post-migration request on the control + // queue, which clears the pending announce state in the device model. + // This test builds the minimal virtqueue request for that command: + // one readable descriptor for the control header, followed by one + // writable descriptor where the device stores the command status. + let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap(); + let guest_q = GuestQ::new(GuestAddress(0), &mem, 16); + let mut queue = guest_q.create_queue(); + + let ctrl_hdr = ControlHeader { + class: VIRTIO_NET_CTRL_ANNOUNCE as u8, + cmd: VIRTIO_NET_CTRL_ANNOUNCE_ACK as u8, + }; + let ctrl_addr = GuestAddress(0x1000); + let status_addr = GuestAddress(0x1100); + mem.write_obj(ctrl_hdr, ctrl_addr).unwrap(); + + // Descriptor 0 contains the control header and points to descriptor 1. + guest_q.dtable[0].set( + ctrl_addr.0, + size_of::() as u32, + VRING_DESC_F_NEXT.try_into().unwrap(), + 1, + ); + // Descriptor 1 is the writable status byte returned by the device. + guest_q.dtable[1].set(status_addr.0, 1, VRING_DESC_F_WRITE.try_into().unwrap(), 0); + + // Publish the two-descriptor request to the available ring so + // CtrlQueue::process() can pop and handle it. 
+ guest_q.avail.ring[0].set(0); + guest_q.avail.idx.set(1); + + // Start from the state reached after post_migration(): the guest still + // owes us an ANNOUNCE_ACK on the control queue. + let announce_pending = Arc::new(AtomicBool::new(true)); + let mut ctrl_q = CtrlQueue::new(Vec::new(), Arc::clone(&announce_pending)); + + ctrl_q.process(&mem, &mut queue, None).unwrap(); + + // A successful ANNOUNCE_ACK clears the pending flag and reports + // VIRTIO_NET_OK in the guest-provided status buffer. + assert!(!announce_pending.load(Ordering::Acquire)); + assert_eq!( + mem.read_obj::(status_addr).unwrap(), + VIRTIO_NET_OK as u8 + ); + } +} diff --git a/net_util/src/lib.rs b/net_util/src/lib.rs index 7152c1676f..de8c5b1465 100644 --- a/net_util/src/lib.rs +++ b/net_util/src/lib.rs @@ -101,7 +101,7 @@ fn create_unix_socket() -> Result { Ok(unsafe { net::UdpSocket::from_raw_fd(sock) }) } -fn vnet_hdr_len() -> usize { +pub fn vnet_hdr_len() -> usize { std::mem::size_of::() } diff --git a/net_util/src/open_tap.rs b/net_util/src/open_tap.rs index a5168d22a0..f04e248540 100644 --- a/net_util/src/open_tap.rs +++ b/net_util/src/open_tap.rs @@ -77,7 +77,14 @@ fn open_tap_rx_q_0( let tap = match if_name { Some(name) => Tap::open_named(name, num_rx_q, flags).map_err(Error::TapOpen)?, // Create a new Tap device in Linux, if none was specified. 
- None => Tap::new(num_rx_q).map_err(Error::TapOpen)?, + None => { + let tap = Tap::new(num_rx_q).map_err(Error::TapOpen)?; + log::info!( + "Created tap device: name={}, num_rx_q={num_rx_q}", + tap.if_name_as_str() + ); + tap + } }; // Don't overwrite ip configuration of existing interfaces: if tap_exists { diff --git a/net_util/src/tap.rs b/net_util/src/tap.rs index 012c5b9442..8efb13ef1f 100644 --- a/net_util/src/tap.rs +++ b/net_util/src/tap.rs @@ -13,6 +13,7 @@ use std::os::raw::*; use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; use libc::{__c_anonymous_ifr_ifru, ifreq}; +use log::debug; use thiserror::Error; use vmm_sys_util::ioctl::{ioctl_with_mut_ref, ioctl_with_ref, ioctl_with_val}; @@ -70,6 +71,16 @@ pub struct Tap { if_name: CString, } +impl Drop for Tap { + fn drop(&mut self) { + debug!( + "Dropping Tap: if_name={}, FD={}", + self.if_name_as_str(), + self.tap_file.as_raw_fd() + ); + } +} + impl PartialEq for Tap { fn eq(&self, other: &Tap) -> bool { self.if_name == other.if_name @@ -117,6 +128,9 @@ fn ipv6_mask_to_prefix(mask: Ipv6Addr) -> Result { } impl Tap { + /// The default naming scheme for Tap devices that are created by Cloud Hypervisor. + pub const DEFAULT_NAME_SCHEME: &'static str = "vmtap%d"; + /// # Safety /// The caller should ensure to pass a valid file descriptor and valid /// arguments for the `ioctl()` syscall. @@ -176,6 +190,7 @@ impl Tap { if fd < 0 { return Err(Error::OpenTun(IoError::last_os_error())); } + debug!("Opening Tap device with given name: ifname={if_name}, fd={fd}"); // SAFETY: We just checked that the fd is valid. let tuntap = unsafe { File::from_raw_fd(fd) }; @@ -235,7 +250,7 @@ impl Tap { /// Create a new tap interface. 
pub fn new(num_queue_pairs: usize) -> Result { - Self::open_named("vmtap%d", num_queue_pairs, None) + Self::open_named(Self::DEFAULT_NAME_SCHEME, num_queue_pairs, None) } pub fn from_tap_fd(fd: RawFd, num_queue_pairs: usize) -> Result { diff --git a/scripts/gitlint/rules/on-behalf-of-marker.py b/scripts/gitlint/rules/on-behalf-of-marker.py new file mode 100644 index 0000000000..d08e334b17 --- /dev/null +++ b/scripts/gitlint/rules/on-behalf-of-marker.py @@ -0,0 +1,36 @@ +from gitlint.rules import LineRule, RuleViolation, CommitMessageTitle, CommitRule + +class BodyContainsOnBehalfOfSAPMarker(CommitRule): + """Enforce that each commit coming from an SAP contractor contains an + "On-behalf-of SAP user@sap.com" marker. + """ + + # A rule MUST have a human friendly name + name = "body-requires-on-behalf-of-sap" + + # A rule MUST have a *unique* id + # We recommend starting with UC (for User-defined Commit-rule). + id = "UC-sap" + + # Lower-case list of contractors + contractors = [ + "@cyberus-technology.de" + ] + + # Marker followed by " name.surname@sap.com" + marker = "On-behalf-of: SAP" + + def validate(self, commit): + if "@sap.com" in commit.author_email.lower(): + return + + # Allow third-party open-source contributions + if not any(contractor in commit.author_email.lower() for contractor in self.contractors): + return + + for line in commit.message.body: + if line.startswith(self.marker) and "@sap.com" in line.lower(): + return + + msg = f"Body does not contain a '{self.marker} user@sap.com' line" + return [RuleViolation(self.id, msg, line_nr=1)] diff --git a/virtio-devices/Cargo.toml b/virtio-devices/Cargo.toml index d2658eeeca..ef6b4f717a 100644 --- a/virtio-devices/Cargo.toml +++ b/virtio-devices/Cargo.toml @@ -50,5 +50,8 @@ vm-migration = { path = "../vm-migration" } vm-virtio = { path = "../vm-virtio" } vmm-sys-util = { workspace = true } +[dev-dependencies] +serde_json = { workspace = true } + [lints] workspace = true diff --git 
a/virtio-devices/src/device.rs b/virtio-devices/src/device.rs index 4c61ba35d1..acddd015e4 100644 --- a/virtio-devices/src/device.rs +++ b/virtio-devices/src/device.rs @@ -191,6 +191,12 @@ pub trait VirtioDevice: Send { fn access_platform(&self) -> Option> { None } + + /// Some devices can announce their location after a live migration to + /// speed up normal execution. + fn post_migration_announcer(&self) -> Option> { + None + } } /// Trait to define address translation for devices managed by virtio-iommu @@ -432,3 +438,14 @@ impl Pausable for VirtioCommon { Ok(()) } } + +/// A PostMigrationAnnouncer provides a callback that informs other components +/// in the system. For example, network devices send out RARP packets to update +/// the MAC to port mappings of switches. +pub trait PostMigrationAnnouncer: Send { + /// Announces that a migration _might_ have occurred. + /// Implementers need to assume that the announcement can be + /// scheduled to run some time after a migration has occurred and + /// that it might even be executed when no migration has happened. 
+ fn announce(&mut self); +} diff --git a/virtio-devices/src/lib.rs b/virtio-devices/src/lib.rs index 6ac3977982..15af1b6126 100644 --- a/virtio-devices/src/lib.rs +++ b/virtio-devices/src/lib.rs @@ -43,8 +43,8 @@ pub use self::balloon::Balloon; pub use self::block::{Block, BlockState}; pub use self::console::{Console, ConsoleResizer, Endpoint}; pub use self::device::{ - ActivationContext, DmaRemapping, VirtioCommon, VirtioDevice, VirtioInterrupt, - VirtioInterruptType, VirtioSharedMemoryList, + ActivationContext, DmaRemapping, PostMigrationAnnouncer, VirtioCommon, VirtioDevice, + VirtioInterrupt, VirtioInterruptType, VirtioSharedMemoryList, }; pub use self::epoll_helper::{ EPOLL_HELPER_EVENT_LAST, EpollHelper, EpollHelperError, EpollHelperHandler, diff --git a/virtio-devices/src/net.rs b/virtio-devices/src/net.rs index e8d1af1e50..f1fba651fe 100644 --- a/virtio-devices/src/net.rs +++ b/virtio-devices/src/net.rs @@ -10,18 +10,19 @@ use std::net::IpAddr; use std::num::Wrapping; use std::ops::Deref; use std::os::unix::io::{AsRawFd, RawFd}; -use std::sync::atomic::{AtomicBool, AtomicU8, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU8, AtomicU64, Ordering}; use std::sync::{Arc, Barrier}; use std::{result, thread}; use anyhow::anyhow; use event_monitor::event; -use log::{debug, error, info, warn}; +use log::{debug, error, info, trace, warn}; #[cfg(not(fuzzing))] use net_util::virtio_features_to_tap_offload; use net_util::{ - CtrlQueue, MacAddr, NetCounters, NetQueuePair, OpenTapError, RxVirtio, Tap, TapError, TxVirtio, - VirtioNetConfig, build_net_config_space, build_net_config_space_with_mq, open_tap, + CtrlQueue, MAC_ADDR_LEN, MacAddr, NetCounters, NetQueuePair, OpenTapError, RxVirtio, Tap, + TapError, TxVirtio, VirtioNetConfig, build_net_config_space, build_net_config_space_with_mq, + open_tap, vnet_hdr_len, }; use seccompiler::SeccompAction; use serde::{Deserialize, Serialize}; @@ -40,6 +41,7 @@ use super::{ EpollHelperHandler, Error as DeviceError, 
RateLimiterConfig, VirtioCommon, VirtioDevice, VirtioDeviceType, VirtioInterruptType, }; +use crate::device::PostMigrationAnnouncer; use crate::seccomp_filters::Thread; use crate::thread_helper::spawn_virtio_thread; use crate::{GuestMemoryMmap, VirtioInterrupt}; @@ -252,9 +254,9 @@ impl NetEpollHandler { if res { self.signal_used_queue(self.queue_index_base)?; - debug!("Signalling RX queue"); + trace!("Signalling RX queue"); } else { - debug!("Not signalling RX queue"); + trace!("Not signalling RX queue"); } Ok(()) } @@ -400,6 +402,12 @@ pub struct Net { id: String, taps: Vec, config: VirtioNetConfig, + /// Tracks whether the guest still needs to acknowledge a post-migration + /// announce request through the control queue. + announce_pending: Arc, + /// Generation counter used to invalidate active announcers created before a + /// reset or device teardown, so they stop sending notifications. + announce_generation: Arc, ctrl_queue_epoll_thread: Option>, counters: NetCounters, seccomp_action: SeccompAction, @@ -409,14 +417,115 @@ pub struct Net { } #[derive(Serialize, Deserialize)] +/// Serialized snapshot of the device state. The fields are copied from the +/// live device when snapshotting and restored back into a new device instance. +/// +/// Fields not present in previous versions are tagged with `#[serde(default)]` +/// to allow deserialization if the field is not present. pub struct NetState { pub avail_features: u64, pub acked_features: u64, pub config: VirtioNetConfig, + #[serde(default)] + pub announce_pending: bool, pub queue_size: Vec, } +// Minimum length of an ethernet frame. This size omits the FCS/CRC (frame check +// sequence), which will be added by the hardware. This size can also be found +// in the Linux kernel's UAPI headers. +const ETH_FRAME_LEN: usize = 60; + +/// Constructor-time copy of the fields needed to initialize the live device +/// state, derived either from a restored NetState or from fresh defaults. 
+struct NetConstructorState { + avail_features: u64, + acked_features: u64, + config: VirtioNetConfig, + announce_pending: bool, + queue_sizes: Vec, + paused: bool, +} + impl Net { + /// Restores a [`NetConstructorState`] from the provided [`NetState`]. + fn restored_constructor_state(id: &str, state: NetState) -> NetConstructorState { + info!("Restoring virtio-net {id}"); + + NetConstructorState { + avail_features: state.avail_features, + acked_features: state.acked_features, + config: state.config, + announce_pending: state.announce_pending, + queue_sizes: state.queue_size, + paused: true, + } + } + + #[allow(clippy::too_many_arguments)] + /// Creates a new [`NetConstructorState`]. + fn fresh_constructor_state( + guest_mac: Option, + access_platform_enabled: bool, + mtu: Option, + num_queues: usize, + queue_size: u16, + offload_tso: bool, + offload_ufo: bool, + offload_csum: bool, + ) -> NetConstructorState { + let mut avail_features = (1 << VIRTIO_RING_F_EVENT_IDX) | (1 << VIRTIO_F_VERSION_1); + + if mtu.is_some() { + avail_features |= 1 << VIRTIO_NET_F_MTU; + } + + if access_platform_enabled { + avail_features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; + } + + // Configure TSO/UFO features when hardware checksum offload is enabled. 
+ if offload_csum { + avail_features |= (1 << VIRTIO_NET_F_CSUM) + | (1 << VIRTIO_NET_F_GUEST_CSUM) + | (1 << VIRTIO_NET_F_CTRL_GUEST_OFFLOADS); + + if offload_tso { + avail_features |= (1 << VIRTIO_NET_F_HOST_ECN) + | (1 << VIRTIO_NET_F_HOST_TSO4) + | (1 << VIRTIO_NET_F_HOST_TSO6) + | (1 << VIRTIO_NET_F_GUEST_ECN) + | (1 << VIRTIO_NET_F_GUEST_TSO4) + | (1 << VIRTIO_NET_F_GUEST_TSO6); + } + + if offload_ufo { + avail_features |= (1 << VIRTIO_NET_F_HOST_UFO) | (1 << VIRTIO_NET_F_GUEST_UFO); + } + } + + avail_features |= 1 << VIRTIO_NET_F_CTRL_VQ; + avail_features |= 1 << VIRTIO_NET_F_STATUS; + avail_features |= 1 << VIRTIO_NET_F_GUEST_ANNOUNCE; + let queue_num = num_queues + 1; + + let mut config = VirtioNetConfig::default(); + if let Some(mac) = guest_mac { + build_net_config_space(&mut config, mac, num_queues, mtu, &mut avail_features); + } else { + build_net_config_space_with_mq(&mut config, num_queues, mtu, &mut avail_features); + } + + NetConstructorState { + avail_features, + acked_features: 0, + config, + announce_pending: false, + queue_sizes: vec![queue_size; queue_num], + paused: false, + } + } + /// Create a new virtio network device with the given TAP interface. #[allow(clippy::too_many_arguments)] pub fn new_with_tap( @@ -445,81 +554,37 @@ impl Net { } }; - let (avail_features, acked_features, config, queue_sizes, paused) = if let Some(state) = - state - { - info!("Restoring virtio-net {id}"); - ( - state.avail_features, - state.acked_features, - state.config, - state.queue_size, - true, - ) + let constructor_state = if let Some(state) = state { + Self::restored_constructor_state(&id, state) } else { - let mut avail_features = (1 << VIRTIO_RING_F_EVENT_IDX) | (1 << VIRTIO_F_VERSION_1); - - if mtu.is_some() { - avail_features |= 1 << VIRTIO_NET_F_MTU; - } - - if access_platform_enabled { - avail_features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; - } - - // Configure TSO/UFO features when hardware checksum offload is enabled. 
- if offload_csum { - avail_features |= (1 << VIRTIO_NET_F_CSUM) - | (1 << VIRTIO_NET_F_GUEST_CSUM) - | (1 << VIRTIO_NET_F_CTRL_GUEST_OFFLOADS); - - if offload_tso { - avail_features |= (1 << VIRTIO_NET_F_HOST_ECN) - | (1 << VIRTIO_NET_F_HOST_TSO4) - | (1 << VIRTIO_NET_F_HOST_TSO6) - | (1 << VIRTIO_NET_F_GUEST_ECN) - | (1 << VIRTIO_NET_F_GUEST_TSO4) - | (1 << VIRTIO_NET_F_GUEST_TSO6); - } - - if offload_ufo { - avail_features |= (1 << VIRTIO_NET_F_HOST_UFO) | (1 << VIRTIO_NET_F_GUEST_UFO); - } - } - - avail_features |= 1 << VIRTIO_NET_F_CTRL_VQ; - let queue_num = num_queues + 1; - - let mut config = VirtioNetConfig::default(); - if let Some(mac) = guest_mac { - build_net_config_space(&mut config, mac, num_queues, mtu, &mut avail_features); - } else { - build_net_config_space_with_mq(&mut config, num_queues, mtu, &mut avail_features); - } - - ( - avail_features, - 0, - config, - vec![queue_size; queue_num], - false, + Self::fresh_constructor_state( + guest_mac, + access_platform_enabled, + mtu, + num_queues, + queue_size, + offload_tso, + offload_ufo, + offload_csum, ) }; Ok(Net { common: VirtioCommon { device_type: VirtioDeviceType::Net as u32, - avail_features, - acked_features, - queue_sizes, + avail_features: constructor_state.avail_features, + acked_features: constructor_state.acked_features, + queue_sizes: constructor_state.queue_sizes, paused_sync: Some(Arc::new(Barrier::new((num_queues / 2) + 1))), min_queues: 2, - paused: Arc::new(AtomicBool::new(paused)), + paused: Arc::new(AtomicBool::new(constructor_state.paused)), ..Default::default() }, id, taps, - config, + config: constructor_state.config, + announce_pending: Arc::new(AtomicBool::new(constructor_state.announce_pending)), + announce_generation: Arc::new(AtomicU64::new(0)), ctrl_queue_epoll_thread: None, counters: NetCounters::default(), seccomp_action, @@ -601,11 +666,12 @@ impl Net { for fd in fds.iter() { // Duplicate so that it can survive reboots // SAFETY: FFI call to dup. Trivially safe. 
- let fd = unsafe { libc::dup(*fd) }; - if fd < 0 { + let fd_duped = unsafe { libc::dup(*fd) }; + if fd_duped < 0 { return Err(Error::DuplicateTapFd(std::io::Error::last_os_error())); } - let tap = Tap::from_tap_fd(fd, num_queue_pairs).map_err(Error::TapError)?; + debug!("dup'ed fd {fd} => {fd_duped} for virtio-net device {id}"); + let tap = Tap::from_tap_fd(fd_duped, num_queue_pairs).map_err(Error::TapError)?; taps.push(tap); } @@ -637,18 +703,108 @@ impl Net { avail_features: self.common.avail_features, acked_features: self.common.acked_features, config: self.config, + announce_pending: self.announce_pending.load(Ordering::Acquire), queue_size: self.common.queue_sizes.clone(), } } + /// Return the guest-visible virtio-net config, recomputing `status` from the + /// current state of the device. + fn config_with_status(&self) -> VirtioNetConfig { + let mut config = self.config; + + // We want to recompute the guest-visible status field from the current state of + // the device. We clear this field first to avoid showing stale data. + config.status = 0; + + if self.common.feature_acked(VIRTIO_NET_F_STATUS.into()) { + config.status |= VIRTIO_NET_S_LINK_UP as u16; + + if self.announce_pending.load(Ordering::Acquire) { + config.status |= VIRTIO_NET_S_ANNOUNCE as u16; + } + } + + config + } + #[cfg(fuzzing)] pub fn wait_for_epoll_threads(&mut self) { self.common.wait_for_epoll_threads(); } + + // Builds a reverse ARP packet with this device's MAC address. 
+ fn build_rarp_announce(&self) -> [u8; ETH_FRAME_LEN] { + const ETH_P_RARP: u16 = 0x8035; // Ethertype RARP + const ARP_HTYPE_ETH: u16 = 0x1; // Hardware type Ethernet + const ARP_PTYPE_IP: u16 = 0x0800; // Protocol type IPv4 + const ARP_OP_REQUEST_REV: u16 = 0x0003; // RARP Request opcode + + const IPV4_ADDR_LENGTH: usize = 4; // Size of an IPv4 address + + let mut buf = [0u8; ETH_FRAME_LEN]; + + // Ethernet header + buf[0..6].copy_from_slice(&[0xff; MAC_ADDR_LEN]); // This is a broadcast + buf[6..12].copy_from_slice(&self.config.mac); // Src is this NIC + buf[12..14].copy_from_slice(Ð_P_RARP.to_be_bytes()); // This is a RARP packet + + // ARP Header + buf[14..16].copy_from_slice(&ARP_HTYPE_ETH.to_be_bytes()); + buf[16..18].copy_from_slice(&ARP_PTYPE_IP.to_be_bytes()); + buf[18] = MAC_ADDR_LEN as u8; // Hardware address length (ethernet) + buf[19] = IPV4_ADDR_LENGTH as u8; // Protocol address length (IPv4) + // This is a "fake RARP" packet, we don't want to perform a real RARP lookup. + // Thus the content of the next fields is largely irrelevant. Setting source + // hardware address = target hardware address is fine according to RFC 903. + buf[20..22].copy_from_slice(&ARP_OP_REQUEST_REV.to_be_bytes()); + buf[22..28].copy_from_slice(&self.config.mac); // Source hardware address + buf[28..32].copy_from_slice(&[0x00; IPV4_ADDR_LENGTH]); // Source protocol address + buf[32..38].copy_from_slice(&self.config.mac); // Target hardware address + buf[38..42].copy_from_slice(&[0x00; IPV4_ADDR_LENGTH]); // Target protocol address + + buf + } + + /// Re-notify the guest about a restored pending ANNOUNCE request once the + /// transport has installed an interrupt callback during activation. 
+ fn notify_pending_guest_announce(&self) { + if self.announce_pending.load(Ordering::Acquire) + && self + .common + .feature_acked(VIRTIO_NET_F_GUEST_ANNOUNCE.into()) + && let Some(interrupt_cb) = &self.common.interrupt_cb + { + interrupt_cb + .trigger(VirtioInterruptType::Config) + .inspect_err(|e| { + warn!( + "Unable to resend pending announce interrupt for virtio-net device {}: {e}", + self.id + ); + }) + .ok(); + } + } } impl Drop for Net { fn drop(&mut self) { + self.announce_generation.fetch_add(1, Ordering::AcqRel); + + // Get a comma-separated list of the interface names of the tap devices + // associated with this network device. + let ifnames_str = self + .taps + .iter() + .map(|tap| tap.if_name_as_str()) + .collect::>(); + let ifnames_str = ifnames_str.join(","); + debug!( + "virtio-net device closed: id={}, ifnames=[{ifnames_str}]", + self.id + ); + if let Some(kill_evt) = self.common.kill_evt.take() { // Ignore the result because there is nothing we can do about it. let _ = kill_evt.write(1); @@ -681,7 +837,8 @@ impl VirtioDevice for Net { } fn read_config(&self, offset: u64, data: &mut [u8]) { - self.read_config_from_slice(self.config.as_slice(), offset, data); + let config = self.config_with_status(); + self.read_config_from_slice(config.as_slice(), offset, data); } fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { @@ -715,7 +872,7 @@ impl VirtioDevice for Net { mem: mem.clone(), kill_evt, pause_evt, - ctrl_q: CtrlQueue::new(self.taps.clone()), + ctrl_q: CtrlQueue::new(self.taps.clone(), Arc::clone(&self.announce_pending)), queue: ctrl_queue, queue_evt: ctrl_queue_evt, access_platform: self.common.access_platform(), @@ -819,6 +976,7 @@ impl VirtioDevice for Net { } self.common.epoll_threads = Some(epoll_threads); + self.notify_pending_guest_announce(); event!("virtio-device", "activated", "id", &self.id); Ok(()) @@ -826,6 +984,8 @@ impl VirtioDevice for Net { fn reset(&mut self) { self.common.reset(); + 
self.announce_pending.store(false, Ordering::Release); + self.announce_generation.fetch_add(1, Ordering::AcqRel); event!("virtio-device", "reset", "id", &self.id); } @@ -859,6 +1019,10 @@ impl VirtioDevice for Net { fn access_platform(&self) -> Option> { self.common.access_platform() } + + fn post_migration_announcer(&self) -> Option> { + Some(Box::new(VirtioNetPostMigrationAnnouncer::new(self))) + } } impl Pausable for Net { @@ -887,3 +1051,317 @@ impl Snapshottable for Net { } impl Transportable for Net {} impl Migratable for Net {} + +/// Announces this virtio-net device on the network. +/// Most fields are cloned references to device state so retry rounds can run +/// without borrowing the device itself. +pub struct VirtioNetPostMigrationAnnouncer { + id: String, + /// Remembers whether this device negotiated the guest-visible announce path. + guest_announce_negotiated: bool, + announce_pending: Arc, + announce_generation: Arc, + /// Captures the announce generation at creation time to invalidate stale + /// retry sessions after reset or teardown. + generation: u64, + interrupt_cb: Option>, + /// Prebuilt host-side RARP payload used for immediate post-migration + /// announcement retries. 
+ rarp_announce: [u8; ETH_FRAME_LEN], + taps: Vec, +} + +impl VirtioNetPostMigrationAnnouncer { + pub fn new(dev: &Net) -> Self { + Self { + id: dev.id.clone(), + guest_announce_negotiated: dev.common.feature_acked(VIRTIO_NET_F_GUEST_ANNOUNCE.into()), + announce_pending: Arc::clone(&dev.announce_pending), + announce_generation: Arc::clone(&dev.announce_generation), + generation: dev.announce_generation.load(Ordering::Acquire), + interrupt_cb: dev.common.interrupt_cb.clone(), + rarp_announce: dev.build_rarp_announce(), + taps: dev.taps.clone(), + } + } +} + +impl PostMigrationAnnouncer for VirtioNetPostMigrationAnnouncer { + // Send a host-side RARP immediately so the network can converge before the + // guest runs again, and then also ask the guest to re-announce itself when + // GUEST_ANNOUNCE was negotiated. + fn announce(&mut self) { + // If the announce generations don't match, we don't send any announcements. + if self.announce_generation.load(Ordering::Acquire) != self.generation { + return; + } + + // We have to add a virtio-net header to the announce. + let mut buf = vec![0u8; vnet_hdr_len() + self.rarp_announce.len()]; + buf[vnet_hdr_len()..].copy_from_slice(&self.rarp_announce); + + for tap in &self.taps { + // SAFETY: `buf.as_ptr()` is valid for `buf.len()` bytes and remains + // valid until the syscall returns. `tap.as_raw_fd()` is a valid TAP fd. 
+ let _ = unsafe { + libc::write( + tap.as_raw_fd(), + buf.as_ptr() as *const libc::c_void, + buf.len(), + ) + }; + } + + if self.guest_announce_negotiated + && let Some(interrupt_cb) = &self.interrupt_cb + { + self.announce_pending.store(true, Ordering::Release); + + interrupt_cb + .trigger(VirtioInterruptType::Config) + .inspect_err(|e| { + warn!( + "Unable to send interrupt for virtio-net device {}: {e}", + self.id + ); + }) + .ok(); + } + } +} + +#[cfg(test)] +mod unit_tests { + use std::mem::size_of; + use std::sync::Arc; + use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; + + use seccompiler::SeccompAction; + use virtio_bindings::virtio_net::{ + VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_STATUS, VIRTIO_NET_S_ANNOUNCE, + VIRTIO_NET_S_LINK_UP, + }; + use vmm_sys_util::eventfd::EventFd; + + use super::*; + use crate::device::{VirtioInterrupt, VirtioInterruptType}; + + struct TestInterrupt { + config_count: AtomicUsize, + } + + impl TestInterrupt { + fn new() -> Self { + Self { + config_count: AtomicUsize::new(0), + } + } + } + + impl VirtioInterrupt for TestInterrupt { + fn trigger( + &self, + int_type: VirtioInterruptType, + ) -> std::result::Result<(), std::io::Error> { + if matches!(int_type, VirtioInterruptType::Config) { + self.config_count.fetch_add(1, Ordering::AcqRel); + } + Ok(()) + } + + fn set_notifier( + &self, + _int_type: u32, + _notifier: Option, + _vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()> { + unimplemented!() + } + } + + fn test_net(acked_features: u64, interrupt_cb: Option>) -> Net { + Net { + common: VirtioCommon { + acked_features, + interrupt_cb, + ..Default::default() + }, + id: "test-net".to_string(), + taps: Vec::new(), + config: VirtioNetConfig::default(), + announce_pending: Arc::new(AtomicBool::new(false)), + announce_generation: Arc::new(AtomicU64::new(0)), + ctrl_queue_epoll_thread: None, + counters: NetCounters::default(), + seccomp_action: SeccompAction::Allow, + rate_limiter_config: None, + exit_evt: 
EventFd::new(libc::EFD_NONBLOCK).unwrap(), + device_status: Arc::new(AtomicU8::new(0)), + } + } + + const STATUS_OFFSET: usize = std::mem::offset_of!(VirtioNetConfig, status); + fn read_status(device: &Net) -> u16 { + let mut data = vec![0; size_of::()]; + device.read_config(0, &mut data); + + u16::from_le_bytes( + data[STATUS_OFFSET..STATUS_OFFSET + size_of::()] + .try_into() + .unwrap(), + ) + } + + #[test] + fn test_fresh_constructor_state_exposes_status() { + let state = + Net::fresh_constructor_state(None, false, Some(MIN_MTU), 2, 256, false, false, false); + + assert_ne!(state.avail_features & (1 << VIRTIO_NET_F_STATUS), 0); + } + + #[test] + fn test_status_feature_reports_link_up() { + let net = test_net(1 << VIRTIO_NET_F_STATUS, None); + + assert_eq!(read_status(&net), VIRTIO_NET_S_LINK_UP as u16); + } + + #[test] + fn test_post_migration_sets_announce_and_triggers_config() { + let interrupt = Arc::new(TestInterrupt::new()); + let net = test_net( + (1 << VIRTIO_NET_F_GUEST_ANNOUNCE) | (1 << VIRTIO_NET_F_STATUS), + Some(interrupt.clone() as Arc), + ); + + net.post_migration_announcer().unwrap().announce(); + + assert!(net.announce_pending.load(Ordering::Acquire)); + assert_ne!(read_status(&net) & VIRTIO_NET_S_ANNOUNCE as u16, 0); + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 1); + } + + #[test] + fn test_post_migration_without_feature_is_noop() { + let interrupt = Arc::new(TestInterrupt::new()); + let net = test_net(0, Some(interrupt.clone() as Arc)); + + net.post_migration_announcer().unwrap().announce(); + + assert!(!net.announce_pending.load(Ordering::Acquire)); + assert_eq!(read_status(&net) & VIRTIO_NET_S_ANNOUNCE as u16, 0); + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 0); + } + + #[test] + fn test_restored_pending_announce_retriggers_config_interrupt() { + let interrupt = Arc::new(TestInterrupt::new()); + let net = test_net( + (1 << VIRTIO_NET_F_GUEST_ANNOUNCE) | (1 << VIRTIO_NET_F_STATUS), + Some(interrupt.clone() as 
Arc), + ); + net.announce_pending.store(true, Ordering::Release); + + net.notify_pending_guest_announce(); + + assert!(net.announce_pending.load(Ordering::Acquire)); + assert_ne!(read_status(&net) & VIRTIO_NET_S_ANNOUNCE as u16, 0); + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 1); + } + + #[test] + fn test_post_migration_retries_retrigger_config_interrupt() { + let interrupt = Arc::new(TestInterrupt::new()); + let net = test_net( + (1 << VIRTIO_NET_F_GUEST_ANNOUNCE) | (1 << VIRTIO_NET_F_STATUS), + Some(interrupt.clone() as Arc), + ); + let mut announcer = net.post_migration_announcer().unwrap(); + + announcer.announce(); + announcer.announce(); + + assert!(net.announce_pending.load(Ordering::Acquire)); + assert_ne!(read_status(&net) & VIRTIO_NET_S_ANNOUNCE as u16, 0); + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 2); + } + + #[test] + fn test_reset_clears_pending_announce() { + let interrupt = Arc::new(TestInterrupt::new()); + let mut net = test_net( + (1 << VIRTIO_NET_F_GUEST_ANNOUNCE) | (1 << VIRTIO_NET_F_STATUS), + Some(interrupt.clone() as Arc), + ); + + net.post_migration_announcer().unwrap().announce(); + assert!(net.announce_pending.load(Ordering::Acquire)); + + net.reset(); + + assert!(!net.announce_pending.load(Ordering::Acquire)); + assert_eq!(read_status(&net) & VIRTIO_NET_S_ANNOUNCE as u16, 0); + } + + #[test] + fn test_reset_invalidates_old_announcer() { + let interrupt = Arc::new(TestInterrupt::new()); + let mut net = test_net( + 1 << VIRTIO_NET_F_GUEST_ANNOUNCE, + Some(interrupt.clone() as Arc), + ); + let mut announcer = net.post_migration_announcer().unwrap(); + + announcer.announce(); + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 1); + + net.reset(); + announcer.announce(); + + assert!(!net.announce_pending.load(Ordering::Acquire)); + assert_eq!(read_status(&net) & VIRTIO_NET_S_ANNOUNCE as u16, 0); + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 1); + } + + #[test] + fn 
test_drop_invalidates_old_announcer() { + let interrupt = Arc::new(TestInterrupt::new()); + let mut announcer = { + let net = test_net( + 1 << VIRTIO_NET_F_GUEST_ANNOUNCE, + Some(interrupt.clone() as Arc), + ); + let mut announcer = net.post_migration_announcer().unwrap(); + + announcer.announce(); + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 1); + + announcer + }; + + announcer.announce(); + + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 1); + } + + #[test] + fn test_net_state_deserialize_without_announce_pending_defaults_to_false() { + // Older snapshots do not contain announce_pending. Restoring them on a + // newer binary must treat the missing field as "no announce pending". + let state = NetState { + avail_features: 1, + acked_features: 2, + config: VirtioNetConfig::default(), + announce_pending: true, + queue_size: vec![256, 256], + }; + let mut value = serde_json::to_value(state).unwrap(); + + value.as_object_mut().unwrap().remove("announce_pending"); + + let restored: NetState = serde_json::from_value(value).unwrap(); + + assert!(!restored.announce_pending); + } +} diff --git a/virtio-devices/src/transport/pci_common_config.rs b/virtio-devices/src/transport/pci_common_config.rs index 98ea81392c..0ba1175007 100644 --- a/virtio-devices/src/transport/pci_common_config.rs +++ b/virtio-devices/src/transport/pci_common_config.rs @@ -10,7 +10,7 @@ use std::sync::atomic::{AtomicBool, AtomicU8, AtomicU16, Ordering}; use std::sync::{Arc, Mutex}; use byteorder::{ByteOrder, LittleEndian}; -use log::{debug, error, warn}; +use log::{debug, error, trace, warn}; use serde::{Deserialize, Serialize}; use virtio_queue::{Queue, QueueT}; use vm_migration::{MigratableError, Pausable, Snapshot, Snapshottable}; @@ -251,7 +251,7 @@ impl VirtioPciCommonConfig { } fn read_common_config_word(&self, offset: u64, queues: &[Queue]) -> u16 { - debug!("read_common_config_word: offset 0x{offset:x}"); + trace!("read_common_config_word: offset 0x{offset:x}"); 
match offset { 0x10 => self.msix_config.load(Ordering::Acquire), 0x12 => queues.len() as u16, // num_queues diff --git a/virtio-devices/src/vhost_user/net.rs b/virtio-devices/src/vhost_user/net.rs index eed7c8284d..1017df21da 100644 --- a/virtio-devices/src/vhost_user/net.rs +++ b/virtio-devices/src/vhost_user/net.rs @@ -1,20 +1,21 @@ // Copyright 2019 Intel Corporation. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 -use std::sync::atomic::AtomicBool; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; use std::sync::{Arc, Barrier, Mutex}; use std::{result, thread}; -use log::{error, info}; +use log::{error, info, warn}; use net_util::{CtrlQueue, MacAddr, VirtioNetConfig, build_net_config_space}; use seccompiler::SeccompAction; use vhost::vhost_user::message::{VhostUserProtocolFeatures, VhostUserVirtioFeatures}; use vhost::vhost_user::{FrontendReqHandler, VhostUserFrontend, VhostUserFrontendReqHandler}; use virtio_bindings::virtio_net::{ - VIRTIO_NET_F_CSUM, VIRTIO_NET_F_CTRL_VQ, VIRTIO_NET_F_GUEST_CSUM, VIRTIO_NET_F_GUEST_ECN, - VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, VIRTIO_NET_F_GUEST_UFO, - VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_TSO6, VIRTIO_NET_F_HOST_UFO, - VIRTIO_NET_F_MAC, VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_MTU, + VIRTIO_NET_F_CSUM, VIRTIO_NET_F_CTRL_VQ, VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_GUEST_CSUM, + VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, + VIRTIO_NET_F_GUEST_UFO, VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_TSO6, + VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_MAC, VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_MTU, + VIRTIO_NET_F_STATUS, VIRTIO_NET_S_ANNOUNCE, VIRTIO_NET_S_LINK_UP, }; use virtio_bindings::virtio_ring::VIRTIO_RING_F_EVENT_IDX; use virtio_queue::QueueT; @@ -28,8 +29,8 @@ use crate::thread_helper::spawn_virtio_thread; use crate::vhost_user::vu_common_ctrl::{VhostUserConfig, VhostUserHandle}; use crate::vhost_user::{DEFAULT_VIRTIO_FEATURES, 
Error, Result, VhostUserCommon, VhostUserState}; use crate::{ - ActivateResult, GuestMemoryMmap, GuestRegionMmap, NetCtrlEpollHandler, - VIRTIO_F_ACCESS_PLATFORM, VirtioCommon, VirtioDevice, VirtioDeviceType, + ActivateResult, GuestMemoryMmap, GuestRegionMmap, NetCtrlEpollHandler, PostMigrationAnnouncer, + VIRTIO_F_ACCESS_PLATFORM, VirtioCommon, VirtioDevice, VirtioDeviceType, VirtioInterrupt, }; const DEFAULT_QUEUE_NUMBER: usize = 2; @@ -43,6 +44,12 @@ pub struct Net { vu_common: VhostUserCommon, id: String, config: VirtioNetConfig, + /// Tracks whether the guest still needs to acknowledge a post-migration + /// announce request through the control queue. + announce_pending: Arc, + /// Generation counter used to invalidate active announcers created before a + /// reset or device teardown, so they stop sending notifications. + announce_generation: Arc, guest_memory: Option>, ctrl_queue_epoll_thread: Option>, seccomp_action: SeccompAction, @@ -51,6 +58,23 @@ pub struct Net { } impl Net { + /// Derive the guest-visible feature set from the backend-negotiated + /// features plus frontend-only bits that Cloud Hypervisor implements + /// locally, such as `VIRTIO_NET_F_MAC`, `VIRTIO_NET_F_STATUS`, and + /// `VIRTIO_NET_F_GUEST_ANNOUNCE`. + fn frontend_avail_features(backend_acked_features: u64) -> u64 { + let mut guest_avail_features = backend_acked_features | (1 << VIRTIO_NET_F_MAC); + + // Guest announce is implemented by the frontend through config + // changes and the locally handled control queue. 
+ if guest_avail_features & (1 << VIRTIO_NET_F_CTRL_VQ) != 0 { + guest_avail_features |= 1 << VIRTIO_NET_F_STATUS; + guest_avail_features |= 1 << VIRTIO_NET_F_GUEST_ANNOUNCE; + } + + guest_avail_features + } + /// Create a new vhost-user-net device #[allow(clippy::too_many_arguments)] pub fn new( @@ -83,15 +107,18 @@ impl Net { acked_protocol_features, vu_num_queues, config, + announce_pending, paused, vring_bases, ) = if let Some(state) = state { info!("Restoring vhost-user-net {id}"); - // The backend acknowledged features must not contain - // VIRTIO_NET_F_MAC since we don't expect the backend - // to handle it. - let backend_acked_features = state.acked_features & !(1 << VIRTIO_NET_F_MAC); + // The backend acknowledged features must not contain frontend-only + // bits since we don't expect the backend to handle them. + let backend_acked_features = state.acked_features + & !((1 << VIRTIO_NET_F_MAC) + | (1 << VIRTIO_NET_F_STATUS) + | (1 << VIRTIO_NET_F_GUEST_ANNOUNCE)); vu.set_protocol_features_vhost_user( backend_acked_features, @@ -106,12 +133,15 @@ impl Net { num_queues += 1; } + let announce_pending = (state.config.status & (VIRTIO_NET_S_ANNOUNCE as u16)) != 0; + ( state.avail_features, state.acked_features, state.acked_protocol_features, state.vu_num_queues, state.config, + announce_pending, true, state.vring_bases, ) @@ -119,6 +149,7 @@ impl Net { // Filling device and vring features VMM supports. 
let mut avail_features = (1 << VIRTIO_NET_F_MRG_RXBUF) | (1 << VIRTIO_NET_F_CTRL_VQ) + | (1 << VIRTIO_NET_F_GUEST_ANNOUNCE) | DEFAULT_VIRTIO_FEATURES; if mtu.is_some() { @@ -153,7 +184,7 @@ impl Net { | VhostUserProtocolFeatures::LOG_SHMFD | VhostUserProtocolFeatures::DEVICE_STATE; - let (mut acked_features, acked_protocol_features) = + let (acked_features, acked_protocol_features) = vu.negotiate_features_vhost_user(avail_features, avail_protocol_features)?; let backend_num_queues = @@ -179,12 +210,12 @@ impl Net { num_queues += 1; } - // Make sure the virtio feature to set the MAC address is exposed to - // the guest, even if it hasn't been negotiated with the backend. - acked_features |= 1 << VIRTIO_NET_F_MAC; + // Build the feature set that gets exposed to the guest. Some frontend available + // features are dependent on the features the backend supports. + let guest_avail_features = Self::frontend_avail_features(acked_features); ( - acked_features, + guest_avail_features, // If part of the available features that have been acked, // the PROTOCOL_FEATURES bit must be already set through // the VIRTIO acked features as we know the guest would @@ -194,6 +225,7 @@ impl Net { vu_num_queues, config, false, + false, None, ) }; @@ -220,6 +252,8 @@ impl Net { ..Default::default() }, config, + announce_pending: Arc::new(AtomicBool::new(announce_pending)), + announce_generation: Arc::new(AtomicU64::new(0)), guest_memory: None, ctrl_queue_epoll_thread: None, seccomp_action, @@ -229,13 +263,60 @@ impl Net { } fn state(&self) -> std::result::Result { - self.vu_common.state(self.config) + self.vu_common.state(self.config_with_status()) + } + + /// Return the guest-visible virtio-net config, recomputing `status` from the + /// current state of the device. + fn config_with_status(&self) -> VirtioNetConfig { + let mut config = self.config; + + // We want to recompute the guest-visible status field from the current state of + // the device. 
We clear this field first to avoid showing stale data. + config.status = 0; + + if self + .vu_common + .virtio_common + .feature_acked(VIRTIO_NET_F_STATUS.into()) + { + config.status |= VIRTIO_NET_S_LINK_UP as u16; + + if self.announce_pending.load(Ordering::Acquire) { + config.status |= VIRTIO_NET_S_ANNOUNCE as u16; + } + } + + config + } + + /// Re-notify the guest about a restored pending ANNOUNCE request once the + /// transport has installed an interrupt callback during activation. + fn notify_pending_guest_announce(&self) { + if self.announce_pending.load(Ordering::Acquire) + && self + .vu_common + .virtio_common + .feature_acked(VIRTIO_NET_F_GUEST_ANNOUNCE.into()) + && let Some(interrupt_cb) = &self.vu_common.virtio_common.interrupt_cb + { + interrupt_cb + .trigger(crate::VirtioInterruptType::Config) + .inspect_err(|e| { + warn!( + "Unable to resend pending announce interrupt for virtio-net device {}: {e}", + self.id + ); + }) + .ok(); + } } } impl Drop for Net { fn drop(&mut self) { self.vu_common.shutdown(); + self.announce_generation.fetch_add(1, Ordering::AcqRel); if let Some(thread) = self.ctrl_queue_epoll_thread.take() && let Err(e) = thread.join() @@ -267,7 +348,8 @@ impl VirtioDevice for Net { } fn read_config(&self, offset: u64, data: &mut [u8]) { - self.read_config_from_slice(self.config.as_slice(), offset, data); + let config = self.config_with_status(); + self.read_config_from_slice(config.as_slice(), offset, data); } fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { @@ -304,7 +386,7 @@ impl VirtioDevice for Net { mem: mem.clone(), kill_evt, pause_evt, - ctrl_q: CtrlQueue::new(Vec::new()), + ctrl_q: CtrlQueue::new(Vec::new(), Arc::clone(&self.announce_pending)), queue: ctrl_queue, queue_evt: ctrl_queue_evt, access_platform: None, @@ -335,10 +417,12 @@ impl VirtioDevice for Net { let backend_req_handler: Option> = None; - // The backend acknowledged features must not contain VIRTIO_NET_F_MAC - // since we don't 
expect the backend to handle it. - let backend_acked_features = - self.vu_common.virtio_common.acked_features & !(1 << VIRTIO_NET_F_MAC); + // The backend acknowledged features must not contain frontend-only + // features since we don't expect the backend to handle them. + let backend_acked_features = self.vu_common.virtio_common.acked_features + & !((1 << VIRTIO_NET_F_MAC) + | (1 << VIRTIO_NET_F_STATUS) + | (1 << VIRTIO_NET_F_GUEST_ANNOUNCE)); // Run a dedicated thread for handling potential reconnections with // the backend. @@ -370,17 +454,24 @@ impl VirtioDevice for Net { )?; self.vu_common.epoll_thread = Some(epoll_threads.remove(0)); + self.notify_pending_guest_announce(); Ok(()) } fn reset(&mut self) { self.vu_common.reset(&self.id); + self.announce_pending.store(false, Ordering::Release); + self.announce_generation.fetch_add(1, Ordering::AcqRel); } fn shutdown(&mut self) { self.vu_common.shutdown(); } + fn post_migration_announcer(&self) -> Option> { + Some(Box::new(VhostUserNetPostMigrationAnnouncer::new(self))) + } + fn add_memory_region( &mut self, region: &Arc, @@ -442,3 +533,277 @@ impl Migratable for Net { self.vu_common.complete_migration() } } + +/// Announces this vhost-user-net device on the network. +/// Most fields are cloned references to device state so retry rounds can run +/// without borrowing the device itself. +pub struct VhostUserNetPostMigrationAnnouncer { + id: String, + /// Remembers whether this device negotiated the guest-visible announce path. + guest_announce_negotiated: bool, + announce_pending: Arc, + announce_generation: Arc, + /// Captures the announce generation at creation time to invalidate stale + /// retry sessions after reset or teardown. 
+ generation: u64, + interrupt_cb: Option>, +} + +impl VhostUserNetPostMigrationAnnouncer { + pub fn new(dev: &Net) -> Self { + Self { + id: dev.id.clone(), + guest_announce_negotiated: dev + .vu_common + .virtio_common + .feature_acked(VIRTIO_NET_F_GUEST_ANNOUNCE.into()), + announce_pending: Arc::clone(&dev.announce_pending), + announce_generation: Arc::clone(&dev.announce_generation), + generation: dev.announce_generation.load(Ordering::Acquire), + interrupt_cb: dev.vu_common.virtio_common.interrupt_cb.clone(), + } + } +} + +impl PostMigrationAnnouncer for VhostUserNetPostMigrationAnnouncer { + // Vhost-user-net relies on the guest-visible announce path: mark the + // request pending and re-trigger the config interrupt while this retry + // session remains valid. + fn announce(&mut self) { + // If the announce generations don't match, we don't send any announcements. + if self.announce_generation.load(Ordering::Acquire) != self.generation { + return; + } + + if self.guest_announce_negotiated + && let Some(interrupt_cb) = &self.interrupt_cb + { + self.announce_pending.store(true, Ordering::Release); + + interrupt_cb + .trigger(crate::VirtioInterruptType::Config) + .inspect_err(|e| { + warn!( + "Unable to send interrupt for virtio-net device {}: {e}", + self.id + ); + }) + .ok(); + } + } +} +#[cfg(test)] +mod unit_tests { + use std::mem::size_of; + use std::sync::Arc; + use std::sync::atomic::{AtomicUsize, Ordering}; + + use seccompiler::SeccompAction; + use virtio_bindings::virtio_net::{ + VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_STATUS, VIRTIO_NET_S_ANNOUNCE, + VIRTIO_NET_S_LINK_UP, + }; + use vmm_sys_util::eventfd::EventFd; + + use super::*; + use crate::device::{VirtioInterrupt, VirtioInterruptType}; + + struct TestInterrupt { + config_count: AtomicUsize, + } + + impl TestInterrupt { + fn new() -> Self { + Self { + config_count: AtomicUsize::new(0), + } + } + } + + impl VirtioInterrupt for TestInterrupt { + fn trigger( + &self, + int_type: VirtioInterruptType, 
+ ) -> std::result::Result<(), std::io::Error> { + if matches!(int_type, VirtioInterruptType::Config) { + self.config_count.fetch_add(1, Ordering::AcqRel); + } + Ok(()) + } + + fn set_notifier( + &self, + _int_type: u32, + _notifier: Option, + _vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()> { + unimplemented!() + } + } + + fn test_net(acked_features: u64, interrupt_cb: Option>) -> Net { + Net { + vu_common: VhostUserCommon { + virtio_common: VirtioCommon { + acked_features, + interrupt_cb, + ..Default::default() + }, + ..Default::default() + }, + id: "test-vu-net".to_string(), + config: VirtioNetConfig::default(), + announce_pending: Arc::new(AtomicBool::new(false)), + announce_generation: Arc::new(AtomicU64::new(0)), + guest_memory: None, + ctrl_queue_epoll_thread: None, + seccomp_action: SeccompAction::Allow, + exit_evt: EventFd::new(libc::EFD_NONBLOCK).unwrap(), + access_platform_enabled: false, + } + } + + const STATUS_OFFSET: usize = std::mem::offset_of!(VirtioNetConfig, status); + fn read_status(device: &Net) -> u16 { + let mut data = vec![0; size_of::()]; + device.read_config(0, &mut data); + + u16::from_le_bytes( + data[STATUS_OFFSET..STATUS_OFFSET + size_of::()] + .try_into() + .unwrap(), + ) + } + + #[test] + fn test_status_feature_reports_link_up() { + let net = test_net(1 << VIRTIO_NET_F_STATUS, None); + + assert_eq!(read_status(&net), VIRTIO_NET_S_LINK_UP as u16); + } + + #[test] + fn test_post_migration_sets_announce_and_triggers_config() { + let interrupt = Arc::new(TestInterrupt::new()); + let net = test_net( + (1 << VIRTIO_NET_F_GUEST_ANNOUNCE) | (1 << VIRTIO_NET_F_STATUS), + Some(interrupt.clone() as Arc), + ); + + net.post_migration_announcer().unwrap().announce(); + + assert!(net.announce_pending.load(Ordering::Acquire)); + assert_ne!(read_status(&net) & VIRTIO_NET_S_ANNOUNCE as u16, 0); + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 1); + } + + #[test] + fn test_frontend_avail_features_expose_guest_announce_and_status() { + 
let avail_features = Net::frontend_avail_features(1 << VIRTIO_NET_F_CTRL_VQ); + + assert_ne!(avail_features & (1 << VIRTIO_NET_F_MAC), 0); + assert_ne!(avail_features & (1 << VIRTIO_NET_F_STATUS), 0); + assert_ne!(avail_features & (1 << VIRTIO_NET_F_GUEST_ANNOUNCE), 0); + } + + #[test] + fn test_post_migration_without_feature_is_noop() { + let interrupt = Arc::new(TestInterrupt::new()); + let net = test_net(0, Some(interrupt.clone() as Arc)); + + net.post_migration_announcer().unwrap().announce(); + + assert!(!net.announce_pending.load(Ordering::Acquire)); + assert_eq!(read_status(&net) & VIRTIO_NET_S_ANNOUNCE as u16, 0); + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 0); + } + + #[test] + fn test_restored_pending_announce_retriggers_config_interrupt() { + let interrupt = Arc::new(TestInterrupt::new()); + let net = test_net( + (1 << VIRTIO_NET_F_GUEST_ANNOUNCE) | (1 << VIRTIO_NET_F_STATUS), + Some(interrupt.clone() as Arc), + ); + net.announce_pending.store(true, Ordering::Release); + + net.notify_pending_guest_announce(); + + assert!(net.announce_pending.load(Ordering::Acquire)); + assert_ne!(read_status(&net) & VIRTIO_NET_S_ANNOUNCE as u16, 0); + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 1); + } + + #[test] + fn test_post_migration_with_ctrl_vq_but_without_guest_announce_is_noop() { + let interrupt = Arc::new(TestInterrupt::new()); + let net = test_net( + 1 << VIRTIO_NET_F_CTRL_VQ, + Some(interrupt.clone() as Arc), + ); + + net.post_migration_announcer().unwrap().announce(); + + assert!(!net.announce_pending.load(Ordering::Acquire)); + assert_eq!(read_status(&net) & VIRTIO_NET_S_ANNOUNCE as u16, 0); + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 0); + } + + #[test] + fn test_reset_clears_pending_announce() { + let interrupt = Arc::new(TestInterrupt::new()); + let mut net = test_net( + (1 << VIRTIO_NET_F_GUEST_ANNOUNCE) | (1 << VIRTIO_NET_F_STATUS), + Some(interrupt.clone() as Arc), + ); + + 
net.post_migration_announcer().unwrap().announce(); + assert!(net.announce_pending.load(Ordering::Acquire)); + + net.reset(); + + assert!(!net.announce_pending.load(Ordering::Acquire)); + assert_eq!(read_status(&net) & VIRTIO_NET_S_ANNOUNCE as u16, 0); + } + + #[test] + fn test_reset_invalidates_old_announcer() { + let interrupt = Arc::new(TestInterrupt::new()); + let mut net = test_net( + 1 << VIRTIO_NET_F_GUEST_ANNOUNCE, + Some(interrupt.clone() as Arc), + ); + let mut announcer = net.post_migration_announcer().unwrap(); + + announcer.announce(); + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 1); + + net.reset(); + announcer.announce(); + + assert!(!net.announce_pending.load(Ordering::Acquire)); + assert_eq!(read_status(&net) & VIRTIO_NET_S_ANNOUNCE as u16, 0); + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 1); + } + + #[test] + fn test_drop_invalidates_old_announcer() { + let interrupt = Arc::new(TestInterrupt::new()); + let mut announcer = { + let net = test_net( + 1 << VIRTIO_NET_F_GUEST_ANNOUNCE, + Some(interrupt.clone() as Arc), + ); + let mut announcer = net.post_migration_announcer().unwrap(); + + announcer.announce(); + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 1); + + announcer + }; + + announcer.announce(); + + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 1); + } +} diff --git a/vm-migration/Cargo.toml b/vm-migration/Cargo.toml index 66b4e4f6a9..b6444dbfc8 100644 --- a/vm-migration/Cargo.toml +++ b/vm-migration/Cargo.toml @@ -8,10 +8,12 @@ version = "0.1.0" [dependencies] anyhow = { workspace = true } itertools = { workspace = true } +rustls = { workspace = true } serde = { workspace = true, features = ["derive", "rc"] } serde_json = { workspace = true } thiserror = { workspace = true } vm-memory = { workspace = true, features = ["backend-atomic", "backend-mmap"] } +zerocopy = { workspace = true, features = ["derive", "std"] } [lints] workspace = true diff --git a/vm-migration/src/context.rs 
b/vm-migration/src/context.rs index 21801c0290..8e4d28c4f0 100644 --- a/vm-migration/src/context.rs +++ b/vm-migration/src/context.rs @@ -225,13 +225,13 @@ pub struct MemoryMigrationContext { /// Current iteration: 0 initial total transmission, >0 delta transmission. pub iteration: usize, /// Total bytes sent across all iterations. - total_sent_bytes: u64, + pub total_sent_bytes: u64, /// Total bytes to send in the current iteration. pub current_iteration_total_bytes: u64, /// The currently measured bandwidth. /// /// This is updated (at least) after each completed iteration. - bandwidth_bytes_per_second: f64, + pub bandwidth_bytes_per_second: f64, /// Calculated downtime in milliseconds regarding the current bandwidth and /// the remaining memory. /// diff --git a/vm-migration/src/keep_alive_stream.rs b/vm-migration/src/keep_alive_stream.rs new file mode 100644 index 0000000000..24f06390df --- /dev/null +++ b/vm-migration/src/keep_alive_stream.rs @@ -0,0 +1,351 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::io::{self, Read, Write}; +use std::os::fd::{AsFd, BorrowedFd, OwnedFd}; +use std::sync::mpsc::{Receiver, RecvTimeoutError, SyncSender, sync_channel}; +use std::thread::JoinHandle; +use std::time::Duration; +use std::{result, thread}; + +use vm_memory::bitmap::BitmapSlice; +use vm_memory::{ReadVolatile, VolatileMemoryError, VolatileSlice, WriteVolatile}; + +use crate::protocol::{Request, Response}; + +/// The `KeepAliveStream` is a stream that is intended to be used for the main +/// connection of live migrations. If the `KeepAliveStream` does not read or +/// write often enough, it will send keep alive messages on the given stream. +/// +/// The `KeepAliveStream` is designed to be compatible with the `SocketStream` +/// enum, and thus it should be really easy to use it. 
+/// +/// The `KeepAliveStream` consists of a thread (the `KeepAliveWorker`) that owns +/// the given stream, and channels to send messages to said thread, and receive +/// answers from it. +// The messages that will be sent to the `KeepAliveWorker`. +#[derive(Debug)] +enum KeepAliveStreamMessage { + // Read `len` bytes into `buf` from `stream`. + Read { len: usize, buf: Vec }, + // Write `buf[..len]` to `stream`. + Write { len: usize, buf: Vec }, + // Flush `stream`. + Flush, + // Stop listening for messages, i.e. stop the worker. + Disconnect, +} + +// The answer we will get from the `KeepAliveWorker`. +#[derive(Debug)] +enum KeepAliveStreamAnswer { + // Result of reading from `stream`. + Read(io::Result<(Vec, usize)>), + // Result of writing to `stream`. + Write(io::Result<(Vec, usize)>), + // Result of flushing `stream`. + Flush(io::Result<()>), +} + +struct KeepAliveWorker { + stream: S, + /// Is this running on the sender or receiver side? + is_sender: bool, +} + +impl KeepAliveWorker +where + S: Read + Write + AsFd, +{ + pub fn new(stream: S, is_sender: bool) -> Self { + Self { stream, is_sender } + } + + pub fn read(&mut self, mut buf: Vec, len: usize) -> io::Result<(Vec, usize)> { + if buf.len() < len { + buf.resize(len, 0); + } + + let n = Read::read(&mut self.stream, &mut buf[..len])?; + Ok((buf, n)) + } + + pub fn write(&mut self, buf: Vec, len: usize) -> io::Result<(Vec, usize)> { + debug_assert!(len <= buf.len()); + let n = Write::write(&mut self.stream, &buf[..len])?; + Ok((buf, n)) + } + + pub fn flush(&mut self) -> io::Result<()> { + Write::flush(&mut self.stream) + } +} + +pub struct KeepAliveStream { + /// The `KeepAliveWorker`. + thread: Option>, + /// Duplicated file descriptor for `AsFd`. + fd: OwnedFd, + + /// Used to send messages to the worker. + message_tx: SyncSender, + /// Used to receive answers from the worker. + answer_rx: Receiver, + /// Scratch buffer that gets moved to/from the worker for reads. 
+ read_buf: Vec, + /// Scratch buffer that gets moved to/from the worker for writes. + write_buf: Vec, +} + +impl KeepAliveStream { + pub fn new( + stream: T, + timeout: Duration, + is_sender: bool, + ) -> result::Result { + let fd = stream.as_fd().try_clone_to_owned()?; + + // We want to block on send and on recv if nobody listens. Thus we set the bound to 0. + let (message_tx, message_rx) = sync_channel::(0); + let (answer_tx, answer_rx) = sync_channel::(0); + + let thread = thread::Builder::new() + .name("migration_keep_alive_thread".to_string()) + .spawn(move || { + let mut worker = KeepAliveWorker::new(stream, is_sender); + loop { + // The idea is to always send a keep alive message when this times out. + match message_rx.recv_timeout(timeout) { + Ok(message) => match message { + KeepAliveStreamMessage::Read { len, buf } => { + if answer_tx + .send(KeepAliveStreamAnswer::Read(worker.read(buf, len))) + .is_err() + { + // We simply break the loop and thus stop the thread if anything bad happens. + // The main thread will notice next time it tries to send a message to the thread. 
+ break; + } + } + KeepAliveStreamMessage::Write { len, buf } => { + if answer_tx + .send(KeepAliveStreamAnswer::Write(worker.write(buf, len))) + .is_err() + { + break; + } + } + KeepAliveStreamMessage::Flush => { + if answer_tx + .send(KeepAliveStreamAnswer::Flush(worker.flush())) + .is_err() + { + break; + } + } + KeepAliveStreamMessage::Disconnect => break, + }, + Err(RecvTimeoutError::Timeout) => { + if worker.is_sender { + let keep_alive = Request::keep_alive(); + let _ = keep_alive.write_to(&mut worker.stream); + } else { + let keep_alive = Response::keep_alive(); + let _ = keep_alive.write_to(&mut worker.stream); + } + } + Err(RecvTimeoutError::Disconnected) => break, + } + } + })?; + + Ok(Self { + thread: Some(thread), + fd, + message_tx, + answer_rx, + read_buf: Vec::new(), + write_buf: Vec::new(), + }) + } +} + +impl Drop for KeepAliveStream { + fn drop(&mut self) { + let _ = self.message_tx.send(KeepAliveStreamMessage::Disconnect); + if let Some(handle) = self.thread.take() { + let _ = handle.join(); + } + } +} + +impl AsFd for KeepAliveStream { + fn as_fd(&self) -> BorrowedFd<'_> { + self.fd.as_fd() + } +} + +impl Read for KeepAliveStream { + fn read(&mut self, out_buf: &mut [u8]) -> io::Result { + let len = out_buf.len(); + // Move the buffer to avoid lifetime or ownership issues. + let read_buf = std::mem::take(&mut self.read_buf); + + self.message_tx + .send(KeepAliveStreamMessage::Read { len, buf: read_buf }) + .map_err(|e| { + io::Error::other(format!("Unable to send message to KeepAliveWorker: {e}")) + })?; + + match self.answer_rx.recv() { + Ok(KeepAliveStreamAnswer::Read(result)) => match result { + Ok((buf, len)) => { + self.read_buf = buf; + out_buf[..len].copy_from_slice(&self.read_buf[..len]); + Ok(len) + } + Err(e) => Err(e), + }, + Ok(a) => Err(io::Error::other(format!( + "Received unexpected answer: {a:?}. This is most likely a bug!" 
+ ))), + Err(e) => Err(io::Error::other(format!( + "Unable to receive answer from KeepAliveWorker: {e}" + ))), + } + } +} + +impl Write for KeepAliveStream { + fn write(&mut self, in_buf: &[u8]) -> io::Result { + let len = in_buf.len(); + if self.write_buf.len() < len { + self.write_buf.resize(len, 0); + } + + self.write_buf[..len].copy_from_slice(in_buf); + // Move the buffer to avoid lifetime or ownership issues. + let write_buf = std::mem::take(&mut self.write_buf); + + self.message_tx + .send(KeepAliveStreamMessage::Write { + len, + buf: write_buf, + }) + .map_err(|e| { + io::Error::other(format!("Unable to send message to KeepAliveWorker: {e}")) + })?; + + match self.answer_rx.recv() { + Ok(KeepAliveStreamAnswer::Write(result)) => match result { + Ok((buf, len)) => { + self.write_buf = buf; + Ok(len) + } + Err(e) => Err(e), + }, + Ok(a) => Err(io::Error::other(format!( + "Received unexpected answer: {a:?}. This is most likely a bug!", + ))), + Err(e) => Err(io::Error::other(format!( + "Unable to receive answer from KeepAliveWorker: {e}" + ))), + } + } + + fn flush(&mut self) -> io::Result<()> { + self.message_tx + .send(KeepAliveStreamMessage::Flush) + .map_err(|e| { + io::Error::other(format!("Unable to send message to KeepAliveWorker: {e}")) + })?; + match self.answer_rx.recv() { + Ok(KeepAliveStreamAnswer::Flush(result)) => result, + Ok(a) => Err(io::Error::other(format!( + "Received unexpected answer: {a:?}. This is most likely a bug!", + ))), + Err(e) => Err(io::Error::other(format!( + "Unable to receive answer from KeepAliveWorker: {e}" + ))), + } + } +} + +impl ReadVolatile for KeepAliveStream { + fn read_volatile( + &mut self, + vs: &mut VolatileSlice, + ) -> result::Result { + let len = vs.len(); + // Move the buffer to avoid lifetime or ownership issues. 
+ let read_buf = std::mem::take(&mut self.read_buf); + + self.message_tx + .send(KeepAliveStreamMessage::Read { len, buf: read_buf }) + .map_err(|e| { + io::Error::other(format!("Unable to send message to KeepAliveWorker: {e}")) + }) + .map_err(VolatileMemoryError::IOError)?; + + match self.answer_rx.recv() { + Ok(KeepAliveStreamAnswer::Read(result)) => match result { + Ok((buf, len)) => { + self.read_buf = buf; + vs.copy_from(&self.read_buf[..len]); + Ok(len) + } + Err(e) => Err(VolatileMemoryError::IOError(e)), + }, + Ok(a) => Err(VolatileMemoryError::IOError(io::Error::other(format!( + "Received unexpected answer: {a:?}. This is most likely a bug!", + )))), + Err(e) => Err(VolatileMemoryError::IOError(io::Error::other(format!( + "Unable to receive answer from KeepAliveWorker: {e}" + )))), + } + } +} + +impl WriteVolatile for KeepAliveStream { + fn write_volatile( + &mut self, + vs: &VolatileSlice, + ) -> result::Result { + let len = vs.len(); + if self.write_buf.len() < len { + self.write_buf.resize(len, 0); + } + + let len = vs.copy_to(&mut self.write_buf[..len]); + // Move the buffer to avoid lifetime or ownership issues. + let write_buf = std::mem::take(&mut self.write_buf); + + self.message_tx + .send(KeepAliveStreamMessage::Write { + len, + buf: write_buf, + }) + .map_err(|e| { + io::Error::other(format!("Unable to send message to KeepAliveWorker: {e}")) + }) + .map_err(VolatileMemoryError::IOError)?; + + match self.answer_rx.recv() { + Ok(KeepAliveStreamAnswer::Write(result)) => match result { + Ok((buf, len)) => { + self.write_buf = buf; + Ok(len) + } + Err(e) => Err(VolatileMemoryError::IOError(e)), + }, + Ok(a) => Err(VolatileMemoryError::IOError(io::Error::other(format!( + "Received unexpected answer: {a:?}. 
This is most likely a bug!", + )))), + Err(e) => Err(VolatileMemoryError::IOError(io::Error::other(format!( + "Unable to receive answer from KeepAliveWorker: {e}" + )))), + } + } +} diff --git a/vm-migration/src/lib.rs b/vm-migration/src/lib.rs index 125d762bff..60b1a47496 100644 --- a/vm-migration/src/lib.rs +++ b/vm-migration/src/lib.rs @@ -15,7 +15,10 @@ use crate::protocol::MemoryRangeTable; mod bitpos_iterator; mod context; +pub mod keep_alive_stream; +pub mod progress; pub mod protocol; +pub mod tls; #[derive(Error, Debug)] pub enum UffdError { @@ -48,7 +51,6 @@ pub enum UffdError { #[error("Handler failed after startup")] HandlerFailed(#[source] std::io::Error), } - #[derive(Error, Debug)] pub enum MigratableError { #[error("Failed to pause migratable component")] @@ -84,17 +86,29 @@ pub enum MigratableError { #[error("Failed to retrieve dirty ranges for migratable component")] DirtyLog(#[source] anyhow::Error), + #[error("Failed to cancel migration")] + CancelMigration(#[source] anyhow::Error), + #[error("Failed to start migration for migratable component")] StartMigration(#[source] anyhow::Error), #[error("Failed to complete migration for migratable component")] CompleteMigration(#[source] anyhow::Error), + #[error("Failed to continue the migration as it was cancelled")] + Cancelled, + #[error("Failed to release a disk lock")] UnlockError(#[source] anyhow::Error), #[error("Lifecycle operation skipped for disconnected component {0}")] DeviceDisconnected(String), + + #[error("Failed to deserialize network data")] + DeserializeError(#[source] anyhow::Error), + + #[error("Error setting up a TLS-encrypted connection")] + Tls(#[source] tls::TlsError), } /// A Pausable component can be paused and resumed. 
diff --git a/vm-migration/src/progress.rs b/vm-migration/src/progress.rs new file mode 100644 index 0000000000..8a5083068d --- /dev/null +++ b/vm-migration/src/progress.rs @@ -0,0 +1,564 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 + +//! Module for reporting status and progress of live migrations. +//! +//! The main export is [`MigrationProgress`]. +//! +//! # Motivation +//! +//! Monitoring a live-migration is important for debugging of cloud deployments, +//! for cloud monitoring in general, and for network optimization, such as +//! verifying the throughput for the migration is as high as expected. +//! +//! It also helps to analyze the downtime of VMs and see how much pressure a +//! guest is putting on its memory (by writing), which is slowing down +//! migrations. + +use std::error::Error; +use std::fmt; +use std::fmt::Display; +use std::num::NonZeroU32; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; + +#[derive( + Clone, Debug, PartialOrd, Ord, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize, +)] +pub enum TransportationMode { + Local, + Tcp { connections: NonZeroU32, tls: bool }, +} + +/// Carries information about the transmission of the VM's memory. +#[derive( + Clone, + Copy, + Debug, + Default, + PartialOrd, + Ord, + PartialEq, + Eq, + Hash, + serde::Serialize, + serde::Deserialize, +)] +pub struct MemoryTransmissionInfo { + /// The memory iteration (only in precopy mode). + pub memory_iteration: u64, + /// Memory bytes per second. + pub memory_transmission_bps: u64, + /// The total size of the VMs memory in bytes. + pub memory_bytes_total: u64, + /// The total size of transmitted bytes. + pub memory_bytes_transmitted: u64, + /// The amount of remaining bytes for this iteration. + pub memory_bytes_remaining_iteration: u64, + /// The amount of transmitted 4k pages. + pub memory_pages_4k_transmitted: u64, + /// The amount of remaining 4k pages for this iteration. 
+ pub memory_pages_4k_remaining_iteration: u64, + /// The amount of constant pages for that we could take a shortcut. + /// Pages where all bits are either zero or one. + pub memory_pages_constant_count: u64, + /// Current memory dirty rate in pages per seconds (pps). + pub memory_dirty_rate_pps: u64, +} + +/// The different phases of an ongoing ([`MigrationState::Ongoing`]) migration +/// (good case). +/// +/// The states correspond to the [live-migration protocol]. +/// +/// [live-migration protocol]: super::protocol +#[derive( + Clone, Debug, PartialOrd, Ord, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize, +)] +pub enum MigrationStateOngoingPhase { + /// The migration starts. Handshake and transfer of VM config. + Starting, + /// Transfer of memory FDs. + /// + /// Only used for local migrations. + MemoryFds, + /// Transfer of VM memory in precopy mode. + /// + /// Not used for local migrations. + MemoryPrecopy, + // TODO eventually add MemoryPostcopy here + /// The VM migration is completing. This means the last chunks of memory + /// are transmitted as well as the final VM state (vCPUs, devices). + Completing, +} + +impl Display for MigrationStateOngoingPhase { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Starting => write!(f, "starting"), + Self::MemoryFds => write!(f, "memory FDs"), + Self::MemoryPrecopy => write!(f, "memory (precopy)"), + Self::Completing => write!(f, "completing"), + } + } +} + +/// The different states of a migration, covering steady progress and failure. +#[derive( + Clone, Debug, PartialOrd, Ord, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize, +)] +pub enum MigrationState { + /// The migration has been cancelled. + Cancelled {}, + /// The migration has failed. + Failed { + /// Stringified error. + error_msg: String, + /// Debug-stringified error. 
+ error_msg_debug: String, + // TODO this is very tricky because I need clone() + // error: Box, + }, + /// The migration has finished successfully. + Finished {}, + /// The migration is ongoing. + Ongoing { + phase: MigrationStateOngoingPhase, + /// Percent in range `0..=100`. + vcpu_throttle_percent: u8, + }, +} + +impl Display for MigrationState { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + MigrationState::Cancelled { .. } => write!(f, "{}", self.state_name()), + MigrationState::Failed { error_msg, .. } => { + write!(f, "{}: {error_msg}", self.state_name()) + } + MigrationState::Finished { .. } => write!(f, "{}", self.state_name()), + MigrationState::Ongoing { + phase, + vcpu_throttle_percent, + } => write!( + f, + "{}: phase={phase}, vcpu_throttle={vcpu_throttle_percent}", + self.state_name() + ), + } + } +} + +impl MigrationState { + fn state_name(&self) -> &'static str { + match self { + MigrationState::Cancelled { .. } => "cancelled", + MigrationState::Failed { .. } => "failed", + MigrationState::Finished { .. } => "finished", + MigrationState::Ongoing { .. } => "ongoing", + } + } +} + +/// Returns the current UNIX timestamp in ms. +fn current_unix_timestamp_ms() -> u64 { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("should be valid duration") + .as_millis() as u64 +} + +/// Holds a snapshot of progress and status information for an ongoing live +/// migration, or the last snapshot of a canceled or aborted migration. +/// +/// This type carries insightful information for every step of the +/// [live-migration protocol] in a way that makes it easy for API users to +/// parse the data with ease while retaining all important information. +/// +/// [live-migration protocol]: super::protocol +#[derive( + Clone, Debug, PartialOrd, Ord, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize, +)] +pub struct MigrationProgress { + /// UNIX timestamp of the start of the live-migration process in ms. 
+ pub timestamp_begin_ms: u64, + /// UNIX timestamp of the current snapshot in ms. + pub timestamp_snapshot_ms: u64, + /// Relative timestamp since the beginning of the migration in ms. + pub timestamp_snapshot_relative_ms: u64, + /// Configured target downtime. + pub downtime_configured_ms: u64, + /// Currently estimated (computed) downtime given the remaining + /// transmissions and the bandwidth. + /// + /// If this is `0`, the downtime could not yet be calculated. + pub downtime_estimated_ms: u64, + /// Requested transportation mode. + pub transportation_mode: TransportationMode, + /// Snapshot of the current phase. + pub state: MigrationState, + /// Latest [`MemoryTransmissionInfo`] info, if any. + /// + /// The most interesting phase is when current state is + /// [`MigrationState::Ongoing`] and [`MigrationStateOngoingPhase::MemoryPrecopy`] + /// as this value will be updated frequently. + pub memory_transmission_info: MemoryTransmissionInfo, +} + +impl MigrationProgress { + /// Creates new progress in a valid init state. + /// + /// This progress must be updated using any of: + /// - [`Self::update`] + /// - [`Self::mark_as_finished`] + /// - [`Self::mark_as_failed`] + /// - [`Self::mark_as_cancelled`] + pub fn new(transportation_mode: TransportationMode, target_downtime: Duration) -> Self { + let timestamp = current_unix_timestamp_ms(); + Self { + timestamp_begin_ms: timestamp, + timestamp_snapshot_ms: timestamp, + timestamp_snapshot_relative_ms: 0, + downtime_configured_ms: target_downtime.as_millis() as u64, + downtime_estimated_ms: 0, + transportation_mode, + state: MigrationState::Ongoing { + phase: MigrationStateOngoingPhase::Starting, + vcpu_throttle_percent: 0, + }, + memory_transmission_info: MemoryTransmissionInfo::default(), + } + } + + /// Updates the state of an ongoing migration. + /// + /// Only updates new values that are provided via `Some`. + /// + /// # Arguments + /// + /// - `new_phase`: The current [`MigrationStateOngoingPhase`]. 
+ /// - `new_memory_transmission_info`: If `Some`, the current [`MemoryTransmissionInfo`]. + /// - `new_cpu_throttle_percent`: If `Some`, the current value of the vCPU throttle percentage. + /// Must be in range `0..=100`. + /// - `new_estimated_downtime`: If `Some`, the latest expected (calculated) downtime. + pub fn update( + &mut self, + new_phase: MigrationStateOngoingPhase, + new_memory_transmission_info: Option, + new_cpu_throttle_percent: Option, + new_estimated_downtime: Option, + ) { + if let Some(percent) = new_cpu_throttle_percent { + assert!(percent <= 100); + } + + if let Some(downtime) = new_estimated_downtime { + self.downtime_estimated_ms = u64::try_from(downtime.as_millis()).unwrap(); + } else { + // This is better than showing `0` and it is likely close to the final actual downtime. + self.downtime_estimated_ms = self.downtime_configured_ms; + } + + match &self.state { + MigrationState::Ongoing { + phase: _old_phase, + vcpu_throttle_percent: old_vcpu_throttle_percent, + } => { + self.timestamp_snapshot_ms = current_unix_timestamp_ms(); + self.timestamp_snapshot_relative_ms = + self.timestamp_snapshot_ms - self.timestamp_begin_ms; + + self.memory_transmission_info = + new_memory_transmission_info.unwrap_or(self.memory_transmission_info); + self.state = MigrationState::Ongoing { + phase: new_phase, + vcpu_throttle_percent: new_cpu_throttle_percent + .unwrap_or(*old_vcpu_throttle_percent), + }; + } + illegal => { + // panic is fine as we have a logic error here, nothing that was caused by a user. + panic!( + "illegal state transition: {} -> ongoing", + illegal.state_name(), + ); + } + } + } + + /// Sets the underlying state to [`MigrationState::Cancelled`] and + /// updates all corresponding metadata. + /// + /// After this state change, the object is supposed to be handled as immutable. + /// + /// # Panics + /// + /// If the current state is not [`MigrationState::Ongoing`], this function panics. 
+ pub fn mark_as_cancelled(&mut self) { + if !matches!(self.state, MigrationState::Ongoing { .. }) { + panic!( + "illegal state transition: {} -> cancelled", + self.state.state_name() + ); + } + self.timestamp_snapshot_ms = current_unix_timestamp_ms(); + self.timestamp_snapshot_relative_ms = self.timestamp_snapshot_ms - self.timestamp_begin_ms; + self.state = MigrationState::Cancelled {}; + } + + /// Sets the underlying state to [`MigrationState::Failed`] and + /// updates all corresponding metadata. + /// + /// After this state change, the object is supposed to be handled as immutable. + /// + /// # Panics + /// + /// If the current state is not [`MigrationState::Ongoing`], this function panics. + pub fn mark_as_failed(&mut self, error: &dyn Error) { + if !matches!(self.state, MigrationState::Ongoing { .. }) { + panic!( + "illegal state transition: {} -> failed", + self.state.state_name() + ); + } + self.timestamp_snapshot_ms = current_unix_timestamp_ms(); + self.timestamp_snapshot_relative_ms = self.timestamp_snapshot_ms - self.timestamp_begin_ms; + self.state = MigrationState::Failed { + error_msg: format!("{error}",), + error_msg_debug: format!("{error:?}",), + }; + } + + /// Sets the underlying state to [`MigrationState::Finished`] and + /// updates all corresponding metadata. + /// + /// After this state change, the object is supposed to be handled as immutable. + /// + /// # Panics + /// + /// If the current state is not [`MigrationState::Ongoing`], this function panics. + pub fn mark_as_finished(&mut self) { + if !matches!(self.state, MigrationState::Ongoing { .. 
}) { + panic!( + "illegal state transition: {} -> finished", + self.state.state_name() + ); + } + self.timestamp_snapshot_ms = current_unix_timestamp_ms(); + self.timestamp_snapshot_relative_ms = self.timestamp_snapshot_ms - self.timestamp_begin_ms; + self.state = MigrationState::Finished {}; + } +} + +#[cfg(test)] +mod unit_tests { + use std::thread; + + use super::*; + + fn tcp_mode() -> TransportationMode { + TransportationMode::Tcp { + connections: NonZeroU32::new(2).unwrap(), + tls: true, + } + } + + #[test] + fn new_initializes_valid_state() { + let target = Duration::from_millis(150); + let progress = MigrationProgress::new(tcp_mode(), target); + + assert_eq!(progress.timestamp_snapshot_ms, progress.timestamp_begin_ms); + assert_eq!(progress.timestamp_snapshot_relative_ms, 0); + assert_eq!(progress.downtime_configured_ms, 150); + assert_eq!(progress.downtime_estimated_ms, 0); + + match progress.state { + MigrationState::Ongoing { + phase, + vcpu_throttle_percent, + } => { + assert_eq!(phase, MigrationStateOngoingPhase::Starting); + assert_eq!(vcpu_throttle_percent, 0); + } + _ => panic!("expected Ongoing state"), + } + + assert_eq!( + progress.memory_transmission_info, + MemoryTransmissionInfo::default() + ); + } + + #[test] + fn update_changes_phase_and_preserves_previous_values() { + let mut progress = + MigrationProgress::new(TransportationMode::Local, Duration::from_millis(200)); + + let initial_timestamp = progress.timestamp_snapshot_ms; + + thread::sleep(Duration::from_millis(1)); + + progress.update(MigrationStateOngoingPhase::MemoryPrecopy, None, None, None); + + match progress.state { + MigrationState::Ongoing { + phase, + vcpu_throttle_percent, + } => { + assert_eq!(phase, MigrationStateOngoingPhase::MemoryPrecopy); + assert_eq!(vcpu_throttle_percent, 0); // unchanged + } + _ => panic!("expected Ongoing"), + } + + assert!(progress.timestamp_snapshot_ms >= initial_timestamp); + assert!(progress.timestamp_snapshot_relative_ms > 0); + + // If no 
estimated downtime provided, fallback to configured value + assert_eq!( + progress.downtime_estimated_ms, + progress.downtime_configured_ms + ); + } + + #[test] + fn update_replaces_memory_info_and_throttle() { + let mut progress = + MigrationProgress::new(TransportationMode::Local, Duration::from_millis(100)); + + let mem = MemoryTransmissionInfo { + memory_iteration: 3, + memory_transmission_bps: 10_000, + memory_bytes_total: 1_000_000, + memory_bytes_transmitted: 400_000, + memory_bytes_remaining_iteration: 100_000, + memory_pages_4k_transmitted: 100, + memory_pages_4k_remaining_iteration: 25, + memory_pages_constant_count: 10, + memory_dirty_rate_pps: 500, + }; + + progress.update( + MigrationStateOngoingPhase::MemoryPrecopy, + Some(mem), + Some(42), + Some(Duration::from_millis(55)), + ); + + assert_eq!(progress.memory_transmission_info, mem); + assert_eq!(progress.downtime_estimated_ms, 55); + + match progress.state { + MigrationState::Ongoing { + phase, + vcpu_throttle_percent, + } => { + assert_eq!(phase, MigrationStateOngoingPhase::MemoryPrecopy); + assert_eq!(vcpu_throttle_percent, 42); + } + _ => panic!("expected Ongoing"), + } + } + + #[test] + #[should_panic] + fn update_panics_if_not_ongoing() { + let mut progress = + MigrationProgress::new(TransportationMode::Local, Duration::from_millis(10)); + progress.mark_as_finished(); + + progress.update(MigrationStateOngoingPhase::Completing, None, None, None); + } + + #[test] + #[should_panic] + fn throttle_above_100_panics() { + let mut progress = + MigrationProgress::new(TransportationMode::Local, Duration::from_millis(10)); + + progress.update( + MigrationStateOngoingPhase::MemoryPrecopy, + None, + Some(101), + None, + ); + } + + #[test] + fn mark_as_finished_transitions_state() { + let mut progress = + MigrationProgress::new(TransportationMode::Local, Duration::from_millis(10)); + + thread::sleep(Duration::from_millis(1)); + progress.mark_as_finished(); + + match progress.state { + 
MigrationState::Finished {} => {} + _ => panic!("expected Finished"), + } + + assert!(progress.timestamp_snapshot_relative_ms > 0); + } + + #[test] + #[should_panic] + fn mark_as_finished_twice_panics() { + let mut progress = + MigrationProgress::new(TransportationMode::Local, Duration::from_millis(10)); + + progress.mark_as_finished(); + progress.mark_as_finished(); + } + + #[test] + fn mark_as_failed_sets_error_strings() { + #[derive(Debug)] + struct TestError; + + impl fmt::Display for TestError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "test error") + } + } + + impl Error for TestError {} + + let mut progress = + MigrationProgress::new(TransportationMode::Local, Duration::from_millis(10)); + + progress.mark_as_failed(&TestError); + + match &progress.state { + MigrationState::Failed { + error_msg, + error_msg_debug, + } => { + assert_eq!(error_msg, "test error"); + assert!(error_msg_debug.contains("TestError")); + } + _ => panic!("expected Failed"), + } + } + + #[test] + fn display_formats_are_stable() { + let mut progress = + MigrationProgress::new(TransportationMode::Local, Duration::from_millis(10)); + + progress.update( + MigrationStateOngoingPhase::MemoryPrecopy, + None, + Some(12), + None, + ); + + let s = format!("{}", progress.state); + assert!(s.contains("ongoing")); + assert!(s.contains("phase=memory (precopy)")); + assert!(s.contains("vcpu_throttle=12")); + + progress.mark_as_cancelled(); + assert_eq!(format!("{}", progress.state), "cancelled"); + } +} diff --git a/vm-migration/src/protocol.rs b/vm-migration/src/protocol.rs index 997ac8d815..bc3633fac4 100644 --- a/vm-migration/src/protocol.rs +++ b/vm-migration/src/protocol.rs @@ -76,9 +76,10 @@ use std::io::{Read, Write}; +use anyhow::anyhow; use itertools::Itertools; use serde::{Deserialize, Serialize}; -use vm_memory::ByteValued; +use zerocopy::{Immutable, IntoBytes, KnownLayout, TryFromBytes}; use crate::MigratableError; use 
crate::bitpos_iterator::BitposIteratorExt; @@ -108,35 +109,39 @@ use crate::bitpos_iterator::BitposIteratorExt; /// /// [live-migration protocol]: super::protocol #[repr(u16)] -#[derive(Debug, Copy, Clone, Default, PartialEq, Eq)] +#[derive( + Debug, Copy, Clone, Default, PartialEq, Eq, Immutable, IntoBytes, KnownLayout, TryFromBytes, +)] pub enum Command { #[default] - Invalid, - Start, - Config, - State, - Memory, + Invalid = 0, + Start = 1, + Config = 2, + State = 3, + Memory = 4, /// Finalizes the migration and resumes the VM on the destination. /// Sent when the source VM was running at migration time. - Complete, - Abandon, - MemoryFd, + Complete = 5, + Abandon = 6, + MemoryFd = 7, /// Finalizes the migration without resuming the VM on the destination. /// Sent when the source VM was paused at migration time. - CompletePaused, + CompletePaused = 9, + // We introduced this with discriminant eight but in the meantime, + // upstream introduced a new command with discriminant 8. For + // migration-compatibility we stick to this temporarily, until we have + // a solution for the discriminant collision. 
+ KeepAlive = 8, } #[repr(C)] -#[derive(Default, Copy, Clone)] +#[derive(Default, Copy, Clone, Immutable, IntoBytes, KnownLayout, TryFromBytes)] pub struct Request { command: Command, padding: [u8; 6], length: u64, // Length of payload for command excluding the Request struct } -// SAFETY: Request contains a series of integers with no implicit padding -unsafe impl ByteValued for Request {} - impl Request { pub fn new(command: Command, length: u64) -> Self { Self { @@ -180,6 +185,10 @@ impl Request { Self::new(Command::Abandon, 0) } + pub fn keep_alive() -> Self { + Self::new(Command::KeepAlive, 0) + } + pub fn command(&self) -> Command { self.command } @@ -189,39 +198,56 @@ impl Request { } pub fn read_from(fd: &mut dyn Read) -> Result { - let mut request = Request::default(); - fd.read_exact(Self::as_mut_slice(&mut request)) - .map_err(MigratableError::MigrateSocket)?; + /// A byte buffer that matches `Self` in size and alignment to allow deserializing `Self` into. + #[repr(C, align(8))] + struct RequestBuffer([u8; const { size_of::() }]); + const _: () = const { + // Check that the alignment of the buffer matches `Self`. + assert!(align_of::() == align_of::()); + }; + let mut buffer = RequestBuffer([0; size_of::()]); + let RequestBuffer(request) = &mut buffer; - Ok(request) + loop { + fd.read_exact(request) + .map_err(MigratableError::MigrateSocket)?; + + let request = Self::try_mut_from_bytes(request) + .map_err(|error| MigratableError::DeserializeError(anyhow!("{error:?}")))?; + + // If we read a keep alive message, we throw it away and keep reading. 
+ if request.command() == Command::KeepAlive { + *request = Request::default(); + continue; + } + return Ok(*request); + } } pub fn write_to(&self, fd: &mut dyn Write) -> Result<(), MigratableError> { - fd.write_all(Self::as_slice(self)) + fd.write_all(self.as_bytes()) .map_err(MigratableError::MigrateSocket) } } #[repr(u16)] -#[derive(Copy, Clone, PartialEq, Eq, Default)] +#[derive(Copy, Clone, PartialEq, Eq, Default, Immutable, IntoBytes, KnownLayout, TryFromBytes)] pub enum Status { #[default] Invalid, Ok, Error, + KeepAlive, } #[repr(C)] -#[derive(Default, Copy, Clone)] +#[derive(Default, Copy, Clone, Immutable, IntoBytes, KnownLayout, TryFromBytes)] pub struct Response { status: Status, padding: [u8; 6], length: u64, // Length of payload for command excluding the Response struct } -// SAFETY: Response contains a series of integers with no implicit padding -unsafe impl ByteValued for Response {} - impl Response { pub fn new(status: Status, length: u64) -> Self { Self { @@ -239,6 +265,10 @@ impl Response { Self::new(Status::Error, 0) } + pub fn keep_alive() -> Self { + Self::new(Status::KeepAlive, 0) + } + pub fn status(&self) -> Status { self.status } @@ -248,31 +278,42 @@ impl Response { } pub fn read_from(fd: &mut dyn Read) -> Result { - let mut response = Response::default(); - fd.read_exact(Self::as_mut_slice(&mut response)) - .map_err(MigratableError::MigrateSocket)?; + /// A byte buffer that matches `Self` in size and alignment to allow deserializing `Self` into. + #[repr(C, align(8))] + struct ResponseBuffer([u8; const { size_of::() }]); + const _: () = const { + // Check that the alignment of the buffer matches `Self`. 
+ assert!(align_of::() == align_of::()); + }; + let mut buffer = ResponseBuffer([0; size_of::()]); + let ResponseBuffer(response) = &mut buffer; - Ok(response) + loop { + fd.read_exact(response) + .map_err(MigratableError::MigrateSocket)?; + + let response = Self::try_mut_from_bytes(response) + .map_err(|error| MigratableError::DeserializeError(anyhow!("{error:?}")))?; + + // If we read a keep alive message, we throw it away and keep reading. + if response.status() == Status::KeepAlive { + *response = Response::default(); + continue; + } + return Ok(*response); + } } - pub fn ok_or_abandon( - self, - fd: &mut T, - error: MigratableError, - ) -> Result - where - T: Read + Write, - { + /// Return the response if its status is `Ok`; return the caller-provided error for any other status. + pub fn ok_or_error(self, error: MigratableError) -> Result { if self.status != Status::Ok { - Request::abandon().write_to(fd)?; - Response::read_from(fd)?; return Err(error); } Ok(self) } pub fn write_to(&self, fd: &mut dyn Write) -> Result<(), MigratableError> { - fd.write_all(Self::as_slice(self)) + fd.write_all(self.as_bytes()) .map_err(MigratableError::MigrateSocket) } } diff --git a/vm-migration/src/tls.rs b/vm-migration/src/tls.rs new file mode 100644 index 0000000000..f9377728b0 --- /dev/null +++ b/vm-migration/src/tls.rs @@ -0,0 +1,330 @@ +// Copyright © 2026 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +//! TLS support for migration streams over TCP. +//! +//! This module wraps `rustls` to provide a blocking [`TlsStream`] for migration +//! traffic. +//! +//! [`TlsStream`] implements [`Read`], [`Write`], [`ReadVolatile`], +//! [`WriteVolatile`], and [`AsFd`] so it can be used by the transport layer like +//! other migration streams. All data must pass through rustls; direct I/O on the +//! underlying socket would bypass TLS processing and break the connection. 
+ +use std::io::{self, Read, Write}; +use std::net::TcpStream; +use std::os::fd::{AsFd, BorrowedFd}; +use std::path::Path; +use std::result; +use std::sync::Arc; + +use rustls::pki_types::pem::PemObject; +use rustls::pki_types::{CertificateDer, InvalidDnsNameError, PrivateKeyDer, ServerName}; +use rustls::server::VerifierBuilderError; +use rustls::{ + ClientConfig, ClientConnection, RootCertStore, ServerConfig, ServerConnection, StreamOwned, +}; +use thiserror::Error; +use vm_memory::bitmap::BitmapSlice; +use vm_memory::{ReadVolatile, VolatileMemoryError, VolatileSlice, WriteVolatile}; + +use crate::MigratableError; + +/// Errors that can occur when establishing a TLS-encrypted migration channel. +#[derive(Error, Debug)] +pub enum TlsError { + #[error("The provided hostname could not be parsed")] + InvalidDnsName(#[source] InvalidDnsNameError), + + #[error("Rustls protocol error")] + RustlsError(#[from] rustls::Error), + + #[error("Rustls verifier configuration error")] + RustlsVerifierBuilderError(#[source] VerifierBuilderError), + + #[error("Rustls protocol IO error")] + RustlsIoError(#[from] std::io::Error), + + #[error("TLS handshake stalled: no read/write progress while handshake is still in progress")] + HandshakeError, + + #[error("Error handling PEM file")] + RustlsPemError(#[from] rustls::pki_types::pem::Error), +} + +/// Wraps the concrete rustls stream for either side (server or client) of the +/// TLS connection. +/// +/// [`TlsStream`] uses this enum to store a [`StreamOwned`] with either a +/// [`ClientConnection`] or [`ServerConnection`] while exposing a single +/// transport-agnostic API. +#[derive(Debug)] +enum TlsStreamParticipant { + Client(StreamOwned), + Server(StreamOwned), +} + +/// Server/Client-agnostic TLS stream. +pub struct TlsStream { + stream: TlsStreamParticipant, + // We have to implement [`ReadVolatile`] and [`WriteVolatile`] for + // [`TlsStream`]. 
We use this buffer to avoid allocating a new buffer for + // every volatile read or write. + buf: Vec, +} + +impl TlsStream { + /// The maximum size of [`TlsStream::buf`]. This keeps the reusable buffer + /// from growing without bound. + const BUF_SIZE: usize = 64 /* KiB */ << 10; + + /// Creates a client [`TlsStream`]. + /// + /// The client verifies the server certificate against `ca-cert.pem` and the + /// provided `hostname`, and presents the certificate chain in + /// `client-cert.pem` together with the private key in `client-key.pem` for + /// mutual TLS authentication. + pub fn new_client( + socket: TcpStream, + cert_dir: &Path, + hostname: &str, + ) -> result::Result { + let root_store = load_root_store(&cert_dir.join("ca-cert.pem"))?; + + let config = ClientConfig::builder() + .with_root_certificates(root_store) + .with_no_client_auth(); + let config = Arc::new(config); + + let server_name = ServerName::try_from(hostname.to_string()) + .map_err(TlsError::InvalidDnsName) + .map_err(MigratableError::Tls)?; + let conn = ClientConnection::new(config.clone(), server_name.clone()) + .map_err(TlsError::RustlsError) + .map_err(MigratableError::Tls)?; + + let mut tls = StreamOwned::new(conn, socket); + while tls.conn.is_handshaking() { + let (rd, wr) = tls + .conn + .complete_io(&mut tls.sock) + .map_err(TlsError::RustlsIoError) + .map_err(MigratableError::Tls)?; + // No handshake progress on a connection that should be handshaking, we treat + // that as a failure. + if rd == 0 && wr == 0 { + Err(MigratableError::Tls(TlsError::HandshakeError))?; + } + } + + Ok(Self { + stream: TlsStreamParticipant::Client(tls), + buf: Vec::new(), + }) + } + + /// Creates a server [`TlsStream`]. Encrypts and decrypts data sent through + /// this stream using the certificates and key from the provided + /// [`TlsServerConfig`]. 
+ pub fn new_server( + socket: TcpStream, + config: &TlsServerConfig, + ) -> result::Result { + let conn = ServerConnection::new(config.config.clone()) + .map_err(TlsError::RustlsError) + .map_err(MigratableError::Tls)?; + + let mut tls = StreamOwned::new(conn, socket); + while tls.conn.is_handshaking() { + let (rd, wr) = tls + .conn + .complete_io(&mut tls.sock) + .map_err(TlsError::RustlsIoError) + .map_err(MigratableError::Tls)?; + // No handshake progress on a connection that should be handshaking, we treat + // that as a failure. + if rd == 0 && wr == 0 { + Err(MigratableError::Tls(TlsError::HandshakeError))?; + } + } + + Ok(Self { + stream: TlsStreamParticipant::Server(tls), + buf: Vec::new(), + }) + } +} + +impl Read for TlsStream { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + match &mut self.stream { + TlsStreamParticipant::Client(s) => Read::read(s, buf), + TlsStreamParticipant::Server(s) => Read::read(s, buf), + } + } +} + +impl Write for TlsStream { + fn write(&mut self, buf: &[u8]) -> io::Result { + match &mut self.stream { + TlsStreamParticipant::Client(s) => Write::write(s, buf), + TlsStreamParticipant::Server(s) => Write::write(s, buf), + } + } + + fn flush(&mut self) -> io::Result<()> { + match &mut self.stream { + TlsStreamParticipant::Client(s) => Write::flush(s), + TlsStreamParticipant::Server(s) => Write::flush(s), + } + } +} + +// Reading from or writing to these FDs would break the connection, because +// those reads or writes wouldn't go through rustls. But the FD is necessary to +// listen for incoming connections. 
+impl AsFd for TlsStream { + fn as_fd(&self) -> BorrowedFd<'_> { + match &self.stream { + TlsStreamParticipant::Client(s) => s.get_ref().as_fd(), + TlsStreamParticipant::Server(s) => s.get_ref().as_fd(), + } + } +} + +impl ReadVolatile for TlsStream { + fn read_volatile( + &mut self, + vs: &mut VolatileSlice, + ) -> result::Result { + let len = vs.len().min(Self::BUF_SIZE); + + if len == 0 { + return Ok(0); + } + + if self.buf.len() < len { + self.buf.resize(len, 0); + } + + let n = { + let (stream, buf) = (&mut self.stream, &mut self.buf[..len]); + + match stream { + TlsStreamParticipant::Client(s) => Read::read(s, buf), + TlsStreamParticipant::Server(s) => Read::read(s, buf), + } + .map_err(VolatileMemoryError::IOError)? + }; + + if n == 0 { + return Ok(0); + } + + vs.copy_from(&self.buf[..n]); + self.buf.clear(); + Ok(n) + } +} + +impl WriteVolatile for TlsStream { + fn write_volatile( + &mut self, + vs: &VolatileSlice, + ) -> Result { + let len = vs.len().min(Self::BUF_SIZE); + + if len == 0 { + return Ok(0); + } + + if self.buf.len() < len { + self.buf.resize(len, 0); + } + + let buf = &mut self.buf[..len]; + let n = vs.copy_to(&mut buf[..len]); + + if n == 0 { + return Ok(0); + } + + let n = { + let stream = &mut self.stream; + + match stream { + TlsStreamParticipant::Client(s) => Write::write(s, buf), + TlsStreamParticipant::Server(s) => Write::write(s, buf), + } + .map_err(VolatileMemoryError::IOError)? + }; + + self.buf.clear(); + Ok(n) + } +} + +/// Carries a server-TLS-config. Intended to be turned into a [`TlsStream`] +/// when paired with a [`TcpStream`]. +#[derive(Debug, Clone)] +pub struct TlsServerConfig { + /// This config is shared between all server connections. + config: Arc, +} + +impl TlsServerConfig { + /// Creates a [`TlsServerConfig`] from the certificate chain in + /// `server-cert.pem`, the private key in `server-key.pem`, and the client + /// trust anchors in `ca-cert.pem`. 
+ /// + /// Client certificates presented during the TLS handshake must chain to a CA in + /// `ca-cert.pem`. + pub fn new(cert_dir: &Path) -> result::Result { + let server_certs = load_cert_chain(&cert_dir.join("server-cert.pem"))?; + let server_key = load_private_key(&cert_dir.join("server-key.pem"))?; + + let config = ServerConfig::builder() + .with_no_client_auth() + .with_single_cert(server_certs, server_key) + .map_err(TlsError::RustlsError) + .map_err(MigratableError::Tls)?; + let config = Arc::new(config); + Ok(Self { config }) + } +} + +/// Loads trusted CA certificates into a root store, i.e. the set of trust anchors +/// used to verify the peer's certificate chain. +fn load_root_store(cert_path: &Path) -> result::Result { + let mut root_store = RootCertStore::empty(); + root_store.add_parsable_certificates( + CertificateDer::pem_file_iter(cert_path) + .map_err(TlsError::RustlsPemError) + .map_err(MigratableError::Tls)? + .map(|cert| cert.map_err(TlsError::RustlsPemError)) + .collect::>, TlsError>>() + .map_err(MigratableError::Tls)?, + ); + Ok(root_store) +} + +/// Loads a certificate chain to present during the TLS handshake. +fn load_cert_chain( + cert_path: &Path, +) -> result::Result>, MigratableError> { + CertificateDer::pem_file_iter(cert_path) + .map_err(TlsError::RustlsPemError) + .map_err(MigratableError::Tls)? + .map(|cert| cert.map_err(TlsError::RustlsPemError)) + .collect::>, TlsError>>() + .map_err(MigratableError::Tls) +} + +/// Loads the private key that proves ownership of the presented certificate chain. 
+fn load_private_key(key_path: &Path) -> result::Result, MigratableError> { + PrivateKeyDer::from_pem_file(key_path) + .map_err(TlsError::RustlsPemError) + .map_err(MigratableError::Tls) +} diff --git a/vmm/Cargo.toml b/vmm/Cargo.toml index 7a38fa5006..865ce00449 100644 --- a/vmm/Cargo.toml +++ b/vmm/Cargo.toml @@ -63,10 +63,13 @@ hypervisor = { path = "../hypervisor" } igvm = { workspace = true, optional = true } igvm_defs = { workspace = true, optional = true } iommufd-ioctls = { workspace = true, optional = true } +kvm-bindings = { workspace = true } landlock = "0.4.4" libc = { workspace = true } linux-loader = { workspace = true, features = ["bzimage", "elf", "pe"] } log = { workspace = true } +# Special fork of micro_http that combines HTTP traffic over a UNIX domain +# socket with UNIX' SCM_RIGHTS mechanism for transferring file descriptors. micro_http = { git = "https://github.com/firecracker-microvm/micro-http", branch = "main" } mshv-bindings = { workspace = true, features = [ "fam-wrappers", diff --git a/vmm/src/api/dbus/mod.rs b/vmm/src/api/dbus/mod.rs index ae39feb7d7..94d007cad7 100644 --- a/vmm/src/api/dbus/mod.rs +++ b/vmm/src/api/dbus/mod.rs @@ -24,8 +24,9 @@ use crate::api::VmCoredump; use crate::api::{ AddDisk, Body, VmAddDevice, VmAddFs, VmAddGenericVhostUser, VmAddNet, VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, VmCounters, VmCreate, VmDelete, VmInfo, - VmPause, VmPowerButton, VmReboot, VmReceiveMigration, VmRemoveDevice, VmResize, VmResizeZone, - VmRestore, VmResume, VmSendMigration, VmShutdown, VmSnapshot, VmmPing, VmmShutdown, + VmPause, VmPostMigrationAnnounce, VmPowerButton, VmReboot, VmReceiveMigration, VmRemoveDevice, + VmResize, VmResizeZone, VmRestore, VmResume, VmSendMigration, VmShutdown, VmSnapshot, VmmPing, + VmmShutdown, }; use crate::seccomp_filters::{Thread, get_seccomp_filter}; use crate::{Error as VmmError, NetConfig, Result as VmmResult, VmConfig}; @@ -250,6 +251,12 @@ impl DBusApi { self.vm_action(&VmPause, 
()).await.map(|_| ()) } + async fn vm_post_migration_announce(&self) -> Result<()> { + self.vm_action(&VmPostMigrationAnnounce, ()) + .await + .map(|_| ()) + } + async fn vm_power_button(&self) -> Result<()> { self.vm_action(&VmPowerButton, ()).await.map(|_| ()) } diff --git a/vmm/src/api/http/http_endpoint.rs b/vmm/src/api/http/http_endpoint.rs index 92b53ac68e..57aa6c4469 100644 --- a/vmm/src/api/http/http_endpoint.rs +++ b/vmm/src/api/http/http_endpoint.rs @@ -6,11 +6,11 @@ //! # HTTP Endpoints of the Cloud Hypervisor API //! -//! ## Special Handling for Devices Backed by Network File Descriptors (FDs) (e.g., virtio-net) +//! ## Special Handling for Externally Provided File Descriptors (FDs) (e.g., virtio-net) //! //! Some of the HTTP handlers here implement special logic for devices -//! **backed by network FDs** to enable live-migration, state save/resume -//! (restore), and similar VM lifecycle events. +//! **backed by externally opened FDs** to enable live-migration, +//! state save/resume (restore), and similar VM lifecycle events. //! //! The utilized mechanism requires that the control software (e.g., libvirt) //! 
connects to Cloud Hypervisor by using a UNIX domain socket and that it @@ -37,6 +37,7 @@ use std::fs::File; use std::sync::mpsc::Sender; +use log::info; use micro_http::{Body, Method, Request, Response, StatusCode, Version}; use vmm_sys_util::eventfd::EventFd; @@ -47,7 +48,8 @@ use crate::api::http::{EndpointHandler, HttpError, error_response}; use crate::api::{ AddDisk, ApiAction, ApiError, ApiRequest, NetConfig, VmAddDevice, VmAddFs, VmAddGenericVhostUser, VmAddNet, VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, - VmConfig, VmCounters, VmDelete, VmNmi, VmPause, VmPowerButton, VmReboot, VmReceiveMigration, + VmCancelMigration, VmConfig, VmCounters, VmDelete, VmMigrationProgress, VmNmi, VmPause, + VmPostMigrationAnnounce, VmPowerButton, VmReboot, VmReceiveMigration, VmReceiveMigrationData, VmRemoveDevice, VmResize, VmResizeDisk, VmResizeZone, VmRestore, VmResume, VmSendMigration, VmShutdown, VmSnapshot, }; @@ -414,8 +416,10 @@ vm_action_put_handler!(VmShutdown); vm_action_put_handler!(VmReboot); vm_action_put_handler!(VmPause); vm_action_put_handler!(VmResume); +vm_action_put_handler!(VmPostMigrationAnnounce); vm_action_put_handler!(VmPowerButton); vm_action_put_handler!(VmNmi); +vm_action_put_handler!(VmCancelMigration); vm_action_put_handler_body!(VmAddDevice); vm_action_put_handler_body!(AddDisk); @@ -429,13 +433,11 @@ vm_action_put_handler_body!(VmRemoveDevice); vm_action_put_handler_body!(VmResizeDisk); vm_action_put_handler_body!(VmResizeZone); vm_action_put_handler_body!(VmSnapshot); -vm_action_put_handler_body!(VmReceiveMigration); -vm_action_put_handler_body!(VmSendMigration); #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] vm_action_put_handler_body!(VmCoredump); -// Special handling for virtio-net devices backed by network FDs. +// Special handling for externally provided FDs. // See module description for more info. 
impl PutHandler for VmAddNet { fn handle_request( @@ -459,6 +461,63 @@ impl PutHandler for VmAddNet { impl GetHandler for VmAddNet {} +// Special handling for externally provided FDs. +// See module description for more info. +impl PutHandler for VmReceiveMigration { + fn handle_request( + &'static self, + api_notifier: EventFd, + api_sender: Sender, + body: &Option, + files: Vec, + ) -> std::result::Result, HttpError> { + if let Some(body) = body { + let mut net_cfg: VmReceiveMigrationData = serde_json::from_slice(body.raw())?; + if !net_cfg.net_fds.is_empty() { + let mut cfgs = net_cfg.net_fds.iter_mut().collect::>(); + let cfgs = cfgs.as_mut_slice(); + attach_fds_to_cfgs(files, cfgs)?; + } + + self.send(api_notifier, api_sender, net_cfg) + .map_err(HttpError::ApiError) + } else { + Err(HttpError::BadRequest) + } + } +} + +impl GetHandler for VmReceiveMigration {} + +// Special Handling for virtio-net Devices Backed by Network File Descriptors +// +// See above. +impl PutHandler for VmSendMigration { + fn handle_request( + &'static self, + api_notifier: EventFd, + api_sender: Sender, + body: &Option, + _files: Vec, + ) -> std::result::Result, HttpError> { + if let Some(body) = body { + self.send( + api_notifier, + api_sender, + serde_json::from_slice(body.raw())?, + ) + .inspect(|_| { + info!("live migration started (in background)"); + }) + .map_err(HttpError::ApiError) + } else { + Err(HttpError::BadRequest) + } + } +} + +impl GetHandler for VmSendMigration {} + impl PutHandler for VmResize { fn handle_request( &'static self, @@ -487,7 +546,7 @@ impl PutHandler for VmResize { impl GetHandler for VmResize {} -// Special handling for virtio-net devices backed by network FDs. +// Special handling for externally provided FDs. // See module description for more info. 
impl PutHandler for VmRestore { fn handle_request( @@ -632,6 +691,32 @@ impl EndpointHandler for VmmShutdown { } } +impl EndpointHandler for VmMigrationProgress { + fn handle_request( + &self, + req: &Request, + api_notifier: EventFd, + api_sender: Sender, + ) -> Response { + match req.method() { + Method::Get => match crate::api::VmMigrationProgress + .send(api_notifier, api_sender, ()) + .map_err(HttpError::ApiError) + { + Ok(info) => { + let mut response = Response::new(Version::Http11, StatusCode::OK); + let info_serialized = serde_json::to_string(&info).unwrap(); + + response.set_body(Body::new(info_serialized)); + response + } + Err(e) => error_response(e, StatusCode::InternalServerError), + }, + _ => error_response(HttpError::BadRequest, StatusCode::BadRequest), + } + } +} + #[cfg(test)] mod external_fds_tests { use super::*; diff --git a/vmm/src/api/http/mod.rs b/vmm/src/api/http/mod.rs index f7cea4fafa..5464ca87ab 100644 --- a/vmm/src/api/http/mod.rs +++ b/vmm/src/api/http/mod.rs @@ -29,9 +29,10 @@ use self::http_endpoint::{VmActionHandler, VmCreate, VmInfo, VmmPing, VmmShutdow use crate::api::VmCoredump; use crate::api::{ AddDisk, ApiError, ApiRequest, VmAddDevice, VmAddFs, VmAddGenericVhostUser, VmAddNet, - VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, VmCounters, VmDelete, VmNmi, - VmPause, VmPowerButton, VmReboot, VmReceiveMigration, VmRemoveDevice, VmResize, VmResizeDisk, - VmResizeZone, VmRestore, VmResume, VmSendMigration, VmShutdown, VmSnapshot, + VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, VmCancelMigration, VmCounters, + VmDelete, VmMigrationProgress, VmNmi, VmPause, VmPostMigrationAnnounce, VmPowerButton, + VmReboot, VmReceiveMigration, VmRemoveDevice, VmResize, VmResizeDisk, VmResizeZone, VmRestore, + VmResume, VmSendMigration, VmShutdown, VmSnapshot, }; use crate::landlock::Landlock; use crate::seccomp_filters::{Thread, get_seccomp_filter}; @@ -273,14 +274,26 @@ pub static HTTP_ROUTES: LazyLock = LazyLock::new(|| { 
endpoint!("/vm.resume"), Box::new(VmActionHandler::new(&VmResume)), ); + r.routes.insert( + endpoint!("/vm.post-migration-announce"), + Box::new(VmActionHandler::new(&VmPostMigrationAnnounce)), + ); r.routes.insert( endpoint!("/vm.send-migration"), Box::new(VmActionHandler::new(&VmSendMigration)), ); + r.routes.insert( + endpoint!("/vm.cancel-migration"), + Box::new(VmActionHandler::new(&VmCancelMigration)), + ); r.routes.insert( endpoint!("/vm.shutdown"), Box::new(VmActionHandler::new(&VmShutdown)), ); + r.routes.insert( + endpoint!("/vm.migration-progress"), + Box::new(VmMigrationProgress {}), + ); r.routes.insert( endpoint!("/vm.snapshot"), Box::new(VmActionHandler::new(&VmSnapshot)), diff --git a/vmm/src/api/mod.rs b/vmm/src/api/mod.rs index e4ee7235ad..6d1f7c6d7e 100644 --- a/vmm/src/api/mod.rs +++ b/vmm/src/api/mod.rs @@ -35,29 +35,31 @@ pub mod http; use std::io; use std::num::{NonZeroU32, NonZeroU64}; +use std::path::PathBuf; use std::str::FromStr; use std::sync::mpsc::{RecvError, SendError, Sender, channel}; use std::time::Duration; -use log::info; +use log::{info, trace}; use micro_http::Body; use option_parser::{OptionParser, OptionParserError, Toggle}; use serde::{Deserialize, Serialize}; use thiserror::Error; use vm_migration::MigratableError; +use vm_migration::progress::MigrationProgress; use vmm_sys_util::eventfd::EventFd; #[cfg(feature = "dbus_api")] pub use self::dbus::start_dbus_thread; pub use self::http::{start_http_fd_thread, start_http_path_thread}; use crate::Error as VmmError; -use crate::config::RestoreConfig; +use crate::config::{RestoreConfig, RestoredNetConfig}; use crate::device_tree::DeviceTree; use crate::migration_transport::MAX_MIGRATION_CONNECTIONS; use crate::vm::{Error as VmError, VmState}; use crate::vm_config::{ - DeviceConfig, DiskConfig, FsConfig, GenericVhostUserConfig, NetConfig, PmemConfig, - UserDeviceConfig, VdpaConfig, VmConfig, VsockConfig, + DeviceConfig, DiskConfig, FsConfig, GenericVhostUserConfig, 
MemoryZoneConfig, NetConfig, + PmemConfig, UserDeviceConfig, VdpaConfig, VmConfig, VsockConfig, }; /// API errors are sent back from the VMM API server through the ApiResponse. @@ -103,6 +105,10 @@ pub enum ApiError { #[error("The VM could not resume")] VmResume(#[source] VmError), + /// The VM could not perform the post-migration announcement. + #[error("The VM could not perform the post-migration announcement")] + VmPostMigrationAnnounce(#[source] VmError), + /// The VM is not booted. #[error("The VM is not booted")] VmNotBooted, @@ -203,6 +209,10 @@ pub enum ApiError { #[error("Error starting migration sender")] VmSendMigration(#[source] MigratableError), + /// Error cancelling migration + #[error("Error cancelling migration")] + VmCancelMigration(#[source] MigratableError), + /// Error triggering power button #[error("Error triggering power button")] VmPowerButton(#[source] VmError), @@ -210,6 +220,10 @@ pub enum ApiError { /// Error triggering NMI #[error("Error triggering NMI")] VmNmi(#[source] VmError), + + /// Error fetching the migration progress + #[error("Error fetching the migration progress")] + VmMigrationProgress(#[source] VmError), } pub type ApiResult = Result; @@ -266,9 +280,164 @@ pub struct VmCoredumpData { } #[derive(Clone, Deserialize, Serialize, Default, Debug)] +#[cfg_attr(test, derive(PartialEq))] pub struct VmReceiveMigrationData { /// URL for the reception of migration state pub receiver_url: String, + /// Directory containing the TLS server certificate (server-cert.pem), the TLS server key (server-key.pem), and the client TLS root CA certificate (ca-cert.pem). + #[serde(default)] + pub tls_dir: Option, + /// Map with new network FDs on the new host. + #[serde(default)] + pub net_fds: Vec, + /// Optional URL if the TCP serial configuration must be changed during + /// migration. Example: "192.168.1.1:2222". + #[serde(default)] + pub tcp_serial_url: Option, + /// Optional memory zone reconfiguration data. 
+ #[serde(default)] + pub zones: Vec, +} + +#[derive(Debug, Error)] +pub enum VmReceiveMigrationConfigError { + #[error("Error parsing receive migration parameters")] + ParseError(#[source] OptionParserError), + + #[error("Error validating receive migration parameters")] + ValidationError(String), +} + +/// Validates the host and port portion of a TCP migration URL. +/// +/// The expected format is `:` for hostnames and IPv4 addresses, or +/// `[]:` for IPv6 addresses. The host and port must both be +/// present, and the port must parse as a `u16`. +fn validate_tcp_migration_address(address: &str) -> Result<(), String> { + let (host, port) = if let Some(rest) = address.strip_prefix('[') { + let (host, rest) = rest + .split_once(']') + .ok_or_else(|| "missing closing ']' for bracketed IPv6 address".to_string())?; + + let port = rest + .strip_prefix(':') + .ok_or_else(|| "missing port separator after bracketed host".to_string())?; + + (host, port) + } else { + address + .rsplit_once(':') + .ok_or_else(|| "missing TCP port".to_string())? 
+ }; + + if host.is_empty() { + return Err("host must not be empty".to_string()); + } + + if port.is_empty() { + return Err("port must not be empty".to_string()); + } + + port.parse::() + .map_err(|_| format!("invalid TCP port: {port}"))?; + + Ok(()) +} + +impl VmReceiveMigrationData { + pub const SYNTAX: &'static str = "VM receive migration parameters \ + \"\" or \"receiver_url=[,tls_dir=,tcp_serial_url=]\""; + + pub fn parse(migration: &str) -> Result { + let uses_key_value_syntax = migration.split(',').any( + |part| matches!(part, p if p.starts_with("receiver_url=") || p.starts_with("tls_dir=")), + ); + + if !uses_key_value_syntax { + let data = Self { + receiver_url: migration.to_owned(), + tls_dir: None, + net_fds: vec![], + tcp_serial_url: None, + zones: vec![], + }; + + data.validate()?; + + return Ok(data); + } + + let mut parser = OptionParser::new(); + parser + .add("receiver_url") + .add("tls_dir") + .add("tcp_serial_url"); + parser + .parse(migration) + .map_err(VmReceiveMigrationConfigError::ParseError)?; + + let receiver_url = parser.get("receiver_url").ok_or_else(|| { + VmReceiveMigrationConfigError::ParseError(OptionParserError::InvalidSyntax( + "receiver_url is required".to_string(), + )) + })?; + let tls_dir = parser + .convert::("tls_dir") + .map_err(VmReceiveMigrationConfigError::ParseError)? + .map(|path| PathBuf::from(&path)); + let tcp_serial_url = parser + .convert::("tcp_serial_url") + .map_err(VmReceiveMigrationConfigError::ParseError)?; + + let data = Self { + receiver_url, + tls_dir, + net_fds: vec![], + tcp_serial_url, + zones: vec![], + }; + + data.validate()?; + + Ok(data) + } + + pub fn validate(&self) -> Result<(), VmReceiveMigrationConfigError> { + if let Some(addr) = self.receiver_url.strip_prefix("tcp:") { + validate_tcp_migration_address(addr).map_err(|e| { + VmReceiveMigrationConfigError::ValidationError(format!( + "receiver_url must use tcp:: or unix:: {e}." 
+ )) + })?; + } else if self + .receiver_url + .strip_prefix("unix:") + .is_some_and(|path| !path.is_empty()) + { + if self.tls_dir.is_some() { + return Err(VmReceiveMigrationConfigError::ValidationError( + "UNIX sockets and TLS encryption cannot be used at the same time.".to_string(), + )); + } + } else { + return Err(VmReceiveMigrationConfigError::ValidationError( + "receiver_url must use tcp:: or unix:.".to_string(), + )); + } + + // The TLS implementation checks for all necessary files. Here we only + // check whether the path exists and points to a directory. + if let Some(tls_dir) = &self.tls_dir + && !tls_dir.is_dir() + { + return Err(VmReceiveMigrationConfigError::ValidationError(format!( + "tls_dir must point to a directory. Path: {}", + tls_dir.display() + ))); + } + + Ok(()) + } } #[derive(Copy, Clone, Default, Deserialize, Serialize, Debug, PartialEq, Eq)] @@ -330,13 +499,19 @@ pub struct VmSendMigrationData { /// Must be between 1 and `MAX_MIGRATION_CONNECTIONS` inclusive. #[serde(default = "VmSendMigrationData::default_connections")] pub connections: NonZeroU32, + /// Path to the directory containing the TLS root CA certificate (ca-cert.pem), the TLS client certificate (client-cert.pem), and TLS client key (client-key.pem). + #[serde(default)] + pub tls_dir: Option, + /// Keep the VMM alive. + pub keep_alive: bool, } impl VmSendMigrationData { pub const SYNTAX: &'static str = "VM send migration parameters \ \"destination_url=[,local=on|off,\ downtime_ms=,timeout_s=,\ - timeout_strategy=cancel|ignore,connections=]\""; + timeout_strategy=cancel|ignore,connections=,\ + tls_dir=]\""; // Same as QEMU. 
pub const DEFAULT_DOWNTIME: Duration = Duration::from_millis(300); @@ -364,7 +539,9 @@ impl VmSendMigrationData { .add("downtime_ms") .add("timeout_s") .add("timeout_strategy") - .add("connections"); + .add("connections") + .add("tls_dir") + .add("keep_alive"); parser .parse(migration) .map_err(VmSendMigrationConfigError::ParseError)?; @@ -416,6 +593,15 @@ impl VmSendMigrationData { })?, None => Self::default_connections(), }; + let tls_dir = parser + .convert::("tls_dir") + .map_err(VmSendMigrationConfigError::ParseError)? + .map(|path| PathBuf::from(&path)); + let keep_alive = parser + .convert::("keep_alive") + .map_err(VmSendMigrationConfigError::ParseError)? + .unwrap_or(Toggle(false)) + .0; let data = Self { destination_url, @@ -424,6 +610,8 @@ impl VmSendMigrationData { timeout_s, timeout_strategy, connections, + tls_dir, + keep_alive, }; data.validate()?; @@ -440,26 +628,32 @@ impl VmSendMigrationData { } pub fn validate(&self) -> Result<(), VmSendMigrationConfigError> { - match self.destination_url.as_str() { - url if url - .strip_prefix("tcp:") - .is_some_and(|addr| !addr.is_empty()) => {} - url if url - .strip_prefix("unix:") - .is_some_and(|path| !path.is_empty()) => - { - if self.connections.get() > 1 { - return Err(VmSendMigrationConfigError::ValidationError( - "UNIX sockets and connections option cannot be used at the same time." - .to_string(), - )); - } + if let Some(addr) = self.destination_url.strip_prefix("tcp:") { + validate_tcp_migration_address(addr).map_err(|e| { + VmSendMigrationConfigError::ValidationError(format!( + "destination_url must use tcp:: or unix:: {e}." + )) + })?; + } else if self + .destination_url + .strip_prefix("unix:") + .is_some_and(|path| !path.is_empty()) + { + if self.connections.get() > 1 { + return Err(VmSendMigrationConfigError::ValidationError( + "UNIX sockets and connections option cannot be used at the same time." 
+ .to_string(), + )); } - _ => { + if self.tls_dir.is_some() { return Err(VmSendMigrationConfigError::ValidationError( - "destination_url must use tcp:: or unix:.".to_string(), + "UNIX sockets and TLS encryption cannot be used at the same time.".to_string(), )); } + } else { + return Err(VmSendMigrationConfigError::ValidationError( + "destination_url must use tcp:: or unix:.".to_string(), + )); } if self.connections.get() > MAX_MIGRATION_CONNECTIONS { @@ -483,6 +677,17 @@ impl VmSendMigrationData { } } + // The TLS implementation checks for all necessary files. Here we only + // check whether the path exists and points to a directory. + if let Some(tls_dir) = &self.tls_dir + && !tls_dir.is_dir() + { + return Err(VmSendMigrationConfigError::ValidationError(format!( + "tls_dir must point to a directory. Path: {}", + tls_dir.display() + ))); + } + Ok(()) } } @@ -494,6 +699,9 @@ pub enum ApiResponsePayload { /// Virtual machine information VmInfo(VmInfoResponse), + /// The progress of a possibly ongoing live migration. + VmMigrationProgress(Box>), + /// Vmm ping response VmmPing(VmmPingResponse), @@ -513,6 +721,8 @@ pub trait RequestHandler { fn vm_resume(&mut self) -> Result<(), VmError>; + fn vm_post_migration_announce(&mut self) -> Result<(), VmError>; + fn vm_snapshot(&mut self, destination_url: &str) -> Result<(), VmError>; fn vm_restore(&mut self, restore_cfg: RestoreConfig) -> Result<(), VmError>; @@ -578,12 +788,23 @@ pub trait RequestHandler { receive_data_migration: VmReceiveMigrationData, ) -> Result<(), MigratableError>; + /// Dispatches the migration. fn vm_send_migration( &mut self, send_data_migration: VmSendMigrationData, ) -> Result<(), MigratableError>; + /// Triggers a migration cancellation. + /// + /// The cancellation is not guaranteed to succeed, as the migration may have + /// succeeded already. 
+ fn vm_cancel_migration(&mut self) -> Result<(), MigratableError>; + fn vm_nmi(&mut self) -> Result<(), VmError>; + + /// Returns the progress of the currently active migration or any previous + /// failed or canceled migration. + fn vm_migration_progress(&mut self) -> Option; } /// It would be nice if we could pass around an object like this: @@ -1062,7 +1283,7 @@ impl ApiAction for VmCounters { fn request(&self, _: Self::RequestBody, response_sender: Sender) -> ApiRequest { Box::new(move |vmm| { - info!("API request event: VmCounters"); + trace!("API request event: VmCounters"); let response = vmm .vm_counters() @@ -1167,7 +1388,7 @@ impl ApiAction for VmInfo { fn request(&self, _: Self::RequestBody, response_sender: Sender) -> ApiRequest { Box::new(move |vmm| { - info!("API request event: VmInfo"); + trace!("API request event: VmInfo"); let response = vmm .vm_info() @@ -1329,6 +1550,39 @@ impl ApiAction for VmReceiveMigration { } } +pub struct VmCancelMigration; + +impl ApiAction for VmCancelMigration { + type RequestBody = (); + type ResponseBody = Option; + + fn request(&self, data: Self::RequestBody, response_sender: Sender) -> ApiRequest { + Box::new(move |vmm| { + info!("API request event: VmCancelMigration {data:?}"); + + let response = vmm + .vm_cancel_migration() + .map_err(ApiError::VmCancelMigration) + .map(|_| ApiResponsePayload::Empty); + + response_sender + .send(response) + .map_err(VmmError::ApiResponseSend)?; + + Ok(false) + }) + } + + fn send( + &self, + api_evt: EventFd, + api_sender: Sender, + data: Self::RequestBody, + ) -> ApiResult { + get_response_body(self, api_evt, api_sender, data) + } +} + pub struct VmRemoveDevice; impl ApiAction for VmRemoveDevice { @@ -1549,6 +1803,39 @@ impl ApiAction for VmResume { } } +pub struct VmPostMigrationAnnounce; + +impl ApiAction for VmPostMigrationAnnounce { + type RequestBody = (); + type ResponseBody = Option; + + fn request(&self, _: Self::RequestBody, response_sender: Sender) -> ApiRequest { + 
Box::new(move |vmm| { + info!("API request event: VmPostMigrationAnnounce"); + + let response = vmm + .vm_post_migration_announce() + .map_err(ApiError::VmPostMigrationAnnounce) + .map(|_| ApiResponsePayload::Empty); + + response_sender + .send(response) + .map_err(VmmError::ApiResponseSend)?; + + Ok(false) + }) + } + + fn send( + &self, + api_evt: EventFd, + api_sender: Sender, + data: Self::RequestBody, + ) -> ApiResult { + get_response_body(self, api_evt, api_sender, data) + } +} + pub struct VmSendMigration; impl ApiAction for VmSendMigration { @@ -1754,10 +2041,146 @@ impl ApiAction for VmNmi { } } +pub struct VmMigrationProgress; + +impl ApiAction for VmMigrationProgress { + type RequestBody = (); + type ResponseBody = Box>; + + fn request(&self, _: Self::RequestBody, response_sender: Sender) -> ApiRequest { + Box::new(move |vmm| { + trace!("API request event: VmMigrationProgress"); + + let snapshot = Ok(vmm.vm_migration_progress()); + let response = snapshot + .map(Box::new) + .map(ApiResponsePayload::VmMigrationProgress) + .map_err(ApiError::VmMigrationProgress); + + response_sender + .send(response) + .map_err(VmmError::ApiResponseSend)?; + + Ok(false) + }) + } + + fn send( + &self, + api_evt: EventFd, + api_sender: Sender, + data: Self::RequestBody, + ) -> ApiResult { + let info = get_response(self, api_evt, api_sender, data)?; + + match info { + ApiResponsePayload::VmMigrationProgress(info) => Ok(info), + _ => Err(ApiError::ResponsePayloadType), + } + } +} + #[cfg(test)] mod unit_tests { use super::*; + #[test] + fn test_validate_tcp_migration_address() { + for address in [ + "192.168.1.1:8080", + "destination.example:8080", + "[2001:db8::1]:8080", + "[::1]:0", + "localhost:65535", + ] { + validate_tcp_migration_address(address) + .unwrap_or_else(|e| panic!("expected {address} to be valid, got: {e}")); + } + + assert_eq!( + validate_tcp_migration_address("192.168.1.1").unwrap_err(), + "missing TCP port" + ); + assert_eq!( + 
validate_tcp_migration_address(":8080").unwrap_err(), + "host must not be empty" + ); + assert_eq!( + validate_tcp_migration_address("host:").unwrap_err(), + "port must not be empty" + ); + assert_eq!( + validate_tcp_migration_address("host:not-a-port").unwrap_err(), + "invalid TCP port: not-a-port" + ); + assert_eq!( + validate_tcp_migration_address("[2001:db8::1").unwrap_err(), + "missing closing ']' for bracketed IPv6 address" + ); + assert_eq!( + validate_tcp_migration_address("[]:8080").unwrap_err(), + "host must not be empty" + ); + assert_eq!( + validate_tcp_migration_address("[2001:db8::1]").unwrap_err(), + "missing port separator after bracketed host" + ); + assert_eq!( + validate_tcp_migration_address("[2001:db8::1]:").unwrap_err(), + "port must not be empty" + ); + assert_eq!( + validate_tcp_migration_address("[2001:db8::1]:99999").unwrap_err(), + "invalid TCP port: 99999" + ); + } + + #[test] + fn test_vm_receive_migration_data_parse() { + let data = VmReceiveMigrationData::parse("tcp:192.168.1.1:8080").unwrap(); + assert_eq!( + data, + VmReceiveMigrationData { + receiver_url: "tcp:192.168.1.1:8080".to_string(), + tls_dir: None, + net_fds: vec![], + tcp_serial_url: None, + zones: vec![], + } + ); + + let data = VmReceiveMigrationData::parse("tcp:[2001:db8::1]:8080").unwrap(); + assert_eq!(data.receiver_url, "tcp:[2001:db8::1]:8080"); + + let data = VmReceiveMigrationData::parse("tcp:destination.example:8080").unwrap(); + assert_eq!(data.receiver_url, "tcp:destination.example:8080"); + + let data = VmReceiveMigrationData::parse("unix:/tmp/ch=migrate.sock").unwrap(); + assert_eq!(data.receiver_url, "unix:/tmp/ch=migrate.sock"); + + let tls_dir = std::env::temp_dir(); + let data = VmReceiveMigrationData::parse(&format!( + "receiver_url=tcp:192.168.1.1:8080,tls_dir={},tcp_serial_url=1.2.3.4:6789", + tls_dir.display(), + )) + .unwrap(); + assert_eq!( + data, + VmReceiveMigrationData { + receiver_url: "tcp:192.168.1.1:8080".to_string(), + tls_dir: 
Some(tls_dir), + net_fds: vec![], + tcp_serial_url: Some("1.2.3.4:6789".to_string()), + zones: vec![], + } + ); + + VmReceiveMigrationData::parse("receiver_url=file:///tmp/migration").unwrap_err(); + VmReceiveMigrationData::parse("tcp:192.168.1.1").unwrap_err(); + VmReceiveMigrationData::parse("tcp:[2001:db8::1]").unwrap_err(); + VmReceiveMigrationData::parse("receiver_url=unix:/tmp/sock,tls_dir=/tmp").unwrap_err(); + } + #[test] fn test_vm_send_migration_data_parse() { // Fully specified @@ -1781,6 +2204,14 @@ mod unit_tests { assert_eq!(data.timeout_strategy, TimeoutStrategy::default()); assert_eq!(data.connections, VmSendMigrationData::default_connections()); + let data = VmSendMigrationData::parse("destination_url=tcp:[2001:db8::1]:8080") + .expect("IPv6 migration string should parse"); + assert_eq!(data.destination_url, "tcp:[2001:db8::1]:8080"); + + let data = VmSendMigrationData::parse("destination_url=tcp:destination.example:8080") + .expect("hostname migration string should parse"); + assert_eq!(data.destination_url, "tcp:destination.example:8080"); + // Missing destination_url is an error VmSendMigrationData::parse("local=on,downtime_ms=200").unwrap_err(); @@ -1817,6 +2248,8 @@ mod unit_tests { // Invalid destination URL scheme is rejected VmSendMigrationData::parse("destination_url=file:///tmp/migration").unwrap_err(); + VmSendMigrationData::parse("destination_url=tcp:192.168.1.1").unwrap_err(); + VmSendMigrationData::parse("destination_url=tcp:[2001:db8::1]").unwrap_err(); // Local migration requires a UNIX socket destination VmSendMigrationData::parse("destination_url=tcp:192.168.1.1:8080,local=yes").unwrap_err(); @@ -1838,12 +2271,15 @@ mod unit_tests { timeout_s: VmSendMigrationData::default_timeout_s(), timeout_strategy: Default::default(), connections: VmSendMigrationData::default_connections(), + tls_dir: None, + keep_alive: false, } ); // Happy path, fully specified + let tls_dir = std::env::temp_dir(); let data = - 
VmSendMigrationData::parse("destination_url=tcp:192.168.1.1:8080,downtime_ms=150,timeout_s=900,timeout_strategy=ignore,connections=4") + VmSendMigrationData::parse(&format!("destination_url=tcp:192.168.1.1:8080,downtime_ms=150,timeout_s=900,timeout_strategy=ignore,connections=4,tls_dir={},keep_alive=true", tls_dir.display())) .unwrap(); assert_eq!( data, @@ -1854,6 +2290,8 @@ mod unit_tests { timeout_s: NonZeroU64::new(900).unwrap(), timeout_strategy: TimeoutStrategy::Ignore, connections: NonZeroU32::new(4).unwrap(), + tls_dir: Some(tls_dir), + keep_alive: true } ); } diff --git a/vmm/src/api/openapi/cloud-hypervisor.yaml b/vmm/src/api/openapi/cloud-hypervisor.yaml index e8b72f7484..fd5ccec531 100644 --- a/vmm/src/api/openapi/cloud-hypervisor.yaml +++ b/vmm/src/api/openapi/cloud-hypervisor.yaml @@ -109,6 +109,16 @@ paths: 405: description: The VM instance could not resume because it is not paused. + /vm.post-migration-announce: + put: + summary: Trigger post-migration announcements for a running VM instance. + operationId: postMigrationAnnounceVM + responses: + 204: + description: The VM instance successfully triggered post-migration announcements. + 500: + description: The VM instance could not trigger post-migration announcements because it is not running. + /vm.shutdown: put: summary: Shut the VM instance down. 
@@ -784,14 +794,32 @@ components: iommu_address_width: type: integer format: uint8 + system_serial_number: + type: string serial_number: type: string + deprecated: true + system_uuid: + type: string uuid: type: string + deprecated: true oem_strings: type: array items: type: string + system_manufacturer: + type: string + system_product_name: + type: string + system_version: + type: string + system_family: + type: string + system_sku_number: + type: string + chassis_asset_tag: + type: string tdx: type: boolean default: false @@ -1452,6 +1480,12 @@ components: properties: receiver_url: type: string + tls_dir: + type: string + description: > + Directory containing the TLS server certificate (server-cert.pem), the TLS + server key (server-key.pem), and the client TLS root CA certificate (ca-cert.pem). + TLS is only supported with tcp:: receiver URLs. TimeoutStrategy: type: string @@ -1499,6 +1533,12 @@ components: The number of parallel TCP connections to use for migration. Must be between 1 and 128. Multiple connections are not supported with local UNIX-socket migration. + tls_dir: + type: string + description: > + Directory containing the TLS root CA certificate (ca-cert.pem), the TLS client + certificate (client-cert.pem), and TLS client key (client-key.pem). + TLS is only supported with tcp:: destination URLs. 
VmAddUserDevice: required: diff --git a/vmm/src/config.rs b/vmm/src/config.rs index 00bf251f3a..d7d1283067 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -11,6 +11,7 @@ use std::result; use std::str::FromStr; use std::sync::LazyLock; +use arch::CpuProfile; use block::ImageType; use clap::ArgMatches; use log::{debug, warn}; @@ -222,6 +223,9 @@ pub enum ValidationError { /// Missing file value for console #[error("Path missing when using file console mode")] ConsoleFileMissing, + /// Missing TCP address for console + #[error("Address missing when using TCP console mode")] + ConsoleTcpAddressMissing, /// Missing socket path for console #[error("Path missing when using socket console mode")] ConsoleSocketPathMissing, @@ -691,7 +695,8 @@ impl CpusConfig { .add("affinity") .add("features") .add("nested") - .add("core_scheduling"); + .add("core_scheduling") + .add("profile"); parser.parse(cpus).map_err(Error::ParseCpus)?; let boot_vcpus: u32 = parser @@ -723,6 +728,12 @@ impl CpusConfig { }) .collect() }); + + let profile = parser + .convert::("profile") + .map_err(Error::ParseCpus)? + .unwrap_or_default(); + let features_list = parser .convert::("features") .map_err(Error::ParseCpus)? 
@@ -765,6 +776,7 @@ impl CpusConfig { features, nested, core_scheduling, + profile, }) } } @@ -832,9 +844,11 @@ impl PlatformConfig { static SYNTAX: LazyLock = LazyLock::new(|| { let mut syntax = "Platform configuration parameters \ \"num_pci_segments=,iommu_segments=,\ - iommu_address_width=,serial_number=,\ - uuid=,oem_strings=,iommufd=on|off,\ - vfio_p2p_dma=on|off" + iommu_address_width=,iommufd=on|off,vfio_p2p_dma=on|off,system_manufacturer=,\ + system_product_name=,system_version=,\ + system_serial_number=,system_uuid=,\ + system_sku_number=,system_family=,\ + oem_strings=,chassis_asset_tag=" .to_string(); if cfg!(feature = "tdx") { @@ -854,16 +868,61 @@ impl PlatformConfig { } pub fn parse(platform: &str) -> Result { + struct StringField { + key: &'static str, + apply: fn(&mut PlatformConfig, String), + } + + const SMBIOS_STRING_FIELDS: &[StringField] = &[ + StringField { + key: "system_manufacturer", + apply: |config, value| config.system_manufacturer = Some(value), + }, + StringField { + key: "system_product_name", + apply: |config, value| config.system_product_name = Some(value), + }, + StringField { + key: "system_version", + apply: |config, value| config.system_version = Some(value), + }, + StringField { + key: "system_serial_number", + apply: |config, value| config.system_serial_number = Some(value), + }, + StringField { + key: "system_uuid", + apply: |config, value| config.system_uuid = Some(value), + }, + StringField { + key: "system_sku_number", + apply: |config, value| config.system_sku_number = Some(value), + }, + StringField { + key: "system_family", + apply: |config, value| config.system_family = Some(value), + }, + StringField { + key: "chassis_asset_tag", + apply: |config, value| config.chassis_asset_tag = Some(value), + }, + ]; + let mut parser = OptionParser::new(); parser .add("num_pci_segments") .add("iommu_segments") .add("iommu_address_width") + .add("oem_strings") .add("serial_number") .add("uuid") .add("oem_strings") .add("iommufd") - 
.add("vfio_p2p_dma"); + .add("vfio_p2p_dma") + .add("uuid"); + for field in SMBIOS_STRING_FIELDS { + parser.add(field.key); + } #[cfg(feature = "tdx")] parser.add("tdx"); #[cfg(feature = "sev_snp")] @@ -882,14 +941,11 @@ impl PlatformConfig { .convert("iommu_address_width") .map_err(Error::ParsePlatform)? .unwrap_or(MAX_IOMMU_ADDRESS_WIDTH_BITS); - let serial_number = parser - .convert("serial_number") - .map_err(Error::ParsePlatform)?; - let uuid = parser.convert("uuid").map_err(Error::ParsePlatform)?; let oem_strings = parser .convert::("oem_strings") .map_err(Error::ParsePlatform)? - .map(|v| v.0); + .map(|v| v.0) + .unwrap_or_default(); let iommufd = parser .convert::("iommufd") .map_err(Error::ParsePlatform)? @@ -912,20 +968,77 @@ impl PlatformConfig { .map_err(Error::ParsePlatform)? .unwrap_or(Toggle(false)) .0; - Ok(PlatformConfig { + + let mut platform_config = PlatformConfig { num_pci_segments, iommu_segments, iommu_address_width_bits, - serial_number, - uuid, + system_serial_number: None, + system_uuid: None, oem_strings, + system_manufacturer: None, + system_product_name: None, + system_version: None, + system_family: None, + system_sku_number: None, + chassis_asset_tag: None, iommufd, - vfio_p2p_dma, #[cfg(feature = "tdx")] tdx, #[cfg(feature = "sev_snp")] sev_snp, - }) + vfio_p2p_dma, + }; + + for field in SMBIOS_STRING_FIELDS { + if let Some(value) = parser + .convert::(field.key) + .map_err(Error::ParsePlatform)? 
+ { + (field.apply)(&mut platform_config, value); + } + } + + let legacy_serial_number = parser + .convert::("serial_number") + .map_err(Error::ParsePlatform)?; + if legacy_serial_number.is_some() { + warn!("'serial_number' in --platform is deprecated; use 'system_serial_number'."); + } + platform_config.system_serial_number = platform_config + .system_serial_number + .or(legacy_serial_number); + + let legacy_uuid = parser + .convert::("uuid") + .map_err(Error::ParsePlatform)?; + if legacy_uuid.is_some() { + warn!("'uuid' in --platform is deprecated; use 'system_uuid'."); + } + platform_config.system_uuid = platform_config.system_uuid.or(legacy_uuid); + #[cfg(feature = "tdx")] + let tdx = parser + .convert::("tdx") + .map_err(Error::ParsePlatform)? + .unwrap_or(Toggle(false)) + .0; + #[cfg(feature = "sev_snp")] + let sev_snp = parser + .convert::("sev_snp") + .map_err(Error::ParsePlatform)? + .unwrap_or(Toggle(false)) + .0; + + #[cfg(feature = "tdx")] + { + platform_config.tdx = tdx; + } + #[cfg(feature = "sev_snp")] + { + platform_config.sev_snp = sev_snp; + } + + Ok(platform_config) } pub fn validate(&self) -> ValidationResult<()> { @@ -2141,7 +2254,7 @@ impl PmemConfig { } impl CommonConsoleConfig { - const VALUELESS_OPTIONS: &[&str] = &["off", "pty", "tty", "null"]; + const VALUELESS_OPTIONS: &[&str] = &["off", "pty", "tty", "null", "tcp"]; const VALUE_OPTIONS: &[&str] = &["file", "socket"]; fn parse(console: &str, map_err: impl Fn(OptionParserError) -> Error) -> Result { @@ -2153,6 +2266,7 @@ impl CommonConsoleConfig { let mut file: Option = None; let mut socket: Option = None; + let mut url: Option = None; let mut mode: ConsoleOutputMode = ConsoleOutputMode::Off; if parser.is_set("off") { @@ -2168,6 +2282,19 @@ impl CommonConsoleConfig { Some(PathBuf::from(parser.get("file").ok_or( Error::Validation(ValidationError::ConsoleFileMissing), )?)); + } else if parser.is_set("tcp") { + mode = ConsoleOutputMode::Tcp; + url = Some( + parser + .get("tcp") + 
.ok_or(Error::Validation(ValidationError::ConsoleTcpAddressMissing))?, + ); + if parser.is_set("file") { + file = + Some(PathBuf::from(parser.get("file").ok_or( + Error::Validation(ValidationError::ConsoleFileMissing), + )?)); + } } else if parser.is_set("socket") { mode = ConsoleOutputMode::Socket; socket = Some(PathBuf::from(parser.get("socket").ok_or( @@ -2177,7 +2304,12 @@ impl CommonConsoleConfig { return Err(Error::ParseConsoleInvalidModeGiven); } - Ok(Self { mode, file, socket }) + Ok(Self { + mode, + file, + socket, + url, + }) } } @@ -2558,6 +2690,27 @@ pub struct RestoredNetConfig { pub fds: Option>, } +impl RestoredNetConfig { + // Ensure all net devices from 'VmConfig' backed by FDs have a + // corresponding 'RestoreNetConfig' with a matched 'id' and expected + // number of FDs. + pub fn validate(&self, vm_config: &VmConfig) -> ValidationResult<()> { + let found = vm_config + .net + .iter() + .flatten() + .any(|net| net.pci_common.id.as_ref() == Some(&self.id)); + + if found { + Ok(()) + } else { + Err(ValidationError::RestoreMissingRequiredNetId( + self.id.clone(), + )) + } + } +} + fn deserialize_restorednetconfig_fds<'de, D>( d: D, ) -> std::result::Result>, D::Error> @@ -3560,6 +3713,8 @@ impl VmConfig { /// To use this safely, the caller must guarantee that the input /// fds are all valid. pub unsafe fn add_preserved_fds(&mut self, mut fds: Vec) { + debug!("adding preserved FDs to VM list: {fds:?}"); + if fds.is_empty() { return; } @@ -3614,7 +3769,16 @@ impl Clone for VmConfig { .preserved_fds .as_ref() // SAFETY: FFI call with valid FDs - .map(|fds| fds.iter().map(|fd| unsafe { libc::dup(*fd) }).collect()), + .map(|fds| { + fds.iter() + .map(|fd| { + // SAFETY: Trivially safe. 
+ let fd_duped = unsafe { libc::dup(*fd) }; + warn!("Cloning VM config: duping preserved FD {fd} => {fd_duped}"); + fd_duped + }) + .collect() + }), landlock_rules: self.landlock_rules.clone(), #[cfg(feature = "ivshmem")] ivshmem: self.ivshmem.clone(), @@ -3626,6 +3790,7 @@ impl Clone for VmConfig { impl Drop for VmConfig { fn drop(&mut self) { if let Some(mut fds) = self.preserved_fds.take() { + debug!("Closing preserved FDs from VM: fds={fds:?}"); for fd in fds.drain(..) { // SAFETY: FFI call with valid FDs unsafe { libc::close(fd) }; @@ -4412,7 +4577,12 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" #[test] fn test_console_parsing() -> Result<()> { let console_config = |mode, file, socket, iommu| ConsoleConfig { - common: CommonConsoleConfig { file, mode, socket }, + common: CommonConsoleConfig { + file, + mode, + socket, + url: None, + }, pci_common: PciDeviceCommonConfig { iommu, ..Default::default() @@ -4977,11 +5147,17 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" num_pci_segments: MAX_NUM_PCI_SEGMENTS, iommu_segments: None, iommu_address_width_bits: MAX_IOMMU_ADDRESS_WIDTH_BITS, - serial_number: None, - uuid: None, - oem_strings: None, + system_serial_number: None, + system_uuid: None, + oem_strings: Vec::new(), iommufd: false, vfio_p2p_dma: default_platformconfig_vfio_p2p_dma(), + system_manufacturer: None, + system_product_name: None, + system_version: None, + system_family: None, + system_sku_number: None, + chassis_asset_tag: None, #[cfg(feature = "tdx")] tdx: false, #[cfg(feature = "sev_snp")] @@ -5051,6 +5227,7 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" file: None, mode: ConsoleOutputMode::Null, socket: None, + url: None, }, }, console: ConsoleConfig { @@ -5058,6 +5235,7 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" file: None, mode: ConsoleOutputMode::Tty, socket: None, + url: None, }, pci_common: PciDeviceCommonConfig::default(), }, diff --git 
a/vmm/src/console_devices.rs b/vmm/src/console_devices.rs index a1f3493fd2..672d4277b3 100644 --- a/vmm/src/console_devices.rs +++ b/vmm/src/console_devices.rs @@ -12,6 +12,7 @@ use std::fs::{File, OpenOptions, read_link}; use std::mem::zeroed; +use std::net::TcpListener; use std::os::fd::{AsRawFd, FromRawFd, RawFd}; use std::os::unix::fs::OpenOptionsExt; use std::os::unix::net::UnixListener; @@ -40,6 +41,10 @@ pub enum ConsoleDeviceError { #[error("No socket option support for console device")] NoSocketOptionSupportForConsoleDevice, + /// Error parsing the TCP address + #[error("Wrong TCP address format: {0}")] + WrongTcpAddressFormat(std::string::String), + /// Error setting pty raw mode #[error("Error setting pty raw mode")] SetPtyRaw(#[source] vmm_sys_util::errno::Error), @@ -62,6 +67,7 @@ pub enum ConsoleTransport { Tty(Arc), Null, Socket(Arc), + Tcp(Arc, Option>), Off, } @@ -227,6 +233,7 @@ pub(crate) fn pre_create_console_devices(vmm: &mut Vmm) -> ConsoleDeviceResult { return Err(ConsoleDeviceError::NoSocketOptionSupportForConsoleDevice); } + ConsoleOutputMode::Tcp => ConsoleTransport::Null, ConsoleOutputMode::Null => ConsoleTransport::Null, ConsoleOutputMode::Off => ConsoleTransport::Off, }, @@ -264,6 +271,21 @@ pub(crate) fn pre_create_console_devices(vmm: &mut Vmm) -> ConsoleDeviceResult { + let url = vmconfig.serial.common.url.as_ref().unwrap(); + let socket_addr: std::net::SocketAddr = url + .parse() + .map_err(|_| ConsoleDeviceError::WrongTcpAddressFormat(url.to_string()))?; + let listener = TcpListener::bind(socket_addr) + .map_err(ConsoleDeviceError::CreateConsoleDevice)?; + + let mut f = None; + if let Some(p) = &vmconfig.serial.common.file { + let file = File::create(p).map_err(ConsoleDeviceError::CreateConsoleDevice)?; + f = Some(Arc::new(file)); + } + ConsoleTransport::Tcp(Arc::new(listener), f) + } ConsoleOutputMode::Null => ConsoleTransport::Null, ConsoleOutputMode::Off => ConsoleTransport::Off, }, @@ -290,6 +312,7 @@ pub(crate) fn 
pre_create_console_devices(vmm: &mut Vmm) -> ConsoleDeviceResult { return Err(ConsoleDeviceError::NoSocketOptionSupportForConsoleDevice); } + ConsoleOutputMode::Tcp => ConsoleTransport::Null, ConsoleOutputMode::Null => ConsoleTransport::Null, ConsoleOutputMode::Off => ConsoleTransport::Off, }, diff --git a/vmm/src/cpu.rs b/vmm/src/cpu.rs index 1f31ff61ee..b23ae0a413 100644 --- a/vmm/src/cpu.rs +++ b/vmm/src/cpu.rs @@ -19,6 +19,7 @@ use std::mem::size_of; use std::os::unix::thread::JoinHandleExt; use std::sync::atomic::{AtomicBool, AtomicI32, Ordering}; use std::sync::{Arc, Barrier, Mutex}; +use std::time::{Duration, Instant}; use std::{cmp, io, result, thread}; use acpi_tables::sdt::Sdt; @@ -46,7 +47,7 @@ use hypervisor::arch::aarch64::gic::Vgic; use hypervisor::arch::aarch64::regs::{ID_AA64MMFR0_EL1, TCR_EL1, TTBR1_EL1}; #[cfg(target_arch = "x86_64")] use hypervisor::arch::x86::CpuIdEntry; -#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] +#[cfg(target_arch = "x86_64")] use hypervisor::arch::x86::MsrEntry; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use hypervisor::arch::x86::SpecialRegisters; @@ -80,6 +81,8 @@ use vm_migration::{ use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::signal::{SIGRTMIN, register_signal_handler}; use zerocopy::{FromBytes, Immutable, IntoBytes}; +#[cfg(feature = "kvm")] +use {kvm_bindings::kvm_run, std::cell::Cell, std::os::fd::RawFd, std::sync::RwLock}; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use crate::coredump::{ @@ -95,6 +98,16 @@ use crate::vm::physical_bits; use crate::vm_config::{CoreScheduling, CpusConfig}; use crate::{CPU_MANAGER_SNAPSHOT_ID, GuestMemoryMmap}; +#[cfg(feature = "kvm")] +thread_local! { + static KVM_RUN: Cell<*mut kvm_run> = const {Cell::new(core::ptr::null_mut())}; +} +#[cfg(feature = "kvm")] +/// Tell signal handler to not access certain stuff anymore during shutdown. +/// Otherwise => panics. +/// Better alternative would be to prevent signals there at all. 
+pub static IS_IN_SHUTDOWN: RwLock = RwLock::new(false); + #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] /// Extract the specified bits of a 64-bit integer. /// For example, to extrace 2 bits from offset 1 (zero based) of `6u64`, @@ -130,6 +143,12 @@ pub enum Error { #[error("Error generating common CPUID")] CommonCpuId(#[source] arch::Error), + #[error("Error computing required MSR updates")] + RequiredMsrUpdates(#[source] arch::Error), + + #[error("Error applying MSR filter")] + MsrFilter(#[source] arch::Error), + #[error("Error configuring vCPU")] VcpuConfiguration(#[source] arch::Error), @@ -512,15 +531,17 @@ impl Vcpu { /// * `vm` - The virtual machine this vcpu will get attached to. /// * `vm_ops` - Optional object for exit handling. /// * `cpu_vendor` - CPU vendor as reported by __cpuid(0x0) + /// * `msr_buffer`(x86_64 only) - A buffer for supported MSRs. pub fn new( id: u32, apic_id: u32, vm: &dyn hypervisor::Vm, vm_ops: Option>, #[cfg(target_arch = "x86_64")] cpu_vendor: CpuVendor, + #[cfg(target_arch = "x86_64")] msr_buffer: Vec, ) -> Result { let vcpu = vm - .create_vcpu(apic_id, vm_ops) + .create_vcpu(apic_id, vm_ops, msr_buffer) .map_err(|e| Error::VcpuCreate(e.into()))?; // Initially the cpuid per vCPU is the one supported by this VM. Ok(Vcpu { @@ -541,11 +562,13 @@ impl Vcpu { /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used. /// * `guest_memory` - Guest memory. /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure. 
+ #[cfg_attr(feature = "igvm", expect(clippy::too_many_arguments))] pub fn configure( &mut self, #[cfg(target_arch = "aarch64")] vm: &dyn hypervisor::Vm, boot_setup: Option<(EntryPoint, &GuestMemoryAtomic)>, #[cfg(target_arch = "x86_64")] cpuid: Vec, + #[cfg(target_arch = "x86_64")] feature_msr_updates: &[MsrEntry], #[cfg(target_arch = "x86_64")] kvm_hyperv: bool, #[cfg(target_arch = "x86_64")] topology: (u16, u16, u16, u16), #[cfg(target_arch = "x86_64")] nested: bool, @@ -580,6 +603,7 @@ impl Vcpu { self.id, boot_setup, cpuid, + feature_msr_updates, kvm_hyperv, self.vendor, topology, @@ -670,6 +694,13 @@ impl Vcpu { .map_err(Error::VcpuSetGicrBaseAddr)?; Ok(()) } + + #[cfg(feature = "kvm")] + pub fn get_kvm_vcpu_raw_fd(&self) -> RawFd { + // SAFETY: We happen to know that all current uses respect the safety contract. + // TODO find a better way to keep this safe and/or express its fragile state. + unsafe { self.vcpu.get_kvm_vcpu_raw_fd() } + } } impl Pausable for Vcpu {} @@ -698,6 +729,11 @@ pub struct CpuManager { interrupt_controller: Option>>, #[cfg(target_arch = "x86_64")] cpuid: Vec, + #[cfg(target_arch = "x86_64")] + /// A buffer for MSRs supported by the hardware and hypervisor + msr_buffer: Vec, + #[cfg(target_arch = "x86_64")] + profile_msr_based_features: Vec, #[cfg_attr(target_arch = "aarch64", allow(dead_code))] vm: Arc, vcpus_kill_signalled: Arc, @@ -798,30 +834,52 @@ impl VcpuState { } } - /// Blocks until the vCPU thread has acknowledged the signal. It retries to send - /// the signal every 10ms. Times out after 1000ms. + /// Blocks until the vCPU thread has acknowledged the signal. + /// + /// The signal is resent every ms until the vCPU thread acknowledges it. + /// A warning is emitted every 100ms while the acknowledgment is pending. + /// + /// The wait is bounded by a total timeout of 10 seconds. If the vCPU thread + /// does not acknowledge the signal within this time window, + /// [`Error::SignalAcknowledgeTimeout`] is returned. 
/// /// This is the counterpart of [`Self::signal_thread`]. fn wait_until_signal_acknowledged(&self) -> Result<()> { - if let Some(_handle) = self.handle.as_ref() { - let mut count = 0; - loop { - if self.vcpu_run_interrupted.load(Ordering::SeqCst) { - return Ok(()); - } - // This is more effective than thread::yield_now() at - // avoiding a priority inversion with the vCPU thread - thread::sleep(std::time::Duration::from_millis(1)); - count += 1; - if count >= 1000 { - return Err(Error::SignalAcknowledgeTimeout); - } else if count % 10 == 0 { - warn!("vCPU thread did not respond in {count}ms to signal - retrying"); - self.signal_thread(); - } + if self.handle.is_none() { + return Ok(()); + } + + let start = Instant::now(); + let timeout = Duration::from_secs(10); + let retry_interval = Duration::from_millis(1); + let warn_interval = Duration::from_millis(100); + + let mut next_warn = warn_interval; + loop { + if self.vcpu_run_interrupted.load(Ordering::SeqCst) { + return Ok(()); + } + + // Re-signal: it is cheap and idempotent. 
+ self.signal_thread(); + + let elapsed = start.elapsed(); + if elapsed >= timeout { + return Err(Error::SignalAcknowledgeTimeout); + } + + // Emit warning every 100ms + if elapsed >= next_warn { + warn!( + "vCPU thread did not respond in {}ms to signal - retrying (timeout: {}s)", + elapsed.as_millis(), + timeout.as_secs(), + ); + next_warn += warn_interval; } + + thread::sleep(retry_interval); } - Ok(()) } fn join_thread(&mut self) -> Result<()> { @@ -909,6 +967,10 @@ impl CpuManager { interrupt_controller: None, #[cfg(target_arch = "x86_64")] cpuid: Vec::new(), + #[cfg(target_arch = "x86_64")] + msr_buffer: Self::construct_msr_buffer(hypervisor.as_ref())?, + #[cfg(target_arch = "x86_64")] + profile_msr_based_features: Vec::new(), vm, vcpus_kill_signalled: Arc::new(AtomicBool::new(false)), vcpus_pause_signalled: Arc::new(AtomicBool::new(false)), @@ -936,6 +998,20 @@ impl CpuManager { }))) } + #[cfg(target_arch = "x86_64")] + fn construct_msr_buffer(hypervisor: &dyn hypervisor::Hypervisor) -> Result> { + let msr_indices = hypervisor + .get_msr_index_list() + .map_err(|e| Error::VcpuCreate(e.into()))?; + Ok(msr_indices + .into_iter() + .map(|index| MsrEntry { + index, + ..Default::default() + }) + .collect()) + } + #[cfg(target_arch = "x86_64")] pub fn populate_cpuid( &mut self, @@ -952,6 +1028,7 @@ impl CpuManager { #[cfg(feature = "tdx")] tdx, amx: self.config.features.amx, + profile: self.config.profile, }, ) .map_err(Error::CommonCpuId)? @@ -960,6 +1037,63 @@ impl CpuManager { Ok(()) } + #[cfg(target_arch = "x86_64")] + /// Prepares common MSR-based feature value updates that will be set when vCPUs are configured. + /// + /// This is only relevant when (non-host) CPU profiles are present, otherwise it is infallible + /// and we set an empty vector. 
+ pub fn apply_msr_updates(&mut self) -> Result<()> { + let profile_msr_based_features = { + if let Some(arch::x86_64::cpu_profile::RequiredMsrUpdates { + msr_based_features, + denied_msrs, + }) = arch::x86_64::compute_required_msr_updates( + self.hypervisor.as_ref(), + self.config.profile, + self.config.kvm_hyperv, + ) + .map_err(Error::RequiredMsrUpdates)? + { + // Remove denied MSRS from the MSR buffer + self.msr_buffer.retain(|entry| { + !denied_msrs + .contains(&arch::x86_64::msr_definitions::RegisterAddress(entry.index)) + }); + + // Assert that all required msr_based_features to be set are part of the MSR buffer. + // It is a bug if this is not the case + for msr in &msr_based_features { + if !self + .msr_buffer + .iter() + .any(|msr_container| msr_container.index == msr.index) + { + error!( + "BUG: feature based MSR:={:#x} is not contained in the set MSR buffer for the CPU manager", + msr.index + ); + panic!( + "Broken invariant: The CPU Manager's MSR buffer does not have an entry for the computed MSR-based feature update" + ); + } + } + + // Create and apply a filter to prevent guests from accessing the denied MSRs + // TODO: Better error! 
+ arch::x86_64::filter_denied_msrs( + denied_msrs.into_iter().map(|reg| reg.0).collect(), + self.vm.as_ref(), + ) + .map_err(Error::MsrFilter)?; + msr_based_features + } else { + Vec::new() + } + }; + self.profile_msr_based_features = profile_msr_based_features; + Ok(()) + } + fn create_vcpu( &mut self, cpu_id: u32, @@ -981,6 +1115,8 @@ impl CpuManager { Some(self.vm_ops.clone()), #[cfg(target_arch = "x86_64")] self.hypervisor.get_cpu_vendor(), + #[cfg(target_arch = "x86_64")] + self.msr_buffer.clone(), )?; if let Some(snapshot) = snapshot { @@ -1052,6 +1188,7 @@ impl CpuManager { vcpu.configure( boot_setup, self.cpuid.clone(), + &self.profile_msr_based_features, self.config.kvm_hyperv, topology, self.config.nested, @@ -1185,6 +1322,28 @@ impl CpuManager { thread::Builder::new() .name(format!("vcpu{vcpu_id}")) .spawn(move || { + // init thread-local kvm_run structure + #[cfg(feature = "kvm")] + { + let raw_kvm_fd = vcpu.lock().unwrap().get_kvm_vcpu_raw_fd(); + + // SAFETY: We know the FD is valid and have the proper args. + let buffer = unsafe { + libc::mmap( + core::ptr::null_mut(), + 4096, + libc::PROT_WRITE | libc::PROT_READ, + libc::MAP_SHARED, + raw_kvm_fd, + 0, + ) + }; + assert!(!buffer.is_null()); + assert_ne!(buffer, libc::MAP_FAILED); + let kvm_run = buffer.cast::(); + KVM_RUN.set(kvm_run); + } + // Schedule the thread to run on the expected CPU set if let Some(cpuset) = cpuset.as_ref() { let cpuset: *const libc::cpu_set_t = cpuset; @@ -1276,7 +1435,35 @@ impl CpuManager { return; } + #[cfg(not(feature = "kvm"))] extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {} + #[cfg(feature = "kvm")] + extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) { + // We do not need a self-pipe for safe UNIX signal handling here as in this + // signal handler, we only expect the same signal over and over again. While + // different signals can interrupt a signal being handled, the same signal + // again can't by default. 
Therefore, this is safe. + + // This lock prevents accessing thread locals when a signal is received + // in the teardown phase of the Rust standard library. Otherwise, we would + // panic. + // + // Masking signals would be a nicer approach but this is the pragmatic + // solution. + // + // We don't have lock contention in normal operation. When the writer + // sets the bool to true, the lock is only held for a couple of µs. + let lock = IS_IN_SHUTDOWN.read().unwrap(); + if *lock { + return; + } + + let kvm_run = KVM_RUN.get(); + // SAFETY: the mapping is valid + let kvm_run = unsafe { + kvm_run.as_mut().expect("kvm_run should have been mapped as part of vCPU setup") }; + kvm_run.immediate_exit = 1; + } // This uses an async signal safe handler to kill the vcpu handles. register_signal_handler(SIGRTMIN(), handle_signal) .expect("Failed to register vcpu signal handler"); @@ -1315,12 +1502,14 @@ impl CpuManager { #[cfg(feature = "kvm")] if matches!(hypervisor_type, HypervisorType::Kvm) { - vcpu.lock().unwrap().vcpu.set_immediate_exit(true); - if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) { + let lock = vcpu.lock(); + let mut lock = lock.unwrap(); + lock.vcpu.set_immediate_exit(true); + if !matches!(lock.run(), Ok(VmExit::Ignore)) { error!("Unexpected VM exit on \"immediate_exit\" run"); break; } - vcpu.lock().unwrap().vcpu.set_immediate_exit(false); + lock.vcpu.set_immediate_exit(false); } vcpu_run_interrupted.store(true, Ordering::SeqCst); @@ -2216,6 +2405,11 @@ impl CpuManager { &self.vcpus_kill_signalled } + pub(crate) fn vcpus_pause_signalled(&self) -> &Arc { + &self.vcpus_pause_signalled + } + + #[cfg(feature = "igvm")] #[cfg(all(feature = "igvm", feature = "mshv"))] pub(crate) fn get_cpuid_leaf( &self, @@ -3297,7 +3491,7 @@ mod unit_tests { hv.check_required_extensions().unwrap(); // Calling get_lapic will fail if there is no irqchip before hand. 
vm.create_irq_chip().unwrap(); - let vcpu = vm.create_vcpu(0, None).unwrap(); + let vcpu = vm.create_vcpu(0, None, vec![]).unwrap(); let klapic_before: LapicState = vcpu.get_lapic().unwrap(); // Compute the value that is expected to represent LVT0 and LVT1. @@ -3322,7 +3516,7 @@ mod unit_tests { let vm = hv .create_vm(HypervisorVmConfig::default()) .expect("new VM fd creation failed"); - let vcpu = vm.create_vcpu(0, None).unwrap(); + let vcpu = vm.create_vcpu(0, None, vec![]).unwrap(); setup_fpu(vcpu.as_ref()).unwrap(); let expected_fpu: FpuState = FpuState { @@ -3348,8 +3542,9 @@ mod unit_tests { let vm = hv .create_vm(HypervisorVmConfig::default()) .expect("new VM fd creation failed"); - let vcpu = vm.create_vcpu(0, None).unwrap(); - setup_msrs(vcpu.as_ref()).unwrap(); + // TODO: Use a proper MSR buffer here + let vcpu = vm.create_vcpu(0, None, vec![]).unwrap(); + setup_msrs(vcpu.as_ref(), &[]).unwrap(); // This test will check against the last MSR entry configured (the tenth one). // See create_msr_entries for details. 
@@ -3376,7 +3571,7 @@ mod unit_tests { let vm = hv .create_vm(HypervisorVmConfig::default()) .expect("new VM fd creation failed"); - let vcpu = vm.create_vcpu(0, None).unwrap(); + let vcpu = vm.create_vcpu(0, None, vec![]).unwrap(); let mut expected_regs: StandardRegisters = vcpu.create_standard_regs(); expected_regs.set_rflags(0x0000000000000002u64); @@ -3402,7 +3597,7 @@ mod unit_tests { let vm = hv .create_vm(HypervisorVmConfig::default()) .expect("new VM fd creation failed"); - let vcpu = vm.create_vcpu(0, None).unwrap(); + let vcpu = vm.create_vcpu(0, None, vec![]).unwrap(); let mut expected_regs: StandardRegisters = vcpu.create_standard_regs(); expected_regs.set_rflags(0x0000000000000002u64); diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index e6902a4cbf..80598d1d02 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -18,10 +18,11 @@ use std::os::unix::io::{AsRawFd, FromRawFd}; #[cfg(not(target_arch = "riscv64"))] use std::path::Path; use std::path::PathBuf; -use std::result; use std::sync::{Arc, Mutex}; +use std::time::Duration; #[cfg(not(target_arch = "riscv64"))] use std::time::Instant; +use std::{result, thread}; use acpi_tables::sdt::GenericAddress; use acpi_tables::{Aml, aml}; @@ -88,8 +89,8 @@ use vfio_ioctls::{VfioContainer, VfioDevice, VfioDeviceFd, VfioOps}; use virtio_devices::transport::{VirtioPciDevice, VirtioPciDeviceActivator, VirtioTransport}; use virtio_devices::vhost_user::VhostUserConfig; use virtio_devices::{ - AccessPlatformMapping, ActivateError, Block, Endpoint, IommuMapping, VdpaDmaMapping, - VirtioMemMappingSource, + AccessPlatformMapping, ActivateError, Block, Endpoint, IommuMapping, PostMigrationAnnouncer, + VdpaDmaMapping, VirtioMemMappingSource, }; use vm_allocator::{AddressAllocator, InterruptAllocError, SystemAllocator}; use vm_device::dma_mapping::ExternalDmaMapping; @@ -1138,8 +1139,6 @@ pub struct DeviceManager { // Addresses for ACPI platform devices e.g. 
ACPI PM timer, sleep/reset registers acpi_platform_addresses: AcpiPlatformAddresses, - snapshot: Option, - rate_limit_groups: HashMap>, mmio_regions: Arc>>, @@ -1433,7 +1432,6 @@ impl DeviceManager { timestamp, pending_activations: Arc::new(Mutex::new(Vec::default())), acpi_platform_addresses: AcpiPlatformAddresses::default(), - snapshot: snapshot.cloned(), rate_limit_groups, mmio_regions: Arc::new(Mutex::new(Vec::new())), #[cfg(feature = "fw_cfg")] @@ -1463,8 +1461,9 @@ impl DeviceManager { pub fn create_interrupt_controller( &mut self, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult>> { - self.add_interrupt_controller() + self.add_interrupt_controller(snapshot) } #[allow(clippy::needless_pass_by_value)] @@ -1474,6 +1473,7 @@ impl DeviceManager { console_resize_pipe: Option>, original_termios_opt: Arc>>, interrupt_controller: Arc>, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult<()> { trace_scoped!("create_devices"); @@ -1511,7 +1511,7 @@ impl DeviceManager { )?; #[cfg(target_arch = "aarch64")] - self.add_legacy_devices(legacy_interrupt_manager.as_ref())?; + self.add_legacy_devices(legacy_interrupt_manager.as_ref(), snapshot)?; { self.ged_notification_device = self.add_acpi_devices( @@ -1531,6 +1531,7 @@ impl DeviceManager { legacy_interrupt_manager.as_ref(), console_info, console_resize_pipe, + snapshot, )?; #[cfg(not(target_arch = "riscv64"))] @@ -1541,8 +1542,8 @@ impl DeviceManager { } self.legacy_interrupt_manager = Some(legacy_interrupt_manager); - self.make_virtio_devices()?; - self.add_pci_devices()?; + self.make_virtio_devices(snapshot)?; + self.add_pci_devices(snapshot)?; // Add pvmemcontrol if required #[cfg(feature = "pvmemcontrol")] @@ -1556,12 +1557,12 @@ impl DeviceManager { } if self.config.clone().lock().unwrap().pvpanic { - self.pvpanic_device = self.add_pvpanic_device()?; + self.pvpanic_device = self.add_pvpanic_device(snapshot)?; } #[cfg(feature = "ivshmem")] if let Some(ivshmem) = 
self.config.clone().lock().unwrap().ivshmem.as_ref() { - self.ivshmem_device = self.add_ivshmem_device(ivshmem)?; + self.ivshmem_device = self.add_ivshmem_device(ivshmem, snapshot)?; } Ok(()) @@ -1651,7 +1652,7 @@ impl DeviceManager { } #[allow(unused_variables)] - fn add_pci_devices(&mut self) -> DeviceManagerResult<()> { + fn add_pci_devices(&mut self, snapshot: Option<&Snapshot>) -> DeviceManagerResult<()> { let iommu_id = String::from(IOMMU_DEVICE_NAME); let iommu_address_width_bits = @@ -1670,7 +1671,7 @@ impl DeviceManager { .map_err(DeviceManagerError::EventFd)?, self.get_msi_iova_space(), iommu_address_width_bits, - state_from_id(self.snapshot.as_ref(), iommu_id.as_str()) + state_from_id(snapshot, iommu_id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, ) .map_err(DeviceManagerError::CreateVirtioIommu)?; @@ -1713,6 +1714,7 @@ impl DeviceManager { false, handle.dma_handler, handle.pci_common.pci_device_id, + snapshot, )?; // Track device BDF for Generic Initiator support @@ -1723,10 +1725,10 @@ impl DeviceManager { } } - let mut vfio_iommu_device_ids = self.add_vfio_devices()?; + let mut vfio_iommu_device_ids = self.add_vfio_devices(snapshot)?; iommu_attached_devices.append(&mut vfio_iommu_device_ids); - let mut vfio_user_iommu_device_ids = self.add_user_devices()?; + let mut vfio_user_iommu_device_ids = self.add_user_devices(snapshot)?; iommu_attached_devices.append(&mut vfio_user_iommu_device_ids); // Add all devices from forced iommu segments @@ -1752,6 +1754,7 @@ impl DeviceManager { false, None, None, + snapshot, )?; self.iommu_attached_devices = Some((dev_id, iommu_attached_devices)); } @@ -1774,6 +1777,7 @@ impl DeviceManager { #[cfg(target_arch = "aarch64")] fn add_interrupt_controller( &mut self, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult>> { let interrupt_controller: Arc> = Arc::new(Mutex::new( gic::Gic::new( @@ -1788,7 +1792,7 @@ impl DeviceManager { // Restore the vGic if this is in the process of restoration let id = 
String::from(gic::GIC_SNAPSHOT_ID); - if let Some(vgic_snapshot) = snapshot_from_id(self.snapshot.as_ref(), &id) { + if let Some(vgic_snapshot) = snapshot_from_id(snapshot, &id) { // PMU support is optional. Nothing should be impacted if the PMU initialization failed. if self .cpu_manager @@ -1827,6 +1831,7 @@ impl DeviceManager { #[cfg(target_arch = "riscv64")] fn add_interrupt_controller( &mut self, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult>> { let interrupt_controller: Arc> = Arc::new(Mutex::new( aia::Aia::new( @@ -1841,7 +1846,7 @@ impl DeviceManager { // Restore the vAia if this is in the process of restoration let id = String::from(aia::_AIA_SNAPSHOT_ID); - if let Some(_vaia_snapshot) = snapshot_from_id(self.snapshot.as_ref(), &id) { + if let Some(_vaia_snapshot) = snapshot_from_id(snapshot, &id) { // TODO: vAia snapshotting and restoration is scheduled to next stage of riscv64 support. // TODO: PMU support is scheduled to next stage of riscv64 support. // PMU support is optional. Nothing should be impacted if the PMU initialization failed. 
@@ -1864,11 +1869,12 @@ impl DeviceManager { #[cfg(target_arch = "x86_64")] fn add_interrupt_controller( &mut self, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult>> { let id = String::from(IOAPIC_DEVICE_NAME); - let state = state_from_id(self.snapshot.as_ref(), id.as_str()) - .map_err(DeviceManagerError::RestoreGetState)?; + let state = + state_from_id(snapshot, id.as_str()).map_err(DeviceManagerError::RestoreGetState)?; // Create IOAPIC let interrupt_controller = Arc::new(Mutex::new( ioapic::Ioapic::new( @@ -1913,10 +1919,17 @@ impl DeviceManager { .unwrap() .vcpus_kill_signalled() .clone(); + let vcpus_pause_signalled = self + .cpu_manager + .lock() + .unwrap() + .vcpus_pause_signalled() + .clone(); let shutdown_device = Arc::new(Mutex::new(devices::AcpiShutdownDevice::new( guest_exit_evt, reset_evt, vcpus_kill_signalled, + vcpus_pause_signalled, ))); self.bus_devices @@ -2021,10 +2034,17 @@ impl DeviceManager { .unwrap() .vcpus_kill_signalled() .clone(); + let vcpus_pause_signalled = self + .cpu_manager + .lock() + .unwrap() + .vcpus_pause_signalled() + .clone(); // Add a shutdown device (i8042) let i8042 = Arc::new(Mutex::new(devices::legacy::I8042Device::new( reset_evt.try_clone().unwrap(), vcpus_kill_signalled.clone(), + vcpus_pause_signalled.clone(), ))); self.bus_devices @@ -2052,7 +2072,8 @@ impl DeviceManager { mem_below_4g, mem_above_4g, reset_evt, - Some(vcpus_kill_signalled), + vcpus_kill_signalled, + vcpus_pause_signalled.clone(), ))); self.bus_devices @@ -2090,6 +2111,7 @@ impl DeviceManager { fn add_legacy_devices( &mut self, interrupt_manager: &dyn InterruptManager, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult<()> { // Add a RTC device let rtc_irq = self @@ -2140,8 +2162,7 @@ impl DeviceManager { let gpio_device = Arc::new(Mutex::new(devices::legacy::Gpio::new( id.clone(), interrupt_group, - state_from_id(self.snapshot.as_ref(), id.as_str()) - .map_err(DeviceManagerError::RestoreGetState)?, + state_from_id(snapshot, 
id.as_str()).map_err(DeviceManagerError::RestoreGetState)?, ))); self.bus_devices @@ -2224,6 +2245,7 @@ impl DeviceManager { &mut self, interrupt_manager: &dyn InterruptManager, serial_writer: Option>, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult>> { // Serial is tied to IRQ #4 let serial_irq = 4; @@ -2240,8 +2262,7 @@ impl DeviceManager { id.clone(), interrupt_group, serial_writer, - state_from_id(self.snapshot.as_ref(), id.as_str()) - .map_err(DeviceManagerError::RestoreGetState)?, + state_from_id(snapshot, id.as_str()).map_err(DeviceManagerError::RestoreGetState)?, ))); self.bus_devices @@ -2275,6 +2296,7 @@ impl DeviceManager { &mut self, interrupt_manager: &dyn InterruptManager, serial_writer: Option>, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult>> { let id = String::from(SERIAL_DEVICE_NAME); @@ -2297,8 +2319,7 @@ impl DeviceManager { interrupt_group, serial_writer, self.timestamp, - state_from_id(self.snapshot.as_ref(), id.as_str()) - .map_err(DeviceManagerError::RestoreGetState)?, + state_from_id(snapshot, id.as_str()).map_err(DeviceManagerError::RestoreGetState)?, ))); self.bus_devices @@ -2339,6 +2360,7 @@ impl DeviceManager { &mut self, interrupt_manager: &dyn InterruptManager, serial_writer: Option>, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult>> { let id = String::from(SERIAL_DEVICE_NAME); @@ -2360,8 +2382,7 @@ impl DeviceManager { id.clone(), interrupt_group, serial_writer, - state_from_id(self.snapshot.as_ref(), id.as_str()) - .map_err(DeviceManagerError::RestoreGetState)?, + state_from_id(snapshot, id.as_str()).map_err(DeviceManagerError::RestoreGetState)?, ))); self.bus_devices @@ -2401,6 +2422,7 @@ impl DeviceManager { &mut self, transport: ConsoleTransport, resize_pipe: Option>, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult>> { let mut console_config = self.config.lock().unwrap().console.clone(); let endpoint = match transport { @@ -2433,6 +2455,9 @@ impl DeviceManager { ConsoleTransport::Socket(_) => { 
return Err(DeviceManagerError::NoSocketOptionSupportForConsoleDevice); } + ConsoleTransport::Tcp(_, _) => { + return Err(DeviceManagerError::NoSocketOptionSupportForConsoleDevice); + } ConsoleTransport::Null => Endpoint::Null, ConsoleTransport::Off => return Ok(None), }; @@ -2457,8 +2482,7 @@ impl DeviceManager { self.exit_evt .try_clone() .map_err(DeviceManagerError::EventFd)?, - state_from_id(self.snapshot.as_ref(), id.as_str()) - .map_err(DeviceManagerError::RestoreGetState)?, + state_from_id(snapshot, id.as_str()).map_err(DeviceManagerError::RestoreGetState)?, ) .map_err(DeviceManagerError::CreateVirtioConsole)?; let virtio_console_device = Arc::new(Mutex::new(virtio_console_device)); @@ -2497,6 +2521,7 @@ impl DeviceManager { interrupt_manager: &dyn InterruptManager, console_info: Option, console_resize_pipe: Option>, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult> { let serial_config = self.config.lock().unwrap().serial.clone(); if console_info.is_none() { @@ -2514,14 +2539,16 @@ impl DeviceManager { | ConsoleTransport::Null | ConsoleTransport::Pty(_) | ConsoleTransport::Socket(_) => None, + ConsoleTransport::Tcp(_, _) => None, }; if !matches!(console_info.serial, ConsoleTransport::Off) { - let serial = self.add_serial_device(interrupt_manager, serial_writer)?; + let serial = self.add_serial_device(interrupt_manager, serial_writer, snapshot)?; self.serial_manager = match console_info.serial { ConsoleTransport::Pty(_) | ConsoleTransport::Tty(_) - | ConsoleTransport::Socket(_) => { + | ConsoleTransport::Socket(_) + | ConsoleTransport::Tcp(_, _) => { let serial_manager = SerialManager::new( serial, console_info.serial, @@ -2552,7 +2579,8 @@ impl DeviceManager { ConsoleTransport::Off | ConsoleTransport::Null | ConsoleTransport::Pty(_) - | ConsoleTransport::Socket(_) => None, + | ConsoleTransport::Socket(_) + | ConsoleTransport::Tcp(_, _) => None, }; if let Some(writer) = debug_console_writer { let _ = self.add_debug_console_device(writer)?; @@ -2560,7 
+2588,7 @@ impl DeviceManager { } let console_resizer = - self.add_virtio_console_device(console_info.console, console_resize_pipe)?; + self.add_virtio_console_device(console_info.console, console_resize_pipe, snapshot)?; Ok(Arc::new(Console { console_resizer })) } @@ -2618,34 +2646,34 @@ impl DeviceManager { Ok(()) } - fn make_virtio_devices(&mut self) -> DeviceManagerResult<()> { + fn make_virtio_devices(&mut self, snapshot: Option<&Snapshot>) -> DeviceManagerResult<()> { // Create "standard" virtio devices (net/block/rng) - self.make_virtio_block_devices()?; - self.make_virtio_net_devices()?; - self.make_virtio_rng_devices()?; + self.make_virtio_block_devices(snapshot)?; + self.make_virtio_net_devices(snapshot)?; + self.make_virtio_rng_devices(snapshot)?; // Add generic vhost-user if required - self.make_generic_vhost_user_devices()?; + self.make_generic_vhost_user_devices(snapshot)?; // Add virtio-fs if required - self.make_virtio_fs_devices()?; + self.make_virtio_fs_devices(snapshot)?; // Add virtio-pmem if required - self.make_virtio_pmem_devices()?; + self.make_virtio_pmem_devices(snapshot)?; // Add virtio-vsock if required - self.make_virtio_vsock_devices()?; + self.make_virtio_vsock_devices(snapshot)?; - self.make_virtio_mem_devices()?; + self.make_virtio_mem_devices(snapshot)?; // Add virtio-balloon if required - self.make_virtio_balloon_devices()?; + self.make_virtio_balloon_devices(snapshot)?; // Add virtio-watchdog device - self.make_virtio_watchdog_devices()?; + self.make_virtio_watchdog_devices(snapshot)?; // Add vDPA devices if required - self.make_vdpa_devices()?; + self.make_vdpa_devices(snapshot)?; Ok(()) } @@ -2663,6 +2691,7 @@ impl DeviceManager { &mut self, disk_cfg: &mut DiskConfig, is_hotplug: bool, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult { let id = match disk_cfg.pci_common.id.as_ref() { Some(id) => id.clone(), @@ -2694,7 +2723,7 @@ impl DeviceManager { .try_clone() .map_err(DeviceManagerError::EventFd)?, 
self.force_access_platform, - state_from_id(self.snapshot.as_ref(), id.as_str()) + state_from_id(snapshot, id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, ) { Ok(vub_device) => vub_device, @@ -2829,7 +2858,7 @@ impl DeviceManager { self.exit_evt .try_clone() .map_err(DeviceManagerError::EventFd)?, - state_from_id(self.snapshot.as_ref(), id.as_str()) + state_from_id(snapshot, id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, queue_affinity, disk_cfg.sparse, @@ -2873,11 +2902,14 @@ impl DeviceManager { }) } - fn make_virtio_block_devices(&mut self) -> DeviceManagerResult<()> { + fn make_virtio_block_devices( + &mut self, + snapshot: Option<&Snapshot>, + ) -> DeviceManagerResult<()> { let mut block_devices = self.config.lock().unwrap().disks.take(); if let Some(disk_list_cfg) = &mut block_devices { for disk_cfg in disk_list_cfg.iter_mut() { - let device = self.make_virtio_block_device(disk_cfg, false)?; + let device = self.make_virtio_block_device(disk_cfg, false, snapshot)?; self.virtio_devices.push(device); } } @@ -2889,6 +2921,7 @@ impl DeviceManager { fn make_virtio_net_device( &mut self, net_cfg: &mut NetConfig, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult { let id = match net_cfg.pci_common.id.as_ref() { Some(id) => id.clone(), @@ -2902,6 +2935,7 @@ impl DeviceManager { let (virtio_device, migratable_device) = if net_cfg.vhost_user { let socket = net_cfg.vhost_socket.as_ref().unwrap().clone(); + debug!("Creating virtio-net device with vhost-user backend: {socket}"); let vu_cfg = VhostUserConfig { socket, num_queues: net_cfg.num_queues, @@ -2923,7 +2957,7 @@ impl DeviceManager { .try_clone() .map_err(DeviceManagerError::EventFd)?, self.force_access_platform, - state_from_id(self.snapshot.as_ref(), id.as_str()) + state_from_id(snapshot, id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, net_cfg.offload_tso, net_cfg.offload_ufo, @@ -2941,9 +2975,10 @@ impl DeviceManager { vhost_user_net as Arc>, ) } else { - let state = 
state_from_id(self.snapshot.as_ref(), id.as_str()) + let state = state_from_id(snapshot, id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?; let virtio_net = if let Some(ref tap_if_name) = net_cfg.tap { + debug!("Creating virtio-net device from Tap device: {tap_if_name}"); Arc::new(Mutex::new( virtio_devices::Net::new( id.clone(), @@ -2969,6 +3004,7 @@ impl DeviceManager { .map_err(DeviceManagerError::CreateVirtioNet)?, )) } else if let Some(fds) = &net_cfg.fds { + debug!("Creating virtio-net device from network FDs: {fds:?}"); let net = virtio_devices::Net::from_tap_fds( id.clone(), fds, @@ -2995,6 +3031,9 @@ impl DeviceManager { Arc::new(Mutex::new(net)) } else { + debug!( + "Creating virtio-net device: no ifname or FDs given, creating new Tap device" + ); Arc::new(Mutex::new( virtio_devices::Net::new( id.clone(), @@ -3043,11 +3082,11 @@ impl DeviceManager { } /// Add virto-net and vhost-user-net devices - fn make_virtio_net_devices(&mut self) -> DeviceManagerResult<()> { + fn make_virtio_net_devices(&mut self, snapshot: Option<&Snapshot>) -> DeviceManagerResult<()> { let mut net_devices = self.config.lock().unwrap().net.take(); if let Some(net_list_cfg) = &mut net_devices { for net_cfg in net_list_cfg.iter_mut() { - let device = self.make_virtio_net_device(net_cfg)?; + let device = self.make_virtio_net_device(net_cfg, snapshot)?; self.virtio_devices.push(device); } } @@ -3056,7 +3095,7 @@ impl DeviceManager { Ok(()) } - fn make_virtio_rng_devices(&mut self) -> DeviceManagerResult<()> { + fn make_virtio_rng_devices(&mut self, snapshot: Option<&Snapshot>) -> DeviceManagerResult<()> { // Add virtio-rng if required let mut rng_config = self.config.lock().unwrap().rng.clone(); if let Some(rng_path) = rng_config.src.to_str() { @@ -3080,7 +3119,7 @@ impl DeviceManager { self.exit_evt .try_clone() .map_err(DeviceManagerError::EventFd)?, - state_from_id(self.snapshot.as_ref(), id.as_str()) + state_from_id(snapshot, id.as_str()) 
.map_err(DeviceManagerError::RestoreGetState)?, ) .map_err(DeviceManagerError::CreateVirtioRng)?, @@ -3107,6 +3146,7 @@ impl DeviceManager { fn make_generic_vhost_user_device( &mut self, generic_vhost_user_cfg: &mut GenericVhostUserConfig, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult { let id = match generic_vhost_user_cfg.pci_common.id.as_ref() { Some(id) => id.clone(), @@ -3134,7 +3174,7 @@ impl DeviceManager { .try_clone() .map_err(DeviceManagerError::EventFd)?, self.force_access_platform, - state_from_id(self.snapshot.as_ref(), id.as_str()) + state_from_id(snapshot, id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, ) .map_err(DeviceManagerError::CreateGenericVhostUser)?, @@ -3156,11 +3196,15 @@ impl DeviceManager { } } - fn make_generic_vhost_user_devices(&mut self) -> DeviceManagerResult<()> { + fn make_generic_vhost_user_devices( + &mut self, + snapshot: Option<&Snapshot>, + ) -> DeviceManagerResult<()> { let mut generic_vhost_user_devices = self.config.lock().unwrap().generic_vhost_user.clone(); if let Some(generic_vhost_user_list_cfg) = &mut generic_vhost_user_devices { for generic_vhost_user_cfg in generic_vhost_user_list_cfg.iter_mut() { - let device = self.make_generic_vhost_user_device(generic_vhost_user_cfg)?; + let device = + self.make_generic_vhost_user_device(generic_vhost_user_cfg, snapshot)?; self.virtio_devices.push(device); } } @@ -3172,6 +3216,7 @@ impl DeviceManager { fn make_virtio_fs_device( &mut self, fs_cfg: &mut FsConfig, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult { let id = match fs_cfg.pci_common.id.as_ref() { Some(id) => id.clone(), @@ -3200,7 +3245,7 @@ impl DeviceManager { .try_clone() .map_err(DeviceManagerError::EventFd)?, self.force_access_platform, - state_from_id(self.snapshot.as_ref(), id.as_str()) + state_from_id(snapshot, id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, ) .map_err(DeviceManagerError::CreateVirtioFs)?, @@ -3221,11 +3266,11 @@ impl DeviceManager { } } - fn 
make_virtio_fs_devices(&mut self) -> DeviceManagerResult<()> { + fn make_virtio_fs_devices(&mut self, snapshot: Option<&Snapshot>) -> DeviceManagerResult<()> { let mut fs_devices = self.config.lock().unwrap().fs.take(); if let Some(fs_list_cfg) = &mut fs_devices { for fs_cfg in fs_list_cfg.iter_mut() { - let device = self.make_virtio_fs_device(fs_cfg)?; + let device = self.make_virtio_fs_device(fs_cfg, snapshot)?; self.virtio_devices.push(device); } } @@ -3237,6 +3282,7 @@ impl DeviceManager { fn make_virtio_pmem_device( &mut self, pmem_cfg: &mut PmemConfig, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult { let id = match pmem_cfg.pci_common.id.as_ref() { Some(id) => id.clone(), @@ -3383,7 +3429,7 @@ impl DeviceManager { self.exit_evt .try_clone() .map_err(DeviceManagerError::EventFd)?, - state_from_id(self.snapshot.as_ref(), id.as_str()) + state_from_id(snapshot, id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, ) .map_err(DeviceManagerError::CreateVirtioPmem)?, @@ -3406,12 +3452,12 @@ impl DeviceManager { }) } - fn make_virtio_pmem_devices(&mut self) -> DeviceManagerResult<()> { + fn make_virtio_pmem_devices(&mut self, snapshot: Option<&Snapshot>) -> DeviceManagerResult<()> { // Add virtio-pmem if required let mut pmem_devices = self.config.lock().unwrap().pmem.take(); if let Some(pmem_list_cfg) = &mut pmem_devices { for pmem_cfg in pmem_list_cfg.iter_mut() { - let device = self.make_virtio_pmem_device(pmem_cfg)?; + let device = self.make_virtio_pmem_device(pmem_cfg, snapshot)?; self.virtio_devices.push(device); } } @@ -3423,6 +3469,7 @@ impl DeviceManager { fn make_virtio_vsock_device( &mut self, vsock_cfg: &mut VsockConfig, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult { let id = match vsock_cfg.pci_common.id.as_ref() { Some(id) => id.clone(), @@ -3454,7 +3501,7 @@ impl DeviceManager { self.exit_evt .try_clone() .map_err(DeviceManagerError::EventFd)?, - state_from_id(self.snapshot.as_ref(), id.as_str()) + state_from_id(snapshot, 
id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, ) .map_err(DeviceManagerError::CreateVirtioVsock)?, @@ -3476,10 +3523,13 @@ impl DeviceManager { }) } - fn make_virtio_vsock_devices(&mut self) -> DeviceManagerResult<()> { + fn make_virtio_vsock_devices( + &mut self, + snapshot: Option<&Snapshot>, + ) -> DeviceManagerResult<()> { let mut vsock = self.config.lock().unwrap().vsock.take(); if let Some(vsock_cfg) = &mut vsock { - let device = self.make_virtio_vsock_device(vsock_cfg)?; + let device = self.make_virtio_vsock_device(vsock_cfg, snapshot)?; self.virtio_devices.push(device); } self.config.lock().unwrap().vsock = vsock; @@ -3487,7 +3537,7 @@ impl DeviceManager { Ok(()) } - fn make_virtio_mem_devices(&mut self) -> DeviceManagerResult<()> { + fn make_virtio_mem_devices(&mut self, snapshot: Option<&Snapshot>) -> DeviceManagerResult<()> { let mm = self.memory_manager.clone(); let mut mm = mm.lock().unwrap(); for (memory_zone_id, memory_zone) in mm.memory_zones_mut().iter_mut() { @@ -3509,7 +3559,7 @@ impl DeviceManager { .try_clone() .map_err(DeviceManagerError::EventFd)?, virtio_mem_zone.blocks_state().clone(), - state_from_id(self.snapshot.as_ref(), memory_zone_id.as_str()) + state_from_id(snapshot, memory_zone_id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, ) .map_err(DeviceManagerError::CreateVirtioMem)?, @@ -3587,7 +3637,10 @@ impl DeviceManager { Ok((pvmemcontrol_bus_device, pvmemcontrol_pci_device)) } - fn make_virtio_balloon_devices(&mut self) -> DeviceManagerResult<()> { + fn make_virtio_balloon_devices( + &mut self, + snapshot: Option<&Snapshot>, + ) -> DeviceManagerResult<()> { if let Some(balloon_config) = &self.config.lock().unwrap().balloon { let id = String::from(BALLOON_DEVICE_NAME); info!("Creating virtio-balloon device: id = {id}"); @@ -3603,7 +3656,7 @@ impl DeviceManager { self.exit_evt .try_clone() .map_err(DeviceManagerError::EventFd)?, - state_from_id(self.snapshot.as_ref(), id.as_str()) + state_from_id(snapshot, 
id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, ) .map_err(DeviceManagerError::CreateVirtioBalloon)?, @@ -3630,7 +3683,10 @@ impl DeviceManager { Ok(()) } - fn make_virtio_watchdog_devices(&mut self) -> DeviceManagerResult<()> { + fn make_virtio_watchdog_devices( + &mut self, + snapshot: Option<&Snapshot>, + ) -> DeviceManagerResult<()> { if !self.config.lock().unwrap().watchdog { return Ok(()); } @@ -3646,7 +3702,7 @@ impl DeviceManager { self.exit_evt .try_clone() .map_err(DeviceManagerError::EventFd)?, - state_from_id(self.snapshot.as_ref(), id.as_str()) + state_from_id(snapshot, id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, ) .map_err(DeviceManagerError::CreateVirtioWatchdog)?, @@ -3672,6 +3728,7 @@ impl DeviceManager { fn make_vdpa_device( &mut self, vdpa_cfg: &mut VdpaConfig, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult { let id = match vdpa_cfg.pci_common.id.as_ref() { Some(id) => id.clone(), @@ -3695,7 +3752,7 @@ impl DeviceManager { device_path, self.memory_manager.lock().unwrap().guest_memory(), vdpa_cfg.num_queues as u16, - state_from_id(self.snapshot.as_ref(), id.as_str()) + state_from_id(snapshot, id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, ) .map_err(DeviceManagerError::CreateVdpa)?, @@ -3719,12 +3776,12 @@ impl DeviceManager { }) } - fn make_vdpa_devices(&mut self) -> DeviceManagerResult<()> { + fn make_vdpa_devices(&mut self, snapshot: Option<&Snapshot>) -> DeviceManagerResult<()> { // Add vdpa if required let mut vdpa_devices = self.config.lock().unwrap().vdpa.take(); if let Some(vdpa_list_cfg) = &mut vdpa_devices { for vdpa_cfg in vdpa_list_cfg.iter_mut() { - let device = self.make_vdpa_device(vdpa_cfg)?; + let device = self.make_vdpa_device(vdpa_cfg, snapshot)?; self.virtio_devices.push(device); } } @@ -3759,6 +3816,7 @@ impl DeviceManager { fn add_passthrough_device( &mut self, device_cfg: &mut DeviceConfig, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult<(PciBdf, String)> { // If 
the passthrough device has not been created yet, it is created // here and stored in the DeviceManager structure for future needs. @@ -3771,7 +3829,7 @@ impl DeviceManager { ); } - self.add_vfio_device(device_cfg) + self.add_vfio_device(device_cfg, snapshot) } fn create_vfio_ops(&self) -> DeviceManagerResult> { @@ -3814,6 +3872,7 @@ impl DeviceManager { fn add_vfio_device( &mut self, device_cfg: &mut DeviceConfig, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult<(PciBdf, String)> { let vfio_name = if let Some(id) = &device_cfg.pci_common.id { id.clone() @@ -3953,7 +4012,7 @@ impl DeviceManager { vfio_p2p_dma, pci_device_bdf, memory_manager.lock().unwrap().memory_slot_allocator(), - vm_migration::snapshot_from_id(self.snapshot.as_ref(), vfio_name.as_str()), + vm_migration::snapshot_from_id(snapshot, vfio_name.as_str()), device_cfg.x_nv_gpudirect_clique, device_cfg .x_exclude_mmap_bars @@ -4062,13 +4121,16 @@ impl DeviceManager { Ok(new_resources) } - fn add_vfio_devices(&mut self) -> DeviceManagerResult> { + fn add_vfio_devices( + &mut self, + snapshot: Option<&Snapshot>, + ) -> DeviceManagerResult> { let mut iommu_attached_device_ids = Vec::new(); let mut devices = self.config.lock().unwrap().devices.take(); if let Some(device_list_cfg) = &mut devices { for device_cfg in device_list_cfg.iter_mut() { - let (device_id, _) = self.add_passthrough_device(device_cfg)?; + let (device_id, _) = self.add_passthrough_device(device_cfg, snapshot)?; if device_cfg.pci_common.iommu && self.iommu_device.is_some() { iommu_attached_device_ids.push(device_id); } @@ -4084,6 +4146,7 @@ impl DeviceManager { fn add_vfio_user_device( &mut self, device_cfg: &mut UserDeviceConfig, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult<(PciBdf, String)> { let vfio_user_name = if let Some(id) = &device_cfg.pci_common.id { id.clone() @@ -4129,7 +4192,7 @@ impl DeviceManager { legacy_interrupt_group, pci_device_bdf, memory_manager.lock().unwrap().memory_slot_allocator(), - 
vm_migration::snapshot_from_id(self.snapshot.as_ref(), vfio_user_name.as_str()), + vm_migration::snapshot_from_id(snapshot, vfio_user_name.as_str()), ) .map_err(DeviceManagerError::VfioUserCreate)?; @@ -4191,12 +4254,15 @@ impl DeviceManager { Ok((pci_device_bdf, vfio_user_name)) } - fn add_user_devices(&mut self) -> DeviceManagerResult> { + fn add_user_devices( + &mut self, + snapshot: Option<&Snapshot>, + ) -> DeviceManagerResult> { let mut user_devices = self.config.lock().unwrap().user_devices.take(); if let Some(device_list_cfg) = &mut user_devices { for device_cfg in device_list_cfg.iter_mut() { - let (_device_id, _id) = self.add_vfio_user_device(device_cfg)?; + let (_device_id, _id) = self.add_vfio_user_device(device_cfg, snapshot)?; } } @@ -4216,6 +4282,7 @@ impl DeviceManager { is_hotplug: bool, dma_handler: Option>, pci_device_id: Option, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult { let id = format!("{VIRTIO_PCI_DEVICE_NAME_PREFIX}-{virtio_device_id}"); @@ -4313,7 +4380,7 @@ impl DeviceManager { use_64bit_bar_for_virtio_device(device_type, pci_segment_id, is_hotplug), dma_handler, self.pending_activations.clone(), - vm_migration::snapshot_from_id(self.snapshot.as_ref(), id.as_str()), + vm_migration::snapshot_from_id(snapshot, id.as_str()), ) .map_err(DeviceManagerError::VirtioDevice)?, )); @@ -4347,6 +4414,7 @@ impl DeviceManager { fn add_pvpanic_device( &mut self, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult>>> { let id = String::from(PVPANIC_DEVICE_NAME); let pci_segment_id = 0x0_u16; @@ -4356,7 +4424,7 @@ impl DeviceManager { let (pci_segment_id, pci_device_bdf, resources) = self.pci_resources(&id, pci_segment_id, None)?; - let snapshot = snapshot_from_id(self.snapshot.as_ref(), id.as_str()); + let snapshot = snapshot_from_id(snapshot, id.as_str()); let pvpanic_device = devices::PvPanicDevice::new(id.clone(), snapshot) .map_err(DeviceManagerError::PvPanicCreate)?; @@ -4386,6 +4454,7 @@ impl DeviceManager { fn add_ivshmem_device( 
&mut self, ivshmem_cfg: &IvshmemConfig, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult>>> { let id = String::from(IVSHMEM_DEVICE_NAME); let pci_segment_id = 0x0_u16; @@ -4393,7 +4462,7 @@ impl DeviceManager { let (pci_segment_id, pci_device_bdf, resources) = self.pci_resources(&id, pci_segment_id, None)?; - let snapshot = snapshot_from_id(self.snapshot.as_ref(), id.as_str()); + let snapshot = snapshot_from_id(snapshot, id.as_str()); let ivshmem_ops = Arc::new(Mutex::new(IvshmemHandler { memory_manager: self.memory_manager.clone(), @@ -4609,6 +4678,10 @@ impl DeviceManager { Ok(()) } + /// Notifies the VM for a hotplug. + /// + /// This call doesn't wait for the vCPU receiving the + /// interrupt to acknowledge. pub fn notify_hotplug( &self, _notification_type: AcpiNotificationFlags, @@ -4634,7 +4707,7 @@ impl DeviceManager { return Err(DeviceManagerError::InvalidIommuHotplug); } - let (bdf, device_name) = self.add_passthrough_device(device_cfg)?; + let (bdf, device_name) = self.add_passthrough_device(device_cfg, None)?; // Update the PCIU bitmap self.pci_segments[device_cfg.pci_common.pci_segment as usize].pci_devices_up |= @@ -4663,7 +4736,7 @@ impl DeviceManager { )); } - let (bdf, device_name) = self.add_vfio_user_device(device_cfg)?; + let (bdf, device_name) = self.add_vfio_user_device(device_cfg, None)?; // Update the PCIU bitmap self.pci_segments[device_cfg.pci_common.pci_segment as usize].pci_devices_up |= @@ -5033,6 +5106,7 @@ impl DeviceManager { true, handle.dma_handler, handle.pci_common.pci_device_id, + None, )?; // Update the PCIU bitmap @@ -5065,14 +5139,14 @@ impl DeviceManager { return Err(DeviceManagerError::InvalidIommuHotplug); } - let device = self.make_virtio_block_device(disk_cfg, true)?; + let device = self.make_virtio_block_device(disk_cfg, true, None)?; self.hotplug_virtio_pci_device(device) } pub fn add_fs(&mut self, fs_cfg: &mut FsConfig) -> DeviceManagerResult { self.validate_identifier(&fs_cfg.pci_common.id)?; - let device = 
self.make_virtio_fs_device(fs_cfg)?; + let device = self.make_virtio_fs_device(fs_cfg, None)?; self.hotplug_virtio_pci_device(device) } @@ -5082,7 +5156,7 @@ impl DeviceManager { ) -> DeviceManagerResult { self.validate_identifier(&generic_vhost_user_cfg.pci_common.id)?; - let device = self.make_generic_vhost_user_device(generic_vhost_user_cfg)?; + let device = self.make_generic_vhost_user_device(generic_vhost_user_cfg, None)?; self.hotplug_virtio_pci_device(device) } @@ -5093,7 +5167,7 @@ impl DeviceManager { return Err(DeviceManagerError::InvalidIommuHotplug); } - let device = self.make_virtio_pmem_device(pmem_cfg)?; + let device = self.make_virtio_pmem_device(pmem_cfg, None)?; self.hotplug_virtio_pci_device(device) } @@ -5104,7 +5178,7 @@ impl DeviceManager { return Err(DeviceManagerError::InvalidIommuHotplug); } - let device = self.make_virtio_net_device(net_cfg)?; + let device = self.make_virtio_net_device(net_cfg, None)?; self.hotplug_virtio_pci_device(device) } @@ -5115,7 +5189,7 @@ impl DeviceManager { return Err(DeviceManagerError::InvalidIommuHotplug); } - let device = self.make_vdpa_device(vdpa_cfg)?; + let device = self.make_vdpa_device(vdpa_cfg, None)?; self.hotplug_virtio_pci_device(device) } @@ -5126,7 +5200,7 @@ impl DeviceManager { return Err(DeviceManagerError::InvalidIommuHotplug); } - let device = self.make_virtio_vsock_device(vsock_cfg)?; + let device = self.make_virtio_vsock_device(vsock_cfg, None)?; self.hotplug_virtio_pci_device(device) } @@ -5245,6 +5319,75 @@ impl DeviceManager { self.vfio_ops = None; } } + + /// Helps the environment converge quickly after a live migration by + /// prompting devices to advertise the VM from its new host. + /// + /// This is mainly useful for networking: switches and peers can refresh + /// their view of where the guest now lives instead of waiting for normal + /// traffic to update MAC-to-port mappings on its own. 
+ /// + /// The method gathers the [`PostMigrationAnnouncer`] implementations + /// exposed by virtio devices, runs one announcement synchronously for + /// minimum delay, and then schedules a few retries from a background + /// thread. + pub fn post_migration_announce(&self) { + let mut announcers: Vec> = self + .virtio_devices + .iter() + .filter_map(|dev| dev.virtio_device.lock().unwrap().post_migration_announcer()) + .collect(); + + if announcers.is_empty() { + info!("No announcers"); + return; + } + + // We do the first announcement synchronously, because we want the announcements + // as soon as possible. + announcers.iter_mut().for_each(|a| a.announce()); + info!("Post migration announce (sync)"); + + // For good measure we repeat the announcements. This increases the chance that + // the announcements have the expected effect. + const ROUNDS: u32 = 4; + const INITIAL_DELAY: Duration = Duration::from_millis(50); + const STEP_DELAY: Duration = Duration::from_millis(100); + const MAX_DELAY: Duration = Duration::from_millis(450); + schedule_post_migration_announcements( + announcers, + ROUNDS, + INITIAL_DELAY, + STEP_DELAY, + MAX_DELAY, + ); + } +} + +/// Starts a thread that periodically performs the post-migration announcements. +fn schedule_post_migration_announcements( + mut announcers: Vec>, + rounds: u32, + initial_delay: Duration, + step_delay: Duration, + max_delay: Duration, +) { + let _ = thread::Builder::new() + .name("post-migration-announcers".to_string()) + .spawn(move || { + for round in 0..rounds { + info!("Post migration announce (async): {}/{}", round + 1, rounds); + + // The first announcement already was done synchronously, thus + // we sleep at the start of the loop. 
+ + let delay = (initial_delay + step_delay.saturating_mul(round)).min(max_delay); + debug!("Sleeping {}ms", delay.as_millis()); + thread::sleep(delay); + + announcers.iter_mut().for_each(|a| a.announce()); + } + }); } #[cfg(feature = "ivshmem")] diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 6c9d476ab2..0811b23dbd 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -3,6 +3,15 @@ // SPDX-License-Identifier: Apache-2.0 // +/// Amount of iterations before auto-converging starts. +const AUTO_CONVERGE_ITERATION_DELAY: u64 = 2; +/// Step size in percent to increase the vCPU throttling. +const AUTO_CONVERGE_STEP_SIZE: u8 = 10; +/// Amount of iterations after that we increase vCPU throttling. +const AUTO_CONVERGE_ITERATION_INCREASE: u64 = 2; +/// Maximum vCPU throttling value. +const AUTO_CONVERGE_MAX: u8 = 99; + use std::collections::HashMap; use std::fs::File; use std::io::{Read, Write, stdout}; @@ -10,17 +19,20 @@ use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; use std::panic::AssertUnwindSafe; #[cfg(feature = "guest_debug")] use std::path::PathBuf; +use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::mpsc::{Receiver, RecvError, SendError, Sender}; -use std::sync::{Arc, Mutex}; +use std::sync::{Arc, Mutex, Weak}; +use std::thread::JoinHandle; use std::time::Duration; #[cfg(not(target_arch = "riscv64"))] use std::time::Instant; -use std::{io, result, thread}; +use std::{io, mem, result, thread}; use anyhow::{Context, anyhow}; #[cfg(feature = "dbus_api")] use api::dbus::{DBusApiOptions, DBusApiShutdownChannels}; use api::http::HttpApiHandle; +use arch::PAGE_SIZE; #[cfg(all(feature = "kvm", target_arch = "x86_64"))] use arch::x86_64::MAX_SUPPORTED_CPUS_LEGACY; use console_devices::{ConsoleInfo, pre_create_console_devices}; @@ -35,9 +47,12 @@ use serde::ser::{SerializeStruct, Serializer}; use serde::{Deserialize, Serialize}; use signal_hook::iterator::{Handle, Signals}; use thiserror::Error; -use tracer::trace_scoped; use vm_memory::GuestMemoryAtomic; use 
vm_memory::bitmap::AtomicBitmap; +use vm_migration::progress::{ + MemoryTransmissionInfo, MigrationProgress, MigrationState, MigrationStateOngoingPhase, + TransportationMode, +}; use vm_migration::protocol::*; use vm_migration::{ MemoryMigrationContext, Migratable, MigratableError, OngoingMigrationContext, Pausable, @@ -54,19 +69,20 @@ use crate::api::{ use crate::config::{MemoryRestoreMode, RestoreConfig, add_to_config}; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use crate::coredump::GuestDebuggable; +#[cfg(feature = "kvm")] +use crate::cpu::IS_IN_SHUTDOWN; +use crate::device_manager::DeviceManager; use crate::landlock::Landlock; use crate::memory_manager::MemoryManager; -#[cfg(all(feature = "kvm", target_arch = "x86_64"))] -use crate::migration::get_vm_snapshot; -use crate::migration::{recv_vm_config, recv_vm_state}; +use crate::migration::{get_vm_snapshot, recv_vm_config, recv_vm_state}; use crate::migration_transport::{ ReceiveAdditionalConnections, ReceiveListener, SendAdditionalConnections, SocketStream, }; use crate::seccomp_filters::{Thread, get_seccomp_filter}; -use crate::vm::{Error as VmError, Vm, VmState}; +use crate::vm::{Error as VmError, PostMigrationLifecycleEvent, Vm, VmState}; use crate::vm_config::{ - DeviceConfig, DiskConfig, FsConfig, GenericVhostUserConfig, NetConfig, PmemConfig, - UserDeviceConfig, VdpaConfig, VmConfig, VsockConfig, + DeviceConfig, DiskConfig, FsConfig, GenericVhostUserConfig, MemoryZoneConfig, NetConfig, + PmemConfig, UserDeviceConfig, VdpaConfig, VmConfig, VsockConfig, }; mod acpi; @@ -102,6 +118,7 @@ mod sigwinch_listener; mod sync_utils; mod uffd; mod userfaultfd; +mod vcpu_throttling; pub mod vm; pub mod vm_config; @@ -264,6 +281,7 @@ pub enum EpollDispatch { ActivateVirtioDevices = 3, Debug = 4, GuestExit = 5, + CheckMigration = 6, Unknown, } @@ -277,11 +295,15 @@ impl From for EpollDispatch { 3 => ActivateVirtioDevices, 4 => Debug, 5 => GuestExit, + 6 => CheckMigration, _ => Unknown, } } } +// TODO 
make this a member of Vmm? +static MIGRATION_PROGRESS_SNAPSHOT: Mutex> = Mutex::new(None); + pub struct EpollContext { epoll_file: File, } @@ -614,6 +636,145 @@ impl VmmVersionInfo { } } +/// Handle for the [`MigrationWorker`] thread. +struct MigrationWorkerHandle { + // Option to take the inner handle + handle: Option>, + cancel: Arc, +} + +impl MigrationWorkerHandle { + /// Cancels the migration. + /// + /// Note that timing issues in the very last phase of the migration allow a + /// tiny window in that migration succeeds before they could be canceled. + fn trigger_cancellation(&self) { + info!("Will cancel ongoing live-migration"); + self.cancel.store(true, Ordering::Release); + // we just dispatch here and do not block for the migration thread + } + + /// Joins the thread and returns the result. + fn join(mut self) -> MigrationThreadOut { + self.handle + .take() + .expect("should have thread") + .join() + .expect("should join migration thread gracefully") + } +} + +impl Drop for MigrationWorkerHandle { + fn drop(&mut self) { + if let Some(handle) = self.handle.take() { + warn!("Migration thread wasn't cleaned up explicitly via join()"); + handle + .join() + .expect("should join migration thread gracefully"); + } + } +} + +/// Abstraction for the thread controlling and performing the live migration. +/// +/// The migration thread also takes ownership of the [`Vm`] from the [`Vmm`]. +struct MigrationWorker { + vm: Vm, + check_migration_evt: EventFd, + config: VmSendMigrationData, + // Shared with main VMM thread + postponed_lifecycle_event: Arc>>, + #[cfg(all(feature = "kvm", target_arch = "x86_64"))] + hypervisor: Arc, + cancel: Arc, +} + +impl MigrationWorker { + /// Perform the migration and communicate with the [`Vmm`] thread. 
+ fn run(mut self) -> MigrationThreadOut { + debug!("migration thread is starting"); + event!("vm", "migration-started"); + + let res = Vmm::send_migration( + &mut self.vm, + #[cfg(all(feature = "kvm", target_arch = "x86_64"))] + self.hypervisor.as_ref(), + &self.config, + self.postponed_lifecycle_event.as_ref(), + self.cancel.clone(), + ) + .inspect(|_| event!("vm", "migration-finished")) + .inspect_err(|e| { + event!("vm", "migration-failed"); + error!("migrate error: {e}"); + }); + + // Notify VMM thread to get migration result by joining this thread. + self.check_migration_evt.write(1).unwrap(); + + debug!("migration thread is finished"); + MigrationThreadOut { + vm: self.vm, + migration_res: res, + migration_cfg: self.config, + } + } + + #[expect(clippy::result_large_err)] + fn spawn( + vm: Vm, + check_migration_evt: EventFd, + config: VmSendMigrationData, + postponed_lifecycle_event: Arc>>, + #[cfg(all(feature = "kvm", target_arch = "x86_64"))] hypervisor: Arc< + dyn hypervisor::Hypervisor, + >, + ) -> result::Result { + let cancel = Arc::new(AtomicBool::new(false)); + let worker = MigrationWorker { + vm, + check_migration_evt, + config, + postponed_lifecycle_event, + #[cfg(all(feature = "kvm", target_arch = "x86_64"))] + hypervisor, + cancel: cancel.clone(), + }; + + // Cumbersome but we need this to take a value from the worker when + // thread spawning failed. Ownership of the worker is either by the + // thread or this function. + let worker = Arc::new(Mutex::new(Some(worker))); + let thread_worker = worker.clone(); + + let inner_handle = thread::Builder::new() + .name("migration".into()) + .spawn(move || { + thread_worker + .lock() + .unwrap() + .take() + .expect("migration worker should only be taken once") + .run() + }) + .context("should spawn migration thread") + .map_err(|e| { + // Get the VM back from the worker. 
+ let worker = worker + .lock() + .unwrap() + .take() + .expect("migration worker should remain available on spawn failure"); + (worker.vm, MigratableError::MigrateSend(e)) + })?; + + Ok(MigrationWorkerHandle { + handle: Some(inner_handle), + cancel, + }) + } +} + pub struct VmmThreadHandle { pub thread_handle: thread::JoinHandle>, #[cfg(feature = "dbus_api")] @@ -621,6 +782,74 @@ pub struct VmmThreadHandle { pub http_api_handle: Option, } +struct MigrationVmState { + // The migration worker owns the VM during migration, so this should stop + // working once that VM has been dropped. + device_manager: Weak>, +} + +impl MigrationVmState { + fn new(vm: &Vm) -> Self { + Self { + device_manager: Arc::downgrade(vm.device_manager()), + } + } + + fn activate_virtio_devices(&self) -> result::Result<(), VmError> { + self.device_manager + .upgrade() + .expect("device manager should remain alive during migration") + .lock() + .unwrap() + .activate_virtio_devices() + .map_err(VmError::ActivateVirtioDevices) + } +} + +/// Describes the current ownership of a running VM. +#[allow(clippy::large_enum_variant)] +enum MaybeVmOwnership { + /// The VMM holds the ownership of the VM. + Vmm(Vm), + /// The VM is temporarily blocked by the current ongoing migration. + /// + /// We still keep the device manager reachable so the epoll thread can + /// drain pending virtio activations while the migration worker owns the VM. + Migration(MigrationVmState), + /// No VM is running. + None, +} + +impl MaybeVmOwnership { + /// Takes the VM and replaces it with [`Self::Migration`]. + /// + /// # Panics + /// This method panics if `self` is not [`Self::Vmm`]. 
+ fn take_vm_for_migration(&mut self) -> Vm { + match mem::replace(self, Self::None) { + Self::Vmm(vm) => { + *self = Self::Migration(MigrationVmState::new(&vm)); + vm + } + _ => panic!("should only be called when a migration can start"), + } + } + + fn vm_mut(&mut self) -> Option<&mut Vm> { + match self { + MaybeVmOwnership::Vmm(vm) => Some(vm), + _ => None, + } + } +} + +/// Output value of [`MigrationWorker`]. +struct MigrationThreadOut { + vm: Vm, + migration_res: result::Result<(), MigratableError>, + migration_cfg: VmSendMigrationData, +} + pub struct Vmm { epoll: EpollContext, exit_evt: EventFd, @@ -632,7 +861,7 @@ pub struct Vmm { #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, version: VmmVersionInfo, - vm: Option, + vm: MaybeVmOwnership, vm_config: Option>>, seccomp_action: SeccompAction, hypervisor: Arc, @@ -643,6 +872,11 @@ pub struct Vmm { console_resize_pipe: Option>, console_info: Option, no_shutdown: bool, + check_migration_evt: EventFd, + postponed_lifecycle_event: Arc>>, + received_postponed_lifecycle_event: Option, + /// Handle to the [`MigrationWorker`] thread. + migration_thread_handle: Option, } /// Just a wrapper for the data that goes into @@ -685,6 +919,18 @@ enum ReceiveMigrationState { } impl ReceiveMigrationState { + fn variant_name(&self) -> &'static str { + match self { + ReceiveMigrationState::Established => "Established", + ReceiveMigrationState::Started => "Started", + ReceiveMigrationState::MemoryFdsReceived(_) => "MemoryFdsReceived", + ReceiveMigrationState::Configured(_) => "Configured", + ReceiveMigrationState::StateReceived { .. 
} => "StateReceived", + ReceiveMigrationState::Completed => "Completed", + ReceiveMigrationState::Aborted => "Aborted", + } + } + fn finished(&self) -> bool { matches!( self, @@ -749,14 +995,14 @@ impl Vmm { .name("vmm_signal_handler".to_string()) .spawn(move || { if !signal_handler_seccomp_filter.is_empty() && let Err(e) = apply_filter(&signal_handler_seccomp_filter) - .map_err(Error::ApplySeccompFilter) - { - error!("Error applying seccomp filter: {e:?}"); - exit_evt.write(1).ok(); - return; - } + .map_err(Error::ApplySeccompFilter) + { + error!("Error applying seccomp filter: {e:?}"); + exit_evt.write(1).ok(); + return; + } - if landlock_enable{ + if landlock_enable { match Landlock::new() { Ok(landlock) => { let _ = landlock.restrict_self().map_err(Error::ApplyLandlock).map_err(|e| { @@ -774,11 +1020,11 @@ impl Vmm { std::panic::catch_unwind(AssertUnwindSafe(|| { Vmm::signal_handler(signals, original_termios_opt.as_ref(), &exit_evt); })) - .map_err(|_| { - error!("vmm signal_handler thread panicked"); - exit_evt.write(1).ok() - }) - .ok(); + .map_err(|_| { + error!("vmm signal_handler thread panicked"); + exit_evt.write(1).ok() + }) + .ok(); }) .map_err(Error::SignalHandlerSpawn)?, ); @@ -803,6 +1049,7 @@ impl Vmm { let reset_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::EventFdCreate)?; let guest_exit_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::EventFdCreate)?; let activate_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::EventFdCreate)?; + let check_migration_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::EventFdCreate)?; epoll .add_event(&exit_evt, EpollDispatch::Exit) @@ -829,6 +1076,10 @@ impl Vmm { .add_event(&debug_evt, EpollDispatch::Debug) .map_err(Error::Epoll)?; + epoll + .add_event(&check_migration_evt, EpollDispatch::CheckMigration) + .map_err(Error::Epoll)?; + Ok(Vmm { epoll, exit_evt, @@ -840,7 +1091,7 @@ impl Vmm { #[cfg(feature = "guest_debug")] vm_debug_evt, version: vmm_version, - vm: None, + vm: MaybeVmOwnership::None, vm_config: None, 
seccomp_action, hypervisor, @@ -851,18 +1102,40 @@ impl Vmm { console_resize_pipe: None, console_info: None, no_shutdown, + check_migration_evt, + postponed_lifecycle_event: Arc::new(Mutex::new(None)), + received_postponed_lifecycle_event: None, + migration_thread_handle: None, }) } + fn postpone_lifecycle_event_during_migration(&self, event: PostMigrationLifecycleEvent) { + let mut postponed_event = self.postponed_lifecycle_event.lock().unwrap(); + if postponed_event.is_none() { + *postponed_event = Some(event); + info!("Postponed post-migration lifecycle event: {event:?}"); + } + } + + fn current_postponed_lifecycle_event(&self) -> Option { + *self.postponed_lifecycle_event.lock().unwrap() + } + + fn clear_postponed_lifecycle_event(&self) { + let mut postponed_event = self.postponed_lifecycle_event.lock().unwrap(); + *postponed_event = None; + } + /// Try to receive a file descriptor from a socket. Returns the slot number and the file descriptor. fn vm_receive_memory_fd( socket: &mut SocketStream, ) -> std::result::Result<(u32, File), MigratableError> { if let SocketStream::Unix(unix_socket) = socket { let mut buf = [0u8; 4]; - let (_, file) = unix_socket.recv_with_fd(&mut buf).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error receiving slot from socket: {e}")) - })?; + let (_, file) = unix_socket + .recv_with_fd(&mut buf) + .context("Error receiving slot from socket") + .map_err(MigratableError::MigrateReceive)?; file.ok_or_else(|| MigratableError::MigrateReceive(anyhow!("Failed to receive socket"))) .map(|file| (u32::from_le_bytes(buf), file)) @@ -883,13 +1156,13 @@ impl Vmm { listener: &ReceiveListener, state: ReceiveMigrationState, req: &Request, - _receive_data_migration: &VmReceiveMigrationData, + receive_data_migration: &VmReceiveMigrationData, ) -> std::result::Result { use ReceiveMigrationState::*; - let invalid_command = || { + let invalid_command = |state: &str, cmd: Command| { Err(MigratableError::MigrateReceive(anyhow!( - "Can't handle 
command in current state" + "Can't handle command {cmd:?} in current receive state {state}" ))) }; @@ -897,7 +1170,28 @@ impl Vmm { |socket: &mut SocketStream, memory_files: HashMap| -> std::result::Result { - let memory_manager = self.vm_receive_config(req, socket, memory_files)?; + let memory_manager = self.vm_receive_config( + req, + socket, + memory_files, + receive_data_migration.tcp_serial_url.clone(), + receive_data_migration.zones.clone(), + )?; + + if !receive_data_migration.net_fds.is_empty() { + let mut vm_config = self.vm_config.as_mut().unwrap().lock().unwrap(); + for restored_net in &receive_data_migration.net_fds { + for net_config in vm_config.net.iter_mut().flatten() { + // Only update net devices that are backed directly by file descriptors. + if net_config.pci_common.id.as_ref() == Some(&restored_net.id) + && net_config.fds.is_some() + { + net_config.fds.clone_from(&restored_net.fds); + } + } + } + } + let guest_memory = memory_manager.lock().unwrap().guest_memory(); // Create the additional-connection receiver even in the single-connection case. 
// At this point the receiver does not know whether the sender will use extra TCP @@ -927,22 +1221,23 @@ impl Vmm { return Ok(Aborted); } + let state_name = state.variant_name(); match state { Established => match req.command() { Command::Start => Ok(Started), - _ => invalid_command(), + c => invalid_command(state_name, c), }, Started => match req.command() { Command::MemoryFd => recv_memory_fd(socket, Vec::new()).map(MemoryFdsReceived), Command::Config => configure_vm(socket, Default::default()).map(Configured), - _ => invalid_command(), + c => invalid_command(state_name, c), }, MemoryFdsReceived(memory_files) => match req.command() { Command::MemoryFd => recv_memory_fd(socket, memory_files).map(MemoryFdsReceived), Command::Config => { configure_vm(socket, HashMap::from_iter(memory_files)).map(Configured) } - _ => invalid_command(), + c => invalid_command(state_name, c), }, Configured(mut config_data) => match req.command() { // Memory commands use the main connection only in the single-connection case. @@ -981,7 +1276,7 @@ impl Vmm { state_receive_begin, }) } - _ => invalid_command(), + c => invalid_command(state_name, c), }, StateReceived { state_receive_begin, @@ -993,12 +1288,40 @@ impl Vmm { Command::Complete => { // The unwrap is safe, because the state machine makes sure we called // vm_receive_state before, which creates the VM. - let vm = self.vm.as_mut().unwrap(); - let (_, resume_duration) = measure_ok(|| vm.resume())?; - debug!( - "Migration (incoming): resume:{}ms", - resume_duration.as_millis() - ); + let vm = self.vm.vm_mut().unwrap(); + + // Advertise new VM location to network switches. + // The thread in background periodically sends multiple messages. + vm.post_migration_announce(); + + // We are on the control-loop thread handling an API request, so + // there is no concurrent access from other VMM or migration + // threads. 
The VM is in the Paused state , which permits both + // the Running transition (resume) and the Shutdown transition (reboot / exit) + // triggered via the eventfds below. + match self.received_postponed_lifecycle_event { + None => { + let (_, resume_duration) = measure_ok(|| vm.resume())?; + debug!( + "Migration (incoming): resume:{}ms", + resume_duration.as_millis() + ); + } + Some(PostMigrationLifecycleEvent::VmReboot) => { + self.reset_evt + .write(1) + .context("Failed writing reset eventfd after migration") + .map_err(MigratableError::MigrateReceive)?; + } + Some(PostMigrationLifecycleEvent::VmShutdown) => { + self.guest_exit_evt + .write(1) + .context("Failed writing guest exit eventfd after migration") + .map_err(MigratableError::MigrateReceive)?; + } + } + self.received_postponed_lifecycle_event = None; + // This logs the downtime without the final memory delta, so // it does not reflect the actual downtime. While we could // pass along the timestamp from when the VM was paused, @@ -1011,7 +1334,7 @@ impl Vmm { ); Ok(Completed) } - _ => invalid_command(), + c => invalid_command(state_name, c), }, Completed | Aborted => { unreachable!("Performed a step on the finished state machine") @@ -1024,6 +1347,8 @@ impl Vmm { req: &Request, socket: &mut T, existing_memory_files: HashMap, + tcp_serial_url: Option, + zones: Vec, ) -> std::result::Result>, MigratableError> where T: Read, @@ -1035,10 +1360,9 @@ impl Vmm { .read_exact(&mut data) .map_err(MigratableError::MigrateSocket)?; - let vm_migration_config: VmMigrationConfig = - serde_json::from_slice(&data).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error deserialising config: {e}")) - })?; + let vm_migration_config: VmMigrationConfig = serde_json::from_slice(&data) + .context("Error deserialising config") + .map_err(MigratableError::MigrateReceive)?; #[cfg(all(feature = "kvm", target_arch = "x86_64"))] self.vm_check_cpuid_compatibility( @@ -1048,9 +1372,53 @@ impl Vmm { let config = 
vm_migration_config.vm_config.clone(); self.vm_config = Some(vm_migration_config.vm_config); - self.console_info = Some(pre_create_console_devices(self).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error creating console devices: {e:?}")) - })?); + + if let Some(tcp_serial_url) = tcp_serial_url { + let mut vm_config = self.vm_config.as_mut().unwrap().lock().unwrap(); + vm_config.serial.common.url = Some(tcp_serial_url); + } + + // Adopt host nodes. + if !zones.is_empty() { + let mut vm_config = self.vm_config.as_mut().unwrap().lock().unwrap(); + if let Some(config_zones) = &mut vm_config.memory.zones { + for zone in zones { + // We currently only support to move MemoryZones to different host nodes. We therefore ensure that + // there exists a memory zone in the new config that matches the same size and ID for each memory + // zone of the old config. + if let Some(matched_zone) = config_zones.iter_mut().find(|z| z.id == zone.id) { + if matched_zone.size != zone.size { + return Err(MigratableError::MigrateReceive(anyhow!( + "Size update of memory zone with ID {} not allowed. Tried to resize from {:018x?} to {:018x?}", + zone.id, + zone.size, + matched_zone.size + ))); + } + // Override the host numa node + matched_zone.host_numa_node = zone.host_numa_node; + } else { + // We did not find a match for a memory zone that was defined in the old config, so we cannot + // update it. 
+ return Err(MigratableError::MigrateReceive(anyhow!( + "Failed to associate new memory zone information with ID {} to an existing zone", + zone.id + ))); + } + } + } else { + // MemoryZoneConfigs were provided but the initial config didn't contain any + return Err(MigratableError::MigrateReceive(anyhow!( + "Updating memory zone data is forbidden as VM was instantiated without any zones" + ))); + } + } + + self.console_info = Some( + pre_create_console_devices(self) + .context("Error creating console devices") + .map_err(MigratableError::MigrateReceive)?, + ); if self .vm_config @@ -1061,9 +1429,9 @@ impl Vmm { .landlock_enable { let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - apply_landlock(&mut config).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error applying landlock: {e:?}")) - })?; + apply_landlock(&mut config) + .context("Error applying landlock") + .map_err(MigratableError::MigrateReceive)?; } let vm = Vm::create_hypervisor_vm( @@ -1096,11 +1464,8 @@ impl Vmm { Some(&vm_migration_config.memory_manager_data), existing_memory_files, ) - .map_err(|e| { - MigratableError::MigrateReceive(anyhow!( - "Error creating MemoryManager from snapshot: {e:?}" - )) - })?; + .context("Error creating MemoryManager from snapshot") + .map_err(MigratableError::MigrateReceive)?; Ok(memory_manager) } @@ -1129,27 +1494,40 @@ impl Vmm { socket .read_exact(&mut data) .map_err(MigratableError::MigrateSocket)?; - serde_json::from_slice(&data).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error deserialising snapshot: {e}")) - }) + serde_json::from_slice(&data) + .context("Error deserialising snapshot") + .map_err(MigratableError::MigrateReceive) })?; - let exit_evt = self.exit_evt.try_clone().map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error cloning exit EventFd: {e}")) - })?; - let reset_evt = self.reset_evt.try_clone().map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error cloning reset EventFd: {e}")) - })?; + let 
vm_snapshot = get_vm_snapshot(&snapshot) + .context("Failed extracting VM snapshot data") + .map_err(MigratableError::MigrateReceive)?; + self.received_postponed_lifecycle_event = vm_snapshot.post_migration_lifecycle_event; + + let exit_evt = self + .exit_evt + .try_clone() + .context("Error cloning exit EventFd") + .map_err(MigratableError::MigrateReceive)?; + let reset_evt = self + .reset_evt + .try_clone() + .context("Error cloning reset EventFd") + .map_err(MigratableError::MigrateReceive)?; let guest_exit_evt = self.guest_exit_evt.try_clone().map_err(|e| { MigratableError::MigrateReceive(anyhow!("Error cloning guest exit EventFd: {e}")) })?; #[cfg(feature = "guest_debug")] - let debug_evt = self.vm_debug_evt.try_clone().map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error cloning debug EventFd: {e}")) - })?; - let activate_evt = self.activate_evt.try_clone().map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error cloning activate EventFd: {e}")) - })?; + let debug_evt = self + .vm_debug_evt + .try_clone() + .context("Error clonung debug EventFd") + .map_err(MigratableError::MigrateReceive)?; + let activate_evt = self + .activate_evt + .try_clone() + .context("Error cloning activate EventFd") + .map_err(MigratableError::MigrateReceive)?; let (vm, restore_duration) = measure_ok(|| { #[cfg(not(target_arch = "riscv64"))] @@ -1189,11 +1567,20 @@ impl Vmm { Ok(vm) })?; - self.vm = Some(vm); + self.vm = MaybeVmOwnership::Vmm(vm); Ok((receive_duration, restore_duration)) } + fn can_increase_autoconverge_step(s: &MemoryMigrationContext) -> bool { + if (s.iteration as u64) < AUTO_CONVERGE_ITERATION_DELAY { + false + } else { + let iteration = s.iteration as u64 - AUTO_CONVERGE_ITERATION_DELAY; + iteration.is_multiple_of(AUTO_CONVERGE_ITERATION_INCREASE) + } + } + /// Performs the initial memory transmission (iteration zero) plus a /// variable number of memory iterations with the goal to eventually migrate /// the VM in a reasonably small downtime. 
@@ -1206,8 +1593,55 @@ impl Vmm { ctx: &mut MemoryMigrationContext, is_converged: impl Fn(&MemoryMigrationContext) -> result::Result, mem_send: &mut SendAdditionalConnections, + postponed_lifecycle_event: &Mutex>, + return_if_cancelled_cb: &impl Fn(&mut SocketStream) -> result::Result<(), MigratableError>, ) -> result::Result { + let update_migration_progress = |s: &mut MemoryMigrationContext, vm: &Vm| { + let mut lock = MIGRATION_PROGRESS_SNAPSHOT.lock().unwrap(); + lock.as_mut() + .expect("live migration should be ongoing") + .update( + MigrationStateOngoingPhase::MemoryPrecopy, + Some(MemoryTransmissionInfo { + memory_iteration: s.iteration as u64, + memory_transmission_bps: s.bandwidth_bytes_per_second as u64, + memory_bytes_total: s.current_iteration_total_bytes, + memory_bytes_transmitted: s.total_sent_bytes, + memory_pages_4k_transmitted: s.total_sent_bytes.div_ceil(PAGE_SIZE as u64), + memory_pages_4k_remaining_iteration: s + .current_iteration_total_bytes + .div_ceil(PAGE_SIZE as u64), + memory_bytes_remaining_iteration: s.current_iteration_total_bytes, + memory_dirty_rate_pps: { + let pages = s.current_iteration_total_bytes.div_ceil(PAGE_SIZE as u64); + s.iteration_duration + .filter(|d| !d.is_zero()) + .map(|d| (pages as f64 / d.as_secs_f64()).ceil()) + .map_or(0, |dirty_rate| dirty_rate as u64) + }, + memory_pages_constant_count: 0, /* TODO */ + }), + Some(vm.throttle_percent()), + s.estimated_downtime, + ); + }; + loop { + return_if_cancelled_cb(socket)?; + + // todo: check if auto-converge is enabled at all? 
+ if Self::can_increase_autoconverge_step(ctx) + && vm.throttle_percent() < AUTO_CONVERGE_MAX + { + let current_throttle = vm.throttle_percent(); + let new_throttle = current_throttle + AUTO_CONVERGE_STEP_SIZE; + let new_throttle = std::cmp::min(new_throttle, AUTO_CONVERGE_MAX); + info!("Increasing auto-converge: {new_throttle}%"); + if new_throttle != current_throttle { + vm.set_throttle_percent(new_throttle); + } + } + let iteration_begin = Instant::now(); let iteration_table = if ctx.iteration == 0 { @@ -1218,19 +1652,24 @@ impl Vmm { }; ctx.update_metrics_before_transfer(iteration_begin, &iteration_table); + // Update before we might exit the loop. + update_migration_progress(ctx, vm); if is_converged(ctx)? { - debug!("Precopy converged: {ctx}"); + info!("Precopy converged: {ctx}"); break Ok(iteration_table); } + // Update with new metrics before transmission. + update_migration_progress(ctx, vm); + // Send the current dirty pages let transfer_begin = Instant::now(); - mem_send.send_memory(iteration_table, socket)?; + mem_send.send_memory(iteration_table, socket, return_if_cancelled_cb)?; let transfer_duration = transfer_begin.elapsed(); ctx.update_metrics_after_transfer(transfer_begin, transfer_duration); // Log progress of the current iteration - debug!("Precopy: {ctx}"); + info!("Precopy: {ctx}"); // Enables management software (e.g., libvirt) to easily track forward progress. event!( @@ -1243,6 +1682,16 @@ impl Vmm { // Increment iteration last: This way we ensure that the logging // above matches the actual iteration. ctx.iteration += 1; + + let event = *postponed_lifecycle_event.lock().unwrap(); + if let Some(event) = event { + info!( + "Lifecycle event postponed during migration ({event:?}), switching to downtime phase early" + ); + // The current iteration has already been sent, therefore no extra range + // needs to be carried into the final transfer batch. 
+ break Ok(MemoryRangeTable::default()); + } } } @@ -1352,6 +1801,8 @@ impl Vmm { send_data_migration: &VmSendMigrationData, mem_send: &mut SendAdditionalConnections, ctx: &mut OngoingMigrationContext, + postponed_lifecycle_event: &Mutex>, + return_if_cancelled_cb: &impl Fn(&mut SocketStream) -> result::Result<(), MigratableError>, ) -> result::Result<(), MigratableError> { let mut mem_ctx = MemoryMigrationContext::new(); @@ -1363,11 +1814,17 @@ impl Vmm { // We bind send_data_migration to the callback |ctx| Self::is_precopy_converged(ctx, send_data_migration), mem_send, + postponed_lifecycle_event, + return_if_cancelled_cb, )?; let downtime_begin = Instant::now(); - if vm.get_state() != VmState::Paused { - vm.pause()?; - } + // End throttle thread + info!("stopping vcpu thread"); + vm.stop_vcpu_throttling(); + info!("stopped vcpu thread"); + info!("pausing VM"); + vm.pause()?; + info!("paused VM"); // Send last batch of dirty pages: final iteration { @@ -1378,7 +1835,7 @@ impl Vmm { mem_ctx.update_metrics_before_transfer(iteration_begin, &final_table); let transfer_begin = Instant::now(); - mem_send.send_memory(final_table, socket)?; + mem_send.send_memory(final_table, socket, return_if_cancelled_cb)?; let transfer_duration = transfer_begin.elapsed(); mem_ctx.update_metrics_after_transfer(transfer_begin, transfer_duration); mem_ctx.iteration += 1; @@ -1391,28 +1848,54 @@ impl Vmm { Ok(()) } - /// Performs a migration including all its phases. + /// Performs a live-migration. + /// + /// This function performs necessary after-migration cleanup only in the + /// good case. Callers are responsible for properly handling failed + /// migrations. 
+ #[allow(unused_assignments)] // TODO remove fn send_migration( vm: &mut Vm, #[cfg(all(feature = "kvm", target_arch = "x86_64"))] hypervisor: &dyn hypervisor::Hypervisor, send_data_migration: &VmSendMigrationData, - initial_vm_state: VmState, + postponed_lifecycle_event: &Mutex>, + cancel: Arc, ) -> result::Result<(), MigratableError> { // State machine that is updated with more context as we progress. let mut ctx = OngoingMigrationContext::new(); + let return_if_cancelled_cb = move |socket: &mut SocketStream| { + if cancel.load(Ordering::Acquire) { + info!("Cancelling migration now"); + Request::abandon().write_to(socket)?; + Err(MigratableError::Cancelled) + } else { + Ok(()) + } + }; // Set up the socket connection - let mut socket = - migration_transport::send_migration_socket(&send_data_migration.destination_url)?; + let mut socket = if send_data_migration.local { + migration_transport::send_migration_socket( + &send_data_migration.destination_url, + send_data_migration.tls_dir.as_deref(), + )? + } else { + migration_transport::send_migration_socket_with_keep_alive( + &send_data_migration.destination_url, + send_data_migration.tls_dir.as_deref(), + )? 
+ }; // Start the migration migration_transport::send_request_expect_ok( &mut socket, Request::start(), - MigratableError::MigrateSend(anyhow!("Error starting migration")), + MigratableError::MigrateSend(anyhow!("Error starting migration (got bad response)")), )?; + return_if_cancelled_cb(&mut socket)?; + // Send config let vm_config = vm.get_config(); #[cfg(all(feature = "kvm", target_arch = "x86_64"))] @@ -1424,38 +1907,56 @@ impl Vmm { ))); } - let amx = vm_config.lock().unwrap().cpus.features.amx; - let phys_bits = - vm::physical_bits(hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits); + let (amx, phys_bits, profile, kvm_hyperv) = { + let guard = vm_config.lock().unwrap(); + let amx = guard.cpus.features.amx; + let max_phys_bits = guard.cpus.max_phys_bits; + let profile = guard.cpus.profile; + let kvm_hyperv = guard.cpus.kvm_hyperv; + // Drop lock before function call + core::mem::drop(guard); + let phys_bits = vm::physical_bits(hypervisor, max_phys_bits); + (amx, phys_bits, profile, kvm_hyperv) + }; + arch::generate_common_cpuid( hypervisor, &arch::CpuidConfig { phys_bits, - kvm_hyperv: vm_config.lock().unwrap().cpus.kvm_hyperv, + kvm_hyperv, #[cfg(feature = "tdx")] tdx: false, amx, + profile, }, ) - .map_err(|e| { - MigratableError::MigrateSend(anyhow!("Error generating common cpuid': {e:?}")) - })? + .context("Error generating common cpuid") + .map_err(MigratableError::MigrateSend)? 
}; + return_if_cancelled_cb(&mut socket)?; + if send_data_migration.local { match &mut socket { SocketStream::Unix(unix_socket) => { + let mut lock = MIGRATION_PROGRESS_SNAPSHOT.lock().unwrap(); + lock.as_mut() + .expect("live migration should be ongoing") + .update(MigrationStateOngoingPhase::MemoryFds, None, None, None); + // Proceed with sending memory file descriptors over UNIX socket vm.send_memory_fds(unix_socket)?; } - SocketStream::Tcp(_tcp_socket) => { + _ => { return Err(MigratableError::MigrateSend(anyhow!( - "--local option is not supported with TCP sockets", + "--local option is only supported with UNIX sockets", ))); } } } + return_if_cancelled_cb(&mut socket)?; + let vm_migration_config = VmMigrationConfig { vm_config, #[cfg(all(feature = "kvm", target_arch = "x86_64"))] @@ -1464,6 +1965,8 @@ impl Vmm { }; migration_transport::send_config(&mut socket, &vm_migration_config)?; + return_if_cancelled_cb(&mut socket)?; + // Let every Migratable object know about the migration being started. vm.start_migration()?; @@ -1483,6 +1986,7 @@ impl Vmm { let mut mem_send = migration_transport::SendAdditionalConnections::new( &send_data_migration.destination_url, send_data_migration.connections, + send_data_migration.tls_dir.as_deref(), &vm.guest_memory(), )?; @@ -1492,6 +1996,8 @@ impl Vmm { send_data_migration, &mut mem_send, &mut ctx, + postponed_lifecycle_event, + &return_if_cancelled_cb, ) .inspect_err(|_| { // Calling cleanup multiple times is fine, thus here we just make sure @@ -1504,12 +2010,33 @@ impl Vmm { mem_send.cleanup()?; } + // Very last cancellation check. After this, we release the disk locks and we can't cancel + // anymore. 
+ return_if_cancelled_cb(&mut socket)?; + + // Update migration progress snapshot + { + let mut lock = MIGRATION_PROGRESS_SNAPSHOT.lock().unwrap(); + lock.as_mut() + .expect("live migration should be ongoing") + .update(MigrationStateOngoingPhase::Completing, None, None, None); + } + // We release the locks early to enable locking them on the destination host. // The VM is already stopped. vm.release_disk_locks() .map_err(|e| MigratableError::UnlockError(anyhow!("{e}")))?; + #[cfg(feature = "kvm")] + // Prevent signal handler to access thread local storage when signals are received + // close to the end when thread-local storage is already destroyed. + { + let mut lock = IS_IN_SHUTDOWN.write().unwrap(); + *lock = true; + } + // Capture snapshot and send it + vm.set_post_migration_lifecycle_event(*postponed_lifecycle_event.lock().unwrap()); let (vm_snapshot, snapshot_duration) = measure_ok(|| vm.snapshot())?; let (_, send_snapshot_duration) = measure_ok(|| migration_transport::send_state(&mut socket, &vm_snapshot))?; @@ -1518,15 +2045,10 @@ impl Vmm { // When this returns, we know the VM was resumed (if it was running // before the migration) and that the receiving VMM acquired disk // locks again. - let complete_req = if initial_vm_state == VmState::Running { - Request::complete() - } else { - Request::complete_paused() - }; let (_, complete_duration) = measure_ok(|| { migration_transport::send_request_expect_ok( &mut socket, - complete_req, + Request::complete(), MigratableError::MigrateSend(anyhow!("Error completing migration")), ) })?; @@ -1570,8 +2092,17 @@ impl Vmm { let dest_cpuid = &{ let vm_config = &src_vm_config.lock().unwrap(); + if vm_config.cpus.features.amx { + // Need to enable AMX tile state components before generating common cpuid + // as this affects what Hypervisor::get_supported_cpuid returns. 
+ self.hypervisor + .enable_amx_state_components() + .map_err(|e| MigratableError::MigrateReceive(e.into()))?; + } + let phys_bits = vm::physical_bits(self.hypervisor.as_ref(), vm_config.cpus.max_phys_bits); + arch::generate_common_cpuid( self.hypervisor.as_ref(), &arch::CpuidConfig { @@ -1580,17 +2111,15 @@ impl Vmm { #[cfg(feature = "tdx")] tdx: false, amx: vm_config.cpus.features.amx, + profile: vm_config.cpus.profile, }, ) - .map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {e:?}")) - })? + .context("Error generating common cpuid") + .map_err(MigratableError::MigrateReceive)? }; - arch::CpuidFeatureEntry::check_cpuid_compatibility(src_vm_cpuid, dest_cpuid).map_err(|e| { - MigratableError::MigrateReceive(anyhow!( - "Error checking cpu feature compatibility': {e:?}" - )) - }) + arch::CpuidFeatureEntry::check_cpuid_compatibility(src_vm_cpuid, dest_cpuid) + .context("Error checking cpu feature compatibility") + .map_err(MigratableError::MigrateReceive) } fn vm_restore( @@ -1600,6 +2129,10 @@ impl Vmm { prefault: bool, memory_restore_mode: MemoryRestoreMode, ) -> std::result::Result<(), VmError> { + if matches!(self.vm, MaybeVmOwnership::Migration(_)) { + return Err(VmError::VmMigrating); + } + let snapshot = recv_vm_state(source_url).map_err(VmError::Restore)?; #[cfg(all(feature = "kvm", target_arch = "x86_64"))] let vm_snapshot = get_vm_snapshot(&snapshot).map_err(VmError::Restore)?; @@ -1648,7 +2181,7 @@ impl Vmm { Some(prefault), Some(memory_restore_mode), )?; - self.vm = Some(vm); + self.vm = MaybeVmOwnership::Vmm(vm); if self .vm_config @@ -1663,13 +2196,157 @@ impl Vmm { } // Now we can restore the rest of the VM. - if let Some(ref mut vm) = self.vm { - vm.restore() + // PANIC: won't panic, we just checked that the VM is there. + self.vm.vm_mut().unwrap().restore() + } + + /// Prints the error chain to `error!()` level, akin to user-facing errors when Cloud Hypervisor + /// or ch-remote fail. 
+ // TODO: For upstreaming, we should unify this with the code-paths used by ch-remote and + // Cloud Hypervisor on failure. + fn log_print_error_chain<'a>(top_error: &'a (dyn std::error::Error + 'static)) { + // Print chain of errors + if top_error.source().is_none() { + error!("Migration failed with the following error:"); + error!(" {top_error}"); } else { - Err(VmError::VmNotCreated) + // In cli_print_error_chain(), we also print the + // top-level error's Debug::fmt() as oneliner so that we can see all + // properties. As we use anyhow errors in the migration path, + // Debug::fmt() is not helpful for us as it doesn't print the + // underlying properties (like the default Debug::fmt() impl would + // do). Instead, it would print a trace itself, which is not what + // we want to do here. + + error!("Migration failed with the following chain of errors:"); + std::iter::successors(Some(top_error), |sub_error| { + // Dereference necessary to mitigate rustc compiler bug. + // See + (*sub_error).source() + }) + .enumerate() + .for_each(|(level, error)| { + error!(" {level}: {error}"); + }); } } + /// Checks the migration result. + /// + /// This should be called when the migration thread indicated a state + /// change (and therefore, its termination). The function checks the result + /// of that thread and either shuts down the VMM on success or keeps the VM + /// and the VMM running on migration failure. + fn check_migration_result(&mut self) { + // At this point, the thread must be finished. + // If we fail here, we have lost anyway. Just panic. + let MigrationThreadOut { + vm, + migration_res, + migration_cfg, + } = self + .migration_thread_handle + .take() + .expect("should have thread") + .join(); + + let mut try_resume_vm = |mut vm: Vm| { + // If the failure happened very late in the migration path, the VM might already be + // stopped. We resume it to ensure proper operation. 
+ // + // Cloud Hypervisor only supports migration of running VMs, therefore it cannot + // happen that we resume a previously paused VM. + if vm.get_state() == VmState::Paused { + match vm.resume() { + Ok(_) => { + info!("Resumed VM successfully after failed migration"); + } + Err(e) => { + error!("Failed resuming VM after failed migration: {e}"); + self.exit_evt.write(1).unwrap(); + } + } + } + + // Ensure full VM performance. The operation is idempotent. + let _ = vm.stop_dirty_log().inspect_err(|e| { + warn!("Failed stopping dirty log after resuming VM: {e} - VM performance might be slower than usual"); + }); + + // Give VMM back control. + self.vm = MaybeVmOwnership::Vmm(vm); + + if let Some(event) = self.current_postponed_lifecycle_event() { + match event { + PostMigrationLifecycleEvent::VmReboot => { + self.reset_evt + .write(1) + .context("Failed replaying reset event after failed migration") + .inspect_err(|write_err| error!("{write_err}")) + .ok(); + } + PostMigrationLifecycleEvent::VmShutdown => { + self.guest_exit_evt + .write(1) + .context("Failed replaying guest exit event after failed migration") + .inspect_err(|write_err| error!("{write_err}")) + .ok(); + } + } + } + }; + + match migration_res { + Ok(()) => { + self.vm = MaybeVmOwnership::None; + drop(vm); + + { + let mut lock = MIGRATION_PROGRESS_SNAPSHOT.lock().unwrap(); + lock.as_mut() + .expect("live migration should be ongoing") + .mark_as_finished(); + } + + if migration_cfg.keep_alive { + // API users can still query live-migration statistics + info!("Keeping VMM alive as requested"); + } else { + // Shutdown the VM after the migration succeeded + if let Err(e) = self.exit_evt.write(1) { + error!("Failed shutting down the VM after migration: {e}"); + } + } + } + Err(MigratableError::Cancelled) => { + error!("Migration cancelled"); + event!("vm", "migration-cancelled"); + try_resume_vm(vm); + + // Update migration progress snapshot + { + let mut lock = 
MIGRATION_PROGRESS_SNAPSHOT.lock().unwrap(); + lock.as_mut() + .expect("live migration should be ongoing") + .mark_as_cancelled(); + } + } + Err(e) => { + Self::log_print_error_chain(&e); + try_resume_vm(vm); + + // Update migration progress snapshot + { + let mut lock = MIGRATION_PROGRESS_SNAPSHOT.lock().unwrap(); + lock.as_mut() + .expect("live migration should be ongoing") + .mark_as_failed(&e); + } + } + } + self.clear_postponed_lifecycle_event(); + } + fn control_loop( &mut self, api_receiver: &Receiver, @@ -1717,11 +2394,25 @@ impl Vmm { info!("VM reset event"); // Consume the event. self.reset_evt.read().map_err(Error::EventFdRead)?; + // Workaround for guest-induced shutdown during a live-migration. + if matches!(self.vm, MaybeVmOwnership::Migration(_)) { + self.postpone_lifecycle_event_during_migration( + PostMigrationLifecycleEvent::VmReboot, + ); + continue; + } self.vm_reboot().map_err(Error::VmReboot)?; } EpollDispatch::GuestExit => { info!("VM guest exit event"); self.guest_exit_evt.read().map_err(Error::EventFdRead)?; + // Workaround for guest-induced shutdown during a live-migration. 
+ if matches!(self.vm, MaybeVmOwnership::Migration(_)) { + self.postpone_lifecycle_event_during_migration( + PostMigrationLifecycleEvent::VmShutdown, + ); + continue; + } if self.no_shutdown { self.vm_shutdown().map_err(Error::VmShutdown)?; } else { @@ -1730,11 +2421,18 @@ impl Vmm { } } EpollDispatch::ActivateVirtioDevices => { - if let Some(ref vm) = self.vm { - let count = self.activate_evt.read().map_err(Error::EventFdRead)?; - info!("Trying to activate pending virtio devices: count = {count}"); - vm.activate_virtio_devices() - .map_err(Error::ActivateVirtioDevices)?; + let count = self.activate_evt.read().map_err(Error::EventFdRead)?; + info!("Trying to activate pending virtio devices: count = {count}"); + match &self.vm { + MaybeVmOwnership::Vmm(vm) => vm + .activate_virtio_devices() + .map_err(Error::ActivateVirtioDevices)?, + MaybeVmOwnership::Migration(state) => { + state + .activate_virtio_devices() + .map_err(Error::ActivateVirtioDevices)?; + } + MaybeVmOwnership::None => {} } } EpollDispatch::Api => { @@ -1755,7 +2453,7 @@ impl Vmm { // Read from the API receiver channel let gdb_request = gdb_receiver.recv().map_err(Error::GdbRequestRecv)?; - let response = if let Some(ref mut vm) = self.vm { + let response = if let MaybeVmOwnership::Vmm(ref mut vm) = self.vm { vm.debug_request(&gdb_request.payload, gdb_request.cpu_id) } else { Err(VmError::VmNotRunning) @@ -1770,6 +2468,14 @@ impl Vmm { } #[cfg(not(feature = "guest_debug"))] EpollDispatch::Debug => {} + EpollDispatch::CheckMigration => { + info!("VM migration check event"); + // Consume the event. + self.check_migration_evt + .read() + .map_err(Error::EventFdRead)?; + self.check_migration_result(); + } } } } @@ -1820,108 +2526,125 @@ impl RequestHandler for Vmm { tracer::start(); info!("Booting VM"); event!("vm", "booting"); - let r = { - trace_scoped!("vm_boot"); - // If we don't have a config, we cannot boot a VM. 
- if self.vm_config.is_none() { - return Err(VmError::VmMissingConfig); - } - // console_info is set to None in vm_shutdown. re-populate here if empty - if self.console_info.is_none() { - self.console_info = - Some(pre_create_console_devices(self).map_err(VmError::CreateConsoleDevices)?); - } + if matches!(self.vm, MaybeVmOwnership::Migration(_)) { + return Err(VmError::VmMigrating); + } - // Create a new VM if we don't have one yet. - if self.vm.is_none() { - let exit_evt = self.exit_evt.try_clone().map_err(VmError::EventFdClone)?; - let reset_evt = self.reset_evt.try_clone().map_err(VmError::EventFdClone)?; - let guest_exit_evt = self - .guest_exit_evt - .try_clone() - .map_err(VmError::EventFdClone)?; - #[cfg(feature = "guest_debug")] - let vm_debug_evt = self - .vm_debug_evt - .try_clone() - .map_err(VmError::EventFdClone)?; - let activate_evt = self - .activate_evt - .try_clone() - .map_err(VmError::EventFdClone)?; - - if let Some(ref vm_config) = self.vm_config { - let vm = Vm::new( - Arc::clone(vm_config), - exit_evt, - reset_evt, - guest_exit_evt, - #[cfg(feature = "guest_debug")] - vm_debug_evt, - &self.seccomp_action, - self.hypervisor.clone(), - activate_evt, - self.console_info.clone(), - self.console_resize_pipe.clone(), - Arc::clone(&self.original_termios_opt), - None, - None, - None, - None, - )?; - - self.vm = Some(vm); - } + // Create a new VM if we don't have one yet. 
+ if matches!(self.vm, MaybeVmOwnership::None) { + let exit_evt = self.exit_evt.try_clone().map_err(VmError::EventFdClone)?; + let reset_evt = self.reset_evt.try_clone().map_err(VmError::EventFdClone)?; + let guest_exit_evt = self + .guest_exit_evt + .try_clone() + .map_err(VmError::EventFdClone)?; + #[cfg(feature = "guest_debug")] + let vm_debug_evt = self + .vm_debug_evt + .try_clone() + .map_err(VmError::EventFdClone)?; + let activate_evt = self + .activate_evt + .try_clone() + .map_err(VmError::EventFdClone)?; + + if let Some(ref vm_config) = self.vm_config { + let vm = Vm::new( + Arc::clone(vm_config), + exit_evt, + reset_evt, + guest_exit_evt, + #[cfg(feature = "guest_debug")] + vm_debug_evt, + &self.seccomp_action, + self.hypervisor.clone(), + activate_evt, + self.console_info.clone(), + self.console_resize_pipe.clone(), + Arc::clone(&self.original_termios_opt), + None, + None, + None, + None, + )?; + + self.vm = MaybeVmOwnership::Vmm(vm); } + } - // Now we can boot the VM. - if let Some(ref mut vm) = self.vm { - vm.boot() - } else { - Err(VmError::VmNotCreated) + // Now we can boot the VM. 
+ match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + vm.boot()?; + event!("vm", "booted"); } - }; - tracer::end(); - if r.is_ok() { - event!("vm", "booted"); + MaybeVmOwnership::None => { + return Err(VmError::VmNotCreated); + } + _ => unreachable!(), } - r + + tracer::end(); + Ok(()) } fn vm_pause(&mut self) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - vm.pause().map_err(VmError::Pause) - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm.pause().map_err(VmError::Pause), + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating)?, + MaybeVmOwnership::None => Err(VmError::VmNotRunning)?, } } fn vm_resume(&mut self) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - vm.resume().map_err(VmError::Resume) - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm.resume().map_err(VmError::Resume), + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating)?, + MaybeVmOwnership::None => Err(VmError::VmNotRunning)?, + } + } + + fn vm_post_migration_announce(&mut self) -> result::Result<(), VmError> { + match self.vm { + MaybeVmOwnership::Vmm(ref vm) => { + if vm.get_state() != VmState::Running { + return Err(VmError::VmNotRunning); + } + + vm.post_migration_announce(); + Ok(()) + } + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating)?, + MaybeVmOwnership::None => Err(VmError::VmNotRunning)?, } } fn vm_snapshot(&mut self, destination_url: &str) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - // Drain console_info so that FDs are not reused - let _ = self.console_info.take(); - vm.snapshot() - .map_err(VmError::Snapshot) - .and_then(|snapshot| { - vm.send(&snapshot, destination_url) - .map_err(VmError::SnapshotSend) - }) - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + // Drain console_info so that FDs are not reused + let _ = 
self.console_info.take(); + vm.snapshot() + .map_err(VmError::Snapshot) + .and_then(|snapshot| { + vm.send(&snapshot, destination_url) + .map_err(VmError::SnapshotSend) + }) + } + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating)?, + MaybeVmOwnership::None => Err(VmError::VmNotRunning)?, } } fn vm_restore(&mut self, restore_cfg: RestoreConfig) -> result::Result<(), VmError> { - if self.vm.is_some() || self.vm_config.is_some() { + match &self.vm { + MaybeVmOwnership::Vmm(_vm) => return Err(VmError::VmAlreadyCreated), + MaybeVmOwnership::Migration(_) => return Err(VmError::VmMigrating), + MaybeVmOwnership::None => (), + } + + if self.vm_config.is_some() { return Err(VmError::VmAlreadyCreated); } @@ -1981,21 +2704,25 @@ impl RequestHandler for Vmm { #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] fn vm_coredump(&mut self, destination_url: &str) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - vm.coredump(destination_url).map_err(VmError::Coredump) - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + vm.coredump(destination_url).map_err(VmError::Coredump) + } + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), + MaybeVmOwnership::None => Err(VmError::VmNotRunning), } } fn vm_shutdown(&mut self) -> result::Result<(), VmError> { - let r = if let Some(ref mut vm) = self.vm.take() { - // Drain console_info so that the FDs are not reused - let _ = self.console_info.take(); - vm.shutdown() - } else { - Err(VmError::VmNotRunning) + let vm = match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm, + MaybeVmOwnership::Migration(_) => return Err(VmError::VmMigrating), + MaybeVmOwnership::None => return Err(VmError::VmNotRunning), }; + // Drain console_info so that the FDs are not reused + let _ = self.console_info.take(); + let r = vm.shutdown(); + self.vm = MaybeVmOwnership::None; if r.is_ok() { event!("vm", "shutdown"); @@ -2008,13 +2735,14 @@ impl RequestHandler for Vmm 
{ event!("vm", "rebooting"); // First we stop the current VM - let config = if let Some(mut vm) = self.vm.take() { - let config = vm.get_config(); - vm.shutdown()?; - config - } else { - return Err(VmError::VmNotCreated); + let vm = match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm, + MaybeVmOwnership::Migration(_) => return Err(VmError::VmMigrating), + MaybeVmOwnership::None => return Err(VmError::VmNotRunning), }; + let config = vm.get_config(); + vm.shutdown()?; + self.vm = MaybeVmOwnership::None; // vm.shutdown() closes all the console devices, so set console_info to None // so that the closed FD #s are not reused. @@ -2069,7 +2797,7 @@ impl RequestHandler for Vmm { // And we boot it vm.boot()?; - self.vm = Some(vm); + self.vm = MaybeVmOwnership::Vmm(vm); event!("vm", "rebooted"); @@ -2077,35 +2805,40 @@ impl RequestHandler for Vmm { } fn vm_info(&self) -> result::Result { - match &self.vm_config { - Some(vm_config) => { - let state = match &self.vm { - Some(vm) => vm.get_state(), - None => VmState::Created, - }; - let config = vm_config.lock().unwrap().clone(); - - let mut memory_actual_size = - config.memory.total_size() - config.memory.hotplugged_size(); - if let Some(vm) = &self.vm { - memory_actual_size = memory_actual_size.saturating_sub(vm.balloon_size()); - memory_actual_size += vm.virtio_mem_plugged_size(); - } - - let device_tree = self - .vm - .as_ref() - .map(|vm| vm.device_tree().lock().unwrap().clone()); + let vm_config = self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; + let vm_config = vm_config.lock().unwrap().clone(); + + let state = match &self.vm { + MaybeVmOwnership::Vmm(vm) => vm.get_state(), + // TODO in theory one could live-migrate a non-running VM .. 
+ MaybeVmOwnership::Migration(_) => VmState::Running, + MaybeVmOwnership::None => VmState::Created, + }; - Ok(VmInfoResponse { - config: Box::new(config), - state, - memory_actual_size, - device_tree, - }) + let mut memory_actual_size = + vm_config.memory.total_size() - vm_config.memory.hotplugged_size(); + match &self.vm { + MaybeVmOwnership::Vmm(vm) => { + memory_actual_size = memory_actual_size.saturating_sub(vm.balloon_size()); + memory_actual_size += vm.virtio_mem_plugged_size(); } - None => Err(VmError::VmNotCreated), + MaybeVmOwnership::Migration(_) => {} + MaybeVmOwnership::None => {} } + + let device_tree = match &self.vm { + MaybeVmOwnership::Vmm(vm) => Some(vm.device_tree().lock().unwrap().clone()), + // TODO we need to fix this + MaybeVmOwnership::Migration(_) => None, + MaybeVmOwnership::None => None, + }; + + Ok(VmInfoResponse { + config: Box::new(vm_config), + state, + memory_actual_size, + device_tree, + }) } fn vmm_ping(&self) -> VmmPingResponse { @@ -2127,14 +2860,19 @@ impl RequestHandler for Vmm { return Ok(()); } - // If a VM is booted, we first try to shut it down. - if self.vm.is_some() { - self.vm_shutdown()?; - } - - self.vm_config = None; + match &self.vm { + MaybeVmOwnership::Vmm(_vm) => { + event!("vm", "deleted"); - event!("vm", "deleted"); + // If a VM is booted, we first try to shut it down. 
+ self.vm_shutdown()?; + self.vm_config = None; + } + MaybeVmOwnership::None => { + self.vm_config = None; + } + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating)?, + } Ok(()) } @@ -2153,59 +2891,80 @@ impl RequestHandler for Vmm { ) -> result::Result<(), VmError> { self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; - if let Some(ref mut vm) = self.vm { - vm.resize(desired_vcpus, desired_ram, desired_balloon) - .inspect_err(|e| error!("Error when resizing VM: {e:?}"))?; - Ok(()) - } else { - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - if let Some(desired_vcpus) = desired_vcpus { - config.cpus.boot_vcpus = desired_vcpus; - } - if let Some(desired_ram) = desired_ram { - config.memory.size = desired_ram; + if desired_vcpus.is_some() { + todo!("doesn't work currently with our thread-local KVM_RUN approach"); + } + + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + vm.resize(desired_vcpus, desired_ram, desired_balloon) + .inspect_err(|e| error!("Error when resizing VM: {e:?}"))?; + Ok(()) } - if let Some(desired_balloon) = desired_balloon - && let Some(balloon_config) = &mut config.balloon - { - balloon_config.size = desired_balloon; + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + if let Some(desired_vcpus) = desired_vcpus { + config.cpus.boot_vcpus = desired_vcpus; + } + if let Some(desired_ram) = desired_ram { + config.memory.size = desired_ram; + } + if let Some(desired_balloon) = desired_balloon + && let Some(balloon_config) = &mut config.balloon + { + balloon_config.size = desired_balloon; + } + + Ok(()) } - Ok(()) } } fn vm_resize_disk(&mut self, id: String, desired_size: u64) -> result::Result<(), VmError> { + info!("request to resize disk: id={id}"); self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; - if let Some(ref mut vm) = self.vm { - return vm.resize_disk(&id, desired_size); + match self.vm 
{ + MaybeVmOwnership::Vmm(ref mut vm) => { + if let Err(e) = vm.resize_disk(&id, desired_size) { + error!("Error when resizing disk: {e:?}"); + Err(e) + } else { + Ok(()) + } + } + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), + MaybeVmOwnership::None => Err(VmError::ResizeDisk), } - - Err(VmError::ResizeDisk) } fn vm_resize_zone(&mut self, id: String, desired_ram: u64) -> result::Result<(), VmError> { self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; - if let Some(ref mut vm) = self.vm { - vm.resize_zone(&id, desired_ram) - .inspect_err(|e| error!("Error when resizing zone: {e:?}"))?; - Ok(()) - } else { - // Update VmConfig by setting the new desired ram. - let memory_config = &mut self.vm_config.as_ref().unwrap().lock().unwrap().memory; - - if let Some(zones) = &mut memory_config.zones { - for zone in zones.iter_mut() { - if zone.id == id { - zone.size = desired_ram; - return Ok(()); + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + vm.resize_zone(&id, desired_ram) + .inspect_err(|e| error!("Error when resizing zone: {e:?}"))?; + Ok(()) + } + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by setting the new desired ram. 
+ let memory_config = &mut self.vm_config.as_ref().unwrap().lock().unwrap().memory; + + if let Some(zones) = &mut memory_config.zones { + for zone in zones.iter_mut() { + if zone.id == id { + zone.size = desired_ram; + return Ok(()); + } } } - } - error!("Could not find the memory zone {id} for the resize"); - Err(VmError::ResizeZone) + error!("Could not find the memory zone {id} for the resize"); + Err(VmError::ResizeZone) + } } } @@ -2222,18 +2981,22 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_device(device_cfg).inspect_err(|e| { - error!("Error when adding new device to the VM: {e:?}"); - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.devices, device_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_device(device_cfg).inspect_err(|e| { + error!("Error when adding new device to the VM: {e:?}"); + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.devices, device_cfg); + Ok(None) + } } } @@ -2250,35 +3013,45 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_user_device(device_cfg).inspect_err(|e| { - error!("Error when adding new user device to the VM: {e:?}"); - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. 
- let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.user_devices, device_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_user_device(device_cfg).inspect_err(|e| { + error!("Error when adding new user device to the VM: {e:?}"); + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.user_devices, device_cfg); + Ok(None) + } } } fn vm_remove_device(&mut self, id: String) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - vm.remove_device(&id) - .inspect_err(|e| error!("Error when removing device from the VM: {e:?}"))?; - Ok(()) - } else if let Some(ref config) = self.vm_config { - let mut config = config.lock().unwrap(); - if config.remove_device(&id) { + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + vm.remove_device(&id) + .inspect_err(|e| error!("Error when removing device from the VM: {e:?}"))?; Ok(()) - } else { - Err(VmError::NoDeviceToRemove(id)) } - } else { - Err(VmError::VmNotCreated) + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + if let Some(ref config) = self.vm_config { + let mut config = config.lock().unwrap(); + if config.remove_device(&id) { + Ok(()) + } else { + Err(VmError::NoDeviceToRemove(id)) + } + } else { + Err(VmError::VmNotCreated) + } + } } } @@ -2292,18 +3065,22 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_disk(disk_cfg).inspect_err(|e| { - error!("Error when adding new disk to the VM: {e:?}"); - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update 
VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.disks, disk_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_disk(disk_cfg).inspect_err(|e| { + error!("Error when adding new disk to the VM: {e:?}"); + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.disks, disk_cfg); + Ok(None) + } } } @@ -2317,52 +3094,32 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_fs(fs_cfg).inspect_err(|e| { - error!("Error when adding new fs to the VM: {e:?}"); - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.fs, fs_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_fs(fs_cfg).inspect_err(|e| { + error!("Error when adding new fs to the VM: {e:?}"); + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. 
+ let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.fs, fs_cfg); + Ok(None) + } } } fn vm_add_generic_vhost_user( &mut self, - generic_vhost_user_cfg: GenericVhostUserConfig, + _generic_vhost_user_cfg: GenericVhostUserConfig, ) -> result::Result>, VmError> { self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; - { - // Validate the configuration change in a cloned configuration - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap().clone(); - add_to_config( - &mut config.generic_vhost_user, - generic_vhost_user_cfg.clone(), - ); - config.validate().map_err(VmError::ConfigValidation)?; - } - - if let Some(ref mut vm) = self.vm { - let info = vm - .add_generic_vhost_user(generic_vhost_user_cfg) - .inspect_err(|e| { - error!("Error when adding new generic vhost-user device to the VM: {e:?}"); - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.generic_vhost_user, generic_vhost_user_cfg); - Ok(None) - } + unimplemented!("removed in our fork for simplicity"); } fn vm_add_pmem(&mut self, pmem_cfg: PmemConfig) -> result::Result>, VmError> { @@ -2375,18 +3132,22 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_pmem(pmem_cfg).inspect_err(|e| { - error!("Error when adding new pmem device to the VM: {e:?}"); - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. 
- let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.pmem, pmem_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_pmem(pmem_cfg).inspect_err(|e| { + error!("Error when adding new pmem device to the VM: {e:?}"); + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.pmem, pmem_cfg); + Ok(None) + } } } @@ -2400,18 +3161,22 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_net(net_cfg).inspect_err(|e| { - error!("Error when adding new network device to the VM: {e:?}"); - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.net, net_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_net(net_cfg).inspect_err(|e| { + error!("Error when adding new network device to the VM: {e:?}"); + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. 
+ let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.net, net_cfg); + Ok(None) + } } } @@ -2425,18 +3190,22 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_vdpa(vdpa_cfg).inspect_err(|e| { - error!("Error when adding new vDPA device to the VM: {e:?}"); - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.vdpa, vdpa_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_vdpa(vdpa_cfg).inspect_err(|e| { + error!("Error when adding new vDPA device to the VM: {e:?}"); + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.vdpa, vdpa_cfg); + Ok(None) + } } } @@ -2455,47 +3224,53 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_vsock(vsock_cfg).inspect_err(|e| { - error!("Error when adding new vsock device to the VM: {e:?}"); - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. 
- let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - config.vsock = Some(vsock_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_vsock(vsock_cfg).inspect_err(|e| { + error!("Error when adding new vsock device to the VM: {e:?}"); + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + config.vsock = Some(vsock_cfg); + Ok(None) + } } } fn vm_counters(&mut self) -> result::Result>, VmError> { - if let Some(ref mut vm) = self.vm { - let info = vm.counters().inspect_err(|e| { - error!("Error when getting counters from the VM: {e:?}"); - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.counters().inspect_err(|e| { + error!("Error when getting counters from the VM: {e:?}"); + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), + MaybeVmOwnership::None => Err(VmError::VmNotRunning), } } fn vm_power_button(&mut self) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - vm.power_button() - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm.power_button(), + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), + MaybeVmOwnership::None => Err(VmError::VmNotRunning), } } fn vm_nmi(&mut self) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - vm.nmi() - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm.nmi(), + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), + MaybeVmOwnership::None => 
Err(VmError::VmNotRunning), } } @@ -2503,55 +3278,83 @@ impl RequestHandler for Vmm { &mut self, receive_data_migration: VmReceiveMigrationData, ) -> result::Result<(), MigratableError> { + receive_data_migration + .validate() + .context("Invalid receive migration configuration") + .map_err(MigratableError::MigrateReceive)?; + + // Prevent stale lifecycle intent from a previous failed receive attempt. + self.received_postponed_lifecycle_event = None; + info!( - "Receiving migration: receiver_url = {}", - receive_data_migration.receiver_url + "Receiving migration: receiver_url={},tls={},net_fds={:?}, tcp_url={:?}, zones={:?}", + receive_data_migration.receiver_url, + receive_data_migration.tls_dir.is_some(), + &receive_data_migration.net_fds, + &receive_data_migration.tcp_serial_url, + &receive_data_migration.zones, ); - let mut listener = - migration_transport::receive_migration_listener(&receive_data_migration.receiver_url)?; + let mut listener = migration_transport::receive_migration_listener( + &receive_data_migration.receiver_url, + receive_data_migration.tls_dir.as_deref(), + )?; // Accept the connection and get the socket - let mut socket = listener.accept()?; + let mut socket = listener + .accept(true) + .inspect_err(|e| warn!("{e}")) + .context("Failed to accept incoming migration") + .map_err(MigratableError::MigrateReceive)?; event!("vm", "migration-receive-started"); let mut state = ReceiveMigrationState::Established; - while !state.finished() { + let res: result::Result = loop { let req = Request::read_from(&mut socket)?; trace!("Command {:?} received", req.command()); - let (response, new_state) = match self.vm_receive_migration_step( + let (response, new_state, mut maybe_error) = match self.vm_receive_migration_step( &mut socket, &listener, state, &req, &receive_data_migration, ) { - Ok(next_state) => (Response::ok(), next_state), + Ok(next_state) => (Response::ok(), next_state, None), Err(err) => { warn!( "Migration aborted as migration command {:?} 
failed: {}", req.command(), err ); - (Response::error(), ReceiveMigrationState::Aborted) + (Response::error(), ReceiveMigrationState::Aborted, Some(err)) } }; state = new_state; assert_eq!(response.length(), 0); response.write_to(&mut socket)?; - } - if let ReceiveMigrationState::Aborted = state { + if maybe_error.is_some() { + break Err(maybe_error.take().unwrap()); + } else if state.finished() { + break Ok(state); + } + }; + + if matches!(res, Err(_) | Ok(ReceiveMigrationState::Aborted)) { event!("vm", "migration-receive-failed"); - self.vm = None; + self.vm = MaybeVmOwnership::None; self.vm_config = None; - } else { - event!("vm", "migration-receive-finished"); + return match res { + Ok(_) => Err(MigratableError::CompleteMigration(anyhow!( + "Migration was aborted by sender" + ))), + Err(e) => Err(MigratableError::CompleteMigration(e.into())), + }; } - + event!("vm", "migration-receive-finished"); Ok(()) } @@ -2564,15 +3367,31 @@ impl RequestHandler for Vmm { .context("Invalid send migration configuration") .map_err(MigratableError::MigrateSend)?; + match self.vm { + MaybeVmOwnership::Vmm(_) => (), + MaybeVmOwnership::Migration(_) => { + return Err(MigratableError::MigrateSend(anyhow!( + "There is already an ongoing migration" + ))); + } + MaybeVmOwnership::None => { + return Err(MigratableError::MigrateSend(anyhow!("VM is not running"))); + } + } + info!( - "Sending migration: destination_url={},local={},downtime={}ms,timeout={}s,timeout_strategy={:?}", + "Sending migration: destination_url={},local={},tls={},downtime={}ms,timeout={}s,timeout_strategy={:?}", send_data_migration.destination_url, send_data_migration.local, + send_data_migration.tls_dir.is_some(), send_data_migration.downtime().as_millis(), send_data_migration.timeout().as_secs(), send_data_migration.timeout_strategy ); + // New migration attempt: clear postponed lifecycle from any previous run. 
+ self.clear_postponed_lifecycle_event(); + if !self .vm_config .as_ref() @@ -2587,10 +3406,17 @@ impl RequestHandler for Vmm { ))); } - let vm = self - .vm - .as_mut() - .ok_or_else(|| MigratableError::MigrateSend(anyhow!("VM is not running")))?; + // Cloud Hypervisor only supports the migration of running VMs. + let current_state = self.vm.vm_mut().as_ref().unwrap().get_state(); + if current_state != VmState::Running { + return Err(MigratableError::MigrateSend(anyhow!(format!( + "Only running VMs can be migrated! state={current_state:?}" + )))); + } + + // Take VM ownership. This also means that API events can no longer + // change the VM (e.g. net device hotplug). + let vm = self.vm.take_vm_for_migration(); let initial_vm_state = vm.get_state(); if initial_vm_state != VmState::Running && initial_vm_state != VmState::Paused { @@ -2599,45 +3425,85 @@ impl RequestHandler for Vmm { ))); } - event!("vm", "migration-started"); - Self::send_migration( + // Update migration progress snapshot early: + // We guarantee that migration statistics can be fetched as soon as SendMigration returns. + // + // If the migration fails, the state will later be updated accordingly. + { + let mut lock = MIGRATION_PROGRESS_SNAPSHOT.lock().unwrap(); + if lock + .as_ref() + .map(|p| &p.state) + .is_some_and(|snapshot| matches!(snapshot, MigrationState::Ongoing { .. })) + { + // If this panic triggers, we made a programming error in our state handling. + panic!("migration already ongoing"); + } + let transportation_mode = if send_data_migration.local { + TransportationMode::Local + } else { + TransportationMode::Tcp { + connections: send_data_migration.connections, + tls: send_data_migration.tls_dir.is_some(), + } + }; + lock.replace(MigrationProgress::new( + transportation_mode, + send_data_migration.downtime(), + )); + } + + // When spawning the thread fails, the VM keeps running normally. 
+ let migration_worker = MigrationWorker::spawn( vm, + self.check_migration_evt.try_clone().unwrap(), + send_data_migration, + self.postponed_lifecycle_event.clone(), #[cfg(all(feature = "kvm", target_arch = "x86_64"))] - self.hypervisor.as_ref(), - &send_data_migration, - initial_vm_state, + self.hypervisor.clone(), ) - .map_err(|migration_err| { - error!("Migration failed: {migration_err:?}"); - event!("vm", "migration-failed"); + .map_err(|(vm, e)| { + self.vm = MaybeVmOwnership::Vmm(vm); - // Stop logging dirty pages only for non-local migrations - if !send_data_migration.local - && let Err(e) = vm.stop_dirty_log() - { - return e; - } + let mut lock = MIGRATION_PROGRESS_SNAPSHOT.lock().unwrap(); + lock.as_mut() + .expect("live migration should be ongoing") + .mark_as_failed(&e); - // Only resume if the VM was originally running; a VM that was already - // paused before migration should remain paused after failure. - if initial_vm_state == VmState::Running - && vm.get_state() == VmState::Paused - && let Err(e) = vm.resume() - { - return e; + e + })?; + let old = self.migration_thread_handle.replace(migration_worker); + // If this fails, we messed up the thread lifecycle management. + debug_assert!(old.is_none()); + + Ok(()) + } + + fn vm_cancel_migration(&mut self) -> result::Result<(), MigratableError> { + match self.vm { + MaybeVmOwnership::Migration(_) => (), + _ => { + return Err(MigratableError::CancelMigration(anyhow!( + "There is no ongoing migration" + ))); } + } - migration_err - })?; + let handle = self + .migration_thread_handle + .as_ref() + .expect("should have handle"); + // We just dispatch the cancellation. 
+ handle.trigger_cancellation(); - event!("vm", "migration-finished"); + Ok(()) + } - // Shutdown the VM after the migration succeeded - self.exit_evt.write(1).map_err(|e| { - MigratableError::MigrateSend(anyhow!( - "Failed shutting down the VM after migration: {e:?}" - )) - }) + fn vm_migration_progress(&mut self) -> Option { + // We explicitly do not check here for `is VM running?` to always + // enable querying the state of the last failed migration. + let lock = MIGRATION_PROGRESS_SNAPSHOT.lock().unwrap(); + lock.clone() } } @@ -2649,6 +3515,8 @@ const DEVICE_MANAGER_SNAPSHOT_ID: &str = "device-manager"; mod unit_tests { use std::path::PathBuf; + use arch::CpuProfile; + use super::*; #[cfg(target_arch = "x86_64")] use crate::vm_config::DebugConsoleConfig; @@ -2686,6 +3554,7 @@ mod unit_tests { features: CpuFeatures::default(), nested: true, core_scheduling: CoreScheduling::default(), + profile: CpuProfile::default(), }, memory: MemoryConfig { size: 536_870_912, @@ -2728,6 +3597,7 @@ mod unit_tests { file: None, mode: ConsoleOutputMode::Null, socket: None, + url: None, }, }, console: ConsoleConfig { @@ -2736,6 +3606,7 @@ mod unit_tests { // Caution: Don't use `Tty` to not mess with users terminal mode: ConsoleOutputMode::Off, socket: None, + url: None, }, pci_common: PciDeviceCommonConfig::default(), }, @@ -2961,6 +3832,7 @@ mod unit_tests { ); } + #[ignore] // skipped in our fork for simplicity #[test] fn test_vmm_vm_cold_add_generic_vhost_user() { let mut vmm = create_dummy_vmm(); diff --git a/vmm/src/memory_manager.rs b/vmm/src/memory_manager.rs index dfe0f3e490..edbb918dd3 100644 --- a/vmm/src/memory_manager.rs +++ b/vmm/src/memory_manager.rs @@ -19,7 +19,7 @@ use std::sync::{Arc, Barrier, Mutex}; use std::{ffi, result, thread}; use acpi_tables::{Aml, aml}; -use anyhow::anyhow; +use anyhow::{Context, anyhow}; use arch::RegionType; #[cfg(target_arch = "x86_64")] use devices::ioapic; @@ -439,6 +439,10 @@ pub enum Error { /// Memory size is misaligned with 
default page size or its hugepage size #[error("Memory size is misaligned with default page size or its hugepage size")] MisalignedMemorySize, + + /// Failed to prefault memory + #[error("Failed to prefault memory")] + PrefaultMemory(#[source] io::Error), } impl From for Error { @@ -1571,7 +1575,6 @@ impl MemoryManager { .filter(|r| r.2 == RegionType::Ram) .map(|r| (r.0, r.1)) .collect(); - let arch_mem_regions: Vec = arch_mem_regions .iter() .map(|(a, b, c)| ArchMemRegion { @@ -1932,8 +1935,8 @@ impl MemoryManager { mmap_flags |= libc::MAP_SHARED; Some(Self::create_anonymous_file(size, hugepages, hugepage_size)?) } else { - mmap_flags |= libc::MAP_PRIVATE; - Some(Self::create_anonymous_file(size, hugepages, hugepage_size)?) + mmap_flags |= libc::MAP_PRIVATE | libc::MAP_ANONYMOUS; + None }; let region = MmapRegion::build(fo, size, libc::PROT_READ | libc::PROT_WRITE, mmap_flags) @@ -1967,6 +1970,10 @@ impl MemoryManager { // MPOL_BIND is the selected mode as it specifies a strict policy // that restricts memory allocation to the nodes specified in the // nodemask. + info!( + "Creating raw memory region: host-addr={:018x}, len={len}, mode={mode}, host-node={node}", + addr as u64 + ); Self::mbind(addr, len, mode, &nodemask, maxnode, flags) .map_err(Error::ApplyNumaPolicy)?; } @@ -1988,29 +1995,49 @@ impl MemoryManager { let remainder = num_pages % num_threads; let barrier = Arc::new(Barrier::new(num_threads)); - thread::scope(|s| { + thread::scope(|s| -> Result<(), Error> { let r = ®ion; + let mut handles = Vec::new(); for i in 0..num_threads { let barrier = Arc::clone(&barrier); - s.spawn(move || { - // Wait until all threads have been spawned to avoid contention - // over mmap_sem between thread stack allocation and page faulting. 
- barrier.wait(); - let pages = pages_per_thread + if i < remainder { 1 } else { 0 }; - let offset = - page_size * ((i * pages_per_thread) + std::cmp::min(i, remainder)); - // SAFETY: FFI call with correct arguments - let ret = unsafe { - let addr = r.as_ptr().add(offset); - libc::madvise(addr.cast(), pages * page_size, libc::MADV_POPULATE_WRITE) - }; - if ret != 0 { - let e = io::Error::last_os_error(); - warn!("Failed to prefault pages: {e}"); - } - }); + let h: thread::ScopedJoinHandle<'_, Result<(), io::Error>> = + s.spawn(move || { + // Wait until all threads have been spawned to avoid contention + // over mmap_sem between thread stack allocation and page faulting. + barrier.wait(); + let pages = pages_per_thread + if i < remainder { 1 } else { 0 }; + let offset = + page_size * ((i * pages_per_thread) + std::cmp::min(i, remainder)); + // SAFETY: FFI call with correct arguments + let ret = unsafe { + let addr = r.as_ptr().add(offset); + libc::madvise( + addr.cast(), + pages * page_size, + libc::MADV_POPULATE_WRITE, + ) + }; + if ret != 0 { + let e = io::Error::last_os_error(); + warn!("Failed to prefault pages: {e}"); + return Err(e); + } + Ok(()) + }); + handles.push(h); } - }); + + for handle in handles { + handle + .join() + .map_err(|_| { + Error::PrefaultMemory(io::Error::other("Prefault thread died")) + })? 
+ .map_err(Error::PrefaultMemory)?; + } + + Ok(()) + })?; } info!( @@ -3159,7 +3186,8 @@ impl Transportable for MemoryManager { .write(true) .create_new(true) .open(&memory_file_path) - .map_err(|e| MigratableError::MigrateSend(e.into()))?; + .context("Error creating snapshot file for memory") + .map_err(MigratableError::MigrateSend)?; let total_len: u64 = self .snapshot_memory_ranges @@ -3221,7 +3249,8 @@ impl Transportable for MemoryManager { &mut memory_file, (range.length - offset) as usize, ) - .map_err(|e| MigratableError::MigrateSend(e.into()))?; + .context("Error writing guest memory to snapshot file") + .map_err(MigratableError::MigrateSend)?; offset += bytes_written as u64; if offset == range.length { break; @@ -3243,9 +3272,10 @@ impl Migratable for MemoryManager { // Just before we do a bulk copy we want to start/clear the dirty log so that // pages touched during our bulk copy are tracked. fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { - self.vm.start_dirty_log().map_err(|e| { - MigratableError::MigrateSend(anyhow!("Error starting VM dirty log {e}")) - })?; + self.vm + .start_dirty_log() + .context("Error starting VM dirty log") + .map_err(MigratableError::MigrateSend)?; for r in self.guest_memory.memory().iter() { (**r).bitmap().reset(); @@ -3255,9 +3285,10 @@ impl Migratable for MemoryManager { } fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { - self.vm.stop_dirty_log().map_err(|e| { - MigratableError::MigrateSend(anyhow!("Error stopping VM dirty log {e}")) - })?; + self.vm + .stop_dirty_log() + .context("Error stopping VM dirty log") + .map_err(MigratableError::MigrateSend)?; Ok(()) } @@ -3267,9 +3298,11 @@ impl Migratable for MemoryManager { fn dirty_log(&mut self) -> std::result::Result { let mut table = MemoryRangeTable::default(); for r in &self.guest_ram_mappings { - let vm_dirty_bitmap = self.vm.get_dirty_log(r.slot, r.gpa, r.size).map_err(|e| { - 
MigratableError::MigrateSend(anyhow!("Error getting VM dirty log {e}")) - })?; + let vm_dirty_bitmap = self + .vm + .get_dirty_log(r.slot, r.gpa, r.size) + .context("Error getting VM dirty log") + .map_err(MigratableError::MigrateSend)?; let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa)) { Some(region) => { diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index 6440dc8fd6..e73b8bfcd5 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -7,12 +7,14 @@ use std::io::{self, ErrorKind, Read, Write}; use std::net::{TcpListener, TcpStream}; use std::num::NonZeroU32; use std::os::fd::{AsFd, BorrowedFd}; -use std::os::unix::io::{AsRawFd, RawFd}; +use std::os::unix::io::AsRawFd; use std::os::unix::net::{UnixListener, UnixStream}; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use std::result::Result; use std::sync::atomic::{AtomicBool, Ordering}; -use std::sync::mpsc::{Receiver, Sender, SyncSender, TrySendError, channel, sync_channel}; +use std::sync::mpsc::{ + Receiver, Sender, SyncSender, TryRecvError, TrySendError, channel, sync_channel, +}; use std::sync::{Arc, Mutex}; use std::thread; use std::time::Duration; @@ -25,9 +27,12 @@ use vm_memory::{ Bytes, GuestAddress, GuestAddressSpace, GuestMemoryAtomic, ReadVolatile, VolatileMemoryError, VolatileSlice, WriteVolatile, }; +use vm_migration::keep_alive_stream::KeepAliveStream; use vm_migration::protocol::{Command, MemoryRangeTable, Request, Response}; +use vm_migration::tls::{TlsServerConfig, TlsStream}; use vm_migration::{MigratableError, Snapshot}; use vmm_sys_util::eventfd::EventFd; +use vmm_sys_util::timerfd::TimerFd; use crate::sync_utils::Gate; use crate::{GuestMemoryMmap, VmMigrationConfig}; @@ -36,27 +41,118 @@ use crate::{GuestMemoryMmap, VmMigrationConfig}; /// receiver side. pub(crate) const MAX_MIGRATION_CONNECTIONS: u32 = 128; +/// The time a writer may block on a socket until it throws an error. 
+/// +/// Also the interval at which the [`KeepAliveStream`] sends keep alive messages. +/// +/// # Relation with [`MIGRATION_READ_TIMEOUT_DURATION`] +/// +/// This timeout has to be smaller than [`MIGRATION_READ_TIMEOUT_DURATION`], +/// otherwise spurious timeouts may happen. +const MIGRATION_WRITE_TIMEOUT_DURATION: Duration = Duration::from_secs(5); + +/// The time a reader may block on a socket until it throws an error. +/// +/// # Relation with [`MIGRATION_WRITE_TIMEOUT_DURATION`] +/// +/// This timeout has to be larger than [`MIGRATION_WRITE_TIMEOUT_DURATION`], +/// otherwise spurious timeouts may happen. +const MIGRATION_READ_TIMEOUT_DURATION: Duration = { + let migration_read_timeout_duration = Duration::from_secs(10); + + // This timeout has to be larger than [`MIGRATION_WRITE_TIMEOUT_DURATION`], + // otherwise spurious timeouts may happen. + assert!( + MIGRATION_WRITE_TIMEOUT_DURATION.as_millis() < migration_read_timeout_duration.as_millis(), + "MIGRATION_WRITE_TIMEOUT_DURATION must be smaller than MIGRATION_READ_TIMEOUT_DURATION", + ); + migration_read_timeout_duration +}; + +/// The timeout of the migration-receiver. +/// +/// We set this to a relatively high number to ease local development with +/// `ch-remote`. For production, this has no negative impacts as the management +/// software has full control over the Cloud Hypervisor process and will kill +/// the process on terminated migration. The timeout is used as a fallback +/// if the management software doesn't kill the process correctly. 
+const MIGRATION_ACCEPT_TIMEOUT_DURATION: Duration = Duration::from_secs(60); + +fn set_migration_socket_timeouts(socket: &TcpStream) -> anyhow::Result<()> { + socket + .set_read_timeout(Some(MIGRATION_READ_TIMEOUT_DURATION)) + .context("Error setting read timeout on TCP socket")?; + socket + .set_write_timeout(Some(MIGRATION_WRITE_TIMEOUT_DURATION)) + .context("Error setting write timeout on TCP socket")?; + Ok(()) +} + /// Transport-agnostic listener used to receive connections. #[derive(Debug)] pub(crate) enum ReceiveListener { Tcp(TcpListener), Unix(UnixListener), + Tls(TcpListener, TlsServerConfig), } impl ReceiveListener { /// Block until a connection is accepted. - pub(crate) fn accept(&mut self) -> Result { + pub(crate) fn accept( + &mut self, + main_connection: bool, + ) -> Result { match self { - ReceiveListener::Tcp(listener) => listener - .accept() - .map(|(socket, _)| SocketStream::Tcp(socket)) - .context("Failed to accept TCP migration connection") - .map_err(MigratableError::MigrateReceive), + ReceiveListener::Tcp(listener) => { + info!( + "Waiting for incoming migration via TCP (timeout {}s) ...", + MIGRATION_ACCEPT_TIMEOUT_DURATION.as_secs() + ); + let (socket, _) = accept_with_timeout(listener, MIGRATION_ACCEPT_TIMEOUT_DURATION) + .context("Failed to accept TCP migration connection") + .map_err(MigratableError::MigrateReceive)?; + set_migration_socket_timeouts(&socket).map_err(MigratableError::MigrateReceive)?; + + let socket = SocketStream::Tcp(socket); + if main_connection { + KeepAliveStream::new(socket, MIGRATION_WRITE_TIMEOUT_DURATION, false) + .map(SocketStream::KeepAlive) + .context("Error creating keep-alive migration stream") + .map_err(MigratableError::MigrateReceive) + } else { + Ok(socket) + } + } ReceiveListener::Unix(listener) => listener .accept() .map(|(socket, _)| SocketStream::Unix(socket)) .context("Failed to accept Unix migration connection") .map_err(MigratableError::MigrateReceive), + ReceiveListener::Tls(listener, config) => 
{ + info!( + "Waiting for incoming migration via TCP/TLS (timeout {}s) ...", + MIGRATION_ACCEPT_TIMEOUT_DURATION.as_secs() + ); + let (socket, _) = accept_with_timeout(listener, MIGRATION_ACCEPT_TIMEOUT_DURATION) + .context("Failed to accept TCP connection") + .map_err(MigratableError::MigrateReceive)?; + set_migration_socket_timeouts(&socket).map_err(MigratableError::MigrateReceive)?; + + let socket = TlsStream::new_server(socket, config) + .map(Box::new) + .map(SocketStream::Tls) + .context("Failed to accept TLS migration connection") + .map_err(MigratableError::MigrateReceive)?; + + if main_connection { + KeepAliveStream::new(socket, MIGRATION_WRITE_TIMEOUT_DURATION, false) + .map(SocketStream::KeepAlive) + .context("Error creating keep-alive migration stream") + .map_err(MigratableError::MigrateReceive) + } else { + Ok(socket) + } + } } } @@ -70,7 +166,7 @@ impl ReceiveListener { .map_err(MigratableError::MigrateReceive)? { // The listener is readable; accept the connection. - Ok(Some(self.accept()?)) + Ok(Some(self.accept(false)?)) } else { // The abort event was signaled before any connection arrived. Ok(None) @@ -90,15 +186,41 @@ impl ReceiveListener { .map(ReceiveListener::Unix) .context("Failed to clone Unix listener") .map_err(MigratableError::MigrateReceive), + ReceiveListener::Tls(listener, config) => listener + .try_clone() + .map(|listener| ReceiveListener::Tls(listener, config.clone())) + .context("Failed to clone TLS listener") + .map_err(MigratableError::MigrateReceive), } } } +/// Same as [`TcpListener::accept`], but returns an error if `timeout` expires. +fn accept_with_timeout( + listener: &TcpListener, + timeout: Duration, +) -> Result<(TcpStream, std::net::SocketAddr), io::Error> { + let mut timer_fd = TimerFd::new()?; + timer_fd + .reset(timeout, None) + .map_err(|e| io::Error::from_raw_os_error(e.errno()))?; + + wait_for_readable(listener, &timer_fd)? 
+ .then(|| listener.accept()) + .ok_or_else(|| { + io::Error::new( + io::ErrorKind::TimedOut, + "Timed out waiting for sender to connect.", + ) + })? +} + impl AsFd for ReceiveListener { fn as_fd(&self) -> BorrowedFd<'_> { match self { ReceiveListener::Tcp(listener) => listener.as_fd(), ReceiveListener::Unix(listener) => listener.as_fd(), + ReceiveListener::Tls(listener, _) => listener.as_fd(), } } } @@ -107,6 +229,8 @@ impl AsFd for ReceiveListener { pub(crate) enum SocketStream { Unix(UnixStream), Tcp(TcpStream), + Tls(Box), + KeepAlive(KeepAliveStream), } impl Read for SocketStream { @@ -114,6 +238,8 @@ impl Read for SocketStream { match self { SocketStream::Unix(stream) => stream.read(buf), SocketStream::Tcp(stream) => stream.read(buf), + SocketStream::Tls(stream) => stream.read(buf), + SocketStream::KeepAlive(stream) => stream.read(buf), } } } @@ -123,6 +249,8 @@ impl Write for SocketStream { match self { SocketStream::Unix(stream) => stream.write(buf), SocketStream::Tcp(stream) => stream.write(buf), + SocketStream::Tls(stream) => stream.write(buf), + SocketStream::KeepAlive(stream) => stream.write(buf), } } @@ -130,15 +258,8 @@ impl Write for SocketStream { match self { SocketStream::Unix(stream) => stream.flush(), SocketStream::Tcp(stream) => stream.flush(), - } - } -} - -impl AsRawFd for SocketStream { - fn as_raw_fd(&self) -> RawFd { - match self { - SocketStream::Unix(s) => s.as_raw_fd(), - SocketStream::Tcp(s) => s.as_raw_fd(), + SocketStream::Tls(stream) => stream.flush(), + SocketStream::KeepAlive(stream) => stream.flush(), } } } @@ -148,6 +269,8 @@ impl AsFd for SocketStream { match self { SocketStream::Unix(s) => s.as_fd(), SocketStream::Tcp(s) => s.as_fd(), + SocketStream::Tls(s) => s.as_fd(), + SocketStream::KeepAlive(s) => s.as_fd(), } } } @@ -160,16 +283,8 @@ impl ReadVolatile for SocketStream { match self { SocketStream::Unix(s) => s.read_volatile(buf), SocketStream::Tcp(s) => s.read_volatile(buf), - } - } - - fn read_exact_volatile( - &mut 
self, - buf: &mut VolatileSlice, - ) -> Result<(), VolatileMemoryError> { - match self { - SocketStream::Unix(s) => s.read_exact_volatile(buf), - SocketStream::Tcp(s) => s.read_exact_volatile(buf), + SocketStream::Tls(s) => s.read_volatile(buf), + SocketStream::KeepAlive(s) => s.read_volatile(buf), } } } @@ -182,16 +297,8 @@ impl WriteVolatile for SocketStream { match self { SocketStream::Unix(s) => s.write_volatile(buf), SocketStream::Tcp(s) => s.write_volatile(buf), - } - } - - fn write_all_volatile( - &mut self, - buf: &VolatileSlice, - ) -> Result<(), VolatileMemoryError> { - match self { - SocketStream::Unix(s) => s.write_all_volatile(buf), - SocketStream::Tcp(s) => s.write_all_volatile(buf), + SocketStream::Tls(s) => s.write_volatile(buf), + SocketStream::KeepAlive(s) => s.write_volatile(buf), } } } @@ -478,6 +585,9 @@ pub(crate) struct SendAdditionalConnections { /// this using this flag. Only the main thread checks this variable, the worker /// threads will be stopped during cleanup. worker_error: Arc, + /// Externally triggered cancellation. Workers drain queued memory messages + /// after this is set and wait for the disconnect message. + external_cancel: Arc, /// After the main thread sent all memory chunks to the sender threads, it waits /// until one of the workers notifies it. Either because an error occurred, or /// because they arrived at the gate. 
@@ -512,6 +622,7 @@ impl SendAdditionalConnections { pub(crate) fn new( destination: &str, connections: NonZeroU32, + tls_dir: Option<&Path>, guest_memory: &GuestMemoryAtomic, ) -> Result { let mut threads = Vec::new(); @@ -519,6 +630,7 @@ impl SendAdditionalConnections { let buffer_size = Self::BUFFERED_REQUESTS_PER_THREAD * configured_connections as usize; let (message_tx, message_rx) = sync_channel::(buffer_size); let worker_error = Arc::new(AtomicBool::new(false)); + let external_cancel = Arc::new(AtomicBool::new(false)); let (notify_tx, notify_rx) = channel::(); // If one connection is configured, we don't have to create any additional threads. @@ -529,6 +641,7 @@ impl SendAdditionalConnections { threads, message_tx, worker_error, + external_cancel, notify_rx, }); } @@ -538,10 +651,11 @@ impl SendAdditionalConnections { // the memory chunks to the workers, but does not send memory anymore. Thus in // this case we create one additional thread for each connection. for n in 0..configured_connections { - let mut socket = send_migration_socket(destination)?; + let mut socket = send_migration_socket(destination, tls_dir)?; let guest_memory = guest_memory.clone(); let message_rx = message_rx.clone(); let worker_error = worker_error.clone(); + let external_cancel = external_cancel.clone(); let notify_tx = notify_tx.clone(); let thread = thread::Builder::new() @@ -552,6 +666,7 @@ impl SendAdditionalConnections { &guest_memory, &message_rx, &worker_error, + &external_cancel, &notify_tx, ) }) @@ -574,6 +689,7 @@ impl SendAdditionalConnections { threads, message_tx, worker_error, + external_cancel, notify_rx, }) } @@ -583,6 +699,7 @@ impl SendAdditionalConnections { guest_memory: &GuestMemoryAtomic, message_rx: &Mutex>, worker_error: &AtomicBool, + external_cancel: &AtomicBool, notify_tx: &Sender, ) -> Result<(), MigratableError> { info!("Spawned thread to send VM memory."); @@ -607,6 +724,10 @@ impl SendAdditionalConnections { })?; match message { 
SendMemoryThreadMessage::Memory(table) => { + if external_cancel.load(Ordering::Acquire) { + continue; + } + send_memory_ranges(guest_memory, &table, socket) .inspect_err(|_| { worker_error.store(true, Ordering::Relaxed); @@ -643,6 +764,7 @@ impl SendAdditionalConnections { &mut self, table: MemoryRangeTable, socket: &mut SocketStream, + return_if_cancelled_cb: &impl Fn(&mut SocketStream) -> Result<(), MigratableError>, ) -> Result { if table.regions().is_empty() { return Ok(false); @@ -650,17 +772,25 @@ impl SendAdditionalConnections { // If we use only one connection, we send the memory directly. if self.threads.is_empty() { - send_memory_ranges(&self.guest_memory, &table, socket)?; + for chunk in table.partition(Self::CHUNK_SIZE) { + return_if_cancelled_cb(socket) + .inspect_err(|_| info!("cancelling migration during memory iteration"))?; + send_memory_ranges(&self.guest_memory, &chunk, socket)?; + } return Ok(true); } // The chunk size is chosen to be big enough so that even very fast links need some // milliseconds to send it. for chunk in table.partition(Self::CHUNK_SIZE) { + return_if_cancelled_cb(socket).inspect_err(|_| { + info!("cancelling migration during memory iteration"); + self.external_cancel.store(true, Ordering::Release); + })?; self.send_chunk(chunk)?; } - self.wait_for_pending_data()?; + self.wait_for_pending_data(socket, return_if_cancelled_cb)?; Ok(true) } @@ -695,7 +825,11 @@ impl SendAdditionalConnections { } /// Wait until all data that is in-flight has actually been sent and acknowledged. - fn wait_for_pending_data(&mut self) -> Result<(), MigratableError> { + fn wait_for_pending_data( + &mut self, + socket: &mut SocketStream, + return_if_cancelled_cb: &impl Fn(&mut SocketStream) -> Result<(), MigratableError>, + ) -> Result<(), MigratableError> { let gate = Arc::new(Gate::new()); for _ in 0..self.threads.len() { self.message_tx @@ -709,26 +843,34 @@ impl SendAdditionalConnections { // they arrived at the gate. 
let mut seen_threads = 0; loop { - match self - .notify_rx - .recv() - .context("Error receiving message from workers") - .map_err(MigratableError::MigrateSend)? - { - SendMemoryThreadNotify::Gate => { + return_if_cancelled_cb(socket).inspect_err(|_| { + gate.open(); + self.external_cancel.store(true, Ordering::Release); + })?; + + thread::sleep(Duration::from_millis(2)); + + match self.notify_rx.try_recv() { + Ok(SendMemoryThreadNotify::Gate) => { seen_threads += 1; if seen_threads == self.threads.len() { gate.open(); return Ok(()); } } - SendMemoryThreadNotify::Error => { + Ok(SendMemoryThreadNotify::Error) => { // If an error occurred in one of the worker threads, we open // the gate to make sure that no thread hangs. After that, we // receive the error from Self::cleanup() and return it. gate.open(); return self.cleanup(); } + Err(TryRecvError::Empty) => {} + Err(TryRecvError::Disconnected) => { + return Err(MigratableError::MigrateSend(anyhow!( + "All senders died unexpectedly." + ))); + } } } } @@ -784,9 +926,40 @@ fn socket_url_to_path(url: &str) -> Result { .map(|s| s.into()) } +/// Extract the server name from a TCP address. This function assumes that +/// `tcp:` has already been stripped. 
+fn tcp_address_to_server_name(address: &str) -> Result<&str, anyhow::Error> { + if let Some(rest) = address.strip_prefix('[') { + let (host, port) = rest + .split_once(']') + .ok_or_else(|| anyhow!("Could not extract host from TCP address: {address}"))?; + + if host.is_empty() || !port.starts_with(':') || port.len() == 1 { + return Err(anyhow!( + "Could not extract host from TCP address: {address}" + )); + } + + Ok(host) + } else { + let (host, port) = address + .rsplit_once(':') + .ok_or_else(|| anyhow!("Could not extract host from TCP address: {address}"))?; + + if host.is_empty() || port.is_empty() { + return Err(anyhow!( + "Could not extract host from TCP address: {address}" + )); + } + + Ok(host) + } +} + /// Connect to a migration endpoint and return the established stream. pub(crate) fn send_migration_socket( destination_url: &str, + tls_dir: Option<&Path>, ) -> Result { if let Some(address) = destination_url.strip_prefix("tcp:") { info!("Connecting to TCP socket at {address}"); @@ -794,8 +967,20 @@ pub(crate) fn send_migration_socket( let socket = TcpStream::connect(address).map_err(|e| { MigratableError::MigrateSend(anyhow!("Error connecting to TCP socket: {e}")) })?; + set_migration_socket_timeouts(&socket).map_err(MigratableError::MigrateSend)?; - Ok(SocketStream::Tcp(socket)) + if let Some(tls_dir) = tls_dir { + let server_name = tcp_address_to_server_name(address) + .context("Error extracting TLS server name from destination URL") + .map_err(MigratableError::MigrateSend)?; + TlsStream::new_client(socket, tls_dir, server_name) + .map(Box::new) + .map(SocketStream::Tls) + .context("Error creating TLS migration stream") + .map_err(MigratableError::MigrateSend) + } else { + Ok(SocketStream::Tcp(socket)) + } } else { let path = socket_url_to_path(destination_url).map_err(MigratableError::MigrateSend)?; info!("Connecting to UNIX socket at {path:?}"); @@ -808,15 +993,41 @@ pub(crate) fn send_migration_socket( } } +/// Connect to the main migration endpoint 
and keep the connection active while +/// memory is transferred over additional streams. +pub(crate) fn send_migration_socket_with_keep_alive( + destination_url: &str, + tls_dir: Option<&Path>, +) -> Result { + match send_migration_socket(destination_url, tls_dir)? { + socket @ (SocketStream::Tcp(_) | SocketStream::Tls(_)) => { + KeepAliveStream::new(socket, MIGRATION_WRITE_TIMEOUT_DURATION, true) + .map(SocketStream::KeepAlive) + .context("Error creating keep-alive migration stream") + .map_err(MigratableError::MigrateSend) + } + socket => Ok(socket), + } +} + /// Bind a migration listener for the receiver side. pub(crate) fn receive_migration_listener( receiver_url: &str, + tls_dir: Option<&Path>, ) -> Result { if let Some(address) = receiver_url.strip_prefix("tcp:") { - TcpListener::bind(address) - .map(ReceiveListener::Tcp) + let listener = TcpListener::bind(address) .context("Error binding to TCP socket") - .map_err(MigratableError::MigrateReceive) + .map_err(MigratableError::MigrateReceive)?; + + if let Some(tls_dir) = tls_dir { + let config = TlsServerConfig::new(tls_dir) + .context("Error creating TLS server config") + .map_err(MigratableError::MigrateReceive)?; + Ok(ReceiveListener::Tls(listener, config)) + } else { + Ok(ReceiveListener::Tcp(listener)) + } } else { let path = socket_url_to_path(receiver_url).map_err(MigratableError::MigrateReceive)?; UnixListener::bind(&path) @@ -831,9 +1042,7 @@ pub(crate) fn expect_ok_response( socket: &mut SocketStream, error: MigratableError, ) -> Result<(), MigratableError> { - Response::read_from(socket)? - .ok_or_abandon(socket, error) - .map(|_| ()) + Response::read_from(socket)?.ok_or_error(error).map(|_| ()) } /// Send a request and validate that the peer responds with OK. 
@@ -977,3 +1186,32 @@ pub(crate) fn receive_memory_ranges( Ok(()) } + +#[cfg(test)] +mod tests { + use super::tcp_address_to_server_name; + + #[test] + fn test_tcp_address_to_server_name() { + assert_eq!( + tcp_address_to_server_name("example.com:1234").unwrap(), + "example.com" + ); + assert_eq!( + tcp_address_to_server_name("192.0.2.1:1234").unwrap(), + "192.0.2.1" + ); + assert_eq!( + tcp_address_to_server_name("[2001:db8::1]:1234").unwrap(), + "2001:db8::1" + ); + } + + #[test] + fn test_tcp_address_to_server_name_rejects_invalid_addresses() { + tcp_address_to_server_name("example.com").unwrap_err(); + tcp_address_to_server_name(":1234").unwrap_err(); + tcp_address_to_server_name("[2001:db8::1]").unwrap_err(); + tcp_address_to_server_name("[2001:db8::1]1234").unwrap_err(); + } +} diff --git a/vmm/src/seccomp_filters.rs b/vmm/src/seccomp_filters.rs index fb5aaedf9f..8b4996ccc5 100644 --- a/vmm/src/seccomp_filters.rs +++ b/vmm/src/seccomp_filters.rs @@ -84,6 +84,7 @@ mod kvm { pub const KVM_CHECK_EXTENSION: u64 = 0xae03; pub const KVM_GET_VCPU_MMAP_SIZE: u64 = 0xae04; pub const KVM_CREATE_VCPU: u64 = 0xae41; + pub const KVM_X86_SET_MSR_FILTER: u64 = 0x4188aec6; pub const KVM_CREATE_IRQCHIP: u64 = 0xae60; pub const KVM_RUN: u64 = 0xae80; pub const KVM_SET_MP_STATE: u64 = 0x4004_ae99; @@ -235,6 +236,7 @@ fn create_vmm_ioctl_seccomp_rule_common_kvm() -> Result, Backen Ok(or![ and![Cond::new(1, ArgLen::Dword, Eq, KVM_CHECK_EXTENSION)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_CREATE_DEVICE,)?], + and![Cond::new(1, ArgLen::Dword, Eq, KVM_X86_SET_MSR_FILTER,)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_CREATE_IRQCHIP,)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_CREATE_VCPU)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_CREATE_VM)?], @@ -443,6 +445,7 @@ fn create_vmm_ioctl_seccomp_rule_kvm() -> Result, BackendError> const KVM_GET_FPU: u64 = 0x81a0_ae8c; const KVM_GET_LAPIC: u64 = 0x8400_ae8e; const KVM_GET_MSR_INDEX_LIST: u64 = 0xc004_ae02; + const 
KVM_GET_MSR_FEATURE_INDEX_LIST: u64 = 0xc004_ae0a; const KVM_GET_MSRS: u64 = 0xc008_ae88; const KVM_GET_SREGS: u64 = 0x8138_ae83; const KVM_GET_TSC_KHZ: u64 = 0xaea3; @@ -472,6 +475,12 @@ fn create_vmm_ioctl_seccomp_rule_kvm() -> Result, BackendError> and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_FPU)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_LAPIC)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_MSR_INDEX_LIST)?], + and![Cond::new( + 1, + ArgLen::Dword, + Eq, + KVM_GET_MSR_FEATURE_INDEX_LIST + )?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_MSRS)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_SREGS)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_TSC_KHZ)?], @@ -976,6 +985,9 @@ fn http_api_thread_rules() -> Result)>, BackendError> (libc::SYS_sendto, vec![]), (libc::SYS_sigaltstack, vec![]), (libc::SYS_write, vec![]), + (libc::SYS_rt_sigprocmask, vec![]), + (libc::SYS_getcwd, vec![]), + (libc::SYS_clock_nanosleep, vec![]), ]) } diff --git a/vmm/src/serial_manager.rs b/vmm/src/serial_manager.rs index 27f359bec6..638a2ff37d 100644 --- a/vmm/src/serial_manager.rs +++ b/vmm/src/serial_manager.rs @@ -3,9 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // +use std::any::TypeId; +use std::collections::HashMap; use std::fs::File; -use std::io::Read; -use std::net::Shutdown; +use std::io::{Read, Write}; +use std::net::{Shutdown, TcpStream}; use std::os::fd::OwnedFd; use std::os::unix::io::{AsRawFd, FromRawFd}; use std::os::unix::net::UnixStream; @@ -69,9 +71,9 @@ pub enum Error { #[error("Error accepting connection")] AcceptConnection(#[source] io::Error), - /// Cannot clone the UnixStream - #[error("Error cloning UnixStream")] - CloneUnixStream(#[source] io::Error), + /// Cannot clone the Stream + #[error("Error cloning Stream")] + CloneStream(#[source] io::Error), /// Cannot shutdown the connection #[error("Error shutting down a connection")] @@ -93,9 +95,10 @@ pub enum EpollDispatch { File = 0, Kill = 1, Socket = 2, + Tcp = 3, Unknown, } -const EPOLL_EVENTS_LEN: 
usize = 4; +const EPOLL_EVENTS_LEN: usize = 5; impl From for EpollDispatch { fn from(v: u64) -> Self { @@ -104,11 +107,64 @@ impl From for EpollDispatch { 0 => File, 1 => Kill, 2 => Socket, + 3 => Tcp, _ => Unknown, } } } +/// A thread-safe writer that fans out to multiple keyed writers. Allows for +/// bundling different kinds of writers for the serial device, e.g. writing to +/// a TCP socket and a file. +#[derive(Clone)] +pub struct FanoutWriter { + writers: Arc>>>, +} + +impl FanoutWriter { + pub fn new() -> Self { + FanoutWriter { + writers: Arc::new(Mutex::new(HashMap::new())), + } + } + + pub fn add_writer(&self, writer: W) { + let mut writers = self.writers.lock().unwrap(); + writers.insert(TypeId::of::(), Box::new(writer)); + } + + pub fn remove_writer(&self, id: TypeId) -> Option> { + let mut writers = self.writers.lock().unwrap(); + writers.remove(&id) + } +} + +impl Write for FanoutWriter { + fn write(&mut self, buf: &[u8]) -> io::Result { + let mut writers = self.writers.lock().unwrap(); + let mut result: io::Result = Ok(buf.len()); + + for (i, w) in writers.values_mut().enumerate() { + let r = w.write(buf); + if i == 0 { + result = r; + } else { + r?; + } + } + + result + } + + fn flush(&mut self) -> io::Result<()> { + let mut writers = self.writers.lock().unwrap(); + for w in writers.values_mut() { + w.flush()?; + } + Ok(()) + } +} + pub struct SerialManager { #[cfg(any(target_arch = "x86_64", target_arch = "riscv64"))] serial: Arc>, @@ -179,13 +235,14 @@ impl SerialManager { } listener.as_raw_fd() } + ConsoleTransport::Tcp(ref listener, _) => listener.as_raw_fd(), _ => return Ok(None), }; - let in_event = if let ConsoleTransport::Socket(_) = transport { - EpollDispatch::Socket - } else { - EpollDispatch::File + let in_event = match &transport { + ConsoleTransport::Socket(_) => EpollDispatch::Socket, + ConsoleTransport::Tcp(_, _) => EpollDispatch::Tcp, + _ => EpollDispatch::File, }; epoll::ctl( @@ -262,6 +319,7 @@ impl SerialManager { let serial = 
self.serial.clone(); let pty_write_out = self.pty_write_out.clone(); let mut reader: Option = None; + let mut reader_tcp: Option = None; // In case of PTY, we want to be able to detect a connection on the // other end of the PTY. This is done by detecting there's no event @@ -275,6 +333,22 @@ impl SerialManager { .name("serial-manager".to_string()) .spawn(move || { std::panic::catch_unwind(AssertUnwindSafe(move || { + let write_distributor = match &transport { + ConsoleTransport::Tcp(_, file_opt) => { + let distributor = FanoutWriter::new(); + if let Some(file) = file_opt { + distributor.add_writer(Arc::clone(file)); + } + serial + .as_ref() + .lock() + .unwrap() + .set_out(Some(Box::new(distributor.clone()))); + Some(distributor) + } + _ => None, + }; + let mut events = [epoll::Event::new(epoll::Events::empty(), 0); EPOLL_EVENTS_LEN]; @@ -331,7 +405,7 @@ impl SerialManager { let (unix_stream, _) = listener.accept().map_err(Error::AcceptConnection)?; let writer = - unix_stream.try_clone().map_err(Error::CloneUnixStream)?; + unix_stream.try_clone().map_err(Error::CloneStream)?; epoll::ctl( epoll_fd.as_raw_fd(), @@ -347,6 +421,44 @@ impl SerialManager { reader = Some(unix_stream); serial.lock().unwrap().set_out(Some(Box::new(writer))); } + EpollDispatch::Tcp => { + // New connection request arrived. + // Shutdown the previous connection, if any + if let Some(ref previous_reader) = reader_tcp { + previous_reader + .shutdown(Shutdown::Both) + .map_err(Error::AcceptConnection)?; + if let Some(distributor) = &write_distributor { + distributor.remove_writer(TypeId::of::()); + } + } + + let ConsoleTransport::Tcp(ref listener, _) = transport else { + unreachable!(); + }; + + // Events on the listening socket will be connection requests. + // Accept them, create a reader and a writer. 
+ let (tcp_stream, _) = + listener.accept().map_err(Error::AcceptConnection)?; + let writer = + tcp_stream.try_clone().map_err(Error::CloneStream)?; + + epoll::ctl( + epoll_fd.as_raw_fd(), + epoll::ControlOptions::EPOLL_CTL_ADD, + tcp_stream.as_raw_fd(), + epoll::Event::new( + epoll::Events::EPOLLIN, + EpollDispatch::File as u64, + ), + ) + .map_err(Error::Epoll)?; + reader_tcp = Some(tcp_stream); + if let Some(distributor) = &write_distributor { + distributor.add_writer(writer); + } + } EpollDispatch::File => { if event.events & libc::EPOLLIN as u32 != 0 { let mut input = [0u8; 64]; @@ -373,6 +485,31 @@ impl SerialManager { 0 } } + ConsoleTransport::Tcp(_, _) => { + if let Some(mut serial_reader) = reader_tcp.as_ref() + { + let count = serial_reader + .read(&mut input) + .map_err(Error::ReadInput)?; + if count == 0 { + info!("Remote end closed serial socket"); + serial_reader + .shutdown(Shutdown::Both) + .map_err(Error::ShutdownConnection)?; + reader_tcp = None; + if let Some(distributor) = + &write_distributor + { + distributor.remove_writer( + TypeId::of::(), + ); + } + } + count + } else { + 0 + } + } ConsoleTransport::Pty(file) | ConsoleTransport::Tty(file) => (&**file) .read(&mut input) diff --git a/vmm/src/vcpu_throttling.rs b/vmm/src/vcpu_throttling.rs new file mode 100644 index 0000000000..e8fd0d3b12 --- /dev/null +++ b/vmm/src/vcpu_throttling.rs @@ -0,0 +1,605 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 + +//! # vCPU throttling for Auto Converging +//! +//! vCPU throttling is crucial to reach a reasonable downtime when using a +//! precopy strategy for live-migration of VMs with memory-intensive workloads. +//! Auto converge means an increasing vCPU throttling over time until the memory +//! delta is small enough for the migration thread(s) to perform the switch-over +//! to the new host. +//! +//! Therefore, the migration thread(s) use this thread to help them reach their +//! goal. 
Next to typical lifecycle management, this thread must fulfill various +//! requirements to ensure a minimal downtime. +//! +//! ## Thread Requirements +//! - Needs to be able to gracefully wait for work. +//! - Must be able to exit gracefully. +//! - Must be able to cancel any work and return to its init state to support +//! live-migration cancellation and restart of live-migrations. +//! - Must not block the migration thread(s) whenever possible, to facilitate +//! fast live-migrations with short downtimes. +//! - Must be interruptible during a sleep phase to not block the migration +//! thread(s). +//! - Must not confuse or hinder the migration thread(s) regarding +//! pause()/resume() operations. Context: migration thread shuts down the +//! vCPUs for the handover. The throttle thread must not restart the vCPUs +//! again. + +use std::cell::Cell; +use std::cmp::min; +use std::sync::mpsc::RecvTimeoutError; +use std::sync::{Arc, Mutex, mpsc}; +use std::thread; +use std::thread::JoinHandle; +use std::time::{Duration, Instant}; + +use log::{debug, warn}; +use vm_migration::Pausable; + +use crate::cpu::CpuManager; + +/// The possible command of the thread, i.e., the current state. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +enum ThrottleCommand { + /// Waiting for next event. + Waiting, + /// Ongoing vCPU throttling. + /// + /// The inner value shows the current throttling percentage in range `1..=99`. + Throttling(u8 /* `1..=99` */), + /// Thread is shutting down gracefully. + Exiting, +} + +/// Helper to adapt the throttling timeslice as we go, depending on the time it +/// takes to pause() and resume() all vCPUs. +#[derive(Debug)] +struct TimesliceContext { + current_timeslice: Duration, + /// Duration it took to pause() all vCPUs on the previous iteration. + previous_pause_duration: Duration, + /// Duration it took to resume() all vCPUs on the previous iteration. 
+ previous_resume_duration: Duration, +} + +impl TimesliceContext { + /// The initial timeslice for a throttling cycle (vCPU pause & resume). + const INITIAL_TIMESLICE: Duration = Duration::from_millis(100); + + /// The minimal value for the operations. + /// + /// Any value smaller than this is upgraded to this to prevent math + /// exceptions during timing calculations. + const MIN_DURATION: Duration = Duration::from_millis(1); + + /// Maximum time slice. This should not be too big. + /// + /// Otherwise, for example: Assuming we have 10% throttling and + /// 2000ms time slice, then the VM will be unresponsive for + /// 200ms every 1800ms. This is not convenient. + const MAX_TIMESLICE: Duration = Duration::from_millis(800); + + /// Creates a new instance with [`Self::INITIAL_TIMESLICE`]. + fn new() -> Self { + Self { + current_timeslice: Self::INITIAL_TIMESLICE, + previous_pause_duration: Self::MIN_DURATION, + previous_resume_duration: Self::MIN_DURATION, + } + } + + /// Updates the timeslice. + fn update_timeslice(&mut self) { + // CpuManager::pause() plus CpuManager::resume() without additional delay is the shortest + // we can get. + let one_percent = self.previous_pause_duration + self.previous_resume_duration; + self.current_timeslice = one_percent * 100; + self.current_timeslice = min(self.current_timeslice, Self::MAX_TIMESLICE); + } + + /// Calculates the sleep durations for after the `pause()` and `resume()` operations with + /// the current `timeslice`. + /// + /// It uses the `timeslice` that was calculated on the previous + /// invocation of [`Self::update_timeslice`]. 
+ fn calc_sleep_durations( + &mut self, + percentage: u64, + ) -> ( + Duration, /* after pause */ + Duration, /* after resume */ + ) { + assert!(percentage <= 100); + assert!(percentage > 0); + + let timeslice_ms = self.current_timeslice.as_millis() as u64; + let wait_ms_after_pause_ms = timeslice_ms * percentage / 100; + let wait_ms_after_resume_ms = timeslice_ms - wait_ms_after_pause_ms; + + let wait_ms_after_pause_ms = + wait_ms_after_pause_ms.saturating_sub(self.previous_pause_duration.as_millis() as u64); + let wait_ms_after_resume_ms = wait_ms_after_resume_ms + .saturating_sub(self.previous_resume_duration.as_millis() as u64); + + ( + Duration::from_millis(wait_ms_after_pause_ms), + Duration::from_millis(wait_ms_after_resume_ms), + ) + } + + /// Set the previous pause duration. + /// + /// In case this is below [`Self::MIN_DURATION`], we upgrade it to [`Self::MIN_DURATION`]. + pub fn set_previous_pause_duration(&mut self, mut duration: Duration) { + if duration < Self::MIN_DURATION { + duration = Self::MIN_DURATION; + } + + self.previous_pause_duration = duration; + } + + /// Set the duration it took to `resume()` all vCPUs on the previous iteration. + /// + /// In case this is below [`Self::MIN_DURATION`], we upgrade it to [`Self::MIN_DURATION`]. + pub fn set_previous_resume_duration(&mut self, mut duration: Duration) { + if duration < Self::MIN_DURATION { + duration = Self::MIN_DURATION; + } + self.previous_resume_duration = duration; + } +} + +/// Context of the vCPU throttle thread. +// The main justification for this dedicated type is to split the thread +// functions from the higher-level control API. +// TODO seccomp is missing +pub struct ThrottleWorker { + handle: Option>, +} + +impl ThrottleWorker { + /// This should not be named "vcpu*" as libvirt fails when + /// iterating the vCPU threads then. Fix this first in libvirt! 
+ const THREAD_NAME: &'static str = "throttle-vcpu"; + + /// Executes the provided callback and goes to sleep until the specified + /// `sleep_duration` has passed. + /// + /// The time to execute the callback itself is not taken into account + /// when sleeping for `sleep_duration`. Therefore, the callback is + /// supposed to be quick (a couple of milliseconds). + /// + /// The thread is interruptible during the sleep phase when the `receiver` + /// receives a new [`ThrottleCommand`]. + /// + /// # Arguments + /// - `callback`: Function to run + /// - `set_callback_duration`: Invoked with the time the callback took to run. + /// - `sleep_duration`: Maximum time to sleep after the `callback` returned; + /// the callback's own runtime is not included. + /// - `receiver`: Receiving end of the channel to the migration managing + /// thread. + fn execute_and_wait_interruptible( + callback: &impl Fn(), + mut set_callback_duration: impl FnMut(Duration), + sleep_duration: Duration, + receiver: &mpsc::Receiver, + ) -> Option { + let begin = Instant::now(); + callback(); + let cb_duration = begin.elapsed(); + // Help to adjust the timeslice in the next cycle. + set_callback_duration(cb_duration); + + // It might happen that sometimes we get interrupted during a sleep phase + // with a new higher throttle percentage but this is negligible. For an + // auto-converge cycle, there are typically only ~10 steps involved over + // a time frame from a couple of seconds up to a couple of minutes. + match receiver.recv_timeout(sleep_duration) { + Ok(next_task) => Some(next_task), + Err(RecvTimeoutError::Timeout) => None, + Err(RecvTimeoutError::Disconnected) => { + panic!("thread and channel should exit gracefully") + } + } + } + + /// Executes one throttling step: either pause or resume of vCPUs. + /// + /// Runs the given callback, then waits for the specified duration, unless + /// interrupted by a new [`ThrottleCommand`]. + /// + /// # Behavior + /// - Runs the provided `callback` immediately. 
+ /// - Waits up to `duration` for new commands on the `receiver`. + /// - If no command arrives before the timeout, this step completes + /// normally and returns `None`. + /// - If a [`ThrottleCommand::Throttling`] arrives, updates the current + /// throttle percentage in `current_throttle` and continues with the + /// loop. Returns `None`. + /// - If a [`ThrottleCommand::Waiting`] or [`ThrottleCommand::Exiting`] + /// arrives, this command is forwarded to the caller. + /// + /// # Arguments + /// - `callback`: Function to run (e.g., pause or resume vCPUs). + /// - `set_callback_duration`: Set the duration to execute the callback. + /// - `receiver`: Channel for receiving new [`ThrottleCommand`]s. + /// - `current_throttle`: Mutable reference to the current throttle + /// percentage (updated on [`ThrottleCommand::Throttling`]). + /// + /// # Returns + /// - `None` if the throttling cycle should continue. + /// - `Some(ThrottleCommand::Waiting | ThrottleCommand::Exiting)` if + /// throttling should stop. + fn throttle_step( + callback: &F, + set_callback_duration: impl FnMut(Duration), + duration: Duration, + receiver: &mpsc::Receiver, + current_throttle: &mut u64, + ) -> Option + where + F: Fn(), + { + let maybe_task = Self::execute_and_wait_interruptible( + callback, + set_callback_duration, + duration, + receiver, + ); + match maybe_task { + None => None, + Some(ThrottleCommand::Throttling(next)) => { + // A new throttle value is only applied at the end of a full + // throttling cycle. This is fine and negligible in a series of + // (tens of) thousands of cycles. + *current_throttle = next as u64; + None + } + Some(cmd @ (ThrottleCommand::Exiting | ThrottleCommand::Waiting)) => Some(cmd), + } + } + + /// Helper for [`Self::control_loop`] that runs the actual throttling loop. + /// + /// This function returns the next [`ThrottleCommand`] **only** if the thread + /// stopped the vCPU throttling. 
+    fn throttle_loop(
+        receiver: &mpsc::Receiver<ThrottleCommand>,
+        initial_throttle: u8,
+        callback_pause_vcpus: &impl Fn(),
+        callback_resume_vcpus: &impl Fn(),
+    ) -> ThrottleCommand {
+        // The current throttle value, as long as the thread is throttling.
+        let mut current_throttle = u64::from(initial_throttle);
+        let mut timeslice_ctx = TimesliceContext::new();
+
+        loop {
+            // Catch logic bug: We should have exited in this case already.
+            assert_ne!(current_throttle, 0);
+            assert!(current_throttle < 100);
+
+            let (wait_after_pause, wait_after_resume) =
+                timeslice_ctx.calc_sleep_durations(current_throttle);
+
+            // pause vCPUs
+            if let Some(cmd) = Self::throttle_step(
+                callback_pause_vcpus,
+                |new_duration| timeslice_ctx.set_previous_pause_duration(new_duration),
+                wait_after_pause,
+                receiver,
+                &mut current_throttle,
+            ) {
+                // TODO: future optimization
+                // Prevent unnecessary resume() here when the migration thread
+                // performs .pause() right after anyway. We could make .pause() and
+                // .resume() idempotent.
+                callback_resume_vcpus();
+                // We only exit here in case of ThrottleCommand::Waiting or ::Exiting
+                return cmd;
+            }
+
+            // resume vCPUs
+            if let Some(cmd) = Self::throttle_step(
+                callback_resume_vcpus,
+                |new_duration| timeslice_ctx.set_previous_resume_duration(new_duration),
+                wait_after_resume,
+                receiver,
+                &mut current_throttle,
+            ) {
+                // We only exit here in case of ThrottleCommand::Waiting or ::Exiting
+                return cmd;
+            }
+
+            // Update timeslice for next cycle. This way, we can closely match the expected
+            // percentage for pause() and resume().
+            timeslice_ctx.update_timeslice();
+        }
+    }
+
+    /// Implements the control loop of the thread.
+    ///
+    /// It wraps the actual throttling with the necessary thread lifecycle
+    /// management.
+    fn control_loop(
+        receiver: mpsc::Receiver<ThrottleCommand>,
+        callback_pause_vcpus: impl Fn() + Send + 'static,
+        callback_resume_vcpus: impl Fn() + Send + 'static,
+    ) -> impl Fn() {
+        move || {
+            // In the outer loop, we gracefully wait for commands.
+            'control: loop {
+                let thread_task = receiver.recv().expect("channel should not be closed");
+                match thread_task {
+                    ThrottleCommand::Exiting => {
+                        break 'control;
+                    }
+                    ThrottleCommand::Waiting => {
+                        continue 'control;
+                    }
+                    ThrottleCommand::Throttling(initial_throttle) => {
+                        let next_task = Self::throttle_loop(
+                            &receiver,
+                            initial_throttle,
+                            &callback_pause_vcpus,
+                            &callback_resume_vcpus,
+                        );
+                        if next_task == ThrottleCommand::Exiting {
+                            break 'control;
+                        }
+                        // else: thread is in Waiting state
+                    }
+                }
+            }
+            debug!("thread exited gracefully");
+        }
+    }
+
+    /// Spawns a new, named thread that runs [`Self::control_loop`].
+    fn spawn(
+        receiver: mpsc::Receiver<ThrottleCommand>,
+        callback_pause_vcpus: impl Fn() + Send + 'static,
+        callback_resume_vcpus: impl Fn() + Send + 'static,
+    ) -> Self {
+        let handle = {
+            let thread_fn =
+                Self::control_loop(receiver, callback_pause_vcpus, callback_resume_vcpus);
+            thread::Builder::new()
+                .name(String::from(Self::THREAD_NAME))
+                .spawn(thread_fn)
+                .expect("should spawn thread")
+        };
+
+        Self {
+            handle: Some(handle),
+        }
+    }
+}
+
+impl Drop for ThrottleWorker {
+    fn drop(&mut self) {
+        // Note: The owner must send the shutdown command first; join() blocks otherwise.
+        if let Some(handle) = self.handle.take() {
+            handle.join().expect("thread should have succeeded");
+        }
+    }
+}
+
+/// Handler for controlling the vCPU throttle thread.
+///
+/// vCPU throttling is needed for live-migration of memory-intensive workloads.
+/// The current design assumes that all vCPUs are throttled equally.
+///
+/// # Transitions
+/// - `Waiting` -> `Throttling(x %)`, `Exiting`
+/// - `Throttling(x %)` -> `Exiting`, `Waiting`, `Throttling(y %)`
+/// - `Exiting`
+pub struct ThrottleThreadHandle {
+    /// Sending end of the channel used to send commands to the thread.
+    state_sender: mpsc::Sender<ThrottleCommand>,
+    /// Current throttle value.
+    ///
+    /// This is the last throttle value that was sent to the
+    /// thread.
+    current_throttle: Cell<u8>,
+    /// The underlying thread handle. Option to have more control over when it is dropped.
+    throttle_thread: Option<ThrottleWorker>,
+}
+
+impl ThrottleThreadHandle {
+    /// Spawns a new thread and returns a handle to it.
+    ///
+    /// # Parameters
+    /// - `cpu_manager`: CPU manager to pause and resume vCPUs
+    pub fn new_from_cpu_manager(cpu_manager: &Arc<Mutex<CpuManager>>) -> Self {
+        let callback_pause_vcpus = {
+            let cpu_manager = cpu_manager.clone();
+            Box::new(move || cpu_manager.lock().unwrap().pause().unwrap())
+        };
+
+        let callback_resume_vcpus = {
+            let cpu_manager = cpu_manager.clone();
+            Box::new(move || cpu_manager.lock().unwrap().resume().unwrap())
+        };
+
+        Self::new(callback_pause_vcpus, callback_resume_vcpus)
+    }
+
+    /// Spawns a new thread and returns a handle to it.
+    ///
+    /// The thread starts out waiting for its first command, which equals
+    /// [`ThrottleCommand::Waiting`].
+    ///
+    /// # Parameters
+    /// - `callback_pause_vcpus`: Function putting all vCPUs into pause state. The
+    ///   function must not perform any artificial delay itself.
+    /// - `callback_resume_vcpus`: Function putting all vCPUs back into running
+    ///   state. The function must not perform any artificial delay itself.
+    fn new(
+        callback_pause_vcpus: Box<dyn Fn() + Send + 'static>,
+        callback_resume_vcpus: Box<dyn Fn() + Send + 'static>,
+    ) -> Self {
+        // Channel used to send commands to the thread.
+        let (sender, receiver) = mpsc::channel::<ThrottleCommand>();
+
+        let thread = ThrottleWorker::spawn(receiver, callback_pause_vcpus, callback_resume_vcpus);
+
+        Self {
+            state_sender: sender,
+            current_throttle: Cell::new(0),
+            throttle_thread: Some(thread),
+        }
+    }
+
+    /// Sets the throttle percentage to a value in range `0..=99` and updates
+    /// the thread's state.
+    ///
+    /// Setting the value back to `0` equals setting the thread back into
+    /// [`ThrottleCommand::Waiting`].
+    ///
+    /// In case of an ongoing throttling cycle (vCPU pause & resume), any new
+    /// throttling percentage will be applied no later than when the current cycle
+    /// ends.
+    ///
+    /// # Panics
+    /// Panics, if `percent_new` is not in range `0..=99`.
+    pub fn set_throttle_percent(&self, percent_new: u8) {
+        assert!(
+            percent_new < 100,
+            "setting a percentage of 100 or above is not allowed: {percent_new}%"
+        );
+
+        // We have no problematic race condition here as in normal operation
+        // there is exactly one thread calling these functions.
+        let percent_old = self.throttle_percent();
+
+        // Return early, no action needed.
+        if percent_old == percent_new {
+            return;
+        }
+
+        if percent_new == 0 {
+            self.state_sender
+                .send(ThrottleCommand::Waiting)
+                .expect("channel should not be closed");
+        } else {
+            self.state_sender
+                .send(ThrottleCommand::Throttling(percent_new))
+                .expect("channel should not be closed");
+        }
+
+        self.current_throttle.set(percent_new);
+    }
+
+    /// Get the current throttle percentage in range `0..=99`.
+    ///
+    /// Please note that the value is not synchronized.
+    pub fn throttle_percent(&self) -> u8 {
+        self.current_throttle.get()
+    }
+
+    /// Stops and terminates the thread gracefully.
+    ///
+    /// Waits for the thread to finish. This function **must** be called before
+    /// the migration thread(s) do anything with the CPU manager to prevent
+    /// odd states.
+    pub fn shutdown(&mut self) {
+        let begin = Instant::now();
+
+        {
+            // drop thread; ensure that the channel is still alive when it is dropped
+            if let Some(worker) = self.throttle_thread.take() {
+                self.state_sender
+                    .send(ThrottleCommand::Exiting)
+                    .expect("channel should not be closed");
+
+                // Ensure the sender is still living when this is dropped.
+ drop(worker); + } + } + + let elapsed = begin.elapsed(); + if elapsed > Duration::from_millis(20) { + warn!( + "shutting down thread takes too long ({} ms): this increases the downtime!", + elapsed.as_millis() + ); + } + } +} + +impl Drop for ThrottleThreadHandle { + fn drop(&mut self) { + self.shutdown(); + } +} + +#[cfg(test)] +mod tests { + use std::sync::atomic::{AtomicBool, Ordering}; + use std::thread::sleep; + + use super::*; + + // The test is successful if it does not get stuck. Then, the thread exits + // gracefully. + #[test] + fn test_vcpu_throttling_thread_lifecycle() { + for _ in 0..5 { + // State transitions: Waiting -> Exit + { + let mut handler = ThrottleThreadHandle::new(Box::new(|| {}), Box::new(|| {})); + + // The test is successful if it does not get stuck. + handler.shutdown(); + } + + // Dummy CpuManager + let cpus_throttled = Arc::new(AtomicBool::new(false)); + let callback_pause_vcpus = { + let cpus_running = cpus_throttled.clone(); + Box::new(move || { + let old = cpus_running.swap(true, Ordering::SeqCst); + assert!(!old); + }) + }; + let callback_resume_vcpus = { + let cpus_running = cpus_throttled.clone(); + Box::new(move || { + let old = cpus_running.swap(false, Ordering::SeqCst); + assert!(old); + }) + }; + + // State transitions: Waiting -> Throttle -> Waiting -> Throttle -> Exit + { + let mut handler = + ThrottleThreadHandle::new(callback_pause_vcpus, callback_resume_vcpus); + handler.set_throttle_percent(5); + sleep(TimesliceContext::INITIAL_TIMESLICE); + handler.set_throttle_percent(10); + sleep(TimesliceContext::INITIAL_TIMESLICE); + + // Assume we aborted vCPU throttling (or the live-migration at all). + handler.set_throttle_percent(0 /* reset to waiting */); + handler.set_throttle_percent(5); + sleep(TimesliceContext::INITIAL_TIMESLICE); + handler.set_throttle_percent(10); + sleep(TimesliceContext::INITIAL_TIMESLICE); + + // The test is successful if we don't have a panic here due to a + // closed channel. 
+ for _ in 0..10 { + handler.shutdown(); + sleep(Duration::from_millis(1)); + } + + // The test is successful if it does not get stuck. + drop(handler); + } + } + } +} diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 1c3baf8652..d62173e5fa 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -24,7 +24,7 @@ use std::sync::{Arc, Mutex}; use std::time::Instant; use std::{cmp, result, str, thread}; -use anyhow::anyhow; +use anyhow::{Context, anyhow}; #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] use arch::PciSpaceInfo; #[cfg(target_arch = "x86_64")] @@ -100,11 +100,9 @@ use crate::landlock::LandlockError; use crate::memory_manager::{ Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData, }; -#[cfg(target_arch = "x86_64")] -use crate::migration::get_vm_snapshot; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use crate::migration::url_to_file; -use crate::migration::{SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE, url_to_path}; +use crate::migration::{SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE, get_vm_snapshot, url_to_path}; #[cfg(all( feature = "kvm", feature = "sev_snp", @@ -112,6 +110,7 @@ use crate::migration::{SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE, url_to_path}; target_arch = "x86_64" ))] use crate::sev::MeasuredBootInfo; +use crate::vcpu_throttling::ThrottleThreadHandle; #[cfg(feature = "fw_cfg")] use crate::vm_config::FwCfgConfig; use crate::vm_config::{ @@ -198,6 +197,9 @@ pub enum Error { #[error("VM is not running")] VmNotRunning, + #[error("VM is currently migrating and can't be modified")] + VmMigrating, + #[error("Cannot clone EventFd")] EventFdClone(#[source] io::Error), @@ -540,6 +542,14 @@ pub struct Vm { hypervisor: Arc, stop_on_boot: bool, load_payload_handle: Option>>, + vcpu_throttler: ThrottleThreadHandle, + post_migration_lifecycle_event: Option, +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub enum PostMigrationLifecycleEvent { + VmReboot, + VmShutdown, } impl Vm { @@ -698,6 
+708,19 @@ impl Vm { } else { VmState::Created }; + let post_migration_lifecycle_event = snapshot + .as_ref() + .map(|snapshot| { + get_vm_snapshot(snapshot) + .map(|vm_snapshot| vm_snapshot.post_migration_lifecycle_event) + .map_err(Error::Restore) + }) + .transpose()? + .flatten(); + + // TODO we could also spawn the thread when a migration with auto-converge starts. + // Probably this is the better design. + let vcpu_throttler = ThrottleThreadHandle::new_from_cpu_manager(&cpu_manager); Ok(Vm { #[cfg(feature = "tdx")] @@ -718,6 +741,8 @@ impl Vm { hypervisor, stop_on_boot, load_payload_handle, + vcpu_throttler, + post_migration_lifecycle_event, }) } @@ -796,15 +821,23 @@ impl Vm { .map_err(Error::CpuManager)?; #[cfg(target_arch = "x86_64")] - cpu_manager - .lock() - .unwrap() - .populate_cpuid( - hypervisor.as_ref(), - #[cfg(feature = "tdx")] - tdx_enabled, - ) - .map_err(Error::CpuManager)?; + { + cpu_manager + .lock() + .unwrap() + .populate_cpuid( + hypervisor.as_ref(), + #[cfg(feature = "tdx")] + tdx_enabled, + ) + .map_err(Error::CpuManager)?; + + cpu_manager + .lock() + .unwrap() + .apply_msr_updates() + .map_err(Error::CpuManager)?; + } Ok(cpu_manager) } @@ -940,6 +973,7 @@ impl Vm { console_info, console_resize_pipe, original_termios, + snapshot, )?; } @@ -980,6 +1014,7 @@ impl Vm { console_info.cloned(), console_resize_pipe.cloned(), original_termios.clone(), + snapshot, )?; } @@ -1039,10 +1074,11 @@ impl Vm { }; // Create interrupt controller and devices for MSHV + let dm_snapshot = snapshot_from_id(snapshot, DEVICE_MANAGER_SNAPSHOT_ID); let ic = device_manager .lock() .unwrap() - .create_interrupt_controller() + .create_interrupt_controller(dm_snapshot) .map_err(Error::DeviceManager)?; #[cfg(target_arch = "aarch64")] @@ -1056,6 +1092,7 @@ impl Vm { console_resize_pipe.cloned(), original_termios.clone(), ic, + dm_snapshot, ) .map_err(Error::DeviceManager)?; @@ -1073,11 +1110,13 @@ impl Vm { console_info: Option<&ConsoleInfo>, console_resize_pipe: 
Option<&Arc>, original_termios: &Arc>>, + snapshot: Option<&Snapshot>, ) -> Result<()> { + let dm_snapshot = snapshot_from_id(snapshot, DEVICE_MANAGER_SNAPSHOT_ID); let ic = device_manager .lock() .unwrap() - .create_interrupt_controller() + .create_interrupt_controller(dm_snapshot) .map_err(Error::DeviceManager)?; #[cfg(target_arch = "aarch64")] @@ -1091,6 +1130,7 @@ impl Vm { console_resize_pipe.cloned(), original_termios.clone(), ic, + dm_snapshot, ) .map_err(Error::DeviceManager)?; @@ -1105,13 +1145,15 @@ impl Vm { console_info: Option, console_resize_pipe: Option>, original_termios: Arc>>, + snapshot: Option<&Snapshot>, ) -> Result<()> { // For KVM, create interrupt controller after boot vcpus // because GIC state is restored from snapshot during vcpu creation + let dm_snapshot = snapshot_from_id(snapshot, DEVICE_MANAGER_SNAPSHOT_ID); let ic = device_manager .lock() .unwrap() - .create_interrupt_controller() + .create_interrupt_controller(dm_snapshot) .map_err(Error::DeviceManager)?; vm.init().map_err(Error::InitializeVm)?; @@ -1119,7 +1161,13 @@ impl Vm { device_manager .lock() .unwrap() - .create_devices(console_info, console_resize_pipe, original_termios, ic) + .create_devices( + console_info, + console_resize_pipe, + original_termios, + ic, + dm_snapshot, + ) .map_err(Error::DeviceManager)?; Ok(()) @@ -1319,6 +1367,42 @@ impl Vm { Ok(numa_nodes) } + /// Set's the throttle percentage to a value in range `0..=99`. + /// + /// Setting the value back to `0` brings the thread back into a waiting + /// state. + /// + /// # Panic + /// Panics, if `percent_new` is not in range `0..=99`. + pub fn set_throttle_percent(&self, percent: u8 /* 1..=99 */) { + self.vcpu_throttler.set_throttle_percent(percent); + } + + /// Get the current throttle percentage in range `0..=99`. + /// + /// Please note that the value is not synchronized. + pub fn throttle_percent(&self) -> u8 { + self.vcpu_throttler.throttle_percent() + } + + /// Stops and terminates the thread gracefully. 
+ /// + /// Waits for the thread to finish. + pub fn stop_vcpu_throttling(&mut self) { + self.vcpu_throttler.shutdown(); + } + + pub fn set_post_migration_lifecycle_event( + &mut self, + event: Option, + ) { + self.post_migration_lifecycle_event = event; + } + + pub fn post_migration_lifecycle_event(&self) -> Option { + self.post_migration_lifecycle_event + } + #[allow(clippy::too_many_arguments)] pub fn new( vm_config: Arc>, @@ -1838,33 +1922,13 @@ impl Vm { let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus(); - let serial_number = self - .config - .lock() - .unwrap() - .platform - .as_ref() - .and_then(|p| p.serial_number.clone()); - - let uuid = self + let smbios = self .config .lock() .unwrap() .platform .as_ref() - .and_then(|p| p.uuid.clone()); - - let oem_strings = self - .config - .lock() - .unwrap() - .platform - .as_ref() - .and_then(|p| p.oem_strings.clone()); - - let oem_strings = oem_strings - .as_deref() - .map(|strings| strings.iter().map(|s| s.as_ref()).collect::>()); + .and_then(|p| p.smbios_config()); let topology = self.cpu_manager.lock().unwrap().get_vcpu_topology(); @@ -1876,9 +1940,7 @@ impl Vm { boot_vcpus, entry_addr.setup_header, rsdp_addr, - serial_number.as_deref(), - uuid.as_deref(), - oem_strings.as_deref(), + smbios.as_ref(), topology, ) .map_err(Error::ConfigureSystem)?; @@ -2997,6 +3059,10 @@ impl Vm { .try_lock_disks() .map_err(Error::LockingError)?; + // TODO for upstreaming probably relevant + // Advertise new VM location to network switches. + // self.post_migration_announce(); + // Now we can start all vCPUs from here. 
self.cpu_manager .lock() @@ -3044,19 +3110,16 @@ impl Vm { { Request::memory_fd(std::mem::size_of_val(&slot) as u64) .write_to(socket) - .map_err(|e| { - MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {e}")) - })?; + .context("Error sending memory fd request") + .map_err(MigratableError::MigrateSend)?; socket .send_with_fd(&slot.to_le_bytes()[..], fd) - .map_err(|e| { - MigratableError::MigrateSend(anyhow!("Error sending memory fd: {e}")) - })?; + .context("Error sending memory fd") + .map_err(MigratableError::MigrateSend)?; - Response::read_from(socket)?.ok_or_abandon( - socket, - MigratableError::MigrateSend(anyhow!("Error during memory fd migration")), - )?; + Response::read_from(socket)?.ok_or_error(MigratableError::MigrateSend(anyhow!( + "Error during memory fd migration (got bad response)" + )))?; } Ok(()) @@ -3092,6 +3155,10 @@ impl Vm { Ok(()) } + pub fn device_manager(&self) -> &Arc> { + &self.device_manager + } + pub fn activate_virtio_devices(&self) -> Result<()> { self.device_manager .lock() @@ -3233,6 +3300,14 @@ impl Vm { .nmi() .map_err(Error::ErrorNmi); } + + /// Calls [`DeviceManager::post_migration_announce`]. 
+ pub fn post_migration_announce(&self) { + self.device_manager + .lock() + .unwrap() + .post_migration_announce(); + } } impl Pausable for Vm { @@ -3310,6 +3385,8 @@ impl Pausable for Vm { #[derive(Serialize, Deserialize)] pub struct VmSnapshot { + #[serde(default)] + pub post_migration_lifecycle_event: Option, #[cfg(target_arch = "x86_64")] pub clock: Option, #[cfg(all(feature = "kvm", target_arch = "x86_64"))] @@ -3342,19 +3419,22 @@ impl Snapshottable for Vm { #[cfg(all(feature = "kvm", target_arch = "x86_64"))] let common_cpuid = { - let amx = self.config.lock().unwrap().cpus.features.amx; - let phys_bits = physical_bits( - self.hypervisor.as_ref(), - self.config.lock().unwrap().cpus.max_phys_bits, - ); + let guard = self.config.lock().unwrap(); + let amx = guard.cpus.features.amx; + let phys_bits = physical_bits(self.hypervisor.as_ref(), guard.cpus.max_phys_bits); + let kvm_hyperv = guard.cpus.kvm_hyperv; + let profile = guard.cpus.profile; + // Drop the guard before function call + core::mem::drop(guard); arch::generate_common_cpuid( self.hypervisor.as_ref(), &arch::CpuidConfig { phys_bits, - kvm_hyperv: self.config.lock().unwrap().cpus.kvm_hyperv, + kvm_hyperv, #[cfg(feature = "tdx")] tdx: false, amx, + profile, }, ) .map_err(|e| { @@ -3363,6 +3443,7 @@ impl Snapshottable for Vm { }; let vm_snapshot_state = VmSnapshot { + post_migration_lifecycle_event: self.post_migration_lifecycle_event(), #[cfg(target_arch = "x86_64")] clock: self.saved_clock, #[cfg(all(feature = "kvm", target_arch = "x86_64"))] @@ -3407,15 +3488,18 @@ impl Transportable for Vm { .write(true) .create_new(true) .open(snapshot_config_path) - .map_err(|e| MigratableError::MigrateSend(e.into()))?; + .context("Error creating config snapshot file") + .map_err(MigratableError::MigrateSend)?; // Serialize and write the snapshot config let vm_config = serde_json::to_string(self.config.lock().unwrap().deref()) - .map_err(|e| MigratableError::MigrateSend(e.into()))?; + .context("Error serializing 
VM config") + .map_err(MigratableError::MigrateSend)?; snapshot_config_file .write(vm_config.as_bytes()) - .map_err(|e| MigratableError::MigrateSend(e.into()))?; + .context("Error writing serialized VM config") + .map_err(MigratableError::MigrateSend)?; let mut snapshot_state_path = url_to_path(destination_url)?; snapshot_state_path.push(SNAPSHOT_STATE_FILE); @@ -3426,15 +3510,18 @@ impl Transportable for Vm { .write(true) .create_new(true) .open(snapshot_state_path) - .map_err(|e| MigratableError::MigrateSend(e.into()))?; + .context("Error creating state snapshot file") + .map_err(MigratableError::MigrateSend)?; // Serialize and write the snapshot state - let vm_state = - serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?; + let vm_state = serde_json::to_vec(snapshot) + .context("Error serializing state snapshot") + .map_err(MigratableError::MigrateSend)?; snapshot_state_file .write(&vm_state) - .map_err(|e| MigratableError::MigrateSend(e.into()))?; + .context("Error writing serialized state snapshot") + .map_err(MigratableError::MigrateSend)?; // Tell the memory manager to also send/write its own snapshot. 
if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) { @@ -3942,7 +4029,7 @@ mod unit_tests { mem.write_slice(&code, load_addr) .expect("Writing code to memory failed"); - let mut vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed"); + let mut vcpu = vm.create_vcpu(0, None, vec![]).expect("new Vcpu failed"); let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed"); vcpu_sregs.cs.base = 0; @@ -4079,7 +4166,7 @@ pub fn test_vm() { mem.write_slice(&code, load_addr) .expect("Writing code to memory failed"); - let mut vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed"); + let mut vcpu = vm.create_vcpu(0, None, vec![]).expect("new Vcpu failed"); let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed"); vcpu_sregs.cs.base = 0; diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index 6fefc8f06c..8ea8f0e8ff 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -8,6 +8,7 @@ use std::path::{Path, PathBuf}; use std::str::FromStr; use std::{fs, result}; +use arch::CpuProfile; use block::ImageType; pub use block::fcntl::LockGranularityChoice; use log::{debug, warn}; @@ -83,6 +84,8 @@ pub struct CpusConfig { pub nested: bool, #[serde(default)] pub core_scheduling: CoreScheduling, + #[serde(default)] + pub profile: CpuProfile, } pub const DEFAULT_VCPUS: u32 = 1; @@ -99,6 +102,7 @@ impl Default for CpusConfig { features: CpuFeatures::default(), nested: true, core_scheduling: CoreScheduling::default(), + profile: CpuProfile::default(), } } } @@ -125,12 +129,24 @@ pub struct PlatformConfig { pub iommu_segments: Option>, #[serde(default = "default_platformconfig_iommu_address_width_bits")] pub iommu_address_width_bits: u8, + #[serde(default, alias = "serial_number")] + pub system_serial_number: Option, + #[serde(default, alias = "uuid")] + pub system_uuid: Option, + #[serde(default)] + pub oem_strings: Vec, + #[serde(default)] + pub system_manufacturer: Option, #[serde(default)] - pub serial_number: Option, 
+ pub system_product_name: Option, #[serde(default)] - pub uuid: Option, + pub system_version: Option, #[serde(default)] - pub oem_strings: Option>, + pub system_family: Option, + #[serde(default)] + pub system_sku_number: Option, + #[serde(default)] + pub chassis_asset_tag: Option, #[cfg(feature = "tdx")] #[serde(default)] pub tdx: bool, @@ -143,6 +159,53 @@ pub struct PlatformConfig { pub vfio_p2p_dma: bool, } +#[cfg(target_arch = "x86_64")] +impl PlatformConfig { + pub fn smbios_config(&self) -> Option { + let has_system = [ + &self.system_serial_number, + &self.system_uuid, + &self.system_manufacturer, + &self.system_product_name, + &self.system_version, + &self.system_family, + &self.system_sku_number, + ] + .iter() + .any(|v| v.is_some()); + + let system = has_system.then_some(arch::x86_64::SmbiosSystem { + manufacturer: self.system_manufacturer.clone(), + product_name: self.system_product_name.clone(), + version: self.system_version.clone(), + serial_number: self.system_serial_number.clone(), + uuid: self.system_uuid.clone(), + sku_number: self.system_sku_number.clone(), + family: self.system_family.clone(), + }); + + let chassis = + self.chassis_asset_tag + .clone() + .map(|asset_tag| arch::x86_64::SmbiosChassisConfig { + asset_tag: Some(asset_tag), + ..Default::default() + }); + + let smbios = arch::x86_64::SmbiosConfig { + system, + chassis, + oem_strings: self.oem_strings.clone(), + }; + + if smbios.system.is_none() && smbios.chassis.is_none() && smbios.oem_strings.is_empty() { + None + } else { + Some(smbios) + } + } +} + pub const DEFAULT_PCI_SEGMENT_APERTURE_WEIGHT: u32 = 1; fn default_pci_segment_aperture_weight() -> u32 { @@ -281,7 +344,7 @@ pub struct PciDeviceCommonConfig { pub iommu: bool, #[serde(default)] pub pci_segment: u16, - #[serde(default)] + #[serde(default, alias = "pci_device_id", rename = "bdf_device")] pub pci_device_id: Option, } @@ -542,6 +605,7 @@ pub enum ConsoleOutputMode { Tty, File, Socket, + Tcp, Null, } @@ -555,6 +619,7 @@ 
pub struct CommonConsoleConfig { pub mode: ConsoleOutputMode, #[serde(default)] pub socket: Option, + pub url: Option, } impl ApplyLandlock for CommonConsoleConfig { @@ -581,7 +646,8 @@ pub struct SerialConfig { } impl SerialConfig { - pub const SYNTAX: &str = "Control serial port: \"off|null|pty|tty|file=|socket=\""; + pub const SYNTAX: &str = + "Control serial port: \"off|null|pty|tty|file=|socket=|tcp=\""; } impl Default for SerialConfig { @@ -591,6 +657,7 @@ impl Default for SerialConfig { file: None, mode: ConsoleOutputMode::Null, socket: None, + url: None, }, } } @@ -622,6 +689,7 @@ impl Default for ConsoleConfig { file: None, mode: ConsoleOutputMode::Tty, socket: None, + url: None, }, pci_common: PciDeviceCommonConfig::default(), }