def _safe_yaml_dump(data):
    """Serialise *data* as block-style YAML, falling back to ``repr(data)``.

    Key order is preserved (``sort_keys=False``) and trailing whitespace is
    stripped.  Any exception raised while dumping is swallowed on purpose:
    this runs while another error is being reported and must not replace it.
    """
    try:
        return yaml.dump(data, default_flow_style=False, sort_keys=False, allow_unicode=True).rstrip()
    except Exception:  # pylint: disable=broad-except
        return repr(data)


# Windows runtime attributes read from the site config, in lookup order.
# ``net_framework_version`` goes last: NOTE(review) the App Service API appears
# to populate it with a default even for non-.NET apps, so checking it first
# would hide the Java/Python/PHP/Node/PowerShell runtime -- confirm against the
# SiteConfig reference.
_WINDOWS_RUNTIME_ATTRS = ('java_version', 'python_version', 'php_version',
                          'node_version', 'power_shell_version',
                          'net_framework_version')

# Maps the ``artifact_type`` deploy argument to the label shown in diagnostics.
_ARTIFACT_DEPLOYMENT_TYPES = {
    'zip': "ZipDeploy",
    'war': "WarDeploy",
    'jar': "JarDeploy",
    'ear': "EarDeploy",
    'startup': "StartupFile",
    'static': "StaticDeploy",
}

_INSTANCE_COUNT_KEYS = ('numberOfInstancesInProgress', 'numberOfInstancesSuccessful',
                        'numberOfInstancesFailed')

_MAX_REPORTED_ERRORS = 3
_MAX_RAW_ERROR_CHARS = 500


def _from_params(params, attr):
    """Return ``params.<attr>``, or None when *params* is falsy or lacks it."""
    return getattr(params, attr, None) if params else None


def _runtime_from_config(config):
    """Derive a display string for the runtime from a site-config object.

    ``linux_fx_version`` (e.g. "PYTHON|3.11") wins when set; otherwise the first
    populated attribute of ``_WINDOWS_RUNTIME_ATTRS`` is rendered as
    "<Title Cased Name> <value>".  Returns "Unknown" when nothing is set.
    """
    linux_fx = getattr(config, 'linux_fx_version', None)
    if linux_fx:
        return linux_fx
    for attr in _WINDOWS_RUNTIME_ATTRS:
        version = getattr(config, attr, None)
        if version:
            label = attr.replace('_version', '').replace('_', ' ').title()
            return f"{label} {version}"
    return "Unknown"


def _get_app_runtime(cmd, resource_group_name, webapp_name, slot=None):
    """Fetch the runtime name/version from the web app (or slot) configuration.

    Best-effort: every failure, including a failed client construction or
    service call, yields "Unknown" instead of raising.
    """
    try:
        from ._client_factory import web_client_factory
        client = web_client_factory(cmd.cli_ctx)
        if slot:
            config = client.web_apps.get_configuration_slot(resource_group_name, webapp_name, slot)
        else:
            config = client.web_apps.get_configuration(resource_group_name, webapp_name)
        return _runtime_from_config(config)
    except Exception:  # pylint: disable=broad-except
        return "Unknown"


def _get_app_region(cmd, resource_group_name, webapp_name):
    """Fetch the ``location`` of the web app; "Unknown" on any failure.

    The production app is queried; no slot is consulted.
    """
    try:
        from ._client_factory import web_client_factory
        client = web_client_factory(cmd.cli_ctx)
        app = client.web_apps.get(resource_group_name, webapp_name)
        return app.location if app else "Unknown"
    except Exception:  # pylint: disable=broad-except
        return "Unknown"


def _get_app_plan_sku(cmd, resource_group_name, webapp_name):
    """Fetch the App Service plan SKU name (e.g. B1, P1V2); "Unknown" on any failure.

    The plan is located by parsing the app's ``server_farm_id`` resource id.
    """
    try:
        from ._client_factory import web_client_factory
        from azure.mgmt.core.tools import parse_resource_id
        client = web_client_factory(cmd.cli_ctx)
        app = client.web_apps.get(resource_group_name, webapp_name)
        if app and app.server_farm_id:
            plan_parts = parse_resource_id(app.server_farm_id)
            plan = client.app_service_plans.get(plan_parts['resource_group'], plan_parts['name'])
            if plan and plan.sku:
                return plan.sku.name
        return "Unknown"
    except Exception:  # pylint: disable=broad-except
        return "Unknown"


def _determine_deployment_type(params=None, *, src_url=None, artifact_type=None):
    """Infer the deployment mechanism from a params object or explicit kwargs.

    Explicit kwargs override the values read from *params*.  A truthy source
    URL always means "OneDeploy (URL-based)"; otherwise the artifact type is
    looked up in ``_ARTIFACT_DEPLOYMENT_TYPES`` and anything unrecognised is
    reported as plain "OneDeploy".
    """
    _src_url = src_url if src_url is not None else _from_params(params, 'src_url')
    _artifact = artifact_type if artifact_type is not None else _from_params(params, 'artifact_type')

    if _src_url:
        return "OneDeploy (URL-based)"
    if isinstance(_artifact, str):
        return _ARTIFACT_DEPLOYMENT_TYPES.get(_artifact, "OneDeploy")
    return "OneDeploy"


def _coerce_instance_count(value):
    """Return *value* as an int, or as a string when it is not numeric."""
    try:
        return int(value)
    except (TypeError, ValueError):
        # Keep the raw value rather than crash while reporting another error.
        return str(value)


def _collect_deployment_diagnostics(deployment_properties):
    """Extract instance counts, errors and failed-instance logs.

    Returns a dict with any of the keys ``instanceStatus``, ``deploymentErrors``
    and ``failedInstanceLogs`` (in that order).  Malformed input is tolerated:
    anything without a ``get`` method yields an empty dict, non-numeric counts
    are kept as strings and non-dict error entries are reported by their
    string form.
    """
    diagnostics = {}
    lookup = getattr(deployment_properties, 'get', None)
    if not callable(lookup):
        return diagnostics

    instance_status = {}
    for key in _INSTANCE_COUNT_KEYS:
        val = lookup(key)
        if val is not None:
            instance_status[key] = _coerce_instance_count(val)
    if instance_status:
        diagnostics["instanceStatus"] = instance_status

    errors = lookup('errors')
    if errors and isinstance(errors, (list, tuple)):
        reported = []
        for entry in errors[:_MAX_REPORTED_ERRORS]:  # cap the amount of detail shown
            if isinstance(entry, dict):
                reported.append({"code": entry.get('extendedCode', ''),
                                 "message": entry.get('message', '')})
            else:
                reported.append({"code": '', "message": str(entry)})
        diagnostics["deploymentErrors"] = reported

    logs = lookup('failedInstancesLogs')
    if logs:
        # A single-entry list is unwrapped so the YAML shows the entry directly.
        single_entry = isinstance(logs, (list, tuple)) and len(logs) == 1
        diagnostics["failedInstanceLogs"] = logs[0] if single_entry else logs

    return diagnostics


def build_enriched_error_context(params=None, *, cmd=None, resource_group_name=None,
                                 webapp_name=None, slot=None, src_url=None,
                                 artifact_type=None, status_code=None, error_message=None,
                                 deployment_status=None, deployment_properties=None,
                                 last_known_step=None, kudu_status=None):
    """
    Build a structured context-enriched error dict for a deployment failure.

    Accepts either a *params* object (``OneDeployParams``) **or** individual
    keyword arguments.  Explicit kwargs override values read from *params*.
    Building the context never raises because of malformed
    ``deployment_properties`` or a non-string ``error_message``.

    Parameters
    ----------
    params : OneDeployParams, optional
        The deployment parameters object.
    cmd, resource_group_name, webapp_name, slot, src_url, artifact_type :
        Individual app-context values; used when *params* is not supplied.
    status_code : int, optional
        HTTP status code of the failed response.
    error_message : str, optional
        Raw error message / response body text.
    deployment_status : str, optional
        Deployment status string (e.g. RuntimeFailed, BuildFailed).
    deployment_properties : dict, optional
        Full deployment properties dict from the status API.
    last_known_step : str, optional
        The last step that completed successfully.
    kudu_status : str, optional
        The SCM/Kudu HTTP status if available.

    Returns
    -------
    dict
        Structured error context ready for display.  Keys are inserted in
        display order: errorCode, stage, runtime, region, planSku,
        deploymentType, commonCauses, suggestedFixes, then the optional
        lastKnownStep, kuduStatus, instanceStatus, deploymentErrors,
        failedInstanceLogs and rawError.
    """
    # Normalise -- extract from params when available, explicit kwargs win.
    _cmd = cmd or _from_params(params, 'cmd')
    _rg = resource_group_name or _from_params(params, 'resource_group_name')
    _name = webapp_name or _from_params(params, 'webapp_name')
    _slot = slot if slot is not None else _from_params(params, 'slot')
    _src_url = src_url if src_url is not None else _from_params(params, 'src_url')
    _artifact = artifact_type if artifact_type is not None else _from_params(params, 'artifact_type')
    error_text = None if error_message is None else str(error_message)

    pattern = match_failure_pattern(
        status_code=status_code,
        error_message=error_text,
        deployment_status=deployment_status
    )

    context = {}

    if pattern:
        context["errorCode"] = pattern["errorCode"]
        context["stage"] = pattern["stage"]
    else:
        context["errorCode"] = f"HTTP_{status_code}" if status_code else "UnknownDeploymentError"
        context["stage"] = deployment_status or "Unknown"

    # App metadata (best-effort; each lookup degrades to "Unknown").
    if _cmd and _rg and _name:
        context["runtime"] = _get_app_runtime(_cmd, _rg, _name, _slot)
        context["region"] = _get_app_region(_cmd, _rg, _name)
        context["planSku"] = _get_app_plan_sku(_cmd, _rg, _name)
    else:
        context["runtime"] = "Unknown"
        context["region"] = "Unknown"
        context["planSku"] = "Unknown"

    context["deploymentType"] = _determine_deployment_type(
        params, src_url=_src_url, artifact_type=_artifact
    )

    if pattern:
        # Copy so callers mutating the context cannot alter the shared pattern data.
        context["commonCauses"] = list(pattern["commonCauses"])
        context["suggestedFixes"] = list(pattern["suggestedFixes"])
    else:
        context["commonCauses"] = ["Unrecognised failure — see error details below"]
        context["suggestedFixes"] = [
            "Check deployment logs: 'az webapp log deployment show -n {} -g {}'".format(
                _name or '', _rg or ''),
            "Check runtime logs: 'az webapp log tail -n {} -g {}'".format(
                _name or '', _rg or '')
        ]

    if last_known_step:
        context["lastKnownStep"] = last_known_step
    if kudu_status:
        context["kuduStatus"] = str(kudu_status)

    if deployment_properties:
        context.update(_collect_deployment_diagnostics(deployment_properties))

    if error_text:
        context["rawError"] = error_text[:_MAX_RAW_ERROR_CHARS]  # truncate long bodies

    return context


def format_enriched_error_message(context):
    """
    Format the structured context dict into a human-readable error message.

    The output has, in order: a banner, the YAML "COPILOT CONTEXT" block, a
    field summary (missing fields print as "Unknown"), the cause and fix
    lists when present, the raw error when present, and a Copilot prompt.
    """
    rule = "=" * 72
    divider = "-" * 72

    lines = [
        "",
        rule,
        "DEPLOYMENT FAILED — Context-Enriched Diagnostics",
        rule,
        "",
        # YAML context block
        "--- COPILOT CONTEXT ---",
        _safe_yaml_dump(context),
        "--- END CONTEXT ---",
        "",
        # Human-readable summary
        f"Error Code : {context.get('errorCode', 'Unknown')}",
        f"Stage : {context.get('stage', 'Unknown')}",
        f"Runtime : {context.get('runtime', 'Unknown')}",
        f"Deploy Type : {context.get('deploymentType', 'Unknown')}",
        f"Region : {context.get('region', 'Unknown')}",
        f"Plan SKU : {context.get('planSku', 'Unknown')}",
        "",
    ]

    for heading, key in (("Common Causes:", "commonCauses"),
                         ("Suggested Fixes:", "suggestedFixes")):
        items = context.get(key, [])
        if items:
            lines.append(heading)
            lines.extend(f" - {item}" for item in items)
            lines.append("")

    if context.get("rawError"):
        lines.append(f"Raw Error : {context['rawError']}")
        lines.append("")

    lines.extend([
        divider,
        "Ask Copilot:",
        ' Copy-paste the COPILOT CONTEXT block above into GitHub Copilot Chat,',
        ' or run:',
        ' gh copilot explain "Paste the COPILOT CONTEXT above and explain',
        ' why this deployment failed and what I should do"',
        divider,
    ])

    return "\n".join(lines)


def raise_enriched_deployment_error(params=None, *, cmd=None, resource_group_name=None,
                                    webapp_name=None, slot=None, src_url=None,
                                    artifact_type=None, status_code=None, error_message=None,
                                    deployment_status=None, deployment_properties=None,
                                    last_known_step=None, kudu_status=None):
    """
    Build context-enriched diagnostics and raise a CLIError.

    This is the main entry-point called from the deployment code paths.
    Accepts either a *params* object or individual keyword arguments; see
    ``build_enriched_error_context`` for their meaning.

    Raises
    ------
    CLIError
        Always; the message is the output of ``format_enriched_error_message``.
    """
    context = build_enriched_error_context(
        params=params,
        cmd=cmd,
        resource_group_name=resource_group_name,
        webapp_name=webapp_name,
        slot=slot,
        src_url=src_url,
        artifact_type=artifact_type,
        status_code=status_code,
        error_message=error_message,
        deployment_status=deployment_status,
        deployment_properties=deployment_properties,
        last_known_step=last_known_step,
        kudu_status=kudu_status
    )

    logger.info("Deployment failure context: %s", context)

    message = format_enriched_error_message(context)
    raise CLIError(message)
import re

# Catalogue of well-known deployment failures.  Every entry carries the keys
# errorCode, stage, commonCauses and suggestedFixes.
DEPLOYMENT_FAILURE_PATTERNS = [
    {
        "errorCode": "ZipDeployTimeout",
        "stage": "ZipExtract",
        "commonCauses": [
            "Large node_modules or dependency folder",
            "Slow network between client and App Service",
            "B1 plan under-provisioned for artifact size"
        ],
        "suggestedFixes": [
            "Scale up the App Service plan (e.g., B1 -> P1V2)",
            "Set SCM_DO_BUILD_DURING_DEPLOYMENT=false to disable remote build",
            "Reduce artifact size by excluding dev dependencies",
            "Retry the deployment"
        ]
    },
    {
        "errorCode": "Exit137",
        "stage": "ContainerStartup",
        "commonCauses": [
            "Out-of-memory (OOM) kill during startup",
            "Startup memory spike exceeds plan limits"
        ],
        "suggestedFixes": [
            "Scale up the App Service plan to get more memory",
            "Reduce startup memory footprint",
            "Lazy-load heavy dependencies instead of importing at startup"
        ]
    },
    {
        "errorCode": "OryxBuildFailed",
        "stage": "Build",
        "commonCauses": [
            "Missing requirements.txt or package.json",
            "Oryx build system misconfigured",
            "Incompatible dependency versions"
        ],
        "suggestedFixes": [
            "Ensure the correct build manifest file exists (requirements.txt / package.json)",
            "Set SCM_DO_BUILD_DURING_DEPLOYMENT=false and pre-build artifacts locally",
            "Check Oryx build logs for specific dependency errors"
        ]
    },
    {
        "errorCode": "StartupProbeFailed",
        "stage": "ContainerStartup",
        "commonCauses": [
            "Application not listening on the expected port",
            "Slow initialization exceeding probe timeout"
        ],
        "suggestedFixes": [
            "Increase WEBSITES_CONTAINER_START_TIME_LIMIT (e.g., to 600)",
            "Verify the PORT environment variable and that the app binds to it",
            "Add a /health or /ready endpoint for the startup probe"
        ]
    },
    {
        "errorCode": "AuthFailed",
        "stage": "Deployment",
        "commonCauses": [
            "RBAC role not assigned or misconfigured",
            "Managed identity not enabled on the app"
        ],
        "suggestedFixes": [
            "Ensure the deploying identity has Contributor or Website Contributor role",
            "Enable system-assigned managed identity on the web app",
            "Run 'az role assignment list' to verify permissions"
        ]
    },
    {
        "errorCode": "AppOfflineDetected",
        "stage": "Deployment",
        "commonCauses": [
            "Deployment file lock preventing updates",
            "app_offline.htm file left from a previous deployment"
        ],
        "suggestedFixes": [
            "Remove the app_offline.htm file from wwwroot",
            "Retry the deployment after a brief wait",
            "Restart the app before redeploying"
        ]
    },
    {
        "errorCode": "DockerImagePullFailed",
        "stage": "ContainerStartup",
        "commonCauses": [
            "Invalid image name or tag",
            "Container registry authentication failure",
            "Network connectivity issue to registry"
        ],
        "suggestedFixes": [
            "Verify the image name and tag exist in the registry",
            "Check container registry credentials and permissions",
            "Ensure network connectivity between App Service and the registry"
        ]
    },
    {
        "errorCode": "SCMTimeout",
        "stage": "ZipExtract",
        "commonCauses": [
            "Slow SCM (Kudu) operations under load",
            "Very large deployment artifact"
        ],
        "suggestedFixes": [
            "Split deployment into smaller artifacts",
            "Set SCM_DO_BUILD_DURING_DEPLOYMENT=false to skip build during deploy",
            "Retry the deployment"
        ]
    },
    {
        "errorCode": "ConfigConflict",
        "stage": "ConfigUpdate",
        "commonCauses": [
            "Conflicting settings between portal and CLI/ARM",
            "Stale configuration cached by the platform"
        ],
        "suggestedFixes": [
            "Resolve conflicts manually in the Azure portal",
            "Use an ARM template or Bicep to enforce consistent configuration",
            "Run 'az webapp config show' to review current settings"
        ]
    },
    {
        "errorCode": "RuntimeMismatch",
        "stage": "ContainerStartup",
        "commonCauses": [
            "Runtime version set in config does not match deployed code",
            "Container base image uses a different runtime version"
        ],
        "suggestedFixes": [
            "Update the runtime stack via 'az webapp config set --linux-fx-version'",
            "Rebuild the container image with the correct runtime version",
            "Check 'az webapp config show' for linuxFxVersion or windowsFxVersion"
        ]
    },
    {
        "errorCode": "SSLValidationFailed",
        "stage": "Deployment",
        "commonCauses": [
            "Invalid or expired SSL certificate",
            "Certificate-key mismatch"
        ],
        "suggestedFixes": [
            "Upload a valid SSL certificate with matching private key",
            "Check certificate expiration date",
            "Verify the certificate password is correct"
        ]
    },
    {
        "errorCode": "InsufficientQuota",
        "stage": "Deployment",
        "commonCauses": [
            "App Service plan instance or core limits reached",
            "Subscription quota exhausted"
        ],
        "suggestedFixes": [
            "Upgrade the App Service plan to a higher tier",
            "Free up quota by deleting unused apps",
            "Request a quota increase via Azure support"
        ]
    },
    {
        "errorCode": "PermissionDenied",
        "stage": "Deployment",
        "commonCauses": [
            "Service principal or user lacks required RBAC role",
            "Scope of role assignment is incorrect"
        ],
        "suggestedFixes": [
            "Assign Contributor or Website Contributor role at the correct scope",
            "Run 'az role assignment list --assignee PRINCIPAL_ID' to verify",
            "Check if a deny assignment or policy is blocking access"
        ]
    },
    {
        "errorCode": "FileLockError",
        "stage": "ZipExtract",
        "commonCauses": [
            "File in use by a running process during deployment",
            "Antivirus or file lock from another deployment"
        ],
        "suggestedFixes": [
            "Stop the app before deploying: 'az webapp stop'",
            "Retry the deployment after a short delay",
            "Enable MSDEPLOY_RENAME_LOCKED_FILES=1 in app settings"
        ]
    },
    {
        "errorCode": "ColdStartTimeout",
        "stage": "ContainerStartup",
        "commonCauses": [
            "Large dependency tree causing slow cold start",
            "No pre-warmed instances available"
        ],
        "suggestedFixes": [
            "Increase WEBSITES_CONTAINER_START_TIME_LIMIT",
            "Scale up the plan for faster cold starts",
            "Enable Always On to avoid cold starts"
        ]
    },
    {
        "errorCode": "DBConnectionFailed",
        "stage": "ContainerStartup",
        "commonCauses": [
            "Database connection string missing from app settings",
            "Database firewall blocking App Service IP"
        ],
        "suggestedFixes": [
            "Set the connection string via 'az webapp config connection-string set'",
            "Add App Service outbound IPs to the database firewall rules",
            "Use a service connector: 'az webapp connection create'"
        ]
    },
    {
        "errorCode": "WebJobFailed",
        "stage": "WebJobStartup",
        "commonCauses": [
            "Missing runtime for the WebJob",
            "Package or dependency errors in the WebJob"
        ],
        "suggestedFixes": [
            "Check WebJob runtime requirements and logs",
            "Run 'az webapp webjob continuous list' to see WebJob status",
            "Review logs at https://APP_NAME.scm.azurewebsites.net/api/continuouswebjobs"
        ]
    },
    {
        "errorCode": "PortBindingError",
        "stage": "ContainerStartup",
        "commonCauses": [
            "Container not exposing port 80 or 8080",
            "WEBSITES_PORT not set to the correct port"
        ],
        "suggestedFixes": [
            "Set WEBSITES_PORT app setting to match the container's listening port",
            "Ensure the Dockerfile exposes the correct port",
            "Check 'az webapp config appsettings list' for WEBSITES_PORT"
        ]
    },
    {
        "errorCode": "AppSettingsMisconfigured",
        "stage": "ContainerStartup",
        "commonCauses": [
            "Missing required environment variables",
            "Incorrect app setting names or values"
        ],
        "suggestedFixes": [
            "Review app settings: 'az webapp config appsettings list'",
            "Set required environment variables: 'az webapp config appsettings set'",
            "Compare with working configuration or documentation"
        ]
    },
    {
        "errorCode": "StorageMountFailed",
        "stage": "ContainerStartup",
        "commonCauses": [
            "SMB/NFS mount failure due to storage account issues",
            "Incorrect storage credentials or share name"
        ],
        "suggestedFixes": [
            "Verify the storage account name, key, and share exist",
            "Check network connectivity (private endpoints, firewalls)",
            "Run 'az webapp config storage-account list' to review mounts"
        ]
    }
]

# Index for O(1) lookup by error code
_PATTERN_INDEX = {p["errorCode"]: p for p in DEPLOYMENT_FAILURE_PATTERNS}

# Boundary-aware matchers, applied to lower-cased text.  A plain substring test
# misfires on everyday words: "port" is inside "import"/"support"/"portal",
# "lock" is inside "blocked"/"clock", and "137" is inside longer numbers.
_PORT_TERM = re.compile(r"(?<![a-z])ports?(?![a-z])")   # port, ports, WEBSITES_PORT
_LOCK_TERM = re.compile(r"(?<![a-z])lock")              # lock, locked, locking
_EXIT_137_TERM = re.compile(r"(?<!\d)137(?!\d)")        # exit code 137 only


def get_failure_pattern(error_code):
    """Look up a well-known failure pattern by its error code.

    Returns the pattern dict from ``DEPLOYMENT_FAILURE_PATTERNS`` or None when
    *error_code* is not in the catalogue.
    """
    return _PATTERN_INDEX.get(error_code)


def _classify_runtime_failure(text):
    """Return the error code that best explains a ``RuntimeFailed`` deployment.

    *text* is the lower-cased error message.  Checks run from most to least
    specific; ``StartupProbeFailed`` is the generic fallback.
    """
    def has(*needles):
        return any(needle in text for needle in needles)

    if _EXIT_137_TERM.search(text) or has("oom", "out of memory"):
        return "Exit137"
    if _PORT_TERM.search(text) or has("bind"):
        return "PortBindingError"
    if has("probe", "health"):
        return "StartupProbeFailed"
    if has("image", "pull", "docker"):
        return "DockerImagePullFailed"
    if has("runtime") and has("mismatch"):
        return "RuntimeMismatch"
    if has("connection") and has("db", "database", "sql"):
        return "DBConnectionFailed"
    if has("storage", "mount", "smb"):
        return "StorageMountFailed"
    if has("setting", "env"):  # "env" also covers "environment"
        return "AppSettingsMisconfigured"
    if has("cold", "startup", "timeout"):
        return "ColdStartTimeout"
    # Generic runtime failure: StartupProbeFailed is the closest match.
    return "StartupProbeFailed"


def match_failure_pattern(status_code=None, error_message=None, deployment_status=None):
    """
    Attempt to match an error to a well-known failure pattern based on heuristics.

    Signals are consulted in this order: HTTP status code, quota keywords,
    deployment status (``BuildFailed`` / ``RuntimeFailed``), then keywords in
    the message.  Matching is case-insensitive.

    Parameters
    ----------
    status_code : int, optional
        HTTP status code of the failed response.
    error_message : str, optional
        Error text; non-string values are converted with ``str()``.
    deployment_status : str, optional
        Deployment status reported by the status API.

    Returns
    -------
    dict or None
        The matched pattern dict, or None when nothing matches.
    """
    text = "" if error_message is None else str(error_message).lower()

    def has(*needles):
        return any(needle in text for needle in needles)

    # Status code based matching
    if status_code in (504, 408):
        return get_failure_pattern("SCMTimeout" if has("scm", "kudu") else "ZipDeployTimeout")

    if status_code in (401, 403):
        if has("ssl", "cert"):
            return get_failure_pattern("SSLValidationFailed")
        if has("permission", "denied"):
            return get_failure_pattern("PermissionDenied")
        return get_failure_pattern("AuthFailed")

    if status_code == 409:
        if _LOCK_TERM.search(text):
            return get_failure_pattern("FileLockError")
        if has("offline"):
            return get_failure_pattern("AppOfflineDetected")
        # An unexplained 409 falls through to the generic checks below.

    if status_code == 429 or has("quota", "insufficient"):
        return get_failure_pattern("InsufficientQuota")

    # Deployment status based matching
    if deployment_status == "BuildFailed":
        # Every build failure maps to the Oryx pattern, the only build-stage entry.
        return get_failure_pattern("OryxBuildFailed")

    if deployment_status == "RuntimeFailed":
        return get_failure_pattern(_classify_runtime_failure(text))

    # Message-based matching (fallback heuristics)
    if has("artifact type") and has("cannot be deployed to stack"):
        return get_failure_pattern("RuntimeMismatch")
    if has("webjob"):
        return get_failure_pattern("WebJobFailed")
    if has("config") and has("conflict"):
        return get_failure_pattern("ConfigConflict")
    if has("offline"):
        return get_failure_pattern("AppOfflineDetected")
    if has("timeout"):
        return get_failure_pattern("ZipDeployTimeout")
    if has("permission", "denied", "unauthorized"):
        return get_failure_pattern("PermissionDenied")
    if has("quota", "exceeded"):
        return get_failure_pattern("InsufficientQuota")
    if _LOCK_TERM.search(text):
        return get_failure_pattern("FileLockError")

    return None
_should_enrich_errors: + raise_enriched_deployment_error( + cmd=cmd, + resource_group_name=resource_group_name, + webapp_name=name, + slot=slot, + artifact_type="zip", + error_message=str(deploy_err), + last_known_step="Zip deployment accepted (HTTP 202), tracking status" + ) + raise return response_body # check if there's an ongoing process @@ -933,6 +950,18 @@ def enable_zip_deploy(cmd, resource_group_name, name, src, timeout=None, slot=No # check if an error occured during deployment if res.status_code: + if _should_enrich_errors: + raise_enriched_deployment_error( + cmd=cmd, + resource_group_name=resource_group_name, + webapp_name=name, + slot=slot, + artifact_type="zip", + status_code=res.status_code, + error_message=res.text if res.text else None, + last_known_step="Zip deployment HTTP request", + kudu_status=str(res.status_code) + ) raise AzureInternalError("An error occured during deployment. Status Code: {}, Details: {}" .format(res.status_code, res.text)) @@ -9843,15 +9872,27 @@ def _make_onedeploy_request(params): if response.status_code == 202 or response.status_code == 200: response_body = None if poll_async_deployment_for_debugging: - if params.track_status is not None and params.track_status: - response_body = _check_runtimestatus_with_deploymentstatusapi(params.cmd, params.resource_group_name, - params.webapp_name, params.slot, - deployment_status_url, - params.is_async_deployment, - params.timeout) - else: - response_body = _check_zip_deployment_status(params.cmd, params.resource_group_name, params.webapp_name, - deployment_status_url, params.slot, params.timeout) + try: + if params.track_status is not None and params.track_status: + response_body = _check_runtimestatus_with_deploymentstatusapi(params.cmd, + params.resource_group_name, + params.webapp_name, params.slot, + deployment_status_url, + params.is_async_deployment, + params.timeout) + else: + response_body = _check_zip_deployment_status(params.cmd, params.resource_group_name, + 
params.webapp_name, + deployment_status_url, params.slot, params.timeout) + except CLIError as deploy_err: + if not params.is_functionapp: + # Enrich the downstream deployment-tracking error with context + raise_enriched_deployment_error( + params=params, + error_message=str(deploy_err), + last_known_step="Deployment accepted (HTTP 200/202), tracking status" + ) + raise logger.info('Server response: %s', response_body) else: if 'application/json' in response.headers.get('content-type', ""): @@ -9874,8 +9915,16 @@ def _make_onedeploy_request(params): "starting a new deployment. You can track the ongoing deployment at {}" .format(deployment_status_url)) - # check if an error occured during deployment + # check if an error occurred during deployment if response.status_code: + if not params.is_functionapp: + raise_enriched_deployment_error( + params=params, + status_code=response.status_code, + error_message=response.text if response.text else None, + last_known_step="HTTP request sent to deployment API", + kudu_status=str(response.status_code) + ) scm_url = _get_scm_url(params.cmd, params.resource_group_name, params.webapp_name, params.slot) latest_deploymentinfo_url = scm_url + "/api/deployments/latest" raise CLIError("An error occurred during deployment. Status Code: {}, {} Please visit {}" @@ -9892,8 +9941,43 @@ def _perform_onedeploy_internal(params): # Now make the OneDeploy API call logger.warning("Initiating deployment") - response = _make_onedeploy_request(params) - return response + try: + response = _make_onedeploy_request(params) + return response + except (ValidationError, ResourceNotFoundError): + # Known CLI validation errors (e.g. 
409 conflict, 404 API not available) — re-raise as-is + raise + except CLIError as ex: + # Check if this is already an enriched error (from raise_enriched_deployment_error) + if "COPILOT CONTEXT" in str(ex): + raise + if not params.is_functionapp: + # Raw CLIError from send_raw_request or other deployment calls — enrich it + raise_enriched_deployment_error( + params=params, + error_message=str(ex), + last_known_step="Deployment request" + ) + raise + except HttpResponseError as ex: + if not params.is_functionapp: + # Azure SDK errors (e.g. Bad Request from ARM) + raise_enriched_deployment_error( + params=params, + status_code=ex.status_code if hasattr(ex, 'status_code') else None, + error_message=str(ex), + last_known_step="ARM deployment request" + ) + raise + except Exception as ex: # pylint: disable=broad-except + if not params.is_functionapp: + # Catch-all for unexpected errors (connection errors, timeouts, etc.) + raise_enriched_deployment_error( + params=params, + error_message=str(ex), + last_known_step="Deployment request" + ) + raise def _wait_for_webapp(tunnel_server): diff --git a/src/azure-cli/azure/cli/command_modules/appservice/tests/latest/test_deployment_context_engine.py b/src/azure-cli/azure/cli/command_modules/appservice/tests/latest/test_deployment_context_engine.py new file mode 100644 index 00000000000..e9559cb3290 --- /dev/null +++ b/src/azure-cli/azure/cli/command_modules/appservice/tests/latest/test_deployment_context_engine.py @@ -0,0 +1,435 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. 
# --------------------------------------------------------------------------------------------

"""
Unit tests for the deployment context engineering feature:
  - _deployment_failure_patterns.py  (pattern matching)
  - _deployment_context_engine.py    (context building & formatting)
"""

import unittest
from unittest.mock import MagicMock, patch

from knack.util import CLIError

from azure.cli.command_modules.appservice._deployment_failure_patterns import (
    DEPLOYMENT_FAILURE_PATTERNS,
    get_failure_pattern,
    match_failure_pattern,
)
from azure.cli.command_modules.appservice._deployment_context_engine import (
    build_enriched_error_context,
    format_enriched_error_message,
    raise_enriched_deployment_error,
    _determine_deployment_type,
)

# Dotted path of the module whose metadata helpers are stubbed out in the tests below.
_ENGINE_MODULE = "azure.cli.command_modules.appservice._deployment_context_engine"

# Attribute name -> default value for the mock params object built by _make_mock_params.
_MOCK_PARAM_DEFAULTS = {
    "resource_group_name": "test-rg",
    "webapp_name": "test-app",
    "slot": None,
    "src_url": None,
    "src_path": "app.zip",
    "artifact_type": "zip",
    "is_async_deployment": None,
    "timeout": None,
    "track_status": True,
    "enable_kudu_warmup": True,
    "is_linux_webapp": True,
    "is_functionapp": False,
}


def _make_mock_params(**overrides):
    """Create a minimal mock OneDeployParams object.

    :param overrides: attribute values replacing the defaults listed in
        ``_MOCK_PARAM_DEFAULTS``.
    :return: a ``MagicMock`` with ``cmd``, ``cmd.cli_ctx`` and every default
        attribute populated.
    :raises TypeError: if an override name is not a known attribute. Without
        this check a misspelled key would be dropped silently and the test
        would run against the default value instead.
    """
    unknown = sorted(set(overrides) - set(_MOCK_PARAM_DEFAULTS))
    if unknown:
        raise TypeError(
            "_make_mock_params() got unexpected override(s): {}".format(", ".join(unknown))
        )

    params = MagicMock()
    params.cmd = MagicMock()
    params.cmd.cli_ctx = MagicMock()
    for name, default in _MOCK_PARAM_DEFAULTS.items():
        setattr(params, name, overrides.get(name, default))
    return params


class _AppMetadataPatchMixin:
    """Shared stubbing of the app-metadata lookups used by the context engine.

    Subclasses set the three class attributes below to the values the stubbed
    lookups should return, and call ``_patch_app_metadata()`` from a test.
    """

    # Values returned by the stubbed _get_app_runtime / _get_app_region / _get_app_plan_sku.
    METADATA_RUNTIME = "Unknown"
    METADATA_REGION = "Unknown"
    METADATA_SKU = "Unknown"

    def _patch_app_metadata(self):
        """Patch the metadata fetching functions to avoid real API calls.

        The started mocks are exposed as ``self.mock_runtime``,
        ``self.mock_region`` and ``self.mock_sku``.
        """
        stubs = (
            ("mock_runtime", "_get_app_runtime", self.METADATA_RUNTIME),
            ("mock_region", "_get_app_region", self.METADATA_REGION),
            ("mock_sku", "_get_app_plan_sku", self.METADATA_SKU),
        )
        for attr_name, func_name, value in stubs:
            patcher = patch("{}.{}".format(_ENGINE_MODULE, func_name), return_value=value)
            setattr(self, attr_name, patcher.start())
            # Register the cleanup right after start() so an already-active
            # patch is still undone if a later patcher fails to start.
            self.addCleanup(patcher.stop)


# ---------------------------------------------------------------------------
# Tests for _deployment_failure_patterns
# ---------------------------------------------------------------------------
class TestDeploymentFailurePatterns(unittest.TestCase):
    """Tests for the failure pattern definitions and lookup functions."""

    def test_all_patterns_have_required_keys(self):
        required_keys = {"errorCode", "stage", "commonCauses", "suggestedFixes"}
        for pattern in DEPLOYMENT_FAILURE_PATTERNS:
            with self.subTest(errorCode=pattern["errorCode"]):
                self.assertTrue(required_keys.issubset(pattern.keys()))
                self.assertIsInstance(pattern["commonCauses"], list)
                self.assertIsInstance(pattern["suggestedFixes"], list)
                self.assertGreater(len(pattern["commonCauses"]), 0)
                self.assertGreater(len(pattern["suggestedFixes"]), 0)

    def test_pattern_count(self):
        self.assertEqual(len(DEPLOYMENT_FAILURE_PATTERNS), 20)

    def test_get_failure_pattern_found(self):
        pattern = get_failure_pattern("ZipDeployTimeout")
        self.assertIsNotNone(pattern)
        self.assertEqual(pattern["errorCode"], "ZipDeployTimeout")
        self.assertEqual(pattern["stage"], "ZipExtract")

    def test_get_failure_pattern_not_found(self):
        self.assertIsNone(get_failure_pattern("NonExistentCode"))

    # --- match_failure_pattern: status-code based ---
    def test_match_504_returns_zip_deploy_timeout(self):
        p = match_failure_pattern(status_code=504)
        self.assertEqual(p["errorCode"], "ZipDeployTimeout")

    def test_match_504_with_scm_returns_scm_timeout(self):
        p = match_failure_pattern(status_code=504, error_message="SCM site timed out")
        self.assertEqual(p["errorCode"], "SCMTimeout")

    def test_match_408_returns_zip_deploy_timeout(self):
        p = match_failure_pattern(status_code=408)
        self.assertEqual(p["errorCode"], "ZipDeployTimeout")

    def test_match_401_returns_auth_failed(self):
        p = match_failure_pattern(status_code=401)
        self.assertEqual(p["errorCode"], "AuthFailed")

    def test_match_403_ssl_returns_ssl_validation_failed(self):
        p = match_failure_pattern(status_code=403, error_message="SSL certificate error")
        self.assertEqual(p["errorCode"], "SSLValidationFailed")

    def test_match_403_permission_denied(self):
        p = match_failure_pattern(status_code=403, error_message="Permission denied")
        self.assertEqual(p["errorCode"], "PermissionDenied")

    def test_match_409_lock_returns_file_lock_error(self):
        p = match_failure_pattern(status_code=409, error_message="File is locked")
        self.assertEqual(p["errorCode"], "FileLockError")

    def test_match_429_returns_insufficient_quota(self):
        p = match_failure_pattern(status_code=429)
        self.assertEqual(p["errorCode"], "InsufficientQuota")

    # --- match_failure_pattern: deployment-status based ---
    def test_match_build_failed(self):
        p = match_failure_pattern(deployment_status="BuildFailed")
        self.assertEqual(p["errorCode"], "OryxBuildFailed")

    def test_match_runtime_failed_oom(self):
        p = match_failure_pattern(deployment_status="RuntimeFailed",
                                  error_message="Container killed with exit code 137")
        self.assertEqual(p["errorCode"], "Exit137")

    def test_match_runtime_failed_port(self):
        p = match_failure_pattern(deployment_status="RuntimeFailed",
                                  error_message="Failed to bind to port 8080")
        self.assertEqual(p["errorCode"], "PortBindingError")

    def test_match_runtime_failed_probe(self):
        p = match_failure_pattern(deployment_status="RuntimeFailed",
                                  error_message="Health probe failed")
        self.assertEqual(p["errorCode"], "StartupProbeFailed")

    def test_match_runtime_failed_docker(self):
        p = match_failure_pattern(deployment_status="RuntimeFailed",
                                  error_message="Failed to pull Docker image")
        self.assertEqual(p["errorCode"], "DockerImagePullFailed")

    def test_match_runtime_failed_generic(self):
        p = match_failure_pattern(deployment_status="RuntimeFailed",
                                  error_message="some unknown error")
        self.assertIsNotNone(p)  # should still return a pattern

    # --- match_failure_pattern: message-based fallback ---
    def test_match_webjob_message(self):
        p = match_failure_pattern(error_message="WebJob startup error")
        self.assertEqual(p["errorCode"], "WebJobFailed")

    def test_match_timeout_message(self):
        p = match_failure_pattern(error_message="Timeout reached while tracking status")
        self.assertEqual(p["errorCode"], "ZipDeployTimeout")

    def test_match_no_match(self):
        p = match_failure_pattern(status_code=200, error_message="all good")
        self.assertIsNone(p)


# ---------------------------------------------------------------------------
# Tests for _deployment_context_engine
# ---------------------------------------------------------------------------
class TestDeploymentContextEngine(_AppMetadataPatchMixin, unittest.TestCase):
    """Tests for the context builder and formatter."""

    METADATA_RUNTIME = "PYTHON|3.11"
    METADATA_REGION = "Central India"
    METADATA_SKU = "B1"

    def test_determine_deployment_type_zip(self):
        params = _make_mock_params(artifact_type="zip", src_url=None)
        self.assertEqual(_determine_deployment_type(params), "ZipDeploy")

    def test_determine_deployment_type_url(self):
        params = _make_mock_params(src_url="https://example.com/app.zip")
        self.assertEqual(_determine_deployment_type(params), "OneDeploy (URL-based)")

    def test_determine_deployment_type_war(self):
        params = _make_mock_params(artifact_type="war", src_url=None)
        self.assertEqual(_determine_deployment_type(params), "WarDeploy")

    def test_determine_deployment_type_kwargs_zip(self):
        """kwargs-only calling convention (no params object)."""
        self.assertEqual(_determine_deployment_type(artifact_type="zip"), "ZipDeploy")

    def test_determine_deployment_type_kwargs_url(self):
        self.assertEqual(
            _determine_deployment_type(src_url="https://example.com/app.zip"),
            "OneDeploy (URL-based)"
        )

    def test_determine_deployment_type_kwargs_override(self):
        """Explicit kwargs should override params values."""
        params = _make_mock_params(artifact_type="war", src_url=None)
        self.assertEqual(
            _determine_deployment_type(params, artifact_type="jar"),
            "JarDeploy"
        )

    def test_build_context_with_known_pattern(self):
        self._patch_app_metadata()
        params = _make_mock_params()
        ctx = build_enriched_error_context(
            params, status_code=504, error_message="Gateway Timeout"
        )
        self.assertEqual(ctx["errorCode"], "ZipDeployTimeout")
        self.assertEqual(ctx["stage"], "ZipExtract")
        self.assertEqual(ctx["runtime"], "PYTHON|3.11")
        self.assertEqual(ctx["region"], "Central India")
        self.assertEqual(ctx["planSku"], "B1")
        self.assertEqual(ctx["deploymentType"], "ZipDeploy")
        self.assertIn("commonCauses", ctx)
        self.assertIn("suggestedFixes", ctx)

    def test_build_context_with_unknown_error(self):
        self._patch_app_metadata()
        params = _make_mock_params()
        ctx = build_enriched_error_context(
            params, status_code=599, error_message="Something weird"
        )
        self.assertEqual(ctx["errorCode"], "HTTP_599")
        self.assertIn("rawError", ctx)

    def test_build_context_with_deployment_properties(self):
        self._patch_app_metadata()
        params = _make_mock_params()
        props = {
            "numberOfInstancesInProgress": "1",
            "numberOfInstancesSuccessful": "0",
            "numberOfInstancesFailed": "2",
            "errors": [{"extendedCode": "EXT001", "message": "OOM killed"}],
            "failedInstancesLogs": ["https://logs.example.com/log1"]
        }
        ctx = build_enriched_error_context(
            params, deployment_status="RuntimeFailed",
            error_message="OOM killed", deployment_properties=props
        )
        self.assertEqual(ctx["errorCode"], "Exit137")
        self.assertIn("instanceStatus", ctx)
        self.assertEqual(ctx["instanceStatus"]["numberOfInstancesFailed"], 2)
        self.assertIn("deploymentErrors", ctx)
        self.assertEqual(ctx["failedInstanceLogs"], "https://logs.example.com/log1")

    def test_build_context_includes_last_known_step(self):
        self._patch_app_metadata()
        params = _make_mock_params()
        ctx = build_enriched_error_context(
            params, status_code=504, last_known_step="ZipExtract started"
        )
        self.assertEqual(ctx["lastKnownStep"], "ZipExtract started")

    def test_build_context_includes_kudu_status(self):
        self._patch_app_metadata()
        params = _make_mock_params()
        ctx = build_enriched_error_context(
            params, status_code=504, kudu_status="504"
        )
        self.assertEqual(ctx["kuduStatus"], "504")

    def test_format_error_message_contains_key_sections(self):
        self._patch_app_metadata()
        params = _make_mock_params()
        ctx = build_enriched_error_context(params, status_code=504)
        msg = format_enriched_error_message(ctx)

        self.assertIn("DEPLOYMENT FAILED", msg)
        self.assertIn("COPILOT CONTEXT", msg)
        self.assertIn("ZipDeployTimeout", msg)
        self.assertIn("ZipExtract", msg)
        self.assertIn("Common Causes:", msg)
        self.assertIn("Suggested Fixes:", msg)
        self.assertIn("Ask Copilot:", msg)
        self.assertIn("gh copilot explain", msg)

    def test_format_error_message_yaml_block(self):
        self._patch_app_metadata()
        params = _make_mock_params()
        ctx = build_enriched_error_context(params, status_code=504)
        msg = format_enriched_error_message(ctx)

        self.assertIn("--- COPILOT CONTEXT ---", msg)
        self.assertIn("--- END CONTEXT ---", msg)
        # The YAML block should contain the errorCode
        start_idx = msg.index("--- COPILOT CONTEXT ---")
        end_idx = msg.index("--- END CONTEXT ---")
        yaml_block = msg[start_idx:end_idx]
        self.assertIn("errorCode: ZipDeployTimeout", yaml_block)

    def test_raise_enriched_deployment_error(self):
        self._patch_app_metadata()
        params = _make_mock_params()
        with self.assertRaises(CLIError) as cm:
            raise_enriched_deployment_error(
                params, status_code=504, error_message="Gateway Timeout"
            )
        self.assertIn("ZipDeployTimeout", str(cm.exception))
        self.assertIn("COPILOT CONTEXT", str(cm.exception))

    def test_raise_enriched_deployment_error_kwargs_only(self):
        """Call raise_enriched_deployment_error with kwargs instead of params."""
        self._patch_app_metadata()
        mock_cmd = MagicMock()
        mock_cmd.cli_ctx = MagicMock()
        with self.assertRaises(CLIError) as cm:
            raise_enriched_deployment_error(
                cmd=mock_cmd,
                resource_group_name="test-rg",
                webapp_name="test-app",
                artifact_type="zip",
                status_code=504,
                error_message="Gateway Timeout"
            )
        self.assertIn("ZipDeployTimeout", str(cm.exception))
        self.assertIn("COPILOT CONTEXT", str(cm.exception))
        self.assertIn("ZipDeploy", str(cm.exception))

    def test_build_context_kwargs_only(self):
        """Call build_enriched_error_context with kwargs instead of params."""
        self._patch_app_metadata()
        mock_cmd = MagicMock()
        mock_cmd.cli_ctx = MagicMock()
        ctx = build_enriched_error_context(
            cmd=mock_cmd,
            resource_group_name="test-rg",
            webapp_name="test-app",
            artifact_type="zip",
            status_code=504,
            error_message="Gateway Timeout"
        )
        self.assertEqual(ctx["errorCode"], "ZipDeployTimeout")
        self.assertEqual(ctx["deploymentType"], "ZipDeploy")


# ---------------------------------------------------------------------------
# Integration-level test: verify the full error flow
# ---------------------------------------------------------------------------
class TestDeploymentErrorFlow(_AppMetadataPatchMixin, unittest.TestCase):
    """End-to-end tests simulating real deployment failures."""

    METADATA_RUNTIME = "NODE|18"
    METADATA_REGION = "East US"
    METADATA_SKU = "P1V2"

    def test_timeout_scenario(self):
        """Simulate a 504 Gateway Timeout during zip deploy."""
        self._patch_app_metadata()
        params = _make_mock_params(artifact_type="zip")
        with self.assertRaises(CLIError) as cm:
            raise_enriched_deployment_error(
                params, status_code=504,
                error_message="The gateway did not receive a response from the upstream server in time.",
                kudu_status="504"
            )
        error_msg = str(cm.exception)
        self.assertIn("ZipDeployTimeout", error_msg)
        self.assertIn("NODE|18", error_msg)
        self.assertIn("P1V2", error_msg)
        self.assertIn("Scale up the App Service plan", error_msg)

    def test_build_failed_scenario(self):
        """Simulate a build failure (Oryx)."""
        self._patch_app_metadata()
        params = _make_mock_params()
        props = {
            "errors": [{"extendedCode": "ORYX_BUILD_001",
                        "message": "Could not find requirements.txt"}],
            "failedInstancesLogs": []
        }
        with self.assertRaises(CLIError) as cm:
            raise_enriched_deployment_error(
                params, deployment_status="BuildFailed",
                error_message="Oryx build failed: Could not find requirements.txt",
                deployment_properties=props
            )
        error_msg = str(cm.exception)
        self.assertIn("OryxBuildFailed", error_msg)
        self.assertIn("Build", error_msg)

    def test_runtime_failed_oom_scenario(self):
        """Simulate a runtime failure due to OOM (exit 137)."""
        self._patch_app_metadata()
        params = _make_mock_params()
        props = {
            "numberOfInstancesInProgress": "0",
            "numberOfInstancesSuccessful": "0",
            "numberOfInstancesFailed": "1",
            "errors": [{"extendedCode": "RUNTIME_OOM",
                        "message": "Container exited with code 137"}],
            "failedInstancesLogs": ["https://logs.example.com/instance0"]
        }
        with self.assertRaises(CLIError) as cm:
            raise_enriched_deployment_error(
                params, deployment_status="RuntimeFailed",
                error_message="Container exited with code 137 OOM",
                deployment_properties=props
            )
        error_msg = str(cm.exception)
        self.assertIn("Exit137", error_msg)
        self.assertIn("ContainerStartup", error_msg)
        self.assertIn("Lazy-load", error_msg)


if __name__ == '__main__':
    unittest.main()