Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import time
import warnings
from typing import TYPE_CHECKING, Literal, overload
from typing import TYPE_CHECKING, Any, Literal, overload

from foundry_dev_tools.clients.api_client import APIClient
from foundry_dev_tools.errors.handling import ErrorHandlingConfig
Expand All @@ -13,7 +13,14 @@
FoundrySqlQueryFailedError,
FoundrySqlSerializationFormatNotImplementedError,
)
from foundry_dev_tools.utils.api_types import Ref, SqlDialect, SQLReturnType, assert_in_literal
from foundry_dev_tools.utils.api_types import (
ArrowCompressionCodec,
FurnaceSqlDialect,
Ref,
SqlDialect,
SQLReturnType,
assert_in_literal,
)

if TYPE_CHECKING:
import pandas as pd
Expand Down Expand Up @@ -296,3 +303,346 @@ def api_queries_results(
},
**kwargs,
)


class FoundrySqlServerClientV2(APIClient):
    """FoundrySqlServerClientV2 implements the newer foundry-sql-server API.

    This client uses a different API flow compared to V1:
    - Executes queries via POST to sql-endpoint/v1/queries/query
    - Polls POST to sql-endpoint/v1/queries/status for query completion
    - Retrieves Arrow result streams via POST to sql-endpoint/v1/queries/stream
    """

    # API namespace used by APIClient to build request URLs.
    api_name = "foundry-sql-server"

@overload
def query_foundry_sql(
self,
query: str,
return_type: Literal["pandas"],
branch: Ref = ...,
sql_dialect: FurnaceSqlDialect = ...,
arrow_compression_codec: ArrowCompressionCodec = ...,
timeout: int = ...,
experimental_use_trino: bool = ...,
) -> pd.DataFrame: ...

@overload
def query_foundry_sql(
self,
query: str,
return_type: Literal["polars"],
branch: Ref = ...,
sql_dialect: FurnaceSqlDialect = ...,
arrow_compression_codec: ArrowCompressionCodec = ...,
timeout: int = ...,
Comment on lines +335 to +339
Copy link

Copilot AI Feb 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The overload signature for the "polars" return type is inconsistent with other overloads and the actual implementation. This overload specifies branch and sql_dialect parameters, but the actual implementation (line 365-371) does not accept these parameters. Instead, it accepts application_id and disable_arrow_compression like the other overloads. The parameters should match: application_id: str, disable_arrow_compression: bool = ..., and timeout should be removed if not implemented (see other issue). The V2 API uses a different flow than V1 and does not use branch/sql_dialect parameters.

Suggested change
return_type: Literal["polars"],
branch: Ref = ...,
sql_dialect: SqlDialect = ...,
timeout: int = ...,
application_id: str,
return_type: Literal["polars"],
disable_arrow_compression: bool = ...,

Copilot uses AI. Check for mistakes.
experimental_use_trino: bool = ...,
) -> pl.DataFrame: ...

@overload
def query_foundry_sql(
self,
query: str,
return_type: Literal["spark"],
branch: Ref = ...,
sql_dialect: FurnaceSqlDialect = ...,
arrow_compression_codec: ArrowCompressionCodec = ...,
timeout: int = ...,
experimental_use_trino: bool = ...,
) -> pyspark.sql.DataFrame: ...

@overload
def query_foundry_sql(
self,
query: str,
return_type: Literal["arrow"],
branch: Ref = ...,
sql_dialect: FurnaceSqlDialect = ...,
arrow_compression_codec: ArrowCompressionCodec = ...,
timeout: int = ...,
experimental_use_trino: bool = ...,
) -> pa.Table: ...

@overload
def query_foundry_sql(
self,
query: str,
return_type: Literal["pandas", "polars", "spark", "arrow"] = ...,
branch: Ref = ...,
sql_dialect: FurnaceSqlDialect = ...,
arrow_compression_codec: ArrowCompressionCodec = ...,
timeout: int = ...,
experimental_use_trino: bool = ...,
) -> pd.DataFrame | pl.DataFrame | pa.Table | pyspark.sql.DataFrame: ...

def query_foundry_sql(
self,
query: str,
return_type: Literal["pandas", "polars", "spark", "arrow"] = "pandas",
branch: Ref = "master",
sql_dialect: FurnaceSqlDialect = "SPARK",
arrow_compression_codec: ArrowCompressionCodec = "NONE",
timeout: int = 600,
experimental_use_trino: bool = False,
) -> pd.DataFrame | pl.DataFrame | pa.Table | pyspark.sql.DataFrame:
"""Queries the Foundry SQL server using the V2 API.

Uses Arrow IPC to communicate with the Foundry SQL Server Endpoint.

Example:
df = client.query_foundry_sql(
query="SELECT * FROM `ri.foundry.main.dataset.abc` LIMIT 10"
)

Args:
query: The SQL Query
return_type: The return type (pandas, polars, spark, or arrow). Note: "raw" is not supported in V2.
branch: The dataset branch to query
sql_dialect: The SQL dialect to use (only SPARK is supported for V2)
arrow_compression_codec: Arrow compression codec (NONE, LZ4, ZSTD)
timeout: Query timeout in seconds
experimental_use_trino: If True, modifies the query to use Trino backend by adding /*+ backend(trino) */ hint
Copy link

Copilot AI Feb 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The experimental_use_trino parameter documentation should clarify its limitations: it only works for queries that start with uppercase "SELECT " (with space). It won't work for lowercase "select", queries with different whitespace, or queries starting with WITH clauses. Consider documenting these limitations in the parameter description to set proper expectations for users.

Suggested change
experimental_use_trino: If True, modifies the query to use Trino backend by adding /*+ backend(trino) */ hint
experimental_use_trino: If True, modifies the query to use the Trino backend by adding a
``/*+ backend(trino) */`` hint after the first ``SELECT ``. This is experimental and
only affects queries that contain an uppercase ``SELECT `` (with a trailing space);
it will not modify queries using lowercase ``select``, different whitespace, or
queries starting with ``WITH`` clauses.

Copilot uses AI. Check for mistakes.

Returns:
:external+pandas:py:class:`~pandas.DataFrame` | :external+polars:py:class:`~polars.DataFrame` | :external+pyarrow:py:class:`~pyarrow.Table` | :external+spark:py:class:`~pyspark.sql.DataFrame`:

A pandas DataFrame, polars, Spark DataFrame or pyarrow.Table with the result.

Raises:
FoundrySqlQueryFailedError: If the query fails
FoundrySqlQueryClientTimedOutError: If the query times out
Copy link

Copilot AI Feb 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The docstring claims this method raises FoundrySqlQueryClientTimedOutError on timeout, but timeout handling is not implemented in the V2 client. This is inconsistent with the actual behavior. Either implement timeout handling or remove this from the Raises section of the docstring.

Suggested change
FoundrySqlQueryClientTimedOutError: If the query times out

Copilot uses AI. Check for mistakes.
TypeError: If an invalid sql_dialect or arrow_compression_codec is provided
ValueError: If an unsupported return_type is provided

""" # noqa: E501
if experimental_use_trino:
# Case-insensitive replacement of first SELECT keyword
import re

query = re.sub(r"\bSELECT\b", "SELECT /*+ backend(trino) */", query, count=1, flags=re.IGNORECASE)

response_json = self.api_query(
Comment on lines +424 to +425
Copy link

Copilot AI Feb 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The query modification for experimental_use_trino uses a simple string replace that only replaces the first occurrence of "SELECT ". This approach will fail for queries that use lowercase "select " or have different whitespace patterns. It will also fail for queries that don't start with SELECT (e.g., "WITH cte AS (...) SELECT ..."). Consider using a more robust approach like a regex that handles case-insensitivity and various whitespace patterns, or insert the hint after detecting the first SELECT keyword more reliably.

Copilot uses AI. Check for mistakes.
query=query,
dialect=sql_dialect,
branch=branch,
arrow_compression_codec=arrow_compression_codec,
Comment on lines +419 to +429
Copy link

Copilot AI Feb 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The arrow_compression_codec parameter is not validated with assert_in_literal like sql_dialect is. Add validation to ensure only valid codec values (NONE, LZ4, ZSTD) are passed to the API. This would provide better error messages if an invalid codec is specified.

Copilot uses AI. Check for mistakes.
timeout=timeout,
).json()

query_handle = self._extract_query_handle(response_json)
start_time = time.time()

while response_json.get("status", {}).get("type") != "ready":
time.sleep(0.2)
response = self.api_status(query_handle)
response_json = response.json()

if response_json.get("status", {}).get("type") == "failed":
raise FoundrySqlQueryFailedError(response, query=query, branch=branch, dialect=sql_dialect)
if time.time() > start_time + timeout:
raise FoundrySqlQueryClientTimedOutError(response, timeout=timeout)

ticket = self._extract_ticket(response_json)

arrow_stream_reader = self.read_stream_results_arrow(ticket)

if return_type == "pandas":
return arrow_stream_reader.read_pandas()

if return_type == "polars":
from foundry_dev_tools._optional.polars import pl

arrow_table = arrow_stream_reader.read_all()
return pl.from_arrow(arrow_table)

if return_type == "spark":
from foundry_dev_tools.utils.converter.foundry_spark import (
arrow_stream_to_spark_dataframe,
)

return arrow_stream_to_spark_dataframe(arrow_stream_reader)

if return_type == "arrow":
return arrow_stream_reader.read_all()

msg = (
f"Unsupported return_type: {return_type}. "
f"V2 API supports: pandas, polars, spark, arrow (raw is not supported)"
)
raise ValueError(msg)

def _extract_query_handle(self, response_json: dict[str, Any]) -> dict[str, Any]:
Copy link

Copilot AI Feb 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Args section in the docstring for _extract_query_handle has an extra blank line after the response_json parameter description and before the Returns section. This is inconsistent with other docstrings in the codebase. Remove the extra blank line at line 477.

Copilot uses AI. Check for mistakes.
"""Extract query handle from execute response.

Args:
response_json: Response JSON from execute API


Returns:
Query handle dict

Raises:
KeyError: If the response JSON doesn't contain the expected structure

"""
response_type = response_json.get("type")
if not response_type:
msg = f"Response JSON missing 'type' field. Response: {response_json}"
raise KeyError(msg)

type_data = response_json.get(response_type)
if not type_data:
msg = f"Response JSON missing '{response_type}' field. Response: {response_json}"
raise KeyError(msg)

query_handle = type_data.get("queryHandle")
if not query_handle:
msg = f"Response JSON missing 'queryHandle' in '{response_type}'. Response: {response_json}"
raise KeyError(msg)

return query_handle

def _extract_ticket(self, response_json: dict[str, Any]) -> dict[str, Any]:
"""Extract tickets from success response.

Comment on lines +506 to +508
Copy link

Copilot AI Feb 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The docstring for _extract_ticket says "Returns: List of tickets for fetching results" but the method actually returns a dictionary with keys "id", "tickets", and "type", not a list. The return type annotation correctly shows dict[str, Any], but the docstring is misleading. Update the docstring to accurately describe the returned dictionary structure.

Copilot uses AI. Check for mistakes.
Args:
response_json: Success response JSON from status API

Returns:
Ticket dict with id, tickets list, and type. Example: {"id": 0, "tickets": [...], "type": "furnace"}

Raises:
KeyError: If the response JSON doesn't contain the expected structure

"""
try:
status = response_json["status"]
ready = status["ready"]
ticket_groups = ready["tickets"]
except KeyError as exc:
msg = (
f"Response JSON missing expected structure. "
f"Expected path: status.ready.tickets. Response: {response_json}"
)
raise KeyError(msg) from exc

# we combine all tickets into one to get the full data
# if performance is a concern this should be done in parallel
return {
Comment on lines +512 to +532
Copy link

Copilot AI Feb 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The ticket extraction uses a nested list comprehension that flattens tickets from multiple ticket groups. However, there's no error handling if the response structure is unexpected (e.g., missing "status", "ready", or "tickets" keys). Consider adding error handling to provide a more informative error message if the response structure is not as expected, similar to how _extract_query_handle uses KeyError-prone dictionary access.

Copilot uses AI. Check for mistakes.
"id": 0,
"tickets": [ticket for ticket_group in ticket_groups for ticket in ticket_group["tickets"]],
"type": "furnace",
}

def read_stream_results_arrow(self, ticket: dict[str, Any]) -> pa.ipc.RecordBatchStreamReader:
"""Fetch query results using tickets and return Arrow stream reader.

Args:
ticket: dict of tickets e.g. { "id": 0, "tickets": ["ey...", ...], "type": "furnace", }

Returns:
Arrow RecordBatchStreamReader

"""
from foundry_dev_tools._optional.pyarrow import pa

response = self.api_stream_ticket(ticket)
response.raw.decode_content = True

return pa.ipc.RecordBatchStreamReader(response.raw)

def api_query(
self,
query: str,
dialect: FurnaceSqlDialect,
branch: Ref,
arrow_compression_codec: ArrowCompressionCodec = "NONE",
timeout: int = 600,
**kwargs,
) -> requests.Response:
"""Execute a SQL query via the V2 API.

Args:
query: The SQL query string
dialect: The SQL dialect to use (only SPARK is supported)
branch: The dataset branch to query
arrow_compression_codec: Arrow compression codec (NONE, LZ4, ZSTD)
timeout: Query timeout in seconds (used for error context)
**kwargs: gets passed to :py:meth:`APIClient.api_request`

Returns:
Response with query handle and initial status

"""
assert_in_literal(dialect, FurnaceSqlDialect, "dialect")
assert_in_literal(arrow_compression_codec, ArrowCompressionCodec, "arrow_compression_codec")

return self.api_request(
"POST",
"sql-endpoint/v1/queries/query",
json={
"querySpec": {
"query": query,
"tableProviders": {},
"dialect": dialect,
"options": {"options": [{"option": "arrowCompressionCodec", "value": arrow_compression_codec}]},
},
"executionParams": {
"defaultBranchIds": [{"type": "datasetBranch", "datasetBranch": branch}],
"resultFormat": "ARROW",
"resultMode": "AUTO",
},
},
error_handling=ErrorHandlingConfig(branch=branch, dialect=dialect, timeout=timeout),
**kwargs,
)

def api_status(
self,
query_handle: dict[str, Any],
**kwargs,
) -> requests.Response:
"""Get the status of a SQL query via the V2 API.

Args:
query_handle: Query handle dict from execute response
**kwargs: gets passed to :py:meth:`APIClient.api_request`

Returns:
Response with query status

"""
return self.api_request(
"POST",
"sql-endpoint/v1/queries/status",
json=query_handle,
**kwargs,
)

def api_stream_ticket(
self,
ticket: dict,
**kwargs,
) -> requests.Response:
"""Stream query results using a ticket via the V2 API.

Args:
ticket: Ticket dict containing id, tickets list, and type.
Example: {"id": 0, "tickets": ["eyJhbGc...", "eyJhbGc..."], "type": "furnace"}
**kwargs: gets passed to :py:meth:`APIClient.api_request`

Returns:
Response with streaming Arrow data

"""
return self.api_request(
"POST",
"sql-endpoint/v1/queries/stream",
json=ticket,
headers={
"Accept": "application/octet-stream",
},
stream=True,
**kwargs,
)
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,11 @@ def foundry_sql_server(self) -> foundry_sql_server.FoundrySqlServerClient:
"""Returns :py:class:`foundry_dev_tools.clients.foundry_sql_server.FoundrySqlServerClient`."""
return foundry_sql_server.FoundrySqlServerClient(self)

@cached_property
def foundry_sql_server_v2(self) -> foundry_sql_server.FoundrySqlServerClientV2:
"""Returns :py:class:`foundry_dev_tools.clients.foundry_sql_server.FoundrySqlServerClientV2`."""
Comment on lines 152 to +157
Copy link

Copilot AI Feb 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The V2 client is not documented yet. Consider adding documentation that explains when to use foundry_sql_server_v2 vs foundry_sql_server, what are the differences between them, and what are the benefits of the V2 API (such as support for Arrow compression codecs and the experimental Trino backend).

Suggested change
"""Returns :py:class:`foundry_dev_tools.clients.foundry_sql_server.FoundrySqlServerClient`."""
return foundry_sql_server.FoundrySqlServerClient(self)
@cached_property
def foundry_sql_server_v2(self) -> foundry_sql_server.FoundrySqlServerClientV2:
"""Returns :py:class:`foundry_dev_tools.clients.foundry_sql_server.FoundrySqlServerClientV2`."""
"""Returns :py:class:`foundry_dev_tools.clients.foundry_sql_server.FoundrySqlServerClient`.
This client exposes the original / legacy Foundry SQL Server API and is kept for
backwards compatibility. Prefer :py:meth:`foundry_sql_server_v2` for new
development unless you explicitly depend on legacy behaviour.
"""
return foundry_sql_server.FoundrySqlServerClient(self)
@cached_property
def foundry_sql_server_v2(self) -> foundry_sql_server.FoundrySqlServerClientV2:
"""Returns :py:class:`foundry_dev_tools.clients.foundry_sql_server.FoundrySqlServerClientV2`.
This client uses the V2 SQL Server API. Use this for new workloads when possible,
especially when you:
* want support for Arrow compression codecs for more efficient data transfer, or
* want to experiment with the Trino-based execution backend (experimental).
In contrast, :py:meth:`foundry_sql_server` targets the legacy API surface and
is primarily intended for existing code that relies on its behaviour. The V2
client may evolve independently and can introduce new capabilities that are not
available via the legacy client.
"""

Copilot uses AI. Check for mistakes.
return foundry_sql_server.FoundrySqlServerClientV2(self)

@cached_property
def build2(self) -> build2.Build2Client:
"""Returns :py:class:`foundry_dev_tools.clients.build2.Build2Client`."""
Expand Down
Loading