import datetime
import json
from dataclasses import asdict, dataclass
from typing import Dict, Optional
from flyteidl.admin.agent_pb2 import (
CreateTaskResponse,
DeleteTaskResponse,
GetTaskResponse,
Resource,
)
from flyteidl.core.execution_pb2 import TaskExecution
from google.cloud import bigquery
from flytekit import FlyteContextManager, StructuredDataset, logger
from flytekit.core.type_engine import TypeEngine
from flytekit.extend.backend.base_agent import AgentBase, AgentRegistry, convert_to_flyte_phase
from flytekit.models import literals
from flytekit.models.core.execution import TaskLog
from flytekit.models.literals import LiteralMap
from flytekit.models.task import TaskTemplate
from flytekit.models.types import LiteralType, StructuredDatasetType
pythonTypeToBigQueryType: Dict[type, str] = {
# https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#data_type_sizes
list: "ARRAY",
bool: "BOOL",
bytes: "BYTES",
datetime.datetime: "DATETIME",
float: "FLOAT64",
int: "INT64",
str: "STRING",
}
@dataclass
class Metadata:
job_id: str
project: str
location: str
[docs]class BigQueryAgent(AgentBase):
name = "Bigquery Agent"
def __init__(self):
super().__init__(task_type="bigquery_query_job_task")
[docs] def create(
self,
output_prefix: str,
task_template: TaskTemplate,
inputs: Optional[LiteralMap] = None,
**kwargs,
) -> CreateTaskResponse:
job_config = None
if inputs:
ctx = FlyteContextManager.current_context()
python_interface_inputs = {
name: TypeEngine.guess_python_type(lt.type) for name, lt in task_template.interface.inputs.items()
}
native_inputs = TypeEngine.literal_map_to_kwargs(ctx, inputs, python_interface_inputs)
logger.info(f"Create BigQuery job config with inputs: {native_inputs}")
job_config = bigquery.QueryJobConfig(
query_parameters=[
bigquery.ScalarQueryParameter(name, pythonTypeToBigQueryType[python_interface_inputs[name]], val)
for name, val in native_inputs.items()
]
)
custom = task_template.custom
project = custom["ProjectID"]
location = custom["Location"]
client = bigquery.Client(project=project, location=location)
query_job = client.query(task_template.sql.statement, job_config=job_config)
metadata = Metadata(job_id=str(query_job.job_id), location=location, project=project)
return CreateTaskResponse(resource_meta=json.dumps(asdict(metadata)).encode("utf-8"))
[docs] def get(self, resource_meta: bytes, **kwargs) -> GetTaskResponse:
client = bigquery.Client()
metadata = Metadata(**json.loads(resource_meta.decode("utf-8")))
log_links = [
TaskLog(
uri=f"https://console.cloud.google.com/bigquery?project={metadata.project}&j=bq:{metadata.location}:{metadata.job_id}&page=queryresults",
name="BigQuery Console",
).to_flyte_idl()
]
job = client.get_job(metadata.job_id, metadata.project, metadata.location)
if job.errors:
logger.error("failed to run BigQuery job with error:", job.errors.__str__())
return GetTaskResponse(
resource=Resource(state=TaskExecution.FAILED, message=job.errors.__str__()), log_links=log_links
)
cur_phase = convert_to_flyte_phase(str(job.state))
res = None
if cur_phase == TaskExecution.SUCCEEDED:
ctx = FlyteContextManager.current_context()
if job.destination:
output_location = (
f"bq://{job.destination.project}:{job.destination.dataset_id}.{job.destination.table_id}"
)
res = literals.LiteralMap(
{
"results": TypeEngine.to_literal(
ctx,
StructuredDataset(uri=output_location),
StructuredDataset,
LiteralType(structured_dataset_type=StructuredDatasetType(format="")),
)
}
).to_flyte_idl()
return GetTaskResponse(resource=Resource(phase=cur_phase, outputs=res), log_links=log_links)
[docs] def delete(self, resource_meta: bytes, **kwargs) -> DeleteTaskResponse:
client = bigquery.Client()
metadata = Metadata(**json.loads(resource_meta.decode("utf-8")))
client.cancel_job(metadata.job_id, metadata.project, metadata.location)
return DeleteTaskResponse()
AgentRegistry.register(BigQueryAgent())