# Source code for flytekitplugins.inference.ollama.serve

import base64
import shlex
from dataclasses import dataclass
from typing import Optional

from ..sidecar_template import ModelInferenceTemplate


@dataclass
class Model:
    """Represents the configuration for a model used in a Kubernetes pod template.

    :param name: The name of the model.
    :param mem: The amount of memory allocated for the model, specified as a string.
        Default is "500Mi".
    :param cpu: The number of CPU cores allocated for the model. Default is 1.
    :param modelfile: The actual model file as a JSON-serializable string. This
        represents the file content. Default is ``None`` if not applicable.
    """

    name: str
    mem: str = "500Mi"
    cpu: int = 1
    modelfile: Optional[str] = None
class Ollama(ModelInferenceTemplate):
    """Pod template that prepares an Ollama model in an init container.

    The serving image, port and resource requests are forwarded to the parent
    template. On top of that, this class appends one init container that either
    pulls the named model or, when a modelfile is supplied, creates the model
    from that modelfile.
    """

    def __init__(
        self,
        *,
        model: Model,
        image: str = "ollama/ollama",
        port: int = 11434,
        cpu: int = 1,
        gpu: int = 1,
        mem: str = "15Gi",
        download_inputs_mem: str = "500Mi",
        download_inputs_cpu: int = 2,
    ):
        """Initialize Ollama class for managing a Kubernetes pod template.

        :param model: An instance of the Model class containing the model's
            configuration, including its name, memory, CPU, and file.
        :param image: The Docker image to be used for the container.
            Default is "ollama/ollama".
        :param port: The port number on which the container should expose its
            service. Default is 11434.
        :param cpu: The number of CPU cores requested for the container. Default is 1.
        :param gpu: The number of GPUs requested for the container. Default is 1.
        :param mem: The amount of memory requested for the container, specified
            as a string. Default is "15Gi".
        :param download_inputs_mem: The amount of memory requested for
            downloading inputs, specified as a string. Default is "500Mi".
        :param download_inputs_cpu: The number of CPU cores requested for
            downloading inputs. Default is 2.
        """
        self._model_name = model.name
        self._model_mem = model.mem
        self._model_cpu = model.cpu
        self._model_modelfile = model.modelfile

        super().__init__(
            image=image,
            port=port,
            cpu=cpu,
            gpu=gpu,
            mem=mem,
            download_inputs_mem=download_inputs_mem,
            download_inputs_cpu=download_inputs_cpu,
            # Inputs are only downloaded when the modelfile references them.
            download_inputs=self._modelfile_uses_inputs(),
        )

        self.setup_ollama_pod_template()

    def _modelfile_uses_inputs(self) -> bool:
        """Return True when a modelfile is set and contains an ``{inputs`` placeholder."""
        return bool(self._model_modelfile) and "{inputs" in self._model_modelfile

    def _build_init_script(self) -> str:
        """Return the Python source executed by the init container.

        Every variant first polls ``self.base_url`` (up to 30 attempts, one
        second apart) and exits with status 1 if the service never answers
        with HTTP 200. After that:

        * no modelfile: the model is pulled by name;
        * modelfile without ``{inputs``: the modelfile is written to
          ``Modelfile`` and the model is created from it;
        * modelfile with ``{inputs``: ``/shared/inputs.json`` is loaded, the
          modelfile is rendered with ``str.format(inputs=...)``, written to
          ``Modelfile`` and the model is created from it.

        The modelfile travels base64-encoded so its content cannot interfere
        with the quoting of the generated script.
        """
        base_code = """
import base64
import time
import ollama
import requests
"""

        ollama_service_ready = f"""
# Wait for Ollama service to be ready
max_retries = 30
retry_interval = 1
for _ in range(max_retries):
    try:
        response = requests.get('{self.base_url}')
        if response.status_code == 200:
            print('Ollama service is ready')
            break
    except requests.RequestException:
        pass
    time.sleep(retry_interval)
else:
    print('Ollama service did not become ready in time')
    exit(1)
"""

        # repr() yields a correctly escaped Python string literal, so a quote or
        # backslash in the model name cannot break the generated script.
        model_literal = repr(str(self._model_name))

        if not self._model_modelfile:
            return f"""
{base_code}

{ollama_service_ready}

# Debugging: Shows the status of model pull.
for chunk in ollama.pull({model_literal}, stream=True):
    print(chunk)
"""

        encoded_modelfile = base64.b64encode(self._model_modelfile.encode("utf-8")).decode("utf-8")

        if self._modelfile_uses_inputs():
            # NOTE(review): after formatting, the generated script doubles every
            # brace in the rendered modelfile; presumably required by the
            # downstream consumer of the Modelfile -- confirm.
            return f"""
{base_code}
import json

with open('/shared/inputs.json', 'r') as f:
    inputs = json.load(f)

class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self

inputs = {{'inputs': AttrDict(inputs)}}

encoded_model_file = '{encoded_modelfile}'

modelfile = base64.b64decode(encoded_model_file).decode('utf-8').format(**inputs)
modelfile = modelfile.replace('{{', '{{{{').replace('}}', '}}}}')

with open('Modelfile', 'w') as f:
    f.write(modelfile)

{ollama_service_ready}

# Debugging: Shows the status of model creation.
for chunk in ollama.create(model={model_literal}, path='Modelfile', stream=True):
    print(chunk)
"""

        return f"""
{base_code}

encoded_model_file = '{encoded_modelfile}'

modelfile = base64.b64decode(encoded_model_file).decode('utf-8')

with open('Modelfile', 'w') as f:
    f.write(modelfile)

{ollama_service_ready}

# Debugging: Shows the status of model creation.
for chunk in ollama.create(model={model_literal}, path='Modelfile', stream=True):
    print(chunk)
"""

    def setup_ollama_pod_template(self) -> None:
        """Append the model-preparation init container to the pod template.

        The container is named ``create-model`` when a modelfile is configured
        and ``pull-model`` otherwise. It runs on ``python:3.11-slim`` as user 0,
        installs ``requests`` and ``ollama`` with pip, then executes the script
        from :meth:`_build_init_script`. CPU and memory requests and limits are
        both set to the model's ``cpu`` / ``mem`` values. The ``shared-data``
        volume is mounted at ``/shared`` and the ``tmp`` volume at ``/tmp``.
        """
        # Imported lazily so the kubernetes client is only needed when a pod
        # template is actually built.
        from kubernetes.client.models import (
            V1Container,
            V1ResourceRequirements,
            V1SecurityContext,
            V1VolumeMount,
        )

        container_name = "create-model" if self._model_modelfile else "pull-model"
        python_code = self._build_init_script()

        # shlex.quote keeps the whole script a single shell word regardless of
        # which characters the interpolated values contain.
        command = f"python3 -c {shlex.quote(python_code)}"

        model_resources = {
            "cpu": self._model_cpu,
            "memory": self._model_mem,
        }

        self.pod_template.pod_spec.init_containers.append(
            V1Container(
                name=container_name,
                image="python:3.11-slim",
                command=["/bin/sh", "-c"],
                args=[f"pip install requests && pip install ollama && {command}"],
                resources=V1ResourceRequirements(
                    requests=dict(model_resources),
                    limits=dict(model_resources),
                ),
                security_context=V1SecurityContext(
                    run_as_user=0,
                ),
                volume_mounts=[
                    V1VolumeMount(name="shared-data", mount_path="/shared"),
                    V1VolumeMount(name="tmp", mount_path="/tmp"),
                ],
            )
        )