# Custom Deployments
The PaiTIENT Secure Model Service offers flexible deployment options to meet your organization's specific needs. This guide covers advanced customization of model deployments.
## Custom Deployment Options

### Deployment Types
The PaiTIENT platform supports several deployment types:
- Standard Deployment: Default deployment with balanced performance and cost
- Performance-Optimized: Configurations for maximum throughput and lowest latency
- Cost-Optimized: Configurations designed to minimize costs
- Custom Deployment: Fully customizable deployments for specific requirements
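As a minimal sketch, a deployment type could be selected at creation time. The `deployment_type` values below are illustrative assumptions derived from the list above (the parameter itself appears later in this guide for hybrid deployments); check the SDK reference for the exact accepted strings:

```python
# Minimal sketch: selecting a deployment type at creation time.
# The deployment_type values are assumptions based on the list above.
from paitient_secure_model import Client

client = Client()

deployment = client.create_deployment(
    model_name="ZimaBlueAI/HuatuoGPT-o1-8B",
    deployment_name="perf-optimized-deployment",
    deployment_type="performance_optimized",  # or "standard", "cost_optimized", "custom"
)
```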
## Resource Customization

### Instance Types
You can select from a range of instance types based on your needs:
| Instance Type | vCPUs | Memory | GPU | Use Case |
|---|---|---|---|---|
| c5.2xlarge | 8 | 16 GB | - | Small models, CPU inference |
| g4dn.xlarge | 4 | 16 GB | 1x T4 | Medium-sized models |
| g4dn.2xlarge | 8 | 32 GB | 1x T4 | Large models, higher throughput |
| g5.xlarge | 4 | 16 GB | 1x A10G | Advanced models, better performance |
| g5.2xlarge | 8 | 32 GB | 1x A10G | Large models, high throughput |
| p4d.24xlarge | 96 | 1152 GB | 8x A100 | Ultra-large models, maximum performance |
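The table above can be encoded as a small helper that suggests an instance type from approximate model size. The size thresholds below are illustrative assumptions for demonstration, not official sizing guidance; benchmark with your own workload before committing:

```python
# Illustrative helper: pick an instance type by approximate model size.
# The thresholds are assumptions, not official sizing guidance.
def suggest_instance_type(model_params_billions: float, gpu: bool = True) -> str:
    if not gpu:
        return "c5.2xlarge"      # CPU inference for small models
    if model_params_billions <= 3:
        return "g4dn.xlarge"     # medium-sized models
    if model_params_billions <= 8:
        return "g5.xlarge"       # advanced models, better performance
    if model_params_billions <= 13:
        return "g5.2xlarge"      # large models, high throughput
    return "p4d.24xlarge"        # ultra-large models, maximum performance

print(suggest_instance_type(8))  # -> "g5.xlarge"
```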
### Custom Resource Allocation
Fine-tune resource allocation for your specific workload. In the examples below, `requests` reserve capacity for scheduling while `limits` cap what the container may consume (standard Kubernetes semantics):
```python
# Python SDK example
from paitient_secure_model import Client

client = Client()

deployment = client.create_deployment(
    model_name="ZimaBlueAI/HuatuoGPT-o1-8B",
    deployment_name="custom-healthcare-model",
    compute_type="gpu",
    instance_type="g4dn.xlarge",
    resources={
        "requests": {
            "cpu": "2",
            "memory": "8Gi"
        },
        "limits": {
            "cpu": "4",
            "memory": "16Gi",
            "nvidia.com/gpu": "1"
        }
    }
)
```

```javascript
// Node.js SDK example
const { PaiTIENTClient } = require('paitient-secure-model');

const client = new PaiTIENTClient();

async function createCustomDeployment() {
  try {
    const deployment = await client.createDeployment({
      modelName: "ZimaBlueAI/HuatuoGPT-o1-8B",
      deploymentName: "custom-healthcare-model",
      computeType: "gpu",
      instanceType: "g4dn.xlarge",
      resources: {
        requests: {
          cpu: "2",
          memory: "8Gi"
        },
        limits: {
          cpu: "4",
          memory: "16Gi",
          "nvidia.com/gpu": "1"
        }
      }
    });
    console.log(`Deployment created: ${deployment.id}`);
  } catch (error) {
    console.error('Deployment failed:', error);
  }
}

createCustomDeployment();
```

## Scaling Options
### Auto-scaling
Configure auto-scaling to adjust capacity automatically based on load:
```python
# Python SDK example
deployment = client.create_deployment(
    model_name="ZimaBlueAI/HuatuoGPT-o1-8B",
    deployment_name="auto-scaling-deployment",
    compute_type="gpu",
    instance_type="g4dn.xlarge",
    min_replicas=1,
    max_replicas=5,
    auto_scaling=True,
    auto_scaling_config={
        "target_cpu_utilization": 70,
        "target_gpu_utilization": 80,
        "scale_down_window": "5m",
        "scale_up_window": "1m"
    }
)
```

### Manual Scaling
For predictable workloads, you can manually set the number of replicas:
```python
# Python SDK example
deployment = client.create_deployment(
    model_name="ZimaBlueAI/HuatuoGPT-o1-8B",
    deployment_name="fixed-capacity-deployment",
    compute_type="gpu",
    instance_type="g4dn.xlarge",
    replicas=3,
    auto_scaling=False
)
```

### Scale to Zero
Enable scale-to-zero for cost savings during idle periods:
```python
# Python SDK example
deployment = client.create_deployment(
    model_name="ZimaBlueAI/HuatuoGPT-o1-8B",
    deployment_name="scale-to-zero-deployment",
    compute_type="gpu",
    instance_type="g4dn.xlarge",
    min_replicas=0,  # Can scale down to zero
    max_replicas=3,
    auto_scaling=True,
    scale_to_zero=True,
    scale_to_zero_delay="30m"  # Scale to zero after 30 minutes of inactivity
)
```
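Note that the first request after a scale-to-zero period incurs a cold start while a replica spins up. A client-side retry with a generous timeout smooths this over; the sketch below uses plain `requests` against a hypothetical endpoint URL, and the HTTP shape is an assumption rather than a documented contract:

```python
# Cold-start-tolerant request loop. The endpoint URL and JSON payload
# shape are illustrative assumptions; adapt them to your deployment.
import time

import requests

def query_with_cold_start_retry(url: str, payload: dict, attempts: int = 5) -> dict:
    for attempt in range(attempts):
        try:
            # Generous timeout so a replica has time to spin up.
            response = requests.post(url, json=payload, timeout=120)
            response.raise_for_status()
            return response.json()
        except requests.RequestException:
            if attempt == attempts - 1:
                raise
            time.sleep(2 ** attempt)  # Exponential backoff: 1s, 2s, 4s, ...

# Example (hypothetical URL):
# result = query_with_cold_start_retry(endpoint_url, {"prompt": "..."})
```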
## Advanced Deployment Options

### Model Quantization
Reduce model size and improve inference speed with quantization:
```python
# Python SDK example
deployment = client.create_deployment(
    model_name="ZimaBlueAI/HuatuoGPT-o1-8B",
    deployment_name="quantized-model",
    compute_type="gpu",
    instance_type="g4dn.xlarge",
    model_optimization={
        "quantization": {
            "type": "int8",  # Options: "int8", "int4", "none"
            "method": "dynamic"  # Options: "dynamic", "static"
        }
    }
)
```

### Custom Model Formats
Deploy models with different formats and optimizations:
```python
# Python SDK example
deployment = client.create_deployment(
    model_name="ZimaBlueAI/HuatuoGPT-o1-8B",
    deployment_name="optimized-model",
    compute_type="gpu",
    instance_type="g4dn.xlarge",
    model_format="onnx",  # Options: "pytorch", "onnx", "tensorrt"
    model_optimization={
        "onnx_opset": 15,
        "enable_model_fusion": True,
        "enable_tensor_parallel": True,
        "tensor_parallel_degree": 2  # Note: requires at least 2 GPUs; g4dn.xlarge has 1, so pick a multi-GPU instance when enabling this
    }
)
```

### Custom Container Images
Use custom container images for specialized requirements:
```python
# Python SDK example
deployment = client.create_deployment(
    model_name="ZimaBlueAI/HuatuoGPT-o1-8B",
    deployment_name="custom-container-deployment",
    compute_type="gpu",
    instance_type="g4dn.xlarge",
    container_image="your-registry.com/custom-model-server:latest",
    container_config={
        "command": ["/opt/model-server/start.sh"],
        "args": ["--model-path", "/models/model.pt"],
        "env": [
            {"name": "MODEL_MAX_LENGTH", "value": "4096"},
            {"name": "CUDA_VISIBLE_DEVICES", "value": "0"}
        ],
        "ports": [
            {"name": "http", "containerPort": 8080}
        ]
    }
)
```

## Deployment Customization Examples
### High-Performance Clinical Assistant
```python
# Python SDK example for high-performance deployment
deployment = client.create_deployment(
    model_name="ZimaBlueAI/HuatuoGPT-o1-8B",
    deployment_name="high-performance-clinical",
    compute_type="gpu",
    instance_type="g5.2xlarge",  # Using a more powerful GPU
    replicas=3,  # Multiple replicas for high throughput
    model_optimization={
        "quantization": {"type": "int8", "method": "dynamic"},
        "enable_tensor_parallel": True,  # Note: g5.2xlarge has a single GPU; tensor parallelism needs a multi-GPU instance
        "inference_mode": "flash_attention",  # Optimized attention mechanism
        "kv_cache_enabled": True  # Enable KV cache for faster inference
    },
    endpoint_config={
        "max_batch_size": 16,
        "dynamic_batching": True,
        "batch_wait_timeout_ms": 50,
        "max_concurrent_requests": 100
    }
)
```

### Cost-Optimized Deployment
```python
# Python SDK example for cost-optimized deployment
deployment = client.create_deployment(
    model_name="ZimaBlueAI/HuatuoGPT-o1-8B",
    deployment_name="cost-efficient-deployment",
    compute_type="gpu",
    instance_type="g4dn.xlarge",
    min_replicas=0,
    max_replicas=2,
    auto_scaling=True,
    scale_to_zero=True,
    scale_to_zero_delay="15m",
    model_optimization={
        "quantization": {"type": "int4", "method": "dynamic"},  # Aggressive quantization
        "pruning": {"level": "moderate"},  # Model pruning for size reduction
        "enable_model_fusion": True
    },
    endpoint_config={
        "max_batch_size": 32,  # Larger batch size for better efficiency
        "dynamic_batching": True,
        "priority_batch_queue": False
    }
)
```

### High-Security Medical Deployment
```python
# Python SDK example for high-security deployment
from paitient_secure_model import Client
from paitient_secure_model.security import SecuritySettings

client = Client()

deployment = client.create_deployment(
    model_name="ZimaBlueAI/HuatuoGPT-o1-8B",
    deployment_name="high-security-medical",
    compute_type="gpu",
    instance_type="g4dn.xlarge",
    security_settings=SecuritySettings(
        network_isolation=True,
        private_endpoints=True,
        encryption_level="maximum",
        audit_logging=True,
        compliance_mode="hipaa"
    ),
    vpc_config={
        "subnet_ids": ["subnet-abc123", "subnet-def456"],
        "security_group_ids": ["sg-123456"]
    },
    endpoint_config={
        "authentication": {
            "type": "mutual_tls",
            "client_cert_required": True
        },
        "rate_limiting": {
            "requests_per_minute": 1000,
            "burst_size": 50
        }
    }
)
```

## Multi-region Deployments
Deploy models across multiple regions for resilience and latency optimization:
```python
# Python SDK example for multi-region deployment
deployments = []
regions = ["us-east-1", "us-west-2", "eu-west-1"]

for region in regions:
    deployment = client.create_deployment(
        model_name="ZimaBlueAI/HuatuoGPT-o1-8B",
        deployment_name=f"global-clinical-assistant-{region}",
        region=region,
        compute_type="gpu",
        instance_type="g4dn.xlarge",
        tags={"region": region, "deployment-group": "global-clinical-assistant"}
    )
    deployments.append(deployment)

# Create a global endpoint that routes to the nearest regional deployment
global_endpoint = client.create_global_endpoint(
    name="global-clinical-assistant",
    deployment_ids=[d.id for d in deployments],
    routing_strategy="latency"  # Options: "latency", "geolocation", "weighted"
)

print(f"Global endpoint: {global_endpoint.url}")
```
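Once the global endpoint is live, clients call it like any other endpoint and let the routing strategy pick the nearest region. The HTTP shape below (path, payload fields, bearer-token auth) is an assumption for illustration; consult the API reference for the actual request format:

```python
# Hypothetical request against the global endpoint; the /v1/generate
# path, payload fields, and bearer-token auth are assumptions.
import os

import requests

response = requests.post(
    f"{global_endpoint.url}/v1/generate",
    headers={"Authorization": f"Bearer {os.environ['PAITIENT_API_KEY']}"},
    json={"prompt": "Summarize the contraindications for metformin.", "max_tokens": 200},
    timeout=60,
)
response.raise_for_status()
print(response.json())
```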
## Hybrid Deployments

For specific security or compliance requirements, you can deploy across cloud and on-premises environments:
```python
# Python SDK example for hybrid deployment
cloud_deployment = client.create_deployment(
    model_name="ZimaBlueAI/HuatuoGPT-o1-8B",
    deployment_name="cloud-clinical-assistant",
    deployment_type="cloud",
    compute_type="gpu",
    instance_type="g4dn.xlarge"
)

onprem_deployment = client.create_deployment(
    model_name="ZimaBlueAI/HuatuoGPT-o1-8B",
    deployment_name="onprem-clinical-assistant",
    deployment_type="on_premises",
    target_environment={
        "kubernetes_context": "on-prem-k8s",
        "namespace": "paitient-secure-models"
    },
    compute_type="gpu",
    instance_type="nvidia-a10"
)

# Create a hybrid endpoint that routes sensitive traffic to on-premises
hybrid_endpoint = client.create_hybrid_endpoint(
    name="hybrid-clinical-assistant",
    deployments=[
        {"id": cloud_deployment.id, "traffic_percentage": 80},
        {"id": onprem_deployment.id, "traffic_percentage": 20}
    ],
    routing_rules=[
        {
            "condition": "contains(request.prompt, 'PHI') OR contains(request.prompt, 'PII')",
            "target_deployment_id": onprem_deployment.id
        }
    ]
)
```

## Custom Inference Parameters
Fine-tune inference parameters for your specific needs:
```python
# Python SDK example for custom inference parameters
deployment = client.create_deployment(
    model_name="ZimaBlueAI/HuatuoGPT-o1-8B",
    deployment_name="custom-inference-deployment",
    compute_type="gpu",
    instance_type="g4dn.xlarge",
    inference_config={
        "default_max_tokens": 500,
        "default_temperature": 0.7,
        "default_top_p": 0.95,
        "default_top_k": 50,
        "default_repetition_penalty": 1.1,
        "max_allowed_max_tokens": 2048,
        "max_input_length": 4096,
        "custom_stopping_criteria": ["\n\n", "END OF RESPONSE"]
    }
)
```
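Deployment-level defaults apply when a request omits a parameter, and individual requests can still override them up to the configured maximums. The sketch below assumes a `client.generate` helper on the SDK, which is an illustrative guess rather than a documented method:

```python
# Hypothetical per-request override; client.generate and its keyword
# arguments are assumptions for illustration.
response = client.generate(
    deployment_name="custom-inference-deployment",
    prompt="List common drug interactions with warfarin.",
    max_tokens=1024,   # overrides default_max_tokens=500, within max_allowed_max_tokens
    temperature=0.2,   # overrides default_temperature=0.7 for more deterministic output
)
print(response.text)
```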
## Deployment Lifecycle Management

### Blue-Green Deployments
Implement blue-green deployments for zero-downtime updates:
```python
# Python SDK example for blue-green deployment
# Deploy the "blue" version
blue_deployment = client.create_deployment(
    model_name="ZimaBlueAI/HuatuoGPT-o1-8B",
    deployment_name="clinical-assistant-blue",
    compute_type="gpu",
    instance_type="g4dn.xlarge",
    tags={"environment": "production", "color": "blue"}
)

# Create a production endpoint pointing to the blue deployment
production_endpoint = client.create_endpoint(
    name="clinical-assistant-production",
    deployment_id=blue_deployment.id
)

# Later, deploy the "green" version
green_deployment = client.create_deployment(
    model_name="ZimaBlueAI/HuatuoGPT-o1-8B-v2",  # Updated model version
    deployment_name="clinical-assistant-green",
    compute_type="gpu",
    instance_type="g4dn.xlarge",
    tags={"environment": "production", "color": "green"}
)

# Update the production endpoint to point to the green deployment
client.update_endpoint(
    endpoint_id=production_endpoint.id,
    deployment_id=green_deployment.id
)

# If needed, roll back to the blue deployment
client.update_endpoint(
    endpoint_id=production_endpoint.id,
    deployment_id=blue_deployment.id
)
```

### Canary Deployments
Gradually shift traffic to a new deployment version:
```python
# Python SDK example for canary deployment
import time

# Deploy the stable version
stable_deployment = client.create_deployment(
    model_name="ZimaBlueAI/HuatuoGPT-o1-8B",
    deployment_name="clinical-assistant-stable",
    compute_type="gpu",
    instance_type="g4dn.xlarge"
)

# Deploy the canary version
canary_deployment = client.create_deployment(
    model_name="ZimaBlueAI/HuatuoGPT-o1-8B-v2",  # Updated model version
    deployment_name="clinical-assistant-canary",
    compute_type="gpu",
    instance_type="g4dn.xlarge"
)

# Create a production endpoint with traffic splitting
canary_endpoint = client.create_canary_endpoint(
    name="clinical-assistant-canary",
    deployments=[
        {"id": stable_deployment.id, "traffic_percentage": 90},
        {"id": canary_deployment.id, "traffic_percentage": 10}
    ]
)

# Gradually increase traffic to the canary deployment
for percentage in [25, 50, 75, 100]:
    client.update_canary_endpoint(
        endpoint_id=canary_endpoint.id,
        deployments=[
            {"id": stable_deployment.id, "traffic_percentage": 100 - percentage},
            {"id": canary_deployment.id, "traffic_percentage": percentage}
        ]
    )
    # Wait and evaluate metrics before increasing traffic further
    time.sleep(3600)  # 1 hour for evaluation
```
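In practice you would gate each traffic increase on observed health rather than a fixed sleep. The sketch below assumes a `client.get_deployment_metrics` helper returning an error rate; both the method and its return shape are illustrative assumptions, not documented SDK behavior:

```python
# Hypothetical promotion gate: roll back if the canary's error rate
# exceeds the stable baseline. get_deployment_metrics is an assumed
# helper, not a documented SDK method.
def canary_is_healthy(client, canary_id: str, stable_id: str, tolerance: float = 0.01) -> bool:
    canary_errors = client.get_deployment_metrics(canary_id)["error_rate"]
    stable_errors = client.get_deployment_metrics(stable_id)["error_rate"]
    return canary_errors <= stable_errors + tolerance

if not canary_is_healthy(client, canary_deployment.id, stable_deployment.id):
    # Roll all traffic back to the stable deployment
    client.update_canary_endpoint(
        endpoint_id=canary_endpoint.id,
        deployments=[
            {"id": stable_deployment.id, "traffic_percentage": 100},
            {"id": canary_deployment.id, "traffic_percentage": 0}
        ]
    )
```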
## Next Steps

- Learn about Secure Deployment
- Explore Fine-tuning
- Understand Model Evaluation
- Review our Python SDK and Node.js SDK