Tags: Text Generation · Transformers · Safetensors · llama · research · code · mathematics · reasoning · multilingual · long-context · custom_code · text-generation-inference
Instructions for using DeepXR/Helion-V2.5-Rnd with libraries, inference providers, notebooks, and local apps. Follow the links below to get started.
- Libraries
- Transformers
How to use DeepXR/Helion-V2.5-Rnd with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="DeepXR/Helion-V2.5-Rnd", trust_remote_code=True)
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("DeepXR/Helion-V2.5-Rnd", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("DeepXR/Helion-V2.5-Rnd", trust_remote_code=True)
```
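As a quick check, the pipeline can be called directly; the sampling parameters below are illustrative, not values recommended by this card:

```python
# Generate a short continuation; max_new_tokens and temperature are example values.
out = pipe("Once upon a time,", max_new_tokens=128, do_sample=True, temperature=0.7)
print(out[0]["generated_text"])
```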
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use DeepXR/Helion-V2.5-Rnd with vLLM:
Install from pip and serve the model
```shell
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "DeepXR/Helion-V2.5-Rnd"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "DeepXR/Helion-V2.5-Rnd",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```

Use Docker
```shell
docker model run hf.co/DeepXR/Helion-V2.5-Rnd
```
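Because the server exposes an OpenAI-compatible API, the official `openai` Python client can call it as well. A minimal sketch, assuming a default local deployment (the base URL and placeholder API key are assumptions, not values from this card):

```python
from openai import OpenAI

# Point the client at the local vLLM server; the API key is unused locally.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.completions.create(
    model="DeepXR/Helion-V2.5-Rnd",
    prompt="Once upon a time,",
    max_tokens=512,
    temperature=0.5,
)
print(completion.choices[0].text)
```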
- SGLang
How to use DeepXR/Helion-V2.5-Rnd with SGLang:
Install from pip and serve the model
```shell
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "DeepXR/Helion-V2.5-Rnd" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "DeepXR/Helion-V2.5-Rnd",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
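The running server can also be driven from SGLang's Python frontend language instead of raw curl. A minimal sketch, assuming the server started above is reachable on localhost:30000 (the function and variable names are illustrative):

```python
import sglang as sgl

@sgl.function
def continue_story(s, prompt):
    # Append the prompt, then sample a continuation from the model.
    s += prompt
    s += sgl.gen("continuation", max_tokens=512, temperature=0.5)

# Attach to the already-running SGLang server.
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

state = continue_story.run(prompt="Once upon a time,")
print(state["continuation"])
```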
Use Docker images

```shell
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "DeepXR/Helion-V2.5-Rnd" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "DeepXR/Helion-V2.5-Rnd",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```

- Docker Model Runner
How to use DeepXR/Helion-V2.5-Rnd with Docker Model Runner:
```shell
docker model run hf.co/DeepXR/Helion-V2.5-Rnd
```
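Below is a sample monitoring configuration for a Helion deployment, covering system, GPU, model, and inference metrics, alert rules, logging, Prometheus and Grafana integration, health checks, tracing, and profiling: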
```json
{
  "monitoring": {
    "enabled": true,
    "interval_seconds": 15,
    "retention_days": 30
  },
  "metrics": {
    "system": {
      "enabled": true,
      "collect": [
        "cpu_usage",
        "memory_usage",
        "disk_usage",
        "network_io"
      ]
    },
    "gpu": {
      "enabled": true,
      "collect": [
        "gpu_utilization",
        "gpu_memory_used",
        "gpu_memory_total",
        "gpu_temperature",
        "gpu_power_usage"
      ],
      "alert_thresholds": {
        "temperature_celsius": 85,
        "memory_utilization_percent": 95,
        "power_watts": 400
      }
    },
    "model": {
      "enabled": true,
      "collect": [
        "requests_per_second",
        "tokens_per_second",
        "average_latency_ms",
        "p50_latency_ms",
        "p95_latency_ms",
        "p99_latency_ms",
        "error_rate",
        "active_connections",
        "queue_depth"
      ]
    },
    "inference": {
      "enabled": true,
      "collect": [
        "prompt_tokens",
        "completion_tokens",
        "total_tokens",
        "generation_time_ms",
        "preprocessing_time_ms",
        "postprocessing_time_ms"
      ]
    }
  },
  "alerts": {
    "enabled": true,
    "channels": [
      "email",
      "slack",
      "pagerduty"
    ],
    "rules": [
      {
        "name": "high_error_rate",
        "condition": "error_rate > 0.05",
        "duration_seconds": 300,
        "severity": "critical",
        "message": "Error rate exceeded 5% for 5 minutes"
      },
      {
        "name": "high_latency",
        "condition": "p95_latency_ms > 5000",
        "duration_seconds": 180,
        "severity": "warning",
        "message": "P95 latency exceeded 5 seconds"
      },
      {
        "name": "gpu_temperature_high",
        "condition": "gpu_temperature > 85",
        "duration_seconds": 60,
        "severity": "critical",
        "message": "GPU temperature critically high"
      },
      {
        "name": "memory_pressure",
        "condition": "gpu_memory_used / gpu_memory_total > 0.95",
        "duration_seconds": 300,
        "severity": "warning",
        "message": "GPU memory utilization above 95%"
      },
      {
        "name": "low_throughput",
        "condition": "tokens_per_second < 10",
        "duration_seconds": 600,
        "severity": "warning",
        "message": "Throughput below 10 tokens/second"
      }
    ]
  },
  "logging": {
    "level": "INFO",
    "format": "json",
    "outputs": [
      {
        "type": "file",
        "path": "./logs/monitoring.log",
        "rotation": "daily",
        "retention_days": 30
      },
      {
        "type": "stdout",
        "enabled": true
      },
      {
        "type": "elasticsearch",
        "enabled": false,
        "host": "localhost:9200",
        "index": "helion-metrics"
      }
    ]
  },
  "prometheus": {
    "enabled": true,
    "port": 8001,
    "path": "/metrics",
    "namespace": "helion",
    "subsystem": "inference",
    "labels": {
      "model": "Helion-2.5-Rnd",
      "version": "2.5.0-rnd",
      "environment": "production"
    }
  },
  "grafana": {
    "enabled": true,
    "dashboards": [
      {
        "name": "Helion Overview",
        "file": "./monitoring/dashboards/overview.json",
        "refresh": "30s"
      },
      {
        "name": "GPU Metrics",
        "file": "./monitoring/dashboards/gpu.json",
        "refresh": "15s"
      },
      {
        "name": "Inference Performance",
        "file": "./monitoring/dashboards/inference.json",
        "refresh": "30s"
      }
    ]
  },
  "health_checks": {
    "enabled": true,
    "endpoint": "/health",
    "interval_seconds": 30,
    "timeout_seconds": 10,
    "checks": [
      {
        "name": "model_loaded",
        "type": "internal",
        "critical": true
      },
      {
        "name": "gpu_available",
        "type": "internal",
        "critical": true
      },
      {
        "name": "inference_responsive",
        "type": "endpoint",
        "url": "http://localhost:8000/v1/models",
        "critical": false
      }
    ]
  },
  "tracing": {
    "enabled": true,
    "sample_rate": 0.1,
    "exporter": "jaeger",
    "endpoint": "http://localhost:14268/api/traces"
  },
  "profiling": {
    "enabled": false,
    "interval_seconds": 3600,
    "duration_seconds": 300,
    "output_dir": "./profiling"
  }
}
```
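For reference, here is a minimal sketch of how the `prometheus` section above could be wired up with the `prometheus_client` Python library; the metric names are illustrative assumptions, while the namespace, subsystem, port, and path match the config:

```python
from prometheus_client import Counter, Histogram, start_http_server

# Hypothetical example metrics; with namespace "helion" and subsystem
# "inference", the full names become helion_inference_requests_total etc.
REQUESTS = Counter(
    "requests_total",
    "Total inference requests served",
    namespace="helion",
    subsystem="inference",
)
LATENCY_MS = Histogram(
    "latency_ms",
    "End-to-end request latency in milliseconds",
    namespace="helion",
    subsystem="inference",
)

# Record one request, then expose metrics on port 8001; prometheus_client
# serves them at /metrics, matching the "port" and "path" fields above.
REQUESTS.inc()
LATENCY_MS.observe(123.0)
start_http_server(8001)
```

A Prometheus scrape job pointed at port 8001 then picks these metrics up, and the Grafana dashboards listed in the config can be built on top of them.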