All settings
environment_variables: {}
model_list:
- model_name: string
litellm_params: {}
model_info:
id: string
mode: embedding
input_cost_per_token: 0
output_cost_per_token: 0
max_tokens: 2048
base_model: gpt-4-1106-preview
additionalProp1: {}
litellm_settings:
# Logging/Callback settings
success_callback: ["langfuse"] # list of success callbacks
failure_callback: ["sentry"] # list of failure callbacks
callbacks: ["otel"] # list of callbacks - runs on success and failure
service_callbacks: ["datadog", "prometheus"] # logs redis, postgres failures on datadog, prometheus
turn_off_message_logging: boolean # prevent messages and responses from being logged to your callbacks, but request metadata will still be logged. Useful for privacy/compliance when handling sensitive data.
redact_user_api_key_info: boolean # Redact information about the user api key (hashed token, user_id, team id, etc.), from logs. Currently supported for Langfuse, OpenTelemetry, Logfire, ArizeAI logging.
langfuse_default_tags: ["cache_hit", "cache_key", "proxy_base_url", "user_api_key_alias", "user_api_key_user_id", "user_api_key_user_email", "user_api_key_team_alias", "semantic-similarity"] # default tags for Langfuse Logging
# Networking settings
request_timeout: 10 # (int) LLM request timeout in seconds. Raises a Timeout error if a call takes longer than 10s. Sets litellm.request_timeout
force_ipv4: boolean # If true, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6 + Anthropic API
# Cost tracking settings
cost_discount_config:
vertex_ai: 0.05 # Apply a 5% discount to Vertex AI costs
gemini: 0.05 # Apply a 5% discount to Gemini costs
cost_margin_config:
global: 0.05 # Apply a 5% margin to all providers
openai: 0.10 # Apply a 10% margin to OpenAI costs
# Debugging - see debugging docs for more options
# Use `--debug` or `--detailed_debug` CLI flags, or set LITELLM_LOG env var to "INFO", "DEBUG", or "ERROR"
json_logs: boolean # if true, logs will be in json format
# Fallbacks, reliability
default_fallbacks: ["claude-opus"] # set default_fallbacks, in case a specific model group is misconfigured / bad.
content_policy_fallbacks: [{ "gpt-3.5-turbo-small": ["claude-opus"] }] # fallbacks for ContentPolicyErrors
context_window_fallbacks: [{ "gpt-3.5-turbo-small": ["gpt-3.5-turbo-large", "claude-opus"] }] # fallbacks for ContextWindowExceededErrors
# MCP Aliases - Map aliases to MCP server names for easier tool access
mcp_aliases: {
"github": "github_mcp_server",
"zapier": "zapier_mcp_server",
"deepwiki": "deepwiki_mcp_server",
} # Maps friendly aliases to MCP server names. Only the first alias for each server is used
# Caching settings
cache: true
cache_params: # set cache params for redis
type: redis # type of cache to initialize (options: "local", "redis", "s3", "gcs")
# Optional - Redis Settings
host: "localhost" # The host address for the Redis cache. Required if type is "redis".
port: 6379 # The port number for the Redis cache. Required if type is "redis".
password: "your_password" # The password for the Redis cache. Required if type is "redis".
namespace: "litellm.caching.caching" # namespace for redis cache
max_connections: 100 # [OPTIONAL] Set Maximum number of Redis connections. Passed directly to redis-py.
# Optional - Redis Cluster Settings
redis_startup_nodes: [{ "host": "127.0.0.1", "port": "7001" }]
# Optional - Redis Sentinel Settings
service_name: "mymaster"
sentinel_nodes: [["localhost", 26379]]
# Optional - GCP IAM Authentication for Redis
gcp_service_account: "projects/-/serviceAccounts/your-sa@project.iam.gserviceaccount.com" # GCP service account for IAM authentication
gcp_ssl_ca_certs: "./server-ca.pem" # Path to SSL CA certificate file for GCP Memorystore Redis
ssl: true # Enable SSL for secure connections
ssl_cert_reqs: null # Set to null for self-signed certificates
ssl_check_hostname: false # Set to false for self-signed certificates
# Optional - Qdrant Semantic Cache Settings
qdrant_semantic_cache_embedding_model: openai-embedding # the model should be defined on the model_list
qdrant_collection_name: test_collection
qdrant_quantization_config: binary
qdrant_semantic_cache_vector_size: 1536 # vector size must match embedding model dimensionality
similarity_threshold: 0.8 # similarity threshold for semantic cache
# Optional - S3 Cache Settings
s3_bucket_name: cache-bucket-litellm # AWS Bucket Name for S3
s3_region_name: us-west-2 # AWS Region Name for S3
s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # use os.environ/<variable name> to pass environment variables. This is the AWS Access Key ID for S3
s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3
s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 bucket
# Optional - GCS Cache Settings
gcs_bucket_name: cache-bucket-litellm # GCS Bucket Name for caching
gcs_path_service_account: os.environ/GCS_PATH_SERVICE_ACCOUNT # Path to GCS service account JSON file
gcs_path: cache/ # [OPTIONAL] GCS path prefix for cache objects
# Common Cache settings
# Optional - Supported call types for caching
supported_call_types:
["acompletion", "atext_completion", "aembedding", "atranscription"]
# /chat/completions, /completions, /embeddings, /audio/transcriptions
mode: default_off # if default_off, you need to opt in to caching on a per call basis
ttl: 600 # ttl for caching
disable_copilot_system_to_assistant: False # DEPRECATED - GitHub Copilot API supports system prompts.
# Virtual key auth cache — shares API key / virtual-key auth across workers via Redis.
# Reduces DB round trips when caches are cold on new workers or pods.
# Requires litellm_settings.cache: true AND cache_params.type: redis above.
enable_redis_auth_cache: false
callback_settings:
otel:
message_logging: boolean # OTEL logging callback specific settings
general_settings:
completion_model: string
store_prompts_in_spend_logs: boolean
forward_client_headers_to_llm_api: boolean
disable_spend_logs: boolean # turn off writing each transaction to the db
disable_master_key_return: boolean # turn off returning master key on UI (checked on '/user/info' endpoint)
disable_retry_on_max_parallel_request_limit_error: boolean # turn off retries when max parallel request limit is reached
disable_reset_budget: boolean # turn off reset budget scheduled task
disable_adding_master_key_hash_to_db: boolean # turn off storing master key hash in db, for spend tracking
disable_responses_id_security: boolean # turn off response ID security checks that prevent users from accessing other users' responses
enable_jwt_auth: boolean # allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims
enforce_user_param: boolean # requires all openai endpoint requests to have a 'user' param
reject_clientside_metadata_tags: boolean # if true, rejects requests with client-side 'metadata.tags' to prevent users from influencing budgets
allowed_routes: ["route1", "route2"] # list of allowed proxy API routes - a user can access. (currently JWT-Auth only)
key_management_system: google_kms # either google_kms or azure_kms
master_key: string
maximum_spend_logs_retention_period: 30d # The maximum time to retain spend logs before deletion.
maximum_spend_logs_retention_interval: 1d # interval in which the spend log cleanup task should run in.
user_mcp_management_mode: restricted # or "view_all"
# Database Settings
database_url: string
database_connection_pool_limit: 0 # default 10
database_connection_timeout: 0 # default 60s
database_connect_timeout: 0 # Prisma `connect_timeout` URL param (seconds). Unset => Prisma default.
database_socket_timeout: 0 # Prisma `socket_timeout` URL param (seconds). Idle/slow connections beyond this are closed.
database_extra_connection_params: {} # Extra key/value pairs appended to the Prisma DATABASE_URL / DIRECT_URL query string (e.g. sslmode, pgbouncer, statement_cache_size). Overrides LiteLLM defaults.
allow_requests_on_db_unavailable: boolean # if true, will allow requests that can not connect to the DB to verify Virtual Key to still work
custom_auth: string
max_parallel_requests: 0 # the max parallel requests allowed per deployment
global_max_parallel_requests: 0 # the max parallel requests allowed on the proxy all up
infer_model_from_keys: true
background_health_checks: true
health_check_interval: 300
alerting: ["slack", "email"]
alerting_threshold: 0
use_client_credentials_pass_through_routes: boolean # use client credentials for all pass through routes like "/vertex-ai", /bedrock/. When this is True Virtual Key auth will not be applied on these endpoints
router_settings:
routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle" - RECOMMENDED for best performance
redis_host: <your-redis-host> # string
redis_password: <your-redis-password> # string
redis_port: <your-redis-port> # string
enable_pre_call_checks: true # bool - Before call is made check if a call is within model context window
allowed_fails: 3 # cooldown model if it fails > 3 calls in a minute.
cooldown_time: 30 # (in seconds) how long to cooldown model if fails/min > allowed_fails
disable_cooldowns: True # bool - Disable cooldowns for all models
enable_tag_filtering: True # bool - Use tag based routing for requests
tag_filtering_match_any: True # bool - Tag matching behavior (only when enable_tag_filtering=true). `true`: match if deployment has ANY requested tag; `false`: match only if deployment has ALL requested tags
retry_policy: { # Dict[str, int]: retry policy for different types of exceptions
"AuthenticationErrorRetries": 3,
"TimeoutErrorRetries": 3,
"RateLimitErrorRetries": 3,
"ContentPolicyViolationErrorRetries": 4,
"InternalServerErrorRetries": 4
}
allowed_fails_policy: {
"BadRequestErrorAllowedFails": 1000, # Allow 1000 BadRequestErrors before cooling down a deployment
"AuthenticationErrorAllowedFails": 10, # int
"TimeoutErrorAllowedFails": 12, # int
"RateLimitErrorAllowedFails": 10000, # int
"ContentPolicyViolationErrorAllowedFails": 15, # int
"InternalServerErrorAllowedFails": 20, # int
}
content_policy_fallbacks: [{"claude-2": ["my-fallback-model"]}] # List[Dict[str, List[str]]]: Fallback model for content policy violations
fallbacks: [{"claude-2": ["my-fallback-model"]}] # List[Dict[str, List[str]]]: Fallback model for all errors
litellm_settings - Reference​
| Name | Type | Description |
|---|---|---|
| success_callback | array of strings | List of success callbacks. Doc Proxy logging callbacks, Doc Metrics |
| failure_callback | array of strings | List of failure callbacks. Doc Proxy logging callbacks, Doc Metrics |
| callbacks | array of strings | List of callbacks - runs on success and failure. Doc Proxy logging callbacks, Doc Metrics |
| service_callbacks | array of strings | System health monitoring - Logs redis, postgres failures on specified services (e.g. datadog, prometheus) Doc Metrics |
| turn_off_message_logging | boolean | If true, prevents messages and responses from being logged to callbacks, but request metadata will still be logged. Useful for privacy/compliance when handling sensitive data Proxy Logging |
| modify_params | boolean | If true, allows modifying the parameters of the request before it is sent to the LLM provider |
| enable_preview_features | boolean | If true, enables preview features - e.g. Azure O1 Models with streaming support. |
| LITELLM_DISABLE_STOP_SEQUENCE_LIMIT | boolean | Disable validation for stop sequence limit (default: 4) |
| redact_user_api_key_info | boolean | If true, redacts information about the user api key from logs Proxy Logging |
| mcp_aliases | object | Maps friendly aliases to MCP server names for easier tool access. Only the first alias for each server is used. MCP Aliases |
| langfuse_default_tags | array of strings | Default tags for Langfuse Logging. Use this if you want to control which LiteLLM-specific fields are logged as tags by the LiteLLM proxy. By default LiteLLM Proxy logs no LiteLLM-specific fields as tags. Further docs |
| set_verbose | boolean | DEPRECATED - see debugging docs. Use --debug or --detailed_debug CLI flags, or set LITELLM_LOG env var to "INFO", "DEBUG", or "ERROR" instead. |
| json_logs | boolean | If true, logs will be in JSON format. If you need to store the logs as JSON, set litellm.json_logs = True. We currently just log the raw POST request from litellm as JSON. Further docs |
| default_fallbacks | array of strings | List of fallback models to use if a specific model group is misconfigured / bad. Further docs |
| request_timeout | integer | The timeout for requests in seconds. If not set, the default value is 6000 seconds. For reference OpenAI Python SDK defaults to 600 seconds. |
| force_ipv4 | boolean | If true, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6 + Anthropic API |
| content_policy_fallbacks | array of objects | Fallbacks to use when a ContentPolicyViolationError is encountered. Further docs |
| context_window_fallbacks | array of objects | Fallbacks to use when a ContextWindowExceededError is encountered. Further docs |
| cache | boolean | If true, enables caching. Further docs |
| cache_params | object | Parameters for the cache. Further docs |
| enable_redis_auth_cache | boolean | When true, stores virtual-key auth payloads in Redis (same client as response caching) so every worker/pod shares cached auth lookups—fewer repeated database reads on cache misses. Requires cache: true and cache_params.type: redis (Redis or Redis Cluster). Optional: set general_settings.user_api_key_cache_ttl so TTL applies consistently to memory and Redis. Further docs |
| disable_end_user_cost_tracking | boolean | If true, turns off end user cost tracking on prometheus metrics + litellm spend logs table on proxy. |
| enable_end_user_cost_tracking_prometheus_only | boolean | If true, includes the end_user label on Prometheus metrics. Disabled by default to keep Prometheus cardinality bounded. Further docs |
| cost_discount_config | object | Provider-specific percentage discounts applied to cost calculations. Configure under litellm_settings. Further docs |
| cost_margin_config | object | Provider-specific or global percentage/fixed margins applied to cost calculations. Configure under litellm_settings. Further docs |
| key_generation_settings | object | Restricts who can generate keys. Further docs |
| disable_add_transform_inline_image_block | boolean | For Fireworks AI models - if true, turns off the auto-add of #transform=inline to the url of the image_url, if the model is not a vision model. |
| use_chat_completions_url_for_anthropic_messages | boolean | If true, routes OpenAI /v1/messages requests through chat/completions instead of the Responses API. Can also be set via env var LITELLM_USE_CHAT_COMPLETIONS_URL_FOR_ANTHROPIC_MESSAGES=true. |
| route_all_chat_openai_to_responses | boolean | If true, routes all OpenAI /chat/completions requests through the Responses API bridge. Recommended for OpenAI models. Can also be set via env var LITELLM_ROUTE_ALL_CHAT_OPENAI_TO_RESPONSES=true. |
| skip_system_message_in_guardrail | boolean | If true, unified guardrails omit role: system from scanned input on chat completions and Anthropic /v1/messages only; the LLM still receives full messages. Per-guardrail override: litellm_params.skip_system_message_in_guardrail on each guardrail. Guardrails quick start |
| disable_hf_tokenizer_download | boolean | If true, it defaults to using the openai tokenizer for all models (including huggingface models). |
| enable_json_schema_validation | boolean | If true, enables json schema validation for all requests. |
| enable_key_alias_format_validation | boolean | If true, validates key_alias format on /key/generate and /key/update. Must be 2-255 chars, start/end with alphanumeric, only allow a-zA-Z0-9_-/.@. Default false. |
| user_url_validation | boolean | Default true. When true, the proxy validates user-controlled URLs (e.g. OpenAPI spec_path when it is an http(s) URL, image URLs, and similar) before fetching: DNS is resolved and connections to non–globally-routable addresses (RFC1918, loopback, link-local, etc.) are blocked unless the hostname in the URL is listed in user_url_allowed_hosts. Set to false to skip validation (only if you trust who can supply URLs). Must be set under litellm_settings, not general_settings. |
| user_url_allowed_hosts | array of strings | Hostnames allowed to resolve to private/internal IPs when user_url_validation is true. Match the host as it appears in the URL (e.g. api.corp.internal, 127.0.0.1, 127.0.0.1:8080, [::1]:443). For split-horizon DNS, allowlist the public hostname, not the resolved 10.x address. Must be set under litellm_settings, not general_settings. See MCP from OpenAPI. |
| disable_copilot_system_to_assistant | boolean | DEPRECATED - GitHub Copilot API supports system prompts. |
| default_team_params | object | Default parameters applied to every new team created via /team/new (including SSO auto-created teams). Only fills in fields not explicitly set in the request. Sub-fields: max_budget (float), budget_duration (string, e.g. "30d"), tpm_limit (integer), rpm_limit (integer), team_member_permissions (array of strings, e.g. ["/team/daily/activity", "/key/generate"]), models (array of strings — only applied to SSO auto-created teams). |
general_settings - Reference​
| Name | Type | Description |
|---|---|---|
| completion_model | string | The model to use for all completions, overriding any model specified in the request |
| disable_spend_logs | boolean | If true, turns off writing each transaction to the database |
| disable_spend_updates | boolean | If true, turns off all spend updates to the DB. Including key/user/team spend updates. |
| disable_master_key_return | boolean | If true, turns off returning master key on UI. (checked on '/user/info' endpoint) |
| disable_retry_on_max_parallel_request_limit_error | boolean | If true, turns off retries when max parallel request limit is reached |
| disable_reset_budget | boolean | If true, turns off reset budget scheduled task |
| disable_adding_master_key_hash_to_db | boolean | If true, turns off storing master key hash in db |
| disable_responses_id_security | boolean | If true, disables response ID security checks that prevent users from accessing response IDs from other users. When false (default), response IDs are encrypted with user information to ensure users can only access their own responses. Applies to /v1/responses endpoints |
| enable_jwt_auth | boolean | allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims. Doc on JWT Tokens |
| enforce_user_param | boolean | If true, requires all OpenAI endpoint requests to have a 'user' param. Doc on call hooks |
| reject_clientside_metadata_tags | boolean | If true, rejects requests that contain client-side 'metadata.tags' to prevent users from influencing budgets by sending different tags. Tags can only be inherited from the API key metadata. |
| allowed_routes | array of strings | List of allowed proxy API routes a user can access Doc on controlling allowed routes |
| key_management_system | string | Specifies the key management system. Doc Secret Managers |
| master_key | string | The master key for the proxy Set up Virtual Keys |
| database_url | string | The URL for the database connection Set up Virtual Keys |
| database_connection_pool_limit | integer | The limit for database connection pool Setting DB Connection Pool limit |
| database_connection_timeout | integer | The timeout for database connections in seconds Setting DB Connection Pool limit, timeout |
| database_connect_timeout | float | Maps to the Prisma connect_timeout URL param (seconds). Bounds how long the engine waits to establish a new connection before failing. Defaults to Prisma's built-in value when unset. |
| database_socket_timeout | float | Maps to the Prisma socket_timeout URL param (seconds). When set, an idle or slow connection that has not produced data within this window is closed. Use this to cap idle Prisma connections from LiteLLM. |
| database_extra_connection_params | object | Escape hatch — extra key/value pairs appended verbatim to the Prisma DATABASE_URL / DIRECT_URL query string (e.g. sslmode, pgbouncer, statement_cache_size). Keys here override any default LiteLLM sets. |
| allow_requests_on_db_unavailable | boolean | If true, allows requests to succeed even if DB is unreachable. Only use this if running LiteLLM in your VPC. This will allow requests to work even when LiteLLM cannot connect to the DB to verify a Virtual Key. Doc on graceful db unavailability |
| custom_auth | string | Write your own custom authentication logic Doc Custom Auth |
| max_parallel_requests | integer | The max parallel requests allowed per deployment |
| global_max_parallel_requests | integer | The max parallel requests allowed on the proxy overall |
| infer_model_from_keys | boolean | If true, infers the model from the provided keys |
| background_health_checks | boolean | If true, enables background health checks. Doc on health checks |
| health_check_interval | integer | The interval for health checks in seconds Doc on health checks |
| alerting | array of strings | List of alerting methods Doc on Slack Alerting |
| alerting_threshold | integer | The threshold for triggering alerts Doc on Slack Alerting |
| use_client_credentials_pass_through_routes | boolean | If true, uses client credentials for all pass-through routes. Doc on pass through routes |
| health_check_details | boolean | If false, hides health check details (e.g. remaining rate limit). Doc on health checks |
| public_routes | List[str] | (Enterprise Feature) Control list of public routes |
| alert_types | List[str] | Control list of alert types to send to slack. [Doc on alert types](./alerting.md) |
| enforced_params | List[str] | (Enterprise Feature) List of params that must be included in all requests to the proxy |
| enable_oauth2_auth | boolean | (Enterprise Feature) If true, enables oauth2.0 authentication on LLM + info routes |
| use_x_forwarded_for | boolean | If true, uses the X-Forwarded-For header to derive the client IP and (for MCP OAuth) the proxy's public origin from X-Forwarded-Proto / X-Forwarded-Host / X-Forwarded-Port. For MCP OAuth, headers are honored only when mcp_trusted_proxy_ranges is also set and the request peer's IP falls inside one of those CIDRs. For ingressed deployments, prefer PROXY_BASE_URL. See MCP OAuth — Reverse proxy and ingress configuration. |
| service_account_settings | List[Dict[str, Any]] | Set service_account_settings if you want to create settings that only apply to service account keys. [Doc on service accounts](./service_accounts.md) |
| image_generation_model | str | The default model to use for image generation - ignores model set in request |
| store_model_in_db | boolean | If true, enables storing model + credential information in the DB. |
| supported_db_objects | List[str] | Fine-grained control over which object types to load from the database when store_model_in_db is True. Available types: "models", "mcp", "guardrails", "vector_stores", "pass_through_endpoints", "prompts", "model_cost_map". If not set, all object types are loaded (default behavior). Example: supported_db_objects: ["mcp"] to only load MCP servers from DB. |
| user_mcp_management_mode | string | Controls what non-admins can see on the MCP dashboard. restricted (default) only lists MCP servers that the user’s teams are explicitly allowed to access. view_all lets every user see the full MCP server list. Tool list/call always respects per-key permissions, so users still cannot run MCP calls without access. |
| store_prompts_in_spend_logs | boolean | If true, allows prompts and responses to be stored in the spend logs table. |
| max_request_size_mb | int | The maximum size for requests in MB. Requests above this size will be rejected. |
| max_response_size_mb | int | The maximum size for responses in MB. LLM Responses above this size will not be sent. |
| proxy_budget_rescheduler_min_time | int | The minimum time (in seconds) to wait before checking db for budget resets. Default is 597 seconds |
| proxy_budget_rescheduler_max_time | int | The maximum time (in seconds) to wait before checking db for budget resets. Default is 605 seconds |
| proxy_batch_write_at | int | Time (in seconds) to wait before batch writing spend logs to the db. Default is 10 seconds |
| proxy_batch_polling_interval | int | Time (in seconds) to wait before polling a batch, to check if it's completed. Default is 3600 seconds (1 hour) |
| alerting_args | dict | Args for Slack Alerting Doc on Slack Alerting |
| custom_key_generate | str | Custom function for key generation Doc on custom key generation |
| allowed_ips | List[str] | List of IPs allowed to access the proxy. If not set, all IPs are allowed. |
| embedding_model | str | The default model to use for embeddings - ignores model set in request |
| default_team_disabled | boolean | If true, users cannot create 'personal' keys (keys with no team_id). |
| alert_to_webhook_url | Dict[str] | Specify a webhook url for each alert type. |
| key_management_settings | List[Dict[str, Any]] | Settings for key management system (e.g. AWS KMS, Azure Key Vault) Doc on key management |
| allow_user_auth | boolean | (Deprecated) old approach for user authentication. |
| user_api_key_cache_ttl | int | The time (in seconds) to cache user api keys in memory. |
| disable_prisma_schema_update | boolean | If true, turns off automatic schema updates to DB |
| litellm_key_header_name | str | If set, allows passing LiteLLM keys as a custom header. Doc on custom headers |
| moderation_model | str | The default model to use for moderation. |
| custom_sso | str | Path to a python file that implements custom SSO logic. Doc on custom SSO |
| allow_client_side_credentials | boolean | If true, allows passing client side credentials to the proxy. (Useful when testing finetuning models) Doc on client side credentials |
| admin_only_routes | List[str] | (Enterprise Feature) List of routes that are only accessible to admin users. Doc on admin only routes |
| use_azure_key_vault | boolean | If true, load keys from azure key vault |
| use_google_kms | boolean | If true, load keys from google kms |
| spend_report_frequency | str | Specify how often you want a Spend Report to be sent (e.g. "1d", "2d", "30d") More on this |
| ui_access_mode | Literal["admin_only"] | If set, restricts access to the UI to admin users only. Docs |
| litellm_jwtauth | Dict[str, Any] | Settings for JWT authentication. Docs |
| litellm_license | str | The license key for the proxy. Docs |
| oauth2_config_mappings | Dict[str, str] | Define the OAuth2 config mappings |
| pass_through_endpoints | List[Dict[str, Any]] | Define the pass through endpoints. Docs |
| enable_oauth2_proxy_auth | boolean | (Enterprise Feature) If true, enables oauth2.0 authentication |
| forward_openai_org_id | boolean | If true, forwards the OpenAI Organization ID to the backend LLM call (if it's OpenAI). |
| forward_client_headers_to_llm_api | boolean | If true, forwards the client headers (any x- headers and anthropic-beta headers) to the backend LLM call |
| maximum_spend_logs_retention_period | str | Used to set the max retention time for spend logs in the db, after which they will be auto-purged |
| maximum_spend_logs_retention_interval | str | Used to set the interval in which the spend log cleanup task should run in. |
| alert_type_config | dict | Configuration mapping alert types to their handler settings |
| always_include_stream_usage | boolean | If true, includes usage metrics in every streaming response chunk |
| auto_redirect_ui_login_to_sso | boolean | If true, automatically redirects UI login page to SSO provider |
| control_plane_url | string | URL of the control plane for cross-instance state sharing |
| custom_auth_run_common_checks | boolean | If true, runs standard auth validation checks alongside custom auth handlers |
| custom_ui_sso_sign_in_handler | string | Custom handler for SSO sign-in logic in the UI |
| database_connection_pool_timeout | integer | Database connection pool timeout in seconds |
| disable_error_logs | boolean | If true, suppresses error tracking and storage in the database |
| enable_health_check_routing | boolean | If true, enables health check-driven request routing to avoid unhealthy deployments |
| health_check_ignore_transient_errors | boolean | If true, 429 (rate limit) and 408 (timeout) health check failures are ignored and do not affect routing or cooldown |
| enable_mcp_registry | boolean | If true, enables access to the centralized MCP server registry |
| enforce_rbac | boolean | If true, enables role-based access control (RBAC) for all proxy operations |
| forward_llm_provider_auth_headers | boolean | If true, forwards provider-specific auth headers to LLM API calls |
| health_check_concurrency | integer | Maximum number of concurrent health check operations |
| health_check_skip_disabled_background_models | boolean | If true, skips health probes for deployments with model_info.disable_background_health_check: true on on-demand GET /health and related health runs (not only the background loop). Doc on health checks |
| health_check_staleness_threshold | integer | Maximum age in seconds for health check results before marking deployments as stale |
| maximum_spend_logs_cleanup_cron | string | Cron expression for scheduling automatic spend log cleanup tasks |
| mcp_client_side_auth_header_name | string | HTTP header name for client-side MCP server credentials |
| mcp_internal_ip_ranges | list | CIDR ranges considered internal for non-public MCP server access control |
| mcp_required_fields | list | List of required field names for MCP server submissions |
| mcp_trusted_proxy_ranges | list | CIDR ranges of proxies trusted to forward X-Forwarded-* headers for MCP. Required (in addition to use_x_forwarded_for: true) for the MCP OAuth authorize endpoint to derive its public origin from those headers. Without this, headers are ignored and the proxy falls back to the request's literal base URL. For ingressed deployments, prefer PROXY_BASE_URL. See MCP OAuth — Reverse proxy and ingress configuration. |
| require_end_user_mcp_access_defined | boolean | If true, requires end users to have explicit MCP access permissions defined |
| role_permissions | list | List of role-based permission configurations |
| search_tools | list | List of search tool configurations for enabling web search capabilities |
| token_rate_limit_type | string | Rate limit counting method: "total", "output", or "input" tokens |
| use_redis_transaction_buffer | boolean | If true, buffers database transactions in Redis before writing |
| use_shared_health_check | boolean | If true, uses Redis-backed shared health check state across multiple proxy instances |
| user_header_mappings | dict | Map custom request headers to user IDs using lookup rules |
| user_header_name | string | HTTP header name to extract user identity from requests |
router_settings - Reference​
:::info
Most values can also be set via litellm_settings. If you see overlapping values, settings on
router_settings will override those on litellm_settings.
:::
router_settings:
routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle" - RECOMMENDED for best performance
redis_host: <your-redis-host> # string
redis_password: <your-redis-password> # string
redis_port: <your-redis-port> # string
enable_pre_call_checks: true # bool - Before call is made check if a call is within model context window
allowed_fails: 3 # cooldown model if it fails > 3 calls in a minute.
cooldown_time: 30 # (in seconds) how long to cooldown model if fails/min > allowed_fails
disable_cooldowns: True # bool - Disable cooldowns for all models
enable_tag_filtering: True # bool - Use tag based routing for requests
tag_filtering_match_any: True # bool - Tag matching behavior (only when enable_tag_filtering=true). `true`: match if deployment has ANY requested tag; `false`: match only if deployment has ALL requested tags
retry_policy: { # Dict[str, int]: retry policy for different types of exceptions
"AuthenticationErrorRetries": 3,
"TimeoutErrorRetries": 3,
"RateLimitErrorRetries": 3,
"ContentPolicyViolationErrorRetries": 4,
"InternalServerErrorRetries": 4
}
allowed_fails_policy: {
"BadRequestErrorAllowedFails": 1000, # Allow 1000 BadRequestErrors before cooling down a deployment
"AuthenticationErrorAllowedFails": 10, # int
"TimeoutErrorAllowedFails": 12, # int
"RateLimitErrorAllowedFails": 10000, # int
"ContentPolicyViolationErrorAllowedFails": 15, # int
"InternalServerErrorAllowedFails": 20, # int
}
content_policy_fallbacks=[{"claude-2": ["my-fallback-model"]}] # List[Dict[str, List[str]]]: Fallback model for content policy violations
fallbacks=[{"claude-2": ["my-fallback-model"]}] # List[Dict[str, List[str]]]: Fallback model for all errors
| Name | Type | Description |
|---|---|---|
| routing_strategy | string | The strategy used for routing requests. Options: "simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing". Default is "simple-shuffle". More information here |
| redis_host | string | The host address for the Redis server. Only set this if you have multiple instances of LiteLLM Proxy and want current tpm/rpm tracking to be shared across them |
| redis_password | string | The password for the Redis server. Only set this if you have multiple instances of LiteLLM Proxy and want current tpm/rpm tracking to be shared across them |
| redis_port | string | The port number for the Redis server. Only set this if you have multiple instances of LiteLLM Proxy and want current tpm/rpm tracking to be shared across them |
| redis_db | int | The database number for the Redis server. Only set this if you have multiple instances of LiteLLM Proxy and want current tpm/rpm tracking to be shared across them |
| enable_pre_call_checks | boolean | If true, checks if a call is within the model's context window before making the call. More information here |
| content_policy_fallbacks | array of objects | Specifies fallback models for content policy violations. More information here |
| fallbacks | array of objects | Specifies fallback models for all types of errors. More information here |
| enable_tag_filtering | boolean | If true, uses tag-based routing for requests. See Tag Based Routing. |
| enable_weighted_failover | boolean | If true and routing_strategy is simple-shuffle, a retryable failure on one deployment re-picks (weighted) across other deployments in the same model group before cross-group fallbacks. Default: false. |
| tag_filtering_match_any | boolean | Tag matching behavior (only when enable_tag_filtering=true). true: match if deployment has ANY requested tag; false: match only if deployment has ALL requested tags |
| cooldown_time | integer | The duration (in seconds) to cooldown a model if it exceeds the allowed failures. |
| disable_cooldowns | boolean | If true, disables cooldowns for all models. More information here |
| retry_policy | object | Specifies the number of retries for different types of exceptions. More information here |
| allowed_fails | integer | The number of failures allowed before cooling down a model. More information here |
| allowed_fails_policy | object | Specifies the number of allowed failures for different error types before cooling down a deployment. More information here |
| default_max_parallel_requests | Optional[int] | The default maximum number of parallel requests for a deployment. |
| default_priority | Optional[int] | The default priority for a request. Only for '.scheduler_acompletion()'. Default is None. |
| polling_interval | Optional[float] | Frequency of polling the queue. Only for '.scheduler_acompletion()'. Default is 3ms. |
| max_fallbacks | Optional[int] | The maximum number of fallbacks to try before exiting the call. Defaults to 5. |
| default_litellm_params | Optional[dict] | The default litellm parameters to add to all requests (e.g. temperature, max_tokens). |
| timeout | Optional[float] | The default timeout for a request. Default is 10 minutes. |
| stream_timeout | Optional[float] | The default timeout for a streaming request. If not set, the 'timeout' value is used. |
| debug_level | Literal["DEBUG", "INFO"] | The debug level for the logging library in the router. Defaults to "INFO". |
| client_ttl | int | Time-to-live for cached clients in seconds. Defaults to 3600. |
| cache_kwargs | dict | Additional keyword arguments for the cache initialization. Use this for non-string Redis parameters that may fail when set via REDIS_* environment variables. |
| routing_strategy_args | dict | Additional keyword arguments for the routing strategy - e.g. lowest latency routing default ttl |
| model_group_alias | dict | Model group alias mapping. E.g. {"claude-3-haiku": "claude-3-haiku-20240229"} |
| num_retries | int | Number of retries for a request. Defaults to 3. |
| default_fallbacks | Optional[List[str]] | Fallbacks to try if no model group-specific fallbacks are defined. |
| caching_groups | Optional[List[tuple]] | List of model groups for caching across model groups. Defaults to None. - e.g. caching_groups=[("openai-gpt-3.5-turbo", "azure-gpt-3.5-turbo")] |
| alerting_config | AlertingConfig | [SDK-only arg] Slack alerting configuration. Defaults to None. Further Docs |
| assistants_config | AssistantsConfig | Set on proxy via assistant_settings. Further docs |
| set_verbose | boolean | DEPRECATED PARAM - see debug docs. If true, sets the logging level to verbose. |
| retry_after | int | Time to wait before retrying a request in seconds. Defaults to 0. If x-retry-after is received from LLM API, this value is overridden. |
| provider_budget_config | ProviderBudgetConfig | Provider budget configuration. Use this to set llm_provider budget limits, e.g. $100/day to OpenAI, $100/day to Azure. Defaults to None. Further Docs |
| enable_pre_call_checks | boolean | If true, checks if a call is within the model's context window before making the call. Required for model_info.max_input_tokens enforcement. Default: false. More information here |
| model_group_retry_policy | Dict[str, RetryPolicy] | [SDK-only arg] Set retry policy for model groups. |
| context_window_fallbacks | List[Dict[str, List[str]]] | Fallback models for context window violations. |
| redis_url | str | URL for Redis server. Known performance issue with Redis URL. |
| cache_responses | boolean | Flag to enable caching LLM Responses, if cache set under router_settings. If true, caches responses. Defaults to False. |
| router_general_settings | RouterGeneralSettings | [SDK-Only] Router general settings - contains optimizations like 'async_only_mode'. Docs |
| optional_pre_call_checks | List[str] | List of pre-call checks to add to the router. Supported: router_budget_limiting, prompt_caching, responses_api_deployment_check, encrypted_content_affinity (requires LiteLLM >= 1.82.3), deployment_affinity, session_affinity, forward_client_headers_by_model_group |
| deployment_affinity_ttl_seconds | int | TTL (seconds) for user-key → deployment affinity mapping when deployment_affinity is enabled (configured at Router init / proxy startup). Defaults to 3600 (1 hour). |
| model_group_affinity_config | Dict[str, List[str]] | Per-model-group affinity flags. Keys are model group names; values are lists of checks to enable (deployment_affinity, responses_api_deployment_check, session_affinity). Groups not listed fall back to the global optional_pre_call_checks. Docs |
| ignore_invalid_deployments | boolean | If true, ignores invalid deployments. Default for proxy is True - to prevent invalid models from blocking other models from being loaded. |
| search_tools | List[SearchToolTypedDict] | List of search tool configurations for Search API integration. Each tool specifies a search_tool_name and litellm_params with search_provider, api_key, api_base, etc. Further Docs |
| guardrail_list | List[GuardrailTypedDict] | List of guardrail configurations for guardrail load balancing. Enables load balancing across multiple guardrail deployments with the same guardrail_name. Further Docs |
| enable_health_check_routing | boolean | If true, enables health check-driven deployment filtering to avoid routing requests to unhealthy deployments |
| health_check_staleness_threshold | integer | Maximum age in seconds for cached health check results before marking deployments as stale |
| health_check_ignore_transient_errors | boolean | If true, 429 (rate limit) and 408 (timeout) health check failures are ignored and do not affect routing or cooldown |
| routing_groups | Optional[List[RoutingGroup]] | List of model groups that each apply their own routing strategy to a subset of models. Each group has a group_name, models (list of model names matched against the request's model), routing_strategy, and optional routing_strategy_args. Defaults to None. |