# Legion version
# By default the .Chart.AppVersion value is used
# Type: string
# legionVersion: "1.0"

# Whether the cluster requires RBAC
# Type: bool
rbac: true

# Docker image registry
# Image names are deduced automatically from this registry, following the Legion naming for images
# Any image can be overridden manually in .<service>.image (see the .edi root section or others)
# Type: string
imagesRegistry: "legionplatform/"

# Logging verbosity of the components
# Valid values are:
# - info
# - debug
# - warning
# - error
logLevel: "info"
# Configuration of Legion ingresses
# Ingresses are created for the following <service>s:
# - edi
# - edge
ingress:
  # Global flag for Ingress enabling
  # Each Ingress could be configured manually in .<service>.ingress.enabled
  # Type: bool
  enabled: false
  # Root domain for auto-created Ingress domains
  # Each domain could be configured manually in .<service>.ingress.domain
  # It also controls building URLs for external resources such as the auth endpoint
  # Type: string
  globalDomain: example.com
  # Global annotations for all services
  # Each Ingress could be configured manually in .<service>.ingress.annotations
  # Type: string->string map
  annotations: {}
  #  kubernetes.io/ingress.class: nginx
  #  kubernetes.io/tls-acme: "true"
  # Global TLS flag
  # Each Ingress could be configured manually in .<service>.ingress.tlsEnabled
  # It also controls building URLs for external resources such as the auth endpoint
  # Type: bool
  tlsEnabled: false
  # Global TLS secret name
  # Each Ingress could be configured manually in .<service>.ingress.tlsSecretName
  # Type: string
  tlsSecretName: ~
# Security configuration. Model API security is configured in the .modelApiSecurity section (see below)
security:
  # Is authorization for WEB requests enabled or not
  # Type: bool
  enabled: false
  # Type of authorization. Currently only oauth2_proxy is supported
  # Valid values are:
  # - oauth2_proxy
  integration: oauth2_proxy
  # Detailed configuration of oauth2_proxy
  oauth2_proxy:
    # Internal URL of oauth2_proxy that will be called on each Ingress request.
    # It is used in the auth_request directive on Ingress Nginx
    # Type: string
    url: http://oauth2-proxy.kube-system.svc.cluster.local:4180/oauth2/auth
    # Public URL to which the user will be redirected for authorization
    # Uncomment for a custom public URL, otherwise auth.<ingress.globalDomain> will be used
    # Besides the standard Nginx Ingress variables, escaped_request_uri is available too
    # Type: string
    # public_url: https://auth.my-company.com/oauth2/start?rd=https://$host$escaped_request_uri
# Model API security
modelApiSecurity:
  # Should the Model API be protected by authorization or not
  # Type: bool
  enabled: false
  # Type of authorization mechanism for the Model API gateway
  # Valid values are:
  # - jwt
  integration: jwt
  # Detailed JWT configuration
  jwt:
    # Secret for JWT
    # Type: string
    secret: example
    # Default token's TTL in minutes
    # Type: integer
    defaultTokenTTLInMinutes: 120
    # Maximum value of TTL in minutes
    # This value is used in EDI to validate requests for token generation
    # Type: integer
    maxTokenTTLInMinutes: 525600
    # Default TTL end date
    # Type: date string, e.g. 2030-12-30T00:00:00
    defaultTokenTTLEndDate: "2030-12-30T00:00:00"
# Components metrics measurement
# Measures components performance through the Prometheus protocol
metrics:
  # Are measurements enabled or not
  # Type: bool
  enabled: false
  # Labels for ServiceMonitor CR objects
  # Type: string -> string map
  serviceMonitoringLabels:
    monitoring: prometheus
# StatsD configuration for metrics that require the StatsD format (e.g. model invocation & training metrics)
# NOTE(review): reconstructed as a top-level section (sibling of .metrics) — confirm against the chart templates
modelMetrics:
  # Are model performance metrics enabled or not
  # Type: bool
  enabled: false
  # StatsD host
  # Type: string
  host: statsd-exporter.kube-monitoring.svc.cluster.local
  # StatsD port
  # Type: integer
  port: 9125
# Default VCS instances. Will be spawned on cluster start
# For more information, read the VCSCredential documentation. Example:
# - name: "legion"
#   type: "git"
#   uri: "git@github.com:legion-platform/legion.git"
#   defaultReference: "origin/develop"
#   creds: ""
vcs: []
# Model storage is a Docker Registry
# Credentials are required for gathering model information
modelStorage:
  # Prefix for all built images
  # Might be useful for management purposes
  # Type: string
  buildPrefix: "legion"
  # Type of Docker Registry.
  # Valid values are:
  # - external - use external Docker Registry
  type: external
  # TODO: add "internal" variant - deploy and use internal Docker Registry with ephemeral disk (only for development purposes)
  # Detailed configuration of external Docker Registry
  external:
    # Protocol for connection to Docker Registry
    # Valid values are:
    # - https
    # - http
    protocol: ~
    # URL of Docker Registry
    # Type: string
    url: ~
    # Credentials on Docker Registry: user name
    # Type: string
    user: example
    # Credentials on Docker Registry: password
    # Type: string
    password: example
# Configuration of model execution process
modelExecution:
  # Limitations of model deployment pods
  # For declaration format see https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/
  # NOTE(review): the key here is `mem`, while Kubernetes resource specs use `memory` —
  # confirm which name the chart templates read before renaming
  limits:
    cpu: 256m
    mem: 256Mi
# Feedback configuration
feedback:
  # Is feedback gathering stack enabled or not
  # Type: bool
  enabled: false
  fluentd:
    # This variable can be uncommented to setup custom image name for fluentd
    # Type: string
    # image: custom-image:1.0
    # Resources for each instance
    # For declaration format see https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/
    resources:
      requests:
        cpu: "300m"
        memory: "256Mi"
      limits:
        cpu: "500m"
        memory: "1024Mi"
    # Input port number
    # Type: integer
    port: 24224
  output:
    # Where feedback data should be stored
    # Only S3 is supported nowadays
    # Valid values:
    # - s3 - AWS S3
    target: s3
    # Detailed configuration for storing on S3
    # NOTE(review): placement of storeAs / format / buffering under .s3 is reconstructed
    # from a flattened source — confirm against the chart templates
    s3:
      # Type of authorization on S3
      # Valid values are:
      # - iam - requires kube2iam to be installed in cluster,
      #         adds annotation "iam.amazonaws.com/role" to FluentD Pod
      #         value of annotation could be specified in .feedback.output.s3.customIAMRole
      # - secret - provide AWS Key ID and AWS Secret Key in ENV variables for FluentD server
      #            AWS Key ID and AWS Secret Key should be specified in
      #            .feedback.output.s3.AWSKeyID and .feedback.output.s3.AWSSecretKey
      authorization: iam
      # Custom name for IAM for iam-based authorization mode of FluentD
      # For details see authorization directive above
      # By default "<.ingress.globalDomain>-<.Release.Namespace>-collector-role" is used
      # Type: string
      # customIAMRole: ~
      # AWS Key ID for secret-based authorization mode of FluentD
      # For details see authorization directive above
      # Type: string
      # AWSKeyID: ~
      # AWS Secret Key for secret-based authorization mode of FluentD
      # For details see authorization directive above
      # Type: string
      # AWSSecretKey: ~
      # S3 bucket name
      # Type: string
      bucket: ~
      # S3 region
      # Type: string
      region: ~
      # Directory for data storing
      # Type: string
      path: "model_log/${tag}/${model_id}/${model_version}/year=%Y/month=%m/day=%d/"
      # Format of file names
      # Type: string
      objectKeyFormat: "%{path}%{time_slice}_%{index}.%{file_extension}"
      # Slicing format
      # Type: string
      timeSliceFormat: "%Y%m%d%H"
      # Slicing wait time
      # Type: string
      timeSliceWait: "5m"
      # Storage type
      # Type: string
      storeAs: "json"
      # Storage format
      # Type: string
      format: "json"
      # Buffering (chunking)
      buffering:
        # Chunks length (window size)
        # Type: string
        timekey: 1m
        # Delay for flush (after end of window)
        # Type: string
        timekeyWait: 0s
        # Temporary buffering location
        # Type: string
        path: /tmp
# Operator configuration
# Operator handles all Legion's CustomResources such as ModelTraining, etc.
operator:
  # Operator's server configuration
  # It listens to the Kubernetes API for Legion CR update events
  # and creates/updates appropriate Pods / Secrets
  server:
    # This variable can be uncommented to setup custom image name for operator (server)
    # Type: string
    # image: custom-image:1.0
    # Count of operator replicas
    # Type: integer
    replicas: 1
    # Resources for each instance
    # For declaration format see https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/
    resources:
      limits:
        cpu: "256m"
        memory: "256Mi"
      requests:
        cpu: "128m"
        memory: "128Mi"
  # Builder's configuration
  # It is placed in a sidecar container of the training pod
  # and it is in charge of communicating with the host's Docker socket
  # for training container capturing.
  builder:
    # This variable can be uncommented to setup custom image name for operator (builder)
    # Type: string
    # image: custom-image:1.0
# EDI server configuration
# It provides HTTP API for model training & model deployment management
# also it creates JWT tokens for model invocation
edi:
  # EDI could be disabled
  # Type: bool
  enabled: true
  # This variable can be uncommented to setup custom image name for EDI
  # Type: string
  # image: custom-image:1.0
  # Count of EDI replicas
  # Type: integer
  replicas: 1
  # Maximum number of retries for K8S API calls
  # Type: integer
  k8sApiRetryNumberMaxLimit: 10
  # Delay between retries for K8S API calls (in seconds)
  # Type: integer
  k8sApiRetryDelaySec: 3
  # Port on which EDI listens for incoming traffic
  # Type: integer
  port: 80
  # Resources for each instance
  # For declaration format see https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/
  resources:
    requests:
      cpu: "50m"
      memory: "128Mi"
    limits:
      cpu: "100m"
      memory: "512Mi"
  # Configuration of ingress object
  ingress:
    # Custom enabling/disabling of Ingress resource for EDI
    # To use specific value, uncomment and replace ~ with target value
    # Type: bool
    # enabled: ~
    # Annotations for ingress
    # Will be added to global annotations (.ingress.annotations)
    # Type: string->string map
    annotations: {}
    # Custom domain name
    # By default domain name "edi.<.ingress.globalDomain>" is used
    # To use specific value, uncomment and replace ~ with target value
    # Type: string
    # domain: ~
    # Is TLS enabled for this Ingress or not
    # By default global variable is used (.ingress.tlsEnabled)
    # To use specific value, uncomment and replace ~ with target value
    # Type: bool
    # tlsEnabled: ~
    # Custom TLS secret name
    # By default global variable is used (.ingress.tlsSecretName)
    # To use specific value, uncomment and replace ~ with target value
    # Type: string
    # tlsSecretName: ~
# EDGE gateway
# It handles all incoming traffic for model invocation
# and it does JWT validation of requests if it is enabled
edge:
  # EDGE gateway could be disabled
  # Type: bool
  enabled: true
  # This variable can be uncommented to setup custom image name for EDGE
  # Type: string
  # image: custom-image:1.0
  # Count of EDGE replicas
  # Type: integer
  replicas: 1
  # Port on which EDGE listens for incoming traffic
  # Type: integer
  port: 80
  # Resources for each instance
  # For declaration format see https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/
  resources:
    requests:
      cpu: "50m"
      memory: "128Mi"
    limits:
      cpu: "100m"
      memory: "512Mi"
  # Ingress object configuration
  ingress:
    # Custom enabling/disabling of Ingress resource for EDGE
    # To use specific value, uncomment and replace ~ with target value
    # Type: bool
    # enabled: ~
    # Annotations for ingress
    # Will be added to global annotations (.ingress.annotations)
    # Type: string->string map
    annotations: {}
    # Custom domain name
    # By default domain name "edge.<.ingress.globalDomain>" is used
    # To use specific value, uncomment and replace ~ with target value
    # Type: string
    # domain: ~
    # Is TLS enabled for this Ingress or not
    # By default global variable is used (.ingress.tlsEnabled)
    # To use specific value, uncomment and replace ~ with target value
    # Type: bool
    # tlsEnabled: ~
    # Custom TLS secret name
    # By default global variable is used (.ingress.tlsSecretName)
    # To use specific value, uncomment and replace ~ with target value
    # Type: string
    # tlsSecretName: ~
# Toolchains configuration
toolchains:
  python:
    # This variable can be uncommented to setup custom image name for python
    # Type: string
    # image: custom-image:1.0
Each cluster that you want to deploy with our Jenkinsfiles and Ansible playbooks should be configured using profiles and secrets. For application configuration you may use the CLI configuration interface or environment variables.
A profile describes the main characteristics of a cluster, such as DNS names, machine shapes (RAM, CPU, etc.) and so on.
A secret describes private cluster information, such as credentials for internal and external systems, secret keys and so on.
The CLI configuration interface is a way to change the behaviour of Legion applications (logging level, auth tokens, etc.), but it controls only part of the Legion applications' configuration.
Environment variables configuration allows full control of the Legion applications' configuration.
Each profile should be located in the /deploy/profiles
directory of the current repository. A profile file name consists of two parts: the environment name (usually equal to the DNS name) and the .yml
extension.
The file should consist of valid YAML text.
Here is an example of a profile file:
# DNS
base_domain: legion-dev.epm.kharlamov.biz  # DNS name of environment
route53_zone: epm.kharlamov.biz  # AWS Route53 zone on which domain will be created (zone should be parked before deploy)

# Common
vendor: legion  # name of vendor, will be used in resource tags
env_name: legion-dev  # short name of env, will be added in resource tags

# Ansible variables
tmp_dir: /tmp/  # directory for storing temporary files (on host during deploy)
git_key: "/home/jenkins/deploy.cert"  # SSH Git access key which will be copied to Jenkins in cluster
ssh_public_key: ~/.ssh/id_rsa.pub  # public key which will be copied to cluster

# AWS configuration
aws_region: us-east-2  # target AWS region for EC2 instances
bastion_shape: t2.micro  # shape for bastion nodes (not as part of Kubernetes cluster)
master_shape: t2.large  # shape for masters
node_shape: t2.large  # shape for nodes
node_autoscaler_min: 3  # minimum count of nodes for autoscaler group
node_autoscaler_max: 5  # maximum count of nodes for autoscaler group
node_extra_shapes:  # list of shapes that can be started up during model building of Jenkins (will be shut down automatically)
  - r4.large  # 2 cpu / 15.25Gb / $0.133 ph
  - r4.xlarge  # 4 cpu / 30.5Gb / $0.266 ph
  - r4.2xlarge  # 8 cpu / 61Gb / $0.532 ph
  - r4.4xlarge  # 16 cpu / 122Gb / $1.064 ph
  - r4.8xlarge  # 32 cpu / 244Gb / $2.128 ph
  - r4.16xlarge  # 64 cpu / 488Gb / $4.256 ph
  - x1.16xlarge  # 64 cpu / 976Gb / $6.669 ph
  - x1.32xlarge  # 128 cpu / 1952Gb / $13.338 ph
node_extra_min: 0  # minimum count of nodes for model building
node_extra_max: 2  # maximum count of nodes for model building
vpc_id: vpc-5729c13e  # VPC id where the cluster will be created

# Common cluster configuration for KOPS
cluster_name: legion-dev.epm.kharlamov.biz  # unique KOPS cluster name
state_store: s3://legion-cluster  # AWS S3 bucket for storing KOPS state
aws_image: kope.io/k8s-1.8-debian-jessie-amd64-hvm-ebs-2018-02-08  # base Kubernetes image
kubernetes_version: 1.9.3  # kubernetes version
private_network: '172.31'  # private network prefix
cluster_zones:  # configuration of cluster zones
  - zone_name: us-east-2a
    kops_cidr: "{{ private_network }}.100.0/24"
    kops_utility_cidr: "{{ private_network }}.103.0/24"
  - zone_name: us-east-2b
    kops_cidr: "{{ private_network }}.101.0/24"
    kops_utility_cidr: "{{ private_network }}.104.0/24"
  - zone_name: us-east-2c
    kops_cidr: "{{ private_network }}.102.0/24"
    kops_utility_cidr: "{{ private_network }}.105.0/24"

# TLS certificates issuing configuration (via Let's Encrypt)
certificate_email: "admin@example.com"  # Let's Encrypt notification email (placeholder address)
cert_dir: "/etc/dynssl"  # folder for storing SSL certificates on host

# Deploying and test configuration
use_https: "yes"  # [?]
use_https_for_tests: "yes"  # [?]
pypi_repo: "https://nexus-local.cc.epm.kharlamov.biz/repository/pypi-hosted/simple"  # repository for Python packages
docker_repo: "nexus-local.cc.epm.kharlamov.biz:443"  # docker registry with built images
namespace: default  # namespace of core deployment
deployment: legion  # name of deployment
examples_to_test:  # which Jenkins examples will be executed in tests
  - Test-Summation
  - Digit-Recognition
model_id_to_test: income  # id of model which will be tested in EDI tests
enclaves:  # list of enclaves which will be automatically deployed after Legion deploy
  - company-a
legion_data_s3_bucket: "{{ legion_data_bucket_prefix }}-{{ env_name }}-{{ enclave }}"  # Airflow storage location at S3

# Dex
dex:
  enabled: false  # by default Dex is disabled in profiles (but enabled in secrets)

# Secrets
secrets_bucket: "legion-secrets"  # S3 bucket with secrets
secrets_file: "/tmp/{{ cluster_name }}-secrets"  # path for temporary storage
Each secret should be encrypted with Ansible Vault and uploaded to S3.
The secret should be stored on Jenkins as a credentials file (for example vault-legion-dev.epm.kharlamov.biz).
The S3 path to a secret is built using the following template: {{ secrets_bucket }}/vault/{{ profile }}
for example legion-secrets/vault/legion-dev.epm.kharlamov.biz
The decrypted file should consist of valid YAML text.
Here is an example of a secret file:
# AWS resources configuration
aws:
  # Quoted so that the leading zeros are kept (an unquoted value is resolved as an integer)
  account_id: "000000000000"
  rds:  # credentials for dynamically deployed RDS
    username: example
    password: example
    database_name: db

# NOTE(review): the three keys below are reconstructed as top-level keys — confirm against the playbooks
external_access_sgs:  # list of AWS SG that should be added on ELB
  - sg-00000000
allowed_wan_ips:  # list of whitelisted CIDRs
  - 1.2.3.4/32
jenkins_cc_sg: sg-00000000  # CC Jenkins Security Group to be whitelisted on cluster

# DEX configuration
dex:
  enabled: true
  config:
    client_id: legion-dev.epm.kharlamov.biz  # env. name ()
    client_secret: AAAAAAAAAAAAAAAA  # randomly generated 24-len password
    connectors:
      - type: github
        id: github
        name: GitHub
        config:
          clientID: client_id
          clientSecret: client_secret
          redirectURI: https://dex.legion-dev.epm.kharlamov.biz/callback  # DEX callback URL
          orgs:
            - name: legion-platform  # linked GitHub organizations
    staticPasswords:  # static hardcoded passwords for test
      - email: "example@example.com"  # placeholder address
        password: example
        hash: "$2a$10$2b2cU8CPhOTaGrs1HRQuAueS7JTT5ZHsHSzYiFPm1leZck7Mc8T4W"  # bcrypt hash of the string "password"
        username: example
        userID: "08a8684b-db88-4b73-90a9-3cd1661f5466"
  groups:  # GitHub groups mapping
    - clusterrolebinding: cluster-admin
      group: legion-platform:admin
    - clusterrolebinding: view
      group: legion-platform:view