Содержание

ClearML

Server

Пример запуска в swarm/compose через traefik:

compose.server.yaml

compose.server.yaml

# Named volumes of the ClearML server stack. Every volume uses the local
# driver with type=none / o=bind, i.e. it is a bind mount of a host directory
# under /data/docker/clearml.
volumes:
  # Mounted at /var/log/clearml by apiserver, fileserver and async_delete.
  logs:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /data/docker/clearml/logs

  # Mounted at /opt/clearml/config by apiserver, fileserver and async_delete.
  config:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /data/docker/clearml/config

  # Mounted at /mnt/fileserver by apiserver and fileserver.
  fileserver:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /data/docker/clearml/data/fileserver

  # Elasticsearch data directory (/usr/share/elasticsearch/data).
  elastic_7:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /data/docker/clearml/data/elastic_7

  # MongoDB data directory (/data/db).
  mongo_4_db:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /data/docker/clearml/data/mongo_4/db

  # MongoDB config directory (/data/configdb).
  mongo_4_configdb:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /data/docker/clearml/data/mongo_4/configdb

  # Redis data directory (/data).
  redis:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /data/docker/clearml/data/redis

services:
  # ClearML API server. Reachable inside the stack on the backend network and
  # published through Traefik as api.clearml.domain.com (container port 8008).
  apiserver:
    command:
      - apiserver
    container_name: clearml-apiserver
    image: clearml/server:latest
    restart: unless-stopped
    volumes:
      - logs:/var/log/clearml
      - config:/opt/clearml/config
      - fileserver:/mnt/fileserver
    depends_on:
      - redis
      - mongo
      - elasticsearch
      - fileserver
    environment:
      # Backing services are addressed by their compose service names.
      CLEARML_ELASTIC_SERVICE_HOST: elasticsearch
      CLEARML_ELASTIC_SERVICE_PORT: "9200"
      CLEARML_MONGODB_SERVICE_HOST: mongo
      CLEARML_MONGODB_SERVICE_PORT: "27017"
      CLEARML_REDIS_SERVICE_HOST: redis
      CLEARML_REDIS_SERVICE_PORT: "6379"
      CLEARML_SERVER_DEPLOYMENT_TYPE: linux
      CLEARML__apiserver__pre_populate__enabled: "true"
      CLEARML__apiserver__pre_populate__zip_files: "/opt/clearml/db-pre-populate"
      CLEARML__apiserver__pre_populate__artifacts_path: "/mnt/fileserver"
      CLEARML__services__async_urls_delete__enabled: "true"
      # CLEARML_FILES_HOST is a full URL (the agent configuration defaults it
      # to http://fs.clearml.domain.com), so the fallback carries the scheme
      # as well; otherwise the prefix would be a bare hostname only when the
      # variable is unset.
      CLEARML__services__async_urls_delete__fileserver__url_prefixes: "[${CLEARML_FILES_HOST:-http://fs.clearml.domain.com}]"
      # Credentials for the services agent; empty when the variables are unset.
      CLEARML__secure__credentials__services_agent__user_key: ${CLEARML_AGENT_ACCESS_KEY:-}
      CLEARML__secure__credentials__services_agent__user_secret: ${CLEARML_AGENT_SECRET_KEY:-}
    networks:
      - backend
      - traefik-public
    deploy:
      resources:
        limits:
          # NOTE(review): the limits reuse the MEMORYWEBSERVER / CPUWEBSERVER
          # variables that fileserver and webserver also read - confirm that
          # sharing them is intended.
          memory: ${MEMORYWEBSERVER-500m}
          cpus: ${CPUWEBSERVER-1}
      labels:
        - traefik.enable=true
        - traefik.swarm.network=traefik-public
        - traefik.http.routers.api-clearml-loc-http.entrypoints=http
        - traefik.http.routers.api-clearml-loc-http.rule=Host(`api.clearml.domain.com`)
        - traefik.http.routers.api-clearml-loc-http.service=api-clearml
        - traefik.http.services.api-clearml.loadbalancer.server.port=8008

  # Single-node Elasticsearch, attached to the backend network only.
  elasticsearch:
    container_name: clearml-elastic
    image: elasticsearch:8.17.0
    restart: unless-stopped
    networks:
      - backend
    environment:
      cluster.name: clearml
      node.name: clearml
      discovery.type: "single-node"
      xpack.security.enabled: "false"
      # Memory locking is requested here; the memlock ulimit below is set to
      # unlimited (-1) for the container.
      bootstrap.memory_lock: "true"
      http.compression_level: "7"
      reindex.remote.whitelist: "'*.*'"
      cluster.routing.allocation.node_initial_primaries_recoveries: "500"
      # All three disk watermarks are set to the same value.
      cluster.routing.allocation.disk.watermark.low: 500mb
      cluster.routing.allocation.disk.watermark.high: 500mb
      cluster.routing.allocation.disk.watermark.flood_stage: 500mb
    ulimits:
      nofile:
        soft: 65536
        hard: 65536
      memlock:
        soft: -1
        hard: -1
    volumes:
      - elastic_7:/usr/share/elasticsearch/data
    deploy:
      resources:
        limits:
          cpus: ${CPUELASTIC-2}
          memory: ${MEMORYELASTIC-2G}

  # ClearML file server, published through Traefik as fs.clearml.domain.com
  # (container port 8081).
  fileserver:
    container_name: clearml-fileserver
    image: clearml/server:latest
    command:
      - fileserver
    restart: unless-stopped
    networks:
      - backend
      - traefik-public
    environment:
      CLEARML__fileserver__auth__enabled: "false"
      CLEARML__fileserver__delete__allow_batch: "true"
    volumes:
      - logs:/var/log/clearml
      - fileserver:/mnt/fileserver
      - config:/opt/clearml/config
    deploy:
      resources:
        limits:
          cpus: ${CPUWEBSERVER-1}
          memory: ${MEMORYWEBSERVER-500m}
      labels:
        - traefik.enable=true
        - traefik.swarm.network=traefik-public
        - traefik.http.routers.fs-clearml-loc-http.entrypoints=http
        - traefik.http.routers.fs-clearml-loc-http.rule=Host(`fs.clearml.domain.com`)
        - traefik.http.routers.fs-clearml-loc-http.service=fs-clearml
        - traefik.http.services.fs-clearml.loadbalancer.server.port=8081

  # MongoDB for the API server, attached to the backend network only.
  mongo:
    container_name: clearml-mongo
    image: mongo:7.0.22
    # Extra server parameter passed to the image's default entrypoint.
    command: --setParameter internalQueryMaxBlockingSortMemoryUsageBytes=196100200
    restart: unless-stopped
    networks:
      - backend
    volumes:
      - mongo_4_db:/data/db
      - mongo_4_configdb:/data/configdb
    deploy:
      resources:
        limits:
          cpus: ${CPUMONGO-2}
          memory: ${MEMORYMONGO-2G}

  # Redis for the API server, attached to the backend network only.
  redis:
    container_name: clearml-redis
    image: redis:7.4.1
    restart: unless-stopped
    networks:
      - backend
    volumes:
      - redis:/data
    deploy:
      replicas: 1
      resources:
        limits:
          cpus: ${CPUREDIS-1}
          memory: ${MEMORYREDIS-128m}

  # ClearML web UI, published through Traefik as clearml.domain.com
  # (container port 80).
  webserver:
    container_name: clearml-webserver
    image: clearml/server:latest
    command:
      - webserver
    restart: unless-stopped
    depends_on:
      - apiserver
    networks:
      - backend
      - traefik-public
    environment:
      # Base URLs of the API server and file server; they use the same
      # hostnames as the Traefik routers of those services.
      WEBSERVER__apiBaseUrl: "http://api.clearml.domain.com/"
      WEBSERVER__fileBaseUrl: "http://fs.clearml.domain.com/"
      WEBSERVER__useFilesProxy: "true"
    deploy:
      resources:
        limits:
          cpus: ${CPUWEBSERVER-2}
          memory: ${MEMORYWEBSERVER-500m}
      labels:
        - traefik.enable=true
        - traefik.swarm.network=traefik-public
        - traefik.http.routers.clearml-loc-http.entrypoints=http
        - traefik.http.routers.clearml-loc-http.rule=Host(`clearml.domain.com`)
        - traefik.http.routers.clearml-loc-http.service=clearml
        - traefik.http.services.clearml.loadbalancer.server.port=80

  # Background job that runs the jobs.async_urls_delete module from the
  # server image and talks to the file server over the backend network.
  async_delete:
    depends_on:
      - apiserver
      - redis
      - mongo
      - elasticsearch
      - fileserver
    container_name: async_delete
    image: clearml/server:latest
    networks:
      - backend
    restart: unless-stopped
    environment:
      # Backing services are addressed by their compose service names.
      CLEARML_ELASTIC_SERVICE_HOST: elasticsearch
      CLEARML_ELASTIC_SERVICE_PORT: "9200"
      CLEARML_MONGODB_SERVICE_HOST: mongo
      CLEARML_MONGODB_SERVICE_PORT: "27017"
      CLEARML_REDIS_SERVICE_HOST: redis
      CLEARML_REDIS_SERVICE_PORT: "6379"
      # Lets "python3 -m jobs.async_urls_delete" resolve the jobs package.
      PYTHONPATH: /opt/clearml/apiserver
      # CLEARML_FILES_HOST is a full URL (the agent configuration defaults it
      # to http://fs.clearml.domain.com), so the fallback carries the scheme
      # as well; otherwise the prefix would be a bare hostname only when the
      # variable is unset.
      CLEARML__services__async_urls_delete__fileserver__url_prefixes: "[${CLEARML_FILES_HOST:-http://fs.clearml.domain.com}]"
    entrypoint:
      - python3
      - -m
      - jobs.async_urls_delete
      - --fileserver-host
      - http://fileserver:8081
    volumes:
      - logs:/var/log/clearml
      - config:/opt/clearml/config

networks:
  # Internal network joined by every service of this stack.
  backend:
    attachable: true
  # Declared external, so it is not created by this file; joined by the
  # services that carry Traefik labels (apiserver, fileserver, webserver).
  traefik-public:
    external: true


https://clear.ml/docs/latest/docs/deploying_clearml/clearml_server_linux_mac/
https://github.com/clearml/clearml-server/blob/master/docker/docker-compose.yml

Agent

Пример запуска агента с автоматической очисткой директории /data/clearml/dataset (монтируется в контейнер как /dataset) и едиными директориями кэша:

compose.agent.yaml

compose.agent.yaml

# Named volumes of the agent stack; both are bind mounts (local driver,
# type=none, o=bind) of host directories under /data/clearml.
volumes:
  # Mounted at /root/.clearml in the agent container.
  cache:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /data/clearml/cache

  # Mounted at /dataset in the agent container.
  dataset:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /data/clearml/dataset

services:
  # ClearML agent. Its entrypoint writes /root/clearml.conf, patches the image
  # entrypoint to use GPUs, empties /dataset, waits for the API server and
  # then starts the regular agent entrypoint.
  clearml-agent:
    container_name: clearml-agent-services
    image: clearml/clearml-agent-services:latest
    deploy:
      replicas: 1
      restart_policy:
        condition: on-failure
    privileged: true
    environment:
      CLEARML_WEB_HOST: ${CLEARML_WEB_HOST:-http://clearml.domain.com}
      CLEARML_API_HOST: ${CLEARML_API_HOST:-http://api.clearml.domain.com}
      CLEARML_FILES_HOST: ${CLEARML_FILES_HOST:-http://fs.clearml.domain.com}
      # "SET" is the placeholder used when a credential variable is missing.
      CLEARML_API_ACCESS_KEY: ${CLEARML_AGENT_ACCESS_KEY:-SET}
      CLEARML_API_SECRET_KEY: ${CLEARML_AGENT_SECRET_KEY:-SET}
      CLEARML_AGENT_GIT_USER: ${CLEARML_AGENT_GIT_USER:-SET}
      CLEARML_AGENT_GIT_PASS: ${CLEARML_AGENT_GIT_PASS:-SET}
      CLEARML_AGENT_NO_UPDATE: "1"
      CLEARML_AGENT_DAEMON_OPTIONS: "--create-queue"
      CLEARML_AGENT_QUEUES: ${CLEARML_AGENT_QUEUE:-mlqueue}
      CLEARML_WORKER_ID: "docker3"
      # Image reference is <repository>:<tag>; the ":" separator is required,
      # without it the whole string is treated as a repository name.
      CLEARML_AGENT_DEFAULT_BASE_DOCKER: "pytorch/pytorch:2.8.0-cuda12.9-cudnn9-runtime"
      # Host path of the cache volume mapped to its path inside this container.
      CLEARML_AGENT_DOCKER_HOST_MOUNT: "/data/clearml/cache:/root/.clearml"
      SHUTDOWN_IF_NO_ACCESS_KEY: "1"
      CLEARML_AGENT_SKIP_PYTHON_ENV_INSTALL: "1"
      CLEARML_AGENT_SKIP_PIP_VENV_INSTALL: "1"
      CLEARML_TASK_NO_REUSE: "1"
      CLEARML_AGENT_DOCKER_ARGS_HIDE_ENV: "CLEARML_API_SECRET_KEY CLEARML_AGENT_GIT_PASS DB_DS_PASSWORD"
      # "$$" is the compose escape for a literal "$", so the regex anchors and
      # the ${...} references below reach the container unexpanded.
      CLEARML_AGENT_DOCKER_ARGS_FILTERS: "^--env$$ ^-e$$ ^--workdir$$ ^-w$$"
      CLEARML_AGENT_EXTRA_DOCKER_ARGS: "--rm --shm-size=8g --volume /data/clearml/dataset:/dataset -e CLEARML_API_HOST=$${CLEARML_API_HOST} -e CLEARML_API_ACCESS_KEY=$${CLEARML_API_ACCESS_KEY} -e CLEARML_API_SECRET_KEY=$${CLEARML_API_SECRET_KEY}"
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
      - cache:/root/.clearml
      - dataset:/dataset
    # The script below runs under "bash -c". Everything between "cat <<EOF"
    # and "EOF" is written verbatim to /root/clearml.conf.
    entrypoint:
      - bash
      - -c
      - |
        cat <<EOF > /root/clearml.conf
        agent {
            venvs_dir = /root/.clearml/venvs-builds
            venvs_cache: {
                path: /root/.clearml/venvs-cache
            },
            vcs_cache: {
                enabled: true,
                path: /root/.clearml/vcs-cache
            },
            pip_download_cache {
                enabled: true,
                path: /root/.clearml/pip-download-cache
            },
            extra_docker_shell_script: ["if [ -d \"/dataset\" ]; then echo \"Cleaning up /dataset ...\"; rm -rf /dataset/*; fi"]
            docker_pip_cache = /root/.clearml/pip-cache
            docker_apt_cache = /root/.clearml/apt-cache
            docker_internal_mounts {
                sdk_cache: "/clearml_agent_cache"
                apt_cache: "/root/.clearml/apt-cache"
                ssh_folder: "/root/.ssh"
                ssh_ro_folder: "/.ssh"
                pip_cache: "/root/.clearml/pip-cache"
                poetry_cache: "/root/.cache/pypoetry"
                vcs_cache: "/root/.clearml/vcs-cache"
                venvs_cache: "/root/.clearml/venvs-cache"
                venv_build: "/root/.clearml/venvs-builds"
                pip_download: "/root/.clearml/pip-download-cache"
            }
        }
        EOF
        sed -i 's/--cpu-only /--gpus all /' /usr/agent/entrypoint.sh
        if [ -d "/dataset" ]; then
          echo "Cleaning up /dataset ..."
          rm -rf /dataset/*
        fi
        curl --retry 3 --silent --fail --show-error --output /dev/null --retry-delay 10 --retry-all-errors "$${CLEARML_API_HOST}/debug.ping" \
        && /usr/agent/entrypoint.sh


https://clear.ml/docs/latest/docs/configs/env_vars
https://clear.ml/docs/latest/docs/clearml_agent/clearml_agent_env_var
https://clear.ml/docs/latest/docs/configs/clearml_conf#agent-section