Files
vmis/backend/app/services/scheduler/schedule_tenant.py
VMIS Developer cf1e26c2e9 revert: all tenants (including manager) need NC+OO containers
Manager tenant is a real company tenant with employees who need
NC Drive + OO. is_manager only controls admin portal access and
Traefik route inclusion, not whether NC/OO infrastructure is needed.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-15 22:33:25 +08:00

1050 lines
42 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Schedule 1 — 租戶檢查(每 3 分鐘)
3-state: None=未設定(灰), True=正常(綠), False=異常(紅)
- None → 自動嘗試建立/部署,記錄 done_at
- False → 發送告警 email 給所有管理員
"""
import logging
import smtplib
from email.mime.text import MIMEText
from typing import Optional
import httpx
from sqlalchemy.orm import Session
from app.core.config import settings
from app.core.utils import now_tw
from app.models.tenant import Tenant
from app.models.result import TenantScheduleResult
logger = logging.getLogger(__name__)
# Infrastructure endpoints. Trial tenants run against a separate PG/Redis/KC
# stack; "active" tenants use the production hosts.
# NOTE(review): DB/Redis credentials are hard-coded here — consider moving
# PG_PASS / REDIS_PASS into settings or environment variables.
PG_PORT_TRIAL = 5433
PG_PORT_ACTIVE = 5432
PG_USER = "admin"
PG_PASS = "DC1qaz2wsx"
PG_HOST_TRIAL = "10.1.0.20"
PG_HOST_ACTIVE = "10.1.0.254"
REDIS_HOST_TRIAL = "10.14.0.20"
REDIS_HOST_ACTIVE = "10.1.0.254"
REDIS_PORT = 6379
REDIS_PASS = "DC1qaz2wsx"
KC_HOST_TRIAL = "auth.lab.taipei"    # Keycloak host for trial tenants
KC_HOST_ACTIVE = "auth.ease.taipei"  # Keycloak host for active tenants
TRAEFIK_DYNAMIC_DIR = "/home/porsche/traefik/dynamic"  # Traefik file-provider dir (hot reload)
TRAEFIK_API_URL = "http://10.1.0.254:8080"             # Traefik API, used to verify routes loaded
# ─── Docker Compose template generation ──────────────────────────────────────
# OnlyOffice AI-translations plugin: zh-TW translation files are bind-mounted
# read-only into the plugin's translations directory inside the OO container.
OO_AI_PLUGIN_GUID = "{9DC93CDB-B576-4F0C-B55E-FCC9C48DD007}"
OO_AI_TRANSLATIONS_HOST = "/home/porsche/tenants/shared/oo-plugins/ai-translations"
OO_AI_TRANSLATIONS_CONTAINER = (
    f"/var/www/onlyoffice/documentserver/sdkjs-plugins/{OO_AI_PLUGIN_GUID}/translations"
)
def _generate_tenant_compose(tenant, is_active: bool) -> str:
    """
    Render the tenant's docker-compose.yml content.

    Defines the NC (Nextcloud 31) + OO (OnlyOffice) containers; OO gets the
    Traditional-Chinese AI plugin files bind-mounted read-only.
    NC is not routed by Traefik labels (traefik.enable=false) — its route is
    written by the file provider (see _generate_tenant_route_yaml); OO is
    routed by labels on its own office-* host.

    Args:
        tenant: Tenant row; only .code and .domain are read.
        is_active: production tenants get no "-test" suffix and use the
            production PG host/port; trial tenants get "-test" names.

    Returns:
        The docker-compose.yml file content as a string.
    """
    code = tenant.code
    # Trial deployments are suffixed "-test" so trial and production coexist.
    suffix = "" if is_active else "-test"
    nc = f"nc-{code}{suffix}"
    oo = f"oo-{code}{suffix}"
    pg_host = PG_HOST_ACTIVE if is_active else PG_HOST_TRIAL
    pg_port = PG_PORT_ACTIVE if is_active else PG_PORT_TRIAL
    pg_db = f"nc_{code}_db"
    nc_domain = tenant.domain
    oo_host = f"office-{code}.ease.taipei" if is_active else f"office-{code}.lab.taipei"
    ai_base = OO_AI_TRANSLATIONS_HOST
    ai_cont = OO_AI_TRANSLATIONS_CONTAINER
    return f"""services:
  {nc}:
    image: nextcloud:31
    container_name: {nc}
    restart: unless-stopped
    volumes:
      - {nc}-data:/var/www/html
      - {nc}-apps:/var/www/html/custom_apps
      - {nc}-config:/var/www/html/config
    environment:
      POSTGRES_HOST: {pg_host}:{pg_port}
      POSTGRES_DB: {pg_db}
      POSTGRES_USER: {PG_USER}
      POSTGRES_PASSWORD: ${{NC_DB_PASSWORD}}
      NEXTCLOUD_ADMIN_USER: ${{NC_ADMIN_USER}}
      NEXTCLOUD_ADMIN_PASSWORD: ${{NC_ADMIN_PASSWORD}}
      NEXTCLOUD_TRUSTED_DOMAINS: {nc_domain}
      OVERWRITEPROTOCOL: https
      TRUSTED_PROXIES: 172.18.0.0/16
      TZ: Asia/Taipei
    networks:
      - traefik-network
    labels:
      - "traefik.enable=false"
  {oo}:
    image: onlyoffice/documentserver:latest
    container_name: {oo}
    restart: unless-stopped
    environment:
      - JWT_SECRET=${{OO_JWT_SECRET}}
    volumes:
      - {oo}-data:/var/www/onlyoffice/Data
      - {oo}-log:/var/log/onlyoffice
      - {ai_base}/zh-TW.json:{ai_cont}/zh-TW.json:ro
      - {ai_base}/zh-TW.json.gz:{ai_cont}/zh-TW.json.gz:ro
      - {ai_base}/langs.json:{ai_cont}/langs.json:ro
      - {ai_base}/langs.json.gz:{ai_cont}/langs.json.gz:ro
    networks:
      - traefik-network
    labels:
      - "traefik.enable=true"
      - "traefik.docker.network=traefik-network"
      - "traefik.http.routers.{oo}.rule=Host(`{oo_host}`)"
      - "traefik.http.routers.{oo}.entrypoints=websecure"
      - "traefik.http.routers.{oo}.tls=true"
      - "traefik.http.routers.{oo}.tls.certresolver=letsencrypt"
      - "traefik.http.services.{oo}.loadbalancer.server.port=80"
      - "traefik.http.middlewares.{oo}-headers.headers.customrequestheaders.X-Forwarded-Proto=https"
      - "traefik.http.routers.{oo}.middlewares={oo}-headers"
networks:
  traefik-network:
    external: true
volumes:
  {nc}-data:
  {nc}-apps:
  {nc}-config:
  {oo}-data:
  {oo}-log:
"""
def _ensure_tenant_compose(tenant, is_active: bool) -> bool:
    """
    Ensure the tenant's docker-compose.yml and .env exist on the Docker host.

    Missing files are generated from the current template and written over
    SFTP; existing files are never overwritten, so manual adjustments survive.

    Returns:
        True on success (including the nothing-to-do case), False on any
        SSH/SFTP error (logged, never raised).
    """
    try:
        import paramiko
        code = tenant.code
        deploy_dir = f"{settings.TENANT_DEPLOY_BASE}/{code}"
        compose_path = f"{deploy_dir}/docker-compose.yml"
        env_path = f"{deploy_dir}/.env"
        client = paramiko.SSHClient()
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        client.connect(settings.DOCKER_SSH_HOST, username=settings.DOCKER_SSH_USER, timeout=15)
        # Make sure the deploy directory exists.
        _, out, _ = client.exec_command(f"mkdir -p {deploy_dir}")
        out.channel.recv_exit_status()
        # Only write docker-compose.yml when absent (avoid clobbering manual edits).
        _, stdout, _ = client.exec_command(f"test -f {compose_path} && echo exists || echo missing")
        exists = stdout.read().decode().strip() == "exists"
        if not exists:
            content = _generate_tenant_compose(tenant, is_active)
            sftp = client.open_sftp()
            with sftp.open(compose_path, "w") as f:
                f.write(content)
            sftp.close()
            logger.info(f"docker-compose.yml generated for {code}")
        # Ensure .env exists with the variables the compose file references.
        _, stdout2, _ = client.exec_command(f"test -f {env_path} && echo exists || echo missing")
        env_exists = stdout2.read().decode().strip() == "exists"
        if not env_exists:
            from app.core.config import settings as _cfg
            env_content = (
                f"NC_DB_PASSWORD={PG_PASS}\n"
                f"NC_ADMIN_USER={_cfg.NC_ADMIN_USER}\n"
                f"NC_ADMIN_PASSWORD={_cfg.NC_ADMIN_PASSWORD}\n"
                f"OO_JWT_SECRET={_cfg.OO_JWT_SECRET}\n"
            )
            sftp2 = client.open_sftp()
            with sftp2.open(env_path, "w") as f:
                f.write(env_content)
            sftp2.close()
            logger.info(f".env generated for {code}")
        client.close()
        return True
    except Exception as e:
        logger.error(f"_ensure_tenant_compose {tenant.code}: {e}")
        return False
# ─── Traefik file provider helpers ───────────────────────────────────────────
def _generate_tenant_route_yaml(tenant, is_active: bool) -> str:
    """
    Render the tenant's Traefik file-provider route YAML.

    Regular tenants: the tenant domain's root routes straight to the NC
    container (no path prefix). Manager tenants instead route /admin and /api
    to vmis-backend (priority 200, above the root) and redirect the bare root
    to /admin/.

    Args:
        tenant: Tenant row; .code, .domain and .is_manager are read.
        is_active: selects the production vs "-test" NC container name.

    Returns:
        The YAML document as a string (trailing newline included).
    """
    code = tenant.code
    domain = tenant.domain
    nc_url = f"http://nc-{code}:80" if is_active else f"http://nc-{code}-test:80"
    lines = ["http:"]
    if tenant.is_manager:
        # Manager tenant: root redirects to /admin; /admin and /api hit vmis-backend.
        lines += [
            "  middlewares:",
            "    vmis-strip-admin:",
            "      stripPrefix:",
            '        prefixes: ["/admin"]',
            "",
            "    vmis-redirect-admin:",
            "      redirectRegex:",
            f'        regex: "^https://{domain}/?$"',
            f'        replacement: "https://{domain}/admin/"',
            "        permanent: false",
            "",
        ]
        lines += [
            "  routers:",
            f"    {code}-admin:",
            f'      rule: "Host(`{domain}`) && PathPrefix(`/admin`)"',
            f"      service: {code}-vmis",
            "      entryPoints: [websecure]",
            "      middlewares: [vmis-strip-admin]",
            "      tls:",
            "        certResolver: letsencrypt",
            "      priority: 200",
            "",
            f"    {code}-api:",
            f'      rule: "Host(`{domain}`) && PathPrefix(`/api`)"',
            f"      service: {code}-vmis",
            "      entryPoints: [websecure]",
            "      tls:",
            "        certResolver: letsencrypt",
            "      priority: 200",
            "",
            f"    {code}-root:",
            f'      rule: "Host(`{domain}`)"',
            f"      service: {code}-vmis",
            "      entryPoints: [websecure]",
            "      middlewares: [vmis-redirect-admin]",
            "      tls:",
            "        certResolver: letsencrypt",
            "      priority: 100",
            "",
            f"    {code}-http:",
            f'      rule: "Host(`{domain}`)"',
            "      entryPoints: [web]",
            "      middlewares: [redirect-https]",
            f"      service: {code}-vmis",
            "",
            "  services:",
            f"    {code}-vmis:",
            "      loadBalancer:",
            "        servers:",
            '          - url: "http://vmis-backend:10281"',
        ]
    else:
        lines += [
            "  routers:",
            f"    {code}-drive:",
            f'      rule: "Host(`{domain}`)"',
            f"      service: {code}-drive",
            "      entryPoints: [websecure]",
            "      tls:",
            "        certResolver: letsencrypt",
            "",
            f"    {code}-http:",
            f'      rule: "Host(`{domain}`)"',
            "      entryPoints: [web]",
            "      middlewares: [redirect-https]",
            f"      service: {code}-drive",
            "",
            "  services:",
            f"    {code}-drive:",
            "      loadBalancer:",
            "        servers:",
            f'          - url: "{nc_url}"',
        ]
    return "\n".join(lines) + "\n"
def _ensure_traefik_routes(tenant, is_active: bool) -> bool:
    """
    Ensure the tenant's Traefik route file exists with the expected content.

    Writes {TRAEFIK_DYNAMIC_DIR}/{code}.yml over SFTP (Traefik's file
    provider hot-reloads it), then polls the Traefik API to confirm the
    route is actually loaded.

    Returns:
        True when the route is visible and enabled in Traefik's API,
        False on any failure (logged, never raised).
    """
    import time
    try:
        import paramiko
        code = tenant.code
        file_path = f"{TRAEFIK_DYNAMIC_DIR}/{code}.yml"
        expected = _generate_tenant_route_yaml(tenant, is_active)
        client = paramiko.SSHClient()
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        client.connect(settings.DOCKER_SSH_HOST, username=settings.DOCKER_SSH_USER, timeout=15)
        sftp = client.open_sftp()
        # Compare against the current file to skip redundant writes (each
        # write triggers a Traefik reload).
        needs_write = True
        try:
            with sftp.open(file_path, "r") as f:
                existing = f.read().decode()
            if existing == expected:
                needs_write = False
                logger.info(f"Traefik route {code}.yml: already correct")
        except FileNotFoundError:
            logger.info(f"Traefik route {code}.yml: not found, creating")
        if needs_write:
            with sftp.open(file_path, "w") as f:
                f.write(expected)
            logger.info(f"Traefik route {code}.yml: written")
        sftp.close()
        client.close()
        # Verify Traefik loaded the route (wait up to ~4 seconds).
        # FIX: manager tenants have no "{code}-drive" router — their YAML
        # defines "{code}-admin"/"{code}-api"/"{code}-root" instead — so
        # unconditionally checking "{code}-drive@file" made verification
        # always fail for the manager tenant. Pick a router that exists.
        router = f"{code}-admin" if tenant.is_manager else f"{code}-drive"
        route_name = f"{router}@file"
        for attempt in range(2):
            try:
                resp = httpx.get(
                    f"{TRAEFIK_API_URL}/api/http/routers/{route_name}",
                    timeout=5.0,
                )
                if resp.status_code == 200 and resp.json().get("status") == "enabled":
                    logger.info(f"Traefik route {route_name}: enabled ✓")
                    return True
            except Exception:
                pass  # API briefly unavailable during reload — retry once
            if attempt == 0:
                time.sleep(2)
        logger.warning(f"Traefik route {route_name}: not visible in API after write")
        return False
    except Exception as e:
        logger.error(f"_ensure_traefik_routes {tenant.code}: {e}")
        return False
# ─── Keycloak helpers ────────────────────────────────────────────────────────
def _check_kc_realm(host: str, realm: str) -> Optional[bool]:
    """
    Probe the realm's OIDC discovery endpoint.

    Returns:
        None  -- realm does not exist yet (HTTP 404)
        True  -- realm exists and responds (HTTP 200)
        False -- any other status or a connection error
    """
    url = f"https://{host}/realms/{realm}/.well-known/openid-configuration"
    try:
        status = httpx.get(url, timeout=5.0).status_code
    except Exception as e:
        logger.warning(f"KC realm check failed {host}/{realm}: {e}")
        return False
    if status == 200:
        return True
    return None if status == 404 else False
def _create_kc_realm(realm: str, tenant_name: str):
    """Create a Keycloak realm for the tenant and apply the standard
    token-lifespan settings (access code lifespan 600s)."""
    from app.services.keycloak_client import get_keycloak_client

    keycloak = get_keycloak_client()
    keycloak.create_realm(realm, tenant_name)
    keycloak.update_realm_token_settings(realm, access_code_lifespan=600)
def _ensure_kc_drive_client(realm: str, domain: str) -> Optional[str]:
    """
    Ensure the realm has a confidential 'drive' client (NC's OIDC login).

    Returns:
        The client secret on success, None on any failure (logged).
    """
    try:
        from app.services.keycloak_client import get_keycloak_client

        keycloak = get_keycloak_client()
        # Redirect URI covers the tenant root: NC serves the domain directly,
        # with no path prefix.
        outcome = keycloak.create_confidential_client(
            realm, "drive", [f"https://{domain}/*"]
        )
        if outcome not in ("exists", "created"):
            logger.error(f"Failed to ensure drive client in realm {realm}: {outcome}")
            return None
        return keycloak.get_client_secret(realm, "drive")
    except Exception as e:
        logger.error(f"_ensure_kc_drive_client {realm}: {e}")
        return None
def _nc_db_check(container_name: str, pg_host: str, pg_db: str, nc_domain: str, pg_port: int = PG_PORT_TRIAL) -> bool:
    """
    Verify that NC is installed and backed by PostgreSQL; repair if not.

    Three paths:
      * not installed  -> reset the PG schema and run a fresh pgsql install;
      * dbtype=pgsql   -> nothing to do;
      * dbtype=sqlite  -> auto-repair (common when volumes persist across a
        redeploy): delete config.php, wipe SQLite residue from the data dir,
        reset the PG schema, re-run `occ maintenance:install` with pgsql,
        then set overwritehost.

    Returns:
        True when NC ends up correctly on pgsql, False otherwise.
    """
    try:
        import paramiko, json as _json, psycopg2
        client = paramiko.SSHClient()
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        client.connect(settings.DOCKER_SSH_HOST, username=settings.DOCKER_SSH_USER, timeout=15)
        # Query NC install status as JSON via occ.
        _, stdout, _ = client.exec_command(
            f"docker exec -u www-data {container_name} "
            f"php /var/www/html/occ status --output=json 2>/dev/null"
        )
        status_raw = stdout.read().decode().strip()
        installed = False
        try:
            installed = _json.loads(status_raw).get("installed", False)
        except Exception:
            pass  # unparsable occ output -> treated as not installed
        if not installed:
            logger.info(f"NC {container_name}: not installed yet, installing with pgsql...")
            # Reset the PostgreSQL schema so the installer starts clean.
            conn = psycopg2.connect(
                host=pg_host, port=pg_port, dbname=pg_db,
                user=PG_USER, password=PG_PASS, connect_timeout=5,
            )
            conn.autocommit = True
            cur = conn.cursor()
            cur.execute("DROP SCHEMA public CASCADE; CREATE SCHEMA public; GRANT ALL ON SCHEMA public TO PUBLIC;")
            conn.close()
            logger.info(f"PostgreSQL {pg_db}@{pg_host}: schema reset for fresh install")
            install_cmd = (
                f"docker exec -u www-data {container_name} php /var/www/html/occ maintenance:install "
                f"-n --admin-user admin --admin-pass NC1qaz2wsx "
                f"--database pgsql --database-name '{pg_db}' "
                f"--database-user {PG_USER} --database-pass {PG_PASS} "
                f"--database-host '{pg_host}:{pg_port}' 2>&1"
            )
            _, stdout_inst, _ = client.exec_command(install_cmd)
            # The install can take a while; cap the read at 120s.
            stdout_inst.channel.settimeout(120)
            try:
                install_out = stdout_inst.read().decode().strip()
            except Exception:
                install_out = ""  # read timeout -> counts as failure below
            logger.info(f"NC fresh install output: {install_out}")
            # Set overwritehost so generated URLs use the tenant domain.
            for cfg_cmd in [
                f"docker exec -u www-data {container_name} php /var/www/html/occ config:system:set overwritehost --value={nc_domain}",
                f"docker exec -u www-data {container_name} php /var/www/html/occ config:system:set overwrite.cli.url --value=https://{nc_domain}",
            ]:
                _, out_cfg, _ = client.exec_command(cfg_cmd)
                out_cfg.channel.settimeout(30)
                try:
                    out_cfg.read()
                except Exception:
                    pass  # best-effort: a slow occ call must not abort the check
            client.close()
            success = "successfully installed" in install_out
            if success:
                logger.info(f"NC {container_name}: installed with pgsql ✓")
            else:
                logger.error(f"NC {container_name}: fresh install failed: {install_out}")
            return success
        # Already installed: confirm the configured dbtype.
        _, stdout2, _ = client.exec_command(
            f"docker exec -u www-data {container_name} "
            f"php /var/www/html/occ config:system:get dbtype 2>/dev/null"
        )
        dbtype = stdout2.read().decode().strip()
        client.close()
        if dbtype == "pgsql":
            logger.info(f"NC {container_name}: already using pgsql ✓")
            return True
        # ─── SQLite detected: auto-repair ────────────────────────────────────
        logger.warning(f"NC {container_name}: dbtype={dbtype}, fixing to pgsql...")
        # 1. Delete config.php and SQLite residue from the data directory.
        client2 = paramiko.SSHClient()
        client2.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        client2.connect(settings.DOCKER_SSH_HOST, username=settings.DOCKER_SSH_USER, timeout=15)
        for cmd in [
            f"docker exec {container_name} rm -f /var/www/html/config/config.php",
            f"docker exec {container_name} sh -c "
            f"'rm -rf /var/www/html/data/admin /var/www/html/data/appdata_* "
            f"/var/www/html/data/*.db /var/www/html/data/nextcloud.log'",
        ]:
            _, out, _ = client2.exec_command(cmd)
            out.channel.recv_exit_status()
            logger.info(f"Cleanup: {cmd}")
        # 2. Reset the PostgreSQL schema.
        conn = psycopg2.connect(
            host=pg_host, port=pg_port, dbname=pg_db,
            user=PG_USER, password=PG_PASS, connect_timeout=5,
        )
        conn.autocommit = True
        cur = conn.cursor()
        cur.execute("DROP SCHEMA public CASCADE; CREATE SCHEMA public; GRANT ALL ON SCHEMA public TO PUBLIC;")
        conn.close()
        logger.info(f"PostgreSQL {pg_db}@{pg_host}: schema reset")
        # 3. Reinstall NC on PostgreSQL.
        install_cmd = (
            f"docker exec -u www-data {container_name} php /var/www/html/occ maintenance:install "
            f"-n --admin-user admin --admin-pass NC1qaz2wsx "
            f"--database pgsql --database-name '{pg_db}' "
            f"--database-user {PG_USER} --database-pass {PG_PASS} "
            f"--database-host '{pg_host}:{pg_port}' 2>&1"
        )
        _, stdout3, _ = client2.exec_command(install_cmd)
        stdout3.channel.recv_exit_status()
        install_out = stdout3.read().decode().strip()
        logger.info(f"NC reinstall output: {install_out}")
        # 4. Set overwritehost again (config.php was deleted in step 1).
        for cfg_cmd in [
            f"docker exec -u www-data {container_name} php /var/www/html/occ config:system:set overwritehost --value={nc_domain}",
            f"docker exec -u www-data {container_name} php /var/www/html/occ config:system:set overwrite.cli.url --value=https://{nc_domain}",
        ]:
            _, out4, _ = client2.exec_command(cfg_cmd)
            out4.channel.recv_exit_status()
        client2.close()
        success = "successfully installed" in install_out
        if success:
            logger.info(f"NC {container_name}: fixed to pgsql ✓")
        else:
            logger.error(f"NC {container_name}: reinstall failed: {install_out}")
        return success
    except Exception as e:
        logger.error(f"_nc_db_check {container_name}: {e}")
        return False
def _nc_initialized(container_name: str) -> bool:
    """
    Return True when the NC container has completed first-time initialization.

    Heuristic: force_language is set to zh_TW during _nc_initialize, so its
    presence marks a fully initialized instance.
    """
    try:
        import paramiko

        ssh = paramiko.SSHClient()
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        ssh.connect(settings.DOCKER_SSH_HOST, username=settings.DOCKER_SSH_USER, timeout=15)
        probe = (
            f"docker exec -u www-data {container_name} "
            f"php /var/www/html/occ config:system:get force_language 2>/dev/null"
        )
        _, stdout, _ = ssh.exec_command(probe)
        value = stdout.read().decode().strip()
        ssh.close()
        return value == "zh_TW"
    except Exception as e:
        logger.warning(f"NC initialized check failed {container_name}: {e}")
        return False
def _nc_initialize(
    container_name: str, kc_host: str, realm: str, client_secret: str, nc_domain: str,
    oo_container: str, oo_url: str, is_active: bool = False,
) -> bool:
    """
    First-time full initialization of an NC container.

      Init-1 : language settings (zh_TW)
      Init-1b: Redis + APCu memcache (required for OIDC session persistence)
      Init-2 : install required apps (contacts / calendar / mail / onlyoffice)
      Init-3 : OIDC provider setup (delegates to _configure_nc_oidc)
      Init-4 : SSO-only mode (allow_multiple_user_backends=0)
      Init-5 : OnlyOffice integration settings

    WARNING: Init-4 must run after Init-3, otherwise login becomes impossible.

    Returns:
        True on success, False on any error (logged, never raised).
    """
    try:
        import paramiko
        client = paramiko.SSHClient()
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        client.connect(settings.DOCKER_SSH_HOST, username=settings.DOCKER_SSH_USER, timeout=15)
        redis_host = REDIS_HOST_ACTIVE if is_active else REDIS_HOST_TRIAL
        # Init-1: language settings.
        for cfg_key, cfg_val in [
            ("default_language", "zh_TW"),
            ("default_locale", "zh_TW"),
            ("force_language", "zh_TW"),
            ("force_locale", "zh_TW"),
        ]:
            _, out, _ = client.exec_command(
                f"docker exec -u www-data {container_name} "
                f"php /var/www/html/occ config:system:set {cfg_key} --value={cfg_val} 2>&1"
            )
            out.channel.recv_exit_status()
        # Init-1b: Redis + APCu memcache (required for OIDC session persistence).
        for cfg_cmd in [
            f"docker exec -u www-data {container_name} php /var/www/html/occ config:system:set memcache.local --value='\\OC\\Memcache\\APCu' 2>&1",
            f"docker exec -u www-data {container_name} php /var/www/html/occ config:system:set memcache.distributed --value='\\OC\\Memcache\\Redis' 2>&1",
            f"docker exec -u www-data {container_name} php /var/www/html/occ config:system:set memcache.locking --value='\\OC\\Memcache\\Redis' 2>&1",
            f"docker exec -u www-data {container_name} php /var/www/html/occ config:system:set redis host --value={redis_host} 2>&1",
            f"docker exec -u www-data {container_name} php /var/www/html/occ config:system:set redis port --value={REDIS_PORT} --type=integer 2>&1",
            f"docker exec -u www-data {container_name} php /var/www/html/occ config:system:set redis password --value={REDIS_PASS} 2>&1",
        ]:
            _, out, _ = client.exec_command(cfg_cmd)
            out.channel.recv_exit_status()
        logger.info(f"NC {container_name}: Redis memcache configured (host={redis_host})")
        # Init-2: install required apps (already-installed apps do not fail the run).
        for app in ["contacts", "calendar", "mail", "onlyoffice"]:
            _, out, _ = client.exec_command(
                f"docker exec -u www-data {container_name} "
                f"php /var/www/html/occ app:install {app} 2>&1"
            )
            out.channel.recv_exit_status()
            text = out.read().decode().strip()
            logger.info(f"NC {container_name} app:install {app}: {text}")
        client.close()
        # Init-3: OIDC provider setup (reuse the existing helper).
        _configure_nc_oidc(container_name, kc_host, realm, client_secret, nc_domain)
        # Init-4: force SSO (disable local login) + Init-5: OnlyOffice integration.
        client2 = paramiko.SSHClient()
        client2.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        client2.connect(settings.DOCKER_SSH_HOST, username=settings.DOCKER_SSH_USER, timeout=15)
        _, out, _ = client2.exec_command(
            f"docker exec -u www-data {container_name} "
            f"php /var/www/html/occ config:app:set user_oidc allow_multiple_user_backends --value=0 2>&1"
        )
        out.channel.recv_exit_status()
        # Init-5: OnlyOffice integration settings.
        from app.core.config import settings as _cfg
        for oo_key, oo_val in [
            ("DocumentServerUrl", f"{oo_url}/"),
            ("DocumentServerInternalUrl", f"http://{oo_container}/"),
            ("StorageUrl", f"https://{nc_domain}/"),
            ("jwt_secret", _cfg.OO_JWT_SECRET),
        ]:
            _, out, _ = client2.exec_command(
                f"docker exec -u www-data {container_name} "
                f"php /var/www/html/occ config:app:set onlyoffice {oo_key} --value='{oo_val}' 2>&1"
            )
            out.channel.recv_exit_status()
        client2.close()
        logger.info(f"NC {container_name}: initialization complete ✓")
        return True
    except Exception as e:
        logger.error(f"NC initialization failed {container_name}: {e}")
        return False
def _nc_oidc_configured(container_name: str) -> bool:
    """
    Return True when the NC container already has an OIDC provider set up
    (user_oidc app installed and at least one provider with a clientId).
    """
    try:
        import paramiko

        ssh = paramiko.SSHClient()
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        ssh.connect(
            settings.DOCKER_SSH_HOST,
            username=settings.DOCKER_SSH_USER,
            timeout=15,
        )
        probe = (
            f"docker exec -u www-data {container_name} "
            f"php /var/www/html/occ user_oidc:providers 2>/dev/null | grep -q clientId && echo yes || echo no"
        )
        _, stdout, _ = ssh.exec_command(probe)
        answer = stdout.read().decode().strip()
        ssh.close()
        return answer == "yes"
    except Exception as e:
        logger.warning(f"NC OIDC check failed for {container_name}: {e}")
        return False
def _configure_nc_oidc(
    container_name: str, kc_host: str, realm: str, client_secret: str, nc_domain: str
) -> bool:
    """
    Configure NC's OIDC provider (name='drive', matching the Keycloak
    client_id 'drive') and set overwritehost so the OIDC callback URL is
    generated with the tenant domain.

    Returns:
        True when the occ commands were issued (their output is logged but
        not parsed for success), False on SSH/connection errors.
    """
    try:
        import paramiko
        client = paramiko.SSHClient()
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        client.connect(
            settings.DOCKER_SSH_HOST,
            username=settings.DOCKER_SSH_USER,
            timeout=15,
        )
        discovery = f"https://{kc_host}/realms/{realm}/.well-known/openid-configuration"
        # Set overwritehost (required for correct OIDC callbacks).
        for cfg_cmd in [
            f"docker exec -u www-data {container_name} php /var/www/html/occ config:system:set overwritehost --value={nc_domain}",
            f"docker exec -u www-data {container_name} php /var/www/html/occ config:system:set overwrite.cli.url --value=https://{nc_domain}",
        ]:
            _, out, _ = client.exec_command(cfg_cmd)
            out.channel.recv_exit_status()
        # Enable the user_oidc app.
        _, out2, _ = client.exec_command(
            f"docker exec -u www-data {container_name} php /var/www/html/occ app:enable user_oidc 2>&1"
        )
        out2.channel.recv_exit_status()
        # Create/update the OIDC provider (name=drive, matching Keycloak client_id).
        oidc_cmd = (
            f"docker exec -u www-data {container_name} php /var/www/html/occ user_oidc:provider drive "
            f"--clientid=drive "
            f"--clientsecret={client_secret} "
            f"--discoveryuri={discovery} "
            f"--mapping-uid=preferred_username "
            f"--mapping-display-name=name "
            f"--mapping-email=email "
            f"--unique-uid=0 "
            f"--check-bearer=0 --send-id-token-hint=1 2>&1"
        )
        _, stdout, _ = client.exec_command(oidc_cmd)
        stdout.channel.recv_exit_status()
        out_text = stdout.read().decode().strip()
        client.close()
        logger.info(f"NC OIDC configure output: {out_text}")
        return True
    except Exception as e:
        logger.error(f"NC OIDC configure failed for {container_name}: {e}")
        return False
# ─── PostgreSQL helpers ───────────────────────────────────────────────────────
def _ensure_nc_db(host: str, dbname: str, port: int = PG_PORT_TRIAL) -> bool:
    """
    Ensure the tenant's Nextcloud PostgreSQL database exists and that all
    users may create objects in its public schema.

    NC's Docker entrypoint creates its own oc_admin* role, so CREATE on the
    public schema must be open up front.

    Returns:
        True on success, False on any connection/SQL error (logged).
    """
    try:
        import psycopg2

        # Step 1: create the database if missing (via the maintenance DB).
        admin_conn = psycopg2.connect(
            host=host, port=port, dbname="postgres",
            user=PG_USER, password=PG_PASS,
            connect_timeout=5,
        )
        admin_conn.autocommit = True
        cursor = admin_conn.cursor()
        cursor.execute("SELECT 1 FROM pg_database WHERE datname = %s", (dbname,))
        if cursor.fetchone() is None:
            cursor.execute(f'CREATE DATABASE "{dbname}" OWNER {PG_USER}')
            logger.info(f"Created database {dbname}@{host}")
        admin_conn.close()

        # Step 2: open the tenant DB and widen public-schema privileges.
        tenant_conn = psycopg2.connect(
            host=host, port=port, dbname=dbname,
            user=PG_USER, password=PG_PASS,
            connect_timeout=5,
        )
        tenant_conn.autocommit = True
        tenant_conn.cursor().execute("GRANT ALL ON SCHEMA public TO PUBLIC")
        tenant_conn.close()
        return True
    except Exception as e:
        logger.error(f"_ensure_nc_db {dbname}@{host}: {e}")
        return False
def _get_pg_db_size_gb(host: str, dbname: str, port: int = PG_PORT_TRIAL) -> Optional[float]:
    """Return the PostgreSQL database size in GiB (3 decimals), or None on error."""
    try:
        import psycopg2

        conn = psycopg2.connect(
            host=host, port=port, dbname=dbname,
            user=PG_USER, password=PG_PASS,
            connect_timeout=5,
        )
        cursor = conn.cursor()
        cursor.execute("SELECT pg_database_size(%s)", (dbname,))
        (size_bytes,) = cursor.fetchone()
        cursor.close()
        conn.close()
        return round(size_bytes / (1024 ** 3), 3)
    except Exception as e:
        logger.warning(f"PG size check failed {dbname}@{host}: {e}")
        return None
# ─── Email notification ───────────────────────────────────────────────────────
def _get_admin_emails(db: Session) -> list[str]:
    """Return the distinct, non-empty emails of every active account that
    belongs to a manager tenant (the alert recipients)."""
    from app.models.account import Account

    query = (
        db.query(Account.email)
        .join(Tenant, Account.tenant_id == Tenant.id)
        .filter(
            Tenant.is_manager == True,
            Account.is_active == True,
            Account.email != None,
            Account.email != "",
        )
    )
    # Deduplicate via a set before returning a list.
    return list({row.email for row in query.all()})
def _send_failure_alert(
    tenant_code: str,
    tenant_name: str,
    domain: str,
    failed_items: list[str],
    admin_emails: list[str],
):
    """
    Send one consolidated alert email to all admins when any check item failed.

    Args:
        tenant_code/tenant_name/domain: identify the affected tenant.
        failed_items: human-readable names of the failed check items.
        admin_emails: recipients; when empty the alert is skipped (warned).

    Errors are logged, never raised — alerting must not break the scheduler.
    """
    if not admin_emails:
        logger.warning(f"No admin emails for failure alert on {tenant_code}")
        return
    try:
        # Fix: the original wrapped each item in a no-op f-string
        # (f"{item}"); a plain join yields the identical text.
        item_lines = "\n".join(failed_items)
        body = (
            f"【Virtual MIS 告警】租戶服務異常\n\n"
            f"租戶代碼 : {tenant_code}\n"
            f"租戶名稱 : {tenant_name}\n"
            f"網域 : {domain}\n"
            f"時間 : {now_tw().strftime('%Y-%m-%d %H:%M:%S')}\n\n"
            f"異常項目:\n{item_lines}\n\n"
            f"請盡速登入 Virtual MIS 後台確認並處理。"
        )
        msg = MIMEText(body, "plain", "utf-8")
        msg["Subject"] = f"[VirtualMIS] 服務異常告警 — {tenant_code} ({domain})"
        msg["From"] = f"vmis-alert@{settings.MAIL_MX_HOST}"
        msg["To"] = ", ".join(admin_emails)
        # Plain SMTP on port 25 to the internal MX; the context manager
        # closes the session even on failure.
        with smtplib.SMTP(settings.MAIL_MX_HOST, 25, timeout=10) as smtp:
            smtp.sendmail(msg["From"], admin_emails, msg.as_string())
        logger.info(f"Failure alert sent to {admin_emails} for {tenant_code}: {failed_items}")
    except Exception as e:
        logger.error(f"Failed to send failure alert for {tenant_code}: {e}")
# ─── Main check ──────────────────────────────────────────────────────────────
def run_tenant_check(schedule_log_id: int, db: Session):
    """
    Schedule 1 main loop: check (and auto-provision) every active tenant.

    Check items per tenant (3-state: None=unset, True=ok, False=failed):
      [1] Traefik route file        -> result.traefik_status
      [2] SSO realm + drive client  -> result.sso_result
      [3] Mailbox domain            -> result.mailbox_result
      [4] NC container/DB/OIDC      -> result.nc_result
      [5] OO container              -> result.office_result
      [6] Quota usage in GB         -> result.quota_usage

    Any False item is appended to fail_reasons and triggers one alert email
    to all manager-tenant admins. One TenantScheduleResult row is committed
    per tenant.
    """
    from app.services.mail_client import MailClient
    from app.services.docker_client import DockerClient
    tenants = db.query(Tenant).filter(Tenant.is_active == True).all()
    mail = MailClient()
    docker = DockerClient()
    admin_emails = _get_admin_emails(db)
    for tenant in tenants:
        realm = tenant.keycloak_realm or tenant.code
        # "active" tenants run on production hosts; any other status is trial.
        is_active = tenant.status == "active"
        nc_name = f"nc-{tenant.code}" if is_active else f"nc-{tenant.code}-test"
        oo_name = f"oo-{tenant.code}" if is_active else f"oo-{tenant.code}-test"
        kc_host = KC_HOST_ACTIVE if is_active else KC_HOST_TRIAL
        pg_host = PG_HOST_ACTIVE if is_active else PG_HOST_TRIAL
        pg_port = PG_PORT_ACTIVE if is_active else PG_PORT_TRIAL
        result = TenantScheduleResult(
            schedule_log_id=schedule_log_id,
            tenant_id=tenant.id,
            recorded_at=now_tw(),
        )
        fail_reasons = []
        # ── [1] Traefik route file ──────────────────────────────────────────
        # File provider: the scheduler writes {code}.yml into the dynamic dir.
        try:
            ok = _ensure_traefik_routes(tenant, is_active)
            result.traefik_status = ok
            if not ok:
                fail_reasons.append("traefik: route not loaded after write")
            result.traefik_done_at = now_tw()
        except Exception as e:
            result.traefik_status = False
            result.traefik_done_at = now_tw()
            fail_reasons.append(f"traefik: {e}")
        # ── [2] SSO (Keycloak realm + drive client) ─────────────────────────
        kc_drive_secret: Optional[str] = None
        try:
            sso_state = _check_kc_realm(kc_host, realm)
            if sso_state is None:
                # Realm missing -> create realm + drive client.
                try:
                    _create_kc_realm(realm, tenant.name)
                    kc_drive_secret = _ensure_kc_drive_client(realm, tenant.domain)
                    result.sso_result = True if kc_drive_secret else False
                    if not kc_drive_secret:
                        fail_reasons.append("sso: realm created but drive client failed")
                except Exception as ce:
                    result.sso_result = False
                    fail_reasons.append(f"sso create: {ce}")
            elif sso_state is True:
                # Realm exists -> ensure the drive client and token settings.
                kc_drive_secret = _ensure_kc_drive_client(realm, tenant.domain)
                result.sso_result = True
                if not kc_drive_secret:
                    fail_reasons.append("sso: drive client missing/failed")
                    result.sso_result = False
                else:
                    try:
                        from app.services.keycloak_client import get_keycloak_client
                        get_keycloak_client().update_realm_token_settings(realm, access_code_lifespan=600)
                    except Exception:
                        pass  # best-effort: token settings are non-critical here
            else:
                result.sso_result = False
                fail_reasons.append("sso: realm unreachable")
            result.sso_done_at = now_tw()
        except Exception as e:
            result.sso_result = False
            result.sso_done_at = now_tw()
            fail_reasons.append(f"sso: {e}")
        # ── [3] Mailbox domain ──────────────────────────────────────────────
        try:
            if mail.domain_exists(tenant.domain):
                result.mailbox_result = True
            else:
                # Domain not configured -> create it.
                ok = mail.create_domain(tenant.domain)
                result.mailbox_result = True if ok else False
                if not ok:
                    fail_reasons.append("mailbox: create domain failed")
            result.mailbox_done_at = now_tw()
        except Exception as e:
            result.mailbox_result = False
            result.mailbox_done_at = now_tw()
            fail_reasons.append(f"mailbox: {e}")
        # ── [4] NC container + DB validation + OIDC setup ───────────────────
        pg_db = f"nc_{tenant.code}_db"
        try:
            nc_state = docker.check_container_ssh(nc_name)
            if nc_state is None:
                # Container missing -> ensure compose file + DB, then deploy.
                logger.info(f"NC {nc_name}: not found, ensuring compose/DB and deploying")
                _ensure_tenant_compose(tenant, is_active)
                _ensure_nc_db(pg_host, pg_db, pg_port)
                ok = docker.ssh_compose_up(tenant.code)
                result.nc_result = True if ok else False
                if not ok:
                    fail_reasons.append("nc: deploy failed")
                else:
                    if not _nc_db_check(nc_name, pg_host, pg_db, tenant.domain, pg_port):
                        result.nc_result = False
                        fail_reasons.append("nc: installed but not using pgsql")
            elif nc_state is False:
                # Container exists but stopped -> restart.
                logger.info(f"NC {nc_name}: stopped, restarting")
                ok = docker.ssh_compose_up(tenant.code)
                result.nc_result = True if ok else False
                if not ok:
                    fail_reasons.append("nc: start failed")
            else:
                # Container running -> verify DB type, then init or sync OIDC.
                db_ok = _nc_db_check(nc_name, pg_host, pg_db, tenant.domain, pg_port)
                if not db_ok:
                    result.nc_result = False
                    fail_reasons.append("nc: DB check failed (possible sqlite3 issue)")
                else:
                    result.nc_result = True
                    if kc_drive_secret:
                        if not _nc_initialized(nc_name):
                            oo_url = (f"https://office-{tenant.code}.ease.taipei" if is_active
                                      else f"https://office-{tenant.code}.lab.taipei")
                            ok = _nc_initialize(nc_name, kc_host, realm, kc_drive_secret, tenant.domain,
                                                oo_name, oo_url, is_active)
                            if not ok:
                                fail_reasons.append("nc: initialization failed")
                        else:
                            ok = _configure_nc_oidc(nc_name, kc_host, realm, kc_drive_secret, tenant.domain)
                            if not ok:
                                fail_reasons.append("nc: OIDC sync failed")
            result.nc_done_at = now_tw()
        except Exception as e:
            result.nc_result = False
            result.nc_done_at = now_tw()
            fail_reasons.append(f"nc: {e}")
        # ── [5] OO container ────────────────────────────────────────────────
        try:
            oo_state = docker.check_container_ssh(oo_name)
            if oo_state is None:
                ok = docker.ssh_compose_up(tenant.code)
                result.office_result = True if ok else False
                if not ok:
                    fail_reasons.append("oo: deploy failed")
            elif oo_state is False:
                ok = docker.ssh_compose_up(tenant.code)
                result.office_result = True if ok else False
                if not ok:
                    fail_reasons.append("oo: start failed")
            else:
                result.office_result = True
            result.office_done_at = now_tw()
        except Exception as e:
            result.office_result = False
            result.office_done_at = now_tw()
            fail_reasons.append(f"oo: {e}")
        # ── [6] Quota (OO disk + PG DB size) ────────────────────────────────
        try:
            oo_gb = docker.get_oo_disk_usage_gb(oo_name) or 0.0
            pg_gb = _get_pg_db_size_gb(pg_host, pg_db, pg_port) or 0.0
            result.quota_usage = round(oo_gb + pg_gb, 3)
        except Exception as e:
            logger.warning(f"Quota check failed for {tenant.code}: {e}")
        if fail_reasons:
            result.fail_reason = "; ".join(fail_reasons)
        # Any False item -> one consolidated alert email to all admins.
        failed_items = []
        if result.traefik_status is False:
            failed_items.append("Traefik 路由")
        if result.sso_result is False:
            failed_items.append("SSO (Keycloak Realm)")
        if result.mailbox_result is False:
            failed_items.append("Mailbox Domain")
        if result.nc_result is False:
            failed_items.append("Nextcloud 容器")
        if result.office_result is False:
            failed_items.append("OnlyOffice 容器")
        if failed_items:
            _send_failure_alert(
                tenant.code, tenant.name, tenant.domain, failed_items, admin_emails
            )
        db.add(result)
        db.commit()
    logger.info(f"Tenant check done: {len(tenants)} tenants processed")