feat(backend): Phase 1-4 全新開發完成，37/37 TDD 通過

[Phase 0 Reset] - 清除舊版 app/、alembic/versions/、雜亂測試腳本 - 新 requirements.txt (移除 caldav/redis/keycloak-lib，加入 apscheduler/croniter/docker/paramiko/ping3/dnspython) [Phase 1 資料庫] - 9 張資料表 SQLAlchemy Models：tenants / accounts / schedules / schedule_logs / tenant_schedule_results / account_schedule_results / servers / server_status_logs / system_status_logs - Alembic migration 001_create_all_tables (已套用到 10.1.0.20:5433/virtual_mis) - seed.py：schedules 初始 3 筆 / servers 初始 4 筆 [Phase 2 CRUD API] - GET/POST/PUT/DELETE: /api/v1/tenants / accounts / servers / schedules - /api/v1/system-status - 帳號編碼自動產生 (prefix + seq_no 4碼左補0) - 燈號 (lights) 從最新排程結果取得 [Phase 3 Watchdog] - APScheduler interval 3分鐘，原子 UPDATE status=Going 防重複執行 - 手動觸發 API: POST /api/v1/schedules/{id}/run [Phase 4 Service Clients] - KeycloakClient：vmis-admin realm，REST API (不用 python-keycloak) - MailClient：Docker Mailserver @ 10.1.0.254:8080，含 MX DNS 驗證 - DockerClient：docker-py 本機 + paramiko SSH 遠端 compose - NextcloudClient：OCS API user/quota - SystemChecker：功能驗證 (traefik routers>0 / keycloak token / SMTP EHLO / DB SELECT 1 / ping) [TDD] - 37 tests / 37 passed (2.11s) - SQLite in-memory + StaticPool，無需外部 DB Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-14 13:10:15 +08:00
parent 22611f7f73
commit 42d1420f9c
52 changed files with 2934 additions and 0 deletions
--- a/backend/app/services/scheduler/init.py
+++ b/backend/app/services/scheduler/init.py
--- a/backend/app/services/scheduler/runner.py
+++ b/backend/app/services/scheduler/runner.py
@@ -0,0 +1,59 @@
+"""
+Schedule dispatcher: routes schedule_id to the correct run function.
+Also used by manual trigger API.
+"""
+import logging
+from sqlalchemy.orm import Session
+
+from app.core.database import SessionLocal
+
+logger = logging.getLogger(__name__)
+
+
+def dispatch_schedule(schedule_id: int, log_id: int = None, db: Session = None):
+    """
+    Run the check function registered for ``schedule_id``.
+
+    Known ids: 1 = tenant check, 2 = account check, 3 = system status.
+    Any other id only logs a warning.
+
+    Args:
+        schedule_id: Primary key of the ``Schedule`` row to run.
+        log_id: Id of an already created ``ScheduleLog`` row (watchdog path).
+            When ``None`` (manual trigger path) a log row is created here
+            with status ``"running"`` and closed with ``"ok"`` / ``"error"``
+            when the run ends, because this function returns nothing the
+            caller could use to close it.
+        db: Session to work on. When ``None`` a private ``SessionLocal()``
+            session is opened and always closed before returning.
+
+    Returns:
+        None. Returns without doing anything when ``log_id`` is ``None`` and
+        no ``Schedule`` row with ``schedule_id`` exists.
+
+    Raises:
+        Exception: whatever the schedule function raised, after logging it.
+    """
+    from datetime import datetime
+
+    own_db = db is None
+    if own_db:
+        db = SessionLocal()
+
+    # Everything below runs inside try/finally so that a private session is
+    # released even when creating the log row itself fails.
+    try:
+        own_log = None
+        if log_id is None:
+            from app.models.schedule import ScheduleLog, Schedule
+            schedule = db.get(Schedule, schedule_id)
+            if not schedule:
+                return
+            own_log = ScheduleLog(
+                schedule_id=schedule_id,
+                schedule_name=schedule.name,
+                started_at=datetime.utcnow(),
+                status="running",
+            )
+            db.add(own_log)
+            db.commit()
+            db.refresh(own_log)
+            log_id = own_log.id
+
+        # Pessimistic default: only a run that reaches the end counts as "ok".
+        final_status = "error"
+        try:
+            if schedule_id == 1:
+                from app.services.scheduler.schedule_tenant import run_tenant_check
+                run_tenant_check(log_id, db)
+            elif schedule_id == 2:
+                from app.services.scheduler.schedule_account import run_account_check
+                run_account_check(log_id, db)
+            elif schedule_id == 3:
+                from app.services.scheduler.schedule_system import run_system_status
+                run_system_status(log_id, db)
+            else:
+                logger.warning(f"Unknown schedule_id: {schedule_id}")
+            final_status = "ok"
+        except Exception as e:
+            logger.exception(f"dispatch_schedule({schedule_id}) error: {e}")
+            raise
+        finally:
+            # Close only the log row created above; a log passed in by the
+            # caller is the caller's responsibility.
+            if own_log is not None:
+                try:
+                    if final_status == "error":
+                        # The session may be unusable after a failed flush.
+                        db.rollback()
+                    own_log.ended_at = datetime.utcnow()
+                    own_log.status = final_status
+                    db.commit()
+                except Exception:
+                    # Never let bookkeeping mask the original error.
+                    logger.exception(
+                        f"dispatch_schedule({schedule_id}) could not finalize log {log_id}"
+                    )
+    finally:
+        if own_db:
+            db.close()
--- a/backend/app/services/scheduler/schedule_account.py
+++ b/backend/app/services/scheduler/schedule_account.py
@@ -0,0 +1,103 @@
+"""
+Schedule 2 — 帳號檢查（每 3 分鐘）
+檢查每個 active 帳號的: SSO使用者 / Mailbox / NC使用者 / Quota
+"""
+import logging
+from datetime import datetime
+from sqlalchemy.orm import Session
+
+from app.models.account import Account
+from app.models.result import AccountScheduleResult
+
+logger = logging.getLogger(__name__)
+
+
+def run_account_check(schedule_log_id: int, db: Session):
+    """
+    Schedule 2: check every active account and create what is missing.
+
+    For each active account one ``AccountScheduleResult`` row is added:
+      [1] SSO user   - looked up in Keycloak, created when not found.
+      [2] Mailbox    - looked up on the mail server, created when not found.
+      [3] NC user    - looked up in Nextcloud, created when not found.
+      [4] Quota      - used quota (GB) read from Nextcloud, best effort.
+    A failing step marks its own result ``False`` and adds a
+    ``"<step>: <error>"`` entry to ``fail_reason``; the remaining steps
+    still run. All rows are committed once at the end.
+
+    Args:
+        schedule_log_id: Id of the ``ScheduleLog`` row the results belong to.
+        db: Open SQLAlchemy session; committed by this function.
+    """
+    from app.services.keycloak_client import KeycloakClient
+    from app.services.mail_client import MailClient
+    from app.services.nextcloud_client import NextcloudClient
+
+    # "== True" is intentional: it builds the SQL comparison on the column.
+    accounts = (
+        db.query(Account)
+        .filter(Account.is_active == True)
+        .all()
+    )
+    kc = KeycloakClient()
+    mail = MailClient()
+
+    for account in accounts:
+        tenant = account.tenant
+        # The realm falls back to the tenant code when none is configured.
+        realm = tenant.keycloak_realm or tenant.code
+        result = AccountScheduleResult(
+            schedule_log_id=schedule_log_id,
+            account_id=account.id,
+            sso_account=account.sso_account,
+            recorded_at=datetime.utcnow(),
+        )
+        fail_reasons = []
+
+        # [1] SSO user check (create the user when it does not exist)
+        try:
+            sso_uuid = kc.get_user_uuid(realm, account.sso_account)
+            if sso_uuid:
+                result.sso_result = True
+            else:
+                sso_uuid = kc.create_user(realm, account.sso_account, account.email, account.default_password)
+                result.sso_result = sso_uuid is not None
+            result.sso_uuid = sso_uuid
+            # Remember the UUID on the account the first time it is known.
+            if sso_uuid and not account.sso_uuid:
+                account.sso_uuid = sso_uuid
+            result.sso_done_at = datetime.utcnow()
+        except Exception as e:
+            result.sso_result = False
+            result.sso_done_at = datetime.utcnow()
+            fail_reasons.append(f"sso: {e}")
+
+        # [2] Mailbox check (create the mailbox when it does not exist)
+        try:
+            email = account.email or f"{account.sso_account}@{tenant.domain}"
+            if mail.mailbox_exists(email):
+                result.mailbox_result = True
+            else:
+                result.mailbox_result = mail.create_mailbox(email, account.default_password, account.quota_limit)
+            result.mailbox_done_at = datetime.utcnow()
+        except Exception as e:
+            result.mailbox_result = False
+            result.mailbox_done_at = datetime.utcnow()
+            fail_reasons.append(f"mailbox: {e}")
+
+        # [3] NC user check (create the user when it does not exist)
+        nc = None
+        try:
+            nc = NextcloudClient(tenant.domain)
+            if nc.user_exists(account.sso_account):
+                result.nc_result = True
+            else:
+                result.nc_result = nc.create_user(account.sso_account, account.default_password, account.quota_limit)
+            result.nc_done_at = datetime.utcnow()
+        except Exception as e:
+            result.nc_result = False
+            result.nc_done_at = datetime.utcnow()
+            fail_reasons.append(f"nc: {e}")
+
+        # [4] Quota: best effort, a failure is only logged and never
+        # recorded in fail_reason.
+        try:
+            # Reuse the client from step [3]; build one only if that step
+            # failed before the client existed.
+            # NOTE(review): assumes a NextcloudClient can serve several calls - confirm.
+            if nc is None:
+                nc = NextcloudClient(tenant.domain)
+            result.quota_usage = nc.get_user_quota_used_gb(account.sso_account)
+        except Exception as e:
+            logger.warning(f"Quota check failed for {account.account_code}: {e}")
+
+        if fail_reasons:
+            result.fail_reason = "; ".join(fail_reasons)
+
+        db.add(result)
+
+    # commit() already flushes; no separate flush is needed afterwards.
+    db.commit()
+    logger.info(f"Account check done: {len(accounts)} accounts processed")
--- a/backend/app/services/scheduler/schedule_system.py
+++ b/backend/app/services/scheduler/schedule_system.py
@@ -0,0 +1,94 @@
+"""
+Schedule 3 — 系統狀態（每日 08:00）
+Part A: 基礎設施服務功能驗證（traefik/keycloak/mail/db）
+Part B: 伺服器 ping 檢查
+"""
+import logging
+from datetime import datetime
+from sqlalchemy.orm import Session
+
+from app.models.server import SystemStatusLog, ServerStatusLog, Server
+
+logger = logging.getLogger(__name__)
+
+# The fixed set of 8 monitored services: 2 environments (test, prod) x
+# 4 service kinds (traefik, keycloak, mail, db). Besides the three common
+# keys, each entry carries the connection keys its own check reads:
+# host/port for traefik and mail, url/realm for keycloak, db_host/db_port
+# for db.
+SERVICES = [
+    # --- test environment ---
+    dict(
+        environment="test",
+        service_name="traefik",
+        service_desc="測試環境反向代理",
+        host="localhost",
+        port=8080,
+    ),
+    dict(
+        environment="test",
+        service_name="keycloak",
+        service_desc="測試環境 SSO",
+        url="https://auth.lab.taipei",
+        realm="master",
+    ),
+    dict(
+        environment="test",
+        service_name="mail",
+        service_desc="測試環境 Mail Server",
+        host="localhost",
+        port=587,
+    ),
+    dict(
+        environment="test",
+        service_name="db",
+        service_desc="10.1.0.20:5433 PostgreSQL",
+        db_host="10.1.0.20",
+        db_port=5433,
+    ),
+    # --- prod environment ---
+    dict(
+        environment="prod",
+        service_name="traefik",
+        service_desc="正式環境反向代理",
+        host="localhost",
+        port=8080,
+    ),
+    dict(
+        environment="prod",
+        service_name="keycloak",
+        service_desc="正式環境 SSO",
+        url="https://auth.ease.taipei",
+        realm="master",
+    ),
+    dict(
+        environment="prod",
+        service_name="mail",
+        service_desc="正式環境 Mail Server",
+        host="10.1.0.254",
+        port=587,
+    ),
+    dict(
+        environment="prod",
+        service_name="db",
+        service_desc="10.1.0.254:5432 PostgreSQL",
+        db_host="10.1.0.254",
+        db_port=5432,
+    ),
+]
+
+
+def run_system_status(schedule_log_id: int, db: Session):
+    """
+    Schedule 3: record the state of the infrastructure services and servers.
+
+    Part A adds one ``SystemStatusLog`` row per entry of ``SERVICES``;
+    Part B adds one ``ServerStatusLog`` row per active server, in
+    ``sort_order``. A check that raises is stored as a failure with the
+    exception text as ``fail_reason``. Everything is committed once at the
+    end.
+
+    Args:
+        schedule_log_id: Id of the ``ScheduleLog`` row the results belong to.
+        db: Open SQLAlchemy session; committed by this function.
+    """
+    from app.services.system_checker import SystemChecker
+    checker = SystemChecker()
+
+    # service_name -> probe taking the service entry and returning the result
+    probes = {
+        "traefik": lambda entry: checker.check_traefik(entry["host"], entry["port"]),
+        "keycloak": lambda entry: checker.check_keycloak(entry["url"], entry["realm"]),
+        "mail": lambda entry: checker.check_smtp(entry["host"], entry["port"]),
+        "db": lambda entry: checker.check_postgres(entry["db_host"], entry["db_port"]),
+    }
+
+    # Part A: infrastructure services
+    for svc in SERVICES:
+        ok, reason = False, None
+        try:
+            probe = probes.get(svc["service_name"])
+            # A service name without a probe is stored as a plain failure.
+            if probe is not None:
+                ok = probe(svc)
+        except Exception as exc:
+            ok, reason = False, str(exc)
+
+        status_row = SystemStatusLog(
+            schedule_log_id=schedule_log_id,
+            environment=svc["environment"],
+            service_name=svc["service_name"],
+            service_desc=svc["service_desc"],
+            result=ok,
+            fail_reason=reason,
+            recorded_at=datetime.utcnow(),
+        )
+        db.add(status_row)
+
+    # Part B: server ping
+    active_servers = (
+        db.query(Server)
+        .filter(Server.is_active == True)
+        .order_by(Server.sort_order)
+        .all()
+    )
+    for srv in active_servers:
+        latency, reason = None, None
+        try:
+            latency = checker.ping_server(srv.ip_address)
+        except Exception as exc:
+            reachable, reason = False, str(exc)
+        else:
+            # ping_server returning None is stored as "No response".
+            reachable = latency is not None
+            if not reachable:
+                reason = "No response"
+
+        ping_row = ServerStatusLog(
+            schedule_log_id=schedule_log_id,
+            server_id=srv.id,
+            result=reachable,
+            response_time=latency,
+            fail_reason=reason,
+            recorded_at=datetime.utcnow(),
+        )
+        db.add(ping_row)
+
+    db.commit()
+    logger.info(f"System status check done: {len(SERVICES)} services + {len(active_servers)} servers")
--- a/backend/app/services/scheduler/schedule_tenant.py
+++ b/backend/app/services/scheduler/schedule_tenant.py
@@ -0,0 +1,110 @@
+"""
+Schedule 1 — 租戶檢查（每 3 分鐘）
+檢查每個 active 租戶的: Traefik路由 / SSO Realm / Mailbox Domain / NC容器 / OO容器 / Quota
+"""
+import logging
+from datetime import datetime
+from sqlalchemy.orm import Session
+
+from app.models.tenant import Tenant
+from app.models.result import TenantScheduleResult
+
+logger = logging.getLogger(__name__)
+
+
+def run_tenant_check(schedule_log_id: int, db: Session):
+    """
+    Schedule 1: check every active tenant and create what is missing.
+
+    For each active tenant one ``TenantScheduleResult`` row is added:
+      [1] Traefik  - route check for the tenant domain.
+      [2] SSO      - Keycloak realm, created when it does not exist.
+      [3] Mailbox  - mail domain, created when it does not exist. For
+                     tenants whose status is "active" the MX record is
+                     checked first; when it is missing the row is stored
+                     as a mailbox failure and steps [4]-[6] are skipped.
+      [4] NC       - ``nc-<realm>`` container via ensure_container_running.
+      [5] Office   - ``oo-<realm>`` container via ensure_container_running.
+      [6] Quota    - total used quota (GB) from Nextcloud, best effort.
+    A failing step marks its own result ``False`` and adds a
+    ``"<step>: <error>"`` entry to ``fail_reason``; later steps still run
+    (except after a missing MX record). All rows are committed once at the
+    end.
+
+    Args:
+        schedule_log_id: Id of the ``ScheduleLog`` row the results belong to.
+        db: Open SQLAlchemy session; committed by this function.
+    """
+    from app.services.keycloak_client import KeycloakClient
+    from app.services.mail_client import MailClient
+    from app.services.docker_client import DockerClient
+    from app.services.nextcloud_client import NextcloudClient
+
+    # "== True" is intentional: it builds the SQL comparison on the column.
+    tenants = db.query(Tenant).filter(Tenant.is_active == True).all()
+    kc = KeycloakClient()
+    mail = MailClient()
+    docker = DockerClient()
+
+    for tenant in tenants:
+        # The realm falls back to the tenant code when none is configured.
+        realm = tenant.keycloak_realm or tenant.code
+        result = TenantScheduleResult(
+            schedule_log_id=schedule_log_id,
+            tenant_id=tenant.id,
+            recorded_at=datetime.utcnow(),
+        )
+        fail_reasons = []
+
+        # [1] Traefik
+        try:
+            result.traefik_status = docker.check_traefik_route(tenant.domain)
+            result.traefik_done_at = datetime.utcnow()
+        except Exception as e:
+            result.traefik_status = False
+            result.traefik_done_at = datetime.utcnow()
+            fail_reasons.append(f"traefik: {e}")
+
+        # [2] SSO
+        try:
+            exists = kc.realm_exists(realm)
+            if not exists:
+                kc.create_realm(realm, tenant.name)
+            result.sso_result = True
+            result.sso_done_at = datetime.utcnow()
+        except Exception as e:
+            result.sso_result = False
+            result.sso_done_at = datetime.utcnow()
+            fail_reasons.append(f"sso: {e}")
+
+        # [3] Mailbox Domain (with DNS check for active tenants)
+        try:
+            if tenant.status == "active":
+                dns_ok = mail.check_mx_dns(tenant.domain)
+                if not dns_ok:
+                    result.mailbox_result = False
+                    result.mailbox_done_at = datetime.utcnow()
+                    fail_reasons.append("mailbox: MX record not configured")
+                    # Store every reason collected so far before leaving
+                    # early; otherwise the row is saved without fail_reason.
+                    result.fail_reason = "; ".join(fail_reasons)
+                    # The commit after the loop persists this row as well.
+                    db.add(result)
+                    continue
+            domain_exists = mail.domain_exists(tenant.domain)
+            if not domain_exists:
+                mail.create_domain(tenant.domain)
+            result.mailbox_result = True
+            result.mailbox_done_at = datetime.utcnow()
+        except Exception as e:
+            result.mailbox_result = False
+            result.mailbox_done_at = datetime.utcnow()
+            fail_reasons.append(f"mailbox: {e}")
+
+        # [4] Nextcloud container
+        try:
+            nc_name = f"nc-{realm}"
+            result.nc_result = docker.ensure_container_running(nc_name, tenant.code, realm)
+            result.nc_done_at = datetime.utcnow()
+        except Exception as e:
+            result.nc_result = False
+            result.nc_done_at = datetime.utcnow()
+            fail_reasons.append(f"nc: {e}")
+
+        # [5] OnlyOffice container
+        try:
+            oo_name = f"oo-{realm}"
+            result.office_result = docker.ensure_container_running(oo_name, tenant.code, realm)
+            result.office_done_at = datetime.utcnow()
+        except Exception as e:
+            result.office_result = False
+            result.office_done_at = datetime.utcnow()
+            fail_reasons.append(f"office: {e}")
+
+        # [6] Quota: best effort, a failure is only logged and never
+        # recorded in fail_reason.
+        try:
+            nc = NextcloudClient(tenant.domain)
+            result.quota_usage = nc.get_total_quota_used_gb()
+        except Exception as e:
+            logger.warning(f"Quota check failed for {tenant.code}: {e}")
+
+        if fail_reasons:
+            result.fail_reason = "; ".join(fail_reasons)
+
+        db.add(result)
+
+    db.commit()
+    logger.info(f"Tenant check done: {len(tenants)} tenants processed")
--- a/backend/app/services/scheduler/watchdog.py
+++ b/backend/app/services/scheduler/watchdog.py
@@ -0,0 +1,107 @@
+"""
+Watchdog: APScheduler BackgroundScheduler，每 3 分鐘掃描 schedules 表。
+防重複執行：原子 UPDATE status='Going'，影響 0 筆則跳過。
+"""
+import logging
+from datetime import datetime
+from apscheduler.schedulers.background import BackgroundScheduler
+from croniter import croniter
+from sqlalchemy import update
+from sqlalchemy.orm import Session
+
+from app.core.database import SessionLocal
+from app.models.schedule import Schedule, ScheduleLog
+
+logger = logging.getLogger(__name__)
+
+_scheduler = BackgroundScheduler(timezone="Asia/Taipei")
+
+
+def _watchdog_tick():
+    """
+    One watchdog pass: run every schedule that is currently due.
+
+    A schedule is due when its status is "Waiting" and ``next_run_at`` is
+    not later than the current naive UTC time. Each due schedule is claimed
+    with a conditional UPDATE (Waiting -> Going); when that UPDATE touches
+    no row the schedule is skipped. For a claimed schedule a ``ScheduleLog``
+    row is written, the schedule is dispatched, the log is closed with
+    "ok" or "error", and the schedule is put back to "Waiting" with a new
+    ``next_run_at`` derived from its cron expression.
+
+    Errors never leave this function: they are logged and the session is
+    rolled back and closed.
+
+    NOTE(review): next_run_at is computed from naive UTC while the
+    scheduler object is configured for Asia/Taipei - confirm that the cron
+    expressions are meant to be read as UTC.
+    """
+    db: Session = SessionLocal()
+    try:
+        due = (
+            db.query(Schedule)
+            .filter(
+                Schedule.status == "Waiting",
+                Schedule.next_run_at <= datetime.utcnow(),
+            )
+            .all()
+        )
+        for schedule in due:
+            # Copy plain values first: every commit/rollback below expires
+            # the ORM instance, and reading it back inside an error handler
+            # would need a working session.
+            schedule_id = schedule.id
+            schedule_name = schedule.name
+
+            # Atomic lock: only one process wins
+            affected = db.execute(
+                update(Schedule)
+                .where(Schedule.id == schedule_id, Schedule.status == "Waiting")
+                .values(status="Going")
+            ).rowcount
+            db.commit()
+
+            if affected == 0:
+                # Another process already grabbed it
+                continue
+
+            log = ScheduleLog(
+                schedule_id=schedule_id,
+                schedule_name=schedule_name,
+                started_at=datetime.utcnow(),
+                status="running",
+            )
+            db.add(log)
+            db.commit()
+            db.refresh(log)
+            log_id = log.id
+
+            try:
+                from app.services.scheduler.runner import dispatch_schedule
+                dispatch_schedule(schedule_id, log_id, db)
+                final_status = "ok"
+            except Exception as e:
+                logger.exception(f"Schedule {schedule_name} failed: {e}")
+                final_status = "error"
+                # A failed flush leaves the session unusable until it is
+                # rolled back. Without this the updates below would raise
+                # and the schedule would stay in "Going" forever. Rows the
+                # failed run had added but not committed are discarded.
+                db.rollback()
+
+            # Update log
+            log.ended_at = datetime.utcnow()
+            log.status = final_status
+
+            # Recalculate next_run_at
+            try:
+                cron = croniter(schedule.cron_timer, datetime.utcnow())
+                next_run = cron.get_next(datetime)
+            except Exception:
+                # With next_run_at = NULL the "due" filter above never
+                # matches again, so make the reason visible in the log.
+                logger.exception(
+                    f"Schedule {schedule_name}: cannot compute next_run_at, "
+                    "it will not run again until next_run_at is set"
+                )
+                next_run = None
+
+            # Reset schedule
+            db.execute(
+                update(Schedule)
+                .where(Schedule.id == schedule_id)
+                .values(
+                    status="Waiting",
+                    last_run_at=datetime.utcnow(),
+                    next_run_at=next_run,
+                    last_status=final_status,
+                )
+            )
+            db.commit()
+
+    except Exception as e:
+        logger.exception(f"Watchdog tick error: {e}")
+        db.rollback()
+    finally:
+        db.close()
+
+
+def start_watchdog():
+    """
+    Register the watchdog job (every 3 minutes) and start the scheduler.
+
+    Safe to call more than once: the job is registered with
+    ``replace_existing=True`` and the scheduler is only started when it is
+    not already running.
+    """
+    _scheduler.add_job(
+        _watchdog_tick,
+        trigger="interval",
+        minutes=3,
+        id="watchdog",
+        replace_existing=True,
+    )
+    # start() raises on a scheduler that is already running.
+    if _scheduler.running:
+        logger.info("Watchdog scheduler already running")
+        return
+    _scheduler.start()
+    logger.info("Watchdog scheduler started")
+
+
+def stop_watchdog():
+    """
+    Stop the scheduler without waiting for a running job to finish.
+
+    Does nothing when the scheduler is not running, so it can be called
+    from a shutdown hook even if the watchdog was never started.
+    """
+    # shutdown() raises on a scheduler that is not running.
+    if not _scheduler.running:
+        return
+    _scheduler.shutdown(wait=False)
+    logger.info("Watchdog scheduler stopped")