feat(backend): Phase 1-4 全新開發完成,37/37 TDD 通過

[Phase 0 Reset]
- 清除舊版 app/、alembic/versions/、雜亂測試腳本
- 新 requirements.txt (移除 caldav/redis/keycloak-lib,加入 apscheduler/croniter/docker/paramiko/ping3/dnspython)

[Phase 1 資料庫]
- 9 張資料表 SQLAlchemy Models:tenants / accounts / schedules / schedule_logs /
  tenant_schedule_results / account_schedule_results / servers / server_status_logs / system_status_logs
- Alembic migration 001_create_all_tables (已套用到 10.1.0.20:5433/virtual_mis)
- seed.py:schedules 初始 3 筆 / servers 初始 4 筆

[Phase 2 CRUD API]
- GET/POST/PUT/DELETE: /api/v1/tenants / accounts / servers / schedules
- /api/v1/system-status
- 帳號編碼自動產生 (prefix + seq_no 4碼左補0)
- 燈號 (lights) 從最新排程結果取得

[Phase 3 Watchdog]
- APScheduler interval 3分鐘,原子 UPDATE status=Going 防重複執行
- 手動觸發 API: POST /api/v1/schedules/{id}/run

[Phase 4 Service Clients]
- KeycloakClient:vmis-admin realm,REST API (不用 python-keycloak)
- MailClient:Docker Mailserver @ 10.1.0.254:8080,含 MX DNS 驗證
- DockerClient:docker-py 本機 + paramiko SSH 遠端 compose
- NextcloudClient:OCS API user/quota
- SystemChecker:功能驗證 (traefik routers>0 / keycloak token / SMTP EHLO / DB SELECT 1 / ping)

[TDD]
- 37 tests / 37 passed (2.11s)
- SQLite in-memory + StaticPool,無需外部 DB

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
VMIS Developer
2026-03-14 13:10:15 +08:00
parent 22611f7f73
commit 42d1420f9c
52 changed files with 2934 additions and 0 deletions

View File

@@ -0,0 +1,59 @@
"""
Schedule dispatcher: routes schedule_id to the correct run function.
Also used by manual trigger API.
"""
import logging
from sqlalchemy.orm import Session
from app.core.database import SessionLocal
logger = logging.getLogger(__name__)
def dispatch_schedule(schedule_id: int, log_id: int = None, db: Session = None):
    """
    Run the check function that belongs to ``schedule_id``.

    Routing: 1 -> run_tenant_check, 2 -> run_account_check,
    3 -> run_system_status. Any other id only logs a warning.

    Args:
        schedule_id: Primary key of the schedule to run.
        log_id: Id of an existing ScheduleLog row. When omitted, a new log row
            (status "running") is created here and is also finalized here
            with ``ended_at`` and status "ok"/"error" once the run ends.
        db: Session to use. When omitted, a private session is opened and is
            always closed before returning.

    Returns:
        None. If ``log_id`` is omitted and the schedule does not exist, the
        call is a no-op (a warning is logged).

    Raises:
        Exception: whatever the selected run function raises is logged and
            re-raised unchanged.
    """
    from datetime import datetime

    own_db = db is None
    if own_db:
        db = SessionLocal()

    own_log = None  # set only when this call created the log row itself
    try:
        if log_id is None:
            from app.models.schedule import ScheduleLog, Schedule

            schedule = db.get(Schedule, schedule_id)
            if not schedule:
                logger.warning(f"Schedule not found: {schedule_id}")
                return
            own_log = ScheduleLog(
                schedule_id=schedule_id,
                schedule_name=schedule.name,
                started_at=datetime.utcnow(),
                status="running",
            )
            db.add(own_log)
            db.commit()
            db.refresh(own_log)
            log_id = own_log.id

        final_status = "ok"
        try:
            if schedule_id == 1:
                from app.services.scheduler.schedule_tenant import run_tenant_check
                run_tenant_check(log_id, db)
            elif schedule_id == 2:
                from app.services.scheduler.schedule_account import run_account_check
                run_account_check(log_id, db)
            elif schedule_id == 3:
                from app.services.scheduler.schedule_system import run_system_status
                run_system_status(log_id, db)
            else:
                logger.warning(f"Unknown schedule_id: {schedule_id}")
        except Exception as e:
            final_status = "error"
            logger.exception(f"dispatch_schedule({schedule_id}) error: {e}")
            raise
        finally:
            # A caller that passed log_id finalizes its own log; only a log
            # created above would otherwise stay in "running" forever.
            if own_log is not None:
                try:
                    if final_status == "error":
                        # Clear any failed transaction state before writing.
                        db.rollback()
                    own_log.ended_at = datetime.utcnow()
                    own_log.status = final_status
                    db.commit()
                except Exception:
                    # Never mask the run function's own exception.
                    logger.exception(
                        f"dispatch_schedule({schedule_id}) could not finalize log {log_id}"
                    )
    finally:
        # Covers every exit path, including failures while creating the log.
        if own_db:
            db.close()

View File

@@ -0,0 +1,103 @@
"""
Schedule 2 — 帳號檢查(每 3 分鐘)
檢查每個 active 帳號的: SSO使用者 / Mailbox / NC使用者 / Quota
"""
import logging
from datetime import datetime
from sqlalchemy.orm import Session
from app.models.account import Account
from app.models.result import AccountScheduleResult
logger = logging.getLogger(__name__)
def run_account_check(schedule_log_id: int, db: Session):
    """
    Check every active account and create whatever is missing.

    For each account with ``is_active`` set, one AccountScheduleResult row is
    written covering four steps:

    1. SSO user   - looked up in Keycloak; created when not found. A returned
       uuid is copied onto the account if the account has none yet.
    2. Mailbox    - created when it does not exist.
    3. NC user    - Nextcloud user, created when it does not exist.
    4. Quota      - Nextcloud usage for the user; a failure here is only
       logged and is not added to ``fail_reason``.

    A failing step never stops the remaining steps; failures of steps 1-3 are
    joined with "; " into ``result.fail_reason``. Each account's row (and any
    sso_uuid back-fill) is committed before moving to the next account.

    Args:
        schedule_log_id: Id of the ScheduleLog row the results belong to.
        db: SQLAlchemy session used for reads and writes.
    """
    from app.services.keycloak_client import KeycloakClient
    from app.services.mail_client import MailClient
    from app.services.nextcloud_client import NextcloudClient

    accounts = (
        db.query(Account)
        .filter(Account.is_active == True)  # noqa: E712 - SQL expression
        .all()
    )
    kc = KeycloakClient()
    mail = MailClient()

    for account in accounts:
        tenant = account.tenant
        realm = tenant.keycloak_realm or tenant.code
        result = AccountScheduleResult(
            schedule_log_id=schedule_log_id,
            account_id=account.id,
            sso_account=account.sso_account,
            recorded_at=datetime.utcnow(),
        )
        fail_reasons = []

        # [1] SSO user: look up, create when missing
        try:
            sso_uuid = kc.get_user_uuid(realm, account.sso_account)
            if sso_uuid:
                result.sso_result = True
            else:
                sso_uuid = kc.create_user(
                    realm, account.sso_account, account.email, account.default_password
                )
                result.sso_result = sso_uuid is not None
            result.sso_uuid = sso_uuid
            # Back-fill the account's uuid only when it has none yet.
            if sso_uuid and not account.sso_uuid:
                account.sso_uuid = sso_uuid
            result.sso_done_at = datetime.utcnow()
        except Exception as e:
            result.sso_result = False
            result.sso_done_at = datetime.utcnow()
            fail_reasons.append(f"sso: {e}")

        # [2] Mailbox: create when missing.
        # NOTE(review): no mail-domain readiness check is performed here;
        # confirm whether one is required before creating mailboxes.
        try:
            email = account.email or f"{account.sso_account}@{tenant.domain}"
            if mail.mailbox_exists(email):
                result.mailbox_result = True
            else:
                result.mailbox_result = mail.create_mailbox(
                    email, account.default_password, account.quota_limit
                )
            result.mailbox_done_at = datetime.utcnow()
        except Exception as e:
            result.mailbox_result = False
            result.mailbox_done_at = datetime.utcnow()
            fail_reasons.append(f"mailbox: {e}")

        # [3] NC user: create when missing
        nc = None  # reused by the quota step when construction succeeded
        try:
            nc = NextcloudClient(tenant.domain)
            if nc.user_exists(account.sso_account):
                result.nc_result = True
            else:
                result.nc_result = nc.create_user(
                    account.sso_account, account.default_password, account.quota_limit
                )
            result.nc_done_at = datetime.utcnow()
        except Exception as e:
            result.nc_result = False
            result.nc_done_at = datetime.utcnow()
            fail_reasons.append(f"nc: {e}")

        # [4] Quota (best effort: logged only)
        try:
            if nc is None:
                nc = NextcloudClient(tenant.domain)
            result.quota_usage = nc.get_user_quota_used_gb(account.sso_account)
        except Exception as e:
            logger.warning(f"Quota check failed for {account.account_code}: {e}")

        if fail_reasons:
            result.fail_reason = "; ".join(fail_reasons)

        db.add(result)
        # Persist per account so a later failure cannot discard this result.
        db.commit()

    logger.info(f"Account check done: {len(accounts)} accounts processed")

View File

@@ -0,0 +1,94 @@
"""
Schedule 3 — 系統狀態(每日 08:00)
Part A: 基礎設施服務功能驗證(traefik/keycloak/mail/db)
Part B: 伺服器 ping 檢查
"""
import logging
from datetime import datetime
from sqlalchemy.orm import Session
from app.models.server import SystemStatusLog, ServerStatusLog, Server
logger = logging.getLogger(__name__)
# Infrastructure services probed by Part A of the system status schedule:
# two environments (test, prod) x four services (traefik, keycloak, mail, db).
# Keys beyond environment/service_name/service_desc are the connection
# parameters for that service type (host/port, url/realm, db_host/db_port).
# NOTE(review): the test and prod traefik entries both target localhost:8080
# - confirm this is intended.
SERVICES = [
    # --- test environment ---
    {
        "environment": "test",
        "service_name": "traefik",
        "service_desc": "測試環境反向代理",
        "host": "localhost",
        "port": 8080,
    },
    {
        "environment": "test",
        "service_name": "keycloak",
        "service_desc": "測試環境 SSO",
        "url": "https://auth.lab.taipei",
        "realm": "master",
    },
    {
        "environment": "test",
        "service_name": "mail",
        "service_desc": "測試環境 Mail Server",
        "host": "localhost",
        "port": 587,
    },
    {
        "environment": "test",
        "service_name": "db",
        "service_desc": "10.1.0.20:5433 PostgreSQL",
        "db_host": "10.1.0.20",
        "db_port": 5433,
    },
    # --- prod environment ---
    {
        "environment": "prod",
        "service_name": "traefik",
        "service_desc": "正式環境反向代理",
        "host": "localhost",
        "port": 8080,
    },
    {
        "environment": "prod",
        "service_name": "keycloak",
        "service_desc": "正式環境 SSO",
        "url": "https://auth.ease.taipei",
        "realm": "master",
    },
    {
        "environment": "prod",
        "service_name": "mail",
        "service_desc": "正式環境 Mail Server",
        "host": "10.1.0.254",
        "port": 587,
    },
    {
        "environment": "prod",
        "service_name": "db",
        "service_desc": "10.1.0.254:5432 PostgreSQL",
        "db_host": "10.1.0.254",
        "db_port": 5432,
    },
]
def run_system_status(schedule_log_id: int, db: Session):
    """
    Record the current state of infrastructure services and servers.

    Part A writes one SystemStatusLog row per entry in SERVICES, using the
    SystemChecker method that matches the entry's ``service_name``.
    Part B writes one ServerStatusLog row per active server (ordered by
    ``sort_order``) with the ping response time.

    A check that raises is stored as a failed result with the exception text
    as ``fail_reason``. All rows are committed together at the end.

    Args:
        schedule_log_id: Id of the ScheduleLog row the results belong to.
        db: SQLAlchemy session used for reads and writes.
    """
    from app.services.system_checker import SystemChecker

    checker = SystemChecker()

    # service_name -> probe; each probe picks its own connection keys.
    probes = {
        "traefik": lambda cfg: checker.check_traefik(cfg["host"], cfg["port"]),
        "keycloak": lambda cfg: checker.check_keycloak(cfg["url"], cfg["realm"]),
        "mail": lambda cfg: checker.check_smtp(cfg["host"], cfg["port"]),
        "db": lambda cfg: checker.check_postgres(cfg["db_host"], cfg["db_port"]),
    }

    # Part A: Infrastructure services
    for svc in SERVICES:
        ok = False
        reason = None
        try:
            probe = probes.get(svc["service_name"])
            # An unrecognized service_name stays a failed result, no reason.
            if probe is not None:
                ok = probe(svc)
        except Exception as exc:
            ok = False
            reason = str(exc)

        status_row = SystemStatusLog(
            schedule_log_id=schedule_log_id,
            environment=svc["environment"],
            service_name=svc["service_name"],
            service_desc=svc["service_desc"],
            result=ok,
            fail_reason=reason,
            recorded_at=datetime.utcnow(),
        )
        db.add(status_row)

    # Part B: Server ping
    active_servers = (
        db.query(Server)
        .filter(Server.is_active == True)
        .order_by(Server.sort_order)
        .all()
    )
    for srv in active_servers:
        elapsed = None
        reason = None
        try:
            elapsed = checker.ping_server(srv.ip_address)
        except Exception as exc:
            reachable = False
            reason = str(exc)
        else:
            # ping_server returning None is treated as "no reply".
            reachable = elapsed is not None
            if not reachable:
                reason = "No response"

        ping_row = ServerStatusLog(
            schedule_log_id=schedule_log_id,
            server_id=srv.id,
            result=reachable,
            response_time=elapsed,
            fail_reason=reason,
            recorded_at=datetime.utcnow(),
        )
        db.add(ping_row)

    db.commit()
    logger.info(f"System status check done: {len(SERVICES)} services + {len(active_servers)} servers")

View File

@@ -0,0 +1,110 @@
"""
Schedule 1 — 租戶檢查(每 3 分鐘)
檢查每個 active 租戶的: Traefik路由 / SSO Realm / Mailbox Domain / NC容器 / OO容器 / Quota
"""
import logging
from datetime import datetime
from sqlalchemy.orm import Session
from app.models.tenant import Tenant
from app.models.result import TenantScheduleResult
logger = logging.getLogger(__name__)
def run_tenant_check(schedule_log_id: int, db: Session):
    """
    Check every active tenant and create whatever is missing.

    For each tenant with ``is_active`` set, one TenantScheduleResult row is
    written covering these steps:

    1. Traefik  - route check for the tenant domain.
    2. SSO      - Keycloak realm, created when it does not exist.
    3. Mailbox  - mail domain, created when it does not exist. For tenants
       whose ``status`` is "active" the MX record is checked first; when it
       is missing, the step fails and steps 4-6 are skipped for this tenant.
    4. NC       - ``nc-<realm>`` container via ensure_container_running.
    5. Office   - ``oo-<realm>`` container via ensure_container_running.
    6. Quota    - total Nextcloud usage; a failure is only logged.

    Failures of steps 1-5 are joined with "; " into ``result.fail_reason``,
    including when the MX check ends the tenant's run early. Each tenant's
    row is committed before moving to the next tenant.

    Args:
        schedule_log_id: Id of the ScheduleLog row the results belong to.
        db: SQLAlchemy session used for reads and writes.
    """
    from app.services.keycloak_client import KeycloakClient
    from app.services.mail_client import MailClient
    from app.services.docker_client import DockerClient
    from app.services.nextcloud_client import NextcloudClient

    tenants = db.query(Tenant).filter(Tenant.is_active == True).all()  # noqa: E712
    kc = KeycloakClient()
    mail = MailClient()
    docker = DockerClient()

    for tenant in tenants:
        realm = tenant.keycloak_realm or tenant.code
        result = TenantScheduleResult(
            schedule_log_id=schedule_log_id,
            tenant_id=tenant.id,
            recorded_at=datetime.utcnow(),
        )
        fail_reasons = []

        # [1] Traefik
        try:
            result.traefik_status = docker.check_traefik_route(tenant.domain)
            result.traefik_done_at = datetime.utcnow()
        except Exception as e:
            result.traefik_status = False
            result.traefik_done_at = datetime.utcnow()
            fail_reasons.append(f"traefik: {e}")

        # [2] SSO
        try:
            if not kc.realm_exists(realm):
                kc.create_realm(realm, tenant.name)
            result.sso_result = True
            result.sso_done_at = datetime.utcnow()
        except Exception as e:
            result.sso_result = False
            result.sso_done_at = datetime.utcnow()
            fail_reasons.append(f"sso: {e}")

        # [3] Mailbox Domain (with DNS check for active tenants)
        mx_missing = False
        try:
            if tenant.status == "active" and not mail.check_mx_dns(tenant.domain):
                mx_missing = True
                result.mailbox_result = False
                fail_reasons.append("mailbox: MX record not configured")
            else:
                if not mail.domain_exists(tenant.domain):
                    mail.create_domain(tenant.domain)
                result.mailbox_result = True
            result.mailbox_done_at = datetime.utcnow()
        except Exception as e:
            result.mailbox_result = False
            result.mailbox_done_at = datetime.utcnow()
            fail_reasons.append(f"mailbox: {e}")

        # Without an MX record the remaining steps are skipped; the row is
        # still saved below, with the collected failure reasons.
        if not mx_missing:
            # [4] Nextcloud container
            try:
                nc_name = f"nc-{realm}"
                result.nc_result = docker.ensure_container_running(nc_name, tenant.code, realm)
                result.nc_done_at = datetime.utcnow()
            except Exception as e:
                result.nc_result = False
                result.nc_done_at = datetime.utcnow()
                fail_reasons.append(f"nc: {e}")

            # [5] OnlyOffice container
            try:
                oo_name = f"oo-{realm}"
                result.office_result = docker.ensure_container_running(oo_name, tenant.code, realm)
                result.office_done_at = datetime.utcnow()
            except Exception as e:
                result.office_result = False
                result.office_done_at = datetime.utcnow()
                fail_reasons.append(f"office: {e}")

            # [6] Quota (best effort: logged only)
            try:
                nc = NextcloudClient(tenant.domain)
                result.quota_usage = nc.get_total_quota_used_gb()
            except Exception as e:
                logger.warning(f"Quota check failed for {tenant.code}: {e}")

        if fail_reasons:
            result.fail_reason = "; ".join(fail_reasons)

        # Saved outside every try block so a database error is not
        # misreported as a service failure.
        db.add(result)
        db.commit()

    logger.info(f"Tenant check done: {len(tenants)} tenants processed")

View File

@@ -0,0 +1,107 @@
"""
Watchdog: APScheduler BackgroundScheduler,每 3 分鐘掃描 schedules 表。
防重複執行:原子 UPDATE status='Going',影響 0 筆則跳過。
"""
import logging
from datetime import datetime
from apscheduler.schedulers.background import BackgroundScheduler
from croniter import croniter
from sqlalchemy import update
from sqlalchemy.orm import Session
from app.core.database import SessionLocal
from app.models.schedule import Schedule, ScheduleLog
logger = logging.getLogger(__name__)
# Module-level scheduler instance that start_watchdog() / stop_watchdog()
# operate on; it is configured with Asia/Taipei as its timezone.
_scheduler = BackgroundScheduler(timezone="Asia/Taipei")
def _watchdog_tick():
    """
    Run every schedule that is due, one after another.

    A schedule is due when its status is "Waiting" and ``next_run_at`` is not
    later than the current UTC time. For each due schedule:

    1. Claim it with an atomic ``UPDATE ... WHERE status='Waiting'`` that
       sets status "Going"; zero affected rows means someone else claimed it.
    2. Create a ScheduleLog row (status "running") and call
       dispatch_schedule with it.
    3. Store the outcome ("ok"/"error") on the log, compute the next run from
       the cron expression, and put the schedule back to "Waiting".

    A failed run is rolled back before step 3 so the schedule is always
    released instead of staying in "Going". Errors outside a single run are
    logged and rolled back; the session is always closed.

    NOTE(review): the cron expression is evaluated against naive UTC
    (``datetime.utcnow()``), so e.g. "0 8 * * *" means 08:00 UTC - confirm
    this is the intended time zone.
    """
    db: Session = SessionLocal()
    try:
        due = (
            db.query(Schedule)
            .filter(
                Schedule.status == "Waiting",
                Schedule.next_run_at <= datetime.utcnow(),
            )
            .all()
        )
        for schedule in due:
            # Read plain values up front: the error path below must not have
            # to reload ORM attributes from a failed session.
            schedule_id = schedule.id
            schedule_name = schedule.name
            cron_timer = schedule.cron_timer

            # Atomic lock: only one process wins
            affected = db.execute(
                update(Schedule)
                .where(Schedule.id == schedule_id, Schedule.status == "Waiting")
                .values(status="Going")
            ).rowcount
            db.commit()
            if affected == 0:
                # Another process already grabbed it
                continue

            log = ScheduleLog(
                schedule_id=schedule_id,
                schedule_name=schedule_name,
                started_at=datetime.utcnow(),
                status="running",
            )
            log_saved = False
            try:
                db.add(log)
                db.commit()
                db.refresh(log)
                log_saved = True
                from app.services.scheduler.runner import dispatch_schedule
                dispatch_schedule(schedule_id, log.id, db)
                final_status = "ok"
            except Exception as e:
                # Reset the session first; otherwise the bookkeeping below
                # can fail and leave the schedule locked in "Going".
                db.rollback()
                logger.exception(f"Schedule {schedule_name} failed: {e}")
                final_status = "error"

            # Update log (only if the row was actually stored)
            if log_saved:
                log.ended_at = datetime.utcnow()
                log.status = final_status

            # Recalculate next_run_at
            try:
                cron = croniter(cron_timer, datetime.utcnow())
                next_run = cron.get_next(datetime)
            except Exception:
                # With next_run_at NULL the due-filter above never matches,
                # so the schedule stays idle until the value is corrected.
                logger.exception(
                    f"Schedule {schedule_name}: cannot compute next run from "
                    f"cron_timer {cron_timer!r}; next_run_at set to NULL"
                )
                next_run = None

            # Reset schedule
            db.execute(
                update(Schedule)
                .where(Schedule.id == schedule_id)
                .values(
                    status="Waiting",
                    last_run_at=datetime.utcnow(),
                    next_run_at=next_run,
                    last_status=final_status,
                )
            )
            db.commit()
    except Exception as e:
        logger.exception(f"Watchdog tick error: {e}")
        db.rollback()
    finally:
        db.close()
def start_watchdog():
    """
    Register the watchdog job (every 3 minutes) and start the scheduler.

    Calling this while the scheduler is already running is a no-op apart
    from a warning, instead of raising from ``_scheduler.start()``.
    """
    if _scheduler.running:
        logger.warning("Watchdog scheduler already running; start request ignored")
        return
    _scheduler.add_job(
        _watchdog_tick,
        trigger="interval",
        minutes=3,
        id="watchdog",
        replace_existing=True,  # keep a single job under the fixed id
    )
    _scheduler.start()
    logger.info("Watchdog scheduler started")
def stop_watchdog():
    """
    Shut the scheduler down without waiting for running jobs to finish.

    Calling this while the scheduler is not running is a no-op, so a
    shutdown hook stays safe even when startup never completed.
    """
    if not _scheduler.running:
        logger.info("Watchdog scheduler not running; nothing to stop")
        return
    _scheduler.shutdown(wait=False)
    logger.info("Watchdog scheduler stopped")