feat(backend): Phase 1-4 全新開發完成,37/37 TDD 通過
[Phase 0 Reset]
- 清除舊版 app/、alembic/versions/、雜亂測試腳本
- 新 requirements.txt (移除 caldav/redis/keycloak-lib,加入 apscheduler/croniter/docker/paramiko/ping3/dnspython)
[Phase 1 資料庫]
- 9 張資料表 SQLAlchemy Models:tenants / accounts / schedules / schedule_logs /
tenant_schedule_results / account_schedule_results / servers / server_status_logs / system_status_logs
- Alembic migration 001_create_all_tables (已套用到 10.1.0.20:5433/virtual_mis)
- seed.py:schedules 初始 3 筆 / servers 初始 4 筆
[Phase 2 CRUD API]
- GET/POST/PUT/DELETE: /api/v1/tenants / accounts / servers / schedules
- /api/v1/system-status
- 帳號編碼自動產生 (prefix + seq_no 4碼左補0)
- 燈號 (lights) 從最新排程結果取得
[Phase 3 Watchdog]
- APScheduler interval 3分鐘,原子 UPDATE status=Going 防重複執行
- 手動觸發 API: POST /api/v1/schedules/{id}/run
[Phase 4 Service Clients]
- KeycloakClient:vmis-admin realm,REST API (不用 python-keycloak)
- MailClient:Docker Mailserver @ 10.1.0.254:8080,含 MX DNS 驗證
- DockerClient:docker-py 本機 + paramiko SSH 遠端 compose
- NextcloudClient:OCS API user/quota
- SystemChecker:功能驗證 (traefik routers>0 / keycloak token / SMTP EHLO / DB SELECT 1 / ping)
[TDD]
- 37 tests / 37 passed (2.11s)
- SQLite in-memory + StaticPool,無需外部 DB
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
0
backend/app/services/scheduler/__init__.py
Normal file
0
backend/app/services/scheduler/__init__.py
Normal file
59
backend/app/services/scheduler/runner.py
Normal file
59
backend/app/services/scheduler/runner.py
Normal file
@@ -0,0 +1,59 @@
|
||||
"""
|
||||
Schedule dispatcher: routes schedule_id to the correct run function.
|
||||
Also used by manual trigger API.
|
||||
"""
|
||||
import logging
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.core.database import SessionLocal
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def dispatch_schedule(schedule_id: int, log_id: int = None, db: Session = None):
    """
    Dispatch *schedule_id* to the correct schedule run function.

    Called two ways:
      * From the watchdog, which supplies both ``log_id`` and ``db``; the
        watchdog owns the log lifecycle in that case.
      * From the manual-trigger API with neither, in which case this function
        opens its own session and creates — and now also finalizes — its own
        ScheduleLog row.

    Re-raises whatever the underlying run function raises, after logging it
    (and, for a self-created log, recording the error status).
    """
    own_db = db is None
    if own_db:
        db = SessionLocal()

    # Track whether we created the log ourselves: only then are we
    # responsible for finalizing it.
    own_log = log_id is None
    log = None
    if own_log:
        from datetime import datetime
        from app.models.schedule import ScheduleLog, Schedule

        schedule = db.get(Schedule, schedule_id)
        if not schedule:
            # Unknown schedule id — nothing to run, nothing to log.
            if own_db:
                db.close()
            return
        log = ScheduleLog(
            schedule_id=schedule_id,
            schedule_name=schedule.name,
            started_at=datetime.utcnow(),
            status="running",
        )
        db.add(log)
        db.commit()
        db.refresh(log)
        log_id = log.id

    final_status = "ok"
    try:
        # Run functions are imported lazily to avoid circular imports
        # at module load time.
        if schedule_id == 1:
            from app.services.scheduler.schedule_tenant import run_tenant_check
            run_tenant_check(log_id, db)
        elif schedule_id == 2:
            from app.services.scheduler.schedule_account import run_account_check
            run_account_check(log_id, db)
        elif schedule_id == 3:
            from app.services.scheduler.schedule_system import run_system_status
            run_system_status(log_id, db)
        else:
            logger.warning(f"Unknown schedule_id: {schedule_id}")
    except Exception as e:
        final_status = "error"
        logger.exception(f"dispatch_schedule({schedule_id}) error: {e}")
        raise
    finally:
        # FIX: a log created here (manual-trigger path) was previously left
        # in status "running" forever; finalize it before releasing the
        # session so the log reflects the actual outcome.
        if own_log and log is not None:
            from datetime import datetime
            log.ended_at = datetime.utcnow()
            log.status = final_status
            db.commit()
        if own_db:
            db.close()
||||
103
backend/app/services/scheduler/schedule_account.py
Normal file
103
backend/app/services/scheduler/schedule_account.py
Normal file
@@ -0,0 +1,103 @@
|
||||
"""
|
||||
Schedule 2 — 帳號檢查(每 3 分鐘)
|
||||
檢查每個 active 帳號的: SSO使用者 / Mailbox / NC使用者 / Quota
|
||||
"""
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.models.account import Account
|
||||
from app.models.result import AccountScheduleResult
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def run_account_check(schedule_log_id: int, db: Session):
    """
    Schedule 2 — per-account provisioning check (runs every 3 minutes).

    For every active account, verifies (and creates when missing):
      [1] the SSO user in the tenant's Keycloak realm,
      [2] the mailbox on the mail server,
      [3] the Nextcloud user,
    then records [4] the account's Nextcloud quota usage (best-effort).

    One AccountScheduleResult row is written per account, tagged with
    *schedule_log_id*, and everything is committed in a single transaction
    at the end.
    """
    # Imported lazily to keep module import light and avoid import cycles.
    from app.services.keycloak_client import KeycloakClient
    from app.services.mail_client import MailClient
    from app.services.nextcloud_client import NextcloudClient

    accounts = (
        db.query(Account)
        .filter(Account.is_active == True)  # noqa: E712 — SQLAlchemy expression
        .all()
    )
    kc = KeycloakClient()
    mail = MailClient()

    for account in accounts:
        tenant = account.tenant
        # Fall back to the tenant code when no explicit realm is configured.
        realm = tenant.keycloak_realm or tenant.code
        result = AccountScheduleResult(
            schedule_log_id=schedule_log_id,
            account_id=account.id,
            sso_account=account.sso_account,
            recorded_at=datetime.utcnow(),
        )
        fail_reasons = []

        # [1] SSO user check — create the Keycloak user if it does not exist.
        try:
            sso_uuid = kc.get_user_uuid(realm, account.sso_account)
            if sso_uuid:
                result.sso_result = True
                result.sso_uuid = sso_uuid
                if not account.sso_uuid:
                    # Backfill the UUID onto the account on first sighting.
                    account.sso_uuid = sso_uuid
            else:
                sso_uuid = kc.create_user(realm, account.sso_account, account.email, account.default_password)
                result.sso_result = sso_uuid is not None
                result.sso_uuid = sso_uuid
                if sso_uuid and not account.sso_uuid:
                    account.sso_uuid = sso_uuid
            result.sso_done_at = datetime.utcnow()
        except Exception as e:
            result.sso_result = False
            result.sso_done_at = datetime.utcnow()
            fail_reasons.append(f"sso: {e}")

        # [2] Mailbox check — create the mailbox if missing.
        try:
            email = account.email or f"{account.sso_account}@{tenant.domain}"
            mb_exists = mail.mailbox_exists(email)
            if mb_exists:
                result.mailbox_result = True
            else:
                created = mail.create_mailbox(email, account.default_password, account.quota_limit)
                result.mailbox_result = created
            result.mailbox_done_at = datetime.utcnow()
        except Exception as e:
            result.mailbox_result = False
            result.mailbox_done_at = datetime.utcnow()
            fail_reasons.append(f"mailbox: {e}")

        # [3] Nextcloud user check — create the NC user if missing.
        try:
            nc = NextcloudClient(tenant.domain)
            nc_exists = nc.user_exists(account.sso_account)
            if nc_exists:
                result.nc_result = True
            else:
                created = nc.create_user(account.sso_account, account.default_password, account.quota_limit)
                result.nc_result = created
            result.nc_done_at = datetime.utcnow()
        except Exception as e:
            result.nc_result = False
            result.nc_done_at = datetime.utcnow()
            fail_reasons.append(f"nc: {e}")

        # [4] Quota usage — best-effort; a failure here only logs a warning
        # and does not count against the account's result.
        try:
            nc = NextcloudClient(tenant.domain)
            result.quota_usage = nc.get_user_quota_used_gb(account.sso_account)
        except Exception as e:
            logger.warning(f"Quota check failed for {account.account_code}: {e}")

        if fail_reasons:
            result.fail_reason = "; ".join(fail_reasons)

        db.add(result)

    # FIX: the original called db.flush() *after* db.commit() — a no-op,
    # since commit already flushes pending state. A single commit suffices.
    db.commit()
    logger.info(f"Account check done: {len(accounts)} accounts processed")
||||
94
backend/app/services/scheduler/schedule_system.py
Normal file
94
backend/app/services/scheduler/schedule_system.py
Normal file
@@ -0,0 +1,94 @@
|
||||
"""
|
||||
Schedule 3 — 系統狀態(每日 08:00)
|
||||
Part A: 基礎設施服務功能驗證(traefik/keycloak/mail/db)
|
||||
Part B: 伺服器 ping 檢查
|
||||
"""
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.models.server import SystemStatusLog, ServerStatusLog, Server
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# The eight fixed infrastructure services checked by Part A, keyed by
# environment × service name. Each entry carries the connection details
# its checker needs (host/port, URL/realm, or db_host/db_port).
SERVICES = [
    {"environment": "test", "service_name": "traefik",
     "service_desc": "測試環境反向代理", "host": "localhost", "port": 8080},
    {"environment": "test", "service_name": "keycloak",
     "service_desc": "測試環境 SSO",
     "url": "https://auth.lab.taipei", "realm": "master"},
    {"environment": "test", "service_name": "mail",
     "service_desc": "測試環境 Mail Server", "host": "localhost", "port": 587},
    {"environment": "test", "service_name": "db",
     "service_desc": "10.1.0.20:5433 PostgreSQL",
     "db_host": "10.1.0.20", "db_port": 5433},
    {"environment": "prod", "service_name": "traefik",
     "service_desc": "正式環境反向代理", "host": "localhost", "port": 8080},
    {"environment": "prod", "service_name": "keycloak",
     "service_desc": "正式環境 SSO",
     "url": "https://auth.ease.taipei", "realm": "master"},
    {"environment": "prod", "service_name": "mail",
     "service_desc": "正式環境 Mail Server", "host": "10.1.0.254", "port": 587},
    {"environment": "prod", "service_name": "db",
     "service_desc": "10.1.0.254:5432 PostgreSQL",
     "db_host": "10.1.0.254", "db_port": 5432},
]


def run_system_status(schedule_log_id: int, db: Session):
    """
    Schedule 3 — daily system status check.

    Part A verifies each fixed infrastructure service in SERVICES
    (traefik / keycloak / mail / db) and writes one SystemStatusLog row per
    service. Part B pings every active server and writes one ServerStatusLog
    row per server. All rows are tagged with *schedule_log_id* and committed
    together at the end.
    """
    from app.services.system_checker import SystemChecker

    checker = SystemChecker()

    # ---- Part A: infrastructure services -------------------------------
    for service in SERVICES:
        ok = False
        reason = None
        name = service["service_name"]
        try:
            if name == "traefik":
                ok = checker.check_traefik(service["host"], service["port"])
            elif name == "keycloak":
                ok = checker.check_keycloak(service["url"], service["realm"])
            elif name == "mail":
                ok = checker.check_smtp(service["host"], service["port"])
            elif name == "db":
                ok = checker.check_postgres(service["db_host"], service["db_port"])
        except Exception as exc:
            ok = False
            reason = str(exc)

        db.add(SystemStatusLog(
            schedule_log_id=schedule_log_id,
            environment=service["environment"],
            service_name=name,
            service_desc=service["service_desc"],
            result=ok,
            fail_reason=reason,
            recorded_at=datetime.utcnow(),
        ))

    # ---- Part B: server ping checks ------------------------------------
    servers = db.query(Server).filter(Server.is_active == True).order_by(Server.sort_order).all()
    for server in servers:
        elapsed = None
        reason = None
        try:
            elapsed = checker.ping_server(server.ip_address)
            # ping_server returning None is treated as "no response".
            reachable = elapsed is not None
            if not reachable:
                reason = "No response"
        except Exception as exc:
            reachable = False
            reason = str(exc)

        db.add(ServerStatusLog(
            schedule_log_id=schedule_log_id,
            server_id=server.id,
            result=reachable,
            response_time=elapsed,
            fail_reason=reason,
            recorded_at=datetime.utcnow(),
        ))

    db.commit()
    logger.info(f"System status check done: {len(SERVICES)} services + {len(servers)} servers")
|
||||
110
backend/app/services/scheduler/schedule_tenant.py
Normal file
110
backend/app/services/scheduler/schedule_tenant.py
Normal file
@@ -0,0 +1,110 @@
|
||||
"""
|
||||
Schedule 1 — 租戶檢查(每 3 分鐘)
|
||||
檢查每個 active 租戶的: Traefik路由 / SSO Realm / Mailbox Domain / NC容器 / OO容器 / Quota
|
||||
"""
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.models.tenant import Tenant
|
||||
from app.models.result import TenantScheduleResult
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def run_tenant_check(schedule_log_id: int, db: Session):
    """
    Schedule 1 — per-tenant provisioning check (runs every 3 minutes).

    For every active tenant, checks: [1] Traefik route, [2] SSO realm
    (created when missing), [3] mailbox domain (with MX DNS validation for
    active tenants), [4] Nextcloud container, [5] OnlyOffice container, and
    [6] total quota usage (best-effort).

    One TenantScheduleResult row is written per tenant, tagged with
    *schedule_log_id*; everything is committed at the end (except the MX-DNS
    early-exit path, which commits its row immediately).
    """
    # Imported lazily to keep module import light and avoid import cycles.
    from app.services.keycloak_client import KeycloakClient
    from app.services.mail_client import MailClient
    from app.services.docker_client import DockerClient
    from app.services.nextcloud_client import NextcloudClient

    tenants = db.query(Tenant).filter(Tenant.is_active == True).all()
    kc = KeycloakClient()
    mail = MailClient()
    docker = DockerClient()

    for tenant in tenants:
        # Fall back to the tenant code when no explicit realm is configured.
        realm = tenant.keycloak_realm or tenant.code
        result = TenantScheduleResult(
            schedule_log_id=schedule_log_id,
            tenant_id=tenant.id,
            recorded_at=datetime.utcnow(),
        )
        fail_reasons = []

        # [1] Traefik route
        try:
            result.traefik_status = docker.check_traefik_route(tenant.domain)
            result.traefik_done_at = datetime.utcnow()
        except Exception as e:
            result.traefik_status = False
            result.traefik_done_at = datetime.utcnow()
            fail_reasons.append(f"traefik: {e}")

        # [2] SSO realm — create it when missing.
        try:
            exists = kc.realm_exists(realm)
            if not exists:
                kc.create_realm(realm, tenant.name)
            result.sso_result = True
            result.sso_done_at = datetime.utcnow()
        except Exception as e:
            result.sso_result = False
            result.sso_done_at = datetime.utcnow()
            fail_reasons.append(f"sso: {e}")

        # [3] Mailbox domain (MX DNS must resolve for active tenants before
        # the domain is provisioned; otherwise skip the remaining checks).
        try:
            if tenant.status == "active":
                dns_ok = mail.check_mx_dns(tenant.domain)
                if not dns_ok:
                    result.mailbox_result = False
                    result.mailbox_done_at = datetime.utcnow()
                    fail_reasons.append("mailbox: MX record not configured")
                    # FIX: persist the accumulated failure reasons before the
                    # early exit — the original committed this row without
                    # ever setting fail_reason, silently dropping the MX
                    # failure (and any earlier traefik/sso failures).
                    result.fail_reason = "; ".join(fail_reasons)
                    db.add(result)
                    db.commit()
                    continue
            domain_exists = mail.domain_exists(tenant.domain)
            if not domain_exists:
                mail.create_domain(tenant.domain)
            result.mailbox_result = True
            result.mailbox_done_at = datetime.utcnow()
        except Exception as e:
            result.mailbox_result = False
            result.mailbox_done_at = datetime.utcnow()
            fail_reasons.append(f"mailbox: {e}")

        # [4] Nextcloud container
        try:
            nc_name = f"nc-{realm}"
            result.nc_result = docker.ensure_container_running(nc_name, tenant.code, realm)
            result.nc_done_at = datetime.utcnow()
        except Exception as e:
            result.nc_result = False
            result.nc_done_at = datetime.utcnow()
            fail_reasons.append(f"nc: {e}")

        # [5] OnlyOffice container
        try:
            oo_name = f"oo-{realm}"
            result.office_result = docker.ensure_container_running(oo_name, tenant.code, realm)
            result.office_done_at = datetime.utcnow()
        except Exception as e:
            result.office_result = False
            result.office_done_at = datetime.utcnow()
            fail_reasons.append(f"office: {e}")

        # [6] Quota usage — best-effort; a failure here only logs a warning.
        try:
            nc = NextcloudClient(tenant.domain)
            result.quota_usage = nc.get_total_quota_used_gb()
        except Exception as e:
            logger.warning(f"Quota check failed for {tenant.code}: {e}")

        if fail_reasons:
            result.fail_reason = "; ".join(fail_reasons)

        db.add(result)

    db.commit()
    logger.info(f"Tenant check done: {len(tenants)} tenants processed")
|
||||
107
backend/app/services/scheduler/watchdog.py
Normal file
107
backend/app/services/scheduler/watchdog.py
Normal file
@@ -0,0 +1,107 @@
|
||||
"""
|
||||
Watchdog: APScheduler BackgroundScheduler,每 3 分鐘掃描 schedules 表。
|
||||
防重複執行:原子 UPDATE status='Going',影響 0 筆則跳過。
|
||||
"""
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from apscheduler.schedulers.background import BackgroundScheduler
|
||||
from croniter import croniter
|
||||
from sqlalchemy import update
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.core.database import SessionLocal
|
||||
from app.models.schedule import Schedule, ScheduleLog
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_scheduler = BackgroundScheduler(timezone="Asia/Taipei")
|
||||
|
||||
|
||||
def _watchdog_tick():
    """Scan the schedules table once and run every due schedule.

    A schedule is due when status == "Waiting" and next_run_at has passed.
    Each due schedule is claimed via an atomic UPDATE (Waiting -> Going) so
    that only one process executes it; the loser of the race sees rowcount 0
    and skips. After running, the schedule is reset to Waiting with a
    recomputed next_run_at.
    """
    db: Session = SessionLocal()
    try:
        due = (
            db.query(Schedule)
            .filter(
                Schedule.status == "Waiting",
                Schedule.next_run_at <= datetime.utcnow(),
            )
            .all()
        )
        for schedule in due:
            # Atomic lock: only one process wins. The WHERE re-checks
            # status == "Waiting" so a concurrent claimant gets rowcount 0.
            # NOTE(review): if this process dies between here and the reset
            # below, the row stays "Going" and is never retried — there is
            # no visible stale-lock recovery; confirm this is acceptable.
            affected = db.execute(
                update(Schedule)
                .where(Schedule.id == schedule.id, Schedule.status == "Waiting")
                .values(status="Going")
            ).rowcount
            db.commit()

            if affected == 0:
                # Another process already grabbed it
                continue

            # Create the run log up front so a crash mid-run still leaves
            # a "running" record pointing at this schedule.
            log = ScheduleLog(
                schedule_id=schedule.id,
                schedule_name=schedule.name,
                started_at=datetime.utcnow(),
                status="running",
            )
            db.add(log)
            db.commit()
            db.refresh(log)

            try:
                # Lazy import to avoid a circular import with runner.py.
                from app.services.scheduler.runner import dispatch_schedule
                dispatch_schedule(schedule.id, log.id, db)
                final_status = "ok"
            except Exception as e:
                logger.exception(f"Schedule {schedule.name} failed: {e}")
                final_status = "error"

            # Update log (committed together with the schedule reset below).
            log.ended_at = datetime.utcnow()
            log.status = final_status

            # Recalculate next_run_at from the schedule's cron expression.
            # NOTE(review): the base time is utcnow() while the APScheduler
            # instance runs in Asia/Taipei — a cron like "0 8 * * *" would
            # fire at 08:00 UTC, not local time; confirm intended timezone.
            try:
                cron = croniter(schedule.cron_timer, datetime.utcnow())
                next_run = cron.get_next(datetime)
            except Exception:
                # Unparseable cron: leave next_run_at NULL so the schedule
                # never becomes due again until fixed.
                next_run = None

            # Reset schedule back to Waiting, releasing the "Going" lock.
            db.execute(
                update(Schedule)
                .where(Schedule.id == schedule.id)
                .values(
                    status="Waiting",
                    last_run_at=datetime.utcnow(),
                    next_run_at=next_run,
                    last_status=final_status,
                )
            )
            db.commit()

    except Exception as e:
        logger.exception(f"Watchdog tick error: {e}")
        db.rollback()
    finally:
        db.close()
|
||||
|
||||
def start_watchdog():
    """Register the 3-minute tick job and launch the background scheduler.

    ``replace_existing=True`` makes repeated calls idempotent with respect
    to the job registration (same job id "watchdog").
    """
    job_options = {
        "trigger": "interval",
        "minutes": 3,
        "id": "watchdog",
        "replace_existing": True,
    }
    _scheduler.add_job(_watchdog_tick, **job_options)
    _scheduler.start()
    logger.info("Watchdog scheduler started")
||||
|
||||
|
||||
def stop_watchdog():
    """Shut down the background scheduler without waiting for running jobs."""
    _scheduler.shutdown(wait=False)
    logger.info("Watchdog scheduler stopped")
|
||||
Reference in New Issue
Block a user